1 /*
2 * Copyright (c) 2003, 2026, Oracle and/or its affiliates. All rights reserved.
3 * Copyright (c) 2014, 2025, Red Hat Inc. All rights reserved.
4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5 *
6 * This code is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 only, as
8 * published by the Free Software Foundation.
9 *
10 * This code is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
13 * version 2 for more details (a copy is included in the LICENSE file that
14 * accompanied this code).
15 *
16 * You should have received a copy of the GNU General Public License version
17 * 2 along with this work; if not, write to the Free Software Foundation,
18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
19 *
20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
21 * or visit www.oracle.com if you need additional information or have any
22 * questions.
23 *
24 */
25
26 #include "asm/macroAssembler.hpp"
27 #include "asm/macroAssembler.inline.hpp"
28 #include "asm/register.hpp"
29 #include "atomic_aarch64.hpp"
30 #include "compiler/oopMap.hpp"
31 #include "gc/shared/barrierSet.hpp"
32 #include "gc/shared/barrierSetAssembler.hpp"
33 #include "gc/shared/gc_globals.hpp"
34 #include "gc/shared/tlab_globals.hpp"
35 #include "interpreter/interpreter.hpp"
36 #include "memory/universe.hpp"
37 #include "nativeInst_aarch64.hpp"
38 #include "oops/instanceOop.hpp"
39 #include "oops/method.hpp"
40 #include "oops/objArrayKlass.hpp"
41 #include "oops/oop.inline.hpp"
42 #include "prims/methodHandles.hpp"
43 #include "prims/upcallLinker.hpp"
44 #include "runtime/arguments.hpp"
45 #include "runtime/atomicAccess.hpp"
46 #include "runtime/continuation.hpp"
47 #include "runtime/continuationEntry.inline.hpp"
48 #include "runtime/frame.inline.hpp"
49 #include "runtime/handles.inline.hpp"
50 #include "runtime/javaThread.hpp"
51 #include "runtime/sharedRuntime.hpp"
52 #include "runtime/stubCodeGenerator.hpp"
53 #include "runtime/stubRoutines.hpp"
54 #include "utilities/align.hpp"
55 #include "utilities/checkedCast.hpp"
56 #include "utilities/debug.hpp"
57 #include "utilities/globalDefinitions.hpp"
58 #include "utilities/intpow.hpp"
59 #include "utilities/powerOfTwo.hpp"
60 #ifdef COMPILER2
61 #include "opto/runtime.hpp"
62 #endif
63 #if INCLUDE_ZGC
64 #include "gc/z/zThreadLocalData.hpp"
65 #endif
66
67 // Declaration and definition of StubGenerator (no .hpp file).
68 // For a more detailed description of the stub routine structure
69 // see the comment in stubRoutines.hpp
70
// Shorthand used throughout this file: "__ insn(...)" emits through the
// current MacroAssembler (_masm).
#undef __
#define __ _masm->

// BLOCK_COMMENT(str) attaches a comment to the generated code via
// block_comment(); in PRODUCT builds it expands to nothing.
#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) __ block_comment(str)
#endif

// Intended to be written as "__ BIND(label);": binds the label and then emits
// a block comment carrying the label's name.
// NOTE: this expands to TWO statements, so it must not be used as the sole
// body of an unbraced if/else/loop.
#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
81
82 // Constant data definitions
83
// Round-constant table for the SHA-256 stubs: 64 entries, one 32-bit word
// per round, four rounds per source row.
static const uint32_t _sha256_round_consts[64] = {
  0x428a2f98u, 0x71374491u, 0xb5c0fbcfu, 0xe9b5dba5u,  // rounds  0 ..  3
  0x3956c25bu, 0x59f111f1u, 0x923f82a4u, 0xab1c5ed5u,  // rounds  4 ..  7
  0xd807aa98u, 0x12835b01u, 0x243185beu, 0x550c7dc3u,  // rounds  8 .. 11
  0x72be5d74u, 0x80deb1feu, 0x9bdc06a7u, 0xc19bf174u,  // rounds 12 .. 15
  0xe49b69c1u, 0xefbe4786u, 0x0fc19dc6u, 0x240ca1ccu,  // rounds 16 .. 19
  0x2de92c6fu, 0x4a7484aau, 0x5cb0a9dcu, 0x76f988dau,  // rounds 20 .. 23
  0x983e5152u, 0xa831c66du, 0xb00327c8u, 0xbf597fc7u,  // rounds 24 .. 27
  0xc6e00bf3u, 0xd5a79147u, 0x06ca6351u, 0x14292967u,  // rounds 28 .. 31
  0x27b70a85u, 0x2e1b2138u, 0x4d2c6dfcu, 0x53380d13u,  // rounds 32 .. 35
  0x650a7354u, 0x766a0abbu, 0x81c2c92eu, 0x92722c85u,  // rounds 36 .. 39
  0xa2bfe8a1u, 0xa81a664bu, 0xc24b8b70u, 0xc76c51a3u,  // rounds 40 .. 43
  0xd192e819u, 0xd6990624u, 0xf40e3585u, 0x106aa070u,  // rounds 44 .. 47
  0x19a4c116u, 0x1e376c08u, 0x2748774cu, 0x34b0bcb5u,  // rounds 48 .. 51
  0x391c0cb3u, 0x4ed8aa4au, 0x5b9cca4fu, 0x682e6ff3u,  // rounds 52 .. 55
  0x748f82eeu, 0x78a5636fu, 0x84c87814u, 0x8cc70208u,  // rounds 56 .. 59
  0x90befffau, 0xa4506cebu, 0xbef9a3f7u, 0xc67178f2u,  // rounds 60 .. 63
};
102
// Round-constant table for the SHA-512 stubs: 80 entries, one 64-bit word
// per round. Literals carry an explicit unsigned 64-bit suffix.
static const uint64_t _sha512_round_consts[80] = {
  0x428A2F98D728AE22ULL, 0x7137449123EF65CDULL, 0xB5C0FBCFEC4D3B2FULL,
  0xE9B5DBA58189DBBCULL, 0x3956C25BF348B538ULL, 0x59F111F1B605D019ULL,
  0x923F82A4AF194F9BULL, 0xAB1C5ED5DA6D8118ULL, 0xD807AA98A3030242ULL,
  0x12835B0145706FBEULL, 0x243185BE4EE4B28CULL, 0x550C7DC3D5FFB4E2ULL,
  0x72BE5D74F27B896FULL, 0x80DEB1FE3B1696B1ULL, 0x9BDC06A725C71235ULL,
  0xC19BF174CF692694ULL, 0xE49B69C19EF14AD2ULL, 0xEFBE4786384F25E3ULL,
  0x0FC19DC68B8CD5B5ULL, 0x240CA1CC77AC9C65ULL, 0x2DE92C6F592B0275ULL,
  0x4A7484AA6EA6E483ULL, 0x5CB0A9DCBD41FBD4ULL, 0x76F988DA831153B5ULL,
  0x983E5152EE66DFABULL, 0xA831C66D2DB43210ULL, 0xB00327C898FB213FULL,
  0xBF597FC7BEEF0EE4ULL, 0xC6E00BF33DA88FC2ULL, 0xD5A79147930AA725ULL,
  0x06CA6351E003826FULL, 0x142929670A0E6E70ULL, 0x27B70A8546D22FFCULL,
  0x2E1B21385C26C926ULL, 0x4D2C6DFC5AC42AEDULL, 0x53380D139D95B3DFULL,
  0x650A73548BAF63DEULL, 0x766A0ABB3C77B2A8ULL, 0x81C2C92E47EDAEE6ULL,
  0x92722C851482353BULL, 0xA2BFE8A14CF10364ULL, 0xA81A664BBC423001ULL,
  0xC24B8B70D0F89791ULL, 0xC76C51A30654BE30ULL, 0xD192E819D6EF5218ULL,
  0xD69906245565A910ULL, 0xF40E35855771202AULL, 0x106AA07032BBD1B8ULL,
  0x19A4C116B8D2D0C8ULL, 0x1E376C085141AB53ULL, 0x2748774CDF8EEB99ULL,
  0x34B0BCB5E19B48A8ULL, 0x391C0CB3C5C95A63ULL, 0x4ED8AA4AE3418ACBULL,
  0x5B9CCA4F7763E373ULL, 0x682E6FF3D6B2B8A3ULL, 0x748F82EE5DEFB2FCULL,
  0x78A5636F43172F60ULL, 0x84C87814A1F0AB72ULL, 0x8CC702081A6439ECULL,
  0x90BEFFFA23631E28ULL, 0xA4506CEBDE82BDE9ULL, 0xBEF9A3F7B2C67915ULL,
  0xC67178F2E372532BULL, 0xCA273ECEEA26619CULL, 0xD186B8C721C0C207ULL,
  0xEADA7DD6CDE0EB1EULL, 0xF57D4F7FEE6ED178ULL, 0x06F067AA72176FBAULL,
  0x0A637DC5A2C898A6ULL, 0x113F9804BEF90DAEULL, 0x1B710B35131C471BULL,
  0x28DB77F523047D84ULL, 0x32CAAB7B40C72493ULL, 0x3C9EBE0A15C9BEBCULL,
  0x431D67C49C100D4CULL, 0x4CC5D4BECB3E42B6ULL, 0x597F299CFC657E2AULL,
  0x5FCB6FAB3AD6FAECULL, 0x6C44198C4A475817ULL
};
132
// Round-constant table for the SHA-3 stubs: 24 entries of 64 bits each,
// three per source row.
static const uint64_t _sha3_round_consts[24] = {
  0x0000000000000001ULL, 0x0000000000008082ULL, 0x800000000000808AULL,  //  0 ..  2
  0x8000000080008000ULL, 0x000000000000808BULL, 0x0000000080000001ULL,  //  3 ..  5
  0x8000000080008081ULL, 0x8000000000008009ULL, 0x000000000000008AULL,  //  6 ..  8
  0x0000000000000088ULL, 0x0000000080008009ULL, 0x000000008000000AULL,  //  9 .. 11
  0x000000008000808BULL, 0x800000000000008BULL, 0x8000000000008089ULL,  // 12 .. 14
  0x8000000000008003ULL, 0x8000000000008002ULL, 0x8000000000000080ULL,  // 15 .. 17
  0x000000000000800AULL, 0x800000008000000AULL, 0x8000000080008081ULL,  // 18 .. 20
  0x8000000000008080ULL, 0x0000000080000001ULL, 0x8000000080008008ULL   // 21 .. 23
};
143
// Round-constant table for the double-Keccak stub: 24 entries of 64 bits
// each. The values are currently the same as those in _sha3_round_consts.
static const uint64_t _double_keccak_round_consts[24] = {
  0x0000000000000001ULL, 0x0000000000008082ULL, 0x800000000000808AULL,  //  0 ..  2
  0x8000000080008000ULL, 0x000000000000808BULL, 0x0000000080000001ULL,  //  3 ..  5
  0x8000000080008081ULL, 0x8000000000008009ULL, 0x000000000000008AULL,  //  6 ..  8
  0x0000000000000088ULL, 0x0000000080008009ULL, 0x000000008000000AULL,  //  9 .. 11
  0x000000008000808BULL, 0x800000000000008BULL, 0x8000000000008089ULL,  // 12 .. 14
  0x8000000000008003ULL, 0x8000000000008002ULL, 0x8000000000000080ULL,  // 15 .. 17
  0x000000000000800AULL, 0x800000008000000AULL, 0x8000000080008081ULL,  // 18 .. 20
  0x8000000000008080ULL, 0x0000000080000001ULL, 0x8000000080008008ULL   // 21 .. 23
};
154
155 //Omit 3rd limb of modulus since it is 0
// Limbs of the P-256 modulus as consumed by the stubs; one limb that is zero
// is left out of the initializer list.
// NOTE(review): the array is declared with 5 elements but only 4 are
// initialized, so element [4] is implicitly zero. Confirm against the users
// of this table whether the extent should be 4.
static const int64_t _modulus_P256[5] = {
  0x000fffffffffffffLL,
  0x00000fffffffffffLL,
  0x0000001000000000LL,
  0x0000ffffffff0000LL
};
160
// Encoding alphabet for standard Base64: maps a 6-bit value (the array
// index) to its output character. Eight entries per source row.
static const char _encodeBlock_toBase64[64] = {
  'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',  //  0 ..  7
  'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',  //  8 .. 15
  'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',  // 16 .. 23
  'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',  // 24 .. 31
  'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',  // 32 .. 39
  'o', 'p', 'q', 'r', 's', 't', 'u', 'v',  // 40 .. 47
  'w', 'x', 'y', 'z', '0', '1', '2', '3',  // 48 .. 55
  '4', '5', '6', '7', '8', '9', '+', '/'   // 56 .. 63
};
168
// Encoding alphabet for the URL-safe Base64 variant: same as the standard
// alphabet except that indices 62 and 63 map to '-' and '_'.
static const char _encodeBlock_toBase64URL[64] = {
  'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',  //  0 ..  7
  'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',  //  8 .. 15
  'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',  // 16 .. 23
  'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',  // 24 .. 31
  'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',  // 32 .. 39
  'o', 'p', 'q', 'r', 's', 't', 'u', 'v',  // 40 .. 47
  'w', 'x', 'y', 'z', '0', '1', '2', '3',  // 48 .. 55
  '4', '5', '6', '7', '8', '9', '-', '_'   // 56 .. 63
};
176
// Non-SIMD lookup tables are mostly dumped from the fromBase64 array used in java.util.Base64,
// except that the trailing character '=' is also treated as an illegal value in this intrinsic.
// That is, java.util.Base64.fromBase64['='] = -2, while fromBase(URL)64ForNoSIMD['='] = 255 here.
// Scalar decode table for standard Base64, indexed by the input byte.
// A value of 255 marks a byte that is not part of the alphabet; every other
// value is the decoded 6-bit quantity. Sixteen entries per source row.
static const uint8_t _decodeBlock_fromBase64ForNoSIMD[256] = {
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,  // 0x00
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,  // 0x10
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,  62, 255, 255, 255,  63,  // 0x20: '+' '/'
   52,  53,  54,  55,  56,  57,  58,  59,  60,  61, 255, 255, 255, 255, 255, 255,  // 0x30: '0'..'9'
  255,   0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,  14,  // 0x40: 'A'..'O'
   15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25, 255, 255, 255, 255, 255,  // 0x50: 'P'..'Z'
  255,  26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,  40,  // 0x60: 'a'..'o'
   41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51, 255, 255, 255, 255, 255,  // 0x70: 'p'..'z'
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,  // 0x80
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,  // 0x90
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,  // 0xa0
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,  // 0xb0
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,  // 0xc0
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,  // 0xd0
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,  // 0xe0
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,  // 0xf0
};
198
// Scalar decode table for URL-safe Base64, indexed by the input byte.
// A value of 255 marks a byte that is not part of the alphabet; every other
// value is the decoded 6-bit quantity. Sixteen entries per source row.
static const uint8_t _decodeBlock_fromBase64URLForNoSIMD[256] = {
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,  // 0x00
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,  // 0x10
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,  62, 255, 255,  // 0x20: '-'
   52,  53,  54,  55,  56,  57,  58,  59,  60,  61, 255, 255, 255, 255, 255, 255,  // 0x30: '0'..'9'
  255,   0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,  14,  // 0x40: 'A'..'O'
   15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25, 255, 255, 255, 255,  63,  // 0x50: 'P'..'Z' '_'
  255,  26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,  40,  // 0x60: 'a'..'o'
   41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51, 255, 255, 255, 255, 255,  // 0x70: 'p'..'z'
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,  // 0x80
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,  // 0x90
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,  // 0xa0
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,  // 0xb0
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,  // 0xc0
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,  // 0xd0
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,  // 0xe0
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,  // 0xf0
};
217
218 // A legal value of base64 code is in range [0, 127]. We need two lookups
219 // with tbl/tbx and combine them to get the decode data. The 1st table vector
220 // lookup use tbl, out of range indices are set to 0 in destination. The 2nd
221 // table vector lookup use tbx, out of range indices are unchanged in
222 // destination. Input [64..126] is mapped to index [65, 127] in second lookup.
223 // The value of index 64 is set to 0, so that we know that we already get the
224 // decoded data with the 1st lookup.
// Vector decode table for standard Base64, split into two 64-byte halves.
// Entries 0..63 are indexed directly by the input byte; entry 64 is 0 and
// entries 65..127 hold the values for input bytes 64..126 (shifted up by
// one). 255 marks a byte that is not part of the alphabet.
static const uint8_t _decodeBlock_fromBase64ForSIMD[128] = {
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,  //   0 ..  15
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,  //  16 ..  31
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,  62, 255, 255, 255,  63,  //  32 ..  47
   52,  53,  54,  55,  56,  57,  58,  59,  60,  61, 255, 255, 255, 255, 255, 255,  //  48 ..  63
    0, 255,   0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,  //  64 ..  79
   14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25, 255, 255, 255, 255,  //  80 ..  95
  255, 255,  26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,  //  96 .. 111
   40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51, 255, 255, 255, 255,  // 112 .. 127
};
235
// Vector decode table for URL-safe Base64, split into two 64-byte halves.
// Entries 0..63 are indexed directly by the input byte; entry 64 is 0 and
// entries 65..127 hold the values for input bytes 64..126 (shifted up by
// one). 255 marks a byte that is not part of the alphabet.
static const uint8_t _decodeBlock_fromBase64URLForSIMD[128] = {
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,  //   0 ..  15
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,  //  16 ..  31
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,  62, 255, 255,  //  32 ..  47
   52,  53,  54,  55,  56,  57,  58,  59,  60,  61, 255, 255, 255, 255, 255, 255,  //  48 ..  63
    0, 255,   0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,  //  64 ..  79
   14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25, 255, 255, 255, 255,  //  80 ..  95
   63, 255,  26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,  //  96 .. 111
   40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51, 255, 255, 255, 255,  // 112 .. 127
};
246
247
248 // Stub Code definitions
249
250 class StubGenerator: public StubCodeGenerator {
251 private:
252
253 #ifdef PRODUCT
254 #define inc_counter_np(counter) ((void)0)
255 #else
256 void inc_counter_np_(uint& counter) {
257 __ incrementw(ExternalAddress((address)&counter));
258 }
259 #define inc_counter_np(counter) \
260 BLOCK_COMMENT("inc_counter " #counter); \
261 inc_counter_np_(counter);
262 #endif
263
264 // Call stubs are used to call Java from C
265 //
266 // Arguments:
267 // c_rarg0: call wrapper address address
268 // c_rarg1: result address
269 // c_rarg2: result type BasicType
270 // c_rarg3: method Method*
271 // c_rarg4: (interpreter) entry point address
272 // c_rarg5: parameters intptr_t*
273 // c_rarg6: parameter size (in words) int
274 // c_rarg7: thread Thread*
275 //
276 // There is no return from the stub itself as any Java result
277 // is written to result
278 //
279 // we save r30 (lr) as the return PC at the base of the frame and
280 // link r29 (fp) below it as the frame pointer installing sp (r31)
281 // into fp.
282 //
283 // we save r0-r7, which accounts for all the c arguments.
284 //
285 // TODO: strictly do we need to save them all? they are treated as
286 // volatile by C so could we omit saving the ones we are going to
287 // place in global registers (thread? method?) or those we only use
288 // during setup of the Java call?
289 //
290 // we don't need to save r8 which C uses as an indirect result location
291 // return register.
292 //
293 // we don't need to save r9-r15 which both C and Java treat as
294 // volatile
295 //
296 // we don't need to save r16-18 because Java does not use them
297 //
298 // we save r19-r28 which Java uses as scratch registers and C
299 // expects to be callee-save
300 //
301 // we save the bottom 64 bits of each value stored in v8-v15; it is
302 // the responsibility of the caller to preserve larger values.
303 //
304 // so the stub frame looks like this when we enter Java code
305 //
306 // [ return_from_Java ] <--- sp
307 // [ argument word n ]
308 // ...
309 // -29 [ argument word 1 ]
// -28 [ saved Floating-point Control Register ] <--- sp_after_call
// -26 [ saved v15 ]
312 // -25 [ saved v14 ]
313 // -24 [ saved v13 ]
314 // -23 [ saved v12 ]
315 // -22 [ saved v11 ]
316 // -21 [ saved v10 ]
317 // -20 [ saved v9 ]
318 // -19 [ saved v8 ]
319 // -18 [ saved r28 ]
320 // -17 [ saved r27 ]
321 // -16 [ saved r26 ]
322 // -15 [ saved r25 ]
323 // -14 [ saved r24 ]
324 // -13 [ saved r23 ]
325 // -12 [ saved r22 ]
326 // -11 [ saved r21 ]
327 // -10 [ saved r20 ]
328 // -9 [ saved r19 ]
329 // -8 [ call wrapper (r0) ]
330 // -7 [ result (r1) ]
331 // -6 [ result type (r2) ]
332 // -5 [ method (r3) ]
333 // -4 [ entry point (r4) ]
334 // -3 [ parameters (r5) ]
335 // -2 [ parameter size (r6) ]
336 // -1 [ thread (r7) ]
337 // 0 [ saved fp (r29) ] <--- fp == saved sp (r31)
338 // 1 [ saved lr (r30) ]
339
340 // Call stub stack layout word offsets from fp
// Word offsets, relative to rfp, of the slots in the call stub frame.
// Listed from the top of the frame downwards. Where a slot is written with a
// store-pair, the named offset is the lower of the two words and the second
// register lands one word above it.
enum call_stub_layout {
  retaddr_off        =   1,   // saved lr (r30)
  fp_f               =   0,   // saved fp (r29)

  // incoming C arguments
  thread_off         =  -1,   // c_rarg7
  parameter_size_off =  -2,   // c_rarg6
  entry_point_off    =  -4,   // c_rarg4, with c_rarg5 (parameters) at -3
  method_off         =  -5,   // c_rarg3
  result_type_off    =  -6,   // c_rarg2
  result_off         =  -7,   // c_rarg1
  call_wrapper_off   =  -8,   // c_rarg0

  // general-purpose register pairs
  r20_off            = -10,   // r20, with r19 at  -9
  r22_off            = -12,   // r22, with r21 at -11
  r24_off            = -14,   // r24, with r23 at -13
  r26_off            = -16,   // r26, with r25 at -15
  r28_off            = -18,   // r28, with r27 at -17

  // low 64 bits of the vector register pairs
  d9_off             = -20,   // v9,  with v8  at -19
  d11_off            = -22,   // v11, with v10 at -21
  d13_off            = -24,   // v13, with v12 at -23
  d15_off            = -26,   // v15, with v14 at -25

  // lowest slot of the save area: holds the saved FPCR, and is where sp
  // points once the frame has been set up
  fpcr_off           = -28,
  sp_after_call_off  = fpcr_off,
};
365
366 address generate_call_stub(address& return_address) {
367 assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
368 (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
369 "adjust this code");
370
371 StubId stub_id = StubId::stubgen_call_stub_id;
372 GrowableArray<address> entries;
373 int entry_count = StubInfo::entry_count(stub_id);
374 assert(entry_count == 2, "sanity check");
375 address start = load_archive_data(stub_id, &entries);
376 if (start != nullptr) {
377 assert(entries.length() == 1, "expected 1 extra entry");
378 return_address = entries.at(0);
379 return start;
380 }
381 StubCodeMark mark(this, stub_id);
382 start = __ pc();
383
384 const Address sp_after_call (rfp, sp_after_call_off * wordSize);
385
386 const Address fpcr_save (rfp, fpcr_off * wordSize);
387 const Address call_wrapper (rfp, call_wrapper_off * wordSize);
388 const Address result (rfp, result_off * wordSize);
389 const Address result_type (rfp, result_type_off * wordSize);
390 const Address method (rfp, method_off * wordSize);
391 const Address entry_point (rfp, entry_point_off * wordSize);
392 const Address parameter_size(rfp, parameter_size_off * wordSize);
393
394 const Address thread (rfp, thread_off * wordSize);
395
396 const Address d15_save (rfp, d15_off * wordSize);
397 const Address d13_save (rfp, d13_off * wordSize);
398 const Address d11_save (rfp, d11_off * wordSize);
399 const Address d9_save (rfp, d9_off * wordSize);
400
401 const Address r28_save (rfp, r28_off * wordSize);
402 const Address r26_save (rfp, r26_off * wordSize);
403 const Address r24_save (rfp, r24_off * wordSize);
404 const Address r22_save (rfp, r22_off * wordSize);
405 const Address r20_save (rfp, r20_off * wordSize);
406
407 // stub code
408
409 address aarch64_entry = __ pc();
410
411 // set up frame and move sp to end of save area
412 __ enter();
413 __ sub(sp, rfp, -sp_after_call_off * wordSize);
414
415 // save register parameters and Java scratch/global registers
416 // n.b. we save thread even though it gets installed in
417 // rthread because we want to sanity check rthread later
418 __ str(c_rarg7, thread);
419 __ strw(c_rarg6, parameter_size);
420 __ stp(c_rarg4, c_rarg5, entry_point);
421 __ stp(c_rarg2, c_rarg3, result_type);
422 __ stp(c_rarg0, c_rarg1, call_wrapper);
423
424 __ stp(r20, r19, r20_save);
425 __ stp(r22, r21, r22_save);
426 __ stp(r24, r23, r24_save);
427 __ stp(r26, r25, r26_save);
428 __ stp(r28, r27, r28_save);
429
430 __ stpd(v9, v8, d9_save);
431 __ stpd(v11, v10, d11_save);
432 __ stpd(v13, v12, d13_save);
433 __ stpd(v15, v14, d15_save);
434
435 __ get_fpcr(rscratch1);
436 __ str(rscratch1, fpcr_save);
437 // Set FPCR to the state we need. We do want Round to Nearest. We
438 // don't want non-IEEE rounding modes or floating-point traps.
439 __ bfi(rscratch1, zr, 22, 4); // Clear DN, FZ, and Rmode
440 __ bfi(rscratch1, zr, 8, 5); // Clear exception-control bits (8-12)
441 __ set_fpcr(rscratch1);
442
443 // install Java thread in global register now we have saved
444 // whatever value it held
445 __ mov(rthread, c_rarg7);
446 // And method
447 __ mov(rmethod, c_rarg3);
448
449 // set up the heapbase register
450 __ reinit_heapbase();
451
452 #ifdef ASSERT
453 // make sure we have no pending exceptions
454 {
455 Label L;
456 __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
457 __ cmp(rscratch1, (u1)NULL_WORD);
458 __ br(Assembler::EQ, L);
459 __ stop("StubRoutines::call_stub: entered with pending exception");
460 __ BIND(L);
461 }
462 #endif
463 // pass parameters if any
464 __ mov(esp, sp);
465 __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
466 __ andr(sp, rscratch1, -2 * wordSize);
467
468 BLOCK_COMMENT("pass parameters if any");
469 Label parameters_done;
470 // parameter count is still in c_rarg6
471 // and parameter pointer identifying param 1 is in c_rarg5
472 __ cbzw(c_rarg6, parameters_done);
473
474 address loop = __ pc();
475 __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
476 __ subsw(c_rarg6, c_rarg6, 1);
477 __ push(rscratch1);
478 __ br(Assembler::GT, loop);
479
480 __ BIND(parameters_done);
481
482 // call Java entry -- passing methdoOop, and current sp
483 // rmethod: Method*
484 // r19_sender_sp: sender sp
485 BLOCK_COMMENT("call Java function");
486 __ mov(r19_sender_sp, sp);
487 __ blr(c_rarg4);
488
489 // we do this here because the notify will already have been done
490 // if we get to the next instruction via an exception
491 //
492 // n.b. adding this instruction here affects the calculation of
493 // whether or not a routine returns to the call stub (used when
494 // doing stack walks) since the normal test is to check the return
495 // pc against the address saved below. so we may need to allow for
496 // this extra instruction in the check.
497
498 // save current address for use by exception handling code
499
500 return_address = __ pc();
501 entries.append(return_address);
502
503 // store result depending on type (everything that is not
504 // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
505 // n.b. this assumes Java returns an integral result in r0
506 // and a floating result in j_farg0
507 // All of j_rargN may be used to return inline type fields so be careful
508 // not to clobber those.
509 // SharedRuntime::generate_buffered_inline_type_adapter() knows the register
510 // assignment of Rresult below.
511 Register Rresult = r14, Rresult_type = r15;
512 __ ldr(Rresult, result);
513 Label is_long, is_float, is_double, check_prim, exit;
514 __ ldr(Rresult_type, result_type);
515 __ cmp(Rresult_type, (u1)T_OBJECT);
516 __ br(Assembler::EQ, check_prim);
517 __ cmp(Rresult_type, (u1)T_LONG);
518 __ br(Assembler::EQ, is_long);
519 __ cmp(Rresult_type, (u1)T_FLOAT);
520 __ br(Assembler::EQ, is_float);
521 __ cmp(Rresult_type, (u1)T_DOUBLE);
522 __ br(Assembler::EQ, is_double);
523
524 // handle T_INT case
525 __ strw(r0, Address(Rresult));
526
527 __ BIND(exit);
528
529 // pop parameters
530 __ sub(esp, rfp, -sp_after_call_off * wordSize);
531
532 #ifdef ASSERT
533 // verify that threads correspond
534 {
535 Label L, S;
536 __ ldr(rscratch1, thread);
537 __ cmp(rthread, rscratch1);
538 __ br(Assembler::NE, S);
539 __ get_thread(rscratch1);
540 __ cmp(rthread, rscratch1);
541 __ br(Assembler::EQ, L);
542 __ BIND(S);
543 __ stop("StubRoutines::call_stub: threads must correspond");
544 __ BIND(L);
545 }
546 #endif
547
548 __ pop_cont_fastpath(rthread);
549
550 // restore callee-save registers
551 __ ldpd(v15, v14, d15_save);
552 __ ldpd(v13, v12, d13_save);
553 __ ldpd(v11, v10, d11_save);
554 __ ldpd(v9, v8, d9_save);
555
556 __ ldp(r28, r27, r28_save);
557 __ ldp(r26, r25, r26_save);
558 __ ldp(r24, r23, r24_save);
559 __ ldp(r22, r21, r22_save);
560 __ ldp(r20, r19, r20_save);
561
562 // restore fpcr
563 __ ldr(rscratch1, fpcr_save);
564 __ set_fpcr(rscratch1);
565
566 __ ldp(c_rarg0, c_rarg1, call_wrapper);
567 __ ldrw(c_rarg2, result_type);
568 __ ldr(c_rarg3, method);
569 __ ldp(c_rarg4, c_rarg5, entry_point);
570 __ ldp(c_rarg6, c_rarg7, parameter_size);
571
572 // leave frame and return to caller
573 __ leave();
574 __ ret(lr);
575
576 // handle return types different from T_INT
577 __ BIND(check_prim);
578 if (InlineTypeReturnedAsFields) {
579 // Check for scalarized return value
580 __ tbz(r0, 0, is_long);
581 // Load pack handler address
582 __ andr(rscratch1, r0, -2);
583 __ ldr(rscratch1, Address(rscratch1, InlineKlass::adr_members_offset()));
584 __ ldr(rscratch1, Address(rscratch1, InlineKlass::pack_handler_jobject_offset()));
585 __ blr(rscratch1);
586 __ b(exit);
587 }
588
589 __ BIND(is_long);
590 __ str(r0, Address(Rresult, 0));
591 __ br(Assembler::AL, exit);
592
593 __ BIND(is_float);
594 __ strs(j_farg0, Address(Rresult, 0));
595 __ br(Assembler::AL, exit);
596
597 __ BIND(is_double);
598 __ strd(j_farg0, Address(Rresult, 0));
599 __ br(Assembler::AL, exit);
600
601 // record the stub entry and end plus the auxiliary entry
602 store_archive_data(stub_id, start, __ pc(), &entries);
603
604 return start;
605 }
606
607 // Return point for a Java call if there's an exception thrown in
608 // Java code. The exception is caught and transformed into a
609 // pending exception stored in JavaThread that can be tested from
610 // within the VM.
611 //
612 // Note: Usually the parameters are removed by the callee. In case
613 // of an exception crossing an activation frame boundary, that is
614 // not the case if the callee is compiled code => need to setup the
615 // rsp.
616 //
617 // r0: exception oop
618
619 address generate_catch_exception() {
620 StubId stub_id = StubId::stubgen_catch_exception_id;
621 int entry_count = StubInfo::entry_count(stub_id);
622 assert(entry_count == 1, "sanity check");
623 address start = load_archive_data(stub_id);
624 if (start != nullptr) {
625 return start;
626 }
627 StubCodeMark mark(this, stub_id);
628 start = __ pc();
629
630 // same as in generate_call_stub():
631 const Address sp_after_call(rfp, sp_after_call_off * wordSize);
632 const Address thread (rfp, thread_off * wordSize);
633
634 #ifdef ASSERT
635 // verify that threads correspond
636 {
637 Label L, S;
638 __ ldr(rscratch1, thread);
639 __ cmp(rthread, rscratch1);
640 __ br(Assembler::NE, S);
641 __ get_thread(rscratch1);
642 __ cmp(rthread, rscratch1);
643 __ br(Assembler::EQ, L);
644 __ bind(S);
645 __ stop("StubRoutines::catch_exception: threads must correspond");
646 __ bind(L);
647 }
648 #endif
649
650 // set pending exception
651 __ verify_oop(r0);
652
653 __ str(r0, Address(rthread, Thread::pending_exception_offset()));
654 // special case -- add file name string to AOT address table
655 address file = (address)AOTCodeCache::add_C_string(__FILE__);
656 __ lea(rscratch1, ExternalAddress(file));
657 __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
658 __ movw(rscratch1, (int)__LINE__);
659 __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
660
661 // complete return to VM
662 assert(StubRoutines::_call_stub_return_address != nullptr,
663 "_call_stub_return_address must have been generated before");
664 __ b(RuntimeAddress(StubRoutines::_call_stub_return_address));
665
666 // record the stub entry and end
667 store_archive_data(stub_id, start, __ pc());
668
669 return start;
670 }
671
672 // Continuation point for runtime calls returning with a pending
673 // exception. The pending exception check happened in the runtime
674 // or native call stub. The pending exception in Thread is
675 // converted into a Java-level exception.
676 //
677 // Contract with Java-level exception handlers:
678 // r0: exception
679 // r3: throwing pc
680 //
681 // NOTE: At entry of this stub, exception-pc must be in LR !!
682
683 // NOTE: this is always used as a jump target within generated code
684 // so it just needs to be generated code with no x86 prolog
685
686 address generate_forward_exception() {
687 StubId stub_id = StubId::stubgen_forward_exception_id;
688 int entry_count = StubInfo::entry_count(stub_id);
689 assert(entry_count == 1, "sanity check");
690 address start = load_archive_data(stub_id);
691 if (start != nullptr) {
692 return start;
693 }
694 StubCodeMark mark(this, stub_id);
695 start = __ pc();
696
697 // Upon entry, LR points to the return address returning into
698 // Java (interpreted or compiled) code; i.e., the return address
699 // becomes the throwing pc.
700 //
701 // Arguments pushed before the runtime call are still on the stack
702 // but the exception handler will reset the stack pointer ->
703 // ignore them. A potential result in registers can be ignored as
704 // well.
705
706 #ifdef ASSERT
707 // make sure this code is only executed if there is a pending exception
708 {
709 Label L;
710 __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
711 __ cbnz(rscratch1, L);
712 __ stop("StubRoutines::forward exception: no pending exception (1)");
713 __ bind(L);
714 }
715 #endif
716
717 // compute exception handler into r19
718
719 // call the VM to find the handler address associated with the
720 // caller address. pass thread in r0 and caller pc (ret address)
721 // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
722 // the stack.
723 __ mov(c_rarg1, lr);
724 // lr will be trashed by the VM call so we move it to R19
725 // (callee-saved) because we also need to pass it to the handler
726 // returned by this call.
727 __ mov(r19, lr);
728 BLOCK_COMMENT("call exception_handler_for_return_address");
729 __ call_VM_leaf(CAST_FROM_FN_PTR(address,
730 SharedRuntime::exception_handler_for_return_address),
731 rthread, c_rarg1);
732 // Reinitialize the ptrue predicate register, in case the external runtime
733 // call clobbers ptrue reg, as we may return to SVE compiled code.
734 __ reinitialize_ptrue();
735
736 // we should not really care that lr is no longer the callee
737 // address. we saved the value the handler needs in r19 so we can
738 // just copy it to r3. however, the C2 handler will push its own
739 // frame and then calls into the VM and the VM code asserts that
740 // the PC for the frame above the handler belongs to a compiled
741 // Java method. So, we restore lr here to satisfy that assert.
742 __ mov(lr, r19);
743 // setup r0 & r3 & clear pending exception
744 __ mov(r3, r19);
745 __ mov(r19, r0);
746 __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
747 __ str(zr, Address(rthread, Thread::pending_exception_offset()));
748
749 #ifdef ASSERT
750 // make sure exception is set
751 {
752 Label L;
753 __ cbnz(r0, L);
754 __ stop("StubRoutines::forward exception: no pending exception (2)");
755 __ bind(L);
756 }
757 #endif
758
759 // continue at exception handler
760 // r0: exception
761 // r3: throwing pc
762 // r19: exception handler
763 __ verify_oop(r0);
764 __ br(r19);
765
766 // record the stub entry and end
767 store_archive_data(stub_id, start, __ pc());
768
769 return start;
770 }
771
772 // Non-destructive plausibility checks for oops
773 //
774 // Arguments:
775 // r0: oop to verify
776 // rscratch1: error message
777 //
778 // Stack after saving c_rarg3:
779 // [tos + 0]: saved c_rarg3
780 // [tos + 1]: saved c_rarg2
781 // [tos + 2]: saved lr
782 // [tos + 3]: saved rscratch2
783 // [tos + 4]: saved r0
784 // [tos + 5]: saved rscratch1
785 address generate_verify_oop() {
786 StubId stub_id = StubId::stubgen_verify_oop_id;
787 int entry_count = StubInfo::entry_count(stub_id);
788 assert(entry_count == 1, "sanity check");
789 address start = load_archive_data(stub_id);
790 if (start != nullptr) {
791 return start;
792 }
793 StubCodeMark mark(this, stub_id);
794 start = __ pc();
795
796 Label exit, error;
797
798 // save c_rarg2 and c_rarg3
799 __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
800
801 // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
802 __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
803 __ ldr(c_rarg3, Address(c_rarg2));
804 __ add(c_rarg3, c_rarg3, 1);
805 __ str(c_rarg3, Address(c_rarg2));
806
807 // object is in r0
808 // make sure object is 'reasonable'
809 __ cbz(r0, exit); // if obj is null it is OK
810
811 BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
812 bs_asm->check_oop(_masm, r0, c_rarg2, c_rarg3, error);
813
814 // return if everything seems ok
815 __ bind(exit);
816
817 __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
818 __ ret(lr);
819
820 // handle errors
821 __ bind(error);
822 __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
823
824 __ push(RegSet::range(r0, r29), sp);
825 // debug(char* msg, int64_t pc, int64_t regs[])
826 __ mov(c_rarg0, rscratch1); // pass address of error message
827 __ mov(c_rarg1, lr); // pass return address
828 __ mov(c_rarg2, sp); // pass address of regs on stack
829 #ifndef PRODUCT
830 assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
831 #endif
832 BLOCK_COMMENT("call MacroAssembler::debug");
833 __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
834 __ blr(rscratch1);
835 __ hlt(0);
836
837 // record the stub entry and end
838 store_archive_data(stub_id, start, __ pc());
839
840 return start;
841 }
842
843 // Generate indices for iota vector.
844 void generate_iota_indices(StubId stub_id) {
845 GrowableArray<address> entries;
846 int entry_count = StubInfo::entry_count(stub_id);
847 assert(entry_count == VECTOR_IOTA_COUNT, "sanity check");
848 address start = load_archive_data(stub_id, &entries);
849 if (start != nullptr) {
850 assert(entries.length() == entry_count - 1,
851 "unexpected entries count %d", entries.length());
852 StubRoutines::aarch64::_vector_iota_indices[0] = start;
853 for (int i = 1; i < VECTOR_IOTA_COUNT; i++) {
854 StubRoutines::aarch64::_vector_iota_indices[i] = entries.at(i - 1);
855 }
856 return;
857 }
858 __ align(CodeEntryAlignment);
859 StubCodeMark mark(this, stub_id);
860 start = __ pc();
861 // B
862 __ emit_data64(0x0706050403020100, relocInfo::none);
863 __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none);
864 entries.append(__ pc());
865 // H
866 __ emit_data64(0x0003000200010000, relocInfo::none);
867 __ emit_data64(0x0007000600050004, relocInfo::none);
868 entries.append(__ pc());
869 // S
870 __ emit_data64(0x0000000100000000, relocInfo::none);
871 __ emit_data64(0x0000000300000002, relocInfo::none);
872 entries.append(__ pc());
873 // D
874 __ emit_data64(0x0000000000000000, relocInfo::none);
875 __ emit_data64(0x0000000000000001, relocInfo::none);
876 entries.append(__ pc());
877 // S - FP
878 __ emit_data64(0x3F80000000000000, relocInfo::none); // 0.0f, 1.0f
879 __ emit_data64(0x4040000040000000, relocInfo::none); // 2.0f, 3.0f
880 entries.append(__ pc());
881 // D - FP
882 __ emit_data64(0x0000000000000000, relocInfo::none); // 0.0d
883 __ emit_data64(0x3FF0000000000000, relocInfo::none); // 1.0d
884
885 // record the stub entry and end
886 store_archive_data(stub_id, start, __ pc(), &entries);
887
888 // install the entry addresses in the entry array
889 assert(entries.length() == entry_count - 1,
890 "unexpected entries count %d", entries.length());
891 StubRoutines::aarch64::_vector_iota_indices[0] = start;
892 for (int i = 1; i < VECTOR_IOTA_COUNT; i++) {
893 StubRoutines::aarch64::_vector_iota_indices[i] = entries.at(i - 1);
894 }
895 }
896
897 // The inner part of zero_words(). This is the bulk operation,
898 // zeroing words in blocks, possibly using DC ZVA to do it. The
899 // caller is responsible for zeroing the last few words.
900 //
901 // Inputs:
902 // r10: the HeapWord-aligned base address of an array to zero.
903 // r11: the count in HeapWords, r11 > 0.
904 //
905 // Returns r10 and r11, adjusted for the caller to clear.
906 // r10: the base address of the tail of words left to clear.
907 // r11: the number of words in the tail.
908 // r11 < MacroAssembler::zero_words_block_size.
909
910 address generate_zero_blocks() {
911 StubId stub_id = StubId::stubgen_zero_blocks_id;
912 int entry_count = StubInfo::entry_count(stub_id);
913 assert(entry_count == 1, "sanity check");
914 address start = load_archive_data(stub_id);
915 if (start != nullptr) {
916 return start;
917 }
918 __ align(CodeEntryAlignment);
919 StubCodeMark mark(this, stub_id);
920 Label done;
921 Label base_aligned;
922
923 Register base = r10, cnt = r11;
924
925 start = __ pc();
926
927 if (UseBlockZeroing) {
928 int zva_length = VM_Version::zva_length();
929
930 // Ensure ZVA length can be divided by 16. This is required by
931 // the subsequent operations.
932 assert (zva_length % 16 == 0, "Unexpected ZVA Length");
933
934 __ tbz(base, 3, base_aligned);
935 __ str(zr, Address(__ post(base, 8)));
936 __ sub(cnt, cnt, 1);
937 __ bind(base_aligned);
938
939 // Ensure count >= zva_length * 2 so that it still deserves a zva after
940 // alignment.
941 Label small;
942 int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
943 __ subs(rscratch1, cnt, low_limit >> 3);
944 __ br(Assembler::LT, small);
945 __ zero_dcache_blocks(base, cnt);
946 __ bind(small);
947 }
948
949 {
950 // Number of stp instructions we'll unroll
951 const int unroll =
952 MacroAssembler::zero_words_block_size / 2;
953 // Clear the remaining blocks.
954 Label loop;
955 __ subs(cnt, cnt, unroll * 2);
956 __ br(Assembler::LT, done);
957 __ bind(loop);
958 for (int i = 0; i < unroll; i++)
959 __ stp(zr, zr, __ post(base, 16));
960 __ subs(cnt, cnt, unroll * 2);
961 __ br(Assembler::GE, loop);
962 __ bind(done);
963 __ add(cnt, cnt, unroll * 2);
964 }
965
966 __ ret(lr);
967
968 // record the stub entry and end
969 store_archive_data(stub_id, start, __ pc());
970
971 return start;
972 }
973
974
// Direction of a bulk copy. The numeric values are significant: they
// are multiplied with a size to obtain a signed address step.
enum copy_direction {
  copy_backwards = -1,
  copy_forwards  = 1
};
979
980 // Helper object to reduce noise when telling the GC barriers how to perform loads and stores
981 // for arraycopy stubs.
982 class ArrayCopyBarrierSetHelper : StackObj {
983 BarrierSetAssembler* _bs_asm;
984 MacroAssembler* _masm;
985 DecoratorSet _decorators;
986 BasicType _type;
987 Register _gct1;
988 Register _gct2;
989 Register _gct3;
990 FloatRegister _gcvt1;
991 FloatRegister _gcvt2;
992 FloatRegister _gcvt3;
993
994 public:
995 ArrayCopyBarrierSetHelper(MacroAssembler* masm,
996 DecoratorSet decorators,
997 BasicType type,
998 Register gct1,
999 Register gct2,
1000 Register gct3,
1001 FloatRegister gcvt1,
1002 FloatRegister gcvt2,
1003 FloatRegister gcvt3)
1004 : _bs_asm(BarrierSet::barrier_set()->barrier_set_assembler()),
1005 _masm(masm),
1006 _decorators(decorators),
1007 _type(type),
1008 _gct1(gct1),
1009 _gct2(gct2),
1010 _gct3(gct3),
1011 _gcvt1(gcvt1),
1012 _gcvt2(gcvt2),
1013 _gcvt3(gcvt3) {
1014 }
1015
1016 void copy_load_at_32(FloatRegister dst1, FloatRegister dst2, Address src) {
1017 _bs_asm->copy_load_at(_masm, _decorators, _type, 32,
1018 dst1, dst2, src,
1019 _gct1, _gct2, _gcvt1);
1020 }
1021
1022 void copy_store_at_32(Address dst, FloatRegister src1, FloatRegister src2) {
1023 _bs_asm->copy_store_at(_masm, _decorators, _type, 32,
1024 dst, src1, src2,
1025 _gct1, _gct2, _gct3, _gcvt1, _gcvt2, _gcvt3);
1026 }
1027
1028 void copy_load_at_16(Register dst1, Register dst2, Address src) {
1029 _bs_asm->copy_load_at(_masm, _decorators, _type, 16,
1030 dst1, dst2, src,
1031 _gct1);
1032 }
1033
1034 void copy_store_at_16(Address dst, Register src1, Register src2) {
1035 _bs_asm->copy_store_at(_masm, _decorators, _type, 16,
1036 dst, src1, src2,
1037 _gct1, _gct2, _gct3);
1038 }
1039
1040 void copy_load_at_8(Register dst, Address src) {
1041 _bs_asm->copy_load_at(_masm, _decorators, _type, 8,
1042 dst, noreg, src,
1043 _gct1);
1044 }
1045
1046 void copy_store_at_8(Address dst, Register src) {
1047 _bs_asm->copy_store_at(_masm, _decorators, _type, 8,
1048 dst, src, noreg,
1049 _gct1, _gct2, _gct3);
1050 }
1051 };
1052
// Bulk copy of blocks of 8 words.
//
// Generates one of the copy_{byte,oop,oop_uninit}_{f,b} stubs. The
// stub_id selects the copy direction and the element type (T_BYTE or
// T_OBJECT) that is handed, together with decorators, to the GC
// barrier helper for every load and store.
//
// Arguments:
//   stub_id:    which of the six copy stubs to generate
//   decorators: access decorators passed to the barrier set assembler
//   s, d:       source and destination address registers
//   count:      register holding a count of words
//
// Precondition: count >= 8
//
// Postconditions:
//
// The least significant bit of count contains the remaining count
// of words to copy.  The rest of count is trash.
//
// s and d are adjusted to point to the remaining words to copy
//
// Returns the entry address of the stub.
//
address generate_copy_longs(StubId stub_id, DecoratorSet decorators, Register s, Register d, Register count) {
  int entry_count = StubInfo::entry_count(stub_id);
  assert(entry_count == 1, "sanity check");
  // nothing to generate if load_archive_data() already supplies an entry
  address start = load_archive_data(stub_id);
  if (start != nullptr) {
    return start;
  }
  BasicType type;
  copy_direction direction;

  // derive direction and element type from the stub id. the "uninit"
  // variants use the same type and direction as the plain oop variants.
  switch (stub_id) {
  case StubId::stubgen_copy_byte_f_id:
    direction = copy_forwards;
    type = T_BYTE;
    break;
  case StubId::stubgen_copy_byte_b_id:
    direction = copy_backwards;
    type = T_BYTE;
    break;
  case StubId::stubgen_copy_oop_f_id:
    direction = copy_forwards;
    type = T_OBJECT;
    break;
  case StubId::stubgen_copy_oop_b_id:
    direction = copy_backwards;
    type = T_OBJECT;
    break;
  case StubId::stubgen_copy_oop_uninit_f_id:
    direction = copy_forwards;
    type = T_OBJECT;
    break;
  case StubId::stubgen_copy_oop_uninit_b_id:
    direction = copy_backwards;
    type = T_OBJECT;
    break;
  default:
    ShouldNotReachHere();
  }

  // unit is the signed size of one word: +wordSize when copying
  // forwards, -wordSize when copying backwards.
  int unit = wordSize * direction;
  // for a forwards copy s and d are moved back by bias bytes so that
  // the first access at offset (bias / wordSize) * unit lands on the
  // original address; the same offsets * unit then work for both
  // directions.
  int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;

  const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
    t4 = r7, t5 = r11, t6 = r12, t7 = r13;
  const Register stride = r14;
  const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
  const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
  ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);

  assert_different_registers(rscratch1, rscratch2, t0, t1, t2, t3, t4, t5, t6, t7);
  assert_different_registers(s, d, count, rscratch1, rscratch2);

  Label again, drain;

  __ align(CodeEntryAlignment);

  StubCodeMark mark(this, stub_id);

  start = __ pc();

  // when unaligned accesses are to be avoided, a destination with bit 3
  // set is handled by the separate code path generated further below.
  Label unaligned_copy_long;
  if (AvoidUnalignedAccesses) {
    __ tbnz(d, 3, unaligned_copy_long);
  }

  if (direction == copy_forwards) {
    __ sub(s, s, bias);
    __ sub(d, d, bias);
  }

#ifdef ASSERT
  // Make sure we are never given < 8 words
  {
    Label L;
    __ cmp(count, (u1)8);
    __ br(Assembler::GE, L);
    __ stop("genrate_copy_longs called with < 8 words");
    __ bind(L);
  }
#endif

  // Fill 8 registers
  if (UseSIMDForMemoryOps) {
    bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
    bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
  } else {
    bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
    bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
    bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
    bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
  }

  // 8 words are now in flight. if fewer than 16 words were requested
  // there is no second full block to load, so go straight to storing
  // the 8 words we hold.
  __ subs(count, count, 16);
  __ br(Assembler::LO, drain);

  // for a backwards copy the prefetch offset is negated; when the
  // interval exceeds 256 the offset is placed in the stride register
  // and a register-offset address is used for the prefetch instead of
  // an immediate offset.
  int prefetch = PrefetchCopyIntervalInBytes;
  bool use_stride = false;
  if (direction == copy_backwards) {
    use_stride = prefetch > 256;
    prefetch = -prefetch;
    if (use_stride) __ mov(stride, prefetch);
  }

  // main loop: store the 8 words loaded by the previous iteration
  // while loading the next 8.
  __ bind(again);

  if (PrefetchCopyIntervalInBytes > 0)
    __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

  if (UseSIMDForMemoryOps) {
    bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
    bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
    bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
    bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
  } else {
    bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
    bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
    bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
    bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
    bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
    bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
    bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
    bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
  }

  __ subs(count, count, 8);
  __ br(Assembler::HS, again);

  // Drain
  //
  // store the last 8 words that were loaded but not yet written
  __ bind(drain);
  if (UseSIMDForMemoryOps) {
    bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
    bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
  } else {
    bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
    bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
    bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
    bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
  }

  // copy any remaining 4 word and/or 2 word subblock; bits 2 and 1 of
  // count say whether each is present.
  {
    Label L1, L2;
    __ tbz(count, exact_log2(4), L1);
    if (UseSIMDForMemoryOps) {
      bs.copy_load_at_32(v0, v1, Address(__ pre(s, 4 * unit)));
      bs.copy_store_at_32(Address(__ pre(d, 4 * unit)), v0, v1);
    } else {
      bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
      bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
      bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
      bs.copy_store_at_16(Address(__ pre(d, 4 * unit)), t2, t3);
    }
    __ bind(L1);

    // undo the bias applied on entry so that s and d are true addresses
    // again before the final pair is copied with adjust().
    if (direction == copy_forwards) {
      __ add(s, s, bias);
      __ add(d, d, bias);
    }

    __ tbz(count, 1, L2);
    bs.copy_load_at_16(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
    bs.copy_store_at_16(Address(__ adjust(d, 2 * unit, direction == copy_backwards)), t0, t1);
    __ bind(L2);
  }

  __ ret(lr);

  if (AvoidUnalignedAccesses) {
    Label drain, again;
    // Register order for storing. Order is different for backward copy.

    __ bind(unaligned_copy_long);

    // source address is even aligned, target odd aligned
    //
    // when forward copying word pairs we read long pairs at offsets
    // {0, 2, 4, 6} (in long words). when backwards copying we read
    // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
    // address by -2 in the forwards case so we can compute the
    // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
    // or -1.
    //
    // when forward copying we need to store 1 word, 3 pairs and
    // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
    // zero offset we adjust the destination by -1 which means we
    // have to use offsets { 1, 2, 4, 6, 8} * unit for the stores.
    //
    // When backwards copying we need to store 1 word, 3 pairs and
    // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
    // offsets {1, 3, 5, 7, 8} * unit.

    if (direction == copy_forwards) {
      __ sub(s, s, 16);
      __ sub(d, d, 8);
    }

    // Fill 8 registers
    //
    // for forwards copy s was offset by -16 from the original input
    // value of s so the register contents are at these offsets
    // relative to the 64 byte block addressed by that original input
    // and so on for each successive 64 byte block when s is updated
    //
    // t0 at offset 0,  t1 at offset 8
    // t2 at offset 16, t3 at offset 24
    // t4 at offset 32, t5 at offset 40
    // t6 at offset 48, t7 at offset 56

    // for backwards copy s was not offset so the register contents
    // are at these offsets into the preceding 64 byte block
    // relative to that original input and so on for each successive
    // preceding 64 byte block when s is updated. this explains the
    // slightly counter-intuitive looking pattern of register usage
    // in the stp instructions for backwards copy.
    //
    // t0 at offset -16, t1 at offset -8
    // t2 at offset -32, t3 at offset -24
    // t4 at offset -48, t5 at offset -40
    // t6 at offset -64, t7 at offset -56

    bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
    bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
    bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
    bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));

    // as in the aligned case: with fewer than 16 words there is no
    // second full block, so just store what has been loaded.
    __ subs(count, count, 16);
    __ br(Assembler::LO, drain);

    int prefetch = PrefetchCopyIntervalInBytes;
    bool use_stride = false;
    if (direction == copy_backwards) {
      use_stride = prefetch > 256;
      prefetch = -prefetch;
      if (use_stride) __ mov(stride, prefetch);
    }

    __ bind(again);

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

    if (direction == copy_forwards) {
      // allowing for the offset of -8 the store instructions place
      // registers into the target 64 byte block at the following
      // offsets
      //
      // t0 at offset 0
      // t1 at offset 8,  t2 at offset 16
      // t3 at offset 24, t4 at offset 32
      // t5 at offset 40, t6 at offset 48
      // t7 at offset 56

      bs.copy_store_at_8(Address(d, 1 * unit), t0);
      bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
      bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
      bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
      bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
      bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
      bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
      bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
      bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
    } else {
      // d was not offset when we started so the registers are
      // written into the 64 byte block preceding d with the following
      // offsets
      //
      // t1 at offset -8
      // t3 at offset -24, t0 at offset -16
      // t5 at offset -40, t2 at offset -32
      // t7 at offset -56, t4 at offset -48
      //                   t6 at offset -64
      //
      // note that this matches the offsets previously noted for the
      // loads

      bs.copy_store_at_8(Address(d, 1 * unit), t1);
      bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
      bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
      bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
      bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
      bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
      bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
      bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
      bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 8);
    __ br(Assembler::HS, again);

    // Drain
    //
    // this uses the same pattern of offsets and register arguments
    // as above
    __ bind(drain);
    if (direction == copy_forwards) {
      bs.copy_store_at_8(Address(d, 1 * unit), t0);
      bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
      bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
      bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
      bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
    } else {
      bs.copy_store_at_8(Address(d, 1 * unit), t1);
      bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
      bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
      bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
      bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
    }
    // now we need to copy any remaining part block which may
    // include a 4 word block subblock and/or a 2 word subblock.
    // bits 2 and 1 in the count are the tell-tale for whether we
    // have each such subblock
    {
      Label L1, L2;
      __ tbz(count, exact_log2(4), L1);
      // this is the same as above but copying only 4 longs hence
      // with only one intervening stp between the str instructions
      // but note that the offsets and registers still follow the
      // same pattern
      bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
      bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
      if (direction == copy_forwards) {
        bs.copy_store_at_8(Address(d, 1 * unit), t0);
        bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
        bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t3);
      } else {
        bs.copy_store_at_8(Address(d, 1 * unit), t1);
        bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
        bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t2);
      }
      __ bind(L1);

      __ tbz(count, 1, L2);
      // this is the same as above but copying only 2 longs hence
      // there is no intervening stp between the str instructions
      // but note that the offset and register patterns are still
      // the same
      bs.copy_load_at_16(t0, t1, Address(__ pre(s, 2 * unit)));
      if (direction == copy_forwards) {
        bs.copy_store_at_8(Address(d, 1 * unit), t0);
        bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t1);
      } else {
        bs.copy_store_at_8(Address(d, 1 * unit), t1);
        bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t0);
      }
      __ bind(L2);

      // for forwards copy we need to re-adjust the offsets we
      // applied so that s and d follow the last words written

      if (direction == copy_forwards) {
        __ add(s, s, 16);
        __ add(d, d, 8);
      }

    }

    __ ret(lr);
  }

  // record the stub entry and end
  store_archive_data(stub_id, start, __ pc());

  return start;
}
1429
1430 // Small copy: less than 16 bytes.
1431 //
1432 // NB: Ignores all of the bits of count which represent more than 15
1433 // bytes, so a caller doesn't have to mask them.
1434
1435 void copy_memory_small(DecoratorSet decorators, BasicType type, Register s, Register d, Register count, int step) {
1436 bool is_backwards = step < 0;
1437 size_t granularity = g_uabs(step);
1438 int direction = is_backwards ? -1 : 1;
1439
1440 Label Lword, Lint, Lshort, Lbyte;
1441
1442 assert(granularity
1443 && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
1444
1445 const Register t0 = r3;
1446 const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
1447 ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, fnoreg, fnoreg, fnoreg);
1448
1449 // ??? I don't know if this bit-test-and-branch is the right thing
1450 // to do. It does a lot of jumping, resulting in several
1451 // mispredicted branches. It might make more sense to do this
1452 // with something like Duff's device with a single computed branch.
1453
1454 __ tbz(count, 3 - exact_log2(granularity), Lword);
1455 bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
1456 bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
1457 __ bind(Lword);
1458
1459 if (granularity <= sizeof (jint)) {
1460 __ tbz(count, 2 - exact_log2(granularity), Lint);
1461 __ ldrw(t0, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
1462 __ strw(t0, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
1463 __ bind(Lint);
1464 }
1465
1466 if (granularity <= sizeof (jshort)) {
1467 __ tbz(count, 1 - exact_log2(granularity), Lshort);
1468 __ ldrh(t0, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
1469 __ strh(t0, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
1470 __ bind(Lshort);
1471 }
1472
1473 if (granularity <= sizeof (jbyte)) {
1474 __ tbz(count, 0, Lbyte);
1475 __ ldrb(t0, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
1476 __ strb(t0, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
1477 __ bind(Lbyte);
1478 }
1479 }
1480
// All-singing all-dancing memory copy.
//
// Copy count units of memory from s to d.  The size of a unit is
// step, which can be positive or negative depending on the direction
// of copy.  If is_aligned is false, we align the source address.
//
// Arguments:
//   decorators, type: passed to the GC barrier helper for loads/stores
//                     and used to pick the bulk copy stub
//   is_aligned:       when true only a single-word adjustment is made
//                     to get s 2-word aligned
//   s, d:             source and destination address registers
//   count:            register holding the number of units to copy
//   step:             signed unit size in bytes; negative for a
//                     backwards copy
//
// Small copies are done inline; larger ones call one of the
// copy_*_f / copy_*_b stubs and finish with copy_memory_small.

void copy_memory(DecoratorSet decorators, BasicType type, bool is_aligned,
                 Register s, Register d, Register count, int step) {
  copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
  bool is_backwards = step < 0;
  unsigned int granularity = g_uabs(step);
  const Register t0 = r3, t1 = r4;

  // <= 80 (or 96 for SIMD) bytes do inline. Direction doesn't matter because we always
  // load all the data before writing anything
  Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
  const Register t2 = r5, t3 = r6, t4 = r7, t5 = r11;
  const Register t6 = r12, t7 = r13, t8 = r14, t9 = r15;
  const Register send = r17, dend = r16;
  const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
  const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
  ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);

  if (PrefetchCopyIntervalInBytes > 0)
    __ prfm(Address(s, 0), PLDL1KEEP);
  // anything above the inline limit goes to the bulk copy path
  __ cmp(count, u1((UseSIMDForMemoryOps ? 96:80)/granularity));
  __ br(Assembler::HI, copy_big);

  // send and dend point just past the last source / destination
  // element. the inline cases copy a head addressed from s/d and a
  // tail addressed from send/dend; the two parts may overlap.
  __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
  __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));

  // dispatch on the size in bytes (count is in units, hence the
  // division of each limit by granularity)
  __ cmp(count, u1(16/granularity));
  __ br(Assembler::LS, copy16);

  __ cmp(count, u1(64/granularity));
  __ br(Assembler::HI, copy80);

  __ cmp(count, u1(32/granularity));
  __ br(Assembler::LS, copy32);

  // 33..64 bytes
  if (UseSIMDForMemoryOps) {
    bs.copy_load_at_32(v0, v1, Address(s, 0));
    bs.copy_load_at_32(v2, v3, Address(send, -32));
    bs.copy_store_at_32(Address(d, 0), v0, v1);
    bs.copy_store_at_32(Address(dend, -32), v2, v3);
  } else {
    bs.copy_load_at_16(t0, t1, Address(s, 0));
    bs.copy_load_at_16(t2, t3, Address(s, 16));
    bs.copy_load_at_16(t4, t5, Address(send, -32));
    bs.copy_load_at_16(t6, t7, Address(send, -16));

    bs.copy_store_at_16(Address(d, 0), t0, t1);
    bs.copy_store_at_16(Address(d, 16), t2, t3);
    bs.copy_store_at_16(Address(dend, -32), t4, t5);
    bs.copy_store_at_16(Address(dend, -16), t6, t7);
  }
  __ b(finish);

  // 17..32 bytes
  __ bind(copy32);
  bs.copy_load_at_16(t0, t1, Address(s, 0));
  bs.copy_load_at_16(t6, t7, Address(send, -16));

  bs.copy_store_at_16(Address(d, 0), t0, t1);
  bs.copy_store_at_16(Address(dend, -16), t6, t7);
  __ b(finish);

  // 65..80/96 bytes
  // (96 bytes if SIMD because we do 32 bytes per instruction)
  __ bind(copy80);
  if (UseSIMDForMemoryOps) {
    bs.copy_load_at_32(v0, v1, Address(s, 0));
    bs.copy_load_at_32(v2, v3, Address(s, 32));
    // Unaligned pointers can be an issue for copying.
    // The issue has more chances to happen when granularity of data is
    // less than 4(sizeof(jint)). Pointers for arrays of jint are at least
    // 4 byte aligned. Pointers for arrays of jlong are 8 byte aligned.
    // The most performance drop has been seen for the range 65-80 bytes.
    // For such cases using the pair of ldp/stp instead of the third pair of
    // ldpq/stpq fixes the performance issue.
    if (granularity < sizeof (jint)) {
      Label copy96;
      __ cmp(count, u1(80/granularity));
      __ br(Assembler::HI, copy96);
      // 65..80 bytes: the tail is a 16 byte pair rather than 32 bytes
      bs.copy_load_at_16(t0, t1, Address(send, -16));

      bs.copy_store_at_32(Address(d, 0), v0, v1);
      bs.copy_store_at_32(Address(d, 32), v2, v3);

      bs.copy_store_at_16(Address(dend, -16), t0, t1);
      __ b(finish);

      __ bind(copy96);
    }
    bs.copy_load_at_32(v4, v5, Address(send, -32));

    bs.copy_store_at_32(Address(d, 0), v0, v1);
    bs.copy_store_at_32(Address(d, 32), v2, v3);

    bs.copy_store_at_32(Address(dend, -32), v4, v5);
  } else {
    bs.copy_load_at_16(t0, t1, Address(s, 0));
    bs.copy_load_at_16(t2, t3, Address(s, 16));
    bs.copy_load_at_16(t4, t5, Address(s, 32));
    bs.copy_load_at_16(t6, t7, Address(s, 48));
    bs.copy_load_at_16(t8, t9, Address(send, -16));

    bs.copy_store_at_16(Address(d, 0), t0, t1);
    bs.copy_store_at_16(Address(d, 16), t2, t3);
    bs.copy_store_at_16(Address(d, 32), t4, t5);
    bs.copy_store_at_16(Address(d, 48), t6, t7);
    bs.copy_store_at_16(Address(dend, -16), t8, t9);
  }
  __ b(finish);

  // 0..16 bytes
  __ bind(copy16);
  __ cmp(count, u1(8/granularity));
  __ br(Assembler::LO, copy8);

  // 8..16 bytes
  bs.copy_load_at_8(t0, Address(s, 0));
  bs.copy_load_at_8(t1, Address(send, -8));
  bs.copy_store_at_8(Address(d, 0), t0);
  bs.copy_store_at_8(Address(dend, -8), t1);
  __ b(finish);

  // the sub-word cases below are only generated for unit sizes small
  // enough to need them; otherwise copy8 / copy4 are bound at the end
  // of this function, next to finish.
  if (granularity < 8) {
    // 4..7 bytes
    __ bind(copy8);
    __ tbz(count, 2 - exact_log2(granularity), copy4);
    __ ldrw(t0, Address(s, 0));
    __ ldrw(t1, Address(send, -4));
    __ strw(t0, Address(d, 0));
    __ strw(t1, Address(dend, -4));
    __ b(finish);
    if (granularity < 4) {
      // 0..3 bytes
      __ bind(copy4);
      __ cbz(count, finish); // get rid of 0 case
      if (granularity == 2) {
        __ ldrh(t0, Address(s, 0));
        __ strh(t0, Address(d, 0));
      } else { // granularity == 1
        // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
        // the first and last byte.
        // Handle the 3 byte case by loading and storing base + count/2
        // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
        // This does means in the 1 byte case we load/store the same
        // byte 3 times.
        __ lsr(count, count, 1);
        __ ldrb(t0, Address(s, 0));
        __ ldrb(t1, Address(send, -1));
        __ ldrb(t2, Address(s, count));
        __ strb(t0, Address(d, 0));
        __ strb(t1, Address(dend, -1));
        __ strb(t2, Address(d, count));
      }
      __ b(finish);
    }
  }

  __ bind(copy_big);
  // a backwards copy starts from the end: move s and d just past the
  // last element.
  if (is_backwards) {
    __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
    __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
  }

  // Now we've got the small case out of the way we can align the
  // source address on a 2-word boundary.

  // Here we will materialize a count in r15, which is used by copy_memory_small
  // and the various generate_copy_longs stubs that we use for 2 word aligned bytes.
  // Up until here, we have used t9, which aliases r15, but from here on, that register
  // can not be used as a temp register, as it contains the count.

  Label aligned;

  if (is_aligned) {
    // We may have to adjust by 1 word to get s 2-word-aligned.
    __ tbz(s, exact_log2(wordSize), aligned);
    bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
    bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
    __ sub(count, count, wordSize/granularity);
  } else {
    // going backwards the misalignment is s modulo 16; going forwards
    // it is the distance up to the next 16 byte boundary, (-s) modulo 16.
    if (is_backwards) {
      __ andr(r15, s, 2 * wordSize - 1);
    } else {
      __ neg(r15, s);
      __ andr(r15, r15, 2 * wordSize - 1);
    }
    // r15 is the byte adjustment needed to align s.
    __ cbz(r15, aligned);
    // convert the byte adjustment to units before taking it off count
    int shift = exact_log2(granularity);
    if (shift > 0) {
      __ lsr(r15, r15, shift);
    }
    __ sub(count, count, r15);

#if 0
    // ?? This code is only correct for a disjoint copy.  It may or
    // may not make sense to use it in that case.

    // Copy the first pair; s and d may not be aligned.
    __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
    __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));

    // Align s and d, adjust count
    if (is_backwards) {
      __ sub(s, s, r15);
      __ sub(d, d, r15);
    } else {
      __ add(s, s, r15);
      __ add(d, d, r15);
    }
#else
    // copy the r15 leading units one chunk at a time
    copy_memory_small(decorators, type, s, d, r15, step);
#endif
  }

  __ bind(aligned);

  // s is now 2-word-aligned.

  // We have a count of units and some trailing bytes. Adjust the
  // count and do a bulk copy of words. If the shift is zero
  // perform a move instead to benefit from zero latency moves.
  int shift = exact_log2(wordSize/granularity);
  if (shift > 0) {
    __ lsr(r15, count, shift);
  } else {
    __ mov(r15, count);
  }
  // call the bulk copy stub matching direction, element type and
  // whether the destination is uninitialized.
  if (direction == copy_forwards) {
    if (type != T_OBJECT) {
      __ lea(rscratch1, RuntimeAddress(StubRoutines::aarch64::copy_byte_f()));
      __ blr(rscratch1);
    } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
      __ lea(rscratch1, RuntimeAddress(StubRoutines::aarch64::copy_oop_uninit_f()));
      __ blr(rscratch1);
    } else {
      __ lea(rscratch1, RuntimeAddress(StubRoutines::aarch64::copy_oop_f()));
      __ blr(rscratch1);
    }
  } else {
    if (type != T_OBJECT) {
      __ lea(rscratch1, RuntimeAddress(StubRoutines::aarch64::copy_byte_b()));
      __ blr(rscratch1);
    } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
      __ lea(rscratch1, RuntimeAddress(StubRoutines::aarch64::copy_oop_uninit_b()));
      __ blr(rscratch1);
    } else {
      __ lea(rscratch1, RuntimeAddress(StubRoutines::aarch64::copy_oop_b()));
      __ blr(rscratch1);
    }
  }

  // And the tail.
  copy_memory_small(decorators, type, s, d, count, step);

  // labels whose inline case was not generated above still need a
  // binding; they share the address of finish.
  if (granularity >= 8) __ bind(copy8);
  if (granularity >= 4) __ bind(copy4);
  __ bind(finish);
}
1747
1748
// Debug-build helper: emits code that overwrites the call-clobbered
// general-purpose registers (other than rscratch1) with a recognisable
// poison value. Emits nothing unless ASSERT is defined.
void clobber_registers() {
#ifdef ASSERT
  // rscratch1 carries the poison value, so it is excluded from the set.
  RegSet poisoned = MacroAssembler::call_clobbered_gp_registers() - rscratch1;

  // rscratch1 = 0xdeadbeef | (0xdeadbeef << 32)
  __ mov(rscratch1, (uint64_t)0xdeadbeef);
  __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);

  // The iterator yields noreg once the set is exhausted.
  RegSetIterator<Register> cursor = poisoned.begin();
  while (*cursor != noreg) {
    __ mov(*cursor, rscratch1);
    ++cursor;
  }
#endif
}
1761
1762 // Scan over array at a for count oops, verifying each one.
1763 // Preserves a and count, clobbers rscratch1 and rscratch2.
1764 void verify_oop_array (int size, Register a, Register count, Register temp) {
1765 Label loop, end;
1766 __ mov(rscratch1, a);
1767 __ mov(rscratch2, zr);
1768 __ bind(loop);
1769 __ cmp(rscratch2, count);
1770 __ br(Assembler::HS, end);
1771 if (size == wordSize) {
1772 __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1773 __ verify_oop(temp);
1774 } else {
1775 __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1776 __ decode_heap_oop(temp); // calls verify_oop
1777 }
1778 __ add(rscratch2, rscratch2, 1);
1779 __ b(loop);
1780 __ bind(end);
1781 }
1782
1783 // Arguments:
1784 // stub_id - is used to name the stub and identify all details of
1785 // how to perform the copy.
1786 //
1787 // nopush_entry - is assigned to the stub's post push entry point
1788 // unless it is null
1789 //
1790 // Inputs:
1791 // c_rarg0 - source array address
1792 // c_rarg1 - destination array address
1793 // c_rarg2 - element count, treated as ssize_t, can be zero
1794 //
1795 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1796 // the hardware handle it. The two dwords within qwords that span
1797 // cache line boundaries will still be loaded and stored atomically.
1798 //
1799 // Side Effects: nopush_entry is set to the (post push) entry point
1800 // so it can be used by the corresponding conjoint
1801 // copy method
1802 //
1803 address generate_disjoint_copy(StubId stub_id, address *nopush_entry) {
1804 int size;
1805 bool aligned;
1806 bool is_oop;
1807 bool dest_uninitialized;
1808 switch (stub_id) {
1809 case StubId::stubgen_jbyte_disjoint_arraycopy_id:
1810 size = sizeof(jbyte);
1811 aligned = false;
1812 is_oop = false;
1813 dest_uninitialized = false;
1814 break;
1815 case StubId::stubgen_arrayof_jbyte_disjoint_arraycopy_id:
1816 size = sizeof(jbyte);
1817 aligned = true;
1818 is_oop = false;
1819 dest_uninitialized = false;
1820 break;
1821 case StubId::stubgen_jshort_disjoint_arraycopy_id:
1822 size = sizeof(jshort);
1823 aligned = false;
1824 is_oop = false;
1825 dest_uninitialized = false;
1826 break;
1827 case StubId::stubgen_arrayof_jshort_disjoint_arraycopy_id:
1828 size = sizeof(jshort);
1829 aligned = true;
1830 is_oop = false;
1831 dest_uninitialized = false;
1832 break;
1833 case StubId::stubgen_jint_disjoint_arraycopy_id:
1834 size = sizeof(jint);
1835 aligned = false;
1836 is_oop = false;
1837 dest_uninitialized = false;
1838 break;
1839 case StubId::stubgen_arrayof_jint_disjoint_arraycopy_id:
1840 size = sizeof(jint);
1841 aligned = true;
1842 is_oop = false;
1843 dest_uninitialized = false;
1844 break;
1845 case StubId::stubgen_jlong_disjoint_arraycopy_id:
1846 // since this is always aligned we can (should!) use the same
1847 // stub as for case StubId::stubgen_arrayof_jlong_disjoint_arraycopy
1848 ShouldNotReachHere();
1849 break;
1850 case StubId::stubgen_arrayof_jlong_disjoint_arraycopy_id:
1851 size = sizeof(jlong);
1852 aligned = true;
1853 is_oop = false;
1854 dest_uninitialized = false;
1855 break;
1856 case StubId::stubgen_oop_disjoint_arraycopy_id:
1857 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1858 aligned = !UseCompressedOops;
1859 is_oop = true;
1860 dest_uninitialized = false;
1861 break;
1862 case StubId::stubgen_arrayof_oop_disjoint_arraycopy_id:
1863 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1864 aligned = !UseCompressedOops;
1865 is_oop = true;
1866 dest_uninitialized = false;
1867 break;
1868 case StubId::stubgen_oop_disjoint_arraycopy_uninit_id:
1869 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1870 aligned = !UseCompressedOops;
1871 is_oop = true;
1872 dest_uninitialized = true;
1873 break;
1874 case StubId::stubgen_arrayof_oop_disjoint_arraycopy_uninit_id:
1875 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1876 aligned = !UseCompressedOops;
1877 is_oop = true;
1878 dest_uninitialized = true;
1879 break;
1880 default:
1881 ShouldNotReachHere();
1882 break;
1883 }
1884 // all stubs provide a 2nd entry which omits the frame push for
1885 // use when bailing out from a conjoint copy. However we may also
1886 // need some extra addressses for memory access protection.
1887 int entry_count = StubInfo::entry_count(stub_id);
1888 assert(entry_count == 2, "sanity check");
1889 assert(nopush_entry != nullptr, "all disjoint copy stubs export a nopush entry");
1890
1891 bool add_extras = !is_oop && (!aligned || sizeof(jlong) == size);
1892 int extra_count = ((add_extras ? 1 : 0) * UnsafeMemoryAccess::COLUMN_COUNT);
1893 GrowableArray<address> entries;
1894 GrowableArray<address> extras;
1895 GrowableArray<address> *extras_ptr = (extra_count > 0 ? &extras : nullptr);
1896 address start = load_archive_data(stub_id, &entries, extras_ptr);
1897 if (start != nullptr) {
1898 assert(entries.length() == entry_count - 1,
1899 "unexpected entries count %d", entries.length());
1900 *nopush_entry = entries.at(0);
1901 assert(extras.length() == extra_count,
1902 "unexpected extra count %d", extras.length());
1903 if (add_extras) {
1904 // register one handler at offset 0
1905 register_unsafe_access_handlers(extras, 0, 1);
1906 }
1907 return start;
1908 }
1909
1910 Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1911 RegSet saved_reg = RegSet::of(s, d, count);
1912
1913 __ align(CodeEntryAlignment);
1914 StubCodeMark mark(this, stub_id);
1915 start = __ pc();
1916 __ enter();
1917
1918 *nopush_entry = __ pc();
1919 entries.append(*nopush_entry);
1920
1921 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1922 BLOCK_COMMENT("Post-Push Entry:");
1923
1924 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1925 if (dest_uninitialized) {
1926 decorators |= IS_DEST_UNINITIALIZED;
1927 }
1928 if (aligned) {
1929 decorators |= ARRAYCOPY_ALIGNED;
1930 }
1931
1932 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1933 bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);
1934
1935 if (is_oop) {
1936 // save regs before copy_memory
1937 __ push(RegSet::of(d, count), sp);
1938 }
1939 {
1940 // UnsafeMemoryAccess page error: continue after unsafe access
1941 UnsafeMemoryAccessMark umam(this, add_extras, true);
1942 copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, size);
1943 }
1944
1945 if (is_oop) {
1946 __ pop(RegSet::of(d, count), sp);
1947 if (VerifyOops)
1948 verify_oop_array(size, d, count, r16);
1949 }
1950
1951 bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1);
1952
1953 __ leave();
1954 __ mov(r0, zr); // return 0
1955 __ ret(lr);
1956
1957 address end = __ pc();
1958
1959 if (add_extras) {
1960 // retrieve the registered handler addresses
1961 retrieve_unsafe_access_handlers(start, end, extras);
1962 assert(extras.length() == extra_count
1963 , "incorrect handlers count %d", extras.length());
1964 }
1965
1966 // record the stub entry and end plus the no_push entry and any
1967 // extra handler addresses
1968 store_archive_data(stub_id, start, end, &entries, extras_ptr);
1969
1970 return start;
1971 }
1972
1973 // Arguments:
1974 // stub_id - is used to name the stub and identify all details of
1975 // how to perform the copy.
1976 //
1977 // nooverlap_target - identifes the (post push) entry for the
1978 // corresponding disjoint copy routine which can be
1979 // jumped to if the ranges do not actually overlap
1980 //
1981 // nopush_entry - is assigned to the stub's post push entry point
1982 // unless it is null
1983 //
1984 //
1985 // Inputs:
1986 // c_rarg0 - source array address
1987 // c_rarg1 - destination array address
1988 // c_rarg2 - element count, treated as ssize_t, can be zero
1989 //
1990 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1991 // the hardware handle it. The two dwords within qwords that span
1992 // cache line boundaries will still be loaded and stored atomically.
1993 //
1994 // Side Effects:
1995 // nopush_entry is set to the no-overlap entry point so it can be
1996 // used by some other conjoint copy method
1997 //
1998 address generate_conjoint_copy(StubId stub_id, address nooverlap_target, address *nopush_entry) {
1999 int size;
2000 bool aligned;
2001 bool is_oop;
2002 bool dest_uninitialized;
2003 switch (stub_id) {
2004 case StubId::stubgen_jbyte_arraycopy_id:
2005 size = sizeof(jbyte);
2006 aligned = false;
2007 is_oop = false;
2008 dest_uninitialized = false;
2009 break;
2010 case StubId::stubgen_arrayof_jbyte_arraycopy_id:
2011 size = sizeof(jbyte);
2012 aligned = true;
2013 is_oop = false;
2014 dest_uninitialized = false;
2015 break;
2016 case StubId::stubgen_jshort_arraycopy_id:
2017 size = sizeof(jshort);
2018 aligned = false;
2019 is_oop = false;
2020 dest_uninitialized = false;
2021 break;
2022 case StubId::stubgen_arrayof_jshort_arraycopy_id:
2023 size = sizeof(jshort);
2024 aligned = true;
2025 is_oop = false;
2026 dest_uninitialized = false;
2027 break;
2028 case StubId::stubgen_jint_arraycopy_id:
2029 size = sizeof(jint);
2030 aligned = false;
2031 is_oop = false;
2032 dest_uninitialized = false;
2033 break;
2034 case StubId::stubgen_arrayof_jint_arraycopy_id:
2035 size = sizeof(jint);
2036 aligned = true;
2037 is_oop = false;
2038 dest_uninitialized = false;
2039 break;
2040 case StubId::stubgen_jlong_arraycopy_id:
2041 // since this is always aligned we can (should!) use the same
2042 // stub as for case StubId::stubgen_arrayof_jlong_disjoint_arraycopy
2043 ShouldNotReachHere();
2044 break;
2045 case StubId::stubgen_arrayof_jlong_arraycopy_id:
2046 size = sizeof(jlong);
2047 aligned = true;
2048 is_oop = false;
2049 dest_uninitialized = false;
2050 break;
2051 case StubId::stubgen_oop_arraycopy_id:
2052 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
2053 aligned = !UseCompressedOops;
2054 is_oop = true;
2055 dest_uninitialized = false;
2056 break;
2057 case StubId::stubgen_arrayof_oop_arraycopy_id:
2058 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
2059 aligned = !UseCompressedOops;
2060 is_oop = true;
2061 dest_uninitialized = false;
2062 break;
2063 case StubId::stubgen_oop_arraycopy_uninit_id:
2064 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
2065 aligned = !UseCompressedOops;
2066 is_oop = true;
2067 dest_uninitialized = true;
2068 break;
2069 case StubId::stubgen_arrayof_oop_arraycopy_uninit_id:
2070 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
2071 aligned = !UseCompressedOops;
2072 is_oop = true;
2073 dest_uninitialized = true;
2074 break;
2075 default:
2076 ShouldNotReachHere();
2077 }
2078 // only some conjoint stubs generate a 2nd entry
2079 int entry_count = StubInfo::entry_count(stub_id);
2080 int expected_entry_count = (nopush_entry == nullptr ? 1 : 2);
2081 assert(entry_count == expected_entry_count,
2082 "expected entry count %d does not match declared entry count %d for stub %s",
2083 expected_entry_count, entry_count, StubInfo::name(stub_id));
2084
2085 // We need to protect memory accesses in certain cases
2086 bool add_extras = !is_oop && (!aligned || sizeof(jlong) == size);
2087 int extra_count = ((add_extras ? 1 : 0) * UnsafeMemoryAccess::COLUMN_COUNT);
2088 GrowableArray<address> entries;
2089 GrowableArray<address> extras;
2090 GrowableArray<address> *entries_ptr = (nopush_entry != nullptr ? &entries : nullptr);
2091 GrowableArray<address> *extras_ptr = (extra_count > 0 ? &extras : nullptr);
2092 address start = load_archive_data(stub_id, entries_ptr, extras_ptr);
2093 if (start != nullptr) {
2094 assert(entries.length() == expected_entry_count - 1,
2095 "unexpected entries count %d", entries.length());
2096 assert(extras.length() == extra_count,
2097 "unexpected extra count %d", extras.length());
2098 if (nopush_entry != nullptr) {
2099 *nopush_entry = entries.at(0);
2100 }
2101 if (add_extras) {
2102 // register one handler at offset 0
2103 register_unsafe_access_handlers(extras, 0, 1);
2104 }
2105 return start;
2106 }
2107
2108 Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
2109 RegSet saved_regs = RegSet::of(s, d, count);
2110 StubCodeMark mark(this, stub_id);
2111 start = __ pc();
2112 __ enter();
2113
2114 if (nopush_entry != nullptr) {
2115 *nopush_entry = __ pc();
2116 entries.append(*nopush_entry);
2117 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2118 BLOCK_COMMENT("Post-Push Entry:");
2119 }
2120
2121 // use fwd copy when (d-s) above_equal (count*size)
2122 Label L_overlapping;
2123 __ sub(rscratch1, d, s);
2124 __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
2125 __ br(Assembler::LO, L_overlapping);
2126 __ b(RuntimeAddress(nooverlap_target));
2127 __ bind(L_overlapping);
2128
2129 DecoratorSet decorators = IN_HEAP | IS_ARRAY;
2130 if (dest_uninitialized) {
2131 decorators |= IS_DEST_UNINITIALIZED;
2132 }
2133 if (aligned) {
2134 decorators |= ARRAYCOPY_ALIGNED;
2135 }
2136
2137 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2138 bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);
2139
2140 if (is_oop) {
2141 // save regs before copy_memory
2142 __ push(RegSet::of(d, count), sp);
2143 }
2144 {
2145 // UnsafeMemoryAccess page error: continue after unsafe access
2146 UnsafeMemoryAccessMark umam(this, add_extras, true);
2147 copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, -size);
2148 }
2149 if (is_oop) {
2150 __ pop(RegSet::of(d, count), sp);
2151 if (VerifyOops)
2152 verify_oop_array(size, d, count, r16);
2153 }
2154 bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1);
2155 __ leave();
2156 __ mov(r0, zr); // return 0
2157 __ ret(lr);
2158
2159 assert(entries.length() == expected_entry_count - 1,
2160 "unexpected entries count %d", entries.length());
2161
2162 address end = __ pc();
2163
2164 if (add_extras) {
2165 // retrieve the registered handler addresses
2166 retrieve_unsafe_access_handlers(start, end, extras);
2167 assert(extras.length() == extra_count,
2168 "incorrect handlers count %d", extras.length());
2169 }
2170
2171 // record the stub entry and end plus any no_push entry and/or
2172 // extra handler addresses
2173 store_archive_data(stub_id, start, end, entries_ptr, extras_ptr);
2174
2175 return start;
2176 }
2177
2178 // Helper for generating a dynamic type check.
2179 // Smashes rscratch1, rscratch2.
2180 void generate_type_check(Register sub_klass,
2181 Register super_check_offset,
2182 Register super_klass,
2183 Register temp1,
2184 Register temp2,
2185 Register result,
2186 Label& L_success) {
2187 assert_different_registers(sub_klass, super_check_offset, super_klass);
2188
2189 BLOCK_COMMENT("type_check:");
2190
2191 Label L_miss;
2192
2193 __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, nullptr,
2194 super_check_offset);
2195 __ check_klass_subtype_slow_path(sub_klass, super_klass, temp1, temp2, &L_success, nullptr);
2196
2197 // Fall through on failure!
2198 __ BIND(L_miss);
2199 }
2200
2201 //
2202 // Generate checkcasting array copy stub
2203 //
2204 // Input:
2205 // c_rarg0 - source array address
2206 // c_rarg1 - destination array address
2207 // c_rarg2 - element count, treated as ssize_t, can be zero
2208 // c_rarg3 - size_t ckoff (super_check_offset)
2209 // c_rarg4 - oop ckval (super_klass)
2210 //
2211 // Output:
2212 // r0 == 0 - success
2213 // r0 == -1^K - failure, where K is partial transfer count
2214 //
2215 address generate_checkcast_copy(StubId stub_id, address *nopush_entry) {
2216 bool dest_uninitialized;
2217 switch (stub_id) {
2218 case StubId::stubgen_checkcast_arraycopy_id:
2219 dest_uninitialized = false;
2220 break;
2221 case StubId::stubgen_checkcast_arraycopy_uninit_id:
2222 dest_uninitialized = true;
2223 break;
2224 default:
2225 ShouldNotReachHere();
2226 }
2227
2228 // The normal stub provides a 2nd entry which omits the frame push
2229 // for use when bailing out from a disjoint copy.
2230 // Only some conjoint stubs generate a 2nd entry
2231 int entry_count = StubInfo::entry_count(stub_id);
2232 int expected_entry_count = (nopush_entry == nullptr ? 1 : 2);
2233 GrowableArray<address> entries;
2234 GrowableArray<address> *entries_ptr = (expected_entry_count == 1 ? nullptr : &entries);
2235 assert(entry_count == expected_entry_count,
2236 "expected entry count %d does not match declared entry count %d for stub %s",
2237 expected_entry_count, entry_count, StubInfo::name(stub_id));
2238 address start = load_archive_data(stub_id, entries_ptr);
2239 if (start != nullptr) {
2240 assert(entries.length() + 1 == expected_entry_count,
2241 "expected entry count %d does not match return entry count %d for stub %s",
2242 expected_entry_count, entries.length() + 1, StubInfo::name(stub_id));
2243 if (nopush_entry != nullptr) {
2244 *nopush_entry = entries.at(0);
2245 }
2246 return start;
2247 }
2248
2249 Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
2250
2251 // Input registers (after setup_arg_regs)
2252 const Register from = c_rarg0; // source array address
2253 const Register to = c_rarg1; // destination array address
2254 const Register count = c_rarg2; // elementscount
2255 const Register ckoff = c_rarg3; // super_check_offset
2256 const Register ckval = c_rarg4; // super_klass
2257
2258 RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
2259
2260 // Registers used as temps (r19, r20, r21, r22 are save-on-entry)
2261 const Register copied_oop = r22; // actual oop copied
2262 const Register count_save = r21; // orig elementscount
2263 const Register start_to = r20; // destination array start address
2264 const Register r19_klass = r19; // oop._klass
2265
2266 // Registers used as gc temps (r5, r6, r7 are save-on-call)
2267 const Register gct1 = r5, gct2 = r6, gct3 = r7;
2268
2269 //---------------------------------------------------------------
2270 // Assembler stub will be used for this call to arraycopy
2271 // if the two arrays are subtypes of Object[] but the
2272 // destination array type is not equal to or a supertype
2273 // of the source type. Each element must be separately
2274 // checked.
2275
2276 assert_different_registers(from, to, count, ckoff, ckval, start_to,
2277 copied_oop, r19_klass, count_save);
2278
2279 __ align(CodeEntryAlignment);
2280 StubCodeMark mark(this, stub_id);
2281 start = __ pc();
2282
2283 __ enter(); // required for proper stackwalking of RuntimeStub frame
2284
2285 #ifdef ASSERT
2286 // caller guarantees that the arrays really are different
2287 // otherwise, we would have to make conjoint checks
2288 { Label L;
2289 __ b(L); // conjoint check not yet implemented
2290 __ stop("checkcast_copy within a single array");
2291 __ bind(L);
2292 }
2293 #endif //ASSERT
2294
2295 // Caller of this entry point must set up the argument registers.
2296 if (nopush_entry != nullptr) {
2297 *nopush_entry = __ pc();
2298 entries.append(*nopush_entry);
2299 BLOCK_COMMENT("Entry:");
2300 }
2301
2302 // Empty array: Nothing to do.
2303 __ cbz(count, L_done);
2304 __ push(RegSet::of(r19, r20, r21, r22), sp);
2305
2306 #ifdef ASSERT
2307 BLOCK_COMMENT("assert consistent ckoff/ckval");
2308 // The ckoff and ckval must be mutually consistent,
2309 // even though caller generates both.
2310 { Label L;
2311 int sco_offset = in_bytes(Klass::super_check_offset_offset());
2312 __ ldrw(start_to, Address(ckval, sco_offset));
2313 __ cmpw(ckoff, start_to);
2314 __ br(Assembler::EQ, L);
2315 __ stop("super_check_offset inconsistent");
2316 __ bind(L);
2317 }
2318 #endif //ASSERT
2319
2320 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
2321 bool is_oop = true;
2322 int element_size = UseCompressedOops ? 4 : 8;
2323 if (dest_uninitialized) {
2324 decorators |= IS_DEST_UNINITIALIZED;
2325 }
2326
2327 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2328 bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs);
2329
2330 // save the original count
2331 __ mov(count_save, count);
2332
2333 // Copy from low to high addresses
2334 __ mov(start_to, to); // Save destination array start address
2335 __ b(L_load_element);
2336
2337 // ======== begin loop ========
2338 // (Loop is rotated; its entry is L_load_element.)
2339 // Loop control:
2340 // for (; count != 0; count--) {
2341 // copied_oop = load_heap_oop(from++);
2342 // ... generate_type_check ...;
2343 // store_heap_oop(to++, copied_oop);
2344 // }
2345 __ align(OptoLoopAlignment);
2346
2347 __ BIND(L_store_element);
2348 bs->copy_store_at(_masm, decorators, T_OBJECT, element_size,
2349 __ post(to, element_size), copied_oop, noreg,
2350 gct1, gct2, gct3);
2351 __ sub(count, count, 1);
2352 __ cbz(count, L_do_card_marks);
2353
2354 // ======== loop entry is here ========
2355 __ BIND(L_load_element);
2356 bs->copy_load_at(_masm, decorators, T_OBJECT, element_size,
2357 copied_oop, noreg, __ post(from, element_size),
2358 gct1);
2359 __ cbz(copied_oop, L_store_element);
2360
2361 __ load_klass(r19_klass, copied_oop);// query the object klass
2362
2363 BLOCK_COMMENT("type_check:");
2364 generate_type_check(/*sub_klass*/r19_klass,
2365 /*super_check_offset*/ckoff,
2366 /*super_klass*/ckval,
2367 /*r_array_base*/gct1,
2368 /*temp2*/gct2,
2369 /*result*/r10, L_store_element);
2370
2371 // Fall through on failure!
2372
2373 // ======== end loop ========
2374
2375 // It was a real error; we must depend on the caller to finish the job.
2376 // Register count = remaining oops, count_orig = total oops.
2377 // Emit GC store barriers for the oops we have copied and report
2378 // their number to the caller.
2379
2380 __ subs(count, count_save, count); // K = partially copied oop count
2381 __ eon(count, count, zr); // report (-1^K) to caller
2382 __ br(Assembler::EQ, L_done_pop);
2383
2384 __ BIND(L_do_card_marks);
2385 bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1);
2386
2387 __ bind(L_done_pop);
2388 __ pop(RegSet::of(r19, r20, r21, r22), sp);
2389 inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
2390
2391 __ bind(L_done);
2392 __ mov(r0, count);
2393 __ leave();
2394 __ ret(lr);
2395
2396 // record the stub entry and end plus any no_push entry
2397 store_archive_data(stub_id, start, __ pc() , entries_ptr);
2398 return start;
2399 }
2400
2401 // Perform range checks on the proposed arraycopy.
2402 // Kills temp, but nothing else.
2403 // Also, clean the sign bits of src_pos and dst_pos.
2404 void arraycopy_range_checks(Register src, // source array oop (c_rarg0)
2405 Register src_pos, // source position (c_rarg1)
2406 Register dst, // destination array oo (c_rarg2)
2407 Register dst_pos, // destination position (c_rarg3)
2408 Register length,
2409 Register temp,
2410 Label& L_failed) {
2411 BLOCK_COMMENT("arraycopy_range_checks:");
2412
2413 assert_different_registers(rscratch1, temp);
2414
2415 // if (src_pos + length > arrayOop(src)->length()) FAIL;
2416 __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
2417 __ addw(temp, length, src_pos);
2418 __ cmpw(temp, rscratch1);
2419 __ br(Assembler::HI, L_failed);
2420
2421 // if (dst_pos + length > arrayOop(dst)->length()) FAIL;
2422 __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
2423 __ addw(temp, length, dst_pos);
2424 __ cmpw(temp, rscratch1);
2425 __ br(Assembler::HI, L_failed);
2426
2427 // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
2428 __ movw(src_pos, src_pos);
2429 __ movw(dst_pos, dst_pos);
2430
2431 BLOCK_COMMENT("arraycopy_range_checks done");
2432 }
2433
2434 // These stubs get called from some dumb test routine.
2435 // I'll write them properly when they're called from
2436 // something that's actually doing something.
2437 static void fake_arraycopy_stub(address src, address dst, int count) {
2438 assert(count == 0, "huh?");
2439 }
2440
2441
2442 //
2443 // Generate 'unsafe' array copy stub
2444 // Though just as safe as the other stubs, it takes an unscaled
2445 // size_t argument instead of an element count.
2446 //
2447 // Input:
2448 // c_rarg0 - source array address
2449 // c_rarg1 - destination array address
2450 // c_rarg2 - byte count, treated as ssize_t, can be zero
2451 //
2452 // Examines the alignment of the operands and dispatches
2453 // to a long, int, short, or byte copy loop.
2454 //
2455 address generate_unsafe_copy(address byte_copy_entry,
2456 address short_copy_entry,
2457 address int_copy_entry,
2458 address long_copy_entry) {
2459 StubId stub_id = StubId::stubgen_unsafe_arraycopy_id;
2460 int entry_count = StubInfo::entry_count(stub_id);
2461 assert(entry_count == 1, "sanity check");
2462 address start = load_archive_data(stub_id);
2463 if (start != nullptr) {
2464 return start;
2465 }
2466 Label L_long_aligned, L_int_aligned, L_short_aligned;
2467 Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
2468
2469 __ align(CodeEntryAlignment);
2470 StubCodeMark mark(this, stub_id);
2471 start = __ pc();
2472 __ enter(); // required for proper stackwalking of RuntimeStub frame
2473
2474 // bump this on entry, not on exit:
2475 inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
2476
2477 __ orr(rscratch1, s, d);
2478 __ orr(rscratch1, rscratch1, count);
2479
2480 __ andr(rscratch1, rscratch1, BytesPerLong-1);
2481 __ cbz(rscratch1, L_long_aligned);
2482 __ andr(rscratch1, rscratch1, BytesPerInt-1);
2483 __ cbz(rscratch1, L_int_aligned);
2484 __ tbz(rscratch1, 0, L_short_aligned);
2485 __ b(RuntimeAddress(byte_copy_entry));
2486
2487 __ BIND(L_short_aligned);
2488 __ lsr(count, count, LogBytesPerShort); // size => short_count
2489 __ b(RuntimeAddress(short_copy_entry));
2490 __ BIND(L_int_aligned);
2491 __ lsr(count, count, LogBytesPerInt); // size => int_count
2492 __ b(RuntimeAddress(int_copy_entry));
2493 __ BIND(L_long_aligned);
2494 __ lsr(count, count, LogBytesPerLong); // size => long_count
2495 __ b(RuntimeAddress(long_copy_entry));
2496
2497 // record the stub entry and end
2498 store_archive_data(stub_id, start, __ pc());
2499
2500 return start;
2501 }
2502
2503 //
2504 // Generate generic array copy stubs
2505 //
2506 // Input:
2507 // c_rarg0 - src oop
2508 // c_rarg1 - src_pos (32-bits)
2509 // c_rarg2 - dst oop
2510 // c_rarg3 - dst_pos (32-bits)
2511 // c_rarg4 - element count (32-bits)
2512 //
2513 // Output:
2514 // r0 == 0 - success
2515 // r0 == -1^K - failure, where K is partial transfer count
2516 //
2517 address generate_generic_copy(address byte_copy_entry, address short_copy_entry,
2518 address int_copy_entry, address oop_copy_entry,
2519 address long_copy_entry, address checkcast_copy_entry) {
2520 StubId stub_id = StubId::stubgen_generic_arraycopy_id;
2521 int entry_count = StubInfo::entry_count(stub_id);
2522 assert(entry_count == 1, "sanity check");
2523 address start = load_archive_data(stub_id);
2524 if (start != nullptr) {
2525 return start;
2526 }
2527 Label L_failed, L_objArray;
2528 Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
2529
2530 // Input registers
2531 const Register src = c_rarg0; // source array oop
2532 const Register src_pos = c_rarg1; // source position
2533 const Register dst = c_rarg2; // destination array oop
2534 const Register dst_pos = c_rarg3; // destination position
2535 const Register length = c_rarg4;
2536
2537
2538 // Registers used as temps
2539 const Register dst_klass = c_rarg5;
2540
2541 __ align(CodeEntryAlignment);
2542
2543 StubCodeMark mark(this, stub_id);
2544
2545 start = __ pc();
2546
2547 __ enter(); // required for proper stackwalking of RuntimeStub frame
2548
2549 // bump this on entry, not on exit:
2550 inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
2551
2552 //-----------------------------------------------------------------------
2553 // Assembler stub will be used for this call to arraycopy
2554 // if the following conditions are met:
2555 //
2556 // (1) src and dst must not be null.
2557 // (2) src_pos must not be negative.
2558 // (3) dst_pos must not be negative.
2559 // (4) length must not be negative.
2560 // (5) src klass and dst klass should be the same and not null.
2561 // (6) src and dst should be arrays.
2562 // (7) src_pos + length must not exceed length of src.
2563 // (8) dst_pos + length must not exceed length of dst.
2564 //
2565
2566 // if (src == nullptr) return -1;
2567 __ cbz(src, L_failed);
2568
2569 // if (src_pos < 0) return -1;
2570 __ tbnz(src_pos, 31, L_failed); // i.e. sign bit set
2571
2572 // if (dst == nullptr) return -1;
2573 __ cbz(dst, L_failed);
2574
2575 // if (dst_pos < 0) return -1;
2576 __ tbnz(dst_pos, 31, L_failed); // i.e. sign bit set
2577
2578 // registers used as temp
2579 const Register scratch_length = r16; // elements count to copy
2580 const Register scratch_src_klass = r17; // array klass
2581 const Register lh = r15; // layout helper
2582
2583 // if (length < 0) return -1;
2584 __ movw(scratch_length, length); // length (elements count, 32-bits value)
2585 __ tbnz(scratch_length, 31, L_failed); // i.e. sign bit set
2586
2587 __ load_klass(scratch_src_klass, src);
2588 #ifdef ASSERT
2589 // assert(src->klass() != nullptr);
2590 {
2591 BLOCK_COMMENT("assert klasses not null {");
2592 Label L1, L2;
2593 __ cbnz(scratch_src_klass, L2); // it is broken if klass is null
2594 __ bind(L1);
2595 __ stop("broken null klass");
2596 __ bind(L2);
2597 __ load_klass(rscratch1, dst);
2598 __ cbz(rscratch1, L1); // this would be broken also
2599 BLOCK_COMMENT("} assert klasses not null done");
2600 }
2601 #endif
2602
2603 // Load layout helper (32-bits)
2604 //
2605 // |array_tag| | header_size | element_type | |log2_element_size|
2606 // 32 30 24 16 8 2 0
2607 //
2608 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2609 //
2610
2611 const int lh_offset = in_bytes(Klass::layout_helper_offset());
2612
2613 // Handle objArrays completely differently...
2614 const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2615 __ ldrw(lh, Address(scratch_src_klass, lh_offset));
2616 __ movw(rscratch1, objArray_lh);
2617 __ eorw(rscratch2, lh, rscratch1);
2618 __ cbzw(rscratch2, L_objArray);
2619
2620 // if (src->klass() != dst->klass()) return -1;
2621 __ load_klass(rscratch2, dst);
2622 __ eor(rscratch2, rscratch2, scratch_src_klass);
2623 __ cbnz(rscratch2, L_failed);
2624
2625 // Check for flat inline type array -> return -1
2626 __ test_flat_array_oop(src, rscratch2, L_failed);
2627
2628 // Check for null-free (non-flat) inline type array -> handle as object array
2629 __ test_null_free_array_oop(src, rscratch2, L_objArray);
2630
2631 // if (!src->is_Array()) return -1;
2632 __ tbz(lh, 31, L_failed); // i.e. (lh >= 0)
2633
2634 // At this point, it is known to be a typeArray (array_tag 0x3).
2635 #ifdef ASSERT
2636 {
2637 BLOCK_COMMENT("assert primitive array {");
2638 Label L;
2639 __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
2640 __ cmpw(lh, rscratch2);
2641 __ br(Assembler::GE, L);
2642 __ stop("must be a primitive array");
2643 __ bind(L);
2644 BLOCK_COMMENT("} assert primitive array done");
2645 }
2646 #endif
2647
2648 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2649 rscratch2, L_failed);
2650
2651 // TypeArrayKlass
2652 //
2653 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2654 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2655 //
2656
2657 const Register rscratch1_offset = rscratch1; // array offset
2658 const Register r15_elsize = lh; // element size
2659
2660 __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
2661 exact_log2(Klass::_lh_header_size_mask+1)); // array_offset
2662 __ add(src, src, rscratch1_offset); // src array offset
2663 __ add(dst, dst, rscratch1_offset); // dst array offset
2664 BLOCK_COMMENT("choose copy loop based on element size");
2665
2666 // next registers should be set before the jump to corresponding stub
2667 const Register from = c_rarg0; // source array address
2668 const Register to = c_rarg1; // destination array address
2669 const Register count = c_rarg2; // elements count
2670
2671 // 'from', 'to', 'count' registers should be set in such order
2672 // since they are the same as 'src', 'src_pos', 'dst'.
2673
2674 assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
2675
2676 // The possible values of elsize are 0-3, i.e. exact_log2(element
2677 // size in bytes). We do a simple bitwise binary search.
2678 __ BIND(L_copy_bytes);
2679 __ tbnz(r15_elsize, 1, L_copy_ints);
2680 __ tbnz(r15_elsize, 0, L_copy_shorts);
2681 __ lea(from, Address(src, src_pos));// src_addr
2682 __ lea(to, Address(dst, dst_pos));// dst_addr
2683 __ movw(count, scratch_length); // length
2684 __ b(RuntimeAddress(byte_copy_entry));
2685
2686 __ BIND(L_copy_shorts);
2687 __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
2688 __ lea(to, Address(dst, dst_pos, Address::lsl(1)));// dst_addr
2689 __ movw(count, scratch_length); // length
2690 __ b(RuntimeAddress(short_copy_entry));
2691
2692 __ BIND(L_copy_ints);
2693 __ tbnz(r15_elsize, 0, L_copy_longs);
2694 __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
2695 __ lea(to, Address(dst, dst_pos, Address::lsl(2)));// dst_addr
2696 __ movw(count, scratch_length); // length
2697 __ b(RuntimeAddress(int_copy_entry));
2698
2699 __ BIND(L_copy_longs);
2700 #ifdef ASSERT
2701 {
2702 BLOCK_COMMENT("assert long copy {");
2703 Label L;
2704 __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize
2705 __ cmpw(r15_elsize, LogBytesPerLong);
2706 __ br(Assembler::EQ, L);
2707 __ stop("must be long copy, but elsize is wrong");
2708 __ bind(L);
2709 BLOCK_COMMENT("} assert long copy done");
2710 }
2711 #endif
2712 __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
2713 __ lea(to, Address(dst, dst_pos, Address::lsl(3)));// dst_addr
2714 __ movw(count, scratch_length); // length
2715 __ b(RuntimeAddress(long_copy_entry));
2716
2717 // ObjArrayKlass
2718 __ BIND(L_objArray);
2719 // live at this point: scratch_src_klass, scratch_length, src[_pos], dst[_pos]
2720
2721 Label L_plain_copy, L_checkcast_copy;
2722 // test array classes for subtyping
2723 __ load_klass(r15, dst);
2724 __ cmp(scratch_src_klass, r15); // usual case is exact equality
2725 __ br(Assembler::NE, L_checkcast_copy);
2726
2727 // Identically typed arrays can be copied without element-wise checks.
2728 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2729 rscratch2, L_failed);
2730
2731 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2732 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2733 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2734 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2735 __ movw(count, scratch_length); // length
2736 __ BIND(L_plain_copy);
2737 __ b(RuntimeAddress(oop_copy_entry));
2738
2739 __ BIND(L_checkcast_copy);
2740 // live at this point: scratch_src_klass, scratch_length, r15 (dst_klass)
2741 {
2742 // Before looking at dst.length, make sure dst is also an objArray.
2743 __ ldrw(rscratch1, Address(r15, lh_offset));
2744 __ movw(rscratch2, objArray_lh);
2745 __ eorw(rscratch1, rscratch1, rscratch2);
2746 __ cbnzw(rscratch1, L_failed);
2747
2748 // It is safe to examine both src.length and dst.length.
2749 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2750 r15, L_failed);
2751
2752 __ load_klass(dst_klass, dst); // reload
2753
2754 // Marshal the base address arguments now, freeing registers.
2755 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2756 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2757 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2758 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2759 __ movw(count, length); // length (reloaded)
2760 Register sco_temp = c_rarg3; // this register is free now
2761 assert_different_registers(from, to, count, sco_temp,
2762 dst_klass, scratch_src_klass);
2763 // assert_clean_int(count, sco_temp);
2764
2765 // Generate the type check.
2766 const int sco_offset = in_bytes(Klass::super_check_offset_offset());
2767 __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2768
2769 // Smashes rscratch1, rscratch2
2770 generate_type_check(scratch_src_klass, sco_temp, dst_klass, /*temps*/ noreg, noreg, noreg,
2771 L_plain_copy);
2772
2773 // Fetch destination element klass from the ObjArrayKlass header.
2774 int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2775 __ ldr(dst_klass, Address(dst_klass, ek_offset));
2776 __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2777
2778 // the checkcast_copy loop needs two extra arguments:
2779 assert(c_rarg3 == sco_temp, "#3 already in place");
2780 // Set up arguments for checkcast_copy_entry.
2781 __ mov(c_rarg4, dst_klass); // dst.klass.element_klass
2782 __ b(RuntimeAddress(checkcast_copy_entry));
2783 }
2784
2785 __ BIND(L_failed);
2786 __ mov(r0, -1);
2787 __ leave(); // required for proper stackwalking of RuntimeStub frame
2788 __ ret(lr);
2789
2790 // record the stub entry and end
2791 store_archive_data(stub_id, start, __ pc());
2792
2793 return start;
2794 }
2795
2796 //
// Generate stub for array fill. The element type, and whether the
// "to" address is assumed to be heapword aligned, are selected by stub_id.
2799 //
2800 // Arguments for generated stub:
2801 // to: c_rarg0
2802 // value: c_rarg1
2803 // count: c_rarg2 treated as signed
2804 //
2805 address generate_fill(StubId stub_id) {
2806 BasicType t;
2807 bool aligned;
2808
2809 switch (stub_id) {
2810 case StubId::stubgen_jbyte_fill_id:
2811 t = T_BYTE;
2812 aligned = false;
2813 break;
2814 case StubId::stubgen_jshort_fill_id:
2815 t = T_SHORT;
2816 aligned = false;
2817 break;
2818 case StubId::stubgen_jint_fill_id:
2819 t = T_INT;
2820 aligned = false;
2821 break;
2822 case StubId::stubgen_arrayof_jbyte_fill_id:
2823 t = T_BYTE;
2824 aligned = true;
2825 break;
2826 case StubId::stubgen_arrayof_jshort_fill_id:
2827 t = T_SHORT;
2828 aligned = true;
2829 break;
2830 case StubId::stubgen_arrayof_jint_fill_id:
2831 t = T_INT;
2832 aligned = true;
2833 break;
2834 default:
2835 ShouldNotReachHere();
2836 };
2837 int entry_count = StubInfo::entry_count(stub_id);
2838 assert(entry_count == 1, "sanity check");
2839 address start = load_archive_data(stub_id);
2840 if (start != nullptr) {
2841 return start;
2842 }
2843 __ align(CodeEntryAlignment);
2844 StubCodeMark mark(this, stub_id);
2845 start = __ pc();
2846
2847 BLOCK_COMMENT("Entry:");
2848
2849 const Register to = c_rarg0; // source array address
2850 const Register value = c_rarg1; // value
2851 const Register count = c_rarg2; // elements count
2852
2853 const Register bz_base = r10; // base for block_zero routine
2854 const Register cnt_words = r11; // temp register
2855
2856 __ enter();
2857
2858 Label L_fill_elements, L_exit1;
2859
2860 int shift = -1;
2861 switch (t) {
2862 case T_BYTE:
2863 shift = 0;
2864 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2865 __ bfi(value, value, 8, 8); // 8 bit -> 16 bit
2866 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2867 __ br(Assembler::LO, L_fill_elements);
2868 break;
2869 case T_SHORT:
2870 shift = 1;
2871 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2872 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2873 __ br(Assembler::LO, L_fill_elements);
2874 break;
2875 case T_INT:
2876 shift = 2;
2877 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2878 __ br(Assembler::LO, L_fill_elements);
2879 break;
2880 default: ShouldNotReachHere();
2881 }
2882
2883 // Align source address at 8 bytes address boundary.
2884 Label L_skip_align1, L_skip_align2, L_skip_align4;
2885 if (!aligned) {
2886 switch (t) {
2887 case T_BYTE:
2888 // One byte misalignment happens only for byte arrays.
2889 __ tbz(to, 0, L_skip_align1);
2890 __ strb(value, Address(__ post(to, 1)));
2891 __ subw(count, count, 1);
2892 __ bind(L_skip_align1);
2893 // Fallthrough
2894 case T_SHORT:
2895 // Two bytes misalignment happens only for byte and short (char) arrays.
2896 __ tbz(to, 1, L_skip_align2);
2897 __ strh(value, Address(__ post(to, 2)));
2898 __ subw(count, count, 2 >> shift);
2899 __ bind(L_skip_align2);
2900 // Fallthrough
2901 case T_INT:
2902 // Align to 8 bytes, we know we are 4 byte aligned to start.
2903 __ tbz(to, 2, L_skip_align4);
2904 __ strw(value, Address(__ post(to, 4)));
2905 __ subw(count, count, 4 >> shift);
2906 __ bind(L_skip_align4);
2907 break;
2908 default: ShouldNotReachHere();
2909 }
2910 }
2911
2912 //
2913 // Fill large chunks
2914 //
2915 __ lsrw(cnt_words, count, 3 - shift); // number of words
2916 __ bfi(value, value, 32, 32); // 32 bit -> 64 bit
2917 __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
2918 if (UseBlockZeroing) {
2919 Label non_block_zeroing, rest;
2920 // If the fill value is zero we can use the fast zero_words().
2921 __ cbnz(value, non_block_zeroing);
2922 __ mov(bz_base, to);
2923 __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
2924 address tpc = __ zero_words(bz_base, cnt_words);
2925 if (tpc == nullptr) {
2926 fatal("CodeCache is full at generate_fill");
2927 }
2928 __ b(rest);
2929 __ bind(non_block_zeroing);
2930 __ fill_words(to, cnt_words, value);
2931 __ bind(rest);
2932 } else {
2933 __ fill_words(to, cnt_words, value);
2934 }
2935
2936 // Remaining count is less than 8 bytes. Fill it by a single store.
2937 // Note that the total length is no less than 8 bytes.
2938 if (t == T_BYTE || t == T_SHORT) {
2939 Label L_exit1;
2940 __ cbzw(count, L_exit1);
2941 __ add(to, to, count, Assembler::LSL, shift); // points to the end
2942 __ str(value, Address(to, -8)); // overwrite some elements
2943 __ bind(L_exit1);
2944 __ leave();
2945 __ ret(lr);
2946 }
2947
2948 // Handle copies less than 8 bytes.
2949 Label L_fill_2, L_fill_4, L_exit2;
2950 __ bind(L_fill_elements);
2951 switch (t) {
2952 case T_BYTE:
2953 __ tbz(count, 0, L_fill_2);
2954 __ strb(value, Address(__ post(to, 1)));
2955 __ bind(L_fill_2);
2956 __ tbz(count, 1, L_fill_4);
2957 __ strh(value, Address(__ post(to, 2)));
2958 __ bind(L_fill_4);
2959 __ tbz(count, 2, L_exit2);
2960 __ strw(value, Address(to));
2961 break;
2962 case T_SHORT:
2963 __ tbz(count, 0, L_fill_4);
2964 __ strh(value, Address(__ post(to, 2)));
2965 __ bind(L_fill_4);
2966 __ tbz(count, 1, L_exit2);
2967 __ strw(value, Address(to));
2968 break;
2969 case T_INT:
2970 __ cbzw(count, L_exit2);
2971 __ strw(value, Address(to));
2972 break;
2973 default: ShouldNotReachHere();
2974 }
2975 __ bind(L_exit2);
2976 __ leave();
2977 __ ret(lr);
2978
2979 // record the stub entry and end
2980 store_archive_data(stub_id, start, __ pc());
2981
2982 return start;
2983 }
2984
2985 address generate_unsafecopy_common_error_exit() {
2986 StubId stub_id = StubId::stubgen_unsafecopy_common_id;
2987 int entry_count = StubInfo::entry_count(stub_id);
2988 assert(entry_count == 1, "sanity check");
2989 address start = load_archive_data(stub_id);
2990 if (start != nullptr) {
2991 return start;
2992 }
2993 __ align(CodeEntryAlignment);
2994 StubCodeMark mark(this, stub_id);
2995 start = __ pc();
2996 __ leave();
2997 __ mov(r0, 0);
2998 __ ret(lr);
2999
3000 // record the stub entry and end
3001 store_archive_data(stub_id, start, __ pc());
3002
3003 return start;
3004 }
3005
3006 //
3007 // Generate 'unsafe' set memory stub
3008 // Though just as safe as the other stubs, it takes an unscaled
3009 // size_t (# bytes) argument instead of an element count.
3010 //
3011 // This fill operation is atomicity preserving: as long as the
3012 // address supplied is sufficiently aligned, all writes of up to 64
3013 // bits in size are single-copy atomic.
3014 //
3015 // Input:
3016 // c_rarg0 - destination array address
3017 // c_rarg1 - byte count (size_t)
3018 // c_rarg2 - byte value
3019 //
3020 address generate_unsafe_setmemory() {
3021 StubId stub_id = StubId::stubgen_unsafe_setmemory_id;
3022 int entry_count = StubInfo::entry_count(stub_id);
3023 assert(entry_count == 1, "sanity check");
3024 // we expect one set of extra unsafememory access handler entries
3025 GrowableArray<address> extras;
3026 int extra_count = 1 * UnsafeMemoryAccess::COLUMN_COUNT;
3027 address start = load_archive_data(stub_id, nullptr, &extras);
3028 if (start != nullptr) {
3029 assert(extras.length() == extra_count,
3030 "unexpected extra entry count %d", extras.length());
3031 register_unsafe_access_handlers(extras, 0, 1);
3032 return start;
3033 }
3034
3035 __ align(CodeEntryAlignment);
3036 StubCodeMark mark(this, stub_id);
3037 start = __ pc();
3038
3039 Register dest = c_rarg0, count = c_rarg1, value = c_rarg2;
3040 Label tail;
3041
3042 {
3043 UnsafeMemoryAccessMark umam(this, true, false);
3044
3045 __ enter(); // required for proper stackwalking of RuntimeStub frame
3046
3047 __ dup(v0, __ T16B, value);
3048
3049 if (AvoidUnalignedAccesses) {
3050 __ cmp(count, (u1)16);
3051 __ br(__ LO, tail);
3052
3053 __ mov(rscratch1, 16);
3054 __ andr(rscratch2, dest, 15);
3055 __ sub(rscratch1, rscratch1, rscratch2); // Bytes needed to 16-align dest
3056 __ strq(v0, Address(dest));
3057 __ sub(count, count, rscratch1);
3058 __ add(dest, dest, rscratch1);
3059 }
3060
3061 __ subs(count, count, (u1)64);
3062 __ br(__ LO, tail);
3063 {
3064 Label again;
3065 __ bind(again);
3066 __ stpq(v0, v0, Address(dest));
3067 __ stpq(v0, v0, Address(dest, 32));
3068
3069 __ subs(count, count, 64);
3070 __ add(dest, dest, 64);
3071 __ br(__ HS, again);
3072 }
3073
3074 __ bind(tail);
3075 // The count of bytes is off by 64, but we don't need to correct
3076 // it because we're only going to use the least-significant few
3077 // count bits from here on.
3078 // __ add(count, count, 64);
3079
3080 {
3081 Label dont;
3082 __ tbz(count, exact_log2(32), dont);
3083 __ stpq(v0, v0, __ post(dest, 32));
3084 __ bind(dont);
3085 }
3086 {
3087 Label dont;
3088 __ tbz(count, exact_log2(16), dont);
3089 __ strq(v0, __ post(dest, 16));
3090 __ bind(dont);
3091 }
3092 {
3093 Label dont;
3094 __ tbz(count, exact_log2(8), dont);
3095 __ strd(v0, __ post(dest, 8));
3096 __ bind(dont);
3097 }
3098
3099 Label finished;
3100 __ tst(count, 7);
3101 __ br(__ EQ, finished);
3102
3103 {
3104 Label dont;
3105 __ tbz(count, exact_log2(4), dont);
3106 __ strs(v0, __ post(dest, 4));
3107 __ bind(dont);
3108 }
3109 {
3110 Label dont;
3111 __ tbz(count, exact_log2(2), dont);
3112 __ bfi(value, value, 8, 8);
3113 __ strh(value, __ post(dest, 2));
3114 __ bind(dont);
3115 }
3116 {
3117 Label dont;
3118 __ tbz(count, exact_log2(1), dont);
3119 __ strb(value, Address(dest));
3120 __ bind(dont);
3121 }
3122
3123 __ bind(finished);
3124 __ leave();
3125 __ ret(lr);
3126 // have to exit the block and destroy the UnsafeMemoryAccessMark
3127 // in order to retrieve the handler end address
3128 }
3129
3130 // install saved handler addresses in extras
3131 address end = __ pc();
3132 retrieve_unsafe_access_handlers(start, end, extras);
3133 assert(extras.length() == extra_count,
3134 "incorrect handlers count %d", extras.length());
3135 // record the stub entry and end plus the extras
3136 store_archive_data(stub_id, start, end, nullptr, &extras);
3137
3138 return start;
3139 }
3140
3141 address generate_data_cache_writeback() {
3142 const Register line = c_rarg0; // address of line to write back
3143
3144 StubId stub_id = StubId::stubgen_data_cache_writeback_id;
3145 int entry_count = StubInfo::entry_count(stub_id);
3146 assert(entry_count == 1, "sanity check");
3147 address start = load_archive_data(stub_id);
3148 if (start != nullptr) {
3149 return start;
3150 }
3151 __ align(CodeEntryAlignment);
3152 StubCodeMark mark(this, stub_id);
3153
3154 start = __ pc();
3155 __ enter();
3156 __ cache_wb(Address(line, 0));
3157 __ leave();
3158 __ ret(lr);
3159
3160 // record the stub entry and end
3161 store_archive_data(stub_id, start, __ pc());
3162
3163 return start;
3164 }
3165
3166 address generate_data_cache_writeback_sync() {
3167 StubId stub_id = StubId::stubgen_data_cache_writeback_sync_id;
3168 int entry_count = StubInfo::entry_count(stub_id);
3169 assert(entry_count == 1, "sanity check");
3170 address start = load_archive_data(stub_id);
3171 if (start != nullptr) {
3172 return start;
3173 }
3174 const Register is_pre = c_rarg0; // pre or post sync
3175 __ align(CodeEntryAlignment);
3176 StubCodeMark mark(this, stub_id);
3177
3178 // pre wbsync is a no-op
3179 // post wbsync translates to an sfence
3180
3181 Label skip;
3182 start = __ pc();
3183 __ enter();
3184 __ cbnz(is_pre, skip);
3185 __ cache_wbsync(false);
3186 __ bind(skip);
3187 __ leave();
3188 __ ret(lr);
3189
3190 // record the stub entry and end
3191 store_archive_data(stub_id, start, __ pc());
3192
3193 return start;
3194 }
3195
3196 void generate_arraycopy_stubs() {
3197 // Some copy stubs publish a normal entry and then a 2nd 'fallback'
3198 // entry immediately following their stack push. This can be used
3199 // as a post-push branch target for compatible stubs when they
3200 // identify a special case that can be handled by the fallback
3201 // stub e.g a disjoint copy stub may be use as a special case
3202 // fallback for its compatible conjoint copy stub.
3203 //
3204 // A no push entry is always returned in the following local and
3205 // then published by assigning to the appropriate entry field in
3206 // class StubRoutines. The entry value is then passed to the
3207 // generator for the compatible stub. That means the entry must be
3208 // listed when saving to/restoring from the AOT cache, ensuring
3209 // that the inter-stub jumps are noted at AOT-cache save and
3210 // relocated at AOT cache load.
3211 address nopush_entry;
3212
3213 // generate the common exit first so later stubs can rely on it if
3214 // they want an UnsafeMemoryAccess exit non-local to the stub
3215 StubRoutines::_unsafecopy_common_exit = generate_unsafecopy_common_error_exit();
3216 // register the stub as the default exit with class UnsafeMemoryAccess
3217 UnsafeMemoryAccess::set_common_exit_stub_pc(StubRoutines::_unsafecopy_common_exit);
3218
3219 // generate and publish arch64-specific bulk copy routines first
3220 // so we can call them from other copy stubs
3221 StubRoutines::aarch64::_copy_byte_f = generate_copy_longs(StubId::stubgen_copy_byte_f_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
3222 StubRoutines::aarch64::_copy_byte_b = generate_copy_longs(StubId::stubgen_copy_byte_b_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
3223
3224 StubRoutines::aarch64::_copy_oop_f = generate_copy_longs(StubId::stubgen_copy_oop_f_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
3225 StubRoutines::aarch64::_copy_oop_b = generate_copy_longs(StubId::stubgen_copy_oop_b_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
3226
3227 StubRoutines::aarch64::_copy_oop_uninit_f = generate_copy_longs(StubId::stubgen_copy_oop_uninit_f_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, r0, r1, r15);
3228 StubRoutines::aarch64::_copy_oop_uninit_b = generate_copy_longs(StubId::stubgen_copy_oop_uninit_b_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, r0, r1, r15);
3229
3230 StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();
3231
3232 //*** jbyte
3233 // Always need aligned and unaligned versions
3234 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_jbyte_disjoint_arraycopy_id, &nopush_entry);
3235 // disjoint nopush entry is needed by conjoint copy
3236 StubRoutines::_jbyte_disjoint_arraycopy_nopush = nopush_entry;
3237 StubRoutines::_jbyte_arraycopy = generate_conjoint_copy(StubId::stubgen_jbyte_arraycopy_id, StubRoutines::_jbyte_disjoint_arraycopy_nopush, &nopush_entry);
3238 // conjoint nopush entry is needed by generic/unsafe copy
3239 StubRoutines::_jbyte_arraycopy_nopush = nopush_entry;
3240 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jbyte_disjoint_arraycopy_id, &nopush_entry);
3241 // disjoint arrayof nopush entry is needed by conjoint copy
3242 StubRoutines::_arrayof_jbyte_disjoint_arraycopy_nopush = nopush_entry;
3243 StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jbyte_arraycopy_id, StubRoutines::_arrayof_jbyte_disjoint_arraycopy_nopush, nullptr);
3244
3245 //*** jshort
3246 // Always need aligned and unaligned versions
3247 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_jshort_disjoint_arraycopy_id, &nopush_entry);
3248 // disjoint nopush entry is needed by conjoint copy
3249 StubRoutines::_jshort_disjoint_arraycopy_nopush = nopush_entry;
3250 StubRoutines::_jshort_arraycopy = generate_conjoint_copy(StubId::stubgen_jshort_arraycopy_id, StubRoutines::_jshort_disjoint_arraycopy_nopush, &nopush_entry);
3251 // conjoint nopush entry is used by generic/unsafe copy
3252 StubRoutines::_jshort_arraycopy_nopush = nopush_entry;
3253 StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jshort_disjoint_arraycopy_id, &nopush_entry);
3254 // disjoint arrayof nopush entry is needed by conjoint copy
3255 StubRoutines::_arrayof_jshort_disjoint_arraycopy_nopush = nopush_entry;
3256 StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jshort_arraycopy_id, StubRoutines::_arrayof_jshort_disjoint_arraycopy_nopush, nullptr);
3257
3258 //*** jint
3259 // Aligned versions
3260 StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jint_disjoint_arraycopy_id, &nopush_entry);
3261 // disjoint arrayof nopush entry is needed by conjoint copy
3262 StubRoutines::_arrayof_jint_disjoint_arraycopy_nopush = nopush_entry;
3263 StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jint_arraycopy_id, StubRoutines::_arrayof_jint_disjoint_arraycopy_nopush, nullptr);
3264 // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
3265 // jint_arraycopy_nopush always points to the unaligned version
3266 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_jint_disjoint_arraycopy_id, &nopush_entry);
3267 // disjoint nopush entry is needed by conjoint copy
3268 StubRoutines::_jint_disjoint_arraycopy_nopush = nopush_entry;
3269 StubRoutines::_jint_arraycopy = generate_conjoint_copy(StubId::stubgen_jint_arraycopy_id, StubRoutines::_jint_disjoint_arraycopy_nopush, &nopush_entry);
3270 // conjoint nopush entry is needed by generic/unsafe copy
3271 StubRoutines::_jint_arraycopy_nopush = nopush_entry;
3272
3273 //*** jlong
3274 // It is always aligned
3275 StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jlong_disjoint_arraycopy_id, &nopush_entry);
3276 // disjoint arrayof nopush entry is needed by conjoint copy
3277 StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush = nopush_entry;
3278 StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jlong_arraycopy_id, StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush, &nopush_entry);
3279 // conjoint nopush entry is needed by generic/unsafe copy
3280 StubRoutines::_jlong_arraycopy_nopush = nopush_entry;
3281 // disjoint normal/nopush and conjoint normal entries are not
3282 // generated since the arrayof versions are the same
3283 StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
3284 StubRoutines::_jlong_disjoint_arraycopy_nopush = StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush;
3285 StubRoutines::_jlong_arraycopy = StubRoutines::_arrayof_jlong_arraycopy;
3286
3287 //*** oops
3288 {
3289 StubRoutines::_arrayof_oop_disjoint_arraycopy
3290 = generate_disjoint_copy(StubId::stubgen_arrayof_oop_disjoint_arraycopy_id, &nopush_entry);
3291 // disjoint arrayof nopush entry is needed by conjoint copy
3292 StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush = nopush_entry;
3293 StubRoutines::_arrayof_oop_arraycopy
3294 = generate_conjoint_copy(StubId::stubgen_arrayof_oop_arraycopy_id, StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush, &nopush_entry);
3295 // conjoint arrayof nopush entry is needed by generic/unsafe copy
3296 StubRoutines::_oop_arraycopy_nopush = nopush_entry;
3297 // Aligned versions without pre-barriers
3298 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
3299 = generate_disjoint_copy(StubId::stubgen_arrayof_oop_disjoint_arraycopy_uninit_id, &nopush_entry);
3300 // disjoint arrayof+uninit nopush entry is needed by conjoint copy
3301 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush = nopush_entry;
3302 // note that we don't need a returned nopush entry because the
3303 // generic/unsafe copy does not cater for uninit arrays.
3304 StubRoutines::_arrayof_oop_arraycopy_uninit
3305 = generate_conjoint_copy(StubId::stubgen_arrayof_oop_arraycopy_uninit_id, StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush, nullptr);
3306 }
3307
3308 // for oop copies reuse arrayof entries for non-arrayof cases
3309 StubRoutines::_oop_disjoint_arraycopy = StubRoutines::_arrayof_oop_disjoint_arraycopy;
3310 StubRoutines::_oop_disjoint_arraycopy_nopush = StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush;
3311 StubRoutines::_oop_arraycopy = StubRoutines::_arrayof_oop_arraycopy;
3312 StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
3313 StubRoutines::_oop_disjoint_arraycopy_uninit_nopush = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush;
3314 StubRoutines::_oop_arraycopy_uninit = StubRoutines::_arrayof_oop_arraycopy_uninit;
3315
3316 StubRoutines::_checkcast_arraycopy = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_id, &nopush_entry);
3317 // checkcast nopush entry is needed by generic copy
3318 StubRoutines::_checkcast_arraycopy_nopush = nopush_entry;
3319 // note that we don't need a returned nopush entry because the
3320 // generic copy does not cater for uninit arrays.
3321 StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_uninit_id, nullptr);
3322
3323 // unsafe arraycopy may fallback on conjoint stubs
3324 StubRoutines::_unsafe_arraycopy = generate_unsafe_copy(StubRoutines::_jbyte_arraycopy_nopush,
3325 StubRoutines::_jshort_arraycopy_nopush,
3326 StubRoutines::_jint_arraycopy_nopush,
3327 StubRoutines::_jlong_arraycopy_nopush);
3328
3329 // generic arraycopy may fallback on conjoint stubs
3330 StubRoutines::_generic_arraycopy = generate_generic_copy(StubRoutines::_jbyte_arraycopy_nopush,
3331 StubRoutines::_jshort_arraycopy_nopush,
3332 StubRoutines::_jint_arraycopy_nopush,
3333 StubRoutines::_oop_arraycopy_nopush,
3334 StubRoutines::_jlong_arraycopy_nopush,
3335 StubRoutines::_checkcast_arraycopy_nopush);
3336
3337 StubRoutines::_jbyte_fill = generate_fill(StubId::stubgen_jbyte_fill_id);
3338 StubRoutines::_jshort_fill = generate_fill(StubId::stubgen_jshort_fill_id);
3339 StubRoutines::_jint_fill = generate_fill(StubId::stubgen_jint_fill_id);
3340 StubRoutines::_arrayof_jbyte_fill = generate_fill(StubId::stubgen_arrayof_jbyte_fill_id);
3341 StubRoutines::_arrayof_jshort_fill = generate_fill(StubId::stubgen_arrayof_jshort_fill_id);
3342 StubRoutines::_arrayof_jint_fill = generate_fill(StubId::stubgen_arrayof_jint_fill_id);
3343 }
3344
3345 void generate_math_stubs() { Unimplemented(); }
3346
3347 // Arguments:
3348 //
3349 // Inputs:
3350 // c_rarg0 - source byte array address
3351 // c_rarg1 - destination byte array address
3352 // c_rarg2 - sessionKe (key) in little endian int array
3353 //
3354 address generate_aescrypt_encryptBlock() {
3355 assert(UseAES, "need AES cryptographic extension support");
3356 StubId stub_id = StubId::stubgen_aescrypt_encryptBlock_id;
3357 int entry_count = StubInfo::entry_count(stub_id);
3358 assert(entry_count == 1, "sanity check");
3359 address start = load_archive_data(stub_id);
3360 if (start != nullptr) {
3361 return start;
3362 }
3363 __ align(CodeEntryAlignment);
3364 StubCodeMark mark(this, stub_id);
3365
3366 const Register from = c_rarg0; // source array address
3367 const Register to = c_rarg1; // destination array address
3368 const Register key = c_rarg2; // key array address
3369 const Register keylen = rscratch1;
3370
3371 start = __ pc();
3372 __ enter();
3373
3374 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3375
3376 __ aesenc_loadkeys(key, keylen);
3377 __ aesecb_encrypt(from, to, keylen);
3378
3379 __ mov(r0, 0);
3380
3381 __ leave();
3382 __ ret(lr);
3383
3384 // record the stub entry and end
3385 store_archive_data(stub_id, start, __ pc());
3386
3387 return start;
3388 }
3389
3390 // Arguments:
3391 //
3392 // Inputs:
3393 // c_rarg0 - source byte array address
3394 // c_rarg1 - destination byte array address
3395 // c_rarg2 - sessionKd (key) in little endian int array
3396 //
3397 address generate_aescrypt_decryptBlock() {
3398 assert(UseAES, "need AES cryptographic extension support");
3399 StubId stub_id = StubId::stubgen_aescrypt_decryptBlock_id;
3400 int entry_count = StubInfo::entry_count(stub_id);
3401 assert(entry_count == 1, "sanity check");
3402 address start = load_archive_data(stub_id);
3403 if (start != nullptr) {
3404 return start;
3405 }
3406 __ align(CodeEntryAlignment);
3407 StubCodeMark mark(this, stub_id);
3408 Label L_doLast;
3409
3410 const Register from = c_rarg0; // source array address
3411 const Register to = c_rarg1; // destination array address
3412 const Register key = c_rarg2; // key array address
3413 const Register keylen = rscratch1;
3414
3415 start = __ pc();
3416 __ enter(); // required for proper stackwalking of RuntimeStub frame
3417
3418 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3419
3420 __ aesecb_decrypt(from, to, key, keylen);
3421
3422 __ mov(r0, 0);
3423
3424 __ leave();
3425 __ ret(lr);
3426
3427 // record the stub entry and end
3428 store_archive_data(stub_id, start, __ pc());
3429
3430 return start;
3431 }
3432
3433 // Arguments:
3434 //
3435 // Inputs:
3436 // c_rarg0 - source byte array address
3437 // c_rarg1 - destination byte array address
3438 // c_rarg2 - sessionKe (key) in little endian int array
3439 // c_rarg3 - r vector byte array address
3440 // c_rarg4 - input length
3441 //
3442 // Output:
3443 // x0 - input length
3444 //
3445 address generate_cipherBlockChaining_encryptAESCrypt() {
3446 assert(UseAES, "need AES cryptographic extension support");
3447 StubId stub_id = StubId::stubgen_cipherBlockChaining_encryptAESCrypt_id;
3448 int entry_count = StubInfo::entry_count(stub_id);
3449 assert(entry_count == 1, "sanity check");
3450 address start = load_archive_data(stub_id);
3451 if (start != nullptr) {
3452 return start;
3453 }
3454 __ align(CodeEntryAlignment);
3455 StubCodeMark mark(this, stub_id);
3456
3457 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
3458
3459 const Register from = c_rarg0; // source array address
3460 const Register to = c_rarg1; // destination array address
3461 const Register key = c_rarg2; // key array address
3462 const Register rvec = c_rarg3; // r byte array initialized from initvector array address
3463 // and left with the results of the last encryption block
3464 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16)
3465 const Register keylen = rscratch1;
3466
3467 start = __ pc();
3468
3469 __ enter();
3470
3471 __ movw(rscratch2, len_reg);
3472
3473 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3474
3475 __ ld1(v0, __ T16B, rvec);
3476
3477 __ cmpw(keylen, 52);
3478 __ br(Assembler::CC, L_loadkeys_44);
3479 __ br(Assembler::EQ, L_loadkeys_52);
3480
3481 __ ld1(v17, v18, __ T16B, __ post(key, 32));
3482 __ rev32(v17, __ T16B, v17);
3483 __ rev32(v18, __ T16B, v18);
3484 __ BIND(L_loadkeys_52);
3485 __ ld1(v19, v20, __ T16B, __ post(key, 32));
3486 __ rev32(v19, __ T16B, v19);
3487 __ rev32(v20, __ T16B, v20);
3488 __ BIND(L_loadkeys_44);
3489 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
3490 __ rev32(v21, __ T16B, v21);
3491 __ rev32(v22, __ T16B, v22);
3492 __ rev32(v23, __ T16B, v23);
3493 __ rev32(v24, __ T16B, v24);
3494 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
3495 __ rev32(v25, __ T16B, v25);
3496 __ rev32(v26, __ T16B, v26);
3497 __ rev32(v27, __ T16B, v27);
3498 __ rev32(v28, __ T16B, v28);
3499 __ ld1(v29, v30, v31, __ T16B, key);
3500 __ rev32(v29, __ T16B, v29);
3501 __ rev32(v30, __ T16B, v30);
3502 __ rev32(v31, __ T16B, v31);
3503
3504 __ BIND(L_aes_loop);
3505 __ ld1(v1, __ T16B, __ post(from, 16));
3506 __ eor(v0, __ T16B, v0, v1);
3507
3508 __ br(Assembler::CC, L_rounds_44);
3509 __ br(Assembler::EQ, L_rounds_52);
3510
3511 __ aese(v0, v17); __ aesmc(v0, v0);
3512 __ aese(v0, v18); __ aesmc(v0, v0);
3513 __ BIND(L_rounds_52);
3514 __ aese(v0, v19); __ aesmc(v0, v0);
3515 __ aese(v0, v20); __ aesmc(v0, v0);
3516 __ BIND(L_rounds_44);
3517 __ aese(v0, v21); __ aesmc(v0, v0);
3518 __ aese(v0, v22); __ aesmc(v0, v0);
3519 __ aese(v0, v23); __ aesmc(v0, v0);
3520 __ aese(v0, v24); __ aesmc(v0, v0);
3521 __ aese(v0, v25); __ aesmc(v0, v0);
3522 __ aese(v0, v26); __ aesmc(v0, v0);
3523 __ aese(v0, v27); __ aesmc(v0, v0);
3524 __ aese(v0, v28); __ aesmc(v0, v0);
3525 __ aese(v0, v29); __ aesmc(v0, v0);
3526 __ aese(v0, v30);
3527 __ eor(v0, __ T16B, v0, v31);
3528
3529 __ st1(v0, __ T16B, __ post(to, 16));
3530
3531 __ subw(len_reg, len_reg, 16);
3532 __ cbnzw(len_reg, L_aes_loop);
3533
3534 __ st1(v0, __ T16B, rvec);
3535
3536 __ mov(r0, rscratch2);
3537
3538 __ leave();
3539 __ ret(lr);
3540
3541 // record the stub entry and end
3542 store_archive_data(stub_id, start, __ pc());
3543
3544 return start;
3545 }
3546
3547 // Arguments:
3548 //
3549 // Inputs:
3550 // c_rarg0 - source byte array address
3551 // c_rarg1 - destination byte array address
3552 // c_rarg2 - sessionKd (key) in little endian int array
3553 // c_rarg3 - r vector byte array address
3554 // c_rarg4 - input length
3555 //
3556 // Output:
3557 // r0 - input length
3558 //
3559 address generate_cipherBlockChaining_decryptAESCrypt() {
3560 assert(UseAES, "need AES cryptographic extension support");
3561 StubId stub_id = StubId::stubgen_cipherBlockChaining_decryptAESCrypt_id;
3562 int entry_count = StubInfo::entry_count(stub_id);
3563 assert(entry_count == 1, "sanity check");
3564 address start = load_archive_data(stub_id);
3565 if (start != nullptr) {
3566 return start;
3567 }
3568 __ align(CodeEntryAlignment);
3569 StubCodeMark mark(this, stub_id);
3570
3571 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
3572
3573 const Register from = c_rarg0; // source array address
3574 const Register to = c_rarg1; // destination array address
3575 const Register key = c_rarg2; // key array address
3576 const Register rvec = c_rarg3; // r byte array initialized from initvector array address
3577 // and left with the results of the last encryption block
3578 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16)
3579 const Register keylen = rscratch1;
3580
3581 start = __ pc();
3582
3583 __ enter();
3584
3585 __ movw(rscratch2, len_reg);
3586
3587 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3588
3589 __ ld1(v2, __ T16B, rvec);
3590
3591 __ ld1(v31, __ T16B, __ post(key, 16));
3592 __ rev32(v31, __ T16B, v31);
3593
3594 __ cmpw(keylen, 52);
3595 __ br(Assembler::CC, L_loadkeys_44);
3596 __ br(Assembler::EQ, L_loadkeys_52);
3597
3598 __ ld1(v17, v18, __ T16B, __ post(key, 32));
3599 __ rev32(v17, __ T16B, v17);
3600 __ rev32(v18, __ T16B, v18);
3601 __ BIND(L_loadkeys_52);
3602 __ ld1(v19, v20, __ T16B, __ post(key, 32));
3603 __ rev32(v19, __ T16B, v19);
3604 __ rev32(v20, __ T16B, v20);
3605 __ BIND(L_loadkeys_44);
3606 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
3607 __ rev32(v21, __ T16B, v21);
3608 __ rev32(v22, __ T16B, v22);
3609 __ rev32(v23, __ T16B, v23);
3610 __ rev32(v24, __ T16B, v24);
3611 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
3612 __ rev32(v25, __ T16B, v25);
3613 __ rev32(v26, __ T16B, v26);
3614 __ rev32(v27, __ T16B, v27);
3615 __ rev32(v28, __ T16B, v28);
3616 __ ld1(v29, v30, __ T16B, key);
3617 __ rev32(v29, __ T16B, v29);
3618 __ rev32(v30, __ T16B, v30);
3619
3620 __ BIND(L_aes_loop);
3621 __ ld1(v0, __ T16B, __ post(from, 16));
3622 __ orr(v1, __ T16B, v0, v0);
3623
3624 __ br(Assembler::CC, L_rounds_44);
3625 __ br(Assembler::EQ, L_rounds_52);
3626
3627 __ aesd(v0, v17); __ aesimc(v0, v0);
3628 __ aesd(v0, v18); __ aesimc(v0, v0);
3629 __ BIND(L_rounds_52);
3630 __ aesd(v0, v19); __ aesimc(v0, v0);
3631 __ aesd(v0, v20); __ aesimc(v0, v0);
3632 __ BIND(L_rounds_44);
3633 __ aesd(v0, v21); __ aesimc(v0, v0);
3634 __ aesd(v0, v22); __ aesimc(v0, v0);
3635 __ aesd(v0, v23); __ aesimc(v0, v0);
3636 __ aesd(v0, v24); __ aesimc(v0, v0);
3637 __ aesd(v0, v25); __ aesimc(v0, v0);
3638 __ aesd(v0, v26); __ aesimc(v0, v0);
3639 __ aesd(v0, v27); __ aesimc(v0, v0);
3640 __ aesd(v0, v28); __ aesimc(v0, v0);
3641 __ aesd(v0, v29); __ aesimc(v0, v0);
3642 __ aesd(v0, v30);
3643 __ eor(v0, __ T16B, v0, v31);
3644 __ eor(v0, __ T16B, v0, v2);
3645
3646 __ st1(v0, __ T16B, __ post(to, 16));
3647 __ orr(v2, __ T16B, v1, v1);
3648
3649 __ subw(len_reg, len_reg, 16);
3650 __ cbnzw(len_reg, L_aes_loop);
3651
3652 __ st1(v2, __ T16B, rvec);
3653
3654 __ mov(r0, rscratch2);
3655
3656 __ leave();
3657 __ ret(lr);
3658
3659 // record the stub entry and end
3660 store_archive_data(stub_id, start, __ pc());
3661
3662 return start;
3663 }
3664
3665 // Big-endian 128-bit + 64-bit -> 128-bit addition.
3666 // Inputs: 128-bits. in is preserved.
3667 // The least-significant 64-bit word is in the upper dword of each vector.
3668 // inc (the 64-bit increment) is preserved. Its lower dword must be zero.
3669 // Output: result
3670 void be_add_128_64(FloatRegister result, FloatRegister in,
3671 FloatRegister inc, FloatRegister tmp) {
3672 assert_different_registers(result, tmp, inc);
3673
3674 __ addv(result, __ T2D, in, inc); // Add inc to the least-significant dword of
3675 // input
3676 __ cm(__ HI, tmp, __ T2D, inc, result);// Check for result overflowing
3677 __ ext(tmp, __ T16B, tmp, tmp, 0x08); // Swap LSD of comparison result to MSD and
3678 // MSD == 0 (must be!) to LSD
3679 __ subv(result, __ T2D, result, tmp); // Subtract -1 from MSD if there was an overflow
3680 }
3681
// CTR AES crypt.
//
// Generates the AES counter-mode stub. The generated code has two parts: a
// main loop that handles single blocks and single bytes, and an out-of-line
// bulk path (CTR_large_block) that processes bulk_width blocks per iteration
// and then jumps back into the main loop at large_block_return.
//
// Arguments:
//
// Inputs:
//   c_rarg0   - source byte array address
//   c_rarg1   - destination byte array address
//   c_rarg2   - sessionKe (key) in little endian int array
//   c_rarg3   - counter vector byte array address
//   c_rarg4   - input length
//   c_rarg5   - saved encryptedCounter start
//   c_rarg6   - saved used length
//
// Output:
//   r0       - input length
//
// Besides the argument registers the generated code uses r7 (offset),
// r10 (len), r11 (keylen), r12 (used), rscratch1 and rscratch2.
//
address generate_counterMode_AESCrypt() {
  StubId stub_id = StubId::stubgen_counterMode_AESCrypt_id;
  int entry_count = StubInfo::entry_count(stub_id);
  assert(entry_count == 1, "sanity check");
  // Reuse an archived copy of the stub if one is available.
  address start = load_archive_data(stub_id);
  if (start != nullptr) {
    return start;
  }
  const Register in = c_rarg0;
  const Register out = c_rarg1;
  const Register key = c_rarg2;
  const Register counter = c_rarg3;
  // saved_len keeps the caller's length for the return value; len is the
  // working copy that is counted down.
  const Register saved_len = c_rarg4, len = r10;
  const Register saved_encrypted_ctr = c_rarg5;
  // used_ptr is the address of the 32-bit 'used' value; used is its
  // in-register copy, loaded on entry and written back at DONE.
  const Register used_ptr = c_rarg6, used = r12;

  // Running byte offset applied to both in and out.
  const Register offset = r7;
  const Register keylen = r11;

  const unsigned char block_size = 16;
  const int bulk_width = 4;
  // NB: bulk_width can be 4 or 8. 8 gives slightly faster
  // performance with larger data sizes, but it also means that the
  // fast path isn't used until you have at least 8 blocks, and up
  // to 127 bytes of data will be executed on the slow path. For
  // that reason, and also so as not to blow away too much icache, 4
  // blocks seems like a sensible compromise.

  // Algorithm:
  //
  //    if (len == 0) {
  //        goto DONE;
  //    }
  //    int result = len;
  //    do {
  //        if (used >= blockSize) {
  //            if (len >= bulk_width * blockSize) {
  //                CTR_large_block();
  //                if (len == 0)
  //                    goto DONE;
  //            }
  //            for (;;) {
  //                16ByteVector v0 = counter;
  //                embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0);
  //                used = 0;
  //                if (len < blockSize)
  //                    break;    /* goto NEXT */
  //                16ByteVector v1 = load16Bytes(in, offset);
  //                v1 = v1 ^ encryptedCounter;
  //                store16Bytes(out, offset);
  //                used = blockSize;
  //                offset += blockSize;
  //                len -= blockSize;
  //                if (len == 0)
  //                    goto DONE;
  //            }
  //        }
  //      NEXT:
  //        out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]);
  //        len--;
  //    } while (len != 0);
  //  DONE:
  //    return result;
  //
  // CTR_large_block()
  //    Wide bulk encryption of whole blocks.

  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, stub_id);
  start = __ pc();
  __ enter();

  Label DONE, CTR_large_block, large_block_return;
  // 'used' is loaded before the zero-length check because DONE stores it
  // back unconditionally.
  __ ldrw(used, Address(used_ptr));
  __ cbzw(saved_len, DONE);

  __ mov(len, saved_len);
  __ mov(offset, 0);

  // Compute #rounds for AES based on the length of the key array
  __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));

  __ aesenc_loadkeys(key, keylen);

  {
    Label L_CTR_loop, NEXT;

    __ bind(L_CTR_loop);

    // While used < block_size there are still bytes of the saved encrypted
    // counter to consume: go straight to the byte-at-a-time path.
    __ cmp(used, block_size);
    __ br(__ LO, NEXT);

    // Maybe we have a lot of data
    // (HS after the subtract means len >= bulk_width * block_size; only the
    // flags matter here, the difference in rscratch1 is not read.)
    __ subsw(rscratch1, len, bulk_width * block_size);
    __ br(__ HS, CTR_large_block);
    __ BIND(large_block_return);
    __ cbzw(len, DONE);

    // Setup the counter
    __ movi(v4, __ T4S, 0);
    __ movi(v5, __ T4S, 1);
    __ ins(v4, __ S, v5, 2, 2); // v4 contains { 0, 1 }

    // 128-bit big-endian increment
    __ ld1(v0, __ T16B, counter);
    __ rev64(v16, __ T16B, v0);
    be_add_128_64(v16, v16, v4, /*tmp*/v5);
    __ rev64(v16, __ T16B, v16);
    __ st1(v16, __ T16B, counter);
    // Previous counter value is in v0
    // v4 contains { 0, 1 }

    {
      // We have fewer than bulk_width blocks of data left. Encrypt
      // them one by one until there is less than a full block
      // remaining, being careful to save both the encrypted counter
      // and the counter.

      Label inner_loop;
      __ bind(inner_loop);
      // Counter to encrypt is in v0
      __ aesecb_encrypt(noreg, noreg, keylen);
      __ st1(v0, __ T16B, saved_encrypted_ctr);

      // Do we have a remaining full block?

      // used = 0: a fresh encrypted counter is available and none of its
      // bytes have been consumed yet.
      __ mov(used, 0);
      __ cmp(len, block_size);
      __ br(__ LO, NEXT);

      // Yes, we have a full block
      __ ldrq(v1, Address(in, offset));
      __ eor(v1, __ T16B, v1, v0);
      __ strq(v1, Address(out, offset));
      __ mov(used, block_size);
      __ add(offset, offset, block_size);

      __ subw(len, len, block_size);
      __ cbzw(len, DONE);

      // Increment the counter, store it back
      // (v16 holds the counter value already stored to memory; it becomes
      // the next block to encrypt in v0 before v16 is advanced again.)
      __ orr(v0, __ T16B, v16, v16);
      __ rev64(v16, __ T16B, v16);
      be_add_128_64(v16, v16, v4, /*tmp*/v5);
      __ rev64(v16, __ T16B, v16);
      __ st1(v16, __ T16B, counter); // Save the incremented counter back

      __ b(inner_loop);
    }

    __ BIND(NEXT);

    // Encrypt a single byte, and loop.
    // We expect this to be a rare event.
    __ ldrb(rscratch1, Address(in, offset));
    __ ldrb(rscratch2, Address(saved_encrypted_ctr, used));
    __ eor(rscratch1, rscratch1, rscratch2);
    __ strb(rscratch1, Address(out, offset));
    __ add(offset, offset, 1);
    __ add(used, used, 1);
    __ subw(len, len,1);
    __ cbnzw(len, L_CTR_loop);
  }

  __ bind(DONE);
  // Publish the updated 'used' value and return the original length.
  __ strw(used, Address(used_ptr));
  __ mov(r0, saved_len);

  __ leave(); // required for proper stackwalking of RuntimeStub frame
  __ ret(lr);

  // Bulk encryption
  // Out-of-line code placed after the stub's return; it is only reached by
  // the branch to CTR_large_block and leaves through large_block_return.

  __ BIND (CTR_large_block);
  assert(bulk_width == 4 || bulk_width == 8, "must be");

  // v8..v11 (and v12..v15 when bulk_width == 8) are overwritten below, so
  // they are spilled to the stack first and reloaded before returning.
  if (bulk_width == 8) {
    __ sub(sp, sp, 4 * 16);
    __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
  }
  __ sub(sp, sp, 4 * 16);
  __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
  // in, out and len are modified by the bulk loop; the saved values are
  // restored by the pop below.
  RegSet saved_regs = (RegSet::of(in, out, offset)
                       + RegSet::of(saved_encrypted_ctr, used_ptr, len));
  __ push(saved_regs, sp);
  __ andr(len, len, -16 * bulk_width);  // 8/4 encryptions, 16 bytes per encryption
  __ add(in, in, offset);
  __ add(out, out, offset);

  // Keys should already be loaded into the correct registers

  __ ld1(v0, __ T16B, counter); // v0 contains the first counter
  __ rev64(v16, __ T16B, v0); // v16 contains byte-reversed counter

  // AES/CTR loop
  {
    Label L_CTR_loop;
    __ BIND(L_CTR_loop);

    // Setup the counters
    __ movi(v8, __ T4S, 0);
    __ movi(v9, __ T4S, 1);
    __ ins(v8, __ S, v9, 2, 2); // v8 contains { 0, 1 }

    // Materialize bulk_width consecutive counter values in v0 onwards,
    // advancing the byte-reversed counter in v16 after each one.
    for (int i = 0; i < bulk_width; i++) {
      FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
      __ rev64(v0_ofs, __ T16B, v16);
      be_add_128_64(v16, v16, v8, /*tmp*/v9);
    }

    // The increment constant in v8/v9 is no longer needed; reuse v8..v11
    // for the first four input blocks.
    __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));

    // Encrypt the counters
    __ aesecb_encrypt(noreg, noreg, keylen, v0, bulk_width);

    if (bulk_width == 8) {
      __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
    }

    // XOR the encrypted counters with the inputs
    for (int i = 0; i < bulk_width; i++) {
      FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
      FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
      __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
    }

    // Write the encrypted data
    __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
    if (bulk_width == 8) {
      __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
    }

    __ subw(len, len, 16 * bulk_width);
    __ cbnzw(len, L_CTR_loop);
  }

  // Save the counter back where it goes
  __ rev64(v16, __ T16B, v16);
  __ st1(v16, __ T16B, counter);

  __ pop(saved_regs, sp);

  __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
  if (bulk_width == 8) {
    __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
  }

  // len holds its pre-bulk value again after the pop: recompute how many
  // bytes the bulk loop handled, take them off len and add them to offset.
  __ andr(rscratch1, len, -16 * bulk_width);
  __ sub(len, len, rscratch1);
  __ add(offset, offset, rscratch1);
  // Set used to 16 (the block size) so that the main loop generates a fresh
  // encrypted counter instead of taking the byte-at-a-time path.
  __ mov(used, 16);
  __ strw(used, Address(used_ptr));
  __ b(large_block_return);

  // record the stub entry and end
  store_archive_data(stub_id, start, __ pc());

  return start;
}
3956
3957 // Vector AES Galois Counter Mode implementation. Parameters:
3958 //
3959 // in = c_rarg0
3960 // len = c_rarg1
3961 // ct = c_rarg2 - ciphertext that ghash will read (in for encrypt, out for decrypt)
3962 // out = c_rarg3
3963 // key = c_rarg4
3964 // state = c_rarg5 - GHASH.state
3965 // subkeyHtbl = c_rarg6 - powers of H
3966 // counter = c_rarg7 - 16 bytes of CTR
3967 // return - number of processed bytes
3968 address generate_galoisCounterMode_AESCrypt() {
3969 Label ghash_polynomial; // local data generated after code
3970 StubId stub_id = StubId::stubgen_galoisCounterMode_AESCrypt_id;
3971 int entry_count = StubInfo::entry_count(stub_id);
3972 assert(entry_count == 1, "sanity check");
3973 address start = load_archive_data(stub_id);
3974 if (start != nullptr) {
3975 return start;
3976 }
3977 __ align(CodeEntryAlignment);
3978 StubCodeMark mark(this, stub_id);
3979 start = __ pc();
3980 __ enter();
3981
3982 const Register in = c_rarg0;
3983 const Register len = c_rarg1;
3984 const Register ct = c_rarg2;
3985 const Register out = c_rarg3;
3986 // and updated with the incremented counter in the end
3987
3988 const Register key = c_rarg4;
3989 const Register state = c_rarg5;
3990
3991 const Register subkeyHtbl = c_rarg6;
3992
3993 const Register counter = c_rarg7;
3994
3995 const Register keylen = r10;
3996 // Save state before entering routine
3997 __ sub(sp, sp, 4 * 16);
3998 __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
3999 __ sub(sp, sp, 4 * 16);
4000 __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
4001
4002 // __ andr(len, len, -512);
4003 __ andr(len, len, -16 * 8); // 8 encryptions, 16 bytes per encryption
4004 __ str(len, __ pre(sp, -2 * wordSize));
4005
4006 Label DONE;
4007 __ cbz(len, DONE);
4008
4009 // Compute #rounds for AES based on the length of the key array
4010 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
4011
4012 __ aesenc_loadkeys(key, keylen);
4013 __ ld1(v0, __ T16B, counter); // v0 contains the first counter
4014 __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter
4015
4016 // AES/CTR loop
4017 {
4018 Label L_CTR_loop;
4019 __ BIND(L_CTR_loop);
4020
4021 // Setup the counters
4022 __ movi(v8, __ T4S, 0);
4023 __ movi(v9, __ T4S, 1);
4024 __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 }
4025
4026 assert(v0->encoding() < v8->encoding(), "");
4027 for (int i = v0->encoding(); i < v8->encoding(); i++) {
4028 FloatRegister f = as_FloatRegister(i);
4029 __ rev32(f, __ T16B, v16);
4030 __ addv(v16, __ T4S, v16, v8);
4031 }
4032
4033 __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
4034
4035 // Encrypt the counters
4036 __ aesecb_encrypt(noreg, noreg, keylen, v0, /*unrolls*/8);
4037
4038 __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
4039
4040 // XOR the encrypted counters with the inputs
4041 for (int i = 0; i < 8; i++) {
4042 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
4043 FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
4044 __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
4045 }
4046 __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
4047 __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
4048
4049 __ subw(len, len, 16 * 8);
4050 __ cbnzw(len, L_CTR_loop);
4051 }
4052
4053 __ rev32(v16, __ T16B, v16);
4054 __ st1(v16, __ T16B, counter);
4055
4056 __ ldr(len, Address(sp));
4057 __ lsr(len, len, exact_log2(16)); // We want the count of blocks
4058
4059 // GHASH/CTR loop
4060 __ ghash_processBlocks_wide(ghash_polynomial, state, subkeyHtbl, ct,
4061 len, /*unrolls*/4);
4062
4063 #ifdef ASSERT
4064 { Label L;
4065 __ cmp(len, (unsigned char)0);
4066 __ br(Assembler::EQ, L);
4067 __ stop("stubGenerator: abort");
4068 __ bind(L);
4069 }
4070 #endif
4071
4072 __ bind(DONE);
4073 // Return the number of bytes processed
4074 __ ldr(r0, __ post(sp, 2 * wordSize));
4075
4076 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
4077 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
4078
4079 __ leave(); // required for proper stackwalking of RuntimeStub frame
4080 __ ret(lr);
4081
4082 // bind label and generate polynomial data
4083 __ align(wordSize * 2);
4084 __ bind(ghash_polynomial);
4085 __ emit_int64(0x87); // The low-order bits of the field
4086 // polynomial (i.e. p = z^7+z^2+z+1)
4087 // repeated in the low and high parts of a
4088 // 128-bit vector
4089 __ emit_int64(0x87);
4090
4091 // record the stub entry and end
4092 store_archive_data(stub_id, start, __ pc());
4093
4094 return start;
4095 }
4096
4097 class Cached64Bytes {
4098 private:
4099 MacroAssembler *_masm;
4100 Register _regs[8];
4101
4102 public:
4103 Cached64Bytes(MacroAssembler *masm, RegSet rs): _masm(masm) {
4104 assert(rs.size() == 8, "%u registers are used to cache 16 4-byte data", rs.size());
4105 auto it = rs.begin();
4106 for (auto &r: _regs) {
4107 r = *it;
4108 ++it;
4109 }
4110 }
4111
4112 void gen_loads(Register base) {
4113 for (int i = 0; i < 8; i += 2) {
4114 __ ldp(_regs[i], _regs[i + 1], Address(base, 8 * i));
4115 }
4116 }
4117
4118 // Generate code extracting i-th unsigned word (4 bytes) from cached 64 bytes.
4119 void extract_u32(Register dest, int i) {
4120 __ ubfx(dest, _regs[i / 2], 32 * (i % 2), 32);
4121 }
4122 };
4123
4124 // Utility routines for md5.
4125 // Clobbers r10 and r11.
4126 void md5_FF(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
4127 int k, int s, int t) {
4128 Register rscratch3 = r10;
4129 Register rscratch4 = r11;
4130
4131 __ eorw(rscratch3, r3, r4);
4132 __ movw(rscratch2, t);
4133 __ andw(rscratch3, rscratch3, r2);
4134 __ addw(rscratch4, r1, rscratch2);
4135 reg_cache.extract_u32(rscratch1, k);
4136 __ eorw(rscratch3, rscratch3, r4);
4137 __ addw(rscratch4, rscratch4, rscratch1);
4138 __ addw(rscratch3, rscratch3, rscratch4);
4139 __ rorw(rscratch2, rscratch3, 32 - s);
4140 __ addw(r1, rscratch2, r2);
4141 }
4142
4143 void md5_GG(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
4144 int k, int s, int t) {
4145 Register rscratch3 = r10;
4146 Register rscratch4 = r11;
4147
4148 reg_cache.extract_u32(rscratch1, k);
4149 __ movw(rscratch2, t);
4150 __ addw(rscratch4, r1, rscratch2);
4151 __ addw(rscratch4, rscratch4, rscratch1);
4152 __ bicw(rscratch2, r3, r4);
4153 __ andw(rscratch3, r2, r4);
4154 __ addw(rscratch2, rscratch2, rscratch4);
4155 __ addw(rscratch2, rscratch2, rscratch3);
4156 __ rorw(rscratch2, rscratch2, 32 - s);
4157 __ addw(r1, rscratch2, r2);
4158 }
4159
4160 void md5_HH(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
4161 int k, int s, int t) {
4162 Register rscratch3 = r10;
4163 Register rscratch4 = r11;
4164
4165 __ eorw(rscratch3, r3, r4);
4166 __ movw(rscratch2, t);
4167 __ addw(rscratch4, r1, rscratch2);
4168 reg_cache.extract_u32(rscratch1, k);
4169 __ eorw(rscratch3, rscratch3, r2);
4170 __ addw(rscratch4, rscratch4, rscratch1);
4171 __ addw(rscratch3, rscratch3, rscratch4);
4172 __ rorw(rscratch2, rscratch3, 32 - s);
4173 __ addw(r1, rscratch2, r2);
4174 }
4175
4176 void md5_II(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
4177 int k, int s, int t) {
4178 Register rscratch3 = r10;
4179 Register rscratch4 = r11;
4180
4181 __ movw(rscratch3, t);
4182 __ ornw(rscratch2, r2, r4);
4183 __ addw(rscratch4, r1, rscratch3);
4184 reg_cache.extract_u32(rscratch1, k);
4185 __ eorw(rscratch3, rscratch2, r3);
4186 __ addw(rscratch4, rscratch4, rscratch1);
4187 __ addw(rscratch3, rscratch3, rscratch4);
4188 __ rorw(rscratch2, rscratch3, 32 - s);
4189 __ addw(r1, rscratch2, r2);
4190 }
4191
4192 // Arguments:
4193 //
4194 // Inputs:
4195 // c_rarg0 - byte[] source+offset
4196 // c_rarg1 - int[] SHA.state
4197 // c_rarg2 - int offset
4198 // c_rarg3 - int limit
4199 //
4200 address generate_md5_implCompress(StubId stub_id) {
4201 bool multi_block;
4202 switch (stub_id) {
4203 case StubId::stubgen_md5_implCompress_id:
4204 multi_block = false;
4205 break;
4206 case StubId::stubgen_md5_implCompressMB_id:
4207 multi_block = true;
4208 break;
4209 default:
4210 ShouldNotReachHere();
4211 }
4212 int entry_count = StubInfo::entry_count(stub_id);
4213 assert(entry_count == 1, "sanity check");
4214 address start = load_archive_data(stub_id);
4215 if (start != nullptr) {
4216 return start;
4217 }
4218 __ align(CodeEntryAlignment);
4219
4220 StubCodeMark mark(this, stub_id);
4221 start = __ pc();
4222
4223 Register buf = c_rarg0;
4224 Register state = c_rarg1;
4225 Register ofs = c_rarg2;
4226 Register limit = c_rarg3;
4227 Register a = r4;
4228 Register b = r5;
4229 Register c = r6;
4230 Register d = r7;
4231 Register rscratch3 = r10;
4232 Register rscratch4 = r11;
4233
4234 Register state_regs[2] = { r12, r13 };
4235 RegSet saved_regs = RegSet::range(r16, r22) - r18_tls;
4236 Cached64Bytes reg_cache(_masm, RegSet::of(r14, r15) + saved_regs); // using 8 registers
4237
4238 __ push(saved_regs, sp);
4239
4240 __ ldp(state_regs[0], state_regs[1], Address(state));
4241 __ ubfx(a, state_regs[0], 0, 32);
4242 __ ubfx(b, state_regs[0], 32, 32);
4243 __ ubfx(c, state_regs[1], 0, 32);
4244 __ ubfx(d, state_regs[1], 32, 32);
4245
4246 Label md5_loop;
4247 __ BIND(md5_loop);
4248
4249 reg_cache.gen_loads(buf);
4250
4251 // Round 1
4252 md5_FF(reg_cache, a, b, c, d, 0, 7, 0xd76aa478);
4253 md5_FF(reg_cache, d, a, b, c, 1, 12, 0xe8c7b756);
4254 md5_FF(reg_cache, c, d, a, b, 2, 17, 0x242070db);
4255 md5_FF(reg_cache, b, c, d, a, 3, 22, 0xc1bdceee);
4256 md5_FF(reg_cache, a, b, c, d, 4, 7, 0xf57c0faf);
4257 md5_FF(reg_cache, d, a, b, c, 5, 12, 0x4787c62a);
4258 md5_FF(reg_cache, c, d, a, b, 6, 17, 0xa8304613);
4259 md5_FF(reg_cache, b, c, d, a, 7, 22, 0xfd469501);
4260 md5_FF(reg_cache, a, b, c, d, 8, 7, 0x698098d8);
4261 md5_FF(reg_cache, d, a, b, c, 9, 12, 0x8b44f7af);
4262 md5_FF(reg_cache, c, d, a, b, 10, 17, 0xffff5bb1);
4263 md5_FF(reg_cache, b, c, d, a, 11, 22, 0x895cd7be);
4264 md5_FF(reg_cache, a, b, c, d, 12, 7, 0x6b901122);
4265 md5_FF(reg_cache, d, a, b, c, 13, 12, 0xfd987193);
4266 md5_FF(reg_cache, c, d, a, b, 14, 17, 0xa679438e);
4267 md5_FF(reg_cache, b, c, d, a, 15, 22, 0x49b40821);
4268
4269 // Round 2
4270 md5_GG(reg_cache, a, b, c, d, 1, 5, 0xf61e2562);
4271 md5_GG(reg_cache, d, a, b, c, 6, 9, 0xc040b340);
4272 md5_GG(reg_cache, c, d, a, b, 11, 14, 0x265e5a51);
4273 md5_GG(reg_cache, b, c, d, a, 0, 20, 0xe9b6c7aa);
4274 md5_GG(reg_cache, a, b, c, d, 5, 5, 0xd62f105d);
4275 md5_GG(reg_cache, d, a, b, c, 10, 9, 0x02441453);
4276 md5_GG(reg_cache, c, d, a, b, 15, 14, 0xd8a1e681);
4277 md5_GG(reg_cache, b, c, d, a, 4, 20, 0xe7d3fbc8);
4278 md5_GG(reg_cache, a, b, c, d, 9, 5, 0x21e1cde6);
4279 md5_GG(reg_cache, d, a, b, c, 14, 9, 0xc33707d6);
4280 md5_GG(reg_cache, c, d, a, b, 3, 14, 0xf4d50d87);
4281 md5_GG(reg_cache, b, c, d, a, 8, 20, 0x455a14ed);
4282 md5_GG(reg_cache, a, b, c, d, 13, 5, 0xa9e3e905);
4283 md5_GG(reg_cache, d, a, b, c, 2, 9, 0xfcefa3f8);
4284 md5_GG(reg_cache, c, d, a, b, 7, 14, 0x676f02d9);
4285 md5_GG(reg_cache, b, c, d, a, 12, 20, 0x8d2a4c8a);
4286
4287 // Round 3
4288 md5_HH(reg_cache, a, b, c, d, 5, 4, 0xfffa3942);
4289 md5_HH(reg_cache, d, a, b, c, 8, 11, 0x8771f681);
4290 md5_HH(reg_cache, c, d, a, b, 11, 16, 0x6d9d6122);
4291 md5_HH(reg_cache, b, c, d, a, 14, 23, 0xfde5380c);
4292 md5_HH(reg_cache, a, b, c, d, 1, 4, 0xa4beea44);
4293 md5_HH(reg_cache, d, a, b, c, 4, 11, 0x4bdecfa9);
4294 md5_HH(reg_cache, c, d, a, b, 7, 16, 0xf6bb4b60);
4295 md5_HH(reg_cache, b, c, d, a, 10, 23, 0xbebfbc70);
4296 md5_HH(reg_cache, a, b, c, d, 13, 4, 0x289b7ec6);
4297 md5_HH(reg_cache, d, a, b, c, 0, 11, 0xeaa127fa);
4298 md5_HH(reg_cache, c, d, a, b, 3, 16, 0xd4ef3085);
4299 md5_HH(reg_cache, b, c, d, a, 6, 23, 0x04881d05);
4300 md5_HH(reg_cache, a, b, c, d, 9, 4, 0xd9d4d039);
4301 md5_HH(reg_cache, d, a, b, c, 12, 11, 0xe6db99e5);
4302 md5_HH(reg_cache, c, d, a, b, 15, 16, 0x1fa27cf8);
4303 md5_HH(reg_cache, b, c, d, a, 2, 23, 0xc4ac5665);
4304
4305 // Round 4
4306 md5_II(reg_cache, a, b, c, d, 0, 6, 0xf4292244);
4307 md5_II(reg_cache, d, a, b, c, 7, 10, 0x432aff97);
4308 md5_II(reg_cache, c, d, a, b, 14, 15, 0xab9423a7);
4309 md5_II(reg_cache, b, c, d, a, 5, 21, 0xfc93a039);
4310 md5_II(reg_cache, a, b, c, d, 12, 6, 0x655b59c3);
4311 md5_II(reg_cache, d, a, b, c, 3, 10, 0x8f0ccc92);
4312 md5_II(reg_cache, c, d, a, b, 10, 15, 0xffeff47d);
4313 md5_II(reg_cache, b, c, d, a, 1, 21, 0x85845dd1);
4314 md5_II(reg_cache, a, b, c, d, 8, 6, 0x6fa87e4f);
4315 md5_II(reg_cache, d, a, b, c, 15, 10, 0xfe2ce6e0);
4316 md5_II(reg_cache, c, d, a, b, 6, 15, 0xa3014314);
4317 md5_II(reg_cache, b, c, d, a, 13, 21, 0x4e0811a1);
4318 md5_II(reg_cache, a, b, c, d, 4, 6, 0xf7537e82);
4319 md5_II(reg_cache, d, a, b, c, 11, 10, 0xbd3af235);
4320 md5_II(reg_cache, c, d, a, b, 2, 15, 0x2ad7d2bb);
4321 md5_II(reg_cache, b, c, d, a, 9, 21, 0xeb86d391);
4322
4323 __ addw(a, state_regs[0], a);
4324 __ ubfx(rscratch2, state_regs[0], 32, 32);
4325 __ addw(b, rscratch2, b);
4326 __ addw(c, state_regs[1], c);
4327 __ ubfx(rscratch4, state_regs[1], 32, 32);
4328 __ addw(d, rscratch4, d);
4329
4330 __ orr(state_regs[0], a, b, Assembler::LSL, 32);
4331 __ orr(state_regs[1], c, d, Assembler::LSL, 32);
4332
4333 if (multi_block) {
4334 __ add(buf, buf, 64);
4335 __ add(ofs, ofs, 64);
4336 __ cmp(ofs, limit);
4337 __ br(Assembler::LE, md5_loop);
4338 __ mov(c_rarg0, ofs); // return ofs
4339 }
4340
4341 // write hash values back in the correct order
4342 __ stp(state_regs[0], state_regs[1], Address(state));
4343
4344 __ pop(saved_regs, sp);
4345
4346 __ ret(lr);
4347
4348 // record the stub entry and end
4349 store_archive_data(stub_id, start, __ pc());
4350
4351 return start;
4352 }
4353
4354 // Arguments:
4355 //
4356 // Inputs:
4357 // c_rarg0 - byte[] source+offset
4358 // c_rarg1 - int[] SHA.state
4359 // c_rarg2 - int offset
4360 // c_rarg3 - int limit
4361 //
4362 address generate_sha1_implCompress(StubId stub_id) {
4363 bool multi_block;
4364 switch (stub_id) {
4365 case StubId::stubgen_sha1_implCompress_id:
4366 multi_block = false;
4367 break;
4368 case StubId::stubgen_sha1_implCompressMB_id:
4369 multi_block = true;
4370 break;
4371 default:
4372 ShouldNotReachHere();
4373 }
4374 int entry_count = StubInfo::entry_count(stub_id);
4375 assert(entry_count == 1, "sanity check");
4376 address start = load_archive_data(stub_id);
4377 if (start != nullptr) {
4378 return start;
4379 }
4380 __ align(CodeEntryAlignment);
4381
4382 StubCodeMark mark(this, stub_id);
4383 start = __ pc();
4384
4385 Register buf = c_rarg0;
4386 Register state = c_rarg1;
4387 Register ofs = c_rarg2;
4388 Register limit = c_rarg3;
4389
4390 Label keys;
4391 Label sha1_loop;
4392
4393 // load the keys into v0..v3
4394 __ adr(rscratch1, keys);
4395 __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
4396 // load 5 words state into v6, v7
4397 __ ldrq(v6, Address(state, 0));
4398 __ ldrs(v7, Address(state, 16));
4399
4400
4401 __ BIND(sha1_loop);
4402 // load 64 bytes of data into v16..v19
4403 __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
4404 __ rev32(v16, __ T16B, v16);
4405 __ rev32(v17, __ T16B, v17);
4406 __ rev32(v18, __ T16B, v18);
4407 __ rev32(v19, __ T16B, v19);
4408
4409 // do the sha1
4410 __ addv(v4, __ T4S, v16, v0);
4411 __ orr(v20, __ T16B, v6, v6);
4412
4413 FloatRegister d0 = v16;
4414 FloatRegister d1 = v17;
4415 FloatRegister d2 = v18;
4416 FloatRegister d3 = v19;
4417
4418 for (int round = 0; round < 20; round++) {
4419 FloatRegister tmp1 = (round & 1) ? v4 : v5;
4420 FloatRegister tmp2 = (round & 1) ? v21 : v22;
4421 FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
4422 FloatRegister tmp4 = (round & 1) ? v5 : v4;
4423 FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
4424
4425 if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
4426 if (round < 19) __ addv(tmp1, __ T4S, d1, key);
4427 __ sha1h(tmp2, __ T4S, v20);
4428 if (round < 5)
4429 __ sha1c(v20, __ T4S, tmp3, tmp4);
4430 else if (round < 10 || round >= 15)
4431 __ sha1p(v20, __ T4S, tmp3, tmp4);
4432 else
4433 __ sha1m(v20, __ T4S, tmp3, tmp4);
4434 if (round < 16) __ sha1su1(d0, __ T4S, d3);
4435
4436 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
4437 }
4438
4439 __ addv(v7, __ T2S, v7, v21);
4440 __ addv(v6, __ T4S, v6, v20);
4441
4442 if (multi_block) {
4443 __ add(ofs, ofs, 64);
4444 __ cmp(ofs, limit);
4445 __ br(Assembler::LE, sha1_loop);
4446 __ mov(c_rarg0, ofs); // return ofs
4447 }
4448
4449 __ strq(v6, Address(state, 0));
4450 __ strs(v7, Address(state, 16));
4451
4452 __ ret(lr);
4453
4454 __ bind(keys);
4455 __ emit_int32(0x5a827999);
4456 __ emit_int32(0x6ed9eba1);
4457 __ emit_int32(0x8f1bbcdc);
4458 __ emit_int32(0xca62c1d6);
4459
4460 // record the stub entry and end
4461 store_archive_data(stub_id, start, __ pc());
4462
4463 return start;
4464 }
4465
4466
4467 // Arguments:
4468 //
4469 // Inputs:
4470 // c_rarg0 - byte[] source+offset
4471 // c_rarg1 - int[] SHA.state
4472 // c_rarg2 - int offset
4473 // c_rarg3 - int limit
4474 //
4475 address generate_sha256_implCompress(StubId stub_id) {
4476 bool multi_block;
4477 switch (stub_id) {
4478 case StubId::stubgen_sha256_implCompress_id:
4479 multi_block = false;
4480 break;
4481 case StubId::stubgen_sha256_implCompressMB_id:
4482 multi_block = true;
4483 break;
4484 default:
4485 ShouldNotReachHere();
4486 }
4487 int entry_count = StubInfo::entry_count(stub_id);
4488 assert(entry_count == 1, "sanity check");
4489 address start = load_archive_data(stub_id);
4490 if (start != nullptr) {
4491 return start;
4492 }
4493 __ align(CodeEntryAlignment);
4494 StubCodeMark mark(this, stub_id);
4495 start = __ pc();
4496
4497 Register buf = c_rarg0;
4498 Register state = c_rarg1;
4499 Register ofs = c_rarg2;
4500 Register limit = c_rarg3;
4501
4502 Label sha1_loop;
4503
4504 __ stpd(v8, v9, __ pre(sp, -32));
4505 __ stpd(v10, v11, Address(sp, 16));
4506
4507 // dga == v0
4508 // dgb == v1
4509 // dg0 == v2
4510 // dg1 == v3
4511 // dg2 == v4
4512 // t0 == v6
4513 // t1 == v7
4514
4515 // load 16 keys to v16..v31
4516 __ lea(rscratch1, ExternalAddress((address)_sha256_round_consts));
4517 __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
4518 __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
4519 __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
4520 __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
4521
4522 // load 8 words (256 bits) state
4523 __ ldpq(v0, v1, state);
4524
4525 __ BIND(sha1_loop);
4526 // load 64 bytes of data into v8..v11
4527 __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
4528 __ rev32(v8, __ T16B, v8);
4529 __ rev32(v9, __ T16B, v9);
4530 __ rev32(v10, __ T16B, v10);
4531 __ rev32(v11, __ T16B, v11);
4532
4533 __ addv(v6, __ T4S, v8, v16);
4534 __ orr(v2, __ T16B, v0, v0);
4535 __ orr(v3, __ T16B, v1, v1);
4536
4537 FloatRegister d0 = v8;
4538 FloatRegister d1 = v9;
4539 FloatRegister d2 = v10;
4540 FloatRegister d3 = v11;
4541
4542
4543 for (int round = 0; round < 16; round++) {
4544 FloatRegister tmp1 = (round & 1) ? v6 : v7;
4545 FloatRegister tmp2 = (round & 1) ? v7 : v6;
4546 FloatRegister tmp3 = (round & 1) ? v2 : v4;
4547 FloatRegister tmp4 = (round & 1) ? v4 : v2;
4548
4549 if (round < 12) __ sha256su0(d0, __ T4S, d1);
4550 __ orr(v4, __ T16B, v2, v2);
4551 if (round < 15)
4552 __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
4553 __ sha256h(v2, __ T4S, v3, tmp2);
4554 __ sha256h2(v3, __ T4S, v4, tmp2);
4555 if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
4556
4557 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
4558 }
4559
4560 __ addv(v0, __ T4S, v0, v2);
4561 __ addv(v1, __ T4S, v1, v3);
4562
4563 if (multi_block) {
4564 __ add(ofs, ofs, 64);
4565 __ cmp(ofs, limit);
4566 __ br(Assembler::LE, sha1_loop);
4567 __ mov(c_rarg0, ofs); // return ofs
4568 }
4569
4570 __ ldpd(v10, v11, Address(sp, 16));
4571 __ ldpd(v8, v9, __ post(sp, 32));
4572
4573 __ stpq(v0, v1, state);
4574
4575 __ ret(lr);
4576
4577 // record the stub entry and end
4578 store_archive_data(stub_id, start, __ pc());
4579
4580 return start;
4581 }
4582
4583 // Double rounds for sha512.
4584 void sha512_dround(int dr,
4585 FloatRegister vi0, FloatRegister vi1,
4586 FloatRegister vi2, FloatRegister vi3,
4587 FloatRegister vi4, FloatRegister vrc0,
4588 FloatRegister vrc1, FloatRegister vin0,
4589 FloatRegister vin1, FloatRegister vin2,
4590 FloatRegister vin3, FloatRegister vin4) {
4591 if (dr < 36) {
4592 __ ld1(vrc1, __ T2D, __ post(rscratch2, 16));
4593 }
4594 __ addv(v5, __ T2D, vrc0, vin0);
4595 __ ext(v6, __ T16B, vi2, vi3, 8);
4596 __ ext(v5, __ T16B, v5, v5, 8);
4597 __ ext(v7, __ T16B, vi1, vi2, 8);
4598 __ addv(vi3, __ T2D, vi3, v5);
4599 if (dr < 32) {
4600 __ ext(v5, __ T16B, vin3, vin4, 8);
4601 __ sha512su0(vin0, __ T2D, vin1);
4602 }
4603 __ sha512h(vi3, __ T2D, v6, v7);
4604 if (dr < 32) {
4605 __ sha512su1(vin0, __ T2D, vin2, v5);
4606 }
4607 __ addv(vi4, __ T2D, vi1, vi3);
4608 __ sha512h2(vi3, __ T2D, vi1, vi0);
4609 }
4610
4611 // Arguments:
4612 //
4613 // Inputs:
4614 // c_rarg0 - byte[] source+offset
4615 // c_rarg1 - int[] SHA.state
4616 // c_rarg2 - int offset
4617 // c_rarg3 - int limit
4618 //
// Generates the SHA-512 compression stub. stub_id selects the single-block
// variant (stubgen_sha512_implCompress_id) or the multi-block variant
// (stubgen_sha512_implCompressMB_id); any other id hits ShouldNotReachHere().
// Returns the stub entry address: either the non-null result of
// load_archive_data(), or the pc at which the code emitted below starts.
4619 address generate_sha512_implCompress(StubId stub_id) {
4620 bool multi_block;
4621 switch (stub_id) {
4622 case StubId::stubgen_sha512_implCompress_id:
4623 multi_block = false;
4624 break;
4625 case StubId::stubgen_sha512_implCompressMB_id:
4626 multi_block = true;
4627 break;
4628 default:
4629 ShouldNotReachHere();
4630 }
4631 int entry_count = StubInfo::entry_count(stub_id);
4632 assert(entry_count == 1, "sanity check");
// a non-null address from load_archive_data() is returned as-is; nothing
// is generated in that case
4633 address start = load_archive_data(stub_id);
4634 if (start != nullptr) {
4635 return start;
4636 }
4637 __ align(CodeEntryAlignment);
4638 StubCodeMark mark(this, stub_id);
4639 start = __ pc();
4640
4641 Register buf = c_rarg0;
4642 Register state = c_rarg1;
4643 Register ofs = c_rarg2;
4644 Register limit = c_rarg3;
4645
// v8..v15 are overwritten below (state in v8..v11, data in v12..v19), so
// their low 64 bits are saved in a 64-byte stack area and restored before
// the stub returns
4646 __ stpd(v8, v9, __ pre(sp, -64));
4647 __ stpd(v10, v11, Address(sp, 16));
4648 __ stpd(v12, v13, Address(sp, 32));
4649 __ stpd(v14, v15, Address(sp, 48));
4650
4651 Label sha512_loop;
4652
4653 // load state
4654 __ ld1(v8, v9, v10, v11, __ T2D, state);
4655
4656 // load first 4 round constants
// (four 128-bit vectors into v20..v23; rscratch1 is left pointing just past
// them and is not modified again)
4657 __ lea(rscratch1, ExternalAddress((address)_sha512_round_consts));
4658 __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64));
4659
4660 __ BIND(sha512_loop);
4661 // load 128B of data into v12..v19
// (buf is post-incremented by 128 in both the single- and multi-block stub)
4662 __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64));
4663 __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64));
// byte-reverse each 64-bit word of the block
4664 __ rev64(v12, __ T16B, v12);
4665 __ rev64(v13, __ T16B, v13);
4666 __ rev64(v14, __ T16B, v14);
4667 __ rev64(v15, __ T16B, v15);
4668 __ rev64(v16, __ T16B, v16);
4669 __ rev64(v17, __ T16B, v17);
4670 __ rev64(v18, __ T16B, v18);
4671 __ rev64(v19, __ T16B, v19);
4672
// sha512_dround() streams the remaining constants through rscratch2 with
// post-increment; restart it for every block from the position saved in
// rscratch1
4673 __ mov(rscratch2, rscratch1);
4674
// working copy of the state; added back into v8..v11 after the rounds
4675 __ mov(v0, __ T16B, v8);
4676 __ mov(v1, __ T16B, v9);
4677 __ mov(v2, __ T16B, v10);
4678 __ mov(v3, __ T16B, v11);
4679
// 40 double rounds. Argument layout per call:
//   (dr, vi0..vi4, vrc0, vrc1, vin0..vin4)
// The vi0..vi4 assignment repeats every 5 calls and the vin0 register
// cycles through v12..v19 every 8 calls. From dr == 32 on, sha512_dround()
// does not read vin1..vin4, and from dr == 36 on it does not write vrc1, so
// v0 is passed as a placeholder for those arguments.
4680 sha512_dround( 0, v0, v1, v2, v3, v4, v20, v24, v12, v13, v19, v16, v17);
4681 sha512_dround( 1, v3, v0, v4, v2, v1, v21, v25, v13, v14, v12, v17, v18);
4682 sha512_dround( 2, v2, v3, v1, v4, v0, v22, v26, v14, v15, v13, v18, v19);
4683 sha512_dround( 3, v4, v2, v0, v1, v3, v23, v27, v15, v16, v14, v19, v12);
4684 sha512_dround( 4, v1, v4, v3, v0, v2, v24, v28, v16, v17, v15, v12, v13);
4685 sha512_dround( 5, v0, v1, v2, v3, v4, v25, v29, v17, v18, v16, v13, v14);
4686 sha512_dround( 6, v3, v0, v4, v2, v1, v26, v30, v18, v19, v17, v14, v15);
4687 sha512_dround( 7, v2, v3, v1, v4, v0, v27, v31, v19, v12, v18, v15, v16);
4688 sha512_dround( 8, v4, v2, v0, v1, v3, v28, v24, v12, v13, v19, v16, v17);
4689 sha512_dround( 9, v1, v4, v3, v0, v2, v29, v25, v13, v14, v12, v17, v18);
4690 sha512_dround(10, v0, v1, v2, v3, v4, v30, v26, v14, v15, v13, v18, v19);
4691 sha512_dround(11, v3, v0, v4, v2, v1, v31, v27, v15, v16, v14, v19, v12);
4692 sha512_dround(12, v2, v3, v1, v4, v0, v24, v28, v16, v17, v15, v12, v13);
4693 sha512_dround(13, v4, v2, v0, v1, v3, v25, v29, v17, v18, v16, v13, v14);
4694 sha512_dround(14, v1, v4, v3, v0, v2, v26, v30, v18, v19, v17, v14, v15);
4695 sha512_dround(15, v0, v1, v2, v3, v4, v27, v31, v19, v12, v18, v15, v16);
4696 sha512_dround(16, v3, v0, v4, v2, v1, v28, v24, v12, v13, v19, v16, v17);
4697 sha512_dround(17, v2, v3, v1, v4, v0, v29, v25, v13, v14, v12, v17, v18);
4698 sha512_dround(18, v4, v2, v0, v1, v3, v30, v26, v14, v15, v13, v18, v19);
4699 sha512_dround(19, v1, v4, v3, v0, v2, v31, v27, v15, v16, v14, v19, v12);
4700 sha512_dround(20, v0, v1, v2, v3, v4, v24, v28, v16, v17, v15, v12, v13);
4701 sha512_dround(21, v3, v0, v4, v2, v1, v25, v29, v17, v18, v16, v13, v14);
4702 sha512_dround(22, v2, v3, v1, v4, v0, v26, v30, v18, v19, v17, v14, v15);
4703 sha512_dround(23, v4, v2, v0, v1, v3, v27, v31, v19, v12, v18, v15, v16);
4704 sha512_dround(24, v1, v4, v3, v0, v2, v28, v24, v12, v13, v19, v16, v17);
4705 sha512_dround(25, v0, v1, v2, v3, v4, v29, v25, v13, v14, v12, v17, v18);
4706 sha512_dround(26, v3, v0, v4, v2, v1, v30, v26, v14, v15, v13, v18, v19);
4707 sha512_dround(27, v2, v3, v1, v4, v0, v31, v27, v15, v16, v14, v19, v12);
4708 sha512_dround(28, v4, v2, v0, v1, v3, v24, v28, v16, v17, v15, v12, v13);
4709 sha512_dround(29, v1, v4, v3, v0, v2, v25, v29, v17, v18, v16, v13, v14);
4710 sha512_dround(30, v0, v1, v2, v3, v4, v26, v30, v18, v19, v17, v14, v15);
4711 sha512_dround(31, v3, v0, v4, v2, v1, v27, v31, v19, v12, v18, v15, v16);
4712 sha512_dround(32, v2, v3, v1, v4, v0, v28, v24, v12, v0, v0, v0, v0);
4713 sha512_dround(33, v4, v2, v0, v1, v3, v29, v25, v13, v0, v0, v0, v0);
4714 sha512_dround(34, v1, v4, v3, v0, v2, v30, v26, v14, v0, v0, v0, v0);
4715 sha512_dround(35, v0, v1, v2, v3, v4, v31, v27, v15, v0, v0, v0, v0);
4716 sha512_dround(36, v3, v0, v4, v2, v1, v24, v0, v16, v0, v0, v0, v0);
4717 sha512_dround(37, v2, v3, v1, v4, v0, v25, v0, v17, v0, v0, v0, v0);
4718 sha512_dround(38, v4, v2, v0, v1, v3, v26, v0, v18, v0, v0, v0, v0);
4719 sha512_dround(39, v1, v4, v3, v0, v2, v27, v0, v19, v0, v0, v0, v0);
4720
// add the working values in v0..v3 back into the state
4721 __ addv(v8, __ T2D, v8, v0);
4722 __ addv(v9, __ T2D, v9, v1);
4723 __ addv(v10, __ T2D, v10, v2);
4724 __ addv(v11, __ T2D, v11, v3);
4725
// multi-block stub only: advance ofs by one 128-byte block, process another
// block while ofs <= limit, then hand the final ofs back in c_rarg0
4726 if (multi_block) {
4727 __ add(ofs, ofs, 128);
4728 __ cmp(ofs, limit);
4729 __ br(Assembler::LE, sha512_loop);
4730 __ mov(c_rarg0, ofs); // return ofs
4731 }
4732
// write the updated state back
4733 __ st1(v8, v9, v10, v11, __ T2D, state);
4734
// restore the registers saved on entry and release the stack area
4735 __ ldpd(v14, v15, Address(sp, 48));
4736 __ ldpd(v12, v13, Address(sp, 32));
4737 __ ldpd(v10, v11, Address(sp, 16));
4738 __ ldpd(v8, v9, __ post(sp, 64));
4739
4740 __ ret(lr);
4741
4742 // record the stub entry and end
4743 store_archive_data(stub_id, start, __ pc());
4744
4745 return start;
4746 }
4747
4748 // Execute one round of keccak of two computations in parallel.
4749 // One of the states should be loaded into the lower halves of
4750 // the vector registers v0-v24, the other should be loaded into
4751 // the upper halves of those registers. The ld1r instruction loads
4752 // the round constant into both halves of register v31.
4753 // Intermediate results c0...c4 and d0...d4 are computed
4754 // in registers v25...v30.
4755 // All vector instructions that are used operate on both register
4756 // halves in parallel.
4757 // If only a single computation is needed, it is enough to load the lower halves.
// Emits one Keccak round on the 25 lanes held in v0..v24 (lane a<i> lives in
// v<i>). The register passed in must point at the round constant for this
// round; it is post-incremented by 8 bytes by the ld1r below. v25..v31 are
// overwritten. In the comments, a primed name (a10') is a new lane value that
// is temporarily held in a register other than its home register.
// NOTE(review): the parameter name shadows the rscratch1 register name that
// other code in this file uses directly.
4758 void keccak_round(Register rscratch1) {
// theta, part 1: column parities c0..c4
4759 __ eor3(v29, __ T16B, v4, v9, v14); // c4 = a4 ^ a9 ^ a14
4760 __ eor3(v26, __ T16B, v1, v6, v11); // c1 = a1 ^ a6 ^ a11
4761 __ eor3(v28, __ T16B, v3, v8, v13); // c3 = a3 ^ a8 ^ a13
4762 __ eor3(v25, __ T16B, v0, v5, v10); // c0 = a0 ^ a5 ^ a10
4763 __ eor3(v27, __ T16B, v2, v7, v12); // c2 = a2 ^ a7 ^ a12
4764 __ eor3(v29, __ T16B, v29, v19, v24); // c4 ^= a19 ^ a24
4765 __ eor3(v26, __ T16B, v26, v16, v21); // c1 ^= a16 ^ a21
4766 __ eor3(v28, __ T16B, v28, v18, v23); // c3 ^= a18 ^ a23
4767 __ eor3(v25, __ T16B, v25, v15, v20); // c0 ^= a15 ^ a20
4768 __ eor3(v27, __ T16B, v27, v17, v22); // c2 ^= a17 ^ a22
4769
// theta, part 2: d0..d4; each d overwrites a c register once that c is no
// longer needed (d0 goes to v30, the last read of c4 in v29 is for d3)
4770 __ rax1(v30, __ T2D, v29, v26); // d0 = c4 ^ rol(c1, 1)
4771 __ rax1(v26, __ T2D, v26, v28); // d2 = c1 ^ rol(c3, 1)
4772 __ rax1(v28, __ T2D, v28, v25); // d4 = c3 ^ rol(c0, 1)
4773 __ rax1(v25, __ T2D, v25, v27); // d1 = c0 ^ rol(c2, 1)
4774 __ rax1(v27, __ T2D, v27, v29); // d3 = c2 ^ rol(c4, 1)
4775
// rho and pi fused with the theta xor: each xar xors a lane with its d value
// and rotates it; a left rotation by r is encoded as (64 - r). The order of
// the instructions matters because destinations overwrite lanes that must
// already have been consumed.
4776 __ eor(v0, __ T16B, v0, v30); // a0 = a0 ^ d0
4777 __ xar(v29, __ T2D, v1, v25, (64 - 1)); // a10' = rol((a1^d1), 1)
4778 __ xar(v1, __ T2D, v6, v25, (64 - 44)); // a1 = rol((a6^d1), 44)
4779 __ xar(v6, __ T2D, v9, v28, (64 - 20)); // a6 = rol((a9^d4), 20)
4780 __ xar(v9, __ T2D, v22, v26, (64 - 61)); // a9 = rol((a22^d2), 61)
4781 __ xar(v22, __ T2D, v14, v28, (64 - 39)); // a22 = rol((a14^d4), 39)
4782 __ xar(v14, __ T2D, v20, v30, (64 - 18)); // a14 = rol((a20^d0), 18)
4783 __ xar(v31, __ T2D, v2, v26, (64 - 62)); // a20' = rol((a2^d2), 62)
4784 __ xar(v2, __ T2D, v12, v26, (64 - 43)); // a2 = rol((a12^d2), 43)
4785 __ xar(v12, __ T2D, v13, v27, (64 - 25)); // a12 = rol((a13^d3), 25)
4786 __ xar(v13, __ T2D, v19, v28, (64 - 8)); // a13 = rol((a19^d4), 8)
4787 __ xar(v19, __ T2D, v23, v27, (64 - 56)); // a19 = rol((a23^d3), 56)
4788 __ xar(v23, __ T2D, v15, v30, (64 - 41)); // a23 = rol((a15^d0), 41)
4789 __ xar(v15, __ T2D, v4, v28, (64 - 27)); // a15 = rol((a4^d4), 27)
4790 __ xar(v28, __ T2D, v24, v28, (64 - 14)); // a4' = rol((a24^d4), 14)
4791 __ xar(v24, __ T2D, v21, v25, (64 - 2)); // a24 = rol((a21^d1), 2)
4792 __ xar(v8, __ T2D, v8, v27, (64 - 55)); // a21' = rol((a8^d3), 55)
4793 __ xar(v4, __ T2D, v16, v25, (64 - 45)); // a8' = rol((a16^d1), 45)
4794 __ xar(v16, __ T2D, v5, v30, (64 - 36)); // a16 = rol((a5^d0), 36)
4795 __ xar(v5, __ T2D, v3, v27, (64 - 28)); // a5 = rol((a3^d3), 28)
4796 __ xar(v27, __ T2D, v18, v27, (64 - 21)); // a3' = rol((a18^d3), 21)
4797 __ xar(v3, __ T2D, v17, v26, (64 - 15)); // a18' = rol((a17^d2), 15)
4798 __ xar(v25, __ T2D, v11, v25, (64 - 10)); // a17' = rol((a11^d1), 10)
4799 __ xar(v26, __ T2D, v7, v26, (64 - 6)); // a11' = rol((a7^d2), 6)
4800 __ xar(v30, __ T2D, v10, v30, (64 - 3)); // a7' = rol((a10^d0), 3)
4801
// chi, row a20..a24
4802 __ bcax(v20, __ T16B, v31, v22, v8); // a20 = a20' ^ (~a21' & a22)
4803 __ bcax(v21, __ T16B, v8, v23, v22); // a21 = a21' ^ (~a22 & a23)
4804 __ bcax(v22, __ T16B, v22, v24, v23); // a22 = a22 ^ (~a23 & a24)
4805 __ bcax(v23, __ T16B, v23, v31, v24); // a23 = a23 ^ (~a24 & a20')
4806 __ bcax(v24, __ T16B, v24, v8, v31); // a24 = a24 ^ (~a20' & a21')
4807
// v31 no longer holds a20', so it can now receive the round constant
4808 __ ld1r(v31, __ T2D, __ post(rscratch1, 8)); // rc = round_constants[i]
4809
// chi, row a15..a19
4810 __ bcax(v17, __ T16B, v25, v19, v3); // a17 = a17' ^ (~a18' & a19)
4811 __ bcax(v18, __ T16B, v3, v15, v19); // a18 = a18' ^ (~a19 & a15)
4812 __ bcax(v19, __ T16B, v19, v16, v15); // a19 = a19 ^ (~a15 & a16)
4813 __ bcax(v15, __ T16B, v15, v25, v16); // a15 = a15 ^ (~a16 & a17')
4814 __ bcax(v16, __ T16B, v16, v3, v25); // a16 = a16 ^ (~a17' & a18')
4815
// chi, row a10..a14
4816 __ bcax(v10, __ T16B, v29, v12, v26); // a10 = a10' ^ (~a11' & a12)
4817 __ bcax(v11, __ T16B, v26, v13, v12); // a11 = a11' ^ (~a12 & a13)
4818 __ bcax(v12, __ T16B, v12, v14, v13); // a12 = a12 ^ (~a13 & a14)
4819 __ bcax(v13, __ T16B, v13, v29, v14); // a13 = a13 ^ (~a14 & a10')
4820 __ bcax(v14, __ T16B, v14, v26, v29); // a14 = a14 ^ (~a10' & a11')
4821
// chi, row a5..a9
4822 __ bcax(v7, __ T16B, v30, v9, v4); // a7 = a7' ^ (~a8' & a9)
4823 __ bcax(v8, __ T16B, v4, v5, v9); // a8 = a8' ^ (~a9 & a5)
4824 __ bcax(v9, __ T16B, v9, v6, v5); // a9 = a9 ^ (~a5 & a6)
4825 __ bcax(v5, __ T16B, v5, v30, v6); // a5 = a5 ^ (~a6 & a7')
4826 __ bcax(v6, __ T16B, v6, v4, v30); // a6 = a6 ^ (~a7' & a8')
4827
// chi, row a0..a4
4828 __ bcax(v3, __ T16B, v27, v0, v28); // a3 = a3' ^ (~a4' & a0)
4829 __ bcax(v4, __ T16B, v28, v1, v0); // a4 = a4' ^ (~a0 & a1)
4830 __ bcax(v0, __ T16B, v0, v2, v1); // a0 = a0 ^ (~a1 & a2)
4831 __ bcax(v1, __ T16B, v1, v27, v2); // a1 = a1 ^ (~a2 & a3')
4832 __ bcax(v2, __ T16B, v2, v28, v27); // a2 = a2 ^ (~a3' & a4')
4833
// iota
4834 __ eor(v0, __ T16B, v0, v31); // a0 = a0 ^ rc
4835 }
4836
4837 // Arguments:
4838 //
4839 // Inputs:
4840 // c_rarg0 - byte[] source+offset
4841 // c_rarg1 - byte[] SHA.state
4842 // c_rarg2 - int block_size
4843 // c_rarg3 - int offset
4844 // c_rarg4 - int limit
4845 //
// Generates the SHA-3 compression stub. stub_id selects the single-block
// variant (stubgen_sha3_implCompress_id) or the multi-block variant
// (stubgen_sha3_implCompressMB_id); any other id hits ShouldNotReachHere().
// The emitted code XORs one input block of block_size bytes into the state
// lanes, runs 24 keccak_round() iterations, and stores the state back. The
// comments below name the block sizes 72, 104, 136, 144 and 168.
// Returns the stub entry address: either the non-null result of
// load_archive_data(), or the pc at which the code emitted below starts.
4846 address generate_sha3_implCompress(StubId stub_id) {
4847 bool multi_block;
4848 switch (stub_id) {
4849 case StubId::stubgen_sha3_implCompress_id:
4850 multi_block = false;
4851 break;
4852 case StubId::stubgen_sha3_implCompressMB_id:
4853 multi_block = true;
4854 break;
4855 default:
4856 ShouldNotReachHere();
4857 }
4858 int entry_count = StubInfo::entry_count(stub_id);
4859 assert(entry_count == 1, "sanity check");
// a non-null address from load_archive_data() is returned as-is; nothing
// is generated in that case
4860 address start = load_archive_data(stub_id);
4861 if (start != nullptr) {
4862 return start;
4863 }
4864 __ align(CodeEntryAlignment);
4865 StubCodeMark mark(this, stub_id);
4866 start = __ pc();
4867
4868 Register buf = c_rarg0;
4869 Register state = c_rarg1;
4870 Register block_size = c_rarg2;
4871 Register ofs = c_rarg3;
4872 Register limit = c_rarg4;
4873
4874 Label sha3_loop, rounds24_loop;
4875 Label sha3_512_or_sha3_384, shake128;
4876
// save callee-saved registers (low 64 bits of v8..v15) in a 64-byte stack area
4877 __ stpd(v8, v9, __ pre(sp, -64));
4878 __ stpd(v10, v11, Address(sp, 16));
4879 __ stpd(v12, v13, Address(sp, 32));
4880 __ stpd(v14, v15, Address(sp, 48));
4881
4882 // load state
// 25 64-bit lanes, one per register v0..v24; the state register itself is
// left untouched here, rscratch1 walks the remaining lanes
4883 __ add(rscratch1, state, 32);
4884 __ ld1(v0, v1, v2, v3, __ T1D, state);
4885 __ ld1(v4, v5, v6, v7, __ T1D, __ post(rscratch1, 32));
4886 __ ld1(v8, v9, v10, v11, __ T1D, __ post(rscratch1, 32));
4887 __ ld1(v12, v13, v14, v15, __ T1D, __ post(rscratch1, 32));
4888 __ ld1(v16, v17, v18, v19, __ T1D, __ post(rscratch1, 32));
4889 __ ld1(v20, v21, v22, v23, __ T1D, __ post(rscratch1, 32));
4890 __ ld1(v24, __ T1D, rscratch1);
4891
// per-block entry point: the round counter and the round constant pointer
// are re-initialised for every block
4892 __ BIND(sha3_loop);
4893
4894 // 24 keccak rounds
4895 __ movw(rscratch2, 24);
4896
4897 // load round_constants base
4898 __ lea(rscratch1, ExternalAddress((address) _sha3_round_consts));
4899
4900 // load input
// the first 56 bytes (7 lanes) are XORed into a0..a6 for every block size;
// v25..v31 serve as input staging registers
4901 __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
4902 __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
4903 __ eor(v0, __ T8B, v0, v25);
4904 __ eor(v1, __ T8B, v1, v26);
4905 __ eor(v2, __ T8B, v2, v27);
4906 __ eor(v3, __ T8B, v3, v28);
4907 __ eor(v4, __ T8B, v4, v29);
4908 __ eor(v5, __ T8B, v5, v30);
4909 __ eor(v6, __ T8B, v6, v31);
4910
4911 // block_size == 72, SHA3-512; block_size == 104, SHA3-384
// (bit 7 is clear only for these two sizes)
4912 __ tbz(block_size, 7, sha3_512_or_sha3_384);
4913
// block_size >= 136: absorb 80 more bytes into a7..a16 (136 bytes so far)
4914 __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
4915 __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
4916 __ eor(v7, __ T8B, v7, v25);
4917 __ eor(v8, __ T8B, v8, v26);
4918 __ eor(v9, __ T8B, v9, v27);
4919 __ eor(v10, __ T8B, v10, v28);
4920 __ eor(v11, __ T8B, v11, v29);
4921 __ eor(v12, __ T8B, v12, v30);
4922 __ eor(v13, __ T8B, v13, v31);
4923
4924 __ ld1(v25, v26, v27, __ T8B, __ post(buf, 24));
4925 __ eor(v14, __ T8B, v14, v25);
4926 __ eor(v15, __ T8B, v15, v26);
4927 __ eor(v16, __ T8B, v16, v27);
4928
4929 // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256
// (c_rarg5 is used as a scratch register for the bit test)
4930 __ andw(c_rarg5, block_size, 48);
4931 __ cbzw(c_rarg5, rounds24_loop);
4932
4933 __ tbnz(block_size, 5, shake128);
4934 // block_size == 144, bit5 == 0, SHA3-224
// one more lane (8 bytes) into a17
4935 __ ldrd(v28, __ post(buf, 8));
4936 __ eor(v17, __ T8B, v17, v28);
4937 __ b(rounds24_loop);
4938
// four more lanes (32 bytes) into a17..a20, 168 bytes in total
4939 __ BIND(shake128);
4940 __ ld1(v28, v29, v30, v31, __ T8B, __ post(buf, 32));
4941 __ eor(v17, __ T8B, v17, v28);
4942 __ eor(v18, __ T8B, v18, v29);
4943 __ eor(v19, __ T8B, v19, v30);
4944 __ eor(v20, __ T8B, v20, v31);
4945 __ b(rounds24_loop); // block_size == 168, SHAKE128
4946
// two more lanes (16 bytes) into a7..a8, 72 bytes in total
4947 __ BIND(sha3_512_or_sha3_384);
4948 __ ld1(v25, v26, __ T8B, __ post(buf, 16));
4949 __ eor(v7, __ T8B, v7, v25);
4950 __ eor(v8, __ T8B, v8, v26);
4951 __ tbz(block_size, 5, rounds24_loop); // SHA3-512
4952
4953 // SHA3-384
// four more lanes (32 bytes) into a9..a12, 104 bytes in total
4954 __ ld1(v27, v28, v29, v30, __ T8B, __ post(buf, 32));
4955 __ eor(v9, __ T8B, v9, v27);
4956 __ eor(v10, __ T8B, v10, v28);
4957 __ eor(v11, __ T8B, v11, v29);
4958 __ eor(v12, __ T8B, v12, v30);
4959
// run keccak_round() until the counter in rscratch2 reaches zero;
// keccak_round() advances rscratch1 to the next round constant
4960 __ BIND(rounds24_loop);
4961 __ subw(rscratch2, rscratch2, 1);
4962
4963 keccak_round(rscratch1);
4964
4965 __ cbnzw(rscratch2, rounds24_loop);
4966
// multi-block stub only: advance ofs by block_size, process another block
// while ofs <= limit, then hand the final ofs back in c_rarg0
4967 if (multi_block) {
4968 __ add(ofs, ofs, block_size);
4969 __ cmp(ofs, limit);
4970 __ br(Assembler::LE, sha3_loop);
4971 __ mov(c_rarg0, ofs); // return ofs
4972 }
4973
// write the 25 lanes back; the state register is post-incremented while
// storing, so it no longer points at the start of the state afterwards
4974 __ st1(v0, v1, v2, v3, __ T1D, __ post(state, 32));
4975 __ st1(v4, v5, v6, v7, __ T1D, __ post(state, 32));
4976 __ st1(v8, v9, v10, v11, __ T1D, __ post(state, 32));
4977 __ st1(v12, v13, v14, v15, __ T1D, __ post(state, 32));
4978 __ st1(v16, v17, v18, v19, __ T1D, __ post(state, 32));
4979 __ st1(v20, v21, v22, v23, __ T1D, __ post(state, 32));
4980 __ st1(v24, __ T1D, state);
4981
4982 // restore callee-saved registers
4983 __ ldpd(v14, v15, Address(sp, 48));
4984 __ ldpd(v12, v13, Address(sp, 32));
4985 __ ldpd(v10, v11, Address(sp, 16));
4986 __ ldpd(v8, v9, __ post(sp, 64));
4987
4988 __ ret(lr);
4989
4990 // record the stub entry and end
4991 store_archive_data(stub_id, start, __ pc());
4992
4993 return start;
4994 }
4995
4996 // Inputs:
4997 // c_rarg0 - long[] state0
4998 // c_rarg1 - long[] state1
4999 address generate_double_keccak() {
5000 StubId stub_id = StubId::stubgen_double_keccak_id;
5001 int entry_count = StubInfo::entry_count(stub_id);
5002 assert(entry_count == 1, "sanity check");
5003 address start = load_archive_data(stub_id);
5004 if (start != nullptr) {
5005 return start;
5006 }
5007 // Implements the double_keccak() method of the
5008 // sun.security.provider.SHA3Parallel class
5009 __ align(CodeEntryAlignment);
5010 StubCodeMark mark(this, stub_id);
5011 start = __ pc();
5012 __ enter();
5013
5014 Register state0 = c_rarg0;
5015 Register state1 = c_rarg1;
5016
5017 Label rounds24_loop;
5018
5019 // save callee-saved registers
5020 __ stpd(v8, v9, __ pre(sp, -64));
5021 __ stpd(v10, v11, Address(sp, 16));
5022 __ stpd(v12, v13, Address(sp, 32));
5023 __ stpd(v14, v15, Address(sp, 48));
5024
5025 // load states
5026 __ add(rscratch1, state0, 32);
5027 __ ld4(v0, v1, v2, v3, __ D, 0, state0);
5028 __ ld4(v4, v5, v6, v7, __ D, 0, __ post(rscratch1, 32));
5029 __ ld4(v8, v9, v10, v11, __ D, 0, __ post(rscratch1, 32));
5030 __ ld4(v12, v13, v14, v15, __ D, 0, __ post(rscratch1, 32));
5031 __ ld4(v16, v17, v18, v19, __ D, 0, __ post(rscratch1, 32));
5032 __ ld4(v20, v21, v22, v23, __ D, 0, __ post(rscratch1, 32));
5033 __ ld1(v24, __ D, 0, rscratch1);
5034 __ add(rscratch1, state1, 32);
5035 __ ld4(v0, v1, v2, v3, __ D, 1, state1);
5036 __ ld4(v4, v5, v6, v7, __ D, 1, __ post(rscratch1, 32));
5037 __ ld4(v8, v9, v10, v11, __ D, 1, __ post(rscratch1, 32));
5038 __ ld4(v12, v13, v14, v15, __ D, 1, __ post(rscratch1, 32));
5039 __ ld4(v16, v17, v18, v19, __ D, 1, __ post(rscratch1, 32));
5040 __ ld4(v20, v21, v22, v23, __ D, 1, __ post(rscratch1, 32));
5041 __ ld1(v24, __ D, 1, rscratch1);
5042
5043 // 24 keccak rounds
5044 __ movw(rscratch2, 24);
5045
5046 // load round_constants base
5047 __ lea(rscratch1, ExternalAddress((address) _double_keccak_round_consts));
5048
5049 __ BIND(rounds24_loop);
5050 __ subw(rscratch2, rscratch2, 1);
5051 keccak_round(rscratch1);
5052 __ cbnzw(rscratch2, rounds24_loop);
5053
5054 __ st4(v0, v1, v2, v3, __ D, 0, __ post(state0, 32));
5055 __ st4(v4, v5, v6, v7, __ D, 0, __ post(state0, 32));
5056 __ st4(v8, v9, v10, v11, __ D, 0, __ post(state0, 32));
5057 __ st4(v12, v13, v14, v15, __ D, 0, __ post(state0, 32));
5058 __ st4(v16, v17, v18, v19, __ D, 0, __ post(state0, 32));
5059 __ st4(v20, v21, v22, v23, __ D, 0, __ post(state0, 32));
5060 __ st1(v24, __ D, 0, state0);
5061 __ st4(v0, v1, v2, v3, __ D, 1, __ post(state1, 32));
5062 __ st4(v4, v5, v6, v7, __ D, 1, __ post(state1, 32));
5063 __ st4(v8, v9, v10, v11, __ D, 1, __ post(state1, 32));
5064 __ st4(v12, v13, v14, v15, __ D, 1, __ post(state1, 32));
5065 __ st4(v16, v17, v18, v19, __ D, 1, __ post(state1, 32));
5066 __ st4(v20, v21, v22, v23, __ D, 1, __ post(state1, 32));
5067 __ st1(v24, __ D, 1, state1);
5068
5069 // restore callee-saved vector registers
5070 __ ldpd(v14, v15, Address(sp, 48));
5071 __ ldpd(v12, v13, Address(sp, 32));
5072 __ ldpd(v10, v11, Address(sp, 16));
5073 __ ldpd(v8, v9, __ post(sp, 64));
5074
5075 __ leave(); // required for proper stackwalking of RuntimeStub frame
5076
5077 __ mov(r0, zr); // return 0 (Java callees return 1. Caller ignores the return value)
5078 __ ret(lr);
5079
5080 // record the stub entry and end
5081 store_archive_data(stub_id, start, __ pc());
5082
5083 return start;
5084 }
5085
5086 // ChaCha20 block function. This version parallelizes the 32-bit
5087 // state elements on each of 16 vectors, producing 4 blocks of
5088 // keystream at a time.
5089 //
5090 // state (int[16]) = c_rarg0
5091 // keystream (byte[256]) = c_rarg1
5092 // return - number of bytes of produced keystream (always 256)
5093 //
5094 // This implementation takes each 32-bit integer from the state
5095 // array and broadcasts it across all 4 32-bit lanes of a vector register
5096 // (e.g. state[0] is replicated on all 4 lanes of v4, state[1] to all 4 lanes
5097 // of v5, etc.). Once all 16 elements have been broadcast onto 16 vectors,
5098 // the quarter round schedule is implemented as outlined in RFC 7539 section
5099 // 2.3. However, instead of sequentially processing the 3 quarter round
5100 // operations represented by one QUARTERROUND function, we instead stack all
5101 // the adds, xors and left-rotations from the first 4 quarter rounds together
5102 // and then do the same for the second set of 4 quarter rounds. This removes
5103 // some latency that would otherwise be incurred by waiting for an add to
5104 // complete before performing an xor (which depends on the result of the
5105 // add), etc. An adjustment happens between the first and second groups of 4
5106 // quarter rounds, but this is done only in the inputs to the macro functions
5107 // that generate the assembly instructions - these adjustments themselves are
5108 // not part of the resulting assembly.
5109 // The 4 registers v0-v3 are used during the quarter round operations as
5110 // scratch registers. Once the 20 rounds are complete, these 4 scratch
5111 // registers become the vectors involved in adding the start state back onto
5112 // the post-QR working state. After the adds are complete, each of the 16
5113 // vectors write their first lane back to the keystream buffer, followed
5114 // by the second lane from all vectors and so on.
5115 address generate_chacha20Block_blockpar() {
5116 StubId stub_id = StubId::stubgen_chacha20Block_id;
5117 int entry_count = StubInfo::entry_count(stub_id);
5118 assert(entry_count == 1, "sanity check");
5119 address start = load_archive_data(stub_id);
5120 if (start != nullptr) {
5121 return start;
5122 }
5123 Label L_twoRounds, L_cc20_const;
5124 __ align(CodeEntryAlignment);
5125 StubCodeMark mark(this, stub_id);
5126 start = __ pc();
5127 __ enter();
5128
5129 int i, j;
5130 const Register state = c_rarg0;
5131 const Register keystream = c_rarg1;
5132 const Register loopCtr = r10;
5133 const Register tmpAddr = r11;
5134 const FloatRegister ctrAddOverlay = v28;
5135 const FloatRegister lrot8Tbl = v29;
5136
5137 // Organize SIMD registers in an array that facilitates
5138 // putting repetitive opcodes into loop structures. It is
5139 // important that each grouping of 4 registers is monotonically
5140 // increasing to support the requirements of multi-register
5141 // instructions (e.g. ld4r, st4, etc.)
5142 const FloatRegister workSt[16] = {
5143 v4, v5, v6, v7, v16, v17, v18, v19,
5144 v20, v21, v22, v23, v24, v25, v26, v27
5145 };
5146
5147 // Pull in constant data. The first 16 bytes are the add overlay
5148 // which is applied to the vector holding the counter (state[12]).
5149 // The second 16 bytes is the index register for the 8-bit left
5150 // rotation tbl instruction.
5151 __ adr(tmpAddr, L_cc20_const);
5152 __ ldpq(ctrAddOverlay, lrot8Tbl, Address(tmpAddr));
5153
5154 // Load from memory and interlace across 16 SIMD registers,
5155 // With each word from memory being broadcast to all lanes of
5156 // each successive SIMD register.
5157 // Addr(0) -> All lanes in workSt[i]
5158 // Addr(4) -> All lanes workSt[i + 1], etc.
5159 __ mov(tmpAddr, state);
5160 for (i = 0; i < 16; i += 4) {
5161 __ ld4r(workSt[i], workSt[i + 1], workSt[i + 2], workSt[i + 3], __ T4S,
5162 __ post(tmpAddr, 16));
5163 }
5164 __ addv(workSt[12], __ T4S, workSt[12], ctrAddOverlay); // Add ctr overlay
5165
5166 // Before entering the loop, create 5 4-register arrays. These
5167 // will hold the 4 registers that represent the a/b/c/d fields
5168 // in the quarter round operation. For instance the "b" field
5169 // for the first 4 quarter round operations is the set of v16/v17/v18/v19,
5170 // but in the second 4 quarter rounds it gets adjusted to v17/v18/v19/v16
5171 // since it is part of a diagonal organization. The aSet and scratch
5172 // register sets are defined at declaration time because they do not change
5173 // organization at any point during the 20-round processing.
5174 FloatRegister aSet[4] = { v4, v5, v6, v7 };
5175 FloatRegister bSet[4];
5176 FloatRegister cSet[4];
5177 FloatRegister dSet[4];
5178 FloatRegister scratch[4] = { v0, v1, v2, v3 };
5179
5180 // Set up the 10 iteration loop and perform all 8 quarter round ops
5181 __ mov(loopCtr, 10);
5182 __ BIND(L_twoRounds);
5183
5184 // Set to columnar organization and do the following 4 quarter-rounds:
5185 // QUARTERROUND(0, 4, 8, 12)
5186 // QUARTERROUND(1, 5, 9, 13)
5187 // QUARTERROUND(2, 6, 10, 14)
5188 // QUARTERROUND(3, 7, 11, 15)
5189 __ cc20_set_qr_registers(bSet, workSt, 4, 5, 6, 7);
5190 __ cc20_set_qr_registers(cSet, workSt, 8, 9, 10, 11);
5191 __ cc20_set_qr_registers(dSet, workSt, 12, 13, 14, 15);
5192
5193 __ cc20_qr_add4(aSet, bSet); // a += b
5194 __ cc20_qr_xor4(dSet, aSet, dSet); // d ^= a
5195 __ cc20_qr_lrot4(dSet, dSet, 16, lrot8Tbl); // d <<<= 16
5196
5197 __ cc20_qr_add4(cSet, dSet); // c += d
5198 __ cc20_qr_xor4(bSet, cSet, scratch); // b ^= c (scratch)
5199 __ cc20_qr_lrot4(scratch, bSet, 12, lrot8Tbl); // b <<<= 12
5200
5201 __ cc20_qr_add4(aSet, bSet); // a += b
5202 __ cc20_qr_xor4(dSet, aSet, dSet); // d ^= a
5203 __ cc20_qr_lrot4(dSet, dSet, 8, lrot8Tbl); // d <<<= 8
5204
5205 __ cc20_qr_add4(cSet, dSet); // c += d
5206 __ cc20_qr_xor4(bSet, cSet, scratch); // b ^= c (scratch)
5207 __ cc20_qr_lrot4(scratch, bSet, 7, lrot8Tbl); // b <<<= 12
5208
5209 // Set to diagonal organization and do the next 4 quarter-rounds:
5210 // QUARTERROUND(0, 5, 10, 15)
5211 // QUARTERROUND(1, 6, 11, 12)
5212 // QUARTERROUND(2, 7, 8, 13)
5213 // QUARTERROUND(3, 4, 9, 14)
5214 __ cc20_set_qr_registers(bSet, workSt, 5, 6, 7, 4);
5215 __ cc20_set_qr_registers(cSet, workSt, 10, 11, 8, 9);
5216 __ cc20_set_qr_registers(dSet, workSt, 15, 12, 13, 14);
5217
5218 __ cc20_qr_add4(aSet, bSet); // a += b
5219 __ cc20_qr_xor4(dSet, aSet, dSet); // d ^= a
5220 __ cc20_qr_lrot4(dSet, dSet, 16, lrot8Tbl); // d <<<= 16
5221
5222 __ cc20_qr_add4(cSet, dSet); // c += d
5223 __ cc20_qr_xor4(bSet, cSet, scratch); // b ^= c (scratch)
5224 __ cc20_qr_lrot4(scratch, bSet, 12, lrot8Tbl); // b <<<= 12
5225
5226 __ cc20_qr_add4(aSet, bSet); // a += b
5227 __ cc20_qr_xor4(dSet, aSet, dSet); // d ^= a
5228 __ cc20_qr_lrot4(dSet, dSet, 8, lrot8Tbl); // d <<<= 8
5229
5230 __ cc20_qr_add4(cSet, dSet); // c += d
5231 __ cc20_qr_xor4(bSet, cSet, scratch); // b ^= c (scratch)
__ cc20_qr_lrot4(scratch, bSet, 7, lrot8Tbl); // b <<<= 7
5233
5234 // Decrement and iterate
5235 __ sub(loopCtr, loopCtr, 1);
5236 __ cbnz(loopCtr, L_twoRounds);
5237
5238 __ mov(tmpAddr, state);
5239
5240 // Add the starting state back to the post-loop keystream
5241 // state. We read/interlace the state array from memory into
5242 // 4 registers similar to what we did in the beginning. Then
5243 // add the counter overlay onto workSt[12] at the end.
5244 for (i = 0; i < 16; i += 4) {
5245 __ ld4r(v0, v1, v2, v3, __ T4S, __ post(tmpAddr, 16));
5246 __ addv(workSt[i], __ T4S, workSt[i], v0);
5247 __ addv(workSt[i + 1], __ T4S, workSt[i + 1], v1);
5248 __ addv(workSt[i + 2], __ T4S, workSt[i + 2], v2);
5249 __ addv(workSt[i + 3], __ T4S, workSt[i + 3], v3);
5250 }
5251 __ addv(workSt[12], __ T4S, workSt[12], ctrAddOverlay); // Add ctr overlay
5252
5253 // Write working state into the keystream buffer. This is accomplished
5254 // by taking the lane "i" from each of the four vectors and writing
5255 // it to consecutive 4-byte offsets, then post-incrementing by 16 and
5256 // repeating with the next 4 vectors until all 16 vectors have been used.
5257 // Then move to the next lane and repeat the process until all lanes have
5258 // been written.
5259 for (i = 0; i < 4; i++) {
5260 for (j = 0; j < 16; j += 4) {
5261 __ st4(workSt[j], workSt[j + 1], workSt[j + 2], workSt[j + 3], __ S, i,
5262 __ post(keystream, 16));
5263 }
5264 }
5265
5266 __ mov(r0, 256); // Return length of output keystream
5267 __ leave();
5268 __ ret(lr);
5269
5270 // bind label and generate local constant data used by this stub
5271 // The constant data is broken into two 128-bit segments to be loaded
5272 // onto FloatRegisters. The first 128 bits are a counter add overlay
5273 // that adds +0/+1/+2/+3 to the vector holding replicated state[12].
5274 // The second 128-bits is a table constant used for 8-bit left rotations.
5275 __ BIND(L_cc20_const);
5276 __ emit_int64(0x0000000100000000UL);
5277 __ emit_int64(0x0000000300000002UL);
5278 __ emit_int64(0x0605040702010003UL);
5279 __ emit_int64(0x0E0D0C0F0A09080BUL);
5280
5281 // record the stub entry and end
5282 store_archive_data(stub_id, start, __ pc());
5283
5284 return start;
5285 }
5286
5287 // Helpers to schedule parallel operation bundles across vector
5288 // register sequences of size 2, 4 or 8.
5289
5290 // Implement various primitive computations across vector sequences
5291
5292 template<int N>
5293 void vs_addv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
5294 const VSeq<N>& v1, const VSeq<N>& v2) {
5295 // output must not be constant
5296 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
5297 // output cannot overwrite pending inputs
5298 assert(!vs_write_before_read(v, v1), "output overwrites input");
5299 assert(!vs_write_before_read(v, v2), "output overwrites input");
5300 for (int i = 0; i < N; i++) {
5301 __ addv(v[i], T, v1[i], v2[i]);
5302 }
5303 }
5304
5305 template<int N>
5306 void vs_subv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
5307 const VSeq<N>& v1, const VSeq<N>& v2) {
5308 // output must not be constant
5309 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
5310 // output cannot overwrite pending inputs
5311 assert(!vs_write_before_read(v, v1), "output overwrites input");
5312 assert(!vs_write_before_read(v, v2), "output overwrites input");
5313 for (int i = 0; i < N; i++) {
5314 __ subv(v[i], T, v1[i], v2[i]);
5315 }
5316 }
5317
5318 template<int N>
5319 void vs_mulv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
5320 const VSeq<N>& v1, const VSeq<N>& v2) {
5321 // output must not be constant
5322 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
5323 // output cannot overwrite pending inputs
5324 assert(!vs_write_before_read(v, v1), "output overwrites input");
5325 assert(!vs_write_before_read(v, v2), "output overwrites input");
5326 for (int i = 0; i < N; i++) {
5327 __ mulv(v[i], T, v1[i], v2[i]);
5328 }
5329 }
5330
5331 template<int N>
5332 void vs_negr(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1) {
5333 // output must not be constant
5334 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
5335 // output cannot overwrite pending inputs
5336 assert(!vs_write_before_read(v, v1), "output overwrites input");
5337 for (int i = 0; i < N; i++) {
5338 __ negr(v[i], T, v1[i]);
5339 }
5340 }
5341
5342 template<int N>
5343 void vs_shl(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
5344 const VSeq<N>& v1, int shift) {
5345 // output must not be constant
5346 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
5347 // output cannot overwrite pending inputs
5348 assert(!vs_write_before_read(v, v1), "output overwrites input");
5349
5350 for (int i = 0; i < N; i++) {
5351 __ shl(v[i], T, v1[i], shift);
5352 }
5353 }
5354
5355 template<int N>
5356 void vs_ushr(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
5357 const VSeq<N>& v1, int shift) {
5358 // output must not be constant
5359 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
5360 // output cannot overwrite pending inputs
5361 assert(!vs_write_before_read(v, v1), "output overwrites input");
5362
5363 for (int i = 0; i < N; i++) {
5364 __ ushr(v[i], T, v1[i], shift);
5365 }
5366 }
5367
5368 template<int N>
5369 void vs_sshr(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
5370 const VSeq<N>& v1, int shift) {
5371 // output must not be constant
5372 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
5373 // output cannot overwrite pending inputs
5374 assert(!vs_write_before_read(v, v1), "output overwrites input");
5375 for (int i = 0; i < N; i++) {
5376 __ sshr(v[i], T, v1[i], shift);
5377 }
5378 }
5379
5380 template<int N>
5381 void vs_andr(const VSeq<N>& v, const VSeq<N>& v1, const VSeq<N>& v2) {
5382 // output must not be constant
5383 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
5384 // output cannot overwrite pending inputs
5385 assert(!vs_write_before_read(v, v1), "output overwrites input");
5386 assert(!vs_write_before_read(v, v2), "output overwrites input");
5387 for (int i = 0; i < N; i++) {
5388 __ andr(v[i], __ T16B, v1[i], v2[i]);
5389 }
5390 }
5391
5392 template<int N>
5393 void vs_andr(const VSeq<N>& v, const VSeq<N>& v1, const FloatRegister v2) {
5394 // output must not be constant
5395 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
5396 // output cannot overwrite pending inputs
5397 assert(!vs_write_before_read(v, v1), "output overwrites input");
5398 for (int i = 0; i < N; i++) {
5399 __ andr(v[i], __ T16B, v1[i], v2);
5400 }
5401 }
5402
5403 template<int N>
5404 void vs_eor(const VSeq<N>& v, const VSeq<N>& v1, const VSeq<N>& v2) {
5405 // output must not be constant
5406 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
5407 // output cannot overwrite pending inputs
5408 assert(!vs_write_before_read(v, v1), "output overwrites input");
5409 assert(!vs_write_before_read(v, v2), "output overwrites input");
5410 for (int i = 0; i < N; i++) {
5411 __ eor(v[i], __ T16B, v1[i], v2[i]);
5412 }
5413 }
5414
5415 template<int N>
5416 void vs_orr(const VSeq<N>& v, const VSeq<N>& v1, const VSeq<N>& v2) {
5417 // output must not be constant
5418 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
5419 // output cannot overwrite pending inputs
5420 assert(!vs_write_before_read(v, v1), "output overwrites input");
5421 assert(!vs_write_before_read(v, v2), "output overwrites input");
5422 for (int i = 0; i < N; i++) {
5423 __ orr(v[i], __ T16B, v1[i], v2[i]);
5424 }
5425 }
5426
5427 template<int N>
5428 void vs_notr(const VSeq<N>& v, const VSeq<N>& v1) {
5429 // output must not be constant
5430 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
5431 // output cannot overwrite pending inputs
5432 assert(!vs_write_before_read(v, v1), "output overwrites input");
5433 for (int i = 0; i < N; i++) {
5434 __ notr(v[i], __ T16B, v1[i]);
5435 }
5436 }
5437
5438 template<int N>
5439 void vs_sqdmulh(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1, const VSeq<N>& v2) {
5440 // output must not be constant
5441 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
5442 // output cannot overwrite pending inputs
5443 assert(!vs_write_before_read(v, v1), "output overwrites input");
5444 assert(!vs_write_before_read(v, v2), "output overwrites input");
5445 for (int i = 0; i < N; i++) {
5446 __ sqdmulh(v[i], T, v1[i], v2[i]);
5447 }
5448 }
5449
5450 template<int N>
5451 void vs_mlsv(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1, VSeq<N>& v2) {
5452 // output must not be constant
5453 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
5454 // output cannot overwrite pending inputs
5455 assert(!vs_write_before_read(v, v1), "output overwrites input");
5456 assert(!vs_write_before_read(v, v2), "output overwrites input");
5457 for (int i = 0; i < N; i++) {
5458 __ mlsv(v[i], T, v1[i], v2[i]);
5459 }
5460 }
5461
5462 // load N/2 successive pairs of quadword values from memory in order
5463 // into N successive vector registers of the sequence via the
5464 // address supplied in base.
5465 template<int N>
5466 void vs_ldpq(const VSeq<N>& v, Register base) {
5467 static_assert(N > 0 && is_even(N), "sequence length must be even");
5468 for (int i = 0; i < N; i += 2) {
5469 __ ldpq(v[i], v[i+1], Address(base, 16 * i));
5470 }
5471 }
5472
5473 // load N/2 successive pairs of quadword values from memory in order
5474 // into N vector registers of the sequence via the address supplied
5475 // in base using post-increment addressing
5476 template<int N>
5477 void vs_ldpq_post(const VSeq<N>& v, Register base) {
5478 static_assert(N > 0 && is_even(N), "sequence length must be even");
5479 for (int i = 0; i < N; i += 2) {
5480 __ ldpq(v[i], v[i+1], __ post(base, 32));
5481 }
5482 }
5483
5484 // store N successive vector registers of the sequence into N/2
5485 // successive pairs of quadword memory locations via the address
5486 // supplied in base using post-increment addressing
5487 template<int N>
5488 void vs_stpq_post(const VSeq<N>& v, Register base) {
5489 static_assert(N > 0 && is_even(N), "sequence length must be even");
5490 for (int i = 0; i < N; i += 2) {
5491 __ stpq(v[i], v[i+1], __ post(base, 32));
5492 }
5493 }
5494
5495 // load N/2 pairs of quadword values from memory de-interleaved into
5496 // N vector registers 2 at a time via the address supplied in base
5497 // using post-increment addressing.
5498 template<int N>
5499 void vs_ld2_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
5500 static_assert(N > 0 && is_even(N), "sequence length must be even");
5501 for (int i = 0; i < N; i += 2) {
5502 __ ld2(v[i], v[i+1], T, __ post(base, 32));
5503 }
5504 }
5505
5506 // store N vector registers interleaved into N/2 pairs of quadword
5507 // memory locations via the address supplied in base using
5508 // post-increment addressing.
5509 template<int N>
5510 void vs_st2_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
5511 static_assert(N > 0 && is_even(N), "sequence length must be even");
5512 for (int i = 0; i < N; i += 2) {
5513 __ st2(v[i], v[i+1], T, __ post(base, 32));
5514 }
5515 }
5516
5517 // store two vector register sequences of length N
5518 // interleaved into N pairs of quadword memory locations
5519 // starting at the address supplied in dest using
5520 // post-increment addressing.
5521 template<int N>
5522 void vs_st1_interleaved(VSeq<N> A, VSeq<N> B, Register dest) {
5523 for (int i = 0; i < N; i++) {
5524 __ st1(A[i], __ T2D, __ post(dest, 16));
5525 __ st1(B[i], __ T2D, __ post(dest, 16));
5526 }
5527 }
5528
5529 // load N quadword values from memory de-interleaved into N vector
5530 // registers 3 elements at a time via the address supplied in base.
5531 template<int N>
5532 void vs_ld3(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
5533 static_assert(N == ((N / 3) * 3), "sequence length must be multiple of 3");
5534 for (int i = 0; i < N; i += 3) {
5535 __ ld3(v[i], v[i+1], v[i+2], T, base);
5536 }
5537 }
5538
5539 // load N quadword values from memory de-interleaved into N vector
5540 // registers 3 elements at a time via the address supplied in base
5541 // using post-increment addressing.
5542 template<int N>
5543 void vs_ld3_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
5544 static_assert(N == ((N / 3) * 3), "sequence length must be multiple of 3");
5545 for (int i = 0; i < N; i += 3) {
5546 __ ld3(v[i], v[i+1], v[i+2], T, __ post(base, 48));
5547 }
5548 }
5549
5550 // load N/2 pairs of quadword values from memory into N vector
5551 // registers via the address supplied in base with each pair indexed
5552 // using the start offset plus the corresponding entry in the
5553 // offsets array
5554 template<int N>
5555 void vs_ldpq_indexed(const VSeq<N>& v, Register base, int start, int (&offsets)[N/2]) {
5556 static_assert(N > 0 && is_even(N), "sequence length must be even");
5557 for (int i = 0; i < N/2; i++) {
5558 __ ldpq(v[2*i], v[2*i+1], Address(base, start + offsets[i]));
5559 }
5560 }
5561
5562 // store N vector registers into N/2 pairs of quadword memory
5563 // locations via the address supplied in base with each pair indexed
5564 // using the start offset plus the corresponding entry in the
5565 // offsets array
5566 template<int N>
5567 void vs_stpq_indexed(const VSeq<N>& v, Register base, int start, int offsets[N/2]) {
5568 for (int i = 0; i < N/2; i++) {
5569 __ stpq(v[2*i], v[2*i+1], Address(base, start + offsets[i]));
5570 }
5571 }
5572
5573 // load N single quadword values from memory into N vector registers
5574 // via the address supplied in base with each value indexed using
5575 // the start offset plus the corresponding entry in the offsets
5576 // array
5577 template<int N>
5578 void vs_ldr_indexed(const VSeq<N>& v, Assembler::SIMD_RegVariant T, Register base,
5579 int start, int (&offsets)[N]) {
5580 for (int i = 0; i < N; i++) {
5581 __ ldr(v[i], T, Address(base, start + offsets[i]));
5582 }
5583 }
5584
5585 // store N vector registers into N single quadword memory locations
5586 // via the address supplied in base with each value indexed using
5587 // the start offset plus the corresponding entry in the offsets
5588 // array
5589 template<int N>
5590 void vs_str_indexed(const VSeq<N>& v, Assembler::SIMD_RegVariant T, Register base,
5591 int start, int (&offsets)[N]) {
5592 for (int i = 0; i < N; i++) {
5593 __ str(v[i], T, Address(base, start + offsets[i]));
5594 }
5595 }
5596
5597 // load N/2 pairs of quadword values from memory de-interleaved into
5598 // N vector registers 2 at a time via the address supplied in base
5599 // with each pair indexed using the start offset plus the
5600 // corresponding entry in the offsets array
5601 template<int N>
5602 void vs_ld2_indexed(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base,
5603 Register tmp, int start, int (&offsets)[N/2]) {
5604 static_assert(N > 0 && is_even(N), "sequence length must be even");
5605 for (int i = 0; i < N/2; i++) {
5606 __ add(tmp, base, start + offsets[i]);
5607 __ ld2(v[2*i], v[2*i+1], T, tmp);
5608 }
5609 }
5610
5611 // store N vector registers 2 at a time interleaved into N/2 pairs
5612 // of quadword memory locations via the address supplied in base
5613 // with each pair indexed using the start offset plus the
5614 // corresponding entry in the offsets array
5615 template<int N>
5616 void vs_st2_indexed(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base,
5617 Register tmp, int start, int (&offsets)[N/2]) {
5618 static_assert(N > 0 && is_even(N), "sequence length must be even");
5619 for (int i = 0; i < N/2; i++) {
5620 __ add(tmp, base, start + offsets[i]);
5621 __ st2(v[2*i], v[2*i+1], T, tmp);
5622 }
5623 }
5624
5625 // Helper routines for various flavours of Montgomery multiply
5626
5627 // Perform 16 32-bit (4x4S) or 32 16-bit (4 x 8H) Montgomery
5628 // multiplications in parallel
5629 //
5630
5631 // See the montMul() method of the sun.security.provider.ML_DSA
5632 // class.
5633 //
5634 // Computes 4x4S results or 8x8H results
5635 // a = b * c * 2^MONT_R_BITS mod MONT_Q
5636 // Inputs: vb, vc - 4x4S or 4x8H vector register sequences
5637 // vq - 2x4S or 2x8H constants <MONT_Q, MONT_Q_INV_MOD_R>
5638 // Temps: vtmp - 4x4S or 4x8H vector sequence trashed after call
5639 // Outputs: va - 4x4S or 4x8H vector register sequences
5640 // vb, vc, vtmp and vq must all be disjoint
5641 // va must be disjoint from all other inputs/temps or must equal vc
5642 // va must have a non-zero delta i.e. it must not be a constant vseq.
5643 // n.b. MONT_R_BITS is 16 or 32, so the right shift by it is implicit.
5644 void vs_montmul4(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
5645 Assembler::SIMD_Arrangement T,
5646 const VSeq<4>& vtmp, const VSeq<2>& vq) {
5647 assert (T == __ T4S || T == __ T8H, "invalid arrangement for montmul");
5648 assert(vs_disjoint(vb, vc), "vb and vc overlap");
5649 assert(vs_disjoint(vb, vq), "vb and vq overlap");
5650 assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
5651
5652 assert(vs_disjoint(vc, vq), "vc and vq overlap");
5653 assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
5654
5655 assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
5656
5657 assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
5658 assert(vs_disjoint(va, vb), "va and vb overlap");
5659 assert(vs_disjoint(va, vq), "va and vq overlap");
5660 assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
5661 assert(!va.is_constant(), "output vector must identify 4 different registers");
5662
5663 // schedule 4 streams of instructions across the vector sequences
5664 for (int i = 0; i < 4; i++) {
5665 __ sqdmulh(vtmp[i], T, vb[i], vc[i]); // aHigh = hi32(2 * b * c)
5666 __ mulv(va[i], T, vb[i], vc[i]); // aLow = lo32(b * c)
5667 }
5668
5669 for (int i = 0; i < 4; i++) {
5670 __ mulv(va[i], T, va[i], vq[0]); // m = aLow * qinv
5671 }
5672
5673 for (int i = 0; i < 4; i++) {
5674 __ sqdmulh(va[i], T, va[i], vq[1]); // n = hi32(2 * m * q)
5675 }
5676
5677 for (int i = 0; i < 4; i++) {
5678 __ shsubv(va[i], T, vtmp[i], va[i]); // a = (aHigh - n) / 2
5679 }
5680 }
5681
5682 // Perform 8 32-bit (4x4S) or 16 16-bit (2 x 8H) Montgomery
5683 // multiplications in parallel
5684 //
5685
5686 // See the montMul() method of the sun.security.provider.ML_DSA
5687 // class.
5688 //
5689 // Computes 4x4S results or 8x8H results
5690 // a = b * c * 2^MONT_R_BITS mod MONT_Q
5691 // Inputs: vb, vc - 4x4S or 4x8H vector register sequences
5692 // vq - 2x4S or 2x8H constants <MONT_Q, MONT_Q_INV_MOD_R>
5693 // Temps: vtmp - 4x4S or 4x8H vector sequence trashed after call
5694 // Outputs: va - 4x4S or 4x8H vector register sequences
5695 // vb, vc, vtmp and vq must all be disjoint
5696 // va must be disjoint from all other inputs/temps or must equal vc
5697 // va must have a non-zero delta i.e. it must not be a constant vseq.
5698 // n.b. MONT_R_BITS is 16 or 32, so the right shift by it is implicit.
5699 void vs_montmul2(const VSeq<2>& va, const VSeq<2>& vb, const VSeq<2>& vc,
5700 Assembler::SIMD_Arrangement T,
5701 const VSeq<2>& vtmp, const VSeq<2>& vq) {
5702 assert (T == __ T4S || T == __ T8H, "invalid arrangement for montmul");
5703 assert(vs_disjoint(vb, vc), "vb and vc overlap");
5704 assert(vs_disjoint(vb, vq), "vb and vq overlap");
5705 assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
5706
5707 assert(vs_disjoint(vc, vq), "vc and vq overlap");
5708 assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
5709
5710 assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
5711
5712 assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
5713 assert(vs_disjoint(va, vb), "va and vb overlap");
5714 assert(vs_disjoint(va, vq), "va and vq overlap");
5715 assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
5716 assert(!va.is_constant(), "output vector must identify 2 different registers");
5717
5718 // schedule 2 streams of instructions across the vector sequences
5719 for (int i = 0; i < 2; i++) {
5720 __ sqdmulh(vtmp[i], T, vb[i], vc[i]); // aHigh = hi32(2 * b * c)
5721 __ mulv(va[i], T, vb[i], vc[i]); // aLow = lo32(b * c)
5722 }
5723
5724 for (int i = 0; i < 2; i++) {
5725 __ mulv(va[i], T, va[i], vq[0]); // m = aLow * qinv
5726 }
5727
5728 for (int i = 0; i < 2; i++) {
5729 __ sqdmulh(va[i], T, va[i], vq[1]); // n = hi32(2 * m * q)
5730 }
5731
5732 for (int i = 0; i < 2; i++) {
5733 __ shsubv(va[i], T, vtmp[i], va[i]); // a = (aHigh - n) / 2
5734 }
5735 }
5736
5737 // Perform 16 16-bit Montgomery multiplications in parallel.
5738 void kyber_montmul16(const VSeq<2>& va, const VSeq<2>& vb, const VSeq<2>& vc,
5739 const VSeq<2>& vtmp, const VSeq<2>& vq) {
5740 // Use the helper routine to schedule a 2x8H Montgomery multiply.
5741 // It will assert that the register use is valid
5742 vs_montmul2(va, vb, vc, __ T8H, vtmp, vq);
5743 }
5744
5745 // Perform 32 16-bit Montgomery multiplications in parallel.
5746 void kyber_montmul32(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
5747 const VSeq<4>& vtmp, const VSeq<2>& vq) {
5748 // Use the helper routine to schedule a 4x8H Montgomery multiply.
5749 // It will assert that the register use is valid
5750 vs_montmul4(va, vb, vc, __ T8H, vtmp, vq);
5751 }
5752
5753 // Perform 64 16-bit Montgomery multiplications in parallel.
5754 void kyber_montmul64(const VSeq<8>& va, const VSeq<8>& vb, const VSeq<8>& vc,
5755 const VSeq<4>& vtmp, const VSeq<2>& vq) {
5756 // Schedule two successive 4x8H multiplies via the montmul helper
5757 // on the front and back halves of va, vb and vc. The helper will
5758 // assert that the register use has no overlap conflicts on each
5759 // individual call but we also need to ensure that the necessary
5760 // disjoint/equality constraints are met across both calls.
5761
5762 // vb, vc, vtmp and vq must be disjoint. va must either be
5763 // disjoint from all other registers or equal vc
5764
5765 assert(vs_disjoint(vb, vc), "vb and vc overlap");
5766 assert(vs_disjoint(vb, vq), "vb and vq overlap");
5767 assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
5768
5769 assert(vs_disjoint(vc, vq), "vc and vq overlap");
5770 assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
5771
5772 assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
5773
5774 assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
5775 assert(vs_disjoint(va, vb), "va and vb overlap");
5776 assert(vs_disjoint(va, vq), "va and vq overlap");
5777 assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
5778
5779 // we multiply the front and back halves of each sequence 4 at a
5780 // time because
5781 //
5782 // 1) we are currently only able to get 4-way instruction
5783 // parallelism at best
5784 //
5785 // 2) we need registers for the constants in vq and temporary
5786 // scratch registers to hold intermediate results so vtmp can only
5787 // be a VSeq<4> which means we only have 4 scratch slots
5788
5789 vs_montmul4(vs_front(va), vs_front(vb), vs_front(vc), __ T8H, vtmp, vq);
5790 vs_montmul4(vs_back(va), vs_back(vb), vs_back(vc), __ T8H, vtmp, vq);
5791 }
5792
5793 void kyber_montmul32_sub_add(const VSeq<4>& va0, const VSeq<4>& va1,
5794 const VSeq<4>& vc,
5795 const VSeq<4>& vtmp,
5796 const VSeq<2>& vq) {
5797 // compute a = montmul(a1, c)
5798 kyber_montmul32(vc, va1, vc, vtmp, vq);
5799 // ouptut a1 = a0 - a
5800 vs_subv(va1, __ T8H, va0, vc);
5801 // and a0 = a0 + a
5802 vs_addv(va0, __ T8H, va0, vc);
5803 }
5804
5805 void kyber_sub_add_montmul32(const VSeq<4>& va0, const VSeq<4>& va1,
5806 const VSeq<4>& vb,
5807 const VSeq<4>& vtmp1,
5808 const VSeq<4>& vtmp2,
5809 const VSeq<2>& vq) {
5810 // compute c = a0 - a1
5811 vs_subv(vtmp1, __ T8H, va0, va1);
5812 // output a0 = a0 + a1
5813 vs_addv(va0, __ T8H, va0, va1);
5814 // output a1 = b montmul c
5815 kyber_montmul32(va1, vtmp1, vb, vtmp2, vq);
5816 }
5817
5818 void load64shorts(const VSeq<8>& v, Register shorts) {
5819 vs_ldpq_post(v, shorts);
5820 }
5821
5822 void load32shorts(const VSeq<4>& v, Register shorts) {
5823 vs_ldpq_post(v, shorts);
5824 }
5825
5826 void store64shorts(VSeq<8> v, Register tmpAddr) {
5827 vs_stpq_post(v, tmpAddr);
5828 }
5829
5830 // Kyber NTT function.
5831 // Implements
5832 // static int implKyberNtt(short[] poly, short[] ntt_zetas) {}
5833 //
5834 // coeffs (short[256]) = c_rarg0
5835 // ntt_zetas (short[256]) = c_rarg1
5836 address generate_kyberNtt() {
5837 StubId stub_id = StubId::stubgen_kyberNtt_id;
5838 int entry_count = StubInfo::entry_count(stub_id);
5839 assert(entry_count == 1, "sanity check");
5840 address start = load_archive_data(stub_id);
5841 if (start != nullptr) {
5842 return start;
5843 }
5844 __ align(CodeEntryAlignment);
5845 StubCodeMark mark(this, stub_id);
5846 start = __ pc();
5847 __ enter();
5848
5849 const Register coeffs = c_rarg0;
5850 const Register zetas = c_rarg1;
5851
5852 const Register kyberConsts = r10;
5853 const Register tmpAddr = r11;
5854
5855 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x8H inputs/outputs
5856 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
5857 VSeq<2> vq(30); // n.b. constants overlap vs3
5858
5859 __ lea(kyberConsts, ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
5860 // load the montmul constants
5861 vs_ldpq(vq, kyberConsts);
5862
5863 // Each level corresponds to an iteration of the outermost loop of the
5864 // Java method seilerNTT(int[] coeffs). There are some differences
5865 // from what is done in the seilerNTT() method, though:
5866 // 1. The computation is using 16-bit signed values, we do not convert them
5867 // to ints here.
5868 // 2. The zetas are delivered in a bigger array, 128 zetas are stored in
5869 // this array for each level, it is easier that way to fill up the vector
5870 // registers.
5871 // 3. In the seilerNTT() method we use R = 2^20 for the Montgomery
5872 // multiplications (this is because that way there should not be any
5873 // overflow during the inverse NTT computation), here we use R = 2^16 so
5874 // that we can use the 16-bit arithmetic in the vector unit.
5875 //
5876 // On each level, we fill up the vector registers in such a way that the
5877 // array elements that need to be multiplied by the zetas go into one
5878 // set of vector registers while the corresponding ones that don't need to
5879 // be multiplied, go into another set.
5880 // We can do 32 Montgomery multiplications in parallel, using 12 vector
5881 // registers interleaving the steps of 4 identical computations,
5882 // each done on 8 16-bit values per register.
5883
5884 // At levels 0-3 the coefficients multiplied by or added/subtracted
5885 // to the zetas occur in discrete blocks whose size is some multiple
5886 // of 32.
5887
5888 // level 0
5889 __ add(tmpAddr, coeffs, 256);
5890 load64shorts(vs1, tmpAddr);
5891 load64shorts(vs2, zetas);
5892 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5893 __ add(tmpAddr, coeffs, 0);
5894 load64shorts(vs1, tmpAddr);
5895 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5896 vs_addv(vs1, __ T8H, vs1, vs2);
5897 __ add(tmpAddr, coeffs, 0);
5898 vs_stpq_post(vs1, tmpAddr);
5899 __ add(tmpAddr, coeffs, 256);
5900 vs_stpq_post(vs3, tmpAddr);
5901 // restore montmul constants
5902 vs_ldpq(vq, kyberConsts);
5903 load64shorts(vs1, tmpAddr);
5904 load64shorts(vs2, zetas);
5905 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5906 __ add(tmpAddr, coeffs, 128);
5907 load64shorts(vs1, tmpAddr);
5908 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5909 vs_addv(vs1, __ T8H, vs1, vs2);
5910 __ add(tmpAddr, coeffs, 128);
5911 store64shorts(vs1, tmpAddr);
5912 __ add(tmpAddr, coeffs, 384);
5913 store64shorts(vs3, tmpAddr);
5914
5915 // level 1
5916 // restore montmul constants
5917 vs_ldpq(vq, kyberConsts);
5918 __ add(tmpAddr, coeffs, 128);
5919 load64shorts(vs1, tmpAddr);
5920 load64shorts(vs2, zetas);
5921 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5922 __ add(tmpAddr, coeffs, 0);
5923 load64shorts(vs1, tmpAddr);
5924 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5925 vs_addv(vs1, __ T8H, vs1, vs2);
5926 __ add(tmpAddr, coeffs, 0);
5927 store64shorts(vs1, tmpAddr);
5928 store64shorts(vs3, tmpAddr);
5929 vs_ldpq(vq, kyberConsts);
5930 __ add(tmpAddr, coeffs, 384);
5931 load64shorts(vs1, tmpAddr);
5932 load64shorts(vs2, zetas);
5933 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5934 __ add(tmpAddr, coeffs, 256);
5935 load64shorts(vs1, tmpAddr);
5936 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5937 vs_addv(vs1, __ T8H, vs1, vs2);
5938 __ add(tmpAddr, coeffs, 256);
5939 store64shorts(vs1, tmpAddr);
5940 store64shorts(vs3, tmpAddr);
5941
5942 // level 2
5943 vs_ldpq(vq, kyberConsts);
5944 int offsets1[4] = { 0, 32, 128, 160 };
5945 vs_ldpq_indexed(vs1, coeffs, 64, offsets1);
5946 load64shorts(vs2, zetas);
5947 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5948 vs_ldpq_indexed(vs1, coeffs, 0, offsets1);
5949 // kyber_subv_addv64();
5950 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5951 vs_addv(vs1, __ T8H, vs1, vs2);
5952 __ add(tmpAddr, coeffs, 0);
5953 vs_stpq_post(vs_front(vs1), tmpAddr);
5954 vs_stpq_post(vs_front(vs3), tmpAddr);
5955 vs_stpq_post(vs_back(vs1), tmpAddr);
5956 vs_stpq_post(vs_back(vs3), tmpAddr);
5957 vs_ldpq(vq, kyberConsts);
5958 vs_ldpq_indexed(vs1, tmpAddr, 64, offsets1);
5959 load64shorts(vs2, zetas);
5960 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5961 vs_ldpq_indexed(vs1, coeffs, 256, offsets1);
5962 // kyber_subv_addv64();
5963 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5964 vs_addv(vs1, __ T8H, vs1, vs2);
5965 __ add(tmpAddr, coeffs, 256);
5966 vs_stpq_post(vs_front(vs1), tmpAddr);
5967 vs_stpq_post(vs_front(vs3), tmpAddr);
5968 vs_stpq_post(vs_back(vs1), tmpAddr);
5969 vs_stpq_post(vs_back(vs3), tmpAddr);
5970
5971 // level 3
5972 vs_ldpq(vq, kyberConsts);
5973 int offsets2[4] = { 0, 64, 128, 192 };
5974 vs_ldpq_indexed(vs1, coeffs, 32, offsets2);
5975 load64shorts(vs2, zetas);
5976 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5977 vs_ldpq_indexed(vs1, coeffs, 0, offsets2);
5978 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5979 vs_addv(vs1, __ T8H, vs1, vs2);
5980 vs_stpq_indexed(vs1, coeffs, 0, offsets2);
5981 vs_stpq_indexed(vs3, coeffs, 32, offsets2);
5982
5983 vs_ldpq(vq, kyberConsts);
5984 vs_ldpq_indexed(vs1, coeffs, 256 + 32, offsets2);
5985 load64shorts(vs2, zetas);
5986 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5987 vs_ldpq_indexed(vs1, coeffs, 256, offsets2);
5988 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5989 vs_addv(vs1, __ T8H, vs1, vs2);
5990 vs_stpq_indexed(vs1, coeffs, 256, offsets2);
5991 vs_stpq_indexed(vs3, coeffs, 256 + 32, offsets2);
5992
5993 // level 4
5994 // At level 4 coefficients occur in 8 discrete blocks of size 16
5995 // so they are loaded by employing an ldr at 8 distinct offsets.
5996
5997 vs_ldpq(vq, kyberConsts);
5998 int offsets3[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
5999 vs_ldr_indexed(vs1, __ Q, coeffs, 16, offsets3);
6000 load64shorts(vs2, zetas);
6001 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
6002 vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
6003 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
6004 vs_addv(vs1, __ T8H, vs1, vs2);
6005 vs_str_indexed(vs1, __ Q, coeffs, 0, offsets3);
6006 vs_str_indexed(vs3, __ Q, coeffs, 16, offsets3);
6007
6008 vs_ldpq(vq, kyberConsts);
6009 vs_ldr_indexed(vs1, __ Q, coeffs, 256 + 16, offsets3);
6010 load64shorts(vs2, zetas);
6011 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
6012 vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3);
6013 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
6014 vs_addv(vs1, __ T8H, vs1, vs2);
6015 vs_str_indexed(vs1, __ Q, coeffs, 256, offsets3);
6016 vs_str_indexed(vs3, __ Q, coeffs, 256 + 16, offsets3);
6017
6018 // level 5
6019 // At level 5 related coefficients occur in discrete blocks of size 8 so
6020 // need to be loaded interleaved using an ld2 operation with arrangement 2D.
6021
6022 vs_ldpq(vq, kyberConsts);
6023 int offsets4[4] = { 0, 32, 64, 96 };
6024 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
6025 load32shorts(vs_front(vs2), zetas);
6026 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
6027 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
6028 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
6029 load32shorts(vs_front(vs2), zetas);
6030 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
6031 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
6032 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
6033 load32shorts(vs_front(vs2), zetas);
6034 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
6035 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
6036
6037 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
6038 load32shorts(vs_front(vs2), zetas);
6039 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
6040 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
6041
6042 // level 6
6043 // At level 6 related coefficients occur in discrete blocks of size 4 so
6044 // need to be loaded interleaved using an ld2 operation with arrangement 4S.
6045
6046 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
6047 load32shorts(vs_front(vs2), zetas);
6048 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
6049 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
6050 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
6051 load32shorts(vs_front(vs2), zetas);
6052 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
6053 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
6054
6055 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
6056 load32shorts(vs_front(vs2), zetas);
6057 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
6058 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
6059
6060 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
6061 load32shorts(vs_front(vs2), zetas);
6062 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
6063 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
6064
6065 __ leave(); // required for proper stackwalking of RuntimeStub frame
6066 __ mov(r0, zr); // return 0 (Java callees return 1. Caller ignores the return value)
6067 __ ret(lr);
6068
6069 // record the stub entry and end
6070 store_archive_data(stub_id, start, __ pc());
6071
6072 return start;
6073 }
6074
6075 // Kyber Inverse NTT function (7 butterfly levels, 0..6, applied in place)
6076 // Implements
6077 // static int implKyberInverseNtt(short[] poly, short[] zetas) {}
6078 // The generated stub always returns 0 in r0.
6079 // coeffs (short[256]) = c_rarg0
6080 // ntt_zetas (short[256]) = c_rarg1
6081 address generate_kyberInverseNtt() {
6082 StubId stub_id = StubId::stubgen_kyberInverseNtt_id;
6083 int entry_count = StubInfo::entry_count(stub_id);
6084 assert(entry_count == 1, "sanity check");
6085 address start = load_archive_data(stub_id);
6086 if (start != nullptr) {
6087 return start; // a non-null result is used as the entry; no code is generated
6088 }
6089 __ align(CodeEntryAlignment);
6090 StubCodeMark mark(this, stub_id);
6091 start = __ pc();
6092 __ enter();
6093
6094 const Register coeffs = c_rarg0;
6095 const Register zetas = c_rarg1;
6096
6097 const Register kyberConsts = r10;
6098 const Register tmpAddr = r11;
6100
6101 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x8H inputs/outputs
6102 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
6103 VSeq<2> vq(30); // n.b. constants overlap vs3
6104
6105 __ lea(kyberConsts,
6106 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
6107
6108 // level 0
6109 // At level 0 related coefficients occur in discrete blocks of size 4 so
6110 // need to be loaded interleaved using an ld2 operation with arrangement 4S.
6111
6112 vs_ldpq(vq, kyberConsts);
6113 int offsets4[4] = { 0, 32, 64, 96 };
6114 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
6115 load32shorts(vs_front(vs2), zetas);
6116 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
6117 vs_front(vs2), vs_back(vs2), vtmp, vq);
6118 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
6119 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
6120 load32shorts(vs_front(vs2), zetas);
6121 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
6122 vs_front(vs2), vs_back(vs2), vtmp, vq);
6123 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
6124 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
6125 load32shorts(vs_front(vs2), zetas);
6126 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
6127 vs_front(vs2), vs_back(vs2), vtmp, vq);
6128 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
6129 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
6130 load32shorts(vs_front(vs2), zetas);
6131 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
6132 vs_front(vs2), vs_back(vs2), vtmp, vq);
6133 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
6134
6135 // level 1
6136 // At level 1 related coefficients occur in discrete blocks of size 8 so
6137 // need to be loaded interleaved using an ld2 operation with arrangement 2D.
6138
6139 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
6140 load32shorts(vs_front(vs2), zetas);
6141 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
6142 vs_front(vs2), vs_back(vs2), vtmp, vq);
6143 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
6144 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
6145 load32shorts(vs_front(vs2), zetas);
6146 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
6147 vs_front(vs2), vs_back(vs2), vtmp, vq);
6148 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
6149
6150 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
6151 load32shorts(vs_front(vs2), zetas);
6152 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
6153 vs_front(vs2), vs_back(vs2), vtmp, vq);
6154 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
6155 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
6156 load32shorts(vs_front(vs2), zetas);
6157 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
6158 vs_front(vs2), vs_back(vs2), vtmp, vq);
6159 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
6160
6161 // level 2
6162 // At level 2 coefficients occur in 8 discrete blocks of size 16
6163 // so they are loaded by employing an ldr at 8 distinct offsets.
6164 // From here on vq is reloaded before each montmul because writing vs3 trashes it.
6165 int offsets3[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
6166 vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
6167 vs_ldr_indexed(vs2, __ Q, coeffs, 16, offsets3);
6168 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
6169 vs_subv(vs1, __ T8H, vs1, vs2);
6170 vs_str_indexed(vs3, __ Q, coeffs, 0, offsets3);
6171 load64shorts(vs2, zetas);
6172 vs_ldpq(vq, kyberConsts);
6173 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
6174 vs_str_indexed(vs2, __ Q, coeffs, 16, offsets3);
6175
6176 vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3);
6177 vs_ldr_indexed(vs2, __ Q, coeffs, 256 + 16, offsets3);
6178 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
6179 vs_subv(vs1, __ T8H, vs1, vs2);
6180 vs_str_indexed(vs3, __ Q, coeffs, 256, offsets3);
6181 load64shorts(vs2, zetas);
6182 vs_ldpq(vq, kyberConsts);
6183 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
6184 vs_str_indexed(vs2, __ Q, coeffs, 256 + 16, offsets3);
6185
6186 // Barrett reduction at indexes where overflow may happen
6187
6188 // load q and the multiplier for the Barrett reduction
6189 __ add(tmpAddr, kyberConsts, 16);
6190 vs_ldpq(vq, tmpAddr);
6191
6192 VSeq<8> vq1 = VSeq<8>(vq[0], 0); // 2 constant 8 sequences
6193 VSeq<8> vq2 = VSeq<8>(vq[1], 0); // for above two kyber constants
6194 VSeq<8> vq3 = VSeq<8>(v29, 0); // 3rd sequence for const montmul (v29 is loaded before the final multiply)
6195 vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
6196 vs_sqdmulh(vs2, __ T8H, vs1, vq2);
6197 vs_sshr(vs2, __ T8H, vs2, 11);
6198 vs_mlsv(vs1, __ T8H, vs2, vq1);
6199 vs_str_indexed(vs1, __ Q, coeffs, 0, offsets3);
6200 vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3);
6201 vs_sqdmulh(vs2, __ T8H, vs1, vq2);
6202 vs_sshr(vs2, __ T8H, vs2, 11);
6203 vs_mlsv(vs1, __ T8H, vs2, vq1);
6204 vs_str_indexed(vs1, __ Q, coeffs, 256, offsets3);
6205
6206 // level 3
6207 // From level 3 upwards coefficients occur in discrete blocks whose size is
6208 // some multiple of 32 so can be loaded using ldpq and suitable indexes.
6209
6210 int offsets2[4] = { 0, 64, 128, 192 };
6211 vs_ldpq_indexed(vs1, coeffs, 0, offsets2);
6212 vs_ldpq_indexed(vs2, coeffs, 32, offsets2);
6213 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
6214 vs_subv(vs1, __ T8H, vs1, vs2);
6215 vs_stpq_indexed(vs3, coeffs, 0, offsets2);
6216 load64shorts(vs2, zetas);
6217 vs_ldpq(vq, kyberConsts);
6218 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
6219 vs_stpq_indexed(vs2, coeffs, 32, offsets2);
6220
6221 vs_ldpq_indexed(vs1, coeffs, 256, offsets2);
6222 vs_ldpq_indexed(vs2, coeffs, 256 + 32, offsets2);
6223 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
6224 vs_subv(vs1, __ T8H, vs1, vs2);
6225 vs_stpq_indexed(vs3, coeffs, 256, offsets2);
6226 load64shorts(vs2, zetas);
6227 vs_ldpq(vq, kyberConsts);
6228 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
6229 vs_stpq_indexed(vs2, coeffs, 256 + 32, offsets2);
6230
6231 // level 4
6232
6233 int offsets1[4] = { 0, 32, 128, 160 };
6234 vs_ldpq_indexed(vs1, coeffs, 0, offsets1);
6235 vs_ldpq_indexed(vs2, coeffs, 64, offsets1);
6236 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
6237 vs_subv(vs1, __ T8H, vs1, vs2);
6238 vs_stpq_indexed(vs3, coeffs, 0, offsets1);
6239 load64shorts(vs2, zetas);
6240 vs_ldpq(vq, kyberConsts);
6241 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
6242 vs_stpq_indexed(vs2, coeffs, 64, offsets1);
6243
6244 vs_ldpq_indexed(vs1, coeffs, 256, offsets1);
6245 vs_ldpq_indexed(vs2, coeffs, 256 + 64, offsets1);
6246 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
6247 vs_subv(vs1, __ T8H, vs1, vs2);
6248 vs_stpq_indexed(vs3, coeffs, 256, offsets1);
6249 load64shorts(vs2, zetas);
6250 vs_ldpq(vq, kyberConsts);
6251 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
6252 vs_stpq_indexed(vs2, coeffs, 256 + 64, offsets1);
6253
6254 // level 5
6255
6256 __ add(tmpAddr, coeffs, 0);
6257 load64shorts(vs1, tmpAddr);
6258 __ add(tmpAddr, coeffs, 128);
6259 load64shorts(vs2, tmpAddr);
6260 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
6261 vs_subv(vs1, __ T8H, vs1, vs2);
6262 __ add(tmpAddr, coeffs, 0);
6263 store64shorts(vs3, tmpAddr);
6264 load64shorts(vs2, zetas);
6265 vs_ldpq(vq, kyberConsts);
6266 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
6267 __ add(tmpAddr, coeffs, 128);
6268 store64shorts(vs2, tmpAddr);
6269
6270 load64shorts(vs1, tmpAddr); // n.b. tmpAddr is coeffs + 256 here: the store64shorts above advanced it
6271 __ add(tmpAddr, coeffs, 384);
6272 load64shorts(vs2, tmpAddr);
6273 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
6274 vs_subv(vs1, __ T8H, vs1, vs2);
6275 __ add(tmpAddr, coeffs, 256);
6276 store64shorts(vs3, tmpAddr);
6277 load64shorts(vs2, zetas);
6278 vs_ldpq(vq, kyberConsts);
6279 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
6280 __ add(tmpAddr, coeffs, 384);
6281 store64shorts(vs2, tmpAddr);
6282
6283 // Barrett reduction at indexes where overflow may happen
6284
6285 // load q and the multiplier for the Barrett reduction
6286 __ add(tmpAddr, kyberConsts, 16);
6287 vs_ldpq(vq, tmpAddr);
6288 // NOTE(review): only vs_front(vs1) is loaded/stored below while the arithmetic spans all of vs1/vs2
6289 int offsets0[2] = { 0, 256 };
6290 vs_ldpq_indexed(vs_front(vs1), coeffs, 0, offsets0);
6291 vs_sqdmulh(vs2, __ T8H, vs1, vq2);
6292 vs_sshr(vs2, __ T8H, vs2, 11);
6293 vs_mlsv(vs1, __ T8H, vs2, vq1);
6294 vs_stpq_indexed(vs_front(vs1), coeffs, 0, offsets0);
6295
6296 // level 6
6297
6298 __ add(tmpAddr, coeffs, 0);
6299 load64shorts(vs1, tmpAddr);
6300 __ add(tmpAddr, coeffs, 256);
6301 load64shorts(vs2, tmpAddr);
6302 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
6303 vs_subv(vs1, __ T8H, vs1, vs2);
6304 __ add(tmpAddr, coeffs, 0);
6305 store64shorts(vs3, tmpAddr);
6306 load64shorts(vs2, zetas);
6307 vs_ldpq(vq, kyberConsts);
6308 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
6309 __ add(tmpAddr, coeffs, 256);
6310 store64shorts(vs2, tmpAddr);
6311
6312 __ add(tmpAddr, coeffs, 128);
6313 load64shorts(vs1, tmpAddr);
6314 __ add(tmpAddr, coeffs, 384);
6315 load64shorts(vs2, tmpAddr);
6316 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
6317 vs_subv(vs1, __ T8H, vs1, vs2);
6318 __ add(tmpAddr, coeffs, 128);
6319 store64shorts(vs3, tmpAddr);
6320 load64shorts(vs2, zetas);
6321 vs_ldpq(vq, kyberConsts);
6322 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
6323 __ add(tmpAddr, coeffs, 384);
6324 store64shorts(vs2, tmpAddr);
6325
6326 // multiply by 2^-n
6327
6328 // load toMont(2^-n mod q)
6329 __ add(tmpAddr, kyberConsts, 48);
6330 __ ldr(v29, __ Q, tmpAddr); // v29 backs the constant sequence vq3
6331
6332 vs_ldpq(vq, kyberConsts);
6333 __ add(tmpAddr, coeffs, 0);
6334 load64shorts(vs1, tmpAddr);
6335 kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
6336 __ add(tmpAddr, coeffs, 0);
6337 store64shorts(vs2, tmpAddr);
6338
6339 // now tmpAddr contains coeffs + 128 because store64shorts adjusted it so
6340 load64shorts(vs1, tmpAddr);
6341 kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
6342 __ add(tmpAddr, coeffs, 128);
6343 store64shorts(vs2, tmpAddr);
6344
6345 // now tmpAddr contains coeffs + 256
6346 load64shorts(vs1, tmpAddr);
6347 kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
6348 __ add(tmpAddr, coeffs, 256);
6349 store64shorts(vs2, tmpAddr);
6350
6351 // now tmpAddr contains coeffs + 384
6352 load64shorts(vs1, tmpAddr);
6353 kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
6354 __ add(tmpAddr, coeffs, 384);
6355 store64shorts(vs2, tmpAddr);
6356
6357 __ leave(); // required for proper stackwalking of RuntimeStub frame
6358 __ mov(r0, zr); // return 0 (Java callees return 1. Caller ignores the return value)
6359 __ ret(lr);
6360
6361 // record the stub entry and end
6362 store_archive_data(stub_id, start, __ pc());
6363
6364 return start;
6365 }
6366
6367 // Kyber multiply polynomials in the NTT domain.
6368 // Implements
6369 // static int implKyberNttMult(
6370 // short[] result, short[] ntta, short[] nttb, short[] zetas) {}
6371 //
6372 // The actual algorithm that is used here differs from the one in the Java
6373 // implementation, it uses Montgomery multiplications instead of Barrett
6374 // reduction, but the end result modulo MLKEM_Q is the same. This is the
6375 // Java equivalent of this intrinsic implementation:
6376 // static void implKyberNttMultJava(short[] result, short[] ntta, short[] nttb) {
6377 // for (int m = 0; m < ML_KEM_N / 2; m++) {
6378 // int a0 = ntta[2 * m];
6379 // int a1 = ntta[2 * m + 1];
6380 // int b0 = nttb[2 * m];
6381 // int b1 = nttb[2 * m + 1];
6382 // int r = montMul(a0, b0) +
6383 // montMul(montMul(a1, b1), MONT_ZETAS_FOR_NTT_MULT[m]);
6384 // result[2 * m] = (short) montMul(r, MONT_R_SQUARE_MOD_Q);
6385 // result[2 * m + 1] = (short) montMul(
6386 // (montMul(a0, b1) + montMul(a1, b0)), MONT_R_SQUARE_MOD_Q);
6387 // }
6388 // }
6389 //
6390 // result (short[256]) = c_rarg0
6391 // ntta (short[256]) = c_rarg1
6392 // nttb (short[256]) = c_rarg2
6393 // zetas (short[128]) = c_rarg3
6394 address generate_kyberNttMult() {
6395 StubId stub_id = StubId::stubgen_kyberNttMult_id;
6396 int entry_count = StubInfo::entry_count(stub_id);
6397 assert(entry_count == 1, "sanity check");
6398 address start = load_archive_data(stub_id);
6399 if (start != nullptr) {
6400 return start; // a non-null result is used as the entry; no code is generated
6401 }
6402 __ align(CodeEntryAlignment);
6403 StubCodeMark mark(this, stub_id);
6404 start = __ pc();
6405 __ enter();
6406
6407 const Register result = c_rarg0;
6408 const Register ntta = c_rarg1;
6409 const Register nttb = c_rarg2;
6410 const Register zetas = c_rarg3;
6411
6412 const Register kyberConsts = r10;
6413 const Register limit = r11;
6414
6415 VSeq<4> vs1(0), vs2(4); // 4 sequences of 4 x 8H registers:
6416 VSeq<4> vs3(16), vs4(20); // inputs/outputs/tmps
6417 VSeq<2> vq(30); // pair of constants for montmul: q, qinv
6418 VSeq<2> vz(28); // pair of zetas
6419 VSeq<4> vc(27, 0); // constant sequence for montmul: montRSquareModQ
6420
6421 __ lea(kyberConsts,
6422 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
6423
6424 Label kyberNttMult_loop;
6425
6426 __ add(limit, result, 512); // loop bound: result + 512 bytes (256 shorts)
6427
6428 // load q and qinv
6429 vs_ldpq(vq, kyberConsts);
6430
6431 // load R^2 mod q (to convert back from Montgomery representation)
6432 __ add(kyberConsts, kyberConsts, 64); // n.b. kyberConsts itself is advanced by 64 here
6433 __ ldr(v27, __ Q, kyberConsts);
6434
6435 __ BIND(kyberNttMult_loop);
6436
6437 // load 16 zetas
6438 vs_ldpq_post(vz, zetas);
6439
6440 // load 2 sets of 32 coefficients from the two input arrays
6441 // interleaved as shorts. i.e. pairs of shorts adjacent in memory
6442 // are striped across pairs of vector registers
6443 vs_ld2_post(vs_front(vs1), __ T8H, ntta); // <a0, a1> x 8H
6444 vs_ld2_post(vs_back(vs1), __ T8H, nttb); // <b0, b1> x 8H
6445 vs_ld2_post(vs_front(vs4), __ T8H, ntta); // <a2, a3> x 8H
6446 vs_ld2_post(vs_back(vs4), __ T8H, nttb); // <b2, b3> x 8H
6447
6448 // compute 4 montmul cross-products for pairs (a0,a1) and (b0,b1)
6449 // i.e. montmul the first and second halves of vs1 in order and
6450 // then with one sequence reversed storing the two results in vs3
6451 //
6452 // vs3[0] <- montmul(a0, b0)
6453 // vs3[1] <- montmul(a1, b1)
6454 // vs3[2] <- montmul(a0, b1)
6455 // vs3[3] <- montmul(a1, b0)
6456 kyber_montmul16(vs_front(vs3), vs_front(vs1), vs_back(vs1), vs_front(vs2), vq);
6457 kyber_montmul16(vs_back(vs3),
6458 vs_front(vs1), vs_reverse(vs_back(vs1)), vs_back(vs2), vq);
6459
6460 // compute 4 montmul cross-products for pairs (a2,a3) and (b2,b3)
6461 // i.e. montmul the first and second halves of vs4 in order and
6462 // then with one sequence reversed storing the two results in vs1
6463 //
6464 // vs1[0] <- montmul(a2, b2)
6465 // vs1[1] <- montmul(a3, b3)
6466 // vs1[2] <- montmul(a2, b3)
6467 // vs1[3] <- montmul(a3, b2)
6468 kyber_montmul16(vs_front(vs1), vs_front(vs4), vs_back(vs4), vs_front(vs2), vq);
6469 kyber_montmul16(vs_back(vs1),
6470 vs_front(vs4), vs_reverse(vs_back(vs4)), vs_back(vs2), vq);
6471
6472 // montmul result 2 of each cross-product i.e. (a1*b1, a3*b3) by a zeta.
6473 // We can schedule two montmuls at a time if we use a suitable vector
6474 // sequence <vs3[1], vs1[1]>.
6475 int delta = vs1[1]->encoding() - vs3[1]->encoding(); // register-number stride from vs3[1] to vs1[1]
6476 VSeq<2> vs5(vs3[1], delta);
6477
6478 // vs3[1] <- montmul(montmul(a1, b1), z0)
6479 // vs1[1] <- montmul(montmul(a3, b3), z1)
6480 kyber_montmul16(vs5, vz, vs5, vs_front(vs2), vq);
6481
6482 // add results in pairs storing in vs3
6483 // vs3[0] <- montmul(a0, b0) + montmul(montmul(a1, b1), z0);
6484 // vs3[1] <- montmul(a0, b1) + montmul(a1, b0);
6485 vs_addv(vs_front(vs3), __ T8H, vs_even(vs3), vs_odd(vs3));
6486
6487 // vs3[2] <- montmul(a2, b2) + montmul(montmul(a3, b3), z1);
6488 // vs3[3] <- montmul(a2, b3) + montmul(a3, b2);
6489 vs_addv(vs_back(vs3), __ T8H, vs_even(vs1), vs_odd(vs1));
6490
6491 // vs1 <- montmul(vs3, montRSquareModQ)
6492 kyber_montmul32(vs1, vs3, vc, vs2, vq);
6493
6494 // store back the two pairs of result vectors de-interleaved as 8H elements
6495 // i.e. storing each pairs of shorts striped across a register pair adjacent
6496 // in memory
6497 vs_st2_post(vs1, __ T8H, result);
6498
6499 __ cmp(result, limit); // result was post-incremented by the store above
6500 __ br(Assembler::NE, kyberNttMult_loop);
6501
6502 __ leave(); // required for proper stackwalking of RuntimeStub frame
6503 __ mov(r0, zr); // return 0 (Java callees return 1. Caller ignores the return value)
6504 __ ret(lr);
6505
6506 // record the stub entry and end
6507 store_archive_data(stub_id, start, __ pc());
6508
6509 return start;
6510 }
6511
6512 // Kyber add 2 polynomials.
6513 // Implements
6514 // static int implKyberAddPoly(short[] result, short[] a, short[] b) {}
6515 // Each 8H lane is computed as a + b + the constant loaded from kyberConsts + 16 (labelled q below).
6516 // result (short[256]) = c_rarg0
6517 // a (short[256]) = c_rarg1
6518 // b (short[256]) = c_rarg2
6519 address generate_kyberAddPoly_2() {
6520 StubId stub_id = StubId::stubgen_kyberAddPoly_2_id;
6521 int entry_count = StubInfo::entry_count(stub_id);
6522 assert(entry_count == 1, "sanity check");
6523 address start = load_archive_data(stub_id);
6524 if (start != nullptr) {
6525 return start; // a non-null result is used as the entry; no code is generated
6526 }
6527 __ align(CodeEntryAlignment);
6528 StubCodeMark mark(this, stub_id);
6529 start = __ pc();
6530 __ enter();
6531
6532 const Register result = c_rarg0;
6533 const Register a = c_rarg1;
6534 const Register b = c_rarg2;
6535
6536 const Register kyberConsts = r11;
6537
6538 // We sum 256 sets of values in total i.e. 32 x 8H quadwords.
6539 // So, we can load, add and store the data in 3 groups of 11,
6540 // 11 and 10 at a time i.e. we need to map sets of 10 or 11
6541 // registers. A further constraint is that the mapping needs
6542 // to skip callee saves. So, we allocate the register
6543 // sequences using two 8 sequences, two 2 sequences and two
6544 // single registers.
6545 VSeq<8> vs1_1(0);
6546 VSeq<2> vs1_2(16);
6547 FloatRegister vs1_3 = v28;
6548 VSeq<8> vs2_1(18);
6549 VSeq<2> vs2_2(26);
6550 FloatRegister vs2_3 = v29;
6551
6552 // two constant vector sequences
6553 VSeq<8> vc_1(31, 0); // delta 0: every element is v31
6554 VSeq<2> vc_2(31, 0);
6555
6556 FloatRegister vc_3 = v31;
6557 __ lea(kyberConsts,
6558 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
6559
6560 __ ldr(vc_3, __ Q, Address(kyberConsts, 16)); // q
6561 for (int i = 0; i < 3; i++) { // generation-time loop: the three groups are emitted unrolled
6562 // load 80 or 88 values from a into vs1_1/2/3
6563 vs_ldpq_post(vs1_1, a);
6564 vs_ldpq_post(vs1_2, a);
6565 if (i < 2) { // the 11th quadword exists only in the first two groups
6566 __ ldr(vs1_3, __ Q, __ post(a, 16));
6567 }
6568 // load 80 or 88 values from b into vs2_1/2/3
6569 vs_ldpq_post(vs2_1, b);
6570 vs_ldpq_post(vs2_2, b);
6571 if (i < 2) {
6572 __ ldr(vs2_3, __ Q, __ post(b, 16));
6573 }
6574 // sum 80 or 88 values across vs1 and vs2 into vs1
6575 vs_addv(vs1_1, __ T8H, vs1_1, vs2_1);
6576 vs_addv(vs1_2, __ T8H, vs1_2, vs2_2);
6577 if (i < 2) {
6578 __ addv(vs1_3, __ T8H, vs1_3, vs2_3);
6579 }
6580 // add constant to all 80 or 88 results
6581 vs_addv(vs1_1, __ T8H, vs1_1, vc_1);
6582 vs_addv(vs1_2, __ T8H, vs1_2, vc_2);
6583 if (i < 2) {
6584 __ addv(vs1_3, __ T8H, vs1_3, vc_3);
6585 }
6586 // store 80 or 88 values
6587 vs_stpq_post(vs1_1, result);
6588 vs_stpq_post(vs1_2, result);
6589 if (i < 2) {
6590 __ str(vs1_3, __ Q, __ post(result, 16));
6591 }
6592 }
6593
6594 __ leave(); // required for proper stackwalking of RuntimeStub frame
6595 __ mov(r0, zr); // return 0 (Java callees return 1. Caller ignores the return value)
6596 __ ret(lr);
6597
6598 // record the stub entry and end
6599 store_archive_data(stub_id, start, __ pc());
6600
6601 return start;
6602 }
6603
6604 // Kyber add 3 polynomials.
6605 // Implements
6606 // static int implKyberAddPoly(short[] result, short[] a, short[] b, short[] c) {}
6607 // Each 8H lane is computed as a + b + c + the constant loaded from kyberConsts + 16 (labelled q below).
6608 // result (short[256]) = c_rarg0
6609 // a (short[256]) = c_rarg1
6610 // b (short[256]) = c_rarg2
6611 // c (short[256]) = c_rarg3
6612 address generate_kyberAddPoly_3() {
6613 StubId stub_id = StubId::stubgen_kyberAddPoly_3_id;
6614 int entry_count = StubInfo::entry_count(stub_id);
6615 assert(entry_count == 1, "sanity check");
6616 address start = load_archive_data(stub_id);
6617 if (start != nullptr) {
6618 return start; // a non-null result is used as the entry; no code is generated
6619 }
6620 __ align(CodeEntryAlignment);
6621 StubCodeMark mark(this, stub_id);
6622 start = __ pc();
6623 __ enter();
6624
6625 const Register result = c_rarg0;
6626 const Register a = c_rarg1;
6627 const Register b = c_rarg2;
6628 const Register c = c_rarg3;
6629
6630 const Register kyberConsts = r11;
6631
6632 // As above we sum 256 sets of values in total i.e. 32 x 8H
6633 // quadwords. So, we can load, add and store the data in 3
6634 // groups of 11, 11 and 10 at a time i.e. we need to map sets
6635 // of 10 or 11 registers. A further constraint is that the
6636 // mapping needs to skip callee saves. So, we allocate the
6637 // register sequences using two 8 sequences, two 2 sequences
6638 // and two single registers.
6639 VSeq<8> vs1_1(0);
6640 VSeq<2> vs1_2(16);
6641 FloatRegister vs1_3 = v28;
6642 VSeq<8> vs2_1(18);
6643 VSeq<2> vs2_2(26);
6644 FloatRegister vs2_3 = v29;
6645
6646 // two constant vector sequences
6647 VSeq<8> vc_1(31, 0); // delta 0: every element is v31
6648 VSeq<2> vc_2(31, 0);
6649
6650 FloatRegister vc_3 = v31;
6651
6652 __ lea(kyberConsts,
6653 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
6654
6655 __ ldr(vc_3, __ Q, Address(kyberConsts, 16)); // q
6656 for (int i = 0; i < 3; i++) { // generation-time loop: the three groups are emitted unrolled
6657 // load 80 or 88 values from a into vs1_1/2/3
6658 vs_ldpq_post(vs1_1, a);
6659 vs_ldpq_post(vs1_2, a);
6660 if (i < 2) { // the 11th quadword exists only in the first two groups
6661 __ ldr(vs1_3, __ Q, __ post(a, 16));
6662 }
6663 // load 80 or 88 values from b into vs2_1/2/3
6664 vs_ldpq_post(vs2_1, b);
6665 vs_ldpq_post(vs2_2, b);
6666 if (i < 2) {
6667 __ ldr(vs2_3, __ Q, __ post(b, 16));
6668 }
6669 // sum 80 or 88 values across vs1 and vs2 into vs1
6670 vs_addv(vs1_1, __ T8H, vs1_1, vs2_1);
6671 vs_addv(vs1_2, __ T8H, vs1_2, vs2_2);
6672 if (i < 2) {
6673 __ addv(vs1_3, __ T8H, vs1_3, vs2_3);
6674 }
6675 // load 80 or 88 values from c into vs2_1/2/3 (reusing the registers that held b)
6676 vs_ldpq_post(vs2_1, c);
6677 vs_ldpq_post(vs2_2, c);
6678 if (i < 2) {
6679 __ ldr(vs2_3, __ Q, __ post(c, 16));
6680 }
6681 // sum 80 or 88 values across vs1 and vs2 into vs1
6682 vs_addv(vs1_1, __ T8H, vs1_1, vs2_1);
6683 vs_addv(vs1_2, __ T8H, vs1_2, vs2_2);
6684 if (i < 2) {
6685 __ addv(vs1_3, __ T8H, vs1_3, vs2_3);
6686 }
6687 // add constant to all 80 or 88 results
6688 vs_addv(vs1_1, __ T8H, vs1_1, vc_1);
6689 vs_addv(vs1_2, __ T8H, vs1_2, vc_2);
6690 if (i < 2) {
6691 __ addv(vs1_3, __ T8H, vs1_3, vc_3);
6692 }
6693 // store 80 or 88 values
6694 vs_stpq_post(vs1_1, result);
6695 vs_stpq_post(vs1_2, result);
6696 if (i < 2) {
6697 __ str(vs1_3, __ Q, __ post(result, 16));
6698 }
6699 }
6700
6701 __ leave(); // required for proper stackwalking of RuntimeStub frame
6702 __ mov(r0, zr); // return 0 (Java callees return 1. Caller ignores the return value)
6703 __ ret(lr);
6704
6705 // record the stub entry and end
6706 store_archive_data(stub_id, start, __ pc());
6707
6708 return start;
6709 }
6710
6711 // Kyber parse XOF output to polynomial coefficient candidates
6712 // or decodePoly(12, ...).
6713 // Implements
6714 // static int implKyber12To16(
6715 // byte[] condensed, int index, short[] parsed, int parsedLength) {}
6716 //
6717 // we assume that parsed and condensed are allocated such that for
6718 // n = (parsedLength + 63) / 64
6719 // n blocks of 96 bytes of input can be processed, i.e.
6720 // index + n * 96 <= condensed.length and
6721 // n * 64 <= parsed.length
6722 //
6723 // condensed (byte[]) = c_rarg0
6724 // condensedIndex = c_rarg1
6725 // parsed (short[]) = c_rarg2
6726 // parsedLength = c_rarg3
6727 address generate_kyber12To16() {
6728 StubId stub_id = StubId::stubgen_kyber12To16_id;
6729 int entry_count = StubInfo::entry_count(stub_id);
6730 assert(entry_count == 1, "sanity check");
6731 address start = load_archive_data(stub_id);
6732 if (start != nullptr) {
6733 return start; // a non-null result is used as the entry; no code is generated
6734 }
6735 Label L_F00, L_loop;
6736
6737 __ align(CodeEntryAlignment);
6738 StubCodeMark mark(this, stub_id);
6739 start = __ pc();
6740 __ enter();
6741
6742 const Register condensed = c_rarg0;
6743 const Register condensedOffs = c_rarg1;
6744 const Register parsed = c_rarg2;
6745 const Register parsedLength = c_rarg3;
6746
6747 const Register tmpAddr = r11;
6748
6749 // Data is input 96 bytes at a time i.e. in groups of 6 x 16B
6750 // quadwords so we need a 6 vector sequence for the inputs.
6751 // Parsing produces 64 shorts, employing two 8 vector
6752 // sequences to store and combine the intermediate data.
6753 VSeq<6> vin(24);
6754 VSeq<8> va(0), vb(16);
6755
6756 __ adr(tmpAddr, L_F00);
6757 __ ldr(v31, __ Q, tmpAddr); // 8H times 0x0f00
6758 __ add(condensed, condensed, condensedOffs); // condensed now addresses the first input byte
6759
6760 __ BIND(L_loop);
6761 // load 96 (6 x 16B) byte values
6762 vs_ld3_post(vin, __ T16B, condensed);
6763
6764 // The front half of sequence vin (vin[0], vin[1] and vin[2])
6765 // holds 48 (16x3) contiguous bytes from memory striped
6766 // horizontally across each of the 16 byte lanes. Equivalently,
6767 // that is 16 pairs of 12-bit integers. Likewise the back half
6768 // holds the next 48 bytes in the same arrangement.
6769
6770 // Each vector in the front half can also be viewed as a vertical
6771 // strip across the 16 pairs of 12 bit integers. Each byte in
6772 // vin[0] stores the low 8 bits of the first int in a pair. Each
6773 // byte in vin[1] stores the high 4 bits of the first int and the
6774 // low 4 bits of the second int. Each byte in vin[2] stores the
6775 // high 8 bits of the second int. Likewise the vectors in second
6776 // half.
6777
6778 // Converting the data to 16-bit shorts requires first of all
6779 // expanding each of the 6 x 16B vectors into 6 corresponding
6780 // pairs of 8H vectors. Mask, shift and add operations on the
6781 // resulting vector pairs can be used to combine 4 and 8 bit
6782 // parts of related 8H vector elements.
6783 //
6784 // The middle vectors (vin[1] and vin[4]) are actually expanded
6785 // twice, one copy manipulated to provide the higher 4 bits
6786 // belonging to the first short in a pair and another copy
6787 // manipulated to provide the lower 4 bits belonging to the
6788 // second short in a pair. This is why the vector sequences va
6789 // and vb used to hold the expanded 8H elements are of length 8.
6790
6791 // Expand vin[0] into va[0:1], and vin[1] into va[2:3] and va[4:5]
6792 // n.b. target elements 2 and 3 duplicate elements 4 and 5
6793 __ ushll(va[0], __ T8H, vin[0], __ T8B, 0);
6794 __ ushll2(va[1], __ T8H, vin[0], __ T16B, 0);
6795 __ ushll(va[2], __ T8H, vin[1], __ T8B, 0);
6796 __ ushll2(va[3], __ T8H, vin[1], __ T16B, 0);
6797 __ ushll(va[4], __ T8H, vin[1], __ T8B, 0);
6798 __ ushll2(va[5], __ T8H, vin[1], __ T16B, 0);
6799
6800 // likewise expand vin[3] into vb[0:1], and vin[4] into vb[2:3]
6801 // and vb[4:5]
6802 __ ushll(vb[0], __ T8H, vin[3], __ T8B, 0);
6803 __ ushll2(vb[1], __ T8H, vin[3], __ T16B, 0);
6804 __ ushll(vb[2], __ T8H, vin[4], __ T8B, 0);
6805 __ ushll2(vb[3], __ T8H, vin[4], __ T16B, 0);
6806 __ ushll(vb[4], __ T8H, vin[4], __ T8B, 0);
6807 __ ushll2(vb[5], __ T8H, vin[4], __ T16B, 0);
6808
6809 // shift lo byte of copy 1 of the middle stripe into the high byte
6810 __ shl(va[2], __ T8H, va[2], 8);
6811 __ shl(va[3], __ T8H, va[3], 8);
6812 __ shl(vb[2], __ T8H, vb[2], 8);
6813 __ shl(vb[3], __ T8H, vb[3], 8);
6814
6815 // expand vin[2] into va[6:7] and vin[5] into vb[6:7] but this
6816 // time pre-shifted by 4 to ensure top bits of input 12-bit int
6817 // are in bit positions [4..11].
6818 __ ushll(va[6], __ T8H, vin[2], __ T8B, 4);
6819 __ ushll2(va[7], __ T8H, vin[2], __ T16B, 4);
6820 __ ushll(vb[6], __ T8H, vin[5], __ T8B, 4);
6821 __ ushll2(vb[7], __ T8H, vin[5], __ T16B, 4);
6822
6823 // mask hi 4 bits of the 1st 12-bit int in a pair from copy1 and
6824 // shift lo 4 bits of the 2nd 12-bit int in a pair to the bottom of
6825 // copy2
6826 __ andr(va[2], __ T16B, va[2], v31);
6827 __ andr(va[3], __ T16B, va[3], v31);
6828 __ ushr(va[4], __ T8H, va[4], 4);
6829 __ ushr(va[5], __ T8H, va[5], 4);
6830 __ andr(vb[2], __ T16B, vb[2], v31);
6831 __ andr(vb[3], __ T16B, vb[3], v31);
6832 __ ushr(vb[4], __ T8H, vb[4], 4);
6833 __ ushr(vb[5], __ T8H, vb[5], 4);
6834
6835 // sum hi 4 bits and lo 8 bits of the 1st 12-bit int in each pair and
6836 // hi 8 bits plus lo 4 bits of the 2nd 12-bit int in each pair
6837 // n.b. the ordering ensures: i) inputs are consumed before they
6838 // are overwritten ii) the order of 16-bit results across successive
6839 // pairs of vectors in va and then vb reflects the order of the
6840 // corresponding 12-bit inputs
6841 __ addv(va[0], __ T8H, va[0], va[2]);
6842 __ addv(va[2], __ T8H, va[1], va[3]);
6843 __ addv(va[1], __ T8H, va[4], va[6]);
6844 __ addv(va[3], __ T8H, va[5], va[7]);
6845 __ addv(vb[0], __ T8H, vb[0], vb[2]);
6846 __ addv(vb[2], __ T8H, vb[1], vb[3]);
6847 __ addv(vb[1], __ T8H, vb[4], vb[6]);
6848 __ addv(vb[3], __ T8H, vb[5], vb[7]);
6849
6850 // store 64 results interleaved as shorts
6851 vs_st2_post(vs_front(va), __ T8H, parsed);
6852 vs_st2_post(vs_front(vb), __ T8H, parsed);
6853
6854 __ sub(parsedLength, parsedLength, 64); // 64 shorts were produced by this iteration
6855 __ cmp(parsedLength, (u1)0);
6856 __ br(Assembler::GT, L_loop); // n.b. the body always runs at least once; repeat while output remains
6857
6858 __ leave(); // required for proper stackwalking of RuntimeStub frame
6859 __ mov(r0, zr); // return 0 (Java callees return 1. Caller ignores the return value)
6860 __ ret(lr);
6861
6862 // bind label and generate constant data used by this stub
6863 __ BIND(L_F00); // 16 bytes of mask data placed after the ret; loaded into v31 at stub entry
6864 __ emit_int64(0x0f000f000f000f00);
6865 __ emit_int64(0x0f000f000f000f00);
6866
6867 // record the stub entry and end
6868 store_archive_data(stub_id, start, __ pc());
6869
6870 return start;
6871 }
6872
6873 // Kyber Barrett reduce function.
6874 // Implements
6875 // static int implKyberBarrettReduce(short[] coeffs) {}
6876 //
6877 // coeffs (short[256]) = c_rarg0
6878 address generate_kyberBarrettReduce() {
6879 StubId stub_id = StubId::stubgen_kyberBarrettReduce_id;
6880 int entry_count = StubInfo::entry_count(stub_id);
6881 assert(entry_count == 1, "sanity check");
6882 address start = load_archive_data(stub_id);
6883 if (start != nullptr) {
6884 return start;
6885 }
6886 __ align(CodeEntryAlignment);
6887 StubCodeMark mark(this, stub_id);
6888 start = __ pc();
6889 __ enter();
6890
6891 const Register coeffs = c_rarg0;
6892
6893 const Register kyberConsts = r10;
6894 const Register result = r11;
6895
6896 // As above we process 256 sets of values in total i.e. 32 x
6897 // 8H quadwords. So, we can load, add and store the data in 3
6898 // groups of 11, 11 and 10 at a time i.e. we need to map sets
6899 // of 10 or 11 registers. A further constraint is that the
6900 // mapping needs to skip callee saves. So, we allocate the
6901 // register sequences using two 8 sequences, two 2 sequences
6902 // and two single registers.
6903 VSeq<8> vs1_1(0);
6904 VSeq<2> vs1_2(16);
6905 FloatRegister vs1_3 = v28;
6906 VSeq<8> vs2_1(18);
6907 VSeq<2> vs2_2(26);
6908 FloatRegister vs2_3 = v29;
6909
6910 // we also need a pair of corresponding constant sequences
6911
6912 VSeq<8> vc1_1(30, 0);
6913 VSeq<2> vc1_2(30, 0);
6914 FloatRegister vc1_3 = v30; // for kyber_q
6915
6916 VSeq<8> vc2_1(31, 0);
6917 VSeq<2> vc2_2(31, 0);
6918 FloatRegister vc2_3 = v31; // for kyberBarrettMultiplier
6919
6920 __ add(result, coeffs, 0);
6921 __ lea(kyberConsts,
6922 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
6923
6924 // load q and the multiplier for the Barrett reduction
6925 __ add(kyberConsts, kyberConsts, 16);
6926 __ ldpq(vc1_3, vc2_3, kyberConsts);
6927
6928 for (int i = 0; i < 3; i++) {
6929 // load 80 or 88 coefficients
6930 vs_ldpq_post(vs1_1, coeffs);
6931 vs_ldpq_post(vs1_2, coeffs);
6932 if (i < 2) {
6933 __ ldr(vs1_3, __ Q, __ post(coeffs, 16));
6934 }
6935
6936 // vs2 <- (2 * vs1 * kyberBarrettMultiplier) >> 16
6937 vs_sqdmulh(vs2_1, __ T8H, vs1_1, vc2_1);
6938 vs_sqdmulh(vs2_2, __ T8H, vs1_2, vc2_2);
6939 if (i < 2) {
6940 __ sqdmulh(vs2_3, __ T8H, vs1_3, vc2_3);
6941 }
6942
6943 // vs2 <- (vs1 * kyberBarrettMultiplier) >> 26
6944 vs_sshr(vs2_1, __ T8H, vs2_1, 11);
6945 vs_sshr(vs2_2, __ T8H, vs2_2, 11);
6946 if (i < 2) {
6947 __ sshr(vs2_3, __ T8H, vs2_3, 11);
6948 }
6949
6950 // vs1 <- vs1 - vs2 * kyber_q
6951 vs_mlsv(vs1_1, __ T8H, vs2_1, vc1_1);
6952 vs_mlsv(vs1_2, __ T8H, vs2_2, vc1_2);
6953 if (i < 2) {
6954 __ mlsv(vs1_3, __ T8H, vs2_3, vc1_3);
6955 }
6956
6957 vs_stpq_post(vs1_1, result);
6958 vs_stpq_post(vs1_2, result);
6959 if (i < 2) {
6960 __ str(vs1_3, __ Q, __ post(result, 16));
6961 }
6962 }
6963
6964 __ leave(); // required for proper stackwalking of RuntimeStub frame
6965 __ mov(r0, zr); // return 0 (Java callees return 1. Caller ignores the return value)
6966 __ ret(lr);
6967
6968 // record the stub entry and end
6969 store_archive_data(stub_id, start, __ pc());
6970
6971 return start;
6972 }
6973
6974
6975 // Dilithium-specific montmul helper routines that generate parallel
6976 // code for, respectively, a single 4x4s vector sequence montmul or
6977 // two such multiplies in a row.
6978
6979 // Perform 16 32-bit Montgomery multiplications in parallel
6980 void dilithium_montmul16(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
6981 const VSeq<4>& vtmp, const VSeq<2>& vq) {
6982 // Use the helper routine to schedule a 4x4S Montgomery multiply.
6983 // It will assert that the register use is valid
6984 vs_montmul4(va, vb, vc, __ T4S, vtmp, vq);
6985 }
6986
6987 // Perform 2x16 32-bit Montgomery multiplications in parallel
6988 void dilithium_montmul32(const VSeq<8>& va, const VSeq<8>& vb, const VSeq<8>& vc,
6989 const VSeq<4>& vtmp, const VSeq<2>& vq) {
6990 // Schedule two successive 4x4S multiplies via the montmul helper
6991 // on the front and back halves of va, vb and vc. The helper will
6992 // assert that the register use has no overlap conflicts on each
6993 // individual call but we also need to ensure that the necessary
6994 // disjoint/equality constraints are met across both calls.
6995
6996 // vb, vc, vtmp and vq must be disjoint. va must either be
6997 // disjoint from all other registers or equal vc
6998
6999 assert(vs_disjoint(vb, vc), "vb and vc overlap");
7000 assert(vs_disjoint(vb, vq), "vb and vq overlap");
7001 assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
7002
7003 assert(vs_disjoint(vc, vq), "vc and vq overlap");
7004 assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
7005
7006 assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
7007
7008 assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
7009 assert(vs_disjoint(va, vb), "va and vb overlap");
7010 assert(vs_disjoint(va, vq), "va and vq overlap");
7011 assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
7012
7013 // We multiply the front and back halves of each sequence 4 at a
7014 // time because
7015 //
7016 // 1) we are currently only able to get 4-way instruction
7017 // parallelism at best
7018 //
7019 // 2) we need registers for the constants in vq and temporary
7020 // scratch registers to hold intermediate results so vtmp can only
7021 // be a VSeq<4> which means we only have 4 scratch slots.
7022
7023 vs_montmul4(vs_front(va), vs_front(vb), vs_front(vc), __ T4S, vtmp, vq);
7024 vs_montmul4(vs_back(va), vs_back(vb), vs_back(vc), __ T4S, vtmp, vq);
7025 }
7026
7027 // Perform combined montmul then add/sub on 4x4S vectors.
7028 void dilithium_montmul16_sub_add(
7029 const VSeq<4>& va0, const VSeq<4>& va1, const VSeq<4>& vc,
7030 const VSeq<4>& vtmp, const VSeq<2>& vq) {
7031 // compute a = montmul(a1, c)
7032 dilithium_montmul16(vc, va1, vc, vtmp, vq);
7033 // ouptut a1 = a0 - a
7034 vs_subv(va1, __ T4S, va0, vc);
7035 // and a0 = a0 + a
7036 vs_addv(va0, __ T4S, va0, vc);
7037 }
7038
7039 // Perform combined add/sub then montmul on 4x4S vectors.
7040 void dilithium_sub_add_montmul16(
7041 const VSeq<4>& va0, const VSeq<4>& va1, const VSeq<4>& vb,
7042 const VSeq<4>& vtmp1, const VSeq<4>& vtmp2, const VSeq<2>& vq) {
7043 // compute c = a0 - a1
7044 vs_subv(vtmp1, __ T4S, va0, va1);
7045 // output a0 = a0 + a1
7046 vs_addv(va0, __ T4S, va0, va1);
7047 // output a1 = b montmul c
7048 dilithium_montmul16(va1, vtmp1, vb, vtmp2, vq);
7049 }
7050
7051 // At these levels, the indices that correspond to the 'j's (and 'j+l's)
7052 // in the Java implementation come in sequences of at least 8, so we
7053 // can use ldpq to collect the corresponding data into pairs of vector
7054 // registers.
7055 // We collect the coefficients corresponding to the 'j+l' indexes into
7056 // the vector registers v0-v7, the zetas into the vector registers v16-v23
7057 // then we do the (Montgomery) multiplications by the zetas in parallel
7058 // into v16-v23, load the coeffs corresponding to the 'j' indexes into
7059 // v0-v7, then do the additions into v24-v31 and the subtractions into
7060 // v0-v7 and finally save the results back to the coeffs array.
7061 void dilithiumNttLevel0_4(const Register dilithiumConsts,
7062 const Register coeffs, const Register zetas) {
7063 int c1 = 0;
7064 int c2 = 512;
7065 int startIncr;
7066 // don't use callee save registers v8 - v15
7067 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs
7068 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
7069 VSeq<2> vq(30); // n.b. constants overlap vs3
7070 int offsets[4] = { 0, 32, 64, 96 };
7071
7072 for (int level = 0; level < 5; level++) {
7073 int c1Start = c1;
7074 int c2Start = c2;
7075 if (level == 3) {
7076 offsets[1] = 32;
7077 offsets[2] = 128;
7078 offsets[3] = 160;
7079 } else if (level == 4) {
7080 offsets[1] = 64;
7081 offsets[2] = 128;
7082 offsets[3] = 192;
7083 }
7084
7085 // For levels 1 - 4 we simply load 2 x 4 adjacent values at a
7086 // time at 4 different offsets and multiply them in order by the
7087 // next set of input values. So we employ indexed load and store
7088 // pair instructions with arrangement 4S.
7089 for (int i = 0; i < 4; i++) {
7090 // reload q and qinv
7091 vs_ldpq(vq, dilithiumConsts); // qInv, q
7092 // load 8x4S coefficients via second start pos == c2
7093 vs_ldpq_indexed(vs1, coeffs, c2Start, offsets);
7094 // load next 8x4S inputs == b
7095 vs_ldpq_post(vs2, zetas);
7096 // compute a == c2 * b mod MONT_Q
7097 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
7098 // load 8x4s coefficients via first start pos == c1
7099 vs_ldpq_indexed(vs1, coeffs, c1Start, offsets);
7100 // compute a1 = c1 + a
7101 vs_addv(vs3, __ T4S, vs1, vs2);
7102 // compute a2 = c1 - a
7103 vs_subv(vs1, __ T4S, vs1, vs2);
7104 // output a1 and a2
7105 vs_stpq_indexed(vs3, coeffs, c1Start, offsets);
7106 vs_stpq_indexed(vs1, coeffs, c2Start, offsets);
7107
7108 int k = 4 * level + i;
7109
7110 if (k > 7) {
7111 startIncr = 256;
7112 } else if (k == 5) {
7113 startIncr = 384;
7114 } else {
7115 startIncr = 128;
7116 }
7117
7118 c1Start += startIncr;
7119 c2Start += startIncr;
7120 }
7121
7122 c2 /= 2;
7123 }
7124 }
7125
7126 // Dilithium NTT function except for the final "normalization" to |coeff| < Q.
7127 // Implements the method
7128 // static int implDilithiumAlmostNtt(int[] coeffs, int zetas[]) {}
7129 // of the Java class sun.security.provider
7130 //
7131 // coeffs (int[256]) = c_rarg0
7132 // zetas (int[256]) = c_rarg1
7133 address generate_dilithiumAlmostNtt() {
7134 StubId stub_id = StubId::stubgen_dilithiumAlmostNtt_id;
7135 int entry_count = StubInfo::entry_count(stub_id);
7136 assert(entry_count == 1, "sanity check");
7137 address start = load_archive_data(stub_id);
7138 if (start != nullptr) {
7139 return start;
7140 }
7141 __ align(CodeEntryAlignment);
7142 StubCodeMark mark(this, stub_id);
7143 start = __ pc();
7144 __ enter();
7145
7146 const Register coeffs = c_rarg0;
7147 const Register zetas = c_rarg1;
7148
7149 const Register tmpAddr = r9;
7150 const Register dilithiumConsts = r10;
7151 const Register result = r11;
7152 // don't use callee save registers v8 - v15
7153 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs
7154 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
7155 VSeq<2> vq(30); // n.b. constants overlap vs3
7156 int offsets[4] = { 0, 32, 64, 96};
7157 int offsets1[8] = { 16, 48, 80, 112, 144, 176, 208, 240 };
7158 int offsets2[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
7159 __ add(result, coeffs, 0);
7160 __ lea(dilithiumConsts,
7161 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
7162
7163 // Each level represents one iteration of the outer for loop of the Java version.
7164
7165 // level 0-4
7166 dilithiumNttLevel0_4(dilithiumConsts, coeffs, zetas);
7167
7168 // level 5
7169
7170 // At level 5 the coefficients we need to combine with the zetas
7171 // are grouped in memory in blocks of size 4. So, for both sets of
7172 // coefficients we load 4 adjacent values at 8 different offsets
7173 // using an indexed ldr with register variant Q and multiply them
7174 // in sequence order by the next set of inputs. Likewise we store
7175 // the results using an indexed str with register variant Q.
7176 for (int i = 0; i < 1024; i += 256) {
7177 // reload constants q, qinv each iteration as they get clobbered later
7178 vs_ldpq(vq, dilithiumConsts); // qInv, q
7179 // load 32 (8x4S) coefficients via first offsets = c1
7180 vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets1);
7181 // load next 32 (8x4S) inputs = b
7182 vs_ldpq_post(vs2, zetas);
7183 // a = b montul c1
7184 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
7185 // load 32 (8x4S) coefficients via second offsets = c2
7186 vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets2);
7187 // add/sub with result of multiply
7188 vs_addv(vs3, __ T4S, vs1, vs2); // a1 = a - c2
7189 vs_subv(vs1, __ T4S, vs1, vs2); // a0 = a + c1
7190 // write back new coefficients using same offsets
7191 vs_str_indexed(vs3, __ Q, coeffs, i, offsets2);
7192 vs_str_indexed(vs1, __ Q, coeffs, i, offsets1);
7193 }
7194
7195 // level 6
7196 // At level 6 the coefficients we need to combine with the zetas
7197 // are grouped in memory in pairs, the first two being montmul
7198 // inputs and the second add/sub inputs. We can still implement
7199 // the montmul+sub+add using 4-way parallelism but only if we
7200 // combine the coefficients with the zetas 16 at a time. We load 8
7201 // adjacent values at 4 different offsets using an ld2 load with
7202 // arrangement 2D. That interleaves the lower and upper halves of
7203 // each pair of quadwords into successive vector registers. We
7204 // then need to montmul the 4 even elements of the coefficients
7205 // register sequence by the zetas in order and then add/sub the 4
7206 // odd elements of the coefficients register sequence. We use an
7207 // equivalent st2 operation to store the results back into memory
7208 // de-interleaved.
7209 for (int i = 0; i < 1024; i += 128) {
7210 // reload constants q, qinv each iteration as they get clobbered later
7211 vs_ldpq(vq, dilithiumConsts); // qInv, q
7212 // load interleaved 16 (4x2D) coefficients via offsets
7213 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
7214 // load next 16 (4x4S) inputs
7215 vs_ldpq_post(vs_front(vs2), zetas);
7216 // mont multiply odd elements of vs1 by vs2 and add/sub into odds/evens
7217 dilithium_montmul16_sub_add(vs_even(vs1), vs_odd(vs1),
7218 vs_front(vs2), vtmp, vq);
7219 // store interleaved 16 (4x2D) coefficients via offsets
7220 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
7221 }
7222
7223 // level 7
7224 // At level 7 the coefficients we need to combine with the zetas
7225 // occur singly with montmul inputs alternating with add/sub
7226 // inputs. Once again we can use 4-way parallelism to combine 16
7227 // zetas at a time. However, we have to load 8 adjacent values at
7228 // 4 different offsets using an ld2 load with arrangement 4S. That
7229 // interleaves the odd words of each pair into one
7230 // coefficients vector register and the even words of the pair
7231 // into the next register. We then need to montmul the 4 even
7232 // elements of the coefficients register sequence by the zetas in
7233 // order and then add/sub the 4 odd elements of the coefficients
7234 // register sequence. We use an equivalent st2 operation to store
7235 // the results back into memory de-interleaved.
7236
7237 for (int i = 0; i < 1024; i += 128) {
7238 // reload constants q, qinv each iteration as they get clobbered later
7239 vs_ldpq(vq, dilithiumConsts); // qInv, q
7240 // load interleaved 16 (4x4S) coefficients via offsets
7241 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
7242 // load next 16 (4x4S) inputs
7243 vs_ldpq_post(vs_front(vs2), zetas);
7244 // mont multiply odd elements of vs1 by vs2 and add/sub into odds/evens
7245 dilithium_montmul16_sub_add(vs_even(vs1), vs_odd(vs1),
7246 vs_front(vs2), vtmp, vq);
7247 // store interleaved 16 (4x4S) coefficients via offsets
7248 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
7249 }
7250 __ leave(); // required for proper stackwalking of RuntimeStub frame
7251 __ mov(r0, zr); // return 0 (Java callees return 1. Caller ignores the return value)
7252 __ ret(lr);
7253
7254 // record the stub entry and end
7255 store_archive_data(stub_id, start, __ pc());
7256
7257 return start;
7258 }
7259
7260 // At these levels, the indices that correspond to the 'j's (and 'j+l's)
7261 // in the Java implementation come in sequences of at least 8, so we
7262 // can use ldpq to collect the corresponding data into pairs of vector
7263 // registers
7264 // We collect the coefficients that correspond to the 'j's into vs1
7265 // the coefficiets that correspond to the 'j+l's into vs2 then
7266 // do the additions into vs3 and the subtractions into vs1 then
7267 // save the result of the additions, load the zetas into vs2
7268 // do the (Montgomery) multiplications by zeta in parallel into vs2
7269 // finally save the results back to the coeffs array
7270 void dilithiumInverseNttLevel3_7(const Register dilithiumConsts,
7271 const Register coeffs, const Register zetas) {
7272 int c1 = 0;
7273 int c2 = 32;
7274 int startIncr;
7275 int offsets[4];
7276 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs
7277 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
7278 VSeq<2> vq(30); // n.b. constants overlap vs3
7279
7280 offsets[0] = 0;
7281
7282 for (int level = 3; level < 8; level++) {
7283 int c1Start = c1;
7284 int c2Start = c2;
7285 if (level == 3) {
7286 offsets[1] = 64;
7287 offsets[2] = 128;
7288 offsets[3] = 192;
7289 } else if (level == 4) {
7290 offsets[1] = 32;
7291 offsets[2] = 128;
7292 offsets[3] = 160;
7293 } else {
7294 offsets[1] = 32;
7295 offsets[2] = 64;
7296 offsets[3] = 96;
7297 }
7298
7299 // For levels 3 - 7 we simply load 2 x 4 adjacent values at a
7300 // time at 4 different offsets and multiply them in order by the
7301 // next set of input values. So we employ indexed load and store
7302 // pair instructions with arrangement 4S.
7303 for (int i = 0; i < 4; i++) {
7304 // load v1 32 (8x4S) coefficients relative to first start index
7305 vs_ldpq_indexed(vs1, coeffs, c1Start, offsets);
7306 // load v2 32 (8x4S) coefficients relative to second start index
7307 vs_ldpq_indexed(vs2, coeffs, c2Start, offsets);
7308 // a0 = v1 + v2 -- n.b. clobbers vqs
7309 vs_addv(vs3, __ T4S, vs1, vs2);
7310 // a1 = v1 - v2
7311 vs_subv(vs1, __ T4S, vs1, vs2);
7312 // save a1 relative to first start index
7313 vs_stpq_indexed(vs3, coeffs, c1Start, offsets);
7314 // load constants q, qinv each iteration as they get clobbered above
7315 vs_ldpq(vq, dilithiumConsts); // qInv, q
7316 // load b next 32 (8x4S) inputs
7317 vs_ldpq_post(vs2, zetas);
7318 // a = a1 montmul b
7319 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
7320 // save a relative to second start index
7321 vs_stpq_indexed(vs2, coeffs, c2Start, offsets);
7322
7323 int k = 4 * level + i;
7324
7325 if (k < 24) {
7326 startIncr = 256;
7327 } else if (k == 25) {
7328 startIncr = 384;
7329 } else {
7330 startIncr = 128;
7331 }
7332
7333 c1Start += startIncr;
7334 c2Start += startIncr;
7335 }
7336
7337 c2 *= 2;
7338 }
7339 }
7340
7341 // Dilithium Inverse NTT function except the final mod Q division by 2^256.
7342 // Implements the method
7343 // static int implDilithiumAlmostInverseNtt(int[] coeffs, int[] zetas) {} of
7344 // the sun.security.provider.ML_DSA class.
7345 //
7346 // coeffs (int[256]) = c_rarg0
7347 // zetas (int[256]) = c_rarg1
7348 address generate_dilithiumAlmostInverseNtt() {
7349 StubId stub_id = StubId::stubgen_dilithiumAlmostInverseNtt_id;
7350 int entry_count = StubInfo::entry_count(stub_id);
7351 assert(entry_count == 1, "sanity check");
7352 address start = load_archive_data(stub_id);
7353 if (start != nullptr) {
7354 return start;
7355 }
7356 __ align(CodeEntryAlignment);
7357 StubCodeMark mark(this, stub_id);
7358 start = __ pc();
7359 __ enter();
7360
7361 const Register coeffs = c_rarg0;
7362 const Register zetas = c_rarg1;
7363
7364 const Register tmpAddr = r9;
7365 const Register dilithiumConsts = r10;
7366 const Register result = r11;
7367 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs
7368 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
7369 VSeq<2> vq(30); // n.b. constants overlap vs3
7370 int offsets[4] = { 0, 32, 64, 96 };
7371 int offsets1[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
7372 int offsets2[8] = { 16, 48, 80, 112, 144, 176, 208, 240 };
7373
7374 __ add(result, coeffs, 0);
7375 __ lea(dilithiumConsts,
7376 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
7377
7378 // Each level represents one iteration of the outer for loop of the Java version
7379
7380 // level 0
7381 // At level 0 we need to interleave adjacent quartets of
7382 // coefficients before we multiply and add/sub by the next 16
7383 // zetas just as we did for level 7 in the multiply code. So we
7384 // load and store the values using an ld2/st2 with arrangement 4S.
7385 for (int i = 0; i < 1024; i += 128) {
7386 // load constants q, qinv
7387 // n.b. this can be moved out of the loop as they do not get
7388 // clobbered by first two loops
7389 vs_ldpq(vq, dilithiumConsts); // qInv, q
7390 // a0/a1 load interleaved 32 (8x4S) coefficients
7391 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
7392 // b load next 32 (8x4S) inputs
7393 vs_ldpq_post(vs_front(vs2), zetas);
7394 // compute in parallel (a0, a1) = (a0 + a1, (a0 - a1) montmul b)
7395 // n.b. second half of vs2 provides temporary register storage
7396 dilithium_sub_add_montmul16(vs_even(vs1), vs_odd(vs1),
7397 vs_front(vs2), vs_back(vs2), vtmp, vq);
7398 // a0/a1 store interleaved 32 (8x4S) coefficients
7399 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
7400 }
7401
7402 // level 1
7403 // At level 1 we need to interleave pairs of adjacent pairs of
7404 // coefficients before we multiply by the next 16 zetas just as we
7405 // did for level 6 in the multiply code. So we load and store the
7406 // values an ld2/st2 with arrangement 2D.
7407 for (int i = 0; i < 1024; i += 128) {
7408 // a0/a1 load interleaved 32 (8x2D) coefficients
7409 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
7410 // b load next 16 (4x4S) inputs
7411 vs_ldpq_post(vs_front(vs2), zetas);
7412 // compute in parallel (a0, a1) = (a0 + a1, (a0 - a1) montmul b)
7413 // n.b. second half of vs2 provides temporary register storage
7414 dilithium_sub_add_montmul16(vs_even(vs1), vs_odd(vs1),
7415 vs_front(vs2), vs_back(vs2), vtmp, vq);
7416 // a0/a1 store interleaved 32 (8x2D) coefficients
7417 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
7418 }
7419
7420 // level 2
7421 // At level 2 coefficients come in blocks of 4. So, we load 4
7422 // adjacent coefficients at 8 distinct offsets for both the first
7423 // and second coefficient sequences, using an ldr with register
7424 // variant Q then combine them with next set of 32 zetas. Likewise
7425 // we store the results using an str with register variant Q.
7426 for (int i = 0; i < 1024; i += 256) {
7427 // c0 load 32 (8x4S) coefficients via first offsets
7428 vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets1);
7429 // c1 load 32 (8x4S) coefficients via second offsets
7430 vs_ldr_indexed(vs2, __ Q, coeffs, i, offsets2);
7431 // a0 = c0 + c1 n.b. clobbers vq which overlaps vs3
7432 vs_addv(vs3, __ T4S, vs1, vs2);
7433 // c = c0 - c1
7434 vs_subv(vs1, __ T4S, vs1, vs2);
7435 // store a0 32 (8x4S) coefficients via first offsets
7436 vs_str_indexed(vs3, __ Q, coeffs, i, offsets1);
7437 // b load 32 (8x4S) next inputs
7438 vs_ldpq_post(vs2, zetas);
7439 // reload constants q, qinv -- they were clobbered earlier
7440 vs_ldpq(vq, dilithiumConsts); // qInv, q
7441 // compute a1 = b montmul c
7442 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
7443 // store a1 32 (8x4S) coefficients via second offsets
7444 vs_str_indexed(vs2, __ Q, coeffs, i, offsets2);
7445 }
7446
7447 // level 3-7
7448 dilithiumInverseNttLevel3_7(dilithiumConsts, coeffs, zetas);
7449
7450 __ leave(); // required for proper stackwalking of RuntimeStub frame
7451 __ mov(r0, zr); // return 0 (Java callees return 1. Caller ignores the return value)
7452 __ ret(lr);
7453
7454 // record the stub entry and end
7455 store_archive_data(stub_id, start, __ pc());
7456
7457 return start;
7458 }
7459
7460 // Dilithium multiply polynomials in the NTT domain.
7461 // Straightforward implementation of the method
7462 // static int implDilithiumNttMult(
7463 // int[] product, int[] coeffs1, int[] coeffs2) {}
7464 // of the sun.security.provider.ML_DSA class.
7465 //
7466 // result (int[256]) = c_rarg0
7467 // poly1 (int[256]) = c_rarg1
7468 // poly2 (int[256]) = c_rarg2
7469 address generate_dilithiumNttMult() {
7470 StubId stub_id = StubId::stubgen_dilithiumNttMult_id;
7471 int entry_count = StubInfo::entry_count(stub_id);
7472 assert(entry_count == 1, "sanity check");
7473 address start = load_archive_data(stub_id);
7474 if (start != nullptr) {
7475 return start;
7476 }
7477 __ align(CodeEntryAlignment);
7478 StubCodeMark mark(this, stub_id);
7479 start = __ pc();
7480 __ enter();
7481
7482 Label L_loop;
7483
7484 const Register result = c_rarg0;
7485 const Register poly1 = c_rarg1;
7486 const Register poly2 = c_rarg2;
7487
7488 const Register dilithiumConsts = r10;
7489 const Register len = r11;
7490
7491 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs
7492 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
7493 VSeq<2> vq(30); // n.b. constants overlap vs3
7494 VSeq<8> vrsquare(29, 0); // for montmul by constant RSQUARE
7495
7496 __ lea(dilithiumConsts,
7497 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
7498
7499 // load constants q, qinv
7500 vs_ldpq(vq, dilithiumConsts); // qInv, q
7501 // load constant rSquare into v29
7502 __ ldr(v29, __ Q, Address(dilithiumConsts, 48)); // rSquare
7503
7504 __ mov(len, zr);
7505 __ add(len, len, 1024);
7506
7507 __ BIND(L_loop);
7508
7509 // b load 32 (8x4S) next inputs from poly1
7510 vs_ldpq_post(vs1, poly1);
7511 // c load 32 (8x4S) next inputs from poly2
7512 vs_ldpq_post(vs2, poly2);
7513 // compute a = b montmul c
7514 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
7515 // compute a = rsquare montmul a
7516 dilithium_montmul32(vs2, vrsquare, vs2, vtmp, vq);
7517 // save a 32 (8x4S) results
7518 vs_stpq_post(vs2, result);
7519
7520 __ sub(len, len, 128);
7521 __ cmp(len, (u1)128);
7522 __ br(Assembler::GE, L_loop);
7523
7524 __ leave(); // required for proper stackwalking of RuntimeStub frame
7525 __ mov(r0, zr); // return 0 (Java callees return 1. Caller ignores the return value)
7526 __ ret(lr);
7527
7528 // record the stub entry and end
7529 store_archive_data(stub_id, start, __ pc());
7530
7531 return start;
7532 }
7533
7534 // Dilithium Montgomery multiply an array by a constant.
7535 // A straightforward implementation of the method
7536 // static int implDilithiumMontMulByConstant(int[] coeffs, int constant) {}
7537 // of the sun.security.provider.ML_DSA class
7538 //
7539 // coeffs (int[256]) = c_rarg0
7540 // constant (int) = c_rarg1
7541 address generate_dilithiumMontMulByConstant() {
7542 StubId stub_id = StubId::stubgen_dilithiumMontMulByConstant_id;
7543 int entry_count = StubInfo::entry_count(stub_id);
7544 assert(entry_count == 1, "sanity check");
7545 address start = load_archive_data(stub_id);
7546 if (start != nullptr) {
7547 return start;
7548 }
7549 __ align(CodeEntryAlignment);
7550 StubCodeMark mark(this, stub_id);
7551 start = __ pc();
7552 __ enter();
7553
7554 Label L_loop;
7555
7556 const Register coeffs = c_rarg0;
7557 const Register constant = c_rarg1;
7558
7559 const Register dilithiumConsts = r10;
7560 const Register result = r11;
7561 const Register len = r12;
7562
7563 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs
7564 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
7565 VSeq<2> vq(30); // n.b. constants overlap vs3
7566 VSeq<8> vconst(29, 0); // for montmul by constant
7567
7568 // results track inputs
7569 __ add(result, coeffs, 0);
7570 __ lea(dilithiumConsts,
7571 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
7572
7573 // load constants q, qinv -- they do not get clobbered by first two loops
7574 vs_ldpq(vq, dilithiumConsts); // qInv, q
7575 // copy caller supplied constant across vconst
7576 __ dup(vconst[0], __ T4S, constant);
7577 __ mov(len, zr);
7578 __ add(len, len, 1024);
7579
7580 __ BIND(L_loop);
7581
7582 // load next 32 inputs
7583 vs_ldpq_post(vs2, coeffs);
7584 // mont mul by constant
7585 dilithium_montmul32(vs2, vconst, vs2, vtmp, vq);
7586 // write next 32 results
7587 vs_stpq_post(vs2, result);
7588
7589 __ sub(len, len, 128);
7590 __ cmp(len, (u1)128);
7591 __ br(Assembler::GE, L_loop);
7592
7593 __ leave(); // required for proper stackwalking of RuntimeStub frame
7594 __ mov(r0, zr); // return 0 (Java callees return 1. Caller ignores the return value)
7595 __ ret(lr);
7596
7597 // record the stub entry and end
7598 store_archive_data(stub_id, start, __ pc());
7599
7600 return start;
7601 }
7602
7603 // Dilithium decompose poly.
7604 // Implements the method
7605 // static int implDilithiumDecomposePoly(int[] input, int[] lowPart, int[] highPart,
7606 // int twoGamma2, int multiplier) {
7607 // of the sun.security.provider.ML_DSA class
7608 //
7609 // input (int[256]) = c_rarg0
7610 // lowPart (int[256]) = c_rarg1
7611 // highPart (int[256]) = c_rarg2
7612 // twoGamma2 (int) = c_rarg3
7613 // multiplier (int) = c_rarg4
7614 address generate_dilithiumDecomposePoly() {
7615 StubId stub_id = StubId::stubgen_dilithiumDecomposePoly_id;
7616 int entry_count = StubInfo::entry_count(stub_id);
7617 assert(entry_count == 1, "sanity check");
7618 address start = load_archive_data(stub_id);
7619 if (start != nullptr) {
7620 return start;
7621 }
7622 __ align(CodeEntryAlignment);
7623 StubCodeMark mark(this, stub_id);
7624 start = __ pc();
7625 Label L_loop;
7626
7627 const Register input = c_rarg0;
7628 const Register lowPart = c_rarg1;
7629 const Register highPart = c_rarg2;
7630 const Register twoGamma2 = c_rarg3;
7631 const Register multiplier = c_rarg4;
7632
7633 const Register len = r9;
7634 const Register dilithiumConsts = r10;
7635 const Register tmp = r11;
7636
7637 // 6 independent sets of 4x4s values
7638 VSeq<4> vs1(0), vs2(4), vs3(8);
7639 VSeq<4> vs4(12), vs5(16), vtmp(20);
7640
7641 // 7 constants for cross-multiplying
7642 VSeq<4> one(25, 0);
7643 VSeq<4> qminus1(26, 0);
7644 VSeq<4> g2(27, 0);
7645 VSeq<4> twog2(28, 0);
7646 VSeq<4> mult(29, 0);
7647 VSeq<4> q(30, 0);
7648 VSeq<4> qadd(31, 0);
7649
7650 __ enter();
7651
7652 __ lea(dilithiumConsts,
7653 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
7654
7655 // save callee-saved registers
7656 __ stpd(v8, v9, __ pre(sp, -64));
7657 __ stpd(v10, v11, Address(sp, 16));
7658 __ stpd(v12, v13, Address(sp, 32));
7659 __ stpd(v14, v15, Address(sp, 48));
7660
7661 // populate constant registers
7662 __ mov(tmp, zr);
7663 __ add(tmp, tmp, 1);
7664 __ dup(one[0], __ T4S, tmp); // 1
7665 __ ldr(q[0], __ Q, Address(dilithiumConsts, 16)); // q
7666 __ ldr(qadd[0], __ Q, Address(dilithiumConsts, 64)); // addend for mod q reduce
7667 __ dup(twog2[0], __ T4S, twoGamma2); // 2 * gamma2
7668 __ dup(mult[0], __ T4S, multiplier); // multiplier for mod 2 * gamma reduce
7669 __ subv(qminus1[0], __ T4S, v30, v25); // q - 1
7670 __ sshr(g2[0], __ T4S, v28, 1); // gamma2
7671
7672 __ mov(len, zr);
7673 __ add(len, len, 1024);
7674
7675 __ BIND(L_loop);
7676
7677 // load next 4x4S inputs interleaved: rplus --> vs1
7678 __ ld4(vs1[0], vs1[1], vs1[2], vs1[3], __ T4S, __ post(input, 64));
7679
7680 // rplus = rplus - ((rplus + qadd) >> 23) * q
7681 vs_addv(vtmp, __ T4S, vs1, qadd);
7682 vs_sshr(vtmp, __ T4S, vtmp, 23);
7683 vs_mulv(vtmp, __ T4S, vtmp, q);
7684 vs_subv(vs1, __ T4S, vs1, vtmp);
7685
7686 // rplus = rplus + ((rplus >> 31) & dilithium_q);
7687 vs_sshr(vtmp, __ T4S, vs1, 31);
7688 vs_andr(vtmp, vtmp, q);
7689 vs_addv(vs1, __ T4S, vs1, vtmp);
7690
7691 // quotient --> vs2
7692 // int quotient = (rplus * multiplier) >> 22;
7693 vs_mulv(vtmp, __ T4S, vs1, mult);
7694 vs_sshr(vs2, __ T4S, vtmp, 22);
7695
7696 // r0 --> vs3
7697 // int r0 = rplus - quotient * twoGamma2;
7698 vs_mulv(vtmp, __ T4S, vs2, twog2);
7699 vs_subv(vs3, __ T4S, vs1, vtmp);
7700
7701 // mask --> vs4
7702 // int mask = (twoGamma2 - r0) >> 22;
7703 vs_subv(vtmp, __ T4S, twog2, vs3);
7704 vs_sshr(vs4, __ T4S, vtmp, 22);
7705
7706 // r0 -= (mask & twoGamma2);
7707 vs_andr(vtmp, vs4, twog2);
7708 vs_subv(vs3, __ T4S, vs3, vtmp);
7709
7710 // quotient += (mask & 1);
7711 vs_andr(vtmp, vs4, one);
7712 vs_addv(vs2, __ T4S, vs2, vtmp);
7713
7714 // mask = (twoGamma2 / 2 - r0) >> 31;
7715 vs_subv(vtmp, __ T4S, g2, vs3);
7716 vs_sshr(vs4, __ T4S, vtmp, 31);
7717
7718 // r0 -= (mask & twoGamma2);
7719 vs_andr(vtmp, vs4, twog2);
7720 vs_subv(vs3, __ T4S, vs3, vtmp);
7721
7722 // quotient += (mask & 1);
7723 vs_andr(vtmp, vs4, one);
7724 vs_addv(vs2, __ T4S, vs2, vtmp);
7725
7726 // r1 --> vs5
7727 // int r1 = rplus - r0 - (dilithium_q - 1);
7728 vs_subv(vtmp, __ T4S, vs1, vs3);
7729 vs_subv(vs5, __ T4S, vtmp, qminus1);
7730
7731 // r1 --> vs1 (overwriting rplus)
7732 // r1 = (r1 | (-r1)) >> 31; // 0 if rplus - r0 == (dilithium_q - 1), -1 otherwise
7733 vs_negr(vtmp, __ T4S, vs5);
7734 vs_orr(vtmp, vs5, vtmp);
7735 vs_sshr(vs1, __ T4S, vtmp, 31);
7736
7737 // r0 += ~r1;
7738 vs_notr(vtmp, vs1);
7739 vs_addv(vs3, __ T4S, vs3, vtmp);
7740
7741 // r1 = r1 & quotient;
7742 vs_andr(vs1, vs2, vs1);
7743
7744 // store results interleaved
7745 // lowPart[m] = r0;
7746 // highPart[m] = r1;
7747 __ st4(vs3[0], vs3[1], vs3[2], vs3[3], __ T4S, __ post(lowPart, 64));
7748 __ st4(vs1[0], vs1[1], vs1[2], vs1[3], __ T4S, __ post(highPart, 64));
7749
7750 __ sub(len, len, 64);
7751 __ cmp(len, (u1)64);
7752 __ br(Assembler::GE, L_loop);
7753
7754 // restore callee-saved vector registers
7755 __ ldpd(v14, v15, Address(sp, 48));
7756 __ ldpd(v12, v13, Address(sp, 32));
7757 __ ldpd(v10, v11, Address(sp, 16));
7758 __ ldpd(v8, v9, __ post(sp, 64));
7759
7760 __ leave(); // required for proper stackwalking of RuntimeStub frame
7761 __ mov(r0, zr); // return 0 (Java callees return 1. Caller ignores the return value)
7762 __ ret(lr);
7763
7764 // record the stub entry and end
7765 store_archive_data(stub_id, start, __ pc());
7766
7767 return start;
7768 }
7769
7770 static constexpr int montMulP256Shift1 = 12; // 64 - bits per limb
7771 static constexpr int montMulP256Shift2 = 52; // bits per limb
7772 // stack space needed for carry computation
7773 static constexpr int cDataSize = 6 * BytesPerLong;
7774 // stack space needed for data computed by the neon side
7775 static constexpr int mulDataSize = 16 * BytesPerLong;
7776
7777
7778 // Subroutine used by the 52 x 52 bit multiplication algorithm in
7779 // generate_intpoly_montgomeryMult_P256().
7780 // This function computes partial results of eight 52 x 52 bit multiplications,
7781 // where the multiplicands are stored as 64-bit values, specifically
7782 // (b_0, b_1, b_2, b_3) * (a_3, a_4). (The 4 calls to this function
7783 // together provide the results of these limb-multiplications.)
7784 // Calls to this function accept either the low 32 bits or high 20 bits
7785 // of each b_i packed into bs in ascending order. a_3 and a_4 are packed
7786 // into successive 64 bit elements of as. lane selects the low 32 or high
7787 // 20 bits of each a_j value. So four calls with the appropriate parameters
7788 // will produce the 64-bit low32 * low32, low32 * high20, high20 * low32,
7789 // high20 * high20 values in the output register sequences vs. The
7790 // 64-bit partial products are returned in vs in ascending order:
7791 // vs[0] = (b_0*a_3, b_1*a_3) . . . vs[3] = (b_2*a_4, b_3*a_4)
7792
7793 void neon_partial_mult_64(const VSeq<4>& vs, FloatRegister bs, FloatRegister as, int lane_lo) {
7794 __ umullv(vs[0], __ T2D, bs, __ T2S, as, __ S, lane_lo);
7795 __ umull2v(vs[1], __ T2D, bs, __ T4S, as, __ S, lane_lo);
7796 __ umullv(vs[2], __ T2D, bs, __ T2S, as, __ S, lane_lo + 2);
7797 __ umull2v(vs[3], __ T2D, bs, __ T4S, as, __ S, lane_lo + 2);
7798 }
7799
7800 // Subroutine used by the generate_intpoly_montgomeryMult_P256() function
7801 // to compute the result of a 52 x 52 bit multiplications where the
7802 // multiplicands, a and b are available as 64-bit values.
7803 // The result is going to two 64-bit registers lo (least significant 52 bits)
7804 // and hi (most significant 52 bits).
7805 void gpr_partial_mult_52(Register a, Register b, Register hi, Register lo,
7806 Register mask) {
7807 // compute 104-bit (40 + 64) full product
7808 __ umulh(hi, a, b);
7809 __ mul(lo, a, b);
7810 // combine 40 + 12 bits into hi result
7811 // on certain implementations of aarch64 (e.g. apple M1) replacing extr()
7812 // with the following equivalent instruction sequence the performance
7813 // improves slightly (despite it is two instructions longer and needs
7814 // an additional register)
7815 // __ lsl(hi, hi, montMulP256Shift1);
7816 // __ lsr(tmp, lo, montMulP256Shift2);
7817 // __ orr(hi, hi, tmp);
7818 __ extr(hi, hi, lo, montMulP256Shift2);
7819 // mask off 52 bits of lo result
7820 __ andr(lo, lo, mask);
7821 }
7822
7823 // This assembly follows the Java code in MontgomeryIntegerPolynomial256.mult()
7824 // quite closely. The main difference is that the computations done with the
7825 // last two limbs of `a` are done using Neon registers. This allows us to take
7826 // advantage of both the Neon registers and GPRs simultaneously.
7827 // It is also worth noting that since Neon does not support 64 bit
7828 // multiplication, we split each 64 bit value into lower and upper halves
7829 // and use the "schoolbook" multiplication algorithm.
7830 address generate_intpoly_montgomeryMult_P256() {
7831 assert(UseIntPolyIntrinsics, "what are we doing here?");
7832 StubId stub_id = StubId::stubgen_intpoly_montgomeryMult_P256_id;
7833 int entry_count = StubInfo::entry_count(stub_id);
7834 assert(entry_count == 1, "sanity check");
7835 address start = load_archive_data(stub_id);
7836 if (start != nullptr) {
7837 return start;
7838 }
7839 __ align(CodeEntryAlignment);
7840 StubCodeMark mark(this, stub_id);
7841 start = __ pc();
7842 __ enter();
7843
7844 // Registers that are used throughout entire routine
7845 const Register a = c_rarg0;
7846 const Register b = c_rarg1;
7847 const Register result = c_rarg2;
7848
7849 RegSet regs = RegSet::range(r0, r28) - rscratch1 - rscratch2
7850 - r16 - r17 - r18_tls - a - b - result;
7851
7852 auto common_regs = regs.begin();
7853 Register limb_mask = *common_regs++,
7854 c_ptr = *common_regs++,
7855 mod_0 = *common_regs++,
7856 mod_1 = *common_regs++,
7857 mod_3 = *common_regs++,
7858 mod_4 = *common_regs++,
7859 b_0 = *common_regs++,
7860 b_1 = *common_regs++,
7861 b_2 = *common_regs++,
7862 b_3 = *common_regs++,
7863 b_4 = *common_regs++;
7864
7865 FloatRegSet floatRegs = FloatRegSet::range(v0, v31)
7866 - FloatRegSet::range(v8, v15) // Caller saved vectors
7867 - FloatRegSet::range(v16, v31); // Manually-allocated vectors
7868
7869 auto common_vectors = floatRegs.begin();
7870 FloatRegister limb_mask_vec = *common_vectors++,
7871 b_lows = *common_vectors++,
7872 b_highs = *common_vectors++,
7873 a_vals = *common_vectors++;
7874
7875 // Push callee saved registers on to the stack
7876 RegSet callee_saved = RegSet::range(r19, r28);
7877 __ push(callee_saved, sp);
7878
7879 // Allocate space on the stack for carry values
7880 __ sub(sp, sp, cDataSize);
7881 __ mov(c_ptr, sp);
7882
7883 // Calculate (52-bit) limb masks for both gpr and vector registers
7884 __ mov(limb_mask, -UCONST64(1) >> montMulP256Shift1);
7885 __ dup(limb_mask_vec, __ T2D, limb_mask);
7886
7887 //Load input arrays and modulus
7888 Register a_ptr = *common_regs++, mod_ptr = *common_regs++;
7889 // skip 3 limbs so a_ptr addresses trailing pair {a3, a4}
7890 __ add(a_ptr, a, 3 * BytesPerLong);
7891 __ lea(mod_ptr, ExternalAddress((address)_modulus_P256));
7892 __ ldr(b_0, Address(b));
7893 __ ldr(b_1, Address(b, BytesPerLong));
7894 __ ldr(b_2, Address(b, 2 * BytesPerLong));
7895 __ ldr(b_3, Address(b, 3 * BytesPerLong));
7896 __ ldr(b_4, Address(b, 4 * BytesPerLong));
7897 __ ldr(mod_0, __ post(mod_ptr, BytesPerLong));
7898 __ ldr(mod_1, __ post(mod_ptr, BytesPerLong));
7899 __ ldr(mod_3, __ post(mod_ptr, BytesPerLong));
7900 __ ldr(mod_4, mod_ptr);
7901 __ ld1(a_vals, __ T2D, a_ptr);
7902 // use an interleaved load to group low 32 bits and high 20 bits
7903 // of 4 successive b values into two vector registers
7904 // n.b. these are the same inputs as the ones in b_0 ... b4
7905 __ ld2(b_lows, b_highs, __ T4S, b);
7906 common_regs = common_regs.remaining()
7907 + a_ptr + mod_ptr;
7908 a_ptr = mod_ptr = noreg;
7909
7910 //Regs used throughout the main "loop", which is partially unrolled here
7911 Register high = *common_regs++,
7912 low = *common_regs++,
7913 mul_ptr = *common_regs++,
7914 mod_high = *common_regs++,
7915 mod_low = *common_regs++,
7916 a_i = *common_regs++,
7917 c_i = *common_regs++,
7918 tmp = *common_regs++,
7919 n = *common_regs++;
7920
7921 // vector sequences used to compute and combine partial products of
7922 // b_i * a_j for i = {0,1,2,3} j = {3,4}
7923 VSeq<4> A(16);
7924 VSeq<4> B(20);
7925 VSeq<4> C(24);
7926 VSeq<4> D(28);
7927
7928
7929 // neon and gpr computations are interleaved to maximize parallelism
7930
7931 // allocate stack space for the neon results
7932 __ sub(sp, sp, mulDataSize);
7933 __ mov(mul_ptr, sp);
7934
7935 // cross-multiply low * low for limbs b0-b3 and a3-a4 in parallel
7936 neon_partial_mult_64(A, b_lows, a_vals, 0);
7937
7938 // Limb 0
7939 __ ldr(a_i, __ post(a, BytesPerLong));
7940 gpr_partial_mult_52(a_i, b_0, high, low, limb_mask);
7941 __ mov(n, low);
7942 // __ andr(n, low, limb_mask);
7943
7944 // cross-multiply high * low for limbs b0-b3 and a3-a4 in parallel
7945 neon_partial_mult_64(B, b_highs, a_vals, 0);
7946
7947 // Limb 0 modulus computation
7948 // n.b. modulus computation requires multiplying successive
7949 // limbs of the product by corresponding limbs of the p256
7950 // prime adding the result to the limb and folding this
7951 // partial result into a running 256-bit sum in c_i. Limbs
7952 // of c_i are stored via c_ptr once carries are included.
7953 // n.b. the mul + add is omitted for limb 2 since the
7954 // corresponding prime bits are zero.
7955 gpr_partial_mult_52(n, mod_0, mod_high, mod_low, limb_mask);
7956 __ add(low, low, mod_low);
7957 __ add(high, high, mod_high);
7958 __ lsr(c_i, low, montMulP256Shift2);
7959 __ add(c_i, c_i, high);
7960
7961 // cross-multiply low * high for limbs b0-b3 and a3-a4 in parallel
7962 neon_partial_mult_64(C, b_lows, a_vals, 1);
7963
7964 // Limb 1
7965 gpr_partial_mult_52(a_i, b_1, high, low, limb_mask);
7966
7967 // cross-multiply high * high for limbs b0-b3 and a3-a4 in parallel
7968 neon_partial_mult_64(D, b_highs, a_vals, 1);
7969
7970 gpr_partial_mult_52(n, mod_1, mod_high, mod_low, limb_mask);
7971 __ add(low, low, mod_low);
7972 __ add(high, high, mod_high);
7973 __ add(c_i, c_i, low);
7974 __ str(c_i, c_ptr);
7975 __ mov(c_i, high);
7976
7977 // combine neon 32-bit partial products, regrouping to produce
7978 // 8*52-bit low products in A and 8*52-bit high products in D
7979
7980 // add low*high/high*low intermediate products before regrouping
7981 vs_addv(B, __ T2D, B, C); // Store (B+C) in B
7982
7983 // Limb 2
7984 gpr_partial_mult_52(a_i, b_2, high, low, limb_mask);
7985 __ add(c_i, c_i, low);
7986 __ str(c_i, Address(c_ptr, 8));
7987 __ mov(c_i, high);
7988
7989 // shift high*high (40-bit) product up into 52-bits of output
7990 vs_shl(D, __ T2D, D, montMulP256Shift1);
7991
7992 // Limb 3
7993 gpr_partial_mult_52(a_i, b_3, high, low, limb_mask);
7994
7995 // shift high 32 (or 33) bits of intermediate products for addition to D
7996 vs_ushr(C, __ T2D, B, 32 - montMulP256Shift1); // Use C for ((B+C) >>> 20)
7997
7998 gpr_partial_mult_52(n, mod_3, mod_high, mod_low, limb_mask);
7999 __ add(low, low, mod_low);
8000 __ add(high, high, mod_high);
8001 __ add(c_i, c_i, low);
8002 __ str(c_i, Address(c_ptr, 2 * BytesPerLong));
8003 __ mov(c_i, high);
8004
8005 // shift low 32 bits of intermediate product up for masking and addition to A
8006 vs_shl(B, __ T2D, B, 32);
8007
8008 // Limb 4
8009 gpr_partial_mult_52(a_i, b_4, high, low, limb_mask);
8010
8011 // add high bits of intermediate product into D
8012 vs_addv(D, __ T2D, D, C);
8013
8014 gpr_partial_mult_52(n, mod_4, mod_high, mod_low, limb_mask);
8015 __ add(low, low, mod_low);
8016 __ add(high, high, mod_high);
8017 __ add(c_i, c_i, low);
8018 __ str(c_i, Address(c_ptr, 3 * BytesPerLong));
8019 __ str(high, Address(c_ptr, 4 * BytesPerLong));
8020
8021 // top 12 bits of 32*32 bit product in A need adding into high 52-bit output
8022 vs_ushr(C, __ T2D, A, 52); // C now holds (A >>> 52)
8023 // Only 20 of the 32 bits now in the top of B should be added into A
8024 vs_andr(B, B, limb_mask_vec);
8025 // reduce original 64-bit product to 52-bits
8026 vs_andr(A, A, limb_mask_vec);
8027 // add intermediate products to high 52-bit result in D
8028 vs_addv(D, __ T2D, D, C);
8029 // add 20/21 bits of intermediate product in top of B into low 52-bit result
8030 vs_addv(A, __ T2D, A, B);
8031 // save and then mask off any overflow bit from computing low 52-bit result
8032 vs_ushr(B, __ T2D, A, montMulP256Shift2);
8033 vs_andr(A, A, limb_mask_vec);
8034 // add any remaining carry into the high 52-bit result
8035 vs_addv(D, __ T2D, D, B);
8036
8037 // the write interleaves the 4 successive pairs of low and
8038 // high results: (l0, l1), (h0, h1), ... (l6, l7), (h6, h7)
8039 vs_st1_interleaved(A, D, mul_ptr);
8040
8041 // Free mul_ptr
8042 common_regs = common_regs.remaining() + mul_ptr;
8043 mul_ptr = noreg;
8044
8045 /////////////////////////
8046 // Loop 2 & 3
8047 /////////////////////////
8048
8049 for (int i = 0; i < 2; i++) {
8050 // Load a_i and increment by 8 bytes
8051 __ ldr(a_i, __ post(a, BytesPerLong));
8052 __ ldr(c_i, c_ptr); //Load prior c_i
8053
8054 // Limb 0
8055 gpr_partial_mult_52(a_i, b_0, high, low, limb_mask);
8056 __ add(low, low, c_i);
8057 __ ldr(c_i, Address(c_ptr, BytesPerLong));
8058 __ andr(n, low, limb_mask);
8059 gpr_partial_mult_52(n, mod_0, mod_high, mod_low, limb_mask);
8060 __ add(low, low, mod_low);
8061 __ add(high, high, mod_high);
8062 __ lsr(tmp, low, montMulP256Shift2);
8063 __ add(c_i, c_i, tmp);
8064 __ add(c_i, c_i, high);
8065
8066 // Limb 1
8067 gpr_partial_mult_52(a_i, b_1, high, low, limb_mask);
8068 gpr_partial_mult_52(n, mod_1, mod_high, mod_low, limb_mask);
8069 __ ldr(tmp, Address(c_ptr, 2 * BytesPerLong));
8070 __ add(low, low, mod_low);
8071 __ add(high, high, mod_high);
8072 __ add(c_i, c_i, low);
8073 __ str(c_i, c_ptr);
8074 __ add(c_i, tmp, high);
8075
8076 // Limb 2
8077 gpr_partial_mult_52(a_i, b_2, high, low, limb_mask);
8078 __ ldr(tmp, Address(c_ptr, 3 * BytesPerLong));
8079 __ add(c_i, c_i, low);
8080 __ str(c_i, Address(c_ptr, BytesPerLong));
8081 __ add(c_i, tmp, high);
8082
8083 // Limb 3
8084 gpr_partial_mult_52(a_i, b_3, high, low, limb_mask);
8085 gpr_partial_mult_52(n, mod_3, mod_high, mod_low, limb_mask);
8086 __ ldr(tmp, Address(c_ptr, 4 * BytesPerLong));
8087 __ add(low, low, mod_low);
8088 __ add(high, high, mod_high);
8089 __ add(c_i, c_i, low);
8090 __ str(c_i, Address(c_ptr, 2 * BytesPerLong));
8091 __ add(c_i, tmp, high);
8092
8093 // Limb 4
8094 gpr_partial_mult_52(a_i, b_4, high, low, limb_mask);
8095 gpr_partial_mult_52(n, mod_4, mod_high, mod_low, limb_mask);
8096 __ add(low, low, mod_low);
8097 __ add(high, high, mod_high);
8098 __ add(c_i, c_i, low);
8099 __ str(c_i, Address(c_ptr, 3 * BytesPerLong));
8100 __ str(high, Address(c_ptr, 4 * BytesPerLong));
8101 }
8102 // Reallocate regs b_0, b_1, b_2 and b_3
8103 common_regs = common_regs.remaining()
8104 + b_0 + b_1 + b_2 + b_3;
8105 b_0 = b_1 = b_2 = b_3 = noreg;
8106
8107 Register low_1 = *common_regs++;
8108 Register high_1 = *common_regs++;
8109
8110 //////////////////////////////
8111 // a[3]
8112 //////////////////////////////
8113
8114 // For a_3 and a_4 we have already computed the cross-products
8115 // with b_0 ... b_3 and stored them on the stack relative to
8116 // `mul_ptr` i.e. the current `sp`in the order
8117 // l(a_3 * b_0), l(a_3 * b_1), h(a_3 * b_0), h(a_3 * b_1),
8118 // l(a_3 * b_2), l(a_3 * b_3), h(a_3 * b_2), h(a_3 * b_3),
8119 // l(a_4 * b_0), l(a_4 * b_1), h(a_4 * b_0), h(a_4 * b_1),
8120 // l(a_4 * b_2), l(a_4 * b_3), h(a_4 * b_2), h(a_4 * b_3),
8121 // where l(x) is the low 52 bits of x and h(x) is the high 52 bits
8122
8123 __ ldr(low_1, Address(sp));
8124 __ ldr(high_1, Address(sp, 2 * BytesPerLong));
8125
8126 __ ldr(low, Address(sp, BytesPerLong));
8127 __ ldr(high, Address(sp, 3 * BytesPerLong));
8128 __ ldr(a_i, __ post(a, BytesPerLong));
8129 __ ldr(c_i, c_ptr);
8130
8131 // Limb 0
8132 __ add(low_1, low_1, c_i);
8133 __ ldr(c_i, Address(c_ptr, BytesPerLong));
8134 __ andr(n, low_1, limb_mask);
8135 gpr_partial_mult_52(n, mod_0, mod_high, mod_low, limb_mask);
8136 __ add(low_1, low_1, mod_low);
8137 __ add(high_1, high_1, mod_high);
8138 __ lsr(tmp, low_1, montMulP256Shift2);
8139 __ add(c_i, c_i, tmp);
8140 __ add(c_i, c_i, high_1);
8141
8142 // Limb 1
8143 __ ldr(low_1, Address(sp, 4 * BytesPerLong));
8144 __ ldr(high_1, Address(sp, 6 * BytesPerLong));
8145 gpr_partial_mult_52(n, mod_1, mod_high, mod_low, limb_mask);
8146 __ ldr(tmp, Address(c_ptr, 2 * BytesPerLong));
8147 __ andr(mod_low, mod_low, limb_mask);
8148 __ add(low, low, mod_low);
8149 __ add(high, high, mod_high);
8150 __ add(c_i, c_i, low);
8151 __ str(c_i, c_ptr);
8152 __ add(c_i, tmp, high);
8153
8154 // Limb 2
8155 __ ldr(low, Address(sp, 5 * BytesPerLong));
8156 __ ldr(high, Address(sp, 7 * BytesPerLong));
8157 __ ldr(tmp, Address(c_ptr, 3 * BytesPerLong));
8158 __ add(c_i, c_i, low_1);
8159 __ str(c_i, Address(c_ptr, BytesPerLong));
8160 __ add(c_i, tmp, high_1);
8161
8162 // Limb 3
8163 gpr_partial_mult_52(n, mod_3, mod_high, mod_low, limb_mask);
8164 __ ldr(tmp, Address(c_ptr, 4 * BytesPerLong));
8165 __ add(low, low, mod_low);
8166 __ add(high, high, mod_high);
8167 __ add(c_i, c_i, low);
8168 __ str(c_i, Address(c_ptr, 2 * BytesPerLong));
8169 __ add(c_i, tmp, high);
8170
8171 // Limb 4
8172 __ ldr(low, Address(sp, 8 * BytesPerLong));
8173 __ ldr(high, Address(sp, 10 * BytesPerLong));
8174 gpr_partial_mult_52(a_i, b_4, high_1, low_1, limb_mask);
8175 gpr_partial_mult_52(n, mod_4, mod_high, mod_low, limb_mask);
8176 __ add(low_1, low_1, mod_low);
8177 __ add(high_1, high_1, mod_high);
8178 __ add(c_i, c_i, low_1);
8179 __ str(c_i, Address(c_ptr, 3 * BytesPerLong));
8180 __ str(high_1, Address(c_ptr, 4 * BytesPerLong));
8181
8182 //////////////////////////////
8183 // a[4]
8184 //////////////////////////////
8185
8186 Register c5 = *common_regs++,
8187 c6 = *common_regs++,
8188 c7 = *common_regs++;
8189
8190 __ ldr(a_i, a);
8191 __ ldr(c_i, c_ptr);
8192
8193 // Limb 0
8194 __ ldr(low_1, Address(sp, 9 * BytesPerLong));
8195 __ ldr(high_1, Address(sp, 11 * BytesPerLong));
8196
8197 __ add(low, low, c_i);
8198 __ ldr(c_i, Address(c_ptr, BytesPerLong));
8199 __ andr(n, low, limb_mask);
8200 gpr_partial_mult_52(n, mod_0, mod_high, mod_low, limb_mask);
8201 __ add(low, low, mod_low);
8202 __ add(high, high, mod_high);
8203 __ lsr(tmp, low, montMulP256Shift2);
8204 __ add(c_i, c_i, tmp);
8205 __ add(c_i, c_i, high);
8206
8207 __ ldr(low, Address(sp, 12 * BytesPerLong));
8208 __ ldr(high, Address(sp, 14 * BytesPerLong));
8209 gpr_partial_mult_52(n, mod_1, mod_high, mod_low, limb_mask);
8210 __ add(low_1, low_1, mod_low);
8211 __ add(high_1, high_1, mod_high);
8212 __ add(c5, c_i, low_1);
8213 __ ldr(c_i, Address(c_ptr, 2 * BytesPerLong));
8214 __ lsr(tmp, c5, montMulP256Shift2);
8215 __ add(c_i, c_i, tmp);
8216 __ add(c_i, c_i, high_1);
8217
8218 // Limb 2
8219 __ ldr(low_1, Address(sp, 13 * BytesPerLong));
8220 __ ldr(high_1, Address(sp, 15 * BytesPerLong));
8221 __ add(c6, c_i, low);
8222 __ ldr(c_i, Address(c_ptr, 3 * BytesPerLong));
8223 __ lsr(tmp, c6, montMulP256Shift2);
8224 __ add(c_i, c_i, tmp);
8225 __ add(c_i, c_i, high);
8226
8227 // Limb 3
8228 gpr_partial_mult_52(n, mod_3, mod_high, mod_low, limb_mask);
8229 __ add(low_1, low_1, mod_low);
8230 __ add(high_1, high_1, mod_high);
8231 __ add(c7, c_i, low_1);
8232 __ ldr(c_i, Address(c_ptr, 4 * BytesPerLong));
8233 __ lsr(tmp, c7, montMulP256Shift2);
8234 __ add(c_i, c_i, tmp);
8235 __ add(c_i, c_i, high_1);
8236
8237 // Limb 4
8238 gpr_partial_mult_52(a_i, b_4, high, low, limb_mask);
8239 gpr_partial_mult_52(n, mod_4, mod_high, mod_low, limb_mask);
8240 __ add(low, low, mod_low);
8241 __ add(high, high, mod_high);
8242
8243 // Reallocate b_4
8244 common_regs = common_regs.remaining() + b_4;
8245 b_4 = noreg;
8246
8247 Register c8 = *common_regs++,
8248 c9 = *common_regs++;
8249
8250 __ add(c8, c_i, low);
8251 __ lsr(c9, c8, montMulP256Shift2);
8252 __ add(c9, c9, high);
8253
8254 __ andr(c5, c5, limb_mask);
8255 __ andr(c6, c6, limb_mask);
8256 __ andr(c7, c7, limb_mask);
8257 __ andr(c8, c8, limb_mask);
8258
8259 /////////////////////////////
8260 // Final carry propagate
8261 /////////////////////////////
8262
8263 // c0 = c5 - modulus[0];
8264 // c1 = c6 - modulus[1] + (c0 >> BITS_PER_LIMB);
8265 // c0 &= LIMB_MASK;
8266 // c2 = c7 + (c1 >> BITS_PER_LIMB);
8267 // c1 &= LIMB_MASK;
8268 // c3 = c8 - modulus[3] + (c2 >> BITS_PER_LIMB);
8269 // c2 &= LIMB_MASK;
8270 // c4 = c9 - modulus4] + (c3 >> BITS_PER_LIMB);
8271 // c3 &= LIMB_MASK;
8272
8273 // Free up all unused regs
8274 common_regs = common_regs.remaining()
8275 + c_ptr + low + high + mod_high
8276 + mod_low + a_i + c_i + n + low_1 + high_1;
8277 c_ptr = low = high = mod_high
8278 = mod_low = a_i = c_i = n = low_1 = high_1 = noreg;
8279
8280 Register c0 = *common_regs++,
8281 c1 = *common_regs++,
8282 c2 = *common_regs++,
8283 c3 = *common_regs++,
8284 c4 = *common_regs++;
8285
8286 __ sub(c0, c5, mod_0);
8287 __ sub(c1, c6, mod_1);
8288 __ sub(c3, c8, mod_3);
8289 __ sub(c4, c9, mod_4);
8290 __ add(c1, c1, c0, Assembler::ASR, montMulP256Shift2);
8291 __ andr(c0, c0, limb_mask);
8292 __ add(c2, c7, c1, Assembler::ASR, montMulP256Shift2);
8293 __ andr(c1, c1, limb_mask);
8294 __ add(c3, c3, c2, Assembler::ASR, montMulP256Shift2);
8295 __ andr(c2, c2, limb_mask);
8296 __ add(c4, c4, c3, Assembler::ASR, montMulP256Shift2);
8297 __ andr(c3, c3, limb_mask);
8298
8299 // Final write back
8300 // mask = c4 >> 63
8301 // r[0] = ((c5 & mask) | (c0 & ~mask));
8302 // r[1] = ((c6 & mask) | (c1 & ~mask));
8303 // r[2] = ((c7 & mask) | (c2 & ~mask));
8304 // r[3] = ((c8 & mask) | (c3 & ~mask));
8305 // r[4] = ((c9 & mask) | (c4 & ~mask));
8306
8307 common_regs = common_regs.remaining()
8308 + mod_0 + mod_1 + mod_3 + mod_4;
8309 mod_0 = mod_1 = mod_3 = mod_4 = noreg;
8310
8311 Register mask = *common_regs++;
8312 Register nmask = *common_regs++;
8313
8314 __ asr(mask, c4, 63);
8315 __ mvn(nmask, mask);
8316 __ andr(c5, c5, mask);
8317 __ andr(tmp, c0, nmask);
8318 __ orr(c5, c5, tmp);
8319 __ andr(c6, c6, mask);
8320 __ andr(tmp, c1, nmask);
8321 __ orr(c6, c6, tmp);
8322 __ andr(c7, c7, mask);
8323 __ andr(tmp, c2, nmask);
8324 __ orr(c7, c7, tmp);
8325 __ andr(c8, c8, mask);
8326 __ andr(tmp, c3, nmask);
8327 __ orr(c8, c8, tmp);
8328 __ andr(c9, c9, mask);
8329 __ andr(tmp, c4, nmask);
8330 __ orr(c9, c9, tmp);
8331
8332 __ str(c5, result);
8333 __ str(c6, Address(result, BytesPerLong));
8334 __ str(c7, Address(result, 2 * BytesPerLong));
8335 __ str(c8, Address(result, 3 * BytesPerLong));
8336 __ str(c9, Address(result, 4 * BytesPerLong));
8337
8338 // End intrinsic call
8339 __ add(sp, sp, cDataSize + mulDataSize);
8340 __ pop(callee_saved, sp);
8341 __ leave();
8342 __ mov(r0, zr); // return 0
8343 __ ret(lr);
8344
8345 // record the stub entry and end
8346 store_archive_data(stub_id, start, __ pc());
8347
8348 return start;
8349 }
8350
8351 address generate_intpoly_assign() {
8352 // KNOWN Lengths:
8353 // MontgomeryIntPolynP256: 5 = 4 + 1
8354 // IntegerPolynomial1305: 5 = 4 + 1
8355 // IntegerPolynomial25519: 10 = 8 + 2
8356 // IntegerPolynomialP256: 10 = 8 + 2
8357 // Curve25519OrderField: 10 = 8 + 2
8358 // Curve25519OrderField: 10 = 8 + 2
8359 // P256OrderField: 10 = 8 + 2
8360 // IntegerPolynomialP384: 14 = 8 + 4 + 2
8361 // P384OrderField: 14 = 8 + 4 + 2
8362 // IntegerPolynomial448: 16 = 8 + 8
8363 // Curve448OrderField: 16 = 8 + 8
8364 // Curve448OrderField: 16 = 8 + 8
8365 // IntegerPolynomialP521: 19 = 8 + 8 + 2 + 1
8366 // P521OrderField: 19 = 8 + 8 + 2 + 1
8367 // Special Cases 5, 10, 14, 16, 19
8368 assert(UseIntPolyIntrinsics, "what are we doing here?");
8369 StubId stub_id = StubId::stubgen_intpoly_assign_id;
8370 int entry_count = StubInfo::entry_count(stub_id);
8371 assert(entry_count == 1, "sanity check");
8372 address start = load_archive_data(stub_id);
8373 if (start != nullptr) {
8374 return start;
8375 }
8376
8377 __ align(CodeEntryAlignment);
8378 StubCodeMark mark(this, stub_id);
8379 start = __ pc();
8380 __ enter();
8381
8382 // Inputs
8383 const Register set = c_rarg0;
8384 const Register aLimbs = c_rarg1;
8385 const Register bLimbs = c_rarg2;
8386 const Register length = c_rarg3;
8387
8388 Label L_Length5, L_Length10, L_Length14, L_Length16, L_Length19, L_Default, L_Done;
8389
8390 /*
8391 int maskValue = -set;
8392 for (int i = 0; i < a.length; i++) {
8393 long dummyLimbs = maskValue & (a[i] ^ b[i]);
8394 a[i] = dummyLimbs ^ a[i];
8395 }
8396 */
8397 Register mask_scalar = r4;
8398 FloatRegister mask_vec = v0;
8399
8400 __ neg(mask_scalar, set);
8401 __ dup(mask_vec, __ T2D, mask_scalar);
8402
8403 __ cmp(length, (u1)5);
8404 __ br(Assembler::EQ, L_Length5);
8405 __ cmp(length, (u1)10);
8406 __ br(Assembler::EQ, L_Length10);
8407 __ cmp(length, (u1)14);
8408 __ br(Assembler::EQ, L_Length14);
8409 __ cmp(length, (u1)16);
8410 __ br(Assembler::EQ, L_Length16);
8411 __ cmp(length, (u1)19);
8412 __ br(Assembler::EQ, L_Length19);
8413 __ b(L_Default);
8414
8415
8416 // Length = 5
8417 // Use 5 GPRs (neon not faster with this few limbs)
8418 __ BIND(L_Length5);
8419 {
8420 Register a0 = r5;
8421 Register a1 = r6;
8422 Register a2 = r7;
8423 Register a3 = r10;
8424 Register a4 = r11;
8425 Register b0 = r12;
8426 Register b1 = r13;
8427 Register b2 = r14;
8428 Register b3 = r15;
8429 Register b4 = r19;
8430
8431 __ push(r19, sp);
8432
8433 __ ldr(a0, aLimbs);
8434 __ ldr(a1, Address(aLimbs, 1 * BytesPerLong));
8435 __ ldr(a2, Address(aLimbs, 2 * BytesPerLong));
8436 __ ldr(a3, Address(aLimbs, 3 * BytesPerLong));
8437 __ ldr(a4, Address(aLimbs, 4 * BytesPerLong));
8438
8439 __ ldr(b0, bLimbs);
8440 __ ldr(b1, Address(bLimbs, 1 * BytesPerLong));
8441 __ ldr(b2, Address(bLimbs, 2 * BytesPerLong));
8442 __ ldr(b3, Address(bLimbs, 3 * BytesPerLong));
8443 __ ldr(b4, Address(bLimbs, 4 * BytesPerLong));
8444
8445 __ eor(b0, b0, a0);
8446 __ eor(b1, b1, a1);
8447 __ eor(b2, b2, a2);
8448 __ eor(b3, b3, a3);
8449 __ eor(b4, b4, a4);
8450
8451 __ andr(b0, b0, mask_scalar);
8452 __ andr(b1, b1, mask_scalar);
8453 __ andr(b2, b2, mask_scalar);
8454 __ andr(b3, b3, mask_scalar);
8455 __ andr(b4, b4, mask_scalar);
8456
8457 __ eor(a0, a0, b0);
8458 __ eor(a1, a1, b1);
8459 __ eor(a2, a2, b2);
8460 __ eor(a3, a3, b3);
8461 __ eor(a4, a4, b4);
8462
8463 __ str(a0, aLimbs);
8464 __ str(a1, Address(aLimbs, 1 * BytesPerLong));
8465 __ str(a2, Address(aLimbs, 2 * BytesPerLong));
8466 __ str(a3, Address(aLimbs, 3 * BytesPerLong));
8467 __ str(a4, Address(aLimbs, 4 * BytesPerLong));
8468
8469 __ pop(r19, sp);
8470 __ b(L_Done);
8471 }
8472
8473 // Length = 10
8474 // Split into 4 neon regs and 2 GPRs
8475 __ BIND(L_Length10);
8476 {
8477 Register a9 = r10;
8478 Register a10 = r11;
8479 Register b9 = r12;
8480 Register b10 = r13;
8481
8482 VSeq<4> a_vec(16);
8483 VSeq<4> b_vec(20);
8484
8485 __ ldr(a9, Address(aLimbs, 8 * BytesPerLong));
8486 __ ldr(a10, Address(aLimbs, 9 * BytesPerLong));
8487 __ ldr(b9, Address(bLimbs, 8 * BytesPerLong));
8488 __ ldr(b10, Address(bLimbs, 9 * BytesPerLong));
8489
8490 vs_ldpq(a_vec, aLimbs);
8491
8492 __ eor(b9, b9, a9);
8493 __ eor(b10, b10, a10);
8494
8495 vs_ldpq(b_vec, bLimbs);
8496
8497 __ andr(b9, b9, mask_scalar);
8498 __ andr(b10, b10, mask_scalar);
8499
8500 vs_eor(b_vec, b_vec, a_vec);
8501
8502 __ eor(a9, a9, b9);
8503 __ eor(a10, a10, b10);
8504
8505 vs_andr(b_vec, b_vec, mask_vec);
8506
8507 __ str(a9, Address(aLimbs, 8 * BytesPerLong));
8508 __ str(a10, Address(aLimbs, 9 * BytesPerLong));
8509
8510 vs_eor(a_vec, a_vec, b_vec);
8511 vs_stpq_post(a_vec, aLimbs);
8512
8513 __ b(L_Done);
8514 }
8515
8516 // Length = 14
8517 // Split into 5 neon regs and 4 GPRs
8518 __ BIND(L_Length14);
8519 {
8520 Register a10 = r5;
8521 Register a11 = r6;
8522 Register a12 = r7;
8523 Register a13 = r8;
8524 Register b10 = r9;
8525 Register b11 = r10;
8526 Register b12 = r11;
8527 Register b13 = r12;
8528
8529 VSeq<5> a_vec(16);
8530 VSeq<5> b_vec(22);
8531
8532 int offsets[2] = { 0, 32 };
8533
8534 __ ldr(a10, Address(aLimbs, 10 * BytesPerLong));
8535 __ ldr(a11, Address(aLimbs, 11 * BytesPerLong));
8536 __ ldr(a12, Address(aLimbs, 12 * BytesPerLong));
8537 __ ldr(a13, Address(aLimbs, 13 * BytesPerLong));
8538
8539 __ ldr(b10, Address(bLimbs, 10 * BytesPerLong));
8540 __ ldr(b11, Address(bLimbs, 11 * BytesPerLong));
8541 __ ldr(b12, Address(bLimbs, 12 * BytesPerLong));
8542 __ ldr(b13, Address(bLimbs, 13 * BytesPerLong));
8543
8544 __ ld1(a_vec[0], __ T2D, aLimbs);
8545 vs_ldpq_indexed(vs_tail(a_vec), aLimbs, 16, offsets);
8546
8547 __ eor(b10, b10, a10);
8548 __ eor(b11, b11, a11);
8549 __ eor(b12, b12, a12);
8550 __ eor(b13, b13, a13);
8551
8552 __ ld1(b_vec[0], __ T2D, bLimbs);
8553 vs_ldpq_indexed(vs_tail(b_vec), bLimbs, 16, offsets);
8554
8555 __ andr(b10, b10, mask_scalar);
8556 __ andr(b11, b11, mask_scalar);
8557 __ andr(b12, b12, mask_scalar);
8558 __ andr(b13, b13, mask_scalar);
8559
8560 vs_eor(b_vec, b_vec, a_vec);
8561
8562 __ eor(a10, a10, b10);
8563 __ eor(a11, a11, b11);
8564 __ eor(a12, a12, b12);
8565 __ eor(a13, a13, b13);
8566
8567 vs_andr(b_vec, b_vec, mask_vec);
8568
8569 __ str(a10, Address(aLimbs, 10 * BytesPerLong));
8570 __ str(a11, Address(aLimbs, 11 * BytesPerLong));
8571 __ str(a12, Address(aLimbs, 12 * BytesPerLong));
8572 __ str(a13, Address(aLimbs, 13 * BytesPerLong));
8573
8574 vs_eor(a_vec, a_vec, b_vec);
8575
8576 __ st1(a_vec[0], __ T2D, aLimbs);
8577 vs_stpq_indexed(vs_tail(a_vec), aLimbs, 16, offsets);
8578
8579 __ b(L_Done);
8580 }
8581
8582 // Length = 16
8583 // Use 8 neon regs
8584 __ BIND(L_Length16);
8585 {
8586 VSeq<8> a_vec(16);
8587 VSeq<8> b_vec(24);
8588
8589 vs_ldpq(a_vec, aLimbs);
8590 vs_ldpq(b_vec, bLimbs);
8591 vs_eor(b_vec, b_vec, a_vec);
8592 vs_andr(b_vec, b_vec, mask_vec);
8593 vs_eor(a_vec, a_vec, b_vec);
8594 vs_stpq_post(a_vec, aLimbs);
8595
8596 __ b(L_Done);
8597 }
8598
8599 // Length = 19
8600 // Split into 8 neon regs and 3 GPRs
8601 __ BIND(L_Length19);
8602 {
8603 Register a17 = r10;
8604 Register a18 = r11;
8605 Register a19 = r12;
8606 Register b17 = r13;
8607 Register b18 = r14;
8608 Register b19 = r15;
8609
8610 VSeq<8> a_vec(16);
8611 VSeq<8> b_vec(24);
8612
8613 __ ldr(a17, Address(aLimbs, 16 * BytesPerLong));
8614 __ ldr(a18, Address(aLimbs, 17 * BytesPerLong));
8615 __ ldr(a19, Address(aLimbs, 18 * BytesPerLong));
8616 __ ldr(b17, Address(bLimbs, 16 * BytesPerLong));
8617 __ ldr(b18, Address(bLimbs, 17 * BytesPerLong));
8618 __ ldr(b19, Address(bLimbs, 18 * BytesPerLong));
8619
8620 vs_ldpq(a_vec, aLimbs);
8621
8622 __ eor(b17, b17, a17);
8623 __ eor(b18, b18, a18);
8624 __ eor(b19, b19, a19);
8625
8626 vs_ldpq(b_vec, bLimbs);
8627
8628 __ andr(b17, b17, mask_scalar);
8629 __ andr(b18, b18, mask_scalar);
8630 __ andr(b19, b19, mask_scalar);
8631
8632 vs_eor(b_vec, b_vec, a_vec);
8633
8634 __ eor(a17, a17, b17);
8635 __ eor(a18, a18, b18);
8636 __ eor(a19, a19, b19);
8637
8638 vs_andr(b_vec, b_vec, mask_vec);
8639
8640 __ str(a17, Address(aLimbs, 16 * BytesPerLong));
8641 __ str(a18, Address(aLimbs, 17 * BytesPerLong));
8642 __ str(a19, Address(aLimbs, 18 * BytesPerLong));
8643
8644 vs_eor(a_vec, a_vec, b_vec);
8645 vs_stpq_post(a_vec, aLimbs);
8646
8647 __ b(L_Done);
8648 }
8649
8650 __ BIND(L_Default);
8651 {
8652 Register ctr = r5;
8653 Register a_val = r6;
8654 Register b_val = r7;
8655
8656 __ mov(ctr, length); // length (the number of limbs) is never 0
8657
8658 Label default_loop;
8659 __ BIND(default_loop);
8660
8661 __ ldr(a_val, aLimbs);
8662 __ ldr(b_val, __ post(bLimbs, 8));
8663 __ eor(b_val, b_val, a_val);
8664 __ andr(b_val, b_val, mask_scalar);
8665 __ eor(a_val, a_val, b_val);
8666 __ str(a_val, __ post(aLimbs, 8));
8667 __ sub(ctr, ctr, 1);
8668 __ cmp(ctr, (u1)0);
8669 __ br(Assembler::NE, default_loop);
8670 }
8671
8672 __ BIND(L_Done);
8673 __ leave(); // required for proper stackwalking of RuntimeStub frame
8674 __ mov(r0, zr); // return 0
8675 __ ret(lr);
8676
8677 // record the stub entry and end
8678 store_archive_data(stub_id, start, __ pc());
8679
8680 return start;
8681 }
8682
8683 /**
8684 * Arithmetic polynomial multiplication in Curve25519. The algorithm mimics
8685 * the version in the IntegerPolynomial25519 class, including the use of all
8686 * columns (no folding method).
8687 *
8688 * Arguments:
8689 *
8690 * Inputs:
8691 * c_rarg0 - long[] aLimbs
8692 * c_rarg1 - long[] bLimbs
8693 *
8694 * Output:
8695 * c_rarg2 - long[] rLimbs result
8696 */
8697 address generate_intpoly_mult_25519() {
8698 StubId stub_id = StubId::stubgen_intpoly_mult_25519_id;
8699 int entry_count = StubInfo::entry_count(stub_id);
8700 assert(entry_count == 1, "sanity check");
8701 address start = load_archive_data(stub_id);
8702 if (start != nullptr) {
8703 return start;
8704 }
8705 __ align(CodeEntryAlignment);
8706 StubCodeMark mark(this, stub_id);
8707 start = __ pc();
8708 __ enter();
8709
8710 // Register Map
8711 const Register aLimbs = c_rarg0; // r0
8712 const Register bLimbs = c_rarg1; // r1
8713 const Register rLimbs = c_rarg2; // r2
8714
8715 Register c[] = {r3, r4, r5, r6, r7, r8, r9, r10, r11, r12};
8716 Register a = r13;
8717 Register b = r14;
8718 Register term = r15;
8719 Register low = r16;
8720 Register high = r17;
8721
8722 const int32_t limbs = 5;
8723 const int32_t bpl = 51;
8724 const int32_t rem = 64 - bpl;
8725 const int32_t TERM = 19;
8726 const int32_t columns = limbs * 2;
8727 const uint64_t mask = (uint64_t) -1 >> rem;
8728 const uint64_t CARRY_ADD = (uint64_t) 1 << (bpl - 1);
8729
8730 __ mov(term, TERM);
8731 for (int i = 0; i < columns; i++) {
8732 __ mov(c[i], zr);
8733 }
8734
8735 // Perform high/low multiplication with signed 5x51 bit limbs
8736 for (int i = 0; i < limbs; i++) {
8737 __ ldr(b, Address(bLimbs, i * 8));
8738 for (int j = 0; j < limbs; j++) {
8739 __ ldr(a, Address(aLimbs, j * 8));
8740 __ smulh(high, a, b);
8741 __ mul(low, a, b);
8742 __ extr(high, high, low, bpl);
8743 __ andr(low, low, mask);
8744 __ add(c[i + j], c[i + j], low);
8745 __ add(c[i + j + 1], c[i + j + 1], high);
8746 }
8747 }
8748
8749 for (int i = 0; i < limbs; i++) {
8750 __ mul(c[i + 5], c[i + 5], term);
8751 __ add(c[i], c[i], c[i + 5]);
8752 }
8753
8754 // Carry-add with reduction from high limb
8755 Register tmp = low;
8756 Register carry_add = high;
8757 __ mov(carry_add, CARRY_ADD);
8758
8759 // Limb 3
8760 __ add(tmp, c[3], carry_add);
8761 __ asr(tmp, tmp, bpl);
8762 __ add(c[4], c[4], tmp);
8763 __ lsl(tmp, tmp, bpl);
8764 __ sub(c[3], c[3], tmp);
8765
8766 // Limb 4
8767 __ add(tmp, c[4], carry_add);
8768 __ asr(tmp, tmp, bpl);
8769
8770 // Reduce high order limb and fold back into low order limb
8771 __ mul(term, tmp, term);
8772 __ add(c[0], c[0], term);
8773
8774 __ lsl(tmp, tmp, bpl);
8775 __ sub(c[4], c[4], tmp);
8776
8777 // Limbs 0 - 3
8778 for (int i = 0; i < (limbs - 1); i++) {
8779 __ add(tmp, c[i], carry_add);
8780 __ asr(tmp, tmp, bpl);
8781 __ add(c[i + 1], c[i + 1], tmp);
8782 __ lsl(tmp, tmp, bpl);
8783 __ sub(c[i], c[i], tmp);
8784 }
8785
8786 for (int i = 0; i < limbs; i++) {
8787 __ str(c[i], Address(rLimbs, i * 8));
8788 }
8789
8790 __ mov(r0, 0);
8791 __ leave(); // required for proper stackwalking of RuntimeStub frame
8792 __ ret(lr);
8793
8794 // record the stub entry and end
8795 store_archive_data(stub_id, start, __ pc());
8796
8797 return start;
8798 }
8799
// Emits an in-place update of five general-purpose registers in which every
// register is XORed with a bit-clear of two of the others. Reading
// bic(d, n, m) as d = n & ~m, the result is, for i in 0..4 (indices mod 5):
//
//   a[i] = a[i] ^ (a[i+2] & ~a[i+1])
//
// where every right-hand side uses the values the registers held on entry.
// That is why all bic results that read a register are produced before that
// register is overwritten, and why the statement order below must be kept.
//
// a0..a4          - the five registers updated in place
// tmp0, tmp1      - hold the pending terms for a0 and a1 until the end
// tmp2            - reused for the terms of a2, a3 and a4 in turn
void bcax5(Register a0, Register a1, Register a2, Register a3, Register a4,
           Register tmp0, Register tmp1, Register tmp2) {
  __ bic(tmp0, a2, a1); // for a0
  __ bic(tmp1, a3, a2); // for a1
  __ bic(tmp2, a4, a3); // for a2
  __ eor(a2, a2, tmp2); // a2 is final; its old value was already consumed above
  __ bic(tmp2, a0, a4); // for a3
  __ eor(a3, a3, tmp2); // a3 is final; its old value was already consumed above
  __ bic(tmp2, a1, a0); // for a4
  __ eor(a0, a0, tmp0); // a0 and a1 may only change now that all bics have read them
  __ eor(a1, a1, tmp1);
  __ eor(a4, a4, tmp2);
}
8813
// Emits one round over a 25-lane state held entirely in general-purpose
// registers a0..a24. The round consists of four parts, emitted in order:
//
//   1. Column parities c0..c4 (XOR of the five lanes a[x], a[x+5], ...,
//      a[x+20]) are combined pairwise into d0..d4, and each d value is
//      XORed into the five lanes of its column. The c/d names follow the
//      comments on the individual instructions below.
//   2. A chain of rotate-and-move instructions permutes 24 of the lanes
//      (a0 is not touched), using tmp0 to hold the first rotated value
//      until its destination register is free.
//   3. bcax5 is applied to each group of five consecutive lanes.
//   4. An 8-byte value is loaded from the address in rc, rc is advanced by
//      8, and the value is XORed into a0.
//
// can_use_fp, can_use_r18 - when both are true, rfp and r18_tls serve as two
//                 extra temporaries in part 1. Otherwise a4 and a9 are pushed
//                 on the stack (16 bytes), used as temporaries, and popped
//                 again before their real values are needed.
// rc            - address of the constant for this round; post-incremented by 8
// a0..a24       - state lanes, updated in place
// tmp0..tmp2    - scratch registers, clobbered
//
// NOTE(review): eor3 is read here as a three-input XOR (as the existing
// comments state) and rax1 as the helper producing the d values - confirm
// against the MacroAssembler definitions.
void keccak_round_gpr(bool can_use_fp, bool can_use_r18, Register rc,
                      Register a0, Register a1, Register a2, Register a3, Register a4,
                      Register a5, Register a6, Register a7, Register a8, Register a9,
                      Register a10, Register a11, Register a12, Register a13, Register a14,
                      Register a15, Register a16, Register a17, Register a18, Register a19,
                      Register a20, Register a21, Register a22, Register a23, Register a24,
                      Register tmp0, Register tmp1, Register tmp2) {
  // Part 1: column parities and their application
  __ eor3(tmp1, a4, a9, a14);
  __ eor3(tmp0, tmp1, a19, a24); // tmp0 = a4^a9^a14^a19^a24 = c4
  __ eor3(tmp2, a1, a6, a11);
  __ eor3(tmp1, tmp2, a16, a21); // tmp1 = a1^a6^a11^a16^a21 = c1
  __ rax1(tmp2, tmp0, tmp1); // d0
  {

    // Two more temporaries are needed. c4 (tmp0) has already been computed
    // from a4 and a9, so those two lanes can be spilled and borrowed when
    // rfp and r18 are not both available.
    Register tmp3, tmp4;
    if (can_use_fp && can_use_r18) {
      tmp3 = rfp;
      tmp4 = r18_tls;
    } else {
      tmp3 = a4;
      tmp4 = a9;
      __ stp(tmp3, tmp4, __ pre(sp, -16));
    }

    __ eor3(tmp3, a0, a5, a10);
    __ eor3(tmp4, tmp3, a15, a20); // tmp4 = a0^a5^a10^a15^a20 = c0
    __ eor(a0, a0, tmp2);
    __ eor(a5, a5, tmp2);
    __ eor(a10, a10, tmp2);
    __ eor(a15, a15, tmp2);
    __ eor(a20, a20, tmp2); // d0(tmp2)
    __ eor3(tmp3, a2, a7, a12);
    __ eor3(tmp2, tmp3, a17, a22); // tmp2 = a2^a7^a12^a17^a22 = c2
    __ rax1(tmp3, tmp4, tmp2); // d1
    __ eor(a1, a1, tmp3);
    __ eor(a6, a6, tmp3);
    __ eor(a11, a11, tmp3);
    __ eor(a16, a16, tmp3);
    __ eor(a21, a21, tmp3); // d1(tmp3)
    __ rax1(tmp3, tmp2, tmp0); // d3
    __ eor3(tmp2, a3, a8, a13);
    __ eor3(tmp0, tmp2, a18, a23); // tmp0 = a3^a8^a13^a18^a23 = c3
    __ eor(a3, a3, tmp3);
    __ eor(a8, a8, tmp3);
    __ eor(a13, a13, tmp3);
    __ eor(a18, a18, tmp3);
    __ eor(a23, a23, tmp3);
    __ rax1(tmp2, tmp1, tmp0); // d2
    __ eor(a2, a2, tmp2);
    __ eor(a7, a7, tmp2);
    __ eor(a12, a12, tmp2);
    __ rax1(tmp0, tmp0, tmp4); // d4
    // tmp3/tmp4 are no longer needed: bring back a4 and a9 (if they were
    // borrowed) before d4 is applied to them below.
    if (!can_use_fp || !can_use_r18) {
      __ ldp(tmp3, tmp4, __ post(sp, 16));
    }
    __ eor(a17, a17, tmp2);
    __ eor(a22, a22, tmp2);
    __ eor(a4, a4, tmp0);
    __ eor(a9, a9, tmp0);
    __ eor(a14, a14, tmp0);
    __ eor(a19, a19, tmp0);
    __ eor(a24, a24, tmp0);
  }

  // Part 2: rotate each lane and move it to its new position. Every
  // destination register is the source of the following instruction, so the
  // chain only needs one temporary: tmp0 keeps the rotated a10 until a7 has
  // been read by the last rol.
  __ rol(tmp0, a10, 3);
  __ rol(a10, a1, 1);
  __ rol(a1, a6, 44);
  __ rol(a6, a9, 20);
  __ rol(a9, a22, 61);
  __ rol(a22, a14, 39);
  __ rol(a14, a20, 18);
  __ rol(a20, a2, 62);
  __ rol(a2, a12, 43);
  __ rol(a12, a13, 25);
  __ rol(a13, a19, 8) ;
  __ rol(a19, a23, 56);
  __ rol(a23, a15, 41);
  __ rol(a15, a4, 27);
  __ rol(a4, a24, 14);
  __ rol(a24, a21, 2);
  __ rol(a21, a8, 55);
  __ rol(a8, a16, 45);
  __ rol(a16, a5, 36);
  __ rol(a5, a3, 28);
  __ rol(a3, a18, 21);
  __ rol(a18, a17, 15);
  __ rol(a17, a11, 10);
  __ rol(a11, a7, 6);
  __ mov(a7, tmp0);

  // Part 3: combine the lanes within each group of five
  bcax5(a0, a1, a2, a3, a4, tmp0, tmp1, tmp2);
  bcax5(a5, a6, a7, a8, a9, tmp0, tmp1, tmp2);
  bcax5(a10, a11, a12, a13, a14, tmp0, tmp1, tmp2);
  bcax5(a15, a16, a17, a18, a19, tmp0, tmp1, tmp2);
  bcax5(a20, a21, a22, a23, a24, tmp0, tmp1, tmp2);

  // Part 4: XOR the next 8-byte constant into a0 and step rc forward
  __ ldr(tmp1, __ post(rc, 8));
  __ eor(a0, a0, tmp1);

}
8914
// Generates the SHA3 compress stub that keeps the whole 25-lane state in
// general-purpose registers. Depending on stub_id the single-block or the
// multi-block variant is emitted.
//
// Arguments:
//
// Inputs:
//   c_rarg0   - byte[]  source+offset
//   c_rarg1   - byte[]  SHA.state
//   c_rarg2   - int     block_size
//   c_rarg3   - int     offset
//   c_rarg4   - int     limit
//
// In the multi-block variant the stub adds block_size to offset after each
// block, continues while the new offset is <= limit, and finally places the
// updated offset in c_rarg0. offset and limit are only stored and read in
// that variant.
//
// The three argument registers buf, state and block_size double as the
// scratch registers of keccak_round_gpr, so their values live on the stack
// while the rounds run. Frame layout below the saved fp/lr (128 bytes):
//
//   sp +   0  block_size
//   sp +   8  offset (32 bit)   - multi-block only
//   sp +  12  limit  (32 bit)   - multi-block only
//   sp +  16  buf (position after the block just absorbed)
//   sp +  32  r19 .. r28
//   sp + 112  state             - or the pair r18_tls, state when r18 and
//                                 rfp are both used as temporaries
//
// Returns the stub entry address (the non-null result of load_archive_data
// if there is one, otherwise the start of the newly emitted code).
address generate_sha3_implCompress_gpr(StubId stub_id) {
  bool multi_block;
  switch (stub_id) {
  case StubId::stubgen_sha3_implCompress_id:
    multi_block = false;
    break;
  case StubId::stubgen_sha3_implCompressMB_id:
    multi_block = true;
    break;
  default:
    ShouldNotReachHere();
  }
  int entry_count = StubInfo::entry_count(stub_id);
  assert(entry_count == 1, "sanity check");
  address start = load_archive_data(stub_id);
  if (start != nullptr) {
    return start;
  }
  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, stub_id);
  start = __ pc();

  Register buf = c_rarg0;
  Register state = c_rarg1;
  Register block_size = c_rarg2;
  Register ofs = c_rarg3;
  Register limit = c_rarg4;

  // use r3..r17, r19..r28 to keep a0..a24.
  // a0..a24 are respective locals from SHA3.java
  Register a0 = r25,
           a1 = r26,
           a2 = r27,
           a3 = r3,
           a4 = r4,
           a5 = r5,
           a6 = r6,
           a7 = r7,
           a8 = rscratch1, // r8
           a9 = rscratch2, // r9
           a10 = r10,
           a11 = r11,
           a12 = r12,
           a13 = r13,
           a14 = r14,
           a15 = r15,
           a16 = r16,
           a17 = r17,
           a18 = r28,
           a19 = r19,
           a20 = r20,
           a21 = r21,
           a22 = r22,
           a23 = r23,
           a24 = r24;

  // Scratch registers alias the incoming arguments (and the link register),
  // hence the spills to the frame described above. lr itself was saved by
  // enter().
  Register tmp0 = block_size, tmp1 = buf, tmp2 = state, tmp3 = r30;

  Label sha3_loop, rounds24_preloop, loop_body;
  Label sha3_512_or_sha3_384, shake128;

  bool can_use_r18 = false;
#ifndef R18_RESERVED
  can_use_r18 = true;
#endif
  bool can_use_fp = !PreserveFramePointer;

  __ enter();

  // save almost all yet unsaved gpr registers on stack
  __ str(block_size, __ pre(sp, -128));
  if (multi_block) {
    __ stpw(ofs, limit, Address(sp, 8));
  }
  // 8 bytes at sp+16 will be used to keep buf
  __ stp(r19, r20, Address(sp, 32));
  __ stp(r21, r22, Address(sp, 48));
  __ stp(r23, r24, Address(sp, 64));
  __ stp(r25, r26, Address(sp, 80));
  __ stp(r27, r28, Address(sp, 96));
  if (can_use_r18 && can_use_fp) {
    __ stp(r18_tls, state, Address(sp, 112));
  } else {
    __ str(state, Address(sp, 112));
  }

  // begin sha3 calculations: loading a0..a24 from state array
  __ ldp(a0, a1, state);
  __ ldp(a2, a3, Address(state, 16));
  __ ldp(a4, a5, Address(state, 32));
  __ ldp(a6, a7, Address(state, 48));
  __ ldp(a8, a9, Address(state, 64));
  __ ldp(a10, a11, Address(state, 80));
  __ ldp(a12, a13, Address(state, 96));
  __ ldp(a14, a15, Address(state, 112));
  __ ldp(a16, a17, Address(state, 128));
  __ ldp(a18, a19, Address(state, 144));
  __ ldp(a20, a21, Address(state, 160));
  __ ldp(a22, a23, Address(state, 176));
  __ ldr(a24, Address(state, 192));

  __ BIND(sha3_loop);

  // load input
  // The first 7 lanes (56 bytes) are absorbed for every block size; the
  // bits of block_size then select how many further lanes follow.
  __ ldp(tmp3, tmp2, __ post(buf, 16));
  __ eor(a0, a0, tmp3);
  __ eor(a1, a1, tmp2);
  __ ldp(tmp3, tmp2, __ post(buf, 16));
  __ eor(a2, a2, tmp3);
  __ eor(a3, a3, tmp2);
  __ ldp(tmp3, tmp2, __ post(buf, 16));
  __ eor(a4, a4, tmp3);
  __ eor(a5, a5, tmp2);
  __ ldr(tmp3, __ post(buf, 8));
  __ eor(a6, a6, tmp3);

  // block_size == 72, SHA3-512; block_size == 104, SHA3-384
  // (the only two supported sizes with bit 7 clear)
  __ tbz(block_size, 7, sha3_512_or_sha3_384);

  __ ldp(tmp3, tmp2, __ post(buf, 16));
  __ eor(a7, a7, tmp3);
  __ eor(a8, a8, tmp2);
  __ ldp(tmp3, tmp2, __ post(buf, 16));
  __ eor(a9, a9, tmp3);
  __ eor(a10, a10, tmp2);
  __ ldp(tmp3, tmp2, __ post(buf, 16));
  __ eor(a11, a11, tmp3);
  __ eor(a12, a12, tmp2);
  __ ldp(tmp3, tmp2, __ post(buf, 16));
  __ eor(a13, a13, tmp3);
  __ eor(a14, a14, tmp2);
  __ ldp(tmp3, tmp2, __ post(buf, 16));
  __ eor(a15, a15, tmp3);
  __ eor(a16, a16, tmp2);

  // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256
  __ andw(tmp2, block_size, 48);
  __ cbzw(tmp2, rounds24_preloop);
  __ tbnz(block_size, 5, shake128);
  // block_size == 144, bit5 == 0, SHA3-224
  __ ldr(tmp3, __ post(buf, 8));
  __ eor(a17, a17, tmp3);
  __ b(rounds24_preloop);

  __ BIND(shake128);
  __ ldp(tmp3, tmp2, __ post(buf, 16));
  __ eor(a17, a17, tmp3);
  __ eor(a18, a18, tmp2);
  __ ldp(tmp3, tmp2, __ post(buf, 16));
  __ eor(a19, a19, tmp3);
  __ eor(a20, a20, tmp2);
  __ b(rounds24_preloop); // block_size == 168, SHAKE128

  __ BIND(sha3_512_or_sha3_384);
  __ ldp(tmp3, tmp2, __ post(buf, 16));
  __ eor(a7, a7, tmp3);
  __ eor(a8, a8, tmp2);
  __ tbz(block_size, 5, rounds24_preloop); // SHA3-512

  // SHA3-384
  __ ldp(tmp3, tmp2, __ post(buf, 16));
  __ eor(a9, a9, tmp3);
  __ eor(a10, a10, tmp2);
  __ ldp(tmp3, tmp2, __ post(buf, 16));
  __ eor(a11, a11, tmp3);
  __ eor(a12, a12, tmp2);

  __ BIND(rounds24_preloop);
  // The round counter is kept in FP registers; presumably because every
  // general-purpose register is occupied by state lanes or temporaries.
  __ fmovs(v0, 24.0); // float loop counter,
  __ fmovs(v1, 1.0); // exact representation

  // buf is about to be clobbered (it is tmp1 of the round), so park it
  __ str(buf, Address(sp, 16));
  // tmp3 walks the round constant table; each round advances it by 8 bytes
  __ lea(tmp3, ExternalAddress((address) _sha3_round_consts));

  __ BIND(loop_body);
  keccak_round_gpr(can_use_fp, can_use_r18, tmp3,
                   a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12,
                   a13, a14, a15, a16, a17, a18, a19, a20, a21, a22, a23, a24,
                   tmp0, tmp1, tmp2);
  __ fsubs(v0, v0, v1);
  __ fcmps(v0, 0.0);
  __ br(__ NE, loop_body);

  if (multi_block) {
    __ ldrw(block_size, sp); // block_size
    __ ldpw(tmp2, tmp1, Address(sp, 8)); // offset, limit
    __ addw(tmp2, tmp2, block_size);
    __ cmpw(tmp2, tmp1);
    __ strw(tmp2, Address(sp, 8)); // store offset in case we're jumping
    __ ldr(buf, Address(sp, 16)); // restore buf in case we're jumping
    __ br(Assembler::LE, sha3_loop);
    __ movw(c_rarg0, tmp2); // return offset
  }
  // state was clobbered as tmp2; reload it (and r18 if it was borrowed)
  if (can_use_fp && can_use_r18) {
    __ ldp(r18_tls, state, Address(sp, 112));
  } else {
    __ ldr(state, Address(sp, 112));
  }
  // save calculated sha3 state
  __ stp(a0, a1, Address(state));
  __ stp(a2, a3, Address(state, 16));
  __ stp(a4, a5, Address(state, 32));
  __ stp(a6, a7, Address(state, 48));
  __ stp(a8, a9, Address(state, 64));
  __ stp(a10, a11, Address(state, 80));
  __ stp(a12, a13, Address(state, 96));
  __ stp(a14, a15, Address(state, 112));
  __ stp(a16, a17, Address(state, 128));
  __ stp(a18, a19, Address(state, 144));
  __ stp(a20, a21, Address(state, 160));
  __ stp(a22, a23, Address(state, 176));
  __ str(a24, Address(state, 192));

  // restore required registers from stack
  __ ldp(r19, r20, Address(sp, 32));
  __ ldp(r21, r22, Address(sp, 48));
  __ ldp(r23, r24, Address(sp, 64));
  __ ldp(r25, r26, Address(sp, 80));
  __ ldp(r27, r28, Address(sp, 96));
  if (can_use_fp && can_use_r18) {
    __ add(rfp, sp, 128); // leave() will copy rfp to sp below
  } // else no need to recalculate rfp, since it wasn't changed

  __ leave();

  __ ret(lr);

  // record the stub entry and end
  store_archive_data(stub_id, start, __ pc());

  return start;
}
9156
9157 /**
9158 * Arguments:
9159 *
9160 * Inputs:
9161 * c_rarg0 - int crc
9162 * c_rarg1 - byte* buf
9163 * c_rarg2 - int length
9164 *
9165 * Output:
9166 * rax - int crc result
9167 */
9168 address generate_updateBytesCRC32() {
9169 assert(UseCRC32Intrinsics, "what are we doing here?");
9170 StubId stub_id = StubId::stubgen_updateBytesCRC32_id;
9171 int entry_count = StubInfo::entry_count(stub_id);
9172 assert(entry_count == 1, "sanity check");
9173 address start = load_archive_data(stub_id);
9174 if (start != nullptr) {
9175 return start;
9176 }
9177 __ align(CodeEntryAlignment);
9178 StubCodeMark mark(this, stub_id);
9179
9180 start = __ pc();
9181
9182 const Register crc = c_rarg0; // crc
9183 const Register buf = c_rarg1; // source java byte array address
9184 const Register len = c_rarg2; // length
9185 const Register table0 = c_rarg3; // crc_table address
9186 const Register table1 = c_rarg4;
9187 const Register table2 = c_rarg5;
9188 const Register table3 = c_rarg6;
9189 const Register tmp3 = c_rarg7;
9190
9191 BLOCK_COMMENT("Entry:");
9192 __ enter(); // required for proper stackwalking of RuntimeStub frame
9193
9194 __ kernel_crc32(crc, buf, len,
9195 table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
9196
9197 __ leave(); // required for proper stackwalking of RuntimeStub frame
9198 __ ret(lr);
9199
9200 // record the stub entry and end
9201 store_archive_data(stub_id, start, __ pc());
9202
9203 return start;
9204 }
9205
9206 /**
9207 * Arguments:
9208 *
9209 * Inputs:
9210 * c_rarg0 - int crc
9211 * c_rarg1 - byte* buf
9212 * c_rarg2 - int length
9213 * c_rarg3 - int* table
9214 *
9215 * Output:
9216 * r0 - int crc result
9217 */
9218 address generate_updateBytesCRC32C() {
9219 assert(UseCRC32CIntrinsics, "what are we doing here?");
9220 StubId stub_id = StubId::stubgen_updateBytesCRC32C_id;
9221 int entry_count = StubInfo::entry_count(stub_id);
9222 assert(entry_count == 1, "sanity check");
9223 address start = load_archive_data(stub_id);
9224 if (start != nullptr) {
9225 return start;
9226 }
9227 __ align(CodeEntryAlignment);
9228 StubCodeMark mark(this, stub_id);
9229
9230 start = __ pc();
9231
9232 const Register crc = c_rarg0; // crc
9233 const Register buf = c_rarg1; // source java byte array address
9234 const Register len = c_rarg2; // length
9235 const Register table0 = c_rarg3; // crc_table address
9236 const Register table1 = c_rarg4;
9237 const Register table2 = c_rarg5;
9238 const Register table3 = c_rarg6;
9239 const Register tmp3 = c_rarg7;
9240
9241 BLOCK_COMMENT("Entry:");
9242 __ enter(); // required for proper stackwalking of RuntimeStub frame
9243
9244 __ kernel_crc32c(crc, buf, len,
9245 table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
9246
9247 __ leave(); // required for proper stackwalking of RuntimeStub frame
9248 __ ret(lr);
9249
9250 // record the stub entry and end
9251 store_archive_data(stub_id, start, __ pc());
9252
9253 return start;
9254 }
9255
9256 /***
9257 * Arguments:
9258 *
9259 * Inputs:
9260 * c_rarg0 - int adler
9261 * c_rarg1 - byte* buff
9262 * c_rarg2 - int len
9263 *
9264 * Output:
9265 * c_rarg0 - int adler result
9266 */
9267 address generate_updateBytesAdler32() {
9268 StubId stub_id = StubId::stubgen_updateBytesAdler32_id;
9269 int entry_count = StubInfo::entry_count(stub_id);
9270 assert(entry_count == 1, "sanity check");
9271 address start = load_archive_data(stub_id);
9272 if (start != nullptr) {
9273 return start;
9274 }
9275 __ align(CodeEntryAlignment);
9276 StubCodeMark mark(this, stub_id);
9277 start = __ pc();
9278
9279 Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
9280
9281 // Aliases
9282 Register adler = c_rarg0;
9283 Register s1 = c_rarg0;
9284 Register s2 = c_rarg3;
9285 Register buff = c_rarg1;
9286 Register len = c_rarg2;
9287 Register nmax = r4;
9288 Register base = r5;
9289 Register count = r6;
9290 Register temp0 = rscratch1;
9291 Register temp1 = rscratch2;
9292 FloatRegister vbytes = v0;
9293 FloatRegister vs1acc = v1;
9294 FloatRegister vs2acc = v2;
9295 FloatRegister vtable = v3;
9296
9297 // Max number of bytes we can process before having to take the mod
9298 // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
9299 uint64_t BASE = 0xfff1;
9300 uint64_t NMAX = 0x15B0;
9301
9302 __ mov(base, BASE);
9303 __ mov(nmax, NMAX);
9304
9305 // Load accumulation coefficients for the upper 16 bits
9306 __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table));
9307 __ ld1(vtable, __ T16B, Address(temp0));
9308
9309 // s1 is initialized to the lower 16 bits of adler
9310 // s2 is initialized to the upper 16 bits of adler
9311 __ ubfx(s2, adler, 16, 16); // s2 = ((adler >> 16) & 0xffff)
9312 __ uxth(s1, adler); // s1 = (adler & 0xffff)
9313
9314 // The pipelined loop needs at least 16 elements for 1 iteration
9315 // It does check this, but it is more effective to skip to the cleanup loop
9316 __ cmp(len, (u1)16);
9317 __ br(Assembler::HS, L_nmax);
9318 __ cbz(len, L_combine);
9319
9320 __ bind(L_simple_by1_loop);
9321 __ ldrb(temp0, Address(__ post(buff, 1)));
9322 __ add(s1, s1, temp0);
9323 __ add(s2, s2, s1);
9324 __ subs(len, len, 1);
9325 __ br(Assembler::HI, L_simple_by1_loop);
9326
9327 // s1 = s1 % BASE
9328 __ subs(temp0, s1, base);
9329 __ csel(s1, temp0, s1, Assembler::HS);
9330
9331 // s2 = s2 % BASE
9332 __ lsr(temp0, s2, 16);
9333 __ lsl(temp1, temp0, 4);
9334 __ sub(temp1, temp1, temp0);
9335 __ add(s2, temp1, s2, ext::uxth);
9336
9337 __ subs(temp0, s2, base);
9338 __ csel(s2, temp0, s2, Assembler::HS);
9339
9340 __ b(L_combine);
9341
9342 __ bind(L_nmax);
9343 __ subs(len, len, nmax);
9344 __ sub(count, nmax, 16);
9345 __ br(Assembler::LO, L_by16);
9346
9347 __ bind(L_nmax_loop);
9348
9349 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
9350 vbytes, vs1acc, vs2acc, vtable);
9351
9352 __ subs(count, count, 16);
9353 __ br(Assembler::HS, L_nmax_loop);
9354
9355 // s1 = s1 % BASE
9356 __ lsr(temp0, s1, 16);
9357 __ lsl(temp1, temp0, 4);
9358 __ sub(temp1, temp1, temp0);
9359 __ add(temp1, temp1, s1, ext::uxth);
9360
9361 __ lsr(temp0, temp1, 16);
9362 __ lsl(s1, temp0, 4);
9363 __ sub(s1, s1, temp0);
9364 __ add(s1, s1, temp1, ext:: uxth);
9365
9366 __ subs(temp0, s1, base);
9367 __ csel(s1, temp0, s1, Assembler::HS);
9368
9369 // s2 = s2 % BASE
9370 __ lsr(temp0, s2, 16);
9371 __ lsl(temp1, temp0, 4);
9372 __ sub(temp1, temp1, temp0);
9373 __ add(temp1, temp1, s2, ext::uxth);
9374
9375 __ lsr(temp0, temp1, 16);
9376 __ lsl(s2, temp0, 4);
9377 __ sub(s2, s2, temp0);
9378 __ add(s2, s2, temp1, ext:: uxth);
9379
9380 __ subs(temp0, s2, base);
9381 __ csel(s2, temp0, s2, Assembler::HS);
9382
9383 __ subs(len, len, nmax);
9384 __ sub(count, nmax, 16);
9385 __ br(Assembler::HS, L_nmax_loop);
9386
9387 __ bind(L_by16);
9388 __ adds(len, len, count);
9389 __ br(Assembler::LO, L_by1);
9390
9391 __ bind(L_by16_loop);
9392
9393 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
9394 vbytes, vs1acc, vs2acc, vtable);
9395
9396 __ subs(len, len, 16);
9397 __ br(Assembler::HS, L_by16_loop);
9398
9399 __ bind(L_by1);
9400 __ adds(len, len, 15);
9401 __ br(Assembler::LO, L_do_mod);
9402
9403 __ bind(L_by1_loop);
9404 __ ldrb(temp0, Address(__ post(buff, 1)));
9405 __ add(s1, temp0, s1);
9406 __ add(s2, s2, s1);
9407 __ subs(len, len, 1);
9408 __ br(Assembler::HS, L_by1_loop);
9409
9410 __ bind(L_do_mod);
9411 // s1 = s1 % BASE
9412 __ lsr(temp0, s1, 16);
9413 __ lsl(temp1, temp0, 4);
9414 __ sub(temp1, temp1, temp0);
9415 __ add(temp1, temp1, s1, ext::uxth);
9416
9417 __ lsr(temp0, temp1, 16);
9418 __ lsl(s1, temp0, 4);
9419 __ sub(s1, s1, temp0);
9420 __ add(s1, s1, temp1, ext:: uxth);
9421
9422 __ subs(temp0, s1, base);
9423 __ csel(s1, temp0, s1, Assembler::HS);
9424
9425 // s2 = s2 % BASE
9426 __ lsr(temp0, s2, 16);
9427 __ lsl(temp1, temp0, 4);
9428 __ sub(temp1, temp1, temp0);
9429 __ add(temp1, temp1, s2, ext::uxth);
9430
9431 __ lsr(temp0, temp1, 16);
9432 __ lsl(s2, temp0, 4);
9433 __ sub(s2, s2, temp0);
9434 __ add(s2, s2, temp1, ext:: uxth);
9435
9436 __ subs(temp0, s2, base);
9437 __ csel(s2, temp0, s2, Assembler::HS);
9438
9439 // Combine lower bits and higher bits
9440 __ bind(L_combine);
9441 __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
9442
9443 __ ret(lr);
9444
9445 // record the stub entry and end
9446 store_archive_data(stub_id, start, __ pc());
9447
9448 return start;
9449 }
9450
// Emits the code that folds the next 16 input bytes into the running
// Adler-32 sums.
//
// s1, s2         - running sums, updated in place
// buff           - input pointer, post-incremented by 16
// temp0, temp1   - general-purpose scratch registers, clobbered
// vbytes         - receives the 16 loaded bytes
// vs1acc, vs2acc - vector scratch registers for the two partial sums
// vtable         - multiplied element-wise with the loaded bytes; the
//                  arithmetic below assumes it holds the weights
//                  (16, 15, ..., 1). It is only read here.
void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff,
                                       Register temp0, Register temp1, FloatRegister vbytes,
                                       FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) {
  // Below is a vectorized implementation of updating s1 and s2 for 16 bytes.
  // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration.
  // In non-vectorized code, we update s1 and s2 as:
  //   s1 <- s1 + b1
  //   s2 <- s2 + s1
  //   s1 <- s1 + b2
  //   s2 <- s2 + s1
  //   ...
  //   s1 <- s1 + b16
  //   s2 <- s2 + s1
  // Putting above assignments together, we have:
  //   s1_new = s1 + b1 + b2 + ... + b16
  //   s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16)
  //          = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1)
  //          = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1)
  __ ld1(vbytes, __ T16B, Address(__ post(buff, 16)));

  // s2 = s2 + s1 * 16
  // (uses the old s1; s1 itself is only updated at the very end)
  __ add(s2, s2, s1, Assembler::LSL, 4);

  // vs1acc = b1 + b2 + b3 + ... + b16
  // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1)
  // The products are formed in two steps (T8B, then T16B accumulating into
  // the same register) and then summed across lanes.
  __ umullv(vs2acc, __ T8B, vtable, vbytes);
  __ umlalv(vs2acc, __ T16B, vtable, vbytes);
  __ uaddlv(vs1acc, __ T16B, vbytes);
  __ uaddlv(vs2acc, __ T8H, vs2acc);

  // s1 = s1 + vs1acc, s2 = s2 + vs2acc
  __ fmovd(temp0, vs1acc);
  __ fmovd(temp1, vs2acc);
  __ add(s1, s1, temp0);
  __ add(s2, s2, temp1);
}
9487
9488 /**
9489 * Arguments:
9490 *
9491 * Input:
9492 * c_rarg0 - x address
9493 * c_rarg1 - x length
9494 * c_rarg2 - y address
9495 * c_rarg3 - y length
9496 * c_rarg4 - z address
9497 */
9498 address generate_multiplyToLen() {
9499 StubId stub_id = StubId::stubgen_multiplyToLen_id;
9500 int entry_count = StubInfo::entry_count(stub_id);
9501 assert(entry_count == 1, "sanity check");
9502 address start = load_archive_data(stub_id);
9503 if (start != nullptr) {
9504 return start;
9505 }
9506 __ align(CodeEntryAlignment);
9507 StubCodeMark mark(this, stub_id);
9508
9509 start = __ pc();
9510 const Register x = r0;
9511 const Register xlen = r1;
9512 const Register y = r2;
9513 const Register ylen = r3;
9514 const Register z = r4;
9515
9516 const Register tmp0 = r5;
9517 const Register tmp1 = r10;
9518 const Register tmp2 = r11;
9519 const Register tmp3 = r12;
9520 const Register tmp4 = r13;
9521 const Register tmp5 = r14;
9522 const Register tmp6 = r15;
9523 const Register tmp7 = r16;
9524
9525 BLOCK_COMMENT("Entry:");
9526 __ enter(); // required for proper stackwalking of RuntimeStub frame
9527 __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
9528 __ leave(); // required for proper stackwalking of RuntimeStub frame
9529 __ ret(lr);
9530
9531 // record the stub entry and end
9532 store_archive_data(stub_id, start, __ pc());
9533
9534 return start;
9535 }
9536
9537 address generate_squareToLen() {
9538 // squareToLen algorithm for sizes 1..127 described in java code works
9539 // faster than multiply_to_len on some CPUs and slower on others, but
9540 // multiply_to_len shows a bit better overall results
9541 StubId stub_id = StubId::stubgen_squareToLen_id;
9542 int entry_count = StubInfo::entry_count(stub_id);
9543 assert(entry_count == 1, "sanity check");
9544 address start = load_archive_data(stub_id);
9545 if (start != nullptr) {
9546 return start;
9547 }
9548 __ align(CodeEntryAlignment);
9549 StubCodeMark mark(this, stub_id);
9550 start = __ pc();
9551
9552 const Register x = r0;
9553 const Register xlen = r1;
9554 const Register z = r2;
9555 const Register y = r4; // == x
9556 const Register ylen = r5; // == xlen
9557
9558 const Register tmp0 = r3;
9559 const Register tmp1 = r10;
9560 const Register tmp2 = r11;
9561 const Register tmp3 = r12;
9562 const Register tmp4 = r13;
9563 const Register tmp5 = r14;
9564 const Register tmp6 = r15;
9565 const Register tmp7 = r16;
9566
9567 RegSet spilled_regs = RegSet::of(y, ylen);
9568 BLOCK_COMMENT("Entry:");
9569 __ enter();
9570 __ push(spilled_regs, sp);
9571 __ mov(y, x);
9572 __ mov(ylen, xlen);
9573 __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
9574 __ pop(spilled_regs, sp);
9575 __ leave();
9576 __ ret(lr);
9577
9578 // record the stub entry and end
9579 store_archive_data(stub_id, start, __ pc());
9580
9581 return start;
9582 }
9583
9584 address generate_mulAdd() {
9585 StubId stub_id = StubId::stubgen_mulAdd_id;
9586 int entry_count = StubInfo::entry_count(stub_id);
9587 assert(entry_count == 1, "sanity check");
9588 address start = load_archive_data(stub_id);
9589 if (start != nullptr) {
9590 return start;
9591 }
9592 __ align(CodeEntryAlignment);
9593 StubCodeMark mark(this, stub_id);
9594
9595 start = __ pc();
9596
9597 const Register out = r0;
9598 const Register in = r1;
9599 const Register offset = r2;
9600 const Register len = r3;
9601 const Register k = r4;
9602
9603 BLOCK_COMMENT("Entry:");
9604 __ enter();
9605 __ mul_add(out, in, offset, len, k);
9606 __ leave();
9607 __ ret(lr);
9608
9609 // record the stub entry and end
9610 store_archive_data(stub_id, start, __ pc());
9611
9612 return start;
9613 }
9614
// BigInteger right-shift worker stub.
//
// The emitted code computes, on 32-bit words and for every i in [0, numIter):
//   newArr[newIdx + i] = (oldArr[i + 1] >>> shiftCount) | (oldArr[i] << (32 - shiftCount))
// Words are produced from the highest index down to index 0, and oldArr[numIter]
// is read as the "i + 1" operand of the highest word.
//
// Arguments:
//
// Input:
//   c_rarg0 - newArr address
//   c_rarg1 - oldArr address
//   c_rarg2 - newIdx
//   c_rarg3 - shiftCount
//   c_rarg4 - numIter
//
// Registers written by the emitted code: rscratch1, rscratch2, r10-r14, v0-v4;
// c_rarg0 and c_rarg4 are modified in place. The stub emits no enter()/leave(),
// i.e. it does not set up a frame.
//
address generate_bigIntegerRightShift() {
  StubId stub_id = StubId::stubgen_bigIntegerRightShiftWorker_id;
  int entry_count = StubInfo::entry_count(stub_id);
  assert(entry_count == 1, "sanity check");
  // NOTE(review): load_archive_data presumably returns an already archived copy
  // of this stub; a non-null result is returned as-is and nothing is generated.
  address start = load_archive_data(stub_id);
  if (start != nullptr) {
    return start;
  }
  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, stub_id);
  start = __ pc();

  Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;

  Register newArr = c_rarg0;
  Register oldArr = c_rarg1;
  Register newIdx = c_rarg2;
  Register shiftCount = c_rarg3;
  Register numIter = c_rarg4;
  // idx aliases numIter: the count is consumed in place and doubles as the
  // index of the lowest word handled by the current step.
  Register idx = numIter;

  Register newArrCur = rscratch1;
  Register shiftRevCount = rscratch2;
  Register oldArrCur = r13;
  Register oldArrNext = r14;

  FloatRegister oldElem0 = v0;
  FloatRegister oldElem1 = v1;
  FloatRegister newElem = v2;
  FloatRegister shiftVCount = v3;
  FloatRegister shiftVRevCount = v4;

  // Nothing to do for a zero iteration count.
  __ cbz(idx, Exit);

  // newArr now points at newArr[newIdx] (4-byte elements).
  __ add(newArr, newArr, newIdx, Assembler::LSL, 2);

  // left shift count
  __ movw(shiftRevCount, 32);
  __ subw(shiftRevCount, shiftRevCount, shiftCount);

  // numIter too small to allow a 4-words SIMD loop, rolling back
  __ cmp(numIter, (u1)4);
  __ br(Assembler::LT, ShiftThree);

  // Broadcast both shift counts. ushl shifts right when the per-lane count is
  // negative, so shiftVCount is negated to obtain the right shift.
  // These vector counts are only set up on this path; ShiftTwoLoop, the only
  // other user, is reached exclusively from ShiftSIMDLoop.
  __ dup(shiftVCount, __ T4S, shiftCount);
  __ dup(shiftVRevCount, __ T4S, shiftRevCount);
  __ negr(shiftVCount, __ T4S, shiftVCount);

  __ BIND(ShiftSIMDLoop);

  // Calculate the load addresses
  // oldArrNext = &oldArr[idx], oldArrCur = &oldArr[idx + 1], newArrCur = &newArr[idx]
  __ sub(idx, idx, 4);
  __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
  __ add(newArrCur, newArr, idx, Assembler::LSL, 2);
  __ add(oldArrCur, oldArrNext, 4);

  // Load 4 words and process
  __ ld1(oldElem0, __ T4S, Address(oldArrCur));
  __ ld1(oldElem1, __ T4S, Address(oldArrNext));
  __ ushl(oldElem0, __ T4S, oldElem0, shiftVCount);
  __ ushl(oldElem1, __ T4S, oldElem1, shiftVRevCount);
  __ orr(newElem, __ T16B, oldElem0, oldElem1);
  __ st1(newElem, __ T4S, Address(newArrCur));

  // Keep looping while at least 4 words remain below idx.
  __ cmp(idx, (u1)4);
  __ br(Assembler::LT, ShiftTwoLoop);
  __ b(ShiftSIMDLoop);

  // Remainder after the SIMD loop: 0 -> done, 1 -> scalar ShiftOne,
  // otherwise handle two words with 64-bit vectors and re-check.
  __ BIND(ShiftTwoLoop);
  __ cbz(idx, Exit);
  __ cmp(idx, (u1)1);
  __ br(Assembler::EQ, ShiftOne);

  // Calculate the load addresses
  __ sub(idx, idx, 2);
  __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
  __ add(newArrCur, newArr, idx, Assembler::LSL, 2);
  __ add(oldArrCur, oldArrNext, 4);

  // Load 2 words and process
  __ ld1(oldElem0, __ T2S, Address(oldArrCur));
  __ ld1(oldElem1, __ T2S, Address(oldArrNext));
  __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount);
  __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount);
  __ orr(newElem, __ T8B, oldElem0, oldElem1);
  __ st1(newElem, __ T2S, Address(newArrCur));
  __ b(ShiftTwoLoop);

  // Scalar path taken when numIter < 4 (and non-zero, see cbz above).
  // Bit 1 clear -> one word (ShiftOne); bit 0 clear -> two words (ShiftTwo);
  // otherwise three words: word 2 is done here, then the code falls through
  // ShiftTwo (word 1) and ShiftOne (word 0).
  __ BIND(ShiftThree);
  __ tbz(idx, 1, ShiftOne);
  __ tbz(idx, 0, ShiftTwo);
  __ ldrw(r10, Address(oldArr, 12));
  __ ldrw(r11, Address(oldArr, 8));
  __ lsrvw(r10, r10, shiftCount);
  __ lslvw(r11, r11, shiftRevCount);
  __ orrw(r12, r10, r11);
  __ strw(r12, Address(newArr, 8));

  // newArr[1] = (oldArr[2] >>> shiftCount) | (oldArr[1] << shiftRevCount)
  __ BIND(ShiftTwo);
  __ ldrw(r10, Address(oldArr, 8));
  __ ldrw(r11, Address(oldArr, 4));
  __ lsrvw(r10, r10, shiftCount);
  __ lslvw(r11, r11, shiftRevCount);
  __ orrw(r12, r10, r11);
  __ strw(r12, Address(newArr, 4));

  // newArr[0] = (oldArr[1] >>> shiftCount) | (oldArr[0] << shiftRevCount)
  __ BIND(ShiftOne);
  __ ldrw(r10, Address(oldArr, 4));
  __ ldrw(r11, Address(oldArr));
  __ lsrvw(r10, r10, shiftCount);
  __ lslvw(r11, r11, shiftRevCount);
  __ orrw(r12, r10, r11);
  __ strw(r12, Address(newArr));

  __ BIND(Exit);
  __ ret(lr);

  // record the stub entry and end
  store_archive_data(stub_id, start, __ pc());

  return start;
}
9746
// BigInteger left-shift worker stub.
//
// The emitted code computes, on 32-bit words and for every i in [0, numIter):
//   newArr[newIdx + i] = (oldArr[i] << shiftCount) | (oldArr[i + 1] >>> (32 - shiftCount))
// Words are produced in ascending index order using post-incremented pointers,
// and oldArr[numIter] is read as the "i + 1" operand of the last word.
//
// Arguments:
//
// Input:
//   c_rarg0 - newArr address
//   c_rarg1 - oldArr address
//   c_rarg2 - newIdx
//   c_rarg3 - shiftCount
//   c_rarg4 - numIter
//
// Registers written by the emitted code: rscratch1, rscratch2, r10-r12, v0-v4;
// c_rarg0, c_rarg1 and c_rarg4 are modified in place. The stub emits no
// enter()/leave(), i.e. it does not set up a frame.
//
address generate_bigIntegerLeftShift() {
  StubId stub_id = StubId::stubgen_bigIntegerLeftShiftWorker_id;
  int entry_count = StubInfo::entry_count(stub_id);
  assert(entry_count == 1, "sanity check");
  // NOTE(review): load_archive_data presumably returns an already archived copy
  // of this stub; a non-null result is returned as-is and nothing is generated.
  address start = load_archive_data(stub_id);
  if (start != nullptr) {
    return start;
  }
  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, stub_id);
  start = __ pc();

  Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;

  Register newArr = c_rarg0;
  Register oldArr = c_rarg1;
  Register newIdx = c_rarg2;
  Register shiftCount = c_rarg3;
  Register numIter = c_rarg4;

  Register shiftRevCount = rscratch1;
  Register oldArrNext = rscratch2;

  FloatRegister oldElem0 = v0;
  FloatRegister oldElem1 = v1;
  FloatRegister newElem = v2;
  FloatRegister shiftVCount = v3;
  FloatRegister shiftVRevCount = v4;

  // Nothing to do for a zero iteration count.
  __ cbz(numIter, Exit);

  // oldArrNext tracks &oldArr[i + 1]; newArr now points at newArr[newIdx].
  __ add(oldArrNext, oldArr, 4);
  __ add(newArr, newArr, newIdx, Assembler::LSL, 2);

  // right shift count
  __ movw(shiftRevCount, 32);
  __ subw(shiftRevCount, shiftRevCount, shiftCount);

  // numIter too small to allow a 4-words SIMD loop, rolling back
  __ cmp(numIter, (u1)4);
  __ br(Assembler::LT, ShiftThree);

  // Broadcast both shift counts. ushl shifts right when the per-lane count is
  // negative, so shiftVRevCount is negated to obtain the right shift.
  // These vector counts are only set up on this path; ShiftTwoLoop, the only
  // other user, is reached exclusively from ShiftSIMDLoop.
  __ dup(shiftVCount, __ T4S, shiftCount);
  __ dup(shiftVRevCount, __ T4S, shiftRevCount);
  __ negr(shiftVRevCount, __ T4S, shiftVRevCount);

  __ BIND(ShiftSIMDLoop);

  // load 4 words and process
  __ ld1(oldElem0, __ T4S, __ post(oldArr, 16));
  __ ld1(oldElem1, __ T4S, __ post(oldArrNext, 16));
  __ ushl(oldElem0, __ T4S, oldElem0, shiftVCount);
  __ ushl(oldElem1, __ T4S, oldElem1, shiftVRevCount);
  __ orr(newElem, __ T16B, oldElem0, oldElem1);
  __ st1(newElem, __ T4S, __ post(newArr, 16));
  __ sub(numIter, numIter, 4);

  // Keep looping while at least 4 words remain.
  __ cmp(numIter, (u1)4);
  __ br(Assembler::LT, ShiftTwoLoop);
  __ b(ShiftSIMDLoop);

  // Remainder after the SIMD loop: 0 -> done, 1 -> scalar ShiftOne,
  // otherwise handle two words with 64-bit vectors and re-check.
  __ BIND(ShiftTwoLoop);
  __ cbz(numIter, Exit);
  __ cmp(numIter, (u1)1);
  __ br(Assembler::EQ, ShiftOne);

  // load 2 words and process
  __ ld1(oldElem0, __ T2S, __ post(oldArr, 8));
  __ ld1(oldElem1, __ T2S, __ post(oldArrNext, 8));
  __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount);
  __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount);
  __ orr(newElem, __ T8B, oldElem0, oldElem1);
  __ st1(newElem, __ T2S, __ post(newArr, 8));
  __ sub(numIter, numIter, 2);
  __ b(ShiftTwoLoop);

  // Scalar path taken when numIter < 4 (and non-zero, see cbz above).
  // The first word is always processed here. numIter itself is not decremented
  // on this path; its low two bits select how many more words follow:
  // bit 1 clear -> none (Exit); bit 0 clear -> one more (ShiftOne);
  // otherwise two more (fall through ShiftTwo, then ShiftOne).
  __ BIND(ShiftThree);
  __ ldrw(r10, __ post(oldArr, 4));
  __ ldrw(r11, __ post(oldArrNext, 4));
  __ lslvw(r10, r10, shiftCount);
  __ lsrvw(r11, r11, shiftRevCount);
  __ orrw(r12, r10, r11);
  __ strw(r12, __ post(newArr, 4));
  __ tbz(numIter, 1, Exit);
  __ tbz(numIter, 0, ShiftOne);

  // One word, advancing all three pointers for the ShiftOne that follows.
  __ BIND(ShiftTwo);
  __ ldrw(r10, __ post(oldArr, 4));
  __ ldrw(r11, __ post(oldArrNext, 4));
  __ lslvw(r10, r10, shiftCount);
  __ lsrvw(r11, r11, shiftRevCount);
  __ orrw(r12, r10, r11);
  __ strw(r12, __ post(newArr, 4));

  // Final word; no pointer update is needed any more.
  __ BIND(ShiftOne);
  __ ldrw(r10, Address(oldArr));
  __ ldrw(r11, Address(oldArrNext));
  __ lslvw(r10, r10, shiftCount);
  __ lsrvw(r11, r11, shiftRevCount);
  __ orrw(r12, r10, r11);
  __ strw(r12, Address(newArr));

  __ BIND(Exit);
  __ ret(lr);

  // record the stub entry and end
  store_archive_data(stub_id, start, __ pc());

  return start;
}
9866
// Stub: count_positives, with a second entry point that is handed back to the
// caller through the count_positives_long reference parameter.
//
// Registers used by the emitted code:
//   r1 (ary1)   - address of the first byte to examine
//   r2 (len)    - number of bytes
//   r0 (result) - must already hold a copy of len on entry (see precondition below)
//
// On return r0 is left unchanged when no byte with its top bit (0x80) set was
// seen. Otherwise it is 0 (short path, len <= 15) or result - len, where len
// has been adjusted so that the difference counts bytes known to have the top
// bit clear (long path).
//
// The first entry handles len <= 15 itself and branches into the long path for
// larger lengths; the second entry goes straight to the long path.
address generate_count_positives(address &count_positives_long) {
  StubId stub_id = StubId::stubgen_count_positives_id;
  GrowableArray<address> entries;
  int entry_count = StubInfo::entry_count(stub_id);
  // We have an extra entry for count_positives_long.
  assert(entry_count == 2, "sanity check");
  // NOTE(review): load_archive_data presumably returns an already archived copy
  // of this stub and fills `entries` with its extra entry points; on success
  // nothing is generated.
  address start = load_archive_data(stub_id, &entries);
  if (start != nullptr) {
    assert(entries.length() == 1,
           "unexpected extra entry count %d", entries.length());
    count_positives_long = entries.at(0);
    return start;
  }
  const u1 large_loop_size = 64;
  // Top bit of every byte in a 64-bit word.
  const uint64_t UPPER_BIT_MASK=0x8080808080808080;
  int dcache_line = VM_Version::dcache_line_size();

  Register ary1 = r1, len = r2, result = r0;

  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, stub_id);

  address entry = __ pc();

  __ enter();
  // precondition: a copy of len is already in result
  // __ mov(result, len);

  Label RET_ADJUST, RET_ADJUST_16, RET_ADJUST_LONG, RET_NO_POP, RET_LEN, ALIGNED, LOOP16, CHECK_16,
        LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;

  __ cmp(len, (u1)15);
  __ br(Assembler::GT, LEN_OVER_15);
  // The only case when execution falls into this code is when pointer is near
  // the end of memory page and we have to avoid reading next page
  // ary1 is moved to the end of the data so that the loads below end exactly
  // at the last byte and never touch the following page.
  __ add(ary1, ary1, len);
  __ subs(len, len, 8);
  __ br(Assembler::GT, LEN_OVER_8);
  // len <= 8: load the 8 bytes ending at the end of the data and shift out the
  // low-order bytes that were loaded from below the start of the data.
  // NOTE(review): for an original len of 0 the shift count is 64, which lsrv
  // treats as 0, so all 8 preceding bytes would be tested — presumably callers
  // never enter the stub with len == 0; confirm.
  __ ldr(rscratch2, Address(ary1, -8));
  __ sub(rscratch1, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes.
  __ lsrv(rscratch2, rscratch2, rscratch1);
  __ tst(rscratch2, UPPER_BIT_MASK);
  // result = 0 if any tested byte has its top bit set, else unchanged (== len).
  __ csel(result, zr, result, Assembler::NE);
  __ leave();
  __ ret(lr);
  __ bind(LEN_OVER_8);
  // 9 <= len <= 15: load the 16 bytes ending at the end of the data. The upper
  // word (rscratch2) lies entirely inside the data; the lower word needs the
  // same shift-out treatment as above.
  __ ldp(rscratch1, rscratch2, Address(ary1, -16));
  __ sub(len, len, 8); // no data dep., then sub can be executed while loading
  __ tst(rscratch2, UPPER_BIT_MASK);
  __ br(Assembler::NE, RET_NO_POP);
  __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
  __ lsrv(rscratch1, rscratch1, rscratch2);
  __ tst(rscratch1, UPPER_BIT_MASK);
  // Reached with the flags of whichever tst ran last.
  __ bind(RET_NO_POP);
  __ csel(result, zr, result, Assembler::NE);
  __ leave();
  __ ret(lr);

  Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
  // r3-r7 and r10 are saved on the stack for the duration of the long path.
  const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;

  count_positives_long = __ pc(); // 2nd entry point
  entries.append(count_positives_long);

  __ enter();

  // LEN_OVER_15 is bound after the second entry's enter(): the first entry has
  // already executed its own enter() before branching here, so both entries
  // arrive with exactly one frame to undo.
  __ bind(LEN_OVER_15);
  __ push(spilled_regs, sp);
  __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
  __ cbz(rscratch2, ALIGNED);
  // Unaligned start: test the first 16 bytes, then advance ary1 to the next
  // 16-byte boundary (some bytes are re-tested by the aligned code).
  // The branch to RET_ADJUST is taken before len is reduced, so result - len
  // is 0 in that case.
  __ ldp(tmp6, tmp1, Address(ary1));
  __ mov(tmp5, 16);
  __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address
  __ add(ary1, ary1, rscratch1);
  __ orr(tmp6, tmp6, tmp1);
  __ tst(tmp6, UPPER_BIT_MASK);
  __ br(Assembler::NE, RET_ADJUST);
  __ sub(len, len, rscratch1);

  __ bind(ALIGNED);
  __ cmp(len, large_loop_size);
  __ br(Assembler::LT, CHECK_16);
  // Perform 16-byte load as early return in pre-loop to handle situation
  // when initially aligned large array has negative values at starting bytes,
  // so LARGE_LOOP would do 4 reads instead of 1 (in worst case), which is
  // slower. Cases with negative bytes further ahead won't be affected that
  // much. In fact, it'll be faster due to early loads, less instructions and
  // less branches in LARGE_LOOP.
  __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
  __ sub(len, len, 16);
  __ orr(tmp6, tmp6, tmp1);
  __ tst(tmp6, UPPER_BIT_MASK);
  __ br(Assembler::NE, RET_ADJUST_16);
  __ cmp(len, large_loop_size);
  __ br(Assembler::LT, CHECK_16);

  if (SoftwarePrefetchHintDistance >= 0
      && SoftwarePrefetchHintDistance >= dcache_line) {
    // initial prefetch
    __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
  }
  __ bind(LARGE_LOOP);
  if (SoftwarePrefetchHintDistance >= 0) {
    __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
  }
  // Issue load instructions first, since it can save few CPU/MEM cycles, also
  // instead of 4 triples of "orr(...); tst(...); br(NE, ...);" (one per ldp)
  // better generate 7 * orr(...) + 1 tst(...) + 1 br(NE, ...) which saves 3
  // instructions per loop iteration and has fewer branches, but this approach
  // disables early return, thus, all 64 bytes are loaded and checked every time.
  __ ldp(tmp2, tmp3, Address(ary1));
  __ ldp(tmp4, tmp5, Address(ary1, 16));
  __ ldp(rscratch1, rscratch2, Address(ary1, 32));
  __ ldp(tmp6, tmp1, Address(ary1, 48));
  __ add(ary1, ary1, large_loop_size);
  __ sub(len, len, large_loop_size);
  __ orr(tmp2, tmp2, tmp3);
  __ orr(tmp4, tmp4, tmp5);
  __ orr(rscratch1, rscratch1, rscratch2);
  __ orr(tmp6, tmp6, tmp1);
  __ orr(tmp2, tmp2, tmp4);
  __ orr(rscratch1, rscratch1, tmp6);
  __ orr(tmp2, tmp2, rscratch1);
  __ tst(tmp2, UPPER_BIT_MASK);
  __ br(Assembler::NE, RET_ADJUST_LONG);
  __ cmp(len, large_loop_size);
  __ br(Assembler::GE, LARGE_LOOP);

  __ bind(CHECK_16); // small 16-byte load pre-loop
  __ cmp(len, (u1)16);
  __ br(Assembler::LT, POST_LOOP16);

  __ bind(LOOP16); // small 16-byte load loop
  __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
  __ sub(len, len, 16);
  __ orr(tmp2, tmp2, tmp3);
  __ tst(tmp2, UPPER_BIT_MASK);
  __ br(Assembler::NE, RET_ADJUST_16);
  __ cmp(len, (u1)16);
  __ br(Assembler::GE, LOOP16); // 16-byte load loop end

  __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
  __ cmp(len, (u1)8);
  __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
  // More than 8 bytes left: test one full word. The branch is taken before len
  // is reduced, so the whole word is excluded from the returned count.
  __ ldr(tmp3, Address(__ post(ary1, 8)));
  __ tst(tmp3, UPPER_BIT_MASK);
  __ br(Assembler::NE, RET_ADJUST);
  __ sub(len, len, 8);

  __ bind(POST_LOOP16_LOAD_TAIL);
  __ cbz(len, RET_LEN); // Can't shift left by 64 when len==0
  // 1..8 bytes left: load a full word and shift left by 64 - 8 * len bits so
  // that only the low-order len bytes of the load remain to be tested.
  __ ldr(tmp1, Address(ary1));
  __ mov(tmp2, 64);
  __ sub(tmp4, tmp2, len, __ LSL, 3);
  __ lslv(tmp1, tmp1, tmp4);
  __ tst(tmp1, UPPER_BIT_MASK);
  __ br(Assembler::NE, RET_ADJUST);
  // Fallthrough

  // No byte with the top bit set: result still holds the original len.
  __ bind(RET_LEN);
  __ pop(spilled_regs, sp);
  __ leave();
  __ ret(lr);

  // difference result - len is the count of guaranteed to be
  // positive bytes
  // The labels below fall through into each other and add back what the
  // failing chunk had already subtracted from len: 64 bytes for LARGE_LOOP
  // (48 here plus 16 below), 16 bytes for the 16-byte loads, nothing for
  // branches taken before len was updated.

  __ bind(RET_ADJUST_LONG);
  __ add(len, len, (u1)(large_loop_size - 16));
  __ bind(RET_ADJUST_16);
  __ add(len, len, 16);
  __ bind(RET_ADJUST);
  __ pop(spilled_regs, sp);
  __ leave();
  __ sub(result, result, len);
  __ ret(lr);

  // record the stub entry and end plus the extra entry
  store_archive_data(stub_id, entry, __ pc(), &entries);

  return entry;
}
10049
10050 void generate_large_array_equals_loop_nonsimd(int loopThreshold,
10051 bool usePrefetch, Label &NOT_EQUAL) {
10052 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
10053 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
10054 tmp7 = r12, tmp8 = r13;
10055 Label LOOP;
10056
10057 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
10058 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
10059 __ bind(LOOP);
10060 if (usePrefetch) {
10061 __ prfm(Address(a1, SoftwarePrefetchHintDistance));
10062 __ prfm(Address(a2, SoftwarePrefetchHintDistance));
10063 }
10064 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
10065 __ eor(tmp1, tmp1, tmp2);
10066 __ eor(tmp3, tmp3, tmp4);
10067 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
10068 __ orr(tmp1, tmp1, tmp3);
10069 __ cbnz(tmp1, NOT_EQUAL);
10070 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
10071 __ eor(tmp5, tmp5, tmp6);
10072 __ eor(tmp7, tmp7, tmp8);
10073 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
10074 __ orr(tmp5, tmp5, tmp7);
10075 __ cbnz(tmp5, NOT_EQUAL);
10076 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
10077 __ eor(tmp1, tmp1, tmp2);
10078 __ eor(tmp3, tmp3, tmp4);
10079 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
10080 __ orr(tmp1, tmp1, tmp3);
10081 __ cbnz(tmp1, NOT_EQUAL);
10082 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
10083 __ eor(tmp5, tmp5, tmp6);
10084 __ sub(cnt1, cnt1, 8 * wordSize);
10085 __ eor(tmp7, tmp7, tmp8);
10086 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
10087 // tmp6 is not used. MacroAssembler::subs is used here (rather than
10088 // cmp) because subs allows an unlimited range of immediate operand.
10089 __ subs(tmp6, cnt1, loopThreshold);
10090 __ orr(tmp5, tmp5, tmp7);
10091 __ cbnz(tmp5, NOT_EQUAL);
10092 __ br(__ GE, LOOP);
10093 // post-loop
10094 __ eor(tmp1, tmp1, tmp2);
10095 __ eor(tmp3, tmp3, tmp4);
10096 __ orr(tmp1, tmp1, tmp3);
10097 __ sub(cnt1, cnt1, 2 * wordSize);
10098 __ cbnz(tmp1, NOT_EQUAL);
10099 }
10100
10101 void generate_large_array_equals_loop_simd(int loopThreshold,
10102 bool usePrefetch, Label &NOT_EQUAL) {
10103 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
10104 tmp2 = rscratch2;
10105 Label LOOP;
10106
10107 __ bind(LOOP);
10108 if (usePrefetch) {
10109 __ prfm(Address(a1, SoftwarePrefetchHintDistance));
10110 __ prfm(Address(a2, SoftwarePrefetchHintDistance));
10111 }
10112 __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
10113 __ sub(cnt1, cnt1, 8 * wordSize);
10114 __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
10115 __ subs(tmp1, cnt1, loopThreshold);
10116 __ eor(v0, __ T16B, v0, v4);
10117 __ eor(v1, __ T16B, v1, v5);
10118 __ eor(v2, __ T16B, v2, v6);
10119 __ eor(v3, __ T16B, v3, v7);
10120 __ orr(v0, __ T16B, v0, v1);
10121 __ orr(v1, __ T16B, v2, v3);
10122 __ orr(v0, __ T16B, v0, v1);
10123 __ umov(tmp1, v0, __ D, 0);
10124 __ umov(tmp2, v0, __ D, 1);
10125 __ orr(tmp1, tmp1, tmp2);
10126 __ cbnz(tmp1, NOT_EQUAL);
10127 __ br(__ GE, LOOP);
10128 }
10129
10130 // a1 = r1 - array1 address
10131 // a2 = r2 - array2 address
10132 // result = r0 - return value. Already contains "false"
10133 // cnt1 = r10 - amount of elements left to check, reduced by wordSize
10134 // r3-r5 are reserved temporary registers
10135 // Clobbers: v0-v7 when UseSIMDForArrayEquals, rscratch1, rscratch2
10136 address generate_large_array_equals() {
10137 StubId stub_id = StubId::stubgen_large_array_equals_id;
10138 int entry_count = StubInfo::entry_count(stub_id);
10139 assert(entry_count == 1, "sanity check");
10140 address start = load_archive_data(stub_id);
10141 if (start != nullptr) {
10142 return start;
10143 }
10144 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
10145 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
10146 tmp7 = r12, tmp8 = r13;
10147 Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
10148 SMALL_LOOP, POST_LOOP;
10149 const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
10150 // calculate if at least 32 prefetched bytes are used
10151 int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
10152 int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
10153 RegSet spilled_regs = RegSet::range(tmp6, tmp8);
10154 assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
10155 tmp5, tmp6, tmp7, tmp8);
10156
10157 __ align(CodeEntryAlignment);
10158
10159 StubCodeMark mark(this, stub_id);
10160
10161 address entry = __ pc();
10162 __ enter();
10163 __ sub(cnt1, cnt1, wordSize); // first 8 bytes were loaded outside of stub
10164 // also advance pointers to use post-increment instead of pre-increment
10165 __ add(a1, a1, wordSize);
10166 __ add(a2, a2, wordSize);
10167 if (AvoidUnalignedAccesses) {
10168 // both implementations (SIMD/nonSIMD) are using relatively large load
10169 // instructions (ld1/ldp), which has huge penalty (up to x2 exec time)
10170 // on some CPUs in case of address is not at least 16-byte aligned.
10171 // Arrays are 8-byte aligned currently, so, we can make additional 8-byte
10172 // load if needed at least for 1st address and make if 16-byte aligned.
10173 Label ALIGNED16;
10174 __ tbz(a1, 3, ALIGNED16);
10175 __ ldr(tmp1, Address(__ post(a1, wordSize)));
10176 __ ldr(tmp2, Address(__ post(a2, wordSize)));
10177 __ sub(cnt1, cnt1, wordSize);
10178 __ eor(tmp1, tmp1, tmp2);
10179 __ cbnz(tmp1, NOT_EQUAL_NO_POP);
10180 __ bind(ALIGNED16);
10181 }
10182 if (UseSIMDForArrayEquals) {
10183 if (SoftwarePrefetchHintDistance >= 0) {
10184 __ subs(tmp1, cnt1, prefetchLoopThreshold);
10185 __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
10186 generate_large_array_equals_loop_simd(prefetchLoopThreshold,
10187 /* prfm = */ true, NOT_EQUAL);
10188 __ subs(zr, cnt1, nonPrefetchLoopThreshold);
10189 __ br(__ LT, TAIL);
10190 }
10191 __ bind(NO_PREFETCH_LARGE_LOOP);
10192 generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
10193 /* prfm = */ false, NOT_EQUAL);
10194 } else {
10195 __ push(spilled_regs, sp);
10196 if (SoftwarePrefetchHintDistance >= 0) {
10197 __ subs(tmp1, cnt1, prefetchLoopThreshold);
10198 __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
10199 generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
10200 /* prfm = */ true, NOT_EQUAL);
10201 __ subs(zr, cnt1, nonPrefetchLoopThreshold);
10202 __ br(__ LT, TAIL);
10203 }
10204 __ bind(NO_PREFETCH_LARGE_LOOP);
10205 generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
10206 /* prfm = */ false, NOT_EQUAL);
10207 }
10208 __ bind(TAIL);
10209 __ cbz(cnt1, EQUAL);
10210 __ subs(cnt1, cnt1, wordSize);
10211 __ br(__ LE, POST_LOOP);
10212 __ bind(SMALL_LOOP);
10213 __ ldr(tmp1, Address(__ post(a1, wordSize)));
10214 __ ldr(tmp2, Address(__ post(a2, wordSize)));
10215 __ subs(cnt1, cnt1, wordSize);
10216 __ eor(tmp1, tmp1, tmp2);
10217 __ cbnz(tmp1, NOT_EQUAL);
10218 __ br(__ GT, SMALL_LOOP);
10219 __ bind(POST_LOOP);
10220 __ ldr(tmp1, Address(a1, cnt1));
10221 __ ldr(tmp2, Address(a2, cnt1));
10222 __ eor(tmp1, tmp1, tmp2);
10223 __ cbnz(tmp1, NOT_EQUAL);
10224 __ bind(EQUAL);
10225 __ mov(result, true);
10226 __ bind(NOT_EQUAL);
10227 if (!UseSIMDForArrayEquals) {
10228 __ pop(spilled_regs, sp);
10229 }
10230 __ bind(NOT_EQUAL_NO_POP);
10231 __ leave();
10232 __ ret(lr);
10233
10234 // record the stub entry and end
10235 store_archive_data(stub_id, entry, __ pc());
10236
10237 return entry;
10238 }
10239
// Stub: vectorized array hash code for arrays of element type eltype
// (T_BOOLEAN, T_BYTE, T_CHAR, T_SHORT or T_INT; one stub id per type).
//
// result = r0 - return value. Contains initial hashcode value on entry.
// ary = r1 - array address
// cnt = r2 - elements count
// Clobbers: v0-v13, rscratch1, rscratch2
//
// Layout of the emitted code:
//   SMALL_LOOP  - handles (cnt & ((uf - 1) * vf)) elements, vf per iteration
//   LARGE_LOOP  - handles the (cnt >> log2(evf)) full groups of evf elements,
//                 using four accumulators
//   TAIL        - handles the final (cnt & (vf - 1)) elements one by one via a
//                 computed branch into an unrolled load + maddw sequence
address generate_large_arrays_hashcode(BasicType eltype) {
  StubId stub_id;
  switch (eltype) {
  case T_BOOLEAN:
    stub_id = StubId::stubgen_large_arrays_hashcode_boolean_id;
    break;
  case T_BYTE:
    stub_id = StubId::stubgen_large_arrays_hashcode_byte_id;
    break;
  case T_CHAR:
    stub_id = StubId::stubgen_large_arrays_hashcode_char_id;
    break;
  case T_SHORT:
    stub_id = StubId::stubgen_large_arrays_hashcode_short_id;
    break;
  case T_INT:
    stub_id = StubId::stubgen_large_arrays_hashcode_int_id;
    break;
  default:
    stub_id = StubId::NO_STUBID;
    ShouldNotReachHere();
  };
  int entry_count = StubInfo::entry_count(stub_id);
  assert(entry_count == 1, "sanity check");
  // NOTE(review): load_archive_data presumably returns an already archived copy
  // of this stub; a non-null result is returned as-is and nothing is generated.
  address start = load_archive_data(stub_id);
  if (start != nullptr) {
    return start;
  }
  const Register result = r0, ary = r1, cnt = r2;
  // vdata3..vdata0 are v0..v3, i.e. consecutive registers in the order used by
  // the four-register ld1 in LARGE_LOOP.
  const FloatRegister vdata0 = v3, vdata1 = v2, vdata2 = v1, vdata3 = v0;
  // Accumulators; vmul0 is the only one used by SMALL_LOOP.
  const FloatRegister vmul0 = v4, vmul1 = v5, vmul2 = v6, vmul3 = v7;
  const FloatRegister vpow = v12; // powers of 31: <31^3, ..., 31^0>
  // Per-iteration multipliers, selected by lane index in the mulvs calls below.
  const FloatRegister vpowm = v13;

  // Macro defined outside this chunk.
  ARRAYS_HASHCODE_REGISTERS;

  Label SMALL_LOOP, LARGE_LOOP_PREHEADER, LARGE_LOOP, TAIL, TAIL_SHORTCUT, BR_BASE;

  unsigned int vf; // vectorization factor
  // True for sub-word element types: a loaded vector is then accumulated in
  // two halves of four elements each.
  bool multiply_by_halves;
  Assembler::SIMD_Arrangement load_arrangement;
  switch (eltype) {
  case T_BOOLEAN:
  case T_BYTE:
    load_arrangement = Assembler::T8B;
    multiply_by_halves = true;
    vf = 8;
    break;
  case T_CHAR:
  case T_SHORT:
    load_arrangement = Assembler::T8H;
    multiply_by_halves = true;
    vf = 8;
    break;
  case T_INT:
    load_arrangement = Assembler::T4S;
    multiply_by_halves = false;
    vf = 4;
    break;
  default:
    ShouldNotReachHere();
  }

  // Unroll factor
  const unsigned uf = 4;

  // Effective vectorization factor
  const unsigned evf = vf * uf;

  __ align(CodeEntryAlignment);

  StubCodeMark mark(this, stub_id);

  address entry = __ pc();
  __ enter();

  // Put 0-3'th powers of 31 into a single SIMD register together. The register will be used in
  // the SMALL and LARGE LOOPS' epilogues. The initialization is hoisted here and the register's
  // value shouldn't change throughout both loops.
  __ movw(rscratch1, intpow(31U, 3));
  __ mov(vpow, Assembler::S, 0, rscratch1);
  __ movw(rscratch1, intpow(31U, 2));
  __ mov(vpow, Assembler::S, 1, rscratch1);
  __ movw(rscratch1, intpow(31U, 1));
  __ mov(vpow, Assembler::S, 2, rscratch1);
  __ movw(rscratch1, intpow(31U, 0));
  __ mov(vpow, Assembler::S, 3, rscratch1);

  // vmul0 = <0, 0, 0, initial hash>: the incoming hash sits in lane 3, the
  // lane that is multiplied by 31^0 in the epilogues.
  __ mov(vmul0, Assembler::T16B, 0);
  __ mov(vmul0, Assembler::S, 3, result);

  // Number of elements for SMALL_LOOP (a multiple of vf below evf); when it is
  // zero go straight to the large loop.
  __ andr(rscratch2, cnt, (uf - 1) * vf);
  __ cbz(rscratch2, LARGE_LOOP_PREHEADER);

  // Multiplier applied to the accumulator per group of four (sub-word types)
  // or per vf elements (T_INT).
  __ movw(rscratch1, intpow(31U, multiply_by_halves ? vf / 2 : vf));
  __ mov(vpowm, Assembler::S, 0, rscratch1);

  // SMALL LOOP
  __ bind(SMALL_LOOP);

  __ ld1(vdata0, load_arrangement, Address(__ post(ary, vf * type2aelembytes(eltype))));
  __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
  // Sets the flags tested by br(HI, SMALL_LOOP) at the end of the iteration.
  __ subsw(rscratch2, rscratch2, vf);

  if (load_arrangement == Assembler::T8B) {
    // Extend 8B to 8H to be able to use vector multiply
    // instructions
    assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H");
    if (is_signed_subword_type(eltype)) {
      __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
    } else {
      __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
    }
  }

  // Add the (lower four) elements to the accumulator, widening sub-word
  // elements with the signedness of eltype.
  switch (load_arrangement) {
  case Assembler::T4S:
    __ addv(vmul0, load_arrangement, vmul0, vdata0);
    break;
  case Assembler::T8B:
  case Assembler::T8H:
    assert(is_subword_type(eltype), "subword type expected");
    if (is_signed_subword_type(eltype)) {
      __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
    } else {
      __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
    }
    break;
  default:
    __ should_not_reach_here();
  }

  // Process the upper half of a vector
  if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) {
    __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
    if (is_signed_subword_type(eltype)) {
      __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
    } else {
      __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
    }
  }

  __ br(Assembler::HI, SMALL_LOOP);

  // SMALL LOOP'S EPILOGUE
  // If at least one full group of evf elements remains, continue with the
  // large loop (vmul0 is carried over as one of its accumulators).
  __ lsr(rscratch2, cnt, exact_log2(evf));
  __ cbnz(rscratch2, LARGE_LOOP_PREHEADER);

  // Otherwise reduce vmul0: weight the lanes with <31^3, ..., 31^0> and sum them.
  __ mulv(vmul0, Assembler::T4S, vmul0, vpow);
  __ addv(vmul0, Assembler::T4S, vmul0);
  __ umov(result, vmul0, Assembler::S, 0);

  // TAIL
  __ bind(TAIL);

  // The andr performs cnt % vf. The subtract shifted by 3 offsets past vf - 1 - (cnt % vf) pairs
  // of load + madd insns i.e. it only executes cnt % vf load + madd pairs.
  assert(is_power_of_2(vf), "can't use this value to calculate the jump target PC");
  __ andr(rscratch2, cnt, vf - 1);
  // Entered directly from the end of the large loop with rscratch2 already set.
  __ bind(TAIL_SHORTCUT);
  __ adr(rscratch1, BR_BASE);
  // For Cortex-A53 offset is 4 because 2 nops are generated.
  __ sub(rscratch1, rscratch1, rscratch2, ext::uxtw, VM_Version::supports_a53mac() ? 4 : 3);
  // rscratch2 is reused as the multiplier 31 for the maddw sequence below.
  __ movw(rscratch2, 0x1f);
  __ br(rscratch1);

  // Unrolled scalar tail: vf - 1 load + maddw pairs ending at BR_BASE. The
  // computed branch above enters it so that only the needed pairs execute.
  for (size_t i = 0; i < vf - 1; ++i) {
    __ load(rscratch1, Address(__ post(ary, type2aelembytes(eltype))),
            eltype);
    __ maddw(result, result, rscratch2, rscratch1);
    // maddw generates an extra nop for Cortex-A53 (see maddw definition in macroAssembler).
    // Generate 2nd nop to have 4 instructions per iteration.
    if (VM_Version::supports_a53mac()) {
      __ nop();
    }
  }
  __ bind(BR_BASE);

  __ leave();
  __ ret(lr);

  // LARGE LOOP
  __ bind(LARGE_LOOP_PREHEADER);

  // Number of large-loop iterations.
  __ lsr(rscratch2, cnt, exact_log2(evf));

  if (multiply_by_halves) {
    // 31^4 - multiplier between lower and upper parts of a register
    __ movw(rscratch1, intpow(31U, vf / 2));
    __ mov(vpowm, Assembler::S, 1, rscratch1);
    // 31^28 - remainder of the iteration multiplier, 28 = 32 - 4
    __ movw(rscratch1, intpow(31U, evf - vf / 2));
    __ mov(vpowm, Assembler::S, 0, rscratch1);
  } else {
    // 31^16
    __ movw(rscratch1, intpow(31U, evf));
    __ mov(vpowm, Assembler::S, 0, rscratch1);
  }

  // vmul0 keeps the value accumulated so far; the other accumulators start at zero.
  __ mov(vmul3, Assembler::T16B, 0);
  __ mov(vmul2, Assembler::T16B, 0);
  __ mov(vmul1, Assembler::T16B, 0);

  __ bind(LARGE_LOOP);

  // Scale every accumulator by the lane-0 multiplier of vpowm.
  __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 0);
  __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 0);
  __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 0);
  __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);

  // Load evf elements into four vector registers.
  __ ld1(vdata3, vdata2, vdata1, vdata0, load_arrangement,
         Address(__ post(ary, evf * type2aelembytes(eltype))));

  if (load_arrangement == Assembler::T8B) {
    // Extend 8B to 8H to be able to use vector multiply
    // instructions
    assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H");
    if (is_signed_subword_type(eltype)) {
      __ sxtl(vdata3, Assembler::T8H, vdata3, load_arrangement);
      __ sxtl(vdata2, Assembler::T8H, vdata2, load_arrangement);
      __ sxtl(vdata1, Assembler::T8H, vdata1, load_arrangement);
      __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
    } else {
      __ uxtl(vdata3, Assembler::T8H, vdata3, load_arrangement);
      __ uxtl(vdata2, Assembler::T8H, vdata2, load_arrangement);
      __ uxtl(vdata1, Assembler::T8H, vdata1, load_arrangement);
      __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
    }
  }

  // Add the (lower four) elements of each data register to its accumulator.
  switch (load_arrangement) {
  case Assembler::T4S:
    __ addv(vmul3, load_arrangement, vmul3, vdata3);
    __ addv(vmul2, load_arrangement, vmul2, vdata2);
    __ addv(vmul1, load_arrangement, vmul1, vdata1);
    __ addv(vmul0, load_arrangement, vmul0, vdata0);
    break;
  case Assembler::T8B:
  case Assembler::T8H:
    assert(is_subword_type(eltype), "subword type expected");
    if (is_signed_subword_type(eltype)) {
      __ saddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H);
      __ saddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H);
      __ saddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H);
      __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
    } else {
      __ uaddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H);
      __ uaddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H);
      __ uaddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H);
      __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
    }
    break;
  default:
    __ should_not_reach_here();
  }

  // Process the upper half of a vector
  if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) {
    // Lane 1 of vpowm holds the multiplier between the lower and upper halves.
    __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 1);
    __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 1);
    __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 1);
    __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 1);
    if (is_signed_subword_type(eltype)) {
      __ saddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H);
      __ saddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H);
      __ saddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H);
      __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
    } else {
      __ uaddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H);
      __ uaddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H);
      __ uaddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H);
      __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
    }
  }

  __ subsw(rscratch2, rscratch2, 1);
  __ br(Assembler::HI, LARGE_LOOP);

  // LARGE LOOP'S EPILOGUE
  // Reduce each accumulator with <31^3, ..., 31^0> and chain the four partial
  // results, from vmul3 to vmul0, with the multiplier 31^vf kept in rscratch2.
  __ mulv(vmul3, Assembler::T4S, vmul3, vpow);
  __ addv(vmul3, Assembler::T4S, vmul3);
  __ umov(result, vmul3, Assembler::S, 0);

  __ mov(rscratch2, intpow(31U, vf));

  __ mulv(vmul2, Assembler::T4S, vmul2, vpow);
  __ addv(vmul2, Assembler::T4S, vmul2);
  __ umov(rscratch1, vmul2, Assembler::S, 0);
  __ maddw(result, result, rscratch2, rscratch1);

  __ mulv(vmul1, Assembler::T4S, vmul1, vpow);
  __ addv(vmul1, Assembler::T4S, vmul1);
  __ umov(rscratch1, vmul1, Assembler::S, 0);
  __ maddw(result, result, rscratch2, rscratch1);

  __ mulv(vmul0, Assembler::T4S, vmul0, vpow);
  __ addv(vmul0, Assembler::T4S, vmul0);
  __ umov(rscratch1, vmul0, Assembler::S, 0);
  __ maddw(result, result, rscratch2, rscratch1);

  // Leftover elements (cnt % vf) are handled by the scalar tail.
  __ andr(rscratch2, cnt, vf - 1);
  __ cbnz(rscratch2, TAIL_SHORTCUT);

  __ leave();
  __ ret(lr);

  // record the stub entry and end
  store_archive_data(stub_id, entry, __ pc());

  return entry;
}
10554
10555 address generate_dsin_dcos(bool isCos) {
10556 StubId stub_id = (isCos ? StubId::stubgen_dcos_id : StubId::stubgen_dsin_id);
10557 int entry_count = StubInfo::entry_count(stub_id);
10558 assert(entry_count == 1, "sanity check");
10559 address start = load_archive_data(stub_id);
10560 if (start != nullptr) {
10561 return start;
10562 }
10563 __ align(CodeEntryAlignment);
10564 StubCodeMark mark(this, stub_id);
10565 start = __ pc();
10566 __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw,
10567 (address)StubRoutines::aarch64::_two_over_pi,
10568 (address)StubRoutines::aarch64::_pio2,
10569 (address)StubRoutines::aarch64::_dsin_coef,
10570 (address)StubRoutines::aarch64::_dcos_coef);
10571
10572 // record the stub entry and end
10573 store_archive_data(stub_id, start, __ pc());
10574
10575 return start;
10576 }
10577
10578 // code for comparing 16 characters of strings with Latin1 and Utf16 encoding
10579 void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1,
10580 Label &DIFF2) {
10581 Register cnt1 = r2, tmp2 = r11, tmp3 = r12;
10582 FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2;
10583
10584 __ ldrq(vtmp, Address(__ post(tmp2, 16)));
10585 __ ldr(tmpU, Address(__ post(cnt1, 8)));
10586 __ zip1(vtmp3, __ T16B, vtmp, vtmpZ);
10587 // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3
10588
10589 __ fmovd(tmpL, vtmp3);
10590 __ eor(rscratch2, tmp3, tmpL);
10591 __ cbnz(rscratch2, DIFF2);
10592
10593 __ ldr(tmp3, Address(__ post(cnt1, 8)));
10594 __ umov(tmpL, vtmp3, __ D, 1);
10595 __ eor(rscratch2, tmpU, tmpL);
10596 __ cbnz(rscratch2, DIFF1);
10597
10598 __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
10599 __ ldr(tmpU, Address(__ post(cnt1, 8)));
10600 __ fmovd(tmpL, vtmp);
10601 __ eor(rscratch2, tmp3, tmpL);
10602 __ cbnz(rscratch2, DIFF2);
10603
10604 __ ldr(tmp3, Address(__ post(cnt1, 8)));
10605 __ umov(tmpL, vtmp, __ D, 1);
10606 __ eor(rscratch2, tmpU, tmpL);
10607 __ cbnz(rscratch2, DIFF1);
10608 }
10609
10610 // r0 = result
10611 // r1 = str1
10612 // r2 = cnt1
10613 // r3 = str2
10614 // r4 = cnt2
10615 // r10 = tmp1
10616 // r11 = tmp2
10617 address generate_compare_long_string_different_encoding(bool isLU) {
10618 StubId stub_id = (isLU ? StubId::stubgen_compare_long_string_LU_id : StubId::stubgen_compare_long_string_UL_id);
10619 int entry_count = StubInfo::entry_count(stub_id);
10620 assert(entry_count == 1, "sanity check");
10621 address start = load_archive_data(stub_id);
10622 if (start != nullptr) {
10623 return start;
10624 }
10625 __ align(CodeEntryAlignment);
10626 StubCodeMark mark(this, stub_id);
10627 address entry = __ pc();
10628 Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2,
10629 DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH,
10630 LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2;
10631 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
10632 tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14;
10633 FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2;
10634 RegSet spilled_regs = RegSet::of(tmp3, tmp4);
10635
10636 int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2);
10637
10638 __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ);
10639 // cnt2 == amount of characters left to compare
10640 // Check already loaded first 4 symbols(vtmp and tmp2(LU)/tmp1(UL))
10641 __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
10642 __ add(str1, str1, isLU ? wordSize/2 : wordSize);
10643 __ add(str2, str2, isLU ? wordSize : wordSize/2);
10644 __ fmovd(isLU ? tmp1 : tmp2, vtmp);
10645 __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case.
10646 __ eor(rscratch2, tmp1, tmp2);
10647 __ mov(rscratch1, tmp2);
10648 __ cbnz(rscratch2, CALCULATE_DIFFERENCE);
10649 Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison
10650 tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison
10651 __ push(spilled_regs, sp);
10652 __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load
10653 __ mov(cnt1, isLU ? str2 : str1); // init the pointer to U next load
10654
10655 __ ldr(tmp3, Address(__ post(cnt1, 8)));
10656
10657 if (SoftwarePrefetchHintDistance >= 0) {
10658 __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
10659 __ br(__ LT, NO_PREFETCH);
10660 __ bind(LARGE_LOOP_PREFETCH);
10661 __ prfm(Address(tmp2, SoftwarePrefetchHintDistance));
10662 __ mov(tmp4, 2);
10663 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
10664 __ bind(LARGE_LOOP_PREFETCH_REPEAT1);
10665 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
10666 __ subs(tmp4, tmp4, 1);
10667 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1);
10668 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
10669 __ mov(tmp4, 2);
10670 __ bind(LARGE_LOOP_PREFETCH_REPEAT2);
10671 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
10672 __ subs(tmp4, tmp4, 1);
10673 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2);
10674 __ sub(cnt2, cnt2, 64);
10675 __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
10676 __ br(__ GE, LARGE_LOOP_PREFETCH);
10677 }
10678 __ cbz(cnt2, LOAD_LAST); // no characters left except last load
10679 __ bind(NO_PREFETCH);
10680 __ subs(cnt2, cnt2, 16);
10681 __ br(__ LT, TAIL);
10682 __ align(OptoLoopAlignment);
10683 __ bind(SMALL_LOOP); // smaller loop
10684 __ subs(cnt2, cnt2, 16);
10685 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
10686 __ br(__ GE, SMALL_LOOP);
10687 __ cmn(cnt2, (u1)16);
10688 __ br(__ EQ, LOAD_LAST);
10689 __ bind(TAIL); // 1..15 characters left until last load (last 4 characters)
10690 __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string
10691 __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string
10692 __ ldr(tmp3, Address(cnt1, -8));
10693 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load
10694 __ b(LOAD_LAST);
10695 __ bind(DIFF2);
10696 __ mov(tmpU, tmp3);
10697 __ bind(DIFF1);
10698 __ pop(spilled_regs, sp);
10699 __ b(CALCULATE_DIFFERENCE);
10700 __ bind(LOAD_LAST);
10701 // Last 4 UTF-16 characters are already pre-loaded into tmp3 by compare_string_16_x_LU.
10702 // No need to load it again
10703 __ mov(tmpU, tmp3);
10704 __ pop(spilled_regs, sp);
10705
10706 // tmp2 points to the address of the last 4 Latin1 characters right now
10707 __ ldrs(vtmp, Address(tmp2));
10708 __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
10709 __ fmovd(tmpL, vtmp);
10710
10711 __ eor(rscratch2, tmpU, tmpL);
10712 __ cbz(rscratch2, DONE);
10713
10714 // Find the first different characters in the longwords and
10715 // compute their difference.
10716 __ bind(CALCULATE_DIFFERENCE);
10717 __ rev(rscratch2, rscratch2);
10718 __ clz(rscratch2, rscratch2);
10719 __ andr(rscratch2, rscratch2, -16);
10720 __ lsrv(tmp1, tmp1, rscratch2);
10721 __ uxthw(tmp1, tmp1);
10722 __ lsrv(rscratch1, rscratch1, rscratch2);
10723 __ uxthw(rscratch1, rscratch1);
10724 __ subw(result, tmp1, rscratch1);
10725 __ bind(DONE);
10726 __ ret(lr);
10727
10728 // record the stub entry and end
10729 store_archive_data(stub_id, entry, __ pc());
10730
10731 return entry;
10732 }
10733
10734 // r0 = input (float16)
10735 // v0 = result (float)
10736 // v1 = temporary float register
10737 address generate_float16ToFloat() {
10738 StubId stub_id = StubId::stubgen_hf2f_id;
10739 int entry_count = StubInfo::entry_count(stub_id);
10740 assert(entry_count == 1, "sanity check");
10741 address start = load_archive_data(stub_id);
10742 if (start != nullptr) {
10743 return start;
10744 }
10745 __ align(CodeEntryAlignment);
10746 StubCodeMark mark(this, stub_id);
10747 address entry = __ pc();
10748 BLOCK_COMMENT("Entry:");
10749 __ flt16_to_flt(v0, r0, v1);
10750 __ ret(lr);
10751
10752 // record the stub entry and end
10753 store_archive_data(stub_id, entry, __ pc());
10754
10755 return entry;
10756 }
10757
10758 // v0 = input (float)
10759 // r0 = result (float16)
10760 // v1 = temporary float register
10761 address generate_floatToFloat16() {
10762 StubId stub_id = StubId::stubgen_f2hf_id;
10763 int entry_count = StubInfo::entry_count(stub_id);
10764 assert(entry_count == 1, "sanity check");
10765 address start = load_archive_data(stub_id);
10766 if (start != nullptr) {
10767 return start;
10768 }
10769 __ align(CodeEntryAlignment);
10770 StubCodeMark mark(this, stub_id);
10771 address entry = __ pc();
10772 BLOCK_COMMENT("Entry:");
10773 __ flt_to_flt16(r0, v0, v1);
10774 __ ret(lr);
10775
10776 // record the stub entry and end
10777 store_archive_data(stub_id, entry, __ pc());
10778
10779 return entry;
10780 }
10781
10782 address generate_method_entry_barrier() {
10783 StubId stub_id = StubId::stubgen_method_entry_barrier_id;
10784 int entry_count = StubInfo::entry_count(stub_id);
10785 assert(entry_count == 1, "sanity check");
10786 address start = load_archive_data(stub_id);
10787 if (start != nullptr) {
10788 return start;
10789 }
10790 __ align(CodeEntryAlignment);
10791 StubCodeMark mark(this, stub_id);
10792
10793 Label deoptimize_label;
10794
10795 start = __ pc();
10796
10797 BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
10798
10799 if (bs_asm->nmethod_patching_type() == NMethodPatchingType::conc_instruction_and_data_patch) {
10800 BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
10801 // We can get here despite the nmethod being good, if we have not
10802 // yet applied our cross modification fence (or data fence).
10803 Address thread_epoch_addr(rthread, in_bytes(bs_nm->thread_disarmed_guard_value_offset()) + 4);
10804 __ lea(rscratch2, ExternalAddress(bs_asm->patching_epoch_addr()));
10805 __ ldrw(rscratch2, rscratch2);
10806 __ strw(rscratch2, thread_epoch_addr);
10807 __ isb();
10808 __ membar(__ LoadLoad);
10809 }
10810
10811 __ set_last_Java_frame(sp, rfp, lr, rscratch1);
10812
10813 __ enter();
10814 __ add(rscratch2, sp, wordSize); // rscratch2 points to the saved lr
10815
10816 __ sub(sp, sp, 4 * wordSize); // four words for the returned {sp, fp, lr, pc}
10817
10818 __ push_call_clobbered_registers();
10819
10820 __ mov(c_rarg0, rscratch2);
10821 __ call_VM_leaf
10822 (CAST_FROM_FN_PTR
10823 (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1);
10824
10825 __ reset_last_Java_frame(true);
10826
10827 __ mov(rscratch1, r0);
10828
10829 __ pop_call_clobbered_registers();
10830
10831 __ cbnz(rscratch1, deoptimize_label);
10832
10833 __ leave();
10834 __ ret(lr);
10835
10836 __ BIND(deoptimize_label);
10837
10838 __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize));
10839 __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize));
10840
10841 __ mov(sp, rscratch1);
10842 __ br(rscratch2);
10843
10844 // record the stub entry and end
10845 store_archive_data(stub_id, start, __ pc());
10846
10847 return start;
10848 }
10849
10850 // r0 = result
10851 // r1 = str1
10852 // r2 = cnt1
10853 // r3 = str2
10854 // r4 = cnt2
10855 // r10 = tmp1
10856 // r11 = tmp2
10857 address generate_compare_long_string_same_encoding(bool isLL) {
10858 StubId stub_id = (isLL ? StubId::stubgen_compare_long_string_LL_id : StubId::stubgen_compare_long_string_UU_id);
10859 int entry_count = StubInfo::entry_count(stub_id);
10860 assert(entry_count == 1, "sanity check");
10861 address start = load_archive_data(stub_id);
10862 if (start != nullptr) {
10863 return start;
10864 }
10865 __ align(CodeEntryAlignment);
10866 StubCodeMark mark(this, stub_id);
10867 address entry = __ pc();
10868 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
10869 tmp1 = r10, tmp2 = r11, tmp1h = rscratch1, tmp2h = rscratch2;
10870
10871 Label LARGE_LOOP_PREFETCH, LOOP_COMPARE16, DIFF, LESS16, LESS8, CAL_DIFFERENCE, LENGTH_DIFF;
10872
10873 // exit from large loop when less than 64 bytes left to read or we're about
10874 // to prefetch memory behind array border
10875 int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);
10876
10877 // before jumping to stub, pre-load 8 bytes already, so do comparison directly
10878 __ eor(rscratch2, tmp1, tmp2);
10879 __ cbnz(rscratch2, CAL_DIFFERENCE);
10880
10881 __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2));
10882 // update pointers, because of previous read
10883 __ add(str1, str1, wordSize);
10884 __ add(str2, str2, wordSize);
10885 if (SoftwarePrefetchHintDistance >= 0) {
10886 __ align(OptoLoopAlignment);
10887 __ bind(LARGE_LOOP_PREFETCH);
10888 __ prfm(Address(str1, SoftwarePrefetchHintDistance));
10889 __ prfm(Address(str2, SoftwarePrefetchHintDistance));
10890
10891 for (int i = 0; i < 4; i++) {
10892 __ ldp(tmp1, tmp1h, Address(str1, i * 16));
10893 __ ldp(tmp2, tmp2h, Address(str2, i * 16));
10894 __ cmp(tmp1, tmp2);
10895 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
10896 __ br(Assembler::NE, DIFF);
10897 }
10898 __ sub(cnt2, cnt2, isLL ? 64 : 32);
10899 __ add(str1, str1, 64);
10900 __ add(str2, str2, 64);
10901 __ subs(rscratch2, cnt2, largeLoopExitCondition);
10902 __ br(Assembler::GE, LARGE_LOOP_PREFETCH);
10903 __ cbz(cnt2, LENGTH_DIFF); // no more chars left?
10904 }
10905
10906 __ subs(rscratch1, cnt2, isLL ? 16 : 8);
10907 __ br(Assembler::LE, LESS16);
10908 __ align(OptoLoopAlignment);
10909 __ bind(LOOP_COMPARE16);
10910 __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
10911 __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
10912 __ cmp(tmp1, tmp2);
10913 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
10914 __ br(Assembler::NE, DIFF);
10915 __ sub(cnt2, cnt2, isLL ? 16 : 8);
10916 __ subs(rscratch2, cnt2, isLL ? 16 : 8);
10917 __ br(Assembler::LT, LESS16);
10918
10919 __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
10920 __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
10921 __ cmp(tmp1, tmp2);
10922 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
10923 __ br(Assembler::NE, DIFF);
10924 __ sub(cnt2, cnt2, isLL ? 16 : 8);
10925 __ subs(rscratch2, cnt2, isLL ? 16 : 8);
10926 __ br(Assembler::GE, LOOP_COMPARE16);
10927 __ cbz(cnt2, LENGTH_DIFF);
10928
10929 __ bind(LESS16);
10930 // each 8 compare
10931 __ subs(cnt2, cnt2, isLL ? 8 : 4);
10932 __ br(Assembler::LE, LESS8);
10933 __ ldr(tmp1, Address(__ post(str1, 8)));
10934 __ ldr(tmp2, Address(__ post(str2, 8)));
10935 __ eor(rscratch2, tmp1, tmp2);
10936 __ cbnz(rscratch2, CAL_DIFFERENCE);
10937 __ sub(cnt2, cnt2, isLL ? 8 : 4);
10938
10939 __ bind(LESS8); // directly load last 8 bytes
10940 if (!isLL) {
10941 __ add(cnt2, cnt2, cnt2);
10942 }
10943 __ ldr(tmp1, Address(str1, cnt2));
10944 __ ldr(tmp2, Address(str2, cnt2));
10945 __ eor(rscratch2, tmp1, tmp2);
10946 __ cbz(rscratch2, LENGTH_DIFF);
10947 __ b(CAL_DIFFERENCE);
10948
10949 __ bind(DIFF);
10950 __ cmp(tmp1, tmp2);
10951 __ csel(tmp1, tmp1, tmp1h, Assembler::NE);
10952 __ csel(tmp2, tmp2, tmp2h, Assembler::NE);
10953 // reuse rscratch2 register for the result of eor instruction
10954 __ eor(rscratch2, tmp1, tmp2);
10955
10956 __ bind(CAL_DIFFERENCE);
10957 __ rev(rscratch2, rscratch2);
10958 __ clz(rscratch2, rscratch2);
10959 __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
10960 __ lsrv(tmp1, tmp1, rscratch2);
10961 __ lsrv(tmp2, tmp2, rscratch2);
10962 if (isLL) {
10963 __ uxtbw(tmp1, tmp1);
10964 __ uxtbw(tmp2, tmp2);
10965 } else {
10966 __ uxthw(tmp1, tmp1);
10967 __ uxthw(tmp2, tmp2);
10968 }
10969 __ subw(result, tmp1, tmp2);
10970
10971 __ bind(LENGTH_DIFF);
10972 __ ret(lr);
10973
10974 // record the stub entry and end
10975 store_archive_data(stub_id, entry, __ pc());
10976
10977 return entry;
10978 }
10979
// Encoding pair handled by a long-string compare stub. The first letter
// describes str1 and the second str2: L strings are loaded as bytes, U strings
// as halfwords.
enum string_compare_mode { LL, LU, UL, UU };
10986
10987 // The following registers are declared in aarch64.ad
10988 // r0 = result
10989 // r1 = str1
10990 // r2 = cnt1
10991 // r3 = str2
10992 // r4 = cnt2
10993 // r10 = tmp1
10994 // r11 = tmp2
10995 // z0 = ztmp1
10996 // z1 = ztmp2
10997 // p0 = pgtmp1
10998 // p1 = pgtmp2
10999 address generate_compare_long_string_sve(string_compare_mode mode) {
11000 StubId stub_id;
11001 switch (mode) {
11002 case LL: stub_id = StubId::stubgen_compare_long_string_LL_id; break;
11003 case LU: stub_id = StubId::stubgen_compare_long_string_LU_id; break;
11004 case UL: stub_id = StubId::stubgen_compare_long_string_UL_id; break;
11005 case UU: stub_id = StubId::stubgen_compare_long_string_UU_id; break;
11006 default: ShouldNotReachHere();
11007 }
11008 int entry_count = StubInfo::entry_count(stub_id);
11009 assert(entry_count == 1, "sanity check");
11010 address start = load_archive_data(stub_id);
11011 if (start != nullptr) {
11012 return start;
11013 }
11014 __ align(CodeEntryAlignment);
11015 StubCodeMark mark(this, stub_id);
11016 address entry = __ pc();
11017 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
11018 tmp1 = r10, tmp2 = r11;
11019
11020 Label LOOP, DONE, MISMATCH;
11021 Register vec_len = tmp1;
11022 Register idx = tmp2;
11023 // The minimum of the string lengths has been stored in cnt2.
11024 Register cnt = cnt2;
11025 FloatRegister ztmp1 = z0, ztmp2 = z1;
11026 PRegister pgtmp1 = p0, pgtmp2 = p1;
11027
11028 #define LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx) \
11029 switch (mode) { \
11030 case LL: \
11031 __ sve_ld1b(ztmp1, __ B, pgtmp1, Address(str1, idx)); \
11032 __ sve_ld1b(ztmp2, __ B, pgtmp1, Address(str2, idx)); \
11033 break; \
11034 case LU: \
11035 __ sve_ld1b(ztmp1, __ H, pgtmp1, Address(str1, idx)); \
11036 __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
11037 break; \
11038 case UL: \
11039 __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
11040 __ sve_ld1b(ztmp2, __ H, pgtmp1, Address(str2, idx)); \
11041 break; \
11042 case UU: \
11043 __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
11044 __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
11045 break; \
11046 default: \
11047 ShouldNotReachHere(); \
11048 }
11049
11050 __ mov(idx, 0);
11051 __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
11052
11053 if (mode == LL) {
11054 __ sve_cntb(vec_len);
11055 } else {
11056 __ sve_cnth(vec_len);
11057 }
11058
11059 __ sub(rscratch1, cnt, vec_len);
11060
11061 __ bind(LOOP);
11062
11063 // main loop
11064 LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
11065 __ add(idx, idx, vec_len);
11066 // Compare strings.
11067 __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
11068 __ br(__ NE, MISMATCH);
11069 __ cmp(idx, rscratch1);
11070 __ br(__ LT, LOOP);
11071
11072 // post loop, last iteration
11073 __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
11074
11075 LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
11076 __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
11077 __ br(__ EQ, DONE);
11078
11079 __ bind(MISMATCH);
11080
11081 // Crop the vector to find its location.
11082 __ sve_brkb(pgtmp2, pgtmp1, pgtmp2, false /* isMerge */);
11083 // Extract the first different characters of each string.
11084 __ sve_lasta(rscratch1, mode == LL ? __ B : __ H, pgtmp2, ztmp1);
11085 __ sve_lasta(rscratch2, mode == LL ? __ B : __ H, pgtmp2, ztmp2);
11086
11087 // Compute the difference of the first different characters.
11088 __ sub(result, rscratch1, rscratch2);
11089
11090 __ bind(DONE);
11091 __ ret(lr);
11092 #undef LOAD_PAIR
11093
11094 // record the stub entry and end
11095 store_archive_data(stub_id, entry, __ pc());
11096
11097 return entry;
11098 }
11099
11100 void generate_compare_long_strings() {
11101 if (UseSVE == 0) {
11102 StubRoutines::aarch64::_compare_long_string_LL
11103 = generate_compare_long_string_same_encoding(true);
11104 StubRoutines::aarch64::_compare_long_string_UU
11105 = generate_compare_long_string_same_encoding(false);
11106 StubRoutines::aarch64::_compare_long_string_LU
11107 = generate_compare_long_string_different_encoding(true);
11108 StubRoutines::aarch64::_compare_long_string_UL
11109 = generate_compare_long_string_different_encoding(false);
11110 } else {
11111 StubRoutines::aarch64::_compare_long_string_LL
11112 = generate_compare_long_string_sve(LL);
11113 StubRoutines::aarch64::_compare_long_string_UU
11114 = generate_compare_long_string_sve(UU);
11115 StubRoutines::aarch64::_compare_long_string_LU
11116 = generate_compare_long_string_sve(LU);
11117 StubRoutines::aarch64::_compare_long_string_UL
11118 = generate_compare_long_string_sve(UL);
11119 }
11120 }
11121
11122 // R0 = result
11123 // R1 = str2
11124 // R2 = cnt1
11125 // R3 = str1
11126 // R4 = cnt2
11127 // Clobbers: rscratch1, rscratch2, v0, v1, rflags
11128 //
// This generic linear code uses a few additional ideas that make it faster:
// 1) we can safely keep at least the 1st register of the pattern (since length >= 8)
//    in order to skip the initial load (helps on systems with 1 ld pipeline)
// 2) we can use a "fast" single-character search algorithm to look for the
//    first symbol with fewer branches (1 branch per loaded register instead
//    of a branch for each symbol); this is where constants like
//    0x0101...01, 0x00010001...0001, 0x7f7f...7f, 0x7fff7fff...7fff come from
// 3) after loading and analyzing the 1st register of the source string, it can be
//    used to search for every occurrence of the 1st character, saving a few loads
//    compared with a "simpler-but-slower" implementation
// 4) in order to avoid lots of push/pop operations, the code below heavily
//    re-uses/re-initializes/compresses register values, which makes the code
//    larger and a bit less readable; however, most of the extra operations are
//    issued during loads or branches, so the penalty is minimal
11143 address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
11144 StubId stub_id;
11145 if (str1_isL) {
11146 if (str2_isL) {
11147 stub_id = StubId::stubgen_string_indexof_linear_ll_id;
11148 } else {
11149 stub_id = StubId::stubgen_string_indexof_linear_ul_id;
11150 }
11151 } else {
11152 if (str2_isL) {
11153 ShouldNotReachHere();
11154 } else {
11155 stub_id = StubId::stubgen_string_indexof_linear_uu_id;
11156 }
11157 }
11158 int entry_count = StubInfo::entry_count(stub_id);
11159 assert(entry_count == 1, "sanity check");
11160 address start = load_archive_data(stub_id);
11161 if (start != nullptr) {
11162 return start;
11163 }
11164 __ align(CodeEntryAlignment);
11165 StubCodeMark mark(this, stub_id);
11166 address entry = __ pc();
11167
11168 int str1_chr_size = str1_isL ? 1 : 2;
11169 int str2_chr_size = str2_isL ? 1 : 2;
11170 int str1_chr_shift = str1_isL ? 0 : 1;
11171 int str2_chr_shift = str2_isL ? 0 : 1;
11172 bool isL = str1_isL && str2_isL;
11173 // parameters
11174 Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4;
11175 // temporary registers
11176 Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23;
11177 RegSet spilled_regs = RegSet::range(tmp1, tmp4);
11178 // redefinitions
11179 Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3;
11180
11181 __ push(spilled_regs, sp);
11182 Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
11183 L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
11184 L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
11185 L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
11186 L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
11187 L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
11188 // Read whole register from str1. It is safe, because length >=8 here
11189 __ ldr(ch1, Address(str1));
11190 // Read whole register from str2. It is safe, because length >=8 here
11191 __ ldr(ch2, Address(str2));
11192 __ sub(cnt2, cnt2, cnt1);
11193 __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF);
11194 if (str1_isL != str2_isL) {
11195 __ eor(v0, __ T16B, v0, v0);
11196 }
11197 __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
11198 __ mul(first, first, tmp1);
11199 // check if we have less than 1 register to check
11200 __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1);
11201 if (str1_isL != str2_isL) {
11202 __ fmovd(v1, ch1);
11203 }
11204 __ br(__ LE, L_SMALL);
11205 __ eor(ch2, first, ch2);
11206 if (str1_isL != str2_isL) {
11207 __ zip1(v1, __ T16B, v1, v0);
11208 }
11209 __ sub(tmp2, ch2, tmp1);
11210 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
11211 __ bics(tmp2, tmp2, ch2);
11212 if (str1_isL != str2_isL) {
11213 __ fmovd(ch1, v1);
11214 }
11215 __ br(__ NE, L_HAS_ZERO);
11216 __ subs(cnt2, cnt2, wordSize/str2_chr_size);
11217 __ add(result, result, wordSize/str2_chr_size);
11218 __ add(str2, str2, wordSize);
11219 __ br(__ LT, L_POST_LOOP);
11220 __ BIND(L_LOOP);
11221 __ ldr(ch2, Address(str2));
11222 __ eor(ch2, first, ch2);
11223 __ sub(tmp2, ch2, tmp1);
11224 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
11225 __ bics(tmp2, tmp2, ch2);
11226 __ br(__ NE, L_HAS_ZERO);
11227 __ BIND(L_LOOP_PROCEED);
11228 __ subs(cnt2, cnt2, wordSize/str2_chr_size);
11229 __ add(str2, str2, wordSize);
11230 __ add(result, result, wordSize/str2_chr_size);
11231 __ br(__ GE, L_LOOP);
11232 __ BIND(L_POST_LOOP);
11233 __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check
11234 __ br(__ LE, NOMATCH);
11235 __ ldr(ch2, Address(str2));
11236 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
11237 __ eor(ch2, first, ch2);
11238 __ sub(tmp2, ch2, tmp1);
11239 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
11240 __ mov(tmp4, -1); // all bits set
11241 __ b(L_SMALL_PROCEED);
11242 __ align(OptoLoopAlignment);
11243 __ BIND(L_SMALL);
11244 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
11245 __ eor(ch2, first, ch2);
11246 if (str1_isL != str2_isL) {
11247 __ zip1(v1, __ T16B, v1, v0);
11248 }
11249 __ sub(tmp2, ch2, tmp1);
11250 __ mov(tmp4, -1); // all bits set
11251 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
11252 if (str1_isL != str2_isL) {
11253 __ fmovd(ch1, v1); // move converted 4 symbols
11254 }
11255 __ BIND(L_SMALL_PROCEED);
11256 __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits.
11257 __ bic(tmp2, tmp2, ch2);
11258 __ ands(tmp2, tmp2, tmp4); // clear useless bits and check
11259 __ rbit(tmp2, tmp2);
11260 __ br(__ EQ, NOMATCH);
11261 __ BIND(L_SMALL_HAS_ZERO_LOOP);
11262 __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some cpu's
11263 __ cmp(cnt1, u1(wordSize/str2_chr_size));
11264 __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2);
11265 if (str2_isL) { // LL
11266 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
11267 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
11268 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
11269 __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
11270 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
11271 } else {
11272 __ mov(ch2, 0xE); // all bits in byte set except last one
11273 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
11274 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
11275 __ lslv(tmp2, tmp2, tmp4);
11276 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
11277 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
11278 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
11279 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
11280 }
11281 __ cmp(ch1, ch2);
11282 __ mov(tmp4, wordSize/str2_chr_size);
11283 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
11284 __ BIND(L_SMALL_CMP_LOOP);
11285 str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
11286 : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
11287 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
11288 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
11289 __ add(tmp4, tmp4, 1);
11290 __ cmp(tmp4, cnt1);
11291 __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP);
11292 __ cmp(first, ch2);
11293 __ br(__ EQ, L_SMALL_CMP_LOOP);
11294 __ BIND(L_SMALL_CMP_LOOP_NOMATCH);
11295 __ cbz(tmp2, NOMATCH); // no more matches. exit
11296 __ clz(tmp4, tmp2);
11297 __ add(result, result, 1); // advance index
11298 __ add(str2, str2, str2_chr_size); // advance pointer
11299 __ b(L_SMALL_HAS_ZERO_LOOP);
11300 __ align(OptoLoopAlignment);
11301 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP);
11302 __ cmp(first, ch2);
11303 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
11304 __ b(DONE);
11305 __ align(OptoLoopAlignment);
11306 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2);
11307 if (str2_isL) { // LL
11308 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
11309 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
11310 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
11311 __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
11312 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
11313 } else {
11314 __ mov(ch2, 0xE); // all bits in byte set except last one
11315 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
11316 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
11317 __ lslv(tmp2, tmp2, tmp4);
11318 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
11319 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
11320 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
11321 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
11322 }
11323 __ cmp(ch1, ch2);
11324 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
11325 __ b(DONE);
11326 __ align(OptoLoopAlignment);
11327 __ BIND(L_HAS_ZERO);
11328 __ rbit(tmp2, tmp2);
11329 __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPU's
11330 // Now, perform compression of counters(cnt2 and cnt1) into one register.
11331 // It's fine because both counters are 32bit and are not changed in this
11332 // loop. Just restore it on exit. So, cnt1 can be re-used in this loop.
11333 __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2);
11334 __ sub(result, result, 1);
11335 __ BIND(L_HAS_ZERO_LOOP);
11336 __ mov(cnt1, wordSize/str2_chr_size);
11337 __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2);
11338 __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare
11339 if (str2_isL) {
11340 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
11341 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
11342 __ lslv(tmp2, tmp2, tmp4);
11343 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
11344 __ add(tmp4, tmp4, 1);
11345 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
11346 __ lsl(tmp2, tmp2, 1);
11347 __ mov(tmp4, wordSize/str2_chr_size);
11348 } else {
11349 __ mov(ch2, 0xE);
11350 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
11351 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
11352 __ lslv(tmp2, tmp2, tmp4);
11353 __ add(tmp4, tmp4, 1);
11354 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
11355 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
11356 __ lsl(tmp2, tmp2, 1);
11357 __ mov(tmp4, wordSize/str2_chr_size);
11358 __ sub(str2, str2, str2_chr_size);
11359 }
11360 __ cmp(ch1, ch2);
11361 __ mov(tmp4, wordSize/str2_chr_size);
11362 __ br(__ NE, L_CMP_LOOP_NOMATCH);
11363 __ BIND(L_CMP_LOOP);
11364 str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
11365 : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
11366 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
11367 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
11368 __ add(tmp4, tmp4, 1);
11369 __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2);
11370 __ br(__ GE, L_CMP_LOOP_LAST_CMP);
11371 __ cmp(cnt1, ch2);
11372 __ br(__ EQ, L_CMP_LOOP);
11373 __ BIND(L_CMP_LOOP_NOMATCH);
11374 // here we're not matched
11375 __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop
11376 __ clz(tmp4, tmp2);
11377 __ add(str2, str2, str2_chr_size); // advance pointer
11378 __ b(L_HAS_ZERO_LOOP);
11379 __ align(OptoLoopAlignment);
11380 __ BIND(L_CMP_LOOP_LAST_CMP);
11381 __ cmp(cnt1, ch2);
11382 __ br(__ NE, L_CMP_LOOP_NOMATCH);
11383 __ b(DONE);
11384 __ align(OptoLoopAlignment);
11385 __ BIND(L_CMP_LOOP_LAST_CMP2);
11386 if (str2_isL) {
11387 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
11388 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
11389 __ lslv(tmp2, tmp2, tmp4);
11390 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
11391 __ add(tmp4, tmp4, 1);
11392 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
11393 __ lsl(tmp2, tmp2, 1);
11394 } else {
11395 __ mov(ch2, 0xE);
11396 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
11397 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
11398 __ lslv(tmp2, tmp2, tmp4);
11399 __ add(tmp4, tmp4, 1);
11400 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
11401 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
11402 __ lsl(tmp2, tmp2, 1);
11403 __ sub(str2, str2, str2_chr_size);
11404 }
11405 __ cmp(ch1, ch2);
11406 __ br(__ NE, L_CMP_LOOP_NOMATCH);
11407 __ b(DONE);
11408 __ align(OptoLoopAlignment);
11409 __ BIND(L_HAS_ZERO_LOOP_NOMATCH);
11410 // 1) Restore "result" index. Index was wordSize/str2_chr_size * N until
11411 // L_HAS_ZERO block. Byte octet was analyzed in L_HAS_ZERO_LOOP,
11412 // so, result was increased at max by wordSize/str2_chr_size - 1, so,
11413 // respective high bit wasn't changed. L_LOOP_PROCEED will increase
11414 // result by analyzed characters value, so, we can just reset lower bits
11415 // in result here. Clear 2 lower bits for UU/UL and 3 bits for LL
11416 // 2) restore cnt1 and cnt2 values from "compressed" cnt2
11417 // 3) advance str2 value to represent next str2 octet. result & 7/3 is
11418 // index of last analyzed substring inside current octet. So, str2 in at
11419 // respective start address. We need to advance it to next octet
11420 __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed
11421 __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2);
11422 __ bfm(result, zr, 0, 2 - str2_chr_shift);
11423 __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2
11424 __ movw(cnt2, cnt2);
11425 __ b(L_LOOP_PROCEED);
11426 __ align(OptoLoopAlignment);
11427 __ BIND(NOMATCH);
11428 __ mov(result, -1);
11429 __ BIND(DONE);
11430 __ pop(spilled_regs, sp);
11431 __ ret(lr);
11432
11433 // record the stub entry and end
11434 store_archive_data(stub_id, entry, __ pc());
11435
11436 return entry;
11437 }
11438
11439 void generate_string_indexof_stubs() {
11440 StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
11441 StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
11442 StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
11443 }
11444
11445 void inflate_and_store_2_fp_registers(bool generatePrfm,
11446 FloatRegister src1, FloatRegister src2) {
11447 Register dst = r1;
11448 __ zip1(v1, __ T16B, src1, v0);
11449 __ zip2(v2, __ T16B, src1, v0);
11450 if (generatePrfm) {
11451 __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM);
11452 }
11453 __ zip1(v3, __ T16B, src2, v0);
11454 __ zip2(v4, __ T16B, src2, v0);
11455 __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64)));
11456 }
11457
11458 // R0 = src
11459 // R1 = dst
11460 // R2 = len
11461 // R3 = len >> 3
11462 // V0 = 0
11463 // v1 = loaded 8 bytes
11464 // Clobbers: r0, r1, r3, rscratch1, rflags, v0-v6
11465 address generate_large_byte_array_inflate() {
11466 StubId stub_id = StubId::stubgen_large_byte_array_inflate_id;
11467 int entry_count = StubInfo::entry_count(stub_id);
11468 assert(entry_count == 1, "sanity check");
11469 address start = load_archive_data(stub_id);
11470 if (start != nullptr) {
11471 return start;
11472 }
11473 __ align(CodeEntryAlignment);
11474 StubCodeMark mark(this, stub_id);
11475 address entry = __ pc();
11476 Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
11477 Register src = r0, dst = r1, len = r2, octetCounter = r3;
11478 const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4;
11479
11480 // do one more 8-byte read to have address 16-byte aligned in most cases
11481 // also use single store instruction
11482 __ ldrd(v2, __ post(src, 8));
11483 __ sub(octetCounter, octetCounter, 2);
11484 __ zip1(v1, __ T16B, v1, v0);
11485 __ zip1(v2, __ T16B, v2, v0);
11486 __ st1(v1, v2, __ T16B, __ post(dst, 32));
11487 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
11488 __ subs(rscratch1, octetCounter, large_loop_threshold);
11489 __ br(__ LE, LOOP_START);
11490 __ b(LOOP_PRFM_START);
11491 __ bind(LOOP_PRFM);
11492 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
11493 __ bind(LOOP_PRFM_START);
11494 __ prfm(Address(src, SoftwarePrefetchHintDistance));
11495 __ sub(octetCounter, octetCounter, 8);
11496 __ subs(rscratch1, octetCounter, large_loop_threshold);
11497 inflate_and_store_2_fp_registers(true, v3, v4);
11498 inflate_and_store_2_fp_registers(true, v5, v6);
11499 __ br(__ GT, LOOP_PRFM);
11500 __ cmp(octetCounter, (u1)8);
11501 __ br(__ LT, DONE);
11502 __ bind(LOOP);
11503 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
11504 __ bind(LOOP_START);
11505 __ sub(octetCounter, octetCounter, 8);
11506 __ cmp(octetCounter, (u1)8);
11507 inflate_and_store_2_fp_registers(false, v3, v4);
11508 inflate_and_store_2_fp_registers(false, v5, v6);
11509 __ br(__ GE, LOOP);
11510 __ bind(DONE);
11511 __ ret(lr);
11512
11513 // record the stub entry and end
11514 store_archive_data(stub_id, entry, __ pc());
11515
11516 return entry;
11517 }
11518
11519 /**
11520 * Arguments:
11521 *
11522 * Input:
11523 * c_rarg0 - current state address
11524 * c_rarg1 - H key address
11525 * c_rarg2 - data address
11526 * c_rarg3 - number of blocks
11527 *
11528 * Output:
11529 * Updated state at c_rarg0
11530 */
11531 address generate_ghash_processBlocks_small() {
11532 // Bafflingly, GCM uses little-endian for the byte order, but
11533 // big-endian for the bit order. For example, the polynomial 1 is
11534 // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
11535 //
11536 // So, we must either reverse the bytes in each word and do
11537 // everything big-endian or reverse the bits in each byte and do
11538 // it little-endian. On AArch64 it's more idiomatic to reverse
11539 // the bits in each byte (we have an instruction, RBIT, to do
11540 // that) and keep the data in little-endian bit order through the
11541 // calculation, bit-reversing the inputs and outputs.
11542
11543 StubId stub_id = StubId::stubgen_ghash_processBlocks_small_id;
11544 int entry_count = StubInfo::entry_count(stub_id);
11545 assert(entry_count == 1, "sanity check");
11546 address start = load_archive_data(stub_id);
11547 if (start != nullptr) {
11548 return start;
11549 }
11550 __ align(CodeEntryAlignment);
11551 StubCodeMark mark(this, stub_id);
11552 Label polynomial; // local data generated at end of stub
11553 start = __ pc();
11554
11555 Register state = c_rarg0;
11556 Register subkeyH = c_rarg1;
11557 Register data = c_rarg2;
11558 Register blocks = c_rarg3;
11559
11560 FloatRegister vzr = v30;
11561 __ eor(vzr, __ T16B, vzr, vzr); // zero register
11562
11563 __ adr(rscratch1, polynomial);
11564 __ ldrq(v24, rscratch1); // The field polynomial
11565
11566 __ ldrq(v0, Address(state));
11567 __ ldrq(v1, Address(subkeyH));
11568
11569 __ rev64(v0, __ T16B, v0); // Bit-reverse words in state and subkeyH
11570 __ rbit(v0, __ T16B, v0);
11571 __ rev64(v1, __ T16B, v1);
11572 __ rbit(v1, __ T16B, v1);
11573
11574 __ ext(v4, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1
11575 __ eor(v4, __ T16B, v4, v1); // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
11576
11577 {
11578 Label L_ghash_loop;
11579 __ bind(L_ghash_loop);
11580
11581 __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
11582 // reversing each byte
11583 __ rbit(v2, __ T16B, v2);
11584 __ eor(v2, __ T16B, v0, v2); // bit-swapped data ^ bit-swapped state
11585
11586 // Multiply state in v2 by subkey in v1
11587 __ ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
11588 /*a*/v1, /*b*/v2, /*a1_xor_a0*/v4,
11589 /*temps*/v6, v3, /*reuse/clobber b*/v2);
11590 // Reduce v7:v5 by the field polynomial
11591 __ ghash_reduce(/*result*/v0, /*lo*/v5, /*hi*/v7, /*p*/v24, vzr, /*temp*/v3);
11592
11593 __ sub(blocks, blocks, 1);
11594 __ cbnz(blocks, L_ghash_loop);
11595 }
11596
11597 // The bit-reversed result is at this point in v0
11598 __ rev64(v0, __ T16B, v0);
11599 __ rbit(v0, __ T16B, v0);
11600
11601 __ st1(v0, __ T16B, state);
11602 __ ret(lr);
11603
11604 // bind label and generate local polynomial data
11605 __ align(wordSize * 2);
11606 __ bind(polynomial);
11607 __ emit_int64(0x87); // The low-order bits of the field
11608 // polynomial (i.e. p = z^7+z^2+z+1)
11609 // repeated in the low and high parts of a
11610 // 128-bit vector
11611 __ emit_int64(0x87);
11612
11613 // record the stub entry and end
11614 store_archive_data(stub_id, start, __ pc());
11615
11616 return start;
11617 }
11618
11619 address generate_ghash_processBlocks(address small) {
11620 StubId stub_id = StubId::stubgen_ghash_processBlocks_id;
11621 int entry_count = StubInfo::entry_count(stub_id);
11622 assert(entry_count == 1, "sanity check");
11623 address start = load_archive_data(stub_id);
11624 if (start != nullptr) {
11625 return start;
11626 }
11627 Label polynomial; // local data generated after stub
11628 __ align(CodeEntryAlignment);
11629 StubCodeMark mark(this, stub_id);
11630 start = __ pc();
11631
11632 Register state = c_rarg0;
11633 Register subkeyH = c_rarg1;
11634 Register data = c_rarg2;
11635 Register blocks = c_rarg3;
11636
11637 const int unroll = 4;
11638
11639 __ cmp(blocks, (unsigned char)(unroll * 2));
11640 __ br(__ LT, small);
11641
11642 if (unroll > 1) {
11643 // Save state before entering routine
11644 __ sub(sp, sp, 4 * 16);
11645 __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
11646 __ sub(sp, sp, 4 * 16);
11647 __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
11648 }
11649
11650 __ ghash_processBlocks_wide(polynomial, state, subkeyH, data, blocks, unroll);
11651
11652 if (unroll > 1) {
11653 // And restore state
11654 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
11655 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
11656 }
11657
11658 __ cmp(blocks, (unsigned char)0);
11659 __ br(__ GT, small);
11660
11661 __ ret(lr);
11662
11663 // bind label and generate polynomial data
11664 __ align(wordSize * 2);
11665 __ bind(polynomial);
11666 __ emit_int64(0x87); // The low-order bits of the field
11667 // polynomial (i.e. p = z^7+z^2+z+1)
11668 // repeated in the low and high parts of a
11669 // 128-bit vector
11670 __ emit_int64(0x87);
11671
11672 // record the stub entry and end
11673 store_archive_data(stub_id, start, __ pc());
11674
11675 return start;
11676 }
11677
11678 void generate_base64_encode_simdround(Register src, Register dst,
11679 FloatRegister codec, u8 size) {
11680
11681 FloatRegister in0 = v4, in1 = v5, in2 = v6;
11682 FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19;
11683 FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23;
11684
11685 Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
11686
11687 __ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size));
11688
11689 __ ushr(ind0, arrangement, in0, 2);
11690
11691 __ ushr(ind1, arrangement, in1, 2);
11692 __ shl(in0, arrangement, in0, 6);
11693 __ orr(ind1, arrangement, ind1, in0);
11694 __ ushr(ind1, arrangement, ind1, 2);
11695
11696 __ ushr(ind2, arrangement, in2, 4);
11697 __ shl(in1, arrangement, in1, 4);
11698 __ orr(ind2, arrangement, in1, ind2);
11699 __ ushr(ind2, arrangement, ind2, 2);
11700
11701 __ shl(ind3, arrangement, in2, 2);
11702 __ ushr(ind3, arrangement, ind3, 2);
11703
11704 __ tbl(out0, arrangement, codec, 4, ind0);
11705 __ tbl(out1, arrangement, codec, 4, ind1);
11706 __ tbl(out2, arrangement, codec, 4, ind2);
11707 __ tbl(out3, arrangement, codec, 4, ind3);
11708
11709 __ st4(out0, out1, out2, out3, arrangement, __ post(dst, 4 * size));
11710 }
11711
11712 /**
11713 * Arguments:
11714 *
11715 * Input:
11716 * c_rarg0 - src_start
11717 * c_rarg1 - src_offset
11718 * c_rarg2 - src_length
11719 * c_rarg3 - dest_start
11720 * c_rarg4 - dest_offset
11721 * c_rarg5 - isURL
11722 *
11723 */
11724 address generate_base64_encodeBlock() {
11725
11726 StubId stub_id = StubId::stubgen_base64_encodeBlock_id;
11727 int entry_count = StubInfo::entry_count(stub_id);
11728 assert(entry_count == 1, "sanity check");
11729 address start = load_archive_data(stub_id);
11730 if (start != nullptr) {
11731 return start;
11732 }
11733 __ align(CodeEntryAlignment);
11734 StubCodeMark mark(this, stub_id);
11735 start = __ pc();
11736
11737 Register src = c_rarg0; // source array
11738 Register soff = c_rarg1; // source start offset
11739 Register send = c_rarg2; // source end offset
11740 Register dst = c_rarg3; // dest array
11741 Register doff = c_rarg4; // position for writing to dest array
11742 Register isURL = c_rarg5; // Base64 or URL character set
11743
11744 // c_rarg6 and c_rarg7 are free to use as temps
11745 Register codec = c_rarg6;
11746 Register length = c_rarg7;
11747
11748 Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit;
11749
11750 __ add(src, src, soff);
11751 __ add(dst, dst, doff);
11752 __ sub(length, send, soff);
11753
11754 // load the codec base address
11755 __ lea(codec, ExternalAddress((address) _encodeBlock_toBase64));
11756 __ cbz(isURL, ProcessData);
11757 __ lea(codec, ExternalAddress((address) _encodeBlock_toBase64URL));
11758
11759 __ BIND(ProcessData);
11760
11761 // too short to formup a SIMD loop, roll back
11762 __ cmp(length, (u1)24);
11763 __ br(Assembler::LT, Process3B);
11764
11765 __ ld1(v0, v1, v2, v3, __ T16B, Address(codec));
11766
11767 __ BIND(Process48B);
11768 __ cmp(length, (u1)48);
11769 __ br(Assembler::LT, Process24B);
11770 generate_base64_encode_simdround(src, dst, v0, 16);
11771 __ sub(length, length, 48);
11772 __ b(Process48B);
11773
11774 __ BIND(Process24B);
11775 __ cmp(length, (u1)24);
11776 __ br(Assembler::LT, SIMDExit);
11777 generate_base64_encode_simdround(src, dst, v0, 8);
11778 __ sub(length, length, 24);
11779
11780 __ BIND(SIMDExit);
11781 __ cbz(length, Exit);
11782
11783 __ BIND(Process3B);
11784 // 3 src bytes, 24 bits
11785 __ ldrb(r10, __ post(src, 1));
11786 __ ldrb(r11, __ post(src, 1));
11787 __ ldrb(r12, __ post(src, 1));
11788 __ orrw(r11, r11, r10, Assembler::LSL, 8);
11789 __ orrw(r12, r12, r11, Assembler::LSL, 8);
11790 // codec index
11791 __ ubfmw(r15, r12, 18, 23);
11792 __ ubfmw(r14, r12, 12, 17);
11793 __ ubfmw(r13, r12, 6, 11);
11794 __ andw(r12, r12, 63);
11795 // get the code based on the codec
11796 __ ldrb(r15, Address(codec, r15, Address::uxtw(0)));
11797 __ ldrb(r14, Address(codec, r14, Address::uxtw(0)));
11798 __ ldrb(r13, Address(codec, r13, Address::uxtw(0)));
11799 __ ldrb(r12, Address(codec, r12, Address::uxtw(0)));
11800 __ strb(r15, __ post(dst, 1));
11801 __ strb(r14, __ post(dst, 1));
11802 __ strb(r13, __ post(dst, 1));
11803 __ strb(r12, __ post(dst, 1));
11804 __ sub(length, length, 3);
11805 __ cbnz(length, Process3B);
11806
11807 __ BIND(Exit);
11808 __ ret(lr);
11809
11810 // record the stub entry and end
11811 store_archive_data(stub_id, start, __ pc());
11812
11813 return start;
11814 }
11815
11816 void generate_base64_decode_simdround(Register src, Register dst,
11817 FloatRegister codecL, FloatRegister codecH, int size, Label& Exit) {
11818
11819 FloatRegister in0 = v16, in1 = v17, in2 = v18, in3 = v19;
11820 FloatRegister out0 = v20, out1 = v21, out2 = v22;
11821
11822 FloatRegister decL0 = v23, decL1 = v24, decL2 = v25, decL3 = v26;
11823 FloatRegister decH0 = v28, decH1 = v29, decH2 = v30, decH3 = v31;
11824
11825 Label NoIllegalData, ErrorInLowerHalf, StoreLegalData;
11826
11827 Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
11828
11829 __ ld4(in0, in1, in2, in3, arrangement, __ post(src, 4 * size));
11830
11831 // we need unsigned saturating subtract, to make sure all input values
11832 // in range [0, 63] will have 0U value in the higher half lookup
11833 __ uqsubv(decH0, __ T16B, in0, v27);
11834 __ uqsubv(decH1, __ T16B, in1, v27);
11835 __ uqsubv(decH2, __ T16B, in2, v27);
11836 __ uqsubv(decH3, __ T16B, in3, v27);
11837
11838 // lower half lookup
11839 __ tbl(decL0, arrangement, codecL, 4, in0);
11840 __ tbl(decL1, arrangement, codecL, 4, in1);
11841 __ tbl(decL2, arrangement, codecL, 4, in2);
11842 __ tbl(decL3, arrangement, codecL, 4, in3);
11843
11844 // higher half lookup
11845 __ tbx(decH0, arrangement, codecH, 4, decH0);
11846 __ tbx(decH1, arrangement, codecH, 4, decH1);
11847 __ tbx(decH2, arrangement, codecH, 4, decH2);
11848 __ tbx(decH3, arrangement, codecH, 4, decH3);
11849
11850 // combine lower and higher
11851 __ orr(decL0, arrangement, decL0, decH0);
11852 __ orr(decL1, arrangement, decL1, decH1);
11853 __ orr(decL2, arrangement, decL2, decH2);
11854 __ orr(decL3, arrangement, decL3, decH3);
11855
11856 // check illegal inputs, value larger than 63 (maximum of 6 bits)
11857 __ cm(Assembler::HI, decH0, arrangement, decL0, v27);
11858 __ cm(Assembler::HI, decH1, arrangement, decL1, v27);
11859 __ cm(Assembler::HI, decH2, arrangement, decL2, v27);
11860 __ cm(Assembler::HI, decH3, arrangement, decL3, v27);
11861 __ orr(in0, arrangement, decH0, decH1);
11862 __ orr(in1, arrangement, decH2, decH3);
11863 __ orr(in2, arrangement, in0, in1);
11864 __ umaxv(in3, arrangement, in2);
11865 __ umov(rscratch2, in3, __ B, 0);
11866
11867 // get the data to output
11868 __ shl(out0, arrangement, decL0, 2);
11869 __ ushr(out1, arrangement, decL1, 4);
11870 __ orr(out0, arrangement, out0, out1);
11871 __ shl(out1, arrangement, decL1, 4);
11872 __ ushr(out2, arrangement, decL2, 2);
11873 __ orr(out1, arrangement, out1, out2);
11874 __ shl(out2, arrangement, decL2, 6);
11875 __ orr(out2, arrangement, out2, decL3);
11876
11877 __ cbz(rscratch2, NoIllegalData);
11878
11879 // handle illegal input
11880 __ umov(r10, in2, __ D, 0);
11881 if (size == 16) {
11882 __ cbnz(r10, ErrorInLowerHalf);
11883
11884 // illegal input is in higher half, store the lower half now.
11885 __ st3(out0, out1, out2, __ T8B, __ post(dst, 24));
11886
11887 __ umov(r10, in2, __ D, 1);
11888 __ umov(r11, out0, __ D, 1);
11889 __ umov(r12, out1, __ D, 1);
11890 __ umov(r13, out2, __ D, 1);
11891 __ b(StoreLegalData);
11892
11893 __ BIND(ErrorInLowerHalf);
11894 }
11895 __ umov(r11, out0, __ D, 0);
11896 __ umov(r12, out1, __ D, 0);
11897 __ umov(r13, out2, __ D, 0);
11898
11899 __ BIND(StoreLegalData);
11900 __ tbnz(r10, 5, Exit); // 0xff indicates illegal input
11901 __ strb(r11, __ post(dst, 1));
11902 __ strb(r12, __ post(dst, 1));
11903 __ strb(r13, __ post(dst, 1));
11904 __ lsr(r10, r10, 8);
11905 __ lsr(r11, r11, 8);
11906 __ lsr(r12, r12, 8);
11907 __ lsr(r13, r13, 8);
11908 __ b(StoreLegalData);
11909
11910 __ BIND(NoIllegalData);
11911 __ st3(out0, out1, out2, arrangement, __ post(dst, 3 * size));
11912 }
11913
11914
11915 /**
11916 * Arguments:
11917 *
11918 * Input:
11919 * c_rarg0 - src_start
11920 * c_rarg1 - src_offset
11921 * c_rarg2 - src_length
11922 * c_rarg3 - dest_start
11923 * c_rarg4 - dest_offset
11924 * c_rarg5 - isURL
11925 * c_rarg6 - isMIME
11926 *
11927 */
11928 address generate_base64_decodeBlock() {
11929
11930 // The SIMD part of this Base64 decode intrinsic is based on the algorithm outlined
11931 // on http://0x80.pl/articles/base64-simd-neon.html#encoding-quadwords, in section
11932 // titled "Base64 decoding".
11933
11934 StubId stub_id = StubId::stubgen_base64_decodeBlock_id;
11935 int entry_count = StubInfo::entry_count(stub_id);
11936 assert(entry_count == 1, "sanity check");
11937 address start = load_archive_data(stub_id);
11938 if (start != nullptr) {
11939 return start;
11940 }
11941 __ align(CodeEntryAlignment);
11942 StubCodeMark mark(this, stub_id);
11943 start = __ pc();
11944
11945 Register src = c_rarg0; // source array
11946 Register soff = c_rarg1; // source start offset
11947 Register send = c_rarg2; // source end offset
11948 Register dst = c_rarg3; // dest array
11949 Register doff = c_rarg4; // position for writing to dest array
11950 Register isURL = c_rarg5; // Base64 or URL character set
11951 Register isMIME = c_rarg6; // Decoding MIME block - unused in this implementation
11952
11953 Register length = send; // reuse send as length of source data to process
11954
11955 Register simd_codec = c_rarg6;
11956 Register nosimd_codec = c_rarg7;
11957
11958 Label ProcessData, Process64B, Process32B, Process4B, SIMDEnter, SIMDExit, Exit;
11959
11960 __ enter();
11961
11962 __ add(src, src, soff);
11963 __ add(dst, dst, doff);
11964
11965 __ mov(doff, dst);
11966
11967 __ sub(length, send, soff);
11968 __ bfm(length, zr, 0, 1);
11969
11970 __ lea(nosimd_codec, ExternalAddress((address) _decodeBlock_fromBase64ForNoSIMD));
11971 __ cbz(isURL, ProcessData);
11972 __ lea(nosimd_codec, ExternalAddress((address) _decodeBlock_fromBase64URLForNoSIMD));
11973
11974 __ BIND(ProcessData);
11975 __ mov(rscratch1, length);
11976 __ cmp(length, (u1)144); // 144 = 80 + 64
11977 __ br(Assembler::LT, Process4B);
11978
11979 // In the MIME case, the line length cannot be more than 76
11980 // bytes (see RFC 2045). This is too short a block for SIMD
11981 // to be worthwhile, so we use non-SIMD here.
11982 __ movw(rscratch1, 79);
11983
11984 __ BIND(Process4B);
11985 __ ldrw(r14, __ post(src, 4));
11986 __ ubfxw(r10, r14, 0, 8);
11987 __ ubfxw(r11, r14, 8, 8);
11988 __ ubfxw(r12, r14, 16, 8);
11989 __ ubfxw(r13, r14, 24, 8);
11990 // get the de-code
11991 __ ldrb(r10, Address(nosimd_codec, r10, Address::uxtw(0)));
11992 __ ldrb(r11, Address(nosimd_codec, r11, Address::uxtw(0)));
11993 __ ldrb(r12, Address(nosimd_codec, r12, Address::uxtw(0)));
11994 __ ldrb(r13, Address(nosimd_codec, r13, Address::uxtw(0)));
11995 // error detection, 255u indicates an illegal input
11996 __ orrw(r14, r10, r11);
11997 __ orrw(r15, r12, r13);
11998 __ orrw(r14, r14, r15);
11999 __ tbnz(r14, 7, Exit);
12000 // recover the data
12001 __ lslw(r14, r10, 10);
12002 __ bfiw(r14, r11, 4, 6);
12003 __ bfmw(r14, r12, 2, 5);
12004 __ rev16w(r14, r14);
12005 __ bfiw(r13, r12, 6, 2);
12006 __ strh(r14, __ post(dst, 2));
12007 __ strb(r13, __ post(dst, 1));
12008 // non-simd loop
12009 __ subsw(rscratch1, rscratch1, 4);
12010 __ br(Assembler::GT, Process4B);
12011
12012 // if exiting from PreProcess80B, rscratch1 == -1;
12013 // otherwise, rscratch1 == 0.
12014 __ cbzw(rscratch1, Exit);
12015 __ sub(length, length, 80);
12016
12017 __ lea(simd_codec, ExternalAddress((address) _decodeBlock_fromBase64ForSIMD));
12018 __ cbz(isURL, SIMDEnter);
12019 __ lea(simd_codec, ExternalAddress((address) _decodeBlock_fromBase64URLForSIMD));
12020
12021 __ BIND(SIMDEnter);
12022 __ ld1(v0, v1, v2, v3, __ T16B, __ post(simd_codec, 64));
12023 __ ld1(v4, v5, v6, v7, __ T16B, Address(simd_codec));
12024 __ mov(rscratch1, 63);
12025 __ dup(v27, __ T16B, rscratch1);
12026
12027 __ BIND(Process64B);
12028 __ cmp(length, (u1)64);
12029 __ br(Assembler::LT, Process32B);
12030 generate_base64_decode_simdround(src, dst, v0, v4, 16, Exit);
12031 __ sub(length, length, 64);
12032 __ b(Process64B);
12033
12034 __ BIND(Process32B);
12035 __ cmp(length, (u1)32);
12036 __ br(Assembler::LT, SIMDExit);
12037 generate_base64_decode_simdround(src, dst, v0, v4, 8, Exit);
12038 __ sub(length, length, 32);
12039 __ b(Process32B);
12040
12041 __ BIND(SIMDExit);
12042 __ cbz(length, Exit);
12043 __ movw(rscratch1, length);
12044 __ b(Process4B);
12045
12046 __ BIND(Exit);
12047 __ sub(c_rarg0, dst, doff);
12048
12049 __ leave();
12050 __ ret(lr);
12051
12052 // record the stub entry and end
12053 store_archive_data(stub_id, start, __ pc());
12054
12055 return start;
12056 }
12057
12058 // Support for spin waits.
12059 address generate_spin_wait() {
12060 StubId stub_id = StubId::stubgen_spin_wait_id;
12061 int entry_count = StubInfo::entry_count(stub_id);
12062 assert(entry_count == 1, "sanity check");
12063 address start = load_archive_data(stub_id);
12064 if (start != nullptr) {
12065 return start;
12066 }
12067 __ align(CodeEntryAlignment);
12068 StubCodeMark mark(this, stub_id);
12069 start = __ pc();
12070
12071 __ spin_wait();
12072 __ ret(lr);
12073
12074 // record the stub entry and end
12075 store_archive_data(stub_id, start, __ pc());
12076
12077 return start;
12078 }
12079
12080 void generate_lookup_secondary_supers_table_stub() {
12081 StubId stub_id = StubId::stubgen_lookup_secondary_supers_table_id;
12082 GrowableArray<address> entries;
12083 int entry_count = StubInfo::entry_count(stub_id);
12084 assert(entry_count == Klass::SECONDARY_SUPERS_TABLE_SIZE, "sanity check");
12085 address start = load_archive_data(stub_id, &entries);
12086 if (start != nullptr) {
12087 assert(entries.length() == Klass::SECONDARY_SUPERS_TABLE_SIZE - 1,
12088 "unexpected extra entry count %d", entries.length());
12089 StubRoutines::_lookup_secondary_supers_table_stubs[0] = start;
12090 for (int slot = 1; slot < Klass::SECONDARY_SUPERS_TABLE_SIZE; slot++) {
12091 StubRoutines::_lookup_secondary_supers_table_stubs[slot] = entries.at(slot - 1);
12092 }
12093 return;
12094 }
12095
12096 StubCodeMark mark(this, stub_id);
12097
12098 const Register
12099 r_super_klass = r0,
12100 r_array_base = r1,
12101 r_array_length = r2,
12102 r_array_index = r3,
12103 r_sub_klass = r4,
12104 r_bitmap = rscratch2,
12105 result = r5;
12106 const FloatRegister
12107 vtemp = v0;
12108
12109 for (int slot = 0; slot < Klass::SECONDARY_SUPERS_TABLE_SIZE; slot++) {
12110 address next_entry = __ pc();
12111 StubRoutines::_lookup_secondary_supers_table_stubs[slot] = next_entry;
12112 if (slot == 0) {
12113 start = next_entry;
12114 } else {
12115 entries.append(next_entry);
12116 }
12117 Label L_success;
12118 __ enter();
12119 __ lookup_secondary_supers_table_const(r_sub_klass, r_super_klass,
12120 r_array_base, r_array_length, r_array_index,
12121 vtemp, result, slot,
12122 /*stub_is_near*/true);
12123 __ leave();
12124 __ ret(lr);
12125 }
12126 // record the stub entry and end plus all the auxiliary entries
12127 store_archive_data(stub_id, start, __ pc(), &entries);
12128 }
12129
12130 // Slow path implementation for UseSecondarySupersTable.
12131 address generate_lookup_secondary_supers_table_slow_path_stub() {
12132 StubId stub_id = StubId::stubgen_lookup_secondary_supers_table_slow_path_id;
12133 int entry_count = StubInfo::entry_count(stub_id);
12134 assert(entry_count == 1, "sanity check");
12135 address start = load_archive_data(stub_id);
12136 if (start != nullptr) {
12137 return start;
12138 }
12139 StubCodeMark mark(this, stub_id);
12140 start = __ pc();
12141 const Register
12142 r_super_klass = r0, // argument
12143 r_array_base = r1, // argument
12144 temp1 = r2, // temp
12145 r_array_index = r3, // argument
12146 r_bitmap = rscratch2, // argument
12147 result = r5; // argument
12148
12149 __ lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base, r_array_index, r_bitmap, temp1, result);
12150 __ ret(lr);
12151
12152 // record the stub entry and end
12153 store_archive_data(stub_id, start, __ pc());
12154
12155 return start;
12156 }
12157
12158 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
12159
12160 // ARMv8.1 LSE versions of the atomic stubs used by AtomicAccess::PlatformXX.
12161 //
12162 // If LSE is in use, generate LSE versions of all the stubs. The
12163 // non-LSE versions are in atomic_aarch64.S.
12164
12165 // class AtomicStubMark records the entry point of a stub and the
12166 // stub pointer which will point to it. The stub pointer is set to
12167 // the entry point when ~AtomicStubMark() is called, which must be
12168 // after ICache::invalidate_range. This ensures safe publication of
12169 // the generated code.
12170 class AtomicStubMark {
12171 address _entry_point;
12172 aarch64_atomic_stub_t *_stub;
12173 MacroAssembler *_masm;
12174 public:
12175 AtomicStubMark(MacroAssembler *masm, aarch64_atomic_stub_t *stub) {
12176 _masm = masm;
12177 __ align(32);
12178 _entry_point = __ pc();
12179 _stub = stub;
12180 }
12181 ~AtomicStubMark() {
12182 *_stub = (aarch64_atomic_stub_t)_entry_point;
12183 }
12184 };
12185
12186 // NB: For memory_order_conservative we need a trailing membar after
12187 // LSE atomic operations but not a leading membar.
12188 //
12189 // We don't need a leading membar because a clause in the Arm ARM
12190 // says:
12191 //
12192 // Barrier-ordered-before
12193 //
12194 // Barrier instructions order prior Memory effects before subsequent
12195 // Memory effects generated by the same Observer. A read or a write
// RW1 is Barrier-ordered-before a read or a write RW2 from the same
// Observer if and only if RW1 appears in program order before RW2
// and [ ... ] at least one of RW1 and RW2 is generated by an atomic
12199 // instruction with both Acquire and Release semantics.
12200 //
12201 // All the atomic instructions {ldaddal, swapal, casal} have Acquire
12202 // and Release semantics, therefore we don't need a leading
12203 // barrier. However, there is no corresponding Barrier-ordered-after
12204 // relationship, therefore we need a trailing membar to prevent a
12205 // later store or load from being reordered with the store in an
12206 // atomic instruction.
12207 //
12208 // This was checked by using the herd7 consistency model simulator
12209 // (http://diy.inria.fr/) with this test case:
12210 //
12211 // AArch64 LseCas
12212 // { 0:X1=x; 0:X2=y; 1:X1=x; 1:X2=y; }
12213 // P0 | P1;
12214 // LDR W4, [X2] | MOV W3, #0;
12215 // DMB LD | MOV W4, #1;
12216 // LDR W3, [X1] | CASAL W3, W4, [X1];
12217 // | DMB ISH;
12218 // | STR W4, [X2];
12219 // exists
12220 // (0:X3=0 /\ 0:X4=1)
12221 //
12222 // If X3 == 0 && X4 == 1, the store to y in P1 has been reordered
12223 // with the store to x in P1. Without the DMB in P1 this may happen.
12224 //
12225 // At the time of writing we don't know of any AArch64 hardware that
12226 // reorders stores in this way, but the Reference Manual permits it.
12227
12228 void gen_cas_entry(Assembler::operand_size size,
12229 atomic_memory_order order) {
12230 Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1,
12231 exchange_val = c_rarg2;
12232 bool acquire, release;
12233 switch (order) {
12234 case memory_order_relaxed:
12235 acquire = false;
12236 release = false;
12237 break;
12238 case memory_order_release:
12239 acquire = false;
12240 release = true;
12241 break;
12242 default:
12243 acquire = true;
12244 release = true;
12245 break;
12246 }
12247 __ mov(prev, compare_val);
12248 __ lse_cas(prev, exchange_val, ptr, size, acquire, release, /*not_pair*/true);
12249 if (order == memory_order_conservative) {
12250 __ membar(Assembler::StoreStore|Assembler::StoreLoad);
12251 }
12252 if (size == Assembler::xword) {
12253 __ mov(r0, prev);
12254 } else {
12255 __ movw(r0, prev);
12256 }
12257 __ ret(lr);
12258 }
12259
12260 void gen_ldadd_entry(Assembler::operand_size size, atomic_memory_order order) {
12261 Register prev = r2, addr = c_rarg0, incr = c_rarg1;
12262 // If not relaxed, then default to conservative. Relaxed is the only
12263 // case we use enough to be worth specializing.
12264 if (order == memory_order_relaxed) {
12265 __ ldadd(size, incr, prev, addr);
12266 } else {
12267 __ ldaddal(size, incr, prev, addr);
12268 __ membar(Assembler::StoreStore|Assembler::StoreLoad);
12269 }
12270 if (size == Assembler::xword) {
12271 __ mov(r0, prev);
12272 } else {
12273 __ movw(r0, prev);
12274 }
12275 __ ret(lr);
12276 }
12277
12278 void gen_swpal_entry(Assembler::operand_size size) {
12279 Register prev = r2, addr = c_rarg0, incr = c_rarg1;
12280 __ swpal(size, incr, prev, addr);
12281 __ membar(Assembler::StoreStore|Assembler::StoreLoad);
12282 if (size == Assembler::xword) {
12283 __ mov(r0, prev);
12284 } else {
12285 __ movw(r0, prev);
12286 }
12287 __ ret(lr);
12288 }
12289
12290 void generate_atomic_entry_points() {
12291 if (! UseLSE) {
12292 return;
12293 }
12294 StubId stub_id = StubId::stubgen_atomic_entry_points_id;
12295 GrowableArray<address> entries;
12296 int entry_count = StubInfo::entry_count(stub_id);
12297 address start = load_archive_data(stub_id, &entries);
12298 if (start != nullptr) {
12299 assert(entries.length() == entry_count - 1,
12300 "unexpected extra entry count %d", entries.length());
12301 aarch64_atomic_fetch_add_4_impl = (aarch64_atomic_stub_t)start;
12302 int idx = 0;
12303 aarch64_atomic_fetch_add_8_impl = (aarch64_atomic_stub_t)entries.at(idx++);
12304 aarch64_atomic_fetch_add_4_relaxed_impl = (aarch64_atomic_stub_t)entries.at(idx++);
12305 aarch64_atomic_fetch_add_8_relaxed_impl = (aarch64_atomic_stub_t)entries.at(idx++);
12306 aarch64_atomic_xchg_4_impl = (aarch64_atomic_stub_t)entries.at(idx++);
12307 aarch64_atomic_xchg_8_impl = (aarch64_atomic_stub_t)entries.at(idx++);
12308 aarch64_atomic_cmpxchg_1_impl = (aarch64_atomic_stub_t)entries.at(idx++);
12309 aarch64_atomic_cmpxchg_4_impl = (aarch64_atomic_stub_t)entries.at(idx++);
12310 aarch64_atomic_cmpxchg_8_impl = (aarch64_atomic_stub_t)entries.at(idx++);
12311 aarch64_atomic_cmpxchg_1_relaxed_impl = (aarch64_atomic_stub_t)entries.at(idx++);
12312 aarch64_atomic_cmpxchg_4_relaxed_impl = (aarch64_atomic_stub_t)entries.at(idx++);
12313 aarch64_atomic_cmpxchg_8_relaxed_impl = (aarch64_atomic_stub_t)entries.at(idx++);
12314 aarch64_atomic_cmpxchg_4_release_impl = (aarch64_atomic_stub_t)entries.at(idx++);
12315 aarch64_atomic_cmpxchg_8_release_impl = (aarch64_atomic_stub_t)entries.at(idx++);
12316 aarch64_atomic_cmpxchg_4_seq_cst_impl = (aarch64_atomic_stub_t)entries.at(idx++);
12317 aarch64_atomic_cmpxchg_8_seq_cst_impl = (aarch64_atomic_stub_t)entries.at(idx++);
12318 assert(idx == entries.length(), "sanity!");
12319 return;
12320 }
12321
12322 __ align(CodeEntryAlignment);
12323 StubCodeMark mark(this, stub_id);
12324 start = __ pc();
12325 address end;
12326 {
12327 // ADD, memory_order_conservative
12328 AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl);
12329 gen_ldadd_entry(Assembler::word, memory_order_conservative);
12330
12331 AtomicStubMark mark_fetch_add_8(_masm, &aarch64_atomic_fetch_add_8_impl);
12332 gen_ldadd_entry(Assembler::xword, memory_order_conservative);
12333
12334 // ADD, memory_order_relaxed
12335 AtomicStubMark mark_fetch_add_4_relaxed
12336 (_masm, &aarch64_atomic_fetch_add_4_relaxed_impl);
12337 gen_ldadd_entry(MacroAssembler::word, memory_order_relaxed);
12338
12339 AtomicStubMark mark_fetch_add_8_relaxed
12340 (_masm, &aarch64_atomic_fetch_add_8_relaxed_impl);
12341 gen_ldadd_entry(MacroAssembler::xword, memory_order_relaxed);
12342
12343 // XCHG, memory_order_conservative
12344 AtomicStubMark mark_xchg_4(_masm, &aarch64_atomic_xchg_4_impl);
12345 gen_swpal_entry(Assembler::word);
12346
12347 AtomicStubMark mark_xchg_8(_masm, &aarch64_atomic_xchg_8_impl);
12348 gen_swpal_entry(Assembler::xword);
12349
12350 // CAS, memory_order_conservative
12351 AtomicStubMark mark_cmpxchg_1(_masm, &aarch64_atomic_cmpxchg_1_impl);
12352 gen_cas_entry(MacroAssembler::byte, memory_order_conservative);
12353
12354 AtomicStubMark mark_cmpxchg_4(_masm, &aarch64_atomic_cmpxchg_4_impl);
12355 gen_cas_entry(MacroAssembler::word, memory_order_conservative);
12356
12357 AtomicStubMark mark_cmpxchg_8(_masm, &aarch64_atomic_cmpxchg_8_impl);
12358 gen_cas_entry(MacroAssembler::xword, memory_order_conservative);
12359
12360 // CAS, memory_order_relaxed
12361 AtomicStubMark mark_cmpxchg_1_relaxed
12362 (_masm, &aarch64_atomic_cmpxchg_1_relaxed_impl);
12363 gen_cas_entry(MacroAssembler::byte, memory_order_relaxed);
12364
12365 AtomicStubMark mark_cmpxchg_4_relaxed
12366 (_masm, &aarch64_atomic_cmpxchg_4_relaxed_impl);
12367 gen_cas_entry(MacroAssembler::word, memory_order_relaxed);
12368
12369 AtomicStubMark mark_cmpxchg_8_relaxed
12370 (_masm, &aarch64_atomic_cmpxchg_8_relaxed_impl);
12371 gen_cas_entry(MacroAssembler::xword, memory_order_relaxed);
12372
12373 AtomicStubMark mark_cmpxchg_4_release
12374 (_masm, &aarch64_atomic_cmpxchg_4_release_impl);
12375 gen_cas_entry(MacroAssembler::word, memory_order_release);
12376
12377 AtomicStubMark mark_cmpxchg_8_release
12378 (_masm, &aarch64_atomic_cmpxchg_8_release_impl);
12379 gen_cas_entry(MacroAssembler::xword, memory_order_release);
12380
12381 AtomicStubMark mark_cmpxchg_4_seq_cst
12382 (_masm, &aarch64_atomic_cmpxchg_4_seq_cst_impl);
12383 gen_cas_entry(MacroAssembler::word, memory_order_seq_cst);
12384
12385 AtomicStubMark mark_cmpxchg_8_seq_cst
12386 (_masm, &aarch64_atomic_cmpxchg_8_seq_cst_impl);
12387 gen_cas_entry(MacroAssembler::xword, memory_order_seq_cst);
12388
12389 end = __ pc();
12390
12391 ICache::invalidate_range(start, end - start);
12392 // exit block to force update of AtomicStubMark targets
12393 }
12394
12395 assert(start == (address)aarch64_atomic_fetch_add_4_impl,
12396 "atomic stub should be at start of buffer");
12397 // record the stub start and end plus all the entries saved by the
12398 // AtomicStubMark destructor
12399 entries.append((address)aarch64_atomic_fetch_add_8_impl);
12400 entries.append((address)aarch64_atomic_fetch_add_4_relaxed_impl);
12401 entries.append((address)aarch64_atomic_fetch_add_8_relaxed_impl);
12402 entries.append((address)aarch64_atomic_xchg_4_impl);
12403 entries.append((address)aarch64_atomic_xchg_8_impl);
12404 entries.append((address)aarch64_atomic_cmpxchg_1_impl);
12405 entries.append((address)aarch64_atomic_cmpxchg_4_impl);
12406 entries.append((address)aarch64_atomic_cmpxchg_8_impl);
12407 entries.append((address)aarch64_atomic_cmpxchg_1_relaxed_impl);
12408 entries.append((address)aarch64_atomic_cmpxchg_4_relaxed_impl);
12409 entries.append((address)aarch64_atomic_cmpxchg_8_relaxed_impl);
12410 entries.append((address)aarch64_atomic_cmpxchg_4_release_impl);
12411 entries.append((address)aarch64_atomic_cmpxchg_8_release_impl);
12412 entries.append((address)aarch64_atomic_cmpxchg_4_seq_cst_impl);
12413 entries.append((address)aarch64_atomic_cmpxchg_8_seq_cst_impl);
12414
12415 assert(entries.length() == entry_count - 1,
12416 "unexpected extra entry count %d", entries.length());
12417
12418 store_archive_data(stub_id, start, end, &entries);
12419 }
12420 #endif // LINUX
12421
12422 static void save_return_registers(MacroAssembler* masm) {
12423 if (InlineTypeReturnedAsFields) {
12424 masm->push(RegSet::range(r0, r7), sp);
12425 masm->sub(sp, sp, 4 * wordSize);
12426 masm->st1(v0, v1, v2, v3, masm->T1D, Address(sp));
12427 masm->sub(sp, sp, 4 * wordSize);
12428 masm->st1(v4, v5, v6, v7, masm->T1D, Address(sp));
12429 } else {
12430 masm->fmovd(rscratch1, v0);
12431 masm->stp(rscratch1, r0, Address(masm->pre(sp, -2 * wordSize)));
12432 }
12433 }
12434
12435 static void restore_return_registers(MacroAssembler* masm) {
12436 if (InlineTypeReturnedAsFields) {
12437 masm->ld1(v4, v5, v6, v7, masm->T1D, Address(masm->post(sp, 4 * wordSize)));
12438 masm->ld1(v0, v1, v2, v3, masm->T1D, Address(masm->post(sp, 4 * wordSize)));
12439 masm->pop(RegSet::range(r0, r7), sp);
12440 } else {
12441 masm->ldp(rscratch1, r0, Address(masm->post(sp, 2 * wordSize)));
12442 masm->fmovd(v0, rscratch1);
12443 }
12444 }
12445
// Emits the body of a thaw stub for the given thaw kind and returns the
// address at which the emitted code starts.
//
// Outline of the emitted code:
//   1. (return barrier kinds only) reset sp from the thread's cont_entry
//      field;
//   2. call Continuation::prepare_thaw to get the size of the frames to
//      thaw; a result of 0 jumps to the StackOverflowError entry;
//   3. lower sp by that size (16-byte aligned) and call the thaw entry,
//      which returns the sp of the yielding frame;
//   4. switch to that frame and either return into it or, for the
//      exception kind, jump to the exception handler for its return
//      address.
// For return barrier kinds the return registers are saved and restored
// around both runtime calls.
address generate_cont_thaw(Continuation::thaw_kind kind) {
  bool return_barrier = Continuation::is_thaw_return_barrier(kind);
  bool return_barrier_exception = Continuation::is_thaw_return_barrier_exception(kind);

  address start = __ pc();

  if (return_barrier) {
    // Reset sp to the value stored at JavaThread::cont_entry_offset().
    __ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset()));
    __ mov(sp, rscratch1);
  }
  // Check that sp equals the thread's cont_entry value at this point.
  assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");

  if (return_barrier) {
    // preserve possible return value from a method returning to the return barrier
    save_return_registers(_masm);
  }

  // Second argument to prepare_thaw: 1 for return barrier kinds, else 0.
  __ movw(c_rarg1, (return_barrier ? 1 : 0));
  __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), rthread, c_rarg1);
  __ mov(rscratch2, r0); // r0 contains the size of the frames to thaw, 0 if overflow or no more frames

  if (return_barrier) {
    // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
    restore_return_registers(_masm);
  }
  // The save/restore pair must have left sp where it was.
  assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");


  Label thaw_success;
  // rscratch2 contains the size of the frames to thaw, 0 if overflow or no more frames
  __ cbnz(rscratch2, thaw_success);
  // Size 0: jump (not call) to the StackOverflowError entry.
  __ lea(rscratch1, RuntimeAddress(SharedRuntime::throw_StackOverflowError_entry()));
  __ br(rscratch1);
  __ bind(thaw_success);

  // make room for the thawed frames
  __ sub(rscratch1, sp, rscratch2);
  __ andr(rscratch1, rscratch1, -16); // align
  __ mov(sp, rscratch1);

  if (return_barrier) {
    // save original return value -- again
    save_return_registers(_masm);
  }

  // If we want, we can templatize thaw by kind, and have three different entries
  __ movw(c_rarg1, (uint32_t)kind);

  __ call_VM_leaf(Continuation::thaw_entry(), rthread, c_rarg1);
  __ mov(rscratch2, r0); // r0 is the sp of the yielding frame

  if (return_barrier) {
    // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
    restore_return_registers(_masm);
  } else {
    __ mov(r0, zr); // return 0 (success) from doYield
  }

  // we're now on the yield frame (which is in an address above us b/c rsp has been pushed down)
  __ sub(sp, rscratch2, 2*wordSize); // now pointing to rfp spill
  __ mov(rfp, sp);

  if (return_barrier_exception) {
    __ ldr(c_rarg1, Address(rfp, wordSize)); // return address
    __ authenticate_return_address(c_rarg1);
    __ verify_oop(r0);
    // save return value containing the exception oop in callee-saved R19
    __ mov(r19, r0);

    // Look up the handler for the return address loaded above; the
    // result comes back in r0.
    __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), rthread, c_rarg1);

    // Reinitialize the ptrue predicate register, in case the external runtime call clobbers ptrue reg, as we may return to SVE compiled code.
    // NOTE(review): the reinitialization below is commented out in the
    // code as it stands -- confirm that this is intentional.
    // __ reinitialize_ptrue();

    // see OptoRuntime::generate_exception_blob: r0 -- exception oop, r3 -- exception pc

    __ mov(r1, r0); // the exception handler
    __ mov(r0, r19); // restore return value containing the exception oop
    __ verify_oop(r0);

    __ leave();
    __ mov(r3, lr);
    __ br(r1); // the exception handler
  } else {
    // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame
    __ leave();
    __ ret(lr);
  }

  return start;
}
12537
12538 address generate_cont_thaw() {
12539 if (!Continuations::enabled()) return nullptr;
12540
12541 StubId stub_id = StubId::stubgen_cont_thaw_id;
12542 int entry_count = StubInfo::entry_count(stub_id);
12543 assert(entry_count == 1, "sanity check");
12544 address start = load_archive_data(stub_id);
12545 if (start != nullptr) {
12546 return start;
12547 }
12548 StubCodeMark mark(this, stub_id);
12549 start = __ pc();
12550 generate_cont_thaw(Continuation::thaw_top);
12551
12552 // record the stub start and end
12553 store_archive_data(stub_id, start, __ pc());
12554
12555 return start;
12556 }
12557
12558 address generate_cont_returnBarrier() {
12559 if (!Continuations::enabled()) return nullptr;
12560
12561 // TODO: will probably need multiple return barriers depending on return type
12562 StubId stub_id = StubId::stubgen_cont_returnBarrier_id;
12563 int entry_count = StubInfo::entry_count(stub_id);
12564 assert(entry_count == 1, "sanity check");
12565 address start = load_archive_data(stub_id);
12566 if (start != nullptr) {
12567 return start;
12568 }
12569 StubCodeMark mark(this, stub_id);
12570 start = __ pc();
12571
12572 generate_cont_thaw(Continuation::thaw_return_barrier);
12573
12574 // record the stub start and end
12575 store_archive_data(stub_id, start, __ pc());
12576
12577 return start;
12578 }
12579
12580 address generate_cont_returnBarrier_exception() {
12581 if (!Continuations::enabled()) return nullptr;
12582
12583 StubId stub_id = StubId::stubgen_cont_returnBarrierExc_id;
12584 int entry_count = StubInfo::entry_count(stub_id);
12585 assert(entry_count == 1, "sanity check");
12586 address start = load_archive_data(stub_id);
12587 if (start != nullptr) {
12588 return start;
12589 }
12590 StubCodeMark mark(this, stub_id);
12591 start = __ pc();
12592
12593 generate_cont_thaw(Continuation::thaw_return_barrier_exception);
12594
12595 // record the stub start and end
12596 store_archive_data(stub_id, start, __ pc());
12597
12598 return start;
12599 }
12600
// Generates the continuation preempt stub and returns its start address,
// or nullptr when continuations are disabled.  A non-null result from
// load_archive_data() is returned without generating anything.
//
// The emitted code resets the last Java frame, moves sp to the thread's
// cont_entry value and then takes one of two paths depending on the
// thread's preemption_cancelled byte:
//   - zero:     clean up the enterSpecial frame and return;
//   - non-zero: clear the byte and jump to the address loaded from
//               ContinuationEntry::thaw_call_pc_address().
address generate_cont_preempt_stub() {
  if (!Continuations::enabled()) return nullptr;
  StubId stub_id = StubId::stubgen_cont_preempt_id;
  int entry_count = StubInfo::entry_count(stub_id);
  assert(entry_count == 1, "sanity check");
  address start = load_archive_data(stub_id);
  if (start != nullptr) {
    return start;
  }
  StubCodeMark mark(this, stub_id);
  start = __ pc();

  __ reset_last_Java_frame(true);

  // Set sp to enterSpecial frame, i.e. remove all frames copied into the heap.
  __ ldr(rscratch2, Address(rthread, JavaThread::cont_entry_offset()));
  __ mov(sp, rscratch2);

  Label preemption_cancelled;
  __ ldrb(rscratch1, Address(rthread, JavaThread::preemption_cancelled_offset()));
  __ cbnz(rscratch1, preemption_cancelled);

  // Remove enterSpecial frame from the stack and return to Continuation.run() to unmount.
  SharedRuntime::continuation_enter_cleanup(_masm);
  __ leave();
  __ ret(lr);

  // We acquired the monitor after freezing the frames so call thaw to continue execution.
  __ bind(preemption_cancelled);
  // Clear the flag before continuing.
  __ strb(zr, Address(rthread, JavaThread::preemption_cancelled_offset()));
  // rfp = sp + ContinuationEntry::size()
  __ lea(rfp, Address(sp, checked_cast<int32_t>(ContinuationEntry::size())));
  // The jump target is read from memory when the stub runs, not baked
  // into the stub.
  __ lea(rscratch1, ExternalAddress(ContinuationEntry::thaw_call_pc_address()));
  __ ldr(rscratch1, Address(rscratch1));
  __ br(rscratch1);

  // record the stub start and end
  store_archive_data(stub_id, start, __ pc());

  return start;
}
12641
12642 // In sun.security.util.math.intpoly.IntegerPolynomial1305, integers
12643 // are represented as long[5], with BITS_PER_LIMB = 26.
12644 // Pack five 26-bit limbs into three 64-bit registers.
// Emits code that reads five 64-bit words at src, each holding one
// 26-bit limb, and packs them into dest0 (low), dest1 and, if it is a
// valid register, dest2 (high).
//
// If dest2 is not a valid register, the bits that would have gone into
// it are not stored anywhere; ASSERT builds emit a check that stops if
// they are non-zero.
//
// Clobbers rscratch1 and rscratch2.
void pack_26(Register dest0, Register dest1, Register dest2, Register src) {
  // dest0 = limb0 | limb1 << 26 | (low 12 bits of limb2) << 52
  __ ldp(dest0, rscratch1, Address(src, 0)); // 26 bits
  __ add(dest0, dest0, rscratch1, Assembler::LSL, 26); // 26 bits
  __ ldp(rscratch1, rscratch2, Address(src, 2 * sizeof (jlong)));
  __ add(dest0, dest0, rscratch1, Assembler::LSL, 52); // 12 bits

  // dest1 = limb2 >> 12 | limb3 << 14 | (low 24 bits of limb4) << 40
  __ add(dest1, zr, rscratch1, Assembler::LSR, 12); // 14 bits
  __ add(dest1, dest1, rscratch2, Assembler::LSL, 14); // 26 bits
  __ ldr(rscratch1, Address(src, 4 * sizeof (jlong)));
  __ add(dest1, dest1, rscratch1, Assembler::LSL, 40); // 24 bits

  if (dest2->is_valid()) {
    // dest2 = limb4 >> 24
    __ add(dest2, zr, rscratch1, Assembler::LSR, 24); // 2 bits
  } else {
#ifdef ASSERT
    // No register was supplied for the top bits, so they must be zero.
    Label OK;
    __ cmp(zr, rscratch1, Assembler::LSR, 24); // 2 bits
    __ br(__ EQ, OK);
    __ stop("high bits of Poly1305 integer should be zero");
    __ should_not_reach_here();
    __ bind(OK);
#endif
  }
}
12669
12670 // As above, but return only a 128-bit integer, packed into two
12671 // 64-bit registers.
12672 void pack_26(Register dest0, Register dest1, Register src) {
12673 pack_26(dest0, dest1, noreg, src);
12674 }
12675
12676 // Multiply and multiply-accumulate unsigned 64-bit registers.
// Emits prod_hi:prod_lo = n * m as a full-width product of two 64-bit
// registers: mul supplies the low half, umulh the high half.  prod_lo is
// written first, so it should not alias n or m.
void wide_mul(Register prod_lo, Register prod_hi, Register n, Register m) {
  __ mul(prod_lo, n, m);
  __ umulh(prod_hi, n, m);
}
12681 void wide_madd(Register sum_lo, Register sum_hi, Register n, Register m) {
12682 wide_mul(rscratch1, rscratch2, n, m);
12683 __ adds(sum_lo, sum_lo, rscratch1);
12684 __ adc(sum_hi, sum_hi, rscratch2);
12685 }
12686
12687 // Poly1305, RFC 7539
12688
12689 // See https://loup-vaillant.fr/tutorials/poly1305-design for a
12690 // description of the tricks used to simplify and accelerate this
12691 // computation.
12692
12693 address generate_poly1305_processBlocks() {
12694 StubId stub_id = StubId::stubgen_poly1305_processBlocks_id;
12695 int entry_count = StubInfo::entry_count(stub_id);
12696 assert(entry_count == 1, "sanity check");
12697 address start = load_archive_data(stub_id);
12698 if (start != nullptr) {
12699 return start;
12700 }
12701 __ align(CodeEntryAlignment);
12702 StubCodeMark mark(this, stub_id);
12703 start = __ pc();
12704 Label here;
12705 __ enter();
12706 RegSet callee_saved = RegSet::range(r19, r28);
12707 __ push(callee_saved, sp);
12708
12709 RegSetIterator<Register> regs = (RegSet::range(c_rarg0, r28) - r18_tls - rscratch1 - rscratch2).begin();
12710
12711 // Arguments
12712 const Register input_start = *regs, length = *++regs, acc_start = *++regs, r_start = *++regs;
12713
12714 // R_n is the 128-bit randomly-generated key, packed into two
12715 // registers. The caller passes this key to us as long[5], with
12716 // BITS_PER_LIMB = 26.
12717 const Register R_0 = *++regs, R_1 = *++regs;
12718 pack_26(R_0, R_1, r_start);
12719
12720 // RR_n is (R_n >> 2) * 5
12721 const Register RR_0 = *++regs, RR_1 = *++regs;
12722 __ lsr(RR_0, R_0, 2);
12723 __ add(RR_0, RR_0, RR_0, Assembler::LSL, 2);
12724 __ lsr(RR_1, R_1, 2);
12725 __ add(RR_1, RR_1, RR_1, Assembler::LSL, 2);
12726
12727 // U_n is the current checksum
12728 const Register U_0 = *++regs, U_1 = *++regs, U_2 = *++regs;
12729 pack_26(U_0, U_1, U_2, acc_start);
12730
12731 static constexpr int BLOCK_LENGTH = 16;
12732 Label DONE, LOOP;
12733
12734 __ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
12735 __ br(Assembler::LT, DONE); {
12736 __ bind(LOOP);
12737
12738 // S_n is to be the sum of U_n and the next block of data
12739 const Register S_0 = *++regs, S_1 = *++regs, S_2 = *++regs;
12740 __ ldp(S_0, S_1, __ post(input_start, 2 * wordSize));
12741 __ adds(S_0, U_0, S_0);
12742 __ adcs(S_1, U_1, S_1);
12743 __ adc(S_2, U_2, zr);
12744 __ add(S_2, S_2, 1);
12745
12746 const Register U_0HI = *++regs, U_1HI = *++regs;
12747
12748 // NB: this logic depends on some of the special properties of
12749 // Poly1305 keys. In particular, because we know that the top
12750 // four bits of R_0 and R_1 are zero, we can add together
12751 // partial products without any risk of needing to propagate a
12752 // carry out.
12753 wide_mul(U_0, U_0HI, S_0, R_0); wide_madd(U_0, U_0HI, S_1, RR_1); wide_madd(U_0, U_0HI, S_2, RR_0);
12754 wide_mul(U_1, U_1HI, S_0, R_1); wide_madd(U_1, U_1HI, S_1, R_0); wide_madd(U_1, U_1HI, S_2, RR_1);
12755 __ andr(U_2, R_0, 3);
12756 __ mul(U_2, S_2, U_2);
12757
12758 // Recycle registers S_0, S_1, S_2
12759 regs = (regs.remaining() + S_0 + S_1 + S_2).begin();
12760
12761 // Partial reduction mod 2**130 - 5
12762 __ adds(U_1, U_0HI, U_1);
12763 __ adc(U_2, U_1HI, U_2);
12764 // Sum now in U_2:U_1:U_0.
12765 // Dead: U_0HI, U_1HI.
12766 regs = (regs.remaining() + U_0HI + U_1HI).begin();
12767
12768 // U_2:U_1:U_0 += (U_2 >> 2) * 5 in two steps
12769
12770 // First, U_2:U_1:U_0 += (U_2 >> 2)
12771 __ lsr(rscratch1, U_2, 2);
12772 __ andr(U_2, U_2, (u8)3);
12773 __ adds(U_0, U_0, rscratch1);
12774 __ adcs(U_1, U_1, zr);
12775 __ adc(U_2, U_2, zr);
12776 // Second, U_2:U_1:U_0 += (U_2 >> 2) << 2
12777 __ adds(U_0, U_0, rscratch1, Assembler::LSL, 2);
12778 __ adcs(U_1, U_1, zr);
12779 __ adc(U_2, U_2, zr);
12780
12781 __ sub(length, length, checked_cast<u1>(BLOCK_LENGTH));
12782 __ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
12783 __ br(~ Assembler::LT, LOOP);
12784 }
12785
12786 // Further reduce modulo 2^130 - 5
12787 __ lsr(rscratch1, U_2, 2);
12788 __ add(rscratch1, rscratch1, rscratch1, Assembler::LSL, 2); // rscratch1 = U_2 * 5
12789 __ adds(U_0, U_0, rscratch1); // U_0 += U_2 * 5
12790 __ adcs(U_1, U_1, zr);
12791 __ andr(U_2, U_2, (u1)3);
12792 __ adc(U_2, U_2, zr);
12793
12794 // Unpack the sum into five 26-bit limbs and write to memory.
12795 __ ubfiz(rscratch1, U_0, 0, 26);
12796 __ ubfx(rscratch2, U_0, 26, 26);
12797 __ stp(rscratch1, rscratch2, Address(acc_start));
12798 __ ubfx(rscratch1, U_0, 52, 12);
12799 __ bfi(rscratch1, U_1, 12, 14);
12800 __ ubfx(rscratch2, U_1, 14, 26);
12801 __ stp(rscratch1, rscratch2, Address(acc_start, 2 * sizeof (jlong)));
12802 __ ubfx(rscratch1, U_1, 40, 24);
12803 __ bfi(rscratch1, U_2, 24, 3);
12804 __ str(rscratch1, Address(acc_start, 4 * sizeof (jlong)));
12805
12806 __ bind(DONE);
12807 __ pop(callee_saved, sp);
12808 __ leave();
12809 __ ret(lr);
12810
12811 // record the stub start and end
12812 store_archive_data(stub_id, start, __ pc());
12813
12814 return start;
12815 }
12816
12817 // exception handler for upcall stubs
12818 address generate_upcall_stub_exception_handler() {
12819 StubId stub_id = StubId::stubgen_upcall_stub_exception_handler_id;
12820 int entry_count = StubInfo::entry_count(stub_id);
12821 assert(entry_count == 1, "sanity check");
12822 address start = load_archive_data(stub_id);
12823 if (start != nullptr) {
12824 return start;
12825 }
12826 StubCodeMark mark(this, stub_id);
12827 start = __ pc();
12828
12829 // Native caller has no idea how to handle exceptions,
12830 // so we just crash here. Up to callee to catch exceptions.
12831 __ verify_oop(r0);
12832 __ movptr(rscratch1, CAST_FROM_FN_PTR(uint64_t, UpcallLinker::handle_uncaught_exception));
12833 __ blr(rscratch1);
12834 __ should_not_reach_here();
12835
12836 // record the stub start and end
12837 store_archive_data(stub_id, start, __ pc());
12838
12839 return start;
12840 }
12841
12842 // load Method* target of MethodHandle
12843 // j_rarg0 = jobject receiver
12844 // rmethod = result
12845 address generate_upcall_stub_load_target() {
12846 StubId stub_id = StubId::stubgen_upcall_stub_load_target_id;
12847 int entry_count = StubInfo::entry_count(stub_id);
12848 assert(entry_count == 1, "sanity check");
12849 address start = load_archive_data(stub_id);
12850 if (start != nullptr) {
12851 return start;
12852 }
12853 StubCodeMark mark(this, stub_id);
12854 start = __ pc();
12855
12856 __ resolve_global_jobject(j_rarg0, rscratch1, rscratch2);
12857 // Load target method from receiver
12858 __ load_heap_oop(rmethod, Address(j_rarg0, java_lang_invoke_MethodHandle::form_offset()), rscratch1, rscratch2);
12859 __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_LambdaForm::vmentry_offset()), rscratch1, rscratch2);
12860 __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_MemberName::method_offset()), rscratch1, rscratch2);
12861 __ access_load_at(T_ADDRESS, IN_HEAP, rmethod,
12862 Address(rmethod, java_lang_invoke_ResolvedMethodName::vmtarget_offset()),
12863 noreg, noreg);
12864 __ str(rmethod, Address(rthread, JavaThread::callee_target_offset())); // just in case callee is deoptimized
12865
12866 __ ret(lr);
12867
12868 // record the stub start and end
12869 store_archive_data(stub_id, start, __ pc());
12870
12871 return start;
12872 }
12873
12874 #undef __
12875 #define __ masm->
12876
12877 class MontgomeryMultiplyGenerator : public MacroAssembler {
12878
12879 Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
12880 Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
12881
12882 RegSet _toSave;
12883 bool _squaring;
12884
12885 public:
// Builds a generator that emits into the code buffer of `as` and assigns
// a register to every role used by the multiply/square code.
//
// Registers are handed out in order from r0..r26, skipping r18_tls, so
// the order of the assignments below determines the allocation.  When
// `squaring` is true Pb_base aliases Pa_base and consumes no register.
MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
  : MacroAssembler(as->code()), _squaring(squaring) {

  // Register allocation

  RegSetIterator<Register> regs = (RegSet::range(r0, r26) - r18_tls).begin();
  Pa_base = *regs; // Argument registers
  if (squaring)
    Pb_base = Pa_base;
  else
    Pb_base = *++regs;
  Pn_base = *++regs;
  Rlen= *++regs;
  inv = *++regs;
  Pm_base = *++regs;

  // Working registers:
  Ra = *++regs; // The current digit of a, b, n, and m.
  Rb = *++regs;
  Rm = *++regs;
  Rn = *++regs;

  Pa = *++regs; // Pointers to the current/next digit of a, b, n, and m.
  Pb = *++regs;
  Pm = *++regs;
  Pn = *++regs;

  t0 = *++regs; // Three registers which form a
  t1 = *++regs; // triple-precision accumulator.
  t2 = *++regs;

  Ri = *++regs; // Inner and outer loop indexes.
  Rj = *++regs;

  Rhi_ab = *++regs; // Product registers: low and high parts
  Rlo_ab = *++regs; // of a*b and m*n.
  Rhi_mn = *++regs;
  Rlo_mn = *++regs;

  // r19 and up are callee-saved.
  // Save from r19 through the last register allocated above, plus
  // Pm_base.
  _toSave = RegSet::range(r19, *regs) + Pm_base;
}
12928
12929 private:
12930 void save_regs() {
12931 push(_toSave, sp);
12932 }
12933
12934 void restore_regs() {
12935 pop(_toSave, sp);
12936 }
12937
12938 template <typename T>
12939 void unroll_2(Register count, T block) {
12940 Label loop, end, odd;
12941 tbnz(count, 0, odd);
12942 cbz(count, end);
12943 align(16);
12944 bind(loop);
12945 (this->*block)();
12946 bind(odd);
12947 (this->*block)();
12948 subs(count, count, 2);
12949 br(Assembler::GT, loop);
12950 bind(end);
12951 }
12952
12953 template <typename T>
12954 void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
12955 Label loop, end, odd;
12956 tbnz(count, 0, odd);
12957 cbz(count, end);
12958 align(16);
12959 bind(loop);
12960 (this->*block)(d, s, tmp);
12961 bind(odd);
12962 (this->*block)(d, s, tmp);
12963 subs(count, count, 2);
12964 br(Assembler::GT, loop);
12965 bind(end);
12966 }
12967
// Prologue: points Pa/Pm at the start of a and m and Pb/Pn at element i
// of b and n, loads the digit at each of those positions, and clears the
// pending m*n product.
void pre1(RegisterOrConstant i) {
  block_comment("pre1");
  // Pa = Pa_base;
  // Pb = Pb_base + i;
  // Pm = Pm_base;
  // Pn = Pn_base + i;
  // Ra = *Pa;
  // Rb = *Pb;
  // Rm = *Pm;
  // Rn = *Pn;
  // The loads are emitted ahead of the pointer set-up.
  ldr(Ra, Address(Pa_base));
  ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
  ldr(Rm, Address(Pm_base));
  ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
  lea(Pa, Address(Pa_base));
  lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
  lea(Pm, Address(Pm_base));
  lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));

  // Zero the m*n result.
  mov(Rhi_mn, zr);
  mov(Rlo_mn, zr);
}
12991
12992 // The core multiply-accumulate step of a Montgomery
12993 // multiplication. The idea is to schedule operations as a
12994 // pipeline so that instructions with long latencies (loads and
12995 // multiplies) have time to complete before their results are
12996 // used. This most benefits in-order implementations of the
12997 // architecture but out-of-order ones also benefit.
// One pipelined multiply-accumulate step.  The a*b product computed here
// is accumulated at the end of this step, while the m*n product computed
// here is left pending in Rhi_mn:Rlo_mn for the next step (or post1 /
// post2) to accumulate.
void step() {
  block_comment("step");
  // MACC(Ra, Rb, t0, t1, t2);
  // Ra = *++Pa;
  // Rb = *--Pb;
  umulh(Rhi_ab, Ra, Rb);
  mul(Rlo_ab, Ra, Rb);
  ldr(Ra, pre(Pa, wordSize));
  ldr(Rb, pre(Pb, -wordSize));
  acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
  // previous iteration.
  // MACC(Rm, Rn, t0, t1, t2);
  // Rm = *++Pm;
  // Rn = *--Pn;
  umulh(Rhi_mn, Rm, Rn);
  mul(Rlo_mn, Rm, Rn);
  ldr(Rm, pre(Pm, wordSize));
  ldr(Rn, pre(Pn, -wordSize));
  acc(Rhi_ab, Rlo_ab, t0, t1, t2);
}
13018
// Epilogue matching pre1: accumulates the last a*b product and the
// pending m*n product, computes and stores the next digit of m
// (t0 * inv), adds in the product of that digit with Rn and then shifts
// the accumulator down one digit (t0 = t1; t1 = t2; t2 = 0).
void post1() {
  block_comment("post1");

  // MACC(Ra, Rb, t0, t1, t2);
  // Ra = *++Pa;
  // Rb = *--Pb;
  umulh(Rhi_ab, Ra, Rb);
  mul(Rlo_ab, Ra, Rb);
  acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n
  acc(Rhi_ab, Rlo_ab, t0, t1, t2);

  // *Pm = Rm = t0 * inv;
  mul(Rm, t0, inv);
  str(Rm, Address(Pm));

  // MACC(Rm, Rn, t0, t1, t2);
  // t0 = t1; t1 = t2; t2 = 0;
  umulh(Rhi_mn, Rm, Rn);

#ifndef PRODUCT
  // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
  {
    mul(Rlo_mn, Rm, Rn);
    add(Rlo_mn, t0, Rlo_mn);
    Label ok;
    cbz(Rlo_mn, ok); {
      stop("broken Montgomery multiply");
    } bind(ok);
  }
#endif
  // We have very carefully set things up so that
  // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
  // the lower half of Rm * Rn because we know the result already:
  // it must be -t0. t0 + (-t0) must generate a carry iff
  // t0 != 0. So, rather than do a mul and an adds we just set
  // the carry flag iff t0 is nonzero.
  //
  // mul(Rlo_mn, Rm, Rn);
  // adds(zr, t0, Rlo_mn);
  subs(zr, t0, 1); // Set carry iff t0 is nonzero
  adcs(t0, t1, Rhi_mn);
  adc(t1, t2, zr);
  mov(t2, zr);
}
13063
// Prologue for the second phase: positions the four digit pointers from
// i and len, pre-increments/decrements each of them while loading the
// first digits, and clears the pending m*n product.  Rj is left holding
// i-len.
void pre2(RegisterOrConstant i, RegisterOrConstant len) {
  block_comment("pre2");
  // Pa = Pa_base + i-len;
  // Pb = Pb_base + len;
  // Pm = Pm_base + i-len;
  // Pn = Pn_base + len;

  if (i.is_register()) {
    sub(Rj, i.as_register(), len);
  } else {
    // Constant i: materialize it in Rj first, then subtract len.
    mov(Rj, i.as_constant());
    sub(Rj, Rj, len);
  }
  // Rj == i-len

  lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
  lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
  lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
  lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));

  // Ra = *++Pa;
  // Rb = *--Pb;
  // Rm = *++Pm;
  // Rn = *--Pn;
  ldr(Ra, pre(Pa, wordSize));
  ldr(Rb, pre(Pb, -wordSize));
  ldr(Rm, pre(Pm, wordSize));
  ldr(Rn, pre(Pn, -wordSize));

  // Zero the m*n result.
  mov(Rhi_mn, zr);
  mov(Rlo_mn, zr);
}
13096
// Epilogue matching pre2: adds the pending m*n product into the
// accumulator, stores the resulting low digit to Pm_base[i-len] and
// shifts the accumulator down one digit.  Rj is left holding i-len.
void post2(RegisterOrConstant i, RegisterOrConstant len) {
  block_comment("post2");
  if (i.is_constant()) {
    // NOTE(review): this branch calls len.as_constant(), so it assumes
    // len is also a constant whenever i is -- confirm against callers.
    mov(Rj, i.as_constant()-len.as_constant());
  } else {
    sub(Rj, i.as_register(), len);
  }

  adds(t0, t0, Rlo_mn); // The pending m*n, low part

  // As soon as we know the least significant digit of our result,
  // store it.
  // Pm_base[i-len] = t0;
  str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));

  // t0 = t1; t1 = t2; t2 = 0;
  adcs(t0, t1, Rhi_mn); // The pending m*n, high part
  adc(t1, t2, zr);
  mov(t2, zr);
}
13117
13118 // A carry in t0 after Montgomery multiplication means that we
13119 // should subtract multiples of n from our result in m. We'll
13120 // keep doing that until there is no carry.
// Emits the final correction loop: while t0 is non-zero, subtract the
// len-digit number at Pn_base from the one at Pm_base in place and
// propagate the resulting borrow into t0.
// Clobbers t1, t2, Rm and Rn.
void normalize(RegisterOrConstant len) {
  block_comment("normalize");
  // while (t0)
  // t0 = sub(Pm_base, Pn_base, t0, len);
  Label loop, post, again;
  Register cnt = t1, i = t2; // Re-use registers; we're done with them now
  cbz(t0, post); {
    bind(again); {
      mov(i, zr);
      mov(cnt, len);
      ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
      ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
      subs(zr, zr, zr); // set carry flag, i.e. no borrow
      align(16);
      bind(loop); {
        // The loop control below uses add/sub/cbnz rather than their
        // flag-setting forms, so the borrow produced by sbcs carries
        // over from one digit to the next.
        sbcs(Rm, Rm, Rn);
        str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
        add(i, i, 1);
        // The next digits are loaded before the counter is tested, so
        // the final iteration also reads element [len] of both arrays.
        ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
        ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
        sub(cnt, cnt, 1);
      } cbnz(cnt, loop);
      // Fold the final borrow into t0.
      sbc(t0, t0, zr);
    } cbnz(t0, again);
  } bind(post);
}
13147
// Move memory at s to d, reversing words.
// Copies len 64-bit words: s is first advanced to the end of the source
// and then read backwards while d is written forwards (see reverse1).
// Increments d to end of copied memory
// Destroys tmp1, tmp2
// Preserves len
// Leaves s pointing to the address which was in d at start
void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
  // The temporaries must be numbered below r19. NOTE(review): presumably
  // because this runs outside the save_regs()/restore_regs() window in the
  // generate_* methods, so higher registers must not be clobbered -- confirm.
  assert(tmp1->encoding() < r19->encoding(), "register corruption");
  assert(tmp2->encoding() < r19->encoding(), "register corruption");

  // s = s + len words: one past the last source word.
  lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
  // Count in a temporary so that len itself is preserved.
  mov(tmp1, len);
  // unroll_2 is defined outside this view; presumably it emits a loop that
  // runs reverse1(d, s, tmp2) tmp1 times -- confirm.
  unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
  // d now points past the copy; s = d - len words = the original d.
  sub(s, d, len, ext::uxtw, LogBytesPerWord);
}
// where
// reverse1 moves a single 64-bit word for reverse(): it steps s back one
// word and loads from there, rotates the value by 32 bits (exchanging its
// two 32-bit halves), then stores it at d and steps d forward one word.
void reverse1(Register d, Register s, Register tmp) {
  ldr(tmp, pre(s, -wordSize));   // tmp = *--s
  ror(tmp, tmp, 32);             // swap the two 32-bit halves
  str(tmp, post(d, wordSize));   // *d++ = tmp
}
13168
// One inner-loop step for squaring: an ordinary step() followed by one
// more accumulation of Rhi_ab:Rlo_ab into t2:t1:t0. This matches MACC2 in
// the C sketch of montgomery_square. step() is defined outside this view;
// presumably it leaves the current a*b product in Rhi_ab:Rlo_ab -- confirm.
void step_squaring() {
  // An extra ACC
  step();
  acc(Rhi_ab, Rlo_ab, t0, t1, t2);
}
13174
13175 void last_squaring(RegisterOrConstant i) {
13176 Label dont;
13177 // if ((i & 1) == 0) {
13178 tbnz(i.as_register(), 0, dont); {
13179 // MACC(Ra, Rb, t0, t1, t2);
13180 // Ra = *++Pa;
13181 // Rb = *--Pb;
13182 umulh(Rhi_ab, Ra, Rb);
13183 mul(Rlo_ab, Ra, Rb);
13184 acc(Rhi_ab, Rlo_ab, t0, t1, t2);
13185 } bind(dont);
13186 }
13187
// An inner-loop step for squaring that handles only the m*n product.
// It first accumulates the m*n product left pending in Rhi_mn:Rlo_mn,
// then computes the next Rm*Rn product (left pending in turn) and
// advances Pm forwards and Pn backwards to load the next operands.
void extra_step_squaring() {
  acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n

  // MACC(Rm, Rn, t0, t1, t2);
  // Rm = *++Pm;
  // Rn = *--Pn;
  // The new product is not accumulated here; it stays in Rhi_mn:Rlo_mn
  // until the next acc() of the pending m*n.
  umulh(Rhi_mn, Rm, Rn);
  mul(Rlo_mn, Rm, Rn);
  ldr(Rm, pre(Pm, wordSize));
  ldr(Rn, pre(Pn, -wordSize));
}
13199
// Column epilogue for the first squaring loop. Accumulates the pending m*n
// product, computes and stores the new digit Rm = t0 * inv at *Pm, adds
// Rm * Rn to the accumulator (computing only the high half explicitly),
// and shifts the accumulator down by one word.
void post1_squaring() {
  acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n

  // *Pm = Rm = t0 * inv;
  mul(Rm, t0, inv);
  str(Rm, Address(Pm));

  // MACC(Rm, Rn, t0, t1, t2);
  // t0 = t1; t1 = t2; t2 = 0;
  umulh(Rhi_mn, Rm, Rn);

#ifndef PRODUCT
  // Non-product builds only: verify that the low half of Rm * Rn really
  // cancels t0, and stop the VM if it does not.
  // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
  {
    mul(Rlo_mn, Rm, Rn);
    add(Rlo_mn, t0, Rlo_mn);
    Label ok;
    cbz(Rlo_mn, ok); {
      stop("broken Montgomery multiply");
    } bind(ok);
  }
#endif
  // We have very carefully set things up so that
  // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
  // the lower half of Rm * Rn because we know the result already:
  // it must be -t0. t0 + (-t0) must generate a carry iff
  // t0 != 0. So, rather than do a mul and an adds we just set
  // the carry flag iff t0 is nonzero.
  //
  // mul(Rlo_mn, Rm, Rn);
  // adds(zr, t0, Rlo_mn);
  subs(zr, t0, 1); // Set carry iff t0 is nonzero
  // t0 = t1 + high(Rm * Rn) + carry; t1 = t2 + carry; t2 = 0.
  adcs(t0, t1, Rhi_mn);
  adc(t1, t2, zr);
  mov(t2, zr);
}
13236
// Adds the 128-bit value Rhi:Rlo to the three-word accumulator t2:t1:t0,
// rippling the carry through all three words. The parameters t0, t1 and t2
// name the accumulator registers to use.
void acc(Register Rhi, Register Rlo,
         Register t0, Register t1, Register t2) {
  adds(t0, t0, Rlo);   // low word; sets the carry
  adcs(t1, t1, Rhi);   // middle word plus carry in; sets the carry
  adc(t2, t2, zr);     // top word absorbs the final carry
}
13243
13244 public:
/**
 * Fast Montgomery multiplication. The derivation of the
 * algorithm is in A Cryptographic Library for the Motorola
 * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
 *
 * Emits the stub at the current code position and returns its entry
 * address. The same generator also serves squaring: when _squaring is
 * set, the separate b operand is not copied.
 *
 * The emitted code returns immediately for a zero length, stops the VM
 * for a length above 512 ints, and otherwise works on word-reversed
 * copies of the inputs placed on the stack, copying the result back to
 * the caller's m array at the end.
 *
 * Arguments:
 *
 * Inputs for multiplication:
 * c_rarg0 - int array elements a
 * c_rarg1 - int array elements b
 * c_rarg2 - int array elements n (the modulus)
 * c_rarg3 - int length
 * c_rarg4 - int inv
 * c_rarg5 - int array elements m (the result)
 *
 * Inputs for squaring:
 * c_rarg0 - int array elements a
 * c_rarg1 - int array elements n (the modulus)
 * c_rarg2 - int length
 * c_rarg3 - int inv
 * c_rarg4 - int array elements m (the result)
 *
 */
address generate_multiply() {
  Label argh, nothing;

  align(CodeEntryAlignment);
  address entry = pc();

  // Zero length: nothing to do. This branch is taken before enter(), and
  // "nothing" is bound after leave(), so no frame is built or torn down.
  cbzw(Rlen, nothing);

  enter();

  // Make room.
  // Lengths above 512 ints are rejected. Otherwise reserve
  // Rlen * 4 * sizeof(jint) bytes below sp and align the new sp down to
  // 2 * wordSize.
  cmpw(Rlen, 512);
  br(Assembler::HI, argh);
  sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
  andr(sp, Ra, -2 * wordSize);

  lsrw(Rlen, Rlen, 1); // length in longwords = len/2

  {
    // Copy input args, reversing as we go. We use Ra as a
    // temporary variable.
    // Each reverse() leaves Ra just past what it copied and leaves the
    // source base register pointing at its reversed copy.
    reverse(Ra, Pa_base, Rlen, t0, t1);
    if (!_squaring)
      reverse(Ra, Pb_base, Rlen, t0, t1);
    reverse(Ra, Pn_base, Rlen, t0, t1);
  }

  // Push all call-saved registers and also Pm_base which we'll need
  // at the end.
  save_regs();

#ifndef PRODUCT
  // Non-product builds only: check the caller-supplied inverse.
  // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
  {
    ldr(Rn, Address(Pn_base, 0));
    mul(Rlo_mn, Rn, inv);
    subs(zr, Rlo_mn, -1);
    Label ok;
    br(EQ, ok); {
      stop("broken inverse in Montgomery multiply");
    } bind(ok);
  }
#endif

  // The result is built in the stack area that follows the copied
  // inputs; the caller's Pm_base was saved by save_regs() above.
  mov(Pm_base, Ra);

  // Clear the triple-precision accumulator t2:t1:t0.
  mov(t0, zr);
  mov(t1, zr);
  mov(t2, zr);

  block_comment("for (int i = 0; i < len; i++) {");
  mov(Ri, zr); {
    Label loop, end;
    // Guard test before the first iteration; the loop itself tests at
    // the bottom.
    cmpw(Ri, Rlen);
    br(Assembler::GE, end);

    bind(loop);
    pre1(Ri);

    block_comment(" for (j = i; j; j--) {"); {
      movw(Rj, Ri);
      unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
    } block_comment(" } // j");

    post1();
    addw(Ri, Ri, 1);
    cmpw(Ri, Rlen);
    br(Assembler::LT, loop);
    bind(end);
    block_comment("} // i");
  }

  block_comment("for (int i = len; i < 2*len; i++) {");
  mov(Ri, Rlen); {
    Label loop, end;
    // Compare i against 2*len (Rlen shifted left by one).
    cmpw(Ri, Rlen, Assembler::LSL, 1);
    br(Assembler::GE, end);

    bind(loop);
    pre2(Ri, Rlen);

    block_comment(" for (j = len*2-i-1; j; j--) {"); {
      // Rj = 2*len - i - 1
      lslw(Rj, Rlen, 1);
      subw(Rj, Rj, Ri);
      subw(Rj, Rj, 1);
      unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
    } block_comment(" } // j");

    post2(Ri, Rlen);
    addw(Ri, Ri, 1);
    cmpw(Ri, Rlen, Assembler::LSL, 1);
    br(Assembler::LT, loop);
    bind(end);
  }
  block_comment("} // i");

  // Subtract n while the accumulator word t0 is nonzero.
  normalize(Rlen);

  mov(Ra, Pm_base); // Save Pm_base in Ra
  restore_regs(); // Restore caller's Pm_base

  // Copy our result into caller's Pm_base
  reverse(Pm_base, Ra, Rlen, t0, t1);

  leave();
  bind(nothing);
  ret(lr);

  // handler for error case
  // Emitted after the return, so it is reached only through the branch
  // taken when the length exceeds 512.
  bind(argh);
  stop("MontgomeryMultiply total_allocation must be <= 8192");

  return entry;
}
13382 // In C, approximately:
13383
13384 // void
13385 // montgomery_multiply(julong Pa_base[], julong Pb_base[],
13386 // julong Pn_base[], julong Pm_base[],
13387 // julong inv, int len) {
13388 // julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
13389 // julong *Pa, *Pb, *Pn, *Pm;
13390 // julong Ra, Rb, Rn, Rm;
13391
13392 // int i;
13393
13394 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
13395
13396 // for (i = 0; i < len; i++) {
13397 // int j;
13398
13399 // Pa = Pa_base;
13400 // Pb = Pb_base + i;
13401 // Pm = Pm_base;
13402 // Pn = Pn_base + i;
13403
13404 // Ra = *Pa;
13405 // Rb = *Pb;
13406 // Rm = *Pm;
13407 // Rn = *Pn;
13408
13409 // int iters = i;
13410 // for (j = 0; iters--; j++) {
13411 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
13412 // MACC(Ra, Rb, t0, t1, t2);
13413 // Ra = *++Pa;
13414 // Rb = *--Pb;
13415 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
13416 // MACC(Rm, Rn, t0, t1, t2);
13417 // Rm = *++Pm;
13418 // Rn = *--Pn;
13419 // }
13420
13421 // assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
13422 // MACC(Ra, Rb, t0, t1, t2);
13423 // *Pm = Rm = t0 * inv;
13424 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
13425 // MACC(Rm, Rn, t0, t1, t2);
13426
13427 // assert(t0 == 0, "broken Montgomery multiply");
13428
13429 // t0 = t1; t1 = t2; t2 = 0;
13430 // }
13431
13432 // for (i = len; i < 2*len; i++) {
13433 // int j;
13434
13435 // Pa = Pa_base + i-len;
13436 // Pb = Pb_base + len;
13437 // Pm = Pm_base + i-len;
13438 // Pn = Pn_base + len;
13439
13440 // Ra = *++Pa;
13441 // Rb = *--Pb;
13442 // Rm = *++Pm;
13443 // Rn = *--Pn;
13444
13445 // int iters = len*2-i-1;
13446 // for (j = i-len+1; iters--; j++) {
13447 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
13448 // MACC(Ra, Rb, t0, t1, t2);
13449 // Ra = *++Pa;
13450 // Rb = *--Pb;
13451 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
13452 // MACC(Rm, Rn, t0, t1, t2);
13453 // Rm = *++Pm;
13454 // Rn = *--Pn;
13455 // }
13456
13457 // Pm_base[i-len] = t0;
13458 // t0 = t1; t1 = t2; t2 = 0;
13459 // }
13460
13461 // while (t0)
13462 // t0 = sub(Pm_base, Pn_base, t0, len);
13463 // }
13464
/**
 * Fast Montgomery squaring. This uses asymptotically 25% fewer
 * multiplies than Montgomery multiplication so it should be up to
 * 25% faster. However, its loop control is more complex and it
 * may actually run slower on some machines.
 *
 * Emits the stub at the current code position and returns its entry
 * address. Unlike generate_multiply() there is no early exit for a
 * zero length and no debug check of inv.
 *
 * Arguments:
 *
 * Inputs:
 * c_rarg0 - int array elements a
 * c_rarg1 - int array elements n (the modulus)
 * c_rarg2 - int length
 * c_rarg3 - int inv
 * c_rarg4 - int array elements m (the result)
 *
 */
address generate_square() {
  Label argh;

  align(CodeEntryAlignment);
  address entry = pc();

  enter();

  // Make room.
  // Lengths above 512 ints are rejected. Otherwise reserve
  // Rlen * 4 * sizeof(jint) bytes below sp and align the new sp down to
  // 2 * wordSize.
  cmpw(Rlen, 512);
  br(Assembler::HI, argh);
  sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
  andr(sp, Ra, -2 * wordSize);

  lsrw(Rlen, Rlen, 1); // length in longwords = len/2

  {
    // Copy input args, reversing as we go. We use Ra as a
    // temporary variable.
    reverse(Ra, Pa_base, Rlen, t0, t1);
    reverse(Ra, Pn_base, Rlen, t0, t1);
  }

  // Push all call-saved registers and also Pm_base which we'll need
  // at the end.
  save_regs();

  // The result is built in the stack area that follows the copied inputs.
  mov(Pm_base, Ra);

  // Clear the triple-precision accumulator t2:t1:t0.
  mov(t0, zr);
  mov(t1, zr);
  mov(t2, zr);

  block_comment("for (int i = 0; i < len; i++) {");
  mov(Ri, zr); {
    Label loop, end;
    bind(loop);
    // This test sits inside the loop, so it runs on every iteration in
    // addition to the bottom test.
    cmp(Ri, Rlen);
    br(Assembler::GE, end);

    pre1(Ri);

    block_comment("for (j = (i+1)/2; j; j--) {"); {
      // Rj = (i + 1) / 2
      add(Rj, Ri, 1);
      lsr(Rj, Rj, 1);
      unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
    } block_comment(" } // j");

    // Middle term, accumulated only when i is even.
    last_squaring(Ri);

    block_comment(" for (j = i/2; j; j--) {"); {
      lsr(Rj, Ri, 1);
      unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
    } block_comment(" } // j");

    post1_squaring();
    add(Ri, Ri, 1);
    cmp(Ri, Rlen);
    br(Assembler::LT, loop);

    bind(end);
    block_comment("} // i");
  }

  block_comment("for (int i = len; i < 2*len; i++) {");
  mov(Ri, Rlen); {
    Label loop, end;
    bind(loop);
    // Compare i against 2*len (Rlen shifted left by one).
    cmp(Ri, Rlen, Assembler::LSL, 1);
    br(Assembler::GE, end);

    pre2(Ri, Rlen);

    block_comment(" for (j = (2*len-i-1)/2; j; j--) {"); {
      // Rj = (2*len - i - 1) / 2
      lsl(Rj, Rlen, 1);
      sub(Rj, Rj, Ri);
      sub(Rj, Rj, 1);
      lsr(Rj, Rj, 1);
      unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
    } block_comment(" } // j");

    // Middle term, accumulated only when i is even.
    last_squaring(Ri);

    block_comment(" for (j = (2*len-i)/2; j; j--) {"); {
      // Rj = (2*len - i) / 2
      lsl(Rj, Rlen, 1);
      sub(Rj, Rj, Ri);
      lsr(Rj, Rj, 1);
      unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
    } block_comment(" } // j");

    post2(Ri, Rlen);
    add(Ri, Ri, 1);
    cmp(Ri, Rlen, Assembler::LSL, 1);

    br(Assembler::LT, loop);
    bind(end);
    block_comment("} // i");
  }

  // Subtract n while the accumulator word t0 is nonzero.
  normalize(Rlen);

  mov(Ra, Pm_base); // Save Pm_base in Ra
  restore_regs(); // Restore caller's Pm_base

  // Copy our result into caller's Pm_base
  reverse(Pm_base, Ra, Rlen, t0, t1);

  leave();
  ret(lr);

  // handler for error case
  // Emitted after the return, so it is reached only through the branch
  // taken when the length exceeds 512.
  bind(argh);
  stop("MontgomeryMultiply total_allocation must be <= 8192");

  return entry;
}
13597 // In C, approximately:
13598
13599 // void
13600 // montgomery_square(julong Pa_base[], julong Pn_base[],
13601 // julong Pm_base[], julong inv, int len) {
13602 // julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
13603 // julong *Pa, *Pb, *Pn, *Pm;
13604 // julong Ra, Rb, Rn, Rm;
13605
13606 // int i;
13607
13608 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
13609
13610 // for (i = 0; i < len; i++) {
13611 // int j;
13612
13613 // Pa = Pa_base;
13614 // Pb = Pa_base + i;
13615 // Pm = Pm_base;
13616 // Pn = Pn_base + i;
13617
13618 // Ra = *Pa;
13619 // Rb = *Pb;
13620 // Rm = *Pm;
13621 // Rn = *Pn;
13622
13623 // int iters = (i+1)/2;
13624 // for (j = 0; iters--; j++) {
13625 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
13626 // MACC2(Ra, Rb, t0, t1, t2);
13627 // Ra = *++Pa;
13628 // Rb = *--Pb;
13629 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
13630 // MACC(Rm, Rn, t0, t1, t2);
13631 // Rm = *++Pm;
13632 // Rn = *--Pn;
13633 // }
13634 // if ((i & 1) == 0) {
13635 // assert(Ra == Pa_base[j], "must be");
13636 // MACC(Ra, Ra, t0, t1, t2);
13637 // }
13638 // iters = i/2;
13639 // assert(iters == i-j, "must be");
13640 // for (; iters--; j++) {
13641 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
13642 // MACC(Rm, Rn, t0, t1, t2);
13643 // Rm = *++Pm;
13644 // Rn = *--Pn;
13645 // }
13646
13647 // *Pm = Rm = t0 * inv;
13648 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
13649 // MACC(Rm, Rn, t0, t1, t2);
13650
13651 // assert(t0 == 0, "broken Montgomery multiply");
13652
13653 // t0 = t1; t1 = t2; t2 = 0;
13654 // }
13655
13656 // for (i = len; i < 2*len; i++) {
13657 // int start = i-len+1;
13658 // int end = start + (len - start)/2;
13659 // int j;
13660
13661 // Pa = Pa_base + i-len;
13662 // Pb = Pa_base + len;
13663 // Pm = Pm_base + i-len;
13664 // Pn = Pn_base + len;
13665
13666 // Ra = *++Pa;
13667 // Rb = *--Pb;
13668 // Rm = *++Pm;
13669 // Rn = *--Pn;
13670
13671 // int iters = (2*len-i-1)/2;
13672 // assert(iters == end-start, "must be");
13673 // for (j = start; iters--; j++) {
13674 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
13675 // MACC2(Ra, Rb, t0, t1, t2);
13676 // Ra = *++Pa;
13677 // Rb = *--Pb;
13678 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
13679 // MACC(Rm, Rn, t0, t1, t2);
13680 // Rm = *++Pm;
13681 // Rn = *--Pn;
13682 // }
13683 // if ((i & 1) == 0) {
13684 // assert(Ra == Pa_base[j], "must be");
13685 // MACC(Ra, Ra, t0, t1, t2);
13686 // }
13687 // iters = (2*len-i)/2;
13688 // assert(iters == len-j, "must be");
13689 // for (; iters--; j++) {
13690 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
13691 // MACC(Rm, Rn, t0, t1, t2);
13692 // Rm = *++Pm;
13693 // Rn = *--Pn;
13694 // }
13695 // Pm_base[i-len] = t0;
13696 // t0 = t1; t1 = t2; t2 = 0;
13697 // }
13698
13699 // while (t0)
13700 // t0 = sub(Pm_base, Pn_base, t0, len);
13701 // }
13702 };
13703
// Call here from the interpreter or compiled code to either load
// multiple returned values from the inline type instance being
// returned to registers or to store returned values to a newly
// allocated inline type instance.
//
// Builds a RuntimeStub called 'name' that saves the Java argument
// registers, calls 'destination' with (rthread, r0), restores the
// registers, and forwards any pending exception. When has_res is true the
// stub additionally fetches the VM result into r0 and, if VM result
// metadata is present, calls the pack handler found through it.
// Returns the entry point of the new stub.
address generate_return_value_stub(address destination, const char* name, bool has_res) {
  // We need to save all registers the calling convention may use so
  // the runtime calls read or update those registers. This needs to
  // be in sync with SharedRuntime::java_return_convention().
  // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
  //
  // Frame layout from the lowest address upwards, counted in
  // BytesPerInt-sized slots (two per saved register). It mirrors the
  // push order below: the integer registers are pushed last and so sit
  // lowest.
  enum layout {
    j_rarg7_off = 0, j_rarg7_2, // j_rarg7 is r0
    j_rarg6_off, j_rarg6_2,
    j_rarg5_off, j_rarg5_2,
    j_rarg4_off, j_rarg4_2,
    j_rarg3_off, j_rarg3_2,
    j_rarg2_off, j_rarg2_2,
    j_rarg1_off, j_rarg1_2,
    j_rarg0_off, j_rarg0_2,

    j_farg7_off, j_farg7_2,
    j_farg6_off, j_farg6_2,
    j_farg5_off, j_farg5_2,
    j_farg4_off, j_farg4_2,
    j_farg3_off, j_farg3_2,
    j_farg2_off, j_farg2_2,
    j_farg1_off, j_farg1_2,
    j_farg0_off, j_farg0_2,

    rfp_off, rfp_off2,
    return_off, return_off2,

    framesize // inclusive of return address
  };

  CodeBuffer code(name, 512, 64);
  MacroAssembler* masm = new MacroAssembler(&code);

  // The frame size must already be a multiple of 16 bytes; the assert
  // checks that align_up() changed nothing.
  int frame_size_in_bytes = align_up(framesize*BytesPerInt, 16);
  assert(frame_size_in_bytes == framesize*BytesPerInt, "misaligned");
  int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
  int frame_size_in_words = frame_size_in_bytes / wordSize;

  OopMapSet* oop_maps = new OopMapSet();
  OopMap* map = new OopMap(frame_size_in_slots, 0);

  // Record the stack slot in which each saved register lives.
  map->set_callee_saved(VMRegImpl::stack2reg(j_rarg7_off), j_rarg7->as_VMReg());
  map->set_callee_saved(VMRegImpl::stack2reg(j_rarg6_off), j_rarg6->as_VMReg());
  map->set_callee_saved(VMRegImpl::stack2reg(j_rarg5_off), j_rarg5->as_VMReg());
  map->set_callee_saved(VMRegImpl::stack2reg(j_rarg4_off), j_rarg4->as_VMReg());
  map->set_callee_saved(VMRegImpl::stack2reg(j_rarg3_off), j_rarg3->as_VMReg());
  map->set_callee_saved(VMRegImpl::stack2reg(j_rarg2_off), j_rarg2->as_VMReg());
  map->set_callee_saved(VMRegImpl::stack2reg(j_rarg1_off), j_rarg1->as_VMReg());
  map->set_callee_saved(VMRegImpl::stack2reg(j_rarg0_off), j_rarg0->as_VMReg());

  map->set_callee_saved(VMRegImpl::stack2reg(j_farg0_off), j_farg0->as_VMReg());
  map->set_callee_saved(VMRegImpl::stack2reg(j_farg1_off), j_farg1->as_VMReg());
  map->set_callee_saved(VMRegImpl::stack2reg(j_farg2_off), j_farg2->as_VMReg());
  map->set_callee_saved(VMRegImpl::stack2reg(j_farg3_off), j_farg3->as_VMReg());
  map->set_callee_saved(VMRegImpl::stack2reg(j_farg4_off), j_farg4->as_VMReg());
  map->set_callee_saved(VMRegImpl::stack2reg(j_farg5_off), j_farg5->as_VMReg());
  map->set_callee_saved(VMRegImpl::stack2reg(j_farg6_off), j_farg6->as_VMReg());
  map->set_callee_saved(VMRegImpl::stack2reg(j_farg7_off), j_farg7->as_VMReg());

  address start = __ pc();

  __ enter(); // Save FP and LR before call

  // Push the floating-point argument registers, then the integer ones,
  // two registers (2 * wordSize bytes) at a time.
  __ stpd(j_farg1, j_farg0, Address(__ pre(sp, -2 * wordSize)));
  __ stpd(j_farg3, j_farg2, Address(__ pre(sp, -2 * wordSize)));
  __ stpd(j_farg5, j_farg4, Address(__ pre(sp, -2 * wordSize)));
  __ stpd(j_farg7, j_farg6, Address(__ pre(sp, -2 * wordSize)));

  __ stp(j_rarg1, j_rarg0, Address(__ pre(sp, -2 * wordSize)));
  __ stp(j_rarg3, j_rarg2, Address(__ pre(sp, -2 * wordSize)));
  __ stp(j_rarg5, j_rarg4, Address(__ pre(sp, -2 * wordSize)));
  __ stp(j_rarg7, j_rarg6, Address(__ pre(sp, -2 * wordSize)));

  int frame_complete = __ offset();

  // Set up last_Java_sp and last_Java_fp
  address the_pc = __ pc();
  __ set_last_Java_frame(sp, noreg, the_pc, rscratch1);

  // Call runtime
  // Arguments: c_rarg0 = current thread, c_rarg1 = incoming r0.
  __ mov(c_rarg1, r0);
  __ mov(c_rarg0, rthread);

  __ mov(rscratch1, destination);
  __ blr(rscratch1);

  // The oop map is registered at the offset of the pc recorded as the
  // last Java pc above.
  oop_maps->add_gc_map(the_pc - start, map);

  __ reset_last_Java_frame(false);

  // Pop the saved registers in the reverse of the push order.
  __ ldp(j_rarg7, j_rarg6, Address(__ post(sp, 2 * wordSize)));
  __ ldp(j_rarg5, j_rarg4, Address(__ post(sp, 2 * wordSize)));
  __ ldp(j_rarg3, j_rarg2, Address(__ post(sp, 2 * wordSize)));
  __ ldp(j_rarg1, j_rarg0, Address(__ post(sp, 2 * wordSize)));

  __ ldpd(j_farg7, j_farg6, Address(__ post(sp, 2 * wordSize)));
  __ ldpd(j_farg5, j_farg4, Address(__ post(sp, 2 * wordSize)));
  __ ldpd(j_farg3, j_farg2, Address(__ post(sp, 2 * wordSize)));
  __ ldpd(j_farg1, j_farg0, Address(__ post(sp, 2 * wordSize)));

  // check for pending exceptions
  Label pending;
  __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
  __ cbnz(rscratch1, pending);

  if (has_res) {
    // We just called SharedRuntime::store_inline_type_fields_to_buf. Check if we still
    // need to initialize the buffer and if so, call the inline class specific pack handler.
    // r0 receives the VM result oop (replacing the r0 value restored
    // above); a null VM result metadata skips the pack handler call.
    Label skip_pack;
    __ get_vm_result_oop(r0, rthread);
    __ get_vm_result_metadata(rscratch1, rthread);
    __ cbz(rscratch1, skip_pack);
    __ ldr(rscratch1, Address(rscratch1, InlineKlass::adr_members_offset()));
    __ ldr(rscratch1, Address(rscratch1, InlineKlass::pack_handler_offset()));
    __ blr(rscratch1);
    __ membar(Assembler::StoreStore);
    __ bind(skip_pack);
  }

  // The frame is still in place while the pack handler is called; it is
  // only removed here, just before returning.
  __ leave();
  __ ret(lr);

  // Pending exception: drop the frame and continue in the shared
  // forward-exception stub.
  __ bind(pending);
  __ leave();
  __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));

  // -------------
  // make sure all code is generated
  masm->flush();

  RuntimeStub* stub = RuntimeStub::new_runtime_stub(name, &code, frame_complete, frame_size_in_words, oop_maps, false);
  return stub->entry_point();
}
13841
// Initialization
// Generates the "preuniverse" group of stubs. Intentionally empty on
// this platform.
void generate_preuniverse_stubs() {
  // preuniverse stubs are not needed for aarch64
}
13846
13847 void generate_initial_stubs() {
13848 // Generate initial stubs and initializes the entry points
13849
13850 // entry points that exist in all platforms Note: This is code
13851 // that could be shared among different platforms - however the
13852 // benefit seems to be smaller than the disadvantage of having a
13853 // much more complicated generator structure. See also comment in
13854 // stubRoutines.hpp.
13855
13856 StubRoutines::_forward_exception_entry = generate_forward_exception();
13857
13858 StubRoutines::_call_stub_entry =
13859 generate_call_stub(StubRoutines::_call_stub_return_address);
13860
13861 // is referenced by megamorphic call
13862 StubRoutines::_catch_exception_entry = generate_catch_exception();
13863
13864 // Initialize table for copy memory (arraycopy) check.
13865 if (UnsafeMemoryAccess::_table == nullptr) {
13866 UnsafeMemoryAccess::create_table(8 + 4); // 8 for copyMemory; 4 for setMemory
13867 }
13868
13869 if (UseCRC32Intrinsics) {
13870 StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
13871 }
13872
13873 if (UseCRC32CIntrinsics) {
13874 StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
13875 }
13876
13877 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
13878 StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
13879 }
13880
13881 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
13882 StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
13883 }
13884
13885 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_float16ToFloat) &&
13886 vmIntrinsics::is_intrinsic_available(vmIntrinsics::_floatToFloat16)) {
13887 StubRoutines::_hf2f = generate_float16ToFloat();
13888 StubRoutines::_f2hf = generate_floatToFloat16();
13889 }
13890
13891 if (InlineTypeReturnedAsFields) {
13892 StubRoutines::_load_inline_type_fields_in_regs =
13893 generate_return_value_stub(CAST_FROM_FN_PTR(address, SharedRuntime::load_inline_type_fields_in_regs), "load_inline_type_fields_in_regs", false);
13894 StubRoutines::_store_inline_type_fields_to_buf =
13895 generate_return_value_stub(CAST_FROM_FN_PTR(address, SharedRuntime::store_inline_type_fields_to_buf), "store_inline_type_fields_to_buf", true);
13896 }
13897
13898 }
13899
// Generates the continuation stubs (thaw, the two return barriers and the
// preempt stub) and publishes their entry points in StubRoutines. The
// stubs are generated unconditionally, in the order listed.
void generate_continuation_stubs() {
  // Continuation stubs:
  StubRoutines::_cont_thaw = generate_cont_thaw();
  StubRoutines::_cont_returnBarrier = generate_cont_returnBarrier();
  StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception();
  StubRoutines::_cont_preempt_stub = generate_cont_preempt_stub();
}
13907
13908 void generate_final_stubs() {
13909 // support for verify_oop (must happen after universe_init)
13910 if (VerifyOops) {
13911 StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
13912 }
13913
13914 // arraycopy stubs used by compilers
13915 generate_arraycopy_stubs();
13916
13917 StubRoutines::_method_entry_barrier = generate_method_entry_barrier();
13918
13919 StubRoutines::aarch64::_spin_wait = generate_spin_wait();
13920
13921 StubRoutines::_upcall_stub_exception_handler = generate_upcall_stub_exception_handler();
13922 StubRoutines::_upcall_stub_load_target = generate_upcall_stub_load_target();
13923
13924 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
13925
13926 generate_atomic_entry_points();
13927
13928 #endif // LINUX
13929
13930 #ifdef COMPILER2
13931 if (UseSecondarySupersTable) {
13932 StubRoutines::_lookup_secondary_supers_table_slow_path_stub = generate_lookup_secondary_supers_table_slow_path_stub();
13933 if (! InlineSecondarySupersTest) {
13934 generate_lookup_secondary_supers_table_stub();
13935 }
13936 }
13937 #endif
13938
13939 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_setMemory)) {
13940 StubRoutines::_unsafe_setmemory = generate_unsafe_setmemory();
13941 }
13942
13943 StubRoutines::aarch64::set_completed(); // Inidicate that arraycopy and zero_blocks stubs are generated
13944 }
13945
13946 void generate_compiler_stubs() {
13947 #ifdef COMPILER2
13948
13949 if (UseSVE == 0) {
13950 generate_iota_indices(StubId::stubgen_vector_iota_indices_id);
13951 }
13952
13953 // array equals stub for large arrays.
13954 if (!UseSimpleArrayEquals) {
13955 StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
13956 }
13957
// arrays_hashcode stub for large arrays.
13959 StubRoutines::aarch64::_large_arrays_hashcode_boolean = generate_large_arrays_hashcode(T_BOOLEAN);
13960 StubRoutines::aarch64::_large_arrays_hashcode_byte = generate_large_arrays_hashcode(T_BYTE);
13961 StubRoutines::aarch64::_large_arrays_hashcode_char = generate_large_arrays_hashcode(T_CHAR);
13962 StubRoutines::aarch64::_large_arrays_hashcode_int = generate_large_arrays_hashcode(T_INT);
13963 StubRoutines::aarch64::_large_arrays_hashcode_short = generate_large_arrays_hashcode(T_SHORT);
13964
13965 // byte_array_inflate stub for large arrays.
13966 StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();
13967
13968 // countPositives stub for large arrays.
13969 StubRoutines::aarch64::_count_positives = generate_count_positives(StubRoutines::aarch64::_count_positives_long);
13970
13971 generate_compare_long_strings();
13972
13973 generate_string_indexof_stubs();
13974
13975 if (UseMultiplyToLenIntrinsic) {
13976 StubRoutines::_multiplyToLen = generate_multiplyToLen();
13977 }
13978
13979 if (UseSquareToLenIntrinsic) {
13980 StubRoutines::_squareToLen = generate_squareToLen();
13981 }
13982
13983 if (UseMulAddIntrinsic) {
13984 StubRoutines::_mulAdd = generate_mulAdd();
13985 }
13986
13987 if (UseSIMDForBigIntegerShiftIntrinsics) {
13988 StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
13989 StubRoutines::_bigIntegerLeftShiftWorker = generate_bigIntegerLeftShift();
13990 }
13991
13992 if (UseMontgomeryMultiplyIntrinsic) {
13993 StubId stub_id = StubId::stubgen_montgomeryMultiply_id;
13994 address start = load_archive_data(stub_id);
13995 if (start == nullptr) {
13996 // we have to generate it
13997 StubCodeMark mark(this, stub_id);
13998 MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
13999 start = g.generate_multiply();
14000 // record the stub start and end
14001 store_archive_data(stub_id, start, _masm->pc());
14002 }
14003 StubRoutines::_montgomeryMultiply = start;
14004 }
14005
14006 if (UseMontgomerySquareIntrinsic) {
14007 StubId stub_id = StubId::stubgen_montgomerySquare_id;
14008 address start = load_archive_data(stub_id);
14009 if (start == nullptr) {
14010 // we have to generate it
14011 StubCodeMark mark(this, stub_id);
14012 MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
14013 // We use generate_multiply() rather than generate_square()
14014 // because it's faster for the sizes of modulus we care about.
14015 start = g.generate_multiply();
14016 // record the stub start and end
14017 store_archive_data(stub_id, start, _masm->pc());
14018 }
14019 StubRoutines::_montgomerySquare = start;
14020 }
14021
14022 if (UseChaCha20Intrinsics) {
14023 StubRoutines::_chacha20Block = generate_chacha20Block_blockpar();
14024 }
14025
14026 if (UseIntPolyIntrinsics) {
14027 StubRoutines::_intpoly_montgomeryMult_P256 = generate_intpoly_montgomeryMult_P256();
14028 StubRoutines::_intpoly_assign = generate_intpoly_assign();
14029 }
14030
14031 if (UseKyberIntrinsics) {
14032 StubRoutines::_kyberNtt = generate_kyberNtt();
14033 StubRoutines::_kyberInverseNtt = generate_kyberInverseNtt();
14034 StubRoutines::_kyberNttMult = generate_kyberNttMult();
14035 StubRoutines::_kyberAddPoly_2 = generate_kyberAddPoly_2();
14036 StubRoutines::_kyberAddPoly_3 = generate_kyberAddPoly_3();
14037 StubRoutines::_kyber12To16 = generate_kyber12To16();
14038 StubRoutines::_kyberBarrettReduce = generate_kyberBarrettReduce();
14039 }
14040
14041 if (UseDilithiumIntrinsics) {
14042 StubRoutines::_dilithiumAlmostNtt = generate_dilithiumAlmostNtt();
14043 StubRoutines::_dilithiumAlmostInverseNtt = generate_dilithiumAlmostInverseNtt();
14044 StubRoutines::_dilithiumNttMult = generate_dilithiumNttMult();
14045 StubRoutines::_dilithiumMontMulByConstant = generate_dilithiumMontMulByConstant();
14046 StubRoutines::_dilithiumDecomposePoly = generate_dilithiumDecomposePoly();
14047 }
14048
14049 if (UseBASE64Intrinsics) {
14050 StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
14051 StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
14052 }
14053
14054 // data cache line writeback
14055 StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
14056 StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();
14057
14058 if (UseAESIntrinsics) {
14059 StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
14060 StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
14061 StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
14062 StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
14063 StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt();
14064 }
14065 if (UseGHASHIntrinsics) {
14066 // StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
14067 StubRoutines::aarch64::_ghash_processBlocks_small = generate_ghash_processBlocks_small();
14068 StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks(StubRoutines::aarch64::_ghash_processBlocks_small);
14069 }
14070 if (UseAESIntrinsics && UseGHASHIntrinsics) {
14071 StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt();
14072 }
14073
14074 if (UseMD5Intrinsics) {
14075 StubRoutines::_md5_implCompress = generate_md5_implCompress(StubId::stubgen_md5_implCompress_id);
14076 StubRoutines::_md5_implCompressMB = generate_md5_implCompress(StubId::stubgen_md5_implCompressMB_id);
14077 }
14078 if (UseSHA1Intrinsics) {
14079 StubRoutines::_sha1_implCompress = generate_sha1_implCompress(StubId::stubgen_sha1_implCompress_id);
14080 StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(StubId::stubgen_sha1_implCompressMB_id);
14081 }
14082 if (UseSHA256Intrinsics) {
14083 StubRoutines::_sha256_implCompress = generate_sha256_implCompress(StubId::stubgen_sha256_implCompress_id);
14084 StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(StubId::stubgen_sha256_implCompressMB_id);
14085 }
14086 if (UseSHA512Intrinsics) {
14087 StubRoutines::_sha512_implCompress = generate_sha512_implCompress(StubId::stubgen_sha512_implCompress_id);
14088 StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(StubId::stubgen_sha512_implCompressMB_id);
14089 }
14090 if (UseSHA3Intrinsics && UseSIMDForSHA3Intrinsic) {
14091 StubRoutines::_double_keccak = generate_double_keccak();
14092 StubRoutines::_sha3_implCompress = generate_sha3_implCompress(StubId::stubgen_sha3_implCompress_id);
14093 StubRoutines::_sha3_implCompressMB = generate_sha3_implCompress(StubId::stubgen_sha3_implCompressMB_id);
14094 } else if (UseSHA3Intrinsics) {
14095 StubRoutines::_sha3_implCompress = generate_sha3_implCompress_gpr(StubId::stubgen_sha3_implCompress_id);
14096 StubRoutines::_sha3_implCompressMB = generate_sha3_implCompress_gpr(StubId::stubgen_sha3_implCompressMB_id);
14097 }
14098
14099 if (UsePoly1305Intrinsics) {
14100 StubRoutines::_poly1305_processBlocks = generate_poly1305_processBlocks();
14101 }
14102
// Differences between the AArch64 and x86_64 intrinsics implementations
// include the lack of a square() intrinsic on AArch64; using one caused a
// 3.3% performance degradation, due to the efficiency of the symmetric
// squaring shape in Java vs. the inefficiency of the leaf calls and the
// additional cycles required for 64-bit multiplication on AArch64.
14108 if (UseIntPoly25519Intrinsics) {
14109 StubRoutines::_intpoly_mult_25519 = generate_intpoly_mult_25519();
14110 }
14111
14112 // generate Adler32 intrinsics code
14113 if (UseAdler32Intrinsics) {
14114 StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
14115 }
14116
14117 #endif // COMPILER2
14118 }
14119
14120 public:
14121 StubGenerator(CodeBuffer* code, BlobId blob_id, AOTStubData* stub_data) : StubCodeGenerator(code, blob_id, stub_data) {
14122 switch(blob_id) {
14123 case BlobId::stubgen_preuniverse_id:
14124 generate_preuniverse_stubs();
14125 break;
14126 case BlobId::stubgen_initial_id:
14127 generate_initial_stubs();
14128 break;
14129 case BlobId::stubgen_continuation_id:
14130 generate_continuation_stubs();
14131 break;
14132 case BlobId::stubgen_compiler_id:
14133 generate_compiler_stubs();
14134 break;
14135 case BlobId::stubgen_final_id:
14136 generate_final_stubs();
14137 break;
14138 default:
14139 fatal("unexpected blob id: %s", StubInfo::name(blob_id));
14140 break;
14141 };
14142 }
14143
14144 #if INCLUDE_CDS
14145 static void init_AOTAddressTable(GrowableArray<address>& external_addresses) {
14146 // external data defined in this file
14147 #define ADD(addr) external_addresses.append((address)(addr));
14148 ADD(_sha256_round_consts);
14149 ADD(_sha512_round_consts);
14150 ADD(_sha3_round_consts);
14151 ADD(_double_keccak_round_consts);
14152 ADD(_modulus_P256);
14153 ADD(_encodeBlock_toBase64);
14154 ADD(_encodeBlock_toBase64URL);
14155 ADD(_decodeBlock_fromBase64ForNoSIMD);
14156 ADD(_decodeBlock_fromBase64URLForNoSIMD);
14157 ADD(_decodeBlock_fromBase64ForSIMD);
14158 ADD(_decodeBlock_fromBase64URLForSIMD);
14159 #undef ADD
14160 }
14161 #endif // INCLUDE_CDS
14162 }; // end class declaration
14163
14164 void StubGenerator_generate(CodeBuffer* code, BlobId blob_id, AOTStubData* stub_data) {
14165 StubGenerator g(code, blob_id, stub_data);
14166 }
14167
14168 #if INCLUDE_CDS
14169 void StubGenerator_init_AOTAddressTable(GrowableArray<address>& addresses) {
14170 StubGenerator::init_AOTAddressTable(addresses);
14171 }
14172 #endif // INCLUDE_CDS
14173
14174 #if defined (LINUX)
14175
14176 // Define pointers to atomic stubs and initialize them to point to the
14177 // code in atomic_aarch64.S.
14178
14179 #define DEFAULT_ATOMIC_OP(OPNAME, SIZE, RELAXED) \
14180 extern "C" uint64_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl \
14181 (volatile void *ptr, uint64_t arg1, uint64_t arg2); \
14182 aarch64_atomic_stub_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _impl \
14183 = aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl;
14184
14185 DEFAULT_ATOMIC_OP(fetch_add, 4, )
14186 DEFAULT_ATOMIC_OP(fetch_add, 8, )
14187 DEFAULT_ATOMIC_OP(fetch_add, 4, _relaxed)
14188 DEFAULT_ATOMIC_OP(fetch_add, 8, _relaxed)
14189 DEFAULT_ATOMIC_OP(xchg, 4, )
14190 DEFAULT_ATOMIC_OP(xchg, 8, )
14191 DEFAULT_ATOMIC_OP(cmpxchg, 1, )
14192 DEFAULT_ATOMIC_OP(cmpxchg, 4, )
14193 DEFAULT_ATOMIC_OP(cmpxchg, 8, )
14194 DEFAULT_ATOMIC_OP(cmpxchg, 1, _relaxed)
14195 DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed)
14196 DEFAULT_ATOMIC_OP(cmpxchg, 8, _relaxed)
14197 DEFAULT_ATOMIC_OP(cmpxchg, 4, _release)
14198 DEFAULT_ATOMIC_OP(cmpxchg, 8, _release)
14199 DEFAULT_ATOMIC_OP(cmpxchg, 4, _seq_cst)
14200 DEFAULT_ATOMIC_OP(cmpxchg, 8, _seq_cst)
14201
14202 #undef DEFAULT_ATOMIC_OP
14203
14204 #endif // LINUX