1 /*
2 * Copyright (c) 2003, 2026, Oracle and/or its affiliates. All rights reserved.
3 * Copyright (c) 2014, 2025, Red Hat Inc. All rights reserved.
4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5 *
6 * This code is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 only, as
8 * published by the Free Software Foundation.
9 *
10 * This code is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
13 * version 2 for more details (a copy is included in the LICENSE file that
14 * accompanied this code).
15 *
16 * You should have received a copy of the GNU General Public License version
17 * 2 along with this work; if not, write to the Free Software Foundation,
18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
19 *
20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
21 * or visit www.oracle.com if you need additional information or have any
22 * questions.
23 *
24 */
25
26 #include "asm/macroAssembler.hpp"
27 #include "asm/macroAssembler.inline.hpp"
28 #include "asm/register.hpp"
29 #include "atomic_aarch64.hpp"
30 #include "compiler/oopMap.hpp"
31 #include "gc/shared/barrierSet.hpp"
32 #include "gc/shared/barrierSetAssembler.hpp"
33 #include "gc/shared/gc_globals.hpp"
34 #include "gc/shared/tlab_globals.hpp"
35 #include "interpreter/interpreter.hpp"
36 #include "memory/universe.hpp"
37 #include "nativeInst_aarch64.hpp"
38 #include "oops/instanceOop.hpp"
39 #include "oops/method.hpp"
40 #include "oops/objArrayKlass.hpp"
41 #include "oops/oop.inline.hpp"
42 #include "prims/methodHandles.hpp"
43 #include "prims/upcallLinker.hpp"
44 #include "runtime/arguments.hpp"
45 #include "runtime/atomicAccess.hpp"
46 #include "runtime/continuation.hpp"
47 #include "runtime/continuationEntry.inline.hpp"
48 #include "runtime/frame.inline.hpp"
49 #include "runtime/handles.inline.hpp"
50 #include "runtime/javaThread.hpp"
51 #include "runtime/sharedRuntime.hpp"
52 #include "runtime/stubCodeGenerator.hpp"
53 #include "runtime/stubRoutines.hpp"
54 #include "utilities/align.hpp"
55 #include "utilities/checkedCast.hpp"
56 #include "utilities/debug.hpp"
57 #include "utilities/globalDefinitions.hpp"
58 #include "utilities/intpow.hpp"
59 #include "utilities/powerOfTwo.hpp"
60 #ifdef COMPILER2
61 #include "opto/runtime.hpp"
62 #endif
63 #if INCLUDE_ZGC
64 #include "gc/z/zThreadLocalData.hpp"
65 #endif
66
67 // Declaration and definition of StubGenerator (no .hpp file).
68 // For a more detailed description of the stub routine structure
69 // see the comment in stubRoutines.hpp
70
// Assembler shorthand: `__ foo(...)` expands to `_masm->foo(...)`, i.e. a
// call through the `_masm` pointer that is in scope at the point of use.
71 #undef __
72 #define __ _masm->
73
// BLOCK_COMMENT(str): in non-PRODUCT builds forwards str to block_comment()
// on `_masm`; in PRODUCT builds it expands to nothing.
74 #ifdef PRODUCT
75 #define BLOCK_COMMENT(str) /* nothing */
76 #else
77 #define BLOCK_COMMENT(str) __ block_comment(str)
78 #endif
79
// BIND(label): meant to be written after `__` (as in `__ BIND(L);`). It binds
// the label and then emits a block comment consisting of the label's source
// name followed by ':'. Because the name is stringized, renaming a label
// changes the emitted comment text.
// NOTE(review): the expansion is two statements, so it must not be used as
// the sole body of an unbraced if/else/loop.
80 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
81
82 // Constant data definitions
83
// Round-constant table for SHA-256: 64 32-bit words in round order. The
// values match the K constants published in FIPS 180-4 for SHA-224/SHA-256
// (first entry 0x428a2f98, last entry 0xc67178f2).
// Presumably read by the SHA-256 stub generated elsewhere in this file.
84 static const uint32_t _sha256_round_consts[64] = {
85 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
86 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
87 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
88 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
89 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
90 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
91 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
92 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
93 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
94 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
95 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
96 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
97 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
98 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
99 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
100 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
101 };
102
// Round-constant table for SHA-512: 80 64-bit words in round order. The
// values match the K constants published in FIPS 180-4 for SHA-384/SHA-512
// (first entry 0x428A2F98D728AE22, last entry 0x6C44198C4A475817).
// Presumably read by the SHA-512 stub generated elsewhere in this file.
103 static const uint64_t _sha512_round_consts[80] = {
104 0x428A2F98D728AE22L, 0x7137449123EF65CDL, 0xB5C0FBCFEC4D3B2FL,
105 0xE9B5DBA58189DBBCL, 0x3956C25BF348B538L, 0x59F111F1B605D019L,
106 0x923F82A4AF194F9BL, 0xAB1C5ED5DA6D8118L, 0xD807AA98A3030242L,
107 0x12835B0145706FBEL, 0x243185BE4EE4B28CL, 0x550C7DC3D5FFB4E2L,
108 0x72BE5D74F27B896FL, 0x80DEB1FE3B1696B1L, 0x9BDC06A725C71235L,
109 0xC19BF174CF692694L, 0xE49B69C19EF14AD2L, 0xEFBE4786384F25E3L,
110 0x0FC19DC68B8CD5B5L, 0x240CA1CC77AC9C65L, 0x2DE92C6F592B0275L,
111 0x4A7484AA6EA6E483L, 0x5CB0A9DCBD41FBD4L, 0x76F988DA831153B5L,
112 0x983E5152EE66DFABL, 0xA831C66D2DB43210L, 0xB00327C898FB213FL,
113 0xBF597FC7BEEF0EE4L, 0xC6E00BF33DA88FC2L, 0xD5A79147930AA725L,
114 0x06CA6351E003826FL, 0x142929670A0E6E70L, 0x27B70A8546D22FFCL,
115 0x2E1B21385C26C926L, 0x4D2C6DFC5AC42AEDL, 0x53380D139D95B3DFL,
116 0x650A73548BAF63DEL, 0x766A0ABB3C77B2A8L, 0x81C2C92E47EDAEE6L,
117 0x92722C851482353BL, 0xA2BFE8A14CF10364L, 0xA81A664BBC423001L,
118 0xC24B8B70D0F89791L, 0xC76C51A30654BE30L, 0xD192E819D6EF5218L,
119 0xD69906245565A910L, 0xF40E35855771202AL, 0x106AA07032BBD1B8L,
120 0x19A4C116B8D2D0C8L, 0x1E376C085141AB53L, 0x2748774CDF8EEB99L,
121 0x34B0BCB5E19B48A8L, 0x391C0CB3C5C95A63L, 0x4ED8AA4AE3418ACBL,
122 0x5B9CCA4F7763E373L, 0x682E6FF3D6B2B8A3L, 0x748F82EE5DEFB2FCL,
123 0x78A5636F43172F60L, 0x84C87814A1F0AB72L, 0x8CC702081A6439ECL,
124 0x90BEFFFA23631E28L, 0xA4506CEBDE82BDE9L, 0xBEF9A3F7B2C67915L,
125 0xC67178F2E372532BL, 0xCA273ECEEA26619CL, 0xD186B8C721C0C207L,
126 0xEADA7DD6CDE0EB1EL, 0xF57D4F7FEE6ED178L, 0x06F067AA72176FBAL,
127 0x0A637DC5A2C898A6L, 0x113F9804BEF90DAEL, 0x1B710B35131C471BL,
128 0x28DB77F523047D84L, 0x32CAAB7B40C72493L, 0x3C9EBE0A15C9BEBCL,
129 0x431D67C49C100D4CL, 0x4CC5D4BECB3E42B6L, 0x597F299CFC657E2AL,
130 0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L
131 };
132
// Round-constant table for SHA-3: 24 64-bit words in round order. The values
// match the Keccak-f[1600] iota-step round constants from FIPS 202.
// Presumably read by the SHA-3 stub generated elsewhere in this file.
133 static const uint64_t _sha3_round_consts[24] = {
134 0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
135 0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
136 0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
137 0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
138 0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
139 0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
140 0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
141 0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
142 };
143
// Round-constant table for the "double Keccak" code: 24 64-bit words. Every
// entry is identical to the corresponding entry of _sha3_round_consts; the
// data is kept as a separate array.
// NOTE(review): presumably separate so the double-Keccak stub owns its own
// table -- confirm against its consumer before merging the two arrays.
144 static const uint64_t _double_keccak_round_consts[24] = {
145 0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
146 0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
147 0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
148 0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
149 0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
150 0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
151 0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
152 0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
153 };
154
155 //Omit 3rd limb of modulus since it is 0
// Limbs of the modulus named P256, stored as signed 64-bit words; per the
// comment above, the all-zero third limb is left out, so four values are
// listed.
// NOTE(review): the array bound is 5 but only 4 initializers are given, so
// element [4] is implicitly zero. Confirm against the code that loads this
// table whether the bound should be 4 or the trailing zero is relied upon.
156 static const int64_t _modulus_P256[5] = {
157 0x000fffffffffffffL, 0x00000fffffffffffL,
158 0x0000001000000000L, 0x0000ffffffff0000L
159 };
160
// Base64 encode alphabet: entry i is the ASCII character for the 6-bit value
// i ('A'-'Z' = 0..25, 'a'-'z' = 26..51, '0'-'9' = 52..61, '+' = 62, '/' = 63).
// Exactly 64 chars with no terminating NUL, so it is not a C string.
161 static const char _encodeBlock_toBase64[64] = {
162 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
163 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
164 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
165 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
166 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
167 };
168
// URL-safe Base64 encode alphabet: same as _encodeBlock_toBase64 except that
// the last two entries are '-' (62) and '_' (63) instead of '+' and '/'.
// Exactly 64 chars with no terminating NUL, so it is not a C string.
169 static const char _encodeBlock_toBase64URL[64] = {
170 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
171 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
172 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
173 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
174 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_'
175 };
176
177 // Non-SIMD lookup tables are mostly dumped from the fromBase64 array used in java.util.Base64,
178 // except that the trailing character '=' is also treated as an illegal value in this intrinsic. That
179 // is, java.util.Base64.fromBase64['='] = -2, while fromBase(URL)64ForNoSIMD['='] = 255 here.
// Standard-alphabet Base64 decode table, indexed by the input byte (0..255).
// Each row below holds 16 entries. A legal character maps to its 6-bit value
// ('+' (43) -> 62, '/' (47) -> 63, '0'-'9' -> 52..61, 'A'-'Z' -> 0..25,
// 'a'-'z' -> 26..51); every other byte, including '=', maps to 255.
180 static const uint8_t _decodeBlock_fromBase64ForNoSIMD[256] = {
181 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
182 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
183 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 255u, 63u,
184 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u,
185 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u,
186 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 255u,
187 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u,
188 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u,
189 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
190 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
191 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
192 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
193 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
194 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
195 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
196 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
197 };
198
// URL-safe Base64 decode table, indexed by the input byte (0..255). Each row
// below holds 16 entries. Same mapping as _decodeBlock_fromBase64ForNoSIMD
// except that '-' (45) -> 62 and '_' (95) -> 63, while '+' and '/' map to
// 255. Every illegal byte, including '=', maps to 255.
199 static const uint8_t _decodeBlock_fromBase64URLForNoSIMD[256] = {
200 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
201 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
202 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u,
203 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u,
204 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u,
205 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 63u,
206 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u,
207 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u,
208 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
209 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
210 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
211 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
212 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
213 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
214 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
215 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
216 };
217
218 // A legal value of base64 code is in range [0, 127]. We need two lookups
219 // with tbl/tbx and combine them to get the decoded data. The 1st table vector
220 // lookup uses tbl; out-of-range indices are set to 0 in the destination. The 2nd
221 // table vector lookup uses tbx; out-of-range indices are left unchanged in the
222 // destination. Input [64..126] is mapped to index [65, 127] in the second lookup.
223 // The value at index 64 is set to 0, so that we know that we already got the
224 // decoded data with the 1st lookup.
// Standard-alphabet Base64 decode table for the two-step vector lookup: 128
// entries, 16 per row. Entries 0..63 are indexed directly by input bytes
// 0..63. Entry 64 is 0, and entries 65..127 hold the values for input bytes
// 64..126 (shifted up by one slot), so 'A' (65) is found at index 66.
// 255 marks an illegal character.
225 static const uint8_t _decodeBlock_fromBase64ForSIMD[128] = {
226 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
227 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
228 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 255u, 63u,
229 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u,
230 0u, 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u,
231 14u, 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u,
232 255u, 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u,
233 40u, 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u,
234 };
235
// URL-safe Base64 decode table for the two-step vector lookup: 128 entries,
// 16 per row, laid out like _decodeBlock_fromBase64ForSIMD (entries 0..63
// direct, entry 64 is 0, entries 65..127 hold input bytes 64..126 shifted up
// by one slot). Differences from the standard table: '-' (45) -> 62 at index
// 45, and '_' (95) -> 63 at index 96; '+' and '/' are illegal (255).
236 static const uint8_t _decodeBlock_fromBase64URLForSIMD[128] = {
237 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
238 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
239 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u,
240 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u,
241 0u, 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u,
242 14u, 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u,
243 63u, 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u,
244 40u, 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u,
245 };
246
247
248 // Stub Code definitions
249
250 class StubGenerator: public StubCodeGenerator {
251 private:
252
// inc_counter_np(counter): debugging aid for bumping a statistics counter
// from generated code.
//  - PRODUCT builds: expands to the no-op expression ((void)0).
//  - other builds: emits a block comment "inc_counter <counter expression>"
//    and then calls inc_counter_np_(), which emits an incrementw on the
//    counter's address (passed as an ExternalAddress).
// NOTE(review): the non-PRODUCT expansion is two statements and already ends
// in ';', so `inc_counter_np(x);` must not be the sole body of an unbraced
// if/else. Wrapping it in do { } while (0) would fix that, but callers are
// outside this view -- confirm they all end the call with ';' first.
253 #ifdef PRODUCT
254 #define inc_counter_np(counter) ((void)0)
255 #else
// Emits code that increments the uint at &counter (via incrementw).
256 void inc_counter_np_(uint& counter) {
257 __ incrementw(ExternalAddress((address)&counter));
258 }
259 #define inc_counter_np(counter) \
260 BLOCK_COMMENT("inc_counter " #counter); \
261 inc_counter_np_(counter);
262 #endif
263
264 // Call stubs are used to call Java from C
265 //
266 // Arguments:
267 // c_rarg0: call wrapper address address
268 // c_rarg1: result address
269 // c_rarg2: result type BasicType
270 // c_rarg3: method Method*
271 // c_rarg4: (interpreter) entry point address
272 // c_rarg5: parameters intptr_t*
273 // c_rarg6: parameter size (in words) int
274 // c_rarg7: thread Thread*
275 //
276 // There is no return from the stub itself as any Java result
277 // is written to result
278 //
279 // we save r30 (lr) as the return PC at the base of the frame and
280 // link r29 (fp) below it as the frame pointer installing sp (r31)
281 // into fp.
282 //
283 // we save r0-r7, which accounts for all the c arguments.
284 //
285 // TODO: strictly do we need to save them all? they are treated as
286 // volatile by C so could we omit saving the ones we are going to
287 // place in global registers (thread? method?) or those we only use
288 // during setup of the Java call?
289 //
290 // we don't need to save r8 which C uses as an indirect result location
291 // return register.
292 //
293 // we don't need to save r9-r15 which both C and Java treat as
294 // volatile
295 //
296 // we don't need to save r16-18 because Java does not use them
297 //
298 // we save r19-r28 which Java uses as scratch registers and C
299 // expects to be callee-save
300 //
301 // we save the bottom 64 bits of each value stored in v8-v15; it is
302 // the responsibility of the caller to preserve larger values.
303 //
304 // so the stub frame looks like this when we enter Java code
305 //
306 // [ return_from_Java ] <--- sp
307 // [ argument word n ]
308 // ...
309 // -29 [ argument word 1 ]
310 // -28 [ saved Floating-point Control Register ] <--- sp_after_call
311 // -26 [ saved v15 ]
312 // -25 [ saved v14 ]
313 // -24 [ saved v13 ]
314 // -23 [ saved v12 ]
315 // -22 [ saved v11 ]
316 // -21 [ saved v10 ]
317 // -20 [ saved v9 ]
318 // -19 [ saved v8 ]
319 // -18 [ saved r28 ]
320 // -17 [ saved r27 ]
321 // -16 [ saved r26 ]
322 // -15 [ saved r25 ]
323 // -14 [ saved r24 ]
324 // -13 [ saved r23 ]
325 // -12 [ saved r22 ]
326 // -11 [ saved r21 ]
327 // -10 [ saved r20 ]
328 // -9 [ saved r19 ]
329 // -8 [ call wrapper (r0) ]
330 // -7 [ result (r1) ]
331 // -6 [ result type (r2) ]
332 // -5 [ method (r3) ]
333 // -4 [ entry point (r4) ]
334 // -3 [ parameters (r5) ]
335 // -2 [ parameter size (r6) ]
336 // -1 [ thread (r7) ]
337 // 0 [ saved fp (r29) ] <--- fp == saved sp (r31)
338 // 1 [ saved lr (r30) ]
339
340 // Call stub stack layout word offsets from fp
// Offsets of the call stub's save slots, in words relative to rfp (they are
// multiplied by wordSize where the Address objects are built).
//
// Registers are stored in pairs with stp/stpd, so only the lower slot of each
// pair has a name; the partner register sits one word above it. For example
// r20_off (-10) is the r20 slot and r19 lives at -9, and entry_point_off (-4)
// is followed by the parameters pointer at -3.
//
// generate_call_stub() asserts that sp_after_call_off and call_wrapper_off
// agree with frame::entry_frame_after_call_words and
// frame::entry_frame_call_wrapper_offset, so these values cannot be changed
// in isolation.
341 enum call_stub_layout {
// lowest slot of the save area; sp is set here after the prologue
342 sp_after_call_off = -28,
343
// saved FPCR shares the lowest slot (-27 has no named slot)
344 fpcr_off = sp_after_call_off,
// floating-point pairs: (v15,v14) (v13,v12) (v11,v10) (v9,v8)
345 d15_off = -26,
346 d13_off = -24,
347 d11_off = -22,
348 d9_off = -20,
349
// general-purpose pairs: (r28,r27) (r26,r25) (r24,r23) (r22,r21) (r20,r19)
350 r28_off = -18,
351 r26_off = -16,
352 r24_off = -14,
353 r22_off = -12,
354 r20_off = -10,
// incoming C arguments c_rarg0..c_rarg7 (parameters pointer at -3 is unnamed)
355 call_wrapper_off = -8,
356 result_off = -7,
357 result_type_off = -6,
358 method_off = -5,
359 entry_point_off = -4,
360 parameter_size_off = -2,
361 thread_off = -1,
// frame link: saved fp at rfp[0], return address at rfp[1]
362 fp_f = 0,
363 retaddr_off = 1,
364 };
365
// Emits the call stub, the code used to call Java from C (register arguments
// and frame layout are described in the comment block preceding the
// call_stub_layout enum).
//
// Parameters:
//   return_address - out-parameter; set to the pc of the instruction that
//                    immediately follows the `blr` into the Java entry point.
//
// Returns the address of the stub's first instruction. If load_archive_data()
// returns a non-null start, that address is returned, return_address is taken
// from the single extra entry it supplied, and no code is emitted.
366 address generate_call_stub(address& return_address) {
// The frame walker's constants must agree with the layout enum used here.
367 assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
368 (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
369 "adjust this code");
370
// Two entries in total: the stub start plus the return address, hence
// exactly one "extra" entry in `entries`.
371 StubId stub_id = StubId::stubgen_call_stub_id;
372 GrowableArray<address> entries;
373 int entry_count = StubInfo::entry_count(stub_id);
374 assert(entry_count == 2, "sanity check");
375 address start = load_archive_data(stub_id, &entries);
376 if (start != nullptr) {
377 assert(entries.length() == 1, "expected 1 extra entry");
378 return_address = entries.at(0);
379 return start;
380 }
381 StubCodeMark mark(this, stub_id);
382 start = __ pc();
383
// rfp-relative addresses of the save slots (see call_stub_layout).
// NOTE(review): sp_after_call (next line) and aarch64_entry (below) are not
// referenced again in this method.
384 const Address sp_after_call (rfp, sp_after_call_off * wordSize);
385
386 const Address fpcr_save (rfp, fpcr_off * wordSize);
387 const Address call_wrapper (rfp, call_wrapper_off * wordSize);
388 const Address result (rfp, result_off * wordSize);
389 const Address result_type (rfp, result_type_off * wordSize);
390 const Address method (rfp, method_off * wordSize);
391 const Address entry_point (rfp, entry_point_off * wordSize);
392 const Address parameter_size(rfp, parameter_size_off * wordSize);
393
394 const Address thread (rfp, thread_off * wordSize);
395
396 const Address d15_save (rfp, d15_off * wordSize);
397 const Address d13_save (rfp, d13_off * wordSize);
398 const Address d11_save (rfp, d11_off * wordSize);
399 const Address d9_save (rfp, d9_off * wordSize);
400
401 const Address r28_save (rfp, r28_off * wordSize);
402 const Address r26_save (rfp, r26_off * wordSize);
403 const Address r24_save (rfp, r24_off * wordSize);
404 const Address r22_save (rfp, r22_off * wordSize);
405 const Address r20_save (rfp, r20_off * wordSize);
406
407 // stub code
408
409 address aarch64_entry = __ pc();
410
411 // set up frame and move sp to end of save area
// (sp = rfp - 28 words, i.e. the sp_after_call slot)
412 __ enter();
413 __ sub(sp, rfp, -sp_after_call_off * wordSize);
414
415 // save register parameters and Java scratch/global registers
416 // n.b. we save thread even though it gets installed in
417 // rthread because we want to sanity check rthread later
// Only the low 32 bits of the parameter size (c_rarg6) are stored (strw).
// Each stp writes its first register to the named slot and the second
// register to the slot one word above.
418 __ str(c_rarg7, thread);
419 __ strw(c_rarg6, parameter_size);
420 __ stp(c_rarg4, c_rarg5, entry_point);
421 __ stp(c_rarg2, c_rarg3, result_type);
422 __ stp(c_rarg0, c_rarg1, call_wrapper);
423
424 __ stp(r20, r19, r20_save);
425 __ stp(r22, r21, r22_save);
426 __ stp(r24, r23, r24_save);
427 __ stp(r26, r25, r26_save);
428 __ stp(r28, r27, r28_save);
429
430 __ stpd(v9, v8, d9_save);
431 __ stpd(v11, v10, d11_save);
432 __ stpd(v13, v12, d13_save);
433 __ stpd(v15, v14, d15_save);
434
// Save the caller's FPCR before modifying it; it is restored on exit.
435 __ get_fpcr(rscratch1);
436 __ str(rscratch1, fpcr_save);
437 // Set FPCR to the state we need. We do want Round to Nearest. We
438 // don't want non-IEEE rounding modes or floating-point traps.
439 __ bfi(rscratch1, zr, 22, 4); // Clear DN, FZ, and Rmode
440 __ bfi(rscratch1, zr, 8, 5); // Clear exception-control bits (8-12)
441 __ set_fpcr(rscratch1);
442
443 // install Java thread in global register now we have saved
444 // whatever value it held
445 __ mov(rthread, c_rarg7);
446 // And method
447 __ mov(rmethod, c_rarg3);
448
449 // set up the heapbase register
450 __ reinit_heapbase();
451
452 #ifdef ASSERT
453 // make sure we have no pending exceptions
454 {
455 Label L;
456 __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
457 __ cmp(rscratch1, (u1)NULL_WORD);
458 __ br(Assembler::EQ, L);
459 __ stop("StubRoutines::call_stub: entered with pending exception");
460 __ BIND(L);
461 }
462 #endif
463 // pass parameters if any
// esp takes the current sp. sp is then lowered by the parameter count
// (c_rarg6, zero-extended and scaled by the word size) and rounded down to a
// multiple of 2 * wordSize.
464 __ mov(esp, sp);
465 __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
466 __ andr(sp, rscratch1, -2 * wordSize);
467
468 BLOCK_COMMENT("pass parameters if any");
469 Label parameters_done;
470 // parameter count is still in c_rarg6
471 // and parameter pointer identifying param 1 is in c_rarg5
472 __ cbzw(c_rarg6, parameters_done);
473
// Copy loop: load the next parameter word (post-incrementing c_rarg5),
// decrement the remaining count (subsw sets the flags), push the word, and
// repeat while the count is still greater than zero.
474 address loop = __ pc();
475 __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
476 __ subsw(c_rarg6, c_rarg6, 1);
477 __ push(rscratch1);
478 __ br(Assembler::GT, loop);
479
480 __ BIND(parameters_done);
481
482 // call Java entry -- passing Method*, and current sp
483 // rmethod: Method*
484 // r19_sender_sp: sender sp
485 BLOCK_COMMENT("call Java function");
486 __ mov(r19_sender_sp, sp);
487 __ blr(c_rarg4);
488
// NOTE(review): the next paragraph refers to a "notify" instruction, but no
// instruction is emitted between the blr above and the recorded return
// address -- the comment looks stale.
489 // we do this here because the notify will already have been done
490 // if we get to the next instruction via an exception
491 //
492 // n.b. adding this instruction here affects the calculation of
493 // whether or not a routine returns to the call stub (used when
494 // doing stack walks) since the normal test is to check the return
495 // pc against the address saved below. so we may need to allow for
496 // this extra instruction in the check.
497
498 // save current address for use by exception handling code
499
// Reported to the caller through the out-parameter and also appended to
// `entries` so it is stored together with the stub (see store_archive_data
// at the end).
500 return_address = __ pc();
501 entries.append(return_address);
502
503 // store result depending on type (everything that is not
504 // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
505 // n.b. this assumes Java returns an integral result in r0
506 // and a floating result in j_farg0
// j_rarg2 <- saved result address, j_rarg1 <- saved result type.
// T_OBJECT and T_LONG share the is_long handler (64-bit store of r0).
507 __ ldr(j_rarg2, result);
508 Label is_long, is_float, is_double, exit;
509 __ ldr(j_rarg1, result_type);
510 __ cmp(j_rarg1, (u1)T_OBJECT);
511 __ br(Assembler::EQ, is_long);
512 __ cmp(j_rarg1, (u1)T_LONG);
513 __ br(Assembler::EQ, is_long);
514 __ cmp(j_rarg1, (u1)T_FLOAT);
515 __ br(Assembler::EQ, is_float);
516 __ cmp(j_rarg1, (u1)T_DOUBLE);
517 __ br(Assembler::EQ, is_double);
518
519 // handle T_INT case
520 __ strw(r0, Address(j_rarg2));
521
// Common exit path; the out-of-line result handlers below branch back here.
522 __ BIND(exit);
523
524 // pop parameters
// (esp is reset to the bottom of the register save area)
525 __ sub(esp, rfp, -sp_after_call_off * wordSize);
526
527 #ifdef ASSERT
528 // verify that threads correspond
// rthread must equal both the thread saved on entry and get_thread()'s value
529 {
530 Label L, S;
531 __ ldr(rscratch1, thread);
532 __ cmp(rthread, rscratch1);
533 __ br(Assembler::NE, S);
534 __ get_thread(rscratch1);
535 __ cmp(rthread, rscratch1);
536 __ br(Assembler::EQ, L);
537 __ BIND(S);
538 __ stop("StubRoutines::call_stub: threads must correspond");
539 __ BIND(L);
540 }
541 #endif
542
543 __ pop_cont_fastpath(rthread);
544
545 // restore callee-save registers
546 __ ldpd(v15, v14, d15_save);
547 __ ldpd(v13, v12, d13_save);
548 __ ldpd(v11, v10, d11_save);
549 __ ldpd(v9, v8, d9_save);
550
551 __ ldp(r28, r27, r28_save);
552 __ ldp(r26, r25, r26_save);
553 __ ldp(r24, r23, r24_save);
554 __ ldp(r22, r21, r22_save);
555 __ ldp(r20, r19, r20_save);
556
557 // restore fpcr
558 __ ldr(rscratch1, fpcr_save);
559 __ set_fpcr(rscratch1);
560
// Reload the C argument registers from their save slots. The last ldp reads
// the parameter_size slot into c_rarg6 and the thread slot above it into
// c_rarg7.
561 __ ldp(c_rarg0, c_rarg1, call_wrapper);
562 __ ldrw(c_rarg2, result_type);
563 __ ldr(c_rarg3, method);
564 __ ldp(c_rarg4, c_rarg5, entry_point);
565 __ ldp(c_rarg6, c_rarg7, parameter_size);
566
567 // leave frame and return to caller
568 __ leave();
569 __ ret(lr);
570
571 // handle return types different from T_INT
// (emitted after the ret; each handler stores the result and rejoins `exit`)
572
573 __ BIND(is_long);
574 __ str(r0, Address(j_rarg2, 0));
575 __ br(Assembler::AL, exit);
576
577 __ BIND(is_float);
578 __ strs(j_farg0, Address(j_rarg2, 0));
579 __ br(Assembler::AL, exit);
580
581 __ BIND(is_double);
582 __ strd(j_farg0, Address(j_rarg2, 0));
583 __ br(Assembler::AL, exit);
584
585 // record the stub entry and end plus the auxiliary entry
586 store_archive_data(stub_id, start, __ pc(), &entries);
587
588 return start;
589 }
590
591 // Return point for a Java call if there's an exception thrown in
592 // Java code. The exception is caught and transformed into a
593 // pending exception stored in JavaThread that can be tested from
594 // within the VM.
595 //
596 // Note: Usually the parameters are removed by the callee. In case
597 // of an exception crossing an activation frame boundary, that is
598 // not the case if the callee is compiled code => need to setup the
599 // rsp.
600 //
601 // r0: exception oop
602
// Emits the catch_exception stub. On entry r0 holds the exception oop. The
// generated code stores it as the thread's pending exception, records this
// source file and line as the exception location, and branches to the call
// stub's return address.
//
// Requires StubRoutines::_call_stub_return_address to be set already
// (asserted below).
//
// Returns the stub's entry address; if load_archive_data() returns a non-null
// start, that address is returned and no code is emitted.
603 address generate_catch_exception() {
604 StubId stub_id = StubId::stubgen_catch_exception_id;
605 int entry_count = StubInfo::entry_count(stub_id);
606 assert(entry_count == 1, "sanity check");
607 address start = load_archive_data(stub_id);
608 if (start != nullptr) {
609 return start;
610 }
611 StubCodeMark mark(this, stub_id);
612 start = __ pc();
613
614 // same as in generate_call_stub():
// NOTE(review): sp_after_call is not referenced again in this method.
615 const Address sp_after_call(rfp, sp_after_call_off * wordSize);
616 const Address thread (rfp, thread_off * wordSize);
617
618 #ifdef ASSERT
619 // verify that threads correspond
// rthread must equal both the thread saved in the call stub frame and
// get_thread()'s value
620 {
621 Label L, S;
622 __ ldr(rscratch1, thread);
623 __ cmp(rthread, rscratch1);
624 __ br(Assembler::NE, S);
625 __ get_thread(rscratch1);
626 __ cmp(rthread, rscratch1);
627 __ br(Assembler::EQ, L);
628 __ bind(S);
629 __ stop("StubRoutines::catch_exception: threads must correspond");
630 __ bind(L);
631 }
632 #endif
633
634 // set pending exception
635 __ verify_oop(r0);
636
637 __ str(r0, Address(rthread, Thread::pending_exception_offset()));
638 // special case -- add file name string to AOT address table
// The file-name pointer and the line number stored below are those of this
// generator source (__FILE__ / __LINE__ at the point of expansion).
639 address file = (address)AOTCodeCache::add_C_string(__FILE__);
640 __ lea(rscratch1, ExternalAddress(file));
641 __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
642 __ movw(rscratch1, (int)__LINE__);
643 __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
644
645 // complete return to VM
// by jumping to the point in the call stub just after its call into Java
646 assert(StubRoutines::_call_stub_return_address != nullptr,
647 "_call_stub_return_address must have been generated before");
648 __ b(RuntimeAddress(StubRoutines::_call_stub_return_address));
649
650 // record the stub entry and end
651 store_archive_data(stub_id, start, __ pc());
652
653 return start;
654 }
655
656 // Continuation point for runtime calls returning with a pending
657 // exception. The pending exception check happened in the runtime
658 // or native call stub. The pending exception in Thread is
659 // converted into a Java-level exception.
660 //
661 // Contract with Java-level exception handlers:
662 // r0: exception
663 // r3: throwing pc
664 //
665 // NOTE: At entry of this stub, exception-pc must be in LR !!
666
667 // NOTE: this is always used as a jump target within generated code,
668 // so it just needs to be generated code with no prolog
669
// Emits the forward_exception stub. On entry lr holds the throwing pc and the
// thread has a pending exception. The generated code asks the VM for the
// handler that belongs to that return address, moves the pending exception
// into r0 (clearing it in the thread), puts the throwing pc in r3 and jumps
// to the handler.
//
// Register use in the generated code:
//   r19 - first holds the saved lr (throwing pc), later the handler address
//   r0  - handler address returned by the VM call, then the exception oop
//   r3  - throwing pc
//
// Returns the stub's entry address; if load_archive_data() returns a non-null
// start, that address is returned and no code is emitted.
670 address generate_forward_exception() {
671 StubId stub_id = StubId::stubgen_forward_exception_id;
672 int entry_count = StubInfo::entry_count(stub_id);
673 assert(entry_count == 1, "sanity check");
674 address start = load_archive_data(stub_id);
675 if (start != nullptr) {
676 return start;
677 }
678 StubCodeMark mark(this, stub_id);
679 start = __ pc();
680
681 // Upon entry, LR points to the return address returning into
682 // Java (interpreted or compiled) code; i.e., the return address
683 // becomes the throwing pc.
684 //
685 // Arguments pushed before the runtime call are still on the stack
686 // but the exception handler will reset the stack pointer ->
687 // ignore them. A potential result in registers can be ignored as
688 // well.
689
690 #ifdef ASSERT
691 // make sure this code is only executed if there is a pending exception
692 {
693 Label L;
694 __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
695 __ cbnz(rscratch1, L);
696 __ stop("StubRoutines::forward exception: no pending exception (1)");
697 __ bind(L);
698 }
699 #endif
700
701 // compute exception handler into r19
702
703 // call the VM to find the handler address associated with the
704 // caller address. pass thread in r0 and caller pc (ret address)
705 // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
706 // the stack.
707 __ mov(c_rarg1, lr);
708 // lr will be trashed by the VM call so we move it to R19
709 // (callee-saved) because we also need to pass it to the handler
710 // returned by this call.
711 __ mov(r19, lr);
712 BLOCK_COMMENT("call exception_handler_for_return_address");
713 __ call_VM_leaf(CAST_FROM_FN_PTR(address,
714 SharedRuntime::exception_handler_for_return_address),
715 rthread, c_rarg1);
716 // Reinitialize the ptrue predicate register, in case the external runtime
717 // call clobbers ptrue reg, as we may return to SVE compiled code.
718 __ reinitialize_ptrue();
719
720 // we should not really care that lr is no longer the callee
721 // address. we saved the value the handler needs in r19 so we can
722 // just copy it to r3. however, the C2 handler will push its own
723 // frame and then calls into the VM and the VM code asserts that
724 // the PC for the frame above the handler belongs to a compiled
725 // Java method. So, we restore lr here to satisfy that assert.
726 __ mov(lr, r19);
727 // setup r0 & r3 & clear pending exception
// Order matters: r3 takes the throwing pc out of r19 before r19 is
// overwritten with the handler address from r0, and only then is r0 reloaded
// with the exception oop.
728 __ mov(r3, r19);
729 __ mov(r19, r0);
730 __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
731 __ str(zr, Address(rthread, Thread::pending_exception_offset()));
732
733 #ifdef ASSERT
734 // make sure exception is set
735 {
736 Label L;
737 __ cbnz(r0, L);
738 __ stop("StubRoutines::forward exception: no pending exception (2)");
739 __ bind(L);
740 }
741 #endif
742
743 // continue at exception handler
744 // r0: exception
745 // r3: throwing pc
746 // r19: exception handler
747 __ verify_oop(r0);
748 __ br(r19);
749
750 // record the stub entry and end
751 store_archive_data(stub_id, start, __ pc());
752
753 return start;
754 }
755
756 // Non-destructive plausibility checks for oops
757 //
758 // Arguments:
759 // r0: oop to verify
760 // rscratch1: error message
761 //
762 // Stack after saving c_rarg3:
763 // [tos + 0]: saved c_rarg3
764 // [tos + 1]: saved c_rarg2
765 // [tos + 2]: saved lr
766 // [tos + 3]: saved rscratch2
767 // [tos + 4]: saved r0
768 // [tos + 5]: saved rscratch1
// Emits the verify_oop stub. On entry r0 holds the oop to check and rscratch1
// the address of the error message. The generated code increments the
// verify-oop counter, accepts a null oop immediately, and otherwise hands r0
// to the barrier set assembler's check_oop(). On success it returns through
// lr with c_rarg2/c_rarg3 restored; on failure it calls
// MacroAssembler::debug64 and then executes hlt.
//
// Returns the stub's entry address; if load_archive_data() returns a non-null
// start, that address is returned and no code is emitted.
769 address generate_verify_oop() {
770 StubId stub_id = StubId::stubgen_verify_oop_id;
771 int entry_count = StubInfo::entry_count(stub_id);
772 assert(entry_count == 1, "sanity check");
773 address start = load_archive_data(stub_id);
774 if (start != nullptr) {
775 return start;
776 }
777 StubCodeMark mark(this, stub_id);
778 start = __ pc();
779
780 Label exit, error;
781
782 // save c_rarg2 and c_rarg3
// (they serve as temporaries below; sp is pre-decremented by 16 bytes)
783 __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
784
// Bump the verify_oop counter with a plain load / add / store sequence.
785 // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
786 __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
787 __ ldr(c_rarg3, Address(c_rarg2));
788 __ add(c_rarg3, c_rarg3, 1);
789 __ str(c_rarg3, Address(c_rarg2));
790
791 // object is in r0
792 // make sure object is 'reasonable'
793 __ cbz(r0, exit); // if obj is null it is OK
794
// The actual check is delegated to the active barrier set's assembler;
// c_rarg2 and c_rarg3 are passed as temporaries and `error` as the label
// (presumably the branch target on failure -- confirm in check_oop()).
795 BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
796 bs_asm->check_oop(_masm, r0, c_rarg2, c_rarg3, error);
797
798 // return if everything seems ok
799 __ bind(exit);
800
801 __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
802 __ ret(lr);
803
804 // handle errors
// Restore the two temporaries first so the register dump below holds the
// caller's values.
805 __ bind(error);
806 __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
807
// Push r0..r29 on the stack; sp then serves as the regs[] argument.
808 __ push(RegSet::range(r0, r29), sp);
809 // debug(char* msg, int64_t pc, int64_t regs[])
810 __ mov(c_rarg0, rscratch1); // pass address of error message
811 __ mov(c_rarg1, lr); // pass return address
812 __ mov(c_rarg2, sp); // pass address of regs on stack
813 #ifndef PRODUCT
814 assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
815 #endif
816 BLOCK_COMMENT("call MacroAssembler::debug");
817 __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
818 __ blr(rscratch1);
// Reached only if the debug64 call returns.
819 __ hlt(0);
820
821 // record the stub entry and end
822 store_archive_data(stub_id, start, __ pc());
823
824 return start;
825 }
826
827 // Generate indices for iota vector.
828 void generate_iota_indices(StubId stub_id) {
829 GrowableArray<address> entries;
830 int entry_count = StubInfo::entry_count(stub_id);
831 assert(entry_count == VECTOR_IOTA_COUNT, "sanity check");
832 address start = load_archive_data(stub_id, &entries);
833 if (start != nullptr) {
834 assert(entries.length() == entry_count - 1,
835 "unexpected entries count %d", entries.length());
836 StubRoutines::aarch64::_vector_iota_indices[0] = start;
837 for (int i = 1; i < VECTOR_IOTA_COUNT; i++) {
838 StubRoutines::aarch64::_vector_iota_indices[i] = entries.at(i - 1);
839 }
840 return;
841 }
842 __ align(CodeEntryAlignment);
843 StubCodeMark mark(this, stub_id);
844 start = __ pc();
845 // B
846 __ emit_data64(0x0706050403020100, relocInfo::none);
847 __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none);
848 entries.append(__ pc());
849 // H
850 __ emit_data64(0x0003000200010000, relocInfo::none);
851 __ emit_data64(0x0007000600050004, relocInfo::none);
852 entries.append(__ pc());
853 // S
854 __ emit_data64(0x0000000100000000, relocInfo::none);
855 __ emit_data64(0x0000000300000002, relocInfo::none);
856 entries.append(__ pc());
857 // D
858 __ emit_data64(0x0000000000000000, relocInfo::none);
859 __ emit_data64(0x0000000000000001, relocInfo::none);
860 entries.append(__ pc());
861 // S - FP
862 __ emit_data64(0x3F80000000000000, relocInfo::none); // 0.0f, 1.0f
863 __ emit_data64(0x4040000040000000, relocInfo::none); // 2.0f, 3.0f
864 entries.append(__ pc());
865 // D - FP
866 __ emit_data64(0x0000000000000000, relocInfo::none); // 0.0d
867 __ emit_data64(0x3FF0000000000000, relocInfo::none); // 1.0d
868
869 // record the stub entry and end
870 store_archive_data(stub_id, start, __ pc(), &entries);
871
872 // install the entry addresses in the entry array
873 assert(entries.length() == entry_count - 1,
874 "unexpected entries count %d", entries.length());
875 StubRoutines::aarch64::_vector_iota_indices[0] = start;
876 for (int i = 1; i < VECTOR_IOTA_COUNT; i++) {
877 StubRoutines::aarch64::_vector_iota_indices[i] = entries.at(i - 1);
878 }
879 }
880
881 // The inner part of zero_words(). This is the bulk operation,
882 // zeroing words in blocks, possibly using DC ZVA to do it. The
883 // caller is responsible for zeroing the last few words.
884 //
885 // Inputs:
886 // r10: the HeapWord-aligned base address of an array to zero.
887 // r11: the count in HeapWords, r11 > 0.
888 //
889 // Returns r10 and r11, adjusted for the caller to clear.
890 // r10: the base address of the tail of words left to clear.
891 // r11: the number of words in the tail.
892 // r11 < MacroAssembler::zero_words_block_size.
893
894 address generate_zero_blocks() {
895 StubId stub_id = StubId::stubgen_zero_blocks_id;
896 int entry_count = StubInfo::entry_count(stub_id);
897 assert(entry_count == 1, "sanity check");
898 address start = load_archive_data(stub_id);
899 if (start != nullptr) {
900 return start;
901 }
902 __ align(CodeEntryAlignment);
903 StubCodeMark mark(this, stub_id);
904 Label done;
905 Label base_aligned;
906
907 Register base = r10, cnt = r11;
908
909 start = __ pc();
910
911 if (UseBlockZeroing) {
912 int zva_length = VM_Version::zva_length();
913
914 // Ensure ZVA length can be divided by 16. This is required by
915 // the subsequent operations.
916 assert (zva_length % 16 == 0, "Unexpected ZVA Length");
917
918 __ tbz(base, 3, base_aligned);
919 __ str(zr, Address(__ post(base, 8)));
920 __ sub(cnt, cnt, 1);
921 __ bind(base_aligned);
922
923 // Ensure count >= zva_length * 2 so that it still deserves a zva after
924 // alignment.
925 Label small;
926 int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
927 __ subs(rscratch1, cnt, low_limit >> 3);
928 __ br(Assembler::LT, small);
929 __ zero_dcache_blocks(base, cnt);
930 __ bind(small);
931 }
932
933 {
934 // Number of stp instructions we'll unroll
935 const int unroll =
936 MacroAssembler::zero_words_block_size / 2;
937 // Clear the remaining blocks.
938 Label loop;
939 __ subs(cnt, cnt, unroll * 2);
940 __ br(Assembler::LT, done);
941 __ bind(loop);
942 for (int i = 0; i < unroll; i++)
943 __ stp(zr, zr, __ post(base, 16));
944 __ subs(cnt, cnt, unroll * 2);
945 __ br(Assembler::GE, loop);
946 __ bind(done);
947 __ add(cnt, cnt, unroll * 2);
948 }
949
950 __ ret(lr);
951
952 // record the stub entry and end
953 store_archive_data(stub_id, start, __ pc());
954
955 return start;
956 }
957
958
// Direction of a copy. The numeric value is also used as a sign
// multiplier when computing word offsets (+1 forwards, -1 backwards).
enum copy_direction {
  copy_forwards  = 1,
  copy_backwards = -1
};
963
964 // Helper object to reduce noise when telling the GC barriers how to perform loads and stores
965 // for arraycopy stubs.
966 class ArrayCopyBarrierSetHelper : StackObj {
967 BarrierSetAssembler* _bs_asm;
968 MacroAssembler* _masm;
969 DecoratorSet _decorators;
970 BasicType _type;
971 Register _gct1;
972 Register _gct2;
973 Register _gct3;
974 FloatRegister _gcvt1;
975 FloatRegister _gcvt2;
976 FloatRegister _gcvt3;
977
978 public:
979 ArrayCopyBarrierSetHelper(MacroAssembler* masm,
980 DecoratorSet decorators,
981 BasicType type,
982 Register gct1,
983 Register gct2,
984 Register gct3,
985 FloatRegister gcvt1,
986 FloatRegister gcvt2,
987 FloatRegister gcvt3)
988 : _bs_asm(BarrierSet::barrier_set()->barrier_set_assembler()),
989 _masm(masm),
990 _decorators(decorators),
991 _type(type),
992 _gct1(gct1),
993 _gct2(gct2),
994 _gct3(gct3),
995 _gcvt1(gcvt1),
996 _gcvt2(gcvt2),
997 _gcvt3(gcvt3) {
998 }
999
1000 void copy_load_at_32(FloatRegister dst1, FloatRegister dst2, Address src) {
1001 _bs_asm->copy_load_at(_masm, _decorators, _type, 32,
1002 dst1, dst2, src,
1003 _gct1, _gct2, _gcvt1);
1004 }
1005
1006 void copy_store_at_32(Address dst, FloatRegister src1, FloatRegister src2) {
1007 _bs_asm->copy_store_at(_masm, _decorators, _type, 32,
1008 dst, src1, src2,
1009 _gct1, _gct2, _gct3, _gcvt1, _gcvt2, _gcvt3);
1010 }
1011
1012 void copy_load_at_16(Register dst1, Register dst2, Address src) {
1013 _bs_asm->copy_load_at(_masm, _decorators, _type, 16,
1014 dst1, dst2, src,
1015 _gct1);
1016 }
1017
1018 void copy_store_at_16(Address dst, Register src1, Register src2) {
1019 _bs_asm->copy_store_at(_masm, _decorators, _type, 16,
1020 dst, src1, src2,
1021 _gct1, _gct2, _gct3);
1022 }
1023
1024 void copy_load_at_8(Register dst, Address src) {
1025 _bs_asm->copy_load_at(_masm, _decorators, _type, 8,
1026 dst, noreg, src,
1027 _gct1);
1028 }
1029
1030 void copy_store_at_8(Address dst, Register src) {
1031 _bs_asm->copy_store_at(_masm, _decorators, _type, 8,
1032 dst, src, noreg,
1033 _gct1, _gct2, _gct3);
1034 }
1035 };
1036
1037 // Bulk copy of blocks of 8 words.
1038 //
1039 // count is a count of words.
1040 //
1041 // Precondition: count >= 8
1042 //
1043 // Postconditions:
1044 //
1045 // The least significant bit of count contains the remaining count
1046 // of words to copy. The rest of count is trash.
1047 //
1048 // s and d are adjusted to point to the remaining words to copy
1049 //
1050 address generate_copy_longs(StubId stub_id, DecoratorSet decorators, Register s, Register d, Register count) {
1051 int entry_count = StubInfo::entry_count(stub_id);
1052 assert(entry_count == 1, "sanity check");
1053 address start = load_archive_data(stub_id);
1054 if (start != nullptr) {
1055 return start;
1056 }
1057 BasicType type;
1058 copy_direction direction;
1059
1060 switch (stub_id) {
1061 case StubId::stubgen_copy_byte_f_id:
1062 direction = copy_forwards;
1063 type = T_BYTE;
1064 break;
1065 case StubId::stubgen_copy_byte_b_id:
1066 direction = copy_backwards;
1067 type = T_BYTE;
1068 break;
1069 case StubId::stubgen_copy_oop_f_id:
1070 direction = copy_forwards;
1071 type = T_OBJECT;
1072 break;
1073 case StubId::stubgen_copy_oop_b_id:
1074 direction = copy_backwards;
1075 type = T_OBJECT;
1076 break;
1077 case StubId::stubgen_copy_oop_uninit_f_id:
1078 direction = copy_forwards;
1079 type = T_OBJECT;
1080 break;
1081 case StubId::stubgen_copy_oop_uninit_b_id:
1082 direction = copy_backwards;
1083 type = T_OBJECT;
1084 break;
1085 default:
1086 ShouldNotReachHere();
1087 }
1088
1089 int unit = wordSize * direction;
1090 int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
1091
1092 const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
1093 t4 = r7, t5 = r11, t6 = r12, t7 = r13;
1094 const Register stride = r14;
1095 const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
1096 const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
1097 ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);
1098
1099 assert_different_registers(rscratch1, rscratch2, t0, t1, t2, t3, t4, t5, t6, t7);
1100 assert_different_registers(s, d, count, rscratch1, rscratch2);
1101
1102 Label again, drain;
1103
1104 __ align(CodeEntryAlignment);
1105
1106 StubCodeMark mark(this, stub_id);
1107
1108 start = __ pc();
1109
1110 Label unaligned_copy_long;
1111 if (AvoidUnalignedAccesses) {
1112 __ tbnz(d, 3, unaligned_copy_long);
1113 }
1114
1115 if (direction == copy_forwards) {
1116 __ sub(s, s, bias);
1117 __ sub(d, d, bias);
1118 }
1119
1120 #ifdef ASSERT
1121 // Make sure we are never given < 8 words
1122 {
1123 Label L;
1124 __ cmp(count, (u1)8);
1125 __ br(Assembler::GE, L);
1126 __ stop("genrate_copy_longs called with < 8 words");
1127 __ bind(L);
1128 }
1129 #endif
1130
1131 // Fill 8 registers
1132 if (UseSIMDForMemoryOps) {
1133 bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
1134 bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
1135 } else {
1136 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
1137 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
1138 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
1139 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
1140 }
1141
1142 __ subs(count, count, 16);
1143 __ br(Assembler::LO, drain);
1144
1145 int prefetch = PrefetchCopyIntervalInBytes;
1146 bool use_stride = false;
1147 if (direction == copy_backwards) {
1148 use_stride = prefetch > 256;
1149 prefetch = -prefetch;
1150 if (use_stride) __ mov(stride, prefetch);
1151 }
1152
1153 __ bind(again);
1154
1155 if (PrefetchCopyIntervalInBytes > 0)
1156 __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
1157
1158 if (UseSIMDForMemoryOps) {
1159 bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
1160 bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
1161 bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
1162 bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
1163 } else {
1164 bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
1165 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
1166 bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
1167 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
1168 bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
1169 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
1170 bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
1171 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
1172 }
1173
1174 __ subs(count, count, 8);
1175 __ br(Assembler::HS, again);
1176
1177 // Drain
1178 __ bind(drain);
1179 if (UseSIMDForMemoryOps) {
1180 bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
1181 bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
1182 } else {
1183 bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
1184 bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
1185 bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
1186 bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
1187 }
1188
1189 {
1190 Label L1, L2;
1191 __ tbz(count, exact_log2(4), L1);
1192 if (UseSIMDForMemoryOps) {
1193 bs.copy_load_at_32(v0, v1, Address(__ pre(s, 4 * unit)));
1194 bs.copy_store_at_32(Address(__ pre(d, 4 * unit)), v0, v1);
1195 } else {
1196 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
1197 bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
1198 bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
1199 bs.copy_store_at_16(Address(__ pre(d, 4 * unit)), t2, t3);
1200 }
1201 __ bind(L1);
1202
1203 if (direction == copy_forwards) {
1204 __ add(s, s, bias);
1205 __ add(d, d, bias);
1206 }
1207
1208 __ tbz(count, 1, L2);
1209 bs.copy_load_at_16(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
1210 bs.copy_store_at_16(Address(__ adjust(d, 2 * unit, direction == copy_backwards)), t0, t1);
1211 __ bind(L2);
1212 }
1213
1214 __ ret(lr);
1215
1216 if (AvoidUnalignedAccesses) {
1217 Label drain, again;
1218 // Register order for storing. Order is different for backward copy.
1219
1220 __ bind(unaligned_copy_long);
1221
1222 // source address is even aligned, target odd aligned
1223 //
1224 // when forward copying word pairs we read long pairs at offsets
1225 // {0, 2, 4, 6} (in long words). when backwards copying we read
1226 // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
1227 // address by -2 in the forwards case so we can compute the
1228 // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
1229 // or -1.
1230 //
1231 // when forward copying we need to store 1 word, 3 pairs and
1232 // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
1233 // zero offset We adjust the destination by -1 which means we
1234 // have to use offsets { 1, 2, 4, 6, 8} * unit for the stores.
1235 //
1236 // When backwards copyng we need to store 1 word, 3 pairs and
1237 // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
1238 // offsets {1, 3, 5, 7, 8} * unit.
1239
1240 if (direction == copy_forwards) {
1241 __ sub(s, s, 16);
1242 __ sub(d, d, 8);
1243 }
1244
1245 // Fill 8 registers
1246 //
1247 // for forwards copy s was offset by -16 from the original input
1248 // value of s so the register contents are at these offsets
1249 // relative to the 64 bit block addressed by that original input
1250 // and so on for each successive 64 byte block when s is updated
1251 //
1252 // t0 at offset 0, t1 at offset 8
1253 // t2 at offset 16, t3 at offset 24
1254 // t4 at offset 32, t5 at offset 40
1255 // t6 at offset 48, t7 at offset 56
1256
1257 // for backwards copy s was not offset so the register contents
1258 // are at these offsets into the preceding 64 byte block
1259 // relative to that original input and so on for each successive
1260 // preceding 64 byte block when s is updated. this explains the
1261 // slightly counter-intuitive looking pattern of register usage
1262 // in the stp instructions for backwards copy.
1263 //
1264 // t0 at offset -16, t1 at offset -8
1265 // t2 at offset -32, t3 at offset -24
1266 // t4 at offset -48, t5 at offset -40
1267 // t6 at offset -64, t7 at offset -56
1268
1269 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
1270 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
1271 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
1272 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
1273
1274 __ subs(count, count, 16);
1275 __ br(Assembler::LO, drain);
1276
1277 int prefetch = PrefetchCopyIntervalInBytes;
1278 bool use_stride = false;
1279 if (direction == copy_backwards) {
1280 use_stride = prefetch > 256;
1281 prefetch = -prefetch;
1282 if (use_stride) __ mov(stride, prefetch);
1283 }
1284
1285 __ bind(again);
1286
1287 if (PrefetchCopyIntervalInBytes > 0)
1288 __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
1289
1290 if (direction == copy_forwards) {
1291 // allowing for the offset of -8 the store instructions place
1292 // registers into the target 64 bit block at the following
1293 // offsets
1294 //
1295 // t0 at offset 0
1296 // t1 at offset 8, t2 at offset 16
1297 // t3 at offset 24, t4 at offset 32
1298 // t5 at offset 40, t6 at offset 48
1299 // t7 at offset 56
1300
1301 bs.copy_store_at_8(Address(d, 1 * unit), t0);
1302 bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
1303 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
1304 bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
1305 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
1306 bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
1307 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
1308 bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
1309 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
1310 } else {
1311 // d was not offset when we started so the registers are
1312 // written into the 64 bit block preceding d with the following
1313 // offsets
1314 //
1315 // t1 at offset -8
1316 // t3 at offset -24, t0 at offset -16
1317 // t5 at offset -48, t2 at offset -32
1318 // t7 at offset -56, t4 at offset -48
1319 // t6 at offset -64
1320 //
1321 // note that this matches the offsets previously noted for the
1322 // loads
1323
1324 bs.copy_store_at_8(Address(d, 1 * unit), t1);
1325 bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
1326 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
1327 bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
1328 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
1329 bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
1330 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
1331 bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
1332 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
1333 }
1334
1335 __ subs(count, count, 8);
1336 __ br(Assembler::HS, again);
1337
1338 // Drain
1339 //
1340 // this uses the same pattern of offsets and register arguments
1341 // as above
1342 __ bind(drain);
1343 if (direction == copy_forwards) {
1344 bs.copy_store_at_8(Address(d, 1 * unit), t0);
1345 bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
1346 bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
1347 bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
1348 bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
1349 } else {
1350 bs.copy_store_at_8(Address(d, 1 * unit), t1);
1351 bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
1352 bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
1353 bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
1354 bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
1355 }
1356 // now we need to copy any remaining part block which may
1357 // include a 4 word block subblock and/or a 2 word subblock.
1358 // bits 2 and 1 in the count are the tell-tale for whether we
1359 // have each such subblock
1360 {
1361 Label L1, L2;
1362 __ tbz(count, exact_log2(4), L1);
1363 // this is the same as above but copying only 4 longs hence
1364 // with only one intervening stp between the str instructions
1365 // but note that the offsets and registers still follow the
1366 // same pattern
1367 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
1368 bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
1369 if (direction == copy_forwards) {
1370 bs.copy_store_at_8(Address(d, 1 * unit), t0);
1371 bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
1372 bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t3);
1373 } else {
1374 bs.copy_store_at_8(Address(d, 1 * unit), t1);
1375 bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
1376 bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t2);
1377 }
1378 __ bind(L1);
1379
1380 __ tbz(count, 1, L2);
1381 // this is the same as above but copying only 2 longs hence
1382 // there is no intervening stp between the str instructions
1383 // but note that the offset and register patterns are still
1384 // the same
1385 bs.copy_load_at_16(t0, t1, Address(__ pre(s, 2 * unit)));
1386 if (direction == copy_forwards) {
1387 bs.copy_store_at_8(Address(d, 1 * unit), t0);
1388 bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t1);
1389 } else {
1390 bs.copy_store_at_8(Address(d, 1 * unit), t1);
1391 bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t0);
1392 }
1393 __ bind(L2);
1394
1395 // for forwards copy we need to re-adjust the offsets we
1396 // applied so that s and d are follow the last words written
1397
1398 if (direction == copy_forwards) {
1399 __ add(s, s, 16);
1400 __ add(d, d, 8);
1401 }
1402
1403 }
1404
1405 __ ret(lr);
1406 }
1407
1408 // record the stub entry and end
1409 store_archive_data(stub_id, start, __ pc());
1410
1411 return start;
1412 }
1413
1414 // Small copy: less than 16 bytes.
1415 //
1416 // NB: Ignores all of the bits of count which represent more than 15
1417 // bytes, so a caller doesn't have to mask them.
1418
1419 void copy_memory_small(DecoratorSet decorators, BasicType type, Register s, Register d, Register count, int step) {
1420 bool is_backwards = step < 0;
1421 size_t granularity = g_uabs(step);
1422 int direction = is_backwards ? -1 : 1;
1423
1424 Label Lword, Lint, Lshort, Lbyte;
1425
1426 assert(granularity
1427 && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
1428
1429 const Register t0 = r3;
1430 const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
1431 ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, fnoreg, fnoreg, fnoreg);
1432
1433 // ??? I don't know if this bit-test-and-branch is the right thing
1434 // to do. It does a lot of jumping, resulting in several
1435 // mispredicted branches. It might make more sense to do this
1436 // with something like Duff's device with a single computed branch.
1437
1438 __ tbz(count, 3 - exact_log2(granularity), Lword);
1439 bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
1440 bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
1441 __ bind(Lword);
1442
1443 if (granularity <= sizeof (jint)) {
1444 __ tbz(count, 2 - exact_log2(granularity), Lint);
1445 __ ldrw(t0, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
1446 __ strw(t0, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
1447 __ bind(Lint);
1448 }
1449
1450 if (granularity <= sizeof (jshort)) {
1451 __ tbz(count, 1 - exact_log2(granularity), Lshort);
1452 __ ldrh(t0, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
1453 __ strh(t0, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
1454 __ bind(Lshort);
1455 }
1456
1457 if (granularity <= sizeof (jbyte)) {
1458 __ tbz(count, 0, Lbyte);
1459 __ ldrb(t0, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
1460 __ strb(t0, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
1461 __ bind(Lbyte);
1462 }
1463 }
1464
1465 // All-singing all-dancing memory copy.
1466 //
1467 // Copy count units of memory from s to d. The size of a unit is
1468 // step, which can be positive or negative depending on the direction
1469 // of copy. If is_aligned is false, we align the source address.
1470 //
1471
1472 void copy_memory(DecoratorSet decorators, BasicType type, bool is_aligned,
1473 Register s, Register d, Register count, int step) {
1474 copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
1475 bool is_backwards = step < 0;
1476 unsigned int granularity = g_uabs(step);
1477 const Register t0 = r3, t1 = r4;
1478
1479 // <= 80 (or 96 for SIMD) bytes do inline. Direction doesn't matter because we always
1480 // load all the data before writing anything
1481 Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
1482 const Register t2 = r5, t3 = r6, t4 = r7, t5 = r11;
1483 const Register t6 = r12, t7 = r13, t8 = r14, t9 = r15;
1484 const Register send = r17, dend = r16;
1485 const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
1486 const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
1487 ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);
1488
1489 if (PrefetchCopyIntervalInBytes > 0)
1490 __ prfm(Address(s, 0), PLDL1KEEP);
1491 __ cmp(count, u1((UseSIMDForMemoryOps ? 96:80)/granularity));
1492 __ br(Assembler::HI, copy_big);
1493
1494 __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
1495 __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
1496
1497 __ cmp(count, u1(16/granularity));
1498 __ br(Assembler::LS, copy16);
1499
1500 __ cmp(count, u1(64/granularity));
1501 __ br(Assembler::HI, copy80);
1502
1503 __ cmp(count, u1(32/granularity));
1504 __ br(Assembler::LS, copy32);
1505
1506 // 33..64 bytes
1507 if (UseSIMDForMemoryOps) {
1508 bs.copy_load_at_32(v0, v1, Address(s, 0));
1509 bs.copy_load_at_32(v2, v3, Address(send, -32));
1510 bs.copy_store_at_32(Address(d, 0), v0, v1);
1511 bs.copy_store_at_32(Address(dend, -32), v2, v3);
1512 } else {
1513 bs.copy_load_at_16(t0, t1, Address(s, 0));
1514 bs.copy_load_at_16(t2, t3, Address(s, 16));
1515 bs.copy_load_at_16(t4, t5, Address(send, -32));
1516 bs.copy_load_at_16(t6, t7, Address(send, -16));
1517
1518 bs.copy_store_at_16(Address(d, 0), t0, t1);
1519 bs.copy_store_at_16(Address(d, 16), t2, t3);
1520 bs.copy_store_at_16(Address(dend, -32), t4, t5);
1521 bs.copy_store_at_16(Address(dend, -16), t6, t7);
1522 }
1523 __ b(finish);
1524
1525 // 17..32 bytes
1526 __ bind(copy32);
1527 bs.copy_load_at_16(t0, t1, Address(s, 0));
1528 bs.copy_load_at_16(t6, t7, Address(send, -16));
1529
1530 bs.copy_store_at_16(Address(d, 0), t0, t1);
1531 bs.copy_store_at_16(Address(dend, -16), t6, t7);
1532 __ b(finish);
1533
1534 // 65..80/96 bytes
1535 // (96 bytes if SIMD because we do 32 byes per instruction)
1536 __ bind(copy80);
1537 if (UseSIMDForMemoryOps) {
1538 bs.copy_load_at_32(v0, v1, Address(s, 0));
1539 bs.copy_load_at_32(v2, v3, Address(s, 32));
1540 // Unaligned pointers can be an issue for copying.
1541 // The issue has more chances to happen when granularity of data is
1542 // less than 4(sizeof(jint)). Pointers for arrays of jint are at least
1543 // 4 byte aligned. Pointers for arrays of jlong are 8 byte aligned.
1544 // The most performance drop has been seen for the range 65-80 bytes.
1545 // For such cases using the pair of ldp/stp instead of the third pair of
1546 // ldpq/stpq fixes the performance issue.
1547 if (granularity < sizeof (jint)) {
1548 Label copy96;
1549 __ cmp(count, u1(80/granularity));
1550 __ br(Assembler::HI, copy96);
1551 bs.copy_load_at_16(t0, t1, Address(send, -16));
1552
1553 bs.copy_store_at_32(Address(d, 0), v0, v1);
1554 bs.copy_store_at_32(Address(d, 32), v2, v3);
1555
1556 bs.copy_store_at_16(Address(dend, -16), t0, t1);
1557 __ b(finish);
1558
1559 __ bind(copy96);
1560 }
1561 bs.copy_load_at_32(v4, v5, Address(send, -32));
1562
1563 bs.copy_store_at_32(Address(d, 0), v0, v1);
1564 bs.copy_store_at_32(Address(d, 32), v2, v3);
1565
1566 bs.copy_store_at_32(Address(dend, -32), v4, v5);
1567 } else {
1568 bs.copy_load_at_16(t0, t1, Address(s, 0));
1569 bs.copy_load_at_16(t2, t3, Address(s, 16));
1570 bs.copy_load_at_16(t4, t5, Address(s, 32));
1571 bs.copy_load_at_16(t6, t7, Address(s, 48));
1572 bs.copy_load_at_16(t8, t9, Address(send, -16));
1573
1574 bs.copy_store_at_16(Address(d, 0), t0, t1);
1575 bs.copy_store_at_16(Address(d, 16), t2, t3);
1576 bs.copy_store_at_16(Address(d, 32), t4, t5);
1577 bs.copy_store_at_16(Address(d, 48), t6, t7);
1578 bs.copy_store_at_16(Address(dend, -16), t8, t9);
1579 }
1580 __ b(finish);
1581
1582 // 0..16 bytes
1583 __ bind(copy16);
1584 __ cmp(count, u1(8/granularity));
1585 __ br(Assembler::LO, copy8);
1586
1587 // 8..16 bytes
1588 bs.copy_load_at_8(t0, Address(s, 0));
1589 bs.copy_load_at_8(t1, Address(send, -8));
1590 bs.copy_store_at_8(Address(d, 0), t0);
1591 bs.copy_store_at_8(Address(dend, -8), t1);
1592 __ b(finish);
1593
1594 if (granularity < 8) {
1595 // 4..7 bytes
1596 __ bind(copy8);
1597 __ tbz(count, 2 - exact_log2(granularity), copy4);
1598 __ ldrw(t0, Address(s, 0));
1599 __ ldrw(t1, Address(send, -4));
1600 __ strw(t0, Address(d, 0));
1601 __ strw(t1, Address(dend, -4));
1602 __ b(finish);
1603 if (granularity < 4) {
1604 // 0..3 bytes
1605 __ bind(copy4);
1606 __ cbz(count, finish); // get rid of 0 case
1607 if (granularity == 2) {
1608 __ ldrh(t0, Address(s, 0));
1609 __ strh(t0, Address(d, 0));
1610 } else { // granularity == 1
1611 // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
1612 // the first and last byte.
1613 // Handle the 3 byte case by loading and storing base + count/2
1614 // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
1615 // This does means in the 1 byte case we load/store the same
1616 // byte 3 times.
1617 __ lsr(count, count, 1);
1618 __ ldrb(t0, Address(s, 0));
1619 __ ldrb(t1, Address(send, -1));
1620 __ ldrb(t2, Address(s, count));
1621 __ strb(t0, Address(d, 0));
1622 __ strb(t1, Address(dend, -1));
1623 __ strb(t2, Address(d, count));
1624 }
1625 __ b(finish);
1626 }
1627 }
1628
1629 __ bind(copy_big);
1630 if (is_backwards) {
1631 __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
1632 __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
1633 }
1634
1635 // Now we've got the small case out of the way we can align the
1636 // source address on a 2-word boundary.
1637
1638 // Here we will materialize a count in r15, which is used by copy_memory_small
1639 // and the various generate_copy_longs stubs that we use for 2 word aligned bytes.
1640 // Up until here, we have used t9, which aliases r15, but from here on, that register
1641 // can not be used as a temp register, as it contains the count.
1642
1643 Label aligned;
1644
1645 if (is_aligned) {
1646 // We may have to adjust by 1 word to get s 2-word-aligned.
1647 __ tbz(s, exact_log2(wordSize), aligned);
1648 bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
1649 bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
1650 __ sub(count, count, wordSize/granularity);
1651 } else {
1652 if (is_backwards) {
1653 __ andr(r15, s, 2 * wordSize - 1);
1654 } else {
1655 __ neg(r15, s);
1656 __ andr(r15, r15, 2 * wordSize - 1);
1657 }
1658 // r15 is the byte adjustment needed to align s.
1659 __ cbz(r15, aligned);
1660 int shift = exact_log2(granularity);
1661 if (shift > 0) {
1662 __ lsr(r15, r15, shift);
1663 }
1664 __ sub(count, count, r15);
1665
1666 #if 0
1667 // ?? This code is only correct for a disjoint copy. It may or
1668 // may not make sense to use it in that case.
1669
1670 // Copy the first pair; s and d may not be aligned.
1671 __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
1672 __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
1673
1674 // Align s and d, adjust count
1675 if (is_backwards) {
1676 __ sub(s, s, r15);
1677 __ sub(d, d, r15);
1678 } else {
1679 __ add(s, s, r15);
1680 __ add(d, d, r15);
1681 }
1682 #else
1683 copy_memory_small(decorators, type, s, d, r15, step);
1684 #endif
1685 }
1686
1687 __ bind(aligned);
1688
1689 // s is now 2-word-aligned.
1690
1691 // We have a count of units and some trailing bytes. Adjust the
1692 // count and do a bulk copy of words. If the shift is zero
1693 // perform a move instead to benefit from zero latency moves.
1694 int shift = exact_log2(wordSize/granularity);
1695 if (shift > 0) {
1696 __ lsr(r15, count, shift);
1697 } else {
1698 __ mov(r15, count);
1699 }
1700 if (direction == copy_forwards) {
1701 if (type != T_OBJECT) {
1702 __ lea(rscratch1, RuntimeAddress(StubRoutines::aarch64::copy_byte_f()));
1703 __ blr(rscratch1);
1704 } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
1705 __ lea(rscratch1, RuntimeAddress(StubRoutines::aarch64::copy_oop_uninit_f()));
1706 __ blr(rscratch1);
1707 } else {
1708 __ lea(rscratch1, RuntimeAddress(StubRoutines::aarch64::copy_oop_f()));
1709 __ blr(rscratch1);
1710 }
1711 } else {
1712 if (type != T_OBJECT) {
1713 __ lea(rscratch1, RuntimeAddress(StubRoutines::aarch64::copy_byte_b()));
1714 __ blr(rscratch1);
1715 } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
1716 __ lea(rscratch1, RuntimeAddress(StubRoutines::aarch64::copy_oop_uninit_b()));
1717 __ blr(rscratch1);
1718 } else {
1719 __ lea(rscratch1, RuntimeAddress(StubRoutines::aarch64::copy_oop_b()));
1720 __ blr(rscratch1);
1721 }
1722 }
1723
1724 // And the tail.
1725 copy_memory_small(decorators, type, s, d, count, step);
1726
1727 if (granularity >= 8) __ bind(copy8);
1728 if (granularity >= 4) __ bind(copy4);
1729 __ bind(finish);
1730 }
1731
1732
// Debug-build helper: emits code that overwrites every call-clobbered
// general purpose register (other than rscratch1, which carries the
// pattern) with 0xdeadbeefdeadbeef. Emits nothing when ASSERT is not
// defined.
void clobber_registers() {
#ifdef ASSERT
  const RegSet poisoned = MacroAssembler::call_clobbered_gp_registers() - rscratch1;

  // Build the 64-bit poison value: low word first, then replicate it
  // into the high word.
  __ mov(rscratch1, (uint64_t)0xdeadbeef);
  __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);

  RegSetIterator<Register> cursor = poisoned.begin();
  while (*cursor != noreg) {
    __ mov(*cursor, rscratch1);
    ++cursor;
  }
#endif

}
1745
// Scan over array at a for count oops, verifying each one.
// Preserves a and count, clobbers temp, rscratch1 and rscratch2.
1748 void verify_oop_array (int size, Register a, Register count, Register temp) {
1749 Label loop, end;
1750 __ mov(rscratch1, a);
1751 __ mov(rscratch2, zr);
1752 __ bind(loop);
1753 __ cmp(rscratch2, count);
1754 __ br(Assembler::HS, end);
1755 if (size == wordSize) {
1756 __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1757 __ verify_oop(temp);
1758 } else {
1759 __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1760 __ decode_heap_oop(temp); // calls verify_oop
1761 }
1762 __ add(rscratch2, rscratch2, 1);
1763 __ b(loop);
1764 __ bind(end);
1765 }
1766
1767 // Arguments:
1768 // stub_id - is used to name the stub and identify all details of
1769 // how to perform the copy.
1770 //
// nopush_entry - is assigned to the stub's post push entry point;
// must not be null (asserted below)
1773 //
1774 // Inputs:
1775 // c_rarg0 - source array address
1776 // c_rarg1 - destination array address
1777 // c_rarg2 - element count, treated as ssize_t, can be zero
1778 //
1779 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1780 // the hardware handle it. The two dwords within qwords that span
1781 // cache line boundaries will still be loaded and stored atomically.
1782 //
1783 // Side Effects: nopush_entry is set to the (post push) entry point
1784 // so it can be used by the corresponding conjoint
1785 // copy method
1786 //
address generate_disjoint_copy(StubId stub_id, address *nopush_entry) {
  // Copy parameters; each valid stub_id assigns all four in the
  // switch below.
  int size;                 // element size in bytes
  bool aligned;             // adds ARRAYCOPY_ALIGNED and is forwarded to copy_memory
  bool is_oop;              // true for the oop array stubs
  bool dest_uninitialized;  // adds IS_DEST_UNINITIALIZED to the decorators
  switch (stub_id) {
  case StubId::stubgen_jbyte_disjoint_arraycopy_id:
    size = sizeof(jbyte);
    aligned = false;
    is_oop = false;
    dest_uninitialized = false;
    break;
  case StubId::stubgen_arrayof_jbyte_disjoint_arraycopy_id:
    size = sizeof(jbyte);
    aligned = true;
    is_oop = false;
    dest_uninitialized = false;
    break;
  case StubId::stubgen_jshort_disjoint_arraycopy_id:
    size = sizeof(jshort);
    aligned = false;
    is_oop = false;
    dest_uninitialized = false;
    break;
  case StubId::stubgen_arrayof_jshort_disjoint_arraycopy_id:
    size = sizeof(jshort);
    aligned = true;
    is_oop = false;
    dest_uninitialized = false;
    break;
  case StubId::stubgen_jint_disjoint_arraycopy_id:
    size = sizeof(jint);
    aligned = false;
    is_oop = false;
    dest_uninitialized = false;
    break;
  case StubId::stubgen_arrayof_jint_disjoint_arraycopy_id:
    size = sizeof(jint);
    aligned = true;
    is_oop = false;
    dest_uninitialized = false;
    break;
  case StubId::stubgen_jlong_disjoint_arraycopy_id:
    // since this is always aligned we can (should!) use the same
    // stub as for case StubId::stubgen_arrayof_jlong_disjoint_arraycopy
    ShouldNotReachHere();
    break;
  case StubId::stubgen_arrayof_jlong_disjoint_arraycopy_id:
    size = sizeof(jlong);
    aligned = true;
    is_oop = false;
    dest_uninitialized = false;
    break;
  // The four oop variants below differ only in dest_uninitialized;
  // element size and alignment both follow UseCompressedOops.
  case StubId::stubgen_oop_disjoint_arraycopy_id:
    size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
    aligned = !UseCompressedOops;
    is_oop = true;
    dest_uninitialized = false;
    break;
  case StubId::stubgen_arrayof_oop_disjoint_arraycopy_id:
    size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
    aligned = !UseCompressedOops;
    is_oop = true;
    dest_uninitialized = false;
    break;
  case StubId::stubgen_oop_disjoint_arraycopy_uninit_id:
    size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
    aligned = !UseCompressedOops;
    is_oop = true;
    dest_uninitialized = true;
    break;
  case StubId::stubgen_arrayof_oop_disjoint_arraycopy_uninit_id:
    size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
    aligned = !UseCompressedOops;
    is_oop = true;
    dest_uninitialized = true;
    break;
  default:
    ShouldNotReachHere();
    break;
  }
  // all stubs provide a 2nd entry which omits the frame push for
  // use when bailing out from a conjoint copy. However we may also
  // need some extra addresses for memory access protection.
  int entry_count = StubInfo::entry_count(stub_id);
  assert(entry_count == 2, "sanity check");
  assert(nopush_entry != nullptr, "all disjoint copy stubs export a nopush entry");

  // Handler addresses are only tracked for non-oop copies that are
  // either unaligned or jlong sized; the same condition enables the
  // UnsafeMemoryAccessMark around copy_memory below.
  bool add_extras = !is_oop && (!aligned || sizeof(jlong) == size);
  int extra_count = ((add_extras ? 1 : 0) * UnsafeMemoryAccess::COLUMN_COUNT);
  GrowableArray<address> entries;   // entry points after the first, i.e. the nopush entry
  GrowableArray<address> extras;    // unsafe access handler addresses, if any
  GrowableArray<address> *extras_ptr = (extra_count > 0 ? &extras : nullptr);
  // A non-null result means no code is emitted here: the nopush entry
  // is taken from 'entries', the handler (if any) is registered again
  // from 'extras', and the returned start address is handed back.
  address start = load_archive_data(stub_id, &entries, extras_ptr);
  if (start != nullptr) {
    assert(entries.length() == entry_count - 1,
           "unexpected entries count %d", entries.length());
    *nopush_entry = entries.at(0);
    assert(extras.length() == extra_count,
           "unexpected extra count %d", extras.length());
    if (add_extras) {
      // register one handler at offset 0
      register_unsafe_access_handlers(extras, 0, 1);
    }
    return start;
  }

  // Otherwise generate the stub from scratch.
  Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
  RegSet saved_reg = RegSet::of(s, d, count);

  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, stub_id);
  start = __ pc();
  __ enter();

  // The nopush entry is the pc immediately after enter(), so code
  // jumping here bypasses the frame push.
  *nopush_entry = __ pc();
  entries.append(*nopush_entry);

  // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
  BLOCK_COMMENT("Post-Push Entry:");

  DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
  if (dest_uninitialized) {
    decorators |= IS_DEST_UNINITIALIZED;
  }
  if (aligned) {
    decorators |= ARRAYCOPY_ALIGNED;
  }

  BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
  bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);

  if (is_oop) {
    // save regs before copy_memory; d and count are used again by
    // verify_oop_array and arraycopy_epilogue after the copy
    __ push(RegSet::of(d, count), sp);
  }
  {
    // UnsafeMemoryAccess page error: continue after unsafe access
    UnsafeMemoryAccessMark umam(this, add_extras, true);
    // Oop copies are requested as T_OBJECT, everything else as T_BYTE;
    // the positive size is passed as the last argument (the conjoint
    // generator passes -size instead).
    copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, size);
  }

  if (is_oop) {
    __ pop(RegSet::of(d, count), sp);
    if (VerifyOops)
      verify_oop_array(size, d, count, r16);
  }

  bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1);

  __ leave();
  __ mov(r0, zr); // return 0
  __ ret(lr);

  address end = __ pc();

  if (add_extras) {
    // retrieve the registered handler addresses
    retrieve_unsafe_access_handlers(start, end, extras);
    assert(extras.length() == extra_count
           , "incorrect handlers count %d", extras.length());
  }

  // record the stub entry and end plus the no_push entry and any
  // extra handler addresses
  store_archive_data(stub_id, start, end, &entries, extras_ptr);

  return start;
}
1956
1957 // Arguments:
1958 // stub_id - is used to name the stub and identify all details of
1959 // how to perform the copy.
1960 //
// nooverlap_target - identifies the (post push) entry for the
1962 // corresponding disjoint copy routine which can be
1963 // jumped to if the ranges do not actually overlap
1964 //
1965 // nopush_entry - is assigned to the stub's post push entry point
1966 // unless it is null
1967 //
1968 //
1969 // Inputs:
1970 // c_rarg0 - source array address
1971 // c_rarg1 - destination array address
1972 // c_rarg2 - element count, treated as ssize_t, can be zero
1973 //
1974 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1975 // the hardware handle it. The two dwords within qwords that span
1976 // cache line boundaries will still be loaded and stored atomically.
1977 //
1978 // Side Effects:
1979 // nopush_entry is set to the no-overlap entry point so it can be
1980 // used by some other conjoint copy method
1981 //
address generate_conjoint_copy(StubId stub_id, address nooverlap_target, address *nopush_entry) {
  // Copy parameters; each valid stub_id assigns all four in the
  // switch below.
  int size;                 // element size in bytes
  bool aligned;             // adds ARRAYCOPY_ALIGNED and is forwarded to copy_memory
  bool is_oop;              // true for the oop array stubs
  bool dest_uninitialized;  // adds IS_DEST_UNINITIALIZED to the decorators
  switch (stub_id) {
  case StubId::stubgen_jbyte_arraycopy_id:
    size = sizeof(jbyte);
    aligned = false;
    is_oop = false;
    dest_uninitialized = false;
    break;
  case StubId::stubgen_arrayof_jbyte_arraycopy_id:
    size = sizeof(jbyte);
    aligned = true;
    is_oop = false;
    dest_uninitialized = false;
    break;
  case StubId::stubgen_jshort_arraycopy_id:
    size = sizeof(jshort);
    aligned = false;
    is_oop = false;
    dest_uninitialized = false;
    break;
  case StubId::stubgen_arrayof_jshort_arraycopy_id:
    size = sizeof(jshort);
    aligned = true;
    is_oop = false;
    dest_uninitialized = false;
    break;
  case StubId::stubgen_jint_arraycopy_id:
    size = sizeof(jint);
    aligned = false;
    is_oop = false;
    dest_uninitialized = false;
    break;
  case StubId::stubgen_arrayof_jint_arraycopy_id:
    size = sizeof(jint);
    aligned = true;
    is_oop = false;
    dest_uninitialized = false;
    break;
  case StubId::stubgen_jlong_arraycopy_id:
    // since this is always aligned we can (should!) use the same
    // stub as for case StubId::stubgen_arrayof_jlong_disjoint_arraycopy
    // NOTE(review): the stub named above is the disjoint one; the
    // conjoint counterpart handled by this switch is
    // stubgen_arrayof_jlong_arraycopy_id -- looks like a copy/paste
    // slip in the comment, please confirm.
    ShouldNotReachHere();
    break;
  case StubId::stubgen_arrayof_jlong_arraycopy_id:
    size = sizeof(jlong);
    aligned = true;
    is_oop = false;
    dest_uninitialized = false;
    break;
  // The four oop variants below differ only in dest_uninitialized;
  // element size and alignment both follow UseCompressedOops.
  case StubId::stubgen_oop_arraycopy_id:
    size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
    aligned = !UseCompressedOops;
    is_oop = true;
    dest_uninitialized = false;
    break;
  case StubId::stubgen_arrayof_oop_arraycopy_id:
    size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
    aligned = !UseCompressedOops;
    is_oop = true;
    dest_uninitialized = false;
    break;
  case StubId::stubgen_oop_arraycopy_uninit_id:
    size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
    aligned = !UseCompressedOops;
    is_oop = true;
    dest_uninitialized = true;
    break;
  case StubId::stubgen_arrayof_oop_arraycopy_uninit_id:
    size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
    aligned = !UseCompressedOops;
    is_oop = true;
    dest_uninitialized = true;
    break;
  default:
    ShouldNotReachHere();
  }
  // only some conjoint stubs generate a 2nd entry
  // (the caller signals that it wants one by passing a non-null
  // nopush_entry, which must agree with the declared entry count)
  int entry_count = StubInfo::entry_count(stub_id);
  int expected_entry_count = (nopush_entry == nullptr ? 1 : 2);
  assert(entry_count == expected_entry_count,
         "expected entry count %d does not match declared entry count %d for stub %s",
         expected_entry_count, entry_count, StubInfo::name(stub_id));

  // We need to protect memory accesses in certain cases
  // (non-oop copies that are either unaligned or jlong sized)
  bool add_extras = !is_oop && (!aligned || sizeof(jlong) == size);
  int extra_count = ((add_extras ? 1 : 0) * UnsafeMemoryAccess::COLUMN_COUNT);
  GrowableArray<address> entries;   // holds the nopush entry when one is requested
  GrowableArray<address> extras;    // unsafe access handler addresses, if any
  GrowableArray<address> *entries_ptr = (nopush_entry != nullptr ? &entries : nullptr);
  GrowableArray<address> *extras_ptr = (extra_count > 0 ? &extras : nullptr);
  // A non-null result means no code is emitted here: the nopush entry
  // (if requested) is taken from 'entries', the handler (if any) is
  // registered again from 'extras', and the start address is returned.
  address start = load_archive_data(stub_id, entries_ptr, extras_ptr);
  if (start != nullptr) {
    assert(entries.length() == expected_entry_count - 1,
           "unexpected entries count %d", entries.length());
    assert(extras.length() == extra_count,
           "unexpected extra count %d", extras.length());
    if (nopush_entry != nullptr) {
      *nopush_entry = entries.at(0);
    }
    if (add_extras) {
      // register one handler at offset 0
      register_unsafe_access_handlers(extras, 0, 1);
    }
    return start;
  }

  // Otherwise generate the stub from scratch.
  Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
  RegSet saved_regs = RegSet::of(s, d, count);
  StubCodeMark mark(this, stub_id);
  start = __ pc();
  __ enter();

  if (nopush_entry != nullptr) {
    // The nopush entry is the pc immediately after enter().
    *nopush_entry = __ pc();
    entries.append(*nopush_entry);
    // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
    BLOCK_COMMENT("Post-Push Entry:");
  }

  // use fwd copy when (d-s) above_equal (count*size)
  // i.e. compare (d - s) against (count << log2(size)) as unsigned
  // values: when it is lower, fall into the copy below; otherwise
  // jump to nooverlap_target.
  Label L_overlapping;
  __ sub(rscratch1, d, s);
  __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
  __ br(Assembler::LO, L_overlapping);
  __ b(RuntimeAddress(nooverlap_target));
  __ bind(L_overlapping);

  // Note: unlike the disjoint generator, ARRAYCOPY_DISJOINT is not set.
  DecoratorSet decorators = IN_HEAP | IS_ARRAY;
  if (dest_uninitialized) {
    decorators |= IS_DEST_UNINITIALIZED;
  }
  if (aligned) {
    decorators |= ARRAYCOPY_ALIGNED;
  }

  BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
  bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);

  if (is_oop) {
    // save regs before copy_memory; d and count are used again by
    // verify_oop_array and arraycopy_epilogue after the copy
    __ push(RegSet::of(d, count), sp);
  }
  {
    // UnsafeMemoryAccess page error: continue after unsafe access
    UnsafeMemoryAccessMark umam(this, add_extras, true);
    // The size is negated here, whereas the disjoint generator passes
    // it unchanged.
    // NOTE(review): presumably the sign selects a backwards copy
    // inside copy_memory -- confirm against its definition.
    copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, -size);
  }
  if (is_oop) {
    __ pop(RegSet::of(d, count), sp);
    if (VerifyOops)
      verify_oop_array(size, d, count, r16);
  }
  bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1);
  __ leave();
  __ mov(r0, zr); // return 0
  __ ret(lr);

  assert(entries.length() == expected_entry_count - 1,
         "unexpected entries count %d", entries.length());

  address end = __ pc();

  if (add_extras) {
    // retrieve the registered handler addresses
    retrieve_unsafe_access_handlers(start, end, extras);
    assert(extras.length() == extra_count,
           "incorrect handlers count %d", extras.length());
  }

  // record the stub entry and end plus any no_push entry and/or
  // extra handler addresses
  store_archive_data(stub_id, start, end, entries_ptr, extras_ptr);

  return start;
}
2161
2162 // Helper for generating a dynamic type check.
2163 // Smashes rscratch1, rscratch2.
// Emits a subtype check of sub_klass against super_klass: first the
// fast path (using super_check_offset), then the slow path (using
// temp1 and temp2). Both paths are given L_success as their success
// target; L_miss is bound at the end of the emitted sequence, so code
// emitted after this call is the failure continuation.
//
// NOTE(review): the 'result' parameter is not referenced in this body.
void generate_type_check(Register sub_klass,
                         Register super_check_offset,
                         Register super_klass,
                         Register temp1,
                         Register temp2,
                         Register result,
                         Label& L_success) {
  assert_different_registers(sub_klass, super_check_offset, super_klass);

  BLOCK_COMMENT("type_check:");

  Label L_miss;

  // Fast path: given both a success label and a miss label.
  __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, nullptr,
                                   super_check_offset);
  // Slow path: only a success label is supplied (failure label is
  // nullptr), and L_miss is bound right after it.
  __ check_klass_subtype_slow_path(sub_klass, super_klass, temp1, temp2, &L_success, nullptr);

  // Fall through on failure!
  __ BIND(L_miss);
}
2184
2185 //
2186 // Generate checkcasting array copy stub
2187 //
2188 // Input:
2189 // c_rarg0 - source array address
2190 // c_rarg1 - destination array address
2191 // c_rarg2 - element count, treated as ssize_t, can be zero
2192 // c_rarg3 - size_t ckoff (super_check_offset)
2193 // c_rarg4 - oop ckval (super_klass)
2194 //
2195 // Output:
2196 // r0 == 0 - success
2197 // r0 == -1^K - failure, where K is partial transfer count
2198 //
address generate_checkcast_copy(StubId stub_id, address *nopush_entry) {
  // The two supported stubs differ only in whether the destination is
  // treated as uninitialized.
  bool dest_uninitialized;
  switch (stub_id) {
  case StubId::stubgen_checkcast_arraycopy_id:
    dest_uninitialized = false;
    break;
  case StubId::stubgen_checkcast_arraycopy_uninit_id:
    dest_uninitialized = true;
    break;
  default:
    ShouldNotReachHere();
  }

  // The normal stub provides a 2nd entry which omits the frame push
  // for use when bailing out from a disjoint copy.
  // Only some conjoint stubs generate a 2nd entry
  int entry_count = StubInfo::entry_count(stub_id);
  int expected_entry_count = (nopush_entry == nullptr ? 1 : 2);
  GrowableArray<address> entries;
  GrowableArray<address> *entries_ptr = (expected_entry_count == 1 ? nullptr : &entries);
  assert(entry_count == expected_entry_count,
         "expected entry count %d does not match declared entry count %d for stub %s",
         expected_entry_count, entry_count, StubInfo::name(stub_id));
  // A non-null result means no code is emitted here: the nopush entry
  // (if requested) is taken from 'entries' and the start address is
  // returned.
  address start = load_archive_data(stub_id, entries_ptr);
  if (start != nullptr) {
    assert(entries.length() + 1 == expected_entry_count,
           "expected entry count %d does not match return entry count %d for stub %s",
           expected_entry_count, entries.length() + 1, StubInfo::name(stub_id));
    if (nopush_entry != nullptr) {
      *nopush_entry = entries.at(0);
    }
    return start;
  }

  Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;

  // Input registers (after setup_arg_regs)
  const Register from        = c_rarg0;   // source array address
  const Register to          = c_rarg1;   // destination array address
  const Register count       = c_rarg2;   // elements count
  const Register ckoff       = c_rarg3;   // super_check_offset
  const Register ckval       = c_rarg4;   // super_klass

  RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);

  // Registers used as temps (r19, r20, r21, r22 are save-on-entry)
  const Register copied_oop  = r22;       // actual oop copied
  const Register count_save  = r21;       // orig elements count
  const Register start_to    = r20;       // destination array start address
  const Register r19_klass   = r19;       // oop._klass

  // Registers used as gc temps (r5, r6, r7 are save-on-call)
  const Register gct1 = r5, gct2 = r6, gct3 = r7;

  //---------------------------------------------------------------
  // Assembler stub will be used for this call to arraycopy
  // if the two arrays are subtypes of Object[] but the
  // destination array type is not equal to or a supertype
  // of the source type.  Each element must be separately
  // checked.

  assert_different_registers(from, to, count, ckoff, ckval, start_to,
                             copied_oop, r19_klass, count_save);

  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, stub_id);
  start = __ pc();

  __ enter(); // required for proper stackwalking of RuntimeStub frame

#ifdef ASSERT
  // caller guarantees that the arrays really are different
  // otherwise, we would have to make conjoint checks
  // (the unconditional branch below always skips the stop, so this
  // block is only a placeholder for a future check)
  { Label L;
    __ b(L);                       // conjoint check not yet implemented
    __ stop("checkcast_copy within a single array");
    __ bind(L);
  }
#endif //ASSERT

  // Caller of this entry point must set up the argument registers.
  if (nopush_entry != nullptr) {
    // The nopush entry is the pc after enter() (and after the ASSERT
    // placeholder above when it is compiled in).
    *nopush_entry = __ pc();
    entries.append(*nopush_entry);
    BLOCK_COMMENT("Entry:");
  }

   // Empty array:  Nothing to do.
   // (L_done returns count in r0, which is zero on this path, and is
   // placed after the pop so the registers pushed below are not
   // touched.)
  __ cbz(count, L_done);
  __ push(RegSet::of(r19, r20, r21, r22), sp);

#ifdef ASSERT
  BLOCK_COMMENT("assert consistent ckoff/ckval");
  // The ckoff and ckval must be mutually consistent,
  // even though caller generates both.
  // (start_to is used as a scratch register here; it is only given
  // its real value further down.)
  { Label L;
    int sco_offset = in_bytes(Klass::super_check_offset_offset());
    __ ldrw(start_to, Address(ckval, sco_offset));
    __ cmpw(ckoff, start_to);
    __ br(Assembler::EQ, L);
    __ stop("super_check_offset inconsistent");
    __ bind(L);
  }
#endif //ASSERT

  DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
  bool is_oop = true;
  int element_size = UseCompressedOops ? 4 : 8;
  if (dest_uninitialized) {
    decorators |= IS_DEST_UNINITIALIZED;
  }

  BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
  bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs);

  // save the original count
  __ mov(count_save, count);

  // Copy from low to high addresses
  __ mov(start_to, to);              // Save destination array start address
  __ b(L_load_element);

  // ======== begin loop ========
  // (Loop is rotated; its entry is L_load_element.)
  // Loop control:
  //   for (; count != 0; count--) {
  //     copied_oop = load_heap_oop(from++);
  //     ... generate_type_check ...;
  //     store_heap_oop(to++, copied_oop);
  //   }
  __ align(OptoLoopAlignment);

  __ BIND(L_store_element);
  // Store the checked (or null) oop, post-incrementing 'to'.
  bs->copy_store_at(_masm, decorators, T_OBJECT, element_size,
                    __ post(to, element_size), copied_oop, noreg,
                    gct1, gct2, gct3);
  __ sub(count, count, 1);
  // All elements copied: count is zero here, so r0 will be 0.
  __ cbz(count, L_do_card_marks);

  // ======== loop entry is here ========
  __ BIND(L_load_element);
  // Load the next oop, post-incrementing 'from'.
  bs->copy_load_at(_masm, decorators, T_OBJECT, element_size,
                   copied_oop, noreg, __ post(from, element_size),
                   gct1);
  // A null oop goes straight to the store without a type check.
  __ cbz(copied_oop, L_store_element);

  __ load_klass(r19_klass, copied_oop);// query the object klass

  BLOCK_COMMENT("type_check:");
  // Note: gct1 is passed as temp1 despite the r_array_base label.
  generate_type_check(/*sub_klass*/r19_klass,
                      /*super_check_offset*/ckoff,
                      /*super_klass*/ckval,
                      /*r_array_base*/gct1,
                      /*temp2*/gct2,
                      /*result*/r10, L_store_element);

  // Fall through on failure!

  // ======== end loop ========

  // It was a real error; we must depend on the caller to finish the job.
  // Register count = remaining oops, count_save = total oops.
  // Emit GC store barriers for the oops we have copied and report
  // their number to the caller.

  // subs sets the flags from K; eon (count ^ ~0 == ~K) does not touch
  // them, so the EQ branch below is taken exactly when K == 0 and
  // skips the epilogue in that case.
  __ subs(count, count_save, count);     // K = partially copied oop count
  __ eon(count, count, zr);              // report (-1^K) to caller
  __ br(Assembler::EQ, L_done_pop);

  __ BIND(L_do_card_marks);
  bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1);

  __ bind(L_done_pop);
  __ pop(RegSet::of(r19, r20, r21, r22), sp);
  inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);

  __ bind(L_done);
  // count is 0 on the success and empty-array paths, ~K on failure.
  __ mov(r0, count);
  __ leave();
  __ ret(lr);

  // record the stub entry and end plus any no_push entry
  store_archive_data(stub_id, start, __ pc() , entries_ptr);
  return start;
}
2384
// Perform range checks on the proposed arraycopy.
// Kills temp and rscratch1, but nothing else.
// Also, clears the high 32 bits of src_pos and dst_pos.
2388 void arraycopy_range_checks(Register src, // source array oop (c_rarg0)
2389 Register src_pos, // source position (c_rarg1)
2390 Register dst, // destination array oo (c_rarg2)
2391 Register dst_pos, // destination position (c_rarg3)
2392 Register length,
2393 Register temp,
2394 Label& L_failed) {
2395 BLOCK_COMMENT("arraycopy_range_checks:");
2396
2397 assert_different_registers(rscratch1, temp);
2398
2399 // if (src_pos + length > arrayOop(src)->length()) FAIL;
2400 __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
2401 __ addw(temp, length, src_pos);
2402 __ cmpw(temp, rscratch1);
2403 __ br(Assembler::HI, L_failed);
2404
2405 // if (dst_pos + length > arrayOop(dst)->length()) FAIL;
2406 __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
2407 __ addw(temp, length, dst_pos);
2408 __ cmpw(temp, rscratch1);
2409 __ br(Assembler::HI, L_failed);
2410
2411 // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
2412 __ movw(src_pos, src_pos);
2413 __ movw(dst_pos, dst_pos);
2414
2415 BLOCK_COMMENT("arraycopy_range_checks done");
2416 }
2417
2418 // These stubs get called from some dumb test routine.
2419 // I'll write them properly when they're called from
2420 // something that's actually doing something.
// Placeholder with an arraycopy-style signature: it copies nothing
// and, in builds where assert is active, only checks that it was
// asked to copy zero elements. src and dst are not used.
static void fake_arraycopy_stub(address src, address dst, int count) {
  assert(count == 0, "huh?");
}
2424
2425
2426 //
2427 // Generate 'unsafe' array copy stub
2428 // Though just as safe as the other stubs, it takes an unscaled
2429 // size_t argument instead of an element count.
2430 //
2431 // Input:
2432 // c_rarg0 - source array address
2433 // c_rarg1 - destination array address
2434 // c_rarg2 - byte count, treated as ssize_t, can be zero
2435 //
2436 // Examines the alignment of the operands and dispatches
2437 // to a long, int, short, or byte copy loop.
2438 //
address generate_unsafe_copy(address byte_copy_entry,
                             address short_copy_entry,
                             address int_copy_entry,
                             address long_copy_entry) {
  StubId stub_id = StubId::stubgen_unsafe_arraycopy_id;
  int entry_count = StubInfo::entry_count(stub_id);
  assert(entry_count == 1, "sanity check");
  // A non-null result is returned as-is and no code is emitted here.
  address start = load_archive_data(stub_id);
  if (start != nullptr) {
    return start;
  }
  Label L_long_aligned, L_int_aligned, L_short_aligned;
  Register s = c_rarg0, d = c_rarg1, count = c_rarg2;

  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, stub_id);
  start = __ pc();
  __ enter(); // required for proper stackwalking of RuntimeStub frame

  // bump this on entry, not on exit:
  inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);

  // OR source, destination and byte count together: a low bit is set
  // in the result if it is set in any of the three.
  __ orr(rscratch1, s, d);
  __ orr(rscratch1, rscratch1, count);

  // Pick the widest element size that divides all three values,
  // testing progressively fewer low bits.
  __ andr(rscratch1, rscratch1, BytesPerLong-1);
  __ cbz(rscratch1, L_long_aligned);
  __ andr(rscratch1, rscratch1, BytesPerInt-1);
  __ cbz(rscratch1, L_int_aligned);
  __ tbz(rscratch1, 0, L_short_aligned);
  // Bit 0 set: fall back to the byte copy with count unchanged.
  __ b(RuntimeAddress(byte_copy_entry));

  // Each case below converts the byte count into an element count and
  // branches (does not call) to the matching copy entry.
  __ BIND(L_short_aligned);
  __ lsr(count, count, LogBytesPerShort);  // size => short_count
  __ b(RuntimeAddress(short_copy_entry));
  __ BIND(L_int_aligned);
  __ lsr(count, count, LogBytesPerInt);    // size => int_count
  __ b(RuntimeAddress(int_copy_entry));
  __ BIND(L_long_aligned);
  __ lsr(count, count, LogBytesPerLong);   // size => long_count
  __ b(RuntimeAddress(long_copy_entry));

  // record the stub entry and end
  store_archive_data(stub_id, start, __ pc());

  return start;
}
2486
2487 //
2488 // Generate generic array copy stubs
2489 //
2490 // Input:
2491 // c_rarg0 - src oop
2492 // c_rarg1 - src_pos (32-bits)
2493 // c_rarg2 - dst oop
2494 // c_rarg3 - dst_pos (32-bits)
2495 // c_rarg4 - element count (32-bits)
2496 //
2497 // Output:
2498 // r0 == 0 - success
2499 // r0 == -1^K - failure, where K is partial transfer count
2500 //
2501 address generate_generic_copy(address byte_copy_entry, address short_copy_entry,
2502 address int_copy_entry, address oop_copy_entry,
2503 address long_copy_entry, address checkcast_copy_entry) {
2504 StubId stub_id = StubId::stubgen_generic_arraycopy_id;
2505 int entry_count = StubInfo::entry_count(stub_id);
2506 assert(entry_count == 1, "sanity check");
2507 address start = load_archive_data(stub_id);
2508 if (start != nullptr) {
2509 return start;
2510 }
2511 Label L_failed, L_objArray;
2512 Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
2513
2514 // Input registers
2515 const Register src = c_rarg0; // source array oop
2516 const Register src_pos = c_rarg1; // source position
2517 const Register dst = c_rarg2; // destination array oop
2518 const Register dst_pos = c_rarg3; // destination position
2519 const Register length = c_rarg4;
2520
2521
2522 // Registers used as temps
2523 const Register dst_klass = c_rarg5;
2524
2525 __ align(CodeEntryAlignment);
2526
2527 StubCodeMark mark(this, stub_id);
2528
2529 start = __ pc();
2530
2531 __ enter(); // required for proper stackwalking of RuntimeStub frame
2532
2533 // bump this on entry, not on exit:
2534 inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
2535
2536 //-----------------------------------------------------------------------
2537 // Assembler stub will be used for this call to arraycopy
2538 // if the following conditions are met:
2539 //
2540 // (1) src and dst must not be null.
2541 // (2) src_pos must not be negative.
2542 // (3) dst_pos must not be negative.
2543 // (4) length must not be negative.
2544 // (5) src klass and dst klass should be the same and not null.
2545 // (6) src and dst should be arrays.
2546 // (7) src_pos + length must not exceed length of src.
2547 // (8) dst_pos + length must not exceed length of dst.
2548 //
2549
2550 // if (src == nullptr) return -1;
2551 __ cbz(src, L_failed);
2552
2553 // if (src_pos < 0) return -1;
2554 __ tbnz(src_pos, 31, L_failed); // i.e. sign bit set
2555
2556 // if (dst == nullptr) return -1;
2557 __ cbz(dst, L_failed);
2558
2559 // if (dst_pos < 0) return -1;
2560 __ tbnz(dst_pos, 31, L_failed); // i.e. sign bit set
2561
2562 // registers used as temp
2563 const Register scratch_length = r16; // elements count to copy
2564 const Register scratch_src_klass = r17; // array klass
2565 const Register lh = r15; // layout helper
2566
2567 // if (length < 0) return -1;
2568 __ movw(scratch_length, length); // length (elements count, 32-bits value)
2569 __ tbnz(scratch_length, 31, L_failed); // i.e. sign bit set
2570
2571 __ load_klass(scratch_src_klass, src);
2572 #ifdef ASSERT
2573 // assert(src->klass() != nullptr);
2574 {
2575 BLOCK_COMMENT("assert klasses not null {");
2576 Label L1, L2;
2577 __ cbnz(scratch_src_klass, L2); // it is broken if klass is null
2578 __ bind(L1);
2579 __ stop("broken null klass");
2580 __ bind(L2);
2581 __ load_klass(rscratch1, dst);
2582 __ cbz(rscratch1, L1); // this would be broken also
2583 BLOCK_COMMENT("} assert klasses not null done");
2584 }
2585 #endif
2586
2587 // Load layout helper (32-bits)
2588 //
2589 // |array_tag| | header_size | element_type | |log2_element_size|
2590 // 32 30 24 16 8 2 0
2591 //
2592 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2593 //
2594
2595 const int lh_offset = in_bytes(Klass::layout_helper_offset());
2596
2597 // Handle objArrays completely differently...
2598 const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2599 __ ldrw(lh, Address(scratch_src_klass, lh_offset));
2600 __ movw(rscratch1, objArray_lh);
2601 __ eorw(rscratch2, lh, rscratch1);
2602 __ cbzw(rscratch2, L_objArray);
2603
2604 // if (src->klass() != dst->klass()) return -1;
2605 __ load_klass(rscratch2, dst);
2606 __ eor(rscratch2, rscratch2, scratch_src_klass);
2607 __ cbnz(rscratch2, L_failed);
2608
2609 // if (!src->is_Array()) return -1;
2610 __ tbz(lh, 31, L_failed); // i.e. (lh >= 0)
2611
2612 // At this point, it is known to be a typeArray (array_tag 0x3).
2613 #ifdef ASSERT
2614 {
2615 BLOCK_COMMENT("assert primitive array {");
2616 Label L;
2617 __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
2618 __ cmpw(lh, rscratch2);
2619 __ br(Assembler::GE, L);
2620 __ stop("must be a primitive array");
2621 __ bind(L);
2622 BLOCK_COMMENT("} assert primitive array done");
2623 }
2624 #endif
2625
2626 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2627 rscratch2, L_failed);
2628
2629 // TypeArrayKlass
2630 //
2631 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2632 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2633 //
2634
2635 const Register rscratch1_offset = rscratch1; // array offset
2636 const Register r15_elsize = lh; // element size
2637
2638 __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
2639 exact_log2(Klass::_lh_header_size_mask+1)); // array_offset
2640 __ add(src, src, rscratch1_offset); // src array offset
2641 __ add(dst, dst, rscratch1_offset); // dst array offset
2642 BLOCK_COMMENT("choose copy loop based on element size");
2643
2644 // next registers should be set before the jump to corresponding stub
2645 const Register from = c_rarg0; // source array address
2646 const Register to = c_rarg1; // destination array address
2647 const Register count = c_rarg2; // elements count
2648
2649 // 'from', 'to', 'count' registers should be set in such order
2650 // since they are the same as 'src', 'src_pos', 'dst'.
2651
2652 assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
2653
2654 // The possible values of elsize are 0-3, i.e. exact_log2(element
2655 // size in bytes). We do a simple bitwise binary search.
2656 __ BIND(L_copy_bytes);
2657 __ tbnz(r15_elsize, 1, L_copy_ints);
2658 __ tbnz(r15_elsize, 0, L_copy_shorts);
2659 __ lea(from, Address(src, src_pos));// src_addr
2660 __ lea(to, Address(dst, dst_pos));// dst_addr
2661 __ movw(count, scratch_length); // length
2662 __ b(RuntimeAddress(byte_copy_entry));
2663
2664 __ BIND(L_copy_shorts);
2665 __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
2666 __ lea(to, Address(dst, dst_pos, Address::lsl(1)));// dst_addr
2667 __ movw(count, scratch_length); // length
2668 __ b(RuntimeAddress(short_copy_entry));
2669
2670 __ BIND(L_copy_ints);
2671 __ tbnz(r15_elsize, 0, L_copy_longs);
2672 __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
2673 __ lea(to, Address(dst, dst_pos, Address::lsl(2)));// dst_addr
2674 __ movw(count, scratch_length); // length
2675 __ b(RuntimeAddress(int_copy_entry));
2676
2677 __ BIND(L_copy_longs);
2678 #ifdef ASSERT
2679 {
2680 BLOCK_COMMENT("assert long copy {");
2681 Label L;
2682 __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize
2683 __ cmpw(r15_elsize, LogBytesPerLong);
2684 __ br(Assembler::EQ, L);
2685 __ stop("must be long copy, but elsize is wrong");
2686 __ bind(L);
2687 BLOCK_COMMENT("} assert long copy done");
2688 }
2689 #endif
2690 __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
2691 __ lea(to, Address(dst, dst_pos, Address::lsl(3)));// dst_addr
2692 __ movw(count, scratch_length); // length
2693 __ b(RuntimeAddress(long_copy_entry));
2694
2695 // ObjArrayKlass
2696 __ BIND(L_objArray);
2697 // live at this point: scratch_src_klass, scratch_length, src[_pos], dst[_pos]
2698
2699 Label L_plain_copy, L_checkcast_copy;
2700 // test array classes for subtyping
2701 __ load_klass(r15, dst);
2702 __ cmp(scratch_src_klass, r15); // usual case is exact equality
2703 __ br(Assembler::NE, L_checkcast_copy);
2704
2705 // Identically typed arrays can be copied without element-wise checks.
2706 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2707 rscratch2, L_failed);
2708
2709 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2710 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2711 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2712 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2713 __ movw(count, scratch_length); // length
2714 __ BIND(L_plain_copy);
2715 __ b(RuntimeAddress(oop_copy_entry));
2716
2717 __ BIND(L_checkcast_copy);
2718 // live at this point: scratch_src_klass, scratch_length, r15 (dst_klass)
2719 {
2720 // Before looking at dst.length, make sure dst is also an objArray.
2721 __ ldrw(rscratch1, Address(r15, lh_offset));
2722 __ movw(rscratch2, objArray_lh);
2723 __ eorw(rscratch1, rscratch1, rscratch2);
2724 __ cbnzw(rscratch1, L_failed);
2725
2726 // It is safe to examine both src.length and dst.length.
2727 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2728 r15, L_failed);
2729
2730 __ load_klass(dst_klass, dst); // reload
2731
2732 // Marshal the base address arguments now, freeing registers.
2733 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2734 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2735 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2736 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2737 __ movw(count, length); // length (reloaded)
2738 Register sco_temp = c_rarg3; // this register is free now
2739 assert_different_registers(from, to, count, sco_temp,
2740 dst_klass, scratch_src_klass);
2741 // assert_clean_int(count, sco_temp);
2742
2743 // Generate the type check.
2744 const int sco_offset = in_bytes(Klass::super_check_offset_offset());
2745 __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2746
2747 // Smashes rscratch1, rscratch2
2748 generate_type_check(scratch_src_klass, sco_temp, dst_klass, /*temps*/ noreg, noreg, noreg,
2749 L_plain_copy);
2750
2751 // Fetch destination element klass from the ObjArrayKlass header.
2752 int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2753 __ ldr(dst_klass, Address(dst_klass, ek_offset));
2754 __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2755
2756 // the checkcast_copy loop needs two extra arguments:
2757 assert(c_rarg3 == sco_temp, "#3 already in place");
2758 // Set up arguments for checkcast_copy_entry.
2759 __ mov(c_rarg4, dst_klass); // dst.klass.element_klass
2760 __ b(RuntimeAddress(checkcast_copy_entry));
2761 }
2762
2763 __ BIND(L_failed);
2764 __ mov(r0, -1);
2765 __ leave(); // required for proper stackwalking of RuntimeStub frame
2766 __ ret(lr);
2767
2768 // record the stub entry and end
2769 store_archive_data(stub_id, start, __ pc());
2770
2771 return start;
2772 }
2773
2774 //
2775 // Generate stub for array fill. If "aligned" is true, the
2776 // "to" address is assumed to be heapword aligned.
2777 //
2778 // Arguments for generated stub:
2779 // to: c_rarg0
2780 // value: c_rarg1
2781 // count: c_rarg2 treated as signed
2782 //
2783 address generate_fill(StubId stub_id) {
2784 BasicType t;
2785 bool aligned;
2786
2787 switch (stub_id) {
2788 case StubId::stubgen_jbyte_fill_id:
2789 t = T_BYTE;
2790 aligned = false;
2791 break;
2792 case StubId::stubgen_jshort_fill_id:
2793 t = T_SHORT;
2794 aligned = false;
2795 break;
2796 case StubId::stubgen_jint_fill_id:
2797 t = T_INT;
2798 aligned = false;
2799 break;
2800 case StubId::stubgen_arrayof_jbyte_fill_id:
2801 t = T_BYTE;
2802 aligned = true;
2803 break;
2804 case StubId::stubgen_arrayof_jshort_fill_id:
2805 t = T_SHORT;
2806 aligned = true;
2807 break;
2808 case StubId::stubgen_arrayof_jint_fill_id:
2809 t = T_INT;
2810 aligned = true;
2811 break;
2812 default:
2813 ShouldNotReachHere();
2814 };
2815 int entry_count = StubInfo::entry_count(stub_id);
2816 assert(entry_count == 1, "sanity check");
2817 address start = load_archive_data(stub_id);
2818 if (start != nullptr) {
2819 return start;
2820 }
2821 __ align(CodeEntryAlignment);
2822 StubCodeMark mark(this, stub_id);
2823 start = __ pc();
2824
2825 BLOCK_COMMENT("Entry:");
2826
2827 const Register to = c_rarg0; // source array address
2828 const Register value = c_rarg1; // value
2829 const Register count = c_rarg2; // elements count
2830
2831 const Register bz_base = r10; // base for block_zero routine
2832 const Register cnt_words = r11; // temp register
2833
2834 __ enter();
2835
2836 Label L_fill_elements, L_exit1;
2837
2838 int shift = -1;
2839 switch (t) {
2840 case T_BYTE:
2841 shift = 0;
2842 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2843 __ bfi(value, value, 8, 8); // 8 bit -> 16 bit
2844 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2845 __ br(Assembler::LO, L_fill_elements);
2846 break;
2847 case T_SHORT:
2848 shift = 1;
2849 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2850 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2851 __ br(Assembler::LO, L_fill_elements);
2852 break;
2853 case T_INT:
2854 shift = 2;
2855 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2856 __ br(Assembler::LO, L_fill_elements);
2857 break;
2858 default: ShouldNotReachHere();
2859 }
2860
2861 // Align source address at 8 bytes address boundary.
2862 Label L_skip_align1, L_skip_align2, L_skip_align4;
2863 if (!aligned) {
2864 switch (t) {
2865 case T_BYTE:
2866 // One byte misalignment happens only for byte arrays.
2867 __ tbz(to, 0, L_skip_align1);
2868 __ strb(value, Address(__ post(to, 1)));
2869 __ subw(count, count, 1);
2870 __ bind(L_skip_align1);
2871 // Fallthrough
2872 case T_SHORT:
2873 // Two bytes misalignment happens only for byte and short (char) arrays.
2874 __ tbz(to, 1, L_skip_align2);
2875 __ strh(value, Address(__ post(to, 2)));
2876 __ subw(count, count, 2 >> shift);
2877 __ bind(L_skip_align2);
2878 // Fallthrough
2879 case T_INT:
2880 // Align to 8 bytes, we know we are 4 byte aligned to start.
2881 __ tbz(to, 2, L_skip_align4);
2882 __ strw(value, Address(__ post(to, 4)));
2883 __ subw(count, count, 4 >> shift);
2884 __ bind(L_skip_align4);
2885 break;
2886 default: ShouldNotReachHere();
2887 }
2888 }
2889
2890 //
2891 // Fill large chunks
2892 //
2893 __ lsrw(cnt_words, count, 3 - shift); // number of words
2894 __ bfi(value, value, 32, 32); // 32 bit -> 64 bit
2895 __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
2896 if (UseBlockZeroing) {
2897 Label non_block_zeroing, rest;
2898 // If the fill value is zero we can use the fast zero_words().
2899 __ cbnz(value, non_block_zeroing);
2900 __ mov(bz_base, to);
2901 __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
2902 address tpc = __ zero_words(bz_base, cnt_words);
2903 if (tpc == nullptr) {
2904 fatal("CodeCache is full at generate_fill");
2905 }
2906 __ b(rest);
2907 __ bind(non_block_zeroing);
2908 __ fill_words(to, cnt_words, value);
2909 __ bind(rest);
2910 } else {
2911 __ fill_words(to, cnt_words, value);
2912 }
2913
2914 // Remaining count is less than 8 bytes. Fill it by a single store.
2915 // Note that the total length is no less than 8 bytes.
2916 if (t == T_BYTE || t == T_SHORT) {
2917 Label L_exit1;
2918 __ cbzw(count, L_exit1);
2919 __ add(to, to, count, Assembler::LSL, shift); // points to the end
2920 __ str(value, Address(to, -8)); // overwrite some elements
2921 __ bind(L_exit1);
2922 __ leave();
2923 __ ret(lr);
2924 }
2925
2926 // Handle copies less than 8 bytes.
2927 Label L_fill_2, L_fill_4, L_exit2;
2928 __ bind(L_fill_elements);
2929 switch (t) {
2930 case T_BYTE:
2931 __ tbz(count, 0, L_fill_2);
2932 __ strb(value, Address(__ post(to, 1)));
2933 __ bind(L_fill_2);
2934 __ tbz(count, 1, L_fill_4);
2935 __ strh(value, Address(__ post(to, 2)));
2936 __ bind(L_fill_4);
2937 __ tbz(count, 2, L_exit2);
2938 __ strw(value, Address(to));
2939 break;
2940 case T_SHORT:
2941 __ tbz(count, 0, L_fill_4);
2942 __ strh(value, Address(__ post(to, 2)));
2943 __ bind(L_fill_4);
2944 __ tbz(count, 1, L_exit2);
2945 __ strw(value, Address(to));
2946 break;
2947 case T_INT:
2948 __ cbzw(count, L_exit2);
2949 __ strw(value, Address(to));
2950 break;
2951 default: ShouldNotReachHere();
2952 }
2953 __ bind(L_exit2);
2954 __ leave();
2955 __ ret(lr);
2956
2957 // record the stub entry and end
2958 store_archive_data(stub_id, start, __ pc());
2959
2960 return start;
2961 }
2962
2963 address generate_unsafecopy_common_error_exit() {
2964 StubId stub_id = StubId::stubgen_unsafecopy_common_id;
2965 int entry_count = StubInfo::entry_count(stub_id);
2966 assert(entry_count == 1, "sanity check");
2967 address start = load_archive_data(stub_id);
2968 if (start != nullptr) {
2969 return start;
2970 }
2971 __ align(CodeEntryAlignment);
2972 StubCodeMark mark(this, stub_id);
2973 start = __ pc();
2974 __ leave();
2975 __ mov(r0, 0);
2976 __ ret(lr);
2977
2978 // record the stub entry and end
2979 store_archive_data(stub_id, start, __ pc());
2980
2981 return start;
2982 }
2983
2984 //
2985 // Generate 'unsafe' set memory stub
2986 // Though just as safe as the other stubs, it takes an unscaled
2987 // size_t (# bytes) argument instead of an element count.
2988 //
2989 // This fill operation is atomicity preserving: as long as the
2990 // address supplied is sufficiently aligned, all writes of up to 64
2991 // bits in size are single-copy atomic.
2992 //
2993 // Input:
2994 // c_rarg0 - destination array address
2995 // c_rarg1 - byte count (size_t)
2996 // c_rarg2 - byte value
2997 //
2998 address generate_unsafe_setmemory() {
2999 StubId stub_id = StubId::stubgen_unsafe_setmemory_id;
3000 int entry_count = StubInfo::entry_count(stub_id);
3001 assert(entry_count == 1, "sanity check");
3002 // we expect one set of extra unsafememory access handler entries
3003 GrowableArray<address> extras;
3004 int extra_count = 1 * UnsafeMemoryAccess::COLUMN_COUNT;
3005 address start = load_archive_data(stub_id, nullptr, &extras);
3006 if (start != nullptr) {
3007 assert(extras.length() == extra_count,
3008 "unexpected extra entry count %d", extras.length());
3009 register_unsafe_access_handlers(extras, 0, 1);
3010 return start;
3011 }
3012
3013 __ align(CodeEntryAlignment);
3014 StubCodeMark mark(this, stub_id);
3015 start = __ pc();
3016
3017 Register dest = c_rarg0, count = c_rarg1, value = c_rarg2;
3018 Label tail;
3019
3020 {
3021 UnsafeMemoryAccessMark umam(this, true, false);
3022
3023 __ enter(); // required for proper stackwalking of RuntimeStub frame
3024
3025 __ dup(v0, __ T16B, value);
3026
3027 if (AvoidUnalignedAccesses) {
3028 __ cmp(count, (u1)16);
3029 __ br(__ LO, tail);
3030
3031 __ mov(rscratch1, 16);
3032 __ andr(rscratch2, dest, 15);
3033 __ sub(rscratch1, rscratch1, rscratch2); // Bytes needed to 16-align dest
3034 __ strq(v0, Address(dest));
3035 __ sub(count, count, rscratch1);
3036 __ add(dest, dest, rscratch1);
3037 }
3038
3039 __ subs(count, count, (u1)64);
3040 __ br(__ LO, tail);
3041 {
3042 Label again;
3043 __ bind(again);
3044 __ stpq(v0, v0, Address(dest));
3045 __ stpq(v0, v0, Address(dest, 32));
3046
3047 __ subs(count, count, 64);
3048 __ add(dest, dest, 64);
3049 __ br(__ HS, again);
3050 }
3051
3052 __ bind(tail);
3053 // The count of bytes is off by 64, but we don't need to correct
3054 // it because we're only going to use the least-significant few
3055 // count bits from here on.
3056 // __ add(count, count, 64);
3057
3058 {
3059 Label dont;
3060 __ tbz(count, exact_log2(32), dont);
3061 __ stpq(v0, v0, __ post(dest, 32));
3062 __ bind(dont);
3063 }
3064 {
3065 Label dont;
3066 __ tbz(count, exact_log2(16), dont);
3067 __ strq(v0, __ post(dest, 16));
3068 __ bind(dont);
3069 }
3070 {
3071 Label dont;
3072 __ tbz(count, exact_log2(8), dont);
3073 __ strd(v0, __ post(dest, 8));
3074 __ bind(dont);
3075 }
3076
3077 Label finished;
3078 __ tst(count, 7);
3079 __ br(__ EQ, finished);
3080
3081 {
3082 Label dont;
3083 __ tbz(count, exact_log2(4), dont);
3084 __ strs(v0, __ post(dest, 4));
3085 __ bind(dont);
3086 }
3087 {
3088 Label dont;
3089 __ tbz(count, exact_log2(2), dont);
3090 __ bfi(value, value, 8, 8);
3091 __ strh(value, __ post(dest, 2));
3092 __ bind(dont);
3093 }
3094 {
3095 Label dont;
3096 __ tbz(count, exact_log2(1), dont);
3097 __ strb(value, Address(dest));
3098 __ bind(dont);
3099 }
3100
3101 __ bind(finished);
3102 __ leave();
3103 __ ret(lr);
3104 // have to exit the block and destroy the UnsafeMemoryAccessMark
3105 // in order to retrieve the handler end address
3106 }
3107
3108 // install saved handler addresses in extras
3109 address end = __ pc();
3110 retrieve_unsafe_access_handlers(start, end, extras);
3111 assert(extras.length() == extra_count,
3112 "incorrect handlers count %d", extras.length());
3113 // record the stub entry and end plus the extras
3114 store_archive_data(stub_id, start, end, nullptr, &extras);
3115
3116 return start;
3117 }
3118
3119 address generate_data_cache_writeback() {
3120 const Register line = c_rarg0; // address of line to write back
3121
3122 StubId stub_id = StubId::stubgen_data_cache_writeback_id;
3123 int entry_count = StubInfo::entry_count(stub_id);
3124 assert(entry_count == 1, "sanity check");
3125 address start = load_archive_data(stub_id);
3126 if (start != nullptr) {
3127 return start;
3128 }
3129 __ align(CodeEntryAlignment);
3130 StubCodeMark mark(this, stub_id);
3131
3132 start = __ pc();
3133 __ enter();
3134 __ cache_wb(Address(line, 0));
3135 __ leave();
3136 __ ret(lr);
3137
3138 // record the stub entry and end
3139 store_archive_data(stub_id, start, __ pc());
3140
3141 return start;
3142 }
3143
3144 address generate_data_cache_writeback_sync() {
3145 StubId stub_id = StubId::stubgen_data_cache_writeback_sync_id;
3146 int entry_count = StubInfo::entry_count(stub_id);
3147 assert(entry_count == 1, "sanity check");
3148 address start = load_archive_data(stub_id);
3149 if (start != nullptr) {
3150 return start;
3151 }
3152 const Register is_pre = c_rarg0; // pre or post sync
3153 __ align(CodeEntryAlignment);
3154 StubCodeMark mark(this, stub_id);
3155
3156 // pre wbsync is a no-op
3157 // post wbsync translates to an sfence
3158
3159 Label skip;
3160 start = __ pc();
3161 __ enter();
3162 __ cbnz(is_pre, skip);
3163 __ cache_wbsync(false);
3164 __ bind(skip);
3165 __ leave();
3166 __ ret(lr);
3167
3168 // record the stub entry and end
3169 store_archive_data(stub_id, start, __ pc());
3170
3171 return start;
3172 }
3173
3174 void generate_arraycopy_stubs() {
3175 // Some copy stubs publish a normal entry and then a 2nd 'fallback'
3176 // entry immediately following their stack push. This can be used
3177 // as a post-push branch target for compatible stubs when they
3178 // identify a special case that can be handled by the fallback
3179 // stub e.g a disjoint copy stub may be use as a special case
3180 // fallback for its compatible conjoint copy stub.
3181 //
3182 // A no push entry is always returned in the following local and
3183 // then published by assigning to the appropriate entry field in
3184 // class StubRoutines. The entry value is then passed to the
3185 // generator for the compatible stub. That means the entry must be
3186 // listed when saving to/restoring from the AOT cache, ensuring
3187 // that the inter-stub jumps are noted at AOT-cache save and
3188 // relocated at AOT cache load.
3189 address nopush_entry;
3190
3191 // generate the common exit first so later stubs can rely on it if
3192 // they want an UnsafeMemoryAccess exit non-local to the stub
3193 StubRoutines::_unsafecopy_common_exit = generate_unsafecopy_common_error_exit();
3194 // register the stub as the default exit with class UnsafeMemoryAccess
3195 UnsafeMemoryAccess::set_common_exit_stub_pc(StubRoutines::_unsafecopy_common_exit);
3196
3197 // generate and publish arch64-specific bulk copy routines first
3198 // so we can call them from other copy stubs
3199 StubRoutines::aarch64::_copy_byte_f = generate_copy_longs(StubId::stubgen_copy_byte_f_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
3200 StubRoutines::aarch64::_copy_byte_b = generate_copy_longs(StubId::stubgen_copy_byte_b_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
3201
3202 StubRoutines::aarch64::_copy_oop_f = generate_copy_longs(StubId::stubgen_copy_oop_f_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
3203 StubRoutines::aarch64::_copy_oop_b = generate_copy_longs(StubId::stubgen_copy_oop_b_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
3204
3205 StubRoutines::aarch64::_copy_oop_uninit_f = generate_copy_longs(StubId::stubgen_copy_oop_uninit_f_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, r0, r1, r15);
3206 StubRoutines::aarch64::_copy_oop_uninit_b = generate_copy_longs(StubId::stubgen_copy_oop_uninit_b_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, r0, r1, r15);
3207
3208 StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();
3209
3210 //*** jbyte
3211 // Always need aligned and unaligned versions
3212 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_jbyte_disjoint_arraycopy_id, &nopush_entry);
3213 // disjoint nopush entry is needed by conjoint copy
3214 StubRoutines::_jbyte_disjoint_arraycopy_nopush = nopush_entry;
3215 StubRoutines::_jbyte_arraycopy = generate_conjoint_copy(StubId::stubgen_jbyte_arraycopy_id, StubRoutines::_jbyte_disjoint_arraycopy_nopush, &nopush_entry);
3216 // conjoint nopush entry is needed by generic/unsafe copy
3217 StubRoutines::_jbyte_arraycopy_nopush = nopush_entry;
3218 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jbyte_disjoint_arraycopy_id, &nopush_entry);
3219 // disjoint arrayof nopush entry is needed by conjoint copy
3220 StubRoutines::_arrayof_jbyte_disjoint_arraycopy_nopush = nopush_entry;
3221 StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jbyte_arraycopy_id, StubRoutines::_arrayof_jbyte_disjoint_arraycopy_nopush, nullptr);
3222
3223 //*** jshort
3224 // Always need aligned and unaligned versions
3225 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_jshort_disjoint_arraycopy_id, &nopush_entry);
3226 // disjoint nopush entry is needed by conjoint copy
3227 StubRoutines::_jshort_disjoint_arraycopy_nopush = nopush_entry;
3228 StubRoutines::_jshort_arraycopy = generate_conjoint_copy(StubId::stubgen_jshort_arraycopy_id, StubRoutines::_jshort_disjoint_arraycopy_nopush, &nopush_entry);
3229 // conjoint nopush entry is used by generic/unsafe copy
3230 StubRoutines::_jshort_arraycopy_nopush = nopush_entry;
3231 StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jshort_disjoint_arraycopy_id, &nopush_entry);
3232 // disjoint arrayof nopush entry is needed by conjoint copy
3233 StubRoutines::_arrayof_jshort_disjoint_arraycopy_nopush = nopush_entry;
3234 StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jshort_arraycopy_id, StubRoutines::_arrayof_jshort_disjoint_arraycopy_nopush, nullptr);
3235
3236 //*** jint
3237 // Aligned versions
3238 StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jint_disjoint_arraycopy_id, &nopush_entry);
3239 // disjoint arrayof nopush entry is needed by conjoint copy
3240 StubRoutines::_arrayof_jint_disjoint_arraycopy_nopush = nopush_entry;
3241 StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jint_arraycopy_id, StubRoutines::_arrayof_jint_disjoint_arraycopy_nopush, nullptr);
3242 // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
3243 // jint_arraycopy_nopush always points to the unaligned version
3244 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_jint_disjoint_arraycopy_id, &nopush_entry);
3245 // disjoint nopush entry is needed by conjoint copy
3246 StubRoutines::_jint_disjoint_arraycopy_nopush = nopush_entry;
3247 StubRoutines::_jint_arraycopy = generate_conjoint_copy(StubId::stubgen_jint_arraycopy_id, StubRoutines::_jint_disjoint_arraycopy_nopush, &nopush_entry);
3248 // conjoint nopush entry is needed by generic/unsafe copy
3249 StubRoutines::_jint_arraycopy_nopush = nopush_entry;
3250
3251 //*** jlong
3252 // It is always aligned
3253 StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jlong_disjoint_arraycopy_id, &nopush_entry);
3254 // disjoint arrayof nopush entry is needed by conjoint copy
3255 StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush = nopush_entry;
3256 StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jlong_arraycopy_id, StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush, &nopush_entry);
3257 // conjoint nopush entry is needed by generic/unsafe copy
3258 StubRoutines::_jlong_arraycopy_nopush = nopush_entry;
3259 // disjoint normal/nopush and conjoint normal entries are not
3260 // generated since the arrayof versions are the same
3261 StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
3262 StubRoutines::_jlong_disjoint_arraycopy_nopush = StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush;
3263 StubRoutines::_jlong_arraycopy = StubRoutines::_arrayof_jlong_arraycopy;
3264
3265 //*** oops
3266 {
3267 StubRoutines::_arrayof_oop_disjoint_arraycopy
3268 = generate_disjoint_copy(StubId::stubgen_arrayof_oop_disjoint_arraycopy_id, &nopush_entry);
3269 // disjoint arrayof nopush entry is needed by conjoint copy
3270 StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush = nopush_entry;
3271 StubRoutines::_arrayof_oop_arraycopy
3272 = generate_conjoint_copy(StubId::stubgen_arrayof_oop_arraycopy_id, StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush, &nopush_entry);
3273 // conjoint arrayof nopush entry is needed by generic/unsafe copy
3274 StubRoutines::_oop_arraycopy_nopush = nopush_entry;
3275 // Aligned versions without pre-barriers
3276 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
3277 = generate_disjoint_copy(StubId::stubgen_arrayof_oop_disjoint_arraycopy_uninit_id, &nopush_entry);
3278 // disjoint arrayof+uninit nopush entry is needed by conjoint copy
3279 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush = nopush_entry;
3280 // note that we don't need a returned nopush entry because the
3281 // generic/unsafe copy does not cater for uninit arrays.
3282 StubRoutines::_arrayof_oop_arraycopy_uninit
3283 = generate_conjoint_copy(StubId::stubgen_arrayof_oop_arraycopy_uninit_id, StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush, nullptr);
3284 }
3285
3286 // for oop copies reuse arrayof entries for non-arrayof cases
3287 StubRoutines::_oop_disjoint_arraycopy = StubRoutines::_arrayof_oop_disjoint_arraycopy;
3288 StubRoutines::_oop_disjoint_arraycopy_nopush = StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush;
3289 StubRoutines::_oop_arraycopy = StubRoutines::_arrayof_oop_arraycopy;
3290 StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
3291 StubRoutines::_oop_disjoint_arraycopy_uninit_nopush = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush;
3292 StubRoutines::_oop_arraycopy_uninit = StubRoutines::_arrayof_oop_arraycopy_uninit;
3293
3294 StubRoutines::_checkcast_arraycopy = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_id, &nopush_entry);
3295 // checkcast nopush entry is needed by generic copy
3296 StubRoutines::_checkcast_arraycopy_nopush = nopush_entry;
3297 // note that we don't need a returned nopush entry because the
3298 // generic copy does not cater for uninit arrays.
3299 StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_uninit_id, nullptr);
3300
3301 // unsafe arraycopy may fallback on conjoint stubs
3302 StubRoutines::_unsafe_arraycopy = generate_unsafe_copy(StubRoutines::_jbyte_arraycopy_nopush,
3303 StubRoutines::_jshort_arraycopy_nopush,
3304 StubRoutines::_jint_arraycopy_nopush,
3305 StubRoutines::_jlong_arraycopy_nopush);
3306
3307 // generic arraycopy may fallback on conjoint stubs
3308 StubRoutines::_generic_arraycopy = generate_generic_copy(StubRoutines::_jbyte_arraycopy_nopush,
3309 StubRoutines::_jshort_arraycopy_nopush,
3310 StubRoutines::_jint_arraycopy_nopush,
3311 StubRoutines::_oop_arraycopy_nopush,
3312 StubRoutines::_jlong_arraycopy_nopush,
3313 StubRoutines::_checkcast_arraycopy_nopush);
3314
3315 StubRoutines::_jbyte_fill = generate_fill(StubId::stubgen_jbyte_fill_id);
3316 StubRoutines::_jshort_fill = generate_fill(StubId::stubgen_jshort_fill_id);
3317 StubRoutines::_jint_fill = generate_fill(StubId::stubgen_jint_fill_id);
3318 StubRoutines::_arrayof_jbyte_fill = generate_fill(StubId::stubgen_arrayof_jbyte_fill_id);
3319 StubRoutines::_arrayof_jshort_fill = generate_fill(StubId::stubgen_arrayof_jshort_fill_id);
3320 StubRoutines::_arrayof_jint_fill = generate_fill(StubId::stubgen_arrayof_jint_fill_id);
3321 }
3322
3323 void generate_math_stubs() { Unimplemented(); }
3324
3325 // Arguments:
3326 //
3327 // Inputs:
3328 // c_rarg0 - source byte array address
3329 // c_rarg1 - destination byte array address
3330 // c_rarg2 - sessionKe (key) in little endian int array
3331 //
3332 address generate_aescrypt_encryptBlock() {
3333 assert(UseAES, "need AES cryptographic extension support");
3334 StubId stub_id = StubId::stubgen_aescrypt_encryptBlock_id;
3335 int entry_count = StubInfo::entry_count(stub_id);
3336 assert(entry_count == 1, "sanity check");
3337 address start = load_archive_data(stub_id);
3338 if (start != nullptr) {
3339 return start;
3340 }
3341 __ align(CodeEntryAlignment);
3342 StubCodeMark mark(this, stub_id);
3343
3344 const Register from = c_rarg0; // source array address
3345 const Register to = c_rarg1; // destination array address
3346 const Register key = c_rarg2; // key array address
3347 const Register keylen = rscratch1;
3348
3349 start = __ pc();
3350 __ enter();
3351
3352 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3353
3354 __ aesenc_loadkeys(key, keylen);
3355 __ aesecb_encrypt(from, to, keylen);
3356
3357 __ mov(r0, 0);
3358
3359 __ leave();
3360 __ ret(lr);
3361
3362 // record the stub entry and end
3363 store_archive_data(stub_id, start, __ pc());
3364
3365 return start;
3366 }
3367
3368 // Arguments:
3369 //
3370 // Inputs:
3371 // c_rarg0 - source byte array address
3372 // c_rarg1 - destination byte array address
3373 // c_rarg2 - sessionKd (key) in little endian int array
3374 //
3375 address generate_aescrypt_decryptBlock() {
3376 assert(UseAES, "need AES cryptographic extension support");
3377 StubId stub_id = StubId::stubgen_aescrypt_decryptBlock_id;
3378 int entry_count = StubInfo::entry_count(stub_id);
3379 assert(entry_count == 1, "sanity check");
3380 address start = load_archive_data(stub_id);
3381 if (start != nullptr) {
3382 return start;
3383 }
3384 __ align(CodeEntryAlignment);
3385 StubCodeMark mark(this, stub_id);
3386 Label L_doLast;
3387
3388 const Register from = c_rarg0; // source array address
3389 const Register to = c_rarg1; // destination array address
3390 const Register key = c_rarg2; // key array address
3391 const Register keylen = rscratch1;
3392
3393 start = __ pc();
3394 __ enter(); // required for proper stackwalking of RuntimeStub frame
3395
3396 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3397
3398 __ aesecb_decrypt(from, to, key, keylen);
3399
3400 __ mov(r0, 0);
3401
3402 __ leave();
3403 __ ret(lr);
3404
3405 // record the stub entry and end
3406 store_archive_data(stub_id, start, __ pc());
3407
3408 return start;
3409 }
3410
3411 // Arguments:
3412 //
3413 // Inputs:
3414 // c_rarg0 - source byte array address
3415 // c_rarg1 - destination byte array address
3416 // c_rarg2 - sessionKe (key) in little endian int array
3417 // c_rarg3 - r vector byte array address
3418 // c_rarg4 - input length
3419 //
3420 // Output:
3421 // x0 - input length
3422 //
3423 address generate_cipherBlockChaining_encryptAESCrypt() {
3424 assert(UseAES, "need AES cryptographic extension support");
3425 StubId stub_id = StubId::stubgen_cipherBlockChaining_encryptAESCrypt_id;
3426 int entry_count = StubInfo::entry_count(stub_id);
3427 assert(entry_count == 1, "sanity check");
3428 address start = load_archive_data(stub_id);
3429 if (start != nullptr) {
3430 return start;
3431 }
3432 __ align(CodeEntryAlignment);
3433 StubCodeMark mark(this, stub_id);
3434
3435 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
3436
3437 const Register from = c_rarg0; // source array address
3438 const Register to = c_rarg1; // destination array address
3439 const Register key = c_rarg2; // key array address
3440 const Register rvec = c_rarg3; // r byte array initialized from initvector array address
3441 // and left with the results of the last encryption block
3442 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16)
3443 const Register keylen = rscratch1;
3444
3445 start = __ pc();
3446
3447 __ enter();
3448
3449 __ movw(rscratch2, len_reg);
3450
3451 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3452
3453 __ ld1(v0, __ T16B, rvec);
3454
3455 __ cmpw(keylen, 52);
3456 __ br(Assembler::CC, L_loadkeys_44);
3457 __ br(Assembler::EQ, L_loadkeys_52);
3458
3459 __ ld1(v17, v18, __ T16B, __ post(key, 32));
3460 __ rev32(v17, __ T16B, v17);
3461 __ rev32(v18, __ T16B, v18);
3462 __ BIND(L_loadkeys_52);
3463 __ ld1(v19, v20, __ T16B, __ post(key, 32));
3464 __ rev32(v19, __ T16B, v19);
3465 __ rev32(v20, __ T16B, v20);
3466 __ BIND(L_loadkeys_44);
3467 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
3468 __ rev32(v21, __ T16B, v21);
3469 __ rev32(v22, __ T16B, v22);
3470 __ rev32(v23, __ T16B, v23);
3471 __ rev32(v24, __ T16B, v24);
3472 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
3473 __ rev32(v25, __ T16B, v25);
3474 __ rev32(v26, __ T16B, v26);
3475 __ rev32(v27, __ T16B, v27);
3476 __ rev32(v28, __ T16B, v28);
3477 __ ld1(v29, v30, v31, __ T16B, key);
3478 __ rev32(v29, __ T16B, v29);
3479 __ rev32(v30, __ T16B, v30);
3480 __ rev32(v31, __ T16B, v31);
3481
3482 __ BIND(L_aes_loop);
3483 __ ld1(v1, __ T16B, __ post(from, 16));
3484 __ eor(v0, __ T16B, v0, v1);
3485
3486 __ br(Assembler::CC, L_rounds_44);
3487 __ br(Assembler::EQ, L_rounds_52);
3488
3489 __ aese(v0, v17); __ aesmc(v0, v0);
3490 __ aese(v0, v18); __ aesmc(v0, v0);
3491 __ BIND(L_rounds_52);
3492 __ aese(v0, v19); __ aesmc(v0, v0);
3493 __ aese(v0, v20); __ aesmc(v0, v0);
3494 __ BIND(L_rounds_44);
3495 __ aese(v0, v21); __ aesmc(v0, v0);
3496 __ aese(v0, v22); __ aesmc(v0, v0);
3497 __ aese(v0, v23); __ aesmc(v0, v0);
3498 __ aese(v0, v24); __ aesmc(v0, v0);
3499 __ aese(v0, v25); __ aesmc(v0, v0);
3500 __ aese(v0, v26); __ aesmc(v0, v0);
3501 __ aese(v0, v27); __ aesmc(v0, v0);
3502 __ aese(v0, v28); __ aesmc(v0, v0);
3503 __ aese(v0, v29); __ aesmc(v0, v0);
3504 __ aese(v0, v30);
3505 __ eor(v0, __ T16B, v0, v31);
3506
3507 __ st1(v0, __ T16B, __ post(to, 16));
3508
3509 __ subw(len_reg, len_reg, 16);
3510 __ cbnzw(len_reg, L_aes_loop);
3511
3512 __ st1(v0, __ T16B, rvec);
3513
3514 __ mov(r0, rscratch2);
3515
3516 __ leave();
3517 __ ret(lr);
3518
3519 // record the stub entry and end
3520 store_archive_data(stub_id, start, __ pc());
3521
3522 return start;
3523 }
3524
3525 // Arguments:
3526 //
3527 // Inputs:
3528 // c_rarg0 - source byte array address
3529 // c_rarg1 - destination byte array address
3530 // c_rarg2 - sessionKd (key) in little endian int array
3531 // c_rarg3 - r vector byte array address
3532 // c_rarg4 - input length
3533 //
3534 // Output:
3535 // r0 - input length
3536 //
3537 address generate_cipherBlockChaining_decryptAESCrypt() {
3538 assert(UseAES, "need AES cryptographic extension support");
3539 StubId stub_id = StubId::stubgen_cipherBlockChaining_decryptAESCrypt_id;
3540 int entry_count = StubInfo::entry_count(stub_id);
3541 assert(entry_count == 1, "sanity check");
3542 address start = load_archive_data(stub_id);
3543 if (start != nullptr) {
3544 return start;
3545 }
3546 __ align(CodeEntryAlignment);
3547 StubCodeMark mark(this, stub_id);
3548
3549 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
3550
3551 const Register from = c_rarg0; // source array address
3552 const Register to = c_rarg1; // destination array address
3553 const Register key = c_rarg2; // key array address
3554 const Register rvec = c_rarg3; // r byte array initialized from initvector array address
3555 // and left with the results of the last encryption block
3556 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16)
3557 const Register keylen = rscratch1;
3558
3559 start = __ pc();
3560
3561 __ enter();
3562
3563 __ movw(rscratch2, len_reg);
3564
3565 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3566
3567 __ ld1(v2, __ T16B, rvec);
3568
3569 __ ld1(v31, __ T16B, __ post(key, 16));
3570 __ rev32(v31, __ T16B, v31);
3571
3572 __ cmpw(keylen, 52);
3573 __ br(Assembler::CC, L_loadkeys_44);
3574 __ br(Assembler::EQ, L_loadkeys_52);
3575
3576 __ ld1(v17, v18, __ T16B, __ post(key, 32));
3577 __ rev32(v17, __ T16B, v17);
3578 __ rev32(v18, __ T16B, v18);
3579 __ BIND(L_loadkeys_52);
3580 __ ld1(v19, v20, __ T16B, __ post(key, 32));
3581 __ rev32(v19, __ T16B, v19);
3582 __ rev32(v20, __ T16B, v20);
3583 __ BIND(L_loadkeys_44);
3584 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
3585 __ rev32(v21, __ T16B, v21);
3586 __ rev32(v22, __ T16B, v22);
3587 __ rev32(v23, __ T16B, v23);
3588 __ rev32(v24, __ T16B, v24);
3589 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
3590 __ rev32(v25, __ T16B, v25);
3591 __ rev32(v26, __ T16B, v26);
3592 __ rev32(v27, __ T16B, v27);
3593 __ rev32(v28, __ T16B, v28);
3594 __ ld1(v29, v30, __ T16B, key);
3595 __ rev32(v29, __ T16B, v29);
3596 __ rev32(v30, __ T16B, v30);
3597
3598 __ BIND(L_aes_loop);
3599 __ ld1(v0, __ T16B, __ post(from, 16));
3600 __ orr(v1, __ T16B, v0, v0);
3601
3602 __ br(Assembler::CC, L_rounds_44);
3603 __ br(Assembler::EQ, L_rounds_52);
3604
3605 __ aesd(v0, v17); __ aesimc(v0, v0);
3606 __ aesd(v0, v18); __ aesimc(v0, v0);
3607 __ BIND(L_rounds_52);
3608 __ aesd(v0, v19); __ aesimc(v0, v0);
3609 __ aesd(v0, v20); __ aesimc(v0, v0);
3610 __ BIND(L_rounds_44);
3611 __ aesd(v0, v21); __ aesimc(v0, v0);
3612 __ aesd(v0, v22); __ aesimc(v0, v0);
3613 __ aesd(v0, v23); __ aesimc(v0, v0);
3614 __ aesd(v0, v24); __ aesimc(v0, v0);
3615 __ aesd(v0, v25); __ aesimc(v0, v0);
3616 __ aesd(v0, v26); __ aesimc(v0, v0);
3617 __ aesd(v0, v27); __ aesimc(v0, v0);
3618 __ aesd(v0, v28); __ aesimc(v0, v0);
3619 __ aesd(v0, v29); __ aesimc(v0, v0);
3620 __ aesd(v0, v30);
3621 __ eor(v0, __ T16B, v0, v31);
3622 __ eor(v0, __ T16B, v0, v2);
3623
3624 __ st1(v0, __ T16B, __ post(to, 16));
3625 __ orr(v2, __ T16B, v1, v1);
3626
3627 __ subw(len_reg, len_reg, 16);
3628 __ cbnzw(len_reg, L_aes_loop);
3629
3630 __ st1(v2, __ T16B, rvec);
3631
3632 __ mov(r0, rscratch2);
3633
3634 __ leave();
3635 __ ret(lr);
3636
3637 // record the stub entry and end
3638 store_archive_data(stub_id, start, __ pc());
3639
3640 return start;
3641 }
3642
3643 // Big-endian 128-bit + 64-bit -> 128-bit addition.
3644 // Inputs: 128-bits. in is preserved.
3645 // The least-significant 64-bit word is in the upper dword of each vector.
3646 // inc (the 64-bit increment) is preserved. Its lower dword must be zero.
3647 // Output: result
3648 void be_add_128_64(FloatRegister result, FloatRegister in,
3649 FloatRegister inc, FloatRegister tmp) {
3650 assert_different_registers(result, tmp, inc);
3651
3652 __ addv(result, __ T2D, in, inc); // Add inc to the least-significant dword of
3653 // input
3654 __ cm(__ HI, tmp, __ T2D, inc, result);// Check for result overflowing
3655 __ ext(tmp, __ T16B, tmp, tmp, 0x08); // Swap LSD of comparison result to MSD and
3656 // MSD == 0 (must be!) to LSD
3657 __ subv(result, __ T2D, result, tmp); // Subtract -1 from MSD if there was an overflow
3658 }
3659
3660 // CTR AES crypt.
3661 // Arguments:
3662 //
3663 // Inputs:
3664 // c_rarg0 - source byte array address
3665 // c_rarg1 - destination byte array address
3666 // c_rarg2 - sessionKe (key) in little endian int array
3667 // c_rarg3 - counter vector byte array address
3668 // c_rarg4 - input length
//   c_rarg5   - address of the saved encryptedCounter buffer
//   c_rarg6   - address of the saved 'used' length (int; read on entry, written back on exit)
3671 //
3672 // Output:
3673 // r0 - input length
3674 //
address generate_counterMode_AESCrypt() {
  // Generates the AES/CTR stub. If load_archive_data() returns an archived
  // copy of the stub it is used as is; otherwise the code is emitted below
  // and its [start, end) range is recorded via store_archive_data().
  StubId stub_id = StubId::stubgen_counterMode_AESCrypt_id;
  int entry_count = StubInfo::entry_count(stub_id);
  assert(entry_count == 1, "sanity check");
  address start = load_archive_data(stub_id);
  if (start != nullptr) {
    return start;
  }
  const Register in = c_rarg0;
  const Register out = c_rarg1;
  const Register key = c_rarg2;
  const Register counter = c_rarg3;
  // saved_len keeps the incoming length (it becomes the return value);
  // len is the working copy that is counted down.
  const Register saved_len = c_rarg4, len = r10;
  const Register saved_encrypted_ctr = c_rarg5;
  // used_ptr is the address of the 'used' int; its value is loaded into
  // 'used' on entry and stored back through used_ptr at DONE.
  const Register used_ptr = c_rarg6, used = r12;

  // Byte offset applied to both 'in' and 'out' on the slow paths.
  const Register offset = r7;
  const Register keylen = r11;

  const unsigned char block_size = 16;
  const int bulk_width = 4;
  // NB: bulk_width can be 4 or 8. 8 gives slightly faster
  // performance with larger data sizes, but it also means that the
  // fast path isn't used until you have at least 8 blocks, and up
  // to 127 bytes of data will be executed on the slow path. For
  // that reason, and also so as not to blow away too much icache, 4
  // blocks seems like a sensible compromise.

  // Algorithm:
  //
  //    if (len == 0) {
  //        goto DONE;
  //    }
  //    int result = len;
  //    do {
  //        if (used >= blockSize) {
  //            if (len >= bulk_width * blockSize) {
  //                CTR_large_block();
  //                if (len == 0)
  //                    goto DONE;
  //            }
  //            for (;;) {
  //                16ByteVector v0 = counter;
  //                embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0);
  //                used = 0;
  //                if (len < blockSize)
  //                    break;    /* goto NEXT */
  //                16ByteVector v1 = load16Bytes(in, offset);
  //                v1 = v1 ^ encryptedCounter;
  //                store16Bytes(out, offset);
  //                used = blockSize;
  //                offset += blockSize;
  //                len -= blockSize;
  //                if (len == 0)
  //                    goto DONE;
  //            }
  //        }
  //      NEXT:
  //        out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]);
  //        len--;
  //    } while (len != 0);
  //  DONE:
  //    return result;
  //
  // CTR_large_block()
  //    Wide bulk encryption of whole blocks.

  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, stub_id);
  start = __ pc();
  __ enter();

  Label DONE, CTR_large_block, large_block_return;
  // 'used' is loaded before the zero-length check because DONE always
  // stores it back through used_ptr.
  __ ldrw(used, Address(used_ptr));
  __ cbzw(saved_len, DONE);

  __ mov(len, saved_len);
  __ mov(offset, 0);

  // Compute #rounds for AES based on the length of the key array
  __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));

  __ aesenc_loadkeys(key, keylen);

  {
    Label L_CTR_loop, NEXT;

    __ bind(L_CTR_loop);

    // While used < block_size there are still unconsumed bytes in the
    // saved encrypted counter: take the byte-at-a-time path.
    __ cmp(used, block_size);
    __ br(__ LO, NEXT);

    // Maybe we have a lot of data
    // HS after the subtract means len >= bulk_width * block_size; the
    // difference itself (rscratch1) is not used.
    __ subsw(rscratch1, len, bulk_width * block_size);
    __ br(__ HS, CTR_large_block);
    __ BIND(large_block_return);
    __ cbzw(len, DONE);

    // Setup the counter
    __ movi(v4, __ T4S, 0);
    __ movi(v5, __ T4S, 1);
    __ ins(v4, __ S, v5, 2, 2); // v4 contains { 0, 1 }

    // 128-bit big-endian increment
    __ ld1(v0, __ T16B, counter);
    __ rev64(v16, __ T16B, v0);
    be_add_128_64(v16, v16, v4, /*tmp*/v5);
    __ rev64(v16, __ T16B, v16);
    __ st1(v16, __ T16B, counter);
    // Previous counter value is in v0
    // v4 contains { 0, 1 }

    {
      // We have fewer than bulk_width blocks of data left. Encrypt
      // them one by one until there is less than a full block
      // remaining, being careful to save both the encrypted counter
      // and the counter.

      Label inner_loop;
      __ bind(inner_loop);
      // Counter to encrypt is in v0
      __ aesecb_encrypt(noreg, noreg, keylen);
      __ st1(v0, __ T16B, saved_encrypted_ctr);

      // Do we have a remaining full block?

      // used = 0: none of the freshly saved encrypted counter has been
      // consumed yet, which is what NEXT expects for a partial block.
      __ mov(used, 0);
      __ cmp(len, block_size);
      __ br(__ LO, NEXT);

      // Yes, we have a full block
      __ ldrq(v1, Address(in, offset));
      __ eor(v1, __ T16B, v1, v0);
      __ strq(v1, Address(out, offset));
      __ mov(used, block_size);
      __ add(offset, offset, block_size);

      __ subw(len, len, block_size);
      __ cbzw(len, DONE);

      // Increment the counter, store it back
      // v16 holds the counter value that was stored last; it becomes the
      // next value to encrypt (v0) before v16 is bumped again.
      __ orr(v0, __ T16B, v16, v16);
      __ rev64(v16, __ T16B, v16);
      be_add_128_64(v16, v16, v4, /*tmp*/v5);
      __ rev64(v16, __ T16B, v16);
      __ st1(v16, __ T16B, counter); // Save the incremented counter back

      __ b(inner_loop);
    }

    __ BIND(NEXT);

    // Encrypt a single byte, and loop.
    // We expect this to be a rare event.
    __ ldrb(rscratch1, Address(in, offset));
    __ ldrb(rscratch2, Address(saved_encrypted_ctr, used));
    __ eor(rscratch1, rscratch1, rscratch2);
    __ strb(rscratch1, Address(out, offset));
    __ add(offset, offset, 1);
    __ add(used, used, 1);
    __ subw(len, len,1);
    __ cbnzw(len, L_CTR_loop);
  }

  __ bind(DONE);
  // Publish the updated 'used' count and return the original length.
  __ strw(used, Address(used_ptr));
  __ mov(r0, saved_len);

  __ leave(); // required for proper stackwalking of RuntimeStub frame
  __ ret(lr);

  // Bulk encryption
  // Out-of-line path: entered by the branch above and left by the branch
  // back to large_block_return.

  __ BIND (CTR_large_block);
  assert(bulk_width == 4 || bulk_width == 8, "must be");

  // v8..v11 (and v12..v15 when bulk_width == 8) are spilled here and
  // reloaded before leaving this path; the loop uses them for input data.
  if (bulk_width == 8) {
    __ sub(sp, sp, 4 * 16);
    __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
  }
  __ sub(sp, sp, 4 * 16);
  __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
  // in, out and len are modified by the bulk loop, so they are saved and
  // restored around it together with the other registers in this set.
  RegSet saved_regs = (RegSet::of(in, out, offset)
                       + RegSet::of(saved_encrypted_ctr, used_ptr, len));
  __ push(saved_regs, sp);
  // Round len down to a whole number of bulk chunks for the loop below.
  __ andr(len, len, -16 * bulk_width);  // 8/4 encryptions, 16 bytes per encryption
  __ add(in, in, offset);
  __ add(out, out, offset);

  // Keys should already be loaded into the correct registers

  __ ld1(v0, __ T16B, counter); // v0 contains the first counter
  __ rev64(v16, __ T16B, v0); // v16 contains byte-reversed counter

  // AES/CTR loop
  {
    Label L_CTR_loop;
    __ BIND(L_CTR_loop);

    // Setup the counters
    __ movi(v8, __ T4S, 0);
    __ movi(v9, __ T4S, 1);
    __ ins(v8, __ S, v9, 2, 2); // v8 contains { 0, 1 }

    // Materialize bulk_width consecutive counter values in v0, v1, ...
    // while advancing the byte-reversed counter in v16.
    for (int i = 0; i < bulk_width; i++) {
      FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
      __ rev64(v0_ofs, __ T16B, v16);
      be_add_128_64(v16, v16, v8, /*tmp*/v9);
    }

    // v8/v9 are free again: reuse v8.. for the input data.
    __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));

    // Encrypt the counters
    __ aesecb_encrypt(noreg, noreg, keylen, v0, bulk_width);

    if (bulk_width == 8) {
      __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
    }

    // XOR the encrypted counters with the inputs
    for (int i = 0; i < bulk_width; i++) {
      FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
      FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
      __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
    }

    // Write the encrypted data
    __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
    if (bulk_width == 8) {
      __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
    }

    __ subw(len, len, 16 * bulk_width);
    __ cbnzw(len, L_CTR_loop);
  }

  // Save the counter back where it goes
  __ rev64(v16, __ T16B, v16);
  __ st1(v16, __ T16B, counter);

  __ pop(saved_regs, sp);

  __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
  if (bulk_width == 8) {
    __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
  }

  // len was restored by the pop; recompute the number of bytes the bulk
  // loop handled and account for it in len and offset.
  __ andr(rscratch1, len, -16 * bulk_width);
  __ sub(len, len, rscratch1);
  __ add(offset, offset, rscratch1);
  // Set used to a full block (16) and write it back through used_ptr
  // before rejoining the main path.
  __ mov(used, 16);
  __ strw(used, Address(used_ptr));
  __ b(large_block_return);

  // record the stub entry and end
  store_archive_data(stub_id, start, __ pc());

  return start;
}
3934
3935 // Vector AES Galois Counter Mode implementation. Parameters:
3936 //
3937 // in = c_rarg0
3938 // len = c_rarg1
3939 // ct = c_rarg2 - ciphertext that ghash will read (in for encrypt, out for decrypt)
3940 // out = c_rarg3
3941 // key = c_rarg4
3942 // state = c_rarg5 - GHASH.state
3943 // subkeyHtbl = c_rarg6 - powers of H
3944 // counter = c_rarg7 - 16 bytes of CTR
3945 // return - number of processed bytes
address generate_galoisCounterMode_AESCrypt() {
  // Generates the AES/GCM stub: a CTR pass over the input in chunks of
  // 8 blocks, followed by a GHASH pass over 'ct'. If load_archive_data()
  // returns an archived copy of the stub it is used as is; otherwise the
  // code is emitted below and its range recorded via store_archive_data().
  Label ghash_polynomial; // local data generated after code
  StubId stub_id = StubId::stubgen_galoisCounterMode_AESCrypt_id;
  int entry_count = StubInfo::entry_count(stub_id);
  assert(entry_count == 1, "sanity check");
  address start = load_archive_data(stub_id);
  if (start != nullptr) {
    return start;
  }
  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, stub_id);
  start = __ pc();
  __ enter();

  const Register in = c_rarg0;
  const Register len = c_rarg1;
  const Register ct = c_rarg2;
  const Register out = c_rarg3;

  const Register key = c_rarg4;
  const Register state = c_rarg5;

  const Register subkeyHtbl = c_rarg6;

  // 16 bytes of CTR; overwritten with the incremented counter once the
  // CTR loop has finished.
  const Register counter = c_rarg7;

  const Register keylen = r10;
  // Save state before entering routine
  // v8..v15 are spilled here and reloaded before returning; the CTR loop
  // below uses them for the counter increment and the input data.
  __ sub(sp, sp, 4 * 16);
  __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
  __ sub(sp, sp, 4 * 16);
  __ st1(v8, v9, v10, v11, __ T16B, Address(sp));

  // Only whole chunks of 8 blocks are processed; the rounded-down length
  // is kept on the stack because it is needed again for the GHASH block
  // count and as the return value.
  __ andr(len, len, -16 * 8);  // 8 encryptions, 16 bytes per encryption
  __ str(len, __ pre(sp, -2 * wordSize));

  Label DONE;
  __ cbz(len, DONE);

  // Compute #rounds for AES based on the length of the key array
  __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));

  __ aesenc_loadkeys(key, keylen);
  __ ld1(v0, __ T16B, counter); // v0 contains the first counter
  __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter

  // AES/CTR loop
  {
    Label L_CTR_loop;
    __ BIND(L_CTR_loop);

    // Setup the counters
    __ movi(v8, __ T4S, 0);
    __ movi(v9, __ T4S, 1);
    __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 }

    // Materialize 8 consecutive counter values in v0..v7. The add is a
    // lane-wise 32-bit add, so only the last 32-bit word of the counter is
    // incremented and no carry propagates into the other words.
    assert(v0->encoding() < v8->encoding(), "");
    for (int i = v0->encoding(); i < v8->encoding(); i++) {
      FloatRegister f = as_FloatRegister(i);
      __ rev32(f, __ T16B, v16);
      __ addv(v16, __ T4S, v16, v8);
    }

    // v8/v9 are free again: reuse v8..v11 for the first half of the input.
    __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));

    // Encrypt the counters
    __ aesecb_encrypt(noreg, noreg, keylen, v0, /*unrolls*/8);

    __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));

    // XOR the encrypted counters with the inputs
    for (int i = 0; i < 8; i++) {
      FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
      FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
      __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
    }
    __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
    __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));

    __ subw(len, len, 16 * 8);
    __ cbnzw(len, L_CTR_loop);
  }

  // Store the advanced counter back in its original byte order.
  __ rev32(v16, __ T16B, v16);
  __ st1(v16, __ T16B, counter);

  // Reload the rounded length saved above (len was counted down to zero).
  __ ldr(len, Address(sp));
  __ lsr(len, len, exact_log2(16));  // We want the count of blocks

  // GHASH/CTR loop
  __ ghash_processBlocks_wide(ghash_polynomial, state, subkeyHtbl, ct,
                              len, /*unrolls*/4);

#ifdef ASSERT
  // Debug builds stop if len is non-zero at this point.
  // NOTE(review): presumably ghash_processBlocks_wide counts len down to
  // zero when every block was consumed - confirm against its definition.
  { Label L;
    __ cmp(len, (unsigned char)0);
    __ br(Assembler::EQ, L);
    __ stop("stubGenerator: abort");
    __ bind(L);
  }
#endif

  __ bind(DONE);
  // Return the number of bytes processed
  // (the rounded length pushed above; popping it also releases its slot)
  __ ldr(r0, __ post(sp, 2 * wordSize));

  __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
  __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));

  __ leave(); // required for proper stackwalking of RuntimeStub frame
  __ ret(lr);

  // bind label and generate polynomial data
  // The data is placed after the final ret, so it is never executed.
  __ align(wordSize * 2);
  __ bind(ghash_polynomial);
  __ emit_int64(0x87);  // The low-order bits of the field
                        // polynomial (i.e. p = z^7+z^2+z+1)
                        // repeated in the low and high parts of a
                        // 128-bit vector
  __ emit_int64(0x87);

  // record the stub entry and end
  store_archive_data(stub_id, start, __ pc());

  return start;
}
4074
4075 class Cached64Bytes {
4076 private:
4077 MacroAssembler *_masm;
4078 Register _regs[8];
4079
4080 public:
4081 Cached64Bytes(MacroAssembler *masm, RegSet rs): _masm(masm) {
4082 assert(rs.size() == 8, "%u registers are used to cache 16 4-byte data", rs.size());
4083 auto it = rs.begin();
4084 for (auto &r: _regs) {
4085 r = *it;
4086 ++it;
4087 }
4088 }
4089
4090 void gen_loads(Register base) {
4091 for (int i = 0; i < 8; i += 2) {
4092 __ ldp(_regs[i], _regs[i + 1], Address(base, 8 * i));
4093 }
4094 }
4095
4096 // Generate code extracting i-th unsigned word (4 bytes) from cached 64 bytes.
4097 void extract_u32(Register dest, int i) {
4098 __ ubfx(dest, _regs[i / 2], 32 * (i % 2), 32);
4099 }
4100 };
4101
// Utility routines for md5.
// Each routine clobbers r10, r11, rscratch1 and rscratch2.
4104 void md5_FF(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
4105 int k, int s, int t) {
4106 Register rscratch3 = r10;
4107 Register rscratch4 = r11;
4108
4109 __ eorw(rscratch3, r3, r4);
4110 __ movw(rscratch2, t);
4111 __ andw(rscratch3, rscratch3, r2);
4112 __ addw(rscratch4, r1, rscratch2);
4113 reg_cache.extract_u32(rscratch1, k);
4114 __ eorw(rscratch3, rscratch3, r4);
4115 __ addw(rscratch4, rscratch4, rscratch1);
4116 __ addw(rscratch3, rscratch3, rscratch4);
4117 __ rorw(rscratch2, rscratch3, 32 - s);
4118 __ addw(r1, rscratch2, r2);
4119 }
4120
4121 void md5_GG(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
4122 int k, int s, int t) {
4123 Register rscratch3 = r10;
4124 Register rscratch4 = r11;
4125
4126 reg_cache.extract_u32(rscratch1, k);
4127 __ movw(rscratch2, t);
4128 __ addw(rscratch4, r1, rscratch2);
4129 __ addw(rscratch4, rscratch4, rscratch1);
4130 __ bicw(rscratch2, r3, r4);
4131 __ andw(rscratch3, r2, r4);
4132 __ addw(rscratch2, rscratch2, rscratch4);
4133 __ addw(rscratch2, rscratch2, rscratch3);
4134 __ rorw(rscratch2, rscratch2, 32 - s);
4135 __ addw(r1, rscratch2, r2);
4136 }
4137
4138 void md5_HH(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
4139 int k, int s, int t) {
4140 Register rscratch3 = r10;
4141 Register rscratch4 = r11;
4142
4143 __ eorw(rscratch3, r3, r4);
4144 __ movw(rscratch2, t);
4145 __ addw(rscratch4, r1, rscratch2);
4146 reg_cache.extract_u32(rscratch1, k);
4147 __ eorw(rscratch3, rscratch3, r2);
4148 __ addw(rscratch4, rscratch4, rscratch1);
4149 __ addw(rscratch3, rscratch3, rscratch4);
4150 __ rorw(rscratch2, rscratch3, 32 - s);
4151 __ addw(r1, rscratch2, r2);
4152 }
4153
4154 void md5_II(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
4155 int k, int s, int t) {
4156 Register rscratch3 = r10;
4157 Register rscratch4 = r11;
4158
4159 __ movw(rscratch3, t);
4160 __ ornw(rscratch2, r2, r4);
4161 __ addw(rscratch4, r1, rscratch3);
4162 reg_cache.extract_u32(rscratch1, k);
4163 __ eorw(rscratch3, rscratch2, r3);
4164 __ addw(rscratch4, rscratch4, rscratch1);
4165 __ addw(rscratch3, rscratch3, rscratch4);
4166 __ rorw(rscratch2, rscratch3, 32 - s);
4167 __ addw(r1, rscratch2, r2);
4168 }
4169
4170 // Arguments:
4171 //
4172 // Inputs:
4173 // c_rarg0 - byte[] source+offset
//   c_rarg1   - int[]  MD5.state
4175 // c_rarg2 - int offset
4176 // c_rarg3 - int limit
4177 //
4178 address generate_md5_implCompress(StubId stub_id) {
4179 bool multi_block;
4180 switch (stub_id) {
4181 case StubId::stubgen_md5_implCompress_id:
4182 multi_block = false;
4183 break;
4184 case StubId::stubgen_md5_implCompressMB_id:
4185 multi_block = true;
4186 break;
4187 default:
4188 ShouldNotReachHere();
4189 }
4190 int entry_count = StubInfo::entry_count(stub_id);
4191 assert(entry_count == 1, "sanity check");
4192 address start = load_archive_data(stub_id);
4193 if (start != nullptr) {
4194 return start;
4195 }
4196 __ align(CodeEntryAlignment);
4197
4198 StubCodeMark mark(this, stub_id);
4199 start = __ pc();
4200
4201 Register buf = c_rarg0;
4202 Register state = c_rarg1;
4203 Register ofs = c_rarg2;
4204 Register limit = c_rarg3;
4205 Register a = r4;
4206 Register b = r5;
4207 Register c = r6;
4208 Register d = r7;
4209 Register rscratch3 = r10;
4210 Register rscratch4 = r11;
4211
4212 Register state_regs[2] = { r12, r13 };
4213 RegSet saved_regs = RegSet::range(r16, r22) - r18_tls;
4214 Cached64Bytes reg_cache(_masm, RegSet::of(r14, r15) + saved_regs); // using 8 registers
4215
4216 __ push(saved_regs, sp);
4217
4218 __ ldp(state_regs[0], state_regs[1], Address(state));
4219 __ ubfx(a, state_regs[0], 0, 32);
4220 __ ubfx(b, state_regs[0], 32, 32);
4221 __ ubfx(c, state_regs[1], 0, 32);
4222 __ ubfx(d, state_regs[1], 32, 32);
4223
4224 Label md5_loop;
4225 __ BIND(md5_loop);
4226
4227 reg_cache.gen_loads(buf);
4228
4229 // Round 1
4230 md5_FF(reg_cache, a, b, c, d, 0, 7, 0xd76aa478);
4231 md5_FF(reg_cache, d, a, b, c, 1, 12, 0xe8c7b756);
4232 md5_FF(reg_cache, c, d, a, b, 2, 17, 0x242070db);
4233 md5_FF(reg_cache, b, c, d, a, 3, 22, 0xc1bdceee);
4234 md5_FF(reg_cache, a, b, c, d, 4, 7, 0xf57c0faf);
4235 md5_FF(reg_cache, d, a, b, c, 5, 12, 0x4787c62a);
4236 md5_FF(reg_cache, c, d, a, b, 6, 17, 0xa8304613);
4237 md5_FF(reg_cache, b, c, d, a, 7, 22, 0xfd469501);
4238 md5_FF(reg_cache, a, b, c, d, 8, 7, 0x698098d8);
4239 md5_FF(reg_cache, d, a, b, c, 9, 12, 0x8b44f7af);
4240 md5_FF(reg_cache, c, d, a, b, 10, 17, 0xffff5bb1);
4241 md5_FF(reg_cache, b, c, d, a, 11, 22, 0x895cd7be);
4242 md5_FF(reg_cache, a, b, c, d, 12, 7, 0x6b901122);
4243 md5_FF(reg_cache, d, a, b, c, 13, 12, 0xfd987193);
4244 md5_FF(reg_cache, c, d, a, b, 14, 17, 0xa679438e);
4245 md5_FF(reg_cache, b, c, d, a, 15, 22, 0x49b40821);
4246
4247 // Round 2
4248 md5_GG(reg_cache, a, b, c, d, 1, 5, 0xf61e2562);
4249 md5_GG(reg_cache, d, a, b, c, 6, 9, 0xc040b340);
4250 md5_GG(reg_cache, c, d, a, b, 11, 14, 0x265e5a51);
4251 md5_GG(reg_cache, b, c, d, a, 0, 20, 0xe9b6c7aa);
4252 md5_GG(reg_cache, a, b, c, d, 5, 5, 0xd62f105d);
4253 md5_GG(reg_cache, d, a, b, c, 10, 9, 0x02441453);
4254 md5_GG(reg_cache, c, d, a, b, 15, 14, 0xd8a1e681);
4255 md5_GG(reg_cache, b, c, d, a, 4, 20, 0xe7d3fbc8);
4256 md5_GG(reg_cache, a, b, c, d, 9, 5, 0x21e1cde6);
4257 md5_GG(reg_cache, d, a, b, c, 14, 9, 0xc33707d6);
4258 md5_GG(reg_cache, c, d, a, b, 3, 14, 0xf4d50d87);
4259 md5_GG(reg_cache, b, c, d, a, 8, 20, 0x455a14ed);
4260 md5_GG(reg_cache, a, b, c, d, 13, 5, 0xa9e3e905);
4261 md5_GG(reg_cache, d, a, b, c, 2, 9, 0xfcefa3f8);
4262 md5_GG(reg_cache, c, d, a, b, 7, 14, 0x676f02d9);
4263 md5_GG(reg_cache, b, c, d, a, 12, 20, 0x8d2a4c8a);
4264
4265 // Round 3
4266 md5_HH(reg_cache, a, b, c, d, 5, 4, 0xfffa3942);
4267 md5_HH(reg_cache, d, a, b, c, 8, 11, 0x8771f681);
4268 md5_HH(reg_cache, c, d, a, b, 11, 16, 0x6d9d6122);
4269 md5_HH(reg_cache, b, c, d, a, 14, 23, 0xfde5380c);
4270 md5_HH(reg_cache, a, b, c, d, 1, 4, 0xa4beea44);
4271 md5_HH(reg_cache, d, a, b, c, 4, 11, 0x4bdecfa9);
4272 md5_HH(reg_cache, c, d, a, b, 7, 16, 0xf6bb4b60);
4273 md5_HH(reg_cache, b, c, d, a, 10, 23, 0xbebfbc70);
4274 md5_HH(reg_cache, a, b, c, d, 13, 4, 0x289b7ec6);
4275 md5_HH(reg_cache, d, a, b, c, 0, 11, 0xeaa127fa);
4276 md5_HH(reg_cache, c, d, a, b, 3, 16, 0xd4ef3085);
4277 md5_HH(reg_cache, b, c, d, a, 6, 23, 0x04881d05);
4278 md5_HH(reg_cache, a, b, c, d, 9, 4, 0xd9d4d039);
4279 md5_HH(reg_cache, d, a, b, c, 12, 11, 0xe6db99e5);
4280 md5_HH(reg_cache, c, d, a, b, 15, 16, 0x1fa27cf8);
4281 md5_HH(reg_cache, b, c, d, a, 2, 23, 0xc4ac5665);
4282
4283 // Round 4
4284 md5_II(reg_cache, a, b, c, d, 0, 6, 0xf4292244);
4285 md5_II(reg_cache, d, a, b, c, 7, 10, 0x432aff97);
4286 md5_II(reg_cache, c, d, a, b, 14, 15, 0xab9423a7);
4287 md5_II(reg_cache, b, c, d, a, 5, 21, 0xfc93a039);
4288 md5_II(reg_cache, a, b, c, d, 12, 6, 0x655b59c3);
4289 md5_II(reg_cache, d, a, b, c, 3, 10, 0x8f0ccc92);
4290 md5_II(reg_cache, c, d, a, b, 10, 15, 0xffeff47d);
4291 md5_II(reg_cache, b, c, d, a, 1, 21, 0x85845dd1);
4292 md5_II(reg_cache, a, b, c, d, 8, 6, 0x6fa87e4f);
4293 md5_II(reg_cache, d, a, b, c, 15, 10, 0xfe2ce6e0);
4294 md5_II(reg_cache, c, d, a, b, 6, 15, 0xa3014314);
4295 md5_II(reg_cache, b, c, d, a, 13, 21, 0x4e0811a1);
4296 md5_II(reg_cache, a, b, c, d, 4, 6, 0xf7537e82);
4297 md5_II(reg_cache, d, a, b, c, 11, 10, 0xbd3af235);
4298 md5_II(reg_cache, c, d, a, b, 2, 15, 0x2ad7d2bb);
4299 md5_II(reg_cache, b, c, d, a, 9, 21, 0xeb86d391);
4300
4301 __ addw(a, state_regs[0], a);
4302 __ ubfx(rscratch2, state_regs[0], 32, 32);
4303 __ addw(b, rscratch2, b);
4304 __ addw(c, state_regs[1], c);
4305 __ ubfx(rscratch4, state_regs[1], 32, 32);
4306 __ addw(d, rscratch4, d);
4307
4308 __ orr(state_regs[0], a, b, Assembler::LSL, 32);
4309 __ orr(state_regs[1], c, d, Assembler::LSL, 32);
4310
4311 if (multi_block) {
4312 __ add(buf, buf, 64);
4313 __ add(ofs, ofs, 64);
4314 __ cmp(ofs, limit);
4315 __ br(Assembler::LE, md5_loop);
4316 __ mov(c_rarg0, ofs); // return ofs
4317 }
4318
4319 // write hash values back in the correct order
4320 __ stp(state_regs[0], state_regs[1], Address(state));
4321
4322 __ pop(saved_regs, sp);
4323
4324 __ ret(lr);
4325
4326 // record the stub entry and end
4327 store_archive_data(stub_id, start, __ pc());
4328
4329 return start;
4330 }
4331
4332 // Arguments:
4333 //
4334 // Inputs:
4335 // c_rarg0 - byte[] source+offset
4336 // c_rarg1 - int[] SHA.state
4337 // c_rarg2 - int offset
4338 // c_rarg3 - int limit
4339 //
4340 address generate_sha1_implCompress(StubId stub_id) {
4341 bool multi_block;
4342 switch (stub_id) {
4343 case StubId::stubgen_sha1_implCompress_id:
4344 multi_block = false;
4345 break;
4346 case StubId::stubgen_sha1_implCompressMB_id:
4347 multi_block = true;
4348 break;
4349 default:
4350 ShouldNotReachHere();
4351 }
4352 int entry_count = StubInfo::entry_count(stub_id);
4353 assert(entry_count == 1, "sanity check");
4354 address start = load_archive_data(stub_id);
4355 if (start != nullptr) {
4356 return start;
4357 }
4358 __ align(CodeEntryAlignment);
4359
4360 StubCodeMark mark(this, stub_id);
4361 start = __ pc();
4362
4363 Register buf = c_rarg0;
4364 Register state = c_rarg1;
4365 Register ofs = c_rarg2;
4366 Register limit = c_rarg3;
4367
4368 Label keys;
4369 Label sha1_loop;
4370
4371 // load the keys into v0..v3
4372 __ adr(rscratch1, keys);
4373 __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
4374 // load 5 words state into v6, v7
4375 __ ldrq(v6, Address(state, 0));
4376 __ ldrs(v7, Address(state, 16));
4377
4378
4379 __ BIND(sha1_loop);
4380 // load 64 bytes of data into v16..v19
4381 __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
4382 __ rev32(v16, __ T16B, v16);
4383 __ rev32(v17, __ T16B, v17);
4384 __ rev32(v18, __ T16B, v18);
4385 __ rev32(v19, __ T16B, v19);
4386
4387 // do the sha1
4388 __ addv(v4, __ T4S, v16, v0);
4389 __ orr(v20, __ T16B, v6, v6);
4390
4391 FloatRegister d0 = v16;
4392 FloatRegister d1 = v17;
4393 FloatRegister d2 = v18;
4394 FloatRegister d3 = v19;
4395
4396 for (int round = 0; round < 20; round++) {
4397 FloatRegister tmp1 = (round & 1) ? v4 : v5;
4398 FloatRegister tmp2 = (round & 1) ? v21 : v22;
4399 FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
4400 FloatRegister tmp4 = (round & 1) ? v5 : v4;
4401 FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
4402
4403 if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
4404 if (round < 19) __ addv(tmp1, __ T4S, d1, key);
4405 __ sha1h(tmp2, __ T4S, v20);
4406 if (round < 5)
4407 __ sha1c(v20, __ T4S, tmp3, tmp4);
4408 else if (round < 10 || round >= 15)
4409 __ sha1p(v20, __ T4S, tmp3, tmp4);
4410 else
4411 __ sha1m(v20, __ T4S, tmp3, tmp4);
4412 if (round < 16) __ sha1su1(d0, __ T4S, d3);
4413
4414 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
4415 }
4416
4417 __ addv(v7, __ T2S, v7, v21);
4418 __ addv(v6, __ T4S, v6, v20);
4419
4420 if (multi_block) {
4421 __ add(ofs, ofs, 64);
4422 __ cmp(ofs, limit);
4423 __ br(Assembler::LE, sha1_loop);
4424 __ mov(c_rarg0, ofs); // return ofs
4425 }
4426
4427 __ strq(v6, Address(state, 0));
4428 __ strs(v7, Address(state, 16));
4429
4430 __ ret(lr);
4431
4432 __ bind(keys);
4433 __ emit_int32(0x5a827999);
4434 __ emit_int32(0x6ed9eba1);
4435 __ emit_int32(0x8f1bbcdc);
4436 __ emit_int32(0xca62c1d6);
4437
4438 // record the stub entry and end
4439 store_archive_data(stub_id, start, __ pc());
4440
4441 return start;
4442 }
4443
4444
4445 // Arguments:
4446 //
4447 // Inputs:
4448 // c_rarg0 - byte[] source+offset
4449 // c_rarg1 - int[] SHA.state
4450 // c_rarg2 - int offset
4451 // c_rarg3 - int limit
4452 //
4453 address generate_sha256_implCompress(StubId stub_id) {
4454 bool multi_block;
4455 switch (stub_id) {
4456 case StubId::stubgen_sha256_implCompress_id:
4457 multi_block = false;
4458 break;
4459 case StubId::stubgen_sha256_implCompressMB_id:
4460 multi_block = true;
4461 break;
4462 default:
4463 ShouldNotReachHere();
4464 }
4465 int entry_count = StubInfo::entry_count(stub_id);
4466 assert(entry_count == 1, "sanity check");
4467 address start = load_archive_data(stub_id);
4468 if (start != nullptr) {
4469 return start;
4470 }
4471 __ align(CodeEntryAlignment);
4472 StubCodeMark mark(this, stub_id);
4473 start = __ pc();
4474
4475 Register buf = c_rarg0;
4476 Register state = c_rarg1;
4477 Register ofs = c_rarg2;
4478 Register limit = c_rarg3;
4479
4480 Label sha1_loop;
4481
4482 __ stpd(v8, v9, __ pre(sp, -32));
4483 __ stpd(v10, v11, Address(sp, 16));
4484
4485 // dga == v0
4486 // dgb == v1
4487 // dg0 == v2
4488 // dg1 == v3
4489 // dg2 == v4
4490 // t0 == v6
4491 // t1 == v7
4492
4493 // load 16 keys to v16..v31
4494 __ lea(rscratch1, ExternalAddress((address)_sha256_round_consts));
4495 __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
4496 __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
4497 __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
4498 __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
4499
4500 // load 8 words (256 bits) state
4501 __ ldpq(v0, v1, state);
4502
4503 __ BIND(sha1_loop);
4504 // load 64 bytes of data into v8..v11
4505 __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
4506 __ rev32(v8, __ T16B, v8);
4507 __ rev32(v9, __ T16B, v9);
4508 __ rev32(v10, __ T16B, v10);
4509 __ rev32(v11, __ T16B, v11);
4510
4511 __ addv(v6, __ T4S, v8, v16);
4512 __ orr(v2, __ T16B, v0, v0);
4513 __ orr(v3, __ T16B, v1, v1);
4514
4515 FloatRegister d0 = v8;
4516 FloatRegister d1 = v9;
4517 FloatRegister d2 = v10;
4518 FloatRegister d3 = v11;
4519
4520
4521 for (int round = 0; round < 16; round++) {
4522 FloatRegister tmp1 = (round & 1) ? v6 : v7;
4523 FloatRegister tmp2 = (round & 1) ? v7 : v6;
4524 FloatRegister tmp3 = (round & 1) ? v2 : v4;
4525 FloatRegister tmp4 = (round & 1) ? v4 : v2;
4526
4527 if (round < 12) __ sha256su0(d0, __ T4S, d1);
4528 __ orr(v4, __ T16B, v2, v2);
4529 if (round < 15)
4530 __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
4531 __ sha256h(v2, __ T4S, v3, tmp2);
4532 __ sha256h2(v3, __ T4S, v4, tmp2);
4533 if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
4534
4535 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
4536 }
4537
4538 __ addv(v0, __ T4S, v0, v2);
4539 __ addv(v1, __ T4S, v1, v3);
4540
4541 if (multi_block) {
4542 __ add(ofs, ofs, 64);
4543 __ cmp(ofs, limit);
4544 __ br(Assembler::LE, sha1_loop);
4545 __ mov(c_rarg0, ofs); // return ofs
4546 }
4547
4548 __ ldpd(v10, v11, Address(sp, 16));
4549 __ ldpd(v8, v9, __ post(sp, 32));
4550
4551 __ stpq(v0, v1, state);
4552
4553 __ ret(lr);
4554
4555 // record the stub entry and end
4556 store_archive_data(stub_id, start, __ pc());
4557
4558 return start;
4559 }
4560
4561 // Double rounds for sha512.
4562 void sha512_dround(int dr,
4563 FloatRegister vi0, FloatRegister vi1,
4564 FloatRegister vi2, FloatRegister vi3,
4565 FloatRegister vi4, FloatRegister vrc0,
4566 FloatRegister vrc1, FloatRegister vin0,
4567 FloatRegister vin1, FloatRegister vin2,
4568 FloatRegister vin3, FloatRegister vin4) {
4569 if (dr < 36) {
4570 __ ld1(vrc1, __ T2D, __ post(rscratch2, 16));
4571 }
4572 __ addv(v5, __ T2D, vrc0, vin0);
4573 __ ext(v6, __ T16B, vi2, vi3, 8);
4574 __ ext(v5, __ T16B, v5, v5, 8);
4575 __ ext(v7, __ T16B, vi1, vi2, 8);
4576 __ addv(vi3, __ T2D, vi3, v5);
4577 if (dr < 32) {
4578 __ ext(v5, __ T16B, vin3, vin4, 8);
4579 __ sha512su0(vin0, __ T2D, vin1);
4580 }
4581 __ sha512h(vi3, __ T2D, v6, v7);
4582 if (dr < 32) {
4583 __ sha512su1(vin0, __ T2D, vin2, v5);
4584 }
4585 __ addv(vi4, __ T2D, vi1, vi3);
4586 __ sha512h2(vi3, __ T2D, vi1, vi0);
4587 }
4588
4589 // Arguments:
4590 //
4591 // Inputs:
4592 // c_rarg0 - byte[] source+offset
4593 // c_rarg1 - int[] SHA.state
4594 // c_rarg2 - int offset
4595 // c_rarg3 - int limit
4596 //
4597 address generate_sha512_implCompress(StubId stub_id) {
// Emits the SHA-512 block-compression stub. Each 128-byte block is processed
// by 40 calls to sha512_dround. stubgen_sha512_implCompress_id handles one
// block; stubgen_sha512_implCompressMB_id adds 128 to ofs after every block,
// loops while ofs <= limit and returns the final ofs in c_rarg0.
// The state is read and written as four 128-bit vectors (64 bytes).
// Returns the stub entry address (from load_archive_data() if available,
// otherwise the start of the code generated here).
4598 bool multi_block;
4599 switch (stub_id) {
4600 case StubId::stubgen_sha512_implCompress_id:
4601 multi_block = false;
4602 break;
4603 case StubId::stubgen_sha512_implCompressMB_id:
4604 multi_block = true;
4605 break;
4606 default:
4607 ShouldNotReachHere();
4608 }
4609 int entry_count = StubInfo::entry_count(stub_id);
4610 assert(entry_count == 1, "sanity check");
4611 address start = load_archive_data(stub_id);
4612 if (start != nullptr) {
4613 return start;
4614 }
4615 __ align(CodeEntryAlignment);
4616 StubCodeMark mark(this, stub_id);
4617 start = __ pc();
4618
4619 Register buf = c_rarg0;
4620 Register state = c_rarg1;
4621 Register ofs = c_rarg2;
4622 Register limit = c_rarg3;
4623
// save d8..d15; v8..v15 are overwritten below (state and message data)
4624 __ stpd(v8, v9, __ pre(sp, -64));
4625 __ stpd(v10, v11, Address(sp, 16));
4626 __ stpd(v12, v13, Address(sp, 32));
4627 __ stpd(v14, v15, Address(sp, 48));
4628
4629 Label sha512_loop;
4630
4631 // load state
4632 __ ld1(v8, v9, v10, v11, __ T2D, state);
4633
4634 // load first 4 round constants
// rscratch1 is left pointing just past these 64 bytes; it is not modified
// again and serves as the per-block starting point for rscratch2.
4635 __ lea(rscratch1, ExternalAddress((address)_sha512_round_consts));
4636 __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64));
4637
4638 __ BIND(sha512_loop);
4639 // load 128B of data into v12..v19
// buf is post-incremented in both the single-block and multi-block variants
4640 __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64));
4641 __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64));
// byte-reverse every 64-bit word of the block
4642 __ rev64(v12, __ T16B, v12);
4643 __ rev64(v13, __ T16B, v13);
4644 __ rev64(v14, __ T16B, v14);
4645 __ rev64(v15, __ T16B, v15);
4646 __ rev64(v16, __ T16B, v16);
4647 __ rev64(v17, __ T16B, v17);
4648 __ rev64(v18, __ T16B, v18);
4649 __ rev64(v19, __ T16B, v19);
4650
// rscratch2 is the constant cursor that sha512_dround post-increments;
// reset it for every block
4651 __ mov(rscratch2, rscratch1);
4652
// working copy of the state in v0..v3; v8..v11 keep the carried state
4653 __ mov(v0, __ T16B, v8);
4654 __ mov(v1, __ T16B, v9);
4655 __ mov(v2, __ T16B, v10);
4656 __ mov(v3, __ T16B, v11);
4657
// Arguments per call: dr, vi0..vi4, vrc0, vrc1, vin0..vin4.
// The vi0..vi4 assignment repeats every 5 calls. vrc1 names the register
// that sha512_dround fills with upcoming constants (v24..v31).
// For dr >= 32 the schedule inputs vin1..vin4 are not used by
// sha512_dround, and for dr >= 36 neither is vrc1; v0 is passed as a
// placeholder in those positions.
4658 sha512_dround( 0, v0, v1, v2, v3, v4, v20, v24, v12, v13, v19, v16, v17);
4659 sha512_dround( 1, v3, v0, v4, v2, v1, v21, v25, v13, v14, v12, v17, v18);
4660 sha512_dround( 2, v2, v3, v1, v4, v0, v22, v26, v14, v15, v13, v18, v19);
4661 sha512_dround( 3, v4, v2, v0, v1, v3, v23, v27, v15, v16, v14, v19, v12);
4662 sha512_dround( 4, v1, v4, v3, v0, v2, v24, v28, v16, v17, v15, v12, v13);
4663 sha512_dround( 5, v0, v1, v2, v3, v4, v25, v29, v17, v18, v16, v13, v14);
4664 sha512_dround( 6, v3, v0, v4, v2, v1, v26, v30, v18, v19, v17, v14, v15);
4665 sha512_dround( 7, v2, v3, v1, v4, v0, v27, v31, v19, v12, v18, v15, v16);
4666 sha512_dround( 8, v4, v2, v0, v1, v3, v28, v24, v12, v13, v19, v16, v17);
4667 sha512_dround( 9, v1, v4, v3, v0, v2, v29, v25, v13, v14, v12, v17, v18);
4668 sha512_dround(10, v0, v1, v2, v3, v4, v30, v26, v14, v15, v13, v18, v19);
4669 sha512_dround(11, v3, v0, v4, v2, v1, v31, v27, v15, v16, v14, v19, v12);
4670 sha512_dround(12, v2, v3, v1, v4, v0, v24, v28, v16, v17, v15, v12, v13);
4671 sha512_dround(13, v4, v2, v0, v1, v3, v25, v29, v17, v18, v16, v13, v14);
4672 sha512_dround(14, v1, v4, v3, v0, v2, v26, v30, v18, v19, v17, v14, v15);
4673 sha512_dround(15, v0, v1, v2, v3, v4, v27, v31, v19, v12, v18, v15, v16);
4674 sha512_dround(16, v3, v0, v4, v2, v1, v28, v24, v12, v13, v19, v16, v17);
4675 sha512_dround(17, v2, v3, v1, v4, v0, v29, v25, v13, v14, v12, v17, v18);
4676 sha512_dround(18, v4, v2, v0, v1, v3, v30, v26, v14, v15, v13, v18, v19);
4677 sha512_dround(19, v1, v4, v3, v0, v2, v31, v27, v15, v16, v14, v19, v12);
4678 sha512_dround(20, v0, v1, v2, v3, v4, v24, v28, v16, v17, v15, v12, v13);
4679 sha512_dround(21, v3, v0, v4, v2, v1, v25, v29, v17, v18, v16, v13, v14);
4680 sha512_dround(22, v2, v3, v1, v4, v0, v26, v30, v18, v19, v17, v14, v15);
4681 sha512_dround(23, v4, v2, v0, v1, v3, v27, v31, v19, v12, v18, v15, v16);
4682 sha512_dround(24, v1, v4, v3, v0, v2, v28, v24, v12, v13, v19, v16, v17);
4683 sha512_dround(25, v0, v1, v2, v3, v4, v29, v25, v13, v14, v12, v17, v18);
4684 sha512_dround(26, v3, v0, v4, v2, v1, v30, v26, v14, v15, v13, v18, v19);
4685 sha512_dround(27, v2, v3, v1, v4, v0, v31, v27, v15, v16, v14, v19, v12);
4686 sha512_dround(28, v4, v2, v0, v1, v3, v24, v28, v16, v17, v15, v12, v13);
4687 sha512_dround(29, v1, v4, v3, v0, v2, v25, v29, v17, v18, v16, v13, v14);
4688 sha512_dround(30, v0, v1, v2, v3, v4, v26, v30, v18, v19, v17, v14, v15);
4689 sha512_dround(31, v3, v0, v4, v2, v1, v27, v31, v19, v12, v18, v15, v16);
4690 sha512_dround(32, v2, v3, v1, v4, v0, v28, v24, v12, v0, v0, v0, v0);
4691 sha512_dround(33, v4, v2, v0, v1, v3, v29, v25, v13, v0, v0, v0, v0);
4692 sha512_dround(34, v1, v4, v3, v0, v2, v30, v26, v14, v0, v0, v0, v0);
4693 sha512_dround(35, v0, v1, v2, v3, v4, v31, v27, v15, v0, v0, v0, v0);
4694 sha512_dround(36, v3, v0, v4, v2, v1, v24, v0, v16, v0, v0, v0, v0);
4695 sha512_dround(37, v2, v3, v1, v4, v0, v25, v0, v17, v0, v0, v0, v0);
4696 sha512_dround(38, v4, v2, v0, v1, v3, v26, v0, v18, v0, v0, v0, v0);
4697 sha512_dround(39, v1, v4, v3, v0, v2, v27, v0, v19, v0, v0, v0, v0);
4698
// fold the working state back into the carried state
4699 __ addv(v8, __ T2D, v8, v0);
4700 __ addv(v9, __ T2D, v9, v1);
4701 __ addv(v10, __ T2D, v10, v2);
4702 __ addv(v11, __ T2D, v11, v3);
4703
4704 if (multi_block) {
4705 __ add(ofs, ofs, 128);
4706 __ cmp(ofs, limit);
4707 __ br(Assembler::LE, sha512_loop);
4708 __ mov(c_rarg0, ofs); // return ofs
4709 }
4710
// write the updated state back (must precede the restore of d8..d11 below)
4711 __ st1(v8, v9, v10, v11, __ T2D, state);
4712
// restore d8..d15 and release the 64-byte save area
4713 __ ldpd(v14, v15, Address(sp, 48));
4714 __ ldpd(v12, v13, Address(sp, 32));
4715 __ ldpd(v10, v11, Address(sp, 16));
4716 __ ldpd(v8, v9, __ post(sp, 64));
4717
4718 __ ret(lr);
4719
4720 // record the stub entry and end
4721 store_archive_data(stub_id, start, __ pc());
4722
4723 return start;
4724 }
4725
4726 // Execute one round of keccak of two computations in parallel.
4727 // One of the states should be loaded into the lower halves of
4728 // the vector registers v0-v24, the other should be loaded into
4729 // the upper halves of those registers. The ld1r instruction loads
4730 // the round constant into both halves of register v31.
4731 // Intermediate results c0...c4 and d0...d4 are computed
4732 // in registers v25...v30.
4733 // All vector instructions that are used operate on both register
4734 // halves in parallel.
4735 // If only a single computation is needed, one can only load the lower halves.
4736 void keccak_round(Register rscratch1) {
// Emits one Keccak round over the 25 lanes held in v0..v24, using v25..v31
// as temporaries. The register parameter (named rscratch1) holds the address
// of this round's constant; the ld1r below post-increments it by 8.
// In the comments a primed name (e.g. a10') is a new lane value parked in a
// temporary register until its destination register is free.
// The instruction order matters: each xar writes a register whose old value
// was consumed by an earlier instruction.

// column parities c0..c4
4737 __ eor3(v29, __ T16B, v4, v9, v14); // c4 = a4 ^ a9 ^ a14
4738 __ eor3(v26, __ T16B, v1, v6, v11); // c1 = a1 ^ a6 ^ a11
4739 __ eor3(v28, __ T16B, v3, v8, v13); // c3 = a3 ^ a8 ^ a13
4740 __ eor3(v25, __ T16B, v0, v5, v10); // c0 = a0 ^ a5 ^ a10
4741 __ eor3(v27, __ T16B, v2, v7, v12); // c2 = a2 ^ a7 ^ a12
4742 __ eor3(v29, __ T16B, v29, v19, v24); // c4 ^= a19 ^ a24
4743 __ eor3(v26, __ T16B, v26, v16, v21); // c1 ^= a16 ^ a21
4744 __ eor3(v28, __ T16B, v28, v18, v23); // c3 ^= a18 ^ a23
4745 __ eor3(v25, __ T16B, v25, v15, v20); // c0 ^= a15 ^ a20
4746 __ eor3(v27, __ T16B, v27, v17, v22); // c2 ^= a17 ^ a22
4747
// d0..d4; afterwards v30 = d0, v25 = d1, v26 = d2, v27 = d3, v28 = d4
4748 __ rax1(v30, __ T2D, v29, v26); // d0 = c4 ^ rol(c1, 1)
4749 __ rax1(v26, __ T2D, v26, v28); // d2 = c1 ^ rol(c3, 1)
4750 __ rax1(v28, __ T2D, v28, v25); // d4 = c3 ^ rol(c0, 1)
4751 __ rax1(v25, __ T2D, v25, v27); // d1 = c0 ^ rol(c2, 1)
4752 __ rax1(v27, __ T2D, v27, v29); // d3 = c2 ^ rol(c4, 1)
4753
// xor each lane with its d value and rotate it into its new position;
// xar rotates right, so a left rotation by n is encoded as (64 - n)
4754 __ eor(v0, __ T16B, v0, v30); // a0 = a0 ^ d0
4755 __ xar(v29, __ T2D, v1, v25, (64 - 1)); // a10' = rol((a1^d1), 1)
4756 __ xar(v1, __ T2D, v6, v25, (64 - 44)); // a1 = rol((a6^d1), 44)
4757 __ xar(v6, __ T2D, v9, v28, (64 - 20)); // a6 = rol((a9^d4), 20)
4758 __ xar(v9, __ T2D, v22, v26, (64 - 61)); // a9 = rol((a22^d2), 61)
4759 __ xar(v22, __ T2D, v14, v28, (64 - 39)); // a22 = rol((a14^d4), 39)
4760 __ xar(v14, __ T2D, v20, v30, (64 - 18)); // a14 = rol((a20^d0), 18)
4761 __ xar(v31, __ T2D, v2, v26, (64 - 62)); // a20' = rol((a2^d2), 62)
4762 __ xar(v2, __ T2D, v12, v26, (64 - 43)); // a2 = rol((a12^d2), 43)
4763 __ xar(v12, __ T2D, v13, v27, (64 - 25)); // a12 = rol((a13^d3), 25)
4764 __ xar(v13, __ T2D, v19, v28, (64 - 8)); // a13 = rol((a19^d4), 8)
4765 __ xar(v19, __ T2D, v23, v27, (64 - 56)); // a19 = rol((a23^d3), 56)
4766 __ xar(v23, __ T2D, v15, v30, (64 - 41)); // a23 = rol((a15^d0), 41)
4767 __ xar(v15, __ T2D, v4, v28, (64 - 27)); // a15 = rol((a4^d4), 27)
4768 __ xar(v28, __ T2D, v24, v28, (64 - 14)); // a4' = rol((a24^d4), 14)
4769 __ xar(v24, __ T2D, v21, v25, (64 - 2)); // a24 = rol((a21^d1), 2)
4770 __ xar(v8, __ T2D, v8, v27, (64 - 55)); // a21' = rol((a8^d3), 55)
4771 __ xar(v4, __ T2D, v16, v25, (64 - 45)); // a8' = rol((a16^d1), 45)
4772 __ xar(v16, __ T2D, v5, v30, (64 - 36)); // a16 = rol((a5^d0), 36)
4773 __ xar(v5, __ T2D, v3, v27, (64 - 28)); // a5 = rol((a3^d3), 28)
4774 __ xar(v27, __ T2D, v18, v27, (64 - 21)); // a3' = rol((a18^d3), 21)
4775 __ xar(v3, __ T2D, v17, v26, (64 - 15)); // a18' = rol((a17^d2), 15)
4776 __ xar(v25, __ T2D, v11, v25, (64 - 10)); // a17' = rol((a11^d1), 10)
4777 __ xar(v26, __ T2D, v7, v26, (64 - 6)); // a11' = rol((a7^d2), 6)
4778 __ xar(v30, __ T2D, v10, v30, (64 - 3)); // a7' = rol((a10^d0), 3)
4779
// row by row: each lane ^= (~next lane & lane after next)
4780 __ bcax(v20, __ T16B, v31, v22, v8); // a20 = a20' ^ (~a21' & a22)
4781 __ bcax(v21, __ T16B, v8, v23, v22); // a21 = a21' ^ (~a22 & a23)
4782 __ bcax(v22, __ T16B, v22, v24, v23); // a22 = a22 ^ (~a23 & a24)
4783 __ bcax(v23, __ T16B, v23, v31, v24); // a23 = a23 ^ (~a24 & a20')
4784 __ bcax(v24, __ T16B, v24, v8, v31); // a24 = a24 ^ (~a20' & a21')
4785
// v31 (a20') is no longer needed, so it can now receive the round constant
4786 __ ld1r(v31, __ T2D, __ post(rscratch1, 8)); // rc = round_constants[i]
4787
4788 __ bcax(v17, __ T16B, v25, v19, v3); // a17 = a17' ^ (~a18' & a19)
4789 __ bcax(v18, __ T16B, v3, v15, v19); // a18 = a18' ^ (~a19 & a15)
4790 __ bcax(v19, __ T16B, v19, v16, v15); // a19 = a19 ^ (~a15 & a16)
4791 __ bcax(v15, __ T16B, v15, v25, v16); // a15 = a15 ^ (~a16 & a17')
4792 __ bcax(v16, __ T16B, v16, v3, v25); // a16 = a16 ^ (~a17' & a18')
4793
4794 __ bcax(v10, __ T16B, v29, v12, v26); // a10 = a10' ^ (~a11' & a12)
4795 __ bcax(v11, __ T16B, v26, v13, v12); // a11 = a11' ^ (~a12 & a13)
4796 __ bcax(v12, __ T16B, v12, v14, v13); // a12 = a12 ^ (~a13 & a14)
4797 __ bcax(v13, __ T16B, v13, v29, v14); // a13 = a13 ^ (~a14 & a10')
4798 __ bcax(v14, __ T16B, v14, v26, v29); // a14 = a14 ^ (~a10' & a11')
4799
4800 __ bcax(v7, __ T16B, v30, v9, v4); // a7 = a7' ^ (~a8' & a9)
4801 __ bcax(v8, __ T16B, v4, v5, v9); // a8 = a8' ^ (~a9 & a5)
4802 __ bcax(v9, __ T16B, v9, v6, v5); // a9 = a9 ^ (~a5 & a6)
4803 __ bcax(v5, __ T16B, v5, v30, v6); // a5 = a5 ^ (~a6 & a7')
4804 __ bcax(v6, __ T16B, v6, v4, v30); // a6 = a6 ^ (~a7' & a8')
4805
4806 __ bcax(v3, __ T16B, v27, v0, v28); // a3 = a3' ^ (~a4' & a0)
4807 __ bcax(v4, __ T16B, v28, v1, v0); // a4 = a4' ^ (~a0 & a1)
4808 __ bcax(v0, __ T16B, v0, v2, v1); // a0 = a0 ^ (~a1 & a2)
4809 __ bcax(v1, __ T16B, v1, v27, v2); // a1 = a1 ^ (~a2 & a3')
4810 __ bcax(v2, __ T16B, v2, v28, v27); // a2 = a2 ^ (~a3' & a4')
4811
4812 __ eor(v0, __ T16B, v0, v31); // a0 = a0 ^ rc
4813 }
4814
4815 // Arguments:
4816 //
4817 // Inputs:
4818 // c_rarg0 - byte[] source+offset
4819 // c_rarg1 - byte[] SHA.state
4820 // c_rarg2 - int block_size
4821 // c_rarg3 - int offset
4822 // c_rarg4 - int limit
4823 //
4824 address generate_sha3_implCompress(StubId stub_id) {
4825 bool multi_block;
4826 switch (stub_id) {
4827 case StubId::stubgen_sha3_implCompress_id:
4828 multi_block = false;
4829 break;
4830 case StubId::stubgen_sha3_implCompressMB_id:
4831 multi_block = true;
4832 break;
4833 default:
4834 ShouldNotReachHere();
4835 }
4836 int entry_count = StubInfo::entry_count(stub_id);
4837 assert(entry_count == 1, "sanity check");
4838 address start = load_archive_data(stub_id);
4839 if (start != nullptr) {
4840 return start;
4841 }
4842 __ align(CodeEntryAlignment);
4843 StubCodeMark mark(this, stub_id);
4844 start = __ pc();
4845
4846 Register buf = c_rarg0;
4847 Register state = c_rarg1;
4848 Register block_size = c_rarg2;
4849 Register ofs = c_rarg3;
4850 Register limit = c_rarg4;
4851
4852 Label sha3_loop, rounds24_loop;
4853 Label sha3_512_or_sha3_384, shake128;
4854
4855 __ stpd(v8, v9, __ pre(sp, -64));
4856 __ stpd(v10, v11, Address(sp, 16));
4857 __ stpd(v12, v13, Address(sp, 32));
4858 __ stpd(v14, v15, Address(sp, 48));
4859
4860 // load state
4861 __ add(rscratch1, state, 32);
4862 __ ld1(v0, v1, v2, v3, __ T1D, state);
4863 __ ld1(v4, v5, v6, v7, __ T1D, __ post(rscratch1, 32));
4864 __ ld1(v8, v9, v10, v11, __ T1D, __ post(rscratch1, 32));
4865 __ ld1(v12, v13, v14, v15, __ T1D, __ post(rscratch1, 32));
4866 __ ld1(v16, v17, v18, v19, __ T1D, __ post(rscratch1, 32));
4867 __ ld1(v20, v21, v22, v23, __ T1D, __ post(rscratch1, 32));
4868 __ ld1(v24, __ T1D, rscratch1);
4869
4870 __ BIND(sha3_loop);
4871
4872 // 24 keccak rounds
4873 __ movw(rscratch2, 24);
4874
4875 // load round_constants base
4876 __ lea(rscratch1, ExternalAddress((address) _sha3_round_consts));
4877
4878 // load input
4879 __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
4880 __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
4881 __ eor(v0, __ T8B, v0, v25);
4882 __ eor(v1, __ T8B, v1, v26);
4883 __ eor(v2, __ T8B, v2, v27);
4884 __ eor(v3, __ T8B, v3, v28);
4885 __ eor(v4, __ T8B, v4, v29);
4886 __ eor(v5, __ T8B, v5, v30);
4887 __ eor(v6, __ T8B, v6, v31);
4888
4889 // block_size == 72, SHA3-512; block_size == 104, SHA3-384
4890 __ tbz(block_size, 7, sha3_512_or_sha3_384);
4891
4892 __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
4893 __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
4894 __ eor(v7, __ T8B, v7, v25);
4895 __ eor(v8, __ T8B, v8, v26);
4896 __ eor(v9, __ T8B, v9, v27);
4897 __ eor(v10, __ T8B, v10, v28);
4898 __ eor(v11, __ T8B, v11, v29);
4899 __ eor(v12, __ T8B, v12, v30);
4900 __ eor(v13, __ T8B, v13, v31);
4901
4902 __ ld1(v25, v26, v27, __ T8B, __ post(buf, 24));
4903 __ eor(v14, __ T8B, v14, v25);
4904 __ eor(v15, __ T8B, v15, v26);
4905 __ eor(v16, __ T8B, v16, v27);
4906
4907 // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256
4908 __ andw(c_rarg5, block_size, 48);
4909 __ cbzw(c_rarg5, rounds24_loop);
4910
4911 __ tbnz(block_size, 5, shake128);
4912 // block_size == 144, bit5 == 0, SHA3-224
4913 __ ldrd(v28, __ post(buf, 8));
4914 __ eor(v17, __ T8B, v17, v28);
4915 __ b(rounds24_loop);
4916
4917 __ BIND(shake128);
4918 __ ld1(v28, v29, v30, v31, __ T8B, __ post(buf, 32));
4919 __ eor(v17, __ T8B, v17, v28);
4920 __ eor(v18, __ T8B, v18, v29);
4921 __ eor(v19, __ T8B, v19, v30);
4922 __ eor(v20, __ T8B, v20, v31);
4923 __ b(rounds24_loop); // block_size == 168, SHAKE128
4924
4925 __ BIND(sha3_512_or_sha3_384);
4926 __ ld1(v25, v26, __ T8B, __ post(buf, 16));
4927 __ eor(v7, __ T8B, v7, v25);
4928 __ eor(v8, __ T8B, v8, v26);
4929 __ tbz(block_size, 5, rounds24_loop); // SHA3-512
4930
4931 // SHA3-384
4932 __ ld1(v27, v28, v29, v30, __ T8B, __ post(buf, 32));
4933 __ eor(v9, __ T8B, v9, v27);
4934 __ eor(v10, __ T8B, v10, v28);
4935 __ eor(v11, __ T8B, v11, v29);
4936 __ eor(v12, __ T8B, v12, v30);
4937
4938 __ BIND(rounds24_loop);
4939 __ subw(rscratch2, rscratch2, 1);
4940
4941 keccak_round(rscratch1);
4942
4943 __ cbnzw(rscratch2, rounds24_loop);
4944
4945 if (multi_block) {
4946 __ add(ofs, ofs, block_size);
4947 __ cmp(ofs, limit);
4948 __ br(Assembler::LE, sha3_loop);
4949 __ mov(c_rarg0, ofs); // return ofs
4950 }
4951
4952 __ st1(v0, v1, v2, v3, __ T1D, __ post(state, 32));
4953 __ st1(v4, v5, v6, v7, __ T1D, __ post(state, 32));
4954 __ st1(v8, v9, v10, v11, __ T1D, __ post(state, 32));
4955 __ st1(v12, v13, v14, v15, __ T1D, __ post(state, 32));
4956 __ st1(v16, v17, v18, v19, __ T1D, __ post(state, 32));
4957 __ st1(v20, v21, v22, v23, __ T1D, __ post(state, 32));
4958 __ st1(v24, __ T1D, state);
4959
4960 // restore callee-saved registers
4961 __ ldpd(v14, v15, Address(sp, 48));
4962 __ ldpd(v12, v13, Address(sp, 32));
4963 __ ldpd(v10, v11, Address(sp, 16));
4964 __ ldpd(v8, v9, __ post(sp, 64));
4965
4966 __ ret(lr);
4967
4968 // record the stub entry and end
4969 store_archive_data(stub_id, start, __ pc());
4970
4971 return start;
4972 }
4973
4974 // Inputs:
4975 // c_rarg0 - long[] state0
4976 // c_rarg1 - long[] state1
4977 address generate_double_keccak() {
4978 StubId stub_id = StubId::stubgen_double_keccak_id;
4979 int entry_count = StubInfo::entry_count(stub_id);
4980 assert(entry_count == 1, "sanity check");
4981 address start = load_archive_data(stub_id);
4982 if (start != nullptr) {
4983 return start;
4984 }
4985 // Implements the double_keccak() method of the
4986 // sun.security.provider.SHA3Parallel class
4987 __ align(CodeEntryAlignment);
4988 StubCodeMark mark(this, stub_id);
4989 start = __ pc();
4990 __ enter();
4991
4992 Register state0 = c_rarg0;
4993 Register state1 = c_rarg1;
4994
4995 Label rounds24_loop;
4996
4997 // save callee-saved registers
4998 __ stpd(v8, v9, __ pre(sp, -64));
4999 __ stpd(v10, v11, Address(sp, 16));
5000 __ stpd(v12, v13, Address(sp, 32));
5001 __ stpd(v14, v15, Address(sp, 48));
5002
5003 // load states
5004 __ add(rscratch1, state0, 32);
5005 __ ld4(v0, v1, v2, v3, __ D, 0, state0);
5006 __ ld4(v4, v5, v6, v7, __ D, 0, __ post(rscratch1, 32));
5007 __ ld4(v8, v9, v10, v11, __ D, 0, __ post(rscratch1, 32));
5008 __ ld4(v12, v13, v14, v15, __ D, 0, __ post(rscratch1, 32));
5009 __ ld4(v16, v17, v18, v19, __ D, 0, __ post(rscratch1, 32));
5010 __ ld4(v20, v21, v22, v23, __ D, 0, __ post(rscratch1, 32));
5011 __ ld1(v24, __ D, 0, rscratch1);
5012 __ add(rscratch1, state1, 32);
5013 __ ld4(v0, v1, v2, v3, __ D, 1, state1);
5014 __ ld4(v4, v5, v6, v7, __ D, 1, __ post(rscratch1, 32));
5015 __ ld4(v8, v9, v10, v11, __ D, 1, __ post(rscratch1, 32));
5016 __ ld4(v12, v13, v14, v15, __ D, 1, __ post(rscratch1, 32));
5017 __ ld4(v16, v17, v18, v19, __ D, 1, __ post(rscratch1, 32));
5018 __ ld4(v20, v21, v22, v23, __ D, 1, __ post(rscratch1, 32));
5019 __ ld1(v24, __ D, 1, rscratch1);
5020
5021 // 24 keccak rounds
5022 __ movw(rscratch2, 24);
5023
5024 // load round_constants base
5025 __ lea(rscratch1, ExternalAddress((address) _double_keccak_round_consts));
5026
5027 __ BIND(rounds24_loop);
5028 __ subw(rscratch2, rscratch2, 1);
5029 keccak_round(rscratch1);
5030 __ cbnzw(rscratch2, rounds24_loop);
5031
5032 __ st4(v0, v1, v2, v3, __ D, 0, __ post(state0, 32));
5033 __ st4(v4, v5, v6, v7, __ D, 0, __ post(state0, 32));
5034 __ st4(v8, v9, v10, v11, __ D, 0, __ post(state0, 32));
5035 __ st4(v12, v13, v14, v15, __ D, 0, __ post(state0, 32));
5036 __ st4(v16, v17, v18, v19, __ D, 0, __ post(state0, 32));
5037 __ st4(v20, v21, v22, v23, __ D, 0, __ post(state0, 32));
5038 __ st1(v24, __ D, 0, state0);
5039 __ st4(v0, v1, v2, v3, __ D, 1, __ post(state1, 32));
5040 __ st4(v4, v5, v6, v7, __ D, 1, __ post(state1, 32));
5041 __ st4(v8, v9, v10, v11, __ D, 1, __ post(state1, 32));
5042 __ st4(v12, v13, v14, v15, __ D, 1, __ post(state1, 32));
5043 __ st4(v16, v17, v18, v19, __ D, 1, __ post(state1, 32));
5044 __ st4(v20, v21, v22, v23, __ D, 1, __ post(state1, 32));
5045 __ st1(v24, __ D, 1, state1);
5046
5047 // restore callee-saved vector registers
5048 __ ldpd(v14, v15, Address(sp, 48));
5049 __ ldpd(v12, v13, Address(sp, 32));
5050 __ ldpd(v10, v11, Address(sp, 16));
5051 __ ldpd(v8, v9, __ post(sp, 64));
5052
5053 __ leave(); // required for proper stackwalking of RuntimeStub frame
5054
5055 __ mov(r0, zr); // return 0 (Java callees return 1. Caller ignores the return value)
5056 __ ret(lr);
5057
5058 // record the stub entry and end
5059 store_archive_data(stub_id, start, __ pc());
5060
5061 return start;
5062 }
5063
5064 // ChaCha20 block function. This version parallelizes the 32-bit
5065 // state elements on each of 16 vectors, producing 4 blocks of
5066 // keystream at a time.
5067 //
5068 // state (int[16]) = c_rarg0
5069 // keystream (byte[256]) = c_rarg1
5070 // return - number of bytes of produced keystream (always 256)
5071 //
5072 // This implementation takes each 32-bit integer from the state
5073 // array and broadcasts it across all 4 32-bit lanes of a vector register
5074 // (e.g. state[0] is replicated on all 4 lanes of v4, state[1] to all 4 lanes
5075 // of v5, etc.). Once all 16 elements have been broadcast onto 16 vectors,
5076 // the quarter round schedule is implemented as outlined in RFC 7539 section
5077 // 2.3. However, instead of sequentially processing the 3 quarter round
5078 // operations represented by one QUARTERROUND function, we instead stack all
5079 // the adds, xors and left-rotations from the first 4 quarter rounds together
5080 // and then do the same for the second set of 4 quarter rounds. This removes
5081 // some latency that would otherwise be incurred by waiting for an add to
5082 // complete before performing an xor (which depends on the result of the
5083 // add), etc. An adjustment happens between the first and second groups of 4
5084 // quarter rounds, but this is done only in the inputs to the macro functions
5085 // that generate the assembly instructions - these adjustments themselves are
5086 // not part of the resulting assembly.
5087 // The 4 registers v0-v3 are used during the quarter round operations as
5088 // scratch registers. Once the 20 rounds are complete, these 4 scratch
5089 // registers become the vectors involved in adding the start state back onto
5090 // the post-QR working state. After the adds are complete, each of the 16
5091 // vectors write their first lane back to the keystream buffer, followed
5092 // by the second lane from all vectors and so on.
5093 address generate_chacha20Block_blockpar() {
5094 StubId stub_id = StubId::stubgen_chacha20Block_id;
5095 int entry_count = StubInfo::entry_count(stub_id);
5096 assert(entry_count == 1, "sanity check");
5097 address start = load_archive_data(stub_id);
5098 if (start != nullptr) {
5099 return start;
5100 }
5101 Label L_twoRounds, L_cc20_const;
5102 __ align(CodeEntryAlignment);
5103 StubCodeMark mark(this, stub_id);
5104 start = __ pc();
5105 __ enter();
5106
5107 int i, j;
5108 const Register state = c_rarg0;
5109 const Register keystream = c_rarg1;
5110 const Register loopCtr = r10;
5111 const Register tmpAddr = r11;
5112 const FloatRegister ctrAddOverlay = v28;
5113 const FloatRegister lrot8Tbl = v29;
5114
5115 // Organize SIMD registers in an array that facilitates
5116 // putting repetitive opcodes into loop structures. It is
5117 // important that each grouping of 4 registers is monotonically
5118 // increasing to support the requirements of multi-register
5119 // instructions (e.g. ld4r, st4, etc.)
5120 const FloatRegister workSt[16] = {
5121 v4, v5, v6, v7, v16, v17, v18, v19,
5122 v20, v21, v22, v23, v24, v25, v26, v27
5123 };
5124
5125 // Pull in constant data. The first 16 bytes are the add overlay
5126 // which is applied to the vector holding the counter (state[12]).
5127 // The second 16 bytes is the index register for the 8-bit left
5128 // rotation tbl instruction.
5129 __ adr(tmpAddr, L_cc20_const);
5130 __ ldpq(ctrAddOverlay, lrot8Tbl, Address(tmpAddr));
5131
5132 // Load from memory and interlace across 16 SIMD registers,
5133 // With each word from memory being broadcast to all lanes of
5134 // each successive SIMD register.
5135 // Addr(0) -> All lanes in workSt[i]
5136 // Addr(4) -> All lanes workSt[i + 1], etc.
5137 __ mov(tmpAddr, state);
5138 for (i = 0; i < 16; i += 4) {
5139 __ ld4r(workSt[i], workSt[i + 1], workSt[i + 2], workSt[i + 3], __ T4S,
5140 __ post(tmpAddr, 16));
5141 }
5142 __ addv(workSt[12], __ T4S, workSt[12], ctrAddOverlay); // Add ctr overlay
5143
5144 // Before entering the loop, create 5 4-register arrays. These
5145 // will hold the 4 registers that represent the a/b/c/d fields
5146 // in the quarter round operation. For instance the "b" field
5147 // for the first 4 quarter round operations is the set of v16/v17/v18/v19,
5148 // but in the second 4 quarter rounds it gets adjusted to v17/v18/v19/v16
5149 // since it is part of a diagonal organization. The aSet and scratch
5150 // register sets are defined at declaration time because they do not change
5151 // organization at any point during the 20-round processing.
5152 FloatRegister aSet[4] = { v4, v5, v6, v7 };
5153 FloatRegister bSet[4];
5154 FloatRegister cSet[4];
5155 FloatRegister dSet[4];
5156 FloatRegister scratch[4] = { v0, v1, v2, v3 };
5157
5158 // Set up the 10 iteration loop and perform all 8 quarter round ops
5159 __ mov(loopCtr, 10);
5160 __ BIND(L_twoRounds);
5161
5162 // Set to columnar organization and do the following 4 quarter-rounds:
5163 // QUARTERROUND(0, 4, 8, 12)
5164 // QUARTERROUND(1, 5, 9, 13)
5165 // QUARTERROUND(2, 6, 10, 14)
5166 // QUARTERROUND(3, 7, 11, 15)
5167 __ cc20_set_qr_registers(bSet, workSt, 4, 5, 6, 7);
5168 __ cc20_set_qr_registers(cSet, workSt, 8, 9, 10, 11);
5169 __ cc20_set_qr_registers(dSet, workSt, 12, 13, 14, 15);
5170
5171 __ cc20_qr_add4(aSet, bSet); // a += b
5172 __ cc20_qr_xor4(dSet, aSet, dSet); // d ^= a
5173 __ cc20_qr_lrot4(dSet, dSet, 16, lrot8Tbl); // d <<<= 16
5174
5175 __ cc20_qr_add4(cSet, dSet); // c += d
5176 __ cc20_qr_xor4(bSet, cSet, scratch); // b ^= c (scratch)
5177 __ cc20_qr_lrot4(scratch, bSet, 12, lrot8Tbl); // b <<<= 12
5178
5179 __ cc20_qr_add4(aSet, bSet); // a += b
5180 __ cc20_qr_xor4(dSet, aSet, dSet); // d ^= a
5181 __ cc20_qr_lrot4(dSet, dSet, 8, lrot8Tbl); // d <<<= 8
5182
5183 __ cc20_qr_add4(cSet, dSet); // c += d
5184 __ cc20_qr_xor4(bSet, cSet, scratch); // b ^= c (scratch)
5185 __ cc20_qr_lrot4(scratch, bSet, 7, lrot8Tbl); // b <<<= 12
5186
5187 // Set to diagonal organization and do the next 4 quarter-rounds:
5188 // QUARTERROUND(0, 5, 10, 15)
5189 // QUARTERROUND(1, 6, 11, 12)
5190 // QUARTERROUND(2, 7, 8, 13)
5191 // QUARTERROUND(3, 4, 9, 14)
5192 __ cc20_set_qr_registers(bSet, workSt, 5, 6, 7, 4);
5193 __ cc20_set_qr_registers(cSet, workSt, 10, 11, 8, 9);
5194 __ cc20_set_qr_registers(dSet, workSt, 15, 12, 13, 14);
5195
5196 __ cc20_qr_add4(aSet, bSet); // a += b
5197 __ cc20_qr_xor4(dSet, aSet, dSet); // d ^= a
5198 __ cc20_qr_lrot4(dSet, dSet, 16, lrot8Tbl); // d <<<= 16
5199
5200 __ cc20_qr_add4(cSet, dSet); // c += d
5201 __ cc20_qr_xor4(bSet, cSet, scratch); // b ^= c (scratch)
5202 __ cc20_qr_lrot4(scratch, bSet, 12, lrot8Tbl); // b <<<= 12
5203
5204 __ cc20_qr_add4(aSet, bSet); // a += b
5205 __ cc20_qr_xor4(dSet, aSet, dSet); // d ^= a
5206 __ cc20_qr_lrot4(dSet, dSet, 8, lrot8Tbl); // d <<<= 8
5207
5208 __ cc20_qr_add4(cSet, dSet); // c += d
5209 __ cc20_qr_xor4(bSet, cSet, scratch); // b ^= c (scratch)
5210 __ cc20_qr_lrot4(scratch, bSet, 7, lrot8Tbl); // b <<<= 12
5211
5212 // Decrement and iterate
5213 __ sub(loopCtr, loopCtr, 1);
5214 __ cbnz(loopCtr, L_twoRounds);
5215
5216 __ mov(tmpAddr, state);
5217
5218 // Add the starting state back to the post-loop keystream
5219 // state. We read/interlace the state array from memory into
5220 // 4 registers similar to what we did in the beginning. Then
5221 // add the counter overlay onto workSt[12] at the end.
5222 for (i = 0; i < 16; i += 4) {
5223 __ ld4r(v0, v1, v2, v3, __ T4S, __ post(tmpAddr, 16));
5224 __ addv(workSt[i], __ T4S, workSt[i], v0);
5225 __ addv(workSt[i + 1], __ T4S, workSt[i + 1], v1);
5226 __ addv(workSt[i + 2], __ T4S, workSt[i + 2], v2);
5227 __ addv(workSt[i + 3], __ T4S, workSt[i + 3], v3);
5228 }
5229 __ addv(workSt[12], __ T4S, workSt[12], ctrAddOverlay); // Add ctr overlay
5230
5231 // Write working state into the keystream buffer. This is accomplished
5232 // by taking the lane "i" from each of the four vectors and writing
5233 // it to consecutive 4-byte offsets, then post-incrementing by 16 and
5234 // repeating with the next 4 vectors until all 16 vectors have been used.
5235 // Then move to the next lane and repeat the process until all lanes have
5236 // been written.
5237 for (i = 0; i < 4; i++) {
5238 for (j = 0; j < 16; j += 4) {
5239 __ st4(workSt[j], workSt[j + 1], workSt[j + 2], workSt[j + 3], __ S, i,
5240 __ post(keystream, 16));
5241 }
5242 }
5243
5244 __ mov(r0, 256); // Return length of output keystream
5245 __ leave();
5246 __ ret(lr);
5247
5248 // bind label and generate local constant data used by this stub
5249 // The constant data is broken into two 128-bit segments to be loaded
5250 // onto FloatRegisters. The first 128 bits are a counter add overlay
5251 // that adds +0/+1/+2/+3 to the vector holding replicated state[12].
5252 // The second 128-bits is a table constant used for 8-bit left rotations.
5253 __ BIND(L_cc20_const);
5254 __ emit_int64(0x0000000100000000UL);
5255 __ emit_int64(0x0000000300000002UL);
5256 __ emit_int64(0x0605040702010003UL);
5257 __ emit_int64(0x0E0D0C0F0A09080BUL);
5258
5259 // record the stub entry and end
5260 store_archive_data(stub_id, start, __ pc());
5261
5262 return start;
5263 }
5264
5265 // Helpers to schedule parallel operation bundles across vector
5266 // register sequences of size 2, 4 or 8.
5267
5268 // Implement various primitive computations across vector sequences
5269
5270 template<int N>
5271 void vs_addv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
5272 const VSeq<N>& v1, const VSeq<N>& v2) {
5273 // output must not be constant
5274 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
5275 // output cannot overwrite pending inputs
5276 assert(!vs_write_before_read(v, v1), "output overwrites input");
5277 assert(!vs_write_before_read(v, v2), "output overwrites input");
5278 for (int i = 0; i < N; i++) {
5279 __ addv(v[i], T, v1[i], v2[i]);
5280 }
5281 }
5282
5283 template<int N>
5284 void vs_subv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
5285 const VSeq<N>& v1, const VSeq<N>& v2) {
5286 // output must not be constant
5287 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
5288 // output cannot overwrite pending inputs
5289 assert(!vs_write_before_read(v, v1), "output overwrites input");
5290 assert(!vs_write_before_read(v, v2), "output overwrites input");
5291 for (int i = 0; i < N; i++) {
5292 __ subv(v[i], T, v1[i], v2[i]);
5293 }
5294 }
5295
5296 template<int N>
5297 void vs_mulv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
5298 const VSeq<N>& v1, const VSeq<N>& v2) {
5299 // output must not be constant
5300 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
5301 // output cannot overwrite pending inputs
5302 assert(!vs_write_before_read(v, v1), "output overwrites input");
5303 assert(!vs_write_before_read(v, v2), "output overwrites input");
5304 for (int i = 0; i < N; i++) {
5305 __ mulv(v[i], T, v1[i], v2[i]);
5306 }
5307 }
5308
5309 template<int N>
5310 void vs_negr(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1) {
5311 // output must not be constant
5312 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
5313 // output cannot overwrite pending inputs
5314 assert(!vs_write_before_read(v, v1), "output overwrites input");
5315 for (int i = 0; i < N; i++) {
5316 __ negr(v[i], T, v1[i]);
5317 }
5318 }
5319
5320 template<int N>
5321 void vs_shl(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
5322 const VSeq<N>& v1, int shift) {
5323 // output must not be constant
5324 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
5325 // output cannot overwrite pending inputs
5326 assert(!vs_write_before_read(v, v1), "output overwrites input");
5327
5328 for (int i = 0; i < N; i++) {
5329 __ shl(v[i], T, v1[i], shift);
5330 }
5331 }
5332
5333 template<int N>
5334 void vs_ushr(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
5335 const VSeq<N>& v1, int shift) {
5336 // output must not be constant
5337 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
5338 // output cannot overwrite pending inputs
5339 assert(!vs_write_before_read(v, v1), "output overwrites input");
5340
5341 for (int i = 0; i < N; i++) {
5342 __ ushr(v[i], T, v1[i], shift);
5343 }
5344 }
5345
5346 template<int N>
5347 void vs_sshr(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
5348 const VSeq<N>& v1, int shift) {
5349 // output must not be constant
5350 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
5351 // output cannot overwrite pending inputs
5352 assert(!vs_write_before_read(v, v1), "output overwrites input");
5353 for (int i = 0; i < N; i++) {
5354 __ sshr(v[i], T, v1[i], shift);
5355 }
5356 }
5357
5358 template<int N>
5359 void vs_andr(const VSeq<N>& v, const VSeq<N>& v1, const VSeq<N>& v2) {
5360 // output must not be constant
5361 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
5362 // output cannot overwrite pending inputs
5363 assert(!vs_write_before_read(v, v1), "output overwrites input");
5364 assert(!vs_write_before_read(v, v2), "output overwrites input");
5365 for (int i = 0; i < N; i++) {
5366 __ andr(v[i], __ T16B, v1[i], v2[i]);
5367 }
5368 }
5369
5370 template<int N>
5371 void vs_andr(const VSeq<N>& v, const VSeq<N>& v1, const FloatRegister v2) {
5372 // output must not be constant
5373 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
5374 // output cannot overwrite pending inputs
5375 assert(!vs_write_before_read(v, v1), "output overwrites input");
5376 for (int i = 0; i < N; i++) {
5377 __ andr(v[i], __ T16B, v1[i], v2);
5378 }
5379 }
5380
5381 template<int N>
5382 void vs_eor(const VSeq<N>& v, const VSeq<N>& v1, const VSeq<N>& v2) {
5383 // output must not be constant
5384 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
5385 // output cannot overwrite pending inputs
5386 assert(!vs_write_before_read(v, v1), "output overwrites input");
5387 assert(!vs_write_before_read(v, v2), "output overwrites input");
5388 for (int i = 0; i < N; i++) {
5389 __ eor(v[i], __ T16B, v1[i], v2[i]);
5390 }
5391 }
5392
5393 template<int N>
5394 void vs_orr(const VSeq<N>& v, const VSeq<N>& v1, const VSeq<N>& v2) {
5395 // output must not be constant
5396 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
5397 // output cannot overwrite pending inputs
5398 assert(!vs_write_before_read(v, v1), "output overwrites input");
5399 assert(!vs_write_before_read(v, v2), "output overwrites input");
5400 for (int i = 0; i < N; i++) {
5401 __ orr(v[i], __ T16B, v1[i], v2[i]);
5402 }
5403 }
5404
5405 template<int N>
5406 void vs_notr(const VSeq<N>& v, const VSeq<N>& v1) {
5407 // output must not be constant
5408 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
5409 // output cannot overwrite pending inputs
5410 assert(!vs_write_before_read(v, v1), "output overwrites input");
5411 for (int i = 0; i < N; i++) {
5412 __ notr(v[i], __ T16B, v1[i]);
5413 }
5414 }
5415
5416 template<int N>
5417 void vs_sqdmulh(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1, const VSeq<N>& v2) {
5418 // output must not be constant
5419 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
5420 // output cannot overwrite pending inputs
5421 assert(!vs_write_before_read(v, v1), "output overwrites input");
5422 assert(!vs_write_before_read(v, v2), "output overwrites input");
5423 for (int i = 0; i < N; i++) {
5424 __ sqdmulh(v[i], T, v1[i], v2[i]);
5425 }
5426 }
5427
5428 template<int N>
5429 void vs_mlsv(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1, VSeq<N>& v2) {
5430 // output must not be constant
5431 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
5432 // output cannot overwrite pending inputs
5433 assert(!vs_write_before_read(v, v1), "output overwrites input");
5434 assert(!vs_write_before_read(v, v2), "output overwrites input");
5435 for (int i = 0; i < N; i++) {
5436 __ mlsv(v[i], T, v1[i], v2[i]);
5437 }
5438 }
5439
5440 // load N/2 successive pairs of quadword values from memory in order
5441 // into N successive vector registers of the sequence via the
5442 // address supplied in base.
5443 template<int N>
5444 void vs_ldpq(const VSeq<N>& v, Register base) {
5445 static_assert(N > 0 && is_even(N), "sequence length must be even");
5446 for (int i = 0; i < N; i += 2) {
5447 __ ldpq(v[i], v[i+1], Address(base, 16 * i));
5448 }
5449 }
5450
5451 // load N/2 successive pairs of quadword values from memory in order
5452 // into N vector registers of the sequence via the address supplied
5453 // in base using post-increment addressing
5454 template<int N>
5455 void vs_ldpq_post(const VSeq<N>& v, Register base) {
5456 static_assert(N > 0 && is_even(N), "sequence length must be even");
5457 for (int i = 0; i < N; i += 2) {
5458 __ ldpq(v[i], v[i+1], __ post(base, 32));
5459 }
5460 }
5461
5462 // store N successive vector registers of the sequence into N/2
5463 // successive pairs of quadword memory locations via the address
5464 // supplied in base using post-increment addressing
5465 template<int N>
5466 void vs_stpq_post(const VSeq<N>& v, Register base) {
5467 static_assert(N > 0 && is_even(N), "sequence length must be even");
5468 for (int i = 0; i < N; i += 2) {
5469 __ stpq(v[i], v[i+1], __ post(base, 32));
5470 }
5471 }
5472
5473 // load N/2 pairs of quadword values from memory de-interleaved into
5474 // N vector registers 2 at a time via the address supplied in base
5475 // using post-increment addressing.
5476 template<int N>
5477 void vs_ld2_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
5478 static_assert(N > 0 && is_even(N), "sequence length must be even");
5479 for (int i = 0; i < N; i += 2) {
5480 __ ld2(v[i], v[i+1], T, __ post(base, 32));
5481 }
5482 }
5483
5484 // store N vector registers interleaved into N/2 pairs of quadword
5485 // memory locations via the address supplied in base using
5486 // post-increment addressing.
5487 template<int N>
5488 void vs_st2_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
5489 static_assert(N > 0 && is_even(N), "sequence length must be even");
5490 for (int i = 0; i < N; i += 2) {
5491 __ st2(v[i], v[i+1], T, __ post(base, 32));
5492 }
5493 }
5494
5495 // store two vector register sequences of length N
5496 // interleaved into N pairs of quadword memory locations
5497 // starting at the address supplied in dest using
5498 // post-increment addressing.
5499 template<int N>
5500 void vs_st1_interleaved(VSeq<N> A, VSeq<N> B, Register dest) {
5501 for (int i = 0; i < N; i++) {
5502 __ st1(A[i], __ T2D, __ post(dest, 16));
5503 __ st1(B[i], __ T2D, __ post(dest, 16));
5504 }
5505 }
5506
5507 // load N quadword values from memory de-interleaved into N vector
5508 // registers 3 elements at a time via the address supplied in base.
5509 template<int N>
5510 void vs_ld3(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
5511 static_assert(N == ((N / 3) * 3), "sequence length must be multiple of 3");
5512 for (int i = 0; i < N; i += 3) {
5513 __ ld3(v[i], v[i+1], v[i+2], T, base);
5514 }
5515 }
5516
5517 // load N quadword values from memory de-interleaved into N vector
5518 // registers 3 elements at a time via the address supplied in base
5519 // using post-increment addressing.
5520 template<int N>
5521 void vs_ld3_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
5522 static_assert(N == ((N / 3) * 3), "sequence length must be multiple of 3");
5523 for (int i = 0; i < N; i += 3) {
5524 __ ld3(v[i], v[i+1], v[i+2], T, __ post(base, 48));
5525 }
5526 }
5527
5528 // load N/2 pairs of quadword values from memory into N vector
5529 // registers via the address supplied in base with each pair indexed
5530 // using the start offset plus the corresponding entry in the
5531 // offsets array
5532 template<int N>
5533 void vs_ldpq_indexed(const VSeq<N>& v, Register base, int start, int (&offsets)[N/2]) {
5534 static_assert(N > 0 && is_even(N), "sequence length must be even");
5535 for (int i = 0; i < N/2; i++) {
5536 __ ldpq(v[2*i], v[2*i+1], Address(base, start + offsets[i]));
5537 }
5538 }
5539
5540 // store N vector registers into N/2 pairs of quadword memory
5541 // locations via the address supplied in base with each pair indexed
5542 // using the start offset plus the corresponding entry in the
5543 // offsets array
5544 template<int N>
5545 void vs_stpq_indexed(const VSeq<N>& v, Register base, int start, int offsets[N/2]) {
5546 for (int i = 0; i < N/2; i++) {
5547 __ stpq(v[2*i], v[2*i+1], Address(base, start + offsets[i]));
5548 }
5549 }
5550
5551 // load N single quadword values from memory into N vector registers
5552 // via the address supplied in base with each value indexed using
5553 // the start offset plus the corresponding entry in the offsets
5554 // array
5555 template<int N>
5556 void vs_ldr_indexed(const VSeq<N>& v, Assembler::SIMD_RegVariant T, Register base,
5557 int start, int (&offsets)[N]) {
5558 for (int i = 0; i < N; i++) {
5559 __ ldr(v[i], T, Address(base, start + offsets[i]));
5560 }
5561 }
5562
5563 // store N vector registers into N single quadword memory locations
5564 // via the address supplied in base with each value indexed using
5565 // the start offset plus the corresponding entry in the offsets
5566 // array
5567 template<int N>
5568 void vs_str_indexed(const VSeq<N>& v, Assembler::SIMD_RegVariant T, Register base,
5569 int start, int (&offsets)[N]) {
5570 for (int i = 0; i < N; i++) {
5571 __ str(v[i], T, Address(base, start + offsets[i]));
5572 }
5573 }
5574
5575 // load N/2 pairs of quadword values from memory de-interleaved into
5576 // N vector registers 2 at a time via the address supplied in base
5577 // with each pair indexed using the start offset plus the
5578 // corresponding entry in the offsets array
5579 template<int N>
5580 void vs_ld2_indexed(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base,
5581 Register tmp, int start, int (&offsets)[N/2]) {
5582 static_assert(N > 0 && is_even(N), "sequence length must be even");
5583 for (int i = 0; i < N/2; i++) {
5584 __ add(tmp, base, start + offsets[i]);
5585 __ ld2(v[2*i], v[2*i+1], T, tmp);
5586 }
5587 }
5588
5589 // store N vector registers 2 at a time interleaved into N/2 pairs
5590 // of quadword memory locations via the address supplied in base
5591 // with each pair indexed using the start offset plus the
5592 // corresponding entry in the offsets array
5593 template<int N>
5594 void vs_st2_indexed(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base,
5595 Register tmp, int start, int (&offsets)[N/2]) {
5596 static_assert(N > 0 && is_even(N), "sequence length must be even");
5597 for (int i = 0; i < N/2; i++) {
5598 __ add(tmp, base, start + offsets[i]);
5599 __ st2(v[2*i], v[2*i+1], T, tmp);
5600 }
5601 }
5602
5603 // Helper routines for various flavours of Montgomery multiply
5604
5605 // Perform 16 32-bit (4x4S) or 32 16-bit (4 x 8H) Montgomery
5606 // multiplications in parallel
5607 //
5608
5609 // See the montMul() method of the sun.security.provider.ML_DSA
5610 // class.
5611 //
5612 // Computes 4x4S results or 8x8H results
5613 // a = b * c * 2^MONT_R_BITS mod MONT_Q
5614 // Inputs: vb, vc - 4x4S or 4x8H vector register sequences
5615 // vq - 2x4S or 2x8H constants <MONT_Q, MONT_Q_INV_MOD_R>
5616 // Temps: vtmp - 4x4S or 4x8H vector sequence trashed after call
5617 // Outputs: va - 4x4S or 4x8H vector register sequences
5618 // vb, vc, vtmp and vq must all be disjoint
5619 // va must be disjoint from all other inputs/temps or must equal vc
5620 // va must have a non-zero delta i.e. it must not be a constant vseq.
5621 // n.b. MONT_R_BITS is 16 or 32, so the right shift by it is implicit.
5622 void vs_montmul4(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
5623 Assembler::SIMD_Arrangement T,
5624 const VSeq<4>& vtmp, const VSeq<2>& vq) {
5625 assert (T == __ T4S || T == __ T8H, "invalid arrangement for montmul");
5626 assert(vs_disjoint(vb, vc), "vb and vc overlap");
5627 assert(vs_disjoint(vb, vq), "vb and vq overlap");
5628 assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
5629
5630 assert(vs_disjoint(vc, vq), "vc and vq overlap");
5631 assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
5632
5633 assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
5634
5635 assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
5636 assert(vs_disjoint(va, vb), "va and vb overlap");
5637 assert(vs_disjoint(va, vq), "va and vq overlap");
5638 assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
5639 assert(!va.is_constant(), "output vector must identify 4 different registers");
5640
5641 // schedule 4 streams of instructions across the vector sequences
5642 for (int i = 0; i < 4; i++) {
5643 __ sqdmulh(vtmp[i], T, vb[i], vc[i]); // aHigh = hi32(2 * b * c)
5644 __ mulv(va[i], T, vb[i], vc[i]); // aLow = lo32(b * c)
5645 }
5646
5647 for (int i = 0; i < 4; i++) {
5648 __ mulv(va[i], T, va[i], vq[0]); // m = aLow * qinv
5649 }
5650
5651 for (int i = 0; i < 4; i++) {
5652 __ sqdmulh(va[i], T, va[i], vq[1]); // n = hi32(2 * m * q)
5653 }
5654
5655 for (int i = 0; i < 4; i++) {
5656 __ shsubv(va[i], T, vtmp[i], va[i]); // a = (aHigh - n) / 2
5657 }
5658 }
5659
5660 // Perform 8 32-bit (4x4S) or 16 16-bit (2 x 8H) Montgomery
5661 // multiplications in parallel
5662 //
5663
5664 // See the montMul() method of the sun.security.provider.ML_DSA
5665 // class.
5666 //
5667 // Computes 4x4S results or 8x8H results
5668 // a = b * c * 2^MONT_R_BITS mod MONT_Q
5669 // Inputs: vb, vc - 4x4S or 4x8H vector register sequences
5670 // vq - 2x4S or 2x8H constants <MONT_Q, MONT_Q_INV_MOD_R>
5671 // Temps: vtmp - 4x4S or 4x8H vector sequence trashed after call
5672 // Outputs: va - 4x4S or 4x8H vector register sequences
5673 // vb, vc, vtmp and vq must all be disjoint
5674 // va must be disjoint from all other inputs/temps or must equal vc
5675 // va must have a non-zero delta i.e. it must not be a constant vseq.
5676 // n.b. MONT_R_BITS is 16 or 32, so the right shift by it is implicit.
5677 void vs_montmul2(const VSeq<2>& va, const VSeq<2>& vb, const VSeq<2>& vc,
5678 Assembler::SIMD_Arrangement T,
5679 const VSeq<2>& vtmp, const VSeq<2>& vq) {
5680 assert (T == __ T4S || T == __ T8H, "invalid arrangement for montmul");
5681 assert(vs_disjoint(vb, vc), "vb and vc overlap");
5682 assert(vs_disjoint(vb, vq), "vb and vq overlap");
5683 assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
5684
5685 assert(vs_disjoint(vc, vq), "vc and vq overlap");
5686 assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
5687
5688 assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
5689
5690 assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
5691 assert(vs_disjoint(va, vb), "va and vb overlap");
5692 assert(vs_disjoint(va, vq), "va and vq overlap");
5693 assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
5694 assert(!va.is_constant(), "output vector must identify 2 different registers");
5695
5696 // schedule 2 streams of instructions across the vector sequences
5697 for (int i = 0; i < 2; i++) {
5698 __ sqdmulh(vtmp[i], T, vb[i], vc[i]); // aHigh = hi32(2 * b * c)
5699 __ mulv(va[i], T, vb[i], vc[i]); // aLow = lo32(b * c)
5700 }
5701
5702 for (int i = 0; i < 2; i++) {
5703 __ mulv(va[i], T, va[i], vq[0]); // m = aLow * qinv
5704 }
5705
5706 for (int i = 0; i < 2; i++) {
5707 __ sqdmulh(va[i], T, va[i], vq[1]); // n = hi32(2 * m * q)
5708 }
5709
5710 for (int i = 0; i < 2; i++) {
5711 __ shsubv(va[i], T, vtmp[i], va[i]); // a = (aHigh - n) / 2
5712 }
5713 }
5714
5715 // Perform 16 16-bit Montgomery multiplications in parallel.
5716 void kyber_montmul16(const VSeq<2>& va, const VSeq<2>& vb, const VSeq<2>& vc,
5717 const VSeq<2>& vtmp, const VSeq<2>& vq) {
5718 // Use the helper routine to schedule a 2x8H Montgomery multiply.
5719 // It will assert that the register use is valid
5720 vs_montmul2(va, vb, vc, __ T8H, vtmp, vq);
5721 }
5722
5723 // Perform 32 16-bit Montgomery multiplications in parallel.
5724 void kyber_montmul32(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
5725 const VSeq<4>& vtmp, const VSeq<2>& vq) {
5726 // Use the helper routine to schedule a 4x8H Montgomery multiply.
5727 // It will assert that the register use is valid
5728 vs_montmul4(va, vb, vc, __ T8H, vtmp, vq);
5729 }
5730
5731 // Perform 64 16-bit Montgomery multiplications in parallel.
5732 void kyber_montmul64(const VSeq<8>& va, const VSeq<8>& vb, const VSeq<8>& vc,
5733 const VSeq<4>& vtmp, const VSeq<2>& vq) {
5734 // Schedule two successive 4x8H multiplies via the montmul helper
5735 // on the front and back halves of va, vb and vc. The helper will
5736 // assert that the register use has no overlap conflicts on each
5737 // individual call but we also need to ensure that the necessary
5738 // disjoint/equality constraints are met across both calls.
5739
5740 // vb, vc, vtmp and vq must be disjoint. va must either be
5741 // disjoint from all other registers or equal vc
5742
5743 assert(vs_disjoint(vb, vc), "vb and vc overlap");
5744 assert(vs_disjoint(vb, vq), "vb and vq overlap");
5745 assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
5746
5747 assert(vs_disjoint(vc, vq), "vc and vq overlap");
5748 assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
5749
5750 assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
5751
5752 assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
5753 assert(vs_disjoint(va, vb), "va and vb overlap");
5754 assert(vs_disjoint(va, vq), "va and vq overlap");
5755 assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
5756
5757 // we multiply the front and back halves of each sequence 4 at a
5758 // time because
5759 //
5760 // 1) we are currently only able to get 4-way instruction
5761 // parallelism at best
5762 //
5763 // 2) we need registers for the constants in vq and temporary
5764 // scratch registers to hold intermediate results so vtmp can only
5765 // be a VSeq<4> which means we only have 4 scratch slots
5766
5767 vs_montmul4(vs_front(va), vs_front(vb), vs_front(vc), __ T8H, vtmp, vq);
5768 vs_montmul4(vs_back(va), vs_back(vb), vs_back(vc), __ T8H, vtmp, vq);
5769 }
5770
5771 void kyber_montmul32_sub_add(const VSeq<4>& va0, const VSeq<4>& va1,
5772 const VSeq<4>& vc,
5773 const VSeq<4>& vtmp,
5774 const VSeq<2>& vq) {
5775 // compute a = montmul(a1, c)
5776 kyber_montmul32(vc, va1, vc, vtmp, vq);
5777 // ouptut a1 = a0 - a
5778 vs_subv(va1, __ T8H, va0, vc);
5779 // and a0 = a0 + a
5780 vs_addv(va0, __ T8H, va0, vc);
5781 }
5782
5783 void kyber_sub_add_montmul32(const VSeq<4>& va0, const VSeq<4>& va1,
5784 const VSeq<4>& vb,
5785 const VSeq<4>& vtmp1,
5786 const VSeq<4>& vtmp2,
5787 const VSeq<2>& vq) {
5788 // compute c = a0 - a1
5789 vs_subv(vtmp1, __ T8H, va0, va1);
5790 // output a0 = a0 + a1
5791 vs_addv(va0, __ T8H, va0, va1);
5792 // output a1 = b montmul c
5793 kyber_montmul32(va1, vtmp1, vb, vtmp2, vq);
5794 }
5795
5796 void load64shorts(const VSeq<8>& v, Register shorts) {
5797 vs_ldpq_post(v, shorts);
5798 }
5799
5800 void load32shorts(const VSeq<4>& v, Register shorts) {
5801 vs_ldpq_post(v, shorts);
5802 }
5803
5804 void store64shorts(VSeq<8> v, Register tmpAddr) {
5805 vs_stpq_post(v, tmpAddr);
5806 }
5807
5808 // Kyber NTT function.
5809 // Implements
5810 // static int implKyberNtt(short[] poly, short[] ntt_zetas) {}
5811 //
5812 // coeffs (short[256]) = c_rarg0
5813 // ntt_zetas (short[256]) = c_rarg1
5814 address generate_kyberNtt() {
5815 StubId stub_id = StubId::stubgen_kyberNtt_id;
5816 int entry_count = StubInfo::entry_count(stub_id);
5817 assert(entry_count == 1, "sanity check");
5818 address start = load_archive_data(stub_id);
5819 if (start != nullptr) {
5820 return start;
5821 }
5822 __ align(CodeEntryAlignment);
5823 StubCodeMark mark(this, stub_id);
5824 start = __ pc();
5825 __ enter();
5826
5827 const Register coeffs = c_rarg0;
5828 const Register zetas = c_rarg1;
5829
5830 const Register kyberConsts = r10;
5831 const Register tmpAddr = r11;
5832
5833 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x8H inputs/outputs
5834 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
5835 VSeq<2> vq(30); // n.b. constants overlap vs3
5836
5837 __ lea(kyberConsts, ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
5838 // load the montmul constants
5839 vs_ldpq(vq, kyberConsts);
5840
5841 // Each level corresponds to an iteration of the outermost loop of the
5842 // Java method seilerNTT(int[] coeffs). There are some differences
5843 // from what is done in the seilerNTT() method, though:
5844 // 1. The computation is using 16-bit signed values, we do not convert them
5845 // to ints here.
5846 // 2. The zetas are delivered in a bigger array, 128 zetas are stored in
5847 // this array for each level, it is easier that way to fill up the vector
5848 // registers.
5849 // 3. In the seilerNTT() method we use R = 2^20 for the Montgomery
5850 // multiplications (this is because that way there should not be any
5851 // overflow during the inverse NTT computation), here we use R = 2^16 so
5852 // that we can use the 16-bit arithmetic in the vector unit.
5853 //
5854 // On each level, we fill up the vector registers in such a way that the
5855 // array elements that need to be multiplied by the zetas go into one
5856 // set of vector registers while the corresponding ones that don't need to
5857 // be multiplied, go into another set.
5858 // We can do 32 Montgomery multiplications in parallel, using 12 vector
5859 // registers interleaving the steps of 4 identical computations,
5860 // each done on 8 16-bit values per register.
5861
5862 // At levels 0-3 the coefficients multiplied by or added/subtracted
5863 // to the zetas occur in discrete blocks whose size is some multiple
5864 // of 32.
5865
5866 // level 0
5867 __ add(tmpAddr, coeffs, 256);
5868 load64shorts(vs1, tmpAddr);
5869 load64shorts(vs2, zetas);
5870 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5871 __ add(tmpAddr, coeffs, 0);
5872 load64shorts(vs1, tmpAddr);
5873 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5874 vs_addv(vs1, __ T8H, vs1, vs2);
5875 __ add(tmpAddr, coeffs, 0);
5876 vs_stpq_post(vs1, tmpAddr);
5877 __ add(tmpAddr, coeffs, 256);
5878 vs_stpq_post(vs3, tmpAddr);
5879 // restore montmul constants
5880 vs_ldpq(vq, kyberConsts);
5881 load64shorts(vs1, tmpAddr);
5882 load64shorts(vs2, zetas);
5883 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5884 __ add(tmpAddr, coeffs, 128);
5885 load64shorts(vs1, tmpAddr);
5886 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5887 vs_addv(vs1, __ T8H, vs1, vs2);
5888 __ add(tmpAddr, coeffs, 128);
5889 store64shorts(vs1, tmpAddr);
5890 __ add(tmpAddr, coeffs, 384);
5891 store64shorts(vs3, tmpAddr);
5892
5893 // level 1
5894 // restore montmul constants
5895 vs_ldpq(vq, kyberConsts);
5896 __ add(tmpAddr, coeffs, 128);
5897 load64shorts(vs1, tmpAddr);
5898 load64shorts(vs2, zetas);
5899 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5900 __ add(tmpAddr, coeffs, 0);
5901 load64shorts(vs1, tmpAddr);
5902 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5903 vs_addv(vs1, __ T8H, vs1, vs2);
5904 __ add(tmpAddr, coeffs, 0);
5905 store64shorts(vs1, tmpAddr);
5906 store64shorts(vs3, tmpAddr);
5907 vs_ldpq(vq, kyberConsts);
5908 __ add(tmpAddr, coeffs, 384);
5909 load64shorts(vs1, tmpAddr);
5910 load64shorts(vs2, zetas);
5911 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5912 __ add(tmpAddr, coeffs, 256);
5913 load64shorts(vs1, tmpAddr);
5914 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5915 vs_addv(vs1, __ T8H, vs1, vs2);
5916 __ add(tmpAddr, coeffs, 256);
5917 store64shorts(vs1, tmpAddr);
5918 store64shorts(vs3, tmpAddr);
5919
5920 // level 2
5921 vs_ldpq(vq, kyberConsts);
5922 int offsets1[4] = { 0, 32, 128, 160 };
5923 vs_ldpq_indexed(vs1, coeffs, 64, offsets1);
5924 load64shorts(vs2, zetas);
5925 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5926 vs_ldpq_indexed(vs1, coeffs, 0, offsets1);
5927 // kyber_subv_addv64();
5928 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5929 vs_addv(vs1, __ T8H, vs1, vs2);
5930 __ add(tmpAddr, coeffs, 0);
5931 vs_stpq_post(vs_front(vs1), tmpAddr);
5932 vs_stpq_post(vs_front(vs3), tmpAddr);
5933 vs_stpq_post(vs_back(vs1), tmpAddr);
5934 vs_stpq_post(vs_back(vs3), tmpAddr);
5935 vs_ldpq(vq, kyberConsts);
5936 vs_ldpq_indexed(vs1, tmpAddr, 64, offsets1);
5937 load64shorts(vs2, zetas);
5938 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5939 vs_ldpq_indexed(vs1, coeffs, 256, offsets1);
5940 // kyber_subv_addv64();
5941 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5942 vs_addv(vs1, __ T8H, vs1, vs2);
5943 __ add(tmpAddr, coeffs, 256);
5944 vs_stpq_post(vs_front(vs1), tmpAddr);
5945 vs_stpq_post(vs_front(vs3), tmpAddr);
5946 vs_stpq_post(vs_back(vs1), tmpAddr);
5947 vs_stpq_post(vs_back(vs3), tmpAddr);
5948
5949 // level 3
5950 vs_ldpq(vq, kyberConsts);
5951 int offsets2[4] = { 0, 64, 128, 192 };
5952 vs_ldpq_indexed(vs1, coeffs, 32, offsets2);
5953 load64shorts(vs2, zetas);
5954 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5955 vs_ldpq_indexed(vs1, coeffs, 0, offsets2);
5956 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5957 vs_addv(vs1, __ T8H, vs1, vs2);
5958 vs_stpq_indexed(vs1, coeffs, 0, offsets2);
5959 vs_stpq_indexed(vs3, coeffs, 32, offsets2);
5960
5961 vs_ldpq(vq, kyberConsts);
5962 vs_ldpq_indexed(vs1, coeffs, 256 + 32, offsets2);
5963 load64shorts(vs2, zetas);
5964 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5965 vs_ldpq_indexed(vs1, coeffs, 256, offsets2);
5966 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5967 vs_addv(vs1, __ T8H, vs1, vs2);
5968 vs_stpq_indexed(vs1, coeffs, 256, offsets2);
5969 vs_stpq_indexed(vs3, coeffs, 256 + 32, offsets2);
5970
5971 // level 4
5972 // At level 4 coefficients occur in 8 discrete blocks of size 16
5973 // so they are loaded by employing an ldr at 8 distinct offsets.
5974
5975 vs_ldpq(vq, kyberConsts);
5976 int offsets3[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
5977 vs_ldr_indexed(vs1, __ Q, coeffs, 16, offsets3);
5978 load64shorts(vs2, zetas);
5979 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5980 vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
5981 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5982 vs_addv(vs1, __ T8H, vs1, vs2);
5983 vs_str_indexed(vs1, __ Q, coeffs, 0, offsets3);
5984 vs_str_indexed(vs3, __ Q, coeffs, 16, offsets3);
5985
5986 vs_ldpq(vq, kyberConsts);
5987 vs_ldr_indexed(vs1, __ Q, coeffs, 256 + 16, offsets3);
5988 load64shorts(vs2, zetas);
5989 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5990 vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3);
5991 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5992 vs_addv(vs1, __ T8H, vs1, vs2);
5993 vs_str_indexed(vs1, __ Q, coeffs, 256, offsets3);
5994 vs_str_indexed(vs3, __ Q, coeffs, 256 + 16, offsets3);
5995
5996 // level 5
5997 // At level 5 related coefficients occur in discrete blocks of size 8 so
5998 // need to be loaded interleaved using an ld2 operation with arrangement 2D.
5999
6000 vs_ldpq(vq, kyberConsts);
6001 int offsets4[4] = { 0, 32, 64, 96 };
6002 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
6003 load32shorts(vs_front(vs2), zetas);
6004 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
6005 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
6006 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
6007 load32shorts(vs_front(vs2), zetas);
6008 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
6009 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
6010 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
6011 load32shorts(vs_front(vs2), zetas);
6012 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
6013 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
6014
6015 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
6016 load32shorts(vs_front(vs2), zetas);
6017 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
6018 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
6019
6020 // level 6
6021 // At level 6 related coefficients occur in discrete blocks of size 4 so
6022 // need to be loaded interleaved using an ld2 operation with arrangement 4S.
6023
6024 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
6025 load32shorts(vs_front(vs2), zetas);
6026 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
6027 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
6028 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
6029 load32shorts(vs_front(vs2), zetas);
6030 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
6031 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
6032
6033 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
6034 load32shorts(vs_front(vs2), zetas);
6035 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
6036 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
6037
6038 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
6039 load32shorts(vs_front(vs2), zetas);
6040 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
6041 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
6042
6043 __ leave(); // required for proper stackwalking of RuntimeStub frame
6044 __ mov(r0, zr); // return 0 (Java callees return 1. Caller ignores the return value)
6045 __ ret(lr);
6046
6047 // record the stub entry and end
6048 store_archive_data(stub_id, start, __ pc());
6049
6050 return start;
6051 }
6052
6053 // Kyber Inverse NTT function
6054 // Implements
6055 // static int implKyberInverseNtt(short[] poly, short[] zetas) {}
6056 //
6057 // coeffs (short[256]) = c_rarg0
6058 // ntt_zetas (short[256]) = c_rarg1
6059 address generate_kyberInverseNtt() {
6060 StubId stub_id = StubId::stubgen_kyberInverseNtt_id;
6061 int entry_count = StubInfo::entry_count(stub_id);
6062 assert(entry_count == 1, "sanity check");
6063 address start = load_archive_data(stub_id);
6064 if (start != nullptr) {
6065 return start;
6066 }
6067 __ align(CodeEntryAlignment);
6068 StubCodeMark mark(this, stub_id);
6069 start = __ pc();
6070 __ enter();
6071
6072 const Register coeffs = c_rarg0;
6073 const Register zetas = c_rarg1;
6074
6075 const Register kyberConsts = r10;
6076 const Register tmpAddr = r11;
6077 const Register tmpAddr2 = c_rarg2;
6078
6079 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x8H inputs/outputs
6080 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
6081 VSeq<2> vq(30); // n.b. constants overlap vs3
6082
6083 __ lea(kyberConsts,
6084 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
6085
6086 // level 0
6087 // At level 0 related coefficients occur in discrete blocks of size 4 so
6088 // need to be loaded interleaved using an ld2 operation with arrangement 4S.
6089
6090 vs_ldpq(vq, kyberConsts);
6091 int offsets4[4] = { 0, 32, 64, 96 };
6092 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
6093 load32shorts(vs_front(vs2), zetas);
6094 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
6095 vs_front(vs2), vs_back(vs2), vtmp, vq);
6096 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
6097 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
6098 load32shorts(vs_front(vs2), zetas);
6099 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
6100 vs_front(vs2), vs_back(vs2), vtmp, vq);
6101 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
6102 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
6103 load32shorts(vs_front(vs2), zetas);
6104 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
6105 vs_front(vs2), vs_back(vs2), vtmp, vq);
6106 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
6107 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
6108 load32shorts(vs_front(vs2), zetas);
6109 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
6110 vs_front(vs2), vs_back(vs2), vtmp, vq);
6111 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
6112
6113 // level 1
6114 // At level 1 related coefficients occur in discrete blocks of size 8 so
6115 // need to be loaded interleaved using an ld2 operation with arrangement 2D.
6116
6117 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
6118 load32shorts(vs_front(vs2), zetas);
6119 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
6120 vs_front(vs2), vs_back(vs2), vtmp, vq);
6121 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
6122 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
6123 load32shorts(vs_front(vs2), zetas);
6124 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
6125 vs_front(vs2), vs_back(vs2), vtmp, vq);
6126 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
6127
6128 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
6129 load32shorts(vs_front(vs2), zetas);
6130 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
6131 vs_front(vs2), vs_back(vs2), vtmp, vq);
6132 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
6133 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
6134 load32shorts(vs_front(vs2), zetas);
6135 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
6136 vs_front(vs2), vs_back(vs2), vtmp, vq);
6137 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
6138
6139 // level 2
6140 // At level 2 coefficients occur in 8 discrete blocks of size 16
6141 // so they are loaded by employing an ldr at 8 distinct offsets.
6142
6143 int offsets3[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
6144 vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
6145 vs_ldr_indexed(vs2, __ Q, coeffs, 16, offsets3);
6146 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
6147 vs_subv(vs1, __ T8H, vs1, vs2);
6148 vs_str_indexed(vs3, __ Q, coeffs, 0, offsets3);
6149 load64shorts(vs2, zetas);
6150 vs_ldpq(vq, kyberConsts);
6151 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
6152 vs_str_indexed(vs2, __ Q, coeffs, 16, offsets3);
6153
6154 vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3);
6155 vs_ldr_indexed(vs2, __ Q, coeffs, 256 + 16, offsets3);
6156 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
6157 vs_subv(vs1, __ T8H, vs1, vs2);
6158 vs_str_indexed(vs3, __ Q, coeffs, 256, offsets3);
6159 load64shorts(vs2, zetas);
6160 vs_ldpq(vq, kyberConsts);
6161 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
6162 vs_str_indexed(vs2, __ Q, coeffs, 256 + 16, offsets3);
6163
6164 // Barrett reduction at indexes where overflow may happen
6165
6166 // load q and the multiplier for the Barrett reduction
6167 __ add(tmpAddr, kyberConsts, 16);
6168 vs_ldpq(vq, tmpAddr);
6169
6170 VSeq<8> vq1 = VSeq<8>(vq[0], 0); // 2 constant 8 sequences
6171 VSeq<8> vq2 = VSeq<8>(vq[1], 0); // for above two kyber constants
6172 VSeq<8> vq3 = VSeq<8>(v29, 0); // 3rd sequence for const montmul
6173 vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
6174 vs_sqdmulh(vs2, __ T8H, vs1, vq2);
6175 vs_sshr(vs2, __ T8H, vs2, 11);
6176 vs_mlsv(vs1, __ T8H, vs2, vq1);
6177 vs_str_indexed(vs1, __ Q, coeffs, 0, offsets3);
6178 vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3);
6179 vs_sqdmulh(vs2, __ T8H, vs1, vq2);
6180 vs_sshr(vs2, __ T8H, vs2, 11);
6181 vs_mlsv(vs1, __ T8H, vs2, vq1);
6182 vs_str_indexed(vs1, __ Q, coeffs, 256, offsets3);
6183
6184 // level 3
6185 // From level 3 upwards coefficients occur in discrete blocks whose size is
6186 // some multiple of 32 so can be loaded using ldpq and suitable indexes.
6187
6188 int offsets2[4] = { 0, 64, 128, 192 };
6189 vs_ldpq_indexed(vs1, coeffs, 0, offsets2);
6190 vs_ldpq_indexed(vs2, coeffs, 32, offsets2);
6191 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
6192 vs_subv(vs1, __ T8H, vs1, vs2);
6193 vs_stpq_indexed(vs3, coeffs, 0, offsets2);
6194 load64shorts(vs2, zetas);
6195 vs_ldpq(vq, kyberConsts);
6196 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
6197 vs_stpq_indexed(vs2, coeffs, 32, offsets2);
6198
6199 vs_ldpq_indexed(vs1, coeffs, 256, offsets2);
6200 vs_ldpq_indexed(vs2, coeffs, 256 + 32, offsets2);
6201 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
6202 vs_subv(vs1, __ T8H, vs1, vs2);
6203 vs_stpq_indexed(vs3, coeffs, 256, offsets2);
6204 load64shorts(vs2, zetas);
6205 vs_ldpq(vq, kyberConsts);
6206 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
6207 vs_stpq_indexed(vs2, coeffs, 256 + 32, offsets2);
6208
6209 // level 4
6210
6211 int offsets1[4] = { 0, 32, 128, 160 };
6212 vs_ldpq_indexed(vs1, coeffs, 0, offsets1);
6213 vs_ldpq_indexed(vs2, coeffs, 64, offsets1);
6214 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
6215 vs_subv(vs1, __ T8H, vs1, vs2);
6216 vs_stpq_indexed(vs3, coeffs, 0, offsets1);
6217 load64shorts(vs2, zetas);
6218 vs_ldpq(vq, kyberConsts);
6219 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
6220 vs_stpq_indexed(vs2, coeffs, 64, offsets1);
6221
6222 vs_ldpq_indexed(vs1, coeffs, 256, offsets1);
6223 vs_ldpq_indexed(vs2, coeffs, 256 + 64, offsets1);
6224 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
6225 vs_subv(vs1, __ T8H, vs1, vs2);
6226 vs_stpq_indexed(vs3, coeffs, 256, offsets1);
6227 load64shorts(vs2, zetas);
6228 vs_ldpq(vq, kyberConsts);
6229 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
6230 vs_stpq_indexed(vs2, coeffs, 256 + 64, offsets1);
6231
6232 // level 5
6233
6234 __ add(tmpAddr, coeffs, 0);
6235 load64shorts(vs1, tmpAddr);
6236 __ add(tmpAddr, coeffs, 128);
6237 load64shorts(vs2, tmpAddr);
6238 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
6239 vs_subv(vs1, __ T8H, vs1, vs2);
6240 __ add(tmpAddr, coeffs, 0);
6241 store64shorts(vs3, tmpAddr);
6242 load64shorts(vs2, zetas);
6243 vs_ldpq(vq, kyberConsts);
6244 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
6245 __ add(tmpAddr, coeffs, 128);
6246 store64shorts(vs2, tmpAddr);
6247
6248 load64shorts(vs1, tmpAddr);
6249 __ add(tmpAddr, coeffs, 384);
6250 load64shorts(vs2, tmpAddr);
6251 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
6252 vs_subv(vs1, __ T8H, vs1, vs2);
6253 __ add(tmpAddr, coeffs, 256);
6254 store64shorts(vs3, tmpAddr);
6255 load64shorts(vs2, zetas);
6256 vs_ldpq(vq, kyberConsts);
6257 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
6258 __ add(tmpAddr, coeffs, 384);
6259 store64shorts(vs2, tmpAddr);
6260
6261 // Barrett reduction at indexes where overflow may happen
6262
6263 // load q and the multiplier for the Barrett reduction
6264 __ add(tmpAddr, kyberConsts, 16);
6265 vs_ldpq(vq, tmpAddr);
6266
6267 int offsets0[2] = { 0, 256 };
6268 vs_ldpq_indexed(vs_front(vs1), coeffs, 0, offsets0);
6269 vs_sqdmulh(vs2, __ T8H, vs1, vq2);
6270 vs_sshr(vs2, __ T8H, vs2, 11);
6271 vs_mlsv(vs1, __ T8H, vs2, vq1);
6272 vs_stpq_indexed(vs_front(vs1), coeffs, 0, offsets0);
6273
6274 // level 6
6275
6276 __ add(tmpAddr, coeffs, 0);
6277 load64shorts(vs1, tmpAddr);
6278 __ add(tmpAddr, coeffs, 256);
6279 load64shorts(vs2, tmpAddr);
6280 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
6281 vs_subv(vs1, __ T8H, vs1, vs2);
6282 __ add(tmpAddr, coeffs, 0);
6283 store64shorts(vs3, tmpAddr);
6284 load64shorts(vs2, zetas);
6285 vs_ldpq(vq, kyberConsts);
6286 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
6287 __ add(tmpAddr, coeffs, 256);
6288 store64shorts(vs2, tmpAddr);
6289
6290 __ add(tmpAddr, coeffs, 128);
6291 load64shorts(vs1, tmpAddr);
6292 __ add(tmpAddr, coeffs, 384);
6293 load64shorts(vs2, tmpAddr);
6294 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
6295 vs_subv(vs1, __ T8H, vs1, vs2);
6296 __ add(tmpAddr, coeffs, 128);
6297 store64shorts(vs3, tmpAddr);
6298 load64shorts(vs2, zetas);
6299 vs_ldpq(vq, kyberConsts);
6300 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
6301 __ add(tmpAddr, coeffs, 384);
6302 store64shorts(vs2, tmpAddr);
6303
6304 // multiply by 2^-n
6305
6306 // load toMont(2^-n mod q)
6307 __ add(tmpAddr, kyberConsts, 48);
6308 __ ldr(v29, __ Q, tmpAddr);
6309
6310 vs_ldpq(vq, kyberConsts);
6311 __ add(tmpAddr, coeffs, 0);
6312 load64shorts(vs1, tmpAddr);
6313 kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
6314 __ add(tmpAddr, coeffs, 0);
6315 store64shorts(vs2, tmpAddr);
6316
6317 // now tmpAddr contains coeffs + 128 because store64shorts adjusted it so
6318 load64shorts(vs1, tmpAddr);
6319 kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
6320 __ add(tmpAddr, coeffs, 128);
6321 store64shorts(vs2, tmpAddr);
6322
6323 // now tmpAddr contains coeffs + 256
6324 load64shorts(vs1, tmpAddr);
6325 kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
6326 __ add(tmpAddr, coeffs, 256);
6327 store64shorts(vs2, tmpAddr);
6328
6329 // now tmpAddr contains coeffs + 384
6330 load64shorts(vs1, tmpAddr);
6331 kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
6332 __ add(tmpAddr, coeffs, 384);
6333 store64shorts(vs2, tmpAddr);
6334
6335 __ leave(); // required for proper stackwalking of RuntimeStub frame
6336 __ mov(r0, zr); // return 0 (Java callees return 1. Caller ignores the return value)
6337 __ ret(lr);
6338
6339 // record the stub entry and end
6340 store_archive_data(stub_id, start, __ pc());
6341
6342 return start;
6343 }
6344
6345 // Kyber multiply polynomials in the NTT domain.
6346 // Implements
6347 // static int implKyberNttMult(
6348 // short[] result, short[] ntta, short[] nttb, short[] zetas) {}
6349 //
6350 // The actual algorithm that is used here differs from the one in the Java
6351 // implementation, it uses Montgomery multiplications instead of Barrett
6352 // reduction, but the end result modulo MLKEM_Q is the same. This is the
6353 // Java equivalent of this intrinsic implementation:
6354 // static void implKyberNttMultJava(short[] result, short[] ntta, short[] nttb) {
6355 // for (int m = 0; m < ML_KEM_N / 2; m++) {
6356 // int a0 = ntta[2 * m];
6357 // int a1 = ntta[2 * m + 1];
6358 // int b0 = nttb[2 * m];
6359 // int b1 = nttb[2 * m + 1];
6360 // int r = montMul(a0, b0) +
6361 // montMul(montMul(a1, b1), MONT_ZETAS_FOR_NTT_MULT[m]);
6362 // result[2 * m] = (short) montMul(r, MONT_R_SQUARE_MOD_Q);
6363 // result[2 * m + 1] = (short) montMul(
6364 // (montMul(a0, b1) + montMul(a1, b0)), MONT_R_SQUARE_MOD_Q);
6365 // }
6366 // }
6367 //
6368 // result (short[256]) = c_rarg0
6369 // ntta (short[256]) = c_rarg1
6370 // nttb (short[256]) = c_rarg2
6371 // zetas (short[128]) = c_rarg3
6372 address generate_kyberNttMult() {
6373 StubId stub_id = StubId::stubgen_kyberNttMult_id;
6374 int entry_count = StubInfo::entry_count(stub_id);
6375 assert(entry_count == 1, "sanity check");
6376 address start = load_archive_data(stub_id);
6377 if (start != nullptr) {
6378 return start;
6379 }
6380 __ align(CodeEntryAlignment);
6381 StubCodeMark mark(this, stub_id);
6382 start = __ pc();
6383 __ enter();
6384
6385 const Register result = c_rarg0;
6386 const Register ntta = c_rarg1;
6387 const Register nttb = c_rarg2;
6388 const Register zetas = c_rarg3;
6389
6390 const Register kyberConsts = r10;
6391 const Register limit = r11;
6392
6393 VSeq<4> vs1(0), vs2(4); // 4 sets of 8x8H inputs/outputs/tmps
6394 VSeq<4> vs3(16), vs4(20);
6395 VSeq<2> vq(30); // pair of constants for montmul: q, qinv
6396 VSeq<2> vz(28); // pair of zetas
6397 VSeq<4> vc(27, 0); // constant sequence for montmul: montRSquareModQ
6398
6399 __ lea(kyberConsts,
6400 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
6401
6402 Label kyberNttMult_loop;
6403
6404 __ add(limit, result, 512);
6405
6406 // load q and qinv
6407 vs_ldpq(vq, kyberConsts);
6408
6409 // load R^2 mod q (to convert back from Montgomery representation)
6410 __ add(kyberConsts, kyberConsts, 64);
6411 __ ldr(v27, __ Q, kyberConsts);
6412
6413 __ BIND(kyberNttMult_loop);
6414
6415 // load 16 zetas
6416 vs_ldpq_post(vz, zetas);
6417
6418 // load 2 sets of 32 coefficients from the two input arrays
6419 // interleaved as shorts. i.e. pairs of shorts adjacent in memory
6420 // are striped across pairs of vector registers
6421 vs_ld2_post(vs_front(vs1), __ T8H, ntta); // <a0, a1> x 8H
6422 vs_ld2_post(vs_back(vs1), __ T8H, nttb); // <b0, b1> x 8H
6423 vs_ld2_post(vs_front(vs4), __ T8H, ntta); // <a2, a3> x 8H
6424 vs_ld2_post(vs_back(vs4), __ T8H, nttb); // <b2, b3> x 8H
6425
6426 // compute 4 montmul cross-products for pairs (a0,a1) and (b0,b1)
6427 // i.e. montmul the first and second halves of vs1 in order and
6428 // then with one sequence reversed storing the two results in vs3
6429 //
6430 // vs3[0] <- montmul(a0, b0)
6431 // vs3[1] <- montmul(a1, b1)
6432 // vs3[2] <- montmul(a0, b1)
6433 // vs3[3] <- montmul(a1, b0)
6434 kyber_montmul16(vs_front(vs3), vs_front(vs1), vs_back(vs1), vs_front(vs2), vq);
6435 kyber_montmul16(vs_back(vs3),
6436 vs_front(vs1), vs_reverse(vs_back(vs1)), vs_back(vs2), vq);
6437
6438 // compute 4 montmul cross-products for pairs (a2,a3) and (b2,b3)
6439 // i.e. montmul the first and second halves of vs4 in order and
6440 // then with one sequence reversed storing the two results in vs1
6441 //
6442 // vs1[0] <- montmul(a2, b2)
6443 // vs1[1] <- montmul(a3, b3)
6444 // vs1[2] <- montmul(a2, b3)
6445 // vs1[3] <- montmul(a3, b2)
6446 kyber_montmul16(vs_front(vs1), vs_front(vs4), vs_back(vs4), vs_front(vs2), vq);
6447 kyber_montmul16(vs_back(vs1),
6448 vs_front(vs4), vs_reverse(vs_back(vs4)), vs_back(vs2), vq);
6449
6450 // montmul result 2 of each cross-product i.e. (a1*b1, a3*b3) by a zeta.
6451 // We can schedule two montmuls at a time if we use a suitable vector
6452 // sequence <vs3[1], vs1[1]>.
6453 int delta = vs1[1]->encoding() - vs3[1]->encoding();
6454 VSeq<2> vs5(vs3[1], delta);
6455
6456 // vs3[1] <- montmul(montmul(a1, b1), z0)
6457 // vs1[1] <- montmul(montmul(a3, b3), z1)
6458 kyber_montmul16(vs5, vz, vs5, vs_front(vs2), vq);
6459
6460 // add results in pairs storing in vs3
6461 // vs3[0] <- montmul(a0, b0) + montmul(montmul(a1, b1), z0);
6462 // vs3[1] <- montmul(a0, b1) + montmul(a1, b0);
6463 vs_addv(vs_front(vs3), __ T8H, vs_even(vs3), vs_odd(vs3));
6464
6465 // vs3[2] <- montmul(a2, b2) + montmul(montmul(a3, b3), z1);
6466 // vs3[3] <- montmul(a2, b3) + montmul(a3, b2);
6467 vs_addv(vs_back(vs3), __ T8H, vs_even(vs1), vs_odd(vs1));
6468
6469 // vs1 <- montmul(vs3, montRSquareModQ)
6470 kyber_montmul32(vs1, vs3, vc, vs2, vq);
6471
6472 // store back the two pairs of result vectors de-interleaved as 8H elements
6473 // i.e. storing each pairs of shorts striped across a register pair adjacent
6474 // in memory
6475 vs_st2_post(vs1, __ T8H, result);
6476
6477 __ cmp(result, limit);
6478 __ br(Assembler::NE, kyberNttMult_loop);
6479
6480 __ leave(); // required for proper stackwalking of RuntimeStub frame
6481 __ mov(r0, zr); // return 0 (Java callees return 1. Caller ignores the return value)
6482 __ ret(lr);
6483
6484 // record the stub entry and end
6485 store_archive_data(stub_id, start, __ pc());
6486
6487 return start;
6488 }
6489
6490 // Kyber add 2 polynomials.
6491 // Implements
6492 // static int implKyberAddPoly(short[] result, short[] a, short[] b) {}
6493 //
6494 // result (short[256]) = c_rarg0
6495 // a (short[256]) = c_rarg1
6496 // b (short[256]) = c_rarg2
6497 address generate_kyberAddPoly_2() {
6498 StubId stub_id = StubId::stubgen_kyberAddPoly_2_id;
6499 int entry_count = StubInfo::entry_count(stub_id);
6500 assert(entry_count == 1, "sanity check");
6501 address start = load_archive_data(stub_id);
6502 if (start != nullptr) {
6503 return start;
6504 }
6505 __ align(CodeEntryAlignment);
6506 StubCodeMark mark(this, stub_id);
6507 start = __ pc();
6508 __ enter();
6509
6510 const Register result = c_rarg0;
6511 const Register a = c_rarg1;
6512 const Register b = c_rarg2;
6513
6514 const Register kyberConsts = r11;
6515
6516 // We sum 256 sets of values in total i.e. 32 x 8H quadwords.
6517 // So, we can load, add and store the data in 3 groups of 11,
6518 // 11 and 10 at a time i.e. we need to map sets of 10 or 11
6519 // registers. A further constraint is that the mapping needs
6520 // to skip callee saves. So, we allocate the register
6521 // sequences using two 8 sequences, two 2 sequences and two
6522 // single registers.
6523 VSeq<8> vs1_1(0);
6524 VSeq<2> vs1_2(16);
6525 FloatRegister vs1_3 = v28;
6526 VSeq<8> vs2_1(18);
6527 VSeq<2> vs2_2(26);
6528 FloatRegister vs2_3 = v29;
6529
6530 // two constant vector sequences
6531 VSeq<8> vc_1(31, 0);
6532 VSeq<2> vc_2(31, 0);
6533
6534 FloatRegister vc_3 = v31;
6535 __ lea(kyberConsts,
6536 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
6537
6538 __ ldr(vc_3, __ Q, Address(kyberConsts, 16)); // q
6539 for (int i = 0; i < 3; i++) {
6540 // load 80 or 88 values from a into vs1_1/2/3
6541 vs_ldpq_post(vs1_1, a);
6542 vs_ldpq_post(vs1_2, a);
6543 if (i < 2) {
6544 __ ldr(vs1_3, __ Q, __ post(a, 16));
6545 }
6546 // load 80 or 88 values from b into vs2_1/2/3
6547 vs_ldpq_post(vs2_1, b);
6548 vs_ldpq_post(vs2_2, b);
6549 if (i < 2) {
6550 __ ldr(vs2_3, __ Q, __ post(b, 16));
6551 }
6552 // sum 80 or 88 values across vs1 and vs2 into vs1
6553 vs_addv(vs1_1, __ T8H, vs1_1, vs2_1);
6554 vs_addv(vs1_2, __ T8H, vs1_2, vs2_2);
6555 if (i < 2) {
6556 __ addv(vs1_3, __ T8H, vs1_3, vs2_3);
6557 }
6558 // add constant to all 80 or 88 results
6559 vs_addv(vs1_1, __ T8H, vs1_1, vc_1);
6560 vs_addv(vs1_2, __ T8H, vs1_2, vc_2);
6561 if (i < 2) {
6562 __ addv(vs1_3, __ T8H, vs1_3, vc_3);
6563 }
6564 // store 80 or 88 values
6565 vs_stpq_post(vs1_1, result);
6566 vs_stpq_post(vs1_2, result);
6567 if (i < 2) {
6568 __ str(vs1_3, __ Q, __ post(result, 16));
6569 }
6570 }
6571
6572 __ leave(); // required for proper stackwalking of RuntimeStub frame
6573 __ mov(r0, zr); // return 0 (Java callees return 1. Caller ignores the return value)
6574 __ ret(lr);
6575
6576 // record the stub entry and end
6577 store_archive_data(stub_id, start, __ pc());
6578
6579 return start;
6580 }
6581
6582 // Kyber add 3 polynomials.
6583 // Implements
6584 // static int implKyberAddPoly(short[] result, short[] a, short[] b, short[] c) {}
6585 //
6586 // result (short[256]) = c_rarg0
6587 // a (short[256]) = c_rarg1
6588 // b (short[256]) = c_rarg2
6589 // c (short[256]) = c_rarg3
6590 address generate_kyberAddPoly_3() {
6591 StubId stub_id = StubId::stubgen_kyberAddPoly_3_id;
6592 int entry_count = StubInfo::entry_count(stub_id);
6593 assert(entry_count == 1, "sanity check");
6594 address start = load_archive_data(stub_id);
6595 if (start != nullptr) {
6596 return start;
6597 }
6598 __ align(CodeEntryAlignment);
6599 StubCodeMark mark(this, stub_id);
6600 start = __ pc();
6601 __ enter();
6602
6603 const Register result = c_rarg0;
6604 const Register a = c_rarg1;
6605 const Register b = c_rarg2;
6606 const Register c = c_rarg3;
6607
6608 const Register kyberConsts = r11;
6609
6610 // As above we sum 256 sets of values in total i.e. 32 x 8H
6611 // quadwords. So, we can load, add and store the data in 3
6612 // groups of 11, 11 and 10 at a time i.e. we need to map sets
6613 // of 10 or 11 registers. A further constraint is that the
6614 // mapping needs to skip callee saves. So, we allocate the
6615 // register sequences using two 8 sequences, two 2 sequences
6616 // and two single registers.
6617 VSeq<8> vs1_1(0);
6618 VSeq<2> vs1_2(16);
6619 FloatRegister vs1_3 = v28;
6620 VSeq<8> vs2_1(18);
6621 VSeq<2> vs2_2(26);
6622 FloatRegister vs2_3 = v29;
6623
6624 // two constant vector sequences
6625 VSeq<8> vc_1(31, 0);
6626 VSeq<2> vc_2(31, 0);
6627
6628 FloatRegister vc_3 = v31;
6629
6630 __ lea(kyberConsts,
6631 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
6632
6633 __ ldr(vc_3, __ Q, Address(kyberConsts, 16)); // q
6634 for (int i = 0; i < 3; i++) {
6635 // load 80 or 88 values from a into vs1_1/2/3
6636 vs_ldpq_post(vs1_1, a);
6637 vs_ldpq_post(vs1_2, a);
6638 if (i < 2) {
6639 __ ldr(vs1_3, __ Q, __ post(a, 16));
6640 }
6641 // load 80 or 88 values from b into vs2_1/2/3
6642 vs_ldpq_post(vs2_1, b);
6643 vs_ldpq_post(vs2_2, b);
6644 if (i < 2) {
6645 __ ldr(vs2_3, __ Q, __ post(b, 16));
6646 }
6647 // sum 80 or 88 values across vs1 and vs2 into vs1
6648 vs_addv(vs1_1, __ T8H, vs1_1, vs2_1);
6649 vs_addv(vs1_2, __ T8H, vs1_2, vs2_2);
6650 if (i < 2) {
6651 __ addv(vs1_3, __ T8H, vs1_3, vs2_3);
6652 }
6653 // load 80 or 88 values from c into vs2_1/2/3
6654 vs_ldpq_post(vs2_1, c);
6655 vs_ldpq_post(vs2_2, c);
6656 if (i < 2) {
6657 __ ldr(vs2_3, __ Q, __ post(c, 16));
6658 }
6659 // sum 80 or 88 values across vs1 and vs2 into vs1
6660 vs_addv(vs1_1, __ T8H, vs1_1, vs2_1);
6661 vs_addv(vs1_2, __ T8H, vs1_2, vs2_2);
6662 if (i < 2) {
6663 __ addv(vs1_3, __ T8H, vs1_3, vs2_3);
6664 }
6665 // add constant to all 80 or 88 results
6666 vs_addv(vs1_1, __ T8H, vs1_1, vc_1);
6667 vs_addv(vs1_2, __ T8H, vs1_2, vc_2);
6668 if (i < 2) {
6669 __ addv(vs1_3, __ T8H, vs1_3, vc_3);
6670 }
6671 // store 80 or 88 values
6672 vs_stpq_post(vs1_1, result);
6673 vs_stpq_post(vs1_2, result);
6674 if (i < 2) {
6675 __ str(vs1_3, __ Q, __ post(result, 16));
6676 }
6677 }
6678
6679 __ leave(); // required for proper stackwalking of RuntimeStub frame
6680 __ mov(r0, zr); // return 0 (Java callees return 1. Caller ignores the return value)
6681 __ ret(lr);
6682
6683 // record the stub entry and end
6684 store_archive_data(stub_id, start, __ pc());
6685
6686 return start;
6687 }
6688
6689 // Kyber parse XOF output to polynomial coefficient candidates
6690 // or decodePoly(12, ...).
6691 // Implements
6692 // static int implKyber12To16(
6693 // byte[] condensed, int index, short[] parsed, int parsedLength) {}
6694 //
6695 // we assume that parsed and condensed are allocated such that for
6696 // n = (parsedLength + 63) / 64
6697 // n blocks of 96 bytes of input can be processed, i.e.
6698 // index + n * 96 <= condensed.length and
6699 // n * 64 <= parsed.length
6700 //
6701 // condensed (byte[]) = c_rarg0
6702 // condensedIndex = c_rarg1
6703 // parsed (short[]) = c_rarg2
6704 // parsedLength = c_rarg3
6705 address generate_kyber12To16() {
6706 StubId stub_id = StubId::stubgen_kyber12To16_id;
6707 int entry_count = StubInfo::entry_count(stub_id);
6708 assert(entry_count == 1, "sanity check");
6709 address start = load_archive_data(stub_id);
6710 if (start != nullptr) {
6711 return start;
6712 }
6713 Label L_F00, L_loop;
6714
6715 __ align(CodeEntryAlignment);
6716 StubCodeMark mark(this, stub_id);
6717 start = __ pc();
6718 __ enter();
6719
6720 const Register condensed = c_rarg0;
6721 const Register condensedOffs = c_rarg1;
6722 const Register parsed = c_rarg2;
6723 const Register parsedLength = c_rarg3;
6724
6725 const Register tmpAddr = r11;
6726
6727 // Data is input 96 bytes at a time i.e. in groups of 6 x 16B
6728 // quadwords so we need a 6 vector sequence for the inputs.
6729 // Parsing produces 64 shorts, employing two 8 vector
6730 // sequences to store and combine the intermediate data.
6731 VSeq<6> vin(24);
6732 VSeq<8> va(0), vb(16);
6733
6734 __ adr(tmpAddr, L_F00);
6735 __ ldr(v31, __ Q, tmpAddr); // 8H times 0x0f00
6736 __ add(condensed, condensed, condensedOffs);
6737
6738 __ BIND(L_loop);
6739 // load 96 (6 x 16B) byte values
6740 vs_ld3_post(vin, __ T16B, condensed);
6741
6742 // The front half of sequence vin (vin[0], vin[1] and vin[2])
6743 // holds 48 (16x3) contiguous bytes from memory striped
6744 // horizontally across each of the 16 byte lanes. Equivalently,
6745 // that is 16 pairs of 12-bit integers. Likewise the back half
6746 // holds the next 48 bytes in the same arrangement.
6747
6748 // Each vector in the front half can also be viewed as a vertical
6749 // strip across the 16 pairs of 12 bit integers. Each byte in
6750 // vin[0] stores the low 8 bits of the first int in a pair. Each
6751 // byte in vin[1] stores the high 4 bits of the first int and the
6752 // low 4 bits of the second int. Each byte in vin[2] stores the
6753 // high 8 bits of the second int. Likewise the vectors in second
6754 // half.
6755
6756 // Converting the data to 16-bit shorts requires first of all
6757 // expanding each of the 6 x 16B vectors into 6 corresponding
6758 // pairs of 8H vectors. Mask, shift and add operations on the
6759 // resulting vector pairs can be used to combine 4 and 8 bit
6760 // parts of related 8H vector elements.
6761 //
6762 // The middle vectors (vin[2] and vin[5]) are actually expanded
6763 // twice, one copy manipulated to provide the lower 4 bits
6764 // belonging to the first short in a pair and another copy
6765 // manipulated to provide the higher 4 bits belonging to the
6766 // second short in a pair. This is why the vector sequences va
6767 // and vb are used to hold the expanded 8H elements are of length 8.
6768
6769 // Expand vin[0] into va[0:1], and vin[1] into va[2:3] and va[4:5]
6770 // n.b. target elements 2 and 3 duplicate elements 4 and 5
6771 __ ushll(va[0], __ T8H, vin[0], __ T8B, 0);
6772 __ ushll2(va[1], __ T8H, vin[0], __ T16B, 0);
6773 __ ushll(va[2], __ T8H, vin[1], __ T8B, 0);
6774 __ ushll2(va[3], __ T8H, vin[1], __ T16B, 0);
6775 __ ushll(va[4], __ T8H, vin[1], __ T8B, 0);
6776 __ ushll2(va[5], __ T8H, vin[1], __ T16B, 0);
6777
6778 // likewise expand vin[3] into vb[0:1], and vin[4] into vb[2:3]
6779 // and vb[4:5]
6780 __ ushll(vb[0], __ T8H, vin[3], __ T8B, 0);
6781 __ ushll2(vb[1], __ T8H, vin[3], __ T16B, 0);
6782 __ ushll(vb[2], __ T8H, vin[4], __ T8B, 0);
6783 __ ushll2(vb[3], __ T8H, vin[4], __ T16B, 0);
6784 __ ushll(vb[4], __ T8H, vin[4], __ T8B, 0);
6785 __ ushll2(vb[5], __ T8H, vin[4], __ T16B, 0);
6786
6787 // shift lo byte of copy 1 of the middle stripe into the high byte
6788 __ shl(va[2], __ T8H, va[2], 8);
6789 __ shl(va[3], __ T8H, va[3], 8);
6790 __ shl(vb[2], __ T8H, vb[2], 8);
6791 __ shl(vb[3], __ T8H, vb[3], 8);
6792
6793 // expand vin[2] into va[6:7] and vin[5] into vb[6:7] but this
6794 // time pre-shifted by 4 to ensure top bits of input 12-bit int
6795 // are in bit positions [4..11].
6796 __ ushll(va[6], __ T8H, vin[2], __ T8B, 4);
6797 __ ushll2(va[7], __ T8H, vin[2], __ T16B, 4);
6798 __ ushll(vb[6], __ T8H, vin[5], __ T8B, 4);
6799 __ ushll2(vb[7], __ T8H, vin[5], __ T16B, 4);
6800
6801 // mask hi 4 bits of the 1st 12-bit int in a pair from copy1 and
6802 // shift lo 4 bits of the 2nd 12-bit int in a pair to the bottom of
6803 // copy2
6804 __ andr(va[2], __ T16B, va[2], v31);
6805 __ andr(va[3], __ T16B, va[3], v31);
6806 __ ushr(va[4], __ T8H, va[4], 4);
6807 __ ushr(va[5], __ T8H, va[5], 4);
6808 __ andr(vb[2], __ T16B, vb[2], v31);
6809 __ andr(vb[3], __ T16B, vb[3], v31);
6810 __ ushr(vb[4], __ T8H, vb[4], 4);
6811 __ ushr(vb[5], __ T8H, vb[5], 4);
6812
6813 // sum hi 4 bits and lo 8 bits of the 1st 12-bit int in each pair and
6814 // hi 8 bits plus lo 4 bits of the 2nd 12-bit int in each pair
6815 // n.b. the ordering ensures: i) inputs are consumed before they
6816 // are overwritten ii) the order of 16-bit results across successive
6817 // pairs of vectors in va and then vb reflects the order of the
6818 // corresponding 12-bit inputs
6819 __ addv(va[0], __ T8H, va[0], va[2]);
6820 __ addv(va[2], __ T8H, va[1], va[3]);
6821 __ addv(va[1], __ T8H, va[4], va[6]);
6822 __ addv(va[3], __ T8H, va[5], va[7]);
6823 __ addv(vb[0], __ T8H, vb[0], vb[2]);
6824 __ addv(vb[2], __ T8H, vb[1], vb[3]);
6825 __ addv(vb[1], __ T8H, vb[4], vb[6]);
6826 __ addv(vb[3], __ T8H, vb[5], vb[7]);
6827
6828 // store 64 results interleaved as shorts
6829 vs_st2_post(vs_front(va), __ T8H, parsed);
6830 vs_st2_post(vs_front(vb), __ T8H, parsed);
6831
6832 __ sub(parsedLength, parsedLength, 64);
6833 __ cmp(parsedLength, (u1)0);
6834 __ br(Assembler::GT, L_loop);
6835
6836 __ leave(); // required for proper stackwalking of RuntimeStub frame
6837 __ mov(r0, zr); // return 0 (Java callees return 1. Caller ignores the return value)
6838 __ ret(lr);
6839
6840 // bind label and generate constant data used by this stub
6841 __ BIND(L_F00);
6842 __ emit_int64(0x0f000f000f000f00);
6843 __ emit_int64(0x0f000f000f000f00);
6844
6845 // record the stub entry and end
6846 store_archive_data(stub_id, start, __ pc());
6847
6848 return start;
6849 }
6850
6851 // Kyber Barrett reduce function.
6852 // Implements
6853 // static int implKyberBarrettReduce(short[] coeffs) {}
6854 //
6855 // coeffs (short[256]) = c_rarg0
6856 address generate_kyberBarrettReduce() {
6857 StubId stub_id = StubId::stubgen_kyberBarrettReduce_id;
6858 int entry_count = StubInfo::entry_count(stub_id);
6859 assert(entry_count == 1, "sanity check");
6860 address start = load_archive_data(stub_id);
6861 if (start != nullptr) {
6862 return start;
6863 }
6864 __ align(CodeEntryAlignment);
6865 StubCodeMark mark(this, stub_id);
6866 start = __ pc();
6867 __ enter();
6868
6869 const Register coeffs = c_rarg0;
6870
6871 const Register kyberConsts = r10;
6872 const Register result = r11;
6873
6874 // As above we process 256 sets of values in total i.e. 32 x
6875 // 8H quadwords. So, we can load, add and store the data in 3
6876 // groups of 11, 11 and 10 at a time i.e. we need to map sets
6877 // of 10 or 11 registers. A further constraint is that the
6878 // mapping needs to skip callee saves. So, we allocate the
6879 // register sequences using two 8 sequences, two 2 sequences
6880 // and two single registers.
6881 VSeq<8> vs1_1(0);
6882 VSeq<2> vs1_2(16);
6883 FloatRegister vs1_3 = v28;
6884 VSeq<8> vs2_1(18);
6885 VSeq<2> vs2_2(26);
6886 FloatRegister vs2_3 = v29;
6887
6888 // we also need a pair of corresponding constant sequences
6889
6890 VSeq<8> vc1_1(30, 0);
6891 VSeq<2> vc1_2(30, 0);
6892 FloatRegister vc1_3 = v30; // for kyber_q
6893
6894 VSeq<8> vc2_1(31, 0);
6895 VSeq<2> vc2_2(31, 0);
6896 FloatRegister vc2_3 = v31; // for kyberBarrettMultiplier
6897
6898 __ add(result, coeffs, 0);
6899 __ lea(kyberConsts,
6900 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
6901
6902 // load q and the multiplier for the Barrett reduction
6903 __ add(kyberConsts, kyberConsts, 16);
6904 __ ldpq(vc1_3, vc2_3, kyberConsts);
6905
6906 for (int i = 0; i < 3; i++) {
6907 // load 80 or 88 coefficients
6908 vs_ldpq_post(vs1_1, coeffs);
6909 vs_ldpq_post(vs1_2, coeffs);
6910 if (i < 2) {
6911 __ ldr(vs1_3, __ Q, __ post(coeffs, 16));
6912 }
6913
6914 // vs2 <- (2 * vs1 * kyberBarrettMultiplier) >> 16
6915 vs_sqdmulh(vs2_1, __ T8H, vs1_1, vc2_1);
6916 vs_sqdmulh(vs2_2, __ T8H, vs1_2, vc2_2);
6917 if (i < 2) {
6918 __ sqdmulh(vs2_3, __ T8H, vs1_3, vc2_3);
6919 }
6920
6921 // vs2 <- (vs1 * kyberBarrettMultiplier) >> 26
6922 vs_sshr(vs2_1, __ T8H, vs2_1, 11);
6923 vs_sshr(vs2_2, __ T8H, vs2_2, 11);
6924 if (i < 2) {
6925 __ sshr(vs2_3, __ T8H, vs2_3, 11);
6926 }
6927
6928 // vs1 <- vs1 - vs2 * kyber_q
6929 vs_mlsv(vs1_1, __ T8H, vs2_1, vc1_1);
6930 vs_mlsv(vs1_2, __ T8H, vs2_2, vc1_2);
6931 if (i < 2) {
6932 __ mlsv(vs1_3, __ T8H, vs2_3, vc1_3);
6933 }
6934
6935 vs_stpq_post(vs1_1, result);
6936 vs_stpq_post(vs1_2, result);
6937 if (i < 2) {
6938 __ str(vs1_3, __ Q, __ post(result, 16));
6939 }
6940 }
6941
6942 __ leave(); // required for proper stackwalking of RuntimeStub frame
6943 __ mov(r0, zr); // return 0 (Java callees return 1. Caller ignores the return value)
6944 __ ret(lr);
6945
6946 // record the stub entry and end
6947 store_archive_data(stub_id, start, __ pc());
6948
6949 return start;
6950 }
6951
6952
6953 // Dilithium-specific montmul helper routines that generate parallel
6954 // code for, respectively, a single 4x4s vector sequence montmul or
6955 // two such multiplies in a row.
6956
6957 // Perform 16 32-bit Montgomery multiplications in parallel
6958 void dilithium_montmul16(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
6959 const VSeq<4>& vtmp, const VSeq<2>& vq) {
6960 // Use the helper routine to schedule a 4x4S Montgomery multiply.
6961 // It will assert that the register use is valid
6962 vs_montmul4(va, vb, vc, __ T4S, vtmp, vq);
6963 }
6964
6965 // Perform 2x16 32-bit Montgomery multiplications in parallel
6966 void dilithium_montmul32(const VSeq<8>& va, const VSeq<8>& vb, const VSeq<8>& vc,
6967 const VSeq<4>& vtmp, const VSeq<2>& vq) {
6968 // Schedule two successive 4x4S multiplies via the montmul helper
6969 // on the front and back halves of va, vb and vc. The helper will
6970 // assert that the register use has no overlap conflicts on each
6971 // individual call but we also need to ensure that the necessary
6972 // disjoint/equality constraints are met across both calls.
6973
6974 // vb, vc, vtmp and vq must be disjoint. va must either be
6975 // disjoint from all other registers or equal vc
6976
6977 assert(vs_disjoint(vb, vc), "vb and vc overlap");
6978 assert(vs_disjoint(vb, vq), "vb and vq overlap");
6979 assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
6980
6981 assert(vs_disjoint(vc, vq), "vc and vq overlap");
6982 assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
6983
6984 assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
6985
6986 assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
6987 assert(vs_disjoint(va, vb), "va and vb overlap");
6988 assert(vs_disjoint(va, vq), "va and vq overlap");
6989 assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
6990
6991 // We multiply the front and back halves of each sequence 4 at a
6992 // time because
6993 //
6994 // 1) we are currently only able to get 4-way instruction
6995 // parallelism at best
6996 //
6997 // 2) we need registers for the constants in vq and temporary
6998 // scratch registers to hold intermediate results so vtmp can only
6999 // be a VSeq<4> which means we only have 4 scratch slots.
7000
7001 vs_montmul4(vs_front(va), vs_front(vb), vs_front(vc), __ T4S, vtmp, vq);
7002 vs_montmul4(vs_back(va), vs_back(vb), vs_back(vc), __ T4S, vtmp, vq);
7003 }
7004
7005 // Perform combined montmul then add/sub on 4x4S vectors.
7006 void dilithium_montmul16_sub_add(
7007 const VSeq<4>& va0, const VSeq<4>& va1, const VSeq<4>& vc,
7008 const VSeq<4>& vtmp, const VSeq<2>& vq) {
7009 // compute a = montmul(a1, c)
7010 dilithium_montmul16(vc, va1, vc, vtmp, vq);
7011 // ouptut a1 = a0 - a
7012 vs_subv(va1, __ T4S, va0, vc);
7013 // and a0 = a0 + a
7014 vs_addv(va0, __ T4S, va0, vc);
7015 }
7016
7017 // Perform combined add/sub then montmul on 4x4S vectors.
7018 void dilithium_sub_add_montmul16(
7019 const VSeq<4>& va0, const VSeq<4>& va1, const VSeq<4>& vb,
7020 const VSeq<4>& vtmp1, const VSeq<4>& vtmp2, const VSeq<2>& vq) {
7021 // compute c = a0 - a1
7022 vs_subv(vtmp1, __ T4S, va0, va1);
7023 // output a0 = a0 + a1
7024 vs_addv(va0, __ T4S, va0, va1);
7025 // output a1 = b montmul c
7026 dilithium_montmul16(va1, vtmp1, vb, vtmp2, vq);
7027 }
7028
7029 // At these levels, the indices that correspond to the 'j's (and 'j+l's)
7030 // in the Java implementation come in sequences of at least 8, so we
7031 // can use ldpq to collect the corresponding data into pairs of vector
7032 // registers.
7033 // We collect the coefficients corresponding to the 'j+l' indexes into
7034 // the vector registers v0-v7, the zetas into the vector registers v16-v23
7035 // then we do the (Montgomery) multiplications by the zetas in parallel
7036 // into v16-v23, load the coeffs corresponding to the 'j' indexes into
7037 // v0-v7, then do the additions into v24-v31 and the subtractions into
7038 // v0-v7 and finally save the results back to the coeffs array.
7039 void dilithiumNttLevel0_4(const Register dilithiumConsts,
7040 const Register coeffs, const Register zetas) {
7041 int c1 = 0;
7042 int c2 = 512;
7043 int startIncr;
7044 // don't use callee save registers v8 - v15
7045 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs
7046 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
7047 VSeq<2> vq(30); // n.b. constants overlap vs3
7048 int offsets[4] = { 0, 32, 64, 96 };
7049
7050 for (int level = 0; level < 5; level++) {
7051 int c1Start = c1;
7052 int c2Start = c2;
7053 if (level == 3) {
7054 offsets[1] = 32;
7055 offsets[2] = 128;
7056 offsets[3] = 160;
7057 } else if (level == 4) {
7058 offsets[1] = 64;
7059 offsets[2] = 128;
7060 offsets[3] = 192;
7061 }
7062
7063 // For levels 1 - 4 we simply load 2 x 4 adjacent values at a
7064 // time at 4 different offsets and multiply them in order by the
7065 // next set of input values. So we employ indexed load and store
7066 // pair instructions with arrangement 4S.
7067 for (int i = 0; i < 4; i++) {
7068 // reload q and qinv
7069 vs_ldpq(vq, dilithiumConsts); // qInv, q
7070 // load 8x4S coefficients via second start pos == c2
7071 vs_ldpq_indexed(vs1, coeffs, c2Start, offsets);
7072 // load next 8x4S inputs == b
7073 vs_ldpq_post(vs2, zetas);
7074 // compute a == c2 * b mod MONT_Q
7075 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
7076 // load 8x4s coefficients via first start pos == c1
7077 vs_ldpq_indexed(vs1, coeffs, c1Start, offsets);
7078 // compute a1 = c1 + a
7079 vs_addv(vs3, __ T4S, vs1, vs2);
7080 // compute a2 = c1 - a
7081 vs_subv(vs1, __ T4S, vs1, vs2);
7082 // output a1 and a2
7083 vs_stpq_indexed(vs3, coeffs, c1Start, offsets);
7084 vs_stpq_indexed(vs1, coeffs, c2Start, offsets);
7085
7086 int k = 4 * level + i;
7087
7088 if (k > 7) {
7089 startIncr = 256;
7090 } else if (k == 5) {
7091 startIncr = 384;
7092 } else {
7093 startIncr = 128;
7094 }
7095
7096 c1Start += startIncr;
7097 c2Start += startIncr;
7098 }
7099
7100 c2 /= 2;
7101 }
7102 }
7103
7104 // Dilithium NTT function except for the final "normalization" to |coeff| < Q.
7105 // Implements the method
7106 // static int implDilithiumAlmostNtt(int[] coeffs, int zetas[]) {}
7107 // of the Java class sun.security.provider
7108 //
7109 // coeffs (int[256]) = c_rarg0
7110 // zetas (int[256]) = c_rarg1
7111 address generate_dilithiumAlmostNtt() {
7112 StubId stub_id = StubId::stubgen_dilithiumAlmostNtt_id;
7113 int entry_count = StubInfo::entry_count(stub_id);
7114 assert(entry_count == 1, "sanity check");
7115 address start = load_archive_data(stub_id);
7116 if (start != nullptr) {
7117 return start;
7118 }
7119 __ align(CodeEntryAlignment);
7120 StubCodeMark mark(this, stub_id);
7121 start = __ pc();
7122 __ enter();
7123
7124 const Register coeffs = c_rarg0;
7125 const Register zetas = c_rarg1;
7126
7127 const Register tmpAddr = r9;
7128 const Register dilithiumConsts = r10;
7129 const Register result = r11;
7130 // don't use callee save registers v8 - v15
7131 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs
7132 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
7133 VSeq<2> vq(30); // n.b. constants overlap vs3
7134 int offsets[4] = { 0, 32, 64, 96};
7135 int offsets1[8] = { 16, 48, 80, 112, 144, 176, 208, 240 };
7136 int offsets2[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
7137 __ add(result, coeffs, 0);
7138 __ lea(dilithiumConsts,
7139 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
7140
7141 // Each level represents one iteration of the outer for loop of the Java version.
7142
7143 // level 0-4
7144 dilithiumNttLevel0_4(dilithiumConsts, coeffs, zetas);
7145
7146 // level 5
7147
7148 // At level 5 the coefficients we need to combine with the zetas
7149 // are grouped in memory in blocks of size 4. So, for both sets of
7150 // coefficients we load 4 adjacent values at 8 different offsets
7151 // using an indexed ldr with register variant Q and multiply them
7152 // in sequence order by the next set of inputs. Likewise we store
7153 // the results using an indexed str with register variant Q.
7154 for (int i = 0; i < 1024; i += 256) {
7155 // reload constants q, qinv each iteration as they get clobbered later
7156 vs_ldpq(vq, dilithiumConsts); // qInv, q
7157 // load 32 (8x4S) coefficients via first offsets = c1
7158 vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets1);
7159 // load next 32 (8x4S) inputs = b
7160 vs_ldpq_post(vs2, zetas);
7161 // a = b montul c1
7162 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
7163 // load 32 (8x4S) coefficients via second offsets = c2
7164 vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets2);
7165 // add/sub with result of multiply
7166 vs_addv(vs3, __ T4S, vs1, vs2); // a1 = a - c2
7167 vs_subv(vs1, __ T4S, vs1, vs2); // a0 = a + c1
7168 // write back new coefficients using same offsets
7169 vs_str_indexed(vs3, __ Q, coeffs, i, offsets2);
7170 vs_str_indexed(vs1, __ Q, coeffs, i, offsets1);
7171 }
7172
7173 // level 6
7174 // At level 6 the coefficients we need to combine with the zetas
7175 // are grouped in memory in pairs, the first two being montmul
7176 // inputs and the second add/sub inputs. We can still implement
7177 // the montmul+sub+add using 4-way parallelism but only if we
7178 // combine the coefficients with the zetas 16 at a time. We load 8
7179 // adjacent values at 4 different offsets using an ld2 load with
7180 // arrangement 2D. That interleaves the lower and upper halves of
7181 // each pair of quadwords into successive vector registers. We
7182 // then need to montmul the 4 even elements of the coefficients
7183 // register sequence by the zetas in order and then add/sub the 4
7184 // odd elements of the coefficients register sequence. We use an
7185 // equivalent st2 operation to store the results back into memory
7186 // de-interleaved.
7187 for (int i = 0; i < 1024; i += 128) {
7188 // reload constants q, qinv each iteration as they get clobbered later
7189 vs_ldpq(vq, dilithiumConsts); // qInv, q
7190 // load interleaved 16 (4x2D) coefficients via offsets
7191 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
7192 // load next 16 (4x4S) inputs
7193 vs_ldpq_post(vs_front(vs2), zetas);
7194 // mont multiply odd elements of vs1 by vs2 and add/sub into odds/evens
7195 dilithium_montmul16_sub_add(vs_even(vs1), vs_odd(vs1),
7196 vs_front(vs2), vtmp, vq);
7197 // store interleaved 16 (4x2D) coefficients via offsets
7198 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
7199 }
7200
7201 // level 7
7202 // At level 7 the coefficients we need to combine with the zetas
7203 // occur singly with montmul inputs alternating with add/sub
7204 // inputs. Once again we can use 4-way parallelism to combine 16
7205 // zetas at a time. However, we have to load 8 adjacent values at
7206 // 4 different offsets using an ld2 load with arrangement 4S. That
7207 // interleaves the odd words of each pair into one
7208 // coefficients vector register and the even words of the pair
7209 // into the next register. We then need to montmul the 4 even
7210 // elements of the coefficients register sequence by the zetas in
7211 // order and then add/sub the 4 odd elements of the coefficients
7212 // register sequence. We use an equivalent st2 operation to store
7213 // the results back into memory de-interleaved.
7214
7215 for (int i = 0; i < 1024; i += 128) {
7216 // reload constants q, qinv each iteration as they get clobbered later
7217 vs_ldpq(vq, dilithiumConsts); // qInv, q
7218 // load interleaved 16 (4x4S) coefficients via offsets
7219 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
7220 // load next 16 (4x4S) inputs
7221 vs_ldpq_post(vs_front(vs2), zetas);
7222 // mont multiply odd elements of vs1 by vs2 and add/sub into odds/evens
7223 dilithium_montmul16_sub_add(vs_even(vs1), vs_odd(vs1),
7224 vs_front(vs2), vtmp, vq);
7225 // store interleaved 16 (4x4S) coefficients via offsets
7226 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
7227 }
7228 __ leave(); // required for proper stackwalking of RuntimeStub frame
7229 __ mov(r0, zr); // return 0 (Java callees return 1. Caller ignores the return value)
7230 __ ret(lr);
7231
7232 // record the stub entry and end
7233 store_archive_data(stub_id, start, __ pc());
7234
7235 return start;
7236 }
7237
7238 // At these levels, the indices that correspond to the 'j's (and 'j+l's)
7239 // in the Java implementation come in sequences of at least 8, so we
7240 // can use ldpq to collect the corresponding data into pairs of vector
7241 // registers
7242 // We collect the coefficients that correspond to the 'j's into vs1
7243 // the coefficiets that correspond to the 'j+l's into vs2 then
7244 // do the additions into vs3 and the subtractions into vs1 then
7245 // save the result of the additions, load the zetas into vs2
7246 // do the (Montgomery) multiplications by zeta in parallel into vs2
7247 // finally save the results back to the coeffs array
7248 void dilithiumInverseNttLevel3_7(const Register dilithiumConsts,
7249 const Register coeffs, const Register zetas) {
7250 int c1 = 0;
7251 int c2 = 32;
7252 int startIncr;
7253 int offsets[4];
7254 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs
7255 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
7256 VSeq<2> vq(30); // n.b. constants overlap vs3
7257
7258 offsets[0] = 0;
7259
7260 for (int level = 3; level < 8; level++) {
7261 int c1Start = c1;
7262 int c2Start = c2;
7263 if (level == 3) {
7264 offsets[1] = 64;
7265 offsets[2] = 128;
7266 offsets[3] = 192;
7267 } else if (level == 4) {
7268 offsets[1] = 32;
7269 offsets[2] = 128;
7270 offsets[3] = 160;
7271 } else {
7272 offsets[1] = 32;
7273 offsets[2] = 64;
7274 offsets[3] = 96;
7275 }
7276
7277 // For levels 3 - 7 we simply load 2 x 4 adjacent values at a
7278 // time at 4 different offsets and multiply them in order by the
7279 // next set of input values. So we employ indexed load and store
7280 // pair instructions with arrangement 4S.
7281 for (int i = 0; i < 4; i++) {
7282 // load v1 32 (8x4S) coefficients relative to first start index
7283 vs_ldpq_indexed(vs1, coeffs, c1Start, offsets);
7284 // load v2 32 (8x4S) coefficients relative to second start index
7285 vs_ldpq_indexed(vs2, coeffs, c2Start, offsets);
7286 // a0 = v1 + v2 -- n.b. clobbers vqs
7287 vs_addv(vs3, __ T4S, vs1, vs2);
7288 // a1 = v1 - v2
7289 vs_subv(vs1, __ T4S, vs1, vs2);
7290 // save a1 relative to first start index
7291 vs_stpq_indexed(vs3, coeffs, c1Start, offsets);
7292 // load constants q, qinv each iteration as they get clobbered above
7293 vs_ldpq(vq, dilithiumConsts); // qInv, q
7294 // load b next 32 (8x4S) inputs
7295 vs_ldpq_post(vs2, zetas);
7296 // a = a1 montmul b
7297 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
7298 // save a relative to second start index
7299 vs_stpq_indexed(vs2, coeffs, c2Start, offsets);
7300
7301 int k = 4 * level + i;
7302
7303 if (k < 24) {
7304 startIncr = 256;
7305 } else if (k == 25) {
7306 startIncr = 384;
7307 } else {
7308 startIncr = 128;
7309 }
7310
7311 c1Start += startIncr;
7312 c2Start += startIncr;
7313 }
7314
7315 c2 *= 2;
7316 }
7317 }
7318
7319 // Dilithium Inverse NTT function except the final mod Q division by 2^256.
7320 // Implements the method
7321 // static int implDilithiumAlmostInverseNtt(int[] coeffs, int[] zetas) {} of
7322 // the sun.security.provider.ML_DSA class.
7323 //
7324 // coeffs (int[256]) = c_rarg0
7325 // zetas (int[256]) = c_rarg1
7326 address generate_dilithiumAlmostInverseNtt() {
7327 StubId stub_id = StubId::stubgen_dilithiumAlmostInverseNtt_id;
7328 int entry_count = StubInfo::entry_count(stub_id);
7329 assert(entry_count == 1, "sanity check");
7330 address start = load_archive_data(stub_id);
7331 if (start != nullptr) {
7332 return start;
7333 }
7334 __ align(CodeEntryAlignment);
7335 StubCodeMark mark(this, stub_id);
7336 start = __ pc();
7337 __ enter();
7338
7339 const Register coeffs = c_rarg0;
7340 const Register zetas = c_rarg1;
7341
7342 const Register tmpAddr = r9;
7343 const Register dilithiumConsts = r10;
7344 const Register result = r11;
7345 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs
7346 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
7347 VSeq<2> vq(30); // n.b. constants overlap vs3
7348 int offsets[4] = { 0, 32, 64, 96 };
7349 int offsets1[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
7350 int offsets2[8] = { 16, 48, 80, 112, 144, 176, 208, 240 };
7351
7352 __ add(result, coeffs, 0);
7353 __ lea(dilithiumConsts,
7354 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
7355
7356 // Each level represents one iteration of the outer for loop of the Java version
7357
7358 // level 0
7359 // At level 0 we need to interleave adjacent quartets of
7360 // coefficients before we multiply and add/sub by the next 16
7361 // zetas just as we did for level 7 in the multiply code. So we
7362 // load and store the values using an ld2/st2 with arrangement 4S.
7363 for (int i = 0; i < 1024; i += 128) {
7364 // load constants q, qinv
7365 // n.b. this can be moved out of the loop as they do not get
7366 // clobbered by first two loops
7367 vs_ldpq(vq, dilithiumConsts); // qInv, q
7368 // a0/a1 load interleaved 32 (8x4S) coefficients
7369 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
7370 // b load next 32 (8x4S) inputs
7371 vs_ldpq_post(vs_front(vs2), zetas);
7372 // compute in parallel (a0, a1) = (a0 + a1, (a0 - a1) montmul b)
7373 // n.b. second half of vs2 provides temporary register storage
7374 dilithium_sub_add_montmul16(vs_even(vs1), vs_odd(vs1),
7375 vs_front(vs2), vs_back(vs2), vtmp, vq);
7376 // a0/a1 store interleaved 32 (8x4S) coefficients
7377 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
7378 }
7379
7380 // level 1
7381 // At level 1 we need to interleave pairs of adjacent pairs of
7382 // coefficients before we multiply by the next 16 zetas just as we
7383 // did for level 6 in the multiply code. So we load and store the
7384 // values an ld2/st2 with arrangement 2D.
7385 for (int i = 0; i < 1024; i += 128) {
7386 // a0/a1 load interleaved 32 (8x2D) coefficients
7387 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
7388 // b load next 16 (4x4S) inputs
7389 vs_ldpq_post(vs_front(vs2), zetas);
7390 // compute in parallel (a0, a1) = (a0 + a1, (a0 - a1) montmul b)
7391 // n.b. second half of vs2 provides temporary register storage
7392 dilithium_sub_add_montmul16(vs_even(vs1), vs_odd(vs1),
7393 vs_front(vs2), vs_back(vs2), vtmp, vq);
7394 // a0/a1 store interleaved 32 (8x2D) coefficients
7395 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
7396 }
7397
7398 // level 2
7399 // At level 2 coefficients come in blocks of 4. So, we load 4
7400 // adjacent coefficients at 8 distinct offsets for both the first
7401 // and second coefficient sequences, using an ldr with register
7402 // variant Q then combine them with next set of 32 zetas. Likewise
7403 // we store the results using an str with register variant Q.
7404 for (int i = 0; i < 1024; i += 256) {
7405 // c0 load 32 (8x4S) coefficients via first offsets
7406 vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets1);
7407 // c1 load 32 (8x4S) coefficients via second offsets
7408 vs_ldr_indexed(vs2, __ Q, coeffs, i, offsets2);
7409 // a0 = c0 + c1 n.b. clobbers vq which overlaps vs3
7410 vs_addv(vs3, __ T4S, vs1, vs2);
7411 // c = c0 - c1
7412 vs_subv(vs1, __ T4S, vs1, vs2);
7413 // store a0 32 (8x4S) coefficients via first offsets
7414 vs_str_indexed(vs3, __ Q, coeffs, i, offsets1);
7415 // b load 32 (8x4S) next inputs
7416 vs_ldpq_post(vs2, zetas);
7417 // reload constants q, qinv -- they were clobbered earlier
7418 vs_ldpq(vq, dilithiumConsts); // qInv, q
7419 // compute a1 = b montmul c
7420 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
7421 // store a1 32 (8x4S) coefficients via second offsets
7422 vs_str_indexed(vs2, __ Q, coeffs, i, offsets2);
7423 }
7424
7425 // level 3-7
7426 dilithiumInverseNttLevel3_7(dilithiumConsts, coeffs, zetas);
7427
7428 __ leave(); // required for proper stackwalking of RuntimeStub frame
7429 __ mov(r0, zr); // return 0 (Java callees return 1. Caller ignores the return value)
7430 __ ret(lr);
7431
7432 // record the stub entry and end
7433 store_archive_data(stub_id, start, __ pc());
7434
7435 return start;
7436 }
7437
7438 // Dilithium multiply polynomials in the NTT domain.
7439 // Straightforward implementation of the method
7440 // static int implDilithiumNttMult(
7441 // int[] product, int[] coeffs1, int[] coeffs2) {}
7442 // of the sun.security.provider.ML_DSA class.
7443 //
7444 // result (int[256]) = c_rarg0
7445 // poly1 (int[256]) = c_rarg1
7446 // poly2 (int[256]) = c_rarg2
7447 address generate_dilithiumNttMult() {
7448 StubId stub_id = StubId::stubgen_dilithiumNttMult_id;
7449 int entry_count = StubInfo::entry_count(stub_id);
7450 assert(entry_count == 1, "sanity check");
7451 address start = load_archive_data(stub_id);
7452 if (start != nullptr) {
7453 return start;
7454 }
7455 __ align(CodeEntryAlignment);
7456 StubCodeMark mark(this, stub_id);
7457 start = __ pc();
7458 __ enter();
7459
7460 Label L_loop;
7461
7462 const Register result = c_rarg0;
7463 const Register poly1 = c_rarg1;
7464 const Register poly2 = c_rarg2;
7465
7466 const Register dilithiumConsts = r10;
7467 const Register len = r11;
7468
7469 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs
7470 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
7471 VSeq<2> vq(30); // n.b. constants overlap vs3
7472 VSeq<8> vrsquare(29, 0); // for montmul by constant RSQUARE
7473
7474 __ lea(dilithiumConsts,
7475 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
7476
7477 // load constants q, qinv
7478 vs_ldpq(vq, dilithiumConsts); // qInv, q
7479 // load constant rSquare into v29
7480 __ ldr(v29, __ Q, Address(dilithiumConsts, 48)); // rSquare
7481
7482 __ mov(len, zr);
7483 __ add(len, len, 1024);
7484
7485 __ BIND(L_loop);
7486
7487 // b load 32 (8x4S) next inputs from poly1
7488 vs_ldpq_post(vs1, poly1);
7489 // c load 32 (8x4S) next inputs from poly2
7490 vs_ldpq_post(vs2, poly2);
7491 // compute a = b montmul c
7492 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
7493 // compute a = rsquare montmul a
7494 dilithium_montmul32(vs2, vrsquare, vs2, vtmp, vq);
7495 // save a 32 (8x4S) results
7496 vs_stpq_post(vs2, result);
7497
7498 __ sub(len, len, 128);
7499 __ cmp(len, (u1)128);
7500 __ br(Assembler::GE, L_loop);
7501
7502 __ leave(); // required for proper stackwalking of RuntimeStub frame
7503 __ mov(r0, zr); // return 0 (Java callees return 1. Caller ignores the return value)
7504 __ ret(lr);
7505
7506 // record the stub entry and end
7507 store_archive_data(stub_id, start, __ pc());
7508
7509 return start;
7510 }
7511
7512 // Dilithium Montgomery multiply an array by a constant.
7513 // A straightforward implementation of the method
7514 // static int implDilithiumMontMulByConstant(int[] coeffs, int constant) {}
7515 // of the sun.security.provider.ML_DSA class
7516 //
7517 // coeffs (int[256]) = c_rarg0
7518 // constant (int) = c_rarg1
7519 address generate_dilithiumMontMulByConstant() {
7520 StubId stub_id = StubId::stubgen_dilithiumMontMulByConstant_id;
7521 int entry_count = StubInfo::entry_count(stub_id);
7522 assert(entry_count == 1, "sanity check");
7523 address start = load_archive_data(stub_id);
7524 if (start != nullptr) {
7525 return start;
7526 }
7527 __ align(CodeEntryAlignment);
7528 StubCodeMark mark(this, stub_id);
7529 start = __ pc();
7530 __ enter();
7531
7532 Label L_loop;
7533
7534 const Register coeffs = c_rarg0;
7535 const Register constant = c_rarg1;
7536
7537 const Register dilithiumConsts = r10;
7538 const Register result = r11;
7539 const Register len = r12;
7540
7541 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs
7542 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
7543 VSeq<2> vq(30); // n.b. constants overlap vs3
7544 VSeq<8> vconst(29, 0); // for montmul by constant
7545
7546 // results track inputs
7547 __ add(result, coeffs, 0);
7548 __ lea(dilithiumConsts,
7549 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
7550
7551 // load constants q, qinv -- they do not get clobbered by first two loops
7552 vs_ldpq(vq, dilithiumConsts); // qInv, q
7553 // copy caller supplied constant across vconst
7554 __ dup(vconst[0], __ T4S, constant);
7555 __ mov(len, zr);
7556 __ add(len, len, 1024);
7557
7558 __ BIND(L_loop);
7559
7560 // load next 32 inputs
7561 vs_ldpq_post(vs2, coeffs);
7562 // mont mul by constant
7563 dilithium_montmul32(vs2, vconst, vs2, vtmp, vq);
7564 // write next 32 results
7565 vs_stpq_post(vs2, result);
7566
7567 __ sub(len, len, 128);
7568 __ cmp(len, (u1)128);
7569 __ br(Assembler::GE, L_loop);
7570
7571 __ leave(); // required for proper stackwalking of RuntimeStub frame
7572 __ mov(r0, zr); // return 0 (Java callees return 1. Caller ignores the return value)
7573 __ ret(lr);
7574
7575 // record the stub entry and end
7576 store_archive_data(stub_id, start, __ pc());
7577
7578 return start;
7579 }
7580
7581 // Dilithium decompose poly.
7582 // Implements the method
7583 // static int implDilithiumDecomposePoly(int[] input, int[] lowPart, int[] highPart,
7584 // int twoGamma2, int multiplier) {
7585 // of the sun.security.provider.ML_DSA class
7586 //
7587 // input (int[256]) = c_rarg0
7588 // lowPart (int[256]) = c_rarg1
7589 // highPart (int[256]) = c_rarg2
7590 // twoGamma2 (int) = c_rarg3
7591 // multiplier (int) = c_rarg4
7592 address generate_dilithiumDecomposePoly() {
7593 StubId stub_id = StubId::stubgen_dilithiumDecomposePoly_id;
7594 int entry_count = StubInfo::entry_count(stub_id);
7595 assert(entry_count == 1, "sanity check");
7596 address start = load_archive_data(stub_id);
7597 if (start != nullptr) {
7598 return start;
7599 }
7600 __ align(CodeEntryAlignment);
7601 StubCodeMark mark(this, stub_id);
7602 start = __ pc();
7603 Label L_loop;
7604
7605 const Register input = c_rarg0;
7606 const Register lowPart = c_rarg1;
7607 const Register highPart = c_rarg2;
7608 const Register twoGamma2 = c_rarg3;
7609 const Register multiplier = c_rarg4;
7610
7611 const Register len = r9;
7612 const Register dilithiumConsts = r10;
7613 const Register tmp = r11;
7614
7615 // 6 independent sets of 4x4s values
7616 VSeq<4> vs1(0), vs2(4), vs3(8);
7617 VSeq<4> vs4(12), vs5(16), vtmp(20);
7618
7619 // 7 constants for cross-multiplying
7620 VSeq<4> one(25, 0);
7621 VSeq<4> qminus1(26, 0);
7622 VSeq<4> g2(27, 0);
7623 VSeq<4> twog2(28, 0);
7624 VSeq<4> mult(29, 0);
7625 VSeq<4> q(30, 0);
7626 VSeq<4> qadd(31, 0);
7627
7628 __ enter();
7629
7630 __ lea(dilithiumConsts,
7631 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
7632
7633 // save callee-saved registers
7634 __ stpd(v8, v9, __ pre(sp, -64));
7635 __ stpd(v10, v11, Address(sp, 16));
7636 __ stpd(v12, v13, Address(sp, 32));
7637 __ stpd(v14, v15, Address(sp, 48));
7638
7639 // populate constant registers
7640 __ mov(tmp, zr);
7641 __ add(tmp, tmp, 1);
7642 __ dup(one[0], __ T4S, tmp); // 1
7643 __ ldr(q[0], __ Q, Address(dilithiumConsts, 16)); // q
7644 __ ldr(qadd[0], __ Q, Address(dilithiumConsts, 64)); // addend for mod q reduce
7645 __ dup(twog2[0], __ T4S, twoGamma2); // 2 * gamma2
7646 __ dup(mult[0], __ T4S, multiplier); // multiplier for mod 2 * gamma reduce
7647 __ subv(qminus1[0], __ T4S, v30, v25); // q - 1
7648 __ sshr(g2[0], __ T4S, v28, 1); // gamma2
7649
7650 __ mov(len, zr);
7651 __ add(len, len, 1024);
7652
7653 __ BIND(L_loop);
7654
7655 // load next 4x4S inputs interleaved: rplus --> vs1
7656 __ ld4(vs1[0], vs1[1], vs1[2], vs1[3], __ T4S, __ post(input, 64));
7657
7658 // rplus = rplus - ((rplus + qadd) >> 23) * q
7659 vs_addv(vtmp, __ T4S, vs1, qadd);
7660 vs_sshr(vtmp, __ T4S, vtmp, 23);
7661 vs_mulv(vtmp, __ T4S, vtmp, q);
7662 vs_subv(vs1, __ T4S, vs1, vtmp);
7663
7664 // rplus = rplus + ((rplus >> 31) & dilithium_q);
7665 vs_sshr(vtmp, __ T4S, vs1, 31);
7666 vs_andr(vtmp, vtmp, q);
7667 vs_addv(vs1, __ T4S, vs1, vtmp);
7668
7669 // quotient --> vs2
7670 // int quotient = (rplus * multiplier) >> 22;
7671 vs_mulv(vtmp, __ T4S, vs1, mult);
7672 vs_sshr(vs2, __ T4S, vtmp, 22);
7673
7674 // r0 --> vs3
7675 // int r0 = rplus - quotient * twoGamma2;
7676 vs_mulv(vtmp, __ T4S, vs2, twog2);
7677 vs_subv(vs3, __ T4S, vs1, vtmp);
7678
7679 // mask --> vs4
7680 // int mask = (twoGamma2 - r0) >> 22;
7681 vs_subv(vtmp, __ T4S, twog2, vs3);
7682 vs_sshr(vs4, __ T4S, vtmp, 22);
7683
7684 // r0 -= (mask & twoGamma2);
7685 vs_andr(vtmp, vs4, twog2);
7686 vs_subv(vs3, __ T4S, vs3, vtmp);
7687
7688 // quotient += (mask & 1);
7689 vs_andr(vtmp, vs4, one);
7690 vs_addv(vs2, __ T4S, vs2, vtmp);
7691
7692 // mask = (twoGamma2 / 2 - r0) >> 31;
7693 vs_subv(vtmp, __ T4S, g2, vs3);
7694 vs_sshr(vs4, __ T4S, vtmp, 31);
7695
7696 // r0 -= (mask & twoGamma2);
7697 vs_andr(vtmp, vs4, twog2);
7698 vs_subv(vs3, __ T4S, vs3, vtmp);
7699
7700 // quotient += (mask & 1);
7701 vs_andr(vtmp, vs4, one);
7702 vs_addv(vs2, __ T4S, vs2, vtmp);
7703
7704 // r1 --> vs5
7705 // int r1 = rplus - r0 - (dilithium_q - 1);
7706 vs_subv(vtmp, __ T4S, vs1, vs3);
7707 vs_subv(vs5, __ T4S, vtmp, qminus1);
7708
7709 // r1 --> vs1 (overwriting rplus)
7710 // r1 = (r1 | (-r1)) >> 31; // 0 if rplus - r0 == (dilithium_q - 1), -1 otherwise
7711 vs_negr(vtmp, __ T4S, vs5);
7712 vs_orr(vtmp, vs5, vtmp);
7713 vs_sshr(vs1, __ T4S, vtmp, 31);
7714
7715 // r0 += ~r1;
7716 vs_notr(vtmp, vs1);
7717 vs_addv(vs3, __ T4S, vs3, vtmp);
7718
7719 // r1 = r1 & quotient;
7720 vs_andr(vs1, vs2, vs1);
7721
7722 // store results interleaved
7723 // lowPart[m] = r0;
7724 // highPart[m] = r1;
7725 __ st4(vs3[0], vs3[1], vs3[2], vs3[3], __ T4S, __ post(lowPart, 64));
7726 __ st4(vs1[0], vs1[1], vs1[2], vs1[3], __ T4S, __ post(highPart, 64));
7727
7728 __ sub(len, len, 64);
7729 __ cmp(len, (u1)64);
7730 __ br(Assembler::GE, L_loop);
7731
7732 // restore callee-saved vector registers
7733 __ ldpd(v14, v15, Address(sp, 48));
7734 __ ldpd(v12, v13, Address(sp, 32));
7735 __ ldpd(v10, v11, Address(sp, 16));
7736 __ ldpd(v8, v9, __ post(sp, 64));
7737
7738 __ leave(); // required for proper stackwalking of RuntimeStub frame
7739 __ mov(r0, zr); // return 0 (Java callees return 1. Caller ignores the return value)
7740 __ ret(lr);
7741
7742 // record the stub entry and end
7743 store_archive_data(stub_id, start, __ pc());
7744
7745 return start;
7746 }
7747
// Constants shared by generate_intpoly_montgomeryMult_P256() and its
// helper gpr_partial_mult_52(). Limbs are 52 bits wide and are held in
// 64-bit words.
static constexpr int montMulP256Shift1 = 12; // 64 - bits per limb
static constexpr int montMulP256Shift2 = 52; // bits per limb
// stack space needed for carry computation
// n.b. the stub stores five 64-bit carry words at offsets 0 .. 4 from
// the base of this area.
// NOTE(review): the sixth word is never accessed by the stub; presumably
// it only keeps sp 16-byte aligned -- confirm.
static constexpr int cDataSize = 6 * BytesPerLong;
// stack space needed for data computed by the neon side
// n.b. the stub writes the neon results to the base of this area and
// reads them back as sixteen 64-bit words at offsets 0 .. 15.
static constexpr int mulDataSize = 16 * BytesPerLong;
7754
7755
7756 // Subroutine used by the 52 x 52 bit multiplication algorithm in
7757 // generate_intpoly_montgomeryMult_P256().
7758 // This function computes partial results of eight 52 x 52 bit multiplications,
7759 // where the multiplicands are stored as 64-bit values, specifically
7760 // (b_0, b_1, b_2, b_3) * (a_3, a_4). (The 4 calls to this function
7761 // together provide the results of these limb-multiplications.)
7762 // Calls to this function accept either the low 32 bits or high 20 bits
7763 // of each b_i packed into bs in ascending order. a_3 and a_4 are packed
7764 // into successive 64 bit elements of as. lane selects the low 32 or high
7765 // 20 bits of each a_j value. So four calls with the appropriate parameters
7766 // will produce the 64-bit low32 * low32, low32 * high20, high20 * low32,
7767 // high20 * high20 values in the output register sequences vs. The
7768 // 64-bit partial products are returned in vs in ascending order:
7769 // vs[0] = (b_0*a_3, b_1*a_3) . . . vs[3] = (b_2*a_4, b_3*a_4)
7770
7771 void neon_partial_mult_64(const VSeq<4>& vs, FloatRegister bs, FloatRegister as, int lane_lo) {
7772 __ umullv(vs[0], __ T2D, bs, __ T2S, as, __ S, lane_lo);
7773 __ umull2v(vs[1], __ T2D, bs, __ T4S, as, __ S, lane_lo);
7774 __ umullv(vs[2], __ T2D, bs, __ T2S, as, __ S, lane_lo + 2);
7775 __ umull2v(vs[3], __ T2D, bs, __ T4S, as, __ S, lane_lo + 2);
7776 }
7777
// Subroutine used by the generate_intpoly_montgomeryMult_P256() function
// to compute the result of a 52 x 52 bit multiplications where the
// multiplicands, a and b are available as 64-bit values.
// The result is going to two 64-bit registers lo (least significant 52 bits)
// and hi (most significant 52 bits).
//
//   a, b   multiplicands
//   hi     receives bits 52 and up of the product
//   lo     receives bits 0 .. 51 of the product
//   mask   register holding the limb mask that is and-ed into lo; the
//          caller passes a register preloaded with the 52-bit limb mask
//
// n.b. hi is written by the first instruction and a, b are read again by
// the second, so hi must not alias a or b. hi must also differ from lo
// and mask, and lo must differ from mask.
void gpr_partial_mult_52(Register a, Register b, Register hi, Register lo,
                         Register mask) {
  // compute 104-bit (40 + 64) full product
  __ umulh(hi, a, b);
  __ mul(lo, a, b);
  // combine 40 + 12 bits into hi result
  // on certain implementations of aarch64 (e.g. apple M1) replacing extr()
  // with the following equivalent instruction sequence the performance
  // improves slightly (despite it is two instructions longer and needs
  // an additional register)
  // __ lsl(hi, hi, montMulP256Shift1);
  // __ lsr(tmp, lo, montMulP256Shift2);
  // __ orr(hi, hi, tmp);
  __ extr(hi, hi, lo, montMulP256Shift2);
  // mask off 52 bits of lo result
  __ andr(lo, lo, mask);
}
7800
7801 // This assembly follows the Java code in MontgomeryIntegerPolynomial256.mult()
7802 // quite closely. The main difference is that the computations done with the
7803 // last two limbs of `a` are done using Neon registers. This allows us to take
7804 // advantage of both the Neon registers and GPRs simultaneously.
7805 // It is also worth noting that since Neon does not support 64 bit
7806 // multiplication, we split each 64 bit value into lower and upper halves
7807 // and use the "schoolbook" multiplication algorithm.
7808 address generate_intpoly_montgomeryMult_P256() {
7809 assert(UseIntPolyIntrinsics, "what are we doing here?");
7810 StubId stub_id = StubId::stubgen_intpoly_montgomeryMult_P256_id;
7811 int entry_count = StubInfo::entry_count(stub_id);
7812 assert(entry_count == 1, "sanity check");
7813 address start = load_archive_data(stub_id);
7814 if (start != nullptr) {
7815 return start;
7816 }
7817 __ align(CodeEntryAlignment);
7818 StubCodeMark mark(this, stub_id);
7819 start = __ pc();
7820 __ enter();
7821
7822 // Registers that are used throughout entire routine
7823 const Register a = c_rarg0;
7824 const Register b = c_rarg1;
7825 const Register result = c_rarg2;
7826
7827 RegSet regs = RegSet::range(r0, r28) - rscratch1 - rscratch2
7828 - r16 - r17 - r18_tls - a - b - result;
7829
7830 auto common_regs = regs.begin();
7831 Register limb_mask = *common_regs++,
7832 c_ptr = *common_regs++,
7833 mod_0 = *common_regs++,
7834 mod_1 = *common_regs++,
7835 mod_3 = *common_regs++,
7836 mod_4 = *common_regs++,
7837 b_0 = *common_regs++,
7838 b_1 = *common_regs++,
7839 b_2 = *common_regs++,
7840 b_3 = *common_regs++,
7841 b_4 = *common_regs++;
7842
7843 FloatRegSet floatRegs = FloatRegSet::range(v0, v31)
7844 - FloatRegSet::range(v8, v15) // Caller saved vectors
7845 - FloatRegSet::range(v16, v31); // Manually-allocated vectors
7846
7847 auto common_vectors = floatRegs.begin();
7848 FloatRegister limb_mask_vec = *common_vectors++,
7849 b_lows = *common_vectors++,
7850 b_highs = *common_vectors++,
7851 a_vals = *common_vectors++;
7852
7853 // Push callee saved registers on to the stack
7854 RegSet callee_saved = RegSet::range(r19, r28);
7855 __ push(callee_saved, sp);
7856
7857 // Allocate space on the stack for carry values
7858 __ sub(sp, sp, cDataSize);
7859 __ mov(c_ptr, sp);
7860
7861 // Calculate (52-bit) limb masks for both gpr and vector registers
7862 __ mov(limb_mask, -UCONST64(1) >> montMulP256Shift1);
7863 __ dup(limb_mask_vec, __ T2D, limb_mask);
7864
7865 //Load input arrays and modulus
7866 Register a_ptr = *common_regs++, mod_ptr = *common_regs++;
7867 // skip 3 limbs so a_ptr addresses trailing pair {a3, a4}
7868 __ add(a_ptr, a, 3 * BytesPerLong);
7869 __ lea(mod_ptr, ExternalAddress((address)_modulus_P256));
7870 __ ldr(b_0, Address(b));
7871 __ ldr(b_1, Address(b, BytesPerLong));
7872 __ ldr(b_2, Address(b, 2 * BytesPerLong));
7873 __ ldr(b_3, Address(b, 3 * BytesPerLong));
7874 __ ldr(b_4, Address(b, 4 * BytesPerLong));
7875 __ ldr(mod_0, __ post(mod_ptr, BytesPerLong));
7876 __ ldr(mod_1, __ post(mod_ptr, BytesPerLong));
7877 __ ldr(mod_3, __ post(mod_ptr, BytesPerLong));
7878 __ ldr(mod_4, mod_ptr);
7879 __ ld1(a_vals, __ T2D, a_ptr);
7880 // use an interleaved load to group low 32 bits and high 20 bits
7881 // of 4 successive b values into two vector registers
7882 // n.b. these are the same inputs as the ones in b_0 ... b4
7883 __ ld2(b_lows, b_highs, __ T4S, b);
7884 common_regs = common_regs.remaining()
7885 + a_ptr + mod_ptr;
7886 a_ptr = mod_ptr = noreg;
7887
7888 //Regs used throughout the main "loop", which is partially unrolled here
7889 Register high = *common_regs++,
7890 low = *common_regs++,
7891 mul_ptr = *common_regs++,
7892 mod_high = *common_regs++,
7893 mod_low = *common_regs++,
7894 a_i = *common_regs++,
7895 c_i = *common_regs++,
7896 tmp = *common_regs++,
7897 n = *common_regs++;
7898
7899 // vector sequences used to compute and combine partial products of
7900 // b_i * a_j for i = {0,1,2,3} j = {3,4}
7901 VSeq<4> A(16);
7902 VSeq<4> B(20);
7903 VSeq<4> C(24);
7904 VSeq<4> D(28);
7905
7906
7907 // neon and gpr computations are interleaved to maximize parallelism
7908
7909 // allocate stack space for the neon results
7910 __ sub(sp, sp, mulDataSize);
7911 __ mov(mul_ptr, sp);
7912
7913 // cross-multiply low * low for limbs b0-b3 and a3-a4 in parallel
7914 neon_partial_mult_64(A, b_lows, a_vals, 0);
7915
7916 // Limb 0
7917 __ ldr(a_i, __ post(a, BytesPerLong));
7918 gpr_partial_mult_52(a_i, b_0, high, low, limb_mask);
7919 __ mov(n, low);
7920 // __ andr(n, low, limb_mask);
7921
7922 // cross-multiply high * low for limbs b0-b3 and a3-a4 in parallel
7923 neon_partial_mult_64(B, b_highs, a_vals, 0);
7924
7925 // Limb 0 modulus computation
7926 // n.b. modulus computation requires multiplying successive
7927 // limbs of the product by corresponding limbs of the p256
7928 // prime adding the result to the limb and folding this
7929 // partial result into a running 256-bit sum in c_i. Limbs
7930 // of c_i are stored via c_ptr once carries are included.
7931 // n.b. the mul + add is omitted for limb 2 since the
7932 // corresponding prime bits are zero.
7933 gpr_partial_mult_52(n, mod_0, mod_high, mod_low, limb_mask);
7934 __ add(low, low, mod_low);
7935 __ add(high, high, mod_high);
7936 __ lsr(c_i, low, montMulP256Shift2);
7937 __ add(c_i, c_i, high);
7938
7939 // cross-multiply low * high for limbs b0-b3 and a3-a4 in parallel
7940 neon_partial_mult_64(C, b_lows, a_vals, 1);
7941
7942 // Limb 1
7943 gpr_partial_mult_52(a_i, b_1, high, low, limb_mask);
7944
7945 // cross-multiply high * high for limbs b0-b3 and a3-a4 in parallel
7946 neon_partial_mult_64(D, b_highs, a_vals, 1);
7947
7948 gpr_partial_mult_52(n, mod_1, mod_high, mod_low, limb_mask);
7949 __ add(low, low, mod_low);
7950 __ add(high, high, mod_high);
7951 __ add(c_i, c_i, low);
7952 __ str(c_i, c_ptr);
7953 __ mov(c_i, high);
7954
7955 // combine neon 32-bit partial products, regrouping to produce
7956 // 8*52-bit low products in A and 8*52-bit high products in D
7957
7958 // add low*high/high*low intermediate products before regrouping
7959 vs_addv(B, __ T2D, B, C); // Store (B+C) in B
7960
7961 // Limb 2
7962 gpr_partial_mult_52(a_i, b_2, high, low, limb_mask);
7963 __ add(c_i, c_i, low);
7964 __ str(c_i, Address(c_ptr, 8));
7965 __ mov(c_i, high);
7966
7967 // shift high*high (40-bit) product up into 52-bits of output
7968 vs_shl(D, __ T2D, D, montMulP256Shift1);
7969
7970 // Limb 3
7971 gpr_partial_mult_52(a_i, b_3, high, low, limb_mask);
7972
7973 // shift high 32 (or 33) bits of intermediate products for addition to D
7974 vs_ushr(C, __ T2D, B, 32 - montMulP256Shift1); // Use C for ((B+C) >>> 20)
7975
7976 gpr_partial_mult_52(n, mod_3, mod_high, mod_low, limb_mask);
7977 __ add(low, low, mod_low);
7978 __ add(high, high, mod_high);
7979 __ add(c_i, c_i, low);
7980 __ str(c_i, Address(c_ptr, 2 * BytesPerLong));
7981 __ mov(c_i, high);
7982
7983 // shift low 32 bits of intermediate product up for masking and addition to A
7984 vs_shl(B, __ T2D, B, 32);
7985
7986 // Limb 4
7987 gpr_partial_mult_52(a_i, b_4, high, low, limb_mask);
7988
7989 // add high bits of intermediate product into D
7990 vs_addv(D, __ T2D, D, C);
7991
7992 gpr_partial_mult_52(n, mod_4, mod_high, mod_low, limb_mask);
7993 __ add(low, low, mod_low);
7994 __ add(high, high, mod_high);
7995 __ add(c_i, c_i, low);
7996 __ str(c_i, Address(c_ptr, 3 * BytesPerLong));
7997 __ str(high, Address(c_ptr, 4 * BytesPerLong));
7998
7999 // top 12 bits of 32*32 bit product in A need adding into high 52-bit output
8000 vs_ushr(C, __ T2D, A, 52); // C now holds (A >>> 52)
8001 // Only 20 of the 32 bits now in the top of B should be added into A
8002 vs_andr(B, B, limb_mask_vec);
8003 // reduce original 64-bit product to 52-bits
8004 vs_andr(A, A, limb_mask_vec);
8005 // add intermediate products to high 52-bit result in D
8006 vs_addv(D, __ T2D, D, C);
8007 // add 20/21 bits of intermediate product in top of B into low 52-bit result
8008 vs_addv(A, __ T2D, A, B);
8009 // save and then mask off any overflow bit from computing low 52-bit result
8010 vs_ushr(B, __ T2D, A, montMulP256Shift2);
8011 vs_andr(A, A, limb_mask_vec);
8012 // add any remaining carry into the high 52-bit result
8013 vs_addv(D, __ T2D, D, B);
8014
8015 // the write interleaves the 4 successive pairs of low and
8016 // high results: (l0, l1), (h0, h1), ... (l6, l7), (h6, h7)
8017 vs_st1_interleaved(A, D, mul_ptr);
8018
8019 // Free mul_ptr
8020 common_regs = common_regs.remaining() + mul_ptr;
8021 mul_ptr = noreg;
8022
8023 /////////////////////////
8024 // Loop 2 & 3
8025 /////////////////////////
8026
8027 for (int i = 0; i < 2; i++) {
8028 // Load a_i and increment by 8 bytes
8029 __ ldr(a_i, __ post(a, BytesPerLong));
8030 __ ldr(c_i, c_ptr); //Load prior c_i
8031
8032 // Limb 0
8033 gpr_partial_mult_52(a_i, b_0, high, low, limb_mask);
8034 __ add(low, low, c_i);
8035 __ ldr(c_i, Address(c_ptr, BytesPerLong));
8036 __ andr(n, low, limb_mask);
8037 gpr_partial_mult_52(n, mod_0, mod_high, mod_low, limb_mask);
8038 __ add(low, low, mod_low);
8039 __ add(high, high, mod_high);
8040 __ lsr(tmp, low, montMulP256Shift2);
8041 __ add(c_i, c_i, tmp);
8042 __ add(c_i, c_i, high);
8043
8044 // Limb 1
8045 gpr_partial_mult_52(a_i, b_1, high, low, limb_mask);
8046 gpr_partial_mult_52(n, mod_1, mod_high, mod_low, limb_mask);
8047 __ ldr(tmp, Address(c_ptr, 2 * BytesPerLong));
8048 __ add(low, low, mod_low);
8049 __ add(high, high, mod_high);
8050 __ add(c_i, c_i, low);
8051 __ str(c_i, c_ptr);
8052 __ add(c_i, tmp, high);
8053
8054 // Limb 2
8055 gpr_partial_mult_52(a_i, b_2, high, low, limb_mask);
8056 __ ldr(tmp, Address(c_ptr, 3 * BytesPerLong));
8057 __ add(c_i, c_i, low);
8058 __ str(c_i, Address(c_ptr, BytesPerLong));
8059 __ add(c_i, tmp, high);
8060
8061 // Limb 3
8062 gpr_partial_mult_52(a_i, b_3, high, low, limb_mask);
8063 gpr_partial_mult_52(n, mod_3, mod_high, mod_low, limb_mask);
8064 __ ldr(tmp, Address(c_ptr, 4 * BytesPerLong));
8065 __ add(low, low, mod_low);
8066 __ add(high, high, mod_high);
8067 __ add(c_i, c_i, low);
8068 __ str(c_i, Address(c_ptr, 2 * BytesPerLong));
8069 __ add(c_i, tmp, high);
8070
8071 // Limb 4
8072 gpr_partial_mult_52(a_i, b_4, high, low, limb_mask);
8073 gpr_partial_mult_52(n, mod_4, mod_high, mod_low, limb_mask);
8074 __ add(low, low, mod_low);
8075 __ add(high, high, mod_high);
8076 __ add(c_i, c_i, low);
8077 __ str(c_i, Address(c_ptr, 3 * BytesPerLong));
8078 __ str(high, Address(c_ptr, 4 * BytesPerLong));
8079 }
8080 // Reallocate regs b_0, b_1, b_2 and b_3
8081 common_regs = common_regs.remaining()
8082 + b_0 + b_1 + b_2 + b_3;
8083 b_0 = b_1 = b_2 = b_3 = noreg;
8084
8085 Register low_1 = *common_regs++;
8086 Register high_1 = *common_regs++;
8087
8088 //////////////////////////////
8089 // a[3]
8090 //////////////////////////////
8091
8092 // For a_3 and a_4 we have already computed the cross-products
8093 // with b_0 ... b_3 and stored them on the stack relative to
8094 // `mul_ptr` i.e. the current `sp`in the order
8095 // l(a_3 * b_0), l(a_3 * b_1), h(a_3 * b_0), h(a_3 * b_1),
8096 // l(a_3 * b_2), l(a_3 * b_3), h(a_3 * b_2), h(a_3 * b_3),
8097 // l(a_4 * b_0), l(a_4 * b_1), h(a_4 * b_0), h(a_4 * b_1),
8098 // l(a_4 * b_2), l(a_4 * b_3), h(a_4 * b_2), h(a_4 * b_3),
8099 // where l(x) is the low 52 bits of x and h(x) is the high 52 bits
8100
8101 __ ldr(low_1, Address(sp));
8102 __ ldr(high_1, Address(sp, 2 * BytesPerLong));
8103
8104 __ ldr(low, Address(sp, BytesPerLong));
8105 __ ldr(high, Address(sp, 3 * BytesPerLong));
8106 __ ldr(a_i, __ post(a, BytesPerLong));
8107 __ ldr(c_i, c_ptr);
8108
8109 // Limb 0
8110 __ add(low_1, low_1, c_i);
8111 __ ldr(c_i, Address(c_ptr, BytesPerLong));
8112 __ andr(n, low_1, limb_mask);
8113 gpr_partial_mult_52(n, mod_0, mod_high, mod_low, limb_mask);
8114 __ add(low_1, low_1, mod_low);
8115 __ add(high_1, high_1, mod_high);
8116 __ lsr(tmp, low_1, montMulP256Shift2);
8117 __ add(c_i, c_i, tmp);
8118 __ add(c_i, c_i, high_1);
8119
8120 // Limb 1
8121 __ ldr(low_1, Address(sp, 4 * BytesPerLong));
8122 __ ldr(high_1, Address(sp, 6 * BytesPerLong));
8123 gpr_partial_mult_52(n, mod_1, mod_high, mod_low, limb_mask);
8124 __ ldr(tmp, Address(c_ptr, 2 * BytesPerLong));
8125 __ andr(mod_low, mod_low, limb_mask);
8126 __ add(low, low, mod_low);
8127 __ add(high, high, mod_high);
8128 __ add(c_i, c_i, low);
8129 __ str(c_i, c_ptr);
8130 __ add(c_i, tmp, high);
8131
8132 // Limb 2
8133 __ ldr(low, Address(sp, 5 * BytesPerLong));
8134 __ ldr(high, Address(sp, 7 * BytesPerLong));
8135 __ ldr(tmp, Address(c_ptr, 3 * BytesPerLong));
8136 __ add(c_i, c_i, low_1);
8137 __ str(c_i, Address(c_ptr, BytesPerLong));
8138 __ add(c_i, tmp, high_1);
8139
8140 // Limb 3
8141 gpr_partial_mult_52(n, mod_3, mod_high, mod_low, limb_mask);
8142 __ ldr(tmp, Address(c_ptr, 4 * BytesPerLong));
8143 __ add(low, low, mod_low);
8144 __ add(high, high, mod_high);
8145 __ add(c_i, c_i, low);
8146 __ str(c_i, Address(c_ptr, 2 * BytesPerLong));
8147 __ add(c_i, tmp, high);
8148
8149 // Limb 4
8150 __ ldr(low, Address(sp, 8 * BytesPerLong));
8151 __ ldr(high, Address(sp, 10 * BytesPerLong));
8152 gpr_partial_mult_52(a_i, b_4, high_1, low_1, limb_mask);
8153 gpr_partial_mult_52(n, mod_4, mod_high, mod_low, limb_mask);
8154 __ add(low_1, low_1, mod_low);
8155 __ add(high_1, high_1, mod_high);
8156 __ add(c_i, c_i, low_1);
8157 __ str(c_i, Address(c_ptr, 3 * BytesPerLong));
8158 __ str(high_1, Address(c_ptr, 4 * BytesPerLong));
8159
8160 //////////////////////////////
8161 // a[4]
8162 //////////////////////////////
8163
8164 Register c5 = *common_regs++,
8165 c6 = *common_regs++,
8166 c7 = *common_regs++;
8167
8168 __ ldr(a_i, a);
8169 __ ldr(c_i, c_ptr);
8170
8171 // Limb 0
8172 __ ldr(low_1, Address(sp, 9 * BytesPerLong));
8173 __ ldr(high_1, Address(sp, 11 * BytesPerLong));
8174
8175 __ add(low, low, c_i);
8176 __ ldr(c_i, Address(c_ptr, BytesPerLong));
8177 __ andr(n, low, limb_mask);
8178 gpr_partial_mult_52(n, mod_0, mod_high, mod_low, limb_mask);
8179 __ add(low, low, mod_low);
8180 __ add(high, high, mod_high);
8181 __ lsr(tmp, low, montMulP256Shift2);
8182 __ add(c_i, c_i, tmp);
8183 __ add(c_i, c_i, high);
8184
8185 __ ldr(low, Address(sp, 12 * BytesPerLong));
8186 __ ldr(high, Address(sp, 14 * BytesPerLong));
8187 gpr_partial_mult_52(n, mod_1, mod_high, mod_low, limb_mask);
8188 __ add(low_1, low_1, mod_low);
8189 __ add(high_1, high_1, mod_high);
8190 __ add(c5, c_i, low_1);
8191 __ ldr(c_i, Address(c_ptr, 2 * BytesPerLong));
8192 __ lsr(tmp, c5, montMulP256Shift2);
8193 __ add(c_i, c_i, tmp);
8194 __ add(c_i, c_i, high_1);
8195
8196 // Limb 2
8197 __ ldr(low_1, Address(sp, 13 * BytesPerLong));
8198 __ ldr(high_1, Address(sp, 15 * BytesPerLong));
8199 __ add(c6, c_i, low);
8200 __ ldr(c_i, Address(c_ptr, 3 * BytesPerLong));
8201 __ lsr(tmp, c6, montMulP256Shift2);
8202 __ add(c_i, c_i, tmp);
8203 __ add(c_i, c_i, high);
8204
8205 // Limb 3
8206 gpr_partial_mult_52(n, mod_3, mod_high, mod_low, limb_mask);
8207 __ add(low_1, low_1, mod_low);
8208 __ add(high_1, high_1, mod_high);
8209 __ add(c7, c_i, low_1);
8210 __ ldr(c_i, Address(c_ptr, 4 * BytesPerLong));
8211 __ lsr(tmp, c7, montMulP256Shift2);
8212 __ add(c_i, c_i, tmp);
8213 __ add(c_i, c_i, high_1);
8214
8215 // Limb 4
8216 gpr_partial_mult_52(a_i, b_4, high, low, limb_mask);
8217 gpr_partial_mult_52(n, mod_4, mod_high, mod_low, limb_mask);
8218 __ add(low, low, mod_low);
8219 __ add(high, high, mod_high);
8220
8221 // Reallocate b_4
8222 common_regs = common_regs.remaining() + b_4;
8223 b_4 = noreg;
8224
8225 Register c8 = *common_regs++,
8226 c9 = *common_regs++;
8227
8228 __ add(c8, c_i, low);
8229 __ lsr(c9, c8, montMulP256Shift2);
8230 __ add(c9, c9, high);
8231
8232 __ andr(c5, c5, limb_mask);
8233 __ andr(c6, c6, limb_mask);
8234 __ andr(c7, c7, limb_mask);
8235 __ andr(c8, c8, limb_mask);
8236
8237 /////////////////////////////
8238 // Final carry propagate
8239 /////////////////////////////
8240
8241 // c0 = c5 - modulus[0];
8242 // c1 = c6 - modulus[1] + (c0 >> BITS_PER_LIMB);
8243 // c0 &= LIMB_MASK;
8244 // c2 = c7 + (c1 >> BITS_PER_LIMB);
8245 // c1 &= LIMB_MASK;
8246 // c3 = c8 - modulus[3] + (c2 >> BITS_PER_LIMB);
8247 // c2 &= LIMB_MASK;
8248 // c4 = c9 - modulus4] + (c3 >> BITS_PER_LIMB);
8249 // c3 &= LIMB_MASK;
8250
8251 // Free up all unused regs
8252 common_regs = common_regs.remaining()
8253 + c_ptr + low + high + mod_high
8254 + mod_low + a_i + c_i + n + low_1 + high_1;
8255 c_ptr = low = high = mod_high
8256 = mod_low = a_i = c_i = n = low_1 = high_1 = noreg;
8257
8258 Register c0 = *common_regs++,
8259 c1 = *common_regs++,
8260 c2 = *common_regs++,
8261 c3 = *common_regs++,
8262 c4 = *common_regs++;
8263
8264 __ sub(c0, c5, mod_0);
8265 __ sub(c1, c6, mod_1);
8266 __ sub(c3, c8, mod_3);
8267 __ sub(c4, c9, mod_4);
8268 __ add(c1, c1, c0, Assembler::ASR, montMulP256Shift2);
8269 __ andr(c0, c0, limb_mask);
8270 __ add(c2, c7, c1, Assembler::ASR, montMulP256Shift2);
8271 __ andr(c1, c1, limb_mask);
8272 __ add(c3, c3, c2, Assembler::ASR, montMulP256Shift2);
8273 __ andr(c2, c2, limb_mask);
8274 __ add(c4, c4, c3, Assembler::ASR, montMulP256Shift2);
8275 __ andr(c3, c3, limb_mask);
8276
8277 // Final write back
8278 // mask = c4 >> 63
8279 // r[0] = ((c5 & mask) | (c0 & ~mask));
8280 // r[1] = ((c6 & mask) | (c1 & ~mask));
8281 // r[2] = ((c7 & mask) | (c2 & ~mask));
8282 // r[3] = ((c8 & mask) | (c3 & ~mask));
8283 // r[4] = ((c9 & mask) | (c4 & ~mask));
8284
8285 common_regs = common_regs.remaining()
8286 + mod_0 + mod_1 + mod_3 + mod_4;
8287 mod_0 = mod_1 = mod_3 = mod_4 = noreg;
8288
8289 Register mask = *common_regs++;
8290 Register nmask = *common_regs++;
8291
8292 __ asr(mask, c4, 63);
8293 __ mvn(nmask, mask);
8294 __ andr(c5, c5, mask);
8295 __ andr(tmp, c0, nmask);
8296 __ orr(c5, c5, tmp);
8297 __ andr(c6, c6, mask);
8298 __ andr(tmp, c1, nmask);
8299 __ orr(c6, c6, tmp);
8300 __ andr(c7, c7, mask);
8301 __ andr(tmp, c2, nmask);
8302 __ orr(c7, c7, tmp);
8303 __ andr(c8, c8, mask);
8304 __ andr(tmp, c3, nmask);
8305 __ orr(c8, c8, tmp);
8306 __ andr(c9, c9, mask);
8307 __ andr(tmp, c4, nmask);
8308 __ orr(c9, c9, tmp);
8309
8310 __ str(c5, result);
8311 __ str(c6, Address(result, BytesPerLong));
8312 __ str(c7, Address(result, 2 * BytesPerLong));
8313 __ str(c8, Address(result, 3 * BytesPerLong));
8314 __ str(c9, Address(result, 4 * BytesPerLong));
8315
8316 // End intrinsic call
8317 __ add(sp, sp, cDataSize + mulDataSize);
8318 __ pop(callee_saved, sp);
8319 __ leave();
8320 __ mov(r0, zr); // return 0
8321 __ ret(lr);
8322
8323 // record the stub entry and end
8324 store_archive_data(stub_id, start, __ pc());
8325
8326 return start;
8327 }
8328
8329 address generate_intpoly_assign() {
8330 // KNOWN Lengths:
8331 // MontgomeryIntPolynP256: 5 = 4 + 1
8332 // IntegerPolynomial1305: 5 = 4 + 1
8333 // IntegerPolynomial25519: 10 = 8 + 2
8334 // IntegerPolynomialP256: 10 = 8 + 2
8335 // Curve25519OrderField: 10 = 8 + 2
8336 // Curve25519OrderField: 10 = 8 + 2
8337 // P256OrderField: 10 = 8 + 2
8338 // IntegerPolynomialP384: 14 = 8 + 4 + 2
8339 // P384OrderField: 14 = 8 + 4 + 2
8340 // IntegerPolynomial448: 16 = 8 + 8
8341 // Curve448OrderField: 16 = 8 + 8
8342 // Curve448OrderField: 16 = 8 + 8
8343 // IntegerPolynomialP521: 19 = 8 + 8 + 2 + 1
8344 // P521OrderField: 19 = 8 + 8 + 2 + 1
8345 // Special Cases 5, 10, 14, 16, 19
8346 assert(UseIntPolyIntrinsics, "what are we doing here?");
8347 StubId stub_id = StubId::stubgen_intpoly_assign_id;
8348 int entry_count = StubInfo::entry_count(stub_id);
8349 assert(entry_count == 1, "sanity check");
8350 address start = load_archive_data(stub_id);
8351 if (start != nullptr) {
8352 return start;
8353 }
8354
8355 __ align(CodeEntryAlignment);
8356 StubCodeMark mark(this, stub_id);
8357 start = __ pc();
8358 __ enter();
8359
8360 // Inputs
8361 const Register set = c_rarg0;
8362 const Register aLimbs = c_rarg1;
8363 const Register bLimbs = c_rarg2;
8364 const Register length = c_rarg3;
8365
8366 Label L_Length5, L_Length10, L_Length14, L_Length16, L_Length19, L_Default, L_Done;
8367
8368 /*
8369 int maskValue = -set;
8370 for (int i = 0; i < a.length; i++) {
8371 long dummyLimbs = maskValue & (a[i] ^ b[i]);
8372 a[i] = dummyLimbs ^ a[i];
8373 }
8374 */
8375 Register mask_scalar = r4;
8376 FloatRegister mask_vec = v0;
8377
8378 __ neg(mask_scalar, set);
8379 __ dup(mask_vec, __ T2D, mask_scalar);
8380
8381 __ cmp(length, (u1)5);
8382 __ br(Assembler::EQ, L_Length5);
8383 __ cmp(length, (u1)10);
8384 __ br(Assembler::EQ, L_Length10);
8385 __ cmp(length, (u1)14);
8386 __ br(Assembler::EQ, L_Length14);
8387 __ cmp(length, (u1)16);
8388 __ br(Assembler::EQ, L_Length16);
8389 __ cmp(length, (u1)19);
8390 __ br(Assembler::EQ, L_Length19);
8391 __ b(L_Default);
8392
8393
8394 // Length = 5
8395 // Use 5 GPRs (neon not faster with this few limbs)
8396 __ BIND(L_Length5);
8397 {
8398 Register a0 = r5;
8399 Register a1 = r6;
8400 Register a2 = r7;
8401 Register a3 = r10;
8402 Register a4 = r11;
8403 Register b0 = r12;
8404 Register b1 = r13;
8405 Register b2 = r14;
8406 Register b3 = r15;
8407 Register b4 = r19;
8408
8409 __ push(r19, sp);
8410
8411 __ ldr(a0, aLimbs);
8412 __ ldr(a1, Address(aLimbs, 1 * BytesPerLong));
8413 __ ldr(a2, Address(aLimbs, 2 * BytesPerLong));
8414 __ ldr(a3, Address(aLimbs, 3 * BytesPerLong));
8415 __ ldr(a4, Address(aLimbs, 4 * BytesPerLong));
8416
8417 __ ldr(b0, bLimbs);
8418 __ ldr(b1, Address(bLimbs, 1 * BytesPerLong));
8419 __ ldr(b2, Address(bLimbs, 2 * BytesPerLong));
8420 __ ldr(b3, Address(bLimbs, 3 * BytesPerLong));
8421 __ ldr(b4, Address(bLimbs, 4 * BytesPerLong));
8422
8423 __ eor(b0, b0, a0);
8424 __ eor(b1, b1, a1);
8425 __ eor(b2, b2, a2);
8426 __ eor(b3, b3, a3);
8427 __ eor(b4, b4, a4);
8428
8429 __ andr(b0, b0, mask_scalar);
8430 __ andr(b1, b1, mask_scalar);
8431 __ andr(b2, b2, mask_scalar);
8432 __ andr(b3, b3, mask_scalar);
8433 __ andr(b4, b4, mask_scalar);
8434
8435 __ eor(a0, a0, b0);
8436 __ eor(a1, a1, b1);
8437 __ eor(a2, a2, b2);
8438 __ eor(a3, a3, b3);
8439 __ eor(a4, a4, b4);
8440
8441 __ str(a0, aLimbs);
8442 __ str(a1, Address(aLimbs, 1 * BytesPerLong));
8443 __ str(a2, Address(aLimbs, 2 * BytesPerLong));
8444 __ str(a3, Address(aLimbs, 3 * BytesPerLong));
8445 __ str(a4, Address(aLimbs, 4 * BytesPerLong));
8446
8447 __ pop(r19, sp);
8448 __ b(L_Done);
8449 }
8450
8451 // Length = 10
8452 // Split into 4 neon regs and 2 GPRs
8453 __ BIND(L_Length10);
8454 {
8455 Register a9 = r10;
8456 Register a10 = r11;
8457 Register b9 = r12;
8458 Register b10 = r13;
8459
8460 VSeq<4> a_vec(16);
8461 VSeq<4> b_vec(20);
8462
8463 __ ldr(a9, Address(aLimbs, 8 * BytesPerLong));
8464 __ ldr(a10, Address(aLimbs, 9 * BytesPerLong));
8465 __ ldr(b9, Address(bLimbs, 8 * BytesPerLong));
8466 __ ldr(b10, Address(bLimbs, 9 * BytesPerLong));
8467
8468 vs_ldpq(a_vec, aLimbs);
8469
8470 __ eor(b9, b9, a9);
8471 __ eor(b10, b10, a10);
8472
8473 vs_ldpq(b_vec, bLimbs);
8474
8475 __ andr(b9, b9, mask_scalar);
8476 __ andr(b10, b10, mask_scalar);
8477
8478 vs_eor(b_vec, b_vec, a_vec);
8479
8480 __ eor(a9, a9, b9);
8481 __ eor(a10, a10, b10);
8482
8483 vs_andr(b_vec, b_vec, mask_vec);
8484
8485 __ str(a9, Address(aLimbs, 8 * BytesPerLong));
8486 __ str(a10, Address(aLimbs, 9 * BytesPerLong));
8487
8488 vs_eor(a_vec, a_vec, b_vec);
8489 vs_stpq_post(a_vec, aLimbs);
8490
8491 __ b(L_Done);
8492 }
8493
8494 // Length = 14
8495 // Split into 5 neon regs and 4 GPRs
8496 __ BIND(L_Length14);
8497 {
8498 Register a10 = r5;
8499 Register a11 = r6;
8500 Register a12 = r7;
8501 Register a13 = r8;
8502 Register b10 = r9;
8503 Register b11 = r10;
8504 Register b12 = r11;
8505 Register b13 = r12;
8506
8507 VSeq<5> a_vec(16);
8508 VSeq<5> b_vec(22);
8509
8510 int offsets[2] = { 0, 32 };
8511
8512 __ ldr(a10, Address(aLimbs, 10 * BytesPerLong));
8513 __ ldr(a11, Address(aLimbs, 11 * BytesPerLong));
8514 __ ldr(a12, Address(aLimbs, 12 * BytesPerLong));
8515 __ ldr(a13, Address(aLimbs, 13 * BytesPerLong));
8516
8517 __ ldr(b10, Address(bLimbs, 10 * BytesPerLong));
8518 __ ldr(b11, Address(bLimbs, 11 * BytesPerLong));
8519 __ ldr(b12, Address(bLimbs, 12 * BytesPerLong));
8520 __ ldr(b13, Address(bLimbs, 13 * BytesPerLong));
8521
8522 __ ld1(a_vec[0], __ T2D, aLimbs);
8523 vs_ldpq_indexed(vs_tail(a_vec), aLimbs, 16, offsets);
8524
8525 __ eor(b10, b10, a10);
8526 __ eor(b11, b11, a11);
8527 __ eor(b12, b12, a12);
8528 __ eor(b13, b13, a13);
8529
8530 __ ld1(b_vec[0], __ T2D, bLimbs);
8531 vs_ldpq_indexed(vs_tail(b_vec), bLimbs, 16, offsets);
8532
8533 __ andr(b10, b10, mask_scalar);
8534 __ andr(b11, b11, mask_scalar);
8535 __ andr(b12, b12, mask_scalar);
8536 __ andr(b13, b13, mask_scalar);
8537
8538 vs_eor(b_vec, b_vec, a_vec);
8539
8540 __ eor(a10, a10, b10);
8541 __ eor(a11, a11, b11);
8542 __ eor(a12, a12, b12);
8543 __ eor(a13, a13, b13);
8544
8545 vs_andr(b_vec, b_vec, mask_vec);
8546
8547 __ str(a10, Address(aLimbs, 10 * BytesPerLong));
8548 __ str(a11, Address(aLimbs, 11 * BytesPerLong));
8549 __ str(a12, Address(aLimbs, 12 * BytesPerLong));
8550 __ str(a13, Address(aLimbs, 13 * BytesPerLong));
8551
8552 vs_eor(a_vec, a_vec, b_vec);
8553
8554 __ st1(a_vec[0], __ T2D, aLimbs);
8555 vs_stpq_indexed(vs_tail(a_vec), aLimbs, 16, offsets);
8556
8557 __ b(L_Done);
8558 }
8559
8560 // Length = 16
8561 // Use 8 neon regs
8562 __ BIND(L_Length16);
8563 {
8564 VSeq<8> a_vec(16);
8565 VSeq<8> b_vec(24);
8566
8567 vs_ldpq(a_vec, aLimbs);
8568 vs_ldpq(b_vec, bLimbs);
8569 vs_eor(b_vec, b_vec, a_vec);
8570 vs_andr(b_vec, b_vec, mask_vec);
8571 vs_eor(a_vec, a_vec, b_vec);
8572 vs_stpq_post(a_vec, aLimbs);
8573
8574 __ b(L_Done);
8575 }
8576
8577 // Length = 19
8578 // Split into 8 neon regs and 3 GPRs
8579 __ BIND(L_Length19);
8580 {
8581 Register a17 = r10;
8582 Register a18 = r11;
8583 Register a19 = r12;
8584 Register b17 = r13;
8585 Register b18 = r14;
8586 Register b19 = r15;
8587
8588 VSeq<8> a_vec(16);
8589 VSeq<8> b_vec(24);
8590
8591 __ ldr(a17, Address(aLimbs, 16 * BytesPerLong));
8592 __ ldr(a18, Address(aLimbs, 17 * BytesPerLong));
8593 __ ldr(a19, Address(aLimbs, 18 * BytesPerLong));
8594 __ ldr(b17, Address(bLimbs, 16 * BytesPerLong));
8595 __ ldr(b18, Address(bLimbs, 17 * BytesPerLong));
8596 __ ldr(b19, Address(bLimbs, 18 * BytesPerLong));
8597
8598 vs_ldpq(a_vec, aLimbs);
8599
8600 __ eor(b17, b17, a17);
8601 __ eor(b18, b18, a18);
8602 __ eor(b19, b19, a19);
8603
8604 vs_ldpq(b_vec, bLimbs);
8605
8606 __ andr(b17, b17, mask_scalar);
8607 __ andr(b18, b18, mask_scalar);
8608 __ andr(b19, b19, mask_scalar);
8609
8610 vs_eor(b_vec, b_vec, a_vec);
8611
8612 __ eor(a17, a17, b17);
8613 __ eor(a18, a18, b18);
8614 __ eor(a19, a19, b19);
8615
8616 vs_andr(b_vec, b_vec, mask_vec);
8617
8618 __ str(a17, Address(aLimbs, 16 * BytesPerLong));
8619 __ str(a18, Address(aLimbs, 17 * BytesPerLong));
8620 __ str(a19, Address(aLimbs, 18 * BytesPerLong));
8621
8622 vs_eor(a_vec, a_vec, b_vec);
8623 vs_stpq_post(a_vec, aLimbs);
8624
8625 __ b(L_Done);
8626 }
8627
8628 __ BIND(L_Default);
8629 {
8630 Register ctr = r5;
8631 Register a_val = r6;
8632 Register b_val = r7;
8633
8634 __ mov(ctr, length); // length (the number of limbs) is never 0
8635
8636 Label default_loop;
8637 __ BIND(default_loop);
8638
8639 __ ldr(a_val, aLimbs);
8640 __ ldr(b_val, __ post(bLimbs, 8));
8641 __ eor(b_val, b_val, a_val);
8642 __ andr(b_val, b_val, mask_scalar);
8643 __ eor(a_val, a_val, b_val);
8644 __ str(a_val, __ post(aLimbs, 8));
8645 __ sub(ctr, ctr, 1);
8646 __ cmp(ctr, (u1)0);
8647 __ br(Assembler::NE, default_loop);
8648 }
8649
8650 __ BIND(L_Done);
8651 __ leave(); // required for proper stackwalking of RuntimeStub frame
8652 __ mov(r0, zr); // return 0
8653 __ ret(lr);
8654
8655 // record the stub entry and end
8656 store_archive_data(stub_id, start, __ pc());
8657
8658 return start;
8659 }
8660
8661 /**
8662 * Arithmetic polynomial multiplication in Curve25519. The algorithm mimics
8663 * the version in the IntegerPolynomial25519 class, including the use of all
8664 * columns (no folding method).
8665 *
8666 * Arguments:
8667 *
8668 * Inputs:
8669 * c_rarg0 - long[] aLimbs
8670 * c_rarg1 - long[] bLimbs
8671 *
8672 * Output:
8673 * c_rarg2 - long[] rLimbs result
8674 */
8675 address generate_intpoly_mult_25519() {
8676 StubId stub_id = StubId::stubgen_intpoly_mult_25519_id;
8677 int entry_count = StubInfo::entry_count(stub_id);
8678 assert(entry_count == 1, "sanity check");
8679 address start = load_archive_data(stub_id);
8680 if (start != nullptr) {
8681 return start;
8682 }
8683 __ align(CodeEntryAlignment);
8684 StubCodeMark mark(this, stub_id);
8685 start = __ pc();
8686 __ enter();
8687
8688 // Register Map
8689 const Register aLimbs = c_rarg0; // r0
8690 const Register bLimbs = c_rarg1; // r1
8691 const Register rLimbs = c_rarg2; // r2
8692
8693 Register c[] = {r3, r4, r5, r6, r7, r8, r9, r10, r11, r12};
8694 Register a = r13;
8695 Register b = r14;
8696 Register term = r15;
8697 Register low = r16;
8698 Register high = r17;
8699
8700 const int32_t limbs = 5;
8701 const int32_t bpl = 51;
8702 const int32_t rem = 64 - bpl;
8703 const int32_t TERM = 19;
8704 const int32_t columns = limbs * 2;
8705 const uint64_t mask = (uint64_t) -1 >> rem;
8706 const uint64_t CARRY_ADD = (uint64_t) 1 << (bpl - 1);
8707
8708 __ mov(term, TERM);
8709 for (int i = 0; i < columns; i++) {
8710 __ mov(c[i], zr);
8711 }
8712
8713 // Perform high/low multiplication with signed 5x51 bit limbs
8714 for (int i = 0; i < limbs; i++) {
8715 __ ldr(b, Address(bLimbs, i * 8));
8716 for (int j = 0; j < limbs; j++) {
8717 __ ldr(a, Address(aLimbs, j * 8));
8718 __ smulh(high, a, b);
8719 __ mul(low, a, b);
8720 __ extr(high, high, low, bpl);
8721 __ andr(low, low, mask);
8722 __ add(c[i + j], c[i + j], low);
8723 __ add(c[i + j + 1], c[i + j + 1], high);
8724 }
8725 }
8726
8727 for (int i = 0; i < limbs; i++) {
8728 __ mul(c[i + 5], c[i + 5], term);
8729 __ add(c[i], c[i], c[i + 5]);
8730 }
8731
8732 // Carry-add with reduction from high limb
8733 Register tmp = low;
8734 Register carry_add = high;
8735 __ mov(carry_add, CARRY_ADD);
8736
8737 // Limb 3
8738 __ add(tmp, c[3], carry_add);
8739 __ asr(tmp, tmp, bpl);
8740 __ add(c[4], c[4], tmp);
8741 __ lsl(tmp, tmp, bpl);
8742 __ sub(c[3], c[3], tmp);
8743
8744 // Limb 4
8745 __ add(tmp, c[4], carry_add);
8746 __ asr(tmp, tmp, bpl);
8747
8748 // Reduce high order limb and fold back into low order limb
8749 __ mul(term, tmp, term);
8750 __ add(c[0], c[0], term);
8751
8752 __ lsl(tmp, tmp, bpl);
8753 __ sub(c[4], c[4], tmp);
8754
8755 // Limbs 0 - 3
8756 for (int i = 0; i < (limbs - 1); i++) {
8757 __ add(tmp, c[i], carry_add);
8758 __ asr(tmp, tmp, bpl);
8759 __ add(c[i + 1], c[i + 1], tmp);
8760 __ lsl(tmp, tmp, bpl);
8761 __ sub(c[i], c[i], tmp);
8762 }
8763
8764 for (int i = 0; i < limbs; i++) {
8765 __ str(c[i], Address(rLimbs, i * 8));
8766 }
8767
8768 __ mov(r0, 0);
8769 __ leave(); // required for proper stackwalking of RuntimeStub frame
8770 __ ret(lr);
8771
8772 // record the stub entry and end
8773 store_archive_data(stub_id, start, __ pc());
8774
8775 return start;
8776 }
8777
8778 void bcax5(Register a0, Register a1, Register a2, Register a3, Register a4,
8779 Register tmp0, Register tmp1, Register tmp2) {
8780 __ bic(tmp0, a2, a1); // for a0
8781 __ bic(tmp1, a3, a2); // for a1
8782 __ bic(tmp2, a4, a3); // for a2
8783 __ eor(a2, a2, tmp2);
8784 __ bic(tmp2, a0, a4); // for a3
8785 __ eor(a3, a3, tmp2);
8786 __ bic(tmp2, a1, a0); // for a4
8787 __ eor(a0, a0, tmp0);
8788 __ eor(a1, a1, tmp1);
8789 __ eor(a4, a4, tmp2);
8790 }
8791
8792 void keccak_round_gpr(bool can_use_fp, bool can_use_r18, Register rc,
8793 Register a0, Register a1, Register a2, Register a3, Register a4,
8794 Register a5, Register a6, Register a7, Register a8, Register a9,
8795 Register a10, Register a11, Register a12, Register a13, Register a14,
8796 Register a15, Register a16, Register a17, Register a18, Register a19,
8797 Register a20, Register a21, Register a22, Register a23, Register a24,
8798 Register tmp0, Register tmp1, Register tmp2) {
8799 __ eor3(tmp1, a4, a9, a14);
8800 __ eor3(tmp0, tmp1, a19, a24); // tmp0 = a4^a9^a14^a19^a24 = c4
8801 __ eor3(tmp2, a1, a6, a11);
8802 __ eor3(tmp1, tmp2, a16, a21); // tmp1 = a1^a6^a11^a16^a21 = c1
8803 __ rax1(tmp2, tmp0, tmp1); // d0
8804 {
8805
8806 Register tmp3, tmp4;
8807 if (can_use_fp && can_use_r18) {
8808 tmp3 = rfp;
8809 tmp4 = r18_tls;
8810 } else {
8811 tmp3 = a4;
8812 tmp4 = a9;
8813 __ stp(tmp3, tmp4, __ pre(sp, -16));
8814 }
8815
8816 __ eor3(tmp3, a0, a5, a10);
8817 __ eor3(tmp4, tmp3, a15, a20); // tmp4 = a0^a5^a10^a15^a20 = c0
8818 __ eor(a0, a0, tmp2);
8819 __ eor(a5, a5, tmp2);
8820 __ eor(a10, a10, tmp2);
8821 __ eor(a15, a15, tmp2);
8822 __ eor(a20, a20, tmp2); // d0(tmp2)
8823 __ eor3(tmp3, a2, a7, a12);
8824 __ eor3(tmp2, tmp3, a17, a22); // tmp2 = a2^a7^a12^a17^a22 = c2
8825 __ rax1(tmp3, tmp4, tmp2); // d1
8826 __ eor(a1, a1, tmp3);
8827 __ eor(a6, a6, tmp3);
8828 __ eor(a11, a11, tmp3);
8829 __ eor(a16, a16, tmp3);
8830 __ eor(a21, a21, tmp3); // d1(tmp3)
8831 __ rax1(tmp3, tmp2, tmp0); // d3
8832 __ eor3(tmp2, a3, a8, a13);
8833 __ eor3(tmp0, tmp2, a18, a23); // tmp0 = a3^a8^a13^a18^a23 = c3
8834 __ eor(a3, a3, tmp3);
8835 __ eor(a8, a8, tmp3);
8836 __ eor(a13, a13, tmp3);
8837 __ eor(a18, a18, tmp3);
8838 __ eor(a23, a23, tmp3);
8839 __ rax1(tmp2, tmp1, tmp0); // d2
8840 __ eor(a2, a2, tmp2);
8841 __ eor(a7, a7, tmp2);
8842 __ eor(a12, a12, tmp2);
8843 __ rax1(tmp0, tmp0, tmp4); // d4
8844 if (!can_use_fp || !can_use_r18) {
8845 __ ldp(tmp3, tmp4, __ post(sp, 16));
8846 }
8847 __ eor(a17, a17, tmp2);
8848 __ eor(a22, a22, tmp2);
8849 __ eor(a4, a4, tmp0);
8850 __ eor(a9, a9, tmp0);
8851 __ eor(a14, a14, tmp0);
8852 __ eor(a19, a19, tmp0);
8853 __ eor(a24, a24, tmp0);
8854 }
8855
8856 __ rol(tmp0, a10, 3);
8857 __ rol(a10, a1, 1);
8858 __ rol(a1, a6, 44);
8859 __ rol(a6, a9, 20);
8860 __ rol(a9, a22, 61);
8861 __ rol(a22, a14, 39);
8862 __ rol(a14, a20, 18);
8863 __ rol(a20, a2, 62);
8864 __ rol(a2, a12, 43);
8865 __ rol(a12, a13, 25);
8866 __ rol(a13, a19, 8) ;
8867 __ rol(a19, a23, 56);
8868 __ rol(a23, a15, 41);
8869 __ rol(a15, a4, 27);
8870 __ rol(a4, a24, 14);
8871 __ rol(a24, a21, 2);
8872 __ rol(a21, a8, 55);
8873 __ rol(a8, a16, 45);
8874 __ rol(a16, a5, 36);
8875 __ rol(a5, a3, 28);
8876 __ rol(a3, a18, 21);
8877 __ rol(a18, a17, 15);
8878 __ rol(a17, a11, 10);
8879 __ rol(a11, a7, 6);
8880 __ mov(a7, tmp0);
8881
8882 bcax5(a0, a1, a2, a3, a4, tmp0, tmp1, tmp2);
8883 bcax5(a5, a6, a7, a8, a9, tmp0, tmp1, tmp2);
8884 bcax5(a10, a11, a12, a13, a14, tmp0, tmp1, tmp2);
8885 bcax5(a15, a16, a17, a18, a19, tmp0, tmp1, tmp2);
8886 bcax5(a20, a21, a22, a23, a24, tmp0, tmp1, tmp2);
8887
8888 __ ldr(tmp1, __ post(rc, 8));
8889 __ eor(a0, a0, tmp1);
8890
8891 }
8892
8893 // Arguments:
8894 //
8895 // Inputs:
8896 // c_rarg0 - byte[] source+offset
8897 // c_rarg1 - byte[] SHA.state
8898 // c_rarg2 - int block_size
8899 // c_rarg3 - int offset
8900 // c_rarg4 - int limit
8901 //
8902 address generate_sha3_implCompress_gpr(StubId stub_id) {
8903 bool multi_block;
8904 switch (stub_id) {
8905 case StubId::stubgen_sha3_implCompress_id:
8906 multi_block = false;
8907 break;
8908 case StubId::stubgen_sha3_implCompressMB_id:
8909 multi_block = true;
8910 break;
8911 default:
8912 ShouldNotReachHere();
8913 }
8914 int entry_count = StubInfo::entry_count(stub_id);
8915 assert(entry_count == 1, "sanity check");
8916 address start = load_archive_data(stub_id);
8917 if (start != nullptr) {
8918 return start;
8919 }
8920 __ align(CodeEntryAlignment);
8921 StubCodeMark mark(this, stub_id);
8922 start = __ pc();
8923
8924 Register buf = c_rarg0;
8925 Register state = c_rarg1;
8926 Register block_size = c_rarg2;
8927 Register ofs = c_rarg3;
8928 Register limit = c_rarg4;
8929
8930 // use r3.r17,r19..r28 to keep a0..a24.
8931 // a0..a24 are respective locals from SHA3.java
8932 Register a0 = r25,
8933 a1 = r26,
8934 a2 = r27,
8935 a3 = r3,
8936 a4 = r4,
8937 a5 = r5,
8938 a6 = r6,
8939 a7 = r7,
8940 a8 = rscratch1, // r8
8941 a9 = rscratch2, // r9
8942 a10 = r10,
8943 a11 = r11,
8944 a12 = r12,
8945 a13 = r13,
8946 a14 = r14,
8947 a15 = r15,
8948 a16 = r16,
8949 a17 = r17,
8950 a18 = r28,
8951 a19 = r19,
8952 a20 = r20,
8953 a21 = r21,
8954 a22 = r22,
8955 a23 = r23,
8956 a24 = r24;
8957
8958 Register tmp0 = block_size, tmp1 = buf, tmp2 = state, tmp3 = r30;
8959
8960 Label sha3_loop, rounds24_preloop, loop_body;
8961 Label sha3_512_or_sha3_384, shake128;
8962
8963 bool can_use_r18 = false;
8964 #ifndef R18_RESERVED
8965 can_use_r18 = true;
8966 #endif
8967 bool can_use_fp = !PreserveFramePointer;
8968
8969 __ enter();
8970
8971 // save almost all yet unsaved gpr registers on stack
8972 __ str(block_size, __ pre(sp, -128));
8973 if (multi_block) {
8974 __ stpw(ofs, limit, Address(sp, 8));
8975 }
8976 // 8 bytes at sp+16 will be used to keep buf
8977 __ stp(r19, r20, Address(sp, 32));
8978 __ stp(r21, r22, Address(sp, 48));
8979 __ stp(r23, r24, Address(sp, 64));
8980 __ stp(r25, r26, Address(sp, 80));
8981 __ stp(r27, r28, Address(sp, 96));
8982 if (can_use_r18 && can_use_fp) {
8983 __ stp(r18_tls, state, Address(sp, 112));
8984 } else {
8985 __ str(state, Address(sp, 112));
8986 }
8987
8988 // begin sha3 calculations: loading a0..a24 from state arrary
8989 __ ldp(a0, a1, state);
8990 __ ldp(a2, a3, Address(state, 16));
8991 __ ldp(a4, a5, Address(state, 32));
8992 __ ldp(a6, a7, Address(state, 48));
8993 __ ldp(a8, a9, Address(state, 64));
8994 __ ldp(a10, a11, Address(state, 80));
8995 __ ldp(a12, a13, Address(state, 96));
8996 __ ldp(a14, a15, Address(state, 112));
8997 __ ldp(a16, a17, Address(state, 128));
8998 __ ldp(a18, a19, Address(state, 144));
8999 __ ldp(a20, a21, Address(state, 160));
9000 __ ldp(a22, a23, Address(state, 176));
9001 __ ldr(a24, Address(state, 192));
9002
9003 __ BIND(sha3_loop);
9004
9005 // load input
9006 __ ldp(tmp3, tmp2, __ post(buf, 16));
9007 __ eor(a0, a0, tmp3);
9008 __ eor(a1, a1, tmp2);
9009 __ ldp(tmp3, tmp2, __ post(buf, 16));
9010 __ eor(a2, a2, tmp3);
9011 __ eor(a3, a3, tmp2);
9012 __ ldp(tmp3, tmp2, __ post(buf, 16));
9013 __ eor(a4, a4, tmp3);
9014 __ eor(a5, a5, tmp2);
9015 __ ldr(tmp3, __ post(buf, 8));
9016 __ eor(a6, a6, tmp3);
9017
9018 // block_size == 72, SHA3-512; block_size == 104, SHA3-384
9019 __ tbz(block_size, 7, sha3_512_or_sha3_384);
9020
9021 __ ldp(tmp3, tmp2, __ post(buf, 16));
9022 __ eor(a7, a7, tmp3);
9023 __ eor(a8, a8, tmp2);
9024 __ ldp(tmp3, tmp2, __ post(buf, 16));
9025 __ eor(a9, a9, tmp3);
9026 __ eor(a10, a10, tmp2);
9027 __ ldp(tmp3, tmp2, __ post(buf, 16));
9028 __ eor(a11, a11, tmp3);
9029 __ eor(a12, a12, tmp2);
9030 __ ldp(tmp3, tmp2, __ post(buf, 16));
9031 __ eor(a13, a13, tmp3);
9032 __ eor(a14, a14, tmp2);
9033 __ ldp(tmp3, tmp2, __ post(buf, 16));
9034 __ eor(a15, a15, tmp3);
9035 __ eor(a16, a16, tmp2);
9036
9037 // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256
9038 __ andw(tmp2, block_size, 48);
9039 __ cbzw(tmp2, rounds24_preloop);
9040 __ tbnz(block_size, 5, shake128);
9041 // block_size == 144, bit5 == 0, SHA3-244
9042 __ ldr(tmp3, __ post(buf, 8));
9043 __ eor(a17, a17, tmp3);
9044 __ b(rounds24_preloop);
9045
9046 __ BIND(shake128);
9047 __ ldp(tmp3, tmp2, __ post(buf, 16));
9048 __ eor(a17, a17, tmp3);
9049 __ eor(a18, a18, tmp2);
9050 __ ldp(tmp3, tmp2, __ post(buf, 16));
9051 __ eor(a19, a19, tmp3);
9052 __ eor(a20, a20, tmp2);
9053 __ b(rounds24_preloop); // block_size == 168, SHAKE128
9054
9055 __ BIND(sha3_512_or_sha3_384);
9056 __ ldp(tmp3, tmp2, __ post(buf, 16));
9057 __ eor(a7, a7, tmp3);
9058 __ eor(a8, a8, tmp2);
9059 __ tbz(block_size, 5, rounds24_preloop); // SHA3-512
9060
9061 // SHA3-384
9062 __ ldp(tmp3, tmp2, __ post(buf, 16));
9063 __ eor(a9, a9, tmp3);
9064 __ eor(a10, a10, tmp2);
9065 __ ldp(tmp3, tmp2, __ post(buf, 16));
9066 __ eor(a11, a11, tmp3);
9067 __ eor(a12, a12, tmp2);
9068
9069 __ BIND(rounds24_preloop);
9070 __ fmovs(v0, 24.0); // float loop counter,
9071 __ fmovs(v1, 1.0); // exact representation
9072
9073 __ str(buf, Address(sp, 16));
9074 __ lea(tmp3, ExternalAddress((address) _sha3_round_consts));
9075
9076 __ BIND(loop_body);
9077 keccak_round_gpr(can_use_fp, can_use_r18, tmp3,
9078 a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12,
9079 a13, a14, a15, a16, a17, a18, a19, a20, a21, a22, a23, a24,
9080 tmp0, tmp1, tmp2);
9081 __ fsubs(v0, v0, v1);
9082 __ fcmps(v0, 0.0);
9083 __ br(__ NE, loop_body);
9084
9085 if (multi_block) {
9086 __ ldrw(block_size, sp); // block_size
9087 __ ldpw(tmp2, tmp1, Address(sp, 8)); // offset, limit
9088 __ addw(tmp2, tmp2, block_size);
9089 __ cmpw(tmp2, tmp1);
9090 __ strw(tmp2, Address(sp, 8)); // store offset in case we're jumping
9091 __ ldr(buf, Address(sp, 16)); // restore buf in case we're jumping
9092 __ br(Assembler::LE, sha3_loop);
9093 __ movw(c_rarg0, tmp2); // return offset
9094 }
9095 if (can_use_fp && can_use_r18) {
9096 __ ldp(r18_tls, state, Address(sp, 112));
9097 } else {
9098 __ ldr(state, Address(sp, 112));
9099 }
9100 // save calculated sha3 state
9101 __ stp(a0, a1, Address(state));
9102 __ stp(a2, a3, Address(state, 16));
9103 __ stp(a4, a5, Address(state, 32));
9104 __ stp(a6, a7, Address(state, 48));
9105 __ stp(a8, a9, Address(state, 64));
9106 __ stp(a10, a11, Address(state, 80));
9107 __ stp(a12, a13, Address(state, 96));
9108 __ stp(a14, a15, Address(state, 112));
9109 __ stp(a16, a17, Address(state, 128));
9110 __ stp(a18, a19, Address(state, 144));
9111 __ stp(a20, a21, Address(state, 160));
9112 __ stp(a22, a23, Address(state, 176));
9113 __ str(a24, Address(state, 192));
9114
9115 // restore required registers from stack
9116 __ ldp(r19, r20, Address(sp, 32));
9117 __ ldp(r21, r22, Address(sp, 48));
9118 __ ldp(r23, r24, Address(sp, 64));
9119 __ ldp(r25, r26, Address(sp, 80));
9120 __ ldp(r27, r28, Address(sp, 96));
9121 if (can_use_fp && can_use_r18) {
9122 __ add(rfp, sp, 128); // leave() will copy rfp to sp below
9123 } // else no need to recalculate rfp, since it wasn't changed
9124
9125 __ leave();
9126
9127 __ ret(lr);
9128
9129 // record the stub entry and end
9130 store_archive_data(stub_id, start, __ pc());
9131
9132 return start;
9133 }
9134
9135 /**
9136 * Arguments:
9137 *
9138 * Inputs:
9139 * c_rarg0 - int crc
9140 * c_rarg1 - byte* buf
9141 * c_rarg2 - int length
9142 *
9143 * Output:
9144 * rax - int crc result
9145 */
9146 address generate_updateBytesCRC32() {
9147 assert(UseCRC32Intrinsics, "what are we doing here?");
9148 StubId stub_id = StubId::stubgen_updateBytesCRC32_id;
9149 int entry_count = StubInfo::entry_count(stub_id);
9150 assert(entry_count == 1, "sanity check");
9151 address start = load_archive_data(stub_id);
9152 if (start != nullptr) {
9153 return start;
9154 }
9155 __ align(CodeEntryAlignment);
9156 StubCodeMark mark(this, stub_id);
9157
9158 start = __ pc();
9159
9160 const Register crc = c_rarg0; // crc
9161 const Register buf = c_rarg1; // source java byte array address
9162 const Register len = c_rarg2; // length
9163 const Register table0 = c_rarg3; // crc_table address
9164 const Register table1 = c_rarg4;
9165 const Register table2 = c_rarg5;
9166 const Register table3 = c_rarg6;
9167 const Register tmp3 = c_rarg7;
9168
9169 BLOCK_COMMENT("Entry:");
9170 __ enter(); // required for proper stackwalking of RuntimeStub frame
9171
9172 __ kernel_crc32(crc, buf, len,
9173 table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
9174
9175 __ leave(); // required for proper stackwalking of RuntimeStub frame
9176 __ ret(lr);
9177
9178 // record the stub entry and end
9179 store_archive_data(stub_id, start, __ pc());
9180
9181 return start;
9182 }
9183
9184 /**
9185 * Arguments:
9186 *
9187 * Inputs:
9188 * c_rarg0 - int crc
9189 * c_rarg1 - byte* buf
9190 * c_rarg2 - int length
9191 * c_rarg3 - int* table
9192 *
9193 * Output:
9194 * r0 - int crc result
9195 */
9196 address generate_updateBytesCRC32C() {
9197 assert(UseCRC32CIntrinsics, "what are we doing here?");
9198 StubId stub_id = StubId::stubgen_updateBytesCRC32C_id;
9199 int entry_count = StubInfo::entry_count(stub_id);
9200 assert(entry_count == 1, "sanity check");
9201 address start = load_archive_data(stub_id);
9202 if (start != nullptr) {
9203 return start;
9204 }
9205 __ align(CodeEntryAlignment);
9206 StubCodeMark mark(this, stub_id);
9207
9208 start = __ pc();
9209
9210 const Register crc = c_rarg0; // crc
9211 const Register buf = c_rarg1; // source java byte array address
9212 const Register len = c_rarg2; // length
9213 const Register table0 = c_rarg3; // crc_table address
9214 const Register table1 = c_rarg4;
9215 const Register table2 = c_rarg5;
9216 const Register table3 = c_rarg6;
9217 const Register tmp3 = c_rarg7;
9218
9219 BLOCK_COMMENT("Entry:");
9220 __ enter(); // required for proper stackwalking of RuntimeStub frame
9221
9222 __ kernel_crc32c(crc, buf, len,
9223 table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
9224
9225 __ leave(); // required for proper stackwalking of RuntimeStub frame
9226 __ ret(lr);
9227
9228 // record the stub entry and end
9229 store_archive_data(stub_id, start, __ pc());
9230
9231 return start;
9232 }
9233
9234 /***
9235 * Arguments:
9236 *
9237 * Inputs:
9238 * c_rarg0 - int adler
9239 * c_rarg1 - byte* buff
9240 * c_rarg2 - int len
9241 *
9242 * Output:
9243 * c_rarg0 - int adler result
9244 */
9245 address generate_updateBytesAdler32() {
9246 StubId stub_id = StubId::stubgen_updateBytesAdler32_id;
9247 int entry_count = StubInfo::entry_count(stub_id);
9248 assert(entry_count == 1, "sanity check");
9249 address start = load_archive_data(stub_id);
9250 if (start != nullptr) {
9251 return start;
9252 }
9253 __ align(CodeEntryAlignment);
9254 StubCodeMark mark(this, stub_id);
9255 start = __ pc();
9256
9257 Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
9258
9259 // Aliases
9260 Register adler = c_rarg0;
9261 Register s1 = c_rarg0;
9262 Register s2 = c_rarg3;
9263 Register buff = c_rarg1;
9264 Register len = c_rarg2;
9265 Register nmax = r4;
9266 Register base = r5;
9267 Register count = r6;
9268 Register temp0 = rscratch1;
9269 Register temp1 = rscratch2;
9270 FloatRegister vbytes = v0;
9271 FloatRegister vs1acc = v1;
9272 FloatRegister vs2acc = v2;
9273 FloatRegister vtable = v3;
9274
9275 // Max number of bytes we can process before having to take the mod
9276 // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
9277 uint64_t BASE = 0xfff1;
9278 uint64_t NMAX = 0x15B0;
9279
9280 __ mov(base, BASE);
9281 __ mov(nmax, NMAX);
9282
9283 // Load accumulation coefficients for the upper 16 bits
9284 __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table));
9285 __ ld1(vtable, __ T16B, Address(temp0));
9286
9287 // s1 is initialized to the lower 16 bits of adler
9288 // s2 is initialized to the upper 16 bits of adler
9289 __ ubfx(s2, adler, 16, 16); // s2 = ((adler >> 16) & 0xffff)
9290 __ uxth(s1, adler); // s1 = (adler & 0xffff)
9291
9292 // The pipelined loop needs at least 16 elements for 1 iteration
9293 // It does check this, but it is more effective to skip to the cleanup loop
9294 __ cmp(len, (u1)16);
9295 __ br(Assembler::HS, L_nmax);
9296 __ cbz(len, L_combine);
9297
9298 __ bind(L_simple_by1_loop);
9299 __ ldrb(temp0, Address(__ post(buff, 1)));
9300 __ add(s1, s1, temp0);
9301 __ add(s2, s2, s1);
9302 __ subs(len, len, 1);
9303 __ br(Assembler::HI, L_simple_by1_loop);
9304
9305 // s1 = s1 % BASE
9306 __ subs(temp0, s1, base);
9307 __ csel(s1, temp0, s1, Assembler::HS);
9308
9309 // s2 = s2 % BASE
9310 __ lsr(temp0, s2, 16);
9311 __ lsl(temp1, temp0, 4);
9312 __ sub(temp1, temp1, temp0);
9313 __ add(s2, temp1, s2, ext::uxth);
9314
9315 __ subs(temp0, s2, base);
9316 __ csel(s2, temp0, s2, Assembler::HS);
9317
9318 __ b(L_combine);
9319
9320 __ bind(L_nmax);
9321 __ subs(len, len, nmax);
9322 __ sub(count, nmax, 16);
9323 __ br(Assembler::LO, L_by16);
9324
9325 __ bind(L_nmax_loop);
9326
9327 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
9328 vbytes, vs1acc, vs2acc, vtable);
9329
9330 __ subs(count, count, 16);
9331 __ br(Assembler::HS, L_nmax_loop);
9332
9333 // s1 = s1 % BASE
9334 __ lsr(temp0, s1, 16);
9335 __ lsl(temp1, temp0, 4);
9336 __ sub(temp1, temp1, temp0);
9337 __ add(temp1, temp1, s1, ext::uxth);
9338
9339 __ lsr(temp0, temp1, 16);
9340 __ lsl(s1, temp0, 4);
9341 __ sub(s1, s1, temp0);
9342 __ add(s1, s1, temp1, ext:: uxth);
9343
9344 __ subs(temp0, s1, base);
9345 __ csel(s1, temp0, s1, Assembler::HS);
9346
9347 // s2 = s2 % BASE
9348 __ lsr(temp0, s2, 16);
9349 __ lsl(temp1, temp0, 4);
9350 __ sub(temp1, temp1, temp0);
9351 __ add(temp1, temp1, s2, ext::uxth);
9352
9353 __ lsr(temp0, temp1, 16);
9354 __ lsl(s2, temp0, 4);
9355 __ sub(s2, s2, temp0);
9356 __ add(s2, s2, temp1, ext:: uxth);
9357
9358 __ subs(temp0, s2, base);
9359 __ csel(s2, temp0, s2, Assembler::HS);
9360
9361 __ subs(len, len, nmax);
9362 __ sub(count, nmax, 16);
9363 __ br(Assembler::HS, L_nmax_loop);
9364
9365 __ bind(L_by16);
9366 __ adds(len, len, count);
9367 __ br(Assembler::LO, L_by1);
9368
9369 __ bind(L_by16_loop);
9370
9371 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
9372 vbytes, vs1acc, vs2acc, vtable);
9373
9374 __ subs(len, len, 16);
9375 __ br(Assembler::HS, L_by16_loop);
9376
9377 __ bind(L_by1);
9378 __ adds(len, len, 15);
9379 __ br(Assembler::LO, L_do_mod);
9380
9381 __ bind(L_by1_loop);
9382 __ ldrb(temp0, Address(__ post(buff, 1)));
9383 __ add(s1, temp0, s1);
9384 __ add(s2, s2, s1);
9385 __ subs(len, len, 1);
9386 __ br(Assembler::HS, L_by1_loop);
9387
9388 __ bind(L_do_mod);
9389 // s1 = s1 % BASE
9390 __ lsr(temp0, s1, 16);
9391 __ lsl(temp1, temp0, 4);
9392 __ sub(temp1, temp1, temp0);
9393 __ add(temp1, temp1, s1, ext::uxth);
9394
9395 __ lsr(temp0, temp1, 16);
9396 __ lsl(s1, temp0, 4);
9397 __ sub(s1, s1, temp0);
9398 __ add(s1, s1, temp1, ext:: uxth);
9399
9400 __ subs(temp0, s1, base);
9401 __ csel(s1, temp0, s1, Assembler::HS);
9402
9403 // s2 = s2 % BASE
9404 __ lsr(temp0, s2, 16);
9405 __ lsl(temp1, temp0, 4);
9406 __ sub(temp1, temp1, temp0);
9407 __ add(temp1, temp1, s2, ext::uxth);
9408
9409 __ lsr(temp0, temp1, 16);
9410 __ lsl(s2, temp0, 4);
9411 __ sub(s2, s2, temp0);
9412 __ add(s2, s2, temp1, ext:: uxth);
9413
9414 __ subs(temp0, s2, base);
9415 __ csel(s2, temp0, s2, Assembler::HS);
9416
9417 // Combine lower bits and higher bits
9418 __ bind(L_combine);
9419 __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
9420
9421 __ ret(lr);
9422
9423 // record the stub entry and end
9424 store_archive_data(stub_id, start, __ pc());
9425
9426 return start;
9427 }
9428
9429 void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff,
9430 Register temp0, Register temp1, FloatRegister vbytes,
9431 FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) {
9432 // Below is a vectorized implementation of updating s1 and s2 for 16 bytes.
9433 // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration.
9434 // In non-vectorized code, we update s1 and s2 as:
9435 // s1 <- s1 + b1
9436 // s2 <- s2 + s1
9437 // s1 <- s1 + b2
9438 // s2 <- s2 + b1
9439 // ...
9440 // s1 <- s1 + b16
9441 // s2 <- s2 + s1
9442 // Putting above assignments together, we have:
9443 // s1_new = s1 + b1 + b2 + ... + b16
9444 // s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16)
9445 // = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1)
9446 // = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1)
9447 __ ld1(vbytes, __ T16B, Address(__ post(buff, 16)));
9448
9449 // s2 = s2 + s1 * 16
9450 __ add(s2, s2, s1, Assembler::LSL, 4);
9451
9452 // vs1acc = b1 + b2 + b3 + ... + b16
9453 // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1)
9454 __ umullv(vs2acc, __ T8B, vtable, vbytes);
9455 __ umlalv(vs2acc, __ T16B, vtable, vbytes);
9456 __ uaddlv(vs1acc, __ T16B, vbytes);
9457 __ uaddlv(vs2acc, __ T8H, vs2acc);
9458
9459 // s1 = s1 + vs1acc, s2 = s2 + vs2acc
9460 __ fmovd(temp0, vs1acc);
9461 __ fmovd(temp1, vs2acc);
9462 __ add(s1, s1, temp0);
9463 __ add(s2, s2, temp1);
9464 }
9465
9466 /**
9467 * Arguments:
9468 *
9469 * Input:
9470 * c_rarg0 - x address
9471 * c_rarg1 - x length
9472 * c_rarg2 - y address
9473 * c_rarg3 - y length
9474 * c_rarg4 - z address
9475 */
9476 address generate_multiplyToLen() {
9477 StubId stub_id = StubId::stubgen_multiplyToLen_id;
9478 int entry_count = StubInfo::entry_count(stub_id);
9479 assert(entry_count == 1, "sanity check");
9480 address start = load_archive_data(stub_id);
9481 if (start != nullptr) {
9482 return start;
9483 }
9484 __ align(CodeEntryAlignment);
9485 StubCodeMark mark(this, stub_id);
9486
9487 start = __ pc();
9488 const Register x = r0;
9489 const Register xlen = r1;
9490 const Register y = r2;
9491 const Register ylen = r3;
9492 const Register z = r4;
9493
9494 const Register tmp0 = r5;
9495 const Register tmp1 = r10;
9496 const Register tmp2 = r11;
9497 const Register tmp3 = r12;
9498 const Register tmp4 = r13;
9499 const Register tmp5 = r14;
9500 const Register tmp6 = r15;
9501 const Register tmp7 = r16;
9502
9503 BLOCK_COMMENT("Entry:");
9504 __ enter(); // required for proper stackwalking of RuntimeStub frame
9505 __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
9506 __ leave(); // required for proper stackwalking of RuntimeStub frame
9507 __ ret(lr);
9508
9509 // record the stub entry and end
9510 store_archive_data(stub_id, start, __ pc());
9511
9512 return start;
9513 }
9514
9515 address generate_squareToLen() {
9516 // squareToLen algorithm for sizes 1..127 described in java code works
9517 // faster than multiply_to_len on some CPUs and slower on others, but
9518 // multiply_to_len shows a bit better overall results
9519 StubId stub_id = StubId::stubgen_squareToLen_id;
9520 int entry_count = StubInfo::entry_count(stub_id);
9521 assert(entry_count == 1, "sanity check");
9522 address start = load_archive_data(stub_id);
9523 if (start != nullptr) {
9524 return start;
9525 }
9526 __ align(CodeEntryAlignment);
9527 StubCodeMark mark(this, stub_id);
9528 start = __ pc();
9529
9530 const Register x = r0;
9531 const Register xlen = r1;
9532 const Register z = r2;
9533 const Register y = r4; // == x
9534 const Register ylen = r5; // == xlen
9535
9536 const Register tmp0 = r3;
9537 const Register tmp1 = r10;
9538 const Register tmp2 = r11;
9539 const Register tmp3 = r12;
9540 const Register tmp4 = r13;
9541 const Register tmp5 = r14;
9542 const Register tmp6 = r15;
9543 const Register tmp7 = r16;
9544
9545 RegSet spilled_regs = RegSet::of(y, ylen);
9546 BLOCK_COMMENT("Entry:");
9547 __ enter();
9548 __ push(spilled_regs, sp);
9549 __ mov(y, x);
9550 __ mov(ylen, xlen);
9551 __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
9552 __ pop(spilled_regs, sp);
9553 __ leave();
9554 __ ret(lr);
9555
9556 // record the stub entry and end
9557 store_archive_data(stub_id, start, __ pc());
9558
9559 return start;
9560 }
9561
9562 address generate_mulAdd() {
9563 StubId stub_id = StubId::stubgen_mulAdd_id;
9564 int entry_count = StubInfo::entry_count(stub_id);
9565 assert(entry_count == 1, "sanity check");
9566 address start = load_archive_data(stub_id);
9567 if (start != nullptr) {
9568 return start;
9569 }
9570 __ align(CodeEntryAlignment);
9571 StubCodeMark mark(this, stub_id);
9572
9573 start = __ pc();
9574
9575 const Register out = r0;
9576 const Register in = r1;
9577 const Register offset = r2;
9578 const Register len = r3;
9579 const Register k = r4;
9580
9581 BLOCK_COMMENT("Entry:");
9582 __ enter();
9583 __ mul_add(out, in, offset, len, k);
9584 __ leave();
9585 __ ret(lr);
9586
9587 // record the stub entry and end
9588 store_archive_data(stub_id, start, __ pc());
9589
9590 return start;
9591 }
9592
9593 // Arguments:
9594 //
9595 // Input:
9596 // c_rarg0 - newArr address
9597 // c_rarg1 - oldArr address
9598 // c_rarg2 - newIdx
9599 // c_rarg3 - shiftCount
9600 // c_rarg4 - numIter
9601 //
9602 address generate_bigIntegerRightShift() {
9603 StubId stub_id = StubId::stubgen_bigIntegerRightShiftWorker_id;
9604 int entry_count = StubInfo::entry_count(stub_id);
9605 assert(entry_count == 1, "sanity check");
9606 address start = load_archive_data(stub_id);
9607 if (start != nullptr) {
9608 return start;
9609 }
9610 __ align(CodeEntryAlignment);
9611 StubCodeMark mark(this, stub_id);
9612 start = __ pc();
9613
9614 Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
9615
9616 Register newArr = c_rarg0;
9617 Register oldArr = c_rarg1;
9618 Register newIdx = c_rarg2;
9619 Register shiftCount = c_rarg3;
9620 Register numIter = c_rarg4;
9621 Register idx = numIter;
9622
9623 Register newArrCur = rscratch1;
9624 Register shiftRevCount = rscratch2;
9625 Register oldArrCur = r13;
9626 Register oldArrNext = r14;
9627
9628 FloatRegister oldElem0 = v0;
9629 FloatRegister oldElem1 = v1;
9630 FloatRegister newElem = v2;
9631 FloatRegister shiftVCount = v3;
9632 FloatRegister shiftVRevCount = v4;
9633
9634 __ cbz(idx, Exit);
9635
9636 __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
9637
9638 // left shift count
9639 __ movw(shiftRevCount, 32);
9640 __ subw(shiftRevCount, shiftRevCount, shiftCount);
9641
9642 // numIter too small to allow a 4-words SIMD loop, rolling back
9643 __ cmp(numIter, (u1)4);
9644 __ br(Assembler::LT, ShiftThree);
9645
9646 __ dup(shiftVCount, __ T4S, shiftCount);
9647 __ dup(shiftVRevCount, __ T4S, shiftRevCount);
9648 __ negr(shiftVCount, __ T4S, shiftVCount);
9649
9650 __ BIND(ShiftSIMDLoop);
9651
9652 // Calculate the load addresses
9653 __ sub(idx, idx, 4);
9654 __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
9655 __ add(newArrCur, newArr, idx, Assembler::LSL, 2);
9656 __ add(oldArrCur, oldArrNext, 4);
9657
9658 // Load 4 words and process
9659 __ ld1(oldElem0, __ T4S, Address(oldArrCur));
9660 __ ld1(oldElem1, __ T4S, Address(oldArrNext));
9661 __ ushl(oldElem0, __ T4S, oldElem0, shiftVCount);
9662 __ ushl(oldElem1, __ T4S, oldElem1, shiftVRevCount);
9663 __ orr(newElem, __ T16B, oldElem0, oldElem1);
9664 __ st1(newElem, __ T4S, Address(newArrCur));
9665
9666 __ cmp(idx, (u1)4);
9667 __ br(Assembler::LT, ShiftTwoLoop);
9668 __ b(ShiftSIMDLoop);
9669
9670 __ BIND(ShiftTwoLoop);
9671 __ cbz(idx, Exit);
9672 __ cmp(idx, (u1)1);
9673 __ br(Assembler::EQ, ShiftOne);
9674
9675 // Calculate the load addresses
9676 __ sub(idx, idx, 2);
9677 __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
9678 __ add(newArrCur, newArr, idx, Assembler::LSL, 2);
9679 __ add(oldArrCur, oldArrNext, 4);
9680
9681 // Load 2 words and process
9682 __ ld1(oldElem0, __ T2S, Address(oldArrCur));
9683 __ ld1(oldElem1, __ T2S, Address(oldArrNext));
9684 __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount);
9685 __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount);
9686 __ orr(newElem, __ T8B, oldElem0, oldElem1);
9687 __ st1(newElem, __ T2S, Address(newArrCur));
9688 __ b(ShiftTwoLoop);
9689
9690 __ BIND(ShiftThree);
9691 __ tbz(idx, 1, ShiftOne);
9692 __ tbz(idx, 0, ShiftTwo);
9693 __ ldrw(r10, Address(oldArr, 12));
9694 __ ldrw(r11, Address(oldArr, 8));
9695 __ lsrvw(r10, r10, shiftCount);
9696 __ lslvw(r11, r11, shiftRevCount);
9697 __ orrw(r12, r10, r11);
9698 __ strw(r12, Address(newArr, 8));
9699
9700 __ BIND(ShiftTwo);
9701 __ ldrw(r10, Address(oldArr, 8));
9702 __ ldrw(r11, Address(oldArr, 4));
9703 __ lsrvw(r10, r10, shiftCount);
9704 __ lslvw(r11, r11, shiftRevCount);
9705 __ orrw(r12, r10, r11);
9706 __ strw(r12, Address(newArr, 4));
9707
9708 __ BIND(ShiftOne);
9709 __ ldrw(r10, Address(oldArr, 4));
9710 __ ldrw(r11, Address(oldArr));
9711 __ lsrvw(r10, r10, shiftCount);
9712 __ lslvw(r11, r11, shiftRevCount);
9713 __ orrw(r12, r10, r11);
9714 __ strw(r12, Address(newArr));
9715
9716 __ BIND(Exit);
9717 __ ret(lr);
9718
9719 // record the stub entry and end
9720 store_archive_data(stub_id, start, __ pc());
9721
9722 return start;
9723 }
9724
// Generates the BigInteger left-shift worker stub.
//
// For every i in [0, numIter) the emitted code stores the 32-bit word
//   newArr[newIdx + i] = (oldArr[i] << shiftCount) | (oldArr[i + 1] >>> (32 - shiftCount))
// so it reads numIter + 1 words starting at oldArr.
//
// Returns the stub entry address. If load_archive_data() yields a non-null
// address, that address is returned and nothing is emitted.
//
9725 // Arguments:
9726 //
9727 // Input:
9728 // c_rarg0 - newArr address
9729 // c_rarg1 - oldArr address
9730 // c_rarg2 - newIdx
9731 // c_rarg3 - shiftCount
9732 // c_rarg4 - numIter
9733 //
// The emitted code overwrites c_rarg0, c_rarg1 and c_rarg4 and uses
// rscratch1, rscratch2, r10-r12 and v0-v4 as temporaries.
// NOTE(review): presumably callers guarantee 0 < shiftCount < 32; the stub
// does not check it.
9734 address generate_bigIntegerLeftShift() {
9735 StubId stub_id = StubId::stubgen_bigIntegerLeftShiftWorker_id;
9736 int entry_count = StubInfo::entry_count(stub_id);
9737 assert(entry_count == 1, "sanity check");
// Reuse an already available stub instead of generating a new one.
9738 address start = load_archive_data(stub_id);
9739 if (start != nullptr) {
9740 return start;
9741 }
9742 __ align(CodeEntryAlignment);
9743 StubCodeMark mark(this, stub_id);
9744 start = __ pc();
9745
9746 Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
9747
9748 Register newArr = c_rarg0;
9749 Register oldArr = c_rarg1;
9750 Register newIdx = c_rarg2;
9751 Register shiftCount = c_rarg3;
9752 Register numIter = c_rarg4;
9753
9754 Register shiftRevCount = rscratch1;
9755 Register oldArrNext = rscratch2;
9756
9757 FloatRegister oldElem0 = v0;
9758 FloatRegister oldElem1 = v1;
9759 FloatRegister newElem = v2;
9760 FloatRegister shiftVCount = v3;
9761 FloatRegister shiftVRevCount = v4;
9762
// Nothing to do for an empty range.
9763 __ cbz(numIter, Exit);
9764
// oldArrNext walks one word ahead of oldArr; newArr is advanced to &newArr[newIdx].
9765 __ add(oldArrNext, oldArr, 4);
9766 __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
9767
9768 // right shift count
9769 __ movw(shiftRevCount, 32);
9770 __ subw(shiftRevCount, shiftRevCount, shiftCount);
9771
9772 // numIter too small to allow a 4-words SIMD loop, rolling back
9773 __ cmp(numIter, (u1)4);
9774 __ br(Assembler::LT, ShiftThree);
9775
// Broadcast both counts to all lanes. The reverse count is negated because
// ushl shifts right when the per-lane count is negative.
9776 __ dup(shiftVCount, __ T4S, shiftCount);
9777 __ dup(shiftVRevCount, __ T4S, shiftRevCount);
9778 __ negr(shiftVRevCount, __ T4S, shiftVRevCount);
9779
9780 __ BIND(ShiftSIMDLoop);
9781
9782 // load 4 words and process
9783 __ ld1(oldElem0, __ T4S, __ post(oldArr, 16));
9784 __ ld1(oldElem1, __ T4S, __ post(oldArrNext, 16));
9785 __ ushl(oldElem0, __ T4S, oldElem0, shiftVCount);
9786 __ ushl(oldElem1, __ T4S, oldElem1, shiftVRevCount);
9787 __ orr(newElem, __ T16B, oldElem0, oldElem1);
9788 __ st1(newElem, __ T4S, __ post(newArr, 16));
9789 __ sub(numIter, numIter, 4);
9790
// Stay in the 4-word loop while at least 4 words remain.
9791 __ cmp(numIter, (u1)4);
9792 __ br(Assembler::LT, ShiftTwoLoop);
9793 __ b(ShiftSIMDLoop);
9794
// Remainder after the SIMD loop (0..3 words): two words at a time using the
// 64-bit vector forms, with a single leftover word handled by ShiftOne.
9795 __ BIND(ShiftTwoLoop);
9796 __ cbz(numIter, Exit);
9797 __ cmp(numIter, (u1)1);
9798 __ br(Assembler::EQ, ShiftOne);
9799
9800 // load 2 words and process
9801 __ ld1(oldElem0, __ T2S, __ post(oldArr, 8));
9802 __ ld1(oldElem1, __ T2S, __ post(oldArrNext, 8));
9803 __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount);
9804 __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount);
9805 __ orr(newElem, __ T8B, oldElem0, oldElem1);
9806 __ st1(newElem, __ T2S, __ post(newArr, 8));
9807 __ sub(numIter, numIter, 2);
9808 __ b(ShiftTwoLoop);
9809
// Scalar path, entered only when the initial numIter is 1, 2 or 3.
// The first word is always processed; bits 1 and 0 of numIter then select
// how many more words follow (none for 1, one for 2, two for 3).
9810 __ BIND(ShiftThree);
9811 __ ldrw(r10, __ post(oldArr, 4));
9812 __ ldrw(r11, __ post(oldArrNext, 4));
9813 __ lslvw(r10, r10, shiftCount);
9814 __ lsrvw(r11, r11, shiftRevCount);
9815 __ orrw(r12, r10, r11);
9816 __ strw(r12, __ post(newArr, 4));
9817 __ tbz(numIter, 1, Exit);
9818 __ tbz(numIter, 0, ShiftOne);
9819
9820 __ BIND(ShiftTwo);
9821 __ ldrw(r10, __ post(oldArr, 4));
9822 __ ldrw(r11, __ post(oldArrNext, 4));
9823 __ lslvw(r10, r10, shiftCount);
9824 __ lsrvw(r11, r11, shiftRevCount);
9825 __ orrw(r12, r10, r11);
9826 __ strw(r12, __ post(newArr, 4));
9827
// Last word: no post-increment needed because nothing follows.
9828 __ BIND(ShiftOne);
9829 __ ldrw(r10, Address(oldArr));
9830 __ ldrw(r11, Address(oldArrNext));
9831 __ lslvw(r10, r10, shiftCount);
9832 __ lsrvw(r11, r11, shiftRevCount);
9833 __ orrw(r12, r10, r11);
9834 __ strw(r12, Address(newArr));
9835
9836 __ BIND(Exit);
9837 __ ret(lr);
9838
9839 // record the stub entry and end
9840 store_archive_data(stub_id, start, __ pc());
9841
9842 return start;
9843 }
9844
// Generates the count_positives stub, which has two entry points.
//
// Emitted-code registers: ary1 = r1 (byte array address), len = r2 (length in
// bytes), result = r0. On entry result must already hold a copy of len.
// A byte counts as "negative" when its top bit (0x80) is set.
//
// Main entry (returned): handles len <= 15 itself and branches into the long
// path otherwise. In the short paths result is set to 0 if a negative byte is
// found and is left unchanged otherwise.
// Second entry (stored into count_positives_long): the long path only. On a
// hit it returns result - len, where len is the remaining length adjusted
// back to the start of the chunk in which the hit was detected.
//
// If load_archive_data() yields a non-null address, that address and the
// archived extra entry are returned and nothing is emitted.
9845 address generate_count_positives(address &count_positives_long) {
9846 StubId stub_id = StubId::stubgen_count_positives_id;
9847 GrowableArray<address> entries;
9848 int entry_count = StubInfo::entry_count(stub_id);
9849 // We have an extra entry for count_positives_long.
9850 assert(entry_count == 2, "sanity check");
9851 address start = load_archive_data(stub_id, &entries);
9852 if (start != nullptr) {
9853 assert(entries.length() == 1,
9854 "unexpected extra entry count %d", entries.length());
9855 count_positives_long = entries.at(0);
9856 return start;
9857 }
9858 const u1 large_loop_size = 64;
// One 0x80 bit per byte: a non-zero AND means some byte is negative.
9859 const uint64_t UPPER_BIT_MASK=0x8080808080808080;
9860 int dcache_line = VM_Version::dcache_line_size();
9861
9862 Register ary1 = r1, len = r2, result = r0;
9863
9864 __ align(CodeEntryAlignment);
9865 StubCodeMark mark(this, stub_id);
9866
9867 address entry = __ pc();
9868
9869 __ enter();
9870 // precondition: a copy of len is already in result
9871 // __ mov(result, len);
9872
9873 Label RET_ADJUST, RET_ADJUST_16, RET_ADJUST_LONG, RET_NO_POP, RET_LEN, ALIGNED, LOOP16, CHECK_16,
9874 LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
9875
9876 __ cmp(len, (u1)15);
9877 __ br(Assembler::GT, LEN_OVER_15);
9878 // The only case when execution falls into this code is when pointer is near
9879 // the end of memory page and we have to avoid reading next page
// Point ary1 at the end of the array so that loads can end exactly there.
9880 __ add(ary1, ary1, len);
9881 __ subs(len, len, 8);
9882 __ br(Assembler::GT, LEN_OVER_8);
// len <= 8: load the 8 bytes that end at the array end, then shift out the
// low-order bytes that lie before the array start (len is now <= 0).
9883 __ ldr(rscratch2, Address(ary1, -8));
9884 __ sub(rscratch1, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes.
9885 __ lsrv(rscratch2, rscratch2, rscratch1);
9886 __ tst(rscratch2, UPPER_BIT_MASK);
9887 __ csel(result, zr, result, Assembler::NE);
9888 __ leave();
9889 __ ret(lr);
// 9 <= len <= 15: load the 16 bytes that end at the array end. The upper
// word lies fully inside the array; the lower word needs the same shift trick.
9890 __ bind(LEN_OVER_8);
9891 __ ldp(rscratch1, rscratch2, Address(ary1, -16));
9892 __ sub(len, len, 8); // no data dep., then sub can be executed while loading
9893 __ tst(rscratch2, UPPER_BIT_MASK);
9894 __ br(Assembler::NE, RET_NO_POP);
9895 __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
9896 __ lsrv(rscratch1, rscratch1, rscratch2);
9897 __ tst(rscratch1, UPPER_BIT_MASK);
9898 __ bind(RET_NO_POP);
9899 __ csel(result, zr, result, Assembler::NE);
9900 __ leave();
9901 __ ret(lr);
9902
9903 Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
9904 const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
9905
9906 count_positives_long = __ pc(); // 2nd entry point
9907 entries.append(count_positives_long);
9908
9909 __ enter();
9910
// Reached either through the second entry point or by the branch from the
// main entry; in both cases a frame has been set up by enter().
9911 __ bind(LEN_OVER_15);
9912 __ push(spilled_regs, sp);
9913 __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
9914 __ cbz(rscratch2, ALIGNED);
// Unaligned start: check the first 16 bytes, then advance ary1 to the next
// 16-byte boundary (some bytes are checked again by the aligned code).
9915 __ ldp(tmp6, tmp1, Address(ary1));
9916 __ mov(tmp5, 16);
9917 __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address
9918 __ add(ary1, ary1, rscratch1);
9919 __ orr(tmp6, tmp6, tmp1);
9920 __ tst(tmp6, UPPER_BIT_MASK);
9921 __ br(Assembler::NE, RET_ADJUST);
9922 __ sub(len, len, rscratch1);
9923
9924 __ bind(ALIGNED);
9925 __ cmp(len, large_loop_size);
9926 __ br(Assembler::LT, CHECK_16);
9927 // Perform 16-byte load as early return in pre-loop to handle situation
9928 // when initially aligned large array has negative values at starting bytes,
9929 // so LARGE_LOOP would do 4 reads instead of 1 (in worst case), which is
9930 // slower. Cases with negative bytes further ahead won't be affected that
9931 // much. In fact, it'll be faster due to early loads, less instructions and
9932 // less branches in LARGE_LOOP.
9933 __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
9934 __ sub(len, len, 16);
9935 __ orr(tmp6, tmp6, tmp1);
9936 __ tst(tmp6, UPPER_BIT_MASK);
9937 __ br(Assembler::NE, RET_ADJUST_16);
9938 __ cmp(len, large_loop_size);
9939 __ br(Assembler::LT, CHECK_16);
9940
9941 if (SoftwarePrefetchHintDistance >= 0
9942 && SoftwarePrefetchHintDistance >= dcache_line) {
9943 // initial prefetch
9944 __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
9945 }
9946 __ bind(LARGE_LOOP);
9947 if (SoftwarePrefetchHintDistance >= 0) {
9948 __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
9949 }
9950 // Issue load instructions first, since it can save few CPU/MEM cycles, also
9951 // instead of 4 triples of "orr(...), addr(...);cbnz(...);" (for each ldp)
9952 // better generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...) which saves 3
9953 // instructions per cycle and have less branches, but this approach disables
9954 // early return, thus, all 64 bytes are loaded and checked every time.
9955 __ ldp(tmp2, tmp3, Address(ary1));
9956 __ ldp(tmp4, tmp5, Address(ary1, 16));
9957 __ ldp(rscratch1, rscratch2, Address(ary1, 32));
9958 __ ldp(tmp6, tmp1, Address(ary1, 48));
9959 __ add(ary1, ary1, large_loop_size);
9960 __ sub(len, len, large_loop_size);
// OR-reduce the eight loaded words into tmp2 so a single test covers 64 bytes.
9961 __ orr(tmp2, tmp2, tmp3);
9962 __ orr(tmp4, tmp4, tmp5);
9963 __ orr(rscratch1, rscratch1, rscratch2);
9964 __ orr(tmp6, tmp6, tmp1);
9965 __ orr(tmp2, tmp2, tmp4);
9966 __ orr(rscratch1, rscratch1, tmp6);
9967 __ orr(tmp2, tmp2, rscratch1);
9968 __ tst(tmp2, UPPER_BIT_MASK);
9969 __ br(Assembler::NE, RET_ADJUST_LONG);
9970 __ cmp(len, large_loop_size);
9971 __ br(Assembler::GE, LARGE_LOOP);
9972
9973 __ bind(CHECK_16); // small 16-byte load pre-loop
9974 __ cmp(len, (u1)16);
9975 __ br(Assembler::LT, POST_LOOP16);
9976
9977 __ bind(LOOP16); // small 16-byte load loop
9978 __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
9979 __ sub(len, len, 16);
9980 __ orr(tmp2, tmp2, tmp3);
9981 __ tst(tmp2, UPPER_BIT_MASK);
9982 __ br(Assembler::NE, RET_ADJUST_16);
9983 __ cmp(len, (u1)16);
9984 __ br(Assembler::GE, LOOP16); // 16-byte load loop end
9985
9986 __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
9987 __ cmp(len, (u1)8);
9988 __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
9989 __ ldr(tmp3, Address(__ post(ary1, 8)));
9990 __ tst(tmp3, UPPER_BIT_MASK);
9991 __ br(Assembler::NE, RET_ADJUST);
9992 __ sub(len, len, 8);
9993
// 0..8 bytes remain: load a full word and shift left by (64 - 8 * len) bits
// so that only the len bytes belonging to the array are tested.
9994 __ bind(POST_LOOP16_LOAD_TAIL);
9995 __ cbz(len, RET_LEN); // Can't shift left by 64 when len==0
9996 __ ldr(tmp1, Address(ary1));
9997 __ mov(tmp2, 64);
9998 __ sub(tmp4, tmp2, len, __ LSL, 3);
9999 __ lslv(tmp1, tmp1, tmp4);
10000 __ tst(tmp1, UPPER_BIT_MASK);
10001 __ br(Assembler::NE, RET_ADJUST);
10002 // Fallthrough
10003
// No negative byte found: result is returned unchanged.
10004 __ bind(RET_LEN);
10005 __ pop(spilled_regs, sp);
10006 __ leave();
10007 __ ret(lr);
10008
10009 // difference result - len is the count of guaranteed to be
10010 // positive bytes
10011
// The three labels fall through into each other: RET_ADJUST_LONG restores the
// 64 bytes subtracted by LARGE_LOOP (48 here + 16 below), RET_ADJUST_16
// restores the 16 bytes subtracted by a 16-byte step.
10012 __ bind(RET_ADJUST_LONG);
10013 __ add(len, len, (u1)(large_loop_size - 16));
10014 __ bind(RET_ADJUST_16);
10015 __ add(len, len, 16);
10016 __ bind(RET_ADJUST);
10017 __ pop(spilled_regs, sp);
10018 __ leave();
10019 __ sub(result, result, len);
10020 __ ret(lr);
10021
10022 // record the stub entry and end plus the extra entry
10023 store_archive_data(stub_id, entry, __ pc(), &entries);
10024
10025 return entry;
10026 }
10027
// Emits the general-purpose-register main loop of the large array equals stub.
//
// Each loop iteration compares 8 words from a1 against 8 words from a2 using
// ldp pairs and branches to NOT_EQUAL on the first mismatch. The loop is
// software-pipelined: two words per array are loaded before the loop, and the
// loads for the next pair are issued while the previous pair is compared.
// After the loop the last preloaded pair is compared and cnt1 is reduced by
// 2 * wordSize to account for it.
//
// loopThreshold - the loop repeats while cnt1 >= loopThreshold
// usePrefetch   - emit prfm at SoftwarePrefetchHintDistance for both arrays
// NOT_EQUAL     - branch target taken when a difference is found
//
// Uses tmp1..tmp8 (rscratch1, rscratch2, r3, r4, r5, r11, r12, r13) and
// advances a1 (r1), a2 (r2) and cnt1 (r10).
10028 void generate_large_array_equals_loop_nonsimd(int loopThreshold,
10029 bool usePrefetch, Label &NOT_EQUAL) {
10030 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
10031 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
10032 tmp7 = r12, tmp8 = r13;
10033 Label LOOP;
10034
// Pipeline prologue: preload the first pair from each array.
10035 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
10036 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
10037 __ bind(LOOP);
10038 if (usePrefetch) {
10039 __ prfm(Address(a1, SoftwarePrefetchHintDistance));
10040 __ prfm(Address(a2, SoftwarePrefetchHintDistance));
10041 }
// Compare pattern used throughout: XOR corresponding words, OR the two
// results together, and branch if anything is non-zero.
10042 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
10043 __ eor(tmp1, tmp1, tmp2);
10044 __ eor(tmp3, tmp3, tmp4);
10045 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
10046 __ orr(tmp1, tmp1, tmp3);
10047 __ cbnz(tmp1, NOT_EQUAL);
10048 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
10049 __ eor(tmp5, tmp5, tmp6);
10050 __ eor(tmp7, tmp7, tmp8);
10051 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
10052 __ orr(tmp5, tmp5, tmp7);
10053 __ cbnz(tmp5, NOT_EQUAL);
10054 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
10055 __ eor(tmp1, tmp1, tmp2);
10056 __ eor(tmp3, tmp3, tmp4);
10057 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
10058 __ orr(tmp1, tmp1, tmp3);
10059 __ cbnz(tmp1, NOT_EQUAL);
10060 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
10061 __ eor(tmp5, tmp5, tmp6);
10062 __ sub(cnt1, cnt1, 8 * wordSize);
10063 __ eor(tmp7, tmp7, tmp8);
10064 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
10065 // tmp6 is not used. MacroAssembler::subs is used here (rather than
10066 // cmp) because subs allows an unlimited range of immediate operand.
// The flags set by this subs are consumed by the br(GE) below; the orr and
// cbnz in between are expected to leave them untouched.
10067 __ subs(tmp6, cnt1, loopThreshold);
10068 __ orr(tmp5, tmp5, tmp7);
10069 __ cbnz(tmp5, NOT_EQUAL);
10070 __ br(__ GE, LOOP);
10071 // post-loop
10072 __ eor(tmp1, tmp1, tmp2);
10073 __ eor(tmp3, tmp3, tmp4);
10074 __ orr(tmp1, tmp1, tmp3);
10075 __ sub(cnt1, cnt1, 2 * wordSize);
10076 __ cbnz(tmp1, NOT_EQUAL);
10077 }
10078
// Emits the SIMD main loop of the large array equals stub.
//
// Each iteration loads 8 words from a1 into v0-v3 and 8 words from a2 into
// v4-v7, XORs them lane-wise, OR-reduces the result into v0 and branches to
// NOT_EQUAL if any bit is set. cnt1 is reduced by 8 * wordSize per iteration.
//
// loopThreshold - the loop repeats while cnt1 >= loopThreshold
// usePrefetch   - emit prfm at SoftwarePrefetchHintDistance for both arrays
// NOT_EQUAL     - branch target taken when a difference is found
//
// Uses v0-v7, rscratch1 and rscratch2; advances a1 (r1), a2 (r2) and cnt1 (r10).
10079 void generate_large_array_equals_loop_simd(int loopThreshold,
10080 bool usePrefetch, Label &NOT_EQUAL) {
10081 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
10082 tmp2 = rscratch2;
10083 Label LOOP;
10084
10085 __ bind(LOOP);
10086 if (usePrefetch) {
10087 __ prfm(Address(a1, SoftwarePrefetchHintDistance));
10088 __ prfm(Address(a2, SoftwarePrefetchHintDistance));
10089 }
10090 __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
10091 __ sub(cnt1, cnt1, 8 * wordSize);
10092 __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
// Only the flags of this subs matter (consumed by the final br(GE));
// tmp1 is overwritten by the umov below.
10093 __ subs(tmp1, cnt1, loopThreshold);
10094 __ eor(v0, __ T16B, v0, v4);
10095 __ eor(v1, __ T16B, v1, v5);
10096 __ eor(v2, __ T16B, v2, v6);
10097 __ eor(v3, __ T16B, v3, v7);
10098 __ orr(v0, __ T16B, v0, v1);
10099 __ orr(v1, __ T16B, v2, v3);
10100 __ orr(v0, __ T16B, v0, v1);
// Move both 64-bit halves of the reduced vector to general registers to test them.
10101 __ umov(tmp1, v0, __ D, 0);
10102 __ umov(tmp2, v0, __ D, 1);
10103 __ orr(tmp1, tmp1, tmp2);
10104 __ cbnz(tmp1, NOT_EQUAL);
10105 __ br(__ GE, LOOP);
10106 }
10107
// Generates the large array equals stub and returns its entry address.
//
// The emitted code compares the remaining bytes of two arrays word by word.
// It leaves result untouched ("false") when a difference is found and sets it
// to true otherwise. Either the SIMD or the general-purpose-register main
// loop is emitted, depending on UseSIMDForArrayEquals; a prefetching variant
// of the loop is emitted in addition when SoftwarePrefetchHintDistance >= 0.
//
// If load_archive_data() yields a non-null address, that address is returned
// and nothing is emitted.
//
10108 // a1 = r1 - array1 address
10109 // a2 = r2 - array2 address
10110 // result = r0 - return value. Already contains "false"
10111 // cnt1 = r10 - amount of elements left to check, reduced by wordSize
10112 // r3-r5 are reserved temporary registers
10113 // Clobbers: v0-v7 when UseSIMDForArrayEquals, rscratch1, rscratch2
10114 address generate_large_array_equals() {
10115 StubId stub_id = StubId::stubgen_large_array_equals_id;
10116 int entry_count = StubInfo::entry_count(stub_id);
10117 assert(entry_count == 1, "sanity check");
10118 address start = load_archive_data(stub_id);
10119 if (start != nullptr) {
10120 return start;
10121 }
10122 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
10123 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
10124 tmp7 = r12, tmp8 = r13;
10125 Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
10126 SMALL_LOOP, POST_LOOP;
// The non-SIMD loop preloads 16 bytes per array before its first iteration,
// so its threshold is raised by that amount.
10127 const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
10128 // calculate if at least 32 prefetched bytes are used
10129 int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
10130 int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
// tmp6..tmp8 (r11-r13) are saved and restored around the non-SIMD loop only.
10131 RegSet spilled_regs = RegSet::range(tmp6, tmp8);
10132 assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
10133 tmp5, tmp6, tmp7, tmp8);
10134
10135 __ align(CodeEntryAlignment);
10136
10137 StubCodeMark mark(this, stub_id);
10138
10139 address entry = __ pc();
10140 __ enter();
10141 __ sub(cnt1, cnt1, wordSize); // first 8 bytes were loaded outside of stub
10142 // also advance pointers to use post-increment instead of pre-increment
10143 __ add(a1, a1, wordSize);
10144 __ add(a2, a2, wordSize);
10145 if (AvoidUnalignedAccesses) {
10146 // both implementations (SIMD/nonSIMD) are using relatively large load
10147 // instructions (ld1/ldp), which has huge penalty (up to x2 exec time)
10148 // on some CPUs in case of address is not at least 16-byte aligned.
10149 // Arrays are 8-byte aligned currently, so, we can make additional 8-byte
10150 // load if needed at least for 1st address and make it 16-byte aligned.
10151 Label ALIGNED16;
10152 __ tbz(a1, 3, ALIGNED16);
10153 __ ldr(tmp1, Address(__ post(a1, wordSize)));
10154 __ ldr(tmp2, Address(__ post(a2, wordSize)));
10155 __ sub(cnt1, cnt1, wordSize);
10156 __ eor(tmp1, tmp1, tmp2);
// Nothing has been pushed yet, so a mismatch here must skip the pop.
10157 __ cbnz(tmp1, NOT_EQUAL_NO_POP);
10158 __ bind(ALIGNED16);
10159 }
10160 if (UseSIMDForArrayEquals) {
10161 if (SoftwarePrefetchHintDistance >= 0) {
10162 __ subs(tmp1, cnt1, prefetchLoopThreshold);
10163 __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
10164 generate_large_array_equals_loop_simd(prefetchLoopThreshold,
10165 /* prfm = */ true, NOT_EQUAL);
10166 __ subs(zr, cnt1, nonPrefetchLoopThreshold);
10167 __ br(__ LT, TAIL);
10168 }
10169 __ bind(NO_PREFETCH_LARGE_LOOP);
10170 generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
10171 /* prfm = */ false, NOT_EQUAL);
10172 } else {
10173 __ push(spilled_regs, sp);
10174 if (SoftwarePrefetchHintDistance >= 0) {
10175 __ subs(tmp1, cnt1, prefetchLoopThreshold);
10176 __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
10177 generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
10178 /* prfm = */ true, NOT_EQUAL);
10179 __ subs(zr, cnt1, nonPrefetchLoopThreshold);
10180 __ br(__ LT, TAIL);
10181 }
10182 __ bind(NO_PREFETCH_LARGE_LOOP);
10183 generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
10184 /* prfm = */ false, NOT_EQUAL);
10185 }
// Tail: compare the remaining words one at a time.
10186 __ bind(TAIL);
10187 __ cbz(cnt1, EQUAL);
10188 __ subs(cnt1, cnt1, wordSize);
10189 __ br(__ LE, POST_LOOP);
10190 __ bind(SMALL_LOOP);
10191 __ ldr(tmp1, Address(__ post(a1, wordSize)));
10192 __ ldr(tmp2, Address(__ post(a2, wordSize)));
10193 __ subs(cnt1, cnt1, wordSize);
10194 __ eor(tmp1, tmp1, tmp2);
10195 __ cbnz(tmp1, NOT_EQUAL);
10196 __ br(__ GT, SMALL_LOOP);
// Final word: cnt1 is <= 0 here and is used as an offset from a1/a2, so the
// load may overlap bytes that were already compared.
10197 __ bind(POST_LOOP);
10198 __ ldr(tmp1, Address(a1, cnt1));
10199 __ ldr(tmp2, Address(a2, cnt1));
10200 __ eor(tmp1, tmp1, tmp2);
10201 __ cbnz(tmp1, NOT_EQUAL);
10202 __ bind(EQUAL);
10203 __ mov(result, true);
10204 __ bind(NOT_EQUAL);
10205 if (!UseSIMDForArrayEquals) {
10206 __ pop(spilled_regs, sp);
10207 }
10208 __ bind(NOT_EQUAL_NO_POP);
10209 __ leave();
10210 __ ret(lr);
10211
10212 // record the stub entry and end
10213 store_archive_data(stub_id, entry, __ pc());
10214
10215 return entry;
10216 }
10217
// Generates the vectorized array hash code stub for the given element type
// (T_BOOLEAN, T_BYTE, T_CHAR, T_SHORT or T_INT) and returns its entry address.
//
// The emitted code folds cnt elements into result using the multiplier 31
// (result = 31 * result + element per element in the scalar tail). Structure:
//   SMALL_LOOP - up to uf - 1 single-vector steps of vf elements each
//   LARGE_LOOP - blocks of evf = vf * uf elements using four accumulators
//   TAIL       - the remaining cnt % vf elements, one at a time
//
// If load_archive_data() yields a non-null address, that address is returned
// and nothing is emitted.
//
10218 // result = r0 - return value. Contains initial hashcode value on entry.
10219 // ary = r1 - array address
10220 // cnt = r2 - elements count
10221 // Clobbers: v0-v13, rscratch1, rscratch2
10222 address generate_large_arrays_hashcode(BasicType eltype) {
10223 StubId stub_id;
10224 switch (eltype) {
10225 case T_BOOLEAN:
10226 stub_id = StubId::stubgen_large_arrays_hashcode_boolean_id;
10227 break;
10228 case T_BYTE:
10229 stub_id = StubId::stubgen_large_arrays_hashcode_byte_id;
10230 break;
10231 case T_CHAR:
10232 stub_id = StubId::stubgen_large_arrays_hashcode_char_id;
10233 break;
10234 case T_SHORT:
10235 stub_id = StubId::stubgen_large_arrays_hashcode_short_id;
10236 break;
10237 case T_INT:
10238 stub_id = StubId::stubgen_large_arrays_hashcode_int_id;
10239 break;
10240 default:
10241 stub_id = StubId::NO_STUBID;
10242 ShouldNotReachHere();
10243 };
10244 int entry_count = StubInfo::entry_count(stub_id);
10245 assert(entry_count == 1, "sanity check");
10246 address start = load_archive_data(stub_id);
10247 if (start != nullptr) {
10248 return start;
10249 }
10250 const Register result = r0, ary = r1, cnt = r2;
// Data registers are numbered in reverse (vdata3 = v0 ... vdata0 = v3) so the
// four-register ld1 in the large loop can name them as vdata3..vdata0.
10251 const FloatRegister vdata0 = v3, vdata1 = v2, vdata2 = v1, vdata3 = v0;
10252 const FloatRegister vmul0 = v4, vmul1 = v5, vmul2 = v6, vmul3 = v7;
10253 const FloatRegister vpow = v12; // powers of 31: <31^3, ..., 31^0>
10254 const FloatRegister vpowm = v13;
10255
// NOTE(review): macro defined elsewhere; presumably declares further register
// names used with this stub - confirm against its definition.
10256 ARRAYS_HASHCODE_REGISTERS;
10257
10258 Label SMALL_LOOP, LARGE_LOOP_PREHEADER, LARGE_LOOP, TAIL, TAIL_SHORTCUT, BR_BASE;
10259
10260 unsigned int vf; // vectorization factor
10261 bool multiply_by_halves;
10262 Assembler::SIMD_Arrangement load_arrangement;
10263 switch (eltype) {
10264 case T_BOOLEAN:
10265 case T_BYTE:
10266 load_arrangement = Assembler::T8B;
10267 multiply_by_halves = true;
10268 vf = 8;
10269 break;
10270 case T_CHAR:
10271 case T_SHORT:
10272 load_arrangement = Assembler::T8H;
10273 multiply_by_halves = true;
10274 vf = 8;
10275 break;
10276 case T_INT:
10277 load_arrangement = Assembler::T4S;
10278 multiply_by_halves = false;
10279 vf = 4;
10280 break;
10281 default:
10282 ShouldNotReachHere();
10283 }
10284
10285 // Unroll factor
10286 const unsigned uf = 4;
10287
10288 // Effective vectorization factor
10289 const unsigned evf = vf * uf;
10290
10291 __ align(CodeEntryAlignment);
10292
10293 StubCodeMark mark(this, stub_id);
10294
10295 address entry = __ pc();
10296 __ enter();
10297
10298 // Put 0-3'th powers of 31 into a single SIMD register together. The register will be used in
10299 // the SMALL and LARGE LOOPS' epilogues. The initialization is hoisted here and the register's
10300 // value shouldn't change throughout both loops.
10301 __ movw(rscratch1, intpow(31U, 3));
10302 __ mov(vpow, Assembler::S, 0, rscratch1);
10303 __ movw(rscratch1, intpow(31U, 2));
10304 __ mov(vpow, Assembler::S, 1, rscratch1);
10305 __ movw(rscratch1, intpow(31U, 1));
10306 __ mov(vpow, Assembler::S, 2, rscratch1);
10307 __ movw(rscratch1, intpow(31U, 0));
10308 __ mov(vpow, Assembler::S, 3, rscratch1);
10309
// Seed the accumulator: zero everywhere except lane 3, which receives the
// incoming hash value.
10310 __ mov(vmul0, Assembler::T16B, 0);
10311 __ mov(vmul0, Assembler::S, 3, result);
10312
// rscratch2 = number of elements for the small loop (a multiple of vf, at
// most (uf - 1) * vf). Skip the small loop when it is zero.
10313 __ andr(rscratch2, cnt, (uf - 1) * vf);
10314 __ cbz(rscratch2, LARGE_LOOP_PREHEADER);
10315
// Per-step multiplier for the small loop: 31^(vf/2) when a vector is
// accumulated in two halves, 31^vf otherwise.
10316 __ movw(rscratch1, intpow(31U, multiply_by_halves ? vf / 2 : vf));
10317 __ mov(vpowm, Assembler::S, 0, rscratch1);
10318
10319 // SMALL LOOP
10320 __ bind(SMALL_LOOP);
10321
10322 __ ld1(vdata0, load_arrangement, Address(__ post(ary, vf * type2aelembytes(eltype))));
10323 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
// The flags set here are consumed by the br(HI) that closes the loop.
10324 __ subsw(rscratch2, rscratch2, vf);
10325
10326 if (load_arrangement == Assembler::T8B) {
10327 // Extend 8B to 8H to be able to use vector multiply
10328 // instructions
10329 assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H");
10330 if (is_signed_subword_type(eltype)) {
10331 __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
10332 } else {
10333 __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
10334 }
10335 }
10336
10337 switch (load_arrangement) {
10338 case Assembler::T4S:
10339 __ addv(vmul0, load_arrangement, vmul0, vdata0);
10340 break;
10341 case Assembler::T8B:
10342 case Assembler::T8H:
10343 assert(is_subword_type(eltype), "subword type expected");
10344 if (is_signed_subword_type(eltype)) {
10345 __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
10346 } else {
10347 __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
10348 }
10349 break;
10350 default:
10351 __ should_not_reach_here();
10352 }
10353
10354 // Process the upper half of a vector
10355 if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) {
10356 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
10357 if (is_signed_subword_type(eltype)) {
10358 __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
10359 } else {
10360 __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
10361 }
10362 }
10363
10364 __ br(Assembler::HI, SMALL_LOOP);
10365
10366 // SMALL LOOP'S EPILOGUE
// If at least one full block of evf elements exists, continue with the large
// loop and keep vmul0 as its first accumulator.
10367 __ lsr(rscratch2, cnt, exact_log2(evf));
10368 __ cbnz(rscratch2, LARGE_LOOP_PREHEADER);
10369
// Otherwise fold the four lanes: weight them by <31^3, ..., 31^0> and sum.
10370 __ mulv(vmul0, Assembler::T4S, vmul0, vpow);
10371 __ addv(vmul0, Assembler::T4S, vmul0);
10372 __ umov(result, vmul0, Assembler::S, 0);
10373
10374 // TAIL
10375 __ bind(TAIL);
10376
10377 // The andr performs cnt % vf. The subtract shifted by 3 offsets past vf - 1 - (cnt % vf) pairs
10378 // of load + madd insns i.e. it only executes cnt % vf load + madd pairs.
10379 assert(is_power_of_2(vf), "can't use this value to calculate the jump target PC");
10380 __ andr(rscratch2, cnt, vf - 1);
10381 __ bind(TAIL_SHORTCUT);
10382 __ adr(rscratch1, BR_BASE);
10383 // For Cortex-A53 offset is 4 because 2 nops are generated.
10384 __ sub(rscratch1, rscratch1, rscratch2, ext::uxtw, VM_Version::supports_a53mac() ? 4 : 3);
// rscratch2 is reused as the scalar multiplier 31 for the unrolled tail.
10385 __ movw(rscratch2, 0x1f);
10386 __ br(rscratch1);
10387
// Unrolled tail: vf - 1 load + multiply-add pairs of fixed size. The computed
// branch above enters this sequence so that exactly cnt % vf pairs run.
10388 for (size_t i = 0; i < vf - 1; ++i) {
10389 __ load(rscratch1, Address(__ post(ary, type2aelembytes(eltype))),
10390 eltype);
10391 __ maddw(result, result, rscratch2, rscratch1);
10392 // maddw generates an extra nop for Cortex-A53 (see maddw definition in macroAssembler).
10393 // Generate 2nd nop to have 4 instructions per iteration.
10394 if (VM_Version::supports_a53mac()) {
10395 __ nop();
10396 }
10397 }
10398 __ bind(BR_BASE);
10399
10400 __ leave();
10401 __ ret(lr);
10402
10403 // LARGE LOOP
10404 __ bind(LARGE_LOOP_PREHEADER);
10405
// rscratch2 = number of full evf-element blocks to process.
10406 __ lsr(rscratch2, cnt, exact_log2(evf));
10407
10408 if (multiply_by_halves) {
10409 // 31^4 - multiplier between lower and upper parts of a register
10410 __ movw(rscratch1, intpow(31U, vf / 2));
10411 __ mov(vpowm, Assembler::S, 1, rscratch1);
10412 // 31^28 - remainder of the iteration multiplier, 28 = 32 - 4
10413 __ movw(rscratch1, intpow(31U, evf - vf / 2));
10414 __ mov(vpowm, Assembler::S, 0, rscratch1);
10415 } else {
10416 // 31^16
10417 __ movw(rscratch1, intpow(31U, evf));
10418 __ mov(vpowm, Assembler::S, 0, rscratch1);
10419 }
10420
// vmul0 carries the value accumulated so far; the other three start at zero.
10421 __ mov(vmul3, Assembler::T16B, 0);
10422 __ mov(vmul2, Assembler::T16B, 0);
10423 __ mov(vmul1, Assembler::T16B, 0);
10424
10425 __ bind(LARGE_LOOP);
10426
10427 __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 0);
10428 __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 0);
10429 __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 0);
10430 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
10431
10432 __ ld1(vdata3, vdata2, vdata1, vdata0, load_arrangement,
10433 Address(__ post(ary, evf * type2aelembytes(eltype))));
10434
10435 if (load_arrangement == Assembler::T8B) {
10436 // Extend 8B to 8H to be able to use vector multiply
10437 // instructions
10438 assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H");
10439 if (is_signed_subword_type(eltype)) {
10440 __ sxtl(vdata3, Assembler::T8H, vdata3, load_arrangement);
10441 __ sxtl(vdata2, Assembler::T8H, vdata2, load_arrangement);
10442 __ sxtl(vdata1, Assembler::T8H, vdata1, load_arrangement);
10443 __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
10444 } else {
10445 __ uxtl(vdata3, Assembler::T8H, vdata3, load_arrangement);
10446 __ uxtl(vdata2, Assembler::T8H, vdata2, load_arrangement);
10447 __ uxtl(vdata1, Assembler::T8H, vdata1, load_arrangement);
10448 __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
10449 }
10450 }
10451
10452 switch (load_arrangement) {
10453 case Assembler::T4S:
10454 __ addv(vmul3, load_arrangement, vmul3, vdata3);
10455 __ addv(vmul2, load_arrangement, vmul2, vdata2);
10456 __ addv(vmul1, load_arrangement, vmul1, vdata1);
10457 __ addv(vmul0, load_arrangement, vmul0, vdata0);
10458 break;
10459 case Assembler::T8B:
10460 case Assembler::T8H:
10461 assert(is_subword_type(eltype), "subword type expected");
10462 if (is_signed_subword_type(eltype)) {
10463 __ saddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H);
10464 __ saddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H);
10465 __ saddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H);
10466 __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
10467 } else {
10468 __ uaddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H);
10469 __ uaddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H);
10470 __ uaddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H);
10471 __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
10472 }
10473 break;
10474 default:
10475 __ should_not_reach_here();
10476 }
10477
10478 // Process the upper half of a vector
10479 if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) {
10480 __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 1);
10481 __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 1);
10482 __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 1);
10483 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 1);
10484 if (is_signed_subword_type(eltype)) {
10485 __ saddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H);
10486 __ saddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H);
10487 __ saddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H);
10488 __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
10489 } else {
10490 __ uaddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H);
10491 __ uaddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H);
10492 __ uaddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H);
10493 __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
10494 }
10495 }
10496
10497 __ subsw(rscratch2, rscratch2, 1);
10498 __ br(Assembler::HI, LARGE_LOOP);
10499
// LARGE LOOP'S EPILOGUE: fold each accumulator to a scalar (weight lanes by
// vpow, then sum) and chain them with result = result * 31^vf + partial,
// in the order vmul3, vmul2, vmul1, vmul0.
10500 __ mulv(vmul3, Assembler::T4S, vmul3, vpow);
10501 __ addv(vmul3, Assembler::T4S, vmul3);
10502 __ umov(result, vmul3, Assembler::S, 0);
10503
10504 __ mov(rscratch2, intpow(31U, vf));
10505
10506 __ mulv(vmul2, Assembler::T4S, vmul2, vpow);
10507 __ addv(vmul2, Assembler::T4S, vmul2);
10508 __ umov(rscratch1, vmul2, Assembler::S, 0);
10509 __ maddw(result, result, rscratch2, rscratch1);
10510
10511 __ mulv(vmul1, Assembler::T4S, vmul1, vpow);
10512 __ addv(vmul1, Assembler::T4S, vmul1);
10513 __ umov(rscratch1, vmul1, Assembler::S, 0);
10514 __ maddw(result, result, rscratch2, rscratch1);
10515
10516 __ mulv(vmul0, Assembler::T4S, vmul0, vpow);
10517 __ addv(vmul0, Assembler::T4S, vmul0);
10518 __ umov(rscratch1, vmul0, Assembler::S, 0);
10519 __ maddw(result, result, rscratch2, rscratch1);
10520
// Leftover elements (cnt % vf) are handled by the scalar tail; TAIL_SHORTCUT
// is used because rscratch2 already holds cnt % vf.
10521 __ andr(rscratch2, cnt, vf - 1);
10522 __ cbnz(rscratch2, TAIL_SHORTCUT);
10523
10524 __ leave();
10525 __ ret(lr);
10526
10527 // record the stub entry and end
10528 store_archive_data(stub_id, entry, __ pc());
10529
10530 return entry;
10531 }
10532
// Generates the sine stub (isCos == false) or the cosine stub (isCos == true)
// and returns its entry address.
//
// The instruction sequence itself is produced by
// MacroAssembler::generate_dsin_dcos(), which is handed the addresses of the
// constant tables in StubRoutines::aarch64. If load_archive_data() yields a
// non-null address, that address is returned and nothing is emitted.
10533 address generate_dsin_dcos(bool isCos) {
10534 StubId stub_id = (isCos ? StubId::stubgen_dcos_id : StubId::stubgen_dsin_id);
10535 int entry_count = StubInfo::entry_count(stub_id);
10536 assert(entry_count == 1, "sanity check");
10537 address start = load_archive_data(stub_id);
10538 if (start != nullptr) {
10539 return start;
10540 }
10541 __ align(CodeEntryAlignment);
10542 StubCodeMark mark(this, stub_id);
10543 start = __ pc();
10544 __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw,
10545 (address)StubRoutines::aarch64::_two_over_pi,
10546 (address)StubRoutines::aarch64::_pio2,
10547 (address)StubRoutines::aarch64::_dsin_coef,
10548 (address)StubRoutines::aarch64::_dcos_coef);
10549
10550 // record the stub entry and end
10551 store_archive_data(stub_id, start, __ pc());
10552
10553 return start;
10554 }
10555
10556 // code for comparing 16 characters of strings with Latin1 and Utf16 encoding
//
// Emits one 16-character comparison step. Besides its arguments the helper
// works on fixed registers:
//   tmp2 (r11)  - pointer to the Latin1 data, post-incremented by 16 bytes
//   cnt1 (r2)   - pointer to the UTF-16 data, post-incremented by 4 x 8 bytes
//   tmp3 (r12)  - on entry: the UTF-16 word used by the first comparison;
//                 on exit: the last word loaded here, not compared yet
//   vtmpZ (v0)  - interleaved with the Latin1 bytes by zip1/zip2; zeroed by
//                 generate_compare_long_string_different_encoding before use
//   vtmp (v1), vtmp3 (v2), rscratch2 - scratch
// tmpL receives each widened Latin1 word; tmpU receives every second UTF-16
// word (the other ones go to tmp3).
// On a mismatch rscratch2 holds the XOR of the two words and control is
// transferred to DIFF2 when the UTF-16 word is in tmp3, or to DIFF1 when it
// is in tmpU. No flag-setting instruction is emitted here.
10557 void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1,
10558 Label &DIFF2) {
10559 Register cnt1 = r2, tmp2 = r11, tmp3 = r12;
10560 FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2;
10561
10562 __ ldrq(vtmp, Address(__ post(tmp2, 16))); // 16 Latin1 bytes, tmp2 += 16
10563 __ ldr(tmpU, Address(__ post(cnt1, 8))); // UTF-16 word for step 2, cnt1 += 8
10564 __ zip1(vtmp3, __ T16B, vtmp, vtmpZ); // lower 8 Latin1 bytes interleaved with vtmpZ
10565 // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3
10566
// step 1: word already in tmp3 vs. low 64 bits of vtmp3
10567 __ fmovd(tmpL, vtmp3);
10568 __ eor(rscratch2, tmp3, tmpL);
10569 __ cbnz(rscratch2, DIFF2);
10570
// step 2: word in tmpU vs. high 64 bits of vtmp3; tmp3 is reloaded for step 3
10571 __ ldr(tmp3, Address(__ post(cnt1, 8)));
10572 __ umov(tmpL, vtmp3, __ D, 1);
10573 __ eor(rscratch2, tmpU, tmpL);
10574 __ cbnz(rscratch2, DIFF1);
10575
// step 3: upper 8 Latin1 bytes are interleaved in place; word in tmp3 vs.
// low 64 bits of vtmp; tmpU is reloaded for step 4
10576 __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
10577 __ ldr(tmpU, Address(__ post(cnt1, 8)));
10578 __ fmovd(tmpL, vtmp);
10579 __ eor(rscratch2, tmp3, tmpL);
10580 __ cbnz(rscratch2, DIFF2);
10581
// step 4: word in tmpU vs. high 64 bits of vtmp; the word loaded into tmp3
// here is left for the caller / the next invocation
10582 __ ldr(tmp3, Address(__ post(cnt1, 8)));
10583 __ umov(tmpL, vtmp, __ D, 1);
10584 __ eor(rscratch2, tmpU, tmpL);
10585 __ cbnz(rscratch2, DIFF1);
10586 }
10587
10588 // r0 = result
10589 // r1 = str1
10590 // r2 = cnt1
10591 // r3 = str2
10592 // r4 = cnt2
10593 // r10 = tmp1
10594 // r11 = tmp2
//
// Generates the long-string compare stub for two strings with different
// encodings: stub id compare_long_string_LU when isLU is true, otherwise
// compare_long_string_UL. Returns the address obtained from
// load_archive_data() when that is non-null (nothing is emitted then),
// otherwise the entry address of the freshly emitted code.
// In addition to the registers listed above the stub uses rscratch1,
// rscratch2 and v0-v2, and spills/restores r12 and r14 around the loops.
// The stub consumes vtmp (v1) and tmp1/tmp2 right at its start, i.e. it
// relies on the caller having loaded the first 4 characters of both strings.
// On the path that reaches DONE without a mismatch, result (r0) is not written.
10595 address generate_compare_long_string_different_encoding(bool isLU) {
10596 StubId stub_id = (isLU ? StubId::stubgen_compare_long_string_LU_id : StubId::stubgen_compare_long_string_UL_id);
10597 int entry_count = StubInfo::entry_count(stub_id);
10598 assert(entry_count == 1, "sanity check");
10599 address start = load_archive_data(stub_id);
10600 if (start != nullptr) {
10601 return start;
10602 }
10603 __ align(CodeEntryAlignment);
10604 StubCodeMark mark(this, stub_id);
10605 address entry = __ pc();
10606 Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2,
10607 DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH,
10608 LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2;
10609 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
10610 tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14;
10611 FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2;
10612 RegSet spilled_regs = RegSet::of(tmp3, tmp4);
10613
// the prefetching loop is left once cnt2 drops below this value
10614 int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2);
10615
10616 __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ); // vtmpZ = 0; zip1/zip2 with it widens Latin1 bytes
10617 // cnt2 == amount of characters left to compare
10618 // Check already loaded first 4 symbols(vtmp and tmp2(LU)/tmp1(UL))
10619 __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
// step both string pointers past the 4 characters already loaded:
// wordSize/2 bytes for the Latin1 string, wordSize bytes for the UTF-16 one
10620 __ add(str1, str1, isLU ? wordSize/2 : wordSize);
10621 __ add(str2, str2, isLU ? wordSize : wordSize/2);
10622 __ fmovd(isLU ? tmp1 : tmp2, vtmp);
10623 __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case.
10624 __ eor(rscratch2, tmp1, tmp2);
10625 __ mov(rscratch1, tmp2); // CALCULATE_DIFFERENCE works on tmp1 and rscratch1
10626 __ cbnz(rscratch2, CALCULATE_DIFFERENCE);
10627 Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison
10628 tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison
10629 __ push(spilled_regs, sp); // save tmp3 and tmp4
10630 __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load
10631 __ mov(cnt1, isLU ? str2 : str1); // init the pointer to U next load
10632
// compare_string_16_x_LU expects the next UTF-16 word to be in tmp3 already
10633 __ ldr(tmp3, Address(__ post(cnt1, 8)));
10634
10635 if (SoftwarePrefetchHintDistance >= 0) {
10636 __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
10637 __ br(__ LT, NO_PREFETCH);
// each pass of the loop below runs the 16-character helper 2 + 2 times
10638 __ bind(LARGE_LOOP_PREFETCH);
10639 __ prfm(Address(tmp2, SoftwarePrefetchHintDistance));
10640 __ mov(tmp4, 2);
10641 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
10642 __ bind(LARGE_LOOP_PREFETCH_REPEAT1);
10643 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
10644 __ subs(tmp4, tmp4, 1);
10645 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1);
10646 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
10647 __ mov(tmp4, 2);
10648 __ bind(LARGE_LOOP_PREFETCH_REPEAT2);
10649 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
10650 __ subs(tmp4, tmp4, 1);
10651 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2);
10652 __ sub(cnt2, cnt2, 64); // 4 x 16 characters handled in this pass
10653 __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
10654 __ br(__ GE, LARGE_LOOP_PREFETCH);
10655 }
10656 __ cbz(cnt2, LOAD_LAST); // no characters left except last load
10657 __ bind(NO_PREFETCH);
10658 __ subs(cnt2, cnt2, 16);
10659 __ br(__ LT, TAIL);
10660 __ align(OptoLoopAlignment);
10661 __ bind(SMALL_LOOP); // smaller loop
10662 __ subs(cnt2, cnt2, 16);
10663 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
10664 __ br(__ GE, SMALL_LOOP); // uses the flags of the subs above; the helper emits no flag-setting instruction
10665 __ cmn(cnt2, (u1)16); // EQ when cnt2 == -16
10666 __ br(__ EQ, LOAD_LAST);
10667 __ bind(TAIL); // 1..15 characters left until last load (last 4 characters)
10668 __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string
10669 __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string
10670 __ ldr(tmp3, Address(cnt1, -8)); // re-establish the pre-loaded UTF-16 word for the moved pointer
10671 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load
10672 __ b(LOAD_LAST);
10673 __ bind(DIFF2);
10674 __ mov(tmpU, tmp3); // the mismatching UTF-16 word is in tmp3; DIFF1 expects it in tmpU
10675 __ bind(DIFF1);
10676 __ pop(spilled_regs, sp);
10677 __ b(CALCULATE_DIFFERENCE);
10678 __ bind(LOAD_LAST);
10679 // Last 4 UTF-16 characters are already pre-loaded into tmp3 by compare_string_16_x_LU.
10680 // No need to load it again
10681 __ mov(tmpU, tmp3);
10682 __ pop(spilled_regs, sp);
10683
10684 // tmp2 points to the address of the last 4 Latin1 characters right now
10685 __ ldrs(vtmp, Address(tmp2));
10686 __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
10687 __ fmovd(tmpL, vtmp);
10688
10689 __ eor(rscratch2, tmpU, tmpL);
10690 __ cbz(rscratch2, DONE); // no mismatch: return without writing result
10691
10692 // Find the first different characters in the longwords and
10693 // compute their difference.
// rscratch2 holds the XOR of the two words. rev + clz yields a bit offset
// counted from the lowest-addressed byte, and the andr with -16 rounds it
// down to a 16-bit character boundary.
10694 __ bind(CALCULATE_DIFFERENCE);
10695 __ rev(rscratch2, rscratch2);
10696 __ clz(rscratch2, rscratch2);
10697 __ andr(rscratch2, rscratch2, -16);
10698 __ lsrv(tmp1, tmp1, rscratch2);
10699 __ uxthw(tmp1, tmp1);
10700 __ lsrv(rscratch1, rscratch1, rscratch2);
10701 __ uxthw(rscratch1, rscratch1);
10702 __ subw(result, tmp1, rscratch1);
10703 __ bind(DONE);
10704 __ ret(lr);
10705
10706 // record the stub entry and end
10707 store_archive_data(stub_id, entry, __ pc());
10708
10709 return entry;
10710 }
10711
10712 // r0 = input (float16)
10713 // v0 = result (float)
10714 // v1 = temporary float register
10715 address generate_float16ToFloat() {
10716 StubId stub_id = StubId::stubgen_hf2f_id;
10717 int entry_count = StubInfo::entry_count(stub_id);
10718 assert(entry_count == 1, "sanity check");
10719 address start = load_archive_data(stub_id);
10720 if (start != nullptr) {
10721 return start;
10722 }
10723 __ align(CodeEntryAlignment);
10724 StubCodeMark mark(this, stub_id);
10725 address entry = __ pc();
10726 BLOCK_COMMENT("Entry:");
10727 __ flt16_to_flt(v0, r0, v1);
10728 __ ret(lr);
10729
10730 // record the stub entry and end
10731 store_archive_data(stub_id, entry, __ pc());
10732
10733 return entry;
10734 }
10735
10736 // v0 = input (float)
10737 // r0 = result (float16)
10738 // v1 = temporary float register
10739 address generate_floatToFloat16() {
10740 StubId stub_id = StubId::stubgen_f2hf_id;
10741 int entry_count = StubInfo::entry_count(stub_id);
10742 assert(entry_count == 1, "sanity check");
10743 address start = load_archive_data(stub_id);
10744 if (start != nullptr) {
10745 return start;
10746 }
10747 __ align(CodeEntryAlignment);
10748 StubCodeMark mark(this, stub_id);
10749 address entry = __ pc();
10750 BLOCK_COMMENT("Entry:");
10751 __ flt_to_flt16(r0, v0, v1);
10752 __ ret(lr);
10753
10754 // record the stub entry and end
10755 store_archive_data(stub_id, entry, __ pc());
10756
10757 return entry;
10758 }
10759
// Generates the method entry barrier stub. Returns the address obtained
// from load_archive_data() when that is non-null (nothing is emitted then),
// otherwise the start address of the freshly emitted code.
// The emitted code calls BarrierSetNMethod::nmethod_stub_entry_barrier with
// a pointer to the saved lr. A zero result returns to the caller of the
// stub; a non-zero result continues at the {sp, fp, lr, pc} values found in
// the four stack words reserved below.
10760 address generate_method_entry_barrier() {
10761 StubId stub_id = StubId::stubgen_method_entry_barrier_id;
10762 int entry_count = StubInfo::entry_count(stub_id);
10763 assert(entry_count == 1, "sanity check");
10764 address start = load_archive_data(stub_id);
10765 if (start != nullptr) {
10766 return start;
10767 }
10768 __ align(CodeEntryAlignment);
10769 StubCodeMark mark(this, stub_id);
10770
10771 Label deoptimize_label;
10772
10773 start = __ pc();
10774
10775 BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
10776
10777 if (bs_asm->nmethod_patching_type() == NMethodPatchingType::conc_instruction_and_data_patch) {
10778 BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
10779 // We can get here despite the nmethod being good, if we have not
10780 // yet applied our cross modification fence (or data fence).
// Copy the 32-bit value at patching_epoch_addr() into the thread-local
// word located 4 bytes after the disarmed guard value, then fence.
10781 Address thread_epoch_addr(rthread, in_bytes(bs_nm->thread_disarmed_guard_value_offset()) + 4);
10782 __ lea(rscratch2, ExternalAddress(bs_asm->patching_epoch_addr()));
10783 __ ldrw(rscratch2, rscratch2);
10784 __ strw(rscratch2, thread_epoch_addr);
10785 __ isb();
10786 __ membar(__ LoadLoad);
10787 }
10788
10789 __ set_last_Java_frame(sp, rfp, lr, rscratch1);
10790
10791 __ enter();
10792 __ add(rscratch2, sp, wordSize); // rscratch2 points to the saved lr
10793
10794 __ sub(sp, sp, 4 * wordSize); // four words for the returned {sp, fp, lr, pc}
10795
10796 __ push_call_clobbered_registers();
10797
10798 __ mov(c_rarg0, rscratch2); // single argument: address of the saved lr
10799 __ call_VM_leaf
10800 (CAST_FROM_FN_PTR
10801 (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1);
10802
10803 __ reset_last_Java_frame(true);
10804
10805 __ mov(rscratch1, r0); // keep the call result in rscratch1 before the pop below
10806
10807 __ pop_call_clobbered_registers();
10808
10809 __ cbnz(rscratch1, deoptimize_label);
10810
// result == 0: drop the frame and return to the caller of the stub
10811 __ leave();
10812 __ ret(lr);
10813
10814 __ BIND(deoptimize_label);
10815
// result != 0: pick up {sp, fp, lr, pc} from the four reserved words
// (sp points at them again after the pop above) and jump to the new pc
10816 __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize));
10817 __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize));
10818
10819 __ mov(sp, rscratch1);
10820 __ br(rscratch2);
10821
10822 // record the stub entry and end
10823 store_archive_data(stub_id, start, __ pc());
10824
10825 return start;
10826 }
10827
10828 // r0 = result
10829 // r1 = str1
10830 // r2 = cnt1
10831 // r3 = str2
10832 // r4 = cnt2
10833 // r10 = tmp1
10834 // r11 = tmp2
//
// Generates the long-string compare stub for two strings with the same
// encoding: stub id compare_long_string_LL when isLL is true, otherwise
// compare_long_string_UU. Returns the address obtained from
// load_archive_data() when that is non-null (nothing is emitted then),
// otherwise the entry address of the freshly emitted code.
// In addition to the registers listed above the stub uses rscratch1 and
// rscratch2 (as tmp1h/tmp2h and as scratch). cnt1 is declared but not used.
// tmp1/tmp2 are compared right at the start, i.e. the stub relies on the
// caller having loaded the first 8 bytes of both strings.
// On the paths that reach LENGTH_DIFF without a mismatch, result (r0) is
// not written.
10835 address generate_compare_long_string_same_encoding(bool isLL) {
10836 StubId stub_id = (isLL ? StubId::stubgen_compare_long_string_LL_id : StubId::stubgen_compare_long_string_UU_id);
10837 int entry_count = StubInfo::entry_count(stub_id);
10838 assert(entry_count == 1, "sanity check");
10839 address start = load_archive_data(stub_id);
10840 if (start != nullptr) {
10841 return start;
10842 }
10843 __ align(CodeEntryAlignment);
10844 StubCodeMark mark(this, stub_id);
10845 address entry = __ pc();
10846 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
10847 tmp1 = r10, tmp2 = r11, tmp1h = rscratch1, tmp2h = rscratch2;
10848
10849 Label LARGE_LOOP_PREFETCH, LOOP_COMPARE16, DIFF, LESS16, LESS8, CAL_DIFFERENCE, LENGTH_DIFF;
10850
10851 // exit from large loop when less than 64 bytes left to read or we're about
10852 // to prefetch memory behind array border
10853 int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);
10854
10855 // before jumping to stub, pre-load 8 bytes already, so do comparison directly
10856 __ eor(rscratch2, tmp1, tmp2);
10857 __ cbnz(rscratch2, CAL_DIFFERENCE);
10858
// cnt2 is kept in characters: the 8 bytes just compared are wordSize
// characters for LL and wordSize/2 characters for UU
10859 __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2));
10860 // update pointers, because of previous read
10861 __ add(str1, str1, wordSize);
10862 __ add(str2, str2, wordSize);
10863 if (SoftwarePrefetchHintDistance >= 0) {
10864 __ align(OptoLoopAlignment);
10865 __ bind(LARGE_LOOP_PREFETCH);
10866 __ prfm(Address(str1, SoftwarePrefetchHintDistance));
10867 __ prfm(Address(str2, SoftwarePrefetchHintDistance));
10868
// 4 x 16 bytes of each string per pass. cmp + ccmp leave NE set when
// either the low or the high 8-byte pair differs.
10869 for (int i = 0; i < 4; i++) {
10870 __ ldp(tmp1, tmp1h, Address(str1, i * 16));
10871 __ ldp(tmp2, tmp2h, Address(str2, i * 16));
10872 __ cmp(tmp1, tmp2);
10873 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
10874 __ br(Assembler::NE, DIFF);
10875 }
10876 __ sub(cnt2, cnt2, isLL ? 64 : 32); // 64 bytes consumed, expressed in characters
10877 __ add(str1, str1, 64);
10878 __ add(str2, str2, 64);
10879 __ subs(rscratch2, cnt2, largeLoopExitCondition);
10880 __ br(Assembler::GE, LARGE_LOOP_PREFETCH);
10881 __ cbz(cnt2, LENGTH_DIFF); // no more chars left?
10882 }
10883
10884 __ subs(rscratch1, cnt2, isLL ? 16 : 8);
10885 __ br(Assembler::LE, LESS16);
10886 __ align(OptoLoopAlignment);
// loop body is unrolled twice; each half compares 16 bytes of each string
10887 __ bind(LOOP_COMPARE16);
10888 __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
10889 __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
10890 __ cmp(tmp1, tmp2);
10891 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
10892 __ br(Assembler::NE, DIFF);
10893 __ sub(cnt2, cnt2, isLL ? 16 : 8);
10894 __ subs(rscratch2, cnt2, isLL ? 16 : 8);
10895 __ br(Assembler::LT, LESS16);
10896
10897 __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
10898 __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
10899 __ cmp(tmp1, tmp2);
10900 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
10901 __ br(Assembler::NE, DIFF);
10902 __ sub(cnt2, cnt2, isLL ? 16 : 8);
10903 __ subs(rscratch2, cnt2, isLL ? 16 : 8);
10904 __ br(Assembler::GE, LOOP_COMPARE16);
10905 __ cbz(cnt2, LENGTH_DIFF);
10906
10907 __ bind(LESS16);
10908 // each 8 compare
10909 __ subs(cnt2, cnt2, isLL ? 8 : 4);
10910 __ br(Assembler::LE, LESS8);
10911 __ ldr(tmp1, Address(__ post(str1, 8)));
10912 __ ldr(tmp2, Address(__ post(str2, 8)));
10913 __ eor(rscratch2, tmp1, tmp2);
10914 __ cbnz(rscratch2, CAL_DIFFERENCE);
10915 __ sub(cnt2, cnt2, isLL ? 8 : 4);
10916
// cnt2 is used below as the offset of the final 8 bytes relative to
// str1/str2; for UU it is doubled first to turn characters into bytes
10917 __ bind(LESS8); // directly load last 8 bytes
10918 if (!isLL) {
10919 __ add(cnt2, cnt2, cnt2);
10920 }
10921 __ ldr(tmp1, Address(str1, cnt2));
10922 __ ldr(tmp2, Address(str2, cnt2));
10923 __ eor(rscratch2, tmp1, tmp2);
10924 __ cbz(rscratch2, LENGTH_DIFF);
10925 __ b(CAL_DIFFERENCE);
10926
// a 16-byte pair differed: keep the low words if they differ, otherwise
// continue with the high words
10927 __ bind(DIFF);
10928 __ cmp(tmp1, tmp2);
10929 __ csel(tmp1, tmp1, tmp1h, Assembler::NE);
10930 __ csel(tmp2, tmp2, tmp2h, Assembler::NE);
10931 // reuse rscratch2 register for the result of eor instruction
10932 __ eor(rscratch2, tmp1, tmp2);
10933
// rscratch2 holds the XOR of tmp1 and tmp2. rev + clz yields a bit offset
// counted from the lowest-addressed byte, and the andr rounds it down to a
// character boundary (8 bits for LL, 16 bits for UU).
10934 __ bind(CAL_DIFFERENCE);
10935 __ rev(rscratch2, rscratch2);
10936 __ clz(rscratch2, rscratch2);
10937 __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
10938 __ lsrv(tmp1, tmp1, rscratch2);
10939 __ lsrv(tmp2, tmp2, rscratch2);
10940 if (isLL) {
10941 __ uxtbw(tmp1, tmp1);
10942 __ uxtbw(tmp2, tmp2);
10943 } else {
10944 __ uxthw(tmp1, tmp1);
10945 __ uxthw(tmp2, tmp2);
10946 }
10947 __ subw(result, tmp1, tmp2);
10948
10949 __ bind(LENGTH_DIFF);
10950 __ ret(lr);
10951
10952 // record the stub entry and end
10953 store_archive_data(stub_id, entry, __ pc());
10954
10955 return entry;
10956 }
10957
// Encoding combination handled by a long-string compare stub. The first
// letter describes str1 and the second str2: L = Latin1, U = UTF-16.
enum string_compare_mode {
  LL = 0,  // str1 Latin1, str2 Latin1
  LU = 1,  // str1 Latin1, str2 UTF-16
  UL = 2,  // str1 UTF-16, str2 Latin1
  UU = 3,  // str1 UTF-16, str2 UTF-16
};
10964
10965 // The following registers are declared in aarch64.ad
10966 // r0 = result
10967 // r1 = str1
10968 // r2 = cnt1
10969 // r3 = str2
10970 // r4 = cnt2
10971 // r10 = tmp1
10972 // r11 = tmp2
10973 // z0 = ztmp1
10974 // z1 = ztmp2
10975 // p0 = pgtmp1
10976 // p1 = pgtmp2
10977 address generate_compare_long_string_sve(string_compare_mode mode) {
10978 StubId stub_id;
10979 switch (mode) {
10980 case LL: stub_id = StubId::stubgen_compare_long_string_LL_id; break;
10981 case LU: stub_id = StubId::stubgen_compare_long_string_LU_id; break;
10982 case UL: stub_id = StubId::stubgen_compare_long_string_UL_id; break;
10983 case UU: stub_id = StubId::stubgen_compare_long_string_UU_id; break;
10984 default: ShouldNotReachHere();
10985 }
10986 int entry_count = StubInfo::entry_count(stub_id);
10987 assert(entry_count == 1, "sanity check");
10988 address start = load_archive_data(stub_id);
10989 if (start != nullptr) {
10990 return start;
10991 }
10992 __ align(CodeEntryAlignment);
10993 StubCodeMark mark(this, stub_id);
10994 address entry = __ pc();
10995 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
10996 tmp1 = r10, tmp2 = r11;
10997
10998 Label LOOP, DONE, MISMATCH;
10999 Register vec_len = tmp1;
11000 Register idx = tmp2;
11001 // The minimum of the string lengths has been stored in cnt2.
11002 Register cnt = cnt2;
11003 FloatRegister ztmp1 = z0, ztmp2 = z1;
11004 PRegister pgtmp1 = p0, pgtmp2 = p1;
11005
11006 #define LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx) \
11007 switch (mode) { \
11008 case LL: \
11009 __ sve_ld1b(ztmp1, __ B, pgtmp1, Address(str1, idx)); \
11010 __ sve_ld1b(ztmp2, __ B, pgtmp1, Address(str2, idx)); \
11011 break; \
11012 case LU: \
11013 __ sve_ld1b(ztmp1, __ H, pgtmp1, Address(str1, idx)); \
11014 __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
11015 break; \
11016 case UL: \
11017 __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
11018 __ sve_ld1b(ztmp2, __ H, pgtmp1, Address(str2, idx)); \
11019 break; \
11020 case UU: \
11021 __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
11022 __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
11023 break; \
11024 default: \
11025 ShouldNotReachHere(); \
11026 }
11027
11028 __ mov(idx, 0);
11029 __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
11030
11031 if (mode == LL) {
11032 __ sve_cntb(vec_len);
11033 } else {
11034 __ sve_cnth(vec_len);
11035 }
11036
11037 __ sub(rscratch1, cnt, vec_len);
11038
11039 __ bind(LOOP);
11040
11041 // main loop
11042 LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
11043 __ add(idx, idx, vec_len);
11044 // Compare strings.
11045 __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
11046 __ br(__ NE, MISMATCH);
11047 __ cmp(idx, rscratch1);
11048 __ br(__ LT, LOOP);
11049
11050 // post loop, last iteration
11051 __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
11052
11053 LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
11054 __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
11055 __ br(__ EQ, DONE);
11056
11057 __ bind(MISMATCH);
11058
11059 // Crop the vector to find its location.
11060 __ sve_brkb(pgtmp2, pgtmp1, pgtmp2, false /* isMerge */);
11061 // Extract the first different characters of each string.
11062 __ sve_lasta(rscratch1, mode == LL ? __ B : __ H, pgtmp2, ztmp1);
11063 __ sve_lasta(rscratch2, mode == LL ? __ B : __ H, pgtmp2, ztmp2);
11064
11065 // Compute the difference of the first different characters.
11066 __ sub(result, rscratch1, rscratch2);
11067
11068 __ bind(DONE);
11069 __ ret(lr);
11070 #undef LOAD_PAIR
11071
11072 // record the stub entry and end
11073 store_archive_data(stub_id, entry, __ pc());
11074
11075 return entry;
11076 }
11077
11078 void generate_compare_long_strings() {
11079 if (UseSVE == 0) {
11080 StubRoutines::aarch64::_compare_long_string_LL
11081 = generate_compare_long_string_same_encoding(true);
11082 StubRoutines::aarch64::_compare_long_string_UU
11083 = generate_compare_long_string_same_encoding(false);
11084 StubRoutines::aarch64::_compare_long_string_LU
11085 = generate_compare_long_string_different_encoding(true);
11086 StubRoutines::aarch64::_compare_long_string_UL
11087 = generate_compare_long_string_different_encoding(false);
11088 } else {
11089 StubRoutines::aarch64::_compare_long_string_LL
11090 = generate_compare_long_string_sve(LL);
11091 StubRoutines::aarch64::_compare_long_string_UU
11092 = generate_compare_long_string_sve(UU);
11093 StubRoutines::aarch64::_compare_long_string_LU
11094 = generate_compare_long_string_sve(LU);
11095 StubRoutines::aarch64::_compare_long_string_UL
11096 = generate_compare_long_string_sve(UL);
11097 }
11098 }
11099
11100 // R0 = result
11101 // R1 = str2
11102 // R2 = cnt1
11103 // R3 = str1
11104 // R4 = cnt2
11105 // Clobbers: rscratch1, rscratch2, v0, v1, rflags
11106 //
11107 // This generic linear code uses a few additional ideas, which make it faster:
11108 // 1) we can safely keep at least 1st register of pattern(since length >= 8)
11109 // in order to skip initial loading(help in systems with 1 ld pipeline)
11110 // 2) we can use "fast" algorithm of finding single character to search for
11111 // first symbol with less branches(1 branch per each loaded register instead
11112 // of branch for each symbol), so, this is where constants like
11113 // 0x0101...01, 0x00010001...0001, 0x7f7f...7f, 0x7fff7fff...7fff comes from
11114 // 3) after loading and analyzing the 1st register of the source string, it can
11115 // be used to search for every 1st character entry, saving a few loads in
11116 // comparison with a "simpler-but-slower" implementation
11117 // 4) in order to avoid lots of push/pop operations, code below is heavily
11118 // re-using/re-initializing/compressing register values, which makes code
11119 // larger and a bit less readable, however, most of extra operations are
11120 // issued during loads or branches, so, penalty is minimal
11121 address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
11122 StubId stub_id;
11123 if (str1_isL) {
11124 if (str2_isL) {
11125 stub_id = StubId::stubgen_string_indexof_linear_ll_id;
11126 } else {
11127 stub_id = StubId::stubgen_string_indexof_linear_ul_id;
11128 }
11129 } else {
11130 if (str2_isL) {
11131 ShouldNotReachHere();
11132 } else {
11133 stub_id = StubId::stubgen_string_indexof_linear_uu_id;
11134 }
11135 }
11136 int entry_count = StubInfo::entry_count(stub_id);
11137 assert(entry_count == 1, "sanity check");
11138 address start = load_archive_data(stub_id);
11139 if (start != nullptr) {
11140 return start;
11141 }
11142 __ align(CodeEntryAlignment);
11143 StubCodeMark mark(this, stub_id);
11144 address entry = __ pc();
11145
11146 int str1_chr_size = str1_isL ? 1 : 2;
11147 int str2_chr_size = str2_isL ? 1 : 2;
11148 int str1_chr_shift = str1_isL ? 0 : 1;
11149 int str2_chr_shift = str2_isL ? 0 : 1;
11150 bool isL = str1_isL && str2_isL;
11151 // parameters
11152 Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4;
11153 // temporary registers
11154 Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23;
11155 RegSet spilled_regs = RegSet::range(tmp1, tmp4);
11156 // redefinitions
11157 Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3;
11158
11159 __ push(spilled_regs, sp);
11160 Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
11161 L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
11162 L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
11163 L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
11164 L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
11165 L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
11166 // Read whole register from str1. It is safe, because length >=8 here
11167 __ ldr(ch1, Address(str1));
11168 // Read whole register from str2. It is safe, because length >=8 here
11169 __ ldr(ch2, Address(str2));
11170 __ sub(cnt2, cnt2, cnt1);
11171 __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF);
11172 if (str1_isL != str2_isL) {
11173 __ eor(v0, __ T16B, v0, v0);
11174 }
11175 __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
11176 __ mul(first, first, tmp1);
11177 // check if we have less than 1 register to check
11178 __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1);
11179 if (str1_isL != str2_isL) {
11180 __ fmovd(v1, ch1);
11181 }
11182 __ br(__ LE, L_SMALL);
11183 __ eor(ch2, first, ch2);
11184 if (str1_isL != str2_isL) {
11185 __ zip1(v1, __ T16B, v1, v0);
11186 }
11187 __ sub(tmp2, ch2, tmp1);
11188 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
11189 __ bics(tmp2, tmp2, ch2);
11190 if (str1_isL != str2_isL) {
11191 __ fmovd(ch1, v1);
11192 }
11193 __ br(__ NE, L_HAS_ZERO);
11194 __ subs(cnt2, cnt2, wordSize/str2_chr_size);
11195 __ add(result, result, wordSize/str2_chr_size);
11196 __ add(str2, str2, wordSize);
11197 __ br(__ LT, L_POST_LOOP);
11198 __ BIND(L_LOOP);
11199 __ ldr(ch2, Address(str2));
11200 __ eor(ch2, first, ch2);
11201 __ sub(tmp2, ch2, tmp1);
11202 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
11203 __ bics(tmp2, tmp2, ch2);
11204 __ br(__ NE, L_HAS_ZERO);
11205 __ BIND(L_LOOP_PROCEED);
11206 __ subs(cnt2, cnt2, wordSize/str2_chr_size);
11207 __ add(str2, str2, wordSize);
11208 __ add(result, result, wordSize/str2_chr_size);
11209 __ br(__ GE, L_LOOP);
11210 __ BIND(L_POST_LOOP);
11211 __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check
11212 __ br(__ LE, NOMATCH);
11213 __ ldr(ch2, Address(str2));
11214 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
11215 __ eor(ch2, first, ch2);
11216 __ sub(tmp2, ch2, tmp1);
11217 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
11218 __ mov(tmp4, -1); // all bits set
11219 __ b(L_SMALL_PROCEED);
11220 __ align(OptoLoopAlignment);
11221 __ BIND(L_SMALL);
11222 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
11223 __ eor(ch2, first, ch2);
11224 if (str1_isL != str2_isL) {
11225 __ zip1(v1, __ T16B, v1, v0);
11226 }
11227 __ sub(tmp2, ch2, tmp1);
11228 __ mov(tmp4, -1); // all bits set
11229 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
11230 if (str1_isL != str2_isL) {
11231 __ fmovd(ch1, v1); // move converted 4 symbols
11232 }
11233 __ BIND(L_SMALL_PROCEED);
11234 __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits.
11235 __ bic(tmp2, tmp2, ch2);
11236 __ ands(tmp2, tmp2, tmp4); // clear useless bits and check
11237 __ rbit(tmp2, tmp2);
11238 __ br(__ EQ, NOMATCH);
11239 __ BIND(L_SMALL_HAS_ZERO_LOOP);
11240 __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some cpu's
11241 __ cmp(cnt1, u1(wordSize/str2_chr_size));
11242 __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2);
11243 if (str2_isL) { // LL
11244 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
11245 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
11246 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
11247 __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
11248 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
11249 } else {
11250 __ mov(ch2, 0xE); // all bits in byte set except last one
11251 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
11252 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
11253 __ lslv(tmp2, tmp2, tmp4);
11254 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
11255 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
11256 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
11257 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
11258 }
11259 __ cmp(ch1, ch2);
11260 __ mov(tmp4, wordSize/str2_chr_size);
11261 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
11262 __ BIND(L_SMALL_CMP_LOOP);
11263 str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
11264 : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
11265 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
11266 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
11267 __ add(tmp4, tmp4, 1);
11268 __ cmp(tmp4, cnt1);
11269 __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP);
11270 __ cmp(first, ch2);
11271 __ br(__ EQ, L_SMALL_CMP_LOOP);
11272 __ BIND(L_SMALL_CMP_LOOP_NOMATCH);
11273 __ cbz(tmp2, NOMATCH); // no more matches. exit
11274 __ clz(tmp4, tmp2);
11275 __ add(result, result, 1); // advance index
11276 __ add(str2, str2, str2_chr_size); // advance pointer
11277 __ b(L_SMALL_HAS_ZERO_LOOP);
11278 __ align(OptoLoopAlignment);
11279 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP);
11280 __ cmp(first, ch2);
11281 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
11282 __ b(DONE);
11283 __ align(OptoLoopAlignment);
11284 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2);
11285 if (str2_isL) { // LL
11286 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
11287 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
11288 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
11289 __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
11290 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
11291 } else {
11292 __ mov(ch2, 0xE); // all bits in byte set except last one
11293 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
11294 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
11295 __ lslv(tmp2, tmp2, tmp4);
11296 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
11297 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
11298 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
11299 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
11300 }
11301 __ cmp(ch1, ch2);
11302 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
11303 __ b(DONE);
11304 __ align(OptoLoopAlignment);
11305 __ BIND(L_HAS_ZERO);
11306 __ rbit(tmp2, tmp2);
11307 __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPU's
11308 // Now, perform compression of counters(cnt2 and cnt1) into one register.
11309 // It's fine because both counters are 32bit and are not changed in this
11310 // loop. Just restore it on exit. So, cnt1 can be re-used in this loop.
11311 __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2);
11312 __ sub(result, result, 1);
11313 __ BIND(L_HAS_ZERO_LOOP);
11314 __ mov(cnt1, wordSize/str2_chr_size);
11315 __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2);
11316 __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare
11317 if (str2_isL) {
11318 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
11319 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
11320 __ lslv(tmp2, tmp2, tmp4);
11321 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
11322 __ add(tmp4, tmp4, 1);
11323 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
11324 __ lsl(tmp2, tmp2, 1);
11325 __ mov(tmp4, wordSize/str2_chr_size);
11326 } else {
11327 __ mov(ch2, 0xE);
11328 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
11329 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
11330 __ lslv(tmp2, tmp2, tmp4);
11331 __ add(tmp4, tmp4, 1);
11332 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
11333 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
11334 __ lsl(tmp2, tmp2, 1);
11335 __ mov(tmp4, wordSize/str2_chr_size);
11336 __ sub(str2, str2, str2_chr_size);
11337 }
11338 __ cmp(ch1, ch2);
11339 __ mov(tmp4, wordSize/str2_chr_size);
11340 __ br(__ NE, L_CMP_LOOP_NOMATCH);
11341 __ BIND(L_CMP_LOOP);
11342 str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
11343 : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
11344 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
11345 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
11346 __ add(tmp4, tmp4, 1);
11347 __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2);
11348 __ br(__ GE, L_CMP_LOOP_LAST_CMP);
11349 __ cmp(cnt1, ch2);
11350 __ br(__ EQ, L_CMP_LOOP);
11351 __ BIND(L_CMP_LOOP_NOMATCH);
11352 // here we're not matched
11353 __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop
11354 __ clz(tmp4, tmp2);
11355 __ add(str2, str2, str2_chr_size); // advance pointer
11356 __ b(L_HAS_ZERO_LOOP);
11357 __ align(OptoLoopAlignment);
11358 __ BIND(L_CMP_LOOP_LAST_CMP);
11359 __ cmp(cnt1, ch2);
11360 __ br(__ NE, L_CMP_LOOP_NOMATCH);
11361 __ b(DONE);
11362 __ align(OptoLoopAlignment);
11363 __ BIND(L_CMP_LOOP_LAST_CMP2);
11364 if (str2_isL) {
11365 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
11366 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
11367 __ lslv(tmp2, tmp2, tmp4);
11368 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
11369 __ add(tmp4, tmp4, 1);
11370 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
11371 __ lsl(tmp2, tmp2, 1);
11372 } else {
11373 __ mov(ch2, 0xE);
11374 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
11375 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
11376 __ lslv(tmp2, tmp2, tmp4);
11377 __ add(tmp4, tmp4, 1);
11378 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
11379 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
11380 __ lsl(tmp2, tmp2, 1);
11381 __ sub(str2, str2, str2_chr_size);
11382 }
11383 __ cmp(ch1, ch2);
11384 __ br(__ NE, L_CMP_LOOP_NOMATCH);
11385 __ b(DONE);
11386 __ align(OptoLoopAlignment);
11387 __ BIND(L_HAS_ZERO_LOOP_NOMATCH);
11388 // 1) Restore "result" index. Index was wordSize/str2_chr_size * N until
11389 // L_HAS_ZERO block. Byte octet was analyzed in L_HAS_ZERO_LOOP,
11390 // so, result was increased at max by wordSize/str2_chr_size - 1, so,
11391 // respective high bit wasn't changed. L_LOOP_PROCEED will increase
11392 // result by analyzed characters value, so, we can just reset lower bits
11393 // in result here. Clear 2 lower bits for UU/UL and 3 bits for LL
11394 // 2) restore cnt1 and cnt2 values from "compressed" cnt2
11395 // 3) advance str2 value to represent next str2 octet. result & 7/3 is
11396 // index of last analyzed substring inside current octet. So, str2 in at
11397 // respective start address. We need to advance it to next octet
11398 __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed
11399 __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2);
11400 __ bfm(result, zr, 0, 2 - str2_chr_shift);
11401 __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2
11402 __ movw(cnt2, cnt2);
11403 __ b(L_LOOP_PROCEED);
11404 __ align(OptoLoopAlignment);
11405 __ BIND(NOMATCH);
11406 __ mov(result, -1);
11407 __ BIND(DONE);
11408 __ pop(spilled_regs, sp);
11409 __ ret(lr);
11410
11411 // record the stub entry and end
11412 store_archive_data(stub_id, entry, __ pc());
11413
11414 return entry;
11415 }
11416
11417 void generate_string_indexof_stubs() {
11418 StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
11419 StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
11420 StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
11421 }
11422
11423 void inflate_and_store_2_fp_registers(bool generatePrfm,
11424 FloatRegister src1, FloatRegister src2) {
11425 Register dst = r1;
11426 __ zip1(v1, __ T16B, src1, v0);
11427 __ zip2(v2, __ T16B, src1, v0);
11428 if (generatePrfm) {
11429 __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM);
11430 }
11431 __ zip1(v3, __ T16B, src2, v0);
11432 __ zip2(v4, __ T16B, src2, v0);
11433 __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64)));
11434 }
11435
// Emits the large_byte_array_inflate stub. The generated code interleaves
// each source byte with the corresponding byte of v0 (zero, per the register
// contract below), so every input byte produces two output bytes.
// Returns the entry address of the stub.
//
// Register contract of the generated code:
11436 // R0 = src
11437 // R1 = dst
11438 // R2 = len
11439 // R3 = len >> 3
11440 // V0 = 0
11441 // v1 = loaded 8 bytes
11442 // Clobbers: r0, r1, r3, rscratch1, rflags, v0-v6
11443 address generate_large_byte_array_inflate() {
11444 StubId stub_id = StubId::stubgen_large_byte_array_inflate_id;
11445 int entry_count = StubInfo::entry_count(stub_id);
11446 assert(entry_count == 1, "sanity check");
// If load_archive_data already yields an entry for this stub, return it and
// emit nothing (presumably a previously archived copy -- confirm).
11447 address start = load_archive_data(stub_id);
11448 if (start != nullptr) {
11449 return start;
11450 }
11451 __ align(CodeEntryAlignment);
11452 StubCodeMark mark(this, stub_id);
11453 address entry = __ pc();
11454 Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
// Note: 'len' is only named here; the code below counts in octetCounter.
11455 Register src = r0, dst = r1, len = r2, octetCounter = r3;
// Threshold, in 8-byte octets, compared against octetCounter to decide
// whether the prefetching loop is used.
11456 const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4;
11457
11458 // do one more 8-byte read to have address 16-byte aligned in most cases
11459 // also use single store instruction
11460 __ ldrd(v2, __ post(src, 8));
// Two octets (the 8 bytes already in v1 plus the 8 just loaded into v2) are
// handled by this prologue.
11461 __ sub(octetCounter, octetCounter, 2);
11462 __ zip1(v1, __ T16B, v1, v0);
11463 __ zip1(v2, __ T16B, v2, v0);
11464 __ st1(v1, v2, __ T16B, __ post(dst, 32));
// Preload the first 64 source bytes (8 octets) for whichever loop runs next.
11465 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
// octetCounter <= large_loop_threshold: enter the plain loop at LOOP_START;
// otherwise enter the prefetching loop at LOOP_PRFM_START.
11466 __ subs(rscratch1, octetCounter, large_loop_threshold);
11467 __ br(__ LE, LOOP_START);
11468 __ b(LOOP_PRFM_START);
// Prefetching loop: 64 source bytes per iteration, with a prefetch of
// src + SoftwarePrefetchHintDistance.
11469 __ bind(LOOP_PRFM);
11470 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
11471 __ bind(LOOP_PRFM_START);
11472 __ prfm(Address(src, SoftwarePrefetchHintDistance));
11473 __ sub(octetCounter, octetCounter, 8);
// The br(GT) below tests the flags produced by this subs; the two inflate
// helpers are emitted in between.
11474 __ subs(rscratch1, octetCounter, large_loop_threshold);
11475 inflate_and_store_2_fp_registers(true, v3, v4);
11476 inflate_and_store_2_fp_registers(true, v5, v6);
11477 __ br(__ GT, LOOP_PRFM);
// Leaving the prefetching loop: finish here if fewer than 8 octets remain.
11478 __ cmp(octetCounter, (u1)8);
11479 __ br(__ LT, DONE);
// Plain loop: same 64-bytes-per-iteration work without prefetch.
11480 __ bind(LOOP);
11481 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
11482 __ bind(LOOP_START);
11483 __ sub(octetCounter, octetCounter, 8);
// As above, the br(GE) after the helpers tests the result of this cmp.
11484 __ cmp(octetCounter, (u1)8);
11485 inflate_and_store_2_fp_registers(false, v3, v4);
11486 inflate_and_store_2_fp_registers(false, v5, v6);
11487 __ br(__ GE, LOOP);
11488 __ bind(DONE);
11489 __ ret(lr);
11490
// Generation-time bookkeeping (not emitted code): hand the entry address and
// the current pc to store_archive_data.
11491 // record the stub entry and end
11492 store_archive_data(stub_id, entry, __ pc());
11493
11494 return entry;
11495 }
11496
// Emits the GHASH stub that processes one 16-byte block per loop iteration.
// Returns the entry address of the stub. The 16-byte field-polynomial
// constant is emitted as data after the final ret.
11497 /**
11498 * Arguments:
11499 *
11500 * Input:
11501 * c_rarg0 - current state address
11502 * c_rarg1 - H key address
11503 * c_rarg2 - data address
11504 * c_rarg3 - number of blocks
11505 *
11506 * Output:
11507 * Updated state at c_rarg0
11508 */
11509 address generate_ghash_processBlocks_small() {
11510 // Bafflingly, GCM uses little-endian for the byte order, but
11511 // big-endian for the bit order. For example, the polynomial 1 is
11512 // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
11513 //
11514 // So, we must either reverse the bytes in each word and do
11515 // everything big-endian or reverse the bits in each byte and do
11516 // it little-endian. On AArch64 it's more idiomatic to reverse
11517 // the bits in each byte (we have an instruction, RBIT, to do
11518 // that) and keep the data in little-endian bit order through the
11519 // calculation, bit-reversing the inputs and outputs.
11520
11521 StubId stub_id = StubId::stubgen_ghash_processBlocks_small_id;
11522 int entry_count = StubInfo::entry_count(stub_id);
11523 assert(entry_count == 1, "sanity check");
// If load_archive_data already yields an entry for this stub, return it and
// emit nothing (presumably a previously archived copy -- confirm).
11524 address start = load_archive_data(stub_id);
11525 if (start != nullptr) {
11526 return start;
11527 }
11528 __ align(CodeEntryAlignment);
11529 StubCodeMark mark(this, stub_id);
11530 Label polynomial; // local data generated at end of stub
11531 start = __ pc();
11532
11533 Register state = c_rarg0;
11534 Register subkeyH = c_rarg1;
11535 Register data = c_rarg2;
11536 Register blocks = c_rarg3;
11537
11538 FloatRegister vzr = v30;
11539 __ eor(vzr, __ T16B, vzr, vzr); // zero register
11540
// Load the 16-byte constant emitted at the 'polynomial' label below.
11541 __ adr(rscratch1, polynomial);
11542 __ ldrq(v24, rscratch1); // The field polynomial
11543
// v0 = state, v1 = subkeyH
11544 __ ldrq(v0, Address(state));
11545 __ ldrq(v1, Address(subkeyH));
11546
11547 __ rev64(v0, __ T16B, v0); // Bit-reverse words in state and subkeyH
11548 __ rbit(v0, __ T16B, v0);
11549 __ rev64(v1, __ T16B, v1);
11550 __ rbit(v1, __ T16B, v1);
11551
11552 __ ext(v4, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v4
11553 __ eor(v4, __ T16B, v4, v1); // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
11554
// Main loop: one 16-byte block per iteration. The loop body runs before the
// counter is tested, so the generated code processes at least one block.
11555 {
11556 Label L_ghash_loop;
11557 __ bind(L_ghash_loop);
11558
11559 __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
11560 // reversing each byte
11561 __ rbit(v2, __ T16B, v2);
11562 __ eor(v2, __ T16B, v0, v2); // bit-swapped data ^ bit-swapped state
11563
11564 // Multiply state in v2 by subkey in v1
11565 __ ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
11566 /*a*/v1, /*b*/v2, /*a1_xor_a0*/v4,
11567 /*temps*/v6, v3, /*reuse/clobber b*/v2);
11568 // Reduce v7:v5 by the field polynomial
11569 __ ghash_reduce(/*result*/v0, /*lo*/v5, /*hi*/v7, /*p*/v24, vzr, /*temp*/v3);
11570
11571 __ sub(blocks, blocks, 1);
11572 __ cbnz(blocks, L_ghash_loop);
11573 }
11574
11575 // The bit-reversed result is at this point in v0
// Undo the input transformation (rev64 + rbit) before storing the state.
11576 __ rev64(v0, __ T16B, v0);
11577 __ rbit(v0, __ T16B, v0);
11578
11579 __ st1(v0, __ T16B, state);
11580 __ ret(lr);
11581
11582 // bind label and generate local polynomial data
// Aligned to 2 * wordSize; this is the data read by the ldrq into v24 above.
11583 __ align(wordSize * 2);
11584 __ bind(polynomial);
11585 __ emit_int64(0x87); // The low-order bits of the field
11586 // polynomial (i.e. p = z^7+z^2+z+1)
11587 // repeated in the low and high parts of a
11588 // 128-bit vector
11589 __ emit_int64(0x87);
11590
// Generation-time bookkeeping (not emitted code); the recorded range includes
// the polynomial data.
11591 // record the stub entry and end
11592 store_archive_data(stub_id, start, __ pc());
11593
11594 return start;
11595 }
11596
// Emits the GHASH stub that uses ghash_processBlocks_wide with an unroll
// factor of 4, and branches to the stub at address 'small' for short inputs
// and for whatever blocks remain after the wide routine.
//
//   small - address of the stub the generated code branches to when fewer
//           than unroll * 2 blocks are requested, or when blocks > 0 after
//           the wide routine.
//
// The generated code uses c_rarg0..c_rarg3 as state, subkeyH, data and
// blocks. Returns the entry address of the stub.
11597 address generate_ghash_processBlocks(address small) {
11598 StubId stub_id = StubId::stubgen_ghash_processBlocks_id;
11599 int entry_count = StubInfo::entry_count(stub_id);
11600 assert(entry_count == 1, "sanity check");
// If load_archive_data already yields an entry for this stub, return it and
// emit nothing (presumably a previously archived copy -- confirm).
11601 address start = load_archive_data(stub_id);
11602 if (start != nullptr) {
11603 return start;
11604 }
11605 Label polynomial; // local data generated after stub
11606 __ align(CodeEntryAlignment);
11607 StubCodeMark mark(this, stub_id);
11608 start = __ pc();
11609
11610 Register state = c_rarg0;
11611 Register subkeyH = c_rarg1;
11612 Register data = c_rarg2;
11613 Register blocks = c_rarg3;
11614
11615 const int unroll = 4;
11616
// Fewer than unroll * 2 (= 8) blocks: branch straight to the small stub.
11617 __ cmp(blocks, (unsigned char)(unroll * 2));
11618 __ br(__ LT, small);
11619
11620 if (unroll > 1) {
11621 // Save state before entering routine
// v12-v15 are pushed first, then v8-v11, so v8-v11 end up at the lower
// address; the restore below pops them in the reverse order.
11622 __ sub(sp, sp, 4 * 16);
11623 __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
11624 __ sub(sp, sp, 4 * 16);
11625 __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
11626 }
11627
11628 __ ghash_processBlocks_wide(polynomial, state, subkeyH, data, blocks, unroll);
11629
11630 if (unroll > 1) {
11631 // And restore state
11632 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
11633 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
11634 }
11635
// Blocks left over after the wide routine are handled by the small stub.
11636 __ cmp(blocks, (unsigned char)0);
11637 __ br(__ GT, small);
11638
11639 __ ret(lr);
11640
11641 // bind label and generate polynomial data
// The label is passed to ghash_processBlocks_wide above and bound here,
// after the final ret, aligned to 2 * wordSize.
11642 __ align(wordSize * 2);
11643 __ bind(polynomial);
11644 __ emit_int64(0x87); // The low-order bits of the field
11645 // polynomial (i.e. p = z^7+z^2+z+1)
11646 // repeated in the low and high parts of a
11647 // 128-bit vector
11648 __ emit_int64(0x87);
11649
// Generation-time bookkeeping (not emitted code); the recorded range includes
// the polynomial data.
11650 // record the stub entry and end
11651 store_archive_data(stub_id, start, __ pc());
11652
11653 return start;
11654 }
11655
// Emits one SIMD round of Base64 encoding: reads 3 * size source bytes,
// writes 4 * size encoded bytes, and post-increments src and dst accordingly.
//
//   src, dst - source / destination pointers (advanced by the emitted code)
//   codec    - first of four consecutive vector registers used as the
//              lookup table by tbl
//   size     - 16 selects the T16B arrangement; any other value selects T8B
//
// Uses v4-v6 for input and v16-v23 for indices/output; in0 and in1 are
// overwritten while the indices are built.
11656 void generate_base64_encode_simdround(Register src, Register dst,
11657 FloatRegister codec, u8 size) {
11658
11659 FloatRegister in0 = v4, in1 = v5, in2 = v6;
11660 FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19;
11661 FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23;
11662
11663 Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
11664
// De-interleaving load: in0/in1/in2 receive the 1st/2nd/3rd byte of every
// 3-byte group.
11665 __ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size));
11666
// All shifts below operate on byte lanes.
// ind0 = in0 >> 2                       (top 6 bits of byte 0)
11667 __ ushr(ind0, arrangement, in0, 2);
11668
// ind1 = ((in0 & 0x3) << 4) | (in1 >> 4)
11669 __ ushr(ind1, arrangement, in1, 2);
11670 __ shl(in0, arrangement, in0, 6);
11671 __ orr(ind1, arrangement, ind1, in0);
11672 __ ushr(ind1, arrangement, ind1, 2);
11673
// ind2 = ((in1 & 0xf) << 2) | (in2 >> 6)
11674 __ ushr(ind2, arrangement, in2, 4);
11675 __ shl(in1, arrangement, in1, 4);
11676 __ orr(ind2, arrangement, in1, ind2);
11677 __ ushr(ind2, arrangement, ind2, 2);
11678
// ind3 = in2 & 0x3f                     (low 6 bits of byte 2)
11679 __ shl(ind3, arrangement, in2, 2);
11680 __ ushr(ind3, arrangement, ind3, 2);
11681
// Translate each 6-bit index through the 4-register table starting at codec.
11682 __ tbl(out0, arrangement, codec, 4, ind0);
11683 __ tbl(out1, arrangement, codec, 4, ind1);
11684 __ tbl(out2, arrangement, codec, 4, ind2);
11685 __ tbl(out3, arrangement, codec, 4, ind3);
11686
// Interleaving store: four output characters per input group.
11687 __ st4(out0, out1, out2, out3, arrangement, __ post(dst, 4 * size));
11688 }
11689
// Emits the Base64 encodeBlock stub and returns its entry address. The
// generated code encodes (src_length - src_offset) source bytes: 48 bytes per
// SIMD round while possible, then at most one 24-byte SIMD round, then 3 bytes
// at a time in a scalar loop.
11690 /**
11691 * Arguments:
11692 *
11693 * Input:
11694 * c_rarg0 - src_start
11695 * c_rarg1 - src_offset
11696 * c_rarg2 - src_length
11697 * c_rarg3 - dest_start
11698 * c_rarg4 - dest_offset
11699 * c_rarg5 - isURL
11700 *
11701 */
11702 address generate_base64_encodeBlock() {
11703
11704 StubId stub_id = StubId::stubgen_base64_encodeBlock_id;
11705 int entry_count = StubInfo::entry_count(stub_id);
11706 assert(entry_count == 1, "sanity check");
// If load_archive_data already yields an entry for this stub, return it and
// emit nothing (presumably a previously archived copy -- confirm).
11707 address start = load_archive_data(stub_id);
11708 if (start != nullptr) {
11709 return start;
11710 }
11711 __ align(CodeEntryAlignment);
11712 StubCodeMark mark(this, stub_id);
11713 start = __ pc();
11714
11715 Register src = c_rarg0; // source array
11716 Register soff = c_rarg1; // source start offset
11717 Register send = c_rarg2; // source end offset
11718 Register dst = c_rarg3; // dest array
11719 Register doff = c_rarg4; // position for writing to dest array
11720 Register isURL = c_rarg5; // Base64 or URL character set
11721
11722 // c_rarg6 and c_rarg7 are free to use as temps
11723 Register codec = c_rarg6;
11724 Register length = c_rarg7;
11725
11726 Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit;
11727
// Turn the (array, offset) pairs into raw pointers and compute the byte count.
11728 __ add(src, src, soff);
11729 __ add(dst, dst, doff);
11730 __ sub(length, send, soff);
11731
11732 // load the codec base address
// Standard table by default; replaced by the URL table when isURL != 0.
11733 __ lea(codec, ExternalAddress((address) _encodeBlock_toBase64));
11734 __ cbz(isURL, ProcessData);
11735 __ lea(codec, ExternalAddress((address) _encodeBlock_toBase64URL));
11736
11737 __ BIND(ProcessData);
11738
11739 // too short to form a SIMD loop, fall back to the scalar 3-byte loop
11740 __ cmp(length, (u1)24);
11741 __ br(Assembler::LT, Process3B);
11742
// Load the 64-byte translation table into v0-v3 for the tbl lookups.
11743 __ ld1(v0, v1, v2, v3, __ T16B, Address(codec));
11744
// 48 source bytes per round while at least 48 remain.
11745 __ BIND(Process48B);
11746 __ cmp(length, (u1)48);
11747 __ br(Assembler::LT, Process24B);
11748 generate_base64_encode_simdround(src, dst, v0, 16);
11749 __ sub(length, length, 48);
11750 __ b(Process48B);
11751
// At most one 24-byte round (fewer than 48 bytes remain at this point).
11752 __ BIND(Process24B);
11753 __ cmp(length, (u1)24);
11754 __ br(Assembler::LT, SIMDExit);
11755 generate_base64_encode_simdround(src, dst, v0, 8);
11756 __ sub(length, length, 24);
11757
11758 __ BIND(SIMDExit);
11759 __ cbz(length, Exit);
11760
// Scalar loop: 3 source bytes -> 4 output characters per iteration.
// NOTE(review): the loop exits only when length reaches exactly zero, and it
// is entered without a zero check when length < 24 on entry; presumably the
// caller always passes a positive multiple of 3 -- confirm.
11761 __ BIND(Process3B);
11762 // 3 src bytes, 24 bits
11763 __ ldrb(r10, __ post(src, 1));
11764 __ ldrb(r11, __ post(src, 1));
11765 __ ldrb(r12, __ post(src, 1));
// r12 = (byte0 << 16) | (byte1 << 8) | byte2
11766 __ orrw(r11, r11, r10, Assembler::LSL, 8);
11767 __ orrw(r12, r12, r11, Assembler::LSL, 8);
11768 // codec index
// Four 6-bit fields of r12: bits 18-23, 12-17, 6-11 and 0-5.
11769 __ ubfmw(r15, r12, 18, 23);
11770 __ ubfmw(r14, r12, 12, 17);
11771 __ ubfmw(r13, r12, 6, 11);
11772 __ andw(r12, r12, 63);
11773 // get the code based on the codec
11774 __ ldrb(r15, Address(codec, r15, Address::uxtw(0)));
11775 __ ldrb(r14, Address(codec, r14, Address::uxtw(0)));
11776 __ ldrb(r13, Address(codec, r13, Address::uxtw(0)));
11777 __ ldrb(r12, Address(codec, r12, Address::uxtw(0)));
// Emit the characters most-significant field first.
11778 __ strb(r15, __ post(dst, 1));
11779 __ strb(r14, __ post(dst, 1));
11780 __ strb(r13, __ post(dst, 1));
11781 __ strb(r12, __ post(dst, 1));
11782 __ sub(length, length, 3);
11783 __ cbnz(length, Process3B);
11784
11785 __ BIND(Exit);
11786 __ ret(lr);
11787
// Generation-time bookkeeping (not emitted code).
11788 // record the stub entry and end
11789 store_archive_data(stub_id, start, __ pc());
11790
11791 return start;
11792 }
11793
// Emits one SIMD round of Base64 decoding: reads 4 * size encoded bytes and,
// when all of them are legal, writes 3 * size decoded bytes. src and dst are
// post-incremented by the emitted code.
//
//   src, dst - source / destination pointers
//   codecL   - first of four vector registers holding the lower-half table
//   codecH   - first of four vector registers holding the higher-half table
//   size     - 16 selects the T16B arrangement; any other value selects T8B
//   Exit     - label branched to once an illegal input byte is reached; the
//              legal 3-byte groups preceding it are stored first
//
// Relies on v27 as the comparison/subtraction operand (the decodeBlock
// generator in this file fills every byte of v27 with 63 before calling).
// Uses v16-v26, v28-v31, r10-r13 and rscratch2 as temporaries.
11794 void generate_base64_decode_simdround(Register src, Register dst,
11795 FloatRegister codecL, FloatRegister codecH, int size, Label& Exit) {
11796
11797 FloatRegister in0 = v16, in1 = v17, in2 = v18, in3 = v19;
11798 FloatRegister out0 = v20, out1 = v21, out2 = v22;
11799
11800 FloatRegister decL0 = v23, decL1 = v24, decL2 = v25, decL3 = v26;
11801 FloatRegister decH0 = v28, decH1 = v29, decH2 = v30, decH3 = v31;
11802
11803 Label NoIllegalData, ErrorInLowerHalf, StoreLegalData;
11804
11805 Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
11806
// De-interleaving load: in0..in3 receive the 1st..4th character of every
// 4-character group.
11807 __ ld4(in0, in1, in2, in3, arrangement, __ post(src, 4 * size));
11808
11809 // we need unsigned saturating subtract, to make sure all input values
11810 // in range [0, 63] will have 0U value in the higher half lookup
// NOTE(review): these four use T16B even when 'arrangement' is T8B; the
// later instructions consume the results with 'arrangement'.
11811 __ uqsubv(decH0, __ T16B, in0, v27);
11812 __ uqsubv(decH1, __ T16B, in1, v27);
11813 __ uqsubv(decH2, __ T16B, in2, v27);
11814 __ uqsubv(decH3, __ T16B, in3, v27);
11815
11816 // lower half lookup
11817 __ tbl(decL0, arrangement, codecL, 4, in0);
11818 __ tbl(decL1, arrangement, codecL, 4, in1);
11819 __ tbl(decL2, arrangement, codecL, 4, in2);
11820 __ tbl(decL3, arrangement, codecL, 4, in3);
11821
11822 // higher half lookup
11823 __ tbx(decH0, arrangement, codecH, 4, decH0);
11824 __ tbx(decH1, arrangement, codecH, 4, decH1);
11825 __ tbx(decH2, arrangement, codecH, 4, decH2);
11826 __ tbx(decH3, arrangement, codecH, 4, decH3);
11827
11828 // combine lower and higher
11829 __ orr(decL0, arrangement, decL0, decH0);
11830 __ orr(decL1, arrangement, decL1, decH1);
11831 __ orr(decL2, arrangement, decL2, decH2);
11832 __ orr(decL3, arrangement, decL3, decH3);
11833
11834 // check illegal inputs, value larger than 63 (maximum of 6 bits)
// decH0..decH3 are reused as per-lane masks; in2 becomes the OR of all four
// masks, and rscratch2 receives the maximum mask byte (zero == all legal).
11835 __ cm(Assembler::HI, decH0, arrangement, decL0, v27);
11836 __ cm(Assembler::HI, decH1, arrangement, decL1, v27);
11837 __ cm(Assembler::HI, decH2, arrangement, decL2, v27);
11838 __ cm(Assembler::HI, decH3, arrangement, decL3, v27);
11839 __ orr(in0, arrangement, decH0, decH1);
11840 __ orr(in1, arrangement, decH2, decH3);
11841 __ orr(in2, arrangement, in0, in1);
11842 __ umaxv(in3, arrangement, in2);
11843 __ umov(rscratch2, in3, __ B, 0);
11844
11845 // get the data to output
// out0 = (decL0 << 2) | (decL1 >> 4)
// out1 = (decL1 << 4) | (decL2 >> 2)
// out2 = (decL2 << 6) |  decL3          (byte lanes)
11846 __ shl(out0, arrangement, decL0, 2);
11847 __ ushr(out1, arrangement, decL1, 4);
11848 __ orr(out0, arrangement, out0, out1);
11849 __ shl(out1, arrangement, decL1, 4);
11850 __ ushr(out2, arrangement, decL2, 2);
11851 __ orr(out1, arrangement, out1, out2);
11852 __ shl(out2, arrangement, decL2, 6);
11853 __ orr(out2, arrangement, out2, decL3);
11854
11855 __ cbz(rscratch2, NoIllegalData);
11856
11857 // handle illegal input
// r10 = mask bytes for lanes 0-7.
11858 __ umov(r10, in2, __ D, 0);
11859 if (size == 16) {
11860 __ cbnz(r10, ErrorInLowerHalf);
11861
11862 // illegal input is in higher half, store the lower half now.
11863 __ st3(out0, out1, out2, __ T8B, __ post(dst, 24));
11864
// Continue with the mask and output bytes of lanes 8-15.
11865 __ umov(r10, in2, __ D, 1);
11866 __ umov(r11, out0, __ D, 1);
11867 __ umov(r12, out1, __ D, 1);
11868 __ umov(r13, out2, __ D, 1);
11869 __ b(StoreLegalData);
11870
11871 __ BIND(ErrorInLowerHalf);
11872 }
// Output bytes of lanes 0-7 (the half that contains the illegal input).
11873 __ umov(r11, out0, __ D, 0);
11874 __ umov(r12, out1, __ D, 0);
11875 __ umov(r13, out2, __ D, 0);
11876
// Byte-by-byte tail: while the lowest mask byte in r10 is clear, store one
// decoded byte from each of r11, r12, r13, then shift all four registers
// right by 8. The loop leaves through Exit at the first flagged lane.
11877 __ BIND(StoreLegalData);
11878 __ tbnz(r10, 5, Exit); // 0xff indicates illegal input
11879 __ strb(r11, __ post(dst, 1));
11880 __ strb(r12, __ post(dst, 1));
11881 __ strb(r13, __ post(dst, 1));
11882 __ lsr(r10, r10, 8);
11883 __ lsr(r11, r11, 8);
11884 __ lsr(r12, r12, 8);
11885 __ lsr(r13, r13, 8);
11886 __ b(StoreLegalData);
11887
// Fast path: everything legal, store all 3 * size decoded bytes at once.
11888 __ BIND(NoIllegalData);
11889 __ st3(out0, out1, out2, arrangement, __ post(dst, 3 * size));
11890 }
11891
11892
// Emits the Base64 decodeBlock stub and returns its entry address. The
// generated code returns, in c_rarg0, the number of bytes written to the
// destination (dst pointer at exit minus dst pointer at start).
//
// Flow of the generated code:
//   - inputs shorter than 144 bytes are decoded entirely by the scalar
//     4-bytes-at-a-time loop;
//   - otherwise the first 80 bytes go through the scalar loop, then 64-byte
//     and 32-byte SIMD rounds are used, and any remainder returns to the
//     scalar loop.
// Decoding stops at the first illegal input character.
11893 /**
11894 * Arguments:
11895 *
11896 * Input:
11897 * c_rarg0 - src_start
11898 * c_rarg1 - src_offset
11899 * c_rarg2 - src_length
11900 * c_rarg3 - dest_start
11901 * c_rarg4 - dest_offset
11902 * c_rarg5 - isURL
11903 * c_rarg6 - isMIME
11904 *
11905 */
11906 address generate_base64_decodeBlock() {
11907
11908 // The SIMD part of this Base64 decode intrinsic is based on the algorithm outlined
11909 // on http://0x80.pl/articles/base64-simd-neon.html#encoding-quadwords, in section
11910 // titled "Base64 decoding".
11911
11912 StubId stub_id = StubId::stubgen_base64_decodeBlock_id;
11913 int entry_count = StubInfo::entry_count(stub_id);
11914 assert(entry_count == 1, "sanity check");
// If load_archive_data already yields an entry for this stub, return it and
// emit nothing (presumably a previously archived copy -- confirm).
11915 address start = load_archive_data(stub_id);
11916 if (start != nullptr) {
11917 return start;
11918 }
11919 __ align(CodeEntryAlignment);
11920 StubCodeMark mark(this, stub_id);
11921 start = __ pc();
11922
11923 Register src = c_rarg0; // source array
11924 Register soff = c_rarg1; // source start offset
11925 Register send = c_rarg2; // source end offset
11926 Register dst = c_rarg3; // dest array
11927 Register doff = c_rarg4; // position for writing to dest array
11928 Register isURL = c_rarg5; // Base64 or URL character set
11929 Register isMIME = c_rarg6; // Decoding MIME block - unused in this implementation
11930
11931 Register length = send; // reuse send as length of source data to process
11932
// simd_codec shares c_rarg6 with the unused isMIME argument.
11933 Register simd_codec = c_rarg6;
11934 Register nosimd_codec = c_rarg7;
11935
11936 Label ProcessData, Process64B, Process32B, Process4B, SIMDEnter, SIMDExit, Exit;
11937
11938 __ enter();
11939
11940 __ add(src, src, soff);
11941 __ add(dst, dst, doff);
11942
// Remember the initial destination pointer; doff is reused for this and is
// subtracted from dst at Exit to form the return value.
11943 __ mov(doff, dst);
11944
// length = send - soff with the two low bits cleared, i.e. rounded down to a
// multiple of 4.
11945 __ sub(length, send, soff);
11946 __ bfm(length, zr, 0, 1);
11947
// Scalar lookup table: standard by default, URL variant when isURL != 0.
11948 __ lea(nosimd_codec, ExternalAddress((address) _decodeBlock_fromBase64ForNoSIMD));
11949 __ cbz(isURL, ProcessData);
11950 __ lea(nosimd_codec, ExternalAddress((address) _decodeBlock_fromBase64URLForNoSIMD));
11951
// rscratch1 is the scalar loop counter: the whole length for short inputs,
// or 79 (overwritten below) for the 80-byte pre-processing pass.
11952 __ BIND(ProcessData);
11953 __ mov(rscratch1, length);
11954 __ cmp(length, (u1)144); // 144 = 80 + 64
11955 __ br(Assembler::LT, Process4B);
11956
11957 // In the MIME case, the line length cannot be more than 76
11958 // bytes (see RFC 2045). This is too short a block for SIMD
11959 // to be worthwhile, so we use non-SIMD here.
11960 __ movw(rscratch1, 79);
11961
// Scalar loop: 4 input characters -> 3 output bytes per iteration.
11962 __ BIND(Process4B);
11963 __ ldrw(r14, __ post(src, 4));
11964 __ ubfxw(r10, r14, 0, 8);
11965 __ ubfxw(r11, r14, 8, 8);
11966 __ ubfxw(r12, r14, 16, 8);
11967 __ ubfxw(r13, r14, 24, 8);
11968 // get the de-code
11969 __ ldrb(r10, Address(nosimd_codec, r10, Address::uxtw(0)));
11970 __ ldrb(r11, Address(nosimd_codec, r11, Address::uxtw(0)));
11971 __ ldrb(r12, Address(nosimd_codec, r12, Address::uxtw(0)));
11972 __ ldrb(r13, Address(nosimd_codec, r13, Address::uxtw(0)));
11973 // error detection, 255u indicates an illegal input
// Bit 7 of the OR of the four decoded values is set if any of them is 255.
11974 __ orrw(r14, r10, r11);
11975 __ orrw(r15, r12, r13);
11976 __ orrw(r14, r14, r15);
11977 __ tbnz(r14, 7, Exit);
11978 // recover the data
// r14 is assembled into the first two output bytes (stored with strh after
// the rev16w); r13 becomes the third output byte.
11979 __ lslw(r14, r10, 10);
11980 __ bfiw(r14, r11, 4, 6);
11981 __ bfmw(r14, r12, 2, 5);
11982 __ rev16w(r14, r14);
11983 __ bfiw(r13, r12, 6, 2);
11984 __ strh(r14, __ post(dst, 2));
11985 __ strb(r13, __ post(dst, 1));
11986 // non-simd loop
11987 __ subsw(rscratch1, rscratch1, 4);
11988 __ br(Assembler::GT, Process4B);
11989
11990 // if exiting from the 80-byte pre-processing pass (rscratch1 preset to 79), rscratch1 == -1;
11991 // otherwise, rscratch1 == 0.
11992 __ cbzw(rscratch1, Exit);
// 80 bytes were consumed by the pre-processing pass.
11993 __ sub(length, length, 80);
11994
// SIMD lookup tables: standard by default, URL variant when isURL != 0.
11995 __ lea(simd_codec, ExternalAddress((address) _decodeBlock_fromBase64ForSIMD));
11996 __ cbz(isURL, SIMDEnter);
11997 __ lea(simd_codec, ExternalAddress((address) _decodeBlock_fromBase64URLForSIMD));
11998
// v0-v3 = first 64 table bytes, v4-v7 = next 64 table bytes,
// v27 = 63 replicated into every byte (used by the SIMD round).
11999 __ BIND(SIMDEnter);
12000 __ ld1(v0, v1, v2, v3, __ T16B, __ post(simd_codec, 64));
12001 __ ld1(v4, v5, v6, v7, __ T16B, Address(simd_codec));
12002 __ mov(rscratch1, 63);
12003 __ dup(v27, __ T16B, rscratch1);
12004
// 64 input bytes per round while at least 64 remain.
12005 __ BIND(Process64B);
12006 __ cmp(length, (u1)64);
12007 __ br(Assembler::LT, Process32B);
12008 generate_base64_decode_simdround(src, dst, v0, v4, 16, Exit);
12009 __ sub(length, length, 64);
12010 __ b(Process64B);
12011
// 32 input bytes per round while at least 32 remain.
12012 __ BIND(Process32B);
12013 __ cmp(length, (u1)32);
12014 __ br(Assembler::LT, SIMDExit);
12015 generate_base64_decode_simdround(src, dst, v0, v4, 8, Exit);
12016 __ sub(length, length, 32);
12017 __ b(Process32B);
12018
// Remaining bytes (fewer than 32) go back through the scalar loop with
// rscratch1 = length, so that loop finishes with rscratch1 == 0.
12019 __ BIND(SIMDExit);
12020 __ cbz(length, Exit);
12021 __ movw(rscratch1, length);
12022 __ b(Process4B);
12023
// Return value: bytes written = current dst - initial dst (saved in doff).
12024 __ BIND(Exit);
12025 __ sub(c_rarg0, dst, doff);
12026
12027 __ leave();
12028 __ ret(lr);
12029
// Generation-time bookkeeping (not emitted code).
12030 // record the stub entry and end
12031 store_archive_data(stub_id, start, __ pc());
12032
12033 return start;
12034 }
12035
12036 // Support for spin waits.
12037 address generate_spin_wait() {
12038 StubId stub_id = StubId::stubgen_spin_wait_id;
12039 int entry_count = StubInfo::entry_count(stub_id);
12040 assert(entry_count == 1, "sanity check");
12041 address start = load_archive_data(stub_id);
12042 if (start != nullptr) {
12043 return start;
12044 }
12045 __ align(CodeEntryAlignment);
12046 StubCodeMark mark(this, stub_id);
12047 start = __ pc();
12048
12049 __ spin_wait();
12050 __ ret(lr);
12051
12052 // record the stub entry and end
12053 store_archive_data(stub_id, start, __ pc());
12054
12055 return start;
12056 }
12057
12058 void generate_lookup_secondary_supers_table_stub() {
12059 StubId stub_id = StubId::stubgen_lookup_secondary_supers_table_id;
12060 GrowableArray<address> entries;
12061 int entry_count = StubInfo::entry_count(stub_id);
12062 assert(entry_count == Klass::SECONDARY_SUPERS_TABLE_SIZE, "sanity check");
12063 address start = load_archive_data(stub_id, &entries);
12064 if (start != nullptr) {
12065 assert(entries.length() == Klass::SECONDARY_SUPERS_TABLE_SIZE - 1,
12066 "unexpected extra entry count %d", entries.length());
12067 StubRoutines::_lookup_secondary_supers_table_stubs[0] = start;
12068 for (int slot = 1; slot < Klass::SECONDARY_SUPERS_TABLE_SIZE; slot++) {
12069 StubRoutines::_lookup_secondary_supers_table_stubs[slot] = entries.at(slot - 1);
12070 }
12071 return;
12072 }
12073
12074 StubCodeMark mark(this, stub_id);
12075
12076 const Register
12077 r_super_klass = r0,
12078 r_array_base = r1,
12079 r_array_length = r2,
12080 r_array_index = r3,
12081 r_sub_klass = r4,
12082 r_bitmap = rscratch2,
12083 result = r5;
12084 const FloatRegister
12085 vtemp = v0;
12086
12087 for (int slot = 0; slot < Klass::SECONDARY_SUPERS_TABLE_SIZE; slot++) {
12088 address next_entry = __ pc();
12089 StubRoutines::_lookup_secondary_supers_table_stubs[slot] = next_entry;
12090 if (slot == 0) {
12091 start = next_entry;
12092 } else {
12093 entries.append(next_entry);
12094 }
12095 Label L_success;
12096 __ enter();
12097 __ lookup_secondary_supers_table_const(r_sub_klass, r_super_klass,
12098 r_array_base, r_array_length, r_array_index,
12099 vtemp, result, slot,
12100 /*stub_is_near*/true);
12101 __ leave();
12102 __ ret(lr);
12103 }
12104 // record the stub entry and end plus all the auxiliary entries
12105 store_archive_data(stub_id, start, __ pc(), &entries);
12106 }
12107
12108 // Slow path implementation for UseSecondarySupersTable.
12109 address generate_lookup_secondary_supers_table_slow_path_stub() {
12110 StubId stub_id = StubId::stubgen_lookup_secondary_supers_table_slow_path_id;
12111 int entry_count = StubInfo::entry_count(stub_id);
12112 assert(entry_count == 1, "sanity check");
12113 address start = load_archive_data(stub_id);
12114 if (start != nullptr) {
12115 return start;
12116 }
12117 StubCodeMark mark(this, stub_id);
12118 start = __ pc();
12119 const Register
12120 r_super_klass = r0, // argument
12121 r_array_base = r1, // argument
12122 temp1 = r2, // temp
12123 r_array_index = r3, // argument
12124 r_bitmap = rscratch2, // argument
12125 result = r5; // argument
12126
12127 __ lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base, r_array_index, r_bitmap, temp1, result);
12128 __ ret(lr);
12129
12130 // record the stub entry and end
12131 store_archive_data(stub_id, start, __ pc());
12132
12133 return start;
12134 }
12135
12136 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
12137
12138 // ARMv8.1 LSE versions of the atomic stubs used by AtomicAccess::PlatformXX.
12139 //
12140 // If LSE is in use, generate LSE versions of all the stubs. The
12141 // non-LSE versions are in atomic_aarch64.S.
12142
12143 // class AtomicStubMark records the entry point of a stub and the
12144 // stub pointer which will point to it. The stub pointer is set to
12145 // the entry point when ~AtomicStubMark() is called, which must be
12146 // after ICache::invalidate_range. This ensures safe publication of
12147 // the generated code.
12148 class AtomicStubMark {
12149 address _entry_point;
12150 aarch64_atomic_stub_t *_stub;
12151 MacroAssembler *_masm;
12152 public:
12153 AtomicStubMark(MacroAssembler *masm, aarch64_atomic_stub_t *stub) {
12154 _masm = masm;
12155 __ align(32);
12156 _entry_point = __ pc();
12157 _stub = stub;
12158 }
12159 ~AtomicStubMark() {
12160 *_stub = (aarch64_atomic_stub_t)_entry_point;
12161 }
12162 };
12163
12164 // NB: For memory_order_conservative we need a trailing membar after
12165 // LSE atomic operations but not a leading membar.
12166 //
12167 // We don't need a leading membar because a clause in the Arm ARM
12168 // says:
12169 //
12170 // Barrier-ordered-before
12171 //
12172 // Barrier instructions order prior Memory effects before subsequent
12173 // Memory effects generated by the same Observer. A read or a write
12174 // RW1 is Barrier-ordered-before a read or a write RW 2 from the same
12175 // Observer if and only if RW1 appears in program order before RW 2
12176 // and [ ... ] at least one of RW 1 and RW 2 is generated by an atomic
12177 // instruction with both Acquire and Release semantics.
12178 //
12179 // All the atomic instructions {ldaddal, swapal, casal} have Acquire
12180 // and Release semantics, therefore we don't need a leading
12181 // barrier. However, there is no corresponding Barrier-ordered-after
12182 // relationship, therefore we need a trailing membar to prevent a
12183 // later store or load from being reordered with the store in an
12184 // atomic instruction.
12185 //
12186 // This was checked by using the herd7 consistency model simulator
12187 // (http://diy.inria.fr/) with this test case:
12188 //
12189 // AArch64 LseCas
12190 // { 0:X1=x; 0:X2=y; 1:X1=x; 1:X2=y; }
12191 // P0 | P1;
12192 // LDR W4, [X2] | MOV W3, #0;
12193 // DMB LD | MOV W4, #1;
12194 // LDR W3, [X1] | CASAL W3, W4, [X1];
12195 // | DMB ISH;
12196 // | STR W4, [X2];
12197 // exists
12198 // (0:X3=0 /\ 0:X4=1)
12199 //
12200 // If X3 == 0 && X4 == 1, the store to y in P1 has been reordered
12201 // with the store to x in P1. Without the DMB in P1 this may happen.
12202 //
12203 // At the time of writing we don't know of any AArch64 hardware that
12204 // reorders stores in this way, but the Reference Manual permits it.
12205
12206 void gen_cas_entry(Assembler::operand_size size,
12207 atomic_memory_order order) {
12208 Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1,
12209 exchange_val = c_rarg2;
12210 bool acquire, release;
12211 switch (order) {
12212 case memory_order_relaxed:
12213 acquire = false;
12214 release = false;
12215 break;
12216 case memory_order_release:
12217 acquire = false;
12218 release = true;
12219 break;
12220 default:
12221 acquire = true;
12222 release = true;
12223 break;
12224 }
12225 __ mov(prev, compare_val);
12226 __ lse_cas(prev, exchange_val, ptr, size, acquire, release, /*not_pair*/true);
12227 if (order == memory_order_conservative) {
12228 __ membar(Assembler::StoreStore|Assembler::StoreLoad);
12229 }
12230 if (size == Assembler::xword) {
12231 __ mov(r0, prev);
12232 } else {
12233 __ movw(r0, prev);
12234 }
12235 __ ret(lr);
12236 }
12237
12238 void gen_ldadd_entry(Assembler::operand_size size, atomic_memory_order order) {
12239 Register prev = r2, addr = c_rarg0, incr = c_rarg1;
12240 // If not relaxed, then default to conservative. Relaxed is the only
12241 // case we use enough to be worth specializing.
12242 if (order == memory_order_relaxed) {
12243 __ ldadd(size, incr, prev, addr);
12244 } else {
12245 __ ldaddal(size, incr, prev, addr);
12246 __ membar(Assembler::StoreStore|Assembler::StoreLoad);
12247 }
12248 if (size == Assembler::xword) {
12249 __ mov(r0, prev);
12250 } else {
12251 __ movw(r0, prev);
12252 }
12253 __ ret(lr);
12254 }
12255
12256 void gen_swpal_entry(Assembler::operand_size size) {
12257 Register prev = r2, addr = c_rarg0, incr = c_rarg1;
12258 __ swpal(size, incr, prev, addr);
12259 __ membar(Assembler::StoreStore|Assembler::StoreLoad);
12260 if (size == Assembler::xword) {
12261 __ mov(r0, prev);
12262 } else {
12263 __ movw(r0, prev);
12264 }
12265 __ ret(lr);
12266 }
12267
// Generates the LSE-based implementations of the out-of-line atomic
// operations and publishes them through the aarch64_atomic_*_impl function
// pointers. Returns immediately, leaving the pointers untouched, when
// UseLSE is off.
//
// All entry points live in one stub. The conservative 4-byte fetch-add is the
// stub's start address; the other entries are kept in `entries`. The
// archive-load path reads `entries` back in exactly the order in which the
// generation path appends them, so the two lists below must stay in step.
void generate_atomic_entry_points() {
  if (! UseLSE) {
    return;
  }
  StubId stub_id = StubId::stubgen_atomic_entry_points_id;
  GrowableArray<address> entries;
  int entry_count = StubInfo::entry_count(stub_id);
  // A non-null result means the stub was obtained from load_archive_data
  // and only the function pointers need to be set up.
  address start = load_archive_data(stub_id, &entries);
  if (start != nullptr) {
    // entries excludes the first entry point, which is `start` itself
    assert(entries.length() == entry_count - 1,
           "unexpected extra entry count %d", entries.length());
    aarch64_atomic_fetch_add_4_impl = (aarch64_atomic_stub_t)start;
    int idx = 0;
    aarch64_atomic_fetch_add_8_impl = (aarch64_atomic_stub_t)entries.at(idx++);
    aarch64_atomic_fetch_add_4_relaxed_impl = (aarch64_atomic_stub_t)entries.at(idx++);
    aarch64_atomic_fetch_add_8_relaxed_impl = (aarch64_atomic_stub_t)entries.at(idx++);
    aarch64_atomic_xchg_4_impl = (aarch64_atomic_stub_t)entries.at(idx++);
    aarch64_atomic_xchg_8_impl = (aarch64_atomic_stub_t)entries.at(idx++);
    aarch64_atomic_cmpxchg_1_impl = (aarch64_atomic_stub_t)entries.at(idx++);
    aarch64_atomic_cmpxchg_4_impl = (aarch64_atomic_stub_t)entries.at(idx++);
    aarch64_atomic_cmpxchg_8_impl = (aarch64_atomic_stub_t)entries.at(idx++);
    aarch64_atomic_cmpxchg_1_relaxed_impl = (aarch64_atomic_stub_t)entries.at(idx++);
    aarch64_atomic_cmpxchg_4_relaxed_impl = (aarch64_atomic_stub_t)entries.at(idx++);
    aarch64_atomic_cmpxchg_8_relaxed_impl = (aarch64_atomic_stub_t)entries.at(idx++);
    aarch64_atomic_cmpxchg_4_release_impl = (aarch64_atomic_stub_t)entries.at(idx++);
    aarch64_atomic_cmpxchg_8_release_impl = (aarch64_atomic_stub_t)entries.at(idx++);
    aarch64_atomic_cmpxchg_4_seq_cst_impl = (aarch64_atomic_stub_t)entries.at(idx++);
    aarch64_atomic_cmpxchg_8_seq_cst_impl = (aarch64_atomic_stub_t)entries.at(idx++);
    assert(idx == entries.length(), "sanity!");
    return;
  }

  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, stub_id);
  start = __ pc();
  address end;
  {
    // ADD, memory_order_conservative
    AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl);
    gen_ldadd_entry(Assembler::word, memory_order_conservative);

    AtomicStubMark mark_fetch_add_8(_masm, &aarch64_atomic_fetch_add_8_impl);
    gen_ldadd_entry(Assembler::xword, memory_order_conservative);

    // ADD, memory_order_relaxed
    AtomicStubMark mark_fetch_add_4_relaxed
      (_masm, &aarch64_atomic_fetch_add_4_relaxed_impl);
    gen_ldadd_entry(MacroAssembler::word, memory_order_relaxed);

    AtomicStubMark mark_fetch_add_8_relaxed
      (_masm, &aarch64_atomic_fetch_add_8_relaxed_impl);
    gen_ldadd_entry(MacroAssembler::xword, memory_order_relaxed);

    // XCHG, memory_order_conservative
    AtomicStubMark mark_xchg_4(_masm, &aarch64_atomic_xchg_4_impl);
    gen_swpal_entry(Assembler::word);

    AtomicStubMark mark_xchg_8(_masm, &aarch64_atomic_xchg_8_impl);
    gen_swpal_entry(Assembler::xword);

    // CAS, memory_order_conservative
    AtomicStubMark mark_cmpxchg_1(_masm, &aarch64_atomic_cmpxchg_1_impl);
    gen_cas_entry(MacroAssembler::byte, memory_order_conservative);

    AtomicStubMark mark_cmpxchg_4(_masm, &aarch64_atomic_cmpxchg_4_impl);
    gen_cas_entry(MacroAssembler::word, memory_order_conservative);

    AtomicStubMark mark_cmpxchg_8(_masm, &aarch64_atomic_cmpxchg_8_impl);
    gen_cas_entry(MacroAssembler::xword, memory_order_conservative);

    // CAS, memory_order_relaxed
    AtomicStubMark mark_cmpxchg_1_relaxed
      (_masm, &aarch64_atomic_cmpxchg_1_relaxed_impl);
    gen_cas_entry(MacroAssembler::byte, memory_order_relaxed);

    AtomicStubMark mark_cmpxchg_4_relaxed
      (_masm, &aarch64_atomic_cmpxchg_4_relaxed_impl);
    gen_cas_entry(MacroAssembler::word, memory_order_relaxed);

    AtomicStubMark mark_cmpxchg_8_relaxed
      (_masm, &aarch64_atomic_cmpxchg_8_relaxed_impl);
    gen_cas_entry(MacroAssembler::xword, memory_order_relaxed);

    // CAS, memory_order_release
    AtomicStubMark mark_cmpxchg_4_release
      (_masm, &aarch64_atomic_cmpxchg_4_release_impl);
    gen_cas_entry(MacroAssembler::word, memory_order_release);

    AtomicStubMark mark_cmpxchg_8_release
      (_masm, &aarch64_atomic_cmpxchg_8_release_impl);
    gen_cas_entry(MacroAssembler::xword, memory_order_release);

    // CAS, memory_order_seq_cst
    AtomicStubMark mark_cmpxchg_4_seq_cst
      (_masm, &aarch64_atomic_cmpxchg_4_seq_cst_impl);
    gen_cas_entry(MacroAssembler::word, memory_order_seq_cst);

    AtomicStubMark mark_cmpxchg_8_seq_cst
      (_masm, &aarch64_atomic_cmpxchg_8_seq_cst_impl);
    gen_cas_entry(MacroAssembler::xword, memory_order_seq_cst);

    end = __ pc();

    ICache::invalidate_range(start, end - start);
    // exit block to force update of AtomicStubMark targets
  }

  assert(start == (address)aarch64_atomic_fetch_add_4_impl,
         "atomic stub should be at start of buffer");
  // record the stub start and end plus all the entries saved by the
  // AtomicStubMark destructor
  // (same order as the reads from `entries` on the archive-load path above)
  entries.append((address)aarch64_atomic_fetch_add_8_impl);
  entries.append((address)aarch64_atomic_fetch_add_4_relaxed_impl);
  entries.append((address)aarch64_atomic_fetch_add_8_relaxed_impl);
  entries.append((address)aarch64_atomic_xchg_4_impl);
  entries.append((address)aarch64_atomic_xchg_8_impl);
  entries.append((address)aarch64_atomic_cmpxchg_1_impl);
  entries.append((address)aarch64_atomic_cmpxchg_4_impl);
  entries.append((address)aarch64_atomic_cmpxchg_8_impl);
  entries.append((address)aarch64_atomic_cmpxchg_1_relaxed_impl);
  entries.append((address)aarch64_atomic_cmpxchg_4_relaxed_impl);
  entries.append((address)aarch64_atomic_cmpxchg_8_relaxed_impl);
  entries.append((address)aarch64_atomic_cmpxchg_4_release_impl);
  entries.append((address)aarch64_atomic_cmpxchg_8_release_impl);
  entries.append((address)aarch64_atomic_cmpxchg_4_seq_cst_impl);
  entries.append((address)aarch64_atomic_cmpxchg_8_seq_cst_impl);

  assert(entries.length() == entry_count - 1,
         "unexpected extra entry count %d", entries.length());

  store_archive_data(stub_id, start, end, &entries);
}
12398 #endif // LINUX
12399
// Emits the thaw code shared by the continuation thaw stubs and returns the
// address of the first emitted instruction. `kind` selects the variant:
//   - return barrier variants first reset sp to the continuation entry and
//     preserve r0/v0 (the possible return value) across both runtime calls;
//   - the return-barrier-exception variant finishes by looking up the
//     exception handler for the thawed frame's return address and jumping
//     to it instead of returning.
// This function does not create a StubCodeMark or touch the stub archive.
address generate_cont_thaw(Continuation::thaw_kind kind) {
  bool return_barrier = Continuation::is_thaw_return_barrier(kind);
  bool return_barrier_exception = Continuation::is_thaw_return_barrier_exception(kind);

  address start = __ pc();

  if (return_barrier) {
    // sp cannot be loaded directly, so go through rscratch1
    __ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset()));
    __ mov(sp, rscratch1);
  }
  assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");

  if (return_barrier) {
    // preserve possible return value from a method returning to the return barrier
    __ fmovd(rscratch1, v0);
    __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize)));
  }

  __ movw(c_rarg1, (return_barrier ? 1 : 0));
  __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), rthread, c_rarg1);
  __ mov(rscratch2, r0); // r0 contains the size of the frames to thaw, 0 if overflow or no more frames

  if (return_barrier) {
    // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
    __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize)));
    __ fmovd(v0, rscratch1);
  }
  assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");


  Label thaw_success;
  // rscratch2 contains the size of the frames to thaw, 0 if overflow or no more frames
  __ cbnz(rscratch2, thaw_success);
  // size == 0: branch to the StackOverflowError throwing entry
  __ lea(rscratch1, RuntimeAddress(SharedRuntime::throw_StackOverflowError_entry()));
  __ br(rscratch1);
  __ bind(thaw_success);

  // make room for the thawed frames
  __ sub(rscratch1, sp, rscratch2);
  __ andr(rscratch1, rscratch1, -16); // align
  __ mov(sp, rscratch1);

  if (return_barrier) {
    // save original return value -- again
    __ fmovd(rscratch1, v0);
    __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize)));
  }

  // If we want, we can templatize thaw by kind, and have three different entries
  __ movw(c_rarg1, (uint32_t)kind);

  __ call_VM_leaf(Continuation::thaw_entry(), rthread, c_rarg1);
  __ mov(rscratch2, r0); // r0 is the sp of the yielding frame

  if (return_barrier) {
    // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
    __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize)));
    __ fmovd(v0, rscratch1);
  } else {
    __ mov(r0, zr); // return 0 (success) from doYield
  }

  // we're now on the yield frame (which is in an address above us b/c rsp has been pushed down)
  __ sub(sp, rscratch2, 2*wordSize); // now pointing to rfp spill
  __ mov(rfp, sp);

  if (return_barrier_exception) {
    __ ldr(c_rarg1, Address(rfp, wordSize)); // return address
    __ authenticate_return_address(c_rarg1);
    __ verify_oop(r0);
    // save return value containing the exception oop in callee-saved R19
    __ mov(r19, r0);

    __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), rthread, c_rarg1);

    // Reinitialize the ptrue predicate register, in case the external runtime call clobbers ptrue reg, as we may return to SVE compiled code.
    // __ reinitialize_ptrue();

    // see OptoRuntime::generate_exception_blob: r0 -- exception oop, r3 -- exception pc

    __ mov(r1, r0); // the exception handler
    __ mov(r0, r19); // restore return value containing the exception oop
    __ verify_oop(r0);

    __ leave();
    __ mov(r3, lr); // exception pc, taken from lr after leave()
    __ br(r1); // the exception handler
  } else {
    // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame
    __ leave();
    __ ret(lr);
  }

  return start;
}
12495
12496 address generate_cont_thaw() {
12497 if (!Continuations::enabled()) return nullptr;
12498
12499 StubId stub_id = StubId::stubgen_cont_thaw_id;
12500 int entry_count = StubInfo::entry_count(stub_id);
12501 assert(entry_count == 1, "sanity check");
12502 address start = load_archive_data(stub_id);
12503 if (start != nullptr) {
12504 return start;
12505 }
12506 StubCodeMark mark(this, stub_id);
12507 start = __ pc();
12508 generate_cont_thaw(Continuation::thaw_top);
12509
12510 // record the stub start and end
12511 store_archive_data(stub_id, start, __ pc());
12512
12513 return start;
12514 }
12515
12516 address generate_cont_returnBarrier() {
12517 if (!Continuations::enabled()) return nullptr;
12518
12519 // TODO: will probably need multiple return barriers depending on return type
12520 StubId stub_id = StubId::stubgen_cont_returnBarrier_id;
12521 int entry_count = StubInfo::entry_count(stub_id);
12522 assert(entry_count == 1, "sanity check");
12523 address start = load_archive_data(stub_id);
12524 if (start != nullptr) {
12525 return start;
12526 }
12527 StubCodeMark mark(this, stub_id);
12528 start = __ pc();
12529
12530 generate_cont_thaw(Continuation::thaw_return_barrier);
12531
12532 // record the stub start and end
12533 store_archive_data(stub_id, start, __ pc());
12534
12535 return start;
12536 }
12537
12538 address generate_cont_returnBarrier_exception() {
12539 if (!Continuations::enabled()) return nullptr;
12540
12541 StubId stub_id = StubId::stubgen_cont_returnBarrierExc_id;
12542 int entry_count = StubInfo::entry_count(stub_id);
12543 assert(entry_count == 1, "sanity check");
12544 address start = load_archive_data(stub_id);
12545 if (start != nullptr) {
12546 return start;
12547 }
12548 StubCodeMark mark(this, stub_id);
12549 start = __ pc();
12550
12551 generate_cont_thaw(Continuation::thaw_return_barrier_exception);
12552
12553 // record the stub start and end
12554 store_archive_data(stub_id, start, __ pc());
12555
12556 return start;
12557 }
12558
12559 address generate_cont_preempt_stub() {
12560 if (!Continuations::enabled()) return nullptr;
12561 StubId stub_id = StubId::stubgen_cont_preempt_id;
12562 int entry_count = StubInfo::entry_count(stub_id);
12563 assert(entry_count == 1, "sanity check");
12564 address start = load_archive_data(stub_id);
12565 if (start != nullptr) {
12566 return start;
12567 }
12568 StubCodeMark mark(this, stub_id);
12569 start = __ pc();
12570
12571 __ reset_last_Java_frame(true);
12572
12573 // Set sp to enterSpecial frame, i.e. remove all frames copied into the heap.
12574 __ ldr(rscratch2, Address(rthread, JavaThread::cont_entry_offset()));
12575 __ mov(sp, rscratch2);
12576
12577 Label preemption_cancelled;
12578 __ ldrb(rscratch1, Address(rthread, JavaThread::preemption_cancelled_offset()));
12579 __ cbnz(rscratch1, preemption_cancelled);
12580
12581 // Remove enterSpecial frame from the stack and return to Continuation.run() to unmount.
12582 SharedRuntime::continuation_enter_cleanup(_masm);
12583 __ leave();
12584 __ ret(lr);
12585
12586 // We acquired the monitor after freezing the frames so call thaw to continue execution.
12587 __ bind(preemption_cancelled);
12588 __ strb(zr, Address(rthread, JavaThread::preemption_cancelled_offset()));
12589 __ lea(rfp, Address(sp, checked_cast<int32_t>(ContinuationEntry::size())));
12590 __ lea(rscratch1, ExternalAddress(ContinuationEntry::thaw_call_pc_address()));
12591 __ ldr(rscratch1, Address(rscratch1));
12592 __ br(rscratch1);
12593
12594 // record the stub start and end
12595 store_archive_data(stub_id, start, __ pc());
12596
12597 return start;
12598 }
12599
// In sun.security.util.math.intpoly.IntegerPolynomial1305, integers
// are represented as long[5], with BITS_PER_LIMB = 26.
// Pack five 26-bit limbs into three 64-bit registers.
//
//   src   - address of the five 64-bit limbs
//   dest0 - receives limb0 + (limb1 << 26) + (limb2 << 52)
//   dest1 - receives (limb2 >> 12) + (limb3 << 14) + (limb4 << 40)
//   dest2 - receives limb4 >> 24; may be noreg, in which case nothing is
//           stored and debug (ASSERT) builds stop if those bits are non-zero
// Clobbers rscratch1 and rscratch2. The per-line "N bits" notes give the
// number of bits each term contributes to the destination word.
void pack_26(Register dest0, Register dest1, Register dest2, Register src) {
  __ ldp(dest0, rscratch1, Address(src, 0));     // 26 bits
  __ add(dest0, dest0, rscratch1, Assembler::LSL, 26);  // 26 bits
  __ ldp(rscratch1, rscratch2, Address(src, 2 * sizeof (jlong)));
  __ add(dest0, dest0, rscratch1, Assembler::LSL, 52);  // 12 bits

  __ add(dest1, zr, rscratch1, Assembler::LSR, 12);     // 14 bits
  __ add(dest1, dest1, rscratch2, Assembler::LSL, 14);  // 26 bits
  __ ldr(rscratch1, Address(src, 4 * sizeof (jlong)));
  __ add(dest1, dest1, rscratch1, Assembler::LSL, 40);  // 24 bits

  if (dest2->is_valid()) {
    __ add(dest2, zr, rscratch1, Assembler::LSR, 24);     // 2 bits
  } else {
#ifdef ASSERT
    // No register for the top bits: check that they are zero.
    Label OK;
    __ cmp(zr, rscratch1, Assembler::LSR, 24);     // 2 bits
    __ br(__ EQ, OK);
    __ stop("high bits of Poly1305 integer should be zero");
    __ should_not_reach_here();
    __ bind(OK);
#endif
  }
}
12627
12628 // As above, but return only a 128-bit integer, packed into two
12629 // 64-bit registers.
12630 void pack_26(Register dest0, Register dest1, Register src) {
12631 pack_26(dest0, dest1, noreg, src);
12632 }
12633
12634 // Multiply and multiply-accumulate unsigned 64-bit registers.
12635 void wide_mul(Register prod_lo, Register prod_hi, Register n, Register m) {
12636 __ mul(prod_lo, n, m);
12637 __ umulh(prod_hi, n, m);
12638 }
12639 void wide_madd(Register sum_lo, Register sum_hi, Register n, Register m) {
12640 wide_mul(rscratch1, rscratch2, n, m);
12641 __ adds(sum_lo, sum_lo, rscratch1);
12642 __ adc(sum_hi, sum_hi, rscratch2);
12643 }
12644
12645 // Poly1305, RFC 7539
12646
12647 // See https://loup-vaillant.fr/tutorials/poly1305-design for a
12648 // description of the tricks used to simplify and accelerate this
12649 // computation.
12650
12651 address generate_poly1305_processBlocks() {
12652 StubId stub_id = StubId::stubgen_poly1305_processBlocks_id;
12653 int entry_count = StubInfo::entry_count(stub_id);
12654 assert(entry_count == 1, "sanity check");
12655 address start = load_archive_data(stub_id);
12656 if (start != nullptr) {
12657 return start;
12658 }
12659 __ align(CodeEntryAlignment);
12660 StubCodeMark mark(this, stub_id);
12661 start = __ pc();
12662 Label here;
12663 __ enter();
12664 RegSet callee_saved = RegSet::range(r19, r28);
12665 __ push(callee_saved, sp);
12666
12667 RegSetIterator<Register> regs = (RegSet::range(c_rarg0, r28) - r18_tls - rscratch1 - rscratch2).begin();
12668
12669 // Arguments
12670 const Register input_start = *regs, length = *++regs, acc_start = *++regs, r_start = *++regs;
12671
12672 // R_n is the 128-bit randomly-generated key, packed into two
12673 // registers. The caller passes this key to us as long[5], with
12674 // BITS_PER_LIMB = 26.
12675 const Register R_0 = *++regs, R_1 = *++regs;
12676 pack_26(R_0, R_1, r_start);
12677
12678 // RR_n is (R_n >> 2) * 5
12679 const Register RR_0 = *++regs, RR_1 = *++regs;
12680 __ lsr(RR_0, R_0, 2);
12681 __ add(RR_0, RR_0, RR_0, Assembler::LSL, 2);
12682 __ lsr(RR_1, R_1, 2);
12683 __ add(RR_1, RR_1, RR_1, Assembler::LSL, 2);
12684
12685 // U_n is the current checksum
12686 const Register U_0 = *++regs, U_1 = *++regs, U_2 = *++regs;
12687 pack_26(U_0, U_1, U_2, acc_start);
12688
12689 static constexpr int BLOCK_LENGTH = 16;
12690 Label DONE, LOOP;
12691
12692 __ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
12693 __ br(Assembler::LT, DONE); {
12694 __ bind(LOOP);
12695
12696 // S_n is to be the sum of U_n and the next block of data
12697 const Register S_0 = *++regs, S_1 = *++regs, S_2 = *++regs;
12698 __ ldp(S_0, S_1, __ post(input_start, 2 * wordSize));
12699 __ adds(S_0, U_0, S_0);
12700 __ adcs(S_1, U_1, S_1);
12701 __ adc(S_2, U_2, zr);
12702 __ add(S_2, S_2, 1);
12703
12704 const Register U_0HI = *++regs, U_1HI = *++regs;
12705
12706 // NB: this logic depends on some of the special properties of
12707 // Poly1305 keys. In particular, because we know that the top
12708 // four bits of R_0 and R_1 are zero, we can add together
12709 // partial products without any risk of needing to propagate a
12710 // carry out.
12711 wide_mul(U_0, U_0HI, S_0, R_0); wide_madd(U_0, U_0HI, S_1, RR_1); wide_madd(U_0, U_0HI, S_2, RR_0);
12712 wide_mul(U_1, U_1HI, S_0, R_1); wide_madd(U_1, U_1HI, S_1, R_0); wide_madd(U_1, U_1HI, S_2, RR_1);
12713 __ andr(U_2, R_0, 3);
12714 __ mul(U_2, S_2, U_2);
12715
12716 // Recycle registers S_0, S_1, S_2
12717 regs = (regs.remaining() + S_0 + S_1 + S_2).begin();
12718
12719 // Partial reduction mod 2**130 - 5
12720 __ adds(U_1, U_0HI, U_1);
12721 __ adc(U_2, U_1HI, U_2);
12722 // Sum now in U_2:U_1:U_0.
12723 // Dead: U_0HI, U_1HI.
12724 regs = (regs.remaining() + U_0HI + U_1HI).begin();
12725
12726 // U_2:U_1:U_0 += (U_2 >> 2) * 5 in two steps
12727
12728 // First, U_2:U_1:U_0 += (U_2 >> 2)
12729 __ lsr(rscratch1, U_2, 2);
12730 __ andr(U_2, U_2, (u8)3);
12731 __ adds(U_0, U_0, rscratch1);
12732 __ adcs(U_1, U_1, zr);
12733 __ adc(U_2, U_2, zr);
12734 // Second, U_2:U_1:U_0 += (U_2 >> 2) << 2
12735 __ adds(U_0, U_0, rscratch1, Assembler::LSL, 2);
12736 __ adcs(U_1, U_1, zr);
12737 __ adc(U_2, U_2, zr);
12738
12739 __ sub(length, length, checked_cast<u1>(BLOCK_LENGTH));
12740 __ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
12741 __ br(~ Assembler::LT, LOOP);
12742 }
12743
12744 // Further reduce modulo 2^130 - 5
12745 __ lsr(rscratch1, U_2, 2);
12746 __ add(rscratch1, rscratch1, rscratch1, Assembler::LSL, 2); // rscratch1 = U_2 * 5
12747 __ adds(U_0, U_0, rscratch1); // U_0 += U_2 * 5
12748 __ adcs(U_1, U_1, zr);
12749 __ andr(U_2, U_2, (u1)3);
12750 __ adc(U_2, U_2, zr);
12751
12752 // Unpack the sum into five 26-bit limbs and write to memory.
12753 __ ubfiz(rscratch1, U_0, 0, 26);
12754 __ ubfx(rscratch2, U_0, 26, 26);
12755 __ stp(rscratch1, rscratch2, Address(acc_start));
12756 __ ubfx(rscratch1, U_0, 52, 12);
12757 __ bfi(rscratch1, U_1, 12, 14);
12758 __ ubfx(rscratch2, U_1, 14, 26);
12759 __ stp(rscratch1, rscratch2, Address(acc_start, 2 * sizeof (jlong)));
12760 __ ubfx(rscratch1, U_1, 40, 24);
12761 __ bfi(rscratch1, U_2, 24, 3);
12762 __ str(rscratch1, Address(acc_start, 4 * sizeof (jlong)));
12763
12764 __ bind(DONE);
12765 __ pop(callee_saved, sp);
12766 __ leave();
12767 __ ret(lr);
12768
12769 // record the stub start and end
12770 store_archive_data(stub_id, start, __ pc());
12771
12772 return start;
12773 }
12774
12775 // exception handler for upcall stubs
12776 address generate_upcall_stub_exception_handler() {
12777 StubId stub_id = StubId::stubgen_upcall_stub_exception_handler_id;
12778 int entry_count = StubInfo::entry_count(stub_id);
12779 assert(entry_count == 1, "sanity check");
12780 address start = load_archive_data(stub_id);
12781 if (start != nullptr) {
12782 return start;
12783 }
12784 StubCodeMark mark(this, stub_id);
12785 start = __ pc();
12786
12787 // Native caller has no idea how to handle exceptions,
12788 // so we just crash here. Up to callee to catch exceptions.
12789 __ verify_oop(r0);
12790 __ movptr(rscratch1, CAST_FROM_FN_PTR(uint64_t, UpcallLinker::handle_uncaught_exception));
12791 __ blr(rscratch1);
12792 __ should_not_reach_here();
12793
12794 // record the stub start and end
12795 store_archive_data(stub_id, start, __ pc());
12796
12797 return start;
12798 }
12799
12800 // load Method* target of MethodHandle
12801 // j_rarg0 = jobject receiver
12802 // rmethod = result
12803 address generate_upcall_stub_load_target() {
12804 StubId stub_id = StubId::stubgen_upcall_stub_load_target_id;
12805 int entry_count = StubInfo::entry_count(stub_id);
12806 assert(entry_count == 1, "sanity check");
12807 address start = load_archive_data(stub_id);
12808 if (start != nullptr) {
12809 return start;
12810 }
12811 StubCodeMark mark(this, stub_id);
12812 start = __ pc();
12813
12814 __ resolve_global_jobject(j_rarg0, rscratch1, rscratch2);
12815 // Load target method from receiver
12816 __ load_heap_oop(rmethod, Address(j_rarg0, java_lang_invoke_MethodHandle::form_offset()), rscratch1, rscratch2);
12817 __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_LambdaForm::vmentry_offset()), rscratch1, rscratch2);
12818 __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_MemberName::method_offset()), rscratch1, rscratch2);
12819 __ access_load_at(T_ADDRESS, IN_HEAP, rmethod,
12820 Address(rmethod, java_lang_invoke_ResolvedMethodName::vmtarget_offset()),
12821 noreg, noreg);
12822 __ str(rmethod, Address(rthread, JavaThread::callee_target_offset())); // just in case callee is deoptimized
12823
12824 __ ret(lr);
12825
12826 // record the stub start and end
12827 store_archive_data(stub_id, start, __ pc());
12828
12829 return start;
12830 }
12831
12832 #undef __
12833 #define __ masm->
12834
12835 class MontgomeryMultiplyGenerator : public MacroAssembler {
12836
// Register assignments, all made by the constructor:
//   Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen - argument registers
//   Ra, Rb, Rm, Rn                 - current digit of a, b, m and n
//   Pa, Pb, Pm, Pn                 - pointers to the current/next digit
//   t0, t1, t2                     - triple-precision accumulator
//   Ri, Rj                         - loop indexes
//   Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn - high/low halves of the a*b and m*n products
Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
  Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;

RegSet _toSave;   // registers pushed by save_regs() and popped by restore_regs()
bool _squaring;   // copy of the constructor's `squaring` argument
12842
12843 public:
// Builds a generator that emits into the code buffer of `as`.
// Registers are handed out sequentially from r0..r26 (skipping r18), so the
// order of the assignments below fixes which register each name denotes.
// When `squaring` is true, Pb_base aliases Pa_base and no separate register
// is consumed for it.
MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
  : MacroAssembler(as->code()), _squaring(squaring) {

  // Register allocation

  RegSetIterator<Register> regs = (RegSet::range(r0, r26) - r18_tls).begin();
  Pa_base = *regs;       // Argument registers
  if (squaring)
    Pb_base = Pa_base;
  else
    Pb_base = *++regs;
  Pn_base = *++regs;
  Rlen= *++regs;
  inv = *++regs;
  Pm_base = *++regs;

                        // Working registers:
  Ra =  *++regs;        // The current digit of a, b, n, and m.
  Rb =  *++regs;
  Rm =  *++regs;
  Rn =  *++regs;

  Pa =  *++regs;        // Pointers to the current/next digit of a, b, n, and m.
  Pb =  *++regs;
  Pm =  *++regs;
  Pn =  *++regs;

  t0 =  *++regs;        // Three registers which form a
  t1 =  *++regs;        // triple-precision accumulator.
  t2 =  *++regs;

  Ri =  *++regs;        // Inner and outer loop indexes.
  Rj =  *++regs;

  Rhi_ab = *++regs;     // Product registers: low and high parts
  Rlo_ab = *++regs;     // of a*b and m*n.
  Rhi_mn = *++regs;
  Rlo_mn = *++regs;

  // r19 and up are callee-saved.
  // Save everything from r19 up to the last register handed out, plus Pm_base.
  _toSave = RegSet::range(r19, *regs) + Pm_base;
}
12886
12887 private:
12888 void save_regs() {
12889 push(_toSave, sp);
12890 }
12891
12892 void restore_regs() {
12893 pop(_toSave, sp);
12894 }
12895
12896 template <typename T>
12897 void unroll_2(Register count, T block) {
12898 Label loop, end, odd;
12899 tbnz(count, 0, odd);
12900 cbz(count, end);
12901 align(16);
12902 bind(loop);
12903 (this->*block)();
12904 bind(odd);
12905 (this->*block)();
12906 subs(count, count, 2);
12907 br(Assembler::GT, loop);
12908 bind(end);
12909 }
12910
12911 template <typename T>
12912 void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
12913 Label loop, end, odd;
12914 tbnz(count, 0, odd);
12915 cbz(count, end);
12916 align(16);
12917 bind(loop);
12918 (this->*block)(d, s, tmp);
12919 bind(odd);
12920 (this->*block)(d, s, tmp);
12921 subs(count, count, 2);
12922 br(Assembler::GT, loop);
12923 bind(end);
12924 }
12925
// Loop prologue: loads the first digits into Ra, Rb, Rm and Rn, points
// Pa/Pm at the start of a and m and Pb/Pn at word index i of b and n, and
// clears the pending m*n product. i is zero-extended from 32 bits and
// scaled by the word size.
void pre1(RegisterOrConstant i) {
  block_comment("pre1");
  // Pa = Pa_base;
  // Pb = Pb_base + i;
  // Pm = Pm_base;
  // Pn = Pn_base + i;
  // Ra = *Pa;
  // Rb = *Pb;
  // Rm = *Pm;
  // Rn = *Pn;
  ldr(Ra, Address(Pa_base));
  ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
  ldr(Rm, Address(Pm_base));
  ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
  lea(Pa, Address(Pa_base));
  lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
  lea(Pm, Address(Pm_base));
  lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));

  // Zero the m*n result.
  mov(Rhi_mn, zr);
  mov(Rlo_mn, zr);
}
12949
// The core multiply-accumulate step of a Montgomery
// multiplication.  The idea is to schedule operations as a
// pipeline so that instructions with long latencies (loads and
// multiplies) have time to complete before their results are
// used.  This most benefits in-order implementations of the
// architecture but out-of-order ones also benefit.
//
// On exit the a*b product has been accumulated, the new m*n product is
// left pending in Rhi_mn:Rlo_mn for the next step, and all four digit
// pointers and digits have been advanced.
void step() {
  block_comment("step");
  // MACC(Ra, Rb, t0, t1, t2);
  // Ra = *++Pa;
  // Rb = *--Pb;
  umulh(Rhi_ab, Ra, Rb);
  mul(Rlo_ab, Ra, Rb);
  ldr(Ra, pre(Pa, wordSize));
  ldr(Rb, pre(Pb, -wordSize));
  acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
                                   // previous iteration.
  // MACC(Rm, Rn, t0, t1, t2);
  // Rm = *++Pm;
  // Rn = *--Pn;
  umulh(Rhi_mn, Rm, Rn);
  mul(Rlo_mn, Rm, Rn);
  ldr(Rm, pre(Pm, wordSize));
  ldr(Rn, pre(Pn, -wordSize));
  acc(Rhi_ab, Rlo_ab, t0, t1, t2);
}
12976
// Loop epilogue: accumulates the last a*b product and the pending m*n
// product, computes Rm = t0 * inv and stores it at *Pm, then adds Rm * Rn
// and shifts the accumulator down one word (t0 = t1; t1 = t2; t2 = 0).
// Only the high half of Rm * Rn is computed in product builds; see the
// comment before the subs below.
void post1() {
  block_comment("post1");

  // MACC(Ra, Rb, t0, t1, t2);
  // Ra = *++Pa;
  // Rb = *--Pb;
  umulh(Rhi_ab, Ra, Rb);
  mul(Rlo_ab, Ra, Rb);
  acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
  acc(Rhi_ab, Rlo_ab, t0, t1, t2);

  // *Pm = Rm = t0 * inv;
  mul(Rm, t0, inv);
  str(Rm, Address(Pm));

  // MACC(Rm, Rn, t0, t1, t2);
  // t0 = t1; t1 = t2; t2 = 0;
  umulh(Rhi_mn, Rm, Rn);

#ifndef PRODUCT
  // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
  {
    mul(Rlo_mn, Rm, Rn);
    add(Rlo_mn, t0, Rlo_mn);
    Label ok;
    cbz(Rlo_mn, ok); {
      stop("broken Montgomery multiply");
    } bind(ok);
  }
#endif
  // We have very carefully set things up so that
  // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
  // the lower half of Rm * Rn because we know the result already:
  // it must be -t0.  t0 + (-t0) must generate a carry iff
  // t0 != 0.  So, rather than do a mul and an adds we just set
  // the carry flag iff t0 is nonzero.
  //
  // mul(Rlo_mn, Rm, Rn);
  // adds(zr, t0, Rlo_mn);
  subs(zr, t0, 1); // Set carry iff t0 is nonzero
  adcs(t0, t1, Rhi_mn);
  adc(t1, t2, zr);
  mov(t2, zr);
}
13021
13022 void pre2(RegisterOrConstant i, RegisterOrConstant len) {
13023 block_comment("pre2");
13024 // Pa = Pa_base + i-len;
13025 // Pb = Pb_base + len;
13026 // Pm = Pm_base + i-len;
13027 // Pn = Pn_base + len;
13028
13029 if (i.is_register()) {
13030 sub(Rj, i.as_register(), len);
13031 } else {
13032 mov(Rj, i.as_constant());
13033 sub(Rj, Rj, len);
13034 }
13035 // Rj == i-len
13036
13037 lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
13038 lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
13039 lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
13040 lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
13041
13042 // Ra = *++Pa;
13043 // Rb = *--Pb;
13044 // Rm = *++Pm;
13045 // Rn = *--Pn;
13046 ldr(Ra, pre(Pa, wordSize));
13047 ldr(Rb, pre(Pb, -wordSize));
13048 ldr(Rm, pre(Pm, wordSize));
13049 ldr(Rn, pre(Pn, -wordSize));
13050
13051 mov(Rhi_mn, zr);
13052 mov(Rlo_mn, zr);
13053 }
13054
13055 void post2(RegisterOrConstant i, RegisterOrConstant len) {
13056 block_comment("post2");
13057 if (i.is_constant()) {
13058 mov(Rj, i.as_constant()-len.as_constant());
13059 } else {
13060 sub(Rj, i.as_register(), len);
13061 }
13062
13063 adds(t0, t0, Rlo_mn); // The pending m*n, low part
13064
13065 // As soon as we know the least significant digit of our result,
13066 // store it.
13067 // Pm_base[i-len] = t0;
13068 str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
13069
13070 // t0 = t1; t1 = t2; t2 = 0;
13071 adcs(t0, t1, Rhi_mn); // The pending m*n, high part
13072 adc(t1, t2, zr);
13073 mov(t2, zr);
13074 }
13075
// A carry in t0 after Montgomery multiplication means that we
// should subtract multiples of n from our result in m.  We'll
// keep doing that until there is no carry.
//
// Each pass subtracts the len-word number at Pn_base from the one at
// Pm_base in place, then takes the final borrow out of t0.
// Clobbers t1, t2, Rm and Rn. The inner loop runs at least once per pass
// and loads one word past the last index before exiting.
void normalize(RegisterOrConstant len) {
  block_comment("normalize");
  // while (t0)
  //   t0 = sub(Pm_base, Pn_base, t0, len);
  Label loop, post, again;
  Register cnt = t1, i = t2; // Re-use registers; we're done with them now
  cbz(t0, post); {
    bind(again); {
      mov(i, zr);
      mov(cnt, len);
      ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
      ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
      subs(zr, zr, zr); // set carry flag, i.e. no borrow
      align(16);
      bind(loop); {
        // Only sbcs writes the flags inside this loop, so the borrow
        // chain survives the index and counter updates.
        sbcs(Rm, Rm, Rn);
        str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
        add(i, i, 1);
        ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
        ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
        sub(cnt, cnt, 1);
      } cbnz(cnt, loop);
      sbc(t0, t0, zr);
    } cbnz(t0, again);
  } bind(post);
}
13105
// Move memory at s to d, reversing words.
//    Increments d to end of copied memory
//    Destroys tmp1, tmp2
//    Preserves len
//    Leaves s pointing to the address which was in d at start
// len counts 64-bit words (zero-extended from 32 bits). The two 32-bit
// halves of each word are swapped as well; see reverse1.
void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
  // The temporaries must be below r19.
  assert(tmp1->encoding() < r19->encoding(), "register corruption");
  assert(tmp2->encoding() < r19->encoding(), "register corruption");

  // Start one past the last source word and walk downwards.
  lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
  mov(tmp1, len);
  unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
  // d is now at the end of the copy; step back len words to its start.
  sub(s, d, len, ext::uxtw, LogBytesPerWord);
}
// where
// reverse1 moves a single word for reverse(): it pre-decrements s and loads
// the word there, swaps its two 32-bit halves, and stores it at d with
// post-increment. Clobbers tmp.
void reverse1(Register d, Register s, Register tmp) {
  ldr(tmp, pre(s, -wordSize));
  ror(tmp, tmp, 32);
  str(tmp, post(d, wordSize));
}
13126
// Loop body for the squaring variant: emits step() and then adds
// Rhi_ab:Rlo_ab into the accumulator t2:t1:t0 one more time.
// NOTE(review): step() is defined outside this section; this appears
// to correspond to MACC2 in the C outline of montgomery_square, i.e.
// the a*b product being accumulated twice -- confirm against step().
void step_squaring() {
  // An extra ACC
  step();
  acc(Rhi_ab, Rlo_ab, t0, t1, t2);
}
13132
13133 void last_squaring(RegisterOrConstant i) {
13134 Label dont;
13135 // if ((i & 1) == 0) {
13136 tbnz(i.as_register(), 0, dont); {
13137 // MACC(Ra, Rb, t0, t1, t2);
13138 // Ra = *++Pa;
13139 // Rb = *--Pb;
13140 umulh(Rhi_ab, Ra, Rb);
13141 mul(Rlo_ab, Ra, Rb);
13142 acc(Rhi_ab, Rlo_ab, t0, t1, t2);
13143 } bind(dont);
13144 }
13145
// Loop body for the m*n-only part of a squaring iteration. It first
// folds the product left in Rhi_mn:Rlo_mn into the accumulator, then
// starts the next product Rm * Rn (left pending in Rhi_mn:Rlo_mn for
// whoever runs next), and finally advances Pm / steps Pn back by one
// word, loading the new Rm and Rn.
void extra_step_squaring() {
  acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n

  // MACC(Rm, Rn, t0, t1, t2);
  // Rm = *++Pm;
  // Rn = *--Pn;
  umulh(Rhi_mn, Rm, Rn);
  mul(Rlo_mn, Rm, Rn);
  ldr(Rm, pre(Pm, wordSize));
  ldr(Rn, pre(Pn, -wordSize));
}
13157
// Tail of one iteration of the first squaring loop. In terms of the
// C outline:
//
//   (accumulate the pending m*n)
//   *Pm = Rm = t0 * inv;
//   MACC(Rm, Rn, t0, t1, t2);
//   t0 = t1; t1 = t2; t2 = 0;
//
// The low half of Rm * Rn is not computed in product builds; see the
// explanation before the subs below. The statement order matters:
// subs sets the carry that the following adcs/adc consume.
void post1_squaring() {
  acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n

  // *Pm = Rm = t0 * inv;
  mul(Rm, t0, inv);
  str(Rm, Address(Pm));

  // MACC(Rm, Rn, t0, t1, t2);
  // t0 = t1; t1 = t2; t2 = 0;
  umulh(Rhi_mn, Rm, Rn);

#ifndef PRODUCT
  // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
  // Non-product builds compute the low half anyway and stop if the
  // sum with t0 is not zero. Rlo_mn is only a scratch value here.
  {
    mul(Rlo_mn, Rm, Rn);
    add(Rlo_mn, t0, Rlo_mn);
    Label ok;
    cbz(Rlo_mn, ok); {
      stop("broken Montgomery multiply");
    } bind(ok);
  }
#endif
  // We have very carefully set things up so that
  // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
  // the lower half of Rm * Rn because we know the result already:
  // it must be -t0. t0 + (-t0) must generate a carry iff
  // t0 != 0. So, rather than do a mul and an adds we just set
  // the carry flag iff t0 is nonzero.
  //
  // mul(Rlo_mn, Rm, Rn);
  // adds(zr, t0, Rlo_mn);
  subs(zr, t0, 1); // Set carry iff t0 is nonzero
  // Shift the accumulator down by one word while adding the high
  // half of the product and the carry: t0 = t1 + Rhi_mn + C,
  // t1 = t2 + C', t2 = 0.
  adcs(t0, t1, Rhi_mn);
  adc(t1, t2, zr);
  mov(t2, zr);
}
13194
// Adds the two-word value Rhi:Rlo into the three-word accumulator
// t2:t1:t0, propagating the carry from t0 through t1 into t2.
// The condition flags are overwritten (adds/adcs).
void acc(Register Rhi, Register Rlo,
         Register t0, Register t1, Register t2) {
  adds(t0, t0, Rlo);
  adcs(t1, t1, Rhi);
  adc(t2, t2, zr);
}
13201
13202 public:
/**
 * Fast Montgomery multiplication. The derivation of the
 * algorithm is in A Cryptographic Library for the Motorola
 * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
 *
 * Arguments:
 *
 * Inputs for multiplication:
 * c_rarg0 - int array elements a
 * c_rarg1 - int array elements b
 * c_rarg2 - int array elements n (the modulus)
 * c_rarg3 - int length
 * c_rarg4 - int inv
 * c_rarg5 - int array elements m (the result)
 *
 * Inputs for squaring:
 * c_rarg0 - int array elements a
 * c_rarg1 - int array elements n (the modulus)
 * c_rarg2 - int length
 * c_rarg3 - int inv
 * c_rarg4 - int array elements m (the result)
 *
 * Emits the stub at a CodeEntryAlignment boundary and returns the
 * address of its first instruction.
 *
 * Properties of the emitted code that can be read off below:
 * - If Rlen is zero the stub returns immediately, before a frame is
 *   built.
 * - If Rlen is above 512 the stub branches to a stop().
 * - Otherwise 16 bytes of stack per unit of Rlen are reserved (at
 *   most 8192 bytes) and the inputs are copied there by reverse().
 *   When _squaring is set the copy of b is skipped.
 * - Rlen is then halved with a logical shift, so a low bit in the
 *   incoming length is dropped. NOTE(review): presumably callers
 *   always pass an even length -- confirm.
 *
 * A C outline of the algorithm follows this function.
 */
address generate_multiply() {
  Label argh, nothing;

  align(CodeEntryAlignment);
  address entry = pc();

  // Zero length: skip everything, including enter()/leave().
  cbzw(Rlen, nothing);

  enter();

  // Make room.
  // Ra = sp - Rlen * 16; sp is then set to Ra rounded down to a
  // multiple of 2 * wordSize.
  cmpw(Rlen, 512);
  br(Assembler::HI, argh);
  sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
  andr(sp, Ra, -2 * wordSize);

  lsrw(Rlen, Rlen, 1); // length in longwords = len/2

  {
    // Copy input args, reversing as we go. We use Ra as a
    // temporary variable.
    // Each reverse() advances Ra past what it wrote and leaves the
    // source register pointing at the start of its copy.
    reverse(Ra, Pa_base, Rlen, t0, t1);
    if (!_squaring)
      reverse(Ra, Pb_base, Rlen, t0, t1);
    reverse(Ra, Pn_base, Rlen, t0, t1);
  }

  // Push all call-saved registers and also Pm_base which we'll need
  // at the end.
  save_regs();

#ifndef PRODUCT
  // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
  {
    ldr(Rn, Address(Pn_base, 0));
    mul(Rlo_mn, Rn, inv);
    subs(zr, Rlo_mn, -1);
    Label ok;
    br(EQ, ok); {
      stop("broken inverse in Montgomery multiply");
    } bind(ok);
  }
#endif

  // Ra now points just past the last copied input; the result is
  // built there.
  mov(Pm_base, Ra);

  // Clear the triple-precision accumulator.
  mov(t0, zr);
  mov(t1, zr);
  mov(t2, zr);

  block_comment("for (int i = 0; i < len; i++) {");
  mov(Ri, zr); {
    Label loop, end;
    cmpw(Ri, Rlen);
    br(Assembler::GE, end);

    bind(loop);
    pre1(Ri);

    block_comment(" for (j = i; j; j--) {"); {
      movw(Rj, Ri);
      unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
    } block_comment(" } // j");

    post1();
    addw(Ri, Ri, 1);
    cmpw(Ri, Rlen);
    br(Assembler::LT, loop);
    bind(end);
    block_comment("} // i");
  }

  block_comment("for (int i = len; i < 2*len; i++) {");
  mov(Ri, Rlen); {
    Label loop, end;
    // Compare i against 2 * len (Rlen shifted left by one).
    cmpw(Ri, Rlen, Assembler::LSL, 1);
    br(Assembler::GE, end);

    bind(loop);
    pre2(Ri, Rlen);

    block_comment(" for (j = len*2-i-1; j; j--) {"); {
      lslw(Rj, Rlen, 1);
      subw(Rj, Rj, Ri);
      subw(Rj, Rj, 1);
      unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
    } block_comment(" } // j");

    post2(Ri, Rlen);
    addw(Ri, Ri, 1);
    cmpw(Ri, Rlen, Assembler::LSL, 1);
    br(Assembler::LT, loop);
    bind(end);
  }
  block_comment("} // i");

  normalize(Rlen);

  mov(Ra, Pm_base); // Save Pm_base in Ra
  restore_regs(); // Restore caller's Pm_base

  // Copy our result into caller's Pm_base
  reverse(Pm_base, Ra, Rlen, t0, t1);

  leave();
  bind(nothing);
  ret(lr);

  // handler for error case
  // Placed after the ret so it is only reached through the branch above.
  bind(argh);
  stop("MontgomeryMultiply total_allocation must be <= 8192");

  return entry;
}
13340 // In C, approximately:
13341
13342 // void
13343 // montgomery_multiply(julong Pa_base[], julong Pb_base[],
13344 // julong Pn_base[], julong Pm_base[],
13345 // julong inv, int len) {
13346 // julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
13347 // julong *Pa, *Pb, *Pn, *Pm;
13348 // julong Ra, Rb, Rn, Rm;
13349
13350 // int i;
13351
13352 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
13353
13354 // for (i = 0; i < len; i++) {
13355 // int j;
13356
13357 // Pa = Pa_base;
13358 // Pb = Pb_base + i;
13359 // Pm = Pm_base;
13360 // Pn = Pn_base + i;
13361
13362 // Ra = *Pa;
13363 // Rb = *Pb;
13364 // Rm = *Pm;
13365 // Rn = *Pn;
13366
13367 // int iters = i;
13368 // for (j = 0; iters--; j++) {
13369 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
13370 // MACC(Ra, Rb, t0, t1, t2);
13371 // Ra = *++Pa;
13372 // Rb = *--Pb;
13373 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
13374 // MACC(Rm, Rn, t0, t1, t2);
13375 // Rm = *++Pm;
13376 // Rn = *--Pn;
13377 // }
13378
13379 // assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
13380 // MACC(Ra, Rb, t0, t1, t2);
13381 // *Pm = Rm = t0 * inv;
13382 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
13383 // MACC(Rm, Rn, t0, t1, t2);
13384
13385 // assert(t0 == 0, "broken Montgomery multiply");
13386
13387 // t0 = t1; t1 = t2; t2 = 0;
13388 // }
13389
13390 // for (i = len; i < 2*len; i++) {
13391 // int j;
13392
13393 // Pa = Pa_base + i-len;
13394 // Pb = Pb_base + len;
13395 // Pm = Pm_base + i-len;
13396 // Pn = Pn_base + len;
13397
13398 // Ra = *++Pa;
13399 // Rb = *--Pb;
13400 // Rm = *++Pm;
13401 // Rn = *--Pn;
13402
13403 // int iters = len*2-i-1;
13404 // for (j = i-len+1; iters--; j++) {
13405 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
13406 // MACC(Ra, Rb, t0, t1, t2);
13407 // Ra = *++Pa;
13408 // Rb = *--Pb;
13409 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
13410 // MACC(Rm, Rn, t0, t1, t2);
13411 // Rm = *++Pm;
13412 // Rn = *--Pn;
13413 // }
13414
13415 // Pm_base[i-len] = t0;
13416 // t0 = t1; t1 = t2; t2 = 0;
13417 // }
13418
13419 // while (t0)
13420 // t0 = sub(Pm_base, Pn_base, t0, len);
13421 // }
13422
/**
 * Fast Montgomery squaring. This uses asymptotically 25% fewer
 * multiplies than Montgomery multiplication so it should be up to
 * 25% faster. However, its loop control is more complex and it
 * may actually run slower on some machines.
 *
 * Arguments:
 *
 * Inputs:
 * c_rarg0 - int array elements a
 * c_rarg1 - int array elements n (the modulus)
 * c_rarg2 - int length
 * c_rarg3 - int inv
 * c_rarg4 - int array elements m (the result)
 *
 * Emits the stub at a CodeEntryAlignment boundary and returns the
 * address of its first instruction.
 *
 * Differences from generate_multiply() that can be read off below:
 * - there is no early exit for a zero Rlen; a frame is always built;
 * - no non-product check of inv against n[0] is emitted;
 * - only a and n are copied to the stack, although the same 16 bytes
 *   per unit of Rlen are reserved;
 * - the outer loops use 64-bit cmp/add on Ri rather than the
 *   32-bit forms.
 *
 * Note that generate_compiler_stubs() currently emits
 * generate_multiply() for the squaring stub as well.
 *
 * A C outline of the algorithm follows this function.
 */
address generate_square() {
  Label argh;

  align(CodeEntryAlignment);
  address entry = pc();

  enter();

  // Make room.
  // Ra = sp - Rlen * 16; sp is then set to Ra rounded down to a
  // multiple of 2 * wordSize. Rlen above 512 goes to the stop().
  cmpw(Rlen, 512);
  br(Assembler::HI, argh);
  sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
  andr(sp, Ra, -2 * wordSize);

  lsrw(Rlen, Rlen, 1); // length in longwords = len/2

  {
    // Copy input args, reversing as we go. We use Ra as a
    // temporary variable.
    reverse(Ra, Pa_base, Rlen, t0, t1);
    reverse(Ra, Pn_base, Rlen, t0, t1);
  }

  // Push all call-saved registers and also Pm_base which we'll need
  // at the end.
  save_regs();

  // Ra now points just past the last copied input; the result is
  // built there.
  mov(Pm_base, Ra);

  // Clear the triple-precision accumulator.
  mov(t0, zr);
  mov(t1, zr);
  mov(t2, zr);

  block_comment("for (int i = 0; i < len; i++) {");
  mov(Ri, zr); {
    Label loop, end;
    // The loop label sits before the entry test, so the test is
    // repeated at the top of every iteration as well as at the bottom.
    bind(loop);
    cmp(Ri, Rlen);
    br(Assembler::GE, end);

    pre1(Ri);

    block_comment("for (j = (i+1)/2; j; j--) {"); {
      add(Rj, Ri, 1);
      lsr(Rj, Rj, 1);
      unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
    } block_comment(" } // j");

    // The single extra multiply that is only done for even i.
    last_squaring(Ri);

    block_comment(" for (j = i/2; j; j--) {"); {
      lsr(Rj, Ri, 1);
      unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
    } block_comment(" } // j");

    post1_squaring();
    add(Ri, Ri, 1);
    cmp(Ri, Rlen);
    br(Assembler::LT, loop);

    bind(end);
    block_comment("} // i");
  }

  block_comment("for (int i = len; i < 2*len; i++) {");
  mov(Ri, Rlen); {
    Label loop, end;
    bind(loop);
    // Compare i against 2 * len (Rlen shifted left by one).
    cmp(Ri, Rlen, Assembler::LSL, 1);
    br(Assembler::GE, end);

    pre2(Ri, Rlen);

    block_comment(" for (j = (2*len-i-1)/2; j; j--) {"); {
      lsl(Rj, Rlen, 1);
      sub(Rj, Rj, Ri);
      sub(Rj, Rj, 1);
      lsr(Rj, Rj, 1);
      unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
    } block_comment(" } // j");

    last_squaring(Ri);

    block_comment(" for (j = (2*len-i)/2; j; j--) {"); {
      lsl(Rj, Rlen, 1);
      sub(Rj, Rj, Ri);
      lsr(Rj, Rj, 1);
      unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
    } block_comment(" } // j");

    post2(Ri, Rlen);
    add(Ri, Ri, 1);
    cmp(Ri, Rlen, Assembler::LSL, 1);

    br(Assembler::LT, loop);
    bind(end);
    block_comment("} // i");
  }

  normalize(Rlen);

  mov(Ra, Pm_base); // Save Pm_base in Ra
  restore_regs(); // Restore caller's Pm_base

  // Copy our result into caller's Pm_base
  reverse(Pm_base, Ra, Rlen, t0, t1);

  leave();
  ret(lr);

  // handler for error case
  // Placed after the ret so it is only reached through the branch above.
  bind(argh);
  stop("MontgomeryMultiply total_allocation must be <= 8192");

  return entry;
}
13555 // In C, approximately:
13556
13557 // void
13558 // montgomery_square(julong Pa_base[], julong Pn_base[],
13559 // julong Pm_base[], julong inv, int len) {
13560 // julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
13561 // julong *Pa, *Pb, *Pn, *Pm;
13562 // julong Ra, Rb, Rn, Rm;
13563
13564 // int i;
13565
13566 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
13567
13568 // for (i = 0; i < len; i++) {
13569 // int j;
13570
13571 // Pa = Pa_base;
13572 // Pb = Pa_base + i;
13573 // Pm = Pm_base;
13574 // Pn = Pn_base + i;
13575
13576 // Ra = *Pa;
13577 // Rb = *Pb;
13578 // Rm = *Pm;
13579 // Rn = *Pn;
13580
13581 // int iters = (i+1)/2;
13582 // for (j = 0; iters--; j++) {
13583 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
13584 // MACC2(Ra, Rb, t0, t1, t2);
13585 // Ra = *++Pa;
13586 // Rb = *--Pb;
13587 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
13588 // MACC(Rm, Rn, t0, t1, t2);
13589 // Rm = *++Pm;
13590 // Rn = *--Pn;
13591 // }
13592 // if ((i & 1) == 0) {
13593 // assert(Ra == Pa_base[j], "must be");
13594 // MACC(Ra, Ra, t0, t1, t2);
13595 // }
13596 // iters = i/2;
13597 // assert(iters == i-j, "must be");
13598 // for (; iters--; j++) {
13599 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
13600 // MACC(Rm, Rn, t0, t1, t2);
13601 // Rm = *++Pm;
13602 // Rn = *--Pn;
13603 // }
13604
13605 // *Pm = Rm = t0 * inv;
13606 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
13607 // MACC(Rm, Rn, t0, t1, t2);
13608
13609 // assert(t0 == 0, "broken Montgomery multiply");
13610
13611 // t0 = t1; t1 = t2; t2 = 0;
13612 // }
13613
13614 // for (i = len; i < 2*len; i++) {
13615 // int start = i-len+1;
13616 // int end = start + (len - start)/2;
13617 // int j;
13618
13619 // Pa = Pa_base + i-len;
13620 // Pb = Pa_base + len;
13621 // Pm = Pm_base + i-len;
13622 // Pn = Pn_base + len;
13623
13624 // Ra = *++Pa;
13625 // Rb = *--Pb;
13626 // Rm = *++Pm;
13627 // Rn = *--Pn;
13628
13629 // int iters = (2*len-i-1)/2;
13630 // assert(iters == end-start, "must be");
13631 // for (j = start; iters--; j++) {
13632 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
13633 // MACC2(Ra, Rb, t0, t1, t2);
13634 // Ra = *++Pa;
13635 // Rb = *--Pb;
13636 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
13637 // MACC(Rm, Rn, t0, t1, t2);
13638 // Rm = *++Pm;
13639 // Rn = *--Pn;
13640 // }
13641 // if ((i & 1) == 0) {
13642 // assert(Ra == Pa_base[j], "must be");
13643 // MACC(Ra, Ra, t0, t1, t2);
13644 // }
13645 // iters = (2*len-i)/2;
13646 // assert(iters == len-j, "must be");
13647 // for (; iters--; j++) {
13648 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
13649 // MACC(Rm, Rn, t0, t1, t2);
13650 // Rm = *++Pm;
13651 // Rn = *--Pn;
13652 // }
13653 // Pm_base[i-len] = t0;
13654 // t0 = t1; t1 = t2; t2 = 0;
13655 // }
13656
13657 // while (t0)
13658 // t0 = sub(Pm_base, Pn_base, t0, len);
13659 // }
13660 };
13661
13662 // Initialization
// Called by the StubGenerator constructor for the preuniverse blob.
// Intentionally empty on this platform.
void generate_preuniverse_stubs() {
  // preuniverse stubs are not needed for aarch64
}
13666
// Called by the StubGenerator constructor for the initial blob.
// Generates each stub in turn and stores its entry point in the
// corresponding StubRoutines field. Stubs are emitted in the order
// of the statements below.
void generate_initial_stubs() {
  // Generate the initial stubs and initialize their entry points

  // entry points that exist in all platforms Note: This is code
  // that could be shared among different platforms - however the
  // benefit seems to be smaller than the disadvantage of having a
  // much more complicated generator structure. See also comment in
  // stubRoutines.hpp.

  StubRoutines::_forward_exception_entry = generate_forward_exception();

  // generate_call_stub also receives _call_stub_return_address,
  // presumably to fill it in as an out-parameter.
  StubRoutines::_call_stub_entry =
    generate_call_stub(StubRoutines::_call_stub_return_address);

  // is referenced by megamorphic call
  StubRoutines::_catch_exception_entry = generate_catch_exception();

  // Initialize table for copy memory (arraycopy) check.
  // The table is only created if no one has created it yet.
  if (UnsafeMemoryAccess::_table == nullptr) {
    UnsafeMemoryAccess::create_table(8 + 4); // 8 for copyMemory; 4 for setMemory
  }

  if (UseCRC32Intrinsics) {
    StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
  }

  if (UseCRC32CIntrinsics) {
    StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
  }

  // sin and cos share one generator, selected by the isCos argument.
  if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
    StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
  }

  if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
    StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
  }

  // The two float16 conversion stubs are generated only when both
  // intrinsics are available.
  if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_float16ToFloat) &&
      vmIntrinsics::is_intrinsic_available(vmIntrinsics::_floatToFloat16)) {
    StubRoutines::_hf2f = generate_float16ToFloat();
    StubRoutines::_f2hf = generate_floatToFloat16();
  }
}
13711
// Called by the StubGenerator constructor for the continuation blob.
// Generates the four continuation stubs unconditionally and stores
// their entry points in StubRoutines.
void generate_continuation_stubs() {
  // Continuation stubs:
  StubRoutines::_cont_thaw = generate_cont_thaw();
  StubRoutines::_cont_returnBarrier = generate_cont_returnBarrier();
  StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception();
  StubRoutines::_cont_preempt_stub = generate_cont_preempt_stub();
}
13719
// Called by the StubGenerator constructor for the final blob.
// Stubs are emitted in the order of the statements below; the last
// statement marks the aarch64 stub set as completed.
void generate_final_stubs() {
  // support for verify_oop (must happen after universe_init)
  if (VerifyOops) {
    StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
  }

  // arraycopy stubs used by compilers
  generate_arraycopy_stubs();

  StubRoutines::_method_entry_barrier = generate_method_entry_barrier();

  StubRoutines::aarch64::_spin_wait = generate_spin_wait();

  StubRoutines::_upcall_stub_exception_handler = generate_upcall_stub_exception_handler();
  StubRoutines::_upcall_stub_load_target = generate_upcall_stub_load_target();

  // Only on Linux builds whose compiler does not define
  // __ARM_FEATURE_ATOMICS.
#if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)

  generate_atomic_entry_points();

#endif // LINUX && !__ARM_FEATURE_ATOMICS

#ifdef COMPILER2
  if (UseSecondarySupersTable) {
    StubRoutines::_lookup_secondary_supers_table_slow_path_stub = generate_lookup_secondary_supers_table_slow_path_stub();
    // The return value is not used here; presumably the generator
    // records its entry points itself.
    if (! InlineSecondarySupersTest) {
      generate_lookup_secondary_supers_table_stub();
    }
  }
#endif

  if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_setMemory)) {
    StubRoutines::_unsafe_setmemory = generate_unsafe_setmemory();
  }

  StubRoutines::aarch64::set_completed(); // Indicate that arraycopy and zero_blocks stubs are generated
}
13757
// Called by the StubGenerator constructor for the compiler blob.
// The whole body is inside #ifdef COMPILER2, so the function does
// nothing in builds without C2. Most stubs are guarded by the flag
// that enables the corresponding intrinsic; stubs are emitted in the
// order of the statements below.
void generate_compiler_stubs() {
#ifdef COMPILER2

  // The return value is not used here; presumably the generator
  // records the address itself.
  if (UseSVE == 0) {
    generate_iota_indices(StubId::stubgen_vector_iota_indices_id);
  }

  // array equals stub for large arrays.
  if (!UseSimpleArrayEquals) {
    StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
  }

  // arrays_hashcode stub for large arrays.
  // One stub per element type.
  StubRoutines::aarch64::_large_arrays_hashcode_boolean = generate_large_arrays_hashcode(T_BOOLEAN);
  StubRoutines::aarch64::_large_arrays_hashcode_byte = generate_large_arrays_hashcode(T_BYTE);
  StubRoutines::aarch64::_large_arrays_hashcode_char = generate_large_arrays_hashcode(T_CHAR);
  StubRoutines::aarch64::_large_arrays_hashcode_int = generate_large_arrays_hashcode(T_INT);
  StubRoutines::aarch64::_large_arrays_hashcode_short = generate_large_arrays_hashcode(T_SHORT);

  // byte_array_inflate stub for large arrays.
  StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();

  // countPositives stub for large arrays.
  // _count_positives_long is passed in as well, presumably as an
  // out-parameter for a second entry point.
  StubRoutines::aarch64::_count_positives = generate_count_positives(StubRoutines::aarch64::_count_positives_long);

  generate_compare_long_strings();

  generate_string_indexof_stubs();

  if (UseMultiplyToLenIntrinsic) {
    StubRoutines::_multiplyToLen = generate_multiplyToLen();
  }

  if (UseSquareToLenIntrinsic) {
    StubRoutines::_squareToLen = generate_squareToLen();
  }

  if (UseMulAddIntrinsic) {
    StubRoutines::_mulAdd = generate_mulAdd();
  }

  if (UseSIMDForBigIntegerShiftIntrinsics) {
    StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
    StubRoutines::_bigIntegerLeftShiftWorker = generate_bigIntegerLeftShift();
  }

  // The Montgomery stubs come from a separate generator class. A stub
  // is generated only when load_archive_data() returns nullptr; the
  // freshly generated range is then handed to store_archive_data().
  if (UseMontgomeryMultiplyIntrinsic) {
    StubId stub_id = StubId::stubgen_montgomeryMultiply_id;
    address start = load_archive_data(stub_id);
    if (start == nullptr) {
      // we have to generate it
      StubCodeMark mark(this, stub_id);
      MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
      start = g.generate_multiply();
      // record the stub start and end
      store_archive_data(stub_id, start, _masm->pc());
    }
    StubRoutines::_montgomeryMultiply = start;
  }

  if (UseMontgomerySquareIntrinsic) {
    StubId stub_id = StubId::stubgen_montgomerySquare_id;
    address start = load_archive_data(stub_id);
    if (start == nullptr) {
      // we have to generate it
      StubCodeMark mark(this, stub_id);
      MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
      // We use generate_multiply() rather than generate_square()
      // because it's faster for the sizes of modulus we care about.
      start = g.generate_multiply();
      // record the stub start and end
      store_archive_data(stub_id, start, _masm->pc());
    }
    StubRoutines::_montgomerySquare = start;
  }

  if (UseChaCha20Intrinsics) {
    StubRoutines::_chacha20Block = generate_chacha20Block_blockpar();
  }

  if (UseIntPolyIntrinsics) {
    StubRoutines::_intpoly_montgomeryMult_P256 = generate_intpoly_montgomeryMult_P256();
    StubRoutines::_intpoly_assign = generate_intpoly_assign();
  }

  if (UseKyberIntrinsics) {
    StubRoutines::_kyberNtt = generate_kyberNtt();
    StubRoutines::_kyberInverseNtt = generate_kyberInverseNtt();
    StubRoutines::_kyberNttMult = generate_kyberNttMult();
    StubRoutines::_kyberAddPoly_2 = generate_kyberAddPoly_2();
    StubRoutines::_kyberAddPoly_3 = generate_kyberAddPoly_3();
    StubRoutines::_kyber12To16 = generate_kyber12To16();
    StubRoutines::_kyberBarrettReduce = generate_kyberBarrettReduce();
  }

  if (UseDilithiumIntrinsics) {
    StubRoutines::_dilithiumAlmostNtt = generate_dilithiumAlmostNtt();
    StubRoutines::_dilithiumAlmostInverseNtt = generate_dilithiumAlmostInverseNtt();
    StubRoutines::_dilithiumNttMult = generate_dilithiumNttMult();
    StubRoutines::_dilithiumMontMulByConstant = generate_dilithiumMontMulByConstant();
    StubRoutines::_dilithiumDecomposePoly = generate_dilithiumDecomposePoly();
  }

  if (UseBASE64Intrinsics) {
    StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
    StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
  }

  // data cache line writeback
  // These two are generated unconditionally.
  StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
  StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();

  if (UseAESIntrinsics) {
    StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
    StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
    StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
    StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
    StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt();
  }
  if (UseGHASHIntrinsics) {
    // StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
    // The small-input stub is generated first because its address is
    // an argument to the generator of the main stub.
    StubRoutines::aarch64::_ghash_processBlocks_small = generate_ghash_processBlocks_small();
    StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks(StubRoutines::aarch64::_ghash_processBlocks_small);
  }
  // The GCM stub needs both the AES and the GHASH flag.
  if (UseAESIntrinsics && UseGHASHIntrinsics) {
    StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt();
  }

  // Each digest has two stubs, produced by one generator that is
  // selected by stub id: implCompress and implCompressMB.
  if (UseMD5Intrinsics) {
    StubRoutines::_md5_implCompress = generate_md5_implCompress(StubId::stubgen_md5_implCompress_id);
    StubRoutines::_md5_implCompressMB = generate_md5_implCompress(StubId::stubgen_md5_implCompressMB_id);
  }
  if (UseSHA1Intrinsics) {
    StubRoutines::_sha1_implCompress = generate_sha1_implCompress(StubId::stubgen_sha1_implCompress_id);
    StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(StubId::stubgen_sha1_implCompressMB_id);
  }
  if (UseSHA256Intrinsics) {
    StubRoutines::_sha256_implCompress = generate_sha256_implCompress(StubId::stubgen_sha256_implCompress_id);
    StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(StubId::stubgen_sha256_implCompressMB_id);
  }
  if (UseSHA512Intrinsics) {
    StubRoutines::_sha512_implCompress = generate_sha512_implCompress(StubId::stubgen_sha512_implCompress_id);
    StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(StubId::stubgen_sha512_implCompressMB_id);
  }
  // SHA3 has two implementations. With UseSIMDForSHA3Intrinsic the
  // SIMD generators are used and the double-keccak stub is generated
  // too; otherwise the _gpr generators are used and no double-keccak
  // stub is produced.
  if (UseSHA3Intrinsics && UseSIMDForSHA3Intrinsic) {
    StubRoutines::_double_keccak = generate_double_keccak();
    StubRoutines::_sha3_implCompress = generate_sha3_implCompress(StubId::stubgen_sha3_implCompress_id);
    StubRoutines::_sha3_implCompressMB = generate_sha3_implCompress(StubId::stubgen_sha3_implCompressMB_id);
  } else if (UseSHA3Intrinsics) {
    StubRoutines::_sha3_implCompress = generate_sha3_implCompress_gpr(StubId::stubgen_sha3_implCompress_id);
    StubRoutines::_sha3_implCompressMB = generate_sha3_implCompress_gpr(StubId::stubgen_sha3_implCompressMB_id);
  }

  if (UsePoly1305Intrinsics) {
    StubRoutines::_poly1305_processBlocks = generate_poly1305_processBlocks();
  }

  // The difference between AArch64 vs. x86_64 intrinsics implementation
  // include the lack of square() intrinsics; usage caused a 3.3% performance
  // degradation due to the efficiencies of the symmetric squaring shape in
  // Java vs. the inefficiencies of the leaf calls and the additional cycles
  // required for 64 bit multiplication in AArch64.
  if (UseIntPoly25519Intrinsics) {
    StubRoutines::_intpoly_mult_25519 = generate_intpoly_mult_25519();
  }

  // generate Adler32 intrinsics code
  if (UseAdler32Intrinsics) {
    StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
  }

#endif // COMPILER2
}
13931
13932 public:
13933 StubGenerator(CodeBuffer* code, BlobId blob_id, AOTStubData* stub_data) : StubCodeGenerator(code, blob_id, stub_data) {
13934 switch(blob_id) {
13935 case BlobId::stubgen_preuniverse_id:
13936 generate_preuniverse_stubs();
13937 break;
13938 case BlobId::stubgen_initial_id:
13939 generate_initial_stubs();
13940 break;
13941 case BlobId::stubgen_continuation_id:
13942 generate_continuation_stubs();
13943 break;
13944 case BlobId::stubgen_compiler_id:
13945 generate_compiler_stubs();
13946 break;
13947 case BlobId::stubgen_final_id:
13948 generate_final_stubs();
13949 break;
13950 default:
13951 fatal("unexpected blob id: %s", StubInfo::name(blob_id));
13952 break;
13953 };
13954 }
13955
13956 #if INCLUDE_CDS
13957 static void init_AOTAddressTable(GrowableArray<address>& external_addresses) {
13958 // external data defined in this file
13959 #define ADD(addr) external_addresses.append((address)(addr));
13960 ADD(_sha256_round_consts);
13961 ADD(_sha512_round_consts);
13962 ADD(_sha3_round_consts);
13963 ADD(_double_keccak_round_consts);
13964 ADD(_modulus_P256);
13965 ADD(_encodeBlock_toBase64);
13966 ADD(_encodeBlock_toBase64URL);
13967 ADD(_decodeBlock_fromBase64ForNoSIMD);
13968 ADD(_decodeBlock_fromBase64URLForNoSIMD);
13969 ADD(_decodeBlock_fromBase64ForSIMD);
13970 ADD(_decodeBlock_fromBase64URLForSIMD);
13971 #undef ADD
13972 }
13973 #endif // INCLUDE_CDS
13974 }; // end class declaration
13975
13976 void StubGenerator_generate(CodeBuffer* code, BlobId blob_id, AOTStubData* stub_data) {
13977 StubGenerator g(code, blob_id, stub_data);
13978 }
13979
13980 #if INCLUDE_CDS
// Free-function wrapper that forwards to the static
// StubGenerator::init_AOTAddressTable(), which appends this file's
// data-table addresses to the given array. Only built when
// INCLUDE_CDS is set.
void StubGenerator_init_AOTAddressTable(GrowableArray<address>& addresses) {
  StubGenerator::init_AOTAddressTable(addresses);
}
13984 #endif // INCLUDE_CDS
13985
13986 #if defined (LINUX)
13987
// Define pointers to atomic stubs and initialize them to point to the
// code in atomic_aarch64.S.
//
// DEFAULT_ATOMIC_OP(OPNAME, SIZE, RELAXED) expands to
//   - an extern "C" declaration of the function
//       aarch64_atomic_<OPNAME>_<SIZE><RELAXED>_default_impl
//     taking (volatile void *ptr, uint64_t arg1, uint64_t arg2) and
//     returning uint64_t, and
//   - the definition of a variable
//       aarch64_atomic_<OPNAME>_<SIZE><RELAXED>_impl
//     of type aarch64_atomic_stub_t, initialized to that function.
//
// RELAXED is pasted onto the name as a suffix and may be empty; the
// suffixes used below are _relaxed, _release and _seq_cst.
// NOTE(review): generate_final_stubs() calls
// generate_atomic_entry_points() on some builds, presumably to
// repoint these variables at generated code -- confirm there.

#define DEFAULT_ATOMIC_OP(OPNAME, SIZE, RELAXED) \
  extern "C" uint64_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl \
    (volatile void *ptr, uint64_t arg1, uint64_t arg2); \
  aarch64_atomic_stub_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _impl \
    = aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl;

// fetch_add and xchg: 4- and 8-byte operands.
DEFAULT_ATOMIC_OP(fetch_add, 4, )
DEFAULT_ATOMIC_OP(fetch_add, 8, )
DEFAULT_ATOMIC_OP(fetch_add, 4, _relaxed)
DEFAULT_ATOMIC_OP(fetch_add, 8, _relaxed)
DEFAULT_ATOMIC_OP(xchg, 4, )
DEFAULT_ATOMIC_OP(xchg, 8, )
// cmpxchg: 1-, 4- and 8-byte operands; the _release and _seq_cst
// variants exist for 4 and 8 bytes only.
DEFAULT_ATOMIC_OP(cmpxchg, 1, )
DEFAULT_ATOMIC_OP(cmpxchg, 4, )
DEFAULT_ATOMIC_OP(cmpxchg, 8, )
DEFAULT_ATOMIC_OP(cmpxchg, 1, _relaxed)
DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed)
DEFAULT_ATOMIC_OP(cmpxchg, 8, _relaxed)
DEFAULT_ATOMIC_OP(cmpxchg, 4, _release)
DEFAULT_ATOMIC_OP(cmpxchg, 8, _release)
DEFAULT_ATOMIC_OP(cmpxchg, 4, _seq_cst)
DEFAULT_ATOMIC_OP(cmpxchg, 8, _seq_cst)

#undef DEFAULT_ATOMIC_OP
14015
14016 #endif // LINUX