1 /*
  2  * Copyright (c) 2019, 2022, Oracle and/or its affiliates. All rights reserved.
  3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  4  *
  5  * This code is free software; you can redistribute it and/or modify it
  6  * under the terms of the GNU General Public License version 2 only, as
  7  * published by the Free Software Foundation.
  8  *
  9  * This code is distributed in the hope that it will be useful, but WITHOUT
 10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 12  * version 2 for more details (a copy is included in the LICENSE file that
 13  * accompanied this code).
 14  *
 15  * You should have received a copy of the GNU General Public License version
 16  * 2 along with this work; if not, write to the Free Software Foundation,
 17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 18  *
 19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 20  * or visit www.oracle.com if you need additional information or have any
 21  * questions.
 22  */
 23 
 24 package org.openjdk.bench.jdk.incubator.vector.crypto;
 25 
 26 import jdk.incubator.foreign.MemorySegment;
 27 import org.openjdk.jmh.annotations.*;
 28 import jdk.incubator.vector.*;
 29 
 30 import java.nio.ByteOrder;
 31 import java.util.Arrays;
 32 
 33 @State(Scope.Thread)
 34 @BenchmarkMode(Mode.Throughput)
 35 @Fork(value = 1, jvmArgsPrepend = {"--add-modules=jdk.incubator.vector"})
 36 @Warmup(iterations = 3, time = 3)
 37 @Measurement(iterations = 8, time = 2)
 38 public class Poly1305Bench {
 39 
    // Size in bytes of the message buffer allocated by setup().
    @Param({"16384", "65536"})
    private int dataSize;

    // One authenticator per vector shape under test. makePoly1305 runs the
    // known-answer tests on each instance before returning it.
    private Poly1305Vector poly1305_S128 = makePoly1305(VectorShape.S_128_BIT);
    private Poly1305Vector poly1305_S256 = makePoly1305(VectorShape.S_256_BIT);
    private Poly1305Vector poly1305_S512 = makePoly1305(VectorShape.S_512_BIT);

    // Message to authenticate; allocated in setup() and never filled, so it
    // stays all zero.
    private byte[] in;
    // Destination for the computed tag.
    private byte[] out = new byte[16];
    // Key passed to computeTag; never written, so it stays all zero.
    private byte[] key = new byte[32];
 50 
 51     private static Poly1305Vector makePoly1305(VectorShape shape) {
 52         Poly1305Vector poly = new Poly1305Vector(shape);
 53         runKAT(poly);
 54         return poly;
 55     }
 56 
 57     @Setup
 58     public void setup() {
 59         in = new byte[dataSize];
 60     }
 61 
 62     @Benchmark
 63     public void auth128() {
 64         poly1305_S128.computeTag(key, in, out);
 65     }
 66 
 67     @Benchmark
 68     public void auth256() {
 69         poly1305_S256.computeTag(key, in, out);
 70     }
 71 
 72     @Benchmark
 73     public void auth512() {
 74         poly1305_S512.computeTag(key, in, out);
 75     }
 76 
 77     private static class Poly1305Vector {
 78 
        // Number of value bits held in each limb of a multi-limb integer.
        private static final int BITS_PER_LIMB = 26;
        // Mask selecting the low BITS_PER_LIMB bits of a limb.
        private static final int LIMB_MASK = (1 << BITS_PER_LIMB) - 1;
        // Key length in bytes.
        private static final int KEY_LENGTH = 32;
        // Length in bytes of each key half; computeTag reads the "s" half from
        // key bytes [RS_LENGTH, 2 * RS_LENGTH).
        private static final int RS_LENGTH = KEY_LENGTH / 2;

        // Species for the 64-bit lanes, in the shape passed to the constructor.
        private final VectorSpecies<Long> longSpecies;
        // Species for 32-bit lanes in a shape of half the bit size, which gives
        // it the same lane count as longSpecies.
        private final VectorSpecies<Integer> intSpecies;
        // Lane count of longSpecies.
        private final int vectorWidth;
        // Message bytes consumed per vector step: 16 bytes for each lane.
        private final int parBlockCount;

        // Shuffles and mask used by computeTag to gather the even-indexed
        // (inShuffle0) and odd-indexed (inShuffle1) longs of two consecutively
        // loaded message vectors into a single vector each.
        private final VectorShuffle<Long> inShuffle0;
        private final VectorShuffle<Long> inShuffle1;
        private final VectorMask<Long> inMask;
 92 
 93         public Poly1305Vector(VectorShape shape) {
 94 
 95             this.longSpecies = VectorSpecies.of(long.class, shape);
 96             int intSize = shape.vectorBitSize() / 2;
 97             VectorShape intShape = VectorShape.forBitSize(intSize);
 98             this.intSpecies = VectorSpecies.of(int.class, intShape);
 99             this.vectorWidth = longSpecies.length();
100             this.parBlockCount = vectorWidth * 16;
101 
102             this.inShuffle0 = makeInShuffle0();
103             this.inShuffle1 = makeInShuffle1();
104             this.inMask = makeInMask();
105         }
106 
107         private VectorShuffle<Long> makeInShuffle0() {
108             int[] indexArr = new int[vectorWidth];
109             for (int i = 0; i < indexArr.length; i++) {
110                 indexArr[i] = (2 * i) % vectorWidth;
111             }
112             return VectorShuffle.fromArray(longSpecies, indexArr, 0);
113         }
114         private VectorShuffle<Long> makeInShuffle1() {
115             int[] indexArr = new int[vectorWidth];
116             for (int i = 0; i < indexArr.length; i++) {
117                 indexArr[i] = ((2 * i) % vectorWidth) + 1;
118             }
119             return VectorShuffle.fromArray(longSpecies, indexArr, 0);
120         }
121         private VectorMask<Long> makeInMask() {
122             boolean[] maskArr = new boolean[vectorWidth];
123             for (int i = vectorWidth / 2; i < vectorWidth; i++) {
124                 maskArr[i] = true;
125             }
126             return VectorMask.fromArray(longSpecies, maskArr, 0);
127         }
128 
129         private static int[] fromByteArray(byte[] buf) {
130             int[] result = new int[5];
131 
132             result[0]
133                     = (buf[0] & 0xFF)
134                     + ((buf[1] & 0xFF) << 8)
135                     + ((buf[2] & 0xFF) << 16)
136                     + ((buf[3] & 0x03) << 24);
137             result[1]
138                     = ((buf[3] & 0xFF) >> 2)
139                     + ((buf[4] & 0xFF) << 6)
140                     + ((buf[5] & 0xFF) << 14)
141                     + ((buf[6] & 0x0F) << 22);
142             result[2]
143                     = ((buf[6] & 0xFF) >> 4)
144                     + ((buf[7] & 0xFF) << 4)
145                     + ((buf[8] & 0xFF) << 12)
146                     + ((buf[9] & 0x3F) << 20);
147             result[3]
148                     = ((buf[9] & 0xFF) >> 6)
149                     + ((buf[10] & 0xFF) << 2)
150                     + ((buf[11] & 0xFF) << 10)
151                     + ((buf[12] & 0xFF) << 18);
152             result[4]
153                     = (buf[13] & 0xFF)
154                     + ((buf[14] & 0xFF) << 8)
155                     + ((buf[15] & 0xFF) << 16);
156 
157             return result;
158         }
159 
160         private static void toByteArray(long v0, long v1, long v2, long v3,
161             long v4, byte[] dst) {
162 
163             dst[0] = (byte) v0;
164             v0 >>= 8;
165             dst[1] = (byte) v0;
166             v0 >>= 8;
167             dst[2] = (byte) v0;
168             v0 >>= 8;
169             dst[3] = (byte) v0;
170 
171             dst[3] += (v1 & 0x3F) << 2;
172             v1 >>= 6;
173             dst[4] = (byte) v1;
174             v1 >>= 8;
175             dst[5] = (byte) v1;
176             v1 >>= 8;
177             dst[6] = (byte) v1;
178 
179             dst[6] += (v2 & 0xF) << 4;
180             v2 >>= 4;
181             dst[7] = (byte) v2;
182             v2 >>= 8;
183             dst[8] = (byte) v2;
184             v2 >>= 8;
185             dst[9] = (byte) v2;
186 
187             dst[9] += (v3 & 0x3) << 6;
188             v3 >>= 2;
189             dst[10] = (byte) v3;
190             v3 >>= 8;
191             dst[11] = (byte) v3;
192             v3 >>= 8;
193             dst[12] = (byte) v3;
194 
195             dst[13] = (byte) v4;
196             v4 >>= 8;
197             dst[14] = (byte) v4;
198             v4 >>= 8;
199             dst[15] = (byte) v4;
200         }
201 
202         protected static long carryValue(long x) {
203             return x >> BITS_PER_LIMB;
204         }
205 
206         public static void carryReduce(int[] r, long c0, long c1, long c2,
207             long c3, long c4) {
208 
209             long c;
210 
211             c = carryValue(c3); c3 &= LIMB_MASK; c4 += c;
212             c = carryValue(c4); c4 &= LIMB_MASK; c0 += c * 5;
213             c = carryValue(c0); c0 &= LIMB_MASK; c1 += c;
214             c = carryValue(c1); c1 &= LIMB_MASK; c2 += c;
215             c = carryValue(c2); c2 &= LIMB_MASK; c3 += c;
216             c = carryValue(c3); c3 &= LIMB_MASK; c4 += c;
217 
218             r[0] = (int) c0;
219             r[1] = (int) c1;
220             r[2] = (int) c2;
221             r[3] = (int) c3;
222             r[4] = (int) c4;
223         }
224 
225         private int[] multiply(int[] a, int[] b) {
226             int[] result = new int[5];
227 
228             long a0 = a[0];
229             long a1 = a[1];
230             long a2 = a[2];
231             long a3 = a[3];
232             long a4 = a[4];
233 
234             long c0 = (a0 * b[0]) + 5 * (a1 * b[4]) + 5 * (a2 * b[3]) +
235                 5 * (a3 * b[2]) + 5 * (a4 * b[1]);
236             long c1 = (a0 * b[1]) + (a1 * b[0]) + 5 * (a2 * b[4]) +
237                 5 * (a3 * b[3]) + 5 * (a4 * b[2]);
238             long c2 = (a0 * b[2]) + (a1 * b[1]) + (a2 * b[0]) +
239                 5 * (a3 * b[4]) + 5 * (a4 * b[3]);
240             long c3 = (a0 * b[3]) + (a1 * b[2]) + (a2 * b[1]) + (a3 * b[0]) +
241                 5 * (a4 * b[4]);
242             long c4 = (a0 * b[4]) + (a1 * b[3]) + (a2 * b[2]) + (a3 * b[1]) +
243                 (a4 * b[0]);
244 
245             carryReduce(result, c0, c1, c2, c3, c4);
246 
247             return result;
248         }
249 
250         private LongVector rPowerVec(int[][] r, long[] temp, int maxIndex,
251             int secondIndex) {
252 
253             for (int i = 0; i < temp.length; i++) {
254                 temp[i] = r[maxIndex - i][secondIndex];
255             }
256             return LongVector.fromArray(longSpecies, temp, 0);
257         }
258 
259         public void computeTag(byte[] key, byte[] msg, byte[] out) {
260 
261             byte[] keyBytes = key.clone();
262 
263             // setup key values
264             // Clamp the bytes in the "r" half of the key.
265             keyBytes[3] &= 15;
266             keyBytes[7] &= 15;
267             keyBytes[11] &= 15;
268             keyBytes[15] &= 15;
269             keyBytes[4] &= 252;
270             keyBytes[8] &= 252;
271             keyBytes[12] &= 252;
272 
273             // Create IntegerModuloP elements from the r and s values
274             int[][] r = new int[vectorWidth][];
275             r[0] = fromByteArray(keyBytes);
276             for (int i = 1; i < vectorWidth; i++) {
277                 r[i] = multiply(r[i - 1], r[0]);
278             }
279 
280             int rUpIndex = vectorWidth - 1;
281             IntVector rUp0_int = IntVector.broadcast(intSpecies, r[rUpIndex][0]);
282             IntVector rUp1_int = IntVector.broadcast(intSpecies, r[rUpIndex][1]);
283             IntVector rUp2_int = IntVector.broadcast(intSpecies, r[rUpIndex][2]);
284             IntVector rUp3_int = IntVector.broadcast(intSpecies, r[rUpIndex][3]);
285             IntVector rUp4_int = IntVector.broadcast(intSpecies, r[rUpIndex][4]);
286 
287             IntVector r5Up1_int = rUp1_int.mul(5);
288             IntVector r5Up2_int = rUp2_int.mul(5);
289             IntVector r5Up3_int = rUp3_int.mul(5);
290             IntVector r5Up4_int = rUp4_int.mul(5);
291 
292             MemorySegment msMsg = MemorySegment.ofArray(msg);
293             LongVector longMsg0 = LongVector.fromMemorySegment(longSpecies, msMsg, 0, ByteOrder.LITTLE_ENDIAN);
294             LongVector longMsg1 =
295                 LongVector.fromMemorySegment(longSpecies, msMsg, vectorWidth * 8L, ByteOrder.LITTLE_ENDIAN);
296 
297             LongVector inAlign0 =
298             longMsg0.rearrange(inShuffle0).blend(longMsg1.rearrange(inShuffle0), inMask);
299             LongVector inAlign1 =
300             longMsg0.rearrange(inShuffle1).blend(longMsg1.rearrange(inShuffle1), inMask);
301 
302             IntVector a0 = (IntVector)
303                 inAlign0.and(LIMB_MASK).castShape(intSpecies, 0);
304             IntVector a1 = (IntVector)
305                 inAlign0.lanewise(VectorOperators.LSHR,26).and(LIMB_MASK).castShape(intSpecies, 0);
306             IntVector a2 = (IntVector)
307                 inAlign0.lanewise(VectorOperators.LSHR,52).and(0xFFF).castShape(intSpecies, 0);
308             a2 = a2.or(inAlign1.and(0x3FFF).lanewise(VectorOperators.LSHL,12).castShape(intSpecies, 0));
309             IntVector a3 = (IntVector)
310                 inAlign1.lanewise(VectorOperators.LSHR,14).and(LIMB_MASK).castShape(intSpecies, 0);
311             IntVector a4 = (IntVector)
312                 inAlign1.lanewise(VectorOperators.LSHR,40).and(0xFFFFFF).castShape(intSpecies, 0);
313             a4 = a4.or(1 << 24);
314 
315             int numParBlocks = msg.length / parBlockCount - 1;
316             for (int i = 0; i < numParBlocks; i++) {
317 
318                 // multiply and reduce
319                 LongVector c0 = (LongVector)
320                     a0.castShape(longSpecies, 0).mul(rUp0_int.castShape(longSpecies, 0))
321                     .add(a1.castShape(longSpecies, 0).mul(r5Up4_int.castShape(longSpecies, 0)))
322                     .add(a2.castShape(longSpecies, 0).mul(r5Up3_int.castShape(longSpecies, 0)))
323                     .add(a3.castShape(longSpecies, 0).mul(r5Up2_int.castShape(longSpecies, 0)))
324                     .add(a4.castShape(longSpecies, 0).mul(r5Up1_int.castShape(longSpecies, 0)));
325 
326                 LongVector c1 = (LongVector)
327                     a0.castShape(longSpecies, 0).mul(rUp1_int.castShape(longSpecies, 0))
328                     .add(a1.castShape(longSpecies, 0).mul(rUp0_int.castShape(longSpecies, 0)))
329                     .add(a2.castShape(longSpecies, 0).mul(r5Up4_int.castShape(longSpecies, 0)))
330                     .add(a3.castShape(longSpecies, 0).mul(r5Up3_int.castShape(longSpecies, 0)))
331                     .add(a4.castShape(longSpecies, 0).mul(r5Up2_int.castShape(longSpecies, 0)));
332 
333                 LongVector c2 = (LongVector)
334                     a0.castShape(longSpecies, 0).mul(rUp2_int.castShape(longSpecies, 0))
335                     .add(a1.castShape(longSpecies, 0).mul(rUp1_int.castShape(longSpecies, 0)))
336                     .add(a2.castShape(longSpecies, 0).mul(rUp0_int.castShape(longSpecies, 0)))
337                     .add(a3.castShape(longSpecies, 0).mul(r5Up4_int.castShape(longSpecies, 0)))
338                     .add(a4.castShape(longSpecies, 0).mul(r5Up3_int.castShape(longSpecies, 0)));
339 
340                 LongVector c3 = (LongVector)
341                     a0.castShape(longSpecies, 0).mul(rUp3_int.castShape(longSpecies, 0))
342                     .add(a1.castShape(longSpecies, 0).mul(rUp2_int.castShape(longSpecies, 0)))
343                     .add(a2.castShape(longSpecies, 0).mul(rUp1_int.castShape(longSpecies, 0)))
344                     .add(a3.castShape(longSpecies, 0).mul(rUp0_int.castShape(longSpecies, 0)))
345                     .add(a4.castShape(longSpecies, 0).mul(r5Up4_int.castShape(longSpecies, 0)));
346 
347                 LongVector c4 = (LongVector)
348                     a0.castShape(longSpecies, 0).mul(rUp4_int.castShape(longSpecies, 0))
349                     .add(a1.castShape(longSpecies, 0).mul(rUp3_int.castShape(longSpecies, 0)))
350                     .add(a2.castShape(longSpecies, 0).mul(rUp2_int.castShape(longSpecies, 0)))
351                     .add(a3.castShape(longSpecies, 0).mul(rUp1_int.castShape(longSpecies, 0)))
352                     .add(a4.castShape(longSpecies, 0).mul(rUp0_int.castShape(longSpecies, 0)));
353 
354                 // carry/reduce
355                 // Note: this carry/reduce sequence might not be correct
356                 c4 = c4.add(c3.lanewise(VectorOperators.LSHR, BITS_PER_LIMB));
357                 c3 = c3.and(LIMB_MASK);
358                 c0 = c0.add(c4.lanewise(VectorOperators.LSHR, BITS_PER_LIMB).mul(5));
359                 c4 = c4.and(LIMB_MASK);
360                 c1 = c1.add(c0.lanewise(VectorOperators.LSHR, BITS_PER_LIMB));
361                 c0 = c0.and(LIMB_MASK);
362                 c2 = c2.add(c1.lanewise(VectorOperators.LSHR, BITS_PER_LIMB));
363                 c1 = c1.and(LIMB_MASK);
364                 c3 = c3.add(c2.lanewise(VectorOperators.LSHR, BITS_PER_LIMB));
365                 c2 = c2.and(LIMB_MASK);
366                 c4 = c4.add(c3.lanewise(VectorOperators.LSHR, BITS_PER_LIMB));
367                 c3 = c3.and(LIMB_MASK);
368 
369                 a0 = (IntVector) c0.castShape(intSpecies, 0);
370                 a1 = (IntVector) c1.castShape(intSpecies, 0);
371                 a2 = (IntVector) c2.castShape(intSpecies, 0);
372                 a3 = (IntVector) c3.castShape(intSpecies, 0);
373                 a4 = (IntVector) c4.castShape(intSpecies, 0);
374 
375                 // fromByteArray and add next part of message
376                 int start = parBlockCount * (i + 1);
377 
378                 longMsg0 = LongVector.fromMemorySegment(longSpecies, msMsg, start, ByteOrder.LITTLE_ENDIAN);
379                 longMsg1 = LongVector.fromMemorySegment(longSpecies, msMsg,
380                     start + vectorWidth * 8L, ByteOrder.LITTLE_ENDIAN);
381 
382                 inAlign0 =
383                         longMsg0.rearrange(inShuffle0).blend(longMsg1.rearrange(inShuffle0), inMask);
384                 inAlign1 =
385                         longMsg0.rearrange(inShuffle1).blend(longMsg1.rearrange(inShuffle1), inMask);
386 
387                 IntVector in0 = (IntVector)
388                     inAlign0.and(LIMB_MASK).castShape(intSpecies, 0);
389                 IntVector in1 = (IntVector)
390                     inAlign0.lanewise(VectorOperators.LSHR, 26).and(LIMB_MASK).castShape(intSpecies, 0);
391                 IntVector in2 = (IntVector)
392                     inAlign0.lanewise(VectorOperators.LSHR, 52).and(0xFFF).castShape(intSpecies, 0);
393                 in2 = in2.or(inAlign1.and(0x3FFF).lanewise(VectorOperators.LSHL, 12).castShape(intSpecies, 0));
394                 IntVector in3 = (IntVector)
395                     inAlign1.lanewise(VectorOperators.LSHR, 14).and(LIMB_MASK).castShape(intSpecies, 0);
396                 IntVector in4 = (IntVector)
397                     inAlign1.lanewise(VectorOperators.LSHR, 40).and(0xFFFFFF).castShape(intSpecies, 0);
398                 in4 = in4.or(1 << 24);
399 
400                 a0 = a0.add(in0);
401                 a1 = a1.add(in1);
402                 a2 = a2.add(in2);
403                 a3 = a3.add(in3);
404                 a4 = a4.add(in4);
405             }
406 
407             // multiply by powers of r
408             long[] rTemp = new long[vectorWidth];
409             LongVector rFin0 = rPowerVec(r, rTemp, rUpIndex, 0);
410             LongVector rFin1 = rPowerVec(r, rTemp, rUpIndex, 1);
411             LongVector rFin2 = rPowerVec(r, rTemp, rUpIndex, 2);
412             LongVector rFin3 = rPowerVec(r, rTemp, rUpIndex, 3);
413             LongVector rFin4 = rPowerVec(r, rTemp, rUpIndex, 4);
414 
415             LongVector r5Fin_1 = rFin1.mul(5);
416             LongVector r5Fin_2 = rFin2.mul(5);
417             LongVector r5Fin_3 = rFin3.mul(5);
418             LongVector r5Fin_4 = rFin4.mul(5);
419 
420             LongVector c0 = (LongVector) a0.castShape(longSpecies, 0).mul(rFin0)
421                 .add(a1.castShape(longSpecies, 0).mul(r5Fin_4))
422                 .add(a2.castShape(longSpecies, 0).mul(r5Fin_3))
423                 .add(a3.castShape(longSpecies, 0).mul(r5Fin_2))
424                 .add(a4.castShape(longSpecies, 0).mul(r5Fin_1));
425             LongVector c1 = (LongVector) a0.castShape(longSpecies, 0).mul(rFin1)
426                 .add(a1.castShape(longSpecies, 0).mul(rFin0))
427                 .add(a2.castShape(longSpecies, 0).mul(r5Fin_4))
428                 .add(a3.castShape(longSpecies, 0).mul(r5Fin_3))
429                 .add(a4.castShape(longSpecies, 0).mul(r5Fin_2));
430             LongVector c2 = (LongVector) a0.castShape(longSpecies, 0).mul(rFin2)
431                 .add(a1.castShape(longSpecies, 0).mul(rFin1))
432                 .add(a2.castShape(longSpecies, 0).mul(rFin0))
433                 .add(a3.castShape(longSpecies, 0).mul(r5Fin_4))
434                 .add(a4.castShape(longSpecies, 0).mul(r5Fin_3));
435             LongVector c3 = (LongVector) a0.castShape(longSpecies, 0).mul(rFin3)
436                 .add(a1.castShape(longSpecies, 0).mul(rFin2))
437                 .add(a2.castShape(longSpecies, 0).mul(rFin1))
438                 .add(a3.castShape(longSpecies, 0).mul(rFin0))
439                 .add(a4.castShape(longSpecies, 0).mul(r5Fin_4));
440             LongVector c4 = (LongVector) a0.castShape(longSpecies, 0).mul(rFin4)
441                 .add(a1.castShape(longSpecies, 0).mul(rFin3))
442                 .add(a2.castShape(longSpecies, 0).mul(rFin2))
443                 .add(a3.castShape(longSpecies, 0).mul(rFin1))
444                 .add(a4.castShape(longSpecies, 0).mul(rFin0));
445 
446             c4 = c4.add(c3.lanewise(VectorOperators.LSHR, BITS_PER_LIMB));
447             c3 = c3.and(LIMB_MASK);
448             c0 = c0.add(c4.lanewise(VectorOperators.LSHR, BITS_PER_LIMB).mul(5));
449             c4 = c4.and(LIMB_MASK);
450             c1 = c1.add(c0.lanewise(VectorOperators.LSHR, BITS_PER_LIMB));
451             c0 = c0.and(LIMB_MASK);
452             c2 = c2.add(c1.lanewise(VectorOperators.LSHR, BITS_PER_LIMB));
453             c1 = c1.and(LIMB_MASK);
454             c3 = c3.add(c2.lanewise(VectorOperators.LSHR, BITS_PER_LIMB));
455             c2 = c2.and(LIMB_MASK);
456             c4 = c4.add(c3.lanewise(VectorOperators.LSHR, BITS_PER_LIMB));
457             c3 = c3.and(LIMB_MASK);
458 
459             a0 = (IntVector) c0.castShape(intSpecies, 0);
460             a1 = (IntVector) c1.castShape(intSpecies, 0);
461             a2 = (IntVector) c2.castShape(intSpecies, 0);
462             a3 = (IntVector) c3.castShape(intSpecies, 0);
463             a4 = (IntVector) c4.castShape(intSpecies, 0);
464 
465             // collect lanes and calculate tag
466             long a0Fin = a0.reduceLanes(VectorOperators.ADD);
467             long a1Fin = a1.reduceLanes(VectorOperators.ADD);
468             long a2Fin = a2.reduceLanes(VectorOperators.ADD);
469             long a3Fin = a3.reduceLanes(VectorOperators.ADD);
470             long a4Fin = a4.reduceLanes(VectorOperators.ADD);
471 
472             // carry/reduce the result
473             a4Fin = a4Fin + (a3Fin >>> BITS_PER_LIMB);
474             a3Fin = a3Fin & LIMB_MASK;
475             a0Fin = a0Fin + ((a4Fin >>> BITS_PER_LIMB) * 5);
476             a4Fin = a4Fin & LIMB_MASK;
477             a1Fin = a1Fin + (a0Fin >>> BITS_PER_LIMB);
478             a0Fin = a0Fin & LIMB_MASK;
479             a2Fin = a2Fin + (a1Fin >>> BITS_PER_LIMB);
480             a1Fin = a1Fin & LIMB_MASK;
481             a3Fin = a3Fin + (a2Fin >>> BITS_PER_LIMB);
482             a2Fin = a2Fin & LIMB_MASK;
483             a4Fin = a4Fin + (a3Fin >>> BITS_PER_LIMB);
484             a3Fin = a3Fin & LIMB_MASK;
485 
486             byte[] s_arr =
487                 Arrays.copyOfRange(keyBytes, RS_LENGTH, 2 * RS_LENGTH);
488             int[] s = fromByteArray(s_arr);
489 
490             // Add in the s-half of the key to the accumulator
491             a0Fin += s[0];
492             a1Fin += s[1];
493             a2Fin += s[2];
494             a3Fin += s[3];
495             a4Fin += s[4];
496 
497             // final carry mod 2^130
498             a1Fin = a1Fin + (a0Fin >> BITS_PER_LIMB);
499             a0Fin = a0Fin & LIMB_MASK;
500             a2Fin = a2Fin + (a1Fin >> BITS_PER_LIMB);
501             a1Fin = a1Fin & LIMB_MASK;
502             a3Fin = a3Fin + (a2Fin >> BITS_PER_LIMB);
503             a2Fin = a2Fin & LIMB_MASK;
504             a4Fin = a4Fin + (a3Fin >> BITS_PER_LIMB);
505             a3Fin = a3Fin & LIMB_MASK;
506             a4Fin = a4Fin & LIMB_MASK;
507 
508             // put result in buffer
509             toByteArray(a0Fin, a1Fin, a2Fin, a3Fin, a4Fin, out);
510         }
511     }
512 
513 
514     private static byte[] hexStringToByteArray(String str) {
515         byte[] result = new byte[str.length() / 2];
516         for (int i = 0; i < result.length; i++) {
517             result[i] = (byte) Character.digit(str.charAt(2 * i), 16);
518             result[i] <<= 4;
519             result[i] += Character.digit(str.charAt(2 * i + 1), 16);
520         }
521         return result;
522     }
523 
524     public static String byteArrayToHexString(byte[] arr) {
525         StringBuilder result = new StringBuilder();
526         for (int i = 0; i < arr.length; ++i) {
527             byte curVal = arr[i];
528             result.append(Character.forDigit(curVal >> 4 & 0xF, 16));
529             result.append(Character.forDigit(curVal & 0xF, 16));
530         }
531         return result.toString();
532     }
533 
534     private static void kat(Poly1305Vector poly1305, String key, String msg,
535         String expectedTag) {
536 
537         kat(poly1305, hexStringToByteArray(key), hexStringToByteArray(msg),
538             hexStringToByteArray(expectedTag));
539     }
540 
541     private static void kat(Poly1305Vector poly1305, byte[] key, byte[] msg,
542         byte[] expectedTag) {
543 
544         byte[] tag = new byte[expectedTag.length];
545         poly1305.computeTag(key, msg, tag);
546         if (!Arrays.equals(tag, expectedTag)) {
547             throw new RuntimeException(
548                     "bad tag: " + byteArrayToHexString(tag) +
549                     " expected: " + byteArrayToHexString(expectedTag));
550         }
551     }
552 
553     /*
554      * Poly1305 Known Answer Tests to ensure that the implementation is correct.
555      */
556     private static void runKAT(Poly1305Vector poly1305) {
557         kat(poly1305,
558             "d212b886dd4682a41f1759e6c5aef84760e5a63d4423ca7d1fb5c7ecfc5dac27",
559             "5d2ad39e2a7b0bc5f375488643acf391188d01ad936971457427bc053c4262a1" +
560             "598532850def8573213c5f79fa736703c57c03ec49b55617210998c8af408698" +
561             "866632a7ecf7e9a688605cbca919e17e2badd090a7a6d83ad90be0617fa44642" +
562             "cc9a1ca38514a026cbea51c287ec0b56719fc61183c88e9450ba85aa8ab7d390",
563             "7ccdfa8e82df540276e8172f705adce2");
564 
565         kat(poly1305,
566             "2b0b684c86910104aee1d261ac4d5a0f5443b4b7746cf7f8ba03921d273f6a9b",
567             "027b359f44a5d60f81073ceb74749207742529dcefa4a26a1817db2c8d50ba2b" +
568             "d9e170cd1930946872d95e4eae41389f362087871a749897e0fbe42494e6f0b3" +
569             "8db01e2059510b6fda4f422ce7d226433ba00940e1761baaff80d9b8f3a61d11" +
570             "a109e6082d231cf85aa718199e6eaaaf07bad562469ef1b8e639c727967bf6da" +
571             "bcd16fcb0fc102095325e2fac92e599e81c26900df1deb7b0a0b5c321a658024" +
572             "26506740509ece646fecf33a517b66e57577372156aae85765c6b473521d1019" +
573             "4f5fbe0e932cfee716e1d41c9154fb8e15b82ab7e807fb54f3d7d3e4c589cc9a" +
574             "492d17ea4fd27894fa9d22a9db6d5df674cd1e97e7e8758a360291f22dfe1cc3",
575             "84ca3a778faf0ab9f840fe5fb38ace27");
576 
577         kat(poly1305,
578             "870c6fa7da2eadb845ac8b0eeaed4cf856eca67bf96b64a29a2e6a881821fa8a",
579             "ff1a3b67a4f575be5f05c4054e4c7365838c2cbe786ba78900c8b43f197c3c4d" +
580             "120432a287e434669af579bcd56f3320e54d2f97a306f917f2f41b1c97cc69db" +
581             "4ac2051adccd687fa89f92504d1ab5c3006681d846c8051aabccca0024ef5ec4" +
582             "c43b8701ffc9d14fef8d55e229ed210a2b9bde996f5d7b545d15e1fc32764604" +
583             "b2a0384dd173aa800b7526c8ff397c05130bb6a1f2194968adaef6979b023cd8" +
584             "d9195d2739351c7e4ac6c43508634f813641f669e78cbcf732ccb1321a2cd2c4" +
585             "14c7df5b9ea3408f2e12fbf3a3cbdb98699dd5402725ec25f9fff9bcd0f93cb3" +
586             "cf0dac016fec41a5ef3ae8b8d258a09f530ad14ad2e52186041592eac0ea22ff" +
587             "8c3751009b516c60f7764cccbb816394ef035bd8cb2a38d5c6b9229e528e56cd" +
588             "62600b5219b64212642384e628f01d790eeef4963a7d1a63a9faff79d4acfa09" +
589             "78b58b0b623ae89389661aa408b16814d3baaca20978dce6888c3365f4ffd2fa" +
590             "8f031a44f2e870a06da21d7becf450d335e1386268bc189435e7955a477bc368",
591             "ff4e0ee6feb1c6a57e638a79fafc7c60");
592 
593         kat(poly1305,
594             "c27987ae88a833ae2ea90371b2e257c15773da3bc34516b6b075446e1f844a81",
595             "64e5a2e2940b173c7103ae931ced302a8f8c778f4e5c0b3677c51552655005d8" +
596             "504b724107e7262448c94db83fc9c6a2a26fc973360dce15c0553b73bb733d3e" +
597             "f61fcba8977e76c32523b80c3b45b1226b23ee17522f9b677880c69b356917ae" +
598             "3c792a0c5b0c77b90dfa51483626323b7a73fffb1b128c595d553bf62a8f5bb9" +
599             "fa48b4a850a932481bf607e8da84730c9052bba9316ec7eb84007a4eb5cbed5c" +
600             "7c67ef32d4c5cb6cfbccd738d239857c240de6d3d4e5af14d480feb63541e5d8" +
601             "036e088b2e32431e6fe0c4d3505aebe2e14bd02b6b15325f89aef048cd1236db" +
602             "4461a59304b7c61ece2c52ef8ac4cf2326e6aaff013494b1b191be4ae4381f57" +
603             "e72b947ee23d0a528087db9338bc28c68484929fc3436995b2083b06a765ceb7" +
604             "09e9dd41ba896d99832d6851189766e844137d9a83d2890bc2be7afc82f9ebb8" +
605             "bafe08ef5f7ff0cce9a1d08e6b797a17df04731f384a34b16e72e9f2ab070114" +
606             "8008945509fe378658dc51eb752248f48364be327cd1b6bd148c518a976ae95d" +
607             "d391f3b0d447251988c7e77400c9d44395b8f9f10cbc442a6804d0ad83e8c3e3" +
608             "9fa09c2140fac143c90f09a7d907c57e29b528d54c8bd927f39aee2cec671213" +
609             "c50fe657b29682d57a419e3e52dafb348cbe44b6c17e4be18f5c5e411734fcfc" +
610             "99b9ca26f29a21cc93374ef1bfa86ca2bb3be76b94b4ef69ec790c968a51e4d0",
611             "d4c09727f68fa3beb57ce9e74205b652");
612 
613         kat(poly1305,
614             "2593adf2efc0e49c7fde0d45de4f7a21ceb76df45c0e5a917ef1f6b7fbf4fb7e",
615             "23665b9a6d4f04c9d58347d32d64d4cbf8d4ca993a8bb73758e6eb2db9f97096" +
616             "d0f00aca8cb16460d2bc15c0136fa92482602f47b3ec78244c4dc619d9b28afa" +
617             "19b063c196bcff848eff179102cce29dfcc58bf90a2f6311e6d021e2573ccbb4" +
618             "4e06947167c9865127c0b7362196523f97c8157058f7aebff475f77e23393dc1" +
619             "a3031bbaf31270db3eadc00cca6ae073aa53160d095afdce0a202de8a23d9a38" +
620             "b0bed20cbe64e1ec77fc13ba0cfc9be52edb70475bf1aeaaff25e11f2e0ae47f" +
621             "f23cbd4a6219d276fcc6c8f12a739f11434c86d4e24397f96ef6e36d9195fa8a" +
622             "48eb55990d69feacfb754b5019a3ebafa98d5544077b46c136cc11de8ee7f8a6" +
623             "76a6696600088696233f4e8f060ba8a64890fb638469639bfb727ed758c36250" +
624             "a553b7ce1115509f2bb19c13cea87003a8eff45ce9e1cff0a21ba5ae19226d50" +
625             "e108db212a588e5f4c502468859b9b607922c3311b5d912bd9400e696d7debbb" +
626             "9ac5454cc7d0f95fc242c491f095a02f0d3bd7ead0f0b7358c9b1d85e4e9ab75" +
627             "24bb43867c94a21a4e0db6470a210c9dd937e4801396bd687127fa7c83014c85" +
628             "372553c56dfd6cd9b75fa10483aea825f8e3fa53c6bf17467e37c2e7439ed0ea" +
629             "6fb24d13d428965c44f1ac943c7bc77fa84711c91b41f5ee6d9a7d9091648a96" +
630             "cc7c261d7fc5d964446d1e3dcc41d32ecaa8d7791b8462563fcf7f96cd1d11d4" +
631             "34923e0150321356866f5bdafebc96f2661bfd3c1f104e96b6492cafcbe25fc6" +
632             "ec0c92a3bbec7328e1905d5951fae04625a2452f596027a5d9c64eed55165c8a" +
633             "23bc3f944b4fa9c7ad83ebc1777c7153d5de13d04c0a12e774b17906a62f5134" +
634             "685c2de31da08bd04840299fd62d56ffe95248365034e7ba95961cebf0542b24",
635             "b9f68b0996caf5135136b10b37fe5f81");
636 
637         kat(poly1305,
638             "e9c8c78bc0ad5751f094fd4657fe5ef2a3c232f6930eef3431cde76659f04210",
639             "914e57a2745fd475d7b8f982483fe11a05d7b55853239112d5ae99616c718b3c" +
640             "4a0c2d05e3ca1df509614c0fe051b414d404149ec422e0998e192e51518518c4" +
641             "b8acd9e3e3ff9f3b4ef931d3052755785d38e75821ceefa7da0bfe3f1fb2dc6d" +
642             "738e2a2332e53ce77d44547621bb7aa724dd8805c7c795088db865d6b13d9b3e" +
643             "8acec846efb072d105ab6e599f8292a7601087e0ba13af9f503dcfd426e26e4d" +
644             "fb22bf5a1ff1a82d67d9bd8871e6adc17aa39d221f2865f81da9ed566192c269" +
645             "3c85f0442924e603b9ae54b88dd0f21e92eedc40c08dd484c552e297894eeee8" +
646             "b5acc91d5ae16f56257bb0836b48e1a8fa72e83a8b10b7026a7f466c8b08eac5" +
647             "4359b70e639117cf688e263b891f004db94d77941380f3ab0559538c9398c859" +
648             "b76d2bbcd6b635e753160583e7adc263097a80520d003514e134a21597c1ec57" +
649             "55da3a70acc6951b4d4d81e98b9eb962d9e3bc37d5e8ebd61e2a3f61cc452a65" +
650             "56571e12c190d4e3d0f8cc61ffcb60324b4a6987e7375a832ff807682e0b4595" +
651             "66ef1f765638f3d2e837ed43ce2c1c7837f271c866908d865c3d9174fd4f8056" +
652             "265abfb88fbc207db7a12c0a0ad035e5a728725e98cb682d41fd0bcf3aef2fd7" +
653             "ab261727f310fc7cf3b34286c9e9ee235995315167191f3b4d77e5642fb57dbd" +
654             "fdb5ccadefc5d03866918ab1a3eff54b405d8946e2b0c2fa444d1b2be4c3d41d" +
655             "990515e7534190d66d10e38c36c5d3df0315db85ba10c924bef97d1faa07a5f8" +
656             "f04998a7d38689237a1912bea3f821357d8383d7c5cfa66ba5965b5a94bb702c" +
657             "e6583e59879021139355c5b90e0f9cd13b34f3357ffde404bbf34c97f9fd55b5" +
658             "53e42d8a6b370eded02c8a5221e15db701da56918412520e12fd1ef9f4748647" +
659             "858488d5e0abd5b9e01457768907e1d24581f9591771304192711292e4025fce" +
660             "bd92adb2297e9496852f80bd78578bbdb292ca209f7584ff76e9eb66ec8a111e" +
661             "add30dc7ef364c4f1339312f226fe0cfa7a5b1602417e469cf2c8e3874c51232" +
662             "00f2d90dbe7f3c3ff5c6c6484052a80eb6229a6ed6176ad600da185da624bea6",
663             "c1de44dd8ea245ca43e5587460feb514");
664     }
665 }