1 /*
2 * Copyright (c) 2018, 2024, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation. Oracle designates this
8 * particular file as subject to the "Classpath" exception as provided
9 * by Oracle in the LICENSE file that accompanied this code.
10 *
11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22 * or visit www.oracle.com if you need additional information or have any
23 * questions.
24 */
25 package org.openjdk.bench.jdk.incubator.vector.operation;
26
27 import jdk.incubator.vector.*;
28
29 import org.openjdk.jmh.annotations.*;
30
31 import java.util.concurrent.TimeUnit;
32
33 /**
34 * Population count algorithms from "Faster Population Counts Using AVX2 Instructions", 2018 by Mula, Kurz, Lemire
35 */
36 @BenchmarkMode(Mode.Throughput)
37 @Warmup(iterations = 3, time = 1)
38 @Measurement(iterations = 5, time = 1)
39 @OutputTimeUnit(TimeUnit.MILLISECONDS)
40 @State(Scope.Benchmark)
41 @Fork(value = 1, jvmArgsPrepend = {"--add-modules=jdk.incubator.vector"})
42 public class PopulationCount extends AbstractVectorBenchmark {
43 @Param({"64", "1024", "65536"})
44 int size;
45
46 private long[] data;
47
48 @Setup
49 public void init() {
50 data = fillLong(size, i -> RAND.nextLong());
51 // data = fillLong(size, i -> 0L);
52 // data = fillLong(size, i -> -1L);
53
54 checkConsistency();
55 }
56
57 @TearDown
58 public void tearDown() {
59 checkConsistency();
60 }
61
62 void checkConsistency() {
63 long popCount = longBitCount();
64 assert popCount == treeOfAdders();
65 assert popCount == WilkesWheelerGill();
66 assert popCount == Wegner();
67 assert popCount == Lauradoux();
68 assert popCount == HarleySeal();
69 assert popCount == Mula128();
70 assert popCount == Mula256();
71 assert popCount == HarleySeal256();
72 }
73
74 long tail(int upper) {
75 long acc = 0;
76 for (int i = upper; i < data.length; i++) {
77 acc += Long.bitCount(data[i]);
78 }
79 return acc;
80 }
81
82 @Benchmark
83 public long longBitCount() {
84 long acc = 0;
85 for (int i = 0; i < data.length; i++) {
86 acc += Long.bitCount(data[i]);
87 }
88 return acc;
89 }
90
91 /* ============================================================================================================== */
92
93 // FIGURE 4. The Wegner function in C
94
95 long popcntWegner(long x) {
96 int v = 0;
97 while (x != 0) {
98 x &= x - 1;
99 v++;
100 }
101 return v;
102 }
103
104 @Benchmark
105 public long Wegner() {
106 long acc = 0;
107 for (int i = 0; i < data.length; i++) {
108 acc += popcntWegner(data[i]);
109 }
110 return acc;
111 }
112
113 /* ============================================================================================================== */
114
115 // FIGURE 2. A naive tree-of-adders function in C
116
117 static long popcntTree(long x) {
118 long c1 = 0x5555555555555555L;
119 long c2 = 0x3333333333333333L;
120 long c4 = 0x0F0F0F0F0F0F0F0FL;
121 long c8 = 0x00FF00FF00FF00FFL;
122 long c16 = 0x0000FFFF0000FFFFL;
123 long c32 = 0x00000000FFFFFFFFL;
124
125 x = (x & c1) + ((x >>> 1) & c1);
126 x = (x & c2) + ((x >>> 2) & c2);
127 x = (x & c4) + ((x >>> 4) & c4);
128 x = (x & c8) + ((x >>> 8) & c8);
129 x = (x & c16) + ((x >>> 16) & c16);
130 x = (x & c32) + ((x >>> 32) & c32);
131 return x;
132 }
133
134 @Benchmark
135 public long treeOfAdders() {
136 long acc = 0;
137 for (int i = 0; i < data.length; i++) {
138 acc += popcntTree(data[i]);
139 }
140 return acc;
141 }
142
143 /* ============================================================================================================== */
144
145 // FIGURE 3. The Wilkes-Wheeler-Gill function in C
146
147 static long popcntWWG(long x) {
148 long c1 = 0x5555555555555555L;
149 long c2 = 0x3333333333333333L;
150 long c4 = 0x0F0F0F0F0F0F0F0FL;
151
152 x -= (x >>> 1) & c1;
153 x = (( x >>> 2) & c2) + (x & c2) ;
154 x = ( x + (x >>> 4) ) & c4;
155 x *= 0x0101010101010101L;
156 x = x >>> 56;
157 return x;
158 }
159
160 @Benchmark
161 public long WilkesWheelerGill() {
162 long acc = 0;
163 for (int i = 0; i < data.length; i++) {
164 acc += popcntWWG(data[i]);
165 }
166 return acc;
167 }
168
169 /* ============================================================================================================== */
170
171 // FIGURE 5. The Lauradoux population count in C for sets of 12 words.
172
173 static long parallelPopcnt(long count1, long count2, long count3) {
174 long m1 = 0x5555555555555555L;
175 long m2 = 0x3333333333333333L;
176 long m4 = 0x0F0F0F0F0F0F0F0FL;
177
178 long half1 = (count3 ) & m1;
179 long half2 = (count3 >>> 1) & m1;
180
181 count1 -= (count1 >>> 1) & m1;
182 count2 -= (count2 >>> 1) & m1;
183 count1 += half1;
184 count2 += half2;
185 count1 = (count1 & m2) + (( count1 >>> 2) & m2);
186 count1 += (count2 & m2) + (( count2 >>> 2) & m2);
187 return (count1 & m4) + (( count1 >>> 4) & m4);
188 }
189
190 static long reduce(long acc) {
191 long m8 = 0x00FF00FF00FF00FFL;
192 long m16 = 0x0000FFFF0000FFFFL;
193 long m32 = 0x00000000FFFFFFFFL;
194
195 acc = (acc & m8) + (( acc >>> 8) & m8);
196 acc = (acc + (acc >>> 16) ) & m16;
197 acc = (acc & m32) + (acc >>> 32);
198 return acc;
199 }
200
201 static long popcntLauradoux(long[] xs, int off) {
202 long acc = 0;
203 for (int j = off; j < off+12; j += 3) {
204 acc += parallelPopcnt(xs[j+0], xs[j+1], xs[j+2]);
205 }
206 return reduce(acc);
207 }
208
209 @Benchmark
210 public long Lauradoux() {
211 long acc = 0;
212 int upper = data.length - (data.length % 12);
213 for (int i = 0; i < upper; i += 12) {
214 acc += popcntLauradoux(data, i);
215 }
216 return acc + tail(upper);
217 }
218
219 /* ============================================================================================================== */
220
221 // FIGURE 6. A C function implementing a bitwise parallel carry-save adder (CSA). Given three input words a, b, c, it
222 // generates two new words h, l in which each bit represents the high and low bits in the bitwise sum of the bits from a,
223 // b, and c.
224
225 static long csaLow(long a, long b, long c) {
226 long u = a ^ b;
227 long lo = u ^ c;
228 return lo;
229 }
230
231 static long csaHigh(long a, long b, long c) {
232 long u = a ^ b;
233 long hi = (a & b) | (u & c) ;
234 return hi;
235 }
236
237 // FIGURE 8. A C function implementing the Harley-Seal
238 // population count over an array of 64-bit words. The count
239 // function could be the Wilkes-Wheeler-Gill function.
240 @Benchmark
241 public long HarleySeal() {
242 long total = 0, ones = 0, twos = 0, fours = 0, eights = 0, sixteens = 0;
243 long twosA = 0, twosB = 0;
244 long foursA = 0, foursB = 0;
245 long eightsA = 0, eightsB = 0;
246
247 int step = 16;
248 int upper = data.length - (data.length % step);
249 for (int i = 0; i < upper; i += step) {
250 // CSA(&twosA, &ones, ones, d[i+0], d[i +1]);
251 twosA = csaHigh(ones, data[i+0], data[i+1]);
252 ones = csaLow(ones, data[i+0], data[i+1]);
253
254 // CSA(&twosB, &ones, ones, d[i+2], d[i+3]);
255 twosB = csaHigh(ones, data[i+2], data[i+3]);
256 ones = csaLow(ones, data[i+2], data[i+3]);
257
258 // CSA(&foursA, &twos, twos, twosA, twosB);
259 foursA = csaHigh(twos, twosA, twosB);
260 twos = csaLow(twos, twosA, twosB);
261
262 // ====================================
263
264 // CSA(&twosA, &ones, ones, d[i+4], d[i+5]);
265 twosA = csaHigh(ones, data[i+4], data[i+5]);
266 ones = csaLow(ones, data[i+4], data[i+5]);
267
268 // CSA(&twosB, &ones, ones, d[i+6], d[i+7]);
269 twosB = csaHigh(ones, data[i+6], data[i+7]);
270 ones = csaLow(ones, data[i+6], data[i+7]);
271
272 // CSA(&foursB, &twos, twos, twosA, twosB);
273 foursB = csaHigh(twos, twosA, twosB);
274 twos = csaLow(twos, twosA, twosB);
275
276 // ====================================
277
278 // CSA(&eightsA, &fours, fours, foursA, foursB);
279 eightsA = csaHigh(fours, foursA, foursB);
280 fours = csaLow(fours, foursA, foursB);
281
282 // ====================================
283
284 // CSA(&twosA, &ones, ones, d[i+8], d[i+9]);
285 twosA = csaHigh(ones, data[i+8], data[i+9]);
286 ones = csaLow(ones, data[i+8], data[i+9]);
287
288 // CSA(&twosB, &ones, ones, d[i+10],d[i+11]);
289 twosB = csaHigh(ones, data[i+10], data[i+11]);
290 ones = csaLow(ones, data[i+10], data[i+11]);
291
292 // CSA(&foursA, &twos, twos, twosA, twosB);
293 foursA = csaHigh(twos, twosA, twosB);
294 twos = csaLow(twos, twosA, twosB);
295
296 // ====================================
297
298 // CSA(&twosA, &ones, ones, d[i+12], d[i +13]);
299 twosA = csaHigh(ones, data[i+12], data[i+13]);
300 ones = csaLow(ones, data[i+12], data[i+13]);
301
302 // CSA(&twosB, &ones, ones, d[i+14], d[i +15]);
303 twosB = csaHigh(ones, data[i+14], data[i+15]);
304 ones = csaLow(ones, data[i+14], data[i+15]);
305
306 // ====================================
307
308 // CSA(&foursB, &twos, twos, twosA, twosB);
309 foursB = csaHigh(twos, twosA, twosB);
310 twos = csaLow(twos, twosA, twosB);
311
312 // CSA(&eightsB, &fours, fours, foursA, foursB);
313 eightsB = csaHigh(fours, foursA, foursB);
314 fours = csaLow(fours, foursA, foursB);
315
316 // ====================================
317
318 // CSA(&sixteens, &eights, eights, eightsA, eightsB);
319 sixteens = csaHigh(eights, eightsA, eightsB);
320 eights = csaLow(eights, eightsA, eightsB);
321
322 total += Long.bitCount(sixteens);
323 }
324 total = 16 * total
325 + 8 * Long.bitCount(eights)
326 + 4 * Long.bitCount(fours)
327 + 2 * Long.bitCount(twos)
328 + 1 * Long.bitCount(ones);
329
330 return total + tail(upper);
331 }
332
333 /* ============================================================================================================== */
334
335 // FIGURE 9. A C function using SSE intrinsics implementing Mula's algorithm to compute sixteen population counts,
336 // corresponding to sixteen input bytes.
337
338 static final ByteVector MULA128_LOOKUP = IntVector.fromArray(I128,
339 new int[]{
340 0x02_01_01_00, // 0, 1, 1, 2,
341 0x03_02_02_01, // 1, 2, 2, 3,
342 0x03_02_02_01, // 1, 2, 2, 3,
343 0x04_03_03_02 // 2, 3, 3, 4
344 },
345 0
346 ).reinterpretAsBytes();
347
348 ByteVector popcntB128(ByteVector v) {
349 var low_mask = ByteVector.broadcast(B128, (byte)0x0f);
350
351 var lo = v .and(low_mask);
352 var hi = v.lanewise(VectorOperators.LSHR, 4).and(low_mask);
353
354 var cnt1 = MULA128_LOOKUP.rearrange(lo.toShuffle());
355 var cnt2 = MULA128_LOOKUP.rearrange(hi.toShuffle());
356
357 return cnt1.add(cnt2);
358 }
359
360 @Benchmark
361 public long Mula128() {
362 var acc = LongVector.zero(L128); // IntVector
363 int step = 32; // % B128.length() == 0!
364 int upper = data.length - (data.length % step);
365 for (int i = 0; i < upper; i += step) {
366 var bacc = ByteVector.zero(B128);
367 for (int j = 0; j < step; j += L128.length()) {
368 var v1 = LongVector.fromArray(L128, data, i + j);
369 var v2 = v1.reinterpretAsBytes();
370 var v3 = popcntB128(v2);
371 bacc = bacc.add(v3);
372 }
373 acc = acc.add(sumUnsignedBytes(bacc));
374 }
375 var r = acc.reduceLanes(VectorOperators.ADD) + tail(upper);
376 return r;
377 }
378
379 /* ============================================================================================================== */
380
381 // FIGURE 10. A C function using AVX2 intrinsics implementing Mula's algorithm to compute the four population counts
382 // of the four 64-bit words in a 256-bit vector. The 32 B output vector should be interpreted as four separate
383 // 64-bit counts that need to be summed to obtain the final population count.
384
385 static final ByteVector MULA256_LOOKUP =
386 join(I128, I256, MULA128_LOOKUP.reinterpretAsInts(), MULA128_LOOKUP.reinterpretAsInts()).reinterpretAsBytes();
387
388 ByteVector popcntB256(ByteVector v) {
389 var low_mask = ByteVector.broadcast(B256, (byte)0x0F);
390
391 var lo = v .and(low_mask);
392 var hi = v.lanewise(VectorOperators.LSHR, 4).and(low_mask);
393
394 var cnt1 = MULA256_LOOKUP.rearrange(lo.toShuffle());
395 var cnt2 = MULA256_LOOKUP.rearrange(hi.toShuffle());
396 var cnt = cnt1.add(cnt2);
397
398 return cnt;
399 }
400
401 // Horizontally sum each consecutive 8 differences to produce four unsigned 16-bit integers,
402 // and pack these unsigned 16-bit integers in the low 16 bits of 64-bit elements in dst:
403 // _mm256_sad_epu8(total, _mm256_setzero_si256())
404 LongVector sumUnsignedBytes(ByteVector vb) {
405 return sumUnsignedBytesShapes(vb);
406 // return sumUnsignedBytesShifts(vb);
407 }
408
409 LongVector sumUnsignedBytesShapes(ByteVector vb) {
410 VectorSpecies<Short> shortSpecies = VectorSpecies.of(short.class, vb.shape());
411 VectorSpecies<Integer> intSpecies = VectorSpecies.of(int.class, vb.shape());
412 VectorSpecies<Long> longSpecies = VectorSpecies.of(long.class, vb.shape());
413
414 var low_short_mask = ShortVector.broadcast(shortSpecies, (short) 0xFF);
415 var low_int_mask = IntVector.broadcast(intSpecies, 0xFFFF);
416 var low_long_mask = LongVector.broadcast(longSpecies, 0xFFFFFFFFL);
417
418 var vs = vb.reinterpretAsShorts(); // 16-bit
419 var vs0 = vs.and(low_short_mask);
420 var vs1 = vs.lanewise(VectorOperators.LSHR, 8).and(low_short_mask);
421 var vs01 = vs0.add(vs1);
422
423 var vi = vs01.reinterpretAsInts(); // 32-bit
424 var vi0 = vi.and(low_int_mask);
425 var vi1 = vi.lanewise(VectorOperators.LSHR, 16).and(low_int_mask);
426 var vi01 = vi0.add(vi1);
427
428 var vl = vi01.reinterpretAsLongs(); // 64-bit
429 var vl0 = vl.and(low_long_mask);
430 var vl1 = vl.lanewise(VectorOperators.LSHR, 32).and(low_long_mask);
431 var vl01 = vl0.add(vl1);
432
433 return vl01;
434 }
435
436 LongVector sumUnsignedBytesShifts(ByteVector vb) {
437 VectorSpecies<Long> to = VectorSpecies.of(long.class, vb.shape());
438
439 var low_mask = LongVector.broadcast(to, 0xFF);
440
441 var vl = vb.reinterpretAsLongs();
442
443 var v0 = vl .and(low_mask); // 8-bit
444 var v1 = vl.lanewise(VectorOperators.LSHR, 8).and(low_mask); // 8-bit
445 var v2 = vl.lanewise(VectorOperators.LSHR, 16).and(low_mask); // 8-bit
446 var v3 = vl.lanewise(VectorOperators.LSHR, 24).and(low_mask); // 8-bit
447 var v4 = vl.lanewise(VectorOperators.LSHR, 32).and(low_mask); // 8-bit
448 var v5 = vl.lanewise(VectorOperators.LSHR, 40).and(low_mask); // 8-bit
449 var v6 = vl.lanewise(VectorOperators.LSHR, 48).and(low_mask); // 8-bit
450 var v7 = vl.lanewise(VectorOperators.LSHR, 56).and(low_mask); // 8-bit
451
452 var v01 = v0.add(v1);
453 var v23 = v2.add(v3);
454 var v45 = v4.add(v5);
455 var v67 = v6.add(v7);
456
457 var v03 = v01.add(v23);
458 var v47 = v45.add(v67);
459
460 var sum = v03.add(v47); // 64-bit
461 return sum;
462 }
463
464 @Benchmark
465 public long Mula256() {
466 var acc = LongVector.zero(L256);
467 int step = 32; // % B256.length() == 0!
468 int upper = data.length - (data.length % step);
469 for (int i = 0; i < upper; i += step) {
470 var bacc = ByteVector.zero(B256);
471 for (int j = 0; j < step; j += L256.length()) {
472 var v1 = LongVector.fromArray(L256, data, i + j);
473 var v2 = popcntB256(v1.reinterpretAsBytes());
474 bacc = bacc.add(v2);
475 }
476 acc = acc.add(sumUnsignedBytes(bacc));
477 }
478 return acc.reduceLanes(VectorOperators.ADD) + tail(upper);
479 }
480
481
482 /* ============================================================================================================== */
483
484 // FIGURE 11. A C function using AVX2 intrinsics implementing a bitwise parallel carry-save adder (CSA).
485
486 LongVector csaLow(LongVector a, LongVector b, LongVector c) {
487 var u = a.lanewise(VectorOperators.XOR, b);
488 var r = u.lanewise(VectorOperators.XOR, c);
489 return r;
490 }
491
492 LongVector csaHigh(LongVector a, LongVector b, LongVector c) {
493 var u = a.lanewise(VectorOperators.XOR, b);
494 var ab = a.and(b);
495 var uc = u.and(c);
496 var r = ab.or(uc); // (a & b) | ((a ^ b) & c)
497 return r;
498 }
499
500 LongVector popcntL256(LongVector v) {
501 var vb1 = v.reinterpretAsBytes();
502 var vb2 = popcntB256(vb1);
503 return sumUnsignedBytes(vb2);
504 }
505
506 // FIGURE 12. A C function using AVX2 intrinsics implementing Harley-Seal's algorithm. It assumes, for
507 // simplicity, that the input size in 256-bit vectors is divisible by 16. See Fig. 10 for the count function.
508
509 @Benchmark
510 public long HarleySeal256() {
511 LongVector ones, twos, fours, eights, sixteens, vtotal, twosA, twosB, foursA, foursB, eightsA, eightsB;
512 ones = twos = fours = eights = sixteens = twosA = twosB = foursA = foursB = eightsA = eights = vtotal = LongVector.broadcast(L256, 0);
513
514 var vlen = L256.length();
515 int step = 16 * vlen;
516 int upper = data.length - (data.length % step);
517 for (int i = 0; i < upper; i += step) {
518 // CSA(&twosA, &ones, ones, d[i+0], d[i +1]);
519 var d0 = LongVector.fromArray(L256, data, i + 0 * vlen);
520 var d1 = LongVector.fromArray(L256, data, i + 1 * vlen);
521
522 twosA = csaHigh(ones, d0, d1);
523 ones = csaLow(ones, d0, d1);
524
525 // CSA(&twosB, &ones, ones, d[i+2], d[i+3]);
526 var d2 = LongVector.fromArray(L256, data, i + 2 * vlen);
527 var d3 = LongVector.fromArray(L256, data, i + 3 * vlen);
528 twosB = csaHigh(ones, d2, d3);
529 ones = csaLow(ones, d2, d3);
530
531 // CSA(&foursA, &twos, twos, twosA, twosB);
532 foursA = csaHigh(twos, twosA, twosB);
533 twos = csaLow(twos, twosA, twosB);
534
535 // ====================================
536
537 // CSA(&twosA, &ones, ones, d[i+4], d[i+5]);
538 var d4 = LongVector.fromArray(L256, data, i + 4 * vlen);
539 var d5 = LongVector.fromArray(L256, data, i + 5 * vlen);
540 twosA = csaHigh(ones, d4, d5);
541 ones = csaLow(ones, d4, d5);
542
543 // CSA(&twosB, &ones, ones, d[i+6], d[i+7]);
544 var d6 = LongVector.fromArray(L256, data, i + 6 * vlen);
545 var d7 = LongVector.fromArray(L256, data, i + 7 * vlen);
546 twosB = csaHigh(ones, d6, d7);
547 ones = csaLow(ones, d6, d7);
548
549 // CSA(&foursB, &twos, twos, twosA, twosB);
550 foursB = csaHigh(twos, twosA, twosB);
551 twos = csaLow(twos, twosA, twosB);
552
553 // ====================================
554
555 // CSA(&eightsA, &fours, fours, foursA, foursB);
556 eightsA = csaHigh(fours, foursA, foursB);
557 fours = csaLow(fours, foursA, foursB);
558
559 // ====================================
560
561 // CSA(&twosA, &ones, ones, d[i+8], d[i+9]);
562 var d8 = LongVector.fromArray(L256, data, i + 8 * vlen);
563 var d9 = LongVector.fromArray(L256, data, i + 9 * vlen);
564 twosA = csaHigh(ones, d8, d9);
565 ones = csaLow(ones, d8, d9);
566
567 // CSA(&twosB, &ones, ones, d[i+10],d[i+11]);
568 var d10 = LongVector.fromArray(L256, data, i + 10 * vlen);
569 var d11 = LongVector.fromArray(L256, data, i + 11 * vlen);
570 twosB = csaHigh(ones, d10, d11);
571 ones = csaLow(ones, d10, d11);
572
573 // CSA(&foursA, &twos, twos, twosA, twosB);
574 foursA = csaHigh(twos, twosA, twosB);
575 twos = csaLow(twos, twosA, twosB);
576
577 // ====================================
578
579 // CSA(&twosA, &ones, ones, d[i+12], d[i +13]);
580 var d12 = LongVector.fromArray(L256, data, i + 12 * vlen);
581 var d13 = LongVector.fromArray(L256, data, i + 13 * vlen);
582 twosA = csaHigh(ones, d12, d13);
583 ones = csaLow(ones, d12, d13);
584
585 // CSA(&twosB, &ones, ones, d[i+14], d[i +15]);
586 var d14 = LongVector.fromArray(L256, data, i + 14 * vlen);
587 var d15 = LongVector.fromArray(L256, data, i + 15 * vlen);
588 twosB = csaHigh(ones, d14, d15);
589 ones = csaLow(ones, d14, d15);
590
591 // ====================================
592
593 // CSA(&foursB, &twos, twos, twosA, twosB);
594 foursB = csaHigh(twos, twosA, twosB);
595 twos = csaLow(twos, twosA, twosB);
596
597 // CSA(&eightsB, &fours, fours, foursA, foursB);
598 eightsB = csaHigh(fours, foursA, foursB);
599 fours = csaLow(fours, foursA, foursB);
600
601 // ====================================
602
603 // CSA(&sixteens, &eights, eights, eightsA, eightsB);
604 sixteens = csaHigh(eights, eightsA, eightsB);
605 eights = csaLow(eights, eightsA, eightsB);
606
607 vtotal = vtotal.add(popcntL256(sixteens));
608 }
609
610 vtotal = vtotal.mul(16); // << 4
611 vtotal = vtotal.add(popcntL256(eights).mul(8)); // << 3
612 vtotal = vtotal.add(popcntL256(fours).mul(4)); // << 2
613 vtotal = vtotal.add(popcntL256(twos).mul(2)); // << 1
614 vtotal = vtotal.add(popcntL256(ones)); // << 0
615
616 var total = vtotal.reduceLanes(VectorOperators.ADD);
617
618 return total + tail(upper);
619 }
620
621 /* ============================================================================================================== */
622
623 // ByteVector csaLow512(ByteVector a, ByteVector b, ByteVector c) {
624 // return _mm512_ternarylogic_epi32(c, b, a, 0x96); // vpternlogd
625 // }
626 //
627 // ByteVector csaLow512(ByteVector a, ByteVector b, ByteVector c) {
628 // return _mm512_ternarylogic_epi32(c, b, a, 0xe8); // vpternlogd
629 // }
630 }