1 /*
  2  * Copyright (c) 2018, 2021, Oracle and/or its affiliates. All rights reserved.
  3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  4  *
  5  * This code is free software; you can redistribute it and/or modify it
  6  * under the terms of the GNU General Public License version 2 only, as
  7  * published by the Free Software Foundation.  Oracle designates this
  8  * particular file as subject to the "Classpath" exception as provided
  9  * by Oracle in the LICENSE file that accompanied this code.
 10  *
 11  * This code is distributed in the hope that it will be useful, but WITHOUT
 12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 14  * version 2 for more details (a copy is included in the LICENSE file that
 15  * accompanied this code).
 16  *
 17  * You should have received a copy of the GNU General Public License version
 18  * 2 along with this work; if not, write to the Free Software Foundation,
 19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 20  *
 21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 22  * or visit www.oracle.com if you need additional information or have any
 23  * questions.
 24  */
 25 package org.openjdk.bench.jdk.incubator.vector.operation;
 26 
 27 import jdk.incubator.vector.*;
 28 
 29 import org.openjdk.jmh.annotations.*;
 30 
 31 import java.util.concurrent.TimeUnit;
 32 
 33 /**
 34  * Population count algorithms from "Faster Population Counts Using AVX2 Instructions", 2018 by Mula, Kurz, Lemire
 35  */
 36 @BenchmarkMode(Mode.Throughput)
 37 @Warmup(iterations = 3, time = 1)
 38 @Measurement(iterations = 5, time = 1)
 39 @OutputTimeUnit(TimeUnit.MILLISECONDS)
 40 @State(Scope.Benchmark)
 41 @Fork(value = 1, jvmArgsPrepend = {"--add-modules=jdk.incubator.vector"})
 42 public class PopulationCount extends AbstractVectorBenchmark {
 43     @Param({"64", "1024", "65536"})
 44     int size;
 45 
 46     private long[] data;
 47 
 48     @Setup
 49     public void init() {
 50         data = fillLong(size, i -> RANDOM.nextLong());
 51 //        data = fillLong(size, i -> 0L);
 52 //        data = fillLong(size, i -> -1L);
 53 
 54         checkConsistency();
 55     }
 56 
 57     @TearDown
 58     public void tearDown() {
 59         checkConsistency();
 60     }
 61 
 62     void checkConsistency() {
 63         long popCount = longBitCount();
 64         assert popCount == treeOfAdders();
 65         assert popCount == WilkesWheelerGill();
 66         assert popCount == Wegner();
 67         assert popCount == Lauradoux();
 68         assert popCount == HarleySeal();
 69         assert popCount == Mula128();
 70         assert popCount == Mula256();
 71         assert popCount == HarleySeal256();
 72     }
 73 
 74     long tail(int upper) {
 75         long acc = 0;
 76         for (int i = upper; i < data.length; i++) {
 77             acc += Long.bitCount(data[i]);
 78         }
 79         return acc;
 80     }
 81 
 82     @Benchmark
 83     public long longBitCount() {
 84         long acc = 0;
 85         for (int i = 0; i < data.length; i++) {
 86             acc += Long.bitCount(data[i]);
 87         }
 88         return acc;
 89     }
 90 
 91     /* ============================================================================================================== */
 92 
 93     // FIGURE 4. The Wegner function in C
 94 
 95     long popcntWegner(long x) {
 96         int v = 0;
 97         while (x != 0) {
 98             x &= x - 1;
 99             v++;
100         }
101         return v;
102     }
103 
104     @Benchmark
105     public long Wegner() {
106         long acc = 0;
107         for (int i = 0; i < data.length; i++) {
108             acc += popcntWegner(data[i]);
109         }
110         return acc;
111     }
112 
113     /* ============================================================================================================== */
114 
115     // FIGURE 2. A naive tree-of-adders function in C
116 
117     static long popcntTree(long x) {
118         long c1  = 0x5555555555555555L;
119         long c2  = 0x3333333333333333L;
120         long c4  = 0x0F0F0F0F0F0F0F0FL;
121         long c8  = 0x00FF00FF00FF00FFL;
122         long c16 = 0x0000FFFF0000FFFFL;
123         long c32 = 0x00000000FFFFFFFFL;
124 
125         x = (x & c1)  + ((x >>> 1)  & c1);
126         x = (x & c2)  + ((x >>> 2)  & c2);
127         x = (x & c4)  + ((x >>> 4)  & c4);
128         x = (x & c8)  + ((x >>> 8)  & c8);
129         x = (x & c16) + ((x >>> 16) & c16);
130         x = (x & c32) + ((x >>> 32) & c32);
131         return x;
132     }
133 
134     @Benchmark
135     public long treeOfAdders() {
136         long acc = 0;
137         for (int i = 0; i < data.length; i++) {
138             acc += popcntTree(data[i]);
139         }
140         return acc;
141     }
142 
143     /* ============================================================================================================== */
144 
145     // FIGURE 3. The Wilkes-Wheeler-Gill function in C
146 
147     static long popcntWWG(long x) {
148         long c1 = 0x5555555555555555L;
149         long c2 = 0x3333333333333333L;
150         long c4 = 0x0F0F0F0F0F0F0F0FL;
151 
152         x -= (x >>> 1) & c1;
153         x = (( x >>> 2) & c2) + (x & c2) ;
154         x = ( x + (x >>> 4) ) & c4;
155         x *= 0x0101010101010101L;
156         x = x >>> 56;
157         return x;
158     }
159 
160     @Benchmark
161     public long WilkesWheelerGill() {
162         long acc = 0;
163         for (int i = 0; i < data.length; i++) {
164             acc += popcntWWG(data[i]);
165         }
166         return acc;
167     }
168 
169     /* ============================================================================================================== */
170 
171     // FIGURE 5. The Lauradoux population count in C for sets of 12 words.
172 
173     static long parallelPopcnt(long count1, long count2, long count3) {
174         long m1  = 0x5555555555555555L;
175         long m2  = 0x3333333333333333L;
176         long m4  = 0x0F0F0F0F0F0F0F0FL;
177 
178         long half1 = (count3      ) & m1;
179         long half2 = (count3 >>> 1) & m1;
180 
181         count1 -= (count1 >>> 1) & m1;
182         count2 -= (count2 >>> 1) & m1;
183         count1 += half1;
184         count2 += half2;
185         count1  = (count1 & m2) + (( count1 >>> 2) & m2);
186         count1 += (count2 & m2) + (( count2 >>> 2) & m2);
187         return (count1 & m4) + (( count1 >>> 4) & m4);
188     }
189 
190     static long reduce(long acc) {
191         long m8  = 0x00FF00FF00FF00FFL;
192         long m16 = 0x0000FFFF0000FFFFL;
193         long m32 = 0x00000000FFFFFFFFL;
194 
195         acc = (acc & m8) + (( acc >>> 8) & m8);
196         acc = (acc + (acc >>> 16) ) & m16;
197         acc = (acc & m32) + (acc >>> 32);
198         return acc;
199     }
200 
201     static long popcntLauradoux(long[] xs, int off) {
202         long acc = 0;
203         for (int j = off; j < off+12; j += 3) {
204             acc += parallelPopcnt(xs[j+0], xs[j+1], xs[j+2]);
205         }
206         return reduce(acc);
207     }
208 
209     @Benchmark
210     public long Lauradoux() {
211         long acc = 0;
212         int upper = data.length - (data.length % 12);
213         for (int i = 0; i < upper; i += 12) {
214             acc += popcntLauradoux(data, i);
215         }
216         return acc + tail(upper);
217     }
218 
219     /* ============================================================================================================== */
220 
221     // FIGURE 6. A C function implementing a bitwise parallel carry-save adder (CSA). Given three input words a, b, c, it
222     // generates two new words h, l in which each bit represents the high and low bits in the bitwise sum of the bits from a,
223     // b, and c.
224 
225     static long csaLow(long a, long b, long c) {
226         long u = a ^ b;
227         long lo = u ^ c;
228         return lo;
229     }
230 
231     static long csaHigh(long a, long b, long c) {
232         long u = a ^ b;
233         long hi = (a & b) | (u & c) ;
234         return hi;
235     }
236 
237     // FIGURE 8. A C function implementing the Harley-Seal
238     // population count over an array of 64-bit words. The count
239     // function could be the Wilkes-Wheeler-Gill function.
240     @Benchmark
241     public long HarleySeal() {
242         long total = 0, ones = 0, twos = 0, fours = 0, eights = 0, sixteens = 0;
243         long twosA   = 0, twosB   = 0;
244         long foursA  = 0, foursB  = 0;
245         long eightsA = 0, eightsB = 0;
246 
247         int step = 16;
248         int upper = data.length - (data.length % step);
249         for (int i = 0; i < upper; i += step) {
250             // CSA(&twosA, &ones, ones, d[i+0], d[i +1]);
251             twosA = csaHigh(ones, data[i+0], data[i+1]);
252             ones  = csaLow(ones, data[i+0], data[i+1]);
253 
254             // CSA(&twosB, &ones, ones, d[i+2], d[i+3]);
255             twosB = csaHigh(ones, data[i+2], data[i+3]);
256             ones  = csaLow(ones, data[i+2], data[i+3]);
257 
258             // CSA(&foursA, &twos, twos, twosA, twosB);
259             foursA = csaHigh(twos, twosA, twosB);
260             twos   = csaLow(twos, twosA, twosB);
261 
262             // ====================================
263 
264             // CSA(&twosA, &ones, ones, d[i+4], d[i+5]);
265             twosA = csaHigh(ones, data[i+4], data[i+5]);
266             ones  = csaLow(ones, data[i+4], data[i+5]);
267 
268             // CSA(&twosB, &ones, ones, d[i+6], d[i+7]);
269             twosB = csaHigh(ones, data[i+6], data[i+7]);
270             ones  = csaLow(ones, data[i+6], data[i+7]);
271 
272             // CSA(&foursB, &twos, twos, twosA, twosB);
273             foursB = csaHigh(twos, twosA, twosB);
274             twos   = csaLow(twos, twosA, twosB);
275 
276             // ====================================
277 
278             // CSA(&eightsA, &fours, fours, foursA, foursB);
279             eightsA = csaHigh(fours, foursA, foursB);
280             fours   = csaLow(fours, foursA, foursB);
281 
282             // ====================================
283 
284             // CSA(&twosA, &ones, ones, d[i+8], d[i+9]);
285             twosA = csaHigh(ones, data[i+8], data[i+9]);
286             ones  = csaLow(ones, data[i+8], data[i+9]);
287 
288             // CSA(&twosB, &ones, ones, d[i+10],d[i+11]);
289             twosB = csaHigh(ones, data[i+10], data[i+11]);
290             ones  = csaLow(ones, data[i+10], data[i+11]);
291 
292             // CSA(&foursA, &twos, twos, twosA, twosB);
293             foursA = csaHigh(twos, twosA, twosB);
294             twos   = csaLow(twos, twosA, twosB);
295 
296             // ====================================
297 
298             // CSA(&twosA, &ones, ones, d[i+12], d[i +13]);
299             twosA = csaHigh(ones, data[i+12], data[i+13]);
300             ones  = csaLow(ones, data[i+12], data[i+13]);
301 
302             // CSA(&twosB, &ones, ones, d[i+14], d[i +15]);
303             twosB = csaHigh(ones, data[i+14], data[i+15]);
304             ones  = csaLow(ones, data[i+14], data[i+15]);
305 
306             // ====================================
307 
308             // CSA(&foursB, &twos, twos, twosA, twosB);
309             foursB = csaHigh(twos, twosA, twosB);
310             twos   = csaLow(twos, twosA, twosB);
311 
312             // CSA(&eightsB, &fours, fours, foursA, foursB);
313             eightsB = csaHigh(fours, foursA, foursB);
314             fours   = csaLow(fours, foursA, foursB);
315 
316             // ====================================
317 
318             // CSA(&sixteens, &eights, eights, eightsA, eightsB);
319             sixteens = csaHigh(eights, eightsA, eightsB);
320             eights   = csaLow(eights, eightsA, eightsB);
321 
322             total += Long.bitCount(sixteens);
323         }
324         total = 16 * total
325                 + 8 * Long.bitCount(eights)
326                 + 4 * Long.bitCount(fours)
327                 + 2 * Long.bitCount(twos)
328                 + 1 * Long.bitCount(ones);
329 
330         return total + tail(upper);
331     }
332 
333     /* ============================================================================================================== */
334 
335     // FIGURE 9. A C function using SSE intrinsics implementing Mula's algorithm to compute sixteen population counts,
336     // corresponding to sixteen input bytes.
337 
338     static final ByteVector MULA128_LOOKUP = IntVector.fromArray(I128,
339             new int[]{
340                     0x02_01_01_00, // 0, 1, 1, 2,
341                     0x03_02_02_01, // 1, 2, 2, 3,
342                     0x03_02_02_01, // 1, 2, 2, 3,
343                     0x04_03_03_02  // 2, 3, 3, 4
344             },
345             0
346             ).reinterpretAsBytes();
347 
348     ByteVector popcntB128(ByteVector v) {
349         var low_mask = ByteVector.broadcast(B128, (byte)0x0f);
350 
351         var lo = v          .and(low_mask);
352         var hi = v.lanewise(VectorOperators.LSHR, 4).and(low_mask);
353 
354         var cnt1 = MULA128_LOOKUP.rearrange(lo.toShuffle());
355         var cnt2 = MULA128_LOOKUP.rearrange(hi.toShuffle());
356 
357         return cnt1.add(cnt2);
358     }
359 
360     @Benchmark
361     public long Mula128() {
362         var acc = LongVector.zero(L128); // IntVector
363         int step = 32; // % B128.length() == 0!
364         int upper = data.length - (data.length % step);
365         for (int i = 0; i < upper; i += step) {
366             var bacc = ByteVector.zero(B128);
367             for (int j = 0; j < step; j += L128.length()) {
368                 var v1 = LongVector.fromArray(L128, data, i + j);
369                 var v2 = v1.reinterpretAsBytes();
370                 var v3 = popcntB128(v2);
371                 bacc = bacc.add(v3);
372             }
373             acc = acc.add(sumUnsignedBytes(bacc));
374         }
375         var r = acc.reduceLanes(VectorOperators.ADD) + tail(upper);
376         return r;
377     }
378 
379     /* ============================================================================================================== */
380 
381     // FIGURE 10. A C function using AVX2 intrinsics implementing Mula's algorithm to compute the four population counts
382     // of the four 64-bit words in a 256-bit vector. The 32 B output vector should be interpreted as four separate
383     // 64-bit counts that need to be summed to obtain the final population count.
384 
385     static final ByteVector MULA256_LOOKUP = 
386             join(I128, I256, MULA128_LOOKUP.reinterpretAsInts(), MULA128_LOOKUP.reinterpretAsInts()).reinterpretAsBytes();
387 
388     ByteVector popcntB256(ByteVector v) {
389         var low_mask = ByteVector.broadcast(B256, (byte)0x0F);
390 
391         var lo = v          .and(low_mask);
392         var hi = v.lanewise(VectorOperators.LSHR, 4).and(low_mask);
393 
394         var cnt1 = MULA256_LOOKUP.rearrange(lo.toShuffle());
395         var cnt2 = MULA256_LOOKUP.rearrange(hi.toShuffle());
396         var cnt = cnt1.add(cnt2);
397 
398         return cnt;
399     }
400 
401     // Horizontally sum each consecutive 8 differences to produce four unsigned 16-bit integers,
402     // and pack these unsigned 16-bit integers in the low 16 bits of 64-bit elements in dst:
403     //   _mm256_sad_epu8(total, _mm256_setzero_si256())
404     LongVector sumUnsignedBytes(ByteVector vb) {
405         return sumUnsignedBytesShapes(vb);
406 //        return sumUnsignedBytesShifts(vb);
407     }
408 
409     LongVector sumUnsignedBytesShapes(ByteVector vb) {
410         VectorSpecies<Short> shortSpecies = VectorSpecies.of(short.class, vb.shape());
411         VectorSpecies<Integer> intSpecies = VectorSpecies.of(int.class, vb.shape());
412         VectorSpecies<Long> longSpecies = VectorSpecies.of(long.class, vb.shape());
413 
414         var low_short_mask = ShortVector.broadcast(shortSpecies, (short) 0xFF);
415         var low_int_mask = IntVector.broadcast(intSpecies, 0xFFFF);
416         var low_long_mask = LongVector.broadcast(longSpecies, 0xFFFFFFFFL);
417 
418         var vs = vb.reinterpretAsShorts(); // 16-bit
419         var vs0 = vs.and(low_short_mask);
420         var vs1 = vs.lanewise(VectorOperators.LSHR, 8).and(low_short_mask);
421         var vs01 = vs0.add(vs1);
422 
423         var vi = vs01.reinterpretAsInts(); // 32-bit
424         var vi0 = vi.and(low_int_mask);
425         var vi1 = vi.lanewise(VectorOperators.LSHR, 16).and(low_int_mask);
426         var vi01 = vi0.add(vi1);
427 
428         var vl = vi01.reinterpretAsLongs(); // 64-bit
429         var vl0 = vl.and(low_long_mask);
430         var vl1 = vl.lanewise(VectorOperators.LSHR, 32).and(low_long_mask);
431         var vl01 = vl0.add(vl1);
432 
433         return vl01;
434     }
435 
436     LongVector sumUnsignedBytesShifts(ByteVector vb) {
437         VectorSpecies<Long> to = VectorSpecies.of(long.class, vb.shape());
438 
439         var low_mask = LongVector.broadcast(to, 0xFF);
440 
441         var vl = vb.reinterpretAsLongs();
442 
443         var v0 = vl           .and(low_mask); // 8-bit
444         var v1 = vl.lanewise(VectorOperators.LSHR, 8).and(low_mask);  // 8-bit
445         var v2 = vl.lanewise(VectorOperators.LSHR, 16).and(low_mask); // 8-bit
446         var v3 = vl.lanewise(VectorOperators.LSHR, 24).and(low_mask); // 8-bit
447         var v4 = vl.lanewise(VectorOperators.LSHR, 32).and(low_mask); // 8-bit
448         var v5 = vl.lanewise(VectorOperators.LSHR, 40).and(low_mask); // 8-bit
449         var v6 = vl.lanewise(VectorOperators.LSHR, 48).and(low_mask); // 8-bit
450         var v7 = vl.lanewise(VectorOperators.LSHR, 56).and(low_mask); // 8-bit
451 
452         var v01 = v0.add(v1);
453         var v23 = v2.add(v3);
454         var v45 = v4.add(v5);
455         var v67 = v6.add(v7);
456 
457         var v03 = v01.add(v23);
458         var v47 = v45.add(v67);
459 
460         var sum = v03.add(v47); // 64-bit
461         return sum;
462     }
463 
464     @Benchmark
465     public long Mula256() {
466         var acc = LongVector.zero(L256);
467         int step = 32; // % B256.length() == 0!
468         int upper = data.length - (data.length % step);
469         for (int i = 0; i < upper; i += step) {
470             var bacc = ByteVector.zero(B256);
471             for (int j = 0; j < step; j += L256.length()) {
472                 var v1 = LongVector.fromArray(L256, data, i + j);
473                 var v2 = popcntB256(v1.reinterpretAsBytes());
474                 bacc = bacc.add(v2);
475             }
476             acc = acc.add(sumUnsignedBytes(bacc));
477         }
478         return acc.reduceLanes(VectorOperators.ADD) + tail(upper);
479     }
480 
481 
482     /* ============================================================================================================== */
483 
484     // FIGURE 11. A C function using AVX2 intrinsics implementing a bitwise parallel carry-save adder (CSA).
485 
486     LongVector csaLow(LongVector a, LongVector b, LongVector c) {
487         var u = a.lanewise(VectorOperators.XOR, b);
488         var r = u.lanewise(VectorOperators.XOR, c);
489         return r;
490     }
491 
492     LongVector csaHigh(LongVector a, LongVector b, LongVector c) {
493         var u  = a.lanewise(VectorOperators.XOR, b);
494         var ab = a.and(b);
495         var uc = u.and(c);
496         var r  = ab.or(uc); // (a & b) | ((a ^ b) & c)
497         return r;
498     }
499 
500     LongVector popcntL256(LongVector v) {
501         var vb1 = v.reinterpretAsBytes();
502         var vb2 = popcntB256(vb1);
503         return sumUnsignedBytes(vb2);
504     }
505 
506     // FIGURE 12. A C function using AVX2 intrinsics implementing Harley-Seal's algorithm. It assumes, for
507     // simplicity, that the input size in 256-bit vectors is divisible by 16. See Fig. 10 for the count function.
508 
509     @Benchmark
510     public long HarleySeal256() {
511         LongVector ones, twos, fours, eights, sixteens, vtotal, twosA, twosB, foursA, foursB, eightsA, eightsB;
512         ones = twos = fours = eights = sixteens = twosA = twosB = foursA = foursB = eightsA = eights = vtotal = LongVector.broadcast(L256, 0);
513 
514         var vlen = L256.length();
515         int step = 16 * vlen;
516         int upper = data.length - (data.length % step);
517         for (int i = 0; i < upper; i += step) {
518             // CSA(&twosA, &ones, ones, d[i+0], d[i +1]);
519             var d0 = LongVector.fromArray(L256, data, i + 0 * vlen);
520             var d1 = LongVector.fromArray(L256, data, i + 1 * vlen);
521 
522             twosA = csaHigh(ones, d0, d1);
523             ones  = csaLow(ones, d0, d1);
524 
525             // CSA(&twosB, &ones, ones, d[i+2], d[i+3]);
526             var d2 = LongVector.fromArray(L256, data, i + 2 * vlen);
527             var d3 = LongVector.fromArray(L256, data, i + 3 * vlen);
528             twosB = csaHigh(ones, d2, d3);
529             ones  = csaLow(ones, d2, d3);
530 
531             // CSA(&foursA, &twos, twos, twosA, twosB);
532             foursA = csaHigh(twos, twosA, twosB);
533             twos   = csaLow(twos, twosA, twosB);
534 
535             // ====================================
536 
537             // CSA(&twosA, &ones, ones, d[i+4], d[i+5]);
538             var d4 = LongVector.fromArray(L256, data, i + 4 * vlen);
539             var d5 = LongVector.fromArray(L256, data, i + 5 * vlen);
540             twosA = csaHigh(ones, d4, d5);
541             ones  = csaLow(ones, d4, d5);
542 
543             // CSA(&twosB, &ones, ones, d[i+6], d[i+7]);
544             var d6 = LongVector.fromArray(L256, data, i + 6 * vlen);
545             var d7 = LongVector.fromArray(L256, data, i + 7 * vlen);
546             twosB = csaHigh(ones, d6, d7);
547             ones  = csaLow(ones, d6, d7);
548 
549             // CSA(&foursB, &twos, twos, twosA, twosB);
550             foursB = csaHigh(twos, twosA, twosB);
551             twos   = csaLow(twos, twosA, twosB);
552 
553             // ====================================
554 
555             // CSA(&eightsA, &fours, fours, foursA, foursB);
556             eightsA = csaHigh(fours, foursA, foursB);
557             fours   = csaLow(fours, foursA, foursB);
558 
559             // ====================================
560 
561             // CSA(&twosA, &ones, ones, d[i+8], d[i+9]);
562             var d8 = LongVector.fromArray(L256, data, i + 8 * vlen);
563             var d9 = LongVector.fromArray(L256, data, i + 9 * vlen);
564             twosA = csaHigh(ones, d8, d9);
565             ones  = csaLow(ones, d8, d9);
566 
567             // CSA(&twosB, &ones, ones, d[i+10],d[i+11]);
568             var d10 = LongVector.fromArray(L256, data, i + 10 * vlen);
569             var d11 = LongVector.fromArray(L256, data, i + 11 * vlen);
570             twosB = csaHigh(ones, d10, d11);
571             ones  = csaLow(ones, d10, d11);
572 
573             // CSA(&foursA, &twos, twos, twosA, twosB);
574             foursA = csaHigh(twos, twosA, twosB);
575             twos   = csaLow(twos, twosA, twosB);
576 
577             // ====================================
578 
579             // CSA(&twosA, &ones, ones, d[i+12], d[i +13]);
580             var d12 = LongVector.fromArray(L256, data, i + 12 * vlen);
581             var d13 = LongVector.fromArray(L256, data, i + 13 * vlen);
582             twosA = csaHigh(ones, d12, d13);
583             ones  = csaLow(ones, d12, d13);
584 
585             // CSA(&twosB, &ones, ones, d[i+14], d[i +15]);
586             var d14 = LongVector.fromArray(L256, data, i + 14 * vlen);
587             var d15 = LongVector.fromArray(L256, data, i + 15 * vlen);
588             twosB = csaHigh(ones, d14, d15);
589             ones  = csaLow(ones, d14, d15);
590 
591             // ====================================
592 
593             // CSA(&foursB, &twos, twos, twosA, twosB);
594             foursB = csaHigh(twos, twosA, twosB);
595             twos   = csaLow(twos, twosA, twosB);
596 
597             // CSA(&eightsB, &fours, fours, foursA, foursB);
598             eightsB = csaHigh(fours, foursA, foursB);
599             fours   = csaLow(fours, foursA, foursB);
600 
601             // ====================================
602 
603             // CSA(&sixteens, &eights, eights, eightsA, eightsB);
604             sixteens = csaHigh(eights, eightsA, eightsB);
605             eights   = csaLow(eights, eightsA, eightsB);
606 
607             vtotal = vtotal.add(popcntL256(sixteens));
608         }
609 
610         vtotal = vtotal.mul(16);                       // << 4
611         vtotal = vtotal.add(popcntL256(eights).mul(8)); // << 3
612         vtotal = vtotal.add(popcntL256(fours).mul(4));  // << 2
613         vtotal = vtotal.add(popcntL256(twos).mul(2));   // << 1
614         vtotal = vtotal.add(popcntL256(ones));          // << 0
615 
616         var total = vtotal.reduceLanes(VectorOperators.ADD);
617 
618         return total + tail(upper);
619     }
620 
621     /* ============================================================================================================== */
622 
623 //    ByteVector csaLow512(ByteVector a, ByteVector b, ByteVector c) {
624 //        return _mm512_ternarylogic_epi32(c, b, a, 0x96); // vpternlogd
625 //    }
626 //
627 //    ByteVector csaLow512(ByteVector a, ByteVector b, ByteVector c) {
628 //        return _mm512_ternarylogic_epi32(c, b, a, 0xe8); // vpternlogd
629 //    }
630 }