1 /* 2 * Copyright (c) 2018, 2021, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 24 */ 25 package org.openjdk.bench.jdk.incubator.vector.operation; 26 27 import jdk.incubator.vector.*; 28 29 import org.openjdk.jmh.annotations.*; 30 31 import java.util.concurrent.TimeUnit; 32 33 /** 34 * Population count algorithms from "Faster Population Counts Using AVX2 Instructions", 2018 by Mula, Kurz, Lemire 35 */ 36 @BenchmarkMode(Mode.Throughput) 37 @Warmup(iterations = 3, time = 1) 38 @Measurement(iterations = 5, time = 1) 39 @OutputTimeUnit(TimeUnit.MILLISECONDS) 40 @State(Scope.Benchmark) 41 @Fork(value = 1, jvmArgsPrepend = {"--add-modules=jdk.incubator.vector"}) 42 public class PopulationCount extends AbstractVectorBenchmark { 43 @Param({"64", "1024", "65536"}) 44 int size; 45 46 private long[] data; 47 48 @Setup 49 public void init() { 50 data = fillLong(size, i -> RANDOM.nextLong()); 51 // data = fillLong(size, i -> 0L); 52 // data = fillLong(size, i -> -1L); 53 54 checkConsistency(); 55 } 56 57 @TearDown 58 public void tearDown() { 59 checkConsistency(); 60 } 61 62 void checkConsistency() { 63 long popCount = longBitCount(); 64 assert popCount == treeOfAdders(); 65 assert popCount == WilkesWheelerGill(); 66 assert popCount == Wegner(); 67 assert popCount == Lauradoux(); 68 assert popCount == HarleySeal(); 69 assert popCount == Mula128(); 70 assert popCount == Mula256(); 71 assert popCount == HarleySeal256(); 72 } 73 74 long tail(int upper) { 75 long acc = 0; 76 for (int i = upper; i < data.length; i++) { 77 acc += Long.bitCount(data[i]); 78 } 79 return acc; 80 } 81 82 @Benchmark 83 public long longBitCount() { 84 long acc = 0; 85 for (int i = 0; i < data.length; i++) { 86 acc += Long.bitCount(data[i]); 87 } 88 return acc; 89 } 90 91 /* ============================================================================================================== */ 92 93 // FIGURE 4. The Wegner function in C 94 95 long popcntWegner(long x) { 96 int v = 0; 97 while (x != 0) { 98 x &= x - 1; 99 v++; 100 } 101 return v; 102 } 103 104 @Benchmark 105 public long Wegner() { 106 long acc = 0; 107 for (int i = 0; i < data.length; i++) { 108 acc += popcntWegner(data[i]); 109 } 110 return acc; 111 } 112 113 /* ============================================================================================================== */ 114 115 // FIGURE 2. A naive tree-of-adders function in C 116 117 static long popcntTree(long x) { 118 long c1 = 0x5555555555555555L; 119 long c2 = 0x3333333333333333L; 120 long c4 = 0x0F0F0F0F0F0F0F0FL; 121 long c8 = 0x00FF00FF00FF00FFL; 122 long c16 = 0x0000FFFF0000FFFFL; 123 long c32 = 0x00000000FFFFFFFFL; 124 125 x = (x & c1) + ((x >>> 1) & c1); 126 x = (x & c2) + ((x >>> 2) & c2); 127 x = (x & c4) + ((x >>> 4) & c4); 128 x = (x & c8) + ((x >>> 8) & c8); 129 x = (x & c16) + ((x >>> 16) & c16); 130 x = (x & c32) + ((x >>> 32) & c32); 131 return x; 132 } 133 134 @Benchmark 135 public long treeOfAdders() { 136 long acc = 0; 137 for (int i = 0; i < data.length; i++) { 138 acc += popcntTree(data[i]); 139 } 140 return acc; 141 } 142 143 /* ============================================================================================================== */ 144 145 // FIGURE 3. The Wilkes-Wheeler-Gill function in C 146 147 static long popcntWWG(long x) { 148 long c1 = 0x5555555555555555L; 149 long c2 = 0x3333333333333333L; 150 long c4 = 0x0F0F0F0F0F0F0F0FL; 151 152 x -= (x >>> 1) & c1; 153 x = (( x >>> 2) & c2) + (x & c2) ; 154 x = ( x + (x >>> 4) ) & c4; 155 x *= 0x0101010101010101L; 156 x = x >>> 56; 157 return x; 158 } 159 160 @Benchmark 161 public long WilkesWheelerGill() { 162 long acc = 0; 163 for (int i = 0; i < data.length; i++) { 164 acc += popcntWWG(data[i]); 165 } 166 return acc; 167 } 168 169 /* ============================================================================================================== */ 170 171 // FIGURE 5. The Lauradoux population count in C for sets of 12 words. 172 173 static long parallelPopcnt(long count1, long count2, long count3) { 174 long m1 = 0x5555555555555555L; 175 long m2 = 0x3333333333333333L; 176 long m4 = 0x0F0F0F0F0F0F0F0FL; 177 178 long half1 = (count3 ) & m1; 179 long half2 = (count3 >>> 1) & m1; 180 181 count1 -= (count1 >>> 1) & m1; 182 count2 -= (count2 >>> 1) & m1; 183 count1 += half1; 184 count2 += half2; 185 count1 = (count1 & m2) + (( count1 >>> 2) & m2); 186 count1 += (count2 & m2) + (( count2 >>> 2) & m2); 187 return (count1 & m4) + (( count1 >>> 4) & m4); 188 } 189 190 static long reduce(long acc) { 191 long m8 = 0x00FF00FF00FF00FFL; 192 long m16 = 0x0000FFFF0000FFFFL; 193 long m32 = 0x00000000FFFFFFFFL; 194 195 acc = (acc & m8) + (( acc >>> 8) & m8); 196 acc = (acc + (acc >>> 16) ) & m16; 197 acc = (acc & m32) + (acc >>> 32); 198 return acc; 199 } 200 201 static long popcntLauradoux(long[] xs, int off) { 202 long acc = 0; 203 for (int j = off; j < off+12; j += 3) { 204 acc += parallelPopcnt(xs[j+0], xs[j+1], xs[j+2]); 205 } 206 return reduce(acc); 207 } 208 209 @Benchmark 210 public long Lauradoux() { 211 long acc = 0; 212 int upper = data.length - (data.length % 12); 213 for (int i = 0; i < upper; i += 12) { 214 acc += popcntLauradoux(data, i); 215 } 216 return acc + tail(upper); 217 } 218 219 /* ============================================================================================================== */ 220 221 // FIGURE 6. A C function implementing a bitwise parallel carry-save adder (CSA). Given three input words a, b, c, it 222 // generates two new words h, l in which each bit represents the high and low bits in the bitwise sum of the bits from a, 223 // b, and c. 224 225 static long csaLow(long a, long b, long c) { 226 long u = a ^ b; 227 long lo = u ^ c; 228 return lo; 229 } 230 231 static long csaHigh(long a, long b, long c) { 232 long u = a ^ b; 233 long hi = (a & b) | (u & c) ; 234 return hi; 235 } 236 237 // FIGURE 8. A C function implementing the Harley-Seal 238 // population count over an array of 64-bit words. The count 239 // function could be the Wilkes-Wheeler-Gill function. 240 @Benchmark 241 public long HarleySeal() { 242 long total = 0, ones = 0, twos = 0, fours = 0, eights = 0, sixteens = 0; 243 long twosA = 0, twosB = 0; 244 long foursA = 0, foursB = 0; 245 long eightsA = 0, eightsB = 0; 246 247 int step = 16; 248 int upper = data.length - (data.length % step); 249 for (int i = 0; i < upper; i += step) { 250 // CSA(&twosA, &ones, ones, d[i+0], d[i +1]); 251 twosA = csaHigh(ones, data[i+0], data[i+1]); 252 ones = csaLow(ones, data[i+0], data[i+1]); 253 254 // CSA(&twosB, &ones, ones, d[i+2], d[i+3]); 255 twosB = csaHigh(ones, data[i+2], data[i+3]); 256 ones = csaLow(ones, data[i+2], data[i+3]); 257 258 // CSA(&foursA, &twos, twos, twosA, twosB); 259 foursA = csaHigh(twos, twosA, twosB); 260 twos = csaLow(twos, twosA, twosB); 261 262 // ==================================== 263 264 // CSA(&twosA, &ones, ones, d[i+4], d[i+5]); 265 twosA = csaHigh(ones, data[i+4], data[i+5]); 266 ones = csaLow(ones, data[i+4], data[i+5]); 267 268 // CSA(&twosB, &ones, ones, d[i+6], d[i+7]); 269 twosB = csaHigh(ones, data[i+6], data[i+7]); 270 ones = csaLow(ones, data[i+6], data[i+7]); 271 272 // CSA(&foursB, &twos, twos, twosA, twosB); 273 foursB = csaHigh(twos, twosA, twosB); 274 twos = csaLow(twos, twosA, twosB); 275 276 // ==================================== 277 278 // CSA(&eightsA, &fours, fours, foursA, foursB); 279 eightsA = csaHigh(fours, foursA, foursB); 280 fours = csaLow(fours, foursA, foursB); 281 282 // ==================================== 283 284 // CSA(&twosA, &ones, ones, d[i+8], d[i+9]); 285 twosA = csaHigh(ones, data[i+8], data[i+9]); 286 ones = csaLow(ones, data[i+8], data[i+9]); 287 288 // CSA(&twosB, &ones, ones, d[i+10],d[i+11]); 289 twosB = csaHigh(ones, data[i+10], data[i+11]); 290 ones = csaLow(ones, data[i+10], data[i+11]); 291 292 // CSA(&foursA, &twos, twos, twosA, twosB); 293 foursA = csaHigh(twos, twosA, twosB); 294 twos = csaLow(twos, twosA, twosB); 295 296 // ==================================== 297 298 // CSA(&twosA, &ones, ones, d[i+12], d[i +13]); 299 twosA = csaHigh(ones, data[i+12], data[i+13]); 300 ones = csaLow(ones, data[i+12], data[i+13]); 301 302 // CSA(&twosB, &ones, ones, d[i+14], d[i +15]); 303 twosB = csaHigh(ones, data[i+14], data[i+15]); 304 ones = csaLow(ones, data[i+14], data[i+15]); 305 306 // ==================================== 307 308 // CSA(&foursB, &twos, twos, twosA, twosB); 309 foursB = csaHigh(twos, twosA, twosB); 310 twos = csaLow(twos, twosA, twosB); 311 312 // CSA(&eightsB, &fours, fours, foursA, foursB); 313 eightsB = csaHigh(fours, foursA, foursB); 314 fours = csaLow(fours, foursA, foursB); 315 316 // ==================================== 317 318 // CSA(&sixteens, &eights, eights, eightsA, eightsB); 319 sixteens = csaHigh(eights, eightsA, eightsB); 320 eights = csaLow(eights, eightsA, eightsB); 321 322 total += Long.bitCount(sixteens); 323 } 324 total = 16 * total 325 + 8 * Long.bitCount(eights) 326 + 4 * Long.bitCount(fours) 327 + 2 * Long.bitCount(twos) 328 + 1 * Long.bitCount(ones); 329 330 return total + tail(upper); 331 } 332 333 /* ============================================================================================================== */ 334 335 // FIGURE 9. A C function using SSE intrinsics implementing Mula's algorithm to compute sixteen population counts, 336 // corresponding to sixteen input bytes. 337 338 static final ByteVector MULA128_LOOKUP = IntVector.fromArray(I128, 339 new int[]{ 340 0x02_01_01_00, // 0, 1, 1, 2, 341 0x03_02_02_01, // 1, 2, 2, 3, 342 0x03_02_02_01, // 1, 2, 2, 3, 343 0x04_03_03_02 // 2, 3, 3, 4 344 }, 345 0 346 ).reinterpretAsBytes(); 347 348 ByteVector popcntB128(ByteVector v) { 349 var low_mask = ByteVector.broadcast(B128, (byte)0x0f); 350 351 var lo = v .and(low_mask); 352 var hi = v.lanewise(VectorOperators.LSHR, 4).and(low_mask); 353 354 var cnt1 = MULA128_LOOKUP.rearrange(lo.toShuffle()); 355 var cnt2 = MULA128_LOOKUP.rearrange(hi.toShuffle()); 356 357 return cnt1.add(cnt2); 358 } 359 360 @Benchmark 361 public long Mula128() { 362 var acc = LongVector.zero(L128); // IntVector 363 int step = 32; // % B128.length() == 0! 364 int upper = data.length - (data.length % step); 365 for (int i = 0; i < upper; i += step) { 366 var bacc = ByteVector.zero(B128); 367 for (int j = 0; j < step; j += L128.length()) { 368 var v1 = LongVector.fromArray(L128, data, i + j); 369 var v2 = v1.reinterpretAsBytes(); 370 var v3 = popcntB128(v2); 371 bacc = bacc.add(v3); 372 } 373 acc = acc.add(sumUnsignedBytes(bacc)); 374 } 375 var r = acc.reduceLanes(VectorOperators.ADD) + tail(upper); 376 return r; 377 } 378 379 /* ============================================================================================================== */ 380 381 // FIGURE 10. A C function using AVX2 intrinsics implementing Mula's algorithm to compute the four population counts 382 // of the four 64-bit words in a 256-bit vector. The 32 B output vector should be interpreted as four separate 383 // 64-bit counts that need to be summed to obtain the final population count. 384 385 static final ByteVector MULA256_LOOKUP = 386 join(I128, I256, MULA128_LOOKUP.reinterpretAsInts(), MULA128_LOOKUP.reinterpretAsInts()).reinterpretAsBytes(); 387 388 ByteVector popcntB256(ByteVector v) { 389 var low_mask = ByteVector.broadcast(B256, (byte)0x0F); 390 391 var lo = v .and(low_mask); 392 var hi = v.lanewise(VectorOperators.LSHR, 4).and(low_mask); 393 394 var cnt1 = MULA256_LOOKUP.rearrange(lo.toShuffle()); 395 var cnt2 = MULA256_LOOKUP.rearrange(hi.toShuffle()); 396 var cnt = cnt1.add(cnt2); 397 398 return cnt; 399 } 400 401 // Horizontally sum each consecutive 8 differences to produce four unsigned 16-bit integers, 402 // and pack these unsigned 16-bit integers in the low 16 bits of 64-bit elements in dst: 403 // _mm256_sad_epu8(total, _mm256_setzero_si256()) 404 LongVector sumUnsignedBytes(ByteVector vb) { 405 return sumUnsignedBytesShapes(vb); 406 // return sumUnsignedBytesShifts(vb); 407 } 408 409 LongVector sumUnsignedBytesShapes(ByteVector vb) { 410 VectorSpecies<Short> shortSpecies = VectorSpecies.of(short.class, vb.shape()); 411 VectorSpecies<Integer> intSpecies = VectorSpecies.of(int.class, vb.shape()); 412 VectorSpecies<Long> longSpecies = VectorSpecies.of(long.class, vb.shape()); 413 414 var low_short_mask = ShortVector.broadcast(shortSpecies, (short) 0xFF); 415 var low_int_mask = IntVector.broadcast(intSpecies, 0xFFFF); 416 var low_long_mask = LongVector.broadcast(longSpecies, 0xFFFFFFFFL); 417 418 var vs = vb.reinterpretAsShorts(); // 16-bit 419 var vs0 = vs.and(low_short_mask); 420 var vs1 = vs.lanewise(VectorOperators.LSHR, 8).and(low_short_mask); 421 var vs01 = vs0.add(vs1); 422 423 var vi = vs01.reinterpretAsInts(); // 32-bit 424 var vi0 = vi.and(low_int_mask); 425 var vi1 = vi.lanewise(VectorOperators.LSHR, 16).and(low_int_mask); 426 var vi01 = vi0.add(vi1); 427 428 var vl = vi01.reinterpretAsLongs(); // 64-bit 429 var vl0 = vl.and(low_long_mask); 430 var vl1 = vl.lanewise(VectorOperators.LSHR, 32).and(low_long_mask); 431 var vl01 = vl0.add(vl1); 432 433 return vl01; 434 } 435 436 LongVector sumUnsignedBytesShifts(ByteVector vb) { 437 VectorSpecies<Long> to = VectorSpecies.of(long.class, vb.shape()); 438 439 var low_mask = LongVector.broadcast(to, 0xFF); 440 441 var vl = vb.reinterpretAsLongs(); 442 443 var v0 = vl .and(low_mask); // 8-bit 444 var v1 = vl.lanewise(VectorOperators.LSHR, 8).and(low_mask); // 8-bit 445 var v2 = vl.lanewise(VectorOperators.LSHR, 16).and(low_mask); // 8-bit 446 var v3 = vl.lanewise(VectorOperators.LSHR, 24).and(low_mask); // 8-bit 447 var v4 = vl.lanewise(VectorOperators.LSHR, 32).and(low_mask); // 8-bit 448 var v5 = vl.lanewise(VectorOperators.LSHR, 40).and(low_mask); // 8-bit 449 var v6 = vl.lanewise(VectorOperators.LSHR, 48).and(low_mask); // 8-bit 450 var v7 = vl.lanewise(VectorOperators.LSHR, 56).and(low_mask); // 8-bit 451 452 var v01 = v0.add(v1); 453 var v23 = v2.add(v3); 454 var v45 = v4.add(v5); 455 var v67 = v6.add(v7); 456 457 var v03 = v01.add(v23); 458 var v47 = v45.add(v67); 459 460 var sum = v03.add(v47); // 64-bit 461 return sum; 462 } 463 464 @Benchmark 465 public long Mula256() { 466 var acc = LongVector.zero(L256); 467 int step = 32; // % B256.length() == 0! 468 int upper = data.length - (data.length % step); 469 for (int i = 0; i < upper; i += step) { 470 var bacc = ByteVector.zero(B256); 471 for (int j = 0; j < step; j += L256.length()) { 472 var v1 = LongVector.fromArray(L256, data, i + j); 473 var v2 = popcntB256(v1.reinterpretAsBytes()); 474 bacc = bacc.add(v2); 475 } 476 acc = acc.add(sumUnsignedBytes(bacc)); 477 } 478 return acc.reduceLanes(VectorOperators.ADD) + tail(upper); 479 } 480 481 482 /* ============================================================================================================== */ 483 484 // FIGURE 11. A C function using AVX2 intrinsics implementing a bitwise parallel carry-save adder (CSA). 485 486 LongVector csaLow(LongVector a, LongVector b, LongVector c) { 487 var u = a.lanewise(VectorOperators.XOR, b); 488 var r = u.lanewise(VectorOperators.XOR, c); 489 return r; 490 } 491 492 LongVector csaHigh(LongVector a, LongVector b, LongVector c) { 493 var u = a.lanewise(VectorOperators.XOR, b); 494 var ab = a.and(b); 495 var uc = u.and(c); 496 var r = ab.or(uc); // (a & b) | ((a ^ b) & c) 497 return r; 498 } 499 500 LongVector popcntL256(LongVector v) { 501 var vb1 = v.reinterpretAsBytes(); 502 var vb2 = popcntB256(vb1); 503 return sumUnsignedBytes(vb2); 504 } 505 506 // FIGURE 12. A C function using AVX2 intrinsics implementing Harley-Seal's algorithm. It assumes, for 507 // simplicity, that the input size in 256-bit vectors is divisible by 16. See Fig. 10 for the count function. 508 509 @Benchmark 510 public long HarleySeal256() { 511 LongVector ones, twos, fours, eights, sixteens, vtotal, twosA, twosB, foursA, foursB, eightsA, eightsB; 512 ones = twos = fours = eights = sixteens = twosA = twosB = foursA = foursB = eightsA = eights = vtotal = LongVector.broadcast(L256, 0); 513 514 var vlen = L256.length(); 515 int step = 16 * vlen; 516 int upper = data.length - (data.length % step); 517 for (int i = 0; i < upper; i += step) { 518 // CSA(&twosA, &ones, ones, d[i+0], d[i +1]); 519 var d0 = LongVector.fromArray(L256, data, i + 0 * vlen); 520 var d1 = LongVector.fromArray(L256, data, i + 1 * vlen); 521 522 twosA = csaHigh(ones, d0, d1); 523 ones = csaLow(ones, d0, d1); 524 525 // CSA(&twosB, &ones, ones, d[i+2], d[i+3]); 526 var d2 = LongVector.fromArray(L256, data, i + 2 * vlen); 527 var d3 = LongVector.fromArray(L256, data, i + 3 * vlen); 528 twosB = csaHigh(ones, d2, d3); 529 ones = csaLow(ones, d2, d3); 530 531 // CSA(&foursA, &twos, twos, twosA, twosB); 532 foursA = csaHigh(twos, twosA, twosB); 533 twos = csaLow(twos, twosA, twosB); 534 535 // ==================================== 536 537 // CSA(&twosA, &ones, ones, d[i+4], d[i+5]); 538 var d4 = LongVector.fromArray(L256, data, i + 4 * vlen); 539 var d5 = LongVector.fromArray(L256, data, i + 5 * vlen); 540 twosA = csaHigh(ones, d4, d5); 541 ones = csaLow(ones, d4, d5); 542 543 // CSA(&twosB, &ones, ones, d[i+6], d[i+7]); 544 var d6 = LongVector.fromArray(L256, data, i + 6 * vlen); 545 var d7 = LongVector.fromArray(L256, data, i + 7 * vlen); 546 twosB = csaHigh(ones, d6, d7); 547 ones = csaLow(ones, d6, d7); 548 549 // CSA(&foursB, &twos, twos, twosA, twosB); 550 foursB = csaHigh(twos, twosA, twosB); 551 twos = csaLow(twos, twosA, twosB); 552 553 // ==================================== 554 555 // CSA(&eightsA, &fours, fours, foursA, foursB); 556 eightsA = csaHigh(fours, foursA, foursB); 557 fours = csaLow(fours, foursA, foursB); 558 559 // ==================================== 560 561 // CSA(&twosA, &ones, ones, d[i+8], d[i+9]); 562 var d8 = LongVector.fromArray(L256, data, i + 8 * vlen); 563 var d9 = LongVector.fromArray(L256, data, i + 9 * vlen); 564 twosA = csaHigh(ones, d8, d9); 565 ones = csaLow(ones, d8, d9); 566 567 // CSA(&twosB, &ones, ones, d[i+10],d[i+11]); 568 var d10 = LongVector.fromArray(L256, data, i + 10 * vlen); 569 var d11 = LongVector.fromArray(L256, data, i + 11 * vlen); 570 twosB = csaHigh(ones, d10, d11); 571 ones = csaLow(ones, d10, d11); 572 573 // CSA(&foursA, &twos, twos, twosA, twosB); 574 foursA = csaHigh(twos, twosA, twosB); 575 twos = csaLow(twos, twosA, twosB); 576 577 // ==================================== 578 579 // CSA(&twosA, &ones, ones, d[i+12], d[i +13]); 580 var d12 = LongVector.fromArray(L256, data, i + 12 * vlen); 581 var d13 = LongVector.fromArray(L256, data, i + 13 * vlen); 582 twosA = csaHigh(ones, d12, d13); 583 ones = csaLow(ones, d12, d13); 584 585 // CSA(&twosB, &ones, ones, d[i+14], d[i +15]); 586 var d14 = LongVector.fromArray(L256, data, i + 14 * vlen); 587 var d15 = LongVector.fromArray(L256, data, i + 15 * vlen); 588 twosB = csaHigh(ones, d14, d15); 589 ones = csaLow(ones, d14, d15); 590 591 // ==================================== 592 593 // CSA(&foursB, &twos, twos, twosA, twosB); 594 foursB = csaHigh(twos, twosA, twosB); 595 twos = csaLow(twos, twosA, twosB); 596 597 // CSA(&eightsB, &fours, fours, foursA, foursB); 598 eightsB = csaHigh(fours, foursA, foursB); 599 fours = csaLow(fours, foursA, foursB); 600 601 // ==================================== 602 603 // CSA(&sixteens, &eights, eights, eightsA, eightsB); 604 sixteens = csaHigh(eights, eightsA, eightsB); 605 eights = csaLow(eights, eightsA, eightsB); 606 607 vtotal = vtotal.add(popcntL256(sixteens)); 608 } 609 610 vtotal = vtotal.mul(16); // << 4 611 vtotal = vtotal.add(popcntL256(eights).mul(8)); // << 3 612 vtotal = vtotal.add(popcntL256(fours).mul(4)); // << 2 613 vtotal = vtotal.add(popcntL256(twos).mul(2)); // << 1 614 vtotal = vtotal.add(popcntL256(ones)); // << 0 615 616 var total = vtotal.reduceLanes(VectorOperators.ADD); 617 618 return total + tail(upper); 619 } 620 621 /* ============================================================================================================== */ 622 623 // ByteVector csaLow512(ByteVector a, ByteVector b, ByteVector c) { 624 // return _mm512_ternarylogic_epi32(c, b, a, 0x96); // vpternlogd 625 // } 626 // 627 // ByteVector csaLow512(ByteVector a, ByteVector b, ByteVector c) { 628 // return _mm512_ternarylogic_epi32(c, b, a, 0xe8); // vpternlogd 629 // } 630 }