/*
 * Copyright (c) 2018, 2022, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */
  23 
  24 package org.openjdk.bench.jdk.incubator.vector.operation;
  25 
  26 // -- This file was mechanically generated: Do not edit! -- //
  27 
  28 import java.util.concurrent.TimeUnit;
  29 import java.util.function.IntFunction;
  30 
  31 import org.openjdk.jmh.annotations.*;
  32 import org.openjdk.jmh.infra.Blackhole;
  33 
  34 @BenchmarkMode(Mode.Throughput)
  35 @OutputTimeUnit(TimeUnit.MILLISECONDS)
  36 @State(Scope.Benchmark)
  37 @Warmup(iterations = 3, time = 1)
  38 @Measurement(iterations = 5, time = 1)
  39 @Fork(value = 1, jvmArgsPrepend = {"--add-modules=jdk.incubator.vector"})
  40 public class ByteScalar extends AbstractVectorBenchmark {
  41     static final int INVOC_COUNT = 1; // To align with vector benchmarks.
  42 
  43     private static final byte CONST_SHIFT = Byte.SIZE / 2;
  44 
  45     @Param("1024")
  46     int size;
  47 
  48     byte[] fill(IntFunction<Byte> f) {
  49         byte[] array = new byte[size];
  50         for (int i = 0; i < array.length; i++) {
  51             array[i] = f.apply(i);
  52         }
  53         return array;
  54     }
  55 
  56     static byte bits(byte e) {
  57         return e;
  58     }
  59 
  60     byte[] as, bs, cs, rs;
  61     boolean[] ms, mt, rms;
  62     int[] ss;
  63 
  64     @Setup
  65     public void init() {
  66         as = fill(i -> (byte)(2*i));
  67         bs = fill(i -> (byte)(i+1));
  68         cs = fill(i -> (byte)(i+5));
  69         rs = fill(i -> (byte)0);
  70         ms = fillMask(size, i -> (i % 2) == 0);
  71         mt = fillMask(size, i -> true);
  72         rms = fillMask(size, i -> false);
  73 
  74         ss = fillInt(size, i -> RANDOM.nextInt(Math.max(i,1)));
  75     }
  76 
  77     final IntFunction<byte[]> fa = vl -> as;
  78     final IntFunction<byte[]> fb = vl -> bs;
  79     final IntFunction<byte[]> fc = vl -> cs;
  80     final IntFunction<byte[]> fr = vl -> rs;
  81     final IntFunction<boolean[]> fm = vl -> ms;
  82     final IntFunction<boolean[]> fmt = vl -> mt;
  83     final IntFunction<boolean[]> fmr = vl -> rms;
  84     final IntFunction<int[]> fs = vl -> ss;
  85 
  86     static boolean eq(byte a, byte b) {
  87         return a == b;
  88     }
  89 
  90     static boolean neq(byte a, byte b) {
  91         return a != b;
  92     }
  93 
  94     static boolean lt(byte a, byte b) {
  95         return a < b;
  96     }
  97 
  98     static boolean le(byte a, byte b) {
  99         return a <= b;
 100     }
 101 
 102     static boolean gt(byte a, byte b) {
 103         return a > b;
 104     }
 105 
 106     static boolean ge(byte a, byte b) {
 107         return a >= b;
 108     }
 109 
 110     static boolean ult(byte a, byte b) {
 111         return Byte.compareUnsigned(a, b) < 0;
 112     }
 113 
 114     static boolean ule(byte a, byte b) {
 115         return Byte.compareUnsigned(a, b) <= 0;
 116     }
 117 
 118     static boolean ugt(byte a, byte b) {
 119         return Byte.compareUnsigned(a, b) > 0;
 120     }
 121 
 122     static boolean uge(byte a, byte b) {
 123         return Byte.compareUnsigned(a, b) >= 0;
 124     }
 125 
 126     static byte ROL_scalar(byte a, byte b) {
 127         return (byte)(((((byte)a) & 0xFF) << (b & 7)) | ((((byte)a) & 0xFF) >>> (8 - (b & 7))));
 128     }
 129 
 130     static byte ROR_scalar(byte a, byte b) {
 131         return (byte)(((((byte)a) & 0xFF) >>> (b & 7)) | ((((byte)a) & 0xFF) << (8 - (b & 7))));
 132     }
 133 
 134     static byte TRAILING_ZEROS_COUNT_scalar(byte a) {
 135         return (byte) (a != 0 ? Integer.numberOfTrailingZeros(a) : 8);
 136     }
 137 
 138     static byte LEADING_ZEROS_COUNT_scalar(byte a) {
 139         return (byte) (a >= 0 ? Integer.numberOfLeadingZeros(a) - 24 : 0);
 140     }
 141 
 142     static byte REVERSE_scalar(byte a) {
 143         byte b = ROL_scalar(a, (byte) 4);
 144         b = (byte)(((b & 0x55) << 1) | ((b & 0xAA) >>> 1));
 145         b = (byte)(((b & 0x33) << 2) | ((b & 0xCC) >>> 2));
 146         return b;
 147     }
 148 
 149     @Benchmark
 150     public void ADD(Blackhole bh) {
 151         byte[] as = fa.apply(size);
 152         byte[] bs = fb.apply(size);
 153         byte[] rs = fr.apply(size);
 154 
 155         for (int ic = 0; ic < INVOC_COUNT; ic++) {
 156             for (int i = 0; i < as.length; i++) {
 157                 byte a = as[i];
 158                 byte b = bs[i];
 159                 rs[i] = (byte)(a + b);
 160             }
 161         }
 162 
 163         bh.consume(rs);
 164     }
 165 
 166     @Benchmark
 167     public void ADDMasked(Blackhole bh) {
 168         byte[] as = fa.apply(size);
 169         byte[] bs = fb.apply(size);
 170         byte[] rs = fr.apply(size);
 171         boolean[] ms = fm.apply(size);
 172 
 173         for (int ic = 0; ic < INVOC_COUNT; ic++) {
 174             for (int i = 0; i < as.length; i++) {
 175                 byte a = as[i];
 176                 byte b = bs[i];
 177                 if (ms[i % ms.length]) {
 178                     rs[i] = (byte)(a + b);
 179                 } else {
 180                     rs[i] = a;
 181                 }
 182             }
 183         }
 184         bh.consume(rs);
 185     }
 186 
 187     @Benchmark
 188     public void SUB(Blackhole bh) {
 189         byte[] as = fa.apply(size);
 190         byte[] bs = fb.apply(size);
 191         byte[] rs = fr.apply(size);
 192 
 193         for (int ic = 0; ic < INVOC_COUNT; ic++) {
 194             for (int i = 0; i < as.length; i++) {
 195                 byte a = as[i];
 196                 byte b = bs[i];
 197                 rs[i] = (byte)(a - b);
 198             }
 199         }
 200 
 201         bh.consume(rs);
 202     }
 203 
 204     @Benchmark
 205     public void SUBMasked(Blackhole bh) {
 206         byte[] as = fa.apply(size);
 207         byte[] bs = fb.apply(size);
 208         byte[] rs = fr.apply(size);
 209         boolean[] ms = fm.apply(size);
 210 
 211         for (int ic = 0; ic < INVOC_COUNT; ic++) {
 212             for (int i = 0; i < as.length; i++) {
 213                 byte a = as[i];
 214                 byte b = bs[i];
 215                 if (ms[i % ms.length]) {
 216                     rs[i] = (byte)(a - b);
 217                 } else {
 218                     rs[i] = a;
 219                 }
 220             }
 221         }
 222         bh.consume(rs);
 223     }
 224 
 225     @Benchmark
 226     public void MUL(Blackhole bh) {
 227         byte[] as = fa.apply(size);
 228         byte[] bs = fb.apply(size);
 229         byte[] rs = fr.apply(size);
 230 
 231         for (int ic = 0; ic < INVOC_COUNT; ic++) {
 232             for (int i = 0; i < as.length; i++) {
 233                 byte a = as[i];
 234                 byte b = bs[i];
 235                 rs[i] = (byte)(a * b);
 236             }
 237         }
 238 
 239         bh.consume(rs);
 240     }
 241 
 242     @Benchmark
 243     public void MULMasked(Blackhole bh) {
 244         byte[] as = fa.apply(size);
 245         byte[] bs = fb.apply(size);
 246         byte[] rs = fr.apply(size);
 247         boolean[] ms = fm.apply(size);
 248 
 249         for (int ic = 0; ic < INVOC_COUNT; ic++) {
 250             for (int i = 0; i < as.length; i++) {
 251                 byte a = as[i];
 252                 byte b = bs[i];
 253                 if (ms[i % ms.length]) {
 254                     rs[i] = (byte)(a * b);
 255                 } else {
 256                     rs[i] = a;
 257                 }
 258             }
 259         }
 260         bh.consume(rs);
 261     }
 262 
 263     @Benchmark
 264     public void FIRST_NONZERO(Blackhole bh) {
 265         byte[] as = fa.apply(size);
 266         byte[] bs = fb.apply(size);
 267         byte[] rs = fr.apply(size);
 268 
 269         for (int ic = 0; ic < INVOC_COUNT; ic++) {
 270             for (int i = 0; i < as.length; i++) {
 271                 byte a = as[i];
 272                 byte b = bs[i];
 273                 rs[i] = (byte)((a)!=0?a:b);
 274             }
 275         }
 276 
 277         bh.consume(rs);
 278     }
 279 
 280     @Benchmark
 281     public void FIRST_NONZEROMasked(Blackhole bh) {
 282         byte[] as = fa.apply(size);
 283         byte[] bs = fb.apply(size);
 284         byte[] rs = fr.apply(size);
 285         boolean[] ms = fm.apply(size);
 286 
 287         for (int ic = 0; ic < INVOC_COUNT; ic++) {
 288             for (int i = 0; i < as.length; i++) {
 289                 byte a = as[i];
 290                 byte b = bs[i];
 291                 if (ms[i % ms.length]) {
 292                     rs[i] = (byte)((a)!=0?a:b);
 293                 } else {
 294                     rs[i] = a;
 295                 }
 296             }
 297         }
 298         bh.consume(rs);
 299     }
 300 
 301     @Benchmark
 302     public void AND(Blackhole bh) {
 303         byte[] as = fa.apply(size);
 304         byte[] bs = fb.apply(size);
 305         byte[] rs = fr.apply(size);
 306 
 307         for (int ic = 0; ic < INVOC_COUNT; ic++) {
 308             for (int i = 0; i < as.length; i++) {
 309                 byte a = as[i];
 310                 byte b = bs[i];
 311                 rs[i] = (byte)(a & b);
 312             }
 313         }
 314 
 315         bh.consume(rs);
 316     }
 317 
 318     @Benchmark
 319     public void ANDMasked(Blackhole bh) {
 320         byte[] as = fa.apply(size);
 321         byte[] bs = fb.apply(size);
 322         byte[] rs = fr.apply(size);
 323         boolean[] ms = fm.apply(size);
 324 
 325         for (int ic = 0; ic < INVOC_COUNT; ic++) {
 326             for (int i = 0; i < as.length; i++) {
 327                 byte a = as[i];
 328                 byte b = bs[i];
 329                 if (ms[i % ms.length]) {
 330                     rs[i] = (byte)(a & b);
 331                 } else {
 332                     rs[i] = a;
 333                 }
 334             }
 335         }
 336         bh.consume(rs);
 337     }
 338 
 339     @Benchmark
 340     public void AND_NOT(Blackhole bh) {
 341         byte[] as = fa.apply(size);
 342         byte[] bs = fb.apply(size);
 343         byte[] rs = fr.apply(size);
 344 
 345         for (int ic = 0; ic < INVOC_COUNT; ic++) {
 346             for (int i = 0; i < as.length; i++) {
 347                 byte a = as[i];
 348                 byte b = bs[i];
 349                 rs[i] = (byte)(a & ~b);
 350             }
 351         }
 352 
 353         bh.consume(rs);
 354     }
 355 
 356     @Benchmark
 357     public void AND_NOTMasked(Blackhole bh) {
 358         byte[] as = fa.apply(size);
 359         byte[] bs = fb.apply(size);
 360         byte[] rs = fr.apply(size);
 361         boolean[] ms = fm.apply(size);
 362 
 363         for (int ic = 0; ic < INVOC_COUNT; ic++) {
 364             for (int i = 0; i < as.length; i++) {
 365                 byte a = as[i];
 366                 byte b = bs[i];
 367                 if (ms[i % ms.length]) {
 368                     rs[i] = (byte)(a & ~b);
 369                 } else {
 370                     rs[i] = a;
 371                 }
 372             }
 373         }
 374         bh.consume(rs);
 375     }
 376 
 377     @Benchmark
 378     public void OR(Blackhole bh) {
 379         byte[] as = fa.apply(size);
 380         byte[] bs = fb.apply(size);
 381         byte[] rs = fr.apply(size);
 382 
 383         for (int ic = 0; ic < INVOC_COUNT; ic++) {
 384             for (int i = 0; i < as.length; i++) {
 385                 byte a = as[i];
 386                 byte b = bs[i];
 387                 rs[i] = (byte)(a | b);
 388             }
 389         }
 390 
 391         bh.consume(rs);
 392     }
 393 
 394     @Benchmark
 395     public void ORMasked(Blackhole bh) {
 396         byte[] as = fa.apply(size);
 397         byte[] bs = fb.apply(size);
 398         byte[] rs = fr.apply(size);
 399         boolean[] ms = fm.apply(size);
 400 
 401         for (int ic = 0; ic < INVOC_COUNT; ic++) {
 402             for (int i = 0; i < as.length; i++) {
 403                 byte a = as[i];
 404                 byte b = bs[i];
 405                 if (ms[i % ms.length]) {
 406                     rs[i] = (byte)(a | b);
 407                 } else {
 408                     rs[i] = a;
 409                 }
 410             }
 411         }
 412         bh.consume(rs);
 413     }
 414 
 415     @Benchmark
 416     public void XOR(Blackhole bh) {
 417         byte[] as = fa.apply(size);
 418         byte[] bs = fb.apply(size);
 419         byte[] rs = fr.apply(size);
 420 
 421         for (int ic = 0; ic < INVOC_COUNT; ic++) {
 422             for (int i = 0; i < as.length; i++) {
 423                 byte a = as[i];
 424                 byte b = bs[i];
 425                 rs[i] = (byte)(a ^ b);
 426             }
 427         }
 428 
 429         bh.consume(rs);
 430     }
 431 
 432     @Benchmark
 433     public void XORMasked(Blackhole bh) {
 434         byte[] as = fa.apply(size);
 435         byte[] bs = fb.apply(size);
 436         byte[] rs = fr.apply(size);
 437         boolean[] ms = fm.apply(size);
 438 
 439         for (int ic = 0; ic < INVOC_COUNT; ic++) {
 440             for (int i = 0; i < as.length; i++) {
 441                 byte a = as[i];
 442                 byte b = bs[i];
 443                 if (ms[i % ms.length]) {
 444                     rs[i] = (byte)(a ^ b);
 445                 } else {
 446                     rs[i] = a;
 447                 }
 448             }
 449         }
 450         bh.consume(rs);
 451     }
 452 
 453     @Benchmark
 454     public void LSHL(Blackhole bh) {
 455         byte[] as = fa.apply(size);
 456         byte[] bs = fb.apply(size);
 457         byte[] rs = fr.apply(size);
 458 
 459         for (int ic = 0; ic < INVOC_COUNT; ic++) {
 460             for (int i = 0; i < as.length; i++) {
 461                 byte a = as[i];
 462                 byte b = bs[i];
 463                 rs[i] = (byte)((a << (b & 0x7)));
 464             }
 465         }
 466 
 467         bh.consume(rs);
 468     }
 469 
 470     @Benchmark
 471     public void LSHLMasked(Blackhole bh) {
 472         byte[] as = fa.apply(size);
 473         byte[] bs = fb.apply(size);
 474         byte[] rs = fr.apply(size);
 475         boolean[] ms = fm.apply(size);
 476 
 477         for (int ic = 0; ic < INVOC_COUNT; ic++) {
 478             for (int i = 0; i < as.length; i++) {
 479                 byte a = as[i];
 480                 byte b = bs[i];
 481                 if (ms[i % ms.length]) {
 482                     rs[i] = (byte)((a << (b & 0x7)));
 483                 } else {
 484                     rs[i] = a;
 485                 }
 486             }
 487         }
 488         bh.consume(rs);
 489     }
 490 
 491     @Benchmark
 492     public void ASHR(Blackhole bh) {
 493         byte[] as = fa.apply(size);
 494         byte[] bs = fb.apply(size);
 495         byte[] rs = fr.apply(size);
 496 
 497         for (int ic = 0; ic < INVOC_COUNT; ic++) {
 498             for (int i = 0; i < as.length; i++) {
 499                 byte a = as[i];
 500                 byte b = bs[i];
 501                 rs[i] = (byte)((a >> (b & 0x7)));
 502             }
 503         }
 504 
 505         bh.consume(rs);
 506     }
 507 
 508     @Benchmark
 509     public void ASHRMasked(Blackhole bh) {
 510         byte[] as = fa.apply(size);
 511         byte[] bs = fb.apply(size);
 512         byte[] rs = fr.apply(size);
 513         boolean[] ms = fm.apply(size);
 514 
 515         for (int ic = 0; ic < INVOC_COUNT; ic++) {
 516             for (int i = 0; i < as.length; i++) {
 517                 byte a = as[i];
 518                 byte b = bs[i];
 519                 if (ms[i % ms.length]) {
 520                     rs[i] = (byte)((a >> (b & 0x7)));
 521                 } else {
 522                     rs[i] = a;
 523                 }
 524             }
 525         }
 526         bh.consume(rs);
 527     }
 528 
 529     @Benchmark
 530     public void LSHR(Blackhole bh) {
 531         byte[] as = fa.apply(size);
 532         byte[] bs = fb.apply(size);
 533         byte[] rs = fr.apply(size);
 534 
 535         for (int ic = 0; ic < INVOC_COUNT; ic++) {
 536             for (int i = 0; i < as.length; i++) {
 537                 byte a = as[i];
 538                 byte b = bs[i];
 539                 rs[i] = (byte)(((a & 0xFF) >>> (b & 0x7)));
 540             }
 541         }
 542 
 543         bh.consume(rs);
 544     }
 545 
 546     @Benchmark
 547     public void LSHRMasked(Blackhole bh) {
 548         byte[] as = fa.apply(size);
 549         byte[] bs = fb.apply(size);
 550         byte[] rs = fr.apply(size);
 551         boolean[] ms = fm.apply(size);
 552 
 553         for (int ic = 0; ic < INVOC_COUNT; ic++) {
 554             for (int i = 0; i < as.length; i++) {
 555                 byte a = as[i];
 556                 byte b = bs[i];
 557                 if (ms[i % ms.length]) {
 558                     rs[i] = (byte)(((a & 0xFF) >>> (b & 0x7)));
 559                 } else {
 560                     rs[i] = a;
 561                 }
 562             }
 563         }
 564         bh.consume(rs);
 565     }
 566 
 567     @Benchmark
 568     public void LSHLShift(Blackhole bh) {
 569         byte[] as = fa.apply(size);
 570         byte[] bs = fb.apply(size);
 571         byte[] rs = fr.apply(size);
 572 
 573         for (int ic = 0; ic < INVOC_COUNT; ic++) {
 574             for (int i = 0; i < as.length; i++) {
 575                 byte a = as[i];
 576                 byte b = bs[i];
 577                 rs[i] = (byte)((a << (b & 7)));
 578             }
 579         }
 580 
 581         bh.consume(rs);
 582     }
 583 
 584     @Benchmark
 585     public void LSHLMaskedShift(Blackhole bh) {
 586         byte[] as = fa.apply(size);
 587         byte[] bs = fb.apply(size);
 588         byte[] rs = fr.apply(size);
 589         boolean[] ms = fm.apply(size);
 590 
 591         for (int ic = 0; ic < INVOC_COUNT; ic++) {
 592             for (int i = 0; i < as.length; i++) {
 593                 byte a = as[i];
 594                 byte b = bs[i];
 595                 boolean m = ms[i % ms.length];
 596                 rs[i] = (m ? (byte)((a << (b & 7))) : a);
 597             }
 598         }
 599 
 600         bh.consume(rs);
 601     }
 602 
 603     @Benchmark
 604     public void LSHRShift(Blackhole bh) {
 605         byte[] as = fa.apply(size);
 606         byte[] bs = fb.apply(size);
 607         byte[] rs = fr.apply(size);
 608 
 609         for (int ic = 0; ic < INVOC_COUNT; ic++) {
 610             for (int i = 0; i < as.length; i++) {
 611                 byte a = as[i];
 612                 byte b = bs[i];
 613                 rs[i] = (byte)(((a & 0xFF) >>> (b & 7)));
 614             }
 615         }
 616 
 617         bh.consume(rs);
 618     }
 619 
 620     @Benchmark
 621     public void LSHRMaskedShift(Blackhole bh) {
 622         byte[] as = fa.apply(size);
 623         byte[] bs = fb.apply(size);
 624         byte[] rs = fr.apply(size);
 625         boolean[] ms = fm.apply(size);
 626 
 627         for (int ic = 0; ic < INVOC_COUNT; ic++) {
 628             for (int i = 0; i < as.length; i++) {
 629                 byte a = as[i];
 630                 byte b = bs[i];
 631                 boolean m = ms[i % ms.length];
 632                 rs[i] = (m ? (byte)(((a & 0xFF) >>> (b & 7))) : a);
 633             }
 634         }
 635 
 636         bh.consume(rs);
 637     }
 638 
 639     @Benchmark
 640     public void ASHRShift(Blackhole bh) {
 641         byte[] as = fa.apply(size);
 642         byte[] bs = fb.apply(size);
 643         byte[] rs = fr.apply(size);
 644 
 645         for (int ic = 0; ic < INVOC_COUNT; ic++) {
 646             for (int i = 0; i < as.length; i++) {
 647                 byte a = as[i];
 648                 byte b = bs[i];
 649                 rs[i] = (byte)((a >> (b & 7)));
 650             }
 651         }
 652 
 653         bh.consume(rs);
 654     }
 655 
 656     @Benchmark
 657     public void ASHRMaskedShift(Blackhole bh) {
 658         byte[] as = fa.apply(size);
 659         byte[] bs = fb.apply(size);
 660         byte[] rs = fr.apply(size);
 661         boolean[] ms = fm.apply(size);
 662 
 663         for (int ic = 0; ic < INVOC_COUNT; ic++) {
 664             for (int i = 0; i < as.length; i++) {
 665                 byte a = as[i];
 666                 byte b = bs[i];
 667                 boolean m = ms[i % ms.length];
 668                 rs[i] = (m ? (byte)((a >> (b & 7))) : a);
 669             }
 670         }
 671 
 672         bh.consume(rs);
 673     }
 674 
 675     @Benchmark
 676     public void ROR(Blackhole bh) {
 677         byte[] as = fa.apply(size);
 678         byte[] bs = fb.apply(size);
 679         byte[] rs = fr.apply(size);
 680 
 681         for (int ic = 0; ic < INVOC_COUNT; ic++) {
 682             for (int i = 0; i < as.length; i++) {
 683                 byte a = as[i];
 684                 byte b = bs[i];
 685                 rs[i] = (byte)(ROR_scalar(a,b));
 686             }
 687         }
 688 
 689         bh.consume(rs);
 690     }
 691 
 692     @Benchmark
 693     public void RORMasked(Blackhole bh) {
 694         byte[] as = fa.apply(size);
 695         byte[] bs = fb.apply(size);
 696         byte[] rs = fr.apply(size);
 697         boolean[] ms = fm.apply(size);
 698 
 699         for (int ic = 0; ic < INVOC_COUNT; ic++) {
 700             for (int i = 0; i < as.length; i++) {
 701                 byte a = as[i];
 702                 byte b = bs[i];
 703                 if (ms[i % ms.length]) {
 704                     rs[i] = (byte)(ROR_scalar(a,b));
 705                 } else {
 706                     rs[i] = a;
 707                 }
 708             }
 709         }
 710         bh.consume(rs);
 711     }
 712 
 713     @Benchmark
 714     public void ROL(Blackhole bh) {
 715         byte[] as = fa.apply(size);
 716         byte[] bs = fb.apply(size);
 717         byte[] rs = fr.apply(size);
 718 
 719         for (int ic = 0; ic < INVOC_COUNT; ic++) {
 720             for (int i = 0; i < as.length; i++) {
 721                 byte a = as[i];
 722                 byte b = bs[i];
 723                 rs[i] = (byte)(ROL_scalar(a,b));
 724             }
 725         }
 726 
 727         bh.consume(rs);
 728     }
 729 
 730     @Benchmark
 731     public void ROLMasked(Blackhole bh) {
 732         byte[] as = fa.apply(size);
 733         byte[] bs = fb.apply(size);
 734         byte[] rs = fr.apply(size);
 735         boolean[] ms = fm.apply(size);
 736 
 737         for (int ic = 0; ic < INVOC_COUNT; ic++) {
 738             for (int i = 0; i < as.length; i++) {
 739                 byte a = as[i];
 740                 byte b = bs[i];
 741                 if (ms[i % ms.length]) {
 742                     rs[i] = (byte)(ROL_scalar(a,b));
 743                 } else {
 744                     rs[i] = a;
 745                 }
 746             }
 747         }
 748         bh.consume(rs);
 749     }
 750 
 751     @Benchmark
 752     public void RORShift(Blackhole bh) {
 753         byte[] as = fa.apply(size);
 754         byte[] bs = fb.apply(size);
 755         byte[] rs = fr.apply(size);
 756 
 757         for (int ic = 0; ic < INVOC_COUNT; ic++) {
 758             for (int i = 0; i < as.length; i++) {
 759                 byte a = as[i];
 760                 byte b = bs[i];
 761                 rs[i] = (byte)(ROR_scalar(a, b));
 762             }
 763         }
 764 
 765         bh.consume(rs);
 766     }
 767 
 768     @Benchmark
 769     public void RORMaskedShift(Blackhole bh) {
 770         byte[] as = fa.apply(size);
 771         byte[] bs = fb.apply(size);
 772         byte[] rs = fr.apply(size);
 773         boolean[] ms = fm.apply(size);
 774 
 775         for (int ic = 0; ic < INVOC_COUNT; ic++) {
 776             for (int i = 0; i < as.length; i++) {
 777                 byte a = as[i];
 778                 byte b = bs[i];
 779                 boolean m = ms[i % ms.length];
 780                 rs[i] = (m ? (byte)(ROR_scalar(a, b)) : a);
 781             }
 782         }
 783 
 784         bh.consume(rs);
 785     }
 786 
 787     @Benchmark
 788     public void ROLShift(Blackhole bh) {
 789         byte[] as = fa.apply(size);
 790         byte[] bs = fb.apply(size);
 791         byte[] rs = fr.apply(size);
 792 
 793         for (int ic = 0; ic < INVOC_COUNT; ic++) {
 794             for (int i = 0; i < as.length; i++) {
 795                 byte a = as[i];
 796                 byte b = bs[i];
 797                 rs[i] = (byte)(ROL_scalar(a, b));
 798             }
 799         }
 800 
 801         bh.consume(rs);
 802     }
 803 
 804     @Benchmark
 805     public void ROLMaskedShift(Blackhole bh) {
 806         byte[] as = fa.apply(size);
 807         byte[] bs = fb.apply(size);
 808         byte[] rs = fr.apply(size);
 809         boolean[] ms = fm.apply(size);
 810 
 811         for (int ic = 0; ic < INVOC_COUNT; ic++) {
 812             for (int i = 0; i < as.length; i++) {
 813                 byte a = as[i];
 814                 byte b = bs[i];
 815                 boolean m = ms[i % ms.length];
 816                 rs[i] = (m ? (byte)(ROL_scalar(a, b)) : a);
 817             }
 818         }
 819 
 820         bh.consume(rs);
 821     }
 822 
 823     @Benchmark
 824     public void LSHRShiftConst(Blackhole bh) {
 825         byte[] as = fa.apply(size);
 826         byte[] bs = fb.apply(size);
 827         byte[] rs = fr.apply(size);
 828 
 829         for (int ic = 0; ic < INVOC_COUNT; ic++) {
 830             for (int i = 0; i < as.length; i++) {
 831                 byte a = as[i];
 832                 byte b = bs[i];
 833                 rs[i] = (byte)(((a & 0xFF) >>> CONST_SHIFT));
 834             }
 835         }
 836 
 837         bh.consume(rs);
 838     }
 839 
 840     @Benchmark
 841     public void LSHRMaskedShiftConst(Blackhole bh) {
 842         byte[] as = fa.apply(size);
 843         byte[] bs = fb.apply(size);
 844         byte[] rs = fr.apply(size);
 845         boolean[] ms = fm.apply(size);
 846 
 847         for (int ic = 0; ic < INVOC_COUNT; ic++) {
 848             for (int i = 0; i < as.length; i++) {
 849                 byte a = as[i];
 850                 byte b = bs[i];
 851                 boolean m = ms[i % ms.length];
 852                 rs[i] = (m ? (byte)(((a & 0xFF) >>> CONST_SHIFT)) : a);
 853             }
 854         }
 855 
 856         bh.consume(rs);
 857     }
 858 
 859     @Benchmark
 860     public void LSHLShiftConst(Blackhole bh) {
 861         byte[] as = fa.apply(size);
 862         byte[] bs = fb.apply(size);
 863         byte[] rs = fr.apply(size);
 864 
 865         for (int ic = 0; ic < INVOC_COUNT; ic++) {
 866             for (int i = 0; i < as.length; i++) {
 867                 byte a = as[i];
 868                 byte b = bs[i];
 869                 rs[i] = (byte)((a << CONST_SHIFT));
 870             }
 871         }
 872 
 873         bh.consume(rs);
 874     }
 875 
 876     @Benchmark
 877     public void LSHLMaskedShiftConst(Blackhole bh) {
 878         byte[] as = fa.apply(size);
 879         byte[] bs = fb.apply(size);
 880         byte[] rs = fr.apply(size);
 881         boolean[] ms = fm.apply(size);
 882 
 883         for (int ic = 0; ic < INVOC_COUNT; ic++) {
 884             for (int i = 0; i < as.length; i++) {
 885                 byte a = as[i];
 886                 byte b = bs[i];
 887                 boolean m = ms[i % ms.length];
 888                 rs[i] = (m ? (byte)((a << CONST_SHIFT)) : a);
 889             }
 890         }
 891 
 892         bh.consume(rs);
 893     }
 894 
 895     @Benchmark
 896     public void ASHRShiftConst(Blackhole bh) {
 897         byte[] as = fa.apply(size);
 898         byte[] bs = fb.apply(size);
 899         byte[] rs = fr.apply(size);
 900 
 901         for (int ic = 0; ic < INVOC_COUNT; ic++) {
 902             for (int i = 0; i < as.length; i++) {
 903                 byte a = as[i];
 904                 byte b = bs[i];
 905                 rs[i] = (byte)((a >> CONST_SHIFT));
 906             }
 907         }
 908 
 909         bh.consume(rs);
 910     }
 911 
 912     @Benchmark
 913     public void ASHRMaskedShiftConst(Blackhole bh) {
 914         byte[] as = fa.apply(size);
 915         byte[] bs = fb.apply(size);
 916         byte[] rs = fr.apply(size);
 917         boolean[] ms = fm.apply(size);
 918 
 919         for (int ic = 0; ic < INVOC_COUNT; ic++) {
 920             for (int i = 0; i < as.length; i++) {
 921                 byte a = as[i];
 922                 byte b = bs[i];
 923                 boolean m = ms[i % ms.length];
 924                 rs[i] = (m ? (byte)((a >> CONST_SHIFT)) : a);
 925             }
 926         }
 927 
 928         bh.consume(rs);
 929     }
 930 
 931     @Benchmark
 932     public void RORShiftConst(Blackhole bh) {
 933         byte[] as = fa.apply(size);
 934         byte[] bs = fb.apply(size);
 935         byte[] rs = fr.apply(size);
 936 
 937         for (int ic = 0; ic < INVOC_COUNT; ic++) {
 938             for (int i = 0; i < as.length; i++) {
 939                 byte a = as[i];
 940                 byte b = bs[i];
 941                 rs[i] = (byte)(ROR_scalar(a, CONST_SHIFT));
 942             }
 943         }
 944 
 945         bh.consume(rs);
 946     }
 947 
 948     @Benchmark
 949     public void RORMaskedShiftConst(Blackhole bh) {
 950         byte[] as = fa.apply(size);
 951         byte[] bs = fb.apply(size);
 952         byte[] rs = fr.apply(size);
 953         boolean[] ms = fm.apply(size);
 954 
 955         for (int ic = 0; ic < INVOC_COUNT; ic++) {
 956             for (int i = 0; i < as.length; i++) {
 957                 byte a = as[i];
 958                 byte b = bs[i];
 959                 boolean m = ms[i % ms.length];
 960                 rs[i] = (m ? (byte)(ROR_scalar(a, CONST_SHIFT)) : a);
 961             }
 962         }
 963 
 964         bh.consume(rs);
 965     }
 966 
 967     @Benchmark
 968     public void ROLShiftConst(Blackhole bh) {
 969         byte[] as = fa.apply(size);
 970         byte[] bs = fb.apply(size);
 971         byte[] rs = fr.apply(size);
 972 
 973         for (int ic = 0; ic < INVOC_COUNT; ic++) {
 974             for (int i = 0; i < as.length; i++) {
 975                 byte a = as[i];
 976                 byte b = bs[i];
 977                 rs[i] = (byte)(ROL_scalar(a, CONST_SHIFT));
 978             }
 979         }
 980 
 981         bh.consume(rs);
 982     }
 983 
 984     @Benchmark
 985     public void ROLMaskedShiftConst(Blackhole bh) {
 986         byte[] as = fa.apply(size);
 987         byte[] bs = fb.apply(size);
 988         byte[] rs = fr.apply(size);
 989         boolean[] ms = fm.apply(size);
 990 
 991         for (int ic = 0; ic < INVOC_COUNT; ic++) {
 992             for (int i = 0; i < as.length; i++) {
 993                 byte a = as[i];
 994                 byte b = bs[i];
 995                 boolean m = ms[i % ms.length];
 996                 rs[i] = (m ? (byte)(ROL_scalar(a, CONST_SHIFT)) : a);
 997             }
 998         }
 999 
1000         bh.consume(rs);
1001     }
1002 
1003     @Benchmark
1004     public void MIN(Blackhole bh) {
1005         byte[] as = fa.apply(size);
1006         byte[] bs = fb.apply(size);
1007         byte[] rs = fr.apply(size);
1008 
1009         for (int ic = 0; ic < INVOC_COUNT; ic++) {
1010             for (int i = 0; i < as.length; i++) {
1011                 byte a = as[i];
1012                 byte b = bs[i];
1013                 rs[i] = (byte)(Math.min(a, b));
1014             }
1015         }
1016 
1017         bh.consume(rs);
1018     }
1019 
1020     @Benchmark
1021     public void MAX(Blackhole bh) {
1022         byte[] as = fa.apply(size);
1023         byte[] bs = fb.apply(size);
1024         byte[] rs = fr.apply(size);
1025 
1026         for (int ic = 0; ic < INVOC_COUNT; ic++) {
1027             for (int i = 0; i < as.length; i++) {
1028                 byte a = as[i];
1029                 byte b = bs[i];
1030                 rs[i] = (byte)(Math.max(a, b));
1031             }
1032         }
1033 
1034         bh.consume(rs);
1035     }
1036 
1037     @Benchmark
1038     public void ANDLanes(Blackhole bh) {
1039         byte[] as = fa.apply(size);
1040         byte r = -1;
1041         for (int ic = 0; ic < INVOC_COUNT; ic++) {
1042             r = -1;
1043             for (int i = 0; i < as.length; i++) {
1044                 r &= as[i];
1045             }
1046         }
1047         bh.consume(r);
1048     }
1049 
1050     @Benchmark
1051     public void ANDMaskedLanes(Blackhole bh) {
1052         byte[] as = fa.apply(size);
1053         boolean[] ms = fm.apply(size);
1054         byte r = -1;
1055         for (int ic = 0; ic < INVOC_COUNT; ic++) {
1056             r = -1;
1057             for (int i = 0; i < as.length; i++) {
1058                 if (ms[i % ms.length])
1059                     r &= as[i];
1060             }
1061         }
1062         bh.consume(r);
1063     }
1064 
1065     @Benchmark
1066     public void ORLanes(Blackhole bh) {
1067         byte[] as = fa.apply(size);
1068         byte r = 0;
1069         for (int ic = 0; ic < INVOC_COUNT; ic++) {
1070             r = 0;
1071             for (int i = 0; i < as.length; i++) {
1072                 r |= as[i];
1073             }
1074         }
1075         bh.consume(r);
1076     }
1077 
1078     @Benchmark
1079     public void ORMaskedLanes(Blackhole bh) {
1080         byte[] as = fa.apply(size);
1081         boolean[] ms = fm.apply(size);
1082         byte r = 0;
1083         for (int ic = 0; ic < INVOC_COUNT; ic++) {
1084             r = 0;
1085             for (int i = 0; i < as.length; i++) {
1086                 if (ms[i % ms.length])
1087                     r |= as[i];
1088             }
1089         }
1090         bh.consume(r);
1091     }
1092 
1093     @Benchmark
1094     public void XORLanes(Blackhole bh) {
1095         byte[] as = fa.apply(size);
1096         byte r = 0;
1097         for (int ic = 0; ic < INVOC_COUNT; ic++) {
1098             r = 0;
1099             for (int i = 0; i < as.length; i++) {
1100                 r ^= as[i];
1101             }
1102         }
1103         bh.consume(r);
1104     }
1105 
1106     @Benchmark
1107     public void XORMaskedLanes(Blackhole bh) {
1108         byte[] as = fa.apply(size);
1109         boolean[] ms = fm.apply(size);
1110         byte r = 0;
1111         for (int ic = 0; ic < INVOC_COUNT; ic++) {
1112             r = 0;
1113             for (int i = 0; i < as.length; i++) {
1114                 if (ms[i % ms.length])
1115                     r ^= as[i];
1116             }
1117         }
1118         bh.consume(r);
1119     }
1120 
1121     @Benchmark
1122     public void ADDLanes(Blackhole bh) {
1123         byte[] as = fa.apply(size);
1124         byte r = 0;
1125         for (int ic = 0; ic < INVOC_COUNT; ic++) {
1126             r = 0;
1127             for (int i = 0; i < as.length; i++) {
1128                 r += as[i];
1129             }
1130         }
1131         bh.consume(r);
1132     }
1133 
1134     @Benchmark
1135     public void ADDMaskedLanes(Blackhole bh) {
1136         byte[] as = fa.apply(size);
1137         boolean[] ms = fm.apply(size);
1138         byte r = 0;
1139         for (int ic = 0; ic < INVOC_COUNT; ic++) {
1140             r = 0;
1141             for (int i = 0; i < as.length; i++) {
1142                 if (ms[i % ms.length])
1143                     r += as[i];
1144             }
1145         }
1146         bh.consume(r);
1147     }
1148 
1149     @Benchmark
1150     public void MULLanes(Blackhole bh) {
1151         byte[] as = fa.apply(size);
1152         byte r = 1;
1153         for (int ic = 0; ic < INVOC_COUNT; ic++) {
1154             r = 1;
1155             for (int i = 0; i < as.length; i++) {
1156                 r *= as[i];
1157             }
1158         }
1159         bh.consume(r);
1160     }
1161 
1162     @Benchmark
1163     public void MULMaskedLanes(Blackhole bh) {
1164         byte[] as = fa.apply(size);
1165         boolean[] ms = fm.apply(size);
1166         byte r = 1;
1167         for (int ic = 0; ic < INVOC_COUNT; ic++) {
1168             r = 1;
1169             for (int i = 0; i < as.length; i++) {
1170                 if (ms[i % ms.length])
1171                     r *= as[i];
1172             }
1173         }
1174         bh.consume(r);
1175     }
1176 
1177     @Benchmark
1178     public void anyTrue(Blackhole bh) {
1179         boolean[] ms = fm.apply(size);
1180         boolean r = false;
1181         for (int ic = 0; ic < INVOC_COUNT; ic++) {
1182             r = false;
1183             for (int i = 0; i < ms.length; i++) {
1184                 r |= ms[i];
1185             }
1186         }
1187         bh.consume(r);
1188     }
1189 
1190     @Benchmark
1191     public void allTrue(Blackhole bh) {
1192         boolean[] ms = fm.apply(size);
1193         boolean r = true;
1194         for (int ic = 0; ic < INVOC_COUNT; ic++) {
1195             r = true;
1196             for (int i = 0; i < ms.length; i++) {
1197                 r &= ms[i];
1198             }
1199         }
1200         bh.consume(r);
1201     }
1202 
1203     @Benchmark
1204     public void IS_DEFAULT(Blackhole bh) {
1205         byte[] as = fa.apply(size);
1206         boolean r = true;
1207 
1208         for (int ic = 0; ic < INVOC_COUNT; ic++) {
1209             for (int i = 0; i < as.length; i++) {
1210                 byte a = as[i];
1211                 r &= (bits(a)==0); // accumulate so JIT can't eliminate the computation
1212             }
1213         }
1214 
1215         bh.consume(r);
1216     }
1217 
1218     @Benchmark
1219     public void IS_NEGATIVE(Blackhole bh) {
1220         byte[] as = fa.apply(size);
1221         boolean r = true;
1222 
1223         for (int ic = 0; ic < INVOC_COUNT; ic++) {
1224             for (int i = 0; i < as.length; i++) {
1225                 byte a = as[i];
1226                 r &= (bits(a)<0); // accumulate so JIT can't eliminate the computation
1227             }
1228         }
1229 
1230         bh.consume(r);
1231     }
1232 
1233     @Benchmark
1234     public void LT(Blackhole bh) {
1235         byte[] as = fa.apply(size);
1236         byte[] bs = fb.apply(size);
1237         boolean r = true;
1238 
1239         for (int ic = 0; ic < INVOC_COUNT; ic++) {
1240             for (int i = 0; i < as.length; i++) {
1241                 r &= lt(as[i], bs[i]); // accumulate so JIT can't eliminate the computation
1242             }
1243         }
1244 
1245         bh.consume(r);
1246     }
1247 
1248     @Benchmark
1249     public void GT(Blackhole bh) {
1250         byte[] as = fa.apply(size);
1251         byte[] bs = fb.apply(size);
1252         boolean r = true;
1253 
1254         for (int ic = 0; ic < INVOC_COUNT; ic++) {
1255             for (int i = 0; i < as.length; i++) {
1256                 r &= gt(as[i], bs[i]); // accumulate so JIT can't eliminate the computation
1257             }
1258         }
1259 
1260         bh.consume(r);
1261     }
1262 
1263     @Benchmark
1264     public void EQ(Blackhole bh) {
1265         byte[] as = fa.apply(size);
1266         byte[] bs = fb.apply(size);
1267         boolean r = true;
1268 
1269         for (int ic = 0; ic < INVOC_COUNT; ic++) {
1270             for (int i = 0; i < as.length; i++) {
1271                 r &= eq(as[i], bs[i]); // accumulate so JIT can't eliminate the computation
1272             }
1273         }
1274 
1275         bh.consume(r);
1276     }
1277 
1278     @Benchmark
1279     public void NE(Blackhole bh) {
1280         byte[] as = fa.apply(size);
1281         byte[] bs = fb.apply(size);
1282         boolean r = true;
1283 
1284         for (int ic = 0; ic < INVOC_COUNT; ic++) {
1285             for (int i = 0; i < as.length; i++) {
1286                 r &= neq(as[i], bs[i]); // accumulate so JIT can't eliminate the computation
1287             }
1288         }
1289 
1290         bh.consume(r);
1291     }
1292 
1293     @Benchmark
1294     public void LE(Blackhole bh) {
1295         byte[] as = fa.apply(size);
1296         byte[] bs = fb.apply(size);
1297         boolean r = true;
1298 
1299         for (int ic = 0; ic < INVOC_COUNT; ic++) {
1300             for (int i = 0; i < as.length; i++) {
1301                 r &= le(as[i], bs[i]); // accumulate so JIT can't eliminate the computation
1302             }
1303         }
1304 
1305         bh.consume(r);
1306     }
1307 
1308     @Benchmark
1309     public void GE(Blackhole bh) {
1310         byte[] as = fa.apply(size);
1311         byte[] bs = fb.apply(size);
1312         boolean r = true;
1313 
1314         for (int ic = 0; ic < INVOC_COUNT; ic++) {
1315             for (int i = 0; i < as.length; i++) {
1316                 r &= ge(as[i], bs[i]); // accumulate so JIT can't eliminate the computation
1317             }
1318         }
1319 
1320         bh.consume(r);
1321     }
1322 
1323     @Benchmark
1324     public void UNSIGNED_LT(Blackhole bh) {
1325         byte[] as = fa.apply(size);
1326         byte[] bs = fb.apply(size);
1327         boolean r = true;
1328 
1329         for (int ic = 0; ic < INVOC_COUNT; ic++) {
1330             for (int i = 0; i < as.length; i++) {
1331                 r &= ult(as[i], bs[i]); // accumulate so JIT can't eliminate the computation
1332             }
1333         }
1334 
1335         bh.consume(r);
1336     }
1337 
1338     @Benchmark
1339     public void UNSIGNED_GT(Blackhole bh) {
1340         byte[] as = fa.apply(size);
1341         byte[] bs = fb.apply(size);
1342         boolean r = true;
1343 
1344         for (int ic = 0; ic < INVOC_COUNT; ic++) {
1345             for (int i = 0; i < as.length; i++) {
1346                 r &= ugt(as[i], bs[i]); // accumulate so JIT can't eliminate the computation
1347             }
1348         }
1349 
1350         bh.consume(r);
1351     }
1352 
1353     @Benchmark
1354     public void UNSIGNED_LE(Blackhole bh) {
1355         byte[] as = fa.apply(size);
1356         byte[] bs = fb.apply(size);
1357         boolean r = true;
1358 
1359         for (int ic = 0; ic < INVOC_COUNT; ic++) {
1360             for (int i = 0; i < as.length; i++) {
1361                 r &= ule(as[i], bs[i]); // accumulate so JIT can't eliminate the computation
1362             }
1363         }
1364 
1365         bh.consume(r);
1366     }
1367 
1368     @Benchmark
1369     public void UNSIGNED_GE(Blackhole bh) {
1370         byte[] as = fa.apply(size);
1371         byte[] bs = fb.apply(size);
1372         boolean r = true;
1373 
1374         for (int ic = 0; ic < INVOC_COUNT; ic++) {
1375             for (int i = 0; i < as.length; i++) {
1376                 r &= uge(as[i], bs[i]); // accumulate so JIT can't eliminate the computation
1377             }
1378         }
1379 
1380         bh.consume(r);
1381     }
1382 
1383     @Benchmark
1384     public void blend(Blackhole bh) {
1385         byte[] as = fa.apply(size);
1386         byte[] bs = fb.apply(size);
1387         byte[] rs = fr.apply(size);
1388         boolean[] ms = fm.apply(size);
1389 
1390         for (int ic = 0; ic < INVOC_COUNT; ic++) {
1391             for (int i = 0; i < as.length; i++) {
1392                 byte a = as[i];
1393                 byte b = bs[i];
1394                 boolean m = ms[i % ms.length];
1395                 rs[i] = (m ? b : a);
1396             }
1397         }
1398 
1399         bh.consume(rs);
1400     }
1401 
1402     void rearrangeShared(int window, Blackhole bh) {
1403         byte[] as = fa.apply(size);
1404         int[] order = fs.apply(size);
1405         byte[] rs = fr.apply(size);
1406 
1407         for (int ic = 0; ic < INVOC_COUNT; ic++) {
1408             for (int i = 0; i < as.length; i += window) {
1409                 for (int j = 0; j < window; j++) {
1410                     byte a = as[i+j];
1411                     int pos = order[j];
1412                     rs[i + pos] = a;
1413                 }
1414             }
1415         }
1416 
1417         bh.consume(rs);
1418     }
1419 
1420     @Benchmark
1421     public void rearrange064(Blackhole bh) {
1422         int window = 64 / Byte.SIZE;
1423         rearrangeShared(window, bh);
1424     }
1425 
1426     @Benchmark
1427     public void rearrange128(Blackhole bh) {
1428         int window = 128 / Byte.SIZE;
1429         rearrangeShared(window, bh);
1430     }
1431 
1432     @Benchmark
1433     public void rearrange256(Blackhole bh) {
1434         int window = 256 / Byte.SIZE;
1435         rearrangeShared(window, bh);
1436     }
1437 
1438     @Benchmark
1439     public void rearrange512(Blackhole bh) {
1440         int window = 512 / Byte.SIZE;
1441         rearrangeShared(window, bh);
1442     }
1443 
1444     @Benchmark
1445     public void compressScalar(Blackhole bh) {
1446         byte[] as = fa.apply(size);
1447         byte[] rs = new byte[size];
1448         boolean[] im = fmt.apply(size);
1449 
1450         for (int ic = 0; ic < INVOC_COUNT; ic++) {
1451             for (int i = 0, j = 0; i < as.length; i++) {
1452                 if (im[i]) {
1453                     rs[j++] = as[i];
1454                 }
1455             }
1456         }
1457 
1458         bh.consume(rs);
1459     }
1460 
1461     @Benchmark
1462     public void expandScalar(Blackhole bh) {
1463         byte[] as = fa.apply(size);
1464         byte[] rs = new byte[size];
1465         boolean[] im = fmt.apply(size);
1466 
1467         for (int ic = 0; ic < INVOC_COUNT; ic++) {
1468             for (int i = 0, j = 0; i < as.length; i++) {
1469                 if (im[i]) {
1470                     rs[i++] = as[j++];
1471                 }
1472             }
1473         }
1474 
1475         bh.consume(rs);
1476     }
1477 
1478     @Benchmark
1479     public void maskCompressScalar(Blackhole bh) {
1480         boolean[] im = fmt.apply(size);
1481         boolean[] rm = new boolean[size];
1482 
1483         for (int ic = 0; ic < INVOC_COUNT; ic++) {
1484             for (int i = 0, j = 0; i < im.length; i++) {
1485                 if (im[i]) {
1486                     rm[j++] = im[i];
1487                 }
1488             }
1489         }
1490 
1491         bh.consume(rm);
1492     }
1493 
1494     void broadcastShared(int window, Blackhole bh) {
1495         byte[] as = fa.apply(size);
1496         byte[] rs = fr.apply(size);
1497 
1498         for (int ic = 0; ic < INVOC_COUNT; ic++) {
1499             for (int i = 0; i < as.length; i += window) {
1500                 int idx = i;
1501                 for (int j = 0; j < window; j++) {
1502                     rs[j] = as[idx];
1503                 }
1504             }
1505         }
1506 
1507         bh.consume(rs);
1508     }
1509 
1510     @Benchmark
1511     public void broadcast064(Blackhole bh) {
1512         int window = 64 / Byte.SIZE;
1513         broadcastShared(window, bh);
1514     }
1515 
1516     @Benchmark
1517     public void broadcast128(Blackhole bh) {
1518         int window = 128 / Byte.SIZE;
1519         broadcastShared(window, bh);
1520     }
1521 
1522     @Benchmark
1523     public void broadcast256(Blackhole bh) {
1524         int window = 256 / Byte.SIZE;
1525         broadcastShared(window, bh);
1526     }
1527 
1528     @Benchmark
1529     public void broadcast512(Blackhole bh) {
1530         int window = 512 / Byte.SIZE;
1531         broadcastShared(window, bh);
1532     }
1533 
1534     @Benchmark
1535     public void zero(Blackhole bh) {
1536         byte[] as = fa.apply(size);
1537 
1538         for (int ic = 0; ic < INVOC_COUNT; ic++) {
1539             for (int i = 0; i < as.length; i++) {
1540                 as[i] = (byte)0;
1541             }
1542         }
1543 
1544         bh.consume(as);
1545     }
1546 
1547     @Benchmark
1548     public void BITWISE_BLEND(Blackhole bh) {
1549         byte[] as = fa.apply(size);
1550         byte[] bs = fb.apply(size);
1551         byte[] cs = fc.apply(size);
1552         byte[] rs = fr.apply(size);
1553 
1554         for (int ic = 0; ic < INVOC_COUNT; ic++) {
1555             for (int i = 0; i < as.length; i++) {
1556                 byte a = as[i];
1557                 byte b = bs[i];
1558                 byte c = cs[i];
1559                 rs[i] = (byte)((a&~(c))|(b&c));
1560             }
1561         }
1562 
1563         bh.consume(rs);
1564     }
1565 
1566     @Benchmark
1567     public void BITWISE_BLENDMasked(Blackhole bh) {
1568         byte[] as = fa.apply(size);
1569         byte[] bs = fb.apply(size);
1570         byte[] cs = fc.apply(size);
1571         byte[] rs = fr.apply(size);
1572         boolean[] ms = fm.apply(size);
1573 
1574         for (int ic = 0; ic < INVOC_COUNT; ic++) {
1575             for (int i = 0; i < as.length; i++) {
1576                 byte a = as[i];
1577                 byte b = bs[i];
1578                 byte c = cs[i];
1579                 if (ms[i % ms.length]) {
1580                     rs[i] = (byte)((a&~(c))|(b&c));
1581                 } else {
1582                     rs[i] = a;
1583                 }
1584             }
1585         }
1586         bh.consume(rs);
1587     }
1588     @Benchmark
1589     public void NEG(Blackhole bh) {
1590         byte[] as = fa.apply(size);
1591         byte[] rs = fr.apply(size);
1592 
1593         for (int ic = 0; ic < INVOC_COUNT; ic++) {
1594             for (int i = 0; i < as.length; i++) {
1595                 byte a = as[i];
1596                 rs[i] = (byte)(-((byte)a));
1597             }
1598         }
1599 
1600         bh.consume(rs);
1601     }
1602 
1603     @Benchmark
1604     public void NEGMasked(Blackhole bh) {
1605         byte[] as = fa.apply(size);
1606         byte[] rs = fr.apply(size);
1607         boolean[] ms = fm.apply(size);
1608 
1609         for (int ic = 0; ic < INVOC_COUNT; ic++) {
1610             for (int i = 0; i < as.length; i++) {
1611                 byte a = as[i];
1612                 boolean m = ms[i % ms.length];
1613                 rs[i] = (m ? (byte)(-((byte)a)) : a);
1614             }
1615         }
1616 
1617         bh.consume(rs);
1618     }
1619     @Benchmark
1620     public void ABS(Blackhole bh) {
1621         byte[] as = fa.apply(size);
1622         byte[] rs = fr.apply(size);
1623 
1624         for (int ic = 0; ic < INVOC_COUNT; ic++) {
1625             for (int i = 0; i < as.length; i++) {
1626                 byte a = as[i];
1627                 rs[i] = (byte)(Math.abs((byte)a));
1628             }
1629         }
1630 
1631         bh.consume(rs);
1632     }
1633 
1634     @Benchmark
1635     public void ABSMasked(Blackhole bh) {
1636         byte[] as = fa.apply(size);
1637         byte[] rs = fr.apply(size);
1638         boolean[] ms = fm.apply(size);
1639 
1640         for (int ic = 0; ic < INVOC_COUNT; ic++) {
1641             for (int i = 0; i < as.length; i++) {
1642                 byte a = as[i];
1643                 boolean m = ms[i % ms.length];
1644                 rs[i] = (m ? (byte)(Math.abs((byte)a)) : a);
1645             }
1646         }
1647 
1648         bh.consume(rs);
1649     }
1650     @Benchmark
1651     public void NOT(Blackhole bh) {
1652         byte[] as = fa.apply(size);
1653         byte[] rs = fr.apply(size);
1654 
1655         for (int ic = 0; ic < INVOC_COUNT; ic++) {
1656             for (int i = 0; i < as.length; i++) {
1657                 byte a = as[i];
1658                 rs[i] = (byte)(~((byte)a));
1659             }
1660         }
1661 
1662         bh.consume(rs);
1663     }
1664 
1665     @Benchmark
1666     public void NOTMasked(Blackhole bh) {
1667         byte[] as = fa.apply(size);
1668         byte[] rs = fr.apply(size);
1669         boolean[] ms = fm.apply(size);
1670 
1671         for (int ic = 0; ic < INVOC_COUNT; ic++) {
1672             for (int i = 0; i < as.length; i++) {
1673                 byte a = as[i];
1674                 boolean m = ms[i % ms.length];
1675                 rs[i] = (m ? (byte)(~((byte)a)) : a);
1676             }
1677         }
1678 
1679         bh.consume(rs);
1680     }
1681     @Benchmark
1682     public void ZOMO(Blackhole bh) {
1683         byte[] as = fa.apply(size);
1684         byte[] rs = fr.apply(size);
1685 
1686         for (int ic = 0; ic < INVOC_COUNT; ic++) {
1687             for (int i = 0; i < as.length; i++) {
1688                 byte a = as[i];
1689                 rs[i] = (byte)((a==0?0:-1));
1690             }
1691         }
1692 
1693         bh.consume(rs);
1694     }
1695 
1696     @Benchmark
1697     public void ZOMOMasked(Blackhole bh) {
1698         byte[] as = fa.apply(size);
1699         byte[] rs = fr.apply(size);
1700         boolean[] ms = fm.apply(size);
1701 
1702         for (int ic = 0; ic < INVOC_COUNT; ic++) {
1703             for (int i = 0; i < as.length; i++) {
1704                 byte a = as[i];
1705                 boolean m = ms[i % ms.length];
1706                 rs[i] = (m ? (byte)((a==0?0:-1)) : a);
1707             }
1708         }
1709 
1710         bh.consume(rs);
1711     }
1712     @Benchmark
1713     public void BIT_COUNT(Blackhole bh) {
1714         byte[] as = fa.apply(size);
1715         byte[] rs = fr.apply(size);
1716 
1717         for (int ic = 0; ic < INVOC_COUNT; ic++) {
1718             for (int i = 0; i < as.length; i++) {
1719                 byte a = as[i];
1720                 rs[i] = (byte)(Integer.bitCount((int)a & 0xFF));
1721             }
1722         }
1723 
1724         bh.consume(rs);
1725     }
1726 
1727     @Benchmark
1728     public void BIT_COUNTMasked(Blackhole bh) {
1729         byte[] as = fa.apply(size);
1730         byte[] rs = fr.apply(size);
1731         boolean[] ms = fm.apply(size);
1732 
1733         for (int ic = 0; ic < INVOC_COUNT; ic++) {
1734             for (int i = 0; i < as.length; i++) {
1735                 byte a = as[i];
1736                 boolean m = ms[i % ms.length];
1737                 rs[i] = (m ? (byte)(Integer.bitCount((int)a & 0xFF)) : a);
1738             }
1739         }
1740 
1741         bh.consume(rs);
1742     }
1743     @Benchmark
1744     public void TRAILING_ZEROS_COUNT(Blackhole bh) {
1745         byte[] as = fa.apply(size);
1746         byte[] rs = fr.apply(size);
1747 
1748         for (int ic = 0; ic < INVOC_COUNT; ic++) {
1749             for (int i = 0; i < as.length; i++) {
1750                 byte a = as[i];
1751                 rs[i] = (byte)(TRAILING_ZEROS_COUNT_scalar(a));
1752             }
1753         }
1754 
1755         bh.consume(rs);
1756     }
1757 
1758     @Benchmark
1759     public void TRAILING_ZEROS_COUNTMasked(Blackhole bh) {
1760         byte[] as = fa.apply(size);
1761         byte[] rs = fr.apply(size);
1762         boolean[] ms = fm.apply(size);
1763 
1764         for (int ic = 0; ic < INVOC_COUNT; ic++) {
1765             for (int i = 0; i < as.length; i++) {
1766                 byte a = as[i];
1767                 boolean m = ms[i % ms.length];
1768                 rs[i] = (m ? (byte)(TRAILING_ZEROS_COUNT_scalar(a)) : a);
1769             }
1770         }
1771 
1772         bh.consume(rs);
1773     }
1774     @Benchmark
1775     public void LEADING_ZEROS_COUNT(Blackhole bh) {
1776         byte[] as = fa.apply(size);
1777         byte[] rs = fr.apply(size);
1778 
1779         for (int ic = 0; ic < INVOC_COUNT; ic++) {
1780             for (int i = 0; i < as.length; i++) {
1781                 byte a = as[i];
1782                 rs[i] = (byte)(LEADING_ZEROS_COUNT_scalar(a));
1783             }
1784         }
1785 
1786         bh.consume(rs);
1787     }
1788 
1789     @Benchmark
1790     public void LEADING_ZEROS_COUNTMasked(Blackhole bh) {
1791         byte[] as = fa.apply(size);
1792         byte[] rs = fr.apply(size);
1793         boolean[] ms = fm.apply(size);
1794 
1795         for (int ic = 0; ic < INVOC_COUNT; ic++) {
1796             for (int i = 0; i < as.length; i++) {
1797                 byte a = as[i];
1798                 boolean m = ms[i % ms.length];
1799                 rs[i] = (m ? (byte)(LEADING_ZEROS_COUNT_scalar(a)) : a);
1800             }
1801         }
1802 
1803         bh.consume(rs);
1804     }
1805     @Benchmark
1806     public void REVERSE(Blackhole bh) {
1807         byte[] as = fa.apply(size);
1808         byte[] rs = fr.apply(size);
1809 
1810         for (int ic = 0; ic < INVOC_COUNT; ic++) {
1811             for (int i = 0; i < as.length; i++) {
1812                 byte a = as[i];
1813                 rs[i] = (byte)(REVERSE_scalar(a));
1814             }
1815         }
1816 
1817         bh.consume(rs);
1818     }
1819 
1820     @Benchmark
1821     public void REVERSEMasked(Blackhole bh) {
1822         byte[] as = fa.apply(size);
1823         byte[] rs = fr.apply(size);
1824         boolean[] ms = fm.apply(size);
1825 
1826         for (int ic = 0; ic < INVOC_COUNT; ic++) {
1827             for (int i = 0; i < as.length; i++) {
1828                 byte a = as[i];
1829                 boolean m = ms[i % ms.length];
1830                 rs[i] = (m ? (byte)(REVERSE_scalar(a)) : a);
1831             }
1832         }
1833 
1834         bh.consume(rs);
1835     }
1836     @Benchmark
1837     public void REVERSE_BYTES(Blackhole bh) {
1838         byte[] as = fa.apply(size);
1839         byte[] rs = fr.apply(size);
1840 
1841         for (int ic = 0; ic < INVOC_COUNT; ic++) {
1842             for (int i = 0; i < as.length; i++) {
1843                 byte a = as[i];
1844                 rs[i] = (byte)(a);
1845             }
1846         }
1847 
1848         bh.consume(rs);
1849     }
1850 
1851     @Benchmark
1852     public void REVERSE_BYTESMasked(Blackhole bh) {
1853         byte[] as = fa.apply(size);
1854         byte[] rs = fr.apply(size);
1855         boolean[] ms = fm.apply(size);
1856 
1857         for (int ic = 0; ic < INVOC_COUNT; ic++) {
1858             for (int i = 0; i < as.length; i++) {
1859                 byte a = as[i];
1860                 boolean m = ms[i % ms.length];
1861                 rs[i] = (m ? (byte)(a) : a);
1862             }
1863         }
1864 
1865         bh.consume(rs);
1866     }
1867 }