1 /*
   2  * Copyright (c) 2018, 2024, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  */
  23 
  24 package org.openjdk.bench.jdk.incubator.vector.operation;
  25 
  26 // -- This file was mechanically generated: Do not edit! -- //
  27 
  28 import java.util.concurrent.TimeUnit;
  29 import java.util.function.IntFunction;
  30 import jdk.incubator.vector.VectorMath;
  31 
  32 import org.openjdk.jmh.annotations.*;
  33 import org.openjdk.jmh.infra.Blackhole;
  34 
  35 @BenchmarkMode(Mode.Throughput)
  36 @OutputTimeUnit(TimeUnit.MILLISECONDS)
  37 @State(Scope.Benchmark)
  38 @Warmup(iterations = 3, time = 1)
  39 @Measurement(iterations = 5, time = 1)
  40 @Fork(value = 1, jvmArgsPrepend = {"--add-modules=jdk.incubator.vector"})
  41 public class ByteScalar extends AbstractVectorBenchmark {
    // Number of times each benchmark method repeats its inner element loop.
    static final int INVOC_COUNT = 1; // To align with vector benchmarks.

    // Fixed shift/rotate distance used by the *ShiftConst benchmarks: half the bit width of a byte.
    private static final byte CONST_SHIFT = Byte.SIZE / 2;

    // JMH parameter: length of the arrays allocated by fill() and the length passed to
    // fillMask/fillInt in init().
    @Param("1024")
    int size;
  48 
  49     byte[] fill(IntFunction<Byte> f) {
  50         byte[] array = new byte[size];
  51         for (int i = 0; i < array.length; i++) {
  52             array[i] = f.apply(i);
  53         }
  54         return array;
  55     }
  56 
  57     static byte bits(byte e) {
  58         return e;
  59     }
  60 
    // Byte operand arrays (as, bs, cs) and the result buffer (rs); all are assigned in init().
    byte[] as, bs, cs, rs;
    // Mask arrays assigned in init(): ms is built from the predicate (i % 2) == 0,
    // mt from a constant-true predicate and rms from a constant-false predicate.
    boolean[] ms, mt, rms;
    // Int array assigned in init() through fillInt, with values drawn from RAND.
    int[] ss;
  64 
  65     @Setup
  66     public void init() {
  67         as = fill(i -> (byte)(2*i));
  68         bs = fill(i -> (byte)(i+1));
  69         cs = fill(i -> (byte)(i+5));
  70         rs = fill(i -> (byte)0);
  71         ms = fillMask(size, i -> (i % 2) == 0);
  72         mt = fillMask(size, i -> true);
  73         rms = fillMask(size, i -> false);
  74 
  75         ss = fillInt(size, i -> RAND.nextInt(Math.max(i,1)));
  76     }
  77 
    // Array providers used by the benchmark methods. Each one ignores its int argument
    // and returns the current value of the corresponding field (assigned in init()).
    final IntFunction<byte[]> fa = vl -> as;
    final IntFunction<byte[]> fb = vl -> bs;
    final IntFunction<byte[]> fc = vl -> cs;
    final IntFunction<byte[]> fr = vl -> rs;
    final IntFunction<boolean[]> fm = vl -> ms;
    final IntFunction<boolean[]> fmt = vl -> mt;
    final IntFunction<boolean[]> fmr = vl -> rms;
    final IntFunction<int[]> fs = vl -> ss;
  86 
  87     static boolean eq(byte a, byte b) {
  88         return a == b;
  89     }
  90 
  91     static boolean neq(byte a, byte b) {
  92         return a != b;
  93     }
  94 
  95     static boolean lt(byte a, byte b) {
  96         return a < b;
  97     }
  98 
  99     static boolean le(byte a, byte b) {
 100         return a <= b;
 101     }
 102 
 103     static boolean gt(byte a, byte b) {
 104         return a > b;
 105     }
 106 
 107     static boolean ge(byte a, byte b) {
 108         return a >= b;
 109     }
 110 
 111     static boolean ult(byte a, byte b) {
 112         return Byte.compareUnsigned(a, b) < 0;
 113     }
 114 
 115     static boolean ule(byte a, byte b) {
 116         return Byte.compareUnsigned(a, b) <= 0;
 117     }
 118 
 119     static boolean ugt(byte a, byte b) {
 120         return Byte.compareUnsigned(a, b) > 0;
 121     }
 122 
 123     static boolean uge(byte a, byte b) {
 124         return Byte.compareUnsigned(a, b) >= 0;
 125     }
 126 
 127     static byte ROL_scalar(byte a, byte b) {
 128         return (byte)(((((byte)a) & 0xFF) << (b & 7)) | ((((byte)a) & 0xFF) >>> (8 - (b & 7))));
 129     }
 130 
 131     static byte ROR_scalar(byte a, byte b) {
 132         return (byte)(((((byte)a) & 0xFF) >>> (b & 7)) | ((((byte)a) & 0xFF) << (8 - (b & 7))));
 133     }
 134 
 135     static byte TRAILING_ZEROS_COUNT_scalar(byte a) {
 136         return (byte) (a != 0 ? Integer.numberOfTrailingZeros(a) : 8);
 137     }
 138 
 139     static byte LEADING_ZEROS_COUNT_scalar(byte a) {
 140         return (byte) (a >= 0 ? Integer.numberOfLeadingZeros(a) - 24 : 0);
 141     }
 142 
 143     static byte REVERSE_scalar(byte a) {
 144         byte b = ROL_scalar(a, (byte) 4);
 145         b = (byte)(((b & 0x55) << 1) | ((b & 0xAA) >>> 1));
 146         b = (byte)(((b & 0x33) << 2) | ((b & 0xCC) >>> 2));
 147         return b;
 148     }
 149 
 150     @Benchmark
 151     public void ADD(Blackhole bh) {
 152         byte[] as = fa.apply(size);
 153         byte[] bs = fb.apply(size);
 154         byte[] rs = fr.apply(size);
 155 
 156         for (int ic = 0; ic < INVOC_COUNT; ic++) {
 157             for (int i = 0; i < as.length; i++) {
 158                 byte a = as[i];
 159                 byte b = bs[i];
 160                 rs[i] = (byte)(a + b);
 161             }
 162         }
 163 
 164         bh.consume(rs);
 165     }
 166 
 167     @Benchmark
 168     public void ADDMasked(Blackhole bh) {
 169         byte[] as = fa.apply(size);
 170         byte[] bs = fb.apply(size);
 171         byte[] rs = fr.apply(size);
 172         boolean[] ms = fm.apply(size);
 173 
 174         for (int ic = 0; ic < INVOC_COUNT; ic++) {
 175             for (int i = 0; i < as.length; i++) {
 176                 byte a = as[i];
 177                 byte b = bs[i];
 178                 if (ms[i % ms.length]) {
 179                     rs[i] = (byte)(a + b);
 180                 } else {
 181                     rs[i] = a;
 182                 }
 183             }
 184         }
 185         bh.consume(rs);
 186     }
 187 
 188     @Benchmark
 189     public void SUB(Blackhole bh) {
 190         byte[] as = fa.apply(size);
 191         byte[] bs = fb.apply(size);
 192         byte[] rs = fr.apply(size);
 193 
 194         for (int ic = 0; ic < INVOC_COUNT; ic++) {
 195             for (int i = 0; i < as.length; i++) {
 196                 byte a = as[i];
 197                 byte b = bs[i];
 198                 rs[i] = (byte)(a - b);
 199             }
 200         }
 201 
 202         bh.consume(rs);
 203     }
 204 
 205     @Benchmark
 206     public void SUBMasked(Blackhole bh) {
 207         byte[] as = fa.apply(size);
 208         byte[] bs = fb.apply(size);
 209         byte[] rs = fr.apply(size);
 210         boolean[] ms = fm.apply(size);
 211 
 212         for (int ic = 0; ic < INVOC_COUNT; ic++) {
 213             for (int i = 0; i < as.length; i++) {
 214                 byte a = as[i];
 215                 byte b = bs[i];
 216                 if (ms[i % ms.length]) {
 217                     rs[i] = (byte)(a - b);
 218                 } else {
 219                     rs[i] = a;
 220                 }
 221             }
 222         }
 223         bh.consume(rs);
 224     }
 225 
 226     @Benchmark
 227     public void MUL(Blackhole bh) {
 228         byte[] as = fa.apply(size);
 229         byte[] bs = fb.apply(size);
 230         byte[] rs = fr.apply(size);
 231 
 232         for (int ic = 0; ic < INVOC_COUNT; ic++) {
 233             for (int i = 0; i < as.length; i++) {
 234                 byte a = as[i];
 235                 byte b = bs[i];
 236                 rs[i] = (byte)(a * b);
 237             }
 238         }
 239 
 240         bh.consume(rs);
 241     }
 242 
 243     @Benchmark
 244     public void MULMasked(Blackhole bh) {
 245         byte[] as = fa.apply(size);
 246         byte[] bs = fb.apply(size);
 247         byte[] rs = fr.apply(size);
 248         boolean[] ms = fm.apply(size);
 249 
 250         for (int ic = 0; ic < INVOC_COUNT; ic++) {
 251             for (int i = 0; i < as.length; i++) {
 252                 byte a = as[i];
 253                 byte b = bs[i];
 254                 if (ms[i % ms.length]) {
 255                     rs[i] = (byte)(a * b);
 256                 } else {
 257                     rs[i] = a;
 258                 }
 259             }
 260         }
 261         bh.consume(rs);
 262     }
 263 
 264     @Benchmark
 265     public void FIRST_NONZERO(Blackhole bh) {
 266         byte[] as = fa.apply(size);
 267         byte[] bs = fb.apply(size);
 268         byte[] rs = fr.apply(size);
 269 
 270         for (int ic = 0; ic < INVOC_COUNT; ic++) {
 271             for (int i = 0; i < as.length; i++) {
 272                 byte a = as[i];
 273                 byte b = bs[i];
 274                 rs[i] = (byte)((a)!=0?a:b);
 275             }
 276         }
 277 
 278         bh.consume(rs);
 279     }
 280 
 281     @Benchmark
 282     public void FIRST_NONZEROMasked(Blackhole bh) {
 283         byte[] as = fa.apply(size);
 284         byte[] bs = fb.apply(size);
 285         byte[] rs = fr.apply(size);
 286         boolean[] ms = fm.apply(size);
 287 
 288         for (int ic = 0; ic < INVOC_COUNT; ic++) {
 289             for (int i = 0; i < as.length; i++) {
 290                 byte a = as[i];
 291                 byte b = bs[i];
 292                 if (ms[i % ms.length]) {
 293                     rs[i] = (byte)((a)!=0?a:b);
 294                 } else {
 295                     rs[i] = a;
 296                 }
 297             }
 298         }
 299         bh.consume(rs);
 300     }
 301 
 302     @Benchmark
 303     public void AND(Blackhole bh) {
 304         byte[] as = fa.apply(size);
 305         byte[] bs = fb.apply(size);
 306         byte[] rs = fr.apply(size);
 307 
 308         for (int ic = 0; ic < INVOC_COUNT; ic++) {
 309             for (int i = 0; i < as.length; i++) {
 310                 byte a = as[i];
 311                 byte b = bs[i];
 312                 rs[i] = (byte)(a & b);
 313             }
 314         }
 315 
 316         bh.consume(rs);
 317     }
 318 
 319     @Benchmark
 320     public void ANDMasked(Blackhole bh) {
 321         byte[] as = fa.apply(size);
 322         byte[] bs = fb.apply(size);
 323         byte[] rs = fr.apply(size);
 324         boolean[] ms = fm.apply(size);
 325 
 326         for (int ic = 0; ic < INVOC_COUNT; ic++) {
 327             for (int i = 0; i < as.length; i++) {
 328                 byte a = as[i];
 329                 byte b = bs[i];
 330                 if (ms[i % ms.length]) {
 331                     rs[i] = (byte)(a & b);
 332                 } else {
 333                     rs[i] = a;
 334                 }
 335             }
 336         }
 337         bh.consume(rs);
 338     }
 339 
 340     @Benchmark
 341     public void AND_NOT(Blackhole bh) {
 342         byte[] as = fa.apply(size);
 343         byte[] bs = fb.apply(size);
 344         byte[] rs = fr.apply(size);
 345 
 346         for (int ic = 0; ic < INVOC_COUNT; ic++) {
 347             for (int i = 0; i < as.length; i++) {
 348                 byte a = as[i];
 349                 byte b = bs[i];
 350                 rs[i] = (byte)(a & ~b);
 351             }
 352         }
 353 
 354         bh.consume(rs);
 355     }
 356 
 357     @Benchmark
 358     public void AND_NOTMasked(Blackhole bh) {
 359         byte[] as = fa.apply(size);
 360         byte[] bs = fb.apply(size);
 361         byte[] rs = fr.apply(size);
 362         boolean[] ms = fm.apply(size);
 363 
 364         for (int ic = 0; ic < INVOC_COUNT; ic++) {
 365             for (int i = 0; i < as.length; i++) {
 366                 byte a = as[i];
 367                 byte b = bs[i];
 368                 if (ms[i % ms.length]) {
 369                     rs[i] = (byte)(a & ~b);
 370                 } else {
 371                     rs[i] = a;
 372                 }
 373             }
 374         }
 375         bh.consume(rs);
 376     }
 377 
 378     @Benchmark
 379     public void OR(Blackhole bh) {
 380         byte[] as = fa.apply(size);
 381         byte[] bs = fb.apply(size);
 382         byte[] rs = fr.apply(size);
 383 
 384         for (int ic = 0; ic < INVOC_COUNT; ic++) {
 385             for (int i = 0; i < as.length; i++) {
 386                 byte a = as[i];
 387                 byte b = bs[i];
 388                 rs[i] = (byte)(a | b);
 389             }
 390         }
 391 
 392         bh.consume(rs);
 393     }
 394 
 395     @Benchmark
 396     public void ORMasked(Blackhole bh) {
 397         byte[] as = fa.apply(size);
 398         byte[] bs = fb.apply(size);
 399         byte[] rs = fr.apply(size);
 400         boolean[] ms = fm.apply(size);
 401 
 402         for (int ic = 0; ic < INVOC_COUNT; ic++) {
 403             for (int i = 0; i < as.length; i++) {
 404                 byte a = as[i];
 405                 byte b = bs[i];
 406                 if (ms[i % ms.length]) {
 407                     rs[i] = (byte)(a | b);
 408                 } else {
 409                     rs[i] = a;
 410                 }
 411             }
 412         }
 413         bh.consume(rs);
 414     }
 415 
 416     @Benchmark
 417     public void XOR(Blackhole bh) {
 418         byte[] as = fa.apply(size);
 419         byte[] bs = fb.apply(size);
 420         byte[] rs = fr.apply(size);
 421 
 422         for (int ic = 0; ic < INVOC_COUNT; ic++) {
 423             for (int i = 0; i < as.length; i++) {
 424                 byte a = as[i];
 425                 byte b = bs[i];
 426                 rs[i] = (byte)(a ^ b);
 427             }
 428         }
 429 
 430         bh.consume(rs);
 431     }
 432 
 433     @Benchmark
 434     public void XORMasked(Blackhole bh) {
 435         byte[] as = fa.apply(size);
 436         byte[] bs = fb.apply(size);
 437         byte[] rs = fr.apply(size);
 438         boolean[] ms = fm.apply(size);
 439 
 440         for (int ic = 0; ic < INVOC_COUNT; ic++) {
 441             for (int i = 0; i < as.length; i++) {
 442                 byte a = as[i];
 443                 byte b = bs[i];
 444                 if (ms[i % ms.length]) {
 445                     rs[i] = (byte)(a ^ b);
 446                 } else {
 447                     rs[i] = a;
 448                 }
 449             }
 450         }
 451         bh.consume(rs);
 452     }
 453 
 454     @Benchmark
 455     public void LSHL(Blackhole bh) {
 456         byte[] as = fa.apply(size);
 457         byte[] bs = fb.apply(size);
 458         byte[] rs = fr.apply(size);
 459 
 460         for (int ic = 0; ic < INVOC_COUNT; ic++) {
 461             for (int i = 0; i < as.length; i++) {
 462                 byte a = as[i];
 463                 byte b = bs[i];
 464                 rs[i] = (byte)((a << (b & 0x7)));
 465             }
 466         }
 467 
 468         bh.consume(rs);
 469     }
 470 
 471     @Benchmark
 472     public void LSHLMasked(Blackhole bh) {
 473         byte[] as = fa.apply(size);
 474         byte[] bs = fb.apply(size);
 475         byte[] rs = fr.apply(size);
 476         boolean[] ms = fm.apply(size);
 477 
 478         for (int ic = 0; ic < INVOC_COUNT; ic++) {
 479             for (int i = 0; i < as.length; i++) {
 480                 byte a = as[i];
 481                 byte b = bs[i];
 482                 if (ms[i % ms.length]) {
 483                     rs[i] = (byte)((a << (b & 0x7)));
 484                 } else {
 485                     rs[i] = a;
 486                 }
 487             }
 488         }
 489         bh.consume(rs);
 490     }
 491 
 492     @Benchmark
 493     public void ASHR(Blackhole bh) {
 494         byte[] as = fa.apply(size);
 495         byte[] bs = fb.apply(size);
 496         byte[] rs = fr.apply(size);
 497 
 498         for (int ic = 0; ic < INVOC_COUNT; ic++) {
 499             for (int i = 0; i < as.length; i++) {
 500                 byte a = as[i];
 501                 byte b = bs[i];
 502                 rs[i] = (byte)((a >> (b & 0x7)));
 503             }
 504         }
 505 
 506         bh.consume(rs);
 507     }
 508 
 509     @Benchmark
 510     public void ASHRMasked(Blackhole bh) {
 511         byte[] as = fa.apply(size);
 512         byte[] bs = fb.apply(size);
 513         byte[] rs = fr.apply(size);
 514         boolean[] ms = fm.apply(size);
 515 
 516         for (int ic = 0; ic < INVOC_COUNT; ic++) {
 517             for (int i = 0; i < as.length; i++) {
 518                 byte a = as[i];
 519                 byte b = bs[i];
 520                 if (ms[i % ms.length]) {
 521                     rs[i] = (byte)((a >> (b & 0x7)));
 522                 } else {
 523                     rs[i] = a;
 524                 }
 525             }
 526         }
 527         bh.consume(rs);
 528     }
 529 
 530     @Benchmark
 531     public void LSHR(Blackhole bh) {
 532         byte[] as = fa.apply(size);
 533         byte[] bs = fb.apply(size);
 534         byte[] rs = fr.apply(size);
 535 
 536         for (int ic = 0; ic < INVOC_COUNT; ic++) {
 537             for (int i = 0; i < as.length; i++) {
 538                 byte a = as[i];
 539                 byte b = bs[i];
 540                 rs[i] = (byte)(((a & 0xFF) >>> (b & 0x7)));
 541             }
 542         }
 543 
 544         bh.consume(rs);
 545     }
 546 
 547     @Benchmark
 548     public void LSHRMasked(Blackhole bh) {
 549         byte[] as = fa.apply(size);
 550         byte[] bs = fb.apply(size);
 551         byte[] rs = fr.apply(size);
 552         boolean[] ms = fm.apply(size);
 553 
 554         for (int ic = 0; ic < INVOC_COUNT; ic++) {
 555             for (int i = 0; i < as.length; i++) {
 556                 byte a = as[i];
 557                 byte b = bs[i];
 558                 if (ms[i % ms.length]) {
 559                     rs[i] = (byte)(((a & 0xFF) >>> (b & 0x7)));
 560                 } else {
 561                     rs[i] = a;
 562                 }
 563             }
 564         }
 565         bh.consume(rs);
 566     }
 567 
 568     @Benchmark
 569     public void LSHLShift(Blackhole bh) {
 570         byte[] as = fa.apply(size);
 571         byte[] bs = fb.apply(size);
 572         byte[] rs = fr.apply(size);
 573 
 574         for (int ic = 0; ic < INVOC_COUNT; ic++) {
 575             for (int i = 0; i < as.length; i++) {
 576                 byte a = as[i];
 577                 byte b = bs[i];
 578                 rs[i] = (byte)((a << (b & 7)));
 579             }
 580         }
 581 
 582         bh.consume(rs);
 583     }
 584 
 585     @Benchmark
 586     public void LSHLMaskedShift(Blackhole bh) {
 587         byte[] as = fa.apply(size);
 588         byte[] bs = fb.apply(size);
 589         byte[] rs = fr.apply(size);
 590         boolean[] ms = fm.apply(size);
 591 
 592         for (int ic = 0; ic < INVOC_COUNT; ic++) {
 593             for (int i = 0; i < as.length; i++) {
 594                 byte a = as[i];
 595                 byte b = bs[i];
 596                 boolean m = ms[i % ms.length];
 597                 rs[i] = (m ? (byte)((a << (b & 7))) : a);
 598             }
 599         }
 600 
 601         bh.consume(rs);
 602     }
 603 
 604     @Benchmark
 605     public void LSHRShift(Blackhole bh) {
 606         byte[] as = fa.apply(size);
 607         byte[] bs = fb.apply(size);
 608         byte[] rs = fr.apply(size);
 609 
 610         for (int ic = 0; ic < INVOC_COUNT; ic++) {
 611             for (int i = 0; i < as.length; i++) {
 612                 byte a = as[i];
 613                 byte b = bs[i];
 614                 rs[i] = (byte)(((a & 0xFF) >>> (b & 7)));
 615             }
 616         }
 617 
 618         bh.consume(rs);
 619     }
 620 
 621     @Benchmark
 622     public void LSHRMaskedShift(Blackhole bh) {
 623         byte[] as = fa.apply(size);
 624         byte[] bs = fb.apply(size);
 625         byte[] rs = fr.apply(size);
 626         boolean[] ms = fm.apply(size);
 627 
 628         for (int ic = 0; ic < INVOC_COUNT; ic++) {
 629             for (int i = 0; i < as.length; i++) {
 630                 byte a = as[i];
 631                 byte b = bs[i];
 632                 boolean m = ms[i % ms.length];
 633                 rs[i] = (m ? (byte)(((a & 0xFF) >>> (b & 7))) : a);
 634             }
 635         }
 636 
 637         bh.consume(rs);
 638     }
 639 
 640     @Benchmark
 641     public void ASHRShift(Blackhole bh) {
 642         byte[] as = fa.apply(size);
 643         byte[] bs = fb.apply(size);
 644         byte[] rs = fr.apply(size);
 645 
 646         for (int ic = 0; ic < INVOC_COUNT; ic++) {
 647             for (int i = 0; i < as.length; i++) {
 648                 byte a = as[i];
 649                 byte b = bs[i];
 650                 rs[i] = (byte)((a >> (b & 7)));
 651             }
 652         }
 653 
 654         bh.consume(rs);
 655     }
 656 
 657     @Benchmark
 658     public void ASHRMaskedShift(Blackhole bh) {
 659         byte[] as = fa.apply(size);
 660         byte[] bs = fb.apply(size);
 661         byte[] rs = fr.apply(size);
 662         boolean[] ms = fm.apply(size);
 663 
 664         for (int ic = 0; ic < INVOC_COUNT; ic++) {
 665             for (int i = 0; i < as.length; i++) {
 666                 byte a = as[i];
 667                 byte b = bs[i];
 668                 boolean m = ms[i % ms.length];
 669                 rs[i] = (m ? (byte)((a >> (b & 7))) : a);
 670             }
 671         }
 672 
 673         bh.consume(rs);
 674     }
 675 
 676     @Benchmark
 677     public void ROR(Blackhole bh) {
 678         byte[] as = fa.apply(size);
 679         byte[] bs = fb.apply(size);
 680         byte[] rs = fr.apply(size);
 681 
 682         for (int ic = 0; ic < INVOC_COUNT; ic++) {
 683             for (int i = 0; i < as.length; i++) {
 684                 byte a = as[i];
 685                 byte b = bs[i];
 686                 rs[i] = (byte)(ROR_scalar(a,b));
 687             }
 688         }
 689 
 690         bh.consume(rs);
 691     }
 692 
 693     @Benchmark
 694     public void RORMasked(Blackhole bh) {
 695         byte[] as = fa.apply(size);
 696         byte[] bs = fb.apply(size);
 697         byte[] rs = fr.apply(size);
 698         boolean[] ms = fm.apply(size);
 699 
 700         for (int ic = 0; ic < INVOC_COUNT; ic++) {
 701             for (int i = 0; i < as.length; i++) {
 702                 byte a = as[i];
 703                 byte b = bs[i];
 704                 if (ms[i % ms.length]) {
 705                     rs[i] = (byte)(ROR_scalar(a,b));
 706                 } else {
 707                     rs[i] = a;
 708                 }
 709             }
 710         }
 711         bh.consume(rs);
 712     }
 713 
 714     @Benchmark
 715     public void ROL(Blackhole bh) {
 716         byte[] as = fa.apply(size);
 717         byte[] bs = fb.apply(size);
 718         byte[] rs = fr.apply(size);
 719 
 720         for (int ic = 0; ic < INVOC_COUNT; ic++) {
 721             for (int i = 0; i < as.length; i++) {
 722                 byte a = as[i];
 723                 byte b = bs[i];
 724                 rs[i] = (byte)(ROL_scalar(a,b));
 725             }
 726         }
 727 
 728         bh.consume(rs);
 729     }
 730 
 731     @Benchmark
 732     public void ROLMasked(Blackhole bh) {
 733         byte[] as = fa.apply(size);
 734         byte[] bs = fb.apply(size);
 735         byte[] rs = fr.apply(size);
 736         boolean[] ms = fm.apply(size);
 737 
 738         for (int ic = 0; ic < INVOC_COUNT; ic++) {
 739             for (int i = 0; i < as.length; i++) {
 740                 byte a = as[i];
 741                 byte b = bs[i];
 742                 if (ms[i % ms.length]) {
 743                     rs[i] = (byte)(ROL_scalar(a,b));
 744                 } else {
 745                     rs[i] = a;
 746                 }
 747             }
 748         }
 749         bh.consume(rs);
 750     }
 751 
 752     @Benchmark
 753     public void RORShift(Blackhole bh) {
 754         byte[] as = fa.apply(size);
 755         byte[] bs = fb.apply(size);
 756         byte[] rs = fr.apply(size);
 757 
 758         for (int ic = 0; ic < INVOC_COUNT; ic++) {
 759             for (int i = 0; i < as.length; i++) {
 760                 byte a = as[i];
 761                 byte b = bs[i];
 762                 rs[i] = (byte)(ROR_scalar(a, b));
 763             }
 764         }
 765 
 766         bh.consume(rs);
 767     }
 768 
 769     @Benchmark
 770     public void RORMaskedShift(Blackhole bh) {
 771         byte[] as = fa.apply(size);
 772         byte[] bs = fb.apply(size);
 773         byte[] rs = fr.apply(size);
 774         boolean[] ms = fm.apply(size);
 775 
 776         for (int ic = 0; ic < INVOC_COUNT; ic++) {
 777             for (int i = 0; i < as.length; i++) {
 778                 byte a = as[i];
 779                 byte b = bs[i];
 780                 boolean m = ms[i % ms.length];
 781                 rs[i] = (m ? (byte)(ROR_scalar(a, b)) : a);
 782             }
 783         }
 784 
 785         bh.consume(rs);
 786     }
 787 
 788     @Benchmark
 789     public void ROLShift(Blackhole bh) {
 790         byte[] as = fa.apply(size);
 791         byte[] bs = fb.apply(size);
 792         byte[] rs = fr.apply(size);
 793 
 794         for (int ic = 0; ic < INVOC_COUNT; ic++) {
 795             for (int i = 0; i < as.length; i++) {
 796                 byte a = as[i];
 797                 byte b = bs[i];
 798                 rs[i] = (byte)(ROL_scalar(a, b));
 799             }
 800         }
 801 
 802         bh.consume(rs);
 803     }
 804 
 805     @Benchmark
 806     public void ROLMaskedShift(Blackhole bh) {
 807         byte[] as = fa.apply(size);
 808         byte[] bs = fb.apply(size);
 809         byte[] rs = fr.apply(size);
 810         boolean[] ms = fm.apply(size);
 811 
 812         for (int ic = 0; ic < INVOC_COUNT; ic++) {
 813             for (int i = 0; i < as.length; i++) {
 814                 byte a = as[i];
 815                 byte b = bs[i];
 816                 boolean m = ms[i % ms.length];
 817                 rs[i] = (m ? (byte)(ROL_scalar(a, b)) : a);
 818             }
 819         }
 820 
 821         bh.consume(rs);
 822     }
 823 
 824     @Benchmark
 825     public void LSHRShiftConst(Blackhole bh) {
 826         byte[] as = fa.apply(size);
 827         byte[] bs = fb.apply(size);
 828         byte[] rs = fr.apply(size);
 829 
 830         for (int ic = 0; ic < INVOC_COUNT; ic++) {
 831             for (int i = 0; i < as.length; i++) {
 832                 byte a = as[i];
 833                 byte b = bs[i];
 834                 rs[i] = (byte)(((a & 0xFF) >>> CONST_SHIFT));
 835             }
 836         }
 837 
 838         bh.consume(rs);
 839     }
 840 
 841     @Benchmark
 842     public void LSHRMaskedShiftConst(Blackhole bh) {
 843         byte[] as = fa.apply(size);
 844         byte[] bs = fb.apply(size);
 845         byte[] rs = fr.apply(size);
 846         boolean[] ms = fm.apply(size);
 847 
 848         for (int ic = 0; ic < INVOC_COUNT; ic++) {
 849             for (int i = 0; i < as.length; i++) {
 850                 byte a = as[i];
 851                 byte b = bs[i];
 852                 boolean m = ms[i % ms.length];
 853                 rs[i] = (m ? (byte)(((a & 0xFF) >>> CONST_SHIFT)) : a);
 854             }
 855         }
 856 
 857         bh.consume(rs);
 858     }
 859 
 860     @Benchmark
 861     public void LSHLShiftConst(Blackhole bh) {
 862         byte[] as = fa.apply(size);
 863         byte[] bs = fb.apply(size);
 864         byte[] rs = fr.apply(size);
 865 
 866         for (int ic = 0; ic < INVOC_COUNT; ic++) {
 867             for (int i = 0; i < as.length; i++) {
 868                 byte a = as[i];
 869                 byte b = bs[i];
 870                 rs[i] = (byte)((a << CONST_SHIFT));
 871             }
 872         }
 873 
 874         bh.consume(rs);
 875     }
 876 
 877     @Benchmark
 878     public void LSHLMaskedShiftConst(Blackhole bh) {
 879         byte[] as = fa.apply(size);
 880         byte[] bs = fb.apply(size);
 881         byte[] rs = fr.apply(size);
 882         boolean[] ms = fm.apply(size);
 883 
 884         for (int ic = 0; ic < INVOC_COUNT; ic++) {
 885             for (int i = 0; i < as.length; i++) {
 886                 byte a = as[i];
 887                 byte b = bs[i];
 888                 boolean m = ms[i % ms.length];
 889                 rs[i] = (m ? (byte)((a << CONST_SHIFT)) : a);
 890             }
 891         }
 892 
 893         bh.consume(rs);
 894     }
 895 
 896     @Benchmark
 897     public void ASHRShiftConst(Blackhole bh) {
 898         byte[] as = fa.apply(size);
 899         byte[] bs = fb.apply(size);
 900         byte[] rs = fr.apply(size);
 901 
 902         for (int ic = 0; ic < INVOC_COUNT; ic++) {
 903             for (int i = 0; i < as.length; i++) {
 904                 byte a = as[i];
 905                 byte b = bs[i];
 906                 rs[i] = (byte)((a >> CONST_SHIFT));
 907             }
 908         }
 909 
 910         bh.consume(rs);
 911     }
 912 
 913     @Benchmark
 914     public void ASHRMaskedShiftConst(Blackhole bh) {
 915         byte[] as = fa.apply(size);
 916         byte[] bs = fb.apply(size);
 917         byte[] rs = fr.apply(size);
 918         boolean[] ms = fm.apply(size);
 919 
 920         for (int ic = 0; ic < INVOC_COUNT; ic++) {
 921             for (int i = 0; i < as.length; i++) {
 922                 byte a = as[i];
 923                 byte b = bs[i];
 924                 boolean m = ms[i % ms.length];
 925                 rs[i] = (m ? (byte)((a >> CONST_SHIFT)) : a);
 926             }
 927         }
 928 
 929         bh.consume(rs);
 930     }
 931 
 932     @Benchmark
 933     public void RORShiftConst(Blackhole bh) {
 934         byte[] as = fa.apply(size);
 935         byte[] bs = fb.apply(size);
 936         byte[] rs = fr.apply(size);
 937 
 938         for (int ic = 0; ic < INVOC_COUNT; ic++) {
 939             for (int i = 0; i < as.length; i++) {
 940                 byte a = as[i];
 941                 byte b = bs[i];
 942                 rs[i] = (byte)(ROR_scalar(a, CONST_SHIFT));
 943             }
 944         }
 945 
 946         bh.consume(rs);
 947     }
 948 
 949     @Benchmark
 950     public void RORMaskedShiftConst(Blackhole bh) {
 951         byte[] as = fa.apply(size);
 952         byte[] bs = fb.apply(size);
 953         byte[] rs = fr.apply(size);
 954         boolean[] ms = fm.apply(size);
 955 
 956         for (int ic = 0; ic < INVOC_COUNT; ic++) {
 957             for (int i = 0; i < as.length; i++) {
 958                 byte a = as[i];
 959                 byte b = bs[i];
 960                 boolean m = ms[i % ms.length];
 961                 rs[i] = (m ? (byte)(ROR_scalar(a, CONST_SHIFT)) : a);
 962             }
 963         }
 964 
 965         bh.consume(rs);
 966     }
 967 
 968     @Benchmark
 969     public void ROLShiftConst(Blackhole bh) {
 970         byte[] as = fa.apply(size);
 971         byte[] bs = fb.apply(size);
 972         byte[] rs = fr.apply(size);
 973 
 974         for (int ic = 0; ic < INVOC_COUNT; ic++) {
 975             for (int i = 0; i < as.length; i++) {
 976                 byte a = as[i];
 977                 byte b = bs[i];
 978                 rs[i] = (byte)(ROL_scalar(a, CONST_SHIFT));
 979             }
 980         }
 981 
 982         bh.consume(rs);
 983     }
 984 
 985     @Benchmark
 986     public void ROLMaskedShiftConst(Blackhole bh) {
 987         byte[] as = fa.apply(size);
 988         byte[] bs = fb.apply(size);
 989         byte[] rs = fr.apply(size);
 990         boolean[] ms = fm.apply(size);
 991 
 992         for (int ic = 0; ic < INVOC_COUNT; ic++) {
 993             for (int i = 0; i < as.length; i++) {
 994                 byte a = as[i];
 995                 byte b = bs[i];
 996                 boolean m = ms[i % ms.length];
 997                 rs[i] = (m ? (byte)(ROL_scalar(a, CONST_SHIFT)) : a);
 998             }
 999         }
1000 
1001         bh.consume(rs);
1002     }
1003 
1004     @Benchmark
1005     public void MIN(Blackhole bh) {
1006         byte[] as = fa.apply(size);
1007         byte[] bs = fb.apply(size);
1008         byte[] rs = fr.apply(size);
1009 
1010         for (int ic = 0; ic < INVOC_COUNT; ic++) {
1011             for (int i = 0; i < as.length; i++) {
1012                 byte a = as[i];
1013                 byte b = bs[i];
1014                 rs[i] = (byte)(Math.min(a, b));
1015             }
1016         }
1017 
1018         bh.consume(rs);
1019     }
1020 
1021     @Benchmark
1022     public void MAX(Blackhole bh) {
1023         byte[] as = fa.apply(size);
1024         byte[] bs = fb.apply(size);
1025         byte[] rs = fr.apply(size);
1026 
1027         for (int ic = 0; ic < INVOC_COUNT; ic++) {
1028             for (int i = 0; i < as.length; i++) {
1029                 byte a = as[i];
1030                 byte b = bs[i];
1031                 rs[i] = (byte)(Math.max(a, b));
1032             }
1033         }
1034 
1035         bh.consume(rs);
1036     }
1037 
1038     @Benchmark
1039     public void UMIN(Blackhole bh) {
1040         byte[] as = fa.apply(size);
1041         byte[] bs = fb.apply(size);
1042         byte[] rs = fr.apply(size);
1043 
1044         for (int ic = 0; ic < INVOC_COUNT; ic++) {
1045             for (int i = 0; i < as.length; i++) {
1046                 byte a = as[i];
1047                 byte b = bs[i];
1048                 rs[i] = (byte)(VectorMath.minUnsigned(a, b));
1049             }
1050         }
1051 
1052         bh.consume(rs);
1053     }
1054 
1055     @Benchmark
1056     public void UMINMasked(Blackhole bh) {
1057         byte[] as = fa.apply(size);
1058         byte[] bs = fb.apply(size);
1059         byte[] rs = fr.apply(size);
1060         boolean[] ms = fm.apply(size);
1061 
1062         for (int ic = 0; ic < INVOC_COUNT; ic++) {
1063             for (int i = 0; i < as.length; i++) {
1064                 byte a = as[i];
1065                 byte b = bs[i];
1066                 if (ms[i % ms.length]) {
1067                     rs[i] = (byte)(VectorMath.minUnsigned(a, b));
1068                 } else {
1069                     rs[i] = a;
1070                 }
1071             }
1072         }
1073         bh.consume(rs);
1074     }
1075 
1076     @Benchmark
1077     public void UMAX(Blackhole bh) {
1078         byte[] as = fa.apply(size);
1079         byte[] bs = fb.apply(size);
1080         byte[] rs = fr.apply(size);
1081 
1082         for (int ic = 0; ic < INVOC_COUNT; ic++) {
1083             for (int i = 0; i < as.length; i++) {
1084                 byte a = as[i];
1085                 byte b = bs[i];
1086                 rs[i] = (byte)(VectorMath.maxUnsigned(a, b));
1087             }
1088         }
1089 
1090         bh.consume(rs);
1091     }
1092 
1093     @Benchmark
1094     public void UMAXMasked(Blackhole bh) {
1095         byte[] as = fa.apply(size);
1096         byte[] bs = fb.apply(size);
1097         byte[] rs = fr.apply(size);
1098         boolean[] ms = fm.apply(size);
1099 
1100         for (int ic = 0; ic < INVOC_COUNT; ic++) {
1101             for (int i = 0; i < as.length; i++) {
1102                 byte a = as[i];
1103                 byte b = bs[i];
1104                 if (ms[i % ms.length]) {
1105                     rs[i] = (byte)(VectorMath.maxUnsigned(a, b));
1106                 } else {
1107                     rs[i] = a;
1108                 }
1109             }
1110         }
1111         bh.consume(rs);
1112     }
1113 
1114     @Benchmark
1115     public void ANDLanes(Blackhole bh) {
1116         byte[] as = fa.apply(size);
1117         byte r = -1;
1118         for (int ic = 0; ic < INVOC_COUNT; ic++) {
1119             r = -1;
1120             for (int i = 0; i < as.length; i++) {
1121                 r &= as[i];
1122             }
1123         }
1124         bh.consume(r);
1125     }
1126 
1127     @Benchmark
1128     public void ANDMaskedLanes(Blackhole bh) {
1129         byte[] as = fa.apply(size);
1130         boolean[] ms = fm.apply(size);
1131         byte r = -1;
1132         for (int ic = 0; ic < INVOC_COUNT; ic++) {
1133             r = -1;
1134             for (int i = 0; i < as.length; i++) {
1135                 if (ms[i % ms.length])
1136                     r &= as[i];
1137             }
1138         }
1139         bh.consume(r);
1140     }
1141 
1142     @Benchmark
1143     public void ORLanes(Blackhole bh) {
1144         byte[] as = fa.apply(size);
1145         byte r = 0;
1146         for (int ic = 0; ic < INVOC_COUNT; ic++) {
1147             r = 0;
1148             for (int i = 0; i < as.length; i++) {
1149                 r |= as[i];
1150             }
1151         }
1152         bh.consume(r);
1153     }
1154 
1155     @Benchmark
1156     public void ORMaskedLanes(Blackhole bh) {
1157         byte[] as = fa.apply(size);
1158         boolean[] ms = fm.apply(size);
1159         byte r = 0;
1160         for (int ic = 0; ic < INVOC_COUNT; ic++) {
1161             r = 0;
1162             for (int i = 0; i < as.length; i++) {
1163                 if (ms[i % ms.length])
1164                     r |= as[i];
1165             }
1166         }
1167         bh.consume(r);
1168     }
1169 
1170     @Benchmark
1171     public void XORLanes(Blackhole bh) {
1172         byte[] as = fa.apply(size);
1173         byte r = 0;
1174         for (int ic = 0; ic < INVOC_COUNT; ic++) {
1175             r = 0;
1176             for (int i = 0; i < as.length; i++) {
1177                 r ^= as[i];
1178             }
1179         }
1180         bh.consume(r);
1181     }
1182 
1183     @Benchmark
1184     public void XORMaskedLanes(Blackhole bh) {
1185         byte[] as = fa.apply(size);
1186         boolean[] ms = fm.apply(size);
1187         byte r = 0;
1188         for (int ic = 0; ic < INVOC_COUNT; ic++) {
1189             r = 0;
1190             for (int i = 0; i < as.length; i++) {
1191                 if (ms[i % ms.length])
1192                     r ^= as[i];
1193             }
1194         }
1195         bh.consume(r);
1196     }
1197 
1198     @Benchmark
1199     public void ADDLanes(Blackhole bh) {
1200         byte[] as = fa.apply(size);
1201         byte r = 0;
1202         for (int ic = 0; ic < INVOC_COUNT; ic++) {
1203             r = 0;
1204             for (int i = 0; i < as.length; i++) {
1205                 r += as[i];
1206             }
1207         }
1208         bh.consume(r);
1209     }
1210 
1211     @Benchmark
1212     public void ADDMaskedLanes(Blackhole bh) {
1213         byte[] as = fa.apply(size);
1214         boolean[] ms = fm.apply(size);
1215         byte r = 0;
1216         for (int ic = 0; ic < INVOC_COUNT; ic++) {
1217             r = 0;
1218             for (int i = 0; i < as.length; i++) {
1219                 if (ms[i % ms.length])
1220                     r += as[i];
1221             }
1222         }
1223         bh.consume(r);
1224     }
1225 
1226     @Benchmark
1227     public void MULLanes(Blackhole bh) {
1228         byte[] as = fa.apply(size);
1229         byte r = 1;
1230         for (int ic = 0; ic < INVOC_COUNT; ic++) {
1231             r = 1;
1232             for (int i = 0; i < as.length; i++) {
1233                 r *= as[i];
1234             }
1235         }
1236         bh.consume(r);
1237     }
1238 
1239     @Benchmark
1240     public void MULMaskedLanes(Blackhole bh) {
1241         byte[] as = fa.apply(size);
1242         boolean[] ms = fm.apply(size);
1243         byte r = 1;
1244         for (int ic = 0; ic < INVOC_COUNT; ic++) {
1245             r = 1;
1246             for (int i = 0; i < as.length; i++) {
1247                 if (ms[i % ms.length])
1248                     r *= as[i];
1249             }
1250         }
1251         bh.consume(r);
1252     }
1253 
1254     @Benchmark
1255     public void anyTrue(Blackhole bh) {
1256         boolean[] ms = fm.apply(size);
1257         boolean r = false;
1258         for (int ic = 0; ic < INVOC_COUNT; ic++) {
1259             r = false;
1260             for (int i = 0; i < ms.length; i++) {
1261                 r |= ms[i];
1262             }
1263         }
1264         bh.consume(r);
1265     }
1266 
1267     @Benchmark
1268     public void allTrue(Blackhole bh) {
1269         boolean[] ms = fm.apply(size);
1270         boolean r = true;
1271         for (int ic = 0; ic < INVOC_COUNT; ic++) {
1272             r = true;
1273             for (int i = 0; i < ms.length; i++) {
1274                 r &= ms[i];
1275             }
1276         }
1277         bh.consume(r);
1278     }
1279 
1280     @Benchmark
1281     public void IS_DEFAULT(Blackhole bh) {
1282         byte[] as = fa.apply(size);
1283         boolean r = true;
1284 
1285         for (int ic = 0; ic < INVOC_COUNT; ic++) {
1286             for (int i = 0; i < as.length; i++) {
1287                 byte a = as[i];
1288                 r &= (bits(a)==0); // accumulate so JIT can't eliminate the computation
1289             }
1290         }
1291 
1292         bh.consume(r);
1293     }
1294 
1295     @Benchmark
1296     public void IS_NEGATIVE(Blackhole bh) {
1297         byte[] as = fa.apply(size);
1298         boolean r = true;
1299 
1300         for (int ic = 0; ic < INVOC_COUNT; ic++) {
1301             for (int i = 0; i < as.length; i++) {
1302                 byte a = as[i];
1303                 r &= (bits(a)<0); // accumulate so JIT can't eliminate the computation
1304             }
1305         }
1306 
1307         bh.consume(r);
1308     }
1309 
1310     @Benchmark
1311     public void LT(Blackhole bh) {
1312         byte[] as = fa.apply(size);
1313         byte[] bs = fb.apply(size);
1314         boolean r = true;
1315 
1316         for (int ic = 0; ic < INVOC_COUNT; ic++) {
1317             for (int i = 0; i < as.length; i++) {
1318                 r &= lt(as[i], bs[i]); // accumulate so JIT can't eliminate the computation
1319             }
1320         }
1321 
1322         bh.consume(r);
1323     }
1324 
1325     @Benchmark
1326     public void GT(Blackhole bh) {
1327         byte[] as = fa.apply(size);
1328         byte[] bs = fb.apply(size);
1329         boolean r = true;
1330 
1331         for (int ic = 0; ic < INVOC_COUNT; ic++) {
1332             for (int i = 0; i < as.length; i++) {
1333                 r &= gt(as[i], bs[i]); // accumulate so JIT can't eliminate the computation
1334             }
1335         }
1336 
1337         bh.consume(r);
1338     }
1339 
1340     @Benchmark
1341     public void EQ(Blackhole bh) {
1342         byte[] as = fa.apply(size);
1343         byte[] bs = fb.apply(size);
1344         boolean r = true;
1345 
1346         for (int ic = 0; ic < INVOC_COUNT; ic++) {
1347             for (int i = 0; i < as.length; i++) {
1348                 r &= eq(as[i], bs[i]); // accumulate so JIT can't eliminate the computation
1349             }
1350         }
1351 
1352         bh.consume(r);
1353     }
1354 
1355     @Benchmark
1356     public void NE(Blackhole bh) {
1357         byte[] as = fa.apply(size);
1358         byte[] bs = fb.apply(size);
1359         boolean r = true;
1360 
1361         for (int ic = 0; ic < INVOC_COUNT; ic++) {
1362             for (int i = 0; i < as.length; i++) {
1363                 r &= neq(as[i], bs[i]); // accumulate so JIT can't eliminate the computation
1364             }
1365         }
1366 
1367         bh.consume(r);
1368     }
1369 
1370     @Benchmark
1371     public void LE(Blackhole bh) {
1372         byte[] as = fa.apply(size);
1373         byte[] bs = fb.apply(size);
1374         boolean r = true;
1375 
1376         for (int ic = 0; ic < INVOC_COUNT; ic++) {
1377             for (int i = 0; i < as.length; i++) {
1378                 r &= le(as[i], bs[i]); // accumulate so JIT can't eliminate the computation
1379             }
1380         }
1381 
1382         bh.consume(r);
1383     }
1384 
1385     @Benchmark
1386     public void GE(Blackhole bh) {
1387         byte[] as = fa.apply(size);
1388         byte[] bs = fb.apply(size);
1389         boolean r = true;
1390 
1391         for (int ic = 0; ic < INVOC_COUNT; ic++) {
1392             for (int i = 0; i < as.length; i++) {
1393                 r &= ge(as[i], bs[i]); // accumulate so JIT can't eliminate the computation
1394             }
1395         }
1396 
1397         bh.consume(r);
1398     }
1399 
1400     @Benchmark
1401     public void ULT(Blackhole bh) {
1402         byte[] as = fa.apply(size);
1403         byte[] bs = fb.apply(size);
1404         boolean r = true;
1405 
1406         for (int ic = 0; ic < INVOC_COUNT; ic++) {
1407             for (int i = 0; i < as.length; i++) {
1408                 r &= ult(as[i], bs[i]); // accumulate so JIT can't eliminate the computation
1409             }
1410         }
1411 
1412         bh.consume(r);
1413     }
1414 
1415     @Benchmark
1416     public void UGT(Blackhole bh) {
1417         byte[] as = fa.apply(size);
1418         byte[] bs = fb.apply(size);
1419         boolean r = true;
1420 
1421         for (int ic = 0; ic < INVOC_COUNT; ic++) {
1422             for (int i = 0; i < as.length; i++) {
1423                 r &= ugt(as[i], bs[i]); // accumulate so JIT can't eliminate the computation
1424             }
1425         }
1426 
1427         bh.consume(r);
1428     }
1429 
1430     @Benchmark
1431     public void ULE(Blackhole bh) {
1432         byte[] as = fa.apply(size);
1433         byte[] bs = fb.apply(size);
1434         boolean r = true;
1435 
1436         for (int ic = 0; ic < INVOC_COUNT; ic++) {
1437             for (int i = 0; i < as.length; i++) {
1438                 r &= ule(as[i], bs[i]); // accumulate so JIT can't eliminate the computation
1439             }
1440         }
1441 
1442         bh.consume(r);
1443     }
1444 
1445     @Benchmark
1446     public void UGE(Blackhole bh) {
1447         byte[] as = fa.apply(size);
1448         byte[] bs = fb.apply(size);
1449         boolean r = true;
1450 
1451         for (int ic = 0; ic < INVOC_COUNT; ic++) {
1452             for (int i = 0; i < as.length; i++) {
1453                 r &= uge(as[i], bs[i]); // accumulate so JIT can't eliminate the computation
1454             }
1455         }
1456 
1457         bh.consume(r);
1458     }
1459 
1460     @Benchmark
1461     public void blend(Blackhole bh) {
1462         byte[] as = fa.apply(size);
1463         byte[] bs = fb.apply(size);
1464         byte[] rs = fr.apply(size);
1465         boolean[] ms = fm.apply(size);
1466 
1467         for (int ic = 0; ic < INVOC_COUNT; ic++) {
1468             for (int i = 0; i < as.length; i++) {
1469                 byte a = as[i];
1470                 byte b = bs[i];
1471                 boolean m = ms[i % ms.length];
1472                 rs[i] = (m ? b : a);
1473             }
1474         }
1475 
1476         bh.consume(rs);
1477     }
1478 
1479     void rearrangeShared(int window, Blackhole bh) {
1480         byte[] as = fa.apply(size);
1481         int[] order = fs.apply(size);
1482         byte[] rs = fr.apply(size);
1483 
1484         for (int ic = 0; ic < INVOC_COUNT; ic++) {
1485             for (int i = 0; i < as.length; i += window) {
1486                 for (int j = 0; j < window; j++) {
1487                     byte a = as[i+j];
1488                     int pos = order[j];
1489                     rs[i + pos] = a;
1490                 }
1491             }
1492         }
1493 
1494         bh.consume(rs);
1495     }
1496 
1497     @Benchmark
1498     public void rearrange064(Blackhole bh) {
1499         int window = 64 / Byte.SIZE;
1500         rearrangeShared(window, bh);
1501     }
1502 
1503     @Benchmark
1504     public void rearrange128(Blackhole bh) {
1505         int window = 128 / Byte.SIZE;
1506         rearrangeShared(window, bh);
1507     }
1508 
1509     @Benchmark
1510     public void rearrange256(Blackhole bh) {
1511         int window = 256 / Byte.SIZE;
1512         rearrangeShared(window, bh);
1513     }
1514 
1515     @Benchmark
1516     public void rearrange512(Blackhole bh) {
1517         int window = 512 / Byte.SIZE;
1518         rearrangeShared(window, bh);
1519     }
1520 
1521     @Benchmark
1522     public void compressScalar(Blackhole bh) {
1523         byte[] as = fa.apply(size);
1524         byte[] rs = new byte[size];
1525         boolean[] im = fmt.apply(size);
1526 
1527         for (int ic = 0; ic < INVOC_COUNT; ic++) {
1528             for (int i = 0, j = 0; i < as.length; i++) {
1529                 if (im[i]) {
1530                     rs[j++] = as[i];
1531                 }
1532             }
1533         }
1534 
1535         bh.consume(rs);
1536     }
1537 
1538     @Benchmark
1539     public void expandScalar(Blackhole bh) {
1540         byte[] as = fa.apply(size);
1541         byte[] rs = new byte[size];
1542         boolean[] im = fmt.apply(size);
1543 
1544         for (int ic = 0; ic < INVOC_COUNT; ic++) {
1545             for (int i = 0, j = 0; i < as.length; i++) {
1546                 if (im[i]) {
1547                     rs[i++] = as[j++];
1548                 }
1549             }
1550         }
1551 
1552         bh.consume(rs);
1553     }
1554 
1555     @Benchmark
1556     public void maskCompressScalar(Blackhole bh) {
1557         boolean[] im = fmt.apply(size);
1558         boolean[] rm = new boolean[size];
1559 
1560         for (int ic = 0; ic < INVOC_COUNT; ic++) {
1561             for (int i = 0, j = 0; i < im.length; i++) {
1562                 if (im[i]) {
1563                     rm[j++] = im[i];
1564                 }
1565             }
1566         }
1567 
1568         bh.consume(rm);
1569     }
1570 
1571     void broadcastShared(int window, Blackhole bh) {
1572         byte[] as = fa.apply(size);
1573         byte[] rs = fr.apply(size);
1574 
1575         for (int ic = 0; ic < INVOC_COUNT; ic++) {
1576             for (int i = 0; i < as.length; i += window) {
1577                 int idx = i;
1578                 for (int j = 0; j < window; j++) {
1579                     rs[j] = as[idx];
1580                 }
1581             }
1582         }
1583 
1584         bh.consume(rs);
1585     }
1586 
1587     @Benchmark
1588     public void broadcast064(Blackhole bh) {
1589         int window = 64 / Byte.SIZE;
1590         broadcastShared(window, bh);
1591     }
1592 
1593     @Benchmark
1594     public void broadcast128(Blackhole bh) {
1595         int window = 128 / Byte.SIZE;
1596         broadcastShared(window, bh);
1597     }
1598 
1599     @Benchmark
1600     public void broadcast256(Blackhole bh) {
1601         int window = 256 / Byte.SIZE;
1602         broadcastShared(window, bh);
1603     }
1604 
1605     @Benchmark
1606     public void broadcast512(Blackhole bh) {
1607         int window = 512 / Byte.SIZE;
1608         broadcastShared(window, bh);
1609     }
1610 
1611     @Benchmark
1612     public void zero(Blackhole bh) {
1613         byte[] as = fa.apply(size);
1614 
1615         for (int ic = 0; ic < INVOC_COUNT; ic++) {
1616             for (int i = 0; i < as.length; i++) {
1617                 as[i] = (byte)0;
1618             }
1619         }
1620 
1621         bh.consume(as);
1622     }
1623 
1624     @Benchmark
1625     public void BITWISE_BLEND(Blackhole bh) {
1626         byte[] as = fa.apply(size);
1627         byte[] bs = fb.apply(size);
1628         byte[] cs = fc.apply(size);
1629         byte[] rs = fr.apply(size);
1630 
1631         for (int ic = 0; ic < INVOC_COUNT; ic++) {
1632             for (int i = 0; i < as.length; i++) {
1633                 byte a = as[i];
1634                 byte b = bs[i];
1635                 byte c = cs[i];
1636                 rs[i] = (byte)((a&~(c))|(b&c));
1637             }
1638         }
1639 
1640         bh.consume(rs);
1641     }
1642 
1643     @Benchmark
1644     public void BITWISE_BLENDMasked(Blackhole bh) {
1645         byte[] as = fa.apply(size);
1646         byte[] bs = fb.apply(size);
1647         byte[] cs = fc.apply(size);
1648         byte[] rs = fr.apply(size);
1649         boolean[] ms = fm.apply(size);
1650 
1651         for (int ic = 0; ic < INVOC_COUNT; ic++) {
1652             for (int i = 0; i < as.length; i++) {
1653                 byte a = as[i];
1654                 byte b = bs[i];
1655                 byte c = cs[i];
1656                 if (ms[i % ms.length]) {
1657                     rs[i] = (byte)((a&~(c))|(b&c));
1658                 } else {
1659                     rs[i] = a;
1660                 }
1661             }
1662         }
1663         bh.consume(rs);
1664     }
1665     @Benchmark
1666     public void NEG(Blackhole bh) {
1667         byte[] as = fa.apply(size);
1668         byte[] rs = fr.apply(size);
1669 
1670         for (int ic = 0; ic < INVOC_COUNT; ic++) {
1671             for (int i = 0; i < as.length; i++) {
1672                 byte a = as[i];
1673                 rs[i] = (byte)(-((byte)a));
1674             }
1675         }
1676 
1677         bh.consume(rs);
1678     }
1679 
1680     @Benchmark
1681     public void NEGMasked(Blackhole bh) {
1682         byte[] as = fa.apply(size);
1683         byte[] rs = fr.apply(size);
1684         boolean[] ms = fm.apply(size);
1685 
1686         for (int ic = 0; ic < INVOC_COUNT; ic++) {
1687             for (int i = 0; i < as.length; i++) {
1688                 byte a = as[i];
1689                 boolean m = ms[i % ms.length];
1690                 rs[i] = (m ? (byte)(-((byte)a)) : a);
1691             }
1692         }
1693 
1694         bh.consume(rs);
1695     }
1696     @Benchmark
1697     public void ABS(Blackhole bh) {
1698         byte[] as = fa.apply(size);
1699         byte[] rs = fr.apply(size);
1700 
1701         for (int ic = 0; ic < INVOC_COUNT; ic++) {
1702             for (int i = 0; i < as.length; i++) {
1703                 byte a = as[i];
1704                 rs[i] = (byte)(Math.abs((byte)a));
1705             }
1706         }
1707 
1708         bh.consume(rs);
1709     }
1710 
1711     @Benchmark
1712     public void ABSMasked(Blackhole bh) {
1713         byte[] as = fa.apply(size);
1714         byte[] rs = fr.apply(size);
1715         boolean[] ms = fm.apply(size);
1716 
1717         for (int ic = 0; ic < INVOC_COUNT; ic++) {
1718             for (int i = 0; i < as.length; i++) {
1719                 byte a = as[i];
1720                 boolean m = ms[i % ms.length];
1721                 rs[i] = (m ? (byte)(Math.abs((byte)a)) : a);
1722             }
1723         }
1724 
1725         bh.consume(rs);
1726     }
1727     @Benchmark
1728     public void NOT(Blackhole bh) {
1729         byte[] as = fa.apply(size);
1730         byte[] rs = fr.apply(size);
1731 
1732         for (int ic = 0; ic < INVOC_COUNT; ic++) {
1733             for (int i = 0; i < as.length; i++) {
1734                 byte a = as[i];
1735                 rs[i] = (byte)(~((byte)a));
1736             }
1737         }
1738 
1739         bh.consume(rs);
1740     }
1741 
1742     @Benchmark
1743     public void NOTMasked(Blackhole bh) {
1744         byte[] as = fa.apply(size);
1745         byte[] rs = fr.apply(size);
1746         boolean[] ms = fm.apply(size);
1747 
1748         for (int ic = 0; ic < INVOC_COUNT; ic++) {
1749             for (int i = 0; i < as.length; i++) {
1750                 byte a = as[i];
1751                 boolean m = ms[i % ms.length];
1752                 rs[i] = (m ? (byte)(~((byte)a)) : a);
1753             }
1754         }
1755 
1756         bh.consume(rs);
1757     }
1758     @Benchmark
1759     public void ZOMO(Blackhole bh) {
1760         byte[] as = fa.apply(size);
1761         byte[] rs = fr.apply(size);
1762 
1763         for (int ic = 0; ic < INVOC_COUNT; ic++) {
1764             for (int i = 0; i < as.length; i++) {
1765                 byte a = as[i];
1766                 rs[i] = (byte)((a==0?0:-1));
1767             }
1768         }
1769 
1770         bh.consume(rs);
1771     }
1772 
1773     @Benchmark
1774     public void ZOMOMasked(Blackhole bh) {
1775         byte[] as = fa.apply(size);
1776         byte[] rs = fr.apply(size);
1777         boolean[] ms = fm.apply(size);
1778 
1779         for (int ic = 0; ic < INVOC_COUNT; ic++) {
1780             for (int i = 0; i < as.length; i++) {
1781                 byte a = as[i];
1782                 boolean m = ms[i % ms.length];
1783                 rs[i] = (m ? (byte)((a==0?0:-1)) : a);
1784             }
1785         }
1786 
1787         bh.consume(rs);
1788     }
1789     @Benchmark
1790     public void BIT_COUNT(Blackhole bh) {
1791         byte[] as = fa.apply(size);
1792         byte[] rs = fr.apply(size);
1793 
1794         for (int ic = 0; ic < INVOC_COUNT; ic++) {
1795             for (int i = 0; i < as.length; i++) {
1796                 byte a = as[i];
1797                 rs[i] = (byte)(Integer.bitCount((int)a & 0xFF));
1798             }
1799         }
1800 
1801         bh.consume(rs);
1802     }
1803 
1804     @Benchmark
1805     public void BIT_COUNTMasked(Blackhole bh) {
1806         byte[] as = fa.apply(size);
1807         byte[] rs = fr.apply(size);
1808         boolean[] ms = fm.apply(size);
1809 
1810         for (int ic = 0; ic < INVOC_COUNT; ic++) {
1811             for (int i = 0; i < as.length; i++) {
1812                 byte a = as[i];
1813                 boolean m = ms[i % ms.length];
1814                 rs[i] = (m ? (byte)(Integer.bitCount((int)a & 0xFF)) : a);
1815             }
1816         }
1817 
1818         bh.consume(rs);
1819     }
1820     @Benchmark
1821     public void TRAILING_ZEROS_COUNT(Blackhole bh) {
1822         byte[] as = fa.apply(size);
1823         byte[] rs = fr.apply(size);
1824 
1825         for (int ic = 0; ic < INVOC_COUNT; ic++) {
1826             for (int i = 0; i < as.length; i++) {
1827                 byte a = as[i];
1828                 rs[i] = (byte)(TRAILING_ZEROS_COUNT_scalar(a));
1829             }
1830         }
1831 
1832         bh.consume(rs);
1833     }
1834 
1835     @Benchmark
1836     public void TRAILING_ZEROS_COUNTMasked(Blackhole bh) {
1837         byte[] as = fa.apply(size);
1838         byte[] rs = fr.apply(size);
1839         boolean[] ms = fm.apply(size);
1840 
1841         for (int ic = 0; ic < INVOC_COUNT; ic++) {
1842             for (int i = 0; i < as.length; i++) {
1843                 byte a = as[i];
1844                 boolean m = ms[i % ms.length];
1845                 rs[i] = (m ? (byte)(TRAILING_ZEROS_COUNT_scalar(a)) : a);
1846             }
1847         }
1848 
1849         bh.consume(rs);
1850     }
1851     @Benchmark
1852     public void LEADING_ZEROS_COUNT(Blackhole bh) {
1853         byte[] as = fa.apply(size);
1854         byte[] rs = fr.apply(size);
1855 
1856         for (int ic = 0; ic < INVOC_COUNT; ic++) {
1857             for (int i = 0; i < as.length; i++) {
1858                 byte a = as[i];
1859                 rs[i] = (byte)(LEADING_ZEROS_COUNT_scalar(a));
1860             }
1861         }
1862 
1863         bh.consume(rs);
1864     }
1865 
1866     @Benchmark
1867     public void LEADING_ZEROS_COUNTMasked(Blackhole bh) {
1868         byte[] as = fa.apply(size);
1869         byte[] rs = fr.apply(size);
1870         boolean[] ms = fm.apply(size);
1871 
1872         for (int ic = 0; ic < INVOC_COUNT; ic++) {
1873             for (int i = 0; i < as.length; i++) {
1874                 byte a = as[i];
1875                 boolean m = ms[i % ms.length];
1876                 rs[i] = (m ? (byte)(LEADING_ZEROS_COUNT_scalar(a)) : a);
1877             }
1878         }
1879 
1880         bh.consume(rs);
1881     }
1882     @Benchmark
1883     public void REVERSE(Blackhole bh) {
1884         byte[] as = fa.apply(size);
1885         byte[] rs = fr.apply(size);
1886 
1887         for (int ic = 0; ic < INVOC_COUNT; ic++) {
1888             for (int i = 0; i < as.length; i++) {
1889                 byte a = as[i];
1890                 rs[i] = (byte)(REVERSE_scalar(a));
1891             }
1892         }
1893 
1894         bh.consume(rs);
1895     }
1896 
1897     @Benchmark
1898     public void REVERSEMasked(Blackhole bh) {
1899         byte[] as = fa.apply(size);
1900         byte[] rs = fr.apply(size);
1901         boolean[] ms = fm.apply(size);
1902 
1903         for (int ic = 0; ic < INVOC_COUNT; ic++) {
1904             for (int i = 0; i < as.length; i++) {
1905                 byte a = as[i];
1906                 boolean m = ms[i % ms.length];
1907                 rs[i] = (m ? (byte)(REVERSE_scalar(a)) : a);
1908             }
1909         }
1910 
1911         bh.consume(rs);
1912     }
1913     @Benchmark
1914     public void REVERSE_BYTES(Blackhole bh) {
1915         byte[] as = fa.apply(size);
1916         byte[] rs = fr.apply(size);
1917 
1918         for (int ic = 0; ic < INVOC_COUNT; ic++) {
1919             for (int i = 0; i < as.length; i++) {
1920                 byte a = as[i];
1921                 rs[i] = (byte)(a);
1922             }
1923         }
1924 
1925         bh.consume(rs);
1926     }
1927 
1928     @Benchmark
1929     public void REVERSE_BYTESMasked(Blackhole bh) {
1930         byte[] as = fa.apply(size);
1931         byte[] rs = fr.apply(size);
1932         boolean[] ms = fm.apply(size);
1933 
1934         for (int ic = 0; ic < INVOC_COUNT; ic++) {
1935             for (int i = 0; i < as.length; i++) {
1936                 byte a = as[i];
1937                 boolean m = ms[i % ms.length];
1938                 rs[i] = (m ? (byte)(a) : a);
1939             }
1940         }
1941 
1942         bh.consume(rs);
1943     }
1944 }