1 /*
   2  * Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  */
  23 
  24 package compiler.loopopts.superword;
  25 
  26 import compiler.lib.ir_framework.*;
  27 import jdk.test.lib.Utils;
  28 import jdk.test.whitebox.WhiteBox;
  29 import jdk.internal.misc.Unsafe;
  30 import java.lang.reflect.Array;
  31 import java.util.Map;
  32 import java.util.HashMap;
  33 import java.util.Random;
  34 import java.nio.ByteOrder;
  35 
  36 /*
  37  * @test id=NoAlignVector
  38  * @bug 8310190
  39  * @summary Test AlignVector with various loop init, stride, scale, invar, etc.
  40  * @modules java.base/jdk.internal.misc
  41  * @library /test/lib /
  42  * @requires vm.compiler2.enabled
  43  * @run driver compiler.loopopts.superword.TestAlignVector NoAlignVector
  44  */
  45 
  46 /*
  47  * @test id=AlignVector
  48  * @bug 8310190
  49  * @summary Test AlignVector with various loop init, stride, scale, invar, etc.
  50  * @modules java.base/jdk.internal.misc
  51  * @library /test/lib /
  52  * @requires vm.compiler2.enabled
  53  * @run driver compiler.loopopts.superword.TestAlignVector AlignVector
  54  */
  55 
  56 /*
  57  * @test id=VerifyAlignVector
  58  * @bug 8310190
  59  * @summary Test AlignVector with various loop init, stride, scale, invar, etc.
  60  * @modules java.base/jdk.internal.misc
  61  * @library /test/lib /
  62  * @requires vm.compiler2.enabled
  63  * @run driver compiler.loopopts.superword.TestAlignVector VerifyAlignVector
  64  */
  65 
  66 public class TestAlignVector {
  67     static int RANGE = 1024*8; // Length of every generated input array; the test loops derive their bounds from it
  68     static int RANGE_FINAL = 1024*8; // NOTE(review): not declared final despite the name; not referenced in the reviewed part of the file
  69     private static final Unsafe UNSAFE = Unsafe.getUnsafe(); // NOTE(review): not referenced in the reviewed part of the file
  70     private static final Random RANDOM = Utils.getRandomInstance(); // Supplies the random array contents in generateB/S/I/L
  71 
  72     // Inputs: per element type, two arrays (a*, b*) generated once in the constructor and a mask (m*).
         // The registered tests never receive these arrays directly, only clones of them.
  73     byte[] aB;
  74     byte[] bB;
  75     byte mB = (byte)31;
  76     short[] aS;
  77     short[] bS;
  78     short mS = (short)0xF0F0;
  79     int[] aI;
  80     int[] bI;
  81     int mI = 0xF0F0F0F0;
  82     long[] aL;
  83     long[] bL;
  84     long mL = 0xF0F0F0F0F0F0F0F0L;
  85 
  86     // Registered tests, keyed by name; each entry invokes one test method on fresh clones of the inputs
  87     Map<String,TestFunction> tests = new HashMap<String,TestFunction>();
  88 
  89     // Gold results per test name: the results from the first run (in the constructor) before compilation
  90     Map<String,Object[]> golds = new HashMap<String,Object[]>();
  91 
  92     interface TestFunction {
             // Runs one registered test and returns its output arrays.
             // verify() requires every returned element to be a byte[], short[], int[] or long[].
  93         Object[] run();
  94     }
  95 
  96     public static void main(String[] args) {
             // Entry point for the jtreg @run lines: args[0] names the scenario and selects the
             // AlignVector-related VM flags that are added before the framework is started.
  97         TestFramework framework = new TestFramework(TestAlignVector.class);
  98         framework.addFlags("--add-modules", "java.base", "--add-exports", "java.base/jdk.internal.misc=ALL-UNNAMED",
  99                            "-XX:LoopUnrollLimit=250");
 100         String[] scenarioFlags = switch (args[0]) {
 101             case "NoAlignVector"     -> new String[] { "-XX:-AlignVector" };
 102             case "AlignVector"       -> new String[] { "-XX:+AlignVector" };
 103             case "VerifyAlignVector" -> new String[] { "-XX:+AlignVector", "-XX:+IgnoreUnrecognizedVMOptions", "-XX:+VerifyAlignVector" };
 104             default -> throw new RuntimeException("Test argument not recognized: " + args[0]);
 105         };
 106         framework.addFlags(scenarioFlags);
 107         framework.start();
 108     }
 109 
 110     public TestAlignVector() {
             // Builds the shared inputs, registers every test as a lambda that works on fresh
             // clones of those inputs, and records each test's first result as its gold value.
 111         // Generate input once
 112         aB = generateB();
 113         bB = generateB();
 114         aS = generateS();
 115         bS = generateS();
 116         aI = generateI();
 117         bI = generateI();
 118         aL = generateL();
 119         bL = generateL();
 120 
 121         // Add all tests to list
 122         tests.put("test0",       () -> { return test0(aB.clone(), bB.clone(), mB); });
 123         tests.put("test1",       () -> { return test1(aB.clone(), bB.clone(), mB); });
 124         tests.put("test2",       () -> { return test2(aB.clone(), bB.clone(), mB); });
 125         tests.put("test3",       () -> { return test3(aB.clone(), bB.clone(), mB); });
 126         tests.put("test4",       () -> { return test4(aB.clone(), bB.clone(), mB); });
 127         tests.put("test5",       () -> { return test5(aB.clone(), bB.clone(), mB, 0); });
 128         tests.put("test6",       () -> { return test6(aB.clone(), bB.clone(), mB); });
 129         tests.put("test7",       () -> { return test7(aS.clone(), bS.clone(), mS); });
             // test8 is registered under two distinct keys: with one shared key the second put()
             // replaces the first, and the init = 0 variant would never be run or verified.
 130         tests.put("test8_init0", () -> { return test8(aB.clone(), bB.clone(), mB, 0); });
 131         tests.put("test8_init1", () -> { return test8(aB.clone(), bB.clone(), mB, 1); });
 132         tests.put("test9",       () -> { return test9(aB.clone(), bB.clone(), mB); });
 133 
 134         tests.put("test10a",     () -> { return test10a(aB.clone(), bB.clone(), mB); });
 135         tests.put("test10b",     () -> { return test10b(aB.clone(), bB.clone(), mB); });
 136         tests.put("test10c",     () -> { return test10c(aS.clone(), bS.clone(), mS); });
 137         tests.put("test10d",     () -> { return test10d(aS.clone(), bS.clone(), mS); });
 138 
 139         tests.put("test11aB",    () -> { return test11aB(aB.clone(), bB.clone(), mB); });
 140         tests.put("test11aS",    () -> { return test11aS(aS.clone(), bS.clone(), mS); });
 141         tests.put("test11aI",    () -> { return test11aI(aI.clone(), bI.clone(), mI); });
 142         tests.put("test11aL",    () -> { return test11aL(aL.clone(), bL.clone(), mL); });
 143 
 144         tests.put("test11bB",    () -> { return test11bB(aB.clone(), bB.clone(), mB); });
 145         tests.put("test11bS",    () -> { return test11bS(aS.clone(), bS.clone(), mS); });
 146         tests.put("test11bI",    () -> { return test11bI(aI.clone(), bI.clone(), mI); });
 147         tests.put("test11bL",    () -> { return test11bL(aL.clone(), bL.clone(), mL); });
 148 
 149         tests.put("test11cB",    () -> { return test11cB(aB.clone(), bB.clone(), mB); });
 150         tests.put("test11cS",    () -> { return test11cS(aS.clone(), bS.clone(), mS); });
 151         tests.put("test11cI",    () -> { return test11cI(aI.clone(), bI.clone(), mI); });
 152         tests.put("test11cL",    () -> { return test11cL(aL.clone(), bL.clone(), mL); });
 153 
 154         tests.put("test11dB",    () -> { return test11dB(aB.clone(), bB.clone(), mB, 0); });
 155         tests.put("test11dS",    () -> { return test11dS(aS.clone(), bS.clone(), mS, 0); });
 156         tests.put("test11dI",    () -> { return test11dI(aI.clone(), bI.clone(), mI, 0); });
 157         tests.put("test11dL",    () -> { return test11dL(aL.clone(), bL.clone(), mL, 0); });
 158 
 159         tests.put("test12",      () -> { return test12(aB.clone(), bB.clone(), mB); });
 160 
 161         tests.put("test13aIL",   () -> { return test13aIL(aI.clone(), aL.clone()); });
 162         tests.put("test13aIB",   () -> { return test13aIB(aI.clone(), aB.clone()); });
 163         tests.put("test13aIS",   () -> { return test13aIS(aI.clone(), aS.clone()); });
 164         tests.put("test13aBSIL", () -> { return test13aBSIL(aB.clone(), aS.clone(), aI.clone(), aL.clone()); });
 165 
 166         tests.put("test13bIL",   () -> { return test13bIL(aI.clone(), aL.clone()); });
 167         tests.put("test13bIB",   () -> { return test13bIB(aI.clone(), aB.clone()); });
 168         tests.put("test13bIS",   () -> { return test13bIS(aI.clone(), aS.clone()); });
 169         tests.put("test13bBSIL", () -> { return test13bBSIL(aB.clone(), aS.clone(), aI.clone(), aL.clone()); });
 170 
 171         tests.put("test14aB",    () -> { return test14aB(aB.clone()); });
 172         tests.put("test14bB",    () -> { return test14bB(aB.clone()); });
 173         tests.put("test14cB",    () -> { return test14cB(aB.clone()); });
 174 
 175         tests.put("test15aB",    () -> { return test15aB(aB.clone()); });
 176         tests.put("test15bB",    () -> { return test15bB(aB.clone()); });
 177         tests.put("test15cB",    () -> { return test15cB(aB.clone()); });
 178 
 179         tests.put("test16a",     () -> { return test16a(aB.clone(), aS.clone()); });
 180         tests.put("test16b",     () -> { return test16b(aB.clone()); });
 181 
 182         tests.put("test17a",     () -> { return test17a(aL.clone()); });
 183         tests.put("test17b",     () -> { return test17b(aL.clone()); });
 184         tests.put("test17c",     () -> { return test17c(aL.clone()); });
 185         tests.put("test17d",     () -> { return test17d(aL.clone()); });
 186 
 187         tests.put("test18a",     () -> { return test18a(aB.clone(), aI.clone()); });
 188         tests.put("test18b",     () -> { return test18b(aB.clone(), aI.clone()); });
 189 
 190         tests.put("test19",      () -> { return test19(aI.clone(), bI.clone()); });
 191         tests.put("test20",      () -> { return test20(aB.clone()); });
 192 
 193         // Compute gold value for all test methods before compilation
 194         for (Map.Entry<String,TestFunction> entry : tests.entrySet()) {
 195             String name = entry.getKey();
 196             TestFunction test = entry.getValue();
 197             Object[] gold = test.run();
 198             golds.put(name, gold);
 199         }
 200     }
 201 
 202     @Warmup(100)
 203     @Run(test = {"test0",
 204                  "test1",
 205                  "test2",
 206                  "test3",
 207                  "test4",
 208                  "test5",
 209                  "test6",
 210                  "test7",
 211                  "test8",
 212                  "test9",
 213                  "test10a",
 214                  "test10b",
 215                  "test10c",
 216                  "test10d",
 217                  "test11aB",
 218                  "test11aS",
 219                  "test11aI",
 220                  "test11aL",
 221                  "test11bB",
 222                  "test11bS",
 223                  "test11bI",
 224                  "test11bL",
 225                  "test11cB",
 226                  "test11cS",
 227                  "test11cI",
 228                  "test11cL",
 229                  "test11dB",
 230                  "test11dS",
 231                  "test11dI",
 232                  "test11dL",
 233                  "test12",
 234                  "test13aIL",
 235                  "test13aIB",
 236                  "test13aIS",
 237                  "test13aBSIL",
 238                  "test13bIL",
 239                  "test13bIB",
 240                  "test13bIS",
 241                  "test13bBSIL",
 242                  "test14aB",
 243                  "test14bB",
 244                  "test14cB",
 245                  "test15aB",
 246                  "test15bB",
 247                  "test15cB",
 248                  "test16a",
 249                  "test16b",
 250                  "test17a",
 251                  "test17b",
 252                  "test17c",
 253                  "test17d",
 254                  "test18a",
 255                  "test18b",
 256                  "test19",
 257                  "test20"})
 258     public void runTests() {
 259         // Re-run every registered test and check its output against the
 260         // gold result stored under the same name.
 261         tests.forEach((testName, testBody) -> {
 262             // Gold value recorded before compilation
 263             Object[] expected = golds.get(testName);
 264             // Result of the current run
 265             Object[] actual = testBody.run();
 266             // Throws a RuntimeException on the first difference
 267             verify(testName, expected, actual);
 268         });
 269     }
 270 
 271     static byte[] generateB() {
 272         byte[] a = new byte[RANGE];
 273         for (int i = 0; i < a.length; i++) {
 274             a[i] = (byte)RANDOM.nextInt();
 275         }
 276         return a;
 277     }
 278 
 279     static short[] generateS() {
 280         short[] a = new short[RANGE];
 281         for (int i = 0; i < a.length; i++) {
 282             a[i] = (short)RANDOM.nextInt();
 283         }
 284         return a;
 285     }
 286 
 287     static int[] generateI() {
 288         int[] a = new int[RANGE];
 289         for (int i = 0; i < a.length; i++) {
 290             a[i] = RANDOM.nextInt();
 291         }
 292         return a;
 293     }
 294 
 295     static long[] generateL() {
 296         long[] a = new long[RANGE];
 297         for (int i = 0; i < a.length; i++) {
 298             a[i] = RANDOM.nextLong();
 299         }
 300         return a;
 301     }
 302 
 303     static void verify(String name, Object[] gold, Object[] result) {
             // Compares the outputs of one test against its gold outputs, element by element.
             //   name   - test name, used only in the exception messages
             //   gold   - outputs of the reference run
             //   result - outputs of the run being checked
             // Throws RuntimeException when the output counts differ, when a pair is not two
             // distinct arrays of the same type and length, when the element type is not
             // byte/short/int/long, or (via verifyB/S/I/L) when the contents differ.
 304         if (gold.length != result.length) {
 305             throw new RuntimeException("verify " + name + ": not the same number of outputs: gold.length = " +
 306                                        gold.length + ", result.length = " + result.length);
 307         }
 308         for (int i = 0; i < gold.length; i++) {
 309             Object g = gold[i];
 310             Object r = result[i];
                 // Once the classes are known to be identical, checking g for array-ness covers r as well.
 311             if (g.getClass() != r.getClass() || !g.getClass().isArray()) {
 312                 throw new RuntimeException("verify " + name + ": must both be array of same type:" +
 313                                            " gold[" + i + "].getClass() = " + g.getClass().getSimpleName() +
 314                                            " result[" + i + "].getClass() = " + r.getClass().getSimpleName());
 315             }
                 // Sharing one array between gold and result would make the content comparison meaningless.
 316             if (g == r) {
 317                 throw new RuntimeException("verify " + name + ": should be two separate arrays (with identical content):" +
 318                                            " gold[" + i + "] == result[" + i + "]");
 319             }
 320             if (Array.getLength(g) != Array.getLength(r)) {
 321                 throw new RuntimeException("verify " + name + ": arrays must have same length:" +
 322                                            " gold[" + i + "].length = " + Array.getLength(g) +
 323                                            " result[" + i + "].length = " + Array.getLength(r));
 324             }
                 // Dispatch on the primitive component type; Class<?> avoids the raw-type warning.
 325             Class<?> c = g.getClass().getComponentType();
 326             if (c == byte.class) {
 327                 verifyB(name, i, (byte[])g, (byte[])r);
 328             } else if (c == short.class) {
 329                 verifyS(name, i, (short[])g, (short[])r);
 330             } else if (c == int.class) {
 331                 verifyI(name, i, (int[])g, (int[])r);
 332             } else if (c == long.class) {
 333                 verifyL(name, i, (long[])g, (long[])r);
 334             } else {
 335                 throw new RuntimeException("verify " + name + ": array type not supported for verify:" +
 336                                            " gold[" + i + "].getClass() = " + g.getClass().getSimpleName() +
 337                                            " result[" + i + "].getClass() = " + r.getClass().getSimpleName());
 338             }
 339         }
 340     }
 341 
 342     static void verifyB(String name, int i, byte[] g, byte[] r) {
 343         // Throw on the first index at which gold output i (g) and result output i (r) differ.
 344         for (int idx = 0; idx < g.length; idx++) {
 345             if (g[idx] == r[idx]) { continue; }
 346             throw new RuntimeException("verify " + name + ": arrays must have same content:" +
 347                                        " gold[" + i + "][" + idx + "] = " + g[idx] +
 348                                        " result[" + i + "][" + idx + "] = " + r[idx]);
 349         }
 350     }
 351 
 352     static void verifyS(String name, int i, short[] g, short[] r) {
 353         // Throw on the first index at which gold output i (g) and result output i (r) differ.
 354         for (int idx = 0; idx < g.length; idx++) {
 355             if (g[idx] == r[idx]) { continue; }
 356             throw new RuntimeException("verify " + name + ": arrays must have same content:" +
 357                                        " gold[" + i + "][" + idx + "] = " + g[idx] +
 358                                        " result[" + i + "][" + idx + "] = " + r[idx]);
 359         }
 360     }
 361 
 362     static void verifyI(String name, int i, int[] g, int[] r) {
 363         // Throw on the first index at which gold output i (g) and result output i (r) differ.
 364         for (int idx = 0; idx < g.length; idx++) {
 365             if (g[idx] == r[idx]) { continue; }
 366             throw new RuntimeException("verify " + name + ": arrays must have same content:" +
 367                                        " gold[" + i + "][" + idx + "] = " + g[idx] +
 368                                        " result[" + i + "][" + idx + "] = " + r[idx]);
 369         }
 370     }
 371 
 372     static void verifyL(String name, int i, long[] g, long[] r) {
 373         // Throw on the first index at which gold output i (g) and result output i (r) differ.
 374         for (int idx = 0; idx < g.length; idx++) {
 375             if (g[idx] == r[idx]) { continue; }
 376             throw new RuntimeException("verify " + name + ": arrays must have same content:" +
 377                                        " gold[" + i + "][" + idx + "] = " + g[idx] +
 378                                        " result[" + i + "][" + idx + "] = " + r[idx]);
 379         }
 380     }
 381 
 382     @Test
         // Stride-8 byte loop that masks four adjacent elements (offsets 0..3) of a into b.
         // IR rule (MaxVectorSize >= 8, 64-bit, sse4.1 or asimd): expects LOAD_VECTOR_B and AND_VB
         // with VECTOR_SIZE_4 plus a STORE_VECTOR; the rule carries no AlignVector condition.
         // Returns { a, b } so the caller can compare both arrays against the gold result.
 383     @IR(counts = {IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE_4, "> 0",
 384                   IRNode.AND_VB,        IRNode.VECTOR_SIZE_4, "> 0",
 385                   IRNode.STORE_VECTOR, "> 0"},
 386         applyIf = {"MaxVectorSize", ">=8"},
 387         applyIfPlatform = {"64-bit", "true"},
 388         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
 389     static Object[] test0(byte[] a, byte[] b, byte mask) {
 390         for (int i = 0; i < RANGE; i+=8) {
 391             // Safe to vectorize with AlignVector
 392             b[i+0] = (byte)(a[i+0] & mask); // offset 0, align 0
 393             b[i+1] = (byte)(a[i+1] & mask);
 394             b[i+2] = (byte)(a[i+2] & mask);
 395             b[i+3] = (byte)(a[i+3] & mask);
 396         }
 397         return new Object[]{ a, b };
 398     }
 399 
 400     @Test
         // Stride-8 byte loop that masks all eight elements (offsets 0..7) of each group of a into b.
         // IR rule (64-bit, avx2 or asimd): expects LOAD_VECTOR_B, AND_VB and STORE_VECTOR without a
         // vector-size constraint; the rule carries no AlignVector or MaxVectorSize condition.
         // Returns { a, b } so the caller can compare both arrays against the gold result.
 401     @IR(counts = {IRNode.LOAD_VECTOR_B, "> 0",
 402                   IRNode.AND_VB, "> 0",
 403                   IRNode.STORE_VECTOR, "> 0"},
 404         applyIfPlatform = {"64-bit", "true"},
 405         applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true"})
 406     static Object[] test1(byte[] a, byte[] b, byte mask) {
 407         for (int i = 0; i < RANGE; i+=8) {
 408             // Safe to vectorize with AlignVector
 409             b[i+0] = (byte)(a[i+0] & mask); // offset 0, align 0
 410             b[i+1] = (byte)(a[i+1] & mask);
 411             b[i+2] = (byte)(a[i+2] & mask);
 412             b[i+3] = (byte)(a[i+3] & mask);
 413             b[i+4] = (byte)(a[i+4] & mask);
 414             b[i+5] = (byte)(a[i+5] & mask);
 415             b[i+6] = (byte)(a[i+6] & mask);
 416             b[i+7] = (byte)(a[i+7] & mask);
 417         }
 418         return new Object[]{ a, b };
 419     }
 420 
 421     @Test
         // Stride-8 byte loop that masks four adjacent elements at offsets 3..6 of a into b.
         // First IR rule (AlignVector false, MaxVectorSize >= 8): expects VECTOR_SIZE_4 load/and vectors
         // and a STORE_VECTOR. Second IR rule (AlignVector true): expects no vector nodes at all.
         // Both rules apply only on 64-bit platforms with sse4.1 or asimd.
         // Returns { a, b } so the caller can compare both arrays against the gold result.
 422     @IR(counts = {IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE_4, "> 0",
 423                   IRNode.AND_VB,        IRNode.VECTOR_SIZE_4, "> 0",
 424                   IRNode.STORE_VECTOR, "> 0"},
 425         applyIfAnd = {"AlignVector", "false", "MaxVectorSize", ">=8"},
 426         applyIfPlatform = {"64-bit", "true"},
 427         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
 428     @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
 429                   IRNode.AND_VB, "= 0",
 430                   IRNode.STORE_VECTOR, "= 0"},
 431         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
 432         applyIfPlatform = {"64-bit", "true"},
 433         applyIf = {"AlignVector", "true"})
 434     static Object[] test2(byte[] a, byte[] b, byte mask) {
 435         for (int i = 0; i < RANGE; i+=8) {
 436             // Cannot align with AlignVector: (3 + x * 8) % 8 = 3 for every iteration x
 437             b[i+3] = (byte)(a[i+3] & mask); // at alignment 3
 438             b[i+4] = (byte)(a[i+4] & mask);
 439             b[i+5] = (byte)(a[i+5] & mask);
 440             b[i+6] = (byte)(a[i+6] & mask);
 441         }
 442         return new Object[]{ a, b };
 443     }
 444 
 445     @Test
         // Like test2 (four adjacent bytes at offsets 3..6, stride 8), plus one extra access at offset 0.
         // First IR rule (AlignVector false, MaxVectorSize >= 8): expects VECTOR_SIZE_4 load/and vectors
         // and a STORE_VECTOR. Second IR rule (AlignVector true): expects no vector nodes at all.
         // Both rules apply only on 64-bit platforms with sse4.1 or asimd.
         // Returns { a, b } so the caller can compare both arrays against the gold result.
 446     @IR(counts = {IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE_4, "> 0",
 447                   IRNode.AND_VB,        IRNode.VECTOR_SIZE_4, "> 0",
 448                   IRNode.STORE_VECTOR, "> 0"},
 449         applyIfAnd = {"AlignVector", "false", "MaxVectorSize", ">=8"},
 450         applyIfPlatform = {"64-bit", "true"},
 451         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
 452     @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
 453                   IRNode.AND_VB, "= 0",
 454                   IRNode.STORE_VECTOR, "= 0"},
 455         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
 456         applyIfPlatform = {"64-bit", "true"},
 457         applyIf = {"AlignVector", "true"})
 458     static Object[] test3(byte[] a, byte[] b, byte mask) {
 459         for (int i = 0; i < RANGE; i+=8) {
 460             // Cannot align with AlignVector: (3 + x * 8) % 8 = 3 for every iteration x
 461 
 462             // Problematic for AlignVector
 463             b[i+0] = (byte)(a[i+0] & mask); // best_memref, align 0
 464 
 465             b[i+3] = (byte)(a[i+3] & mask); // pack at offset 3 bytes
 466             b[i+4] = (byte)(a[i+4] & mask);
 467             b[i+5] = (byte)(a[i+5] & mask);
 468             b[i+6] = (byte)(a[i+6] & mask);
 469         }
 470         return new Object[]{ a, b };
 471     }
 472 
 473     @Test
         // Unit-stride loop over RANGE/16 groups, indexing with i*16 + k: masks offsets 0..3 (a run of
         // four) and offsets 5..12 (a run of eight) of each 16-byte group of a into b.
         // First IR rule (AlignVector false, MaxVectorSize >= 16): expects both VECTOR_SIZE_4 and
         // VECTOR_SIZE_8 load/and vectors. Second IR rule (AlignVector true, MaxVectorSize >= 16):
         // expects VECTOR_SIZE_4 vectors but no VECTOR_SIZE_8 ones. Both require a STORE_VECTOR and
         // apply only on 64-bit platforms with sse4.1 or asimd.
         // Returns { a, b } so the caller can compare both arrays against the gold result.
 474     @IR(counts = {IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE_4, "> 0",
 475                   IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE_8, "> 0",
 476                   IRNode.AND_VB,        IRNode.VECTOR_SIZE_4, "> 0",
 477                   IRNode.AND_VB,        IRNode.VECTOR_SIZE_8, "> 0",
 478                   IRNode.STORE_VECTOR, "> 0"},
 479         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
 480         applyIfPlatform = {"64-bit", "true"},
 481         applyIfAnd = {"AlignVector", "false", "MaxVectorSize", ">=16"})
 482     @IR(counts = {IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE_4, "> 0",
 483                   IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE_8, "= 0",// unaligned
 484                   IRNode.AND_VB,        IRNode.VECTOR_SIZE_4, "> 0",
 485                   IRNode.AND_VB,        IRNode.VECTOR_SIZE_8, "= 0",// unaligned
 486                   IRNode.STORE_VECTOR, "> 0"},
 487         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
 488         applyIfPlatform = {"64-bit", "true"},
 489         applyIfAnd = {"AlignVector", "true", "MaxVectorSize", ">=16"})
 490     static Object[] test4(byte[] a, byte[] b, byte mask) {
 491         for (int i = 0; i < RANGE/16; i++) {
 492             // Problematic for AlignVector
 493             b[i*16 + 0 ] = (byte)(a[i*16 + 0 ] & mask); // 4 pack, 0 aligned
 494             b[i*16 + 1 ] = (byte)(a[i*16 + 1 ] & mask);
 495             b[i*16 + 2 ] = (byte)(a[i*16 + 2 ] & mask);
 496             b[i*16 + 3 ] = (byte)(a[i*16 + 3 ] & mask);
 497 
 498             b[i*16 + 5 ] = (byte)(a[i*16 + 5 ] & mask); // 8 pack, 5 aligned
 499             b[i*16 + 6 ] = (byte)(a[i*16 + 6 ] & mask);
 500             b[i*16 + 7 ] = (byte)(a[i*16 + 7 ] & mask);
 501             b[i*16 + 8 ] = (byte)(a[i*16 + 8 ] & mask);
 502             b[i*16 + 9 ] = (byte)(a[i*16 + 9 ] & mask);
 503             b[i*16 + 10] = (byte)(a[i*16 + 10] & mask);
 504             b[i*16 + 11] = (byte)(a[i*16 + 11] & mask);
 505             b[i*16 + 12] = (byte)(a[i*16 + 12] & mask);
 506         }
 507         return new Object[]{ a, b };
 508     }
 509 
 510     @Test
         // Same access pattern as test3 (offset 0 plus offsets 3..6, stride 8), but every index also
         // adds the loop-invariant parameter inv. The constructor registers this test with inv = 0.
         // First IR rule (AlignVector false, MaxVectorSize >= 8): expects VECTOR_SIZE_4 load/and vectors
         // and a STORE_VECTOR. Second IR rule (AlignVector true): expects no vector nodes at all.
         // Both rules apply only on 64-bit platforms with sse4.1 or asimd.
         // Returns { a, b } so the caller can compare both arrays against the gold result.
 511     @IR(counts = {IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE_4, "> 0",
 512                   IRNode.AND_VB,        IRNode.VECTOR_SIZE_4, "> 0",
 513                   IRNode.STORE_VECTOR, "> 0"},
 514         applyIfAnd = {"AlignVector", "false", "MaxVectorSize", ">=8"},
 515         applyIfPlatform = {"64-bit", "true"},
 516         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
 517     @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
 518                   IRNode.AND_VB, "= 0",
 519                   IRNode.STORE_VECTOR, "= 0"},
 520         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
 521         applyIfPlatform = {"64-bit", "true"},
 522         applyIf = {"AlignVector", "true"})
 523     static Object[] test5(byte[] a, byte[] b, byte mask, int inv) {
 524         for (int i = 0; i < RANGE; i+=8) {
 525             // Cannot align with AlignVector because of invariant
 526             b[i+inv+0] = (byte)(a[i+inv+0] & mask);
 527 
 528             b[i+inv+3] = (byte)(a[i+inv+3] & mask);
 529             b[i+inv+4] = (byte)(a[i+inv+4] & mask);
 530             b[i+inv+5] = (byte)(a[i+inv+5] & mask);
 531             b[i+inv+6] = (byte)(a[i+inv+6] & mask);
 532         }
 533         return new Object[]{ a, b };
 534     }
 535 
 536     @Test
         // Byte loop with i advancing by 2 up to RANGE/8 and indices of the form i*4 + k, so consecutive
         // iterations are 8 elements apart; each iteration masks offset 0 and offsets 3..6.
         // First IR rule (AlignVector false, MaxVectorSize >= 8): expects VECTOR_SIZE_4 load/and vectors
         // and a STORE_VECTOR. Second IR rule (AlignVector true): expects no vector nodes at all.
         // Both rules apply only on 64-bit platforms with sse4.1 or asimd.
         // Returns { a, b } so the caller can compare both arrays against the gold result.
 537     @IR(counts = {IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE_4, "> 0",
 538                   IRNode.AND_VB,        IRNode.VECTOR_SIZE_4, "> 0",
 539                   IRNode.STORE_VECTOR, "> 0"},
 540         applyIfAnd = {"AlignVector", "false", "MaxVectorSize", ">=8"},
 541         applyIfPlatform = {"64-bit", "true"},
 542         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
 543     @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
 544                   IRNode.AND_VB, "= 0",
 545                   IRNode.STORE_VECTOR, "= 0"},
 546         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
 547         applyIfPlatform = {"64-bit", "true"},
 548         applyIf = {"AlignVector", "true"})
 549     static Object[] test6(byte[] a, byte[] b, byte mask) {
 550         for (int i = 0; i < RANGE/8; i+=2) {
 551             // Cannot align with AlignVector because offset is odd
 552             b[i*4+0] = (byte)(a[i*4+0] & mask);
 553 
 554             b[i*4+3] = (byte)(a[i*4+3] & mask);
 555             b[i*4+4] = (byte)(a[i*4+4] & mask);
 556             b[i*4+5] = (byte)(a[i*4+5] & mask);
 557             b[i*4+6] = (byte)(a[i*4+6] & mask);
 558         }
 559         return new Object[]{ a, b };
 560     }
 561 
 562     @Test
         // Short-array counterpart of test6: i advances by 2 up to RANGE/8, indices are i*4 + k, and each
         // iteration masks offset 0 and offsets 3..6.
         // First IR rule (AlignVector false, MaxVectorSize >= 16, avx2 or asimd): expects VECTOR_SIZE_4
         // LOAD_VECTOR_S / AND_VS and a STORE_VECTOR. Second IR rule (AlignVector true, sse4.1 or asimd):
         // expects no vector nodes at all. Both rules apply only on 64-bit platforms.
         // Returns { a, b } so the caller can compare both arrays against the gold result.
 563     @IR(counts = {IRNode.LOAD_VECTOR_S, IRNode.VECTOR_SIZE_4, "> 0",
 564                   IRNode.AND_VS,        IRNode.VECTOR_SIZE_4, "> 0",
 565                   IRNode.STORE_VECTOR, "> 0"},
 566         applyIfAnd = {"AlignVector", "false", "MaxVectorSize", ">=16"},
 567         applyIfPlatform = {"64-bit", "true"},
 568         applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true"})
 569     @IR(counts = {IRNode.LOAD_VECTOR_S, "= 0",
 570                   IRNode.AND_VS, "= 0",
 571                   IRNode.STORE_VECTOR, "= 0"},
 572         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
 573         applyIfPlatform = {"64-bit", "true"},
 574         applyIf = {"AlignVector", "true"})
 575     static Object[] test7(short[] a, short[] b, short mask) {
 576         for (int i = 0; i < RANGE/8; i+=2) {
 577             // Cannot align with AlignVector because offset is odd
 578             b[i*4+0] = (short)(a[i*4+0] & mask);
 579 
 580             b[i*4+3] = (short)(a[i*4+3] & mask);
 581             b[i*4+4] = (short)(a[i*4+4] & mask);
 582             b[i*4+5] = (short)(a[i*4+5] & mask);
 583             b[i*4+6] = (short)(a[i*4+6] & mask);
 584         }
 585         return new Object[]{ a, b };
 586     }
 587 
 588     @Test
         // Same access pattern as test3 (offset 0 plus offsets 3..6, stride 8), but the loop starts at
         // the parameter init instead of a constant.
         // First IR rule (AlignVector false, MaxVectorSize >= 8): expects VECTOR_SIZE_4 load/and vectors
         // and a STORE_VECTOR. Second IR rule (AlignVector true): expects no vector nodes at all.
         // Both rules apply only on 64-bit platforms with sse4.1 or asimd.
         // Returns { a, b } so the caller can compare both arrays against the gold result.
 589     @IR(counts = {IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE_4, "> 0",
 590                   IRNode.AND_VB,        IRNode.VECTOR_SIZE_4, "> 0",
 591                   IRNode.STORE_VECTOR, "> 0"},
 592         applyIfAnd = {"AlignVector", "false", "MaxVectorSize", ">=8"},
 593         applyIfPlatform = {"64-bit", "true"},
 594         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
 595     @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
 596                   IRNode.AND_VB, "= 0",
 597                   IRNode.STORE_VECTOR, "= 0"},
 598         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
 599         applyIfPlatform = {"64-bit", "true"},
 600         applyIf = {"AlignVector", "true"})
 601     static Object[] test8(byte[] a, byte[] b, byte mask, int init) {
 602         for (int i = init; i < RANGE; i+=8) {
 603             // Cannot align with AlignVector because of invariant (variable init becomes invar)
 604             b[i+0] = (byte)(a[i+0] & mask);
 605 
 606             b[i+3] = (byte)(a[i+3] & mask);
 607             b[i+4] = (byte)(a[i+4] & mask);
 608             b[i+5] = (byte)(a[i+5] & mask);
 609             b[i+6] = (byte)(a[i+6] & mask);
 610         }
 611         return new Object[]{ a, b };
 612     }
 613 
 614     @Test
         // Same access pattern as test3 (offset 0 plus offsets 3..6, stride 8), but the loop starts at the
         // constant 13 and stops before RANGE-8.
         // IR rule (MaxVectorSize >= 8, 64-bit, sse4.1 or asimd): expects VECTOR_SIZE_4 load/and vectors
         // and a STORE_VECTOR; unlike test3, the rule carries no AlignVector condition.
         // Returns { a, b } so the caller can compare both arrays against the gold result.
 615     @IR(counts = {IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE_4, "> 0",
 616                   IRNode.AND_VB,        IRNode.VECTOR_SIZE_4, "> 0",
 617                   IRNode.STORE_VECTOR, "> 0"},
 618         applyIf = {"MaxVectorSize", ">=8"},
 619         applyIfPlatform = {"64-bit", "true"},
 620         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
 621     static Object[] test9(byte[] a, byte[] b, byte mask) {
 622         // known non-zero init value does not affect offset, but has implicit effect on iv
 623         for (int i = 13; i < RANGE-8; i+=8) {
 624             b[i+0] = (byte)(a[i+0] & mask);
 625 
 626             b[i+3] = (byte)(a[i+3] & mask);
 627             b[i+4] = (byte)(a[i+4] & mask);
 628             b[i+5] = (byte)(a[i+5] & mask);
 629             b[i+6] = (byte)(a[i+6] & mask);
 630         }
 631         return new Object[]{ a, b };
 632     }
 633 
 634     @Test
         // Stride-8 byte loop starting at the constant 3 and stopping before RANGE-8; each iteration
         // masks four adjacent elements (offsets 0..3) of a into b.
         // First IR rule (AlignVector false, MaxVectorSize >= 8): expects VECTOR_SIZE_4 load/and vectors
         // and a STORE_VECTOR. Second IR rule (AlignVector true): expects no vector nodes at all.
         // Both rules apply only on 64-bit platforms with sse4.1 or asimd.
         // Returns { a, b } so the caller can compare both arrays against the gold result.
 635     @IR(counts = {IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE_4, "> 0",
 636                   IRNode.AND_VB,        IRNode.VECTOR_SIZE_4, "> 0",
 637                   IRNode.STORE_VECTOR, "> 0"},
 638         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
 639         applyIfPlatform = {"64-bit", "true"},
 640         applyIfAnd = {"AlignVector", "false", "MaxVectorSize", ">=8"})
 641     @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
 642                   IRNode.AND_VB, "= 0",
 643                   IRNode.STORE_VECTOR, "= 0"},
 644         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
 645         applyIfPlatform = {"64-bit", "true"},
 646         applyIf = {"AlignVector", "true"})
 647     static Object[] test10a(byte[] a, byte[] b, byte mask) {
 648         // This is not alignable with pre-loop, because of odd init.
 649         for (int i = 3; i < RANGE-8; i+=8) {
 650             b[i+0] = (byte)(a[i+0] & mask);
 651             b[i+1] = (byte)(a[i+1] & mask);
 652             b[i+2] = (byte)(a[i+2] & mask);
 653             b[i+3] = (byte)(a[i+3] & mask);
 654         }
 655         return new Object[]{ a, b };
 656     }
 657 
 658     @Test
         // Same as test10a (four adjacent bytes at offsets 0..3, stride 8, bound RANGE-8), but the loop
         // starts at the constant 13 instead of 3.
         // First IR rule (AlignVector false, MaxVectorSize >= 8): expects VECTOR_SIZE_4 load/and vectors
         // and a STORE_VECTOR. Second IR rule (AlignVector true): expects no vector nodes at all.
         // Both rules apply only on 64-bit platforms with sse4.1 or asimd.
         // Returns { a, b } so the caller can compare both arrays against the gold result.
 659     @IR(counts = {IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE_4, "> 0",
 660                   IRNode.AND_VB,        IRNode.VECTOR_SIZE_4, "> 0",
 661                   IRNode.STORE_VECTOR, "> 0"},
 662         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
 663         applyIfPlatform = {"64-bit", "true"},
 664         applyIfAnd = {"AlignVector", "false", "MaxVectorSize", ">=8"})
 665     @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
 666                   IRNode.AND_VB, "= 0",
 667                   IRNode.STORE_VECTOR, "= 0"},
 668         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
 669         applyIfPlatform = {"64-bit", "true"},
 670         applyIf = {"AlignVector", "true"})
 671     static Object[] test10b(byte[] a, byte[] b, byte mask) {
 672         // This is not alignable with pre-loop, because of odd init.
 673         // Seems not correctly handled.
 674         for (int i = 13; i < RANGE-8; i+=8) {
 675             b[i+0] = (byte)(a[i+0] & mask);
 676             b[i+1] = (byte)(a[i+1] & mask);
 677             b[i+2] = (byte)(a[i+2] & mask);
 678             b[i+3] = (byte)(a[i+3] & mask);
 679         }
 680         return new Object[]{ a, b };
 681     }
 682 
 683     @Test
 684     @IR(counts = {IRNode.LOAD_VECTOR_S, IRNode.VECTOR_SIZE_4, "> 0",
 685                   IRNode.AND_VS,        IRNode.VECTOR_SIZE_4, "> 0",
 686                   IRNode.STORE_VECTOR, "> 0"},
 687         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
 688         applyIfPlatform = {"64-bit", "true"},
 689         applyIfAnd = {"AlignVector", "false", "MaxVectorSize", ">=16"})
 690     @IR(counts = {IRNode.LOAD_VECTOR_S, "= 0",
 691                   IRNode.AND_VS, "= 0",
 692                   IRNode.STORE_VECTOR, "= 0"},
 693         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
 694         applyIfPlatform = {"64-bit", "true"},
 695         applyIf = {"AlignVector", "true"})
 696     static Object[] test10c(short[] a, short[] b, short mask) {
             // Short variant: b[i+k] = a[i+k] & mask for k = 0..3, with loop init 13 and stride 8.
             // The @IR rules require 4-element short vector nodes when AlignVector is false
             // (MaxVectorSize >= 16) and no vector nodes when AlignVector is true.
             // Writes into b and returns both arrays wrapped in an Object[].
 697         // This is not alignable with pre-loop, because of odd init.
 698         // Seems not correctly handled with MaxVectorSize >= 32.
 699         for (int i = 13; i < RANGE-8; i+=8) {
 700             b[i+0] = (short)(a[i+0] & mask);
 701             b[i+1] = (short)(a[i+1] & mask);
 702             b[i+2] = (short)(a[i+2] & mask);
 703             b[i+3] = (short)(a[i+3] & mask);
 704         }
 705         return new Object[]{ a, b };
 706     }
 707 
 708     @Test
 709     @IR(counts = {IRNode.LOAD_VECTOR_S, IRNode.VECTOR_SIZE_4, "> 0",
 710                   IRNode.AND_VS,        IRNode.VECTOR_SIZE_4, "> 0",
 711                   IRNode.STORE_VECTOR, "> 0"},
 712         applyIf = {"MaxVectorSize", ">=16"},
 713         applyIfPlatform = {"64-bit", "true"},
 714         applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true"})
 715     static Object[] test10d(short[] a, short[] b, short mask) {
             // Short kernel with loop init 13, stride 8 and a constant +3 in every index, so the
             // first element touched is index 16. The single @IR rule has no AlignVector condition:
             // 4-element short vector nodes are required whenever MaxVectorSize >= 16.
             // Writes into b and returns both arrays wrapped in an Object[].
 716         for (int i = 13; i < RANGE-16; i+=8) {
 717             // init + offset -> aligned
 718             b[i+0+3] = (short)(a[i+0+3] & mask);
 719             b[i+1+3] = (short)(a[i+1+3] & mask);
 720             b[i+2+3] = (short)(a[i+2+3] & mask);
 721             b[i+3+3] = (short)(a[i+3+3] & mask);
 722         }
 723         return new Object[]{ a, b };
 724     }
 725 
 726     @Test
 727     @IR(counts = {IRNode.LOAD_VECTOR_B, "> 0",
 728                   IRNode.AND_VB, "> 0",
 729                   IRNode.STORE_VECTOR, "> 0"},
 730         applyIfPlatform = {"64-bit", "true"},
 731         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
 732     static Object[] test11aB(byte[] a, byte[] b, byte mask) {
             // Simplest byte kernel: unit stride from 0, one element per iteration, same index
             // for load and store. The @IR rule has no AlignVector condition, so byte vector
             // nodes are required in both configurations. Returns { a, b } after writing b.
 733         for (int i = 0; i < RANGE; i++) {
 734             // always alignable
 735             b[i+0] = (byte)(a[i+0] & mask);
 736         }
 737         return new Object[]{ a, b };
 738     }
 739 
 740     @Test
 741     @IR(counts = {IRNode.LOAD_VECTOR_S, "> 0",
 742                   IRNode.AND_VS, "> 0",
 743                   IRNode.STORE_VECTOR, "> 0"},
 744         applyIfPlatform = {"64-bit", "true"},
 745         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
 746     static Object[] test11aS(short[] a, short[] b, short mask) {
             // Simplest short kernel: unit stride from 0, one element per iteration, same index
             // for load and store. The @IR rule has no AlignVector condition, so short vector
             // nodes are required in both configurations. Returns { a, b } after writing b.
 747         for (int i = 0; i < RANGE; i++) {
 748             // always alignable
 749             b[i+0] = (short)(a[i+0] & mask);
 750         }
 751         return new Object[]{ a, b };
 752     }
 753 
 754     @Test
 755     @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0",
 756                   IRNode.AND_VI, "> 0",
 757                   IRNode.STORE_VECTOR, "> 0"},
 758         applyIfPlatform = {"64-bit", "true"},
 759         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
 760     static Object[] test11aI(int[] a, int[] b, int mask) {
             // Simplest int kernel: unit stride from 0, one element per iteration, same index
             // for load and store. The @IR rule has no AlignVector condition, so int vector
             // nodes are required in both configurations. Returns { a, b } after writing b.
 761         for (int i = 0; i < RANGE; i++) {
 762             // always alignable
 763             b[i+0] = (int)(a[i+0] & mask);
 764         }
 765         return new Object[]{ a, b };
 766     }
 767 
 768     @Test
 769     @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0",
 770                   IRNode.AND_VL, "> 0",
 771                   IRNode.STORE_VECTOR, "> 0"},
 772         applyIfPlatform = {"64-bit", "true"},
 773         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
 774     static Object[] test11aL(long[] a, long[] b, long mask) {
             // Simplest long kernel: unit stride from 0, one element per iteration, same index
             // for load and store. The @IR rule has no AlignVector condition, so long vector
             // nodes are required in both configurations. Returns { a, b } after writing b.
 775         for (int i = 0; i < RANGE; i++) {
 776             // always alignable
 777             b[i+0] = (long)(a[i+0] & mask);
 778         }
 779         return new Object[]{ a, b };
 780     }
 781 
 782     @Test
 783     @IR(counts = {IRNode.LOAD_VECTOR_B, "> 0",
 784                   IRNode.AND_VB, "> 0",
 785                   IRNode.STORE_VECTOR, "> 0"},
 786         applyIfPlatform = {"64-bit", "true"},
 787         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
 788     static Object[] test11bB(byte[] a, byte[] b, byte mask) {
             // Same byte kernel as the init-0 variant but the loop starts at 1 (element 0 is
             // left untouched). Load and store use the same index. The @IR rule has no
             // AlignVector condition. Returns { a, b } after writing b.
 789         for (int i = 1; i < RANGE; i++) {
 790             // always alignable
 791             b[i+0] = (byte)(a[i+0] & mask);
 792         }
 793         return new Object[]{ a, b };
 794     }
 795 
 796     @Test
 797     @IR(counts = {IRNode.LOAD_VECTOR_S, "> 0",
 798                   IRNode.AND_VS, "> 0",
 799                   IRNode.STORE_VECTOR, "> 0"},
 800         applyIfPlatform = {"64-bit", "true"},
 801         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
 802     static Object[] test11bS(short[] a, short[] b, short mask) {
             // Same short kernel as the init-0 variant but the loop starts at 1 (element 0 is
             // left untouched). Load and store use the same index. The @IR rule has no
             // AlignVector condition. Returns { a, b } after writing b.
 803         for (int i = 1; i < RANGE; i++) {
 804             // always alignable
 805             b[i+0] = (short)(a[i+0] & mask);
 806         }
 807         return new Object[]{ a, b };
 808     }
 809 
 810     @Test
 811     @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0",
 812                   IRNode.AND_VI, "> 0",
 813                   IRNode.STORE_VECTOR, "> 0"},
 814         applyIfPlatform = {"64-bit", "true"},
 815         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
 816     static Object[] test11bI(int[] a, int[] b, int mask) {
             // Same int kernel as the init-0 variant but the loop starts at 1 (element 0 is
             // left untouched). Load and store use the same index. The @IR rule has no
             // AlignVector condition. Returns { a, b } after writing b.
 817         for (int i = 1; i < RANGE; i++) {
 818             // always alignable
 819             b[i+0] = (int)(a[i+0] & mask);
 820         }
 821         return new Object[]{ a, b };
 822     }
 823 
 824     @Test
 825     @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0",
 826                   IRNode.AND_VL, "> 0",
 827                   IRNode.STORE_VECTOR, "> 0"},
 828         applyIfPlatform = {"64-bit", "true"},
 829         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
 830     static Object[] test11bL(long[] a, long[] b, long mask) {
             // Same long kernel as the init-0 variant but the loop starts at 1 (element 0 is
             // left untouched). Load and store use the same index. The @IR rule has no
             // AlignVector condition. Returns { a, b } after writing b.
 831         for (int i = 1; i < RANGE; i++) {
 832             // always alignable
 833             b[i+0] = (long)(a[i+0] & mask);
 834         }
 835         return new Object[]{ a, b };
 836     }
 837 
 838     @Test
 839     @IR(counts = {IRNode.LOAD_VECTOR_B, "> 0",
 840                   IRNode.AND_VB, "> 0",
 841                   IRNode.STORE_VECTOR, "> 0"},
 842         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
 843         applyIfPlatform = {"64-bit", "true"},
 844         applyIf = {"AlignVector", "false"})
 845     @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
 846                   IRNode.AND_VB, "= 0",
 847                   IRNode.STORE_VECTOR, "= 0"},
 848         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
 849         applyIfPlatform = {"64-bit", "true"},
 850         applyIf = {"AlignVector", "true"})
 851     static Object[] test11cB(byte[] a, byte[] b, byte mask) {
             // Byte kernel where the load index (i+1) is one element ahead of the store index (i).
             // The @IR rules require byte vector nodes when AlignVector is false and none when
             // AlignVector is true. The bound RANGE-1 keeps a[i+1] inside the first RANGE elements.
             // Returns { a, b } after writing b.
 852         for (int i = 1; i < RANGE-1; i++) {
 853             // 1 byte offset -> not alignable with AlignVector
 854             b[i+0] = (byte)(a[i+1] & mask);
 855         }
 856         return new Object[]{ a, b };
 857     }
 858 
 859     @Test
 860     @IR(counts = {IRNode.LOAD_VECTOR_S, "> 0",
 861                   IRNode.AND_VS, "> 0",
 862                   IRNode.STORE_VECTOR, "> 0"},
 863         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
 864         applyIfPlatform = {"64-bit", "true"},
 865         applyIf = {"AlignVector", "false"})
 866     @IR(counts = {IRNode.LOAD_VECTOR_S, "= 0",
 867                   IRNode.AND_VS, "= 0",
 868                   IRNode.STORE_VECTOR, "= 0"},
 869         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
 870         applyIfPlatform = {"64-bit", "true"},
 871         applyIf = {"AlignVector", "true"})
 872     static Object[] test11cS(short[] a, short[] b, short mask) {
             // Short kernel where the load index (i+1) is one element ahead of the store index (i).
             // The @IR rules require short vector nodes when AlignVector is false and none when
             // AlignVector is true. The bound RANGE-1 keeps a[i+1] inside the first RANGE elements.
             // Returns { a, b } after writing b.
 873         for (int i = 1; i < RANGE-1; i++) {
 874             // 2 byte offset -> not alignable with AlignVector
 875             b[i+0] = (short)(a[i+1] & mask);
 876         }
 877         return new Object[]{ a, b };
 878     }
 879 
 880     @Test
 881     @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0",
 882                   IRNode.AND_VI, "> 0",
 883                   IRNode.STORE_VECTOR, "> 0"},
 884         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
 885         applyIfPlatform = {"64-bit", "true"},
 886         applyIf = {"AlignVector", "false"})
 887     @IR(counts = {IRNode.LOAD_VECTOR_I, "= 0",
 888                   IRNode.AND_VI, "= 0",
 889                   IRNode.STORE_VECTOR, "= 0"},
 890         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
 891         applyIfPlatform = {"64-bit", "true"},
 892         applyIf = {"AlignVector", "true"})
 893     static Object[] test11cI(int[] a, int[] b, int mask) {
             // Int kernel where the load index (i+1) is one element ahead of the store index (i).
             // The @IR rules require int vector nodes when AlignVector is false and none when
             // AlignVector is true. The bound RANGE-1 keeps a[i+1] inside the first RANGE elements.
             // Returns { a, b } after writing b.
 894         for (int i = 1; i < RANGE-1; i++) {
 895             // 4 byte offset -> not alignable with AlignVector
 896             b[i+0] = (int)(a[i+1] & mask);
 897         }
 898         return new Object[]{ a, b };
 899     }
 900 
 901     @Test
 902     @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0",
 903                   IRNode.AND_VL, "> 0",
 904                   IRNode.STORE_VECTOR, "> 0"},
 905         applyIfPlatform = {"64-bit", "true"},
 906         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
 907     static Object[] test11cL(long[] a, long[] b, long mask) {
             // Long kernel where the load index (i+1) is one element ahead of the store index (i).
             // Unlike the byte/short/int variants there is a single @IR rule without an
             // AlignVector condition: long vector nodes are required in both configurations.
             // Returns { a, b } after writing b.
 908         for (int i = 1; i < RANGE-1; i++) {
 909             // always alignable (8 byte offset)
 910             b[i+0] = (long)(a[i+1] & mask);
 911         }
 912         return new Object[]{ a, b };
 913     }
 914 
 915     @Test
 916     @IR(counts = {IRNode.LOAD_VECTOR_B, "> 0",
 917                   IRNode.AND_VB, "> 0",
 918                   IRNode.STORE_VECTOR, "> 0"},
 919         applyIfPlatform = {"64-bit", "true"},
 920         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
 921     static Object[] test11dB(byte[] a, byte[] b, byte mask, int invar) {
             // Byte kernel whose index carries the extra addend `invar`, which is not modified
             // inside the loop. Load and store use the same index. The @IR rule has no
             // AlignVector condition. The caller must pick invar so that i+invar stays in bounds.
             // Returns { a, b } after writing b.
 922         for (int i = 0; i < RANGE; i++) {
 923             b[i+0+invar] = (byte)(a[i+0+invar] & mask);
 924         }
 925         return new Object[]{ a, b };
 926     }
 927 
 928     @Test
 929     @IR(counts = {IRNode.LOAD_VECTOR_S, "> 0",
 930                   IRNode.AND_VS, "> 0",
 931                   IRNode.STORE_VECTOR, "> 0"},
 932         applyIfPlatform = {"64-bit", "true"},
 933         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
 934     static Object[] test11dS(short[] a, short[] b, short mask, int invar) {
             // Short kernel whose index carries the extra addend `invar`, which is not modified
             // inside the loop. Load and store use the same index. The @IR rule has no
             // AlignVector condition. The caller must pick invar so that i+invar stays in bounds.
             // Returns { a, b } after writing b.
 935         for (int i = 0; i < RANGE; i++) {
 936             b[i+0+invar] = (short)(a[i+0+invar] & mask);
 937         }
 938         return new Object[]{ a, b };
 939     }
 940 
 941     @Test
 942     @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0",
 943                   IRNode.AND_VI, "> 0",
 944                   IRNode.STORE_VECTOR, "> 0"},
 945         applyIfPlatform = {"64-bit", "true"},
 946         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
 947     static Object[] test11dI(int[] a, int[] b, int mask, int invar) {
             // Int kernel whose index carries the extra addend `invar`, which is not modified
             // inside the loop. Load and store use the same index. The @IR rule has no
             // AlignVector condition. The caller must pick invar so that i+invar stays in bounds.
             // Returns { a, b } after writing b.
 948         for (int i = 0; i < RANGE; i++) {
 949             b[i+0+invar] = (int)(a[i+0+invar] & mask);
 950         }
 951         return new Object[]{ a, b };
 952     }
 953 
 954     @Test
 955     @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0",
 956                   IRNode.AND_VL, "> 0",
 957                   IRNode.STORE_VECTOR, "> 0"},
 958         applyIfPlatform = {"64-bit", "true"},
 959         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
 960     static Object[] test11dL(long[] a, long[] b, long mask, int invar) {
             // Long kernel whose index carries the extra addend `invar`, which is not modified
             // inside the loop. Load and store use the same index. The @IR rule has no
             // AlignVector condition. The caller must pick invar so that i+invar stays in bounds.
             // Returns { a, b } after writing b.
 961         for (int i = 0; i < RANGE; i++) {
 962             b[i+0+invar] = (long)(a[i+0+invar] & mask);
 963         }
 964         return new Object[]{ a, b };
 965     }
 966 
 967     @Test
 968     @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
 969                   IRNode.AND_VB, "= 0",
 970                   IRNode.STORE_VECTOR, "= 0"},
 971         applyIfPlatform = {"64-bit", "true"},
 972         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
 973     static Object[] test12(byte[] a, byte[] b, byte mask) {
             // Byte kernel with index scale 6: each iteration handles elements i*6+0 .. i*6+3,
             // leaving i*6+4 and i*6+5 untouched. The @IR rule pins the current behavior by
             // requiring zero vector nodes, with no AlignVector condition.
             // Returns { a, b } after writing b.
 974         for (int i = 0; i < RANGE/16; i++) {
 975             // Currently does not vectorize at all
 976             b[i*6 + 0 ] = (byte)(a[i*6 + 0 ] & mask);
 977             b[i*6 + 1 ] = (byte)(a[i*6 + 1 ] & mask);
 978             b[i*6 + 2 ] = (byte)(a[i*6 + 2 ] & mask);
 979             b[i*6 + 3 ] = (byte)(a[i*6 + 3 ] & mask);
 980         }
 981         return new Object[]{ a, b };
 982     }
 983 
 984     @Test
 985     @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE + "min(max_int, max_long)", "> 0",
 986                   IRNode.LOAD_VECTOR_L, IRNode.VECTOR_SIZE + "min(max_int, max_long)", "> 0",
 987                   IRNode.ADD_VI, IRNode.VECTOR_SIZE + "min(max_int, max_long)", "> 0",
 988                   IRNode.ADD_VL, IRNode.VECTOR_SIZE + "min(max_int, max_long)", "> 0",
 989                   IRNode.STORE_VECTOR, "> 0"},
 990         applyIfPlatform = {"64-bit", "true"},
 991         applyIfCPUFeatureOr = {"avx2", "true"})
 992     // require avx2 (see applyIfCPUFeatureOr) to ensure vectors are larger than what unrolling produces
 993     static Object[] test13aIL(int[] a, long[] b) {
             // Mixed-type kernel: increments an int array and a long array at the same index in
             // one loop, starting at 0. The @IR rule requires int and long vector nodes whose size
             // is given by the "min(max_int, max_long)" size expression (presumably the smaller of
             // the two maximal element counts -- confirm against the IR framework).
             // Returns { a, b }; both arrays are modified in place.
 994         for (int i = 0; i < RANGE; i++) {
 995             a[i]++;
 996             b[i]++;
 997         }
 998         return new Object[]{ a, b };
 999     }
1000 
1001     @Test
1002     @IR(counts = {IRNode.LOAD_VECTOR_B, "> 0",
1003                   IRNode.LOAD_VECTOR_I, "> 0",
1004                   IRNode.ADD_VB, "> 0",
1005                   IRNode.ADD_VI, "> 0",
1006                   IRNode.STORE_VECTOR, "> 0"},
1007         applyIfPlatform = {"64-bit", "true"},
1008         applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true"})
1009     static Object[] test13aIB(int[] a, byte[] b) {
             // Mixed-type kernel: increments an int array and a byte array at the same index in
             // one loop, starting at 0. The @IR rule requires both byte and int vector nodes and
             // has no AlignVector condition. Returns { a, b }; both are modified in place.
1010         for (int i = 0; i < RANGE; i++) {
1011             a[i]++;
1012             b[i]++;
1013         }
1014         return new Object[]{ a, b };
1015     }
1016 
1017     @Test
1018     @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0",
1019                   IRNode.LOAD_VECTOR_S, "> 0",
1020                   IRNode.ADD_VI, "> 0",
1021                   IRNode.ADD_VS, "> 0",
1022                   IRNode.STORE_VECTOR, "> 0"},
1023         applyIfPlatform = {"64-bit", "true"},
1024         applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true"})
1025     static Object[] test13aIS(int[] a, short[] b) {
             // Mixed-type kernel: increments an int array and a short array at the same index in
             // one loop, starting at 0. The @IR rule requires both int and short vector nodes and
             // has no AlignVector condition. Returns { a, b }; both are modified in place.
1026         for (int i = 0; i < RANGE; i++) {
1027             a[i]++;
1028             b[i]++;
1029         }
1030         return new Object[]{ a, b };
1031     }
1032 
1033     @Test
1034     @IR(counts = {IRNode.LOAD_VECTOR_B, "> 0",
1035                   IRNode.LOAD_VECTOR_S, "> 0",
1036                   IRNode.LOAD_VECTOR_I, "> 0",
1037                   IRNode.LOAD_VECTOR_L, "> 0",
1038                   IRNode.ADD_VB, "> 0",
1039                   IRNode.ADD_VS, "> 0",
1040                   IRNode.ADD_VI, "> 0",
1041                   IRNode.ADD_VL, "> 0",
1042                   IRNode.STORE_VECTOR, "> 0"},
1043         applyIfPlatform = {"64-bit", "true"},
1044         applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true"})
1045     static Object[] test13aBSIL(byte[] a, short[] b, int[] c, long[] d) {
             // Mixed-type kernel: increments byte, short, int and long arrays at the same index
             // in one loop, starting at 0. The @IR rule requires vector load and add nodes for
             // all four element types and has no AlignVector condition.
             // Returns { a, b, c, d }; all four arrays are modified in place.
1046         for (int i = 0; i < RANGE; i++) {
1047             a[i]++;
1048             b[i]++;
1049             c[i]++;
1050             d[i]++;
1051         }
1052         return new Object[]{ a, b, c, d };
1053     }
1054 
1055     @Test
1056     @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE + "min(max_int, max_long)", "> 0",
1057                   IRNode.LOAD_VECTOR_L, IRNode.VECTOR_SIZE + "min(max_int, max_long)", "> 0",
1058                   IRNode.ADD_VI, IRNode.VECTOR_SIZE + "min(max_int, max_long)", "> 0",
1059                   IRNode.ADD_VL, IRNode.VECTOR_SIZE + "min(max_int, max_long)", "> 0",
1060                   IRNode.STORE_VECTOR, "> 0"},
1061         applyIfPlatform = {"64-bit", "true"},
1062         applyIfCPUFeatureOr = {"avx2", "true"})
1063     // require avx2 (see applyIfCPUFeatureOr) to ensure vectors are larger than what unrolling produces
1064     static Object[] test13bIL(int[] a, long[] b) {
             // Mixed-type kernel: increments an int array and a long array at the same index in
             // one loop, starting at 1 (element 0 is left untouched). The @IR rule requires int
             // and long vector nodes sized by the "min(max_int, max_long)" size expression
             // (presumably the smaller of the two maximal element counts -- confirm against the
             // IR framework). Returns { a, b }; both arrays are modified in place.
1065         for (int i = 1; i < RANGE; i++) {
1066             a[i]++;
1067             b[i]++;
1068         }
1069         return new Object[]{ a, b };
1070     }
1071 
1072     @Test
1073     @IR(counts = {IRNode.LOAD_VECTOR_B, "> 0",
1074                   IRNode.LOAD_VECTOR_I, "> 0",
1075                   IRNode.ADD_VB, "> 0",
1076                   IRNode.ADD_VI, "> 0",
1077                   IRNode.STORE_VECTOR, "> 0"},
1078         applyIfPlatform = {"64-bit", "true"},
1079         applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true"})
1080     static Object[] test13bIB(int[] a, byte[] b) {
             // Mixed-type kernel: increments an int array and a byte array at the same index in
             // one loop, starting at 1 (element 0 is left untouched). The @IR rule requires both
             // byte and int vector nodes and has no AlignVector condition.
             // Returns { a, b }; both are modified in place.
1081         for (int i = 1; i < RANGE; i++) {
1082             a[i]++;
1083             b[i]++;
1084         }
1085         return new Object[]{ a, b };
1086     }
1087 
1088     @Test
1089     @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0",
1090                   IRNode.LOAD_VECTOR_S, "> 0",
1091                   IRNode.ADD_VI, "> 0",
1092                   IRNode.ADD_VS, "> 0",
1093                   IRNode.STORE_VECTOR, "> 0"},
1094         applyIfPlatform = {"64-bit", "true"},
1095         applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true"})
1096     static Object[] test13bIS(int[] a, short[] b) {
             // Mixed-type kernel: increments an int array and a short array at the same index in
             // one loop, starting at 1 (element 0 is left untouched). The @IR rule requires both
             // int and short vector nodes and has no AlignVector condition.
             // Returns { a, b }; both are modified in place.
1097         for (int i = 1; i < RANGE; i++) {
1098             a[i]++;
1099             b[i]++;
1100         }
1101         return new Object[]{ a, b };
1102     }
1103 
1104     @Test
1105     @IR(counts = {IRNode.LOAD_VECTOR_B, "> 0",
1106                   IRNode.LOAD_VECTOR_S, "> 0",
1107                   IRNode.LOAD_VECTOR_I, "> 0",
1108                   IRNode.LOAD_VECTOR_L, "> 0",
1109                   IRNode.ADD_VB, "> 0",
1110                   IRNode.ADD_VS, "> 0",
1111                   IRNode.ADD_VI, "> 0",
1112                   IRNode.ADD_VL, "> 0",
1113                   IRNode.STORE_VECTOR, "> 0"},
1114         applyIfPlatform = {"64-bit", "true"},
1115         applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true"})
1116     static Object[] test13bBSIL(byte[] a, short[] b, int[] c, long[] d) {
             // Mixed-type kernel: increments byte, short, int and long arrays at the same index
             // in one loop, starting at 1 (element 0 is left untouched). The @IR rule requires
             // vector load and add nodes for all four element types, with no AlignVector condition.
             // Returns { a, b, c, d }; all four arrays are modified in place.
1117         for (int i = 1; i < RANGE; i++) {
1118             a[i]++;
1119             b[i]++;
1120             c[i]++;
1121             d[i]++;
1122         }
1123         return new Object[]{ a, b, c, d };
1124     }
1125 
1126     @Test
1127     @IR(counts = {IRNode.LOAD_VECTOR_B, "> 0",
1128                   IRNode.ADD_VB, "> 0",
1129                   IRNode.STORE_VECTOR, "> 0"},
1130         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
1131         applyIfPlatform = {"64-bit", "true"},
1132         applyIf = {"AlignVector", "false"})
1133     @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
1134                   IRNode.ADD_VB, "= 0",
1135                   IRNode.STORE_VECTOR, "= 0"},
1136         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
1137         applyIfPlatform = {"64-bit", "true"},
1138         applyIf = {"AlignVector", "true"})
1139     static Object[] test14aB(byte[] a) {
             // Increments 16 consecutive bytes a[i+0..i+15] per iteration while i advances by 9,
             // so the last 7 elements of one iteration are incremented again by the next one.
             // The @IR rules require byte vector nodes when AlignVector is false and none when
             // AlignVector is true. Returns { a }, modified in place.
1140         // non-power-of-2 stride
1141         for (int i = 0; i < RANGE-20; i+=9) {
1142             a[i+0]++;
1143             a[i+1]++;
1144             a[i+2]++;
1145             a[i+3]++;
1146             a[i+4]++;
1147             a[i+5]++;
1148             a[i+6]++;
1149             a[i+7]++;
1150             a[i+8]++;
1151             a[i+9]++;
1152             a[i+10]++;
1153             a[i+11]++;
1154             a[i+12]++;
1155             a[i+13]++;
1156             a[i+14]++;
1157             a[i+15]++;
1158         }
1159         return new Object[]{ a };
1160     }
1161 
1162     @Test
1163     @IR(counts = {IRNode.LOAD_VECTOR_B, "> 0",
1164                   IRNode.ADD_VB, "> 0",
1165                   IRNode.STORE_VECTOR, "> 0"},
1166         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
1167         applyIfPlatform = {"64-bit", "true"},
1168         applyIf = {"AlignVector", "false"})
1169     @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
1170                   IRNode.ADD_VB, "= 0",
1171                   IRNode.STORE_VECTOR, "= 0"},
1172         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
1173         applyIfPlatform = {"64-bit", "true"},
1174         applyIf = {"AlignVector", "true"})
1175     static Object[] test14bB(byte[] a) {
             // Increments 16 consecutive bytes a[i+0..i+15] per iteration while i advances by 3,
             // so the last 13 elements of one iteration are incremented again by the next one.
             // The @IR rules require byte vector nodes when AlignVector is false and none when
             // AlignVector is true. Returns { a }, modified in place.
1176         // non-power-of-2 stride
1177         for (int i = 0; i < RANGE-20; i+=3) {
1178             a[i+0]++;
1179             a[i+1]++;
1180             a[i+2]++;
1181             a[i+3]++;
1182             a[i+4]++;
1183             a[i+5]++;
1184             a[i+6]++;
1185             a[i+7]++;
1186             a[i+8]++;
1187             a[i+9]++;
1188             a[i+10]++;
1189             a[i+11]++;
1190             a[i+12]++;
1191             a[i+13]++;
1192             a[i+14]++;
1193             a[i+15]++;
1194         }
1195         return new Object[]{ a };
1196     }
1197 
1198     @Test
1199     @IR(counts = {IRNode.LOAD_VECTOR_B, "> 0",
1200                   IRNode.ADD_VB, "> 0",
1201                   IRNode.STORE_VECTOR, "> 0"},
1202         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
1203         applyIfPlatform = {"64-bit", "true"},
1204         applyIf = {"AlignVector", "false"})
1205     @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
1206                   IRNode.ADD_VB, "= 0",
1207                   IRNode.STORE_VECTOR, "= 0"},
1208         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
1209         applyIfPlatform = {"64-bit", "true"},
1210         applyIf = {"AlignVector", "true"})
1211     static Object[] test14cB(byte[] a) {
             // Increments 16 consecutive bytes a[i+0..i+15] per iteration while i advances by 5,
             // so the last 11 elements of one iteration are incremented again by the next one.
             // The @IR rules require byte vector nodes when AlignVector is false and none when
             // AlignVector is true. Returns { a }, modified in place.
1212         // non-power-of-2 stride
1213         for (int i = 0; i < RANGE-20; i+=5) {
1214             a[i+0]++;
1215             a[i+1]++;
1216             a[i+2]++;
1217             a[i+3]++;
1218             a[i+4]++;
1219             a[i+5]++;
1220             a[i+6]++;
1221             a[i+7]++;
1222             a[i+8]++;
1223             a[i+9]++;
1224             a[i+10]++;
1225             a[i+11]++;
1226             a[i+12]++;
1227             a[i+13]++;
1228             a[i+14]++;
1229             a[i+15]++;
1230         }
1231         return new Object[]{ a };
1232     }
1233 
1234     @Test
1235     // IR rules difficult because of modulo wrapping with offset after peeling.
1236     static Object[] test15aB(byte[] a) {
             // Increments 16 consecutive bytes starting at 53*i per iteration (unit stride,
             // index scale 53, so the groups of consecutive iterations do not overlap).
             // No @IR rule is attached; only the returned array content can be checked.
             // Returns { a }, modified in place.
1237         // non-power-of-2 scale
1238         for (int i = 0; i < RANGE/64-20; i++) {
1239             a[53*i+0]++;
1240             a[53*i+1]++;
1241             a[53*i+2]++;
1242             a[53*i+3]++;
1243             a[53*i+4]++;
1244             a[53*i+5]++;
1245             a[53*i+6]++;
1246             a[53*i+7]++;
1247             a[53*i+8]++;
1248             a[53*i+9]++;
1249             a[53*i+10]++;
1250             a[53*i+11]++;
1251             a[53*i+12]++;
1252             a[53*i+13]++;
1253             a[53*i+14]++;
1254             a[53*i+15]++;
1255         }
1256         return new Object[]{ a };
1257     }
1258 
1259     @Test
1260     // IR rules difficult because of modulo wrapping with offset after peeling.
1261     static Object[] test15bB(byte[] a) {
             // Increments 16 consecutive bytes starting at 25*i per iteration (unit stride,
             // index scale 25, so the groups of consecutive iterations do not overlap).
             // No @IR rule is attached; only the returned array content can be checked.
             // Returns { a }, modified in place.
1262         // non-power-of-2 scale
1263         for (int i = 0; i < RANGE/64-20; i++) {
1264             a[25*i+0]++;
1265             a[25*i+1]++;
1266             a[25*i+2]++;
1267             a[25*i+3]++;
1268             a[25*i+4]++;
1269             a[25*i+5]++;
1270             a[25*i+6]++;
1271             a[25*i+7]++;
1272             a[25*i+8]++;
1273             a[25*i+9]++;
1274             a[25*i+10]++;
1275             a[25*i+11]++;
1276             a[25*i+12]++;
1277             a[25*i+13]++;
1278             a[25*i+14]++;
1279             a[25*i+15]++;
1280         }
1281         return new Object[]{ a };
1282     }
1283 
1284     @Test
1285     // IR rules difficult because of modulo wrapping with offset after peeling.
1286     static Object[] test15cB(byte[] a) {
             // Increments 16 consecutive bytes starting at 19*i per iteration (unit stride,
             // index scale 19, so the groups of consecutive iterations do not overlap).
             // No @IR rule is attached; only the returned array content can be checked.
             // Returns { a }, modified in place.
1287         // non-power-of-2 scale
1288         for (int i = 0; i < RANGE/64-20; i++) {
1289             a[19*i+0]++;
1290             a[19*i+1]++;
1291             a[19*i+2]++;
1292             a[19*i+3]++;
1293             a[19*i+4]++;
1294             a[19*i+5]++;
1295             a[19*i+6]++;
1296             a[19*i+7]++;
1297             a[19*i+8]++;
1298             a[19*i+9]++;
1299             a[19*i+10]++;
1300             a[19*i+11]++;
1301             a[19*i+12]++;
1302             a[19*i+13]++;
1303             a[19*i+14]++;
1304             a[19*i+15]++;
1305         }
1306         return new Object[]{ a };
1307     }
1308 
1309     @Test
1310     static Object[] test16a(byte[] a, short[] b) {
             // Index scale 2 on two arrays of different element type: 15 consecutive byte
             // increments (2*i+0 .. 2*i+14) followed by 4 consecutive short increments
             // (2*i+0 .. 2*i+3) per iteration, so consecutive iterations overlap on both arrays.
             // No @IR rule is attached; the existing note below suggests this is a regression
             // kernel for a compiler hang (TODO confirm against the referenced bug).
             // Returns { a, b }; both are modified in place.
1311         // infinite loop issues
1312         for (int i = 0; i < RANGE/2-20; i++) {
1313             a[2*i+0]++;
1314             a[2*i+1]++;
1315             a[2*i+2]++;
1316             a[2*i+3]++;
1317             a[2*i+4]++;
1318             a[2*i+5]++;
1319             a[2*i+6]++;
1320             a[2*i+7]++;
1321             a[2*i+8]++;
1322             a[2*i+9]++;
1323             a[2*i+10]++;
1324             a[2*i+11]++;
1325             a[2*i+12]++;
1326             a[2*i+13]++;
1327             a[2*i+14]++;
1328 
1329             b[2*i+0]++;
1330             b[2*i+1]++;
1331             b[2*i+2]++;
1332             b[2*i+3]++;
1333         }
1334         return new Object[]{ a, b };
1335     }
1336 
1337     @Test
1338     static Object[] test16b(byte[] a) {
             // Index scale 2 on a single byte array: 15 consecutive increments
             // (2*i+0 .. 2*i+14) per iteration, so consecutive iterations overlap.
             // No @IR rule is attached; the existing note below suggests this is a regression
             // kernel for a compiler hang (TODO confirm against the referenced bug).
             // Returns { a }, modified in place.
1339         // infinite loop issues
1340         for (int i = 0; i < RANGE/2-20; i++) {
1341             a[2*i+0]++;
1342             a[2*i+1]++;
1343             a[2*i+2]++;
1344             a[2*i+3]++;
1345             a[2*i+4]++;
1346             a[2*i+5]++;
1347             a[2*i+6]++;
1348             a[2*i+7]++;
1349             a[2*i+8]++;
1350             a[2*i+9]++;
1351             a[2*i+10]++;
1352             a[2*i+11]++;
1353             a[2*i+12]++;
1354             a[2*i+13]++;
1355             a[2*i+14]++;
1356         }
1357         return new Object[]{ a };
1358     }
1359 
1360     @Test
1361     @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0",
1362                   IRNode.ADD_VL, "> 0",
1363                   IRNode.STORE_VECTOR, "> 0"},
1364         applyIfPlatform = {"64-bit", "true"},
1365         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
1366     static Object[] test17a(long[] a) {
             // Increments every long of `a` through Unsafe instead of array indexing: the address
             // is the long-array base offset plus 8 * i, i.e. exactly element i.
             // The @IR rule requires long vector nodes and has no AlignVector condition.
             // Returns { a }, modified in place.
1367         // Unsafe: vectorizes with profiling (not xcomp)
1368         for (int i = 0; i < RANGE; i++) {
1369             int adr = UNSAFE.ARRAY_LONG_BASE_OFFSET + 8 * i;
1370             long v = UNSAFE.getLongUnaligned(a, adr);
1371             UNSAFE.putLongUnaligned(a, adr, v + 1);
1372         }
1373         return new Object[]{ a };
1374     }
1375 
1376     @Test
1377     // Difficult to write good IR rule. Modulo calculus overflow can create non-power-of-2 packs.
1378     static Object[] test17b(long[] a) {
             // Unsafe read-modify-write of 8-byte values at base offset + 8 * i + 1, i.e. every
             // access is shifted one byte past an element boundary and straddles two longs.
             // The bound RANGE-1 keeps the last access (which reaches into element i+1) in range.
             // No @IR rule is attached. Returns { a }, modified in place.
1379         // Not alignable
1380         for (int i = 0; i < RANGE-1; i++) {
1381             int adr = UNSAFE.ARRAY_LONG_BASE_OFFSET + 8 * i + 1;
1382             long v = UNSAFE.getLongUnaligned(a, adr);
1383             UNSAFE.putLongUnaligned(a, adr, v + 1);
1384         }
1385         return new Object[]{ a };
1386     }
1387 
1388     @Test
1389     @IR(counts = {IRNode.LOAD_VECTOR_L, IRNode.VECTOR_SIZE_2, "> 0",
1390                   IRNode.ADD_VL,        IRNode.VECTOR_SIZE_2, "> 0",
1391                   IRNode.STORE_VECTOR, "> 0"},
1392         applyIf = {"MaxVectorSize", ">=32"},
1393         applyIfPlatform = {"64-bit", "true"},
1394         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
1395     static Object[] test17c(long[] a) {
             // Unsafe read-modify-write of two adjacent longs (elements i and i+1) per iteration
             // with stride 4, so elements i+2 and i+3 are left untouched. Both loads are issued
             // before both stores. The @IR rule requires 2-element long vector nodes when
             // MaxVectorSize >= 32 and has no AlignVector condition.
             // Returns { a }, modified in place.
1396         // Unsafe: aligned vectorizes
1397         for (int i = 0; i < RANGE-1; i+=4) {
1398             int adr = UNSAFE.ARRAY_LONG_BASE_OFFSET + 8 * i;
1399             long v0 = UNSAFE.getLongUnaligned(a, adr + 0);
1400             long v1 = UNSAFE.getLongUnaligned(a, adr + 8);
1401             UNSAFE.putLongUnaligned(a, adr + 0, v0 + 1);
1402             UNSAFE.putLongUnaligned(a, adr + 8, v1 + 1);
1403         }
1404         return new Object[]{ a };
1405     }
1406 
1407     @Test
1408     @IR(counts = {IRNode.LOAD_VECTOR_L, IRNode.VECTOR_SIZE_2, "> 0",
1409                   IRNode.ADD_VL,        IRNode.VECTOR_SIZE_2, "> 0",
1410                   IRNode.STORE_VECTOR, "> 0"},
1411         applyIfCPUFeatureOr = {"avx512", "true", "asimd", "true"},
1412         applyIfPlatform = {"64-bit", "true"},
1413         applyIfAnd = {"AlignVector", "false", "MaxVectorSize", ">=64"})
1414     // Ensure vector width is large enough to fit 64 byte for longs:
1415     // The offsets are: 25, 33, 57, 65
1416     // In modulo 32:    25,  1, 25,  1  -> does not vectorize
1417     // In modulo 64:    25, 33, 57,  1  -> at least first pair vectorizes
1418     // This problem is because we compute modulo vector width in memory_alignment.
1419     @IR(counts = {IRNode.LOAD_VECTOR_L, "= 0",
1420                   IRNode.ADD_VL, "= 0",
1421                   IRNode.STORE_VECTOR, "= 0"},
1422         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
1423         applyIfPlatform = {"64-bit", "true"},
1424         applyIf = {"AlignVector", "true"})
1425     static Object[] test17d(long[] a) {
             // Same two-adjacent-longs, stride-4 Unsafe kernel as the aligned variant, but every
             // address is shifted by one byte (base offset + 8 * i + 1), so each 8-byte access
             // straddles two array elements. The first @IR rule requires 2-element long vector
             // nodes when AlignVector is false and MaxVectorSize >= 64; the second requires no
             // vector nodes when AlignVector is true. Returns { a }, modified in place.
1426         // Not alignable
1427         for (int i = 0; i < RANGE-1; i+=4) {
1428             int adr = UNSAFE.ARRAY_LONG_BASE_OFFSET + 8 * i + 1;
1429             long v0 = UNSAFE.getLongUnaligned(a, adr + 0);
1430             long v1 = UNSAFE.getLongUnaligned(a, adr + 8);
1431             UNSAFE.putLongUnaligned(a, adr + 0, v0 + 1);
1432             UNSAFE.putLongUnaligned(a, adr + 8, v1 + 1);
1433         }
1434         return new Object[]{ a };
1435     }
1436 
1437     @Test
1438     static Object[] test18a(byte[] a, int[] b) {
             // Each iteration stores to the constant byte indices a[0] and a[1] (no dependence
             // on the loop variable) around a store to b[i]. No @IR rule is attached.
             // After the loop a[0] == a[1] == 1 and b[0..RANGE-1] == 2. Returns { a, b }.
1439         // scale = 0  -->  no iv
1440         for (int i = 0; i < RANGE; i++) {
1441             a[0] = 1;
1442             b[i] = 2;
1443             a[1] = 1;
1444         }
1445         return new Object[]{ a, b };
1446     }
1447 
1448     @Test
1449     static Object[] test18b(byte[] a, int[] b) {
             // Each iteration stores to the constant byte indices a[1] and a[2] (no dependence
             // on the loop variable) around a store to b[i]. No @IR rule is attached.
             // After the loop a[1] == a[2] == 1 and b[0..RANGE-1] == 2. Returns { a, b }.
1450         // scale = 0  -->  no iv
1451         for (int i = 0; i < RANGE; i++) {
1452             a[1] = 1;
1453             b[i] = 2;
1454             a[2] = 1;
1455         }
1456         return new Object[]{ a, b };
1457     }
1458 
1459     @Test
1460     static Object[] test19(int[] a, int[] b) {
             // Copies b into a with a down-counting loop variable: i runs 5000 .. 1, and the
             // index RANGE_FINAL - i therefore ascends from RANGE_FINAL-5000 to RANGE_FINAL-1.
             // No @IR rule is attached. Returns { a, b }; only a is written.
1461         for (int i = 5000; i > 0; i--) {
1462             a[RANGE_FINAL - i] = b[RANGE_FINAL - i];
1463         }
1464         return new Object[]{ a, b };
1465     }
1466 
1467     @Test
1468     static Object[] test20(byte[] a) {
             // Increments 4 consecutive bytes starting at 2*i + 30 per iteration, with i starting
             // at 1; index scale 2 means consecutive iterations overlap by two elements.
             // No @IR rule is attached. Returns { a }, modified in place.
1469         // Example where it is easy to pass alignment check,
1470         // but used to fail the alignment calculation
1471         for (int i = 1; i < RANGE/2-50; i++) {
1472             a[2*i+0+30]++;
1473             a[2*i+1+30]++;
1474             a[2*i+2+30]++;
1475             a[2*i+3+30]++;
1476         }
1477         return new Object[]{ a };
1478     }
1479 }