1 /*
   2  * Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  */
  23 
  24 package compiler.loopopts.superword;
  25 
  26 import compiler.lib.ir_framework.*;
  27 import jdk.test.lib.Utils;
  28 import jdk.test.whitebox.WhiteBox;
  29 import jdk.internal.misc.Unsafe;
  30 import java.lang.reflect.Array;
  31 import java.util.Map;
  32 import java.util.HashMap;
  33 import java.util.Random;
  34 import java.nio.ByteOrder;
  35 
  36 /*
  37  * @test id=NoAlignVector
  38  * @bug 8310190
  39  * @summary Test AlignVector with various loop init, stride, scale, invar, etc.
  40  * @modules java.base/jdk.internal.misc
  41  * @library /test/lib /
  42  * @requires vm.compiler2.enabled
  43  * @run driver compiler.loopopts.superword.TestAlignVector NoAlignVector
  44  */
  45 
  46 /*
  47  * @test id=AlignVector
  48  * @bug 8310190
  49  * @summary Test AlignVector with various loop init, stride, scale, invar, etc.
  50  * @modules java.base/jdk.internal.misc
  51  * @library /test/lib /
  52  * @requires vm.compiler2.enabled
  53  * @run driver compiler.loopopts.superword.TestAlignVector AlignVector
  54  */
  55 
  56 /*
  57  * @test id=VerifyAlignVector
  58  * @bug 8310190
  59  * @summary Test AlignVector with various loop init, stride, scale, invar, etc.
  60  * @modules java.base/jdk.internal.misc
  61  * @library /test/lib /
  62  * @requires vm.compiler2.enabled
  63  * @run driver compiler.loopopts.superword.TestAlignVector VerifyAlignVector
  64  */
  65 
  66 public class TestAlignVector {
  67     static int RANGE = 1024*8;
  68     static int RANGE_FINAL = 1024*8;
  69     private static final Unsafe UNSAFE = Unsafe.getUnsafe();
  70     private static final Random RANDOM = Utils.getRandomInstance();
  71 
  72     // Inputs
  73     byte[] aB;
  74     byte[] bB;
  75     byte mB = (byte)31;
  76     short[] aS;
  77     short[] bS;
  78     short mS = (short)0xF0F0;
  79     int[] aI;
  80     int[] bI;
  81     int mI = 0xF0F0F0F0;
  82     long[] aL;
  83     long[] bL;
  84     long mL = 0xF0F0F0F0F0F0F0F0L;
  85 
  86     // List of tests
  87     Map<String,TestFunction> tests = new HashMap<String,TestFunction>();
  88 
  89     // List of gold, the results from the first run before compilation
  90     Map<String,Object[]> golds = new HashMap<String,Object[]>();
  91 
  92     interface TestFunction {
  93         Object[] run();
  94     }
  95 
  96     public static void main(String[] args) {
  97         TestFramework framework = new TestFramework(TestAlignVector.class);
  98         framework.addFlags("--add-modules", "java.base", "--add-exports", "java.base/jdk.internal.misc=ALL-UNNAMED",
  99                            "-XX:LoopUnrollLimit=250");
 100 
 101         switch (args[0]) {
 102             case "NoAlignVector"     -> { framework.addFlags("-XX:-AlignVector"); }
 103             case "AlignVector"       -> { framework.addFlags("-XX:+AlignVector"); }
 104             case "VerifyAlignVector" -> { framework.addFlags("-XX:+AlignVector", "-XX:+IgnoreUnrecognizedVMOptions", "-XX:+VerifyAlignVector"); }
 105             default -> { throw new RuntimeException("Test argument not recognized: " + args[0]); }
 106         }
 107         framework.start();
 108     }
 109 
 110     public TestAlignVector() {
 111         // Generate input once
 112         aB = generateB();
 113         bB = generateB();
 114         aS = generateS();
 115         bS = generateS();
 116         aI = generateI();
 117         bI = generateI();
 118         aL = generateL();
 119         bL = generateL();
 120 
 121         // Add all tests to list
 122         tests.put("test0",       () -> { return test0(aB.clone(), bB.clone(), mB); });
 123         tests.put("test1",       () -> { return test1(aB.clone(), bB.clone(), mB); });
 124         tests.put("test2",       () -> { return test2(aB.clone(), bB.clone(), mB); });
 125         tests.put("test3",       () -> { return test3(aB.clone(), bB.clone(), mB); });
 126         tests.put("test4",       () -> { return test4(aB.clone(), bB.clone(), mB); });
 127         tests.put("test5",       () -> { return test5(aB.clone(), bB.clone(), mB, 0); });
 128         tests.put("test6",       () -> { return test6(aB.clone(), bB.clone(), mB); });
 129         tests.put("test7",       () -> { return test7(aS.clone(), bS.clone(), mS); });
 130         tests.put("test8",       () -> { return test8(aB.clone(), bB.clone(), mB, 0); });
 131         tests.put("test8",       () -> { return test8(aB.clone(), bB.clone(), mB, 1); });
 132         tests.put("test9",       () -> { return test9(aB.clone(), bB.clone(), mB); });
 133 
 134         tests.put("test10a",     () -> { return test10a(aB.clone(), bB.clone(), mB); });
 135         tests.put("test10b",     () -> { return test10b(aB.clone(), bB.clone(), mB); });
 136         tests.put("test10c",     () -> { return test10c(aS.clone(), bS.clone(), mS); });
 137         tests.put("test10d",     () -> { return test10d(aS.clone(), bS.clone(), mS); });
 138 
 139         tests.put("test11aB",    () -> { return test11aB(aB.clone(), bB.clone(), mB); });
 140         tests.put("test11aS",    () -> { return test11aS(aS.clone(), bS.clone(), mS); });
 141         tests.put("test11aI",    () -> { return test11aI(aI.clone(), bI.clone(), mI); });
 142         tests.put("test11aL",    () -> { return test11aL(aL.clone(), bL.clone(), mL); });
 143 
 144         tests.put("test11bB",    () -> { return test11bB(aB.clone(), bB.clone(), mB); });
 145         tests.put("test11bS",    () -> { return test11bS(aS.clone(), bS.clone(), mS); });
 146         tests.put("test11bI",    () -> { return test11bI(aI.clone(), bI.clone(), mI); });
 147         tests.put("test11bL",    () -> { return test11bL(aL.clone(), bL.clone(), mL); });
 148 
 149         tests.put("test11cB",    () -> { return test11cB(aB.clone(), bB.clone(), mB); });
 150         tests.put("test11cS",    () -> { return test11cS(aS.clone(), bS.clone(), mS); });
 151         tests.put("test11cI",    () -> { return test11cI(aI.clone(), bI.clone(), mI); });
 152         tests.put("test11cL",    () -> { return test11cL(aL.clone(), bL.clone(), mL); });
 153 
 154         tests.put("test11dB",    () -> { return test11dB(aB.clone(), bB.clone(), mB, 0); });
 155         tests.put("test11dS",    () -> { return test11dS(aS.clone(), bS.clone(), mS, 0); });
 156         tests.put("test11dI",    () -> { return test11dI(aI.clone(), bI.clone(), mI, 0); });
 157         tests.put("test11dL",    () -> { return test11dL(aL.clone(), bL.clone(), mL, 0); });
 158 
 159         tests.put("test12",      () -> { return test12(aB.clone(), bB.clone(), mB); });
 160 
 161         tests.put("test13aIL",   () -> { return test13aIL(aI.clone(), aL.clone()); });
 162         tests.put("test13aIB",   () -> { return test13aIB(aI.clone(), aB.clone()); });
 163         tests.put("test13aIS",   () -> { return test13aIS(aI.clone(), aS.clone()); });
 164         tests.put("test13aBSIL", () -> { return test13aBSIL(aB.clone(), aS.clone(), aI.clone(), aL.clone()); });
 165 
 166         tests.put("test13bIL",   () -> { return test13bIL(aI.clone(), aL.clone()); });
 167         tests.put("test13bIB",   () -> { return test13bIB(aI.clone(), aB.clone()); });
 168         tests.put("test13bIS",   () -> { return test13bIS(aI.clone(), aS.clone()); });
 169         tests.put("test13bBSIL", () -> { return test13bBSIL(aB.clone(), aS.clone(), aI.clone(), aL.clone()); });
 170 
 171         tests.put("test14aB",    () -> { return test14aB(aB.clone()); });
 172         tests.put("test14bB",    () -> { return test14bB(aB.clone()); });
 173         tests.put("test14cB",    () -> { return test14cB(aB.clone()); });
 174 
 175         tests.put("test15aB",    () -> { return test15aB(aB.clone()); });
 176         tests.put("test15bB",    () -> { return test15bB(aB.clone()); });
 177         tests.put("test15cB",    () -> { return test15cB(aB.clone()); });
 178 
 179         tests.put("test16a",     () -> { return test16a(aB.clone(), aS.clone()); });
 180         tests.put("test16b",     () -> { return test16b(aB.clone()); });
 181 
 182         tests.put("test17a",     () -> { return test17a(aL.clone()); });
 183         tests.put("test17b",     () -> { return test17b(aL.clone()); });
 184         tests.put("test17c",     () -> { return test17c(aL.clone()); });
 185         tests.put("test17d",     () -> { return test17d(aL.clone()); });
 186 
 187         tests.put("test18a",     () -> { return test18a(aB.clone(), aI.clone()); });
 188         tests.put("test18b",     () -> { return test18b(aB.clone(), aI.clone()); });
 189 
 190         tests.put("test19",      () -> { return test19(aI.clone(), bI.clone()); });
 191         tests.put("test20",      () -> { return test20(aB.clone()); });
 192 
 193         // Compute gold value for all test methods before compilation
 194         for (Map.Entry<String,TestFunction> entry : tests.entrySet()) {
 195             String name = entry.getKey();
 196             TestFunction test = entry.getValue();
 197             Object[] gold = test.run();
 198             golds.put(name, gold);
 199         }
 200     }
 201 
 202     @Warmup(100)
 203     @Run(test = {"test0",
 204                  "test1",
 205                  "test2",
 206                  "test3",
 207                  "test4",
 208                  "test5",
 209                  "test6",
 210                  "test7",
 211                  "test8",
 212                  "test9",
 213                  "test10a",
 214                  "test10b",
 215                  "test10c",
 216                  "test10d",
 217                  "test11aB",
 218                  "test11aS",
 219                  "test11aI",
 220                  "test11aL",
 221                  "test11bB",
 222                  "test11bS",
 223                  "test11bI",
 224                  "test11bL",
 225                  "test11cB",
 226                  "test11cS",
 227                  "test11cI",
 228                  "test11cL",
 229                  "test11dB",
 230                  "test11dS",
 231                  "test11dI",
 232                  "test11dL",
 233                  "test12",
 234                  "test13aIL",
 235                  "test13aIB",
 236                  "test13aIS",
 237                  "test13aBSIL",
 238                  "test13bIL",
 239                  "test13bIB",
 240                  "test13bIS",
 241                  "test13bBSIL",
 242                  "test14aB",
 243                  "test14bB",
 244                  "test14cB",
 245                  "test15aB",
 246                  "test15bB",
 247                  "test15cB",
 248                  "test16a",
 249                  "test16b",
 250                  "test17a",
 251                  "test17b",
 252                  "test17c",
 253                  "test17d",
 254                  "test18a",
 255                  "test18b",
 256                  "test19",
 257                  "test20"})
 258     public void runTests() {
 259         for (Map.Entry<String,TestFunction> entry : tests.entrySet()) {
 260             String name = entry.getKey();
 261             TestFunction test = entry.getValue();
 262             // Recall gold value from before compilation
 263             Object[] gold = golds.get(name);
 264             // Compute new result
 265             Object[] result = test.run();
 266             // Compare gold and new result
 267             verify(name, gold, result);
 268         }
 269     }
 270 
 271     static byte[] generateB() {
 272         byte[] a = new byte[RANGE];
 273         for (int i = 0; i < a.length; i++) {
 274             a[i] = (byte)RANDOM.nextInt();
 275         }
 276         return a;
 277     }
 278 
 279     static short[] generateS() {
 280         short[] a = new short[RANGE];
 281         for (int i = 0; i < a.length; i++) {
 282             a[i] = (short)RANDOM.nextInt();
 283         }
 284         return a;
 285     }
 286 
 287     static int[] generateI() {
 288         int[] a = new int[RANGE];
 289         for (int i = 0; i < a.length; i++) {
 290             a[i] = RANDOM.nextInt();
 291         }
 292         return a;
 293     }
 294 
 295     static long[] generateL() {
 296         long[] a = new long[RANGE];
 297         for (int i = 0; i < a.length; i++) {
 298             a[i] = RANDOM.nextLong();
 299         }
 300         return a;
 301     }
 302 
 303     static void verify(String name, Object[] gold, Object[] result) {
 304         if (gold.length != result.length) {
 305             throw new RuntimeException("verify " + name + ": not the same number of outputs: gold.length = " +
 306                                        gold.length + ", result.length = " + result.length);
 307         }
 308         for (int i = 0; i < gold.length; i++) {
 309             Object g = gold[i];
 310             Object r = result[i];
 311             if (g.getClass() != r.getClass() || !g.getClass().isArray() || !r.getClass().isArray()) {
 312                 throw new RuntimeException("verify " + name + ": must both be array of same type:" +
 313                                            " gold[" + i + "].getClass() = " + g.getClass().getSimpleName() +
 314                                            " result[" + i + "].getClass() = " + r.getClass().getSimpleName());
 315             }
 316             if (g == r) {
 317                 throw new RuntimeException("verify " + name + ": should be two separate arrays (with identical content):" +
 318                                            " gold[" + i + "] == result[" + i + "]");
 319             }
 320             if (Array.getLength(g) != Array.getLength(r)) {
 321                     throw new RuntimeException("verify " + name + ": arrays must have same length:" +
 322                                            " gold[" + i + "].length = " + Array.getLength(g) +
 323                                            " result[" + i + "].length = " + Array.getLength(r));
 324             }
 325             Class c = g.getClass().getComponentType();
 326             if (c == byte.class) {
 327                 verifyB(name, i, (byte[])g, (byte[])r);
 328             } else if (c == short.class) {
 329                 verifyS(name, i, (short[])g, (short[])r);
 330             } else if (c == int.class) {
 331                 verifyI(name, i, (int[])g, (int[])r);
 332             } else if (c == long.class) {
 333                 verifyL(name, i, (long[])g, (long[])r);
 334             } else {
 335                 throw new RuntimeException("verify " + name + ": array type not supported for verify:" +
 336                                        " gold[" + i + "].getClass() = " + g.getClass().getSimpleName() +
 337                                        " result[" + i + "].getClass() = " + r.getClass().getSimpleName());
 338             }
 339         }
 340     }
 341 
 342     static void verifyB(String name, int i, byte[] g, byte[] r) {
 343         for (int j = 0; j < g.length; j++) {
 344             if (g[j] != r[j]) {
 345                 throw new RuntimeException("verify " + name + ": arrays must have same content:" +
 346                                            " gold[" + i + "][" + j + "] = " + g[j] +
 347                                            " result[" + i + "][" + j + "] = " + r[j]);
 348             }
 349         }
 350     }
 351 
 352     static void verifyS(String name, int i, short[] g, short[] r) {
 353         for (int j = 0; j < g.length; j++) {
 354             if (g[j] != r[j]) {
 355                 throw new RuntimeException("verify " + name + ": arrays must have same content:" +
 356                                            " gold[" + i + "][" + j + "] = " + g[j] +
 357                                            " result[" + i + "][" + j + "] = " + r[j]);
 358             }
 359         }
 360     }
 361 
 362     static void verifyI(String name, int i, int[] g, int[] r) {
 363         for (int j = 0; j < g.length; j++) {
 364             if (g[j] != r[j]) {
 365                 throw new RuntimeException("verify " + name + ": arrays must have same content:" +
 366                                            " gold[" + i + "][" + j + "] = " + g[j] +
 367                                            " result[" + i + "][" + j + "] = " + r[j]);
 368             }
 369         }
 370     }
 371 
 372     static void verifyL(String name, int i, long[] g, long[] r) {
 373         for (int j = 0; j < g.length; j++) {
 374             if (g[j] != r[j]) {
 375                 throw new RuntimeException("verify " + name + ": arrays must have same content:" +
 376                                            " gold[" + i + "][" + j + "] = " + g[j] +
 377                                            " result[" + i + "][" + j + "] = " + r[j]);
 378             }
 379         }
 380     }
 381 
    /**
     * For every 8-byte group, writes {@code a & mask} into {@code b} at offsets 0..3.
     *
     * The IR rule expects 4-element byte load and AND vectors plus vector stores when
     * MaxVectorSize >= 8 on 64-bit platforms with sse4.1 or asimd. The rule does not
     * depend on the AlignVector setting.
     *
     * @return {@code a} and {@code b}, for comparison against the gold result
     */
    @Test
    @IR(counts = {IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.AND_VB,        IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.STORE_VECTOR, "> 0"},
        applyIf = {"MaxVectorSize", ">=8"},
        applyIfPlatform = {"64-bit", "true"},
        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
    static Object[] test0(byte[] a, byte[] b, byte mask) {
        for (int i = 0; i < RANGE; i+=8) {
            // Safe to vectorize with AlignVector
            b[i+0] = (byte)(a[i+0] & mask); // offset 0, align 0
            b[i+1] = (byte)(a[i+1] & mask);
            b[i+2] = (byte)(a[i+2] & mask);
            b[i+3] = (byte)(a[i+3] & mask);
        }
        return new Object[]{ a, b };
    }
 399 
    /**
     * For every 8-byte group, writes {@code a & mask} into {@code b} at all offsets 0..7.
     *
     * The IR rule expects byte load and AND vectors (of any size) plus vector stores. It
     * applies only when UseCompactObjectHeaders is false, on 64-bit platforms with avx2
     * or asimd, and does not depend on the AlignVector setting.
     *
     * @return {@code a} and {@code b}, for comparison against the gold result
     */
    @Test
    @IR(counts = {IRNode.LOAD_VECTOR_B, "> 0",
                  IRNode.AND_VB, "> 0",
                  IRNode.STORE_VECTOR, "> 0"},
        applyIf = {"UseCompactObjectHeaders", "false"},
        applyIfPlatform = {"64-bit", "true"},
        applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true"})
    static Object[] test1(byte[] a, byte[] b, byte mask) {
        for (int i = 0; i < RANGE; i+=8) {
            // Safe to vectorize with AlignVector
            b[i+0] = (byte)(a[i+0] & mask); // offset 0, align 0
            b[i+1] = (byte)(a[i+1] & mask);
            b[i+2] = (byte)(a[i+2] & mask);
            b[i+3] = (byte)(a[i+3] & mask);
            b[i+4] = (byte)(a[i+4] & mask);
            b[i+5] = (byte)(a[i+5] & mask);
            b[i+6] = (byte)(a[i+6] & mask);
            b[i+7] = (byte)(a[i+7] & mask);
        }
        return new Object[]{ a, b };
    }
 421 
    /**
     * For every 8-byte group, writes {@code a & mask} into {@code b} at offsets 3..6.
     *
     * IR expectations (64-bit, sse4.1 or asimd):
     * with AlignVector off and MaxVectorSize >= 8, 4-element byte vectors are expected;
     * with AlignVector on, no load, AND or store vectors are expected.
     *
     * @return {@code a} and {@code b}, for comparison against the gold result
     */
    @Test
    @IR(counts = {IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.AND_VB,        IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.STORE_VECTOR, "> 0"},
        applyIfAnd = {"AlignVector", "false", "MaxVectorSize", ">=8"},
        applyIfPlatform = {"64-bit", "true"},
        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
    @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
                  IRNode.AND_VB, "= 0",
                  IRNode.STORE_VECTOR, "= 0"},
        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
        applyIfPlatform = {"64-bit", "true"},
        applyIf = {"AlignVector", "true"})
    static Object[] test2(byte[] a, byte[] b, byte mask) {
        for (int i = 0; i < RANGE; i+=8) {
            // Cannot align with AlignVector: 3 + x * 8 % 8 = 3
            b[i+3] = (byte)(a[i+3] & mask); // at alignment 3
            b[i+4] = (byte)(a[i+4] & mask);
            b[i+5] = (byte)(a[i+5] & mask);
            b[i+6] = (byte)(a[i+6] & mask);
        }
        return new Object[]{ a, b };
    }
 445 
    /**
     * Like test2 (offsets 3..6 of every 8-byte group), with one additional access at
     * offset 0 of each group.
     *
     * IR expectations (64-bit, sse4.1 or asimd):
     * with AlignVector off and MaxVectorSize >= 8, 4-element byte vectors are expected;
     * with AlignVector on, no load, AND or store vectors are expected.
     *
     * @return {@code a} and {@code b}, for comparison against the gold result
     */
    @Test
    @IR(counts = {IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.AND_VB,        IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.STORE_VECTOR, "> 0"},
        applyIfAnd = {"AlignVector", "false", "MaxVectorSize", ">=8"},
        applyIfPlatform = {"64-bit", "true"},
        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
    @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
                  IRNode.AND_VB, "= 0",
                  IRNode.STORE_VECTOR, "= 0"},
        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
        applyIfPlatform = {"64-bit", "true"},
        applyIf = {"AlignVector", "true"})
    static Object[] test3(byte[] a, byte[] b, byte mask) {
        for (int i = 0; i < RANGE; i+=8) {
            // Cannot align with AlignVector: 3 + x * 8 % 8 = 3

            // Problematic for AlignVector
            b[i+0] = (byte)(a[i+0] & mask); // best_memref, align 0

            b[i+3] = (byte)(a[i+3] & mask); // pack at offset 3 bytes
            b[i+4] = (byte)(a[i+4] & mask);
            b[i+5] = (byte)(a[i+5] & mask);
            b[i+6] = (byte)(a[i+6] & mask);
        }
        return new Object[]{ a, b };
    }
 473 
    /**
     * For every 16-byte group (index {@code i*16}), writes {@code a & mask} into
     * {@code b} for a run of 4 bytes at offsets 0..3 and a run of 8 bytes at offsets 5..12.
     *
     * IR expectations (64-bit, sse4.1 or asimd, MaxVectorSize >= 16):
     * with AlignVector off, both 4-element and 8-element byte vectors are expected;
     * with AlignVector on, only 4-element vectors are expected and 8-element ones must
     * be absent.
     *
     * @return {@code a} and {@code b}, for comparison against the gold result
     */
    @Test
    @IR(counts = {IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE_8, "> 0",
                  IRNode.AND_VB,        IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.AND_VB,        IRNode.VECTOR_SIZE_8, "> 0",
                  IRNode.STORE_VECTOR, "> 0"},
        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
        applyIfPlatform = {"64-bit", "true"},
        applyIfAnd = {"AlignVector", "false", "MaxVectorSize", ">=16"})
    @IR(counts = {IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE_8, "= 0",// unaligned
                  IRNode.AND_VB,        IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.AND_VB,        IRNode.VECTOR_SIZE_8, "= 0",// unaligned
                  IRNode.STORE_VECTOR, "> 0"},
        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
        applyIfPlatform = {"64-bit", "true"},
        applyIfAnd = {"AlignVector", "true", "MaxVectorSize", ">=16"})
    static Object[] test4(byte[] a, byte[] b, byte mask) {
        for (int i = 0; i < RANGE/16; i++) {
            // Problematic for AlignVector
            b[i*16 + 0 ] = (byte)(a[i*16 + 0 ] & mask); // 4 pack, 0 aligned
            b[i*16 + 1 ] = (byte)(a[i*16 + 1 ] & mask);
            b[i*16 + 2 ] = (byte)(a[i*16 + 2 ] & mask);
            b[i*16 + 3 ] = (byte)(a[i*16 + 3 ] & mask);

            b[i*16 + 5 ] = (byte)(a[i*16 + 5 ] & mask); // 8 pack, 5 aligned
            b[i*16 + 6 ] = (byte)(a[i*16 + 6 ] & mask);
            b[i*16 + 7 ] = (byte)(a[i*16 + 7 ] & mask);
            b[i*16 + 8 ] = (byte)(a[i*16 + 8 ] & mask);
            b[i*16 + 9 ] = (byte)(a[i*16 + 9 ] & mask);
            b[i*16 + 10] = (byte)(a[i*16 + 10] & mask);
            b[i*16 + 11] = (byte)(a[i*16 + 11] & mask);
            b[i*16 + 12] = (byte)(a[i*16 + 12] & mask);
        }
        return new Object[]{ a, b };
    }
 510 
    /**
     * Same access pattern as test3 (offset 0 and offsets 3..6 of every 8-byte group),
     * but every index additionally includes the loop-invariant parameter {@code inv}.
     *
     * IR expectations (64-bit, sse4.1 or asimd):
     * with AlignVector off and MaxVectorSize >= 8, 4-element byte vectors are expected;
     * with AlignVector on, no load, AND or store vectors are expected.
     *
     * @param inv loop-invariant value added to every array index
     * @return {@code a} and {@code b}, for comparison against the gold result
     */
    @Test
    @IR(counts = {IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.AND_VB,        IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.STORE_VECTOR, "> 0"},
        applyIfAnd = {"AlignVector", "false", "MaxVectorSize", ">=8"},
        applyIfPlatform = {"64-bit", "true"},
        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
    @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
                  IRNode.AND_VB, "= 0",
                  IRNode.STORE_VECTOR, "= 0"},
        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
        applyIfPlatform = {"64-bit", "true"},
        applyIf = {"AlignVector", "true"})
    static Object[] test5(byte[] a, byte[] b, byte mask, int inv) {
        for (int i = 0; i < RANGE; i+=8) {
            // Cannot align with AlignVector because of invariant
            b[i+inv+0] = (byte)(a[i+inv+0] & mask);

            b[i+inv+3] = (byte)(a[i+inv+3] & mask);
            b[i+inv+4] = (byte)(a[i+inv+4] & mask);
            b[i+inv+5] = (byte)(a[i+inv+5] & mask);
            b[i+inv+6] = (byte)(a[i+inv+6] & mask);
        }
        return new Object[]{ a, b };
    }
 536 
    /**
     * Byte variant with stride 2 and index scale 4: for each {@code i}, accesses
     * {@code i*4 + 0} and {@code i*4 + 3 .. i*4 + 6}. The loop runs up to RANGE/8.
     *
     * IR expectations (64-bit, sse4.1 or asimd):
     * with AlignVector off and MaxVectorSize >= 8, 4-element byte vectors are expected;
     * with AlignVector on, no load, AND or store vectors are expected.
     *
     * @return {@code a} and {@code b}, for comparison against the gold result
     */
    @Test
    @IR(counts = {IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.AND_VB,        IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.STORE_VECTOR, "> 0"},
        applyIfAnd = {"AlignVector", "false", "MaxVectorSize", ">=8"},
        applyIfPlatform = {"64-bit", "true"},
        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
    @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
                  IRNode.AND_VB, "= 0",
                  IRNode.STORE_VECTOR, "= 0"},
        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
        applyIfPlatform = {"64-bit", "true"},
        applyIf = {"AlignVector", "true"})
    static Object[] test6(byte[] a, byte[] b, byte mask) {
        for (int i = 0; i < RANGE/8; i+=2) {
            // Cannot align with AlignVector because offset is odd
            b[i*4+0] = (byte)(a[i*4+0] & mask);

            b[i*4+3] = (byte)(a[i*4+3] & mask);
            b[i*4+4] = (byte)(a[i*4+4] & mask);
            b[i*4+5] = (byte)(a[i*4+5] & mask);
            b[i*4+6] = (byte)(a[i*4+6] & mask);
        }
        return new Object[]{ a, b };
    }
 562 
    /**
     * Short variant of test6: stride 2 and index scale 4, accessing {@code i*4 + 0} and
     * {@code i*4 + 3 .. i*4 + 6}. The loop runs up to RANGE/8.
     *
     * IR expectations (64-bit):
     * with AlignVector off and MaxVectorSize >= 16 (avx2 or asimd), 4-element short
     * vectors are expected;
     * with AlignVector on (sse4.1 or asimd), no load, AND or store vectors are expected.
     *
     * @return {@code a} and {@code b}, for comparison against the gold result
     */
    @Test
    @IR(counts = {IRNode.LOAD_VECTOR_S, IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.AND_VS,        IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.STORE_VECTOR, "> 0"},
        applyIfAnd = {"AlignVector", "false", "MaxVectorSize", ">=16"},
        applyIfPlatform = {"64-bit", "true"},
        applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true"})
    @IR(counts = {IRNode.LOAD_VECTOR_S, "= 0",
                  IRNode.AND_VS, "= 0",
                  IRNode.STORE_VECTOR, "= 0"},
        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
        applyIfPlatform = {"64-bit", "true"},
        applyIf = {"AlignVector", "true"})
    static Object[] test7(short[] a, short[] b, short mask) {
        for (int i = 0; i < RANGE/8; i+=2) {
            // Cannot align with AlignVector because offset is odd
            b[i*4+0] = (short)(a[i*4+0] & mask);

            b[i*4+3] = (short)(a[i*4+3] & mask);
            b[i*4+4] = (short)(a[i*4+4] & mask);
            b[i*4+5] = (short)(a[i*4+5] & mask);
            b[i*4+6] = (short)(a[i*4+6] & mask);
        }
        return new Object[]{ a, b };
    }
 588 
    /**
     * Same access pattern as test3 (offset 0 and offsets 3..6, stride 8), but the loop
     * starts at the caller-supplied {@code init} instead of a constant.
     *
     * IR expectations (64-bit, sse4.1 or asimd):
     * with AlignVector off and MaxVectorSize >= 8, 4-element byte vectors are expected;
     * with AlignVector on, no load, AND or store vectors are expected.
     *
     * @param init initial value of the loop variable
     * @return {@code a} and {@code b}, for comparison against the gold result
     */
    @Test
    @IR(counts = {IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.AND_VB,        IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.STORE_VECTOR, "> 0"},
        applyIfAnd = {"AlignVector", "false", "MaxVectorSize", ">=8"},
        applyIfPlatform = {"64-bit", "true"},
        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
    @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
                  IRNode.AND_VB, "= 0",
                  IRNode.STORE_VECTOR, "= 0"},
        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
        applyIfPlatform = {"64-bit", "true"},
        applyIf = {"AlignVector", "true"})
    static Object[] test8(byte[] a, byte[] b, byte mask, int init) {
        for (int i = init; i < RANGE; i+=8) {
            // Cannot align with AlignVector because of invariant (variable init becomes invar)
            b[i+0] = (byte)(a[i+0] & mask);

            b[i+3] = (byte)(a[i+3] & mask);
            b[i+4] = (byte)(a[i+4] & mask);
            b[i+5] = (byte)(a[i+5] & mask);
            b[i+6] = (byte)(a[i+6] & mask);
        }
        return new Object[]{ a, b };
    }
 614 
    /**
     * Same access pattern as test3 (offset 0 and offsets 3..6, stride 8), but the loop
     * starts at the constant 13 and stops before RANGE-8.
     *
     * The single IR rule expects 4-element byte load and AND vectors plus vector stores
     * when MaxVectorSize >= 8 on 64-bit platforms with sse4.1 or asimd. It does not
     * depend on the AlignVector setting.
     *
     * @return {@code a} and {@code b}, for comparison against the gold result
     */
    @Test
    @IR(counts = {IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.AND_VB,        IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.STORE_VECTOR, "> 0"},
        applyIf = {"MaxVectorSize", ">=8"},
        applyIfPlatform = {"64-bit", "true"},
        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
    static Object[] test9(byte[] a, byte[] b, byte mask) {
        // known non-zero init value does not affect offset, but has implicit effect on iv
        for (int i = 13; i < RANGE-8; i+=8) {
            b[i+0] = (byte)(a[i+0] & mask);

            b[i+3] = (byte)(a[i+3] & mask);
            b[i+4] = (byte)(a[i+4] & mask);
            b[i+5] = (byte)(a[i+5] & mask);
            b[i+6] = (byte)(a[i+6] & mask);
        }
        return new Object[]{ a, b };
    }
 634 
    /**
     * Stride-8 byte loop starting at the constant 3 and stopping before RANGE-8; each
     * iteration accesses offsets 0..3 from {@code i}.
     *
     * IR expectations (64-bit, sse4.1 or asimd):
     * with AlignVector off and MaxVectorSize >= 8, 4-element byte vectors are expected;
     * with AlignVector on, no load, AND or store vectors are expected.
     *
     * @return {@code a} and {@code b}, for comparison against the gold result
     */
    @Test
    @IR(counts = {IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.AND_VB,        IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.STORE_VECTOR, "> 0"},
        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
        applyIfPlatform = {"64-bit", "true"},
        applyIfAnd = {"AlignVector", "false", "MaxVectorSize", ">=8"})
    @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
                  IRNode.AND_VB, "= 0",
                  IRNode.STORE_VECTOR, "= 0"},
        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
        applyIfPlatform = {"64-bit", "true"},
        applyIf = {"AlignVector", "true"})
    static Object[] test10a(byte[] a, byte[] b, byte mask) {
        // This is not alignable with pre-loop, because of odd init.
        for (int i = 3; i < RANGE-8; i+=8) {
            b[i+0] = (byte)(a[i+0] & mask);
            b[i+1] = (byte)(a[i+1] & mask);
            b[i+2] = (byte)(a[i+2] & mask);
            b[i+3] = (byte)(a[i+3] & mask);
        }
        return new Object[]{ a, b };
    }
 658 
    /**
     * Same as test10a, but the loop starts at the constant 13 instead of 3: stride-8
     * byte loop stopping before RANGE-8, accessing offsets 0..3 from {@code i}.
     *
     * IR expectations (64-bit, sse4.1 or asimd):
     * with AlignVector off and MaxVectorSize >= 8, 4-element byte vectors are expected;
     * with AlignVector on, no load, AND or store vectors are expected.
     *
     * @return {@code a} and {@code b}, for comparison against the gold result
     */
    @Test
    @IR(counts = {IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.AND_VB,        IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.STORE_VECTOR, "> 0"},
        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
        applyIfPlatform = {"64-bit", "true"},
        applyIfAnd = {"AlignVector", "false", "MaxVectorSize", ">=8"})
    @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
                  IRNode.AND_VB, "= 0",
                  IRNode.STORE_VECTOR, "= 0"},
        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
        applyIfPlatform = {"64-bit", "true"},
        applyIf = {"AlignVector", "true"})
    static Object[] test10b(byte[] a, byte[] b, byte mask) {
        // This is not alignable with pre-loop, because of odd init.
        // Seems not correctly handled.
        for (int i = 13; i < RANGE-8; i+=8) {
            b[i+0] = (byte)(a[i+0] & mask);
            b[i+1] = (byte)(a[i+1] & mask);
            b[i+2] = (byte)(a[i+2] & mask);
            b[i+3] = (byte)(a[i+3] & mask);
        }
        return new Object[]{ a, b };
    }
 683 
 684     @Test
 685     @IR(counts = {IRNode.LOAD_VECTOR_S, IRNode.VECTOR_SIZE_4, "> 0",
 686                   IRNode.AND_VS,        IRNode.VECTOR_SIZE_4, "> 0",
 687                   IRNode.STORE_VECTOR, "> 0"},
 688         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
 689         applyIfPlatform = {"64-bit", "true"},
 690         applyIfAnd = {"AlignVector", "false", "MaxVectorSize", ">=16"})
 691     @IR(counts = {IRNode.LOAD_VECTOR_S, "= 0",
 692                   IRNode.AND_VS, "= 0",
 693                   IRNode.STORE_VECTOR, "= 0"},
 694         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
 695         applyIfPlatform = {"64-bit", "true"},
 696         applyIf = {"AlignVector", "true"})
         // Short kernel: init = 13, stride = 8, four adjacent elements (i+0..i+3) masked per iteration.
         // Rules above: 4-element short vectors expected when AlignVector=false and MaxVectorSize>=16;
         // no vector load/and/store nodes expected when AlignVector=true.
         // Writes b, reads a; returns { a, b }.
 697     static Object[] test10c(short[] a, short[] b, short mask) {
 698         // This is not alignable with pre-loop, because of odd init.
 699         // Seems not correctly handled with MaxVectorSize >= 32.
 700         for (int i = 13; i < RANGE-8; i+=8) {
 701             b[i+0] = (short)(a[i+0] & mask);
 702             b[i+1] = (short)(a[i+1] & mask);
 703             b[i+2] = (short)(a[i+2] & mask);
 704             b[i+3] = (short)(a[i+3] & mask);
 705         }
 706         return new Object[]{ a, b };
 707     }
 708 
 709     @Test
 710     @IR(counts = {IRNode.LOAD_VECTOR_S, IRNode.VECTOR_SIZE_4, "> 0",
 711                   IRNode.AND_VS,        IRNode.VECTOR_SIZE_4, "> 0",
 712                   IRNode.STORE_VECTOR, "> 0"},
 713         applyIfAnd = {"MaxVectorSize", ">=16", "UseCompactObjectHeaders", "false"},
 714         applyIfPlatform = {"64-bit", "true"},
 715         applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true"})
         // Short kernel: init = 13, stride = 8, with a constant +3 added to every index, so the
         // first element touched is index 16. The single rule above expects 4-element short vectors
         // when MaxVectorSize>=16 and UseCompactObjectHeaders=false; it has no AlignVector condition.
         // Writes b, reads a; returns { a, b }.
 716     static Object[] test10d(short[] a, short[] b, short mask) {
 717         for (int i = 13; i < RANGE-16; i+=8) {
 718             // init + offset -> aligned
 719             b[i+0+3] = (short)(a[i+0+3] & mask);
 720             b[i+1+3] = (short)(a[i+1+3] & mask);
 721             b[i+2+3] = (short)(a[i+2+3] & mask);
 722             b[i+3+3] = (short)(a[i+3+3] & mask);
 723         }
 724         return new Object[]{ a, b };
 725     }
 726 
 727     @Test
 728     @IR(counts = {IRNode.LOAD_VECTOR_B, "> 0",
 729                   IRNode.AND_VB, "> 0",
 730                   IRNode.STORE_VECTOR, "> 0"},
 731         applyIfPlatform = {"64-bit", "true"},
 732         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
         // Byte variant of the unit-stride kernel starting at i = 0: b[i] = a[i] & mask.
         // The rule above has no AlignVector condition, i.e. vector nodes are expected either way.
         // Writes b, reads a; returns { a, b }.
 733     static Object[] test11aB(byte[] a, byte[] b, byte mask) {
 734         for (int i = 0; i < RANGE; i++) {
 735             // always alignable
 736             b[i+0] = (byte)(a[i+0] & mask);
 737         }
 738         return new Object[]{ a, b };
 739     }
 740 
 741     @Test
 742     @IR(counts = {IRNode.LOAD_VECTOR_S, "> 0",
 743                   IRNode.AND_VS, "> 0",
 744                   IRNode.STORE_VECTOR, "> 0"},
 745         applyIfPlatform = {"64-bit", "true"},
 746         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
         // Short variant of the unit-stride kernel starting at i = 0: b[i] = a[i] & mask.
         // The rule above has no AlignVector condition, i.e. vector nodes are expected either way.
         // Writes b, reads a; returns { a, b }.
 747     static Object[] test11aS(short[] a, short[] b, short mask) {
 748         for (int i = 0; i < RANGE; i++) {
 749             // always alignable
 750             b[i+0] = (short)(a[i+0] & mask);
 751         }
 752         return new Object[]{ a, b };
 753     }
 754 
 755     @Test
 756     @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0",
 757                   IRNode.AND_VI, "> 0",
 758                   IRNode.STORE_VECTOR, "> 0"},
 759         applyIfPlatform = {"64-bit", "true"},
 760         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
         // Int variant of the unit-stride kernel starting at i = 0: b[i] = a[i] & mask.
         // The rule above has no AlignVector condition, i.e. vector nodes are expected either way.
         // Writes b, reads a; returns { a, b }.
 761     static Object[] test11aI(int[] a, int[] b, int mask) {
 762         for (int i = 0; i < RANGE; i++) {
 763             // always alignable
 764             b[i+0] = (int)(a[i+0] & mask);
 765         }
 766         return new Object[]{ a, b };
 767     }
 768 
 769     @Test
 770     @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0",
 771                   IRNode.AND_VL, "> 0",
 772                   IRNode.STORE_VECTOR, "> 0"},
 773         applyIfPlatform = {"64-bit", "true"},
 774         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
         // Long variant of the unit-stride kernel starting at i = 0: b[i] = a[i] & mask.
         // The rule above has no AlignVector condition, i.e. vector nodes are expected either way.
         // Writes b, reads a; returns { a, b }.
 775     static Object[] test11aL(long[] a, long[] b, long mask) {
 776         for (int i = 0; i < RANGE; i++) {
 777             // always alignable
 778             b[i+0] = (long)(a[i+0] & mask);
 779         }
 780         return new Object[]{ a, b };
 781     }
 782 
 783     @Test
 784     @IR(counts = {IRNode.LOAD_VECTOR_B, "> 0",
 785                   IRNode.AND_VB, "> 0",
 786                   IRNode.STORE_VECTOR, "> 0"},
 787         applyIfPlatform = {"64-bit", "true"},
 788         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
         // Byte variant of the unit-stride kernel starting at i = 1 (non-zero init).
         // The rule above has no AlignVector condition, i.e. vector nodes are expected either way.
         // Writes b, reads a; returns { a, b }.
 789     static Object[] test11bB(byte[] a, byte[] b, byte mask) {
 790         for (int i = 1; i < RANGE; i++) {
 791             // always alignable
 792             b[i+0] = (byte)(a[i+0] & mask);
 793         }
 794         return new Object[]{ a, b };
 795     }
 796 
 797     @Test
 798     @IR(counts = {IRNode.LOAD_VECTOR_S, "> 0",
 799                   IRNode.AND_VS, "> 0",
 800                   IRNode.STORE_VECTOR, "> 0"},
 801         applyIfPlatform = {"64-bit", "true"},
 802         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
         // Short variant of the unit-stride kernel starting at i = 1 (non-zero init).
         // The rule above has no AlignVector condition, i.e. vector nodes are expected either way.
         // Writes b, reads a; returns { a, b }.
 803     static Object[] test11bS(short[] a, short[] b, short mask) {
 804         for (int i = 1; i < RANGE; i++) {
 805             // always alignable
 806             b[i+0] = (short)(a[i+0] & mask);
 807         }
 808         return new Object[]{ a, b };
 809     }
 810 
 811     @Test
 812     @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0",
 813                   IRNode.AND_VI, "> 0",
 814                   IRNode.STORE_VECTOR, "> 0"},
 815         applyIfPlatform = {"64-bit", "true"},
 816         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
         // Int variant of the unit-stride kernel starting at i = 1 (non-zero init).
         // The rule above has no AlignVector condition, i.e. vector nodes are expected either way.
         // Writes b, reads a; returns { a, b }.
 817     static Object[] test11bI(int[] a, int[] b, int mask) {
 818         for (int i = 1; i < RANGE; i++) {
 819             // always alignable
 820             b[i+0] = (int)(a[i+0] & mask);
 821         }
 822         return new Object[]{ a, b };
 823     }
 824 
 825     @Test
 826     @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0",
 827                   IRNode.AND_VL, "> 0",
 828                   IRNode.STORE_VECTOR, "> 0"},
 829         applyIfPlatform = {"64-bit", "true"},
 830         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
         // Long variant of the unit-stride kernel starting at i = 1 (non-zero init).
         // The rule above has no AlignVector condition, i.e. vector nodes are expected either way.
         // Writes b, reads a; returns { a, b }.
 831     static Object[] test11bL(long[] a, long[] b, long mask) {
 832         for (int i = 1; i < RANGE; i++) {
 833             // always alignable
 834             b[i+0] = (long)(a[i+0] & mask);
 835         }
 836         return new Object[]{ a, b };
 837     }
 838 
 839     @Test
 840     @IR(counts = {IRNode.LOAD_VECTOR_B, "> 0",
 841                   IRNode.AND_VB, "> 0",
 842                   IRNode.STORE_VECTOR, "> 0"},
 843         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
 844         applyIfPlatform = {"64-bit", "true"},
 845         applyIf = {"AlignVector", "false"})
 846     @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
 847                   IRNode.AND_VB, "= 0",
 848                   IRNode.STORE_VECTOR, "= 0"},
 849         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
 850         applyIfPlatform = {"64-bit", "true"},
 851         applyIf = {"AlignVector", "true"})
         // Byte kernel where the load index (i+1) is one element ahead of the store index (i).
         // Rules above: vector nodes expected when AlignVector=false, none when AlignVector=true.
         // Writes b, reads a; returns { a, b }.
 852     static Object[] test11cB(byte[] a, byte[] b, byte mask) {
 853         for (int i = 1; i < RANGE-1; i++) {
 854             // 1 byte offset -> not alignable with AlignVector
 855             b[i+0] = (byte)(a[i+1] & mask);
 856         }
 857         return new Object[]{ a, b };
 858     }
 859 
 860     @Test
 861     @IR(counts = {IRNode.LOAD_VECTOR_S, "> 0",
 862                   IRNode.AND_VS, "> 0",
 863                   IRNode.STORE_VECTOR, "> 0"},
 864         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
 865         applyIfPlatform = {"64-bit", "true"},
 866         applyIf = {"AlignVector", "false"})
 867     @IR(counts = {IRNode.LOAD_VECTOR_S, "= 0",
 868                   IRNode.AND_VS, "= 0",
 869                   IRNode.STORE_VECTOR, "= 0"},
 870         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
 871         applyIfPlatform = {"64-bit", "true"},
 872         applyIf = {"AlignVector", "true"})
         // Short kernel where the load index (i+1) is one element ahead of the store index (i).
         // Rules above: vector nodes expected when AlignVector=false, none when AlignVector=true.
         // Writes b, reads a; returns { a, b }.
 873     static Object[] test11cS(short[] a, short[] b, short mask) {
 874         for (int i = 1; i < RANGE-1; i++) {
 875             // 2 byte offset -> not alignable with AlignVector
 876             b[i+0] = (short)(a[i+1] & mask);
 877         }
 878         return new Object[]{ a, b };
 879     }
 880 
 881     @Test
 882     @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0",
 883                   IRNode.AND_VI, "> 0",
 884                   IRNode.STORE_VECTOR, "> 0"},
 885         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
 886         applyIfPlatform = {"64-bit", "true"},
 887         applyIf = {"AlignVector", "false"})
 888     @IR(counts = {IRNode.LOAD_VECTOR_I, "= 0",
 889                   IRNode.AND_VI, "= 0",
 890                   IRNode.STORE_VECTOR, "= 0"},
 891         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
 892         applyIfPlatform = {"64-bit", "true"},
 893         applyIf = {"AlignVector", "true"})
         // Int kernel where the load index (i+1) is one element ahead of the store index (i).
         // Rules above: vector nodes expected when AlignVector=false, none when AlignVector=true.
         // Writes b, reads a; returns { a, b }.
 894     static Object[] test11cI(int[] a, int[] b, int mask) {
 895         for (int i = 1; i < RANGE-1; i++) {
 896             // 4 byte offset -> not alignable with AlignVector
 897             b[i+0] = (int)(a[i+1] & mask);
 898         }
 899         return new Object[]{ a, b };
 900     }
 901 
 902     @Test
 903     @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0",
 904                   IRNode.AND_VL, "> 0",
 905                   IRNode.STORE_VECTOR, "> 0"},
 906         applyIfPlatform = {"64-bit", "true"},
 907         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
         // Long kernel where the load index (i+1) is one element ahead of the store index (i).
         // Unlike the byte/short/int siblings, the single rule above has no AlignVector condition:
         // vector nodes are expected either way.
         // Writes b, reads a; returns { a, b }.
 908     static Object[] test11cL(long[] a, long[] b, long mask) {
 909         for (int i = 1; i < RANGE-1; i++) {
 910             // always alignable (8 byte offset)
 911             b[i+0] = (long)(a[i+1] & mask);
 912         }
 913         return new Object[]{ a, b };
 914     }
 915 
 916     @Test
 917     @IR(counts = {IRNode.LOAD_VECTOR_B, "> 0",
 918                   IRNode.AND_VB, "> 0",
 919                   IRNode.STORE_VECTOR, "> 0"},
 920         applyIfPlatform = {"64-bit", "true"},
 921         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
         // Byte kernel with a loop-invariant index offset: 'invar' is added to both the load and
         // the store index. The rule above expects vector nodes with no AlignVector condition.
         // The value range of 'invar' is chosen by the caller (not visible here); i + invar must
         // stay within both arrays. Writes b, reads a; returns { a, b }.
 922     static Object[] test11dB(byte[] a, byte[] b, byte mask, int invar) {
 923         for (int i = 0; i < RANGE; i++) {
 924             b[i+0+invar] = (byte)(a[i+0+invar] & mask);
 925         }
 926         return new Object[]{ a, b };
 927     }
 928 
 929     @Test
 930     @IR(counts = {IRNode.LOAD_VECTOR_S, "> 0",
 931                   IRNode.AND_VS, "> 0",
 932                   IRNode.STORE_VECTOR, "> 0"},
 933         applyIfPlatform = {"64-bit", "true"},
 934         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
         // Short kernel with a loop-invariant index offset: 'invar' is added to both the load and
         // the store index. The rule above expects vector nodes with no AlignVector condition.
         // The value range of 'invar' is chosen by the caller (not visible here); i + invar must
         // stay within both arrays. Writes b, reads a; returns { a, b }.
 935     static Object[] test11dS(short[] a, short[] b, short mask, int invar) {
 936         for (int i = 0; i < RANGE; i++) {
 937             b[i+0+invar] = (short)(a[i+0+invar] & mask);
 938         }
 939         return new Object[]{ a, b };
 940     }
 941 
 942     @Test
 943     @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0",
 944                   IRNode.AND_VI, "> 0",
 945                   IRNode.STORE_VECTOR, "> 0"},
 946         applyIfPlatform = {"64-bit", "true"},
 947         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
         // Int kernel with a loop-invariant index offset: 'invar' is added to both the load and
         // the store index. The rule above expects vector nodes with no AlignVector condition.
         // The value range of 'invar' is chosen by the caller (not visible here); i + invar must
         // stay within both arrays. Writes b, reads a; returns { a, b }.
 948     static Object[] test11dI(int[] a, int[] b, int mask, int invar) {
 949         for (int i = 0; i < RANGE; i++) {
 950             b[i+0+invar] = (int)(a[i+0+invar] & mask);
 951         }
 952         return new Object[]{ a, b };
 953     }
 954 
 955     @Test
 956     @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0",
 957                   IRNode.AND_VL, "> 0",
 958                   IRNode.STORE_VECTOR, "> 0"},
 959         applyIfPlatform = {"64-bit", "true"},
 960         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
         // Long kernel with a loop-invariant index offset: 'invar' is added to both the load and
         // the store index. The rule above expects vector nodes with no AlignVector condition.
         // The value range of 'invar' is chosen by the caller (not visible here); i + invar must
         // stay within both arrays. Writes b, reads a; returns { a, b }.
 961     static Object[] test11dL(long[] a, long[] b, long mask, int invar) {
 962         for (int i = 0; i < RANGE; i++) {
 963             b[i+0+invar] = (long)(a[i+0+invar] & mask);
 964         }
 965         return new Object[]{ a, b };
 966     }
 967 
 968     @Test
 969     @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
 970                   IRNode.AND_VB, "= 0",
 971                   IRNode.STORE_VECTOR, "= 0"},
 972         applyIfPlatform = {"64-bit", "true"},
 973         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
         // Byte kernel with index scale 6: each iteration masks four adjacent bytes starting at
         // i*6, leaving a gap of two untouched elements before the next group.
         // The rule above expects NO vector load/and/store nodes, with no flag condition.
         // Writes b, reads a; returns { a, b }.
 974     static Object[] test12(byte[] a, byte[] b, byte mask) {
 975         for (int i = 0; i < RANGE/16; i++) {
 976             // Currently does not vectorize at all
 977             b[i*6 + 0 ] = (byte)(a[i*6 + 0 ] & mask);
 978             b[i*6 + 1 ] = (byte)(a[i*6 + 1 ] & mask);
 979             b[i*6 + 2 ] = (byte)(a[i*6 + 2 ] & mask);
 980             b[i*6 + 3 ] = (byte)(a[i*6 + 3 ] & mask);
 981         }
 982         return new Object[]{ a, b };
 983     }
 984 
 985     @Test
 986     @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE + "min(max_int, max_long)", "> 0",
 987                   IRNode.LOAD_VECTOR_L, IRNode.VECTOR_SIZE + "min(max_int, max_long)", "> 0",
 988                   IRNode.ADD_VI, IRNode.VECTOR_SIZE + "min(max_int, max_long)", "> 0",
 989                   IRNode.ADD_VL, IRNode.VECTOR_SIZE + "min(max_int, max_long)", "> 0",
 990                   IRNode.STORE_VECTOR, "> 0"},
 991         applyIfPlatform = {"64-bit", "true"},
 992         applyIfCPUFeatureOr = {"avx2", "true"})
 993     // require avx2 (see applyIfCPUFeatureOr above) to ensure vectors are larger than what unrolling produces
         // Mixed-type kernel: an int[] and a long[] are incremented element-wise in the same
         // unit-stride loop starting at i = 0. The rule expects int and long vector nodes that share
         // the element count min(max_int, max_long). Mutates and returns { a, b }.
 994     static Object[] test13aIL(int[] a, long[] b) {
 995         for (int i = 0; i < RANGE; i++) {
 996             a[i]++;
 997             b[i]++;
 998         }
 999         return new Object[]{ a, b };
1000     }
1001 
1002     @Test
1003     @IR(counts = {IRNode.LOAD_VECTOR_B, "> 0",
1004                   IRNode.LOAD_VECTOR_I, "> 0",
1005                   IRNode.ADD_VB, "> 0",
1006                   IRNode.ADD_VI, "> 0",
1007                   IRNode.STORE_VECTOR, "> 0"},
1008         applyIf = {"UseCompactObjectHeaders", "false"},
1009         applyIfPlatform = {"64-bit", "true"},
1010         applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true"})
         // Mixed-type kernel: an int[] and a byte[] are incremented element-wise in the same
         // unit-stride loop starting at i = 0. The rule expects both byte and int vector nodes,
         // and is only applied when UseCompactObjectHeaders=false. Mutates and returns { a, b }.
1011     static Object[] test13aIB(int[] a, byte[] b) {
1012         for (int i = 0; i < RANGE; i++) {
1013             a[i]++;
1014             b[i]++;
1015         }
1016         return new Object[]{ a, b };
1017     }
1018 
1019     @Test
1020     @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0",
1021                   IRNode.LOAD_VECTOR_S, "> 0",
1022                   IRNode.ADD_VI, "> 0",
1023                   IRNode.ADD_VS, "> 0",
1024                   IRNode.STORE_VECTOR, "> 0"},
1025         applyIf = {"UseCompactObjectHeaders", "false"},
1026         applyIfPlatform = {"64-bit", "true"},
1027         applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true"})
         // Mixed-type kernel: an int[] and a short[] are incremented element-wise in the same
         // unit-stride loop starting at i = 0. The rule expects both short and int vector nodes,
         // and is only applied when UseCompactObjectHeaders=false. Mutates and returns { a, b }.
1028     static Object[] test13aIS(int[] a, short[] b) {
1029         for (int i = 0; i < RANGE; i++) {
1030             a[i]++;
1031             b[i]++;
1032         }
1033         return new Object[]{ a, b };
1034     }
1035 
1036     @Test
1037     @IR(counts = {IRNode.LOAD_VECTOR_B, "> 0",
1038                   IRNode.LOAD_VECTOR_S, "> 0",
1039                   IRNode.LOAD_VECTOR_I, "> 0",
1040                   IRNode.LOAD_VECTOR_L, "> 0",
1041                   IRNode.ADD_VB, "> 0",
1042                   IRNode.ADD_VS, "> 0",
1043                   IRNode.ADD_VI, "> 0",
1044                   IRNode.ADD_VL, "> 0",
1045                   IRNode.STORE_VECTOR, "> 0"},
1046         applyIf = {"UseCompactObjectHeaders", "false"},
1047         applyIfPlatform = {"64-bit", "true"},
1048         applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true"})
         // Mixed-type kernel over four arrays (byte, short, int, long), all incremented at the same
         // index in one unit-stride loop starting at i = 0. The rule expects vector nodes for all
         // four element types and is only applied when UseCompactObjectHeaders=false.
         // Mutates and returns { a, b, c, d }.
1049     static Object[] test13aBSIL(byte[] a, short[] b, int[] c, long[] d) {
1050         for (int i = 0; i < RANGE; i++) {
1051             a[i]++;
1052             b[i]++;
1053             c[i]++;
1054             d[i]++;
1055         }
1056         return new Object[]{ a, b, c, d };
1057     }
1058 
1059     @Test
1060     @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE + "min(max_int, max_long)", "> 0",
1061                   IRNode.LOAD_VECTOR_L, IRNode.VECTOR_SIZE + "min(max_int, max_long)", "> 0",
1062                   IRNode.ADD_VI, IRNode.VECTOR_SIZE + "min(max_int, max_long)", "> 0",
1063                   IRNode.ADD_VL, IRNode.VECTOR_SIZE + "min(max_int, max_long)", "> 0",
1064                   IRNode.STORE_VECTOR, "> 0"},
1065         applyIfPlatform = {"64-bit", "true"},
1066         applyIfCPUFeatureOr = {"avx2", "true"})
1067     // require avx2 (see applyIfCPUFeatureOr above) to ensure vectors are larger than what unrolling produces
         // Mixed-type kernel: an int[] and a long[] are incremented element-wise in the same
         // unit-stride loop, here starting at i = 1 (non-zero init). The rule expects int and long
         // vector nodes that share the element count min(max_int, max_long).
         // Mutates and returns { a, b }.
1068     static Object[] test13bIL(int[] a, long[] b) {
1069         for (int i = 1; i < RANGE; i++) {
1070             a[i]++;
1071             b[i]++;
1072         }
1073         return new Object[]{ a, b };
1074     }
1075 
1076     @Test
1077     @IR(counts = {IRNode.LOAD_VECTOR_B, "> 0",
1078                   IRNode.LOAD_VECTOR_I, "> 0",
1079                   IRNode.ADD_VB, "> 0",
1080                   IRNode.ADD_VI, "> 0",
1081                   IRNode.STORE_VECTOR, "> 0"},
1082         applyIf = {"UseCompactObjectHeaders", "false"},
1083         applyIfPlatform = {"64-bit", "true"},
1084         applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true"})
         // Mixed-type kernel: an int[] and a byte[] are incremented element-wise in the same
         // unit-stride loop, here starting at i = 1 (non-zero init). The rule expects both byte and
         // int vector nodes and is only applied when UseCompactObjectHeaders=false.
         // Mutates and returns { a, b }.
1085     static Object[] test13bIB(int[] a, byte[] b) {
1086         for (int i = 1; i < RANGE; i++) {
1087             a[i]++;
1088             b[i]++;
1089         }
1090         return new Object[]{ a, b };
1091     }
1092 
1093     @Test
1094     @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0",
1095                   IRNode.LOAD_VECTOR_S, "> 0",
1096                   IRNode.ADD_VI, "> 0",
1097                   IRNode.ADD_VS, "> 0",
1098                   IRNode.STORE_VECTOR, "> 0"},
1099         applyIf = {"UseCompactObjectHeaders", "false"},
1100         applyIfPlatform = {"64-bit", "true"},
1101         applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true"})
         // Mixed-type kernel: an int[] and a short[] are incremented element-wise in the same
         // unit-stride loop, here starting at i = 1 (non-zero init). The rule expects both short and
         // int vector nodes and is only applied when UseCompactObjectHeaders=false.
         // Mutates and returns { a, b }.
1102     static Object[] test13bIS(int[] a, short[] b) {
1103         for (int i = 1; i < RANGE; i++) {
1104             a[i]++;
1105             b[i]++;
1106         }
1107         return new Object[]{ a, b };
1108     }
1109 
1110     @Test
1111     @IR(counts = {IRNode.LOAD_VECTOR_B, "> 0",
1112                   IRNode.LOAD_VECTOR_S, "> 0",
1113                   IRNode.LOAD_VECTOR_I, "> 0",
1114                   IRNode.LOAD_VECTOR_L, "> 0",
1115                   IRNode.ADD_VB, "> 0",
1116                   IRNode.ADD_VS, "> 0",
1117                   IRNode.ADD_VI, "> 0",
1118                   IRNode.ADD_VL, "> 0",
1119                   IRNode.STORE_VECTOR, "> 0"},
1120         applyIf = {"UseCompactObjectHeaders", "false"},
1121         applyIfPlatform = {"64-bit", "true"},
1122         applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true"})
         // Mixed-type kernel over four arrays (byte, short, int, long), all incremented at the same
         // index in one unit-stride loop, here starting at i = 1 (non-zero init). The rule expects
         // vector nodes for all four element types and is only applied when
         // UseCompactObjectHeaders=false. Mutates and returns { a, b, c, d }.
1123     static Object[] test13bBSIL(byte[] a, short[] b, int[] c, long[] d) {
1124         for (int i = 1; i < RANGE; i++) {
1125             a[i]++;
1126             b[i]++;
1127             c[i]++;
1128             d[i]++;
1129         }
1130         return new Object[]{ a, b, c, d };
1131     }
1132 
1133     @Test
1134     @IR(counts = {IRNode.LOAD_VECTOR_B, "> 0",
1135                   IRNode.ADD_VB, "> 0",
1136                   IRNode.STORE_VECTOR, "> 0"},
1137         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
1138         applyIfPlatform = {"64-bit", "true"},
1139         applyIf = {"AlignVector", "false"})
1140     @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
1141                   IRNode.ADD_VB, "= 0",
1142                   IRNode.STORE_VECTOR, "= 0"},
1143         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
1144         applyIfPlatform = {"64-bit", "true"},
1145         applyIf = {"AlignVector", "true"})
         // Byte kernel with stride 9: each iteration increments the 16 adjacent elements
         // a[i+0..i+15]. Because the stride (9) is smaller than 16, consecutive iterations touch
         // overlapping elements. Rules above: vector nodes expected when AlignVector=false, none
         // when AlignVector=true. Mutates and returns { a }.
1146     static Object[] test14aB(byte[] a) {
1147         // non-power-of-2 stride
1148         for (int i = 0; i < RANGE-20; i+=9) {
1149             a[i+0]++;
1150             a[i+1]++;
1151             a[i+2]++;
1152             a[i+3]++;
1153             a[i+4]++;
1154             a[i+5]++;
1155             a[i+6]++;
1156             a[i+7]++;
1157             a[i+8]++;
1158             a[i+9]++;
1159             a[i+10]++;
1160             a[i+11]++;
1161             a[i+12]++;
1162             a[i+13]++;
1163             a[i+14]++;
1164             a[i+15]++;
1165         }
1166         return new Object[]{ a };
1167     }
1168 
1169     @Test
1170     @IR(counts = {IRNode.LOAD_VECTOR_B, "> 0",
1171                   IRNode.ADD_VB, "> 0",
1172                   IRNode.STORE_VECTOR, "> 0"},
1173         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
1174         applyIfPlatform = {"64-bit", "true"},
1175         applyIf = {"AlignVector", "false"})
1176     @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
1177                   IRNode.ADD_VB, "= 0",
1178                   IRNode.STORE_VECTOR, "= 0"},
1179         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
1180         applyIfPlatform = {"64-bit", "true"},
1181         applyIf = {"AlignVector", "true"})
         // Byte kernel with stride 3: each iteration increments the 16 adjacent elements
         // a[i+0..i+15]. Because the stride (3) is smaller than 16, consecutive iterations touch
         // overlapping elements. Rules above: vector nodes expected when AlignVector=false, none
         // when AlignVector=true. Mutates and returns { a }.
1182     static Object[] test14bB(byte[] a) {
1183         // non-power-of-2 stride
1184         for (int i = 0; i < RANGE-20; i+=3) {
1185             a[i+0]++;
1186             a[i+1]++;
1187             a[i+2]++;
1188             a[i+3]++;
1189             a[i+4]++;
1190             a[i+5]++;
1191             a[i+6]++;
1192             a[i+7]++;
1193             a[i+8]++;
1194             a[i+9]++;
1195             a[i+10]++;
1196             a[i+11]++;
1197             a[i+12]++;
1198             a[i+13]++;
1199             a[i+14]++;
1200             a[i+15]++;
1201         }
1202         return new Object[]{ a };
1203     }
1204 
1205     @Test
1206     @IR(counts = {IRNode.LOAD_VECTOR_B, "> 0",
1207                   IRNode.ADD_VB, "> 0",
1208                   IRNode.STORE_VECTOR, "> 0"},
1209         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
1210         applyIfPlatform = {"64-bit", "true"},
1211         applyIf = {"AlignVector", "false"})
1212     @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
1213                   IRNode.ADD_VB, "= 0",
1214                   IRNode.STORE_VECTOR, "= 0"},
1215         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
1216         applyIfPlatform = {"64-bit", "true"},
1217         applyIf = {"AlignVector", "true"})
         // Byte kernel with stride 5: each iteration increments the 16 adjacent elements
         // a[i+0..i+15]. Because the stride (5) is smaller than 16, consecutive iterations touch
         // overlapping elements. Rules above: vector nodes expected when AlignVector=false, none
         // when AlignVector=true. Mutates and returns { a }.
1218     static Object[] test14cB(byte[] a) {
1219         // non-power-of-2 stride
1220         for (int i = 0; i < RANGE-20; i+=5) {
1221             a[i+0]++;
1222             a[i+1]++;
1223             a[i+2]++;
1224             a[i+3]++;
1225             a[i+4]++;
1226             a[i+5]++;
1227             a[i+6]++;
1228             a[i+7]++;
1229             a[i+8]++;
1230             a[i+9]++;
1231             a[i+10]++;
1232             a[i+11]++;
1233             a[i+12]++;
1234             a[i+13]++;
1235             a[i+14]++;
1236             a[i+15]++;
1237         }
1238         return new Object[]{ a };
1239     }
1240 
1241     @Test
1242     // IR rules difficult because of modulo wrapping with offset after peeling.
         // Byte kernel with unit stride and index scale 53: each iteration increments the 16
         // adjacent elements starting at 53*i. No @IR rule is attached, so only the returned array
         // is available for checking (how the caller verifies it is outside this chunk).
         // Mutates and returns { a }.
1243     static Object[] test15aB(byte[] a) {
1244         // non-power-of-2 scale
1245         for (int i = 0; i < RANGE/64-20; i++) {
1246             a[53*i+0]++;
1247             a[53*i+1]++;
1248             a[53*i+2]++;
1249             a[53*i+3]++;
1250             a[53*i+4]++;
1251             a[53*i+5]++;
1252             a[53*i+6]++;
1253             a[53*i+7]++;
1254             a[53*i+8]++;
1255             a[53*i+9]++;
1256             a[53*i+10]++;
1257             a[53*i+11]++;
1258             a[53*i+12]++;
1259             a[53*i+13]++;
1260             a[53*i+14]++;
1261             a[53*i+15]++;
1262         }
1263         return new Object[]{ a };
1264     }
1265 
1266     @Test
1267     // IR rules difficult because of modulo wrapping with offset after peeling.
         // Byte kernel with unit stride and index scale 25: each iteration increments the 16
         // adjacent elements starting at 25*i. No @IR rule is attached, so only the returned array
         // is available for checking (how the caller verifies it is outside this chunk).
         // Mutates and returns { a }.
1268     static Object[] test15bB(byte[] a) {
1269         // non-power-of-2 scale
1270         for (int i = 0; i < RANGE/64-20; i++) {
1271             a[25*i+0]++;
1272             a[25*i+1]++;
1273             a[25*i+2]++;
1274             a[25*i+3]++;
1275             a[25*i+4]++;
1276             a[25*i+5]++;
1277             a[25*i+6]++;
1278             a[25*i+7]++;
1279             a[25*i+8]++;
1280             a[25*i+9]++;
1281             a[25*i+10]++;
1282             a[25*i+11]++;
1283             a[25*i+12]++;
1284             a[25*i+13]++;
1285             a[25*i+14]++;
1286             a[25*i+15]++;
1287         }
1288         return new Object[]{ a };
1289     }
1290 
1291     @Test
1292     // IR rules difficult because of modulo wrapping with offset after peeling.
         // Byte kernel with unit stride and index scale 19: each iteration increments the 16
         // adjacent elements starting at 19*i. No @IR rule is attached, so only the returned array
         // is available for checking (how the caller verifies it is outside this chunk).
         // Mutates and returns { a }.
1293     static Object[] test15cB(byte[] a) {
1294         // non-power-of-2 scale
1295         for (int i = 0; i < RANGE/64-20; i++) {
1296             a[19*i+0]++;
1297             a[19*i+1]++;
1298             a[19*i+2]++;
1299             a[19*i+3]++;
1300             a[19*i+4]++;
1301             a[19*i+5]++;
1302             a[19*i+6]++;
1303             a[19*i+7]++;
1304             a[19*i+8]++;
1305             a[19*i+9]++;
1306             a[19*i+10]++;
1307             a[19*i+11]++;
1308             a[19*i+12]++;
1309             a[19*i+13]++;
1310             a[19*i+14]++;
1311             a[19*i+15]++;
1312         }
1313         return new Object[]{ a };
1314     }
1315 
1316     @Test
         // Kernel with index scale 2 over two arrays of different element types: per iteration it
         // increments 15 adjacent bytes a[2*i+0..2*i+14] and 4 adjacent shorts b[2*i+0..2*i+3].
         // No @IR rule is attached. Mutates and returns { a, b }.
         // NOTE(review): "infinite loop issues" presumably refers to a past compiler hang this
         // shape reproduced; the original report is not identifiable from this chunk.
1317     static Object[] test16a(byte[] a, short[] b) {
1318         // infinite loop issues
1319         for (int i = 0; i < RANGE/2-20; i++) {
1320             a[2*i+0]++;
1321             a[2*i+1]++;
1322             a[2*i+2]++;
1323             a[2*i+3]++;
1324             a[2*i+4]++;
1325             a[2*i+5]++;
1326             a[2*i+6]++;
1327             a[2*i+7]++;
1328             a[2*i+8]++;
1329             a[2*i+9]++;
1330             a[2*i+10]++;
1331             a[2*i+11]++;
1332             a[2*i+12]++;
1333             a[2*i+13]++;
1334             a[2*i+14]++;
1335 
1336             b[2*i+0]++;
1337             b[2*i+1]++;
1338             b[2*i+2]++;
1339             b[2*i+3]++;
1340         }
1341         return new Object[]{ a, b };
1342     }
1343 
1344     @Test
         // Byte-only kernel with index scale 2: per iteration it increments the 15 adjacent bytes
         // a[2*i+0..2*i+14]. No @IR rule is attached. Mutates and returns { a }.
         // NOTE(review): "infinite loop issues" presumably refers to a past compiler hang this
         // shape reproduced; the original report is not identifiable from this chunk.
1345     static Object[] test16b(byte[] a) {
1346         // infinite loop issues
1347         for (int i = 0; i < RANGE/2-20; i++) {
1348             a[2*i+0]++;
1349             a[2*i+1]++;
1350             a[2*i+2]++;
1351             a[2*i+3]++;
1352             a[2*i+4]++;
1353             a[2*i+5]++;
1354             a[2*i+6]++;
1355             a[2*i+7]++;
1356             a[2*i+8]++;
1357             a[2*i+9]++;
1358             a[2*i+10]++;
1359             a[2*i+11]++;
1360             a[2*i+12]++;
1361             a[2*i+13]++;
1362             a[2*i+14]++;
1363         }
1364         return new Object[]{ a };
1365     }
1366 
1367     @Test
1368     @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0",
1369                   IRNode.ADD_VL, "> 0",
1370                   IRNode.STORE_VECTOR, "> 0"},
1371         applyIfPlatform = {"64-bit", "true"},
1372         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
         // Increments every long of 'a' through Unsafe unaligned accessors instead of array
         // indexing. The rule above expects long vector nodes, with no AlignVector condition.
         // Mutates and returns { a }.
1373     static Object[] test17a(long[] a) {
1374         // Unsafe: vectorizes with profiling (not xcomp)
1375         for (int i = 0; i < RANGE; i++) {
                 // Byte address of element i: array base offset plus 8 bytes per long.
1376             int adr = UNSAFE.ARRAY_LONG_BASE_OFFSET + 8 * i;
1377             long v = UNSAFE.getLongUnaligned(a, adr);
1378             UNSAFE.putLongUnaligned(a, adr, v + 1);
1379         }
1380         return new Object[]{ a };
1381     }
1382 
1383     @Test
1384     // Difficult to write good IR rule. Modulo calculus overflow can create non-power-of-2 packs.
         // Same Unsafe read-increment-write loop over longs, but every address is shifted by one
         // extra byte, so each 8-byte access straddles two array elements. The loop bound is
         // RANGE-1 so the last shifted access stays within RANGE elements. No @IR rule is attached.
         // Mutates and returns { a }.
1385     static Object[] test17b(long[] a) {
1386         // Not alignable
1387         for (int i = 0; i < RANGE-1; i++) {
                 // "+ 1": one byte past the start of element i.
1388             int adr = UNSAFE.ARRAY_LONG_BASE_OFFSET + 8 * i + 1;
1389             long v = UNSAFE.getLongUnaligned(a, adr);
1390             UNSAFE.putLongUnaligned(a, adr, v + 1);
1391         }
1392         return new Object[]{ a };
1393     }
1394 
1395     @Test
1396     @IR(counts = {IRNode.LOAD_VECTOR_L, IRNode.VECTOR_SIZE_2, "> 0",
1397                   IRNode.ADD_VL,        IRNode.VECTOR_SIZE_2, "> 0",
1398                   IRNode.STORE_VECTOR, "> 0"},
1399         applyIf = {"MaxVectorSize", ">=32"},
1400         applyIfPlatform = {"64-bit", "true"},
1401         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
         // Unsafe kernel with stride 4 (in long elements): each iteration increments the two
         // adjacent longs at elements i and i+1 and skips elements i+2 and i+3.
         // The rule above expects 2-element long vectors when MaxVectorSize>=32.
         // Mutates and returns { a }.
1402     static Object[] test17c(long[] a) {
1403         // Unsafe: aligned vectorizes
1404         for (int i = 0; i < RANGE-1; i+=4) {
1405             int adr = UNSAFE.ARRAY_LONG_BASE_OFFSET + 8 * i;
                 // Both loads are issued before either store.
1406             long v0 = UNSAFE.getLongUnaligned(a, adr + 0);
1407             long v1 = UNSAFE.getLongUnaligned(a, adr + 8);
1408             UNSAFE.putLongUnaligned(a, adr + 0, v0 + 1);
1409             UNSAFE.putLongUnaligned(a, adr + 8, v1 + 1);
1410         }
1411         return new Object[]{ a };
1412     }
1413 
1414     @Test
1415     @IR(counts = {IRNode.LOAD_VECTOR_L, IRNode.VECTOR_SIZE_2, "> 0",
1416                   IRNode.ADD_VL,        IRNode.VECTOR_SIZE_2, "> 0",
1417                   IRNode.STORE_VECTOR, "> 0"},
1418         applyIfCPUFeatureOr = {"avx512", "true", "asimd", "true"},
1419         applyIfPlatform = {"64-bit", "true"},
1420         applyIfAnd = {"AlignVector", "false", "MaxVectorSize", ">=64"})
1421     // Ensure vector width is large enough to fit 64 byte for longs:
1422     // The offsets are: 25, 33, 57, 65
1423     // In modulo 32:    25,  1, 25,  1  -> does not vectorize
1424     // In modulo 64:    25, 33, 57,  1  -> at least first pair vectorizes
1425     // This problem is because we compute modulo vector width in memory_alignment.
1426     @IR(counts = {IRNode.LOAD_VECTOR_L, "= 0",
1427                   IRNode.ADD_VL, "= 0",
1428                   IRNode.STORE_VECTOR, "= 0"},
1429         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
1430         applyIfPlatform = {"64-bit", "true"},
1431         applyIf = {"AlignVector", "true"})
         // Unsafe kernel with stride 4 (in long elements) where every address is shifted by one
         // extra byte: each iteration increments two adjacent 8-byte values starting one byte past
         // element i. Rules above: 2-element long vectors expected when AlignVector=false and
         // MaxVectorSize>=64 (avx512 or asimd); no vector nodes when AlignVector=true.
         // Mutates and returns { a }.
1432     static Object[] test17d(long[] a) {
1433         // Not alignable
1434         for (int i = 0; i < RANGE-1; i+=4) {
1435             int adr = UNSAFE.ARRAY_LONG_BASE_OFFSET + 8 * i + 1;
                 // Both loads are issued before either store.
1436             long v0 = UNSAFE.getLongUnaligned(a, adr + 0);
1437             long v1 = UNSAFE.getLongUnaligned(a, adr + 8);
1438             UNSAFE.putLongUnaligned(a, adr + 0, v0 + 1);
1439             UNSAFE.putLongUnaligned(a, adr + 8, v1 + 1);
1440         }
1441         return new Object[]{ a };
1442     }
1443 
1444     @Test
         // Loop mixing constant-index byte stores (a[0], a[1]) with an iv-indexed int store (b[i]).
         // The byte stores do not depend on the loop variable at all. No @IR rule is attached.
         // Mutates and returns { a, b }.
1445     static Object[] test18a(byte[] a, int[] b) {
1446         // scale = 0  -->  no iv
1447         for (int i = 0; i < RANGE; i++) {
1448             a[0] = 1;
1449             b[i] = 2;
1450             a[1] = 1;
1451         }
1452         return new Object[]{ a, b };
1453     }
1454 
1455     @Test
         // Loop mixing constant-index byte stores with an iv-indexed int store (b[i]); here the
         // constant byte indices are 1 and 2. The byte stores do not depend on the loop variable.
         // No @IR rule is attached. Mutates and returns { a, b }.
1456     static Object[] test18b(byte[] a, int[] b) {
1457         // scale = 0  -->  no iv
1458         for (int i = 0; i < RANGE; i++) {
1459             a[1] = 1;
1460             b[i] = 2;
1461             a[2] = 1;
1462         }
1463         return new Object[]{ a, b };
1464     }
1465 
1466     @Test
         // Element-wise int copy from b to a driven by a down-counting loop variable.
         // i runs from 5000 down to 1, so the accessed index RANGE_FINAL - i increases by one per
         // iteration, covering RANGE_FINAL-5000 .. RANGE_FINAL-1. No @IR rule is attached.
         // Writes a, reads b; returns { a, b }.
1467     static Object[] test19(int[] a, int[] b) {
1468         for (int i = 5000; i > 0; i--) {
1469             a[RANGE_FINAL - i] = b[RANGE_FINAL - i];
1470         }
1471         return new Object[]{ a, b };
1472     }
1473 
1474     @Test
         // Byte kernel with init = 1, index scale 2 and a constant offset of 30: each iteration
         // increments the four adjacent bytes a[2*i+30 .. 2*i+33], so consecutive iterations
         // overlap by two elements. No @IR rule is attached. Mutates and returns { a }.
1475     static Object[] test20(byte[] a) {
1476         // Example where it is easy to pass alignment check,
1477         // but used to fail the alignment calculation
1478         for (int i = 1; i < RANGE/2-50; i++) {
1479             a[2*i+0+30]++;
1480             a[2*i+1+30]++;
1481             a[2*i+2+30]++;
1482             a[2*i+3+30]++;
1483         }
1484         return new Object[]{ a };
1485     }
1486 }