1 /*
   2  * Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  */
  23 
  24 package compiler.loopopts.superword;
  25 
  26 import compiler.lib.ir_framework.*;
  27 import jdk.test.lib.Utils;
  28 import jdk.test.whitebox.WhiteBox;
  29 import jdk.internal.misc.Unsafe;
  30 import java.lang.reflect.Array;
  31 import java.util.Map;
  32 import java.util.HashMap;
  33 import java.util.Random;
  34 import java.nio.ByteOrder;
  35 
  36 /*
  37  * @test id=NoAlignVector
  38  * @bug 8310190
  39  * @summary Test AlignVector with various loop init, stride, scale, invar, etc.
  40  * @modules java.base/jdk.internal.misc
  41  * @library /test/lib /
  42  * @run driver compiler.loopopts.superword.TestAlignVector NoAlignVector
  43  */
  44 
  45 /*
  46  * @test id=AlignVector
  47  * @bug 8310190
  48  * @summary Test AlignVector with various loop init, stride, scale, invar, etc.
  49  * @modules java.base/jdk.internal.misc
  50  * @library /test/lib /
  51  * @run driver compiler.loopopts.superword.TestAlignVector AlignVector
  52  */
  53 
  54 /*
  55  * @test id=VerifyAlignVector
  56  * @bug 8310190
  57  * @summary Test AlignVector with various loop init, stride, scale, invar, etc.
  58  * @modules java.base/jdk.internal.misc
  59  * @library /test/lib /
  60  * @run driver compiler.loopopts.superword.TestAlignVector VerifyAlignVector
  61  */
  62 
  63 public class TestAlignVector {
  64     static int RANGE = 1024*8;
  65     static int RANGE_FINAL = 1024*8;
  66     private static final Unsafe UNSAFE = Unsafe.getUnsafe();
  67     private static final Random RANDOM = Utils.getRandomInstance();
  68 
    // Inputs: generated once in the constructor. Every registered test works on
    // clones of these arrays, so they are never modified after construction.
    // The m* fields are the mask operands passed to the kernels.
    byte[] aB;
    byte[] bB;
    byte mB = (byte)31;
    short[] aS;
    short[] bS;
    short mS = (short)0xF0F0;
    int[] aI;
    int[] bI;
    int mI = 0xF0F0F0F0;
    long[] aL;
    long[] bL;
    long mL = 0xF0F0F0F0F0F0F0F0L;

    // Registered tests: name -> closure that runs one kernel on fresh input clones.
    Map<String,TestFunction> tests = new HashMap<String,TestFunction>();

    // Gold results: name -> output of the first run of that test, recorded in the
    // constructor (intended to be before the kernel gets compiled).
    Map<String,Object[]> golds = new HashMap<String,Object[]>();
  88 
    /**
     * One registered test: runs a kernel and returns its output arrays,
     * which {@code verify} compares element by element against the gold run.
     */
    interface TestFunction {
        /** @return the arrays produced (or passed through) by the kernel */
        Object[] run();
    }
  92 
  93     public static void main(String[] args) {
  94         TestFramework framework = new TestFramework(TestAlignVector.class);
  95         framework.addFlags("--add-modules", "java.base", "--add-exports", "java.base/jdk.internal.misc=ALL-UNNAMED",
  96                            "-XX:+IgnoreUnrecognizedVMOptions", "-XX:LoopUnrollLimit=250");
  97 
  98         switch (args[0]) {
  99             case "NoAlignVector"     -> { framework.addFlags("-XX:-AlignVector"); }
 100             case "AlignVector"       -> { framework.addFlags("-XX:+AlignVector"); }
 101             case "VerifyAlignVector" -> { framework.addFlags("-XX:+AlignVector", "-XX:+IgnoreUnrecognizedVMOptions", "-XX:+VerifyAlignVector"); }
 102             default -> { throw new RuntimeException("Test argument not recognized: " + args[0]); }
 103         }
 104         framework.start();
 105     }
 106 
 107     public TestAlignVector() {
 108         // Generate input once
 109         aB = generateB();
 110         bB = generateB();
 111         aS = generateS();
 112         bS = generateS();
 113         aI = generateI();
 114         bI = generateI();
 115         aL = generateL();
 116         bL = generateL();
 117 
 118         // Add all tests to list
 119         tests.put("test0",       () -> { return test0(aB.clone(), bB.clone(), mB); });
 120         tests.put("test1",       () -> { return test1(aB.clone(), bB.clone(), mB); });
 121         tests.put("test2",       () -> { return test2(aB.clone(), bB.clone(), mB); });
 122         tests.put("test3",       () -> { return test3(aB.clone(), bB.clone(), mB); });
 123         tests.put("test4",       () -> { return test4(aB.clone(), bB.clone(), mB); });
 124         tests.put("test5",       () -> { return test5(aB.clone(), bB.clone(), mB, 0); });
 125         tests.put("test6",       () -> { return test6(aB.clone(), bB.clone(), mB); });
 126         tests.put("test7",       () -> { return test7(aS.clone(), bS.clone(), mS); });
 127         tests.put("test8",       () -> { return test8(aB.clone(), bB.clone(), mB, 0); });
 128         tests.put("test8",       () -> { return test8(aB.clone(), bB.clone(), mB, 1); });
 129         tests.put("test9",       () -> { return test9(aB.clone(), bB.clone(), mB); });
 130 
 131         tests.put("test10a",     () -> { return test10a(aB.clone(), bB.clone(), mB); });
 132         tests.put("test10b",     () -> { return test10b(aB.clone(), bB.clone(), mB); });
 133         tests.put("test10c",     () -> { return test10c(aS.clone(), bS.clone(), mS); });
 134         tests.put("test10d",     () -> { return test10d(aS.clone(), bS.clone(), mS); });
 135 
 136         tests.put("test11aB",    () -> { return test11aB(aB.clone(), bB.clone(), mB); });
 137         tests.put("test11aS",    () -> { return test11aS(aS.clone(), bS.clone(), mS); });
 138         tests.put("test11aI",    () -> { return test11aI(aI.clone(), bI.clone(), mI); });
 139         tests.put("test11aL",    () -> { return test11aL(aL.clone(), bL.clone(), mL); });
 140 
 141         tests.put("test11bB",    () -> { return test11bB(aB.clone(), bB.clone(), mB); });
 142         tests.put("test11bS",    () -> { return test11bS(aS.clone(), bS.clone(), mS); });
 143         tests.put("test11bI",    () -> { return test11bI(aI.clone(), bI.clone(), mI); });
 144         tests.put("test11bL",    () -> { return test11bL(aL.clone(), bL.clone(), mL); });
 145 
 146         tests.put("test11cB",    () -> { return test11cB(aB.clone(), bB.clone(), mB); });
 147         tests.put("test11cS",    () -> { return test11cS(aS.clone(), bS.clone(), mS); });
 148         tests.put("test11cI",    () -> { return test11cI(aI.clone(), bI.clone(), mI); });
 149         tests.put("test11cL",    () -> { return test11cL(aL.clone(), bL.clone(), mL); });
 150 
 151         tests.put("test11dB",    () -> { return test11dB(aB.clone(), bB.clone(), mB, 0); });
 152         tests.put("test11dS",    () -> { return test11dS(aS.clone(), bS.clone(), mS, 0); });
 153         tests.put("test11dI",    () -> { return test11dI(aI.clone(), bI.clone(), mI, 0); });
 154         tests.put("test11dL",    () -> { return test11dL(aL.clone(), bL.clone(), mL, 0); });
 155 
 156         tests.put("test12",      () -> { return test12(aB.clone(), bB.clone(), mB); });
 157 
 158         tests.put("test13aIL",   () -> { return test13aIL(aI.clone(), aL.clone()); });
 159         tests.put("test13aIB",   () -> { return test13aIB(aI.clone(), aB.clone()); });
 160         tests.put("test13aIS",   () -> { return test13aIS(aI.clone(), aS.clone()); });
 161         tests.put("test13aBSIL", () -> { return test13aBSIL(aB.clone(), aS.clone(), aI.clone(), aL.clone()); });
 162 
 163         tests.put("test13bIL",   () -> { return test13bIL(aI.clone(), aL.clone()); });
 164         tests.put("test13bIB",   () -> { return test13bIB(aI.clone(), aB.clone()); });
 165         tests.put("test13bIS",   () -> { return test13bIS(aI.clone(), aS.clone()); });
 166         tests.put("test13bBSIL", () -> { return test13bBSIL(aB.clone(), aS.clone(), aI.clone(), aL.clone()); });
 167 
 168         tests.put("test14aB",    () -> { return test14aB(aB.clone()); });
 169         tests.put("test14bB",    () -> { return test14bB(aB.clone()); });
 170         tests.put("test14cB",    () -> { return test14cB(aB.clone()); });
 171 
 172         tests.put("test15aB",    () -> { return test15aB(aB.clone()); });
 173         tests.put("test15bB",    () -> { return test15bB(aB.clone()); });
 174         tests.put("test15cB",    () -> { return test15cB(aB.clone()); });
 175 
 176         tests.put("test16a",     () -> { return test16a(aB.clone(), aS.clone()); });
 177         tests.put("test16b",     () -> { return test16b(aB.clone()); });
 178 
 179         tests.put("test17a",     () -> { return test17a(aL.clone()); });
 180         tests.put("test17b",     () -> { return test17b(aL.clone()); });
 181         tests.put("test17c",     () -> { return test17c(aL.clone()); });
 182         tests.put("test17d",     () -> { return test17d(aL.clone()); });
 183 
 184         tests.put("test18a",     () -> { return test18a(aB.clone(), aI.clone()); });
 185         tests.put("test18b",     () -> { return test18b(aB.clone(), aI.clone()); });
 186 
 187         tests.put("test19",      () -> { return test19(aI.clone(), bI.clone()); });
 188         tests.put("test20",      () -> { return test20(aB.clone()); });
 189 
 190         // Compute gold value for all test methods before compilation
 191         for (Map.Entry<String,TestFunction> entry : tests.entrySet()) {
 192             String name = entry.getKey();
 193             TestFunction test = entry.getValue();
 194             Object[] gold = test.run();
 195             golds.put(name, gold);
 196         }
 197     }
 198 
 199     @Warmup(100)
 200     @Run(test = {"test0",
 201                  "test1",
 202                  "test2",
 203                  "test3",
 204                  "test4",
 205                  "test5",
 206                  "test6",
 207                  "test7",
 208                  "test8",
 209                  "test9",
 210                  "test10a",
 211                  "test10b",
 212                  "test10c",
 213                  "test10d",
 214                  "test11aB",
 215                  "test11aS",
 216                  "test11aI",
 217                  "test11aL",
 218                  "test11bB",
 219                  "test11bS",
 220                  "test11bI",
 221                  "test11bL",
 222                  "test11cB",
 223                  "test11cS",
 224                  "test11cI",
 225                  "test11cL",
 226                  "test11dB",
 227                  "test11dS",
 228                  "test11dI",
 229                  "test11dL",
 230                  "test12",
 231                  "test13aIL",
 232                  "test13aIB",
 233                  "test13aIS",
 234                  "test13aBSIL",
 235                  "test13bIL",
 236                  "test13bIB",
 237                  "test13bIS",
 238                  "test13bBSIL",
 239                  "test14aB",
 240                  "test14bB",
 241                  "test14cB",
 242                  "test15aB",
 243                  "test15bB",
 244                  "test15cB",
 245                  "test16a",
 246                  "test16b",
 247                  "test17a",
 248                  "test17b",
 249                  "test17c",
 250                  "test17d",
 251                  "test18a",
 252                  "test18b",
 253                  "test19",
 254                  "test20"})
 255     public void runTests() {
 256         for (Map.Entry<String,TestFunction> entry : tests.entrySet()) {
 257             String name = entry.getKey();
 258             TestFunction test = entry.getValue();
 259             // Recall gold value from before compilation
 260             Object[] gold = golds.get(name);
 261             // Compute new result
 262             Object[] result = test.run();
 263             // Compare gold and new result
 264             verify(name, gold, result);
 265         }
 266     }
 267 
 268     static byte[] generateB() {
 269         byte[] a = new byte[RANGE];
 270         for (int i = 0; i < a.length; i++) {
 271             a[i] = (byte)RANDOM.nextInt();
 272         }
 273         return a;
 274     }
 275 
 276     static short[] generateS() {
 277         short[] a = new short[RANGE];
 278         for (int i = 0; i < a.length; i++) {
 279             a[i] = (short)RANDOM.nextInt();
 280         }
 281         return a;
 282     }
 283 
 284     static int[] generateI() {
 285         int[] a = new int[RANGE];
 286         for (int i = 0; i < a.length; i++) {
 287             a[i] = RANDOM.nextInt();
 288         }
 289         return a;
 290     }
 291 
 292     static long[] generateL() {
 293         long[] a = new long[RANGE];
 294         for (int i = 0; i < a.length; i++) {
 295             a[i] = RANDOM.nextLong();
 296         }
 297         return a;
 298     }
 299 
 300     static void verify(String name, Object[] gold, Object[] result) {
 301         if (gold.length != result.length) {
 302             throw new RuntimeException("verify " + name + ": not the same number of outputs: gold.length = " +
 303                                        gold.length + ", result.length = " + result.length);
 304         }
 305         for (int i = 0; i < gold.length; i++) {
 306             Object g = gold[i];
 307             Object r = result[i];
 308             if (g.getClass() != r.getClass() || !g.getClass().isArray() || !r.getClass().isArray()) {
 309                 throw new RuntimeException("verify " + name + ": must both be array of same type:" +
 310                                            " gold[" + i + "].getClass() = " + g.getClass().getSimpleName() +
 311                                            " result[" + i + "].getClass() = " + r.getClass().getSimpleName());
 312             }
 313             if (g == r) {
 314                 throw new RuntimeException("verify " + name + ": should be two separate arrays (with identical content):" +
 315                                            " gold[" + i + "] == result[" + i + "]");
 316             }
 317             if (Array.getLength(g) != Array.getLength(r)) {
 318                     throw new RuntimeException("verify " + name + ": arrays must have same length:" +
 319                                            " gold[" + i + "].length = " + Array.getLength(g) +
 320                                            " result[" + i + "].length = " + Array.getLength(r));
 321             }
 322             Class c = g.getClass().getComponentType();
 323             if (c == byte.class) {
 324                 verifyB(name, i, (byte[])g, (byte[])r);
 325             } else if (c == short.class) {
 326                 verifyS(name, i, (short[])g, (short[])r);
 327             } else if (c == int.class) {
 328                 verifyI(name, i, (int[])g, (int[])r);
 329             } else if (c == long.class) {
 330                 verifyL(name, i, (long[])g, (long[])r);
 331             } else {
 332                 throw new RuntimeException("verify " + name + ": array type not supported for verify:" +
 333                                        " gold[" + i + "].getClass() = " + g.getClass().getSimpleName() +
 334                                        " result[" + i + "].getClass() = " + r.getClass().getSimpleName());
 335             }
 336         }
 337     }
 338 
 339     static void verifyB(String name, int i, byte[] g, byte[] r) {
 340         for (int j = 0; j < g.length; j++) {
 341             if (g[j] != r[j]) {
 342                 throw new RuntimeException("verify " + name + ": arrays must have same content:" +
 343                                            " gold[" + i + "][" + j + "] = " + g[j] +
 344                                            " result[" + i + "][" + j + "] = " + r[j]);
 345             }
 346         }
 347     }
 348 
 349     static void verifyS(String name, int i, short[] g, short[] r) {
 350         for (int j = 0; j < g.length; j++) {
 351             if (g[j] != r[j]) {
 352                 throw new RuntimeException("verify " + name + ": arrays must have same content:" +
 353                                            " gold[" + i + "][" + j + "] = " + g[j] +
 354                                            " result[" + i + "][" + j + "] = " + r[j]);
 355             }
 356         }
 357     }
 358 
 359     static void verifyI(String name, int i, int[] g, int[] r) {
 360         for (int j = 0; j < g.length; j++) {
 361             if (g[j] != r[j]) {
 362                 throw new RuntimeException("verify " + name + ": arrays must have same content:" +
 363                                            " gold[" + i + "][" + j + "] = " + g[j] +
 364                                            " result[" + i + "][" + j + "] = " + r[j]);
 365             }
 366         }
 367     }
 368 
 369     static void verifyL(String name, int i, long[] g, long[] r) {
 370         for (int j = 0; j < g.length; j++) {
 371             if (g[j] != r[j]) {
 372                 throw new RuntimeException("verify " + name + ": arrays must have same content:" +
 373                                            " gold[" + i + "][" + j + "] = " + g[j] +
 374                                            " result[" + i + "][" + j + "] = " + r[j]);
 375             }
 376         }
 377     }
 378 
    /**
     * Byte kernel: in steps of 8, stores {@code a[i+k] AND mask} into {@code b[i+k]}
     * for k = 0..3; elements i+4..i+7 are not touched.
     * <p>
     * The IR rule (64-bit, sse4.1 or asimd, MaxVectorSize >= 8) requires 4-element
     * byte load and AND vectors plus a vector store. It does not depend on AlignVector.
     *
     * @param a    source array (only read)
     * @param b    destination array (written in place)
     * @param mask value ANDed with every processed element
     * @return {@code {a, b}}
     */
    @Test
    @IR(counts = {IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.AND_VB,        IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.STORE_VECTOR, "> 0"},
        applyIf = {"MaxVectorSize", ">=8"},
        applyIfPlatform = {"64-bit", "true"},
        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
    static Object[] test0(byte[] a, byte[] b, byte mask) {
        for (int i = 0; i < RANGE; i+=8) {
            // Safe to vectorize with AlignVector
            b[i+0] = (byte)(a[i+0] & mask); // offset 0, align 0
            b[i+1] = (byte)(a[i+1] & mask);
            b[i+2] = (byte)(a[i+2] & mask);
            b[i+3] = (byte)(a[i+3] & mask);
        }
        return new Object[]{ a, b };
    }
 396 
    /**
     * Byte kernel: in steps of 8, stores {@code a[i+k] AND mask} into {@code b[i+k]}
     * for all k = 0..7, i.e. every element below RANGE is processed.
     * <p>
     * The IR rule (64-bit, avx2 or asimd) requires byte load, AND and store vectors
     * of any size. It has no condition on AlignVector or MaxVectorSize.
     *
     * @param a    source array (only read)
     * @param b    destination array (written in place)
     * @param mask value ANDed with every processed element
     * @return {@code {a, b}}
     */
    @Test
    @IR(counts = {IRNode.LOAD_VECTOR_B, "> 0",
                  IRNode.AND_VB, "> 0",
                  IRNode.STORE_VECTOR, "> 0"},
        applyIfPlatform = {"64-bit", "true"},
        applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true"})
    static Object[] test1(byte[] a, byte[] b, byte mask) {
        for (int i = 0; i < RANGE; i+=8) {
            // Safe to vectorize with AlignVector
            b[i+0] = (byte)(a[i+0] & mask); // offset 0, align 0
            b[i+1] = (byte)(a[i+1] & mask);
            b[i+2] = (byte)(a[i+2] & mask);
            b[i+3] = (byte)(a[i+3] & mask);
            b[i+4] = (byte)(a[i+4] & mask);
            b[i+5] = (byte)(a[i+5] & mask);
            b[i+6] = (byte)(a[i+6] & mask);
            b[i+7] = (byte)(a[i+7] & mask);
        }
        return new Object[]{ a, b };
    }
 417 
    /**
     * Byte kernel: in steps of 8, stores {@code a[i+k] AND mask} into {@code b[i+k]}
     * for k = 3..6, so the 4-element group always starts 3 bytes into each 8-byte step.
     * <p>
     * IR rules (64-bit, sse4.1 or asimd): with AlignVector off and MaxVectorSize >= 8,
     * 4-element byte vectors are required; with AlignVector on, no byte load, AND or
     * store vector may be present.
     *
     * @param a    source array (only read)
     * @param b    destination array (written in place)
     * @param mask value ANDed with every processed element
     * @return {@code {a, b}}
     */
    @Test
    @IR(counts = {IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.AND_VB,        IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.STORE_VECTOR, "> 0"},
        applyIfAnd = {"AlignVector", "false", "MaxVectorSize", ">=8"},
        applyIfPlatform = {"64-bit", "true"},
        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
    @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
                  IRNode.AND_VB, "= 0",
                  IRNode.STORE_VECTOR, "= 0"},
        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
        applyIfPlatform = {"64-bit", "true"},
        applyIf = {"AlignVector", "true"})
    static Object[] test2(byte[] a, byte[] b, byte mask) {
        for (int i = 0; i < RANGE; i+=8) {
            // Cannot align with AlignVector: 3 + x * 8 % 8 = 3
            b[i+3] = (byte)(a[i+3] & mask); // at alignment 3
            b[i+4] = (byte)(a[i+4] & mask);
            b[i+5] = (byte)(a[i+5] & mask);
            b[i+6] = (byte)(a[i+6] & mask);
        }
        return new Object[]{ a, b };
    }
 441 
    /**
     * Byte kernel: like {@code test2} (group k = 3..6 per 8-byte step), plus a single
     * scalar access at k = 0 that is aligned while the 4-element group is not.
     * <p>
     * IR rules (64-bit, sse4.1 or asimd): with AlignVector off and MaxVectorSize >= 8,
     * 4-element byte vectors are required; with AlignVector on, no byte load, AND or
     * store vector may be present.
     *
     * @param a    source array (only read)
     * @param b    destination array (written in place)
     * @param mask value ANDed with every processed element
     * @return {@code {a, b}}
     */
    @Test
    @IR(counts = {IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.AND_VB,        IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.STORE_VECTOR, "> 0"},
        applyIfAnd = {"AlignVector", "false", "MaxVectorSize", ">=8"},
        applyIfPlatform = {"64-bit", "true"},
        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
    @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
                  IRNode.AND_VB, "= 0",
                  IRNode.STORE_VECTOR, "= 0"},
        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
        applyIfPlatform = {"64-bit", "true"},
        applyIf = {"AlignVector", "true"})
    static Object[] test3(byte[] a, byte[] b, byte mask) {
        for (int i = 0; i < RANGE; i+=8) {
            // Cannot align with AlignVector: 3 + x * 8 % 8 = 3

            // Problematic for AlignVector
            b[i+0] = (byte)(a[i+0] & mask); // best_memref, align 0

            b[i+3] = (byte)(a[i+3] & mask); // pack at offset 3 bytes
            b[i+4] = (byte)(a[i+4] & mask);
            b[i+5] = (byte)(a[i+5] & mask);
            b[i+6] = (byte)(a[i+6] & mask);
        }
        return new Object[]{ a, b };
    }
 469 
    /**
     * Byte kernel over 16-byte steps ({@code i*16}): processes a group of 4 elements
     * at offsets 0..3 and a group of 8 elements at offsets 5..12. Offsets 4 and
     * 13..15 are not touched.
     * <p>
     * IR rules (64-bit, sse4.1 or asimd, MaxVectorSize >= 16): with AlignVector off,
     * both 4-element and 8-element byte vectors are required; with AlignVector on,
     * the 4-element vectors are still required but no 8-element vector may be present.
     *
     * @param a    source array (only read)
     * @param b    destination array (written in place)
     * @param mask value ANDed with every processed element
     * @return {@code {a, b}}
     */
    @Test
    @IR(counts = {IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE_8, "> 0",
                  IRNode.AND_VB,        IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.AND_VB,        IRNode.VECTOR_SIZE_8, "> 0",
                  IRNode.STORE_VECTOR, "> 0"},
        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
        applyIfPlatform = {"64-bit", "true"},
        applyIfAnd = {"AlignVector", "false", "MaxVectorSize", ">=16"})
    @IR(counts = {IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE_8, "= 0",// unaligned
                  IRNode.AND_VB,        IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.AND_VB,        IRNode.VECTOR_SIZE_8, "= 0",// unaligned
                  IRNode.STORE_VECTOR, "> 0"},
        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
        applyIfPlatform = {"64-bit", "true"},
        applyIfAnd = {"AlignVector", "true", "MaxVectorSize", ">=16"})
    static Object[] test4(byte[] a, byte[] b, byte mask) {
        for (int i = 0; i < RANGE/16; i++) {
            // Problematic for AlignVector
            b[i*16 + 0 ] = (byte)(a[i*16 + 0 ] & mask); // pack of 4, starts at offset 0 (aligned)
            b[i*16 + 1 ] = (byte)(a[i*16 + 1 ] & mask);
            b[i*16 + 2 ] = (byte)(a[i*16 + 2 ] & mask);
            b[i*16 + 3 ] = (byte)(a[i*16 + 3 ] & mask);

            b[i*16 + 5 ] = (byte)(a[i*16 + 5 ] & mask); // pack of 8, starts at offset 5 (unaligned)
            b[i*16 + 6 ] = (byte)(a[i*16 + 6 ] & mask);
            b[i*16 + 7 ] = (byte)(a[i*16 + 7 ] & mask);
            b[i*16 + 8 ] = (byte)(a[i*16 + 8 ] & mask);
            b[i*16 + 9 ] = (byte)(a[i*16 + 9 ] & mask);
            b[i*16 + 10] = (byte)(a[i*16 + 10] & mask);
            b[i*16 + 11] = (byte)(a[i*16 + 11] & mask);
            b[i*16 + 12] = (byte)(a[i*16 + 12] & mask);
        }
        return new Object[]{ a, b };
    }
 506 
    /**
     * Byte kernel: same access pattern as {@code test3} (k = 0 and k = 3..6 per
     * 8-byte step), but every index is additionally shifted by the loop-invariant
     * {@code inv}, whose value is not known to the compiler.
     * <p>
     * IR rules (64-bit, sse4.1 or asimd): with AlignVector off and MaxVectorSize >= 8,
     * 4-element byte vectors are required; with AlignVector on, no byte load, AND or
     * store vector may be present.
     *
     * @param a    source array (only read)
     * @param b    destination array (written in place)
     * @param mask value ANDed with every processed element
     * @param inv  loop-invariant offset added to every index
     * @return {@code {a, b}}
     */
    @Test
    @IR(counts = {IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.AND_VB,        IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.STORE_VECTOR, "> 0"},
        applyIfAnd = {"AlignVector", "false", "MaxVectorSize", ">=8"},
        applyIfPlatform = {"64-bit", "true"},
        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
    @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
                  IRNode.AND_VB, "= 0",
                  IRNode.STORE_VECTOR, "= 0"},
        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
        applyIfPlatform = {"64-bit", "true"},
        applyIf = {"AlignVector", "true"})
    static Object[] test5(byte[] a, byte[] b, byte mask, int inv) {
        for (int i = 0; i < RANGE; i+=8) {
            // Cannot align with AlignVector because of invariant
            b[i+inv+0] = (byte)(a[i+inv+0] & mask);

            b[i+inv+3] = (byte)(a[i+inv+3] & mask);
            b[i+inv+4] = (byte)(a[i+inv+4] & mask);
            b[i+inv+5] = (byte)(a[i+inv+5] & mask);
            b[i+inv+6] = (byte)(a[i+inv+6] & mask);
        }
        return new Object[]{ a, b };
    }
 532 
    /**
     * Byte kernel: same element pattern as {@code test3}, but expressed with a
     * stride-2 induction variable scaled by 4 ({@code i*4}, {@code i += 2}), so each
     * iteration again advances by 8 bytes. Accesses k = 0 and k = 3..6.
     * <p>
     * IR rules (64-bit, sse4.1 or asimd): with AlignVector off and MaxVectorSize >= 8,
     * 4-element byte vectors are required; with AlignVector on, no byte load, AND or
     * store vector may be present.
     *
     * @param a    source array (only read)
     * @param b    destination array (written in place)
     * @param mask value ANDed with every processed element
     * @return {@code {a, b}}
     */
    @Test
    @IR(counts = {IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.AND_VB,        IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.STORE_VECTOR, "> 0"},
        applyIfAnd = {"AlignVector", "false", "MaxVectorSize", ">=8"},
        applyIfPlatform = {"64-bit", "true"},
        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
    @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
                  IRNode.AND_VB, "= 0",
                  IRNode.STORE_VECTOR, "= 0"},
        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
        applyIfPlatform = {"64-bit", "true"},
        applyIf = {"AlignVector", "true"})
    static Object[] test6(byte[] a, byte[] b, byte mask) {
        for (int i = 0; i < RANGE/8; i+=2) {
            // Cannot align with AlignVector because offset is odd
            b[i*4+0] = (byte)(a[i*4+0] & mask);

            b[i*4+3] = (byte)(a[i*4+3] & mask);
            b[i*4+4] = (byte)(a[i*4+4] & mask);
            b[i*4+5] = (byte)(a[i*4+5] & mask);
            b[i*4+6] = (byte)(a[i*4+6] & mask);
        }
        return new Object[]{ a, b };
    }
 558 
 559     @Test
 560     @IR(counts = {IRNode.LOAD_VECTOR_S, IRNode.VECTOR_SIZE_4, "> 0",
 561                   IRNode.AND_VS,        IRNode.VECTOR_SIZE_4, "> 0",
 562                   IRNode.STORE_VECTOR, "> 0"},
 563         applyIfAnd = {"AlignVector", "false", "MaxVectorSize", ">=16"},
 564         applyIfPlatform = {"64-bit", "true"},
 565         applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true"})
 566     @IR(counts = {IRNode.LOAD_VECTOR_S, "= 0",
 567                   IRNode.AND_VS, "= 0",
 568                   IRNode.STORE_VECTOR, "= 0"},
 569         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
 570         applyIfPlatform = {"64-bit", "true"},
 571         applyIf = {"AlignVector", "true"})
 572     static Object[] test7(short[] a, short[] b, short mask) {
 573         for (int i = 0; i < RANGE/8; i+=2) {
 574             // Cannot align with AlignVector because offset is odd
 575             b[i*4+0] = (short)(a[i*4+0] & mask);
 576 
 577             b[i*4+3] = (short)(a[i*4+3] & mask);
 578             b[i*4+4] = (short)(a[i*4+4] & mask);
 579             b[i*4+5] = (short)(a[i*4+5] & mask);
 580             b[i*4+6] = (short)(a[i*4+6] & mask);
 581         }
 582         return new Object[]{ a, b };
 583     }
 584 
    /**
     * Byte kernel: same element pattern as {@code test3} (k = 0 and k = 3..6 per
     * 8-byte step), but the loop starts at the caller-supplied {@code init} instead
     * of a constant.
     * <p>
     * IR rules (64-bit, sse4.1 or asimd): with AlignVector off and MaxVectorSize >= 8,
     * 4-element byte vectors are required; with AlignVector on, no byte load, AND or
     * store vector may be present.
     *
     * @param a    source array (only read)
     * @param b    destination array (written in place)
     * @param mask value ANDed with every processed element
     * @param init first value of the loop variable
     * @return {@code {a, b}}
     */
    @Test
    @IR(counts = {IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.AND_VB,        IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.STORE_VECTOR, "> 0"},
        applyIfAnd = {"AlignVector", "false", "MaxVectorSize", ">=8"},
        applyIfPlatform = {"64-bit", "true"},
        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
    @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
                  IRNode.AND_VB, "= 0",
                  IRNode.STORE_VECTOR, "= 0"},
        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
        applyIfPlatform = {"64-bit", "true"},
        applyIf = {"AlignVector", "true"})
    static Object[] test8(byte[] a, byte[] b, byte mask, int init) {
        for (int i = init; i < RANGE; i+=8) {
            // Cannot align with AlignVector because of invariant (variable init becomes invar)
            b[i+0] = (byte)(a[i+0] & mask);

            b[i+3] = (byte)(a[i+3] & mask);
            b[i+4] = (byte)(a[i+4] & mask);
            b[i+5] = (byte)(a[i+5] & mask);
            b[i+6] = (byte)(a[i+6] & mask);
        }
        return new Object[]{ a, b };
    }
 610 
    /**
     * Byte kernel: same element pattern as {@code test3} (k = 0 and k = 3..6 per
     * 8-byte step), but the loop runs from the constant 13 up to RANGE-8. With that
     * start value the 4-element group begins at index 16, 24, ...
     * <p>
     * The single IR rule (64-bit, sse4.1 or asimd, MaxVectorSize >= 8) requires
     * 4-element byte vectors and does not depend on AlignVector.
     *
     * @param a    source array (only read)
     * @param b    destination array (written in place)
     * @param mask value ANDed with every processed element
     * @return {@code {a, b}}
     */
    @Test
    @IR(counts = {IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.AND_VB,        IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.STORE_VECTOR, "> 0"},
        applyIf = {"MaxVectorSize", ">=8"},
        applyIfPlatform = {"64-bit", "true"},
        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
    static Object[] test9(byte[] a, byte[] b, byte mask) {
        // known non-zero init value does not affect offset, but has implicit effect on iv
        for (int i = 13; i < RANGE-8; i+=8) {
            b[i+0] = (byte)(a[i+0] & mask);

            b[i+3] = (byte)(a[i+3] & mask);
            b[i+4] = (byte)(a[i+4] & mask);
            b[i+5] = (byte)(a[i+5] & mask);
            b[i+6] = (byte)(a[i+6] & mask);
        }
        return new Object[]{ a, b };
    }
 630 
    /**
     * Byte kernel: processes k = 0..3 per 8-byte step, with the loop starting at the
     * constant 3 and running up to RANGE-8, so every 4-element group starts at an
     * index of the form 3 + 8*n.
     * <p>
     * IR rules (64-bit, sse4.1 or asimd): with AlignVector off and MaxVectorSize >= 8,
     * 4-element byte vectors are required; with AlignVector on, no byte load, AND or
     * store vector may be present.
     *
     * @param a    source array (only read)
     * @param b    destination array (written in place)
     * @param mask value ANDed with every processed element
     * @return {@code {a, b}}
     */
    @Test
    @IR(counts = {IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.AND_VB,        IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.STORE_VECTOR, "> 0"},
        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
        applyIfPlatform = {"64-bit", "true"},
        applyIfAnd = {"AlignVector", "false", "MaxVectorSize", ">=8"})
    @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
                  IRNode.AND_VB, "= 0",
                  IRNode.STORE_VECTOR, "= 0"},
        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
        applyIfPlatform = {"64-bit", "true"},
        applyIf = {"AlignVector", "true"})
    static Object[] test10a(byte[] a, byte[] b, byte mask) {
        // This is not alignable with pre-loop, because of odd init.
        for (int i = 3; i < RANGE-8; i+=8) {
            b[i+0] = (byte)(a[i+0] & mask);
            b[i+1] = (byte)(a[i+1] & mask);
            b[i+2] = (byte)(a[i+2] & mask);
            b[i+3] = (byte)(a[i+3] & mask);
        }
        return new Object[]{ a, b };
    }
 654 
    /**
     * Byte kernel: identical to {@code test10a} except that the loop starts at the
     * constant 13, so every 4-element group starts at an index of the form 13 + 8*n.
     * <p>
     * IR rules (64-bit, sse4.1 or asimd): with AlignVector off and MaxVectorSize >= 8,
     * 4-element byte vectors are required; with AlignVector on, no byte load, AND or
     * store vector may be present.
     *
     * @param a    source array (only read)
     * @param b    destination array (written in place)
     * @param mask value ANDed with every processed element
     * @return {@code {a, b}}
     */
    @Test
    @IR(counts = {IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.AND_VB,        IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.STORE_VECTOR, "> 0"},
        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
        applyIfPlatform = {"64-bit", "true"},
        applyIfAnd = {"AlignVector", "false", "MaxVectorSize", ">=8"})
    @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
                  IRNode.AND_VB, "= 0",
                  IRNode.STORE_VECTOR, "= 0"},
        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
        applyIfPlatform = {"64-bit", "true"},
        applyIf = {"AlignVector", "true"})
    static Object[] test10b(byte[] a, byte[] b, byte mask) {
        // This is not alignable with pre-loop, because of odd init.
        // Seems not correctly handled.
        for (int i = 13; i < RANGE-8; i+=8) {
            b[i+0] = (byte)(a[i+0] & mask);
            b[i+1] = (byte)(a[i+1] & mask);
            b[i+2] = (byte)(a[i+2] & mask);
            b[i+3] = (byte)(a[i+3] & mask);
        }
        return new Object[]{ a, b };
    }
 679 
 680     @Test
 681     @IR(counts = {IRNode.LOAD_VECTOR_S, IRNode.VECTOR_SIZE_4, "> 0",
 682                   IRNode.AND_VS,        IRNode.VECTOR_SIZE_4, "> 0",
 683                   IRNode.STORE_VECTOR, "> 0"},
 684         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
 685         applyIfPlatform = {"64-bit", "true"},
 686         applyIfAnd = {"AlignVector", "false", "MaxVectorSize", ">=16"})
 687     @IR(counts = {IRNode.LOAD_VECTOR_S, "= 0",
 688                   IRNode.AND_VS, "= 0",
 689                   IRNode.STORE_VECTOR, "= 0"},
 690         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
 691         applyIfPlatform = {"64-bit", "true"},
 692         applyIf = {"AlignVector", "true"})
 693     static Object[] test10c(short[] a, short[] b, short mask) {
             // Masks 4 adjacent shorts per iteration (init 13, stride 8) from a into b
             // and returns both arrays.
             // First @IR rule: with AlignVector off, 4-element short vectors are expected.
             // Second @IR rule: with AlignVector on, no vector nodes are expected.
 694         // This is not alignable with pre-loop, because of odd init.
 695         // Seems not correctly handled with MaxVectorSize >= 32.
 696         for (int i = 13; i < RANGE-8; i+=8) {
 697             b[i+0] = (short)(a[i+0] & mask);
 698             b[i+1] = (short)(a[i+1] & mask);
 699             b[i+2] = (short)(a[i+2] & mask);
 700             b[i+3] = (short)(a[i+3] & mask);
 701         }
 702         return new Object[]{ a, b };
 703     }
 704 
 705     @Test
 706     @IR(counts = {IRNode.LOAD_VECTOR_S, IRNode.VECTOR_SIZE_4, "> 0",
 707                   IRNode.AND_VS,        IRNode.VECTOR_SIZE_4, "> 0",
 708                   IRNode.STORE_VECTOR, "> 0"},
 709         applyIf = {"MaxVectorSize", ">=16"},
 710         applyIfPlatform = {"64-bit", "true"},
 711         applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true"})
 712     static Object[] test10d(short[] a, short[] b, short mask) {
             // Masks 4 adjacent shorts per iteration (init 13, stride 8) from a into b.
             // Every index carries an extra constant +3, so the first element touched
             // is 13 + 3 = 16. The @IR rule has no AlignVector condition: 4-element
             // short vectors are expected in both modes.
 713         for (int i = 13; i < RANGE-16; i+=8) {
 714             // init + offset -> aligned
 715             b[i+0+3] = (short)(a[i+0+3] & mask);
 716             b[i+1+3] = (short)(a[i+1+3] & mask);
 717             b[i+2+3] = (short)(a[i+2+3] & mask);
 718             b[i+3+3] = (short)(a[i+3+3] & mask);
 719         }
 720         return new Object[]{ a, b };
 721     }
 722 
 723     @Test
 724     @IR(counts = {IRNode.LOAD_VECTOR_B, "> 0",
 725                   IRNode.AND_VB, "> 0",
 726                   IRNode.STORE_VECTOR, "> 0"},
 727         applyIfPlatform = {"64-bit", "true"},
 728         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
 729     static Object[] test11aB(byte[] a, byte[] b, byte mask) {
             // Element-wise b[i] = a[i] & mask over bytes, starting at index 0 with
             // unit stride. The @IR rule has no AlignVector condition, so vector
             // nodes are expected in both modes.
 730         for (int i = 0; i < RANGE; i++) {
 731             // always alignable
 732             b[i+0] = (byte)(a[i+0] & mask);
 733         }
 734         return new Object[]{ a, b };
 735     }
 736 
 737     @Test
 738     @IR(counts = {IRNode.LOAD_VECTOR_S, "> 0",
 739                   IRNode.AND_VS, "> 0",
 740                   IRNode.STORE_VECTOR, "> 0"},
 741         applyIfPlatform = {"64-bit", "true"},
 742         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
 743     static Object[] test11aS(short[] a, short[] b, short mask) {
             // Element-wise b[i] = a[i] & mask over shorts, starting at index 0 with
             // unit stride. The @IR rule has no AlignVector condition, so vector
             // nodes are expected in both modes.
 744         for (int i = 0; i < RANGE; i++) {
 745             // always alignable
 746             b[i+0] = (short)(a[i+0] & mask);
 747         }
 748         return new Object[]{ a, b };
 749     }
 750 
 751     @Test
 752     @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0",
 753                   IRNode.AND_VI, "> 0",
 754                   IRNode.STORE_VECTOR, "> 0"},
 755         applyIfPlatform = {"64-bit", "true"},
 756         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
 757     static Object[] test11aI(int[] a, int[] b, int mask) {
             // Element-wise b[i] = a[i] & mask over ints, starting at index 0 with
             // unit stride. The @IR rule has no AlignVector condition, so vector
             // nodes are expected in both modes.
 758         for (int i = 0; i < RANGE; i++) {
 759             // always alignable
 760             b[i+0] = (int)(a[i+0] & mask);
 761         }
 762         return new Object[]{ a, b };
 763     }
 764 
 765     @Test
 766     @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0",
 767                   IRNode.AND_VL, "> 0",
 768                   IRNode.STORE_VECTOR, "> 0"},
 769         applyIfPlatform = {"64-bit", "true"},
 770         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
 771     static Object[] test11aL(long[] a, long[] b, long mask) {
             // Element-wise b[i] = a[i] & mask over longs, starting at index 0 with
             // unit stride. The @IR rule has no AlignVector condition, so vector
             // nodes are expected in both modes.
 772         for (int i = 0; i < RANGE; i++) {
 773             // always alignable
 774             b[i+0] = (long)(a[i+0] & mask);
 775         }
 776         return new Object[]{ a, b };
 777     }
 778 
 779     @Test
 780     @IR(counts = {IRNode.LOAD_VECTOR_B, "> 0",
 781                   IRNode.AND_VB, "> 0",
 782                   IRNode.STORE_VECTOR, "> 0"},
 783         applyIfPlatform = {"64-bit", "true"},
 784         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
 785     static Object[] test11bB(byte[] a, byte[] b, byte mask) {
             // Element-wise b[i] = a[i] & mask over bytes with unit stride, but the
             // loop starts at index 1 instead of 0. The @IR rule has no AlignVector
             // condition, so vector nodes are expected in both modes.
 786         for (int i = 1; i < RANGE; i++) {
 787             // always alignable
 788             b[i+0] = (byte)(a[i+0] & mask);
 789         }
 790         return new Object[]{ a, b };
 791     }
 792 
 793     @Test
 794     @IR(counts = {IRNode.LOAD_VECTOR_S, "> 0",
 795                   IRNode.AND_VS, "> 0",
 796                   IRNode.STORE_VECTOR, "> 0"},
 797         applyIfPlatform = {"64-bit", "true"},
 798         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
 799     static Object[] test11bS(short[] a, short[] b, short mask) {
             // Element-wise b[i] = a[i] & mask over shorts with unit stride, but the
             // loop starts at index 1 instead of 0. The @IR rule has no AlignVector
             // condition, so vector nodes are expected in both modes.
 800         for (int i = 1; i < RANGE; i++) {
 801             // always alignable
 802             b[i+0] = (short)(a[i+0] & mask);
 803         }
 804         return new Object[]{ a, b };
 805     }
 806 
 807     @Test
 808     @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0",
 809                   IRNode.AND_VI, "> 0",
 810                   IRNode.STORE_VECTOR, "> 0"},
 811         applyIfPlatform = {"64-bit", "true"},
 812         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
 813     static Object[] test11bI(int[] a, int[] b, int mask) {
             // Element-wise b[i] = a[i] & mask over ints with unit stride, but the
             // loop starts at index 1 instead of 0. The @IR rule has no AlignVector
             // condition, so vector nodes are expected in both modes.
 814         for (int i = 1; i < RANGE; i++) {
 815             // always alignable
 816             b[i+0] = (int)(a[i+0] & mask);
 817         }
 818         return new Object[]{ a, b };
 819     }
 820 
 821     @Test
 822     @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0",
 823                   IRNode.AND_VL, "> 0",
 824                   IRNode.STORE_VECTOR, "> 0"},
 825         applyIfPlatform = {"64-bit", "true"},
 826         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
 827     static Object[] test11bL(long[] a, long[] b, long mask) {
             // Element-wise b[i] = a[i] & mask over longs with unit stride, but the
             // loop starts at index 1 instead of 0. The @IR rule has no AlignVector
             // condition, so vector nodes are expected in both modes.
 828         for (int i = 1; i < RANGE; i++) {
 829             // always alignable
 830             b[i+0] = (long)(a[i+0] & mask);
 831         }
 832         return new Object[]{ a, b };
 833     }
 834 
 835     @Test
 836     @IR(counts = {IRNode.LOAD_VECTOR_B, "> 0",
 837                   IRNode.AND_VB, "> 0",
 838                   IRNode.STORE_VECTOR, "> 0"},
 839         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
 840         applyIfPlatform = {"64-bit", "true"},
 841         applyIf = {"AlignVector", "false"})
 842     @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
 843                   IRNode.AND_VB, "= 0",
 844                   IRNode.STORE_VECTOR, "= 0"},
 845         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
 846         applyIfPlatform = {"64-bit", "true"},
 847         applyIf = {"AlignVector", "true"})
 848     static Object[] test11cB(byte[] a, byte[] b, byte mask) {
             // Loads a[i+1] but stores to b[i]: the load and the store are one byte
             // element apart. First @IR rule: vector nodes expected with AlignVector
             // off. Second @IR rule: no vector nodes expected with AlignVector on.
 849         for (int i = 1; i < RANGE-1; i++) {
 850             // 1 byte offset -> not alignable with AlignVector
 851             b[i+0] = (byte)(a[i+1] & mask);
 852         }
 853         return new Object[]{ a, b };
 854     }
 855 
 856     @Test
 857     @IR(counts = {IRNode.LOAD_VECTOR_S, "> 0",
 858                   IRNode.AND_VS, "> 0",
 859                   IRNode.STORE_VECTOR, "> 0"},
 860         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
 861         applyIfPlatform = {"64-bit", "true"},
 862         applyIf = {"AlignVector", "false"})
 863     @IR(counts = {IRNode.LOAD_VECTOR_S, "= 0",
 864                   IRNode.AND_VS, "= 0",
 865                   IRNode.STORE_VECTOR, "= 0"},
 866         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
 867         applyIfPlatform = {"64-bit", "true"},
 868         applyIf = {"AlignVector", "true"})
 869     static Object[] test11cS(short[] a, short[] b, short mask) {
             // Loads a[i+1] but stores to b[i]: the load and the store are one short
             // element apart. First @IR rule: vector nodes expected with AlignVector
             // off. Second @IR rule: no vector nodes expected with AlignVector on.
 870         for (int i = 1; i < RANGE-1; i++) {
 871             // 2 byte offset -> not alignable with AlignVector
 872             b[i+0] = (short)(a[i+1] & mask);
 873         }
 874         return new Object[]{ a, b };
 875     }
 876 
 877     @Test
 878     @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0",
 879                   IRNode.AND_VI, "> 0",
 880                   IRNode.STORE_VECTOR, "> 0"},
 881         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
 882         applyIfPlatform = {"64-bit", "true"},
 883         applyIf = {"AlignVector", "false"})
 884     @IR(counts = {IRNode.LOAD_VECTOR_I, "= 0",
 885                   IRNode.AND_VI, "= 0",
 886                   IRNode.STORE_VECTOR, "= 0"},
 887         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
 888         applyIfPlatform = {"64-bit", "true"},
 889         applyIf = {"AlignVector", "true"})
 890     static Object[] test11cI(int[] a, int[] b, int mask) {
             // Loads a[i+1] but stores to b[i]: the load and the store are one int
             // element apart. First @IR rule: vector nodes expected with AlignVector
             // off. Second @IR rule: no vector nodes expected with AlignVector on.
 891         for (int i = 1; i < RANGE-1; i++) {
 892             // 4 byte offset -> not alignable with AlignVector
 893             b[i+0] = (int)(a[i+1] & mask);
 894         }
 895         return new Object[]{ a, b };
 896     }
 897 
 898     @Test
 899     @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0",
 900                   IRNode.AND_VL, "> 0",
 901                   IRNode.STORE_VECTOR, "> 0"},
 902         applyIfPlatform = {"64-bit", "true"},
 903         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
 904     static Object[] test11cL(long[] a, long[] b, long mask) {
             // Loads a[i+1] but stores to b[i]: the load and the store are one long
             // element apart. Unlike the byte/short/int forms of this loop shape, the
             // single @IR rule has no AlignVector condition: vector nodes are
             // expected in both modes.
 905         for (int i = 1; i < RANGE-1; i++) {
 906             // always alignable (8 byte offset)
 907             b[i+0] = (long)(a[i+1] & mask);
 908         }
 909         return new Object[]{ a, b };
 910     }
 911 
 912     @Test
 913     @IR(counts = {IRNode.LOAD_VECTOR_B, "> 0",
 914                   IRNode.AND_VB, "> 0",
 915                   IRNode.STORE_VECTOR, "> 0"},
 916         applyIfPlatform = {"64-bit", "true"},
 917         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
 918     static Object[] test11dB(byte[] a, byte[] b, byte mask, int invar) {
             // Element-wise byte AND where both the load and the store index add the
             // loop-invariant parameter invar. The @IR rule expects vector nodes and
             // has no AlignVector condition.
             // NOTE(review): i runs up to RANGE-1, so callers presumably pass an
             // invar that keeps i+invar in bounds - confirm at the call site.
 919         for (int i = 0; i < RANGE; i++) {
 920             b[i+0+invar] = (byte)(a[i+0+invar] & mask);
 921         }
 922         return new Object[]{ a, b };
 923     }
 924 
 925     @Test
 926     @IR(counts = {IRNode.LOAD_VECTOR_S, "> 0",
 927                   IRNode.AND_VS, "> 0",
 928                   IRNode.STORE_VECTOR, "> 0"},
 929         applyIfPlatform = {"64-bit", "true"},
 930         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
 931     static Object[] test11dS(short[] a, short[] b, short mask, int invar) {
             // Element-wise short AND where both the load and the store index add the
             // loop-invariant parameter invar. The @IR rule expects vector nodes and
             // has no AlignVector condition.
             // NOTE(review): i runs up to RANGE-1, so callers presumably pass an
             // invar that keeps i+invar in bounds - confirm at the call site.
 932         for (int i = 0; i < RANGE; i++) {
 933             b[i+0+invar] = (short)(a[i+0+invar] & mask);
 934         }
 935         return new Object[]{ a, b };
 936     }
 937 
 938     @Test
 939     @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0",
 940                   IRNode.AND_VI, "> 0",
 941                   IRNode.STORE_VECTOR, "> 0"},
 942         applyIfPlatform = {"64-bit", "true"},
 943         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
 944     static Object[] test11dI(int[] a, int[] b, int mask, int invar) {
             // Element-wise int AND where both the load and the store index add the
             // loop-invariant parameter invar. The @IR rule expects vector nodes and
             // has no AlignVector condition.
             // NOTE(review): i runs up to RANGE-1, so callers presumably pass an
             // invar that keeps i+invar in bounds - confirm at the call site.
 945         for (int i = 0; i < RANGE; i++) {
 946             b[i+0+invar] = (int)(a[i+0+invar] & mask);
 947         }
 948         return new Object[]{ a, b };
 949     }
 950 
 951     @Test
 952     @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0",
 953                   IRNode.AND_VL, "> 0",
 954                   IRNode.STORE_VECTOR, "> 0"},
 955         applyIfPlatform = {"64-bit", "true"},
 956         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
 957     static Object[] test11dL(long[] a, long[] b, long mask, int invar) {
             // Element-wise long AND where both the load and the store index add the
             // loop-invariant parameter invar. The @IR rule expects vector nodes and
             // has no AlignVector condition.
             // NOTE(review): i runs up to RANGE-1, so callers presumably pass an
             // invar that keeps i+invar in bounds - confirm at the call site.
 958         for (int i = 0; i < RANGE; i++) {
 959             b[i+0+invar] = (long)(a[i+0+invar] & mask);
 960         }
 961         return new Object[]{ a, b };
 962     }
 963 
 964     @Test
 965     @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
 966                   IRNode.AND_VB, "= 0",
 967                   IRNode.STORE_VECTOR, "= 0"},
 968         applyIfPlatform = {"64-bit", "true"},
 969         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
 970     static Object[] test12(byte[] a, byte[] b, byte mask) {
             // Index scale is 6 (not a power of 2) with 4 adjacent byte accesses per
             // iteration, so offsets 4 and 5 of each group of 6 are never touched.
             // The @IR rule requires zero vector nodes, regardless of AlignVector.
 971         for (int i = 0; i < RANGE/16; i++) {
 972             // Currently does not vectorize at all
 973             b[i*6 + 0 ] = (byte)(a[i*6 + 0 ] & mask);
 974             b[i*6 + 1 ] = (byte)(a[i*6 + 1 ] & mask);
 975             b[i*6 + 2 ] = (byte)(a[i*6 + 2 ] & mask);
 976             b[i*6 + 3 ] = (byte)(a[i*6 + 3 ] & mask);
 977         }
 978         return new Object[]{ a, b };
 979     }
 980 
 981     @Test
 982     @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE + "min(max_int, max_long)", "> 0",
 983                   IRNode.LOAD_VECTOR_L, IRNode.VECTOR_SIZE + "min(max_int, max_long)", "> 0",
 984                   IRNode.ADD_VI, IRNode.VECTOR_SIZE + "min(max_int, max_long)", "> 0",
 985                   IRNode.ADD_VL, IRNode.VECTOR_SIZE + "min(max_int, max_long)", "> 0",
 986                   IRNode.STORE_VECTOR, "> 0"},
 987         applyIfPlatform = {"64-bit", "true"},
 988         applyIfCPUFeatureOr = {"avx2", "true"})
 989     // require avx2 to ensure vectors are larger than what unrolling produces
 990     static Object[] test13aIL(int[] a, long[] b) {
             // Increments every element of an int array and a long array in the same
             // loop, starting at index 0. The @IR rule constrains the int and long
             // vector nodes to the size "min(max_int, max_long)".
 991         for (int i = 0; i < RANGE; i++) {
 992             a[i]++;
 993             b[i]++;
 994         }
 995         return new Object[]{ a, b };
 996     }
 997 
 998     @Test
 999     @IR(counts = {IRNode.LOAD_VECTOR_B, "> 0",
1000                   IRNode.LOAD_VECTOR_I, "> 0",
1001                   IRNode.ADD_VB, "> 0",
1002                   IRNode.ADD_VI, "> 0",
1003                   IRNode.STORE_VECTOR, "> 0"},
1004         applyIfPlatform = {"64-bit", "true"},
1005         applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true"})
1006     static Object[] test13aIB(int[] a, byte[] b) {
             // Increments every element of an int array and a byte array in the same
             // loop, starting at index 0. The @IR rule expects vector nodes for both
             // element types and has no AlignVector condition.
1007         for (int i = 0; i < RANGE; i++) {
1008             a[i]++;
1009             b[i]++;
1010         }
1011         return new Object[]{ a, b };
1012     }
1013 
1014     @Test
1015     @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0",
1016                   IRNode.LOAD_VECTOR_S, "> 0",
1017                   IRNode.ADD_VI, "> 0",
1018                   IRNode.ADD_VS, "> 0",
1019                   IRNode.STORE_VECTOR, "> 0"},
1020         applyIfPlatform = {"64-bit", "true"},
1021         applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true"})
1022     static Object[] test13aIS(int[] a, short[] b) {
             // Increments every element of an int array and a short array in the same
             // loop, starting at index 0. The @IR rule expects vector nodes for both
             // element types and has no AlignVector condition.
1023         for (int i = 0; i < RANGE; i++) {
1024             a[i]++;
1025             b[i]++;
1026         }
1027         return new Object[]{ a, b };
1028     }
1029 
1030     @Test
1031     @IR(counts = {IRNode.LOAD_VECTOR_B, "> 0",
1032                   IRNode.LOAD_VECTOR_S, "> 0",
1033                   IRNode.LOAD_VECTOR_I, "> 0",
1034                   IRNode.LOAD_VECTOR_L, "> 0",
1035                   IRNode.ADD_VB, "> 0",
1036                   IRNode.ADD_VS, "> 0",
1037                   IRNode.ADD_VI, "> 0",
1038                   IRNode.ADD_VL, "> 0",
1039                   IRNode.STORE_VECTOR, "> 0"},
1040         applyIfPlatform = {"64-bit", "true"},
1041         applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true"})
1042     static Object[] test13aBSIL(byte[] a, short[] b, int[] c, long[] d) {
             // Increments every element of a byte, short, int and long array in the
             // same loop, starting at index 0. The @IR rule expects vector nodes for
             // all four element types and has no AlignVector condition.
1043         for (int i = 0; i < RANGE; i++) {
1044             a[i]++;
1045             b[i]++;
1046             c[i]++;
1047             d[i]++;
1048         }
1049         return new Object[]{ a, b, c, d };
1050     }
1051 
1052     @Test
1053     @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE + "min(max_int, max_long)", "> 0",
1054                   IRNode.LOAD_VECTOR_L, IRNode.VECTOR_SIZE + "min(max_int, max_long)", "> 0",
1055                   IRNode.ADD_VI, IRNode.VECTOR_SIZE + "min(max_int, max_long)", "> 0",
1056                   IRNode.ADD_VL, IRNode.VECTOR_SIZE + "min(max_int, max_long)", "> 0",
1057                   IRNode.STORE_VECTOR, "> 0"},
1058         applyIfPlatform = {"64-bit", "true"},
1059         applyIfCPUFeatureOr = {"avx2", "true"})
1060     // require avx2 to ensure vectors are larger than what unrolling produces
1061     static Object[] test13bIL(int[] a, long[] b) {
             // Increments the elements of an int array and a long array in the same
             // loop, starting at index 1 (element 0 is left untouched). The @IR rule
             // constrains the int and long vector nodes to the size
             // "min(max_int, max_long)".
1062         for (int i = 1; i < RANGE; i++) {
1063             a[i]++;
1064             b[i]++;
1065         }
1066         return new Object[]{ a, b };
1067     }
1068 
1069     @Test
1070     @IR(counts = {IRNode.LOAD_VECTOR_B, "> 0",
1071                   IRNode.LOAD_VECTOR_I, "> 0",
1072                   IRNode.ADD_VB, "> 0",
1073                   IRNode.ADD_VI, "> 0",
1074                   IRNode.STORE_VECTOR, "> 0"},
1075         applyIfPlatform = {"64-bit", "true"},
1076         applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true"})
1077     static Object[] test13bIB(int[] a, byte[] b) {
             // Increments the elements of an int array and a byte array in the same
             // loop, starting at index 1 (element 0 is left untouched). The @IR rule
             // expects vector nodes for both types and has no AlignVector condition.
1078         for (int i = 1; i < RANGE; i++) {
1079             a[i]++;
1080             b[i]++;
1081         }
1082         return new Object[]{ a, b };
1083     }
1084 
1085     @Test
1086     @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0",
1087                   IRNode.LOAD_VECTOR_S, "> 0",
1088                   IRNode.ADD_VI, "> 0",
1089                   IRNode.ADD_VS, "> 0",
1090                   IRNode.STORE_VECTOR, "> 0"},
1091         applyIfPlatform = {"64-bit", "true"},
1092         applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true"})
1093     static Object[] test13bIS(int[] a, short[] b) {
             // Increments the elements of an int array and a short array in the same
             // loop, starting at index 1 (element 0 is left untouched). The @IR rule
             // expects vector nodes for both types and has no AlignVector condition.
1094         for (int i = 1; i < RANGE; i++) {
1095             a[i]++;
1096             b[i]++;
1097         }
1098         return new Object[]{ a, b };
1099     }
1100 
1101     @Test
1102     @IR(counts = {IRNode.LOAD_VECTOR_B, "> 0",
1103                   IRNode.LOAD_VECTOR_S, "> 0",
1104                   IRNode.LOAD_VECTOR_I, "> 0",
1105                   IRNode.LOAD_VECTOR_L, "> 0",
1106                   IRNode.ADD_VB, "> 0",
1107                   IRNode.ADD_VS, "> 0",
1108                   IRNode.ADD_VI, "> 0",
1109                   IRNode.ADD_VL, "> 0",
1110                   IRNode.STORE_VECTOR, "> 0"},
1111         applyIfPlatform = {"64-bit", "true"},
1112         applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true"})
1113     static Object[] test13bBSIL(byte[] a, short[] b, int[] c, long[] d) {
             // Increments the elements of a byte, short, int and long array in the
             // same loop, starting at index 1 (element 0 is left untouched). The @IR
             // rule expects vector nodes for all four element types and has no
             // AlignVector condition.
1114         for (int i = 1; i < RANGE; i++) {
1115             a[i]++;
1116             b[i]++;
1117             c[i]++;
1118             d[i]++;
1119         }
1120         return new Object[]{ a, b, c, d };
1121     }
1122 
1123     @Test
1124     @IR(counts = {IRNode.LOAD_VECTOR_B, "> 0",
1125                   IRNode.ADD_VB, "> 0",
1126                   IRNode.STORE_VECTOR, "> 0"},
1127         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
1128         applyIfPlatform = {"64-bit", "true"},
1129         applyIf = {"AlignVector", "false"})
1130     @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
1131                   IRNode.ADD_VB, "= 0",
1132                   IRNode.STORE_VECTOR, "= 0"},
1133         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
1134         applyIfPlatform = {"64-bit", "true"},
1135         applyIf = {"AlignVector", "true"})
1136     static Object[] test14aB(byte[] a) {
             // Increments 16 adjacent bytes (offsets 0..15) per iteration while
             // advancing i by 9, so consecutive iterations touch overlapping elements.
             // First @IR rule: vector nodes expected with AlignVector off.
             // Second @IR rule: no vector nodes expected with AlignVector on.
1137         // non-power-of-2 stride
1138         for (int i = 0; i < RANGE-20; i+=9) {
1139             a[i+0]++;
1140             a[i+1]++;
1141             a[i+2]++;
1142             a[i+3]++;
1143             a[i+4]++;
1144             a[i+5]++;
1145             a[i+6]++;
1146             a[i+7]++;
1147             a[i+8]++;
1148             a[i+9]++;
1149             a[i+10]++;
1150             a[i+11]++;
1151             a[i+12]++;
1152             a[i+13]++;
1153             a[i+14]++;
1154             a[i+15]++;
1155         }
1156         return new Object[]{ a };
1157     }
1158 
1159     @Test
1160     @IR(counts = {IRNode.LOAD_VECTOR_B, "> 0",
1161                   IRNode.ADD_VB, "> 0",
1162                   IRNode.STORE_VECTOR, "> 0"},
1163         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
1164         applyIfPlatform = {"64-bit", "true"},
1165         applyIf = {"AlignVector", "false"})
1166     @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
1167                   IRNode.ADD_VB, "= 0",
1168                   IRNode.STORE_VECTOR, "= 0"},
1169         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
1170         applyIfPlatform = {"64-bit", "true"},
1171         applyIf = {"AlignVector", "true"})
1172     static Object[] test14bB(byte[] a) {
             // Increments 16 adjacent bytes (offsets 0..15) per iteration while
             // advancing i by 3, so consecutive iterations touch overlapping elements.
             // First @IR rule: vector nodes expected with AlignVector off.
             // Second @IR rule: no vector nodes expected with AlignVector on.
1173         // non-power-of-2 stride
1174         for (int i = 0; i < RANGE-20; i+=3) {
1175             a[i+0]++;
1176             a[i+1]++;
1177             a[i+2]++;
1178             a[i+3]++;
1179             a[i+4]++;
1180             a[i+5]++;
1181             a[i+6]++;
1182             a[i+7]++;
1183             a[i+8]++;
1184             a[i+9]++;
1185             a[i+10]++;
1186             a[i+11]++;
1187             a[i+12]++;
1188             a[i+13]++;
1189             a[i+14]++;
1190             a[i+15]++;
1191         }
1192         return new Object[]{ a };
1193     }
1194 
1195     @Test
1196     @IR(counts = {IRNode.LOAD_VECTOR_B, "> 0",
1197                   IRNode.ADD_VB, "> 0",
1198                   IRNode.STORE_VECTOR, "> 0"},
1199         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
1200         applyIfPlatform = {"64-bit", "true"},
1201         applyIf = {"AlignVector", "false"})
1202     @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
1203                   IRNode.ADD_VB, "= 0",
1204                   IRNode.STORE_VECTOR, "= 0"},
1205         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
1206         applyIfPlatform = {"64-bit", "true"},
1207         applyIf = {"AlignVector", "true"})
1208     static Object[] test14cB(byte[] a) {
             // Increments 16 adjacent bytes (offsets 0..15) per iteration while
             // advancing i by 5, so consecutive iterations touch overlapping elements.
             // First @IR rule: vector nodes expected with AlignVector off.
             // Second @IR rule: no vector nodes expected with AlignVector on.
1209         // non-power-of-2 stride
1210         for (int i = 0; i < RANGE-20; i+=5) {
1211             a[i+0]++;
1212             a[i+1]++;
1213             a[i+2]++;
1214             a[i+3]++;
1215             a[i+4]++;
1216             a[i+5]++;
1217             a[i+6]++;
1218             a[i+7]++;
1219             a[i+8]++;
1220             a[i+9]++;
1221             a[i+10]++;
1222             a[i+11]++;
1223             a[i+12]++;
1224             a[i+13]++;
1225             a[i+14]++;
1226             a[i+15]++;
1227         }
1228         return new Object[]{ a };
1229     }
1230 
1231     @Test
1232     // IR rules difficult because of modulo wrapping with offset after peeling.
1233     static Object[] test15aB(byte[] a) {
             // Increments 16 adjacent bytes (offsets 0..15) per iteration at base
             // index 53*i, with unit stride in i. No @IR rule is attached.
1234         // non-power-of-2 scale
1235         for (int i = 0; i < RANGE/64-20; i++) {
1236             a[53*i+0]++;
1237             a[53*i+1]++;
1238             a[53*i+2]++;
1239             a[53*i+3]++;
1240             a[53*i+4]++;
1241             a[53*i+5]++;
1242             a[53*i+6]++;
1243             a[53*i+7]++;
1244             a[53*i+8]++;
1245             a[53*i+9]++;
1246             a[53*i+10]++;
1247             a[53*i+11]++;
1248             a[53*i+12]++;
1249             a[53*i+13]++;
1250             a[53*i+14]++;
1251             a[53*i+15]++;
1252         }
1253         return new Object[]{ a };
1254     }
1255 
1256     @Test
1257     // IR rules difficult because of modulo wrapping with offset after peeling.
1258     static Object[] test15bB(byte[] a) {
             // Increments 16 adjacent bytes (offsets 0..15) per iteration at base
             // index 25*i, with unit stride in i. No @IR rule is attached.
1259         // non-power-of-2 scale
1260         for (int i = 0; i < RANGE/64-20; i++) {
1261             a[25*i+0]++;
1262             a[25*i+1]++;
1263             a[25*i+2]++;
1264             a[25*i+3]++;
1265             a[25*i+4]++;
1266             a[25*i+5]++;
1267             a[25*i+6]++;
1268             a[25*i+7]++;
1269             a[25*i+8]++;
1270             a[25*i+9]++;
1271             a[25*i+10]++;
1272             a[25*i+11]++;
1273             a[25*i+12]++;
1274             a[25*i+13]++;
1275             a[25*i+14]++;
1276             a[25*i+15]++;
1277         }
1278         return new Object[]{ a };
1279     }
1280 
1281     @Test
1282     // IR rules difficult because of modulo wrapping with offset after peeling.
1283     static Object[] test15cB(byte[] a) {
             // Increments 16 adjacent bytes (offsets 0..15) per iteration at base
             // index 19*i, with unit stride in i. No @IR rule is attached.
1284         // non-power-of-2 scale
1285         for (int i = 0; i < RANGE/64-20; i++) {
1286             a[19*i+0]++;
1287             a[19*i+1]++;
1288             a[19*i+2]++;
1289             a[19*i+3]++;
1290             a[19*i+4]++;
1291             a[19*i+5]++;
1292             a[19*i+6]++;
1293             a[19*i+7]++;
1294             a[19*i+8]++;
1295             a[19*i+9]++;
1296             a[19*i+10]++;
1297             a[19*i+11]++;
1298             a[19*i+12]++;
1299             a[19*i+13]++;
1300             a[19*i+14]++;
1301             a[19*i+15]++;
1302         }
1303         return new Object[]{ a };
1304     }
1305 
1306     @Test
1307     static Object[] test16a(byte[] a, short[] b) {
             // Per iteration: 15 byte increments at a[2*i+0 .. 2*i+14] followed by
             // 4 short increments at b[2*i+0 .. 2*i+3]. The index scale is 2, so
             // consecutive iterations overlap. No @IR rule is attached.
1308         // infinite loop issues
1309         for (int i = 0; i < RANGE/2-20; i++) {
1310             a[2*i+0]++;
1311             a[2*i+1]++;
1312             a[2*i+2]++;
1313             a[2*i+3]++;
1314             a[2*i+4]++;
1315             a[2*i+5]++;
1316             a[2*i+6]++;
1317             a[2*i+7]++;
1318             a[2*i+8]++;
1319             a[2*i+9]++;
1320             a[2*i+10]++;
1321             a[2*i+11]++;
1322             a[2*i+12]++;
1323             a[2*i+13]++;
1324             a[2*i+14]++;
1325 
1326             b[2*i+0]++;
1327             b[2*i+1]++;
1328             b[2*i+2]++;
1329             b[2*i+3]++;
1330         }
1331         return new Object[]{ a, b };
1332     }
1333 
1334     @Test
1335     static Object[] test16b(byte[] a) {
             // Per iteration: 15 byte increments at a[2*i+0 .. 2*i+14]. The index
             // scale is 2, so consecutive iterations overlap. No @IR rule is attached.
1336         // infinite loop issues
1337         for (int i = 0; i < RANGE/2-20; i++) {
1338             a[2*i+0]++;
1339             a[2*i+1]++;
1340             a[2*i+2]++;
1341             a[2*i+3]++;
1342             a[2*i+4]++;
1343             a[2*i+5]++;
1344             a[2*i+6]++;
1345             a[2*i+7]++;
1346             a[2*i+8]++;
1347             a[2*i+9]++;
1348             a[2*i+10]++;
1349             a[2*i+11]++;
1350             a[2*i+12]++;
1351             a[2*i+13]++;
1352             a[2*i+14]++;
1353         }
1354         return new Object[]{ a };
1355     }
1356 
1357     @Test
1358     @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0",
1359                   IRNode.ADD_VL, "> 0",
1360                   IRNode.STORE_VECTOR, "> 0"},
1361         applyIfPlatform = {"64-bit", "true"},
1362         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
1363     static Object[] test17a(long[] a) {
             // Increments each long of the array through Unsafe unaligned accessors
             // instead of array indexing. The @IR rule expects long vector nodes and
             // has no AlignVector condition.
1364         // Unsafe: vectorizes with profiling (not xcomp)
1365         for (int i = 0; i < RANGE; i++) {
                 // Byte offset of element i: array base offset plus 8 bytes per long.
1366             int adr = UNSAFE.ARRAY_LONG_BASE_OFFSET + 8 * i;
1367             long v = UNSAFE.getLongUnaligned(a, adr);
1368             UNSAFE.putLongUnaligned(a, adr, v + 1);
1369         }
1370         return new Object[]{ a };
1371     }
1372 
1373     @Test
1374     // Difficult to write good IR rule. Modulo calculus overflow can create non-power-of-2 packs.
1375     static Object[] test17b(long[] a) {
             // Reads and writes 8-byte values through Unsafe at addresses shifted by
             // one byte from the long element boundaries, so each access straddles
             // two array elements. No @IR rule is attached.
1376         // Not alignable
1377         for (int i = 0; i < RANGE-1; i++) {
                 // The trailing +1 moves the access one byte past the start of element i.
1378             int adr = UNSAFE.ARRAY_LONG_BASE_OFFSET + 8 * i + 1;
1379             long v = UNSAFE.getLongUnaligned(a, adr);
1380             UNSAFE.putLongUnaligned(a, adr, v + 1);
1381         }
1382         return new Object[]{ a };
1383     }
1384 
1385     @Test
1386     @IR(counts = {IRNode.LOAD_VECTOR_L, IRNode.VECTOR_SIZE_2, "> 0",
1387                   IRNode.ADD_VL,        IRNode.VECTOR_SIZE_2, "> 0",
1388                   IRNode.STORE_VECTOR, "> 0"},
1389         applyIf = {"MaxVectorSize", ">=32"},
1390         applyIfPlatform = {"64-bit", "true"},
1391         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
1392     static Object[] test17c(long[] a) {
             // Per iteration, increments two adjacent longs (elements i and i+1)
             // through Unsafe, then advances i by 4, leaving elements i+2 and i+3
             // untouched. The @IR rule expects 2-element long vectors and has no
             // AlignVector condition.
1393         // Unsafe: aligned vectorizes
1394         for (int i = 0; i < RANGE-1; i+=4) {
1395             int adr = UNSAFE.ARRAY_LONG_BASE_OFFSET + 8 * i;
                 // Both loads are issued before either store.
1396             long v0 = UNSAFE.getLongUnaligned(a, adr + 0);
1397             long v1 = UNSAFE.getLongUnaligned(a, adr + 8);
1398             UNSAFE.putLongUnaligned(a, adr + 0, v0 + 1);
1399             UNSAFE.putLongUnaligned(a, adr + 8, v1 + 1);
1400         }
1401         return new Object[]{ a };
1402     }
1403 
1404     @Test
1405     @IR(counts = {IRNode.LOAD_VECTOR_L, IRNode.VECTOR_SIZE_2, "> 0",
1406                   IRNode.ADD_VL,        IRNode.VECTOR_SIZE_2, "> 0",
1407                   IRNode.STORE_VECTOR, "> 0"},
1408         applyIfCPUFeatureOr = {"avx512", "true", "asimd", "true"},
1409         applyIfPlatform = {"64-bit", "true"},
1410         applyIfAnd = {"AlignVector", "false", "MaxVectorSize", ">=64"})
1411     // Ensure vector width is large enough to fit 64 byte for longs:
1412     // The offsets are: 25, 33, 57, 65
1413     // In modulo 32:    25,  1, 25,  1  -> does not vectorize
1414     // In modulo 64:    25, 33, 57,  1  -> at least first pair vectorizes
1415     // This problem is because we compute modulo vector width in memory_alignment.
1416     @IR(counts = {IRNode.LOAD_VECTOR_L, "= 0",
1417                   IRNode.ADD_VL, "= 0",
1418                   IRNode.STORE_VECTOR, "= 0"},
1419         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
1420         applyIfPlatform = {"64-bit", "true"},
1421         applyIf = {"AlignVector", "true"})
1422     static Object[] test17d(long[] a) {
             // Per iteration, increments two adjacent 8-byte values through Unsafe at
             // addresses shifted by one byte from the long element boundaries, then
             // advances i by 4. First @IR rule: 2-element long vectors expected with
             // AlignVector off and MaxVectorSize >= 64. Second @IR rule: no vector
             // nodes expected with AlignVector on.
1423         // Not alignable
1424         for (int i = 0; i < RANGE-1; i+=4) {
                 // The trailing +1 moves both accesses one byte off the element boundary.
1425             int adr = UNSAFE.ARRAY_LONG_BASE_OFFSET + 8 * i + 1;
1426             long v0 = UNSAFE.getLongUnaligned(a, adr + 0);
1427             long v1 = UNSAFE.getLongUnaligned(a, adr + 8);
1428             UNSAFE.putLongUnaligned(a, adr + 0, v0 + 1);
1429             UNSAFE.putLongUnaligned(a, adr + 8, v1 + 1);
1430         }
1431         return new Object[]{ a };
1432     }
1433 
1434     @Test
1435     static Object[] test18a(byte[] a, int[] b) {
             // Every iteration stores to the constant indices a[0] and a[1], which do
             // not depend on i, with the store to b[i] placed between them.
             // No @IR rule is attached.
1436         // scale = 0  -->  no iv
1437         for (int i = 0; i < RANGE; i++) {
1438             a[0] = 1;
1439             b[i] = 2;
1440             a[1] = 1;
1441         }
1442         return new Object[]{ a, b };
1443     }
1444 
1445     @Test
1446     static Object[] test18b(byte[] a, int[] b) {
             // Every iteration stores to the constant indices a[1] and a[2], which do
             // not depend on i, with the store to b[i] placed between them.
             // No @IR rule is attached.
1447         // scale = 0  -->  no iv
1448         for (int i = 0; i < RANGE; i++) {
1449             a[1] = 1;
1450             b[i] = 2;
1451             a[2] = 1;
1452         }
1453         return new Object[]{ a, b };
1454     }
1455 
1456     @Test
1457     static Object[] test19(int[] a, int[] b) {
             // Copies 5000 ints from b to a with a down-counting loop: i runs from
             // 5000 to 1, so the accessed index RANGE_FINAL - i increases by one per
             // iteration and ends at RANGE_FINAL - 1. No @IR rule is attached.
1458         for (int i = 5000; i > 0; i--) {
1459             a[RANGE_FINAL - i] = b[RANGE_FINAL - i];
1460         }
1461         return new Object[]{ a, b };
1462     }
1463 
1464     @Test
1465     static Object[] test20(byte[] a) {
             // Increments 4 adjacent bytes per iteration at index 2*i + k + 30 for
             // k = 0..3, with the loop starting at i = 1. The index scale is 2, so
             // consecutive iterations overlap. No @IR rule is attached.
1466         // Example where it is easy to pass alignment check,
1467         // but used to fail the alignment calculation
1468         for (int i = 1; i < RANGE/2-50; i++) {
1469             a[2*i+0+30]++;
1470             a[2*i+1+30]++;
1471             a[2*i+2+30]++;
1472             a[2*i+3+30]++;
1473         }
1474         return new Object[]{ a };
1475     }
1476 }