1 /*
   2  * Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  */
  23 
  24 package compiler.loopopts.superword;
  25 
  26 import compiler.lib.ir_framework.*;
  27 import jdk.test.lib.Utils;
  28 import jdk.test.whitebox.WhiteBox;
  29 import jdk.internal.misc.Unsafe;
  30 import java.lang.reflect.Array;
  31 import java.util.Map;
  32 import java.util.HashMap;
  33 import java.util.Random;
  34 import java.nio.ByteOrder;
  35 
  36 /*
  37  * @test id=NoAlignVector
  38  * @bug 8310190
  39  * @summary Test AlignVector with various loop init, stride, scale, invar, etc.
  40  * @modules java.base/jdk.internal.misc
  41  * @library /test/lib /
  42  * @run driver compiler.loopopts.superword.TestAlignVector NoAlignVector
  43  */
  44 
  45 /*
  46  * @test id=AlignVector
  47  * @bug 8310190
  48  * @summary Test AlignVector with various loop init, stride, scale, invar, etc.
  49  * @modules java.base/jdk.internal.misc
  50  * @library /test/lib /
  51  * @run driver compiler.loopopts.superword.TestAlignVector AlignVector
  52  */
  53 
  54 /*
  55  * @test id=VerifyAlignVector
  56  * @bug 8310190
  57  * @summary Test AlignVector with various loop init, stride, scale, invar, etc.
  58  * @modules java.base/jdk.internal.misc
  59  * @library /test/lib /
  60  * @run driver compiler.loopopts.superword.TestAlignVector VerifyAlignVector
  61  */
  62 
  63 /*
  64  * @test id=NoAlignVector-COH
  65  * @bug 8310190
  66  * @summary Test AlignVector with various loop init, stride, scale, invar, etc.
  67  * @modules java.base/jdk.internal.misc
  68  * @library /test/lib /
  69  * @run driver compiler.loopopts.superword.TestAlignVector NoAlignVector-COH
  70  */
  71 
  72 /*
  73  * @test id=VerifyAlignVector-COH
  74  * @bug 8310190
  75  * @summary Test AlignVector with various loop init, stride, scale, invar, etc.
  76  * @modules java.base/jdk.internal.misc
  77  * @library /test/lib /
  78  * @run driver compiler.loopopts.superword.TestAlignVector VerifyAlignVector-COH
  79  */
  80 
  81 public class TestAlignVector {
  82     static int RANGE = 1024*8; // length of every generated input array, also used as loop bound by the test kernels
  83     static int RANGE_FINAL = 1024*8; // same value as RANGE; NOTE(review): not referenced in the visible part of this file
  84     private static final Unsafe UNSAFE = Unsafe.getUnsafe();
  85     private static final Random RANDOM = Utils.getRandomInstance(); // source of all generated input values
  86 
  87     // Inputs: for each element type an array pair (a*, b*) and a scalar AND-mask (m*).
          // The arrays are generated once in the constructor; every test invocation receives clones.
  88     byte[] aB;
  89     byte[] bB;
  90     byte mB = (byte)31;
  91     short[] aS;
  92     short[] bS;
  93     short mS = (short)0xF0F0;
  94     int[] aI;
  95     int[] bI;
  96     int mI = 0xF0F0F0F0;
  97     long[] aL;
  98     long[] bL;
  99     long mL = 0xF0F0F0F0F0F0F0F0L;
 100 
 101     // List of tests: name -> closure that invokes one test method on cloned inputs
 102     Map<String,TestFunction> tests = new HashMap<String,TestFunction>();
 103 
 104     // List of gold, the results from the first run before compilation (same keys as "tests")
 105     Map<String,Object[]> golds = new HashMap<String,Object[]>();
 106 
 107     interface TestFunction {
 108         Object[] run();
 109     }
 110 
 111     public static void main(String[] args) {
 112         TestFramework framework = new TestFramework(TestAlignVector.class);
 113         framework.addFlags("--add-modules", "java.base", "--add-exports", "java.base/jdk.internal.misc=ALL-UNNAMED",
 114                            "-XX:+IgnoreUnrecognizedVMOptions", "-XX:LoopUnrollLimit=250");
 115 
 116         switch (args[0]) {
 117             case "NoAlignVector"         -> { framework.addFlags("-XX:+UnlockExperimentalVMOptions", "-XX:-UseCompactObjectHeaders", "-XX:-AlignVector"); }
 118             case "AlignVector"           -> { framework.addFlags("-XX:+UnlockExperimentalVMOptions", "-XX:-UseCompactObjectHeaders", "-XX:+AlignVector"); }
 119             case "VerifyAlignVector"     -> { framework.addFlags("-XX:+UnlockExperimentalVMOptions", "-XX:-UseCompactObjectHeaders", "-XX:+AlignVector", "-XX:+IgnoreUnrecognizedVMOptions", "-XX:+VerifyAlignVector"); }
 120             case "NoAlignVector-COH"     -> { framework.addFlags("-XX:+UnlockExperimentalVMOptions", "-XX:+UseCompactObjectHeaders", "-XX:-AlignVector"); }
 121             case "VerifyAlignVector-COH" -> { framework.addFlags("-XX:+UnlockExperimentalVMOptions", "-XX:+UseCompactObjectHeaders", "-XX:+AlignVector", "-XX:+IgnoreUnrecognizedVMOptions", "-XX:+VerifyAlignVector"); }
 122             default -> { throw new RuntimeException("Test argument not recognized: " + args[0]); }
 123         }
 124         framework.start();
 125     }
 126 
 127     public TestAlignVector() {
 128         // Generate input once
 129         aB = generateB();
 130         bB = generateB();
 131         aS = generateS();
 132         bS = generateS();
 133         aI = generateI();
 134         bI = generateI();
 135         aL = generateL();
 136         bL = generateL();
 137 
 138         // Add all tests to list
 139         tests.put("test0",       () -> { return test0(aB.clone(), bB.clone(), mB); });
 140         tests.put("test1a",      () -> { return test1a(aB.clone(), bB.clone(), mB); });
 141         tests.put("test1b",      () -> { return test1b(aB.clone(), bB.clone(), mB); });
 142         tests.put("test2",       () -> { return test2(aB.clone(), bB.clone(), mB); });
 143         tests.put("test3",       () -> { return test3(aB.clone(), bB.clone(), mB); });
 144         tests.put("test4",       () -> { return test4(aB.clone(), bB.clone(), mB); });
 145         tests.put("test5",       () -> { return test5(aB.clone(), bB.clone(), mB, 0); });
 146         tests.put("test6",       () -> { return test6(aB.clone(), bB.clone(), mB); });
 147         tests.put("test7",       () -> { return test7(aS.clone(), bS.clone(), mS); });
 148         tests.put("test8",       () -> { return test8(aB.clone(), bB.clone(), mB, 0); });
 149         tests.put("test8",       () -> { return test8(aB.clone(), bB.clone(), mB, 1); });
 150         tests.put("test9",       () -> { return test9(aB.clone(), bB.clone(), mB); });
 151 
 152         tests.put("test10a",     () -> { return test10a(aB.clone(), bB.clone(), mB); });
 153         tests.put("test10b",     () -> { return test10b(aB.clone(), bB.clone(), mB); });
 154         tests.put("test10c",     () -> { return test10c(aS.clone(), bS.clone(), mS); });
 155         tests.put("test10d",     () -> { return test10d(aS.clone(), bS.clone(), mS); });
 156         tests.put("test10e",     () -> { return test10e(aS.clone(), bS.clone(), mS); });
 157 
 158         tests.put("test11aB",    () -> { return test11aB(aB.clone(), bB.clone(), mB); });
 159         tests.put("test11aS",    () -> { return test11aS(aS.clone(), bS.clone(), mS); });
 160         tests.put("test11aI",    () -> { return test11aI(aI.clone(), bI.clone(), mI); });
 161         tests.put("test11aL",    () -> { return test11aL(aL.clone(), bL.clone(), mL); });
 162 
 163         tests.put("test11bB",    () -> { return test11bB(aB.clone(), bB.clone(), mB); });
 164         tests.put("test11bS",    () -> { return test11bS(aS.clone(), bS.clone(), mS); });
 165         tests.put("test11bI",    () -> { return test11bI(aI.clone(), bI.clone(), mI); });
 166         tests.put("test11bL",    () -> { return test11bL(aL.clone(), bL.clone(), mL); });
 167 
 168         tests.put("test11cB",    () -> { return test11cB(aB.clone(), bB.clone(), mB); });
 169         tests.put("test11cS",    () -> { return test11cS(aS.clone(), bS.clone(), mS); });
 170         tests.put("test11cI",    () -> { return test11cI(aI.clone(), bI.clone(), mI); });
 171         tests.put("test11cL",    () -> { return test11cL(aL.clone(), bL.clone(), mL); });
 172 
 173         tests.put("test11dB",    () -> { return test11dB(aB.clone(), bB.clone(), mB, 0); });
 174         tests.put("test11dS",    () -> { return test11dS(aS.clone(), bS.clone(), mS, 0); });
 175         tests.put("test11dI",    () -> { return test11dI(aI.clone(), bI.clone(), mI, 0); });
 176         tests.put("test11dL",    () -> { return test11dL(aL.clone(), bL.clone(), mL, 0); });
 177 
 178         tests.put("test12",      () -> { return test12(aB.clone(), bB.clone(), mB); });
 179 
 180         tests.put("test13aIL",   () -> { return test13aIL(aI.clone(), aL.clone()); });
 181         tests.put("test13aIB",   () -> { return test13aIB(aI.clone(), aB.clone()); });
 182         tests.put("test13aIS",   () -> { return test13aIS(aI.clone(), aS.clone()); });
 183         tests.put("test13aBSIL", () -> { return test13aBSIL(aB.clone(), aS.clone(), aI.clone(), aL.clone()); });
 184 
 185         tests.put("test13bIL",   () -> { return test13bIL(aI.clone(), aL.clone()); });
 186         tests.put("test13bIB",   () -> { return test13bIB(aI.clone(), aB.clone()); });
 187         tests.put("test13bIS",   () -> { return test13bIS(aI.clone(), aS.clone()); });
 188         tests.put("test13bBSIL", () -> { return test13bBSIL(aB.clone(), aS.clone(), aI.clone(), aL.clone()); });
 189 
 190         tests.put("test14aB",    () -> { return test14aB(aB.clone()); });
 191         tests.put("test14bB",    () -> { return test14bB(aB.clone()); });
 192         tests.put("test14cB",    () -> { return test14cB(aB.clone()); });
 193         tests.put("test14dB",    () -> { return test14dB(aB.clone()); });
 194         tests.put("test14eB",    () -> { return test14eB(aB.clone()); });
 195         tests.put("test14fB",    () -> { return test14fB(aB.clone()); });
 196 
 197         tests.put("test15aB",    () -> { return test15aB(aB.clone()); });
 198         tests.put("test15bB",    () -> { return test15bB(aB.clone()); });
 199         tests.put("test15cB",    () -> { return test15cB(aB.clone()); });
 200 
 201         tests.put("test16a",     () -> { return test16a(aB.clone(), aS.clone()); });
 202         tests.put("test16b",     () -> { return test16b(aB.clone()); });
 203 
 204         tests.put("test17a",     () -> { return test17a(aL.clone()); });
 205         tests.put("test17b",     () -> { return test17b(aL.clone()); });
 206         tests.put("test17c",     () -> { return test17c(aL.clone()); });
 207         tests.put("test17d",     () -> { return test17d(aL.clone()); });
 208 
 209         tests.put("test18a",     () -> { return test18a(aB.clone(), aI.clone()); });
 210         tests.put("test18b",     () -> { return test18b(aB.clone(), aI.clone()); });
 211 
 212         tests.put("test19",      () -> { return test19(aI.clone(), bI.clone()); });
 213         tests.put("test20",      () -> { return test20(aB.clone()); });
 214 
 215         // Compute gold value for all test methods before compilation
 216         for (Map.Entry<String,TestFunction> entry : tests.entrySet()) {
 217             String name = entry.getKey();
 218             TestFunction test = entry.getValue();
 219             Object[] gold = test.run();
 220             golds.put(name, gold);
 221         }
 222     }
 223 
 224     @Warmup(100)
 225     @Run(test = {"test0",
 226                  "test1a",
 227                  "test1b",
 228                  "test2",
 229                  "test3",
 230                  "test4",
 231                  "test5",
 232                  "test6",
 233                  "test7",
 234                  "test8",
 235                  "test9",
 236                  "test10a",
 237                  "test10b",
 238                  "test10c",
 239                  "test10d",
 240                  "test10e",
 241                  "test11aB",
 242                  "test11aS",
 243                  "test11aI",
 244                  "test11aL",
 245                  "test11bB",
 246                  "test11bS",
 247                  "test11bI",
 248                  "test11bL",
 249                  "test11cB",
 250                  "test11cS",
 251                  "test11cI",
 252                  "test11cL",
 253                  "test11dB",
 254                  "test11dS",
 255                  "test11dI",
 256                  "test11dL",
 257                  "test12",
 258                  "test13aIL",
 259                  "test13aIB",
 260                  "test13aIS",
 261                  "test13aBSIL",
 262                  "test13bIL",
 263                  "test13bIB",
 264                  "test13bIS",
 265                  "test13bBSIL",
 266                  "test14aB",
 267                  "test14bB",
 268                  "test14cB",
 269                  "test14dB",
 270                  "test14eB",
 271                  "test14fB",
 272                  "test15aB",
 273                  "test15bB",
 274                  "test15cB",
 275                  "test16a",
 276                  "test16b",
 277                  "test17a",
 278                  "test17b",
 279                  "test17c",
 280                  "test17d",
 281                  "test18a",
 282                  "test18b",
 283                  "test19",
 284                  "test20"})
 285     public void runTests() {
 286         for (Map.Entry<String,TestFunction> entry : tests.entrySet()) {
 287             String name = entry.getKey();
 288             TestFunction test = entry.getValue();
 289             // Recall gold value from before compilation
 290             Object[] gold = golds.get(name);
 291             // Compute new result
 292             Object[] result = test.run();
 293             // Compare gold and new result
 294             verify(name, gold, result);
 295         }
 296     }
 297 
 298     static byte[] generateB() {
 299         byte[] a = new byte[RANGE];
 300         for (int i = 0; i < a.length; i++) {
 301             a[i] = (byte)RANDOM.nextInt();
 302         }
 303         return a;
 304     }
 305 
 306     static short[] generateS() {
 307         short[] a = new short[RANGE];
 308         for (int i = 0; i < a.length; i++) {
 309             a[i] = (short)RANDOM.nextInt();
 310         }
 311         return a;
 312     }
 313 
 314     static int[] generateI() {
 315         int[] a = new int[RANGE];
 316         for (int i = 0; i < a.length; i++) {
 317             a[i] = RANDOM.nextInt();
 318         }
 319         return a;
 320     }
 321 
 322     static long[] generateL() {
 323         long[] a = new long[RANGE];
 324         for (int i = 0; i < a.length; i++) {
 325             a[i] = RANDOM.nextLong();
 326         }
 327         return a;
 328     }
 329 
 330     static void verify(String name, Object[] gold, Object[] result) {
 331         if (gold.length != result.length) {
 332             throw new RuntimeException("verify " + name + ": not the same number of outputs: gold.length = " +
 333                                        gold.length + ", result.length = " + result.length);
 334         }
 335         for (int i = 0; i < gold.length; i++) {
 336             Object g = gold[i];
 337             Object r = result[i];
 338             if (g.getClass() != r.getClass() || !g.getClass().isArray() || !r.getClass().isArray()) {
 339                 throw new RuntimeException("verify " + name + ": must both be array of same type:" +
 340                                            " gold[" + i + "].getClass() = " + g.getClass().getSimpleName() +
 341                                            " result[" + i + "].getClass() = " + r.getClass().getSimpleName());
 342             }
 343             if (g == r) {
 344                 throw new RuntimeException("verify " + name + ": should be two separate arrays (with identical content):" +
 345                                            " gold[" + i + "] == result[" + i + "]");
 346             }
 347             if (Array.getLength(g) != Array.getLength(r)) {
 348                     throw new RuntimeException("verify " + name + ": arrays must have same length:" +
 349                                            " gold[" + i + "].length = " + Array.getLength(g) +
 350                                            " result[" + i + "].length = " + Array.getLength(r));
 351             }
 352             Class c = g.getClass().getComponentType();
 353             if (c == byte.class) {
 354                 verifyB(name, i, (byte[])g, (byte[])r);
 355             } else if (c == short.class) {
 356                 verifyS(name, i, (short[])g, (short[])r);
 357             } else if (c == int.class) {
 358                 verifyI(name, i, (int[])g, (int[])r);
 359             } else if (c == long.class) {
 360                 verifyL(name, i, (long[])g, (long[])r);
 361             } else {
 362                 throw new RuntimeException("verify " + name + ": array type not supported for verify:" +
 363                                        " gold[" + i + "].getClass() = " + g.getClass().getSimpleName() +
 364                                        " result[" + i + "].getClass() = " + r.getClass().getSimpleName());
 365             }
 366         }
 367     }
 368 
 369     static void verifyB(String name, int i, byte[] g, byte[] r) {
 370         for (int j = 0; j < g.length; j++) {
 371             if (g[j] != r[j]) {
 372                 throw new RuntimeException("verify " + name + ": arrays must have same content:" +
 373                                            " gold[" + i + "][" + j + "] = " + g[j] +
 374                                            " result[" + i + "][" + j + "] = " + r[j]);
 375             }
 376         }
 377     }
 378 
 379     static void verifyS(String name, int i, short[] g, short[] r) {
 380         for (int j = 0; j < g.length; j++) {
 381             if (g[j] != r[j]) {
 382                 throw new RuntimeException("verify " + name + ": arrays must have same content:" +
 383                                            " gold[" + i + "][" + j + "] = " + g[j] +
 384                                            " result[" + i + "][" + j + "] = " + r[j]);
 385             }
 386         }
 387     }
 388 
 389     static void verifyI(String name, int i, int[] g, int[] r) {
 390         for (int j = 0; j < g.length; j++) {
 391             if (g[j] != r[j]) {
 392                 throw new RuntimeException("verify " + name + ": arrays must have same content:" +
 393                                            " gold[" + i + "][" + j + "] = " + g[j] +
 394                                            " result[" + i + "][" + j + "] = " + r[j]);
 395             }
 396         }
 397     }
 398 
 399     static void verifyL(String name, int i, long[] g, long[] r) {
 400         for (int j = 0; j < g.length; j++) {
 401             if (g[j] != r[j]) {
 402                 throw new RuntimeException("verify " + name + ": arrays must have same content:" +
 403                                            " gold[" + i + "][" + j + "] = " + g[j] +
 404                                            " result[" + i + "][" + j + "] = " + r[j]);
 405             }
 406         }
 407     }
 408 
 409     @Test
         // Kernel: b[i+k] = a[i+k] & mask for k = 0..3 of every 8-element group (i steps by 8),
         // i.e. one 4-byte pack starting at offset 0 of each group.
         // The @IR rule below has no AlignVector condition: size-4 byte vectors are expected
         // whenever MaxVectorSize >= 8 on the listed platforms/CPU features.
 410     @IR(counts = {IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE_4, "> 0",
 411                   IRNode.AND_VB,        IRNode.VECTOR_SIZE_4, "> 0",
 412                   IRNode.STORE_VECTOR, "> 0"},
 413         applyIf = {"MaxVectorSize", ">=8"},
 414         applyIfPlatform = {"64-bit", "true"},
 415         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
 416     static Object[] test0(byte[] a, byte[] b, byte mask) {
 417         for (int i = 0; i < RANGE; i+=8) {
 418             // Safe to vectorize with AlignVector
 419             b[i+0] = (byte)(a[i+0] & mask); // offset 0, align 0
 420             b[i+1] = (byte)(a[i+1] & mask);
 421             b[i+2] = (byte)(a[i+2] & mask);
 422             b[i+3] = (byte)(a[i+3] & mask);
 423         }
         // Both arrays are returned so the caller can compare them against the gold run.
 424         return new Object[]{ a, b };
 425     }
 426 
 427     @Test
         // Kernel: b[i+k] = a[i+k] & mask for all k = 0..7 of every 8-element group, loop starting at i = 0.
         // The @IR rule applies when UseCompactObjectHeaders is false OR AlignVector is false (applyIfOr);
         // the comment inside the annotation explains why the header size matters.
 428     @IR(counts = {IRNode.LOAD_VECTOR_B, "> 0",
 429                   IRNode.AND_VB, "> 0",
 430                   IRNode.STORE_VECTOR, "> 0"},
 431         applyIfOr = {"UseCompactObjectHeaders", "false", "AlignVector", "false"},
 432         // UNSAFE.ARRAY_BYTE_BASE_OFFSET = 16, but with compact object headers UNSAFE.ARRAY_BYTE_BASE_OFFSET=12.
 433         // If AlignVector=true, we need the offset to be 8-byte aligned, else the vectors are filtered out.
 434         applyIfPlatform = {"64-bit", "true"},
 435         applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true"})
 436     static Object[] test1a(byte[] a, byte[] b, byte mask) {
 437         for (int i = 0; i < RANGE; i+=8) {
 438             b[i+0] = (byte)(a[i+0] & mask); // adr = base + UNSAFE.ARRAY_BYTE_BASE_OFFSET + 0 + iter*8
 439             b[i+1] = (byte)(a[i+1] & mask);
 440             b[i+2] = (byte)(a[i+2] & mask);
 441             b[i+3] = (byte)(a[i+3] & mask);
 442             b[i+4] = (byte)(a[i+4] & mask);
 443             b[i+5] = (byte)(a[i+5] & mask);
 444             b[i+6] = (byte)(a[i+6] & mask);
 445             b[i+7] = (byte)(a[i+7] & mask);
 446         }
         // Both arrays are returned so the caller can compare them against the gold run.
 447         return new Object[]{ a, b };
 448     }
 449 
 450     @Test
         // Same 8-element kernel as test1a, but the loop starts at i = 4 and stops before RANGE-8,
         // which shifts every access by 4 bytes.
         // The @IR rule applies when UseCompactObjectHeaders is true OR AlignVector is false (applyIfOr),
         // i.e. the compact-header counterpart of test1a's condition.
 451     @IR(counts = {IRNode.LOAD_VECTOR_B, "> 0",
 452                   IRNode.AND_VB, "> 0",
 453                   IRNode.STORE_VECTOR, "> 0"},
 454         applyIfOr = {"UseCompactObjectHeaders", "true", "AlignVector", "false"},
 455         // UNSAFE.ARRAY_BYTE_BASE_OFFSET = 16, but with compact object headers UNSAFE.ARRAY_BYTE_BASE_OFFSET=12.
 456         // If AlignVector=true, we need the offset to be 8-byte aligned, else the vectors are filtered out.
 457         applyIfPlatform = {"64-bit", "true"},
 458         applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true"})
 459     static Object[] test1b(byte[] a, byte[] b, byte mask) {
 460         for (int i = 4; i < RANGE-8; i+=8) {
 461             b[i+0] = (byte)(a[i+0] & mask); // adr = base + UNSAFE.ARRAY_BYTE_BASE_OFFSET + 4 + iter*8
 462             b[i+1] = (byte)(a[i+1] & mask);
 463             b[i+2] = (byte)(a[i+2] & mask);
 464             b[i+3] = (byte)(a[i+3] & mask);
 465             b[i+4] = (byte)(a[i+4] & mask);
 466             b[i+5] = (byte)(a[i+5] & mask);
 467             b[i+6] = (byte)(a[i+6] & mask);
 468             b[i+7] = (byte)(a[i+7] & mask);
 469         }
         // Both arrays are returned so the caller can compare them against the gold run.
 470         return new Object[]{ a, b };
 471     }
 472 
 473     @Test
         // Kernel: one 4-byte pack at offsets 3..6 of every 8-element group.
         // Two @IR rules: with AlignVector=false (and MaxVectorSize >= 8) size-4 byte vectors are expected;
         // with AlignVector=true no vector load/and/store may appear.
 474     @IR(counts = {IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE_4, "> 0",
 475                   IRNode.AND_VB,        IRNode.VECTOR_SIZE_4, "> 0",
 476                   IRNode.STORE_VECTOR, "> 0"},
 477         applyIfAnd = {"AlignVector", "false", "MaxVectorSize", ">=8"},
 478         applyIfPlatform = {"64-bit", "true"},
 479         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
 480     @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
 481                   IRNode.AND_VB, "= 0",
 482                   IRNode.STORE_VECTOR, "= 0"},
 483         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
 484         applyIfPlatform = {"64-bit", "true"},
 485         applyIf = {"AlignVector", "true"})
 486     static Object[] test2(byte[] a, byte[] b, byte mask) {
 487         for (int i = 0; i < RANGE; i+=8) {
 488             // Cannot align with AlignVector: 3 + x * 8 % 8 = 3
 489             b[i+3] = (byte)(a[i+3] & mask); // at alignment 3
 490             b[i+4] = (byte)(a[i+4] & mask);
 491             b[i+5] = (byte)(a[i+5] & mask);
 492             b[i+6] = (byte)(a[i+6] & mask);
 493         }
         // Both arrays are returned so the caller can compare them against the gold run.
 494         return new Object[]{ a, b };
 495     }
 496 
 497     @Test
         // Kernel: a single access at offset 0 plus a 4-byte pack at offsets 3..6 of every 8-element group.
         // Same pair of @IR rules as test2: vectors expected with AlignVector=false, none with AlignVector=true.
 498     @IR(counts = {IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE_4, "> 0",
 499                   IRNode.AND_VB,        IRNode.VECTOR_SIZE_4, "> 0",
 500                   IRNode.STORE_VECTOR, "> 0"},
 501         applyIfAnd = {"AlignVector", "false", "MaxVectorSize", ">=8"},
 502         applyIfPlatform = {"64-bit", "true"},
 503         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
 504     @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
 505                   IRNode.AND_VB, "= 0",
 506                   IRNode.STORE_VECTOR, "= 0"},
 507         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
 508         applyIfPlatform = {"64-bit", "true"},
 509         applyIf = {"AlignVector", "true"})
 510     static Object[] test3(byte[] a, byte[] b, byte mask) {
 511         for (int i = 0; i < RANGE; i+=8) {
 512             // Cannot align with AlignVector: 3 + x * 8 % 8 = 3
 513 
 514             // Problematic for AlignVector
 515             b[i+0] = (byte)(a[i+0] & mask); // best_memref, align 0
 516 
 517             b[i+3] = (byte)(a[i+3] & mask); // pack at offset 3 bytes
 518             b[i+4] = (byte)(a[i+4] & mask);
 519             b[i+5] = (byte)(a[i+5] & mask);
 520             b[i+6] = (byte)(a[i+6] & mask);
 521         }
         // Both arrays are returned so the caller can compare them against the gold run.
 522         return new Object[]{ a, b };
 523     }
 524 
 525     @Test
         // Kernel: per 16-element group (index i*16), a 4-byte pack at offsets 0..3 and an
         // 8-byte pack at offsets 5..12.
         // Rule 1 (AlignVector=false, MaxVectorSize >= 16): both size-4 and size-8 vectors expected.
         // Rule 2 (AlignVector=true,  MaxVectorSize >= 16): size-4 vectors expected, size-8 vectors must be absent.
 526     @IR(counts = {IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE_4, "> 0",
 527                   IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE_8, "> 0",
 528                   IRNode.AND_VB,        IRNode.VECTOR_SIZE_4, "> 0",
 529                   IRNode.AND_VB,        IRNode.VECTOR_SIZE_8, "> 0",
 530                   IRNode.STORE_VECTOR, "> 0"},
 531         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
 532         applyIfPlatform = {"64-bit", "true"},
 533         applyIfAnd = {"AlignVector", "false", "MaxVectorSize", ">=16"})
 534     @IR(counts = {IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE_4, "> 0",
 535                   IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE_8, "= 0",// unaligned
 536                   IRNode.AND_VB,        IRNode.VECTOR_SIZE_4, "> 0",
 537                   IRNode.AND_VB,        IRNode.VECTOR_SIZE_8, "= 0",// unaligned
 538                   IRNode.STORE_VECTOR, "> 0"},
 539         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
 540         applyIfPlatform = {"64-bit", "true"},
 541         applyIfAnd = {"AlignVector", "true", "MaxVectorSize", ">=16"})
 542     static Object[] test4(byte[] a, byte[] b, byte mask) {
 543         for (int i = 0; i < RANGE/16; i++) {
 544             // Problematic for AlignVector
 545             b[i*16 + 0 ] = (byte)(a[i*16 + 0 ] & mask); // 4 pack, 0 aligned
 546             b[i*16 + 1 ] = (byte)(a[i*16 + 1 ] & mask);
 547             b[i*16 + 2 ] = (byte)(a[i*16 + 2 ] & mask);
 548             b[i*16 + 3 ] = (byte)(a[i*16 + 3 ] & mask);
 549 
 550             b[i*16 + 5 ] = (byte)(a[i*16 + 5 ] & mask); // 8 pack, 5 aligned
 551             b[i*16 + 6 ] = (byte)(a[i*16 + 6 ] & mask);
 552             b[i*16 + 7 ] = (byte)(a[i*16 + 7 ] & mask);
 553             b[i*16 + 8 ] = (byte)(a[i*16 + 8 ] & mask);
 554             b[i*16 + 9 ] = (byte)(a[i*16 + 9 ] & mask);
 555             b[i*16 + 10] = (byte)(a[i*16 + 10] & mask);
 556             b[i*16 + 11] = (byte)(a[i*16 + 11] & mask);
 557             b[i*16 + 12] = (byte)(a[i*16 + 12] & mask);
 558         }
         // Both arrays are returned so the caller can compare them against the gold run.
 559         return new Object[]{ a, b };
 560     }
 561 
 562     @Test
         // Kernel: same access pattern as test3 (offset 0 plus a 4-byte pack at offsets 3..6 per
         // 8-element group), but every index is additionally shifted by the loop-invariant parameter inv.
         // @IR rules: vectors expected with AlignVector=false, none with AlignVector=true.
 563     @IR(counts = {IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE_4, "> 0",
 564                   IRNode.AND_VB,        IRNode.VECTOR_SIZE_4, "> 0",
 565                   IRNode.STORE_VECTOR, "> 0"},
 566         applyIfAnd = {"AlignVector", "false", "MaxVectorSize", ">=8"},
 567         applyIfPlatform = {"64-bit", "true"},
 568         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
 569     @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
 570                   IRNode.AND_VB, "= 0",
 571                   IRNode.STORE_VECTOR, "= 0"},
 572         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
 573         applyIfPlatform = {"64-bit", "true"},
 574         applyIf = {"AlignVector", "true"})
 575     static Object[] test5(byte[] a, byte[] b, byte mask, int inv) {
 576         for (int i = 0; i < RANGE; i+=8) {
 577             // Cannot align with AlignVector because of invariant
 578             b[i+inv+0] = (byte)(a[i+inv+0] & mask);
 579 
 580             b[i+inv+3] = (byte)(a[i+inv+3] & mask);
 581             b[i+inv+4] = (byte)(a[i+inv+4] & mask);
 582             b[i+inv+5] = (byte)(a[i+inv+5] & mask);
 583             b[i+inv+6] = (byte)(a[i+inv+6] & mask);
 584         }
         // Both arrays are returned so the caller can compare them against the gold run.
 585         return new Object[]{ a, b };
 586     }
 587 
 588     @Test
         // Kernel: index is i*4 with i stepping by 2 (so the index advances by 8 per iteration);
         // accesses offset 0 plus a 4-byte pack at offsets 3..6.
         // @IR rules: vectors expected with AlignVector=false, none with AlignVector=true.
 589     @IR(counts = {IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE_4, "> 0",
 590                   IRNode.AND_VB,        IRNode.VECTOR_SIZE_4, "> 0",
 591                   IRNode.STORE_VECTOR, "> 0"},
 592         applyIfAnd = {"AlignVector", "false", "MaxVectorSize", ">=8"},
 593         applyIfPlatform = {"64-bit", "true"},
 594         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
 595     @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
 596                   IRNode.AND_VB, "= 0",
 597                   IRNode.STORE_VECTOR, "= 0"},
 598         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
 599         applyIfPlatform = {"64-bit", "true"},
 600         applyIf = {"AlignVector", "true"})
 601     static Object[] test6(byte[] a, byte[] b, byte mask) {
 602         for (int i = 0; i < RANGE/8; i+=2) {
 603             // Cannot align with AlignVector because offset is odd
 604             b[i*4+0] = (byte)(a[i*4+0] & mask);
 605 
 606             b[i*4+3] = (byte)(a[i*4+3] & mask);
 607             b[i*4+4] = (byte)(a[i*4+4] & mask);
 608             b[i*4+5] = (byte)(a[i*4+5] & mask);
 609             b[i*4+6] = (byte)(a[i*4+6] & mask);
 610         }
         // Both arrays are returned so the caller can compare them against the gold run.
 611         return new Object[]{ a, b };
 612     }
 613 
 614     @Test
         // Short-array variant of test6: index i*4 with i stepping by 2; accesses offset 0 plus a
         // 4-element pack at offsets 3..6.
         // @IR rules: size-4 short vectors expected with AlignVector=false and MaxVectorSize >= 16,
         // no vectors with AlignVector=true.
         // NOTE(review): the first rule is gated on avx2 while the second rule (and the byte twin test6)
         // use sse4.1 - confirm the stricter CPU feature is intended.
 615     @IR(counts = {IRNode.LOAD_VECTOR_S, IRNode.VECTOR_SIZE_4, "> 0",
 616                   IRNode.AND_VS,        IRNode.VECTOR_SIZE_4, "> 0",
 617                   IRNode.STORE_VECTOR, "> 0"},
 618         applyIfAnd = {"AlignVector", "false", "MaxVectorSize", ">=16"},
 619         applyIfPlatform = {"64-bit", "true"},
 620         applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true"})
 621     @IR(counts = {IRNode.LOAD_VECTOR_S, "= 0",
 622                   IRNode.AND_VS, "= 0",
 623                   IRNode.STORE_VECTOR, "= 0"},
 624         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
 625         applyIfPlatform = {"64-bit", "true"},
 626         applyIf = {"AlignVector", "true"})
 627     static Object[] test7(short[] a, short[] b, short mask) {
 628         for (int i = 0; i < RANGE/8; i+=2) {
 629             // Cannot align with AlignVector because offset is odd
 630             b[i*4+0] = (short)(a[i*4+0] & mask);
 631 
 632             b[i*4+3] = (short)(a[i*4+3] & mask);
 633             b[i*4+4] = (short)(a[i*4+4] & mask);
 634             b[i*4+5] = (short)(a[i*4+5] & mask);
 635             b[i*4+6] = (short)(a[i*4+6] & mask);
 636         }
         // Both arrays are returned so the caller can compare them against the gold run.
 637         return new Object[]{ a, b };
 638     }
 639 
 640     @Test
         // Kernel: same access pattern as test3 (offset 0 plus a 4-byte pack at offsets 3..6 per
         // 8-element step), but the loop start value is the parameter init rather than a constant.
         // @IR rules: vectors expected with AlignVector=false, none with AlignVector=true.
 641     @IR(counts = {IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE_4, "> 0",
 642                   IRNode.AND_VB,        IRNode.VECTOR_SIZE_4, "> 0",
 643                   IRNode.STORE_VECTOR, "> 0"},
 644         applyIfAnd = {"AlignVector", "false", "MaxVectorSize", ">=8"},
 645         applyIfPlatform = {"64-bit", "true"},
 646         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
 647     @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
 648                   IRNode.AND_VB, "= 0",
 649                   IRNode.STORE_VECTOR, "= 0"},
 650         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
 651         applyIfPlatform = {"64-bit", "true"},
 652         applyIf = {"AlignVector", "true"})
 653     static Object[] test8(byte[] a, byte[] b, byte mask, int init) {
 654         for (int i = init; i < RANGE; i+=8) {
 655             // Cannot align with AlignVector because of invariant (variable init becomes invar)
 656             b[i+0] = (byte)(a[i+0] & mask);
 657 
 658             b[i+3] = (byte)(a[i+3] & mask);
 659             b[i+4] = (byte)(a[i+4] & mask);
 660             b[i+5] = (byte)(a[i+5] & mask);
 661             b[i+6] = (byte)(a[i+6] & mask);
 662         }
         // Both arrays are returned so the caller can compare them against the gold run.
 663         return new Object[]{ a, b };
 664     }
 665 
 666     @Test
 667     @IR(counts = {IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE_4, "> 0",
 668                   IRNode.AND_VB,        IRNode.VECTOR_SIZE_4, "> 0",
 669                   IRNode.STORE_VECTOR, "> 0"},
 670         applyIf = {"MaxVectorSize", ">=8"},
 671         applyIfPlatform = {"64-bit", "true"},
 672         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
         // Masks the byte elements at offsets 0 and 3..6 relative to i from a into b, with
         // i starting at the constant 13 and advancing by 8; returns { a, b }.
         // The single IR rule above has no AlignVector condition: size-4 byte vectors are
         // expected whenever MaxVectorSize>=8.
 673     static Object[] test9(byte[] a, byte[] b, byte mask) {
 674         // known non-zero init value does not affect offset, but has implicit effect on iv
 675         for (int i = 13; i < RANGE-8; i+=8) {
 676             b[i+0] = (byte)(a[i+0] & mask);
 677 
 678             b[i+3] = (byte)(a[i+3] & mask);
 679             b[i+4] = (byte)(a[i+4] & mask);
 680             b[i+5] = (byte)(a[i+5] & mask);
 681             b[i+6] = (byte)(a[i+6] & mask);
 682         }
 683         return new Object[]{ a, b };
 684     }
 685 
 686     @Test
 687     @IR(counts = {IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE_4, "> 0",
 688                   IRNode.AND_VB,        IRNode.VECTOR_SIZE_4, "> 0",
 689                   IRNode.STORE_VECTOR, "> 0"},
 690         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
 691         applyIfPlatform = {"64-bit", "true"},
 692         applyIfAnd = {"AlignVector", "false", "MaxVectorSize", ">=8"})
 693     @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
 694                   IRNode.AND_VB, "= 0",
 695                   IRNode.STORE_VECTOR, "= 0"},
 696         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
 697         applyIfPlatform = {"64-bit", "true"},
 698         applyIf = {"AlignVector", "true"})
         // Masks four consecutive byte elements (offsets 0..3 relative to i) from a into b,
         // with i starting at the constant 3 and advancing by 8; returns { a, b }.
         // IR rules above: size-4 byte vectors are expected with AlignVector=false and
         // MaxVectorSize>=8; no vector nodes are expected with AlignVector=true.
 699     static Object[] test10a(byte[] a, byte[] b, byte mask) {
 700         // This is not alignable with pre-loop, because of odd init.
 701         for (int i = 3; i < RANGE-8; i+=8) {
 702             b[i+0] = (byte)(a[i+0] & mask);
 703             b[i+1] = (byte)(a[i+1] & mask);
 704             b[i+2] = (byte)(a[i+2] & mask);
 705             b[i+3] = (byte)(a[i+3] & mask);
 706         }
 707         return new Object[]{ a, b };
 708     }
 709 
 710     @Test
 711     @IR(counts = {IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE_4, "> 0",
 712                   IRNode.AND_VB,        IRNode.VECTOR_SIZE_4, "> 0",
 713                   IRNode.STORE_VECTOR, "> 0"},
 714         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
 715         applyIfPlatform = {"64-bit", "true"},
 716         applyIfAnd = {"AlignVector", "false", "MaxVectorSize", ">=8"})
 717     @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
 718                   IRNode.AND_VB, "= 0",
 719                   IRNode.STORE_VECTOR, "= 0"},
 720         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
 721         applyIfPlatform = {"64-bit", "true"},
 722         applyIf = {"AlignVector", "true"})
         // Masks four consecutive byte elements (offsets 0..3 relative to i) from a into b,
         // with i starting at the constant 13 and advancing by 8; returns { a, b }.
         // IR rules above: size-4 byte vectors are expected with AlignVector=false and
         // MaxVectorSize>=8; no vector nodes are expected with AlignVector=true.
 723     static Object[] test10b(byte[] a, byte[] b, byte mask) {
 724         // This is not alignable with pre-loop, because of odd init.
 725         // Seems not correctly handled.
 726         for (int i = 13; i < RANGE-8; i+=8) {
 727             b[i+0] = (byte)(a[i+0] & mask);
 728             b[i+1] = (byte)(a[i+1] & mask);
 729             b[i+2] = (byte)(a[i+2] & mask);
 730             b[i+3] = (byte)(a[i+3] & mask);
 731         }
 732         return new Object[]{ a, b };
 733     }
 734 
 735     @Test
 736     @IR(counts = {IRNode.LOAD_VECTOR_S, IRNode.VECTOR_SIZE_4, "> 0",
 737                   IRNode.AND_VS,        IRNode.VECTOR_SIZE_4, "> 0",
 738                   IRNode.STORE_VECTOR, "> 0"},
 739         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
 740         applyIfPlatform = {"64-bit", "true"},
 741         applyIfAnd = {"AlignVector", "false", "MaxVectorSize", ">=16"})
 742     @IR(counts = {IRNode.LOAD_VECTOR_S, "= 0",
 743                   IRNode.AND_VS, "= 0",
 744                   IRNode.STORE_VECTOR, "= 0"},
 745         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
 746         applyIfPlatform = {"64-bit", "true"},
 747         applyIf = {"AlignVector", "true"})
         // Short-array variant: masks four consecutive short elements (offsets 0..3
         // relative to i) from a into b, with i starting at 13 and advancing by 8;
         // returns { a, b }.
         // IR rules above: size-4 short vectors are expected with AlignVector=false and
         // MaxVectorSize>=16; no vector nodes are expected with AlignVector=true.
 748     static Object[] test10c(short[] a, short[] b, short mask) {
 749         // This is not alignable with pre-loop, because of odd init.
 750         // Seems not correctly handled with MaxVectorSize >= 32.
 751         for (int i = 13; i < RANGE-8; i+=8) {
 752             b[i+0] = (short)(a[i+0] & mask);
 753             b[i+1] = (short)(a[i+1] & mask);
 754             b[i+2] = (short)(a[i+2] & mask);
 755             b[i+3] = (short)(a[i+3] & mask);
 756         }
 757         return new Object[]{ a, b };
 758     }
 759 
 760     @Test
 761     @IR(counts = {IRNode.LOAD_VECTOR_S, IRNode.VECTOR_SIZE_4, "> 0",
 762                   IRNode.AND_VS,        IRNode.VECTOR_SIZE_4, "> 0",
 763                   IRNode.STORE_VECTOR, "> 0"},
 764         applyIfAnd = {"MaxVectorSize", ">=16", "UseCompactObjectHeaders", "false"},
 765         // UNSAFE.ARRAY_BYTE_BASE_OFFSET = 16, but with compact object headers UNSAFE.ARRAY_BYTE_BASE_OFFSET=12.
 766         // If AlignVector=true, we need the offset to be 8-byte aligned, else the vectors are filtered out.
 767         applyIfPlatform = {"64-bit", "true"},
 768         applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true"})
         // Masks four consecutive short elements at index i+3 .. i+6 from a into b, with
         // i starting at 13 and advancing by 8; returns { a, b }.
         // The IR rule above only applies with UseCompactObjectHeaders=false and has no
         // AlignVector condition, so size-4 short vectors are expected in both AlignVector modes.
 769     static Object[] test10d(short[] a, short[] b, short mask) {
 770         for (int i = 13; i < RANGE-16; i+=8) {
 771             // adr = base + UNSAFE.ARRAY_SHORT_BASE_OFFSET + 2*(3 + 13) + iter*16
                 // With a base offset of 16 this is 48 + iter*16, i.e. a multiple of 8.
 772             b[i+0+3] = (short)(a[i+0+3] & mask);
 773             b[i+1+3] = (short)(a[i+1+3] & mask);
 774             b[i+2+3] = (short)(a[i+2+3] & mask);
 775             b[i+3+3] = (short)(a[i+3+3] & mask);
 776         }
 777         return new Object[]{ a, b };
 778     }
 779 
 780     @Test
 781     @IR(counts = {IRNode.LOAD_VECTOR_S, IRNode.VECTOR_SIZE_4, "> 0",
 782                   IRNode.AND_VS,        IRNode.VECTOR_SIZE_4, "> 0",
 783                   IRNode.STORE_VECTOR, "> 0"},
 784         applyIfAnd = {"MaxVectorSize", ">=16", "UseCompactObjectHeaders", "true"},
 785         // UNSAFE.ARRAY_BYTE_BASE_OFFSET = 16, but with compact object headers UNSAFE.ARRAY_BYTE_BASE_OFFSET=12.
 786         // If AlignVector=true, we need the offset to be 8-byte aligned, else the vectors are filtered out.
 787         applyIfPlatform = {"64-bit", "true"},
 788         applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true"})
         // Masks four consecutive short elements at index i+3 .. i+6 from a into b, with
         // i starting at 11 and advancing by 8; returns { a, b }.
         // The IR rule above only applies with UseCompactObjectHeaders=true and has no
         // AlignVector condition, so size-4 short vectors are expected in both AlignVector modes.
 789     static Object[] test10e(short[] a, short[] b, short mask) {
 790         for (int i = 11; i < RANGE-16; i+=8) {
 791             // adr = base + UNSAFE.ARRAY_SHORT_BASE_OFFSET + 2*(3 + 11) + iter*16
                 // With a base offset of 12 this is 40 + iter*16, i.e. a multiple of 8.
 792             b[i+0+3] = (short)(a[i+0+3] & mask);
 793             b[i+1+3] = (short)(a[i+1+3] & mask);
 794             b[i+2+3] = (short)(a[i+2+3] & mask);
 795             b[i+3+3] = (short)(a[i+3+3] & mask);
 796         }
 797         return new Object[]{ a, b };
 798     }
 799 
 800     @Test
 801     @IR(counts = {IRNode.LOAD_VECTOR_B, "> 0",
 802                   IRNode.AND_VB, "> 0",
 803                   IRNode.STORE_VECTOR, "> 0"},
 804         applyIfPlatform = {"64-bit", "true"},
 805         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
         // Unit-stride byte loop from 0: b[i] = a[i] & mask for i in [0, RANGE); returns { a, b }.
         // The single IR rule above has no AlignVector condition, so vector nodes are
         // expected in both AlignVector modes.
 806     static Object[] test11aB(byte[] a, byte[] b, byte mask) {
 807         for (int i = 0; i < RANGE; i++) {
 808             // always alignable
 809             b[i+0] = (byte)(a[i+0] & mask);
 810         }
 811         return new Object[]{ a, b };
 812     }
 813 
 814     @Test
 815     @IR(counts = {IRNode.LOAD_VECTOR_S, "> 0",
 816                   IRNode.AND_VS, "> 0",
 817                   IRNode.STORE_VECTOR, "> 0"},
 818         applyIfPlatform = {"64-bit", "true"},
 819         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
         // Unit-stride short loop from 0: b[i] = a[i] & mask for i in [0, RANGE); returns { a, b }.
         // The single IR rule above has no AlignVector condition, so vector nodes are
         // expected in both AlignVector modes.
 820     static Object[] test11aS(short[] a, short[] b, short mask) {
 821         for (int i = 0; i < RANGE; i++) {
 822             // always alignable
 823             b[i+0] = (short)(a[i+0] & mask);
 824         }
 825         return new Object[]{ a, b };
 826     }
 827 
 828     @Test
 829     @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0",
 830                   IRNode.AND_VI, "> 0",
 831                   IRNode.STORE_VECTOR, "> 0"},
 832         applyIfPlatform = {"64-bit", "true"},
 833         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
         // Unit-stride int loop from 0: b[i] = a[i] & mask for i in [0, RANGE); returns { a, b }.
         // The single IR rule above has no AlignVector condition, so vector nodes are
         // expected in both AlignVector modes.
 834     static Object[] test11aI(int[] a, int[] b, int mask) {
 835         for (int i = 0; i < RANGE; i++) {
 836             // always alignable
 837             b[i+0] = (int)(a[i+0] & mask);
 838         }
 839         return new Object[]{ a, b };
 840     }
 841 
 842     @Test
 843     @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0",
 844                   IRNode.AND_VL, "> 0",
 845                   IRNode.STORE_VECTOR, "> 0"},
 846         applyIfPlatform = {"64-bit", "true"},
 847         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
         // Unit-stride long loop from 0: b[i] = a[i] & mask for i in [0, RANGE); returns { a, b }.
         // The single IR rule above has no AlignVector condition, so vector nodes are
         // expected in both AlignVector modes.
 848     static Object[] test11aL(long[] a, long[] b, long mask) {
 849         for (int i = 0; i < RANGE; i++) {
 850             // always alignable
 851             b[i+0] = (long)(a[i+0] & mask);
 852         }
 853         return new Object[]{ a, b };
 854     }
 855 
 856     @Test
 857     @IR(counts = {IRNode.LOAD_VECTOR_B, "> 0",
 858                   IRNode.AND_VB, "> 0",
 859                   IRNode.STORE_VECTOR, "> 0"},
 860         applyIfPlatform = {"64-bit", "true"},
 861         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
         // Unit-stride byte loop starting at 1: b[i] = a[i] & mask for i in [1, RANGE);
         // returns { a, b }. The single IR rule above has no AlignVector condition, so
         // vector nodes are expected in both AlignVector modes.
 862     static Object[] test11bB(byte[] a, byte[] b, byte mask) {
 863         for (int i = 1; i < RANGE; i++) {
 864             // always alignable
 865             b[i+0] = (byte)(a[i+0] & mask);
 866         }
 867         return new Object[]{ a, b };
 868     }
 869 
 870     @Test
 871     @IR(counts = {IRNode.LOAD_VECTOR_S, "> 0",
 872                   IRNode.AND_VS, "> 0",
 873                   IRNode.STORE_VECTOR, "> 0"},
 874         applyIfPlatform = {"64-bit", "true"},
 875         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
         // Unit-stride short loop starting at 1: b[i] = a[i] & mask for i in [1, RANGE);
         // returns { a, b }. The single IR rule above has no AlignVector condition, so
         // vector nodes are expected in both AlignVector modes.
 876     static Object[] test11bS(short[] a, short[] b, short mask) {
 877         for (int i = 1; i < RANGE; i++) {
 878             // always alignable
 879             b[i+0] = (short)(a[i+0] & mask);
 880         }
 881         return new Object[]{ a, b };
 882     }
 883 
 884     @Test
 885     @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0",
 886                   IRNode.AND_VI, "> 0",
 887                   IRNode.STORE_VECTOR, "> 0"},
 888         applyIfPlatform = {"64-bit", "true"},
 889         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
         // Unit-stride int loop starting at 1: b[i] = a[i] & mask for i in [1, RANGE);
         // returns { a, b }. The single IR rule above has no AlignVector condition, so
         // vector nodes are expected in both AlignVector modes.
 890     static Object[] test11bI(int[] a, int[] b, int mask) {
 891         for (int i = 1; i < RANGE; i++) {
 892             // always alignable
 893             b[i+0] = (int)(a[i+0] & mask);
 894         }
 895         return new Object[]{ a, b };
 896     }
 897 
 898     @Test
 899     @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0",
 900                   IRNode.AND_VL, "> 0",
 901                   IRNode.STORE_VECTOR, "> 0"},
 902         applyIfPlatform = {"64-bit", "true"},
 903         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
         // Unit-stride long loop starting at 1: b[i] = a[i] & mask for i in [1, RANGE);
         // returns { a, b }. The single IR rule above has no AlignVector condition, so
         // vector nodes are expected in both AlignVector modes.
 904     static Object[] test11bL(long[] a, long[] b, long mask) {
 905         for (int i = 1; i < RANGE; i++) {
 906             // always alignable
 907             b[i+0] = (long)(a[i+0] & mask);
 908         }
 909         return new Object[]{ a, b };
 910     }
 911 
 912     @Test
 913     @IR(counts = {IRNode.LOAD_VECTOR_B, "> 0",
 914                   IRNode.AND_VB, "> 0",
 915                   IRNode.STORE_VECTOR, "> 0"},
 916         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
 917         applyIfPlatform = {"64-bit", "true"},
 918         applyIf = {"AlignVector", "false"})
 919     @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
 920                   IRNode.AND_VB, "= 0",
 921                   IRNode.STORE_VECTOR, "= 0"},
 922         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
 923         applyIfPlatform = {"64-bit", "true"},
 924         applyIf = {"AlignVector", "true"})
         // Byte loop where the load index is one element ahead of the store index:
         // b[i] = a[i+1] & mask for i in [1, RANGE-1); returns { a, b }.
         // IR rules above: vector nodes are expected with AlignVector=false and none with
         // AlignVector=true.
 925     static Object[] test11cB(byte[] a, byte[] b, byte mask) {
 926         for (int i = 1; i < RANGE-1; i++) {
 927             // 1 byte offset -> not alignable with AlignVector
 928             b[i+0] = (byte)(a[i+1] & mask);
 929         }
 930         return new Object[]{ a, b };
 931     }
 932 
 933     @Test
 934     @IR(counts = {IRNode.LOAD_VECTOR_S, "> 0",
 935                   IRNode.AND_VS, "> 0",
 936                   IRNode.STORE_VECTOR, "> 0"},
 937         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
 938         applyIfPlatform = {"64-bit", "true"},
 939         applyIf = {"AlignVector", "false"})
 940     @IR(counts = {IRNode.LOAD_VECTOR_S, "= 0",
 941                   IRNode.AND_VS, "= 0",
 942                   IRNode.STORE_VECTOR, "= 0"},
 943         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
 944         applyIfPlatform = {"64-bit", "true"},
 945         applyIf = {"AlignVector", "true"})
         // Short loop where the load index is one element ahead of the store index:
         // b[i] = a[i+1] & mask for i in [1, RANGE-1); returns { a, b }.
         // IR rules above: vector nodes are expected with AlignVector=false and none with
         // AlignVector=true.
 946     static Object[] test11cS(short[] a, short[] b, short mask) {
 947         for (int i = 1; i < RANGE-1; i++) {
 948             // 2 byte offset -> not alignable with AlignVector
 949             b[i+0] = (short)(a[i+1] & mask);
 950         }
 951         return new Object[]{ a, b };
 952     }
 953 
 954     @Test
 955     @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0",
 956                   IRNode.AND_VI, "> 0",
 957                   IRNode.STORE_VECTOR, "> 0"},
 958         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
 959         applyIfPlatform = {"64-bit", "true"},
 960         applyIf = {"AlignVector", "false"})
 961     @IR(counts = {IRNode.LOAD_VECTOR_I, "= 0",
 962                   IRNode.AND_VI, "= 0",
 963                   IRNode.STORE_VECTOR, "= 0"},
 964         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
 965         applyIfPlatform = {"64-bit", "true"},
 966         applyIf = {"AlignVector", "true"})
         // Int loop where the load index is one element ahead of the store index:
         // b[i] = a[i+1] & mask for i in [1, RANGE-1); returns { a, b }.
         // IR rules above: vector nodes are expected with AlignVector=false and none with
         // AlignVector=true.
 967     static Object[] test11cI(int[] a, int[] b, int mask) {
 968         for (int i = 1; i < RANGE-1; i++) {
 969             // 4 byte offset -> not alignable with AlignVector
 970             b[i+0] = (int)(a[i+1] & mask);
 971         }
 972         return new Object[]{ a, b };
 973     }
 974 
 975     @Test
 976     @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0",
 977                   IRNode.AND_VL, "> 0",
 978                   IRNode.STORE_VECTOR, "> 0"},
 979         applyIfPlatform = {"64-bit", "true"},
 980         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
         // Long loop where the load index is one element ahead of the store index:
         // b[i] = a[i+1] & mask for i in [1, RANGE-1); returns { a, b }.
         // Unlike the byte/short/int variants, the single IR rule above has no AlignVector
         // condition: a one-element shift of a long is 8 bytes, so vectors are expected in
         // both modes.
 981     static Object[] test11cL(long[] a, long[] b, long mask) {
 982         for (int i = 1; i < RANGE-1; i++) {
 983             // always alignable (8 byte offset)
 984             b[i+0] = (long)(a[i+1] & mask);
 985         }
 986         return new Object[]{ a, b };
 987     }
 988 
 989     @Test
 990     @IR(counts = {IRNode.LOAD_VECTOR_B, "> 0",
 991                   IRNode.AND_VB, "> 0",
 992                   IRNode.STORE_VECTOR, "> 0"},
 993         applyIfPlatform = {"64-bit", "true"},
 994         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
         // Unit-stride byte loop with a loop-invariant index offset:
         // b[i+invar] = a[i+invar] & mask for i in [0, RANGE); returns { a, b }.
         // The single IR rule above has no AlignVector condition, so vector nodes are
         // expected in both AlignVector modes.
         // NOTE(review): index i+invar must stay within both arrays; the valid range of
         // invar is determined by the caller -- verify there.
 995     static Object[] test11dB(byte[] a, byte[] b, byte mask, int invar) {
 996         for (int i = 0; i < RANGE; i++) {
 997             b[i+0+invar] = (byte)(a[i+0+invar] & mask);
 998         }
 999         return new Object[]{ a, b };
1000     }
1001 
1002     @Test
1003     @IR(counts = {IRNode.LOAD_VECTOR_S, "> 0",
1004                   IRNode.AND_VS, "> 0",
1005                   IRNode.STORE_VECTOR, "> 0"},
1006         applyIfPlatform = {"64-bit", "true"},
1007         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
         // Unit-stride short loop with a loop-invariant index offset:
         // b[i+invar] = a[i+invar] & mask for i in [0, RANGE); returns { a, b }.
         // The single IR rule above has no AlignVector condition, so vector nodes are
         // expected in both AlignVector modes.
         // NOTE(review): index i+invar must stay within both arrays; the valid range of
         // invar is determined by the caller -- verify there.
1008     static Object[] test11dS(short[] a, short[] b, short mask, int invar) {
1009         for (int i = 0; i < RANGE; i++) {
1010             b[i+0+invar] = (short)(a[i+0+invar] & mask);
1011         }
1012         return new Object[]{ a, b };
1013     }
1014 
1015     @Test
1016     @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0",
1017                   IRNode.AND_VI, "> 0",
1018                   IRNode.STORE_VECTOR, "> 0"},
1019         applyIfPlatform = {"64-bit", "true"},
1020         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
         // Unit-stride int loop with a loop-invariant index offset:
         // b[i+invar] = a[i+invar] & mask for i in [0, RANGE); returns { a, b }.
         // The single IR rule above has no AlignVector condition, so vector nodes are
         // expected in both AlignVector modes.
         // NOTE(review): index i+invar must stay within both arrays; the valid range of
         // invar is determined by the caller -- verify there.
1021     static Object[] test11dI(int[] a, int[] b, int mask, int invar) {
1022         for (int i = 0; i < RANGE; i++) {
1023             b[i+0+invar] = (int)(a[i+0+invar] & mask);
1024         }
1025         return new Object[]{ a, b };
1026     }
1027 
1028     @Test
1029     @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0",
1030                   IRNode.AND_VL, "> 0",
1031                   IRNode.STORE_VECTOR, "> 0"},
1032         applyIfPlatform = {"64-bit", "true"},
1033         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
         // Unit-stride long loop with a loop-invariant index offset:
         // b[i+invar] = a[i+invar] & mask for i in [0, RANGE); returns { a, b }.
         // The single IR rule above has no AlignVector condition, so vector nodes are
         // expected in both AlignVector modes.
         // NOTE(review): index i+invar must stay within both arrays; the valid range of
         // invar is determined by the caller -- verify there.
1034     static Object[] test11dL(long[] a, long[] b, long mask, int invar) {
1035         for (int i = 0; i < RANGE; i++) {
1036             b[i+0+invar] = (long)(a[i+0+invar] & mask);
1037         }
1038         return new Object[]{ a, b };
1039     }
1040 
1041     @Test
1042     @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
1043                   IRNode.AND_VB, "= 0",
1044                   IRNode.STORE_VECTOR, "= 0"},
1045         applyIfPlatform = {"64-bit", "true"},
1046         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
         // Masks four consecutive byte elements (offsets 0..3) of every 6-element group
         // (index = i*6 + k) from a into b, for i in [0, RANGE/16); returns { a, b }.
         // The single IR rule above expects zero vector nodes and has no AlignVector
         // condition, so it holds in both modes.
1047     static Object[] test12(byte[] a, byte[] b, byte mask) {
1048         for (int i = 0; i < RANGE/16; i++) {
1049             // Currently does not vectorize at all
1050             b[i*6 + 0 ] = (byte)(a[i*6 + 0 ] & mask);
1051             b[i*6 + 1 ] = (byte)(a[i*6 + 1 ] & mask);
1052             b[i*6 + 2 ] = (byte)(a[i*6 + 2 ] & mask);
1053             b[i*6 + 3 ] = (byte)(a[i*6 + 3 ] & mask);
1054         }
1055         return new Object[]{ a, b };
1056     }
1057 
1058     @Test
1059     @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE + "min(max_int, max_long)", "> 0",
1060                   IRNode.LOAD_VECTOR_L, IRNode.VECTOR_SIZE + "min(max_int, max_long)", "> 0",
1061                   IRNode.ADD_VI, IRNode.VECTOR_SIZE + "min(max_int, max_long)", "> 0",
1062                   IRNode.ADD_VL, IRNode.VECTOR_SIZE + "min(max_int, max_long)", "> 0",
1063                   IRNode.STORE_VECTOR, "> 0"},
1064         applyIfPlatform = {"64-bit", "true"},
1065         applyIfCPUFeatureOr = {"avx2", "true"})
1066     // require avx to ensure vectors are larger than what unrolling produces
         // Increments every element of an int array and a long array in the same unit-stride
         // loop starting at 0; returns { a, b }.
         // The IR rule above expects int and long vectors with the same element count
         // (the smaller of the int and long maximums) and has no AlignVector condition.
1067     static Object[] test13aIL(int[] a, long[] b) {
1068         for (int i = 0; i < RANGE; i++) {
1069             a[i]++;
1070             b[i]++;
1071         }
1072         return new Object[]{ a, b };
1073     }
1074 
1075     @Test
1076     @IR(counts = {IRNode.LOAD_VECTOR_B, "> 0",
1077                   IRNode.LOAD_VECTOR_I, "> 0",
1078                   IRNode.ADD_VB, "> 0",
1079                   IRNode.ADD_VI, "> 0",
1080                   IRNode.STORE_VECTOR, "> 0"},
1081         applyIfOr = {"UseCompactObjectHeaders", "false", "AlignVector", "false"},
1082         applyIfPlatform = {"64-bit", "true"},
1083         applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true"})
         // Increments every element of an int array (a) and a byte array (b) in the same
         // unit-stride loop starting at 0; returns { a, b }.
         // The IR rule above applies unless both UseCompactObjectHeaders and AlignVector
         // are true; the comments in the loop explain why that combination cannot align.
1084     static Object[] test13aIB(int[] a, byte[] b) {
1085         for (int i = 0; i < RANGE; i++) {
1086             // adr = base + UNSAFE.ARRAY_INT_BASE_OFFSET  + 4*iter
1087             //              = 16 (or 12 if UseCompactObjectHeaders=true)
1088             a[i]++;
1089             // adr = base + UNSAFE.ARRAY_BYTE_BASE_OFFSET + 1*iter
1090             //              = 16 (or 12 if UseCompactObjectHeaders=true)
1091             b[i]++;
1092             // For AlignVector, all adr must be 8-byte aligned. Let's see for which iteration this can hold:
1093             // If UseCompactObjectHeaders=false:
1094             //   a: 0, 2,  4,  6,  8, ...
1095             //   b: 0, 8, 16, 24, 32, ...
1096             //   -> Ok, aligns every 8th iteration.
1097             // If UseCompactObjectHeaders=true:
1098             //   a: 1,  3,  5,  7,  9, ...
1099             //   b: 4, 12, 20, 28, 36, ...
1100             //   -> we can never align both vectors!
1101         }
1102         return new Object[]{ a, b };
1103     }
1104 
1105     @Test
1106     @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0",
1107                   IRNode.LOAD_VECTOR_S, "> 0",
1108                   IRNode.ADD_VI, "> 0",
1109                   IRNode.ADD_VS, "> 0",
1110                   IRNode.STORE_VECTOR, "> 0"},
1111         applyIfOr = {"UseCompactObjectHeaders", "false", "AlignVector", "false"},
1112         applyIfPlatform = {"64-bit", "true"},
1113         applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true"})
         // Increments every element of an int array (a) and a short array (b) in the same
         // unit-stride loop starting at 0; returns { a, b }.
         // The IR rule above applies unless both UseCompactObjectHeaders and AlignVector
         // are true; the comments in the loop explain why that combination cannot align.
1114     static Object[] test13aIS(int[] a, short[] b) {
1115         for (int i = 0; i < RANGE; i++) {
1116             // adr = base + UNSAFE.ARRAY_INT_BASE_OFFSET + 4*iter
1117             //              = 16 (or 12 if UseCompactObjectHeaders=true)
1118             a[i]++;
1119             // adr = base + UNSAFE.ARRAY_SHORT_BASE_OFFSET + 2*iter
1120             //              = 16 (or 12 if UseCompactObjectHeaders=true)
1121             b[i]++;
1122             // For AlignVector, all adr must be 8-byte aligned. Let's see for which iteration this can hold:
1123             // If UseCompactObjectHeaders=false:
1124             //   a: iter % 2 == 0
1125             //   b: iter % 4 == 0
1126             //   -> Ok, aligns every 4th iteration.
1127             // If UseCompactObjectHeaders=true:
1128             //   a: iter % 2 = 1
1129             //   b: iter % 4 = 2
1130             //   -> we can never align both vectors!
1131         }
1132         return new Object[]{ a, b };
1133     }
1134 
1135     @Test
1136     @IR(counts = {IRNode.LOAD_VECTOR_B, "> 0",
1137                   IRNode.LOAD_VECTOR_S, "> 0",
1138                   IRNode.LOAD_VECTOR_I, "> 0",
1139                   IRNode.LOAD_VECTOR_L, "> 0",
1140                   IRNode.ADD_VB, "> 0",
1141                   IRNode.ADD_VS, "> 0",
1142                   IRNode.ADD_VI, "> 0",
1143                   IRNode.ADD_VL, "> 0",
1144                   IRNode.STORE_VECTOR, "> 0"},
1145         applyIfOr = {"UseCompactObjectHeaders", "false", "AlignVector", "false"},
1146         applyIfPlatform = {"64-bit", "true"},
1147         applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true"})
         // Increments every element of a byte, a short, an int and a long array in the same
         // unit-stride loop starting at 0; returns { a, b, c, d }.
         // The IR rule above applies unless both UseCompactObjectHeaders and AlignVector
         // are true; the comments in the loop explain why that combination cannot align.
1148     static Object[] test13aBSIL(byte[] a, short[] b, int[] c, long[] d) {
1149         for (int i = 0; i < RANGE; i++) {
1150             // adr = base + UNSAFE.ARRAY_BYTE_BASE_OFFSET + 1*iter
1151             //              = 16 (or 12 if UseCompactObjectHeaders=true)
1152             a[i]++;
1153             // adr = base + UNSAFE.ARRAY_SHORT_BASE_OFFSET + 2*iter
1154             //              = 16 (or 12 if UseCompactObjectHeaders=true)
1155             b[i]++;
1156             // adr = base + UNSAFE.ARRAY_INT_BASE_OFFSET + 4*iter
1157             //              = 16 (or 12 if UseCompactObjectHeaders=true)
1158             c[i]++;
1159             // adr = base + UNSAFE.ARRAY_LONG_BASE_OFFSET + 8*iter
1160             //              = 16 (always)
1161             d[i]++;
1162             // If AlignVector and UseCompactObjectHeaders, and we want all adr 8-byte aligned:
1163             //   a: iter % 8 = 4
1164             //   c: iter % 2 = 1
1165             //   -> can never align both vectors!
1166         }
1167         return new Object[]{ a, b, c, d };
1168     }
1169 
1170     @Test
1171     @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE + "min(max_int, max_long)", "> 0",
1172                   IRNode.LOAD_VECTOR_L, IRNode.VECTOR_SIZE + "min(max_int, max_long)", "> 0",
1173                   IRNode.ADD_VI, IRNode.VECTOR_SIZE + "min(max_int, max_long)", "> 0",
1174                   IRNode.ADD_VL, IRNode.VECTOR_SIZE + "min(max_int, max_long)", "> 0",
1175                   IRNode.STORE_VECTOR, "> 0"},
1176         applyIfPlatform = {"64-bit", "true"},
1177         applyIfCPUFeatureOr = {"avx2", "true"})
1178     // require avx to ensure vectors are larger than what unrolling produces
         // Increments the elements of an int array and a long array in the same unit-stride
         // loop, starting at index 1 (element 0 is left untouched); returns { a, b }.
         // The IR rule above expects int and long vectors with the same element count
         // (the smaller of the int and long maximums) and has no AlignVector condition.
1179     static Object[] test13bIL(int[] a, long[] b) {
1180         for (int i = 1; i < RANGE; i++) {
1181             a[i]++;
1182             b[i]++;
1183         }
1184         return new Object[]{ a, b };
1185     }
1186 
1187     @Test
1188     @IR(counts = {IRNode.LOAD_VECTOR_B, "> 0",
1189                   IRNode.LOAD_VECTOR_I, "> 0",
1190                   IRNode.ADD_VB, "> 0",
1191                   IRNode.ADD_VI, "> 0",
1192                   IRNode.STORE_VECTOR, "> 0"},
1193         applyIfOr = {"UseCompactObjectHeaders", "false", "AlignVector", "false"},
1194         applyIfPlatform = {"64-bit", "true"},
1195         applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true"})
         // Increments the elements of an int array (a) and a byte array (b) in the same
         // unit-stride loop, starting at index 1; returns { a, b }.
         // The IR rule above applies unless both UseCompactObjectHeaders and AlignVector
         // are true; the comments in the loop explain why that combination cannot align.
1196     static Object[] test13bIB(int[] a, byte[] b) {
1197         for (int i = 1; i < RANGE; i++) {
1198             // adr = base + UNSAFE.ARRAY_INT_BASE_OFFSET + 4 + 4*iter
1199             //              = 16 (or 12 if UseCompactObjectHeaders=true)
1200             a[i]++;
1201             // adr = base + UNSAFE.ARRAY_BYTE_BASE_OFFSET + 1 + 1*iter
1202             //              = 16 (or 12 if UseCompactObjectHeaders=true)
1203             b[i]++;
1204             // If AlignVector and UseCompactObjectHeaders, and we want all adr 8-byte aligned:
1205             //   a: iter % 2 = 0
1206             //   b: iter % 8 = 3
1207             //   -> can never align both vectors!
1208         }
1209         return new Object[]{ a, b };
1210     }
1211 
1212     @Test
1213     @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0",
1214                   IRNode.LOAD_VECTOR_S, "> 0",
1215                   IRNode.ADD_VI, "> 0",
1216                   IRNode.ADD_VS, "> 0",
1217                   IRNode.STORE_VECTOR, "> 0"},
1218         applyIfOr = {"UseCompactObjectHeaders", "false", "AlignVector", "false"},
1219         applyIfPlatform = {"64-bit", "true"},
1220         applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true"})
         // Increments the elements of an int array (a) and a short array (b) in the same
         // unit-stride loop, starting at index 1; returns { a, b }.
         // The IR rule above applies unless both UseCompactObjectHeaders and AlignVector
         // are true; the comments in the loop explain why that combination cannot align.
1221     static Object[] test13bIS(int[] a, short[] b) {
1222         for (int i = 1; i < RANGE; i++) {
1223             // adr = base + UNSAFE.ARRAY_INT_BASE_OFFSET + 4 + 4*iter
1224             //              = 16 (or 12 if UseCompactObjectHeaders=true)
1225             a[i]++;
1226             // adr = base + UNSAFE.ARRAY_SHORT_BASE_OFFSET + 2 + 2*iter
1227             //              = 16 (or 12 if UseCompactObjectHeaders=true)
1228             b[i]++;
1229             // If AlignVector and UseCompactObjectHeaders, and we want all adr 8-byte aligned:
1230             //   a: iter % 2 = 0
1231             //   b: iter % 4 = 1
1232             //   -> can never align both vectors!
1233         }
1234         return new Object[]{ a, b };
1235     }
1236 
1237     @Test
1238     @IR(counts = {IRNode.LOAD_VECTOR_B, "> 0",
1239                   IRNode.LOAD_VECTOR_S, "> 0",
1240                   IRNode.LOAD_VECTOR_I, "> 0",
1241                   IRNode.LOAD_VECTOR_L, "> 0",
1242                   IRNode.ADD_VB, "> 0",
1243                   IRNode.ADD_VS, "> 0",
1244                   IRNode.ADD_VI, "> 0",
1245                   IRNode.ADD_VL, "> 0",
1246                   IRNode.STORE_VECTOR, "> 0"},
1247         applyIfOr = {"UseCompactObjectHeaders", "false", "AlignVector", "false"},
1248         applyIfPlatform = {"64-bit", "true"},
1249         applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true"})
         // Increments the elements of a byte, a short, an int and a long array in the same
         // unit-stride loop, starting at index 1; returns { a, b, c, d }.
         // The IR rule above applies unless both UseCompactObjectHeaders and AlignVector
         // are true; the comments in the loop explain why that combination cannot align.
1250     static Object[] test13bBSIL(byte[] a, short[] b, int[] c, long[] d) {
1251         for (int i = 1; i < RANGE; i++) {
1252             // adr = base + UNSAFE.ARRAY_BYTE_BASE_OFFSET + 1 + 1*iter
1253             //              = 16 (or 12 if UseCompactObjectHeaders=true)
1254             a[i]++;
1255             // adr = base + UNSAFE.ARRAY_SHORT_BASE_OFFSET + 2 + 2*iter
1256             //              = 16 (or 12 if UseCompactObjectHeaders=true)
1257             b[i]++;
1258             // adr = base + UNSAFE.ARRAY_INT_BASE_OFFSET + 4 + 4*iter
1259             //              = 16 (or 12 if UseCompactObjectHeaders=true)
1260             c[i]++;
1261             // adr = base + UNSAFE.ARRAY_LONG_BASE_OFFSET + 8 + 8*iter
1262             //              = 16 (always)
1263             d[i]++;
1264             // If AlignVector and UseCompactObjectHeaders, and we want all adr 8-byte aligned:
1265             //   a: iter % 8 = 3
1266             //   c: iter % 2 = 0
1267             //   -> can never align both vectors!
1268         }
1269         return new Object[]{ a, b, c, d };
1270     }
1271 
1272     @Test
1273     @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
1274                   IRNode.ADD_VB, "= 0",
1275                   IRNode.STORE_VECTOR, "= 0"},
1276         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
1277         applyIfPlatform = {"64-bit", "true"},
1278         applyIf = {"AlignVector", "false"})
1279     @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
1280                   IRNode.ADD_VB, "= 0",
1281                   IRNode.STORE_VECTOR, "= 0"},
1282         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
1283         applyIfPlatform = {"64-bit", "true"},
1284         applyIf = {"AlignVector", "true"})
         // Increments 16 consecutive byte elements (offsets 0..15) per iteration while i
         // advances by only 9, so consecutive iterations touch overlapping elements;
         // returns { a }.
         // Both IR rules above expect zero vector nodes, i.e. no vectorization with either
         // AlignVector setting.
1285     static Object[] test14aB(byte[] a) {
1286         // non-power-of-2 stride
1287         for (int i = 0; i < RANGE-20; i+=9) {
1288             // Since the stride is shorter than the vector length, there will be always
1289             // partial overlap of loads with previous stores, this leads to failure in
1290             // store-to-load-forwarding -> vectorization not profitable.
1291             a[i+0]++;
1292             a[i+1]++;
1293             a[i+2]++;
1294             a[i+3]++;
1295             a[i+4]++;
1296             a[i+5]++;
1297             a[i+6]++;
1298             a[i+7]++;
1299             a[i+8]++;
1300             a[i+9]++;
1301             a[i+10]++;
1302             a[i+11]++;
1303             a[i+12]++;
1304             a[i+13]++;
1305             a[i+14]++;
1306             a[i+15]++;
1307         }
1308         return new Object[]{ a };
1309     }
1310 
1311     @Test
1312     @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
1313                   IRNode.ADD_VB, "= 0",
1314                   IRNode.STORE_VECTOR, "= 0"},
1315         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
1316         applyIfPlatform = {"64-bit", "true"},
1317         applyIf = {"AlignVector", "false"})
1318     @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
1319                   IRNode.ADD_VB, "= 0",
1320                   IRNode.STORE_VECTOR, "= 0"},
1321         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
1322         applyIfPlatform = {"64-bit", "true"},
1323         applyIf = {"AlignVector", "true"})
         // Increments 16 consecutive byte elements (offsets 0..15) per iteration while i
         // advances by only 3, so consecutive iterations touch overlapping elements;
         // returns { a }.
         // Both IR rules above expect zero vector nodes, i.e. no vectorization with either
         // AlignVector setting.
1324     static Object[] test14bB(byte[] a) {
1325         // non-power-of-2 stride
1326         for (int i = 0; i < RANGE-20; i+=3) {
1327             // Since the stride is shorter than the vector length, there will be always
1328             // partial overlap of loads with previous stores, this leads to failure in
1329             // store-to-load-forwarding -> vectorization not profitable.
1330             a[i+0]++;
1331             a[i+1]++;
1332             a[i+2]++;
1333             a[i+3]++;
1334             a[i+4]++;
1335             a[i+5]++;
1336             a[i+6]++;
1337             a[i+7]++;
1338             a[i+8]++;
1339             a[i+9]++;
1340             a[i+10]++;
1341             a[i+11]++;
1342             a[i+12]++;
1343             a[i+13]++;
1344             a[i+14]++;
1345             a[i+15]++;
1346         }
1347         return new Object[]{ a };
1348     }
1349 
1350     @Test
         // Byte increments with a non-power-of-2 stride of 5: every iteration bumps the 16
         // consecutive bytes a[i+0..i+15], so it re-touches 11 bytes of the previous iteration.
         // Both rules below (AlignVector off and on) require that no byte vector load/add
         // and no vector store node is present; they are limited to 64-bit platforms with
         // sse4.1 or asimd.
1351     @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
1352                   IRNode.ADD_VB, "= 0",
1353                   IRNode.STORE_VECTOR, "= 0"},
1354         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
1355         applyIfPlatform = {"64-bit", "true"},
1356         applyIf = {"AlignVector", "false"})
1357     @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
1358                   IRNode.ADD_VB, "= 0",
1359                   IRNode.STORE_VECTOR, "= 0"},
1360         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
1361         applyIfPlatform = {"64-bit", "true"},
1362         applyIf = {"AlignVector", "true"})
1363     static Object[] test14cB(byte[] a) {
1364         // non-power-of-2 stride
             // The bound RANGE-20 leaves headroom for the largest offset (+15);
             // assumes a.length >= RANGE, which is not visible in this part of the file.
1365         for (int i = 0; i < RANGE-20; i+=5) {
1366             // Since the stride is shorter than the vector length, there will be always
1367             // partial overlap of loads with previous stores, this leads to failure in
1368             // store-to-load-forwarding -> vectorization not profitable.
1369             a[i+0]++;
1370             a[i+1]++;
1371             a[i+2]++;
1372             a[i+3]++;
1373             a[i+4]++;
1374             a[i+5]++;
1375             a[i+6]++;
1376             a[i+7]++;
1377             a[i+8]++;
1378             a[i+9]++;
1379             a[i+10]++;
1380             a[i+11]++;
1381             a[i+12]++;
1382             a[i+13]++;
1383             a[i+14]++;
1384             a[i+15]++;
1385         }
             // Hand the mutated array back wrapped in an Object[].
1386         return new Object[]{ a };
1387     }
1388 
1389     @Test
         // Byte increments with a non-power-of-2 stride of 9 and only 8 accesses per
         // iteration (a[i+0..i+7]): iterations do not overlap, one byte (i+8) is skipped
         // between consecutive groups.
         // Rule 1 (AlignVector off): expects byte vector load/add nodes of size
         // min(max_byte, 8) and at least one vector store.
         // Rule 2 (AlignVector on): expects no such vector nodes at all.
         // NOTE(review): presumably because a 9-byte stride cannot keep every iteration's
         // address aligned to the vector width -- confirm against the SuperWord alignment logic.
1390     @IR(counts = {IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE + "min(max_byte, 8)", "> 0",
1391                   IRNode.ADD_VB,        IRNode.VECTOR_SIZE + "min(max_byte, 8)", "> 0",
1392                   IRNode.STORE_VECTOR,                                           "> 0"},
1393         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
1394         applyIfPlatform = {"64-bit", "true"},
1395         applyIf = {"AlignVector", "false"})
1396     @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
1397                   IRNode.ADD_VB, "= 0",
1398                   IRNode.STORE_VECTOR, "= 0"},
1399         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
1400         applyIfPlatform = {"64-bit", "true"},
1401         applyIf = {"AlignVector", "true"})
1402     static Object[] test14dB(byte[] a) {
1403         // non-power-of-2 stride
             // The bound RANGE-20 leaves headroom for the largest offset (+7);
             // assumes a.length >= RANGE, which is not visible in this part of the file.
1404         for (int i = 0; i < RANGE-20; i+=9) {
1405             a[i+0]++;
1406             a[i+1]++;
1407             a[i+2]++;
1408             a[i+3]++;
1409             a[i+4]++;
1410             a[i+5]++;
1411             a[i+6]++;
1412             a[i+7]++;
1413         }
             // Hand the mutated array back wrapped in an Object[].
1414         return new Object[]{ a };
1415     }
1416 
1417     @Test
         // Byte increments with a non-power-of-2 stride of 11 and 8 accesses per iteration
         // (a[i+0..i+7]): iterations do not overlap, three bytes (i+8..i+10) are skipped
         // between consecutive groups.
         // Rule 1 (AlignVector off): expects byte vector load/add nodes of size
         // min(max_byte, 8) and at least one vector store.
         // Rule 2 (AlignVector on): expects no such vector nodes at all.
         // NOTE(review): presumably because an 11-byte stride cannot keep every iteration's
         // address aligned to the vector width -- confirm against the SuperWord alignment logic.
1418     @IR(counts = {IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE + "min(max_byte, 8)", "> 0",
1419                   IRNode.ADD_VB,        IRNode.VECTOR_SIZE + "min(max_byte, 8)", "> 0",
1420                   IRNode.STORE_VECTOR,                                           "> 0"},
1421         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
1422         applyIfPlatform = {"64-bit", "true"},
1423         applyIf = {"AlignVector", "false"})
1424     @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
1425                   IRNode.ADD_VB, "= 0",
1426                   IRNode.STORE_VECTOR, "= 0"},
1427         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
1428         applyIfPlatform = {"64-bit", "true"},
1429         applyIf = {"AlignVector", "true"})
1430     static Object[] test14eB(byte[] a) {
1431         // non-power-of-2 stride
             // The bound RANGE-32 leaves headroom for the largest offset (+7);
             // assumes a.length >= RANGE, which is not visible in this part of the file.
1432         for (int i = 0; i < RANGE-32; i+=11) {
1433             a[i+0]++;
1434             a[i+1]++;
1435             a[i+2]++;
1436             a[i+3]++;
1437             a[i+4]++;
1438             a[i+5]++;
1439             a[i+6]++;
1440             a[i+7]++;
1441         }
             // Hand the mutated array back wrapped in an Object[].
1442         return new Object[]{ a };
1443     }
1444 
1445     @Test
         // Byte increments with a non-power-of-2 stride of 12 and 8 accesses per iteration
         // (a[i+0..i+7]): iterations do not overlap, four bytes (i+8..i+11) are skipped
         // between consecutive groups.
         // Rule 1 (AlignVector off): expects byte vector load/add nodes of size
         // min(max_byte, 8) and at least one vector store.
         // Rule 2 (AlignVector on): expects no such vector nodes at all.
         // NOTE(review): presumably because a 12-byte stride is not a multiple of the 8-byte
         // pack, so alignment cannot hold in every iteration -- confirm against SuperWord.
1446     @IR(counts = {IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE + "min(max_byte, 8)", "> 0",
1447                   IRNode.ADD_VB,        IRNode.VECTOR_SIZE + "min(max_byte, 8)", "> 0",
1448                   IRNode.STORE_VECTOR,                                           "> 0"},
1449         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
1450         applyIfPlatform = {"64-bit", "true"},
1451         applyIf = {"AlignVector", "false"})
1452     @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
1453                   IRNode.ADD_VB, "= 0",
1454                   IRNode.STORE_VECTOR, "= 0"},
1455         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
1456         applyIfPlatform = {"64-bit", "true"},
1457         applyIf = {"AlignVector", "true"})
1458     static Object[] test14fB(byte[] a) {
1459         // non-power-of-2 stride
             // The bound RANGE-40 leaves headroom for the largest offset (+7);
             // assumes a.length >= RANGE, which is not visible in this part of the file.
1460         for (int i = 0; i < RANGE-40; i+=12) {
1461             a[i+0]++;
1462             a[i+1]++;
1463             a[i+2]++;
1464             a[i+3]++;
1465             a[i+4]++;
1466             a[i+5]++;
1467             a[i+6]++;
1468             a[i+7]++;
1469         }
             // Hand the mutated array back wrapped in an Object[].
1470         return new Object[]{ a };
1471     }
1472 
1473     @Test
         // Byte increments with a non-power-of-2 index scale of 53: iteration i bumps the 16
         // consecutive bytes a[53*i+0 .. 53*i+15]. Since 53 > 16, the groups of consecutive
         // iterations never overlap. No @IR rule is attached (see the note below).
1474     // IR rules difficult because of modulo wrapping with offset after peeling.
1475     static Object[] test15aB(byte[] a) {
1476         // non-power-of-2 scale
             // The trip count RANGE/64-20 keeps 53*i+15 below RANGE (53 < 64).
1477         for (int i = 0; i < RANGE/64-20; i++) {
1478             a[53*i+0]++;
1479             a[53*i+1]++;
1480             a[53*i+2]++;
1481             a[53*i+3]++;
1482             a[53*i+4]++;
1483             a[53*i+5]++;
1484             a[53*i+6]++;
1485             a[53*i+7]++;
1486             a[53*i+8]++;
1487             a[53*i+9]++;
1488             a[53*i+10]++;
1489             a[53*i+11]++;
1490             a[53*i+12]++;
1491             a[53*i+13]++;
1492             a[53*i+14]++;
1493             a[53*i+15]++;
1494         }
             // Hand the mutated array back wrapped in an Object[].
1495         return new Object[]{ a };
1496     }
1497 
1498     @Test
         // Byte increments with a non-power-of-2 index scale of 25: iteration i bumps the 16
         // consecutive bytes a[25*i+0 .. 25*i+15]. Since 25 > 16, the groups of consecutive
         // iterations never overlap. No @IR rule is attached (see the note below).
1499     // IR rules difficult because of modulo wrapping with offset after peeling.
1500     static Object[] test15bB(byte[] a) {
1501         // non-power-of-2 scale
             // The trip count RANGE/64-20 keeps 25*i+15 below RANGE (25 < 64).
1502         for (int i = 0; i < RANGE/64-20; i++) {
1503             a[25*i+0]++;
1504             a[25*i+1]++;
1505             a[25*i+2]++;
1506             a[25*i+3]++;
1507             a[25*i+4]++;
1508             a[25*i+5]++;
1509             a[25*i+6]++;
1510             a[25*i+7]++;
1511             a[25*i+8]++;
1512             a[25*i+9]++;
1513             a[25*i+10]++;
1514             a[25*i+11]++;
1515             a[25*i+12]++;
1516             a[25*i+13]++;
1517             a[25*i+14]++;
1518             a[25*i+15]++;
1519         }
             // Hand the mutated array back wrapped in an Object[].
1520         return new Object[]{ a };
1521     }
1522 
1523     @Test
         // Byte increments with a non-power-of-2 index scale of 19: iteration i bumps the 16
         // consecutive bytes a[19*i+0 .. 19*i+15]. Since 19 > 16, the groups of consecutive
         // iterations never overlap. No @IR rule is attached (see the note below).
1524     // IR rules difficult because of modulo wrapping with offset after peeling.
1525     static Object[] test15cB(byte[] a) {
1526         // non-power-of-2 scale
             // The trip count RANGE/64-20 keeps 19*i+15 below RANGE (19 < 64).
1527         for (int i = 0; i < RANGE/64-20; i++) {
1528             a[19*i+0]++;
1529             a[19*i+1]++;
1530             a[19*i+2]++;
1531             a[19*i+3]++;
1532             a[19*i+4]++;
1533             a[19*i+5]++;
1534             a[19*i+6]++;
1535             a[19*i+7]++;
1536             a[19*i+8]++;
1537             a[19*i+9]++;
1538             a[19*i+10]++;
1539             a[19*i+11]++;
1540             a[19*i+12]++;
1541             a[19*i+13]++;
1542             a[19*i+14]++;
1543             a[19*i+15]++;
1544         }
             // Hand the mutated array back wrapped in an Object[].
1545         return new Object[]{ a };
1546     }
1547 
1548     @Test
         // Mixed-type loop with index scale 2: per iteration it bumps 15 bytes
         // a[2*i+0 .. 2*i+14] and 4 shorts b[2*i+0 .. 2*i+3], so consecutive iterations
         // overlap heavily in both arrays. No @IR rule is attached to this method.
         // NOTE(review): "infinite loop issues" presumably refers to a past compiler hang on
         // this loop shape -- confirm against the bug history.
1549     static Object[] test16a(byte[] a, short[] b) {
1550         // infinite loop issues
1551         for (int i = 0; i < RANGE/2-20; i++) {
                 // Odd number of byte accesses (15, offsets 0..14).
1552             a[2*i+0]++;
1553             a[2*i+1]++;
1554             a[2*i+2]++;
1555             a[2*i+3]++;
1556             a[2*i+4]++;
1557             a[2*i+5]++;
1558             a[2*i+6]++;
1559             a[2*i+7]++;
1560             a[2*i+8]++;
1561             a[2*i+9]++;
1562             a[2*i+10]++;
1563             a[2*i+11]++;
1564             a[2*i+12]++;
1565             a[2*i+13]++;
1566             a[2*i+14]++;
1567 
                 // Four short accesses using the same index expressions as the bytes above.
1568             b[2*i+0]++;
1569             b[2*i+1]++;
1570             b[2*i+2]++;
1571             b[2*i+3]++;
1572         }
             // Hand both mutated arrays back wrapped in an Object[].
1573         return new Object[]{ a, b };
1574     }
1575 
1576     @Test
         // Byte-only loop with index scale 2: per iteration it bumps the 15 bytes
         // a[2*i+0 .. 2*i+14], so consecutive iterations overlap in 13 bytes.
         // No @IR rule is attached to this method.
         // NOTE(review): "infinite loop issues" presumably refers to a past compiler hang on
         // this loop shape -- confirm against the bug history.
1577     static Object[] test16b(byte[] a) {
1578         // infinite loop issues
1579         for (int i = 0; i < RANGE/2-20; i++) {
                 // Odd number of byte accesses (15, offsets 0..14).
1580             a[2*i+0]++;
1581             a[2*i+1]++;
1582             a[2*i+2]++;
1583             a[2*i+3]++;
1584             a[2*i+4]++;
1585             a[2*i+5]++;
1586             a[2*i+6]++;
1587             a[2*i+7]++;
1588             a[2*i+8]++;
1589             a[2*i+9]++;
1590             a[2*i+10]++;
1591             a[2*i+11]++;
1592             a[2*i+12]++;
1593             a[2*i+13]++;
1594             a[2*i+14]++;
1595         }
             // Hand the mutated array back wrapped in an Object[].
1596         return new Object[]{ a };
1597     }
1598 
1599     @Test
         // Adds 1 to a long read and written through UNSAFE.getLongUnaligned /
         // putLongUnaligned at byte offset ARRAY_LONG_BASE_OFFSET + 8*i, for i in [0, RANGE).
         // The rule expects long vector load/add nodes and a vector store on 64-bit
         // platforms with sse4.1 or asimd; it carries no AlignVector condition.
1600     @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0",
1601                   IRNode.ADD_VL, "> 0",
1602                   IRNode.STORE_VECTOR, "> 0"},
1603         applyIfPlatform = {"64-bit", "true"},
1604         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
1605     static Object[] test17a(long[] a) {
1606         // Unsafe: vectorizes with profiling (not xcomp)
1607         for (int i = 0; i < RANGE; i++) {
                 // 8L forces long arithmetic; presumably adr is the offset of element i
                 // (the base-offset constant is defined outside this chunk).
1608             long adr = UNSAFE.ARRAY_LONG_BASE_OFFSET + 8L * i;
1609             long v = UNSAFE.getLongUnaligned(a, adr);
1610             UNSAFE.putLongUnaligned(a, adr, v + 1);
1611         }
             // Hand the mutated array back wrapped in an Object[].
1612         return new Object[]{ a };
1613     }
1614 
1615     @Test
         // Same Unsafe read-add-write pattern as an element-wise increment, but every
         // address is shifted by one extra byte (+ 1), so each 8-byte access starts one
         // byte past a multiple of 8 relative to the base offset.
         // No @IR rule is attached (see the note below).
1616     // Difficult to write good IR rule. Modulo calculus overflow can create non-power-of-2 packs.
1617     static Object[] test17b(long[] a) {
1618         // Not alignable
             // Stops at RANGE-1, presumably so the shifted last access stays inside an
             // array of RANGE longs -- the array length is not visible here.
1619         for (int i = 0; i < RANGE-1; i++) {
1620             long adr = UNSAFE.ARRAY_LONG_BASE_OFFSET + 8L * i + 1;
1621             long v = UNSAFE.getLongUnaligned(a, adr);
1622             UNSAFE.putLongUnaligned(a, adr, v + 1);
1623         }
             // Hand the mutated array back wrapped in an Object[].
1624         return new Object[]{ a };
1625     }
1626 
1627     @Test
         // For every fourth i, adds 1 to the two longs at byte offsets adr+0 and adr+8
         // (adr = ARRAY_LONG_BASE_OFFSET + 8*i) via unaligned Unsafe accesses; the
         // following 16 bytes of each 32-byte step are left untouched.
         // The rule expects 2-element long vector load/add nodes and a vector store when
         // MaxVectorSize >= 32, on 64-bit platforms with sse4.1 or asimd.
1628     @IR(counts = {IRNode.LOAD_VECTOR_L, IRNode.VECTOR_SIZE_2, "> 0",
1629                   IRNode.ADD_VL,        IRNode.VECTOR_SIZE_2, "> 0",
1630                   IRNode.STORE_VECTOR, "> 0"},
1631         applyIf = {"MaxVectorSize", ">=32"},
1632         applyIfPlatform = {"64-bit", "true"},
1633         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
1634     static Object[] test17c(long[] a) {
1635         // Unsafe: aligned vectorizes
1636         for (int i = 0; i < RANGE-1; i+=4) {
1637             long adr = UNSAFE.ARRAY_LONG_BASE_OFFSET + 8L * i;
                 // Both loads are issued before either store.
1638             long v0 = UNSAFE.getLongUnaligned(a, adr + 0);
1639             long v1 = UNSAFE.getLongUnaligned(a, adr + 8);
1640             UNSAFE.putLongUnaligned(a, adr + 0, v0 + 1);
1641             UNSAFE.putLongUnaligned(a, adr + 8, v1 + 1);
1642         }
             // Hand the mutated array back wrapped in an Object[].
1643         return new Object[]{ a };
1644     }
1645 
1646     @Test
         // For every fourth i, adds 1 to two adjacent 8-byte values at adr+0 and adr+8,
         // where adr = ARRAY_LONG_BASE_OFFSET + 8*i + 1, i.e. shifted by one extra byte.
         // Rule 1 (AlignVector off, MaxVectorSize >= 64, avx512 or asimd): expects
         // 2-element long vector load/add nodes and a vector store.
         // Rule 2 (AlignVector on, sse4.1 or asimd): expects no such vector nodes.
         // Both rules are limited to 64-bit platforms.
1647     @IR(counts = {IRNode.LOAD_VECTOR_L, IRNode.VECTOR_SIZE_2, "> 0",
1648                   IRNode.ADD_VL,        IRNode.VECTOR_SIZE_2, "> 0",
1649                   IRNode.STORE_VECTOR, "> 0"},
1650         applyIfCPUFeatureOr = {"avx512", "true", "asimd", "true"},
1651         applyIfPlatform = {"64-bit", "true"},
1652         applyIfAnd = {"AlignVector", "false", "MaxVectorSize", ">=64"})
1653     // Ensure vector width is large enough to fit 64 byte for longs:
1654     // The offsets are: 25, 33, 57, 65
1655     // In modulo 32:    25,  1, 25,  1  -> does not vectorize
1656     // In modulo 64:    25, 33, 57,  1  -> at least first pair vectorizes
1657     // This problem is because we compute modulo vector width in memory_alignment.
1658     @IR(counts = {IRNode.LOAD_VECTOR_L, "= 0",
1659                   IRNode.ADD_VL, "= 0",
1660                   IRNode.STORE_VECTOR, "= 0"},
1661         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
1662         applyIfPlatform = {"64-bit", "true"},
1663         applyIf = {"AlignVector", "true"})
1664     static Object[] test17d(long[] a) {
1665         // Not alignable
1666         for (int i = 0; i < RANGE-1; i+=4) {
                 // The trailing "+ 1" is the one-byte shift described above.
1667             long adr = UNSAFE.ARRAY_LONG_BASE_OFFSET + 8L * i + 1;
                 // Both loads are issued before either store.
1668             long v0 = UNSAFE.getLongUnaligned(a, adr + 0);
1669             long v1 = UNSAFE.getLongUnaligned(a, adr + 8);
1670             UNSAFE.putLongUnaligned(a, adr + 0, v0 + 1);
1671             UNSAFE.putLongUnaligned(a, adr + 8, v1 + 1);
1672         }
             // Hand the mutated array back wrapped in an Object[].
1673         return new Object[]{ a };
1674     }
1675 
1676     @Test
         // Loop whose byte stores do not depend on the loop variable: each iteration writes
         // 1 to the fixed slots a[0] and a[1], with an int store b[i] = 2 in between.
         // No @IR rule is attached to this method.
1677     static Object[] test18a(byte[] a, int[] b) {
1678         // scale = 0  -->  no iv
1679         for (int i = 0; i < RANGE; i++) {
1680             a[0] = 1;
                 // The only access in the body that is indexed by i.
1681             b[i] = 2;
1682             a[1] = 1;
1683         }
             // Hand both arrays back wrapped in an Object[].
1684         return new Object[]{ a, b };
1685     }
1686 
1687     @Test
         // Loop whose byte stores do not depend on the loop variable: each iteration writes
         // 1 to the fixed slots a[1] and a[2], with an int store b[i] = 2 in between.
         // No @IR rule is attached to this method.
1688     static Object[] test18b(byte[] a, int[] b) {
1689         // scale = 0  -->  no iv
1690         for (int i = 0; i < RANGE; i++) {
1691             a[1] = 1;
                 // The only access in the body that is indexed by i.
1692             b[i] = 2;
1693             a[2] = 1;
1694         }
             // Hand both arrays back wrapped in an Object[].
1695         return new Object[]{ a, b };
1696     }
1697 
1698     @Test
         // Copies 5000 ints from b to a using a down-counting loop variable: i runs from
         // 5000 to 1 while the index RANGE_FINAL - i runs upward from RANGE_FINAL-5000 to
         // RANGE_FINAL-1. No @IR rule is attached to this method.
         // Assumes RANGE_FINAL >= 5000 and both arrays hold at least RANGE_FINAL elements;
         // neither is visible in this part of the file.
1699     static Object[] test19(int[] a, int[] b) {
1700         for (int i = 5000; i > 0; i--) {
1701             a[RANGE_FINAL - i] = b[RANGE_FINAL - i];
1702         }
             // Hand both arrays back wrapped in an Object[].
1703         return new Object[]{ a, b };
1704     }
1705 
1706     @Test
         // For i in [1, RANGE/2-50), bumps the four bytes a[2*i+30 .. 2*i+33]; with index
         // scale 2, consecutive iterations overlap in two bytes. The loop starts at 1 and
         // every index carries a constant offset of 30. No @IR rule is attached.
1707     static Object[] test20(byte[] a) {
1708         // Example where it is easy to pass alignment check,
1709         // but used to fail the alignment calculation
1710         for (int i = 1; i < RANGE/2-50; i++) {
1711             a[2*i+0+30]++;
1712             a[2*i+1+30]++;
1713             a[2*i+2+30]++;
1714             a[2*i+3+30]++;
1715         }
             // Hand the mutated array back wrapped in an Object[].
1716         return new Object[]{ a };
1717     }
1718 }