1 /*
   2  * Copyright (c) 2024, 2025, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  */
  23 
  24 package compiler.loopopts.superword;
  25 
  26 import compiler.lib.ir_framework.*;
  27 import jdk.test.lib.Utils;
  28 import jdk.test.whitebox.WhiteBox;
  29 import jdk.internal.misc.Unsafe;
  30 import java.lang.reflect.Array;
  31 import java.util.Map;
  32 import java.util.HashMap;
  33 import java.util.Random;
  34 import java.nio.ByteOrder;
  35 
  36 /*
  37  * @test id=NoAlignVector
  38  * @bug 8310190
  39  * @summary Test AlignVector with various loop init, stride, scale, invar, etc.
  40  * @modules java.base/jdk.internal.misc
  41  * @library /test/lib /
  42  * @run driver compiler.loopopts.superword.TestAlignVector NoAlignVector
  43  */
  44 
  45 /*
  46  * @test id=AlignVector
  47  * @bug 8310190
  48  * @summary Test AlignVector with various loop init, stride, scale, invar, etc.
  49  * @modules java.base/jdk.internal.misc
  50  * @library /test/lib /
  51  * @run driver compiler.loopopts.superword.TestAlignVector AlignVector
  52  */
  53 
  54 /*
  55  * @test id=VerifyAlignVector
  56  * @bug 8310190
  57  * @summary Test AlignVector with various loop init, stride, scale, invar, etc.
  58  * @modules java.base/jdk.internal.misc
  59  * @library /test/lib /
  60  * @run driver compiler.loopopts.superword.TestAlignVector VerifyAlignVector
  61  */
  62 
  63 /*
  64  * @test id=NoAlignVector-COH
  65  * @bug 8310190
  66  * @summary Test AlignVector with various loop init, stride, scale, invar, etc.
  67  * @modules java.base/jdk.internal.misc
  68  * @library /test/lib /
  69  * @run driver compiler.loopopts.superword.TestAlignVector NoAlignVector-COH
  70  */
  71 
  72 /*
  73  * @test id=VerifyAlignVector-COH
  74  * @bug 8310190
  75  * @summary Test AlignVector with various loop init, stride, scale, invar, etc.
  76  * @modules java.base/jdk.internal.misc
  77  * @library /test/lib /
  78  * @run driver compiler.loopopts.superword.TestAlignVector VerifyAlignVector-COH
  79  */
  80 
  81 public class TestAlignVector {
         // Length of every array created by the generate* methods and the bound used by the
         // loops of the test methods.
  82     static int RANGE = 1024*8;
         // Same value as RANGE. Despite its name the field is not declared final; no use of it
         // is visible in this part of the file.
  83     static int RANGE_FINAL = 1024*8;
         // Not referenced in this part of the file.
  84     private static final Unsafe UNSAFE = Unsafe.getUnsafe();
         // Random source for all generated input values, obtained from the test library's Utils.
  85     private static final Random RANDOM = Utils.getRandomInstance();
  86 
  87     // Inputs: the a*/b* arrays are filled once in the constructor and handed to the tests
         // as clones; the m* values are the AND masks passed to the tests of the matching
         // element type.
  88     byte[] aB;
  89     byte[] bB;
  90     byte mB = (byte)31;
  91     short[] aS;
  92     short[] bS;
  93     short mS = (short)0xF0F0;
  94     int[] aI;
  95     int[] bI;
  96     int mI = 0xF0F0F0F0;
  97     long[] aL;
  98     long[] bL;
  99     long mL = 0xF0F0F0F0F0F0F0F0L;
 100 
 101     // Registered test invocations, keyed by name. Each value calls one test method on
         // clones of the inputs. Keys must be unique: putting an existing key replaces the
         // earlier invocation.
 102     Map<String,TestFunction> tests = new HashMap<String,TestFunction>();
 103 
 104     // Gold results, keyed like 'tests': the output of the first run of each invocation,
         // recorded in the constructor before compilation.
 105     Map<String,Object[]> golds = new HashMap<String,Object[]>();
 106 
 107     interface TestFunction {
 108         Object[] run();
 109     }
 110 
 111     public static void main(String[] args) {
 112         TestFramework framework = new TestFramework(TestAlignVector.class);
 113         framework.addFlags("--add-modules", "java.base", "--add-exports", "java.base/jdk.internal.misc=ALL-UNNAMED",
 114                            "-XX:+IgnoreUnrecognizedVMOptions", "-XX:LoopUnrollLimit=250");
 115 
 116         switch (args[0]) {
 117             case "NoAlignVector"         -> { framework.addFlags("-XX:+UnlockExperimentalVMOptions", "-XX:-UseCompactObjectHeaders", "-XX:-AlignVector"); }
 118             case "AlignVector"           -> { framework.addFlags("-XX:+UnlockExperimentalVMOptions", "-XX:-UseCompactObjectHeaders", "-XX:+AlignVector"); }
 119             case "VerifyAlignVector"     -> { framework.addFlags("-XX:+UnlockExperimentalVMOptions", "-XX:-UseCompactObjectHeaders", "-XX:+AlignVector", "-XX:+IgnoreUnrecognizedVMOptions", "-XX:+VerifyAlignVector"); }
 120             case "NoAlignVector-COH"     -> { framework.addFlags("-XX:+UnlockExperimentalVMOptions", "-XX:+UseCompactObjectHeaders", "-XX:-AlignVector"); }
 121             case "VerifyAlignVector-COH" -> { framework.addFlags("-XX:+UnlockExperimentalVMOptions", "-XX:+UseCompactObjectHeaders", "-XX:+AlignVector", "-XX:+IgnoreUnrecognizedVMOptions", "-XX:+VerifyAlignVector"); }
 122             default -> { throw new RuntimeException("Test argument not recognized: " + args[0]); }
 123         }
 124         framework.start();
 125     }
 126 
 127     public TestAlignVector() {
 128         // Generate input once
 129         aB = generateB();
 130         bB = generateB();
 131         aS = generateS();
 132         bS = generateS();
 133         aI = generateI();
 134         bI = generateI();
 135         aL = generateL();
 136         bL = generateL();
 137 
 138         // Add all tests to list
 139         tests.put("test0",       () -> { return test0(aB.clone(), bB.clone(), mB); });
 140         tests.put("test1",       () -> { return test1(aB.clone(), bB.clone(), mB); });
 141         tests.put("test2",       () -> { return test2(aB.clone(), bB.clone(), mB); });
 142         tests.put("test3",       () -> { return test3(aB.clone(), bB.clone(), mB); });
 143         tests.put("test4",       () -> { return test4(aB.clone(), bB.clone(), mB); });
 144         tests.put("test5",       () -> { return test5(aB.clone(), bB.clone(), mB, 0); });
 145         tests.put("test6",       () -> { return test6(aB.clone(), bB.clone(), mB); });
 146         tests.put("test7",       () -> { return test7(aS.clone(), bS.clone(), mS); });
 147         tests.put("test8",       () -> { return test8(aB.clone(), bB.clone(), mB, 0); });
 148         tests.put("test8",       () -> { return test8(aB.clone(), bB.clone(), mB, 1); });
 149         tests.put("test9",       () -> { return test9(aB.clone(), bB.clone(), mB); });
 150 
 151         tests.put("test10a",     () -> { return test10a(aB.clone(), bB.clone(), mB); });
 152         tests.put("test10b",     () -> { return test10b(aB.clone(), bB.clone(), mB); });
 153         tests.put("test10c",     () -> { return test10c(aS.clone(), bS.clone(), mS); });
 154         tests.put("test10d",     () -> { return test10d(aS.clone(), bS.clone(), mS); });
 155 
 156         tests.put("test11aB",    () -> { return test11aB(aB.clone(), bB.clone(), mB); });
 157         tests.put("test11aS",    () -> { return test11aS(aS.clone(), bS.clone(), mS); });
 158         tests.put("test11aI",    () -> { return test11aI(aI.clone(), bI.clone(), mI); });
 159         tests.put("test11aL",    () -> { return test11aL(aL.clone(), bL.clone(), mL); });
 160 
 161         tests.put("test11bB",    () -> { return test11bB(aB.clone(), bB.clone(), mB); });
 162         tests.put("test11bS",    () -> { return test11bS(aS.clone(), bS.clone(), mS); });
 163         tests.put("test11bI",    () -> { return test11bI(aI.clone(), bI.clone(), mI); });
 164         tests.put("test11bL",    () -> { return test11bL(aL.clone(), bL.clone(), mL); });
 165 
 166         tests.put("test11cB",    () -> { return test11cB(aB.clone(), bB.clone(), mB); });
 167         tests.put("test11cS",    () -> { return test11cS(aS.clone(), bS.clone(), mS); });
 168         tests.put("test11cI",    () -> { return test11cI(aI.clone(), bI.clone(), mI); });
 169         tests.put("test11cL",    () -> { return test11cL(aL.clone(), bL.clone(), mL); });
 170 
 171         tests.put("test11dB",    () -> { return test11dB(aB.clone(), bB.clone(), mB, 0); });
 172         tests.put("test11dS",    () -> { return test11dS(aS.clone(), bS.clone(), mS, 0); });
 173         tests.put("test11dI",    () -> { return test11dI(aI.clone(), bI.clone(), mI, 0); });
 174         tests.put("test11dL",    () -> { return test11dL(aL.clone(), bL.clone(), mL, 0); });
 175 
 176         tests.put("test12",      () -> { return test12(aB.clone(), bB.clone(), mB); });
 177 
 178         tests.put("test13aIL",   () -> { return test13aIL(aI.clone(), aL.clone()); });
 179         tests.put("test13aIB",   () -> { return test13aIB(aI.clone(), aB.clone()); });
 180         tests.put("test13aIS",   () -> { return test13aIS(aI.clone(), aS.clone()); });
 181         tests.put("test13aBSIL", () -> { return test13aBSIL(aB.clone(), aS.clone(), aI.clone(), aL.clone()); });
 182 
 183         tests.put("test13bIL",   () -> { return test13bIL(aI.clone(), aL.clone()); });
 184         tests.put("test13bIB",   () -> { return test13bIB(aI.clone(), aB.clone()); });
 185         tests.put("test13bIS",   () -> { return test13bIS(aI.clone(), aS.clone()); });
 186         tests.put("test13bBSIL", () -> { return test13bBSIL(aB.clone(), aS.clone(), aI.clone(), aL.clone()); });
 187 
 188         tests.put("test14aB",    () -> { return test14aB(aB.clone()); });
 189         tests.put("test14bB",    () -> { return test14bB(aB.clone()); });
 190         tests.put("test14cB",    () -> { return test14cB(aB.clone()); });
 191         tests.put("test14dB",    () -> { return test14dB(aB.clone()); });
 192         tests.put("test14eB",    () -> { return test14eB(aB.clone()); });
 193         tests.put("test14fB",    () -> { return test14fB(aB.clone()); });
 194 
 195         tests.put("test15aB",    () -> { return test15aB(aB.clone()); });
 196         tests.put("test15bB",    () -> { return test15bB(aB.clone()); });
 197         tests.put("test15cB",    () -> { return test15cB(aB.clone()); });
 198 
 199         tests.put("test16a",     () -> { return test16a(aB.clone(), aS.clone()); });
 200         tests.put("test16b",     () -> { return test16b(aB.clone()); });
 201 
 202         tests.put("test17a",     () -> { return test17a(aL.clone()); });
 203         tests.put("test17b",     () -> { return test17b(aL.clone()); });
 204         tests.put("test17c",     () -> { return test17c(aL.clone()); });
 205         tests.put("test17d",     () -> { return test17d(aL.clone()); });
 206 
 207         tests.put("test18a",     () -> { return test18a(aB.clone(), aI.clone()); });
 208         tests.put("test18b",     () -> { return test18b(aB.clone(), aI.clone()); });
 209 
 210         tests.put("test19",      () -> { return test19(aI.clone(), bI.clone()); });
 211         tests.put("test20",      () -> { return test20(aB.clone()); });
 212 
 213         // Compute gold value for all test methods before compilation
 214         for (Map.Entry<String,TestFunction> entry : tests.entrySet()) {
 215             String name = entry.getKey();
 216             TestFunction test = entry.getValue();
 217             Object[] gold = test.run();
 218             golds.put(name, gold);
 219         }
 220     }
 221 
 222     @Warmup(100)
 223     @Run(test = {"test0",
 224                  "test1",
 225                  "test2",
 226                  "test3",
 227                  "test4",
 228                  "test5",
 229                  "test6",
 230                  "test7",
 231                  "test8",
 232                  "test9",
 233                  "test10a",
 234                  "test10b",
 235                  "test10c",
 236                  "test10d",
 237                  "test11aB",
 238                  "test11aS",
 239                  "test11aI",
 240                  "test11aL",
 241                  "test11bB",
 242                  "test11bS",
 243                  "test11bI",
 244                  "test11bL",
 245                  "test11cB",
 246                  "test11cS",
 247                  "test11cI",
 248                  "test11cL",
 249                  "test11dB",
 250                  "test11dS",
 251                  "test11dI",
 252                  "test11dL",
 253                  "test12",
 254                  "test13aIL",
 255                  "test13aIB",
 256                  "test13aIS",
 257                  "test13aBSIL",
 258                  "test13bIL",
 259                  "test13bIB",
 260                  "test13bIS",
 261                  "test13bBSIL",
 262                  "test14aB",
 263                  "test14bB",
 264                  "test14cB",
 265                  "test14dB",
 266                  "test14eB",
 267                  "test14fB",
 268                  "test15aB",
 269                  "test15bB",
 270                  "test15cB",
 271                  "test16a",
 272                  "test16b",
 273                  "test17a",
 274                  "test17b",
 275                  "test17c",
 276                  "test17d",
 277                  "test18a",
 278                  "test18b",
 279                  "test19",
 280                  "test20"})
 281     public void runTests() {
 282         for (Map.Entry<String,TestFunction> entry : tests.entrySet()) {
 283             String name = entry.getKey();
 284             TestFunction test = entry.getValue();
 285             // Recall gold value from before compilation
 286             Object[] gold = golds.get(name);
 287             // Compute new result
 288             Object[] result = test.run();
 289             // Compare gold and new result
 290             verify(name, gold, result);
 291         }
 292     }
 293 
 294     static byte[] generateB() {
 295         byte[] a = new byte[RANGE];
 296         for (int i = 0; i < a.length; i++) {
 297             a[i] = (byte)RANDOM.nextInt();
 298         }
 299         return a;
 300     }
 301 
 302     static short[] generateS() {
 303         short[] a = new short[RANGE];
 304         for (int i = 0; i < a.length; i++) {
 305             a[i] = (short)RANDOM.nextInt();
 306         }
 307         return a;
 308     }
 309 
 310     static int[] generateI() {
 311         int[] a = new int[RANGE];
 312         for (int i = 0; i < a.length; i++) {
 313             a[i] = RANDOM.nextInt();
 314         }
 315         return a;
 316     }
 317 
 318     static long[] generateL() {
 319         long[] a = new long[RANGE];
 320         for (int i = 0; i < a.length; i++) {
 321             a[i] = RANDOM.nextLong();
 322         }
 323         return a;
 324     }
 325 
 326     static void verify(String name, Object[] gold, Object[] result) {
 327         if (gold.length != result.length) {
 328             throw new RuntimeException("verify " + name + ": not the same number of outputs: gold.length = " +
 329                                        gold.length + ", result.length = " + result.length);
 330         }
 331         for (int i = 0; i < gold.length; i++) {
 332             Object g = gold[i];
 333             Object r = result[i];
 334             if (g.getClass() != r.getClass() || !g.getClass().isArray() || !r.getClass().isArray()) {
 335                 throw new RuntimeException("verify " + name + ": must both be array of same type:" +
 336                                            " gold[" + i + "].getClass() = " + g.getClass().getSimpleName() +
 337                                            " result[" + i + "].getClass() = " + r.getClass().getSimpleName());
 338             }
 339             if (g == r) {
 340                 throw new RuntimeException("verify " + name + ": should be two separate arrays (with identical content):" +
 341                                            " gold[" + i + "] == result[" + i + "]");
 342             }
 343             if (Array.getLength(g) != Array.getLength(r)) {
 344                     throw new RuntimeException("verify " + name + ": arrays must have same length:" +
 345                                            " gold[" + i + "].length = " + Array.getLength(g) +
 346                                            " result[" + i + "].length = " + Array.getLength(r));
 347             }
 348             Class c = g.getClass().getComponentType();
 349             if (c == byte.class) {
 350                 verifyB(name, i, (byte[])g, (byte[])r);
 351             } else if (c == short.class) {
 352                 verifyS(name, i, (short[])g, (short[])r);
 353             } else if (c == int.class) {
 354                 verifyI(name, i, (int[])g, (int[])r);
 355             } else if (c == long.class) {
 356                 verifyL(name, i, (long[])g, (long[])r);
 357             } else {
 358                 throw new RuntimeException("verify " + name + ": array type not supported for verify:" +
 359                                        " gold[" + i + "].getClass() = " + g.getClass().getSimpleName() +
 360                                        " result[" + i + "].getClass() = " + r.getClass().getSimpleName());
 361             }
 362         }
 363     }
 364 
 365     static void verifyB(String name, int i, byte[] g, byte[] r) {
 366         for (int j = 0; j < g.length; j++) {
 367             if (g[j] != r[j]) {
 368                 throw new RuntimeException("verify " + name + ": arrays must have same content:" +
 369                                            " gold[" + i + "][" + j + "] = " + g[j] +
 370                                            " result[" + i + "][" + j + "] = " + r[j]);
 371             }
 372         }
 373     }
 374 
 375     static void verifyS(String name, int i, short[] g, short[] r) {
 376         for (int j = 0; j < g.length; j++) {
 377             if (g[j] != r[j]) {
 378                 throw new RuntimeException("verify " + name + ": arrays must have same content:" +
 379                                            " gold[" + i + "][" + j + "] = " + g[j] +
 380                                            " result[" + i + "][" + j + "] = " + r[j]);
 381             }
 382         }
 383     }
 384 
 385     static void verifyI(String name, int i, int[] g, int[] r) {
 386         for (int j = 0; j < g.length; j++) {
 387             if (g[j] != r[j]) {
 388                 throw new RuntimeException("verify " + name + ": arrays must have same content:" +
 389                                            " gold[" + i + "][" + j + "] = " + g[j] +
 390                                            " result[" + i + "][" + j + "] = " + r[j]);
 391             }
 392         }
 393     }
 394 
 395     static void verifyL(String name, int i, long[] g, long[] r) {
 396         for (int j = 0; j < g.length; j++) {
 397             if (g[j] != r[j]) {
 398                 throw new RuntimeException("verify " + name + ": arrays must have same content:" +
 399                                            " gold[" + i + "][" + j + "] = " + g[j] +
 400                                            " result[" + i + "][" + j + "] = " + r[j]);
 401             }
 402         }
 403     }
 404 
         // For i = 0, 8, 16, ... below RANGE: writes a[i+k] & mask to b[i+k] for k = 0..3.
         // Elements i+4..i+7 of each group of eight are not touched. Returns { a, b }.
         // IR rule: on 64-bit platforms with sse4.1 or asimd and MaxVectorSize >= 8, 4-element
         // byte vector loads and ANDs plus vector stores are expected. The rule has no
         // AlignVector condition, so it applies with the flag on or off.
 405     @Test
 406     @IR(counts = {IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE_4, "> 0",
 407                   IRNode.AND_VB,        IRNode.VECTOR_SIZE_4, "> 0",
 408                   IRNode.STORE_VECTOR, "> 0"},
 409         applyIf = {"MaxVectorSize", ">=8"},
 410         applyIfPlatform = {"64-bit", "true"},
 411         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
 412     static Object[] test0(byte[] a, byte[] b, byte mask) {
 413         for (int i = 0; i < RANGE; i+=8) {
 414             // Safe to vectorize with AlignVector
 415             b[i+0] = (byte)(a[i+0] & mask); // offset 0, align 0
 416             b[i+1] = (byte)(a[i+1] & mask);
 417             b[i+2] = (byte)(a[i+2] & mask);
 418             b[i+3] = (byte)(a[i+3] & mask);
 419         }
 420         return new Object[]{ a, b };
 421     }
 422 
         // For i = 0, 8, 16, ... below RANGE: writes a[i+k] & mask to b[i+k] for all k = 0..7,
         // so the whole group of eight is processed in each iteration. Returns { a, b }.
         // IR rule: on 64-bit platforms with avx2 or asimd, byte vector loads and ANDs plus
         // vector stores are expected. The rule checks neither a vector size nor a VM flag.
 423     @Test
 424     @IR(counts = {IRNode.LOAD_VECTOR_B, "> 0",
 425                   IRNode.AND_VB, "> 0",
 426                   IRNode.STORE_VECTOR, "> 0"},
 427         applyIfPlatform = {"64-bit", "true"},
 428         applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true"})
 429     static Object[] test1(byte[] a, byte[] b, byte mask) {
 430         for (int i = 0; i < RANGE; i+=8) {
 431             b[i+0] = (byte)(a[i+0] & mask); // adr = base + UNSAFE.ARRAY_BYTE_BASE_OFFSET + 0 + iter*8
 432             b[i+1] = (byte)(a[i+1] & mask);
 433             b[i+2] = (byte)(a[i+2] & mask);
 434             b[i+3] = (byte)(a[i+3] & mask);
 435             b[i+4] = (byte)(a[i+4] & mask);
 436             b[i+5] = (byte)(a[i+5] & mask);
 437             b[i+6] = (byte)(a[i+6] & mask);
 438             b[i+7] = (byte)(a[i+7] & mask);
 439         }
 440         return new Object[]{ a, b };
 441     }
 442 
         // For i = 0, 8, 16, ... below RANGE: writes a[i+k] & mask to b[i+k] for k = 3..6, i.e.
         // a run of four bytes that starts three elements into each group of eight.
         // Returns { a, b }.
         // IR rules (64-bit, sse4.1 or asimd):
         //  - AlignVector off and MaxVectorSize >= 8: 4-element byte vectors are expected.
         //  - AlignVector on: no byte vector load, AND or vector store may appear.
 443     @Test
 444     @IR(counts = {IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE_4, "> 0",
 445                   IRNode.AND_VB,        IRNode.VECTOR_SIZE_4, "> 0",
 446                   IRNode.STORE_VECTOR, "> 0"},
 447         applyIfAnd = {"AlignVector", "false", "MaxVectorSize", ">=8"},
 448         applyIfPlatform = {"64-bit", "true"},
 449         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
 450     @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
 451                   IRNode.AND_VB, "= 0",
 452                   IRNode.STORE_VECTOR, "= 0"},
 453         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
 454         applyIfPlatform = {"64-bit", "true"},
 455         applyIf = {"AlignVector", "true"})
 456     static Object[] test2(byte[] a, byte[] b, byte mask) {
 457         for (int i = 0; i < RANGE; i+=8) {
 458             // Cannot align with AlignVector: 3 + x * 8 % 8 = 3
 459             b[i+3] = (byte)(a[i+3] & mask); // at alignment 3
 460             b[i+4] = (byte)(a[i+4] & mask);
 461             b[i+5] = (byte)(a[i+5] & mask);
 462             b[i+6] = (byte)(a[i+6] & mask);
 463         }
 464         return new Object[]{ a, b };
 465     }
 466 
         // Like the run of four at offsets 3..6 alone, but each iteration additionally handles
         // the single element at offset 0: for i = 0, 8, 16, ... below RANGE it writes
         // a[i+k] & mask to b[i+k] for k = 0 and k = 3..6. Returns { a, b }.
         // IR rules (64-bit, sse4.1 or asimd):
         //  - AlignVector off and MaxVectorSize >= 8: 4-element byte vectors are expected.
         //  - AlignVector on: no byte vector load, AND or vector store may appear.
 467     @Test
 468     @IR(counts = {IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE_4, "> 0",
 469                   IRNode.AND_VB,        IRNode.VECTOR_SIZE_4, "> 0",
 470                   IRNode.STORE_VECTOR, "> 0"},
 471         applyIfAnd = {"AlignVector", "false", "MaxVectorSize", ">=8"},
 472         applyIfPlatform = {"64-bit", "true"},
 473         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
 474     @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
 475                   IRNode.AND_VB, "= 0",
 476                   IRNode.STORE_VECTOR, "= 0"},
 477         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
 478         applyIfPlatform = {"64-bit", "true"},
 479         applyIf = {"AlignVector", "true"})
 480     static Object[] test3(byte[] a, byte[] b, byte mask) {
 481         for (int i = 0; i < RANGE; i+=8) {
 482             // Cannot align with AlignVector: 3 + x * 8 % 8 = 3
 483 
 484             // Problematic for AlignVector
 485             b[i+0] = (byte)(a[i+0] & mask); // best_memref, align 0
 486 
 487             b[i+3] = (byte)(a[i+3] & mask); // pack at offset 3 bytes
 488             b[i+4] = (byte)(a[i+4] & mask);
 489             b[i+5] = (byte)(a[i+5] & mask);
 490             b[i+6] = (byte)(a[i+6] & mask);
 491         }
 492         return new Object[]{ a, b };
 493     }
 494 
         // For i = 0 .. RANGE/16 - 1, works on the 16-element group starting at i*16: a run of
         // four at offsets 0..3 and a run of eight at offsets 5..12. Offsets 4 and 13..15 are
         // not touched. Returns { a, b }.
         // IR rules (64-bit, sse4.1 or asimd, MaxVectorSize >= 16):
         //  - AlignVector off: both 4-element and 8-element byte vectors are expected.
         //  - AlignVector on: 4-element vectors are expected, 8-element vectors must be absent.
 495     @Test
 496     @IR(counts = {IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE_4, "> 0",
 497                   IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE_8, "> 0",
 498                   IRNode.AND_VB,        IRNode.VECTOR_SIZE_4, "> 0",
 499                   IRNode.AND_VB,        IRNode.VECTOR_SIZE_8, "> 0",
 500                   IRNode.STORE_VECTOR, "> 0"},
 501         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
 502         applyIfPlatform = {"64-bit", "true"},
 503         applyIfAnd = {"AlignVector", "false", "MaxVectorSize", ">=16"})
 504     @IR(counts = {IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE_4, "> 0",
 505                   IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE_8, "= 0",// unaligned
 506                   IRNode.AND_VB,        IRNode.VECTOR_SIZE_4, "> 0",
 507                   IRNode.AND_VB,        IRNode.VECTOR_SIZE_8, "= 0",// unaligned
 508                   IRNode.STORE_VECTOR, "> 0"},
 509         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
 510         applyIfPlatform = {"64-bit", "true"},
 511         applyIfAnd = {"AlignVector", "true", "MaxVectorSize", ">=16"})
 512     static Object[] test4(byte[] a, byte[] b, byte mask) {
 513         for (int i = 0; i < RANGE/16; i++) {
 514             // Problematic for AlignVector
 515             b[i*16 + 0 ] = (byte)(a[i*16 + 0 ] & mask); // 4 pack, 0 aligned
 516             b[i*16 + 1 ] = (byte)(a[i*16 + 1 ] & mask);
 517             b[i*16 + 2 ] = (byte)(a[i*16 + 2 ] & mask);
 518             b[i*16 + 3 ] = (byte)(a[i*16 + 3 ] & mask);
 519 
 520             b[i*16 + 5 ] = (byte)(a[i*16 + 5 ] & mask); // 8 pack, 5 aligned
 521             b[i*16 + 6 ] = (byte)(a[i*16 + 6 ] & mask);
 522             b[i*16 + 7 ] = (byte)(a[i*16 + 7 ] & mask);
 523             b[i*16 + 8 ] = (byte)(a[i*16 + 8 ] & mask);
 524             b[i*16 + 9 ] = (byte)(a[i*16 + 9 ] & mask);
 525             b[i*16 + 10] = (byte)(a[i*16 + 10] & mask);
 526             b[i*16 + 11] = (byte)(a[i*16 + 11] & mask);
 527             b[i*16 + 12] = (byte)(a[i*16 + 12] & mask);
 528         }
 529         return new Object[]{ a, b };
 530     }
 531 
         // For i = 0, 8, 16, ... below RANGE: writes a[i+inv+k] & mask to b[i+inv+k] for k = 0
         // and k = 3..6. 'inv' is a caller-supplied offset that stays constant during the loop
         // and is added to every index. Returns { a, b }.
         // The highest index accessed is i+inv+6, so inv must keep that inside both arrays.
         // IR rules (64-bit, sse4.1 or asimd):
         //  - AlignVector off and MaxVectorSize >= 8: 4-element byte vectors are expected.
         //  - AlignVector on: no byte vector load, AND or vector store may appear.
 532     @Test
 533     @IR(counts = {IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE_4, "> 0",
 534                   IRNode.AND_VB,        IRNode.VECTOR_SIZE_4, "> 0",
 535                   IRNode.STORE_VECTOR, "> 0"},
 536         applyIfAnd = {"AlignVector", "false", "MaxVectorSize", ">=8"},
 537         applyIfPlatform = {"64-bit", "true"},
 538         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
 539     @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
 540                   IRNode.AND_VB, "= 0",
 541                   IRNode.STORE_VECTOR, "= 0"},
 542         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
 543         applyIfPlatform = {"64-bit", "true"},
 544         applyIf = {"AlignVector", "true"})
 545     static Object[] test5(byte[] a, byte[] b, byte mask, int inv) {
 546         for (int i = 0; i < RANGE; i+=8) {
 547             // Cannot align with AlignVector because of invariant
 548             b[i+inv+0] = (byte)(a[i+inv+0] & mask);
 549 
 550             b[i+inv+3] = (byte)(a[i+inv+3] & mask);
 551             b[i+inv+4] = (byte)(a[i+inv+4] & mask);
 552             b[i+inv+5] = (byte)(a[i+inv+5] & mask);
 553             b[i+inv+6] = (byte)(a[i+inv+6] & mask);
 554         }
 555         return new Object[]{ a, b };
 556     }
 557 
         // The loop variable advances by 2 up to RANGE/8 and is scaled by 4 in every index, so
         // i*4 takes the values 0, 8, 16, ... and stays below RANGE/2. Per iteration it writes
         // a[i*4+k] & mask to b[i*4+k] for k = 0 and k = 3..6. Returns { a, b }.
         // IR rules (64-bit, sse4.1 or asimd):
         //  - AlignVector off and MaxVectorSize >= 8: 4-element byte vectors are expected.
         //  - AlignVector on: no byte vector load, AND or vector store may appear.
 558     @Test
 559     @IR(counts = {IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE_4, "> 0",
 560                   IRNode.AND_VB,        IRNode.VECTOR_SIZE_4, "> 0",
 561                   IRNode.STORE_VECTOR, "> 0"},
 562         applyIfAnd = {"AlignVector", "false", "MaxVectorSize", ">=8"},
 563         applyIfPlatform = {"64-bit", "true"},
 564         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
 565     @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
 566                   IRNode.AND_VB, "= 0",
 567                   IRNode.STORE_VECTOR, "= 0"},
 568         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
 569         applyIfPlatform = {"64-bit", "true"},
 570         applyIf = {"AlignVector", "true"})
 571     static Object[] test6(byte[] a, byte[] b, byte mask) {
 572         for (int i = 0; i < RANGE/8; i+=2) {
 573             // Cannot align with AlignVector because offset is odd
 574             b[i*4+0] = (byte)(a[i*4+0] & mask);
 575 
 576             b[i*4+3] = (byte)(a[i*4+3] & mask);
 577             b[i*4+4] = (byte)(a[i*4+4] & mask);
 578             b[i*4+5] = (byte)(a[i*4+5] & mask);
 579             b[i*4+6] = (byte)(a[i*4+6] & mask);
 580         }
 581         return new Object[]{ a, b };
 582     }
 583 
         // Short-array counterpart of the stride-2, scale-4 byte loop: i advances by 2 up to
         // RANGE/8, and each iteration writes a[i*4+k] & mask to b[i*4+k] for k = 0 and
         // k = 3..6. Returns { a, b }.
         // IR rules (64-bit):
         //  - AlignVector off, MaxVectorSize >= 16, avx2 or asimd: 4-element short vectors
         //    are expected.
         //  - AlignVector on, sse4.1 or asimd: no short vector load, AND or vector store may
         //    appear.
 584     @Test
 585     @IR(counts = {IRNode.LOAD_VECTOR_S, IRNode.VECTOR_SIZE_4, "> 0",
 586                   IRNode.AND_VS,        IRNode.VECTOR_SIZE_4, "> 0",
 587                   IRNode.STORE_VECTOR, "> 0"},
 588         applyIfAnd = {"AlignVector", "false", "MaxVectorSize", ">=16"},
 589         applyIfPlatform = {"64-bit", "true"},
 590         applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true"})
 591     @IR(counts = {IRNode.LOAD_VECTOR_S, "= 0",
 592                   IRNode.AND_VS, "= 0",
 593                   IRNode.STORE_VECTOR, "= 0"},
 594         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
 595         applyIfPlatform = {"64-bit", "true"},
 596         applyIf = {"AlignVector", "true"})
 597     static Object[] test7(short[] a, short[] b, short mask) {
 598         for (int i = 0; i < RANGE/8; i+=2) {
 599             // Cannot align with AlignVector because offset is odd
 600             b[i*4+0] = (short)(a[i*4+0] & mask);
 601 
 602             b[i*4+3] = (short)(a[i*4+3] & mask);
 603             b[i*4+4] = (short)(a[i*4+4] & mask);
 604             b[i*4+5] = (short)(a[i*4+5] & mask);
 605             b[i*4+6] = (short)(a[i*4+6] & mask);
 606         }
 607         return new Object[]{ a, b };
 608     }
 609 
         // The loop starts at the caller-supplied 'init' and advances by 8 while i < RANGE.
         // Each iteration writes a[i+k] & mask to b[i+k] for k = 0 and k = 3..6.
         // Returns { a, b }.
         // The loop condition bounds only i, while the body accesses up to index i+6, so init
         // must be chosen such that i+6 stays inside both arrays.
         // IR rules (64-bit, sse4.1 or asimd):
         //  - AlignVector off and MaxVectorSize >= 8: 4-element byte vectors are expected.
         //  - AlignVector on: no byte vector load, AND or vector store may appear.
 610     @Test
 611     @IR(counts = {IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE_4, "> 0",
 612                   IRNode.AND_VB,        IRNode.VECTOR_SIZE_4, "> 0",
 613                   IRNode.STORE_VECTOR, "> 0"},
 614         applyIfAnd = {"AlignVector", "false", "MaxVectorSize", ">=8"},
 615         applyIfPlatform = {"64-bit", "true"},
 616         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
 617     @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
 618                   IRNode.AND_VB, "= 0",
 619                   IRNode.STORE_VECTOR, "= 0"},
 620         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
 621         applyIfPlatform = {"64-bit", "true"},
 622         applyIf = {"AlignVector", "true"})
 623     static Object[] test8(byte[] a, byte[] b, byte mask, int init) {
 624         for (int i = init; i < RANGE; i+=8) {
 625             // Cannot align with AlignVector because of invariant (variable init becomes invar)
 626             b[i+0] = (byte)(a[i+0] & mask);
 627 
 628             b[i+3] = (byte)(a[i+3] & mask);
 629             b[i+4] = (byte)(a[i+4] & mask);
 630             b[i+5] = (byte)(a[i+5] & mask);
 631             b[i+6] = (byte)(a[i+6] & mask);
 632         }
 633         return new Object[]{ a, b };
 634     }
 635 
         // The loop starts at the constant 13 and advances by 8 while i < RANGE-8. Each
         // iteration writes a[i+k] & mask to b[i+k] for k = 0 and k = 3..6. Returns { a, b }.
         // The run of four starts at index i+3 = 16, 24, 32, ...
         // NOTE(review): presumably this is why vectorization is expected even with
         // AlignVector on - confirm.
         // IR rule: on 64-bit platforms with sse4.1 or asimd and MaxVectorSize >= 8, 4-element
         // byte vectors are expected. The rule has no AlignVector condition.
 636     @Test
 637     @IR(counts = {IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE_4, "> 0",
 638                   IRNode.AND_VB,        IRNode.VECTOR_SIZE_4, "> 0",
 639                   IRNode.STORE_VECTOR, "> 0"},
 640         applyIf = {"MaxVectorSize", ">=8"},
 641         applyIfPlatform = {"64-bit", "true"},
 642         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
 643     static Object[] test9(byte[] a, byte[] b, byte mask) {
 644         // known non-zero init value does not affect offset, but has implicit effect on iv
 645         for (int i = 13; i < RANGE-8; i+=8) {
 646             b[i+0] = (byte)(a[i+0] & mask);
 647 
 648             b[i+3] = (byte)(a[i+3] & mask);
 649             b[i+4] = (byte)(a[i+4] & mask);
 650             b[i+5] = (byte)(a[i+5] & mask);
 651             b[i+6] = (byte)(a[i+6] & mask);
 652         }
 653         return new Object[]{ a, b };
 654     }
 655 
 656     @Test
 657     @IR(counts = {IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE_4, "> 0",
 658                   IRNode.AND_VB,        IRNode.VECTOR_SIZE_4, "> 0",
 659                   IRNode.STORE_VECTOR, "> 0"},
 660         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
 661         applyIfPlatform = {"64-bit", "true"},
 662         applyIfAnd = {"AlignVector", "false", "MaxVectorSize", ">=8"})
 663     @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
 664                   IRNode.AND_VB, "= 0",
 665                   IRNode.STORE_VECTOR, "= 0"},
 666         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
 667         applyIfPlatform = {"64-bit", "true"},
 668         applyIf = {"AlignVector", "true"})
         // Byte case: b[i+k] = a[i+k] & mask for k = 0..3, constant init 3, stride 8.
         // Rule 1 (AlignVector=false, MaxVectorSize >= 8): 4-element byte vectors expected.
         // Rule 2 (AlignVector=true): no vector load/and/store nodes expected at all.
         // Returns both arrays (presumably for result verification by the caller - confirm).
 669     static Object[] test10a(byte[] a, byte[] b, byte mask) {
 670         // This is not alignable with pre-loop, because of odd init.
 671         for (int i = 3; i < RANGE-8; i+=8) {
 672             b[i+0] = (byte)(a[i+0] & mask);
 673             b[i+1] = (byte)(a[i+1] & mask);
 674             b[i+2] = (byte)(a[i+2] & mask);
 675             b[i+3] = (byte)(a[i+3] & mask);
 676         }
 677         return new Object[]{ a, b };
 678     }
 679 
 680     @Test
 681     @IR(counts = {IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE_4, "> 0",
 682                   IRNode.AND_VB,        IRNode.VECTOR_SIZE_4, "> 0",
 683                   IRNode.STORE_VECTOR, "> 0"},
 684         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
 685         applyIfPlatform = {"64-bit", "true"},
 686         applyIfAnd = {"AlignVector", "false", "MaxVectorSize", ">=8"})
 687     @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
 688                   IRNode.AND_VB, "= 0",
 689                   IRNode.STORE_VECTOR, "= 0"},
 690         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
 691         applyIfPlatform = {"64-bit", "true"},
 692         applyIf = {"AlignVector", "true"})
         // Same shape as test10a (bytes, offsets 0..3, stride 8) but with constant init 13.
         // Rule 1 (AlignVector=false, MaxVectorSize >= 8): 4-element byte vectors expected.
         // Rule 2 (AlignVector=true): no vector load/and/store nodes expected at all.
         // Returns both arrays (presumably for result verification by the caller - confirm).
 693     static Object[] test10b(byte[] a, byte[] b, byte mask) {
 694         // This is not alignable with pre-loop, because of odd init.
 695         // Seems not correctly handled.
 696         for (int i = 13; i < RANGE-8; i+=8) {
 697             b[i+0] = (byte)(a[i+0] & mask);
 698             b[i+1] = (byte)(a[i+1] & mask);
 699             b[i+2] = (byte)(a[i+2] & mask);
 700             b[i+3] = (byte)(a[i+3] & mask);
 701         }
 702         return new Object[]{ a, b };
 703     }
 704 
 705     @Test
 706     @IR(counts = {IRNode.LOAD_VECTOR_S, IRNode.VECTOR_SIZE_4, "> 0",
 707                   IRNode.AND_VS,        IRNode.VECTOR_SIZE_4, "> 0",
 708                   IRNode.STORE_VECTOR, "> 0"},
 709         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
 710         applyIfPlatform = {"64-bit", "true"},
 711         applyIfAnd = {"AlignVector", "false", "MaxVectorSize", ">=16"})
 712     @IR(counts = {IRNode.LOAD_VECTOR_S, "= 0",
 713                   IRNode.AND_VS, "= 0",
 714                   IRNode.STORE_VECTOR, "= 0"},
 715         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
 716         applyIfPlatform = {"64-bit", "true"},
 717         applyIf = {"AlignVector", "true"})
         // Short variant of test10b: b[i+k] = a[i+k] & mask for k = 0..3, init 13, stride 8.
         // Rule 1 (AlignVector=false, MaxVectorSize >= 16): 4-element short vectors expected.
         // Rule 2 (AlignVector=true): no vector load/and/store nodes expected at all.
         // Returns both arrays (presumably for result verification by the caller - confirm).
 718     static Object[] test10c(short[] a, short[] b, short mask) {
 719         // This is not alignable with pre-loop, because of odd init.
 720         // Seems not correctly handled with MaxVectorSize >= 32.
 721         for (int i = 13; i < RANGE-8; i+=8) {
 722             b[i+0] = (short)(a[i+0] & mask);
 723             b[i+1] = (short)(a[i+1] & mask);
 724             b[i+2] = (short)(a[i+2] & mask);
 725             b[i+3] = (short)(a[i+3] & mask);
 726         }
 727         return new Object[]{ a, b };
 728     }
 729 
 730     @Test
 731     @IR(counts = {IRNode.LOAD_VECTOR_S, IRNode.VECTOR_SIZE_4, "> 0",
 732                   IRNode.AND_VS,        IRNode.VECTOR_SIZE_4, "> 0",
 733                   IRNode.STORE_VECTOR, "> 0"},
 734         applyIf = {"MaxVectorSize", ">=16"},
 735         applyIfPlatform = {"64-bit", "true"},
 736         applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true"})
         // Short case with init 13, stride 8 and an extra constant offset of 3 on every access,
         // so each iteration touches elements i+3 .. i+6.
         // The single IR rule carries no AlignVector condition: 4-element short vectors are
         // expected whenever MaxVectorSize >= 16 on 64-bit avx2 or asimd.
         // Returns both arrays (presumably for result verification by the caller - confirm).
 737     static Object[] test10d(short[] a, short[] b, short mask) {
 738         for (int i = 13; i < RANGE-16; i+=8) {
 739             // adr = base + UNSAFE.ARRAY_SHORT_BASE_OFFSET + 2*(3 + 13) + iter*16
 740             b[i+0+3] = (short)(a[i+0+3] & mask);
 741             b[i+1+3] = (short)(a[i+1+3] & mask);
 742             b[i+2+3] = (short)(a[i+2+3] & mask);
 743             b[i+3+3] = (short)(a[i+3+3] & mask);
 744         }
 745         return new Object[]{ a, b };
 746     }
 747 
 748     @Test
 749     @IR(counts = {IRNode.LOAD_VECTOR_B, "> 0",
 750                   IRNode.AND_VB, "> 0",
 751                   IRNode.STORE_VECTOR, "> 0"},
 752         applyIfPlatform = {"64-bit", "true"},
 753         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
         // Byte case, unit stride starting at 0; load and store use the same index.
         // The IR rule has no AlignVector condition: vector nodes are expected in either mode.
         // Returns both arrays (presumably for result verification by the caller - confirm).
 754     static Object[] test11aB(byte[] a, byte[] b, byte mask) {
 755         for (int i = 0; i < RANGE; i++) {
 756             // always alignable
 757             b[i+0] = (byte)(a[i+0] & mask);
 758         }
 759         return new Object[]{ a, b };
 760     }
 761 
 762     @Test
 763     @IR(counts = {IRNode.LOAD_VECTOR_S, "> 0",
 764                   IRNode.AND_VS, "> 0",
 765                   IRNode.STORE_VECTOR, "> 0"},
 766         applyIfPlatform = {"64-bit", "true"},
 767         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
         // Short case, unit stride starting at 0; load and store use the same index.
         // The IR rule has no AlignVector condition: vector nodes are expected in either mode.
         // Returns both arrays (presumably for result verification by the caller - confirm).
 768     static Object[] test11aS(short[] a, short[] b, short mask) {
 769         for (int i = 0; i < RANGE; i++) {
 770             // always alignable
 771             b[i+0] = (short)(a[i+0] & mask);
 772         }
 773         return new Object[]{ a, b };
 774     }
 775 
 776     @Test
 777     @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0",
 778                   IRNode.AND_VI, "> 0",
 779                   IRNode.STORE_VECTOR, "> 0"},
 780         applyIfPlatform = {"64-bit", "true"},
 781         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
         // Int case, unit stride starting at 0; load and store use the same index.
         // The IR rule has no AlignVector condition: vector nodes are expected in either mode.
         // Returns both arrays (presumably for result verification by the caller - confirm).
 782     static Object[] test11aI(int[] a, int[] b, int mask) {
 783         for (int i = 0; i < RANGE; i++) {
 784             // always alignable
 785             b[i+0] = (int)(a[i+0] & mask);
 786         }
 787         return new Object[]{ a, b };
 788     }
 789 
 790     @Test
 791     @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0",
 792                   IRNode.AND_VL, "> 0",
 793                   IRNode.STORE_VECTOR, "> 0"},
 794         applyIfPlatform = {"64-bit", "true"},
 795         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
         // Long case, unit stride starting at 0; load and store use the same index.
         // The IR rule has no AlignVector condition: vector nodes are expected in either mode.
         // Returns both arrays (presumably for result verification by the caller - confirm).
 796     static Object[] test11aL(long[] a, long[] b, long mask) {
 797         for (int i = 0; i < RANGE; i++) {
 798             // always alignable
 799             b[i+0] = (long)(a[i+0] & mask);
 800         }
 801         return new Object[]{ a, b };
 802     }
 803 
 804     @Test
 805     @IR(counts = {IRNode.LOAD_VECTOR_B, "> 0",
 806                   IRNode.AND_VB, "> 0",
 807                   IRNode.STORE_VECTOR, "> 0"},
 808         applyIfPlatform = {"64-bit", "true"},
 809         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
         // Byte case, unit stride, but the loop starts at index 1 instead of 0.
         // The IR rule has no AlignVector condition: vector nodes are expected in either mode.
         // Returns both arrays (presumably for result verification by the caller - confirm).
 810     static Object[] test11bB(byte[] a, byte[] b, byte mask) {
 811         for (int i = 1; i < RANGE; i++) {
 812             // always alignable
 813             b[i+0] = (byte)(a[i+0] & mask);
 814         }
 815         return new Object[]{ a, b };
 816     }
 817 
 818     @Test
 819     @IR(counts = {IRNode.LOAD_VECTOR_S, "> 0",
 820                   IRNode.AND_VS, "> 0",
 821                   IRNode.STORE_VECTOR, "> 0"},
 822         applyIfPlatform = {"64-bit", "true"},
 823         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
         // Short case, unit stride, but the loop starts at index 1 instead of 0.
         // The IR rule has no AlignVector condition: vector nodes are expected in either mode.
         // Returns both arrays (presumably for result verification by the caller - confirm).
 824     static Object[] test11bS(short[] a, short[] b, short mask) {
 825         for (int i = 1; i < RANGE; i++) {
 826             // always alignable
 827             b[i+0] = (short)(a[i+0] & mask);
 828         }
 829         return new Object[]{ a, b };
 830     }
 831 
 832     @Test
 833     @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0",
 834                   IRNode.AND_VI, "> 0",
 835                   IRNode.STORE_VECTOR, "> 0"},
 836         applyIfPlatform = {"64-bit", "true"},
 837         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
         // Int case, unit stride, but the loop starts at index 1 instead of 0.
         // The IR rule has no AlignVector condition: vector nodes are expected in either mode.
         // Returns both arrays (presumably for result verification by the caller - confirm).
 838     static Object[] test11bI(int[] a, int[] b, int mask) {
 839         for (int i = 1; i < RANGE; i++) {
 840             // always alignable
 841             b[i+0] = (int)(a[i+0] & mask);
 842         }
 843         return new Object[]{ a, b };
 844     }
 845 
 846     @Test
 847     @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0",
 848                   IRNode.AND_VL, "> 0",
 849                   IRNode.STORE_VECTOR, "> 0"},
 850         applyIfPlatform = {"64-bit", "true"},
 851         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
         // Long case, unit stride, but the loop starts at index 1 instead of 0.
         // The IR rule has no AlignVector condition: vector nodes are expected in either mode.
         // Returns both arrays (presumably for result verification by the caller - confirm).
 852     static Object[] test11bL(long[] a, long[] b, long mask) {
 853         for (int i = 1; i < RANGE; i++) {
 854             // always alignable
 855             b[i+0] = (long)(a[i+0] & mask);
 856         }
 857         return new Object[]{ a, b };
 858     }
 859 
 860     @Test
 861     @IR(counts = {IRNode.LOAD_VECTOR_B, "> 0",
 862                   IRNode.AND_VB, "> 0",
 863                   IRNode.STORE_VECTOR, "> 0"},
 864         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
 865         applyIfPlatform = {"64-bit", "true"},
 866         applyIf = {"AlignVector", "false"})
 867     @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
 868                   IRNode.AND_VB, "= 0",
 869                   IRNode.STORE_VECTOR, "= 0"},
 870         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
 871         applyIfPlatform = {"64-bit", "true"},
 872         applyIf = {"AlignVector", "true"})
         // Byte case where the load index (i+1) is one element ahead of the store index (i).
         // Rule 1 (AlignVector=false): vector nodes expected.
         // Rule 2 (AlignVector=true): no vector load/and/store nodes expected.
         // Returns both arrays (presumably for result verification by the caller - confirm).
 873     static Object[] test11cB(byte[] a, byte[] b, byte mask) {
 874         for (int i = 1; i < RANGE-1; i++) {
 875             // 1 byte offset -> not alignable with AlignVector
 876             b[i+0] = (byte)(a[i+1] & mask);
 877         }
 878         return new Object[]{ a, b };
 879     }
 880 
 881     @Test
 882     @IR(counts = {IRNode.LOAD_VECTOR_S, "> 0",
 883                   IRNode.AND_VS, "> 0",
 884                   IRNode.STORE_VECTOR, "> 0"},
 885         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
 886         applyIfPlatform = {"64-bit", "true"},
 887         applyIf = {"AlignVector", "false"})
 888     @IR(counts = {IRNode.LOAD_VECTOR_S, "= 0",
 889                   IRNode.AND_VS, "= 0",
 890                   IRNode.STORE_VECTOR, "= 0"},
 891         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
 892         applyIfPlatform = {"64-bit", "true"},
 893         applyIf = {"AlignVector", "true"})
         // Short case where the load index (i+1) is one element ahead of the store index (i).
         // Rule 1 (AlignVector=false): vector nodes expected.
         // Rule 2 (AlignVector=true): no vector load/and/store nodes expected.
         // Returns both arrays (presumably for result verification by the caller - confirm).
 894     static Object[] test11cS(short[] a, short[] b, short mask) {
 895         for (int i = 1; i < RANGE-1; i++) {
 896             // 2 byte offset -> not alignable with AlignVector
 897             b[i+0] = (short)(a[i+1] & mask);
 898         }
 899         return new Object[]{ a, b };
 900     }
 901 
 902     @Test
 903     @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0",
 904                   IRNode.AND_VI, "> 0",
 905                   IRNode.STORE_VECTOR, "> 0"},
 906         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
 907         applyIfPlatform = {"64-bit", "true"},
 908         applyIf = {"AlignVector", "false"})
 909     @IR(counts = {IRNode.LOAD_VECTOR_I, "= 0",
 910                   IRNode.AND_VI, "= 0",
 911                   IRNode.STORE_VECTOR, "= 0"},
 912         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
 913         applyIfPlatform = {"64-bit", "true"},
 914         applyIf = {"AlignVector", "true"})
         // Int case where the load index (i+1) is one element ahead of the store index (i).
         // Rule 1 (AlignVector=false): vector nodes expected.
         // Rule 2 (AlignVector=true): no vector load/and/store nodes expected.
         // Returns both arrays (presumably for result verification by the caller - confirm).
 915     static Object[] test11cI(int[] a, int[] b, int mask) {
 916         for (int i = 1; i < RANGE-1; i++) {
 917             // 4 byte offset -> not alignable with AlignVector
 918             b[i+0] = (int)(a[i+1] & mask);
 919         }
 920         return new Object[]{ a, b };
 921     }
 922 
 923     @Test
 924     @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0",
 925                   IRNode.AND_VL, "> 0",
 926                   IRNode.STORE_VECTOR, "> 0"},
 927         applyIfPlatform = {"64-bit", "true"},
 928         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
         // Long case where the load index (i+1) is one element ahead of the store index (i).
         // Unlike the byte/short/int siblings there is only one IR rule, without an AlignVector
         // condition: vector nodes are expected in either mode.
         // Returns both arrays (presumably for result verification by the caller - confirm).
 929     static Object[] test11cL(long[] a, long[] b, long mask) {
 930         for (int i = 1; i < RANGE-1; i++) {
 931             // always alignable (8 byte offset)
 932             b[i+0] = (long)(a[i+1] & mask);
 933         }
 934         return new Object[]{ a, b };
 935     }
 936 
 937     @Test
 938     @IR(counts = {IRNode.LOAD_VECTOR_B, "> 0",
 939                   IRNode.AND_VB, "> 0",
 940                   IRNode.STORE_VECTOR, "> 0"},
 941         applyIfPlatform = {"64-bit", "true"},
 942         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
         // Byte case, unit stride from 0, with the loop-invariant parameter invar added to
         // both the load and the store index.
         // The IR rule has no AlignVector condition: vector nodes are expected in either mode.
         // NOTE(review): the caller must pick invar so that i+invar stays in bounds - confirm.
 943     static Object[] test11dB(byte[] a, byte[] b, byte mask, int invar) {
 944         for (int i = 0; i < RANGE; i++) {
 945             b[i+0+invar] = (byte)(a[i+0+invar] & mask);
 946         }
 947         return new Object[]{ a, b };
 948     }
 949 
 950     @Test
 951     @IR(counts = {IRNode.LOAD_VECTOR_S, "> 0",
 952                   IRNode.AND_VS, "> 0",
 953                   IRNode.STORE_VECTOR, "> 0"},
 954         applyIfPlatform = {"64-bit", "true"},
 955         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
         // Short case, unit stride from 0, with the loop-invariant parameter invar added to
         // both the load and the store index.
         // The IR rule has no AlignVector condition: vector nodes are expected in either mode.
         // NOTE(review): the caller must pick invar so that i+invar stays in bounds - confirm.
 956     static Object[] test11dS(short[] a, short[] b, short mask, int invar) {
 957         for (int i = 0; i < RANGE; i++) {
 958             b[i+0+invar] = (short)(a[i+0+invar] & mask);
 959         }
 960         return new Object[]{ a, b };
 961     }
 962 
 963     @Test
 964     @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0",
 965                   IRNode.AND_VI, "> 0",
 966                   IRNode.STORE_VECTOR, "> 0"},
 967         applyIfPlatform = {"64-bit", "true"},
 968         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
         // Int case, unit stride from 0, with the loop-invariant parameter invar added to
         // both the load and the store index.
         // The IR rule has no AlignVector condition: vector nodes are expected in either mode.
         // NOTE(review): the caller must pick invar so that i+invar stays in bounds - confirm.
 969     static Object[] test11dI(int[] a, int[] b, int mask, int invar) {
 970         for (int i = 0; i < RANGE; i++) {
 971             b[i+0+invar] = (int)(a[i+0+invar] & mask);
 972         }
 973         return new Object[]{ a, b };
 974     }
 975 
 976     @Test
 977     @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0",
 978                   IRNode.AND_VL, "> 0",
 979                   IRNode.STORE_VECTOR, "> 0"},
 980         applyIfPlatform = {"64-bit", "true"},
 981         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
         // Long case, unit stride from 0, with the loop-invariant parameter invar added to
         // both the load and the store index.
         // The IR rule has no AlignVector condition: vector nodes are expected in either mode.
         // NOTE(review): the caller must pick invar so that i+invar stays in bounds - confirm.
 982     static Object[] test11dL(long[] a, long[] b, long mask, int invar) {
 983         for (int i = 0; i < RANGE; i++) {
 984             b[i+0+invar] = (long)(a[i+0+invar] & mask);
 985         }
 986         return new Object[]{ a, b };
 987     }
 988 
 989     @Test
 990     @IR(counts = {IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE + "min(max_byte, 4)", "> 0",
 991                   IRNode.AND_VB,        IRNode.VECTOR_SIZE + "min(max_byte, 4)", "> 0",
 992                   IRNode.STORE_VECTOR,                                           "> 0"},
 993         applyIfPlatform = {"64-bit", "true"},
 994         applyIf = {"AlignVector", "false"},
 995         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
         // Byte case with the induction variable scaled by 6: each iteration touches elements
         // 6*i+0 .. 6*i+3 and leaves 6*i+4 and 6*i+5 untouched.
         // The only IR rule applies with AlignVector=false and expects byte vectors of size
         // min(max_byte, 4); nothing is asserted for AlignVector=true.
         // Returns both arrays (presumably for result verification by the caller - confirm).
 996     static Object[] test12(byte[] a, byte[] b, byte mask) {
 997         for (int i = 0; i < RANGE/16; i++) {
 998             // Non-power-of-2 stride. Vectorization of 4 bytes, then 2-bytes gap.
 999             b[i*6 + 0 ] = (byte)(a[i*6 + 0 ] & mask);
1000             b[i*6 + 1 ] = (byte)(a[i*6 + 1 ] & mask);
1001             b[i*6 + 2 ] = (byte)(a[i*6 + 2 ] & mask);
1002             b[i*6 + 3 ] = (byte)(a[i*6 + 3 ] & mask);
1003         }
1004         return new Object[]{ a, b };
1005     }
1006 
1007     @Test
1008     @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE + "min(max_int, max_long)", "> 0",
1009                   IRNode.LOAD_VECTOR_L, IRNode.VECTOR_SIZE + "min(max_int, max_long)", "> 0",
1010                   IRNode.ADD_VI, IRNode.VECTOR_SIZE + "min(max_int, max_long)", "> 0",
1011                   IRNode.ADD_VL, IRNode.VECTOR_SIZE + "min(max_int, max_long)", "> 0",
1012                   IRNode.STORE_VECTOR, "> 0"},
1013         applyIfPlatform = {"64-bit", "true"},
1014         applyIfCPUFeatureOr = {"avx2", "true"})
1015     // require avx to ensure vectors are larger than what unrolling produces
         // Mixed element sizes in one loop: increments an int[] and a long[] at the same index,
         // unit stride from 0. The IR rule is checked only with avx2 and expects the int and long
         // vector nodes to share the element count min(max_int, max_long).
         // Returns both arrays (presumably for result verification by the caller - confirm).
1016     static Object[] test13aIL(int[] a, long[] b) {
1017         for (int i = 0; i < RANGE; i++) {
1018             a[i]++;
1019             b[i]++;
1020         }
1021         return new Object[]{ a, b };
1022     }
1023 
1024     @Test
1025     @IR(counts = {IRNode.LOAD_VECTOR_B, "> 0",
1026                   IRNode.LOAD_VECTOR_I, "> 0",
1027                   IRNode.ADD_VB, "> 0",
1028                   IRNode.ADD_VI, "> 0",
1029                   IRNode.STORE_VECTOR, "> 0"},
1030         applyIfPlatform = {"64-bit", "true"},
1031         applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true"})
         // Mixed element sizes in one loop: increments an int[] and a byte[] at the same index,
         // unit stride from 0. The IR rule (avx2 or asimd, no AlignVector condition) expects
         // both byte and int vector nodes; no vector size is pinned.
         // Returns both arrays (presumably for result verification by the caller - confirm).
1032     static Object[] test13aIB(int[] a, byte[] b) {
1033         for (int i = 0; i < RANGE; i++) {
1034             a[i]++;
1035             b[i]++;
1036         }
1037         return new Object[]{ a, b };
1038     }
1039 
1040     @Test
1041     @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0",
1042                   IRNode.LOAD_VECTOR_S, "> 0",
1043                   IRNode.ADD_VI, "> 0",
1044                   IRNode.ADD_VS, "> 0",
1045                   IRNode.STORE_VECTOR, "> 0"},
1046         applyIfPlatform = {"64-bit", "true"},
1047         applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true"})
         // Mixed element sizes in one loop: increments an int[] and a short[] at the same index,
         // unit stride from 0. The IR rule (avx2 or asimd, no AlignVector condition) expects
         // both short and int vector nodes; no vector size is pinned.
         // Returns both arrays (presumably for result verification by the caller - confirm).
1048     static Object[] test13aIS(int[] a, short[] b) {
1049         for (int i = 0; i < RANGE; i++) {
1050             a[i]++;
1051             b[i]++;
1052         }
1053         return new Object[]{ a, b };
1054     }
1055 
1056     @Test
1057     @IR(counts = {IRNode.LOAD_VECTOR_B, "> 0",
1058                   IRNode.LOAD_VECTOR_S, "> 0",
1059                   IRNode.LOAD_VECTOR_I, "> 0",
1060                   IRNode.LOAD_VECTOR_L, "> 0",
1061                   IRNode.ADD_VB, "> 0",
1062                   IRNode.ADD_VS, "> 0",
1063                   IRNode.ADD_VI, "> 0",
1064                   IRNode.ADD_VL, "> 0",
1065                   IRNode.STORE_VECTOR, "> 0"},
1066         applyIfPlatform = {"64-bit", "true"},
1067         applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true"})
         // Four element sizes in one loop: increments a byte[], short[], int[] and long[] at the
         // same index, unit stride from 0. The IR rule (avx2 or asimd, no AlignVector condition)
         // expects vector load and add nodes for all four types.
         // Returns all four arrays (presumably for result verification by the caller - confirm).
1068     static Object[] test13aBSIL(byte[] a, short[] b, int[] c, long[] d) {
1069         for (int i = 0; i < RANGE; i++) {
1070             a[i]++;
1071             b[i]++;
1072             c[i]++;
1073             d[i]++;
1074         }
1075         return new Object[]{ a, b, c, d };
1076     }
1077 
1078     @Test
1079     @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE + "min(max_int, max_long)", "> 0",
1080                   IRNode.LOAD_VECTOR_L, IRNode.VECTOR_SIZE + "min(max_int, max_long)", "> 0",
1081                   IRNode.ADD_VI, IRNode.VECTOR_SIZE + "min(max_int, max_long)", "> 0",
1082                   IRNode.ADD_VL, IRNode.VECTOR_SIZE + "min(max_int, max_long)", "> 0",
1083                   IRNode.STORE_VECTOR, "> 0"},
1084         applyIfPlatform = {"64-bit", "true"},
1085         applyIfCPUFeatureOr = {"avx2", "true"})
1086     // require avx to ensure vectors are larger than what unrolling produces
         // Same as test13aIL (int[] and long[] incremented together) but the loop starts at
         // index 1. The IR rule is checked only with avx2 and expects the int and long vector
         // nodes to share the element count min(max_int, max_long).
         // Returns both arrays (presumably for result verification by the caller - confirm).
1087     static Object[] test13bIL(int[] a, long[] b) {
1088         for (int i = 1; i < RANGE; i++) {
1089             a[i]++;
1090             b[i]++;
1091         }
1092         return new Object[]{ a, b };
1093     }
1094 
1095     @Test
1096     @IR(counts = {IRNode.LOAD_VECTOR_B, "> 0",
1097                   IRNode.LOAD_VECTOR_I, "> 0",
1098                   IRNode.ADD_VB, "> 0",
1099                   IRNode.ADD_VI, "> 0",
1100                   IRNode.STORE_VECTOR, "> 0"},
1101         applyIfPlatform = {"64-bit", "true"},
1102         applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true"})
         // Same as test13aIB (int[] and byte[] incremented together) but the loop starts at
         // index 1. The IR rule (avx2 or asimd, no AlignVector condition) expects both byte and
         // int vector nodes.
         // Returns both arrays (presumably for result verification by the caller - confirm).
1103     static Object[] test13bIB(int[] a, byte[] b) {
1104         for (int i = 1; i < RANGE; i++) {
1105             a[i]++;
1106             b[i]++;
1107         }
1108         return new Object[]{ a, b };
1109     }
1110 
1111     @Test
1112     @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0",
1113                   IRNode.LOAD_VECTOR_S, "> 0",
1114                   IRNode.ADD_VI, "> 0",
1115                   IRNode.ADD_VS, "> 0",
1116                   IRNode.STORE_VECTOR, "> 0"},
1117         applyIfPlatform = {"64-bit", "true"},
1118         applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true"})
         // Same as test13aIS (int[] and short[] incremented together) but the loop starts at
         // index 1. The IR rule (avx2 or asimd, no AlignVector condition) expects both short and
         // int vector nodes.
         // Returns both arrays (presumably for result verification by the caller - confirm).
1119     static Object[] test13bIS(int[] a, short[] b) {
1120         for (int i = 1; i < RANGE; i++) {
1121             a[i]++;
1122             b[i]++;
1123         }
1124         return new Object[]{ a, b };
1125     }
1126 
1127     @Test
1128     @IR(counts = {IRNode.LOAD_VECTOR_B, "> 0",
1129                   IRNode.LOAD_VECTOR_S, "> 0",
1130                   IRNode.LOAD_VECTOR_I, "> 0",
1131                   IRNode.LOAD_VECTOR_L, "> 0",
1132                   IRNode.ADD_VB, "> 0",
1133                   IRNode.ADD_VS, "> 0",
1134                   IRNode.ADD_VI, "> 0",
1135                   IRNode.ADD_VL, "> 0",
1136                   IRNode.STORE_VECTOR, "> 0"},
1137         applyIfPlatform = {"64-bit", "true"},
1138         applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true"})
         // Same as test13aBSIL (byte[], short[], int[] and long[] incremented together) but the
         // loop starts at index 1. The IR rule (avx2 or asimd, no AlignVector condition) expects
         // vector load and add nodes for all four types.
         // Returns all four arrays (presumably for result verification by the caller - confirm).
1139     static Object[] test13bBSIL(byte[] a, short[] b, int[] c, long[] d) {
1140         for (int i = 1; i < RANGE; i++) {
1141             a[i]++;
1142             b[i]++;
1143             c[i]++;
1144             d[i]++;
1145         }
1146         return new Object[]{ a, b, c, d };
1147     }
1148 
1149     @Test
1150     @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
1151                   IRNode.ADD_VB, "= 0",
1152                   IRNode.STORE_VECTOR, "= 0"},
1153         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
1154         applyIfPlatform = {"64-bit", "true"},
1155         applyIf = {"AlignVector", "false"})
1156     @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
1157                   IRNode.ADD_VB, "= 0",
1158                   IRNode.STORE_VECTOR, "= 0"},
1159         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
1160         applyIfPlatform = {"64-bit", "true"},
1161         applyIf = {"AlignVector", "true"})
         // Single byte array, stride 9, 16 consecutive increments (a[i+0] .. a[i+15]) per
         // iteration, so consecutive iterations touch overlapping element ranges.
         // Both IR rules (AlignVector=false and AlignVector=true) expect zero vector nodes.
         // Returns the array (presumably for result verification by the caller - confirm).
1162     static Object[] test14aB(byte[] a) {
1163         // non-power-of-2 stride
1164         for (int i = 0; i < RANGE-20; i+=9) {
1165             // Since the stride is shorter than the vector length, there will be always
1166             // partial overlap of loads with previous stores, this leads to failure in
1167             // store-to-load-forwarding -> vectorization not profitable.
1168             a[i+0]++;
1169             a[i+1]++;
1170             a[i+2]++;
1171             a[i+3]++;
1172             a[i+4]++;
1173             a[i+5]++;
1174             a[i+6]++;
1175             a[i+7]++;
1176             a[i+8]++;
1177             a[i+9]++;
1178             a[i+10]++;
1179             a[i+11]++;
1180             a[i+12]++;
1181             a[i+13]++;
1182             a[i+14]++;
1183             a[i+15]++;
1184         }
1185         return new Object[]{ a };
1186     }
1187 
1188     @Test
1189     @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
1190                   IRNode.ADD_VB, "= 0",
1191                   IRNode.STORE_VECTOR, "= 0"},
1192         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
1193         applyIfPlatform = {"64-bit", "true"},
1194         applyIf = {"AlignVector", "false"})
1195     @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
1196                   IRNode.ADD_VB, "= 0",
1197                   IRNode.STORE_VECTOR, "= 0"},
1198         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
1199         applyIfPlatform = {"64-bit", "true"},
1200         applyIf = {"AlignVector", "true"})
         // Single byte array, stride 3, 16 consecutive increments (a[i+0] .. a[i+15]) per
         // iteration, so consecutive iterations touch overlapping element ranges.
         // Both IR rules (AlignVector=false and AlignVector=true) expect zero vector nodes.
         // Returns the array (presumably for result verification by the caller - confirm).
1201     static Object[] test14bB(byte[] a) {
1202         // non-power-of-2 stride
1203         for (int i = 0; i < RANGE-20; i+=3) {
1204             // Since the stride is shorter than the vector length, there will be always
1205             // partial overlap of loads with previous stores, this leads to failure in
1206             // store-to-load-forwarding -> vectorization not profitable.
1207             a[i+0]++;
1208             a[i+1]++;
1209             a[i+2]++;
1210             a[i+3]++;
1211             a[i+4]++;
1212             a[i+5]++;
1213             a[i+6]++;
1214             a[i+7]++;
1215             a[i+8]++;
1216             a[i+9]++;
1217             a[i+10]++;
1218             a[i+11]++;
1219             a[i+12]++;
1220             a[i+13]++;
1221             a[i+14]++;
1222             a[i+15]++;
1223         }
1224         return new Object[]{ a };
1225     }
1226 
1227     @Test
1228     @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
1229                   IRNode.ADD_VB, "= 0",
1230                   IRNode.STORE_VECTOR, "= 0"},
1231         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
1232         applyIfPlatform = {"64-bit", "true"},
1233         applyIf = {"AlignVector", "false"})
1234     @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
1235                   IRNode.ADD_VB, "= 0",
1236                   IRNode.STORE_VECTOR, "= 0"},
1237         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
1238         applyIfPlatform = {"64-bit", "true"},
1239         applyIf = {"AlignVector", "true"})
         // Single byte array, stride 5, 16 consecutive increments (a[i+0] .. a[i+15]) per
         // iteration, so consecutive iterations touch overlapping element ranges.
         // Both IR rules (AlignVector=false and AlignVector=true) expect zero vector nodes.
         // Returns the array (presumably for result verification by the caller - confirm).
1240     static Object[] test14cB(byte[] a) {
1241         // non-power-of-2 stride
1242         for (int i = 0; i < RANGE-20; i+=5) {
1243             // Since the stride is shorter than the vector length, there will be always
1244             // partial overlap of loads with previous stores, this leads to failure in
1245             // store-to-load-forwarding -> vectorization not profitable.
1246             a[i+0]++;
1247             a[i+1]++;
1248             a[i+2]++;
1249             a[i+3]++;
1250             a[i+4]++;
1251             a[i+5]++;
1252             a[i+6]++;
1253             a[i+7]++;
1254             a[i+8]++;
1255             a[i+9]++;
1256             a[i+10]++;
1257             a[i+11]++;
1258             a[i+12]++;
1259             a[i+13]++;
1260             a[i+14]++;
1261             a[i+15]++;
1262         }
1263         return new Object[]{ a };
1264     }
1265 
1266     @Test
1267     @IR(counts = {IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE + "min(max_byte, 8)", "> 0",
1268                   IRNode.ADD_VB,        IRNode.VECTOR_SIZE + "min(max_byte, 8)", "> 0",
1269                   IRNode.STORE_VECTOR,                                           "> 0"},
1270         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
1271         applyIfPlatform = {"64-bit", "true"},
1272         applyIf = {"AlignVector", "false"})
1273     @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
1274                   IRNode.ADD_VB, "= 0",
1275                   IRNode.STORE_VECTOR, "= 0"},
1276         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
1277         applyIfPlatform = {"64-bit", "true"},
1278         applyIf = {"AlignVector", "true"})
         // Single byte array, stride 9, only 8 consecutive increments (a[i+0] .. a[i+7]) per
         // iteration, so the ranges touched by consecutive iterations do not overlap.
         // Rule 1 (AlignVector=false): byte vectors of size min(max_byte, 8) expected.
         // Rule 2 (AlignVector=true): no vector load/add/store nodes expected.
         // Returns the array (presumably for result verification by the caller - confirm).
1279     static Object[] test14dB(byte[] a) {
1280         // non-power-of-2 stride
1281         for (int i = 0; i < RANGE-20; i+=9) {
1282             a[i+0]++;
1283             a[i+1]++;
1284             a[i+2]++;
1285             a[i+3]++;
1286             a[i+4]++;
1287             a[i+5]++;
1288             a[i+6]++;
1289             a[i+7]++;
1290         }
1291         return new Object[]{ a };
1292     }
1293 
1294     @Test
1295     @IR(counts = {IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE + "min(max_byte, 8)", "> 0",
1296                   IRNode.ADD_VB,        IRNode.VECTOR_SIZE + "min(max_byte, 8)", "> 0",
1297                   IRNode.STORE_VECTOR,                                           "> 0"},
1298         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
1299         applyIfPlatform = {"64-bit", "true"},
1300         applyIf = {"AlignVector", "false"})
1301     @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
1302                   IRNode.ADD_VB, "= 0",
1303                   IRNode.STORE_VECTOR, "= 0"},
1304         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
1305         applyIfPlatform = {"64-bit", "true"},
1306         applyIf = {"AlignVector", "true"})
         // Single byte array, stride 11, 8 consecutive increments (a[i+0] .. a[i+7]) per
         // iteration, so the ranges touched by consecutive iterations do not overlap.
         // Rule 1 (AlignVector=false): byte vectors of size min(max_byte, 8) expected.
         // Rule 2 (AlignVector=true): no vector load/add/store nodes expected.
         // Returns the array (presumably for result verification by the caller - confirm).
1307     static Object[] test14eB(byte[] a) {
1308         // non-power-of-2 stride
1309         for (int i = 0; i < RANGE-32; i+=11) {
1310             a[i+0]++;
1311             a[i+1]++;
1312             a[i+2]++;
1313             a[i+3]++;
1314             a[i+4]++;
1315             a[i+5]++;
1316             a[i+6]++;
1317             a[i+7]++;
1318         }
1319         return new Object[]{ a };
1320     }
1321 
1322     @Test
1323     @IR(counts = {IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE + "min(max_byte, 8)", "> 0",
1324                   IRNode.ADD_VB,        IRNode.VECTOR_SIZE + "min(max_byte, 8)", "> 0",
1325                   IRNode.STORE_VECTOR,                                           "> 0"},
1326         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
1327         applyIfPlatform = {"64-bit", "true"},
1328         applyIf = {"AlignVector", "false"})
1329     @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
1330                   IRNode.ADD_VB, "= 0",
1331                   IRNode.STORE_VECTOR, "= 0"},
1332         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
1333         applyIfPlatform = {"64-bit", "true"},
1334         applyIf = {"AlignVector", "true"})
         // Single byte array, stride 12, 8 consecutive increments (a[i+0] .. a[i+7]) per
         // iteration, so the ranges touched by consecutive iterations do not overlap.
         // Rule 1 (AlignVector=false): byte vectors of size min(max_byte, 8) expected.
         // Rule 2 (AlignVector=true): no vector load/add/store nodes expected.
         // Returns the array (presumably for result verification by the caller - confirm).
1335     static Object[] test14fB(byte[] a) {
1336         // non-power-of-2 stride
1337         for (int i = 0; i < RANGE-40; i+=12) {
1338             a[i+0]++;
1339             a[i+1]++;
1340             a[i+2]++;
1341             a[i+3]++;
1342             a[i+4]++;
1343             a[i+5]++;
1344             a[i+6]++;
1345             a[i+7]++;
1346         }
1347         return new Object[]{ a };
1348     }
1349 
1350     @Test
1351     // IR rules difficult because of modulo wrapping with offset after peeling.
         // Single byte array, unit loop stride with the induction variable scaled by 53:
         // each iteration increments the 16 consecutive elements a[53*i+0] .. a[53*i+15].
         // No @IR rule is attached, so no vectorization outcome is asserted for this method.
         // Returns the array (presumably for result verification by the caller - confirm).
1352     static Object[] test15aB(byte[] a) {
1353         // non-power-of-2 scale
1354         for (int i = 0; i < RANGE/64-20; i++) {
1355             a[53*i+0]++;
1356             a[53*i+1]++;
1357             a[53*i+2]++;
1358             a[53*i+3]++;
1359             a[53*i+4]++;
1360             a[53*i+5]++;
1361             a[53*i+6]++;
1362             a[53*i+7]++;
1363             a[53*i+8]++;
1364             a[53*i+9]++;
1365             a[53*i+10]++;
1366             a[53*i+11]++;
1367             a[53*i+12]++;
1368             a[53*i+13]++;
1369             a[53*i+14]++;
1370             a[53*i+15]++;
1371         }
1372         return new Object[]{ a };
1373     }
1374 
1375     @Test
1376     // IR rules difficult because of modulo wrapping with offset after peeling.
         // Single byte array, unit loop stride with the induction variable scaled by 25:
         // each iteration increments the 16 consecutive elements a[25*i+0] .. a[25*i+15].
         // No @IR rule is attached, so no vectorization outcome is asserted for this method.
         // Returns the array (presumably for result verification by the caller - confirm).
1377     static Object[] test15bB(byte[] a) {
1378         // non-power-of-2 scale
1379         for (int i = 0; i < RANGE/64-20; i++) {
1380             a[25*i+0]++;
1381             a[25*i+1]++;
1382             a[25*i+2]++;
1383             a[25*i+3]++;
1384             a[25*i+4]++;
1385             a[25*i+5]++;
1386             a[25*i+6]++;
1387             a[25*i+7]++;
1388             a[25*i+8]++;
1389             a[25*i+9]++;
1390             a[25*i+10]++;
1391             a[25*i+11]++;
1392             a[25*i+12]++;
1393             a[25*i+13]++;
1394             a[25*i+14]++;
1395             a[25*i+15]++;
1396         }
1397         return new Object[]{ a };
1398     }
1399 
1400     @Test
1401     // IR rules difficult because of modulo wrapping with offset after peeling.
         // Single byte array, unit loop stride with the induction variable scaled by 19:
         // each iteration increments the 16 consecutive elements a[19*i+0] .. a[19*i+15].
         // No @IR rule is attached, so no vectorization outcome is asserted for this method.
         // Returns the array (presumably for result verification by the caller - confirm).
1402     static Object[] test15cB(byte[] a) {
1403         // non-power-of-2 scale
1404         for (int i = 0; i < RANGE/64-20; i++) {
1405             a[19*i+0]++;
1406             a[19*i+1]++;
1407             a[19*i+2]++;
1408             a[19*i+3]++;
1409             a[19*i+4]++;
1410             a[19*i+5]++;
1411             a[19*i+6]++;
1412             a[19*i+7]++;
1413             a[19*i+8]++;
1414             a[19*i+9]++;
1415             a[19*i+10]++;
1416             a[19*i+11]++;
1417             a[19*i+12]++;
1418             a[19*i+13]++;
1419             a[19*i+14]++;
1420             a[19*i+15]++;
1421         }
1422         return new Object[]{ a };
1423     }
1424 
1425     @Test
         // No IR rule: this method only carries @Test.
         //
         // Kernel over a byte array and a short array, both indexed with scale 2.
         // Per iteration it increments 15 bytes a[2*i+0] .. a[2*i+14] and
         // 4 shorts b[2*i+0] .. b[2*i+3]. Because the stride (2) is smaller than either
         // range, the ranges of successive iterations overlap, so most elements are
         // incremented several times over the whole loop.
         // Returns both (mutated) arrays wrapped in an Object[].
1426     static Object[] test16a(byte[] a, short[] b) {
1427         // infinite loop issues
             // NOTE(review): presumably a regression case for a compiler-side non-termination
             // problem - the code itself does not show which one; confirm against the bug history.
1428         for (int i = 0; i < RANGE/2-20; i++) {
                 // 15 (not 16) byte increments: the count is deliberately not a power of 2.
1429             a[2*i+0]++;
1430             a[2*i+1]++;
1431             a[2*i+2]++;
1432             a[2*i+3]++;
1433             a[2*i+4]++;
1434             a[2*i+5]++;
1435             a[2*i+6]++;
1436             a[2*i+7]++;
1437             a[2*i+8]++;
1438             a[2*i+9]++;
1439             a[2*i+10]++;
1440             a[2*i+11]++;
1441             a[2*i+12]++;
1442             a[2*i+13]++;
1443             a[2*i+14]++;
1444 
                 // 4 short increments in the same loop body, using the same index scale.
1445             b[2*i+0]++;
1446             b[2*i+1]++;
1447             b[2*i+2]++;
1448             b[2*i+3]++;
1449         }
1450         return new Object[]{ a, b };
1451     }
1452 
1453     @Test
         // No IR rule: this method only carries @Test.
         //
         // Byte-only kernel with index scale 2.
         // Per iteration it increments the 15 bytes a[2*i+0] .. a[2*i+14]; with a stride
         // of 2 the ranges of successive iterations overlap.
         // Returns the (mutated) input array wrapped in an Object[].
1454     static Object[] test16b(byte[] a) {
1455         // infinite loop issues
             // NOTE(review): presumably a regression case for a compiler-side non-termination
             // problem - the code itself does not show which one; confirm against the bug history.
1456         for (int i = 0; i < RANGE/2-20; i++) {
                 // 15 (not 16) byte increments: the count is deliberately not a power of 2.
1457             a[2*i+0]++;
1458             a[2*i+1]++;
1459             a[2*i+2]++;
1460             a[2*i+3]++;
1461             a[2*i+4]++;
1462             a[2*i+5]++;
1463             a[2*i+6]++;
1464             a[2*i+7]++;
1465             a[2*i+8]++;
1466             a[2*i+9]++;
1467             a[2*i+10]++;
1468             a[2*i+11]++;
1469             a[2*i+12]++;
1470             a[2*i+13]++;
1471             a[2*i+14]++;
1472         }
1473         return new Object[]{ a };
1474     }
1475 
1476     @Test
1477     @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0",
1478                   IRNode.ADD_VL, "> 0",
1479                   IRNode.STORE_VECTOR, "> 0"},
1480         applyIfPlatform = {"64-bit", "true"},
1481         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
         // IR rule above: on 64-bit platforms with sse4.1 or asimd, expect at least one
         // long vector load, long vector add and vector store.
         //
         // Increments every element of a long array, but through Unsafe instead of a[i].
         // The byte address of element i is ARRAY_LONG_BASE_OFFSET + 8*i, i.e. exactly
         // on an element boundary. Returns the (mutated) array wrapped in an Object[].
1482     static Object[] test17a(long[] a) {
1483         // Unsafe: vectorizes with profiling (not xcomp)
1484         for (int i = 0; i < RANGE; i++) {
                 // 8L forces the offset computation to be done in long arithmetic.
1485             long adr = UNSAFE.ARRAY_LONG_BASE_OFFSET + 8L * i;
1486             long v = UNSAFE.getLongUnaligned(a, adr);
1487             UNSAFE.putLongUnaligned(a, adr, v + 1);
1488         }
1489         return new Object[]{ a };
1490     }
1491 
1492     @Test
1493     // Difficult to write good IR rule. Modulo calculus overflow can create non-power-of-2 packs.
         //
         // Like the element-aligned Unsafe variant, but every access is shifted by one byte:
         // the address is ARRAY_LONG_BASE_OFFSET + 8*i + 1, so each 8-byte read/write
         // covers bytes 1..7 of element i and byte 0 of element i+1.
         // Returns the (mutated) array wrapped in an Object[].
1494     static Object[] test17b(long[] a) {
1495         // Not alignable
             // The loop stops at RANGE-1 (one iteration fewer than the aligned variant).
             // NOTE(review): presumably so the access spilling into element i+1 stays inside
             // the array - confirm against the array length used by the caller.
1496         for (int i = 0; i < RANGE-1; i++) {
1497             long adr = UNSAFE.ARRAY_LONG_BASE_OFFSET + 8L * i + 1;
1498             long v = UNSAFE.getLongUnaligned(a, adr);
1499             UNSAFE.putLongUnaligned(a, adr, v + 1);
1500         }
1501         return new Object[]{ a };
1502     }
1503 
1504     @Test
1505     @IR(counts = {IRNode.LOAD_VECTOR_L, IRNode.VECTOR_SIZE_2, "> 0",
1506                   IRNode.ADD_VL,        IRNode.VECTOR_SIZE_2, "> 0",
1507                   IRNode.STORE_VECTOR, "> 0"},
1508         applyIf = {"MaxVectorSize", ">=32"},
1509         applyIfPlatform = {"64-bit", "true"},
1510         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
         // IR rule above: with MaxVectorSize >= 32, on 64-bit platforms with sse4.1 or asimd,
         // expect 2-element long vector loads and adds, plus at least one vector store.
         //
         // Unsafe kernel with loop stride 4 that touches only two adjacent longs per iteration:
         // elements i and i+1 (byte offsets adr+0 and adr+8). Elements i+2 and i+3 are skipped.
         // Returns the (mutated) array wrapped in an Object[].
1511     static Object[] test17c(long[] a) {
1512         // Unsafe: aligned vectorizes
1513         for (int i = 0; i < RANGE-1; i+=4) {
1514             long adr = UNSAFE.ARRAY_LONG_BASE_OFFSET + 8L * i;
                 // Both loads are issued before either store.
1515             long v0 = UNSAFE.getLongUnaligned(a, adr + 0);
1516             long v1 = UNSAFE.getLongUnaligned(a, adr + 8);
1517             UNSAFE.putLongUnaligned(a, adr + 0, v0 + 1);
1518             UNSAFE.putLongUnaligned(a, adr + 8, v1 + 1);
1519         }
1520         return new Object[]{ a };
1521     }
1522 
1523     @Test
1524     @IR(counts = {IRNode.LOAD_VECTOR_L, IRNode.VECTOR_SIZE_2, "> 0",
1525                   IRNode.ADD_VL,        IRNode.VECTOR_SIZE_2, "> 0",
1526                   IRNode.STORE_VECTOR, "> 0"},
1527         applyIfCPUFeatureOr = {"avx512", "true", "asimd", "true"},
1528         applyIfPlatform = {"64-bit", "true"},
1529         applyIfAnd = {"AlignVector", "false", "MaxVectorSize", ">=64"})
1530     // Ensure vector width is large enough to fit 64 byte for longs:
1531     // The offsets are: 25, 33, 57, 65
1532     // In modulo 32:    25,  1, 25,  1  -> does not vectorize
1533     // In modulo 64:    25, 33, 57,  1  -> at least first pair vectorizes
1534     // This problem is because we compute modulo vector width in memory_alignment.
         // NOTE(review): the offsets listed above correspond to an array base offset of 24
         // (24+1, 24+9, 24+33, 24+41); the base offset is not fixed by this code - confirm.
1535     @IR(counts = {IRNode.LOAD_VECTOR_L, "= 0",
1536                   IRNode.ADD_VL, "= 0",
1537                   IRNode.STORE_VECTOR, "= 0"},
1538         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
1539         applyIfPlatform = {"64-bit", "true"},
1540         applyIf = {"AlignVector", "true"})
         // Second IR rule: with AlignVector enabled, expect no long vector load, no long
         // vector add and no vector store at all.
         //
         // Unsafe kernel with loop stride 4 touching two adjacent 8-byte slots per iteration,
         // each shifted by one byte from an element boundary (adr = base + 8*i + 1).
         // Returns the (mutated) array wrapped in an Object[].
1541     static Object[] test17d(long[] a) {
1542         // Not alignable
1543         for (int i = 0; i < RANGE-1; i+=4) {
1544             long adr = UNSAFE.ARRAY_LONG_BASE_OFFSET + 8L * i + 1;
                 // Both loads are issued before either store.
1545             long v0 = UNSAFE.getLongUnaligned(a, adr + 0);
1546             long v1 = UNSAFE.getLongUnaligned(a, adr + 8);
1547             UNSAFE.putLongUnaligned(a, adr + 0, v0 + 1);
1548             UNSAFE.putLongUnaligned(a, adr + 8, v1 + 1);
1549         }
1550         return new Object[]{ a };
1551     }
1552 
1553     @Test
         // No IR rule: this method only carries @Test.
         //
         // Mixes loop-invariant byte stores with an iv-indexed int store.
         // Every iteration writes 1 to a[0] and a[1] (constant indices, independent of i)
         // and 2 to b[i]. Returns both (mutated) arrays wrapped in an Object[].
1554     static Object[] test18a(byte[] a, int[] b) {
1555         // scale = 0  -->  no iv
1556         for (int i = 0; i < RANGE; i++) {
                 // The two adjacent constant-index byte stores are separated by the int store.
1557             a[0] = 1;
1558             b[i] = 2;
1559             a[1] = 1;
1560         }
1561         return new Object[]{ a, b };
1562     }
1563 
1564     @Test
         // No IR rule: this method only carries @Test.
         //
         // Same shape as the variant storing to a[0]/a[1], but the constant byte indices
         // are 1 and 2, i.e. the adjacent pair starts at an odd index.
         // Every iteration writes 1 to a[1] and a[2] and 2 to b[i].
         // Returns both (mutated) arrays wrapped in an Object[].
1565     static Object[] test18b(byte[] a, int[] b) {
1566         // scale = 0  -->  no iv
1567         for (int i = 0; i < RANGE; i++) {
                 // The two adjacent constant-index byte stores are separated by the int store.
1568             a[1] = 1;
1569             b[i] = 2;
1570             a[2] = 1;
1571         }
1572         return new Object[]{ a, b };
1573     }
1574 
1575     @Test
         // No IR rule: this method only carries @Test.
         //
         // Copies a 5000-element window from b to a using a count-down loop.
         // The index is RANGE_FINAL - i, so while i runs 5000, 4999, ..., 1 the accessed
         // index ascends from RANGE_FINAL-5000 to RANGE_FINAL-1.
         // Returns both arrays (a mutated, b only read) wrapped in an Object[].
1576     static Object[] test19(int[] a, int[] b) {
             // Negative iv stride combined with a negated iv in the index expression.
1577         for (int i = 5000; i > 0; i--) {
1578             a[RANGE_FINAL - i] = b[RANGE_FINAL - i];
1579         }
1580         return new Object[]{ a, b };
1581     }
1582 
1583     @Test
         // No IR rule: this method only carries @Test.
         //
         // Byte kernel with index scale 2, constant offset 30 and loop start i = 1.
         // Each iteration increments the 4 consecutive bytes a[2*i+30] .. a[2*i+33];
         // with a stride of 2 the ranges of successive iterations overlap by 2 bytes.
         // Returns the (mutated) array wrapped in an Object[].
1584     static Object[] test20(byte[] a) {
1585         // Example where it is easy to pass alignment check,
1586         // but used to fail the alignment calculation
             // First index touched is 2*1+0+30 = 32; the last is 2*(RANGE/2-51)+3+30.
1587         for (int i = 1; i < RANGE/2-50; i++) {
1588             a[2*i+0+30]++;
1589             a[2*i+1+30]++;
1590             a[2*i+2+30]++;
1591             a[2*i+3+30]++;
1592         }
1593         return new Object[]{ a };
1594     }
1595 }