1 /*
   2  * Copyright (c) 2024, 2025, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  */
  23 
  24 package compiler.loopopts.superword;
  25 
  26 import compiler.lib.ir_framework.*;
  27 import jdk.test.lib.Utils;
  28 import jdk.test.whitebox.WhiteBox;
  29 import jdk.internal.misc.Unsafe;
  30 import java.lang.reflect.Array;
  31 import java.util.Map;
  32 import java.util.HashMap;
  33 import java.util.Random;
  34 import java.nio.ByteOrder;
  35 
  36 /*
  37  * @test id=NoAlignVector
  38  * @bug 8310190
  39  * @summary Test AlignVector with various loop init, stride, scale, invar, etc.
  40  * @modules java.base/jdk.internal.misc
  41  * @library /test/lib /
  42  * @run driver compiler.loopopts.superword.TestAlignVector NoAlignVector
  43  */
  44 
  45 /*
  46  * @test id=AlignVector
  47  * @bug 8310190
  48  * @summary Test AlignVector with various loop init, stride, scale, invar, etc.
  49  * @modules java.base/jdk.internal.misc
  50  * @library /test/lib /
  51  * @run driver compiler.loopopts.superword.TestAlignVector AlignVector
  52  */
  53 
  54 /*
  55  * @test id=VerifyAlignVector
  56  * @bug 8310190
  57  * @summary Test AlignVector with various loop init, stride, scale, invar, etc.
  58  * @modules java.base/jdk.internal.misc
  59  * @library /test/lib /
  60  * @run driver compiler.loopopts.superword.TestAlignVector VerifyAlignVector
  61  */
  62 
  63 /*
  64  * @test id=NoAlignVector-COH
  65  * @bug 8310190
  66  * @summary Test AlignVector with various loop init, stride, scale, invar, etc.
  67  * @modules java.base/jdk.internal.misc
  68  * @library /test/lib /
  69  * @run driver compiler.loopopts.superword.TestAlignVector NoAlignVector-COH
  70  */
  71 
  72 /*
  73  * @test id=VerifyAlignVector-COH
  74  * @bug 8310190
  75  * @summary Test AlignVector with various loop init, stride, scale, invar, etc.
  76  * @modules java.base/jdk.internal.misc
  77  * @library /test/lib /
  78  * @run driver compiler.loopopts.superword.TestAlignVector VerifyAlignVector-COH
  79  */
  80 
  81 public class TestAlignVector {
  82     static int RANGE = 1024*8;
  83     static int RANGE_FINAL = 1024*8;
  84     private static final Unsafe UNSAFE = Unsafe.getUnsafe();
  85     private static final Random RANDOM = Utils.getRandomInstance();
  86 
  87     // Inputs
  88     byte[] aB;
  89     byte[] bB;
  90     byte mB = (byte)31;
  91     short[] aS;
  92     short[] bS;
  93     short mS = (short)0xF0F0;
  94     int[] aI;
  95     int[] bI;
  96     int mI = 0xF0F0F0F0;
  97     long[] aL;
  98     long[] bL;
  99     long mL = 0xF0F0F0F0F0F0F0F0L;
 100 
 101     // List of tests
 102     Map<String,TestFunction> tests = new HashMap<String,TestFunction>();
 103 
 104     // List of gold, the results from the first run before compilation
 105     Map<String,Object[]> golds = new HashMap<String,Object[]>();
 106 
    // A registered test case. The constructor stores one lambda per test method under this type.
    interface TestFunction {
        // Runs the test once and returns the arrays it produced, which verify() compares against the gold run.
        Object[] run();
    }
 110 
 111     public static void main(String[] args) {
 112         TestFramework framework = new TestFramework(TestAlignVector.class);
 113         framework.addFlags("--add-modules", "java.base", "--add-exports", "java.base/jdk.internal.misc=ALL-UNNAMED",
 114                            "-XX:+IgnoreUnrecognizedVMOptions", "-XX:LoopUnrollLimit=250");
 115 
 116         switch (args[0]) {
 117             case "NoAlignVector"         -> { framework.addFlags("-XX:+UnlockExperimentalVMOptions", "-XX:-UseCompactObjectHeaders", "-XX:-AlignVector"); }
 118             case "AlignVector"           -> { framework.addFlags("-XX:+UnlockExperimentalVMOptions", "-XX:-UseCompactObjectHeaders", "-XX:+AlignVector"); }
 119             case "VerifyAlignVector"     -> { framework.addFlags("-XX:+UnlockExperimentalVMOptions", "-XX:-UseCompactObjectHeaders", "-XX:+AlignVector", "-XX:+IgnoreUnrecognizedVMOptions", "-XX:+VerifyAlignVector"); }
 120             case "NoAlignVector-COH"     -> { framework.addFlags("-XX:+UnlockExperimentalVMOptions", "-XX:+UseCompactObjectHeaders", "-XX:-AlignVector"); }
 121             case "VerifyAlignVector-COH" -> { framework.addFlags("-XX:+UnlockExperimentalVMOptions", "-XX:+UseCompactObjectHeaders", "-XX:+AlignVector", "-XX:+IgnoreUnrecognizedVMOptions", "-XX:+VerifyAlignVector"); }
 122             default -> { throw new RuntimeException("Test argument not recognized: " + args[0]); }
 123         }
 124         framework.start();
 125     }
 126 
 127     public TestAlignVector() {
 128         // Generate input once
 129         aB = generateB();
 130         bB = generateB();
 131         aS = generateS();
 132         bS = generateS();
 133         aI = generateI();
 134         bI = generateI();
 135         aL = generateL();
 136         bL = generateL();
 137 
 138         // Add all tests to list
 139         tests.put("test0",       () -> { return test0(aB.clone(), bB.clone(), mB); });
 140         tests.put("test1a",      () -> { return test1a(aB.clone(), bB.clone(), mB); });
 141         tests.put("test1b",      () -> { return test1b(aB.clone(), bB.clone(), mB); });
 142         tests.put("test2",       () -> { return test2(aB.clone(), bB.clone(), mB); });
 143         tests.put("test3",       () -> { return test3(aB.clone(), bB.clone(), mB); });
 144         tests.put("test4",       () -> { return test4(aB.clone(), bB.clone(), mB); });
 145         tests.put("test5",       () -> { return test5(aB.clone(), bB.clone(), mB, 0); });
 146         tests.put("test6",       () -> { return test6(aB.clone(), bB.clone(), mB); });
 147         tests.put("test7",       () -> { return test7(aS.clone(), bS.clone(), mS); });
 148         tests.put("test8",       () -> { return test8(aB.clone(), bB.clone(), mB, 0); });
 149         tests.put("test8",       () -> { return test8(aB.clone(), bB.clone(), mB, 1); });
 150         tests.put("test9",       () -> { return test9(aB.clone(), bB.clone(), mB); });
 151 
 152         tests.put("test10a",     () -> { return test10a(aB.clone(), bB.clone(), mB); });
 153         tests.put("test10b",     () -> { return test10b(aB.clone(), bB.clone(), mB); });
 154         tests.put("test10c",     () -> { return test10c(aS.clone(), bS.clone(), mS); });
 155         tests.put("test10d",     () -> { return test10d(aS.clone(), bS.clone(), mS); });
 156         tests.put("test10e",     () -> { return test10e(aS.clone(), bS.clone(), mS); });
 157 
 158         tests.put("test11aB",    () -> { return test11aB(aB.clone(), bB.clone(), mB); });
 159         tests.put("test11aS",    () -> { return test11aS(aS.clone(), bS.clone(), mS); });
 160         tests.put("test11aI",    () -> { return test11aI(aI.clone(), bI.clone(), mI); });
 161         tests.put("test11aL",    () -> { return test11aL(aL.clone(), bL.clone(), mL); });
 162 
 163         tests.put("test11bB",    () -> { return test11bB(aB.clone(), bB.clone(), mB); });
 164         tests.put("test11bS",    () -> { return test11bS(aS.clone(), bS.clone(), mS); });
 165         tests.put("test11bI",    () -> { return test11bI(aI.clone(), bI.clone(), mI); });
 166         tests.put("test11bL",    () -> { return test11bL(aL.clone(), bL.clone(), mL); });
 167 
 168         tests.put("test11cB",    () -> { return test11cB(aB.clone(), bB.clone(), mB); });
 169         tests.put("test11cS",    () -> { return test11cS(aS.clone(), bS.clone(), mS); });
 170         tests.put("test11cI",    () -> { return test11cI(aI.clone(), bI.clone(), mI); });
 171         tests.put("test11cL",    () -> { return test11cL(aL.clone(), bL.clone(), mL); });
 172 
 173         tests.put("test11dB",    () -> { return test11dB(aB.clone(), bB.clone(), mB, 0); });
 174         tests.put("test11dS",    () -> { return test11dS(aS.clone(), bS.clone(), mS, 0); });
 175         tests.put("test11dI",    () -> { return test11dI(aI.clone(), bI.clone(), mI, 0); });
 176         tests.put("test11dL",    () -> { return test11dL(aL.clone(), bL.clone(), mL, 0); });
 177 
 178         tests.put("test12",      () -> { return test12(aB.clone(), bB.clone(), mB); });
 179 
 180         tests.put("test13aIL",   () -> { return test13aIL(aI.clone(), aL.clone()); });
 181         tests.put("test13aIB",   () -> { return test13aIB(aI.clone(), aB.clone()); });
 182         tests.put("test13aIS",   () -> { return test13aIS(aI.clone(), aS.clone()); });
 183         tests.put("test13aBSIL", () -> { return test13aBSIL(aB.clone(), aS.clone(), aI.clone(), aL.clone()); });
 184 
 185         tests.put("test13bIL",   () -> { return test13bIL(aI.clone(), aL.clone()); });
 186         tests.put("test13bIB",   () -> { return test13bIB(aI.clone(), aB.clone()); });
 187         tests.put("test13bIS",   () -> { return test13bIS(aI.clone(), aS.clone()); });
 188         tests.put("test13bBSIL", () -> { return test13bBSIL(aB.clone(), aS.clone(), aI.clone(), aL.clone()); });
 189 
 190         tests.put("test14aB",    () -> { return test14aB(aB.clone()); });
 191         tests.put("test14bB",    () -> { return test14bB(aB.clone()); });
 192         tests.put("test14cB",    () -> { return test14cB(aB.clone()); });
 193         tests.put("test14dB",    () -> { return test14dB(aB.clone()); });
 194         tests.put("test14eB",    () -> { return test14eB(aB.clone()); });
 195         tests.put("test14fB",    () -> { return test14fB(aB.clone()); });
 196 
 197         tests.put("test15aB",    () -> { return test15aB(aB.clone()); });
 198         tests.put("test15bB",    () -> { return test15bB(aB.clone()); });
 199         tests.put("test15cB",    () -> { return test15cB(aB.clone()); });
 200 
 201         tests.put("test16a",     () -> { return test16a(aB.clone(), aS.clone()); });
 202         tests.put("test16b",     () -> { return test16b(aB.clone()); });
 203 
 204         tests.put("test17a",     () -> { return test17a(aL.clone()); });
 205         tests.put("test17b",     () -> { return test17b(aL.clone()); });
 206         tests.put("test17c",     () -> { return test17c(aL.clone()); });
 207         tests.put("test17d",     () -> { return test17d(aL.clone()); });
 208 
 209         tests.put("test18a",     () -> { return test18a(aB.clone(), aI.clone()); });
 210         tests.put("test18b",     () -> { return test18b(aB.clone(), aI.clone()); });
 211 
 212         tests.put("test19",      () -> { return test19(aI.clone(), bI.clone()); });
 213         tests.put("test20",      () -> { return test20(aB.clone()); });
 214 
 215         // Compute gold value for all test methods before compilation
 216         for (Map.Entry<String,TestFunction> entry : tests.entrySet()) {
 217             String name = entry.getKey();
 218             TestFunction test = entry.getValue();
 219             Object[] gold = test.run();
 220             golds.put(name, gold);
 221         }
 222     }
 223 
 224     @Warmup(100)
 225     @Run(test = {"test0",
 226                  "test1a",
 227                  "test1b",
 228                  "test2",
 229                  "test3",
 230                  "test4",
 231                  "test5",
 232                  "test6",
 233                  "test7",
 234                  "test8",
 235                  "test9",
 236                  "test10a",
 237                  "test10b",
 238                  "test10c",
 239                  "test10d",
 240                  "test10e",
 241                  "test11aB",
 242                  "test11aS",
 243                  "test11aI",
 244                  "test11aL",
 245                  "test11bB",
 246                  "test11bS",
 247                  "test11bI",
 248                  "test11bL",
 249                  "test11cB",
 250                  "test11cS",
 251                  "test11cI",
 252                  "test11cL",
 253                  "test11dB",
 254                  "test11dS",
 255                  "test11dI",
 256                  "test11dL",
 257                  "test12",
 258                  "test13aIL",
 259                  "test13aIB",
 260                  "test13aIS",
 261                  "test13aBSIL",
 262                  "test13bIL",
 263                  "test13bIB",
 264                  "test13bIS",
 265                  "test13bBSIL",
 266                  "test14aB",
 267                  "test14bB",
 268                  "test14cB",
 269                  "test14dB",
 270                  "test14eB",
 271                  "test14fB",
 272                  "test15aB",
 273                  "test15bB",
 274                  "test15cB",
 275                  "test16a",
 276                  "test16b",
 277                  "test17a",
 278                  "test17b",
 279                  "test17c",
 280                  "test17d",
 281                  "test18a",
 282                  "test18b",
 283                  "test19",
 284                  "test20"})
 285     public void runTests() {
 286         for (Map.Entry<String,TestFunction> entry : tests.entrySet()) {
 287             String name = entry.getKey();
 288             TestFunction test = entry.getValue();
 289             // Recall gold value from before compilation
 290             Object[] gold = golds.get(name);
 291             // Compute new result
 292             Object[] result = test.run();
 293             // Compare gold and new result
 294             verify(name, gold, result);
 295         }
 296     }
 297 
 298     static byte[] generateB() {
 299         byte[] a = new byte[RANGE];
 300         for (int i = 0; i < a.length; i++) {
 301             a[i] = (byte)RANDOM.nextInt();
 302         }
 303         return a;
 304     }
 305 
 306     static short[] generateS() {
 307         short[] a = new short[RANGE];
 308         for (int i = 0; i < a.length; i++) {
 309             a[i] = (short)RANDOM.nextInt();
 310         }
 311         return a;
 312     }
 313 
 314     static int[] generateI() {
 315         int[] a = new int[RANGE];
 316         for (int i = 0; i < a.length; i++) {
 317             a[i] = RANDOM.nextInt();
 318         }
 319         return a;
 320     }
 321 
 322     static long[] generateL() {
 323         long[] a = new long[RANGE];
 324         for (int i = 0; i < a.length; i++) {
 325             a[i] = RANDOM.nextLong();
 326         }
 327         return a;
 328     }
 329 
 330     static void verify(String name, Object[] gold, Object[] result) {
 331         if (gold.length != result.length) {
 332             throw new RuntimeException("verify " + name + ": not the same number of outputs: gold.length = " +
 333                                        gold.length + ", result.length = " + result.length);
 334         }
 335         for (int i = 0; i < gold.length; i++) {
 336             Object g = gold[i];
 337             Object r = result[i];
 338             if (g.getClass() != r.getClass() || !g.getClass().isArray() || !r.getClass().isArray()) {
 339                 throw new RuntimeException("verify " + name + ": must both be array of same type:" +
 340                                            " gold[" + i + "].getClass() = " + g.getClass().getSimpleName() +
 341                                            " result[" + i + "].getClass() = " + r.getClass().getSimpleName());
 342             }
 343             if (g == r) {
 344                 throw new RuntimeException("verify " + name + ": should be two separate arrays (with identical content):" +
 345                                            " gold[" + i + "] == result[" + i + "]");
 346             }
 347             if (Array.getLength(g) != Array.getLength(r)) {
 348                     throw new RuntimeException("verify " + name + ": arrays must have same length:" +
 349                                            " gold[" + i + "].length = " + Array.getLength(g) +
 350                                            " result[" + i + "].length = " + Array.getLength(r));
 351             }
 352             Class c = g.getClass().getComponentType();
 353             if (c == byte.class) {
 354                 verifyB(name, i, (byte[])g, (byte[])r);
 355             } else if (c == short.class) {
 356                 verifyS(name, i, (short[])g, (short[])r);
 357             } else if (c == int.class) {
 358                 verifyI(name, i, (int[])g, (int[])r);
 359             } else if (c == long.class) {
 360                 verifyL(name, i, (long[])g, (long[])r);
 361             } else {
 362                 throw new RuntimeException("verify " + name + ": array type not supported for verify:" +
 363                                        " gold[" + i + "].getClass() = " + g.getClass().getSimpleName() +
 364                                        " result[" + i + "].getClass() = " + r.getClass().getSimpleName());
 365             }
 366         }
 367     }
 368 
 369     static void verifyB(String name, int i, byte[] g, byte[] r) {
 370         for (int j = 0; j < g.length; j++) {
 371             if (g[j] != r[j]) {
 372                 throw new RuntimeException("verify " + name + ": arrays must have same content:" +
 373                                            " gold[" + i + "][" + j + "] = " + g[j] +
 374                                            " result[" + i + "][" + j + "] = " + r[j]);
 375             }
 376         }
 377     }
 378 
 379     static void verifyS(String name, int i, short[] g, short[] r) {
 380         for (int j = 0; j < g.length; j++) {
 381             if (g[j] != r[j]) {
 382                 throw new RuntimeException("verify " + name + ": arrays must have same content:" +
 383                                            " gold[" + i + "][" + j + "] = " + g[j] +
 384                                            " result[" + i + "][" + j + "] = " + r[j]);
 385             }
 386         }
 387     }
 388 
 389     static void verifyI(String name, int i, int[] g, int[] r) {
 390         for (int j = 0; j < g.length; j++) {
 391             if (g[j] != r[j]) {
 392                 throw new RuntimeException("verify " + name + ": arrays must have same content:" +
 393                                            " gold[" + i + "][" + j + "] = " + g[j] +
 394                                            " result[" + i + "][" + j + "] = " + r[j]);
 395             }
 396         }
 397     }
 398 
 399     static void verifyL(String name, int i, long[] g, long[] r) {
 400         for (int j = 0; j < g.length; j++) {
 401             if (g[j] != r[j]) {
 402                 throw new RuntimeException("verify " + name + ": arrays must have same content:" +
 403                                            " gold[" + i + "][" + j + "] = " + g[j] +
 404                                            " result[" + i + "][" + j + "] = " + r[j]);
 405             }
 406         }
 407     }
 408 
    /**
     * Masks the first four bytes (offsets 0..3) of every 8-element stride of {@code a}
     * into {@code b}; offsets 4..7 of each stride are not touched.
     * <p>
     * The IR rule expects LOAD_VECTOR_B and AND_VB nodes of VECTOR_SIZE_4 plus vector
     * stores when MaxVectorSize >= 8, on 64-bit platforms with sse4.1 or asimd. It has
     * no condition on AlignVector.
     *
     * @param a    source array, only read
     * @param b    destination array, written in place
     * @param mask value ANDed with every loaded element
     * @return {@code a} and {@code b}, in that order
     */
    @Test
    @IR(counts = {IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.AND_VB,        IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.STORE_VECTOR, "> 0"},
        applyIf = {"MaxVectorSize", ">=8"},
        applyIfPlatform = {"64-bit", "true"},
        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
    static Object[] test0(byte[] a, byte[] b, byte mask) {
        for (int i = 0; i < RANGE; i+=8) {
            // Safe to vectorize with AlignVector
            b[i+0] = (byte)(a[i+0] & mask); // offset 0, align 0
            b[i+1] = (byte)(a[i+1] & mask);
            b[i+2] = (byte)(a[i+2] & mask);
            b[i+3] = (byte)(a[i+3] & mask);
        }
        return new Object[]{ a, b };
    }
 426 
    /**
     * Masks all eight bytes of every 8-element stride of {@code a} into {@code b},
     * starting at index 0.
     * <p>
     * The IR rule expects vector loads, ANDs and stores, but only applies when
     * UseCompactObjectHeaders is false or AlignVector is false (see the comment inside
     * the annotation for the reason).
     *
     * @param a    source array, only read
     * @param b    destination array, written in place
     * @param mask value ANDed with every loaded element
     * @return {@code a} and {@code b}, in that order
     */
    @Test
    @IR(counts = {IRNode.LOAD_VECTOR_B, "> 0",
                  IRNode.AND_VB, "> 0",
                  IRNode.STORE_VECTOR, "> 0"},
        applyIfOr = {"UseCompactObjectHeaders", "false", "AlignVector", "false"},
        // UNSAFE.ARRAY_BYTE_BASE_OFFSET = 16, but with compact object headers UNSAFE.ARRAY_BYTE_BASE_OFFSET=12.
        // If AlignVector=true, we need the offset to be 8-byte aligned, else the vectors are filtered out.
        applyIfPlatform = {"64-bit", "true"},
        applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true"})
    static Object[] test1a(byte[] a, byte[] b, byte mask) {
        for (int i = 0; i < RANGE; i+=8) {
            b[i+0] = (byte)(a[i+0] & mask); // adr = base + UNSAFE.ARRAY_BYTE_BASE_OFFSET + 0 + iter*8
            b[i+1] = (byte)(a[i+1] & mask);
            b[i+2] = (byte)(a[i+2] & mask);
            b[i+3] = (byte)(a[i+3] & mask);
            b[i+4] = (byte)(a[i+4] & mask);
            b[i+5] = (byte)(a[i+5] & mask);
            b[i+6] = (byte)(a[i+6] & mask);
            b[i+7] = (byte)(a[i+7] & mask);
        }
        return new Object[]{ a, b };
    }
 449 
    /**
     * Same eight-byte kernel as an 8-element stride loop, but starting at index 4 and
     * stopping before {@code RANGE-8}, so every access is shifted by 4 elements.
     * <p>
     * The IR rule expects vector loads, ANDs and stores, but only applies when
     * UseCompactObjectHeaders is true or AlignVector is false (see the comment inside
     * the annotation for the reason).
     *
     * @param a    source array, only read
     * @param b    destination array, written in place
     * @param mask value ANDed with every loaded element
     * @return {@code a} and {@code b}, in that order
     */
    @Test
    @IR(counts = {IRNode.LOAD_VECTOR_B, "> 0",
                  IRNode.AND_VB, "> 0",
                  IRNode.STORE_VECTOR, "> 0"},
        applyIfOr = {"UseCompactObjectHeaders", "true", "AlignVector", "false"},
        // UNSAFE.ARRAY_BYTE_BASE_OFFSET = 16, but with compact object headers UNSAFE.ARRAY_BYTE_BASE_OFFSET=12.
        // If AlignVector=true, we need the offset to be 8-byte aligned, else the vectors are filtered out.
        applyIfPlatform = {"64-bit", "true"},
        applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true"})
    static Object[] test1b(byte[] a, byte[] b, byte mask) {
        for (int i = 4; i < RANGE-8; i+=8) {
            b[i+0] = (byte)(a[i+0] & mask); // adr = base + UNSAFE.ARRAY_BYTE_BASE_OFFSET + 4 + iter*8
            b[i+1] = (byte)(a[i+1] & mask);
            b[i+2] = (byte)(a[i+2] & mask);
            b[i+3] = (byte)(a[i+3] & mask);
            b[i+4] = (byte)(a[i+4] & mask);
            b[i+5] = (byte)(a[i+5] & mask);
            b[i+6] = (byte)(a[i+6] & mask);
            b[i+7] = (byte)(a[i+7] & mask);
        }
        return new Object[]{ a, b };
    }
 472 
    /**
     * Masks four bytes at offsets 3..6 of every 8-element stride of {@code a} into {@code b}.
     * <p>
     * Two IR rules: with AlignVector off (and MaxVectorSize >= 8) size-4 vector
     * loads/ANDs and vector stores are expected; with AlignVector on no vector
     * load, AND or store may be present.
     *
     * @param a    source array, only read
     * @param b    destination array, written in place
     * @param mask value ANDed with every loaded element
     * @return {@code a} and {@code b}, in that order
     */
    @Test
    @IR(counts = {IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.AND_VB,        IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.STORE_VECTOR, "> 0"},
        applyIfAnd = {"AlignVector", "false", "MaxVectorSize", ">=8"},
        applyIfPlatform = {"64-bit", "true"},
        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
    @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
                  IRNode.AND_VB, "= 0",
                  IRNode.STORE_VECTOR, "= 0"},
        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
        applyIfPlatform = {"64-bit", "true"},
        applyIf = {"AlignVector", "true"})
    static Object[] test2(byte[] a, byte[] b, byte mask) {
        for (int i = 0; i < RANGE; i+=8) {
            // Cannot align with AlignVector: 3 + x * 8 % 8 = 3
            b[i+3] = (byte)(a[i+3] & mask); // at alignment 3
            b[i+4] = (byte)(a[i+4] & mask);
            b[i+5] = (byte)(a[i+5] & mask);
            b[i+6] = (byte)(a[i+6] & mask);
        }
        return new Object[]{ a, b };
    }
 496 
    /**
     * Like a stride-8 kernel with a four-byte group at offsets 3..6, plus one extra
     * scalar access at offset 0 of every stride.
     * <p>
     * Two IR rules: with AlignVector off (and MaxVectorSize >= 8) size-4 vector
     * loads/ANDs and vector stores are expected; with AlignVector on no vector
     * load, AND or store may be present.
     *
     * @param a    source array, only read
     * @param b    destination array, written in place
     * @param mask value ANDed with every loaded element
     * @return {@code a} and {@code b}, in that order
     */
    @Test
    @IR(counts = {IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.AND_VB,        IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.STORE_VECTOR, "> 0"},
        applyIfAnd = {"AlignVector", "false", "MaxVectorSize", ">=8"},
        applyIfPlatform = {"64-bit", "true"},
        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
    @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
                  IRNode.AND_VB, "= 0",
                  IRNode.STORE_VECTOR, "= 0"},
        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
        applyIfPlatform = {"64-bit", "true"},
        applyIf = {"AlignVector", "true"})
    static Object[] test3(byte[] a, byte[] b, byte mask) {
        for (int i = 0; i < RANGE; i+=8) {
            // Cannot align with AlignVector: 3 + x * 8 % 8 = 3

            // Problematic for AlignVector
            b[i+0] = (byte)(a[i+0] & mask); // best_memref, align 0

            b[i+3] = (byte)(a[i+3] & mask); // pack at offset 3 bytes
            b[i+4] = (byte)(a[i+4] & mask);
            b[i+5] = (byte)(a[i+5] & mask);
            b[i+6] = (byte)(a[i+6] & mask);
        }
        return new Object[]{ a, b };
    }
 524 
    /**
     * Processes {@code a} in blocks of 16 elements (index {@code i*16 + k}): a group of
     * four bytes at offsets 0..3 and a group of eight bytes at offsets 5..12. Offsets 4
     * and 13..15 of each block are not touched.
     * <p>
     * Two IR rules, both for MaxVectorSize >= 16: with AlignVector off both size-4 and
     * size-8 vector loads/ANDs are expected; with AlignVector on only the size-4 ones
     * are expected and the size-8 ones must be absent.
     *
     * @param a    source array, only read
     * @param b    destination array, written in place
     * @param mask value ANDed with every loaded element
     * @return {@code a} and {@code b}, in that order
     */
    @Test
    @IR(counts = {IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE_8, "> 0",
                  IRNode.AND_VB,        IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.AND_VB,        IRNode.VECTOR_SIZE_8, "> 0",
                  IRNode.STORE_VECTOR, "> 0"},
        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
        applyIfPlatform = {"64-bit", "true"},
        applyIfAnd = {"AlignVector", "false", "MaxVectorSize", ">=16"})
    @IR(counts = {IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE_8, "= 0",// unaligned
                  IRNode.AND_VB,        IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.AND_VB,        IRNode.VECTOR_SIZE_8, "= 0",// unaligned
                  IRNode.STORE_VECTOR, "> 0"},
        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
        applyIfPlatform = {"64-bit", "true"},
        applyIfAnd = {"AlignVector", "true", "MaxVectorSize", ">=16"})
    static Object[] test4(byte[] a, byte[] b, byte mask) {
        for (int i = 0; i < RANGE/16; i++) {
            // Problematic for AlignVector
            b[i*16 + 0 ] = (byte)(a[i*16 + 0 ] & mask); // 4 pack, 0 aligned
            b[i*16 + 1 ] = (byte)(a[i*16 + 1 ] & mask);
            b[i*16 + 2 ] = (byte)(a[i*16 + 2 ] & mask);
            b[i*16 + 3 ] = (byte)(a[i*16 + 3 ] & mask);

            b[i*16 + 5 ] = (byte)(a[i*16 + 5 ] & mask); // 8 pack, 5 aligned
            b[i*16 + 6 ] = (byte)(a[i*16 + 6 ] & mask);
            b[i*16 + 7 ] = (byte)(a[i*16 + 7 ] & mask);
            b[i*16 + 8 ] = (byte)(a[i*16 + 8 ] & mask);
            b[i*16 + 9 ] = (byte)(a[i*16 + 9 ] & mask);
            b[i*16 + 10] = (byte)(a[i*16 + 10] & mask);
            b[i*16 + 11] = (byte)(a[i*16 + 11] & mask);
            b[i*16 + 12] = (byte)(a[i*16 + 12] & mask);
        }
        return new Object[]{ a, b };
    }
 561 
    /**
     * Stride-8 kernel whose every index additionally contains the loop-invariant
     * {@code inv}: one access at offset 0 and a four-byte group at offsets 3..6.
     * <p>
     * Two IR rules: with AlignVector off (and MaxVectorSize >= 8) size-4 vector
     * loads/ANDs and vector stores are expected; with AlignVector on no vector
     * load, AND or store may be present.
     *
     * @param a    source array, only read
     * @param b    destination array, written in place
     * @param mask value ANDed with every loaded element
     * @param inv  value added to every array index; the constructor registers this test with 0
     * @return {@code a} and {@code b}, in that order
     */
    @Test
    @IR(counts = {IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.AND_VB,        IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.STORE_VECTOR, "> 0"},
        applyIfAnd = {"AlignVector", "false", "MaxVectorSize", ">=8"},
        applyIfPlatform = {"64-bit", "true"},
        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
    @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
                  IRNode.AND_VB, "= 0",
                  IRNode.STORE_VECTOR, "= 0"},
        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
        applyIfPlatform = {"64-bit", "true"},
        applyIf = {"AlignVector", "true"})
    static Object[] test5(byte[] a, byte[] b, byte mask, int inv) {
        for (int i = 0; i < RANGE; i+=8) {
            // Cannot align with AlignVector because of invariant
            b[i+inv+0] = (byte)(a[i+inv+0] & mask);

            b[i+inv+3] = (byte)(a[i+inv+3] & mask);
            b[i+inv+4] = (byte)(a[i+inv+4] & mask);
            b[i+inv+5] = (byte)(a[i+inv+5] & mask);
            b[i+inv+6] = (byte)(a[i+inv+6] & mask);
        }
        return new Object[]{ a, b };
    }
 587 
    /**
     * Byte kernel indexed as {@code i*4 + k} with {@code i} advancing by 2, i.e. 8
     * elements per iteration: one access at offset 0 and a four-byte group at
     * offsets 3..6.
     * <p>
     * Two IR rules: with AlignVector off (and MaxVectorSize >= 8) size-4 vector
     * loads/ANDs and vector stores are expected; with AlignVector on no vector
     * load, AND or store may be present.
     *
     * @param a    source array, only read
     * @param b    destination array, written in place
     * @param mask value ANDed with every loaded element
     * @return {@code a} and {@code b}, in that order
     */
    @Test
    @IR(counts = {IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.AND_VB,        IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.STORE_VECTOR, "> 0"},
        applyIfAnd = {"AlignVector", "false", "MaxVectorSize", ">=8"},
        applyIfPlatform = {"64-bit", "true"},
        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
    @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
                  IRNode.AND_VB, "= 0",
                  IRNode.STORE_VECTOR, "= 0"},
        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
        applyIfPlatform = {"64-bit", "true"},
        applyIf = {"AlignVector", "true"})
    static Object[] test6(byte[] a, byte[] b, byte mask) {
        for (int i = 0; i < RANGE/8; i+=2) {
            // Cannot align with AlignVector because offset is odd
            b[i*4+0] = (byte)(a[i*4+0] & mask);

            b[i*4+3] = (byte)(a[i*4+3] & mask);
            b[i*4+4] = (byte)(a[i*4+4] & mask);
            b[i*4+5] = (byte)(a[i*4+5] & mask);
            b[i*4+6] = (byte)(a[i*4+6] & mask);
        }
        return new Object[]{ a, b };
    }
 613 
    /**
     * Short-array kernel indexed as {@code i*4 + k} with {@code i} advancing by 2,
     * i.e. 8 elements per iteration: one access at offset 0 and a four-element group
     * at offsets 3..6.
     * <p>
     * Two IR rules: with AlignVector off and MaxVectorSize >= 16 (avx2 or asimd)
     * size-4 short vector loads/ANDs and vector stores are expected; with AlignVector
     * on (sse4.1 or asimd) no vector load, AND or store may be present.
     *
     * @param a    source array, only read
     * @param b    destination array, written in place
     * @param mask value ANDed with every loaded element
     * @return {@code a} and {@code b}, in that order
     */
    @Test
    @IR(counts = {IRNode.LOAD_VECTOR_S, IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.AND_VS,        IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.STORE_VECTOR, "> 0"},
        applyIfAnd = {"AlignVector", "false", "MaxVectorSize", ">=16"},
        applyIfPlatform = {"64-bit", "true"},
        applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true"})
    @IR(counts = {IRNode.LOAD_VECTOR_S, "= 0",
                  IRNode.AND_VS, "= 0",
                  IRNode.STORE_VECTOR, "= 0"},
        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
        applyIfPlatform = {"64-bit", "true"},
        applyIf = {"AlignVector", "true"})
    static Object[] test7(short[] a, short[] b, short mask) {
        for (int i = 0; i < RANGE/8; i+=2) {
            // Cannot align with AlignVector because offset is odd
            b[i*4+0] = (short)(a[i*4+0] & mask);

            b[i*4+3] = (short)(a[i*4+3] & mask);
            b[i*4+4] = (short)(a[i*4+4] & mask);
            b[i*4+5] = (short)(a[i*4+5] & mask);
            b[i*4+6] = (short)(a[i*4+6] & mask);
        }
        return new Object[]{ a, b };
    }
 639 
    /**
     * Stride-8 byte kernel whose loop starts at the caller-supplied {@code init}: one
     * access at offset 0 and a four-byte group at offsets 3..6.
     * <p>
     * Two IR rules: with AlignVector off (and MaxVectorSize >= 8) size-4 vector
     * loads/ANDs and vector stores are expected; with AlignVector on no vector
     * load, AND or store may be present.
     * <p>
     * NOTE(review): the highest index touched is {@code i+6} with {@code i < RANGE},
     * so {@code init} must keep {@code i+6} inside the arrays - confirm against callers.
     *
     * @param a    source array, only read
     * @param b    destination array, written in place
     * @param mask value ANDed with every loaded element
     * @param init first value of the loop variable
     * @return {@code a} and {@code b}, in that order
     */
    @Test
    @IR(counts = {IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.AND_VB,        IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.STORE_VECTOR, "> 0"},
        applyIfAnd = {"AlignVector", "false", "MaxVectorSize", ">=8"},
        applyIfPlatform = {"64-bit", "true"},
        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
    @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
                  IRNode.AND_VB, "= 0",
                  IRNode.STORE_VECTOR, "= 0"},
        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
        applyIfPlatform = {"64-bit", "true"},
        applyIf = {"AlignVector", "true"})
    static Object[] test8(byte[] a, byte[] b, byte mask, int init) {
        for (int i = init; i < RANGE; i+=8) {
            // Cannot align with AlignVector because of invariant (variable init becomes invar)
            b[i+0] = (byte)(a[i+0] & mask);

            b[i+3] = (byte)(a[i+3] & mask);
            b[i+4] = (byte)(a[i+4] & mask);
            b[i+5] = (byte)(a[i+5] & mask);
            b[i+6] = (byte)(a[i+6] & mask);
        }
        return new Object[]{ a, b };
    }
 665 
 666     @Test
 667     @IR(counts = {IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE_4, "> 0",
 668                   IRNode.AND_VB,        IRNode.VECTOR_SIZE_4, "> 0",
 669                   IRNode.STORE_VECTOR, "> 0"},
 670         applyIf = {"MaxVectorSize", ">=8"},
 671         applyIfPlatform = {"64-bit", "true"},
 672         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
 673     static Object[] test9(byte[] a, byte[] b, byte mask) {
             // Byte arrays; constant init 13, stride 8. Each iteration masks one lone element
             // at +0 and four adjacent elements at +3..+6.
             // The single IR rule carries no AlignVector condition: 4-element byte vectors are
             // expected whenever MaxVectorSize>=8 on the listed platforms.
 674         // known non-zero init value does not affect offset, but has implicit effect on iv
 675         for (int i = 13; i < RANGE-8; i+=8) {
 676             b[i+0] = (byte)(a[i+0] & mask);
 677 
 678             b[i+3] = (byte)(a[i+3] & mask);
 679             b[i+4] = (byte)(a[i+4] & mask);
 680             b[i+5] = (byte)(a[i+5] & mask);
 681             b[i+6] = (byte)(a[i+6] & mask);
 682         }
             // Both arrays are returned to the caller.
 683         return new Object[]{ a, b };
 684     }
 685 
 686     @Test
 687     @IR(counts = {IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE_4, "> 0",
 688                   IRNode.AND_VB,        IRNode.VECTOR_SIZE_4, "> 0",
 689                   IRNode.STORE_VECTOR, "> 0"},
 690         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
 691         applyIfPlatform = {"64-bit", "true"},
 692         applyIfAnd = {"AlignVector", "false", "MaxVectorSize", ">=8"})
 693     @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
 694                   IRNode.AND_VB, "= 0",
 695                   IRNode.STORE_VECTOR, "= 0"},
 696         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
 697         applyIfPlatform = {"64-bit", "true"},
 698         applyIf = {"AlignVector", "true"})
 699     static Object[] test10a(byte[] a, byte[] b, byte mask) {
             // Byte arrays; constant init 3, stride 8; four adjacent elements i+0..i+3 per iteration.
             // First IR rule (AlignVector=false, MaxVectorSize>=8): expects 4-element byte vectors.
             // Second IR rule (AlignVector=true): expects no vector load/and/store at all.
 700         // This is not alignable with pre-loop, because of odd init.
 701         for (int i = 3; i < RANGE-8; i+=8) {
 702             b[i+0] = (byte)(a[i+0] & mask);
 703             b[i+1] = (byte)(a[i+1] & mask);
 704             b[i+2] = (byte)(a[i+2] & mask);
 705             b[i+3] = (byte)(a[i+3] & mask);
 706         }
             // Both arrays are returned to the caller.
 707         return new Object[]{ a, b };
 708     }
 709 
 710     @Test
 711     @IR(counts = {IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE_4, "> 0",
 712                   IRNode.AND_VB,        IRNode.VECTOR_SIZE_4, "> 0",
 713                   IRNode.STORE_VECTOR, "> 0"},
 714         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
 715         applyIfPlatform = {"64-bit", "true"},
 716         applyIfAnd = {"AlignVector", "false", "MaxVectorSize", ">=8"})
 717     @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
 718                   IRNode.AND_VB, "= 0",
 719                   IRNode.STORE_VECTOR, "= 0"},
 720         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
 721         applyIfPlatform = {"64-bit", "true"},
 722         applyIf = {"AlignVector", "true"})
 723     static Object[] test10b(byte[] a, byte[] b, byte mask) {
             // Byte arrays; constant init 13, stride 8; four adjacent elements i+0..i+3 per iteration.
             // First IR rule (AlignVector=false, MaxVectorSize>=8): expects 4-element byte vectors.
             // Second IR rule (AlignVector=true): expects no vector load/and/store at all.
 724         // This is not alignable with pre-loop, because of odd init.
 725         // Seems not correctly handled.
             // NOTE(review): the remark above does not say what is mishandled or where; the IR
             // rules on this method still assert a definite outcome -- confirm whether it is stale.
 726         for (int i = 13; i < RANGE-8; i+=8) {
 727             b[i+0] = (byte)(a[i+0] & mask);
 728             b[i+1] = (byte)(a[i+1] & mask);
 729             b[i+2] = (byte)(a[i+2] & mask);
 730             b[i+3] = (byte)(a[i+3] & mask);
 731         }
             // Both arrays are returned to the caller.
 732         return new Object[]{ a, b };
 733     }
 734 
 735     @Test
 736     @IR(counts = {IRNode.LOAD_VECTOR_S, IRNode.VECTOR_SIZE_4, "> 0",
 737                   IRNode.AND_VS,        IRNode.VECTOR_SIZE_4, "> 0",
 738                   IRNode.STORE_VECTOR, "> 0"},
 739         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
 740         applyIfPlatform = {"64-bit", "true"},
 741         applyIfAnd = {"AlignVector", "false", "MaxVectorSize", ">=16"})
 742     @IR(counts = {IRNode.LOAD_VECTOR_S, "= 0",
 743                   IRNode.AND_VS, "= 0",
 744                   IRNode.STORE_VECTOR, "= 0"},
 745         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
 746         applyIfPlatform = {"64-bit", "true"},
 747         applyIf = {"AlignVector", "true"})
 748     static Object[] test10c(short[] a, short[] b, short mask) {
             // Short arrays; constant init 13, stride 8; four adjacent elements i+0..i+3 per iteration.
             // First IR rule (AlignVector=false, MaxVectorSize>=16): expects 4-element short vectors.
             // Second IR rule (AlignVector=true): expects no vector load/and/store at all.
 749         // This is not alignable with pre-loop, because of odd init.
 750         // Seems not correctly handled with MaxVectorSize >= 32.
 751         for (int i = 13; i < RANGE-8; i+=8) {
 752             b[i+0] = (short)(a[i+0] & mask);
 753             b[i+1] = (short)(a[i+1] & mask);
 754             b[i+2] = (short)(a[i+2] & mask);
 755             b[i+3] = (short)(a[i+3] & mask);
 756         }
             // Both arrays are returned to the caller.
 757         return new Object[]{ a, b };
 758     }
 759 
 760     @Test
 761     @IR(counts = {IRNode.LOAD_VECTOR_S, IRNode.VECTOR_SIZE_4, "> 0",
 762                   IRNode.AND_VS,        IRNode.VECTOR_SIZE_4, "> 0",
 763                   IRNode.STORE_VECTOR, "> 0"},
 764         applyIfAnd = {"MaxVectorSize", ">=16", "UseCompactObjectHeaders", "false"},
 765         // UNSAFE.ARRAY_SHORT_BASE_OFFSET = 16, but with compact object headers UNSAFE.ARRAY_SHORT_BASE_OFFSET=12.
 766         // If AlignVector=true, we need the offset to be 8-byte aligned, else the vectors are filtered out.
 767         applyIfPlatform = {"64-bit", "true"},
 768         applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true"})
 769     static Object[] test10d(short[] a, short[] b, short mask) {
             // Short arrays; constant init 13, stride 8; four adjacent elements i+3..i+6 per iteration.
             // The IR rule only applies with UseCompactObjectHeaders=false and has no AlignVector
             // condition: 4-element short vectors are expected either way.
 770         for (int i = 13; i < RANGE-16; i+=8) {
 771             // adr = base + UNSAFE.ARRAY_SHORT_BASE_OFFSET + 2*(3 + 13) + iter*16
                 // With a base offset of 16 the constant part is 16 + 32 = 48, a multiple of 8.
 772             b[i+0+3] = (short)(a[i+0+3] & mask);
 773             b[i+1+3] = (short)(a[i+1+3] & mask);
 774             b[i+2+3] = (short)(a[i+2+3] & mask);
 775             b[i+3+3] = (short)(a[i+3+3] & mask);
 776         }
             // Both arrays are returned to the caller.
 777         return new Object[]{ a, b };
 778     }
 779 
 780     @Test
 781     @IR(counts = {IRNode.LOAD_VECTOR_S, IRNode.VECTOR_SIZE_4, "> 0",
 782                   IRNode.AND_VS,        IRNode.VECTOR_SIZE_4, "> 0",
 783                   IRNode.STORE_VECTOR, "> 0"},
 784         applyIfAnd = {"MaxVectorSize", ">=16", "UseCompactObjectHeaders", "true"},
 785         // UNSAFE.ARRAY_SHORT_BASE_OFFSET = 16, but with compact object headers UNSAFE.ARRAY_SHORT_BASE_OFFSET=12.
 786         // If AlignVector=true, we need the offset to be 8-byte aligned, else the vectors are filtered out.
 787         applyIfPlatform = {"64-bit", "true"},
 788         applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true"})
 789     static Object[] test10e(short[] a, short[] b, short mask) {
             // Short arrays; constant init 11, stride 8; four adjacent elements i+3..i+6 per iteration.
             // The IR rule only applies with UseCompactObjectHeaders=true and has no AlignVector
             // condition: 4-element short vectors are expected either way.
 790         for (int i = 11; i < RANGE-16; i+=8) {
 791             // adr = base + UNSAFE.ARRAY_SHORT_BASE_OFFSET + 2*(3 + 11) + iter*16
                 // With a base offset of 12 the constant part is 12 + 28 = 40, a multiple of 8.
 792             b[i+0+3] = (short)(a[i+0+3] & mask);
 793             b[i+1+3] = (short)(a[i+1+3] & mask);
 794             b[i+2+3] = (short)(a[i+2+3] & mask);
 795             b[i+3+3] = (short)(a[i+3+3] & mask);
 796         }
             // Both arrays are returned to the caller.
 797         return new Object[]{ a, b };
 798     }
 799 
 800     @Test
 801     @IR(counts = {IRNode.LOAD_VECTOR_B, "> 0",
 802                   IRNode.AND_VB, "> 0",
 803                   IRNode.STORE_VECTOR, "> 0"},
 804         applyIfPlatform = {"64-bit", "true"},
 805         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
 806     static Object[] test11aB(byte[] a, byte[] b, byte mask) {
             // Unit-stride byte loop starting at 0: b[i] = a[i] & mask.
             // The IR rule has no AlignVector condition: byte vectors are expected either way.
 807         for (int i = 0; i < RANGE; i++) {
 808             // always alignable
 809             b[i+0] = (byte)(a[i+0] & mask);
 810         }
             // Both arrays are returned to the caller.
 811         return new Object[]{ a, b };
 812     }
 813 
 814     @Test
 815     @IR(counts = {IRNode.LOAD_VECTOR_S, "> 0",
 816                   IRNode.AND_VS, "> 0",
 817                   IRNode.STORE_VECTOR, "> 0"},
 818         applyIfPlatform = {"64-bit", "true"},
 819         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
 820     static Object[] test11aS(short[] a, short[] b, short mask) {
             // Unit-stride short loop starting at 0: b[i] = a[i] & mask.
             // The IR rule has no AlignVector condition: short vectors are expected either way.
 821         for (int i = 0; i < RANGE; i++) {
 822             // always alignable
 823             b[i+0] = (short)(a[i+0] & mask);
 824         }
             // Both arrays are returned to the caller.
 825         return new Object[]{ a, b };
 826     }
 827 
 828     @Test
 829     @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0",
 830                   IRNode.AND_VI, "> 0",
 831                   IRNode.STORE_VECTOR, "> 0"},
 832         applyIfPlatform = {"64-bit", "true"},
 833         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
 834     static Object[] test11aI(int[] a, int[] b, int mask) {
             // Unit-stride int loop starting at 0: b[i] = a[i] & mask.
             // The IR rule has no AlignVector condition: int vectors are expected either way.
 835         for (int i = 0; i < RANGE; i++) {
 836             // always alignable
 837             b[i+0] = (int)(a[i+0] & mask);
 838         }
             // Both arrays are returned to the caller.
 839         return new Object[]{ a, b };
 840     }
 841 
 842     @Test
 843     @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0",
 844                   IRNode.AND_VL, "> 0",
 845                   IRNode.STORE_VECTOR, "> 0"},
 846         applyIfPlatform = {"64-bit", "true"},
 847         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
 848     static Object[] test11aL(long[] a, long[] b, long mask) {
             // Unit-stride long loop starting at 0: b[i] = a[i] & mask.
             // The IR rule has no AlignVector condition: long vectors are expected either way.
 849         for (int i = 0; i < RANGE; i++) {
 850             // always alignable
 851             b[i+0] = (long)(a[i+0] & mask);
 852         }
             // Both arrays are returned to the caller.
 853         return new Object[]{ a, b };
 854     }
 855 
 856     @Test
 857     @IR(counts = {IRNode.LOAD_VECTOR_B, "> 0",
 858                   IRNode.AND_VB, "> 0",
 859                   IRNode.STORE_VECTOR, "> 0"},
 860         applyIfPlatform = {"64-bit", "true"},
 861         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
 862     static Object[] test11bB(byte[] a, byte[] b, byte mask) {
             // Unit-stride byte loop starting at 1 (not 0): b[i] = a[i] & mask.
             // The IR rule has no AlignVector condition: byte vectors are expected either way.
 863         for (int i = 1; i < RANGE; i++) {
 864             // always alignable
 865             b[i+0] = (byte)(a[i+0] & mask);
 866         }
             // Both arrays are returned to the caller.
 867         return new Object[]{ a, b };
 868     }
 869 
 870     @Test
 871     @IR(counts = {IRNode.LOAD_VECTOR_S, "> 0",
 872                   IRNode.AND_VS, "> 0",
 873                   IRNode.STORE_VECTOR, "> 0"},
 874         applyIfPlatform = {"64-bit", "true"},
 875         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
 876     static Object[] test11bS(short[] a, short[] b, short mask) {
             // Unit-stride short loop starting at 1 (not 0): b[i] = a[i] & mask.
             // The IR rule has no AlignVector condition: short vectors are expected either way.
 877         for (int i = 1; i < RANGE; i++) {
 878             // always alignable
 879             b[i+0] = (short)(a[i+0] & mask);
 880         }
             // Both arrays are returned to the caller.
 881         return new Object[]{ a, b };
 882     }
 883 
 884     @Test
 885     @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0",
 886                   IRNode.AND_VI, "> 0",
 887                   IRNode.STORE_VECTOR, "> 0"},
 888         applyIfPlatform = {"64-bit", "true"},
 889         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
 890     static Object[] test11bI(int[] a, int[] b, int mask) {
             // Unit-stride int loop starting at 1 (not 0): b[i] = a[i] & mask.
             // The IR rule has no AlignVector condition: int vectors are expected either way.
 891         for (int i = 1; i < RANGE; i++) {
 892             // always alignable
 893             b[i+0] = (int)(a[i+0] & mask);
 894         }
             // Both arrays are returned to the caller.
 895         return new Object[]{ a, b };
 896     }
 897 
 898     @Test
 899     @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0",
 900                   IRNode.AND_VL, "> 0",
 901                   IRNode.STORE_VECTOR, "> 0"},
 902         applyIfPlatform = {"64-bit", "true"},
 903         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
 904     static Object[] test11bL(long[] a, long[] b, long mask) {
             // Unit-stride long loop starting at 1 (not 0): b[i] = a[i] & mask.
             // The IR rule has no AlignVector condition: long vectors are expected either way.
 905         for (int i = 1; i < RANGE; i++) {
 906             // always alignable
 907             b[i+0] = (long)(a[i+0] & mask);
 908         }
             // Both arrays are returned to the caller.
 909         return new Object[]{ a, b };
 910     }
 911 
 912     @Test
 913     @IR(counts = {IRNode.LOAD_VECTOR_B, "> 0",
 914                   IRNode.AND_VB, "> 0",
 915                   IRNode.STORE_VECTOR, "> 0"},
 916         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
 917         applyIfPlatform = {"64-bit", "true"},
 918         applyIf = {"AlignVector", "false"})
 919     @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
 920                   IRNode.AND_VB, "= 0",
 921                   IRNode.STORE_VECTOR, "= 0"},
 922         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
 923         applyIfPlatform = {"64-bit", "true"},
 924         applyIf = {"AlignVector", "true"})
 925     static Object[] test11cB(byte[] a, byte[] b, byte mask) {
             // Unit-stride byte loop where the load (a[i+1]) is one element ahead of the store (b[i]).
             // First IR rule (AlignVector=false): expects byte vectors.
             // Second IR rule (AlignVector=true): expects no vector load/and/store at all.
 926         for (int i = 1; i < RANGE-1; i++) {
 927             // 1 byte offset -> not alignable with AlignVector
 928             b[i+0] = (byte)(a[i+1] & mask);
 929         }
             // Both arrays are returned to the caller.
 930         return new Object[]{ a, b };
 931     }
 932 
 933     @Test
 934     @IR(counts = {IRNode.LOAD_VECTOR_S, "> 0",
 935                   IRNode.AND_VS, "> 0",
 936                   IRNode.STORE_VECTOR, "> 0"},
 937         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
 938         applyIfPlatform = {"64-bit", "true"},
 939         applyIf = {"AlignVector", "false"})
 940     @IR(counts = {IRNode.LOAD_VECTOR_S, "= 0",
 941                   IRNode.AND_VS, "= 0",
 942                   IRNode.STORE_VECTOR, "= 0"},
 943         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
 944         applyIfPlatform = {"64-bit", "true"},
 945         applyIf = {"AlignVector", "true"})
 946     static Object[] test11cS(short[] a, short[] b, short mask) {
             // Unit-stride short loop where the load (a[i+1]) is one element ahead of the store (b[i]).
             // First IR rule (AlignVector=false): expects short vectors.
             // Second IR rule (AlignVector=true): expects no vector load/and/store at all.
 947         for (int i = 1; i < RANGE-1; i++) {
 948             // 2 byte offset -> not alignable with AlignVector
 949             b[i+0] = (short)(a[i+1] & mask);
 950         }
             // Both arrays are returned to the caller.
 951         return new Object[]{ a, b };
 952     }
 953 
 954     @Test
 955     @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0",
 956                   IRNode.AND_VI, "> 0",
 957                   IRNode.STORE_VECTOR, "> 0"},
 958         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
 959         applyIfPlatform = {"64-bit", "true"},
 960         applyIf = {"AlignVector", "false"})
 961     @IR(counts = {IRNode.LOAD_VECTOR_I, "= 0",
 962                   IRNode.AND_VI, "= 0",
 963                   IRNode.STORE_VECTOR, "= 0"},
 964         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
 965         applyIfPlatform = {"64-bit", "true"},
 966         applyIf = {"AlignVector", "true"})
 967     static Object[] test11cI(int[] a, int[] b, int mask) {
             // Unit-stride int loop where the load (a[i+1]) is one element ahead of the store (b[i]).
             // First IR rule (AlignVector=false): expects int vectors.
             // Second IR rule (AlignVector=true): expects no vector load/and/store at all.
 968         for (int i = 1; i < RANGE-1; i++) {
 969             // 4 byte offset -> not alignable with AlignVector
 970             b[i+0] = (int)(a[i+1] & mask);
 971         }
             // Both arrays are returned to the caller.
 972         return new Object[]{ a, b };
 973     }
 974 
 975     @Test
 976     @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0",
 977                   IRNode.AND_VL, "> 0",
 978                   IRNode.STORE_VECTOR, "> 0"},
 979         applyIfPlatform = {"64-bit", "true"},
 980         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
 981     static Object[] test11cL(long[] a, long[] b, long mask) {
             // Unit-stride long loop where the load (a[i+1]) is one element ahead of the store (b[i]).
             // Unlike the byte/short/int siblings there is a single IR rule without an AlignVector
             // condition: long vectors are expected either way.
 982         for (int i = 1; i < RANGE-1; i++) {
 983             // always alignable (8 byte offset)
 984             b[i+0] = (long)(a[i+1] & mask);
 985         }
             // Both arrays are returned to the caller.
 986         return new Object[]{ a, b };
 987     }
 988 
 989     @Test
 990     @IR(counts = {IRNode.LOAD_VECTOR_B, "> 0",
 991                   IRNode.AND_VB, "> 0",
 992                   IRNode.STORE_VECTOR, "> 0"},
 993         applyIfPlatform = {"64-bit", "true"},
 994         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
 995     static Object[] test11dB(byte[] a, byte[] b, byte mask, int invar) {
             // Unit-stride byte loop whose index is shifted by the loop-invariant argument invar;
             // load and store use the same index.
             // The IR rule has no AlignVector condition: byte vectors are expected either way.
 996         for (int i = 0; i < RANGE; i++) {
 997             b[i+0+invar] = (byte)(a[i+0+invar] & mask);
 998         }
             // Both arrays are returned to the caller.
 999         return new Object[]{ a, b };
1000     }
1001 
1002     @Test
1003     @IR(counts = {IRNode.LOAD_VECTOR_S, "> 0",
1004                   IRNode.AND_VS, "> 0",
1005                   IRNode.STORE_VECTOR, "> 0"},
1006         applyIfPlatform = {"64-bit", "true"},
1007         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
1008     static Object[] test11dS(short[] a, short[] b, short mask, int invar) {
             // Unit-stride short loop whose index is shifted by the loop-invariant argument invar;
             // load and store use the same index.
             // The IR rule has no AlignVector condition: short vectors are expected either way.
1009         for (int i = 0; i < RANGE; i++) {
1010             b[i+0+invar] = (short)(a[i+0+invar] & mask);
1011         }
             // Both arrays are returned to the caller.
1012         return new Object[]{ a, b };
1013     }
1014 
1015     @Test
1016     @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0",
1017                   IRNode.AND_VI, "> 0",
1018                   IRNode.STORE_VECTOR, "> 0"},
1019         applyIfPlatform = {"64-bit", "true"},
1020         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
1021     static Object[] test11dI(int[] a, int[] b, int mask, int invar) {
             // Unit-stride int loop whose index is shifted by the loop-invariant argument invar;
             // load and store use the same index.
             // The IR rule has no AlignVector condition: int vectors are expected either way.
1022         for (int i = 0; i < RANGE; i++) {
1023             b[i+0+invar] = (int)(a[i+0+invar] & mask);
1024         }
             // Both arrays are returned to the caller.
1025         return new Object[]{ a, b };
1026     }
1027 
1028     @Test
1029     @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0",
1030                   IRNode.AND_VL, "> 0",
1031                   IRNode.STORE_VECTOR, "> 0"},
1032         applyIfPlatform = {"64-bit", "true"},
1033         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
1034     static Object[] test11dL(long[] a, long[] b, long mask, int invar) {
             // Unit-stride long loop whose index is shifted by the loop-invariant argument invar;
             // load and store use the same index.
             // The IR rule has no AlignVector condition: long vectors are expected either way.
1035         for (int i = 0; i < RANGE; i++) {
1036             b[i+0+invar] = (long)(a[i+0+invar] & mask);
1037         }
             // Both arrays are returned to the caller.
1038         return new Object[]{ a, b };
1039     }
1040 
1041     @Test
1042     @IR(counts = {IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE + "min(max_byte, 4)", "> 0",
1043                   IRNode.AND_VB,        IRNode.VECTOR_SIZE + "min(max_byte, 4)", "> 0",
1044                   IRNode.STORE_VECTOR,                                           "> 0"},
1045         applyIfPlatform = {"64-bit", "true"},
1046         applyIf = {"AlignVector", "false"},
1047         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
1048     static Object[] test12(byte[] a, byte[] b, byte mask) {
             // Byte arrays indexed by i*6: four adjacent elements (+0..+3) are masked per iteration
             // and the elements at +4 and +5 are left untouched.
             // The only IR rule applies with AlignVector=false and expects byte vectors of the size
             // given by the expression "min(max_byte, 4)"; nothing is asserted for AlignVector=true.
1049         for (int i = 0; i < RANGE/16; i++) {
1050             // Non-power-of-2 stride. Vectorization of 4 bytes, then 2-bytes gap.
1051             b[i*6 + 0 ] = (byte)(a[i*6 + 0 ] & mask);
1052             b[i*6 + 1 ] = (byte)(a[i*6 + 1 ] & mask);
1053             b[i*6 + 2 ] = (byte)(a[i*6 + 2 ] & mask);
1054             b[i*6 + 3 ] = (byte)(a[i*6 + 3 ] & mask);
1055         }
             // Both arrays are returned to the caller.
1056         return new Object[]{ a, b };
1057     }
1058 
1059     @Test
1060     @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE + "min(max_int, max_long)", "> 0",
1061                   IRNode.LOAD_VECTOR_L, IRNode.VECTOR_SIZE + "min(max_int, max_long)", "> 0",
1062                   IRNode.ADD_VI, IRNode.VECTOR_SIZE + "min(max_int, max_long)", "> 0",
1063                   IRNode.ADD_VL, IRNode.VECTOR_SIZE + "min(max_int, max_long)", "> 0",
1064                   IRNode.STORE_VECTOR, "> 0"},
1065         applyIfPlatform = {"64-bit", "true"},
1066         applyIfCPUFeatureOr = {"avx2", "true"})
1067     // require avx2 to ensure vectors are larger than what unrolling produces
1068     static Object[] test13aIL(int[] a, long[] b) {
             // Mixed element sizes in one unit-stride loop starting at 0: an int[] and a long[]
             // are both incremented in place.
             // The IR rule has no AlignVector condition and expects int and long vectors that both
             // have the size given by "min(max_int, max_long)".
1069         for (int i = 0; i < RANGE; i++) {
1070             a[i]++;
1071             b[i]++;
1072         }
             // Both arrays are returned to the caller.
1073         return new Object[]{ a, b };
1074     }
1075 
1076     @Test
1077     @IR(counts = {IRNode.LOAD_VECTOR_B, "> 0",
1078                   IRNode.LOAD_VECTOR_I, "> 0",
1079                   IRNode.ADD_VB, "> 0",
1080                   IRNode.ADD_VI, "> 0",
1081                   IRNode.STORE_VECTOR, "> 0"},
1082         applyIfOr = {"UseCompactObjectHeaders", "false", "AlignVector", "false"},
1083         applyIfPlatform = {"64-bit", "true"},
1084         applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true"})
1085     static Object[] test13aIB(int[] a, byte[] b) {
             // Mixed element sizes in one unit-stride loop starting at 0: an int[] and a byte[]
             // are both incremented in place.
             // The IR rule expects int and byte vectors unless both UseCompactObjectHeaders and
             // AlignVector are true (see the alignment reasoning inside the loop).
1086         for (int i = 0; i < RANGE; i++) {
1087             // adr = base + UNSAFE.ARRAY_INT_BASE_OFFSET  + 4*iter
1088             //              = 16 (or 12 if UseCompactObjectHeaders=true)
1089             a[i]++;
1090             // adr = base + UNSAFE.ARRAY_BYTE_BASE_OFFSET + 1*iter
1091             //              = 16 (or 12 if UseCompactObjectHeaders=true)
1092             b[i]++;
1093             // For AlignVector, all adr must be 8-byte aligned. Let's see for which iteration this can hold:
1094             // If UseCompactObjectHeaders=false:
1095             //   a: 0, 2,  4,  6,  8, ...
1096             //   b: 0, 8, 16, 24, 32, ...
1097             //   -> Ok, aligns every 8th iteration.
1098             // If UseCompactObjectHeaders=true:
1099             //   a: 1,  3,  5,  7,  9, ...
1100             //   b: 4, 12, 20, 28, 36, ...
1101             //   -> we can never align both vectors!
1102         }
             // Both arrays are returned to the caller.
1103         return new Object[]{ a, b };
1104     }
1105 
1106     @Test
1107     @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0",
1108                   IRNode.LOAD_VECTOR_S, "> 0",
1109                   IRNode.ADD_VI, "> 0",
1110                   IRNode.ADD_VS, "> 0",
1111                   IRNode.STORE_VECTOR, "> 0"},
1112         applyIfOr = {"UseCompactObjectHeaders", "false", "AlignVector", "false"},
1113         applyIfPlatform = {"64-bit", "true"},
1114         applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true"})
1115     static Object[] test13aIS(int[] a, short[] b) {
             // Mixed element sizes in one unit-stride loop starting at 0: an int[] and a short[]
             // are both incremented in place.
             // The IR rule expects int and short vectors unless both UseCompactObjectHeaders and
             // AlignVector are true (see the alignment reasoning inside the loop).
1116         for (int i = 0; i < RANGE; i++) {
1117             // adr = base + UNSAFE.ARRAY_INT_BASE_OFFSET + 4*iter
1118             //              = 16 (or 12 if UseCompactObjectHeaders=true)
1119             a[i]++;
1120             // adr = base + UNSAFE.ARRAY_SHORT_BASE_OFFSET + 2*iter
1121             //              = 16 (or 12 if UseCompactObjectHeaders=true)
1122             b[i]++;
1123             // For AlignVector, all adr must be 8-byte aligned. Let's see for which iteration this can hold:
1124             // If UseCompactObjectHeaders=false:
1125             //   a: iter % 2 == 0
1126             //   b: iter % 4 == 0
1127             //   -> Ok, aligns every 4th iteration.
1128             // If UseCompactObjectHeaders=true:
1129             //   a: iter % 2 = 1
1130             //   b: iter % 4 = 2
1131             //   -> we can never align both vectors!
1132         }
             // Both arrays are returned to the caller.
1133         return new Object[]{ a, b };
1134     }
1135 
1136     @Test
1137     @IR(counts = {IRNode.LOAD_VECTOR_B, "> 0",
1138                   IRNode.LOAD_VECTOR_S, "> 0",
1139                   IRNode.LOAD_VECTOR_I, "> 0",
1140                   IRNode.LOAD_VECTOR_L, "> 0",
1141                   IRNode.ADD_VB, "> 0",
1142                   IRNode.ADD_VS, "> 0",
1143                   IRNode.ADD_VI, "> 0",
1144                   IRNode.ADD_VL, "> 0",
1145                   IRNode.STORE_VECTOR, "> 0"},
1146         applyIfOr = {"UseCompactObjectHeaders", "false", "AlignVector", "false"},
1147         applyIfPlatform = {"64-bit", "true"},
1148         applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true"})
1149     static Object[] test13aBSIL(byte[] a, short[] b, int[] c, long[] d) {
             // Four element sizes in one unit-stride loop starting at 0: a byte[], short[], int[]
             // and long[] are all incremented in place.
             // The IR rule expects vectors of all four types unless both UseCompactObjectHeaders
             // and AlignVector are true (see the alignment reasoning inside the loop).
1150         for (int i = 0; i < RANGE; i++) {
1151             // adr = base + UNSAFE.ARRAY_BYTE_BASE_OFFSET + 1*iter
1152             //              = 16 (or 12 if UseCompactObjectHeaders=true)
1153             a[i]++;
1154             // adr = base + UNSAFE.ARRAY_SHORT_BASE_OFFSET + 2*iter
1155             //              = 16 (or 12 if UseCompactObjectHeaders=true)
1156             b[i]++;
1157             // adr = base + UNSAFE.ARRAY_INT_BASE_OFFSET + 4*iter
1158             //              = 16 (or 12 if UseCompactObjectHeaders=true)
1159             c[i]++;
1160             // adr = base + UNSAFE.ARRAY_LONG_BASE_OFFSET + 8*iter
1161             //              = 16 (always)
1162             d[i]++;
1163             // If AlignVector and UseCompactObjectHeaders, and we want all adr 8-byte aligned:
1164             //   a: iter % 8 = 4
1165             //   c: iter % 2 = 1
1166             //   -> can never align both vectors!
1167         }
             // All four arrays are returned to the caller.
1168         return new Object[]{ a, b, c, d };
1169     }
1170 
1171     @Test
1172     @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE + "min(max_int, max_long)", "> 0",
1173                   IRNode.LOAD_VECTOR_L, IRNode.VECTOR_SIZE + "min(max_int, max_long)", "> 0",
1174                   IRNode.ADD_VI, IRNode.VECTOR_SIZE + "min(max_int, max_long)", "> 0",
1175                   IRNode.ADD_VL, IRNode.VECTOR_SIZE + "min(max_int, max_long)", "> 0",
1176                   IRNode.STORE_VECTOR, "> 0"},
1177         applyIfPlatform = {"64-bit", "true"},
1178         applyIfCPUFeatureOr = {"avx2", "true"})
1179     // require avx2 to ensure vectors are larger than what unrolling produces
1180     static Object[] test13bIL(int[] a, long[] b) {
             // Mixed element sizes in one unit-stride loop starting at 1 (not 0): an int[] and a
             // long[] are both incremented in place.
             // The IR rule has no AlignVector condition and expects int and long vectors that both
             // have the size given by "min(max_int, max_long)".
1181         for (int i = 1; i < RANGE; i++) {
1182             a[i]++;
1183             b[i]++;
1184         }
             // Both arrays are returned to the caller.
1185         return new Object[]{ a, b };
1186     }
1187 
1188     @Test
1189     @IR(counts = {IRNode.LOAD_VECTOR_B, "> 0",
1190                   IRNode.LOAD_VECTOR_I, "> 0",
1191                   IRNode.ADD_VB, "> 0",
1192                   IRNode.ADD_VI, "> 0",
1193                   IRNode.STORE_VECTOR, "> 0"},
1194         applyIfOr = {"UseCompactObjectHeaders", "false", "AlignVector", "false"},
1195         applyIfPlatform = {"64-bit", "true"},
1196         applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true"})
1197     static Object[] test13bIB(int[] a, byte[] b) {
             // Mixed element sizes in one unit-stride loop starting at 1 (not 0): an int[] and a
             // byte[] are both incremented in place.
             // The IR rule expects int and byte vectors unless both UseCompactObjectHeaders and
             // AlignVector are true (see the alignment reasoning inside the loop).
1198         for (int i = 1; i < RANGE; i++) {
1199             // adr = base + UNSAFE.ARRAY_INT_BASE_OFFSET + 4 + 4*iter
1200             //              = 16 (or 12 if UseCompactObjectHeaders=true)
1201             a[i]++;
1202             // adr = base + UNSAFE.ARRAY_BYTE_BASE_OFFSET + 1 + 1*iter
1203             //              = 16 (or 12 if UseCompactObjectHeaders=true)
1204             b[i]++;
1205             // If AlignVector and UseCompactObjectHeaders, and we want all adr 8-byte aligned:
1206             //   a: iter % 2 = 0
1207             //   b: iter % 8 = 3
1208             //   -> can never align both vectors!
1209         }
             // Both arrays are returned to the caller.
1210         return new Object[]{ a, b };
1211     }
1212 
1213     @Test
1214     @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0",
1215                   IRNode.LOAD_VECTOR_S, "> 0",
1216                   IRNode.ADD_VI, "> 0",
1217                   IRNode.ADD_VS, "> 0",
1218                   IRNode.STORE_VECTOR, "> 0"},
1219         applyIfOr = {"UseCompactObjectHeaders", "false", "AlignVector", "false"},
1220         applyIfPlatform = {"64-bit", "true"},
1221         applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true"})
1222     static Object[] test13bIS(int[] a, short[] b) {
             // Mixed element sizes in one unit-stride loop starting at 1 (not 0): an int[] and a
             // short[] are both incremented in place.
             // The IR rule expects int and short vectors unless both UseCompactObjectHeaders and
             // AlignVector are true (see the alignment reasoning inside the loop).
1223         for (int i = 1; i < RANGE; i++) {
1224             // adr = base + UNSAFE.ARRAY_INT_BASE_OFFSET + 4 + 4*iter
1225             //              = 16 (or 12 if UseCompactObjectHeaders=true)
1226             a[i]++;
1227             // adr = base + UNSAFE.ARRAY_SHORT_BASE_OFFSET + 2 + 2*iter
1228             //              = 16 (or 12 if UseCompactObjectHeaders=true)
1229             b[i]++;
1230             // If AlignVector and UseCompactObjectHeaders, and we want all adr 8-byte aligned:
1231             //   a: iter % 2 = 0
1232             //   b: iter % 4 = 1
1233             //   -> can never align both vectors!
1234         }
             // Both arrays are returned to the caller.
1235         return new Object[]{ a, b };
1236     }
1237 
1238     @Test
1239     @IR(counts = {IRNode.LOAD_VECTOR_B, "> 0",
1240                   IRNode.LOAD_VECTOR_S, "> 0",
1241                   IRNode.LOAD_VECTOR_I, "> 0",
1242                   IRNode.LOAD_VECTOR_L, "> 0",
1243                   IRNode.ADD_VB, "> 0",
1244                   IRNode.ADD_VS, "> 0",
1245                   IRNode.ADD_VI, "> 0",
1246                   IRNode.ADD_VL, "> 0",
1247                   IRNode.STORE_VECTOR, "> 0"},
1248         applyIfOr = {"UseCompactObjectHeaders", "false", "AlignVector", "false"},
1249         applyIfPlatform = {"64-bit", "true"},
1250         applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true"})
1251     static Object[] test13bBSIL(byte[] a, short[] b, int[] c, long[] d) {
             // Four element sizes in one unit-stride loop starting at 1 (not 0): a byte[], short[],
             // int[] and long[] are all incremented in place.
             // The IR rule expects vectors of all four types unless both UseCompactObjectHeaders
             // and AlignVector are true (see the alignment reasoning inside the loop).
1252         for (int i = 1; i < RANGE; i++) {
1253             // adr = base + UNSAFE.ARRAY_BYTE_BASE_OFFSET + 1 + 1*iter
1254             //              = 16 (or 12 if UseCompactObjectHeaders=true)
1255             a[i]++;
1256             // adr = base + UNSAFE.ARRAY_SHORT_BASE_OFFSET + 2 + 2*iter
1257             //              = 16 (or 12 if UseCompactObjectHeaders=true)
1258             b[i]++;
1259             // adr = base + UNSAFE.ARRAY_INT_BASE_OFFSET + 4 + 4*iter
1260             //              = 16 (or 12 if UseCompactObjectHeaders=true)
1261             c[i]++;
1262             // adr = base + UNSAFE.ARRAY_LONG_BASE_OFFSET + 8 + 8*iter
1263             //              = 16 (always)
1264             d[i]++;
1265             // If AlignVector and UseCompactObjectHeaders, and we want all adr 8-byte aligned:
1266             //   a: iter % 8 = 3
1267             //   c: iter % 2 = 0
1268             //   -> can never align both vectors!
1269         }
             // All four arrays are returned to the caller.
1270         return new Object[]{ a, b, c, d };
1271     }
1272 
1273     @Test
1274     @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
1275                   IRNode.ADD_VB, "= 0",
1276                   IRNode.STORE_VECTOR, "= 0"},
1277         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
1278         applyIfPlatform = {"64-bit", "true"},
1279         applyIf = {"AlignVector", "false"})
1280     @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
1281                   IRNode.ADD_VB, "= 0",
1282                   IRNode.STORE_VECTOR, "= 0"},
1283         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
1284         applyIfPlatform = {"64-bit", "true"},
1285         applyIf = {"AlignVector", "true"})
1286     static Object[] test14aB(byte[] a) {
             // Single byte array, stride 9: every iteration increments the 16 adjacent elements
             // i+0..i+15, so consecutive iterations touch overlapping ranges.
             // Both IR rules (AlignVector=false and AlignVector=true) expect no vector
             // load/add/store at all.
1287         // non-power-of-2 stride
1288         for (int i = 0; i < RANGE-20; i+=9) {
1289             // Since the stride is shorter than the vector length, there will be always
1290             // partial overlap of loads with previous stores, this leads to failure in
1291             // store-to-load-forwarding -> vectorization not profitable.
1292             a[i+0]++;
1293             a[i+1]++;
1294             a[i+2]++;
1295             a[i+3]++;
1296             a[i+4]++;
1297             a[i+5]++;
1298             a[i+6]++;
1299             a[i+7]++;
1300             a[i+8]++;
1301             a[i+9]++;
1302             a[i+10]++;
1303             a[i+11]++;
1304             a[i+12]++;
1305             a[i+13]++;
1306             a[i+14]++;
1307             a[i+15]++;
1308         }
             // The modified array is returned to the caller.
1309         return new Object[]{ a };
1310     }
1311 
1312     @Test
1313     @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
1314                   IRNode.ADD_VB, "= 0",
1315                   IRNode.STORE_VECTOR, "= 0"},
1316         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
1317         applyIfPlatform = {"64-bit", "true"},
1318         applyIf = {"AlignVector", "false"})
1319     @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
1320                   IRNode.ADD_VB, "= 0",
1321                   IRNode.STORE_VECTOR, "= 0"},
1322         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
1323         applyIfPlatform = {"64-bit", "true"},
1324         applyIf = {"AlignVector", "true"})
1325     static Object[] test14bB(byte[] a) {
             // Single byte array, stride 3: every iteration increments the 16 adjacent elements
             // i+0..i+15, so consecutive iterations touch overlapping ranges.
             // Both IR rules (AlignVector=false and AlignVector=true) expect no vector
             // load/add/store at all.
1326         // non-power-of-2 stride
1327         for (int i = 0; i < RANGE-20; i+=3) {
1328             // Since the stride is shorter than the vector length, there will be always
1329             // partial overlap of loads with previous stores, this leads to failure in
1330             // store-to-load-forwarding -> vectorization not profitable.
1331             a[i+0]++;
1332             a[i+1]++;
1333             a[i+2]++;
1334             a[i+3]++;
1335             a[i+4]++;
1336             a[i+5]++;
1337             a[i+6]++;
1338             a[i+7]++;
1339             a[i+8]++;
1340             a[i+9]++;
1341             a[i+10]++;
1342             a[i+11]++;
1343             a[i+12]++;
1344             a[i+13]++;
1345             a[i+14]++;
1346             a[i+15]++;
1347         }
             // The modified array is returned to the caller.
1348         return new Object[]{ a };
1349     }
1350 
1351     @Test
1352     @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
1353                   IRNode.ADD_VB, "= 0",
1354                   IRNode.STORE_VECTOR, "= 0"},
1355         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
1356         applyIfPlatform = {"64-bit", "true"},
1357         applyIf = {"AlignVector", "false"})
1358     @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
1359                   IRNode.ADD_VB, "= 0",
1360                   IRNode.STORE_VECTOR, "= 0"},
1361         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
1362         applyIfPlatform = {"64-bit", "true"},
1363         applyIf = {"AlignVector", "true"})
         // Byte-array increment with stride 5: each iteration increments the 16 bytes
         // a[i+0] .. a[i+15], so a[i+5] .. a[i+15] are incremented again by the next
         // iteration. Both IR rules above expect zero byte-vector load/add/store
         // nodes, i.e. no vectorization with AlignVector either off or on.
1364     static Object[] test14cB(byte[] a) {
1365         // non-power-of-2 stride
1366         for (int i = 0; i < RANGE-20; i+=5) {
1367             // Since the stride is shorter than the vector length, there will always be
1368             // a partial overlap of loads with previous stores. This leads to failure in
1369             // store-to-load-forwarding -> vectorization not profitable.
1370             a[i+0]++;
1371             a[i+1]++;
1372             a[i+2]++;
1373             a[i+3]++;
1374             a[i+4]++;
1375             a[i+5]++;
1376             a[i+6]++;
1377             a[i+7]++;
1378             a[i+8]++;
1379             a[i+9]++;
1380             a[i+10]++;
1381             a[i+11]++;
1382             a[i+12]++;
1383             a[i+13]++;
1384             a[i+14]++;
1385             a[i+15]++;
1386         }
1387         return new Object[]{ a };
1388     }
1389 
1390     @Test
1391     @IR(counts = {IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE + "min(max_byte, 8)", "> 0",
1392                   IRNode.ADD_VB,        IRNode.VECTOR_SIZE + "min(max_byte, 8)", "> 0",
1393                   IRNode.STORE_VECTOR,                                           "> 0"},
1394         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
1395         applyIfPlatform = {"64-bit", "true"},
1396         applyIf = {"AlignVector", "false"})
1397     @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
1398                   IRNode.ADD_VB, "= 0",
1399                   IRNode.STORE_VECTOR, "= 0"},
1400         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
1401         applyIfPlatform = {"64-bit", "true"},
1402         applyIf = {"AlignVector", "true"})
         // Eight consecutive byte increments a[i+0] .. a[i+7] per iteration with
         // stride 9: a[i+8] is skipped, so the ranges written by consecutive
         // iterations do not overlap.
         // First rule (AlignVector off): expects byte vector nodes of size
         // min(max_byte, 8). Second rule (AlignVector on): expects no vector nodes.
         // NOTE(review): presumably a 9-byte stride cannot keep the vector accesses
         // aligned across iterations -- confirm against the SuperWord alignment logic.
1403     static Object[] test14dB(byte[] a) {
1404         // non-power-of-2 stride
1405         for (int i = 0; i < RANGE-20; i+=9) {
1406             a[i+0]++;
1407             a[i+1]++;
1408             a[i+2]++;
1409             a[i+3]++;
1410             a[i+4]++;
1411             a[i+5]++;
1412             a[i+6]++;
1413             a[i+7]++;
1414         }
1415         return new Object[]{ a };
1416     }
1417 
1418     @Test
1419     @IR(counts = {IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE + "min(max_byte, 8)", "> 0",
1420                   IRNode.ADD_VB,        IRNode.VECTOR_SIZE + "min(max_byte, 8)", "> 0",
1421                   IRNode.STORE_VECTOR,                                           "> 0"},
1422         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
1423         applyIfPlatform = {"64-bit", "true"},
1424         applyIf = {"AlignVector", "false"})
1425     @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
1426                   IRNode.ADD_VB, "= 0",
1427                   IRNode.STORE_VECTOR, "= 0"},
1428         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
1429         applyIfPlatform = {"64-bit", "true"},
1430         applyIf = {"AlignVector", "true"})
         // Eight consecutive byte increments a[i+0] .. a[i+7] per iteration with
         // stride 11: a[i+8] .. a[i+10] are skipped, so the ranges written by
         // consecutive iterations do not overlap.
         // First rule (AlignVector off): expects byte vector nodes of size
         // min(max_byte, 8). Second rule (AlignVector on): expects no vector nodes.
         // NOTE(review): presumably an 11-byte stride cannot keep the vector accesses
         // aligned across iterations -- confirm against the SuperWord alignment logic.
1431     static Object[] test14eB(byte[] a) {
1432         // non-power-of-2 stride
1433         for (int i = 0; i < RANGE-32; i+=11) {
1434             a[i+0]++;
1435             a[i+1]++;
1436             a[i+2]++;
1437             a[i+3]++;
1438             a[i+4]++;
1439             a[i+5]++;
1440             a[i+6]++;
1441             a[i+7]++;
1442         }
1443         return new Object[]{ a };
1444     }
1445 
1446     @Test
1447     @IR(counts = {IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE + "min(max_byte, 8)", "> 0",
1448                   IRNode.ADD_VB,        IRNode.VECTOR_SIZE + "min(max_byte, 8)", "> 0",
1449                   IRNode.STORE_VECTOR,                                           "> 0"},
1450         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
1451         applyIfPlatform = {"64-bit", "true"},
1452         applyIf = {"AlignVector", "false"})
1453     @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
1454                   IRNode.ADD_VB, "= 0",
1455                   IRNode.STORE_VECTOR, "= 0"},
1456         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
1457         applyIfPlatform = {"64-bit", "true"},
1458         applyIf = {"AlignVector", "true"})
         // Eight consecutive byte increments a[i+0] .. a[i+7] per iteration with
         // stride 12: a[i+8] .. a[i+11] are skipped, so the ranges written by
         // consecutive iterations do not overlap.
         // First rule (AlignVector off): expects byte vector nodes of size
         // min(max_byte, 8). Second rule (AlignVector on): expects no vector nodes.
         // NOTE(review): 12 is a multiple of 4 but not of 8; presumably that is why
         // 8-byte vectors cannot stay aligned -- confirm.
1459     static Object[] test14fB(byte[] a) {
1460         // non-power-of-2 stride
1461         for (int i = 0; i < RANGE-40; i+=12) {
1462             a[i+0]++;
1463             a[i+1]++;
1464             a[i+2]++;
1465             a[i+3]++;
1466             a[i+4]++;
1467             a[i+5]++;
1468             a[i+6]++;
1469             a[i+7]++;
1470         }
1471         return new Object[]{ a };
1472     }
1473 
1474     @Test
1475     // IR rules difficult because of modulo wrapping with offset after peeling.
         // Increments the 16 bytes a[53*i+0] .. a[53*i+15] per iteration. The scale 53
         // is larger than 16, so consecutive iterations write disjoint ranges.
         // No @IR rule is attached; only the returned array is available for checking.
1476     static Object[] test15aB(byte[] a) {
1477         // non-power-of-2 scale
1478         for (int i = 0; i < RANGE/64-20; i++) {
1479             a[53*i+0]++;
1480             a[53*i+1]++;
1481             a[53*i+2]++;
1482             a[53*i+3]++;
1483             a[53*i+4]++;
1484             a[53*i+5]++;
1485             a[53*i+6]++;
1486             a[53*i+7]++;
1487             a[53*i+8]++;
1488             a[53*i+9]++;
1489             a[53*i+10]++;
1490             a[53*i+11]++;
1491             a[53*i+12]++;
1492             a[53*i+13]++;
1493             a[53*i+14]++;
1494             a[53*i+15]++;
1495         }
1496         return new Object[]{ a };
1497     }
1498 
1499     @Test
1500     // IR rules difficult because of modulo wrapping with offset after peeling.
         // Increments the 16 bytes a[25*i+0] .. a[25*i+15] per iteration. The scale 25
         // is larger than 16, so consecutive iterations write disjoint ranges.
         // No @IR rule is attached; only the returned array is available for checking.
1501     static Object[] test15bB(byte[] a) {
1502         // non-power-of-2 scale
1503         for (int i = 0; i < RANGE/64-20; i++) {
1504             a[25*i+0]++;
1505             a[25*i+1]++;
1506             a[25*i+2]++;
1507             a[25*i+3]++;
1508             a[25*i+4]++;
1509             a[25*i+5]++;
1510             a[25*i+6]++;
1511             a[25*i+7]++;
1512             a[25*i+8]++;
1513             a[25*i+9]++;
1514             a[25*i+10]++;
1515             a[25*i+11]++;
1516             a[25*i+12]++;
1517             a[25*i+13]++;
1518             a[25*i+14]++;
1519             a[25*i+15]++;
1520         }
1521         return new Object[]{ a };
1522     }
1523 
1524     @Test
1525     // IR rules difficult because of modulo wrapping with offset after peeling.
         // Increments the 16 bytes a[19*i+0] .. a[19*i+15] per iteration. The scale 19
         // is larger than 16, so consecutive iterations write disjoint ranges.
         // No @IR rule is attached; only the returned array is available for checking.
1526     static Object[] test15cB(byte[] a) {
1527         // non-power-of-2 scale
1528         for (int i = 0; i < RANGE/64-20; i++) {
1529             a[19*i+0]++;
1530             a[19*i+1]++;
1531             a[19*i+2]++;
1532             a[19*i+3]++;
1533             a[19*i+4]++;
1534             a[19*i+5]++;
1535             a[19*i+6]++;
1536             a[19*i+7]++;
1537             a[19*i+8]++;
1538             a[19*i+9]++;
1539             a[19*i+10]++;
1540             a[19*i+11]++;
1541             a[19*i+12]++;
1542             a[19*i+13]++;
1543             a[19*i+14]++;
1544             a[19*i+15]++;
1545         }
1546         return new Object[]{ a };
1547     }
1548 
1549     @Test
         // Per iteration: 15 byte increments a[2*i+0] .. a[2*i+14] and 4 short
         // increments b[2*i+0] .. b[2*i+3]. The index advances by 2 per iteration, so
         // consecutive iterations touch overlapping ranges in both arrays.
         // No @IR rule is attached. Returns both arrays wrapped in an Object[].
1550     static Object[] test16a(byte[] a, short[] b) {
1551         // infinite loop issues
             // NOTE(review): presumably refers to a past compiler problem that this
             // access pattern triggered -- confirm against the bug history.
1552         for (int i = 0; i < RANGE/2-20; i++) {
1553             a[2*i+0]++;
1554             a[2*i+1]++;
1555             a[2*i+2]++;
1556             a[2*i+3]++;
1557             a[2*i+4]++;
1558             a[2*i+5]++;
1559             a[2*i+6]++;
1560             a[2*i+7]++;
1561             a[2*i+8]++;
1562             a[2*i+9]++;
1563             a[2*i+10]++;
1564             a[2*i+11]++;
1565             a[2*i+12]++;
1566             a[2*i+13]++;
1567             a[2*i+14]++;
1568 
1569             b[2*i+0]++;
1570             b[2*i+1]++;
1571             b[2*i+2]++;
1572             b[2*i+3]++;
1573         }
1574         return new Object[]{ a, b };
1575     }
1576 
1577     @Test
         // Per iteration: 15 byte increments a[2*i+0] .. a[2*i+14]. The index advances
         // by 2 per iteration, so consecutive iterations touch overlapping ranges.
         // No @IR rule is attached. Returns the array wrapped in an Object[].
1578     static Object[] test16b(byte[] a) {
1579         // infinite loop issues
             // NOTE(review): presumably refers to a past compiler problem that this
             // access pattern triggered -- confirm against the bug history.
1580         for (int i = 0; i < RANGE/2-20; i++) {
1581             a[2*i+0]++;
1582             a[2*i+1]++;
1583             a[2*i+2]++;
1584             a[2*i+3]++;
1585             a[2*i+4]++;
1586             a[2*i+5]++;
1587             a[2*i+6]++;
1588             a[2*i+7]++;
1589             a[2*i+8]++;
1590             a[2*i+9]++;
1591             a[2*i+10]++;
1592             a[2*i+11]++;
1593             a[2*i+12]++;
1594             a[2*i+13]++;
1595             a[2*i+14]++;
1596         }
1597         return new Object[]{ a };
1598     }
1599 
1600     @Test
1601     @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0",
1602                   IRNode.ADD_VL, "> 0",
1603                   IRNode.STORE_VECTOR, "> 0"},
1604         applyIfPlatform = {"64-bit", "true"},
1605         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
         // Adds 1 to every long of the array through Unsafe unaligned accessors instead
         // of plain array indexing. The byte address is the long-array base offset plus
         // 8 * i, i.e. it lands exactly on element i.
         // The IR rule expects long vector load/add/store nodes and carries no
         // AlignVector condition, so it is checked for either setting of that flag.
1606     static Object[] test17a(long[] a) {
1607         // Unsafe: vectorizes with profiling (not xcomp)
1608         for (int i = 0; i < RANGE; i++) {
1609             long adr = UNSAFE.ARRAY_LONG_BASE_OFFSET + 8L * i;
1610             long v = UNSAFE.getLongUnaligned(a, adr);
1611             UNSAFE.putLongUnaligned(a, adr, v + 1);
1612         }
1613         return new Object[]{ a };
1614     }
1615 
1616     @Test
1617     // Difficult to write good IR rule. Modulo calculus overflow can create non-power-of-2 packs.
         // Reads and rewrites 8-byte values through Unsafe at the long-array base offset
         // plus 8 * i + 1, i.e. every access is shifted one byte past an element
         // boundary and straddles elements i and i+1. The loop stops at RANGE-1 so that
         // the shifted access has a following element to spill into.
1618     static Object[] test17b(long[] a) {
1619         // Not alignable
1620         for (int i = 0; i < RANGE-1; i++) {
1621             long adr = UNSAFE.ARRAY_LONG_BASE_OFFSET + 8L * i + 1;
1622             long v = UNSAFE.getLongUnaligned(a, adr);
1623             UNSAFE.putLongUnaligned(a, adr, v + 1);
1624         }
1625         return new Object[]{ a };
1626     }
1627 
1628     @Test
1629     @IR(counts = {IRNode.LOAD_VECTOR_L, IRNode.VECTOR_SIZE_2, "> 0",
1630                   IRNode.ADD_VL,        IRNode.VECTOR_SIZE_2, "> 0",
1631                   IRNode.STORE_VECTOR, "> 0"},
1632         applyIf = {"MaxVectorSize", ">=32"},
1633         applyIfPlatform = {"64-bit", "true"},
1634         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
         // Steps through the long array four elements at a time and, via Unsafe,
         // increments only the first two of each group (byte offsets +0 and +8 from
         // the address of element i). Both loads are performed before either store.
         // The IR rule expects 2-element long vectors when MaxVectorSize >= 32; it has
         // no AlignVector condition.
1635     static Object[] test17c(long[] a) {
1636         // Unsafe: aligned vectorizes
1637         for (int i = 0; i < RANGE-1; i+=4) {
1638             long adr = UNSAFE.ARRAY_LONG_BASE_OFFSET + 8L * i;
1639             long v0 = UNSAFE.getLongUnaligned(a, adr + 0);
1640             long v1 = UNSAFE.getLongUnaligned(a, adr + 8);
1641             UNSAFE.putLongUnaligned(a, adr + 0, v0 + 1);
1642             UNSAFE.putLongUnaligned(a, adr + 8, v1 + 1);
1643         }
1644         return new Object[]{ a };
1645     }
1646 
1647     @Test
1648     @IR(counts = {IRNode.LOAD_VECTOR_L, IRNode.VECTOR_SIZE_2, "> 0",
1649                   IRNode.ADD_VL,        IRNode.VECTOR_SIZE_2, "> 0",
1650                   IRNode.STORE_VECTOR, "> 0"},
1651         applyIfCPUFeatureOr = {"avx512", "true", "asimd", "true"},
1652         applyIfPlatform = {"64-bit", "true"},
1653         applyIfAnd = {"AlignVector", "false", "MaxVectorSize", ">=64"})
1654     // Ensure vector width is large enough to fit 64 byte for longs:
1655     // The offsets are: 25, 33, 57, 65
1656     // In modulo 32:    25,  1, 25,  1  -> does not vectorize
1657     // In modulo 64:    25, 33, 57,  1  -> at least first pair vectorizes
1658     // This problem is because we compute modulo vector width in memory_alignment.
1659     @IR(counts = {IRNode.LOAD_VECTOR_L, "= 0",
1660                   IRNode.ADD_VL, "= 0",
1661                   IRNode.STORE_VECTOR, "= 0"},
1662         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
1663         applyIfPlatform = {"64-bit", "true"},
1664         applyIf = {"AlignVector", "true"})
         // Steps through the long array four elements at a time and, via Unsafe,
         // rewrites two 8-byte values per group at byte offsets +0 and +8 from an
         // address that is shifted one byte past element i, so every access straddles
         // two neighbouring elements.
         // First rule: with AlignVector off and MaxVectorSize >= 64 (avx512 or asimd),
         // 2-element long vectors are expected. Second rule: with AlignVector on, no
         // vector nodes are expected.
1665     static Object[] test17d(long[] a) {
1666         // Not alignable
1667         for (int i = 0; i < RANGE-1; i+=4) {
1668             long adr = UNSAFE.ARRAY_LONG_BASE_OFFSET + 8L * i + 1;
1669             long v0 = UNSAFE.getLongUnaligned(a, adr + 0);
1670             long v1 = UNSAFE.getLongUnaligned(a, adr + 8);
1671             UNSAFE.putLongUnaligned(a, adr + 0, v0 + 1);
1672             UNSAFE.putLongUnaligned(a, adr + 8, v1 + 1);
1673         }
1674         return new Object[]{ a };
1675     }
1676 
1677     @Test
         // Mixes stores whose address does not depend on the loop variable (a[0] and
         // a[1], constant indices) with a store that does (b[i]). The constant-index
         // byte stores are placed before and after the int store in program order.
         // No @IR rule is attached. Returns both arrays wrapped in an Object[].
1678     static Object[] test18a(byte[] a, int[] b) {
1679         // scale = 0  -->  no iv
1680         for (int i = 0; i < RANGE; i++) {
1681             a[0] = 1;
1682             b[i] = 2;
1683             a[1] = 1;
1684         }
1685         return new Object[]{ a, b };
1686     }
1687 
1688     @Test
         // Mixes stores whose address does not depend on the loop variable (a[1] and
         // a[2], constant indices) with a store that does (b[i]). The constant-index
         // byte stores start at index 1 rather than 0.
         // No @IR rule is attached. Returns both arrays wrapped in an Object[].
1689     static Object[] test18b(byte[] a, int[] b) {
1690         // scale = 0  -->  no iv
1691         for (int i = 0; i < RANGE; i++) {
1692             a[1] = 1;
1693             b[i] = 2;
1694             a[2] = 1;
1695         }
1696         return new Object[]{ a, b };
1697     }
1698 
1699     @Test
         // Copies ints from b to a with a down-counting loop variable: i runs from 5000
         // down to 1, so the index RANGE_FINAL - i ascends from RANGE_FINAL - 5000 to
         // RANGE_FINAL - 1.
         // Assumes RANGE_FINAL >= 5000 and both arrays hold at least RANGE_FINAL
         // elements -- TODO confirm against the constant's definition and the callers.
         // No @IR rule is attached. Returns both arrays wrapped in an Object[].
1700     static Object[] test19(int[] a, int[] b) {
1701         for (int i = 5000; i > 0; i--) {
1702             a[RANGE_FINAL - i] = b[RANGE_FINAL - i];
1703         }
1704         return new Object[]{ a, b };
1705     }
1706 
1707     @Test
         // Increments the four bytes a[2*i+30] .. a[2*i+33] per iteration, with the
         // loop variable starting at 1 rather than 0. The index advances by 2 per
         // iteration, so consecutive iterations overlap by two bytes.
         // No @IR rule is attached. Returns the array wrapped in an Object[].
1708     static Object[] test20(byte[] a) {
1709         // Example where it is easy to pass alignment check,
1710         // but used to fail the alignment calculation
1711         for (int i = 1; i < RANGE/2-50; i++) {
1712             a[2*i+0+30]++;
1713             a[2*i+1+30]++;
1714             a[2*i+2+30]++;
1715             a[2*i+3+30]++;
1716         }
1717         return new Object[]{ a };
1718     }
1719 }