1 /*
   2  * Copyright (c) 2024, 2025, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  */
  23 
  24 package compiler.loopopts.superword;
  25 
  26 import compiler.lib.ir_framework.*;
  27 import jdk.test.lib.Utils;
  28 import jdk.test.whitebox.WhiteBox;
  29 import jdk.internal.misc.Unsafe;
  30 import java.lang.reflect.Array;
  31 import java.util.Map;
  32 import java.util.HashMap;
  33 import java.util.Random;
  34 import java.nio.ByteOrder;
  35 
  36 /*
  37  * @test id=NoAlignVector
  38  * @bug 8310190
  39  * @summary Test AlignVector with various loop init, stride, scale, invar, etc.
  40  * @modules java.base/jdk.internal.misc
  41  * @library /test/lib /
  42  * @run driver compiler.loopopts.superword.TestAlignVector NoAlignVector
  43  */
  44 
  45 /*
  46  * @test id=AlignVector
  47  * @bug 8310190
  48  * @summary Test AlignVector with various loop init, stride, scale, invar, etc.
  49  * @modules java.base/jdk.internal.misc
  50  * @library /test/lib /
  51  * @run driver compiler.loopopts.superword.TestAlignVector AlignVector
  52  */
  53 
  54 /*
  55  * @test id=VerifyAlignVector
  56  * @bug 8310190
  57  * @summary Test AlignVector with various loop init, stride, scale, invar, etc.
  58  * @modules java.base/jdk.internal.misc
  59  * @library /test/lib /
  60  * @run driver compiler.loopopts.superword.TestAlignVector VerifyAlignVector
  61  */
  62 
  63 /*
  64  * @test id=NoAlignVector-COH
  65  * @bug 8310190
  66  * @summary Test AlignVector with various loop init, stride, scale, invar, etc.
  67  * @modules java.base/jdk.internal.misc
  68  * @library /test/lib /
  69  * @run driver compiler.loopopts.superword.TestAlignVector NoAlignVector-COH
  70  */
  71 
  72 /*
  73  * @test id=VerifyAlignVector-COH
  74  * @bug 8310190
  75  * @summary Test AlignVector with various loop init, stride, scale, invar, etc.
  76  * @modules java.base/jdk.internal.misc
  77  * @library /test/lib /
  78  * @run driver compiler.loopopts.superword.TestAlignVector VerifyAlignVector-COH
  79  */
  80 
  81 public class TestAlignVector {
    // Length of every generated input array; also the loop bound used by the test kernels.
    static int RANGE = 1024*8;
    // NOTE(review): despite its name this field is not declared final, and no use of it is
    // visible in this part of the file - confirm the intent before changing it.
    static int RANGE_FINAL = 1024*8;
    // NOTE(review): not used in the visible part of the file; presumably needed by later tests - TODO confirm.
    private static final Unsafe UNSAFE = Unsafe.getUnsafe();
    // Source of the random input data filled in by generateB/S/I/L.
    private static final Random RANDOM = Utils.getRandomInstance();

    // Inputs: for each element type an "a" array, a "b" array and a mask value "m".
    // The arrays are filled once in the constructor; every test invocation receives clones.
    byte[] aB;
    byte[] bB;
    byte mB = (byte)31;
    short[] aS;
    short[] bS;
    short mS = (short)0xF0F0;
    int[] aI;
    int[] bI;
    int mI = 0xF0F0F0F0;
    long[] aL;
    long[] bL;
    long mL = 0xF0F0F0F0F0F0F0F0L;

    // List of tests: maps a name to a closure that invokes the test method on fresh input clones
    Map<String,TestFunction> tests = new HashMap<String,TestFunction>();

    // List of gold, the results from the first run before compilation (keyed by the same names as "tests")
    Map<String,Object[]> golds = new HashMap<String,Object[]>();
 106 
    // A registered test: runs one test method and returns the arrays it produced,
    // so that verify() can compare them against the gold results.
    interface TestFunction {
        Object[] run();
    }
 110 
 111     public static void main(String[] args) {
 112         TestFramework framework = new TestFramework(TestAlignVector.class);
 113         framework.addFlags("--add-modules", "java.base", "--add-exports", "java.base/jdk.internal.misc=ALL-UNNAMED",
 114                            "-XX:+IgnoreUnrecognizedVMOptions", "-XX:LoopUnrollLimit=250");
 115 
 116         switch (args[0]) {
 117             case "NoAlignVector"         -> { framework.addFlags("-XX:-UseCompactObjectHeaders", "-XX:-AlignVector"); }
 118             case "AlignVector"           -> { framework.addFlags("-XX:-UseCompactObjectHeaders", "-XX:+AlignVector"); }
 119             case "VerifyAlignVector"     -> { framework.addFlags("-XX:-UseCompactObjectHeaders", "-XX:+AlignVector", "-XX:+IgnoreUnrecognizedVMOptions", "-XX:+VerifyAlignVector"); }
 120             case "NoAlignVector-COH"     -> { framework.addFlags("-XX:+UseCompactObjectHeaders", "-XX:-AlignVector"); }
 121             case "VerifyAlignVector-COH" -> { framework.addFlags("-XX:+UseCompactObjectHeaders", "-XX:+AlignVector", "-XX:+IgnoreUnrecognizedVMOptions", "-XX:+VerifyAlignVector"); }
 122             default -> { throw new RuntimeException("Test argument not recognized: " + args[0]); }
 123         }
 124         framework.start();
 125     }
 126 
 127     public TestAlignVector() {
 128         // Generate input once
 129         aB = generateB();
 130         bB = generateB();
 131         aS = generateS();
 132         bS = generateS();
 133         aI = generateI();
 134         bI = generateI();
 135         aL = generateL();
 136         bL = generateL();
 137 
 138         // Add all tests to list
 139         tests.put("test0",       () -> { return test0(aB.clone(), bB.clone(), mB); });
 140         tests.put("test1a",      () -> { return test1a(aB.clone(), bB.clone(), mB); });
 141         tests.put("test1b",      () -> { return test1b(aB.clone(), bB.clone(), mB); });
 142         tests.put("test2",       () -> { return test2(aB.clone(), bB.clone(), mB); });
 143         tests.put("test3",       () -> { return test3(aB.clone(), bB.clone(), mB); });
 144         tests.put("test4",       () -> { return test4(aB.clone(), bB.clone(), mB); });
 145         tests.put("test5",       () -> { return test5(aB.clone(), bB.clone(), mB, 0); });
 146         tests.put("test6",       () -> { return test6(aB.clone(), bB.clone(), mB); });
 147         tests.put("test7",       () -> { return test7(aS.clone(), bS.clone(), mS); });
 148         tests.put("test8",       () -> { return test8(aB.clone(), bB.clone(), mB, 0); });
 149         tests.put("test8",       () -> { return test8(aB.clone(), bB.clone(), mB, 1); });
 150         tests.put("test9",       () -> { return test9(aB.clone(), bB.clone(), mB); });
 151 
 152         tests.put("test10a",     () -> { return test10a(aB.clone(), bB.clone(), mB); });
 153         tests.put("test10b",     () -> { return test10b(aB.clone(), bB.clone(), mB); });
 154         tests.put("test10c",     () -> { return test10c(aS.clone(), bS.clone(), mS); });
 155         tests.put("test10d",     () -> { return test10d(aS.clone(), bS.clone(), mS); });
 156         tests.put("test10e",     () -> { return test10e(aS.clone(), bS.clone(), mS); });
 157 
 158         tests.put("test11aB",    () -> { return test11aB(aB.clone(), bB.clone(), mB); });
 159         tests.put("test11aS",    () -> { return test11aS(aS.clone(), bS.clone(), mS); });
 160         tests.put("test11aI",    () -> { return test11aI(aI.clone(), bI.clone(), mI); });
 161         tests.put("test11aL",    () -> { return test11aL(aL.clone(), bL.clone(), mL); });
 162 
 163         tests.put("test11bB",    () -> { return test11bB(aB.clone(), bB.clone(), mB); });
 164         tests.put("test11bS",    () -> { return test11bS(aS.clone(), bS.clone(), mS); });
 165         tests.put("test11bI",    () -> { return test11bI(aI.clone(), bI.clone(), mI); });
 166         tests.put("test11bL",    () -> { return test11bL(aL.clone(), bL.clone(), mL); });
 167 
 168         tests.put("test11cB",    () -> { return test11cB(aB.clone(), bB.clone(), mB); });
 169         tests.put("test11cS",    () -> { return test11cS(aS.clone(), bS.clone(), mS); });
 170         tests.put("test11cI",    () -> { return test11cI(aI.clone(), bI.clone(), mI); });
 171         tests.put("test11cL",    () -> { return test11cL(aL.clone(), bL.clone(), mL); });
 172 
 173         tests.put("test11dB",    () -> { return test11dB(aB.clone(), bB.clone(), mB, 0); });
 174         tests.put("test11dS",    () -> { return test11dS(aS.clone(), bS.clone(), mS, 0); });
 175         tests.put("test11dI",    () -> { return test11dI(aI.clone(), bI.clone(), mI, 0); });
 176         tests.put("test11dL",    () -> { return test11dL(aL.clone(), bL.clone(), mL, 0); });
 177 
 178         tests.put("test12",      () -> { return test12(aB.clone(), bB.clone(), mB); });
 179 
 180         tests.put("test13aIL",   () -> { return test13aIL(aI.clone(), aL.clone()); });
 181         tests.put("test13aIB",   () -> { return test13aIB(aI.clone(), aB.clone()); });
 182         tests.put("test13aIS",   () -> { return test13aIS(aI.clone(), aS.clone()); });
 183         tests.put("test13aBSIL", () -> { return test13aBSIL(aB.clone(), aS.clone(), aI.clone(), aL.clone()); });
 184 
 185         tests.put("test13bIL",   () -> { return test13bIL(aI.clone(), aL.clone()); });
 186         tests.put("test13bIB",   () -> { return test13bIB(aI.clone(), aB.clone()); });
 187         tests.put("test13bIS",   () -> { return test13bIS(aI.clone(), aS.clone()); });
 188         tests.put("test13bBSIL", () -> { return test13bBSIL(aB.clone(), aS.clone(), aI.clone(), aL.clone()); });
 189 
 190         tests.put("test14aB",    () -> { return test14aB(aB.clone()); });
 191         tests.put("test14bB",    () -> { return test14bB(aB.clone()); });
 192         tests.put("test14cB",    () -> { return test14cB(aB.clone()); });
 193         tests.put("test14dB",    () -> { return test14dB(aB.clone()); });
 194         tests.put("test14eB",    () -> { return test14eB(aB.clone()); });
 195         tests.put("test14fB",    () -> { return test14fB(aB.clone()); });
 196 
 197         tests.put("test15aB",    () -> { return test15aB(aB.clone()); });
 198         tests.put("test15bB",    () -> { return test15bB(aB.clone()); });
 199         tests.put("test15cB",    () -> { return test15cB(aB.clone()); });
 200 
 201         tests.put("test16a",     () -> { return test16a(aB.clone(), aS.clone()); });
 202         tests.put("test16b",     () -> { return test16b(aB.clone()); });
 203 
 204         tests.put("test17a",     () -> { return test17a(aL.clone()); });
 205         tests.put("test17b",     () -> { return test17b(aL.clone()); });
 206         tests.put("test17c",     () -> { return test17c(aL.clone()); });
 207         tests.put("test17d",     () -> { return test17d(aL.clone()); });
 208 
 209         tests.put("test18a",     () -> { return test18a(aB.clone(), aI.clone()); });
 210         tests.put("test18b",     () -> { return test18b(aB.clone(), aI.clone()); });
 211 
 212         tests.put("test19",      () -> { return test19(aI.clone(), bI.clone()); });
 213         tests.put("test20",      () -> { return test20(aB.clone()); });
 214 
 215         // Compute gold value for all test methods before compilation
 216         for (Map.Entry<String,TestFunction> entry : tests.entrySet()) {
 217             String name = entry.getKey();
 218             TestFunction test = entry.getValue();
 219             Object[] gold = test.run();
 220             golds.put(name, gold);
 221         }
 222     }
 223 
 224     @Warmup(100)
 225     @Run(test = {"test0",
 226                  "test1a",
 227                  "test1b",
 228                  "test2",
 229                  "test3",
 230                  "test4",
 231                  "test5",
 232                  "test6",
 233                  "test7",
 234                  "test8",
 235                  "test9",
 236                  "test10a",
 237                  "test10b",
 238                  "test10c",
 239                  "test10d",
 240                  "test10e",
 241                  "test11aB",
 242                  "test11aS",
 243                  "test11aI",
 244                  "test11aL",
 245                  "test11bB",
 246                  "test11bS",
 247                  "test11bI",
 248                  "test11bL",
 249                  "test11cB",
 250                  "test11cS",
 251                  "test11cI",
 252                  "test11cL",
 253                  "test11dB",
 254                  "test11dS",
 255                  "test11dI",
 256                  "test11dL",
 257                  "test12",
 258                  "test13aIL",
 259                  "test13aIB",
 260                  "test13aIS",
 261                  "test13aBSIL",
 262                  "test13bIL",
 263                  "test13bIB",
 264                  "test13bIS",
 265                  "test13bBSIL",
 266                  "test14aB",
 267                  "test14bB",
 268                  "test14cB",
 269                  "test14dB",
 270                  "test14eB",
 271                  "test14fB",
 272                  "test15aB",
 273                  "test15bB",
 274                  "test15cB",
 275                  "test16a",
 276                  "test16b",
 277                  "test17a",
 278                  "test17b",
 279                  "test17c",
 280                  "test17d",
 281                  "test18a",
 282                  "test18b",
 283                  "test19",
 284                  "test20"})
 285     public void runTests() {
 286         for (Map.Entry<String,TestFunction> entry : tests.entrySet()) {
 287             String name = entry.getKey();
 288             TestFunction test = entry.getValue();
 289             // Recall gold value from before compilation
 290             Object[] gold = golds.get(name);
 291             // Compute new result
 292             Object[] result = test.run();
 293             // Compare gold and new result
 294             verify(name, gold, result);
 295         }
 296     }
 297 
 298     static byte[] generateB() {
 299         byte[] a = new byte[RANGE];
 300         for (int i = 0; i < a.length; i++) {
 301             a[i] = (byte)RANDOM.nextInt();
 302         }
 303         return a;
 304     }
 305 
 306     static short[] generateS() {
 307         short[] a = new short[RANGE];
 308         for (int i = 0; i < a.length; i++) {
 309             a[i] = (short)RANDOM.nextInt();
 310         }
 311         return a;
 312     }
 313 
 314     static int[] generateI() {
 315         int[] a = new int[RANGE];
 316         for (int i = 0; i < a.length; i++) {
 317             a[i] = RANDOM.nextInt();
 318         }
 319         return a;
 320     }
 321 
 322     static long[] generateL() {
 323         long[] a = new long[RANGE];
 324         for (int i = 0; i < a.length; i++) {
 325             a[i] = RANDOM.nextLong();
 326         }
 327         return a;
 328     }
 329 
 330     static void verify(String name, Object[] gold, Object[] result) {
 331         if (gold.length != result.length) {
 332             throw new RuntimeException("verify " + name + ": not the same number of outputs: gold.length = " +
 333                                        gold.length + ", result.length = " + result.length);
 334         }
 335         for (int i = 0; i < gold.length; i++) {
 336             Object g = gold[i];
 337             Object r = result[i];
 338             if (g.getClass() != r.getClass() || !g.getClass().isArray() || !r.getClass().isArray()) {
 339                 throw new RuntimeException("verify " + name + ": must both be array of same type:" +
 340                                            " gold[" + i + "].getClass() = " + g.getClass().getSimpleName() +
 341                                            " result[" + i + "].getClass() = " + r.getClass().getSimpleName());
 342             }
 343             if (g == r) {
 344                 throw new RuntimeException("verify " + name + ": should be two separate arrays (with identical content):" +
 345                                            " gold[" + i + "] == result[" + i + "]");
 346             }
 347             if (Array.getLength(g) != Array.getLength(r)) {
 348                     throw new RuntimeException("verify " + name + ": arrays must have same length:" +
 349                                            " gold[" + i + "].length = " + Array.getLength(g) +
 350                                            " result[" + i + "].length = " + Array.getLength(r));
 351             }
 352             Class c = g.getClass().getComponentType();
 353             if (c == byte.class) {
 354                 verifyB(name, i, (byte[])g, (byte[])r);
 355             } else if (c == short.class) {
 356                 verifyS(name, i, (short[])g, (short[])r);
 357             } else if (c == int.class) {
 358                 verifyI(name, i, (int[])g, (int[])r);
 359             } else if (c == long.class) {
 360                 verifyL(name, i, (long[])g, (long[])r);
 361             } else {
 362                 throw new RuntimeException("verify " + name + ": array type not supported for verify:" +
 363                                        " gold[" + i + "].getClass() = " + g.getClass().getSimpleName() +
 364                                        " result[" + i + "].getClass() = " + r.getClass().getSimpleName());
 365             }
 366         }
 367     }
 368 
 369     static void verifyB(String name, int i, byte[] g, byte[] r) {
 370         for (int j = 0; j < g.length; j++) {
 371             if (g[j] != r[j]) {
 372                 throw new RuntimeException("verify " + name + ": arrays must have same content:" +
 373                                            " gold[" + i + "][" + j + "] = " + g[j] +
 374                                            " result[" + i + "][" + j + "] = " + r[j]);
 375             }
 376         }
 377     }
 378 
 379     static void verifyS(String name, int i, short[] g, short[] r) {
 380         for (int j = 0; j < g.length; j++) {
 381             if (g[j] != r[j]) {
 382                 throw new RuntimeException("verify " + name + ": arrays must have same content:" +
 383                                            " gold[" + i + "][" + j + "] = " + g[j] +
 384                                            " result[" + i + "][" + j + "] = " + r[j]);
 385             }
 386         }
 387     }
 388 
 389     static void verifyI(String name, int i, int[] g, int[] r) {
 390         for (int j = 0; j < g.length; j++) {
 391             if (g[j] != r[j]) {
 392                 throw new RuntimeException("verify " + name + ": arrays must have same content:" +
 393                                            " gold[" + i + "][" + j + "] = " + g[j] +
 394                                            " result[" + i + "][" + j + "] = " + r[j]);
 395             }
 396         }
 397     }
 398 
 399     static void verifyL(String name, int i, long[] g, long[] r) {
 400         for (int j = 0; j < g.length; j++) {
 401             if (g[j] != r[j]) {
 402                 throw new RuntimeException("verify " + name + ": arrays must have same content:" +
 403                                            " gold[" + i + "][" + j + "] = " + g[j] +
 404                                            " result[" + i + "][" + j + "] = " + r[j]);
 405             }
 406         }
 407     }
 408 
    // test0: per iteration (stride 8) the first 4 bytes a[i..i+3] are masked and stored to b.
    // The IR rule expects 4-element byte vector loads and ANDs plus at least one vector store
    // whenever MaxVectorSize >= 8; it does not depend on the AlignVector setting.
    // Returns { a, b } for comparison against the gold run.
    @Test
    @IR(counts = {IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.AND_VB,        IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.STORE_VECTOR, "> 0"},
        applyIf = {"MaxVectorSize", ">=8"},
        applyIfPlatform = {"64-bit", "true"},
        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"})
    static Object[] test0(byte[] a, byte[] b, byte mask) {
        for (int i = 0; i < RANGE; i+=8) {
            // Safe to vectorize with AlignVector
            b[i+0] = (byte)(a[i+0] & mask); // offset 0, align 0
            b[i+1] = (byte)(a[i+1] & mask);
            b[i+2] = (byte)(a[i+2] & mask);
            b[i+3] = (byte)(a[i+3] & mask);
        }
        return new Object[]{ a, b };
    }
 426 
    // test1a: per iteration (stride 8) all 8 bytes a[i..i+7] are masked and stored to b, starting at i = 0.
    // The IR rule expects vectorization unless both UseCompactObjectHeaders and AlignVector are enabled
    // (see the comment inside the annotation for the reason).
    // Returns { a, b } for comparison against the gold run.
    @Test
    @IR(counts = {IRNode.LOAD_VECTOR_B, "> 0",
                  IRNode.AND_VB, "> 0",
                  IRNode.STORE_VECTOR, "> 0"},
        applyIfOr = {"UseCompactObjectHeaders", "false", "AlignVector", "false"},
        // UNSAFE.ARRAY_BYTE_BASE_OFFSET = 16, but with compact object headers UNSAFE.ARRAY_BYTE_BASE_OFFSET=12.
        // If AlignVector=true, we need the offset to be 8-byte aligned, else the vectors are filtered out.
        applyIfPlatform = {"64-bit", "true"},
        applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true", "rvv", "true"})
    static Object[] test1a(byte[] a, byte[] b, byte mask) {
        for (int i = 0; i < RANGE; i+=8) {
            b[i+0] = (byte)(a[i+0] & mask); // adr = base + UNSAFE.ARRAY_BYTE_BASE_OFFSET + 0 + iter*8
            b[i+1] = (byte)(a[i+1] & mask);
            b[i+2] = (byte)(a[i+2] & mask);
            b[i+3] = (byte)(a[i+3] & mask);
            b[i+4] = (byte)(a[i+4] & mask);
            b[i+5] = (byte)(a[i+5] & mask);
            b[i+6] = (byte)(a[i+6] & mask);
            b[i+7] = (byte)(a[i+7] & mask);
        }
        return new Object[]{ a, b };
    }
 449 
    // test1b: same kernel as test1a, but the loop starts at i = 4 and stops before RANGE-8,
    // which shifts every access by 4 bytes.
    // The IR rule expects vectorization when UseCompactObjectHeaders is enabled or AlignVector is disabled
    // (see the comment inside the annotation for the reason).
    // Returns { a, b } for comparison against the gold run.
    @Test
    @IR(counts = {IRNode.LOAD_VECTOR_B, "> 0",
                  IRNode.AND_VB, "> 0",
                  IRNode.STORE_VECTOR, "> 0"},
        applyIfOr = {"UseCompactObjectHeaders", "true", "AlignVector", "false"},
        // UNSAFE.ARRAY_BYTE_BASE_OFFSET = 16, but with compact object headers UNSAFE.ARRAY_BYTE_BASE_OFFSET=12.
        // If AlignVector=true, we need the offset to be 8-byte aligned, else the vectors are filtered out.
        applyIfPlatform = {"64-bit", "true"},
        applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true", "rvv", "true"})
    static Object[] test1b(byte[] a, byte[] b, byte mask) {
        for (int i = 4; i < RANGE-8; i+=8) {
            b[i+0] = (byte)(a[i+0] & mask); // adr = base + UNSAFE.ARRAY_BYTE_BASE_OFFSET + 4 + iter*8
            b[i+1] = (byte)(a[i+1] & mask);
            b[i+2] = (byte)(a[i+2] & mask);
            b[i+3] = (byte)(a[i+3] & mask);
            b[i+4] = (byte)(a[i+4] & mask);
            b[i+5] = (byte)(a[i+5] & mask);
            b[i+6] = (byte)(a[i+6] & mask);
            b[i+7] = (byte)(a[i+7] & mask);
        }
        return new Object[]{ a, b };
    }
 472 
    // test2: per iteration (stride 8) the 4 bytes a[i+3..i+6] are masked and stored to b.
    // First IR rule: with AlignVector disabled and MaxVectorSize >= 8, 4-element byte vectors are expected.
    // Second IR rule: with AlignVector enabled, no vector load/AND/store nodes are expected.
    // Returns { a, b } for comparison against the gold run.
    @Test
    @IR(counts = {IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.AND_VB,        IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.STORE_VECTOR, "> 0"},
        applyIfAnd = {"AlignVector", "false", "MaxVectorSize", ">=8"},
        applyIfPlatform = {"64-bit", "true"},
        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"})
    @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
                  IRNode.AND_VB, "= 0",
                  IRNode.STORE_VECTOR, "= 0"},
        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"},
        applyIfPlatform = {"64-bit", "true"},
        applyIf = {"AlignVector", "true"})
    static Object[] test2(byte[] a, byte[] b, byte mask) {
        for (int i = 0; i < RANGE; i+=8) {
            // Cannot align with AlignVector: (3 + x * 8) % 8 = 3 for every iteration x
            b[i+3] = (byte)(a[i+3] & mask); // at alignment 3
            b[i+4] = (byte)(a[i+4] & mask);
            b[i+5] = (byte)(a[i+5] & mask);
            b[i+6] = (byte)(a[i+6] & mask);
        }
        return new Object[]{ a, b };
    }
 496 
    // test3: like test2, but with an additional single access at offset 0 before the
    // 4-byte group at offsets 3..6.
    // First IR rule: with AlignVector disabled and MaxVectorSize >= 8, 4-element byte vectors are expected.
    // Second IR rule: with AlignVector enabled, no vector load/AND/store nodes are expected.
    // Returns { a, b } for comparison against the gold run.
    @Test
    @IR(counts = {IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.AND_VB,        IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.STORE_VECTOR, "> 0"},
        applyIfAnd = {"AlignVector", "false", "MaxVectorSize", ">=8"},
        applyIfPlatform = {"64-bit", "true"},
        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"})
    @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
                  IRNode.AND_VB, "= 0",
                  IRNode.STORE_VECTOR, "= 0"},
        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"},
        applyIfPlatform = {"64-bit", "true"},
        applyIf = {"AlignVector", "true"})
    static Object[] test3(byte[] a, byte[] b, byte mask) {
        for (int i = 0; i < RANGE; i+=8) {
            // Cannot align with AlignVector: (3 + x * 8) % 8 = 3 for every iteration x

            // Problematic for AlignVector
            b[i+0] = (byte)(a[i+0] & mask); // best_memref, align 0

            b[i+3] = (byte)(a[i+3] & mask); // pack at offset 3 bytes
            b[i+4] = (byte)(a[i+4] & mask);
            b[i+5] = (byte)(a[i+5] & mask);
            b[i+6] = (byte)(a[i+6] & mask);
        }
        return new Object[]{ a, b };
    }
 524 
    // test4: each iteration covers a 16-byte window (index i*16) with two groups of accesses:
    // 4 consecutive bytes at offsets 0..3 and 8 consecutive bytes at offsets 5..12.
    // First IR rule (AlignVector disabled, MaxVectorSize >= 16): both 4-element and 8-element vectors are expected.
    // Second IR rule (AlignVector enabled, MaxVectorSize >= 16): only the 4-element vectors are expected,
    // the 8-element ones must be absent.
    // Returns { a, b } for comparison against the gold run.
    @Test
    @IR(counts = {IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE_8, "> 0",
                  IRNode.AND_VB,        IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.AND_VB,        IRNode.VECTOR_SIZE_8, "> 0",
                  IRNode.STORE_VECTOR, "> 0"},
        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"},
        applyIfPlatform = {"64-bit", "true"},
        applyIfAnd = {"AlignVector", "false", "MaxVectorSize", ">=16"})
    @IR(counts = {IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE_8, "= 0",// unaligned
                  IRNode.AND_VB,        IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.AND_VB,        IRNode.VECTOR_SIZE_8, "= 0",// unaligned
                  IRNode.STORE_VECTOR, "> 0"},
        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"},
        applyIfPlatform = {"64-bit", "true"},
        applyIfAnd = {"AlignVector", "true", "MaxVectorSize", ">=16"})
    static Object[] test4(byte[] a, byte[] b, byte mask) {
        for (int i = 0; i < RANGE/16; i++) {
            // Problematic for AlignVector
            b[i*16 + 0 ] = (byte)(a[i*16 + 0 ] & mask); // 4 pack, 0 aligned
            b[i*16 + 1 ] = (byte)(a[i*16 + 1 ] & mask);
            b[i*16 + 2 ] = (byte)(a[i*16 + 2 ] & mask);
            b[i*16 + 3 ] = (byte)(a[i*16 + 3 ] & mask);

            b[i*16 + 5 ] = (byte)(a[i*16 + 5 ] & mask); // 8 pack, 5 aligned
            b[i*16 + 6 ] = (byte)(a[i*16 + 6 ] & mask);
            b[i*16 + 7 ] = (byte)(a[i*16 + 7 ] & mask);
            b[i*16 + 8 ] = (byte)(a[i*16 + 8 ] & mask);
            b[i*16 + 9 ] = (byte)(a[i*16 + 9 ] & mask);
            b[i*16 + 10] = (byte)(a[i*16 + 10] & mask);
            b[i*16 + 11] = (byte)(a[i*16 + 11] & mask);
            b[i*16 + 12] = (byte)(a[i*16 + 12] & mask);
        }
        return new Object[]{ a, b };
    }
 561 
    // test5: same access pattern as test3 (one access at offset 0, a 4-byte group at offsets 3..6,
    // stride 8), but every index additionally contains the loop-invariant parameter inv.
    // First IR rule: with AlignVector disabled and MaxVectorSize >= 8, 4-element byte vectors are expected.
    // Second IR rule: with AlignVector enabled, no vector load/AND/store nodes are expected.
    // Returns { a, b } for comparison against the gold run.
    @Test
    @IR(counts = {IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.AND_VB,        IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.STORE_VECTOR, "> 0"},
        applyIfAnd = {"AlignVector", "false", "MaxVectorSize", ">=8"},
        applyIfPlatform = {"64-bit", "true"},
        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"})
    @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
                  IRNode.AND_VB, "= 0",
                  IRNode.STORE_VECTOR, "= 0"},
        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"},
        applyIfPlatform = {"64-bit", "true"},
        applyIf = {"AlignVector", "true"})
    static Object[] test5(byte[] a, byte[] b, byte mask, int inv) {
        for (int i = 0; i < RANGE; i+=8) {
            // Cannot align with AlignVector because of invariant
            b[i+inv+0] = (byte)(a[i+inv+0] & mask);

            b[i+inv+3] = (byte)(a[i+inv+3] & mask);
            b[i+inv+4] = (byte)(a[i+inv+4] & mask);
            b[i+inv+5] = (byte)(a[i+inv+5] & mask);
            b[i+inv+6] = (byte)(a[i+inv+6] & mask);
        }
        return new Object[]{ a, b };
    }
 587 
    // test6: the loop variable advances by 2 and is scaled by 4 in every index, so consecutive
    // iterations are 8 bytes apart. Per iteration there is one access at offset 0 and a 4-byte
    // group at offsets 3..6.
    // First IR rule: with AlignVector disabled and MaxVectorSize >= 8, 4-element byte vectors are expected.
    // Second IR rule: with AlignVector enabled, no vector load/AND/store nodes are expected.
    // Returns { a, b } for comparison against the gold run.
    @Test
    @IR(counts = {IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.AND_VB,        IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.STORE_VECTOR, "> 0"},
        applyIfAnd = {"AlignVector", "false", "MaxVectorSize", ">=8"},
        applyIfPlatform = {"64-bit", "true"},
        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"})
    @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
                  IRNode.AND_VB, "= 0",
                  IRNode.STORE_VECTOR, "= 0"},
        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"},
        applyIfPlatform = {"64-bit", "true"},
        applyIf = {"AlignVector", "true"})
    static Object[] test6(byte[] a, byte[] b, byte mask) {
        for (int i = 0; i < RANGE/8; i+=2) {
            // Cannot align with AlignVector because offset is odd
            b[i*4+0] = (byte)(a[i*4+0] & mask);

            b[i*4+3] = (byte)(a[i*4+3] & mask);
            b[i*4+4] = (byte)(a[i*4+4] & mask);
            b[i*4+5] = (byte)(a[i*4+5] & mask);
            b[i*4+6] = (byte)(a[i*4+6] & mask);
        }
        return new Object[]{ a, b };
    }
 613 
    // test7: same loop shape as test6, but operating on short arrays.
    // First IR rule: with AlignVector disabled and MaxVectorSize >= 16, 4-element short vectors are expected
    // (CPU features avx2, asimd or rvv).
    // Second IR rule: with AlignVector enabled, no vector load/AND/store nodes are expected
    // (CPU features sse4.1, asimd or rvv).
    // Returns { a, b } for comparison against the gold run.
    @Test
    @IR(counts = {IRNode.LOAD_VECTOR_S, IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.AND_VS,        IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.STORE_VECTOR, "> 0"},
        applyIfAnd = {"AlignVector", "false", "MaxVectorSize", ">=16"},
        applyIfPlatform = {"64-bit", "true"},
        applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true", "rvv", "true"})
    @IR(counts = {IRNode.LOAD_VECTOR_S, "= 0",
                  IRNode.AND_VS, "= 0",
                  IRNode.STORE_VECTOR, "= 0"},
        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"},
        applyIfPlatform = {"64-bit", "true"},
        applyIf = {"AlignVector", "true"})
    static Object[] test7(short[] a, short[] b, short mask) {
        for (int i = 0; i < RANGE/8; i+=2) {
            // Cannot align with AlignVector because offset is odd
            b[i*4+0] = (short)(a[i*4+0] & mask);

            b[i*4+3] = (short)(a[i*4+3] & mask);
            b[i*4+4] = (short)(a[i*4+4] & mask);
            b[i*4+5] = (short)(a[i*4+5] & mask);
            b[i*4+6] = (short)(a[i*4+6] & mask);
        }
        return new Object[]{ a, b };
    }
 639 
    // test8: same access pattern as test3 (one access at offset 0, a 4-byte group at offsets 3..6,
    // stride 8), but the loop starts at the parameter init instead of a constant.
    // First IR rule: with AlignVector disabled and MaxVectorSize >= 8, 4-element byte vectors are expected.
    // Second IR rule: with AlignVector enabled, no vector load/AND/store nodes are expected.
    // Returns { a, b } for comparison against the gold run.
    @Test
    @IR(counts = {IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.AND_VB,        IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.STORE_VECTOR, "> 0"},
        applyIfAnd = {"AlignVector", "false", "MaxVectorSize", ">=8"},
        applyIfPlatform = {"64-bit", "true"},
        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"})
    @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
                  IRNode.AND_VB, "= 0",
                  IRNode.STORE_VECTOR, "= 0"},
        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"},
        applyIfPlatform = {"64-bit", "true"},
        applyIf = {"AlignVector", "true"})
    static Object[] test8(byte[] a, byte[] b, byte mask, int init) {
        for (int i = init; i < RANGE; i+=8) {
            // Cannot align with AlignVector because of invariant (variable init becomes invar)
            b[i+0] = (byte)(a[i+0] & mask);

            b[i+3] = (byte)(a[i+3] & mask);
            b[i+4] = (byte)(a[i+4] & mask);
            b[i+5] = (byte)(a[i+5] & mask);
            b[i+6] = (byte)(a[i+6] & mask);
        }
        return new Object[]{ a, b };
    }
 665 
 666     @Test
 667     @IR(counts = {IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE_4, "> 0",
 668                   IRNode.AND_VB,        IRNode.VECTOR_SIZE_4, "> 0",
 669                   IRNode.STORE_VECTOR, "> 0"},
 670         applyIf = {"MaxVectorSize", ">=8"},
 671         applyIfPlatform = {"64-bit", "true"},
 672         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"})
 673     static Object[] test9(byte[] a, byte[] b, byte mask) {
             // Same access pattern as the stride-8 tests above (offsets 0, 3, 4, 5, 6), but with a
             // constant, non-zero loop start of 13. The single IR rule carries no AlignVector
             // condition, so 4-element byte vectors are expected with the flag on or off.
 674         // known non-zero init value does not affect offset, but has implicit effect on iv
 675         for (int i = 13; i < RANGE-8; i+=8) {
 676             b[i+0] = (byte)(a[i+0] & mask);
 677
 678             b[i+3] = (byte)(a[i+3] & mask);
 679             b[i+4] = (byte)(a[i+4] & mask);
 680             b[i+5] = (byte)(a[i+5] & mask);
 681             b[i+6] = (byte)(a[i+6] & mask);
 682         }
 683         return new Object[]{ a, b };
 684     }
 685 
 686     @Test
 687     @IR(counts = {IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE_4, "> 0",
 688                   IRNode.AND_VB,        IRNode.VECTOR_SIZE_4, "> 0",
 689                   IRNode.STORE_VECTOR, "> 0"},
 690         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"},
 691         applyIfPlatform = {"64-bit", "true"},
 692         applyIfAnd = {"AlignVector", "false", "MaxVectorSize", ">=8"})
 693     @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
 694                   IRNode.AND_VB, "= 0",
 695                   IRNode.STORE_VECTOR, "= 0"},
 696         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"},
 697         applyIfPlatform = {"64-bit", "true"},
 698         applyIf = {"AlignVector", "true"})
 699     static Object[] test10a(byte[] a, byte[] b, byte mask) {
             // Four contiguous bytes (offsets 0..3) are masked per iteration, with stride 8 and a
             // constant loop start of 3.
             // First IR rule (AlignVector=false): 4-element byte vectors are expected.
             // Second IR rule (AlignVector=true): no vector nodes are expected.
 700         // This is not alignable with pre-loop, because of odd init.
 701         for (int i = 3; i < RANGE-8; i+=8) {
 702             b[i+0] = (byte)(a[i+0] & mask);
 703             b[i+1] = (byte)(a[i+1] & mask);
 704             b[i+2] = (byte)(a[i+2] & mask);
 705             b[i+3] = (byte)(a[i+3] & mask);
 706         }
 707         return new Object[]{ a, b };
 708     }
 709 
 710     @Test
 711     @IR(counts = {IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE_4, "> 0",
 712                   IRNode.AND_VB,        IRNode.VECTOR_SIZE_4, "> 0",
 713                   IRNode.STORE_VECTOR, "> 0"},
 714         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"},
 715         applyIfPlatform = {"64-bit", "true"},
 716         applyIfAnd = {"AlignVector", "false", "MaxVectorSize", ">=8"})
 717     @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
 718                   IRNode.AND_VB, "= 0",
 719                   IRNode.STORE_VECTOR, "= 0"},
 720         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"},
 721         applyIfPlatform = {"64-bit", "true"},
 722         applyIf = {"AlignVector", "true"})
 723     static Object[] test10b(byte[] a, byte[] b, byte mask) {
             // Same shape as test10a (four contiguous bytes per iteration, stride 8), but the
             // constant loop start is 13 instead of 3. The IR expectations are identical:
             // 4-element byte vectors with AlignVector=false, no vector nodes with AlignVector=true.
 724         // This is not alignable with pre-loop, because of odd init.
 725         // Seems not correctly handled.
 726         for (int i = 13; i < RANGE-8; i+=8) {
 727             b[i+0] = (byte)(a[i+0] & mask);
 728             b[i+1] = (byte)(a[i+1] & mask);
 729             b[i+2] = (byte)(a[i+2] & mask);
 730             b[i+3] = (byte)(a[i+3] & mask);
 731         }
 732         return new Object[]{ a, b };
 733     }
 734 
 735     @Test
 736     @IR(counts = {IRNode.LOAD_VECTOR_S, IRNode.VECTOR_SIZE_4, "> 0",
 737                   IRNode.AND_VS,        IRNode.VECTOR_SIZE_4, "> 0",
 738                   IRNode.STORE_VECTOR, "> 0"},
 739         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"},
 740         applyIfPlatform = {"64-bit", "true"},
 741         applyIfAnd = {"AlignVector", "false", "MaxVectorSize", ">=16"})
 742     @IR(counts = {IRNode.LOAD_VECTOR_S, "= 0",
 743                   IRNode.AND_VS, "= 0",
 744                   IRNode.STORE_VECTOR, "= 0"},
 745         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"},
 746         applyIfPlatform = {"64-bit", "true"},
 747         applyIf = {"AlignVector", "true"})
 748     static Object[] test10c(short[] a, short[] b, short mask) {
             // Short-array counterpart of test10b: four contiguous shorts (offsets 0..3) are masked
             // per iteration, with stride 8 and a constant loop start of 13.
             // First IR rule (AlignVector=false, MaxVectorSize>=16): 4-element short vectors expected.
             // Second IR rule (AlignVector=true): no vector nodes are expected.
 749         // This is not alignable with pre-loop, because of odd init.
 750         // Seems not correctly handled with MaxVectorSize >= 32.
 751         for (int i = 13; i < RANGE-8; i+=8) {
 752             b[i+0] = (short)(a[i+0] & mask);
 753             b[i+1] = (short)(a[i+1] & mask);
 754             b[i+2] = (short)(a[i+2] & mask);
 755             b[i+3] = (short)(a[i+3] & mask);
 756         }
 757         return new Object[]{ a, b };
 758     }
 759 
 760     @Test
 761     @IR(counts = {IRNode.LOAD_VECTOR_S, IRNode.VECTOR_SIZE_4, "> 0",
 762                   IRNode.AND_VS,        IRNode.VECTOR_SIZE_4, "> 0",
 763                   IRNode.STORE_VECTOR, "> 0"},
 764         applyIfAnd = {"MaxVectorSize", ">=16", "UseCompactObjectHeaders", "false"},
 765         // UNSAFE.ARRAY_SHORT_BASE_OFFSET = 16, but with compact object headers UNSAFE.ARRAY_SHORT_BASE_OFFSET=12.
 766         // If AlignVector=true, we need the offset to be 8-byte aligned, else the vectors are filtered out.
 767         applyIfPlatform = {"64-bit", "true"},
 768         applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true", "rvv", "true"})
 769     static Object[] test10d(short[] a, short[] b, short mask) {
             // Four contiguous shorts at offsets 3..6 from i are masked per iteration, with stride 8
             // and a constant loop start of 13. The IR rule only applies with
             // UseCompactObjectHeaders=false and has no AlignVector condition.
 770         for (int i = 13; i < RANGE-16; i+=8) {
 771             // adr = base + UNSAFE.ARRAY_SHORT_BASE_OFFSET + 2*(3 + 13) + iter*16
                 // The constant part is 2*(3 + 13) = 32 bytes:
                 //   base offset 16: 16 + 32 = 48, a multiple of 8      -> 8-byte aligned
                 //   base offset 12: 12 + 32 = 44, not a multiple of 8  -> excluded via applyIfAnd
 772             b[i+0+3] = (short)(a[i+0+3] & mask);
 773             b[i+1+3] = (short)(a[i+1+3] & mask);
 774             b[i+2+3] = (short)(a[i+2+3] & mask);
 775             b[i+3+3] = (short)(a[i+3+3] & mask);
 776         }
 777         return new Object[]{ a, b };
 778     }
 779 
 780     @Test
 781     @IR(counts = {IRNode.LOAD_VECTOR_S, IRNode.VECTOR_SIZE_4, "> 0",
 782                   IRNode.AND_VS,        IRNode.VECTOR_SIZE_4, "> 0",
 783                   IRNode.STORE_VECTOR, "> 0"},
 784         applyIfAnd = {"MaxVectorSize", ">=16", "UseCompactObjectHeaders", "true"},
 785         // UNSAFE.ARRAY_SHORT_BASE_OFFSET = 16, but with compact object headers UNSAFE.ARRAY_SHORT_BASE_OFFSET=12.
 786         // If AlignVector=true, we need the offset to be 8-byte aligned, else the vectors are filtered out.
 787         applyIfPlatform = {"64-bit", "true"},
 788         applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true", "rvv", "true"})
 789     static Object[] test10e(short[] a, short[] b, short mask) {
             // Companion of test10d for UseCompactObjectHeaders=true: same access pattern (offsets
             // 3..6, stride 8), but the constant loop start is 11 instead of 13.
 790         for (int i = 11; i < RANGE-16; i+=8) {
 791             // adr = base + UNSAFE.ARRAY_SHORT_BASE_OFFSET + 2*(3 + 11) + iter*16
                 // The constant part is 2*(3 + 11) = 28 bytes:
                 //   base offset 12: 12 + 28 = 40, a multiple of 8      -> 8-byte aligned
                 //   base offset 16: 16 + 28 = 44, not a multiple of 8  -> excluded via applyIfAnd
 792             b[i+0+3] = (short)(a[i+0+3] & mask);
 793             b[i+1+3] = (short)(a[i+1+3] & mask);
 794             b[i+2+3] = (short)(a[i+2+3] & mask);
 795             b[i+3+3] = (short)(a[i+3+3] & mask);
 796         }
 797         return new Object[]{ a, b };
 798     }
 799 
 800     @Test
 801     @IR(counts = {IRNode.LOAD_VECTOR_B, "> 0",
 802                   IRNode.AND_VB, "> 0",
 803                   IRNode.STORE_VECTOR, "> 0"},
 804         applyIfPlatform = {"64-bit", "true"},
 805         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"})
 806     static Object[] test11aB(byte[] a, byte[] b, byte mask) {
             // Plain element-wise b[i] = a[i] & mask over [0, RANGE) with unit stride (byte variant).
             // The IR rule has no AlignVector condition: vector nodes are expected either way.
 807         for (int i = 0; i < RANGE; i++) {
 808             // always alignable
 809             b[i+0] = (byte)(a[i+0] & mask);
 810         }
 811         return new Object[]{ a, b };
 812     }
 813 
 814     @Test
 815     @IR(counts = {IRNode.LOAD_VECTOR_S, "> 0",
 816                   IRNode.AND_VS, "> 0",
 817                   IRNode.STORE_VECTOR, "> 0"},
 818         applyIfPlatform = {"64-bit", "true"},
 819         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"})
 820     static Object[] test11aS(short[] a, short[] b, short mask) {
             // Plain element-wise b[i] = a[i] & mask over [0, RANGE) with unit stride (short variant).
             // The IR rule has no AlignVector condition: vector nodes are expected either way.
 821         for (int i = 0; i < RANGE; i++) {
 822             // always alignable
 823             b[i+0] = (short)(a[i+0] & mask);
 824         }
 825         return new Object[]{ a, b };
 826     }
 827 
 828     @Test
 829     @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0",
 830                   IRNode.AND_VI, "> 0",
 831                   IRNode.STORE_VECTOR, "> 0"},
 832         applyIfPlatform = {"64-bit", "true"},
 833         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"})
 834     static Object[] test11aI(int[] a, int[] b, int mask) {
             // Plain element-wise b[i] = a[i] & mask over [0, RANGE) with unit stride (int variant).
             // The IR rule has no AlignVector condition: vector nodes are expected either way.
 835         for (int i = 0; i < RANGE; i++) {
 836             // always alignable
 837             b[i+0] = (int)(a[i+0] & mask);
 838         }
 839         return new Object[]{ a, b };
 840     }
 841 
 842     @Test
 843     @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0",
 844                   IRNode.AND_VL, "> 0",
 845                   IRNode.STORE_VECTOR, "> 0"},
 846         applyIfPlatform = {"64-bit", "true"},
 847         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"})
 848     static Object[] test11aL(long[] a, long[] b, long mask) {
             // Plain element-wise b[i] = a[i] & mask over [0, RANGE) with unit stride (long variant).
             // The IR rule has no AlignVector condition: vector nodes are expected either way.
 849         for (int i = 0; i < RANGE; i++) {
 850             // always alignable
 851             b[i+0] = (long)(a[i+0] & mask);
 852         }
 853         return new Object[]{ a, b };
 854     }
 855 
 856     @Test
 857     @IR(counts = {IRNode.LOAD_VECTOR_B, "> 0",
 858                   IRNode.AND_VB, "> 0",
 859                   IRNode.STORE_VECTOR, "> 0"},
 860         applyIfPlatform = {"64-bit", "true"},
 861         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"})
 862     static Object[] test11bB(byte[] a, byte[] b, byte mask) {
             // Like test11aB, but the loop starts at index 1 instead of 0 (byte variant).
             // Load and store use the same index, and the IR rule expects vector nodes
             // regardless of AlignVector.
 863         for (int i = 1; i < RANGE; i++) {
 864             // always alignable
 865             b[i+0] = (byte)(a[i+0] & mask);
 866         }
 867         return new Object[]{ a, b };
 868     }
 869 
 870     @Test
 871     @IR(counts = {IRNode.LOAD_VECTOR_S, "> 0",
 872                   IRNode.AND_VS, "> 0",
 873                   IRNode.STORE_VECTOR, "> 0"},
 874         applyIfPlatform = {"64-bit", "true"},
 875         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"})
 876     static Object[] test11bS(short[] a, short[] b, short mask) {
             // Like test11aS, but the loop starts at index 1 instead of 0 (short variant).
             // Load and store use the same index, and the IR rule expects vector nodes
             // regardless of AlignVector.
 877         for (int i = 1; i < RANGE; i++) {
 878             // always alignable
 879             b[i+0] = (short)(a[i+0] & mask);
 880         }
 881         return new Object[]{ a, b };
 882     }
 883 
 884     @Test
 885     @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0",
 886                   IRNode.AND_VI, "> 0",
 887                   IRNode.STORE_VECTOR, "> 0"},
 888         applyIfPlatform = {"64-bit", "true"},
 889         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"})
 890     static Object[] test11bI(int[] a, int[] b, int mask) {
             // Like test11aI, but the loop starts at index 1 instead of 0 (int variant).
             // Load and store use the same index, and the IR rule expects vector nodes
             // regardless of AlignVector.
 891         for (int i = 1; i < RANGE; i++) {
 892             // always alignable
 893             b[i+0] = (int)(a[i+0] & mask);
 894         }
 895         return new Object[]{ a, b };
 896     }
 897 
 898     @Test
 899     @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0",
 900                   IRNode.AND_VL, "> 0",
 901                   IRNode.STORE_VECTOR, "> 0"},
 902         applyIfPlatform = {"64-bit", "true"},
 903         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"})
 904     static Object[] test11bL(long[] a, long[] b, long mask) {
             // Like test11aL, but the loop starts at index 1 instead of 0 (long variant).
             // Load and store use the same index, and the IR rule expects vector nodes
             // regardless of AlignVector.
 905         for (int i = 1; i < RANGE; i++) {
 906             // always alignable
 907             b[i+0] = (long)(a[i+0] & mask);
 908         }
 909         return new Object[]{ a, b };
 910     }
 911 
 912     @Test
 913     @IR(counts = {IRNode.LOAD_VECTOR_B, "> 0",
 914                   IRNode.AND_VB, "> 0",
 915                   IRNode.STORE_VECTOR, "> 0"},
 916         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"},
 917         applyIfPlatform = {"64-bit", "true"},
 918         applyIf = {"AlignVector", "false"})
 919     @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
 920                   IRNode.AND_VB, "= 0",
 921                   IRNode.STORE_VECTOR, "= 0"},
 922         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"},
 923         applyIfPlatform = {"64-bit", "true"},
 924         applyIf = {"AlignVector", "true"})
 925     static Object[] test11cB(byte[] a, byte[] b, byte mask) {
             // The load reads a[i+1] while the store writes b[i], so the two accesses are one
             // element (1 byte) apart. The bound RANGE-1 keeps a[i+1] in range.
             // First IR rule (AlignVector=false): vector nodes are expected.
             // Second IR rule (AlignVector=true): no vector nodes are expected.
 926         for (int i = 1; i < RANGE-1; i++) {
 927             // 1 byte offset -> not alignable with AlignVector
 928             b[i+0] = (byte)(a[i+1] & mask);
 929         }
 930         return new Object[]{ a, b };
 931     }
 932 
 933     @Test
 934     @IR(counts = {IRNode.LOAD_VECTOR_S, "> 0",
 935                   IRNode.AND_VS, "> 0",
 936                   IRNode.STORE_VECTOR, "> 0"},
 937         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"},
 938         applyIfPlatform = {"64-bit", "true"},
 939         applyIf = {"AlignVector", "false"})
 940     @IR(counts = {IRNode.LOAD_VECTOR_S, "= 0",
 941                   IRNode.AND_VS, "= 0",
 942                   IRNode.STORE_VECTOR, "= 0"},
 943         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"},
 944         applyIfPlatform = {"64-bit", "true"},
 945         applyIf = {"AlignVector", "true"})
 946     static Object[] test11cS(short[] a, short[] b, short mask) {
             // The load reads a[i+1] while the store writes b[i], so the two accesses are one
             // element (2 bytes) apart. The bound RANGE-1 keeps a[i+1] in range.
             // First IR rule (AlignVector=false): vector nodes are expected.
             // Second IR rule (AlignVector=true): no vector nodes are expected.
 947         for (int i = 1; i < RANGE-1; i++) {
 948             // 2 byte offset -> not alignable with AlignVector
 949             b[i+0] = (short)(a[i+1] & mask);
 950         }
 951         return new Object[]{ a, b };
 952     }
 953 
 954     @Test
 955     @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0",
 956                   IRNode.AND_VI, "> 0",
 957                   IRNode.STORE_VECTOR, "> 0"},
 958         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"},
 959         applyIfPlatform = {"64-bit", "true"},
 960         applyIf = {"AlignVector", "false"})
 961     @IR(counts = {IRNode.LOAD_VECTOR_I, "= 0",
 962                   IRNode.AND_VI, "= 0",
 963                   IRNode.STORE_VECTOR, "= 0"},
 964         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"},
 965         applyIfPlatform = {"64-bit", "true"},
 966         applyIf = {"AlignVector", "true"})
 967     static Object[] test11cI(int[] a, int[] b, int mask) {
             // The load reads a[i+1] while the store writes b[i], so the two accesses are one
             // element (4 bytes) apart. The bound RANGE-1 keeps a[i+1] in range.
             // First IR rule (AlignVector=false): vector nodes are expected.
             // Second IR rule (AlignVector=true): no vector nodes are expected.
 968         for (int i = 1; i < RANGE-1; i++) {
 969             // 4 byte offset -> not alignable with AlignVector
 970             b[i+0] = (int)(a[i+1] & mask);
 971         }
 972         return new Object[]{ a, b };
 973     }
 974 
 975     @Test
 976     @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0",
 977                   IRNode.AND_VL, "> 0",
 978                   IRNode.STORE_VECTOR, "> 0"},
 979         applyIfPlatform = {"64-bit", "true"},
 980         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"})
 981     static Object[] test11cL(long[] a, long[] b, long mask) {
             // Long variant of the test11c* family: the load reads a[i+1], the store writes b[i].
             // With 8-byte elements the one-element distance is itself 8 bytes, so unlike the
             // byte/short/int variants there is a single IR rule with no AlignVector condition.
 982         for (int i = 1; i < RANGE-1; i++) {
 983             // always alignable (8 byte offset)
 984             b[i+0] = (long)(a[i+1] & mask);
 985         }
 986         return new Object[]{ a, b };
 987     }
 988 
 989     @Test
 990     @IR(counts = {IRNode.LOAD_VECTOR_B, "> 0",
 991                   IRNode.AND_VB, "> 0",
 992                   IRNode.STORE_VECTOR, "> 0"},
 993         applyIfPlatform = {"64-bit", "true"},
 994         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"})
 995     static Object[] test11dB(byte[] a, byte[] b, byte mask, int invar) {
             // Element-wise mask with unit stride, where both the load and the store index carry the
             // same loop-invariant runtime offset invar (byte variant). The IR rule has no AlignVector
             // condition: vector nodes are expected either way.
             // NOTE(review): the loop runs to RANGE, so the caller presumably passes an invar that
             // keeps i+invar in bounds - confirm at the call site.
 996         for (int i = 0; i < RANGE; i++) {
 997             b[i+0+invar] = (byte)(a[i+0+invar] & mask);
 998         }
 999         return new Object[]{ a, b };
1000     }
1001 
1002     @Test
1003     @IR(counts = {IRNode.LOAD_VECTOR_S, "> 0",
1004                   IRNode.AND_VS, "> 0",
1005                   IRNode.STORE_VECTOR, "> 0"},
1006         applyIfPlatform = {"64-bit", "true"},
1007         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"})
1008     static Object[] test11dS(short[] a, short[] b, short mask, int invar) {
             // Element-wise mask with unit stride, where both the load and the store index carry the
             // same loop-invariant runtime offset invar (short variant). The IR rule has no AlignVector
             // condition: vector nodes are expected either way.
             // NOTE(review): the loop runs to RANGE, so the caller presumably passes an invar that
             // keeps i+invar in bounds - confirm at the call site.
1009         for (int i = 0; i < RANGE; i++) {
1010             b[i+0+invar] = (short)(a[i+0+invar] & mask);
1011         }
1012         return new Object[]{ a, b };
1013     }
1014 
1015     @Test
1016     @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0",
1017                   IRNode.AND_VI, "> 0",
1018                   IRNode.STORE_VECTOR, "> 0"},
1019         applyIfPlatform = {"64-bit", "true"},
1020         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"})
1021     static Object[] test11dI(int[] a, int[] b, int mask, int invar) {
             // Element-wise mask with unit stride, where both the load and the store index carry the
             // same loop-invariant runtime offset invar (int variant). The IR rule has no AlignVector
             // condition: vector nodes are expected either way.
             // NOTE(review): the loop runs to RANGE, so the caller presumably passes an invar that
             // keeps i+invar in bounds - confirm at the call site.
1022         for (int i = 0; i < RANGE; i++) {
1023             b[i+0+invar] = (int)(a[i+0+invar] & mask);
1024         }
1025         return new Object[]{ a, b };
1026     }
1027 
1028     @Test
1029     @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0",
1030                   IRNode.AND_VL, "> 0",
1031                   IRNode.STORE_VECTOR, "> 0"},
1032         applyIfPlatform = {"64-bit", "true"},
1033         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"})
1034     static Object[] test11dL(long[] a, long[] b, long mask, int invar) {
             // Element-wise mask with unit stride, where both the load and the store index carry the
             // same loop-invariant runtime offset invar (long variant). The IR rule has no AlignVector
             // condition: vector nodes are expected either way.
             // NOTE(review): the loop runs to RANGE, so the caller presumably passes an invar that
             // keeps i+invar in bounds - confirm at the call site.
1035         for (int i = 0; i < RANGE; i++) {
1036             b[i+0+invar] = (long)(a[i+0+invar] & mask);
1037         }
1038         return new Object[]{ a, b };
1039     }
1040 
1041     @Test
1042     @IR(counts = {IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE + "min(max_byte, 4)", "> 0",
1043                   IRNode.AND_VB,        IRNode.VECTOR_SIZE + "min(max_byte, 4)", "> 0",
1044                   IRNode.STORE_VECTOR,                                           "> 0"},
1045         applyIfPlatform = {"64-bit", "true"},
1046         applyIf = {"AlignVector", "false"},
1047         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"})
1048     static Object[] test12(byte[] a, byte[] b, byte mask) {
             // The index is i*6+k for k = 0..3: four contiguous bytes are masked, then two bytes are
             // skipped before the next group. The only IR rule applies with AlignVector=false;
             // nothing is asserted here for AlignVector=true.
1049         for (int i = 0; i < RANGE/16; i++) {
1050             // Non-power-of-2 stride. Vectorization of 4 bytes, then 2-bytes gap.
1051             b[i*6 + 0 ] = (byte)(a[i*6 + 0 ] & mask);
1052             b[i*6 + 1 ] = (byte)(a[i*6 + 1 ] & mask);
1053             b[i*6 + 2 ] = (byte)(a[i*6 + 2 ] & mask);
1054             b[i*6 + 3 ] = (byte)(a[i*6 + 3 ] & mask);
1055         }
1056         return new Object[]{ a, b };
1057     }
1058 
1059     @Test
1060     @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE + "min(max_int, max_long)", "> 0",
1061                   IRNode.LOAD_VECTOR_L, IRNode.VECTOR_SIZE + "min(max_int, max_long)", "> 0",
1062                   IRNode.ADD_VI, IRNode.VECTOR_SIZE + "min(max_int, max_long)", "> 0",
1063                   IRNode.ADD_VL, IRNode.VECTOR_SIZE + "min(max_int, max_long)", "> 0",
1064                   IRNode.STORE_VECTOR, "> 0"},
1065         applyIfPlatform = {"64-bit", "true"},
1066         applyIfCPUFeature = {"avx2", "true"})
1067     // require avx to ensure vectors are larger than what unrolling produces
1068     @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE + "min(max_int, max_long)", "> 0",
1069                   IRNode.LOAD_VECTOR_L, IRNode.VECTOR_SIZE + "min(max_int, max_long)", "> 0",
1070                   IRNode.ADD_VI, IRNode.VECTOR_SIZE + "min(max_int, max_long)", "> 0",
1071                   IRNode.ADD_VL, IRNode.VECTOR_SIZE + "min(max_int, max_long)", "> 0",
1072                   IRNode.STORE_VECTOR, "> 0"},
1073         applyIfPlatform = {"riscv64", "true"},
1074         applyIfCPUFeature = {"rvv", "true"},
1075         applyIf = {"MaxVectorSize", ">=32"})
1076     static Object[] test13aIL(int[] a, long[] b) {
             // Mixed element sizes in one loop: an int[] and a long[] are both incremented in place
             // at the same index, starting from 0. Both IR rules expect int and long vectors with the
             // same element count, min(max_int, max_long); the first rule targets avx2, the second
             // riscv64 with rvv and MaxVectorSize>=32.
1077         for (int i = 0; i < RANGE; i++) {
1078             a[i]++;
1079             b[i]++;
1080         }
1081         return new Object[]{ a, b };
1082     }
1083 
1084     @Test
1085     @IR(counts = {IRNode.LOAD_VECTOR_B, "> 0",
1086                   IRNode.LOAD_VECTOR_I, "> 0",
1087                   IRNode.ADD_VB, "> 0",
1088                   IRNode.ADD_VI, "> 0",
1089                   IRNode.STORE_VECTOR, "> 0"},
1090         applyIfOr = {"UseCompactObjectHeaders", "false", "AlignVector", "false"},
1091         applyIfPlatform = {"64-bit", "true"},
1092         applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true", "rvv", "true"})
1093     static Object[] test13aIB(int[] a, byte[] b) {
             // Mixed element sizes in one loop: an int[] and a byte[] are both incremented in place
             // at the same index, starting from 0. The IR rule is skipped only when both
             // UseCompactObjectHeaders and AlignVector are true; the comments below explain why.
1094         for (int i = 0; i < RANGE; i++) {
1095             // adr = base + UNSAFE.ARRAY_INT_BASE_OFFSET  + 4*iter
1096             //              = 16 (or 12 if UseCompactObjectHeaders=true)
1097             a[i]++;
1098             // adr = base + UNSAFE.ARRAY_BYTE_BASE_OFFSET + 1*iter
1099             //              = 16 (or 12 if UseCompactObjectHeaders=true)
1100             b[i]++;
1101             // For AlignVector, all adr must be 8-byte aligned. Let's see for which iteration this can hold:
1102             // If UseCompactObjectHeaders=false:
1103             //   a: 0, 2,  4,  6,  8, ...
1104             //   b: 0, 8, 16, 24, 32, ...
1105             //   -> Ok, aligns every 8th iteration.
1106             // If UseCompactObjectHeaders=true:
1107             //   a: 1,  3,  5,  7,  9, ...
1108             //   b: 4, 12, 20, 28, 36, ...
1109             //   -> we can never align both vectors!
1110         }
1111         return new Object[]{ a, b };
1112     }
1113 
1114     @Test
1115     @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0",
1116                   IRNode.LOAD_VECTOR_S, "> 0",
1117                   IRNode.ADD_VI, "> 0",
1118                   IRNode.ADD_VS, "> 0",
1119                   IRNode.STORE_VECTOR, "> 0"},
1120         applyIfOr = {"UseCompactObjectHeaders", "false", "AlignVector", "false"},
1121         applyIfPlatform = {"64-bit", "true"},
1122         applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true", "rvv", "true"})
1123     static Object[] test13aIS(int[] a, short[] b) {
             // Mixed element sizes in one loop: an int[] and a short[] are both incremented in place
             // at the same index, starting from 0. The IR rule is skipped only when both
             // UseCompactObjectHeaders and AlignVector are true; the comments below explain why.
1124         for (int i = 0; i < RANGE; i++) {
1125             // adr = base + UNSAFE.ARRAY_INT_BASE_OFFSET + 4*iter
1126             //              = 16 (or 12 if UseCompactObjectHeaders=true)
1127             a[i]++;
1128             // adr = base + UNSAFE.ARRAY_SHORT_BASE_OFFSET + 2*iter
1129             //              = 16 (or 12 if UseCompactObjectHeaders=true)
1130             b[i]++;
1131             // For AlignVector, all adr must be 8-byte aligned. Let's see for which iteration this can hold:
1132             // If UseCompactObjectHeaders=false:
1133             //   a: iter % 2 == 0
1134             //   b: iter % 4 == 0
1135             //   -> Ok, aligns every 4th iteration.
1136             // If UseCompactObjectHeaders=true:
1137             //   a: iter % 2 = 1
1138             //   b: iter % 4 = 2
1139             //   -> we can never align both vectors!
1140         }
1141         return new Object[]{ a, b };
1142     }
1143 
1144     @Test
1145     @IR(counts = {IRNode.LOAD_VECTOR_B, "> 0",
1146                   IRNode.LOAD_VECTOR_S, "> 0",
1147                   IRNode.LOAD_VECTOR_I, "> 0",
1148                   IRNode.LOAD_VECTOR_L, "> 0",
1149                   IRNode.ADD_VB, "> 0",
1150                   IRNode.ADD_VS, "> 0",
1151                   IRNode.ADD_VI, "> 0",
1152                   IRNode.ADD_VL, "> 0",
1153                   IRNode.STORE_VECTOR, "> 0"},
1154         applyIfOr = {"UseCompactObjectHeaders", "false", "AlignVector", "false"},
1155         applyIfPlatform = {"64-bit", "true"},
1156         applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true", "rvv", "true"})
1157     static Object[] test13aBSIL(byte[] a, short[] b, int[] c, long[] d) {
             // Four element sizes in one loop: byte[], short[], int[] and long[] are all incremented
             // in place at the same index, starting from 0. The IR rule is skipped only when both
             // UseCompactObjectHeaders and AlignVector are true; the comments below explain why.
1158         for (int i = 0; i < RANGE; i++) {
1159             // adr = base + UNSAFE.ARRAY_BYTE_BASE_OFFSET + 1*iter
1160             //              = 16 (or 12 if UseCompactObjectHeaders=true)
1161             a[i]++;
1162             // adr = base + UNSAFE.ARRAY_SHORT_BASE_OFFSET + 2*iter
1163             //              = 16 (or 12 if UseCompactObjectHeaders=true)
1164             b[i]++;
1165             // adr = base + UNSAFE.ARRAY_INT_BASE_OFFSET + 4*iter
1166             //              = 16 (or 12 if UseCompactObjectHeaders=true)
1167             c[i]++;
1168             // adr = base + UNSAFE.ARRAY_LONG_BASE_OFFSET + 8*iter
1169             //              = 16 (always)
1170             d[i]++;
1171             // If AlignVector and UseCompactObjectHeaders, and we want all adr 8-byte aligned:
1172             //   a: iter % 8 = 4
1173             //   c: iter % 2 = 1
1174             //   -> can never align both vectors!
1175         }
             // All four arrays are returned, in parameter order.
1176         return new Object[]{ a, b, c, d };
1177     }
1178 
1179     @Test
1180     @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE + "min(max_int, max_long)", "> 0",
1181                   IRNode.LOAD_VECTOR_L, IRNode.VECTOR_SIZE + "min(max_int, max_long)", "> 0",
1182                   IRNode.ADD_VI, IRNode.VECTOR_SIZE + "min(max_int, max_long)", "> 0",
1183                   IRNode.ADD_VL, IRNode.VECTOR_SIZE + "min(max_int, max_long)", "> 0",
1184                   IRNode.STORE_VECTOR, "> 0"},
1185         applyIfPlatform = {"64-bit", "true"},
1186         applyIfCPUFeature = {"avx2", "true"})
1187     // require avx to ensure vectors are larger than what unrolling produces
1188     @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE + "min(max_int, max_long)", "> 0",
1189                   IRNode.LOAD_VECTOR_L, IRNode.VECTOR_SIZE + "min(max_int, max_long)", "> 0",
1190                   IRNode.ADD_VI, IRNode.VECTOR_SIZE + "min(max_int, max_long)", "> 0",
1191                   IRNode.ADD_VL, IRNode.VECTOR_SIZE + "min(max_int, max_long)", "> 0",
1192                   IRNode.STORE_VECTOR, "> 0"},
1193         applyIfPlatform = {"riscv64", "true"},
1194         applyIfCPUFeature = {"rvv", "true"},
1195         applyIf = {"MaxVectorSize", ">=32"})
1196     static Object[] test13bIL(int[] a, long[] b) {
             // Same as test13aIL (int[] and long[] incremented in place at the same index), except
             // that the loop starts at index 1 instead of 0. The IR rules are identical to test13aIL.
1197         for (int i = 1; i < RANGE; i++) {
1198             a[i]++;
1199             b[i]++;
1200         }
1201         return new Object[]{ a, b };
1202     }
1203 
1204     @Test
1205     @IR(counts = {IRNode.LOAD_VECTOR_B, "> 0",
1206                   IRNode.LOAD_VECTOR_I, "> 0",
1207                   IRNode.ADD_VB, "> 0",
1208                   IRNode.ADD_VI, "> 0",
1209                   IRNode.STORE_VECTOR, "> 0"},
1210         applyIfOr = {"UseCompactObjectHeaders", "false", "AlignVector", "false"},
1211         applyIfPlatform = {"64-bit", "true"},
1212         applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true", "rvv", "true"})
1213     static Object[] test13bIB(int[] a, byte[] b) {
             // Same as test13aIB (int[] and byte[] incremented in place at the same index), except
             // that the loop starts at index 1. That adds one element size (4 bytes for a, 1 byte
             // for b) to each address formula below.
1214         for (int i = 1; i < RANGE; i++) {
1215             // adr = base + UNSAFE.ARRAY_INT_BASE_OFFSET + 4 + 4*iter
1216             //              = 16 (or 12 if UseCompactObjectHeaders=true)
1217             a[i]++;
1218             // adr = base + UNSAFE.ARRAY_BYTE_BASE_OFFSET + 1 + 1*iter
1219             //              = 16 (or 12 if UseCompactObjectHeaders=true)
1220             b[i]++;
1221             // If AlignVector and UseCompactObjectHeaders, and we want all adr 8-byte aligned:
1222             //   a: iter % 2 = 0
1223             //   b: iter % 8 = 3
1224             //   -> can never align both vectors!
1225         }
1226         return new Object[]{ a, b };
1227     }
1228 
1229     @Test
1230     @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0",
1231                   IRNode.LOAD_VECTOR_S, "> 0",
1232                   IRNode.ADD_VI, "> 0",
1233                   IRNode.ADD_VS, "> 0",
1234                   IRNode.STORE_VECTOR, "> 0"},
1235         applyIfOr = {"UseCompactObjectHeaders", "false", "AlignVector", "false"},
1236         applyIfPlatform = {"64-bit", "true"},
1237         applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true", "rvv", "true"})
1238     static Object[] test13bIS(int[] a, short[] b) {
             // Same as test13aIS (int[] and short[] incremented in place at the same index), except
             // that the loop starts at index 1. That adds one element size (4 bytes for a, 2 bytes
             // for b) to each address formula below.
1239         for (int i = 1; i < RANGE; i++) {
1240             // adr = base + UNSAFE.ARRAY_INT_BASE_OFFSET + 4 + 4*iter
1241             //              = 16 (or 12 if UseCompactObjectHeaders=true)
1242             a[i]++;
1243             // adr = base + UNSAFE.ARRAY_SHORT_BASE_OFFSET + 2 + 2*iter
1244             //              = 16 (or 12 if UseCompactObjectHeaders=true)
1245             b[i]++;
1246             // If AlignVector and UseCompactObjectHeaders, and we want all adr 8-byte aligned:
1247             //   a: iter % 2 = 0
1248             //   b: iter % 4 = 1
1249             //   -> can never align both vectors!
1250         }
1251         return new Object[]{ a, b };
1252     }
1253 
1254     @Test
1255     @IR(counts = {IRNode.LOAD_VECTOR_B, "> 0",
1256                   IRNode.LOAD_VECTOR_S, "> 0",
1257                   IRNode.LOAD_VECTOR_I, "> 0",
1258                   IRNode.LOAD_VECTOR_L, "> 0",
1259                   IRNode.ADD_VB, "> 0",
1260                   IRNode.ADD_VS, "> 0",
1261                   IRNode.ADD_VI, "> 0",
1262                   IRNode.ADD_VL, "> 0",
1263                   IRNode.STORE_VECTOR, "> 0"},
1264         applyIfOr = {"UseCompactObjectHeaders", "false", "AlignVector", "false"},
1265         applyIfPlatform = {"64-bit", "true"},
1266         applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true", "rvv", "true"})
1267     static Object[] test13bBSIL(byte[] a, short[] b, int[] c, long[] d) {
             // Same as test13aBSIL (byte[], short[], int[] and long[] incremented in place at the
             // same index), except that the loop starts at index 1. That adds one element size to
             // each address formula below.
1268         for (int i = 1; i < RANGE; i++) {
1269             // adr = base + UNSAFE.ARRAY_BYTE_BASE_OFFSET + 1 + 1*iter
1270             //              = 16 (or 12 if UseCompactObjectHeaders=true)
1271             a[i]++;
1272             // adr = base + UNSAFE.ARRAY_SHORT_BASE_OFFSET + 2 + 2*iter
1273             //              = 16 (or 12 if UseCompactObjectHeaders=true)
1274             b[i]++;
1275             // adr = base + UNSAFE.ARRAY_INT_BASE_OFFSET + 4 + 4*iter
1276             //              = 16 (or 12 if UseCompactObjectHeaders=true)
1277             c[i]++;
1278             // adr = base + UNSAFE.ARRAY_LONG_BASE_OFFSET + 8 + 8*iter
1279             //              = 16 (always)
1280             d[i]++;
1281             // If AlignVector and UseCompactObjectHeaders, and we want all adr 8-byte aligned:
1282             //   a: iter % 8 = 3
1283             //   c: iter % 2 = 0
1284             //   -> can never align both vectors!
1285         }
             // All four arrays are returned, in parameter order.
1286         return new Object[]{ a, b, c, d };
1287     }
1288 
1289     @Test
1290     @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
1291                   IRNode.ADD_VB, "= 0",
1292                   IRNode.STORE_VECTOR, "= 0"},
1293         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"},
1294         applyIfPlatform = {"64-bit", "true"},
1295         applyIf = {"AlignVector", "false"})
1296     @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
1297                   IRNode.ADD_VB, "= 0",
1298                   IRNode.STORE_VECTOR, "= 0"},
1299         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"},
1300         applyIfPlatform = {"64-bit", "true"},
1301         applyIf = {"AlignVector", "true"})
1302     static Object[] test14aB(byte[] a) {
             // A single array is incremented in place. Each iteration touches the 16 bytes
             // a[i+0..i+15] while i only advances by 9, so consecutive iterations overlap by 7
             // elements. Both IR rules (AlignVector=false and AlignVector=true) expect zero vector nodes.
1303         // non-power-of-2 stride
1304         for (int i = 0; i < RANGE-20; i+=9) {
1305             // Since the stride is shorter than the vector length, there will be always
1306             // partial overlap of loads with previous stores, this leads to failure in
1307             // store-to-load-forwarding -> vectorization not profitable.
1308             a[i+0]++;
1309             a[i+1]++;
1310             a[i+2]++;
1311             a[i+3]++;
1312             a[i+4]++;
1313             a[i+5]++;
1314             a[i+6]++;
1315             a[i+7]++;
1316             a[i+8]++;
1317             a[i+9]++;
1318             a[i+10]++;
1319             a[i+11]++;
1320             a[i+12]++;
1321             a[i+13]++;
1322             a[i+14]++;
1323             a[i+15]++;
1324         }
1325         return new Object[]{ a };
1326     }
1327 
1328     @Test
         // Same shape as a 16-byte-wide increment loop, but with stride 3: each iteration
         // increments a[i+0..i+15] and the next iteration re-touches 13 of those bytes.
         // Both IR rules below require zero byte-vector load/add/store nodes, for
         // AlignVector=false and AlignVector=true respectively.
1329     @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
1330                   IRNode.ADD_VB, "= 0",
1331                   IRNode.STORE_VECTOR, "= 0"},
1332         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"},
1333         applyIfPlatform = {"64-bit", "true"},
1334         applyIf = {"AlignVector", "false"})
1335     @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
1336                   IRNode.ADD_VB, "= 0",
1337                   IRNode.STORE_VECTOR, "= 0"},
1338         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"},
1339         applyIfPlatform = {"64-bit", "true"},
1340         applyIf = {"AlignVector", "true"})
1341     static Object[] test14bB(byte[] a) {
1342         // non-power-of-2 stride
             // The bound RANGE-20 keeps the highest index touched (i+15) below RANGE.
1343         for (int i = 0; i < RANGE-20; i+=3) {
1344             // Since the stride is shorter than the vector length, there will be always
1345             // partial overlap of loads with previous stores, this leads to failure in
1346             // store-to-load-forwarding -> vectorization not profitable.
1347             a[i+0]++;
1348             a[i+1]++;
1349             a[i+2]++;
1350             a[i+3]++;
1351             a[i+4]++;
1352             a[i+5]++;
1353             a[i+6]++;
1354             a[i+7]++;
1355             a[i+8]++;
1356             a[i+9]++;
1357             a[i+10]++;
1358             a[i+11]++;
1359             a[i+12]++;
1360             a[i+13]++;
1361             a[i+14]++;
1362             a[i+15]++;
1363         }
             // Return the mutated input wrapped in an Object[].
1364         return new Object[]{ a };
1365     }
1366 
1367     @Test
         // Stride-5 variant: each iteration increments a[i+0..i+15] and the following
         // iteration re-touches 11 of those bytes. Both IR rules below require zero
         // byte-vector load/add/store nodes, for AlignVector=false and AlignVector=true
         // respectively.
1368     @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
1369                   IRNode.ADD_VB, "= 0",
1370                   IRNode.STORE_VECTOR, "= 0"},
1371         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"},
1372         applyIfPlatform = {"64-bit", "true"},
1373         applyIf = {"AlignVector", "false"})
1374     @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
1375                   IRNode.ADD_VB, "= 0",
1376                   IRNode.STORE_VECTOR, "= 0"},
1377         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"},
1378         applyIfPlatform = {"64-bit", "true"},
1379         applyIf = {"AlignVector", "true"})
1380     static Object[] test14cB(byte[] a) {
1381         // non-power-of-2 stride
             // The bound RANGE-20 keeps the highest index touched (i+15) below RANGE.
1382         for (int i = 0; i < RANGE-20; i+=5) {
1383             // Since the stride is shorter than the vector length, there will be always
1384             // partial overlap of loads with previous stores, this leads to failure in
1385             // store-to-load-forwarding -> vectorization not profitable.
1386             a[i+0]++;
1387             a[i+1]++;
1388             a[i+2]++;
1389             a[i+3]++;
1390             a[i+4]++;
1391             a[i+5]++;
1392             a[i+6]++;
1393             a[i+7]++;
1394             a[i+8]++;
1395             a[i+9]++;
1396             a[i+10]++;
1397             a[i+11]++;
1398             a[i+12]++;
1399             a[i+13]++;
1400             a[i+14]++;
1401             a[i+15]++;
1402         }
             // Return the mutated input wrapped in an Object[].
1403         return new Object[]{ a };
1404     }
1405 
1406     @Test
         // Stride 9 with only 8 consecutive byte increments per iteration: a[i+0..i+7]
         // is touched, a[i+8] is skipped, so consecutive iterations do not overlap.
         // First IR rule (AlignVector=false): byte vector load/add nodes with the size
         // expression "min(max_byte, 8)" and a vector store must be present.
         // Second IR rule (AlignVector=true): no byte-vector load/add/store at all.
         // NOTE(review): presumably the 9-byte stride cannot be kept aligned, which is
         // why nothing is expected with AlignVector=true - confirm.
1407     @IR(counts = {IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE + "min(max_byte, 8)", "> 0",
1408                   IRNode.ADD_VB,        IRNode.VECTOR_SIZE + "min(max_byte, 8)", "> 0",
1409                   IRNode.STORE_VECTOR,                                           "> 0"},
1410         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"},
1411         applyIfPlatform = {"64-bit", "true"},
1412         applyIf = {"AlignVector", "false"})
1413     @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
1414                   IRNode.ADD_VB, "= 0",
1415                   IRNode.STORE_VECTOR, "= 0"},
1416         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"},
1417         applyIfPlatform = {"64-bit", "true"},
1418         applyIf = {"AlignVector", "true"})
1419     static Object[] test14dB(byte[] a) {
1420         // non-power-of-2 stride
             // The bound RANGE-20 keeps the highest index touched (i+7) below RANGE.
1421         for (int i = 0; i < RANGE-20; i+=9) {
1422             a[i+0]++;
1423             a[i+1]++;
1424             a[i+2]++;
1425             a[i+3]++;
1426             a[i+4]++;
1427             a[i+5]++;
1428             a[i+6]++;
1429             a[i+7]++;
1430         }
             // Return the mutated input wrapped in an Object[].
1431         return new Object[]{ a };
1432     }
1433 
1434     @Test
         // Stride 11 with 8 consecutive byte increments per iteration: a[i+0..i+7] is
         // touched and a[i+8..i+10] is skipped, so consecutive iterations do not overlap.
         // First IR rule (AlignVector=false): byte vector load/add nodes with the size
         // expression "min(max_byte, 8)" and a vector store must be present.
         // Second IR rule (AlignVector=true): no byte-vector load/add/store at all.
1435     @IR(counts = {IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE + "min(max_byte, 8)", "> 0",
1436                   IRNode.ADD_VB,        IRNode.VECTOR_SIZE + "min(max_byte, 8)", "> 0",
1437                   IRNode.STORE_VECTOR,                                           "> 0"},
1438         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"},
1439         applyIfPlatform = {"64-bit", "true"},
1440         applyIf = {"AlignVector", "false"})
1441     @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
1442                   IRNode.ADD_VB, "= 0",
1443                   IRNode.STORE_VECTOR, "= 0"},
1444         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"},
1445         applyIfPlatform = {"64-bit", "true"},
1446         applyIf = {"AlignVector", "true"})
1447     static Object[] test14eB(byte[] a) {
1448         // non-power-of-2 stride
             // The bound RANGE-32 keeps the highest index touched (i+7) below RANGE.
1449         for (int i = 0; i < RANGE-32; i+=11) {
1450             a[i+0]++;
1451             a[i+1]++;
1452             a[i+2]++;
1453             a[i+3]++;
1454             a[i+4]++;
1455             a[i+5]++;
1456             a[i+6]++;
1457             a[i+7]++;
1458         }
             // Return the mutated input wrapped in an Object[].
1459         return new Object[]{ a };
1460     }
1461 
1462     @Test
         // Stride 12 with 8 consecutive byte increments per iteration: a[i+0..i+7] is
         // touched and a[i+8..i+11] is skipped, so consecutive iterations do not overlap.
         // First IR rule (AlignVector=false): byte vector load/add nodes with the size
         // expression "min(max_byte, 8)" and a vector store must be present.
         // Second IR rule (AlignVector=true): no byte-vector load/add/store at all.
1463     @IR(counts = {IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE + "min(max_byte, 8)", "> 0",
1464                   IRNode.ADD_VB,        IRNode.VECTOR_SIZE + "min(max_byte, 8)", "> 0",
1465                   IRNode.STORE_VECTOR,                                           "> 0"},
1466         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"},
1467         applyIfPlatform = {"64-bit", "true"},
1468         applyIf = {"AlignVector", "false"})
1469     @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
1470                   IRNode.ADD_VB, "= 0",
1471                   IRNode.STORE_VECTOR, "= 0"},
1472         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"},
1473         applyIfPlatform = {"64-bit", "true"},
1474         applyIf = {"AlignVector", "true"})
1475     static Object[] test14fB(byte[] a) {
1476         // non-power-of-2 stride
             // The bound RANGE-40 keeps the highest index touched (i+7) below RANGE.
1477         for (int i = 0; i < RANGE-40; i+=12) {
1478             a[i+0]++;
1479             a[i+1]++;
1480             a[i+2]++;
1481             a[i+3]++;
1482             a[i+4]++;
1483             a[i+5]++;
1484             a[i+6]++;
1485             a[i+7]++;
1486         }
             // Return the mutated input wrapped in an Object[].
1487         return new Object[]{ a };
1488     }
1489 
1490     @Test
1491     // IR rules difficult because of modulo wrapping with offset after peeling.
         // Consequently this method carries no @IR annotation, only @Test.
         // Each iteration increments the 16 consecutive bytes a[53*i+0..53*i+15]; the
         // index multiplies the loop variable by 53, which is not a power of 2.
1492     static Object[] test15aB(byte[] a) {
1493         // non-power-of-2 scale
             // With i < RANGE/64-20 and a scale of 53 (< 64), 53*i+15 stays below RANGE.
1494         for (int i = 0; i < RANGE/64-20; i++) {
1495             a[53*i+0]++;
1496             a[53*i+1]++;
1497             a[53*i+2]++;
1498             a[53*i+3]++;
1499             a[53*i+4]++;
1500             a[53*i+5]++;
1501             a[53*i+6]++;
1502             a[53*i+7]++;
1503             a[53*i+8]++;
1504             a[53*i+9]++;
1505             a[53*i+10]++;
1506             a[53*i+11]++;
1507             a[53*i+12]++;
1508             a[53*i+13]++;
1509             a[53*i+14]++;
1510             a[53*i+15]++;
1511         }
             // Return the mutated input wrapped in an Object[].
1512         return new Object[]{ a };
1513     }
1514 
1515     @Test
1516     // IR rules difficult because of modulo wrapping with offset after peeling.
         // Consequently this method carries no @IR annotation, only @Test.
         // Each iteration increments the 16 consecutive bytes a[25*i+0..25*i+15]; the
         // index multiplies the loop variable by 25, which is not a power of 2.
1517     static Object[] test15bB(byte[] a) {
1518         // non-power-of-2 scale
             // With i < RANGE/64-20 and a scale of 25 (< 64), 25*i+15 stays below RANGE.
1519         for (int i = 0; i < RANGE/64-20; i++) {
1520             a[25*i+0]++;
1521             a[25*i+1]++;
1522             a[25*i+2]++;
1523             a[25*i+3]++;
1524             a[25*i+4]++;
1525             a[25*i+5]++;
1526             a[25*i+6]++;
1527             a[25*i+7]++;
1528             a[25*i+8]++;
1529             a[25*i+9]++;
1530             a[25*i+10]++;
1531             a[25*i+11]++;
1532             a[25*i+12]++;
1533             a[25*i+13]++;
1534             a[25*i+14]++;
1535             a[25*i+15]++;
1536         }
             // Return the mutated input wrapped in an Object[].
1537         return new Object[]{ a };
1538     }
1539 
1540     @Test
1541     // IR rules difficult because of modulo wrapping with offset after peeling.
         // Consequently this method carries no @IR annotation, only @Test.
         // Each iteration increments the 16 consecutive bytes a[19*i+0..19*i+15]; the
         // index multiplies the loop variable by 19, which is not a power of 2.
1542     static Object[] test15cB(byte[] a) {
1543         // non-power-of-2 scale
             // With i < RANGE/64-20 and a scale of 19 (< 64), 19*i+15 stays below RANGE.
1544         for (int i = 0; i < RANGE/64-20; i++) {
1545             a[19*i+0]++;
1546             a[19*i+1]++;
1547             a[19*i+2]++;
1548             a[19*i+3]++;
1549             a[19*i+4]++;
1550             a[19*i+5]++;
1551             a[19*i+6]++;
1552             a[19*i+7]++;
1553             a[19*i+8]++;
1554             a[19*i+9]++;
1555             a[19*i+10]++;
1556             a[19*i+11]++;
1557             a[19*i+12]++;
1558             a[19*i+13]++;
1559             a[19*i+14]++;
1560             a[19*i+15]++;
1561         }
             // Return the mutated input wrapped in an Object[].
1562         return new Object[]{ a };
1563     }
1564 
1565     @Test
         // No @IR rule is attached to this method.
         // Per iteration it increments 15 consecutive bytes a[2*i+0..2*i+14] and 4
         // consecutive shorts b[2*i+0..2*i+3]. Because the index base only advances by 2
         // per iteration, consecutive iterations touch overlapping elements of both arrays.
1566     static Object[] test16a(byte[] a, short[] b) {
1567         // infinite loop issues
             // NOTE(review): presumably a regression shape for a compiler hang - confirm.
             // The bound RANGE/2-20 keeps the highest index touched (2*i+14) below RANGE.
1568         for (int i = 0; i < RANGE/2-20; i++) {
1569             a[2*i+0]++;
1570             a[2*i+1]++;
1571             a[2*i+2]++;
1572             a[2*i+3]++;
1573             a[2*i+4]++;
1574             a[2*i+5]++;
1575             a[2*i+6]++;
1576             a[2*i+7]++;
1577             a[2*i+8]++;
1578             a[2*i+9]++;
1579             a[2*i+10]++;
1580             a[2*i+11]++;
1581             a[2*i+12]++;
1582             a[2*i+13]++;
1583             a[2*i+14]++;
1584 
1585             b[2*i+0]++;
1586             b[2*i+1]++;
1587             b[2*i+2]++;
1588             b[2*i+3]++;
1589         }
             // Return both mutated inputs wrapped in an Object[].
1590         return new Object[]{ a, b };
1591     }
1592 
1593     @Test
         // No @IR rule is attached to this method.
         // Per iteration it increments 15 consecutive bytes a[2*i+0..2*i+14]. Because the
         // index base only advances by 2 per iteration, consecutive iterations touch
         // overlapping elements.
1594     static Object[] test16b(byte[] a) {
1595         // infinite loop issues
             // NOTE(review): presumably a regression shape for a compiler hang - confirm.
             // The bound RANGE/2-20 keeps the highest index touched (2*i+14) below RANGE.
1596         for (int i = 0; i < RANGE/2-20; i++) {
1597             a[2*i+0]++;
1598             a[2*i+1]++;
1599             a[2*i+2]++;
1600             a[2*i+3]++;
1601             a[2*i+4]++;
1602             a[2*i+5]++;
1603             a[2*i+6]++;
1604             a[2*i+7]++;
1605             a[2*i+8]++;
1606             a[2*i+9]++;
1607             a[2*i+10]++;
1608             a[2*i+11]++;
1609             a[2*i+12]++;
1610             a[2*i+13]++;
1611             a[2*i+14]++;
1612         }
             // Return the mutated input wrapped in an Object[].
1613         return new Object[]{ a };
1614     }
1615 
1616     @Test
         // Increments every long of the array through UNSAFE unaligned get/put calls
         // instead of plain array accesses. The IR rule below requires long vector
         // load/add nodes and a vector store to be present; it carries no AlignVector
         // condition.
1617     @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0",
1618                   IRNode.ADD_VL, "> 0",
1619                   IRNode.STORE_VECTOR, "> 0"},
1620         applyIfPlatform = {"64-bit", "true"},
1621         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"})
1622     static Object[] test17a(long[] a) {
1623         // Unsafe: vectorizes with profiling (not xcomp)
1624         for (int i = 0; i < RANGE; i++) {
                 // Byte offset of the access: long-array base offset plus 8 bytes per index,
                 // computed in long arithmetic (8L).
1625             long adr = UNSAFE.ARRAY_LONG_BASE_OFFSET + 8L * i;
1626             long v = UNSAFE.getLongUnaligned(a, adr);
1627             UNSAFE.putLongUnaligned(a, adr, v + 1);
1628         }
             // Return the mutated input wrapped in an Object[].
1629         return new Object[]{ a };
1630     }
1631 
1632     @Test
1633     // Difficult to write good IR rule. Modulo calculus overflow can create non-power-of-2 packs.
         // Consequently this method carries no @IR annotation, only @Test.
         // Like a per-element UNSAFE increment loop, but every access is shifted by one
         // extra byte, so each 8-byte read/write starts one byte past an 8-byte multiple
         // from the array base offset.
1634     static Object[] test17b(long[] a) {
1635         // Not alignable
             // Loop stops at RANGE-1: the last access starts at 8*(RANGE-2)+1 bytes past the
             // base offset and therefore ends before 8*RANGE bytes past it.
1636         for (int i = 0; i < RANGE-1; i++) {
1637             long adr = UNSAFE.ARRAY_LONG_BASE_OFFSET + 8L * i + 1;
1638             long v = UNSAFE.getLongUnaligned(a, adr);
1639             UNSAFE.putLongUnaligned(a, adr, v + 1);
1640         }
             // Return the mutated input wrapped in an Object[].
1641         return new Object[]{ a };
1642     }
1643 
1644     @Test
         // UNSAFE variant with stride 4: per iteration it increments the two longs at
         // byte offsets adr+0 and adr+8 (indices i and i+1) and leaves i+2 and i+3
         // untouched. The IR rule below requires 2-element long vector load/add nodes and
         // a vector store, and only applies when MaxVectorSize >= 32.
1645     @IR(counts = {IRNode.LOAD_VECTOR_L, IRNode.VECTOR_SIZE_2, "> 0",
1646                   IRNode.ADD_VL,        IRNode.VECTOR_SIZE_2, "> 0",
1647                   IRNode.STORE_VECTOR, "> 0"},
1648         applyIf = {"MaxVectorSize", ">=32"},
1649         applyIfPlatform = {"64-bit", "true"},
1650         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"})
1651     static Object[] test17c(long[] a) {
1652         // Unsafe: aligned vectorizes
1653         for (int i = 0; i < RANGE-1; i+=4) {
                 // Byte offset of element i: long-array base offset plus 8 bytes per index.
1654             long adr = UNSAFE.ARRAY_LONG_BASE_OFFSET + 8L * i;
                 // Both loads happen before both stores.
1655             long v0 = UNSAFE.getLongUnaligned(a, adr + 0);
1656             long v1 = UNSAFE.getLongUnaligned(a, adr + 8);
1657             UNSAFE.putLongUnaligned(a, adr + 0, v0 + 1);
1658             UNSAFE.putLongUnaligned(a, adr + 8, v1 + 1);
1659         }
             // Return the mutated input wrapped in an Object[].
1660         return new Object[]{ a };
1661     }
1662 
1663     @Test
         // UNSAFE variant with stride 4 where both 8-byte accesses per iteration are
         // shifted by one extra byte (adr ends in "+ 1").
         // First IR rule: with AlignVector=false and MaxVectorSize >= 64 (on avx512,
         // asimd or rvv), 2-element long vector load/add nodes and a vector store must
         // be present. Second IR rule: with AlignVector=true, none may be present.
1664     @IR(counts = {IRNode.LOAD_VECTOR_L, IRNode.VECTOR_SIZE_2, "> 0",
1665                   IRNode.ADD_VL,        IRNode.VECTOR_SIZE_2, "> 0",
1666                   IRNode.STORE_VECTOR, "> 0"},
1667         applyIfCPUFeatureOr = {"avx512", "true", "asimd", "true", "rvv", "true"},
1668         applyIfPlatform = {"64-bit", "true"},
1669         applyIfAnd = {"AlignVector", "false", "MaxVectorSize", ">=64"})
1670     // Ensure vector width is large enough to fit 64 byte for longs:
1671     // The offsets are: 25, 33, 57, 65
1672     // In modulo 32:    25,  1, 25,  1  -> does not vectorize
1673     // In modulo 64:    25, 33, 57,  1  -> at least first pair vectorizes
1674     // This problem is because we compute modulo vector width in memory_alignment.
         // NOTE(review): the listed offsets presumably assume a specific array base
         // offset and loop position - confirm against the current header layout.
1675     @IR(counts = {IRNode.LOAD_VECTOR_L, "= 0",
1676                   IRNode.ADD_VL, "= 0",
1677                   IRNode.STORE_VECTOR, "= 0"},
1678         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"},
1679         applyIfPlatform = {"64-bit", "true"},
1680         applyIf = {"AlignVector", "true"})
1681     static Object[] test17d(long[] a) {
1682         // Not alignable
1683         for (int i = 0; i < RANGE-1; i+=4) {
                 // Byte offset: long-array base offset plus 8 bytes per index, plus 1 byte.
1684             long adr = UNSAFE.ARRAY_LONG_BASE_OFFSET + 8L * i + 1;
                 // Both loads happen before both stores.
1685             long v0 = UNSAFE.getLongUnaligned(a, adr + 0);
1686             long v1 = UNSAFE.getLongUnaligned(a, adr + 8);
1687             UNSAFE.putLongUnaligned(a, adr + 0, v0 + 1);
1688             UNSAFE.putLongUnaligned(a, adr + 8, v1 + 1);
1689         }
             // Return the mutated input wrapped in an Object[].
1690         return new Object[]{ a };
1691     }
1692 
1693     @Test
         // No @IR rule is attached to this method.
         // The two byte stores use the constant indices 0 and 1 and do not depend on the
         // loop variable; only the int store b[i] follows i. The stores are interleaved
         // as byte, int, byte within each iteration.
1694     static Object[] test18a(byte[] a, int[] b) {
1695         // scale = 0  -->  no iv
1696         for (int i = 0; i < RANGE; i++) {
1697             a[0] = 1;
1698             b[i] = 2;
1699             a[1] = 1;
1700         }
             // Return both mutated inputs wrapped in an Object[].
1701         return new Object[]{ a, b };
1702     }
1703 
1704     @Test
         // No @IR rule is attached to this method.
         // The two byte stores use the constant indices 1 and 2 and do not depend on the
         // loop variable; only the int store b[i] follows i. The stores are interleaved
         // as byte, int, byte within each iteration.
1705     static Object[] test18b(byte[] a, int[] b) {
1706         // scale = 0  -->  no iv
1707         for (int i = 0; i < RANGE; i++) {
1708             a[1] = 1;
1709             b[i] = 2;
1710             a[2] = 1;
1711         }
             // Return both mutated inputs wrapped in an Object[].
1712         return new Object[]{ a, b };
1713     }
1714 
1715     @Test
         // No @IR rule is attached to this method.
         // Copies 5000 ints from b to a using a down-counting loop variable.
1716     static Object[] test19(int[] a, int[] b) {
             // i counts down from 5000 to 1, so the index RANGE_FINAL - i walks upward
             // from RANGE_FINAL-5000 to RANGE_FINAL-1.
1717         for (int i = 5000; i > 0; i--) {
1718             a[RANGE_FINAL - i] = b[RANGE_FINAL - i];
1719         }
             // Return both arrays wrapped in an Object[].
1720         return new Object[]{ a, b };
1721     }
1722 
1723     @Test
         // No @IR rule is attached to this method.
         // Per iteration it increments the 4 consecutive bytes a[2*i+30..2*i+33]; the
         // index base advances by 2 per iteration, so consecutive iterations overlap in
         // two elements.
1724     static Object[] test20(byte[] a) {
1725         // Example where it is easy to pass alignment check,
1726         // but used to fail the alignment calculation
             // The loop starts at i = 1; the bound RANGE/2-50 keeps 2*i+33 below RANGE.
1727         for (int i = 1; i < RANGE/2-50; i++) {
1728             a[2*i+0+30]++;
1729             a[2*i+1+30]++;
1730             a[2*i+2+30]++;
1731             a[2*i+3+30]++;
1732         }
             // Return the mutated input wrapped in an Object[].
1733         return new Object[]{ a };
1734     }
1735 }