1 /*
   2  * Copyright (c) 2024, 2025, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  */
  23 
  24 package compiler.loopopts.superword;
  25 
  26 import compiler.lib.ir_framework.*;
  27 import jdk.test.lib.Utils;
  28 import jdk.test.whitebox.WhiteBox;
  29 import jdk.internal.misc.Unsafe;
  30 import java.lang.reflect.Array;
  31 import java.util.Map;
  32 import java.util.HashMap;
  33 import java.util.Random;
  34 import java.nio.ByteOrder;
  35 
  36 /*
  37  * @test id=NoAlignVector
  38  * @bug 8310190
  39  * @key randomness
  40  * @summary Test AlignVector with various loop init, stride, scale, invar, etc.
  41  * @modules java.base/jdk.internal.misc
  42  * @library /test/lib /
  43  * @run driver compiler.loopopts.superword.TestAlignVector NoAlignVector
  44  */
  45 
  46 /*
  47  * @test id=AlignVector
  48  * @bug 8310190
  49  * @summary Test AlignVector with various loop init, stride, scale, invar, etc.
  50  * @modules java.base/jdk.internal.misc
  51  * @library /test/lib /
  52  * @run driver compiler.loopopts.superword.TestAlignVector AlignVector
  53  */
  54 
  55 /*
  56  * @test id=VerifyAlignVector
  57  * @bug 8310190
  58  * @summary Test AlignVector with various loop init, stride, scale, invar, etc.
  59  * @modules java.base/jdk.internal.misc
  60  * @library /test/lib /
  61  * @run driver compiler.loopopts.superword.TestAlignVector VerifyAlignVector
  62  */
  63 
  64 /*
  65  * @test id=NoAlignVector-COH
  66  * @bug 8310190
  67  * @summary Test AlignVector with various loop init, stride, scale, invar, etc.
  68  * @modules java.base/jdk.internal.misc
  69  * @library /test/lib /
  70  * @run driver compiler.loopopts.superword.TestAlignVector NoAlignVector-COH
  71  */
  72 
  73 /*
  74  * @test id=VerifyAlignVector-COH
  75  * @bug 8310190
  76  * @summary Test AlignVector with various loop init, stride, scale, invar, etc.
  77  * @modules java.base/jdk.internal.misc
  78  * @library /test/lib /
  79  * @run driver compiler.loopopts.superword.TestAlignVector VerifyAlignVector-COH
  80  */
  81 
  82 public class TestAlignVector {
   83     static int RANGE = 1024*8; // length of every generated input array; also the loop bound used by the tests below
   84     static int RANGE_FINAL = 1024*8; // NOTE(review): not referenced in this part of the file; presumably a second loop bound -- confirm
   85     private static final Unsafe UNSAFE = Unsafe.getUnsafe(); // NOTE(review): not referenced in this part of the file
   86     private static final Random RANDOM = Utils.getRandomInstance(); // random source used by the generate* methods
   87 
   88     // Inputs: the a*/b* arrays are filled once in the constructor and cloned for every run;
          // the m* values are the masks handed to the test methods.
   89     byte[] aB;
   90     byte[] bB;
   91     byte mB = (byte)31;
   92     short[] aS;
   93     short[] bS;
   94     short mS = (short)0xF0F0;
   95     int[] aI;
   96     int[] bI;
   97     int mI = 0xF0F0F0F0;
   98     long[] aL;
   99     long[] bL;
  100     long mL = 0xF0F0F0F0F0F0F0F0L;
  101 
  102     // Test scenarios by name; each entry invokes one test method on cloned inputs
  103     Map<String,TestFunction> tests = new HashMap<String,TestFunction>();
  104 
  105     // Gold results by scenario name: what each scenario returned on its first run
          // (in the constructor), i.e. before compilation
  106     Map<String,Object[]> golds = new HashMap<String,Object[]>();
 107 
 108     interface TestFunction {
 109         Object[] run();
 110     }
 111 
 112     public static void main(String[] args) {
 113         TestFramework framework = new TestFramework(TestAlignVector.class);
 114         framework.addFlags("--add-modules", "java.base", "--add-exports", "java.base/jdk.internal.misc=ALL-UNNAMED",
 115                            "-XX:+IgnoreUnrecognizedVMOptions", "-XX:LoopUnrollLimit=250");
 116 
 117         switch (args[0]) {
 118             case "NoAlignVector"         -> { framework.addFlags("-XX:-UseCompactObjectHeaders", "-XX:-AlignVector"); }
 119             case "AlignVector"           -> { framework.addFlags("-XX:-UseCompactObjectHeaders", "-XX:+AlignVector"); }
 120             case "VerifyAlignVector"     -> { framework.addFlags("-XX:-UseCompactObjectHeaders", "-XX:+AlignVector", "-XX:+IgnoreUnrecognizedVMOptions", "-XX:+VerifyAlignVector"); }
 121             case "NoAlignVector-COH"     -> { framework.addFlags("-XX:+UseCompactObjectHeaders", "-XX:-AlignVector"); }
 122             case "VerifyAlignVector-COH" -> { framework.addFlags("-XX:+UseCompactObjectHeaders", "-XX:+AlignVector", "-XX:+IgnoreUnrecognizedVMOptions", "-XX:+VerifyAlignVector"); }
 123             default -> { throw new RuntimeException("Test argument not recognized: " + args[0]); }
 124         }
 125         framework.start();
 126     }
 127 
 128     public TestAlignVector() {
 129         // Generate input once
 130         aB = generateB();
 131         bB = generateB();
 132         aS = generateS();
 133         bS = generateS();
 134         aI = generateI();
 135         bI = generateI();
 136         aL = generateL();
 137         bL = generateL();
 138 
 139         // Add all tests to list
 140         tests.put("test0",       () -> { return test0(aB.clone(), bB.clone(), mB); });
 141         tests.put("test1a",      () -> { return test1a(aB.clone(), bB.clone(), mB); });
 142         tests.put("test1b",      () -> { return test1b(aB.clone(), bB.clone(), mB); });
 143         tests.put("test2",       () -> { return test2(aB.clone(), bB.clone(), mB); });
 144         tests.put("test3",       () -> { return test3(aB.clone(), bB.clone(), mB); });
 145         tests.put("test4",       () -> { return test4(aB.clone(), bB.clone(), mB); });
 146         tests.put("test5",       () -> { return test5(aB.clone(), bB.clone(), mB, 0); });
 147         tests.put("test6",       () -> { return test6(aB.clone(), bB.clone(), mB); });
 148         tests.put("test7",       () -> { return test7(aS.clone(), bS.clone(), mS); });
 149         tests.put("test8",       () -> { return test8(aB.clone(), bB.clone(), mB, 0); });
 150         tests.put("test8",       () -> { return test8(aB.clone(), bB.clone(), mB, 1); });
 151         tests.put("test9",       () -> { return test9(aB.clone(), bB.clone(), mB); });
 152 
 153         tests.put("test10a",     () -> { return test10a(aB.clone(), bB.clone(), mB); });
 154         tests.put("test10b",     () -> { return test10b(aB.clone(), bB.clone(), mB); });
 155         tests.put("test10c",     () -> { return test10c(aS.clone(), bS.clone(), mS); });
 156         tests.put("test10d",     () -> { return test10d(aS.clone(), bS.clone(), mS); });
 157         tests.put("test10e",     () -> { return test10e(aS.clone(), bS.clone(), mS); });
 158 
 159         tests.put("test11aB",    () -> { return test11aB(aB.clone(), bB.clone(), mB); });
 160         tests.put("test11aS",    () -> { return test11aS(aS.clone(), bS.clone(), mS); });
 161         tests.put("test11aI",    () -> { return test11aI(aI.clone(), bI.clone(), mI); });
 162         tests.put("test11aL",    () -> { return test11aL(aL.clone(), bL.clone(), mL); });
 163 
 164         tests.put("test11bB",    () -> { return test11bB(aB.clone(), bB.clone(), mB); });
 165         tests.put("test11bS",    () -> { return test11bS(aS.clone(), bS.clone(), mS); });
 166         tests.put("test11bI",    () -> { return test11bI(aI.clone(), bI.clone(), mI); });
 167         tests.put("test11bL",    () -> { return test11bL(aL.clone(), bL.clone(), mL); });
 168 
 169         tests.put("test11cB",    () -> { return test11cB(aB.clone(), bB.clone(), mB); });
 170         tests.put("test11cS",    () -> { return test11cS(aS.clone(), bS.clone(), mS); });
 171         tests.put("test11cI",    () -> { return test11cI(aI.clone(), bI.clone(), mI); });
 172         tests.put("test11cL",    () -> { return test11cL(aL.clone(), bL.clone(), mL); });
 173 
 174         tests.put("test11dB",    () -> { return test11dB(aB.clone(), bB.clone(), mB, 0); });
 175         tests.put("test11dS",    () -> { return test11dS(aS.clone(), bS.clone(), mS, 0); });
 176         tests.put("test11dI",    () -> { return test11dI(aI.clone(), bI.clone(), mI, 0); });
 177         tests.put("test11dL",    () -> { return test11dL(aL.clone(), bL.clone(), mL, 0); });
 178 
 179         tests.put("test12",      () -> { return test12(aB.clone(), bB.clone(), mB); });
 180 
 181         tests.put("test13aIL",   () -> { return test13aIL(aI.clone(), aL.clone()); });
 182         tests.put("test13aIB",   () -> { return test13aIB(aI.clone(), aB.clone()); });
 183         tests.put("test13aIS",   () -> { return test13aIS(aI.clone(), aS.clone()); });
 184         tests.put("test13aBSIL", () -> { return test13aBSIL(aB.clone(), aS.clone(), aI.clone(), aL.clone()); });
 185 
 186         tests.put("test13bIL",   () -> { return test13bIL(aI.clone(), aL.clone()); });
 187         tests.put("test13bIB",   () -> { return test13bIB(aI.clone(), aB.clone()); });
 188         tests.put("test13bIS",   () -> { return test13bIS(aI.clone(), aS.clone()); });
 189         tests.put("test13bBSIL", () -> { return test13bBSIL(aB.clone(), aS.clone(), aI.clone(), aL.clone()); });
 190 
 191         tests.put("test14aB",    () -> { return test14aB(aB.clone()); });
 192         tests.put("test14bB",    () -> { return test14bB(aB.clone()); });
 193         tests.put("test14cB",    () -> { return test14cB(aB.clone()); });
 194         tests.put("test14dB",    () -> { return test14dB(aB.clone()); });
 195         tests.put("test14eB",    () -> { return test14eB(aB.clone()); });
 196         tests.put("test14fB",    () -> { return test14fB(aB.clone()); });
 197 
 198         tests.put("test15aB",    () -> { return test15aB(aB.clone()); });
 199         tests.put("test15bB",    () -> { return test15bB(aB.clone()); });
 200         tests.put("test15cB",    () -> { return test15cB(aB.clone()); });
 201 
 202         tests.put("test16a",     () -> { return test16a(aB.clone(), aS.clone()); });
 203         tests.put("test16b",     () -> { return test16b(aB.clone()); });
 204 
 205         tests.put("test17a",     () -> { return test17a(aL.clone()); });
 206         tests.put("test17b",     () -> { return test17b(aL.clone()); });
 207         tests.put("test17c",     () -> { return test17c(aL.clone()); });
 208         tests.put("test17d",     () -> { return test17d(aL.clone()); });
 209 
 210         tests.put("test18a",     () -> { return test18a(aB.clone(), aI.clone()); });
 211         tests.put("test18b",     () -> { return test18b(aB.clone(), aI.clone()); });
 212 
 213         tests.put("test19",      () -> { return test19(aI.clone(), bI.clone()); });
 214         tests.put("test20",      () -> { return test20(aB.clone()); });
 215 
 216         // Compute gold value for all test methods before compilation
 217         for (Map.Entry<String,TestFunction> entry : tests.entrySet()) {
 218             String name = entry.getKey();
 219             TestFunction test = entry.getValue();
 220             Object[] gold = test.run();
 221             golds.put(name, gold);
 222         }
 223     }
 224 
 225     @Warmup(100)
 226     @Run(test = {"test0",
 227                  "test1a",
 228                  "test1b",
 229                  "test2",
 230                  "test3",
 231                  "test4",
 232                  "test5",
 233                  "test6",
 234                  "test7",
 235                  "test8",
 236                  "test9",
 237                  "test10a",
 238                  "test10b",
 239                  "test10c",
 240                  "test10d",
 241                  "test10e",
 242                  "test11aB",
 243                  "test11aS",
 244                  "test11aI",
 245                  "test11aL",
 246                  "test11bB",
 247                  "test11bS",
 248                  "test11bI",
 249                  "test11bL",
 250                  "test11cB",
 251                  "test11cS",
 252                  "test11cI",
 253                  "test11cL",
 254                  "test11dB",
 255                  "test11dS",
 256                  "test11dI",
 257                  "test11dL",
 258                  "test12",
 259                  "test13aIL",
 260                  "test13aIB",
 261                  "test13aIS",
 262                  "test13aBSIL",
 263                  "test13bIL",
 264                  "test13bIB",
 265                  "test13bIS",
 266                  "test13bBSIL",
 267                  "test14aB",
 268                  "test14bB",
 269                  "test14cB",
 270                  "test14dB",
 271                  "test14eB",
 272                  "test14fB",
 273                  "test15aB",
 274                  "test15bB",
 275                  "test15cB",
 276                  "test16a",
 277                  "test16b",
 278                  "test17a",
 279                  "test17b",
 280                  "test17c",
 281                  "test17d",
 282                  "test18a",
 283                  "test18b",
 284                  "test19",
 285                  "test20"})
 286     public void runTests() {
 287         for (Map.Entry<String,TestFunction> entry : tests.entrySet()) {
 288             String name = entry.getKey();
 289             TestFunction test = entry.getValue();
 290             // Recall gold value from before compilation
 291             Object[] gold = golds.get(name);
 292             // Compute new result
 293             Object[] result = test.run();
 294             // Compare gold and new result
 295             verify(name, gold, result);
 296         }
 297     }
 298 
 299     static byte[] generateB() {
 300         byte[] a = new byte[RANGE];
 301         for (int i = 0; i < a.length; i++) {
 302             a[i] = (byte)RANDOM.nextInt();
 303         }
 304         return a;
 305     }
 306 
 307     static short[] generateS() {
 308         short[] a = new short[RANGE];
 309         for (int i = 0; i < a.length; i++) {
 310             a[i] = (short)RANDOM.nextInt();
 311         }
 312         return a;
 313     }
 314 
 315     static int[] generateI() {
 316         int[] a = new int[RANGE];
 317         for (int i = 0; i < a.length; i++) {
 318             a[i] = RANDOM.nextInt();
 319         }
 320         return a;
 321     }
 322 
 323     static long[] generateL() {
 324         long[] a = new long[RANGE];
 325         for (int i = 0; i < a.length; i++) {
 326             a[i] = RANDOM.nextLong();
 327         }
 328         return a;
 329     }
 330 
 331     static void verify(String name, Object[] gold, Object[] result) {
 332         if (gold.length != result.length) {
 333             throw new RuntimeException("verify " + name + ": not the same number of outputs: gold.length = " +
 334                                        gold.length + ", result.length = " + result.length);
 335         }
 336         for (int i = 0; i < gold.length; i++) {
 337             Object g = gold[i];
 338             Object r = result[i];
 339             if (g.getClass() != r.getClass() || !g.getClass().isArray() || !r.getClass().isArray()) {
 340                 throw new RuntimeException("verify " + name + ": must both be array of same type:" +
 341                                            " gold[" + i + "].getClass() = " + g.getClass().getSimpleName() +
 342                                            " result[" + i + "].getClass() = " + r.getClass().getSimpleName());
 343             }
 344             if (g == r) {
 345                 throw new RuntimeException("verify " + name + ": should be two separate arrays (with identical content):" +
 346                                            " gold[" + i + "] == result[" + i + "]");
 347             }
 348             if (Array.getLength(g) != Array.getLength(r)) {
 349                     throw new RuntimeException("verify " + name + ": arrays must have same length:" +
 350                                            " gold[" + i + "].length = " + Array.getLength(g) +
 351                                            " result[" + i + "].length = " + Array.getLength(r));
 352             }
 353             Class c = g.getClass().getComponentType();
 354             if (c == byte.class) {
 355                 verifyB(name, i, (byte[])g, (byte[])r);
 356             } else if (c == short.class) {
 357                 verifyS(name, i, (short[])g, (short[])r);
 358             } else if (c == int.class) {
 359                 verifyI(name, i, (int[])g, (int[])r);
 360             } else if (c == long.class) {
 361                 verifyL(name, i, (long[])g, (long[])r);
 362             } else {
 363                 throw new RuntimeException("verify " + name + ": array type not supported for verify:" +
 364                                        " gold[" + i + "].getClass() = " + g.getClass().getSimpleName() +
 365                                        " result[" + i + "].getClass() = " + r.getClass().getSimpleName());
 366             }
 367         }
 368     }
 369 
 370     static void verifyB(String name, int i, byte[] g, byte[] r) {
 371         for (int j = 0; j < g.length; j++) {
 372             if (g[j] != r[j]) {
 373                 throw new RuntimeException("verify " + name + ": arrays must have same content:" +
 374                                            " gold[" + i + "][" + j + "] = " + g[j] +
 375                                            " result[" + i + "][" + j + "] = " + r[j]);
 376             }
 377         }
 378     }
 379 
 380     static void verifyS(String name, int i, short[] g, short[] r) {
 381         for (int j = 0; j < g.length; j++) {
 382             if (g[j] != r[j]) {
 383                 throw new RuntimeException("verify " + name + ": arrays must have same content:" +
 384                                            " gold[" + i + "][" + j + "] = " + g[j] +
 385                                            " result[" + i + "][" + j + "] = " + r[j]);
 386             }
 387         }
 388     }
 389 
 390     static void verifyI(String name, int i, int[] g, int[] r) {
 391         for (int j = 0; j < g.length; j++) {
 392             if (g[j] != r[j]) {
 393                 throw new RuntimeException("verify " + name + ": arrays must have same content:" +
 394                                            " gold[" + i + "][" + j + "] = " + g[j] +
 395                                            " result[" + i + "][" + j + "] = " + r[j]);
 396             }
 397         }
 398     }
 399 
 400     static void verifyL(String name, int i, long[] g, long[] r) {
 401         for (int j = 0; j < g.length; j++) {
 402             if (g[j] != r[j]) {
 403                 throw new RuntimeException("verify " + name + ": arrays must have same content:" +
 404                                            " gold[" + i + "][" + j + "] = " + g[j] +
 405                                            " result[" + i + "][" + j + "] = " + r[j]);
 406             }
 407         }
 408     }
 409 
  410     @Test
  411     @IR(counts = {IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE_4, "> 0",
  412                   IRNode.AND_VB,        IRNode.VECTOR_SIZE_4, "> 0",
  413                   IRNode.STORE_VECTOR, "> 0"},
  414         applyIf = {"MaxVectorSize", ">=8"},
  415         applyIfPlatform = {"64-bit", "true"},
  416         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"})
          // Writes a[k] & mask into b[k] for the first four bytes (offsets 0..3) of every
          // 8-element stride. The IR rule is not conditioned on AlignVector: it expects
          // 4-element byte vectors whenever MaxVectorSize >= 8.
          // Returns both arrays so the caller can compare them against the gold run.
  417     static Object[] test0(byte[] a, byte[] b, byte mask) {
  418         for (int i = 0; i < RANGE; i+=8) {
  419             // Safe to vectorize with AlignVector
  420             b[i+0] = (byte)(a[i+0] & mask); // offset 0, align 0
  421             b[i+1] = (byte)(a[i+1] & mask);
  422             b[i+2] = (byte)(a[i+2] & mask);
  423             b[i+3] = (byte)(a[i+3] & mask);
  424         }
  425         return new Object[]{ a, b };
  426     }
 427 
  428     @Test
  429     @IR(counts = {IRNode.LOAD_VECTOR_B, "> 0",
  430                   IRNode.AND_VB, "> 0",
  431                   IRNode.STORE_VECTOR, "> 0"},
  432         applyIfOr = {"UseCompactObjectHeaders", "false", "AlignVector", "false"},
  433         // UNSAFE.ARRAY_BYTE_BASE_OFFSET = 16, but with compact object headers UNSAFE.ARRAY_BYTE_BASE_OFFSET=12.
  434         // If AlignVector=true, we need the offset to be 8-byte aligned, else the vectors are filtered out.
  435         applyIfPlatform = {"64-bit", "true"},
  436         applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true", "rvv", "true"})
          // Writes a[k] & mask into b[k] for all eight bytes of every 8-element stride,
          // starting at index 0. The IR rule is checked unless both UseCompactObjectHeaders
          // and AlignVector are enabled (see the base-offset comment in the annotation).
          // Returns both arrays so the caller can compare them against the gold run.
  437     static Object[] test1a(byte[] a, byte[] b, byte mask) {
  438         for (int i = 0; i < RANGE; i+=8) {
  439             b[i+0] = (byte)(a[i+0] & mask); // adr = base + UNSAFE.ARRAY_BYTE_BASE_OFFSET + 0 + iter*8
  440             b[i+1] = (byte)(a[i+1] & mask);
  441             b[i+2] = (byte)(a[i+2] & mask);
  442             b[i+3] = (byte)(a[i+3] & mask);
  443             b[i+4] = (byte)(a[i+4] & mask);
  444             b[i+5] = (byte)(a[i+5] & mask);
  445             b[i+6] = (byte)(a[i+6] & mask);
  446             b[i+7] = (byte)(a[i+7] & mask);
  447         }
  448         return new Object[]{ a, b };
  449     }
 450 
  451     @Test
  452     @IR(counts = {IRNode.LOAD_VECTOR_B, "> 0",
  453                   IRNode.AND_VB, "> 0",
  454                   IRNode.STORE_VECTOR, "> 0"},
  455         applyIfOr = {"UseCompactObjectHeaders", "true", "AlignVector", "false"},
  456         // UNSAFE.ARRAY_BYTE_BASE_OFFSET = 16, but with compact object headers UNSAFE.ARRAY_BYTE_BASE_OFFSET=12.
  457         // If AlignVector=true, we need the offset to be 8-byte aligned, else the vectors are filtered out.
  458         applyIfPlatform = {"64-bit", "true"},
  459         applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true", "rvv", "true"})
          // Counterpart of test1a with the loop starting at index 4 (and stopping before
          // RANGE-8 so that i+7 stays in bounds). The IR rule is checked when
          // UseCompactObjectHeaders is enabled or AlignVector is disabled.
          // Returns both arrays so the caller can compare them against the gold run.
  460     static Object[] test1b(byte[] a, byte[] b, byte mask) {
  461         for (int i = 4; i < RANGE-8; i+=8) {
  462             b[i+0] = (byte)(a[i+0] & mask); // adr = base + UNSAFE.ARRAY_BYTE_BASE_OFFSET + 4 + iter*8
  463             b[i+1] = (byte)(a[i+1] & mask);
  464             b[i+2] = (byte)(a[i+2] & mask);
  465             b[i+3] = (byte)(a[i+3] & mask);
  466             b[i+4] = (byte)(a[i+4] & mask);
  467             b[i+5] = (byte)(a[i+5] & mask);
  468             b[i+6] = (byte)(a[i+6] & mask);
  469             b[i+7] = (byte)(a[i+7] & mask);
  470         }
  471         return new Object[]{ a, b };
  472     }
 473 
  474     @Test
  475     @IR(counts = {IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE_4, "> 0",
  476                   IRNode.AND_VB,        IRNode.VECTOR_SIZE_4, "> 0",
  477                   IRNode.STORE_VECTOR, "> 0"},
  478         applyIfAnd = {"AlignVector", "false", "MaxVectorSize", ">=8"},
  479         applyIfPlatform = {"64-bit", "true"},
  480         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"})
  481     @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
  482                   IRNode.AND_VB, "= 0",
  483                   IRNode.STORE_VECTOR, "= 0"},
  484         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"},
  485         applyIfPlatform = {"64-bit", "true"},
  486         applyIf = {"AlignVector", "true"})
          // Writes a[k] & mask into b[k] for offsets 3..6 of every 8-element stride.
          // First rule: without AlignVector, 4-element byte vectors are expected.
          // Second rule: with AlignVector, no vector load/and/store may appear.
          // Returns both arrays so the caller can compare them against the gold run.
  487     static Object[] test2(byte[] a, byte[] b, byte mask) {
  488         for (int i = 0; i < RANGE; i+=8) {
  489             // Cannot align with AlignVector: 3 + x * 8 % 8 = 3
  490             b[i+3] = (byte)(a[i+3] & mask); // at alignment 3
  491             b[i+4] = (byte)(a[i+4] & mask);
  492             b[i+5] = (byte)(a[i+5] & mask);
  493             b[i+6] = (byte)(a[i+6] & mask);
  494         }
  495         return new Object[]{ a, b };
  496     }
 497 
  498     @Test
  499     @IR(counts = {IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE_4, "> 0",
  500                   IRNode.AND_VB,        IRNode.VECTOR_SIZE_4, "> 0",
  501                   IRNode.STORE_VECTOR, "> 0"},
  502         applyIfAnd = {"AlignVector", "false", "MaxVectorSize", ">=8"},
  503         applyIfPlatform = {"64-bit", "true"},
  504         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"})
  505     @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
  506                   IRNode.AND_VB, "= 0",
  507                   IRNode.STORE_VECTOR, "= 0"},
  508         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"},
  509         applyIfPlatform = {"64-bit", "true"},
  510         applyIf = {"AlignVector", "true"})
          // Same four accesses at offsets 3..6 as test2, plus one separate access at
          // offset 0 of every 8-element stride. The IR expectations are the same as for
          // test2: 4-element vectors without AlignVector, no vectors with AlignVector.
          // Returns both arrays so the caller can compare them against the gold run.
  511     static Object[] test3(byte[] a, byte[] b, byte mask) {
  512         for (int i = 0; i < RANGE; i+=8) {
  513             // Cannot align with AlignVector: 3 + x * 8 % 8 = 3
  514 
  515             // Problematic for AlignVector
  516             b[i+0] = (byte)(a[i+0] & mask); // best_memref, align 0
  517 
  518             b[i+3] = (byte)(a[i+3] & mask); // pack at offset 3 bytes
  519             b[i+4] = (byte)(a[i+4] & mask);
  520             b[i+5] = (byte)(a[i+5] & mask);
  521             b[i+6] = (byte)(a[i+6] & mask);
  522         }
  523         return new Object[]{ a, b };
  524     }
 525 
  526     @Test
  527     @IR(counts = {IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE_4, "> 0",
  528                   IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE_8, "> 0",
  529                   IRNode.AND_VB,        IRNode.VECTOR_SIZE_4, "> 0",
  530                   IRNode.AND_VB,        IRNode.VECTOR_SIZE_8, "> 0",
  531                   IRNode.STORE_VECTOR, "> 0"},
  532         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"},
  533         applyIfPlatform = {"64-bit", "true"},
  534         applyIfAnd = {"AlignVector", "false", "MaxVectorSize", ">=16"})
  535     @IR(counts = {IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE_4, "> 0",
  536                   IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE_8, "= 0",// unaligned
  537                   IRNode.AND_VB,        IRNode.VECTOR_SIZE_4, "> 0",
  538                   IRNode.AND_VB,        IRNode.VECTOR_SIZE_8, "= 0",// unaligned
  539                   IRNode.STORE_VECTOR, "> 0"},
  540         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"},
  541         applyIfPlatform = {"64-bit", "true"},
  542         applyIfAnd = {"AlignVector", "true", "MaxVectorSize", ">=16"})
          // Uses a unit-stride counter with index i*16, so every iteration covers a
          // 16-element chunk: four consecutive bytes at offsets 0..3 and eight
          // consecutive bytes at offsets 5..12.
          // First rule (AlignVector off): both 4- and 8-element vectors are expected.
          // Second rule (AlignVector on): only the 4-element vectors may remain.
          // Returns both arrays so the caller can compare them against the gold run.
  543     static Object[] test4(byte[] a, byte[] b, byte mask) {
  544         for (int i = 0; i < RANGE/16; i++) {
  545             // Problematic for AlignVector
  546             b[i*16 + 0 ] = (byte)(a[i*16 + 0 ] & mask); // 4 pack, 0 aligned
  547             b[i*16 + 1 ] = (byte)(a[i*16 + 1 ] & mask);
  548             b[i*16 + 2 ] = (byte)(a[i*16 + 2 ] & mask);
  549             b[i*16 + 3 ] = (byte)(a[i*16 + 3 ] & mask);
  550 
  551             b[i*16 + 5 ] = (byte)(a[i*16 + 5 ] & mask); // 8 pack, 5 aligned
  552             b[i*16 + 6 ] = (byte)(a[i*16 + 6 ] & mask);
  553             b[i*16 + 7 ] = (byte)(a[i*16 + 7 ] & mask);
  554             b[i*16 + 8 ] = (byte)(a[i*16 + 8 ] & mask);
  555             b[i*16 + 9 ] = (byte)(a[i*16 + 9 ] & mask);
  556             b[i*16 + 10] = (byte)(a[i*16 + 10] & mask);
  557             b[i*16 + 11] = (byte)(a[i*16 + 11] & mask);
  558             b[i*16 + 12] = (byte)(a[i*16 + 12] & mask);
  559         }
  560         return new Object[]{ a, b };
  561     }
 562 
  563     @Test
  564     @IR(counts = {IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE_4, "> 0",
  565                   IRNode.AND_VB,        IRNode.VECTOR_SIZE_4, "> 0",
  566                   IRNode.STORE_VECTOR, "> 0"},
  567         applyIfAnd = {"AlignVector", "false", "MaxVectorSize", ">=8"},
  568         applyIfPlatform = {"64-bit", "true"},
  569         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"})
  570     @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
  571                   IRNode.AND_VB, "= 0",
  572                   IRNode.STORE_VECTOR, "= 0"},
  573         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"},
  574         applyIfPlatform = {"64-bit", "true"},
  575         applyIf = {"AlignVector", "true"})
          // Same access pattern as test3 (offset 0 plus offsets 3..6 per 8-element
          // stride), but every index additionally includes the loop-invariant
          // parameter inv. The scenario registered in the constructor passes inv = 0.
          // Expectations: 4-element vectors without AlignVector, none with AlignVector.
          // Returns both arrays so the caller can compare them against the gold run.
  576     static Object[] test5(byte[] a, byte[] b, byte mask, int inv) {
  577         for (int i = 0; i < RANGE; i+=8) {
  578             // Cannot align with AlignVector because of invariant
  579             b[i+inv+0] = (byte)(a[i+inv+0] & mask);
  580 
  581             b[i+inv+3] = (byte)(a[i+inv+3] & mask);
  582             b[i+inv+4] = (byte)(a[i+inv+4] & mask);
  583             b[i+inv+5] = (byte)(a[i+inv+5] & mask);
  584             b[i+inv+6] = (byte)(a[i+inv+6] & mask);
  585         }
  586         return new Object[]{ a, b };
  587     }
 588 
  589     @Test
  590     @IR(counts = {IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE_4, "> 0",
  591                   IRNode.AND_VB,        IRNode.VECTOR_SIZE_4, "> 0",
  592                   IRNode.STORE_VECTOR, "> 0"},
  593         applyIfAnd = {"AlignVector", "false", "MaxVectorSize", ">=8"},
  594         applyIfPlatform = {"64-bit", "true"},
  595         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"})
  596     @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
  597                   IRNode.AND_VB, "= 0",
  598                   IRNode.STORE_VECTOR, "= 0"},
  599         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"},
  600         applyIfPlatform = {"64-bit", "true"},
  601         applyIf = {"AlignVector", "true"})
          // Same access pattern as test3 (offset 0 plus offsets 3..6), but the index is
          // built as i*4 with i advancing by 2, i.e. the element index still advances
          // by 8 per iteration. Only the first RANGE/2 elements are touched.
          // Expectations: 4-element vectors without AlignVector, none with AlignVector.
          // Returns both arrays so the caller can compare them against the gold run.
  602     static Object[] test6(byte[] a, byte[] b, byte mask) {
  603         for (int i = 0; i < RANGE/8; i+=2) {
  604             // Cannot align with AlignVector because offset is odd
  605             b[i*4+0] = (byte)(a[i*4+0] & mask);
  606 
  607             b[i*4+3] = (byte)(a[i*4+3] & mask);
  608             b[i*4+4] = (byte)(a[i*4+4] & mask);
  609             b[i*4+5] = (byte)(a[i*4+5] & mask);
  610             b[i*4+6] = (byte)(a[i*4+6] & mask);
  611         }
  612         return new Object[]{ a, b };
  613     }
 614 
  615     @Test
  616     @IR(counts = {IRNode.LOAD_VECTOR_S, IRNode.VECTOR_SIZE_4, "> 0",
  617                   IRNode.AND_VS,        IRNode.VECTOR_SIZE_4, "> 0",
  618                   IRNode.STORE_VECTOR, "> 0"},
  619         applyIfAnd = {"AlignVector", "false", "MaxVectorSize", ">=16"},
  620         applyIfPlatform = {"64-bit", "true"},
  621         applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true", "rvv", "true"})
  622     @IR(counts = {IRNode.LOAD_VECTOR_S, "= 0",
  623                   IRNode.AND_VS, "= 0",
  624                   IRNode.STORE_VECTOR, "= 0"},
  625         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"},
  626         applyIfPlatform = {"64-bit", "true"},
  627         applyIf = {"AlignVector", "true"})
          // short[] variant of test6: identical index pattern (i*4 with i += 2; offset 0
          // plus offsets 3..6), operating on short elements.
          // Expectations: 4-element short vectors without AlignVector (MaxVectorSize >= 16),
          // no vectors with AlignVector.
          // NOTE(review): the first rule requires avx2 while the second lists sse4.1 --
          // confirm that this asymmetry is intended.
          // Returns both arrays so the caller can compare them against the gold run.
  628     static Object[] test7(short[] a, short[] b, short mask) {
  629         for (int i = 0; i < RANGE/8; i+=2) {
  630             // Cannot align with AlignVector because offset is odd
  631             b[i*4+0] = (short)(a[i*4+0] & mask);
  632 
  633             b[i*4+3] = (short)(a[i*4+3] & mask);
  634             b[i*4+4] = (short)(a[i*4+4] & mask);
  635             b[i*4+5] = (short)(a[i*4+5] & mask);
  636             b[i*4+6] = (short)(a[i*4+6] & mask);
  637         }
  638         return new Object[]{ a, b };
  639     }
 640 
  641     @Test
  642     @IR(counts = {IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE_4, "> 0",
  643                   IRNode.AND_VB,        IRNode.VECTOR_SIZE_4, "> 0",
  644                   IRNode.STORE_VECTOR, "> 0"},
  645         applyIfAnd = {"AlignVector", "false", "MaxVectorSize", ">=8"},
  646         applyIfPlatform = {"64-bit", "true"},
  647         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"})
  648     @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
  649                   IRNode.AND_VB, "= 0",
  650                   IRNode.STORE_VECTOR, "= 0"},
  651         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"},
  652         applyIfPlatform = {"64-bit", "true"},
  653         applyIf = {"AlignVector", "true"})
          // Same access pattern as test3 (offset 0 plus offsets 3..6 per 8-element
          // stride), but the loop starts at the caller-supplied value init instead of a
          // constant. The constructor registers calls with init = 0 and init = 1.
          // Expectations: 4-element vectors without AlignVector, none with AlignVector.
          // Returns both arrays so the caller can compare them against the gold run.
  654     static Object[] test8(byte[] a, byte[] b, byte mask, int init) {
  655         for (int i = init; i < RANGE; i+=8) {
  656             // Cannot align with AlignVector because of invariant (variable init becomes invar)
  657             b[i+0] = (byte)(a[i+0] & mask);
  658 
  659             b[i+3] = (byte)(a[i+3] & mask);
  660             b[i+4] = (byte)(a[i+4] & mask);
  661             b[i+5] = (byte)(a[i+5] & mask);
  662             b[i+6] = (byte)(a[i+6] & mask);
  663         }
  664         return new Object[]{ a, b };
  665     }
 666 
 667     @Test
 668     @IR(counts = {IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE_4, "> 0",
 669                   IRNode.AND_VB,        IRNode.VECTOR_SIZE_4, "> 0",
 670                   IRNode.STORE_VECTOR, "> 0"},
 671         applyIf = {"MaxVectorSize", ">=8"},
 672         applyIfPlatform = {"64-bit", "true"},
 673         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"})
 674     static Object[] test9(byte[] a, byte[] b, byte mask) {
             // Byte-array case with a constant loop start of 13 and a stride of 8 elements.
             // The single IR rule has no AlignVector condition, so 4-element byte vectors are
             // expected for either AlignVector setting (given MaxVectorSize>=8).
 675         // known non-zero init value does not affect offset, but has implicit effect on iv
 676         for (int i = 13; i < RANGE-8; i+=8) {
 677             b[i+0] = (byte)(a[i+0] & mask);
 678 
                 // Offsets +1 and +2 are not touched.
 679             b[i+3] = (byte)(a[i+3] & mask);
 680             b[i+4] = (byte)(a[i+4] & mask);
 681             b[i+5] = (byte)(a[i+5] & mask);
 682             b[i+6] = (byte)(a[i+6] & mask);
 683         }
 684         return new Object[]{ a, b };
 685     }
 686 
 687     @Test
 688     @IR(counts = {IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE_4, "> 0",
 689                   IRNode.AND_VB,        IRNode.VECTOR_SIZE_4, "> 0",
 690                   IRNode.STORE_VECTOR, "> 0"},
 691         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"},
 692         applyIfPlatform = {"64-bit", "true"},
 693         applyIfAnd = {"AlignVector", "false", "MaxVectorSize", ">=8"})
 694     @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
 695                   IRNode.AND_VB, "= 0",
 696                   IRNode.STORE_VECTOR, "= 0"},
 697         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"},
 698         applyIfPlatform = {"64-bit", "true"},
 699         applyIf = {"AlignVector", "true"})
 700     static Object[] test10a(byte[] a, byte[] b, byte mask) {
             // Masks 4 consecutive bytes (offsets +0..+3) per iteration; loop starts at 3, stride 8.
             // IR rules: 4-element byte vectors expected with AlignVector=false (MaxVectorSize>=8),
             // and no vector load/and/store nodes with AlignVector=true.
 701         // This is not alignable with pre-loop, because of odd init.
 702         for (int i = 3; i < RANGE-8; i+=8) {
 703             b[i+0] = (byte)(a[i+0] & mask);
 704             b[i+1] = (byte)(a[i+1] & mask);
 705             b[i+2] = (byte)(a[i+2] & mask);
 706             b[i+3] = (byte)(a[i+3] & mask);
 707         }
 708         return new Object[]{ a, b };
 709     }
 710 
 711     @Test
 712     @IR(counts = {IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE_4, "> 0",
 713                   IRNode.AND_VB,        IRNode.VECTOR_SIZE_4, "> 0",
 714                   IRNode.STORE_VECTOR, "> 0"},
 715         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"},
 716         applyIfPlatform = {"64-bit", "true"},
 717         applyIfAnd = {"AlignVector", "false", "MaxVectorSize", ">=8"})
 718     @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
 719                   IRNode.AND_VB, "= 0",
 720                   IRNode.STORE_VECTOR, "= 0"},
 721         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"},
 722         applyIfPlatform = {"64-bit", "true"},
 723         applyIf = {"AlignVector", "true"})
 724     static Object[] test10b(byte[] a, byte[] b, byte mask) {
             // Masks 4 consecutive bytes (offsets +0..+3) per iteration; loop starts at 13, stride 8.
             // IR rules: 4-element byte vectors expected with AlignVector=false (MaxVectorSize>=8),
             // and no vector load/and/store nodes with AlignVector=true.
 725         // This is not alignable with pre-loop, because of odd init.
 726         // Seems not correctly handled.
             // NOTE(review): the remark above does not say what is mishandled; the IR rules on this
             // method expect no vectorization under AlignVector=true - confirm the remark is still accurate.
 727         for (int i = 13; i < RANGE-8; i+=8) {
 728             b[i+0] = (byte)(a[i+0] & mask);
 729             b[i+1] = (byte)(a[i+1] & mask);
 730             b[i+2] = (byte)(a[i+2] & mask);
 731             b[i+3] = (byte)(a[i+3] & mask);
 732         }
 733         return new Object[]{ a, b };
 734     }
 735 
 736     @Test
 737     @IR(counts = {IRNode.LOAD_VECTOR_S, IRNode.VECTOR_SIZE_4, "> 0",
 738                   IRNode.AND_VS,        IRNode.VECTOR_SIZE_4, "> 0",
 739                   IRNode.STORE_VECTOR, "> 0"},
 740         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"},
 741         applyIfPlatform = {"64-bit", "true"},
 742         applyIfAnd = {"AlignVector", "false", "MaxVectorSize", ">=16"})
 743     @IR(counts = {IRNode.LOAD_VECTOR_S, "= 0",
 744                   IRNode.AND_VS, "= 0",
 745                   IRNode.STORE_VECTOR, "= 0"},
 746         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"},
 747         applyIfPlatform = {"64-bit", "true"},
 748         applyIf = {"AlignVector", "true"})
 749     static Object[] test10c(short[] a, short[] b, short mask) {
             // Short-array case: masks 4 consecutive shorts (offsets +0..+3) per iteration;
             // loop starts at 13, stride 8 elements.
             // IR rules: 4-element short vectors expected with AlignVector=false (MaxVectorSize>=16),
             // and no vector load/and/store nodes with AlignVector=true.
 750         // This is not alignable with pre-loop, because of odd init.
 751         // Seems not correctly handled with MaxVectorSize >= 32.
             // NOTE(review): the remark above does not say what is mishandled - confirm it is still accurate.
 752         for (int i = 13; i < RANGE-8; i+=8) {
 753             b[i+0] = (short)(a[i+0] & mask);
 754             b[i+1] = (short)(a[i+1] & mask);
 755             b[i+2] = (short)(a[i+2] & mask);
 756             b[i+3] = (short)(a[i+3] & mask);
 757         }
 758         return new Object[]{ a, b };
 759     }
 760 
 761     @Test
 762     @IR(counts = {IRNode.LOAD_VECTOR_S, IRNode.VECTOR_SIZE_4, "> 0",
 763                   IRNode.AND_VS,        IRNode.VECTOR_SIZE_4, "> 0",
 764                   IRNode.STORE_VECTOR, "> 0"},
 765         applyIfAnd = {"MaxVectorSize", ">=16", "UseCompactObjectHeaders", "false"},
 766         // UNSAFE.ARRAY_SHORT_BASE_OFFSET = 16, but with compact object headers UNSAFE.ARRAY_SHORT_BASE_OFFSET=12.
 767         // If AlignVector=true, we need the offset to be 8-byte aligned, else the vectors are filtered out.
 768         applyIfPlatform = {"64-bit", "true"},
 769         applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true", "rvv", "true"})
 770     static Object[] test10d(short[] a, short[] b, short mask) {
             // Short-array case: masks 4 consecutive shorts at offsets +3..+6 per iteration;
             // loop starts at 13, stride 8 elements. The rule only applies without compact object headers.
 771         for (int i = 13; i < RANGE-16; i+=8) {
 772             // adr = base + UNSAFE.ARRAY_SHORT_BASE_OFFSET + 2*(3 + 13) + iter*16
                 //     = base + 16 + 32 + iter*16 -> multiple of 8 when the base offset is 16
                 //     (with base offset 12 it would be 44 + iter*16, never a multiple of 8)
 773             b[i+0+3] = (short)(a[i+0+3] & mask);
 774             b[i+1+3] = (short)(a[i+1+3] & mask);
 775             b[i+2+3] = (short)(a[i+2+3] & mask);
 776             b[i+3+3] = (short)(a[i+3+3] & mask);
 777         }
 778         return new Object[]{ a, b };
 779     }
 780 
 781     @Test
 782     @IR(counts = {IRNode.LOAD_VECTOR_S, IRNode.VECTOR_SIZE_4, "> 0",
 783                   IRNode.AND_VS,        IRNode.VECTOR_SIZE_4, "> 0",
 784                   IRNode.STORE_VECTOR, "> 0"},
 785         applyIfAnd = {"MaxVectorSize", ">=16", "UseCompactObjectHeaders", "true"},
 786         // UNSAFE.ARRAY_SHORT_BASE_OFFSET = 16, but with compact object headers UNSAFE.ARRAY_SHORT_BASE_OFFSET=12.
 787         // If AlignVector=true, we need the offset to be 8-byte aligned, else the vectors are filtered out.
 788         applyIfPlatform = {"64-bit", "true"},
 789         applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true", "rvv", "true"})
 790     static Object[] test10e(short[] a, short[] b, short mask) {
             // Short-array case: masks 4 consecutive shorts at offsets +3..+6 per iteration;
             // loop starts at 11, stride 8 elements. The rule only applies with compact object headers.
 791         for (int i = 11; i < RANGE-16; i+=8) {
 792             // adr = base + UNSAFE.ARRAY_SHORT_BASE_OFFSET + 2*(3 + 11) + iter*16
                 //     = base + 12 + 28 + iter*16 -> multiple of 8 when the base offset is 12
                 //     (with base offset 16 it would be 44 + iter*16, never a multiple of 8)
 793             b[i+0+3] = (short)(a[i+0+3] & mask);
 794             b[i+1+3] = (short)(a[i+1+3] & mask);
 795             b[i+2+3] = (short)(a[i+2+3] & mask);
 796             b[i+3+3] = (short)(a[i+3+3] & mask);
 797         }
 798         return new Object[]{ a, b };
 799     }
 800 
 801     @Test
 802     @IR(counts = {IRNode.LOAD_VECTOR_B, "> 0",
 803                   IRNode.AND_VB, "> 0",
 804                   IRNode.STORE_VECTOR, "> 0"},
 805         applyIfPlatform = {"64-bit", "true"},
 806         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"})
 807     static Object[] test11aB(byte[] a, byte[] b, byte mask) {
             // Unit-stride byte loop from index 0: b[i] = a[i] & mask.
             // The single IR rule has no AlignVector condition, so vector nodes are expected either way.
 808         for (int i = 0; i < RANGE; i++) {
 809             // always alignable
 810             b[i+0] = (byte)(a[i+0] & mask);
 811         }
 812         return new Object[]{ a, b };
 813     }
 814 
 815     @Test
 816     @IR(counts = {IRNode.LOAD_VECTOR_S, "> 0",
 817                   IRNode.AND_VS, "> 0",
 818                   IRNode.STORE_VECTOR, "> 0"},
 819         applyIfPlatform = {"64-bit", "true"},
 820         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"})
 821     static Object[] test11aS(short[] a, short[] b, short mask) {
             // Unit-stride short loop from index 0: b[i] = a[i] & mask.
             // The single IR rule has no AlignVector condition, so vector nodes are expected either way.
 822         for (int i = 0; i < RANGE; i++) {
 823             // always alignable
 824             b[i+0] = (short)(a[i+0] & mask);
 825         }
 826         return new Object[]{ a, b };
 827     }
 828 
 829     @Test
 830     @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0",
 831                   IRNode.AND_VI, "> 0",
 832                   IRNode.STORE_VECTOR, "> 0"},
 833         applyIfPlatform = {"64-bit", "true"},
 834         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"})
 835     static Object[] test11aI(int[] a, int[] b, int mask) {
             // Unit-stride int loop from index 0: b[i] = a[i] & mask.
             // The single IR rule has no AlignVector condition, so vector nodes are expected either way.
 836         for (int i = 0; i < RANGE; i++) {
 837             // always alignable
 838             b[i+0] = (int)(a[i+0] & mask);
 839         }
 840         return new Object[]{ a, b };
 841     }
 842 
 843     @Test
 844     @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0",
 845                   IRNode.AND_VL, "> 0",
 846                   IRNode.STORE_VECTOR, "> 0"},
 847         applyIfPlatform = {"64-bit", "true"},
 848         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"})
 849     static Object[] test11aL(long[] a, long[] b, long mask) {
             // Unit-stride long loop from index 0: b[i] = a[i] & mask.
             // The single IR rule has no AlignVector condition, so vector nodes are expected either way.
 850         for (int i = 0; i < RANGE; i++) {
 851             // always alignable
 852             b[i+0] = (long)(a[i+0] & mask);
 853         }
 854         return new Object[]{ a, b };
 855     }
 856 
 857     @Test
 858     @IR(counts = {IRNode.LOAD_VECTOR_B, "> 0",
 859                   IRNode.AND_VB, "> 0",
 860                   IRNode.STORE_VECTOR, "> 0"},
 861         applyIfPlatform = {"64-bit", "true"},
 862         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"})
 863     static Object[] test11bB(byte[] a, byte[] b, byte mask) {
             // Unit-stride byte loop starting at index 1 (element 0 is left untouched): b[i] = a[i] & mask.
             // The single IR rule has no AlignVector condition, so vector nodes are expected either way.
 864         for (int i = 1; i < RANGE; i++) {
 865             // always alignable
 866             b[i+0] = (byte)(a[i+0] & mask);
 867         }
 868         return new Object[]{ a, b };
 869     }
 870 
 871     @Test
 872     @IR(counts = {IRNode.LOAD_VECTOR_S, "> 0",
 873                   IRNode.AND_VS, "> 0",
 874                   IRNode.STORE_VECTOR, "> 0"},
 875         applyIfPlatform = {"64-bit", "true"},
 876         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"})
 877     static Object[] test11bS(short[] a, short[] b, short mask) {
             // Unit-stride short loop starting at index 1 (element 0 is left untouched): b[i] = a[i] & mask.
             // The single IR rule has no AlignVector condition, so vector nodes are expected either way.
 878         for (int i = 1; i < RANGE; i++) {
 879             // always alignable
 880             b[i+0] = (short)(a[i+0] & mask);
 881         }
 882         return new Object[]{ a, b };
 883     }
 884 
 885     @Test
 886     @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0",
 887                   IRNode.AND_VI, "> 0",
 888                   IRNode.STORE_VECTOR, "> 0"},
 889         applyIfPlatform = {"64-bit", "true"},
 890         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"})
 891     static Object[] test11bI(int[] a, int[] b, int mask) {
             // Unit-stride int loop starting at index 1 (element 0 is left untouched): b[i] = a[i] & mask.
             // The single IR rule has no AlignVector condition, so vector nodes are expected either way.
 892         for (int i = 1; i < RANGE; i++) {
 893             // always alignable
 894             b[i+0] = (int)(a[i+0] & mask);
 895         }
 896         return new Object[]{ a, b };
 897     }
 898 
 899     @Test
 900     @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0",
 901                   IRNode.AND_VL, "> 0",
 902                   IRNode.STORE_VECTOR, "> 0"},
 903         applyIfPlatform = {"64-bit", "true"},
 904         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"})
 905     static Object[] test11bL(long[] a, long[] b, long mask) {
             // Unit-stride long loop starting at index 1 (element 0 is left untouched): b[i] = a[i] & mask.
             // The single IR rule has no AlignVector condition, so vector nodes are expected either way.
 906         for (int i = 1; i < RANGE; i++) {
 907             // always alignable
 908             b[i+0] = (long)(a[i+0] & mask);
 909         }
 910         return new Object[]{ a, b };
 911     }
 912 
 913     @Test
 914     @IR(counts = {IRNode.LOAD_VECTOR_B, "> 0",
 915                   IRNode.AND_VB, "> 0",
 916                   IRNode.STORE_VECTOR, "> 0"},
 917         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"},
 918         applyIfPlatform = {"64-bit", "true"},
 919         applyIf = {"AlignVector", "false"})
 920     @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
 921                   IRNode.AND_VB, "= 0",
 922                   IRNode.STORE_VECTOR, "= 0"},
 923         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"},
 924         applyIfPlatform = {"64-bit", "true"},
 925         applyIf = {"AlignVector", "true"})
 926     static Object[] test11cB(byte[] a, byte[] b, byte mask) {
             // Unit-stride byte loop where the load index is one element ahead of the store index:
             // b[i] = a[i+1] & mask. IR rules: vector nodes expected with AlignVector=false,
             // none with AlignVector=true.
 927         for (int i = 1; i < RANGE-1; i++) {
 928             // 1 byte offset -> not alignable with AlignVector
 929             b[i+0] = (byte)(a[i+1] & mask);
 930         }
 931         return new Object[]{ a, b };
 932     }
 933 
 934     @Test
 935     @IR(counts = {IRNode.LOAD_VECTOR_S, "> 0",
 936                   IRNode.AND_VS, "> 0",
 937                   IRNode.STORE_VECTOR, "> 0"},
 938         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"},
 939         applyIfPlatform = {"64-bit", "true"},
 940         applyIf = {"AlignVector", "false"})
 941     @IR(counts = {IRNode.LOAD_VECTOR_S, "= 0",
 942                   IRNode.AND_VS, "= 0",
 943                   IRNode.STORE_VECTOR, "= 0"},
 944         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"},
 945         applyIfPlatform = {"64-bit", "true"},
 946         applyIf = {"AlignVector", "true"})
 947     static Object[] test11cS(short[] a, short[] b, short mask) {
             // Unit-stride short loop where the load index is one element ahead of the store index:
             // b[i] = a[i+1] & mask. IR rules: vector nodes expected with AlignVector=false,
             // none with AlignVector=true.
 948         for (int i = 1; i < RANGE-1; i++) {
 949             // 2 byte offset -> not alignable with AlignVector
 950             b[i+0] = (short)(a[i+1] & mask);
 951         }
 952         return new Object[]{ a, b };
 953     }
 954 
 955     @Test
 956     @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0",
 957                   IRNode.AND_VI, "> 0",
 958                   IRNode.STORE_VECTOR, "> 0"},
 959         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"},
 960         applyIfPlatform = {"64-bit", "true"},
 961         applyIf = {"AlignVector", "false"})
 962     @IR(counts = {IRNode.LOAD_VECTOR_I, "= 0",
 963                   IRNode.AND_VI, "= 0",
 964                   IRNode.STORE_VECTOR, "= 0"},
 965         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"},
 966         applyIfPlatform = {"64-bit", "true"},
 967         applyIf = {"AlignVector", "true"})
 968     static Object[] test11cI(int[] a, int[] b, int mask) {
             // Unit-stride int loop where the load index is one element ahead of the store index:
             // b[i] = a[i+1] & mask. IR rules: vector nodes expected with AlignVector=false,
             // none with AlignVector=true.
 969         for (int i = 1; i < RANGE-1; i++) {
 970             // 4 byte offset -> not alignable with AlignVector
 971             b[i+0] = (int)(a[i+1] & mask);
 972         }
 973         return new Object[]{ a, b };
 974     }
 975 
 976     @Test
 977     @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0",
 978                   IRNode.AND_VL, "> 0",
 979                   IRNode.STORE_VECTOR, "> 0"},
 980         applyIfPlatform = {"64-bit", "true"},
 981         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"})
 982     static Object[] test11cL(long[] a, long[] b, long mask) {
             // Unit-stride long loop where the load index is one element ahead of the store index:
             // b[i] = a[i+1] & mask. Unlike the byte/short/int siblings this has a single IR rule
             // without an AlignVector condition, so vector nodes are expected either way.
 983         for (int i = 1; i < RANGE-1; i++) {
 984             // always alignable (8 byte offset)
 985             b[i+0] = (long)(a[i+1] & mask);
 986         }
 987         return new Object[]{ a, b };
 988     }
 989 
 990     @Test
 991     @IR(counts = {IRNode.LOAD_VECTOR_B, "> 0",
 992                   IRNode.AND_VB, "> 0",
 993                   IRNode.STORE_VECTOR, "> 0"},
 994         applyIfPlatform = {"64-bit", "true"},
 995         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"})
 996     static Object[] test11dB(byte[] a, byte[] b, byte mask, int invar) {
             // Unit-stride byte loop whose index is shifted by the caller-supplied, loop-invariant
             // invar: b[i+invar] = a[i+invar] & mask.
             // The single IR rule has no AlignVector condition, so vector nodes are expected either way.
 997         for (int i = 0; i < RANGE; i++) {
 998             b[i+0+invar] = (byte)(a[i+0+invar] & mask);
 999         }
1000         return new Object[]{ a, b };
1001     }
1002 
1003     @Test
1004     @IR(counts = {IRNode.LOAD_VECTOR_S, "> 0",
1005                   IRNode.AND_VS, "> 0",
1006                   IRNode.STORE_VECTOR, "> 0"},
1007         applyIfPlatform = {"64-bit", "true"},
1008         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"})
1009     static Object[] test11dS(short[] a, short[] b, short mask, int invar) {
             // Unit-stride short loop whose index is shifted by the caller-supplied, loop-invariant
             // invar: b[i+invar] = a[i+invar] & mask.
             // The single IR rule has no AlignVector condition, so vector nodes are expected either way.
1010         for (int i = 0; i < RANGE; i++) {
1011             b[i+0+invar] = (short)(a[i+0+invar] & mask);
1012         }
1013         return new Object[]{ a, b };
1014     }
1015 
1016     @Test
1017     @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0",
1018                   IRNode.AND_VI, "> 0",
1019                   IRNode.STORE_VECTOR, "> 0"},
1020         applyIfPlatform = {"64-bit", "true"},
1021         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"})
1022     static Object[] test11dI(int[] a, int[] b, int mask, int invar) {
             // Unit-stride int loop whose index is shifted by the caller-supplied, loop-invariant
             // invar: b[i+invar] = a[i+invar] & mask.
             // The single IR rule has no AlignVector condition, so vector nodes are expected either way.
1023         for (int i = 0; i < RANGE; i++) {
1024             b[i+0+invar] = (int)(a[i+0+invar] & mask);
1025         }
1026         return new Object[]{ a, b };
1027     }
1028 
1029     @Test
1030     @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0",
1031                   IRNode.AND_VL, "> 0",
1032                   IRNode.STORE_VECTOR, "> 0"},
1033         applyIfPlatform = {"64-bit", "true"},
1034         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"})
1035     static Object[] test11dL(long[] a, long[] b, long mask, int invar) {
             // Unit-stride long loop whose index is shifted by the caller-supplied, loop-invariant
             // invar: b[i+invar] = a[i+invar] & mask.
             // The single IR rule has no AlignVector condition, so vector nodes are expected either way.
1036         for (int i = 0; i < RANGE; i++) {
1037             b[i+0+invar] = (long)(a[i+0+invar] & mask);
1038         }
1039         return new Object[]{ a, b };
1040     }
1041 
1042     @Test
1043     @IR(counts = {IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE + "min(max_byte, 4)", "> 0",
1044                   IRNode.AND_VB,        IRNode.VECTOR_SIZE + "min(max_byte, 4)", "> 0",
1045                   IRNode.STORE_VECTOR,                                           "> 0"},
1046         applyIfPlatform = {"64-bit", "true"},
1047         applyIf = {"AlignVector", "false"},
1048         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"})
1049     static Object[] test12(byte[] a, byte[] b, byte mask) {
             // Byte loop with an element stride of 6: each iteration masks a[i*6+0..3] into b and
             // leaves i*6+4 and i*6+5 untouched.
             // The only IR rule applies when AlignVector=false; nothing is asserted for AlignVector=true.
1050         for (int i = 0; i < RANGE/16; i++) {
1051             // Non-power-of-2 stride. Vectorization of 4 bytes, then 2-bytes gap.
1052             b[i*6 + 0 ] = (byte)(a[i*6 + 0 ] & mask);
1053             b[i*6 + 1 ] = (byte)(a[i*6 + 1 ] & mask);
1054             b[i*6 + 2 ] = (byte)(a[i*6 + 2 ] & mask);
1055             b[i*6 + 3 ] = (byte)(a[i*6 + 3 ] & mask);
1056         }
1057         return new Object[]{ a, b };
1058     }
1059 
1060     @Test
1061     @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE + "min(max_int, max_long)", "> 0",
1062                   IRNode.LOAD_VECTOR_L, IRNode.VECTOR_SIZE + "min(max_int, max_long)", "> 0",
1063                   IRNode.ADD_VI, IRNode.VECTOR_SIZE + "min(max_int, max_long)", "> 0",
1064                   IRNode.ADD_VL, IRNode.VECTOR_SIZE + "min(max_int, max_long)", "> 0",
1065                   IRNode.STORE_VECTOR, "> 0"},
1066         applyIfPlatform = {"64-bit", "true"},
1067         applyIfCPUFeature = {"avx2", "true"})
1068     // require avx2 (rule above) to ensure vectors are larger than what unrolling produces
         // Second rule: same expectations on riscv64 with rvv, restricted to MaxVectorSize>=32.
1069     @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE + "min(max_int, max_long)", "> 0",
1070                   IRNode.LOAD_VECTOR_L, IRNode.VECTOR_SIZE + "min(max_int, max_long)", "> 0",
1071                   IRNode.ADD_VI, IRNode.VECTOR_SIZE + "min(max_int, max_long)", "> 0",
1072                   IRNode.ADD_VL, IRNode.VECTOR_SIZE + "min(max_int, max_long)", "> 0",
1073                   IRNode.STORE_VECTOR, "> 0"},
1074         applyIfPlatform = {"riscv64", "true"},
1075         applyIfCPUFeature = {"rvv", "true"},
1076         applyIf = {"MaxVectorSize", ">=32"})
1077     static Object[] test13aIL(int[] a, long[] b) {
             // Mixed element sizes in one unit-stride loop from index 0: increments every int in a
             // and every long in b. Neither IR rule carries an AlignVector condition.
1078         for (int i = 0; i < RANGE; i++) {
1079             a[i]++;
1080             b[i]++;
1081         }
1082         return new Object[]{ a, b };
1083     }
1084 
1085     @Test
1086     @IR(counts = {IRNode.LOAD_VECTOR_B, "> 0",
1087                   IRNode.LOAD_VECTOR_I, "> 0",
1088                   IRNode.ADD_VB, "> 0",
1089                   IRNode.ADD_VI, "> 0",
1090                   IRNode.STORE_VECTOR, "> 0"},
1091         applyIfOr = {"UseCompactObjectHeaders", "false", "AlignVector", "false"},
1092         applyIfPlatform = {"64-bit", "true"},
1093         applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true", "rvv", "true"})
1094     static Object[] test13aIB(int[] a, byte[] b) {
             // Mixed element sizes in one unit-stride loop from index 0: increments every int in a
             // and every byte in b. The IR rule is skipped only when both UseCompactObjectHeaders
             // and AlignVector are true (see the alignment analysis below).
1095         for (int i = 0; i < RANGE; i++) {
1096             // adr = base + UNSAFE.ARRAY_INT_BASE_OFFSET  + 4*iter
1097             //              = 16 (or 12 if UseCompactObjectHeaders=true)
1098             a[i]++;
1099             // adr = base + UNSAFE.ARRAY_BYTE_BASE_OFFSET + 1*iter
1100             //              = 16 (or 12 if UseCompactObjectHeaders=true)
1101             b[i]++;
1102             // For AlignVector, all adr must be 8-byte aligned. Let's see for which iteration this can hold:
1103             // If UseCompactObjectHeaders=false:
1104             //   a: 0, 2,  4,  6,  8, ...
1105             //   b: 0, 8, 16, 24, 32, ...
1106             //   -> Ok, aligns every 8th iteration.
1107             // If UseCompactObjectHeaders=true:
1108             //   a: 1,  3,  5,  7,  9, ...
1109             //   b: 4, 12, 20, 28, 36, ...
1110             //   -> we can never align both vectors!
1111         }
1112         return new Object[]{ a, b };
1113     }
1114 
1115     @Test
1116     @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0",
1117                   IRNode.LOAD_VECTOR_S, "> 0",
1118                   IRNode.ADD_VI, "> 0",
1119                   IRNode.ADD_VS, "> 0",
1120                   IRNode.STORE_VECTOR, "> 0"},
1121         applyIfOr = {"UseCompactObjectHeaders", "false", "AlignVector", "false"},
1122         applyIfPlatform = {"64-bit", "true"},
1123         applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true", "rvv", "true"})
1124     static Object[] test13aIS(int[] a, short[] b) {
             // Mixed element sizes in one unit-stride loop from index 0: increments every int in a
             // and every short in b. The IR rule is skipped only when both UseCompactObjectHeaders
             // and AlignVector are true (see the alignment analysis below).
1125         for (int i = 0; i < RANGE; i++) {
1126             // adr = base + UNSAFE.ARRAY_INT_BASE_OFFSET + 4*iter
1127             //              = 16 (or 12 if UseCompactObjectHeaders=true)
1128             a[i]++;
1129             // adr = base + UNSAFE.ARRAY_SHORT_BASE_OFFSET + 2*iter
1130             //              = 16 (or 12 if UseCompactObjectHeaders=true)
1131             b[i]++;
1132             // For AlignVector, all adr must be 8-byte aligned. Let's see for which iteration this can hold:
1133             // If UseCompactObjectHeaders=false:
1134             //   a: iter % 2 == 0
1135             //   b: iter % 4 == 0
1136             //   -> Ok, aligns every 4th iteration.
1137             // If UseCompactObjectHeaders=true:
1138             //   a: iter % 2 = 1
1139             //   b: iter % 4 = 2
1140             //   -> we can never align both vectors!
1141         }
1142         return new Object[]{ a, b };
1143     }
1144 
1145     @Test
1146     @IR(counts = {IRNode.LOAD_VECTOR_B, "> 0",
1147                   IRNode.LOAD_VECTOR_S, "> 0",
1148                   IRNode.LOAD_VECTOR_I, "> 0",
1149                   IRNode.LOAD_VECTOR_L, "> 0",
1150                   IRNode.ADD_VB, "> 0",
1151                   IRNode.ADD_VS, "> 0",
1152                   IRNode.ADD_VI, "> 0",
1153                   IRNode.ADD_VL, "> 0",
1154                   IRNode.STORE_VECTOR, "> 0"},
1155         applyIfOr = {"UseCompactObjectHeaders", "false", "AlignVector", "false"},
1156         applyIfPlatform = {"64-bit", "true"},
1157         applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true", "rvv", "true"})
1158     static Object[] test13aBSIL(byte[] a, short[] b, int[] c, long[] d) {
             // Four element sizes in one unit-stride loop from index 0: increments every element of
             // a byte[], short[], int[] and long[]. The IR rule is skipped only when both
             // UseCompactObjectHeaders and AlignVector are true (see the analysis below).
1159         for (int i = 0; i < RANGE; i++) {
1160             // adr = base + UNSAFE.ARRAY_BYTE_BASE_OFFSET + 1*iter
1161             //              = 16 (or 12 if UseCompactObjectHeaders=true)
1162             a[i]++;
1163             // adr = base + UNSAFE.ARRAY_SHORT_BASE_OFFSET + 2*iter
1164             //              = 16 (or 12 if UseCompactObjectHeaders=true)
1165             b[i]++;
1166             // adr = base + UNSAFE.ARRAY_INT_BASE_OFFSET + 4*iter
1167             //              = 16 (or 12 if UseCompactObjectHeaders=true)
1168             c[i]++;
1169             // adr = base + UNSAFE.ARRAY_LONG_BASE_OFFSET + 8*iter
1170             //              = 16 (always)
1171             d[i]++;
1172             // If AlignVector and UseCompactObjectHeaders, and we want all adr 8-byte aligned:
1173             //   a: iter % 8 = 4
1174             //   c: iter % 2 = 1
1175             //   -> can never align both vectors!
1176         }
             // All four arrays are returned, in parameter order.
1177         return new Object[]{ a, b, c, d };
1178     }
1179 
1180     @Test
1181     @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE + "min(max_int, max_long)", "> 0",
1182                   IRNode.LOAD_VECTOR_L, IRNode.VECTOR_SIZE + "min(max_int, max_long)", "> 0",
1183                   IRNode.ADD_VI, IRNode.VECTOR_SIZE + "min(max_int, max_long)", "> 0",
1184                   IRNode.ADD_VL, IRNode.VECTOR_SIZE + "min(max_int, max_long)", "> 0",
1185                   IRNode.STORE_VECTOR, "> 0"},
1186         applyIfPlatform = {"64-bit", "true"},
1187         applyIfCPUFeature = {"avx2", "true"})
1188     // require avx2 (rule above) to ensure vectors are larger than what unrolling produces
         // Second rule: same expectations on riscv64 with rvv, restricted to MaxVectorSize>=32.
1189     @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE + "min(max_int, max_long)", "> 0",
1190                   IRNode.LOAD_VECTOR_L, IRNode.VECTOR_SIZE + "min(max_int, max_long)", "> 0",
1191                   IRNode.ADD_VI, IRNode.VECTOR_SIZE + "min(max_int, max_long)", "> 0",
1192                   IRNode.ADD_VL, IRNode.VECTOR_SIZE + "min(max_int, max_long)", "> 0",
1193                   IRNode.STORE_VECTOR, "> 0"},
1194         applyIfPlatform = {"riscv64", "true"},
1195         applyIfCPUFeature = {"rvv", "true"},
1196         applyIf = {"MaxVectorSize", ">=32"})
1197     static Object[] test13bIL(int[] a, long[] b) {
             // Mixed element sizes in one unit-stride loop starting at index 1 (element 0 untouched):
             // increments every remaining int in a and long in b.
             // Neither IR rule carries an AlignVector condition.
1198         for (int i = 1; i < RANGE; i++) {
1199             a[i]++;
1200             b[i]++;
1201         }
1202         return new Object[]{ a, b };
1203     }
1204 
1205     @Test
1206     @IR(counts = {IRNode.LOAD_VECTOR_B, "> 0",
1207                   IRNode.LOAD_VECTOR_I, "> 0",
1208                   IRNode.ADD_VB, "> 0",
1209                   IRNode.ADD_VI, "> 0",
1210                   IRNode.STORE_VECTOR, "> 0"},
1211         applyIfOr = {"UseCompactObjectHeaders", "false", "AlignVector", "false"},
1212         applyIfPlatform = {"64-bit", "true"},
1213         applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true", "rvv", "true"})
1214     static Object[] test13bIB(int[] a, byte[] b) {
             // Mixed element sizes in one unit-stride loop starting at index 1 (element 0 untouched):
             // increments every remaining int in a and byte in b. The IR rule is skipped only when
             // both UseCompactObjectHeaders and AlignVector are true (see the analysis below).
1215         for (int i = 1; i < RANGE; i++) {
1216             // adr = base + UNSAFE.ARRAY_INT_BASE_OFFSET + 4 + 4*iter
1217             //              = 16 (or 12 if UseCompactObjectHeaders=true)
1218             a[i]++;
1219             // adr = base + UNSAFE.ARRAY_BYTE_BASE_OFFSET + 1 + 1*iter
1220             //              = 16 (or 12 if UseCompactObjectHeaders=true)
1221             b[i]++;
1222             // If AlignVector and UseCompactObjectHeaders, and we want all adr 8-byte aligned:
1223             //   a: iter % 2 = 0
1224             //   b: iter % 8 = 3
1225             //   -> can never align both vectors!
1226         }
1227         return new Object[]{ a, b };
1228     }
1229 
1230     @Test
1231     @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0",
1232                   IRNode.LOAD_VECTOR_S, "> 0",
1233                   IRNode.ADD_VI, "> 0",
1234                   IRNode.ADD_VS, "> 0",
1235                   IRNode.STORE_VECTOR, "> 0"},
1236         applyIfOr = {"UseCompactObjectHeaders", "false", "AlignVector", "false"},
1237         applyIfPlatform = {"64-bit", "true"},
1238         applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true", "rvv", "true"})
1239     static Object[] test13bIS(int[] a, short[] b) {
             // Mixed element sizes in one unit-stride loop starting at index 1 (element 0 untouched):
             // increments every remaining int in a and short in b. The IR rule is skipped only when
             // both UseCompactObjectHeaders and AlignVector are true (see the analysis below).
1240         for (int i = 1; i < RANGE; i++) {
1241             // adr = base + UNSAFE.ARRAY_INT_BASE_OFFSET + 4 + 4*iter
1242             //              = 16 (or 12 if UseCompactObjectHeaders=true)
1243             a[i]++;
1244             // adr = base + UNSAFE.ARRAY_SHORT_BASE_OFFSET + 2 + 2*iter
1245             //              = 16 (or 12 if UseCompactObjectHeaders=true)
1246             b[i]++;
1247             // If AlignVector and UseCompactObjectHeaders, and we want all adr 8-byte aligned:
1248             //   a: iter % 2 = 0
1249             //   b: iter % 4 = 1
1250             //   -> can never align both vectors!
1251         }
1252         return new Object[]{ a, b };
1253     }
1254 
1255     @Test
1256     @IR(counts = {IRNode.LOAD_VECTOR_B, "> 0",
1257                   IRNode.LOAD_VECTOR_S, "> 0",
1258                   IRNode.LOAD_VECTOR_I, "> 0",
1259                   IRNode.LOAD_VECTOR_L, "> 0",
1260                   IRNode.ADD_VB, "> 0",
1261                   IRNode.ADD_VS, "> 0",
1262                   IRNode.ADD_VI, "> 0",
1263                   IRNode.ADD_VL, "> 0",
1264                   IRNode.STORE_VECTOR, "> 0"},
1265         applyIfOr = {"UseCompactObjectHeaders", "false", "AlignVector", "false"},
1266         applyIfPlatform = {"64-bit", "true"},
1267         applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true", "rvv", "true"})
1268     static Object[] test13bBSIL(byte[] a, short[] b, int[] c, long[] d) {
             // Four element sizes in one unit-stride loop starting at index 1 (element 0 untouched):
             // increments every remaining element of a byte[], short[], int[] and long[].
             // The IR rule is skipped only when both UseCompactObjectHeaders and AlignVector are true.
1269         for (int i = 1; i < RANGE; i++) {
1270             // adr = base + UNSAFE.ARRAY_BYTE_BASE_OFFSET + 1 + 1*iter
1271             //              = 16 (or 12 if UseCompactObjectHeaders=true)
1272             a[i]++;
1273             // adr = base + UNSAFE.ARRAY_SHORT_BASE_OFFSET + 2 + 2*iter
1274             //              = 16 (or 12 if UseCompactObjectHeaders=true)
1275             b[i]++;
1276             // adr = base + UNSAFE.ARRAY_INT_BASE_OFFSET + 4 + 4*iter
1277             //              = 16 (or 12 if UseCompactObjectHeaders=true)
1278             c[i]++;
1279             // adr = base + UNSAFE.ARRAY_LONG_BASE_OFFSET + 8 + 8*iter
1280             //              = 16 (always)
1281             d[i]++;
1282             // If AlignVector and UseCompactObjectHeaders, and we want all adr 8-byte aligned:
1283             //   a: iter % 8 = 3
1284             //   c: iter % 2 = 0
1285             //   -> can never align both vectors!
1286         }
             // All four arrays are returned, in parameter order.
1287         return new Object[]{ a, b, c, d };
1288     }
1289 
1290     @Test
1291     @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
1292                   IRNode.ADD_VB, "= 0",
1293                   IRNode.STORE_VECTOR, "= 0"},
1294         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"},
1295         applyIfPlatform = {"64-bit", "true"},
1296         applyIf = {"AlignVector", "false"})
1297     @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
1298                   IRNode.ADD_VB, "= 0",
1299                   IRNode.STORE_VECTOR, "= 0"},
1300         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"},
1301         applyIfPlatform = {"64-bit", "true"},
1302         applyIf = {"AlignVector", "true"})
1303     static Object[] test14aB(byte[] a) {
             // In-place increment of 16 consecutive bytes (a[i+0..15]) per iteration with a stride
             // of 9, so consecutive iterations touch overlapping ranges of the same array.
             // Both IR rules expect zero vector load/add/store nodes, for either AlignVector setting.
1304         // non-power-of-2 stride
1305         for (int i = 0; i < RANGE-20; i+=9) {
1306             // Since the stride is shorter than the vector length, there will be always
1307             // partial overlap of loads with previous stores, this leads to failure in
1308             // store-to-load-forwarding -> vectorization not profitable.
1309             a[i+0]++;
1310             a[i+1]++;
1311             a[i+2]++;
1312             a[i+3]++;
1313             a[i+4]++;
1314             a[i+5]++;
1315             a[i+6]++;
1316             a[i+7]++;
1317             a[i+8]++;
1318             a[i+9]++;
1319             a[i+10]++;
1320             a[i+11]++;
1321             a[i+12]++;
1322             a[i+13]++;
1323             a[i+14]++;
1324             a[i+15]++;
1325         }
1326         return new Object[]{ a };
1327     }
1328 
1329     @Test
1330     @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
1331                   IRNode.ADD_VB, "= 0",
1332                   IRNode.STORE_VECTOR, "= 0"},
1333         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"},
1334         applyIfPlatform = {"64-bit", "true"},
1335         applyIf = {"AlignVector", "false"})
1336     @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
1337                   IRNode.ADD_VB, "= 0",
1338                   IRNode.STORE_VECTOR, "= 0"},
1339         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"},
1340         applyIfPlatform = {"64-bit", "true"},
1341         applyIf = {"AlignVector", "true"})
         // Byte increments over a[i+0] .. a[i+15], with i advancing by 3 while i < RANGE-20.
         // Both @IR rules (AlignVector=false and AlignVector=true) require zero byte vector
         // loads, zero byte vector adds and zero vector stores.
         // Returns the modified input array wrapped in an Object[].
1342     static Object[] test14bB(byte[] a) {
1343         // non-power-of-2 stride: i advances by 3 while 16 elements are touched per iteration,
             // so consecutive iterations overlap on 13 elements (a[i+3] .. a[i+15]).
1344         for (int i = 0; i < RANGE-20; i+=3) {
1345             // Since the stride is shorter than the vector length, there will always be a
1346             // partial overlap of loads with previous stores. This leads to failure in
1347             // store-to-load-forwarding -> vectorization not profitable.
1348             a[i+0]++;
1349             a[i+1]++;
1350             a[i+2]++;
1351             a[i+3]++;
1352             a[i+4]++;
1353             a[i+5]++;
1354             a[i+6]++;
1355             a[i+7]++;
1356             a[i+8]++;
1357             a[i+9]++;
1358             a[i+10]++;
1359             a[i+11]++;
1360             a[i+12]++;
1361             a[i+13]++;
1362             a[i+14]++;
1363             a[i+15]++;
1364         }
1365         return new Object[]{ a };
1366     }
1367 
1368     @Test
1369     @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
1370                   IRNode.ADD_VB, "= 0",
1371                   IRNode.STORE_VECTOR, "= 0"},
1372         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"},
1373         applyIfPlatform = {"64-bit", "true"},
1374         applyIf = {"AlignVector", "false"})
1375     @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
1376                   IRNode.ADD_VB, "= 0",
1377                   IRNode.STORE_VECTOR, "= 0"},
1378         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"},
1379         applyIfPlatform = {"64-bit", "true"},
1380         applyIf = {"AlignVector", "true"})
         // Byte increments over a[i+0] .. a[i+15], with i advancing by 5 while i < RANGE-20.
         // Both @IR rules (AlignVector=false and AlignVector=true) require zero byte vector
         // loads, zero byte vector adds and zero vector stores.
         // Returns the modified input array wrapped in an Object[].
1381     static Object[] test14cB(byte[] a) {
1382         // non-power-of-2 stride: i advances by 5 while 16 elements are touched per iteration,
             // so consecutive iterations overlap on 11 elements (a[i+5] .. a[i+15]).
1383         for (int i = 0; i < RANGE-20; i+=5) {
1384             // Since the stride is shorter than the vector length, there will always be a
1385             // partial overlap of loads with previous stores. This leads to failure in
1386             // store-to-load-forwarding -> vectorization not profitable.
1387             a[i+0]++;
1388             a[i+1]++;
1389             a[i+2]++;
1390             a[i+3]++;
1391             a[i+4]++;
1392             a[i+5]++;
1393             a[i+6]++;
1394             a[i+7]++;
1395             a[i+8]++;
1396             a[i+9]++;
1397             a[i+10]++;
1398             a[i+11]++;
1399             a[i+12]++;
1400             a[i+13]++;
1401             a[i+14]++;
1402             a[i+15]++;
1403         }
1404         return new Object[]{ a };
1405     }
1406 
1407     @Test
1408     @IR(counts = {IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE + "min(max_byte, 8)", "> 0",
1409                   IRNode.ADD_VB,        IRNode.VECTOR_SIZE + "min(max_byte, 8)", "> 0",
1410                   IRNode.STORE_VECTOR,                                           "> 0"},
1411         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"},
1412         applyIfPlatform = {"64-bit", "true"},
1413         applyIf = {"AlignVector", "false"})
1414     @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
1415                   IRNode.ADD_VB, "= 0",
1416                   IRNode.STORE_VECTOR, "= 0"},
1417         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"},
1418         applyIfPlatform = {"64-bit", "true"},
1419         applyIf = {"AlignVector", "true"})
         // Byte increments over a[i+0] .. a[i+7], with i advancing by 9 while i < RANGE-20.
         // First @IR rule (AlignVector=false): expects byte vector loads and adds of size
         // min(max_byte, 8), plus vector stores.
         // Second @IR rule (AlignVector=true): expects no byte vector loads/adds and no
         // vector stores.
         // NOTE(review): presumably because the 9-byte stride shifts the address alignment on
         // every iteration -- confirm against the AlignVector implementation.
         // Returns the modified input array wrapped in an Object[].
1420     static Object[] test14dB(byte[] a) {
1421         // non-power-of-2 stride: i advances by 9 while only 8 elements are touched, so the
             // groups of consecutive iterations do not overlap (a[i+8] is never touched).
1422         for (int i = 0; i < RANGE-20; i+=9) {
1423             a[i+0]++;
1424             a[i+1]++;
1425             a[i+2]++;
1426             a[i+3]++;
1427             a[i+4]++;
1428             a[i+5]++;
1429             a[i+6]++;
1430             a[i+7]++;
1431         }
1432         return new Object[]{ a };
1433     }
1434 
1435     @Test
1436     @IR(counts = {IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE + "min(max_byte, 8)", "> 0",
1437                   IRNode.ADD_VB,        IRNode.VECTOR_SIZE + "min(max_byte, 8)", "> 0",
1438                   IRNode.STORE_VECTOR,                                           "> 0"},
1439         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"},
1440         applyIfPlatform = {"64-bit", "true"},
1441         applyIf = {"AlignVector", "false"})
1442     @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
1443                   IRNode.ADD_VB, "= 0",
1444                   IRNode.STORE_VECTOR, "= 0"},
1445         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"},
1446         applyIfPlatform = {"64-bit", "true"},
1447         applyIf = {"AlignVector", "true"})
         // Byte increments over a[i+0] .. a[i+7], with i advancing by 11 while i < RANGE-32.
         // First @IR rule (AlignVector=false): expects byte vector loads and adds of size
         // min(max_byte, 8), plus vector stores.
         // Second @IR rule (AlignVector=true): expects no byte vector loads/adds and no
         // vector stores.
         // Returns the modified input array wrapped in an Object[].
1448     static Object[] test14eB(byte[] a) {
1449         // non-power-of-2 stride: i advances by 11 while only 8 elements are touched, so the
             // groups of consecutive iterations do not overlap (a[i+8] .. a[i+10] are skipped).
1450         for (int i = 0; i < RANGE-32; i+=11) {
1451             a[i+0]++;
1452             a[i+1]++;
1453             a[i+2]++;
1454             a[i+3]++;
1455             a[i+4]++;
1456             a[i+5]++;
1457             a[i+6]++;
1458             a[i+7]++;
1459         }
1460         return new Object[]{ a };
1461     }
1462 
1463     @Test
1464     @IR(counts = {IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE + "min(max_byte, 8)", "> 0",
1465                   IRNode.ADD_VB,        IRNode.VECTOR_SIZE + "min(max_byte, 8)", "> 0",
1466                   IRNode.STORE_VECTOR,                                           "> 0"},
1467         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"},
1468         applyIfPlatform = {"64-bit", "true"},
1469         applyIf = {"AlignVector", "false"})
1470     @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0",
1471                   IRNode.ADD_VB, "= 0",
1472                   IRNode.STORE_VECTOR, "= 0"},
1473         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"},
1474         applyIfPlatform = {"64-bit", "true"},
1475         applyIf = {"AlignVector", "true"})
         // Byte increments over a[i+0] .. a[i+7], with i advancing by 12 while i < RANGE-40.
         // First @IR rule (AlignVector=false): expects byte vector loads and adds of size
         // min(max_byte, 8), plus vector stores.
         // Second @IR rule (AlignVector=true): expects no byte vector loads/adds and no
         // vector stores.
         // Returns the modified input array wrapped in an Object[].
1476     static Object[] test14fB(byte[] a) {
1477         // non-power-of-2 stride: i advances by 12 while only 8 elements are touched, so the
             // groups of consecutive iterations do not overlap (a[i+8] .. a[i+11] are skipped).
1478         for (int i = 0; i < RANGE-40; i+=12) {
1479             a[i+0]++;
1480             a[i+1]++;
1481             a[i+2]++;
1482             a[i+3]++;
1483             a[i+4]++;
1484             a[i+5]++;
1485             a[i+6]++;
1486             a[i+7]++;
1487         }
1488         return new Object[]{ a };
1489     }
1490 
1491     @Test
1492     // IR rules difficult because of modulo wrapping with offset after peeling.
         // Byte increments over a[53*i+0] .. a[53*i+15] for i = 0, 1, ... while i < RANGE/64-20.
         // No @IR rule is attached to this test.
         // Returns the modified input array wrapped in an Object[].
1493     static Object[] test15aB(byte[] a) {
1494         // non-power-of-2 scale: the index is 53*i, so the 16-element groups of consecutive
             // iterations start 53 elements apart and do not overlap.
1495         for (int i = 0; i < RANGE/64-20; i++) {
1496             a[53*i+0]++;
1497             a[53*i+1]++;
1498             a[53*i+2]++;
1499             a[53*i+3]++;
1500             a[53*i+4]++;
1501             a[53*i+5]++;
1502             a[53*i+6]++;
1503             a[53*i+7]++;
1504             a[53*i+8]++;
1505             a[53*i+9]++;
1506             a[53*i+10]++;
1507             a[53*i+11]++;
1508             a[53*i+12]++;
1509             a[53*i+13]++;
1510             a[53*i+14]++;
1511             a[53*i+15]++;
1512         }
1513         return new Object[]{ a };
1514     }
1515 
1516     @Test
1517     // IR rules difficult because of modulo wrapping with offset after peeling.
         // Byte increments over a[25*i+0] .. a[25*i+15] for i = 0, 1, ... while i < RANGE/64-20.
         // No @IR rule is attached to this test.
         // Returns the modified input array wrapped in an Object[].
1518     static Object[] test15bB(byte[] a) {
1519         // non-power-of-2 scale: the index is 25*i, so the 16-element groups of consecutive
             // iterations start 25 elements apart and do not overlap.
1520         for (int i = 0; i < RANGE/64-20; i++) {
1521             a[25*i+0]++;
1522             a[25*i+1]++;
1523             a[25*i+2]++;
1524             a[25*i+3]++;
1525             a[25*i+4]++;
1526             a[25*i+5]++;
1527             a[25*i+6]++;
1528             a[25*i+7]++;
1529             a[25*i+8]++;
1530             a[25*i+9]++;
1531             a[25*i+10]++;
1532             a[25*i+11]++;
1533             a[25*i+12]++;
1534             a[25*i+13]++;
1535             a[25*i+14]++;
1536             a[25*i+15]++;
1537         }
1538         return new Object[]{ a };
1539     }
1540 
1541     @Test
1542     // IR rules difficult because of modulo wrapping with offset after peeling.
         // Byte increments over a[19*i+0] .. a[19*i+15] for i = 0, 1, ... while i < RANGE/64-20.
         // No @IR rule is attached to this test.
         // Returns the modified input array wrapped in an Object[].
1543     static Object[] test15cB(byte[] a) {
1544         // non-power-of-2 scale: the index is 19*i, so the 16-element groups of consecutive
             // iterations start 19 elements apart and do not overlap.
1545         for (int i = 0; i < RANGE/64-20; i++) {
1546             a[19*i+0]++;
1547             a[19*i+1]++;
1548             a[19*i+2]++;
1549             a[19*i+3]++;
1550             a[19*i+4]++;
1551             a[19*i+5]++;
1552             a[19*i+6]++;
1553             a[19*i+7]++;
1554             a[19*i+8]++;
1555             a[19*i+9]++;
1556             a[19*i+10]++;
1557             a[19*i+11]++;
1558             a[19*i+12]++;
1559             a[19*i+13]++;
1560             a[19*i+14]++;
1561             a[19*i+15]++;
1562         }
1563         return new Object[]{ a };
1564     }
1565 
1566     @Test
         // Byte increments over a[2*i+0] .. a[2*i+14] (15 elements) followed by short increments
         // over b[2*i+0] .. b[2*i+3] (4 elements), for i = 0, 1, ... while i < RANGE/2-20.
         // The index advances by 2 per iteration, so consecutive iterations overlap on both arrays.
         // No @IR rule is attached to this test.
         // Returns both modified input arrays wrapped in an Object[].
1567     static Object[] test16a(byte[] a, short[] b) {
1568         // infinite loop issues
             // NOTE(review): presumably a regression test for a non-termination problem seen with
             // this access pattern; the concrete issue is not identifiable from here -- confirm.
1569         for (int i = 0; i < RANGE/2-20; i++) {
1570             a[2*i+0]++;
1571             a[2*i+1]++;
1572             a[2*i+2]++;
1573             a[2*i+3]++;
1574             a[2*i+4]++;
1575             a[2*i+5]++;
1576             a[2*i+6]++;
1577             a[2*i+7]++;
1578             a[2*i+8]++;
1579             a[2*i+9]++;
1580             a[2*i+10]++;
1581             a[2*i+11]++;
1582             a[2*i+12]++;
1583             a[2*i+13]++;
1584             a[2*i+14]++;
1585 
1586             b[2*i+0]++;
1587             b[2*i+1]++;
1588             b[2*i+2]++;
1589             b[2*i+3]++;
1590         }
1591         return new Object[]{ a, b };
1592     }
1593 
1594     @Test
         // Byte increments over a[2*i+0] .. a[2*i+14] (15 elements), for i = 0, 1, ... while
         // i < RANGE/2-20. The index advances by 2 per iteration, so consecutive iterations
         // overlap on 13 elements.
         // No @IR rule is attached to this test.
         // Returns the modified input array wrapped in an Object[].
1595     static Object[] test16b(byte[] a) {
1596         // infinite loop issues
             // NOTE(review): presumably a regression test for a non-termination problem seen with
             // this access pattern; the concrete issue is not identifiable from here -- confirm.
1597         for (int i = 0; i < RANGE/2-20; i++) {
1598             a[2*i+0]++;
1599             a[2*i+1]++;
1600             a[2*i+2]++;
1601             a[2*i+3]++;
1602             a[2*i+4]++;
1603             a[2*i+5]++;
1604             a[2*i+6]++;
1605             a[2*i+7]++;
1606             a[2*i+8]++;
1607             a[2*i+9]++;
1608             a[2*i+10]++;
1609             a[2*i+11]++;
1610             a[2*i+12]++;
1611             a[2*i+13]++;
1612             a[2*i+14]++;
1613         }
1614         return new Object[]{ a };
1615     }
1616 
1617     @Test
1618     @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0",
1619                   IRNode.ADD_VL, "> 0",
1620                   IRNode.STORE_VECTOR, "> 0"},
1621         applyIfPlatform = {"64-bit", "true"},
1622         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"})
         // For i = 0, 1, ... while i < RANGE: reads the long at byte offset
         // UNSAFE.ARRAY_LONG_BASE_OFFSET + 8*i via getLongUnaligned and writes it back
         // incremented by 1 via putLongUnaligned.
         // The @IR rule expects long vector loads, long vector adds and vector stores.
         // Returns the modified input array wrapped in an Object[].
1623     static Object[] test17a(long[] a) {
1624         // Unsafe: vectorizes with profiling (not xcomp)
1625         for (int i = 0; i < RANGE; i++) {
                 // 8L keeps the offset arithmetic in long.
1626             long adr = UNSAFE.ARRAY_LONG_BASE_OFFSET + 8L * i;
1627             long v = UNSAFE.getLongUnaligned(a, adr);
1628             UNSAFE.putLongUnaligned(a, adr, v + 1);
1629         }
1630         return new Object[]{ a };
1631     }
1632 
1633     @Test
1634     // Difficult to write good IR rule. Modulo calculus overflow can create non-power-of-2 packs.
         // For i = 0, 1, ... while i < RANGE-1: reads the 8 bytes at byte offset
         // UNSAFE.ARRAY_LONG_BASE_OFFSET + 8*i + 1 via getLongUnaligned and writes them back
         // incremented by 1 via putLongUnaligned.
         // No @IR rule is attached to this test.
         // Returns the modified input array wrapped in an Object[].
1635     static Object[] test17b(long[] a) {
1636         // Not alignable: every access is shifted by 1 byte relative to the 8*i offsets.
1637         for (int i = 0; i < RANGE-1; i++) {
1638             long adr = UNSAFE.ARRAY_LONG_BASE_OFFSET + 8L * i + 1;
1639             long v = UNSAFE.getLongUnaligned(a, adr);
1640             UNSAFE.putLongUnaligned(a, adr, v + 1);
1641         }
1642         return new Object[]{ a };
1643     }
1644 
1645     @Test
1646     @IR(counts = {IRNode.LOAD_VECTOR_L, IRNode.VECTOR_SIZE_2, "> 0",
1647                   IRNode.ADD_VL,        IRNode.VECTOR_SIZE_2, "> 0",
1648                   IRNode.STORE_VECTOR, "> 0"},
1649         applyIf = {"MaxVectorSize", ">=32"},
1650         applyIfPlatform = {"64-bit", "true"},
1651         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"})
         // With i advancing by 4 while i < RANGE-1: increments the two longs at byte offsets
         // adr and adr + 8, where adr = UNSAFE.ARRAY_LONG_BASE_OFFSET + 8*i. Both loads happen
         // before both stores.
         // The @IR rule (MaxVectorSize >= 32) expects long vector loads and adds of size 2,
         // plus vector stores.
         // Returns the modified input array wrapped in an Object[].
1652     static Object[] test17c(long[] a) {
1653         // Unsafe: aligned vectorizes
1654         for (int i = 0; i < RANGE-1; i+=4) {
1655             long adr = UNSAFE.ARRAY_LONG_BASE_OFFSET + 8L * i;
1656             long v0 = UNSAFE.getLongUnaligned(a, adr + 0);
1657             long v1 = UNSAFE.getLongUnaligned(a, adr + 8);
1658             UNSAFE.putLongUnaligned(a, adr + 0, v0 + 1);
1659             UNSAFE.putLongUnaligned(a, adr + 8, v1 + 1);
1660         }
1661         return new Object[]{ a };
1662     }
1663 
1664     @Test
1665     @IR(counts = {IRNode.LOAD_VECTOR_L, IRNode.VECTOR_SIZE_2, "> 0",
1666                   IRNode.ADD_VL,        IRNode.VECTOR_SIZE_2, "> 0",
1667                   IRNode.STORE_VECTOR, "> 0"},
1668         applyIfCPUFeatureOr = {"avx512", "true", "asimd", "true", "rvv", "true"},
1669         applyIfPlatform = {"64-bit", "true"},
1670         applyIfAnd = {"AlignVector", "false", "MaxVectorSize", ">=64"})
1671     // Ensure vector width is large enough to fit 64 byte for longs:
1672     // The offsets are: 25, 33, 57, 65
1673     // In modulo 32:    25,  1, 25,  1  -> does not vectorize
1674     // In modulo 64:    25, 33, 57,  1  -> at least first pair vectorizes
1675     // This problem is because we compute modulo vector width in memory_alignment.
1676     @IR(counts = {IRNode.LOAD_VECTOR_L, "= 0",
1677                   IRNode.ADD_VL, "= 0",
1678                   IRNode.STORE_VECTOR, "= 0"},
1679         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"},
1680         applyIfPlatform = {"64-bit", "true"},
1681         applyIf = {"AlignVector", "true"})
         // With i advancing by 4 while i < RANGE-1: increments the two 8-byte values at byte
         // offsets adr and adr + 8, where adr = UNSAFE.ARRAY_LONG_BASE_OFFSET + 8*i + 1.
         // Both loads happen before both stores.
         // First @IR rule (AlignVector=false and MaxVectorSize >= 64): expects long vector
         // loads and adds of size 2, plus vector stores.
         // Second @IR rule (AlignVector=true): expects no long vector loads/adds and no
         // vector stores.
         // Returns the modified input array wrapped in an Object[].
1682     static Object[] test17d(long[] a) {
1683         // Not alignable: every access is shifted by 1 byte relative to the 8*i offsets.
1684         for (int i = 0; i < RANGE-1; i+=4) {
1685             long adr = UNSAFE.ARRAY_LONG_BASE_OFFSET + 8L * i + 1;
1686             long v0 = UNSAFE.getLongUnaligned(a, adr + 0);
1687             long v1 = UNSAFE.getLongUnaligned(a, adr + 8);
1688             UNSAFE.putLongUnaligned(a, adr + 0, v0 + 1);
1689             UNSAFE.putLongUnaligned(a, adr + 8, v1 + 1);
1690         }
1691         return new Object[]{ a };
1692     }
1693 
1694     @Test
         // For i = 0, 1, ... while i < RANGE: stores 1 to a[0], 2 to b[i], then 1 to a[1].
         // No @IR rule is attached to this test.
         // Returns both modified input arrays wrapped in an Object[].
1695     static Object[] test18a(byte[] a, int[] b) {
1696         // scale = 0  -->  no iv
             // The byte stores use constant indices (0 and 1); only the int store depends on i.
1697         for (int i = 0; i < RANGE; i++) {
1698             a[0] = 1;
1699             b[i] = 2;
1700             a[1] = 1;
1701         }
1702         return new Object[]{ a, b };
1703     }
1704 
1705     @Test
         // For i = 0, 1, ... while i < RANGE: stores 1 to a[1], 2 to b[i], then 1 to a[2].
         // No @IR rule is attached to this test.
         // Returns both modified input arrays wrapped in an Object[].
1706     static Object[] test18b(byte[] a, int[] b) {
1707         // scale = 0  -->  no iv
             // The byte stores use constant indices (1 and 2); only the int store depends on i.
1708         for (int i = 0; i < RANGE; i++) {
1709             a[1] = 1;
1710             b[i] = 2;
1711             a[2] = 1;
1712         }
1713         return new Object[]{ a, b };
1714     }
1715 
1716     @Test
         // Copies b[RANGE_FINAL - i] to a[RANGE_FINAL - i] for i = 5000 down to 1.
         // No @IR rule is attached to this test.
         // Returns both input arrays (a modified, b only read) wrapped in an Object[].
1717     static Object[] test19(int[] a, int[] b) {
             // i counts down, so the array index RANGE_FINAL - i counts up, from
             // RANGE_FINAL-5000 to RANGE_FINAL-1.
1718         for (int i = 5000; i > 0; i--) {
1719             a[RANGE_FINAL - i] = b[RANGE_FINAL - i];
1720         }
1721         return new Object[]{ a, b };
1722     }
1723 
1724     @Test
         // Byte increments over a[2*i+30] .. a[2*i+33] for i = 1, 2, ... while i < RANGE/2-50.
         // No @IR rule is attached to this test.
         // Returns the modified input array wrapped in an Object[].
1725     static Object[] test20(byte[] a) {
1726         // Example where it is easy to pass alignment check,
1727         // but used to fail the alignment calculation
             // The index advances by 2 per iteration while 4 elements are touched, so consecutive
             // iterations overlap on 2 elements.
1728         for (int i = 1; i < RANGE/2-50; i++) {
1729             a[2*i+0+30]++;
1730             a[2*i+1+30]++;
1731             a[2*i+2+30]++;
1732             a[2*i+3+30]++;
1733         }
1734         return new Object[]{ a };
1735     }
1736 }