1 /*
  2  * Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved.
  3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  4  *
  5  * This code is free software; you can redistribute it and/or modify it
  6  * under the terms of the GNU General Public License version 2 only, as
  7  * published by the Free Software Foundation.
  8  *
  9  * This code is distributed in the hope that it will be useful, but WITHOUT
 10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 12  * version 2 for more details (a copy is included in the LICENSE file that
 13  * accompanied this code).
 14  *
 15  * You should have received a copy of the GNU General Public License version
 16  * 2 along with this work; if not, write to the Free Software Foundation,
 17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 18  *
 19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 20  * or visit www.oracle.com if you need additional information or have any
 21  * questions.
 22  */
 23 
 24 package compiler.loopopts.superword;
 25 
 26 import compiler.lib.ir_framework.*;
 27 import jdk.test.lib.Utils;
 28 import jdk.test.whitebox.WhiteBox;
 29 import java.lang.reflect.Array;
 30 import java.util.Map;
 31 import java.util.HashMap;
 32 import java.util.Random;
 33 import java.nio.ByteOrder;
 34 
 35 /*
 36  * @test
 37  * @bug 8326139
 38  * @summary Test splitting packs in SuperWord
 39  * @library /test/lib /
 40  * @run driver compiler.loopopts.superword.TestSplitPacks nCOH_nAV
 41  * @run driver compiler.loopopts.superword.TestSplitPacks nCOH_yAV
 42  * @run driver compiler.loopopts.superword.TestSplitPacks yCOH_nAV
 43  * @run driver compiler.loopopts.superword.TestSplitPacks yCOH_yAV
 44  */
 45 
 46 public class TestSplitPacks {
 47     static int RANGE = 1024*8;
 48     static int RANGE_FINAL = 1024*8;
 49     private static final Random RANDOM = Utils.getRandomInstance();
 50 
 51     // Inputs
 52     byte[] aB;
 53     byte[] bB;
 54     byte mB = (byte)31;
 55     short[] aS;
 56     short[] bS;
 57     short mS = (short)0xF0F0;
 58     int[] aI;
 59     int[] bI;
 60     int mI = 0xF0F0F0F0;
 61     long[] aL;
 62     long[] bL;
 63     long mL = 0xF0F0F0F0F0F0F0F0L;
 64 
 65     // List of tests
 66     Map<String,TestFunction> tests = new HashMap<String,TestFunction>();
 67 
 68     // List of gold, the results from the first run before compilation
 69     Map<String,Object[]> golds = new HashMap<String,Object[]>();
 70 
 71     interface TestFunction {
 72         Object[] run();
 73     }
 74 
 75     public static void main(String[] args) {
 76         TestFramework framework = new TestFramework(TestSplitPacks.class);
 77         framework.addFlags("-XX:+IgnoreUnrecognizedVMOptions", "-XX:LoopUnrollLimit=1000");
 78         switch (args[0]) {
 79             case "nCOH_nAV" -> { framework.addFlags("-XX:+UnlockExperimentalVMOptions", "-XX:-UseCompactObjectHeaders", "-XX:-AlignVector"); }
 80             case "nCOH_yAV" -> { framework.addFlags("-XX:+UnlockExperimentalVMOptions", "-XX:-UseCompactObjectHeaders", "-XX:+AlignVector"); }
 81             case "yCOH_nAV" -> { framework.addFlags("-XX:+UnlockExperimentalVMOptions", "-XX:+UseCompactObjectHeaders", "-XX:-AlignVector"); }
 82             case "yCOH_yAV" -> { framework.addFlags("-XX:+UnlockExperimentalVMOptions", "-XX:+UseCompactObjectHeaders", "-XX:+AlignVector"); }
 83             default -> { throw new RuntimeException("Test argument not recognized: " + args[0]); }
 84         };
 85         framework.start();
 86     }
 87 
 88     public TestSplitPacks() {
 89         // Generate input once
 90         aB = generateB();
 91         bB = generateB();
 92         aS = generateS();
 93         bS = generateS();
 94         aI = generateI();
 95         bI = generateI();
 96         aL = generateL();
 97         bL = generateL();
 98 
 99         // Add all tests to list
100         tests.put("test0",       () -> { return test0(aI.clone(), bI.clone(), mI); });
101         tests.put("test1a",      () -> { return test1a(aI.clone(), bI.clone(), mI); });
102         tests.put("test1b",      () -> { return test1b(aI.clone(), bI.clone(), mI); });
103         tests.put("test1c",      () -> { return test1c(aI.clone(), bI.clone(), mI); });
104         tests.put("test1d",      () -> { return test1d(aI.clone(), bI.clone(), mI); });
105         tests.put("test2a",      () -> { return test2a(aI.clone(), bI.clone(), mI); });
106         tests.put("test2b",      () -> { return test2b(aI.clone(), bI.clone(), mI); });
107         tests.put("test2c",      () -> { return test2c(aI.clone(), bI.clone(), mI); });
108         tests.put("test2d",      () -> { return test2d(aI.clone(), bI.clone(), mI); });
109         tests.put("test3a",      () -> { return test3a(aS.clone(), bS.clone(), mS); });
110         tests.put("test4a",      () -> { return test4a(aS.clone(), bS.clone()); });
111         tests.put("test4b",      () -> { return test4b(aS.clone(), bS.clone()); });
112         tests.put("test4c",      () -> { return test4c(aS.clone(), bS.clone()); });
113         tests.put("test4d",      () -> { return test4d(aS.clone(), bS.clone()); });
114         tests.put("test4e",      () -> { return test4e(aS.clone(), bS.clone()); });
115         tests.put("test4f",      () -> { return test4f(aS.clone(), bS.clone()); });
116         tests.put("test4g",      () -> { return test4g(aS.clone(), bS.clone()); });
117         tests.put("test5a",      () -> { return test5a(aS.clone(), bS.clone(), mS); });
118         tests.put("test6a",      () -> { return test6a(aI.clone(), bI.clone()); });
119         tests.put("test7a",      () -> { return test7a(aI.clone(), bI.clone()); });
120 
121         // Compute gold value for all test methods before compilation
122         for (Map.Entry<String,TestFunction> entry : tests.entrySet()) {
123             String name = entry.getKey();
124             TestFunction test = entry.getValue();
125             Object[] gold = test.run();
126             golds.put(name, gold);
127         }
128     }
129 
130     @Warmup(100)
131     @Run(test = {"test0",
132                  "test1a",
133                  "test1b",
134                  "test1c",
135                  "test1d",
136                  "test2a",
137                  "test2b",
138                  "test2c",
139                  "test2d",
140                  "test3a",
141                  "test4a",
142                  "test4b",
143                  "test4c",
144                  "test4d",
145                  "test4e",
146                  "test4f",
147                  "test4g",
148                  "test5a",
149                  "test6a",
150                  "test7a"})
151     public void runTests() {
152         for (Map.Entry<String,TestFunction> entry : tests.entrySet()) {
153             String name = entry.getKey();
154             TestFunction test = entry.getValue();
155             // Recall gold value from before compilation
156             Object[] gold = golds.get(name);
157             // Compute new result
158             Object[] result = test.run();
159             // Compare gold and new result
160             verify(name, gold, result);
161         }
162     }
163 
164     static byte[] generateB() {
165         byte[] a = new byte[RANGE];
166         for (int i = 0; i < a.length; i++) {
167             a[i] = (byte)RANDOM.nextInt();
168         }
169         return a;
170     }
171 
172     static short[] generateS() {
173         short[] a = new short[RANGE];
174         for (int i = 0; i < a.length; i++) {
175             a[i] = (short)RANDOM.nextInt();
176         }
177         return a;
178     }
179 
180     static int[] generateI() {
181         int[] a = new int[RANGE];
182         for (int i = 0; i < a.length; i++) {
183             a[i] = RANDOM.nextInt();
184         }
185         return a;
186     }
187 
188     static long[] generateL() {
189         long[] a = new long[RANGE];
190         for (int i = 0; i < a.length; i++) {
191             a[i] = RANDOM.nextLong();
192         }
193         return a;
194     }
195 
196     static void verify(String name, Object[] gold, Object[] result) {
197         if (gold.length != result.length) {
198             throw new RuntimeException("verify " + name + ": not the same number of outputs: gold.length = " +
199                                        gold.length + ", result.length = " + result.length);
200         }
201         for (int i = 0; i < gold.length; i++) {
202             Object g = gold[i];
203             Object r = result[i];
204             if (g.getClass() != r.getClass() || !g.getClass().isArray() || !r.getClass().isArray()) {
205                 throw new RuntimeException("verify " + name + ": must both be array of same type:" +
206                                            " gold[" + i + "].getClass() = " + g.getClass().getSimpleName() +
207                                            " result[" + i + "].getClass() = " + r.getClass().getSimpleName());
208             }
209             if (g == r) {
210                 throw new RuntimeException("verify " + name + ": should be two separate arrays (with identical content):" +
211                                            " gold[" + i + "] == result[" + i + "]");
212             }
213             if (Array.getLength(g) != Array.getLength(r)) {
214                     throw new RuntimeException("verify " + name + ": arrays must have same length:" +
215                                            " gold[" + i + "].length = " + Array.getLength(g) +
216                                            " result[" + i + "].length = " + Array.getLength(r));
217             }
218             Class c = g.getClass().getComponentType();
219             if (c == byte.class) {
220                 verifyB(name, i, (byte[])g, (byte[])r);
221             } else if (c == short.class) {
222                 verifyS(name, i, (short[])g, (short[])r);
223             } else if (c == int.class) {
224                 verifyI(name, i, (int[])g, (int[])r);
225             } else if (c == long.class) {
226                 verifyL(name, i, (long[])g, (long[])r);
227             } else {
228                 throw new RuntimeException("verify " + name + ": array type not supported for verify:" +
229                                        " gold[" + i + "].getClass() = " + g.getClass().getSimpleName() +
230                                        " result[" + i + "].getClass() = " + r.getClass().getSimpleName());
231             }
232         }
233     }
234 
235     static void verifyB(String name, int i, byte[] g, byte[] r) {
236         for (int j = 0; j < g.length; j++) {
237             if (g[j] != r[j]) {
238                 throw new RuntimeException("verify " + name + ": arrays must have same content:" +
239                                            " gold[" + i + "][" + j + "] = " + g[j] +
240                                            " result[" + i + "][" + j + "] = " + r[j]);
241             }
242         }
243     }
244 
245     static void verifyS(String name, int i, short[] g, short[] r) {
246         for (int j = 0; j < g.length; j++) {
247             if (g[j] != r[j]) {
248                 throw new RuntimeException("verify " + name + ": arrays must have same content:" +
249                                            " gold[" + i + "][" + j + "] = " + g[j] +
250                                            " result[" + i + "][" + j + "] = " + r[j]);
251             }
252         }
253     }
254 
255     static void verifyI(String name, int i, int[] g, int[] r) {
256         for (int j = 0; j < g.length; j++) {
257             if (g[j] != r[j]) {
258                 throw new RuntimeException("verify " + name + ": arrays must have same content:" +
259                                            " gold[" + i + "][" + j + "] = " + g[j] +
260                                            " result[" + i + "][" + j + "] = " + r[j]);
261             }
262         }
263     }
264 
265     static void verifyL(String name, int i, long[] g, long[] r) {
266         for (int j = 0; j < g.length; j++) {
267             if (g[j] != r[j]) {
268                 throw new RuntimeException("verify " + name + ": arrays must have same content:" +
269                                            " gold[" + i + "][" + j + "] = " + g[j] +
270                                            " result[" + i + "][" + j + "] = " + r[j]);
271             }
272         }
273     }
274 
    @Test
    @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0",
                  IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.AND_VI,        IRNode.VECTOR_SIZE_2, "> 0",
                  IRNode.AND_VI,        IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.STORE_VECTOR, "> 0"},
        applyIfAnd = {"MaxVectorSize", ">=32", "AlignVector", "false"},
        applyIfPlatform = {"64-bit", "true"},
        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
    @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0",
                  IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.AND_VI,        IRNode.VECTOR_SIZE_2, "> 0",
                  IRNode.AND_VI,        IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.STORE_VECTOR, "> 0"},
        applyIfAnd = {"MaxVectorSize", ">=32", "UseCompactObjectHeaders", "false"},
        applyIfPlatform = {"64-bit", "true"},
        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
    // Load and store are already split
    //
    //  0 1 - - 4 5 6 7
    //  | |     | | | |
    //  0 1 - - 4 5 6 7
    //
    // Per chunk of 8 ints, elements 0-1 and 4-7 of a are ANDed with mask and written to the
    // same positions of b; positions 2-3 are neither read nor written.
    // Both IR rules expect 2-element and 4-element int vector loads and ANDs, plus at least
    // one vector store. They differ only in their second condition: the first applies when
    // AlignVector is false, the second when UseCompactObjectHeaders is false.
    // Returns { a, b }.
    static Object[] test0(int[] a, int[] b, int mask) {
        for (int i = 0; i < RANGE; i+=8) {
            // First group: 2 adjacent elements.
            int b0 = a[i+0] & mask;
            int b1 = a[i+1] & mask;

            // Second group: 4 adjacent elements, separated from the first by a gap of 2.
            int b4 = a[i+4] & mask;
            int b5 = a[i+5] & mask;
            int b6 = a[i+6] & mask;
            int b7 = a[i+7] & mask;

            b[i+0] = b0;
            b[i+1] = b1;

            b[i+4] = b4;
            b[i+5] = b5;
            b[i+6] = b6;
            b[i+7] = b7;
            // With AlignVector, we need 8-byte alignment of vector loads/stores.
            // UseCompactObjectHeaders=false                 UseCompactObjectHeaders=true
            // adr = base + 16 + 32*i  ->  always            adr = base + 12 + 32*i  ->  never
            // -> vectorize                                  -> no vectorization
        }
        return new Object[]{ a, b };
    }
321 
    @Test
    @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0",
                  IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.ADD_VI,        IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.MUL_VI,        IRNode.VECTOR_SIZE_2, "> 0",
                  IRNode.STORE_VECTOR, "> 0"},
        applyIfAnd = {"MaxVectorSize", ">=32", "AlignVector", "false"},
        applyIfPlatform = {"64-bit", "true"},
        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
    @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0",
                  IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.ADD_VI,        IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.MUL_VI,        IRNode.VECTOR_SIZE_2, "> 0",
                  IRNode.STORE_VECTOR, "> 0"},
        applyIfAnd = {"MaxVectorSize", ">=32", "UseCompactObjectHeaders", "false"},
        applyIfPlatform = {"64-bit", "true"},
        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
    // Adjacent Load and Store, but split by Add/Mul
    //
    // Per chunk of 8 ints, the 6 adjacent elements 0-5 are loaded from a and stored to b,
    // but elements 0-3 go through an add and elements 4-5 through a multiply.
    // The IR rules therefore expect a 4-element vector add and a 2-element vector multiply,
    // with int vector loads of both sizes and at least one vector store.
    // The two rules differ only in their second condition: the first applies when
    // AlignVector is false, the second when UseCompactObjectHeaders is false.
    // Returns { a, b }.
    static Object[] test1a(int[] a, int[] b, int mask) {
        for (int i = 0; i < RANGE; i+=8) {
            b[i+0] = a[i+0] + mask; // Add
            b[i+1] = a[i+1] + mask;
            b[i+2] = a[i+2] + mask;
            b[i+3] = a[i+3] + mask;

            b[i+4] = a[i+4] * mask; // Mul
            b[i+5] = a[i+5] * mask;
            // With AlignVector, we need 8-byte alignment of vector loads/stores.
            // UseCompactObjectHeaders=false                 UseCompactObjectHeaders=true
            // adr = base + 16 + 32*i  ->  always            adr = base + 12 + 32*i  ->  never
            // -> vectorize                                  -> no vectorization
        }
        return new Object[]{ a, b };
    }
356 
    @Test
    @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0",
                  IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.ADD_VI,        IRNode.VECTOR_SIZE_2, "> 0",
                  IRNode.MUL_VI,        IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.STORE_VECTOR, "> 0"},
        applyIfAnd = {"MaxVectorSize", ">=32", "AlignVector", "false"},
        applyIfPlatform = {"64-bit", "true"},
        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
    @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0",
                  IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.ADD_VI,        IRNode.VECTOR_SIZE_2, "> 0",
                  IRNode.MUL_VI,        IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.STORE_VECTOR, "> 0"},
        applyIfAnd = {"MaxVectorSize", ">=32", "UseCompactObjectHeaders", "false"},
        applyIfPlatform = {"64-bit", "true"},
        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
    // Adjacent Load and Store, but split by Add/Mul
    //
    // Mirror of test1a with the operations swapped: per chunk of 8 ints, elements 0-3 go
    // through a multiply and elements 4-5 through an add.
    // The IR rules therefore expect a 4-element vector multiply and a 2-element vector add,
    // with int vector loads of both sizes and at least one vector store.
    // The two rules differ only in their second condition: the first applies when
    // AlignVector is false, the second when UseCompactObjectHeaders is false.
    // Returns { a, b }.
    static Object[] test1b(int[] a, int[] b, int mask) {
        for (int i = 0; i < RANGE; i+=8) {
            b[i+0] = a[i+0] * mask; // Mul
            b[i+1] = a[i+1] * mask;
            b[i+2] = a[i+2] * mask;
            b[i+3] = a[i+3] * mask;

            b[i+4] = a[i+4] + mask; // Add
            b[i+5] = a[i+5] + mask;
            // With AlignVector, we need 8-byte alignment of vector loads/stores.
            // UseCompactObjectHeaders=false                 UseCompactObjectHeaders=true
            // adr = base + 16 + 32*i  ->  always            adr = base + 12 + 32*i  ->  never
            // -> vectorize                                  -> no vectorization
        }
        return new Object[]{ a, b };
    }
391 
    @Test
    @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0",
                  IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.ADD_VI,        IRNode.VECTOR_SIZE_2, "> 0",
                  IRNode.MUL_VI,        IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.STORE_VECTOR, "> 0"},
        applyIfAnd = {"MaxVectorSize", ">=32", "AlignVector", "false"},
        applyIfPlatform = {"64-bit", "true"},
        applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true"})
    @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0",
                  IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.ADD_VI,        IRNode.VECTOR_SIZE_2, "> 0",
                  IRNode.MUL_VI,        IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.STORE_VECTOR, "> 0"},
        applyIfAnd = {"MaxVectorSize", ">=32", "UseCompactObjectHeaders", "false"},
        applyIfPlatform = {"64-bit", "true"},
        applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true"})
    // Adjacent Load and Store, but split by Add/Mul
    //
    // Per chunk of 8 ints, the 6 adjacent elements 0-5 are loaded from a and stored to b;
    // here the short group comes first: elements 0-1 go through an add, elements 2-5
    // through a multiply.
    // The IR rules therefore expect a 2-element vector add and a 4-element vector multiply,
    // with int vector loads of both sizes and at least one vector store.
    // Unlike test1a/test1b, the x86 CPU feature listed in these rules is avx2, not sse4.1.
    // The two rules differ only in their second condition: the first applies when
    // AlignVector is false, the second when UseCompactObjectHeaders is false.
    // Returns { a, b }.
    static Object[] test1c(int[] a, int[] b, int mask) {
        for (int i = 0; i < RANGE; i+=8) {
            b[i+0] = a[i+0] + mask; // Add
            b[i+1] = a[i+1] + mask;

            b[i+2] = a[i+2] * mask; // Mul
            b[i+3] = a[i+3] * mask;
            b[i+4] = a[i+4] * mask;
            b[i+5] = a[i+5] * mask;
            // With AlignVector, we need 8-byte alignment of vector loads/stores.
            // UseCompactObjectHeaders=false                 UseCompactObjectHeaders=true
            // adr = base + 16 + 32*i  ->  always            adr = base + 12 + 32*i  ->  never
            // -> vectorize                                  -> no vectorization
        }
        return new Object[]{ a, b };
    }
426 
    @Test
    @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0",
                  IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.ADD_VI,        IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.MUL_VI,        IRNode.VECTOR_SIZE_2, "> 0",
                  IRNode.STORE_VECTOR, "> 0"},
        applyIfAnd = {"MaxVectorSize", ">=32", "AlignVector", "false"},
        applyIfPlatform = {"64-bit", "true"},
        applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true"})
    @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0",
                  IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.ADD_VI,        IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.MUL_VI,        IRNode.VECTOR_SIZE_2, "> 0",
                  IRNode.STORE_VECTOR, "> 0"},
        applyIfAnd = {"MaxVectorSize", ">=32", "UseCompactObjectHeaders", "false"},
        applyIfPlatform = {"64-bit", "true"},
        applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true"})
    // Adjacent Load and Store, but split by Add/Mul
    //
    // Mirror of test1c with the operations swapped: per chunk of 8 ints, elements 0-1 go
    // through a multiply and elements 2-5 through an add.
    // The IR rules therefore expect a 2-element vector multiply and a 4-element vector add,
    // with int vector loads of both sizes and at least one vector store.
    // Unlike test1a/test1b, the x86 CPU feature listed in these rules is avx2, not sse4.1.
    // The two rules differ only in their second condition: the first applies when
    // AlignVector is false, the second when UseCompactObjectHeaders is false.
    // Returns { a, b }.
    static Object[] test1d(int[] a, int[] b, int mask) {
        for (int i = 0; i < RANGE; i+=8) {
            b[i+0] = a[i+0] * mask; // Mul
            b[i+1] = a[i+1] * mask;

            b[i+2] = a[i+2] + mask; // Add
            b[i+3] = a[i+3] + mask;
            b[i+4] = a[i+4] + mask;
            b[i+5] = a[i+5] + mask;
            // With AlignVector, we need 8-byte alignment of vector loads/stores.
            // UseCompactObjectHeaders=false                 UseCompactObjectHeaders=true
            // adr = base + 16 + 32*i  ->  always            adr = base + 12 + 32*i  ->  never
            // -> vectorize                                  -> no vectorization
        }
        return new Object[]{ a, b };
    }
461 
    @Test
    @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0",
                  IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.AND_VI,        IRNode.VECTOR_SIZE_2, "> 0",
                  IRNode.AND_VI,        IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.STORE_VECTOR, "> 0"},
        applyIfAnd = {"MaxVectorSize", ">=32", "AlignVector", "false"},
        applyIfPlatform = {"64-bit", "true"},
        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
    @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0",
                  IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.AND_VI,        IRNode.VECTOR_SIZE_2, "> 0",
                  IRNode.AND_VI,        IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.STORE_VECTOR, "> 0"},
        applyIfAnd = {"MaxVectorSize", ">=32", "UseCompactObjectHeaders", "false"},
        applyIfPlatform = {"64-bit", "true"},
        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
    // Split the load
    //
    //  0 1 2 3 4 5 - -
    //  | |  \ \ \ \
    //  | |   \ \ \ \
    //  | |    \ \ \ \
    //  0 1 - - 4 5 6 7
    //
    // Top row: positions loaded from a. Bottom row: positions stored to b.
    // The 6 loads are adjacent, but the stores form a group of 2 (positions 0-1) and a
    // group of 4 (positions 4-7), so the loads have to be split 2 + 4 to match.
    // Both IR rules expect 2-element and 4-element int vector loads and ANDs, plus at least
    // one vector store. They differ only in their second condition: the first applies when
    // AlignVector is false, the second when UseCompactObjectHeaders is false.
    // Returns { a, b }.
    static Object[] test2a(int[] a, int[] b, int mask) {
        for (int i = 0; i < RANGE; i+=8) {
            int b0 = a[i+0] & mask;
            int b1 = a[i+1] & mask;
            int b2 = a[i+2] & mask;
            int b3 = a[i+3] & mask;
            int b4 = a[i+4] & mask;
            int b5 = a[i+5] & mask;

            b[i+0] = b0;
            b[i+1] = b1;

            // Values from a[i+2..i+5] are shifted by 2 positions on the store side.
            b[i+4] = b2;
            b[i+5] = b3;
            b[i+6] = b4;
            b[i+7] = b5;
            // With AlignVector, we need 8-byte alignment of vector loads/stores.
            // UseCompactObjectHeaders=false                 UseCompactObjectHeaders=true
            // adr = base + 16 + 32*i  ->  always            adr = base + 12 + 32*i  ->  never
            // -> vectorize                                  -> no vectorization
        }
        return new Object[]{ a, b };
    }
510 
    @Test
    @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0",
                  IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.AND_VI,        IRNode.VECTOR_SIZE_2, "> 0",
                  IRNode.AND_VI,        IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.STORE_VECTOR, "> 0"},
        applyIfAnd = {"MaxVectorSize", ">=32", "AlignVector", "false"},
        applyIfPlatform = {"64-bit", "true"},
        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
    @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0",
                  IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.AND_VI,        IRNode.VECTOR_SIZE_2, "> 0",
                  IRNode.AND_VI,        IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.STORE_VECTOR, "> 0"},
        applyIfAnd = {"MaxVectorSize", ">=32", "UseCompactObjectHeaders", "false"},
        applyIfPlatform = {"64-bit", "true"},
        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
    // Split the load
    //
    //  0 1 2 3 4 5 - -
    //  | | | |  \ \
    //  | | | |   \ \
    //  | | | |    \ \
    //  0 1 2 3 - - 6 7
    //
    // Top row: positions loaded from a. Bottom row: positions stored to b.
    // The 6 loads are adjacent, but the stores form a group of 4 (positions 0-3) and a
    // group of 2 (positions 6-7), so the loads have to be split 4 + 2 to match.
    // Both IR rules expect 2-element and 4-element int vector loads and ANDs, plus at least
    // one vector store. They differ only in their second condition: the first applies when
    // AlignVector is false, the second when UseCompactObjectHeaders is false.
    // Returns { a, b }.
    static Object[] test2b(int[] a, int[] b, int mask) {
        for (int i = 0; i < RANGE; i+=8) {
            int b0 = a[i+0] & mask;
            int b1 = a[i+1] & mask;
            int b2 = a[i+2] & mask;
            int b3 = a[i+3] & mask;
            int b4 = a[i+4] & mask;
            int b5 = a[i+5] & mask;

            b[i+0] = b0;
            b[i+1] = b1;
            b[i+2] = b2;
            b[i+3] = b3;

            // Values from a[i+4..i+5] are shifted by 2 positions on the store side.
            b[i+6] = b4;
            b[i+7] = b5;
            // With AlignVector, we need 8-byte alignment of vector loads/stores.
            // UseCompactObjectHeaders=false                 UseCompactObjectHeaders=true
            // adr = base + 16 + 32*i  ->  always            adr = base + 12 + 32*i  ->  never
            // -> vectorize                                  -> no vectorization
        }
        return new Object[]{ a, b };
    }
559 
    @Test
    @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0",
                  IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.AND_VI,        IRNode.VECTOR_SIZE_2, "> 0",
                  IRNode.AND_VI,        IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.STORE_VECTOR, "> 0"},
        applyIfAnd = {"MaxVectorSize", ">=32", "AlignVector", "false"},
        applyIfPlatform = {"64-bit", "true"},
        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
    @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0",
                  IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.AND_VI,        IRNode.VECTOR_SIZE_2, "> 0",
                  IRNode.AND_VI,        IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.STORE_VECTOR, "> 0"},
        applyIfAnd = {"MaxVectorSize", ">=32", "UseCompactObjectHeaders", "false"},
        applyIfPlatform = {"64-bit", "true"},
        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
    // Split the store
    //
    //  0 1 - - 4 5 6 7
    //  | |    / / / /
    //  | |   / / / /
    //  | |  / / / /
    //  0 1 2 3 4 5 - -
    //
    // Top row: positions loaded from a. Bottom row: positions stored to b.
    // The loads already form a group of 2 (positions 0-1) and a group of 4 (positions 4-7),
    // but the 6 stores are adjacent, so the stores have to be split 2 + 4 to match.
    // Both IR rules expect 2-element and 4-element int vector loads and ANDs, plus at least
    // one vector store. They differ only in their second condition: the first applies when
    // AlignVector is false, the second when UseCompactObjectHeaders is false.
    // Returns { a, b }.
    static Object[] test2c(int[] a, int[] b, int mask) {
        for (int i = 0; i < RANGE; i+=8) {
            int b0 = a[i+0] & mask;
            int b1 = a[i+1] & mask;

            int b4 = a[i+4] & mask;
            int b5 = a[i+5] & mask;
            int b6 = a[i+6] & mask;
            int b7 = a[i+7] & mask;

            b[i+0] = b0;
            b[i+1] = b1;
            // Values from a[i+4..i+7] move down by 2 positions on the store side.
            b[i+2] = b4;
            b[i+3] = b5;
            b[i+4] = b6;
            b[i+5] = b7;
            // With AlignVector, we need 8-byte alignment of vector loads/stores.
            // UseCompactObjectHeaders=false                 UseCompactObjectHeaders=true
            // adr = base + 16 + 32*i  ->  always            adr = base + 12 + 32*i  ->  never
            // -> vectorize                                  -> no vectorization
        }
        return new Object[]{ a, b };
    }
608 
    @Test
    @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0",
                  IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.AND_VI,        IRNode.VECTOR_SIZE_2, "> 0",
                  IRNode.AND_VI,        IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.STORE_VECTOR, "> 0"},
        applyIfAnd = {"MaxVectorSize", ">=32", "AlignVector", "false"},
        applyIfPlatform = {"64-bit", "true"},
        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
    @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0",
                  IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.AND_VI,        IRNode.VECTOR_SIZE_2, "> 0",
                  IRNode.AND_VI,        IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.STORE_VECTOR, "> 0"},
        applyIfAnd = {"MaxVectorSize", ">=32", "UseCompactObjectHeaders", "false"},
        applyIfPlatform = {"64-bit", "true"},
        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
    // Split the store
    //
    //  0 1 2 3 - - 6 7
    //  | | | |    / /
    //  | | | |   / /
    //  | | | |  / /
    //  0 1 2 3 4 5 - -
    //
    // Top row: positions loaded from a. Bottom row: positions stored to b.
    // The loads already form a group of 4 (positions 0-3) and a group of 2 (positions 6-7),
    // but the 6 stores are adjacent, so the stores have to be split 4 + 2 to match.
    // Both IR rules expect 2-element and 4-element int vector loads and ANDs, plus at least
    // one vector store. They differ only in their second condition: the first applies when
    // AlignVector is false, the second when UseCompactObjectHeaders is false.
    // Returns { a, b }.
    static Object[] test2d(int[] a, int[] b, int mask) {
        for (int i = 0; i < RANGE; i+=8) {
            int b0 = a[i+0] & mask;
            int b1 = a[i+1] & mask;
            int b2 = a[i+2] & mask;
            int b3 = a[i+3] & mask;

            int b6 = a[i+6] & mask;
            int b7 = a[i+7] & mask;

            b[i+0] = b0;
            b[i+1] = b1;
            b[i+2] = b2;
            b[i+3] = b3;
            // Values from a[i+6..i+7] move down by 2 positions on the store side.
            b[i+4] = b6;
            b[i+5] = b7;
            // With AlignVector, we need 8-byte alignment of vector loads/stores.
            // UseCompactObjectHeaders=false                 UseCompactObjectHeaders=true
            // adr = base + 16 + 32*i  ->  always            adr = base + 12 + 32*i  ->  never
            // -> vectorize                                  -> no vectorization
        }
        return new Object[]{ a, b };
    }
657 
658     @Test
659     @IR(counts = {IRNode.LOAD_VECTOR_S, IRNode.VECTOR_SIZE_4, "> 0",
660                   IRNode.STORE_VECTOR, "> 0"},
661         applyIfAnd = {"MaxVectorSize", ">=32", "AlignVector", "false"},
662         applyIfPlatform = {"64-bit", "true"},
663         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
664     @IR(counts = {IRNode.LOAD_VECTOR_S, IRNode.VECTOR_SIZE_4, "> 0",
665                   IRNode.STORE_VECTOR, "> 0"},
666         applyIfAnd = {"MaxVectorSize", ">=32", "UseCompactObjectHeaders", "false"},
667         applyIfPlatform = {"64-bit", "true"},
668         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
        // Split off scalar accesses that sit directly next to a 4-pack.
        //
        // The two IR rules together expect a 4-element short load vector and a
        // vector store whenever AlignVector is off, or whenever compact object
        // headers are off. The alignment note at the end of the loop body explains
        // why the remaining combination is not checked.
        //
        // a:   read at a[i+0] .. a[i+7]
        // b:   written at b[i+0] and b[i+3] .. b[i+8]
        // val: scalar stored to b[i+3] and b[i+8]
        // Returns { a, b, { sum } }, where sum accumulates a[i+1] + a[i+2] + a[i+3].
        //
        // Diagram: top row = load indices, bottom row = store indices,
        // "+" = value goes into sum, "v" = val is stored.
669     // 0 1 2 3 4 5 6 7 -
670     // | | | | | | | |
671     // | + + + | | | |
672     // |       | | | |
673     // |     v | | | | v
674     // |     | | | | | |
675     // 0 - - 3 4 5 6 7 8
676     static Object[] test3a(short[] a, short[] b, short val) {
677         int sum = 0;
678         for (int i = 0; i < RANGE; i+=16) { // 16 shorts = 32 bytes per iteration
679             short a0 = a[i+0]; // required for alignment / offsets, technical limitation.
680
681             short a1 = a[i+1]; // adjacent to 4-pack, but need to be split off
682             short a2 = a[i+2];
683             short a3 = a[i+3];
684
685             short a4 = a[i+4]; // 4-pack
686             short a5 = a[i+5];
687             short a6 = a[i+6];
688             short a7 = a[i+7];
689
690
691             b[i+0] = a0; // required for alignment / offsets, technical limitation.
692
693             sum += a1 + a2 + a3; // not packed
694
695             b[i+3] = val; // adjacent to 4-pack but needs to be split off
696
697             b[i+4] = a4; // 4-pack
698             b[i+5] = a5;
699             b[i+6] = a6;
700             b[i+7] = a7;
701
702             b[i+8] = val; // adjacent to 4-pack but needs to be split off
703
704             // With AlignVector, we need 8-byte alignment of vector loads/stores.
705             // UseCompactObjectHeaders=false                 UseCompactObjectHeaders=true
706             // adr = base + 16 + 8 + 32*i  ->  always        adr = base + 12 + 8 + 32*i  ->  never
707             // -> vectorize                                  -> no vectorization
708         }
709         return new Object[]{ a, b, new int[]{ sum } };
710     }
711 
712     @Test
713     @IR(counts = {IRNode.LOAD_VECTOR_S, IRNode.VECTOR_SIZE_2, "> 0",
714                   IRNode.STORE_VECTOR, "> 0"},
715         applyIfPlatform = {"64-bit", "true"},
716         applyIfCPUFeatureOr = {"sse4.1", "true"})
717     // Cyclic dependency with distance 2 -> split into 2-packs
        //
        // Each iteration copies a[i] to b[i+2], so the store runs 2 elements ahead
        // of the load. The dependency is only cyclic when a and b alias
        // (presumably exercised by the caller -- not visible here).
        // The IR rule expects 2-element short load vectors and a vector store.
        // NOTE(review): this rule lists only sse4.1, whereas the distance >= 4
        // variants also list asimd; the reason is not visible here.
        // Returns { a, b }.
718     static Object[] test4a(short[] a, short[] b) {
719         for (int i = 0; i < RANGE-64; i++) { // bound keeps i+2 below RANGE
720           b[i+2] = a[i+0];
721         }
722         return new Object[]{ a, b };
723     }
724 
725     @Test
726     @IR(counts = {IRNode.LOAD_VECTOR_S, IRNode.VECTOR_SIZE_2, "> 0",
727                   IRNode.STORE_VECTOR, "> 0"},
728         applyIf = {"AlignVector", "false"},
729         applyIfPlatform = {"64-bit", "true"},
730         applyIfCPUFeatureOr = {"sse4.1", "true"})
731     // Cyclic dependency with distance 3 -> split into 2-packs
        //
        // Each iteration copies a[i] to b[i+3]; the store is 3 shorts (6 bytes)
        // ahead of the load. The IR rule expects 2-element short load vectors and
        // a vector store, and is only checked with AlignVector=false
        // (presumably because load and store cannot both be aligned at an odd
        // distance -- TODO confirm).
        // Returns { a, b }.
732     static Object[] test4b(short[] a, short[] b) {
733         for (int i = 0; i < RANGE-64; i++) { // bound keeps i+3 below RANGE
734           b[i+3] = a[i+0];
735         }
736         return new Object[]{ a, b };
737     }
738 
739     @Test
740     @IR(counts = {IRNode.LOAD_VECTOR_S, IRNode.VECTOR_SIZE_4, "> 0",
741                   IRNode.STORE_VECTOR, "> 0"},
742         applyIf = {"MaxVectorSize", ">=8"},
743         applyIfPlatform = {"64-bit", "true"},
744         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
745     // Cyclic dependency with distance 4 -> split into 4-packs
        //
        // Each iteration copies a[i] to b[i+4]; the store is 4 shorts (8 bytes)
        // ahead of the load. The IR rule expects 4-element short load vectors and
        // a vector store once MaxVectorSize is at least 8 bytes. Unlike the
        // distance 5..7 variants, it does not depend on AlignVector.
        // Returns { a, b }.
746     static Object[] test4c(short[] a, short[] b) {
747         for (int i = 0; i < RANGE-64; i++) { // bound keeps i+4 below RANGE
748           b[i+4] = a[i+0];
749         }
750         return new Object[]{ a, b };
751     }
752 
753     @Test
754     @IR(counts = {IRNode.LOAD_VECTOR_S, IRNode.VECTOR_SIZE_4, "> 0",
755                   IRNode.STORE_VECTOR, "> 0"},
756         applyIfAnd = {"MaxVectorSize", ">=8", "AlignVector", "false"},
757         applyIfPlatform = {"64-bit", "true"},
758         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
759     // Cyclic dependency with distance 5 -> split into 4-packs
        //
        // Each iteration copies a[i] to b[i+5]. The IR rule expects 4-element
        // short load vectors and a vector store; it is only checked with
        // MaxVectorSize >= 8 and AlignVector=false.
        // Returns { a, b }.
760     static Object[] test4d(short[] a, short[] b) {
761         for (int i = 0; i < RANGE-64; i++) { // bound keeps i+5 below RANGE
762           b[i+5] = a[i+0];
763         }
764         return new Object[]{ a, b };
765     }
766 
767     @Test
768     @IR(counts = {IRNode.LOAD_VECTOR_S, IRNode.VECTOR_SIZE_4, "> 0",
769                   IRNode.STORE_VECTOR, "> 0"},
770         applyIfAnd = {"MaxVectorSize", ">=8", "AlignVector", "false"},
771         applyIfPlatform = {"64-bit", "true"},
772         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
773     // Cyclic dependency with distance 6 -> split into 4-packs
        //
        // Each iteration copies a[i] to b[i+6]. The IR rule expects 4-element
        // short load vectors and a vector store; it is only checked with
        // MaxVectorSize >= 8 and AlignVector=false.
        // Returns { a, b }.
774     static Object[] test4e(short[] a, short[] b) {
775         for (int i = 0; i < RANGE-64; i++) { // bound keeps i+6 below RANGE
776           b[i+6] = a[i+0];
777         }
778         return new Object[]{ a, b };
779     }
780 
781     @Test
782     @IR(counts = {IRNode.LOAD_VECTOR_S, IRNode.VECTOR_SIZE_4, "> 0",
783                   IRNode.STORE_VECTOR, "> 0"},
784         applyIfAnd = {"MaxVectorSize", ">=8", "AlignVector", "false"},
785         applyIfPlatform = {"64-bit", "true"},
786         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
787     // Cyclic dependency with distance 7 -> split into 4-packs
        //
        // Each iteration copies a[i] to b[i+7]. The IR rule expects 4-element
        // short load vectors and a vector store; it is only checked with
        // MaxVectorSize >= 8 and AlignVector=false.
        // Returns { a, b }.
788     static Object[] test4f(short[] a, short[] b) {
789         for (int i = 0; i < RANGE-64; i++) { // bound keeps i+7 below RANGE
790           b[i+7] = a[i+0];
791         }
792         return new Object[]{ a, b };
793     }
794 
795     @Test
796     @IR(counts = {IRNode.LOAD_VECTOR_S, IRNode.VECTOR_SIZE_8, "> 0",
797                   IRNode.STORE_VECTOR, "> 0"},
798         applyIf = {"MaxVectorSize", ">=32"},
799         applyIfPlatform = {"64-bit", "true"},
800         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
801     // Cyclic dependency with distance 8 -> split into 8-packs
        //
        // Each iteration copies a[i] to b[i+8]; the store is 8 shorts (16 bytes)
        // ahead of the load. The IR rule expects 8-element short load vectors and
        // a vector store, and does not depend on AlignVector.
        // NOTE(review): the rule requires MaxVectorSize >= 32 although 8 shorts
        // occupy 16 bytes; the rationale is not visible here -- confirm.
        // Returns { a, b }.
802     static Object[] test4g(short[] a, short[] b) {
803         for (int i = 0; i < RANGE-64; i++) { // bound keeps i+8 below RANGE
804           b[i+8] = a[i+0];
805         }
806         return new Object[]{ a, b };
807     }
808 
809     @Test
810     @IR(counts = {IRNode.LOAD_VECTOR_S, IRNode.VECTOR_SIZE_2, "> 0",
811                   IRNode.LOAD_VECTOR_S, IRNode.VECTOR_SIZE_4, "> 0",
812                   IRNode.LOAD_VECTOR_S, IRNode.VECTOR_SIZE_8, "> 0",
813                   IRNode.ADD_VS,        IRNode.VECTOR_SIZE_2, "> 0",
814                   IRNode.ADD_VS,        IRNode.VECTOR_SIZE_8, "> 0",
815                   IRNode.ADD_VS,        IRNode.VECTOR_SIZE_4, "> 0",
816                   IRNode.STORE_VECTOR, "> 0"},
817         applyIfAnd = {"MaxVectorSize", ">=32", "AlignVector", "false"},
818         applyIfPlatform = {"64-bit", "true"},
819         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
820     // Split pack into power-of-2 sizes
        //
        // Per iteration, 15 consecutive elements (i+0 .. i+14) are computed as
        // b[k] = (short)(a[k] + val); 15 = 8 + 4 + 2 + 1. The IR rule expects short
        // load vectors and short add vectors of sizes 8, 4 and 2, plus a vector
        // store. It is only checked with MaxVectorSize >= 32 and AlignVector=false.
        //
        // a:   source array, read at a[i+0] .. a[i+14]
        // b:   destination array, written at b[i+0] .. b[i+14]
        // val: addend applied to every element
        // Returns { a, b }.
821     static Object[] test5a(short[] a, short[] b, short val) {
822         for (int i = 0; i < RANGE; i+=16) {
823             b[i+ 0] = (short)(a[i+ 0] + val); // 8-pack
824             b[i+ 1] = (short)(a[i+ 1] + val);
825             b[i+ 2] = (short)(a[i+ 2] + val);
826             b[i+ 3] = (short)(a[i+ 3] + val);
827             b[i+ 4] = (short)(a[i+ 4] + val);
828             b[i+ 5] = (short)(a[i+ 5] + val);
829             b[i+ 6] = (short)(a[i+ 6] + val);
830             b[i+ 7] = (short)(a[i+ 7] + val);
831
832             b[i+ 8] = (short)(a[i+ 8] + val); // 4-pack
833             b[i+ 9] = (short)(a[i+ 9] + val);
834             b[i+10] = (short)(a[i+10] + val);
835             b[i+11] = (short)(a[i+11] + val);
836
837             b[i+12] = (short)(a[i+12] + val); // 2-pack
838             b[i+13] = (short)(a[i+13] + val);
839
840             b[i+14] = (short)(a[i+14] + val); // remaining single element; i+15 is not accessed
841         }
842         return new Object[]{ a, b };
843     }
844 
845     @Test
846     @IR(counts = {IRNode.LOAD_VECTOR_I,   IRNode.VECTOR_SIZE_4, "> 0",
847                   IRNode.MUL_VI,          IRNode.VECTOR_SIZE_4, "> 0",
848                   IRNode.AND_VI,          IRNode.VECTOR_SIZE_4, "> 0",
849                   IRNode.ADD_VI,          IRNode.VECTOR_SIZE_4, "> 0", // reduction moved out of loop
850                   IRNode.ADD_REDUCTION_V,                       "> 0"},
851         applyIfAnd = {"MaxVectorSize", ">=32", "AlignVector", "false"},
852         applyIfPlatform = {"64-bit", "true"},
853         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
854     @IR(counts = {IRNode.LOAD_VECTOR_I,   IRNode.VECTOR_SIZE_4, "> 0",
855                   IRNode.MUL_VI,          IRNode.VECTOR_SIZE_4, "> 0",
856                   IRNode.AND_VI,          IRNode.VECTOR_SIZE_4, "> 0",
857                   IRNode.ADD_VI,          IRNode.VECTOR_SIZE_4, "> 0", // reduction moved out of loop
858                   IRNode.ADD_REDUCTION_V,                       "> 0"},
859         applyIfAnd = {"MaxVectorSize", ">=32", "UseCompactObjectHeaders", "false"},
860         applyIfPlatform = {"64-bit", "true"},
861         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
862     // Split packs including reductions
        //
        // Per iteration, s accumulates four products a[k] * b[k] (k = i+0 .. i+3)
        // followed by four bitwise ANDs a[k] & b[k] (k = i+4 .. i+7). The two IR
        // rules are identical except for their condition: 4-element int load, mul,
        // and, add vectors and an add-reduction are expected whenever AlignVector
        // is off, or whenever compact object headers are off (see the alignment
        // note at the end of the loop body).
        //
        // a, b: int arrays, only read here at indices i+0 .. i+7
        // Returns { a, b, { s } }.
863     static Object[] test6a(int[] a, int[] b) {
864         int s = 0;
865         for (int i = 0; i < RANGE; i+=8) { // 8 ints = 32 bytes per iteration
866             s += a[i+0] * b[i+0];
867             s += a[i+1] * b[i+1];
868             s += a[i+2] * b[i+2];
869             s += a[i+3] * b[i+3];
870
871             s += a[i+4] & b[i+4];
872             s += a[i+5] & b[i+5];
873             s += a[i+6] & b[i+6];
874             s += a[i+7] & b[i+7];
875             // With AlignVector, we need 8-byte alignment of vector loads/stores.
876             // UseCompactObjectHeaders=false                 UseCompactObjectHeaders=true
877             // adr = base + 16 + 32*i  ->  always            adr = base + 12 + 32*i  ->  never
878             // -> vectorize                                  -> no vectorization
879         }
880         return new Object[]{ a, b, new int[]{ s } };
881     }
882 
883     @Test
884     @IR(counts = {IRNode.LOAD_VECTOR_I,  "> 0",
885                   IRNode.MUL_VI,         "> 0",
886                   IRNode.POPULATE_INDEX, "> 0"},
887         applyIfPlatform = {"64-bit", "true"},
888         applyIfCPUFeatureOr = {"avx2", "true", "sve", "true"})
889     // Index Populate:
890     // There can be an issue when all the (iv + 1), (iv + 2), ...
891     // get packed, but not (iv). Then we have a pack that is one element
892     // too short, and we start splitting everything in a bad way.
        //
        // The loop stores b[i] * i into a[i], so the induction variable itself is
        // an operand of the multiplication. The IR rule (checked on avx2 or sve)
        // expects an int load vector, an int multiply vector and a PopulateIndex
        // node; no specific vector size is required.
        // Returns { a, b }.
893     static Object[] test7a(int[] a, int[] b) {
894         for (int i = 0; i < RANGE; i++) {
895             a[i] = b[i] * i; // i is the (iv) operand referred to above
896         }
897         return new Object[]{ a, b };
898     }
899 }