1 /*
  2  * Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved.
  3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  4  *
  5  * This code is free software; you can redistribute it and/or modify it
  6  * under the terms of the GNU General Public License version 2 only, as
  7  * published by the Free Software Foundation.
  8  *
  9  * This code is distributed in the hope that it will be useful, but WITHOUT
 10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 12  * version 2 for more details (a copy is included in the LICENSE file that
 13  * accompanied this code).
 14  *
 15  * You should have received a copy of the GNU General Public License version
 16  * 2 along with this work; if not, write to the Free Software Foundation,
 17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 18  *
 19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 20  * or visit www.oracle.com if you need additional information or have any
 21  * questions.
 22  */
 23 
 24 package compiler.loopopts.superword;
 25 
 26 import compiler.lib.ir_framework.*;
 27 import jdk.test.lib.Utils;
 28 import jdk.test.whitebox.WhiteBox;
 29 import java.lang.reflect.Array;
 30 import java.util.Map;
 31 import java.util.HashMap;
 32 import java.util.Random;
 33 import java.nio.ByteOrder;
 34 
 35 /*
 36  * @test
 37  * @bug 8326139
 38  * @summary Test splitting packs in SuperWord
 39  * @library /test/lib /
 40  * @run driver compiler.loopopts.superword.TestSplitPacks nCOH_nAV
 41  * @run driver compiler.loopopts.superword.TestSplitPacks nCOH_yAV
 42  * @run driver compiler.loopopts.superword.TestSplitPacks yCOH_nAV
 43  * @run driver compiler.loopopts.superword.TestSplitPacks yCOH_yAV
 44  */
 45 
 46 public class TestSplitPacks {
 47     static int RANGE = 1024*8;
 48     static int RANGE_FINAL = 1024*8;
 49     private static final Random RANDOM = Utils.getRandomInstance();
 50 
 51     // Inputs
 52     byte[] aB;
 53     byte[] bB;
 54     byte mB = (byte)31;
 55     short[] aS;
 56     short[] bS;
 57     short mS = (short)0xF0F0;
 58     int[] aI;
 59     int[] bI;
 60     int mI = 0xF0F0F0F0;
 61     long[] aL;
 62     long[] bL;
 63     long mL = 0xF0F0F0F0F0F0F0F0L;
 64 
 65     // List of tests
 66     Map<String,TestFunction> tests = new HashMap<String,TestFunction>();
 67 
 68     // List of gold, the results from the first run before compilation
 69     Map<String,Object[]> golds = new HashMap<String,Object[]>();
 70 
    // A registered test case. It takes no arguments and returns the objects the
    // test produced; verify() requires every returned element to be a primitive
    // array so that it can be compared element-wise with the gold result.
    interface TestFunction {
        // Runs the test once and returns its outputs.
        Object[] run();
    }
 74 
 75     public static void main(String[] args) {
 76         TestFramework framework = new TestFramework(TestSplitPacks.class);
 77         framework.addFlags("-XX:+IgnoreUnrecognizedVMOptions", "-XX:LoopUnrollLimit=1000");
 78         switch (args[0]) {
 79             case "nCOH_nAV" -> { framework.addFlags("-XX:+UnlockExperimentalVMOptions", "-XX:-UseCompactObjectHeaders", "-XX:-AlignVector"); }
 80             case "nCOH_yAV" -> { framework.addFlags("-XX:+UnlockExperimentalVMOptions", "-XX:-UseCompactObjectHeaders", "-XX:+AlignVector"); }
 81             case "yCOH_nAV" -> { framework.addFlags("-XX:+UnlockExperimentalVMOptions", "-XX:+UseCompactObjectHeaders", "-XX:-AlignVector"); }
 82             case "yCOH_yAV" -> { framework.addFlags("-XX:+UnlockExperimentalVMOptions", "-XX:+UseCompactObjectHeaders", "-XX:+AlignVector"); }
 83             default -> { throw new RuntimeException("Test argument not recognized: " + args[0]); }
 84         };
 85         framework.start();
 86     }
 87 
 88     public TestSplitPacks() {
 89         // Generate input once
 90         aB = generateB();
 91         bB = generateB();
 92         aS = generateS();
 93         bS = generateS();
 94         aI = generateI();
 95         bI = generateI();
 96         aL = generateL();
 97         bL = generateL();
 98 
 99         // Add all tests to list
100         tests.put("test0",       () -> { return test0(aI.clone(), bI.clone(), mI); });
101         tests.put("test1a",      () -> { return test1a(aI.clone(), bI.clone(), mI); });
102         tests.put("test1b",      () -> { return test1b(aI.clone(), bI.clone(), mI); });
103         tests.put("test1c",      () -> { return test1c(aI.clone(), bI.clone(), mI); });
104         tests.put("test1d",      () -> { return test1d(aI.clone(), bI.clone(), mI); });
105         tests.put("test2a",      () -> { return test2a(aI.clone(), bI.clone(), mI); });
106         tests.put("test2b",      () -> { return test2b(aI.clone(), bI.clone(), mI); });
107         tests.put("test2c",      () -> { return test2c(aI.clone(), bI.clone(), mI); });
108         tests.put("test2d",      () -> { return test2d(aI.clone(), bI.clone(), mI); });
109         tests.put("test3a",      () -> { return test3a(aS.clone(), bS.clone(), mS); });
110         tests.put("test4a",      () -> { return test4a(aS.clone(), bS.clone()); });
111         tests.put("test4b",      () -> { return test4b(aS.clone(), bS.clone()); });
112         tests.put("test4c",      () -> { return test4c(aS.clone(), bS.clone()); });
113         tests.put("test4d",      () -> { return test4d(aS.clone(), bS.clone()); });
114         tests.put("test4e",      () -> { return test4e(aS.clone(), bS.clone()); });
115         tests.put("test4f",      () -> { return test4f(aS.clone(), bS.clone()); });
116         tests.put("test4g",      () -> { return test4g(aS.clone(), bS.clone()); });
117         tests.put("test5a",      () -> { return test5a(aS.clone(), bS.clone(), mS); });
118         tests.put("test6a",      () -> { return test6a(aI.clone(), bI.clone()); });
119         tests.put("test7a",      () -> { return test7a(aI.clone(), bI.clone()); });
120 
121         // Compute gold value for all test methods before compilation
122         for (Map.Entry<String,TestFunction> entry : tests.entrySet()) {
123             String name = entry.getKey();
124             TestFunction test = entry.getValue();
125             Object[] gold = test.run();
126             golds.put(name, gold);
127         }
128     }
129 
130     @Warmup(100)
131     @Run(test = {"test0",
132                  "test1a",
133                  "test1b",
134                  "test1c",
135                  "test1d",
136                  "test2a",
137                  "test2b",
138                  "test2c",
139                  "test2d",
140                  "test3a",
141                  "test4a",
142                  "test4b",
143                  "test4c",
144                  "test4d",
145                  "test4e",
146                  "test4f",
147                  "test4g",
148                  "test5a",
149                  "test6a",
150                  "test7a"})
151     public void runTests() {
152         for (Map.Entry<String,TestFunction> entry : tests.entrySet()) {
153             String name = entry.getKey();
154             TestFunction test = entry.getValue();
155             // Recall gold value from before compilation
156             Object[] gold = golds.get(name);
157             // Compute new result
158             Object[] result = test.run();
159             // Compare gold and new result
160             verify(name, gold, result);
161         }
162     }
163 
164     static byte[] generateB() {
165         byte[] a = new byte[RANGE];
166         for (int i = 0; i < a.length; i++) {
167             a[i] = (byte)RANDOM.nextInt();
168         }
169         return a;
170     }
171 
172     static short[] generateS() {
173         short[] a = new short[RANGE];
174         for (int i = 0; i < a.length; i++) {
175             a[i] = (short)RANDOM.nextInt();
176         }
177         return a;
178     }
179 
180     static int[] generateI() {
181         int[] a = new int[RANGE];
182         for (int i = 0; i < a.length; i++) {
183             a[i] = RANDOM.nextInt();
184         }
185         return a;
186     }
187 
188     static long[] generateL() {
189         long[] a = new long[RANGE];
190         for (int i = 0; i < a.length; i++) {
191             a[i] = RANDOM.nextLong();
192         }
193         return a;
194     }
195 
196     static void verify(String name, Object[] gold, Object[] result) {
197         if (gold.length != result.length) {
198             throw new RuntimeException("verify " + name + ": not the same number of outputs: gold.length = " +
199                                        gold.length + ", result.length = " + result.length);
200         }
201         for (int i = 0; i < gold.length; i++) {
202             Object g = gold[i];
203             Object r = result[i];
204             if (g.getClass() != r.getClass() || !g.getClass().isArray() || !r.getClass().isArray()) {
205                 throw new RuntimeException("verify " + name + ": must both be array of same type:" +
206                                            " gold[" + i + "].getClass() = " + g.getClass().getSimpleName() +
207                                            " result[" + i + "].getClass() = " + r.getClass().getSimpleName());
208             }
209             if (g == r) {
210                 throw new RuntimeException("verify " + name + ": should be two separate arrays (with identical content):" +
211                                            " gold[" + i + "] == result[" + i + "]");
212             }
213             if (Array.getLength(g) != Array.getLength(r)) {
214                     throw new RuntimeException("verify " + name + ": arrays must have same length:" +
215                                            " gold[" + i + "].length = " + Array.getLength(g) +
216                                            " result[" + i + "].length = " + Array.getLength(r));
217             }
218             Class c = g.getClass().getComponentType();
219             if (c == byte.class) {
220                 verifyB(name, i, (byte[])g, (byte[])r);
221             } else if (c == short.class) {
222                 verifyS(name, i, (short[])g, (short[])r);
223             } else if (c == int.class) {
224                 verifyI(name, i, (int[])g, (int[])r);
225             } else if (c == long.class) {
226                 verifyL(name, i, (long[])g, (long[])r);
227             } else {
228                 throw new RuntimeException("verify " + name + ": array type not supported for verify:" +
229                                        " gold[" + i + "].getClass() = " + g.getClass().getSimpleName() +
230                                        " result[" + i + "].getClass() = " + r.getClass().getSimpleName());
231             }
232         }
233     }
234 
235     static void verifyB(String name, int i, byte[] g, byte[] r) {
236         for (int j = 0; j < g.length; j++) {
237             if (g[j] != r[j]) {
238                 throw new RuntimeException("verify " + name + ": arrays must have same content:" +
239                                            " gold[" + i + "][" + j + "] = " + g[j] +
240                                            " result[" + i + "][" + j + "] = " + r[j]);
241             }
242         }
243     }
244 
245     static void verifyS(String name, int i, short[] g, short[] r) {
246         for (int j = 0; j < g.length; j++) {
247             if (g[j] != r[j]) {
248                 throw new RuntimeException("verify " + name + ": arrays must have same content:" +
249                                            " gold[" + i + "][" + j + "] = " + g[j] +
250                                            " result[" + i + "][" + j + "] = " + r[j]);
251             }
252         }
253     }
254 
255     static void verifyI(String name, int i, int[] g, int[] r) {
256         for (int j = 0; j < g.length; j++) {
257             if (g[j] != r[j]) {
258                 throw new RuntimeException("verify " + name + ": arrays must have same content:" +
259                                            " gold[" + i + "][" + j + "] = " + g[j] +
260                                            " result[" + i + "][" + j + "] = " + r[j]);
261             }
262         }
263     }
264 
265     static void verifyL(String name, int i, long[] g, long[] r) {
266         for (int j = 0; j < g.length; j++) {
267             if (g[j] != r[j]) {
268                 throw new RuntimeException("verify " + name + ": arrays must have same content:" +
269                                            " gold[" + i + "][" + j + "] = " + g[j] +
270                                            " result[" + i + "][" + j + "] = " + r[j]);
271             }
272         }
273     }
274 
    @Test
    @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0",
                  IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.AND_VI,        IRNode.VECTOR_SIZE_2, "> 0",
                  IRNode.AND_VI,        IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.STORE_VECTOR, "> 0"},
        applyIf = {"MaxVectorSize", ">=32"},
        applyIfPlatform = {"64-bit", "true"},
        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
    // Load and store are already split
    //
    //  0 1 - - 4 5 6 7
    //  | |     | | | |
    //  0 1 - - 4 5 6 7
    //
    // Top row: offsets (relative to i) read from a; bottom row: offsets written to b.
    // Offsets 2 and 3 are not touched, so every iteration consists of a 2-element
    // group and a 4-element group. The IR rule asks for int vector loads and AND
    // vectors of both sizes (2 and 4) and for a vector store.
    static Object[] test0(int[] a, int[] b, int mask) {
        // Each iteration covers a window of 8 elements.
        for (int i = 0; i < RANGE; i+=8) {
            int b0 = a[i+0] & mask;
            int b1 = a[i+1] & mask;

            int b4 = a[i+4] & mask;
            int b5 = a[i+5] & mask;
            int b6 = a[i+6] & mask;
            int b7 = a[i+7] & mask;

            b[i+0] = b0;
            b[i+1] = b1;

            b[i+4] = b4;
            b[i+5] = b5;
            b[i+6] = b6;
            b[i+7] = b7;
        }
        // Both arrays are returned so that they can be compared with the gold run.
        return new Object[]{ a, b };
    }
309 
    @Test
    @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0",
                  IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.ADD_VI,        IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.MUL_VI,        IRNode.VECTOR_SIZE_2, "> 0",
                  IRNode.STORE_VECTOR, "> 0"},
        applyIf = {"MaxVectorSize", ">=32"},
        applyIfPlatform = {"64-bit", "true"},
        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
    // Adjacent Load and Store, but split by Add/Mul
    //
    // Offsets 0..5 are loaded from a and stored to b contiguously, but offsets 0..3
    // use an addition and offsets 4..5 a multiplication. The IR rule asks for a
    // 4-element ADD_VI and a 2-element MUL_VI. Offsets 6 and 7 are not touched.
    static Object[] test1a(int[] a, int[] b, int mask) {
        // Each iteration covers a window of 8 elements.
        for (int i = 0; i < RANGE; i+=8) {
            b[i+0] = a[i+0] + mask; // Add
            b[i+1] = a[i+1] + mask;
            b[i+2] = a[i+2] + mask;
            b[i+3] = a[i+3] + mask;

            b[i+4] = a[i+4] * mask; // Mul
            b[i+5] = a[i+5] * mask;
        }
        // Both arrays are returned so that they can be compared with the gold run.
        return new Object[]{ a, b };
    }
332 
    @Test
    @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0",
                  IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.ADD_VI,        IRNode.VECTOR_SIZE_2, "> 0",
                  IRNode.MUL_VI,        IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.STORE_VECTOR, "> 0"},
        applyIf = {"MaxVectorSize", ">=32"},
        applyIfPlatform = {"64-bit", "true"},
        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
    // Adjacent Load and Store, but split by Add/Mul
    //
    // Offsets 0..5 are loaded from a and stored to b contiguously, but offsets 0..3
    // use a multiplication and offsets 4..5 an addition. The IR rule asks for a
    // 4-element MUL_VI and a 2-element ADD_VI. Offsets 6 and 7 are not touched.
    static Object[] test1b(int[] a, int[] b, int mask) {
        // Each iteration covers a window of 8 elements.
        for (int i = 0; i < RANGE; i+=8) {
            b[i+0] = a[i+0] * mask; // Mul
            b[i+1] = a[i+1] * mask;
            b[i+2] = a[i+2] * mask;
            b[i+3] = a[i+3] * mask;

            b[i+4] = a[i+4] + mask; // Add
            b[i+5] = a[i+5] + mask;
        }
        // Both arrays are returned so that they can be compared with the gold run.
        return new Object[]{ a, b };
    }
355 
    @Test
    @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0",
                  IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.ADD_VI,        IRNode.VECTOR_SIZE_2, "> 0",
                  IRNode.MUL_VI,        IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.STORE_VECTOR, "> 0"},
        applyIf = {"MaxVectorSize", ">=32"},
        applyIfPlatform = {"64-bit", "true"},
        applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true"})
    // Adjacent Load and Store, but split by Add/Mul
    //
    // Offsets 0..5 are loaded from a and stored to b contiguously, but offsets 0..1
    // use an addition and offsets 2..5 a multiplication. The IR rule asks for a
    // 2-element ADD_VI and a 4-element MUL_VI and is gated on avx2 or asimd.
    // Offsets 6 and 7 are not touched.
    static Object[] test1c(int[] a, int[] b, int mask) {
        // Each iteration covers a window of 8 elements.
        for (int i = 0; i < RANGE; i+=8) {
            b[i+0] = a[i+0] + mask; // Add
            b[i+1] = a[i+1] + mask;

            b[i+2] = a[i+2] * mask; // Mul
            b[i+3] = a[i+3] * mask;
            b[i+4] = a[i+4] * mask;
            b[i+5] = a[i+5] * mask;
        }
        // Both arrays are returned so that they can be compared with the gold run.
        return new Object[]{ a, b };
    }
378 
    @Test
    @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0",
                  IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.ADD_VI,        IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.MUL_VI,        IRNode.VECTOR_SIZE_2, "> 0",
                  IRNode.STORE_VECTOR, "> 0"},
        applyIf = {"MaxVectorSize", ">=32"},
        applyIfPlatform = {"64-bit", "true"},
        applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true"})
    // Adjacent Load and Store, but split by Add/Mul
    //
    // Offsets 0..5 are loaded from a and stored to b contiguously, but offsets 0..1
    // use a multiplication and offsets 2..5 an addition. The IR rule asks for a
    // 2-element MUL_VI and a 4-element ADD_VI and is gated on avx2 or asimd.
    // Offsets 6 and 7 are not touched.
    static Object[] test1d(int[] a, int[] b, int mask) {
        // Each iteration covers a window of 8 elements.
        for (int i = 0; i < RANGE; i+=8) {
            b[i+0] = a[i+0] * mask; // Mul
            b[i+1] = a[i+1] * mask;

            b[i+2] = a[i+2] + mask; // Add
            b[i+3] = a[i+3] + mask;
            b[i+4] = a[i+4] + mask;
            b[i+5] = a[i+5] + mask;
        }
        // Both arrays are returned so that they can be compared with the gold run.
        return new Object[]{ a, b };
    }
401 
    @Test
    @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0",
                  IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.AND_VI,        IRNode.VECTOR_SIZE_2, "> 0",
                  IRNode.AND_VI,        IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.STORE_VECTOR, "> 0"},
        applyIf = {"MaxVectorSize", ">=32"},
        applyIfPlatform = {"64-bit", "true"},
        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
    // Split the load
    //
    //  0 1 2 3 4 5 - -
    //  | |  \ \ \ \
    //  | |   \ \ \ \
    //  | |    \ \ \ \
    //  0 1 - - 4 5 6 7
    //
    // Top row: offsets (relative to i) read from a; bottom row: offsets written to b.
    // The six loads are contiguous, while the stores form a 2-element group (0..1)
    // and a 4-element group (4..7): the values read at offsets 2..5 are written to
    // offsets 4..7. The IR rule asks for int loads and ANDs of both sizes 2 and 4.
    static Object[] test2a(int[] a, int[] b, int mask) {
        // Each iteration covers a window of 8 elements.
        for (int i = 0; i < RANGE; i+=8) {
            int b0 = a[i+0] & mask;
            int b1 = a[i+1] & mask;
            int b2 = a[i+2] & mask;
            int b3 = a[i+3] & mask;
            int b4 = a[i+4] & mask;
            int b5 = a[i+5] & mask;

            b[i+0] = b0;
            b[i+1] = b1;

            // Shifted by two positions relative to where the values were read.
            b[i+4] = b2;
            b[i+5] = b3;
            b[i+6] = b4;
            b[i+7] = b5;
        }
        // Both arrays are returned so that they can be compared with the gold run.
        return new Object[]{ a, b };
    }
438 
    @Test
    @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0",
                  IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.AND_VI,        IRNode.VECTOR_SIZE_2, "> 0",
                  IRNode.AND_VI,        IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.STORE_VECTOR, "> 0"},
        applyIf = {"MaxVectorSize", ">=32"},
        applyIfPlatform = {"64-bit", "true"},
        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
    // Split the load
    //
    //  0 1 2 3 4 5 - -
    //  | | | |  \ \
    //  | | | |   \ \
    //  | | | |    \ \
    //  0 1 2 3 -- 6 7
    //
    // Top row: offsets (relative to i) read from a; bottom row: offsets written to b.
    // The six loads are contiguous, while the stores form a 4-element group (0..3)
    // and a 2-element group (6..7): the values read at offsets 4..5 are written to
    // offsets 6..7. The IR rule asks for int loads and ANDs of both sizes 2 and 4.
    static Object[] test2b(int[] a, int[] b, int mask) {
        // Each iteration covers a window of 8 elements.
        for (int i = 0; i < RANGE; i+=8) {
            int b0 = a[i+0] & mask;
            int b1 = a[i+1] & mask;
            int b2 = a[i+2] & mask;
            int b3 = a[i+3] & mask;
            int b4 = a[i+4] & mask;
            int b5 = a[i+5] & mask;

            b[i+0] = b0;
            b[i+1] = b1;
            b[i+2] = b2;
            b[i+3] = b3;

            // Shifted by two positions relative to where the values were read.
            b[i+6] = b4;
            b[i+7] = b5;
        }
        // Both arrays are returned so that they can be compared with the gold run.
        return new Object[]{ a, b };
    }
475 
    @Test
    @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0",
                  IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.AND_VI,        IRNode.VECTOR_SIZE_2, "> 0",
                  IRNode.AND_VI,        IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.STORE_VECTOR, "> 0"},
        applyIf = {"MaxVectorSize", ">=32"},
        applyIfPlatform = {"64-bit", "true"},
        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
    // Split the store
    //
    //  0 1 - - 4 5 6 7
    //  | |    / / / /
    //  | |   / / / /
    //  | |  / / / /
    //  0 1 2 3 4 5 - -
    //
    // Top row: offsets (relative to i) read from a; bottom row: offsets written to b.
    // Here the loads already form a 2-element group (0..1) and a 4-element group
    // (4..7), while the six stores are contiguous: the values read at offsets 4..7
    // are written to offsets 2..5. The IR rule asks for int loads and ANDs of both
    // sizes 2 and 4.
    static Object[] test2c(int[] a, int[] b, int mask) {
        // Each iteration covers a window of 8 elements.
        for (int i = 0; i < RANGE; i+=8) {
            int b0 = a[i+0] & mask;
            int b1 = a[i+1] & mask;

            int b4 = a[i+4] & mask;
            int b5 = a[i+5] & mask;
            int b6 = a[i+6] & mask;
            int b7 = a[i+7] & mask;

            b[i+0] = b0;
            b[i+1] = b1;
            // Shifted by two positions relative to where the values were read.
            b[i+2] = b4;
            b[i+3] = b5;
            b[i+4] = b6;
            b[i+5] = b7;
        }
        // Both arrays are returned so that they can be compared with the gold run.
        return new Object[]{ a, b };
    }
512 
    @Test
    @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0",
                  IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.AND_VI,        IRNode.VECTOR_SIZE_2, "> 0",
                  IRNode.AND_VI,        IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.STORE_VECTOR, "> 0"},
        applyIf = {"MaxVectorSize", ">=32"},
        applyIfPlatform = {"64-bit", "true"},
        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
    // Split the store
    //
    //  0 1 2 3 - - 6 7
    //  | | | |    / /
    //  | | | |   / /
    //  | | | |  / /
    //  0 1 2 3 4 5 - -
    //
    // Top row: offsets (relative to i) read from a; bottom row: offsets written to b.
    // Here the loads already form a 4-element group (0..3) and a 2-element group
    // (6..7), while the six stores are contiguous: the values read at offsets 6..7
    // are written to offsets 4..5. The IR rule asks for int loads and ANDs of both
    // sizes 2 and 4.
    static Object[] test2d(int[] a, int[] b, int mask) {
        // Each iteration covers a window of 8 elements.
        for (int i = 0; i < RANGE; i+=8) {
            int b0 = a[i+0] & mask;
            int b1 = a[i+1] & mask;
            int b2 = a[i+2] & mask;
            int b3 = a[i+3] & mask;

            int b6 = a[i+6] & mask;
            int b7 = a[i+7] & mask;

            b[i+0] = b0;
            b[i+1] = b1;
            b[i+2] = b2;
            b[i+3] = b3;
            // Shifted by two positions relative to where the values were read.
            b[i+4] = b6;
            b[i+5] = b7;
        }
        // Both arrays are returned so that they can be compared with the gold run.
        return new Object[]{ a, b };
    }
549 
    @Test
    @IR(counts = {IRNode.LOAD_VECTOR_S, IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.STORE_VECTOR, "> 0"},
        applyIf = {"MaxVectorSize", ">=32"},
        applyIfPlatform = {"64-bit", "true"},
        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
    // 0 1 2 3 4 5 6 7 -
    // | | | | | | | |
    // | + + + | | | |
    // |       | | | |
    // |     v | | | | v
    // |     | | | | | |
    // 0 - - 3 4 5 6 7 8
    //
    // Top row: offsets (relative to i) read from a; bottom row: offsets written to b.
    // "+" marks the loaded values that are only added into sum, "v" marks the stores
    // of the scalar val. Only offsets 4..7 are copied from a to b as a group of four;
    // the IR rule asks for a 4-element short vector load and a vector store.
    static Object[] test3a(short[] a, short[] b, short val) {
        int sum = 0;
        // Each iteration covers a window of 16 elements, of which only offsets 0..8 are used.
        for (int i = 0; i < RANGE; i+=16) {
            short a0 = a[i+0]; // required for alignment / offsets, technical limitation.

            short a1 = a[i+1]; // adjacent to 4-pack, but need to be split off
            short a2 = a[i+2];
            short a3 = a[i+3];

            short a4 = a[i+4]; // 4-pack
            short a5 = a[i+5];
            short a6 = a[i+6];
            short a7 = a[i+7];


            b[i+0] = a0; // required for alignment / offsets, technical limitation.

            sum += a1 + a2 + a3; // not packed

            b[i+3] = val; // adjacent to 4-pack but needs to be split off

            b[i+4] = a4; // 4-pack
            b[i+5] = a5;
            b[i+6] = a6;
            b[i+7] = a7;

            b[i+8] = val; // adjacent to 4-pack but needs to be split off
        }
        // sum is wrapped in a one-element array because verify() only accepts arrays.
        return new Object[]{ a, b, new int[]{ sum } };
    }
593 
    @Test
    @IR(counts = {IRNode.LOAD_VECTOR_S, IRNode.VECTOR_SIZE_2, "> 0",
                  IRNode.STORE_VECTOR, "> 0"},
        applyIfPlatform = {"64-bit", "true"},
        applyIfCPUFeatureOr = {"sse4.1", "true"})
    // Cyclic dependency with distance 2 -> split into 2-packs
    //
    // Every iteration stores to b two elements ahead of the element it loads from a.
    // NOTE(review): presumably the dependency exists because the compiler cannot
    // rule out that a and b refer to the same array - confirm.
    // The IR rule asks for 2-element short vector loads and is gated on sse4.1 only.
    static Object[] test4a(short[] a, short[] b) {
        // The bound RANGE-64 keeps the store index i+2 inside an array of RANGE elements.
        for (int i = 0; i < RANGE-64; i++) {
          b[i+2] = a[i+0];
        }
        // Both arrays are returned so that they can be compared with the gold run.
        return new Object[]{ a, b };
    }
606 
    @Test
    @IR(counts = {IRNode.LOAD_VECTOR_S, IRNode.VECTOR_SIZE_2, "> 0",
                  IRNode.STORE_VECTOR, "> 0"},
        applyIf = {"AlignVector", "false"},
        applyIfPlatform = {"64-bit", "true"},
        applyIfCPUFeatureOr = {"sse4.1", "true"})
    // Cyclic dependency with distance 3 -> split into 2-packs
    //
    // Every iteration stores to b three elements ahead of the element it loads from a.
    // NOTE(review): presumably the dependency exists because the compiler cannot
    // rule out that a and b refer to the same array - confirm.
    // The IR rule asks for 2-element short vector loads; it is only applied with
    // AlignVector disabled and is gated on sse4.1 only.
    static Object[] test4b(short[] a, short[] b) {
        // The bound RANGE-64 keeps the store index i+3 inside an array of RANGE elements.
        for (int i = 0; i < RANGE-64; i++) {
          b[i+3] = a[i+0];
        }
        // Both arrays are returned so that they can be compared with the gold run.
        return new Object[]{ a, b };
    }
620 
    @Test
    @IR(counts = {IRNode.LOAD_VECTOR_S, IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.STORE_VECTOR, "> 0"},
        applyIf = {"MaxVectorSize", ">=8"},
        applyIfPlatform = {"64-bit", "true"},
        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
    // Cyclic dependency with distance 4 -> split into 4-packs
    //
    // Every iteration stores to b four elements ahead of the element it loads from a.
    // NOTE(review): presumably the dependency exists because the compiler cannot
    // rule out that a and b refer to the same array - confirm.
    // The IR rule asks for 4-element short vector loads.
    static Object[] test4c(short[] a, short[] b) {
        // The bound RANGE-64 keeps the store index i+4 inside an array of RANGE elements.
        for (int i = 0; i < RANGE-64; i++) {
          b[i+4] = a[i+0];
        }
        // Both arrays are returned so that they can be compared with the gold run.
        return new Object[]{ a, b };
    }
634 
    @Test
    @IR(counts = {IRNode.LOAD_VECTOR_S, IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.STORE_VECTOR, "> 0"},
        applyIfAnd = {"MaxVectorSize", ">=8", "AlignVector", "false"},
        applyIfPlatform = {"64-bit", "true"},
        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
    // Cyclic dependency with distance 5 -> split into 4-packs
    //
    // Every iteration stores to b five elements ahead of the element it loads from a.
    // NOTE(review): presumably the dependency exists because the compiler cannot
    // rule out that a and b refer to the same array - confirm.
    // The IR rule asks for 4-element short vector loads and is only applied with
    // AlignVector disabled.
    static Object[] test4d(short[] a, short[] b) {
        // The bound RANGE-64 keeps the store index i+5 inside an array of RANGE elements.
        for (int i = 0; i < RANGE-64; i++) {
          b[i+5] = a[i+0];
        }
        // Both arrays are returned so that they can be compared with the gold run.
        return new Object[]{ a, b };
    }
648 
    @Test
    @IR(counts = {IRNode.LOAD_VECTOR_S, IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.STORE_VECTOR, "> 0"},
        applyIfAnd = {"MaxVectorSize", ">=8", "AlignVector", "false"},
        applyIfPlatform = {"64-bit", "true"},
        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
    // Cyclic dependency with distance 6 -> split into 4-packs
    //
    // Every iteration stores to b six elements ahead of the element it loads from a.
    // NOTE(review): presumably the dependency exists because the compiler cannot
    // rule out that a and b refer to the same array - confirm.
    // The IR rule asks for 4-element short vector loads and is only applied with
    // AlignVector disabled.
    static Object[] test4e(short[] a, short[] b) {
        // The bound RANGE-64 keeps the store index i+6 inside an array of RANGE elements.
        for (int i = 0; i < RANGE-64; i++) {
          b[i+6] = a[i+0];
        }
        // Both arrays are returned so that they can be compared with the gold run.
        return new Object[]{ a, b };
    }
662 
    @Test
    @IR(counts = {IRNode.LOAD_VECTOR_S, IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.STORE_VECTOR, "> 0"},
        applyIfAnd = {"MaxVectorSize", ">=8", "AlignVector", "false"},
        applyIfPlatform = {"64-bit", "true"},
        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
    // Cyclic dependency with distance 7 -> split into 4-packs
    //
    // Every iteration stores to b seven elements ahead of the element it loads from a.
    // NOTE(review): presumably the dependency exists because the compiler cannot
    // rule out that a and b refer to the same array - confirm.
    // The IR rule asks for 4-element short vector loads and is only applied with
    // AlignVector disabled.
    static Object[] test4f(short[] a, short[] b) {
        // The bound RANGE-64 keeps the store index i+7 inside an array of RANGE elements.
        for (int i = 0; i < RANGE-64; i++) {
          b[i+7] = a[i+0];
        }
        // Both arrays are returned so that they can be compared with the gold run.
        return new Object[]{ a, b };
    }
676 
    @Test
    @IR(counts = {IRNode.LOAD_VECTOR_S, IRNode.VECTOR_SIZE_8, "> 0",
                  IRNode.STORE_VECTOR, "> 0"},
        applyIf = {"MaxVectorSize", ">=32"},
        applyIfPlatform = {"64-bit", "true"},
        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
    // Cyclic dependency with distance 8 -> split into 8-packs
    //
    // Every iteration stores to b eight elements ahead of the element it loads from a.
    // NOTE(review): presumably the dependency exists because the compiler cannot
    // rule out that a and b refer to the same array - confirm.
    // The IR rule asks for 8-element short vector loads.
    static Object[] test4g(short[] a, short[] b) {
        // The bound RANGE-64 keeps the store index i+8 inside an array of RANGE elements.
        for (int i = 0; i < RANGE-64; i++) {
          b[i+8] = a[i+0];
        }
        // Both arrays are returned so that they can be compared with the gold run.
        return new Object[]{ a, b };
    }
690 
    @Test
    @IR(counts = {IRNode.LOAD_VECTOR_S, IRNode.VECTOR_SIZE_2, "> 0",
                  IRNode.LOAD_VECTOR_S, IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.LOAD_VECTOR_S, IRNode.VECTOR_SIZE_8, "> 0",
                  IRNode.ADD_VS,        IRNode.VECTOR_SIZE_2, "> 0",
                  IRNode.ADD_VS,        IRNode.VECTOR_SIZE_8, "> 0",
                  IRNode.ADD_VS,        IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.STORE_VECTOR, "> 0"},
        applyIfAnd = {"MaxVectorSize", ">=32", "AlignVector", "false"},
        applyIfPlatform = {"64-bit", "true"},
        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
    // Split pack into power-of-2 sizes
    //
    // Each iteration handles 15 contiguous elements (offsets 0..14) of a 16-element
    // window with the same operation. The IR rule asks for short vector loads and
    // adds of sizes 8, 4 and 2; the element at offset 14 has no partner left.
    // The rule is only applied with AlignVector disabled.
    static Object[] test5a(short[] a, short[] b, short val) {
        // Offset 15 of each 16-element window is not touched.
        for (int i = 0; i < RANGE; i+=16) {
            b[i+ 0] = (short)(a[i+ 0] + val); // 8 pack
            b[i+ 1] = (short)(a[i+ 1] + val);
            b[i+ 2] = (short)(a[i+ 2] + val);
            b[i+ 3] = (short)(a[i+ 3] + val);
            b[i+ 4] = (short)(a[i+ 4] + val);
            b[i+ 5] = (short)(a[i+ 5] + val);
            b[i+ 6] = (short)(a[i+ 6] + val);
            b[i+ 7] = (short)(a[i+ 7] + val);

            b[i+ 8] = (short)(a[i+ 8] + val); // 4-pack
            b[i+ 9] = (short)(a[i+ 9] + val);
            b[i+10] = (short)(a[i+10] + val);
            b[i+11] = (short)(a[i+11] + val);

            b[i+12] = (short)(a[i+12] + val); // 2-pack
            b[i+13] = (short)(a[i+13] + val);

            b[i+14] = (short)(a[i+14] + val); // single leftover element
        }
        // Both arrays are returned so that they can be compared with the gold run.
        return new Object[]{ a, b };
    }
726 
    @Test
    @IR(counts = {IRNode.LOAD_VECTOR_I,   IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.MUL_VI,          IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.AND_VI,          IRNode.VECTOR_SIZE_4, "> 0",
                  IRNode.ADD_VI,          IRNode.VECTOR_SIZE_4, "> 0", // reduction moved out of loop
                  IRNode.ADD_REDUCTION_V,                       "> 0"},
        applyIf = {"MaxVectorSize", ">=32"},
        applyIfPlatform = {"64-bit", "true"},
        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
    // Split packs including reductions
    //
    // All eight statements of an iteration add into the same accumulator s, but
    // offsets 0..3 contribute a product and offsets 4..7 a bitwise AND. The IR rule
    // asks for 4-element MUL_VI, AND_VI and ADD_VI nodes plus an add reduction.
    static Object[] test6a(int[] a, int[] b) {
        int s = 0;
        // Each iteration covers a window of 8 elements; neither array is written.
        for (int i = 0; i < RANGE; i+=8) {
            s += a[i+0] * b[i+0];
            s += a[i+1] * b[i+1];
            s += a[i+2] * b[i+2];
            s += a[i+3] * b[i+3];

            s += a[i+4] & b[i+4];
            s += a[i+5] & b[i+5];
            s += a[i+6] & b[i+6];
            s += a[i+7] & b[i+7];
        }
        // s is wrapped in a one-element array because verify() only accepts arrays.
        return new Object[]{ a, b, new int[]{ s } };
    }
752 
    @Test
    @IR(counts = {IRNode.LOAD_VECTOR_I,  "> 0",
                  IRNode.MUL_VI,         "> 0",
                  IRNode.POPULATE_INDEX, "> 0"},
        applyIfPlatform = {"64-bit", "true"},
        applyIfCPUFeatureOr = {"avx2", "true", "sve", "true"})
    // Index Populate:
    // There can be an issue when all the (iv + 1), (iv + 2), ...
    // get packed, but not (iv). Then we have a pack that is one element
    // too short, and we start splitting everything in a bad way.
    //
    // The loop multiplies every element of b by its own index and stores the
    // product into a. The IR rule asks for an int vector load, a MUL_VI and a
    // POPULATE_INDEX node, without constraining the vector size; it is gated on
    // avx2 or sve.
    static Object[] test7a(int[] a, int[] b) {
        for (int i = 0; i < RANGE; i++) {
            a[i] = b[i] * i; // the induction variable itself is an operand
        }
        // Both arrays are returned so that they can be compared with the gold run.
        return new Object[]{ a, b };
    }
769 }