1 /*
2 * Copyright (c) 2024, 2025, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.
8 *
9 * This code is distributed in the hope that it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12 * version 2 for more details (a copy is included in the LICENSE file that
13 * accompanied this code).
14 *
15 * You should have received a copy of the GNU General Public License version
16 * 2 along with this work; if not, write to the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18 *
19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20 * or visit www.oracle.com if you need additional information or have any
21 * questions.
22 */
23
24 package compiler.loopopts.superword;
25
26 import compiler.lib.ir_framework.*;
27 import jdk.test.lib.Utils;
28 import jdk.test.whitebox.WhiteBox;
29 import java.lang.reflect.Array;
30 import java.util.Map;
31 import java.util.HashMap;
32 import java.util.Random;
33 import java.nio.ByteOrder;
34
35 /*
36 * @test
37 * @bug 8326139 8348659
38 * @summary Test splitting packs in SuperWord
39 * @library /test/lib /
40 * @run driver compiler.loopopts.superword.TestSplitPacks nCOH_nAV
41 * @run driver compiler.loopopts.superword.TestSplitPacks nCOH_yAV
42 * @run driver compiler.loopopts.superword.TestSplitPacks yCOH_nAV
43 * @run driver compiler.loopopts.superword.TestSplitPacks yCOH_yAV
44 */
45
46 public class TestSplitPacks {
// Length of every generated input array.
static int RANGE = 1024 * 8;
// Same value as RANGE; not referenced by any code in this class.
static int RANGE_FINAL = 1024 * 8;
// Random source for the input data, obtained from the test library.
private static final Random RANDOM = Utils.getRandomInstance();

// Inputs: an "a" and a "b" array per element type, plus one scalar operand per type.
byte[] aB, bB;
byte mB = (byte) 31;
short[] aS, bS;
short mS = (short) 0xF0F0;
int[] aI, bI;
int mI = 0xF0F0F0F0;
long[] aL, bL;
long mL = 0xF0F0F0F0F0F0F0F0L;

// Test name -> closure that invokes the test method
Map<String, TestFunction> tests = new HashMap<>();

// Test name -> gold result, i.e. the output of the first run before compilation
Map<String, Object[]> golds = new HashMap<>();
70
/** A registered test: running it invokes one test method and yields that method's outputs. */
@FunctionalInterface
interface TestFunction {
    Object[] run();
}
74
75 public static void main(String[] args) {
76 TestFramework framework = new TestFramework(TestSplitPacks.class);
77 framework.addFlags("-XX:+IgnoreUnrecognizedVMOptions", "-XX:LoopUnrollLimit=1000");
78 switch (args[0]) {
79 case "nCOH_nAV" -> { framework.addFlags("-XX:-UseCompactObjectHeaders", "-XX:-AlignVector"); }
80 case "nCOH_yAV" -> { framework.addFlags("-XX:-UseCompactObjectHeaders", "-XX:+AlignVector"); }
81 case "yCOH_nAV" -> { framework.addFlags("-XX:+UseCompactObjectHeaders", "-XX:-AlignVector"); }
82 case "yCOH_yAV" -> { framework.addFlags("-XX:+UseCompactObjectHeaders", "-XX:+AlignVector"); }
83 default -> { throw new RuntimeException("Test argument not recognized: " + args[0]); }
84 };
85 framework.start();
86 }
87
/**
 * Generates the input arrays, registers every test method under its name and
 * records a gold result for each test by running it once.
 */
public TestSplitPacks() {
    // Inputs are generated a single time; every test invocation works on clones.
    aB = generateB();
    bB = generateB();
    aS = generateS();
    bS = generateS();
    aI = generateI();
    bI = generateI();
    aL = generateL();
    bL = generateL();

    // Register all tests
    tests.put("test0",  () -> test0(aI.clone(), bI.clone(), mI));
    tests.put("test1a", () -> test1a(aI.clone(), bI.clone(), mI));
    tests.put("test1b", () -> test1b(aI.clone(), bI.clone(), mI));
    tests.put("test1c", () -> test1c(aI.clone(), bI.clone(), mI));
    tests.put("test1d", () -> test1d(aI.clone(), bI.clone(), mI));
    tests.put("test2a", () -> test2a(aI.clone(), bI.clone(), mI));
    tests.put("test2b", () -> test2b(aI.clone(), bI.clone(), mI));
    tests.put("test2c", () -> test2c(aI.clone(), bI.clone(), mI));
    tests.put("test2d", () -> test2d(aI.clone(), bI.clone(), mI));
    tests.put("test3a", () -> test3a(aS.clone(), bS.clone(), mS));
    tests.put("test4a", () -> test4a(aS.clone(), bS.clone()));
    tests.put("test4b", () -> test4b(aS.clone(), bS.clone()));
    tests.put("test4c", () -> test4c(aS.clone(), bS.clone()));
    tests.put("test4d", () -> test4d(aS.clone(), bS.clone()));
    tests.put("test4e", () -> test4e(aS.clone(), bS.clone()));
    tests.put("test4f", () -> test4f(aS.clone(), bS.clone()));
    tests.put("test4g", () -> test4g(aS.clone(), bS.clone()));
    tests.put("test5a", () -> test5a(aS.clone(), bS.clone(), mS));
    tests.put("test6a", () -> test6a(aI.clone(), bI.clone()));
    tests.put("test7a", () -> test7a(aI.clone(), bI.clone()));

    // Gold value of every test: the result of its first run, before compilation
    tests.forEach((testName, testFn) -> golds.put(testName, testFn.run()));
}
129
130 @Warmup(100)
131 @Run(test = {"test0",
132 "test1a",
133 "test1b",
134 "test1c",
135 "test1d",
136 "test2a",
137 "test2b",
138 "test2c",
139 "test2d",
140 "test3a",
141 "test4a",
142 "test4b",
143 "test4c",
144 "test4d",
145 "test4e",
146 "test4f",
147 "test4g",
148 "test5a",
149 "test6a",
150 "test7a"})
151 public void runTests() {
152 for (Map.Entry<String,TestFunction> entry : tests.entrySet()) {
153 String name = entry.getKey();
154 TestFunction test = entry.getValue();
155 // Recall gold value from before compilation
156 Object[] gold = golds.get(name);
157 // Compute new result
158 Object[] result = test.run();
159 // Compare gold and new result
160 verify(name, gold, result);
161 }
162 }
163
164 static byte[] generateB() {
165 byte[] a = new byte[RANGE];
166 for (int i = 0; i < a.length; i++) {
167 a[i] = (byte)RANDOM.nextInt();
168 }
169 return a;
170 }
171
172 static short[] generateS() {
173 short[] a = new short[RANGE];
174 for (int i = 0; i < a.length; i++) {
175 a[i] = (short)RANDOM.nextInt();
176 }
177 return a;
178 }
179
180 static int[] generateI() {
181 int[] a = new int[RANGE];
182 for (int i = 0; i < a.length; i++) {
183 a[i] = RANDOM.nextInt();
184 }
185 return a;
186 }
187
188 static long[] generateL() {
189 long[] a = new long[RANGE];
190 for (int i = 0; i < a.length; i++) {
191 a[i] = RANDOM.nextLong();
192 }
193 return a;
194 }
195
196 static void verify(String name, Object[] gold, Object[] result) {
197 if (gold.length != result.length) {
198 throw new RuntimeException("verify " + name + ": not the same number of outputs: gold.length = " +
199 gold.length + ", result.length = " + result.length);
200 }
201 for (int i = 0; i < gold.length; i++) {
202 Object g = gold[i];
203 Object r = result[i];
204 if (g.getClass() != r.getClass() || !g.getClass().isArray() || !r.getClass().isArray()) {
205 throw new RuntimeException("verify " + name + ": must both be array of same type:" +
206 " gold[" + i + "].getClass() = " + g.getClass().getSimpleName() +
207 " result[" + i + "].getClass() = " + r.getClass().getSimpleName());
208 }
209 if (g == r) {
210 throw new RuntimeException("verify " + name + ": should be two separate arrays (with identical content):" +
211 " gold[" + i + "] == result[" + i + "]");
212 }
213 if (Array.getLength(g) != Array.getLength(r)) {
214 throw new RuntimeException("verify " + name + ": arrays must have same length:" +
215 " gold[" + i + "].length = " + Array.getLength(g) +
216 " result[" + i + "].length = " + Array.getLength(r));
217 }
218 Class c = g.getClass().getComponentType();
219 if (c == byte.class) {
220 verifyB(name, i, (byte[])g, (byte[])r);
221 } else if (c == short.class) {
222 verifyS(name, i, (short[])g, (short[])r);
223 } else if (c == int.class) {
224 verifyI(name, i, (int[])g, (int[])r);
225 } else if (c == long.class) {
226 verifyL(name, i, (long[])g, (long[])r);
227 } else {
228 throw new RuntimeException("verify " + name + ": array type not supported for verify:" +
229 " gold[" + i + "].getClass() = " + g.getClass().getSimpleName() +
230 " result[" + i + "].getClass() = " + r.getClass().getSimpleName());
231 }
232 }
233 }
234
235 static void verifyB(String name, int i, byte[] g, byte[] r) {
236 for (int j = 0; j < g.length; j++) {
237 if (g[j] != r[j]) {
238 throw new RuntimeException("verify " + name + ": arrays must have same content:" +
239 " gold[" + i + "][" + j + "] = " + g[j] +
240 " result[" + i + "][" + j + "] = " + r[j]);
241 }
242 }
243 }
244
245 static void verifyS(String name, int i, short[] g, short[] r) {
246 for (int j = 0; j < g.length; j++) {
247 if (g[j] != r[j]) {
248 throw new RuntimeException("verify " + name + ": arrays must have same content:" +
249 " gold[" + i + "][" + j + "] = " + g[j] +
250 " result[" + i + "][" + j + "] = " + r[j]);
251 }
252 }
253 }
254
255 static void verifyI(String name, int i, int[] g, int[] r) {
256 for (int j = 0; j < g.length; j++) {
257 if (g[j] != r[j]) {
258 throw new RuntimeException("verify " + name + ": arrays must have same content:" +
259 " gold[" + i + "][" + j + "] = " + g[j] +
260 " result[" + i + "][" + j + "] = " + r[j]);
261 }
262 }
263 }
264
265 static void verifyL(String name, int i, long[] g, long[] r) {
266 for (int j = 0; j < g.length; j++) {
267 if (g[j] != r[j]) {
268 throw new RuntimeException("verify " + name + ": arrays must have same content:" +
269 " gold[" + i + "][" + j + "] = " + g[j] +
270 " result[" + i + "][" + j + "] = " + r[j]);
271 }
272 }
273 }
274
275 @Test
276 @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0",
277 IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
278 IRNode.AND_VI, IRNode.VECTOR_SIZE_2, "> 0",
279 IRNode.AND_VI, IRNode.VECTOR_SIZE_4, "> 0",
280 IRNode.STORE_VECTOR, "> 0"},
281 applyIfAnd = {"MaxVectorSize", ">=32", "AlignVector", "false"},
282 applyIfPlatform = {"64-bit", "true"},
283 applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"})
284 @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0",
285 IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
286 IRNode.AND_VI, IRNode.VECTOR_SIZE_2, "> 0",
287 IRNode.AND_VI, IRNode.VECTOR_SIZE_4, "> 0",
288 IRNode.STORE_VECTOR, "> 0"},
289 applyIfAnd = {"MaxVectorSize", ">=32", "UseCompactObjectHeaders", "false"},
290 applyIfPlatform = {"64-bit", "true"},
291 applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"})
// Both IR rules list the same node counts (2- and 4-element int load and AND
// vectors, plus a vector store). They differ only in their flag condition:
// AlignVector=false for the first, UseCompactObjectHeaders=false for the second.
292 // Load and store are already split: elements 2 and 3 of each group of 8 are
// neither read from a nor written to b.
293 //
294 //  0 1 - - 4 5 6 7
295 //  | |     | | | |
296 //  0 1 - - 4 5 6 7
// Returns the two arrays a and b.
297 static Object[] test0(int[] a, int[] b, int mask) {
298 for (int i = 0; i < RANGE; i+=8) {
299 int b0 = a[i+0] & mask;
300 int b1 = a[i+1] & mask;
301
302 int b4 = a[i+4] & mask;
303 int b5 = a[i+5] & mask;
304 int b6 = a[i+6] & mask;
305 int b7 = a[i+7] & mask;
306
307 b[i+0] = b0;
308 b[i+1] = b1;
309
310 b[i+4] = b4;
311 b[i+5] = b5;
312 b[i+6] = b6;
313 b[i+7] = b7;
314 // With AlignVector, we need 8-byte alignment of vector loads/stores.
315 // UseCompactObjectHeaders=false                 UseCompactObjectHeaders=true
316 // adr = base + 16 + 32*i  ->  always            adr = base + 12 + 32*i  ->  never
317 // -> vectorize                                  -> no vectorization
318 }
319 return new Object[]{ a, b };
320 }
321
322 @Test
323 @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0",
324 IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
325 IRNode.ADD_VI, IRNode.VECTOR_SIZE_4, "> 0",
326 IRNode.MUL_VI, IRNode.VECTOR_SIZE_2, "> 0",
327 IRNode.STORE_VECTOR, "> 0"},
328 applyIfAnd = {"MaxVectorSize", ">=32", "AlignVector", "false"},
329 applyIfPlatform = {"64-bit", "true"},
330 applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"})
331 @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0",
332 IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
333 IRNode.ADD_VI, IRNode.VECTOR_SIZE_4, "> 0",
334 IRNode.MUL_VI, IRNode.VECTOR_SIZE_2, "> 0",
335 IRNode.STORE_VECTOR, "> 0"},
336 applyIfAnd = {"MaxVectorSize", ">=32", "UseCompactObjectHeaders", "false"},
337 applyIfPlatform = {"64-bit", "true"},
338 applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"})
339 // Adjacent Load and Store, but split by Add/Mul:
// elements 0..3 of each group of 8 use Add (IR rules expect a 4-element ADD_VI),
// elements 4..5 use Mul (IR rules expect a 2-element MUL_VI).
// Returns the two arrays a and b.
340 static Object[] test1a(int[] a, int[] b, int mask) {
341 for (int i = 0; i < RANGE; i+=8) {
342 b[i+0] = a[i+0] + mask; // Add
343 b[i+1] = a[i+1] + mask;
344 b[i+2] = a[i+2] + mask;
345 b[i+3] = a[i+3] + mask;
346
347 b[i+4] = a[i+4] * mask; // Mul
348 b[i+5] = a[i+5] * mask;
349 // With AlignVector, we need 8-byte alignment of vector loads/stores.
350 // UseCompactObjectHeaders=false                 UseCompactObjectHeaders=true
351 // adr = base + 16 + 32*i  ->  always            adr = base + 12 + 32*i  ->  never
352 // -> vectorize                                  -> no vectorization
353 }
354 return new Object[]{ a, b };
355 }
356
357 @Test
358 @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0",
359 IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
360 IRNode.ADD_VI, IRNode.VECTOR_SIZE_2, "> 0",
361 IRNode.MUL_VI, IRNode.VECTOR_SIZE_4, "> 0",
362 IRNode.STORE_VECTOR, "> 0"},
363 applyIfAnd = {"MaxVectorSize", ">=32", "AlignVector", "false"},
364 applyIfPlatform = {"64-bit", "true"},
365 applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"})
366 @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0",
367 IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
368 IRNode.ADD_VI, IRNode.VECTOR_SIZE_2, "> 0",
369 IRNode.MUL_VI, IRNode.VECTOR_SIZE_4, "> 0",
370 IRNode.STORE_VECTOR, "> 0"},
371 applyIfAnd = {"MaxVectorSize", ">=32", "UseCompactObjectHeaders", "false"},
372 applyIfPlatform = {"64-bit", "true"},
373 applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"})
374 // Adjacent Load and Store, but split by Add/Mul:
// elements 0..3 of each group of 8 use Mul (IR rules expect a 4-element MUL_VI),
// elements 4..5 use Add (IR rules expect a 2-element ADD_VI).
// Returns the two arrays a and b.
375 static Object[] test1b(int[] a, int[] b, int mask) {
376 for (int i = 0; i < RANGE; i+=8) {
377 b[i+0] = a[i+0] * mask; // Mul
378 b[i+1] = a[i+1] * mask;
379 b[i+2] = a[i+2] * mask;
380 b[i+3] = a[i+3] * mask;
381
382 b[i+4] = a[i+4] + mask; // Add
383 b[i+5] = a[i+5] + mask;
384 // With AlignVector, we need 8-byte alignment of vector loads/stores.
385 // UseCompactObjectHeaders=false                 UseCompactObjectHeaders=true
386 // adr = base + 16 + 32*i  ->  always            adr = base + 12 + 32*i  ->  never
387 // -> vectorize                                  -> no vectorization
388 }
389 return new Object[]{ a, b };
390 }
391
392 @Test
393 @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0",
394 IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
395 IRNode.ADD_VI, IRNode.VECTOR_SIZE_2, "> 0",
396 IRNode.MUL_VI, IRNode.VECTOR_SIZE_4, "> 0",
397 IRNode.STORE_VECTOR, "> 0"},
398 applyIfAnd = {"MaxVectorSize", ">=32", "AlignVector", "false"},
399 applyIfPlatform = {"64-bit", "true"},
400 applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true", "rvv", "true"})
401 @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0",
402 IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
403 IRNode.ADD_VI, IRNode.VECTOR_SIZE_2, "> 0",
404 IRNode.MUL_VI, IRNode.VECTOR_SIZE_4, "> 0",
405 IRNode.STORE_VECTOR, "> 0"},
406 applyIfAnd = {"MaxVectorSize", ">=32", "UseCompactObjectHeaders", "false"},
407 applyIfPlatform = {"64-bit", "true"},
408 applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true", "rvv", "true"})
409 // Adjacent Load and Store, but split by Add/Mul:
// elements 0..1 of each group of 8 use Add (IR rules expect a 2-element ADD_VI),
// elements 2..5 use Mul (IR rules expect a 4-element MUL_VI).
// Returns the two arrays a and b.
410 static Object[] test1c(int[] a, int[] b, int mask) {
411 for (int i = 0; i < RANGE; i+=8) {
412 b[i+0] = a[i+0] + mask; // Add
413 b[i+1] = a[i+1] + mask;
414
415 b[i+2] = a[i+2] * mask; // Mul
416 b[i+3] = a[i+3] * mask;
417 b[i+4] = a[i+4] * mask;
418 b[i+5] = a[i+5] * mask;
419 // With AlignVector, we need 8-byte alignment of vector loads/stores.
420 // UseCompactObjectHeaders=false                 UseCompactObjectHeaders=true
421 // adr = base + 16 + 32*i  ->  always            adr = base + 12 + 32*i  ->  never
422 // -> vectorize                                  -> no vectorization
423 }
424 return new Object[]{ a, b };
425 }
426
427 @Test
428 @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0",
429 IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
430 IRNode.ADD_VI, IRNode.VECTOR_SIZE_4, "> 0",
431 IRNode.MUL_VI, IRNode.VECTOR_SIZE_2, "> 0",
432 IRNode.STORE_VECTOR, "> 0"},
433 applyIfAnd = {"MaxVectorSize", ">=32", "AlignVector", "false"},
434 applyIfPlatform = {"64-bit", "true"},
435 applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true", "rvv", "true"})
436 @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0",
437 IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
438 IRNode.ADD_VI, IRNode.VECTOR_SIZE_4, "> 0",
439 IRNode.MUL_VI, IRNode.VECTOR_SIZE_2, "> 0",
440 IRNode.STORE_VECTOR, "> 0"},
441 applyIfAnd = {"MaxVectorSize", ">=32", "UseCompactObjectHeaders", "false"},
442 applyIfPlatform = {"64-bit", "true"},
443 applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true", "rvv", "true"})
444 // Adjacent Load and Store, but split by Add/Mul:
// elements 0..1 of each group of 8 use Mul (IR rules expect a 2-element MUL_VI),
// elements 2..5 use Add (IR rules expect a 4-element ADD_VI).
// Returns the two arrays a and b.
445 static Object[] test1d(int[] a, int[] b, int mask) {
446 for (int i = 0; i < RANGE; i+=8) {
447 b[i+0] = a[i+0] * mask; // Mul
448 b[i+1] = a[i+1] * mask;
449
450 b[i+2] = a[i+2] + mask; // Add
451 b[i+3] = a[i+3] + mask;
452 b[i+4] = a[i+4] + mask;
453 b[i+5] = a[i+5] + mask;
454 // With AlignVector, we need 8-byte alignment of vector loads/stores.
455 // UseCompactObjectHeaders=false                 UseCompactObjectHeaders=true
456 // adr = base + 16 + 32*i  ->  always            adr = base + 12 + 32*i  ->  never
457 // -> vectorize                                  -> no vectorization
458 }
459 return new Object[]{ a, b };
460 }
461
462 @Test
463 @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0",
464 IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
465 IRNode.AND_VI, IRNode.VECTOR_SIZE_2, "> 0",
466 IRNode.AND_VI, IRNode.VECTOR_SIZE_4, "> 0",
467 IRNode.STORE_VECTOR, "> 0"},
468 applyIfAnd = {"MaxVectorSize", ">=32", "AlignVector", "false"},
469 applyIfPlatform = {"64-bit", "true"},
470 applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"})
471 @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0",
472 IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
473 IRNode.AND_VI, IRNode.VECTOR_SIZE_2, "> 0",
474 IRNode.AND_VI, IRNode.VECTOR_SIZE_4, "> 0",
475 IRNode.STORE_VECTOR, "> 0"},
476 applyIfAnd = {"MaxVectorSize", ">=32", "UseCompactObjectHeaders", "false"},
477 applyIfPlatform = {"64-bit", "true"},
478 applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"})
479 // Split the load: a[i+0..5] is read contiguously, but the values are stored
// to b[i+0..1] and b[i+4..7] (top row: load index, bottom row: store index).
480 //
481 //  0 1 2 3 4 5 - -
482 //  | |  \ \ \ \
483 //  | |   \ \ \ \
484 //  | |    \ \ \ \
485 //  0 1 - - 4 5 6 7
486 //
// Returns the two arrays a and b.
487 static Object[] test2a(int[] a, int[] b, int mask) {
488 for (int i = 0; i < RANGE; i+=8) {
489 int b0 = a[i+0] & mask;
490 int b1 = a[i+1] & mask;
491 int b2 = a[i+2] & mask;
492 int b3 = a[i+3] & mask;
493 int b4 = a[i+4] & mask;
494 int b5 = a[i+5] & mask;
495
496 b[i+0] = b0;
497 b[i+1] = b1;
498
499 b[i+4] = b2;
500 b[i+5] = b3;
501 b[i+6] = b4;
502 b[i+7] = b5;
503 // With AlignVector, we need 8-byte alignment of vector loads/stores.
504 // UseCompactObjectHeaders=false                 UseCompactObjectHeaders=true
505 // adr = base + 16 + 32*i  ->  always            adr = base + 12 + 32*i  ->  never
506 // -> vectorize                                  -> no vectorization
507 }
508 return new Object[]{ a, b };
509 }
510
511 @Test
512 @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0",
513 IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
514 IRNode.AND_VI, IRNode.VECTOR_SIZE_2, "> 0",
515 IRNode.AND_VI, IRNode.VECTOR_SIZE_4, "> 0",
516 IRNode.STORE_VECTOR, "> 0"},
517 applyIfAnd = {"MaxVectorSize", ">=32", "AlignVector", "false"},
518 applyIfPlatform = {"64-bit", "true"},
519 applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"})
520 @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0",
521 IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
522 IRNode.AND_VI, IRNode.VECTOR_SIZE_2, "> 0",
523 IRNode.AND_VI, IRNode.VECTOR_SIZE_4, "> 0",
524 IRNode.STORE_VECTOR, "> 0"},
525 applyIfAnd = {"MaxVectorSize", ">=32", "UseCompactObjectHeaders", "false"},
526 applyIfPlatform = {"64-bit", "true"},
527 applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"})
528 // Split the load: a[i+0..5] is read contiguously, but the values are stored
// to b[i+0..3] and b[i+6..7] (top row: load index, bottom row: store index).
529 //
530 //  0 1 2 3 4 5 - -
531 //  | | | |  \ \
532 //  | | | |   \ \
533 //  | | | |    \ \
534 //  0 1 2 3 - - 6 7
535 //
// Returns the two arrays a and b.
536 static Object[] test2b(int[] a, int[] b, int mask) {
537 for (int i = 0; i < RANGE; i+=8) {
538 int b0 = a[i+0] & mask;
539 int b1 = a[i+1] & mask;
540 int b2 = a[i+2] & mask;
541 int b3 = a[i+3] & mask;
542 int b4 = a[i+4] & mask;
543 int b5 = a[i+5] & mask;
544
545 b[i+0] = b0;
546 b[i+1] = b1;
547 b[i+2] = b2;
548 b[i+3] = b3;
549
550 b[i+6] = b4;
551 b[i+7] = b5;
552 // With AlignVector, we need 8-byte alignment of vector loads/stores.
553 // UseCompactObjectHeaders=false                 UseCompactObjectHeaders=true
554 // adr = base + 16 + 32*i  ->  always            adr = base + 12 + 32*i  ->  never
555 // -> vectorize                                  -> no vectorization
556 }
557 return new Object[]{ a, b };
558 }
559
560 @Test
561 @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0",
562 IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
563 IRNode.AND_VI, IRNode.VECTOR_SIZE_2, "> 0",
564 IRNode.AND_VI, IRNode.VECTOR_SIZE_4, "> 0",
565 IRNode.STORE_VECTOR, "> 0"},
566 applyIfAnd = {"MaxVectorSize", ">=32", "AlignVector", "false"},
567 applyIfPlatform = {"64-bit", "true"},
568 applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"})
569 @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0",
570 IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
571 IRNode.AND_VI, IRNode.VECTOR_SIZE_2, "> 0",
572 IRNode.AND_VI, IRNode.VECTOR_SIZE_4, "> 0",
573 IRNode.STORE_VECTOR, "> 0"},
574 applyIfAnd = {"MaxVectorSize", ">=32", "UseCompactObjectHeaders", "false"},
575 applyIfPlatform = {"64-bit", "true"},
576 applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"})
577 // Split the store: b[i+0..5] is written contiguously, but the values come
// from a[i+0..1] and a[i+4..7] (top row: load index, bottom row: store index).
578 //
579 //  0 1 - - 4 5 6 7
580 //  | |    / / / /
581 //  | |   / / / /
582 //  | |  / / / /
583 //  0 1 2 3 4 5 - -
584 //
// Returns the two arrays a and b.
585 static Object[] test2c(int[] a, int[] b, int mask) {
586 for (int i = 0; i < RANGE; i+=8) {
587 int b0 = a[i+0] & mask;
588 int b1 = a[i+1] & mask;
589
590 int b4 = a[i+4] & mask;
591 int b5 = a[i+5] & mask;
592 int b6 = a[i+6] & mask;
593 int b7 = a[i+7] & mask;
594
595 b[i+0] = b0;
596 b[i+1] = b1;
597 b[i+2] = b4;
598 b[i+3] = b5;
599 b[i+4] = b6;
600 b[i+5] = b7;
601 // With AlignVector, we need 8-byte alignment of vector loads/stores.
602 // UseCompactObjectHeaders=false                 UseCompactObjectHeaders=true
603 // adr = base + 16 + 32*i  ->  always            adr = base + 12 + 32*i  ->  never
604 // -> vectorize                                  -> no vectorization
605 }
606 return new Object[]{ a, b };
607 }
608
609 @Test
610 @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0",
611 IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
612 IRNode.AND_VI, IRNode.VECTOR_SIZE_2, "> 0",
613 IRNode.AND_VI, IRNode.VECTOR_SIZE_4, "> 0",
614 IRNode.STORE_VECTOR, "> 0"},
615 applyIfAnd = {"MaxVectorSize", ">=32", "AlignVector", "false"},
616 applyIfPlatform = {"64-bit", "true"},
617 applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"})
618 @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0",
619 IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
620 IRNode.AND_VI, IRNode.VECTOR_SIZE_2, "> 0",
621 IRNode.AND_VI, IRNode.VECTOR_SIZE_4, "> 0",
622 IRNode.STORE_VECTOR, "> 0"},
623 applyIfAnd = {"MaxVectorSize", ">=32", "UseCompactObjectHeaders", "false"},
624 applyIfPlatform = {"64-bit", "true"},
625 applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"})
626 // Split the store: b[i+0..5] is written contiguously, but the values come
// from a[i+0..3] and a[i+6..7] (top row: load index, bottom row: store index).
627 //
628 //  0 1 2 3 - - 6 7
629 //  | | | |    / /
630 //  | | | |   / /
631 //  | | | |  / /
632 //  0 1 2 3 4 5 - -
633 //
// Returns the two arrays a and b.
634 static Object[] test2d(int[] a, int[] b, int mask) {
635 for (int i = 0; i < RANGE; i+=8) {
636 int b0 = a[i+0] & mask;
637 int b1 = a[i+1] & mask;
638 int b2 = a[i+2] & mask;
639 int b3 = a[i+3] & mask;
640
641 int b6 = a[i+6] & mask;
642 int b7 = a[i+7] & mask;
643
644 b[i+0] = b0;
645 b[i+1] = b1;
646 b[i+2] = b2;
647 b[i+3] = b3;
648 b[i+4] = b6;
649 b[i+5] = b7;
650 // With AlignVector, we need 8-byte alignment of vector loads/stores.
651 // UseCompactObjectHeaders=false                 UseCompactObjectHeaders=true
652 // adr = base + 16 + 32*i  ->  always            adr = base + 12 + 32*i  ->  never
653 // -> vectorize                                  -> no vectorization
654 }
655 return new Object[]{ a, b };
656 }
657
658 @Test
659 @IR(counts = {IRNode.LOAD_VECTOR_S, IRNode.VECTOR_SIZE_4, "> 0",
660 IRNode.STORE_VECTOR, "> 0"},
661 applyIfAnd = {"MaxVectorSize", ">=32", "AlignVector", "false"},
662 applyIfPlatform = {"64-bit", "true"},
663 applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"})
664 @IR(counts = {IRNode.LOAD_VECTOR_S, IRNode.VECTOR_SIZE_4, "> 0",
665 IRNode.STORE_VECTOR, "> 0"},
666 applyIfAnd = {"MaxVectorSize", ">=32", "UseCompactObjectHeaders", "false"},
667 applyIfPlatform = {"64-bit", "true"},
668 applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"})
// Elements 4..7 are copied from a to b as a 4-pack. The loads of a[i+1..3]
// feed a scalar sum ("+"), and b[i+3] and b[i+8] receive the scalar val ("v"),
// so these neighbours have to be split off from the 4-pack.
// Top row: load index in a, bottom row: store index in b.
669 //  0 1 2 3 4 5 6 7 -
670 //  | | | | | | | |
671 //  | + + + | | | |
672 //  |       | | | |
673 //  |     v | | | | v
674 //  |     | | | | | |
675 //  0 - - 3 4 5 6 7 8
// Returns a, b and the sum wrapped in a one-element int array.
676 static Object[] test3a(short[] a, short[] b, short val) {
677 int sum = 0;
678 for (int i = 0; i < RANGE; i+=16) {
679 short a0 = a[i+0]; // required for alignment / offsets, technical limitation.
680
681 short a1 = a[i+1]; // adjacent to 4-pack, but need to be split off
682 short a2 = a[i+2];
683 short a3 = a[i+3];
684
685 short a4 = a[i+4]; // 4-pack
686 short a5 = a[i+5];
687 short a6 = a[i+6];
688 short a7 = a[i+7];
689
690
691 b[i+0] = a0; // required for alignment / offsets, technical limitation.
692
693 sum += a1 + a2 + a3; // not packed
694
695 b[i+3] = val; // adjacent to 4-pack but needs to be split off
696
697 b[i+4] = a4; // 4-pack
698 b[i+5] = a5;
699 b[i+6] = a6;
700 b[i+7] = a7;
701
702 b[i+8] = val; // adjacent to 4-pack but needs to be split off
703
704 // With AlignVector, we need 8-byte alignment of vector loads/stores.
705 // UseCompactObjectHeaders=false                 UseCompactObjectHeaders=true
706 // adr = base + 16 + 8 + 32*i  ->  always        adr = base + 12 + 8 + 32*i  ->  never
707 // -> vectorize                                  -> no vectorization
708 }
709 return new Object[]{ a, b, new int[]{ sum } };
710 }
711
712 @Test
713 @IR(counts = {IRNode.LOAD_VECTOR_S, IRNode.VECTOR_SIZE_2, "> 0",
714 IRNode.STORE_VECTOR, "> 0"},
715 applyIfPlatform = {"64-bit", "true"},
716 applyIfCPUFeatureOr = {"sse4.1", "true"})
// The IR rule expects 2-element short vector loads and a vector store. It has
// no AlignVector or MaxVectorSize condition and lists only the sse4.1 feature.
717 // Cyclic dependency with distance 2 -> split into 2-packs
// The store index is shifted by 2 relative to the load index; the loop bound
// RANGE-64 keeps the shifted index inside the array. Returns a and b.
718 static Object[] test4a(short[] a, short[] b) {
719 for (int i = 0; i < RANGE-64; i++) {
720 b[i+2] = a[i+0];
721 }
722 return new Object[]{ a, b };
723 }
724
725 @Test
726 @IR(counts = {IRNode.LOAD_VECTOR_S, IRNode.VECTOR_SIZE_2, "> 0",
727 IRNode.STORE_VECTOR, "> 0"},
728 applyIf = {"AlignVector", "false"},
729 applyIfPlatform = {"64-bit", "true"},
730 applyIfCPUFeatureOr = {"sse4.1", "true"})
// The IR rule expects 2-element short vector loads and a vector store. It
// applies only with AlignVector=false and lists only the sse4.1 feature.
731 // Cyclic dependency with distance 3 -> split into 2-packs
// The store index is shifted by 3 relative to the load index; the loop bound
// RANGE-64 keeps the shifted index inside the array. Returns a and b.
732 static Object[] test4b(short[] a, short[] b) {
733 for (int i = 0; i < RANGE-64; i++) {
734 b[i+3] = a[i+0];
735 }
736 return new Object[]{ a, b };
737 }
738
739 @Test
740 @IR(counts = {IRNode.LOAD_VECTOR_S, IRNode.VECTOR_SIZE_4, "> 0",
741 IRNode.STORE_VECTOR, "> 0"},
742 applyIf = {"MaxVectorSize", ">=8"},
743 applyIfPlatform = {"64-bit", "true"},
744 applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"})
// The IR rule expects 4-element short vector loads and a vector store. It
// requires MaxVectorSize >= 8 and has no AlignVector condition.
745 // Cyclic dependency with distance 4 -> split into 4-packs
// The store index is shifted by 4 relative to the load index; the loop bound
// RANGE-64 keeps the shifted index inside the array. Returns a and b.
746 static Object[] test4c(short[] a, short[] b) {
747 for (int i = 0; i < RANGE-64; i++) {
748 b[i+4] = a[i+0];
749 }
750 return new Object[]{ a, b };
751 }
752
753 @Test
754 @IR(counts = {IRNode.LOAD_VECTOR_S, IRNode.VECTOR_SIZE_4, "> 0",
755 IRNode.STORE_VECTOR, "> 0"},
756 applyIfAnd = {"MaxVectorSize", ">=8", "AlignVector", "false"},
757 applyIfPlatform = {"64-bit", "true"},
758 applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"})
// The IR rule expects 4-element short vector loads and a vector store. It
// requires MaxVectorSize >= 8 and AlignVector=false.
759 // Cyclic dependency with distance 5 -> split into 4-packs
// The store index is shifted by 5 relative to the load index; the loop bound
// RANGE-64 keeps the shifted index inside the array. Returns a and b.
760 static Object[] test4d(short[] a, short[] b) {
761 for (int i = 0; i < RANGE-64; i++) {
762 b[i+5] = a[i+0];
763 }
764 return new Object[]{ a, b };
765 }
766
767 @Test
768 @IR(counts = {IRNode.LOAD_VECTOR_S, IRNode.VECTOR_SIZE_4, "> 0",
769 IRNode.STORE_VECTOR, "> 0"},
770 applyIfAnd = {"MaxVectorSize", ">=8", "AlignVector", "false"},
771 applyIfPlatform = {"64-bit", "true"},
772 applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"})
// The IR rule expects 4-element short vector loads and a vector store. It
// requires MaxVectorSize >= 8 and AlignVector=false.
773 // Cyclic dependency with distance 6 -> split into 4-packs
// The store index is shifted by 6 relative to the load index; the loop bound
// RANGE-64 keeps the shifted index inside the array. Returns a and b.
774 static Object[] test4e(short[] a, short[] b) {
775 for (int i = 0; i < RANGE-64; i++) {
776 b[i+6] = a[i+0];
777 }
778 return new Object[]{ a, b };
779 }
780
781 @Test
782 @IR(counts = {IRNode.LOAD_VECTOR_S, IRNode.VECTOR_SIZE_4, "> 0",
783 IRNode.STORE_VECTOR, "> 0"},
784 applyIfAnd = {"MaxVectorSize", ">=8", "AlignVector", "false"},
785 applyIfPlatform = {"64-bit", "true"},
786 applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"})
// The IR rule expects 4-element short vector loads and a vector store. It
// requires MaxVectorSize >= 8 and AlignVector=false.
787 // Cyclic dependency with distance 7 -> split into 4-packs
// The store index is shifted by 7 relative to the load index; the loop bound
// RANGE-64 keeps the shifted index inside the array. Returns a and b.
788 static Object[] test4f(short[] a, short[] b) {
789 for (int i = 0; i < RANGE-64; i++) {
790 b[i+7] = a[i+0];
791 }
792 return new Object[]{ a, b };
793 }
794
795 @Test
796 @IR(counts = {IRNode.LOAD_VECTOR_S, IRNode.VECTOR_SIZE_8, "> 0",
797 IRNode.STORE_VECTOR, "> 0"},
798 applyIf = {"MaxVectorSize", ">=32"},
799 applyIfPlatform = {"64-bit", "true"},
800 applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"})
// The IR rule expects 8-element short vector loads and a vector store. It
// requires MaxVectorSize >= 32 and has no AlignVector condition.
801 // Cyclic dependency with distance 8 -> split into 8-packs
// The store index is shifted by 8 relative to the load index; the loop bound
// RANGE-64 keeps the shifted index inside the array. Returns a and b.
802 static Object[] test4g(short[] a, short[] b) {
803 for (int i = 0; i < RANGE-64; i++) {
804 b[i+8] = a[i+0];
805 }
806 return new Object[]{ a, b };
807 }
808
809 @Test
810 @IR(counts = {IRNode.LOAD_VECTOR_S, IRNode.VECTOR_SIZE_2, "> 0",
811 IRNode.LOAD_VECTOR_S, IRNode.VECTOR_SIZE_4, "> 0",
812 IRNode.LOAD_VECTOR_S, IRNode.VECTOR_SIZE_8, "> 0",
813 IRNode.ADD_VS, IRNode.VECTOR_SIZE_2, "> 0",
814 IRNode.ADD_VS, IRNode.VECTOR_SIZE_8, "> 0",
815 IRNode.ADD_VS, IRNode.VECTOR_SIZE_4, "> 0",
816 IRNode.STORE_VECTOR, "> 0"},
817 applyIfAnd = {"MaxVectorSize", ">=32", "AlignVector", "false"},
818 applyIfPlatform = {"64-bit", "true"},
819 applyIfCPUFeature = {"sse4.1", "true"})
820 // aarch64 limits minimum vector size to 8B, thus a vector size of
821 // length 2 for type "short" will not be generated
// (hence the second rule, which applies on sve, lists only the 4- and 8-element vectors).
822 @IR(counts = {IRNode.LOAD_VECTOR_S, IRNode.VECTOR_SIZE_4, "> 0",
823 IRNode.LOAD_VECTOR_S, IRNode.VECTOR_SIZE_8, "> 0",
824 IRNode.ADD_VS, IRNode.VECTOR_SIZE_8, "> 0",
825 IRNode.ADD_VS, IRNode.VECTOR_SIZE_4, "> 0",
826 IRNode.STORE_VECTOR, "> 0"},
827 applyIfAnd = {"MaxVectorSize", ">=32", "AlignVector", "false"},
828 applyIfPlatform = {"64-bit", "true"},
829 applyIfCPUFeature = {"sve", "true"})
830 // Split pack into power-of-2 sizes
// Each iteration processes indices 0..14 of a group of 16 shorts, i.e. 15
// adjacent elements = 8 + 4 + 2 + 1; index 15 is not touched. Returns a and b.
831 static Object[] test5a(short[] a, short[] b, short val) {
832 for (int i = 0; i < RANGE; i+=16) {
833 b[i+ 0] = (short)(a[i+ 0] + val); // 8 pack
834 b[i+ 1] = (short)(a[i+ 1] + val);
835 b[i+ 2] = (short)(a[i+ 2] + val);
836 b[i+ 3] = (short)(a[i+ 3] + val);
837 b[i+ 4] = (short)(a[i+ 4] + val);
838 b[i+ 5] = (short)(a[i+ 5] + val);
839 b[i+ 6] = (short)(a[i+ 6] + val);
840 b[i+ 7] = (short)(a[i+ 7] + val);
841
842 b[i+ 8] = (short)(a[i+ 8] + val); // 4-pack
843 b[i+ 9] = (short)(a[i+ 9] + val);
844 b[i+10] = (short)(a[i+10] + val);
845 b[i+11] = (short)(a[i+11] + val);
846
847 b[i+12] = (short)(a[i+12] + val); // 2-pack
848 b[i+13] = (short)(a[i+13] + val);
849
850 b[i+14] = (short)(a[i+14] + val); // single remaining element
851 }
852 return new Object[]{ a, b };
853 }
854
855 @Test
856 @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
857 IRNode.MUL_VI, IRNode.VECTOR_SIZE_4, "> 0",
858 IRNode.AND_VI, IRNode.VECTOR_SIZE_4, "> 0",
859 IRNode.ADD_VI, IRNode.VECTOR_SIZE_4, "> 0", // reduction moved out of loop
860 IRNode.ADD_REDUCTION_V, "> 0"},
861 applyIfAnd = {"MaxVectorSize", ">=32", "AlignVector", "false"},
862 applyIfPlatform = {"64-bit", "true"},
863 applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"})
864 @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
865 IRNode.MUL_VI, IRNode.VECTOR_SIZE_4, "> 0",
866 IRNode.AND_VI, IRNode.VECTOR_SIZE_4, "> 0",
867 IRNode.ADD_VI, IRNode.VECTOR_SIZE_4, "> 0", // reduction moved out of loop
868 IRNode.ADD_REDUCTION_V, "> 0"},
869 applyIfAnd = {"MaxVectorSize", ">=32", "UseCompactObjectHeaders", "false"},
870 applyIfPlatform = {"64-bit", "true"},
871 applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"})
872 // Split packs including reductions
// All eight terms of an iteration are accumulated into s: elements 0..3
// contribute a[k] * b[k], elements 4..7 contribute a[k] & b[k]. Neither array
// is written. Returns a, b and s wrapped in a one-element int array.
873 static Object[] test6a(int[] a, int[] b) {
874 int s = 0;
875 for (int i = 0; i < RANGE; i+=8) {
876 s += a[i+0] * b[i+0];
877 s += a[i+1] * b[i+1];
878 s += a[i+2] * b[i+2];
879 s += a[i+3] * b[i+3];
880
881 s += a[i+4] & b[i+4];
882 s += a[i+5] & b[i+5];
883 s += a[i+6] & b[i+6];
884 s += a[i+7] & b[i+7];
885 // With AlignVector, we need 8-byte alignment of vector loads/stores.
886 // UseCompactObjectHeaders=false                 UseCompactObjectHeaders=true
887 // adr = base + 16 + 32*i  ->  always            adr = base + 12 + 32*i  ->  never
888 // -> vectorize                                  -> no vectorization
889 }
890 return new Object[]{ a, b, new int[]{ s } };
891 }
892
893 @Test
894 @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0",
895 IRNode.MUL_VI, "> 0",
896 IRNode.POPULATE_INDEX, "> 0"},
897 applyIfPlatform = {"64-bit", "true"},
898 applyIfCPUFeatureOr = {"avx2", "true", "sve", "true", "rvv", "true"})
// The IR rule expects an int vector load, a vector multiply and a
// POPULATE_INDEX node, without constraining the vector size.
899 // Index Populate:
900 // There can be an issue when all the (iv + 1), (iv + 2), ...
901 // get packed, but not (iv). Then we have a pack that is one element
902 // too short, and we start splitting everything in a bad way.
// The loop multiplies each b[i] by the loop index i itself and stores the
// product in a[i]. Returns a and b.
903 static Object[] test7a(int[] a, int[] b) {
904 for (int i = 0; i < RANGE; i++) {
905 a[i] = b[i] * i;
906 }
907 return new Object[]{ a, b };
908 }
909 }