1 /* 2 * Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. 8 * 9 * This code is distributed in the hope that it will be useful, but WITHOUT 10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 12 * version 2 for more details (a copy is included in the LICENSE file that 13 * accompanied this code). 14 * 15 * You should have received a copy of the GNU General Public License version 16 * 2 along with this work; if not, write to the Free Software Foundation, 17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 18 * 19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 20 * or visit www.oracle.com if you need additional information or have any 21 * questions. 22 */ 23 24 package compiler.loopopts.superword; 25 26 import compiler.lib.ir_framework.*; 27 import jdk.test.lib.Utils; 28 import jdk.test.whitebox.WhiteBox; 29 import java.lang.reflect.Array; 30 import java.util.Map; 31 import java.util.HashMap; 32 import java.util.Random; 33 import java.nio.ByteOrder; 34 35 /* 36 * @test 37 * @bug 8326139 38 * @summary Test splitting packs in SuperWord 39 * @library /test/lib / 40 * @run driver compiler.loopopts.superword.TestSplitPacks nCOH_nAV 41 * @run driver compiler.loopopts.superword.TestSplitPacks nCOH_yAV 42 * @run driver compiler.loopopts.superword.TestSplitPacks yCOH_nAV 43 * @run driver compiler.loopopts.superword.TestSplitPacks yCOH_yAV 44 */ 45 46 public class TestSplitPacks { 47 static int RANGE = 1024*8; 48 static int RANGE_FINAL = 1024*8; 49 private static final Random RANDOM = Utils.getRandomInstance(); 50 51 // Inputs 52 byte[] aB; 53 byte[] bB; 54 byte mB = (byte)31; 55 short[] aS; 56 short[] bS; 57 short mS = (short)0xF0F0; 58 int[] aI; 59 int[] bI; 60 int mI = 0xF0F0F0F0; 61 long[] aL; 62 long[] bL; 63 long mL = 0xF0F0F0F0F0F0F0F0L; 64 65 // List of tests 66 Map<String,TestFunction> tests = new HashMap<String,TestFunction>(); 67 68 // List of gold, the results from the first run before compilation 69 Map<String,Object[]> golds = new HashMap<String,Object[]>(); 70 71 interface TestFunction { 72 Object[] run(); 73 } 74 75 public static void main(String[] args) { 76 TestFramework framework = new TestFramework(TestSplitPacks.class); 77 framework.addFlags("-XX:+IgnoreUnrecognizedVMOptions", "-XX:LoopUnrollLimit=1000"); 78 switch (args[0]) { 79 case "nCOH_nAV" -> { framework.addFlags("-XX:+UnlockExperimentalVMOptions", "-XX:-UseCompactObjectHeaders", "-XX:-AlignVector"); } 80 case "nCOH_yAV" -> { framework.addFlags("-XX:+UnlockExperimentalVMOptions", "-XX:-UseCompactObjectHeaders", "-XX:+AlignVector"); } 81 case "yCOH_nAV" -> { framework.addFlags("-XX:+UnlockExperimentalVMOptions", "-XX:+UseCompactObjectHeaders", "-XX:-AlignVector"); } 82 case "yCOH_yAV" -> { framework.addFlags("-XX:+UnlockExperimentalVMOptions", "-XX:+UseCompactObjectHeaders", "-XX:+AlignVector"); } 83 default -> { throw new RuntimeException("Test argument not recognized: " + args[0]); } 84 }; 85 framework.start(); 86 } 87 88 public TestSplitPacks() { 89 // Generate input once 90 aB = generateB(); 91 bB = generateB(); 92 aS = generateS(); 93 bS = generateS(); 94 aI = generateI(); 95 bI = generateI(); 96 aL = generateL(); 97 bL = generateL(); 98 99 // Add all tests to list 100 tests.put("test0", () -> { return test0(aI.clone(), bI.clone(), mI); }); 101 tests.put("test1a", () -> { return test1a(aI.clone(), bI.clone(), mI); }); 102 tests.put("test1b", () -> { return test1b(aI.clone(), bI.clone(), mI); }); 103 tests.put("test1c", () -> { return test1c(aI.clone(), bI.clone(), mI); }); 104 tests.put("test1d", () -> { return test1d(aI.clone(), bI.clone(), mI); }); 105 tests.put("test2a", () -> { return test2a(aI.clone(), bI.clone(), mI); }); 106 tests.put("test2b", () -> { return test2b(aI.clone(), bI.clone(), mI); }); 107 tests.put("test2c", () -> { return test2c(aI.clone(), bI.clone(), mI); }); 108 tests.put("test2d", () -> { return test2d(aI.clone(), bI.clone(), mI); }); 109 tests.put("test3a", () -> { return test3a(aS.clone(), bS.clone(), mS); }); 110 tests.put("test4a", () -> { return test4a(aS.clone(), bS.clone()); }); 111 tests.put("test4b", () -> { return test4b(aS.clone(), bS.clone()); }); 112 tests.put("test4c", () -> { return test4c(aS.clone(), bS.clone()); }); 113 tests.put("test4d", () -> { return test4d(aS.clone(), bS.clone()); }); 114 tests.put("test4e", () -> { return test4e(aS.clone(), bS.clone()); }); 115 tests.put("test4f", () -> { return test4f(aS.clone(), bS.clone()); }); 116 tests.put("test4g", () -> { return test4g(aS.clone(), bS.clone()); }); 117 tests.put("test5a", () -> { return test5a(aS.clone(), bS.clone(), mS); }); 118 tests.put("test6a", () -> { return test6a(aI.clone(), bI.clone()); }); 119 tests.put("test7a", () -> { return test7a(aI.clone(), bI.clone()); }); 120 121 // Compute gold value for all test methods before compilation 122 for (Map.Entry<String,TestFunction> entry : tests.entrySet()) { 123 String name = entry.getKey(); 124 TestFunction test = entry.getValue(); 125 Object[] gold = test.run(); 126 golds.put(name, gold); 127 } 128 } 129 130 @Warmup(100) 131 @Run(test = {"test0", 132 "test1a", 133 "test1b", 134 "test1c", 135 "test1d", 136 "test2a", 137 "test2b", 138 "test2c", 139 "test2d", 140 "test3a", 141 "test4a", 142 "test4b", 143 "test4c", 144 "test4d", 145 "test4e", 146 "test4f", 147 "test4g", 148 "test5a", 149 "test6a", 150 "test7a"}) 151 public void runTests() { 152 for (Map.Entry<String,TestFunction> entry : tests.entrySet()) { 153 String name = entry.getKey(); 154 TestFunction test = entry.getValue(); 155 // Recall gold value from before compilation 156 Object[] gold = golds.get(name); 157 // Compute new result 158 Object[] result = test.run(); 159 // Compare gold and new result 160 verify(name, gold, result); 161 } 162 } 163 164 static byte[] generateB() { 165 byte[] a = new byte[RANGE]; 166 for (int i = 0; i < a.length; i++) { 167 a[i] = (byte)RANDOM.nextInt(); 168 } 169 return a; 170 } 171 172 static short[] generateS() { 173 short[] a = new short[RANGE]; 174 for (int i = 0; i < a.length; i++) { 175 a[i] = (short)RANDOM.nextInt(); 176 } 177 return a; 178 } 179 180 static int[] generateI() { 181 int[] a = new int[RANGE]; 182 for (int i = 0; i < a.length; i++) { 183 a[i] = RANDOM.nextInt(); 184 } 185 return a; 186 } 187 188 static long[] generateL() { 189 long[] a = new long[RANGE]; 190 for (int i = 0; i < a.length; i++) { 191 a[i] = RANDOM.nextLong(); 192 } 193 return a; 194 } 195 196 static void verify(String name, Object[] gold, Object[] result) { 197 if (gold.length != result.length) { 198 throw new RuntimeException("verify " + name + ": not the same number of outputs: gold.length = " + 199 gold.length + ", result.length = " + result.length); 200 } 201 for (int i = 0; i < gold.length; i++) { 202 Object g = gold[i]; 203 Object r = result[i]; 204 if (g.getClass() != r.getClass() || !g.getClass().isArray() || !r.getClass().isArray()) { 205 throw new RuntimeException("verify " + name + ": must both be array of same type:" + 206 " gold[" + i + "].getClass() = " + g.getClass().getSimpleName() + 207 " result[" + i + "].getClass() = " + r.getClass().getSimpleName()); 208 } 209 if (g == r) { 210 throw new RuntimeException("verify " + name + ": should be two separate arrays (with identical content):" + 211 " gold[" + i + "] == result[" + i + "]"); 212 } 213 if (Array.getLength(g) != Array.getLength(r)) { 214 throw new RuntimeException("verify " + name + ": arrays must have same length:" + 215 " gold[" + i + "].length = " + Array.getLength(g) + 216 " result[" + i + "].length = " + Array.getLength(r)); 217 } 218 Class c = g.getClass().getComponentType(); 219 if (c == byte.class) { 220 verifyB(name, i, (byte[])g, (byte[])r); 221 } else if (c == short.class) { 222 verifyS(name, i, (short[])g, (short[])r); 223 } else if (c == int.class) { 224 verifyI(name, i, (int[])g, (int[])r); 225 } else if (c == long.class) { 226 verifyL(name, i, (long[])g, (long[])r); 227 } else { 228 throw new RuntimeException("verify " + name + ": array type not supported for verify:" + 229 " gold[" + i + "].getClass() = " + g.getClass().getSimpleName() + 230 " result[" + i + "].getClass() = " + r.getClass().getSimpleName()); 231 } 232 } 233 } 234 235 static void verifyB(String name, int i, byte[] g, byte[] r) { 236 for (int j = 0; j < g.length; j++) { 237 if (g[j] != r[j]) { 238 throw new RuntimeException("verify " + name + ": arrays must have same content:" + 239 " gold[" + i + "][" + j + "] = " + g[j] + 240 " result[" + i + "][" + j + "] = " + r[j]); 241 } 242 } 243 } 244 245 static void verifyS(String name, int i, short[] g, short[] r) { 246 for (int j = 0; j < g.length; j++) { 247 if (g[j] != r[j]) { 248 throw new RuntimeException("verify " + name + ": arrays must have same content:" + 249 " gold[" + i + "][" + j + "] = " + g[j] + 250 " result[" + i + "][" + j + "] = " + r[j]); 251 } 252 } 253 } 254 255 static void verifyI(String name, int i, int[] g, int[] r) { 256 for (int j = 0; j < g.length; j++) { 257 if (g[j] != r[j]) { 258 throw new RuntimeException("verify " + name + ": arrays must have same content:" + 259 " gold[" + i + "][" + j + "] = " + g[j] + 260 " result[" + i + "][" + j + "] = " + r[j]); 261 } 262 } 263 } 264 265 static void verifyL(String name, int i, long[] g, long[] r) { 266 for (int j = 0; j < g.length; j++) { 267 if (g[j] != r[j]) { 268 throw new RuntimeException("verify " + name + ": arrays must have same content:" + 269 " gold[" + i + "][" + j + "] = " + g[j] + 270 " result[" + i + "][" + j + "] = " + r[j]); 271 } 272 } 273 } 274 275 @Test 276 @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0", 277 IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0", 278 IRNode.AND_VI, IRNode.VECTOR_SIZE_2, "> 0", 279 IRNode.AND_VI, IRNode.VECTOR_SIZE_4, "> 0", 280 IRNode.STORE_VECTOR, "> 0"}, 281 applyIf = {"MaxVectorSize", ">=32"}, 282 applyIfPlatform = {"64-bit", "true"}, 283 applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) 284 // Load and store are already split 285 // 286 // 0 1 - - 4 5 6 7 287 // | | | | | | 288 // 0 1 - - 4 5 6 7 289 static Object[] test0(int[] a, int[] b, int mask) { 290 for (int i = 0; i < RANGE; i+=8) { 291 int b0 = a[i+0] & mask; 292 int b1 = a[i+1] & mask; 293 294 int b4 = a[i+4] & mask; 295 int b5 = a[i+5] & mask; 296 int b6 = a[i+6] & mask; 297 int b7 = a[i+7] & mask; 298 299 b[i+0] = b0; 300 b[i+1] = b1; 301 302 b[i+4] = b4; 303 b[i+5] = b5; 304 b[i+6] = b6; 305 b[i+7] = b7; 306 } 307 return new Object[]{ a, b }; 308 } 309 310 @Test 311 @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0", 312 IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0", 313 IRNode.ADD_VI, IRNode.VECTOR_SIZE_4, "> 0", 314 IRNode.MUL_VI, IRNode.VECTOR_SIZE_2, "> 0", 315 IRNode.STORE_VECTOR, "> 0"}, 316 applyIf = {"MaxVectorSize", ">=32"}, 317 applyIfPlatform = {"64-bit", "true"}, 318 applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) 319 // Adjacent Load and Store, but split by Add/Mul 320 static Object[] test1a(int[] a, int[] b, int mask) { 321 for (int i = 0; i < RANGE; i+=8) { 322 b[i+0] = a[i+0] + mask; // Add 323 b[i+1] = a[i+1] + mask; 324 b[i+2] = a[i+2] + mask; 325 b[i+3] = a[i+3] + mask; 326 327 b[i+4] = a[i+4] * mask; // Mul 328 b[i+5] = a[i+5] * mask; 329 } 330 return new Object[]{ a, b }; 331 } 332 333 @Test 334 @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0", 335 IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0", 336 IRNode.ADD_VI, IRNode.VECTOR_SIZE_2, "> 0", 337 IRNode.MUL_VI, IRNode.VECTOR_SIZE_4, "> 0", 338 IRNode.STORE_VECTOR, "> 0"}, 339 applyIf = {"MaxVectorSize", ">=32"}, 340 applyIfPlatform = {"64-bit", "true"}, 341 applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) 342 // Adjacent Load and Store, but split by Add/Mul 343 static Object[] test1b(int[] a, int[] b, int mask) { 344 for (int i = 0; i < RANGE; i+=8) { 345 b[i+0] = a[i+0] * mask; // Mul 346 b[i+1] = a[i+1] * mask; 347 b[i+2] = a[i+2] * mask; 348 b[i+3] = a[i+3] * mask; 349 350 b[i+4] = a[i+4] + mask; // Add 351 b[i+5] = a[i+5] + mask; 352 } 353 return new Object[]{ a, b }; 354 } 355 356 @Test 357 @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0", 358 IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0", 359 IRNode.ADD_VI, IRNode.VECTOR_SIZE_2, "> 0", 360 IRNode.MUL_VI, IRNode.VECTOR_SIZE_4, "> 0", 361 IRNode.STORE_VECTOR, "> 0"}, 362 applyIf = {"MaxVectorSize", ">=32"}, 363 applyIfPlatform = {"64-bit", "true"}, 364 applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true"}) 365 // Adjacent Load and Store, but split by Add/Mul 366 static Object[] test1c(int[] a, int[] b, int mask) { 367 for (int i = 0; i < RANGE; i+=8) { 368 b[i+0] = a[i+0] + mask; // Add 369 b[i+1] = a[i+1] + mask; 370 371 b[i+2] = a[i+2] * mask; // Mul 372 b[i+3] = a[i+3] * mask; 373 b[i+4] = a[i+4] * mask; 374 b[i+5] = a[i+5] * mask; 375 } 376 return new Object[]{ a, b }; 377 } 378 379 @Test 380 @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0", 381 IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0", 382 IRNode.ADD_VI, IRNode.VECTOR_SIZE_4, "> 0", 383 IRNode.MUL_VI, IRNode.VECTOR_SIZE_2, "> 0", 384 IRNode.STORE_VECTOR, "> 0"}, 385 applyIf = {"MaxVectorSize", ">=32"}, 386 applyIfPlatform = {"64-bit", "true"}, 387 applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true"}) 388 // Adjacent Load and Store, but split by Add/Mul 389 static Object[] test1d(int[] a, int[] b, int mask) { 390 for (int i = 0; i < RANGE; i+=8) { 391 b[i+0] = a[i+0] * mask; // Mul 392 b[i+1] = a[i+1] * mask; 393 394 b[i+2] = a[i+2] + mask; // Add 395 b[i+3] = a[i+3] + mask; 396 b[i+4] = a[i+4] + mask; 397 b[i+5] = a[i+5] + mask; 398 } 399 return new Object[]{ a, b }; 400 } 401 402 @Test 403 @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0", 404 IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0", 405 IRNode.AND_VI, IRNode.VECTOR_SIZE_2, "> 0", 406 IRNode.AND_VI, IRNode.VECTOR_SIZE_4, "> 0", 407 IRNode.STORE_VECTOR, "> 0"}, 408 applyIf = {"MaxVectorSize", ">=32"}, 409 applyIfPlatform = {"64-bit", "true"}, 410 applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) 411 // Split the load 412 // 413 // 0 1 2 3 4 5 - - 414 // | | \ \ \ \ 415 // | | \ \ \ \ 416 // | | \ \ \ \ 417 // 0 1 - - 4 5 6 7 418 // 419 static Object[] test2a(int[] a, int[] b, int mask) { 420 for (int i = 0; i < RANGE; i+=8) { 421 int b0 = a[i+0] & mask; 422 int b1 = a[i+1] & mask; 423 int b2 = a[i+2] & mask; 424 int b3 = a[i+3] & mask; 425 int b4 = a[i+4] & mask; 426 int b5 = a[i+5] & mask; 427 428 b[i+0] = b0; 429 b[i+1] = b1; 430 431 b[i+4] = b2; 432 b[i+5] = b3; 433 b[i+6] = b4; 434 b[i+7] = b5; 435 } 436 return new Object[]{ a, b }; 437 } 438 439 @Test 440 @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0", 441 IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0", 442 IRNode.AND_VI, IRNode.VECTOR_SIZE_2, "> 0", 443 IRNode.AND_VI, IRNode.VECTOR_SIZE_4, "> 0", 444 IRNode.STORE_VECTOR, "> 0"}, 445 applyIf = {"MaxVectorSize", ">=32"}, 446 applyIfPlatform = {"64-bit", "true"}, 447 applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) 448 // Split the load 449 // 450 // 0 1 2 3 4 5 - - 451 // | | | | \ \ 452 // | | | | \ \ 453 // | | | | \ \ 454 // 0 1 2 3 -- 6 7 455 // 456 static Object[] test2b(int[] a, int[] b, int mask) { 457 for (int i = 0; i < RANGE; i+=8) { 458 int b0 = a[i+0] & mask; 459 int b1 = a[i+1] & mask; 460 int b2 = a[i+2] & mask; 461 int b3 = a[i+3] & mask; 462 int b4 = a[i+4] & mask; 463 int b5 = a[i+5] & mask; 464 465 b[i+0] = b0; 466 b[i+1] = b1; 467 b[i+2] = b2; 468 b[i+3] = b3; 469 470 b[i+6] = b4; 471 b[i+7] = b5; 472 } 473 return new Object[]{ a, b }; 474 } 475 476 @Test 477 @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0", 478 IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0", 479 IRNode.AND_VI, IRNode.VECTOR_SIZE_2, "> 0", 480 IRNode.AND_VI, IRNode.VECTOR_SIZE_4, "> 0", 481 IRNode.STORE_VECTOR, "> 0"}, 482 applyIf = {"MaxVectorSize", ">=32"}, 483 applyIfPlatform = {"64-bit", "true"}, 484 applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) 485 // Split the load 486 // 487 // 0 1 - - 4 5 6 7 488 // | | / / / / 489 // | | / / / / 490 // | | / / / / 491 // 0 1 2 3 4 5 - - 492 // 493 static Object[] test2c(int[] a, int[] b, int mask) { 494 for (int i = 0; i < RANGE; i+=8) { 495 int b0 = a[i+0] & mask; 496 int b1 = a[i+1] & mask; 497 498 int b4 = a[i+4] & mask; 499 int b5 = a[i+5] & mask; 500 int b6 = a[i+6] & mask; 501 int b7 = a[i+7] & mask; 502 503 b[i+0] = b0; 504 b[i+1] = b1; 505 b[i+2] = b4; 506 b[i+3] = b5; 507 b[i+4] = b6; 508 b[i+5] = b7; 509 } 510 return new Object[]{ a, b }; 511 } 512 513 @Test 514 @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0", 515 IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0", 516 IRNode.AND_VI, IRNode.VECTOR_SIZE_2, "> 0", 517 IRNode.AND_VI, IRNode.VECTOR_SIZE_4, "> 0", 518 IRNode.STORE_VECTOR, "> 0"}, 519 applyIf = {"MaxVectorSize", ">=32"}, 520 applyIfPlatform = {"64-bit", "true"}, 521 applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) 522 // Split the load 523 // 524 // 0 1 2 3 - - 6 7 525 // | | | | / / 526 // | | | | / / 527 // | | | | / / 528 // 0 1 2 3 4 5 - - 529 // 530 static Object[] test2d(int[] a, int[] b, int mask) { 531 for (int i = 0; i < RANGE; i+=8) { 532 int b0 = a[i+0] & mask; 533 int b1 = a[i+1] & mask; 534 int b2 = a[i+2] & mask; 535 int b3 = a[i+3] & mask; 536 537 int b6 = a[i+6] & mask; 538 int b7 = a[i+7] & mask; 539 540 b[i+0] = b0; 541 b[i+1] = b1; 542 b[i+2] = b2; 543 b[i+3] = b3; 544 b[i+4] = b6; 545 b[i+5] = b7; 546 } 547 return new Object[]{ a, b }; 548 } 549 550 @Test 551 @IR(counts = {IRNode.LOAD_VECTOR_S, IRNode.VECTOR_SIZE_4, "> 0", 552 IRNode.STORE_VECTOR, "> 0"}, 553 applyIf = {"MaxVectorSize", ">=32"}, 554 applyIfPlatform = {"64-bit", "true"}, 555 applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) 556 // 0 1 2 3 4 5 6 7 - 557 // | | | | | | | | 558 // | + + + | | | | 559 // | | | | | 560 // | v | | | | v 561 // | | | | | | | 562 // 1 - - 3 4 5 6 7 8 563 static Object[] test3a(short[] a, short[] b, short val) { 564 int sum = 0; 565 for (int i = 0; i < RANGE; i+=16) { 566 short a0 = a[i+0]; // required for alignment / offsets, technical limitation. 567 568 short a1 = a[i+1]; // adjacent to 4-pack, but need to be split off 569 short a2 = a[i+2]; 570 short a3 = a[i+3]; 571 572 short a4 = a[i+4]; // 4-pack 573 short a5 = a[i+5]; 574 short a6 = a[i+6]; 575 short a7 = a[i+7]; 576 577 578 b[i+0] = a0; // required for alignment / offsets, technical limitation. 579 580 sum += a1 + a2 + a3; // not packed 581 582 b[i+3] = val; // adjacent to 4-pack but needs to be split off 583 584 b[i+4] = a4; // 4-pack 585 b[i+5] = a5; 586 b[i+6] = a6; 587 b[i+7] = a7; 588 589 b[i+8] = val; // adjacent to 4-pack but needs to be split off 590 } 591 return new Object[]{ a, b, new int[]{ sum } }; 592 } 593 594 @Test 595 @IR(counts = {IRNode.LOAD_VECTOR_S, IRNode.VECTOR_SIZE_2, "> 0", 596 IRNode.STORE_VECTOR, "> 0"}, 597 applyIfPlatform = {"64-bit", "true"}, 598 applyIfCPUFeatureOr = {"sse4.1", "true"}) 599 // Cyclic dependency with distance 2 -> split into 2-packs 600 static Object[] test4a(short[] a, short[] b) { 601 for (int i = 0; i < RANGE-64; i++) { 602 b[i+2] = a[i+0]; 603 } 604 return new Object[]{ a, b }; 605 } 606 607 @Test 608 @IR(counts = {IRNode.LOAD_VECTOR_S, IRNode.VECTOR_SIZE_2, "> 0", 609 IRNode.STORE_VECTOR, "> 0"}, 610 applyIf = {"AlignVector", "false"}, 611 applyIfPlatform = {"64-bit", "true"}, 612 applyIfCPUFeatureOr = {"sse4.1", "true"}) 613 // Cyclic dependency with distance 3 -> split into 2-packs 614 static Object[] test4b(short[] a, short[] b) { 615 for (int i = 0; i < RANGE-64; i++) { 616 b[i+3] = a[i+0]; 617 } 618 return new Object[]{ a, b }; 619 } 620 621 @Test 622 @IR(counts = {IRNode.LOAD_VECTOR_S, IRNode.VECTOR_SIZE_4, "> 0", 623 IRNode.STORE_VECTOR, "> 0"}, 624 applyIf = {"MaxVectorSize", ">=8"}, 625 applyIfPlatform = {"64-bit", "true"}, 626 applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) 627 // Cyclic dependency with distance 4 -> split into 4-packs 628 static Object[] test4c(short[] a, short[] b) { 629 for (int i = 0; i < RANGE-64; i++) { 630 b[i+4] = a[i+0]; 631 } 632 return new Object[]{ a, b }; 633 } 634 635 @Test 636 @IR(counts = {IRNode.LOAD_VECTOR_S, IRNode.VECTOR_SIZE_4, "> 0", 637 IRNode.STORE_VECTOR, "> 0"}, 638 applyIfAnd = {"MaxVectorSize", ">=8", "AlignVector", "false"}, 639 applyIfPlatform = {"64-bit", "true"}, 640 applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) 641 // Cyclic dependency with distance 5 -> split into 4-packs 642 static Object[] test4d(short[] a, short[] b) { 643 for (int i = 0; i < RANGE-64; i++) { 644 b[i+5] = a[i+0]; 645 } 646 return new Object[]{ a, b }; 647 } 648 649 @Test 650 @IR(counts = {IRNode.LOAD_VECTOR_S, IRNode.VECTOR_SIZE_4, "> 0", 651 IRNode.STORE_VECTOR, "> 0"}, 652 applyIfAnd = {"MaxVectorSize", ">=8", "AlignVector", "false"}, 653 applyIfPlatform = {"64-bit", "true"}, 654 applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) 655 // Cyclic dependency with distance 6 -> split into 4-packs 656 static Object[] test4e(short[] a, short[] b) { 657 for (int i = 0; i < RANGE-64; i++) { 658 b[i+6] = a[i+0]; 659 } 660 return new Object[]{ a, b }; 661 } 662 663 @Test 664 @IR(counts = {IRNode.LOAD_VECTOR_S, IRNode.VECTOR_SIZE_4, "> 0", 665 IRNode.STORE_VECTOR, "> 0"}, 666 applyIfAnd = {"MaxVectorSize", ">=8", "AlignVector", "false"}, 667 applyIfPlatform = {"64-bit", "true"}, 668 applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) 669 // Cyclic dependency with distance 7 -> split into 4-packs 670 static Object[] test4f(short[] a, short[] b) { 671 for (int i = 0; i < RANGE-64; i++) { 672 b[i+7] = a[i+0]; 673 } 674 return new Object[]{ a, b }; 675 } 676 677 @Test 678 @IR(counts = {IRNode.LOAD_VECTOR_S, IRNode.VECTOR_SIZE_8, "> 0", 679 IRNode.STORE_VECTOR, "> 0"}, 680 applyIf = {"MaxVectorSize", ">=32"}, 681 applyIfPlatform = {"64-bit", "true"}, 682 applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) 683 // Cyclic dependency with distance 8 -> split into 8-packs 684 static Object[] test4g(short[] a, short[] b) { 685 for (int i = 0; i < RANGE-64; i++) { 686 b[i+8] = a[i+0]; 687 } 688 return new Object[]{ a, b }; 689 } 690 691 @Test 692 @IR(counts = {IRNode.LOAD_VECTOR_S, IRNode.VECTOR_SIZE_2, "> 0", 693 IRNode.LOAD_VECTOR_S, IRNode.VECTOR_SIZE_4, "> 0", 694 IRNode.LOAD_VECTOR_S, IRNode.VECTOR_SIZE_8, "> 0", 695 IRNode.ADD_VS, IRNode.VECTOR_SIZE_2, "> 0", 696 IRNode.ADD_VS, IRNode.VECTOR_SIZE_8, "> 0", 697 IRNode.ADD_VS, IRNode.VECTOR_SIZE_4, "> 0", 698 IRNode.STORE_VECTOR, "> 0"}, 699 applyIfAnd = {"MaxVectorSize", ">=32", "AlignVector", "false"}, 700 applyIfPlatform = {"64-bit", "true"}, 701 applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) 702 // Split pack into power-of-2 sizes 703 static Object[] test5a(short[] a, short[] b, short val) { 704 for (int i = 0; i < RANGE; i+=16) { 705 b[i+ 0] = (short)(a[i+ 0] + val); // 8 pack 706 b[i+ 1] = (short)(a[i+ 1] + val); 707 b[i+ 2] = (short)(a[i+ 2] + val); 708 b[i+ 3] = (short)(a[i+ 3] + val); 709 b[i+ 4] = (short)(a[i+ 4] + val); 710 b[i+ 5] = (short)(a[i+ 5] + val); 711 b[i+ 6] = (short)(a[i+ 6] + val); 712 b[i+ 7] = (short)(a[i+ 7] + val); 713 714 b[i+ 8] = (short)(a[i+ 8] + val); // 4-pack 715 b[i+ 9] = (short)(a[i+ 9] + val); 716 b[i+10] = (short)(a[i+10] + val); 717 b[i+11] = (short)(a[i+11] + val); 718 719 b[i+12] = (short)(a[i+12] + val); // 2-pack 720 b[i+13] = (short)(a[i+13] + val); 721 722 b[i+14] = (short)(a[i+14] + val); 723 } 724 return new Object[]{ a, b }; 725 } 726 727 @Test 728 @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0", 729 IRNode.MUL_VI, IRNode.VECTOR_SIZE_4, "> 0", 730 IRNode.AND_VI, IRNode.VECTOR_SIZE_4, "> 0", 731 IRNode.ADD_VI, IRNode.VECTOR_SIZE_4, "> 0", // reduction moved out of loop 732 IRNode.ADD_REDUCTION_V, "> 0"}, 733 applyIf = {"MaxVectorSize", ">=32"}, 734 applyIfPlatform = {"64-bit", "true"}, 735 applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) 736 // Split packs including reductions 737 static Object[] test6a(int[] a, int[] b) { 738 int s = 0; 739 for (int i = 0; i < RANGE; i+=8) { 740 s += a[i+0] * b[i+0]; 741 s += a[i+1] * b[i+1]; 742 s += a[i+2] * b[i+2]; 743 s += a[i+3] * b[i+3]; 744 745 s += a[i+4] & b[i+4]; 746 s += a[i+5] & b[i+5]; 747 s += a[i+6] & b[i+6]; 748 s += a[i+7] & b[i+7]; 749 } 750 return new Object[]{ a, b, new int[]{ s } }; 751 } 752 753 @Test 754 @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0", 755 IRNode.MUL_VI, "> 0", 756 IRNode.POPULATE_INDEX, "> 0"}, 757 applyIfPlatform = {"64-bit", "true"}, 758 applyIfCPUFeatureOr = {"avx2", "true", "sve", "true"}) 759 // Index Populate: 760 // There can be an issue when all the (iv + 1), (iv + 2), ... 761 // get packed, but not (iv). Then we have a pack that is one element 762 // too short, and we start splitting everything in a bad way. 763 static Object[] test7a(int[] a, int[] b) { 764 for (int i = 0; i < RANGE; i++) { 765 a[i] = b[i] * i; 766 } 767 return new Object[]{ a, b }; 768 } 769 }