1 /* 2 * Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. 8 * 9 * This code is distributed in the hope that it will be useful, but WITHOUT 10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 12 * version 2 for more details (a copy is included in the LICENSE file that 13 * accompanied this code). 14 * 15 * You should have received a copy of the GNU General Public License version 16 * 2 along with this work; if not, write to the Free Software Foundation, 17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 18 * 19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 20 * or visit www.oracle.com if you need additional information or have any 21 * questions. 22 */ 23 24 package compiler.loopopts.superword; 25 26 import compiler.lib.ir_framework.*; 27 import jdk.test.lib.Utils; 28 import jdk.test.whitebox.WhiteBox; 29 import java.lang.reflect.Array; 30 import java.util.Map; 31 import java.util.HashMap; 32 import java.util.Random; 33 import java.nio.ByteOrder; 34 35 /* 36 * @test 37 * @bug 8326139 38 * @summary Test splitting packs in SuperWord 39 * @library /test/lib / 40 * @run driver compiler.loopopts.superword.TestSplitPacks nCOH_nAV 41 * @run driver compiler.loopopts.superword.TestSplitPacks nCOH_yAV 42 * @run driver compiler.loopopts.superword.TestSplitPacks yCOH_nAV 43 * @run driver compiler.loopopts.superword.TestSplitPacks yCOH_yAV 44 */ 45 46 public class TestSplitPacks { 47 static int RANGE = 1024*8; 48 static int RANGE_FINAL = 1024*8; 49 private static final Random RANDOM = Utils.getRandomInstance(); 50 51 // Inputs 52 byte[] aB; 53 byte[] bB; 54 byte mB = (byte)31; 55 short[] aS; 56 short[] bS; 57 short mS = (short)0xF0F0; 58 int[] aI; 59 int[] bI; 60 int mI = 0xF0F0F0F0; 61 long[] aL; 62 long[] bL; 63 long mL = 0xF0F0F0F0F0F0F0F0L; 64 65 // List of tests 66 Map<String,TestFunction> tests = new HashMap<String,TestFunction>(); 67 68 // List of gold, the results from the first run before compilation 69 Map<String,Object[]> golds = new HashMap<String,Object[]>(); 70 71 interface TestFunction { 72 Object[] run(); 73 } 74 75 public static void main(String[] args) { 76 TestFramework framework = new TestFramework(TestSplitPacks.class); 77 framework.addFlags("-XX:+IgnoreUnrecognizedVMOptions", "-XX:LoopUnrollLimit=1000"); 78 switch (args[0]) { 79 case "nCOH_nAV" -> { framework.addFlags("-XX:+UnlockExperimentalVMOptions", "-XX:-UseCompactObjectHeaders", "-XX:-AlignVector"); } 80 case "nCOH_yAV" -> { framework.addFlags("-XX:+UnlockExperimentalVMOptions", "-XX:-UseCompactObjectHeaders", "-XX:+AlignVector"); } 81 case "yCOH_nAV" -> { framework.addFlags("-XX:+UnlockExperimentalVMOptions", "-XX:+UseCompactObjectHeaders", "-XX:-AlignVector"); } 82 case "yCOH_yAV" -> { framework.addFlags("-XX:+UnlockExperimentalVMOptions", "-XX:+UseCompactObjectHeaders", "-XX:+AlignVector"); } 83 default -> { throw new RuntimeException("Test argument not recognized: " + args[0]); } 84 }; 85 framework.start(); 86 } 87 88 public TestSplitPacks() { 89 // Generate input once 90 aB = generateB(); 91 bB = generateB(); 92 aS = generateS(); 93 bS = generateS(); 94 aI = generateI(); 95 bI = generateI(); 96 aL = generateL(); 97 bL = generateL(); 98 99 // Add all tests to list 100 tests.put("test0", () -> { return test0(aI.clone(), bI.clone(), mI); }); 101 tests.put("test1a", () -> { return test1a(aI.clone(), bI.clone(), mI); }); 102 tests.put("test1b", () -> { return test1b(aI.clone(), bI.clone(), mI); }); 103 tests.put("test1c", () -> { return test1c(aI.clone(), bI.clone(), mI); }); 104 tests.put("test1d", () -> { return test1d(aI.clone(), bI.clone(), mI); }); 105 tests.put("test2a", () -> { return test2a(aI.clone(), bI.clone(), mI); }); 106 tests.put("test2b", () -> { return test2b(aI.clone(), bI.clone(), mI); }); 107 tests.put("test2c", () -> { return test2c(aI.clone(), bI.clone(), mI); }); 108 tests.put("test2d", () -> { return test2d(aI.clone(), bI.clone(), mI); }); 109 tests.put("test3a", () -> { return test3a(aS.clone(), bS.clone(), mS); }); 110 tests.put("test4a", () -> { return test4a(aS.clone(), bS.clone()); }); 111 tests.put("test4b", () -> { return test4b(aS.clone(), bS.clone()); }); 112 tests.put("test4c", () -> { return test4c(aS.clone(), bS.clone()); }); 113 tests.put("test4d", () -> { return test4d(aS.clone(), bS.clone()); }); 114 tests.put("test4e", () -> { return test4e(aS.clone(), bS.clone()); }); 115 tests.put("test4f", () -> { return test4f(aS.clone(), bS.clone()); }); 116 tests.put("test4g", () -> { return test4g(aS.clone(), bS.clone()); }); 117 tests.put("test5a", () -> { return test5a(aS.clone(), bS.clone(), mS); }); 118 tests.put("test6a", () -> { return test6a(aI.clone(), bI.clone()); }); 119 tests.put("test7a", () -> { return test7a(aI.clone(), bI.clone()); }); 120 121 // Compute gold value for all test methods before compilation 122 for (Map.Entry<String,TestFunction> entry : tests.entrySet()) { 123 String name = entry.getKey(); 124 TestFunction test = entry.getValue(); 125 Object[] gold = test.run(); 126 golds.put(name, gold); 127 } 128 } 129 130 @Warmup(100) 131 @Run(test = {"test0", 132 "test1a", 133 "test1b", 134 "test1c", 135 "test1d", 136 "test2a", 137 "test2b", 138 "test2c", 139 "test2d", 140 "test3a", 141 "test4a", 142 "test4b", 143 "test4c", 144 "test4d", 145 "test4e", 146 "test4f", 147 "test4g", 148 "test5a", 149 "test6a", 150 "test7a"}) 151 public void runTests() { 152 for (Map.Entry<String,TestFunction> entry : tests.entrySet()) { 153 String name = entry.getKey(); 154 TestFunction test = entry.getValue(); 155 // Recall gold value from before compilation 156 Object[] gold = golds.get(name); 157 // Compute new result 158 Object[] result = test.run(); 159 // Compare gold and new result 160 verify(name, gold, result); 161 } 162 } 163 164 static byte[] generateB() { 165 byte[] a = new byte[RANGE]; 166 for (int i = 0; i < a.length; i++) { 167 a[i] = (byte)RANDOM.nextInt(); 168 } 169 return a; 170 } 171 172 static short[] generateS() { 173 short[] a = new short[RANGE]; 174 for (int i = 0; i < a.length; i++) { 175 a[i] = (short)RANDOM.nextInt(); 176 } 177 return a; 178 } 179 180 static int[] generateI() { 181 int[] a = new int[RANGE]; 182 for (int i = 0; i < a.length; i++) { 183 a[i] = RANDOM.nextInt(); 184 } 185 return a; 186 } 187 188 static long[] generateL() { 189 long[] a = new long[RANGE]; 190 for (int i = 0; i < a.length; i++) { 191 a[i] = RANDOM.nextLong(); 192 } 193 return a; 194 } 195 196 static void verify(String name, Object[] gold, Object[] result) { 197 if (gold.length != result.length) { 198 throw new RuntimeException("verify " + name + ": not the same number of outputs: gold.length = " + 199 gold.length + ", result.length = " + result.length); 200 } 201 for (int i = 0; i < gold.length; i++) { 202 Object g = gold[i]; 203 Object r = result[i]; 204 if (g.getClass() != r.getClass() || !g.getClass().isArray() || !r.getClass().isArray()) { 205 throw new RuntimeException("verify " + name + ": must both be array of same type:" + 206 " gold[" + i + "].getClass() = " + g.getClass().getSimpleName() + 207 " result[" + i + "].getClass() = " + r.getClass().getSimpleName()); 208 } 209 if (g == r) { 210 throw new RuntimeException("verify " + name + ": should be two separate arrays (with identical content):" + 211 " gold[" + i + "] == result[" + i + "]"); 212 } 213 if (Array.getLength(g) != Array.getLength(r)) { 214 throw new RuntimeException("verify " + name + ": arrays must have same length:" + 215 " gold[" + i + "].length = " + Array.getLength(g) + 216 " result[" + i + "].length = " + Array.getLength(r)); 217 } 218 Class c = g.getClass().getComponentType(); 219 if (c == byte.class) { 220 verifyB(name, i, (byte[])g, (byte[])r); 221 } else if (c == short.class) { 222 verifyS(name, i, (short[])g, (short[])r); 223 } else if (c == int.class) { 224 verifyI(name, i, (int[])g, (int[])r); 225 } else if (c == long.class) { 226 verifyL(name, i, (long[])g, (long[])r); 227 } else { 228 throw new RuntimeException("verify " + name + ": array type not supported for verify:" + 229 " gold[" + i + "].getClass() = " + g.getClass().getSimpleName() + 230 " result[" + i + "].getClass() = " + r.getClass().getSimpleName()); 231 } 232 } 233 } 234 235 static void verifyB(String name, int i, byte[] g, byte[] r) { 236 for (int j = 0; j < g.length; j++) { 237 if (g[j] != r[j]) { 238 throw new RuntimeException("verify " + name + ": arrays must have same content:" + 239 " gold[" + i + "][" + j + "] = " + g[j] + 240 " result[" + i + "][" + j + "] = " + r[j]); 241 } 242 } 243 } 244 245 static void verifyS(String name, int i, short[] g, short[] r) { 246 for (int j = 0; j < g.length; j++) { 247 if (g[j] != r[j]) { 248 throw new RuntimeException("verify " + name + ": arrays must have same content:" + 249 " gold[" + i + "][" + j + "] = " + g[j] + 250 " result[" + i + "][" + j + "] = " + r[j]); 251 } 252 } 253 } 254 255 static void verifyI(String name, int i, int[] g, int[] r) { 256 for (int j = 0; j < g.length; j++) { 257 if (g[j] != r[j]) { 258 throw new RuntimeException("verify " + name + ": arrays must have same content:" + 259 " gold[" + i + "][" + j + "] = " + g[j] + 260 " result[" + i + "][" + j + "] = " + r[j]); 261 } 262 } 263 } 264 265 static void verifyL(String name, int i, long[] g, long[] r) { 266 for (int j = 0; j < g.length; j++) { 267 if (g[j] != r[j]) { 268 throw new RuntimeException("verify " + name + ": arrays must have same content:" + 269 " gold[" + i + "][" + j + "] = " + g[j] + 270 " result[" + i + "][" + j + "] = " + r[j]); 271 } 272 } 273 } 274 275 @Test 276 @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0", 277 IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0", 278 IRNode.AND_VI, IRNode.VECTOR_SIZE_2, "> 0", 279 IRNode.AND_VI, IRNode.VECTOR_SIZE_4, "> 0", 280 IRNode.STORE_VECTOR, "> 0"}, 281 applyIfAnd = {"MaxVectorSize", ">=32", "AlignVector", "false"}, 282 applyIfPlatform = {"64-bit", "true"}, 283 applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) 284 @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0", 285 IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0", 286 IRNode.AND_VI, IRNode.VECTOR_SIZE_2, "> 0", 287 IRNode.AND_VI, IRNode.VECTOR_SIZE_4, "> 0", 288 IRNode.STORE_VECTOR, "> 0"}, 289 applyIfAnd = {"MaxVectorSize", ">=32", "UseCompactObjectHeaders", "false"}, 290 applyIfPlatform = {"64-bit", "true"}, 291 applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) 292 // Load and store are already split 293 // 294 // 0 1 - - 4 5 6 7 295 // | | | | | | 296 // 0 1 - - 4 5 6 7 297 static Object[] test0(int[] a, int[] b, int mask) { 298 for (int i = 0; i < RANGE; i+=8) { 299 int b0 = a[i+0] & mask; 300 int b1 = a[i+1] & mask; 301 302 int b4 = a[i+4] & mask; 303 int b5 = a[i+5] & mask; 304 int b6 = a[i+6] & mask; 305 int b7 = a[i+7] & mask; 306 307 b[i+0] = b0; 308 b[i+1] = b1; 309 310 b[i+4] = b4; 311 b[i+5] = b5; 312 b[i+6] = b6; 313 b[i+7] = b7; 314 // With AlignVector, we need 8-byte alignment of vector loads/stores. 315 // UseCompactObjectHeaders=false UseCompactObjectHeaders=true 316 // adr = base + 16 + 32*i -> always adr = base + 12 + 32*i -> never 317 // -> vectorize -> no vectorization 318 } 319 return new Object[]{ a, b }; 320 } 321 322 @Test 323 @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0", 324 IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0", 325 IRNode.ADD_VI, IRNode.VECTOR_SIZE_4, "> 0", 326 IRNode.MUL_VI, IRNode.VECTOR_SIZE_2, "> 0", 327 IRNode.STORE_VECTOR, "> 0"}, 328 applyIfAnd = {"MaxVectorSize", ">=32", "AlignVector", "false"}, 329 applyIfPlatform = {"64-bit", "true"}, 330 applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) 331 @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0", 332 IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0", 333 IRNode.ADD_VI, IRNode.VECTOR_SIZE_4, "> 0", 334 IRNode.MUL_VI, IRNode.VECTOR_SIZE_2, "> 0", 335 IRNode.STORE_VECTOR, "> 0"}, 336 applyIfAnd = {"MaxVectorSize", ">=32", "UseCompactObjectHeaders", "false"}, 337 applyIfPlatform = {"64-bit", "true"}, 338 applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) 339 // Adjacent Load and Store, but split by Add/Mul 340 static Object[] test1a(int[] a, int[] b, int mask) { 341 for (int i = 0; i < RANGE; i+=8) { 342 b[i+0] = a[i+0] + mask; // Add 343 b[i+1] = a[i+1] + mask; 344 b[i+2] = a[i+2] + mask; 345 b[i+3] = a[i+3] + mask; 346 347 b[i+4] = a[i+4] * mask; // Mul 348 b[i+5] = a[i+5] * mask; 349 // With AlignVector, we need 8-byte alignment of vector loads/stores. 350 // UseCompactObjectHeaders=false UseCompactObjectHeaders=true 351 // adr = base + 16 + 32*i -> always adr = base + 12 + 32*i -> never 352 // -> vectorize -> no vectorization 353 } 354 return new Object[]{ a, b }; 355 } 356 357 @Test 358 @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0", 359 IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0", 360 IRNode.ADD_VI, IRNode.VECTOR_SIZE_2, "> 0", 361 IRNode.MUL_VI, IRNode.VECTOR_SIZE_4, "> 0", 362 IRNode.STORE_VECTOR, "> 0"}, 363 applyIfAnd = {"MaxVectorSize", ">=32", "AlignVector", "false"}, 364 applyIfPlatform = {"64-bit", "true"}, 365 applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) 366 @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0", 367 IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0", 368 IRNode.ADD_VI, IRNode.VECTOR_SIZE_2, "> 0", 369 IRNode.MUL_VI, IRNode.VECTOR_SIZE_4, "> 0", 370 IRNode.STORE_VECTOR, "> 0"}, 371 applyIfAnd = {"MaxVectorSize", ">=32", "UseCompactObjectHeaders", "false"}, 372 applyIfPlatform = {"64-bit", "true"}, 373 applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) 374 // Adjacent Load and Store, but split by Add/Mul 375 static Object[] test1b(int[] a, int[] b, int mask) { 376 for (int i = 0; i < RANGE; i+=8) { 377 b[i+0] = a[i+0] * mask; // Mul 378 b[i+1] = a[i+1] * mask; 379 b[i+2] = a[i+2] * mask; 380 b[i+3] = a[i+3] * mask; 381 382 b[i+4] = a[i+4] + mask; // Add 383 b[i+5] = a[i+5] + mask; 384 // With AlignVector, we need 8-byte alignment of vector loads/stores. 385 // UseCompactObjectHeaders=false UseCompactObjectHeaders=true 386 // adr = base + 16 + 32*i -> always adr = base + 12 + 32*i -> never 387 // -> vectorize -> no vectorization 388 } 389 return new Object[]{ a, b }; 390 } 391 392 @Test 393 @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0", 394 IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0", 395 IRNode.ADD_VI, IRNode.VECTOR_SIZE_2, "> 0", 396 IRNode.MUL_VI, IRNode.VECTOR_SIZE_4, "> 0", 397 IRNode.STORE_VECTOR, "> 0"}, 398 applyIfAnd = {"MaxVectorSize", ">=32", "AlignVector", "false"}, 399 applyIfPlatform = {"64-bit", "true"}, 400 applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true"}) 401 @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0", 402 IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0", 403 IRNode.ADD_VI, IRNode.VECTOR_SIZE_2, "> 0", 404 IRNode.MUL_VI, IRNode.VECTOR_SIZE_4, "> 0", 405 IRNode.STORE_VECTOR, "> 0"}, 406 applyIfAnd = {"MaxVectorSize", ">=32", "UseCompactObjectHeaders", "false"}, 407 applyIfPlatform = {"64-bit", "true"}, 408 applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true"}) 409 // Adjacent Load and Store, but split by Add/Mul 410 static Object[] test1c(int[] a, int[] b, int mask) { 411 for (int i = 0; i < RANGE; i+=8) { 412 b[i+0] = a[i+0] + mask; // Add 413 b[i+1] = a[i+1] + mask; 414 415 b[i+2] = a[i+2] * mask; // Mul 416 b[i+3] = a[i+3] * mask; 417 b[i+4] = a[i+4] * mask; 418 b[i+5] = a[i+5] * mask; 419 // With AlignVector, we need 8-byte alignment of vector loads/stores. 420 // UseCompactObjectHeaders=false UseCompactObjectHeaders=true 421 // adr = base + 16 + 32*i -> always adr = base + 12 + 32*i -> never 422 // -> vectorize -> no vectorization 423 } 424 return new Object[]{ a, b }; 425 } 426 427 @Test 428 @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0", 429 IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0", 430 IRNode.ADD_VI, IRNode.VECTOR_SIZE_4, "> 0", 431 IRNode.MUL_VI, IRNode.VECTOR_SIZE_2, "> 0", 432 IRNode.STORE_VECTOR, "> 0"}, 433 applyIfAnd = {"MaxVectorSize", ">=32", "AlignVector", "false"}, 434 applyIfPlatform = {"64-bit", "true"}, 435 applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true"}) 436 @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0", 437 IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0", 438 IRNode.ADD_VI, IRNode.VECTOR_SIZE_4, "> 0", 439 IRNode.MUL_VI, IRNode.VECTOR_SIZE_2, "> 0", 440 IRNode.STORE_VECTOR, "> 0"}, 441 applyIfAnd = {"MaxVectorSize", ">=32", "UseCompactObjectHeaders", "false"}, 442 applyIfPlatform = {"64-bit", "true"}, 443 applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true"}) 444 // Adjacent Load and Store, but split by Add/Mul 445 static Object[] test1d(int[] a, int[] b, int mask) { 446 for (int i = 0; i < RANGE; i+=8) { 447 b[i+0] = a[i+0] * mask; // Mul 448 b[i+1] = a[i+1] * mask; 449 450 b[i+2] = a[i+2] + mask; // Add 451 b[i+3] = a[i+3] + mask; 452 b[i+4] = a[i+4] + mask; 453 b[i+5] = a[i+5] + mask; 454 // With AlignVector, we need 8-byte alignment of vector loads/stores. 455 // UseCompactObjectHeaders=false UseCompactObjectHeaders=true 456 // adr = base + 16 + 32*i -> always adr = base + 12 + 32*i -> never 457 // -> vectorize -> no vectorization 458 } 459 return new Object[]{ a, b }; 460 } 461 462 @Test 463 @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0", 464 IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0", 465 IRNode.AND_VI, IRNode.VECTOR_SIZE_2, "> 0", 466 IRNode.AND_VI, IRNode.VECTOR_SIZE_4, "> 0", 467 IRNode.STORE_VECTOR, "> 0"}, 468 applyIfAnd = {"MaxVectorSize", ">=32", "AlignVector", "false"}, 469 applyIfPlatform = {"64-bit", "true"}, 470 applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) 471 @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0", 472 IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0", 473 IRNode.AND_VI, IRNode.VECTOR_SIZE_2, "> 0", 474 IRNode.AND_VI, IRNode.VECTOR_SIZE_4, "> 0", 475 IRNode.STORE_VECTOR, "> 0"}, 476 applyIfAnd = {"MaxVectorSize", ">=32", "UseCompactObjectHeaders", "false"}, 477 applyIfPlatform = {"64-bit", "true"}, 478 applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) 479 // Split the load 480 // 481 // 0 1 2 3 4 5 - - 482 // | | \ \ \ \ 483 // | | \ \ \ \ 484 // | | \ \ \ \ 485 // 0 1 - - 4 5 6 7 486 // 487 static Object[] test2a(int[] a, int[] b, int mask) { 488 for (int i = 0; i < RANGE; i+=8) { 489 int b0 = a[i+0] & mask; 490 int b1 = a[i+1] & mask; 491 int b2 = a[i+2] & mask; 492 int b3 = a[i+3] & mask; 493 int b4 = a[i+4] & mask; 494 int b5 = a[i+5] & mask; 495 496 b[i+0] = b0; 497 b[i+1] = b1; 498 499 b[i+4] = b2; 500 b[i+5] = b3; 501 b[i+6] = b4; 502 b[i+7] = b5; 503 // With AlignVector, we need 8-byte alignment of vector loads/stores. 504 // UseCompactObjectHeaders=false UseCompactObjectHeaders=true 505 // adr = base + 16 + 32*i -> always adr = base + 12 + 32*i -> never 506 // -> vectorize -> no vectorization 507 } 508 return new Object[]{ a, b }; 509 } 510 511 @Test 512 @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0", 513 IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0", 514 IRNode.AND_VI, IRNode.VECTOR_SIZE_2, "> 0", 515 IRNode.AND_VI, IRNode.VECTOR_SIZE_4, "> 0", 516 IRNode.STORE_VECTOR, "> 0"}, 517 applyIfAnd = {"MaxVectorSize", ">=32", "AlignVector", "false"}, 518 applyIfPlatform = {"64-bit", "true"}, 519 applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) 520 @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0", 521 IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0", 522 IRNode.AND_VI, IRNode.VECTOR_SIZE_2, "> 0", 523 IRNode.AND_VI, IRNode.VECTOR_SIZE_4, "> 0", 524 IRNode.STORE_VECTOR, "> 0"}, 525 applyIfAnd = {"MaxVectorSize", ">=32", "UseCompactObjectHeaders", "false"}, 526 applyIfPlatform = {"64-bit", "true"}, 527 applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) 528 // Split the load 529 // 530 // 0 1 2 3 4 5 - - 531 // | | | | \ \ 532 // | | | | \ \ 533 // | | | | \ \ 534 // 0 1 2 3 -- 6 7 535 // 536 static Object[] test2b(int[] a, int[] b, int mask) { 537 for (int i = 0; i < RANGE; i+=8) { 538 int b0 = a[i+0] & mask; 539 int b1 = a[i+1] & mask; 540 int b2 = a[i+2] & mask; 541 int b3 = a[i+3] & mask; 542 int b4 = a[i+4] & mask; 543 int b5 = a[i+5] & mask; 544 545 b[i+0] = b0; 546 b[i+1] = b1; 547 b[i+2] = b2; 548 b[i+3] = b3; 549 550 b[i+6] = b4; 551 b[i+7] = b5; 552 // With AlignVector, we need 8-byte alignment of vector loads/stores. 553 // UseCompactObjectHeaders=false UseCompactObjectHeaders=true 554 // adr = base + 16 + 32*i -> always adr = base + 12 + 32*i -> never 555 // -> vectorize -> no vectorization 556 } 557 return new Object[]{ a, b }; 558 } 559 560 @Test 561 @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0", 562 IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0", 563 IRNode.AND_VI, IRNode.VECTOR_SIZE_2, "> 0", 564 IRNode.AND_VI, IRNode.VECTOR_SIZE_4, "> 0", 565 IRNode.STORE_VECTOR, "> 0"}, 566 applyIfAnd = {"MaxVectorSize", ">=32", "AlignVector", "false"}, 567 applyIfPlatform = {"64-bit", "true"}, 568 applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) 569 @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0", 570 IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0", 571 IRNode.AND_VI, IRNode.VECTOR_SIZE_2, "> 0", 572 IRNode.AND_VI, IRNode.VECTOR_SIZE_4, "> 0", 573 IRNode.STORE_VECTOR, "> 0"}, 574 applyIfAnd = {"MaxVectorSize", ">=32", "UseCompactObjectHeaders", "false"}, 575 applyIfPlatform = {"64-bit", "true"}, 576 applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) 577 // Split the load 578 // 579 // 0 1 - - 4 5 6 7 580 // | | / / / / 581 // | | / / / / 582 // | | / / / / 583 // 0 1 2 3 4 5 - - 584 // 585 static Object[] test2c(int[] a, int[] b, int mask) { 586 for (int i = 0; i < RANGE; i+=8) { 587 int b0 = a[i+0] & mask; 588 int b1 = a[i+1] & mask; 589 590 int b4 = a[i+4] & mask; 591 int b5 = a[i+5] & mask; 592 int b6 = a[i+6] & mask; 593 int b7 = a[i+7] & mask; 594 595 b[i+0] = b0; 596 b[i+1] = b1; 597 b[i+2] = b4; 598 b[i+3] = b5; 599 b[i+4] = b6; 600 b[i+5] = b7; 601 // With AlignVector, we need 8-byte alignment of vector loads/stores. 602 // UseCompactObjectHeaders=false UseCompactObjectHeaders=true 603 // adr = base + 16 + 32*i -> always adr = base + 12 + 32*i -> never 604 // -> vectorize -> no vectorization 605 } 606 return new Object[]{ a, b }; 607 } 608 609 @Test 610 @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0", 611 IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0", 612 IRNode.AND_VI, IRNode.VECTOR_SIZE_2, "> 0", 613 IRNode.AND_VI, IRNode.VECTOR_SIZE_4, "> 0", 614 IRNode.STORE_VECTOR, "> 0"}, 615 applyIfAnd = {"MaxVectorSize", ">=32", "AlignVector", "false"}, 616 applyIfPlatform = {"64-bit", "true"}, 617 applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) 618 @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0", 619 IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0", 620 IRNode.AND_VI, IRNode.VECTOR_SIZE_2, "> 0", 621 IRNode.AND_VI, IRNode.VECTOR_SIZE_4, "> 0", 622 IRNode.STORE_VECTOR, "> 0"}, 623 applyIfAnd = {"MaxVectorSize", ">=32", "UseCompactObjectHeaders", "false"}, 624 applyIfPlatform = {"64-bit", "true"}, 625 applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) 626 // Split the load 627 // 628 // 0 1 2 3 - - 6 7 629 // | | | | / / 630 // | | | | / / 631 // | | | | / / 632 // 0 1 2 3 4 5 - - 633 // 634 static Object[] test2d(int[] a, int[] b, int mask) { 635 for (int i = 0; i < RANGE; i+=8) { 636 int b0 = a[i+0] & mask; 637 int b1 = a[i+1] & mask; 638 int b2 = a[i+2] & mask; 639 int b3 = a[i+3] & mask; 640 641 int b6 = a[i+6] & mask; 642 int b7 = a[i+7] & mask; 643 644 b[i+0] = b0; 645 b[i+1] = b1; 646 b[i+2] = b2; 647 b[i+3] = b3; 648 b[i+4] = b6; 649 b[i+5] = b7; 650 // With AlignVector, we need 8-byte alignment of vector loads/stores. 651 // UseCompactObjectHeaders=false UseCompactObjectHeaders=true 652 // adr = base + 16 + 32*i -> always adr = base + 12 + 32*i -> never 653 // -> vectorize -> no vectorization 654 } 655 return new Object[]{ a, b }; 656 } 657 658 @Test 659 @IR(counts = {IRNode.LOAD_VECTOR_S, IRNode.VECTOR_SIZE_4, "> 0", 660 IRNode.STORE_VECTOR, "> 0"}, 661 applyIfAnd = {"MaxVectorSize", ">=32", "AlignVector", "false"}, 662 applyIfPlatform = {"64-bit", "true"}, 663 applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) 664 @IR(counts = {IRNode.LOAD_VECTOR_S, IRNode.VECTOR_SIZE_4, "> 0", 665 IRNode.STORE_VECTOR, "> 0"}, 666 applyIfAnd = {"MaxVectorSize", ">=32", "UseCompactObjectHeaders", "false"}, 667 applyIfPlatform = {"64-bit", "true"}, 668 applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) 669 // 0 1 2 3 4 5 6 7 - 670 // | | | | | | | | 671 // | + + + | | | | 672 // | | | | | 673 // | v | | | | v 674 // | | | | | | | 675 // 1 - - 3 4 5 6 7 8 676 static Object[] test3a(short[] a, short[] b, short val) { 677 int sum = 0; 678 for (int i = 0; i < RANGE; i+=16) { 679 short a0 = a[i+0]; // required for alignment / offsets, technical limitation. 680 681 short a1 = a[i+1]; // adjacent to 4-pack, but need to be split off 682 short a2 = a[i+2]; 683 short a3 = a[i+3]; 684 685 short a4 = a[i+4]; // 4-pack 686 short a5 = a[i+5]; 687 short a6 = a[i+6]; 688 short a7 = a[i+7]; 689 690 691 b[i+0] = a0; // required for alignment / offsets, technical limitation. 692 693 sum += a1 + a2 + a3; // not packed 694 695 b[i+3] = val; // adjacent to 4-pack but needs to be split off 696 697 b[i+4] = a4; // 4-pack 698 b[i+5] = a5; 699 b[i+6] = a6; 700 b[i+7] = a7; 701 702 b[i+8] = val; // adjacent to 4-pack but needs to be split off 703 704 // With AlignVector, we need 8-byte alignment of vector loads/stores. 705 // UseCompactObjectHeaders=false UseCompactObjectHeaders=true 706 // adr = base + 16 + 8 + 32*i -> always adr = base + 12 + 8 + 32*i -> never 707 // -> vectorize -> no vectorization 708 } 709 return new Object[]{ a, b, new int[]{ sum } }; 710 } 711 712 @Test 713 @IR(counts = {IRNode.LOAD_VECTOR_S, IRNode.VECTOR_SIZE_2, "> 0", 714 IRNode.STORE_VECTOR, "> 0"}, 715 applyIfPlatform = {"64-bit", "true"}, 716 applyIfCPUFeatureOr = {"sse4.1", "true"}) 717 // Cyclic dependency with distance 2 -> split into 2-packs 718 static Object[] test4a(short[] a, short[] b) { 719 for (int i = 0; i < RANGE-64; i++) { 720 b[i+2] = a[i+0]; 721 } 722 return new Object[]{ a, b }; 723 } 724 725 @Test 726 @IR(counts = {IRNode.LOAD_VECTOR_S, IRNode.VECTOR_SIZE_2, "> 0", 727 IRNode.STORE_VECTOR, "> 0"}, 728 applyIf = {"AlignVector", "false"}, 729 applyIfPlatform = {"64-bit", "true"}, 730 applyIfCPUFeatureOr = {"sse4.1", "true"}) 731 // Cyclic dependency with distance 3 -> split into 2-packs 732 static Object[] test4b(short[] a, short[] b) { 733 for (int i = 0; i < RANGE-64; i++) { 734 b[i+3] = a[i+0]; 735 } 736 return new Object[]{ a, b }; 737 } 738 739 @Test 740 @IR(counts = {IRNode.LOAD_VECTOR_S, IRNode.VECTOR_SIZE_4, "> 0", 741 IRNode.STORE_VECTOR, "> 0"}, 742 applyIf = {"MaxVectorSize", ">=8"}, 743 applyIfPlatform = {"64-bit", "true"}, 744 applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) 745 // Cyclic dependency with distance 4 -> split into 4-packs 746 static Object[] test4c(short[] a, short[] b) { 747 for (int i = 0; i < RANGE-64; i++) { 748 b[i+4] = a[i+0]; 749 } 750 return new Object[]{ a, b }; 751 } 752 753 @Test 754 @IR(counts = {IRNode.LOAD_VECTOR_S, IRNode.VECTOR_SIZE_4, "> 0", 755 IRNode.STORE_VECTOR, "> 0"}, 756 applyIfAnd = {"MaxVectorSize", ">=8", "AlignVector", "false"}, 757 applyIfPlatform = {"64-bit", "true"}, 758 applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) 759 // Cyclic dependency with distance 5 -> split into 4-packs 760 static Object[] test4d(short[] a, short[] b) { 761 for (int i = 0; i < RANGE-64; i++) { 762 b[i+5] = a[i+0]; 763 } 764 return new Object[]{ a, b }; 765 } 766 767 @Test 768 @IR(counts = {IRNode.LOAD_VECTOR_S, IRNode.VECTOR_SIZE_4, "> 0", 769 IRNode.STORE_VECTOR, "> 0"}, 770 applyIfAnd = {"MaxVectorSize", ">=8", "AlignVector", "false"}, 771 applyIfPlatform = {"64-bit", "true"}, 772 applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) 773 // Cyclic dependency with distance 6 -> split into 4-packs 774 static Object[] test4e(short[] a, short[] b) { 775 for (int i = 0; i < RANGE-64; i++) { 776 b[i+6] = a[i+0]; 777 } 778 return new Object[]{ a, b }; 779 } 780 781 @Test 782 @IR(counts = {IRNode.LOAD_VECTOR_S, IRNode.VECTOR_SIZE_4, "> 0", 783 IRNode.STORE_VECTOR, "> 0"}, 784 applyIfAnd = {"MaxVectorSize", ">=8", "AlignVector", "false"}, 785 applyIfPlatform = {"64-bit", "true"}, 786 applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) 787 // Cyclic dependency with distance 7 -> split into 4-packs 788 static Object[] test4f(short[] a, short[] b) { 789 for (int i = 0; i < RANGE-64; i++) { 790 b[i+7] = a[i+0]; 791 } 792 return new Object[]{ a, b }; 793 } 794 795 @Test 796 @IR(counts = {IRNode.LOAD_VECTOR_S, IRNode.VECTOR_SIZE_8, "> 0", 797 IRNode.STORE_VECTOR, "> 0"}, 798 applyIf = {"MaxVectorSize", ">=32"}, 799 applyIfPlatform = {"64-bit", "true"}, 800 applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) 801 // Cyclic dependency with distance 8 -> split into 8-packs 802 static Object[] test4g(short[] a, short[] b) { 803 for (int i = 0; i < RANGE-64; i++) { 804 b[i+8] = a[i+0]; 805 } 806 return new Object[]{ a, b }; 807 } 808 809 @Test 810 @IR(counts = {IRNode.LOAD_VECTOR_S, IRNode.VECTOR_SIZE_2, "> 0", 811 IRNode.LOAD_VECTOR_S, IRNode.VECTOR_SIZE_4, "> 0", 812 IRNode.LOAD_VECTOR_S, IRNode.VECTOR_SIZE_8, "> 0", 813 IRNode.ADD_VS, IRNode.VECTOR_SIZE_2, "> 0", 814 IRNode.ADD_VS, IRNode.VECTOR_SIZE_8, "> 0", 815 IRNode.ADD_VS, IRNode.VECTOR_SIZE_4, "> 0", 816 IRNode.STORE_VECTOR, "> 0"}, 817 applyIfAnd = {"MaxVectorSize", ">=32", "AlignVector", "false"}, 818 applyIfPlatform = {"64-bit", "true"}, 819 applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) 820 // Split pack into power-of-2 sizes 821 static Object[] test5a(short[] a, short[] b, short val) { 822 for (int i = 0; i < RANGE; i+=16) { 823 b[i+ 0] = (short)(a[i+ 0] + val); // 8 pack 824 b[i+ 1] = (short)(a[i+ 1] + val); 825 b[i+ 2] = (short)(a[i+ 2] + val); 826 b[i+ 3] = (short)(a[i+ 3] + val); 827 b[i+ 4] = (short)(a[i+ 4] + val); 828 b[i+ 5] = (short)(a[i+ 5] + val); 829 b[i+ 6] = (short)(a[i+ 6] + val); 830 b[i+ 7] = (short)(a[i+ 7] + val); 831 832 b[i+ 8] = (short)(a[i+ 8] + val); // 4-pack 833 b[i+ 9] = (short)(a[i+ 9] + val); 834 b[i+10] = (short)(a[i+10] + val); 835 b[i+11] = (short)(a[i+11] + val); 836 837 b[i+12] = (short)(a[i+12] + val); // 2-pack 838 b[i+13] = (short)(a[i+13] + val); 839 840 b[i+14] = (short)(a[i+14] + val); 841 } 842 return new Object[]{ a, b }; 843 } 844 845 @Test 846 @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0", 847 IRNode.MUL_VI, IRNode.VECTOR_SIZE_4, "> 0", 848 IRNode.AND_VI, IRNode.VECTOR_SIZE_4, "> 0", 849 IRNode.ADD_VI, IRNode.VECTOR_SIZE_4, "> 0", // reduction moved out of loop 850 IRNode.ADD_REDUCTION_V, "> 0"}, 851 applyIfAnd = {"MaxVectorSize", ">=32", "AlignVector", "false"}, 852 applyIfPlatform = {"64-bit", "true"}, 853 applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) 854 @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0", 855 IRNode.MUL_VI, IRNode.VECTOR_SIZE_4, "> 0", 856 IRNode.AND_VI, IRNode.VECTOR_SIZE_4, "> 0", 857 IRNode.ADD_VI, IRNode.VECTOR_SIZE_4, "> 0", // reduction moved out of loop 858 IRNode.ADD_REDUCTION_V, "> 0"}, 859 applyIfAnd = {"MaxVectorSize", ">=32", "UseCompactObjectHeaders", "false"}, 860 applyIfPlatform = {"64-bit", "true"}, 861 applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) 862 // Split packs including reductions 863 static Object[] test6a(int[] a, int[] b) { 864 int s = 0; 865 for (int i = 0; i < RANGE; i+=8) { 866 s += a[i+0] * b[i+0]; 867 s += a[i+1] * b[i+1]; 868 s += a[i+2] * b[i+2]; 869 s += a[i+3] * b[i+3]; 870 871 s += a[i+4] & b[i+4]; 872 s += a[i+5] & b[i+5]; 873 s += a[i+6] & b[i+6]; 874 s += a[i+7] & b[i+7]; 875 // With AlignVector, we need 8-byte alignment of vector loads/stores. 876 // UseCompactObjectHeaders=false UseCompactObjectHeaders=true 877 // adr = base + 16 + 32*i -> always adr = base + 12 + 32*i -> never 878 // -> vectorize -> no vectorization 879 } 880 return new Object[]{ a, b, new int[]{ s } }; 881 } 882 883 @Test 884 @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0", 885 IRNode.MUL_VI, "> 0", 886 IRNode.POPULATE_INDEX, "> 0"}, 887 applyIfPlatform = {"64-bit", "true"}, 888 applyIfCPUFeatureOr = {"avx2", "true", "sve", "true"}) 889 // Index Populate: 890 // There can be an issue when all the (iv + 1), (iv + 2), ... 891 // get packed, but not (iv). Then we have a pack that is one element 892 // too short, and we start splitting everything in a bad way. 893 static Object[] test7a(int[] a, int[] b) { 894 for (int i = 0; i < RANGE; i++) { 895 a[i] = b[i] * i; 896 } 897 return new Object[]{ a, b }; 898 } 899 }