261 }
262 }
263 }
264
265 static void verifyL(String name, int i, long[] g, long[] r) {
266 for (int j = 0; j < g.length; j++) {
267 if (g[j] != r[j]) {
268 throw new RuntimeException("verify " + name + ": arrays must have same content:" +
269 " gold[" + i + "][" + j + "] = " + g[j] +
270 " result[" + i + "][" + j + "] = " + r[j]);
271 }
272 }
273 }
274
275 @Test
276 @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0",
277 IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
278 IRNode.AND_VI, IRNode.VECTOR_SIZE_2, "> 0",
279 IRNode.AND_VI, IRNode.VECTOR_SIZE_4, "> 0",
280 IRNode.STORE_VECTOR, "> 0"},
281 applyIfAnd = {"MaxVectorSize", ">=32", "AlignVector", "false"},
282 applyIfPlatform = {"64-bit", "true"},
283 applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
284 @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0",
285 IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
286 IRNode.AND_VI, IRNode.VECTOR_SIZE_2, "> 0",
287 IRNode.AND_VI, IRNode.VECTOR_SIZE_4, "> 0",
288 IRNode.STORE_VECTOR, "> 0"},
289 applyIfAnd = {"MaxVectorSize", ">=32", "UseCompactObjectHeaders", "false"},
290 applyIfPlatform = {"64-bit", "true"},
291 applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
// Both IR rules expect the same vector nodes; they differ only in which flag must be
// false (AlignVector or UseCompactObjectHeaders) for the rule to apply.
292 // Load and store are already split
293 //
294 //  0 1 - - 4 5 6 7
295 //  | |     | | | |
296 //  0 1 - - 4 5 6 7
// Per step of 8 elements: reads a[i+0..1] and a[i+4..7], ANDs each value with mask and
// stores it at the same index of b. Indices i+2 and i+3 are neither read nor written.
// The rules expect both 2-element and 4-element int load/AND vectors. Returns { a, b }.
297 static Object[] test0(int[] a, int[] b, int mask) {
298 for (int i = 0; i < RANGE; i+=8) {
299 int b0 = a[i+0] & mask;
300 int b1 = a[i+1] & mask;
301
302 int b4 = a[i+4] & mask;
303 int b5 = a[i+5] & mask;
304 int b6 = a[i+6] & mask;
305 int b7 = a[i+7] & mask;
306
307 b[i+0] = b0;
308 b[i+1] = b1;
309
310 b[i+4] = b4;
311 b[i+5] = b5;
312 b[i+6] = b6;
313 b[i+7] = b7;
314 // With AlignVector, we need 8-byte alignment of vector loads/stores.
315 // UseCompactObjectHeaders=false: adr = base + 16 + 32*i -> always aligned -> vectorize
316 // UseCompactObjectHeaders=true:  adr = base + 12 + 32*i -> never aligned -> no vectorization
317 // (presumably i counts loop steps here: one step of 8 ints is 32 bytes)
318 }
319 return new Object[]{ a, b };
320 }
321
322 @Test
323 @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0",
324 IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
325 IRNode.ADD_VI, IRNode.VECTOR_SIZE_4, "> 0",
326 IRNode.MUL_VI, IRNode.VECTOR_SIZE_2, "> 0",
327 IRNode.STORE_VECTOR, "> 0"},
328 applyIfAnd = {"MaxVectorSize", ">=32", "AlignVector", "false"},
329 applyIfPlatform = {"64-bit", "true"},
330 applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
331 @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0",
332 IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
333 IRNode.ADD_VI, IRNode.VECTOR_SIZE_4, "> 0",
334 IRNode.MUL_VI, IRNode.VECTOR_SIZE_2, "> 0",
335 IRNode.STORE_VECTOR, "> 0"},
336 applyIfAnd = {"MaxVectorSize", ">=32", "UseCompactObjectHeaders", "false"},
337 applyIfPlatform = {"64-bit", "true"},
338 applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
// Both IR rules expect the same vector nodes; they differ only in which flag must be
// false (AlignVector or UseCompactObjectHeaders) for the rule to apply.
339 // Adjacent Load and Store, but split by Add/Mul
// Per step of 8 elements: b[i+0..3] = a[i+0..3] + mask and b[i+4..5] = a[i+4..5] * mask.
// Indices i+6 and i+7 are not touched. The six loads/stores are adjacent, but the rules
// expect a 4-element ADD_VI and a 2-element MUL_VI. Returns { a, b }.
340 static Object[] test1a(int[] a, int[] b, int mask) {
341 for (int i = 0; i < RANGE; i+=8) {
342 b[i+0] = a[i+0] + mask; // Add
343 b[i+1] = a[i+1] + mask;
344 b[i+2] = a[i+2] + mask;
345 b[i+3] = a[i+3] + mask;
346
347 b[i+4] = a[i+4] * mask; // Mul
348 b[i+5] = a[i+5] * mask;
349 // With AlignVector, we need 8-byte alignment of vector loads/stores.
350 // UseCompactObjectHeaders=false: adr = base + 16 + 32*i -> always aligned -> vectorize
351 // UseCompactObjectHeaders=true:  adr = base + 12 + 32*i -> never aligned -> no vectorization
352 // (presumably i counts loop steps here: one step of 8 ints is 32 bytes)
353 }
354 return new Object[]{ a, b };
355 }
356
357 @Test
358 @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0",
359 IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
360 IRNode.ADD_VI, IRNode.VECTOR_SIZE_2, "> 0",
361 IRNode.MUL_VI, IRNode.VECTOR_SIZE_4, "> 0",
362 IRNode.STORE_VECTOR, "> 0"},
363 applyIfAnd = {"MaxVectorSize", ">=32", "AlignVector", "false"},
364 applyIfPlatform = {"64-bit", "true"},
365 applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
366 @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0",
367 IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
368 IRNode.ADD_VI, IRNode.VECTOR_SIZE_2, "> 0",
369 IRNode.MUL_VI, IRNode.VECTOR_SIZE_4, "> 0",
370 IRNode.STORE_VECTOR, "> 0"},
371 applyIfAnd = {"MaxVectorSize", ">=32", "UseCompactObjectHeaders", "false"},
372 applyIfPlatform = {"64-bit", "true"},
373 applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
// Both IR rules expect the same vector nodes; they differ only in which flag must be
// false (AlignVector or UseCompactObjectHeaders) for the rule to apply.
374 // Adjacent Load and Store, but split by Add/Mul
// Per step of 8 elements: b[i+0..3] = a[i+0..3] * mask and b[i+4..5] = a[i+4..5] + mask.
// Indices i+6 and i+7 are not touched. The six loads/stores are adjacent, but the rules
// expect a 4-element MUL_VI and a 2-element ADD_VI. Returns { a, b }.
375 static Object[] test1b(int[] a, int[] b, int mask) {
376 for (int i = 0; i < RANGE; i+=8) {
377 b[i+0] = a[i+0] * mask; // Mul
378 b[i+1] = a[i+1] * mask;
379 b[i+2] = a[i+2] * mask;
380 b[i+3] = a[i+3] * mask;
381
382 b[i+4] = a[i+4] + mask; // Add
383 b[i+5] = a[i+5] + mask;
384 // With AlignVector, we need 8-byte alignment of vector loads/stores.
385 // UseCompactObjectHeaders=false: adr = base + 16 + 32*i -> always aligned -> vectorize
386 // UseCompactObjectHeaders=true:  adr = base + 12 + 32*i -> never aligned -> no vectorization
387 // (presumably i counts loop steps here: one step of 8 ints is 32 bytes)
388 }
389 return new Object[]{ a, b };
390 }
391
392 @Test
393 @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0",
394 IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
395 IRNode.ADD_VI, IRNode.VECTOR_SIZE_2, "> 0",
396 IRNode.MUL_VI, IRNode.VECTOR_SIZE_4, "> 0",
397 IRNode.STORE_VECTOR, "> 0"},
398 applyIfAnd = {"MaxVectorSize", ">=32", "AlignVector", "false"},
399 applyIfPlatform = {"64-bit", "true"},
400 applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true"})
401 @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0",
402 IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
403 IRNode.ADD_VI, IRNode.VECTOR_SIZE_2, "> 0",
404 IRNode.MUL_VI, IRNode.VECTOR_SIZE_4, "> 0",
405 IRNode.STORE_VECTOR, "> 0"},
406 applyIfAnd = {"MaxVectorSize", ">=32", "UseCompactObjectHeaders", "false"},
407 applyIfPlatform = {"64-bit", "true"},
408 applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true"})
// Both IR rules expect the same vector nodes; they differ only in which flag must be
// false (AlignVector or UseCompactObjectHeaders) for the rule to apply.
409 // Adjacent Load and Store, but split by Add/Mul
// Per step of 8 elements: b[i+0..1] = a[i+0..1] + mask and b[i+2..5] = a[i+2..5] * mask.
// Indices i+6 and i+7 are not touched. Here the 2-element group comes first: the rules
// expect a 2-element ADD_VI and a 4-element MUL_VI. Returns { a, b }.
410 static Object[] test1c(int[] a, int[] b, int mask) {
411 for (int i = 0; i < RANGE; i+=8) {
412 b[i+0] = a[i+0] + mask; // Add
413 b[i+1] = a[i+1] + mask;
414
415 b[i+2] = a[i+2] * mask; // Mul
416 b[i+3] = a[i+3] * mask;
417 b[i+4] = a[i+4] * mask;
418 b[i+5] = a[i+5] * mask;
419 // With AlignVector, we need 8-byte alignment of vector loads/stores.
420 // UseCompactObjectHeaders=false: adr = base + 16 + 32*i -> always aligned -> vectorize
421 // UseCompactObjectHeaders=true:  adr = base + 12 + 32*i -> never aligned -> no vectorization
422 // (presumably i counts loop steps here: one step of 8 ints is 32 bytes)
423 }
424 return new Object[]{ a, b };
425 }
426
427 @Test
428 @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0",
429 IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
430 IRNode.ADD_VI, IRNode.VECTOR_SIZE_4, "> 0",
431 IRNode.MUL_VI, IRNode.VECTOR_SIZE_2, "> 0",
432 IRNode.STORE_VECTOR, "> 0"},
433 applyIfAnd = {"MaxVectorSize", ">=32", "AlignVector", "false"},
434 applyIfPlatform = {"64-bit", "true"},
435 applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true"})
436 @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0",
437 IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
438 IRNode.ADD_VI, IRNode.VECTOR_SIZE_4, "> 0",
439 IRNode.MUL_VI, IRNode.VECTOR_SIZE_2, "> 0",
440 IRNode.STORE_VECTOR, "> 0"},
441 applyIfAnd = {"MaxVectorSize", ">=32", "UseCompactObjectHeaders", "false"},
442 applyIfPlatform = {"64-bit", "true"},
443 applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true"})
// Both IR rules expect the same vector nodes; they differ only in which flag must be
// false (AlignVector or UseCompactObjectHeaders) for the rule to apply.
444 // Adjacent Load and Store, but split by Add/Mul
// Per step of 8 elements: b[i+0..1] = a[i+0..1] * mask and b[i+2..5] = a[i+2..5] + mask.
// Indices i+6 and i+7 are not touched. Here the 2-element group comes first: the rules
// expect a 2-element MUL_VI and a 4-element ADD_VI. Returns { a, b }.
445 static Object[] test1d(int[] a, int[] b, int mask) {
446 for (int i = 0; i < RANGE; i+=8) {
447 b[i+0] = a[i+0] * mask; // Mul
448 b[i+1] = a[i+1] * mask;
449
450 b[i+2] = a[i+2] + mask; // Add
451 b[i+3] = a[i+3] + mask;
452 b[i+4] = a[i+4] + mask;
453 b[i+5] = a[i+5] + mask;
454 // With AlignVector, we need 8-byte alignment of vector loads/stores.
455 // UseCompactObjectHeaders=false: adr = base + 16 + 32*i -> always aligned -> vectorize
456 // UseCompactObjectHeaders=true:  adr = base + 12 + 32*i -> never aligned -> no vectorization
457 // (presumably i counts loop steps here: one step of 8 ints is 32 bytes)
458 }
459 return new Object[]{ a, b };
460 }
461
462 @Test
463 @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0",
464 IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
465 IRNode.AND_VI, IRNode.VECTOR_SIZE_2, "> 0",
466 IRNode.AND_VI, IRNode.VECTOR_SIZE_4, "> 0",
467 IRNode.STORE_VECTOR, "> 0"},
468 applyIfAnd = {"MaxVectorSize", ">=32", "AlignVector", "false"},
469 applyIfPlatform = {"64-bit", "true"},
470 applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
471 @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0",
472 IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
473 IRNode.AND_VI, IRNode.VECTOR_SIZE_2, "> 0",
474 IRNode.AND_VI, IRNode.VECTOR_SIZE_4, "> 0",
475 IRNode.STORE_VECTOR, "> 0"},
476 applyIfAnd = {"MaxVectorSize", ">=32", "UseCompactObjectHeaders", "false"},
477 applyIfPlatform = {"64-bit", "true"},
478 applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
// Both IR rules expect the same vector nodes; they differ only in which flag must be
// false (AlignVector or UseCompactObjectHeaders) for the rule to apply.
479 // Split the load
480 //
481 //  0 1 2 3 4 5 - -
482 //  | |  \ \ \ \
483 //  | |   \ \ \ \
484 //  | |    \ \ \ \
485 //  0 1 - - 4 5 6 7
486 //
// Top row: indices read from a; bottom row: indices written in b (offsets from i).
// The six loads a[i+0..5] are contiguous, while the stores go to b[i+0..1] and b[i+4..7];
// b[i+2] and b[i+3] are not written. Returns { a, b }.
487 static Object[] test2a(int[] a, int[] b, int mask) {
488 for (int i = 0; i < RANGE; i+=8) {
489 int b0 = a[i+0] & mask;
490 int b1 = a[i+1] & mask;
491 int b2 = a[i+2] & mask;
492 int b3 = a[i+3] & mask;
493 int b4 = a[i+4] & mask;
494 int b5 = a[i+5] & mask;
495
496 b[i+0] = b0;
497 b[i+1] = b1;
498
499 b[i+4] = b2;
500 b[i+5] = b3;
501 b[i+6] = b4;
502 b[i+7] = b5;
503 // With AlignVector, we need 8-byte alignment of vector loads/stores.
504 // UseCompactObjectHeaders=false: adr = base + 16 + 32*i -> always aligned -> vectorize
505 // UseCompactObjectHeaders=true:  adr = base + 12 + 32*i -> never aligned -> no vectorization
506 // (presumably i counts loop steps here: one step of 8 ints is 32 bytes)
507 }
508 return new Object[]{ a, b };
509 }
510
511 @Test
512 @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0",
513 IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
514 IRNode.AND_VI, IRNode.VECTOR_SIZE_2, "> 0",
515 IRNode.AND_VI, IRNode.VECTOR_SIZE_4, "> 0",
516 IRNode.STORE_VECTOR, "> 0"},
517 applyIfAnd = {"MaxVectorSize", ">=32", "AlignVector", "false"},
518 applyIfPlatform = {"64-bit", "true"},
519 applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
520 @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0",
521 IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
522 IRNode.AND_VI, IRNode.VECTOR_SIZE_2, "> 0",
523 IRNode.AND_VI, IRNode.VECTOR_SIZE_4, "> 0",
524 IRNode.STORE_VECTOR, "> 0"},
525 applyIfAnd = {"MaxVectorSize", ">=32", "UseCompactObjectHeaders", "false"},
526 applyIfPlatform = {"64-bit", "true"},
527 applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
// Both IR rules expect the same vector nodes; they differ only in which flag must be
// false (AlignVector or UseCompactObjectHeaders) for the rule to apply.
528 // Split the load
529 //
530 //  0 1 2 3 4 5 - -
531 //  | | | |  \ \
532 //  | | | |   \ \
533 //  | | | |    \ \
534 //  0 1 2 3 - - 6 7
535 //
// Top row: indices read from a; bottom row: indices written in b (offsets from i).
// The six loads a[i+0..5] are contiguous, while the stores go to b[i+0..3] and b[i+6..7];
// b[i+4] and b[i+5] are not written. Returns { a, b }.
536 static Object[] test2b(int[] a, int[] b, int mask) {
537 for (int i = 0; i < RANGE; i+=8) {
538 int b0 = a[i+0] & mask;
539 int b1 = a[i+1] & mask;
540 int b2 = a[i+2] & mask;
541 int b3 = a[i+3] & mask;
542 int b4 = a[i+4] & mask;
543 int b5 = a[i+5] & mask;
544
545 b[i+0] = b0;
546 b[i+1] = b1;
547 b[i+2] = b2;
548 b[i+3] = b3;
549
550 b[i+6] = b4;
551 b[i+7] = b5;
552 // With AlignVector, we need 8-byte alignment of vector loads/stores.
553 // UseCompactObjectHeaders=false: adr = base + 16 + 32*i -> always aligned -> vectorize
554 // UseCompactObjectHeaders=true:  adr = base + 12 + 32*i -> never aligned -> no vectorization
555 // (presumably i counts loop steps here: one step of 8 ints is 32 bytes)
556 }
557 return new Object[]{ a, b };
558 }
559
560 @Test
561 @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0",
562 IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
563 IRNode.AND_VI, IRNode.VECTOR_SIZE_2, "> 0",
564 IRNode.AND_VI, IRNode.VECTOR_SIZE_4, "> 0",
565 IRNode.STORE_VECTOR, "> 0"},
566 applyIfAnd = {"MaxVectorSize", ">=32", "AlignVector", "false"},
567 applyIfPlatform = {"64-bit", "true"},
568 applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
569 @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0",
570 IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
571 IRNode.AND_VI, IRNode.VECTOR_SIZE_2, "> 0",
572 IRNode.AND_VI, IRNode.VECTOR_SIZE_4, "> 0",
573 IRNode.STORE_VECTOR, "> 0"},
574 applyIfAnd = {"MaxVectorSize", ">=32", "UseCompactObjectHeaders", "false"},
575 applyIfPlatform = {"64-bit", "true"},
576 applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
// Both IR rules expect the same vector nodes; they differ only in which flag must be
// false (AlignVector or UseCompactObjectHeaders) for the rule to apply.
577 // Split the load
578 //
579 //  0 1 - - 4 5 6 7
580 //  | |    / / / /
581 //  | |   / / / /
582 //  | |  / / / /
583 //  0 1 2 3 4 5 - -
584 //
// Top row: indices read from a; bottom row: indices written in b (offsets from i).
// The loads come from a[i+0..1] and a[i+4..7] (a[i+2..3] are not read), while the six
// stores b[i+0..5] are contiguous; b[i+6] and b[i+7] are not written. Returns { a, b }.
// NOTE(review): here the stores are the contiguous side, so the "Split the load" title
// looks copied from test2a/test2b - presumably the store side is what gets split; confirm.
585 static Object[] test2c(int[] a, int[] b, int mask) {
586 for (int i = 0; i < RANGE; i+=8) {
587 int b0 = a[i+0] & mask;
588 int b1 = a[i+1] & mask;
589
590 int b4 = a[i+4] & mask;
591 int b5 = a[i+5] & mask;
592 int b6 = a[i+6] & mask;
593 int b7 = a[i+7] & mask;
594
595 b[i+0] = b0;
596 b[i+1] = b1;
597 b[i+2] = b4;
598 b[i+3] = b5;
599 b[i+4] = b6;
600 b[i+5] = b7;
601 // With AlignVector, we need 8-byte alignment of vector loads/stores.
602 // UseCompactObjectHeaders=false: adr = base + 16 + 32*i -> always aligned -> vectorize
603 // UseCompactObjectHeaders=true:  adr = base + 12 + 32*i -> never aligned -> no vectorization
604 // (presumably i counts loop steps here: one step of 8 ints is 32 bytes)
605 }
606 return new Object[]{ a, b };
607 }
608
609 @Test
610 @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0",
611 IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
612 IRNode.AND_VI, IRNode.VECTOR_SIZE_2, "> 0",
613 IRNode.AND_VI, IRNode.VECTOR_SIZE_4, "> 0",
614 IRNode.STORE_VECTOR, "> 0"},
615 applyIfAnd = {"MaxVectorSize", ">=32", "AlignVector", "false"},
616 applyIfPlatform = {"64-bit", "true"},
617 applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
618 @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0",
619 IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
620 IRNode.AND_VI, IRNode.VECTOR_SIZE_2, "> 0",
621 IRNode.AND_VI, IRNode.VECTOR_SIZE_4, "> 0",
622 IRNode.STORE_VECTOR, "> 0"},
623 applyIfAnd = {"MaxVectorSize", ">=32", "UseCompactObjectHeaders", "false"},
624 applyIfPlatform = {"64-bit", "true"},
625 applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
// Both IR rules expect the same vector nodes; they differ only in which flag must be
// false (AlignVector or UseCompactObjectHeaders) for the rule to apply.
626 // Split the load
627 //
628 //  0 1 2 3 - - 6 7
629 //  | | | |    / /
630 //  | | | |   / /
631 //  | | | |  / /
632 //  0 1 2 3 4 5 - -
633 //
// Top row: indices read from a; bottom row: indices written in b (offsets from i).
// The loads come from a[i+0..3] and a[i+6..7] (a[i+4..5] are not read), while the six
// stores b[i+0..5] are contiguous; b[i+6] and b[i+7] are not written. Returns { a, b }.
// NOTE(review): here the stores are the contiguous side, so the "Split the load" title
// looks copied from test2a/test2b - presumably the store side is what gets split; confirm.
634 static Object[] test2d(int[] a, int[] b, int mask) {
635 for (int i = 0; i < RANGE; i+=8) {
636 int b0 = a[i+0] & mask;
637 int b1 = a[i+1] & mask;
638 int b2 = a[i+2] & mask;
639 int b3 = a[i+3] & mask;
640
641 int b6 = a[i+6] & mask;
642 int b7 = a[i+7] & mask;
643
644 b[i+0] = b0;
645 b[i+1] = b1;
646 b[i+2] = b2;
647 b[i+3] = b3;
648 b[i+4] = b6;
649 b[i+5] = b7;
650 // With AlignVector, we need 8-byte alignment of vector loads/stores.
651 // UseCompactObjectHeaders=false: adr = base + 16 + 32*i -> always aligned -> vectorize
652 // UseCompactObjectHeaders=true:  adr = base + 12 + 32*i -> never aligned -> no vectorization
653 // (presumably i counts loop steps here: one step of 8 ints is 32 bytes)
654 }
655 return new Object[]{ a, b };
656 }
657
658 @Test
659 @IR(counts = {IRNode.LOAD_VECTOR_S, IRNode.VECTOR_SIZE_4, "> 0",
660 IRNode.STORE_VECTOR, "> 0"},
661 applyIfAnd = {"MaxVectorSize", ">=32", "AlignVector", "false"},
662 applyIfPlatform = {"64-bit", "true"},
663 applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
664 @IR(counts = {IRNode.LOAD_VECTOR_S, IRNode.VECTOR_SIZE_4, "> 0",
665 IRNode.STORE_VECTOR, "> 0"},
666 applyIfAnd = {"MaxVectorSize", ">=32", "UseCompactObjectHeaders", "false"},
667 applyIfPlatform = {"64-bit", "true"},
668 applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
// Both IR rules expect the same vector nodes; they differ only in which flag must be
// false (AlignVector or UseCompactObjectHeaders) for the rule to apply.
// Top row: indices read from a; bottom row: indices written in b (offsets from i).
// "+" marks the values summed into sum, "v" marks the stores of val.
669 //  0 1 2 3 4 5 6 7 -
670 //  | | | | | | | |
671 //  | + + + | | | |
672 //  |       | | | |
673 //  |     v | | | | v
674 //  |     | | | | | |
675 //  0 - - 3 4 5 6 7 8
// Per step of 16 shorts: copies a[i+0] and a[i+4..7] to the same indices of b, adds
// a[i+1] + a[i+2] + a[i+3] to sum, and stores val at b[i+3] and b[i+8].
// Returns { a, b, { sum } }.
676 static Object[] test3a(short[] a, short[] b, short val) {
677 int sum = 0;
678 for (int i = 0; i < RANGE; i+=16) {
679 short a0 = a[i+0]; // required for alignment / offsets, technical limitation.
680
681 short a1 = a[i+1]; // adjacent to 4-pack, but need to be split off
682 short a2 = a[i+2];
683 short a3 = a[i+3];
684
685 short a4 = a[i+4]; // 4-pack
686 short a5 = a[i+5];
687 short a6 = a[i+6];
688 short a7 = a[i+7];
689
690
691 b[i+0] = a0; // required for alignment / offsets, technical limitation.
692
693 sum += a1 + a2 + a3; // not packed
694
695 b[i+3] = val; // adjacent to 4-pack but needs to be split off
696
697 b[i+4] = a4; // 4-pack
698 b[i+5] = a5;
699 b[i+6] = a6;
700 b[i+7] = a7;
701
702 b[i+8] = val; // adjacent to 4-pack but needs to be split off
703
704 // With AlignVector, we need 8-byte alignment of vector loads/stores.
705 // UseCompactObjectHeaders=false: adr = base + 16 + 8 + 32*i -> always aligned -> vectorize
706 // UseCompactObjectHeaders=true:  adr = base + 12 + 8 + 32*i -> never aligned -> no vectorization
707 // (presumably i counts loop steps: 16 shorts are 32 bytes; 8 is the byte offset of index i+4)
708 }
709 return new Object[]{ a, b, new int[]{ sum } };
710 }
711
712 @Test
713 @IR(counts = {IRNode.LOAD_VECTOR_S, IRNode.VECTOR_SIZE_2, "> 0",
714 IRNode.STORE_VECTOR, "> 0"},
715 applyIfPlatform = {"64-bit", "true"},
716 applyIfCPUFeatureOr = {"sse4.1", "true"})
717 // Cyclic dependency with distance 2 -> split into 2-packs
// Copies a[i] to b[i+2] for every i in [0, RANGE-64). The rule expects 2-element short
// load vectors and at least one store vector. Returns { a, b }.
// NOTE(review): the distance-2 dependency presumably relies on a and b possibly being
// the same array - confirm against the caller.
718 static Object[] test4a(short[] a, short[] b) {
719 for (int i = 0; i < RANGE-64; i++) {
720 b[i+2] = a[i+0];
721 }
722 return new Object[]{ a, b };
723 }
724
725 @Test
726 @IR(counts = {IRNode.LOAD_VECTOR_S, IRNode.VECTOR_SIZE_2, "> 0",
727 IRNode.STORE_VECTOR, "> 0"},
831
832 b[i+ 8] = (short)(a[i+ 8] + val); // 4-pack
833 b[i+ 9] = (short)(a[i+ 9] + val);
834 b[i+10] = (short)(a[i+10] + val);
835 b[i+11] = (short)(a[i+11] + val);
836
837 b[i+12] = (short)(a[i+12] + val); // 2-pack
838 b[i+13] = (short)(a[i+13] + val);
839
840 b[i+14] = (short)(a[i+14] + val);
841 }
842 return new Object[]{ a, b };
843 }
844
845 @Test
846 @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
847 IRNode.MUL_VI, IRNode.VECTOR_SIZE_4, "> 0",
848 IRNode.AND_VI, IRNode.VECTOR_SIZE_4, "> 0",
849 IRNode.ADD_VI, IRNode.VECTOR_SIZE_4, "> 0", // reduction moved out of loop
850 IRNode.ADD_REDUCTION_V, "> 0"},
851 applyIfAnd = {"MaxVectorSize", ">=32", "AlignVector", "false"},
852 applyIfPlatform = {"64-bit", "true"},
853 applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
854 @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
855 IRNode.MUL_VI, IRNode.VECTOR_SIZE_4, "> 0",
856 IRNode.AND_VI, IRNode.VECTOR_SIZE_4, "> 0",
857 IRNode.ADD_VI, IRNode.VECTOR_SIZE_4, "> 0", // reduction moved out of loop
858 IRNode.ADD_REDUCTION_V, "> 0"},
859 applyIfAnd = {"MaxVectorSize", ">=32", "UseCompactObjectHeaders", "false"},
860 applyIfPlatform = {"64-bit", "true"},
861 applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
// Both IR rules expect the same vector nodes; they differ only in which flag must be
// false (AlignVector or UseCompactObjectHeaders) for the rule to apply.
862 // Split packs including reductions
// Per step of 8 elements: adds a[k] * b[k] for k = i+0..i+3 and a[k] & b[k] for
// k = i+4..i+7 to the single accumulator s. Neither array is written.
// Returns { a, b, { s } }.
863 static Object[] test6a(int[] a, int[] b) {
864 int s = 0;
865 for (int i = 0; i < RANGE; i+=8) {
866 s += a[i+0] * b[i+0];
867 s += a[i+1] * b[i+1];
868 s += a[i+2] * b[i+2];
869 s += a[i+3] * b[i+3];
870
871 s += a[i+4] & b[i+4];
872 s += a[i+5] & b[i+5];
873 s += a[i+6] & b[i+6];
874 s += a[i+7] & b[i+7];
875 // With AlignVector, we need 8-byte alignment of vector loads/stores.
876 // UseCompactObjectHeaders=false: adr = base + 16 + 32*i -> always aligned -> vectorize
877 // UseCompactObjectHeaders=true:  adr = base + 12 + 32*i -> never aligned -> no vectorization
878 // (presumably i counts loop steps here: one step of 8 ints is 32 bytes)
879 }
880 return new Object[]{ a, b, new int[]{ s } };
881 }
882
883 @Test
884 @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0",
885 IRNode.MUL_VI, "> 0",
886 IRNode.POPULATE_INDEX, "> 0"},
887 applyIfPlatform = {"64-bit", "true"},
888 applyIfCPUFeatureOr = {"avx2", "true", "sve", "true"})
889 // Index Populate:
890 // There can be an issue when all the (iv + 1), (iv + 2), ...
891 // get packed, but not (iv). Then we have a pack that is one element
892 // too short, and we start splitting everything in a bad way.
// Stores a[i] = b[i] * i for every i in [0, RANGE), so the loop index itself is one of
// the multiplied values. The rule expects int load, multiply and populate-index nodes,
// without constraining their vector size. Returns { a, b }.
893 static Object[] test7a(int[] a, int[] b) {
894 for (int i = 0; i < RANGE; i++) {
895 a[i] = b[i] * i;
896 }
897 return new Object[]{ a, b };
898 }
|
261 }
262 }
263 }
264
265 static void verifyL(String name, int i, long[] g, long[] r) {
266 for (int j = 0; j < g.length; j++) {
267 if (g[j] != r[j]) {
268 throw new RuntimeException("verify " + name + ": arrays must have same content:" +
269 " gold[" + i + "][" + j + "] = " + g[j] +
270 " result[" + i + "][" + j + "] = " + r[j]);
271 }
272 }
273 }
274
275 @Test
276 @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0",
277 IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
278 IRNode.AND_VI, IRNode.VECTOR_SIZE_2, "> 0",
279 IRNode.AND_VI, IRNode.VECTOR_SIZE_4, "> 0",
280 IRNode.STORE_VECTOR, "> 0"},
281 applyIf = {"MaxVectorSize", ">=32"},
282 applyIfPlatform = {"64-bit", "true"},
283 applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
284 // Load and store are already split
285 //
286 //  0 1 - - 4 5 6 7
287 //  | |     | | | |
288 //  0 1 - - 4 5 6 7
// Per step of 8 elements: reads a[i+0..1] and a[i+4..7], ANDs each value with mask and
// stores it at the same index of b. Indices i+2 and i+3 are neither read nor written.
// The rule expects both 2-element and 4-element int load/AND vectors. Returns { a, b }.
289 static Object[] test0(int[] a, int[] b, int mask) {
290 for (int i = 0; i < RANGE; i+=8) {
291 int b0 = a[i+0] & mask;
292 int b1 = a[i+1] & mask;
293
294 int b4 = a[i+4] & mask;
295 int b5 = a[i+5] & mask;
296 int b6 = a[i+6] & mask;
297 int b7 = a[i+7] & mask;
298
299 b[i+0] = b0;
300 b[i+1] = b1;
301
302 b[i+4] = b4;
303 b[i+5] = b5;
304 b[i+6] = b6;
305 b[i+7] = b7;
306 }
307 return new Object[]{ a, b };
308 }
309
310 @Test
311 @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0",
312 IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
313 IRNode.ADD_VI, IRNode.VECTOR_SIZE_4, "> 0",
314 IRNode.MUL_VI, IRNode.VECTOR_SIZE_2, "> 0",
315 IRNode.STORE_VECTOR, "> 0"},
316 applyIf = {"MaxVectorSize", ">=32"},
317 applyIfPlatform = {"64-bit", "true"},
318 applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
319 // Adjacent Load and Store, but split by Add/Mul
// Per step of 8 elements: b[i+0..3] = a[i+0..3] + mask and b[i+4..5] = a[i+4..5] * mask.
// Indices i+6 and i+7 are not touched. The six loads/stores are adjacent, but the rule
// expects a 4-element ADD_VI and a 2-element MUL_VI. Returns { a, b }.
320 static Object[] test1a(int[] a, int[] b, int mask) {
321 for (int i = 0; i < RANGE; i+=8) {
322 b[i+0] = a[i+0] + mask; // Add
323 b[i+1] = a[i+1] + mask;
324 b[i+2] = a[i+2] + mask;
325 b[i+3] = a[i+3] + mask;
326
327 b[i+4] = a[i+4] * mask; // Mul
328 b[i+5] = a[i+5] * mask;
329 }
330 return new Object[]{ a, b };
331 }
332
333 @Test
334 @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0",
335 IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
336 IRNode.ADD_VI, IRNode.VECTOR_SIZE_2, "> 0",
337 IRNode.MUL_VI, IRNode.VECTOR_SIZE_4, "> 0",
338 IRNode.STORE_VECTOR, "> 0"},
339 applyIf = {"MaxVectorSize", ">=32"},
340 applyIfPlatform = {"64-bit", "true"},
341 applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
342 // Adjacent Load and Store, but split by Add/Mul
// Per step of 8 elements: b[i+0..3] = a[i+0..3] * mask and b[i+4..5] = a[i+4..5] + mask.
// Indices i+6 and i+7 are not touched. The six loads/stores are adjacent, but the rule
// expects a 4-element MUL_VI and a 2-element ADD_VI. Returns { a, b }.
343 static Object[] test1b(int[] a, int[] b, int mask) {
344 for (int i = 0; i < RANGE; i+=8) {
345 b[i+0] = a[i+0] * mask; // Mul
346 b[i+1] = a[i+1] * mask;
347 b[i+2] = a[i+2] * mask;
348 b[i+3] = a[i+3] * mask;
349
350 b[i+4] = a[i+4] + mask; // Add
351 b[i+5] = a[i+5] + mask;
352 }
353 return new Object[]{ a, b };
354 }
355
356 @Test
357 @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0",
358 IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
359 IRNode.ADD_VI, IRNode.VECTOR_SIZE_2, "> 0",
360 IRNode.MUL_VI, IRNode.VECTOR_SIZE_4, "> 0",
361 IRNode.STORE_VECTOR, "> 0"},
362 applyIf = {"MaxVectorSize", ">=32"},
363 applyIfPlatform = {"64-bit", "true"},
364 applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true"})
365 // Adjacent Load and Store, but split by Add/Mul
// Per step of 8 elements: b[i+0..1] = a[i+0..1] + mask and b[i+2..5] = a[i+2..5] * mask.
// Indices i+6 and i+7 are not touched. Here the 2-element group comes first: the rule
// expects a 2-element ADD_VI and a 4-element MUL_VI. Returns { a, b }.
366 static Object[] test1c(int[] a, int[] b, int mask) {
367 for (int i = 0; i < RANGE; i+=8) {
368 b[i+0] = a[i+0] + mask; // Add
369 b[i+1] = a[i+1] + mask;
370
371 b[i+2] = a[i+2] * mask; // Mul
372 b[i+3] = a[i+3] * mask;
373 b[i+4] = a[i+4] * mask;
374 b[i+5] = a[i+5] * mask;
375 }
376 return new Object[]{ a, b };
377 }
378
379 @Test
380 @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0",
381 IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
382 IRNode.ADD_VI, IRNode.VECTOR_SIZE_4, "> 0",
383 IRNode.MUL_VI, IRNode.VECTOR_SIZE_2, "> 0",
384 IRNode.STORE_VECTOR, "> 0"},
385 applyIf = {"MaxVectorSize", ">=32"},
386 applyIfPlatform = {"64-bit", "true"},
387 applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true"})
388 // Adjacent Load and Store, but split by Add/Mul
// Per step of 8 elements: b[i+0..1] = a[i+0..1] * mask and b[i+2..5] = a[i+2..5] + mask.
// Indices i+6 and i+7 are not touched. Here the 2-element group comes first: the rule
// expects a 2-element MUL_VI and a 4-element ADD_VI. Returns { a, b }.
389 static Object[] test1d(int[] a, int[] b, int mask) {
390 for (int i = 0; i < RANGE; i+=8) {
391 b[i+0] = a[i+0] * mask; // Mul
392 b[i+1] = a[i+1] * mask;
393
394 b[i+2] = a[i+2] + mask; // Add
395 b[i+3] = a[i+3] + mask;
396 b[i+4] = a[i+4] + mask;
397 b[i+5] = a[i+5] + mask;
398 }
399 return new Object[]{ a, b };
400 }
401
402 @Test
403 @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0",
404 IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
405 IRNode.AND_VI, IRNode.VECTOR_SIZE_2, "> 0",
406 IRNode.AND_VI, IRNode.VECTOR_SIZE_4, "> 0",
407 IRNode.STORE_VECTOR, "> 0"},
408 applyIf = {"MaxVectorSize", ">=32"},
409 applyIfPlatform = {"64-bit", "true"},
410 applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
411 // Split the load
412 //
413 //  0 1 2 3 4 5 - -
414 //  | |  \ \ \ \
415 //  | |   \ \ \ \
416 //  | |    \ \ \ \
417 //  0 1 - - 4 5 6 7
418 //
// Top row: indices read from a; bottom row: indices written in b (offsets from i).
// The six loads a[i+0..5] are contiguous, while the stores go to b[i+0..1] and b[i+4..7];
// b[i+2] and b[i+3] are not written. Returns { a, b }.
419 static Object[] test2a(int[] a, int[] b, int mask) {
420 for (int i = 0; i < RANGE; i+=8) {
421 int b0 = a[i+0] & mask;
422 int b1 = a[i+1] & mask;
423 int b2 = a[i+2] & mask;
424 int b3 = a[i+3] & mask;
425 int b4 = a[i+4] & mask;
426 int b5 = a[i+5] & mask;
427
428 b[i+0] = b0;
429 b[i+1] = b1;
430
431 b[i+4] = b2;
432 b[i+5] = b3;
433 b[i+6] = b4;
434 b[i+7] = b5;
435 }
436 return new Object[]{ a, b };
437 }
438
439 @Test
440 @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0",
441 IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
442 IRNode.AND_VI, IRNode.VECTOR_SIZE_2, "> 0",
443 IRNode.AND_VI, IRNode.VECTOR_SIZE_4, "> 0",
444 IRNode.STORE_VECTOR, "> 0"},
445 applyIf = {"MaxVectorSize", ">=32"},
446 applyIfPlatform = {"64-bit", "true"},
447 applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
448 // Split the load
449 //
450 //  0 1 2 3 4 5 - -
451 //  | | | |  \ \
452 //  | | | |   \ \
453 //  | | | |    \ \
454 //  0 1 2 3 - - 6 7
455 //
// Top row: indices read from a; bottom row: indices written in b (offsets from i).
// The six loads a[i+0..5] are contiguous, while the stores go to b[i+0..3] and b[i+6..7];
// b[i+4] and b[i+5] are not written. Returns { a, b }.
456 static Object[] test2b(int[] a, int[] b, int mask) {
457 for (int i = 0; i < RANGE; i+=8) {
458 int b0 = a[i+0] & mask;
459 int b1 = a[i+1] & mask;
460 int b2 = a[i+2] & mask;
461 int b3 = a[i+3] & mask;
462 int b4 = a[i+4] & mask;
463 int b5 = a[i+5] & mask;
464
465 b[i+0] = b0;
466 b[i+1] = b1;
467 b[i+2] = b2;
468 b[i+3] = b3;
469
470 b[i+6] = b4;
471 b[i+7] = b5;
472 }
473 return new Object[]{ a, b };
474 }
475
476 @Test
477 @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0",
478 IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
479 IRNode.AND_VI, IRNode.VECTOR_SIZE_2, "> 0",
480 IRNode.AND_VI, IRNode.VECTOR_SIZE_4, "> 0",
481 IRNode.STORE_VECTOR, "> 0"},
482 applyIf = {"MaxVectorSize", ">=32"},
483 applyIfPlatform = {"64-bit", "true"},
484 applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
485 // Split the load
486 //
487 //  0 1 - - 4 5 6 7
488 //  | |    / / / /
489 //  | |   / / / /
490 //  | |  / / / /
491 //  0 1 2 3 4 5 - -
492 //
// Top row: indices read from a; bottom row: indices written in b (offsets from i).
// The loads come from a[i+0..1] and a[i+4..7] (a[i+2..3] are not read), while the six
// stores b[i+0..5] are contiguous; b[i+6] and b[i+7] are not written. Returns { a, b }.
// NOTE(review): here the stores are the contiguous side, so the "Split the load" title
// looks copied from test2a/test2b - presumably the store side is what gets split; confirm.
493 static Object[] test2c(int[] a, int[] b, int mask) {
494 for (int i = 0; i < RANGE; i+=8) {
495 int b0 = a[i+0] & mask;
496 int b1 = a[i+1] & mask;
497
498 int b4 = a[i+4] & mask;
499 int b5 = a[i+5] & mask;
500 int b6 = a[i+6] & mask;
501 int b7 = a[i+7] & mask;
502
503 b[i+0] = b0;
504 b[i+1] = b1;
505 b[i+2] = b4;
506 b[i+3] = b5;
507 b[i+4] = b6;
508 b[i+5] = b7;
509 }
510 return new Object[]{ a, b };
511 }
512
513 @Test
514 @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0",
515 IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
516 IRNode.AND_VI, IRNode.VECTOR_SIZE_2, "> 0",
517 IRNode.AND_VI, IRNode.VECTOR_SIZE_4, "> 0",
518 IRNode.STORE_VECTOR, "> 0"},
519 applyIf = {"MaxVectorSize", ">=32"},
520 applyIfPlatform = {"64-bit", "true"},
521 applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
522 // Split the load
523 //
524 //  0 1 2 3 - - 6 7
525 //  | | | |    / /
526 //  | | | |   / /
527 //  | | | |  / /
528 //  0 1 2 3 4 5 - -
529 //
// Top row: indices read from a; bottom row: indices written in b (offsets from i).
// The loads come from a[i+0..3] and a[i+6..7] (a[i+4..5] are not read), while the six
// stores b[i+0..5] are contiguous; b[i+6] and b[i+7] are not written. Returns { a, b }.
// NOTE(review): here the stores are the contiguous side, so the "Split the load" title
// looks copied from test2a/test2b - presumably the store side is what gets split; confirm.
530 static Object[] test2d(int[] a, int[] b, int mask) {
531 for (int i = 0; i < RANGE; i+=8) {
532 int b0 = a[i+0] & mask;
533 int b1 = a[i+1] & mask;
534 int b2 = a[i+2] & mask;
535 int b3 = a[i+3] & mask;
536
537 int b6 = a[i+6] & mask;
538 int b7 = a[i+7] & mask;
539
540 b[i+0] = b0;
541 b[i+1] = b1;
542 b[i+2] = b2;
543 b[i+3] = b3;
544 b[i+4] = b6;
545 b[i+5] = b7;
546 }
547 return new Object[]{ a, b };
548 }
549
550 @Test
551 @IR(counts = {IRNode.LOAD_VECTOR_S, IRNode.VECTOR_SIZE_4, "> 0",
552 IRNode.STORE_VECTOR, "> 0"},
553 applyIf = {"MaxVectorSize", ">=32"},
554 applyIfPlatform = {"64-bit", "true"},
555 applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
// Top row: indices read from a; bottom row: indices written in b (offsets from i).
// "+" marks the values summed into sum, "v" marks the stores of val.
556 //  0 1 2 3 4 5 6 7 -
557 //  | | | | | | | |
558 //  | + + + | | | |
559 //  |       | | | |
560 //  |     v | | | | v
561 //  |     | | | | | |
562 //  0 - - 3 4 5 6 7 8
// Per step of 16 shorts: copies a[i+0] and a[i+4..7] to the same indices of b, adds
// a[i+1] + a[i+2] + a[i+3] to sum, and stores val at b[i+3] and b[i+8].
// Returns { a, b, { sum } }.
563 static Object[] test3a(short[] a, short[] b, short val) {
564 int sum = 0;
565 for (int i = 0; i < RANGE; i+=16) {
566 short a0 = a[i+0]; // required for alignment / offsets, technical limitation.
567
568 short a1 = a[i+1]; // adjacent to 4-pack, but need to be split off
569 short a2 = a[i+2];
570 short a3 = a[i+3];
571
572 short a4 = a[i+4]; // 4-pack
573 short a5 = a[i+5];
574 short a6 = a[i+6];
575 short a7 = a[i+7];
576
577
578 b[i+0] = a0; // required for alignment / offsets, technical limitation.
579
580 sum += a1 + a2 + a3; // not packed
581
582 b[i+3] = val; // adjacent to 4-pack but needs to be split off
583
584 b[i+4] = a4; // 4-pack
585 b[i+5] = a5;
586 b[i+6] = a6;
587 b[i+7] = a7;
588
589 b[i+8] = val; // adjacent to 4-pack but needs to be split off
590 }
591 return new Object[]{ a, b, new int[]{ sum } };
592 }
593
594 @Test
595 @IR(counts = {IRNode.LOAD_VECTOR_S, IRNode.VECTOR_SIZE_2, "> 0",
596 IRNode.STORE_VECTOR, "> 0"},
597 applyIfPlatform = {"64-bit", "true"},
598 applyIfCPUFeatureOr = {"sse4.1", "true"})
599 // Cyclic dependency with distance 2 -> split into 2-packs
// Copies a[i] to b[i+2] for every i in [0, RANGE-64). The rule expects 2-element short
// load vectors and at least one store vector. Returns { a, b }.
// NOTE(review): the distance-2 dependency presumably relies on a and b possibly being
// the same array - confirm against the caller.
600 static Object[] test4a(short[] a, short[] b) {
601 for (int i = 0; i < RANGE-64; i++) {
602 b[i+2] = a[i+0];
603 }
604 return new Object[]{ a, b };
605 }
606
607 @Test
608 @IR(counts = {IRNode.LOAD_VECTOR_S, IRNode.VECTOR_SIZE_2, "> 0",
609 IRNode.STORE_VECTOR, "> 0"},
713
714 b[i+ 8] = (short)(a[i+ 8] + val); // 4-pack
715 b[i+ 9] = (short)(a[i+ 9] + val);
716 b[i+10] = (short)(a[i+10] + val);
717 b[i+11] = (short)(a[i+11] + val);
718
719 b[i+12] = (short)(a[i+12] + val); // 2-pack
720 b[i+13] = (short)(a[i+13] + val);
721
722 b[i+14] = (short)(a[i+14] + val);
723 }
724 return new Object[]{ a, b };
725 }
726
727 @Test
728 @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
729 IRNode.MUL_VI, IRNode.VECTOR_SIZE_4, "> 0",
730 IRNode.AND_VI, IRNode.VECTOR_SIZE_4, "> 0",
731 IRNode.ADD_VI, IRNode.VECTOR_SIZE_4, "> 0", // reduction moved out of loop
732 IRNode.ADD_REDUCTION_V, "> 0"},
733 applyIf = {"MaxVectorSize", ">=32"},
734 applyIfPlatform = {"64-bit", "true"},
735 applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
736 // Split packs including reductions
// Per step of 8 elements: adds a[k] * b[k] for k = i+0..i+3 and a[k] & b[k] for
// k = i+4..i+7 to the single accumulator s. Neither array is written.
// Returns { a, b, { s } }.
737 static Object[] test6a(int[] a, int[] b) {
738 int s = 0;
739 for (int i = 0; i < RANGE; i+=8) {
740 s += a[i+0] * b[i+0];
741 s += a[i+1] * b[i+1];
742 s += a[i+2] * b[i+2];
743 s += a[i+3] * b[i+3];
744
745 s += a[i+4] & b[i+4];
746 s += a[i+5] & b[i+5];
747 s += a[i+6] & b[i+6];
748 s += a[i+7] & b[i+7];
749 }
750 return new Object[]{ a, b, new int[]{ s } };
751 }
752
753 @Test
754 @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0",
755 IRNode.MUL_VI, "> 0",
756 IRNode.POPULATE_INDEX, "> 0"},
757 applyIfPlatform = {"64-bit", "true"},
758 applyIfCPUFeatureOr = {"avx2", "true", "sve", "true"})
759 // Index Populate:
760 // There can be an issue when all the (iv + 1), (iv + 2), ...
761 // get packed, but not (iv). Then we have a pack that is one element
762 // too short, and we start splitting everything in a bad way.
// Stores a[i] = b[i] * i for every i in [0, RANGE), so the loop index itself is one of
// the multiplied values. The rule expects int load, multiply and populate-index nodes,
// without constraining their vector size. Returns { a, b }.
763 static Object[] test7a(int[] a, int[] b) {
764 for (int i = 0; i < RANGE; i++) {
765 a[i] = b[i] * i;
766 }
767 return new Object[]{ a, b };
768 }
|