Sdiff test/hotspot/jtreg/compiler/loopopts/superword/TestSplitPacks.java

test/hotspot/jtreg/compiler/loopopts/superword/TestSplitPacks.java

261             }
262         }
263     }
264 
265     static void verifyL(String name, int i, long[] g, long[] r) {
266         for (int j = 0; j < g.length; j++) {
267             if (g[j] != r[j]) {
268                 throw new RuntimeException("verify " + name + ": arrays must have same content:" +
269                                            " gold[" + i + "][" + j + "] = " + g[j] +
270                                            " result[" + i + "][" + j + "] = " + r[j]);
271             }
272         }
273     }
274 
275     @Test
276     @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0",
277                   IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
278                   IRNode.AND_VI,        IRNode.VECTOR_SIZE_2, "> 0",
279                   IRNode.AND_VI,        IRNode.VECTOR_SIZE_4, "> 0",
280                   IRNode.STORE_VECTOR, "> 0"},
281         applyIfAnd = {"MaxVectorSize", ">=32", "AlignVector", "false"},
282         applyIfPlatform = {"64-bit", "true"},
283         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"})
284     @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0",
285                   IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
286                   IRNode.AND_VI,        IRNode.VECTOR_SIZE_2, "> 0",
287                   IRNode.AND_VI,        IRNode.VECTOR_SIZE_4, "> 0",
288                   IRNode.STORE_VECTOR, "> 0"},
289         applyIfAnd = {"MaxVectorSize", ">=32", "UseCompactObjectHeaders", "false"},
290         applyIfPlatform = {"64-bit", "true"},
291         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"})
292     // Load and store are already split
293     //
294     //  0 1 - - 4 5 6 7
295     //  | |     | | | |
296     //  0 1 - - 4 5 6 7
        //
        // Per 8-int stride: b[i+k] = a[i+k] & mask for k = 0, 1, 4, 5, 6, 7.
        // Indices i+2 and i+3 are neither read nor written.
        // Returns { a, b }, the same array objects that were passed in.
297     static Object[] test0(int[] a, int[] b, int mask) {
298         for (int i = 0; i < RANGE; i+=8) {
299             int b0 = a[i+0] & mask;
300             int b1 = a[i+1] & mask;
301 
302             int b4 = a[i+4] & mask;
303             int b5 = a[i+5] & mask;
304             int b6 = a[i+6] & mask;
305             int b7 = a[i+7] & mask;
306 
307             b[i+0] = b0;
308             b[i+1] = b1;
309 
310             b[i+4] = b4;
311             b[i+5] = b5;
312             b[i+6] = b6;
313             b[i+7] = b7;
314             // With AlignVector, we need 8-byte alignment of vector loads/stores.
315             // UseCompactObjectHeaders=false                 UseCompactObjectHeaders=true
316             // adr = base + 16 + 32*i  ->  always            adr = base + 12 + 32*i  ->  never
317             // -> vectorize                                  -> no vectorization
318         }
319         return new Object[]{ a, b };
320     }
321 
322     @Test
323     @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0",
324                   IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
325                   IRNode.ADD_VI,        IRNode.VECTOR_SIZE_4, "> 0",
326                   IRNode.MUL_VI,        IRNode.VECTOR_SIZE_2, "> 0",
327                   IRNode.STORE_VECTOR, "> 0"},
328         applyIfAnd = {"MaxVectorSize", ">=32", "AlignVector", "false"},
329         applyIfPlatform = {"64-bit", "true"},
330         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"})
331     @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0",
332                   IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
333                   IRNode.ADD_VI,        IRNode.VECTOR_SIZE_4, "> 0",
334                   IRNode.MUL_VI,        IRNode.VECTOR_SIZE_2, "> 0",
335                   IRNode.STORE_VECTOR, "> 0"},
336         applyIfAnd = {"MaxVectorSize", ">=32", "UseCompactObjectHeaders", "false"},
337         applyIfPlatform = {"64-bit", "true"},
338         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"})
339     // Adjacent Load and Store, but split by Add/Mul
        //
        // Per 8-int stride: b[i+0..3] = a[i+0..3] + mask and b[i+4..5] = a[i+4..5] * mask.
        // Indices i+6 and i+7 are neither read nor written.
        // Returns { a, b }, the same array objects that were passed in.
340     static Object[] test1a(int[] a, int[] b, int mask) {
341         for (int i = 0; i < RANGE; i+=8) {
342             b[i+0] = a[i+0] + mask; // Add
343             b[i+1] = a[i+1] + mask;
344             b[i+2] = a[i+2] + mask;
345             b[i+3] = a[i+3] + mask;
346 
347             b[i+4] = a[i+4] * mask; // Mul
348             b[i+5] = a[i+5] * mask;
349             // With AlignVector, we need 8-byte alignment of vector loads/stores.
350             // UseCompactObjectHeaders=false                 UseCompactObjectHeaders=true
351             // adr = base + 16 + 32*i  ->  always            adr = base + 12 + 32*i  ->  never
352             // -> vectorize                                  -> no vectorization
353         }
354         return new Object[]{ a, b };
355     }
356 
357     @Test
358     @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0",
359                   IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
360                   IRNode.ADD_VI,        IRNode.VECTOR_SIZE_2, "> 0",
361                   IRNode.MUL_VI,        IRNode.VECTOR_SIZE_4, "> 0",
362                   IRNode.STORE_VECTOR, "> 0"},
363         applyIfAnd = {"MaxVectorSize", ">=32", "AlignVector", "false"},
364         applyIfPlatform = {"64-bit", "true"},
365         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"})
366     @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0",
367                   IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
368                   IRNode.ADD_VI,        IRNode.VECTOR_SIZE_2, "> 0",
369                   IRNode.MUL_VI,        IRNode.VECTOR_SIZE_4, "> 0",
370                   IRNode.STORE_VECTOR, "> 0"},
371         applyIfAnd = {"MaxVectorSize", ">=32", "UseCompactObjectHeaders", "false"},
372         applyIfPlatform = {"64-bit", "true"},
373         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"})
374     // Adjacent Load and Store, but split by Add/Mul
        //
        // Per 8-int stride: b[i+0..3] = a[i+0..3] * mask and b[i+4..5] = a[i+4..5] + mask.
        // Indices i+6 and i+7 are neither read nor written.
        // Returns { a, b }, the same array objects that were passed in.
375     static Object[] test1b(int[] a, int[] b, int mask) {
376         for (int i = 0; i < RANGE; i+=8) {
377             b[i+0] = a[i+0] * mask; // Mul
378             b[i+1] = a[i+1] * mask;
379             b[i+2] = a[i+2] * mask;
380             b[i+3] = a[i+3] * mask;
381 
382             b[i+4] = a[i+4] + mask; // Add
383             b[i+5] = a[i+5] + mask;
384             // With AlignVector, we need 8-byte alignment of vector loads/stores.
385             // UseCompactObjectHeaders=false                 UseCompactObjectHeaders=true
386             // adr = base + 16 + 32*i  ->  always            adr = base + 12 + 32*i  ->  never
387             // -> vectorize                                  -> no vectorization
388         }
389         return new Object[]{ a, b };
390     }
391 
392     @Test
393     @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0",
394                   IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
395                   IRNode.ADD_VI,        IRNode.VECTOR_SIZE_2, "> 0",
396                   IRNode.MUL_VI,        IRNode.VECTOR_SIZE_4, "> 0",
397                   IRNode.STORE_VECTOR, "> 0"},
398         applyIfAnd = {"MaxVectorSize", ">=32", "AlignVector", "false"},
399         applyIfPlatform = {"64-bit", "true"},
400         applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true", "rvv", "true"})
401     @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0",
402                   IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
403                   IRNode.ADD_VI,        IRNode.VECTOR_SIZE_2, "> 0",
404                   IRNode.MUL_VI,        IRNode.VECTOR_SIZE_4, "> 0",
405                   IRNode.STORE_VECTOR, "> 0"},
406         applyIfAnd = {"MaxVectorSize", ">=32", "UseCompactObjectHeaders", "false"},
407         applyIfPlatform = {"64-bit", "true"},
408         applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true", "rvv", "true"})
409     // Adjacent Load and Store, but split by Add/Mul
        //
        // Per 8-int stride: b[i+0..1] = a[i+0..1] + mask and b[i+2..5] = a[i+2..5] * mask.
        // Indices i+6 and i+7 are neither read nor written.
        // Returns { a, b }, the same array objects that were passed in.
410     static Object[] test1c(int[] a, int[] b, int mask) {
411         for (int i = 0; i < RANGE; i+=8) {
412             b[i+0] = a[i+0] + mask; // Add
413             b[i+1] = a[i+1] + mask;
414 
415             b[i+2] = a[i+2] * mask; // Mul
416             b[i+3] = a[i+3] * mask;
417             b[i+4] = a[i+4] * mask;
418             b[i+5] = a[i+5] * mask;
419             // With AlignVector, we need 8-byte alignment of vector loads/stores.
420             // UseCompactObjectHeaders=false                 UseCompactObjectHeaders=true
421             // adr = base + 16 + 32*i  ->  always            adr = base + 12 + 32*i  ->  never
422             // -> vectorize                                  -> no vectorization
423         }
424         return new Object[]{ a, b };
425     }
426 
427     @Test
428     @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0",
429                   IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
430                   IRNode.ADD_VI,        IRNode.VECTOR_SIZE_4, "> 0",
431                   IRNode.MUL_VI,        IRNode.VECTOR_SIZE_2, "> 0",
432                   IRNode.STORE_VECTOR, "> 0"},
433         applyIfAnd = {"MaxVectorSize", ">=32", "AlignVector", "false"},
434         applyIfPlatform = {"64-bit", "true"},
435         applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true", "rvv", "true"})
436     @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0",
437                   IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
438                   IRNode.ADD_VI,        IRNode.VECTOR_SIZE_4, "> 0",
439                   IRNode.MUL_VI,        IRNode.VECTOR_SIZE_2, "> 0",
440                   IRNode.STORE_VECTOR, "> 0"},
441         applyIfAnd = {"MaxVectorSize", ">=32", "UseCompactObjectHeaders", "false"},
442         applyIfPlatform = {"64-bit", "true"},
443         applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true", "rvv", "true"})
444     // Adjacent Load and Store, but split by Add/Mul
        //
        // Per 8-int stride: b[i+0..1] = a[i+0..1] * mask and b[i+2..5] = a[i+2..5] + mask.
        // Indices i+6 and i+7 are neither read nor written.
        // Returns { a, b }, the same array objects that were passed in.
445     static Object[] test1d(int[] a, int[] b, int mask) {
446         for (int i = 0; i < RANGE; i+=8) {
447             b[i+0] = a[i+0] * mask; // Mul
448             b[i+1] = a[i+1] * mask;
449 
450             b[i+2] = a[i+2] + mask; // Add
451             b[i+3] = a[i+3] + mask;
452             b[i+4] = a[i+4] + mask;
453             b[i+5] = a[i+5] + mask;
454             // With AlignVector, we need 8-byte alignment of vector loads/stores.
455             // UseCompactObjectHeaders=false                 UseCompactObjectHeaders=true
456             // adr = base + 16 + 32*i  ->  always            adr = base + 12 + 32*i  ->  never
457             // -> vectorize                                  -> no vectorization
458         }
459         return new Object[]{ a, b };
460     }
461 
462     @Test
463     @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0",
464                   IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
465                   IRNode.AND_VI,        IRNode.VECTOR_SIZE_2, "> 0",
466                   IRNode.AND_VI,        IRNode.VECTOR_SIZE_4, "> 0",
467                   IRNode.STORE_VECTOR, "> 0"},
468         applyIfAnd = {"MaxVectorSize", ">=32", "AlignVector", "false"},
469         applyIfPlatform = {"64-bit", "true"},
470         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"})
471     @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0",
472                   IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
473                   IRNode.AND_VI,        IRNode.VECTOR_SIZE_2, "> 0",
474                   IRNode.AND_VI,        IRNode.VECTOR_SIZE_4, "> 0",
475                   IRNode.STORE_VECTOR, "> 0"},
476         applyIfAnd = {"MaxVectorSize", ">=32", "UseCompactObjectHeaders", "false"},
477         applyIfPlatform = {"64-bit", "true"},
478         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"})
479     // Split the load
480     //
481     //  0 1 2 3 4 5 - -
482     //  | |  \ \ \ \
483     //  | |   \ \ \ \
484     //  | |    \ \ \ \
485     //  0 1 - - 4 5 6 7
486     //
        // Per 8-int stride: a[i+0..5] are ANDed with mask; results 0..1 are stored to
        // b[i+0..1] and results 2..5 are stored to b[i+4..7]. b[i+2] and b[i+3] are not written.
        // Returns { a, b }, the same array objects that were passed in.
487     static Object[] test2a(int[] a, int[] b, int mask) {
488         for (int i = 0; i < RANGE; i+=8) {
489             int b0 = a[i+0] & mask;
490             int b1 = a[i+1] & mask;
491             int b2 = a[i+2] & mask;
492             int b3 = a[i+3] & mask;
493             int b4 = a[i+4] & mask;
494             int b5 = a[i+5] & mask;
495 
496             b[i+0] = b0;
497             b[i+1] = b1;
498 
499             b[i+4] = b2;
500             b[i+5] = b3;
501             b[i+6] = b4;
502             b[i+7] = b5;
503             // With AlignVector, we need 8-byte alignment of vector loads/stores.
504             // UseCompactObjectHeaders=false                 UseCompactObjectHeaders=true
505             // adr = base + 16 + 32*i  ->  always            adr = base + 12 + 32*i  ->  never
506             // -> vectorize                                  -> no vectorization
507         }
508         return new Object[]{ a, b };
509     }
510 
511     @Test
512     @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0",
513                   IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
514                   IRNode.AND_VI,        IRNode.VECTOR_SIZE_2, "> 0",
515                   IRNode.AND_VI,        IRNode.VECTOR_SIZE_4, "> 0",
516                   IRNode.STORE_VECTOR, "> 0"},
517         applyIfAnd = {"MaxVectorSize", ">=32", "AlignVector", "false"},
518         applyIfPlatform = {"64-bit", "true"},
519         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"})
520     @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0",
521                   IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
522                   IRNode.AND_VI,        IRNode.VECTOR_SIZE_2, "> 0",
523                   IRNode.AND_VI,        IRNode.VECTOR_SIZE_4, "> 0",
524                   IRNode.STORE_VECTOR, "> 0"},
525         applyIfAnd = {"MaxVectorSize", ">=32", "UseCompactObjectHeaders", "false"},
526         applyIfPlatform = {"64-bit", "true"},
527         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"})
528     // Split the load
529     //
530     //  0 1 2 3 4 5 - -
531     //  | | | |  \ \
532     //  | | | |   \ \
533     //  | | | |    \ \
534     //  0 1 2 3 - - 6 7
535     //
        // Per 8-int stride: a[i+0..5] are ANDed with mask; results 0..3 are stored to
        // b[i+0..3] and results 4..5 are stored to b[i+6..7]. b[i+4] and b[i+5] are not written.
        // Returns { a, b }, the same array objects that were passed in.
536     static Object[] test2b(int[] a, int[] b, int mask) {
537         for (int i = 0; i < RANGE; i+=8) {
538             int b0 = a[i+0] & mask;
539             int b1 = a[i+1] & mask;
540             int b2 = a[i+2] & mask;
541             int b3 = a[i+3] & mask;
542             int b4 = a[i+4] & mask;
543             int b5 = a[i+5] & mask;
544 
545             b[i+0] = b0;
546             b[i+1] = b1;
547             b[i+2] = b2;
548             b[i+3] = b3;
549 
550             b[i+6] = b4;
551             b[i+7] = b5;
552             // With AlignVector, we need 8-byte alignment of vector loads/stores.
553             // UseCompactObjectHeaders=false                 UseCompactObjectHeaders=true
554             // adr = base + 16 + 32*i  ->  always            adr = base + 12 + 32*i  ->  never
555             // -> vectorize                                  -> no vectorization
556         }
557         return new Object[]{ a, b };
558     }
559 
560     @Test
561     @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0",
562                   IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
563                   IRNode.AND_VI,        IRNode.VECTOR_SIZE_2, "> 0",
564                   IRNode.AND_VI,        IRNode.VECTOR_SIZE_4, "> 0",
565                   IRNode.STORE_VECTOR, "> 0"},
566         applyIfAnd = {"MaxVectorSize", ">=32", "AlignVector", "false"},
567         applyIfPlatform = {"64-bit", "true"},
568         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"})
569     @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0",
570                   IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
571                   IRNode.AND_VI,        IRNode.VECTOR_SIZE_2, "> 0",
572                   IRNode.AND_VI,        IRNode.VECTOR_SIZE_4, "> 0",
573                   IRNode.STORE_VECTOR, "> 0"},
574         applyIfAnd = {"MaxVectorSize", ">=32", "UseCompactObjectHeaders", "false"},
575         applyIfPlatform = {"64-bit", "true"},
576         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"})
577     // Split the load
578     //
579     //  0 1 - - 4 5 6 7
580     //  | |    / / / /
581     //  | |   / / / /
582     //  | |  / / / /
583     //  0 1 2 3 4 5 - -
584     //
        // Per 8-int stride: a[i+0..1] and a[i+4..7] are ANDed with mask and the six results
        // are stored contiguously to b[i+0..5]. a[i+2..3] are not read; b[i+6..7] are not written.
        // Returns { a, b }, the same array objects that were passed in.
585     static Object[] test2c(int[] a, int[] b, int mask) {
586         for (int i = 0; i < RANGE; i+=8) {
587             int b0 = a[i+0] & mask;
588             int b1 = a[i+1] & mask;
589 
590             int b4 = a[i+4] & mask;
591             int b5 = a[i+5] & mask;
592             int b6 = a[i+6] & mask;
593             int b7 = a[i+7] & mask;
594 
595             b[i+0] = b0;
596             b[i+1] = b1;
597             b[i+2] = b4;
598             b[i+3] = b5;
599             b[i+4] = b6;
600             b[i+5] = b7;
601             // With AlignVector, we need 8-byte alignment of vector loads/stores.
602             // UseCompactObjectHeaders=false                 UseCompactObjectHeaders=true
603             // adr = base + 16 + 32*i  ->  always            adr = base + 12 + 32*i  ->  never
604             // -> vectorize                                  -> no vectorization
605         }
606         return new Object[]{ a, b };
607     }
608 
609     @Test
610     @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0",
611                   IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
612                   IRNode.AND_VI,        IRNode.VECTOR_SIZE_2, "> 0",
613                   IRNode.AND_VI,        IRNode.VECTOR_SIZE_4, "> 0",
614                   IRNode.STORE_VECTOR, "> 0"},
615         applyIfAnd = {"MaxVectorSize", ">=32", "AlignVector", "false"},
616         applyIfPlatform = {"64-bit", "true"},
617         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"})
618     @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0",
619                   IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
620                   IRNode.AND_VI,        IRNode.VECTOR_SIZE_2, "> 0",
621                   IRNode.AND_VI,        IRNode.VECTOR_SIZE_4, "> 0",
622                   IRNode.STORE_VECTOR, "> 0"},
623         applyIfAnd = {"MaxVectorSize", ">=32", "UseCompactObjectHeaders", "false"},
624         applyIfPlatform = {"64-bit", "true"},
625         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"})
626     // Split the load
627     //
628     //  0 1 2 3 - - 6 7
629     //  | | | |    / /
630     //  | | | |   / /
631     //  | | | |  / /
632     //  0 1 2 3 4 5 - -
633     //
        // Per 8-int stride: a[i+0..3] and a[i+6..7] are ANDed with mask and the six results
        // are stored contiguously to b[i+0..5]. a[i+4..5] are not read; b[i+6..7] are not written.
        // Returns { a, b }, the same array objects that were passed in.
634     static Object[] test2d(int[] a, int[] b, int mask) {
635         for (int i = 0; i < RANGE; i+=8) {
636             int b0 = a[i+0] & mask;
637             int b1 = a[i+1] & mask;
638             int b2 = a[i+2] & mask;
639             int b3 = a[i+3] & mask;
640 
641             int b6 = a[i+6] & mask;
642             int b7 = a[i+7] & mask;
643 
644             b[i+0] = b0;
645             b[i+1] = b1;
646             b[i+2] = b2;
647             b[i+3] = b3;
648             b[i+4] = b6;
649             b[i+5] = b7;
650             // With AlignVector, we need 8-byte alignment of vector loads/stores.
651             // UseCompactObjectHeaders=false                 UseCompactObjectHeaders=true
652             // adr = base + 16 + 32*i  ->  always            adr = base + 12 + 32*i  ->  never
653             // -> vectorize                                  -> no vectorization
654         }
655         return new Object[]{ a, b };
656     }
657 
658     @Test
659     @IR(counts = {IRNode.LOAD_VECTOR_S, IRNode.VECTOR_SIZE_4, "> 0",
660                   IRNode.STORE_VECTOR, "> 0"},
661         applyIfAnd = {"MaxVectorSize", ">=32", "AlignVector", "false"},
662         applyIfPlatform = {"64-bit", "true"},
663         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"})
664     @IR(counts = {IRNode.LOAD_VECTOR_S, IRNode.VECTOR_SIZE_4, "> 0",
665                   IRNode.STORE_VECTOR, "> 0"},
666         applyIfAnd = {"MaxVectorSize", ">=32", "UseCompactObjectHeaders", "false"},
667         applyIfPlatform = {"64-bit", "true"},
668         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"})
669     // 0 1 2 3 4 5 6 7 -
670     // | | | | | | | |
671     // | + + + | | | |
672     // |       | | | |
673     // |     v | | | | v
674     // |     | | | | | |
675     // 0 - - 3 4 5 6 7 8
        //
        // Per 16-short stride: copies a[i+0] to b[i+0] and a[i+4..7] to b[i+4..7], writes val
        // to b[i+3] and b[i+8], and adds a[i+1] + a[i+2] + a[i+3] to the int accumulator sum.
        // Returns { a, b, { sum } }; a and b are the same array objects that were passed in.
676     static Object[] test3a(short[] a, short[] b, short val) {
677         int sum = 0;
678         for (int i = 0; i < RANGE; i+=16) {
679             short a0 = a[i+0]; // required for alignment / offsets, technical limitation.
680 
681             short a1 = a[i+1]; // adjacent to 4-pack, but need to be split off
682             short a2 = a[i+2];
683             short a3 = a[i+3];
684 
685             short a4 = a[i+4]; // 4-pack
686             short a5 = a[i+5];
687             short a6 = a[i+6];
688             short a7 = a[i+7];
689 
690 
691             b[i+0] = a0; // required for alignment / offsets, technical limitation.
692 
693             sum += a1 + a2 + a3; // not packed
694 
695             b[i+3] = val; // adjacent to 4-pack but needs to be split off
696 
697             b[i+4] = a4; // 4-pack
698             b[i+5] = a5;
699             b[i+6] = a6;
700             b[i+7] = a7;
701 
702             b[i+8] = val; // adjacent to 4-pack but needs to be split off
703 
704             // With AlignVector, we need 8-byte alignment of vector loads/stores.
705             // UseCompactObjectHeaders=false                 UseCompactObjectHeaders=true
706             // adr = base + 16 + 8 + 32*i  ->  always        adr = base + 12 + 8 + 32*i  ->  never
707             // -> vectorize                                  -> no vectorization
708         }
709         return new Object[]{ a, b, new int[]{ sum } };
710     }
711 
712     @Test
713     @IR(counts = {IRNode.LOAD_VECTOR_S, IRNode.VECTOR_SIZE_2, "> 0",
714                   IRNode.STORE_VECTOR, "> 0"},
715         applyIfPlatform = {"64-bit", "true"},
716         applyIfCPUFeatureOr = {"sse4.1", "true"})
717     // Cyclic dependency with distance 2 -> split into 2-packs
        //
        // Copies a[i] to b[i+2] for every i in [0, RANGE-64).
        // Returns { a, b }, the same array objects that were passed in.
718     static Object[] test4a(short[] a, short[] b) {
719         for (int i = 0; i < RANGE-64; i++) {
720           b[i+2] = a[i+0];
721         }
722         return new Object[]{ a, b };
723     }
724 
725     @Test
726     @IR(counts = {IRNode.LOAD_VECTOR_S, IRNode.VECTOR_SIZE_2, "> 0",
727                   IRNode.STORE_VECTOR, "> 0"},

841 
842             b[i+ 8] = (short)(a[i+ 8] + val); // 4-pack
843             b[i+ 9] = (short)(a[i+ 9] + val);
844             b[i+10] = (short)(a[i+10] + val);
845             b[i+11] = (short)(a[i+11] + val);
846 
847             b[i+12] = (short)(a[i+12] + val); // 2-pack
848             b[i+13] = (short)(a[i+13] + val);
849 
850             b[i+14] = (short)(a[i+14] + val);
851         }
852         return new Object[]{ a, b };
853     }
854 
855     @Test
856     @IR(counts = {IRNode.LOAD_VECTOR_I,   IRNode.VECTOR_SIZE_4, "> 0",
857                   IRNode.MUL_VI,          IRNode.VECTOR_SIZE_4, "> 0",
858                   IRNode.AND_VI,          IRNode.VECTOR_SIZE_4, "> 0",
859                   IRNode.ADD_VI,          IRNode.VECTOR_SIZE_4, "> 0", // reduction moved out of loop
860                   IRNode.ADD_REDUCTION_V,                       "> 0"},
861         applyIfAnd = {"MaxVectorSize", ">=32", "AlignVector", "false"},
862         applyIfPlatform = {"64-bit", "true"},
863         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"})
864     @IR(counts = {IRNode.LOAD_VECTOR_I,   IRNode.VECTOR_SIZE_4, "> 0",
865                   IRNode.MUL_VI,          IRNode.VECTOR_SIZE_4, "> 0",
866                   IRNode.AND_VI,          IRNode.VECTOR_SIZE_4, "> 0",
867                   IRNode.ADD_VI,          IRNode.VECTOR_SIZE_4, "> 0", // reduction moved out of loop
868                   IRNode.ADD_REDUCTION_V,                       "> 0"},
869         applyIfAnd = {"MaxVectorSize", ">=32", "UseCompactObjectHeaders", "false"},
870         applyIfPlatform = {"64-bit", "true"},
871         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"})
872     // Split packs including reductions
        //
        // Per 8-int stride: adds a[i+k] * b[i+k] for k = 0..3 and a[i+k] & b[i+k] for k = 4..7
        // to the running int sum s. Neither array is written.
        // Returns { a, b, { s } }; a and b are the same array objects that were passed in.
873     static Object[] test6a(int[] a, int[] b) {
874         int s = 0;
875         for (int i = 0; i < RANGE; i+=8) {
876             s += a[i+0] * b[i+0];
877             s += a[i+1] * b[i+1];
878             s += a[i+2] * b[i+2];
879             s += a[i+3] * b[i+3];
880 
881             s += a[i+4] & b[i+4];
882             s += a[i+5] & b[i+5];
883             s += a[i+6] & b[i+6];
884             s += a[i+7] & b[i+7];
885             // With AlignVector, we need 8-byte alignment of vector loads/stores.
886             // UseCompactObjectHeaders=false                 UseCompactObjectHeaders=true
887             // adr = base + 16 + 32*i  ->  always            adr = base + 12 + 32*i  ->  never
888             // -> vectorize                                  -> no vectorization
889         }
890         return new Object[]{ a, b, new int[]{ s } };
891     }
892 
893     @Test
894     @IR(counts = {IRNode.LOAD_VECTOR_I,  "> 0",
895                   IRNode.MUL_VI,         "> 0",
896                   IRNode.POPULATE_INDEX, "> 0"},
897         applyIfPlatform = {"64-bit", "true"},
898         applyIfCPUFeatureOr = {"avx2", "true", "sve", "true", "rvv", "true"})
899     // Index Populate:
900     // There can be an issue when all the (iv + 1), (iv + 2), ...
901     // get packed, but not (iv). Then we have a pack that is one element
902     // too short, and we start splitting everything in a bad way.
        //
        // Sets a[i] = b[i] * i for every i in [0, RANGE).
        // Returns { a, b }, the same array objects that were passed in.
903     static Object[] test7a(int[] a, int[] b) {
904         for (int i = 0; i < RANGE; i++) {
905             a[i] = b[i] * i;
906         }
907         return new Object[]{ a, b };
908     }

261             }
262         }
263     }
264 
265     static void verifyL(String name, int i, long[] g, long[] r) {
266         for (int j = 0; j < g.length; j++) {
267             if (g[j] != r[j]) {
268                 throw new RuntimeException("verify " + name + ": arrays must have same content:" +
269                                            " gold[" + i + "][" + j + "] = " + g[j] +
270                                            " result[" + i + "][" + j + "] = " + r[j]);
271             }
272         }
273     }
274 
275     @Test
276     @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0",
277                   IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
278                   IRNode.AND_VI,        IRNode.VECTOR_SIZE_2, "> 0",
279                   IRNode.AND_VI,        IRNode.VECTOR_SIZE_4, "> 0",
280                   IRNode.STORE_VECTOR, "> 0"},
281         applyIf = {"MaxVectorSize", ">=32"},








282         applyIfPlatform = {"64-bit", "true"},
283         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"})
284     // Load and store are already split
285     //
286     //  0 1 - - 4 5 6 7
287     //  | |     | | | |
288     //  0 1 - - 4 5 6 7
        //
        // Per 8-int stride: b[i+k] = a[i+k] & mask for k = 0, 1, 4, 5, 6, 7.
        // Indices i+2 and i+3 are neither read nor written.
        // Returns { a, b }, the same array objects that were passed in.
289     static Object[] test0(int[] a, int[] b, int mask) {
290         for (int i = 0; i < RANGE; i+=8) {
291             int b0 = a[i+0] & mask;
292             int b1 = a[i+1] & mask;
293 
294             int b4 = a[i+4] & mask;
295             int b5 = a[i+5] & mask;
296             int b6 = a[i+6] & mask;
297             int b7 = a[i+7] & mask;
298 
299             b[i+0] = b0;
300             b[i+1] = b1;
301 
302             b[i+4] = b4;
303             b[i+5] = b5;
304             b[i+6] = b6;
305             b[i+7] = b7;




306         }
307         return new Object[]{ a, b };
308     }
309 
310     @Test
311     @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0",
312                   IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
313                   IRNode.ADD_VI,        IRNode.VECTOR_SIZE_4, "> 0",
314                   IRNode.MUL_VI,        IRNode.VECTOR_SIZE_2, "> 0",
315                   IRNode.STORE_VECTOR, "> 0"},
316         applyIf = {"MaxVectorSize", ">=32"},








317         applyIfPlatform = {"64-bit", "true"},
318         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"})
319     // Adjacent Load and Store, but split by Add/Mul
        //
        // Per 8-int stride: b[i+0..3] = a[i+0..3] + mask and b[i+4..5] = a[i+4..5] * mask.
        // Indices i+6 and i+7 are neither read nor written.
        // Returns { a, b }, the same array objects that were passed in.
320     static Object[] test1a(int[] a, int[] b, int mask) {
321         for (int i = 0; i < RANGE; i+=8) {
322             b[i+0] = a[i+0] + mask; // Add
323             b[i+1] = a[i+1] + mask;
324             b[i+2] = a[i+2] + mask;
325             b[i+3] = a[i+3] + mask;
326 
327             b[i+4] = a[i+4] * mask; // Mul
328             b[i+5] = a[i+5] * mask;




329         }
330         return new Object[]{ a, b };
331     }
332 
333     @Test
334     @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0",
335                   IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
336                   IRNode.ADD_VI,        IRNode.VECTOR_SIZE_2, "> 0",
337                   IRNode.MUL_VI,        IRNode.VECTOR_SIZE_4, "> 0",
338                   IRNode.STORE_VECTOR, "> 0"},
339         applyIf = {"MaxVectorSize", ">=32"},








340         applyIfPlatform = {"64-bit", "true"},
341         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"})
342     // Adjacent Load and Store, but split by Add/Mul
        //
        // Per 8-int stride: b[i+0..3] = a[i+0..3] * mask and b[i+4..5] = a[i+4..5] + mask.
        // Indices i+6 and i+7 are neither read nor written.
        // Returns { a, b }, the same array objects that were passed in.
343     static Object[] test1b(int[] a, int[] b, int mask) {
344         for (int i = 0; i < RANGE; i+=8) {
345             b[i+0] = a[i+0] * mask; // Mul
346             b[i+1] = a[i+1] * mask;
347             b[i+2] = a[i+2] * mask;
348             b[i+3] = a[i+3] * mask;
349 
350             b[i+4] = a[i+4] + mask; // Add
351             b[i+5] = a[i+5] + mask;




352         }
353         return new Object[]{ a, b };
354     }
355 
356     @Test
357     @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0",
358                   IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
359                   IRNode.ADD_VI,        IRNode.VECTOR_SIZE_2, "> 0",
360                   IRNode.MUL_VI,        IRNode.VECTOR_SIZE_4, "> 0",
361                   IRNode.STORE_VECTOR, "> 0"},
362         applyIf = {"MaxVectorSize", ">=32"},








363         applyIfPlatform = {"64-bit", "true"},
364         applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true", "rvv", "true"})
365     // Adjacent Load and Store, but split by Add/Mul
        //
        // Per 8-int stride: b[i+0..1] = a[i+0..1] + mask and b[i+2..5] = a[i+2..5] * mask.
        // Indices i+6 and i+7 are neither read nor written.
        // Returns { a, b }, the same array objects that were passed in.
366     static Object[] test1c(int[] a, int[] b, int mask) {
367         for (int i = 0; i < RANGE; i+=8) {
368             b[i+0] = a[i+0] + mask; // Add
369             b[i+1] = a[i+1] + mask;
370 
371             b[i+2] = a[i+2] * mask; // Mul
372             b[i+3] = a[i+3] * mask;
373             b[i+4] = a[i+4] * mask;
374             b[i+5] = a[i+5] * mask;




375         }
376         return new Object[]{ a, b };
377     }
378 
379     @Test
380     @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0",
381                   IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
382                   IRNode.ADD_VI,        IRNode.VECTOR_SIZE_4, "> 0",
383                   IRNode.MUL_VI,        IRNode.VECTOR_SIZE_2, "> 0",
384                   IRNode.STORE_VECTOR, "> 0"},
385         applyIf = {"MaxVectorSize", ">=32"},








386         applyIfPlatform = {"64-bit", "true"},
387         applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true", "rvv", "true"})
388     // Adjacent Load and Store, but split by Add/Mul
        //
        // Per 8-int stride: b[i+0..1] = a[i+0..1] * mask and b[i+2..5] = a[i+2..5] + mask.
        // Indices i+6 and i+7 are neither read nor written.
        // Returns { a, b }, the same array objects that were passed in.
389     static Object[] test1d(int[] a, int[] b, int mask) {
390         for (int i = 0; i < RANGE; i+=8) {
391             b[i+0] = a[i+0] * mask; // Mul
392             b[i+1] = a[i+1] * mask;
393 
394             b[i+2] = a[i+2] + mask; // Add
395             b[i+3] = a[i+3] + mask;
396             b[i+4] = a[i+4] + mask;
397             b[i+5] = a[i+5] + mask;




398         }
399         return new Object[]{ a, b };
400     }
401 
402     @Test
403     @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0",
404                   IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
405                   IRNode.AND_VI,        IRNode.VECTOR_SIZE_2, "> 0",
406                   IRNode.AND_VI,        IRNode.VECTOR_SIZE_4, "> 0",
407                   IRNode.STORE_VECTOR, "> 0"},
408         applyIf = {"MaxVectorSize", ">=32"},








409         applyIfPlatform = {"64-bit", "true"},
410         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"})
411     // Split the load
412     //
413     //  0 1 2 3 4 5 - -
414     //  | |  \ \ \ \
415     //  | |   \ \ \ \
416     //  | |    \ \ \ \
417     //  0 1 - - 4 5 6 7
418     //
419     static Object[] test2a(int[] a, int[] b, int mask) {
420         for (int i = 0; i < RANGE; i+=8) {
421             int b0 = a[i+0] & mask;
422             int b1 = a[i+1] & mask;
423             int b2 = a[i+2] & mask;
424             int b3 = a[i+3] & mask;
425             int b4 = a[i+4] & mask;
426             int b5 = a[i+5] & mask;
427 
428             b[i+0] = b0;
429             b[i+1] = b1;
430 
431             b[i+4] = b2;
432             b[i+5] = b3;
433             b[i+6] = b4;
434             b[i+7] = b5;




435         }
436         return new Object[]{ a, b };
437     }
438 
439     @Test
440     @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0",
441                   IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
442                   IRNode.AND_VI,        IRNode.VECTOR_SIZE_2, "> 0",
443                   IRNode.AND_VI,        IRNode.VECTOR_SIZE_4, "> 0",
444                   IRNode.STORE_VECTOR, "> 0"},
445         applyIf = {"MaxVectorSize", ">=32"},








446         applyIfPlatform = {"64-bit", "true"},
447         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"})
448     // Split the load
449     //
450     //  0 1 2 3 4 5 - -
451     //  | | | |  \ \
452     //  | | | |   \ \
453     //  | | | |    \ \
454     //  0 1 2 3 -- 6 7
455     //
456     static Object[] test2b(int[] a, int[] b, int mask) {
457         for (int i = 0; i < RANGE; i+=8) {
458             int b0 = a[i+0] & mask;
459             int b1 = a[i+1] & mask;
460             int b2 = a[i+2] & mask;
461             int b3 = a[i+3] & mask;
462             int b4 = a[i+4] & mask;
463             int b5 = a[i+5] & mask;
464 
465             b[i+0] = b0;
466             b[i+1] = b1;
467             b[i+2] = b2;
468             b[i+3] = b3;
469 
470             b[i+6] = b4;
471             b[i+7] = b5;




472         }
473         return new Object[]{ a, b };
474     }
475 
476     @Test
477     @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0",
478                   IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
479                   IRNode.AND_VI,        IRNode.VECTOR_SIZE_2, "> 0",
480                   IRNode.AND_VI,        IRNode.VECTOR_SIZE_4, "> 0",
481                   IRNode.STORE_VECTOR, "> 0"},
482         applyIf = {"MaxVectorSize", ">=32"},








483         applyIfPlatform = {"64-bit", "true"},
484         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"})
485     // Split the load
486     //
487     //  0 1 - - 4 5 6 7
488     //  | |    / / / /
489     //  | |   / / / /
490     //  | |  / / / /
491     //  0 1 2 3 4 5 - -
492     //
493     static Object[] test2c(int[] a, int[] b, int mask) {
494         for (int i = 0; i < RANGE; i+=8) {
495             int b0 = a[i+0] & mask;
496             int b1 = a[i+1] & mask;
497 
498             int b4 = a[i+4] & mask;
499             int b5 = a[i+5] & mask;
500             int b6 = a[i+6] & mask;
501             int b7 = a[i+7] & mask;
502 
503             b[i+0] = b0;
504             b[i+1] = b1;
505             b[i+2] = b4;
506             b[i+3] = b5;
507             b[i+4] = b6;
508             b[i+5] = b7;




509         }
510         return new Object[]{ a, b };
511     }
512 
513     @Test
514     @IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0",
515                   IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
516                   IRNode.AND_VI,        IRNode.VECTOR_SIZE_2, "> 0",
517                   IRNode.AND_VI,        IRNode.VECTOR_SIZE_4, "> 0",
518                   IRNode.STORE_VECTOR, "> 0"},
519         applyIf = {"MaxVectorSize", ">=32"},








520         applyIfPlatform = {"64-bit", "true"},
521         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"})
522     // Split the load
523     //
524     //  0 1 2 3 - - 6 7
525     //  | | | |    / /
526     //  | | | |   / /
527     //  | | | |  / /
528     //  0 1 2 3 4 5 - -
529     //
530     static Object[] test2d(int[] a, int[] b, int mask) {
531         for (int i = 0; i < RANGE; i+=8) {
532             int b0 = a[i+0] & mask;
533             int b1 = a[i+1] & mask;
534             int b2 = a[i+2] & mask;
535             int b3 = a[i+3] & mask;
536 
537             int b6 = a[i+6] & mask;
538             int b7 = a[i+7] & mask;
539 
540             b[i+0] = b0;
541             b[i+1] = b1;
542             b[i+2] = b2;
543             b[i+3] = b3;
544             b[i+4] = b6;
545             b[i+5] = b7;




546         }
547         return new Object[]{ a, b };
548     }
549 
550     @Test
551     @IR(counts = {IRNode.LOAD_VECTOR_S, IRNode.VECTOR_SIZE_4, "> 0",
552                   IRNode.STORE_VECTOR, "> 0"},
553         applyIf = {"MaxVectorSize", ">=32"},





554         applyIfPlatform = {"64-bit", "true"},
555         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"})
556     // 0 1 2 3 4 5 6 7 -
557     // | | | | | | | |
558     // | + + + | | | |
559     // |       | | | |
560     // |     v | | | | v
561     // |     | | | | | |
562     // 1 - - 3 4 5 6 7 8
563     static Object[] test3a(short[] a, short[] b, short val) {
564         int sum = 0;
565         for (int i = 0; i < RANGE; i+=16) {
566             short a0 = a[i+0]; // required for alignment / offsets, technical limitation.
567 
568             short a1 = a[i+1]; // adjacent to 4-pack, but need to be split off
569             short a2 = a[i+2];
570             short a3 = a[i+3];
571 
572             short a4 = a[i+4]; // 4-pack
573             short a5 = a[i+5];
574             short a6 = a[i+6];
575             short a7 = a[i+7];
576 
577 
578             b[i+0] = a0; // required for alignment / offsets, technical limitation.
579 
580             sum += a1 + a2 + a3; // not packed
581 
582             b[i+3] = val; // adjacent to 4-pack but needs to be split off
583 
584             b[i+4] = a4; // 4-pack
585             b[i+5] = a5;
586             b[i+6] = a6;
587             b[i+7] = a7;
588 
589             b[i+8] = val; // adjacent to 4-pack but needs to be split off





590         }
591         return new Object[]{ a, b, new int[]{ sum } };
592     }
593 
594     @Test
595     @IR(counts = {IRNode.LOAD_VECTOR_S, IRNode.VECTOR_SIZE_2, "> 0",
596                   IRNode.STORE_VECTOR, "> 0"},
597         applyIfPlatform = {"64-bit", "true"},
598         applyIfCPUFeatureOr = {"sse4.1", "true"})
599     // Cyclic dependency with distance 2 -> split into 2-packs
        //
        // Every iteration copies a[i] to b[i+2]. The IR rule expects short loads to be
        // vectorized with exactly 2 elements, plus at least one vector store.
        // NOTE(review): the dependency is only cyclic if a and b can refer to the same
        // array -- presumably the callers pass aliased arrays; confirm against the runner.
600     static Object[] test4a(short[] a, short[] b) {
        // The upper bound RANGE-64 leaves headroom for the b[i+2] store
        // (assumes the arrays have length RANGE -- TODO confirm).
601         for (int i = 0; i < RANGE-64; i++) {
602           b[i+2] = a[i+0];
603         }
        // Both arrays are returned as the result of the test method.
604         return new Object[]{ a, b };
605     }
606 
607     @Test
608     @IR(counts = {IRNode.LOAD_VECTOR_S, IRNode.VECTOR_SIZE_2, "> 0",
609                   IRNode.STORE_VECTOR, "> 0"},

723 
724             b[i+ 8] = (short)(a[i+ 8] + val); // 4-pack
725             b[i+ 9] = (short)(a[i+ 9] + val);
726             b[i+10] = (short)(a[i+10] + val);
727             b[i+11] = (short)(a[i+11] + val);
728 
729             b[i+12] = (short)(a[i+12] + val); // 2-pack
730             b[i+13] = (short)(a[i+13] + val);
731 
732             b[i+14] = (short)(a[i+14] + val);
733         }
734         return new Object[]{ a, b };
735     }
736 
737     @Test
738     @IR(counts = {IRNode.LOAD_VECTOR_I,   IRNode.VECTOR_SIZE_4, "> 0",
739                   IRNode.MUL_VI,          IRNode.VECTOR_SIZE_4, "> 0",
740                   IRNode.AND_VI,          IRNode.VECTOR_SIZE_4, "> 0",
741                   IRNode.ADD_VI,          IRNode.VECTOR_SIZE_4, "> 0", // reduction moved out of loop
742                   IRNode.ADD_REDUCTION_V,                       "> 0"},
743         applyIf = {"MaxVectorSize", ">=32"},








744         applyIfPlatform = {"64-bit", "true"},
745         applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true", "rvv", "true"})
746     // Split packs including reductions
747     static Object[] test6a(int[] a, int[] b) {
748         int s = 0;
749         for (int i = 0; i < RANGE; i+=8) {
750             s += a[i+0] * b[i+0];
751             s += a[i+1] * b[i+1];
752             s += a[i+2] * b[i+2];
753             s += a[i+3] * b[i+3];
754 
755             s += a[i+4] & b[i+4];
756             s += a[i+5] & b[i+5];
757             s += a[i+6] & b[i+6];
758             s += a[i+7] & b[i+7];




759         }
760         return new Object[]{ a, b, new int[]{ s } };
761     }
762 
763     @Test
764     @IR(counts = {IRNode.LOAD_VECTOR_I,  "> 0",
765                   IRNode.MUL_VI,         "> 0",
766                   IRNode.POPULATE_INDEX, "> 0"},
767         applyIfPlatform = {"64-bit", "true"},
768         applyIfCPUFeatureOr = {"avx2", "true", "sve", "true", "rvv", "true"})
769     // Index Populate:
770     // There can be an issue when all the (iv + 1), (iv + 2), ...
771     // get packed, but not (iv). Then we have a pack that is one element
772     // too short, and we start splitting everything in a bad way.
        //
        // The loop multiplies each b[i] by the loop index itself, and the IR rule
        // requires a vector load, a vector multiply and a POPULATE_INDEX node,
        // without constraining the vector size.
773     static Object[] test7a(int[] a, int[] b) {
774         for (int i = 0; i < RANGE; i++) {
            // The induction variable i is used as a data operand here, not only as an index.
775             a[i] = b[i] * i;
776         }
        // Both arrays are returned as the result of the test method.
777         return new Object[]{ a, b };
778     }

< prev index next >