< prev index next >

src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.cpp

Print this page

1053     }
1054   } else if (src_size == S) {
1055     sve_sunpklo(dst, D, src);
1056   }
1057 }
1058 
1059 // Vector narrow from src to dst with specified element sizes.
1060 // High part of dst vector will be filled with zero.
1061 void C2_MacroAssembler::sve_vector_narrow(FloatRegister dst, SIMD_RegVariant dst_size,
1062                                           FloatRegister src, SIMD_RegVariant src_size,
1063                                           FloatRegister tmp) {
1064   assert(dst_size < src_size && dst_size <= S && src_size <= D, "invalid element size");
1065   assert_different_registers(src, tmp);
1066   sve_dup(tmp, src_size, 0);
1067   if (src_size == D) {
1068     switch (dst_size) {
1069     case S:
1070       sve_uzp1(dst, S, src, tmp);
1071       break;
1072     case H:

1073       sve_uzp1(dst, S, src, tmp);
1074       sve_uzp1(dst, H, dst, tmp);
1075       break;
1076     case B:

1077       sve_uzp1(dst, S, src, tmp);
1078       sve_uzp1(dst, H, dst, tmp);
1079       sve_uzp1(dst, B, dst, tmp);
1080       break;
1081     default:
1082       ShouldNotReachHere();
1083     }
1084   } else if (src_size == S) {
1085     if (dst_size == H) {
1086       sve_uzp1(dst, H, src, tmp);
1087     } else { // B

1088       sve_uzp1(dst, H, src, tmp);
1089       sve_uzp1(dst, B, dst, tmp);
1090     }
1091   } else if (src_size == H) {
1092     sve_uzp1(dst, B, src, tmp);
1093   }
1094 }
1095 
1096 // Extend src predicate to dst predicate with the same lane count but larger
1097 // element size, e.g. 64Byte -> 512Long
1098 void C2_MacroAssembler::sve_vmaskcast_extend(PRegister dst, PRegister src,
1099                                              uint dst_element_length_in_bytes,
1100                                              uint src_element_length_in_bytes) {
1101   if (dst_element_length_in_bytes == 2 * src_element_length_in_bytes) {
1102     sve_punpklo(dst, src);
1103   } else if (dst_element_length_in_bytes == 4 * src_element_length_in_bytes) {
1104     sve_punpklo(dst, src);
1105     sve_punpklo(dst, dst);
1106   } else if (dst_element_length_in_bytes == 8 * src_element_length_in_bytes) {
1107     sve_punpklo(dst, src);

1251       sve_ptrue(dst, size, /* VL16 */ 0b01001);
1252       break;
1253     case 32:
1254       sve_ptrue(dst, size, /* VL32 */ 0b01010);
1255       break;
1256     case 64:
1257       sve_ptrue(dst, size, /* VL64 */ 0b01011);
1258       break;
1259     case 128:
1260       sve_ptrue(dst, size, /* VL128 */ 0b01100);
1261       break;
1262     case 256:
1263       sve_ptrue(dst, size, /* VL256 */ 0b01101);
1264       break;
1265     default:
1266       assert(false, "unsupported");
1267       ShouldNotReachHere();
1268   }
1269 }
1270 




















































































































































1271 // Extract a scalar element from an sve vector at position 'idx'.
1272 // The input elements in src are expected to be of integral type.
1273 void C2_MacroAssembler::sve_extract_integral(Register dst, SIMD_RegVariant size, FloatRegister src, int idx,
1274                                              bool is_signed, FloatRegister vtmp) {
1275   assert(UseSVE > 0 && size != Q, "unsupported");
1276   assert(!(is_signed && size == D), "signed extract (D) not supported.");
1277   if (regVariant_to_elemBits(size) * idx < 128) { // generate lower cost NEON instruction
1278     is_signed ? smov(dst, src, size, idx) : umov(dst, src, size, idx);
1279   } else {
1280     sve_orr(vtmp, src, src);
1281     sve_ext(vtmp, vtmp, idx << size);
1282     is_signed ? smov(dst, vtmp, size, 0) : umov(dst, vtmp, size, 0);
1283   }
1284 }
1285 
1286 // java.lang.Math::round intrinsics
1287 
1288 void C2_MacroAssembler::vector_round_neon(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
1289                                        FloatRegister tmp2, FloatRegister tmp3, SIMD_Arrangement T) {
1290   assert_different_registers(tmp1, tmp2, tmp3, src, dst);

1053     }
1054   } else if (src_size == S) {
1055     sve_sunpklo(dst, D, src);
1056   }
1057 }
1058 
// Vector narrow from src to dst with specified element sizes.
// High part of dst vector will be filled with zero.
void C2_MacroAssembler::sve_vector_narrow(FloatRegister dst, SIMD_RegVariant dst_size,
                                          FloatRegister src, SIMD_RegVariant src_size,
                                          FloatRegister tmp) {
  assert(dst_size < src_size && dst_size <= S && src_size <= D, "invalid element size");
  assert_different_registers(src, tmp);
  // tmp holds an all-zero vector; pairing it with src in sve_uzp1 halves the
  // element size while filling the upper half of dst with zero.
  sve_dup(tmp, src_size, 0);
  if (src_size == D) {
    switch (dst_size) {
    case S:
      sve_uzp1(dst, S, src, tmp);
      break;
    case H:
      // dst is reused as a source in the second step, so it must not alias
      // the zero vector in tmp.
      assert_different_registers(dst, tmp);
      sve_uzp1(dst, S, src, tmp);
      sve_uzp1(dst, H, dst, tmp);
      break;
    case B:
      // Three-step narrow D -> S -> H -> B; same aliasing constraint.
      assert_different_registers(dst, tmp);
      sve_uzp1(dst, S, src, tmp);
      sve_uzp1(dst, H, dst, tmp);
      sve_uzp1(dst, B, dst, tmp);
      break;
    default:
      ShouldNotReachHere();
    }
  } else if (src_size == S) {
    if (dst_size == H) {
      sve_uzp1(dst, H, src, tmp);
    } else { // B
      // Two-step narrow S -> H -> B; dst feeds the second step.
      assert_different_registers(dst, tmp);
      sve_uzp1(dst, H, src, tmp);
      sve_uzp1(dst, B, dst, tmp);
    }
  } else if (src_size == H) {
    sve_uzp1(dst, B, src, tmp);
  }
}
1098 
1099 // Extend src predicate to dst predicate with the same lane count but larger
1100 // element size, e.g. 64Byte -> 512Long
1101 void C2_MacroAssembler::sve_vmaskcast_extend(PRegister dst, PRegister src,
1102                                              uint dst_element_length_in_bytes,
1103                                              uint src_element_length_in_bytes) {
1104   if (dst_element_length_in_bytes == 2 * src_element_length_in_bytes) {
1105     sve_punpklo(dst, src);
1106   } else if (dst_element_length_in_bytes == 4 * src_element_length_in_bytes) {
1107     sve_punpklo(dst, src);
1108     sve_punpklo(dst, dst);
1109   } else if (dst_element_length_in_bytes == 8 * src_element_length_in_bytes) {
1110     sve_punpklo(dst, src);

1254       sve_ptrue(dst, size, /* VL16 */ 0b01001);
1255       break;
1256     case 32:
1257       sve_ptrue(dst, size, /* VL32 */ 0b01010);
1258       break;
1259     case 64:
1260       sve_ptrue(dst, size, /* VL64 */ 0b01011);
1261       break;
1262     case 128:
1263       sve_ptrue(dst, size, /* VL128 */ 0b01100);
1264       break;
1265     case 256:
1266       sve_ptrue(dst, size, /* VL256 */ 0b01101);
1267       break;
1268     default:
1269       assert(false, "unsupported");
1270       ShouldNotReachHere();
1271   }
1272 }
1273 
// Pack active elements of src, under the control of mask, into the lowest-numbered elements of dst.
// Any remaining elements of dst will be filled with zero.
// Clobbers: rscratch1
// Preserves: src, mask
void C2_MacroAssembler::sve_compress_short(FloatRegister dst, FloatRegister src, PRegister mask,
                                           FloatRegister vtmp1, FloatRegister vtmp2,
                                           PRegister pgtmp) {
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
  assert_different_registers(dst, src, vtmp1, vtmp2);
  assert_different_registers(mask, pgtmp);

  // Example input:   src   = 8888 7777 6666 5555 4444 3333 2222 1111
  //                  mask  = 0001 0000 0000 0001 0001 0000 0001 0001
  // Expected result: dst   = 0000 0000 0000 8888 5555 4444 2222 1111
  //
  // Strategy: widen each half of src to INT, compact it at INT size,
  // narrow back to SHORT, then stitch the two compacted halves together.
  // vtmp2 = all-zero vector used as the second uzp1 operand.
  sve_dup(vtmp2, H, 0);

  // Extend lowest half to type INT.
  // dst = 00004444 00003333 00002222 00001111
  sve_uunpklo(dst, S, src);
  // Widen the matching low half of the mask too.
  // pgtmp = 00000001 00000000 00000001 00000001
  sve_punpklo(pgtmp, mask);
  // Pack the active elements in size of type INT to the right,
  // and fill the remaining elements with zero.
  // dst = 00000000 00004444 00002222 00001111
  sve_compact(dst, S, dst, pgtmp);
  // Narrow the result back to type SHORT.
  // dst = 0000 0000 0000 0000 0000 4444 2222 1111
  sve_uzp1(dst, H, dst, vtmp2);
  // Count the active elements of lowest half.
  // rscratch1 = 3
  sve_cntp(rscratch1, S, ptrue, pgtmp);

  // Repeat for the highest half.
  // pgtmp = 00000001 00000000 00000000 00000001
  sve_punpkhi(pgtmp, mask);
  // vtmp1 = 00008888 00007777 00006666 00005555
  sve_uunpkhi(vtmp1, S, src);
  // vtmp1 = 00000000 00000000 00008888 00005555
  sve_compact(vtmp1, S, vtmp1, pgtmp);
  // vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555
  sve_uzp1(vtmp1, H, vtmp1, vtmp2);

  // Compressed low:   dst   = 0000 0000 0000 0000 0000 4444 2222 1111
  // Compressed high:  vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555
  // Left shift (cross lane) the compressed high by TRUE_CNT lanes, where
  // TRUE_CNT is the number of active elements in the compressed low.
  neg(rscratch1, rscratch1);
  // Build a table of lane indices starting at -TRUE_CNT; the low TRUE_CNT
  // lanes get negative (out-of-range) indices and thus read as zero.
  // vtmp2 = {4 3 2 1 0 -1 -2 -3}
  sve_index(vtmp2, H, rscratch1, 1);
  // vtmp1 = 0000 0000 0000 8888 5555 0000 0000 0000
  sve_tbl(vtmp1, H, vtmp1, vtmp2);

  // Combine the compressed high (after the shift) with the compressed low.
  // dst = 0000 0000 0000 8888 5555 4444 2222 1111
  sve_orr(dst, dst, vtmp1);
}
1330 
// Pack active BYTE elements of src, under the control of mask, into the
// lowest-numbered elements of dst; remaining elements of dst become zero.
// Clobbers: rscratch1, rscratch2
// Preserves: src, mask
void C2_MacroAssembler::sve_compress_byte(FloatRegister dst, FloatRegister src, PRegister mask,
                                          FloatRegister vtmp1, FloatRegister vtmp2,
                                          FloatRegister vtmp3, FloatRegister vtmp4,
                                          PRegister ptmp, PRegister pgtmp) {
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
  assert_different_registers(dst, src, vtmp1, vtmp2, vtmp3, vtmp4);
  assert_different_registers(mask, ptmp, pgtmp);
  // Example input:   src   = 88 77 66 55 44 33 22 11
  //                  mask  = 01 00 00 01 01 00 01 01
  // Expected result: dst   = 00 00 00 88 55 44 22 11

  // vtmp4 = all-zero vector for the uzp1 narrows below.
  sve_dup(vtmp4, B, 0);
  // Extend lowest half to type SHORT.
  // vtmp1 = 0044 0033 0022 0011
  sve_uunpklo(vtmp1, H, src);
  // ptmp = 0001 0000 0001 0001
  sve_punpklo(ptmp, mask);
  // Count the active elements of lowest half.
  // rscratch2 = 3
  sve_cntp(rscratch2, H, ptrue, ptmp);
  // Pack the active elements in size of type SHORT to the right,
  // and fill the remaining elements with zero.
  // dst = 0000 0044 0022 0011
  sve_compress_short(dst, vtmp1, ptmp, vtmp2, vtmp3, pgtmp);
  // Narrow the result back to type BYTE.
  // dst = 00 00 00 00 00 44 22 11
  sve_uzp1(dst, B, dst, vtmp4);

  // Repeat for the highest half.
  // ptmp = 0001 0000 0000 0001
  sve_punpkhi(ptmp, mask);
  // vtmp2 = 0088 0077 0066 0055
  sve_uunpkhi(vtmp2, H, src);
  // vtmp1 = 0000 0000 0088 0055
  sve_compress_short(vtmp1, vtmp2, ptmp, vtmp3, vtmp4, pgtmp);

  // vtmp4 was passed to sve_compress_short as a temp above, so re-create
  // the zero vector for the final narrow.
  sve_dup(vtmp4, B, 0);
  // vtmp1 = 00 00 00 00 00 00 88 55
  sve_uzp1(vtmp1, B, vtmp1, vtmp4);

  // Compressed low:   dst   = 00 00 00 00 00 44 22 11
  // Compressed high:  vtmp1 = 00 00 00 00 00 00 88 55
  // Left shift (cross lane) the compressed high by TRUE_CNT lanes, where
  // TRUE_CNT is the number of active elements in the compressed low.
  neg(rscratch2, rscratch2);
  // Table of lane indices starting at -TRUE_CNT; negative (out-of-range)
  // indices read as zero, producing the shift.
  // vtmp2 = {4 3 2 1 0 -1 -2 -3}
  sve_index(vtmp2, B, rscratch2, 1);
  // vtmp1 = 00 00 00 88 55 00 00 00
  sve_tbl(vtmp1, B, vtmp1, vtmp2);
  // Combine the compressed high (after the shift) with the compressed low.
  // dst = 00 00 00 88 55 44 22 11
  sve_orr(dst, dst, vtmp1);
}
1386 
1387 void C2_MacroAssembler::neon_reverse_bits(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
1388   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
1389   SIMD_Arrangement size = isQ ? T16B : T8B;
1390   if (bt == T_BYTE) {
1391     rbit(dst, size, src);
1392   } else {
1393     neon_reverse_bytes(dst, src, bt, isQ);
1394     rbit(dst, size, dst);
1395   }
1396 }
1397 
1398 void C2_MacroAssembler::neon_reverse_bytes(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
1399   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
1400   SIMD_Arrangement size = isQ ? T16B : T8B;
1401   switch (bt) {
1402     case T_BYTE:
1403       if (dst != src) {
1404         orr(dst, size, src, src);
1405       }
1406       break;
1407     case T_SHORT:
1408       rev16(dst, size, src);
1409       break;
1410     case T_INT:
1411       rev32(dst, size, src);
1412       break;
1413     case T_LONG:
1414       rev64(dst, size, src);
1415       break;
1416     default:
1417       assert(false, "unsupported");
1418       ShouldNotReachHere();
1419   }
1420 }
1421 
// Extract a scalar element from an sve vector at position 'idx'.
// The input elements in src are expected to be of integral type.
void C2_MacroAssembler::sve_extract_integral(Register dst, SIMD_RegVariant size, FloatRegister src, int idx,
                                             bool is_signed, FloatRegister vtmp) {
  assert(UseSVE > 0 && size != Q, "unsupported");
  assert(!(is_signed && size == D), "signed extract (D) not supported.");
  if (regVariant_to_elemBits(size) * idx < 128) { // generate lower cost NEON instruction
    // The element lies in the lower 128 bits, reachable by a NEON move.
    is_signed ? smov(dst, src, size, idx) : umov(dst, src, size, idx);
  } else {
    // Copy src aside, shift the wanted element down to lane 0
    // (idx << size is the element's byte offset), then move lane 0 out.
    sve_orr(vtmp, src, src);
    sve_ext(vtmp, vtmp, idx << size);
    is_signed ? smov(dst, vtmp, size, 0) : umov(dst, vtmp, size, 0);
  }
}
1436 
1437 // java.lang.Math::round intrinsics
1438 
1439 void C2_MacroAssembler::vector_round_neon(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
1440                                        FloatRegister tmp2, FloatRegister tmp3, SIMD_Arrangement T) {
1441   assert_different_registers(tmp1, tmp2, tmp3, src, dst);
< prev index next >