src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp

4401       vmovmskps(tmp, mask, vec_enc);
4402       need_clip = masklen < 4;
4403       break;
4404     case T_LONG:
4405     case T_DOUBLE:
4406       vmovmskpd(tmp, mask, vec_enc);
4407       need_clip = masklen < 2;
4408       break;
4409     default: assert(false, "Unhandled type, %s", type2name(bt));
4410   }
4411 
4412   // A mask generated by partial vector comparison/replicate/mask manipulation
4413   // operations needs to be clipped.
4414   if (need_clip && opc != Op_VectorMaskFirstTrue) {
4415     // need_clip implies masklen < 32
4416     andq(tmp, (1 << masklen) - 1);
4417   }
4418 
4419   vector_mask_operation_helper(opc, dst, tmp, masklen);
4420 }
4421 #endif
4422 
4423 void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) {
4424   if (VM_Version::supports_avx512bw()) {
4425     if (mask_len > 32) {
4426       kmovql(dst, src);
4427     } else {
4428       kmovdl(dst, src);
4429       if (mask_len != 32) {
4430         kshiftrdl(dst, dst, 32 - mask_len);
4431       }
4432     }
4433   } else {
4434     assert(mask_len <= 16, "");
4435     kmovwl(dst, src);
4436     if (mask_len != 16) {
4437       kshiftrwl(dst, dst, 16 - mask_len);
4438     }
4439   }
4440 }
4441 
4442 
4443 //
4444 // The following is a lookup table based popcount computation algorithm:
4445 //       Index   Bit set count
4446 //     [ 0000 ->   0,
4447 //       0001 ->   1,
4448 //       0010 ->   1,
4449 //       0011 ->   2,
4450 //       0100 ->   1,
4451 //       0101 ->   2,
4452 //       0110 ->   2,
4453 //       0111 ->   3,
4454 //       1000 ->   1,
4455 //       1001 ->   2,
4456 //       1010 ->   2,
4457 //       1011 ->   3,
4458 //       1100 ->   2,
4459 //       1101 ->   3,
4460 //       1110 ->   3,  1111 ->   4 ]
4461 //  a. Count the number of 1s in 4 LSB bits of each byte. These bits are used as
4462 //     shuffle indices for lookup table access.
4463 //  b. Right shift each byte of vector lane by 4 positions.
4464 //  c. Count the number of 1s in the 4 MSB bits of each byte. These bits are used as
4465 //     shuffle indices for lookup table access.
4466 //  d. Add the bitset count of upper and lower 4 bits of each byte.
4467 //  e. Unpack double words to quad words and compute sum of absolute difference of bitset
4468 //     count of all the bytes of a quadword.
4469 //  f. Perform step e. for upper 128bit vector lane.
4470 //  g. Pack the bitset count of quadwords back to double word.
4471 //  h. Unpacking and packing operations are not needed for 64bit vector lane.
4472 void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4473                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp,
4474                                             int vec_enc) {
4475   if (VM_Version::supports_avx512_vpopcntdq()) {
4476     vpopcntd(dst, src, vec_enc);
4477   } else {
4478     assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), "");
4479     movl(rtmp, 0x0F0F0F0F);
4480     movdl(xtmp1, rtmp);
4481     vpbroadcastd(xtmp1, xtmp1, vec_enc);
4482     if (Assembler::AVX_512bit == vec_enc) {
4483       evmovdqul(xtmp2, k0, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), false, vec_enc, rtmp);
4484     } else {
4485       vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), rtmp);
4486     }
4487     vpand(xtmp3, src, xtmp1, vec_enc);
4488     vpshufb(xtmp3, xtmp2, xtmp3, vec_enc);
4489     vpsrlw(dst, src, 4, vec_enc);
4490     vpand(dst, dst, xtmp1, vec_enc);
4491     vpshufb(dst, xtmp2, dst, vec_enc);
4492     vpaddb(xtmp3, dst, xtmp3, vec_enc);
4493     vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
4494     vpunpckhdq(dst, xtmp3, xtmp1, vec_enc);
4495     vpsadbw(dst, dst, xtmp1, vec_enc);
4496     vpunpckldq(xtmp2, xtmp3, xtmp1, vec_enc);
4497     vpsadbw(xtmp2, xtmp2, xtmp1, vec_enc);
4498     vpackuswb(dst, xtmp2, dst, vec_enc);
4499   }
4500 }
4501 
4502 void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4503                                              XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp,
4504                                              int vec_enc) {
4505   if (VM_Version::supports_avx512_vpopcntdq()) {
4506     vpopcntq(dst, src, vec_enc);
4507   } else if (vec_enc == Assembler::AVX_512bit) {
4508     assert(VM_Version::supports_avx512bw(), "");
4509     movl(rtmp, 0x0F0F0F0F);
4510     movdl(xtmp1, rtmp);
4511     vpbroadcastd(xtmp1, xtmp1, vec_enc);
4512     evmovdqul(xtmp2, k0, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), true, vec_enc, rtmp);
4513     vpandq(xtmp3, src, xtmp1, vec_enc);
4514     vpshufb(xtmp3, xtmp2, xtmp3, vec_enc);
4515     vpsrlw(dst, src, 4, vec_enc);
4516     vpandq(dst, dst, xtmp1, vec_enc);
4517     vpshufb(dst, xtmp2, dst, vec_enc);
4518     vpaddb(xtmp3, dst, xtmp3, vec_enc);
4519     vpxorq(xtmp1, xtmp1, xtmp1, vec_enc);
4520     vpsadbw(dst, xtmp3, xtmp1, vec_enc);
4521   } else {
4522     // We do not see any performance benefit from running the
4523     // above instruction sequence on a 256-bit vector, which
4524     // holds at most 4 long elements.
4525     ShouldNotReachHere();
4526   }
4527   evpmovqd(dst, dst, vec_enc);
4528 }
4529 
4530 #ifndef _LP64
4531 void C2_MacroAssembler::vector_maskall_operation32(KRegister dst, Register src, KRegister tmp, int mask_len) {
4532   assert(VM_Version::supports_avx512bw(), "");
4533   kmovdl(tmp, src);
4534   kunpckdql(dst, tmp, tmp);
4535 }
4536 #endif
4537 
4538 void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) {
4539   Label done;
4540   Label neg_divisor_fastpath;
4541   cmpl(divisor, 0);
4542   jccb(Assembler::less, neg_divisor_fastpath);
4543   xorl(rdx, rdx);
4544   divl(divisor);
4545   jmpb(done);
4546   bind(neg_divisor_fastpath);
4547   // Fastpath for divisor < 0:
4548   // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
4549   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
4550   movl(rdx, rax);
4551   subl(rdx, divisor);
4552   if (VM_Version::supports_bmi1()) {
4553     andnl(rax, rdx, rax);
4554   } else {
4555     notl(rdx);
4556     andl(rax, rdx);
4557   }

4680   // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
4681   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
4682   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
4683   // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
4684   movq(rdx, rax);
4685   subq(rax, divisor);
4686   if (VM_Version::supports_bmi1()) {
4687     andnq(rax, rax, rdx);
4688   } else {
4689     notq(rax);
4690     andq(rax, rdx);
4691   }
4692   movq(tmp, rax);
4693   shrq(rax, 63); // quotient
4694   sarq(tmp, 63);
4695   andq(tmp, divisor);
4696   subq(rdx, tmp); // remainder
4697   bind(done);
4698 }
4699 #endif
4700 

4401       vmovmskps(tmp, mask, vec_enc);
4402       need_clip = masklen < 4;
4403       break;
4404     case T_LONG:
4405     case T_DOUBLE:
4406       vmovmskpd(tmp, mask, vec_enc);
4407       need_clip = masklen < 2;
4408       break;
4409     default: assert(false, "Unhandled type, %s", type2name(bt));
4410   }
4411 
4412   // A mask generated by partial vector comparison/replicate/mask manipulation
4413   // operations needs to be clipped.
4414   if (need_clip && opc != Op_VectorMaskFirstTrue) {
4415     // need_clip implies masklen < 32
4416     andq(tmp, (1 << masklen) - 1);
4417   }
4418 
4419   vector_mask_operation_helper(opc, dst, tmp, masklen);
4420 }
4421 
4422 void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1,
4423                                              Register rtmp2, int mask_len) {
4424   kmov(rtmp1, src);
4425   andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len)));
4426   mov64(rtmp2, -1L);
4427   pext(rtmp2, rtmp2, rtmp1);
4428   kmov(dst, rtmp2);
4429 }
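The pext-based trick above can be modelled in scalar code: extracting the bits of an all-ones value through the active mask bits yields a mask whose low popcount(active) bits are set, which is exactly the lane mask after a vector compress. The sketch below is an editor's illustration under that reading, not part of the patch; scalar_pext and compress_mask are made-up helper names.

#include <cstdint>

// Software model of the PEXT instruction: gather the bits of 'src' selected by
// 'mask' into the low-order bits of the result.
static uint64_t scalar_pext(uint64_t src, uint64_t mask) {
  uint64_t res = 0;
  for (uint64_t out_bit = 1; mask != 0; out_bit <<= 1) {
    uint64_t lowest = mask & (~mask + 1);   // lowest set bit of the mask
    if (src & lowest) {
      res |= out_bit;
    }
    mask ^= lowest;                         // clear that mask bit
  }
  return res;
}

// Model of vector_mask_compress for 1 <= mask_len <= 64: the result has its low
// popcount(active) bits set.
static uint64_t compress_mask(uint64_t kmask, int mask_len) {
  uint64_t active = kmask & (~0ULL >> (64 - mask_len));
  return scalar_pext(~0ULL, active);
}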
4430 
4431 void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask,
4432                                                bool merge, BasicType bt, int vec_enc) {
4433   if (opcode == Op_CompressV) {
4434     switch(bt) {
4435     case T_BYTE:
4436       evpcompressb(dst, mask, src, merge, vec_enc);
4437       break;
4438     case T_CHAR:
4439     case T_SHORT:
4440       evpcompressw(dst, mask, src, merge, vec_enc);
4441       break;
4442     case T_INT:
4443       evpcompressd(dst, mask, src, merge, vec_enc);
4444       break;
4445     case T_FLOAT:
4446       evcompressps(dst, mask, src, merge, vec_enc);
4447       break;
4448     case T_LONG:
4449       evpcompressq(dst, mask, src, merge, vec_enc);
4450       break;
4451     case T_DOUBLE:
4452       evcompresspd(dst, mask, src, merge, vec_enc);
4453       break;
4454     default:
4455       fatal("Unsupported type");
4456       break;
4457     }
4458   } else {
4459     assert(opcode == Op_ExpandV, "");
4460     switch(bt) {
4461     case T_BYTE:
4462       evpexpandb(dst, mask, src, merge, vec_enc);
4463       break;
4464     case T_CHAR:
4465     case T_SHORT:
4466       evpexpandw(dst, mask, src, merge, vec_enc);
4467       break;
4468     case T_INT:
4469       evpexpandd(dst, mask, src, merge, vec_enc);
4470       break;
4471     case T_FLOAT:
4472       evexpandps(dst, mask, src, merge, vec_enc);
4473       break;
4474     case T_LONG:
4475       evpexpandq(dst, mask, src, merge, vec_enc);
4476       break;
4477     case T_DOUBLE:
4478       evexpandpd(dst, mask, src, merge, vec_enc);
4479       break;
4480     default:
4481       fatal("Unsupported type");
4482       break;
4483     }
4484   }
4485 }
4486 #endif
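For reference, the lane-level semantics of the CompressV/ExpandV operations dispatched above can be written down in a few lines of scalar code. This is an editor's sketch of the non-merging (zeroing) behaviour for int lanes, not part of the patch; the function names are illustrative.

#include <cstdint>
#include <vector>

// Compress: lanes selected by 'mask' are packed towards index 0; the remaining
// lanes are zeroed (the merge == false behaviour with a zeroed destination).
static std::vector<int32_t> compress(const std::vector<int32_t>& src, uint64_t mask) {
  std::vector<int32_t> dst(src.size(), 0);
  size_t j = 0;
  for (size_t i = 0; i < src.size(); i++) {
    if ((mask >> i) & 1) dst[j++] = src[i];
  }
  return dst;
}

// Expand: the inverse placement, consecutive source lanes are scattered to the
// positions selected by 'mask'.
static std::vector<int32_t> expand(const std::vector<int32_t>& src, uint64_t mask) {
  std::vector<int32_t> dst(src.size(), 0);
  size_t j = 0;
  for (size_t i = 0; i < src.size(); i++) {
    if ((mask >> i) & 1) dst[i] = src[j++];
  }
  return dst;
}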
4487 
4488 void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) {
4489   if (VM_Version::supports_avx512bw()) {
4490     if (mask_len > 32) {
4491       kmovql(dst, src);
4492     } else {
4493       kmovdl(dst, src);
4494       if (mask_len != 32) {
4495         kshiftrdl(dst, dst, 32 - mask_len);
4496       }
4497     }
4498   } else {
4499     assert(mask_len <= 16, "");
4500     kmovwl(dst, src);
4501     if (mask_len != 16) {
4502       kshiftrwl(dst, dst, 16 - mask_len);
4503     }
4504   }
4505 }
4506 
4507 void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) {
4508   int lane_size = type2aelembytes(bt);
4509   bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
4510   if ((is_LP64 || lane_size < 8) &&
4511       ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) ||
4512        (is_subword_type(bt) && VM_Version::supports_avx512vlbw()))) {
4513     movptr(rtmp, imm32);
4514     switch(lane_size) {
4515       case 1 : evpbroadcastb(dst, rtmp, vec_enc); break;
4516       case 2 : evpbroadcastw(dst, rtmp, vec_enc); break;
4517       case 4 : evpbroadcastd(dst, rtmp, vec_enc); break;
4518       case 8 : evpbroadcastq(dst, rtmp, vec_enc); break;
4519       default : ShouldNotReachHere(); break;
4520     }
4521   } else {
4522     movptr(rtmp, imm32);
4523     LP64_ONLY(movq(dst, rtmp)) NOT_LP64(movdl(dst, rtmp));
4524     switch(lane_size) {
4525       case 1 : vpbroadcastb(dst, dst, vec_enc); break;
4526       case 2 : vpbroadcastw(dst, dst, vec_enc); break;
4527       case 4 : vpbroadcastd(dst, dst, vec_enc); break;
4528       case 8 : vpbroadcastq(dst, dst, vec_enc); break;
4529       default : ShouldNotReachHere(); break;
4530     }
4531   }
4532 }
4533 
4534 //
4535 // The following is a lookup table based popcount computation algorithm:
4536 //       Index   Bit set count
4537 //     [ 0000 ->   0,
4538 //       0001 ->   1,
4539 //       0010 ->   1,
4540 //       0011 ->   2,
4541 //       0100 ->   1,
4542 //       0101 ->   2,
4543 //       0110 ->   2,
4544 //       0111 ->   3,
4545 //       1000 ->   1,
4546 //       1001 ->   2,
4547 //       1010 ->   2,
4548 //       1011 ->   3,
4549 //       1100 ->   2,
4550 //       1101 ->   3,
4551 //       1110 ->   3,  1111 ->   4 ]
4552 //  a. Count the number of 1s in 4 LSB bits of each byte. These bits are used as
4553 //     shuffle indices for lookup table access.
4554 //  b. Right shift each byte of vector lane by 4 positions.
4555 //  c. Count the number of 1s in the 4 MSB bits of each byte. These bits are used as
4556 //     shuffle indices for lookup table access.
4557 //  d. Add the bitset count of upper and lower 4 bits of each byte.
4558 //  e. Unpack double words to quad words and compute sum of absolute difference of bitset
4559 //     count of all the bytes of a quadword.
4560 //  f. Perform step e. for upper 128bit vector lane.
4561 //  g. Pack the bitset count of quadwords back to double word.
4562 //  h. Unpacking and packing operations are not needed for 64bit vector lane.
4563 
4564 void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4565                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
4566   assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), "");
4567   vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc);
4568   vpsrlw(dst, src, 4, vec_enc);
4569   vpand(dst, dst, xtmp1, vec_enc);
4570   vpand(xtmp1, src, xtmp1, vec_enc);
4571   vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), rtmp, vec_enc);
4572   vpshufb(xtmp1, xtmp2, xtmp1, vec_enc);
4573   vpshufb(dst, xtmp2, dst, vec_enc);
4574   vpaddb(dst, dst, xtmp1, vec_enc);
4575 }
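Steps a through d of the algorithm above boil down to a nibble-wise table lookup per byte. The scalar sketch below is an editor's illustration of what vector_popcount_byte computes lane-wise, not part of the patch; lut[] spells out the 16-entry table that vector_popcount_lut() is assumed to hold.

#include <cstdint>

static int popcount_byte_lut(uint8_t b) {
  static const uint8_t lut[16] = { 0, 1, 1, 2, 1, 2, 2, 3,
                                   1, 2, 2, 3, 2, 3, 3, 4 };
  int lo = lut[b & 0x0F];          // step a: popcount of the 4 LSBs
  int hi = lut[(b >> 4) & 0x0F];   // steps b-c: shift right by 4, popcount of the 4 MSBs
  return lo + hi;                  // step d: add both nibble counts
}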
4576 
4577 void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4578                                             XMMRegister xtmp2, Register rtmp, int vec_enc) {
4579   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
4580   // The following code corresponds to steps e, f, g and h of the above algorithm.
4581   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4582   vpunpckhdq(dst, xtmp1, xtmp2, vec_enc);
4583   vpsadbw(dst, dst, xtmp2, vec_enc);
4584   vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc);
4585   vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc);
4586   vpackuswb(dst, xtmp1, dst, vec_enc);
4587 }
4588 
4589 void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4590                                               XMMRegister xtmp2, Register rtmp, int vec_enc) {
4591   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
4592   // Add the popcount of upper and lower bytes of word.
4593   vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc);
4594   vpsrlw(dst, xtmp1, 8, vec_enc);
4595   vpand(xtmp1, xtmp1, xtmp2, vec_enc);
4596   vpaddw(dst, dst, xtmp1, vec_enc);
4597 }
4598 
4599 void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4600                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
4601   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
4602   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4603   vpsadbw(dst, xtmp1, xtmp2, vec_enc);
4604 }
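The vpsadbw against a zero vector used above reduces the eight per-byte popcounts of a quadword to a single sum, which is the 64-bit popcount. A scalar model of that reduction (editor's illustration, not part of the patch):

#include <cstdint>

// Sum of the eight byte lanes of a quadword; vpsadbw against zero computes the
// same thing (sum of absolute differences with 0 == sum of the bytes).
static uint64_t sum_byte_lanes(uint64_t qword_of_byte_counts) {
  uint64_t sum = 0;
  for (int i = 0; i < 8; i++) {
    sum += (qword_of_byte_counts >> (8 * i)) & 0xFF;
  }
  return sum;
}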
4605 
4606 void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4607                                                  XMMRegister xtmp2, Register rtmp, int vec_enc) {
4608   switch(bt) {
4609     case T_LONG:
4610       vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
4611       break;
4612     case T_INT:
4613       vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
4614       break;
4615     case T_CHAR:
4616     case T_SHORT:
4617       vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
4618       break;
4619     case T_BYTE:
4620     case T_BOOLEAN:
4621       vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
4622       break;
4623     default:
4624       ShouldNotReachHere();
4625   }
4626 }
4627 
4628 void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src,
4629                                                       KRegister mask, bool merge, int vec_enc) {
4630   assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
4631   switch(bt) {
4632     case T_LONG:
4633       assert(VM_Version::supports_avx512_vpopcntdq(), "");
4634       evpopcntq(dst, mask, src, merge, vec_enc);
4635       break;
4636     case T_INT:
4637       assert(VM_Version::supports_avx512_vpopcntdq(), "");
4638       evpopcntd(dst, mask, src, merge, vec_enc);
4639       break;
4640     case T_CHAR:
4641     case T_SHORT:
4642       assert(VM_Version::supports_avx512_bitalg(), "");
4643       evpopcntw(dst, mask, src, merge, vec_enc);
4644       break;
4645     case T_BYTE:
4646     case T_BOOLEAN:
4647       assert(VM_Version::supports_avx512_bitalg(), "");
4648       evpopcntb(dst, mask, src, merge, vec_enc);
4649       break;
4650     default:
4651       ShouldNotReachHere();
4652   }
4653 }
4654 
4655 #ifndef _LP64
4656 void C2_MacroAssembler::vector_maskall_operation32(KRegister dst, Register src, KRegister tmp, int mask_len) {
4657   assert(VM_Version::supports_avx512bw(), "");
4658   kmovdl(tmp, src);
4659   kunpckdql(dst, tmp, tmp);
4660 }
4661 #endif
4662 
4663 // The bit reversal algorithm first reverses the bits of each byte and then
4664 // performs a byte level reversal for multi-byte primitive types (short/int/long).
4665 // The algorithm uses a lookup table to obtain the reverse bit sequence
4666 // corresponding to a 4 bit value. Thus the reverse bit sequence of a byte
4667 // is obtained by swapping the reversed bit sequences of its upper and lower
4668 // nibbles.
4669 void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4670                                            XMMRegister xtmp2, Register rtmp, int vec_enc) {
4671   if (VM_Version::supports_avx512vlbw()) {
4672 
4673     // Get the reverse bit sequence of lower nibble of each byte.
4674     vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), rtmp, vec_enc);
4675     vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
4676     vpandq(dst, xtmp2, src, vec_enc);
4677     vpshufb(dst, xtmp1, dst, vec_enc);
4678     vpsllq(dst, dst, 4, vec_enc);
4679 
4680     // Get the reverse bit sequence of upper nibble of each byte.
4681     vpandn(xtmp2, xtmp2, src, vec_enc);
4682     vpsrlq(xtmp2, xtmp2, 4, vec_enc);
4683     vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
4684 
4685     // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and
4686     // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte.
4687     vporq(xtmp2, dst, xtmp2, vec_enc);
4688     vector_reverse_byte(bt, dst, xtmp2, rtmp, vec_enc);
4689 
4690   } else if (!VM_Version::supports_avx512vlbw() && vec_enc == Assembler::AVX_512bit) {
4691 
4692     // Shift based bit reversal.
4693     assert(bt == T_LONG || bt == T_INT, "");
4694     vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc);
4695 
4696     // Swap lower and upper nibble of each byte.
4697     vpandq(dst, xtmp1, src, vec_enc);
4698     vpsllq(dst, dst, 4, vec_enc);
4699     vpandn(xtmp2, xtmp1, src, vec_enc);
4700     vpsrlq(xtmp2, xtmp2, 4, vec_enc);
4701     vporq(xtmp1, dst, xtmp2, vec_enc);
4702 
4703     // Swap two least and most significant bits of each nibble.
4704     vbroadcast(T_INT, xtmp2, 0x33333333, rtmp, vec_enc);
4705     vpandq(dst, xtmp2, xtmp1, vec_enc);
4706     vpsllq(dst, dst, 2, vec_enc);
4707     vpandn(xtmp2, xtmp2, xtmp1, vec_enc);
4708     vpsrlq(xtmp2, xtmp2, 2, vec_enc);
4709     vporq(xtmp1, dst, xtmp2, vec_enc);
4710 
4711     // Swap adjacent pair of bits.
4712     vbroadcast(T_INT, xtmp2, 0x55555555, rtmp, vec_enc);
4713     vpandq(dst, xtmp2, xtmp1, vec_enc);
4714     vpsllq(dst, dst, 1, vec_enc);
4715     vpandn(xtmp2, xtmp2, xtmp1, vec_enc);
4716     vpsrlq(xtmp2, xtmp2, 1, vec_enc);
4717     vporq(xtmp1, dst, xtmp2, vec_enc);
4718 
4719     vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc);
4720 
4721   } else {
4722     vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), rtmp, vec_enc);
4723     vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
4724 
4725     // Get the reverse bit sequence of lower nibble of each byte.
4726     vpand(dst, xtmp2, src, vec_enc);
4727     vpshufb(dst, xtmp1, dst, vec_enc);
4728     vpsllq(dst, dst, 4, vec_enc);
4729 
4730     // Get the reverse bit sequence of upper nibble of each byte.
4731     vpandn(xtmp2, xtmp2, src, vec_enc);
4732     vpsrlq(xtmp2, xtmp2, 4, vec_enc);
4733     vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
4734 
4735     // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and
4736     // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte.
4737     vpor(xtmp2, dst, xtmp2, vec_enc);
4738     vector_reverse_byte(bt, dst, xtmp2, rtmp, vec_enc);
4739   }
4740 }
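The nibble lookup scheme used by both branches above is easy to check against a scalar model. The sketch below is an editor's illustration, not part of the patch; rev4[] spells out what vector_reverse_bit_lut() is assumed to contain (each 4-bit value with its bits reversed).

#include <cstdint>

static uint8_t reverse_bits_byte(uint8_t b) {
  static const uint8_t rev4[16] = { 0x0, 0x8, 0x4, 0xC, 0x2, 0xA, 0x6, 0xE,
                                    0x1, 0x9, 0x5, 0xD, 0x3, 0xB, 0x7, 0xF };
  uint8_t lo = rev4[b & 0x0F];         // reversed lower nibble
  uint8_t hi = rev4[(b >> 4) & 0x0F];  // reversed upper nibble
  // The reversed lower nibble becomes the upper nibble and vice versa.
  return (uint8_t)((lo << 4) | hi);
}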
4741 
4742 void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src,
4743                                                 XMMRegister xtmp, AddressLiteral mask, Register rtmp, int vec_enc) {
4744   // Galois field instruction based bit reversal, based on the following algorithm:
4745   // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
4746   assert(VM_Version::supports_gfni(), "");
4747   vpbroadcastq(xtmp, mask, vec_enc, rtmp);
4748   vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc);
4749   vector_reverse_byte(bt, dst, xtmp, rtmp, vec_enc);
4750 }
4751 
4752 void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4753                                               XMMRegister xtmp2, Register rtmp, int vec_enc) {
4754   // Shift based bit reversal.
4755   assert(VM_Version::supports_evex(), "");
4756   evmovdqul(xtmp1, k0, src, true, vec_enc);
4757   switch(bt) {
4758     case T_LONG:
4759       // Swap upper and lower double word of each quad word.
4760       evprorq(xtmp1, k0, xtmp1, 32, true, vec_enc);
4761     case T_INT:
4762       // Swap upper and lower word of each double word.
4763       evprord(xtmp1, k0, xtmp1, 16, true, vec_enc);
4764     case T_SHORT:
4765       // Swap upper and lower byte of each word.
4766       vbroadcast(T_INT, dst, 0x00FF00FF, rtmp, vec_enc);
4767       vpandq(xtmp2, dst, xtmp1, vec_enc);
4768       vpsllq(xtmp2, xtmp2, 8, vec_enc);
4769       vpandn(xtmp1, dst, xtmp1, vec_enc);
4770       vpsrlq(dst, xtmp1, 8, vec_enc);
4771       vporq(dst, dst, xtmp2, vec_enc);
4772       break;
4773     case T_BYTE:
4774       evmovdquq(dst, k0, src, true, vec_enc);
4775       break;
4776     default:
4777       fatal("Unsupported type");
4778       break;
4779   }
4780 }
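The fall-through switch above composes the byte reversal from progressively narrower swaps: rotate each quadword by 32, rotate each doubleword by 16, then swap the bytes within each word. A scalar model of the T_LONG case (editor's illustration, not part of the patch):

#include <cstdint>

static uint64_t reverse_bytes_long_by_shifts(uint64_t x) {
  x = (x >> 32) | (x << 32);                     // rotate quadword by 32: swap doublewords
  x = ((x & 0xFFFF0000FFFF0000ULL) >> 16) |
      ((x & 0x0000FFFF0000FFFFULL) << 16);       // rotate each doubleword by 16: swap words
  x = ((x & 0xFF00FF00FF00FF00ULL) >> 8) |
      ((x & 0x00FF00FF00FF00FFULL) << 8);        // mask/shift/or: swap bytes within each word
  return x;
}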
4781 
4782 void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, Register rtmp, int vec_enc) {
4783   if (bt == T_BYTE) {
4784     if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) {
4785       evmovdquq(dst, k0, src, true, vec_enc);
4786     } else {
4787       vmovdqu(dst, src);
4788     }
4789     return;
4790   }
4791   // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using
4792   // pre-computed shuffle indices.
4793   switch(bt) {
4794     case T_LONG:
4795       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), rtmp, vec_enc);
4796       break;
4797     case T_INT:
4798       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), rtmp, vec_enc);
4799       break;
4800     case T_SHORT:
4801       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), rtmp, vec_enc);
4802       break;
4803     default:
4804       fatal("Unsupported type");
4805       break;
4806   }
4807   vpshufb(dst, src, dst, vec_enc);
4808 }
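The shuffle above permutes the bytes within each element according to a precomputed mask; its effect on a single int lane is the familiar byte swap, shown here as a scalar sketch (editor's illustration, not part of the patch).

#include <cstdint>

static uint32_t reverse_bytes_int(uint32_t x) {
  return (x >> 24) |
         ((x >> 8) & 0x0000FF00u) |
         ((x << 8) & 0x00FF0000u) |
         (x << 24);
}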
4809 
4810 void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
4811                                                         XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
4812                                                         KRegister ktmp, Register rtmp, bool merge, int vec_enc) {
4813   assert(is_integral_type(bt), "");
4814   assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
4815   assert(VM_Version::supports_avx512cd(), "");
4816   switch(bt) {
4817     case T_LONG:
4818       evplzcntq(dst, ktmp, src, merge, vec_enc);
4819       break;
4820     case T_INT:
4821       evplzcntd(dst, ktmp, src, merge, vec_enc);
4822       break;
4823     case T_SHORT:
4824       vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc);
4825       vpunpcklwd(xtmp2, xtmp1, src, vec_enc);
4826       evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc);
4827       vpunpckhwd(dst, xtmp1, src, vec_enc);
4828       evplzcntd(dst, ktmp, dst, merge, vec_enc);
4829       vpackusdw(dst, xtmp2, dst, vec_enc);
4830       break;
4831     case T_BYTE:
4832       // T1 = Compute leading zero counts of 4 LSB bits of each byte by
4833       // accessing the lookup table.
4834       // T2 = Compute leading zero counts of 4 MSB bits of each byte by
4835       // accessing the lookup table.
4836       // Add T1 to T2 if 4 MSB bits of byte are all zeros.
4837       assert(VM_Version::supports_avx512bw(), "");
4838       evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp);
4839       vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc);
4840       vpand(xtmp2, dst, src, vec_enc);
4841       vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
4842       vpsrlw(xtmp3, src, 4, vec_enc);
4843       vpand(xtmp3, dst, xtmp3, vec_enc);
4844       vpshufb(dst, xtmp1, xtmp3, vec_enc);
4845       vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
4846       evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc);
4847       evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc);
4848       break;
4849     default:
4850       ShouldNotReachHere();
4851   }
4852 }
4853 
4854 void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4855                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
4856   vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp);
4857   vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
4858   // T1 = Compute leading zero counts of 4 LSB bits of each byte by
4859   // accessing the lookup table.
4860   vpand(dst, xtmp2, src, vec_enc);
4861   vpshufb(dst, xtmp1, dst, vec_enc);
4862   // T2 = Compute leading zero counts of 4 MSB bits of each byte by
4863   // accessing the lookup table.
4864   vpsrlw(xtmp3, src, 4, vec_enc);
4865   vpand(xtmp3, xtmp2, xtmp3, vec_enc);
4866   vpshufb(xtmp2, xtmp1, xtmp3, vec_enc);
4867   // Add T1 to T2 if 4 MSB bits of byte are all zeros.
4868   vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
4869   vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc);
4870   vpaddb(dst, dst, xtmp2, vec_enc);
4871   vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc);
4872 }
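The add-then-blend at the end of the routine above implements a simple per-byte rule: if the upper nibble is zero the answer is 4 plus the count of the lower nibble, otherwise it is the count of the upper nibble. A scalar sketch (editor's illustration, not part of the patch; clz4[] models what vector_count_leading_zeros_lut() is assumed to contain):

#include <cstdint>

static int clz_byte_lut(uint8_t b) {
  // Leading zeros within a 4-bit value.
  static const uint8_t clz4[16] = { 4, 3, 2, 2, 1, 1, 1, 1,
                                    0, 0, 0, 0, 0, 0, 0, 0 };
  int t1 = clz4[b & 0x0F];         // T1: leading zeros of the 4 LSBs
  int t2 = clz4[(b >> 4) & 0x0F];  // T2: leading zeros of the 4 MSBs
  // If the upper nibble is all zeros, the result is T1 + T2 (== 4 + T1), else T2.
  return ((b >> 4) == 0) ? t1 + t2 : t2;
}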
4873 
4874 void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4875                                                              XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
4876   vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
4877   // Add zero counts of lower byte and upper byte of a word if
4878   // upper byte holds a zero value.
4879   vpsrlw(xtmp3, src, 8, vec_enc);
4880   // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
4881   vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc);
4882   vpsllw(xtmp2, dst, 8, vec_enc);
4883   vpaddw(xtmp2, xtmp2, dst, vec_enc);
4884   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
4885   vpsrlw(dst, dst, 8, vec_enc);
4886 }
4887 
4888 void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4889                                                            XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) {
4890   // Since the IEEE 754 floating point format represents the mantissa in a
4891   // normalized 1.m form, the biased exponent can be used to compute the leading
4892   // zero count as per the following formula:
4893   // LZCNT = 31 - (biased_exp - 127)
4894   // Special handling is needed for zero, max_int and negative source values.
4895 
4896   // Broadcast 0xFF
4897   vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc);
4898   vpsrld(xtmp1, xtmp1, 24, vec_enc);
4899 
4900   // Extract biased exponent.
4901   vcvtdq2ps(dst, src, vec_enc);
4902   vpsrld(dst, dst, 23, vec_enc);
4903   vpand(dst, dst, xtmp1, vec_enc);
4904 
4905   // Broadcast 127.
4906   vpsrld(xtmp1, xtmp1, 1, vec_enc);
4907   // Exponent = biased_exp - 127
4908   vpsubd(dst, dst, xtmp1, vec_enc);
4909 
4910   // Exponent = Exponent  + 1
4911   vpsrld(xtmp3, xtmp1, 6, vec_enc);
4912   vpaddd(dst, dst, xtmp3, vec_enc);
4913 
4914   // Replace -ve exponent with zero, exponent is -ve when src
4915   // lane contains a zero value.
4916   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4917   vblendvps(dst, dst, xtmp2, dst, vec_enc);
4918 
4919   // Rematerialize broadcast 32.
4920   vpslld(xtmp1, xtmp3, 5, vec_enc);
4921   // Exponent is 32 if corresponding source lane contains max_int value.
4922   vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
4923   // LZCNT = 32 - exponent
4924   vpsubd(dst, xtmp1, dst, vec_enc);
4925 
4926   // Replace LZCNT with a value 1 if corresponding source lane
4927   // contains max_int value.
4928   vpblendvb(dst, dst, xtmp3, xtmp2, vec_enc);
4929 
4930   // Replace biased_exp with 0 if source lane value is less than zero.
4931   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4932   vblendvps(dst, dst, xtmp2, src, vec_enc);
4933 }
4934 
4935 void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4936                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
4937   vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
4938   // Add zero counts of lower word and upper word of a double word if
4939   // upper word holds a zero value.
4940   vpsrld(xtmp3, src, 16, vec_enc);
4941   // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
4942   vpcmpeqd(xtmp3, xtmp1, xtmp3, vec_enc);
4943   vpslld(xtmp2, dst, 16, vec_enc);
4944   vpaddd(xtmp2, xtmp2, dst, vec_enc);
4945   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
4946   vpsrld(dst, dst, 16, vec_enc);
4947   // Add zero counts of lower doubleword and upper doubleword of a
4948   // quadword if upper doubleword holds a zero value.
4949   vpsrlq(xtmp3, src, 32, vec_enc);
4950   vpcmpeqq(xtmp3, xtmp1, xtmp3, vec_enc);
4951   vpsllq(xtmp2, dst, 32, vec_enc);
4952   vpaddq(xtmp2, xtmp2, dst, vec_enc);
4953   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
4954   vpsrlq(dst, dst, 32, vec_enc);
4955 }
4956 
4957 void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src,
4958                                                        XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
4959                                                        Register rtmp, int vec_enc) {
4960   assert(is_integral_type(bt), "unexpected type");
4961   assert(vec_enc < Assembler::AVX_512bit, "");
4962   switch(bt) {
4963     case T_LONG:
4964       vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
4965       break;
4966     case T_INT:
4967       vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc);
4968       break;
4969     case T_SHORT:
4970       vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
4971       break;
4972     case T_BYTE:
4973       vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
4974       break;
4975     default:
4976       ShouldNotReachHere();
4977   }
4978 }
4979 
4980 void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) {
4981   switch(bt) {
4982     case T_BYTE:
4983       vpsubb(dst, src1, src2, vec_enc);
4984       break;
4985     case T_SHORT:
4986       vpsubw(dst, src1, src2, vec_enc);
4987       break;
4988     case T_INT:
4989       vpsubd(dst, src1, src2, vec_enc);
4990       break;
4991     case T_LONG:
4992       vpsubq(dst, src1, src2, vec_enc);
4993       break;
4994     default:
4995       ShouldNotReachHere();
4996   }
4997 }
4998 
4999 void C2_MacroAssembler::vpadd(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) {
5000   switch(bt) {
5001     case T_BYTE:
5002       vpaddb(dst, src1, src2, vec_enc);
5003       break;
5004     case T_SHORT:
5005       vpaddw(dst, src1, src2, vec_enc);
5006       break;
5007     case T_INT:
5008       vpaddd(dst, src1, src2, vec_enc);
5009       break;
5010     case T_LONG:
5011       vpaddq(dst, src1, src2, vec_enc);
5012       break;
5013     default:
5014       ShouldNotReachHere();
5015   }
5016 }
5017 
5018 // Trailing zero count is computed from the leading zero count using the equation
5019 // below. All AVX3 targets support the AVX512CD feature, which offers a direct
5020 // vector instruction to compute the leading zero count.
5021 //      CTZ = PRIM_TYPE_WIDTH - CLZ((x - 1) & ~x)
5022 void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
5023                                                          XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
5024                                                          XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) {
5025   assert(is_integral_type(bt), "");
5026   // xtmp = -1
5027   vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc);
5028   // xtmp = xtmp + src
5029   vpadd(bt, xtmp4, xtmp4, src, vec_enc);
5030   // xtmp = xtmp & ~src
5031   vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc);
5032   vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc);
5033   vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc);
5034   vpsub(bt, dst, xtmp4, dst, vec_enc);
5035 }
5036 
5037 // Trailing zero count computation for AVX2 targets is based on the popcount operation, as per the following equation:
5038 //      CTZ = PRIM_TYPE_WIDTH - POPC(x | -x)
5039 void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5040                                                         XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
5041   assert(is_integral_type(bt), "");
5042   // xtmp = 0
5043   vpxor(xtmp3 , xtmp3, xtmp3, vec_enc);
5044   // xtmp = 0 - src
5045   vpsub(bt, xtmp3, xtmp3, src, vec_enc);
5046   // xtmp = xtmp | src
5047   vpor(xtmp3, xtmp3, src, vec_enc);
5048   vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc);
5049   vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc);
5050   vpsub(bt, dst, xtmp1, dst, vec_enc);
5051 }
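Both identities can be verified in scalar form, including the x == 0 case, which correctly yields the full type width. The sketch below is an editor's illustration for 32-bit lanes, not part of the patch.

#include <cstdint>

static int clz32(uint32_t x)  { int n = 0; while (n < 32 && !(x & 0x80000000u)) { n++; x <<= 1; } return n; }
static int popc32(uint32_t x) { int n = 0; while (x != 0) { x &= x - 1; n++; } return n; }

// CTZ = WIDTH - CLZ((x - 1) & ~x): the masked value has ones exactly in the
// trailing-zero positions of x.
static int ctz_via_clz(uint32_t x)      { return 32 - clz32((x - 1) & ~x); }

// CTZ = WIDTH - POPC(x | -x): x | -x has ones from the lowest set bit of x upwards.
static int ctz_via_popcount(uint32_t x) { return 32 - popc32(x | (0u - x)); }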
5052 
5053 void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) {
5054   Label done;
5055   Label neg_divisor_fastpath;
5056   cmpl(divisor, 0);
5057   jccb(Assembler::less, neg_divisor_fastpath);
5058   xorl(rdx, rdx);
5059   divl(divisor);
5060   jmpb(done);
5061   bind(neg_divisor_fastpath);
5062   // Fastpath for divisor < 0:
5063   // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
5064   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
5065   movl(rdx, rax);
5066   subl(rdx, divisor);
5067   if (VM_Version::supports_bmi1()) {
5068     andnl(rax, rdx, rax);
5069   } else {
5070     notl(rdx);
5071     andl(rax, rdx);
5072   }

5195   // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
5196   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
5197   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
5198   // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
5199   movq(rdx, rax);
5200   subq(rax, divisor);
5201   if (VM_Version::supports_bmi1()) {
5202     andnq(rax, rax, rdx);
5203   } else {
5204     notq(rax);
5205     andq(rax, rdx);
5206   }
5207   movq(tmp, rax);
5208   shrq(rax, 63); // quotient
5209   sarq(tmp, 63);
5210   andq(tmp, divisor);
5211   subq(rdx, tmp); // remainder
5212   bind(done);
5213 }
5214 #endif
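When the divisor has its sign bit set (i.e. it is at least 2^63 unsigned for longs, 2^31 for ints), the unsigned quotient can only be 0 or 1, and the fastpaths above compute it branch-free. The Hacker's Delight identity they rely on can be checked in scalar form; the sketch below is an editor's illustration for 64-bit operands, not part of the patch.

#include <cstdint>

// Assumes divisor has its most significant bit set.
static uint64_t udiv_large_divisor(uint64_t dividend, uint64_t divisor) {
  // quotient = (dividend & ~(dividend - divisor)) >>> 63
  // This is 1 exactly when dividend >= divisor (unsigned), otherwise 0.
  return (dividend & ~(dividend - divisor)) >> 63;
}

static uint64_t urem_large_divisor(uint64_t dividend, uint64_t divisor) {
  // remainder = dividend - (quotient != 0 ? divisor : 0), expressed branch-free
  // with an arithmetic shift, as in the assembly above.
  uint64_t masked = dividend & ~(dividend - divisor);
  uint64_t q_mask = (uint64_t)((int64_t)masked >> 63);  // all ones iff quotient == 1
  return dividend - (q_mask & divisor);
}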
