475 }
476
477 // !!!!! Special hack to get all types of calls to specify the byte offset
478 // from the start of the call to the point where the return address
479 // will point.
480 int MachCallStaticJavaNode::ret_addr_offset()
481 {
482 int offset = 5; // 5 bytes from start of call to where return address points
483 offset += clear_avx_size();
484 return offset;
485 }
486
487 int MachCallDynamicJavaNode::ret_addr_offset()
488 {
489 int offset = 15; // 15 bytes from start of call to where return address points
490 offset += clear_avx_size();
491 return offset;
492 }
493
494 int MachCallRuntimeNode::ret_addr_offset() {
495 int offset = 13; // movq r10,#addr; callq (r10)
496 if (this->ideal_Opcode() != Op_CallLeafVector) {
497 offset += clear_avx_size();
498 }
499 return offset;
500 }
501
502 int MachCallNativeNode::ret_addr_offset() {
503 int offset = 13; // movq r10,#addr; callq (r10)
504 offset += clear_avx_size();
505 return offset;
506 }
507 //
508 // Compute padding required for nodes which need alignment
509 //
510
511 // The address of the call instruction needs to be 4-byte aligned to
512 // ensure that it does not span a cache line so that it can be patched.
513 int CallStaticJavaDirectNode::compute_padding(int current_offset) const
514 {
515 current_offset += clear_avx_size(); // skip vzeroupper
516 current_offset += 1; // skip call opcode byte
517 return align_up(current_offset, alignment_required()) - current_offset;
518 }
519
520 // The address of the call instruction needs to be 4-byte aligned to
521 // ensure that it does not span a cache line so that it can be patched.
522 int CallDynamicJavaDirectNode::compute_padding(int current_offset) const
523 {
524 current_offset += clear_avx_size(); // skip vzeroupper
525 current_offset += 11; // skip movq instruction + call opcode byte
526 return align_up(current_offset, alignment_required()) - current_offset;
895 st->print("# stack alignment check");
896 #endif
897 }
898 if (C->stub_function() != NULL && BarrierSet::barrier_set()->barrier_set_nmethod() != NULL) {
899 st->print("\n\t");
900 st->print("cmpl [r15_thread + #disarmed_offset], #disarmed_value\t");
901 st->print("\n\t");
902 st->print("je fast_entry\t");
903 st->print("\n\t");
904 st->print("call #nmethod_entry_barrier_stub\t");
905 st->print("\n\tfast_entry:");
906 }
907 st->cr();
908 }
909 #endif
910
// Emit the method prolog: optional fast class-initialization barrier,
// then the verified entry sequence (stack bang + frame setup).
void MachPrologNode::emit(CodeBuffer &cbuf, PhaseRegAlloc *ra_) const {
  Compile* C = ra_->C;
  MacroAssembler _masm(&cbuf);

  int framesize = C->output()->frame_size_in_bytes();
  int bangsize = C->output()->bang_size_in_bytes();

  if (C->clinit_barrier_on_entry()) {
    // If the holder class is not yet fully initialized, divert to the
    // wrong-method stub instead of running this compiled code.
    assert(VM_Version::supports_fast_class_init_checks(), "sanity");
    assert(!C->method()->holder()->is_not_initialized(), "initialization should have been started");

    Label L_skip_barrier;
    Register klass = rscratch1;

    __ mov_metadata(klass, C->method()->holder()->constant_encoding());
    __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);

    __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path

    __ bind(L_skip_barrier);
  }

  // Frame setup; bang the stack only when the computed bang size requires it.
  __ verified_entry(framesize, C->output()->need_stack_bang(bangsize)?bangsize:0, false, C->stub_function() != NULL);

  C->output()->set_frame_complete(cbuf.insts_size());

  if (C->has_mach_constant_base_node()) {
    // NOTE: We set the table base offset here because users might be
    // emitted before MachConstantBaseNode.
    ConstantTable& constant_table = C->output()->constant_table();
    constant_table.set_table_base_offset(constant_table.calculate_table_base_offset());
  }
}
944
945 uint MachPrologNode::size(PhaseRegAlloc* ra_) const
946 {
947 return MachNode::size(ra_); // too many variables; just compute it
948 // the hard way
949 }
950
951 int MachPrologNode::reloc() const
952 {
953 return 0; // a large enough number
954 }
955
956 //=============================================================================
957 #ifndef PRODUCT
958 void MachEpilogNode::format(PhaseRegAlloc* ra_, outputStream* st) const
959 {
960 Compile* C = ra_->C;
961 if (generate_vzeroupper(C)) {
962 st->print("vzeroupper");
963 st->cr(); st->print("\t");
964 }
965
966 int framesize = C->output()->frame_size_in_bytes();
967 assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
968 // Remove word for return adr already pushed
969 // and RBP
970 framesize -= 2*wordSize;
978 if (do_polling() && C->is_method_compilation()) {
979 st->print("\t");
980 st->print_cr("cmpq rsp, poll_offset[r15_thread] \n\t"
981 "ja #safepoint_stub\t"
982 "# Safepoint: poll for GC");
983 }
984 }
985 #endif
986
// Emit the method epilog: optional vzeroupper, frame teardown, reserved
// stack check, and the return-polling safepoint.
void MachEpilogNode::emit(CodeBuffer& cbuf, PhaseRegAlloc* ra_) const
{
  Compile* C = ra_->C;
  MacroAssembler _masm(&cbuf);

  if (generate_vzeroupper(C)) {
    // Clear upper bits of YMM registers when current compiled code uses
    // wide vectors to avoid AVX <-> SSE transition penalty during call.
    __ vzeroupper();
  }

  int framesize = C->output()->frame_size_in_bytes();
  assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  // Remove word for return adr already pushed
  // and RBP
  framesize -= 2*wordSize;

  // Note that VerifyStackAtCalls' Majik cookie does not change the frame size popped here

  if (framesize) {
    // Pop the frame with addq rsp, #framesize; use the short imm8
    // encoding (0x83) when the frame fits in a signed byte, else imm32 (0x81).
    emit_opcode(cbuf, Assembler::REX_W);
    if (framesize < 0x80) {
      emit_opcode(cbuf, 0x83); // addq rsp, #framesize
      emit_rm(cbuf, 0x3, 0x00, RSP_enc);
      emit_d8(cbuf, framesize);
    } else {
      emit_opcode(cbuf, 0x81); // addq rsp, #framesize
      emit_rm(cbuf, 0x3, 0x00, RSP_enc);
      emit_d32(cbuf, framesize);
    }
  }

  // popq rbp
  emit_opcode(cbuf, 0x58 | RBP_enc);

  if (StackReservedPages > 0 && C->has_reserved_stack_access()) {
    __ reserved_stack_check();
  }

  if (do_polling() && C->is_method_compilation()) {
    // Return-polling safepoint: register this poll site so its stub can be
    // generated, unless this is only a scratch (size-measuring) pass.
    MacroAssembler _masm(&cbuf);
    Label dummy_label;
    Label* code_stub = &dummy_label;
    if (!C->output()->in_scratch_emit_size()) {
      code_stub = &C->output()->safepoint_poll_table()->add_safepoint(__ offset());
    }
    __ relocate(relocInfo::poll_return_type);
    __ safepoint_poll(*code_stub, r15_thread, true /* at_return */, true /* in_nmethod */);
  }
}
1037
1038 uint MachEpilogNode::size(PhaseRegAlloc* ra_) const
1039 {
1040 return MachNode::size(ra_); // too many variables; just compute it
1041 // the hard way
1042 }
1043
1044 int MachEpilogNode::reloc() const
1045 {
1046 return 2; // a large enough number
1047 }
1048
1049 const Pipeline* MachEpilogNode::pipeline() const
1050 {
1051 return MachNode::pipeline_class();
1052 }
1053
1054 //=============================================================================
1055
// Classification of where a value lives, used when generating
// register-to-register / register-to-stack copies. Enumerator order is
// significant (values are compared/indexed); do not reorder.
enum RC {
  rc_bad,    // no valid register class
  rc_int,    // general-purpose register
  rc_kreg,   // AVX-512 opmask (k) register
  rc_float,  // XMM register
  rc_stack   // stack slot
};
1063
1656 emit_opcode(cbuf, reg < 8 ? Assembler::REX_W : Assembler::REX_WR);
1657 emit_opcode(cbuf, 0x8D); // LEA reg,[SP+offset]
1658 emit_rm(cbuf, 0x2, reg & 7, 0x04);
1659 emit_rm(cbuf, 0x0, 0x04, RSP_enc);
1660 emit_d32(cbuf, offset);
1661 } else {
1662 emit_opcode(cbuf, reg < 8 ? Assembler::REX_W : Assembler::REX_WR);
1663 emit_opcode(cbuf, 0x8D); // LEA reg,[SP+offset]
1664 emit_rm(cbuf, 0x1, reg & 7, 0x04);
1665 emit_rm(cbuf, 0x0, 0x04, RSP_enc);
1666 emit_d8(cbuf, offset);
1667 }
1668 }
1669
1670 uint BoxLockNode::size(PhaseRegAlloc *ra_) const
1671 {
1672 int offset = ra_->reg2offset(in_RegMask(0).find_first_elem());
1673 return (offset < 0x80) ? 5 : 8; // REX
1674 }
1675
1676 //=============================================================================
#ifndef PRODUCT
// Debug-only pretty printer for the unverified entry point: the inline
// cache check comparing rax (expected klass) against the receiver's
// klass, with a compressed-klass variant when enabled.
void MachUEPNode::format(PhaseRegAlloc* ra_, outputStream* st) const
{
  if (UseCompressedClassPointers) {
    st->print_cr("movl rscratch1, [j_rarg0 + oopDesc::klass_offset_in_bytes()]\t# compressed klass");
    st->print_cr("\tdecode_klass_not_null rscratch1, rscratch1");
    st->print_cr("\tcmpq rax, rscratch1\t # Inline cache check");
  } else {
    st->print_cr("\tcmpq rax, [j_rarg0 + oopDesc::klass_offset_in_bytes()]\t"
                 "# Inline cache check");
  }
  st->print_cr("\tjne SharedRuntime::_ic_miss_stub");
  st->print_cr("\tnop\t# nops to align entry point");
}
#endif
1692
1693 void MachUEPNode::emit(CodeBuffer& cbuf, PhaseRegAlloc* ra_) const
1694 {
1695 MacroAssembler masm(&cbuf);
1698 masm.load_klass(rscratch1, j_rarg0, rscratch2);
1699 masm.cmpptr(rax, rscratch1);
1700 } else {
1701 masm.cmpptr(rax, Address(j_rarg0, oopDesc::klass_offset_in_bytes()));
1702 }
1703
1704 masm.jump_cc(Assembler::notEqual, RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
1705
1706 /* WARNING these NOPs are critical so that verified entry point is properly
1707 4 bytes aligned for patching by NativeJump::patch_verified_entry() */
1708 int nops_cnt = 4 - ((cbuf.insts_size() - insts_size) & 0x3);
1709 if (OptoBreakpoint) {
1710 // Leave space for int3
1711 nops_cnt -= 1;
1712 }
1713 nops_cnt &= 0x3; // Do not add nops if code is aligned.
1714 if (nops_cnt > 0)
1715 masm.nop(nops_cnt);
1716 }
1717
1718 uint MachUEPNode::size(PhaseRegAlloc* ra_) const
1719 {
1720 return MachNode::size(ra_); // too many variables; just compute it
1721 // the hard way
1722 }
1723
1724
1725 //=============================================================================
1726
1727 const bool Matcher::supports_vector_calling_convention(void) {
1728 if (EnableVectorSupport && UseVectorStubs) {
1729 return true;
1730 }
1731 return false;
1732 }
1733
1734 OptoRegPair Matcher::vector_return_value(uint ideal_reg) {
1735 assert(EnableVectorSupport && UseVectorStubs, "sanity");
1736 int lo = XMM0_num;
1737 int hi = XMM0b_num;
1738 if (ideal_reg == Op_VecX) hi = XMM0d_num;
1739 else if (ideal_reg == Op_VecY) hi = XMM0h_num;
1740 else if (ideal_reg == Op_VecZ) hi = XMM0p_num;
1741 return OptoRegPair(hi, lo);
1742 }
1743
1744 // Is this branch offset short enough that a short branch can be used?
4010 %}
4011 %}
4012
// Indirect Memory Times Scale Plus Positive Index Register Plus Offset Operand
operand indPosIndexScaleOffset(any_RegP reg, immL32 off, rRegI idx, immI2 scale)
%{
  constraint(ALLOC_IN_RC(ptr_reg));
  // Match only when the converted int index is known non-negative (the
  // lower bound of its long type is >= 0).
  predicate(n->in(2)->in(3)->in(1)->as_Type()->type()->is_long()->_lo >= 0);
  match(AddP (AddP reg (LShiftL (ConvI2L idx) scale)) off);

  op_cost(10);
  format %{"[$reg + $off + $idx << $scale]" %}
  interface(MEMORY_INTER) %{
    base($reg);
    index($idx);
    scale($scale);
    disp($off);
  %}
%}
4029
// Indirect Narrow Oop Plus Offset Operand
// Note: x86 architecture doesn't support "scale * index + offset" without a base
// we can't free r12 even with CompressedOops::base() == NULL.
operand indCompressedOopOffset(rRegN reg, immL32 off) %{
  // Only valid when the compressed-oop decode shift is 3 (times_8),
  // matching the fixed scale encoded below.
  predicate(UseCompressedOops && (CompressedOops::shift() == Address::times_8));
  constraint(ALLOC_IN_RC(ptr_reg));
  match(AddP (DecodeN reg) off);

  op_cost(10);
  format %{"[R12 + $reg << 3 + $off] (compressed oop addressing)" %}
  interface(MEMORY_INTER) %{
    base(0xc); // R12 -- presumably the compressed-oop heap base; confirm against register assignments
    index($reg);
    scale(0x3);
    disp($off);
  %}
%}
4047
4048 // Indirect Memory Operand
4049 operand indirectNarrow(rRegN reg)
4352 equal(0x4, "e");
4353 not_equal(0x5, "ne");
4354 less(0x2, "b");
4355 greater_equal(0x3, "nb");
4356 less_equal(0x6, "be");
4357 greater(0x7, "nbe");
4358 overflow(0x0, "o");
4359 no_overflow(0x1, "no");
4360 %}
4361 %}
4362
4363 //----------OPERAND CLASSES----------------------------------------------------
4364 // Operand Classes are groups of operands that are used as to simplify
4365 // instruction definitions by not requiring the AD writer to specify separate
4366 // instructions for every form of operand when the instruction accepts
4367 // multiple operand types with the same basic encoding and format. The classic
4368 // case of this is memory operands.
4369
// All memory addressing modes accepted by memory-using instructions,
// including the narrow-oop (compressed) variants.
opclass memory(indirect, indOffset8, indOffset32, indIndexOffset, indIndex,
               indIndexScale, indPosIndexScale, indIndexScaleOffset, indPosIndexOffset, indPosIndexScaleOffset,
               indCompressedOopOffset,
               indirectNarrow, indOffset8Narrow, indOffset32Narrow,
               indIndexOffsetNarrow, indIndexNarrow, indIndexScaleNarrow,
               indIndexScaleOffsetNarrow, indPosIndexOffsetNarrow, indPosIndexScaleOffsetNarrow);
4376
4377 //----------PIPELINE-----------------------------------------------------------
4378 // Rules which define the behavior of the target architectures pipeline.
4379 pipeline %{
4380
//----------ATTRIBUTES---------------------------------------------------------
attributes %{
  variable_size_instructions;        // Variable size instructions (x86 is CISC)
  max_instructions_per_bundle = 3;   // Up to 3 instructions per bundle
  instruction_unit_size = 1;         // An instruction is 1 bytes long
  instruction_fetch_unit_size = 16;  // The processor fetches one line
  instruction_fetch_units = 1;       // of 16 bytes

  // List of nop instructions
  nops( MachNop );
%}
4392
6846 format %{ "MEMBAR-storestore (empty encoding)" %}
6847 ins_encode( );
6848 ins_pipe(empty);
6849 %}
6850
6851 //----------Move Instructions--------------------------------------------------
6852
// Reinterpret a long as a pointer (pure bit cast); emit a register move
// only when source and destination registers differ.
instruct castX2P(rRegP dst, rRegL src)
%{
  match(Set dst (CastX2P src));

  format %{ "movq $dst, $src\t# long->ptr" %}
  ins_encode %{
    if ($dst$$reg != $src$$reg) {
      __ movptr($dst$$Register, $src$$Register);
    }
  %}
  ins_pipe(ialu_reg_reg); // XXX
%}
6865
// Reinterpret a pointer as a long (pure bit cast); emit a register move
// only when source and destination registers differ.
instruct castP2X(rRegL dst, rRegP src)
%{
  match(Set dst (CastP2X src));

  format %{ "movq $dst, $src\t# ptr -> long" %}
  ins_encode %{
    if ($dst$$reg != $src$$reg) {
      __ movptr($dst$$Register, $src$$Register);
    }
  %}
  ins_pipe(ialu_reg_reg); // XXX
%}
6878
6879 // Convert oop into int for vectors alignment masking
6880 instruct convP2I(rRegI dst, rRegP src)
6881 %{
6882 match(Set dst (ConvL2I (CastP2X src)));
6883
6884 format %{ "movl $dst, $src\t# ptr -> int" %}
6885 ins_encode %{
11155 effect(DEF dst, USE src);
11156 ins_cost(100);
11157 format %{ "movd $dst,$src\t# MoveI2F" %}
11158 ins_encode %{
11159 __ movdl($dst$$XMMRegister, $src$$Register);
11160 %}
11161 ins_pipe( pipe_slow );
11162 %}
11163
// Bitwise move of a long GPR into an XMM register (no numeric
// conversion) — the MoveL2D ideal node.
instruct MoveL2D_reg_reg(regD dst, rRegL src) %{
  match(Set dst (MoveL2D src));
  effect(DEF dst, USE src);
  ins_cost(100);
  format %{ "movd $dst,$src\t# MoveL2D" %}
  ins_encode %{
    __ movdq($dst$$XMMRegister, $src$$Register);
  %}
  ins_pipe( pipe_slow );
%}
11174
11175 // Fast clearing of an array
11176 // Small ClearArray non-AVX512.
11177 instruct rep_stos(rcx_RegL cnt, rdi_RegP base, regD tmp, rax_RegI zero,
11178 Universe dummy, rFlagsReg cr)
11179 %{
11180 predicate(!((ClearArrayNode*)n)->is_large() && (UseAVX <= 2));
11181 match(Set dummy (ClearArray cnt base));
11182 effect(USE_KILL cnt, USE_KILL base, TEMP tmp, KILL zero, KILL cr);
11183
11184 format %{ $$template
11185 $$emit$$"xorq rax, rax\t# ClearArray:\n\t"
11186 $$emit$$"cmp InitArrayShortSize,rcx\n\t"
11187 $$emit$$"jg LARGE\n\t"
11188 $$emit$$"dec rcx\n\t"
11189 $$emit$$"js DONE\t# Zero length\n\t"
11190 $$emit$$"mov rax,(rdi,rcx,8)\t# LOOP\n\t"
11191 $$emit$$"dec rcx\n\t"
11192 $$emit$$"jge LOOP\n\t"
11193 $$emit$$"jmp DONE\n\t"
11194 $$emit$$"# LARGE:\n\t"
11195 if (UseFastStosb) {
11196 $$emit$$"shlq rcx,3\t# Convert doublewords to bytes\n\t"
11197 $$emit$$"rep stosb\t# Store rax to *rdi++ while rcx--\n\t"
11198 } else if (UseXMMForObjInit) {
11199 $$emit$$"mov rdi,rax\n\t"
11200 $$emit$$"vpxor ymm0,ymm0,ymm0\n\t"
11201 $$emit$$"jmpq L_zero_64_bytes\n\t"
11202 $$emit$$"# L_loop:\t# 64-byte LOOP\n\t"
11210 $$emit$$"jl L_tail\n\t"
11211 $$emit$$"vmovdqu ymm0,(rax)\n\t"
11212 $$emit$$"add 0x20,rax\n\t"
11213 $$emit$$"sub 0x4,rcx\n\t"
11214 $$emit$$"# L_tail:\t# Clearing tail bytes\n\t"
11215 $$emit$$"add 0x4,rcx\n\t"
11216 $$emit$$"jle L_end\n\t"
11217 $$emit$$"dec rcx\n\t"
11218 $$emit$$"# L_sloop:\t# 8-byte short loop\n\t"
11219 $$emit$$"vmovq xmm0,(rax)\n\t"
11220 $$emit$$"add 0x8,rax\n\t"
11221 $$emit$$"dec rcx\n\t"
11222 $$emit$$"jge L_sloop\n\t"
11223 $$emit$$"# L_end:\n\t"
11224 } else {
11225 $$emit$$"rep stosq\t# Store rax to *rdi++ while rcx--\n\t"
11226 }
11227 $$emit$$"# DONE"
11228 %}
11229 ins_encode %{
11230 __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register,
11231 $tmp$$XMMRegister, false, knoreg);
11232 %}
11233 ins_pipe(pipe_slow);
11234 %}
11235
11236 // Small ClearArray AVX512 non-constant length.
11237 instruct rep_stos_evex(rcx_RegL cnt, rdi_RegP base, legRegD tmp, kReg ktmp, rax_RegI zero,
11238 Universe dummy, rFlagsReg cr)
11239 %{
11240 predicate(!((ClearArrayNode*)n)->is_large() && (UseAVX > 2));
11241 match(Set dummy (ClearArray cnt base));
11242 ins_cost(125);
11243 effect(USE_KILL cnt, USE_KILL base, TEMP tmp, TEMP ktmp, KILL zero, KILL cr);
11244
11245 format %{ $$template
11246 $$emit$$"xorq rax, rax\t# ClearArray:\n\t"
11247 $$emit$$"cmp InitArrayShortSize,rcx\n\t"
11248 $$emit$$"jg LARGE\n\t"
11249 $$emit$$"dec rcx\n\t"
11250 $$emit$$"js DONE\t# Zero length\n\t"
11251 $$emit$$"mov rax,(rdi,rcx,8)\t# LOOP\n\t"
11252 $$emit$$"dec rcx\n\t"
11253 $$emit$$"jge LOOP\n\t"
11254 $$emit$$"jmp DONE\n\t"
11255 $$emit$$"# LARGE:\n\t"
11256 if (UseFastStosb) {
11257 $$emit$$"shlq rcx,3\t# Convert doublewords to bytes\n\t"
11258 $$emit$$"rep stosb\t# Store rax to *rdi++ while rcx--\n\t"
11259 } else if (UseXMMForObjInit) {
11260 $$emit$$"mov rdi,rax\n\t"
11261 $$emit$$"vpxor ymm0,ymm0,ymm0\n\t"
11262 $$emit$$"jmpq L_zero_64_bytes\n\t"
11263 $$emit$$"# L_loop:\t# 64-byte LOOP\n\t"
11271 $$emit$$"jl L_tail\n\t"
11272 $$emit$$"vmovdqu ymm0,(rax)\n\t"
11273 $$emit$$"add 0x20,rax\n\t"
11274 $$emit$$"sub 0x4,rcx\n\t"
11275 $$emit$$"# L_tail:\t# Clearing tail bytes\n\t"
11276 $$emit$$"add 0x4,rcx\n\t"
11277 $$emit$$"jle L_end\n\t"
11278 $$emit$$"dec rcx\n\t"
11279 $$emit$$"# L_sloop:\t# 8-byte short loop\n\t"
11280 $$emit$$"vmovq xmm0,(rax)\n\t"
11281 $$emit$$"add 0x8,rax\n\t"
11282 $$emit$$"dec rcx\n\t"
11283 $$emit$$"jge L_sloop\n\t"
11284 $$emit$$"# L_end:\n\t"
11285 } else {
11286 $$emit$$"rep stosq\t# Store rax to *rdi++ while rcx--\n\t"
11287 }
11288 $$emit$$"# DONE"
11289 %}
11290 ins_encode %{
11291 __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register,
11292 $tmp$$XMMRegister, false, $ktmp$$KRegister);
11293 %}
11294 ins_pipe(pipe_slow);
11295 %}
11296
// Large ClearArray non-AVX512.
// "Large" arrays (ClearArrayNode::is_large()) skip the short-length fast
// path entirely; the expansion chosen at runtime is rep-stosb, an
// XMM/YMM zeroing loop, or rep-stosq.
instruct rep_stos_large(rcx_RegL cnt, rdi_RegP base, regD tmp, rax_RegI zero,
                        Universe dummy, rFlagsReg cr)
%{
  predicate((UseAVX <=2) && ((ClearArrayNode*)n)->is_large());
  match(Set dummy (ClearArray cnt base));
  effect(USE_KILL cnt, USE_KILL base, TEMP tmp, KILL zero, KILL cr);

  format %{ $$template
    if (UseFastStosb) {
      $$emit$$"xorq rax, rax\t# ClearArray:\n\t"
      $$emit$$"shlq rcx,3\t# Convert doublewords to bytes\n\t"
      $$emit$$"rep stosb\t# Store rax to *rdi++ while rcx--"
    } else if (UseXMMForObjInit) {
      $$emit$$"mov rdi,rax\t# ClearArray:\n\t"
      $$emit$$"vpxor ymm0,ymm0,ymm0\n\t"
      $$emit$$"jmpq L_zero_64_bytes\n\t"
      $$emit$$"# L_loop:\t# 64-byte LOOP\n\t"
      $$emit$$"vmovdqu ymm0,(rax)\n\t"
      $$emit$$"vmovdqu ymm0,0x20(rax)\n\t"
      $$emit$$"add 0x40,rax\n\t"
      $$emit$$"# L_zero_64_bytes:\n\t"
      $$emit$$"sub 0x8,rcx\n\t"
      $$emit$$"jge L_loop\n\t"
      $$emit$$"add 0x4,rcx\n\t"
      $$emit$$"jl L_tail\n\t"
      $$emit$$"vmovdqu ymm0,(rax)\n\t"
      $$emit$$"add 0x20,rax\n\t"
      $$emit$$"sub 0x4,rcx\n\t"
      $$emit$$"# L_tail:\t# Clearing tail bytes\n\t"
      $$emit$$"add 0x4,rcx\n\t"
      $$emit$$"jle L_end\n\t"
      $$emit$$"dec rcx\n\t"
      $$emit$$"# L_sloop:\t# 8-byte short loop\n\t"
      $$emit$$"vmovq xmm0,(rax)\n\t"
      $$emit$$"add 0x8,rax\n\t"
      $$emit$$"dec rcx\n\t"
      $$emit$$"jge L_sloop\n\t"
      $$emit$$"# L_end:\n\t"
    } else {
      $$emit$$"xorq rax, rax\t# ClearArray:\n\t"
      $$emit$$"rep stosq\t# Store rax to *rdi++ while rcx--"
    }
  %}
  ins_encode %{
    // 'true' selects clear_mem's large-array path; knoreg = no opmask.
    __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register,
                 $tmp$$XMMRegister, true, knoreg);
  %}
  ins_pipe(pipe_slow);
%}
11347
// Large ClearArray AVX512.
// Same expansion choices as rep_stos_large, but with an opmask temp
// (ktmp) available for clear_mem's AVX-512 path.
instruct rep_stos_large_evex(rcx_RegL cnt, rdi_RegP base, legRegD tmp, kReg ktmp, rax_RegI zero,
                             Universe dummy, rFlagsReg cr)
%{
  predicate((UseAVX > 2) && ((ClearArrayNode*)n)->is_large());
  match(Set dummy (ClearArray cnt base));
  effect(USE_KILL cnt, USE_KILL base, TEMP tmp, TEMP ktmp, KILL zero, KILL cr);

  format %{ $$template
    if (UseFastStosb) {
      $$emit$$"xorq rax, rax\t# ClearArray:\n\t"
      $$emit$$"shlq rcx,3\t# Convert doublewords to bytes\n\t"
      $$emit$$"rep stosb\t# Store rax to *rdi++ while rcx--"
    } else if (UseXMMForObjInit) {
      $$emit$$"mov rdi,rax\t# ClearArray:\n\t"
      $$emit$$"vpxor ymm0,ymm0,ymm0\n\t"
      $$emit$$"jmpq L_zero_64_bytes\n\t"
      $$emit$$"# L_loop:\t# 64-byte LOOP\n\t"
      $$emit$$"vmovdqu ymm0,(rax)\n\t"
      $$emit$$"vmovdqu ymm0,0x20(rax)\n\t"
      $$emit$$"add 0x40,rax\n\t"
      $$emit$$"# L_zero_64_bytes:\n\t"
      $$emit$$"sub 0x8,rcx\n\t"
      $$emit$$"jge L_loop\n\t"
      $$emit$$"add 0x4,rcx\n\t"
      $$emit$$"jl L_tail\n\t"
      $$emit$$"vmovdqu ymm0,(rax)\n\t"
      $$emit$$"add 0x20,rax\n\t"
      $$emit$$"sub 0x4,rcx\n\t"
      $$emit$$"# L_tail:\t# Clearing tail bytes\n\t"
      $$emit$$"add 0x4,rcx\n\t"
      $$emit$$"jle L_end\n\t"
      $$emit$$"dec rcx\n\t"
      $$emit$$"# L_sloop:\t# 8-byte short loop\n\t"
      $$emit$$"vmovq xmm0,(rax)\n\t"
      $$emit$$"add 0x8,rax\n\t"
      $$emit$$"dec rcx\n\t"
      $$emit$$"jge L_sloop\n\t"
      $$emit$$"# L_end:\n\t"
    } else {
      $$emit$$"xorq rax, rax\t# ClearArray:\n\t"
      $$emit$$"rep stosq\t# Store rax to *rdi++ while rcx--"
    }
  %}
  ins_encode %{
    __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register,
                 $tmp$$XMMRegister, true, $ktmp$$KRegister);
  %}
  ins_pipe(pipe_slow);
%}
11398
// Small ClearArray AVX512 constant length.
// The element count is a compile-time constant ($cnt$$constant); requires
// AVX-512 VL+BW — presumably for clear_mem's masked-store constant-length
// path (confirm against MacroAssembler::clear_mem).
instruct rep_stos_im(immL cnt, rRegP base, regD tmp, rRegI zero, kReg ktmp, Universe dummy, rFlagsReg cr)
%{
  predicate(!((ClearArrayNode*)n)->is_large() &&
              ((UseAVX > 2) && VM_Version::supports_avx512vlbw()));
  match(Set dummy (ClearArray cnt base));
  ins_cost(100);
  effect(TEMP tmp, TEMP zero, TEMP ktmp, KILL cr);
  format %{ "clear_mem_imm $base , $cnt \n\t" %}
  ins_encode %{
    __ clear_mem($base$$Register, $cnt$$constant, $zero$$Register, $tmp$$XMMRegister, $ktmp$$KRegister);
  %}
  ins_pipe(pipe_slow);
%}
11413
// Compare two byte[] strings (StrIntrinsicNode::LL = both Latin-1);
// result goes to rax. Non-AVX512-BW variant (passes knoreg as the mask).
instruct string_compareL(rdi_RegP str1, rcx_RegI cnt1, rsi_RegP str2, rdx_RegI cnt2,
                         rax_RegI result, legRegD tmp1, rFlagsReg cr)
%{
  predicate(!VM_Version::supports_avx512vlbw() && ((StrCompNode*)n)->encoding() == StrIntrinsicNode::LL);
  match(Set result (StrComp (Binary str1 cnt1) (Binary str2 cnt2)));
  effect(TEMP tmp1, USE_KILL str1, USE_KILL str2, USE_KILL cnt1, USE_KILL cnt2, KILL cr);

  format %{ "String Compare byte[] $str1,$cnt1,$str2,$cnt2 -> $result // KILL $tmp1" %}
  ins_encode %{
    __ string_compare($str1$$Register, $str2$$Register,
                      $cnt1$$Register, $cnt2$$Register, $result$$Register,
                      $tmp1$$XMMRegister, StrIntrinsicNode::LL, knoreg);
  %}
  ins_pipe( pipe_slow );
%}
11429
13217
13218 ins_cost(300);
13219 format %{ "call_leaf,vector " %}
13220 ins_encode(Java_To_Runtime(meth));
13221 ins_pipe(pipe_slow);
13222 %}
13223
13224 //
// Direct native call: clear_avx emits any needed vzeroupper, then
// Java_To_Runtime performs the runtime call sequence.
instruct CallNativeDirect(method meth)
%{
  match(CallNative);
  effect(USE meth);

  ins_cost(300);
  format %{ "call_native " %}
  ins_encode(clear_avx, Java_To_Runtime(meth));
  ins_pipe(pipe_slow);
%}
13235
// Call runtime without safepoint
// Leaf call with no FP-state bookkeeping: vzeroupper (clear_avx) followed
// by the runtime call sequence.
instruct CallLeafNoFPDirect(method meth)
%{
  match(CallLeafNoFP);
  effect(USE meth);

  ins_cost(300);
  format %{ "call_leaf_nofp,runtime " %}
  ins_encode(clear_avx, Java_To_Runtime(meth));
  ins_pipe(pipe_slow);
%}
13247
13248 // Return Instruction
13249 // Remove the return address & jump to it.
13250 // Notice: We always emit a nop after a ret to make sure there is room
13251 // for safepoint patching
13252 instruct Ret()
13253 %{
13254 match(Return);
13255
13256 format %{ "ret" %}
13257 ins_encode %{
13258 __ ret(0);
|
475 }
476
477 // !!!!! Special hack to get all types of calls to specify the byte offset
478 // from the start of the call to the point where the return address
479 // will point.
480 int MachCallStaticJavaNode::ret_addr_offset()
481 {
482 int offset = 5; // 5 bytes from start of call to where return address points
483 offset += clear_avx_size();
484 return offset;
485 }
486
487 int MachCallDynamicJavaNode::ret_addr_offset()
488 {
489 int offset = 15; // 15 bytes from start of call to where return address points
490 offset += clear_avx_size();
491 return offset;
492 }
493
// Byte offset from the start of this call's code to the point the return
// address will point at. Must stay in sync with the actual emission.
int MachCallRuntimeNode::ret_addr_offset() {
  if (_entry_point == NULL) {
    // CallLeafNoFPInDirect
    return 3; // callq (register)
  }
  int offset = 13; // movq r10,#addr; callq (r10)
  if (this->ideal_Opcode() != Op_CallLeafVector) {
    // Vector leaf calls keep AVX state live, so no vzeroupper is emitted.
    offset += clear_avx_size();
  }
  return offset;
}
505
506 int MachCallNativeNode::ret_addr_offset() {
507 int offset = 13; // movq r10,#addr; callq (r10)
508 offset += clear_avx_size();
509 return offset;
510 }
511
512 //
513 // Compute padding required for nodes which need alignment
514 //
515
516 // The address of the call instruction needs to be 4-byte aligned to
517 // ensure that it does not span a cache line so that it can be patched.
518 int CallStaticJavaDirectNode::compute_padding(int current_offset) const
519 {
520 current_offset += clear_avx_size(); // skip vzeroupper
521 current_offset += 1; // skip call opcode byte
522 return align_up(current_offset, alignment_required()) - current_offset;
523 }
524
525 // The address of the call instruction needs to be 4-byte aligned to
526 // ensure that it does not span a cache line so that it can be patched.
527 int CallDynamicJavaDirectNode::compute_padding(int current_offset) const
528 {
529 current_offset += clear_avx_size(); // skip vzeroupper
530 current_offset += 11; // skip movq instruction + call opcode byte
531 return align_up(current_offset, alignment_required()) - current_offset;
900 st->print("# stack alignment check");
901 #endif
902 }
903 if (C->stub_function() != NULL && BarrierSet::barrier_set()->barrier_set_nmethod() != NULL) {
904 st->print("\n\t");
905 st->print("cmpl [r15_thread + #disarmed_offset], #disarmed_value\t");
906 st->print("\n\t");
907 st->print("je fast_entry\t");
908 st->print("\n\t");
909 st->print("call #nmethod_entry_barrier_stub\t");
910 st->print("\n\tfast_entry:");
911 }
912 st->cr();
913 }
914 #endif
915
// Emit the method prolog: optional fast class-initialization barrier,
// frame setup, and (for non-stub methods) the nmethod entry barrier.
void MachPrologNode::emit(CodeBuffer &cbuf, PhaseRegAlloc *ra_) const {
  Compile* C = ra_->C;
  MacroAssembler _masm(&cbuf);

  if (C->clinit_barrier_on_entry()) {
    // If the holder class is not yet fully initialized, divert to the
    // wrong-method stub instead of running this compiled code.
    assert(VM_Version::supports_fast_class_init_checks(), "sanity");
    assert(!C->method()->holder()->is_not_initialized(), "initialization should have been started");

    Label L_skip_barrier;
    Register klass = rscratch1;

    __ mov_metadata(klass, C->method()->holder()->constant_encoding());
    __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);

    __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path

    __ bind(L_skip_barrier);
  }

  // Frame setup, then bind the label that MachVEPNode's verified path
  // jumps to after unpacking inline-type arguments.
  __ verified_entry(C);
  __ bind(*_verified_entry);

  // nmethod entry barriers apply to normal compiled methods, not stubs.
  if (C->stub_function() == NULL) {
    BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
    bs->nmethod_entry_barrier(&_masm);
  }

  C->output()->set_frame_complete(cbuf.insts_size());

  if (C->has_mach_constant_base_node()) {
    // NOTE: We set the table base offset here because users might be
    // emitted before MachConstantBaseNode.
    ConstantTable& constant_table = C->output()->constant_table();
    constant_table.set_table_base_offset(constant_table.calculate_table_base_offset());
  }
}
952
953 int MachPrologNode::reloc() const
954 {
955 return 0; // a large enough number
956 }
957
958 //=============================================================================
959 #ifndef PRODUCT
960 void MachEpilogNode::format(PhaseRegAlloc* ra_, outputStream* st) const
961 {
962 Compile* C = ra_->C;
963 if (generate_vzeroupper(C)) {
964 st->print("vzeroupper");
965 st->cr(); st->print("\t");
966 }
967
968 int framesize = C->output()->frame_size_in_bytes();
969 assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
970 // Remove word for return adr already pushed
971 // and RBP
972 framesize -= 2*wordSize;
980 if (do_polling() && C->is_method_compilation()) {
981 st->print("\t");
982 st->print_cr("cmpq rsp, poll_offset[r15_thread] \n\t"
983 "ja #safepoint_stub\t"
984 "# Safepoint: poll for GC");
985 }
986 }
987 #endif
988
// Emit the method epilog: optional vzeroupper, frame removal, reserved
// stack check, and the return-polling safepoint.
void MachEpilogNode::emit(CodeBuffer& cbuf, PhaseRegAlloc* ra_) const
{
  Compile* C = ra_->C;
  MacroAssembler _masm(&cbuf);

  if (generate_vzeroupper(C)) {
    // Clear upper bits of YMM registers when current compiled code uses
    // wide vectors to avoid AVX <-> SSE transition penalty during call.
    __ vzeroupper();
  }

  // Subtract two words to account for return address and rbp
  int initial_framesize = C->output()->frame_size_in_bytes() - 2*wordSize;
  __ remove_frame(initial_framesize, C->needs_stack_repair());

  if (StackReservedPages > 0 && C->has_reserved_stack_access()) {
    __ reserved_stack_check();
  }

  if (do_polling() && C->is_method_compilation()) {
    // Return-polling safepoint: register this poll site so its stub can be
    // generated, unless this is only a scratch (size-measuring) pass.
    MacroAssembler _masm(&cbuf);
    Label dummy_label;
    Label* code_stub = &dummy_label;
    if (!C->output()->in_scratch_emit_size()) {
      code_stub = &C->output()->safepoint_poll_table()->add_safepoint(__ offset());
    }
    __ relocate(relocInfo::poll_return_type);
    __ safepoint_poll(*code_stub, r15_thread, true /* at_return */, true /* in_nmethod */);
  }
}
1019
1020 int MachEpilogNode::reloc() const
1021 {
1022 return 2; // a large enough number
1023 }
1024
1025 const Pipeline* MachEpilogNode::pipeline() const
1026 {
1027 return MachNode::pipeline_class();
1028 }
1029
1030 //=============================================================================
1031
// Classification of where a value lives, used when generating
// register-to-register / register-to-stack copies. Enumerator order is
// significant (values are compared/indexed); do not reorder.
enum RC {
  rc_bad,    // no valid register class
  rc_int,    // general-purpose register
  rc_kreg,   // AVX-512 opmask (k) register
  rc_float,  // XMM register
  rc_stack   // stack slot
};
1039
1632 emit_opcode(cbuf, reg < 8 ? Assembler::REX_W : Assembler::REX_WR);
1633 emit_opcode(cbuf, 0x8D); // LEA reg,[SP+offset]
1634 emit_rm(cbuf, 0x2, reg & 7, 0x04);
1635 emit_rm(cbuf, 0x0, 0x04, RSP_enc);
1636 emit_d32(cbuf, offset);
1637 } else {
1638 emit_opcode(cbuf, reg < 8 ? Assembler::REX_W : Assembler::REX_WR);
1639 emit_opcode(cbuf, 0x8D); // LEA reg,[SP+offset]
1640 emit_rm(cbuf, 0x1, reg & 7, 0x04);
1641 emit_rm(cbuf, 0x0, 0x04, RSP_enc);
1642 emit_d8(cbuf, offset);
1643 }
1644 }
1645
1646 uint BoxLockNode::size(PhaseRegAlloc *ra_) const
1647 {
1648 int offset = ra_->reg2offset(in_RegMask(0).find_first_elem());
1649 return (offset < 0x80) ? 5 : 8; // REX
1650 }
1651
1652 //=============================================================================
#ifndef PRODUCT
// Debug-only pretty printer; the node has no fixed textual expansion
// (it emits either the IC check or the inline-args unpack path), so
// just print its name.
void MachVEPNode::format(PhaseRegAlloc* ra_, outputStream* st) const
{
  st->print_cr("MachVEPNode");
}
#endif
1659
// Emit either the unverified entry (inline cache check) or the
// inline-type verified entry (argument unpacking + frame setup),
// selected by the _verified flag.
void MachVEPNode::emit(CodeBuffer& cbuf, PhaseRegAlloc* ra_) const
{
  MacroAssembler _masm(&cbuf);
  if (!_verified) {
    // Inline cache check: compare the expected klass in rax against the
    // receiver's (j_rarg0) klass; mismatch goes to the IC miss stub.
    uint insts_size = cbuf.insts_size();
    if (UseCompressedClassPointers) {
      __ load_klass(rscratch1, j_rarg0, rscratch2);
      __ cmpptr(rax, rscratch1);
    } else {
      __ cmpptr(rax, Address(j_rarg0, oopDesc::klass_offset_in_bytes()));
    }
    __ jump_cc(Assembler::notEqual, RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
  } else {
    // Unpack inline type args passed as oop and then jump to
    // the verified entry point (skipping the unverified entry).
    int sp_inc = __ unpack_inline_args(ra_->C, _receiver_only);
    // Emit code for verified entry and save increment for stack repair on return
    __ verified_entry(ra_->C, sp_inc);
    __ jmp(*_verified_entry);
  }
}
1681
1682 //=============================================================================
#ifndef PRODUCT
// Debug listing of the unverified entry point: inline cache check,
// conditional jump to the ic-miss stub, then alignment nops.
void MachUEPNode::format(PhaseRegAlloc* ra_, outputStream* st) const
{
  if (UseCompressedClassPointers) {
    st->print_cr("movl rscratch1, [j_rarg0 + oopDesc::klass_offset_in_bytes()]\t# compressed klass");
    st->print_cr("\tdecode_klass_not_null rscratch1, rscratch1");
    st->print_cr("\tcmpq rax, rscratch1\t # Inline cache check");
  } else {
    st->print_cr("\tcmpq rax, [j_rarg0 + oopDesc::klass_offset_in_bytes()]\t"
                 "# Inline cache check");
  }
  st->print_cr("\tjne SharedRuntime::_ic_miss_stub");
  st->print_cr("\tnop\t# nops to align entry point");
}
#endif
1698
1699 void MachUEPNode::emit(CodeBuffer& cbuf, PhaseRegAlloc* ra_) const
1700 {
1701 MacroAssembler masm(&cbuf);
1704 masm.load_klass(rscratch1, j_rarg0, rscratch2);
1705 masm.cmpptr(rax, rscratch1);
1706 } else {
1707 masm.cmpptr(rax, Address(j_rarg0, oopDesc::klass_offset_in_bytes()));
1708 }
1709
1710 masm.jump_cc(Assembler::notEqual, RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
1711
1712 /* WARNING these NOPs are critical so that verified entry point is properly
1713 4 bytes aligned for patching by NativeJump::patch_verified_entry() */
1714 int nops_cnt = 4 - ((cbuf.insts_size() - insts_size) & 0x3);
1715 if (OptoBreakpoint) {
1716 // Leave space for int3
1717 nops_cnt -= 1;
1718 }
1719 nops_cnt &= 0x3; // Do not add nops if code is aligned.
1720 if (nops_cnt > 0)
1721 masm.nop(nops_cnt);
1722 }
1723
1724 //=============================================================================
1725
1726 const bool Matcher::supports_vector_calling_convention(void) {
1727 if (EnableVectorSupport && UseVectorStubs) {
1728 return true;
1729 }
1730 return false;
1731 }
1732
1733 OptoRegPair Matcher::vector_return_value(uint ideal_reg) {
1734 assert(EnableVectorSupport && UseVectorStubs, "sanity");
1735 int lo = XMM0_num;
1736 int hi = XMM0b_num;
1737 if (ideal_reg == Op_VecX) hi = XMM0d_num;
1738 else if (ideal_reg == Op_VecY) hi = XMM0h_num;
1739 else if (ideal_reg == Op_VecZ) hi = XMM0p_num;
1740 return OptoRegPair(hi, lo);
1741 }
1742
1743 // Is this branch offset short enough that a short branch can be used?
4009 %}
4010 %}
4011
// Indirect Memory Times Scale Plus Positive Index Register Plus Offset Operand
operand indPosIndexScaleOffset(any_RegP reg, immL32 off, rRegI idx, immI2 scale)
%{
  constraint(ALLOC_IN_RC(ptr_reg));
  // Only matches when the int index converted by ConvI2L is known to be
  // non-negative (type lower bound >= 0).
  predicate(n->in(2)->in(3)->in(1)->as_Type()->type()->is_long()->_lo >= 0);
  match(AddP (AddP reg (LShiftL (ConvI2L idx) scale)) off);

  op_cost(10);
  format %{"[$reg + $off + $idx << $scale]" %}
  interface(MEMORY_INTER) %{
    base($reg);
    index($idx);
    scale($scale);
    disp($off);
  %}
%}
4028
// Indirect Narrow Oop Operand
// Addresses the decoded oop as [R12 + reg << 3]; only valid when
// compressed oops use a shift of 3 (times_8).
operand indCompressedOop(rRegN reg) %{
  predicate(UseCompressedOops && (CompressedOops::shift() == Address::times_8));
  constraint(ALLOC_IN_RC(ptr_reg));
  match(DecodeN reg);

  op_cost(10);
  format %{"[R12 + $reg << 3] (compressed oop addressing)" %}
  interface(MEMORY_INTER) %{
    base(0xc); // R12
    index($reg);
    scale(0x3);
    disp(0x0);
  %}
%}
4044
// Indirect Narrow Oop Plus Offset Operand
// Note: x86 architecture doesn't support "scale * index + offset" without a base
// we can't free r12 even with CompressedOops::base() == NULL.
operand indCompressedOopOffset(rRegN reg, immL32 off) %{
  // Same addressing as indCompressedOop, plus a 32-bit displacement.
  predicate(UseCompressedOops && (CompressedOops::shift() == Address::times_8));
  constraint(ALLOC_IN_RC(ptr_reg));
  match(AddP (DecodeN reg) off);

  op_cost(10);
  format %{"[R12 + $reg << 3 + $off] (compressed oop addressing)" %}
  interface(MEMORY_INTER) %{
    base(0xc); // R12
    index($reg);
    scale(0x3);
    disp($off);
  %}
%}
4062
4063 // Indirect Memory Operand
4064 operand indirectNarrow(rRegN reg)
4367 equal(0x4, "e");
4368 not_equal(0x5, "ne");
4369 less(0x2, "b");
4370 greater_equal(0x3, "nb");
4371 less_equal(0x6, "be");
4372 greater(0x7, "nbe");
4373 overflow(0x0, "o");
4374 no_overflow(0x1, "no");
4375 %}
4376 %}
4377
4378 //----------OPERAND CLASSES----------------------------------------------------
4379 // Operand Classes are groups of operands that are used as to simplify
4380 // instruction definitions by not requiring the AD writer to specify separate
4381 // instructions for every form of operand when the instruction accepts
4382 // multiple operand types with the same basic encoding and format. The classic
4383 // case of this is memory operands.
4384
// "memory" accepts every addressing-mode operand defined above, both the
// plain and the narrow-oop (compressed) variants.
opclass memory(indirect, indOffset8, indOffset32, indIndexOffset, indIndex,
               indIndexScale, indPosIndexScale, indIndexScaleOffset, indPosIndexOffset, indPosIndexScaleOffset,
               indCompressedOop, indCompressedOopOffset,
               indirectNarrow, indOffset8Narrow, indOffset32Narrow,
               indIndexOffsetNarrow, indIndexNarrow, indIndexScaleNarrow,
               indIndexScaleOffsetNarrow, indPosIndexOffsetNarrow, indPosIndexScaleOffsetNarrow);
4391
4392 //----------PIPELINE-----------------------------------------------------------
4393 // Rules which define the behavior of the target architectures pipeline.
4394 pipeline %{
4395
4396 //----------ATTRIBUTES---------------------------------------------------------
  attributes %{
    variable_size_instructions;        // Variable size instructions
    max_instructions_per_bundle = 3;   // Up to 3 instructions per bundle
    instruction_unit_size = 1;         // An instruction is 1 bytes long
    instruction_fetch_unit_size = 16;  // The processor fetches one line
    instruction_fetch_units = 1;       // of 16 bytes

    // List of nop instructions
    nops( MachNop );
  %}
4407
6861 format %{ "MEMBAR-storestore (empty encoding)" %}
6862 ins_encode( );
6863 ins_pipe(empty);
6864 %}
6865
6866 //----------Move Instructions--------------------------------------------------
6867
// Reinterpret a long register as a pointer (CastX2P).
instruct castX2P(rRegP dst, rRegL src)
%{
  match(Set dst (CastX2P src));

  format %{ "movq $dst, $src\t# long->ptr" %}
  ins_encode %{
    // Elide the move entirely when src and dst are the same register.
    if ($dst$$reg != $src$$reg) {
      __ movptr($dst$$Register, $src$$Register);
    }
  %}
  ins_pipe(ialu_reg_reg); // XXX
%}
6880
// Reinterpret a narrow (compressed) oop register as a long
// (matches CastP2X applied to a narrow input).
instruct castN2X(rRegL dst, rRegN src)
%{
  match(Set dst (CastP2X src));

  format %{ "movq $dst, $src\t# ptr -> long" %}
  ins_encode %{
    // Elide the move entirely when src and dst are the same register.
    if ($dst$$reg != $src$$reg) {
      __ movptr($dst$$Register, $src$$Register);
    }
  %}
  ins_pipe(ialu_reg_reg); // XXX
%}
6893
// Reinterpret a pointer register as a long (CastP2X).
instruct castP2X(rRegL dst, rRegP src)
%{
  match(Set dst (CastP2X src));

  format %{ "movq $dst, $src\t# ptr -> long" %}
  ins_encode %{
    // Elide the move entirely when src and dst are the same register.
    if ($dst$$reg != $src$$reg) {
      __ movptr($dst$$Register, $src$$Register);
    }
  %}
  ins_pipe(ialu_reg_reg); // XXX
%}
6906
6907 // Convert oop into int for vectors alignment masking
6908 instruct convP2I(rRegI dst, rRegP src)
6909 %{
6910 match(Set dst (ConvL2I (CastP2X src)));
6911
6912 format %{ "movl $dst, $src\t# ptr -> int" %}
6913 ins_encode %{
11183 effect(DEF dst, USE src);
11184 ins_cost(100);
11185 format %{ "movd $dst,$src\t# MoveI2F" %}
11186 ins_encode %{
11187 __ movdl($dst$$XMMRegister, $src$$Register);
11188 %}
11189 ins_pipe( pipe_slow );
11190 %}
11191
// MoveL2D: transfer the raw 64-bit pattern of a long GPR into an XMM
// register via movdq (bit move, not a numeric conversion).
instruct MoveL2D_reg_reg(regD dst, rRegL src) %{
  match(Set dst (MoveL2D src));
  effect(DEF dst, USE src);
  ins_cost(100);
  format %{ "movd $dst,$src\t# MoveL2D" %}
  ins_encode %{
    __ movdq($dst$$XMMRegister, $src$$Register);
  %}
  ins_pipe( pipe_slow );
%}
11202
11203
// Fast clearing of an array
// Small ClearArray non-AVX512.
instruct rep_stos(rcx_RegL cnt, rdi_RegP base, regD tmp, rax_RegL val,
                  Universe dummy, rFlagsReg cr)
%{
  // Non-large clear, any fill pattern, pre-AVX512 hardware.
  predicate(!((ClearArrayNode*)n)->is_large() && !((ClearArrayNode*)n)->word_copy_only() && (UseAVX <= 2));
  match(Set dummy (ClearArray (Binary cnt base) val));
  effect(USE_KILL cnt, USE_KILL base, TEMP tmp, USE_KILL val, KILL cr);

  // Debug-listing template; the real code is emitted by clear_mem below.
  format %{ $$template
    $$emit$$"cmp InitArrayShortSize,rcx\n\t"
    $$emit$$"jg LARGE\n\t"
    $$emit$$"dec rcx\n\t"
    $$emit$$"js DONE\t# Zero length\n\t"
    $$emit$$"mov rax,(rdi,rcx,8)\t# LOOP\n\t"
    $$emit$$"dec rcx\n\t"
    $$emit$$"jge LOOP\n\t"
    $$emit$$"jmp DONE\n\t"
    $$emit$$"# LARGE:\n\t"
    if (UseFastStosb) {
      $$emit$$"shlq rcx,3\t# Convert doublewords to bytes\n\t"
      $$emit$$"rep stosb\t# Store rax to *rdi++ while rcx--\n\t"
    } else if (UseXMMForObjInit) {
      $$emit$$"movdq $tmp, $val\n\t"
      $$emit$$"punpcklqdq $tmp, $tmp\n\t"
      $$emit$$"vinserti128_high $tmp, $tmp\n\t"
      $$emit$$"jmpq L_zero_64_bytes\n\t"
      $$emit$$"# L_loop:\t# 64-byte LOOP\n\t"
      $$emit$$"vmovdqu $tmp,(rax)\n\t"
      $$emit$$"vmovdqu $tmp,0x20(rax)\n\t"
      $$emit$$"add 0x40,rax\n\t"
      $$emit$$"# L_zero_64_bytes:\n\t"
      $$emit$$"sub 0x8,rcx\n\t"
      $$emit$$"jge L_loop\n\t"
      $$emit$$"add 0x4,rcx\n\t"
      $$emit$$"jl L_tail\n\t"
      $$emit$$"vmovdqu $tmp,(rax)\n\t"
      $$emit$$"add 0x20,rax\n\t"
      $$emit$$"sub 0x4,rcx\n\t"
      $$emit$$"# L_tail:\t# Clearing tail bytes\n\t"
      $$emit$$"add 0x4,rcx\n\t"
      $$emit$$"jle L_end\n\t"
      $$emit$$"dec rcx\n\t"
      $$emit$$"# L_sloop:\t# 8-byte short loop\n\t"
      $$emit$$"vmovq xmm0,(rax)\n\t"
      $$emit$$"add 0x8,rax\n\t"
      $$emit$$"dec rcx\n\t"
      $$emit$$"jge L_sloop\n\t"
      $$emit$$"# L_end:\n\t"
    } else {
      $$emit$$"rep stosq\t# Store rax to *rdi++ while rcx--\n\t"
    }
    $$emit$$"# DONE"
  %}
  ins_encode %{
    // Trailing booleans: is_large = false, word_copy_only = false
    // (mirrors the predicate above).
    __ clear_mem($base$$Register, $cnt$$Register, $val$$Register,
                 $tmp$$XMMRegister, false, false);
  %}
  ins_pipe(pipe_slow);
%}
11264
// Small ClearArray, word-copy-only variant, non-AVX512 (no rep stosb path).
instruct rep_stos_word_copy(rcx_RegL cnt, rdi_RegP base, regD tmp, rax_RegL val,
                            Universe dummy, rFlagsReg cr)
%{
  // Non-large clear restricted to word copies, pre-AVX512 hardware.
  predicate(!((ClearArrayNode*)n)->is_large() && ((ClearArrayNode*)n)->word_copy_only() && (UseAVX <= 2));
  match(Set dummy (ClearArray (Binary cnt base) val));
  effect(USE_KILL cnt, USE_KILL base, TEMP tmp, USE_KILL val, KILL cr);

  // Debug-listing template; the real code is emitted by clear_mem below.
  format %{ $$template
    $$emit$$"cmp InitArrayShortSize,rcx\n\t"
    $$emit$$"jg LARGE\n\t"
    $$emit$$"dec rcx\n\t"
    $$emit$$"js DONE\t# Zero length\n\t"
    $$emit$$"mov rax,(rdi,rcx,8)\t# LOOP\n\t"
    $$emit$$"dec rcx\n\t"
    $$emit$$"jge LOOP\n\t"
    $$emit$$"jmp DONE\n\t"
    $$emit$$"# LARGE:\n\t"
    if (UseXMMForObjInit) {
      $$emit$$"movdq $tmp, $val\n\t"
      $$emit$$"punpcklqdq $tmp, $tmp\n\t"
      $$emit$$"vinserti128_high $tmp, $tmp\n\t"
      $$emit$$"jmpq L_zero_64_bytes\n\t"
      $$emit$$"# L_loop:\t# 64-byte LOOP\n\t"
      $$emit$$"vmovdqu $tmp,(rax)\n\t"
      $$emit$$"vmovdqu $tmp,0x20(rax)\n\t"
      $$emit$$"add 0x40,rax\n\t"
      $$emit$$"# L_zero_64_bytes:\n\t"
      $$emit$$"sub 0x8,rcx\n\t"
      $$emit$$"jge L_loop\n\t"
      $$emit$$"add 0x4,rcx\n\t"
      $$emit$$"jl L_tail\n\t"
      $$emit$$"vmovdqu $tmp,(rax)\n\t"
      $$emit$$"add 0x20,rax\n\t"
      $$emit$$"sub 0x4,rcx\n\t"
      $$emit$$"# L_tail:\t# Clearing tail bytes\n\t"
      $$emit$$"add 0x4,rcx\n\t"
      $$emit$$"jle L_end\n\t"
      $$emit$$"dec rcx\n\t"
      $$emit$$"# L_sloop:\t# 8-byte short loop\n\t"
      $$emit$$"vmovq xmm0,(rax)\n\t"
      $$emit$$"add 0x8,rax\n\t"
      $$emit$$"dec rcx\n\t"
      $$emit$$"jge L_sloop\n\t"
      $$emit$$"# L_end:\n\t"
    } else {
      $$emit$$"rep stosq\t# Store rax to *rdi++ while rcx--\n\t"
    }
    $$emit$$"# DONE"
  %}
  ins_encode %{
    // Trailing booleans: is_large = false, word_copy_only = true
    // (mirrors the predicate above).
    __ clear_mem($base$$Register, $cnt$$Register, $val$$Register,
                 $tmp$$XMMRegister, false, true);
  %}
  ins_pipe(pipe_slow);
%}
11320
11321 // Small ClearArray AVX512 non-constant length.
11322 instruct rep_stos_evex(rcx_RegL cnt, rdi_RegP base, legRegD tmp, kReg ktmp, rax_RegL val,
11323 Universe dummy, rFlagsReg cr)
11324 %{
11325 predicate(!((ClearArrayNode*)n)->is_large() && !((ClearArrayNode*)n)->word_copy_only() && (UseAVX > 2));
11326 match(Set dummy (ClearArray (Binary cnt base) val));
11327 ins_cost(125);
11328 effect(USE_KILL cnt, USE_KILL base, TEMP tmp, TEMP ktmp, USE_KILL val, KILL cr);
11329
11330 format %{ $$template
11331 $$emit$$"xorq rax, rax\t# ClearArray:\n\t"
11332 $$emit$$"cmp InitArrayShortSize,rcx\n\t"
11333 $$emit$$"jg LARGE\n\t"
11334 $$emit$$"dec rcx\n\t"
11335 $$emit$$"js DONE\t# Zero length\n\t"
11336 $$emit$$"mov rax,(rdi,rcx,8)\t# LOOP\n\t"
11337 $$emit$$"dec rcx\n\t"
11338 $$emit$$"jge LOOP\n\t"
11339 $$emit$$"jmp DONE\n\t"
11340 $$emit$$"# LARGE:\n\t"
11341 if (UseFastStosb) {
11342 $$emit$$"shlq rcx,3\t# Convert doublewords to bytes\n\t"
11343 $$emit$$"rep stosb\t# Store rax to *rdi++ while rcx--\n\t"
11344 } else if (UseXMMForObjInit) {
11345 $$emit$$"mov rdi,rax\n\t"
11346 $$emit$$"vpxor ymm0,ymm0,ymm0\n\t"
11347 $$emit$$"jmpq L_zero_64_bytes\n\t"
11348 $$emit$$"# L_loop:\t# 64-byte LOOP\n\t"
11356 $$emit$$"jl L_tail\n\t"
11357 $$emit$$"vmovdqu ymm0,(rax)\n\t"
11358 $$emit$$"add 0x20,rax\n\t"
11359 $$emit$$"sub 0x4,rcx\n\t"
11360 $$emit$$"# L_tail:\t# Clearing tail bytes\n\t"
11361 $$emit$$"add 0x4,rcx\n\t"
11362 $$emit$$"jle L_end\n\t"
11363 $$emit$$"dec rcx\n\t"
11364 $$emit$$"# L_sloop:\t# 8-byte short loop\n\t"
11365 $$emit$$"vmovq xmm0,(rax)\n\t"
11366 $$emit$$"add 0x8,rax\n\t"
11367 $$emit$$"dec rcx\n\t"
11368 $$emit$$"jge L_sloop\n\t"
11369 $$emit$$"# L_end:\n\t"
11370 } else {
11371 $$emit$$"rep stosq\t# Store rax to *rdi++ while rcx--\n\t"
11372 }
11373 $$emit$$"# DONE"
11374 %}
11375 ins_encode %{
11376 __ clear_mem($base$$Register, $cnt$$Register, $val$$Register,
11377 $tmp$$XMMRegister, false, false, $ktmp$$KRegister);
11378 %}
11379 ins_pipe(pipe_slow);
11380 %}
11381
11382 instruct rep_stos_evex_word_copy(rcx_RegL cnt, rdi_RegP base, legRegD tmp, kReg ktmp, rax_RegL val,
11383 Universe dummy, rFlagsReg cr)
11384 %{
11385 predicate(!((ClearArrayNode*)n)->is_large() && ((ClearArrayNode*)n)->word_copy_only() && (UseAVX > 2));
11386 match(Set dummy (ClearArray (Binary cnt base) val));
11387 ins_cost(125);
11388 effect(USE_KILL cnt, USE_KILL base, TEMP tmp, TEMP ktmp, USE_KILL val, KILL cr);
11389
11390 format %{ $$template
11391 $$emit$$"xorq rax, rax\t# ClearArray:\n\t"
11392 $$emit$$"cmp InitArrayShortSize,rcx\n\t"
11393 $$emit$$"jg LARGE\n\t"
11394 $$emit$$"dec rcx\n\t"
11395 $$emit$$"js DONE\t# Zero length\n\t"
11396 $$emit$$"mov rax,(rdi,rcx,8)\t# LOOP\n\t"
11397 $$emit$$"dec rcx\n\t"
11398 $$emit$$"jge LOOP\n\t"
11399 $$emit$$"jmp DONE\n\t"
11400 $$emit$$"# LARGE:\n\t"
11401 if (UseFastStosb) {
11402 $$emit$$"shlq rcx,3\t# Convert doublewords to bytes\n\t"
11403 $$emit$$"rep stosb\t# Store rax to *rdi++ while rcx--\n\t"
11404 } else if (UseXMMForObjInit) {
11405 $$emit$$"mov rdi,rax\n\t"
11406 $$emit$$"vpxor ymm0,ymm0,ymm0\n\t"
11407 $$emit$$"jmpq L_zero_64_bytes\n\t"
11408 $$emit$$"# L_loop:\t# 64-byte LOOP\n\t"
11416 $$emit$$"jl L_tail\n\t"
11417 $$emit$$"vmovdqu ymm0,(rax)\n\t"
11418 $$emit$$"add 0x20,rax\n\t"
11419 $$emit$$"sub 0x4,rcx\n\t"
11420 $$emit$$"# L_tail:\t# Clearing tail bytes\n\t"
11421 $$emit$$"add 0x4,rcx\n\t"
11422 $$emit$$"jle L_end\n\t"
11423 $$emit$$"dec rcx\n\t"
11424 $$emit$$"# L_sloop:\t# 8-byte short loop\n\t"
11425 $$emit$$"vmovq xmm0,(rax)\n\t"
11426 $$emit$$"add 0x8,rax\n\t"
11427 $$emit$$"dec rcx\n\t"
11428 $$emit$$"jge L_sloop\n\t"
11429 $$emit$$"# L_end:\n\t"
11430 } else {
11431 $$emit$$"rep stosq\t# Store rax to *rdi++ while rcx--\n\t"
11432 }
11433 $$emit$$"# DONE"
11434 %}
11435 ins_encode %{
11436 __ clear_mem($base$$Register, $cnt$$Register, $val$$Register,
11437 $tmp$$XMMRegister, false, true, $ktmp$$KRegister);
11438 %}
11439 ins_pipe(pipe_slow);
11440 %}
11441
// Large ClearArray non-AVX512.
instruct rep_stos_large(rcx_RegL cnt, rdi_RegP base, regD tmp, rax_RegL val,
                        Universe dummy, rFlagsReg cr)
%{
  // Large clear, any fill pattern, pre-AVX512 hardware.
  predicate(((ClearArrayNode*)n)->is_large() && !((ClearArrayNode*)n)->word_copy_only() && (UseAVX <= 2));
  match(Set dummy (ClearArray (Binary cnt base) val));
  effect(USE_KILL cnt, USE_KILL base, TEMP tmp, USE_KILL val, KILL cr);

  // Debug-listing template; the real code is emitted by clear_mem below.
  format %{ $$template
    if (UseFastStosb) {
      $$emit$$"shlq rcx,3\t# Convert doublewords to bytes\n\t"
      $$emit$$"rep stosb\t# Store rax to *rdi++ while rcx--"
    } else if (UseXMMForObjInit) {
      $$emit$$"movdq $tmp, $val\n\t"
      $$emit$$"punpcklqdq $tmp, $tmp\n\t"
      $$emit$$"vinserti128_high $tmp, $tmp\n\t"
      $$emit$$"jmpq L_zero_64_bytes\n\t"
      $$emit$$"# L_loop:\t# 64-byte LOOP\n\t"
      $$emit$$"vmovdqu $tmp,(rax)\n\t"
      $$emit$$"vmovdqu $tmp,0x20(rax)\n\t"
      $$emit$$"add 0x40,rax\n\t"
      $$emit$$"# L_zero_64_bytes:\n\t"
      $$emit$$"sub 0x8,rcx\n\t"
      $$emit$$"jge L_loop\n\t"
      $$emit$$"add 0x4,rcx\n\t"
      $$emit$$"jl L_tail\n\t"
      $$emit$$"vmovdqu $tmp,(rax)\n\t"
      $$emit$$"add 0x20,rax\n\t"
      $$emit$$"sub 0x4,rcx\n\t"
      $$emit$$"# L_tail:\t# Clearing tail bytes\n\t"
      $$emit$$"add 0x4,rcx\n\t"
      $$emit$$"jle L_end\n\t"
      $$emit$$"dec rcx\n\t"
      $$emit$$"# L_sloop:\t# 8-byte short loop\n\t"
      $$emit$$"vmovq xmm0,(rax)\n\t"
      $$emit$$"add 0x8,rax\n\t"
      $$emit$$"dec rcx\n\t"
      $$emit$$"jge L_sloop\n\t"
      $$emit$$"# L_end:\n\t"
    } else {
      $$emit$$"rep stosq\t# Store rax to *rdi++ while rcx--"
    }
  %}
  ins_encode %{
    // Trailing booleans: is_large = true, word_copy_only = false
    // (mirrors the predicate above).
    __ clear_mem($base$$Register, $cnt$$Register, $val$$Register,
                 $tmp$$XMMRegister, true, false);
  %}
  ins_pipe(pipe_slow);
%}
11491
// Large ClearArray, word-copy-only variant, non-AVX512 (no rep stosb path).
instruct rep_stos_large_word_copy(rcx_RegL cnt, rdi_RegP base, regD tmp, rax_RegL val,
                                  Universe dummy, rFlagsReg cr)
%{
  // Large clear restricted to word copies, pre-AVX512 hardware.
  predicate(((ClearArrayNode*)n)->is_large() && ((ClearArrayNode*)n)->word_copy_only() && (UseAVX <= 2));
  match(Set dummy (ClearArray (Binary cnt base) val));
  effect(USE_KILL cnt, USE_KILL base, TEMP tmp, USE_KILL val, KILL cr);

  // Debug-listing template; the real code is emitted by clear_mem below.
  format %{ $$template
    if (UseXMMForObjInit) {
      $$emit$$"movdq $tmp, $val\n\t"
      $$emit$$"punpcklqdq $tmp, $tmp\n\t"
      $$emit$$"vinserti128_high $tmp, $tmp\n\t"
      $$emit$$"jmpq L_zero_64_bytes\n\t"
      $$emit$$"# L_loop:\t# 64-byte LOOP\n\t"
      $$emit$$"vmovdqu $tmp,(rax)\n\t"
      $$emit$$"vmovdqu $tmp,0x20(rax)\n\t"
      $$emit$$"add 0x40,rax\n\t"
      $$emit$$"# L_zero_64_bytes:\n\t"
      $$emit$$"sub 0x8,rcx\n\t"
      $$emit$$"jge L_loop\n\t"
      $$emit$$"add 0x4,rcx\n\t"
      $$emit$$"jl L_tail\n\t"
      $$emit$$"vmovdqu $tmp,(rax)\n\t"
      $$emit$$"add 0x20,rax\n\t"
      $$emit$$"sub 0x4,rcx\n\t"
      $$emit$$"# L_tail:\t# Clearing tail bytes\n\t"
      $$emit$$"add 0x4,rcx\n\t"
      $$emit$$"jle L_end\n\t"
      $$emit$$"dec rcx\n\t"
      $$emit$$"# L_sloop:\t# 8-byte short loop\n\t"
      $$emit$$"vmovq xmm0,(rax)\n\t"
      $$emit$$"add 0x8,rax\n\t"
      $$emit$$"dec rcx\n\t"
      $$emit$$"jge L_sloop\n\t"
      $$emit$$"# L_end:\n\t"
    } else {
      $$emit$$"rep stosq\t# Store rax to *rdi++ while rcx--"
    }
  %}
  ins_encode %{
    // Trailing booleans: is_large = true, word_copy_only = true
    // (mirrors the predicate above).
    __ clear_mem($base$$Register, $cnt$$Register, $val$$Register,
                 $tmp$$XMMRegister, true, true);
  %}
  ins_pipe(pipe_slow);
%}
11537
// Large ClearArray AVX512.
instruct rep_stos_large_evex(rcx_RegL cnt, rdi_RegP base, legRegD tmp, kReg ktmp, rax_RegL val,
                             Universe dummy, rFlagsReg cr)
%{
  // Large clear, any fill pattern, AVX512 hardware (extra opmask temp).
  predicate(((ClearArrayNode*)n)->is_large() && !((ClearArrayNode*)n)->word_copy_only() && (UseAVX > 2));
  match(Set dummy (ClearArray (Binary cnt base) val));
  effect(USE_KILL cnt, USE_KILL base, TEMP tmp, TEMP ktmp, USE_KILL val, KILL cr);

  // Debug-listing template; the real code is emitted by clear_mem below.
  format %{ $$template
    if (UseFastStosb) {
      $$emit$$"xorq rax, rax\t# ClearArray:\n\t"
      $$emit$$"shlq rcx,3\t# Convert doublewords to bytes\n\t"
      $$emit$$"rep stosb\t# Store rax to *rdi++ while rcx--"
    } else if (UseXMMForObjInit) {
      $$emit$$"mov rdi,rax\t# ClearArray:\n\t"
      $$emit$$"vpxor ymm0,ymm0,ymm0\n\t"
      $$emit$$"jmpq L_zero_64_bytes\n\t"
      $$emit$$"# L_loop:\t# 64-byte LOOP\n\t"
      $$emit$$"vmovdqu ymm0,(rax)\n\t"
      $$emit$$"vmovdqu ymm0,0x20(rax)\n\t"
      $$emit$$"add 0x40,rax\n\t"
      $$emit$$"# L_zero_64_bytes:\n\t"
      $$emit$$"sub 0x8,rcx\n\t"
      $$emit$$"jge L_loop\n\t"
      $$emit$$"add 0x4,rcx\n\t"
      $$emit$$"jl L_tail\n\t"
      $$emit$$"vmovdqu ymm0,(rax)\n\t"
      $$emit$$"add 0x20,rax\n\t"
      $$emit$$"sub 0x4,rcx\n\t"
      $$emit$$"# L_tail:\t# Clearing tail bytes\n\t"
      $$emit$$"add 0x4,rcx\n\t"
      $$emit$$"jle L_end\n\t"
      $$emit$$"dec rcx\n\t"
      $$emit$$"# L_sloop:\t# 8-byte short loop\n\t"
      $$emit$$"vmovq xmm0,(rax)\n\t"
      $$emit$$"add 0x8,rax\n\t"
      $$emit$$"dec rcx\n\t"
      $$emit$$"jge L_sloop\n\t"
      $$emit$$"# L_end:\n\t"
    } else {
      $$emit$$"xorq rax, rax\t# ClearArray:\n\t"
      $$emit$$"rep stosq\t# Store rax to *rdi++ while rcx--"
    }
  %}
  ins_encode %{
    // Trailing arguments: is_large = true, word_copy_only = false,
    // plus the AVX512 opmask temp (mirrors the predicate above).
    __ clear_mem($base$$Register, $cnt$$Register, $val$$Register,
                 $tmp$$XMMRegister, true, false, $ktmp$$KRegister);
  %}
  ins_pipe(pipe_slow);
%}
11588
// Large ClearArray, word-copy-only variant, AVX512.
instruct rep_stos_large_evex_word_copy(rcx_RegL cnt, rdi_RegP base, legRegD tmp, kReg ktmp, rax_RegL val,
                                       Universe dummy, rFlagsReg cr)
%{
  // Large clear restricted to word copies, AVX512 hardware.
  predicate(((ClearArrayNode*)n)->is_large() && ((ClearArrayNode*)n)->word_copy_only() && (UseAVX > 2));
  match(Set dummy (ClearArray (Binary cnt base) val));
  effect(USE_KILL cnt, USE_KILL base, TEMP tmp, TEMP ktmp, USE_KILL val, KILL cr);

  // Debug-listing template; the real code is emitted by clear_mem below.
  format %{ $$template
    if (UseFastStosb) {
      $$emit$$"xorq rax, rax\t# ClearArray:\n\t"
      $$emit$$"shlq rcx,3\t# Convert doublewords to bytes\n\t"
      $$emit$$"rep stosb\t# Store rax to *rdi++ while rcx--"
    } else if (UseXMMForObjInit) {
      $$emit$$"mov rdi,rax\t# ClearArray:\n\t"
      $$emit$$"vpxor ymm0,ymm0,ymm0\n\t"
      $$emit$$"jmpq L_zero_64_bytes\n\t"
      $$emit$$"# L_loop:\t# 64-byte LOOP\n\t"
      $$emit$$"vmovdqu ymm0,(rax)\n\t"
      $$emit$$"vmovdqu ymm0,0x20(rax)\n\t"
      $$emit$$"add 0x40,rax\n\t"
      $$emit$$"# L_zero_64_bytes:\n\t"
      $$emit$$"sub 0x8,rcx\n\t"
      $$emit$$"jge L_loop\n\t"
      $$emit$$"add 0x4,rcx\n\t"
      $$emit$$"jl L_tail\n\t"
      $$emit$$"vmovdqu ymm0,(rax)\n\t"
      $$emit$$"add 0x20,rax\n\t"
      $$emit$$"sub 0x4,rcx\n\t"
      $$emit$$"# L_tail:\t# Clearing tail bytes\n\t"
      $$emit$$"add 0x4,rcx\n\t"
      $$emit$$"jle L_end\n\t"
      $$emit$$"dec rcx\n\t"
      $$emit$$"# L_sloop:\t# 8-byte short loop\n\t"
      $$emit$$"vmovq xmm0,(rax)\n\t"
      $$emit$$"add 0x8,rax\n\t"
      $$emit$$"dec rcx\n\t"
      $$emit$$"jge L_sloop\n\t"
      $$emit$$"# L_end:\n\t"
    } else {
      $$emit$$"xorq rax, rax\t# ClearArray:\n\t"
      $$emit$$"rep stosq\t# Store rax to *rdi++ while rcx--"
    }
  %}
  ins_encode %{
    // Trailing arguments: is_large = true, word_copy_only = true,
    // plus the AVX512 opmask temp (mirrors the predicate above).
    __ clear_mem($base$$Register, $cnt$$Register, $val$$Register,
                 $tmp$$XMMRegister, true, true, $ktmp$$KRegister);
  %}
  ins_pipe(pipe_slow);
%}
11638
// Small ClearArray AVX512 constant length.
// Compile-time-constant count variant: passes the immediate count to
// clear_mem; requires AVX512VL+BW (opmask temp is used).
instruct rep_stos_im(immL cnt, rRegP base, regD tmp, rax_RegL val, kReg ktmp, Universe dummy, rFlagsReg cr)
%{
  predicate(!((ClearArrayNode*)n)->is_large() && !((ClearArrayNode*)n)->word_copy_only() &&
            ((UseAVX > 2) && VM_Version::supports_avx512vlbw()));
  match(Set dummy (ClearArray (Binary cnt base) val));
  ins_cost(100);
  effect(TEMP tmp, USE_KILL val, TEMP ktmp, KILL cr);
  format %{ "clear_mem_imm $base , $cnt \n\t" %}
  ins_encode %{
    __ clear_mem($base$$Register, $cnt$$constant, $val$$Register, $tmp$$XMMRegister, $ktmp$$KRegister);
  %}
  ins_pipe(pipe_slow);
%}
11653
// Compare two byte[] (LL-encoded) strings; result in rax.
// Non-AVX512VLBW path (the AVX512 variant would use an opmask register).
instruct string_compareL(rdi_RegP str1, rcx_RegI cnt1, rsi_RegP str2, rdx_RegI cnt2,
                         rax_RegI result, legRegD tmp1, rFlagsReg cr)
%{
  predicate(!VM_Version::supports_avx512vlbw() && ((StrCompNode*)n)->encoding() == StrIntrinsicNode::LL);
  match(Set result (StrComp (Binary str1 cnt1) (Binary str2 cnt2)));
  effect(TEMP tmp1, USE_KILL str1, USE_KILL str2, USE_KILL cnt1, USE_KILL cnt2, KILL cr);

  format %{ "String Compare byte[] $str1,$cnt1,$str2,$cnt2 -> $result   // KILL $tmp1" %}
  ins_encode %{
    // knoreg: no opmask temp on this non-AVX512VLBW path.
    __ string_compare($str1$$Register, $str2$$Register,
                      $cnt1$$Register, $cnt2$$Register, $result$$Register,
                      $tmp1$$XMMRegister, StrIntrinsicNode::LL, knoreg);
  %}
  ins_pipe( pipe_slow );
%}
11669
13457
13458 ins_cost(300);
13459 format %{ "call_leaf,vector " %}
13460 ins_encode(Java_To_Runtime(meth));
13461 ins_pipe(pipe_slow);
13462 %}
13463
13464 //
13465 instruct CallNativeDirect(method meth)
13466 %{
13467 match(CallNative);
13468 effect(USE meth);
13469
13470 ins_cost(300);
13471 format %{ "call_native " %}
13472 ins_encode(clear_avx, Java_To_Runtime(meth));
13473 ins_pipe(pipe_slow);
13474 %}
13475
// Call runtime without safepoint
// entry point is null, target holds the address to call
instruct CallLeafNoFPInDirect(rRegP target)
%{
  // Indirect form: used only when the call has no static entry point,
  // so the destination address is taken from a register.
  predicate(n->as_Call()->entry_point() == NULL);
  match(CallLeafNoFP target);

  ins_cost(300);
  format %{ "call_leaf_nofp,runtime indirect " %}
  ins_encode %{
     __ call($target$$Register);
  %}

  ins_pipe(pipe_slow);
%}
13491
// Call runtime without safepoint, direct form: the static entry point
// is known, so clear_avx + Java_To_Runtime emit the call to $meth.
instruct CallLeafNoFPDirect(method meth)
%{
  predicate(n->as_Call()->entry_point() != NULL);
  match(CallLeafNoFP);
  effect(USE meth);

  ins_cost(300);
  format %{ "call_leaf_nofp,runtime " %}
  ins_encode(clear_avx, Java_To_Runtime(meth));
  ins_pipe(pipe_slow);
%}
13503
13504 // Return Instruction
13505 // Remove the return address & jump to it.
13506 // Notice: We always emit a nop after a ret to make sure there is room
13507 // for safepoint patching
13508 instruct Ret()
13509 %{
13510 match(Return);
13511
13512 format %{ "ret" %}
13513 ins_encode %{
13514 __ ret(0);
|