473 }
474
475 // !!!!! Special hack to get all types of calls to specify the byte offset
476 // from the start of the call to the point where the return address
477 // will point.
478 int MachCallStaticJavaNode::ret_addr_offset()
479 {
480 int offset = 5; // 5 bytes from start of call to where return address points
481 offset += clear_avx_size();
482 return offset;
483 }
484
485 int MachCallDynamicJavaNode::ret_addr_offset()
486 {
487 int offset = 15; // 15 bytes from start of call to where return address points
488 offset += clear_avx_size();
489 return offset;
490 }
491
492 int MachCallRuntimeNode::ret_addr_offset() {
493 int offset = 13; // movq r10,#addr; callq (r10)
494 if (this->ideal_Opcode() != Op_CallLeafVector) {
495 offset += clear_avx_size();
496 }
497 return offset;
498 }
499 //
500 // Compute padding required for nodes which need alignment
501 //
502
503 // The address of the call instruction needs to be 4-byte aligned to
504 // ensure that it does not span a cache line so that it can be patched.
505 int CallStaticJavaDirectNode::compute_padding(int current_offset) const
506 {
507 current_offset += clear_avx_size(); // skip vzeroupper
508 current_offset += 1; // skip call opcode byte
509 return align_up(current_offset, alignment_required()) - current_offset;
510 }
511
512 // The address of the call instruction needs to be 4-byte aligned to
513 // ensure that it does not span a cache line so that it can be patched.
514 int CallDynamicJavaDirectNode::compute_padding(int current_offset) const
515 {
516 current_offset += clear_avx_size(); // skip vzeroupper
517 current_offset += 11; // skip movq instruction + call opcode byte
518 return align_up(current_offset, alignment_required()) - current_offset;
887 st->print("# stack alignment check");
888 #endif
889 }
890 if (C->stub_function() != NULL && BarrierSet::barrier_set()->barrier_set_nmethod() != NULL) {
891 st->print("\n\t");
892 st->print("cmpl [r15_thread + #disarmed_guard_value_offset], #disarmed_guard_value\t");
893 st->print("\n\t");
894 st->print("je fast_entry\t");
895 st->print("\n\t");
896 st->print("call #nmethod_entry_barrier_stub\t");
897 st->print("\n\tfast_entry:");
898 }
899 st->cr();
900 }
901 #endif
902
// Emit the method prolog: optional class-initialization barrier, the
// verified entry (stack bang + frame setup), frame-complete bookkeeping,
// and constant-table base fixup.
void MachPrologNode::emit(CodeBuffer &cbuf, PhaseRegAlloc *ra_) const {
  Compile* C = ra_->C;
  C2_MacroAssembler _masm(&cbuf);

  int framesize = C->output()->frame_size_in_bytes();
  int bangsize = C->output()->bang_size_in_bytes();

  if (C->clinit_barrier_on_entry()) {
    assert(VM_Version::supports_fast_class_init_checks(), "sanity");
    assert(!C->method()->holder()->is_not_initialized(), "initialization should have been started");

    Label L_skip_barrier;
    Register klass = rscratch1;

    // Until the holder class is initialized, re-dispatch through the
    // wrong-method stub; otherwise fall through to the fast path.
    __ mov_metadata(klass, C->method()->holder()->constant_encoding());
    __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);

    __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path

    __ bind(L_skip_barrier);
  }

  // Bang size is 0 when no stack bang is needed; last arg flags a stub
  // compilation (C->stub_function() != NULL).
  __ verified_entry(framesize, C->output()->need_stack_bang(bangsize)?bangsize:0, false, C->stub_function() != NULL);

  C->output()->set_frame_complete(cbuf.insts_size());

  if (C->has_mach_constant_base_node()) {
    // NOTE: We set the table base offset here because users might be
    // emitted before MachConstantBaseNode.
    ConstantTable& constant_table = C->output()->constant_table();
    constant_table.set_table_base_offset(constant_table.calculate_table_base_offset());
  }
}
936
// Size in bytes of the emitted prolog; too many factors influence it to
// compute directly, so delegate to the generic MachNode::size.
uint MachPrologNode::size(PhaseRegAlloc* ra_) const
{
  return MachNode::size(ra_); // too many variables; just compute it
                              // the hard way
}
942
// Upper bound on relocation entries the prolog may emit.
int MachPrologNode::reloc() const
{
  return 0; // a large enough number
}
947
948 //=============================================================================
949 #ifndef PRODUCT
950 void MachEpilogNode::format(PhaseRegAlloc* ra_, outputStream* st) const
951 {
952 Compile* C = ra_->C;
953 if (generate_vzeroupper(C)) {
954 st->print("vzeroupper");
955 st->cr(); st->print("\t");
956 }
957
958 int framesize = C->output()->frame_size_in_bytes();
959 assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
960 // Remove word for return adr already pushed
961 // and RBP
962 framesize -= 2*wordSize;
970 if (do_polling() && C->is_method_compilation()) {
971 st->print("\t");
972 st->print_cr("cmpq rsp, poll_offset[r15_thread] \n\t"
973 "ja #safepoint_stub\t"
974 "# Safepoint: poll for GC");
975 }
976 }
977 #endif
978
// Emit the method epilog: optional vzeroupper, frame teardown
// (addq rsp, #framesize; popq rbp), reserved-stack check, and the
// return-site safepoint poll.
void MachEpilogNode::emit(CodeBuffer& cbuf, PhaseRegAlloc* ra_) const
{
  Compile* C = ra_->C;
  MacroAssembler _masm(&cbuf);

  if (generate_vzeroupper(C)) {
    // Clear upper bits of YMM registers when current compiled code uses
    // wide vectors to avoid AVX <-> SSE transition penalty during call.
    __ vzeroupper();
  }

  int framesize = C->output()->frame_size_in_bytes();
  assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  // Remove word for return adr already pushed
  // and RBP
  framesize -= 2*wordSize;

  // Note that VerifyStackAtCalls' Majik cookie does not change the frame size popped here

  if (framesize) {
    emit_opcode(cbuf, Assembler::REX_W);
    // Use the sign-extended 8-bit immediate form when the frame size fits.
    if (framesize < 0x80) {
      emit_opcode(cbuf, 0x83); // addq rsp, #framesize
      emit_rm(cbuf, 0x3, 0x00, RSP_enc);
      emit_d8(cbuf, framesize);
    } else {
      emit_opcode(cbuf, 0x81); // addq rsp, #framesize
      emit_rm(cbuf, 0x3, 0x00, RSP_enc);
      emit_d32(cbuf, framesize);
    }
  }

  // popq rbp
  emit_opcode(cbuf, 0x58 | RBP_enc);

  if (StackReservedPages > 0 && C->has_reserved_stack_access()) {
    __ reserved_stack_check();
  }

  if (do_polling() && C->is_method_compilation()) {
    MacroAssembler _masm(&cbuf);
    Label dummy_label;
    Label* code_stub = &dummy_label;
    // During scratch-buffer sizing no real stub is registered; poll
    // against a dummy label instead.
    if (!C->output()->in_scratch_emit_size()) {
      C2SafepointPollStub* stub = new (C->comp_arena()) C2SafepointPollStub(__ offset());
      C->output()->add_stub(stub);
      code_stub = &stub->entry();
    }
    __ relocate(relocInfo::poll_return_type);
    __ safepoint_poll(*code_stub, r15_thread, true /* at_return */, true /* in_nmethod */);
  }
}
1031
// Size in bytes of the emitted epilog; too many factors influence it, so
// delegate to the generic MachNode::size.
uint MachEpilogNode::size(PhaseRegAlloc* ra_) const
{
  return MachNode::size(ra_); // too many variables; just compute it
                              // the hard way
}
1037
// Upper bound on relocation entries the epilog may emit (the safepoint
// poll requires relocation).
int MachEpilogNode::reloc() const
{
  return 2; // a large enough number
}
1042
// The epilog uses the default pipeline class.
const Pipeline* MachEpilogNode::pipeline() const
{
  return MachNode::pipeline_class();
}
1047
1048 //=============================================================================
1049
// Register class of a spill/copy location, used when choosing how to move
// a value between registers and stack slots.
enum RC {
  rc_bad,    // not a valid register/slot
  rc_int,    // general-purpose register
  rc_kreg,   // mask (k) register
  rc_float,  // XMM register
  rc_stack   // stack slot
};
1057
1650 emit_opcode(cbuf, reg < 8 ? Assembler::REX_W : Assembler::REX_WR);
1651 emit_opcode(cbuf, 0x8D); // LEA reg,[SP+offset]
1652 emit_rm(cbuf, 0x2, reg & 7, 0x04);
1653 emit_rm(cbuf, 0x0, 0x04, RSP_enc);
1654 emit_d32(cbuf, offset);
1655 } else {
1656 emit_opcode(cbuf, reg < 8 ? Assembler::REX_W : Assembler::REX_WR);
1657 emit_opcode(cbuf, 0x8D); // LEA reg,[SP+offset]
1658 emit_rm(cbuf, 0x1, reg & 7, 0x04);
1659 emit_rm(cbuf, 0x0, 0x04, RSP_enc);
1660 emit_d8(cbuf, offset);
1661 }
1662 }
1663
// Size of the emitted lea: REX + opcode + two ModRM/SIB bytes + disp8
// (5 bytes) for small offsets, or a disp32 (8 bytes) otherwise — matches
// the emit code above.
uint BoxLockNode::size(PhaseRegAlloc *ra_) const
{
  int offset = ra_->reg2offset(in_RegMask(0).find_first_elem());
  return (offset < 0x80) ? 5 : 8; // REX
}
1669
1670 //=============================================================================
1671 #ifndef PRODUCT
// Pretty-print the unverified entry point (the inline cache check).
void MachUEPNode::format(PhaseRegAlloc* ra_, outputStream* st) const
{
  if (UseCompressedClassPointers) {
    st->print_cr("movl rscratch1, [j_rarg0 + oopDesc::klass_offset_in_bytes()]\t# compressed klass");
    st->print_cr("\tdecode_klass_not_null rscratch1, rscratch1");
    st->print_cr("\tcmpq rax, rscratch1\t # Inline cache check");
  } else {
    st->print_cr("\tcmpq rax, [j_rarg0 + oopDesc::klass_offset_in_bytes()]\t"
                 "# Inline cache check");
  }
  st->print_cr("\tjne SharedRuntime::_ic_miss_stub");
  st->print_cr("\tnop\t# nops to align entry point");
}
1685 #endif
1686
1687 void MachUEPNode::emit(CodeBuffer& cbuf, PhaseRegAlloc* ra_) const
1688 {
1689 MacroAssembler masm(&cbuf);
1692 masm.load_klass(rscratch1, j_rarg0, rscratch2);
1693 masm.cmpptr(rax, rscratch1);
1694 } else {
1695 masm.cmpptr(rax, Address(j_rarg0, oopDesc::klass_offset_in_bytes()));
1696 }
1697
1698 masm.jump_cc(Assembler::notEqual, RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
1699
1700 /* WARNING these NOPs are critical so that verified entry point is properly
1701 4 bytes aligned for patching by NativeJump::patch_verified_entry() */
1702 int nops_cnt = 4 - ((cbuf.insts_size() - insts_size) & 0x3);
1703 if (OptoBreakpoint) {
1704 // Leave space for int3
1705 nops_cnt -= 1;
1706 }
1707 nops_cnt &= 0x3; // Do not add nops if code is aligned.
1708 if (nops_cnt > 0)
1709 masm.nop(nops_cnt);
1710 }
1711
// Size in bytes of the unverified entry point; computed generically since
// it depends on several flags (compressed pointers, OptoBreakpoint, ...).
uint MachUEPNode::size(PhaseRegAlloc* ra_) const
{
  return MachNode::size(ra_); // too many variables; just compute it
                              // the hard way
}
1717
1718
1719 //=============================================================================
1720
1721 const bool Matcher::supports_vector_calling_convention(void) {
1722 if (EnableVectorSupport && UseVectorStubs) {
1723 return true;
1724 }
1725 return false;
1726 }
1727
1728 OptoRegPair Matcher::vector_return_value(uint ideal_reg) {
1729 assert(EnableVectorSupport && UseVectorStubs, "sanity");
1730 int lo = XMM0_num;
1731 int hi = XMM0b_num;
1732 if (ideal_reg == Op_VecX) hi = XMM0d_num;
1733 else if (ideal_reg == Op_VecY) hi = XMM0h_num;
1734 else if (ideal_reg == Op_VecZ) hi = XMM0p_num;
1735 return OptoRegPair(hi, lo);
1736 }
1737
1738 // Is this branch offset short enough that a short branch can be used?
3977 %}
3978 %}
3979
// Indirect Memory Times Scale Plus Positive Index Register Plus Offset Operand
// Only matches when the index's type has a provably non-negative lower
// bound (see predicate), so treating the int index as unsigned is safe.
operand indPosIndexScaleOffset(any_RegP reg, immL32 off, rRegI idx, immI2 scale)
%{
  constraint(ALLOC_IN_RC(ptr_reg));
  predicate(n->in(2)->in(3)->in(1)->as_Type()->type()->is_long()->_lo >= 0);
  match(AddP (AddP reg (LShiftL (ConvI2L idx) scale)) off);

  op_cost(10);
  format %{"[$reg + $off + $idx << $scale]" %}
  interface(MEMORY_INTER) %{
    base($reg);
    index($idx);
    scale($scale);
    disp($off);
  %}
%}
3996
// Indirect Narrow Oop Plus Offset Operand
// Note: x86 architecture doesn't support "scale * index + offset" without a base
// we can't free r12 even with CompressedOops::base() == NULL.
// Only usable when the compressed-oop shift is 3 (times_8), matching the
// hard-coded scale below.
operand indCompressedOopOffset(rRegN reg, immL32 off) %{
  predicate(UseCompressedOops && (CompressedOops::shift() == Address::times_8));
  constraint(ALLOC_IN_RC(ptr_reg));
  match(AddP (DecodeN reg) off);

  op_cost(10);
  format %{"[R12 + $reg << 3 + $off] (compressed oop addressing)" %}
  interface(MEMORY_INTER) %{
    base(0xc); // R12 holds the heap base
    index($reg);
    scale(0x3);
    disp($off);
  %}
%}
4014
4015 // Indirect Memory Operand
4016 operand indirectNarrow(rRegN reg)
4323 equal(0x4, "e");
4324 not_equal(0x5, "ne");
4325 less(0x2, "b");
4326 greater_equal(0x3, "ae");
4327 less_equal(0x6, "be");
4328 greater(0x7, "a");
4329 overflow(0x0, "o");
4330 no_overflow(0x1, "no");
4331 %}
4332 %}
4333
4334 //----------OPERAND CLASSES----------------------------------------------------
4335 // Operand Classes are groups of operands that are used as to simplify
4336 // instruction definitions by not requiring the AD writer to specify separate
4337 // instructions for every form of operand when the instruction accepts
4338 // multiple operand types with the same basic encoding and format. The classic
4339 // case of this is memory operands.
4340
// All memory-operand flavors accepted by generic memory-using
// instructions, including the narrow (compressed-oop) forms.
opclass memory(indirect, indOffset8, indOffset32, indIndexOffset, indIndex,
               indIndexScale, indPosIndexScale, indIndexScaleOffset, indPosIndexOffset, indPosIndexScaleOffset,
               indCompressedOopOffset,
               indirectNarrow, indOffset8Narrow, indOffset32Narrow,
               indIndexOffsetNarrow, indIndexNarrow, indIndexScaleNarrow,
               indIndexScaleOffsetNarrow, indPosIndexOffsetNarrow, indPosIndexScaleOffsetNarrow);
4347
4348 //----------PIPELINE-----------------------------------------------------------
4349 // Rules which define the behavior of the target architectures pipeline.
4350 pipeline %{
4351
4352 //----------ATTRIBUTES---------------------------------------------------------
attributes %{
  variable_size_instructions;        // Instructions are of variable size
  max_instructions_per_bundle = 3;   // Up to 3 instructions per bundle
  instruction_unit_size = 1;         // An instruction is 1 bytes long
  instruction_fetch_unit_size = 16;  // The processor fetches one line
  instruction_fetch_units = 1;       // of 16 bytes

  // List of nop instructions
  nops( MachNop );
%}
4363
6911 format %{ "MEMBAR-storestore (empty encoding)" %}
6912 ins_encode( );
6913 ins_pipe(empty);
6914 %}
6915
6916 //----------Move Instructions--------------------------------------------------
6917
// Reinterpret a long as a pointer.  The move is elided entirely when
// source and destination were allocated the same register.
instruct castX2P(rRegP dst, rRegL src)
%{
  match(Set dst (CastX2P src));

  format %{ "movq $dst, $src\t# long->ptr" %}
  ins_encode %{
    if ($dst$$reg != $src$$reg) {
      __ movptr($dst$$Register, $src$$Register);
    }
  %}
  ins_pipe(ialu_reg_reg); // XXX
%}
6930
// Reinterpret a pointer as a long.  The move is elided entirely when
// source and destination were allocated the same register.
instruct castP2X(rRegL dst, rRegP src)
%{
  match(Set dst (CastP2X src));

  format %{ "movq $dst, $src\t# ptr -> long" %}
  ins_encode %{
    if ($dst$$reg != $src$$reg) {
      __ movptr($dst$$Register, $src$$Register);
    }
  %}
  ins_pipe(ialu_reg_reg); // XXX
%}
6943
6944 // Convert oop into int for vectors alignment masking
6945 instruct convP2I(rRegI dst, rRegP src)
6946 %{
6947 match(Set dst (ConvL2I (CastP2X src)));
6948
6949 format %{ "movl $dst, $src\t# ptr -> int" %}
6950 ins_encode %{
11426 effect(DEF dst, USE src);
11427 ins_cost(100);
11428 format %{ "movd $dst,$src\t# MoveI2F" %}
11429 ins_encode %{
11430 __ movdl($dst$$XMMRegister, $src$$Register);
11431 %}
11432 ins_pipe( pipe_slow );
11433 %}
11434
// Move a long's raw bits into a double register (GPR -> XMM), without
// numeric conversion.
instruct MoveL2D_reg_reg(regD dst, rRegL src) %{
  match(Set dst (MoveL2D src));
  effect(DEF dst, USE src);
  ins_cost(100);
  format %{ "movd $dst,$src\t# MoveL2D" %}
  ins_encode %{
    __ movdq($dst$$XMMRegister, $src$$Register);
  %}
  ins_pipe( pipe_slow );
%}
11445
11446 // Fast clearing of an array
11447 // Small ClearArray non-AVX512.
11448 instruct rep_stos(rcx_RegL cnt, rdi_RegP base, regD tmp, rax_RegI zero,
11449 Universe dummy, rFlagsReg cr)
11450 %{
11451 predicate(!((ClearArrayNode*)n)->is_large() && (UseAVX <= 2));
11452 match(Set dummy (ClearArray cnt base));
11453 effect(USE_KILL cnt, USE_KILL base, TEMP tmp, KILL zero, KILL cr);
11454
11455 format %{ $$template
11456 $$emit$$"xorq rax, rax\t# ClearArray:\n\t"
11457 $$emit$$"cmp InitArrayShortSize,rcx\n\t"
11458 $$emit$$"jg LARGE\n\t"
11459 $$emit$$"dec rcx\n\t"
11460 $$emit$$"js DONE\t# Zero length\n\t"
11461 $$emit$$"mov rax,(rdi,rcx,8)\t# LOOP\n\t"
11462 $$emit$$"dec rcx\n\t"
11463 $$emit$$"jge LOOP\n\t"
11464 $$emit$$"jmp DONE\n\t"
11465 $$emit$$"# LARGE:\n\t"
11466 if (UseFastStosb) {
11467 $$emit$$"shlq rcx,3\t# Convert doublewords to bytes\n\t"
11468 $$emit$$"rep stosb\t# Store rax to *rdi++ while rcx--\n\t"
11469 } else if (UseXMMForObjInit) {
11470 $$emit$$"mov rdi,rax\n\t"
11471 $$emit$$"vpxor ymm0,ymm0,ymm0\n\t"
11472 $$emit$$"jmpq L_zero_64_bytes\n\t"
11473 $$emit$$"# L_loop:\t# 64-byte LOOP\n\t"
11481 $$emit$$"jl L_tail\n\t"
11482 $$emit$$"vmovdqu ymm0,(rax)\n\t"
11483 $$emit$$"add 0x20,rax\n\t"
11484 $$emit$$"sub 0x4,rcx\n\t"
11485 $$emit$$"# L_tail:\t# Clearing tail bytes\n\t"
11486 $$emit$$"add 0x4,rcx\n\t"
11487 $$emit$$"jle L_end\n\t"
11488 $$emit$$"dec rcx\n\t"
11489 $$emit$$"# L_sloop:\t# 8-byte short loop\n\t"
11490 $$emit$$"vmovq xmm0,(rax)\n\t"
11491 $$emit$$"add 0x8,rax\n\t"
11492 $$emit$$"dec rcx\n\t"
11493 $$emit$$"jge L_sloop\n\t"
11494 $$emit$$"# L_end:\n\t"
11495 } else {
11496 $$emit$$"rep stosq\t# Store rax to *rdi++ while rcx--\n\t"
11497 }
11498 $$emit$$"# DONE"
11499 %}
11500 ins_encode %{
11501 __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register,
11502 $tmp$$XMMRegister, false, knoreg);
11503 %}
11504 ins_pipe(pipe_slow);
11505 %}
11506
11507 // Small ClearArray AVX512 non-constant length.
11508 instruct rep_stos_evex(rcx_RegL cnt, rdi_RegP base, legRegD tmp, kReg ktmp, rax_RegI zero,
11509 Universe dummy, rFlagsReg cr)
11510 %{
11511 predicate(!((ClearArrayNode*)n)->is_large() && (UseAVX > 2));
11512 match(Set dummy (ClearArray cnt base));
11513 ins_cost(125);
11514 effect(USE_KILL cnt, USE_KILL base, TEMP tmp, TEMP ktmp, KILL zero, KILL cr);
11515
11516 format %{ $$template
11517 $$emit$$"xorq rax, rax\t# ClearArray:\n\t"
11518 $$emit$$"cmp InitArrayShortSize,rcx\n\t"
11519 $$emit$$"jg LARGE\n\t"
11520 $$emit$$"dec rcx\n\t"
11521 $$emit$$"js DONE\t# Zero length\n\t"
11522 $$emit$$"mov rax,(rdi,rcx,8)\t# LOOP\n\t"
11523 $$emit$$"dec rcx\n\t"
11524 $$emit$$"jge LOOP\n\t"
11525 $$emit$$"jmp DONE\n\t"
11526 $$emit$$"# LARGE:\n\t"
11527 if (UseFastStosb) {
11528 $$emit$$"shlq rcx,3\t# Convert doublewords to bytes\n\t"
11529 $$emit$$"rep stosb\t# Store rax to *rdi++ while rcx--\n\t"
11530 } else if (UseXMMForObjInit) {
11531 $$emit$$"mov rdi,rax\n\t"
11532 $$emit$$"vpxor ymm0,ymm0,ymm0\n\t"
11533 $$emit$$"jmpq L_zero_64_bytes\n\t"
11534 $$emit$$"# L_loop:\t# 64-byte LOOP\n\t"
11542 $$emit$$"jl L_tail\n\t"
11543 $$emit$$"vmovdqu ymm0,(rax)\n\t"
11544 $$emit$$"add 0x20,rax\n\t"
11545 $$emit$$"sub 0x4,rcx\n\t"
11546 $$emit$$"# L_tail:\t# Clearing tail bytes\n\t"
11547 $$emit$$"add 0x4,rcx\n\t"
11548 $$emit$$"jle L_end\n\t"
11549 $$emit$$"dec rcx\n\t"
11550 $$emit$$"# L_sloop:\t# 8-byte short loop\n\t"
11551 $$emit$$"vmovq xmm0,(rax)\n\t"
11552 $$emit$$"add 0x8,rax\n\t"
11553 $$emit$$"dec rcx\n\t"
11554 $$emit$$"jge L_sloop\n\t"
11555 $$emit$$"# L_end:\n\t"
11556 } else {
11557 $$emit$$"rep stosq\t# Store rax to *rdi++ while rcx--\n\t"
11558 }
11559 $$emit$$"# DONE"
11560 %}
11561 ins_encode %{
11562 __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register,
11563 $tmp$$XMMRegister, false, $ktmp$$KRegister);
11564 %}
11565 ins_pipe(pipe_slow);
11566 %}
11567
// Large ClearArray non-AVX512.
// Fixed registers: rcx = doubleword count, rdi = base, rax = zero source;
// all are killed.
instruct rep_stos_large(rcx_RegL cnt, rdi_RegP base, regD tmp, rax_RegI zero,
                        Universe dummy, rFlagsReg cr)
%{
  predicate((UseAVX <=2) && ((ClearArrayNode*)n)->is_large());
  match(Set dummy (ClearArray cnt base));
  effect(USE_KILL cnt, USE_KILL base, TEMP tmp, KILL zero, KILL cr);

  format %{ $$template
    if (UseFastStosb) {
      $$emit$$"xorq rax, rax\t# ClearArray:\n\t"
      $$emit$$"shlq rcx,3\t# Convert doublewords to bytes\n\t"
      $$emit$$"rep stosb\t# Store rax to *rdi++ while rcx--"
    } else if (UseXMMForObjInit) {
      $$emit$$"mov rdi,rax\t# ClearArray:\n\t"
      $$emit$$"vpxor ymm0,ymm0,ymm0\n\t"
      $$emit$$"jmpq L_zero_64_bytes\n\t"
      $$emit$$"# L_loop:\t# 64-byte LOOP\n\t"
      $$emit$$"vmovdqu ymm0,(rax)\n\t"
      $$emit$$"vmovdqu ymm0,0x20(rax)\n\t"
      $$emit$$"add 0x40,rax\n\t"
      $$emit$$"# L_zero_64_bytes:\n\t"
      $$emit$$"sub 0x8,rcx\n\t"
      $$emit$$"jge L_loop\n\t"
      $$emit$$"add 0x4,rcx\n\t"
      $$emit$$"jl L_tail\n\t"
      $$emit$$"vmovdqu ymm0,(rax)\n\t"
      $$emit$$"add 0x20,rax\n\t"
      $$emit$$"sub 0x4,rcx\n\t"
      $$emit$$"# L_tail:\t# Clearing tail bytes\n\t"
      $$emit$$"add 0x4,rcx\n\t"
      $$emit$$"jle L_end\n\t"
      $$emit$$"dec rcx\n\t"
      $$emit$$"# L_sloop:\t# 8-byte short loop\n\t"
      $$emit$$"vmovq xmm0,(rax)\n\t"
      $$emit$$"add 0x8,rax\n\t"
      $$emit$$"dec rcx\n\t"
      $$emit$$"jge L_sloop\n\t"
      $$emit$$"# L_end:\n\t"
    } else {
      $$emit$$"xorq rax, rax\t# ClearArray:\n\t"
      $$emit$$"rep stosq\t# Store rax to *rdi++ while rcx--"
    }
  %}
  // 'true' = large-array variant of clear_mem; knoreg = no mask register
  // (non-AVX512 path).
  ins_encode %{
    __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register,
                 $tmp$$XMMRegister, true, knoreg);
  %}
  ins_pipe(pipe_slow);
%}
11618
// Large ClearArray AVX512.
// Same as rep_stos_large but with an AVX-512 mask register temp; selected
// when UseAVX > 2.
instruct rep_stos_large_evex(rcx_RegL cnt, rdi_RegP base, legRegD tmp, kReg ktmp, rax_RegI zero,
                             Universe dummy, rFlagsReg cr)
%{
  predicate((UseAVX > 2) && ((ClearArrayNode*)n)->is_large());
  match(Set dummy (ClearArray cnt base));
  effect(USE_KILL cnt, USE_KILL base, TEMP tmp, TEMP ktmp, KILL zero, KILL cr);

  format %{ $$template
    if (UseFastStosb) {
      $$emit$$"xorq rax, rax\t# ClearArray:\n\t"
      $$emit$$"shlq rcx,3\t# Convert doublewords to bytes\n\t"
      $$emit$$"rep stosb\t# Store rax to *rdi++ while rcx--"
    } else if (UseXMMForObjInit) {
      $$emit$$"mov rdi,rax\t# ClearArray:\n\t"
      $$emit$$"vpxor ymm0,ymm0,ymm0\n\t"
      $$emit$$"jmpq L_zero_64_bytes\n\t"
      $$emit$$"# L_loop:\t# 64-byte LOOP\n\t"
      $$emit$$"vmovdqu ymm0,(rax)\n\t"
      $$emit$$"vmovdqu ymm0,0x20(rax)\n\t"
      $$emit$$"add 0x40,rax\n\t"
      $$emit$$"# L_zero_64_bytes:\n\t"
      $$emit$$"sub 0x8,rcx\n\t"
      $$emit$$"jge L_loop\n\t"
      $$emit$$"add 0x4,rcx\n\t"
      $$emit$$"jl L_tail\n\t"
      $$emit$$"vmovdqu ymm0,(rax)\n\t"
      $$emit$$"add 0x20,rax\n\t"
      $$emit$$"sub 0x4,rcx\n\t"
      $$emit$$"# L_tail:\t# Clearing tail bytes\n\t"
      $$emit$$"add 0x4,rcx\n\t"
      $$emit$$"jle L_end\n\t"
      $$emit$$"dec rcx\n\t"
      $$emit$$"# L_sloop:\t# 8-byte short loop\n\t"
      $$emit$$"vmovq xmm0,(rax)\n\t"
      $$emit$$"add 0x8,rax\n\t"
      $$emit$$"dec rcx\n\t"
      $$emit$$"jge L_sloop\n\t"
      $$emit$$"# L_end:\n\t"
    } else {
      $$emit$$"xorq rax, rax\t# ClearArray:\n\t"
      $$emit$$"rep stosq\t# Store rax to *rdi++ while rcx--"
    }
  %}
  // 'true' = large-array variant of clear_mem; ktmp supplies the AVX-512
  // mask register.
  ins_encode %{
    __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register,
                 $tmp$$XMMRegister, true, $ktmp$$KRegister);
  %}
  ins_pipe(pipe_slow);
%}
11669
// Small ClearArray AVX512 constant length.
// The element count is a compile-time immediate; requires AVX512VL+BW
// (see predicate).
instruct rep_stos_im(immL cnt, rRegP base, regD tmp, rRegI zero, kReg ktmp, Universe dummy, rFlagsReg cr)
%{
  predicate(!((ClearArrayNode*)n)->is_large() &&
            ((UseAVX > 2) && VM_Version::supports_avx512vlbw()));
  match(Set dummy (ClearArray cnt base));
  ins_cost(100);
  effect(TEMP tmp, TEMP zero, TEMP ktmp, KILL cr);
  format %{ "clear_mem_imm $base , $cnt \n\t" %}
  ins_encode %{
    __ clear_mem($base$$Register, $cnt$$constant, $zero$$Register, $tmp$$XMMRegister, $ktmp$$KRegister);
  %}
  ins_pipe(pipe_slow);
%}
11684
// Compare two byte[] strings (LL encoding); result delivered in rax.
// Non-AVX512VLBW variant, so no mask register is used (knoreg).
instruct string_compareL(rdi_RegP str1, rcx_RegI cnt1, rsi_RegP str2, rdx_RegI cnt2,
                         rax_RegI result, legRegD tmp1, rFlagsReg cr)
%{
  predicate(!VM_Version::supports_avx512vlbw() && ((StrCompNode*)n)->encoding() == StrIntrinsicNode::LL);
  match(Set result (StrComp (Binary str1 cnt1) (Binary str2 cnt2)));
  effect(TEMP tmp1, USE_KILL str1, USE_KILL str2, USE_KILL cnt1, USE_KILL cnt2, KILL cr);

  format %{ "String Compare byte[] $str1,$cnt1,$str2,$cnt2 -> $result   // KILL $tmp1" %}
  ins_encode %{
    __ string_compare($str1$$Register, $str2$$Register,
                      $cnt1$$Register, $cnt2$$Register, $result$$Register,
                      $tmp1$$XMMRegister, StrIntrinsicNode::LL, knoreg);
  %}
  ins_pipe( pipe_slow );
%}
11700
13464
13465 ins_cost(300);
13466 format %{ "call_leaf,runtime " %}
13467 ins_encode(clear_avx, Java_To_Runtime(meth));
13468 ins_pipe(pipe_slow);
13469 %}
13470
// Call runtime without safepoint and with vector arguments
// Note: unlike the other leaf-call forms, no clear_avx is emitted here —
// presumably to keep the vector argument registers intact (TODO confirm).
instruct CallLeafDirectVector(method meth)
%{
  match(CallLeafVector);
  effect(USE meth);

  ins_cost(300);
  format %{ "call_leaf,vector " %}
  ins_encode(Java_To_Runtime(meth));
  ins_pipe(pipe_slow);
%}
13482
// Call runtime without safepoint
// Leaf call with no FP state to preserve; clears AVX state before the call.
instruct CallLeafNoFPDirect(method meth)
%{
  match(CallLeafNoFP);
  effect(USE meth);

  ins_cost(300);
  format %{ "call_leaf_nofp,runtime " %}
  ins_encode(clear_avx, Java_To_Runtime(meth));
  ins_pipe(pipe_slow);
%}
13494
13495 // Return Instruction
13496 // Remove the return address & jump to it.
13497 // Notice: We always emit a nop after a ret to make sure there is room
13498 // for safepoint patching
13499 instruct Ret()
13500 %{
13501 match(Return);
13502
13503 format %{ "ret" %}
13504 ins_encode %{
13505 __ ret(0);
|
473 }
474
475 // !!!!! Special hack to get all types of calls to specify the byte offset
476 // from the start of the call to the point where the return address
477 // will point.
// Byte offset from the start of the static-call sequence to the point the
// return address refers to.
int MachCallStaticJavaNode::ret_addr_offset()
{
  int offset = 5; // 5 bytes from start of call to where return address points
  offset += clear_avx_size(); // optional vzeroupper precedes the call
  return offset;
}
484
// Byte offset from the start of the dynamic-call sequence to the point
// the return address refers to.
int MachCallDynamicJavaNode::ret_addr_offset()
{
  int offset = 15; // 15 bytes from start of call to where return address points
  offset += clear_avx_size(); // optional vzeroupper precedes the call
  return offset;
}
491
// Byte offset from the start of the runtime-call sequence to the point
// the return address refers to.
int MachCallRuntimeNode::ret_addr_offset() {
  if (_entry_point == NULL) {
    // CallLeafNoFPInDirect
    return 3; // callq (register)
  }
  int offset = 13; // movq r10,#addr; callq (r10)
  // CallLeafVector does not emit the vzeroupper.
  if (this->ideal_Opcode() != Op_CallLeafVector) {
    offset += clear_avx_size();
  }
  return offset;
}
503
504 //
505 // Compute padding required for nodes which need alignment
506 //
507
508 // The address of the call instruction needs to be 4-byte aligned to
509 // ensure that it does not span a cache line so that it can be patched.
// Padding needed so the call can be safely patched (must not span a cache
// line); measured past the vzeroupper and the 1-byte call opcode.
int CallStaticJavaDirectNode::compute_padding(int current_offset) const
{
  current_offset += clear_avx_size(); // skip vzeroupper
  current_offset += 1; // skip call opcode byte
  return align_up(current_offset, alignment_required()) - current_offset;
}
516
517 // The address of the call instruction needs to be 4-byte aligned to
518 // ensure that it does not span a cache line so that it can be patched.
519 int CallDynamicJavaDirectNode::compute_padding(int current_offset) const
520 {
521 current_offset += clear_avx_size(); // skip vzeroupper
522 current_offset += 11; // skip movq instruction + call opcode byte
523 return align_up(current_offset, alignment_required()) - current_offset;
892 st->print("# stack alignment check");
893 #endif
894 }
895 if (C->stub_function() != NULL && BarrierSet::barrier_set()->barrier_set_nmethod() != NULL) {
896 st->print("\n\t");
897 st->print("cmpl [r15_thread + #disarmed_guard_value_offset], #disarmed_guard_value\t");
898 st->print("\n\t");
899 st->print("je fast_entry\t");
900 st->print("\n\t");
901 st->print("call #nmethod_entry_barrier_stub\t");
902 st->print("\n\tfast_entry:");
903 }
904 st->cr();
905 }
906 #endif
907
// Emit the method prolog: verified entry (frame setup), nmethod entry
// barrier for non-stub compilations, binding of the verified-entry label,
// frame-complete bookkeeping, and constant-table base fixup.
void MachPrologNode::emit(CodeBuffer &cbuf, PhaseRegAlloc *ra_) const {
  Compile* C = ra_->C;
  C2_MacroAssembler _masm(&cbuf);

  __ verified_entry(C);

  // Stub compilations get no entry barrier.
  if (ra_->C->stub_function() == NULL) {
    __ entry_barrier();
  }

  // Don't bind the label while only sizing code in the scratch buffer.
  if (!Compile::current()->output()->in_scratch_emit_size()) {
    __ bind(*_verified_entry);
  }

  C->output()->set_frame_complete(cbuf.insts_size());

  if (C->has_mach_constant_base_node()) {
    // NOTE: We set the table base offset here because users might be
    // emitted before MachConstantBaseNode.
    ConstantTable& constant_table = C->output()->constant_table();
    constant_table.set_table_base_offset(constant_table.calculate_table_base_offset());
  }
}
931
// Upper bound on relocation entries the prolog may emit.
int MachPrologNode::reloc() const
{
  return 0; // a large enough number
}
936
937 //=============================================================================
938 #ifndef PRODUCT
939 void MachEpilogNode::format(PhaseRegAlloc* ra_, outputStream* st) const
940 {
941 Compile* C = ra_->C;
942 if (generate_vzeroupper(C)) {
943 st->print("vzeroupper");
944 st->cr(); st->print("\t");
945 }
946
947 int framesize = C->output()->frame_size_in_bytes();
948 assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
949 // Remove word for return adr already pushed
950 // and RBP
951 framesize -= 2*wordSize;
959 if (do_polling() && C->is_method_compilation()) {
960 st->print("\t");
961 st->print_cr("cmpq rsp, poll_offset[r15_thread] \n\t"
962 "ja #safepoint_stub\t"
963 "# Safepoint: poll for GC");
964 }
965 }
966 #endif
967
// Emit the method epilog: optional vzeroupper, frame removal (with stack
// repair if needed), reserved-stack check, and the return-site safepoint
// poll.
void MachEpilogNode::emit(CodeBuffer& cbuf, PhaseRegAlloc* ra_) const
{
  Compile* C = ra_->C;
  MacroAssembler _masm(&cbuf);

  if (generate_vzeroupper(C)) {
    // Clear upper bits of YMM registers when current compiled code uses
    // wide vectors to avoid AVX <-> SSE transition penalty during call.
    __ vzeroupper();
  }

  // Subtract two words to account for return address and rbp
  int initial_framesize = C->output()->frame_size_in_bytes() - 2*wordSize;
  __ remove_frame(initial_framesize, C->needs_stack_repair());

  if (StackReservedPages > 0 && C->has_reserved_stack_access()) {
    __ reserved_stack_check();
  }

  if (do_polling() && C->is_method_compilation()) {
    MacroAssembler _masm(&cbuf);
    Label dummy_label;
    Label* code_stub = &dummy_label;
    // During scratch-buffer sizing no real stub is registered; poll
    // against a dummy label instead.
    if (!C->output()->in_scratch_emit_size()) {
      C2SafepointPollStub* stub = new (C->comp_arena()) C2SafepointPollStub(__ offset());
      C->output()->add_stub(stub);
      code_stub = &stub->entry();
    }
    __ relocate(relocInfo::poll_return_type);
    __ safepoint_poll(*code_stub, r15_thread, true /* at_return */, true /* in_nmethod */);
  }
}
1000
// Upper bound on relocation entries the epilog may emit (the safepoint
// poll requires relocation).
int MachEpilogNode::reloc() const
{
  return 2; // a large enough number
}
1005
// The epilog uses the default pipeline class.
const Pipeline* MachEpilogNode::pipeline() const
{
  return MachNode::pipeline_class();
}
1010
1011 //=============================================================================
1012
// Register class of a spill/copy location, used when choosing how to move
// a value between registers and stack slots.
enum RC {
  rc_bad,    // not a valid register/slot
  rc_int,    // general-purpose register
  rc_kreg,   // mask (k) register
  rc_float,  // XMM register
  rc_stack   // stack slot
};
1020
1613 emit_opcode(cbuf, reg < 8 ? Assembler::REX_W : Assembler::REX_WR);
1614 emit_opcode(cbuf, 0x8D); // LEA reg,[SP+offset]
1615 emit_rm(cbuf, 0x2, reg & 7, 0x04);
1616 emit_rm(cbuf, 0x0, 0x04, RSP_enc);
1617 emit_d32(cbuf, offset);
1618 } else {
1619 emit_opcode(cbuf, reg < 8 ? Assembler::REX_W : Assembler::REX_WR);
1620 emit_opcode(cbuf, 0x8D); // LEA reg,[SP+offset]
1621 emit_rm(cbuf, 0x1, reg & 7, 0x04);
1622 emit_rm(cbuf, 0x0, 0x04, RSP_enc);
1623 emit_d8(cbuf, offset);
1624 }
1625 }
1626
// Size of the emitted lea: REX + opcode + two ModRM/SIB bytes + disp8
// (5 bytes) for small offsets, or a disp32 (8 bytes) otherwise — matches
// the emit code above.
uint BoxLockNode::size(PhaseRegAlloc *ra_) const
{
  int offset = ra_->reg2offset(in_RegMask(0).find_first_elem());
  return (offset < 0x80) ? 5 : 8; // REX
}
1632
1633 //=============================================================================
1634 #ifndef PRODUCT
// Pretty-print the value (inline-type) entry point; placeholder text only.
void MachVEPNode::format(PhaseRegAlloc* ra_, outputStream* st) const
{
  st->print_cr("MachVEPNode");
}
1639 #endif
1640
// Emit the value entry point.  Unverified flavor: the inline cache check
// (compare receiver klass against rax).  Verified flavor: unpack inline
// type arguments passed as oops, then jump to the verified entry point.
void MachVEPNode::emit(CodeBuffer& cbuf, PhaseRegAlloc* ra_) const
{
  C2_MacroAssembler _masm(&cbuf);
  uint insts_size = cbuf.insts_size();
  if (!_verified) {
    if (UseCompressedClassPointers) {
      __ load_klass(rscratch1, j_rarg0, rscratch2);
      __ cmpptr(rax, rscratch1);
    } else {
      __ cmpptr(rax, Address(j_rarg0, oopDesc::klass_offset_in_bytes()));
    }
    __ jump_cc(Assembler::notEqual, RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
  } else {
    // TODO 8284443 Avoid creation of temporary frame
    // A temporary frame is set up (and immediately removed) so the entry
    // barrier can run before unpacking; stubs skip this.
    if (ra_->C->stub_function() == NULL) {
      __ verified_entry(ra_->C, 0);
      __ entry_barrier();
      int initial_framesize = ra_->C->output()->frame_size_in_bytes() - 2*wordSize;
      __ remove_frame(initial_framesize, false);
    }
    // Unpack inline type args passed as oop and then jump to
    // the verified entry point (skipping the unverified entry).
    int sp_inc = __ unpack_inline_args(ra_->C, _receiver_only);
    // Emit code for verified entry and save increment for stack repair on return
    __ verified_entry(ra_->C, sp_inc);
    // During scratch sizing the real label isn't available; jump to a
    // local dummy of equivalent size instead.
    if (Compile::current()->output()->in_scratch_emit_size()) {
      Label dummy_verified_entry;
      __ jmp(dummy_verified_entry);
    } else {
      __ jmp(*_verified_entry);
    }
  }
  /* WARNING these NOPs are critical so that verified entry point is properly
     4 bytes aligned for patching by NativeJump::patch_verified_entry() */
  int nops_cnt = 4 - ((cbuf.insts_size() - insts_size) & 0x3);
  nops_cnt &= 0x3; // Do not add nops if code is aligned.
  if (nops_cnt > 0) {
    __ nop(nops_cnt);
  }
}
1681
1682 //=============================================================================
#ifndef PRODUCT
// Debug-only pretty printer for the unverified entry point: shows the
// inline-cache klass check, the ic-miss jump, and the alignment nops.
void MachUEPNode::format(PhaseRegAlloc* ra_, outputStream* st) const
{
  if (UseCompressedClassPointers) {
    st->print_cr("movl rscratch1, [j_rarg0 + oopDesc::klass_offset_in_bytes()]\t# compressed klass");
    st->print_cr("\tdecode_klass_not_null rscratch1, rscratch1");
    st->print_cr("\tcmpq rax, rscratch1\t # Inline cache check");
  } else {
    st->print_cr("\tcmpq rax, [j_rarg0 + oopDesc::klass_offset_in_bytes()]\t"
                 "# Inline cache check");
  }
  st->print_cr("\tjne SharedRuntime::_ic_miss_stub");
  st->print_cr("\tnop\t# nops to align entry point");
}
#endif
1698
1699 void MachUEPNode::emit(CodeBuffer& cbuf, PhaseRegAlloc* ra_) const
1700 {
1701 MacroAssembler masm(&cbuf);
1704 masm.load_klass(rscratch1, j_rarg0, rscratch2);
1705 masm.cmpptr(rax, rscratch1);
1706 } else {
1707 masm.cmpptr(rax, Address(j_rarg0, oopDesc::klass_offset_in_bytes()));
1708 }
1709
1710 masm.jump_cc(Assembler::notEqual, RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
1711
1712 /* WARNING these NOPs are critical so that verified entry point is properly
1713 4 bytes aligned for patching by NativeJump::patch_verified_entry() */
1714 int nops_cnt = 4 - ((cbuf.insts_size() - insts_size) & 0x3);
1715 if (OptoBreakpoint) {
1716 // Leave space for int3
1717 nops_cnt -= 1;
1718 }
1719 nops_cnt &= 0x3; // Do not add nops if code is aligned.
1720 if (nops_cnt > 0)
1721 masm.nop(nops_cnt);
1722 }
1723
1724 //=============================================================================
1725
1726 const bool Matcher::supports_vector_calling_convention(void) {
1727 if (EnableVectorSupport && UseVectorStubs) {
1728 return true;
1729 }
1730 return false;
1731 }
1732
1733 OptoRegPair Matcher::vector_return_value(uint ideal_reg) {
1734 assert(EnableVectorSupport && UseVectorStubs, "sanity");
1735 int lo = XMM0_num;
1736 int hi = XMM0b_num;
1737 if (ideal_reg == Op_VecX) hi = XMM0d_num;
1738 else if (ideal_reg == Op_VecY) hi = XMM0h_num;
1739 else if (ideal_reg == Op_VecZ) hi = XMM0p_num;
1740 return OptoRegPair(hi, lo);
1741 }
1742
1743 // Is this branch offset short enough that a short branch can be used?
3982 %}
3983 %}
3984
// Indirect Memory Times Scale Plus Positive Index Register Plus Offset Operand
// Addressing mode [reg + off + (idx << scale)] where the int index is known
// non-negative, so the ConvI2L can be folded into the address.
operand indPosIndexScaleOffset(any_RegP reg, immL32 off, rRegI idx, immI2 scale)
%{
  constraint(ALLOC_IN_RC(ptr_reg));
  // Only match when the index's type has a non-negative lower bound.
  predicate(n->in(2)->in(3)->in(1)->as_Type()->type()->is_long()->_lo >= 0);
  match(AddP (AddP reg (LShiftL (ConvI2L idx) scale)) off);

  op_cost(10);
  format %{"[$reg + $off + $idx << $scale]" %}
  interface(MEMORY_INTER) %{
    base($reg);
    index($idx);
    scale($scale);
    disp($off);
  %}
%}
4001
// Indirect Narrow Oop Operand
// Decodes a compressed oop directly in the addressing mode:
// [R12 + reg << 3], where R12 holds the heap base and the shift
// matches CompressedOops::shift() == times_8.
operand indCompressedOop(rRegN reg) %{
  predicate(UseCompressedOops && (CompressedOops::shift() == Address::times_8));
  constraint(ALLOC_IN_RC(ptr_reg));
  match(DecodeN reg);

  op_cost(10);
  format %{"[R12 + $reg << 3] (compressed oop addressing)" %}
  interface(MEMORY_INTER) %{
    base(0xc); // R12
    index($reg);
    scale(0x3);
    disp(0x0);
  %}
%}
4017
// Indirect Narrow Oop Plus Offset Operand
// Note: x86 architecture doesn't support "scale * index + offset" without a base
// we can't free r12 even with CompressedOops::base() == NULL.
// Same decode-in-address form as indCompressedOop, with an extra
// 32-bit displacement: [R12 + reg << 3 + off].
operand indCompressedOopOffset(rRegN reg, immL32 off) %{
  predicate(UseCompressedOops && (CompressedOops::shift() == Address::times_8));
  constraint(ALLOC_IN_RC(ptr_reg));
  match(AddP (DecodeN reg) off);

  op_cost(10);
  format %{"[R12 + $reg << 3 + $off] (compressed oop addressing)" %}
  interface(MEMORY_INTER) %{
    base(0xc); // R12
    index($reg);
    scale(0x3);
    disp($off);
  %}
%}
4035
4036 // Indirect Memory Operand
4037 operand indirectNarrow(rRegN reg)
4344 equal(0x4, "e");
4345 not_equal(0x5, "ne");
4346 less(0x2, "b");
4347 greater_equal(0x3, "ae");
4348 less_equal(0x6, "be");
4349 greater(0x7, "a");
4350 overflow(0x0, "o");
4351 no_overflow(0x1, "no");
4352 %}
4353 %}
4354
//----------OPERAND CLASSES----------------------------------------------------
// Operand Classes are groups of operands that are used as to simplify
// instruction definitions by not requiring the AD writer to specify separate
// instructions for every form of operand when the instruction accepts
// multiple operand types with the same basic encoding and format. The classic
// case of this is memory operands.

// Every addressing mode a memory-input instruction may accept: the plain,
// offset, indexed and scaled forms plus their narrow-oop counterparts.
opclass memory(indirect, indOffset8, indOffset32, indIndexOffset, indIndex,
               indIndexScale, indPosIndexScale, indIndexScaleOffset, indPosIndexOffset, indPosIndexScaleOffset,
               indCompressedOop, indCompressedOopOffset,
               indirectNarrow, indOffset8Narrow, indOffset32Narrow,
               indIndexOffsetNarrow, indIndexNarrow, indIndexScaleNarrow,
               indIndexScaleOffsetNarrow, indPosIndexOffsetNarrow, indPosIndexScaleOffsetNarrow);
4368
4369 //----------PIPELINE-----------------------------------------------------------
4370 // Rules which define the behavior of the target architectures pipeline.
4371 pipeline %{
4372
//----------ATTRIBUTES---------------------------------------------------------
attributes %{
  variable_size_instructions;        // Instructions are of variable size
  max_instructions_per_bundle = 3;   // Up to 3 instructions per bundle
  instruction_unit_size = 1;         // An instruction is 1 bytes long
  instruction_fetch_unit_size = 16;  // The processor fetches one line
  instruction_fetch_units = 1;       // of 16 bytes

  // List of nop instructions
  nops( MachNop );
%}
4384
6932 format %{ "MEMBAR-storestore (empty encoding)" %}
6933 ins_encode( );
6934 ins_pipe(empty);
6935 %}
6936
6937 //----------Move Instructions--------------------------------------------------
6938
// Reinterpret a long as a pointer; the bit pattern is unchanged, so this is
// at most a register-to-register move.
instruct castX2P(rRegP dst, rRegL src)
%{
  match(Set dst (CastX2P src));

  format %{ "movq $dst, $src\t# long->ptr" %}
  ins_encode %{
    // Elide the move when the allocator assigned dst and src to the same register.
    if ($dst$$reg != $src$$reg) {
      __ movptr($dst$$Register, $src$$Register);
    }
  %}
  ins_pipe(ialu_reg_reg); // XXX
%}
6951
// Reinterpret as long; matches CastP2X with a narrow (compressed) source
// register. NOTE(review): despite the name, this matches CastP2X like
// castP2X below, just with an rRegN input — confirm against ideal-graph use.
instruct castN2X(rRegL dst, rRegN src)
%{
  match(Set dst (CastP2X src));

  format %{ "movq $dst, $src\t# ptr -> long" %}
  ins_encode %{
    // Elide the move when dst and src share a register.
    if ($dst$$reg != $src$$reg) {
      __ movptr($dst$$Register, $src$$Register);
    }
  %}
  ins_pipe(ialu_reg_reg); // XXX
%}
6964
// Reinterpret a pointer as a long; bit pattern unchanged, move only when
// the allocator put dst and src in different registers.
instruct castP2X(rRegL dst, rRegP src)
%{
  match(Set dst (CastP2X src));

  format %{ "movq $dst, $src\t# ptr -> long" %}
  ins_encode %{
    if ($dst$$reg != $src$$reg) {
      __ movptr($dst$$Register, $src$$Register);
    }
  %}
  ins_pipe(ialu_reg_reg); // XXX
%}
6977
6978 // Convert oop into int for vectors alignment masking
6979 instruct convP2I(rRegI dst, rRegP src)
6980 %{
6981 match(Set dst (ConvL2I (CastP2X src)));
6982
6983 format %{ "movl $dst, $src\t# ptr -> int" %}
6984 ins_encode %{
11460 effect(DEF dst, USE src);
11461 ins_cost(100);
11462 format %{ "movd $dst,$src\t# MoveI2F" %}
11463 ins_encode %{
11464 __ movdl($dst$$XMMRegister, $src$$Register);
11465 %}
11466 ins_pipe( pipe_slow );
11467 %}
11468
// Move the raw bits of a long into a double register (movdq GPR -> XMM);
// a reinterpretation, not a numeric conversion.
instruct MoveL2D_reg_reg(regD dst, rRegL src) %{
  match(Set dst (MoveL2D src));
  effect(DEF dst, USE src);
  ins_cost(100);
  format %{ "movd $dst,$src\t# MoveL2D" %}
  ins_encode %{
    __ movdq($dst$$XMMRegister, $src$$Register);
  %}
  ins_pipe( pipe_slow );
%}
11479
11480
// Fast clearing of an array
// Small ClearArray non-AVX512.
// Fills cnt 8-byte words at base with val. Short arrays use a simple store
// loop; larger ones use rep stosb / XMM stores / rep stosq depending on
// UseFastStosb / UseXMMForObjInit. Non-AVX512 path (UseAVX <= 2).
instruct rep_stos(rcx_RegL cnt, rdi_RegP base, regD tmp, rax_RegL val,
                  Universe dummy, rFlagsReg cr)
%{
  predicate(!((ClearArrayNode*)n)->is_large() && !((ClearArrayNode*)n)->word_copy_only() && (UseAVX <= 2));
  match(Set dummy (ClearArray (Binary cnt base) val));
  effect(USE_KILL cnt, USE_KILL base, TEMP tmp, USE_KILL val, KILL cr);

  format %{ $$template
    $$emit$$"cmp InitArrayShortSize,rcx\n\t"
    $$emit$$"jg LARGE\n\t"
    $$emit$$"dec rcx\n\t"
    $$emit$$"js DONE\t# Zero length\n\t"
    $$emit$$"mov rax,(rdi,rcx,8)\t# LOOP\n\t"
    $$emit$$"dec rcx\n\t"
    $$emit$$"jge LOOP\n\t"
    $$emit$$"jmp DONE\n\t"
    $$emit$$"# LARGE:\n\t"
    if (UseFastStosb) {
      $$emit$$"shlq rcx,3\t# Convert doublewords to bytes\n\t"
      $$emit$$"rep stosb\t# Store rax to *rdi++ while rcx--\n\t"
    } else if (UseXMMForObjInit) {
      $$emit$$"movdq $tmp, $val\n\t"
      $$emit$$"punpcklqdq $tmp, $tmp\n\t"
      $$emit$$"vinserti128_high $tmp, $tmp\n\t"
      $$emit$$"jmpq L_zero_64_bytes\n\t"
      $$emit$$"# L_loop:\t# 64-byte LOOP\n\t"
      $$emit$$"vmovdqu $tmp,(rax)\n\t"
      $$emit$$"vmovdqu $tmp,0x20(rax)\n\t"
      $$emit$$"add 0x40,rax\n\t"
      $$emit$$"# L_zero_64_bytes:\n\t"
      $$emit$$"sub 0x8,rcx\n\t"
      $$emit$$"jge L_loop\n\t"
      $$emit$$"add 0x4,rcx\n\t"
      $$emit$$"jl L_tail\n\t"
      $$emit$$"vmovdqu $tmp,(rax)\n\t"
      $$emit$$"add 0x20,rax\n\t"
      $$emit$$"sub 0x4,rcx\n\t"
      $$emit$$"# L_tail:\t# Clearing tail bytes\n\t"
      $$emit$$"add 0x4,rcx\n\t"
      $$emit$$"jle L_end\n\t"
      $$emit$$"dec rcx\n\t"
      $$emit$$"# L_sloop:\t# 8-byte short loop\n\t"
      $$emit$$"vmovq xmm0,(rax)\n\t"
      $$emit$$"add 0x8,rax\n\t"
      $$emit$$"dec rcx\n\t"
      $$emit$$"jge L_sloop\n\t"
      $$emit$$"# L_end:\n\t"
    } else {
      $$emit$$"rep stosq\t# Store rax to *rdi++ while rcx--\n\t"
    }
    $$emit$$"# DONE"
  %}
  ins_encode %{
    // is_large = false, word_copy_only = false
    __ clear_mem($base$$Register, $cnt$$Register, $val$$Register,
                 $tmp$$XMMRegister, false, false);
  %}
  ins_pipe(pipe_slow);
%}
11541
// Word-copy-only variant of rep_stos: byte-granular rep stosb must not be
// used, so the LARGE path offers only the XMM loop or rep stosq.
// Non-AVX512 path (UseAVX <= 2).
instruct rep_stos_word_copy(rcx_RegL cnt, rdi_RegP base, regD tmp, rax_RegL val,
                            Universe dummy, rFlagsReg cr)
%{
  predicate(!((ClearArrayNode*)n)->is_large() && ((ClearArrayNode*)n)->word_copy_only() && (UseAVX <= 2));
  match(Set dummy (ClearArray (Binary cnt base) val));
  effect(USE_KILL cnt, USE_KILL base, TEMP tmp, USE_KILL val, KILL cr);

  format %{ $$template
    $$emit$$"cmp InitArrayShortSize,rcx\n\t"
    $$emit$$"jg LARGE\n\t"
    $$emit$$"dec rcx\n\t"
    $$emit$$"js DONE\t# Zero length\n\t"
    $$emit$$"mov rax,(rdi,rcx,8)\t# LOOP\n\t"
    $$emit$$"dec rcx\n\t"
    $$emit$$"jge LOOP\n\t"
    $$emit$$"jmp DONE\n\t"
    $$emit$$"# LARGE:\n\t"
    if (UseXMMForObjInit) {
      $$emit$$"movdq $tmp, $val\n\t"
      $$emit$$"punpcklqdq $tmp, $tmp\n\t"
      $$emit$$"vinserti128_high $tmp, $tmp\n\t"
      $$emit$$"jmpq L_zero_64_bytes\n\t"
      $$emit$$"# L_loop:\t# 64-byte LOOP\n\t"
      $$emit$$"vmovdqu $tmp,(rax)\n\t"
      $$emit$$"vmovdqu $tmp,0x20(rax)\n\t"
      $$emit$$"add 0x40,rax\n\t"
      $$emit$$"# L_zero_64_bytes:\n\t"
      $$emit$$"sub 0x8,rcx\n\t"
      $$emit$$"jge L_loop\n\t"
      $$emit$$"add 0x4,rcx\n\t"
      $$emit$$"jl L_tail\n\t"
      $$emit$$"vmovdqu $tmp,(rax)\n\t"
      $$emit$$"add 0x20,rax\n\t"
      $$emit$$"sub 0x4,rcx\n\t"
      $$emit$$"# L_tail:\t# Clearing tail bytes\n\t"
      $$emit$$"add 0x4,rcx\n\t"
      $$emit$$"jle L_end\n\t"
      $$emit$$"dec rcx\n\t"
      $$emit$$"# L_sloop:\t# 8-byte short loop\n\t"
      $$emit$$"vmovq xmm0,(rax)\n\t"
      $$emit$$"add 0x8,rax\n\t"
      $$emit$$"dec rcx\n\t"
      $$emit$$"jge L_sloop\n\t"
      $$emit$$"# L_end:\n\t"
    } else {
      $$emit$$"rep stosq\t# Store rax to *rdi++ while rcx--\n\t"
    }
    $$emit$$"# DONE"
  %}
  ins_encode %{
    // is_large = false, word_copy_only = true
    __ clear_mem($base$$Register, $cnt$$Register, $val$$Register,
                 $tmp$$XMMRegister, false, true);
  %}
  ins_pipe(pipe_slow);
%}
11597
11598 // Small ClearArray AVX512 non-constant length.
11599 instruct rep_stos_evex(rcx_RegL cnt, rdi_RegP base, legRegD tmp, kReg ktmp, rax_RegL val,
11600 Universe dummy, rFlagsReg cr)
11601 %{
11602 predicate(!((ClearArrayNode*)n)->is_large() && !((ClearArrayNode*)n)->word_copy_only() && (UseAVX > 2));
11603 match(Set dummy (ClearArray (Binary cnt base) val));
11604 ins_cost(125);
11605 effect(USE_KILL cnt, USE_KILL base, TEMP tmp, TEMP ktmp, USE_KILL val, KILL cr);
11606
11607 format %{ $$template
11608 $$emit$$"xorq rax, rax\t# ClearArray:\n\t"
11609 $$emit$$"cmp InitArrayShortSize,rcx\n\t"
11610 $$emit$$"jg LARGE\n\t"
11611 $$emit$$"dec rcx\n\t"
11612 $$emit$$"js DONE\t# Zero length\n\t"
11613 $$emit$$"mov rax,(rdi,rcx,8)\t# LOOP\n\t"
11614 $$emit$$"dec rcx\n\t"
11615 $$emit$$"jge LOOP\n\t"
11616 $$emit$$"jmp DONE\n\t"
11617 $$emit$$"# LARGE:\n\t"
11618 if (UseFastStosb) {
11619 $$emit$$"shlq rcx,3\t# Convert doublewords to bytes\n\t"
11620 $$emit$$"rep stosb\t# Store rax to *rdi++ while rcx--\n\t"
11621 } else if (UseXMMForObjInit) {
11622 $$emit$$"mov rdi,rax\n\t"
11623 $$emit$$"vpxor ymm0,ymm0,ymm0\n\t"
11624 $$emit$$"jmpq L_zero_64_bytes\n\t"
11625 $$emit$$"# L_loop:\t# 64-byte LOOP\n\t"
11633 $$emit$$"jl L_tail\n\t"
11634 $$emit$$"vmovdqu ymm0,(rax)\n\t"
11635 $$emit$$"add 0x20,rax\n\t"
11636 $$emit$$"sub 0x4,rcx\n\t"
11637 $$emit$$"# L_tail:\t# Clearing tail bytes\n\t"
11638 $$emit$$"add 0x4,rcx\n\t"
11639 $$emit$$"jle L_end\n\t"
11640 $$emit$$"dec rcx\n\t"
11641 $$emit$$"# L_sloop:\t# 8-byte short loop\n\t"
11642 $$emit$$"vmovq xmm0,(rax)\n\t"
11643 $$emit$$"add 0x8,rax\n\t"
11644 $$emit$$"dec rcx\n\t"
11645 $$emit$$"jge L_sloop\n\t"
11646 $$emit$$"# L_end:\n\t"
11647 } else {
11648 $$emit$$"rep stosq\t# Store rax to *rdi++ while rcx--\n\t"
11649 }
11650 $$emit$$"# DONE"
11651 %}
11652 ins_encode %{
11653 __ clear_mem($base$$Register, $cnt$$Register, $val$$Register,
11654 $tmp$$XMMRegister, false, false, $ktmp$$KRegister);
11655 %}
11656 ins_pipe(pipe_slow);
11657 %}
11658
11659 instruct rep_stos_evex_word_copy(rcx_RegL cnt, rdi_RegP base, legRegD tmp, kReg ktmp, rax_RegL val,
11660 Universe dummy, rFlagsReg cr)
11661 %{
11662 predicate(!((ClearArrayNode*)n)->is_large() && ((ClearArrayNode*)n)->word_copy_only() && (UseAVX > 2));
11663 match(Set dummy (ClearArray (Binary cnt base) val));
11664 ins_cost(125);
11665 effect(USE_KILL cnt, USE_KILL base, TEMP tmp, TEMP ktmp, USE_KILL val, KILL cr);
11666
11667 format %{ $$template
11668 $$emit$$"xorq rax, rax\t# ClearArray:\n\t"
11669 $$emit$$"cmp InitArrayShortSize,rcx\n\t"
11670 $$emit$$"jg LARGE\n\t"
11671 $$emit$$"dec rcx\n\t"
11672 $$emit$$"js DONE\t# Zero length\n\t"
11673 $$emit$$"mov rax,(rdi,rcx,8)\t# LOOP\n\t"
11674 $$emit$$"dec rcx\n\t"
11675 $$emit$$"jge LOOP\n\t"
11676 $$emit$$"jmp DONE\n\t"
11677 $$emit$$"# LARGE:\n\t"
11678 if (UseFastStosb) {
11679 $$emit$$"shlq rcx,3\t# Convert doublewords to bytes\n\t"
11680 $$emit$$"rep stosb\t# Store rax to *rdi++ while rcx--\n\t"
11681 } else if (UseXMMForObjInit) {
11682 $$emit$$"mov rdi,rax\n\t"
11683 $$emit$$"vpxor ymm0,ymm0,ymm0\n\t"
11684 $$emit$$"jmpq L_zero_64_bytes\n\t"
11685 $$emit$$"# L_loop:\t# 64-byte LOOP\n\t"
11693 $$emit$$"jl L_tail\n\t"
11694 $$emit$$"vmovdqu ymm0,(rax)\n\t"
11695 $$emit$$"add 0x20,rax\n\t"
11696 $$emit$$"sub 0x4,rcx\n\t"
11697 $$emit$$"# L_tail:\t# Clearing tail bytes\n\t"
11698 $$emit$$"add 0x4,rcx\n\t"
11699 $$emit$$"jle L_end\n\t"
11700 $$emit$$"dec rcx\n\t"
11701 $$emit$$"# L_sloop:\t# 8-byte short loop\n\t"
11702 $$emit$$"vmovq xmm0,(rax)\n\t"
11703 $$emit$$"add 0x8,rax\n\t"
11704 $$emit$$"dec rcx\n\t"
11705 $$emit$$"jge L_sloop\n\t"
11706 $$emit$$"# L_end:\n\t"
11707 } else {
11708 $$emit$$"rep stosq\t# Store rax to *rdi++ while rcx--\n\t"
11709 }
11710 $$emit$$"# DONE"
11711 %}
11712 ins_encode %{
11713 __ clear_mem($base$$Register, $cnt$$Register, $val$$Register,
11714 $tmp$$XMMRegister, false, true, $ktmp$$KRegister);
11715 %}
11716 ins_pipe(pipe_slow);
11717 %}
11718
// Large ClearArray non-AVX512.
// Array is known large at compile time (ClearArrayNode::is_large()), so the
// short-loop prologue is skipped entirely.
instruct rep_stos_large(rcx_RegL cnt, rdi_RegP base, regD tmp, rax_RegL val,
                        Universe dummy, rFlagsReg cr)
%{
  predicate(((ClearArrayNode*)n)->is_large() && !((ClearArrayNode*)n)->word_copy_only() && (UseAVX <= 2));
  match(Set dummy (ClearArray (Binary cnt base) val));
  effect(USE_KILL cnt, USE_KILL base, TEMP tmp, USE_KILL val, KILL cr);

  format %{ $$template
    if (UseFastStosb) {
      $$emit$$"shlq rcx,3\t# Convert doublewords to bytes\n\t"
      $$emit$$"rep stosb\t# Store rax to *rdi++ while rcx--"
    } else if (UseXMMForObjInit) {
      $$emit$$"movdq $tmp, $val\n\t"
      $$emit$$"punpcklqdq $tmp, $tmp\n\t"
      $$emit$$"vinserti128_high $tmp, $tmp\n\t"
      $$emit$$"jmpq L_zero_64_bytes\n\t"
      $$emit$$"# L_loop:\t# 64-byte LOOP\n\t"
      $$emit$$"vmovdqu $tmp,(rax)\n\t"
      $$emit$$"vmovdqu $tmp,0x20(rax)\n\t"
      $$emit$$"add 0x40,rax\n\t"
      $$emit$$"# L_zero_64_bytes:\n\t"
      $$emit$$"sub 0x8,rcx\n\t"
      $$emit$$"jge L_loop\n\t"
      $$emit$$"add 0x4,rcx\n\t"
      $$emit$$"jl L_tail\n\t"
      $$emit$$"vmovdqu $tmp,(rax)\n\t"
      $$emit$$"add 0x20,rax\n\t"
      $$emit$$"sub 0x4,rcx\n\t"
      $$emit$$"# L_tail:\t# Clearing tail bytes\n\t"
      $$emit$$"add 0x4,rcx\n\t"
      $$emit$$"jle L_end\n\t"
      $$emit$$"dec rcx\n\t"
      $$emit$$"# L_sloop:\t# 8-byte short loop\n\t"
      $$emit$$"vmovq xmm0,(rax)\n\t"
      $$emit$$"add 0x8,rax\n\t"
      $$emit$$"dec rcx\n\t"
      $$emit$$"jge L_sloop\n\t"
      $$emit$$"# L_end:\n\t"
    } else {
      $$emit$$"rep stosq\t# Store rax to *rdi++ while rcx--"
    }
  %}
  ins_encode %{
    // is_large = true, word_copy_only = false
    __ clear_mem($base$$Register, $cnt$$Register, $val$$Register,
                 $tmp$$XMMRegister, true, false);
  %}
  ins_pipe(pipe_slow);
%}
11768
// Word-copy-only variant of rep_stos_large: byte-granular rep stosb must not
// be used, so only the XMM loop or rep stosq are offered.
instruct rep_stos_large_word_copy(rcx_RegL cnt, rdi_RegP base, regD tmp, rax_RegL val,
                                  Universe dummy, rFlagsReg cr)
%{
  predicate(((ClearArrayNode*)n)->is_large() && ((ClearArrayNode*)n)->word_copy_only() && (UseAVX <= 2));
  match(Set dummy (ClearArray (Binary cnt base) val));
  effect(USE_KILL cnt, USE_KILL base, TEMP tmp, USE_KILL val, KILL cr);

  format %{ $$template
    if (UseXMMForObjInit) {
      $$emit$$"movdq $tmp, $val\n\t"
      $$emit$$"punpcklqdq $tmp, $tmp\n\t"
      $$emit$$"vinserti128_high $tmp, $tmp\n\t"
      $$emit$$"jmpq L_zero_64_bytes\n\t"
      $$emit$$"# L_loop:\t# 64-byte LOOP\n\t"
      $$emit$$"vmovdqu $tmp,(rax)\n\t"
      $$emit$$"vmovdqu $tmp,0x20(rax)\n\t"
      $$emit$$"add 0x40,rax\n\t"
      $$emit$$"# L_zero_64_bytes:\n\t"
      $$emit$$"sub 0x8,rcx\n\t"
      $$emit$$"jge L_loop\n\t"
      $$emit$$"add 0x4,rcx\n\t"
      $$emit$$"jl L_tail\n\t"
      $$emit$$"vmovdqu $tmp,(rax)\n\t"
      $$emit$$"add 0x20,rax\n\t"
      $$emit$$"sub 0x4,rcx\n\t"
      $$emit$$"# L_tail:\t# Clearing tail bytes\n\t"
      $$emit$$"add 0x4,rcx\n\t"
      $$emit$$"jle L_end\n\t"
      $$emit$$"dec rcx\n\t"
      $$emit$$"# L_sloop:\t# 8-byte short loop\n\t"
      $$emit$$"vmovq xmm0,(rax)\n\t"
      $$emit$$"add 0x8,rax\n\t"
      $$emit$$"dec rcx\n\t"
      $$emit$$"jge L_sloop\n\t"
      $$emit$$"# L_end:\n\t"
    } else {
      $$emit$$"rep stosq\t# Store rax to *rdi++ while rcx--"
    }
  %}
  ins_encode %{
    // is_large = true, word_copy_only = true
    __ clear_mem($base$$Register, $cnt$$Register, $val$$Register,
                 $tmp$$XMMRegister, true, true);
  %}
  ins_pipe(pipe_slow);
%}
11814
// Large ClearArray AVX512.
// Same as rep_stos_large but for the AVX-512 path (UseAVX > 2): takes an
// extra opmask temp (ktmp) that clear_mem may use for masked stores.
instruct rep_stos_large_evex(rcx_RegL cnt, rdi_RegP base, legRegD tmp, kReg ktmp, rax_RegL val,
                             Universe dummy, rFlagsReg cr)
%{
  predicate(((ClearArrayNode*)n)->is_large() && !((ClearArrayNode*)n)->word_copy_only() && (UseAVX > 2));
  match(Set dummy (ClearArray (Binary cnt base) val));
  effect(USE_KILL cnt, USE_KILL base, TEMP tmp, TEMP ktmp, USE_KILL val, KILL cr);

  format %{ $$template
    if (UseFastStosb) {
      $$emit$$"xorq rax, rax\t# ClearArray:\n\t"
      $$emit$$"shlq rcx,3\t# Convert doublewords to bytes\n\t"
      $$emit$$"rep stosb\t# Store rax to *rdi++ while rcx--"
    } else if (UseXMMForObjInit) {
      $$emit$$"mov rdi,rax\t# ClearArray:\n\t"
      $$emit$$"vpxor ymm0,ymm0,ymm0\n\t"
      $$emit$$"jmpq L_zero_64_bytes\n\t"
      $$emit$$"# L_loop:\t# 64-byte LOOP\n\t"
      $$emit$$"vmovdqu ymm0,(rax)\n\t"
      $$emit$$"vmovdqu ymm0,0x20(rax)\n\t"
      $$emit$$"add 0x40,rax\n\t"
      $$emit$$"# L_zero_64_bytes:\n\t"
      $$emit$$"sub 0x8,rcx\n\t"
      $$emit$$"jge L_loop\n\t"
      $$emit$$"add 0x4,rcx\n\t"
      $$emit$$"jl L_tail\n\t"
      $$emit$$"vmovdqu ymm0,(rax)\n\t"
      $$emit$$"add 0x20,rax\n\t"
      $$emit$$"sub 0x4,rcx\n\t"
      $$emit$$"# L_tail:\t# Clearing tail bytes\n\t"
      $$emit$$"add 0x4,rcx\n\t"
      $$emit$$"jle L_end\n\t"
      $$emit$$"dec rcx\n\t"
      $$emit$$"# L_sloop:\t# 8-byte short loop\n\t"
      $$emit$$"vmovq xmm0,(rax)\n\t"
      $$emit$$"add 0x8,rax\n\t"
      $$emit$$"dec rcx\n\t"
      $$emit$$"jge L_sloop\n\t"
      $$emit$$"# L_end:\n\t"
    } else {
      $$emit$$"xorq rax, rax\t# ClearArray:\n\t"
      $$emit$$"rep stosq\t# Store rax to *rdi++ while rcx--"
    }
  %}
  ins_encode %{
    // is_large = true, word_copy_only = false, with AVX-512 opmask temp
    __ clear_mem($base$$Register, $cnt$$Register, $val$$Register,
                 $tmp$$XMMRegister, true, false, $ktmp$$KRegister);
  %}
  ins_pipe(pipe_slow);
%}
11865
// Word-copy-only variant of rep_stos_large_evex (AVX-512 path, UseAVX > 2).
instruct rep_stos_large_evex_word_copy(rcx_RegL cnt, rdi_RegP base, legRegD tmp, kReg ktmp, rax_RegL val,
                                       Universe dummy, rFlagsReg cr)
%{
  predicate(((ClearArrayNode*)n)->is_large() && ((ClearArrayNode*)n)->word_copy_only() && (UseAVX > 2));
  match(Set dummy (ClearArray (Binary cnt base) val));
  effect(USE_KILL cnt, USE_KILL base, TEMP tmp, TEMP ktmp, USE_KILL val, KILL cr);

  format %{ $$template
    if (UseFastStosb) {
      $$emit$$"xorq rax, rax\t# ClearArray:\n\t"
      $$emit$$"shlq rcx,3\t# Convert doublewords to bytes\n\t"
      $$emit$$"rep stosb\t# Store rax to *rdi++ while rcx--"
    } else if (UseXMMForObjInit) {
      $$emit$$"mov rdi,rax\t# ClearArray:\n\t"
      $$emit$$"vpxor ymm0,ymm0,ymm0\n\t"
      $$emit$$"jmpq L_zero_64_bytes\n\t"
      $$emit$$"# L_loop:\t# 64-byte LOOP\n\t"
      $$emit$$"vmovdqu ymm0,(rax)\n\t"
      $$emit$$"vmovdqu ymm0,0x20(rax)\n\t"
      $$emit$$"add 0x40,rax\n\t"
      $$emit$$"# L_zero_64_bytes:\n\t"
      $$emit$$"sub 0x8,rcx\n\t"
      $$emit$$"jge L_loop\n\t"
      $$emit$$"add 0x4,rcx\n\t"
      $$emit$$"jl L_tail\n\t"
      $$emit$$"vmovdqu ymm0,(rax)\n\t"
      $$emit$$"add 0x20,rax\n\t"
      $$emit$$"sub 0x4,rcx\n\t"
      $$emit$$"# L_tail:\t# Clearing tail bytes\n\t"
      $$emit$$"add 0x4,rcx\n\t"
      $$emit$$"jle L_end\n\t"
      $$emit$$"dec rcx\n\t"
      $$emit$$"# L_sloop:\t# 8-byte short loop\n\t"
      $$emit$$"vmovq xmm0,(rax)\n\t"
      $$emit$$"add 0x8,rax\n\t"
      $$emit$$"dec rcx\n\t"
      $$emit$$"jge L_sloop\n\t"
      $$emit$$"# L_end:\n\t"
    } else {
      $$emit$$"xorq rax, rax\t# ClearArray:\n\t"
      $$emit$$"rep stosq\t# Store rax to *rdi++ while rcx--"
    }
  %}
  ins_encode %{
    // is_large = true, word_copy_only = true, with AVX-512 opmask temp
    __ clear_mem($base$$Register, $cnt$$Register, $val$$Register,
                 $tmp$$XMMRegister, true, true, $ktmp$$KRegister);
  %}
  ins_pipe(pipe_slow);
%}
11915
// Small ClearArray AVX512 constant length.
// Length is a compile-time constant (immL cnt), so clear_mem can emit a
// fully unrolled / masked-store sequence; requires AVX512VL+BW.
instruct rep_stos_im(immL cnt, rRegP base, regD tmp, rax_RegL val, kReg ktmp, Universe dummy, rFlagsReg cr)
%{
  predicate(!((ClearArrayNode*)n)->is_large() && !((ClearArrayNode*)n)->word_copy_only() &&
            ((UseAVX > 2) && VM_Version::supports_avx512vlbw()));
  match(Set dummy (ClearArray (Binary cnt base) val));
  ins_cost(100);
  effect(TEMP tmp, USE_KILL val, TEMP ktmp, KILL cr);
  format %{ "clear_mem_imm $base , $cnt \n\t" %}
  ins_encode %{
    // Constant-length overload: $cnt$$constant instead of a count register.
    __ clear_mem($base$$Register, $cnt$$constant, $val$$Register, $tmp$$XMMRegister, $ktmp$$KRegister);
  %}
  ins_pipe(pipe_slow);
%}
11930
// Compare two Latin-1 (byte[]) strings; result follows compareTo semantics.
// Non-AVX512VLBW path, so the mask-register argument is knoreg.
instruct string_compareL(rdi_RegP str1, rcx_RegI cnt1, rsi_RegP str2, rdx_RegI cnt2,
                         rax_RegI result, legRegD tmp1, rFlagsReg cr)
%{
  predicate(!VM_Version::supports_avx512vlbw() && ((StrCompNode*)n)->encoding() == StrIntrinsicNode::LL);
  match(Set result (StrComp (Binary str1 cnt1) (Binary str2 cnt2)));
  effect(TEMP tmp1, USE_KILL str1, USE_KILL str2, USE_KILL cnt1, USE_KILL cnt2, KILL cr);

  format %{ "String Compare byte[] $str1,$cnt1,$str2,$cnt2 -> $result // KILL $tmp1" %}
  ins_encode %{
    __ string_compare($str1$$Register, $str2$$Register,
                      $cnt1$$Register, $cnt2$$Register, $result$$Register,
                      $tmp1$$XMMRegister, StrIntrinsicNode::LL, knoreg);
  %}
  ins_pipe( pipe_slow );
%}
11946
13710
13711 ins_cost(300);
13712 format %{ "call_leaf,runtime " %}
13713 ins_encode(clear_avx, Java_To_Runtime(meth));
13714 ins_pipe(pipe_slow);
13715 %}
13716
// Call runtime without safepoint and with vector arguments
// Note: unlike the scalar leaf call above, no clear_avx is emitted here —
// vector arguments must stay live in the upper YMM/ZMM halves.
instruct CallLeafDirectVector(method meth)
%{
  match(CallLeafVector);
  effect(USE meth);

  ins_cost(300);
  format %{ "call_leaf,vector " %}
  ins_encode(Java_To_Runtime(meth));
  ins_pipe(pipe_slow);
%}
13728
// Call runtime without safepoint
// entry point is null, target holds the address to call
instruct CallLeafNoFPInDirect(rRegP target)
%{
  // Only selected when the ideal call carries no static entry point.
  predicate(n->as_Call()->entry_point() == NULL);
  match(CallLeafNoFP target);

  ins_cost(300);
  format %{ "call_leaf_nofp,runtime indirect " %}
  ins_encode %{
    __ call($target$$Register);
  %}

  ins_pipe(pipe_slow);
%}
13744
// Direct leaf runtime call (no safepoint, no FP arguments) when the entry
// point is known statically; clears AVX upper state before the call.
instruct CallLeafNoFPDirect(method meth)
%{
  predicate(n->as_Call()->entry_point() != NULL);
  match(CallLeafNoFP);
  effect(USE meth);

  ins_cost(300);
  format %{ "call_leaf_nofp,runtime " %}
  ins_encode(clear_avx, Java_To_Runtime(meth));
  ins_pipe(pipe_slow);
%}
13756
13757 // Return Instruction
13758 // Remove the return address & jump to it.
13759 // Notice: We always emit a nop after a ret to make sure there is room
13760 // for safepoint patching
13761 instruct Ret()
13762 %{
13763 match(Return);
13764
13765 format %{ "ret" %}
13766 ins_encode %{
13767 __ ret(0);
|