473 }
474
475 // !!!!! Special hack to get all types of calls to specify the byte offset
476 // from the start of the call to the point where the return address
477 // will point.
478 int MachCallStaticJavaNode::ret_addr_offset()
479 {
480 int offset = 5; // 5 bytes from start of call to where return address points
481 offset += clear_avx_size();
482 return offset;
483 }
484
485 int MachCallDynamicJavaNode::ret_addr_offset()
486 {
487 int offset = 15; // 15 bytes from start of call to where return address points
488 offset += clear_avx_size();
489 return offset;
490 }
491
492 int MachCallRuntimeNode::ret_addr_offset() {
493 int offset = 13; // movq r10,#addr; callq (r10)
494 if (this->ideal_Opcode() != Op_CallLeafVector) {
495 offset += clear_avx_size();
496 }
497 return offset;
498 }
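// Where these byte counts come from, assuming the standard x86-64 encodings
// (a sketch for orientation only; the emitters are authoritative):
//   static:  call rel32                        = 5 bytes  (E8 + imm32)
//   dynamic: movq <reg>, #imm64 + call rel32   = 10 + 5 = 15 bytes
//   runtime: movq r10, #addr    + callq (r10)  = 10 + 3 = 13 bytes
//   clear_avx_size(): vzeroupper (C5 F8 77)    = 3 bytes when emitted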
499 //
500 // Compute padding required for nodes which need alignment
501 //
502
503 // The address of the call instruction needs to be 4-byte aligned to
504 // ensure that it does not span a cache line so that it can be patched.
505 int CallStaticJavaDirectNode::compute_padding(int current_offset) const
506 {
507 current_offset += clear_avx_size(); // skip vzeroupper
508 current_offset += 1; // skip call opcode byte
509 return align_up(current_offset, alignment_required()) - current_offset;
510 }
511
512 // The address of the call instruction needs to be 4-byte aligned to
513 // ensure that it does not span a cache line so that it can be patched.
514 int CallDynamicJavaDirectNode::compute_padding(int current_offset) const
515 {
516 current_offset += clear_avx_size(); // skip vzeroupper
517 current_offset += 11; // skip movq instruction + call opcode byte
518 return align_up(current_offset, alignment_required()) - current_offset;
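// A worked example of the padding computation above (a standalone sketch;
// this align_up mirrors HotSpot's round-up-to-multiple semantics and the
// offsets are hypothetical):
#include <cassert>

static int align_up_sketch(int x, int alignment) {
  return (x + alignment - 1) & -alignment;
}

int main() {
  // Node starts at offset 6; vzeroupper (3 bytes) plus the call opcode byte
  // put the patchable 4-byte displacement at 6 + 3 + 1 = 10, so 2 padding
  // bytes are needed to reach the next 4-byte boundary.
  int displacement_start = 6 + 3 + 1;
  assert(align_up_sketch(displacement_start, 4) - displacement_start == 2);
  return 0;
}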
887 st->print("# stack alignment check");
888 #endif
889 }
890 if (C->stub_function() != NULL && BarrierSet::barrier_set()->barrier_set_nmethod() != NULL) {
891 st->print("\n\t");
892 st->print("cmpl [r15_thread + #disarmed_guard_value_offset], #disarmed_guard_value\t");
893 st->print("\n\t");
894 st->print("je fast_entry\t");
895 st->print("\n\t");
896 st->print("call #nmethod_entry_barrier_stub\t");
897 st->print("\n\tfast_entry:");
898 }
899 st->cr();
900 }
901 #endif
902
903 void MachPrologNode::emit(CodeBuffer &cbuf, PhaseRegAlloc *ra_) const {
904 Compile* C = ra_->C;
905 C2_MacroAssembler _masm(&cbuf);
906
907 int framesize = C->output()->frame_size_in_bytes();
908 int bangsize = C->output()->bang_size_in_bytes();
909
910 if (C->clinit_barrier_on_entry()) {
911 assert(VM_Version::supports_fast_class_init_checks(), "sanity");
912 assert(!C->method()->holder()->is_not_initialized(), "initialization should have been started");
913
914 Label L_skip_barrier;
915 Register klass = rscratch1;
916
917 __ mov_metadata(klass, C->method()->holder()->constant_encoding());
918 __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);
919
920 __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
921
922 __ bind(L_skip_barrier);
923 }
924
925 __ verified_entry(framesize, C->output()->need_stack_bang(bangsize) ? bangsize : 0, false, C->stub_function() != NULL);
926
927 C->output()->set_frame_complete(cbuf.insts_size());
928
929 if (C->has_mach_constant_base_node()) {
930 // NOTE: We set the table base offset here because users might be
931 // emitted before MachConstantBaseNode.
932 ConstantTable& constant_table = C->output()->constant_table();
933 constant_table.set_table_base_offset(constant_table.calculate_table_base_offset());
934 }
935 }
936
937 uint MachPrologNode::size(PhaseRegAlloc* ra_) const
938 {
939 return MachNode::size(ra_); // too many variables; just compute it
940 // the hard way
941 }
942
943 int MachPrologNode::reloc() const
944 {
945 return 0; // a large enough number
946 }
947
948 //=============================================================================
949 #ifndef PRODUCT
950 void MachEpilogNode::format(PhaseRegAlloc* ra_, outputStream* st) const
951 {
952 Compile* C = ra_->C;
953 if (generate_vzeroupper(C)) {
954 st->print("vzeroupper");
955 st->cr(); st->print("\t");
956 }
957
958 int framesize = C->output()->frame_size_in_bytes();
959 assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
960 // Remove word for return adr already pushed
961 // and RBP
962 framesize -= 2*wordSize;
970 if (do_polling() && C->is_method_compilation()) {
971 st->print("\t");
972 st->print_cr("cmpq rsp, poll_offset[r15_thread] \n\t"
973 "ja #safepoint_stub\t"
974 "# Safepoint: poll for GC");
975 }
976 }
977 #endif
978
979 void MachEpilogNode::emit(CodeBuffer& cbuf, PhaseRegAlloc* ra_) const
980 {
981 Compile* C = ra_->C;
982 MacroAssembler _masm(&cbuf);
983
984 if (generate_vzeroupper(C)) {
985 // Clear upper bits of YMM registers when current compiled code uses
986 // wide vectors to avoid AVX <-> SSE transition penalty during call.
987 __ vzeroupper();
988 }
989
990 int framesize = C->output()->frame_size_in_bytes();
991 assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
992 // Remove word for return adr already pushed
993 // and RBP
994 framesize -= 2*wordSize;
995
996 // Note that VerifyStackAtCalls' Majik cookie does not change the frame size popped here
997
998 if (framesize) {
999 emit_opcode(cbuf, Assembler::REX_W);
1000 if (framesize < 0x80) {
1001 emit_opcode(cbuf, 0x83); // addq rsp, #framesize
1002 emit_rm(cbuf, 0x3, 0x00, RSP_enc);
1003 emit_d8(cbuf, framesize);
1004 } else {
1005 emit_opcode(cbuf, 0x81); // addq rsp, #framesize
1006 emit_rm(cbuf, 0x3, 0x00, RSP_enc);
1007 emit_d32(cbuf, framesize);
1008 }
1009 }
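// Encoding sizes chosen above (assumed standard x86-64 forms): the imm8 form
// REX.W 83 /0 ib is 4 bytes and sign-extends its immediate, hence the
// framesize < 0x80 guard; the imm32 form REX.W 81 /0 id is 7 bytes.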
1010
1011 // popq rbp
1012 emit_opcode(cbuf, 0x58 | RBP_enc);
1013
1014 if (StackReservedPages > 0 && C->has_reserved_stack_access()) {
1015 __ reserved_stack_check();
1016 }
1017
1018 if (do_polling() && C->is_method_compilation()) {
1019 MacroAssembler _masm(&cbuf);
1020 Label dummy_label;
1021 Label* code_stub = &dummy_label;
1022 if (!C->output()->in_scratch_emit_size()) {
1023 C2SafepointPollStub* stub = new (C->comp_arena()) C2SafepointPollStub(__ offset());
1024 C->output()->add_stub(stub);
1025 code_stub = &stub->entry();
1026 }
1027 __ relocate(relocInfo::poll_return_type);
1028 __ safepoint_poll(*code_stub, r15_thread, true /* at_return */, true /* in_nmethod */);
1029 }
1030 }
1031
1032 uint MachEpilogNode::size(PhaseRegAlloc* ra_) const
1033 {
1034 return MachNode::size(ra_); // too many variables; just compute it
1035 // the hard way
1036 }
1037
1038 int MachEpilogNode::reloc() const
1039 {
1040 return 2; // a large enough number
1041 }
1042
1043 const Pipeline* MachEpilogNode::pipeline() const
1044 {
1045 return MachNode::pipeline_class();
1046 }
1047
1048 //=============================================================================
1049
1050 enum RC {
1051 rc_bad,
1052 rc_int,
1053 rc_kreg,
1054 rc_float,
1055 rc_stack
1056 };
1057
1650 emit_opcode(cbuf, reg < 8 ? Assembler::REX_W : Assembler::REX_WR);
1651 emit_opcode(cbuf, 0x8D); // LEA reg,[SP+offset]
1652 emit_rm(cbuf, 0x2, reg & 7, 0x04);
1653 emit_rm(cbuf, 0x0, 0x04, RSP_enc);
1654 emit_d32(cbuf, offset);
1655 } else {
1656 emit_opcode(cbuf, reg < 8 ? Assembler::REX_W : Assembler::REX_WR);
1657 emit_opcode(cbuf, 0x8D); // LEA reg,[SP+offset]
1658 emit_rm(cbuf, 0x1, reg & 7, 0x04);
1659 emit_rm(cbuf, 0x0, 0x04, RSP_enc);
1660 emit_d8(cbuf, offset);
1661 }
1662 }
1663
1664 uint BoxLockNode::size(PhaseRegAlloc *ra_) const
1665 {
1666 int offset = ra_->reg2offset(in_RegMask(0).find_first_elem());
1667 return (offset < 0x80) ? 5 : 8; // REX
1668 }
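// Size breakdown for the LEA emitted above (assumed encoding): REX.W + 0x8D +
// ModRM + SIB is 4 bytes, plus a 1-byte disp8 (offset < 0x80) or a 4-byte
// disp32 -- hence the 5-or-8 result.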
1669
1670 //=============================================================================
1671 #ifndef PRODUCT
1672 void MachUEPNode::format(PhaseRegAlloc* ra_, outputStream* st) const
1673 {
1674 if (UseCompressedClassPointers) {
1675 st->print_cr("movl rscratch1, [j_rarg0 + oopDesc::klass_offset_in_bytes()]\t# compressed klass");
1676 st->print_cr("\tdecode_klass_not_null rscratch1, rscratch1");
1677 st->print_cr("\tcmpq rax, rscratch1\t # Inline cache check");
1678 } else {
1679 st->print_cr("\tcmpq rax, [j_rarg0 + oopDesc::klass_offset_in_bytes()]\t"
1680 "# Inline cache check");
1681 }
1682 st->print_cr("\tjne SharedRuntime::_ic_miss_stub");
1683 st->print_cr("\tnop\t# nops to align entry point");
1684 }
1685 #endif
1686
1687 void MachUEPNode::emit(CodeBuffer& cbuf, PhaseRegAlloc* ra_) const
1688 {
1689 MacroAssembler masm(&cbuf);
1692 masm.load_klass(rscratch1, j_rarg0, rscratch2);
1693 masm.cmpptr(rax, rscratch1);
1694 } else {
1695 masm.cmpptr(rax, Address(j_rarg0, oopDesc::klass_offset_in_bytes()));
1696 }
1697
1698 masm.jump_cc(Assembler::notEqual, RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
1699
1700 /* WARNING these NOPs are critical so that the verified entry point is properly
1701 4-byte aligned for patching by NativeJump::patch_verified_entry() */
1702 int nops_cnt = 4 - ((cbuf.insts_size() - insts_size) & 0x3);
1703 if (OptoBreakpoint) {
1704 // Leave space for int3
1705 nops_cnt -= 1;
1706 }
1707 nops_cnt &= 0x3; // Do not add nops if code is aligned.
1708 if (nops_cnt > 0)
1709 masm.nop(nops_cnt);
1710 }
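// Worked example of the nop-count arithmetic above (standalone sketch with a
// hypothetical code size; OptoBreakpoint reserves one byte for an int3):
#include <cassert>

int main() {
  int code_size = 9;                     // bytes emitted so far (hypothetical)
  int nops_cnt  = 4 - (code_size & 0x3); // distance to the next 4-byte boundary
  bool opto_breakpoint = true;           // stands in for the OptoBreakpoint flag
  if (opto_breakpoint) {
    nops_cnt -= 1;                       // leave space for int3
  }
  nops_cnt &= 0x3;                       // emit nothing when already aligned
  assert(nops_cnt == 2);
  return 0;
}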
1711
1712 uint MachUEPNode::size(PhaseRegAlloc* ra_) const
1713 {
1714 return MachNode::size(ra_); // too many variables; just compute it
1715 // the hard way
1716 }
1717
1718
1719 //=============================================================================
1720
1721 const bool Matcher::supports_vector_calling_convention(void) {
1722 if (EnableVectorSupport && UseVectorStubs) {
1723 return true;
1724 }
1725 return false;
1726 }
1727
1728 OptoRegPair Matcher::vector_return_value(uint ideal_reg) {
1729 assert(EnableVectorSupport && UseVectorStubs, "sanity");
1730 int lo = XMM0_num;
1731 int hi = XMM0b_num;
1732 if (ideal_reg == Op_VecX) hi = XMM0d_num;
1733 else if (ideal_reg == Op_VecY) hi = XMM0h_num;
1734 else if (ideal_reg == Op_VecZ) hi = XMM0p_num;
1735 return OptoRegPair(hi, lo);
1736 }
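// Reading the pair above: 'lo' is always the first 32-bit slot of XMM0 and
// 'hi' names the last slot the value occupies, so wider ideal registers only
// move 'hi' outward (XMM0b by default, XMM0d for VecX, XMM0h for VecY,
// XMM0p for VecZ).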
1737
1738 // Is this branch offset short enough that a short branch can be used?
3977 %}
3978 %}
3979
3980 // Indirect Memory Times Scale Plus Positive Index Register Plus Offset Operand
3981 operand indPosIndexScaleOffset(any_RegP reg, immL32 off, rRegI idx, immI2 scale)
3982 %{
3983 constraint(ALLOC_IN_RC(ptr_reg));
3984 predicate(n->in(2)->in(3)->in(1)->as_Type()->type()->is_long()->_lo >= 0);
3985 match(AddP (AddP reg (LShiftL (ConvI2L idx) scale)) off);
3986
3987 op_cost(10);
3988 format %{"[$reg + $off + $idx << $scale]" %}
3989 interface(MEMORY_INTER) %{
3990 base($reg);
3991 index($idx);
3992 scale($scale);
3993 disp($off);
3994 %}
3995 %}
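// Shape of the tree the predicate above inspects (a sketch): with AddP inputs
// numbered Base=1/Address=2/Offset=3, n->in(2) is the inner AddP, its in(3)
// is the LShiftL, and that shift's in(1) is the ConvI2L whose long type must
// have _lo >= 0 -- i.e. the index is provably non-negative.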
3996
3997 // Indirect Narrow Oop Plus Offset Operand
3999 // Note: the x86 architecture doesn't support "scale * index + offset" without a base,
4000 // so we can't free up r12 even when CompressedOops::base() == NULL.
4000 operand indCompressedOopOffset(rRegN reg, immL32 off) %{
4001 predicate(UseCompressedOops && (CompressedOops::shift() == Address::times_8));
4002 constraint(ALLOC_IN_RC(ptr_reg));
4003 match(AddP (DecodeN reg) off);
4004
4005 op_cost(10);
4006 format %{"[R12 + $reg << 3 + $off] (compressed oop addressing)" %}
4007 interface(MEMORY_INTER) %{
4008 base(0xc); // R12
4009 index($reg);
4010 scale(0x3);
4011 disp($off);
4012 %}
4013 %}
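// Arithmetic this operand folds into one addressing mode (a sketch, assuming
// a non-null heap base kept in R12): DecodeN computes base + (narrow << 3),
// so [R12 + $reg*8 + $off] addresses the oop field directly.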
4014
4015 // Indirect Memory Operand
4016 operand indirectNarrow(rRegN reg)
4323 equal(0x4, "e");
4324 not_equal(0x5, "ne");
4325 less(0x2, "b");
4326 greater_equal(0x3, "ae");
4327 less_equal(0x6, "be");
4328 greater(0x7, "a");
4329 overflow(0x0, "o");
4330 no_overflow(0x1, "no");
4331 %}
4332 %}
4333
4334 //----------OPERAND CLASSES----------------------------------------------------
4335 // Operand Classes are groups of operands that are used to simplify
4336 // instruction definitions by not requiring the AD writer to specify separate
4337 // instructions for every form of operand when the instruction accepts
4338 // multiple operand types with the same basic encoding and format. The classic
4339 // case of this is memory operands.
4340
4341 opclass memory(indirect, indOffset8, indOffset32, indIndexOffset, indIndex,
4342 indIndexScale, indPosIndexScale, indIndexScaleOffset, indPosIndexOffset, indPosIndexScaleOffset,
4343 indCompressedOopOffset,
4344 indirectNarrow, indOffset8Narrow, indOffset32Narrow,
4345 indIndexOffsetNarrow, indIndexNarrow, indIndexScaleNarrow,
4346 indIndexScaleOffsetNarrow, indPosIndexOffsetNarrow, indPosIndexScaleOffsetNarrow);
4347
4348 //----------PIPELINE-----------------------------------------------------------
4349 // Rules which define the behavior of the target architectures pipeline.
4350 pipeline %{
4351
4352 //----------ATTRIBUTES---------------------------------------------------------
4353 attributes %{
4354 variable_size_instructions; // Variable-sized instructions
4355 max_instructions_per_bundle = 3; // Up to 3 instructions per bundle
4356 instruction_unit_size = 1; // An instruction is 1 byte long
4357 instruction_fetch_unit_size = 16; // The processor fetches one line
4358 instruction_fetch_units = 1; // of 16 bytes
4359
4360 // List of nop instructions
4361 nops( MachNop );
4362 %}
4363
6911 format %{ "MEMBAR-storestore (empty encoding)" %}
6912 ins_encode( );
6913 ins_pipe(empty);
6914 %}
6915
6916 //----------Move Instructions--------------------------------------------------
6917
6918 instruct castX2P(rRegP dst, rRegL src)
6919 %{
6920 match(Set dst (CastX2P src));
6921
6922 format %{ "movq $dst, $src\t# long->ptr" %}
6923 ins_encode %{
6924 if ($dst$$reg != $src$$reg) {
6925 __ movptr($dst$$Register, $src$$Register);
6926 }
6927 %}
6928 ins_pipe(ialu_reg_reg); // XXX
6929 %}
6930
6931 instruct castP2X(rRegL dst, rRegP src)
6932 %{
6933 match(Set dst (CastP2X src));
6934
6935 format %{ "movq $dst, $src\t# ptr -> long" %}
6936 ins_encode %{
6937 if ($dst$$reg != $src$$reg) {
6938 __ movptr($dst$$Register, $src$$Register);
6939 }
6940 %}
6941 ins_pipe(ialu_reg_reg); // XXX
6942 %}
6943
6944 // Convert oop into int for vectors alignment masking
6945 instruct convP2I(rRegI dst, rRegP src)
6946 %{
6947 match(Set dst (ConvL2I (CastP2X src)));
6948
6949 format %{ "movl $dst, $src\t# ptr -> int" %}
6950 ins_encode %{
11416 effect(DEF dst, USE src);
11417 ins_cost(100);
11418 format %{ "movd $dst,$src\t# MoveI2F" %}
11419 ins_encode %{
11420 __ movdl($dst$$XMMRegister, $src$$Register);
11421 %}
11422 ins_pipe( pipe_slow );
11423 %}
11424
11425 instruct MoveL2D_reg_reg(regD dst, rRegL src) %{
11426 match(Set dst (MoveL2D src));
11427 effect(DEF dst, USE src);
11428 ins_cost(100);
11429 format %{ "movd $dst,$src\t# MoveL2D" %}
11430 ins_encode %{
11431 __ movdq($dst$$XMMRegister, $src$$Register);
11432 %}
11433 ins_pipe( pipe_slow );
11434 %}
11435
11436 // Fast clearing of an array
11437 // Small ClearArray non-AVX512.
11438 instruct rep_stos(rcx_RegL cnt, rdi_RegP base, regD tmp, rax_RegI zero,
11439 Universe dummy, rFlagsReg cr)
11440 %{
11441 predicate(!((ClearArrayNode*)n)->is_large() && (UseAVX <= 2));
11442 match(Set dummy (ClearArray cnt base));
11443 effect(USE_KILL cnt, USE_KILL base, TEMP tmp, KILL zero, KILL cr);
11444
11445 format %{ $$template
11446 $$emit$$"xorq rax, rax\t# ClearArray:\n\t"
11447 $$emit$$"cmp InitArrayShortSize,rcx\n\t"
11448 $$emit$$"jg LARGE\n\t"
11449 $$emit$$"dec rcx\n\t"
11450 $$emit$$"js DONE\t# Zero length\n\t"
11451 $$emit$$"mov rax,(rdi,rcx,8)\t# LOOP\n\t"
11452 $$emit$$"dec rcx\n\t"
11453 $$emit$$"jge LOOP\n\t"
11454 $$emit$$"jmp DONE\n\t"
11455 $$emit$$"# LARGE:\n\t"
11456 if (UseFastStosb) {
11457 $$emit$$"shlq rcx,3\t# Convert doublewords to bytes\n\t"
11458 $$emit$$"rep stosb\t# Store rax to *rdi++ while rcx--\n\t"
11459 } else if (UseXMMForObjInit) {
11460 $$emit$$"mov rdi,rax\n\t"
11461 $$emit$$"vpxor ymm0,ymm0,ymm0\n\t"
11462 $$emit$$"jmpq L_zero_64_bytes\n\t"
11463 $$emit$$"# L_loop:\t# 64-byte LOOP\n\t"
11471 $$emit$$"jl L_tail\n\t"
11472 $$emit$$"vmovdqu ymm0,(rax)\n\t"
11473 $$emit$$"add 0x20,rax\n\t"
11474 $$emit$$"sub 0x4,rcx\n\t"
11475 $$emit$$"# L_tail:\t# Clearing tail bytes\n\t"
11476 $$emit$$"add 0x4,rcx\n\t"
11477 $$emit$$"jle L_end\n\t"
11478 $$emit$$"dec rcx\n\t"
11479 $$emit$$"# L_sloop:\t# 8-byte short loop\n\t"
11480 $$emit$$"vmovq xmm0,(rax)\n\t"
11481 $$emit$$"add 0x8,rax\n\t"
11482 $$emit$$"dec rcx\n\t"
11483 $$emit$$"jge L_sloop\n\t"
11484 $$emit$$"# L_end:\n\t"
11485 } else {
11486 $$emit$$"rep stosq\t# Store rax to *rdi++ while rcx--\n\t"
11487 }
11488 $$emit$$"# DONE"
11489 %}
11490 ins_encode %{
11491 __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register,
11492 $tmp$$XMMRegister, false, knoreg);
11493 %}
11494 ins_pipe(pipe_slow);
11495 %}
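// Strategy narrated by the format above (a hedged summary of the clear_mem
// paths): short blocks (per the InitArrayShortSize threshold) use the
// unrolled store loop; larger blocks use rep stosb when UseFastStosb, a
// 32-byte YMM loop when UseXMMForObjInit, and rep stosq otherwise.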
11496
11497 // Small ClearArray AVX512 non-constant length.
11498 instruct rep_stos_evex(rcx_RegL cnt, rdi_RegP base, legRegD tmp, kReg ktmp, rax_RegI zero,
11499 Universe dummy, rFlagsReg cr)
11500 %{
11501 predicate(!((ClearArrayNode*)n)->is_large() && (UseAVX > 2));
11502 match(Set dummy (ClearArray cnt base));
11503 ins_cost(125);
11504 effect(USE_KILL cnt, USE_KILL base, TEMP tmp, TEMP ktmp, KILL zero, KILL cr);
11505
11506 format %{ $$template
11507 $$emit$$"xorq rax, rax\t# ClearArray:\n\t"
11508 $$emit$$"cmp InitArrayShortSize,rcx\n\t"
11509 $$emit$$"jg LARGE\n\t"
11510 $$emit$$"dec rcx\n\t"
11511 $$emit$$"js DONE\t# Zero length\n\t"
11512 $$emit$$"mov rax,(rdi,rcx,8)\t# LOOP\n\t"
11513 $$emit$$"dec rcx\n\t"
11514 $$emit$$"jge LOOP\n\t"
11515 $$emit$$"jmp DONE\n\t"
11516 $$emit$$"# LARGE:\n\t"
11517 if (UseFastStosb) {
11518 $$emit$$"shlq rcx,3\t# Convert doublewords to bytes\n\t"
11519 $$emit$$"rep stosb\t# Store rax to *rdi++ while rcx--\n\t"
11520 } else if (UseXMMForObjInit) {
11521 $$emit$$"mov rdi,rax\n\t"
11522 $$emit$$"vpxor ymm0,ymm0,ymm0\n\t"
11523 $$emit$$"jmpq L_zero_64_bytes\n\t"
11524 $$emit$$"# L_loop:\t# 64-byte LOOP\n\t"
11532 $$emit$$"jl L_tail\n\t"
11533 $$emit$$"vmovdqu ymm0,(rax)\n\t"
11534 $$emit$$"add 0x20,rax\n\t"
11535 $$emit$$"sub 0x4,rcx\n\t"
11536 $$emit$$"# L_tail:\t# Clearing tail bytes\n\t"
11537 $$emit$$"add 0x4,rcx\n\t"
11538 $$emit$$"jle L_end\n\t"
11539 $$emit$$"dec rcx\n\t"
11540 $$emit$$"# L_sloop:\t# 8-byte short loop\n\t"
11541 $$emit$$"vmovq xmm0,(rax)\n\t"
11542 $$emit$$"add 0x8,rax\n\t"
11543 $$emit$$"dec rcx\n\t"
11544 $$emit$$"jge L_sloop\n\t"
11545 $$emit$$"# L_end:\n\t"
11546 } else {
11547 $$emit$$"rep stosq\t# Store rax to *rdi++ while rcx--\n\t"
11548 }
11549 $$emit$$"# DONE"
11550 %}
11551 ins_encode %{
11552 __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register,
11553 $tmp$$XMMRegister, false, $ktmp$$KRegister);
11554 %}
11555 ins_pipe(pipe_slow);
11556 %}
11557
11558 // Large ClearArray non-AVX512.
11559 instruct rep_stos_large(rcx_RegL cnt, rdi_RegP base, regD tmp, rax_RegI zero,
11560 Universe dummy, rFlagsReg cr)
11561 %{
11562 predicate((UseAVX <= 2) && ((ClearArrayNode*)n)->is_large());
11563 match(Set dummy (ClearArray cnt base));
11564 effect(USE_KILL cnt, USE_KILL base, TEMP tmp, KILL zero, KILL cr);
11565
11566 format %{ $$template
11567 if (UseFastStosb) {
11568 $$emit$$"xorq rax, rax\t# ClearArray:\n\t"
11569 $$emit$$"shlq rcx,3\t# Convert doublewords to bytes\n\t"
11570 $$emit$$"rep stosb\t# Store rax to *rdi++ while rcx--"
11571 } else if (UseXMMForObjInit) {
11572 $$emit$$"mov rdi,rax\t# ClearArray:\n\t"
11573 $$emit$$"vpxor ymm0,ymm0,ymm0\n\t"
11574 $$emit$$"jmpq L_zero_64_bytes\n\t"
11575 $$emit$$"# L_loop:\t# 64-byte LOOP\n\t"
11576 $$emit$$"vmovdqu ymm0,(rax)\n\t"
11577 $$emit$$"vmovdqu ymm0,0x20(rax)\n\t"
11578 $$emit$$"add 0x40,rax\n\t"
11579 $$emit$$"# L_zero_64_bytes:\n\t"
11580 $$emit$$"sub 0x8,rcx\n\t"
11581 $$emit$$"jge L_loop\n\t"
11582 $$emit$$"add 0x4,rcx\n\t"
11583 $$emit$$"jl L_tail\n\t"
11584 $$emit$$"vmovdqu ymm0,(rax)\n\t"
11585 $$emit$$"add 0x20,rax\n\t"
11586 $$emit$$"sub 0x4,rcx\n\t"
11587 $$emit$$"# L_tail:\t# Clearing tail bytes\n\t"
11588 $$emit$$"add 0x4,rcx\n\t"
11589 $$emit$$"jle L_end\n\t"
11590 $$emit$$"dec rcx\n\t"
11591 $$emit$$"# L_sloop:\t# 8-byte short loop\n\t"
11592 $$emit$$"vmovq xmm0,(rax)\n\t"
11593 $$emit$$"add 0x8,rax\n\t"
11594 $$emit$$"dec rcx\n\t"
11595 $$emit$$"jge L_sloop\n\t"
11596 $$emit$$"# L_end:\n\t"
11597 } else {
11598 $$emit$$"xorq rax, rax\t# ClearArray:\n\t"
11599 $$emit$$"rep stosq\t# Store rax to *rdi++ while rcx--"
11600 }
11601 %}
11602 ins_encode %{
11603 __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register,
11604 $tmp$$XMMRegister, true, knoreg);
11605 %}
11606 ins_pipe(pipe_slow);
11607 %}
11608
11609 // Large ClearArray AVX512.
11610 instruct rep_stos_large_evex(rcx_RegL cnt, rdi_RegP base, legRegD tmp, kReg ktmp, rax_RegI zero,
11611 Universe dummy, rFlagsReg cr)
11612 %{
11613 predicate((UseAVX > 2) && ((ClearArrayNode*)n)->is_large());
11614 match(Set dummy (ClearArray cnt base));
11615 effect(USE_KILL cnt, USE_KILL base, TEMP tmp, TEMP ktmp, KILL zero, KILL cr);
11616
11617 format %{ $$template
11618 if (UseFastStosb) {
11619 $$emit$$"xorq rax, rax\t# ClearArray:\n\t"
11620 $$emit$$"shlq rcx,3\t# Convert doublewords to bytes\n\t"
11621 $$emit$$"rep stosb\t# Store rax to *rdi++ while rcx--"
11622 } else if (UseXMMForObjInit) {
11623 $$emit$$"mov rdi,rax\t# ClearArray:\n\t"
11624 $$emit$$"vpxor ymm0,ymm0,ymm0\n\t"
11625 $$emit$$"jmpq L_zero_64_bytes\n\t"
11626 $$emit$$"# L_loop:\t# 64-byte LOOP\n\t"
11627 $$emit$$"vmovdqu ymm0,(rax)\n\t"
11628 $$emit$$"vmovdqu ymm0,0x20(rax)\n\t"
11629 $$emit$$"add 0x40,rax\n\t"
11630 $$emit$$"# L_zero_64_bytes:\n\t"
11631 $$emit$$"sub 0x8,rcx\n\t"
11632 $$emit$$"jge L_loop\n\t"
11633 $$emit$$"add 0x4,rcx\n\t"
11634 $$emit$$"jl L_tail\n\t"
11635 $$emit$$"vmovdqu ymm0,(rax)\n\t"
11636 $$emit$$"add 0x20,rax\n\t"
11637 $$emit$$"sub 0x4,rcx\n\t"
11638 $$emit$$"# L_tail:\t# Clearing tail bytes\n\t"
11639 $$emit$$"add 0x4,rcx\n\t"
11640 $$emit$$"jle L_end\n\t"
11641 $$emit$$"dec rcx\n\t"
11642 $$emit$$"# L_sloop:\t# 8-byte short loop\n\t"
11643 $$emit$$"vmovq xmm0,(rax)\n\t"
11644 $$emit$$"add 0x8,rax\n\t"
11645 $$emit$$"dec rcx\n\t"
11646 $$emit$$"jge L_sloop\n\t"
11647 $$emit$$"# L_end:\n\t"
11648 } else {
11649 $$emit$$"xorq rax, rax\t# ClearArray:\n\t"
11650 $$emit$$"rep stosq\t# Store rax to *rdi++ while rcx--"
11651 }
11652 %}
11653 ins_encode %{
11654 __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register,
11655 $tmp$$XMMRegister, true, $ktmp$$KRegister);
11656 %}
11657 ins_pipe(pipe_slow);
11658 %}
11659
11660 // Small ClearArray AVX512 constant length.
11661 instruct rep_stos_im(immL cnt, rRegP base, regD tmp, rRegI zero, kReg ktmp, Universe dummy, rFlagsReg cr)
11662 %{
11663 predicate(!((ClearArrayNode*)n)->is_large() &&
11664 ((UseAVX > 2) && VM_Version::supports_avx512vlbw()));
11665 match(Set dummy (ClearArray cnt base));
11666 ins_cost(100);
11667 effect(TEMP tmp, TEMP zero, TEMP ktmp, KILL cr);
11668 format %{ "clear_mem_imm $base, $cnt\n\t" %}
11669 ins_encode %{
11670 __ clear_mem($base$$Register, $cnt$$constant, $zero$$Register, $tmp$$XMMRegister, $ktmp$$KRegister);
11671 %}
11672 ins_pipe(pipe_slow);
11673 %}
11674
11675 instruct string_compareL(rdi_RegP str1, rcx_RegI cnt1, rsi_RegP str2, rdx_RegI cnt2,
11676 rax_RegI result, legRegD tmp1, rFlagsReg cr)
11677 %{
11678 predicate(!VM_Version::supports_avx512vlbw() && ((StrCompNode*)n)->encoding() == StrIntrinsicNode::LL);
11679 match(Set result (StrComp (Binary str1 cnt1) (Binary str2 cnt2)));
11680 effect(TEMP tmp1, USE_KILL str1, USE_KILL str2, USE_KILL cnt1, USE_KILL cnt2, KILL cr);
11681
11682 format %{ "String Compare byte[] $str1,$cnt1,$str2,$cnt2 -> $result // KILL $tmp1" %}
11683 ins_encode %{
11684 __ string_compare($str1$$Register, $str2$$Register,
11685 $cnt1$$Register, $cnt2$$Register, $result$$Register,
11686 $tmp1$$XMMRegister, StrIntrinsicNode::LL, knoreg);
11687 %}
11688 ins_pipe( pipe_slow );
11689 %}
11690
13420
13421 ins_cost(300);
13422 format %{ "call_leaf,runtime " %}
13423 ins_encode(clear_avx, Java_To_Runtime(meth));
13424 ins_pipe(pipe_slow);
13425 %}
13426
13427 // Call runtime without safepoint and with vector arguments
13428 instruct CallLeafDirectVector(method meth)
13429 %{
13430 match(CallLeafVector);
13431 effect(USE meth);
13432
13433 ins_cost(300);
13434 format %{ "call_leaf,vector " %}
13435 ins_encode(Java_To_Runtime(meth));
13436 ins_pipe(pipe_slow);
13437 %}
13438
13439 // Call runtime without safepoint
13440 instruct CallLeafNoFPDirect(method meth)
13441 %{
13442 match(CallLeafNoFP);
13443 effect(USE meth);
13444
13445 ins_cost(300);
13446 format %{ "call_leaf_nofp,runtime " %}
13447 ins_encode(clear_avx, Java_To_Runtime(meth));
13448 ins_pipe(pipe_slow);
13449 %}
13450
13451 // Return Instruction
13452 // Remove the return address & jump to it.
13453 // Notice: We always emit a nop after a ret to make sure there is room
13454 // for safepoint patching
13455 instruct Ret()
13456 %{
13457 match(Return);
13458
13459 format %{ "ret" %}
13460 ins_encode %{
13461 __ ret(0);
|
473 }
474
475 // !!!!! Special hack to get all types of calls to specify the byte offset
476 // from the start of the call to the point where the return address
477 // will point.
478 int MachCallStaticJavaNode::ret_addr_offset()
479 {
480 int offset = 5; // 5 bytes from start of call to where return address points
481 offset += clear_avx_size();
482 return offset;
483 }
484
485 int MachCallDynamicJavaNode::ret_addr_offset()
486 {
487 int offset = 15; // 15 bytes from start of call to where return address points
488 offset += clear_avx_size();
489 return offset;
490 }
491
492 int MachCallRuntimeNode::ret_addr_offset() {
493 if (_entry_point == NULL) {
494 // CallLeafNoFPInDirect
495 return 3; // callq (register)
496 }
497 int offset = 13; // movq r10,#addr; callq (r10)
498 if (this->ideal_Opcode() != Op_CallLeafVector) {
499 offset += clear_avx_size();
500 }
501 return offset;
502 }
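// Sketch of the new 3-byte case above (assumed encoding): a register-indirect
// call is FF /2 -- 2 bytes through rax..rdi, 3 bytes with a REX prefix for
// r8..r15 -- and the constant budgets for the REX form.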
503
504 //
505 // Compute padding required for nodes which need alignment
506 //
507
508 // The address of the call instruction needs to be 4-byte aligned to
509 // ensure that it does not span a cache line so that it can be patched.
510 int CallStaticJavaDirectNode::compute_padding(int current_offset) const
511 {
512 current_offset += clear_avx_size(); // skip vzeroupper
513 current_offset += 1; // skip call opcode byte
514 return align_up(current_offset, alignment_required()) - current_offset;
515 }
516
517 // The address of the call instruction needs to be 4-byte aligned to
518 // ensure that it does not span a cache line so that it can be patched.
519 int CallDynamicJavaDirectNode::compute_padding(int current_offset) const
520 {
521 current_offset += clear_avx_size(); // skip vzeroupper
522 current_offset += 11; // skip movq instruction + call opcode byte
523 return align_up(current_offset, alignment_required()) - current_offset;
892 st->print("# stack alignment check");
893 #endif
894 }
895 if (C->stub_function() != NULL && BarrierSet::barrier_set()->barrier_set_nmethod() != NULL) {
896 st->print("\n\t");
897 st->print("cmpl [r15_thread + #disarmed_guard_value_offset], #disarmed_guard_value\t");
898 st->print("\n\t");
899 st->print("je fast_entry\t");
900 st->print("\n\t");
901 st->print("call #nmethod_entry_barrier_stub\t");
902 st->print("\n\tfast_entry:");
903 }
904 st->cr();
905 }
906 #endif
907
908 void MachPrologNode::emit(CodeBuffer &cbuf, PhaseRegAlloc *ra_) const {
909 Compile* C = ra_->C;
910 C2_MacroAssembler _masm(&cbuf);
911
912 __ verified_entry(C);
913
914 if (ra_->C->stub_function() == NULL) {
915 __ entry_barrier();
916 }
917
918 if (!Compile::current()->output()->in_scratch_emit_size()) {
919 __ bind(*_verified_entry);
920 }
921
922 C->output()->set_frame_complete(cbuf.insts_size());
923
924 if (C->has_mach_constant_base_node()) {
925 // NOTE: We set the table base offset here because users might be
926 // emitted before MachConstantBaseNode.
927 ConstantTable& constant_table = C->output()->constant_table();
928 constant_table.set_table_base_offset(constant_table.calculate_table_base_offset());
929 }
930 }
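// Rough entry layout produced above (a sketch of this variant's flow):
//   [unverified entry]   inline-cache check (MachUEPNode)
//   [verified entry]     frame setup via verified_entry(C)
//                        nmethod entry barrier (skipped for stub functions)
//   _verified_entry:     label bound here so MachVEPNode can jump past the
//                        frame setup after unpacking inline-type arguments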
931
932 int MachPrologNode::reloc() const
933 {
934 return 0; // a large enough number
935 }
936
937 //=============================================================================
938 #ifndef PRODUCT
939 void MachEpilogNode::format(PhaseRegAlloc* ra_, outputStream* st) const
940 {
941 Compile* C = ra_->C;
942 if (generate_vzeroupper(C)) {
943 st->print("vzeroupper");
944 st->cr(); st->print("\t");
945 }
946
947 int framesize = C->output()->frame_size_in_bytes();
948 assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
949 // Remove word for return adr already pushed
950 // and RBP
951 framesize -= 2*wordSize;
959 if (do_polling() && C->is_method_compilation()) {
960 st->print("\t");
961 st->print_cr("cmpq rsp, poll_offset[r15_thread] \n\t"
962 "ja #safepoint_stub\t"
963 "# Safepoint: poll for GC");
964 }
965 }
966 #endif
967
968 void MachEpilogNode::emit(CodeBuffer& cbuf, PhaseRegAlloc* ra_) const
969 {
970 Compile* C = ra_->C;
971 MacroAssembler _masm(&cbuf);
972
973 if (generate_vzeroupper(C)) {
974 // Clear upper bits of YMM registers when current compiled code uses
975 // wide vectors to avoid AVX <-> SSE transition penalty during call.
976 __ vzeroupper();
977 }
978
979 // Subtract two words to account for return address and rbp
980 int initial_framesize = C->output()->frame_size_in_bytes() - 2*wordSize;
981 __ remove_frame(initial_framesize, C->needs_stack_repair());
982
983 if (StackReservedPages > 0 && C->has_reserved_stack_access()) {
984 __ reserved_stack_check();
985 }
986
987 if (do_polling() && C->is_method_compilation()) {
988 MacroAssembler _masm(&cbuf);
989 Label dummy_label;
990 Label* code_stub = &dummy_label;
991 if (!C->output()->in_scratch_emit_size()) {
992 C2SafepointPollStub* stub = new (C->comp_arena()) C2SafepointPollStub(__ offset());
993 C->output()->add_stub(stub);
994 code_stub = &stub->entry();
995 }
996 __ relocate(relocInfo::poll_return_type);
997 __ safepoint_poll(*code_stub, r15_thread, true /* at_return */, true /* in_nmethod */);
998 }
999 }
1000
1001 int MachEpilogNode::reloc() const
1002 {
1003 return 2; // a large enough number
1004 }
1005
1006 const Pipeline* MachEpilogNode::pipeline() const
1007 {
1008 return MachNode::pipeline_class();
1009 }
1010
1011 //=============================================================================
1012
1013 enum RC {
1014 rc_bad,
1015 rc_int,
1016 rc_kreg,
1017 rc_float,
1018 rc_stack
1019 };
1020
1613 emit_opcode(cbuf, reg < 8 ? Assembler::REX_W : Assembler::REX_WR);
1614 emit_opcode(cbuf, 0x8D); // LEA reg,[SP+offset]
1615 emit_rm(cbuf, 0x2, reg & 7, 0x04);
1616 emit_rm(cbuf, 0x0, 0x04, RSP_enc);
1617 emit_d32(cbuf, offset);
1618 } else {
1619 emit_opcode(cbuf, reg < 8 ? Assembler::REX_W : Assembler::REX_WR);
1620 emit_opcode(cbuf, 0x8D); // LEA reg,[SP+offset]
1621 emit_rm(cbuf, 0x1, reg & 7, 0x04);
1622 emit_rm(cbuf, 0x0, 0x04, RSP_enc);
1623 emit_d8(cbuf, offset);
1624 }
1625 }
1626
1627 uint BoxLockNode::size(PhaseRegAlloc *ra_) const
1628 {
1629 int offset = ra_->reg2offset(in_RegMask(0).find_first_elem());
1630 return (offset < 0x80) ? 5 : 8; // REX
1631 }
1632
1633 //=============================================================================
1634 #ifndef PRODUCT
1635 void MachVEPNode::format(PhaseRegAlloc* ra_, outputStream* st) const
1636 {
1637 st->print_cr("MachVEPNode");
1638 }
1639 #endif
1640
1641 void MachVEPNode::emit(CodeBuffer& cbuf, PhaseRegAlloc* ra_) const
1642 {
1643 C2_MacroAssembler _masm(&cbuf);
1644 uint insts_size = cbuf.insts_size();
1645 if (!_verified) {
1646 if (UseCompressedClassPointers) {
1647 __ load_klass(rscratch1, j_rarg0, rscratch2);
1648 __ cmpptr(rax, rscratch1);
1649 } else {
1650 __ cmpptr(rax, Address(j_rarg0, oopDesc::klass_offset_in_bytes()));
1651 }
1652 __ jump_cc(Assembler::notEqual, RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
1653 } else {
1654 // TODO 8284443 Avoid creation of temporary frame
1655 if (ra_->C->stub_function() == NULL) {
1656 __ verified_entry(ra_->C, 0);
1657 __ entry_barrier();
1658 int initial_framesize = ra_->C->output()->frame_size_in_bytes() - 2*wordSize;
1659 __ remove_frame(initial_framesize, false);
1660 }
1661 // Unpack inline type args passed as oop and then jump to
1662 // the verified entry point (skipping the unverified entry).
1663 int sp_inc = __ unpack_inline_args(ra_->C, _receiver_only);
1664 // Emit code for verified entry and save increment for stack repair on return
1665 __ verified_entry(ra_->C, sp_inc);
1666 if (Compile::current()->output()->in_scratch_emit_size()) {
1667 Label dummy_verified_entry;
1668 __ jmp(dummy_verified_entry);
1669 } else {
1670 __ jmp(*_verified_entry);
1671 }
1672 }
1673 /* WARNING these NOPs are critical so that the verified entry point is properly
1674 4-byte aligned for patching by NativeJump::patch_verified_entry() */
1675 int nops_cnt = 4 - ((cbuf.insts_size() - insts_size) & 0x3);
1676 nops_cnt &= 0x3; // Do not add nops if code is aligned.
1677 if (nops_cnt > 0) {
1678 __ nop(nops_cnt);
1679 }
1680 }
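// Control flow sketched by MachVEPNode above (a hedged reading): the
// unverified flavor performs the inline-cache check only, while the verified
// flavor builds a temporary frame (TODO 8284443), unpacks inline-type
// arguments passed as oops into the scalarized convention, records sp_inc
// for stack repair on return, and jumps to *_verified_entry.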
1681
1682 //=============================================================================
1683 #ifndef PRODUCT
1684 void MachUEPNode::format(PhaseRegAlloc* ra_, outputStream* st) const
1685 {
1686 if (UseCompressedClassPointers) {
1687 st->print_cr("movl rscratch1, [j_rarg0 + oopDesc::klass_offset_in_bytes()]\t# compressed klass");
1688 st->print_cr("\tdecode_klass_not_null rscratch1, rscratch1");
1689 st->print_cr("\tcmpq rax, rscratch1\t # Inline cache check");
1690 } else {
1691 st->print_cr("\tcmpq rax, [j_rarg0 + oopDesc::klass_offset_in_bytes()]\t"
1692 "# Inline cache check");
1693 }
1694 st->print_cr("\tjne SharedRuntime::_ic_miss_stub");
1695 st->print_cr("\tnop\t# nops to align entry point");
1696 }
1697 #endif
1698
1699 void MachUEPNode::emit(CodeBuffer& cbuf, PhaseRegAlloc* ra_) const
1700 {
1701 MacroAssembler masm(&cbuf);
1704 masm.load_klass(rscratch1, j_rarg0, rscratch2);
1705 masm.cmpptr(rax, rscratch1);
1706 } else {
1707 masm.cmpptr(rax, Address(j_rarg0, oopDesc::klass_offset_in_bytes()));
1708 }
1709
1710 masm.jump_cc(Assembler::notEqual, RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
1711
1712 /* WARNING these NOPs are critical so that the verified entry point is properly
1713 4-byte aligned for patching by NativeJump::patch_verified_entry() */
1714 int nops_cnt = 4 - ((cbuf.insts_size() - insts_size) & 0x3);
1715 if (OptoBreakpoint) {
1716 // Leave space for int3
1717 nops_cnt -= 1;
1718 }
1719 nops_cnt &= 0x3; // Do not add nops if code is aligned.
1720 if (nops_cnt > 0)
1721 masm.nop(nops_cnt);
1722 }
1723
1724 //=============================================================================
1725
1726 const bool Matcher::supports_vector_calling_convention(void) {
1727 if (EnableVectorSupport && UseVectorStubs) {
1728 return true;
1729 }
1730 return false;
1731 }
1732
1733 OptoRegPair Matcher::vector_return_value(uint ideal_reg) {
1734 assert(EnableVectorSupport && UseVectorStubs, "sanity");
1735 int lo = XMM0_num;
1736 int hi = XMM0b_num;
1737 if (ideal_reg == Op_VecX) hi = XMM0d_num;
1738 else if (ideal_reg == Op_VecY) hi = XMM0h_num;
1739 else if (ideal_reg == Op_VecZ) hi = XMM0p_num;
1740 return OptoRegPair(hi, lo);
1741 }
1742
1743 // Is this branch offset short enough that a short branch can be used?
3982 %}
3983 %}
3984
3985 // Indirect Memory Times Scale Plus Positive Index Register Plus Offset Operand
3986 operand indPosIndexScaleOffset(any_RegP reg, immL32 off, rRegI idx, immI2 scale)
3987 %{
3988 constraint(ALLOC_IN_RC(ptr_reg));
3989 predicate(n->in(2)->in(3)->in(1)->as_Type()->type()->is_long()->_lo >= 0);
3990 match(AddP (AddP reg (LShiftL (ConvI2L idx) scale)) off);
3991
3992 op_cost(10);
3993 format %{"[$reg + $off + $idx << $scale]" %}
3994 interface(MEMORY_INTER) %{
3995 base($reg);
3996 index($idx);
3997 scale($scale);
3998 disp($off);
3999 %}
4000 %}
4001
4002 // Indirect Narrow Oop Operand
4003 operand indCompressedOop(rRegN reg) %{
4004 predicate(UseCompressedOops && (CompressedOops::shift() == Address::times_8));
4005 constraint(ALLOC_IN_RC(ptr_reg));
4006 match(DecodeN reg);
4007
4008 op_cost(10);
4009 format %{"[R12 + $reg << 3] (compressed oop addressing)" %}
4010 interface(MEMORY_INTER) %{
4011 base(0xc); // R12
4012 index($reg);
4013 scale(0x3);
4014 disp(0x0);
4015 %}
4016 %}
4017
4018 // Indirect Narrow Oop Plus Offset Operand
4019 // Note: the x86 architecture doesn't support "scale * index + offset" without a base,
4020 // so we can't free up r12 even when CompressedOops::base() == NULL.
4021 operand indCompressedOopOffset(rRegN reg, immL32 off) %{
4022 predicate(UseCompressedOops && (CompressedOops::shift() == Address::times_8));
4023 constraint(ALLOC_IN_RC(ptr_reg));
4024 match(AddP (DecodeN reg) off);
4025
4026 op_cost(10);
4027 format %{"[R12 + $reg << 3 + $off] (compressed oop addressing)" %}
4028 interface(MEMORY_INTER) %{
4029 base(0xc); // R12
4030 index($reg);
4031 scale(0x3);
4032 disp($off);
4033 %}
4034 %}
4035
4036 // Indirect Memory Operand
4037 operand indirectNarrow(rRegN reg)
4344 equal(0x4, "e");
4345 not_equal(0x5, "ne");
4346 less(0x2, "b");
4347 greater_equal(0x3, "ae");
4348 less_equal(0x6, "be");
4349 greater(0x7, "a");
4350 overflow(0x0, "o");
4351 no_overflow(0x1, "no");
4352 %}
4353 %}
4354
4355 //----------OPERAND CLASSES----------------------------------------------------
4356 // Operand Classes are groups of operands that are used to simplify
4357 // instruction definitions by not requiring the AD writer to specify separate
4358 // instructions for every form of operand when the instruction accepts
4359 // multiple operand types with the same basic encoding and format. The classic
4360 // case of this is memory operands.
4361
4362 opclass memory(indirect, indOffset8, indOffset32, indIndexOffset, indIndex,
4363 indIndexScale, indPosIndexScale, indIndexScaleOffset, indPosIndexOffset, indPosIndexScaleOffset,
4364 indCompressedOop, indCompressedOopOffset,
4365 indirectNarrow, indOffset8Narrow, indOffset32Narrow,
4366 indIndexOffsetNarrow, indIndexNarrow, indIndexScaleNarrow,
4367 indIndexScaleOffsetNarrow, indPosIndexOffsetNarrow, indPosIndexScaleOffsetNarrow);
4368
4369 //----------PIPELINE-----------------------------------------------------------
4370 // Rules which define the behavior of the target architectures pipeline.
4371 pipeline %{
4372
4373 //----------ATTRIBUTES---------------------------------------------------------
4374 attributes %{
4375 variable_size_instructions; // Variable-sized instructions
4376 max_instructions_per_bundle = 3; // Up to 3 instructions per bundle
4377 instruction_unit_size = 1; // An instruction is 1 byte long
4378 instruction_fetch_unit_size = 16; // The processor fetches one line
4379 instruction_fetch_units = 1; // of 16 bytes
4380
4381 // List of nop instructions
4382 nops( MachNop );
4383 %}
4384
6932 format %{ "MEMBAR-storestore (empty encoding)" %}
6933 ins_encode( );
6934 ins_pipe(empty);
6935 %}
6936
6937 //----------Move Instructions--------------------------------------------------
6938
6939 instruct castX2P(rRegP dst, rRegL src)
6940 %{
6941 match(Set dst (CastX2P src));
6942
6943 format %{ "movq $dst, $src\t# long->ptr" %}
6944 ins_encode %{
6945 if ($dst$$reg != $src$$reg) {
6946 __ movptr($dst$$Register, $src$$Register);
6947 }
6948 %}
6949 ins_pipe(ialu_reg_reg); // XXX
6950 %}
6951
6952 instruct castN2X(rRegL dst, rRegN src)
6953 %{
6954 match(Set dst (CastP2X src));
6955
6956 format %{ "movq $dst, $src\t# narrow ptr -> long" %}
6957 ins_encode %{
6958 if ($dst$$reg != $src$$reg) {
6959 __ movptr($dst$$Register, $src$$Register);
6960 }
6961 %}
6962 ins_pipe(ialu_reg_reg); // XXX
6963 %}
6964
6965 instruct castP2X(rRegL dst, rRegP src)
6966 %{
6967 match(Set dst (CastP2X src));
6968
6969 format %{ "movq $dst, $src\t# ptr -> long" %}
6970 ins_encode %{
6971 if ($dst$$reg != $src$$reg) {
6972 __ movptr($dst$$Register, $src$$Register);
6973 }
6974 %}
6975 ins_pipe(ialu_reg_reg); // XXX
6976 %}
6977
6978 // Convert oop into int for vectors alignment masking
6979 instruct convP2I(rRegI dst, rRegP src)
6980 %{
6981 match(Set dst (ConvL2I (CastP2X src)));
6982
6983 format %{ "movl $dst, $src\t# ptr -> int" %}
6984 ins_encode %{
11450 effect(DEF dst, USE src);
11451 ins_cost(100);
11452 format %{ "movd $dst,$src\t# MoveI2F" %}
11453 ins_encode %{
11454 __ movdl($dst$$XMMRegister, $src$$Register);
11455 %}
11456 ins_pipe( pipe_slow );
11457 %}
11458
11459 instruct MoveL2D_reg_reg(regD dst, rRegL src) %{
11460 match(Set dst (MoveL2D src));
11461 effect(DEF dst, USE src);
11462 ins_cost(100);
11463 format %{ "movd $dst,$src\t# MoveL2D" %}
11464 ins_encode %{
11465 __ movdq($dst$$XMMRegister, $src$$Register);
11466 %}
11467 ins_pipe( pipe_slow );
11468 %}
11469
11470
11471 // Fast clearing of an array
11472 // Small ClearArray non-AVX512.
11473 instruct rep_stos(rcx_RegL cnt, rdi_RegP base, regD tmp, rax_RegL val,
11474 Universe dummy, rFlagsReg cr)
11475 %{
11476 predicate(!((ClearArrayNode*)n)->is_large() && !((ClearArrayNode*)n)->word_copy_only() && (UseAVX <= 2));
11477 match(Set dummy (ClearArray (Binary cnt base) val));
11478 effect(USE_KILL cnt, USE_KILL base, TEMP tmp, USE_KILL val, KILL cr);
11479
11480 format %{ $$template
11481 $$emit$$"cmp InitArrayShortSize,rcx\n\t"
11482 $$emit$$"jg LARGE\n\t"
11483 $$emit$$"dec rcx\n\t"
11484 $$emit$$"js DONE\t# Zero length\n\t"
11485 $$emit$$"mov rax,(rdi,rcx,8)\t# LOOP\n\t"
11486 $$emit$$"dec rcx\n\t"
11487 $$emit$$"jge LOOP\n\t"
11488 $$emit$$"jmp DONE\n\t"
11489 $$emit$$"# LARGE:\n\t"
11490 if (UseFastStosb) {
11491 $$emit$$"shlq rcx,3\t# Convert doublewords to bytes\n\t"
11492 $$emit$$"rep stosb\t# Store rax to *rdi++ while rcx--\n\t"
11493 } else if (UseXMMForObjInit) {
11494 $$emit$$"movdq $tmp, $val\n\t"
11495 $$emit$$"punpcklqdq $tmp, $tmp\n\t"
11496 $$emit$$"vinserti128_high $tmp, $tmp\n\t"
11497 $$emit$$"jmpq L_zero_64_bytes\n\t"
11498 $$emit$$"# L_loop:\t# 64-byte LOOP\n\t"
11499 $$emit$$"vmovdqu $tmp,(rax)\n\t"
11500 $$emit$$"vmovdqu $tmp,0x20(rax)\n\t"
11501 $$emit$$"add 0x40,rax\n\t"
11502 $$emit$$"# L_zero_64_bytes:\n\t"
11503 $$emit$$"sub 0x8,rcx\n\t"
11504 $$emit$$"jge L_loop\n\t"
11505 $$emit$$"add 0x4,rcx\n\t"
11506 $$emit$$"jl L_tail\n\t"
11507 $$emit$$"vmovdqu $tmp,(rax)\n\t"
11508 $$emit$$"add 0x20,rax\n\t"
11509 $$emit$$"sub 0x4,rcx\n\t"
11510 $$emit$$"# L_tail:\t# Clearing tail bytes\n\t"
11511 $$emit$$"add 0x4,rcx\n\t"
11512 $$emit$$"jle L_end\n\t"
11513 $$emit$$"dec rcx\n\t"
11514 $$emit$$"# L_sloop:\t# 8-byte short loop\n\t"
11515 $$emit$$"vmovq xmm0,(rax)\n\t"
11516 $$emit$$"add 0x8,rax\n\t"
11517 $$emit$$"dec rcx\n\t"
11518 $$emit$$"jge L_sloop\n\t"
11519 $$emit$$"# L_end:\n\t"
11520 } else {
11521 $$emit$$"rep stosq\t# Store rax to *rdi++ while rcx--\n\t"
11522 }
11523 $$emit$$"# DONE"
11524 %}
11525 ins_encode %{
11526 __ clear_mem($base$$Register, $cnt$$Register, $val$$Register,
11527 $tmp$$XMMRegister, false, false);
11528 %}
11529 ins_pipe(pipe_slow);
11530 %}
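// These ClearArray rules take the fill value in rax ($val) rather than
// zeroing it in place; word_copy_only() -- set when stores must remain
// word-atomic (inline-type payloads) -- routes matching to the *_word_copy
// variants, whose format accordingly drops the rep-stosb fast path.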
11531
11532 instruct rep_stos_word_copy(rcx_RegL cnt, rdi_RegP base, regD tmp, rax_RegL val,
11533 Universe dummy, rFlagsReg cr)
11534 %{
11535 predicate(!((ClearArrayNode*)n)->is_large() && ((ClearArrayNode*)n)->word_copy_only() && (UseAVX <= 2));
11536 match(Set dummy (ClearArray (Binary cnt base) val));
11537 effect(USE_KILL cnt, USE_KILL base, TEMP tmp, USE_KILL val, KILL cr);
11538
11539 format %{ $$template
11540 $$emit$$"cmp InitArrayShortSize,rcx\n\t"
11541 $$emit$$"jg LARGE\n\t"
11542 $$emit$$"dec rcx\n\t"
11543 $$emit$$"js DONE\t# Zero length\n\t"
11544 $$emit$$"mov rax,(rdi,rcx,8)\t# LOOP\n\t"
11545 $$emit$$"dec rcx\n\t"
11546 $$emit$$"jge LOOP\n\t"
11547 $$emit$$"jmp DONE\n\t"
11548 $$emit$$"# LARGE:\n\t"
11549 if (UseXMMForObjInit) {
11550 $$emit$$"movdq $tmp, $val\n\t"
11551 $$emit$$"punpcklqdq $tmp, $tmp\n\t"
11552 $$emit$$"vinserti128_high $tmp, $tmp\n\t"
11553 $$emit$$"jmpq L_zero_64_bytes\n\t"
11554 $$emit$$"# L_loop:\t# 64-byte LOOP\n\t"
11555 $$emit$$"vmovdqu $tmp,(rax)\n\t"
11556 $$emit$$"vmovdqu $tmp,0x20(rax)\n\t"
11557 $$emit$$"add 0x40,rax\n\t"
11558 $$emit$$"# L_zero_64_bytes:\n\t"
11559 $$emit$$"sub 0x8,rcx\n\t"
11560 $$emit$$"jge L_loop\n\t"
11561 $$emit$$"add 0x4,rcx\n\t"
11562 $$emit$$"jl L_tail\n\t"
11563 $$emit$$"vmovdqu $tmp,(rax)\n\t"
11564 $$emit$$"add 0x20,rax\n\t"
11565 $$emit$$"sub 0x4,rcx\n\t"
11566 $$emit$$"# L_tail:\t# Clearing tail bytes\n\t"
11567 $$emit$$"add 0x4,rcx\n\t"
11568 $$emit$$"jle L_end\n\t"
11569 $$emit$$"dec rcx\n\t"
11570 $$emit$$"# L_sloop:\t# 8-byte short loop\n\t"
11571 $$emit$$"vmovq xmm0,(rax)\n\t"
11572 $$emit$$"add 0x8,rax\n\t"
11573 $$emit$$"dec rcx\n\t"
11574 $$emit$$"jge L_sloop\n\t"
11575 $$emit$$"# L_end:\n\t"
11576 } else {
11577 $$emit$$"rep stosq\t# Store rax to *rdi++ while rcx--\n\t"
11578 }
11579 $$emit$$"# DONE"
11580 %}
11581 ins_encode %{
11582 __ clear_mem($base$$Register, $cnt$$Register, $val$$Register,
11583 $tmp$$XMMRegister, false, true);
11584 %}
11585 ins_pipe(pipe_slow);
11586 %}
11587
11588 // Small ClearArray AVX512 non-constant length.
11589 instruct rep_stos_evex(rcx_RegL cnt, rdi_RegP base, legRegD tmp, kReg ktmp, rax_RegL val,
11590 Universe dummy, rFlagsReg cr)
11591 %{
11592 predicate(!((ClearArrayNode*)n)->is_large() && !((ClearArrayNode*)n)->word_copy_only() && (UseAVX > 2));
11593 match(Set dummy (ClearArray (Binary cnt base) val));
11594 ins_cost(125);
11595 effect(USE_KILL cnt, USE_KILL base, TEMP tmp, TEMP ktmp, USE_KILL val, KILL cr);
11596
11597 format %{ $$template
11598 $$emit$$"xorq rax, rax\t# ClearArray:\n\t"
11599 $$emit$$"cmp InitArrayShortSize,rcx\n\t"
11600 $$emit$$"jg LARGE\n\t"
11601 $$emit$$"dec rcx\n\t"
11602 $$emit$$"js DONE\t# Zero length\n\t"
11603 $$emit$$"mov rax,(rdi,rcx,8)\t# LOOP\n\t"
11604 $$emit$$"dec rcx\n\t"
11605 $$emit$$"jge LOOP\n\t"
11606 $$emit$$"jmp DONE\n\t"
11607 $$emit$$"# LARGE:\n\t"
11608 if (UseFastStosb) {
11609 $$emit$$"shlq rcx,3\t# Convert doublewords to bytes\n\t"
11610 $$emit$$"rep stosb\t# Store rax to *rdi++ while rcx--\n\t"
11611 } else if (UseXMMForObjInit) {
11612 $$emit$$"mov rdi,rax\n\t"
11613 $$emit$$"vpxor ymm0,ymm0,ymm0\n\t"
11614 $$emit$$"jmpq L_zero_64_bytes\n\t"
11615 $$emit$$"# L_loop:\t# 64-byte LOOP\n\t"
11623 $$emit$$"jl L_tail\n\t"
11624 $$emit$$"vmovdqu ymm0,(rax)\n\t"
11625 $$emit$$"add 0x20,rax\n\t"
11626 $$emit$$"sub 0x4,rcx\n\t"
11627 $$emit$$"# L_tail:\t# Clearing tail bytes\n\t"
11628 $$emit$$"add 0x4,rcx\n\t"
11629 $$emit$$"jle L_end\n\t"
11630 $$emit$$"dec rcx\n\t"
11631 $$emit$$"# L_sloop:\t# 8-byte short loop\n\t"
11632 $$emit$$"vmovq xmm0,(rax)\n\t"
11633 $$emit$$"add 0x8,rax\n\t"
11634 $$emit$$"dec rcx\n\t"
11635 $$emit$$"jge L_sloop\n\t"
11636 $$emit$$"# L_end:\n\t"
11637 } else {
11638 $$emit$$"rep stosq\t# Store rax to *rdi++ while rcx--\n\t"
11639 }
11640 $$emit$$"# DONE"
11641 %}
11642 ins_encode %{
11643 __ clear_mem($base$$Register, $cnt$$Register, $val$$Register,
11644 $tmp$$XMMRegister, false, false, $ktmp$$KRegister);
11645 %}
11646 ins_pipe(pipe_slow);
11647 %}
11648
11649 instruct rep_stos_evex_word_copy(rcx_RegL cnt, rdi_RegP base, legRegD tmp, kReg ktmp, rax_RegL val,
11650 Universe dummy, rFlagsReg cr)
11651 %{
11652 predicate(!((ClearArrayNode*)n)->is_large() && ((ClearArrayNode*)n)->word_copy_only() && (UseAVX > 2));
11653 match(Set dummy (ClearArray (Binary cnt base) val));
11654 ins_cost(125);
11655 effect(USE_KILL cnt, USE_KILL base, TEMP tmp, TEMP ktmp, USE_KILL val, KILL cr);
11656
11657 format %{ $$template
11658 $$emit$$"xorq rax, rax\t# ClearArray:\n\t"
11659 $$emit$$"cmp InitArrayShortSize,rcx\n\t"
11660 $$emit$$"jg LARGE\n\t"
11661 $$emit$$"dec rcx\n\t"
11662 $$emit$$"js DONE\t# Zero length\n\t"
11663 $$emit$$"mov rax,(rdi,rcx,8)\t# LOOP\n\t"
11664 $$emit$$"dec rcx\n\t"
11665 $$emit$$"jge LOOP\n\t"
11666 $$emit$$"jmp DONE\n\t"
11667 $$emit$$"# LARGE:\n\t"
11668 if (UseFastStosb) {
11669 $$emit$$"shlq rcx,3\t# Convert doublewords to bytes\n\t"
11670 $$emit$$"rep stosb\t# Store rax to *rdi++ while rcx--\n\t"
11671 } else if (UseXMMForObjInit) {
11672 $$emit$$"mov rdi,rax\n\t"
11673 $$emit$$"vpxor ymm0,ymm0,ymm0\n\t"
11674 $$emit$$"jmpq L_zero_64_bytes\n\t"
11675 $$emit$$"# L_loop:\t# 64-byte LOOP\n\t"
11683 $$emit$$"jl L_tail\n\t"
11684 $$emit$$"vmovdqu ymm0,(rax)\n\t"
11685 $$emit$$"add 0x20,rax\n\t"
11686 $$emit$$"sub 0x4,rcx\n\t"
11687 $$emit$$"# L_tail:\t# Clearing tail bytes\n\t"
11688 $$emit$$"add 0x4,rcx\n\t"
11689 $$emit$$"jle L_end\n\t"
11690 $$emit$$"dec rcx\n\t"
11691 $$emit$$"# L_sloop:\t# 8-byte short loop\n\t"
11692 $$emit$$"vmovq xmm0,(rax)\n\t"
11693 $$emit$$"add 0x8,rax\n\t"
11694 $$emit$$"dec rcx\n\t"
11695 $$emit$$"jge L_sloop\n\t"
11696 $$emit$$"# L_end:\n\t"
11697 } else {
11698 $$emit$$"rep stosq\t# Store rax to *rdi++ while rcx--\n\t"
11699 }
11700 $$emit$$"# DONE"
11701 %}
11702 ins_encode %{
11703 __ clear_mem($base$$Register, $cnt$$Register, $val$$Register,
11704 $tmp$$XMMRegister, false, true, $ktmp$$KRegister);
11705 %}
11706 ins_pipe(pipe_slow);
11707 %}
11708
11709 // Large ClearArray non-AVX512.
11710 instruct rep_stos_large(rcx_RegL cnt, rdi_RegP base, regD tmp, rax_RegL val,
11711 Universe dummy, rFlagsReg cr)
11712 %{
11713 predicate(((ClearArrayNode*)n)->is_large() && !((ClearArrayNode*)n)->word_copy_only() && (UseAVX <= 2));
11714 match(Set dummy (ClearArray (Binary cnt base) val));
11715 effect(USE_KILL cnt, USE_KILL base, TEMP tmp, USE_KILL val, KILL cr);
11716
11717 format %{ $$template
11718 if (UseFastStosb) {
11719 $$emit$$"shlq rcx,3\t# Convert doublewords to bytes\n\t"
11720 $$emit$$"rep stosb\t# Store rax to *rdi++ while rcx--"
11721 } else if (UseXMMForObjInit) {
11722 $$emit$$"movdq $tmp, $val\n\t"
11723 $$emit$$"punpcklqdq $tmp, $tmp\n\t"
11724 $$emit$$"vinserti128_high $tmp, $tmp\n\t"
11725 $$emit$$"jmpq L_zero_64_bytes\n\t"
11726 $$emit$$"# L_loop:\t# 64-byte LOOP\n\t"
11727 $$emit$$"vmovdqu $tmp,(rax)\n\t"
11728 $$emit$$"vmovdqu $tmp,0x20(rax)\n\t"
11729 $$emit$$"add 0x40,rax\n\t"
11730 $$emit$$"# L_zero_64_bytes:\n\t"
11731 $$emit$$"sub 0x8,rcx\n\t"
11732 $$emit$$"jge L_loop\n\t"
11733 $$emit$$"add 0x4,rcx\n\t"
11734 $$emit$$"jl L_tail\n\t"
11735 $$emit$$"vmovdqu $tmp,(rax)\n\t"
11736 $$emit$$"add 0x20,rax\n\t"
11737 $$emit$$"sub 0x4,rcx\n\t"
11738 $$emit$$"# L_tail:\t# Clearing tail bytes\n\t"
11739 $$emit$$"add 0x4,rcx\n\t"
11740 $$emit$$"jle L_end\n\t"
11741 $$emit$$"dec rcx\n\t"
11742 $$emit$$"# L_sloop:\t# 8-byte short loop\n\t"
11743 $$emit$$"vmovq xmm0,(rax)\n\t"
11744 $$emit$$"add 0x8,rax\n\t"
11745 $$emit$$"dec rcx\n\t"
11746 $$emit$$"jge L_sloop\n\t"
11747 $$emit$$"# L_end:\n\t"
11748 } else {
11749 $$emit$$"rep stosq\t# Store rax to *rdi++ while rcx--"
11750 }
11751 %}
11752 ins_encode %{
11753 __ clear_mem($base$$Register, $cnt$$Register, $val$$Register,
11754 $tmp$$XMMRegister, true, false);
11755 %}
11756 ins_pipe(pipe_slow);
11757 %}
11758
11759 instruct rep_stos_large_word_copy(rcx_RegL cnt, rdi_RegP base, regD tmp, rax_RegL val,
11760 Universe dummy, rFlagsReg cr)
11761 %{
11762 predicate(((ClearArrayNode*)n)->is_large() && ((ClearArrayNode*)n)->word_copy_only() && (UseAVX <= 2));
11763 match(Set dummy (ClearArray (Binary cnt base) val));
11764 effect(USE_KILL cnt, USE_KILL base, TEMP tmp, USE_KILL val, KILL cr);
11765
11766 format %{ $$template
11767 if (UseXMMForObjInit) {
11768 $$emit$$"movdq $tmp, $val\n\t"
11769 $$emit$$"punpcklqdq $tmp, $tmp\n\t"
11770 $$emit$$"vinserti128_high $tmp, $tmp\n\t"
11771 $$emit$$"jmpq L_zero_64_bytes\n\t"
11772 $$emit$$"# L_loop:\t# 64-byte LOOP\n\t"
11773 $$emit$$"vmovdqu $tmp,(rax)\n\t"
11774 $$emit$$"vmovdqu $tmp,0x20(rax)\n\t"
11775 $$emit$$"add 0x40,rax\n\t"
11776 $$emit$$"# L_zero_64_bytes:\n\t"
11777 $$emit$$"sub 0x8,rcx\n\t"
11778 $$emit$$"jge L_loop\n\t"
11779 $$emit$$"add 0x4,rcx\n\t"
11780 $$emit$$"jl L_tail\n\t"
11781 $$emit$$"vmovdqu $tmp,(rax)\n\t"
11782 $$emit$$"add 0x20,rax\n\t"
11783 $$emit$$"sub 0x4,rcx\n\t"
11784 $$emit$$"# L_tail:\t# Clearing tail bytes\n\t"
11785 $$emit$$"add 0x4,rcx\n\t"
11786 $$emit$$"jle L_end\n\t"
11787 $$emit$$"dec rcx\n\t"
11788 $$emit$$"# L_sloop:\t# 8-byte short loop\n\t"
11789 $$emit$$"vmovq xmm0,(rax)\n\t"
11790 $$emit$$"add 0x8,rax\n\t"
11791 $$emit$$"dec rcx\n\t"
11792 $$emit$$"jge L_sloop\n\t"
11793 $$emit$$"# L_end:\n\t"
11794 } else {
11795 $$emit$$"rep stosq\t# Store rax to *rdi++ while rcx--"
11796 }
11797 %}
11798 ins_encode %{
11799 __ clear_mem($base$$Register, $cnt$$Register, $val$$Register,
11800 $tmp$$XMMRegister, true, true);
11801 %}
11802 ins_pipe(pipe_slow);
11803 %}
11804
11805 // Large ClearArray AVX512.
11806 instruct rep_stos_large_evex(rcx_RegL cnt, rdi_RegP base, legRegD tmp, kReg ktmp, rax_RegL val,
11807 Universe dummy, rFlagsReg cr)
11808 %{
11809 predicate(((ClearArrayNode*)n)->is_large() && !((ClearArrayNode*)n)->word_copy_only() && (UseAVX > 2));
11810 match(Set dummy (ClearArray (Binary cnt base) val));
11811 effect(USE_KILL cnt, USE_KILL base, TEMP tmp, TEMP ktmp, USE_KILL val, KILL cr);
11812
11813 format %{ $$template
11814 if (UseFastStosb) {
11815 $$emit$$"xorq rax, rax\t# ClearArray:\n\t"
11816 $$emit$$"shlq rcx,3\t# Convert doublewords to bytes\n\t"
11817 $$emit$$"rep stosb\t# Store rax to *rdi++ while rcx--"
11818 } else if (UseXMMForObjInit) {
11819 $$emit$$"mov rdi,rax\t# ClearArray:\n\t"
11820 $$emit$$"vpxor ymm0,ymm0,ymm0\n\t"
11821 $$emit$$"jmpq L_zero_64_bytes\n\t"
11822 $$emit$$"# L_loop:\t# 64-byte LOOP\n\t"
11823 $$emit$$"vmovdqu ymm0,(rax)\n\t"
11824 $$emit$$"vmovdqu ymm0,0x20(rax)\n\t"
11825 $$emit$$"add 0x40,rax\n\t"
11826 $$emit$$"# L_zero_64_bytes:\n\t"
11827 $$emit$$"sub 0x8,rcx\n\t"
11828 $$emit$$"jge L_loop\n\t"
11829 $$emit$$"add 0x4,rcx\n\t"
11830 $$emit$$"jl L_tail\n\t"
11831 $$emit$$"vmovdqu ymm0,(rax)\n\t"
11832 $$emit$$"add 0x20,rax\n\t"
11833 $$emit$$"sub 0x4,rcx\n\t"
11834 $$emit$$"# L_tail:\t# Clearing tail bytes\n\t"
11835 $$emit$$"add 0x4,rcx\n\t"
11836 $$emit$$"jle L_end\n\t"
11837 $$emit$$"dec rcx\n\t"
11838 $$emit$$"# L_sloop:\t# 8-byte short loop\n\t"
11839 $$emit$$"vmovq xmm0,(rax)\n\t"
11840 $$emit$$"add 0x8,rax\n\t"
11841 $$emit$$"dec rcx\n\t"
11842 $$emit$$"jge L_sloop\n\t"
11843 $$emit$$"# L_end:\n\t"
11844 } else {
11845 $$emit$$"xorq rax, rax\t# ClearArray:\n\t"
11846 $$emit$$"rep stosq\t# Store rax to *rdi++ while rcx--"
11847 }
11848 %}
11849 ins_encode %{
11850 __ clear_mem($base$$Register, $cnt$$Register, $val$$Register,
11851 $tmp$$XMMRegister, true, false, $ktmp$$KRegister);
11852 %}
11853 ins_pipe(pipe_slow);
11854 %}
11855
11856 instruct rep_stos_large_evex_word_copy(rcx_RegL cnt, rdi_RegP base, legRegD tmp, kReg ktmp, rax_RegL val,
11857 Universe dummy, rFlagsReg cr)
11858 %{
11859 predicate(((ClearArrayNode*)n)->is_large() && ((ClearArrayNode*)n)->word_copy_only() && (UseAVX > 2));
11860 match(Set dummy (ClearArray (Binary cnt base) val));
11861 effect(USE_KILL cnt, USE_KILL base, TEMP tmp, TEMP ktmp, USE_KILL val, KILL cr);
11862
11863 format %{ $$template
11864 if (UseFastStosb) {
11865 $$emit$$"xorq rax, rax\t# ClearArray:\n\t"
11866 $$emit$$"shlq rcx,3\t# Convert doublewords to bytes\n\t"
11867 $$emit$$"rep stosb\t# Store rax to *rdi++ while rcx--"
11868 } else if (UseXMMForObjInit) {
11869 $$emit$$"mov rdi,rax\t# ClearArray:\n\t"
11870 $$emit$$"vpxor ymm0,ymm0,ymm0\n\t"
11871 $$emit$$"jmpq L_zero_64_bytes\n\t"
11872 $$emit$$"# L_loop:\t# 64-byte LOOP\n\t"
11873 $$emit$$"vmovdqu ymm0,(rax)\n\t"
11874 $$emit$$"vmovdqu ymm0,0x20(rax)\n\t"
11875 $$emit$$"add 0x40,rax\n\t"
11876 $$emit$$"# L_zero_64_bytes:\n\t"
11877 $$emit$$"sub 0x8,rcx\n\t"
11878 $$emit$$"jge L_loop\n\t"
11879 $$emit$$"add 0x4,rcx\n\t"
11880 $$emit$$"jl L_tail\n\t"
11881 $$emit$$"vmovdqu ymm0,(rax)\n\t"
11882 $$emit$$"add 0x20,rax\n\t"
11883 $$emit$$"sub 0x4,rcx\n\t"
11884 $$emit$$"# L_tail:\t# Clearing tail bytes\n\t"
11885 $$emit$$"add 0x4,rcx\n\t"
11886 $$emit$$"jle L_end\n\t"
11887 $$emit$$"dec rcx\n\t"
11888 $$emit$$"# L_sloop:\t# 8-byte short loop\n\t"
11889 $$emit$$"vmovq xmm0,(rax)\n\t"
11890 $$emit$$"add 0x8,rax\n\t"
11891 $$emit$$"dec rcx\n\t"
11892 $$emit$$"jge L_sloop\n\t"
11893 $$emit$$"# L_end:\n\t"
11894 } else {
11895 $$emit$$"xorq rax, rax\t# ClearArray:\n\t"
11896 $$emit$$"rep stosq\t# Store rax to *rdi++ while rcx--"
11897 }
11898 %}
11899 ins_encode %{
11900 __ clear_mem($base$$Register, $cnt$$Register, $val$$Register,
11901 $tmp$$XMMRegister, true, true, $ktmp$$KRegister);
11902 %}
11903 ins_pipe(pipe_slow);
11904 %}
11905
11906 // Small ClearArray AVX512 constant length.
11907 instruct rep_stos_im(immL cnt, rRegP base, regD tmp, rax_RegL val, kReg ktmp, Universe dummy, rFlagsReg cr)
11908 %{
11909 predicate(!((ClearArrayNode*)n)->is_large() && !((ClearArrayNode*)n)->word_copy_only() &&
11910 ((UseAVX > 2) && VM_Version::supports_avx512vlbw()));
11911 match(Set dummy (ClearArray (Binary cnt base) val));
11912 ins_cost(100);
11913 effect(TEMP tmp, USE_KILL val, TEMP ktmp, KILL cr);
11914 format %{ "clear_mem_imm $base, $cnt\n\t" %}
11915 ins_encode %{
11916 __ clear_mem($base$$Register, $cnt$$constant, $val$$Register, $tmp$$XMMRegister, $ktmp$$KRegister);
11917 %}
11918 ins_pipe(pipe_slow);
11919 %}
11920
11921 instruct string_compareL(rdi_RegP str1, rcx_RegI cnt1, rsi_RegP str2, rdx_RegI cnt2,
11922 rax_RegI result, legRegD tmp1, rFlagsReg cr)
11923 %{
11924 predicate(!VM_Version::supports_avx512vlbw() && ((StrCompNode*)n)->encoding() == StrIntrinsicNode::LL);
11925 match(Set result (StrComp (Binary str1 cnt1) (Binary str2 cnt2)));
11926 effect(TEMP tmp1, USE_KILL str1, USE_KILL str2, USE_KILL cnt1, USE_KILL cnt2, KILL cr);
11927
11928 format %{ "String Compare byte[] $str1,$cnt1,$str2,$cnt2 -> $result // KILL $tmp1" %}
11929 ins_encode %{
11930 __ string_compare($str1$$Register, $str2$$Register,
11931 $cnt1$$Register, $cnt2$$Register, $result$$Register,
11932 $tmp1$$XMMRegister, StrIntrinsicNode::LL, knoreg);
11933 %}
11934 ins_pipe( pipe_slow );
11935 %}
11936
13666
13667 ins_cost(300);
13668 format %{ "call_leaf,runtime " %}
13669 ins_encode(clear_avx, Java_To_Runtime(meth));
13670 ins_pipe(pipe_slow);
13671 %}
13672
13673 // Call runtime without safepoint and with vector arguments
13674 instruct CallLeafDirectVector(method meth)
13675 %{
13676 match(CallLeafVector);
13677 effect(USE meth);
13678
13679 ins_cost(300);
13680 format %{ "call_leaf,vector " %}
13681 ins_encode(Java_To_Runtime(meth));
13682 ins_pipe(pipe_slow);
13683 %}
13684
13685 // Call runtime without safepoint
13686 // Entry point is null; the target register holds the address to call.
13687 instruct CallLeafNoFPInDirect(rRegP target)
13688 %{
13689 predicate(n->as_Call()->entry_point() == NULL);
13690 match(CallLeafNoFP target);
13691
13692 ins_cost(300);
13693 format %{ "call_leaf_nofp,runtime indirect " %}
13694 ins_encode %{
13695 __ call($target$$Register);
13696 %}
13697
13698 ins_pipe(pipe_slow);
13699 %}
13700
13701 instruct CallLeafNoFPDirect(method meth)
13702 %{
13703 predicate(n->as_Call()->entry_point() != NULL);
13704 match(CallLeafNoFP);
13705 effect(USE meth);
13706
13707 ins_cost(300);
13708 format %{ "call_leaf_nofp,runtime " %}
13709 ins_encode(clear_avx, Java_To_Runtime(meth));
13710 ins_pipe(pipe_slow);
13711 %}
13712
13713 // Return Instruction
13714 // Remove the return address & jump to it.
13715 // Notice: We always emit a nop after a ret to make sure there is room
13716 // for safepoint patching
13717 instruct Ret()
13718 %{
13719 match(Return);
13720
13721 format %{ "ret" %}
13722 ins_encode %{
13723 __ ret(0);
|