src/hotspot/cpu/x86/x86_64.ad

  465 }
  466 
  467 // !!!!! Special hack to get all types of calls to specify the byte offset
  468 //       from the start of the call to the point where the return address
  469 //       will point.
  470 int MachCallStaticJavaNode::ret_addr_offset()
  471 {
  472   int offset = 5; // 5 bytes from start of call to where return address points
  473   offset += clear_avx_size();
  474   return offset;
  475 }
  476 
  477 int MachCallDynamicJavaNode::ret_addr_offset()
  478 {
  479   int offset = 15; // 15 bytes from start of call to where return address points
  480   offset += clear_avx_size();
  481   return offset;
  482 }
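// A rough sketch of where these constants come from (illustrative byte counts,
// assuming clear_avx_size() == 0; the immediates are placeholders):
//
//   static call:   E8 <rel32>          ; call rel32            -> 1 + 4  = 5 bytes
//   dynamic call:  48 B8 <imm64>       ; movq rax, #ic_oop     -> 10 bytes
//                  E8 <rel32>          ; call rel32            ->  5 bytes, total 15
//
// The return address pushed by the call points just past the call instruction,
// so ret_addr_offset() is simply the encoded length of the call sequence, plus
// clear_avx_size() when a vzeroupper is emitted in front of it.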
  483 
  484 int MachCallRuntimeNode::ret_addr_offset() {




  485   int offset = 13; // movq r10,#addr; callq (r10)
  486   if (this->ideal_Opcode() != Op_CallLeafVector) {
  487     offset += clear_avx_size();
  488   }
  489   return offset;
  490 }

  491 //
  492 // Compute padding required for nodes which need alignment
  493 //
  494 
  495 // The address of the call instruction needs to be 4-byte aligned to
  496 // ensure that it does not span a cache line so that it can be patched.
  497 int CallStaticJavaDirectNode::compute_padding(int current_offset) const
  498 {
  499   current_offset += clear_avx_size(); // skip vzeroupper
  500   current_offset += 1; // skip call opcode byte
  501   return align_up(current_offset, alignment_required()) - current_offset;
  502 }
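// Worked example (illustrative): with clear_avx_size() == 3 (a vzeroupper) and
// current_offset == 6, the displacement that gets patched would start at
// 6 + 3 + 1 = 10, so align_up(10, 4) - 10 = 2 bytes of padding are inserted in
// front of the node, keeping the patched 4-byte field from straddling a cache line.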
  503 
  504 // The address of the call instruction needs to be 4-byte aligned to
  505 // ensure that it does not span a cache line so that it can be patched.
  506 int CallDynamicJavaDirectNode::compute_padding(int current_offset) const
  507 {
  508   current_offset += clear_avx_size(); // skip vzeroupper
  509   current_offset += 11; // skip movq instruction + call opcode byte
  510   return align_up(current_offset, alignment_required()) - current_offset;

  879     st->print("# stack alignment check");
  880 #endif
  881   }
  882   if (C->stub_function() != NULL && BarrierSet::barrier_set()->barrier_set_nmethod() != NULL) {
  883     st->print("\n\t");
  884     st->print("cmpl    [r15_thread + #disarmed_offset], #disarmed_value\t");
  885     st->print("\n\t");
  886     st->print("je      fast_entry\t");
  887     st->print("\n\t");
  888     st->print("call    #nmethod_entry_barrier_stub\t");
  889     st->print("\n\tfast_entry:");
  890   }
  891   st->cr();
  892 }
  893 #endif
  894 
  895 void MachPrologNode::emit(CodeBuffer &cbuf, PhaseRegAlloc *ra_) const {
  896   Compile* C = ra_->C;
  897   C2_MacroAssembler _masm(&cbuf);
  898 
  899   int framesize = C->output()->frame_size_in_bytes();
  900   int bangsize = C->output()->bang_size_in_bytes();
  901 
  902   if (C->clinit_barrier_on_entry()) {
  903     assert(VM_Version::supports_fast_class_init_checks(), "sanity");
  904     assert(!C->method()->holder()->is_not_initialized(), "initialization should have been started");
  905 
  906     Label L_skip_barrier;
  907     Register klass = rscratch1;
  908 
  909     __ mov_metadata(klass, C->method()->holder()->constant_encoding());
  910     __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);
  911 
  912     __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
  913 
  914     __ bind(L_skip_barrier);
  915   }
  916 
  917   __ verified_entry(framesize, C->output()->need_stack_bang(bangsize)?bangsize:0, false, C->stub_function() != NULL);
  918 
  919   C->output()->set_frame_complete(cbuf.insts_size());
  920 
  921   if (C->has_mach_constant_base_node()) {
  922     // NOTE: We set the table base offset here because users might be
  923     // emitted before MachConstantBaseNode.
  924     ConstantTable& constant_table = C->output()->constant_table();
  925     constant_table.set_table_base_offset(constant_table.calculate_table_base_offset());
  926   }
  927 }
  928 
  929 uint MachPrologNode::size(PhaseRegAlloc* ra_) const
  930 {
  931   return MachNode::size(ra_); // too many variables; just compute it
  932                               // the hard way
  933 }
  934 
  935 int MachPrologNode::reloc() const
  936 {
  937   return 0; // a large enough number
  938 }
  939 
  940 //=============================================================================
  941 #ifndef PRODUCT
  942 void MachEpilogNode::format(PhaseRegAlloc* ra_, outputStream* st) const
  943 {
  944   Compile* C = ra_->C;
  945   if (generate_vzeroupper(C)) {
  946     st->print("vzeroupper");
  947     st->cr(); st->print("\t");
  948   }
  949 
  950   int framesize = C->output()->frame_size_in_bytes();
  951   assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  952   // Remove word for return adr already pushed
  953   // and RBP
  954   framesize -= 2*wordSize;

  962   if (do_polling() && C->is_method_compilation()) {
  963     st->print("\t");
  964     st->print_cr("cmpq     rsp, poll_offset[r15_thread] \n\t"
  965                  "ja       #safepoint_stub\t"
  966                  "# Safepoint: poll for GC");
  967   }
  968 }
  969 #endif
  970 
  971 void MachEpilogNode::emit(CodeBuffer& cbuf, PhaseRegAlloc* ra_) const
  972 {
  973   Compile* C = ra_->C;
  974   MacroAssembler _masm(&cbuf);
  975 
  976   if (generate_vzeroupper(C)) {
  977     // Clear upper bits of YMM registers when current compiled code uses
  978     // wide vectors to avoid AVX <-> SSE transition penalty during call.
  979     __ vzeroupper();
  980   }
  981 
  982   int framesize = C->output()->frame_size_in_bytes();
  983   assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  984   // Remove word for return adr already pushed
  985   // and RBP
  986   framesize -= 2*wordSize;
  987 
  988   // Note that VerifyStackAtCalls' Majik cookie does not change the frame size popped here
  989 
  990   if (framesize) {
  991     emit_opcode(cbuf, Assembler::REX_W);
  992     if (framesize < 0x80) {
  993       emit_opcode(cbuf, 0x83); // addq rsp, #framesize
  994       emit_rm(cbuf, 0x3, 0x00, RSP_enc);
  995       emit_d8(cbuf, framesize);
  996     } else {
  997       emit_opcode(cbuf, 0x81); // addq rsp, #framesize
  998       emit_rm(cbuf, 0x3, 0x00, RSP_enc);
  999       emit_d32(cbuf, framesize);
 1000     }
 1001   }
 1002 
 1003   // popq rbp
 1004   emit_opcode(cbuf, 0x58 | RBP_enc);
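// Illustrative encodings for the two branches above: a 0x60-byte frame is popped
// with the 4-byte form 48 83 C4 60 (REX.W addq rsp, imm8), a 0x100-byte frame
// needs the 7-byte form 48 81 C4 00 01 00 00 (imm32), and the popq rbp that
// follows is the single byte 5D.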
 1005 
 1006   if (StackReservedPages > 0 && C->has_reserved_stack_access()) {
 1007     __ reserved_stack_check();
 1008   }
 1009 
 1010   if (do_polling() && C->is_method_compilation()) {
 1011     MacroAssembler _masm(&cbuf);
 1012     Label dummy_label;
 1013     Label* code_stub = &dummy_label;
 1014     if (!C->output()->in_scratch_emit_size()) {
 1015       code_stub = &C->output()->safepoint_poll_table()->add_safepoint(__ offset());
 1016     }
 1017     __ relocate(relocInfo::poll_return_type);
 1018     __ safepoint_poll(*code_stub, r15_thread, true /* at_return */, true /* in_nmethod */);
 1019   }
 1020 }
 1021 
 1022 uint MachEpilogNode::size(PhaseRegAlloc* ra_) const
 1023 {
 1024   return MachNode::size(ra_); // too many variables; just compute it
 1025                               // the hard way
 1026 }
 1027 
 1028 int MachEpilogNode::reloc() const
 1029 {
 1030   return 2; // a large enough number
 1031 }
 1032 
 1033 const Pipeline* MachEpilogNode::pipeline() const
 1034 {
 1035   return MachNode::pipeline_class();
 1036 }
 1037 
 1038 //=============================================================================
 1039 
 1040 enum RC {
 1041   rc_bad,
 1042   rc_int,
 1043   rc_kreg,
 1044   rc_float,
 1045   rc_stack
 1046 };
 1047 

 1640     emit_opcode(cbuf, reg < 8 ? Assembler::REX_W : Assembler::REX_WR);
 1641     emit_opcode(cbuf, 0x8D); // LEA  reg,[SP+offset]
 1642     emit_rm(cbuf, 0x2, reg & 7, 0x04);
 1643     emit_rm(cbuf, 0x0, 0x04, RSP_enc);
 1644     emit_d32(cbuf, offset);
 1645   } else {
 1646     emit_opcode(cbuf, reg < 8 ? Assembler::REX_W : Assembler::REX_WR);
 1647     emit_opcode(cbuf, 0x8D); // LEA  reg,[SP+offset]
 1648     emit_rm(cbuf, 0x1, reg & 7, 0x04);
 1649     emit_rm(cbuf, 0x0, 0x04, RSP_enc);
 1650     emit_d8(cbuf, offset);
 1651   }
 1652 }
 1653 
 1654 uint BoxLockNode::size(PhaseRegAlloc *ra_) const
 1655 {
 1656   int offset = ra_->reg2offset(in_RegMask(0).find_first_elem());
 1657   return (offset < 0x80) ? 5 : 8; // REX
 1658 }
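// The two sizes mirror the LEA encodings emitted above: REX + 0x8D + ModRM +
// SIB + disp8 is 5 bytes when the offset fits in a signed byte, and the disp32
// form adds three more bytes for a total of 8.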
 1659
 1660 //=============================================================================
 1661 #ifndef PRODUCT
 1662 void MachUEPNode::format(PhaseRegAlloc* ra_, outputStream* st) const
 1663 {
 1664   if (UseCompressedClassPointers) {
 1665     st->print_cr("movl    rscratch1, [j_rarg0 + oopDesc::klass_offset_in_bytes()]\t# compressed klass");
 1666     st->print_cr("\tdecode_klass_not_null rscratch1, rscratch1");
 1667     st->print_cr("\tcmpq    rax, rscratch1\t # Inline cache check");
 1668   } else {
 1669     st->print_cr("\tcmpq    rax, [j_rarg0 + oopDesc::klass_offset_in_bytes()]\t"
 1670                  "# Inline cache check");
 1671   }
 1672   st->print_cr("\tjne     SharedRuntime::_ic_miss_stub");
 1673   st->print_cr("\tnop\t# nops to align entry point");
 1674 }
 1675 #endif
 1676 
 1677 void MachUEPNode::emit(CodeBuffer& cbuf, PhaseRegAlloc* ra_) const
 1678 {
 1679   MacroAssembler masm(&cbuf);

 1682     masm.load_klass(rscratch1, j_rarg0, rscratch2);
 1683     masm.cmpptr(rax, rscratch1);
 1684   } else {
 1685     masm.cmpptr(rax, Address(j_rarg0, oopDesc::klass_offset_in_bytes()));
 1686   }
 1687 
 1688   masm.jump_cc(Assembler::notEqual, RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
 1689 
 1690   /* WARNING these NOPs are critical so that the verified entry point is properly
 1691      4-byte aligned for patching by NativeJump::patch_verified_entry() */
 1692   int nops_cnt = 4 - ((cbuf.insts_size() - insts_size) & 0x3);
 1693   if (OptoBreakpoint) {
 1694     // Leave space for int3
 1695     nops_cnt -= 1;
 1696   }
 1697   nops_cnt &= 0x3; // Do not add nops if code is aligned.
 1698   if (nops_cnt > 0)
 1699     masm.nop(nops_cnt);
 1700 }
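// Worked example (illustrative): if the inline cache check above encoded to 10
// bytes, (10 & 0x3) == 2 and nops_cnt starts at 4 - 2 = 2; with -XX:+OptoBreakpoint
// one byte is reserved for the int3, leaving a single nop. The trailing "& 0x3"
// turns the nops_cnt == 4 of an already-aligned sequence into 0, so no padding is
// emitted in that case.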
 1701 
 1702 uint MachUEPNode::size(PhaseRegAlloc* ra_) const
 1703 {
 1704   return MachNode::size(ra_); // too many variables; just compute it
 1705                               // the hard way
 1706 }
 1707 
 1708 
 1709 //=============================================================================
 1710 
 1711 const bool Matcher::supports_vector_calling_convention(void) {
 1712   if (EnableVectorSupport && UseVectorStubs) {
 1713     return true;
 1714   }
 1715   return false;
 1716 }
 1717 
 1718 OptoRegPair Matcher::vector_return_value(uint ideal_reg) {
 1719   assert(EnableVectorSupport && UseVectorStubs, "sanity");
 1720   int lo = XMM0_num;
 1721   int hi = XMM0b_num;
 1722   if (ideal_reg == Op_VecX) hi = XMM0d_num;
 1723   else if (ideal_reg == Op_VecY) hi = XMM0h_num;
 1724   else if (ideal_reg == Op_VecZ) hi = XMM0p_num;
 1725   return OptoRegPair(hi, lo);
 1726 }
 1727 
 1728 // Is this branch offset short enough that a short branch can be used?

 3961   %}
 3962 %}
 3963 
 3964 // Indirect Memory Times Scale Plus Positive Index Register Plus Offset Operand
 3965 operand indPosIndexScaleOffset(any_RegP reg, immL32 off, rRegI idx, immI2 scale)
 3966 %{
 3967   constraint(ALLOC_IN_RC(ptr_reg));
 3968   predicate(n->in(2)->in(3)->in(1)->as_Type()->type()->is_long()->_lo >= 0);
 3969   match(AddP (AddP reg (LShiftL (ConvI2L idx) scale)) off);
 3970 
 3971   op_cost(10);
 3972   format %{"[$reg + $off + $idx << $scale]" %}
 3973   interface(MEMORY_INTER) %{
 3974     base($reg);
 3975     index($idx);
 3976     scale($scale);
 3977     disp($off);
 3978   %}
 3979 %}
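// Example of the shape this operand folds (illustrative; the #16 stands in for an
// array base offset): for a long-array load a[i] where i has been proven
// non-negative, the ideal subtree
//
//   AddP (AddP base (LShiftL (ConvI2L i) 3)) #16
//
// collapses into the single addressing mode [base + 16 + i << 3]; the predicate
// on the ConvI2L input is what allows the 32-bit index to be used directly,
// without an explicit sign extension.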
 3980
 3981 // Indirect Narrow Oop Plus Offset Operand
 3982 // Note: the x86 architecture doesn't support "scale * index + offset" without a base,
 3983 // so we can't free r12 even when CompressedOops::base() == NULL.
 3984 operand indCompressedOopOffset(rRegN reg, immL32 off) %{
 3985   predicate(UseCompressedOops && (CompressedOops::shift() == Address::times_8));
 3986   constraint(ALLOC_IN_RC(ptr_reg));
 3987   match(AddP (DecodeN reg) off);
 3988 
 3989   op_cost(10);
 3990   format %{"[R12 + $reg << 3 + $off] (compressed oop addressing)" %}
 3991   interface(MEMORY_INTER) %{
 3992     base(0xc); // R12
 3993     index($reg);
 3994     scale(0x3);
 3995     disp($off);
 3996   %}
 3997 %}
 3998 
 3999 // Indirect Memory Operand
 4000 operand indirectNarrow(rRegN reg)

 4307     equal(0x4, "e");
 4308     not_equal(0x5, "ne");
 4309     less(0x2, "b");
 4310     greater_equal(0x3, "ae");
 4311     less_equal(0x6, "be");
 4312     greater(0x7, "a");
 4313     overflow(0x0, "o");
 4314     no_overflow(0x1, "no");
 4315   %}
 4316 %}
 4317 
 4318 //----------OPERAND CLASSES----------------------------------------------------
 4319 // Operand Classes are groups of operands that are used to simplify
 4320 // instruction definitions by not requiring the AD writer to specify separate
 4321 // instructions for every form of operand when the instruction accepts
 4322 // multiple operand types with the same basic encoding and format.  The classic
 4323 // case of this is memory operands.
 4324 
 4325 opclass memory(indirect, indOffset8, indOffset32, indIndexOffset, indIndex,
 4326                indIndexScale, indPosIndexScale, indIndexScaleOffset, indPosIndexOffset, indPosIndexScaleOffset,
 4327                indCompressedOopOffset,
 4328                indirectNarrow, indOffset8Narrow, indOffset32Narrow,
 4329                indIndexOffsetNarrow, indIndexNarrow, indIndexScaleNarrow,
 4330                indIndexScaleOffsetNarrow, indPosIndexOffsetNarrow, indPosIndexScaleOffsetNarrow);
 4331 
 4332 //----------PIPELINE-----------------------------------------------------------
 4333 // Rules which define the behavior of the target architecture's pipeline.
 4334 pipeline %{
 4335 
 4336 //----------ATTRIBUTES---------------------------------------------------------
 4337 attributes %{
 4338   variable_size_instructions;        // Variable size instructions
 4339   max_instructions_per_bundle = 3;   // Up to 3 instructions per bundle
 4340   instruction_unit_size = 1;         // An instruction is 1 byte long
 4341   instruction_fetch_unit_size = 16;  // The processor fetches one line
 4342   instruction_fetch_units = 1;       // of 16 bytes
 4343 
 4344   // List of nop instructions
 4345   nops( MachNop );
 4346 %}
 4347 

 6893   format %{ "MEMBAR-storestore (empty encoding)" %}
 6894   ins_encode( );
 6895   ins_pipe(empty);
 6896 %}
 6897 
 6898 //----------Move Instructions--------------------------------------------------
 6899 
 6900 instruct castX2P(rRegP dst, rRegL src)
 6901 %{
 6902   match(Set dst (CastX2P src));
 6903 
 6904   format %{ "movq    $dst, $src\t# long->ptr" %}
 6905   ins_encode %{
 6906     if ($dst$$reg != $src$$reg) {
 6907       __ movptr($dst$$Register, $src$$Register);
 6908     }
 6909   %}
 6910   ins_pipe(ialu_reg_reg); // XXX
 6911 %}
 6912
 6913 instruct castP2X(rRegL dst, rRegP src)
 6914 %{
 6915   match(Set dst (CastP2X src));
 6916 
 6917   format %{ "movq    $dst, $src\t# ptr -> long" %}
 6918   ins_encode %{
 6919     if ($dst$$reg != $src$$reg) {
 6920       __ movptr($dst$$Register, $src$$Register);
 6921     }
 6922   %}
 6923   ins_pipe(ialu_reg_reg); // XXX
 6924 %}
 6925 
 6926 // Convert oop into int for vectors alignment masking
 6927 instruct convP2I(rRegI dst, rRegP src)
 6928 %{
 6929   match(Set dst (ConvL2I (CastP2X src)));
 6930 
 6931   format %{ "movl    $dst, $src\t# ptr -> int" %}
 6932   ins_encode %{

11382   effect(DEF dst, USE src);
11383   ins_cost(100);
11384   format %{ "movd    $dst,$src\t# MoveI2F" %}
11385   ins_encode %{
11386     __ movdl($dst$$XMMRegister, $src$$Register);
11387   %}
11388   ins_pipe( pipe_slow );
11389 %}
11390 
11391 instruct MoveL2D_reg_reg(regD dst, rRegL src) %{
11392   match(Set dst (MoveL2D src));
11393   effect(DEF dst, USE src);
11394   ins_cost(100);
11395   format %{ "movd    $dst,$src\t# MoveL2D" %}
11396   ins_encode %{
11397      __ movdq($dst$$XMMRegister, $src$$Register);
11398   %}
11399   ins_pipe( pipe_slow );
11400 %}
11401 

11402 // Fast clearing of an array
11403 // Small ClearArray non-AVX512.
11404 instruct rep_stos(rcx_RegL cnt, rdi_RegP base, regD tmp, rax_RegI zero,
11405                   Universe dummy, rFlagsReg cr)
11406 %{
11407   predicate(!((ClearArrayNode*)n)->is_large() && (UseAVX <= 2));
11408   match(Set dummy (ClearArray cnt base));
11409   effect(USE_KILL cnt, USE_KILL base, TEMP tmp, KILL zero, KILL cr);
11410 
11411   format %{ $$template
11412     $$emit$$"xorq    rax, rax\t# ClearArray:\n\t"
11413     $$emit$$"cmp     InitArrayShortSize,rcx\n\t"
11414     $$emit$$"jg      LARGE\n\t"
11415     $$emit$$"dec     rcx\n\t"
11416     $$emit$$"js      DONE\t# Zero length\n\t"
11417     $$emit$$"mov     rax,(rdi,rcx,8)\t# LOOP\n\t"
11418     $$emit$$"dec     rcx\n\t"
11419     $$emit$$"jge     LOOP\n\t"
11420     $$emit$$"jmp     DONE\n\t"
11421     $$emit$$"# LARGE:\n\t"
11422     if (UseFastStosb) {
11423        $$emit$$"shlq    rcx,3\t# Convert doublewords to bytes\n\t"
11424        $$emit$$"rep     stosb\t# Store rax to *rdi++ while rcx--\n\t"
11425     } else if (UseXMMForObjInit) {
11426        $$emit$$"mov     rdi,rax\n\t"
11427        $$emit$$"vpxor   ymm0,ymm0,ymm0\n\t"
11428        $$emit$$"jmpq    L_zero_64_bytes\n\t"
11429        $$emit$$"# L_loop:\t# 64-byte LOOP\n\t"

11437        $$emit$$"jl      L_tail\n\t"
11438        $$emit$$"vmovdqu ymm0,(rax)\n\t"
11439        $$emit$$"add     0x20,rax\n\t"
11440        $$emit$$"sub     0x4,rcx\n\t"
11441        $$emit$$"# L_tail:\t# Clearing tail bytes\n\t"
11442        $$emit$$"add     0x4,rcx\n\t"
11443        $$emit$$"jle     L_end\n\t"
11444        $$emit$$"dec     rcx\n\t"
11445        $$emit$$"# L_sloop:\t# 8-byte short loop\n\t"
11446        $$emit$$"vmovq   xmm0,(rax)\n\t"
11447        $$emit$$"add     0x8,rax\n\t"
11448        $$emit$$"dec     rcx\n\t"
11449        $$emit$$"jge     L_sloop\n\t"
11450        $$emit$$"# L_end:\n\t"
11451     } else {
11452        $$emit$$"rep     stosq\t# Store rax to *rdi++ while rcx--\n\t"
11453     }
11454     $$emit$$"# DONE"
11455   %}
11456   ins_encode %{
11457     __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register,
11458                  $tmp$$XMMRegister, false, knoreg);
11459   %}
11460   ins_pipe(pipe_slow);
11461 %}
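// Rough C-level sketch of the short path described by the template above
// (illustrative only; short_threshold is a placeholder for the InitArrayShortSize
// check, and the real emission is done by the clear_mem call in ins_encode):
//
//   if (cnt <= short_threshold) {                // "cmp InitArrayShortSize,rcx / jg LARGE"
//     for (int64_t i = cnt - 1; i >= 0; i--) {   // "dec rcx / js DONE ... jge LOOP"
//       ((int64_t*)base)[i] = 0;                 // store the zero held in rax
//     }
//   } else {
//     // LARGE: rep stosb, the 64-byte XMM loop, or rep stosq,
//     // depending on UseFastStosb / UseXMMForObjInit
//   }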
11462 
11463 // Small ClearArray AVX512 non-constant length.
11464 instruct rep_stos_evex(rcx_RegL cnt, rdi_RegP base, legRegD tmp, kReg ktmp, rax_RegI zero,
11465                        Universe dummy, rFlagsReg cr)
11466 %{
11467   predicate(!((ClearArrayNode*)n)->is_large() && (UseAVX > 2));
11468   match(Set dummy (ClearArray cnt base));
11469   ins_cost(125);
11470   effect(USE_KILL cnt, USE_KILL base, TEMP tmp, TEMP ktmp, KILL zero, KILL cr);
11471 
11472   format %{ $$template
11473     $$emit$$"xorq    rax, rax\t# ClearArray:\n\t"
11474     $$emit$$"cmp     InitArrayShortSize,rcx\n\t"
11475     $$emit$$"jg      LARGE\n\t"
11476     $$emit$$"dec     rcx\n\t"
11477     $$emit$$"js      DONE\t# Zero length\n\t"
11478     $$emit$$"mov     rax,(rdi,rcx,8)\t# LOOP\n\t"
11479     $$emit$$"dec     rcx\n\t"
11480     $$emit$$"jge     LOOP\n\t"
11481     $$emit$$"jmp     DONE\n\t"
11482     $$emit$$"# LARGE:\n\t"
11483     if (UseFastStosb) {
11484        $$emit$$"shlq    rcx,3\t# Convert doublewords to bytes\n\t"
11485        $$emit$$"rep     stosb\t# Store rax to *rdi++ while rcx--\n\t"
11486     } else if (UseXMMForObjInit) {
11487        $$emit$$"mov     rdi,rax\n\t"
11488        $$emit$$"vpxor   ymm0,ymm0,ymm0\n\t"
11489        $$emit$$"jmpq    L_zero_64_bytes\n\t"
11490        $$emit$$"# L_loop:\t# 64-byte LOOP\n\t"

11498        $$emit$$"jl      L_tail\n\t"
11499        $$emit$$"vmovdqu ymm0,(rax)\n\t"
11500        $$emit$$"add     0x20,rax\n\t"
11501        $$emit$$"sub     0x4,rcx\n\t"
11502        $$emit$$"# L_tail:\t# Clearing tail bytes\n\t"
11503        $$emit$$"add     0x4,rcx\n\t"
11504        $$emit$$"jle     L_end\n\t"
11505        $$emit$$"dec     rcx\n\t"
11506        $$emit$$"# L_sloop:\t# 8-byte short loop\n\t"
11507        $$emit$$"vmovq   xmm0,(rax)\n\t"
11508        $$emit$$"add     0x8,rax\n\t"
11509        $$emit$$"dec     rcx\n\t"
11510        $$emit$$"jge     L_sloop\n\t"
11511        $$emit$$"# L_end:\n\t"
11512     } else {
11513        $$emit$$"rep     stosq\t# Store rax to *rdi++ while rcx--\n\t"
11514     }
11515     $$emit$$"# DONE"
11516   %}
11517   ins_encode %{
11518     __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register,
11519                  $tmp$$XMMRegister, false, $ktmp$$KRegister);
11520   %}
11521   ins_pipe(pipe_slow);
11522 %}
11523 
11524 // Large ClearArray non-AVX512.
11525 instruct rep_stos_large(rcx_RegL cnt, rdi_RegP base, regD tmp, rax_RegI zero,
11526                         Universe dummy, rFlagsReg cr)
11527 %{
11528   predicate((UseAVX <=2) && ((ClearArrayNode*)n)->is_large());
11529   match(Set dummy (ClearArray cnt base));
11530   effect(USE_KILL cnt, USE_KILL base, TEMP tmp, KILL zero, KILL cr);
11531 
11532   format %{ $$template
11533     if (UseFastStosb) {
11534        $$emit$$"xorq    rax, rax\t# ClearArray:\n\t"
11535        $$emit$$"shlq    rcx,3\t# Convert doublewords to bytes\n\t"
11536        $$emit$$"rep     stosb\t# Store rax to *rdi++ while rcx--"
11537     } else if (UseXMMForObjInit) {
11538        $$emit$$"mov     rdi,rax\t# ClearArray:\n\t"
11539        $$emit$$"vpxor   ymm0,ymm0,ymm0\n\t"
11540        $$emit$$"jmpq    L_zero_64_bytes\n\t"
11541        $$emit$$"# L_loop:\t# 64-byte LOOP\n\t"
11542        $$emit$$"vmovdqu ymm0,(rax)\n\t"
11543        $$emit$$"vmovdqu ymm0,0x20(rax)\n\t"
11544        $$emit$$"add     0x40,rax\n\t"
11545        $$emit$$"# L_zero_64_bytes:\n\t"
11546        $$emit$$"sub     0x8,rcx\n\t"
11547        $$emit$$"jge     L_loop\n\t"
11548        $$emit$$"add     0x4,rcx\n\t"
11549        $$emit$$"jl      L_tail\n\t"
11550        $$emit$$"vmovdqu ymm0,(rax)\n\t"
11551        $$emit$$"add     0x20,rax\n\t"
11552        $$emit$$"sub     0x4,rcx\n\t"
11553        $$emit$$"# L_tail:\t# Clearing tail bytes\n\t"
11554        $$emit$$"add     0x4,rcx\n\t"
11555        $$emit$$"jle     L_end\n\t"
11556        $$emit$$"dec     rcx\n\t"
11557        $$emit$$"# L_sloop:\t# 8-byte short loop\n\t"
11558        $$emit$$"vmovq   xmm0,(rax)\n\t"
11559        $$emit$$"add     0x8,rax\n\t"
11560        $$emit$$"dec     rcx\n\t"
11561        $$emit$$"jge     L_sloop\n\t"
11562        $$emit$$"# L_end:\n\t"
11563     } else {
11564        $$emit$$"xorq    rax, rax\t# ClearArray:\n\t"
11565        $$emit$$"rep     stosq\t# Store rax to *rdi++ while rcx--"
11566     }
11567   %}
11568   ins_encode %{
11569     __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register,
11570                  $tmp$$XMMRegister, true, knoreg);
11571   %}
11572   ins_pipe(pipe_slow);
11573 %}
11574 
11575 // Large ClearArray AVX512.
11576 instruct rep_stos_large_evex(rcx_RegL cnt, rdi_RegP base, legRegD tmp, kReg ktmp, rax_RegI zero,
11577                              Universe dummy, rFlagsReg cr)
11578 %{
11579   predicate((UseAVX > 2) && ((ClearArrayNode*)n)->is_large());
11580   match(Set dummy (ClearArray cnt base));
11581   effect(USE_KILL cnt, USE_KILL base, TEMP tmp, TEMP ktmp, KILL zero, KILL cr);
11582 
11583   format %{ $$template
11584     if (UseFastStosb) {
11585        $$emit$$"xorq    rax, rax\t# ClearArray:\n\t"
11586        $$emit$$"shlq    rcx,3\t# Convert doublewords to bytes\n\t"
11587        $$emit$$"rep     stosb\t# Store rax to *rdi++ while rcx--"
11588     } else if (UseXMMForObjInit) {
11589        $$emit$$"mov     rdi,rax\t# ClearArray:\n\t"
11590        $$emit$$"vpxor   ymm0,ymm0,ymm0\n\t"
11591        $$emit$$"jmpq    L_zero_64_bytes\n\t"
11592        $$emit$$"# L_loop:\t# 64-byte LOOP\n\t"
11593        $$emit$$"vmovdqu ymm0,(rax)\n\t"
11594        $$emit$$"vmovdqu ymm0,0x20(rax)\n\t"
11595        $$emit$$"add     0x40,rax\n\t"
11596        $$emit$$"# L_zero_64_bytes:\n\t"
11597        $$emit$$"sub     0x8,rcx\n\t"
11598        $$emit$$"jge     L_loop\n\t"
11599        $$emit$$"add     0x4,rcx\n\t"
11600        $$emit$$"jl      L_tail\n\t"
11601        $$emit$$"vmovdqu ymm0,(rax)\n\t"
11602        $$emit$$"add     0x20,rax\n\t"
11603        $$emit$$"sub     0x4,rcx\n\t"
11604        $$emit$$"# L_tail:\t# Clearing tail bytes\n\t"
11605        $$emit$$"add     0x4,rcx\n\t"
11606        $$emit$$"jle     L_end\n\t"
11607        $$emit$$"dec     rcx\n\t"
11608        $$emit$$"# L_sloop:\t# 8-byte short loop\n\t"
11609        $$emit$$"vmovq   xmm0,(rax)\n\t"
11610        $$emit$$"add     0x8,rax\n\t"
11611        $$emit$$"dec     rcx\n\t"
11612        $$emit$$"jge     L_sloop\n\t"
11613        $$emit$$"# L_end:\n\t"
11614     } else {
11615        $$emit$$"xorq    rax, rax\t# ClearArray:\n\t"
11616        $$emit$$"rep     stosq\t# Store rax to *rdi++ while rcx--"
11617     }
11618   %}
11619   ins_encode %{
11620     __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register,
11621                  $tmp$$XMMRegister, true, $ktmp$$KRegister);
11622   %}
11623   ins_pipe(pipe_slow);
11624 %}
11625 
11626 // Small ClearArray AVX512 constant length.
11627 instruct rep_stos_im(immL cnt, rRegP base, regD tmp, rRegI zero, kReg ktmp, Universe dummy, rFlagsReg cr)
11628 %{
11629   predicate(!((ClearArrayNode*)n)->is_large() &&
11630               ((UseAVX > 2) && VM_Version::supports_avx512vlbw()));
11631   match(Set dummy (ClearArray cnt base));
11632   ins_cost(100);
11633   effect(TEMP tmp, TEMP zero, TEMP ktmp, KILL cr);
11634   format %{ "clear_mem_imm $base , $cnt  \n\t" %}
11635   ins_encode %{
11636    __ clear_mem($base$$Register, $cnt$$constant, $zero$$Register, $tmp$$XMMRegister, $ktmp$$KRegister);
11637   %}
11638   ins_pipe(pipe_slow);
11639 %}
11640 
11641 instruct string_compareL(rdi_RegP str1, rcx_RegI cnt1, rsi_RegP str2, rdx_RegI cnt2,
11642                          rax_RegI result, legRegD tmp1, rFlagsReg cr)
11643 %{
11644   predicate(!VM_Version::supports_avx512vlbw() && ((StrCompNode*)n)->encoding() == StrIntrinsicNode::LL);
11645   match(Set result (StrComp (Binary str1 cnt1) (Binary str2 cnt2)));
11646   effect(TEMP tmp1, USE_KILL str1, USE_KILL str2, USE_KILL cnt1, USE_KILL cnt2, KILL cr);
11647 
11648   format %{ "String Compare byte[] $str1,$cnt1,$str2,$cnt2 -> $result   // KILL $tmp1" %}
11649   ins_encode %{
11650     __ string_compare($str1$$Register, $str2$$Register,
11651                       $cnt1$$Register, $cnt2$$Register, $result$$Register,
11652                       $tmp1$$XMMRegister, StrIntrinsicNode::LL, knoreg);
11653   %}
11654   ins_pipe( pipe_slow );
11655 %}
11656 

13420 
13421   ins_cost(300);
13422   format %{ "call_leaf,runtime " %}
13423   ins_encode(clear_avx, Java_To_Runtime(meth));
13424   ins_pipe(pipe_slow);
13425 %}
13426 
13427 // Call runtime without safepoint and with vector arguments
13428 instruct CallLeafDirectVector(method meth)
13429 %{
13430   match(CallLeafVector);
13431   effect(USE meth);
13432 
13433   ins_cost(300);
13434   format %{ "call_leaf,vector " %}
13435   ins_encode(Java_To_Runtime(meth));
13436   ins_pipe(pipe_slow);
13437 %}
13438 
13439 // Call runtime without safepoint
13440 instruct CallLeafNoFPDirect(method meth)
13441 %{

13442   match(CallLeafNoFP);
13443   effect(USE meth);
13444 
13445   ins_cost(300);
13446   format %{ "call_leaf_nofp,runtime " %}
13447   ins_encode(clear_avx, Java_To_Runtime(meth));
13448   ins_pipe(pipe_slow);
13449 %}
13450 
13451 // Return Instruction
13452 // Remove the return address & jump to it.
13453 // Notice: We always emit a nop after a ret to make sure there is room
13454 // for safepoint patching
13455 instruct Ret()
13456 %{
13457   match(Return);
13458 
13459   format %{ "ret" %}
13460   ins_encode %{
13461     __ ret(0);

  465 }
  466 
  467 // !!!!! Special hack to get all types of calls to specify the byte offset
  468 //       from the start of the call to the point where the return address
  469 //       will point.
  470 int MachCallStaticJavaNode::ret_addr_offset()
  471 {
  472   int offset = 5; // 5 bytes from start of call to where return address points
  473   offset += clear_avx_size();
  474   return offset;
  475 }
  476 
  477 int MachCallDynamicJavaNode::ret_addr_offset()
  478 {
  479   int offset = 15; // 15 bytes from start of call to where return address points
  480   offset += clear_avx_size();
  481   return offset;
  482 }
  483 
  484 int MachCallRuntimeNode::ret_addr_offset() {
  485   if (_entry_point == NULL) {
  486     // CallLeafNoFPInDirect
  487     return 3; // callq (register)
  488   }
  489   int offset = 13; // movq r10,#addr; callq (r10)
  490   if (this->ideal_Opcode() != Op_CallLeafVector) {
  491     offset += clear_avx_size();
  492   }
  493   return offset;
  494 }
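// Illustrative encodings behind the constants above (byte counts only):
//
//   _entry_point != NULL:  49 BA <imm64>   ; movq r10, #addr   -> 10 bytes
//                          41 FF D2        ; callq *%r10       ->  3 bytes, total 13
//   _entry_point == NULL:  the CallLeafNoFPInDirect case calls through a register
//                          operand, a 3-byte register-indirect call as counted here.
//
// clear_avx_size() is added on top except for CallLeafVector, which keeps the
// vector state live across the call and therefore emits no vzeroupper.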
  495 
  496 //
  497 // Compute padding required for nodes which need alignment
  498 //
  499 
  500 // The address of the call instruction needs to be 4-byte aligned to
  501 // ensure that it does not span a cache line so that it can be patched.
  502 int CallStaticJavaDirectNode::compute_padding(int current_offset) const
  503 {
  504   current_offset += clear_avx_size(); // skip vzeroupper
  505   current_offset += 1; // skip call opcode byte
  506   return align_up(current_offset, alignment_required()) - current_offset;
  507 }
  508 
  509 // The address of the call instruction needs to be 4-byte aligned to
  510 // ensure that it does not span a cache line so that it can be patched.
  511 int CallDynamicJavaDirectNode::compute_padding(int current_offset) const
  512 {
  513   current_offset += clear_avx_size(); // skip vzeroupper
  514   current_offset += 11; // skip movq instruction + call opcode byte
  515   return align_up(current_offset, alignment_required()) - current_offset;

  884     st->print("# stack alignment check");
  885 #endif
  886   }
  887   if (C->stub_function() != NULL && BarrierSet::barrier_set()->barrier_set_nmethod() != NULL) {
  888     st->print("\n\t");
  889     st->print("cmpl    [r15_thread + #disarmed_offset], #disarmed_value\t");
  890     st->print("\n\t");
  891     st->print("je      fast_entry\t");
  892     st->print("\n\t");
  893     st->print("call    #nmethod_entry_barrier_stub\t");
  894     st->print("\n\tfast_entry:");
  895   }
  896   st->cr();
  897 }
  898 #endif
  899 
  900 void MachPrologNode::emit(CodeBuffer &cbuf, PhaseRegAlloc *ra_) const {
  901   Compile* C = ra_->C;
  902   C2_MacroAssembler _masm(&cbuf);
  903 



  904   if (C->clinit_barrier_on_entry()) {
  905     assert(VM_Version::supports_fast_class_init_checks(), "sanity");
  906     assert(!C->method()->holder()->is_not_initialized(), "initialization should have been started");
  907 
  908     Label L_skip_barrier;
  909     Register klass = rscratch1;
  910 
  911     __ mov_metadata(klass, C->method()->holder()->constant_encoding());
  912     __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);
  913 
  914     __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
  915 
  916     __ bind(L_skip_barrier);
  917   }
  918 
  919   __ verified_entry(C);
  920   __ bind(*_verified_entry);
  921 
  922   if (C->stub_function() == NULL) {
  923     BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  924  #ifdef _LP64
  925     if (BarrierSet::barrier_set()->barrier_set_nmethod() != NULL) {
  926       // We put the non-hot code of the nmethod entry barrier out-of-line in a stub.
  927       Label dummy_slow_path;
  928       Label dummy_continuation;
  929       Label* slow_path = &dummy_slow_path;
  930       Label* continuation = &dummy_continuation;
  931       if (!Compile::current()->output()->in_scratch_emit_size()) {
  932         // Use real labels from actual stub when not emitting code for the purpose of measuring its size
  933         C2EntryBarrierStub* stub = Compile::current()->output()->entry_barrier_table()->add_entry_barrier();
  934         slow_path = &stub->slow_path();
  935         continuation = &stub->continuation();
  936       }
  937       bs->nmethod_entry_barrier(&_masm, slow_path, continuation);
  938     }
  939 #else
  940     // Don't bother with out-of-line nmethod entry barrier stub for x86_32.
  941     bs->nmethod_entry_barrier(&_masm, NULL /* slow_path */, NULL /* continuation */);
  942 #endif
  943   }
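// Note on the dummy-label pattern above (also used for the safepoint poll in the
// epilog): when C2 is only measuring code size, in_scratch_emit_size() is true,
// no stub entry is registered, and the branches are bound to throwaway labels so
// the measured size still matches the final emission. Sketch of the pattern,
// where real_stub_label() stands in for the add_entry_barrier()/add_safepoint()
// accessors used above:
//
//   Label dummy;
//   Label* target = &dummy;
//   if (!C->output()->in_scratch_emit_size()) {
//     target = &real_stub_label();   // hypothetical accessor, see lead-in comment
//   }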
  944 
  945   C->output()->set_frame_complete(cbuf.insts_size());
  946 
  947   if (C->has_mach_constant_base_node()) {
  948     // NOTE: We set the table base offset here because users might be
  949     // emitted before MachConstantBaseNode.
  950     ConstantTable& constant_table = C->output()->constant_table();
  951     constant_table.set_table_base_offset(constant_table.calculate_table_base_offset());
  952   }
  953 }
  954 






  955 int MachPrologNode::reloc() const
  956 {
  957   return 0; // a large enough number
  958 }
  959 
  960 //=============================================================================
  961 #ifndef PRODUCT
  962 void MachEpilogNode::format(PhaseRegAlloc* ra_, outputStream* st) const
  963 {
  964   Compile* C = ra_->C;
  965   if (generate_vzeroupper(C)) {
  966     st->print("vzeroupper");
  967     st->cr(); st->print("\t");
  968   }
  969 
  970   int framesize = C->output()->frame_size_in_bytes();
  971   assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  972   // Remove word for return adr already pushed
  973   // and RBP
  974   framesize -= 2*wordSize;

  982   if (do_polling() && C->is_method_compilation()) {
  983     st->print("\t");
  984     st->print_cr("cmpq     rsp, poll_offset[r15_thread] \n\t"
  985                  "ja       #safepoint_stub\t"
  986                  "# Safepoint: poll for GC");
  987   }
  988 }
  989 #endif
  990 
  991 void MachEpilogNode::emit(CodeBuffer& cbuf, PhaseRegAlloc* ra_) const
  992 {
  993   Compile* C = ra_->C;
  994   MacroAssembler _masm(&cbuf);
  995 
  996   if (generate_vzeroupper(C)) {
  997     // Clear upper bits of YMM registers when current compiled code uses
  998     // wide vectors to avoid AVX <-> SSE transition penalty during call.
  999     __ vzeroupper();
 1000   }
 1001 
 1002   // Subtract two words to account for return address and rbp
 1003   int initial_framesize = C->output()->frame_size_in_bytes() - 2*wordSize;
 1004   __ remove_frame(initial_framesize, C->needs_stack_repair());
 1005 
 1006   if (StackReservedPages > 0 && C->has_reserved_stack_access()) {
 1007     __ reserved_stack_check();
 1008   }
 1009 
 1010   if (do_polling() && C->is_method_compilation()) {
 1011     MacroAssembler _masm(&cbuf);
 1012     Label dummy_label;
 1013     Label* code_stub = &dummy_label;
 1014     if (!C->output()->in_scratch_emit_size()) {
 1015       code_stub = &C->output()->safepoint_poll_table()->add_safepoint(__ offset());
 1016     }
 1017     __ relocate(relocInfo::poll_return_type);
 1018     __ safepoint_poll(*code_stub, r15_thread, true /* at_return */, true /* in_nmethod */);
 1019   }
 1020 }
 1021 






 1022 int MachEpilogNode::reloc() const
 1023 {
 1024   return 2; // a large enough number
 1025 }
 1026 
 1027 const Pipeline* MachEpilogNode::pipeline() const
 1028 {
 1029   return MachNode::pipeline_class();
 1030 }
 1031 
 1032 //=============================================================================
 1033 
 1034 enum RC {
 1035   rc_bad,
 1036   rc_int,
 1037   rc_kreg,
 1038   rc_float,
 1039   rc_stack
 1040 };
 1041 

 1634     emit_opcode(cbuf, reg < 8 ? Assembler::REX_W : Assembler::REX_WR);
 1635     emit_opcode(cbuf, 0x8D); // LEA  reg,[SP+offset]
 1636     emit_rm(cbuf, 0x2, reg & 7, 0x04);
 1637     emit_rm(cbuf, 0x0, 0x04, RSP_enc);
 1638     emit_d32(cbuf, offset);
 1639   } else {
 1640     emit_opcode(cbuf, reg < 8 ? Assembler::REX_W : Assembler::REX_WR);
 1641     emit_opcode(cbuf, 0x8D); // LEA  reg,[SP+offset]
 1642     emit_rm(cbuf, 0x1, reg & 7, 0x04);
 1643     emit_rm(cbuf, 0x0, 0x04, RSP_enc);
 1644     emit_d8(cbuf, offset);
 1645   }
 1646 }
 1647 
 1648 uint BoxLockNode::size(PhaseRegAlloc *ra_) const
 1649 {
 1650   int offset = ra_->reg2offset(in_RegMask(0).find_first_elem());
 1651   return (offset < 0x80) ? 5 : 8; // REX
 1652 }
 1653 
 1654 //=============================================================================
 1655 #ifndef PRODUCT
 1656 void MachVEPNode::format(PhaseRegAlloc* ra_, outputStream* st) const
 1657 {
 1658   st->print_cr("MachVEPNode");
 1659 }
 1660 #endif
 1661 
 1662 void MachVEPNode::emit(CodeBuffer& cbuf, PhaseRegAlloc* ra_) const
 1663 {
 1664   C2_MacroAssembler _masm(&cbuf);
 1665   if (!_verified) {
 1666     uint insts_size = cbuf.insts_size();
 1667     if (UseCompressedClassPointers) {
 1668       __ load_klass(rscratch1, j_rarg0, rscratch2);
 1669       __ cmpptr(rax, rscratch1);
 1670     } else {
 1671       __ cmpptr(rax, Address(j_rarg0, oopDesc::klass_offset_in_bytes()));
 1672     }
 1673     __ jump_cc(Assembler::notEqual, RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
 1674   } else {
 1675     // Unpack inline type args passed as oop and then jump to
 1676     // the verified entry point (skipping the unverified entry).
 1677     int sp_inc = __ unpack_inline_args(ra_->C, _receiver_only);
 1678     // Emit code for verified entry and save increment for stack repair on return
 1679     __ verified_entry(ra_->C, sp_inc);
 1680     __ jmp(*_verified_entry);
 1681   }
 1682 }
 1683 
 1684 //=============================================================================
 1685 #ifndef PRODUCT
 1686 void MachUEPNode::format(PhaseRegAlloc* ra_, outputStream* st) const
 1687 {
 1688   if (UseCompressedClassPointers) {
 1689     st->print_cr("movl    rscratch1, [j_rarg0 + oopDesc::klass_offset_in_bytes()]\t# compressed klass");
 1690     st->print_cr("\tdecode_klass_not_null rscratch1, rscratch1");
 1691     st->print_cr("\tcmpq    rax, rscratch1\t # Inline cache check");
 1692   } else {
 1693     st->print_cr("\tcmpq    rax, [j_rarg0 + oopDesc::klass_offset_in_bytes()]\t"
 1694                  "# Inline cache check");
 1695   }
 1696   st->print_cr("\tjne     SharedRuntime::_ic_miss_stub");
 1697   st->print_cr("\tnop\t# nops to align entry point");
 1698 }
 1699 #endif
 1700 
 1701 void MachUEPNode::emit(CodeBuffer& cbuf, PhaseRegAlloc* ra_) const
 1702 {
 1703   MacroAssembler masm(&cbuf);

 1706     masm.load_klass(rscratch1, j_rarg0, rscratch2);
 1707     masm.cmpptr(rax, rscratch1);
 1708   } else {
 1709     masm.cmpptr(rax, Address(j_rarg0, oopDesc::klass_offset_in_bytes()));
 1710   }
 1711 
 1712   masm.jump_cc(Assembler::notEqual, RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
 1713 
 1714   /* WARNING these NOPs are critical so that the verified entry point is properly
 1715      4-byte aligned for patching by NativeJump::patch_verified_entry() */
 1716   int nops_cnt = 4 - ((cbuf.insts_size() - insts_size) & 0x3);
 1717   if (OptoBreakpoint) {
 1718     // Leave space for int3
 1719     nops_cnt -= 1;
 1720   }
 1721   nops_cnt &= 0x3; // Do not add nops if code is aligned.
 1722   if (nops_cnt > 0)
 1723     masm.nop(nops_cnt);
 1724 }
 1725 







 1726 //=============================================================================
 1727 
 1728 const bool Matcher::supports_vector_calling_convention(void) {
 1729   if (EnableVectorSupport && UseVectorStubs) {
 1730     return true;
 1731   }
 1732   return false;
 1733 }
 1734 
 1735 OptoRegPair Matcher::vector_return_value(uint ideal_reg) {
 1736   assert(EnableVectorSupport && UseVectorStubs, "sanity");
 1737   int lo = XMM0_num;
 1738   int hi = XMM0b_num;
 1739   if (ideal_reg == Op_VecX) hi = XMM0d_num;
 1740   else if (ideal_reg == Op_VecY) hi = XMM0h_num;
 1741   else if (ideal_reg == Op_VecZ) hi = XMM0p_num;
 1742   return OptoRegPair(hi, lo);
 1743 }
 1744 
 1745 // Is this branch offset short enough that a short branch can be used?

 3978   %}
 3979 %}
 3980 
 3981 // Indirect Memory Times Scale Plus Positive Index Register Plus Offset Operand
 3982 operand indPosIndexScaleOffset(any_RegP reg, immL32 off, rRegI idx, immI2 scale)
 3983 %{
 3984   constraint(ALLOC_IN_RC(ptr_reg));
 3985   predicate(n->in(2)->in(3)->in(1)->as_Type()->type()->is_long()->_lo >= 0);
 3986   match(AddP (AddP reg (LShiftL (ConvI2L idx) scale)) off);
 3987 
 3988   op_cost(10);
 3989   format %{"[$reg + $off + $idx << $scale]" %}
 3990   interface(MEMORY_INTER) %{
 3991     base($reg);
 3992     index($idx);
 3993     scale($scale);
 3994     disp($off);
 3995   %}
 3996 %}
 3997 
 3998 // Indirect Narrow Oop Operand
 3999 operand indCompressedOop(rRegN reg) %{
 4000   predicate(UseCompressedOops && (CompressedOops::shift() == Address::times_8));
 4001   constraint(ALLOC_IN_RC(ptr_reg));
 4002   match(DecodeN reg);
 4003 
 4004   op_cost(10);
 4005   format %{"[R12 + $reg << 3] (compressed oop addressing)" %}
 4006   interface(MEMORY_INTER) %{
 4007     base(0xc); // R12
 4008     index($reg);
 4009     scale(0x3);
 4010     disp(0x0);
 4011   %}
 4012 %}
 4013 
 4014 // Indirect Narrow Oop Plus Offset Operand
 4015 // Note: the x86 architecture doesn't support "scale * index + offset" without a base,
 4016 // so we can't free r12 even when CompressedOops::base() == NULL.
 4017 operand indCompressedOopOffset(rRegN reg, immL32 off) %{
 4018   predicate(UseCompressedOops && (CompressedOops::shift() == Address::times_8));
 4019   constraint(ALLOC_IN_RC(ptr_reg));
 4020   match(AddP (DecodeN reg) off);
 4021 
 4022   op_cost(10);
 4023   format %{"[R12 + $reg << 3 + $off] (compressed oop addressing)" %}
 4024   interface(MEMORY_INTER) %{
 4025     base(0xc); // R12
 4026     index($reg);
 4027     scale(0x3);
 4028     disp($off);
 4029   %}
 4030 %}
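// Worked example (illustrative): with the heap base in R12 and
// CompressedOops::shift() == 3, a narrow oop n decodes to R12 + (n << 3), so a
// load from field offset 12 can be matched directly as
//
//   movl  eax, [r12 + rN*8 + 12]   // indCompressedOopOffset: base=R12, index=n, scale=8, disp=12
//
// without materializing the wide oop first. Because x86 has no
// "index*scale + disp" form without a base register, R12 stays reserved even
// when CompressedOops::base() == NULL (in that mode it simply holds zero).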
 4031 
 4032 // Indirect Memory Operand
 4033 operand indirectNarrow(rRegN reg)

 4340     equal(0x4, "e");
 4341     not_equal(0x5, "ne");
 4342     less(0x2, "b");
 4343     greater_equal(0x3, "ae");
 4344     less_equal(0x6, "be");
 4345     greater(0x7, "a");
 4346     overflow(0x0, "o");
 4347     no_overflow(0x1, "no");
 4348   %}
 4349 %}
 4350 
 4351 //----------OPERAND CLASSES----------------------------------------------------
 4352 // Operand Classes are groups of operands that are used to simplify
 4353 // instruction definitions by not requiring the AD writer to specify separate
 4354 // instructions for every form of operand when the instruction accepts
 4355 // multiple operand types with the same basic encoding and format.  The classic
 4356 // case of this is memory operands.
 4357 
 4358 opclass memory(indirect, indOffset8, indOffset32, indIndexOffset, indIndex,
 4359                indIndexScale, indPosIndexScale, indIndexScaleOffset, indPosIndexOffset, indPosIndexScaleOffset,
 4360                indCompressedOop, indCompressedOopOffset,
 4361                indirectNarrow, indOffset8Narrow, indOffset32Narrow,
 4362                indIndexOffsetNarrow, indIndexNarrow, indIndexScaleNarrow,
 4363                indIndexScaleOffsetNarrow, indPosIndexOffsetNarrow, indPosIndexScaleOffsetNarrow);
 4364 
 4365 //----------PIPELINE-----------------------------------------------------------
 4366 // Rules which define the behavior of the target architecture's pipeline.
 4367 pipeline %{
 4368 
 4369 //----------ATTRIBUTES---------------------------------------------------------
 4370 attributes %{
 4371   variable_size_instructions;        // Variable size instructions
 4372   max_instructions_per_bundle = 3;   // Up to 3 instructions per bundle
 4373   instruction_unit_size = 1;         // An instruction is 1 byte long
 4374   instruction_fetch_unit_size = 16;  // The processor fetches one line
 4375   instruction_fetch_units = 1;       // of 16 bytes
 4376 
 4377   // List of nop instructions
 4378   nops( MachNop );
 4379 %}
 4380 

 6926   format %{ "MEMBAR-storestore (empty encoding)" %}
 6927   ins_encode( );
 6928   ins_pipe(empty);
 6929 %}
 6930 
 6931 //----------Move Instructions--------------------------------------------------
 6932 
 6933 instruct castX2P(rRegP dst, rRegL src)
 6934 %{
 6935   match(Set dst (CastX2P src));
 6936 
 6937   format %{ "movq    $dst, $src\t# long->ptr" %}
 6938   ins_encode %{
 6939     if ($dst$$reg != $src$$reg) {
 6940       __ movptr($dst$$Register, $src$$Register);
 6941     }
 6942   %}
 6943   ins_pipe(ialu_reg_reg); // XXX
 6944 %}
 6945 
 6946 instruct castN2X(rRegL dst, rRegN src)
 6947 %{
 6948   match(Set dst (CastP2X src));
 6949 
 6950   format %{ "movq    $dst, $src\t# ptr -> long" %}
 6951   ins_encode %{
 6952     if ($dst$$reg != $src$$reg) {
 6953       __ movptr($dst$$Register, $src$$Register);
 6954     }
 6955   %}
 6956   ins_pipe(ialu_reg_reg); // XXX
 6957 %}
 6958 
 6959 instruct castP2X(rRegL dst, rRegP src)
 6960 %{
 6961   match(Set dst (CastP2X src));
 6962 
 6963   format %{ "movq    $dst, $src\t# ptr -> long" %}
 6964   ins_encode %{
 6965     if ($dst$$reg != $src$$reg) {
 6966       __ movptr($dst$$Register, $src$$Register);
 6967     }
 6968   %}
 6969   ins_pipe(ialu_reg_reg); // XXX
 6970 %}
 6971 
 6972 // Convert oop into int for vectors alignment masking
 6973 instruct convP2I(rRegI dst, rRegP src)
 6974 %{
 6975   match(Set dst (ConvL2I (CastP2X src)));
 6976 
 6977   format %{ "movl    $dst, $src\t# ptr -> int" %}
 6978   ins_encode %{

11428   effect(DEF dst, USE src);
11429   ins_cost(100);
11430   format %{ "movd    $dst,$src\t# MoveI2F" %}
11431   ins_encode %{
11432     __ movdl($dst$$XMMRegister, $src$$Register);
11433   %}
11434   ins_pipe( pipe_slow );
11435 %}
11436 
11437 instruct MoveL2D_reg_reg(regD dst, rRegL src) %{
11438   match(Set dst (MoveL2D src));
11439   effect(DEF dst, USE src);
11440   ins_cost(100);
11441   format %{ "movd    $dst,$src\t# MoveL2D" %}
11442   ins_encode %{
11443      __ movdq($dst$$XMMRegister, $src$$Register);
11444   %}
11445   ins_pipe( pipe_slow );
11446 %}
11447 
11448 
11449 // Fast clearing of an array
11450 // Small ClearArray non-AVX512.
11451 instruct rep_stos(rcx_RegL cnt, rdi_RegP base, regD tmp, rax_RegL val,
11452                   Universe dummy, rFlagsReg cr)
11453 %{
11454   predicate(!((ClearArrayNode*)n)->is_large() && !((ClearArrayNode*)n)->word_copy_only() && (UseAVX <= 2));
11455   match(Set dummy (ClearArray (Binary cnt base) val));
11456   effect(USE_KILL cnt, USE_KILL base, TEMP tmp, USE_KILL val, KILL cr);
11457 
11458   format %{ $$template
11459     $$emit$$"cmp     InitArrayShortSize,rcx\n\t"
11460     $$emit$$"jg      LARGE\n\t"
11461     $$emit$$"dec     rcx\n\t"
11462     $$emit$$"js      DONE\t# Zero length\n\t"
11463     $$emit$$"mov     rax,(rdi,rcx,8)\t# LOOP\n\t"
11464     $$emit$$"dec     rcx\n\t"
11465     $$emit$$"jge     LOOP\n\t"
11466     $$emit$$"jmp     DONE\n\t"
11467     $$emit$$"# LARGE:\n\t"
11468     if (UseFastStosb) {
11469        $$emit$$"shlq    rcx,3\t# Convert doublewords to bytes\n\t"
11470        $$emit$$"rep     stosb\t# Store rax to *rdi++ while rcx--\n\t"
11471     } else if (UseXMMForObjInit) {
11472        $$emit$$"movdq   $tmp, $val\n\t"
11473        $$emit$$"punpcklqdq $tmp, $tmp\n\t"
11474        $$emit$$"vinserti128_high $tmp, $tmp\n\t"
11475        $$emit$$"jmpq    L_zero_64_bytes\n\t"
11476        $$emit$$"# L_loop:\t# 64-byte LOOP\n\t"
11477        $$emit$$"vmovdqu $tmp,(rax)\n\t"
11478        $$emit$$"vmovdqu $tmp,0x20(rax)\n\t"
11479        $$emit$$"add     0x40,rax\n\t"
11480        $$emit$$"# L_zero_64_bytes:\n\t"
11481        $$emit$$"sub     0x8,rcx\n\t"
11482        $$emit$$"jge     L_loop\n\t"
11483        $$emit$$"add     0x4,rcx\n\t"
11484        $$emit$$"jl      L_tail\n\t"
11485        $$emit$$"vmovdqu $tmp,(rax)\n\t"
11486        $$emit$$"add     0x20,rax\n\t"
11487        $$emit$$"sub     0x4,rcx\n\t"
11488        $$emit$$"# L_tail:\t# Clearing tail bytes\n\t"
11489        $$emit$$"add     0x4,rcx\n\t"
11490        $$emit$$"jle     L_end\n\t"
11491        $$emit$$"dec     rcx\n\t"
11492        $$emit$$"# L_sloop:\t# 8-byte short loop\n\t"
11493        $$emit$$"vmovq   xmm0,(rax)\n\t"
11494        $$emit$$"add     0x8,rax\n\t"
11495        $$emit$$"dec     rcx\n\t"
11496        $$emit$$"jge     L_sloop\n\t"
11497        $$emit$$"# L_end:\n\t"
11498     } else {
11499        $$emit$$"rep     stosq\t# Store rax to *rdi++ while rcx--\n\t"
11500     }
11501     $$emit$$"# DONE"
11502   %}
11503   ins_encode %{
11504     __ clear_mem($base$$Register, $cnt$$Register, $val$$Register,
11505                  $tmp$$XMMRegister, false, false);
11506   %}
11507   ins_pipe(pipe_slow);
11508 %}
11509 
11510 instruct rep_stos_word_copy(rcx_RegL cnt, rdi_RegP base, regD tmp, rax_RegL val,
11511                             Universe dummy, rFlagsReg cr)
11512 %{
11513   predicate(!((ClearArrayNode*)n)->is_large() && ((ClearArrayNode*)n)->word_copy_only() && (UseAVX <= 2));
11514   match(Set dummy (ClearArray (Binary cnt base) val));
11515   effect(USE_KILL cnt, USE_KILL base, TEMP tmp, USE_KILL val, KILL cr);
11516 
11517   format %{ $$template
11518     $$emit$$"cmp     InitArrayShortSize,rcx\n\t"
11519     $$emit$$"jg      LARGE\n\t"
11520     $$emit$$"dec     rcx\n\t"
11521     $$emit$$"js      DONE\t# Zero length\n\t"
11522     $$emit$$"mov     rax,(rdi,rcx,8)\t# LOOP\n\t"
11523     $$emit$$"dec     rcx\n\t"
11524     $$emit$$"jge     LOOP\n\t"
11525     $$emit$$"jmp     DONE\n\t"
11526     $$emit$$"# LARGE:\n\t"
11527     if (UseXMMForObjInit) {
11528        $$emit$$"movdq   $tmp, $val\n\t"
11529        $$emit$$"punpcklqdq $tmp, $tmp\n\t"
11530        $$emit$$"vinserti128_high $tmp, $tmp\n\t"
11531        $$emit$$"jmpq    L_zero_64_bytes\n\t"
11532        $$emit$$"# L_loop:\t# 64-byte LOOP\n\t"
11533        $$emit$$"vmovdqu $tmp,(rax)\n\t"
11534        $$emit$$"vmovdqu $tmp,0x20(rax)\n\t"
11535        $$emit$$"add     0x40,rax\n\t"
11536        $$emit$$"# L_zero_64_bytes:\n\t"
11537        $$emit$$"sub     0x8,rcx\n\t"
11538        $$emit$$"jge     L_loop\n\t"
11539        $$emit$$"add     0x4,rcx\n\t"
11540        $$emit$$"jl      L_tail\n\t"
11541        $$emit$$"vmovdqu $tmp,(rax)\n\t"
11542        $$emit$$"add     0x20,rax\n\t"
11543        $$emit$$"sub     0x4,rcx\n\t"
11544        $$emit$$"# L_tail:\t# Clearing tail bytes\n\t"
11545        $$emit$$"add     0x4,rcx\n\t"
11546        $$emit$$"jle     L_end\n\t"
11547        $$emit$$"dec     rcx\n\t"
11548        $$emit$$"# L_sloop:\t# 8-byte short loop\n\t"
11549        $$emit$$"vmovq   xmm0,(rax)\n\t"
11550        $$emit$$"add     0x8,rax\n\t"
11551        $$emit$$"dec     rcx\n\t"
11552        $$emit$$"jge     L_sloop\n\t"
11553        $$emit$$"# L_end:\n\t"
11554     } else {
11555        $$emit$$"rep     stosq\t# Store rax to *rdi++ while rcx--\n\t"
11556     }
11557     $$emit$$"# DONE"
11558   %}
11559   ins_encode %{
11560     __ clear_mem($base$$Register, $cnt$$Register, $val$$Register,
11561                  $tmp$$XMMRegister, false, true);
11562   %}
11563   ins_pipe(pipe_slow);
11564 %}
11565 
11566 // Small ClearArray AVX512 non-constant length.
11567 instruct rep_stos_evex(rcx_RegL cnt, rdi_RegP base, legRegD tmp, kReg ktmp, rax_RegL val,
11568                        Universe dummy, rFlagsReg cr)
11569 %{
11570   predicate(!((ClearArrayNode*)n)->is_large() && !((ClearArrayNode*)n)->word_copy_only() && (UseAVX > 2));
11571   match(Set dummy (ClearArray (Binary cnt base) val));
11572   ins_cost(125);
11573   effect(USE_KILL cnt, USE_KILL base, TEMP tmp, TEMP ktmp, USE_KILL val, KILL cr);
11574 
11575   format %{ $$template
11576     $$emit$$"xorq    rax, rax\t# ClearArray:\n\t"
11577     $$emit$$"cmp     InitArrayShortSize,rcx\n\t"
11578     $$emit$$"jg      LARGE\n\t"
11579     $$emit$$"dec     rcx\n\t"
11580     $$emit$$"js      DONE\t# Zero length\n\t"
11581     $$emit$$"mov     rax,(rdi,rcx,8)\t# LOOP\n\t"
11582     $$emit$$"dec     rcx\n\t"
11583     $$emit$$"jge     LOOP\n\t"
11584     $$emit$$"jmp     DONE\n\t"
11585     $$emit$$"# LARGE:\n\t"
11586     if (UseFastStosb) {
11587        $$emit$$"shlq    rcx,3\t# Convert doublewords to bytes\n\t"
11588        $$emit$$"rep     stosb\t# Store rax to *rdi++ while rcx--\n\t"
11589     } else if (UseXMMForObjInit) {
11590        $$emit$$"mov     rdi,rax\n\t"
11591        $$emit$$"vpxor   ymm0,ymm0,ymm0\n\t"
11592        $$emit$$"jmpq    L_zero_64_bytes\n\t"
11593        $$emit$$"# L_loop:\t# 64-byte LOOP\n\t"

11601        $$emit$$"jl      L_tail\n\t"
11602        $$emit$$"vmovdqu ymm0,(rax)\n\t"
11603        $$emit$$"add     0x20,rax\n\t"
11604        $$emit$$"sub     0x4,rcx\n\t"
11605        $$emit$$"# L_tail:\t# Clearing tail bytes\n\t"
11606        $$emit$$"add     0x4,rcx\n\t"
11607        $$emit$$"jle     L_end\n\t"
11608        $$emit$$"dec     rcx\n\t"
11609        $$emit$$"# L_sloop:\t# 8-byte short loop\n\t"
11610        $$emit$$"vmovq   xmm0,(rax)\n\t"
11611        $$emit$$"add     0x8,rax\n\t"
11612        $$emit$$"dec     rcx\n\t"
11613        $$emit$$"jge     L_sloop\n\t"
11614        $$emit$$"# L_end:\n\t"
11615     } else {
11616        $$emit$$"rep     stosq\t# Store rax to *rdi++ while rcx--\n\t"
11617     }
11618     $$emit$$"# DONE"
11619   %}
11620   ins_encode %{
11621     __ clear_mem($base$$Register, $cnt$$Register, $val$$Register,
11622                  $tmp$$XMMRegister, false, false, $ktmp$$KRegister);
11623   %}
11624   ins_pipe(pipe_slow);
11625 %}
11626 
11627 instruct rep_stos_evex_word_copy(rcx_RegL cnt, rdi_RegP base, legRegD tmp, kReg ktmp, rax_RegL val,
11628                                  Universe dummy, rFlagsReg cr)

11629 %{
11630   predicate(!((ClearArrayNode*)n)->is_large() && ((ClearArrayNode*)n)->word_copy_only() && (UseAVX > 2));
11631   match(Set dummy (ClearArray (Binary cnt base) val));
11632   ins_cost(125);
11633   effect(USE_KILL cnt, USE_KILL base, TEMP tmp, TEMP ktmp, USE_KILL val, KILL cr);
11634 
11635   format %{ $$template
11636     $$emit$$"xorq    rax, rax\t# ClearArray:\n\t"
11637     $$emit$$"cmp     InitArrayShortSize,rcx\n\t"
11638     $$emit$$"jg      LARGE\n\t"
11639     $$emit$$"dec     rcx\n\t"
11640     $$emit$$"js      DONE\t# Zero length\n\t"
11641     $$emit$$"mov     rax,(rdi,rcx,8)\t# LOOP\n\t"
11642     $$emit$$"dec     rcx\n\t"
11643     $$emit$$"jge     LOOP\n\t"
11644     $$emit$$"jmp     DONE\n\t"
11645     $$emit$$"# LARGE:\n\t"
11646     if (UseFastStosb) {
11647        $$emit$$"shlq    rcx,3\t# Convert doublewords to bytes\n\t"
11648        $$emit$$"rep     stosb\t# Store rax to *rdi++ while rcx--\n\t"
11649     } else if (UseXMMForObjInit) {
11650        $$emit$$"mov     rdi,rax\n\t"
11651        $$emit$$"vpxor   ymm0,ymm0,ymm0\n\t"
11652        $$emit$$"jmpq    L_zero_64_bytes\n\t"
11653        $$emit$$"# L_loop:\t# 64-byte LOOP\n\t"

11661        $$emit$$"jl      L_tail\n\t"
11662        $$emit$$"vmovdqu ymm0,(rax)\n\t"
11663        $$emit$$"add     0x20,rax\n\t"
11664        $$emit$$"sub     0x4,rcx\n\t"
11665        $$emit$$"# L_tail:\t# Clearing tail bytes\n\t"
11666        $$emit$$"add     0x4,rcx\n\t"
11667        $$emit$$"jle     L_end\n\t"
11668        $$emit$$"dec     rcx\n\t"
11669        $$emit$$"# L_sloop:\t# 8-byte short loop\n\t"
11670        $$emit$$"vmovq   xmm0,(rax)\n\t"
11671        $$emit$$"add     0x8,rax\n\t"
11672        $$emit$$"dec     rcx\n\t"
11673        $$emit$$"jge     L_sloop\n\t"
11674        $$emit$$"# L_end:\n\t"
11675     } else {
11676        $$emit$$"rep     stosq\t# Store rax to *rdi++ while rcx--\n\t"
11677     }
11678     $$emit$$"# DONE"
11679   %}
11680   ins_encode %{
11681     __ clear_mem($base$$Register, $cnt$$Register, $val$$Register,
11682                  $tmp$$XMMRegister, false, true, $ktmp$$KRegister);
11683   %}
11684   ins_pipe(pipe_slow);
11685 %}
11686 
11687 // Large ClearArray non-AVX512.
11688 instruct rep_stos_large(rcx_RegL cnt, rdi_RegP base, regD tmp, rax_RegL val,
11689                         Universe dummy, rFlagsReg cr)
11690 %{
11691   predicate(((ClearArrayNode*)n)->is_large() && !((ClearArrayNode*)n)->word_copy_only() && (UseAVX <= 2));
11692   match(Set dummy (ClearArray (Binary cnt base) val));
11693   effect(USE_KILL cnt, USE_KILL base, TEMP tmp, USE_KILL val, KILL cr);
11694 
11695   format %{ $$template
11696     if (UseFastStosb) {
11697        $$emit$$"shlq    rcx,3\t# Convert doublewords to bytes\n\t"
11698        $$emit$$"rep     stosb\t# Store rax to *rdi++ while rcx--"
11699     } else if (UseXMMForObjInit) {
11700        $$emit$$"movdq   $tmp, $val\n\t"
11701        $$emit$$"punpcklqdq $tmp, $tmp\n\t"
11702        $$emit$$"vinserti128_high $tmp, $tmp\n\t"
11703        $$emit$$"jmpq    L_zero_64_bytes\n\t"
11704        $$emit$$"# L_loop:\t# 64-byte LOOP\n\t"
11705        $$emit$$"vmovdqu $tmp,(rax)\n\t"
11706        $$emit$$"vmovdqu $tmp,0x20(rax)\n\t"
11707        $$emit$$"add     0x40,rax\n\t"
11708        $$emit$$"# L_zero_64_bytes:\n\t"
11709        $$emit$$"sub     0x8,rcx\n\t"
11710        $$emit$$"jge     L_loop\n\t"
11711        $$emit$$"add     0x4,rcx\n\t"
11712        $$emit$$"jl      L_tail\n\t"
11713        $$emit$$"vmovdqu $tmp,(rax)\n\t"
11714        $$emit$$"add     0x20,rax\n\t"
11715        $$emit$$"sub     0x4,rcx\n\t"
11716        $$emit$$"# L_tail:\t# Clearing tail bytes\n\t"
11717        $$emit$$"add     0x4,rcx\n\t"
11718        $$emit$$"jle     L_end\n\t"
11719        $$emit$$"dec     rcx\n\t"
11720        $$emit$$"# L_sloop:\t# 8-byte short loop\n\t"
11721        $$emit$$"vmovq   xmm0,(rax)\n\t"
11722        $$emit$$"add     0x8,rax\n\t"
11723        $$emit$$"dec     rcx\n\t"
11724        $$emit$$"jge     L_sloop\n\t"
11725        $$emit$$"# L_end:\n\t"
11726     } else {
11727        $$emit$$"rep     stosq\t# Store rax to *rdi++ while rcx--"
11728     }
11729   %}
11730   ins_encode %{
11731     __ clear_mem($base$$Register, $cnt$$Register, $val$$Register,
11732                  $tmp$$XMMRegister, true, false);
11733   %}
11734   ins_pipe(pipe_slow);
11735 %}
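
For reference, the UseXMMForObjInit branch that recurs in these templates is a 64-byte main loop (two 32-byte stores), one optional 32-byte store, then an 8-byte short loop for the remainder. A scalar sketch of that structure, with plain word stores standing in for the vector stores (assumption: counts are in 8-byte words):

// Sketch of the L_loop / L_zero_64_bytes / L_tail / L_sloop structure above;
// uint64_t stores stand in for the 32-byte vmovdqu stores.
#include <cstddef>
#include <cstdint>

void xmm_style_clear(uint64_t* p, size_t word_cnt, uint64_t val) {
  size_t i = 0;
  while (i + 8 <= word_cnt) {            // L_loop: 64 bytes per iteration
    for (int k = 0; k < 8; k++) p[i + k] = val;
    i += 8;
  }
  if (i + 4 <= word_cnt) {               // single 32-byte store before L_tail
    for (int k = 0; k < 4; k++) p[i + k] = val;
    i += 4;
  }
  while (i < word_cnt) {                 // L_sloop: 8-byte short loop
    p[i++] = val;
  }
}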
11736 
11737 instruct rep_stos_large_word_copy(rcx_RegL cnt, rdi_RegP base, regD tmp, rax_RegL val,
11738                                   Universe dummy, rFlagsReg cr)
11739 %{
11740   predicate(((ClearArrayNode*)n)->is_large() && ((ClearArrayNode*)n)->word_copy_only() && (UseAVX <= 2));
11741   match(Set dummy (ClearArray (Binary cnt base) val));
11742   effect(USE_KILL cnt, USE_KILL base, TEMP tmp, USE_KILL val, KILL cr);
11743 
11744   format %{ $$template
11745     if (UseXMMForObjInit) {
11746        $$emit$$"movdq   $tmp, $val\n\t"
11747        $$emit$$"punpcklqdq $tmp, $tmp\n\t"
11748        $$emit$$"vinserti128_high $tmp, $tmp\n\t"
11749        $$emit$$"jmpq    L_zero_64_bytes\n\t"
11750        $$emit$$"# L_loop:\t# 64-byte LOOP\n\t"
11751        $$emit$$"vmovdqu $tmp,(rax)\n\t"
11752        $$emit$$"vmovdqu $tmp,0x20(rax)\n\t"
11753        $$emit$$"add     0x40,rax\n\t"
11754        $$emit$$"# L_zero_64_bytes:\n\t"
11755        $$emit$$"sub     0x8,rcx\n\t"
11756        $$emit$$"jge     L_loop\n\t"
11757        $$emit$$"add     0x4,rcx\n\t"
11758        $$emit$$"jl      L_tail\n\t"
11759        $$emit$$"vmovdqu $tmp,(rax)\n\t"
11760        $$emit$$"add     0x20,rax\n\t"
11761        $$emit$$"sub     0x4,rcx\n\t"
11762        $$emit$$"# L_tail:\t# Clearing tail bytes\n\t"
11763        $$emit$$"add     0x4,rcx\n\t"
11764        $$emit$$"jle     L_end\n\t"
11765        $$emit$$"dec     rcx\n\t"
11766        $$emit$$"# L_sloop:\t# 8-byte short loop\n\t"
11767        $$emit$$"vmovq   xmm0,(rax)\n\t"
11768        $$emit$$"add     0x8,rax\n\t"
11769        $$emit$$"dec     rcx\n\t"
11770        $$emit$$"jge     L_sloop\n\t"
11771        $$emit$$"# L_end:\n\t"
11772     } else {
11773        $$emit$$"rep     stosq\t# Store rax to *rdi++ while rcx--"
11774     }
11775   %}
11776   ins_encode %{
11777     __ clear_mem($base$$Register, $cnt$$Register, $val$$Register,
11778                  $tmp$$XMMRegister, true, true);
11779   %}
11780   ins_pipe(pipe_slow);
11781 %}
11782 
11783 // Large ClearArray AVX512.
11784 instruct rep_stos_large_evex(rcx_RegL cnt, rdi_RegP base, legRegD tmp, kReg ktmp, rax_RegL val,
11785                              Universe dummy, rFlagsReg cr)
11786 %{
11787   predicate(((ClearArrayNode*)n)->is_large() && !((ClearArrayNode*)n)->word_copy_only() && (UseAVX > 2));
11788   match(Set dummy (ClearArray (Binary cnt base) val));
11789   effect(USE_KILL cnt, USE_KILL base, TEMP tmp, TEMP ktmp, USE_KILL val, KILL cr);
11790 
11791   format %{ $$template
11792     if (UseFastStosb) {
11793        $$emit$$"xorq    rax, rax\t# ClearArray:\n\t"
11794        $$emit$$"shlq    rcx,3\t# Convert doublewords to bytes\n\t"
11795        $$emit$$"rep     stosb\t# Store rax to *rdi++ while rcx--"
11796     } else if (UseXMMForObjInit) {
11797        $$emit$$"mov     rdi,rax\t# ClearArray:\n\t"
11798        $$emit$$"vpxor   ymm0,ymm0,ymm0\n\t"
11799        $$emit$$"jmpq    L_zero_64_bytes\n\t"
11800        $$emit$$"# L_loop:\t# 64-byte LOOP\n\t"
11801        $$emit$$"vmovdqu ymm0,(rax)\n\t"
11802        $$emit$$"vmovdqu ymm0,0x20(rax)\n\t"
11803        $$emit$$"add     0x40,rax\n\t"
11804        $$emit$$"# L_zero_64_bytes:\n\t"
11805        $$emit$$"sub     0x8,rcx\n\t"
11806        $$emit$$"jge     L_loop\n\t"
11807        $$emit$$"add     0x4,rcx\n\t"
11808        $$emit$$"jl      L_tail\n\t"
11809        $$emit$$"vmovdqu ymm0,(rax)\n\t"
11810        $$emit$$"add     0x20,rax\n\t"
11811        $$emit$$"sub     0x4,rcx\n\t"
11812        $$emit$$"# L_tail:\t# Clearing tail bytes\n\t"
11813        $$emit$$"add     0x4,rcx\n\t"
11814        $$emit$$"jle     L_end\n\t"
11815        $$emit$$"dec     rcx\n\t"
11816        $$emit$$"# L_sloop:\t# 8-byte short loop\n\t"
11817        $$emit$$"vmovq   xmm0,(rax)\n\t"
11818        $$emit$$"add     0x8,rax\n\t"
11819        $$emit$$"dec     rcx\n\t"
11820        $$emit$$"jge     L_sloop\n\t"
11821        $$emit$$"# L_end:\n\t"
11822     } else {
11823        $$emit$$"xorq    rax, rax\t# ClearArray:\n\t"
11824        $$emit$$"rep     stosq\t# Store rax to *rdi++ while rcx--"
11825     }
11826   %}
11827   ins_encode %{
11828     __ clear_mem($base$$Register, $cnt$$Register, $val$$Register,
11829                  $tmp$$XMMRegister, true, false, $ktmp$$KRegister);
11830   %}
11831   ins_pipe(pipe_slow);
11832 %}
11833 
11834 instruct rep_stos_large_evex_word_copy(rcx_RegL cnt, rdi_RegP base, legRegD tmp, kReg ktmp, rax_RegL val,
11835                                        Universe dummy, rFlagsReg cr)

11836 %{
11837   predicate(((ClearArrayNode*)n)->is_large() && ((ClearArrayNode*)n)->word_copy_only() && (UseAVX > 2));
11838   match(Set dummy (ClearArray (Binary cnt base) val));
11839   effect(USE_KILL cnt, USE_KILL base, TEMP tmp, TEMP ktmp, USE_KILL val, KILL cr);
11840 
11841   format %{ $$template
11842     if (UseFastStosb) {
11843        $$emit$$"xorq    rax, rax\t# ClearArray:\n\t"
11844        $$emit$$"shlq    rcx,3\t# Convert doublewords to bytes\n\t"
11845        $$emit$$"rep     stosb\t# Store rax to *rdi++ while rcx--"
11846     } else if (UseXMMForObjInit) {
11847        $$emit$$"mov     rdi,rax\t# ClearArray:\n\t"
11848        $$emit$$"vpxor   ymm0,ymm0,ymm0\n\t"
11849        $$emit$$"jmpq    L_zero_64_bytes\n\t"
11850        $$emit$$"# L_loop:\t# 64-byte LOOP\n\t"
11851        $$emit$$"vmovdqu ymm0,(rax)\n\t"
11852        $$emit$$"vmovdqu ymm0,0x20(rax)\n\t"
11853        $$emit$$"add     0x40,rax\n\t"
11854        $$emit$$"# L_zero_64_bytes:\n\t"
11855        $$emit$$"sub     0x8,rcx\n\t"
11856        $$emit$$"jge     L_loop\n\t"
11857        $$emit$$"add     0x4,rcx\n\t"
11858        $$emit$$"jl      L_tail\n\t"
11859        $$emit$$"vmovdqu ymm0,(rax)\n\t"
11860        $$emit$$"add     0x20,rax\n\t"
11861        $$emit$$"sub     0x4,rcx\n\t"
11862        $$emit$$"# L_tail:\t# Clearing tail bytes\n\t"
11863        $$emit$$"add     0x4,rcx\n\t"
11864        $$emit$$"jle     L_end\n\t"
11865        $$emit$$"dec     rcx\n\t"
11866        $$emit$$"# L_sloop:\t# 8-byte short loop\n\t"
11867        $$emit$$"vmovq   xmm0,(rax)\n\t"
11868        $$emit$$"add     0x8,rax\n\t"
11869        $$emit$$"dec     rcx\n\t"
11870        $$emit$$"jge     L_sloop\n\t"
11871        $$emit$$"# L_end:\n\t"
11872     } else {
11873        $$emit$$"xorq    rax, rax\t# ClearArray:\n\t"
11874        $$emit$$"rep     stosq\t# Store rax to *rdi++ while rcx--"
11875     }
11876   %}
11877   ins_encode %{
11878     __ clear_mem($base$$Register, $cnt$$Register, $val$$Register,
11879                  $tmp$$XMMRegister, true, true, $ktmp$$KRegister);
11880   %}
11881   ins_pipe(pipe_slow);
11882 %}
11883 
11884 // Small ClearArray AVX512 constant length.
11885 instruct rep_stos_im(immL cnt, rRegP base, regD tmp, rax_RegL val, kReg ktmp, Universe dummy, rFlagsReg cr)
11886 %{
11887   predicate(!((ClearArrayNode*)n)->is_large() && !((ClearArrayNode*)n)->word_copy_only() &&
11888             ((UseAVX > 2) && VM_Version::supports_avx512vlbw()));
11889   match(Set dummy (ClearArray (Binary cnt base) val));
11890   ins_cost(100);
11891   effect(TEMP tmp, USE_KILL val, TEMP ktmp, KILL cr);
11892   format %{ "clear_mem_imm $base , $cnt  \n\t" %}
11893   ins_encode %{
11894     __ clear_mem($base$$Register, $cnt$$constant, $val$$Register, $tmp$$XMMRegister, $ktmp$$KRegister);
11895   %}
11896   ins_pipe(pipe_slow);
11897 %}
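
Taken together, the seven rep_stos* variants differ mainly in the trailing clear_mem() arguments, and the predicate bits appear to map straight onto them. The parameter names below are inferred from the call sites only, since the clear_mem() declaration is outside this excerpt:

// Illustration only: summarizes the clear_mem() call sites above.
// is_large / word_copy_only come from the ClearArrayNode predicate; the
// *_evex variants (UseAVX > 2) additionally pass a kReg mask temp, and
// rep_stos_im passes a constant count instead of the rcx register.
#include <cstdio>

void describe_clear_mem_call(bool is_large, bool word_copy_only, bool evex) {
  std::printf("clear_mem(base, cnt, val, tmp, %s, %s%s)\n",
              is_large ? "true" : "false",
              word_copy_only ? "true" : "false",
              evex ? ", ktmp" : "");
}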
11898 
11899 instruct string_compareL(rdi_RegP str1, rcx_RegI cnt1, rsi_RegP str2, rdx_RegI cnt2,
11900                          rax_RegI result, legRegD tmp1, rFlagsReg cr)
11901 %{
11902   predicate(!VM_Version::supports_avx512vlbw() && ((StrCompNode*)n)->encoding() == StrIntrinsicNode::LL);
11903   match(Set result (StrComp (Binary str1 cnt1) (Binary str2 cnt2)));
11904   effect(TEMP tmp1, USE_KILL str1, USE_KILL str2, USE_KILL cnt1, USE_KILL cnt2, KILL cr);
11905 
11906   format %{ "String Compare byte[] $str1,$cnt1,$str2,$cnt2 -> $result   // KILL $tmp1" %}
11907   ins_encode %{
11908     __ string_compare($str1$$Register, $str2$$Register,
11909                       $cnt1$$Register, $cnt2$$Register, $result$$Register,
11910                       $tmp1$$XMMRegister, StrIntrinsicNode::LL, knoreg);
11911   %}
11912   ins_pipe( pipe_slow );
11913 %}
11914 

13678 
13679   ins_cost(300);
13680   format %{ "call_leaf,runtime " %}
13681   ins_encode(clear_avx, Java_To_Runtime(meth));
13682   ins_pipe(pipe_slow);
13683 %}
13684 
13685 // Call runtime without safepoint and with vector arguments
13686 instruct CallLeafDirectVector(method meth)
13687 %{
13688   match(CallLeafVector);
13689   effect(USE meth);
13690 
13691   ins_cost(300);
13692   format %{ "call_leaf,vector " %}
13693   ins_encode(Java_To_Runtime(meth));
13694   ins_pipe(pipe_slow);
13695 %}
13696 
13697 // Call runtime without safepoint
13698 // entry point is null, target holds the address to call
13699 instruct CallLeafNoFPInDirect(rRegP target)
13700 %{
13701   predicate(n->as_Call()->entry_point() == NULL);
13702   match(CallLeafNoFP target);
13703 
13704   ins_cost(300);
13705   format %{ "call_leaf_nofp,runtime indirect " %}
13706   ins_encode %{
13707      __ call($target$$Register);
13708   %}
13709 
13710   ins_pipe(pipe_slow);
13711 %}
13712 
13713 instruct CallLeafNoFPDirect(method meth)
13714 %{
13715   predicate(n->as_Call()->entry_point() != NULL);
13716   match(CallLeafNoFP);
13717   effect(USE meth);
13718 
13719   ins_cost(300);
13720   format %{ "call_leaf_nofp,runtime " %}
13721   ins_encode(clear_avx, Java_To_Runtime(meth));
13722   ins_pipe(pipe_slow);
13723 %}
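
A brief restatement of how the two CallLeafNoFP forms split on the call node's entry point; this only paraphrases the predicates and encodings above and is not HotSpot code:

// With a null entry point the address is only known at run time, so the call
// goes through the $target register; otherwise the call is direct and, like
// the other leaf calls here, is preceded by clear_avx (vzeroupper).
const char* call_leaf_nofp_form(bool has_static_entry_point) {
  return has_static_entry_point
      ? "clear_avx; call <runtime entry>   // CallLeafNoFPDirect"
      : "call *%target                     // CallLeafNoFPInDirect";
}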
13724 
13725 // Return Instruction
13726 // Remove the return address & jump to it.
13727 // Notice: We always emit a nop after a ret to make sure there is room
13728 // for safepoint patching
13729 instruct Ret()
13730 %{
13731   match(Return);
13732 
13733   format %{ "ret" %}
13734   ins_encode %{
13735     __ ret(0);