
src/hotspot/cpu/x86/x86_64.ad


  473 }
  474 
  475 // !!!!! Special hack to get all types of calls to specify the byte offset
  476 //       from the start of the call to the point where the return address
  477 //       will point.
  478 int MachCallStaticJavaNode::ret_addr_offset()
  479 {
  480   int offset = 5; // 5 bytes from start of call to where return address points
  481   offset += clear_avx_size();
  482   return offset;
  483 }
  484 
  485 int MachCallDynamicJavaNode::ret_addr_offset()
  486 {
  487   int offset = 15; // 15 bytes from start of call to where return address points
  488   offset += clear_avx_size();
  489   return offset;
  490 }
  491 
  492 int MachCallRuntimeNode::ret_addr_offset() {




  493   int offset = 13; // movq r10,#addr; callq (r10)
  494   if (this->ideal_Opcode() != Op_CallLeafVector) {
  495     offset += clear_avx_size();
  496   }
  497   return offset;
  498 }
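// For reference, a sketch of the usual encodings behind the offsets above
// (assuming the standard forms the assembler emits for these calls):
//   call rel32                                  -> 1 + 4  = 5 bytes  (static Java call)
//   movabs rax, #Universe::non_oop_word(); call rel32
//                                               -> 10 + 5 = 15 bytes (dynamic Java call)
//   movabs r10, #addr; call r10                 -> 10 + 3 = 13 bytes (runtime call)
// clear_avx_size() accounts for the 3-byte vzeroupper when one is emitted before the call.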

  499 //
  500 // Compute padding required for nodes which need alignment
  501 //
  502 
  503 // The address of the call instruction needs to be 4-byte aligned to
  504 // ensure that it does not span a cache line so that it can be patched.
  505 int CallStaticJavaDirectNode::compute_padding(int current_offset) const
  506 {
  507   current_offset += clear_avx_size(); // skip vzeroupper
  508   current_offset += 1; // skip call opcode byte
  509   return align_up(current_offset, alignment_required()) - current_offset;
  510 }
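// Worked example (illustrative only): with alignment_required() == 4, if the code
// offset before this node is 0x22, then after a 3-byte vzeroupper and the 1-byte
// call opcode the displacement would start at 0x26; align_up(0x26, 4) - 0x26 = 2,
// so 2 bytes of padding are inserted and the patchable rel32 field begins on a
// 4-byte boundary.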
  511 
  512 // The address of the call instruction needs to be 4-byte aligned to
  513 // ensure that it does not span a cache line so that it can be patched.
  514 int CallDynamicJavaDirectNode::compute_padding(int current_offset) const
  515 {
  516   current_offset += clear_avx_size(); // skip vzeroupper
  517   current_offset += 11; // skip movq instruction + call opcode byte
  518   return align_up(current_offset, alignment_required()) - current_offset;

  887     st->print("# stack alignment check");
  888 #endif
  889   }
  890   if (C->stub_function() != NULL && BarrierSet::barrier_set()->barrier_set_nmethod() != NULL) {
  891     st->print("\n\t");
  892     st->print("cmpl    [r15_thread + #disarmed_guard_value_offset], #disarmed_guard_value\t");
  893     st->print("\n\t");
  894     st->print("je      fast_entry\t");
  895     st->print("\n\t");
  896     st->print("call    #nmethod_entry_barrier_stub\t");
  897     st->print("\n\tfast_entry:");
  898   }
  899   st->cr();
  900 }
  901 #endif
  902 
  903 void MachPrologNode::emit(CodeBuffer &cbuf, PhaseRegAlloc *ra_) const {
  904   Compile* C = ra_->C;
  905   C2_MacroAssembler _masm(&cbuf);
  906 
  907   int framesize = C->output()->frame_size_in_bytes();
  908   int bangsize = C->output()->bang_size_in_bytes();
  909 
  910   if (C->clinit_barrier_on_entry()) {
  911     assert(VM_Version::supports_fast_class_init_checks(), "sanity");
  912     assert(!C->method()->holder()->is_not_initialized(), "initialization should have been started");
  913 
  914     Label L_skip_barrier;
  915     Register klass = rscratch1;
  916 
  917     __ mov_metadata(klass, C->method()->holder()->constant_encoding());
  918     __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);
  919 
  920     __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
  921 
  922     __ bind(L_skip_barrier);

  923   }
  924 
  925   __ verified_entry(framesize, C->output()->need_stack_bang(bangsize)?bangsize:0, false, C->stub_function() != NULL);


  926 
  927   C->output()->set_frame_complete(cbuf.insts_size());
  928 
  929   if (C->has_mach_constant_base_node()) {
  930     // NOTE: We set the table base offset here because users might be
  931     // emitted before MachConstantBaseNode.
  932     ConstantTable& constant_table = C->output()->constant_table();
  933     constant_table.set_table_base_offset(constant_table.calculate_table_base_offset());
  934   }
  935 }
  936 
  937 uint MachPrologNode::size(PhaseRegAlloc* ra_) const
  938 {
  939   return MachNode::size(ra_); // too many variables; just compute it
  940                               // the hard way
  941 }
  942 
  943 int MachPrologNode::reloc() const
  944 {
  945   return 0; // a large enough number
  946 }
  947 
  948 //=============================================================================
  949 #ifndef PRODUCT
  950 void MachEpilogNode::format(PhaseRegAlloc* ra_, outputStream* st) const
  951 {
  952   Compile* C = ra_->C;
  953   if (generate_vzeroupper(C)) {
  954     st->print("vzeroupper");
  955     st->cr(); st->print("\t");
  956   }
  957 
  958   int framesize = C->output()->frame_size_in_bytes();
  959   assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  960   // Remove word for return adr already pushed
  961   // and RBP
  962   framesize -= 2*wordSize;

  970   if (do_polling() && C->is_method_compilation()) {
  971     st->print("\t");
  972     st->print_cr("cmpq    rsp, poll_offset[r15_thread] \n\t"
  973                  "ja      #safepoint_stub\t"
  974                  "# Safepoint: poll for GC");
  975   }
  976 }
  977 #endif
  978 
  979 void MachEpilogNode::emit(CodeBuffer& cbuf, PhaseRegAlloc* ra_) const
  980 {
  981   Compile* C = ra_->C;
  982   MacroAssembler _masm(&cbuf);
  983 
  984   if (generate_vzeroupper(C)) {
  985     // Clear upper bits of YMM registers when current compiled code uses
  986     // wide vectors to avoid AVX <-> SSE transition penalty during call.
  987     __ vzeroupper();
  988   }
  989 
  990   int framesize = C->output()->frame_size_in_bytes();
  991   assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  992   // Remove word for return adr already pushed
  993   // and RBP
  994   framesize -= 2*wordSize;
  995 
  996   // Note that VerifyStackAtCalls' Majik cookie does not change the frame size popped here
  997 
  998   if (framesize) {
  999     emit_opcode(cbuf, Assembler::REX_W);
 1000     if (framesize < 0x80) {
 1001       emit_opcode(cbuf, 0x83); // addq rsp, #framesize
 1002       emit_rm(cbuf, 0x3, 0x00, RSP_enc);
 1003       emit_d8(cbuf, framesize);
 1004     } else {
 1005       emit_opcode(cbuf, 0x81); // addq rsp, #framesize
 1006       emit_rm(cbuf, 0x3, 0x00, RSP_enc);
 1007       emit_d32(cbuf, framesize);
 1008     }
 1009   }
 1010 
 1011   // popq rbp
 1012   emit_opcode(cbuf, 0x58 | RBP_enc);
 1013 
 1014   if (StackReservedPages > 0 && C->has_reserved_stack_access()) {
 1015     __ reserved_stack_check();
 1016   }
 1017 
 1018   if (do_polling() && C->is_method_compilation()) {
 1019     MacroAssembler _masm(&cbuf);
 1020     Label dummy_label;
 1021     Label* code_stub = &dummy_label;
 1022     if (!C->output()->in_scratch_emit_size()) {
 1023       C2SafepointPollStub* stub = new (C->comp_arena()) C2SafepointPollStub(__ offset());
 1024       C->output()->add_stub(stub);
 1025       code_stub = &stub->entry();
 1026     }
 1027     __ relocate(relocInfo::poll_return_type);
 1028     __ safepoint_poll(*code_stub, r15_thread, true /* at_return */, true /* in_nmethod */);
 1029   }
 1030 }
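// Encoding note: the framesize add above uses the sign-extended 8-bit immediate
// form (REX.W 0x83 /0 ib, 4 bytes) when framesize < 0x80 and the 32-bit immediate
// form (REX.W 0x81 /0 id, 7 bytes) otherwise; both address rsp via ModRM 0xC4.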
 1031 
 1032 uint MachEpilogNode::size(PhaseRegAlloc* ra_) const
 1033 {
 1034   return MachNode::size(ra_); // too many variables; just compute it
 1035                               // the hard way
 1036 }
 1037 
 1038 int MachEpilogNode::reloc() const
 1039 {
 1040   return 2; // a large enough number
 1041 }
 1042 
 1043 const Pipeline* MachEpilogNode::pipeline() const
 1044 {
 1045   return MachNode::pipeline_class();
 1046 }
 1047 
 1048 //=============================================================================
 1049 
 1050 enum RC {
 1051   rc_bad,
 1052   rc_int,
 1053   rc_kreg,
 1054   rc_float,
 1055   rc_stack
 1056 };
 1057 

 1650     emit_opcode(cbuf, reg < 8 ? Assembler::REX_W : Assembler::REX_WR);
 1651     emit_opcode(cbuf, 0x8D); // LEA  reg,[SP+offset]
 1652     emit_rm(cbuf, 0x2, reg & 7, 0x04);
 1653     emit_rm(cbuf, 0x0, 0x04, RSP_enc);
 1654     emit_d32(cbuf, offset);
 1655   } else {
 1656     emit_opcode(cbuf, reg < 8 ? Assembler::REX_W : Assembler::REX_WR);
 1657     emit_opcode(cbuf, 0x8D); // LEA  reg,[SP+offset]
 1658     emit_rm(cbuf, 0x1, reg & 7, 0x04);
 1659     emit_rm(cbuf, 0x0, 0x04, RSP_enc);
 1660     emit_d8(cbuf, offset);
 1661   }
 1662 }
 1663 
 1664 uint BoxLockNode::size(PhaseRegAlloc *ra_) const
 1665 {
 1666   int offset = ra_->reg2offset(in_RegMask(0).find_first_elem());
 1667   return (offset < 0x80) ? 5 : 8; // REX
 1668 }
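// The 5/8 byte split mirrors the LEA encoding emitted above: REX + 0x8D + ModRM +
// SIB + disp8 is 5 bytes, while the disp32 form takes 8 bytes.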
 1669 
 1670 //=============================================================================
 1671 #ifndef PRODUCT
 1672 void MachUEPNode::format(PhaseRegAlloc* ra_, outputStream* st) const
 1673 {
 1674   if (UseCompressedClassPointers) {
 1675     st->print_cr("movl    rscratch1, [j_rarg0 + oopDesc::klass_offset_in_bytes()]\t# compressed klass");
 1676     st->print_cr("\tdecode_klass_not_null rscratch1, rscratch1");
 1677     st->print_cr("\tcmpq    rax, rscratch1\t # Inline cache check");
 1678   } else {
 1679     st->print_cr("\tcmpq    rax, [j_rarg0 + oopDesc::klass_offset_in_bytes()]\t"
 1680                  "# Inline cache check");
 1681   }
 1682   st->print_cr("\tjne     SharedRuntime::_ic_miss_stub");
 1683   st->print_cr("\tnop\t# nops to align entry point");
 1684 }
 1685 #endif
 1686 
 1687 void MachUEPNode::emit(CodeBuffer& cbuf, PhaseRegAlloc* ra_) const
 1688 {
 1689   MacroAssembler masm(&cbuf);

 1692     masm.load_klass(rscratch1, j_rarg0, rscratch2);
 1693     masm.cmpptr(rax, rscratch1);
 1694   } else {
 1695     masm.cmpptr(rax, Address(j_rarg0, oopDesc::klass_offset_in_bytes()));
 1696   }
 1697 
 1698   masm.jump_cc(Assembler::notEqual, RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
 1699 
  1700   /* WARNING these NOPs are critical so that the verified entry point is properly
  1701      4-byte aligned for patching by NativeJump::patch_verified_entry() */
 1702   int nops_cnt = 4 - ((cbuf.insts_size() - insts_size) & 0x3);
 1703   if (OptoBreakpoint) {
 1704     // Leave space for int3
 1705     nops_cnt -= 1;
 1706   }
 1707   nops_cnt &= 0x3; // Do not add nops if code is aligned.
 1708   if (nops_cnt > 0)
 1709     masm.nop(nops_cnt);
 1710 }
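// Example of the alignment math (illustrative): if the inline cache check above
// emitted 7 bytes, then 4 - (7 & 0x3) = 1 nop byte is added; with OptoBreakpoint
// the count is reduced by one so the later int3 keeps the verified entry 4-byte
// aligned. A size that is already a multiple of 4 yields nops_cnt & 0x3 == 0 and
// no padding.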
 1711 
 1712 uint MachUEPNode::size(PhaseRegAlloc* ra_) const
 1713 {
 1714   return MachNode::size(ra_); // too many variables; just compute it
 1715                               // the hard way
 1716 }
 1717 
 1718 
 1719 //=============================================================================
 1720 
 1721 const bool Matcher::supports_vector_calling_convention(void) {
 1722   if (EnableVectorSupport && UseVectorStubs) {
 1723     return true;
 1724   }
 1725   return false;
 1726 }
 1727 
 1728 OptoRegPair Matcher::vector_return_value(uint ideal_reg) {
 1729   assert(EnableVectorSupport && UseVectorStubs, "sanity");
 1730   int lo = XMM0_num;
 1731   int hi = XMM0b_num;
 1732   if (ideal_reg == Op_VecX) hi = XMM0d_num;
 1733   else if (ideal_reg == Op_VecY) hi = XMM0h_num;
 1734   else if (ideal_reg == Op_VecZ) hi = XMM0p_num;
 1735   return OptoRegPair(hi, lo);
 1736 }
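// The XMM0 slot names follow the 32-bit sub-register numbering used in the register
// definitions: XMM0b is the second 32-bit slot, XMM0d the fourth (128-bit VecX),
// XMM0h the eighth (256-bit VecY) and XMM0p the sixteenth (512-bit VecZ), so the
// returned pair spans the portion of XMM0 that carries the vector result.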
 1737 
 1738 // Is this branch offset short enough that a short branch can be used?

 3977   %}
 3978 %}
 3979 
 3980 // Indirect Memory Times Scale Plus Positive Index Register Plus Offset Operand
 3981 operand indPosIndexScaleOffset(any_RegP reg, immL32 off, rRegI idx, immI2 scale)
 3982 %{
 3983   constraint(ALLOC_IN_RC(ptr_reg));
 3984   predicate(n->in(2)->in(3)->in(1)->as_Type()->type()->is_long()->_lo >= 0);
 3985   match(AddP (AddP reg (LShiftL (ConvI2L idx) scale)) off);
 3986 
 3987   op_cost(10);
 3988   format %{"[$reg + $off + $idx << $scale]" %}
 3989   interface(MEMORY_INTER) %{
 3990     base($reg);
 3991     index($idx);
 3992     scale($scale);
 3993     disp($off);
 3994   %}
 3995 %}
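// Roughly, the predicate above appears to walk the matched shape: in(2) of the
// outer AddP is the inner AddP, its in(3) is the LShiftL, and that node's in(1) is
// the ConvI2L of the index; requiring the long type's _lo >= 0 restricts the
// operand to indexes known to be non-negative.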
 3996 
 3997 // Indirect Narrow Oop Plus Offset Operand
  3998 // Note: the x86 architecture doesn't support "scale * index + offset" without a base,
  3999 // so we can't free r12 even when CompressedOops::base() == NULL.
 4000 operand indCompressedOopOffset(rRegN reg, immL32 off) %{
 4001   predicate(UseCompressedOops && (CompressedOops::shift() == Address::times_8));
 4002   constraint(ALLOC_IN_RC(ptr_reg));
 4003   match(AddP (DecodeN reg) off);
 4004 
 4005   op_cost(10);
 4006   format %{"[R12 + $reg << 3 + $off] (compressed oop addressing)" %}
 4007   interface(MEMORY_INTER) %{
 4008     base(0xc); // R12
 4009     index($reg);
 4010     scale(0x3);
 4011     disp($off);
 4012   %}
 4013 %}
 4014 
 4015 // Indirect Memory Operand
 4016 operand indirectNarrow(rRegN reg)

 4323     equal(0x4, "e");
 4324     not_equal(0x5, "ne");
 4325     less(0x2, "b");
 4326     greater_equal(0x3, "ae");
 4327     less_equal(0x6, "be");
 4328     greater(0x7, "a");
 4329     overflow(0x0, "o");
 4330     no_overflow(0x1, "no");
 4331   %}
 4332 %}
 4333 
 4334 //----------OPERAND CLASSES----------------------------------------------------
  4335 // Operand Classes are groups of operands that are used to simplify
 4336 // instruction definitions by not requiring the AD writer to specify separate
 4337 // instructions for every form of operand when the instruction accepts
 4338 // multiple operand types with the same basic encoding and format.  The classic
 4339 // case of this is memory operands.
 4340 
 4341 opclass memory(indirect, indOffset8, indOffset32, indIndexOffset, indIndex,
 4342                indIndexScale, indPosIndexScale, indIndexScaleOffset, indPosIndexOffset, indPosIndexScaleOffset,
 4343                indCompressedOopOffset,
 4344                indirectNarrow, indOffset8Narrow, indOffset32Narrow,
 4345                indIndexOffsetNarrow, indIndexNarrow, indIndexScaleNarrow,
 4346                indIndexScaleOffsetNarrow, indPosIndexOffsetNarrow, indPosIndexScaleOffsetNarrow);
 4347 
 4348 //----------PIPELINE-----------------------------------------------------------
  4349 // Rules which define the behavior of the target architecture's pipeline.
 4350 pipeline %{
 4351 
 4352 //----------ATTRIBUTES---------------------------------------------------------
 4353 attributes %{
  4354   variable_size_instructions;        // Variable size instructions
 4355   max_instructions_per_bundle = 3;   // Up to 3 instructions per bundle
  4356   instruction_unit_size = 1;         // An instruction is 1 byte long
 4357   instruction_fetch_unit_size = 16;  // The processor fetches one line
 4358   instruction_fetch_units = 1;       // of 16 bytes
 4359 
 4360   // List of nop instructions
 4361   nops( MachNop );
 4362 %}
 4363 

 6911   format %{ "MEMBAR-storestore (empty encoding)" %}
 6912   ins_encode( );
 6913   ins_pipe(empty);
 6914 %}
 6915 
 6916 //----------Move Instructions--------------------------------------------------
 6917 
 6918 instruct castX2P(rRegP dst, rRegL src)
 6919 %{
 6920   match(Set dst (CastX2P src));
 6921 
 6922   format %{ "movq    $dst, $src\t# long->ptr" %}
 6923   ins_encode %{
 6924     if ($dst$$reg != $src$$reg) {
 6925       __ movptr($dst$$Register, $src$$Register);
 6926     }
 6927   %}
 6928   ins_pipe(ialu_reg_reg); // XXX
 6929 %}
 6930 
 6931 instruct castP2X(rRegL dst, rRegP src)
 6932 %{
 6933   match(Set dst (CastP2X src));
 6934 
 6935   format %{ "movq    $dst, $src\t# ptr -> long" %}
 6936   ins_encode %{
 6937     if ($dst$$reg != $src$$reg) {
 6938       __ movptr($dst$$Register, $src$$Register);
 6939     }
 6940   %}
 6941   ins_pipe(ialu_reg_reg); // XXX
 6942 %}
 6943 
 6944 // Convert oop into int for vectors alignment masking
 6945 instruct convP2I(rRegI dst, rRegP src)
 6946 %{
 6947   match(Set dst (ConvL2I (CastP2X src)));
 6948 
 6949   format %{ "movl    $dst, $src\t# ptr -> int" %}
 6950   ins_encode %{

11416   effect(DEF dst, USE src);
11417   ins_cost(100);
11418   format %{ "movd    $dst,$src\t# MoveI2F" %}
11419   ins_encode %{
11420     __ movdl($dst$$XMMRegister, $src$$Register);
11421   %}
11422   ins_pipe( pipe_slow );
11423 %}
11424 
11425 instruct MoveL2D_reg_reg(regD dst, rRegL src) %{
11426   match(Set dst (MoveL2D src));
11427   effect(DEF dst, USE src);
11428   ins_cost(100);
11429   format %{ "movd    $dst,$src\t# MoveL2D" %}
11430   ins_encode %{
11431      __ movdq($dst$$XMMRegister, $src$$Register);
11432   %}
11433   ins_pipe( pipe_slow );
11434 %}
11435 

11436 // Fast clearing of an array
11437 // Small ClearArray non-AVX512.
11438 instruct rep_stos(rcx_RegL cnt, rdi_RegP base, regD tmp, rax_RegI zero,
11439                   Universe dummy, rFlagsReg cr)
11440 %{
11441   predicate(!((ClearArrayNode*)n)->is_large() && (UseAVX <= 2));
11442   match(Set dummy (ClearArray cnt base));
11443   effect(USE_KILL cnt, USE_KILL base, TEMP tmp, KILL zero, KILL cr);
11444 
11445   format %{ $$template
11446     $$emit$$"xorq    rax, rax\t# ClearArray:\n\t"
11447     $$emit$$"cmp     InitArrayShortSize,rcx\n\t"
11448     $$emit$$"jg      LARGE\n\t"
11449     $$emit$$"dec     rcx\n\t"
11450     $$emit$$"js      DONE\t# Zero length\n\t"
11451     $$emit$$"mov     rax,(rdi,rcx,8)\t# LOOP\n\t"
11452     $$emit$$"dec     rcx\n\t"
11453     $$emit$$"jge     LOOP\n\t"
11454     $$emit$$"jmp     DONE\n\t"
11455     $$emit$$"# LARGE:\n\t"
11456     if (UseFastStosb) {
11457        $$emit$$"shlq    rcx,3\t# Convert doublewords to bytes\n\t"
11458        $$emit$$"rep     stosb\t# Store rax to *rdi++ while rcx--\n\t"
11459     } else if (UseXMMForObjInit) {
11460        $$emit$$"mov     rdi,rax\n\t"
11461        $$emit$$"vpxor   ymm0,ymm0,ymm0\n\t"
11462        $$emit$$"jmpq    L_zero_64_bytes\n\t"
11463        $$emit$$"# L_loop:\t# 64-byte LOOP\n\t"

11471        $$emit$$"jl      L_tail\n\t"
11472        $$emit$$"vmovdqu ymm0,(rax)\n\t"
11473        $$emit$$"add     0x20,rax\n\t"
11474        $$emit$$"sub     0x4,rcx\n\t"
11475        $$emit$$"# L_tail:\t# Clearing tail bytes\n\t"
11476        $$emit$$"add     0x4,rcx\n\t"
11477        $$emit$$"jle     L_end\n\t"
11478        $$emit$$"dec     rcx\n\t"
11479        $$emit$$"# L_sloop:\t# 8-byte short loop\n\t"
11480        $$emit$$"vmovq   xmm0,(rax)\n\t"
11481        $$emit$$"add     0x8,rax\n\t"
11482        $$emit$$"dec     rcx\n\t"
11483        $$emit$$"jge     L_sloop\n\t"
11484        $$emit$$"# L_end:\n\t"
11485     } else {
11486        $$emit$$"rep     stosq\t# Store rax to *rdi++ while rcx--\n\t"
11487     }
11488     $$emit$$"# DONE"
11489   %}
11490   ins_encode %{
11491     __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register,
11492                  $tmp$$XMMRegister, false, knoreg);
11493   %}
11494   ins_pipe(pipe_slow);
11495 %}
11496 
11497 // Small ClearArray AVX512 non-constant length.
11498 instruct rep_stos_evex(rcx_RegL cnt, rdi_RegP base, legRegD tmp, kReg ktmp, rax_RegI zero,
11499                        Universe dummy, rFlagsReg cr)
11500 %{
11501   predicate(!((ClearArrayNode*)n)->is_large() && (UseAVX > 2));
11502   match(Set dummy (ClearArray cnt base));
11503   ins_cost(125);
11504   effect(USE_KILL cnt, USE_KILL base, TEMP tmp, TEMP ktmp, KILL zero, KILL cr);
11505 
11506   format %{ $$template
11507     $$emit$$"xorq    rax, rax\t# ClearArray:\n\t"
11508     $$emit$$"cmp     InitArrayShortSize,rcx\n\t"
11509     $$emit$$"jg      LARGE\n\t"
11510     $$emit$$"dec     rcx\n\t"
11511     $$emit$$"js      DONE\t# Zero length\n\t"
11512     $$emit$$"mov     rax,(rdi,rcx,8)\t# LOOP\n\t"
11513     $$emit$$"dec     rcx\n\t"
11514     $$emit$$"jge     LOOP\n\t"
11515     $$emit$$"jmp     DONE\n\t"
11516     $$emit$$"# LARGE:\n\t"
11517     if (UseFastStosb) {
11518        $$emit$$"shlq    rcx,3\t# Convert doublewords to bytes\n\t"
11519        $$emit$$"rep     stosb\t# Store rax to *rdi++ while rcx--\n\t"
11520     } else if (UseXMMForObjInit) {
11521        $$emit$$"mov     rdi,rax\n\t"
11522        $$emit$$"vpxor   ymm0,ymm0,ymm0\n\t"
11523        $$emit$$"jmpq    L_zero_64_bytes\n\t"
11524        $$emit$$"# L_loop:\t# 64-byte LOOP\n\t"

11532        $$emit$$"jl      L_tail\n\t"
11533        $$emit$$"vmovdqu ymm0,(rax)\n\t"
11534        $$emit$$"add     0x20,rax\n\t"
11535        $$emit$$"sub     0x4,rcx\n\t"
11536        $$emit$$"# L_tail:\t# Clearing tail bytes\n\t"
11537        $$emit$$"add     0x4,rcx\n\t"
11538        $$emit$$"jle     L_end\n\t"
11539        $$emit$$"dec     rcx\n\t"
11540        $$emit$$"# L_sloop:\t# 8-byte short loop\n\t"
11541        $$emit$$"vmovq   xmm0,(rax)\n\t"
11542        $$emit$$"add     0x8,rax\n\t"
11543        $$emit$$"dec     rcx\n\t"
11544        $$emit$$"jge     L_sloop\n\t"
11545        $$emit$$"# L_end:\n\t"
11546     } else {
11547        $$emit$$"rep     stosq\t# Store rax to *rdi++ while rcx--\n\t"
11548     }
11549     $$emit$$"# DONE"
11550   %}
11551   ins_encode %{
11552     __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register,
11553                  $tmp$$XMMRegister, false, $ktmp$$KRegister);
11554   %}
11555   ins_pipe(pipe_slow);
11556 %}
11557 
11558 // Large ClearArray non-AVX512.
11559 instruct rep_stos_large(rcx_RegL cnt, rdi_RegP base, regD tmp, rax_RegI zero,
11560                         Universe dummy, rFlagsReg cr)
11561 %{
11562   predicate((UseAVX <=2) && ((ClearArrayNode*)n)->is_large());
11563   match(Set dummy (ClearArray cnt base));
11564   effect(USE_KILL cnt, USE_KILL base, TEMP tmp, KILL zero, KILL cr);

11565 
11566   format %{ $$template
11567     if (UseFastStosb) {
11568        $$emit$$"xorq    rax, rax\t# ClearArray:\n\t"
11569        $$emit$$"shlq    rcx,3\t# Convert doublewords to bytes\n\t"
11570        $$emit$$"rep     stosb\t# Store rax to *rdi++ while rcx--"
11571     } else if (UseXMMForObjInit) {
11572        $$emit$$"mov     rdi,rax\t# ClearArray:\n\t"
11573        $$emit$$"vpxor   ymm0,ymm0,ymm0\n\t"
11574        $$emit$$"jmpq    L_zero_64_bytes\n\t"
11575        $$emit$$"# L_loop:\t# 64-byte LOOP\n\t"
11576        $$emit$$"vmovdqu ymm0,(rax)\n\t"
11577        $$emit$$"vmovdqu ymm0,0x20(rax)\n\t"
11578        $$emit$$"add     0x40,rax\n\t"
11579        $$emit$$"# L_zero_64_bytes:\n\t"
11580        $$emit$$"sub     0x8,rcx\n\t"
11581        $$emit$$"jge     L_loop\n\t"
11582        $$emit$$"add     0x4,rcx\n\t"
11583        $$emit$$"jl      L_tail\n\t"
11584        $$emit$$"vmovdqu ymm0,(rax)\n\t"
11585        $$emit$$"add     0x20,rax\n\t"
11586        $$emit$$"sub     0x4,rcx\n\t"
11587        $$emit$$"# L_tail:\t# Clearing tail bytes\n\t"
11588        $$emit$$"add     0x4,rcx\n\t"
11589        $$emit$$"jle     L_end\n\t"
11590        $$emit$$"dec     rcx\n\t"
11591        $$emit$$"# L_sloop:\t# 8-byte short loop\n\t"
11592        $$emit$$"vmovq   xmm0,(rax)\n\t"
11593        $$emit$$"add     0x8,rax\n\t"
11594        $$emit$$"dec     rcx\n\t"
11595        $$emit$$"jge     L_sloop\n\t"
11596        $$emit$$"# L_end:\n\t"
11597     } else {
11598        $$emit$$"xorq    rax, rax\t# ClearArray:\n\t"
11599        $$emit$$"rep     stosq\t# Store rax to *rdi++ while rcx--"
11600     }
11601   %}
11602   ins_encode %{
11603     __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register,
11604                  $tmp$$XMMRegister, true, knoreg);
11605   %}
11606   ins_pipe(pipe_slow);
11607 %}
11608 
11609 // Large ClearArray AVX512.
11610 instruct rep_stos_large_evex(rcx_RegL cnt, rdi_RegP base, legRegD tmp, kReg ktmp, rax_RegI zero,
11611                              Universe dummy, rFlagsReg cr)
11612 %{
11613   predicate((UseAVX > 2) && ((ClearArrayNode*)n)->is_large());
11614   match(Set dummy (ClearArray cnt base));
11615   effect(USE_KILL cnt, USE_KILL base, TEMP tmp, TEMP ktmp, KILL zero, KILL cr);
11616 
11617   format %{ $$template
11618     if (UseFastStosb) {
11619        $$emit$$"xorq    rax, rax\t# ClearArray:\n\t"
11620        $$emit$$"shlq    rcx,3\t# Convert doublewords to bytes\n\t"
11621        $$emit$$"rep     stosb\t# Store rax to *rdi++ while rcx--"
11622     } else if (UseXMMForObjInit) {
11623        $$emit$$"mov     rdi,rax\t# ClearArray:\n\t"
11624        $$emit$$"vpxor   ymm0,ymm0,ymm0\n\t"
11625        $$emit$$"jmpq    L_zero_64_bytes\n\t"
11626        $$emit$$"# L_loop:\t# 64-byte LOOP\n\t"
11627        $$emit$$"vmovdqu ymm0,(rax)\n\t"
11628        $$emit$$"vmovdqu ymm0,0x20(rax)\n\t"
11629        $$emit$$"add     0x40,rax\n\t"
11630        $$emit$$"# L_zero_64_bytes:\n\t"
11631        $$emit$$"sub     0x8,rcx\n\t"
11632        $$emit$$"jge     L_loop\n\t"
11633        $$emit$$"add     0x4,rcx\n\t"
11634        $$emit$$"jl      L_tail\n\t"
11635        $$emit$$"vmovdqu ymm0,(rax)\n\t"
11636        $$emit$$"add     0x20,rax\n\t"
11637        $$emit$$"sub     0x4,rcx\n\t"
11638        $$emit$$"# L_tail:\t# Clearing tail bytes\n\t"
11639        $$emit$$"add     0x4,rcx\n\t"
11640        $$emit$$"jle     L_end\n\t"
11641        $$emit$$"dec     rcx\n\t"
11642        $$emit$$"# L_sloop:\t# 8-byte short loop\n\t"
11643        $$emit$$"vmovq   xmm0,(rax)\n\t"
11644        $$emit$$"add     0x8,rax\n\t"
11645        $$emit$$"dec     rcx\n\t"
11646        $$emit$$"jge     L_sloop\n\t"
11647        $$emit$$"# L_end:\n\t"
11648     } else {
11649        $$emit$$"xorq    rax, rax\t# ClearArray:\n\t"
11650        $$emit$$"rep     stosq\t# Store rax to *rdi++ while rcx--"
11651     }
11652   %}
11653   ins_encode %{
11654     __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register,
11655                  $tmp$$XMMRegister, true, $ktmp$$KRegister);
11656   %}
11657   ins_pipe(pipe_slow);
11658 %}
11659 
11660 // Small ClearArray AVX512 constant length.
11661 instruct rep_stos_im(immL cnt, rRegP base, regD tmp, rRegI zero, kReg ktmp, Universe dummy, rFlagsReg cr)
11662 %{
11663   predicate(!((ClearArrayNode*)n)->is_large() &&
11664               ((UseAVX > 2) && VM_Version::supports_avx512vlbw()));
11665   match(Set dummy (ClearArray cnt base));
11666   ins_cost(100);
11667   effect(TEMP tmp, TEMP zero, TEMP ktmp, KILL cr);
11668   format %{ "clear_mem_imm $base , $cnt  \n\t" %}
11669   ins_encode %{
11670    __ clear_mem($base$$Register, $cnt$$constant, $zero$$Register, $tmp$$XMMRegister, $ktmp$$KRegister);
11671   %}
11672   ins_pipe(pipe_slow);
11673 %}
11674 
11675 instruct string_compareL(rdi_RegP str1, rcx_RegI cnt1, rsi_RegP str2, rdx_RegI cnt2,
11676                          rax_RegI result, legRegD tmp1, rFlagsReg cr)
11677 %{
11678   predicate(!VM_Version::supports_avx512vlbw() && ((StrCompNode*)n)->encoding() == StrIntrinsicNode::LL);
11679   match(Set result (StrComp (Binary str1 cnt1) (Binary str2 cnt2)));
11680   effect(TEMP tmp1, USE_KILL str1, USE_KILL str2, USE_KILL cnt1, USE_KILL cnt2, KILL cr);
11681 
11682   format %{ "String Compare byte[] $str1,$cnt1,$str2,$cnt2 -> $result   // KILL $tmp1" %}
11683   ins_encode %{
11684     __ string_compare($str1$$Register, $str2$$Register,
11685                       $cnt1$$Register, $cnt2$$Register, $result$$Register,
11686                       $tmp1$$XMMRegister, StrIntrinsicNode::LL, knoreg);
11687   %}
11688   ins_pipe( pipe_slow );
11689 %}
11690 

13420 
13421   ins_cost(300);
13422   format %{ "call_leaf,runtime " %}
13423   ins_encode(clear_avx, Java_To_Runtime(meth));
13424   ins_pipe(pipe_slow);
13425 %}
13426 
13427 // Call runtime without safepoint and with vector arguments
13428 instruct CallLeafDirectVector(method meth)
13429 %{
13430   match(CallLeafVector);
13431   effect(USE meth);
13432 
13433   ins_cost(300);
13434   format %{ "call_leaf,vector " %}
13435   ins_encode(Java_To_Runtime(meth));
13436   ins_pipe(pipe_slow);
13437 %}
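// Note that, unlike the other leaf calls, this encoding omits clear_avx: the
// vector arguments live in the upper YMM/ZMM lanes, so a vzeroupper here would
// clobber them. ret_addr_offset() makes the matching exception for
// Op_CallLeafVector above.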
13438 
13439 // Call runtime without safepoint

13440 instruct CallLeafNoFPDirect(method meth)
13441 %{

13442   match(CallLeafNoFP);
13443   effect(USE meth);
13444 
13445   ins_cost(300);
13446   format %{ "call_leaf_nofp,runtime " %}
13447   ins_encode(clear_avx, Java_To_Runtime(meth));
13448   ins_pipe(pipe_slow);
13449 %}
13450 
13451 // Return Instruction
13452 // Remove the return address & jump to it.
13453 // Notice: We always emit a nop after a ret to make sure there is room
13454 // for safepoint patching
13455 instruct Ret()
13456 %{
13457   match(Return);
13458 
13459   format %{ "ret" %}
13460   ins_encode %{
13461     __ ret(0);

  473 }
  474 
  475 // !!!!! Special hack to get all types of calls to specify the byte offset
  476 //       from the start of the call to the point where the return address
  477 //       will point.
  478 int MachCallStaticJavaNode::ret_addr_offset()
  479 {
  480   int offset = 5; // 5 bytes from start of call to where return address points
  481   offset += clear_avx_size();
  482   return offset;
  483 }
  484 
  485 int MachCallDynamicJavaNode::ret_addr_offset()
  486 {
  487   int offset = 15; // 15 bytes from start of call to where return address points
  488   offset += clear_avx_size();
  489   return offset;
  490 }
  491 
  492 int MachCallRuntimeNode::ret_addr_offset() {
  493   if (_entry_point == NULL) {
  494     // CallLeafNoFPInDirect
  495     return 3; // callq (register)
  496   }
  497   int offset = 13; // movq r10,#addr; callq (r10)
  498   if (this->ideal_Opcode() != Op_CallLeafVector) {
  499     offset += clear_avx_size();
  500   }
  501   return offset;
  502 }
  503 
  504 //
  505 // Compute padding required for nodes which need alignment
  506 //
  507 
  508 // The address of the call instruction needs to be 4-byte aligned to
  509 // ensure that it does not span a cache line so that it can be patched.
  510 int CallStaticJavaDirectNode::compute_padding(int current_offset) const
  511 {
  512   current_offset += clear_avx_size(); // skip vzeroupper
  513   current_offset += 1; // skip call opcode byte
  514   return align_up(current_offset, alignment_required()) - current_offset;
  515 }
  516 
  517 // The address of the call instruction needs to be 4-byte aligned to
  518 // ensure that it does not span a cache line so that it can be patched.
  519 int CallDynamicJavaDirectNode::compute_padding(int current_offset) const
  520 {
  521   current_offset += clear_avx_size(); // skip vzeroupper
  522   current_offset += 11; // skip movq instruction + call opcode byte
  523   return align_up(current_offset, alignment_required()) - current_offset;

  892     st->print("# stack alignment check");
  893 #endif
  894   }
  895   if (C->stub_function() != NULL && BarrierSet::barrier_set()->barrier_set_nmethod() != NULL) {
  896     st->print("\n\t");
  897     st->print("cmpl    [r15_thread + #disarmed_guard_value_offset], #disarmed_guard_value\t");
  898     st->print("\n\t");
  899     st->print("je      fast_entry\t");
  900     st->print("\n\t");
  901     st->print("call    #nmethod_entry_barrier_stub\t");
  902     st->print("\n\tfast_entry:");
  903   }
  904   st->cr();
  905 }
  906 #endif
  907 
  908 void MachPrologNode::emit(CodeBuffer &cbuf, PhaseRegAlloc *ra_) const {
  909   Compile* C = ra_->C;
  910   C2_MacroAssembler _masm(&cbuf);
  911 
  912   __ verified_entry(C);
  913 
  914   if (ra_->C->stub_function() == NULL) {
  915     __ entry_barrier();
  916   }
  917 
  918   if (!Compile::current()->output()->in_scratch_emit_size()) {
  919     __ bind(*_verified_entry);
  920   }
  921 
  922   C->output()->set_frame_complete(cbuf.insts_size());
  923 
  924   if (C->has_mach_constant_base_node()) {
  925     // NOTE: We set the table base offset here because users might be
  926     // emitted before MachConstantBaseNode.
  927     ConstantTable& constant_table = C->output()->constant_table();
  928     constant_table.set_table_base_offset(constant_table.calculate_table_base_offset());
  929   }
  930 }
  931 






  932 int MachPrologNode::reloc() const
  933 {
  934   return 0; // a large enough number
  935 }
  936 
  937 //=============================================================================
  938 #ifndef PRODUCT
  939 void MachEpilogNode::format(PhaseRegAlloc* ra_, outputStream* st) const
  940 {
  941   Compile* C = ra_->C;
  942   if (generate_vzeroupper(C)) {
  943     st->print("vzeroupper");
  944     st->cr(); st->print("\t");
  945   }
  946 
  947   int framesize = C->output()->frame_size_in_bytes();
  948   assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  949   // Remove word for return adr already pushed
  950   // and RBP
  951   framesize -= 2*wordSize;

  959   if (do_polling() && C->is_method_compilation()) {
  960     st->print("\t");
  961     st->print_cr("cmpq    rsp, poll_offset[r15_thread] \n\t"
  962                  "ja      #safepoint_stub\t"
  963                  "# Safepoint: poll for GC");
  964   }
  965 }
  966 #endif
  967 
  968 void MachEpilogNode::emit(CodeBuffer& cbuf, PhaseRegAlloc* ra_) const
  969 {
  970   Compile* C = ra_->C;
  971   MacroAssembler _masm(&cbuf);
  972 
  973   if (generate_vzeroupper(C)) {
  974     // Clear upper bits of YMM registers when current compiled code uses
  975     // wide vectors to avoid AVX <-> SSE transition penalty during call.
  976     __ vzeroupper();
  977   }
  978 
  979   // Subtract two words to account for return address and rbp
  980   int initial_framesize = C->output()->frame_size_in_bytes() - 2*wordSize;
  981   __ remove_frame(initial_framesize, C->needs_stack_repair());
  982 
  983   if (StackReservedPages > 0 && C->has_reserved_stack_access()) {
  984     __ reserved_stack_check();
  985   }
  986 
  987   if (do_polling() && C->is_method_compilation()) {
  988     MacroAssembler _masm(&cbuf);
  989     Label dummy_label;
  990     Label* code_stub = &dummy_label;
  991     if (!C->output()->in_scratch_emit_size()) {
  992       C2SafepointPollStub* stub = new (C->comp_arena()) C2SafepointPollStub(__ offset());
  993       C->output()->add_stub(stub);
  994       code_stub = &stub->entry();
  995     }
  996     __ relocate(relocInfo::poll_return_type);
  997     __ safepoint_poll(*code_stub, r15_thread, true /* at_return */, true /* in_nmethod */);
  998   }
  999 }
 1000 






 1001 int MachEpilogNode::reloc() const
 1002 {
 1003   return 2; // a large enough number
 1004 }
 1005 
 1006 const Pipeline* MachEpilogNode::pipeline() const
 1007 {
 1008   return MachNode::pipeline_class();
 1009 }
 1010 
 1011 //=============================================================================
 1012 
 1013 enum RC {
 1014   rc_bad,
 1015   rc_int,
 1016   rc_kreg,
 1017   rc_float,
 1018   rc_stack
 1019 };
 1020 

 1613     emit_opcode(cbuf, reg < 8 ? Assembler::REX_W : Assembler::REX_WR);
 1614     emit_opcode(cbuf, 0x8D); // LEA  reg,[SP+offset]
 1615     emit_rm(cbuf, 0x2, reg & 7, 0x04);
 1616     emit_rm(cbuf, 0x0, 0x04, RSP_enc);
 1617     emit_d32(cbuf, offset);
 1618   } else {
 1619     emit_opcode(cbuf, reg < 8 ? Assembler::REX_W : Assembler::REX_WR);
 1620     emit_opcode(cbuf, 0x8D); // LEA  reg,[SP+offset]
 1621     emit_rm(cbuf, 0x1, reg & 7, 0x04);
 1622     emit_rm(cbuf, 0x0, 0x04, RSP_enc);
 1623     emit_d8(cbuf, offset);
 1624   }
 1625 }
 1626 
 1627 uint BoxLockNode::size(PhaseRegAlloc *ra_) const
 1628 {
 1629   int offset = ra_->reg2offset(in_RegMask(0).find_first_elem());
 1630   return (offset < 0x80) ? 5 : 8; // REX
 1631 }
 1632 
 1633 //=============================================================================
 1634 #ifndef PRODUCT
 1635 void MachVEPNode::format(PhaseRegAlloc* ra_, outputStream* st) const
 1636 {
 1637   st->print_cr("MachVEPNode");
 1638 }
 1639 #endif
 1640 
 1641 void MachVEPNode::emit(CodeBuffer& cbuf, PhaseRegAlloc* ra_) const
 1642 {
 1643   C2_MacroAssembler _masm(&cbuf);
 1644   uint insts_size = cbuf.insts_size();
 1645   if (!_verified) {
 1646     if (UseCompressedClassPointers) {
 1647       __ load_klass(rscratch1, j_rarg0, rscratch2);
 1648       __ cmpptr(rax, rscratch1);
 1649     } else {
 1650       __ cmpptr(rax, Address(j_rarg0, oopDesc::klass_offset_in_bytes()));
 1651     }
 1652     __ jump_cc(Assembler::notEqual, RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
 1653   } else {
 1654     // TODO 8284443 Avoid creation of temporary frame
 1655     if (ra_->C->stub_function() == NULL) {
 1656       __ verified_entry(ra_->C, 0);
 1657       __ entry_barrier();
 1658       int initial_framesize = ra_->C->output()->frame_size_in_bytes() - 2*wordSize;
 1659       __ remove_frame(initial_framesize, false);
 1660     }
 1661     // Unpack inline type args passed as oop and then jump to
 1662     // the verified entry point (skipping the unverified entry).
 1663     int sp_inc = __ unpack_inline_args(ra_->C, _receiver_only);
 1664     // Emit code for verified entry and save increment for stack repair on return
 1665     __ verified_entry(ra_->C, sp_inc);
 1666     if (Compile::current()->output()->in_scratch_emit_size()) {
 1667       Label dummy_verified_entry;
 1668       __ jmp(dummy_verified_entry);
 1669     } else {
 1670       __ jmp(*_verified_entry);
 1671     }
 1672   }
  1673   /* WARNING these NOPs are critical so that the verified entry point is properly
  1674      4-byte aligned for patching by NativeJump::patch_verified_entry() */
 1675   int nops_cnt = 4 - ((cbuf.insts_size() - insts_size) & 0x3);
 1676   nops_cnt &= 0x3; // Do not add nops if code is aligned.
 1677   if (nops_cnt > 0) {
 1678     __ nop(nops_cnt);
 1679   }
 1680 }
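// In outline: the unverified path performs the usual inline cache check, while the
// verified path for scalarized inline-type arguments sets up a (currently temporary,
// see 8284443) frame, runs the entry barrier, unpacks the inline type arguments
// passed as oops, and then jumps to the real verified entry bound in
// MachPrologNode::emit.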
 1681 
 1682 //=============================================================================
 1683 #ifndef PRODUCT
 1684 void MachUEPNode::format(PhaseRegAlloc* ra_, outputStream* st) const
 1685 {
 1686   if (UseCompressedClassPointers) {
 1687     st->print_cr("movl    rscratch1, [j_rarg0 + oopDesc::klass_offset_in_bytes()]\t# compressed klass");
 1688     st->print_cr("\tdecode_klass_not_null rscratch1, rscratch1");
 1689     st->print_cr("\tcmpq    rax, rscratch1\t # Inline cache check");
 1690   } else {
 1691     st->print_cr("\tcmpq    rax, [j_rarg0 + oopDesc::klass_offset_in_bytes()]\t"
 1692                  "# Inline cache check");
 1693   }
 1694   st->print_cr("\tjne     SharedRuntime::_ic_miss_stub");
 1695   st->print_cr("\tnop\t# nops to align entry point");
 1696 }
 1697 #endif
 1698 
 1699 void MachUEPNode::emit(CodeBuffer& cbuf, PhaseRegAlloc* ra_) const
 1700 {
 1701   MacroAssembler masm(&cbuf);

 1704     masm.load_klass(rscratch1, j_rarg0, rscratch2);
 1705     masm.cmpptr(rax, rscratch1);
 1706   } else {
 1707     masm.cmpptr(rax, Address(j_rarg0, oopDesc::klass_offset_in_bytes()));
 1708   }
 1709 
 1710   masm.jump_cc(Assembler::notEqual, RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
 1711 
  1712   /* WARNING these NOPs are critical so that the verified entry point is properly
  1713      4-byte aligned for patching by NativeJump::patch_verified_entry() */
 1714   int nops_cnt = 4 - ((cbuf.insts_size() - insts_size) & 0x3);
 1715   if (OptoBreakpoint) {
 1716     // Leave space for int3
 1717     nops_cnt -= 1;
 1718   }
 1719   nops_cnt &= 0x3; // Do not add nops if code is aligned.
 1720   if (nops_cnt > 0)
 1721     masm.nop(nops_cnt);
 1722 }
 1723 







 1724 //=============================================================================
 1725 
 1726 const bool Matcher::supports_vector_calling_convention(void) {
 1727   if (EnableVectorSupport && UseVectorStubs) {
 1728     return true;
 1729   }
 1730   return false;
 1731 }
 1732 
 1733 OptoRegPair Matcher::vector_return_value(uint ideal_reg) {
 1734   assert(EnableVectorSupport && UseVectorStubs, "sanity");
 1735   int lo = XMM0_num;
 1736   int hi = XMM0b_num;
 1737   if (ideal_reg == Op_VecX) hi = XMM0d_num;
 1738   else if (ideal_reg == Op_VecY) hi = XMM0h_num;
 1739   else if (ideal_reg == Op_VecZ) hi = XMM0p_num;
 1740   return OptoRegPair(hi, lo);
 1741 }
 1742 
 1743 // Is this branch offset short enough that a short branch can be used?

 3982   %}
 3983 %}
 3984 
 3985 // Indirect Memory Times Scale Plus Positive Index Register Plus Offset Operand
 3986 operand indPosIndexScaleOffset(any_RegP reg, immL32 off, rRegI idx, immI2 scale)
 3987 %{
 3988   constraint(ALLOC_IN_RC(ptr_reg));
 3989   predicate(n->in(2)->in(3)->in(1)->as_Type()->type()->is_long()->_lo >= 0);
 3990   match(AddP (AddP reg (LShiftL (ConvI2L idx) scale)) off);
 3991 
 3992   op_cost(10);
 3993   format %{"[$reg + $off + $idx << $scale]" %}
 3994   interface(MEMORY_INTER) %{
 3995     base($reg);
 3996     index($idx);
 3997     scale($scale);
 3998     disp($off);
 3999   %}
 4000 %}
 4001 
 4002 // Indirect Narrow Oop Operand
 4003 operand indCompressedOop(rRegN reg) %{
 4004   predicate(UseCompressedOops && (CompressedOops::shift() == Address::times_8));
 4005   constraint(ALLOC_IN_RC(ptr_reg));
 4006   match(DecodeN reg);
 4007 
 4008   op_cost(10);
 4009   format %{"[R12 + $reg << 3] (compressed oop addressing)" %}
 4010   interface(MEMORY_INTER) %{
 4011     base(0xc); // R12
 4012     index($reg);
 4013     scale(0x3);
 4014     disp(0x0);
 4015   %}
 4016 %}
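// With CompressedOops::shift() == 3 and the heap base kept in r12, a narrow oop n
// decodes to r12 + (n << 3), so these operands fold the DecodeN straight into the
// addressing mode instead of materializing the wide oop in a register.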
 4017 
 4018 // Indirect Narrow Oop Plus Offset Operand
  4019 // Note: the x86 architecture doesn't support "scale * index + offset" without a base,
  4020 // so we can't free r12 even when CompressedOops::base() == NULL.
 4021 operand indCompressedOopOffset(rRegN reg, immL32 off) %{
 4022   predicate(UseCompressedOops && (CompressedOops::shift() == Address::times_8));
 4023   constraint(ALLOC_IN_RC(ptr_reg));
 4024   match(AddP (DecodeN reg) off);
 4025 
 4026   op_cost(10);
 4027   format %{"[R12 + $reg << 3 + $off] (compressed oop addressing)" %}
 4028   interface(MEMORY_INTER) %{
 4029     base(0xc); // R12
 4030     index($reg);
 4031     scale(0x3);
 4032     disp($off);
 4033   %}
 4034 %}
 4035 
 4036 // Indirect Memory Operand
 4037 operand indirectNarrow(rRegN reg)

 4344     equal(0x4, "e");
 4345     not_equal(0x5, "ne");
 4346     less(0x2, "b");
 4347     greater_equal(0x3, "ae");
 4348     less_equal(0x6, "be");
 4349     greater(0x7, "a");
 4350     overflow(0x0, "o");
 4351     no_overflow(0x1, "no");
 4352   %}
 4353 %}
 4354 
 4355 //----------OPERAND CLASSES----------------------------------------------------
  4356 // Operand Classes are groups of operands that are used to simplify
 4357 // instruction definitions by not requiring the AD writer to specify separate
 4358 // instructions for every form of operand when the instruction accepts
 4359 // multiple operand types with the same basic encoding and format.  The classic
 4360 // case of this is memory operands.
 4361 
 4362 opclass memory(indirect, indOffset8, indOffset32, indIndexOffset, indIndex,
 4363                indIndexScale, indPosIndexScale, indIndexScaleOffset, indPosIndexOffset, indPosIndexScaleOffset,
 4364                indCompressedOop, indCompressedOopOffset,
 4365                indirectNarrow, indOffset8Narrow, indOffset32Narrow,
 4366                indIndexOffsetNarrow, indIndexNarrow, indIndexScaleNarrow,
 4367                indIndexScaleOffsetNarrow, indPosIndexOffsetNarrow, indPosIndexScaleOffsetNarrow);
 4368 
 4369 //----------PIPELINE-----------------------------------------------------------
  4370 // Rules which define the behavior of the target architecture's pipeline.
 4371 pipeline %{
 4372 
 4373 //----------ATTRIBUTES---------------------------------------------------------
 4374 attributes %{
  4375   variable_size_instructions;        // Variable size instructions
 4376   max_instructions_per_bundle = 3;   // Up to 3 instructions per bundle
  4377   instruction_unit_size = 1;         // An instruction is 1 byte long
 4378   instruction_fetch_unit_size = 16;  // The processor fetches one line
 4379   instruction_fetch_units = 1;       // of 16 bytes
 4380 
 4381   // List of nop instructions
 4382   nops( MachNop );
 4383 %}
 4384 

 6932   format %{ "MEMBAR-storestore (empty encoding)" %}
 6933   ins_encode( );
 6934   ins_pipe(empty);
 6935 %}
 6936 
 6937 //----------Move Instructions--------------------------------------------------
 6938 
 6939 instruct castX2P(rRegP dst, rRegL src)
 6940 %{
 6941   match(Set dst (CastX2P src));
 6942 
 6943   format %{ "movq    $dst, $src\t# long->ptr" %}
 6944   ins_encode %{
 6945     if ($dst$$reg != $src$$reg) {
 6946       __ movptr($dst$$Register, $src$$Register);
 6947     }
 6948   %}
 6949   ins_pipe(ialu_reg_reg); // XXX
 6950 %}
 6951 
 6952 instruct castN2X(rRegL dst, rRegN src)
 6953 %{
 6954   match(Set dst (CastP2X src));
 6955 
 6956   format %{ "movq    $dst, $src\t# ptr -> long" %}
 6957   ins_encode %{
 6958     if ($dst$$reg != $src$$reg) {
 6959       __ movptr($dst$$Register, $src$$Register);
 6960     }
 6961   %}
 6962   ins_pipe(ialu_reg_reg); // XXX
 6963 %}
 6964 
 6965 instruct castP2X(rRegL dst, rRegP src)
 6966 %{
 6967   match(Set dst (CastP2X src));
 6968 
 6969   format %{ "movq    $dst, $src\t# ptr -> long" %}
 6970   ins_encode %{
 6971     if ($dst$$reg != $src$$reg) {
 6972       __ movptr($dst$$Register, $src$$Register);
 6973     }
 6974   %}
 6975   ins_pipe(ialu_reg_reg); // XXX
 6976 %}
 6977 
 6978 // Convert oop into int for vectors alignment masking
 6979 instruct convP2I(rRegI dst, rRegP src)
 6980 %{
 6981   match(Set dst (ConvL2I (CastP2X src)));
 6982 
 6983   format %{ "movl    $dst, $src\t# ptr -> int" %}
 6984   ins_encode %{

11450   effect(DEF dst, USE src);
11451   ins_cost(100);
11452   format %{ "movd    $dst,$src\t# MoveI2F" %}
11453   ins_encode %{
11454     __ movdl($dst$$XMMRegister, $src$$Register);
11455   %}
11456   ins_pipe( pipe_slow );
11457 %}
11458 
11459 instruct MoveL2D_reg_reg(regD dst, rRegL src) %{
11460   match(Set dst (MoveL2D src));
11461   effect(DEF dst, USE src);
11462   ins_cost(100);
11463   format %{ "movd    $dst,$src\t# MoveL2D" %}
11464   ins_encode %{
11465      __ movdq($dst$$XMMRegister, $src$$Register);
11466   %}
11467   ins_pipe( pipe_slow );
11468 %}
11469 
11470 
11471 // Fast clearing of an array
11472 // Small ClearArray non-AVX512.
11473 instruct rep_stos(rcx_RegL cnt, rdi_RegP base, regD tmp, rax_RegL val,
11474                   Universe dummy, rFlagsReg cr)
11475 %{
11476   predicate(!((ClearArrayNode*)n)->is_large() && !((ClearArrayNode*)n)->word_copy_only() && (UseAVX <= 2));
11477   match(Set dummy (ClearArray (Binary cnt base) val));
11478   effect(USE_KILL cnt, USE_KILL base, TEMP tmp, USE_KILL val, KILL cr);
11479 
11480   format %{ $$template
11481     $$emit$$"cmp     InitArrayShortSize,rcx\n\t"
11482     $$emit$$"jg      LARGE\n\t"
11483     $$emit$$"dec     rcx\n\t"
11484     $$emit$$"js      DONE\t# Zero length\n\t"
11485     $$emit$$"mov     rax,(rdi,rcx,8)\t# LOOP\n\t"
11486     $$emit$$"dec     rcx\n\t"
11487     $$emit$$"jge     LOOP\n\t"
11488     $$emit$$"jmp     DONE\n\t"
11489     $$emit$$"# LARGE:\n\t"
11490     if (UseFastStosb) {
11491        $$emit$$"shlq    rcx,3\t# Convert doublewords to bytes\n\t"
11492        $$emit$$"rep     stosb\t# Store rax to *rdi++ while rcx--\n\t"
11493     } else if (UseXMMForObjInit) {
11494        $$emit$$"movdq   $tmp, $val\n\t"
11495        $$emit$$"punpcklqdq $tmp, $tmp\n\t"
11496        $$emit$$"vinserti128_high $tmp, $tmp\n\t"
11497        $$emit$$"jmpq    L_zero_64_bytes\n\t"
11498        $$emit$$"# L_loop:\t# 64-byte LOOP\n\t"
11499        $$emit$$"vmovdqu $tmp,(rax)\n\t"
11500        $$emit$$"vmovdqu $tmp,0x20(rax)\n\t"
11501        $$emit$$"add     0x40,rax\n\t"
11502        $$emit$$"# L_zero_64_bytes:\n\t"
11503        $$emit$$"sub     0x8,rcx\n\t"
11504        $$emit$$"jge     L_loop\n\t"
11505        $$emit$$"add     0x4,rcx\n\t"
11506        $$emit$$"jl      L_tail\n\t"
11507        $$emit$$"vmovdqu $tmp,(rax)\n\t"
11508        $$emit$$"add     0x20,rax\n\t"
11509        $$emit$$"sub     0x4,rcx\n\t"
11510        $$emit$$"# L_tail:\t# Clearing tail bytes\n\t"
11511        $$emit$$"add     0x4,rcx\n\t"
11512        $$emit$$"jle     L_end\n\t"
11513        $$emit$$"dec     rcx\n\t"
11514        $$emit$$"# L_sloop:\t# 8-byte short loop\n\t"
11515        $$emit$$"vmovq   xmm0,(rax)\n\t"
11516        $$emit$$"add     0x8,rax\n\t"
11517        $$emit$$"dec     rcx\n\t"
11518        $$emit$$"jge     L_sloop\n\t"
11519        $$emit$$"# L_end:\n\t"
11520     } else {
11521        $$emit$$"rep     stosq\t# Store rax to *rdi++ while rcx--\n\t"
11522     }
11523     $$emit$$"# DONE"
11524   %}
11525   ins_encode %{
11526     __ clear_mem($base$$Register, $cnt$$Register, $val$$Register,
11527                  $tmp$$XMMRegister, false, false);
11528   %}
11529   ins_pipe(pipe_slow);
11530 %}
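// As used in this block, clear_mem takes the fill value in rax ($val) rather than
// assuming zero, plus an is_large flag and a word_copy_only flag (the word-copy
// variants avoid the byte-granular rep stosb path so a non-zero fill pattern is
// stored whole words at a time); the EVEX forms additionally pass a ktmp mask
// register. This reading is inferred from the call sites shown here.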
11531 
11532 instruct rep_stos_word_copy(rcx_RegL cnt, rdi_RegP base, regD tmp, rax_RegL val,
11533                             Universe dummy, rFlagsReg cr)
11534 %{
11535   predicate(!((ClearArrayNode*)n)->is_large() && ((ClearArrayNode*)n)->word_copy_only() && (UseAVX <= 2));
11536   match(Set dummy (ClearArray (Binary cnt base) val));
11537   effect(USE_KILL cnt, USE_KILL base, TEMP tmp, USE_KILL val, KILL cr);
11538 
11539   format %{ $$template
11540     $$emit$$"cmp     InitArrayShortSize,rcx\n\t"
11541     $$emit$$"jg      LARGE\n\t"
11542     $$emit$$"dec     rcx\n\t"
11543     $$emit$$"js      DONE\t# Zero length\n\t"
11544     $$emit$$"mov     rax,(rdi,rcx,8)\t# LOOP\n\t"
11545     $$emit$$"dec     rcx\n\t"
11546     $$emit$$"jge     LOOP\n\t"
11547     $$emit$$"jmp     DONE\n\t"
11548     $$emit$$"# LARGE:\n\t"
11549     if (UseXMMForObjInit) {
11550        $$emit$$"movdq   $tmp, $val\n\t"
11551        $$emit$$"punpcklqdq $tmp, $tmp\n\t"
11552        $$emit$$"vinserti128_high $tmp, $tmp\n\t"
11553        $$emit$$"jmpq    L_zero_64_bytes\n\t"
11554        $$emit$$"# L_loop:\t# 64-byte LOOP\n\t"
11555        $$emit$$"vmovdqu $tmp,(rax)\n\t"
11556        $$emit$$"vmovdqu $tmp,0x20(rax)\n\t"
11557        $$emit$$"add     0x40,rax\n\t"
11558        $$emit$$"# L_zero_64_bytes:\n\t"
11559        $$emit$$"sub     0x8,rcx\n\t"
11560        $$emit$$"jge     L_loop\n\t"
11561        $$emit$$"add     0x4,rcx\n\t"
11562        $$emit$$"jl      L_tail\n\t"
11563        $$emit$$"vmovdqu $tmp,(rax)\n\t"
11564        $$emit$$"add     0x20,rax\n\t"
11565        $$emit$$"sub     0x4,rcx\n\t"
11566        $$emit$$"# L_tail:\t# Clearing tail bytes\n\t"
11567        $$emit$$"add     0x4,rcx\n\t"
11568        $$emit$$"jle     L_end\n\t"
11569        $$emit$$"dec     rcx\n\t"
11570        $$emit$$"# L_sloop:\t# 8-byte short loop\n\t"
11571        $$emit$$"vmovq   xmm0,(rax)\n\t"
11572        $$emit$$"add     0x8,rax\n\t"
11573        $$emit$$"dec     rcx\n\t"
11574        $$emit$$"jge     L_sloop\n\t"
11575        $$emit$$"# L_end:\n\t"
11576     } else {
11577        $$emit$$"rep     stosq\t# Store rax to *rdi++ while rcx--\n\t"
11578     }
11579     $$emit$$"# DONE"
11580   %}
11581   ins_encode %{
11582     __ clear_mem($base$$Register, $cnt$$Register, $val$$Register,
11583                  $tmp$$XMMRegister, false, true);
11584   %}
11585   ins_pipe(pipe_slow);
11586 %}
11587 
11588 // Small ClearArray AVX512 non-constant length.
11589 instruct rep_stos_evex(rcx_RegL cnt, rdi_RegP base, legRegD tmp, kReg ktmp, rax_RegL val,
11590                        Universe dummy, rFlagsReg cr)
11591 %{
11592   predicate(!((ClearArrayNode*)n)->is_large() && !((ClearArrayNode*)n)->word_copy_only() && (UseAVX > 2));
11593   match(Set dummy (ClearArray (Binary cnt base) val));
11594   ins_cost(125);
11595   effect(USE_KILL cnt, USE_KILL base, TEMP tmp, TEMP ktmp, USE_KILL val, KILL cr);
11596 
11597   format %{ $$template
11598     $$emit$$"xorq    rax, rax\t# ClearArray:\n\t"
11599     $$emit$$"cmp     InitArrayShortSize,rcx\n\t"
11600     $$emit$$"jg      LARGE\n\t"
11601     $$emit$$"dec     rcx\n\t"
11602     $$emit$$"js      DONE\t# Zero length\n\t"
11603     $$emit$$"mov     rax,(rdi,rcx,8)\t# LOOP\n\t"
11604     $$emit$$"dec     rcx\n\t"
11605     $$emit$$"jge     LOOP\n\t"
11606     $$emit$$"jmp     DONE\n\t"
11607     $$emit$$"# LARGE:\n\t"
11608     if (UseFastStosb) {
11609        $$emit$$"shlq    rcx,3\t# Convert doublewords to bytes\n\t"
11610        $$emit$$"rep     stosb\t# Store rax to *rdi++ while rcx--\n\t"
11611     } else if (UseXMMForObjInit) {
11612        $$emit$$"mov     rdi,rax\n\t"
11613        $$emit$$"vpxor   ymm0,ymm0,ymm0\n\t"
11614        $$emit$$"jmpq    L_zero_64_bytes\n\t"
11615        $$emit$$"# L_loop:\t# 64-byte LOOP\n\t"

11623        $$emit$$"jl      L_tail\n\t"
11624        $$emit$$"vmovdqu ymm0,(rax)\n\t"
11625        $$emit$$"add     0x20,rax\n\t"
11626        $$emit$$"sub     0x4,rcx\n\t"
11627        $$emit$$"# L_tail:\t# Clearing tail bytes\n\t"
11628        $$emit$$"add     0x4,rcx\n\t"
11629        $$emit$$"jle     L_end\n\t"
11630        $$emit$$"dec     rcx\n\t"
11631        $$emit$$"# L_sloop:\t# 8-byte short loop\n\t"
11632        $$emit$$"vmovq   xmm0,(rax)\n\t"
11633        $$emit$$"add     0x8,rax\n\t"
11634        $$emit$$"dec     rcx\n\t"
11635        $$emit$$"jge     L_sloop\n\t"
11636        $$emit$$"# L_end:\n\t"
11637     } else {
11638        $$emit$$"rep     stosq\t# Store rax to *rdi++ while rcx--\n\t"
11639     }
11640     $$emit$$"# DONE"
11641   %}
11642   ins_encode %{
11643     __ clear_mem($base$$Register, $cnt$$Register, $val$$Register,
11644                  $tmp$$XMMRegister, false, false, $ktmp$$KRegister);
11645   %}
11646   ins_pipe(pipe_slow);
11647 %}
11648 
11649 instruct rep_stos_evex_word_copy(rcx_RegL cnt, rdi_RegP base, legRegD tmp, kReg ktmp, rax_RegL val,
11650                                  Universe dummy, rFlagsReg cr)

11651 %{
11652   predicate(!((ClearArrayNode*)n)->is_large() && ((ClearArrayNode*)n)->word_copy_only() && (UseAVX > 2));
11653   match(Set dummy (ClearArray (Binary cnt base) val));
11654   ins_cost(125);
11655   effect(USE_KILL cnt, USE_KILL base, TEMP tmp, TEMP ktmp, USE_KILL val, KILL cr);
11656 
11657   format %{ $$template
11658     $$emit$$"xorq    rax, rax\t# ClearArray:\n\t"
11659     $$emit$$"cmp     InitArrayShortSize,rcx\n\t"
11660     $$emit$$"jg      LARGE\n\t"
11661     $$emit$$"dec     rcx\n\t"
11662     $$emit$$"js      DONE\t# Zero length\n\t"
11663     $$emit$$"mov     rax,(rdi,rcx,8)\t# LOOP\n\t"
11664     $$emit$$"dec     rcx\n\t"
11665     $$emit$$"jge     LOOP\n\t"
11666     $$emit$$"jmp     DONE\n\t"
11667     $$emit$$"# LARGE:\n\t"
11668     if (UseFastStosb) {
11669        $$emit$$"shlq    rcx,3\t# Convert doublewords to bytes\n\t"
11670        $$emit$$"rep     stosb\t# Store rax to *rdi++ while rcx--\n\t"
11671     } else if (UseXMMForObjInit) {
11672        $$emit$$"mov     rdi,rax\n\t"
11673        $$emit$$"vpxor   ymm0,ymm0,ymm0\n\t"
11674        $$emit$$"jmpq    L_zero_64_bytes\n\t"
11675        $$emit$$"# L_loop:\t# 64-byte LOOP\n\t"

11683        $$emit$$"jl      L_tail\n\t"
11684        $$emit$$"vmovdqu ymm0,(rax)\n\t"
11685        $$emit$$"add     0x20,rax\n\t"
11686        $$emit$$"sub     0x4,rcx\n\t"
11687        $$emit$$"# L_tail:\t# Clearing tail bytes\n\t"
11688        $$emit$$"add     0x4,rcx\n\t"
11689        $$emit$$"jle     L_end\n\t"
11690        $$emit$$"dec     rcx\n\t"
11691        $$emit$$"# L_sloop:\t# 8-byte short loop\n\t"
11692        $$emit$$"vmovq   xmm0,(rax)\n\t"
11693        $$emit$$"add     0x8,rax\n\t"
11694        $$emit$$"dec     rcx\n\t"
11695        $$emit$$"jge     L_sloop\n\t"
11696        $$emit$$"# L_end:\n\t"
11697     } else {
11698        $$emit$$"rep     stosq\t# Store rax to *rdi++ while rcx--\n\t"
11699     }
11700     $$emit$$"# DONE"
11701   %}
11702   ins_encode %{
11703     __ clear_mem($base$$Register, $cnt$$Register, $val$$Register,
11704                  $tmp$$XMMRegister, false, true, $ktmp$$KRegister);
11705   %}
11706   ins_pipe(pipe_slow);
11707 %}
11708 
11709 // Large ClearArray non-AVX512.
11710 instruct rep_stos_large(rcx_RegL cnt, rdi_RegP base, regD tmp, rax_RegL val,
11711                         Universe dummy, rFlagsReg cr)
11712 %{
11713   predicate(((ClearArrayNode*)n)->is_large() && !((ClearArrayNode*)n)->word_copy_only() && (UseAVX <= 2));
11714   match(Set dummy (ClearArray (Binary cnt base) val));
11715   effect(USE_KILL cnt, USE_KILL base, TEMP tmp, USE_KILL val, KILL cr);
11716 
11717   format %{ $$template
11718     if (UseFastStosb) {
11719        $$emit$$"shlq    rcx,3\t# Convert doublewords to bytes\n\t"
11720        $$emit$$"rep     stosb\t# Store rax to *rdi++ while rcx--"
11721     } else if (UseXMMForObjInit) {
11722        $$emit$$"movdq   $tmp, $val\n\t"
11723        $$emit$$"punpcklqdq $tmp, $tmp\n\t"
11724        $$emit$$"vinserti128_high $tmp, $tmp\n\t"
11725        $$emit$$"jmpq    L_zero_64_bytes\n\t"
11726        $$emit$$"# L_loop:\t# 64-byte LOOP\n\t"
11727        $$emit$$"vmovdqu $tmp,(rax)\n\t"
11728        $$emit$$"vmovdqu $tmp,0x20(rax)\n\t"
11729        $$emit$$"add     0x40,rax\n\t"
11730        $$emit$$"# L_zero_64_bytes:\n\t"
11731        $$emit$$"sub     0x8,rcx\n\t"
11732        $$emit$$"jge     L_loop\n\t"
11733        $$emit$$"add     0x4,rcx\n\t"
11734        $$emit$$"jl      L_tail\n\t"
11735        $$emit$$"vmovdqu $tmp,(rax)\n\t"
11736        $$emit$$"add     0x20,rax\n\t"
11737        $$emit$$"sub     0x4,rcx\n\t"
11738        $$emit$$"# L_tail:\t# Clearing tail bytes\n\t"
11739        $$emit$$"add     0x4,rcx\n\t"
11740        $$emit$$"jle     L_end\n\t"
11741        $$emit$$"dec     rcx\n\t"
11742        $$emit$$"# L_sloop:\t# 8-byte short loop\n\t"
11743        $$emit$$"vmovq   xmm0,(rax)\n\t"
11744        $$emit$$"add     0x8,rax\n\t"
11745        $$emit$$"dec     rcx\n\t"
11746        $$emit$$"jge     L_sloop\n\t"
11747        $$emit$$"# L_end:\n\t"
11748     } else {
11749        $$emit$$"rep     stosq\t# Store rax to *rdi++ while rcx--"
11750     }
11751   %}
11752   ins_encode %{
11753     __ clear_mem($base$$Register, $cnt$$Register, $val$$Register,
11754                  $tmp$$XMMRegister, true, false);
11755   %}
11756   ins_pipe(pipe_slow);
11757 %}
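// The UseXMMForObjInit branch of the template above fills 64 bytes per main-loop
// iteration, then at most one 32-byte chunk, then single 8-byte words for the tail.
// A rough C++ equivalent (illustration only; simplified, not the generated code):
//
//   #include <cstdint>
//   void fill_words_bulk(uint64_t* p, int64_t cnt /* 8-byte words */, uint64_t val) {
//     for (; cnt >= 8; cnt -= 8, p += 8)          // L_loop: two 32-byte stores per pass
//       for (int i = 0; i < 8; i++) p[i] = val;
//     if (cnt >= 4) {                             // one extra 32-byte store
//       for (int i = 0; i < 4; i++) p[i] = val;
//       p += 4; cnt -= 4;
//     }
//     while (cnt-- > 0) *p++ = val;               // L_sloop: 8-byte short loop
//   }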
11758 
11759 instruct rep_stos_large_word_copy(rcx_RegL cnt, rdi_RegP base, regD tmp, rax_RegL val,
11760                                   Universe dummy, rFlagsReg cr)
11761 %{
11762   predicate(((ClearArrayNode*)n)->is_large() && ((ClearArrayNode*)n)->word_copy_only() && (UseAVX <= 2));
11763   match(Set dummy (ClearArray (Binary cnt base) val));
11764   effect(USE_KILL cnt, USE_KILL base, TEMP tmp, USE_KILL val, KILL cr);
11765 
11766   format %{ $$template
11767     if (UseXMMForObjInit) {
11768        $$emit$$"movdq   $tmp, $val\n\t"
11769        $$emit$$"punpcklqdq $tmp, $tmp\n\t"
11770        $$emit$$"vinserti128_high $tmp, $tmp\n\t"
11771        $$emit$$"jmpq    L_zero_64_bytes\n\t"
11772        $$emit$$"# L_loop:\t# 64-byte LOOP\n\t"
11773        $$emit$$"vmovdqu $tmp,(rax)\n\t"
11774        $$emit$$"vmovdqu $tmp,0x20(rax)\n\t"
11775        $$emit$$"add     0x40,rax\n\t"
11776        $$emit$$"# L_zero_64_bytes:\n\t"
11777        $$emit$$"sub     0x8,rcx\n\t"
11778        $$emit$$"jge     L_loop\n\t"
11779        $$emit$$"add     0x4,rcx\n\t"
11780        $$emit$$"jl      L_tail\n\t"
11781        $$emit$$"vmovdqu $tmp,(rax)\n\t"
11782        $$emit$$"add     0x20,rax\n\t"
11783        $$emit$$"sub     0x4,rcx\n\t"
11784        $$emit$$"# L_tail:\t# Clearing tail bytes\n\t"
11785        $$emit$$"add     0x4,rcx\n\t"
11786        $$emit$$"jle     L_end\n\t"
11787        $$emit$$"dec     rcx\n\t"
11788        $$emit$$"# L_sloop:\t# 8-byte short loop\n\t"
11789        $$emit$$"vmovq   xmm0,(rax)\n\t"
11790        $$emit$$"add     0x8,rax\n\t"
11791        $$emit$$"dec     rcx\n\t"
11792        $$emit$$"jge     L_sloop\n\t"
11793        $$emit$$"# L_end:\n\t"
11794     } else {
11795        $$emit$$"rep     stosq\t# Store rax to *rdi++ while rcx--"
11796     }
11797   %}
11798   ins_encode %{
11799     __ clear_mem($base$$Register, $cnt$$Register, $val$$Register,
11800                  $tmp$$XMMRegister, true, true);
11801   %}
11802   ins_pipe(pipe_slow);
11803 %}
11804 
11805 // Large ClearArray AVX512.
11806 instruct rep_stos_large_evex(rcx_RegL cnt, rdi_RegP base, legRegD tmp, kReg ktmp, rax_RegL val,
11807                              Universe dummy, rFlagsReg cr)
11808 %{
11809   predicate(((ClearArrayNode*)n)->is_large() && !((ClearArrayNode*)n)->word_copy_only() && (UseAVX > 2));
11810   match(Set dummy (ClearArray (Binary cnt base) val));
11811   effect(USE_KILL cnt, USE_KILL base, TEMP tmp, TEMP ktmp, USE_KILL val, KILL cr);
11812 
11813   format %{ $$template
11814     if (UseFastStosb) {
11815        $$emit$$"xorq    rax, rax\t# ClearArray:\n\t"
11816        $$emit$$"shlq    rcx,3\t# Convert doublewords to bytes\n\t"
11817        $$emit$$"rep     stosb\t# Store rax to *rdi++ while rcx--"
11818     } else if (UseXMMForObjInit) {
11819        $$emit$$"mov     rdi,rax\t# ClearArray:\n\t"
11820        $$emit$$"vpxor   ymm0,ymm0,ymm0\n\t"
11821        $$emit$$"jmpq    L_zero_64_bytes\n\t"
11822        $$emit$$"# L_loop:\t# 64-byte LOOP\n\t"
11823        $$emit$$"vmovdqu ymm0,(rax)\n\t"
11824        $$emit$$"vmovdqu ymm0,0x20(rax)\n\t"
11825        $$emit$$"add     0x40,rax\n\t"
11826        $$emit$$"# L_zero_64_bytes:\n\t"
11827        $$emit$$"sub     0x8,rcx\n\t"
11828        $$emit$$"jge     L_loop\n\t"
11829        $$emit$$"add     0x4,rcx\n\t"
11830        $$emit$$"jl      L_tail\n\t"
11831        $$emit$$"vmovdqu ymm0,(rax)\n\t"
11832        $$emit$$"add     0x20,rax\n\t"
11833        $$emit$$"sub     0x4,rcx\n\t"
11834        $$emit$$"# L_tail:\t# Clearing tail bytes\n\t"
11835        $$emit$$"add     0x4,rcx\n\t"
11836        $$emit$$"jle     L_end\n\t"
11837        $$emit$$"dec     rcx\n\t"
11838        $$emit$$"# L_sloop:\t# 8-byte short loop\n\t"
11839        $$emit$$"vmovq   xmm0,(rax)\n\t"
11840        $$emit$$"add     0x8,rax\n\t"
11841        $$emit$$"dec     rcx\n\t"
11842        $$emit$$"jge     L_sloop\n\t"
11843        $$emit$$"# L_end:\n\t"
11844     } else {
11845        $$emit$$"xorq    rax, rax\t# ClearArray:\n\t"
11846        $$emit$$"rep     stosq\t# Store rax to *rdi++ while rcx--"
11847     }
11848   %}
11849   ins_encode %{
11850     __ clear_mem($base$$Register, $cnt$$Register, $val$$Register,
11851                  $tmp$$XMMRegister, true, false, $ktmp$$KRegister);
11852   %}
11853   ins_pipe(pipe_slow);
11854 %}
11855 
11856 instruct rep_stos_large_evex_word_copy(rcx_RegL cnt, rdi_RegP base, legRegD tmp, kReg ktmp, rax_RegL val,
11857                                        Universe dummy, rFlagsReg cr)

11858 %{
11859   predicate(((ClearArrayNode*)n)->is_large() && ((ClearArrayNode*)n)->word_copy_only() && (UseAVX > 2));
11860   match(Set dummy (ClearArray (Binary cnt base) val));
11861   effect(USE_KILL cnt, USE_KILL base, TEMP tmp, TEMP ktmp, USE_KILL val, KILL cr);
11862 
11863   format %{ $$template
11864     if (UseFastStosb) {
11865        $$emit$$"xorq    rax, rax\t# ClearArray:\n\t"
11866        $$emit$$"shlq    rcx,3\t# Convert doublewords to bytes\n\t"
11867        $$emit$$"rep     stosb\t# Store rax to *rdi++ while rcx--"
11868     } else if (UseXMMForObjInit) {
11869        $$emit$$"mov     rdi,rax\t# ClearArray:\n\t"
11870        $$emit$$"vpxor   ymm0,ymm0,ymm0\n\t"
11871        $$emit$$"jmpq    L_zero_64_bytes\n\t"
11872        $$emit$$"# L_loop:\t# 64-byte LOOP\n\t"
11873        $$emit$$"vmovdqu ymm0,(rax)\n\t"
11874        $$emit$$"vmovdqu ymm0,0x20(rax)\n\t"
11875        $$emit$$"add     0x40,rax\n\t"
11876        $$emit$$"# L_zero_64_bytes:\n\t"
11877        $$emit$$"sub     0x8,rcx\n\t"
11878        $$emit$$"jge     L_loop\n\t"
11879        $$emit$$"add     0x4,rcx\n\t"
11880        $$emit$$"jl      L_tail\n\t"
11881        $$emit$$"vmovdqu ymm0,(rax)\n\t"
11882        $$emit$$"add     0x20,rax\n\t"
11883        $$emit$$"sub     0x4,rcx\n\t"
11884        $$emit$$"# L_tail:\t# Clearing tail bytes\n\t"
11885        $$emit$$"add     0x4,rcx\n\t"
11886        $$emit$$"jle     L_end\n\t"
11887        $$emit$$"dec     rcx\n\t"
11888        $$emit$$"# L_sloop:\t# 8-byte short loop\n\t"
11889        $$emit$$"vmovq   xmm0,(rax)\n\t"
11890        $$emit$$"add     0x8,rax\n\t"
11891        $$emit$$"dec     rcx\n\t"
11892        $$emit$$"jge     L_sloop\n\t"
11893        $$emit$$"# L_end:\n\t"
11894     } else {
11895        $$emit$$"xorq    rax, rax\t# ClearArray:\n\t"
11896        $$emit$$"rep     stosq\t# Store rax to *rdi++ while rcx--"
11897     }
11898   %}
11899   ins_encode %{
11900     __ clear_mem($base$$Register, $cnt$$Register, $val$$Register,
11901                  $tmp$$XMMRegister, true, true, $ktmp$$KRegister);
11902   %}
11903   ins_pipe(pipe_slow);
11904 %}
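// Note on the six variable-length ClearArray variants above: the node predicates
// (is_large(), word_copy_only(), UseAVX > 2) pick the instruct, and the same two
// flags appear to be forwarded as the trailing boolean arguments of clear_mem
// (large first, then word-copy); only the UseAVX > 2 variants pass a kReg mask
// temporary, while the others use the overload without one.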
11905 
11906 // Small ClearArray AVX512 constant length.
11907 instruct rep_stos_im(immL cnt, rRegP base, regD tmp, rax_RegL val, kReg ktmp, Universe dummy, rFlagsReg cr)
11908 %{
11909   predicate(!((ClearArrayNode*)n)->is_large() && !((ClearArrayNode*)n)->word_copy_only() &&
11910             ((UseAVX > 2) && VM_Version::supports_avx512vlbw()));
11911   match(Set dummy (ClearArray (Binary cnt base) val));
11912   ins_cost(100);
11913   effect(TEMP tmp, USE_KILL val, TEMP ktmp, KILL cr);
11914   format %{ "clear_mem_imm $base , $cnt  \n\t" %}
11915   ins_encode %{
11916     __ clear_mem($base$$Register, $cnt$$constant, $val$$Register, $tmp$$XMMRegister, $ktmp$$KRegister);
11917   %}
11918   ins_pipe(pipe_slow);
11919 %}
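// In the constant-length form above, $cnt$$constant hands clear_mem the element
// count as a compile-time immediate rather than in rcx; the AVX512VLBW predicate
// presumably lets that overload finish the tail with masked vector stores.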
11920 
11921 instruct string_compareL(rdi_RegP str1, rcx_RegI cnt1, rsi_RegP str2, rdx_RegI cnt2,
11922                          rax_RegI result, legRegD tmp1, rFlagsReg cr)
11923 %{
11924   predicate(!VM_Version::supports_avx512vlbw() && ((StrCompNode*)n)->encoding() == StrIntrinsicNode::LL);
11925   match(Set result (StrComp (Binary str1 cnt1) (Binary str2 cnt2)));
11926   effect(TEMP tmp1, USE_KILL str1, USE_KILL str2, USE_KILL cnt1, USE_KILL cnt2, KILL cr);
11927 
11928   format %{ "String Compare byte[] $str1,$cnt1,$str2,$cnt2 -> $result   // KILL $tmp1" %}
11929   ins_encode %{
11930     __ string_compare($str1$$Register, $str2$$Register,
11931                       $cnt1$$Register, $cnt2$$Register, $result$$Register,
11932                       $tmp1$$XMMRegister, StrIntrinsicNode::LL, knoreg);
11933   %}
11934   ins_pipe( pipe_slow );
11935 %}
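// string_compareL covers the LL case (both operands Latin-1 byte[]) on CPUs without
// AVX512VLBW; knoreg tells string_compare that no opmask register is available on
// this path.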
11936 

13666 
13667   ins_cost(300);
13668   format %{ "call_leaf,runtime " %}
13669   ins_encode(clear_avx, Java_To_Runtime(meth));
13670   ins_pipe(pipe_slow);
13671 %}
13672 
13673 // Call runtime without safepoint and with vector arguments
13674 instruct CallLeafDirectVector(method meth)
13675 %{
13676   match(CallLeafVector);
13677   effect(USE meth);
13678 
13679   ins_cost(300);
13680   format %{ "call_leaf,vector " %}
13681   ins_encode(Java_To_Runtime(meth));
13682   ins_pipe(pipe_slow);
13683 %}
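// Unlike the plain leaf-call encodings, CallLeafDirectVector does not prepend the
// clear_avx (vzeroupper) encoding before Java_To_Runtime, presumably so the vector
// arguments being handed to the stub are not clobbered.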
13684 
13685 // Call runtime without safepoint
13686 // entry point is null, target holds the address to call
13687 instruct CallLeafNoFPInDirect(rRegP target)
13688 %{
13689   predicate(n->as_Call()->entry_point() == NULL);
13690   match(CallLeafNoFP target);
13691 
13692   ins_cost(300);
13693   format %{ "call_leaf_nofp,runtime indirect " %}
13694   ins_encode %{
13695      __ call($target$$Register);
13696   %}
13697 
13698   ins_pipe(pipe_slow);
13699 %}
13700 
13701 instruct CallLeafNoFPDirect(method meth)
13702 %{
13703   predicate(n->as_Call()->entry_point() != NULL);
13704   match(CallLeafNoFP);
13705   effect(USE meth);
13706 
13707   ins_cost(300);
13708   format %{ "call_leaf_nofp,runtime " %}
13709   ins_encode(clear_avx, Java_To_Runtime(meth));
13710   ins_pipe(pipe_slow);
13711 %}
13712 
13713 // Return Instruction
13714 // Remove the return address & jump to it.
13715 // Notice: We always emit a nop after a ret to make sure there is room
13716 // for safepoint patching
13717 instruct Ret()
13718 %{
13719   match(Return);
13720 
13721   format %{ "ret" %}
13722   ins_encode %{
13723     __ ret(0);