< prev index next >

src/hotspot/cpu/x86/x86_64.ad

Print this page

  473 }
  474 
  475 // !!!!! Special hack to get all types of calls to specify the byte offset
  476 //       from the start of the call to the point where the return address
  477 //       will point.
      //
      // Static Java call: a fixed 5 bytes plus clear_avx_size(), which accounts
      // for the vzeroupper that may be emitted ahead of the call.
  478 int MachCallStaticJavaNode::ret_addr_offset()
  479 {
  480   int offset = 5; // 5 bytes from start of call to where return address points
  481   offset += clear_avx_size();
  482   return offset;
  483 }
  484 
  485 int MachCallDynamicJavaNode::ret_addr_offset()
  486 {
        // Byte distance from the start of the dynamic call sequence to its return
        // address: a fixed 15 bytes plus clear_avx_size().
        // NOTE(review): 15 presumably is a 10-byte movq followed by a 5-byte call,
        // consistent with the 11 bytes ("movq instruction + call opcode byte")
        // skipped in CallDynamicJavaDirectNode::compute_padding -- confirm.
  487   int offset = 15; // 15 bytes from start of call to where return address points
  488   offset += clear_avx_size();
  489   return offset;
  490 }
  491 
  492 int MachCallRuntimeNode::ret_addr_offset() {




        // Byte offset from the start of the runtime call sequence to its return
        // address. Every runtime call adds clear_avx_size() except CallLeafVector.
  493   int offset = 13; // movq r10,#addr; callq (r10)
  494   if (this->ideal_Opcode() != Op_CallLeafVector) {
  495     offset += clear_avx_size();
  496   }
  497   return offset;
  498 }

  499 //
  500 // Compute padding required for nodes which need alignment
  501 //
  502 
  503 // The address of the call instruction needs to be 4-byte aligned to
  504 // ensure that it does not span a cache line so that it can be patched.
      //
      // Returns the number of padding bytes to insert at current_offset. The
      // alignment is applied to the position after the optional vzeroupper and
      // the one-byte call opcode, i.e. to the bytes that follow the opcode.
  505 int CallStaticJavaDirectNode::compute_padding(int current_offset) const
  506 {
  507   current_offset += clear_avx_size(); // skip vzeroupper
  508   current_offset += 1; // skip call opcode byte
  509   return align_up(current_offset, alignment_required()) - current_offset;
  510 }
  511 
  512 // The address of the call instruction needs to be 4-byte aligned to
  513 // ensure that it does not span a cache line so that it can be patched.
  514 int CallDynamicJavaDirectNode::compute_padding(int current_offset) const
  515 {
  516   current_offset += clear_avx_size(); // skip vzeroupper
  517   current_offset += 11; // skip movq instruction + call opcode byte
  518   return align_up(current_offset, alignment_required()) - current_offset;

  706     st->print("# stack alignment check");
  707 #endif
  708   }
  709   if (C->stub_function() != nullptr && BarrierSet::barrier_set()->barrier_set_nmethod() != nullptr) {
  710     st->print("\n\t");
  711     st->print("cmpl    [r15_thread + #disarmed_guard_value_offset], #disarmed_guard_value\t");
  712     st->print("\n\t");
  713     st->print("je      fast_entry\t");
  714     st->print("\n\t");
  715     st->print("call    #nmethod_entry_barrier_stub\t");
  716     st->print("\n\tfast_entry:");
  717   }
  718   st->cr();
  719 }
  720 #endif
  721 
  722 void MachPrologNode::emit(CodeBuffer &cbuf, PhaseRegAlloc *ra_) const {
        // Emits the prolog into cbuf: an optional class-initialization barrier,
        // the verified entry sequence, and then bookkeeping on the compilation's
        // output (frame-complete offset, constant table base offset).
  723   Compile* C = ra_->C;
  724   C2_MacroAssembler _masm(&cbuf);
  725 
  726   int framesize = C->output()->frame_size_in_bytes();
  727   int bangsize = C->output()->bang_size_in_bytes();
  728 
  729   if (C->clinit_barrier_on_entry()) {
  730     assert(VM_Version::supports_fast_class_init_checks(), "sanity");
  731     assert(!C->method()->holder()->is_not_initialized(), "initialization should have been started");
  732 
  733     Label L_skip_barrier;
  734     Register klass = rscratch1;
  735 
          // Load the holder class and test it; the fast path continues at
          // L_skip_barrier, everything else falls through to the jump below.
  736     __ mov_metadata(klass, C->method()->holder()->constant_encoding());
  737     __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);
  738 
  739     __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
  740 
  741     __ bind(L_skip_barrier);

  742   }
  743 
        // A bang size of 0 is passed when the output reports that no stack bang
        // is needed. The last argument is true only for stub compilations.
  744   __ verified_entry(framesize, C->output()->need_stack_bang(bangsize)?bangsize:0, false, C->stub_function() != nullptr);


  745 
        // Record the code offset reached once the entry sequence has been emitted.
  746   C->output()->set_frame_complete(cbuf.insts_size());
  747 
  748   if (C->has_mach_constant_base_node()) {
  749     // NOTE: We set the table base offset here because users might be
  750     // emitted before MachConstantBaseNode.
  751     ConstantTable& constant_table = C->output()->constant_table();
  752     constant_table.set_table_base_offset(constant_table.calculate_table_base_offset());
  753   }
  754 }
  755 
  756 uint MachPrologNode::size(PhaseRegAlloc* ra_) const
  757 {
        // No closed-form size for the prolog: defer to the generic
        // MachNode::size() computation.
  758   return MachNode::size(ra_); // too many variables; just compute it
  759                               // the hard way
  760 }
  761 
  762 int MachPrologNode::reloc() const
  763 {
        // NOTE(review): presumably the number of relocation entries to reserve
        // for this node; the prolog reports 0 -- confirm against the callers.
  764   return 0; // a large enough number
  765 }
  766 
  767 //=============================================================================
  768 #ifndef PRODUCT
  769 void MachEpilogNode::format(PhaseRegAlloc* ra_, outputStream* st) const
  770 {
  771   Compile* C = ra_->C;
  772   if (generate_vzeroupper(C)) {
  773     st->print("vzeroupper");
  774     st->cr(); st->print("\t");
  775   }
  776 
  777   int framesize = C->output()->frame_size_in_bytes();
  778   assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  779   // Remove word for return adr already pushed
  780   // and RBP
  781   framesize -= 2*wordSize;

  789   if (do_polling() && C->is_method_compilation()) {
  790     st->print("\t");
  791     st->print_cr("cmpq    rsp, poll_offset[r15_thread] \n\t"
  792                  "ja      #safepoint_stub\t"
  793                  "# Safepoint: poll for GC");
  794   }
  795 }
  796 #endif
  797 
  798 void MachEpilogNode::emit(CodeBuffer& cbuf, PhaseRegAlloc* ra_) const
  799 {
        // Emits the method epilog: optional vzeroupper, frame teardown (add to
        // rsp, pop rbp), optional reserved-stack check, and the return-time
        // safepoint poll.
  800   Compile* C = ra_->C;
  801   MacroAssembler _masm(&cbuf);
  802 
  803   if (generate_vzeroupper(C)) {
  804     // Clear upper bits of YMM registers when current compiled code uses
  805     // wide vectors to avoid AVX <-> SSE transition penalty during call.
  806     __ vzeroupper();
  807   }
  808 
  809   int framesize = C->output()->frame_size_in_bytes();
  810   assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  811   // Remove word for return adr already pushed
  812   // and RBP
  813   framesize -= 2*wordSize;
  814 
  815   // Note that VerifyStackAtCalls' Majik cookie does not change the frame size popped here
  816 
        // Skip the stack adjustment entirely when nothing is left to pop.
  817   if (framesize) {
  818     __ addq(rsp, framesize);
  819   }
  820 
  821   __ popq(rbp);
  822 
  823   if (StackReservedPages > 0 && C->has_reserved_stack_access()) {
  824     __ reserved_stack_check();
  825   }
  826 
  827   if (do_polling() && C->is_method_compilation()) {
          // NOTE(review): this declaration shadows the _masm created at the top
          // of the function; both wrap the same cbuf.
  828     MacroAssembler _masm(&cbuf);
  829     Label dummy_label;
  830     Label* code_stub = &dummy_label;
          // While only the scratch size is being measured no stub is registered
          // and the poll targets dummy_label instead.
  831     if (!C->output()->in_scratch_emit_size()) {
  832       C2SafepointPollStub* stub = new (C->comp_arena()) C2SafepointPollStub(__ offset());
  833       C->output()->add_stub(stub);
  834       code_stub = &stub->entry();
  835     }
  836     __ relocate(relocInfo::poll_return_type);
  837     __ safepoint_poll(*code_stub, r15_thread, true /* at_return */, true /* in_nmethod */);
  838   }
  839 }
  840 
  841 uint MachEpilogNode::size(PhaseRegAlloc* ra_) const
  842 {
        // No closed-form size for the epilog: defer to the generic
        // MachNode::size() computation.
  843   return MachNode::size(ra_); // too many variables; just compute it
  844                               // the hard way
  845 }
  846 
  847 int MachEpilogNode::reloc() const
  848 {
        // NOTE(review): presumably the number of relocation entries to reserve
        // for this node; emit() issues a poll_return_type relocation -- confirm
        // why 2 is the chosen bound.
  849   return 2; // a large enough number
  850 }
  851 
  852 const Pipeline* MachEpilogNode::pipeline() const
  853 {
        // The epilog has no pipeline description of its own; it reports the
        // generic MachNode pipeline class.
  854   return MachNode::pipeline_class();
  855 }
  856 
  857 //=============================================================================
  858 
  859 enum RC {
        // Register-class tags.
        // NOTE(review): presumably used to classify the source and destination
        // of a copy (invalid / general-purpose / k-register / float-vector /
        // stack slot) -- the use sites are outside this excerpt; confirm there.
  860   rc_bad,
  861   rc_int,
  862   rc_kreg,
  863   rc_float,
  864   rc_stack
  865 };
  866 

 1449   st->print("leaq    %s, [rsp + #%d]\t# box lock",
 1450             Matcher::regName[reg], offset);
 1451 }
 1452 #endif
 1453 
 1454 void BoxLockNode::emit(CodeBuffer& cbuf, PhaseRegAlloc* ra_) const
 1455 {
        // Materializes the address of the box's stack slot into the node's
        // allocated register: lea reg, [rsp + offset].
 1456   int offset = ra_->reg2offset(in_RegMask(0).find_first_elem());
 1457   int reg = ra_->get_encode(this);
 1458 
 1459   MacroAssembler masm(&cbuf);
 1460   masm.lea(as_Register(reg), Address(rsp, offset));
 1461 }
 1462 
 1463 uint BoxLockNode::size(PhaseRegAlloc *ra_) const
 1464 {
        // Reports 5 bytes when the stack offset is below 0x80 and 8 bytes
        // otherwise.
        // NOTE(review): presumably REX + opcode + ModRM + SIB plus an 8-bit vs.
        // 32-bit displacement for the lea emitted by emit() -- confirm against
        // the assembler's encoding.
 1465   int offset = ra_->reg2offset(in_RegMask(0).find_first_elem());
 1466   return (offset < 0x80) ? 5 : 8; // REX
 1467 }
 1468 

















































 1469 //=============================================================================
 1470 #ifndef PRODUCT
 1471 void MachUEPNode::format(PhaseRegAlloc* ra_, outputStream* st) const
 1472 {
        // Prints a textual description of the unverified-entry-point check:
        // load the receiver's klass, compare it with the speculated klass held in
        // the CompiledICData, and branch to the IC miss stub on mismatch.
        // With compressed class pointers the load/compare are 32-bit (movl/cmpl);
        // otherwise they are 64-bit (movq/cmpq) on the uncompressed klass.
 1473   if (UseCompressedClassPointers) {
 1474     st->print_cr("movl    rscratch1, [j_rarg0 + oopDesc::klass_offset_in_bytes()]\t# compressed klass");
 1475     st->print_cr("\tcmpl    rscratch1, [rax + CompiledICData::speculated_klass_offset()]\t # Inline cache check");
 1476   } else {
 1477     st->print_cr("movq    rscratch1, [j_rarg0 + oopDesc::klass_offset_in_bytes()]\t# klass");
 1478     st->print_cr("\tcmpq    rscratch1, [rax + CompiledICData::speculated_klass_offset()]\t # Inline cache check");
 1479   }
 1480   st->print_cr("\tjne     SharedRuntime::_ic_miss_stub");
 1481 }
 1482 #endif
 1483 
 1484 void MachUEPNode::emit(CodeBuffer& cbuf, PhaseRegAlloc* ra_) const
 1485 {
        // Emits the unverified-entry-point inline cache check through
        // MacroAssembler::ic_check, passing InteriorEntryAlignment.
 1486   MacroAssembler masm(&cbuf);
 1487   masm.ic_check(InteriorEntryAlignment);
 1488 }
 1489 
 1490 uint MachUEPNode::size(PhaseRegAlloc* ra_) const
 1491 {
        // No closed-form size for the entry check: defer to the generic
        // MachNode::size() computation.
 1492   return MachNode::size(ra_); // too many variables; just compute it
 1493                               // the hard way
 1494 }
 1495 
 1496 
 1497 //=============================================================================
 1498 
 1499 bool Matcher::supports_vector_calling_convention(void) {
        // The vector calling convention is available only when both
        // EnableVectorSupport and UseVectorStubs are set.
 1500   if (EnableVectorSupport && UseVectorStubs) {
 1501     return true;
 1502   }
 1503   return false;
 1504 }
 1505 
 1506 OptoRegPair Matcher::vector_return_value(uint ideal_reg) {
        // Returns the register pair for a vector return value. The low half is
        // always XMM0_num; the high half depends on the vector width:
        // XMM0d_num for Op_VecX, XMM0h_num for Op_VecY, XMM0p_num for Op_VecZ,
        // and XMM0b_num for any other ideal_reg.
 1507   assert(EnableVectorSupport && UseVectorStubs, "sanity");
 1508   int lo = XMM0_num;
 1509   int hi = XMM0b_num;
 1510   if (ideal_reg == Op_VecX) hi = XMM0d_num;
 1511   else if (ideal_reg == Op_VecY) hi = XMM0h_num;
 1512   else if (ideal_reg == Op_VecZ) hi = XMM0p_num;
 1513   return OptoRegPair(hi, lo);
 1514 }
 1515 
 1516 // Is this branch offset short enough that a short branch can be used?

 3081   %}
 3082 %}
 3083 
 3084 // Indirect Memory Times Scale Plus Positive Index Register Plus Offset Operand
      // Matches reg + (ConvI2L(idx) << scale) + off and exposes it as a memory
      // operand with base $reg, index $idx, scale $scale and displacement $off.
      // NOTE(review): the predicate presumably walks from the matched AddP down
      // to the ConvI2L node and requires the lower bound of its long type to be
      // non-negative (the "positive index" in the name) -- confirm the input
      // numbering.
 3085 operand indPosIndexScaleOffset(any_RegP reg, immL32 off, rRegI idx, immI2 scale)
 3086 %{
 3087   constraint(ALLOC_IN_RC(ptr_reg));
 3088   predicate(n->in(2)->in(3)->in(1)->as_Type()->type()->is_long()->_lo >= 0);
 3089   match(AddP (AddP reg (LShiftL (ConvI2L idx) scale)) off);
 3090 
 3091   op_cost(10);
 3092   format %{"[$reg + $off + $idx << $scale]" %}
 3093   interface(MEMORY_INTER) %{
 3094     base($reg);
 3095     index($idx);
 3096     scale($scale);
 3097     disp($off);
 3098   %}
 3099 %}
 3100 
















 3101 // Indirect Narrow Oop Plus Offset Operand
 3102 // Note: the x86 architecture doesn't support "scale * index + offset" without
 3103 // a base, so we can't free r12 even with CompressedOops::base() == nullptr.
      //
      // Only selected when compressed oops are in use with a shift of
      // Address::times_8. The narrow oop register becomes the index, scaled by
      // 8 (scale 0x3), on top of base R12 (encoding 0xc), so the DecodeN is
      // folded into the address computation.
 3104 operand indCompressedOopOffset(rRegN reg, immL32 off) %{
 3105   predicate(UseCompressedOops && (CompressedOops::shift() == Address::times_8));
 3106   constraint(ALLOC_IN_RC(ptr_reg));
 3107   match(AddP (DecodeN reg) off);
 3108 
 3109   op_cost(10);
 3110   format %{"[R12 + $reg << 3 + $off] (compressed oop addressing)" %}
 3111   interface(MEMORY_INTER) %{
 3112     base(0xc); // R12
 3113     index($reg);
 3114     scale(0x3);
 3115     disp($off);
 3116   %}
 3117 %}
 3118 
 3119 // Indirect Memory Operand
 3120 operand indirectNarrow(rRegN reg)

 3427     equal(0x4, "e");
 3428     not_equal(0x5, "ne");
 3429     less(0x2, "b");
 3430     greater_equal(0x3, "ae");
 3431     less_equal(0x6, "be");
 3432     greater(0x7, "a");
 3433     overflow(0x0, "o");
 3434     no_overflow(0x1, "no");
 3435   %}
 3436 %}
 3437 
 3438 //----------OPERAND CLASSES----------------------------------------------------
 3439 // Operand Classes are groups of operands that are used to simplify
 3440 // instruction definitions by not requiring the AD writer to specify separate
 3441 // instructions for every form of operand when the instruction accepts
 3442 // multiple operand types with the same basic encoding and format.  The classic
 3443 // case of this is memory operands.
 3444 
      // "memory" groups every addressing-mode operand listed below, including
      // the compressed-oop form and the *Narrow variants, so that an instruction
      // can take any of them through a single "memory" parameter.
 3445 opclass memory(indirect, indOffset8, indOffset32, indIndexOffset, indIndex,
 3446                indIndexScale, indPosIndexScale, indIndexScaleOffset, indPosIndexOffset, indPosIndexScaleOffset,
 3447                indCompressedOopOffset,
 3448                indirectNarrow, indOffset8Narrow, indOffset32Narrow,
 3449                indIndexOffsetNarrow, indIndexNarrow, indIndexScaleNarrow,
 3450                indIndexScaleOffsetNarrow, indPosIndexOffsetNarrow, indPosIndexScaleOffsetNarrow);
 3451 
 3452 //----------PIPELINE-----------------------------------------------------------
 3453 // Rules which define the behavior of the target architectures pipeline.
 3454 pipeline %{
 3455 
 3456 //----------ATTRIBUTES---------------------------------------------------------
 3457 attributes %{
 3458   variable_size_instructions;        // Variable size instructions
 3459   max_instructions_per_bundle = 3;   // Up to 3 instructions per bundle
 3460   instruction_unit_size = 1;         // Instruction sizes are in units of 1 byte
 3461   instruction_fetch_unit_size = 16;  // The processor fetches one line
 3462   instruction_fetch_units = 1;       // of 16 bytes
 3463 
 3464   // List of nop instructions
 3465   nops( MachNop );
 3466 %}
 3467 

 5958   format %{ "MEMBAR-storestore (empty encoding)" %}
 5959   ins_encode( );
 5960   ins_pipe(empty);
 5961 %}
 5962 
 5963 //----------Move Instructions--------------------------------------------------
 5964 
 5965 instruct castX2P(rRegP dst, rRegL src)
 5966 %{
        // Reinterprets a long as a pointer (CastX2P). This is a plain
        // register-to-register move that is elided when the register allocator
        // assigned dst and src to the same register.
 5967   match(Set dst (CastX2P src));
 5968 
 5969   format %{ "movq    $dst, $src\t# long->ptr" %}
 5970   ins_encode %{
 5971     if ($dst$$reg != $src$$reg) {
 5972       __ movptr($dst$$Register, $src$$Register);
 5973     }
 5974   %}
 5975   ins_pipe(ialu_reg_reg); // XXX
 5976 %}
 5977 













 5978 instruct castP2X(rRegL dst, rRegP src)
 5979 %{
        // Reinterprets a pointer as a long (CastP2X). This is a plain
        // register-to-register move that is elided when the register allocator
        // assigned dst and src to the same register.
 5980   match(Set dst (CastP2X src));
 5981 
 5982   format %{ "movq    $dst, $src\t# ptr -> long" %}
 5983   ins_encode %{
 5984     if ($dst$$reg != $src$$reg) {
 5985       __ movptr($dst$$Register, $src$$Register);
 5986     }
 5987   %}
 5988   ins_pipe(ialu_reg_reg); // XXX
 5989 %}
 5990 
 5991 // Convert oop into int for vectors alignment masking
 5992 instruct convP2I(rRegI dst, rRegP src)
 5993 %{
 5994   match(Set dst (ConvL2I (CastP2X src)));
 5995 
 5996   format %{ "movl    $dst, $src\t# ptr -> int" %}
 5997   ins_encode %{

10499   effect(DEF dst, USE src);
10500   ins_cost(100);
10501   format %{ "movd    $dst,$src\t# MoveI2F" %}
10502   ins_encode %{
10503     __ movdl($dst$$XMMRegister, $src$$Register);
10504   %}
10505   ins_pipe( pipe_slow );
10506 %}
10507 
10508 instruct MoveL2D_reg_reg(regD dst, rRegL src) %{
        // MoveL2D: copies a long held in a general-purpose register into an XMM
        // register.
        // NOTE(review): the format string prints "movd" while the encoding calls
        // movdq -- presumably cosmetic, confirm before relying on disassembly
        // text.
10509   match(Set dst (MoveL2D src));
10510   effect(DEF dst, USE src);
10511   ins_cost(100);
10512   format %{ "movd    $dst,$src\t# MoveL2D" %}
10513   ins_encode %{
10514      __ movdq($dst$$XMMRegister, $src$$Register);
10515   %}
10516   ins_pipe( pipe_slow );
10517 %}
10518 

10519 // Fast clearing of an array
10520 // Small ClearArray non-AVX512.
10521 instruct rep_stos(rcx_RegL cnt, rdi_RegP base, regD tmp, rax_RegI zero,
10522                   Universe dummy, rFlagsReg cr)
10523 %{
10524   predicate(!((ClearArrayNode*)n)->is_large() && (UseAVX <= 2));
10525   match(Set dummy (ClearArray cnt base));
10526   effect(USE_KILL cnt, USE_KILL base, TEMP tmp, KILL zero, KILL cr);





















































































































10527 
10528   format %{ $$template
10529     $$emit$$"xorq    rax, rax\t# ClearArray:\n\t"
10530     $$emit$$"cmp     InitArrayShortSize,rcx\n\t"
10531     $$emit$$"jg      LARGE\n\t"
10532     $$emit$$"dec     rcx\n\t"
10533     $$emit$$"js      DONE\t# Zero length\n\t"
10534     $$emit$$"mov     rax,(rdi,rcx,8)\t# LOOP\n\t"
10535     $$emit$$"dec     rcx\n\t"
10536     $$emit$$"jge     LOOP\n\t"
10537     $$emit$$"jmp     DONE\n\t"
10538     $$emit$$"# LARGE:\n\t"
10539     if (UseFastStosb) {
10540        $$emit$$"shlq    rcx,3\t# Convert doublewords to bytes\n\t"
10541        $$emit$$"rep     stosb\t# Store rax to *rdi++ while rcx--\n\t"
10542     } else if (UseXMMForObjInit) {
10543        $$emit$$"mov     rdi,rax\n\t"
10544        $$emit$$"vpxor   ymm0,ymm0,ymm0\n\t"
10545        $$emit$$"jmpq    L_zero_64_bytes\n\t"
10546        $$emit$$"# L_loop:\t# 64-byte LOOP\n\t"

10554        $$emit$$"jl      L_tail\n\t"
10555        $$emit$$"vmovdqu ymm0,(rax)\n\t"
10556        $$emit$$"add     0x20,rax\n\t"
10557        $$emit$$"sub     0x4,rcx\n\t"
10558        $$emit$$"# L_tail:\t# Clearing tail bytes\n\t"
10559        $$emit$$"add     0x4,rcx\n\t"
10560        $$emit$$"jle     L_end\n\t"
10561        $$emit$$"dec     rcx\n\t"
10562        $$emit$$"# L_sloop:\t# 8-byte short loop\n\t"
10563        $$emit$$"vmovq   xmm0,(rax)\n\t"
10564        $$emit$$"add     0x8,rax\n\t"
10565        $$emit$$"dec     rcx\n\t"
10566        $$emit$$"jge     L_sloop\n\t"
10567        $$emit$$"# L_end:\n\t"
10568     } else {
10569        $$emit$$"rep     stosq\t# Store rax to *rdi++ while rcx--\n\t"
10570     }
10571     $$emit$$"# DONE"
10572   %}
10573   ins_encode %{
10574     __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register,
10575                  $tmp$$XMMRegister, false, knoreg);
10576   %}
10577   ins_pipe(pipe_slow);
10578 %}
10579 
10580 // Small ClearArray AVX512 non-constant length.
10581 instruct rep_stos_evex(rcx_RegL cnt, rdi_RegP base, legRegD tmp, kReg ktmp, rax_RegI zero,
10582                        Universe dummy, rFlagsReg cr)
10583 %{
10584   predicate(!((ClearArrayNode*)n)->is_large() && (UseAVX > 2));
10585   match(Set dummy (ClearArray cnt base));
10586   ins_cost(125);
10587   effect(USE_KILL cnt, USE_KILL base, TEMP tmp, TEMP ktmp, KILL zero, KILL cr);
10588 
10589   format %{ $$template
10590     $$emit$$"xorq    rax, rax\t# ClearArray:\n\t"
10591     $$emit$$"cmp     InitArrayShortSize,rcx\n\t"
10592     $$emit$$"jg      LARGE\n\t"
10593     $$emit$$"dec     rcx\n\t"
10594     $$emit$$"js      DONE\t# Zero length\n\t"
10595     $$emit$$"mov     rax,(rdi,rcx,8)\t# LOOP\n\t"
10596     $$emit$$"dec     rcx\n\t"
10597     $$emit$$"jge     LOOP\n\t"
10598     $$emit$$"jmp     DONE\n\t"
10599     $$emit$$"# LARGE:\n\t"
10600     if (UseFastStosb) {
10601        $$emit$$"shlq    rcx,3\t# Convert doublewords to bytes\n\t"
10602        $$emit$$"rep     stosb\t# Store rax to *rdi++ while rcx--\n\t"
10603     } else if (UseXMMForObjInit) {
10604        $$emit$$"mov     rdi,rax\n\t"
10605        $$emit$$"vpxor   ymm0,ymm0,ymm0\n\t"
10606        $$emit$$"jmpq    L_zero_64_bytes\n\t"
10607        $$emit$$"# L_loop:\t# 64-byte LOOP\n\t"

10615        $$emit$$"jl      L_tail\n\t"
10616        $$emit$$"vmovdqu ymm0,(rax)\n\t"
10617        $$emit$$"add     0x20,rax\n\t"
10618        $$emit$$"sub     0x4,rcx\n\t"
10619        $$emit$$"# L_tail:\t# Clearing tail bytes\n\t"
10620        $$emit$$"add     0x4,rcx\n\t"
10621        $$emit$$"jle     L_end\n\t"
10622        $$emit$$"dec     rcx\n\t"
10623        $$emit$$"# L_sloop:\t# 8-byte short loop\n\t"
10624        $$emit$$"vmovq   xmm0,(rax)\n\t"
10625        $$emit$$"add     0x8,rax\n\t"
10626        $$emit$$"dec     rcx\n\t"
10627        $$emit$$"jge     L_sloop\n\t"
10628        $$emit$$"# L_end:\n\t"
10629     } else {
10630        $$emit$$"rep     stosq\t# Store rax to *rdi++ while rcx--\n\t"
10631     }
10632     $$emit$$"# DONE"
10633   %}
10634   ins_encode %{
10635     __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register,
10636                  $tmp$$XMMRegister, false, $ktmp$$KRegister);
10637   %}
10638   ins_pipe(pipe_slow);
10639 %}
10640 
10641 // Large ClearArray non-AVX512.
      // Selected for ClearArray nodes flagged is_large() when UseAVX <= 2.
      // cnt (rcx) and base (rdi) are consumed and clobbered, rax and the flags
      // are killed, and tmp is a scratch XMM register. The format template only
      // describes the expected code shape for each flag combination; the real
      // code is produced by clear_mem().
      // NOTE(review): the boolean `true` passed to clear_mem presumably selects
      // the large-array path, and knoreg means no opmask register is supplied --
      // confirm in MacroAssembler::clear_mem.
10642 instruct rep_stos_large(rcx_RegL cnt, rdi_RegP base, regD tmp, rax_RegI zero,
10643                         Universe dummy, rFlagsReg cr)
10644 %{
10645   predicate((UseAVX <=2) && ((ClearArrayNode*)n)->is_large());
10646   match(Set dummy (ClearArray cnt base));
10647   effect(USE_KILL cnt, USE_KILL base, TEMP tmp, KILL zero, KILL cr);
































































































10648 
10649   format %{ $$template
10650     if (UseFastStosb) {
10651        $$emit$$"xorq    rax, rax\t# ClearArray:\n\t"
10652        $$emit$$"shlq    rcx,3\t# Convert doublewords to bytes\n\t"
10653        $$emit$$"rep     stosb\t# Store rax to *rdi++ while rcx--"
10654     } else if (UseXMMForObjInit) {
10655        $$emit$$"mov     rdi,rax\t# ClearArray:\n\t"
10656        $$emit$$"vpxor   ymm0,ymm0,ymm0\n\t"
10657        $$emit$$"jmpq    L_zero_64_bytes\n\t"
10658        $$emit$$"# L_loop:\t# 64-byte LOOP\n\t"
10659        $$emit$$"vmovdqu ymm0,(rax)\n\t"
10660        $$emit$$"vmovdqu ymm0,0x20(rax)\n\t"
10661        $$emit$$"add     0x40,rax\n\t"
10662        $$emit$$"# L_zero_64_bytes:\n\t"
10663        $$emit$$"sub     0x8,rcx\n\t"
10664        $$emit$$"jge     L_loop\n\t"
10665        $$emit$$"add     0x4,rcx\n\t"
10666        $$emit$$"jl      L_tail\n\t"
10667        $$emit$$"vmovdqu ymm0,(rax)\n\t"
10668        $$emit$$"add     0x20,rax\n\t"
10669        $$emit$$"sub     0x4,rcx\n\t"
10670        $$emit$$"# L_tail:\t# Clearing tail bytes\n\t"
10671        $$emit$$"add     0x4,rcx\n\t"
10672        $$emit$$"jle     L_end\n\t"
10673        $$emit$$"dec     rcx\n\t"
10674        $$emit$$"# L_sloop:\t# 8-byte short loop\n\t"
10675        $$emit$$"vmovq   xmm0,(rax)\n\t"
10676        $$emit$$"add     0x8,rax\n\t"
10677        $$emit$$"dec     rcx\n\t"
10678        $$emit$$"jge     L_sloop\n\t"
10679        $$emit$$"# L_end:\n\t"
10680     } else {
10681        $$emit$$"xorq    rax, rax\t# ClearArray:\n\t"
10682        $$emit$$"rep     stosq\t# Store rax to *rdi++ while rcx--"
10683     }
10684   %}
10685   ins_encode %{
10686     __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register,
10687                  $tmp$$XMMRegister, true, knoreg);
10688   %}
10689   ins_pipe(pipe_slow);
10690 %}
10691 
10692 // Large ClearArray AVX512.
      // Selected for ClearArray nodes flagged is_large() when UseAVX > 2.
      // cnt (rcx) and base (rdi) are consumed and clobbered, rax and the flags
      // are killed, tmp is a scratch XMM register and ktmp a scratch opmask
      // register. The format template only describes the expected code shape;
      // the real code is produced by clear_mem().
      // NOTE(review): the boolean `true` passed to clear_mem presumably selects
      // the large-array path -- confirm in MacroAssembler::clear_mem.
10693 instruct rep_stos_large_evex(rcx_RegL cnt, rdi_RegP base, legRegD tmp, kReg ktmp, rax_RegI zero,
10694                              Universe dummy, rFlagsReg cr)
10695 %{
10696   predicate((UseAVX > 2) && ((ClearArrayNode*)n)->is_large());
10697   match(Set dummy (ClearArray cnt base));
10698   effect(USE_KILL cnt, USE_KILL base, TEMP tmp, TEMP ktmp, KILL zero, KILL cr);
10699 
10700   format %{ $$template
10701     if (UseFastStosb) {
10702        $$emit$$"xorq    rax, rax\t# ClearArray:\n\t"
10703        $$emit$$"shlq    rcx,3\t# Convert doublewords to bytes\n\t"
10704        $$emit$$"rep     stosb\t# Store rax to *rdi++ while rcx--"
10705     } else if (UseXMMForObjInit) {
10706        $$emit$$"mov     rdi,rax\t# ClearArray:\n\t"
10707        $$emit$$"vpxor   ymm0,ymm0,ymm0\n\t"
10708        $$emit$$"jmpq    L_zero_64_bytes\n\t"
10709        $$emit$$"# L_loop:\t# 64-byte LOOP\n\t"
10710        $$emit$$"vmovdqu ymm0,(rax)\n\t"
10711        $$emit$$"vmovdqu ymm0,0x20(rax)\n\t"
10712        $$emit$$"add     0x40,rax\n\t"
10713        $$emit$$"# L_zero_64_bytes:\n\t"
10714        $$emit$$"sub     0x8,rcx\n\t"
10715        $$emit$$"jge     L_loop\n\t"
10716        $$emit$$"add     0x4,rcx\n\t"
10717        $$emit$$"jl      L_tail\n\t"
10718        $$emit$$"vmovdqu ymm0,(rax)\n\t"
10719        $$emit$$"add     0x20,rax\n\t"
10720        $$emit$$"sub     0x4,rcx\n\t"
10721        $$emit$$"# L_tail:\t# Clearing tail bytes\n\t"
10722        $$emit$$"add     0x4,rcx\n\t"
10723        $$emit$$"jle     L_end\n\t"
10724        $$emit$$"dec     rcx\n\t"
10725        $$emit$$"# L_sloop:\t# 8-byte short loop\n\t"
10726        $$emit$$"vmovq   xmm0,(rax)\n\t"
10727        $$emit$$"add     0x8,rax\n\t"
10728        $$emit$$"dec     rcx\n\t"
10729        $$emit$$"jge     L_sloop\n\t"
10730        $$emit$$"# L_end:\n\t"
10731     } else {
10732        $$emit$$"xorq    rax, rax\t# ClearArray:\n\t"
10733        $$emit$$"rep     stosq\t# Store rax to *rdi++ while rcx--"
10734     }
10735   %}
10736   ins_encode %{
10737     __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register,
10738                  $tmp$$XMMRegister, true, $ktmp$$KRegister);
10739   %}
10740   ins_pipe(pipe_slow);
10741 %}
10742 
10743 // Small ClearArray AVX512 constant length.
      // Selected for non-large ClearArray nodes whose count is an immediate,
      // when UseAVX > 2 and the CPU reports AVX512VL+BW support. Unlike the
      // rep_stos* variants, base may live in any pointer register and is not
      // killed; tmp, zero and ktmp are scratch, and the flags are killed.
10744 instruct rep_stos_im(immL cnt, rRegP base, regD tmp, rRegI zero, kReg ktmp, Universe dummy, rFlagsReg cr)
10745 %{
10746   predicate(!((ClearArrayNode*)n)->is_large() &&
10747               ((UseAVX > 2) && VM_Version::supports_avx512vlbw()));
10748   match(Set dummy (ClearArray cnt base));
10749   ins_cost(100);
10750   effect(TEMP tmp, TEMP zero, TEMP ktmp, KILL cr);
10751   format %{ "clear_mem_imm $base , $cnt  \n\t" %}
10752   ins_encode %{
10753    __ clear_mem($base$$Register, $cnt$$constant, $zero$$Register, $tmp$$XMMRegister, $ktmp$$KRegister);
10754   %}
10755   ins_pipe(pipe_slow);
10756 %}
10757 
10758 instruct string_compareL(rdi_RegP str1, rcx_RegI cnt1, rsi_RegP str2, rdx_RegI cnt2,
10759                          rax_RegI result, legRegD tmp1, rFlagsReg cr)
10760 %{
        // StrComp for the LL encoding (both operands byte[] per the format
        // string), used when AVX512VL+BW is not available. All four inputs are
        // pinned to fixed registers and clobbered, the result is produced in
        // rax, tmp1 is a scratch XMM register, and no opmask register is passed
        // (knoreg).
10761   predicate(!VM_Version::supports_avx512vlbw() && ((StrCompNode*)n)->encoding() == StrIntrinsicNode::LL);
10762   match(Set result (StrComp (Binary str1 cnt1) (Binary str2 cnt2)));
10763   effect(TEMP tmp1, USE_KILL str1, USE_KILL str2, USE_KILL cnt1, USE_KILL cnt2, KILL cr);
10764 
10765   format %{ "String Compare byte[] $str1,$cnt1,$str2,$cnt2 -> $result   // KILL $tmp1" %}
10766   ins_encode %{
10767     __ string_compare($str1$$Register, $str2$$Register,
10768                       $cnt1$$Register, $cnt2$$Register, $result$$Register,
10769                       $tmp1$$XMMRegister, StrIntrinsicNode::LL, knoreg);
10770   %}
10771   ins_pipe( pipe_slow );
10772 %}
10773 

12528 
12529   ins_cost(300);
12530   format %{ "call_leaf,runtime " %}
12531   ins_encode(clear_avx, Java_To_Runtime(meth));
12532   ins_pipe(pipe_slow);
12533 %}
12534 
12535 // Call runtime without safepoint and with vector arguments
      // Unlike the neighbouring leaf-call instructs, the encoding has no
      // clear_avx step; MachCallRuntimeNode::ret_addr_offset() correspondingly
      // skips clear_avx_size() for Op_CallLeafVector.
12536 instruct CallLeafDirectVector(method meth)
12537 %{
12538   match(CallLeafVector);
12539   effect(USE meth);
12540 
12541   ins_cost(300);
12542   format %{ "call_leaf,vector " %}
12543   ins_encode(Java_To_Runtime(meth));
12544   ins_pipe(pipe_slow);
12545 %}
12546 
12547 // Call runtime without safepoint















12548 instruct CallLeafNoFPDirect(method meth)
12549 %{
        // Matches CallLeafNoFP. The encoding runs clear_avx before the
        // Java_To_Runtime call sequence.

12550   match(CallLeafNoFP);
12551   effect(USE meth);
12552 
12553   ins_cost(300);
12554   format %{ "call_leaf_nofp,runtime " %}
12555   ins_encode(clear_avx, Java_To_Runtime(meth));
12556   ins_pipe(pipe_slow);
12557 %}
12558 
12559 // Return Instruction
12560 // Remove the return address & jump to it.
12561 // Notice: We always emit a nop after a ret to make sure there is room
12562 // for safepoint patching
12563 instruct Ret()
12564 %{
12565   match(Return);
12566 
12567   format %{ "ret" %}
12568   ins_encode %{
12569     __ ret(0);

  473 }
  474 
  475 // !!!!! Special hack to get all types of calls to specify the byte offset
  476 //       from the start of the call to the point where the return address
  477 //       will point.
      //
      // Static Java call: a fixed 5 bytes plus clear_avx_size(), which accounts
      // for the vzeroupper that may be emitted ahead of the call.
  478 int MachCallStaticJavaNode::ret_addr_offset()
  479 {
  480   int offset = 5; // 5 bytes from start of call to where return address points
  481   offset += clear_avx_size();
  482   return offset;
  483 }
  484 
  485 int MachCallDynamicJavaNode::ret_addr_offset()
  486 {
        // Byte distance from the start of the dynamic call sequence to its return
        // address: a fixed 15 bytes plus clear_avx_size().
        // NOTE(review): 15 presumably is a 10-byte movq followed by a 5-byte call,
        // consistent with the 11 bytes ("movq instruction + call opcode byte")
        // skipped in CallDynamicJavaDirectNode::compute_padding -- confirm.
  487   int offset = 15; // 15 bytes from start of call to where return address points
  488   offset += clear_avx_size();
  489   return offset;
  490 }
  491 
  492 int MachCallRuntimeNode::ret_addr_offset() {
        // Byte offset from the start of the runtime call sequence to its return
        // address. A null _entry_point identifies the register-indirect call
        // form, which is just a 3-byte callq with no preceding movq and no
        // clear_avx_size() added. Otherwise every runtime call adds
        // clear_avx_size() except CallLeafVector.
  493   if (_entry_point == nullptr) {
  494     // CallLeafNoFPInDirect
  495     return 3; // callq (register)
  496   }
  497   int offset = 13; // movq r10,#addr; callq (r10)
  498   if (this->ideal_Opcode() != Op_CallLeafVector) {
  499     offset += clear_avx_size();
  500   }
  501   return offset;
  502 }
  503 
  504 //
  505 // Compute padding required for nodes which need alignment
  506 //
  507 
  508 // The address of the call instruction needs to be 4-byte aligned to
  509 // ensure that it does not span a cache line so that it can be patched.
      //
      // Returns the number of padding bytes to insert at current_offset. The
      // alignment is applied to the position after the optional vzeroupper and
      // the one-byte call opcode, i.e. to the bytes that follow the opcode.
  510 int CallStaticJavaDirectNode::compute_padding(int current_offset) const
  511 {
  512   current_offset += clear_avx_size(); // skip vzeroupper
  513   current_offset += 1; // skip call opcode byte
  514   return align_up(current_offset, alignment_required()) - current_offset;
  515 }
  516 
  517 // The address of the call instruction needs to be 4-byte aligned to
  518 // ensure that it does not span a cache line so that it can be patched.
  519 int CallDynamicJavaDirectNode::compute_padding(int current_offset) const
  520 {
  521   current_offset += clear_avx_size(); // skip vzeroupper
  522   current_offset += 11; // skip movq instruction + call opcode byte
  523   return align_up(current_offset, alignment_required()) - current_offset;

  711     st->print("# stack alignment check");
  712 #endif
  713   }
  714   if (C->stub_function() != nullptr && BarrierSet::barrier_set()->barrier_set_nmethod() != nullptr) {
  715     st->print("\n\t");
  716     st->print("cmpl    [r15_thread + #disarmed_guard_value_offset], #disarmed_guard_value\t");
  717     st->print("\n\t");
  718     st->print("je      fast_entry\t");
  719     st->print("\n\t");
  720     st->print("call    #nmethod_entry_barrier_stub\t");
  721     st->print("\n\tfast_entry:");
  722   }
  723   st->cr();
  724 }
  725 #endif
  726 
  727 void MachPrologNode::emit(CodeBuffer &cbuf, PhaseRegAlloc *ra_) const {
        // Emits the prolog into cbuf: the verified entry sequence (driven by the
        // Compile object), the entry barrier for non-stub compilations, the
        // binding of the verified-entry label, and then bookkeeping on the
        // compilation's output.
  728   Compile* C = ra_->C;
  729   C2_MacroAssembler _masm(&cbuf);
  730 
  731   __ verified_entry(C);













  732 
        // Stub compilations (stub_function() != nullptr) get no entry barrier.
  733   if (ra_->C->stub_function() == nullptr) {
  734     __ entry_barrier();
  735   }
  736 
        // The label is left unbound while only the scratch size is being
        // measured.
  737   if (!Compile::current()->output()->in_scratch_emit_size()) {
  738     __ bind(*_verified_entry);
  739   }
  740 
        // Record the code offset reached once the entry sequence has been emitted.
  741   C->output()->set_frame_complete(cbuf.insts_size());
  742 
  743   if (C->has_mach_constant_base_node()) {
  744     // NOTE: We set the table base offset here because users might be
  745     // emitted before MachConstantBaseNode.
  746     ConstantTable& constant_table = C->output()->constant_table();
  747     constant_table.set_table_base_offset(constant_table.calculate_table_base_offset());
  748   }
  749 }
  750 






  751 int MachPrologNode::reloc() const
  752 {
        // NOTE(review): presumably the number of relocation entries to reserve
        // for this node; the prolog reports 0 -- confirm against the callers.
  753   return 0; // a large enough number
  754 }
  755 
  756 //=============================================================================
  757 #ifndef PRODUCT
  758 void MachEpilogNode::format(PhaseRegAlloc* ra_, outputStream* st) const
  759 {
  760   Compile* C = ra_->C;
  761   if (generate_vzeroupper(C)) {
  762     st->print("vzeroupper");
  763     st->cr(); st->print("\t");
  764   }
  765 
  766   int framesize = C->output()->frame_size_in_bytes();
  767   assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  768   // Remove word for return adr already pushed
  769   // and RBP
  770   framesize -= 2*wordSize;

  778   if (do_polling() && C->is_method_compilation()) {
  779     st->print("\t");
  780     st->print_cr("cmpq    rsp, poll_offset[r15_thread] \n\t"
  781                  "ja      #safepoint_stub\t"
  782                  "# Safepoint: poll for GC");
  783   }
  784 }
  785 #endif
  786 
  787 void MachEpilogNode::emit(CodeBuffer& cbuf, PhaseRegAlloc* ra_) const
  788 {
        // Emits the method epilog: optional vzeroupper, frame removal through
        // remove_frame(), optional reserved-stack check, and the return-time
        // safepoint poll.
  789   Compile* C = ra_->C;
  790   MacroAssembler _masm(&cbuf);
  791 
  792   if (generate_vzeroupper(C)) {
  793     // Clear upper bits of YMM registers when current compiled code uses
  794     // wide vectors to avoid AVX <-> SSE transition penalty during call.
  795     __ vzeroupper();
  796   }
  797 
  798   // Subtract two words to account for return address and rbp
  799   int initial_framesize = C->output()->frame_size_in_bytes() - 2*wordSize;
  800   __ remove_frame(initial_framesize, C->needs_stack_repair());










  801 
  802   if (StackReservedPages > 0 && C->has_reserved_stack_access()) {
  803     __ reserved_stack_check();
  804   }
  805 
  806   if (do_polling() && C->is_method_compilation()) {
          // NOTE(review): this declaration shadows the _masm created at the top
          // of the function; both wrap the same cbuf.
  807     MacroAssembler _masm(&cbuf);
  808     Label dummy_label;
  809     Label* code_stub = &dummy_label;
          // While only the scratch size is being measured no stub is registered
          // and the poll targets dummy_label instead.
  810     if (!C->output()->in_scratch_emit_size()) {
  811       C2SafepointPollStub* stub = new (C->comp_arena()) C2SafepointPollStub(__ offset());
  812       C->output()->add_stub(stub);
  813       code_stub = &stub->entry();
  814     }
  815     __ relocate(relocInfo::poll_return_type);
  816     __ safepoint_poll(*code_stub, r15_thread, true /* at_return */, true /* in_nmethod */);
  817   }
  818 }
  819 






  // Relocation count reported for the method epilog. The value is a fixed
  // constant; presumably it is an upper-bound estimate (emit() above issues a
  // poll_return relocation) -- TODO confirm against the callers of reloc().
  820 int MachEpilogNode::reloc() const
  821 {
  822   return 2; // a large enough number
  823 }
  824 
  // Returns the pipeline description used for the epilog; it simply forwards
  // to MachNode::pipeline_class().
  825 const Pipeline* MachEpilogNode::pipeline() const
  826 {
  827   return MachNode::pipeline_class();
  828 }
  829 
  830 //=============================================================================
  831 
  // Register-class tags. Presumably used by the spill-copy code that follows
  // this enum to classify the source and destination of a copy -- the users
  // are not visible in this excerpt, TODO confirm.
  832 enum RC {
  833   rc_bad,
  834   rc_int,
  835   rc_kreg,
  836   rc_float,
  837   rc_stack
  838 };
  839 

 1422   st->print("leaq    %s, [rsp + #%d]\t# box lock",
 1423             Matcher::regName[reg], offset);
 1424 }
 1425 #endif
 1426 
 // Materializes the rsp-relative address of this node's box slot in the
 // register assigned to the node, using a single lea.
 1427 void BoxLockNode::emit(CodeBuffer& cbuf, PhaseRegAlloc* ra_) const
 1428 {
 1429   const int box_disp = ra_->reg2offset(in_RegMask(0).find_first_elem());
 1430   const Register dst_reg = as_Register(ra_->get_encode(this));
 1431 
 1432   MacroAssembler masm(&cbuf);
 1433   masm.lea(dst_reg, Address(rsp, box_disp));
 1434 }
 1435 
 // Size in bytes of the lea produced by emit(): 5 when the rsp displacement
 // is below 0x80, 8 otherwise.
 1436 uint BoxLockNode::size(PhaseRegAlloc *ra_) const
 1437 {
 1438   const int box_disp = ra_->reg2offset(in_RegMask(0).find_first_elem());
 1439   return (box_disp >= 0x80) ? 8 : 5; // REX
 1440 }
 1441 
 1442 //=============================================================================
 1443 #ifndef PRODUCT
 // Non-PRODUCT listing support: prints only the node name; the instructions
 // generated by MachVEPNode::emit are not spelled out.
 1444 void MachVEPNode::format(PhaseRegAlloc* ra_, outputStream* st) const
 1445 {
 1446   st->print_cr("MachVEPNode");
 1447 }
 1448 #endif
 1449 
 // Emits one entry point of a method that takes inline-type arguments.
 //  - _verified == false: klass check of the object in j_rarg0 against rax,
 //    branching to the IC miss stub on mismatch.
 //  - _verified == true: unpacks inline-type arguments passed as oops, sets up
 //    the frame and jumps to the verified entry label.
 // In both cases the emitted code is padded with nops to a multiple of 4 bytes.
 1450 void MachVEPNode::emit(CodeBuffer& cbuf, PhaseRegAlloc* ra_) const
 1451 {
 1452   C2_MacroAssembler _masm(&cbuf);
        // Code size on entry; the padding below is computed relative to it.
 1453   uint insts_size = cbuf.insts_size();
 1454   if (!_verified) {
 1455     if (UseCompressedClassPointers) {
 1456       __ load_klass(rscratch1, j_rarg0, rscratch2);
 1457       __ cmpptr(rax, rscratch1);
 1458     } else {
 1459       __ cmpptr(rax, Address(j_rarg0, oopDesc::klass_offset_in_bytes()));
 1460     }
 1461     __ jump_cc(Assembler::notEqual, RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
 1462   } else {
 1463     // TODO 8284443 Avoid creation of temporary frame
 1464     if (ra_->C->stub_function() == nullptr) {
            // Not a stub compilation: build a frame (sp_inc == 0), emit the entry
            // barrier inside it, then tear the frame down again.
 1465       __ verified_entry(ra_->C, 0);
 1466       __ entry_barrier();
 1467       int initial_framesize = ra_->C->output()->frame_size_in_bytes() - 2*wordSize;
 1468       __ remove_frame(initial_framesize, false);
 1469     }
 1470     // Unpack inline type args passed as oop and then jump to
 1471     // the verified entry point (skipping the unverified entry).
 1472     int sp_inc = __ unpack_inline_args(ra_->C, _receiver_only);
 1473     // Emit code for verified entry and save increment for stack repair on return
 1474     __ verified_entry(ra_->C, sp_inc);
 1475     if (Compile::current()->output()->in_scratch_emit_size()) {
            // Scratch size measurement: jump to a local placeholder label
            // instead of *_verified_entry.
 1476       Label dummy_verified_entry;
 1477       __ jmp(dummy_verified_entry);
 1478     } else {
 1479       __ jmp(*_verified_entry);
 1480     }
 1481   }
 1482   /* WARNING these NOPs are critical so that verified entry point is properly
 1483      4 bytes aligned for patching by NativeJump::patch_verified_entry() */
 1484   int nops_cnt = 4 - ((cbuf.insts_size() - insts_size) & 0x3);
 1485   nops_cnt &= 0x3; // Do not add nops if code is aligned.
 1486   if (nops_cnt > 0) {
 1487     __ nop(nops_cnt);
 1488   }
 1489 }
 1490 
 1491 //=============================================================================
 1492 #ifndef PRODUCT
 // Non-PRODUCT listing of the unverified entry point: load the klass of the
 // object in j_rarg0, compare it with the speculated klass stored in the
 // CompiledICData pointed to by rax, and branch to the IC miss stub when they
 // differ. A 32-bit load/compare is printed with compressed class pointers,
 // a 64-bit one otherwise.
 1493 void MachUEPNode::format(PhaseRegAlloc* ra_, outputStream* st) const
 1494 {
 1495   if (UseCompressedClassPointers) {
 1496     st->print_cr("movl    rscratch1, [j_rarg0 + oopDesc::klass_offset_in_bytes()]\t# compressed klass");
 1497     st->print_cr("\tcmpl    rscratch1, [rax + CompiledICData::speculated_klass_offset()]\t # Inline cache check");
 1498   } else {
          // Full-width klass pointer in this branch, so it is not labelled "compressed".
 1499     st->print_cr("movq    rscratch1, [j_rarg0 + oopDesc::klass_offset_in_bytes()]\t# klass");
 1500     st->print_cr("\tcmpq    rscratch1, [rax + CompiledICData::speculated_klass_offset()]\t # Inline cache check");
 1501   }
 1502   st->print_cr("\tjne     SharedRuntime::_ic_miss_stub");
 1503 }
 1504 #endif
 1505 
 // Emits the unverified entry point by delegating to
 // MacroAssembler::ic_check, passing InteriorEntryAlignment as its argument.
 1506 void MachUEPNode::emit(CodeBuffer& cbuf, PhaseRegAlloc* ra_) const
 1507 {
 1508   MacroAssembler masm(&cbuf);
 1509   masm.ic_check(InteriorEntryAlignment);
 1510 }
 1511 







 1512 //=============================================================================
 1513 
 // True only when both EnableVectorSupport and UseVectorStubs are set.
 1514 bool Matcher::supports_vector_calling_convention(void) {
 1515   if (!EnableVectorSupport || !UseVectorStubs) {
 1516     return false;
 1517   }
 1518   return true;
 1519 }
 1520 
 // Register pair holding a vector return value: always starts at XMM0_num;
 // the high end depends on the ideal register kind (VecX, VecY or VecZ) and
 // is XMM0b_num for anything else.
 1521 OptoRegPair Matcher::vector_return_value(uint ideal_reg) {
 1522   assert(EnableVectorSupport && UseVectorStubs, "sanity");
 1523   const int first_slot = XMM0_num;
 1524   const int last_slot = (ideal_reg == Op_VecX) ? XMM0d_num
 1525                       : (ideal_reg == Op_VecY) ? XMM0h_num
 1526                       : (ideal_reg == Op_VecZ) ? XMM0p_num
 1527                       :                          XMM0b_num;
 1528   return OptoRegPair(last_slot, first_slot);
 1529 }
 1530 
 1531 // Is this branch offset short enough that a short branch can be used?

 3096   %}
 3097 %}
 3098 
 3099 // Indirect Memory Times Scale Plus Positive Index Register Plus Offset Operand
 // Addressing mode [reg + (idx << scale) + off] for an int index that is
 // widened with ConvI2L. The predicate walks from the outer AddP down to that
 // ConvI2L node and only accepts it when the lower bound of its long type is
 // non-negative.
 3100 operand indPosIndexScaleOffset(any_RegP reg, immL32 off, rRegI idx, immI2 scale)
 3101 %{
 3102   constraint(ALLOC_IN_RC(ptr_reg));
 3103   predicate(n->in(2)->in(3)->in(1)->as_Type()->type()->is_long()->_lo >= 0);
 3104   match(AddP (AddP reg (LShiftL (ConvI2L idx) scale)) off);
 3105 
 3106   op_cost(10);
 3107   format %{"[$reg + $off + $idx << $scale]" %}
 3108   interface(MEMORY_INTER) %{
 3109     base($reg);
 3110     index($idx);
 3111     scale($scale);
 3112     disp($off);
 3113   %}
 3114 %}
 3115 
 3116 // Indirect Narrow Oop Operand
 // Folds a DecodeN into the addressing mode as [R12 + reg << 3]. Only selected
 // when compressed oops are in use and the decode shift equals
 // Address::times_8, so that it agrees with the fixed scale(0x3) below.
 3117 operand indCompressedOop(rRegN reg) %{
 3118   predicate(UseCompressedOops && (CompressedOops::shift() == Address::times_8));
 3119   constraint(ALLOC_IN_RC(ptr_reg));
 3120   match(DecodeN reg);
 3121 
 3122   op_cost(10);
 3123   format %{"[R12 + $reg << 3] (compressed oop addressing)" %}
 3124   interface(MEMORY_INTER) %{
 3125     base(0xc); // R12
 3126     index($reg);
 3127     scale(0x3);
 3128     disp(0x0);
 3129   %}
 3130 %}
 3131 
 3132 // Indirect Narrow Oop Plus Offset Operand
 3133 // Note: x86 architecture doesn't support "scale * index + offset" without a base
 3134 // we can't free r12 even with CompressedOops::base() == nullptr.
 // Same as indCompressedOop, but additionally folds a 32-bit immediate offset
 // into the displacement: [R12 + reg << 3 + off].
 3135 operand indCompressedOopOffset(rRegN reg, immL32 off) %{
 3136   predicate(UseCompressedOops && (CompressedOops::shift() == Address::times_8));
 3137   constraint(ALLOC_IN_RC(ptr_reg));
 3138   match(AddP (DecodeN reg) off);
 3139 
 3140   op_cost(10);
 3141   format %{"[R12 + $reg << 3 + $off] (compressed oop addressing)" %}
 3142   interface(MEMORY_INTER) %{
 3143     base(0xc); // R12
 3144     index($reg);
 3145     scale(0x3);
 3146     disp($off);
 3147   %}
 3148 %}
 3149 
 3150 // Indirect Memory Operand
 3151 operand indirectNarrow(rRegN reg)

 3458     equal(0x4, "e");
 3459     not_equal(0x5, "ne");
 3460     less(0x2, "b");
 3461     greater_equal(0x3, "ae");
 3462     less_equal(0x6, "be");
 3463     greater(0x7, "a");
 3464     overflow(0x0, "o");
 3465     no_overflow(0x1, "no");
 3466   %}
 3467 %}
 3468 
 3469 //----------OPERAND CLASSES----------------------------------------------------
 3470 // Operand Classes are groups of operands that are used to simplify
 3471 // instruction definitions by not requiring the AD writer to specify separate
 3472 // instructions for every form of operand when the instruction accepts
 3473 // multiple operand types with the same basic encoding and format.  The classic
 3474 // case of this is memory operands.
 3475 
 // The `memory` class groups the addressing-mode operands listed below, so an
 // instruction declared with a `memory` operand accepts any of them.
 3476 opclass memory(indirect, indOffset8, indOffset32, indIndexOffset, indIndex,
 3477                indIndexScale, indPosIndexScale, indIndexScaleOffset, indPosIndexOffset, indPosIndexScaleOffset,
 3478                indCompressedOop, indCompressedOopOffset,
 3479                indirectNarrow, indOffset8Narrow, indOffset32Narrow,
 3480                indIndexOffsetNarrow, indIndexNarrow, indIndexScaleNarrow,
 3481                indIndexScaleOffsetNarrow, indPosIndexOffsetNarrow, indPosIndexScaleOffsetNarrow);
 3482 
 3483 //----------PIPELINE-----------------------------------------------------------
 3484 // Rules which define the behavior of the target architectures pipeline.
 3485 pipeline %{
 3486 
 3487 //----------ATTRIBUTES---------------------------------------------------------
 3488 attributes %{
 3489   variable_size_instructions;        // Instructions are variable size
 3490   max_instructions_per_bundle = 3;   // Up to 3 instructions per bundle
 3491   instruction_unit_size = 1;         // Instruction size unit is 1 byte
 3492   instruction_fetch_unit_size = 16;  // The processor fetches one line
 3493   instruction_fetch_units = 1;       // of 16 bytes
 3494 
 3495   // List of nop instructions
 3496   nops( MachNop );
 3497 %}
 3498 

 5989   format %{ "MEMBAR-storestore (empty encoding)" %}
 5990   ins_encode( );
 5991   ins_pipe(empty);
 5992 %}
 5993 
 5994 //----------Move Instructions--------------------------------------------------
 5995 
 // CastX2P (long -> pointer): a plain register move that is skipped entirely
 // when source and destination were allocated to the same register.
 5996 instruct castX2P(rRegP dst, rRegL src)
 5997 %{
 5998   match(Set dst (CastX2P src));
 5999 
 6000   format %{ "movq    $dst, $src\t# long->ptr" %}
 6001   ins_encode %{
 6002     if ($dst$$reg != $src$$reg) {
 6003       __ movptr($dst$$Register, $src$$Register);
 6004     }
 6005   %}
 6006   ins_pipe(ialu_reg_reg); // XXX
 6007 %}
 6008 
 // CastP2X variant whose source operand is an rRegN register (presumably a
 // narrow pointer -- TODO confirm); otherwise identical to castP2X: a register
 // move that is skipped when source and destination coincide.
 // NOTE(review): the format text still says "ptr -> long" for this variant.
 6009 instruct castN2X(rRegL dst, rRegN src)
 6010 %{
 6011   match(Set dst (CastP2X src));
 6012 
 6013   format %{ "movq    $dst, $src\t# ptr -> long" %}
 6014   ins_encode %{
 6015     if ($dst$$reg != $src$$reg) {
 6016       __ movptr($dst$$Register, $src$$Register);
 6017     }
 6018   %}
 6019   ins_pipe(ialu_reg_reg); // XXX
 6020 %}
 6021 
 // CastP2X (pointer -> long): a plain register move that is skipped entirely
 // when source and destination were allocated to the same register.
 6022 instruct castP2X(rRegL dst, rRegP src)
 6023 %{
 6024   match(Set dst (CastP2X src));
 6025 
 6026   format %{ "movq    $dst, $src\t# ptr -> long" %}
 6027   ins_encode %{
 6028     if ($dst$$reg != $src$$reg) {
 6029       __ movptr($dst$$Register, $src$$Register);
 6030     }
 6031   %}
 6032   ins_pipe(ialu_reg_reg); // XXX
 6033 %}
 6034 
 6035 // Convert oop into int for vectors alignment masking
 6036 instruct convP2I(rRegI dst, rRegP src)
 6037 %{
 6038   match(Set dst (ConvL2I (CastP2X src)));
 6039 
 6040   format %{ "movl    $dst, $src\t# ptr -> int" %}
 6041   ins_encode %{

10543   effect(DEF dst, USE src);
10544   ins_cost(100);
10545   format %{ "movd    $dst,$src\t# MoveI2F" %}
10546   ins_encode %{
10547     __ movdl($dst$$XMMRegister, $src$$Register);
10548   %}
10549   ins_pipe( pipe_slow );
10550 %}
10551 
// MoveL2D, register to register: moves a long from a general-purpose register
// into an XMM register via movdq.
// NOTE(review): the format text prints "movd" while the encoding calls movdq.
10552 instruct MoveL2D_reg_reg(regD dst, rRegL src) %{
10553   match(Set dst (MoveL2D src));
10554   effect(DEF dst, USE src);
10555   ins_cost(100);
10556   format %{ "movd    $dst,$src\t# MoveL2D" %}
10557   ins_encode %{
10558      __ movdq($dst$$XMMRegister, $src$$Register);
10559   %}
10560   ins_pipe( pipe_slow );
10561 %}
10562 
10563 
10564 // Fast clearing of an array
10565 // Small ClearArray non-AVX512.
// Selected when the ClearArray node is neither is_large() nor word_copy_only()
// and UseAVX <= 2. cnt, base and val are consumed and clobbered (USE_KILL);
// tmp is an XMM scratch register. The encoding is a single clear_mem call; its
// two boolean arguments line up with the predicate (presumably is_large and
// word_copy_only -- clear_mem's signature is not visible here).
// NOTE(review): the format template is listing text only; its XMM path still
// shows stores through rax and xmm0 -- confirm it matches what clear_mem emits.
10566 instruct rep_stos(rcx_RegL cnt, rdi_RegP base, regD tmp, rax_RegL val,
10567                   Universe dummy, rFlagsReg cr)
10568 %{
10569   predicate(!((ClearArrayNode*)n)->is_large() && !((ClearArrayNode*)n)->word_copy_only() && (UseAVX <= 2));
10570   match(Set dummy (ClearArray (Binary cnt base) val));
10571   effect(USE_KILL cnt, USE_KILL base, TEMP tmp, USE_KILL val, KILL cr);
10572 
10573   format %{ $$template
10574     $$emit$$"cmp     InitArrayShortSize,rcx\n\t"
10575     $$emit$$"jg      LARGE\n\t"
10576     $$emit$$"dec     rcx\n\t"
10577     $$emit$$"js      DONE\t# Zero length\n\t"
10578     $$emit$$"mov     rax,(rdi,rcx,8)\t# LOOP\n\t"
10579     $$emit$$"dec     rcx\n\t"
10580     $$emit$$"jge     LOOP\n\t"
10581     $$emit$$"jmp     DONE\n\t"
10582     $$emit$$"# LARGE:\n\t"
10583     if (UseFastStosb) {
10584        $$emit$$"shlq    rcx,3\t# Convert doublewords to bytes\n\t"
10585        $$emit$$"rep     stosb\t# Store rax to *rdi++ while rcx--\n\t"
10586     } else if (UseXMMForObjInit) {
10587        $$emit$$"movdq   $tmp, $val\n\t"
10588        $$emit$$"punpcklqdq $tmp, $tmp\n\t"
10589        $$emit$$"vinserti128_high $tmp, $tmp\n\t"
10590        $$emit$$"jmpq    L_zero_64_bytes\n\t"
10591        $$emit$$"# L_loop:\t# 64-byte LOOP\n\t"
10592        $$emit$$"vmovdqu $tmp,(rax)\n\t"
10593        $$emit$$"vmovdqu $tmp,0x20(rax)\n\t"
10594        $$emit$$"add     0x40,rax\n\t"
10595        $$emit$$"# L_zero_64_bytes:\n\t"
10596        $$emit$$"sub     0x8,rcx\n\t"
10597        $$emit$$"jge     L_loop\n\t"
10598        $$emit$$"add     0x4,rcx\n\t"
10599        $$emit$$"jl      L_tail\n\t"
10600        $$emit$$"vmovdqu $tmp,(rax)\n\t"
10601        $$emit$$"add     0x20,rax\n\t"
10602        $$emit$$"sub     0x4,rcx\n\t"
10603        $$emit$$"# L_tail:\t# Clearing tail bytes\n\t"
10604        $$emit$$"add     0x4,rcx\n\t"
10605        $$emit$$"jle     L_end\n\t"
10606        $$emit$$"dec     rcx\n\t"
10607        $$emit$$"# L_sloop:\t# 8-byte short loop\n\t"
10608        $$emit$$"vmovq   xmm0,(rax)\n\t"
10609        $$emit$$"add     0x8,rax\n\t"
10610        $$emit$$"dec     rcx\n\t"
10611        $$emit$$"jge     L_sloop\n\t"
10612        $$emit$$"# L_end:\n\t"
10613     } else {
10614        $$emit$$"rep     stosq\t# Store rax to *rdi++ while rcx--\n\t"
10615     }
10616     $$emit$$"# DONE"
10617   %}
10618   ins_encode %{
10619     __ clear_mem($base$$Register, $cnt$$Register, $val$$Register,
10620                  $tmp$$XMMRegister, false, false);
10621   %}
10622   ins_pipe(pipe_slow);
10623 %}
10624 
// Small ClearArray, non-AVX512, word-copy-only variant: selected when the
// node is not is_large(), is word_copy_only(), and UseAVX <= 2. Unlike
// rep_stos, the format template has no UseFastStosb (rep stosb) branch, and
// clear_mem is called with (false, true) as its trailing boolean arguments
// (presumably is_large and word_copy_only -- signature not visible here).
10625 instruct rep_stos_word_copy(rcx_RegL cnt, rdi_RegP base, regD tmp, rax_RegL val,
10626                             Universe dummy, rFlagsReg cr)
10627 %{
10628   predicate(!((ClearArrayNode*)n)->is_large() && ((ClearArrayNode*)n)->word_copy_only() && (UseAVX <= 2));
10629   match(Set dummy (ClearArray (Binary cnt base) val));
10630   effect(USE_KILL cnt, USE_KILL base, TEMP tmp, USE_KILL val, KILL cr);
10631 
10632   format %{ $$template
10633     $$emit$$"cmp     InitArrayShortSize,rcx\n\t"
10634     $$emit$$"jg      LARGE\n\t"
10635     $$emit$$"dec     rcx\n\t"
10636     $$emit$$"js      DONE\t# Zero length\n\t"
10637     $$emit$$"mov     rax,(rdi,rcx,8)\t# LOOP\n\t"
10638     $$emit$$"dec     rcx\n\t"
10639     $$emit$$"jge     LOOP\n\t"
10640     $$emit$$"jmp     DONE\n\t"
10641     $$emit$$"# LARGE:\n\t"
10642     if (UseXMMForObjInit) {
10643        $$emit$$"movdq   $tmp, $val\n\t"
10644        $$emit$$"punpcklqdq $tmp, $tmp\n\t"
10645        $$emit$$"vinserti128_high $tmp, $tmp\n\t"
10646        $$emit$$"jmpq    L_zero_64_bytes\n\t"
10647        $$emit$$"# L_loop:\t# 64-byte LOOP\n\t"
10648        $$emit$$"vmovdqu $tmp,(rax)\n\t"
10649        $$emit$$"vmovdqu $tmp,0x20(rax)\n\t"
10650        $$emit$$"add     0x40,rax\n\t"
10651        $$emit$$"# L_zero_64_bytes:\n\t"
10652        $$emit$$"sub     0x8,rcx\n\t"
10653        $$emit$$"jge     L_loop\n\t"
10654        $$emit$$"add     0x4,rcx\n\t"
10655        $$emit$$"jl      L_tail\n\t"
10656        $$emit$$"vmovdqu $tmp,(rax)\n\t"
10657        $$emit$$"add     0x20,rax\n\t"
10658        $$emit$$"sub     0x4,rcx\n\t"
10659        $$emit$$"# L_tail:\t# Clearing tail bytes\n\t"
10660        $$emit$$"add     0x4,rcx\n\t"
10661        $$emit$$"jle     L_end\n\t"
10662        $$emit$$"dec     rcx\n\t"
10663        $$emit$$"# L_sloop:\t# 8-byte short loop\n\t"
10664        $$emit$$"vmovq   xmm0,(rax)\n\t"
10665        $$emit$$"add     0x8,rax\n\t"
10666        $$emit$$"dec     rcx\n\t"
10667        $$emit$$"jge     L_sloop\n\t"
10668        $$emit$$"# L_end:\n\t"
10669     } else {
10670        $$emit$$"rep     stosq\t# Store rax to *rdi++ while rcx--\n\t"
10671     }
10672     $$emit$$"# DONE"
10673   %}
10674   ins_encode %{
10675     __ clear_mem($base$$Register, $cnt$$Register, $val$$Register,
10676                  $tmp$$XMMRegister, false, true);
10677   %}
10678   ins_pipe(pipe_slow);
10679 %}
10680 
10681 // Small ClearArray AVX512 non-constant length.
10682 instruct rep_stos_evex(rcx_RegL cnt, rdi_RegP base, legRegD tmp, kReg ktmp, rax_RegL val,
10683                        Universe dummy, rFlagsReg cr)
10684 %{
10685   predicate(!((ClearArrayNode*)n)->is_large() && !((ClearArrayNode*)n)->word_copy_only() && (UseAVX > 2));
10686   match(Set dummy (ClearArray (Binary cnt base) val));
10687   ins_cost(125);
10688   effect(USE_KILL cnt, USE_KILL base, TEMP tmp, TEMP ktmp, USE_KILL val, KILL cr);
10689 
10690   format %{ $$template
10691     $$emit$$"xorq    rax, rax\t# ClearArray:\n\t"
10692     $$emit$$"cmp     InitArrayShortSize,rcx\n\t"
10693     $$emit$$"jg      LARGE\n\t"
10694     $$emit$$"dec     rcx\n\t"
10695     $$emit$$"js      DONE\t# Zero length\n\t"
10696     $$emit$$"mov     rax,(rdi,rcx,8)\t# LOOP\n\t"
10697     $$emit$$"dec     rcx\n\t"
10698     $$emit$$"jge     LOOP\n\t"
10699     $$emit$$"jmp     DONE\n\t"
10700     $$emit$$"# LARGE:\n\t"
10701     if (UseFastStosb) {
10702        $$emit$$"shlq    rcx,3\t# Convert doublewords to bytes\n\t"
10703        $$emit$$"rep     stosb\t# Store rax to *rdi++ while rcx--\n\t"
10704     } else if (UseXMMForObjInit) {
10705        $$emit$$"mov     rdi,rax\n\t"
10706        $$emit$$"vpxor   ymm0,ymm0,ymm0\n\t"
10707        $$emit$$"jmpq    L_zero_64_bytes\n\t"
10708        $$emit$$"# L_loop:\t# 64-byte LOOP\n\t"

10716        $$emit$$"jl      L_tail\n\t"
10717        $$emit$$"vmovdqu ymm0,(rax)\n\t"
10718        $$emit$$"add     0x20,rax\n\t"
10719        $$emit$$"sub     0x4,rcx\n\t"
10720        $$emit$$"# L_tail:\t# Clearing tail bytes\n\t"
10721        $$emit$$"add     0x4,rcx\n\t"
10722        $$emit$$"jle     L_end\n\t"
10723        $$emit$$"dec     rcx\n\t"
10724        $$emit$$"# L_sloop:\t# 8-byte short loop\n\t"
10725        $$emit$$"vmovq   xmm0,(rax)\n\t"
10726        $$emit$$"add     0x8,rax\n\t"
10727        $$emit$$"dec     rcx\n\t"
10728        $$emit$$"jge     L_sloop\n\t"
10729        $$emit$$"# L_end:\n\t"
10730     } else {
10731        $$emit$$"rep     stosq\t# Store rax to *rdi++ while rcx--\n\t"
10732     }
10733     $$emit$$"# DONE"
10734   %}
10735   ins_encode %{
10736     __ clear_mem($base$$Register, $cnt$$Register, $val$$Register,
10737                  $tmp$$XMMRegister, false, false, $ktmp$$KRegister);
10738   %}
10739   ins_pipe(pipe_slow);
10740 %}
10741 
10742 instruct rep_stos_evex_word_copy(rcx_RegL cnt, rdi_RegP base, legRegD tmp, kReg ktmp, rax_RegL val,
10743                                  Universe dummy, rFlagsReg cr)

10744 %{
10745   predicate(!((ClearArrayNode*)n)->is_large() && ((ClearArrayNode*)n)->word_copy_only() && (UseAVX > 2));
10746   match(Set dummy (ClearArray (Binary cnt base) val));
10747   ins_cost(125);
10748   effect(USE_KILL cnt, USE_KILL base, TEMP tmp, TEMP ktmp, USE_KILL val, KILL cr);
10749 
10750   format %{ $$template
10751     $$emit$$"xorq    rax, rax\t# ClearArray:\n\t"
10752     $$emit$$"cmp     InitArrayShortSize,rcx\n\t"
10753     $$emit$$"jg      LARGE\n\t"
10754     $$emit$$"dec     rcx\n\t"
10755     $$emit$$"js      DONE\t# Zero length\n\t"
10756     $$emit$$"mov     rax,(rdi,rcx,8)\t# LOOP\n\t"
10757     $$emit$$"dec     rcx\n\t"
10758     $$emit$$"jge     LOOP\n\t"
10759     $$emit$$"jmp     DONE\n\t"
10760     $$emit$$"# LARGE:\n\t"
10761     if (UseFastStosb) {
10762        $$emit$$"shlq    rcx,3\t# Convert doublewords to bytes\n\t"
10763        $$emit$$"rep     stosb\t# Store rax to *rdi++ while rcx--\n\t"
10764     } else if (UseXMMForObjInit) {
10765        $$emit$$"mov     rdi,rax\n\t"
10766        $$emit$$"vpxor   ymm0,ymm0,ymm0\n\t"
10767        $$emit$$"jmpq    L_zero_64_bytes\n\t"
10768        $$emit$$"# L_loop:\t# 64-byte LOOP\n\t"

10776        $$emit$$"jl      L_tail\n\t"
10777        $$emit$$"vmovdqu ymm0,(rax)\n\t"
10778        $$emit$$"add     0x20,rax\n\t"
10779        $$emit$$"sub     0x4,rcx\n\t"
10780        $$emit$$"# L_tail:\t# Clearing tail bytes\n\t"
10781        $$emit$$"add     0x4,rcx\n\t"
10782        $$emit$$"jle     L_end\n\t"
10783        $$emit$$"dec     rcx\n\t"
10784        $$emit$$"# L_sloop:\t# 8-byte short loop\n\t"
10785        $$emit$$"vmovq   xmm0,(rax)\n\t"
10786        $$emit$$"add     0x8,rax\n\t"
10787        $$emit$$"dec     rcx\n\t"
10788        $$emit$$"jge     L_sloop\n\t"
10789        $$emit$$"# L_end:\n\t"
10790     } else {
10791        $$emit$$"rep     stosq\t# Store rax to *rdi++ while rcx--\n\t"
10792     }
10793     $$emit$$"# DONE"
10794   %}
10795   ins_encode %{
10796     __ clear_mem($base$$Register, $cnt$$Register, $val$$Register,
10797                  $tmp$$XMMRegister, false, true, $ktmp$$KRegister);
10798   %}
10799   ins_pipe(pipe_slow);
10800 %}
10801 
10802 // Large ClearArray non-AVX512.
// Selected when the ClearArray node is is_large(), not word_copy_only(), and
// UseAVX <= 2. The format template has no short-length loop in front of the
// bulk path. clear_mem is called with (true, false) as its trailing boolean
// arguments (presumably is_large and word_copy_only -- signature not visible
// here).
10803 instruct rep_stos_large(rcx_RegL cnt, rdi_RegP base, regD tmp, rax_RegL val,
10804                         Universe dummy, rFlagsReg cr)
10805 %{
10806   predicate(((ClearArrayNode*)n)->is_large() && !((ClearArrayNode*)n)->word_copy_only() && (UseAVX <= 2));
10807   match(Set dummy (ClearArray (Binary cnt base) val));
10808   effect(USE_KILL cnt, USE_KILL base, TEMP tmp, USE_KILL val, KILL cr);
10809 
10810   format %{ $$template
10811     if (UseFastStosb) {
10812        $$emit$$"shlq    rcx,3\t# Convert doublewords to bytes\n\t"
10813        $$emit$$"rep     stosb\t# Store rax to *rdi++ while rcx--"
10814     } else if (UseXMMForObjInit) {
10815        $$emit$$"movdq   $tmp, $val\n\t"
10816        $$emit$$"punpcklqdq $tmp, $tmp\n\t"
10817        $$emit$$"vinserti128_high $tmp, $tmp\n\t"
10818        $$emit$$"jmpq    L_zero_64_bytes\n\t"
10819        $$emit$$"# L_loop:\t# 64-byte LOOP\n\t"
10820        $$emit$$"vmovdqu $tmp,(rax)\n\t"
10821        $$emit$$"vmovdqu $tmp,0x20(rax)\n\t"
10822        $$emit$$"add     0x40,rax\n\t"
10823        $$emit$$"# L_zero_64_bytes:\n\t"
10824        $$emit$$"sub     0x8,rcx\n\t"
10825        $$emit$$"jge     L_loop\n\t"
10826        $$emit$$"add     0x4,rcx\n\t"
10827        $$emit$$"jl      L_tail\n\t"
10828        $$emit$$"vmovdqu $tmp,(rax)\n\t"
10829        $$emit$$"add     0x20,rax\n\t"
10830        $$emit$$"sub     0x4,rcx\n\t"
10831        $$emit$$"# L_tail:\t# Clearing tail bytes\n\t"
10832        $$emit$$"add     0x4,rcx\n\t"
10833        $$emit$$"jle     L_end\n\t"
10834        $$emit$$"dec     rcx\n\t"
10835        $$emit$$"# L_sloop:\t# 8-byte short loop\n\t"
10836        $$emit$$"vmovq   xmm0,(rax)\n\t"
10837        $$emit$$"add     0x8,rax\n\t"
10838        $$emit$$"dec     rcx\n\t"
10839        $$emit$$"jge     L_sloop\n\t"
10840        $$emit$$"# L_end:\n\t"
10841     } else {
10842        $$emit$$"rep     stosq\t# Store rax to *rdi++ while rcx--"
10843     }
10844   %}
10845   ins_encode %{
10846     __ clear_mem($base$$Register, $cnt$$Register, $val$$Register,
10847                  $tmp$$XMMRegister, true, false);
10848   %}
10849   ins_pipe(pipe_slow);
10850 %}
10851 
// Large ClearArray, non-AVX512, word-copy-only variant: selected when the
// node is is_large(), is word_copy_only(), and UseAVX <= 2. The format
// template has no UseFastStosb (rep stosb) branch, and clear_mem is called
// with (true, true) as its trailing boolean arguments (presumably is_large
// and word_copy_only -- signature not visible here).
10852 instruct rep_stos_large_word_copy(rcx_RegL cnt, rdi_RegP base, regD tmp, rax_RegL val,
10853                                   Universe dummy, rFlagsReg cr)
10854 %{
10855   predicate(((ClearArrayNode*)n)->is_large() && ((ClearArrayNode*)n)->word_copy_only() && (UseAVX <= 2));
10856   match(Set dummy (ClearArray (Binary cnt base) val));
10857   effect(USE_KILL cnt, USE_KILL base, TEMP tmp, USE_KILL val, KILL cr);
10858 
10859   format %{ $$template
10860     if (UseXMMForObjInit) {
10861        $$emit$$"movdq   $tmp, $val\n\t"
10862        $$emit$$"punpcklqdq $tmp, $tmp\n\t"
10863        $$emit$$"vinserti128_high $tmp, $tmp\n\t"
10864        $$emit$$"jmpq    L_zero_64_bytes\n\t"
10865        $$emit$$"# L_loop:\t# 64-byte LOOP\n\t"
10866        $$emit$$"vmovdqu $tmp,(rax)\n\t"
10867        $$emit$$"vmovdqu $tmp,0x20(rax)\n\t"
10868        $$emit$$"add     0x40,rax\n\t"
10869        $$emit$$"# L_zero_64_bytes:\n\t"
10870        $$emit$$"sub     0x8,rcx\n\t"
10871        $$emit$$"jge     L_loop\n\t"
10872        $$emit$$"add     0x4,rcx\n\t"
10873        $$emit$$"jl      L_tail\n\t"
10874        $$emit$$"vmovdqu $tmp,(rax)\n\t"
10875        $$emit$$"add     0x20,rax\n\t"
10876        $$emit$$"sub     0x4,rcx\n\t"
10877        $$emit$$"# L_tail:\t# Clearing tail bytes\n\t"
10878        $$emit$$"add     0x4,rcx\n\t"
10879        $$emit$$"jle     L_end\n\t"
10880        $$emit$$"dec     rcx\n\t"
10881        $$emit$$"# L_sloop:\t# 8-byte short loop\n\t"
10882        $$emit$$"vmovq   xmm0,(rax)\n\t"
10883        $$emit$$"add     0x8,rax\n\t"
10884        $$emit$$"dec     rcx\n\t"
10885        $$emit$$"jge     L_sloop\n\t"
10886        $$emit$$"# L_end:\n\t"
10887     } else {
10888        $$emit$$"rep     stosq\t# Store rax to *rdi++ while rcx--"
10889     }
10890   %}
10891   ins_encode %{
10892     __ clear_mem($base$$Register, $cnt$$Register, $val$$Register,
10893                  $tmp$$XMMRegister, true, true);
10894   %}
10895   ins_pipe(pipe_slow);
10896 %}
10897 
10898 // Large ClearArray AVX512.
// Selected when the ClearArray node is is_large(), not word_copy_only(), and
// UseAVX > 2. Compared with rep_stos_large it takes a legRegD scratch register
// and an extra mask-register temp (ktmp), which is forwarded to clear_mem.
// NOTE(review): the format template still prints "xorq rax, rax" and
// "vpxor ymm0,ymm0,ymm0" although $val arrives in rax as an input operand --
// confirm the listing text against what clear_mem emits.
10899 instruct rep_stos_large_evex(rcx_RegL cnt, rdi_RegP base, legRegD tmp, kReg ktmp, rax_RegL val,
10900                              Universe dummy, rFlagsReg cr)
10901 %{
10902   predicate(((ClearArrayNode*)n)->is_large() && !((ClearArrayNode*)n)->word_copy_only() && (UseAVX > 2));
10903   match(Set dummy (ClearArray (Binary cnt base) val));
10904   effect(USE_KILL cnt, USE_KILL base, TEMP tmp, TEMP ktmp, USE_KILL val, KILL cr);
10905 
10906   format %{ $$template
10907     if (UseFastStosb) {
10908        $$emit$$"xorq    rax, rax\t# ClearArray:\n\t"
10909        $$emit$$"shlq    rcx,3\t# Convert doublewords to bytes\n\t"
10910        $$emit$$"rep     stosb\t# Store rax to *rdi++ while rcx--"
10911     } else if (UseXMMForObjInit) {
10912        $$emit$$"mov     rdi,rax\t# ClearArray:\n\t"
10913        $$emit$$"vpxor   ymm0,ymm0,ymm0\n\t"
10914        $$emit$$"jmpq    L_zero_64_bytes\n\t"
10915        $$emit$$"# L_loop:\t# 64-byte LOOP\n\t"
10916        $$emit$$"vmovdqu ymm0,(rax)\n\t"
10917        $$emit$$"vmovdqu ymm0,0x20(rax)\n\t"
10918        $$emit$$"add     0x40,rax\n\t"
10919        $$emit$$"# L_zero_64_bytes:\n\t"
10920        $$emit$$"sub     0x8,rcx\n\t"
10921        $$emit$$"jge     L_loop\n\t"
10922        $$emit$$"add     0x4,rcx\n\t"
10923        $$emit$$"jl      L_tail\n\t"
10924        $$emit$$"vmovdqu ymm0,(rax)\n\t"
10925        $$emit$$"add     0x20,rax\n\t"
10926        $$emit$$"sub     0x4,rcx\n\t"
10927        $$emit$$"# L_tail:\t# Clearing tail bytes\n\t"
10928        $$emit$$"add     0x4,rcx\n\t"
10929        $$emit$$"jle     L_end\n\t"
10930        $$emit$$"dec     rcx\n\t"
10931        $$emit$$"# L_sloop:\t# 8-byte short loop\n\t"
10932        $$emit$$"vmovq   xmm0,(rax)\n\t"
10933        $$emit$$"add     0x8,rax\n\t"
10934        $$emit$$"dec     rcx\n\t"
10935        $$emit$$"jge     L_sloop\n\t"
10936        $$emit$$"# L_end:\n\t"
10937     } else {
10938        $$emit$$"xorq    rax, rax\t# ClearArray:\n\t"
10939        $$emit$$"rep     stosq\t# Store rax to *rdi++ while rcx--"
10940     }
10941   %}
10942   ins_encode %{
10943     __ clear_mem($base$$Register, $cnt$$Register, $val$$Register,
10944                  $tmp$$XMMRegister, true, false, $ktmp$$KRegister);
10945   %}
10946   ins_pipe(pipe_slow);
10947 %}
10948 
// Large ClearArray, AVX512, word-copy-only variant: selected when the node is
// is_large(), is word_copy_only(), and UseAVX > 2. clear_mem is called with
// (true, true) plus the ktmp mask register.
// NOTE(review): unlike the non-AVX512 word-copy variants, this format template
// still contains a UseFastStosb (rep stosb) branch and prints "xorq rax, rax";
// confirm the listing text against what clear_mem emits.
10949 instruct rep_stos_large_evex_word_copy(rcx_RegL cnt, rdi_RegP base, legRegD tmp, kReg ktmp, rax_RegL val,
10950                                        Universe dummy, rFlagsReg cr)

10951 %{
10952   predicate(((ClearArrayNode*)n)->is_large() && ((ClearArrayNode*)n)->word_copy_only() && (UseAVX > 2));
10953   match(Set dummy (ClearArray (Binary cnt base) val));
10954   effect(USE_KILL cnt, USE_KILL base, TEMP tmp, TEMP ktmp, USE_KILL val, KILL cr);
10955 
10956   format %{ $$template
10957     if (UseFastStosb) {
10958        $$emit$$"xorq    rax, rax\t# ClearArray:\n\t"
10959        $$emit$$"shlq    rcx,3\t# Convert doublewords to bytes\n\t"
10960        $$emit$$"rep     stosb\t# Store rax to *rdi++ while rcx--"
10961     } else if (UseXMMForObjInit) {
10962        $$emit$$"mov     rdi,rax\t# ClearArray:\n\t"
10963        $$emit$$"vpxor   ymm0,ymm0,ymm0\n\t"
10964        $$emit$$"jmpq    L_zero_64_bytes\n\t"
10965        $$emit$$"# L_loop:\t# 64-byte LOOP\n\t"
10966        $$emit$$"vmovdqu ymm0,(rax)\n\t"
10967        $$emit$$"vmovdqu ymm0,0x20(rax)\n\t"
10968        $$emit$$"add     0x40,rax\n\t"
10969        $$emit$$"# L_zero_64_bytes:\n\t"
10970        $$emit$$"sub     0x8,rcx\n\t"
10971        $$emit$$"jge     L_loop\n\t"
10972        $$emit$$"add     0x4,rcx\n\t"
10973        $$emit$$"jl      L_tail\n\t"
10974        $$emit$$"vmovdqu ymm0,(rax)\n\t"
10975        $$emit$$"add     0x20,rax\n\t"
10976        $$emit$$"sub     0x4,rcx\n\t"
10977        $$emit$$"# L_tail:\t# Clearing tail bytes\n\t"
10978        $$emit$$"add     0x4,rcx\n\t"
10979        $$emit$$"jle     L_end\n\t"
10980        $$emit$$"dec     rcx\n\t"
10981        $$emit$$"# L_sloop:\t# 8-byte short loop\n\t"
10982        $$emit$$"vmovq   xmm0,(rax)\n\t"
10983        $$emit$$"add     0x8,rax\n\t"
10984        $$emit$$"dec     rcx\n\t"
10985        $$emit$$"jge     L_sloop\n\t"
10986        $$emit$$"# L_end:\n\t"
10987     } else {
10988        $$emit$$"xorq    rax, rax\t# ClearArray:\n\t"
10989        $$emit$$"rep     stosq\t# Store rax to *rdi++ while rcx--"
10990     }
10991   %}
10992   ins_encode %{
10993     __ clear_mem($base$$Register, $cnt$$Register, $val$$Register,
10994                  $tmp$$XMMRegister, true, true, $ktmp$$KRegister);
10995   %}
10996   ins_pipe(pipe_slow);
10997 %}
10998 
10999 // Small ClearArray AVX512 constant length.
// Selected when the count is an immediate (immL), the node is neither
// is_large() nor word_copy_only(), UseAVX > 2 and the CPU reports
// avx512vlbw support. Its ins_cost of 100 is lower than the 125 declared by
// rep_stos_evex. The base register is only read (it is not in the effect
// list); val is consumed and clobbered.
11000 instruct rep_stos_im(immL cnt, rRegP base, regD tmp, rax_RegL val, kReg ktmp, Universe dummy, rFlagsReg cr)
11001 %{
11002   predicate(!((ClearArrayNode*)n)->is_large() && !((ClearArrayNode*)n)->word_copy_only() &&
11003             ((UseAVX > 2) && VM_Version::supports_avx512vlbw()));
11004   match(Set dummy (ClearArray (Binary cnt base) val));
11005   ins_cost(100);
11006   effect(TEMP tmp, USE_KILL val, TEMP ktmp, KILL cr);
11007   format %{ "clear_mem_imm $base , $cnt  \n\t" %}
11008   ins_encode %{
11009     __ clear_mem($base$$Register, $cnt$$constant, $val$$Register, $tmp$$XMMRegister, $ktmp$$KRegister);
11010   %}
11011   ins_pipe(pipe_slow);
11012 %}
11013 
// StrComp for the LL encoding on CPUs without avx512vlbw support. Both string
// pointers and both counts are consumed and clobbered; tmp1 is an XMM scratch
// register. knoreg is passed as the mask-register argument of string_compare.
11014 instruct string_compareL(rdi_RegP str1, rcx_RegI cnt1, rsi_RegP str2, rdx_RegI cnt2,
11015                          rax_RegI result, legRegD tmp1, rFlagsReg cr)
11016 %{
11017   predicate(!VM_Version::supports_avx512vlbw() && ((StrCompNode*)n)->encoding() == StrIntrinsicNode::LL);
11018   match(Set result (StrComp (Binary str1 cnt1) (Binary str2 cnt2)));
11019   effect(TEMP tmp1, USE_KILL str1, USE_KILL str2, USE_KILL cnt1, USE_KILL cnt2, KILL cr);
11020 
11021   format %{ "String Compare byte[] $str1,$cnt1,$str2,$cnt2 -> $result   // KILL $tmp1" %}
11022   ins_encode %{
11023     __ string_compare($str1$$Register, $str2$$Register,
11024                       $cnt1$$Register, $cnt2$$Register, $result$$Register,
11025                       $tmp1$$XMMRegister, StrIntrinsicNode::LL, knoreg);
11026   %}
11027   ins_pipe( pipe_slow );
11028 %}
11029 

12784 
12785   ins_cost(300);
12786   format %{ "call_leaf,runtime " %}
12787   ins_encode(clear_avx, Java_To_Runtime(meth));
12788   ins_pipe(pipe_slow);
12789 %}
12790 
12791 // Call runtime without safepoint and with vector arguments
// Unlike the other leaf calls visible here, the encoding has no clear_avx
// step. This agrees with MachCallRuntimeNode::ret_addr_offset(), which skips
// clear_avx_size() when the ideal opcode is Op_CallLeafVector.
12792 instruct CallLeafDirectVector(method meth)
12793 %{
12794   match(CallLeafVector);
12795   effect(USE meth);
12796 
12797   ins_cost(300);
12798   format %{ "call_leaf,vector " %}
12799   ins_encode(Java_To_Runtime(meth));
12800   ins_pipe(pipe_slow);
12801 %}
12802 
12803 // Call runtime without safepoint
12804 // entry point is null, target holds the address to call
// Selected only when the call node has no static entry point; the call is
// emitted as an indirect call through the $target register, with no
// clear_avx step in the encoding.
12805 instruct CallLeafNoFPInDirect(rRegP target)
12806 %{
12807   predicate(n->as_Call()->entry_point() == nullptr);
12808   match(CallLeafNoFP target);
12809 
12810   ins_cost(300);
12811   format %{ "call_leaf_nofp,runtime indirect " %}
12812   ins_encode %{
12813      __ call($target$$Register);
12814   %}
12815 
12816   ins_pipe(pipe_slow);
12817 %}
12818 
// CallLeafNoFP with a known entry point (the counterpart of
// CallLeafNoFPInDirect): the encoding runs clear_avx and then
// Java_To_Runtime(meth).
12819 instruct CallLeafNoFPDirect(method meth)
12820 %{
12821   predicate(n->as_Call()->entry_point() != nullptr);
12822   match(CallLeafNoFP);
12823   effect(USE meth);
12824 
12825   ins_cost(300);
12826   format %{ "call_leaf_nofp,runtime " %}
12827   ins_encode(clear_avx, Java_To_Runtime(meth));
12828   ins_pipe(pipe_slow);
12829 %}
12830 
12831 // Return Instruction
12832 // Remove the return address & jump to it.
12833 // Notice: We always emit a nop after a ret to make sure there is room
12834 // for safepoint patching
12835 instruct Ret()
12836 %{
12837   match(Return);
12838 
12839   format %{ "ret" %}
12840   ins_encode %{
12841     __ ret(0);
< prev index next >