src/hotspot/cpu/x86/x86_64.ad

  473 }
  474 
  475 // !!!!! Special hack to get all types of calls to specify the byte offset
  476 //       from the start of the call to the point where the return address
  477 //       will point.
  478 int MachCallStaticJavaNode::ret_addr_offset()
  479 {
  480   int offset = 5; // 5 bytes from start of call to where return address points
  481   offset += clear_avx_size();
  482   return offset;
  483 }
  484 
  485 int MachCallDynamicJavaNode::ret_addr_offset()
  486 {
  487   int offset = 15; // 15 bytes from start of call to where return address points
  488   offset += clear_avx_size();
  489   return offset;
  490 }
  491 
  492 int MachCallRuntimeNode::ret_addr_offset() {




  493   int offset = 13; // movq r10,#addr; callq (r10)
  494   if (this->ideal_Opcode() != Op_CallLeafVector) {
  495     offset += clear_avx_size();
  496   }
  497   return offset;
  498 }
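
The constants above follow from the x86-64 call encodings: a direct call is E8 plus a 32-bit displacement (5 bytes), and the dynamic (inline-cache) call is, as the 15-byte total and the compute_padding comment below suggest, preceded by a 10-byte movabs that loads the cache slot; clear_avx_size() adds the vzeroupper on top of that. A minimal standalone sketch of that accounting, with the byte counts written out as assumptions rather than read back from the assembler:

#include <cassert>

int main() {
  // Assumed x86-64 encodings, for illustration only (not read back from the assembler):
  //   direct call rel32      E8 + 4-byte disp      -> 5 bytes
  //   movabs reg, imm64      REX.W B8+r + 8-byte imm -> 10 bytes (inline-cache load)
  //   vzeroupper             C5 F8 77              -> 3 bytes (what clear_avx_size() adds, assumed)
  const int call_rel32   = 5;
  const int movabs_imm64 = 10;
  const int vzeroupper   = 3;
  assert(call_rel32 == 5);                  // MachCallStaticJavaNode: 5 bytes to the return address
  assert(movabs_imm64 + call_rel32 == 15);  // MachCallDynamicJavaNode: 15 bytes
  assert(call_rel32 + vzeroupper == 8);     // static call preceded by clear_avx
  return 0;
}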

  499 //
  500 // Compute padding required for nodes which need alignment
  501 //
  502 
  503 // The address of the call instruction needs to be 4-byte aligned to
  504 // ensure that it does not span a cache line so that it can be patched.
  505 int CallStaticJavaDirectNode::compute_padding(int current_offset) const
  506 {
  507   current_offset += clear_avx_size(); // skip vzeroupper
  508   current_offset += 1; // skip call opcode byte
  509   return align_up(current_offset, alignment_required()) - current_offset;
  510 }
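
The +1 above reflects that it is the 4-byte displacement following the call opcode, not the opcode itself, that must not span a cache line; aligning current_offset after skipping the opcode byte puts that displacement on a 4-byte boundary. A small standalone sketch of the arithmetic (the 3-byte vzeroupper size is an assumption):

#include <cassert>

// Hypothetical stand-in for HotSpot's align_up() on plain ints.
static int align_up_int(int x, int alignment) {
  return (x + alignment - 1) & ~(alignment - 1);
}

// Sketch of the rule above: skip the (assumed 3-byte) vzeroupper and the call
// opcode byte, then pad so the 4-byte call displacement starts 4-byte aligned.
static int static_call_padding(int current_offset, int clear_avx_size) {
  current_offset += clear_avx_size; // skip vzeroupper
  current_offset += 1;              // skip call opcode byte
  return align_up_int(current_offset, 4) - current_offset;
}

int main() {
  assert(static_call_padding(0, 0) == 3); // displacement would otherwise start at offset 1
  assert(static_call_padding(3, 0) == 0); // opcode byte lands at 3, displacement at 4
  assert(static_call_padding(0, 3) == 0); // vzeroupper + opcode byte already reach 4
  return 0;
}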
  511 
  512 // The address of the call instruction needs to be 4-byte aligned to
  513 // ensure that it does not span a cache line so that it can be patched.
  514 int CallDynamicJavaDirectNode::compute_padding(int current_offset) const
  515 {
  516   current_offset += clear_avx_size(); // skip vzeroupper
  517   current_offset += 11; // skip movq instruction + call opcode byte
  518   return align_up(current_offset, alignment_required()) - current_offset;

  706     st->print("# stack alignment check");
  707 #endif
  708   }
  709   if (C->stub_function() != nullptr && BarrierSet::barrier_set()->barrier_set_nmethod() != nullptr) {
  710     st->print("\n\t");
  711     st->print("cmpl    [r15_thread + #disarmed_guard_value_offset], #disarmed_guard_value\t");
  712     st->print("\n\t");
  713     st->print("je      fast_entry\t");
  714     st->print("\n\t");
  715     st->print("call    #nmethod_entry_barrier_stub\t");
  716     st->print("\n\tfast_entry:");
  717   }
  718   st->cr();
  719 }
  720 #endif
  721 
  722 void MachPrologNode::emit(CodeBuffer &cbuf, PhaseRegAlloc *ra_) const {
  723   Compile* C = ra_->C;
  724   C2_MacroAssembler _masm(&cbuf);
  725 
  726   int framesize = C->output()->frame_size_in_bytes();
  727   int bangsize = C->output()->bang_size_in_bytes();
  728 
  729   if (C->clinit_barrier_on_entry()) {
  730     assert(VM_Version::supports_fast_class_init_checks(), "sanity");
  731     assert(!C->method()->holder()->is_not_initialized(), "initialization should have been started");
  732 
  733     Label L_skip_barrier;
  734     Register klass = rscratch1;
  735 
  736     __ mov_metadata(klass, C->method()->holder()->constant_encoding());
  737     __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);
  738 
  739     __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
  740 
  741     __ bind(L_skip_barrier);

  742   }
  743 
  744   __ verified_entry(framesize, C->output()->need_stack_bang(bangsize)?bangsize:0, false, C->stub_function() != nullptr);


  745 
  746   C->output()->set_frame_complete(cbuf.insts_size());
  747 
  748   if (C->has_mach_constant_base_node()) {
  749     // NOTE: We set the table base offset here because users might be
  750     // emitted before MachConstantBaseNode.
  751     ConstantTable& constant_table = C->output()->constant_table();
  752     constant_table.set_table_base_offset(constant_table.calculate_table_base_offset());
  753   }
  754 }
  755 
  756 uint MachPrologNode::size(PhaseRegAlloc* ra_) const
  757 {
  758   return MachNode::size(ra_); // too many variables; just compute it
  759                               // the hard way
  760 }
  761 
  762 int MachPrologNode::reloc() const
  763 {
  764   return 0; // a large enough number
  765 }
  766 
  767 //=============================================================================
  768 #ifndef PRODUCT
  769 void MachEpilogNode::format(PhaseRegAlloc* ra_, outputStream* st) const
  770 {
  771   Compile* C = ra_->C;
  772   if (generate_vzeroupper(C)) {
  773     st->print("vzeroupper");
  774     st->cr(); st->print("\t");
  775   }
  776 
  777   int framesize = C->output()->frame_size_in_bytes();
  778   assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  779   // Remove word for return adr already pushed
  780   // and RBP
  781   framesize -= 2*wordSize;

  789   if (do_polling() && C->is_method_compilation()) {
  790     st->print("\t");
  791     st->print_cr("cmpq    rsp, poll_offset[r15_thread] \n\t"
  792                  "ja      #safepoint_stub\t"
  793                  "# Safepoint: poll for GC");
  794   }
  795 }
  796 #endif
  797 
  798 void MachEpilogNode::emit(CodeBuffer& cbuf, PhaseRegAlloc* ra_) const
  799 {
  800   Compile* C = ra_->C;
  801   MacroAssembler _masm(&cbuf);
  802 
  803   if (generate_vzeroupper(C)) {
  804     // Clear upper bits of YMM registers when current compiled code uses
  805     // wide vectors to avoid AVX <-> SSE transition penalty during call.
  806     __ vzeroupper();
  807   }
  808 
  809   int framesize = C->output()->frame_size_in_bytes();
  810   assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  811   // Remove word for return adr already pushed
  812   // and RBP
  813   framesize -= 2*wordSize;
  814 
  815   // Note that VerifyStackAtCalls' Majik cookie does not change the frame size popped here
  816 
  817   if (framesize) {
  818     __ addq(rsp, framesize);
  819   }
  820 
  821   __ popq(rbp);
  822 
  823   if (StackReservedPages > 0 && C->has_reserved_stack_access()) {
  824     __ reserved_stack_check();
  825   }
  826 
  827   if (do_polling() && C->is_method_compilation()) {
  828     MacroAssembler _masm(&cbuf);
  829     Label dummy_label;
  830     Label* code_stub = &dummy_label;
  831     if (!C->output()->in_scratch_emit_size()) {
  832       C2SafepointPollStub* stub = new (C->comp_arena()) C2SafepointPollStub(__ offset());
  833       C->output()->add_stub(stub);
  834       code_stub = &stub->entry();
  835     }
  836     __ relocate(relocInfo::poll_return_type);
  837     __ safepoint_poll(*code_stub, r15_thread, true /* at_return */, true /* in_nmethod */);
  838   }
  839 }
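
The framesize adjustment here releases only the locals/spill part of the frame: the saved rbp is restored by the explicit popq and the return address is consumed by the ret emitted later, so two words are subtracted before the addq. A tiny worked example with an assumed frame size:

#include <cassert>

int main() {
  const int wordSize = 8;               // x86-64
  int frame_size_in_bytes = 96;         // assumed example; must be stack-aligned
  assert((frame_size_in_bytes & (16 - 1)) == 0);
  // The saved rbp is restored by popq and the return address consumed by ret,
  // so only the remaining part of the frame is released with addq rsp, framesize.
  int framesize = frame_size_in_bytes - 2 * wordSize;
  assert(framesize == 80);
  return 0;
}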
  840 
  841 uint MachEpilogNode::size(PhaseRegAlloc* ra_) const
  842 {
  843   return MachNode::size(ra_); // too many variables; just compute it
  844                               // the hard way
  845 }
  846 
  847 int MachEpilogNode::reloc() const
  848 {
  849   return 2; // a large enough number
  850 }
  851 
  852 const Pipeline* MachEpilogNode::pipeline() const
  853 {
  854   return MachNode::pipeline_class();
  855 }
  856 
  857 //=============================================================================
  858 
  859 enum RC {
  860   rc_bad,
  861   rc_int,
  862   rc_kreg,
  863   rc_float,
  864   rc_stack
  865 };
  866 

 1449   st->print("leaq    %s, [rsp + #%d]\t# box lock",
 1450             Matcher::regName[reg], offset);
 1451 }
 1452 #endif
 1453 
 1454 void BoxLockNode::emit(CodeBuffer& cbuf, PhaseRegAlloc* ra_) const
 1455 {
 1456   int offset = ra_->reg2offset(in_RegMask(0).find_first_elem());
 1457   int reg = ra_->get_encode(this);
 1458 
 1459   MacroAssembler masm(&cbuf);
 1460   masm.lea(as_Register(reg), Address(rsp, offset));
 1461 }
 1462 
 1463 uint BoxLockNode::size(PhaseRegAlloc *ra_) const
 1464 {
 1465   int offset = ra_->reg2offset(in_RegMask(0).find_first_elem());
 1466   return (offset < 0x80) ? 5 : 8; // REX
 1467 }
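
The 5-vs-8 split corresponds to the two encodings of the lea emitted just above: an rsp-based address always needs a SIB byte, so the instruction is REX.W + 8D + ModRM + SIB plus either a 1-byte or a 4-byte displacement. A sketch of that accounting (byte counts are my own, not read back from the assembler):

#include <cassert>

// Assumed encoding of "lea reg, [rsp + offset]": REX.W + opcode 8D + ModRM + SIB
// (an rsp base always needs the SIB byte), plus disp8 or disp32.
static unsigned lea_rsp_disp_size(int offset) {
  const unsigned fixed = 1 /*REX.W*/ + 1 /*8D*/ + 1 /*ModRM*/ + 1 /*SIB*/;
  return fixed + ((offset >= -128 && offset < 128) ? 1u : 4u);
}

int main() {
  assert(lea_rsp_disp_size(0x10)  == 5); // disp8 form
  assert(lea_rsp_disp_size(0x100) == 8); // disp32 form
  return 0;
}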
 1468 
 1469 //=============================================================================
 1470 #ifndef PRODUCT
 1471 void MachUEPNode::format(PhaseRegAlloc* ra_, outputStream* st) const
 1472 {
 1473   if (UseCompressedClassPointers) {
 1474     st->print_cr("movl    rscratch1, [j_rarg0 + oopDesc::klass_offset_in_bytes()]\t# compressed klass");
 1475     st->print_cr("\tdecode_klass_not_null rscratch1, rscratch1");
 1476     st->print_cr("\tcmpq    rax, rscratch1\t # Inline cache check");
 1477   } else {
 1478     st->print_cr("\tcmpq    rax, [j_rarg0 + oopDesc::klass_offset_in_bytes()]\t"
 1479                  "# Inline cache check");
 1480   }
 1481   st->print_cr("\tjne     SharedRuntime::_ic_miss_stub");
 1482   st->print_cr("\tnop\t# nops to align entry point");
 1483 }
 1484 #endif
 1485 
 1486 void MachUEPNode::emit(CodeBuffer& cbuf, PhaseRegAlloc* ra_) const
 1487 {
 1488   MacroAssembler masm(&cbuf);

 1491     masm.load_klass(rscratch1, j_rarg0, rscratch2);
 1492     masm.cmpptr(rax, rscratch1);
 1493   } else {
 1494     masm.cmpptr(rax, Address(j_rarg0, oopDesc::klass_offset_in_bytes()));
 1495   }
 1496 
 1497   masm.jump_cc(Assembler::notEqual, RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
 1498 
  1499   /* WARNING: these NOPs are critical so that the verified entry point is properly
  1500      4-byte aligned for patching by NativeJump::patch_verified_entry() */
 1501   int nops_cnt = 4 - ((cbuf.insts_size() - insts_size) & 0x3);
 1502   if (OptoBreakpoint) {
 1503     // Leave space for int3
 1504     nops_cnt -= 1;
 1505   }
 1506   nops_cnt &= 0x3; // Do not add nops if code is aligned.
 1507   if (nops_cnt > 0)
 1508     masm.nop(nops_cnt);
 1509 }
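
The nop padding at the end keeps the verified entry point, which follows this unverified entry, 4-byte aligned so that NativeJump::patch_verified_entry() can patch it; OptoBreakpoint reserves one of those bytes for an int3. A standalone sketch of the arithmetic:

#include <cassert>

// Sketch of the alignment logic above: pad the unverified entry so the code that
// follows starts on a 4-byte boundary, leaving one byte for int3 when requested.
static int nops_needed(int bytes_emitted, bool opto_breakpoint) {
  int nops_cnt = 4 - (bytes_emitted & 0x3);
  if (opto_breakpoint) {
    nops_cnt -= 1;          // leave space for int3
  }
  return nops_cnt & 0x3;    // no nops if the code is already aligned
}

int main() {
  assert(nops_needed(14, false) == 2); // 14 bytes emitted -> pad to 16
  assert(nops_needed(16, false) == 0); // already aligned
  assert(nops_needed(14, true)  == 1); // 1 nop + int3 reaches 16
  return 0;
}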
 1510 
 1511 uint MachUEPNode::size(PhaseRegAlloc* ra_) const
 1512 {
 1513   return MachNode::size(ra_); // too many variables; just compute it
 1514                               // the hard way
 1515 }
 1516 
 1517 
 1518 //=============================================================================
 1519 
 1520 bool Matcher::supports_vector_calling_convention(void) {
 1521   if (EnableVectorSupport && UseVectorStubs) {
 1522     return true;
 1523   }
 1524   return false;
 1525 }
 1526 
 1527 OptoRegPair Matcher::vector_return_value(uint ideal_reg) {
 1528   assert(EnableVectorSupport && UseVectorStubs, "sanity");
 1529   int lo = XMM0_num;
 1530   int hi = XMM0b_num;
 1531   if (ideal_reg == Op_VecX) hi = XMM0d_num;
 1532   else if (ideal_reg == Op_VecY) hi = XMM0h_num;
 1533   else if (ideal_reg == Op_VecZ) hi = XMM0p_num;
 1534   return OptoRegPair(hi, lo);
 1535 }
 1536 
 1537 // Is this branch offset short enough that a short branch can be used?

 3102   %}
 3103 %}
 3104 
 3105 // Indirect Memory Times Scale Plus Positive Index Register Plus Offset Operand
 3106 operand indPosIndexScaleOffset(any_RegP reg, immL32 off, rRegI idx, immI2 scale)
 3107 %{
 3108   constraint(ALLOC_IN_RC(ptr_reg));
 3109   predicate(n->in(2)->in(3)->in(1)->as_Type()->type()->is_long()->_lo >= 0);
 3110   match(AddP (AddP reg (LShiftL (ConvI2L idx) scale)) off);
 3111 
 3112   op_cost(10);
 3113   format %{"[$reg + $off + $idx << $scale]" %}
 3114   interface(MEMORY_INTER) %{
 3115     base($reg);
 3116     index($idx);
 3117     scale($scale);
 3118     disp($off);
 3119   %}
 3120 %}
 3122 // Indirect Narrow Oop Plus Offset Operand
  3123 // Note: the x86 architecture doesn't support "scale * index + offset" without a base,
  3124 // so we can't free r12 even when CompressedOops::base() == nullptr.
 3125 operand indCompressedOopOffset(rRegN reg, immL32 off) %{
 3126   predicate(UseCompressedOops && (CompressedOops::shift() == Address::times_8));
 3127   constraint(ALLOC_IN_RC(ptr_reg));
 3128   match(AddP (DecodeN reg) off);
 3129 
 3130   op_cost(10);
 3131   format %{"[R12 + $reg << 3 + $off] (compressed oop addressing)" %}
 3132   interface(MEMORY_INTER) %{
 3133     base(0xc); // R12
 3134     index($reg);
 3135     scale(0x3);
 3136     disp($off);
 3137   %}
 3138 %}
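
This operand folds the DecodeN into the addressing mode: with an 8-byte shift the decoded oop is heap base + (narrow_oop << 3), and r12 is reserved to hold CompressedOops::base(), so the whole access becomes one [R12 + reg*8 + off] memory operand. A small sketch of the address it denotes, with an invented heap base and narrow oop purely for illustration:

#include <cassert>
#include <cstdint>

int main() {
  const uint64_t r12_heap_base = 0x0000000800000000ull; // assumed CompressedOops::base()
  const uint32_t narrow_oop    = 0x00123456;            // assumed compressed reference in $reg
  const int64_t  field_offset  = 0x18;                  // the immL32 $off
  const uint64_t addr = r12_heap_base
                      + (static_cast<uint64_t>(narrow_oop) << 3)  // Address::times_8 shift
                      + field_offset;
  assert(addr == 0x0000000800000000ull + 0x91A2B0 + 0x18);
  return 0;
}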
 3139 
 3140 // Indirect Memory Operand
 3141 operand indirectNarrow(rRegN reg)

 3448     equal(0x4, "e");
 3449     not_equal(0x5, "ne");
 3450     less(0x2, "b");
 3451     greater_equal(0x3, "ae");
 3452     less_equal(0x6, "be");
 3453     greater(0x7, "a");
 3454     overflow(0x0, "o");
 3455     no_overflow(0x1, "no");
 3456   %}
 3457 %}
 3458 
 3459 //----------OPERAND CLASSES----------------------------------------------------
  3460 // Operand Classes are groups of operands that are used to simplify
 3461 // instruction definitions by not requiring the AD writer to specify separate
 3462 // instructions for every form of operand when the instruction accepts
 3463 // multiple operand types with the same basic encoding and format.  The classic
 3464 // case of this is memory operands.
 3465 
 3466 opclass memory(indirect, indOffset8, indOffset32, indIndexOffset, indIndex,
 3467                indIndexScale, indPosIndexScale, indIndexScaleOffset, indPosIndexOffset, indPosIndexScaleOffset,
 3468                indCompressedOopOffset,
 3469                indirectNarrow, indOffset8Narrow, indOffset32Narrow,
 3470                indIndexOffsetNarrow, indIndexNarrow, indIndexScaleNarrow,
 3471                indIndexScaleOffsetNarrow, indPosIndexOffsetNarrow, indPosIndexScaleOffsetNarrow);
 3472 
 3473 //----------PIPELINE-----------------------------------------------------------
 3474 // Rules which define the behavior of the target architectures pipeline.
 3475 pipeline %{
 3476 
 3477 //----------ATTRIBUTES---------------------------------------------------------
 3478 attributes %{
  3479   variable_size_instructions;        // Variable-size instructions
 3480   max_instructions_per_bundle = 3;   // Up to 3 instructions per bundle
  3481   instruction_unit_size = 1;         // An instruction is 1 byte long
 3482   instruction_fetch_unit_size = 16;  // The processor fetches one line
 3483   instruction_fetch_units = 1;       // of 16 bytes
 3484 
 3485   // List of nop instructions
 3486   nops( MachNop );
 3487 %}
 3488 

 5979   format %{ "MEMBAR-storestore (empty encoding)" %}
 5980   ins_encode( );
 5981   ins_pipe(empty);
 5982 %}
 5983 
 5984 //----------Move Instructions--------------------------------------------------
 5985 
 5986 instruct castX2P(rRegP dst, rRegL src)
 5987 %{
 5988   match(Set dst (CastX2P src));
 5989 
 5990   format %{ "movq    $dst, $src\t# long->ptr" %}
 5991   ins_encode %{
 5992     if ($dst$$reg != $src$$reg) {
 5993       __ movptr($dst$$Register, $src$$Register);
 5994     }
 5995   %}
 5996   ins_pipe(ialu_reg_reg); // XXX
 5997 %}
 5998 
 5999 instruct castP2X(rRegL dst, rRegP src)
 6000 %{
 6001   match(Set dst (CastP2X src));
 6002 
 6003   format %{ "movq    $dst, $src\t# ptr -> long" %}
 6004   ins_encode %{
 6005     if ($dst$$reg != $src$$reg) {
 6006       __ movptr($dst$$Register, $src$$Register);
 6007     }
 6008   %}
 6009   ins_pipe(ialu_reg_reg); // XXX
 6010 %}
 6011 
 6012 // Convert oop into int for vectors alignment masking
 6013 instruct convP2I(rRegI dst, rRegP src)
 6014 %{
 6015   match(Set dst (ConvL2I (CastP2X src)));
 6016 
 6017   format %{ "movl    $dst, $src\t# ptr -> int" %}
 6018   ins_encode %{

10520   effect(DEF dst, USE src);
10521   ins_cost(100);
10522   format %{ "movd    $dst,$src\t# MoveI2F" %}
10523   ins_encode %{
10524     __ movdl($dst$$XMMRegister, $src$$Register);
10525   %}
10526   ins_pipe( pipe_slow );
10527 %}
10528 
10529 instruct MoveL2D_reg_reg(regD dst, rRegL src) %{
10530   match(Set dst (MoveL2D src));
10531   effect(DEF dst, USE src);
10532   ins_cost(100);
10533   format %{ "movd    $dst,$src\t# MoveL2D" %}
10534   ins_encode %{
10535      __ movdq($dst$$XMMRegister, $src$$Register);
10536   %}
10537   ins_pipe( pipe_slow );
10538 %}
10539 

10540 // Fast clearing of an array
10541 // Small ClearArray non-AVX512.
10542 instruct rep_stos(rcx_RegL cnt, rdi_RegP base, regD tmp, rax_RegI zero,
10543                   Universe dummy, rFlagsReg cr)
10544 %{
10545   predicate(!((ClearArrayNode*)n)->is_large() && (UseAVX <= 2));
10546   match(Set dummy (ClearArray cnt base));
10547   effect(USE_KILL cnt, USE_KILL base, TEMP tmp, KILL zero, KILL cr);
10548 
10549   format %{ $$template
10550     $$emit$$"xorq    rax, rax\t# ClearArray:\n\t"
10551     $$emit$$"cmp     InitArrayShortSize,rcx\n\t"
10552     $$emit$$"jg      LARGE\n\t"
10553     $$emit$$"dec     rcx\n\t"
10554     $$emit$$"js      DONE\t# Zero length\n\t"
10555     $$emit$$"mov     rax,(rdi,rcx,8)\t# LOOP\n\t"
10556     $$emit$$"dec     rcx\n\t"
10557     $$emit$$"jge     LOOP\n\t"
10558     $$emit$$"jmp     DONE\n\t"
10559     $$emit$$"# LARGE:\n\t"
10560     if (UseFastStosb) {
10561        $$emit$$"shlq    rcx,3\t# Convert doublewords to bytes\n\t"
10562        $$emit$$"rep     stosb\t# Store rax to *rdi++ while rcx--\n\t"
10563     } else if (UseXMMForObjInit) {
10564        $$emit$$"mov     rdi,rax\n\t"
10565        $$emit$$"vpxor   ymm0,ymm0,ymm0\n\t"
10566        $$emit$$"jmpq    L_zero_64_bytes\n\t"
10567        $$emit$$"# L_loop:\t# 64-byte LOOP\n\t"

10575        $$emit$$"jl      L_tail\n\t"
10576        $$emit$$"vmovdqu ymm0,(rax)\n\t"
10577        $$emit$$"add     0x20,rax\n\t"
10578        $$emit$$"sub     0x4,rcx\n\t"
10579        $$emit$$"# L_tail:\t# Clearing tail bytes\n\t"
10580        $$emit$$"add     0x4,rcx\n\t"
10581        $$emit$$"jle     L_end\n\t"
10582        $$emit$$"dec     rcx\n\t"
10583        $$emit$$"# L_sloop:\t# 8-byte short loop\n\t"
10584        $$emit$$"vmovq   xmm0,(rax)\n\t"
10585        $$emit$$"add     0x8,rax\n\t"
10586        $$emit$$"dec     rcx\n\t"
10587        $$emit$$"jge     L_sloop\n\t"
10588        $$emit$$"# L_end:\n\t"
10589     } else {
10590        $$emit$$"rep     stosq\t# Store rax to *rdi++ while rcx--\n\t"
10591     }
10592     $$emit$$"# DONE"
10593   %}
10594   ins_encode %{
10595     __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register,
10596                  $tmp$$XMMRegister, false, knoreg);
10597   %}
10598   ins_pipe(pipe_slow);
10599 %}
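
The format above documents a two-tier strategy: counts up to InitArrayShortSize words are zeroed by a short inline loop, while larger counts fall through to a bulk path (rep stosb after converting words to bytes, an XMM/YMM loop, or rep stosq). A rough standalone model of that dispatch (the 8-word threshold is an assumed default, the flag is tunable, and memset stands in for the bulk paths):

#include <cassert>
#include <cstdint>
#include <cstring>

// Short arrays take an inline store loop; longer ones fall through to a bulk path.
static void clear_array(uint64_t* base, int64_t cnt_words) {
  const int64_t InitArrayShortSizeWords = 8;   // assumed default, in 8-byte words
  if (cnt_words <= InitArrayShortSizeWords) {
    for (int64_t i = cnt_words - 1; i >= 0; i--) {
      base[i] = 0;                             // the "dec rcx / jge LOOP" inline loop
    }
  } else {
    std::memset(base, 0, cnt_words * 8);       // stands in for rep stosb / stosq / XMM
  }
}

int main() {
  uint64_t buf[32];
  for (uint64_t& w : buf) w = ~0ull;
  clear_array(buf, 5);
  assert(buf[0] == 0 && buf[4] == 0 && buf[5] == ~0ull);
  clear_array(buf, 32);
  assert(buf[31] == 0);
  return 0;
}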
10600 
10601 // Small ClearArray AVX512 non-constant length.
10602 instruct rep_stos_evex(rcx_RegL cnt, rdi_RegP base, legRegD tmp, kReg ktmp, rax_RegI zero,
10603                        Universe dummy, rFlagsReg cr)
10604 %{
10605   predicate(!((ClearArrayNode*)n)->is_large() && (UseAVX > 2));
10606   match(Set dummy (ClearArray cnt base));
10607   ins_cost(125);
10608   effect(USE_KILL cnt, USE_KILL base, TEMP tmp, TEMP ktmp, KILL zero, KILL cr);
10609 
10610   format %{ $$template
10611     $$emit$$"xorq    rax, rax\t# ClearArray:\n\t"
10612     $$emit$$"cmp     InitArrayShortSize,rcx\n\t"
10613     $$emit$$"jg      LARGE\n\t"
10614     $$emit$$"dec     rcx\n\t"
10615     $$emit$$"js      DONE\t# Zero length\n\t"
10616     $$emit$$"mov     rax,(rdi,rcx,8)\t# LOOP\n\t"
10617     $$emit$$"dec     rcx\n\t"
10618     $$emit$$"jge     LOOP\n\t"
10619     $$emit$$"jmp     DONE\n\t"
10620     $$emit$$"# LARGE:\n\t"
10621     if (UseFastStosb) {
10622        $$emit$$"shlq    rcx,3\t# Convert doublewords to bytes\n\t"
10623        $$emit$$"rep     stosb\t# Store rax to *rdi++ while rcx--\n\t"
10624     } else if (UseXMMForObjInit) {
10625        $$emit$$"mov     rdi,rax\n\t"
10626        $$emit$$"vpxor   ymm0,ymm0,ymm0\n\t"
10627        $$emit$$"jmpq    L_zero_64_bytes\n\t"
10628        $$emit$$"# L_loop:\t# 64-byte LOOP\n\t"

10636        $$emit$$"jl      L_tail\n\t"
10637        $$emit$$"vmovdqu ymm0,(rax)\n\t"
10638        $$emit$$"add     0x20,rax\n\t"
10639        $$emit$$"sub     0x4,rcx\n\t"
10640        $$emit$$"# L_tail:\t# Clearing tail bytes\n\t"
10641        $$emit$$"add     0x4,rcx\n\t"
10642        $$emit$$"jle     L_end\n\t"
10643        $$emit$$"dec     rcx\n\t"
10644        $$emit$$"# L_sloop:\t# 8-byte short loop\n\t"
10645        $$emit$$"vmovq   xmm0,(rax)\n\t"
10646        $$emit$$"add     0x8,rax\n\t"
10647        $$emit$$"dec     rcx\n\t"
10648        $$emit$$"jge     L_sloop\n\t"
10649        $$emit$$"# L_end:\n\t"
10650     } else {
10651        $$emit$$"rep     stosq\t# Store rax to *rdi++ while rcx--\n\t"
10652     }
10653     $$emit$$"# DONE"
10654   %}
10655   ins_encode %{
10656     __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register,
10657                  $tmp$$XMMRegister, false, $ktmp$$KRegister);
10658   %}
10659   ins_pipe(pipe_slow);
10660 %}
10661 
10662 // Large ClearArray non-AVX512.
10663 instruct rep_stos_large(rcx_RegL cnt, rdi_RegP base, regD tmp, rax_RegI zero,
10664                         Universe dummy, rFlagsReg cr)
10665 %{
 10666   predicate((UseAVX <= 2) && ((ClearArrayNode*)n)->is_large());
10667   match(Set dummy (ClearArray cnt base));
10668   effect(USE_KILL cnt, USE_KILL base, TEMP tmp, KILL zero, KILL cr);
10669 
10670   format %{ $$template
10671     if (UseFastStosb) {
10672        $$emit$$"xorq    rax, rax\t# ClearArray:\n\t"
10673        $$emit$$"shlq    rcx,3\t# Convert doublewords to bytes\n\t"
10674        $$emit$$"rep     stosb\t# Store rax to *rdi++ while rcx--"
10675     } else if (UseXMMForObjInit) {
10676        $$emit$$"mov     rdi,rax\t# ClearArray:\n\t"
10677        $$emit$$"vpxor   ymm0,ymm0,ymm0\n\t"
10678        $$emit$$"jmpq    L_zero_64_bytes\n\t"
10679        $$emit$$"# L_loop:\t# 64-byte LOOP\n\t"
10680        $$emit$$"vmovdqu ymm0,(rax)\n\t"
10681        $$emit$$"vmovdqu ymm0,0x20(rax)\n\t"
10682        $$emit$$"add     0x40,rax\n\t"
10683        $$emit$$"# L_zero_64_bytes:\n\t"
10684        $$emit$$"sub     0x8,rcx\n\t"
10685        $$emit$$"jge     L_loop\n\t"
10686        $$emit$$"add     0x4,rcx\n\t"
10687        $$emit$$"jl      L_tail\n\t"
10688        $$emit$$"vmovdqu ymm0,(rax)\n\t"
10689        $$emit$$"add     0x20,rax\n\t"
10690        $$emit$$"sub     0x4,rcx\n\t"
10691        $$emit$$"# L_tail:\t# Clearing tail bytes\n\t"
10692        $$emit$$"add     0x4,rcx\n\t"
10693        $$emit$$"jle     L_end\n\t"
10694        $$emit$$"dec     rcx\n\t"
10695        $$emit$$"# L_sloop:\t# 8-byte short loop\n\t"
10696        $$emit$$"vmovq   xmm0,(rax)\n\t"
10697        $$emit$$"add     0x8,rax\n\t"
10698        $$emit$$"dec     rcx\n\t"
10699        $$emit$$"jge     L_sloop\n\t"
10700        $$emit$$"# L_end:\n\t"
10701     } else {
10702        $$emit$$"xorq    rax, rax\t# ClearArray:\n\t"
10703        $$emit$$"rep     stosq\t# Store rax to *rdi++ while rcx--"
10704     }
10705   %}
10706   ins_encode %{
10707     __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register,
10708                  $tmp$$XMMRegister, true, knoreg);
10709   %}
10710   ins_pipe(pipe_slow);
10711 %}
10712 
10713 // Large ClearArray AVX512.
10714 instruct rep_stos_large_evex(rcx_RegL cnt, rdi_RegP base, legRegD tmp, kReg ktmp, rax_RegI zero,
10715                              Universe dummy, rFlagsReg cr)
10716 %{
10717   predicate((UseAVX > 2) && ((ClearArrayNode*)n)->is_large());
10718   match(Set dummy (ClearArray cnt base));
10719   effect(USE_KILL cnt, USE_KILL base, TEMP tmp, TEMP ktmp, KILL zero, KILL cr);
10720 
10721   format %{ $$template
10722     if (UseFastStosb) {
10723        $$emit$$"xorq    rax, rax\t# ClearArray:\n\t"
10724        $$emit$$"shlq    rcx,3\t# Convert doublewords to bytes\n\t"
10725        $$emit$$"rep     stosb\t# Store rax to *rdi++ while rcx--"
10726     } else if (UseXMMForObjInit) {
10727        $$emit$$"mov     rdi,rax\t# ClearArray:\n\t"
10728        $$emit$$"vpxor   ymm0,ymm0,ymm0\n\t"
10729        $$emit$$"jmpq    L_zero_64_bytes\n\t"
10730        $$emit$$"# L_loop:\t# 64-byte LOOP\n\t"
10731        $$emit$$"vmovdqu ymm0,(rax)\n\t"
10732        $$emit$$"vmovdqu ymm0,0x20(rax)\n\t"
10733        $$emit$$"add     0x40,rax\n\t"
10734        $$emit$$"# L_zero_64_bytes:\n\t"
10735        $$emit$$"sub     0x8,rcx\n\t"
10736        $$emit$$"jge     L_loop\n\t"
10737        $$emit$$"add     0x4,rcx\n\t"
10738        $$emit$$"jl      L_tail\n\t"
10739        $$emit$$"vmovdqu ymm0,(rax)\n\t"
10740        $$emit$$"add     0x20,rax\n\t"
10741        $$emit$$"sub     0x4,rcx\n\t"
10742        $$emit$$"# L_tail:\t# Clearing tail bytes\n\t"
10743        $$emit$$"add     0x4,rcx\n\t"
10744        $$emit$$"jle     L_end\n\t"
10745        $$emit$$"dec     rcx\n\t"
10746        $$emit$$"# L_sloop:\t# 8-byte short loop\n\t"
10747        $$emit$$"vmovq   xmm0,(rax)\n\t"
10748        $$emit$$"add     0x8,rax\n\t"
10749        $$emit$$"dec     rcx\n\t"
10750        $$emit$$"jge     L_sloop\n\t"
10751        $$emit$$"# L_end:\n\t"
10752     } else {
10753        $$emit$$"xorq    rax, rax\t# ClearArray:\n\t"
10754        $$emit$$"rep     stosq\t# Store rax to *rdi++ while rcx--"
10755     }
10756   %}
10757   ins_encode %{
10758     __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register,
10759                  $tmp$$XMMRegister, true, $ktmp$$KRegister);
10760   %}
10761   ins_pipe(pipe_slow);
10762 %}
10763 
10764 // Small ClearArray AVX512 constant length.
10765 instruct rep_stos_im(immL cnt, rRegP base, regD tmp, rRegI zero, kReg ktmp, Universe dummy, rFlagsReg cr)
10766 %{
10767   predicate(!((ClearArrayNode*)n)->is_large() &&
10768               ((UseAVX > 2) && VM_Version::supports_avx512vlbw()));
10769   match(Set dummy (ClearArray cnt base));
10770   ins_cost(100);
10771   effect(TEMP tmp, TEMP zero, TEMP ktmp, KILL cr);
10772   format %{ "clear_mem_imm $base , $cnt  \n\t" %}
10773   ins_encode %{
10774    __ clear_mem($base$$Register, $cnt$$constant, $zero$$Register, $tmp$$XMMRegister, $ktmp$$KRegister);
10775   %}
10776   ins_pipe(pipe_slow);
10777 %}
10778 
10779 instruct string_compareL(rdi_RegP str1, rcx_RegI cnt1, rsi_RegP str2, rdx_RegI cnt2,
10780                          rax_RegI result, legRegD tmp1, rFlagsReg cr)
10781 %{
10782   predicate(!VM_Version::supports_avx512vlbw() && ((StrCompNode*)n)->encoding() == StrIntrinsicNode::LL);
10783   match(Set result (StrComp (Binary str1 cnt1) (Binary str2 cnt2)));
10784   effect(TEMP tmp1, USE_KILL str1, USE_KILL str2, USE_KILL cnt1, USE_KILL cnt2, KILL cr);
10785 
10786   format %{ "String Compare byte[] $str1,$cnt1,$str2,$cnt2 -> $result   // KILL $tmp1" %}
10787   ins_encode %{
10788     __ string_compare($str1$$Register, $str2$$Register,
10789                       $cnt1$$Register, $cnt2$$Register, $result$$Register,
10790                       $tmp1$$XMMRegister, StrIntrinsicNode::LL, knoreg);
10791   %}
10792   ins_pipe( pipe_slow );
10793 %}
10794 

12524 
12525   ins_cost(300);
12526   format %{ "call_leaf,runtime " %}
12527   ins_encode(clear_avx, Java_To_Runtime(meth));
12528   ins_pipe(pipe_slow);
12529 %}
12530 
12531 // Call runtime without safepoint and with vector arguments
12532 instruct CallLeafDirectVector(method meth)
12533 %{
12534   match(CallLeafVector);
12535   effect(USE meth);
12536 
12537   ins_cost(300);
12538   format %{ "call_leaf,vector " %}
12539   ins_encode(Java_To_Runtime(meth));
12540   ins_pipe(pipe_slow);
12541 %}
12542 
12543 // Call runtime without safepoint

12544 instruct CallLeafNoFPDirect(method meth)
12545 %{

12546   match(CallLeafNoFP);
12547   effect(USE meth);
12548 
12549   ins_cost(300);
12550   format %{ "call_leaf_nofp,runtime " %}
12551   ins_encode(clear_avx, Java_To_Runtime(meth));
12552   ins_pipe(pipe_slow);
12553 %}
12554 
12555 // Return Instruction
12556 // Remove the return address & jump to it.
12557 // Notice: We always emit a nop after a ret to make sure there is room
12558 // for safepoint patching
12559 instruct Ret()
12560 %{
12561   match(Return);
12562 
12563   format %{ "ret" %}
12564   ins_encode %{
12565     __ ret(0);

  473 }
  474 
  475 // !!!!! Special hack to get all types of calls to specify the byte offset
  476 //       from the start of the call to the point where the return address
  477 //       will point.
  478 int MachCallStaticJavaNode::ret_addr_offset()
  479 {
  480   int offset = 5; // 5 bytes from start of call to where return address points
  481   offset += clear_avx_size();
  482   return offset;
  483 }
  484 
  485 int MachCallDynamicJavaNode::ret_addr_offset()
  486 {
  487   int offset = 15; // 15 bytes from start of call to where return address points
  488   offset += clear_avx_size();
  489   return offset;
  490 }
  491 
  492 int MachCallRuntimeNode::ret_addr_offset() {
  493   if (_entry_point == nullptr) {
  494     // CallLeafNoFPInDirect
  495     return 3; // callq (register)
  496   }
  497   int offset = 13; // movq r10,#addr; callq (r10)
  498   if (this->ideal_Opcode() != Op_CallLeafVector) {
  499     offset += clear_avx_size();
  500   }
  501   return offset;
  502 }
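
In the updated version, a null _entry_point marks the register-indirect leaf call, whose callq (register) form is just REX.B + FF + ModRM, so the return address sits 3 bytes past the start; the far form keeps the 10-byte movabs into r10 plus that same 3-byte call. A sketch mirroring the two branches (instruction lengths assumed, and the CallLeafVector special case that skips clear_avx left out for brevity):

#include <cassert>

// Assumed lengths: callq (reg) = REX.B + FF + ModRM = 3 bytes;
// movabs r10, imm64 = 10 bytes, followed by the same 3-byte call = 13 bytes.
static int runtime_ret_addr_offset(bool has_entry_point, int clear_avx_size) {
  if (!has_entry_point) {
    return 3;                    // CallLeafNoFPInDirect: callq (register)
  }
  return 13 + clear_avx_size;    // movq r10,#addr; callq (r10), possibly after vzeroupper
}

int main() {
  assert(runtime_ret_addr_offset(false, 0) == 3);
  assert(runtime_ret_addr_offset(true, 3)  == 16);
  return 0;
}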
  503 
  504 //
  505 // Compute padding required for nodes which need alignment
  506 //
  507 
  508 // The address of the call instruction needs to be 4-byte aligned to
  509 // ensure that it does not span a cache line so that it can be patched.
  510 int CallStaticJavaDirectNode::compute_padding(int current_offset) const
  511 {
  512   current_offset += clear_avx_size(); // skip vzeroupper
  513   current_offset += 1; // skip call opcode byte
  514   return align_up(current_offset, alignment_required()) - current_offset;
  515 }
  516 
  517 // The address of the call instruction needs to be 4-byte aligned to
  518 // ensure that it does not span a cache line so that it can be patched.
  519 int CallDynamicJavaDirectNode::compute_padding(int current_offset) const
  520 {
  521   current_offset += clear_avx_size(); // skip vzeroupper
  522   current_offset += 11; // skip movq instruction + call opcode byte
  523   return align_up(current_offset, alignment_required()) - current_offset;

  711     st->print("# stack alignment check");
  712 #endif
  713   }
  714   if (C->stub_function() != nullptr && BarrierSet::barrier_set()->barrier_set_nmethod() != nullptr) {
  715     st->print("\n\t");
  716     st->print("cmpl    [r15_thread + #disarmed_guard_value_offset], #disarmed_guard_value\t");
  717     st->print("\n\t");
  718     st->print("je      fast_entry\t");
  719     st->print("\n\t");
  720     st->print("call    #nmethod_entry_barrier_stub\t");
  721     st->print("\n\tfast_entry:");
  722   }
  723   st->cr();
  724 }
  725 #endif
  726 
  727 void MachPrologNode::emit(CodeBuffer &cbuf, PhaseRegAlloc *ra_) const {
  728   Compile* C = ra_->C;
  729   C2_MacroAssembler _masm(&cbuf);
  730 
  731   __ verified_entry(C);
  732 
  733   if (ra_->C->stub_function() == nullptr) {
  734     __ entry_barrier();
  735   }
  736 
  737   if (!Compile::current()->output()->in_scratch_emit_size()) {
  738     __ bind(*_verified_entry);
  739   }
  740 
  741   C->output()->set_frame_complete(cbuf.insts_size());
  742 
  743   if (C->has_mach_constant_base_node()) {
  744     // NOTE: We set the table base offset here because users might be
  745     // emitted before MachConstantBaseNode.
  746     ConstantTable& constant_table = C->output()->constant_table();
  747     constant_table.set_table_base_offset(constant_table.calculate_table_base_offset());
  748   }
  749 }
  750 






  751 int MachPrologNode::reloc() const
  752 {
  753   return 0; // a large enough number
  754 }
  755 
  756 //=============================================================================
  757 #ifndef PRODUCT
  758 void MachEpilogNode::format(PhaseRegAlloc* ra_, outputStream* st) const
  759 {
  760   Compile* C = ra_->C;
  761   if (generate_vzeroupper(C)) {
  762     st->print("vzeroupper");
  763     st->cr(); st->print("\t");
  764   }
  765 
  766   int framesize = C->output()->frame_size_in_bytes();
  767   assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  768   // Remove word for return adr already pushed
  769   // and RBP
  770   framesize -= 2*wordSize;

  778   if (do_polling() && C->is_method_compilation()) {
  779     st->print("\t");
  780     st->print_cr("cmpq    rsp, poll_offset[r15_thread] \n\t"
  781                  "ja      #safepoint_stub\t"
  782                  "# Safepoint: poll for GC");
  783   }
  784 }
  785 #endif
  786 
  787 void MachEpilogNode::emit(CodeBuffer& cbuf, PhaseRegAlloc* ra_) const
  788 {
  789   Compile* C = ra_->C;
  790   MacroAssembler _masm(&cbuf);
  791 
  792   if (generate_vzeroupper(C)) {
  793     // Clear upper bits of YMM registers when current compiled code uses
  794     // wide vectors to avoid AVX <-> SSE transition penalty during call.
  795     __ vzeroupper();
  796   }
  797 
  798   // Subtract two words to account for return address and rbp
  799   int initial_framesize = C->output()->frame_size_in_bytes() - 2*wordSize;
  800   __ remove_frame(initial_framesize, C->needs_stack_repair());
  801 
  802   if (StackReservedPages > 0 && C->has_reserved_stack_access()) {
  803     __ reserved_stack_check();
  804   }
  805 
  806   if (do_polling() && C->is_method_compilation()) {
  807     MacroAssembler _masm(&cbuf);
  808     Label dummy_label;
  809     Label* code_stub = &dummy_label;
  810     if (!C->output()->in_scratch_emit_size()) {
  811       C2SafepointPollStub* stub = new (C->comp_arena()) C2SafepointPollStub(__ offset());
  812       C->output()->add_stub(stub);
  813       code_stub = &stub->entry();
  814     }
  815     __ relocate(relocInfo::poll_return_type);
  816     __ safepoint_poll(*code_stub, r15_thread, true /* at_return */, true /* in_nmethod */);
  817   }
  818 }
  819 






  820 int MachEpilogNode::reloc() const
  821 {
  822   return 2; // a large enough number
  823 }
  824 
  825 const Pipeline* MachEpilogNode::pipeline() const
  826 {
  827   return MachNode::pipeline_class();
  828 }
  829 
  830 //=============================================================================
  831 
  832 enum RC {
  833   rc_bad,
  834   rc_int,
  835   rc_kreg,
  836   rc_float,
  837   rc_stack
  838 };
  839 

 1422   st->print("leaq    %s, [rsp + #%d]\t# box lock",
 1423             Matcher::regName[reg], offset);
 1424 }
 1425 #endif
 1426 
 1427 void BoxLockNode::emit(CodeBuffer& cbuf, PhaseRegAlloc* ra_) const
 1428 {
 1429   int offset = ra_->reg2offset(in_RegMask(0).find_first_elem());
 1430   int reg = ra_->get_encode(this);
 1431 
 1432   MacroAssembler masm(&cbuf);
 1433   masm.lea(as_Register(reg), Address(rsp, offset));
 1434 }
 1435 
 1436 uint BoxLockNode::size(PhaseRegAlloc *ra_) const
 1437 {
 1438   int offset = ra_->reg2offset(in_RegMask(0).find_first_elem());
 1439   return (offset < 0x80) ? 5 : 8; // REX
 1440 }
 1441 
 1442 //=============================================================================
 1443 #ifndef PRODUCT
 1444 void MachVEPNode::format(PhaseRegAlloc* ra_, outputStream* st) const
 1445 {
 1446   st->print_cr("MachVEPNode");
 1447 }
 1448 #endif
 1449 
 1450 void MachVEPNode::emit(CodeBuffer& cbuf, PhaseRegAlloc* ra_) const
 1451 {
 1452   C2_MacroAssembler _masm(&cbuf);
 1453   uint insts_size = cbuf.insts_size();
 1454   if (!_verified) {
 1455     if (UseCompressedClassPointers) {
 1456       __ load_klass(rscratch1, j_rarg0, rscratch2);
 1457       __ cmpptr(rax, rscratch1);
 1458     } else {
 1459       __ cmpptr(rax, Address(j_rarg0, oopDesc::klass_offset_in_bytes()));
 1460     }
 1461     __ jump_cc(Assembler::notEqual, RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
 1462   } else {
 1463     // TODO 8284443 Avoid creation of temporary frame
 1464     if (ra_->C->stub_function() == nullptr) {
 1465       __ verified_entry(ra_->C, 0);
 1466       __ entry_barrier();
 1467       int initial_framesize = ra_->C->output()->frame_size_in_bytes() - 2*wordSize;
 1468       __ remove_frame(initial_framesize, false);
 1469     }
 1470     // Unpack inline type args passed as oop and then jump to
 1471     // the verified entry point (skipping the unverified entry).
 1472     int sp_inc = __ unpack_inline_args(ra_->C, _receiver_only);
 1473     // Emit code for verified entry and save increment for stack repair on return
 1474     __ verified_entry(ra_->C, sp_inc);
 1475     if (Compile::current()->output()->in_scratch_emit_size()) {
 1476       Label dummy_verified_entry;
 1477       __ jmp(dummy_verified_entry);
 1478     } else {
 1479       __ jmp(*_verified_entry);
 1480     }
 1481   }
  1482   /* WARNING: these NOPs are critical so that the verified entry point is properly
  1483      4-byte aligned for patching by NativeJump::patch_verified_entry() */
 1484   int nops_cnt = 4 - ((cbuf.insts_size() - insts_size) & 0x3);
 1485   nops_cnt &= 0x3; // Do not add nops if code is aligned.
 1486   if (nops_cnt > 0) {
 1487     __ nop(nops_cnt);
 1488   }
 1489 }
 1490 
 1491 //=============================================================================
 1492 #ifndef PRODUCT
 1493 void MachUEPNode::format(PhaseRegAlloc* ra_, outputStream* st) const
 1494 {
 1495   if (UseCompressedClassPointers) {
 1496     st->print_cr("movl    rscratch1, [j_rarg0 + oopDesc::klass_offset_in_bytes()]\t# compressed klass");
 1497     st->print_cr("\tdecode_klass_not_null rscratch1, rscratch1");
 1498     st->print_cr("\tcmpq    rax, rscratch1\t # Inline cache check");
 1499   } else {
 1500     st->print_cr("\tcmpq    rax, [j_rarg0 + oopDesc::klass_offset_in_bytes()]\t"
 1501                  "# Inline cache check");
 1502   }
 1503   st->print_cr("\tjne     SharedRuntime::_ic_miss_stub");
 1504   st->print_cr("\tnop\t# nops to align entry point");
 1505 }
 1506 #endif
 1507 
 1508 void MachUEPNode::emit(CodeBuffer& cbuf, PhaseRegAlloc* ra_) const
 1509 {
 1510   MacroAssembler masm(&cbuf);

 1513     masm.load_klass(rscratch1, j_rarg0, rscratch2);
 1514     masm.cmpptr(rax, rscratch1);
 1515   } else {
 1516     masm.cmpptr(rax, Address(j_rarg0, oopDesc::klass_offset_in_bytes()));
 1517   }
 1518 
 1519   masm.jump_cc(Assembler::notEqual, RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
 1520 
  1521   /* WARNING: these NOPs are critical so that the verified entry point is properly
  1522      4-byte aligned for patching by NativeJump::patch_verified_entry() */
 1523   int nops_cnt = 4 - ((cbuf.insts_size() - insts_size) & 0x3);
 1524   if (OptoBreakpoint) {
 1525     // Leave space for int3
 1526     nops_cnt -= 1;
 1527   }
 1528   nops_cnt &= 0x3; // Do not add nops if code is aligned.
 1529   if (nops_cnt > 0)
 1530     masm.nop(nops_cnt);
 1531 }
 1532 







 1533 //=============================================================================
 1534 
 1535 bool Matcher::supports_vector_calling_convention(void) {
 1536   if (EnableVectorSupport && UseVectorStubs) {
 1537     return true;
 1538   }
 1539   return false;
 1540 }
 1541 
 1542 OptoRegPair Matcher::vector_return_value(uint ideal_reg) {
 1543   assert(EnableVectorSupport && UseVectorStubs, "sanity");
 1544   int lo = XMM0_num;
 1545   int hi = XMM0b_num;
 1546   if (ideal_reg == Op_VecX) hi = XMM0d_num;
 1547   else if (ideal_reg == Op_VecY) hi = XMM0h_num;
 1548   else if (ideal_reg == Op_VecZ) hi = XMM0p_num;
 1549   return OptoRegPair(hi, lo);
 1550 }
 1551 
 1552 // Is this branch offset short enough that a short branch can be used?

 3117   %}
 3118 %}
 3119 
 3120 // Indirect Memory Times Scale Plus Positive Index Register Plus Offset Operand
 3121 operand indPosIndexScaleOffset(any_RegP reg, immL32 off, rRegI idx, immI2 scale)
 3122 %{
 3123   constraint(ALLOC_IN_RC(ptr_reg));
 3124   predicate(n->in(2)->in(3)->in(1)->as_Type()->type()->is_long()->_lo >= 0);
 3125   match(AddP (AddP reg (LShiftL (ConvI2L idx) scale)) off);
 3126 
 3127   op_cost(10);
 3128   format %{"[$reg + $off + $idx << $scale]" %}
 3129   interface(MEMORY_INTER) %{
 3130     base($reg);
 3131     index($idx);
 3132     scale($scale);
 3133     disp($off);
 3134   %}
 3135 %}
 3136 
 3137 // Indirect Narrow Oop Operand
 3138 operand indCompressedOop(rRegN reg) %{
 3139   predicate(UseCompressedOops && (CompressedOops::shift() == Address::times_8));
 3140   constraint(ALLOC_IN_RC(ptr_reg));
 3141   match(DecodeN reg);
 3142 
 3143   op_cost(10);
 3144   format %{"[R12 + $reg << 3] (compressed oop addressing)" %}
 3145   interface(MEMORY_INTER) %{
 3146     base(0xc); // R12
 3147     index($reg);
 3148     scale(0x3);
 3149     disp(0x0);
 3150   %}
 3151 %}
 3152 
 3153 // Indirect Narrow Oop Plus Offset Operand
  3154 // Note: the x86 architecture doesn't support "scale * index + offset" without a base,
  3155 // so we can't free r12 even when CompressedOops::base() == nullptr.
 3156 operand indCompressedOopOffset(rRegN reg, immL32 off) %{
 3157   predicate(UseCompressedOops && (CompressedOops::shift() == Address::times_8));
 3158   constraint(ALLOC_IN_RC(ptr_reg));
 3159   match(AddP (DecodeN reg) off);
 3160 
 3161   op_cost(10);
 3162   format %{"[R12 + $reg << 3 + $off] (compressed oop addressing)" %}
 3163   interface(MEMORY_INTER) %{
 3164     base(0xc); // R12
 3165     index($reg);
 3166     scale(0x3);
 3167     disp($off);
 3168   %}
 3169 %}
 3170 
 3171 // Indirect Memory Operand
 3172 operand indirectNarrow(rRegN reg)

 3479     equal(0x4, "e");
 3480     not_equal(0x5, "ne");
 3481     less(0x2, "b");
 3482     greater_equal(0x3, "ae");
 3483     less_equal(0x6, "be");
 3484     greater(0x7, "a");
 3485     overflow(0x0, "o");
 3486     no_overflow(0x1, "no");
 3487   %}
 3488 %}
 3489 
 3490 //----------OPERAND CLASSES----------------------------------------------------
  3491 // Operand Classes are groups of operands that are used to simplify
 3492 // instruction definitions by not requiring the AD writer to specify separate
 3493 // instructions for every form of operand when the instruction accepts
 3494 // multiple operand types with the same basic encoding and format.  The classic
 3495 // case of this is memory operands.
 3496 
 3497 opclass memory(indirect, indOffset8, indOffset32, indIndexOffset, indIndex,
 3498                indIndexScale, indPosIndexScale, indIndexScaleOffset, indPosIndexOffset, indPosIndexScaleOffset,
 3499                indCompressedOop, indCompressedOopOffset,
 3500                indirectNarrow, indOffset8Narrow, indOffset32Narrow,
 3501                indIndexOffsetNarrow, indIndexNarrow, indIndexScaleNarrow,
 3502                indIndexScaleOffsetNarrow, indPosIndexOffsetNarrow, indPosIndexScaleOffsetNarrow);
 3503 
 3504 //----------PIPELINE-----------------------------------------------------------
 3505 // Rules which define the behavior of the target architectures pipeline.
 3506 pipeline %{
 3507 
 3508 //----------ATTRIBUTES---------------------------------------------------------
 3509 attributes %{
  3510   variable_size_instructions;        // Variable-size instructions
 3511   max_instructions_per_bundle = 3;   // Up to 3 instructions per bundle
  3512   instruction_unit_size = 1;         // An instruction is 1 byte long
 3513   instruction_fetch_unit_size = 16;  // The processor fetches one line
 3514   instruction_fetch_units = 1;       // of 16 bytes
 3515 
 3516   // List of nop instructions
 3517   nops( MachNop );
 3518 %}
 3519 

 6010   format %{ "MEMBAR-storestore (empty encoding)" %}
 6011   ins_encode( );
 6012   ins_pipe(empty);
 6013 %}
 6014 
 6015 //----------Move Instructions--------------------------------------------------
 6016 
 6017 instruct castX2P(rRegP dst, rRegL src)
 6018 %{
 6019   match(Set dst (CastX2P src));
 6020 
 6021   format %{ "movq    $dst, $src\t# long->ptr" %}
 6022   ins_encode %{
 6023     if ($dst$$reg != $src$$reg) {
 6024       __ movptr($dst$$Register, $src$$Register);
 6025     }
 6026   %}
 6027   ins_pipe(ialu_reg_reg); // XXX
 6028 %}
 6029 
 6030 instruct castN2X(rRegL dst, rRegN src)
 6031 %{
 6032   match(Set dst (CastP2X src));
 6033 
 6034   format %{ "movq    $dst, $src\t# ptr -> long" %}
 6035   ins_encode %{
 6036     if ($dst$$reg != $src$$reg) {
 6037       __ movptr($dst$$Register, $src$$Register);
 6038     }
 6039   %}
 6040   ins_pipe(ialu_reg_reg); // XXX
 6041 %}
 6042 
 6043 instruct castP2X(rRegL dst, rRegP src)
 6044 %{
 6045   match(Set dst (CastP2X src));
 6046 
 6047   format %{ "movq    $dst, $src\t# ptr -> long" %}
 6048   ins_encode %{
 6049     if ($dst$$reg != $src$$reg) {
 6050       __ movptr($dst$$Register, $src$$Register);
 6051     }
 6052   %}
 6053   ins_pipe(ialu_reg_reg); // XXX
 6054 %}
 6055 
 6056 // Convert oop into int for vectors alignment masking
 6057 instruct convP2I(rRegI dst, rRegP src)
 6058 %{
 6059   match(Set dst (ConvL2I (CastP2X src)));
 6060 
 6061   format %{ "movl    $dst, $src\t# ptr -> int" %}
 6062   ins_encode %{

10564   effect(DEF dst, USE src);
10565   ins_cost(100);
10566   format %{ "movd    $dst,$src\t# MoveI2F" %}
10567   ins_encode %{
10568     __ movdl($dst$$XMMRegister, $src$$Register);
10569   %}
10570   ins_pipe( pipe_slow );
10571 %}
10572 
10573 instruct MoveL2D_reg_reg(regD dst, rRegL src) %{
10574   match(Set dst (MoveL2D src));
10575   effect(DEF dst, USE src);
10576   ins_cost(100);
10577   format %{ "movd    $dst,$src\t# MoveL2D" %}
10578   ins_encode %{
10579      __ movdq($dst$$XMMRegister, $src$$Register);
10580   %}
10581   ins_pipe( pipe_slow );
10582 %}
10583 
10584 
10585 // Fast clearing of an array
10586 // Small ClearArray non-AVX512.
10587 instruct rep_stos(rcx_RegL cnt, rdi_RegP base, regD tmp, rax_RegL val,
10588                   Universe dummy, rFlagsReg cr)
10589 %{
10590   predicate(!((ClearArrayNode*)n)->is_large() && !((ClearArrayNode*)n)->word_copy_only() && (UseAVX <= 2));
10591   match(Set dummy (ClearArray (Binary cnt base) val));
10592   effect(USE_KILL cnt, USE_KILL base, TEMP tmp, USE_KILL val, KILL cr);
10593 
10594   format %{ $$template
10595     $$emit$$"cmp     InitArrayShortSize,rcx\n\t"
10596     $$emit$$"jg      LARGE\n\t"
10597     $$emit$$"dec     rcx\n\t"
10598     $$emit$$"js      DONE\t# Zero length\n\t"
10599     $$emit$$"mov     rax,(rdi,rcx,8)\t# LOOP\n\t"
10600     $$emit$$"dec     rcx\n\t"
10601     $$emit$$"jge     LOOP\n\t"
10602     $$emit$$"jmp     DONE\n\t"
10603     $$emit$$"# LARGE:\n\t"
10604     if (UseFastStosb) {
10605        $$emit$$"shlq    rcx,3\t# Convert doublewords to bytes\n\t"
10606        $$emit$$"rep     stosb\t# Store rax to *rdi++ while rcx--\n\t"
10607     } else if (UseXMMForObjInit) {
10608        $$emit$$"movdq   $tmp, $val\n\t"
10609        $$emit$$"punpcklqdq $tmp, $tmp\n\t"
10610        $$emit$$"vinserti128_high $tmp, $tmp\n\t"
10611        $$emit$$"jmpq    L_zero_64_bytes\n\t"
10612        $$emit$$"# L_loop:\t# 64-byte LOOP\n\t"
10613        $$emit$$"vmovdqu $tmp,(rax)\n\t"
10614        $$emit$$"vmovdqu $tmp,0x20(rax)\n\t"
10615        $$emit$$"add     0x40,rax\n\t"
10616        $$emit$$"# L_zero_64_bytes:\n\t"
10617        $$emit$$"sub     0x8,rcx\n\t"
10618        $$emit$$"jge     L_loop\n\t"
10619        $$emit$$"add     0x4,rcx\n\t"
10620        $$emit$$"jl      L_tail\n\t"
10621        $$emit$$"vmovdqu $tmp,(rax)\n\t"
10622        $$emit$$"add     0x20,rax\n\t"
10623        $$emit$$"sub     0x4,rcx\n\t"
10624        $$emit$$"# L_tail:\t# Clearing tail bytes\n\t"
10625        $$emit$$"add     0x4,rcx\n\t"
10626        $$emit$$"jle     L_end\n\t"
10627        $$emit$$"dec     rcx\n\t"
10628        $$emit$$"# L_sloop:\t# 8-byte short loop\n\t"
10629        $$emit$$"vmovq   xmm0,(rax)\n\t"
10630        $$emit$$"add     0x8,rax\n\t"
10631        $$emit$$"dec     rcx\n\t"
10632        $$emit$$"jge     L_sloop\n\t"
10633        $$emit$$"# L_end:\n\t"
10634     } else {
10635        $$emit$$"rep     stosq\t# Store rax to *rdi++ while rcx--\n\t"
10636     }
10637     $$emit$$"# DONE"
10638   %}
10639   ins_encode %{
10640     __ clear_mem($base$$Register, $cnt$$Register, $val$$Register,
10641                  $tmp$$XMMRegister, false, false);
10642   %}
10643   ins_pipe(pipe_slow);
10644 %}
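
Compared with the pre-change rule earlier in this page, the ClearArray match now carries an explicit 64-bit init value in rax ($val) rather than always storing zero; as I read the diff, that lets flattened value-class arrays be initialized with a non-zero default payload. A minimal model of what the non-word-copy case asks clear_mem() to do (a plain loop standing in for the rep stos / XMM / AVX-512 paths):

#include <cassert>
#include <cstdint>

// Store the 64-bit pattern in `val` into `cnt` consecutive words starting at `base`.
static void fill_words(uint64_t* base, uint64_t cnt, uint64_t val) {
  for (uint64_t i = 0; i < cnt; i++) {
    base[i] = val;
  }
}

int main() {
  uint64_t buf[8];
  fill_words(buf, 8, 0);                      // classic ClearArray: zero the payload
  assert(buf[0] == 0 && buf[7] == 0);
  fill_words(buf, 8, 0x0102030405060708ull);  // non-zero default payload (Valhalla-style)
  assert(buf[3] == 0x0102030405060708ull);
  return 0;
}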
10645 
10646 instruct rep_stos_word_copy(rcx_RegL cnt, rdi_RegP base, regD tmp, rax_RegL val,
10647                             Universe dummy, rFlagsReg cr)
10648 %{
10649   predicate(!((ClearArrayNode*)n)->is_large() && ((ClearArrayNode*)n)->word_copy_only() && (UseAVX <= 2));
10650   match(Set dummy (ClearArray (Binary cnt base) val));
10651   effect(USE_KILL cnt, USE_KILL base, TEMP tmp, USE_KILL val, KILL cr);
10652 
10653   format %{ $$template
10654     $$emit$$"cmp     InitArrayShortSize,rcx\n\t"
10655     $$emit$$"jg      LARGE\n\t"
10656     $$emit$$"dec     rcx\n\t"
10657     $$emit$$"js      DONE\t# Zero length\n\t"
10658     $$emit$$"mov     rax,(rdi,rcx,8)\t# LOOP\n\t"
10659     $$emit$$"dec     rcx\n\t"
10660     $$emit$$"jge     LOOP\n\t"
10661     $$emit$$"jmp     DONE\n\t"
10662     $$emit$$"# LARGE:\n\t"
10663     if (UseXMMForObjInit) {
10664        $$emit$$"movdq   $tmp, $val\n\t"
10665        $$emit$$"punpcklqdq $tmp, $tmp\n\t"
10666        $$emit$$"vinserti128_high $tmp, $tmp\n\t"
10667        $$emit$$"jmpq    L_zero_64_bytes\n\t"
10668        $$emit$$"# L_loop:\t# 64-byte LOOP\n\t"
10669        $$emit$$"vmovdqu $tmp,(rax)\n\t"
10670        $$emit$$"vmovdqu $tmp,0x20(rax)\n\t"
10671        $$emit$$"add     0x40,rax\n\t"
10672        $$emit$$"# L_zero_64_bytes:\n\t"
10673        $$emit$$"sub     0x8,rcx\n\t"
10674        $$emit$$"jge     L_loop\n\t"
10675        $$emit$$"add     0x4,rcx\n\t"
10676        $$emit$$"jl      L_tail\n\t"
10677        $$emit$$"vmovdqu $tmp,(rax)\n\t"
10678        $$emit$$"add     0x20,rax\n\t"
10679        $$emit$$"sub     0x4,rcx\n\t"
10680        $$emit$$"# L_tail:\t# Clearing tail bytes\n\t"
10681        $$emit$$"add     0x4,rcx\n\t"
10682        $$emit$$"jle     L_end\n\t"
10683        $$emit$$"dec     rcx\n\t"
10684        $$emit$$"# L_sloop:\t# 8-byte short loop\n\t"
10685        $$emit$$"vmovq   xmm0,(rax)\n\t"
10686        $$emit$$"add     0x8,rax\n\t"
10687        $$emit$$"dec     rcx\n\t"
10688        $$emit$$"jge     L_sloop\n\t"
10689        $$emit$$"# L_end:\n\t"
10690     } else {
10691        $$emit$$"rep     stosq\t# Store rax to *rdi++ while rcx--\n\t"
10692     }
10693     $$emit$$"# DONE"
10694   %}
10695   ins_encode %{
10696     __ clear_mem($base$$Register, $cnt$$Register, $val$$Register,
10697                  $tmp$$XMMRegister, false, true);
10698   %}
10699   ins_pipe(pipe_slow);
10700 %}
10701 
10702 // Small ClearArray AVX512 non-constant length.
10703 instruct rep_stos_evex(rcx_RegL cnt, rdi_RegP base, legRegD tmp, kReg ktmp, rax_RegL val,
10704                        Universe dummy, rFlagsReg cr)
10705 %{
10706   predicate(!((ClearArrayNode*)n)->is_large() && !((ClearArrayNode*)n)->word_copy_only() && (UseAVX > 2));
10707   match(Set dummy (ClearArray (Binary cnt base) val));
10708   ins_cost(125);
10709   effect(USE_KILL cnt, USE_KILL base, TEMP tmp, TEMP ktmp, USE_KILL val, KILL cr);
10710 
10711   format %{ $$template
10712     $$emit$$"xorq    rax, rax\t# ClearArray:\n\t"
10713     $$emit$$"cmp     InitArrayShortSize,rcx\n\t"
10714     $$emit$$"jg      LARGE\n\t"
10715     $$emit$$"dec     rcx\n\t"
10716     $$emit$$"js      DONE\t# Zero length\n\t"
10717     $$emit$$"mov     rax,(rdi,rcx,8)\t# LOOP\n\t"
10718     $$emit$$"dec     rcx\n\t"
10719     $$emit$$"jge     LOOP\n\t"
10720     $$emit$$"jmp     DONE\n\t"
10721     $$emit$$"# LARGE:\n\t"
10722     if (UseFastStosb) {
10723        $$emit$$"shlq    rcx,3\t# Convert doublewords to bytes\n\t"
10724        $$emit$$"rep     stosb\t# Store rax to *rdi++ while rcx--\n\t"
10725     } else if (UseXMMForObjInit) {
10726        $$emit$$"mov     rdi,rax\n\t"
10727        $$emit$$"vpxor   ymm0,ymm0,ymm0\n\t"
10728        $$emit$$"jmpq    L_zero_64_bytes\n\t"
10729        $$emit$$"# L_loop:\t# 64-byte LOOP\n\t"

10737        $$emit$$"jl      L_tail\n\t"
10738        $$emit$$"vmovdqu ymm0,(rax)\n\t"
10739        $$emit$$"add     0x20,rax\n\t"
10740        $$emit$$"sub     0x4,rcx\n\t"
10741        $$emit$$"# L_tail:\t# Clearing tail bytes\n\t"
10742        $$emit$$"add     0x4,rcx\n\t"
10743        $$emit$$"jle     L_end\n\t"
10744        $$emit$$"dec     rcx\n\t"
10745        $$emit$$"# L_sloop:\t# 8-byte short loop\n\t"
10746        $$emit$$"vmovq   xmm0,(rax)\n\t"
10747        $$emit$$"add     0x8,rax\n\t"
10748        $$emit$$"dec     rcx\n\t"
10749        $$emit$$"jge     L_sloop\n\t"
10750        $$emit$$"# L_end:\n\t"
10751     } else {
10752        $$emit$$"rep     stosq\t# Store rax to *rdi++ while rcx--\n\t"
10753     }
10754     $$emit$$"# DONE"
10755   %}
10756   ins_encode %{
10757     __ clear_mem($base$$Register, $cnt$$Register, $val$$Register,
10758                  $tmp$$XMMRegister, false, false, $ktmp$$KRegister);
10759   %}
10760   ins_pipe(pipe_slow);
10761 %}
10762 
10763 instruct rep_stos_evex_word_copy(rcx_RegL cnt, rdi_RegP base, legRegD tmp, kReg ktmp, rax_RegL val,
10764                                  Universe dummy, rFlagsReg cr)

10765 %{
10766   predicate(!((ClearArrayNode*)n)->is_large() && ((ClearArrayNode*)n)->word_copy_only() && (UseAVX > 2));
10767   match(Set dummy (ClearArray (Binary cnt base) val));
10768   ins_cost(125);
10769   effect(USE_KILL cnt, USE_KILL base, TEMP tmp, TEMP ktmp, USE_KILL val, KILL cr);
10770 
10771   format %{ $$template
10772     $$emit$$"xorq    rax, rax\t# ClearArray:\n\t"
10773     $$emit$$"cmp     InitArrayShortSize,rcx\n\t"
10774     $$emit$$"jg      LARGE\n\t"
10775     $$emit$$"dec     rcx\n\t"
10776     $$emit$$"js      DONE\t# Zero length\n\t"
10777     $$emit$$"mov     rax,(rdi,rcx,8)\t# LOOP\n\t"
10778     $$emit$$"dec     rcx\n\t"
10779     $$emit$$"jge     LOOP\n\t"
10780     $$emit$$"jmp     DONE\n\t"
10781     $$emit$$"# LARGE:\n\t"
10782     if (UseFastStosb) {
10783        $$emit$$"shlq    rcx,3\t# Convert doublewords to bytes\n\t"
10784        $$emit$$"rep     stosb\t# Store rax to *rdi++ while rcx--\n\t"
10785     } else if (UseXMMForObjInit) {
10786        $$emit$$"mov     rdi,rax\n\t"
10787        $$emit$$"vpxor   ymm0,ymm0,ymm0\n\t"
10788        $$emit$$"jmpq    L_zero_64_bytes\n\t"
10789        $$emit$$"# L_loop:\t# 64-byte LOOP\n\t"

10797        $$emit$$"jl      L_tail\n\t"
10798        $$emit$$"vmovdqu ymm0,(rax)\n\t"
10799        $$emit$$"add     0x20,rax\n\t"
10800        $$emit$$"sub     0x4,rcx\n\t"
10801        $$emit$$"# L_tail:\t# Clearing tail bytes\n\t"
10802        $$emit$$"add     0x4,rcx\n\t"
10803        $$emit$$"jle     L_end\n\t"
10804        $$emit$$"dec     rcx\n\t"
10805        $$emit$$"# L_sloop:\t# 8-byte short loop\n\t"
10806        $$emit$$"vmovq   xmm0,(rax)\n\t"
10807        $$emit$$"add     0x8,rax\n\t"
10808        $$emit$$"dec     rcx\n\t"
10809        $$emit$$"jge     L_sloop\n\t"
10810        $$emit$$"# L_end:\n\t"
10811     } else {
10812        $$emit$$"rep     stosq\t# Store rax to *rdi++ while rcx--\n\t"
10813     }
10814     $$emit$$"# DONE"
10815   %}
10816   ins_encode %{
10817     __ clear_mem($base$$Register, $cnt$$Register, $val$$Register,
10818                  $tmp$$XMMRegister, false, true, $ktmp$$KRegister);
10819   %}
10820   ins_pipe(pipe_slow);
10821 %}
10822 
10823 // Large ClearArray non-AVX512.
10824 instruct rep_stos_large(rcx_RegL cnt, rdi_RegP base, regD tmp, rax_RegL val,
10825                         Universe dummy, rFlagsReg cr)
10826 %{
10827   predicate(((ClearArrayNode*)n)->is_large() && !((ClearArrayNode*)n)->word_copy_only() && (UseAVX <= 2));
10828   match(Set dummy (ClearArray (Binary cnt base) val));
10829   effect(USE_KILL cnt, USE_KILL base, TEMP tmp, USE_KILL val, KILL cr);
10830 
10831   format %{ $$template
10832     if (UseFastStosb) {
10833        $$emit$$"shlq    rcx,3\t# Convert doublewords to bytes\n\t"
10834        $$emit$$"rep     stosb\t# Store rax to *rdi++ while rcx--"
10835     } else if (UseXMMForObjInit) {
10836        $$emit$$"movdq   $tmp, $val\n\t"
10837        $$emit$$"punpcklqdq $tmp, $tmp\n\t"
10838        $$emit$$"vinserti128_high $tmp, $tmp\n\t"
10839        $$emit$$"jmpq    L_zero_64_bytes\n\t"
10840        $$emit$$"# L_loop:\t# 64-byte LOOP\n\t"
10841        $$emit$$"vmovdqu $tmp,(rax)\n\t"
10842        $$emit$$"vmovdqu $tmp,0x20(rax)\n\t"
10843        $$emit$$"add     0x40,rax\n\t"
10844        $$emit$$"# L_zero_64_bytes:\n\t"
10845        $$emit$$"sub     0x8,rcx\n\t"
10846        $$emit$$"jge     L_loop\n\t"
10847        $$emit$$"add     0x4,rcx\n\t"
10848        $$emit$$"jl      L_tail\n\t"
10849        $$emit$$"vmovdqu $tmp,(rax)\n\t"
10850        $$emit$$"add     0x20,rax\n\t"
10851        $$emit$$"sub     0x4,rcx\n\t"
10852        $$emit$$"# L_tail:\t# Clearing tail bytes\n\t"
10853        $$emit$$"add     0x4,rcx\n\t"
10854        $$emit$$"jle     L_end\n\t"
10855        $$emit$$"dec     rcx\n\t"
10856        $$emit$$"# L_sloop:\t# 8-byte short loop\n\t"
10857        $$emit$$"vmovq   xmm0,(rax)\n\t"
10858        $$emit$$"add     0x8,rax\n\t"
10859        $$emit$$"dec     rcx\n\t"
10860        $$emit$$"jge     L_sloop\n\t"
10861        $$emit$$"# L_end:\n\t"
10862     } else {
10863        $$emit$$"rep     stosq\t# Store rax to *rdi++ while rcx--"
10864     }
10865   %}
10866   ins_encode %{
10867     __ clear_mem($base$$Register, $cnt$$Register, $val$$Register,
10868                  $tmp$$XMMRegister, true, false);
10869   %}
10870   ins_pipe(pipe_slow);
10871 %}
10872 
10873 instruct rep_stos_large_word_copy(rcx_RegL cnt, rdi_RegP base, regD tmp, rax_RegL val,
10874                                   Universe dummy, rFlagsReg cr)
10875 %{
10876   predicate(((ClearArrayNode*)n)->is_large() && ((ClearArrayNode*)n)->word_copy_only() && (UseAVX <= 2));
10877   match(Set dummy (ClearArray (Binary cnt base) val));
10878   effect(USE_KILL cnt, USE_KILL base, TEMP tmp, USE_KILL val, KILL cr);
10879 
10880   format %{ $$template
10881     if (UseXMMForObjInit) {
10882        $$emit$$"movdq   $tmp, $val\n\t"
10883        $$emit$$"punpcklqdq $tmp, $tmp\n\t"
10884        $$emit$$"vinserti128_high $tmp, $tmp\n\t"
10885        $$emit$$"jmpq    L_zero_64_bytes\n\t"
10886        $$emit$$"# L_loop:\t# 64-byte LOOP\n\t"
10887        $$emit$$"vmovdqu $tmp,(rax)\n\t"
10888        $$emit$$"vmovdqu $tmp,0x20(rax)\n\t"
10889        $$emit$$"add     0x40,rax\n\t"
10890        $$emit$$"# L_zero_64_bytes:\n\t"
10891        $$emit$$"sub     0x8,rcx\n\t"
10892        $$emit$$"jge     L_loop\n\t"
10893        $$emit$$"add     0x4,rcx\n\t"
10894        $$emit$$"jl      L_tail\n\t"
10895        $$emit$$"vmovdqu $tmp,(rax)\n\t"
10896        $$emit$$"add     0x20,rax\n\t"
10897        $$emit$$"sub     0x4,rcx\n\t"
10898        $$emit$$"# L_tail:\t# Clearing tail bytes\n\t"
10899        $$emit$$"add     0x4,rcx\n\t"
10900        $$emit$$"jle     L_end\n\t"
10901        $$emit$$"dec     rcx\n\t"
10902        $$emit$$"# L_sloop:\t# 8-byte short loop\n\t"
10903        $$emit$$"vmovq   xmm0,(rax)\n\t"
10904        $$emit$$"add     0x8,rax\n\t"
10905        $$emit$$"dec     rcx\n\t"
10906        $$emit$$"jge     L_sloop\n\t"
10907        $$emit$$"# L_end:\n\t"
10908     } else {
10909        $$emit$$"rep     stosq\t# Store rax to *rdi++ while rcx--"
10910     }
10911   %}
10912   ins_encode %{
10913     __ clear_mem($base$$Register, $cnt$$Register, $val$$Register,
10914                  $tmp$$XMMRegister, true, true);
10915   %}
10916   ins_pipe(pipe_slow);
10917 %}
10918 
10919 // Large ClearArray AVX512.
10920 instruct rep_stos_large_evex(rcx_RegL cnt, rdi_RegP base, legRegD tmp, kReg ktmp, rax_RegL val,
10921                              Universe dummy, rFlagsReg cr)
10922 %{
10923   predicate(((ClearArrayNode*)n)->is_large() && !((ClearArrayNode*)n)->word_copy_only() && (UseAVX > 2));
10924   match(Set dummy (ClearArray (Binary cnt base) val));
10925   effect(USE_KILL cnt, USE_KILL base, TEMP tmp, TEMP ktmp, USE_KILL val, KILL cr);
10926 
10927   format %{ $$template
10928     if (UseFastStosb) {
10929        $$emit$$"xorq    rax, rax\t# ClearArray:\n\t"
10930        $$emit$$"shlq    rcx,3\t# Convert quadwords to bytes\n\t"
10931        $$emit$$"rep     stosb\t# Store rax to *rdi++ while rcx--"
10932     } else if (UseXMMForObjInit) {
10933        $$emit$$"mov     rdi,rax\t# ClearArray:\n\t"
10934        $$emit$$"vpxor   ymm0,ymm0,ymm0\n\t"
10935        $$emit$$"jmpq    L_zero_64_bytes\n\t"
10936        $$emit$$"# L_loop:\t# 64-byte LOOP\n\t"
10937        $$emit$$"vmovdqu ymm0,(rax)\n\t"
10938        $$emit$$"vmovdqu ymm0,0x20(rax)\n\t"
10939        $$emit$$"add     0x40,rax\n\t"
10940        $$emit$$"# L_zero_64_bytes:\n\t"
10941        $$emit$$"sub     0x8,rcx\n\t"
10942        $$emit$$"jge     L_loop\n\t"
10943        $$emit$$"add     0x4,rcx\n\t"
10944        $$emit$$"jl      L_tail\n\t"
10945        $$emit$$"vmovdqu ymm0,(rax)\n\t"
10946        $$emit$$"add     0x20,rax\n\t"
10947        $$emit$$"sub     0x4,rcx\n\t"
10948        $$emit$$"# L_tail:\t# Clearing tail bytes\n\t"
10949        $$emit$$"add     0x4,rcx\n\t"
10950        $$emit$$"jle     L_end\n\t"
10951        $$emit$$"dec     rcx\n\t"
10952        $$emit$$"# L_sloop:\t# 8-byte short loop\n\t"
10953        $$emit$$"vmovq   xmm0,(rax)\n\t"
10954        $$emit$$"add     0x8,rax\n\t"
10955        $$emit$$"dec     rcx\n\t"
10956        $$emit$$"jge     L_sloop\n\t"
10957        $$emit$$"# L_end:\n\t"
10958     } else {
10959        $$emit$$"xorq    rax, rax\t# ClearArray:\n\t"
10960        $$emit$$"rep     stosq\t# Store rax to *rdi++ while rcx--"
10961     }
10962   %}
10963   ins_encode %{
10964     __ clear_mem($base$$Register, $cnt$$Register, $val$$Register,
10965                  $tmp$$XMMRegister, true, false, $ktmp$$KRegister);
10966   %}
10967   ins_pipe(pipe_slow);
10968 %}
10969 
10970 instruct rep_stos_large_evex_word_copy(rcx_RegL cnt, rdi_RegP base, legRegD tmp, kReg ktmp, rax_RegL val,
10971                                        Universe dummy, rFlagsReg cr)

10972 %{
10973   predicate(((ClearArrayNode*)n)->is_large() && ((ClearArrayNode*)n)->word_copy_only() && (UseAVX > 2));
10974   match(Set dummy (ClearArray (Binary cnt base) val));
10975   effect(USE_KILL cnt, USE_KILL base, TEMP tmp, TEMP ktmp, USE_KILL val, KILL cr);
10976 
10977   format %{ $$template
10978     if (UseFastStosb) {
10979        $$emit$$"xorq    rax, rax\t# ClearArray:\n\t"
10980        $$emit$$"shlq    rcx,3\t# Convert quadwords to bytes\n\t"
10981        $$emit$$"rep     stosb\t# Store rax to *rdi++ while rcx--"
10982     } else if (UseXMMForObjInit) {
10983        $$emit$$"mov     rdi,rax\t# ClearArray:\n\t"
10984        $$emit$$"vpxor   ymm0,ymm0,ymm0\n\t"
10985        $$emit$$"jmpq    L_zero_64_bytes\n\t"
10986        $$emit$$"# L_loop:\t# 64-byte LOOP\n\t"
10987        $$emit$$"vmovdqu ymm0,(rax)\n\t"
10988        $$emit$$"vmovdqu ymm0,0x20(rax)\n\t"
10989        $$emit$$"add     0x40,rax\n\t"
10990        $$emit$$"# L_zero_64_bytes:\n\t"
10991        $$emit$$"sub     0x8,rcx\n\t"
10992        $$emit$$"jge     L_loop\n\t"
10993        $$emit$$"add     0x4,rcx\n\t"
10994        $$emit$$"jl      L_tail\n\t"
10995        $$emit$$"vmovdqu ymm0,(rax)\n\t"
10996        $$emit$$"add     0x20,rax\n\t"
10997        $$emit$$"sub     0x4,rcx\n\t"
10998        $$emit$$"# L_tail:\t# Clearing tail bytes\n\t"
10999        $$emit$$"add     0x4,rcx\n\t"
11000        $$emit$$"jle     L_end\n\t"
11001        $$emit$$"dec     rcx\n\t"
11002        $$emit$$"# L_sloop:\t# 8-byte short loop\n\t"
11003        $$emit$$"vmovq   xmm0,(rax)\n\t"
11004        $$emit$$"add     0x8,rax\n\t"
11005        $$emit$$"dec     rcx\n\t"
11006        $$emit$$"jge     L_sloop\n\t"
11007        $$emit$$"# L_end:\n\t"
11008     } else {
11009        $$emit$$"xorq    rax, rax\t# ClearArray:\n\t"
11010        $$emit$$"rep     stosq\t# Store rax to *rdi++ while rcx--"
11011     }
11012   %}
11013   ins_encode %{
11014     __ clear_mem($base$$Register, $cnt$$Register, $val$$Register,
11015                  $tmp$$XMMRegister, true, true, $ktmp$$KRegister);
11016   %}
11017   ins_pipe(pipe_slow);
11018 %}
11019 
11020 // Small ClearArray AVX512 constant length.
11021 instruct rep_stos_im(immL cnt, rRegP base, regD tmp, rax_RegL val, kReg ktmp, Universe dummy, rFlagsReg cr)
11022 %{
11023   predicate(!((ClearArrayNode*)n)->is_large() && !((ClearArrayNode*)n)->word_copy_only() &&
11024             ((UseAVX > 2) && VM_Version::supports_avx512vlbw()));
11025   match(Set dummy (ClearArray (Binary cnt base) val));
11026   ins_cost(100);
11027   effect(TEMP tmp, USE_KILL val, TEMP ktmp, KILL cr);
11028   format %{ "clear_mem_imm $base, $cnt\n\t" %}
11029   ins_encode %{
11030     __ clear_mem($base$$Register, $cnt$$constant, $val$$Register, $tmp$$XMMRegister, $ktmp$$KRegister);
11031   %}
11032   ins_pipe(pipe_slow);
11033 %}
11034 
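// Note (editor): unlike the register-count rules above, rep_stos_im takes the
// element count as a compile-time constant ($cnt$$constant) and is only matched
// when AVX-512 VL/BW is available; presumably the kReg opmask lets the tail be
// cleared with masked stores instead of a scalar loop. The low ins_cost(100)
// appears intended to make the matcher prefer this form over the rep-stos rules
// whenever the length is a known small constant.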
11035 instruct string_compareL(rdi_RegP str1, rcx_RegI cnt1, rsi_RegP str2, rdx_RegI cnt2,
11036                          rax_RegI result, legRegD tmp1, rFlagsReg cr)
11037 %{
11038   predicate(!VM_Version::supports_avx512vlbw() && ((StrCompNode*)n)->encoding() == StrIntrinsicNode::LL);
11039   match(Set result (StrComp (Binary str1 cnt1) (Binary str2 cnt2)));
11040   effect(TEMP tmp1, USE_KILL str1, USE_KILL str2, USE_KILL cnt1, USE_KILL cnt2, KILL cr);
11041 
11042   format %{ "String Compare byte[] $str1,$cnt1,$str2,$cnt2 -> $result   // KILL $tmp1" %}
11043   ins_encode %{
11044     __ string_compare($str1$$Register, $str2$$Register,
11045                       $cnt1$$Register, $cnt2$$Register, $result$$Register,
11046                       $tmp1$$XMMRegister, StrIntrinsicNode::LL, knoreg);
11047   %}
11048   ins_pipe( pipe_slow );
11049 %}
11050 
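// Note (editor): StrIntrinsicNode::LL indicates that both inputs are Latin-1
// (single-byte) strings; companion rules elsewhere in this file handle the UU,
// LU and UL encodings. knoreg is passed because this rule is only selected when
// AVX-512 VL/BW is unavailable, so string_compare has no opmask register to use.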

12780 
12781   ins_cost(300);
12782   format %{ "call_leaf,runtime " %}
12783   ins_encode(clear_avx, Java_To_Runtime(meth));
12784   ins_pipe(pipe_slow);
12785 %}
12786 
12787 // Call runtime without safepoint and with vector arguments
12788 instruct CallLeafDirectVector(method meth)
12789 %{
12790   match(CallLeafVector);
12791   effect(USE meth);
12792 
12793   ins_cost(300);
12794   format %{ "call_leaf,vector " %}
12795   ins_encode(Java_To_Runtime(meth));
12796   ins_pipe(pipe_slow);
12797 %}
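// Note (editor): unlike the neighbouring leaf-call rules, CallLeafDirectVector
// does not prepend the clear_avx encoding; presumably emitting a vzeroupper
// here would clobber the upper lanes of the vector arguments this call exists
// to pass.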
12798 
12799 // Call runtime without safepoint
12800 // entry point is null, target holds the address to call
12801 instruct CallLeafNoFPInDirect(rRegP target)
12802 %{
12803   predicate(n->as_Call()->entry_point() == nullptr);
12804   match(CallLeafNoFP target);
12805 
12806   ins_cost(300);
12807   format %{ "call_leaf_nofp,runtime indirect " %}
12808   ins_encode %{
12809      __ call($target$$Register);
12810   %}
12811 
12812   ins_pipe(pipe_slow);
12813 %}
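// Note (editor): this indirect form is matched when the ideal call node carries
// no static entry point, so the target address arrives in a register and the
// call is emitted as a plain __ call($target$$Register); the direct form below
// instead routes a known address through the Java_To_Runtime encoding.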
12814 
12815 instruct CallLeafNoFPDirect(method meth)
12816 %{
12817   predicate(n->as_Call()->entry_point() != nullptr);
12818   match(CallLeafNoFP);
12819   effect(USE meth);
12820 
12821   ins_cost(300);
12822   format %{ "call_leaf_nofp,runtime " %}
12823   ins_encode(clear_avx, Java_To_Runtime(meth));
12824   ins_pipe(pipe_slow);
12825 %}
12826 
12827 // Return Instruction
12828 // Remove the return address & jump to it.
12829 // Notice: We always emit a nop after a ret to make sure there is room
12830 // for safepoint patching
12831 instruct Ret()
12832 %{
12833   match(Return);
12834 
12835   format %{ "ret" %}
12836   ins_encode %{
12837     __ ret(0);