1 /*
  2  * Copyright (c) 2020, 2021, Oracle and/or its affiliates. All rights reserved.
  3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  4  *
  5  * This code is free software; you can redistribute it and/or modify it
  6  * under the terms of the GNU General Public License version 2 only, as
  7  * published by the Free Software Foundation.
  8  *
  9  * This code is distributed in the hope that it will be useful, but WITHOUT
 10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 12  * version 2 for more details (a copy is included in the LICENSE file that
 13  * accompanied this code).
 14  *
 15  * You should have received a copy of the GNU General Public License version
 16  * 2 along with this work; if not, write to the Free Software Foundation,
 17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 18  *
 19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 20  * or visit www.oracle.com if you need additional information or have any
 21  * questions.
 22  *
 23  */
 24 
 25 #ifndef CPU_X86_C2_MACROASSEMBLER_X86_HPP
 26 #define CPU_X86_C2_MACROASSEMBLER_X86_HPP
 27 
 28 // C2_MacroAssembler contains high-level macros for C2
 29 
 30 public:
      // Returns the Assembler::AvxVectorLen value for a vector whose size is
      // given in bytes (vlen_in_bytes). Only declared here; the mapping itself
      // is defined out of line.
 31   Assembler::AvxVectorLen vector_length_encoding(int vlen_in_bytes);
 32 
 33   // Special instructions for EVEX. Both helpers take a KRegister mask
      // operand; their definitions are not part of this header.
 34   void setvectmask(Register dst, Register src, KRegister mask);
 35   void restorevectmask(KRegister mask);
 36 
 37   // Code used by cmpFastLock and cmpFastUnlock mach instructions in .ad file.
 38   // See full description in macroAssembler_x86.cpp.
      //
      // fast_lock additionally receives a BiasedLockingCounters*, two
      // RTMLockingCounters*, a Metadata* and the use_rtm/profile_rtm flags;
      // fast_unlock only receives use_rtm.
      // NOTE(review): the roles of scr/cx1/cx2 are not visible from this
      // header -- presumably scratch registers; confirm in the implementation.
 39   void fast_lock(Register obj, Register box, Register tmp,
 40                  Register scr, Register cx1, Register cx2, Register thread,
 41                  BiasedLockingCounters* counters,
 42                  RTMLockingCounters* rtm_counters,
 43                  RTMLockingCounters* stack_rtm_counters,
 44                  Metadata* method_data,
 45                  bool use_rtm, bool profile_rtm);
 46   void fast_unlock(Register obj, Register box, Register tmp, bool use_rtm);
 47 
      // Lock/unlock entry points with a shorter register list than
      // fast_lock/fast_unlock and no counter or RTM arguments.
      // NOTE(review): the register spelled rax_reg in the lock variant is
      // spelled reg_rax in the unlock variant; both names suggest the caller
      // is expected to pass rax -- confirm in the implementation.
 48   void fast_lock_lightweight(Register obj, Register box, Register rax_reg,
 49                              Register t, Register thread);
 50   void fast_unlock_lightweight(Register obj, Register reg_rax, Register t, Register thread);
 51 
 52 #if INCLUDE_RTM_OPT
      // Helpers whose names and parameters refer to RTM locking. They are
      // declared only when INCLUDE_RTM_OPT evaluates to true in the
      // preprocessor. Label parameters are taken by reference from the caller.
      // NOTE(review): DONE_LABEL / IsInflated / retryLabel / brLabel are
      // presumably branch targets bound by the caller -- confirm in the
      // implementation.
 53   void rtm_counters_update(Register abort_status, Register rtm_counters);
 54   void branch_on_random_using_rdtsc(Register tmp, Register scr, int count, Label& brLabel);
 55   void rtm_abort_ratio_calculation(Register tmp, Register rtm_counters_reg,
 56                                    RTMLockingCounters* rtm_counters,
 57                                    Metadata* method_data);
 58   void rtm_profiling(Register abort_status_Reg, Register rtm_counters_Reg,
 59                      RTMLockingCounters* rtm_counters, Metadata* method_data, bool profile_rtm);
 60   void rtm_retry_lock_on_abort(Register retry_count, Register abort_status, Label& retryLabel);
 61   void rtm_retry_lock_on_busy(Register retry_count, Register box, Register tmp, Register scr, Label& retryLabel);
 62   void rtm_stack_locking(Register obj, Register tmp, Register scr,
 63                          Register retry_on_abort_count,
 64                          RTMLockingCounters* stack_rtm_counters,
 65                          Metadata* method_data, bool profile_rtm,
 66                          Label& DONE_LABEL, Label& IsInflated);
 67   void rtm_inflated_locking(Register obj, Register box, Register tmp,
 68                             Register scr, Register retry_on_busy_count,
 69                             Register retry_on_abort_count,
 70                             RTMLockingCounters* rtm_counters,
 71                             Metadata* method_data, bool profile_rtm,
 72                             Label& DONE_LABEL);
 73 #endif
 74 
 75   // Generic instructions support for use in .ad files C2 code generation
      //
      // vabsnegd / vabsnegf each come in two overloads: one without and one
      // with an explicit vector_len argument.
      // NOTE(review): going by the names, opcode presumably selects between
      // abs and neg for double ('d') or float ('f') elements -- confirm.
 76   void vabsnegd(int opcode, XMMRegister dst, XMMRegister src, Register scr);
 77   void vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len, Register scr);
 78   void vabsnegf(int opcode, XMMRegister dst, XMMRegister src, Register scr);
 79   void vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len, Register scr);
 80 
      // pminmax takes a single source register and an optional tmp (defaults
      // to xnoreg); vpminmax takes two source registers plus a vector length
      // encoding (vlen_enc). Both are parameterized by opcode and element type.
 81   void pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src,
 82                XMMRegister tmp = xnoreg);
 83   void vpminmax(int opcode, BasicType elem_bt,
 84                 XMMRegister dst, XMMRegister src1, XMMRegister src2,
 85                 int vlen_enc);
 86 
      // The two declarations share the same parameter list except for the
      // sixth argument: vminmax_fp takes an XMMRegister tmp where evminmax_fp
      // takes a KRegister ktmp.
      // NOTE(review): the _fp suffix presumably means floating-point
      // min/max -- confirm in the implementation.
 87   void vminmax_fp(int opcode, BasicType elem_bt,
 88                   XMMRegister dst, XMMRegister a, XMMRegister b,
 89                   XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
 90                   int vlen_enc);
 91   void evminmax_fp(int opcode, BasicType elem_bt,
 92                    XMMRegister dst, XMMRegister a, XMMRegister b,
 93                    KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
 94                    int vlen_enc);
 95 
      // NOTE(review): 'zero' and 'one' presumably hold the constants 0.0 and
      // 1.0 for the signum computation -- confirm against the callers.
 96   void signum_fp(int opcode, XMMRegister dst,
 97                  XMMRegister zero, XMMRegister one,
 98                  Register scratch);
 99 
      // Element-widening helpers. vextendbw also has an overload without a
      // vector_len argument; vextendbd and vextendwd always take one.
      // NOTE(review): 'sign' presumably selects sign- versus zero-extension,
      // and the suffixes name source/destination element sizes -- confirm.
100   void vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len);
101   void vextendbw(bool sign, XMMRegister dst, XMMRegister src);
102   void vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len);
103   void vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len);
104 
      // Shift helpers for the d, w and q suffixes. Each suffix has a
      // two-operand form (dst, shift) and a three-operand form with a separate
      // src and a vlen_enc. The _imm forms take the shift count as an int
      // instead of an XMMRegister; no vshiftw_imm is declared here.
105   void vshiftd(int opcode, XMMRegister dst, XMMRegister shift);
106   void vshiftd_imm(int opcode, XMMRegister dst, int shift);
107   void vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc);
108   void vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len);
109   void vshiftw(int opcode, XMMRegister dst, XMMRegister shift);
110   void vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc);
111   void vshiftq(int opcode, XMMRegister dst, XMMRegister shift);
112   void vshiftq_imm(int opcode, XMMRegister dst, int shift);
113   void vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc);
114   void vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len);
115 
      // vprotate_imm takes the count as an int; vprotate_var takes it in an
      // XMMRegister. Both are parameterized by opcode and element type.
      // NOTE(review): the names suggest vector rotate -- confirm.
116   void vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src, int shift, int vector_len);
117   void vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
118 
      // Shift helpers that take the shift operand as an XMMRegister together
      // with a separate src. varshiftq accepts an optional vtmp (defaults to
      // xnoreg); varshiftbw and evarshiftb require both a vtmp and a scratch
      // Register.
119   void varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc);
120   void varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc);
121   void varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister vtmp = xnoreg);
122   void varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp, Register scratch);
123   void evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp, Register scratch);
124 
      // Insert, gather and scatter helpers keyed by BasicType. insert/vinsert
      // take the value in a general Register plus an int index. vgather takes
      // its mask as an XMMRegister, whereas evgather and evscatter take a
      // KRegister mask.
125   void insert(BasicType typ, XMMRegister dst, Register val, int idx);
126   void vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx);
127   void vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len);
128   void evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len);
129   void evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len);
130 
      // Two overloads that differ in direction: Address src into XMMRegister
      // dst, and XMMRegister src into Address dst. Both take a KRegister kmask
      // and an element type.
131   void evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, int vector_len);
132   void evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, int vector_len);
133 
134   // extract
      // get_elem has two overloads: one writing to a general Register and one
      // writing to an XMMRegister; the latter accepts optional tmp/vtmp
      // arguments (defaults noreg/xnoreg). get_lane returns an XMMRegister.
      // NOTE(review): get_lane presumably returns the register that holds the
      // lane containing elemindex -- confirm in the implementation.
135   void extract(BasicType typ, Register dst, XMMRegister src, int idx);
136   XMMRegister get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex);
137   void get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex);
138   void get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, Register tmp = noreg, XMMRegister vtmp = xnoreg);
139   void movsxl(BasicType typ, Register dst);
140 
141   // vector test
      // vtmp1, vtmp2 and mask are optional and default to xnoreg, xnoreg and
      // knoreg respectively. Note that bt is declared as int here, not
      // BasicType.
142   void vectortest(int bt, int vlen, XMMRegister src1, XMMRegister src2,
143                   XMMRegister vtmp1 = xnoreg, XMMRegister vtmp2 = xnoreg, KRegister mask = knoreg);
144 
145   // blend
      // evpcmp has two overloads: the second operand is either an
      // AddressLiteral (with a scratch Register defaulting to rscratch1) or an
      // XMMRegister. Both write a destination mask kdmask and take a source
      // mask ksmask. evpblend takes a KRegister kmask and a 'merge' flag.
146   void evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral adr, int comparison, int vector_len, Register scratch = rscratch1);
147   void evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len);
148   void evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len);
149 
      // Both helpers take the vector size in bytes (vlen_in_bytes).
      // NOTE(review): the meaning of is_legacy is not visible from this
      // header -- confirm in the implementation.
150   void load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy);
151   void load_iota_indices(XMMRegister dst, Register scratch, int vlen_in_bytes);
152 
153   // vector compare
      // vpcmpu32 takes one more XMM temporary (vtmp3) than vpcmpu; otherwise
      // the parameter lists are the same.
      // NOTE(review): the 'u' suffix presumably denotes unsigned comparison --
      // confirm in the implementation.
154   void vpcmpu(BasicType typ, XMMRegister dst, XMMRegister src1, XMMRegister src2, ComparisonPredicate comparison, int vlen_in_bytes,
155               XMMRegister vtmp1, XMMRegister vtmp2, Register scratch);
156   void vpcmpu32(BasicType typ, XMMRegister dst, XMMRegister src1, XMMRegister src2, ComparisonPredicate comparison, int vlen_in_bytes,
157                 XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3, Register scratch);
158 
159   // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles.
160 
161   // dst = src1 reduce(op, src2) using vtmp as temps
      // src1 is a general-purpose Register and src2 an XMMRegister.
162   void reduceI(int opcode, int vlen, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
      // reduceL and genmask are only declared when _LP64 is defined.
163 #ifdef _LP64
164   void reduceL(int opcode, int vlen, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
165   void genmask(KRegister dst, Register len, Register temp);
166 #endif // _LP64
167 
168   // dst = reduce(op, src2) using vtmp as temps
      // (In reduce_fp the source parameter is named src, not src2.)
      // vtmp2 is optional and defaults to xnoreg.
169   void reduce_fp(int opcode, int vlen,
170                  XMMRegister dst, XMMRegister src,
171                  XMMRegister vtmp1, XMMRegister vtmp2 = xnoreg);
      // Byte and short entry points; same parameter list as reduceI.
172   void reduceB(int opcode, int vlen, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
173   void mulreduceB(int opcode, int vlen, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
174   void reduceS(int opcode, int vlen, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
      // Float/double min-max reductions; xmm_1 is optional (defaults to xnoreg).
      // NOTE(review): is_dst_valid presumably indicates whether dst already
      // holds a value to fold into the result -- confirm.
175   void reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid,
176                          XMMRegister dst, XMMRegister src,
177                          XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, XMMRegister xmm_0, XMMRegister xmm_1 = xnoreg);
178   void reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid,
179                           XMMRegister dst, XMMRegister src,
180                           XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, XMMRegister xmm_0, XMMRegister xmm_1 = xnoreg);
181  private:
      // Private reduction helpers. Unlike the public entry points, the
      // width-specific helpers below take no vlen argument.
      // NOTE(review): the number in each name presumably is the element count
      // handled, with the public reduceX(vlen, ...) dispatching on vlen --
      // confirm in the implementation.
182   void reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2);
183   void reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2);
184 
185   // Int Reduction
186   void reduce2I (int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
187   void reduce4I (int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
188   void reduce8I (int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
189   void reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
190 
191   // Byte Reduction
192   void reduce8B (int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
193   void reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
194   void reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
195   void reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
196   void mulreduce8B (int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
197   void mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
198   void mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
199   void mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
200 
201   // Short Reduction
202   void reduce4S (int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
203   void reduce8S (int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
204   void reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
205   void reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
206 
207   // Long Reduction
      // Only declared when _LP64 is defined, like the public reduceL.
208 #ifdef _LP64
209   void reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
210   void reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
211   void reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
212 #endif // _LP64
213 
214   // Float Reduction
      // The 2- and 4-element forms take a single vtmp; the wider ones take two.
215   void reduce2F (int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp);
216   void reduce4F (int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp);
217   void reduce8F (int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2);
218   void reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2);
219 
220   // Double Reduction
      // Only the 2-element form takes a single vtmp.
221   void reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp);
222   void reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2);
223   void reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2);
224 
225   // Base reduction instruction
      // The 128 variant is two-operand (dst, src); the 256 variant takes two
      // separate sources (src1, src2).
226   void reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src);
227   void reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2);
228 
229  public:
      // Two overloads, both declared only when _LP64 is defined. The first
      // uses a KRegister temporary (ktmp); the second uses a second
      // XMMRegister temporary (xtmp1) instead. The result goes to a general
      // Register dst.
230 #ifdef _LP64
231   void vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp, Register tmp,
232                              KRegister ktmp, int masklen, int vec_enc);
233 
234   void vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp, XMMRegister xtmp1,
235                              Register tmp, int masklen, int vec_enc);
236 #endif
237   void string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
238                            XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp);
239 
      // Same parameter list as string_indexof_char.
      // NOTE(review): the 'L' in the name presumably denotes the Latin-1
      // (byte) string variant -- confirm in the implementation.
240   void stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result,
241                            XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp);
242 
243   // IndexOf strings.
244   // Small strings are loaded through stack if they cross page boundary.
      // NOTE(review): int_cnt2 is presumably the compile-time constant length
      // of str2 when known, and ae presumably identifies the argument
      // encodings -- confirm in the implementation.
245   void string_indexof(Register str1, Register str2,
246                       Register cnt1, Register cnt2,
247                       int int_cnt2,  Register result,
248                       XMMRegister vec, Register tmp,
249                       int ae);
250 
251   // IndexOf for constant substrings with size >= 8 elements
252   // which don't need to be loaded through stack.
      // Takes the same parameter list as string_indexof.
253   void string_indexofC8(Register str1, Register str2,
254                       Register cnt1, Register cnt2,
255                       int int_cnt2,  Register result,
256                       XMMRegister vec, Register tmp,
257                       int ae);
258 
259     // Smallest code: we don't need to load through stack,
260     // check string tail.
261 
262   // helper function for string_compare
      // Takes two element destination registers, the two string base
      // registers, three Address::ScaleFactor values and an index register.
      // NOTE(review): how scale, scale1 and scale2 are chosen (presumably
      // depending on ae) is not visible from this header -- confirm.
263   void load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
264                           Address::ScaleFactor scale, Address::ScaleFactor scale1,
265                           Address::ScaleFactor scale2, Register index, int ae);
266   // Compare strings.
      // The trailing KRegister mask is optional and defaults to knoreg.
267   void string_compare(Register str1, Register str2,
268                       Register cnt1, Register cnt2, Register result,
269                       XMMRegister vec1, int ae, KRegister mask = knoreg);
270 
271   // Search for Non-ASCII character (Negative byte value) in a byte array,
272   // return true if it has any and false otherwise.
      // The answer is delivered in the 'result' register (the function itself
      // returns void). mask1 and mask2 are optional and default to knoreg.
273   void has_negatives(Register ary1, Register len,
274                      Register result, Register tmp1,
275                      XMMRegister vec1, XMMRegister vec2, KRegister mask1 = knoreg, KRegister mask2 = knoreg);
276 
277   // Compare char[] or byte[] arrays.
      // is_char selects between the two element types named above; the
      // trailing KRegister mask is optional and defaults to knoreg.
      // NOTE(review): is_array_equ presumably distinguishes whole-array
      // comparison from comparing raw ranges -- confirm in the implementation.
278   void arrays_equals(bool is_array_equ, Register ary1, Register ary2,
279                      Register limit, Register result, Register chr,
280                      XMMRegister vec1, XMMRegister vec2, bool is_char, KRegister mask = knoreg);
281 
      // Takes a shuffle register and a source register and writes dst, using
      // three XMM temporaries, one general-purpose temporary and one KRegister
      // temporary.
      // NOTE(review): presumably permutes the bytes of src according to
      // shuffle -- confirm in the implementation.
282   void rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1,
283                        XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp, int vlen_enc);
284 
      // Takes an object register plus index, scale and displacement -- the
      // components of an indexed address -- and a destination register.
      // NOTE(review): presumably loads the narrow klass from a compact object
      // header -- confirm in the implementation.
285   void load_nklass_compact_c2(Register dst, Register obj, Register index, Address::ScaleFactor scale, int disp);
286 
287 #endif // CPU_X86_C2_MACROASSEMBLER_X86_HPP