1 /*
  2  * Copyright (c) 2020, 2022, Oracle and/or its affiliates. All rights reserved.
  3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  4  *
  5  * This code is free software; you can redistribute it and/or modify it
  6  * under the terms of the GNU General Public License version 2 only, as
  7  * published by the Free Software Foundation.
  8  *
  9  * This code is distributed in the hope that it will be useful, but WITHOUT
 10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 12  * version 2 for more details (a copy is included in the LICENSE file that
 13  * accompanied this code).
 14  *
 15  * You should have received a copy of the GNU General Public License version
 16  * 2 along with this work; if not, write to the Free Software Foundation,
 17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 18  *
 19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 20  * or visit www.oracle.com if you need additional information or have any
 21  * questions.
 22  *
 23  */
 24 
 25 #ifndef CPU_X86_C2_MACROASSEMBLER_X86_HPP
 26 #define CPU_X86_C2_MACROASSEMBLER_X86_HPP
 27 
 28 // C2_MacroAssembler contains high-level macros for C2
 29 
 30 public:
  // Returns the Assembler::AvxVectorLen encoding for a vector whose size is
  // given in bytes.
  Assembler::AvxVectorLen vector_length_encoding(int vlen_in_bytes);
 32 
 33   // Code used by cmpFastLock and cmpFastUnlock mach instructions in .ad file.
 34   // See full description in macroAssembler_x86.cpp.
 35   void fast_lock(Register obj, Register box, Register tmp,
 36                  Register scr, Register cx1, Register cx2,
 37                  RTMLockingCounters* rtm_counters,
 38                  RTMLockingCounters* stack_rtm_counters,
 39                  Metadata* method_data,
 40                  bool use_rtm, bool profile_rtm);
 41   void fast_unlock(Register obj, Register box, Register tmp, bool use_rtm);
 42 
 43 #if INCLUDE_RTM_OPT
 44   void rtm_counters_update(Register abort_status, Register rtm_counters);
 45   void branch_on_random_using_rdtsc(Register tmp, Register scr, int count, Label& brLabel);
 46   void rtm_abort_ratio_calculation(Register tmp, Register rtm_counters_reg,
 47                                    RTMLockingCounters* rtm_counters,
 48                                    Metadata* method_data);
 49   void rtm_profiling(Register abort_status_Reg, Register rtm_counters_Reg,
 50                      RTMLockingCounters* rtm_counters, Metadata* method_data, bool profile_rtm);
 51   void rtm_retry_lock_on_abort(Register retry_count, Register abort_status, Label& retryLabel);
 52   void rtm_retry_lock_on_busy(Register retry_count, Register box, Register tmp, Register scr, Label& retryLabel);
 53   void rtm_stack_locking(Register obj, Register tmp, Register scr,
 54                          Register retry_on_abort_count,
 55                          RTMLockingCounters* stack_rtm_counters,
 56                          Metadata* method_data, bool profile_rtm,
 57                          Label& DONE_LABEL, Label& IsInflated);
 58   void rtm_inflated_locking(Register obj, Register box, Register tmp,
 59                             Register scr, Register retry_on_busy_count,
 60                             Register retry_on_abort_count,
 61                             RTMLockingCounters* rtm_counters,
 62                             Metadata* method_data, bool profile_rtm,
 63                             Label& DONE_LABEL);
 64 #endif
 65 
  // Generic instructions support for use in .ad files C2 code generation
  // vabsnegd / vabsnegf: `opcode` selects the operation (presumably abs vs.
  // negate on double / float elements -- confirm in the .cpp). Each has a
  // second overload that additionally takes an explicit `vector_len`.
  void vabsnegd(int opcode, XMMRegister dst, XMMRegister src, Register scr);
  void vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len, Register scr);
  void vabsnegf(int opcode, XMMRegister dst, XMMRegister src, Register scr);
  void vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len, Register scr);
 71 
 72   void pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src,
 73                XMMRegister tmp = xnoreg);
 74   void vpminmax(int opcode, BasicType elem_bt,
 75                 XMMRegister dst, XMMRegister src1, XMMRegister src2,
 76                 int vlen_enc);
 77 
 78   void vminmax_fp(int opcode, BasicType elem_bt,
 79                   XMMRegister dst, XMMRegister a, XMMRegister b,
 80                   XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
 81                   int vlen_enc);
 82   void evminmax_fp(int opcode, BasicType elem_bt,
 83                    XMMRegister dst, XMMRegister a, XMMRegister b,
 84                    KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
 85                    int vlen_enc);
 86 
 87   void signum_fp(int opcode, XMMRegister dst,
 88                  XMMRegister zero, XMMRegister one,
 89                  Register scratch);
 90 
  // vextend{bw,bd,wd}: widen elements of `src` into `dst`; `sign` selects the
  // extension flavour (presumably sign- vs. zero-extension -- confirm in the
  // .cpp). Only vextendbw has an overload without an explicit vector_len.
  void vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len);
  void vextendbw(bool sign, XMMRegister dst, XMMRegister src);
  void vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len);
  void vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len);
 95 
  // Vector shifts selected by `opcode`; the d / w / q suffix presumably names
  // the element width (confirm in the .cpp). Each family has a two-operand
  // form (dst, shift) and a three-operand form with a vector-length argument.
  // The d and q families also have _imm variants that take the shift count as
  // an int; there is no vshiftw_imm.
  void vshiftd(int opcode, XMMRegister dst, XMMRegister shift);
  void vshiftd_imm(int opcode, XMMRegister dst, int shift);
  void vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc);
  void vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len);
  void vshiftw(int opcode, XMMRegister dst, XMMRegister shift);
  void vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc);
  void vshiftq(int opcode, XMMRegister dst, XMMRegister shift);
  void vshiftq_imm(int opcode, XMMRegister dst, int shift);
  void vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc);
  void vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len);
106 
107   void vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src, int shift, int vector_len);
108   void vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
109 
110   void varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc);
111   void varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc);
112   void varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister vtmp = xnoreg);
113   void varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp, Register scratch);
114   void evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp, Register scratch);
115 
116   void insert(BasicType typ, XMMRegister dst, Register val, int idx);
117   void vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx);
118   void vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len);
119   void evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len);
120   void evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len);
121 
  // Vector move under `kmask` for element type `type`. Going by the parameter
  // types, the first overload moves memory -> register and the second
  // register -> memory.
  void evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, int vector_len);
  void evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, int vector_len);
124 
125   // extract
126   void extract(BasicType typ, Register dst, XMMRegister src, int idx);
127   XMMRegister get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex);
128   void get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex);
129   void get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, Register tmp = noreg, XMMRegister vtmp = xnoreg);
130 
131   // vector test
132   void vectortest(int bt, int vlen, XMMRegister src1, XMMRegister src2,
133                   XMMRegister vtmp1 = xnoreg, XMMRegister vtmp2 = xnoreg, KRegister mask = knoreg);
134 
135   // blend
136   void evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral adr, int comparison, int vector_len, Register scratch = rscratch1);
137   void evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len);
138   void evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len);
139 
140   void load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy);
141   void load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, Register tmp, bool novlbwdq, int vlen_enc);
142 
143   void load_vector(XMMRegister dst, Address src, int vlen_in_bytes);
144   void load_vector(XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch = rscratch1);
145   void load_iota_indices(XMMRegister dst, Register scratch, int vlen_in_bytes);
146 
147   // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles.
148 
149   // dst = src1  reduce(op, src2) using vtmp as temps
150   void reduceI(int opcode, int vlen, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
151 #ifdef _LP64
152   void reduceL(int opcode, int vlen, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
153   void genmask(KRegister dst, Register len, Register temp);
154 #endif // _LP64
155 
156   // dst = reduce(op, src2) using vtmp as temps
157   void reduce_fp(int opcode, int vlen,
158                  XMMRegister dst, XMMRegister src,
159                  XMMRegister vtmp1, XMMRegister vtmp2 = xnoreg);
160   void reduceB(int opcode, int vlen, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
161   void mulreduceB(int opcode, int vlen, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
162   void reduceS(int opcode, int vlen, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
163   void reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid,
164                          XMMRegister dst, XMMRegister src,
165                          XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, XMMRegister xmm_0, XMMRegister xmm_1 = xnoreg);
166   void reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid,
167                           XMMRegister dst, XMMRegister src,
168                           XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, XMMRegister xmm_0, XMMRegister xmm_1 = xnoreg);
169  private:
  // Float / double reductions parameterised by `vlen`.
  // NOTE(review): presumably the targets reduce_fp dispatches to -- confirm
  // in the .cpp.
  void reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2);
  void reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2);
172 
  // Int Reduction
  // One helper per vector shape; the number in the name is presumably the
  // count of int elements handled (confirm against reduceI in the .cpp).
  void reduce2I (int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
  void reduce4I (int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
  void reduce8I (int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
  void reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
178 
  // Byte Reduction
  // reduceNB and mulreduceNB come in matching shapes (8/16/32/64); the number
  // is presumably the count of byte elements handled, and the mulreduce*
  // variants presumably cover the multiply reduction -- confirm against
  // reduceB / mulreduceB in the .cpp.
  void reduce8B (int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
  void reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
  void reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
  void reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
  void mulreduce8B (int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
  void mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
  void mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
  void mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
188 
  // Short Reduction
  // One helper per vector shape; the number in the name is presumably the
  // count of short elements handled (confirm against reduceS in the .cpp).
  void reduce4S (int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
  void reduce8S (int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
  void reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
  void reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
194 
  // Long Reduction
  // Declared only for _LP64 builds, matching the guard around reduceL. The
  // number in the name is presumably the count of long elements handled.
#ifdef _LP64
  void reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
  void reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
  void reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
#endif // _LP64
201 
  // Float Reduction
  // The 2- and 4-element helpers take a single vector temporary; the 8- and
  // 16-element helpers take two.
  void reduce2F (int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp);
  void reduce4F (int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp);
  void reduce8F (int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2);
  void reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2);
207 
  // Double Reduction
  // reduce2D takes a single vector temporary; reduce4D and reduce8D take two.
  void reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp);
  void reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2);
  void reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2);
212 
  // Base reduction instruction
  // Applies `opcode` for element type `typ`. The _128 form is two-operand
  // (dst, src); the _256 form is three-operand (dst, src1, src2). The suffix
  // is presumably the vector width in bits -- confirm in the .cpp.
  void reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src);
  void reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2);
216 
217  public:
218 #ifdef _LP64
219   void vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen);
220 
221   void vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp, int masklen, int masksize, int vec_enc);
222 
223   void vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,
224                              Register tmp, int masklen, BasicType bt, int vec_enc);
225   void vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1,
226                               Register rtmp2, XMMRegister xtmp, int mask_len, int vec_enc);
227 #endif
228 
  // Mask-all operation producing the KRegister `dst` from the GPR `src` for a
  // mask of `mask_len` elements. Declared for all builds.
  void vector_maskall_operation(KRegister dst, Register src, int mask_len);
230 
#ifndef _LP64
  // Variant declared only for non-_LP64 builds; takes an extra KRegister
  // temporary.
  void vector_maskall_operation32(KRegister dst, Register src, KRegister ktmp, int mask_len);
#endif // !_LP64
234 
235   void string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
236                            XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp);
237 
238   void stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result,
239                            XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp);
240 
241   // IndexOf strings.
242   // Small strings are loaded through stack if they cross page boundary.
243   void string_indexof(Register str1, Register str2,
244                       Register cnt1, Register cnt2,
245                       int int_cnt2,  Register result,
246                       XMMRegister vec, Register tmp,
247                       int ae);
248 
249   // IndexOf for constant substrings with size >= 8 elements
250   // which don't need to be loaded through stack.
251   void string_indexofC8(Register str1, Register str2,
252                       Register cnt1, Register cnt2,
253                       int int_cnt2,  Register result,
254                       XMMRegister vec, Register tmp,
255                       int ae);
256 
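  // NOTE(review): this remark is not attached to any declaration in this
  // header; it presumably belongs with string_indexofC8 above -- confirm.
  // Smallest code: we don't need to load through stack,
  // check string tail.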
259 
260   // helper function for string_compare
261   void load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
262                           Address::ScaleFactor scale, Address::ScaleFactor scale1,
263                           Address::ScaleFactor scale2, Register index, int ae);
264   // Compare strings.
265   void string_compare(Register str1, Register str2,
266                       Register cnt1, Register cnt2, Register result,
267                       XMMRegister vec1, int ae, KRegister mask = knoreg);
268 
269   // Search for Non-ASCII character (Negative byte value) in a byte array,
270   // return index of the first such character, otherwise len.
271   void count_positives(Register ary1, Register len,
272                        Register result, Register tmp1,
273                        XMMRegister vec1, XMMRegister vec2, KRegister mask1 = knoreg, KRegister mask2 = knoreg);
274   // Compare char[] or byte[] arrays.
275   void arrays_equals(bool is_array_equ, Register ary1, Register ary2,
276                      Register limit, Register result, Register chr,
277                      XMMRegister vec1, XMMRegister vec2, bool is_char, KRegister mask = knoreg);
278 
279 
280   void evmasked_op(int ideal_opc, BasicType eType, KRegister mask,
281                    XMMRegister dst, XMMRegister src1, XMMRegister src2,
282                    bool merge, int vlen_enc, bool is_varshift = false);
283 
284   void evmasked_op(int ideal_opc, BasicType eType, KRegister mask,
285                    XMMRegister dst, XMMRegister src1, Address src2,
286                    bool merge, int vlen_enc);
287 
288   void evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
289                    XMMRegister src1, int imm8, bool merge, int vlen_enc);
290 
291   void masked_op(int ideal_opc, int mask_len, KRegister dst,
292                  KRegister src1, KRegister src2);
293 
294   void vector_castF2I_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
295                           XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
296                           AddressLiteral float_sign_flip, Register scratch, int vec_enc);
297 
298   void vector_castF2I_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
299                            KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip,
300                            Register scratch, int vec_enc);
301 
302 
303   void vector_castD2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
304                            KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip,
305                            Register scratch, int vec_enc);
306 
307   void vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
308                             BasicType from_elem_bt, BasicType to_elem_bt);
309 
310   void vector_cast_double_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
311                                              KRegister ktmp1, KRegister ktmp2, Register scratch, AddressLiteral double_sign_flip,
312                                              int vec_enc);
313 
314   void vector_cast_float_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
315                                             KRegister ktmp1, KRegister ktmp2, Register scratch, AddressLiteral float_sign_flip,
316                                             int vec_enc);
317 
318   void vector_cast_float_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
319                                            XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
320                                            Register scratch, AddressLiteral float_sign_flip,
321                                            int vec_enc);
322 
323 #ifdef _LP64
324   void vector_round_double_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
325                                 KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip,
326                                 AddressLiteral new_mxcsr, Register scratch, int vec_enc);
327 
328   void vector_round_float_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
329                                KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip,
330                                AddressLiteral new_mxcsr, Register scratch, int vec_enc);
331 
332   void vector_round_float_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
333                               XMMRegister xtmp3, XMMRegister xtmp4, AddressLiteral float_sign_flip,
334                               AddressLiteral new_mxcsr, Register scratch, int vec_enc);
335 #endif
336 
337   void evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3,
338                   bool merge, BasicType bt, int vlen_enc);
339 
340   void evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3,
341                   bool merge, BasicType bt, int vlen_enc);
342 
343   void udivI(Register rax, Register divisor, Register rdx);
344   void umodI(Register rax, Register divisor, Register rdx);
345   void udivmodI(Register rax, Register divisor, Register rdx, Register tmp);
346 
347   #ifdef _LP64
348   void udivL(Register rax, Register divisor, Register rdx);
349   void umodL(Register rax, Register divisor, Register rdx);
350   void udivmodL(Register rax, Register divisor, Register rdx, Register tmp);
351   #endif
352   void vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
353                            XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp,
354                            int vec_enc);
355 
356   void vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
357                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp,
358                             int vec_enc);
359 
360 #endif // CPU_X86_C2_MACROASSEMBLER_X86_HPP