1 /*
  2  * Copyright (c) 2020, 2022, Oracle and/or its affiliates. All rights reserved.
  3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  4  *
  5  * This code is free software; you can redistribute it and/or modify it
  6  * under the terms of the GNU General Public License version 2 only, as
  7  * published by the Free Software Foundation.
  8  *
  9  * This code is distributed in the hope that it will be useful, but WITHOUT
 10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 12  * version 2 for more details (a copy is included in the LICENSE file that
 13  * accompanied this code).
 14  *
 15  * You should have received a copy of the GNU General Public License version
 16  * 2 along with this work; if not, write to the Free Software Foundation,
 17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 18  *
 19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 20  * or visit www.oracle.com if you need additional information or have any
 21  * questions.
 22  *
 23  */
 24 
 25 #ifndef CPU_X86_C2_MACROASSEMBLER_X86_HPP
 26 #define CPU_X86_C2_MACROASSEMBLER_X86_HPP
 27 
 28 // C2_MacroAssembler contains high-level macros for C2
 29 
public:
  // C2 compiled method's prolog code.
  void verified_entry(Compile* C, int sp_inc = 0);

  // nmethod entry barrier: entry_barrier() emits the inline check,
  // emit_entry_barrier_stub() emits the out-of-line slow path for a
  // C2EntryBarrierStub, and entry_barrier_stub_size() reports the stub's
  // size in bytes (static so it can be queried without an assembler).
  void entry_barrier();
  void emit_entry_barrier_stub(C2EntryBarrierStub* stub);
  static int entry_barrier_stub_size();

  // Map a vector length in bytes to the corresponding AVX vector-length encoding.
  Assembler::AvxVectorLen vector_length_encoding(int vlen_in_bytes);

  // Code used by cmpFastLock and cmpFastUnlock mach instructions in .ad file.
  // See full description in macroAssembler_x86.cpp.
  void fast_lock(Register obj, Register box, Register tmp,
                 Register scr, Register cx1, Register cx2,
                 RTMLockingCounters* rtm_counters,
                 RTMLockingCounters* stack_rtm_counters,
                 Metadata* method_data,
                 bool use_rtm, bool profile_rtm);
  void fast_unlock(Register obj, Register box, Register tmp, bool use_rtm);
 49 
#if INCLUDE_RTM_OPT
  // Restricted Transactional Memory (RTM) lock-elision helpers used by the
  // fast_lock/fast_unlock paths above; only compiled in when RTM is enabled.

  // Update the counters in 'rtm_counters' according to 'abort_status'.
  void rtm_counters_update(Register abort_status, Register rtm_counters);
  // Branch to 'brLabel' roughly once every 'count' executions, using RDTSC
  // output as the source of randomness.
  void branch_on_random_using_rdtsc(Register tmp, Register scr, int count, Label& brLabel);
  // Compute the RTM abort ratio from the collected counters (used for
  // profiling-driven decisions — see definition).
  void rtm_abort_ratio_calculation(Register tmp, Register rtm_counters_reg,
                                   RTMLockingCounters* rtm_counters,
                                   Metadata* method_data);
  void rtm_profiling(Register abort_status_Reg, Register rtm_counters_Reg,
                     RTMLockingCounters* rtm_counters, Metadata* method_data, bool profile_rtm);
  // Retry loops: re-attempt the transaction after an abort / while the lock is busy.
  void rtm_retry_lock_on_abort(Register retry_count, Register abort_status, Label& retryLabel);
  void rtm_retry_lock_on_busy(Register retry_count, Register box, Register tmp, Register scr, Label& retryLabel);
  // Transactional variants of stack locking and inflated-monitor locking.
  void rtm_stack_locking(Register obj, Register tmp, Register scr,
                         Register retry_on_abort_count,
                         RTMLockingCounters* stack_rtm_counters,
                         Metadata* method_data, bool profile_rtm,
                         Label& DONE_LABEL, Label& IsInflated);
  void rtm_inflated_locking(Register obj, Register box, Register tmp,
                            Register scr, Register retry_on_busy_count,
                            Register retry_on_abort_count,
                            RTMLockingCounters* rtm_counters,
                            Metadata* method_data, bool profile_rtm,
                            Label& DONE_LABEL);
#endif
 72 
  // Generic instructions support for use in .ad files C2 code generation

  // Vector absolute-value / negation on double ('d') or float ('f') elements;
  // 'opcode' selects which of the two operations is performed.
  void vabsnegd(int opcode, XMMRegister dst, XMMRegister src);
  void vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len);
  void vabsnegf(int opcode, XMMRegister dst, XMMRegister src);
  void vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len);

  // Element-wise min/max; 'opcode' selects min vs. max, 'elem_bt' the element type.
  void pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src,
               XMMRegister tmp = xnoreg);
  void vpminmax(int opcode, BasicType elem_bt,
                XMMRegister dst, XMMRegister src1, XMMRegister src2,
                int vlen_enc);

  // Floating-point min/max; the extra temporaries (atmp/btmp, plus ktmp for the
  // EVEX variant) are used for special-case handling — see definitions.
  void vminmax_fp(int opcode, BasicType elem_bt,
                  XMMRegister dst, XMMRegister a, XMMRegister b,
                  XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
                  int vlen_enc);
  void evminmax_fp(int opcode, BasicType elem_bt,
                   XMMRegister dst, XMMRegister a, XMMRegister b,
                   KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
                   int vlen_enc);

  // Floating-point signum; 'zero' and 'one' supply the constant operands.
  void signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one);

  // Compress/expand vector elements under opmask 'mask' ('opcode' selects direction);
  // 'merge' selects merge- vs. zero-masking of inactive lanes.
  void vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask,
                              bool merge, BasicType bt, int vec_enc);

  void vector_mask_compress(KRegister dst, KRegister src, Register rtmp1, Register rtmp2, int mask_len);

  // Widen vector elements: b->w (byte to word), b->d, w->d; 'sign' selects
  // sign- vs. zero-extension.
  void vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len);
  void vextendbw(bool sign, XMMRegister dst, XMMRegister src);
  void vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len);
  void vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len);

  // Vector shifts on dword (d), word (w) and qword (q) elements; the '_imm'
  // variants take an immediate shift count, the others a shift-count register.
  void vshiftd(int opcode, XMMRegister dst, XMMRegister shift);
  void vshiftd_imm(int opcode, XMMRegister dst, int shift);
  void vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc);
  void vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len);
  void vshiftw(int opcode, XMMRegister dst, XMMRegister shift);
  void vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc);
  void vshiftq(int opcode, XMMRegister dst, XMMRegister shift);
  void vshiftq_imm(int opcode, XMMRegister dst, int shift);
  void vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc);
  void vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len);

  // Vector rotates with immediate ('_imm') or per-element variable ('_var') counts.
  void vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src, int shift, int vector_len);
  void vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);

  // Variable (per-element) shifts; 'vtmp' is a scratch vector where required.
  void varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc);
  void varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc);
  void varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister vtmp = xnoreg);
  void varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp);
  void evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp);

  // Insert scalar 'val' into lane 'idx' of a vector.
  void insert(BasicType typ, XMMRegister dst, Register val, int idx);
  void vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx);
  // Gather/scatter: load/store vector elements at 'base' + per-lane indices
  // 'idx', governed by a vector mask (vgather) or an opmask (ev* variants).
  void vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len);
  void evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len);
  void evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len);

  // Masked vector load/store; 'merge' selects merge- vs. zero-masking of inactive lanes.
  void evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len);
  void evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len);
134 
  // extract: read lane 'idx'/'elemindex' out of a vector into a GP or XMM register.
  void extract(BasicType typ, Register dst, XMMRegister src, int idx);
  // Returns the register that ends up holding the requested lane
  // (NOTE(review): may be 'src' itself for lane 0 — confirm in definition).
  XMMRegister get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex);
  void get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex);
  void get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp = xnoreg);

  // vector test over the low 'vlen_in_bytes' bytes of src1/src2; no destination
  // register — presumably sets condition flags, confirm in definition.
  void vectortest(BasicType bt, XMMRegister src1, XMMRegister src2, XMMRegister vtmp, int vlen_in_bytes);

 // Convert B2X: widen byte vector elements to element type 'to_elem_bt'.
 void vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc);
#ifdef _LP64
 // Broadcast GP register 'src' into every 'elem_bt' lane of 'dst'.
 void vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc);
#endif

  // blend: compare into an opmask (evpcmp, predicated by source mask 'ksmask'),
  // then select per-lane between src1/src2 (evpblend).
  void evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister    src2, int comparison, int vector_len);
  void evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch = noreg);
  void evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len);

  // Load 'vlen_in_bytes' bytes into a vector register; the AddressLiteral
  // overload may need 'rscratch' to materialize the address.
  void load_vector(XMMRegister dst, Address        src, int vlen_in_bytes);
  void load_vector(XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch = noreg);

  // Convert a vector-encoded mask into vector (XMM) or opmask (K) form.
  void load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy);
  void load_vector_mask(KRegister   dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc);

  void load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen);
  // Load the iota index sequence (0, 1, 2, ...) for element type 'bt'.
  void load_iota_indices(XMMRegister dst, int vlen_in_bytes, BasicType bt);
163 
  // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles.

  // dst = src1 `op` reduce(op, src2) using vtmp1/vtmp2 as temps
  void reduceI(int opcode, int vlen, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
#ifdef _LP64
  void reduceL(int opcode, int vlen, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
  // Generate an opmask from the length in 'len' (see definition).
  void genmask(KRegister dst, Register len, Register temp);
#endif // _LP64

  // dst = reduce(op, src) using vtmp as temps
  void reduce_fp(int opcode, int vlen,
                 XMMRegister dst, XMMRegister src,
                 XMMRegister vtmp1, XMMRegister vtmp2 = xnoreg);
  void reduceB(int opcode, int vlen, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
  // Multiply-reduction has a dedicated byte path (no direct byte multiply on x86).
  void mulreduceB(int opcode, int vlen, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
  void reduceS(int opcode, int vlen, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
  // Float/double min/max reductions; 'is_dst_valid' — presumably indicates dst
  // already holds a partial result to fold in, confirm in definition.
  void reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid,
                         XMMRegister dst, XMMRegister src,
                         XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, XMMRegister xmm_0, XMMRegister xmm_1 = xnoreg);
  void reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid,
                          XMMRegister dst, XMMRegister src,
                          XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, XMMRegister xmm_0, XMMRegister xmm_1 = xnoreg);
 private:
  // Implementation helpers for the public entry points above, one per
  // (element type, vector length) combination.
  void reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2);
  void reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2);

  // Int Reduction
  void reduce2I (int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
  void reduce4I (int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
  void reduce8I (int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
  void reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);

  // Byte Reduction
  void reduce8B (int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
  void reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
  void reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
  void reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
  void mulreduce8B (int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
  void mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
  void mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
  void mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);

  // Short Reduction
  void reduce4S (int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
  void reduce8S (int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
  void reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
  void reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);

  // Long Reduction
#ifdef _LP64
  void reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
  void reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
  void reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
#endif // _LP64

  // Float Reduction
  void reduce2F (int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp);
  void reduce4F (int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp);
  void reduce8F (int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2);
  void reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2);

  // Double Reduction
  void reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp);
  void reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2);
  void reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2);

  // Base reduction instruction: combine two 128-/256-bit halves.
  void reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src);
  void reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2);
233 
 public:
#ifdef _LP64
  // Helper shared by the vector_mask_operation variants below.
  void vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen);

  // Mask query operations ('opc') on an opmask (KRegister) or a
  // vector-encoded (XMMRegister) mask, producing a scalar in 'dst'.
  void vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp, int masklen, int masksize, int vec_enc);

  void vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,
                             Register tmp, int masklen, BasicType bt, int vec_enc);
  // Expand the low 'mask_len' bits of GP register 'src' into a vector mask.
  void vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1,
                              Register rtmp2, XMMRegister xtmp, int mask_len, int vec_enc);
#endif

  // "maskall": broadcast a scalar condition from 'src' across all
  // 'mask_len' bits of the opmask (see definition).
  void vector_maskall_operation(KRegister dst, Register src, int mask_len);

#ifndef _LP64
  // 32-bit VM only: variant that takes an extra opmask temporary.
  void vector_maskall_operation32(KRegister dst, Register src, KRegister ktmp, int mask_len);
#endif
251 
  // Search for scalar char 'ch' within the string at 'str1' of length 'cnt1';
  // the index is produced in 'result'.
  void string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
                           XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp);

  // Same as above for Latin-1 (single-byte) strings — note the 'L' suffix.
  void stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result,
                           XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp);

  // IndexOf strings.
  // Small strings are loaded through stack if they cross page boundary.
  void string_indexof(Register str1, Register str2,
                      Register cnt1, Register cnt2,
                      int int_cnt2,  Register result,
                      XMMRegister vec, Register tmp,
                      int ae);

  // IndexOf for constant substrings with size >= 8 elements
  // which don't need to be loaded through stack.
  // (Smallest code: we don't need to load through stack, check string tail.)
  void string_indexofC8(Register str1, Register str2,
                      Register cnt1, Register cnt2,
                      int int_cnt2,  Register result,
                      XMMRegister vec, Register tmp,
                      int ae);

  // helper function for string_compare: load the next pair of elements from
  // both strings ('ae' is the argument-encoding constant, as elsewhere).
  void load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
                          Address::ScaleFactor scale, Address::ScaleFactor scale1,
                          Address::ScaleFactor scale2, Register index, int ae);
  // Compare strings.
  void string_compare(Register str1, Register str2,
                      Register cnt1, Register cnt2, Register result,
                      XMMRegister vec1, int ae, KRegister mask = knoreg);

  // Search for Non-ASCII character (Negative byte value) in a byte array,
  // return index of the first such character, otherwise len.
  void count_positives(Register ary1, Register len,
                       Register result, Register tmp1,
                       XMMRegister vec1, XMMRegister vec2, KRegister mask1 = knoreg, KRegister mask2 = knoreg);
  // Compare char[] or byte[] arrays.
  void arrays_equals(bool is_array_equ, Register ary1, Register ary2,
                     Register limit, Register result, Register chr,
                     XMMRegister vec1, XMMRegister vec2, bool is_char, KRegister mask = knoreg);
295 
296 
  // AVX-512 masked operation: apply the operation selected by ideal opcode
  // 'ideal_opc' under opmask 'mask'; 'merge' selects merge- vs. zero-masking
  // of inactive lanes. Overloads cover reg/reg, reg/mem and reg/imm forms.
  void evmasked_op(int ideal_opc, BasicType eType, KRegister mask,
                   XMMRegister dst, XMMRegister src1, XMMRegister src2,
                   bool merge, int vlen_enc, bool is_varshift = false);

  void evmasked_op(int ideal_opc, BasicType eType, KRegister mask,
                   XMMRegister dst, XMMRegister src1, Address src2,
                   bool merge, int vlen_enc);

  void evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
                   XMMRegister src1, int imm8, bool merge, int vlen_enc);

  // Mask-on-mask operation (logical op selected by 'ideal_opc').
  void masked_op(int ideal_opc, int mask_len, KRegister dst,
                 KRegister src1, KRegister src2);

  // Zero-extending (unsigned) cast between integral element types.
  void vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
                            BasicType from_elem_bt, BasicType to_elem_bt);

  // Narrow int vector elements down to the subword type 'to_elem_bt'.
  void vector_cast_int_to_subword(BasicType to_elem_bt, XMMRegister dst, XMMRegister zero,
                                  XMMRegister xtmp, Register rscratch, int vec_enc);

  // F2X/D2X: cast float/double vectors to integral element types. The *_avx
  // variants use vector temporaries, the *_evex variants opmask temporaries;
  // the *sign_flip constants feed the special-case fixups declared below.
  void vector_castF2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                          XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
                          AddressLiteral float_sign_flip, Register rscratch, int vec_enc);

  void vector_castF2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                           XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip,
                           Register rscratch, int vec_enc);

  void vector_castF2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
                           KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip,
                           Register rscratch, int vec_enc);

  void vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                           XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral sign_flip,
                           Register rscratch, int vec_enc);

  void vector_castD2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                          XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5,
                          AddressLiteral float_sign_flip, Register rscratch, int vec_enc);

  // Fixups for the "special cases" of FP->integral casts (presumably NaN and
  // out-of-range inputs — confirm against the definitions).
  void vector_cast_double_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
                                                   XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, Register rscratch,
                                                   AddressLiteral float_sign_flip, int vec_enc);

  void vector_cast_double_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
                                                    KRegister ktmp1, KRegister ktmp2, Register rscratch, AddressLiteral float_sign_flip,
                                                    int vec_enc);

  void vector_cast_double_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
                                                     KRegister ktmp1, KRegister ktmp2, Register rscratch, AddressLiteral double_sign_flip,
                                                     int vec_enc);

  void vector_cast_float_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
                                                   KRegister ktmp1, KRegister ktmp2, Register rscratch, AddressLiteral float_sign_flip,
                                                   int vec_enc);

  void vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
                                                    KRegister ktmp1, KRegister ktmp2, Register rscratch, AddressLiteral double_sign_flip,
                                                    int vec_enc);

  void vector_cast_float_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
                                                  XMMRegister xtmp4, Register rscratch, AddressLiteral float_sign_flip,
                                                  int vec_enc);

  // Cross-lane doubleword pack helper for the AVX (non-EVEX) cast paths.
  void vector_crosslane_doubleword_pack_avx(XMMRegister dst, XMMRegister src, XMMRegister zero,
                                            XMMRegister xtmp, int index, int vec_enc);

  // Re-type a vector-encoded mask from 'src_bt' lanes to 'dst_bt' lanes.
  void vector_mask_cast(XMMRegister dst, XMMRegister src, BasicType dst_bt, BasicType src_bt, int vlen);

#ifdef _LP64
  // Vector rounding; 'new_mxcsr' supplies an alternate MXCSR value
  // (presumably to establish the required rounding mode — see definitions).
  void vector_round_double_evex(XMMRegister dst, XMMRegister src, AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
                                Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2);

  void vector_round_float_evex(XMMRegister dst, XMMRegister src, AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
                               Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2);

  void vector_round_float_avx(XMMRegister dst, XMMRegister src, AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
                              Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4);
#endif // _LP64
377 
  // Unsigned 32-bit divide / remainder; operands are pinned to rax/rdx as
  // required by the x86 DIV instruction (parameter names reflect this).
  void udivI(Register rax, Register divisor, Register rdx);
  void umodI(Register rax, Register divisor, Register rdx);
  void udivmodI(Register rax, Register divisor, Register rdx, Register tmp);

#ifdef _LP64
  // Bit-reverse of a 32-bit (reverseI) / 64-bit (reverseL) GP register value;
  // uses XMM temporaries for the heavy lifting.
  void reverseI(Register dst, Register src, XMMRegister xtmp1,
                XMMRegister xtmp2, Register rtmp);
  void reverseL(Register dst, Register src, XMMRegister xtmp1,
                XMMRegister xtmp2, Register rtmp1, Register rtmp2);
  // Unsigned 64-bit counterparts of udivI/umodI/udivmodI above.
  void udivL(Register rax, Register divisor, Register rdx);
  void umodL(Register rax, Register divisor, Register rdx);
  void udivmodL(Register rax, Register divisor, Register rdx, Register tmp);
#endif

  // Masked ternary logic (vpternlog); 'func' selects the 8-bit truth table.
  void evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3,
                  bool merge, BasicType bt, int vlen_enc);

  void evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3,
                  bool merge, BasicType bt, int vlen_enc);

  // Reverse the bits of each vector element; the _gfni variant uses
  // Galois-field instructions with the constant at 'mask'.
  void vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                          XMMRegister xtmp2, Register rtmp, int vec_enc);

  void vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc,
                               XMMRegister xtmp, Register rscratch = noreg);

  // Reverse the bytes of each vector element.
  void vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc);

  // Population count per element: per-type helpers plus the dispatching
  // 'integral' entry points (AVX vs. EVEX forms — see definitions).
  void vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                           XMMRegister xtmp2, Register rtmp, int vec_enc);

  void vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                            XMMRegister xtmp2, Register rtmp, int vec_enc);

  void vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                             XMMRegister xtmp2, Register rtmp, int vec_enc);

  void vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                            XMMRegister xtmp2, Register rtmp, int vec_enc);

  void vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                XMMRegister xtmp2, Register rtmp, int vec_enc);

  void vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src,
                                     KRegister mask, bool merge, int vec_enc);

  // Broadcast immediate 'imm32' into every 'bt' lane of 'dst'.
  void vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc);

  void vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                             XMMRegister xtmp2, Register rtmp, int vec_enc);

  // Count leading zeros per element: EVEX form plus per-type AVX fallbacks
  // dispatched by vector_count_leading_zeros_avx.
  void vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
                                       XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
                                       KRegister ktmp, Register rtmp, bool merge, int vec_enc);

  void vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                           XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc);

  void vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                            XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc);

  void vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                          XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc);

  void vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                           XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc);

  void vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                      XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc);

  // Element-wise add/subtract for element type 'bt' (building blocks for the
  // helpers in this file).
  void vpadd(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc);

  void vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc);

  // Count trailing zeros per element (EVEX and AVX forms).
  void vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                        XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, KRegister ktmp,
                                        Register rtmp, int vec_enc);

  // Swap groups of 'nbits' bits selected by 'bitmask' (a bit-reversal step).
  void vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src,
                         XMMRegister xtmp1, Register rtmp, int vec_enc);

  void vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                       XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc);

  // Vectorized signum; 'zero'/'one' supply the constant operands
  // (AVX and EVEX forms).
  void vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
                         XMMRegister xtmp1, int vec_enc);

  void vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
                          KRegister ktmp1, int vec_enc);

  // Masked vector load/store governed by a vector-register mask 'mask'.
  void vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask, int vec_enc);

  void vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask, int vec_enc);

  // Byte-wise rearrange of 'src' according to the indices in 'shuffle'.
  void rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1,
                       XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp, int vlen_enc);
474 
475 #endif // CPU_X86_C2_MACROASSEMBLER_X86_HPP