1 /*
  2  * Copyright (c) 2020, 2024, Oracle and/or its affiliates. All rights reserved.
  3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  4  *
  5  * This code is free software; you can redistribute it and/or modify it
  6  * under the terms of the GNU General Public License version 2 only, as
  7  * published by the Free Software Foundation.
  8  *
  9  * This code is distributed in the hope that it will be useful, but WITHOUT
 10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 12  * version 2 for more details (a copy is included in the LICENSE file that
 13  * accompanied this code).
 14  *
 15  * You should have received a copy of the GNU General Public License version
 16  * 2 along with this work; if not, write to the Free Software Foundation,
 17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 18  *
 19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 20  * or visit www.oracle.com if you need additional information or have any
 21  * questions.
 22  *
 23  */
 24 
 25 #ifndef CPU_AARCH64_C2_MACROASSEMBLER_AARCH64_HPP
 26 #define CPU_AARCH64_C2_MACROASSEMBLER_AARCH64_HPP
 27 
 28 // C2_MacroAssembler contains high-level macros for C2
 29 
 30  private:
 31   // Return true if the phase output is in the scratch emit size mode.
      // Declared with `override`, so it replaces a virtual from a base class
      // that is not visible in this header fragment. Declaration only.
 32   virtual bool in_scratch_emit_size() override;
 33 
      // Private helper. Takes an int `opc`, a bool `sf`, three general-purpose
      // registers (Rd, Rn, Rm) and an optional shift whose kind and amount
      // default to Assembler::LSL and 0. Declaration only.
      // NOTE(review): presumably emits the scalar logical operation used by
      // neon_reduce_logical(), with `sf` selecting the 64-bit form -- confirm
      // against the definition.
 34   void neon_reduce_logical_helper(int opc, bool sf, Register Rd, Register Rn, Register Rm,
 35                                   enum shift_kind kind = Assembler::LSL, unsigned shift = 0);
 36 
 37  public:
 38   void entry_barrier();  // NOTE(review): presumably emits the nmethod entry barrier; takes no arguments, definition not in this header -- confirm.
 39 
 40   // Code used by cmpFastLock and cmpFastUnlock mach instructions in .ad file.
      // Both routines take the object and box registers plus temporaries
      // (three for fast_lock, two for fast_unlock). Declarations only.
      // NOTE(review): which registers are clobbered and how success/failure is
      // reported is not visible here -- check the definitions.
 41   void fast_lock(Register object, Register box, Register tmp, Register tmp2, Register tmp3);
 42   void fast_unlock(Register object, Register box, Register tmp, Register tmp2);
 43   // Code used by cmpFastLockLightweight and cmpFastUnlockLightweight mach instructions in .ad file.
      // Both routines take the object and box registers plus three temporaries
      // (t1, t2, t3). Declarations only.
      // NOTE(review): clobbered registers and the way the outcome is reported
      // are not visible here -- check the definitions.
 44   void fast_lock_lightweight(Register object, Register box, Register t1, Register t2, Register t3);
 45   void fast_unlock_lightweight(Register object, Register box, Register t1, Register t2, Register t3);
 46 
      // String comparison routine. Roles as suggested by the parameter names:
      // str1/str2 and cnt1/cnt2 are the two operands and their counts, result
      // receives the outcome, and the remaining general (tmp*), vector (vtmp*)
      // and predicate (pgtmp*) registers are temporaries.
      // NOTE(review): `ae` presumably selects the operand encodings, and the
      // unit of the counts (bytes vs. characters) is not visible here -- confirm.
 47   void string_compare(Register str1, Register str2,
 48                       Register cnt1, Register cnt2, Register result,
 49                       Register tmp1, Register tmp2, FloatRegister vtmp1,
 50                       FloatRegister vtmp2, FloatRegister vtmp3,
 51                       PRegister pgtmp1, PRegister pgtmp2, int ae);
 52 
      // String index-of routine over two strings (str1/str2) with their counts
      // (cnt1/cnt2), six general-purpose temporaries and a result register.
      // NOTE(review): which of str1/str2 is the pattern and which the source is
      // not visible here; `int_cnt1` is presumably a compile-time count for
      // cnt1 and `ae` the encoding selector -- confirm against the definition.
 53   void string_indexof(Register str1, Register str2,
 54                       Register cnt1, Register cnt2,
 55                       Register tmp1, Register tmp2,
 56                       Register tmp3, Register tmp4,
 57                       Register tmp5, Register tmp6,
 58                       int int_cnt1, Register result, int ae);
 59 
      // Index-of-character routine: takes a string (str1) with its count
      // (cnt1), the character register (ch), a result register and three
      // general-purpose temporaries. Declaration only.
      // NOTE(review): character width and the "not found" result value are not
      // visible here -- confirm against the definition.
 60   void string_indexof_char(Register str1, Register cnt1,
 61                            Register ch, Register result,
 62                            Register tmp1, Register tmp2, Register tmp3);
 63 
      // Same parameter list as string_indexof_char(). Declaration only.
      // NOTE(review): the "L" in the name presumably marks the Latin-1
      // (byte-per-character) variant -- confirm against the definition.
 64   void stringL_indexof_char(Register str1, Register cnt1,
 65                             Register ch, Register result,
 66                             Register tmp1, Register tmp2, Register tmp3);
 67 
      // Index-of-character routine taking vector (ztmp1, ztmp2) and predicate
      // (pgtmp, ptmp) temporaries instead of general-purpose ones, plus an
      // `isL` flag. Declaration only.
      // NOTE(review): presumably the SVE implementation, with isL == true
      // selecting Latin-1 input -- confirm against the definition.
 68   void string_indexof_char_sve(Register str1, Register cnt1,
 69                                Register ch, Register result,
 70                                FloatRegister ztmp1, FloatRegister ztmp2,
 71                                PRegister pgtmp, PRegister ptmp, bool isL);
 72 
 73   // Compress the least significant bit of each byte to the rightmost and clear
 74   // the higher garbage bits.
      // NOTE(review): dst is the only parameter, so it presumably serves as
      // both input and output -- confirm against the definition.
 75   void bytemask_compress(Register dst);
 76 
 77   // Pack the lowest-numbered bit of each mask element in src into a long value
 78   // in dst, at most the first 64 lane elements.
      // bt and lane_cnt describe the mask (element type and lane count, going
      // by their names); vtmp1/vtmp2 are vector temporaries.
      // NOTE(review): parameter roles are inferred from names -- confirm.
 79   void sve_vmask_tolong(Register dst, PRegister src, BasicType bt, int lane_cnt,
 80                         FloatRegister vtmp1, FloatRegister vtmp2);
 81 
 82   // Unpack the mask, a long value in src, into predicate register dst based on the
 83   // corresponding data type. Note that dst can support at most 64 lanes.
      // bt is the data type referred to above and lane_cnt the lane count
      // (going by their names); vtmp1/vtmp2 are vector temporaries.
      // NOTE(review): parameter roles are inferred from names -- confirm.
 84   void sve_vmask_fromlong(PRegister dst, Register src, BasicType bt, int lane_cnt,
 85                           FloatRegister vtmp1, FloatRegister vtmp2);
 86 
 87   // SIMD&FP comparison
      // neon_compare: two vector sources (src1, src2), an element type (bt), a
      // condition (cond) and a vector destination (dst). Declaration only.
      // NOTE(review): isQ presumably selects the 128-bit (Q) register form, and
      // the encoding of the per-lane result is not visible here -- confirm.
 88   void neon_compare(FloatRegister dst, BasicType bt, FloatRegister src1,
 89                     FloatRegister src2, Condition cond, bool isQ);
 90 
      // Single-source counterpart of neon_compare(): takes one vector source
      // (src) instead of two. Declaration only.
      // NOTE(review): presumably compares each lane of src against zero under
      // `cond`; isQ presumably selects the 128-bit form -- confirm.
 91   void neon_compare_zero(FloatRegister dst, BasicType bt, FloatRegister src,
 92                          Condition cond, bool isQ);
 93 
      // Comparison producing a predicate: destination predicate pd, element
      // type bt, a second predicate pg, two vector sources (zn, zm) and a
      // condition. Declaration only.
      // NOTE(review): pg is presumably the governing predicate -- confirm.
 94   void sve_compare(PRegister pd, BasicType bt, PRegister pg,
 95                    FloatRegister zn, FloatRegister zm, Condition cond);
 96 
      // Takes a source predicate (src), its element type (bt), a predicate
      // temporary (ptmp) and a general-purpose destination (dst).
      // NOTE(review): presumably writes the index of the last true lane of src
      // to dst; the value produced for an all-false mask is not visible here
      // -- confirm against the definition.
 97   void sve_vmask_lasttrue(Register dst, BasicType bt, PRegister src, PRegister ptmp);
 98 
 99   // Vector cast
      // neon_vector_extend: converts src (element type src_bt) into dst
      // (element type dst_bt, vector size dst_vlen_in_bytes). is_unsigned
      // defaults to false. Declaration only.
      // NOTE(review): presumably widens elements, zero-extending when
      // is_unsigned is true and sign-extending otherwise -- confirm.
100   void neon_vector_extend(FloatRegister dst, BasicType dst_bt, unsigned dst_vlen_in_bytes,
101                           FloatRegister src, BasicType src_bt, bool is_unsigned = false);
102 
      // Counterpart of neon_vector_extend(): here the vector size is given for
      // the source (src_vlen_in_bytes) rather than the destination.
      // NOTE(review): presumably narrows elements from src_bt to dst_bt; how
      // out-of-range values are handled is not visible here -- confirm.
103   void neon_vector_narrow(FloatRegister dst, BasicType dst_bt,
104                           FloatRegister src, BasicType src_bt, unsigned src_vlen_in_bytes);
105 
      // Element sizes are given as SIMD_RegVariant values (dst_size, src_size)
      // rather than BasicType. is_unsigned defaults to false. Declaration only.
      // NOTE(review): presumably widens elements from src_size to dst_size,
      // zero-extending when is_unsigned is true -- confirm.
106   void sve_vector_extend(FloatRegister dst, SIMD_RegVariant dst_size,
107                          FloatRegister src, SIMD_RegVariant src_size, bool is_unsigned = false);
108 
      // Like sve_vector_extend() in its use of SIMD_RegVariant sizes, but takes
      // an extra vector temporary (tmp) and no signedness flag.
      // NOTE(review): presumably narrows elements from src_size to dst_size;
      // whether tmp may alias dst/src is not visible here -- confirm.
109   void sve_vector_narrow(FloatRegister dst, SIMD_RegVariant dst_size,
110                          FloatRegister src, SIMD_RegVariant src_size, FloatRegister tmp);
111 
      // Predicate (mask) cast between element sizes, both given in bytes:
      // dst_element_length_in_bytes for dst and src_element_length_in_bytes
      // for src. Declaration only.
      // NOTE(review): "extend" presumably means the destination elements are
      // wider than the source elements -- confirm against the definition.
112   void sve_vmaskcast_extend(PRegister dst, PRegister src,
113                             uint dst_element_length_in_bytes, uint src_element_length_in_bytes);
114 
      // Predicate (mask) cast between element sizes, both given in bytes:
      // dst_element_length_in_bytes for dst and src_element_length_in_bytes
      // for src. Takes an additional predicate temporary (ptmp).
      // NOTE(review): "narrow" presumably means the destination elements are
      // narrower than the source elements -- confirm against the definition.
115   void sve_vmaskcast_narrow(PRegister dst, PRegister src, PRegister ptmp,
116                             uint dst_element_length_in_bytes, uint src_element_length_in_bytes);
117 
118   // Vector reduction
      // neon_reduce_add_integral: takes a scalar input (isrc), a vector input
      // (vsrc) of vector_length_in_bytes bytes with element type bt, a vector
      // temporary (vtmp) and a general-purpose destination (dst).
      // NOTE(review): presumably dst = isrc + sum of the lanes of vsrc --
      // confirm against the definition.
119   void neon_reduce_add_integral(Register dst, BasicType bt,
120                                 Register isrc, FloatRegister vsrc,
121                                 unsigned vector_length_in_bytes, FloatRegister vtmp);
122 
      // Same shape as neon_reduce_add_integral() but with two vector
      // temporaries (vtmp1, vtmp2). Declaration only.
      // NOTE(review): presumably dst = isrc * product of the lanes of vsrc --
      // confirm against the definition.
123   void neon_reduce_mul_integral(Register dst, BasicType bt,
124                                 Register isrc, FloatRegister vsrc,
125                                 unsigned vector_length_in_bytes,
126                                 FloatRegister vtmp1, FloatRegister vtmp2);
127 
      // Variant whose destination (dst) and scalar input (fsrc) are
      // FloatRegisters rather than general-purpose registers.
      // NOTE(review): presumably dst = fsrc * product of the lanes of vsrc; the
      // order in which lanes are multiplied is not visible here -- confirm.
128   void neon_reduce_mul_fp(FloatRegister dst, BasicType bt,
129                           FloatRegister fsrc, FloatRegister vsrc,
130                           unsigned vector_length_in_bytes, FloatRegister vtmp);
131 
      // Reduction selected by `opc`; takes a scalar input (isrc) and a vector
      // input (vsrc) and needs no vector temporary. Declaration only.
      // NOTE(review): the set of accepted opc values (presumably and/or/xor
      // reduction opcodes) is not visible here -- confirm.
132   void neon_reduce_logical(int opc, Register dst, BasicType bt, Register isrc,
133                            FloatRegister vsrc, unsigned vector_length_in_bytes);
134 
      // Reduction selected by `opc`, with a scalar input (isrc), a vector input
      // (vsrc) and one vector temporary (vtmp). Declaration only.
      // NOTE(review): opc presumably chooses between min and max reduction --
      // confirm against the definition.
135   void neon_reduce_minmax_integral(int opc, Register dst, BasicType bt,
136                                    Register isrc, FloatRegister vsrc,
137                                    unsigned vector_length_in_bytes, FloatRegister vtmp);
138 
      // Reduction selected by `opc` over a scalar input (src1) and a vector
      // input (src2), taking a predicate (pg) and one vector temporary (tmp).
      // NOTE(review): pg is presumably the governing predicate that selects
      // the lanes taking part in the reduction -- confirm.
139   void sve_reduce_integral(int opc, Register dst, BasicType bt, Register src1,
140                            FloatRegister src2, PRegister pg, FloatRegister tmp);
141 
142   // Set elements of the dst predicate to true for lanes in the range of
143   // [0, lane_cnt), or to false otherwise. The input "lane_cnt" should be
144   // smaller than or equal to the supported max vector length of the basic
145   // type. Clobbers: rscratch1 and the rFlagsReg.
      // lane_cnt is passed as an immediate uint32_t value, not in a register;
      // bt is the basic type referred to above.
146   void sve_gen_mask_imm(PRegister dst, BasicType bt, uint32_t lane_cnt);
147 
148   // Extract a scalar element from an sve vector at position 'idx'.
149   // The input elements in src are expected to be of integral type.
      // idx is a compile-time int, bt the element type, vtmp a vector temporary.
      // NOTE(review): whether the extracted value is sign- or zero-extended
      // into dst is not visible here -- confirm against the definition.
150   void sve_extract_integral(Register dst, BasicType bt, FloatRegister src,
151                             int idx, FloatRegister vtmp);
152 
153   // java.lang.Math::round intrinsics
      // Two variants: vector_round_neon takes three vector temporaries and a
      // SIMD_Arrangement; vector_round_sve takes two vector temporaries, a
      // predicate temporary (pgtmp) and a SIMD_RegVariant. Declarations only.
      // NOTE(review): T presumably selects the float vs. double lane layout --
      // confirm against the definitions.
154   void vector_round_neon(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
155                          FloatRegister tmp2, FloatRegister tmp3,
156                          SIMD_Arrangement T);
157   void vector_round_sve(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
158                         FloatRegister tmp2, PRegister pgtmp,
159                         SIMD_RegVariant T);
160 
161   // Pack active elements of src, under the control of mask, into the
162   // lowest-numbered elements of dst. Any remaining elements of dst will
163   // be filled with zero.
      // Takes four vector temporaries (vtmp1..vtmp4) and two predicate
      // temporaries (ptmp, pgtmp).
      // NOTE(review): the name indicates byte-sized elements; whether `mask`
      // is preserved is not visible here -- confirm against the definition.
164   void sve_compress_byte(FloatRegister dst, FloatRegister src, PRegister mask,
165                          FloatRegister vtmp1, FloatRegister vtmp2,
166                          FloatRegister vtmp3, FloatRegister vtmp4,
167                          PRegister ptmp, PRegister pgtmp);
168 
      // Same dst/src/mask parameters as sve_compress_byte(), with two vector
      // temporaries and one predicate temporary. Declaration only.
      // NOTE(review): presumably the same packing operation for short-sized
      // elements; whether `mask` is preserved is not visible here -- confirm.
169   void sve_compress_short(FloatRegister dst, FloatRegister src, PRegister mask,
170                           FloatRegister vtmp1, FloatRegister vtmp2,
171                           PRegister pgtmp);
172 
      // Takes a vector source and destination, an element type (bt) and isQ.
      // NOTE(review): presumably reverses the bit order within each bt-sized
      // element, with isQ selecting the 128-bit form -- confirm.
173   void neon_reverse_bits(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ);
174 
      // Same parameter list as neon_reverse_bits().
      // NOTE(review): presumably reverses the byte order within each bt-sized
      // element, with isQ selecting the 128-bit form -- confirm.
175   void neon_reverse_bytes(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ);
176 
177   // java.lang.Math::signum intrinsics
      // vector_signum_neon: takes src and dst vectors plus two further vector
      // registers named zero and one, and a SIMD_Arrangement. Declaration only.
      // NOTE(review): zero/one presumably must already hold the constants 0.0
      // and 1.0 in every lane on entry -- confirm against the definition.
178   void vector_signum_neon(FloatRegister dst, FloatRegister src, FloatRegister zero,
179                           FloatRegister one, SIMD_Arrangement T);
180 
      // Like vector_signum_neon() but with an extra vector temporary (vtmp), a
      // predicate temporary (pgtmp) and a SIMD_RegVariant instead of a
      // SIMD_Arrangement. Declaration only.
      // NOTE(review): zero/one presumably must already hold the constants 0.0
      // and 1.0 in every lane on entry -- confirm against the definition.
181   void vector_signum_sve(FloatRegister dst, FloatRegister src, FloatRegister zero,
182                          FloatRegister one, FloatRegister vtmp, PRegister pgtmp, SIMD_RegVariant T);
183 
184 #endif // CPU_AARCH64_C2_MACROASSEMBLER_AARCH64_HPP