1 /*
  2  * Copyright (c) 2020, 2026, Oracle and/or its affiliates. All rights reserved.
  3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  4  *
  5  * This code is free software; you can redistribute it and/or modify it
  6  * under the terms of the GNU General Public License version 2 only, as
  7  * published by the Free Software Foundation.
  8  *
  9  * This code is distributed in the hope that it will be useful, but WITHOUT
 10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 12  * version 2 for more details (a copy is included in the LICENSE file that
 13  * accompanied this code).
 14  *
 15  * You should have received a copy of the GNU General Public License version
 16  * 2 along with this work; if not, write to the Free Software Foundation,
 17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 18  *
 19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 20  * or visit www.oracle.com if you need additional information or have any
 21  * questions.
 22  *
 23  */
 24 
 25 #ifndef CPU_AARCH64_C2_MACROASSEMBLER_AARCH64_HPP
 26 #define CPU_AARCH64_C2_MACROASSEMBLER_AARCH64_HPP
 27 
 28 // C2_MacroAssembler contains high-level macros for C2
 29 
 30  private:
 31   // Return true if the phase output is in the scratch emit size mode.
      // Declared `override`, so this replaces a virtual member inherited from a
      // base class. Only the declaration appears in this header section.
 32   virtual bool in_scratch_emit_size() override;
 33 
      // Private helper taking an opcode `opc`, a flag `sf` and three Register
      // operands. `kind` and `shift` are optional and default to Assembler::LSL
      // and 0 respectively.
      // NOTE(review): presumably emits the scalar logical instruction selected by
      // `opc` on Rn and the (optionally shifted) Rm into Rd, with `sf` choosing the
      // operand width -- confirm against the definition.
 34   void neon_reduce_logical_helper(int opc, bool sf, Register Rd, Register Rn, Register Rm,
 35                                   enum shift_kind kind = Assembler::LSL, unsigned shift = 0);
 36 
 37   // Helper functions for min/max reduction operations
 38 
      // Takes a reduction opcode `opc` plus three non-const pointers and returns void.
      // NOTE(review): the pointers are presumably outputs (min-vs-max flag,
      // unsigned flag and the matching Condition) -- confirm against the definition.
 39   void decode_minmax_reduction_opc(int opc, bool* is_min, bool* is_unsigned, Condition* cond);
 40 
      // Calls exactly one of Assembler::uminp/umaxp/sminp/smaxp on this object,
      // forwarding (dst, size, src1, src2) unchanged. Selection:
      //   is_unsigned,  is_min -> uminp      is_unsigned, !is_min -> umaxp
      //  !is_unsigned,  is_min -> sminp     !is_unsigned, !is_min -> smaxp
 41   void neon_minmaxp(bool is_unsigned, bool is_min, FloatRegister dst,
 42                     SIMD_Arrangement size, FloatRegister src1, FloatRegister src2) {
        // `m` is a pointer to the chosen Assembler member function.
 43     auto m = is_unsigned ? (is_min ? &Assembler::uminp : &Assembler::umaxp)
 44                          : (is_min ? &Assembler::sminp : &Assembler::smaxp);
 45     (this->*m)(dst, size, src1, src2);
 46   }
 47 
 48   // Typedefs used to disambiguate overloaded member functions.
      // neon_reduction2: pointer to an Assembler member function that takes
      // (FloatRegister, SIMD_Arrangement, FloatRegister) and returns void.
 49   typedef void (Assembler::*neon_reduction2)
 50     (FloatRegister, SIMD_Arrangement, FloatRegister);
 51 
      // Calls exactly one of Assembler::uminv/umaxv/sminv/smaxv on this object,
      // forwarding (dst, size, src) unchanged. Selection:
      //   is_unsigned,  is_min -> uminv      is_unsigned, !is_min -> umaxv
      //  !is_unsigned,  is_min -> sminv     !is_unsigned, !is_min -> smaxv
 52   void neon_minmaxv(bool is_unsigned, bool is_min, FloatRegister dst,
 53                     SIMD_Arrangement size, FloatRegister src) {
        // The casts on uminv/umaxv name the neon_reduction2 signature explicitly;
        // sminv/smaxv are taken without a cast.
 54     auto m = is_unsigned ? (is_min ? (neon_reduction2)&Assembler::uminv
 55                                    : (neon_reduction2)&Assembler::umaxv)
 56                          : (is_min ? &Assembler::sminv
 57                                    : &Assembler::smaxv);
 58     (this->*m)(dst, size, src);
 59   }
 60 
      // Calls exactly one of Assembler::sve_uminv/sve_umaxv/sve_sminv/sve_smaxv on
      // this object, forwarding (dst, size, pg, src) unchanged. Selection:
      //   is_unsigned,  is_min -> sve_uminv      is_unsigned, !is_min -> sve_umaxv
      //  !is_unsigned,  is_min -> sve_sminv     !is_unsigned, !is_min -> sve_smaxv
 61   void sve_minmaxv(bool is_unsigned, bool is_min, FloatRegister dst,
 62                    SIMD_RegVariant size, PRegister pg, FloatRegister src) {
        // `m` is a pointer to the chosen Assembler member function.
 63     auto m = is_unsigned ? (is_min ? &Assembler::sve_uminv : &Assembler::sve_umaxv)
 64                          : (is_min ? &Assembler::sve_sminv : &Assembler::sve_smaxv);
 65     (this->*m)(dst, size, pg, src);
 66   }
 67 
      // Private NEON/SVE pair with parallel parameter lists: dst, src1, src2, index,
      // tmp and the vector length in bytes. The SVE form additionally takes a
      // SIMD_RegVariant `T`.
      // NOTE(review): presumably the per-ISA back ends of the public
      // select_from_two_vectors() declared later in this header -- confirm.
 68   void select_from_two_vectors_neon(FloatRegister dst, FloatRegister src1,
 69                                     FloatRegister src2, FloatRegister index,
 70                                     FloatRegister tmp, unsigned vector_length_in_bytes);
 71 
 72   void select_from_two_vectors_sve(FloatRegister dst, FloatRegister src1,
 73                                    FloatRegister src2, FloatRegister index,
 74                                    FloatRegister tmp, SIMD_RegVariant T,
 75                                    unsigned vector_length_in_bytes);
 76 
 77  public:
      // Takes no arguments and returns nothing.
      // NOTE(review): no contract is stated here; presumably emits the method
      // entry barrier sequence -- confirm against the definition.
 78   void entry_barrier();
 79 
 80   // jdk.internal.util.ArraysSupport.vectorizedHashCode
      // Takes three Register operands, ten FloatRegister operands and the element
      // BasicType `eltype`; returns an `address`.
      // NOTE(review): presumably ary/cnt are the array and its element count,
      // `result` receives the hash and the v* registers are vector scratch; what
      // the returned address denotes is not stated here -- confirm.
 81   address arrays_hashcode(Register ary, Register cnt, Register result, FloatRegister vdata0,
 82                           FloatRegister vdata1, FloatRegister vdata2, FloatRegister vdata3,
 83                           FloatRegister vmul0, FloatRegister vmul1, FloatRegister vmul2,
 84                           FloatRegister vmul3, FloatRegister vpow, FloatRegister vpowm,
 85                           BasicType eltype);
 86 
 87   // Code used by cmpFastLock and cmpFastUnlock mach instructions in .ad file.
      // Both declarations take the same five Register operands: object, box and
      // t1..t3 (NOTE(review): t1..t3 are presumably temporaries -- confirm).
 88   void fast_lock(Register object, Register box, Register t1, Register t2, Register t3);
 89   void fast_unlock(Register object, Register box, Register t1, Register t2, Register t3);
 90 
      // NOTE(review): no contract is given in this header. Presumably compares the
      // strings at str1/str2 (lengths cnt1/cnt2) and leaves the outcome in `result`,
      // using tmp*/vtmp*/pgtmp* as scratch, with `ae` selecting the operand
      // encodings -- confirm against the definition and the .ad file.
 91   void string_compare(Register str1, Register str2,
 92                       Register cnt1, Register cnt2, Register result,
 93                       Register tmp1, Register tmp2, FloatRegister vtmp1,
 94                       FloatRegister vtmp2, FloatRegister vtmp3,
 95                       PRegister pgtmp1, PRegister pgtmp2, int ae);
 96 
      // NOTE(review): no contract is given in this header. Presumably a substring
      // search over str1/str2 (lengths cnt1/cnt2) with the index left in `result`;
      // which string is the pattern, and the meaning of `int_cnt1` and `ae`, are
      // not stated here -- confirm against the definition.
 97   void string_indexof(Register str1, Register str2,
 98                       Register cnt1, Register cnt2,
 99                       Register tmp1, Register tmp2,
100                       Register tmp3, Register tmp4,
101                       Register tmp5, Register tmp6,
102                       int int_cnt1, Register result, int ae);
103 
      // Two declarations with identical parameter lists: str1, cnt1, ch, result and
      // three Register operands tmp1..tmp3.
      // NOTE(review): presumably both look for `ch` in str1 (length cnt1) and put
      // the index in `result`; the `L` in stringL_indexof_char presumably marks the
      // Latin-1 variant -- confirm against the definitions.
104   void string_indexof_char(Register str1, Register cnt1,
105                            Register ch, Register result,
106                            Register tmp1, Register tmp2, Register tmp3);
107 
108   void stringL_indexof_char(Register str1, Register cnt1,
109                             Register ch, Register result,
110                             Register tmp1, Register tmp2, Register tmp3);
111 
      // Same leading operands as string_indexof_char (str1, cnt1, ch, result), but
      // with two FloatRegister and two PRegister operands and a bool `isL`.
      // NOTE(review): presumably the SVE implementation of the char search, with
      // `isL` selecting Latin-1 vs. UTF-16 input -- confirm against the definition.
112   void string_indexof_char_sve(Register str1, Register cnt1,
113                                Register ch, Register result,
114                                FloatRegister ztmp1, FloatRegister ztmp2,
115                                PRegister pgtmp, PRegister ptmp, bool isL);
116 
117   // Compress the least significant bit of each byte to the rightmost and clear
118   // the higher garbage bits.
      // `dst` is the only parameter (NOTE(review): so it presumably serves as both
      // input and output -- confirm against the definition).
119   void bytemask_compress(Register dst);
120 
121   // Pack the mask elements of "src" into a long value in "dst", one bit per lane, for
122   // at most the first 64 lanes. The input "src" is a vector of booleans represented as
123   // bytes with 0x00/0x01 as element values. Each lane value from "src" is packed into
124   // one bit in "dst".
125   void sve_vmask_tolong(Register dst, FloatRegister src, FloatRegister vtmp, int lane_cnt);
126 
      // Same parameters as sve_vmask_tolong except that it takes two FloatRegister
      // operands vtmp1/vtmp2 instead of one.
      // NOTE(review): presumably the same contract implemented with SVE2
      // instructions -- confirm against the definition.
127   void sve2_vmask_tolong(Register dst, FloatRegister src, FloatRegister vtmp1,
128                          FloatRegister vtmp2, int lane_cnt);
129 
130   // Unpack the mask, a long value in "src", into vector register "dst" with boolean type.
131   // Each bit in "src" is unpacked into one byte lane in "dst". Note that "dst" can support
132   // at most 64 lanes.
      // Also takes a FloatRegister `vtmp` and the lane count `lane_cnt`.
133   void sve_vmask_fromlong(FloatRegister dst, Register src, FloatRegister vtmp, int lane_cnt);
134 
135   // SIMD&FP comparison
      // All three declarations take the element BasicType `bt` and a Condition
      // `cond`. The NEON forms have a FloatRegister `dst` and an `isQ` flag;
      // neon_compare has two FloatRegister sources, neon_compare_zero has one.
      // NOTE(review): neon_compare_zero presumably compares `src` against zero, and
      // `isQ` presumably selects the 128-bit form -- confirm against the definitions.
136   void neon_compare(FloatRegister dst, BasicType bt, FloatRegister src1,
137                     FloatRegister src2, Condition cond, bool isQ);
138 
139   void neon_compare_zero(FloatRegister dst, BasicType bt, FloatRegister src,
140                          Condition cond, bool isQ);
141 
      // SVE form: the first parameter is a PRegister `pd`, followed by a PRegister
      // `pg` and two FloatRegister operands zn/zm.
      // NOTE(review): `pd` is presumably the result predicate and `pg` the governing
      // predicate -- confirm.
142   void sve_compare(PRegister pd, BasicType bt, PRegister pg,
143                    FloatRegister zn, FloatRegister zm, Condition cond);
144 
      // Takes a Register `dst`, the element BasicType `bt` and two PRegister
      // operands `src` and `ptmp`.
      // NOTE(review): presumably writes the index of the last true lane of mask
      // `src` into `dst`, using `ptmp` as scratch -- confirm against the definition.
145   void sve_vmask_lasttrue(Register dst, BasicType bt, PRegister src, PRegister ptmp);
146 
147   // Vector cast
      // NEON forms describe element types with BasicType (dst_bt/src_bt) and carry a
      // vector length in bytes: for the destination in neon_vector_extend, for the
      // source in neon_vector_narrow. `is_unsigned` is optional and defaults to false.
      // NOTE(review): `is_unsigned` presumably selects zero- rather than
      // sign-extension -- confirm against the definitions.
148   void neon_vector_extend(FloatRegister dst, BasicType dst_bt, unsigned dst_vlen_in_bytes,
149                           FloatRegister src, BasicType src_bt, bool is_unsigned = false);
150 
151   void neon_vector_narrow(FloatRegister dst, BasicType dst_bt,
152                           FloatRegister src, BasicType src_bt, unsigned src_vlen_in_bytes);
153 
      // SVE forms describe element sizes with SIMD_RegVariant (dst_size/src_size).
      // sve_vector_extend has an optional `is_unsigned` (default false);
      // sve_vector_narrow additionally takes a FloatRegister `tmp`.
154   void sve_vector_extend(FloatRegister dst, SIMD_RegVariant dst_size,
155                          FloatRegister src, SIMD_RegVariant src_size, bool is_unsigned = false);
156 
157   void sve_vector_narrow(FloatRegister dst, SIMD_RegVariant dst_size,
158                          FloatRegister src, SIMD_RegVariant src_size, FloatRegister tmp);
159 
      // Takes two PRegister operands `dst`/`src` followed by the destination and
      // source element lengths in bytes.
      // NOTE(review): presumably casts the mask in `src` to a wider element size in
      // `dst` -- confirm against the definition.
160   void sve_vmaskcast_extend(PRegister dst, PRegister src,
161                             uint dst_element_length_in_bytes, uint src_element_length_in_bytes);
162 
      // Takes PRegister operands `dst`, `src` and `ptmp`, followed by the destination
      // and source element lengths in bytes.
      // NOTE(review): presumably casts the mask in `src` to a narrower element size
      // in `dst`, using `ptmp` as scratch -- confirm against the definition.
163   void sve_vmaskcast_narrow(PRegister dst, PRegister src, PRegister ptmp,
164                             uint dst_element_length_in_bytes, uint src_element_length_in_bytes);
165 
166   // Vector reduction
      // The integral forms take a Register `dst`, the element BasicType `bt`, a
      // Register `isrc`, a FloatRegister `vsrc`, the vector length in bytes and
      // FloatRegister temporaries (one for add, two for mul).
      // NOTE(review): presumably `isrc` is a scalar input combined with the
      // reduction of `vsrc` -- confirm against the definitions.
167   void neon_reduce_add_integral(Register dst, BasicType bt,
168                                 Register isrc, FloatRegister vsrc,
169                                 unsigned vector_length_in_bytes, FloatRegister vtmp);
170 
171   void neon_reduce_mul_integral(Register dst, BasicType bt,
172                                 Register isrc, FloatRegister vsrc,
173                                 unsigned vector_length_in_bytes,
174                                 FloatRegister vtmp1, FloatRegister vtmp2);
175 
      // Floating-point form: `dst` and the scalar input `fsrc` are FloatRegisters.
176   void neon_reduce_mul_fp(FloatRegister dst, BasicType bt,
177                           FloatRegister fsrc, FloatRegister vsrc,
178                           unsigned vector_length_in_bytes, FloatRegister vtmp);
179 
      // Both declarations are parameterized by an opcode `opc` and take a Register
      // `dst`, the element BasicType `bt`, a Register `isrc`, a FloatRegister `vsrc`
      // and the vector length in bytes. The min/max form also takes a FloatRegister
      // `vtmp`; the logical form takes no temporary.
      // NOTE(review): `opc` presumably selects the concrete reduction (e.g. which
      // logical op, or min vs. max) -- confirm against the definitions.
180   void neon_reduce_logical(int opc, Register dst, BasicType bt, Register isrc,
181                            FloatRegister vsrc, unsigned vector_length_in_bytes);
182 
183   void neon_reduce_minmax_integral(int opc, Register dst, BasicType bt,
184                                    Register isrc, FloatRegister vsrc,
185                                    unsigned vector_length_in_bytes, FloatRegister vtmp);
186 
      // Takes an opcode `opc`, a Register `dst`, the element BasicType `bt`, a
      // Register `src1`, a FloatRegister `src2`, a PRegister `pg` and a
      // FloatRegister `tmp`.
      // NOTE(review): presumably the SVE counterpart of the NEON integral
      // reductions, with `pg` as governing predicate -- confirm against the definition.
187   void sve_reduce_integral(int opc, Register dst, BasicType bt, Register src1,
188                            FloatRegister src2, PRegister pg, FloatRegister tmp);
189 
190   // Set elements of the dst predicate to true for lanes in the range of
191   // [0, lane_cnt), or to false otherwise. The input "lane_cnt" should be
192   // smaller than or equal to the supported max vector length of the basic
193   // type. Clobbers: rscratch1 and the rFlagsReg.
      // `bt` is the element BasicType; `lane_cnt` is passed as an immediate uint32_t.
194   void sve_gen_mask_imm(PRegister dst, BasicType bt, uint32_t lane_cnt);
195 
196   // Extract a scalar element from an sve vector at position 'idx'.
197   // The input elements in src are expected to be of integral type.
      // `dst` is a Register, `bt` the element BasicType; a FloatRegister `vtmp` is
      // also taken (NOTE(review): presumably scratch -- confirm).
198   void sve_extract_integral(Register dst, BasicType bt, FloatRegister src,
199                             int idx, FloatRegister vtmp);
200 
201   // java.lang.Math::round intrinsics
      // NEON form: three FloatRegister temporaries and a SIMD_Arrangement `T`.
202   void vector_round_neon(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
203                          FloatRegister tmp2, FloatRegister tmp3,
204                          SIMD_Arrangement T);
      // SVE form: two FloatRegister temporaries, a PRegister `pgtmp` and a
      // SIMD_RegVariant `T`.
205   void vector_round_sve(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
206                         FloatRegister tmp2, PRegister pgtmp,
207                         SIMD_RegVariant T);
208 
209   // Pack active elements of src, under the control of mask, into the
210   // lowest-numbered elements of dst. Any remaining elements of dst will
211   // be filled with zero.
      // Byte form: three FloatRegister and two PRegister operands besides dst/src/mask,
      // plus the vector length in bytes.
212   void sve_compress_byte(FloatRegister dst, FloatRegister src, PRegister mask,
213                          FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
214                          PRegister ptmp, PRegister pgtmp, unsigned vector_length_in_bytes);
215 
      // Short form: takes FloatRegisters `vzr` and `vtmp` and one PRegister `pgtmp`.
      // NOTE(review): presumably the same contract as sve_compress_byte for short
      // elements; `vzr` looks like a register expected to hold zero -- confirm.
216   void sve_compress_short(FloatRegister dst, FloatRegister src, PRegister mask,
217                           FloatRegister vzr, FloatRegister vtmp,
218                           PRegister pgtmp, unsigned vector_length_in_bytes);
219 
      // neon_reverse_bits and neon_reverse_bytes share one parameter list: dst, src,
      // the element BasicType `bt` and an `isQ` flag.
      // NOTE(review): presumably reverse the bit / byte order within each element
      // of type `bt`; `isQ` presumably selects the 128-bit form -- confirm.
220   void neon_reverse_bits(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ);
221 
222   void neon_reverse_bytes(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ);
223 
      // Takes dst, src, a FloatRegister `shuffle`, a FloatRegister `tmp`, the element
      // BasicType `bt` and `isQ`.
      // NOTE(review): the `hsd` suffix presumably refers to half/single/double-word
      // element sizes -- confirm against the definition.
224   void neon_rearrange_hsd(FloatRegister dst, FloatRegister src, FloatRegister shuffle,
225                           FloatRegister tmp, BasicType bt, bool isQ);
226   // java.lang.Math::signum intrinsics
      // Both forms take FloatRegisters named `zero` and `one` in addition to dst/src.
      // NOTE(review): `zero` and `one` presumably hold the constants 0.0 and 1.0 on
      // entry -- confirm against the callers.
227   void vector_signum_neon(FloatRegister dst, FloatRegister src, FloatRegister zero,
228                           FloatRegister one, SIMD_Arrangement T);
229 
      // SVE form: additionally takes a FloatRegister `vtmp` and a PRegister `pgtmp`,
      // and `T` is a SIMD_RegVariant instead of a SIMD_Arrangement.
230   void vector_signum_sve(FloatRegister dst, FloatRegister src, FloatRegister zero,
231                          FloatRegister one, FloatRegister vtmp, PRegister pgtmp, SIMD_RegVariant T);
232 
      // Parallel int/long declarations: an index `idx`, a pointer to a const
      // TypeInt/TypeLong `t`, the Register `val` and a Register `tmp`.
      // NOTE(review): presumably emit a check that the value in `val` lies within
      // the range described by `t` -- confirm against the definitions.
233   void verify_int_in_range(uint idx, const TypeInt* t, Register val, Register tmp);
234   void verify_long_in_range(uint idx, const TypeLong* t, Register val, Register tmp);
235 
      // Takes a single Register `rtmp`.
      // NOTE(review): no contract is stated here; presumably recomputes the frame
      // pointer using `rtmp` as scratch -- confirm against the definition.
236   void reconstruct_frame_pointer(Register rtmp);
237 
238   // Select from a table of two vectors
      // Public entry point: takes dst, src1, src2, index, tmp, the element BasicType
      // `bt` and the vector length in bytes.
      // NOTE(review): presumably dispatches to the private
      // select_from_two_vectors_neon/_sve helpers declared earlier in this header --
      // confirm against the definition.
239   void select_from_two_vectors(FloatRegister dst, FloatRegister src1, FloatRegister src2,
240                                FloatRegister index, FloatRegister tmp, BasicType bt,
241                                unsigned vector_length_in_bytes);
242 
      // NEON/SVE pair with parallel parameter lists; the mask is a FloatRegister
      // `mask` in the NEON form and a PRegister `pg` in the SVE form.
      // NOTE(review): vector_length_in_bytes is a signed int here, whereas most
      // other declarations in this header use unsigned.
      // NOTE(review): presumably the inverse of a compress operation (scatter the
      // leading elements of src to the active lanes) -- confirm against the definitions.
243   void vector_expand_neon(FloatRegister dst, FloatRegister src, FloatRegister mask,
244                           FloatRegister tmp1, FloatRegister tmp2, BasicType bt,
245                           int vector_length_in_bytes);
246   void vector_expand_sve(FloatRegister dst, FloatRegister src, PRegister pg,
247                          FloatRegister tmp1, FloatRegister tmp2, BasicType bt,
248                          int vector_length_in_bytes);
249 #endif // CPU_AARCH64_C2_MACROASSEMBLER_AARCH64_HPP