1 /*
2 * Copyright (c) 2020, 2026, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.
8 *
9 * This code is distributed in the hope that it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12 * version 2 for more details (a copy is included in the LICENSE file that
13 * accompanied this code).
14 *
15 * You should have received a copy of the GNU General Public License version
16 * 2 along with this work; if not, write to the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18 *
19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20 * or visit www.oracle.com if you need additional information or have any
21 * questions.
22 *
23 */
24
25 #ifndef CPU_AARCH64_C2_MACROASSEMBLER_AARCH64_HPP
26 #define CPU_AARCH64_C2_MACROASSEMBLER_AARCH64_HPP
27
28 // C2_MacroAssembler contains high-level macros for C2
29
30 private:
31 // Return true if the phase output is in the scratch emit size mode.
32 virtual bool in_scratch_emit_size() override;
33
34 void neon_reduce_logical_helper(int opc, bool sf, Register Rd, Register Rn, Register Rm,
35 enum shift_kind kind = Assembler::LSL, unsigned shift = 0);
36
37 // Helper functions for min/max reduction operations
38
39 void decode_minmax_reduction_opc(int opc, bool* is_min, bool* is_unsigned, Condition* cond);
40
41 void neon_minmaxp(bool is_unsigned, bool is_min, FloatRegister dst,
42 SIMD_Arrangement size, FloatRegister src1, FloatRegister src2) {
43 auto m = is_unsigned ? (is_min ? &Assembler::uminp : &Assembler::umaxp)
44 : (is_min ? &Assembler::sminp : &Assembler::smaxp);
45 (this->*m)(dst, size, src1, src2);
46 }
47
48 // Typedefs used to disambiguate overloaded member functions.
49 typedef void (Assembler::*neon_reduction2)
50 (FloatRegister, SIMD_Arrangement, FloatRegister);
51
52 void neon_minmaxv(bool is_unsigned, bool is_min, FloatRegister dst,
53 SIMD_Arrangement size, FloatRegister src) {
54 auto m = is_unsigned ? (is_min ? (neon_reduction2)&Assembler::uminv
55 : (neon_reduction2)&Assembler::umaxv)
56 : (is_min ? &Assembler::sminv
57 : &Assembler::smaxv);
58 (this->*m)(dst, size, src);
59 }
60
61 void sve_minmaxv(bool is_unsigned, bool is_min, FloatRegister dst,
62 SIMD_RegVariant size, PRegister pg, FloatRegister src) {
63 auto m = is_unsigned ? (is_min ? &Assembler::sve_uminv : &Assembler::sve_umaxv)
64 : (is_min ? &Assembler::sve_sminv : &Assembler::sve_smaxv);
65 (this->*m)(dst, size, pg, src);
66 }
67
68 void select_from_two_vectors_neon(FloatRegister dst, FloatRegister src1,
69 FloatRegister src2, FloatRegister index,
70 FloatRegister tmp, unsigned vector_length_in_bytes);
71
72 void select_from_two_vectors_sve(FloatRegister dst, FloatRegister src1,
73 FloatRegister src2, FloatRegister index,
74 FloatRegister tmp, SIMD_RegVariant T,
75 unsigned vector_length_in_bytes);
76
77 public:
78 using Assembler::sve_cpy;
79
80 void entry_barrier();
81
82 // jdk.internal.util.ArraysSupport.vectorizedHashCode
83 address arrays_hashcode(Register ary, Register cnt, Register result, FloatRegister vdata0,
84 FloatRegister vdata1, FloatRegister vdata2, FloatRegister vdata3,
85 FloatRegister vmul0, FloatRegister vmul1, FloatRegister vmul2,
86 FloatRegister vmul3, FloatRegister vpow, FloatRegister vpowm,
87 BasicType eltype);
88
89 // Code used by cmpFastLock and cmpFastUnlock mach instructions in .ad file.
90 void fast_lock(Register object, Register box, Register t1, Register t2, Register t3);
91 void fast_unlock(Register object, Register box, Register t1, Register t2, Register t3);
92
93 void string_compare(Register str1, Register str2,
94 Register cnt1, Register cnt2, Register result,
95 Register tmp1, Register tmp2, FloatRegister vtmp1,
96 FloatRegister vtmp2, FloatRegister vtmp3,
97 PRegister pgtmp1, PRegister pgtmp2, int ae);
98
99 void string_indexof(Register str1, Register str2,
100 Register cnt1, Register cnt2,
101 Register tmp1, Register tmp2,
102 Register tmp3, Register tmp4,
103 Register tmp5, Register tmp6,
104 int int_cnt1, Register result, int ae);
105
106 void string_indexof_char(Register str1, Register cnt1,
107 Register ch, Register result,
108 Register tmp1, Register tmp2, Register tmp3);
109
110 void stringL_indexof_char(Register str1, Register cnt1,
111 Register ch, Register result,
112 Register tmp1, Register tmp2, Register tmp3);
113
114 void string_indexof_char_sve(Register str1, Register cnt1,
115 Register ch, Register result,
116 FloatRegister ztmp1, FloatRegister ztmp2,
117 PRegister pgtmp, PRegister ptmp, bool isL);
118
119 // Compress the least significant bit of each byte to the rightmost and clear
120 // the higher garbage bits.
121 void bytemask_compress(Register dst);
122
123 // Pack the value of each mask element in "src" into a long value in "dst", at most the
124 // first 64 lane elements. The input "src" is a vector of boolean represented as bytes
125 // with 0x00/0x01 as element values. Each lane value from "src" is packed into one bit in
126 // "dst".
127 void sve_vmask_tolong(Register dst, FloatRegister src, FloatRegister vtmp, int lane_cnt);
128
129 void sve2_vmask_tolong(Register dst, FloatRegister src, FloatRegister vtmp1,
130 FloatRegister vtmp2, int lane_cnt);
131
132 // Unpack the mask, a long value in "src", into vector register "dst" with boolean type.
133 // Each bit in "src" is unpacked into one byte lane in "dst". Note that "dst" can support
134 // at most 64 lanes.
135 void sve_vmask_fromlong(FloatRegister dst, Register src, FloatRegister vtmp, int lane_cnt);
136
137 // SIMD&FP comparison
138 void neon_compare(FloatRegister dst, BasicType bt, FloatRegister src1,
139 FloatRegister src2, Condition cond, bool isQ);
140
141 void neon_compare_zero(FloatRegister dst, BasicType bt, FloatRegister src,
142 Condition cond, bool isQ);
143
144 void sve_compare(PRegister pd, BasicType bt, PRegister pg,
145 FloatRegister zn, FloatRegister zm, Condition cond);
146
147 void sve_vmask_lasttrue(Register dst, BasicType bt, PRegister src, PRegister ptmp);
148
149 // Vector cast
150 void neon_vector_extend(FloatRegister dst, BasicType dst_bt, unsigned dst_vlen_in_bytes,
151 FloatRegister src, BasicType src_bt, bool is_unsigned = false);
152
153 void neon_vector_narrow(FloatRegister dst, BasicType dst_bt,
154 FloatRegister src, BasicType src_bt, unsigned src_vlen_in_bytes);
155
156 void sve_vector_extend(FloatRegister dst, SIMD_RegVariant dst_size,
157 FloatRegister src, SIMD_RegVariant src_size, bool is_unsigned = false);
158
159 void sve_vector_narrow(FloatRegister dst, SIMD_RegVariant dst_size,
160 FloatRegister src, SIMD_RegVariant src_size, FloatRegister tmp);
161
162 void sve_vmaskcast_extend(PRegister dst, PRegister src,
163 uint dst_element_length_in_bytes, uint src_element_lenght_in_bytes);
164
165 void sve_vmaskcast_narrow(PRegister dst, PRegister src, PRegister ptmp,
166 uint dst_element_length_in_bytes, uint src_element_lenght_in_bytes);
167
168 // Vector reduction
169 void neon_reduce_add_integral(Register dst, BasicType bt,
170 Register isrc, FloatRegister vsrc,
171 unsigned vector_length_in_bytes, FloatRegister vtmp);
172
173 void neon_reduce_mul_integral(Register dst, BasicType bt,
174 Register isrc, FloatRegister vsrc,
175 unsigned vector_length_in_bytes,
176 FloatRegister vtmp1, FloatRegister vtmp2);
177
178 void neon_reduce_mul_fp(FloatRegister dst, BasicType bt,
179 FloatRegister fsrc, FloatRegister vsrc,
180 unsigned vector_length_in_bytes, FloatRegister vtmp);
181
182 void neon_reduce_add_fp16(FloatRegister dst, FloatRegister fsrc, FloatRegister vsrc,
183 unsigned vector_length_in_bytes, FloatRegister vtmp);
184
185 void neon_reduce_logical(int opc, Register dst, BasicType bt, Register isrc,
186 FloatRegister vsrc, unsigned vector_length_in_bytes);
187
188 void neon_reduce_minmax_integral(int opc, Register dst, BasicType bt,
189 Register isrc, FloatRegister vsrc,
190 unsigned vector_length_in_bytes, FloatRegister vtmp);
191
192 void sve_reduce_integral(int opc, Register dst, BasicType bt, Register src1,
193 FloatRegister src2, PRegister pg, FloatRegister tmp);
194
// Set the elements of predicate "dst" to true for lanes in [0, lane_cnt) and
// to false for all other lanes. "lane_cnt" should be smaller than or equal
// to the supported max vector length of the basic type "bt".
// Clobbers: rscratch1 and the rFlagsReg.
void sve_gen_mask_imm(PRegister dst, BasicType bt, uint32_t lane_cnt);
200
201 // Extract a scalar element from an sve vector at position 'idx'.
202 // The input elements in src are expected to be of integral type.
203 void sve_extract_integral(Register dst, BasicType bt, FloatRegister src,
204 int idx, FloatRegister vtmp);
205
206 // java.lang.Math::round intrinsics
207 void vector_round_neon(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
208 FloatRegister tmp2, FloatRegister tmp3,
209 SIMD_Arrangement T);
210 void vector_round_sve(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
211 FloatRegister tmp2, PRegister pgtmp,
212 SIMD_RegVariant T);
213
214 // Pack active elements of src, under the control of mask, into the
215 // lowest-numbered elements of dst. Any remaining elements of dst will
216 // be filled with zero.
217 void sve_compress_byte(FloatRegister dst, FloatRegister src, PRegister mask,
218 FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
219 PRegister ptmp, PRegister pgtmp, unsigned vector_length_in_bytes);
220
221 void sve_compress_short(FloatRegister dst, FloatRegister src, PRegister mask,
222 FloatRegister vzr, FloatRegister vtmp,
223 PRegister pgtmp, unsigned vector_length_in_bytes);
224
225 void neon_reverse_bits(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ);
226
227 void neon_reverse_bytes(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ);
228
229 void neon_rearrange_hsd(FloatRegister dst, FloatRegister src, FloatRegister shuffle,
230 FloatRegister tmp, BasicType bt, bool isQ);
231 // java.lang.Math::signum intrinsics
232 void vector_signum_neon(FloatRegister dst, FloatRegister src, FloatRegister zero,
233 FloatRegister one, SIMD_Arrangement T);
234
235 void vector_signum_sve(FloatRegister dst, FloatRegister src, FloatRegister zero,
236 FloatRegister one, FloatRegister vtmp, PRegister pgtmp, SIMD_RegVariant T);
237
238 void verify_int_in_range(uint idx, const TypeInt* t, Register val, Register tmp);
239 void verify_long_in_range(uint idx, const TypeLong* t, Register val, Register tmp);
240
241 void reconstruct_frame_pointer(Register rtmp);
242
243 // Select from a table of two vectors
244 void select_from_two_vectors(FloatRegister dst, FloatRegister src1, FloatRegister src2,
245 FloatRegister index, FloatRegister tmp, BasicType bt,
246 unsigned vector_length_in_bytes);
247
248 void vector_expand_neon(FloatRegister dst, FloatRegister src, FloatRegister mask,
249 FloatRegister tmp1, FloatRegister tmp2, BasicType bt,
250 int vector_length_in_bytes);
251 void vector_expand_sve(FloatRegister dst, FloatRegister src, PRegister pg,
252 FloatRegister tmp1, FloatRegister tmp2, BasicType bt,
253 int vector_length_in_bytes);
254
255 void sve_cpy(FloatRegister dst, SIMD_RegVariant T, PRegister pg, int imm8,
256 bool isMerge);
257 int vector_iota_entry_index(BasicType bt);
258 #endif // CPU_AARCH64_C2_MACROASSEMBLER_AARCH64_HPP