1 /*
2 * Copyright (c) 2003, 2025, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.
8 *
9 * This code is distributed in the hope that it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12 * version 2 for more details (a copy is included in the LICENSE file that
13 * accompanied this code).
14 *
15 * You should have received a copy of the GNU General Public License version
16 * 2 along with this work; if not, write to the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18 *
19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20 * or visit www.oracle.com if you need additional information or have any
21 * questions.
22 *
23 */
24
25 #include "asm/macroAssembler.hpp"
26 #include "gc/shared/barrierSet.hpp"
27 #include "gc/shared/barrierSetAssembler.hpp"
28 #include "oops/objArrayKlass.hpp"
29 #include "runtime/sharedRuntime.hpp"
30 #include "runtime/stubRoutines.hpp"
31 #include "stubGenerator_x86_64.hpp"
32 #ifdef COMPILER2
33 #include "opto/c2_globals.hpp"
34 #endif
35 #if INCLUDE_JVMCI
36 #include "jvmci/jvmci_globals.hpp"
37 #endif
38
39 #define __ _masm->
40
41 #define TIMES_OOP (UseCompressedOops ? Address::times_4 : Address::times_8)
42
43 #ifdef PRODUCT
44 #define BLOCK_COMMENT(str) /* nothing */
45 #else
46 #define BLOCK_COMMENT(str) __ block_comment(str)
47 #endif // PRODUCT
48
49 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
50
51 #ifdef PRODUCT
52 #define INC_COUNTER_NP(counter, rscratch) ((void)0)
53 #else
54 #define INC_COUNTER_NP(counter, rscratch) \
55 BLOCK_COMMENT("inc_counter " #counter); \
56 inc_counter_np(_masm, counter, rscratch);
57
58 static void inc_counter_np(MacroAssembler* _masm, uint& counter, Register rscratch) {
59 __ incrementl(ExternalAddress((address)&counter), rscratch);
60 }
61
62 #if COMPILER2_OR_JVMCI
63 static uint& get_profile_ctr(int shift) {
64 if (shift == 0) {
65 return SharedRuntime::_jbyte_array_copy_ctr;
66 } else if (shift == 1) {
67 return SharedRuntime::_jshort_array_copy_ctr;
68 } else if (shift == 2) {
69 return SharedRuntime::_jint_array_copy_ctr;
70 } else {
71 assert(shift == 3, "");
72 return SharedRuntime::_jlong_array_copy_ctr;
73 }
74 }
75 #endif // COMPILER2_OR_JVMCI
76 #endif // !PRODUCT
77
78 void StubGenerator::generate_arraycopy_stubs() {
79 // Some copy stubs publish a normal entry and then a 2nd 'fallback'
80 // entry immediately following their stack push. This can be used
81 // as a post-push branch target for compatible stubs when they
82 // identify a special case that can be handled by the fallback
83 // stub e.g a disjoint copy stub may be use as a special case
84 // fallback for its compatible conjoint copy stub.
85 //
86 // A no push entry is always returned in the following local and
87 // then published by assigning to the appropriate entry field in
88 // class StubRoutines. The entry value is then passed to the
89 // generator for the compatible stub. That means the entry must be
90 // listed when saving to/restoring from the AOT cache, ensuring
91 // that the inter-stub jumps are noted at AOT-cache save and
92 // relocated at AOT cache load.
93 address nopush_entry;
94
95 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(&nopush_entry);
96 // disjoint nopush entry is needed by conjoint copy
97 StubRoutines::_jbyte_disjoint_arraycopy_nopush = nopush_entry;
98 StubRoutines::_jbyte_arraycopy = generate_conjoint_byte_copy(StubRoutines::_jbyte_disjoint_arraycopy_nopush, &nopush_entry);
99 // conjoint nopush entry is needed by generic/unsafe copy
100 StubRoutines::_jbyte_arraycopy_nopush = nopush_entry;
101
102 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(&nopush_entry);
103 // disjoint nopush entry is needed by conjoint copy
104 StubRoutines::_jshort_disjoint_arraycopy_nopush = nopush_entry;
105 StubRoutines::_jshort_arraycopy = generate_conjoint_short_copy(StubRoutines::_jshort_disjoint_arraycopy_nopush, &nopush_entry);
106 // conjoint nopush entry is needed by generic/unsafe copy
107 StubRoutines::_jshort_arraycopy_nopush = nopush_entry;
108
109 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_int_oop_copy(StubId::stubgen_jint_disjoint_arraycopy_id, &nopush_entry);
110 // disjoint nopush entry is needed by conjoint copy
111 StubRoutines::_jint_disjoint_arraycopy_nopush = nopush_entry;
112 StubRoutines::_jint_arraycopy = generate_conjoint_int_oop_copy(StubId::stubgen_jint_arraycopy_id, StubRoutines::_jint_disjoint_arraycopy_nopush, &nopush_entry);
113 // conjoint nopush entry is needed by generic/unsafe copy
114 StubRoutines::_jint_arraycopy_nopush = nopush_entry;
115
116 StubRoutines::_jlong_disjoint_arraycopy = generate_disjoint_long_oop_copy(StubId::stubgen_jlong_disjoint_arraycopy_id, &nopush_entry);
117 // disjoint nopush entry is needed by conjoint copy
118 StubRoutines::_jlong_disjoint_arraycopy_nopush = nopush_entry;
119 StubRoutines::_jlong_arraycopy = generate_conjoint_long_oop_copy(StubId::stubgen_jlong_arraycopy_id, StubRoutines::_jlong_disjoint_arraycopy_nopush, &nopush_entry);
120 // conjoint nopush entry is needed by generic/unsafe copy
121 StubRoutines::_jlong_arraycopy_nopush = nopush_entry;
122
123 if (UseCompressedOops) {
124 StubRoutines::_oop_disjoint_arraycopy = generate_disjoint_int_oop_copy(StubId::stubgen_oop_disjoint_arraycopy_id, &nopush_entry);
125 // disjoint nopush entry is needed by conjoint copy
126 StubRoutines::_oop_disjoint_arraycopy_nopush = nopush_entry;
127 StubRoutines::_oop_arraycopy = generate_conjoint_int_oop_copy(StubId::stubgen_oop_arraycopy_id, StubRoutines::_oop_disjoint_arraycopy_nopush, &nopush_entry);
128 // conjoint nopush entry is needed by generic/unsafe copy
129 StubRoutines::_oop_arraycopy_nopush = nopush_entry;
130 StubRoutines::_oop_disjoint_arraycopy_uninit = generate_disjoint_int_oop_copy(StubId::stubgen_oop_disjoint_arraycopy_uninit_id, &nopush_entry);
131 // disjoint nopush entry is needed by conjoint copy
132 StubRoutines::_oop_disjoint_arraycopy_uninit_nopush = nopush_entry;
133 // note that we don't need a returned nopush entry because the
134 // generic/unsafe copy does not cater for uninit arrays.
135 StubRoutines::_oop_arraycopy_uninit = generate_conjoint_int_oop_copy(StubId::stubgen_oop_arraycopy_uninit_id, StubRoutines::_oop_disjoint_arraycopy_uninit_nopush, nullptr);
136 } else {
137 StubRoutines::_oop_disjoint_arraycopy = generate_disjoint_long_oop_copy(StubId::stubgen_oop_disjoint_arraycopy_id, &nopush_entry);
138 // disjoint nopush entry is needed by conjoint copy
139 StubRoutines::_oop_disjoint_arraycopy_nopush = nopush_entry;
140 StubRoutines::_oop_arraycopy = generate_conjoint_long_oop_copy(StubId::stubgen_oop_arraycopy_id, StubRoutines::_oop_disjoint_arraycopy_nopush, &nopush_entry);
141 // conjoint nopush entry is needed by generic/unsafe copy
142 StubRoutines::_oop_arraycopy_nopush = nopush_entry;
143 StubRoutines::_oop_disjoint_arraycopy_uninit = generate_disjoint_long_oop_copy(StubId::stubgen_oop_disjoint_arraycopy_uninit_id, &nopush_entry);
144 // disjoint nopush entry is needed by conjoint copy
145 StubRoutines::_oop_disjoint_arraycopy_uninit_nopush = nopush_entry;
146 // note that we don't need a returned nopush entry because the
147 // generic/unsafe copy does not cater for uninit arrays.
148 StubRoutines::_oop_arraycopy_uninit = generate_conjoint_long_oop_copy(StubId::stubgen_oop_arraycopy_uninit_id, StubRoutines::_oop_disjoint_arraycopy_uninit_nopush, nullptr);
149 }
150
151 StubRoutines::_checkcast_arraycopy = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_id, &nopush_entry);
152 // checkcast nopush entry is needed by generic copy
153 StubRoutines::_checkcast_arraycopy_nopush = nopush_entry;
154 // note that we don't need a returned nopush entry because the
155 // generic copy does not cater for uninit arrays.
156 StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_uninit_id, nullptr);
157
158 StubRoutines::_unsafe_arraycopy = generate_unsafe_copy(StubRoutines::_jbyte_arraycopy_nopush,
159 StubRoutines::_jshort_arraycopy_nopush,
160 StubRoutines::_jint_arraycopy_nopush,
161 StubRoutines::_jlong_arraycopy_nopush);
162 StubRoutines::_generic_arraycopy = generate_generic_copy(StubRoutines::_jbyte_arraycopy_nopush,
163 StubRoutines::_jshort_arraycopy_nopush,
164 StubRoutines::_jint_arraycopy_nopush,
165 StubRoutines::_oop_arraycopy_nopush,
166 StubRoutines::_jlong_arraycopy_nopush,
167 StubRoutines::_checkcast_arraycopy_nopush);
168
169 StubRoutines::_jbyte_fill = generate_fill(StubId::stubgen_jbyte_fill_id);
170 StubRoutines::_jshort_fill = generate_fill(StubId::stubgen_jshort_fill_id);
171 StubRoutines::_jint_fill = generate_fill(StubId::stubgen_jint_fill_id);
172 StubRoutines::_arrayof_jbyte_fill = generate_fill(StubId::stubgen_arrayof_jbyte_fill_id);
173 StubRoutines::_arrayof_jshort_fill = generate_fill(StubId::stubgen_arrayof_jshort_fill_id);
174 StubRoutines::_arrayof_jint_fill = generate_fill(StubId::stubgen_arrayof_jint_fill_id);
175
176 StubRoutines::_unsafe_setmemory = generate_unsafe_setmemory(StubRoutines::_jbyte_fill);
177
178 // We don't generate specialized code for HeapWord-aligned source
179 // arrays, so just use the code we've already generated
180 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = StubRoutines::_jbyte_disjoint_arraycopy;
181 StubRoutines::_arrayof_jbyte_arraycopy = StubRoutines::_jbyte_arraycopy;
182
183 StubRoutines::_arrayof_jshort_disjoint_arraycopy = StubRoutines::_jshort_disjoint_arraycopy;
184 StubRoutines::_arrayof_jshort_arraycopy = StubRoutines::_jshort_arraycopy;
185
186 StubRoutines::_arrayof_jint_disjoint_arraycopy = StubRoutines::_jint_disjoint_arraycopy;
187 StubRoutines::_arrayof_jint_arraycopy = StubRoutines::_jint_arraycopy;
188
189 StubRoutines::_arrayof_jlong_disjoint_arraycopy = StubRoutines::_jlong_disjoint_arraycopy;
190 StubRoutines::_arrayof_jlong_arraycopy = StubRoutines::_jlong_arraycopy;
191
192 StubRoutines::_arrayof_oop_disjoint_arraycopy = StubRoutines::_oop_disjoint_arraycopy;
193 StubRoutines::_arrayof_oop_arraycopy = StubRoutines::_oop_arraycopy;
194
195 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit = StubRoutines::_oop_disjoint_arraycopy_uninit;
196 StubRoutines::_arrayof_oop_arraycopy_uninit = StubRoutines::_oop_arraycopy_uninit;
197 }
198
199
200 // Verify that a register contains clean 32-bits positive value
201 // (high 32-bits are 0) so it could be used in 64-bits shifts.
202 //
203 // Input:
204 // Rint - 32-bits value
205 // Rtmp - scratch
206 //
207 void StubGenerator::assert_clean_int(Register Rint, Register Rtmp) {
208 #ifdef ASSERT
209 Label L;
210 assert_different_registers(Rtmp, Rint);
211 __ movslq(Rtmp, Rint);
212 __ cmpq(Rtmp, Rint);
213 __ jcc(Assembler::equal, L);
214 __ stop("high 32-bits of int value are not 0");
215 __ bind(L);
216 #endif
217 }
218
219
220 // Generate overlap test for array copy stubs
221 //
222 // Input:
223 // c_rarg0 - from
224 // c_rarg1 - to
225 // c_rarg2 - element count
226 //
227 // Output:
228 // rax - &from[element count - 1]
229 //
230 void StubGenerator::array_overlap_test(address no_overlap_target, Label* NOLp, Address::ScaleFactor sf) {
231 const Register from = c_rarg0;
232 const Register to = c_rarg1;
233 const Register count = c_rarg2;
234 const Register end_from = rax;
235
236 __ cmpptr(to, from);
237 __ lea(end_from, Address(from, count, sf, 0));
238 if (NOLp == nullptr) {
239 RuntimeAddress no_overlap(no_overlap_target);
240 __ jump_cc(Assembler::belowEqual, no_overlap);
241 __ cmpptr(to, end_from);
242 __ jump_cc(Assembler::aboveEqual, no_overlap);
243 } else {
244 __ jcc(Assembler::belowEqual, (*NOLp));
245 __ cmpptr(to, end_from);
246 __ jcc(Assembler::aboveEqual, (*NOLp));
247 }
248 }
249
250
251 // Copy big chunks forward
252 //
253 // Inputs:
254 // end_from - source arrays end address
255 // end_to - destination array end address
256 // qword_count - 64-bits element count, negative
257 // tmp1 - scratch
258 // L_copy_bytes - entry label
259 // L_copy_8_bytes - exit label
260 //
261 void StubGenerator::copy_bytes_forward(Register end_from, Register end_to,
262 Register qword_count, Register tmp1,
263 Register tmp2, Label& L_copy_bytes,
264 Label& L_copy_8_bytes, DecoratorSet decorators,
265 BasicType type) {
266 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
267 DEBUG_ONLY(__ stop("enter at entry label, not here"));
268 Label L_loop;
269 __ align(OptoLoopAlignment);
270 if (UseUnalignedLoadStores) {
271 Label L_end;
272 __ BIND(L_loop);
273 if (UseAVX >= 2) {
274 bs->copy_load_at(_masm, decorators, type, 32,
275 xmm0, Address(end_from, qword_count, Address::times_8, -56),
276 tmp1, xmm1);
277 bs->copy_store_at(_masm, decorators, type, 32,
278 Address(end_to, qword_count, Address::times_8, -56), xmm0,
279 tmp1, tmp2, xmm1);
280
281 bs->copy_load_at(_masm, decorators, type, 32,
282 xmm0, Address(end_from, qword_count, Address::times_8, -24),
283 tmp1, xmm1);
284 bs->copy_store_at(_masm, decorators, type, 32,
285 Address(end_to, qword_count, Address::times_8, -24), xmm0,
286 tmp1, tmp2, xmm1);
287 } else {
288 bs->copy_load_at(_masm, decorators, type, 16,
289 xmm0, Address(end_from, qword_count, Address::times_8, -56),
290 tmp1, xmm1);
291 bs->copy_store_at(_masm, decorators, type, 16,
292 Address(end_to, qword_count, Address::times_8, -56), xmm0,
293 tmp1, tmp2, xmm1);
294 bs->copy_load_at(_masm, decorators, type, 16,
295 xmm0, Address(end_from, qword_count, Address::times_8, -40),
296 tmp1, xmm1);
297 bs->copy_store_at(_masm, decorators, type, 16,
298 Address(end_to, qword_count, Address::times_8, -40), xmm0,
299 tmp1, tmp2, xmm1);
300 bs->copy_load_at(_masm, decorators, type, 16,
301 xmm0, Address(end_from, qword_count, Address::times_8, -24),
302 tmp1, xmm1);
303 bs->copy_store_at(_masm, decorators, type, 16,
304 Address(end_to, qword_count, Address::times_8, -24), xmm0,
305 tmp1, tmp2, xmm1);
306 bs->copy_load_at(_masm, decorators, type, 16,
307 xmm0, Address(end_from, qword_count, Address::times_8, -8),
308 tmp1, xmm1);
309 bs->copy_store_at(_masm, decorators, type, 16,
310 Address(end_to, qword_count, Address::times_8, -8), xmm0,
311 tmp1, tmp2, xmm1);
312 }
313
314 __ BIND(L_copy_bytes);
315 __ addptr(qword_count, 8);
316 __ jcc(Assembler::lessEqual, L_loop);
317 __ subptr(qword_count, 4); // sub(8) and add(4)
318 __ jcc(Assembler::greater, L_end);
319 // Copy trailing 32 bytes
320 if (UseAVX >= 2) {
321 bs->copy_load_at(_masm, decorators, type, 32,
322 xmm0, Address(end_from, qword_count, Address::times_8, -24),
323 tmp1, xmm1);
324 bs->copy_store_at(_masm, decorators, type, 32,
325 Address(end_to, qword_count, Address::times_8, -24), xmm0,
326 tmp1, tmp2, xmm1);
327 } else {
328 bs->copy_load_at(_masm, decorators, type, 16,
329 xmm0, Address(end_from, qword_count, Address::times_8, -24),
330 tmp1, xmm1);
331 bs->copy_store_at(_masm, decorators, type, 16,
332 Address(end_to, qword_count, Address::times_8, -24), xmm0,
333 tmp1, tmp2, xmm1);
334 bs->copy_load_at(_masm, decorators, type, 16,
335 xmm0, Address(end_from, qword_count, Address::times_8, -8),
336 tmp1, xmm1);
337 bs->copy_store_at(_masm, decorators, type, 16,
338 Address(end_to, qword_count, Address::times_8, -8), xmm0,
339 tmp1, tmp2, xmm1);
340 }
341 __ addptr(qword_count, 4);
342 __ BIND(L_end);
343 } else {
344 // Copy 32-bytes per iteration
345 __ BIND(L_loop);
346 bs->copy_load_at(_masm, decorators, type, 8,
347 tmp1, Address(end_from, qword_count, Address::times_8, -24),
348 tmp2);
349 bs->copy_store_at(_masm, decorators, type, 8,
350 Address(end_to, qword_count, Address::times_8, -24), tmp1,
351 tmp2);
352 bs->copy_load_at(_masm, decorators, type, 8,
353 tmp1, Address(end_from, qword_count, Address::times_8, -16),
354 tmp2);
355 bs->copy_store_at(_masm, decorators, type, 8,
356 Address(end_to, qword_count, Address::times_8, -16), tmp1,
357 tmp2);
358 bs->copy_load_at(_masm, decorators, type, 8,
359 tmp1, Address(end_from, qword_count, Address::times_8, -8),
360 tmp2);
361 bs->copy_store_at(_masm, decorators, type, 8,
362 Address(end_to, qword_count, Address::times_8, -8), tmp1,
363 tmp2);
364 bs->copy_load_at(_masm, decorators, type, 8,
365 tmp1, Address(end_from, qword_count, Address::times_8, 0),
366 tmp2);
367 bs->copy_store_at(_masm, decorators, type, 8,
368 Address(end_to, qword_count, Address::times_8, 0), tmp1,
369 tmp2);
370
371 __ BIND(L_copy_bytes);
372 __ addptr(qword_count, 4);
373 __ jcc(Assembler::lessEqual, L_loop);
374 }
375 __ subptr(qword_count, 4);
376 __ jcc(Assembler::less, L_copy_8_bytes); // Copy trailing qwords
377 }
378
379
380 // Copy big chunks backward
381 //
382 // Inputs:
383 // from - source arrays address
384 // dest - destination array address
385 // qword_count - 64-bits element count
386 // tmp1 - scratch
387 // L_copy_bytes - entry label
388 // L_copy_8_bytes - exit label
389 //
390 void StubGenerator::copy_bytes_backward(Register from, Register dest,
391 Register qword_count, Register tmp1,
392 Register tmp2, Label& L_copy_bytes,
393 Label& L_copy_8_bytes, DecoratorSet decorators,
394 BasicType type) {
395 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
396 DEBUG_ONLY(__ stop("enter at entry label, not here"));
397 Label L_loop;
398 __ align(OptoLoopAlignment);
399 if (UseUnalignedLoadStores) {
400 Label L_end;
401 __ BIND(L_loop);
402 if (UseAVX >= 2) {
403 bs->copy_load_at(_masm, decorators, type, 32,
404 xmm0, Address(from, qword_count, Address::times_8, 32),
405 tmp1, xmm1);
406 bs->copy_store_at(_masm, decorators, type, 32,
407 Address(dest, qword_count, Address::times_8, 32), xmm0,
408 tmp1, tmp2, xmm1);
409 bs->copy_load_at(_masm, decorators, type, 32,
410 xmm0, Address(from, qword_count, Address::times_8, 0),
411 tmp1, xmm1);
412 bs->copy_store_at(_masm, decorators, type, 32,
413 Address(dest, qword_count, Address::times_8, 0), xmm0,
414 tmp1, tmp2, xmm1);
415 } else {
416 bs->copy_load_at(_masm, decorators, type, 16,
417 xmm0, Address(from, qword_count, Address::times_8, 48),
418 tmp1, xmm1);
419 bs->copy_store_at(_masm, decorators, type, 16,
420 Address(dest, qword_count, Address::times_8, 48), xmm0,
421 tmp1, tmp2, xmm1);
422 bs->copy_load_at(_masm, decorators, type, 16,
423 xmm0, Address(from, qword_count, Address::times_8, 32),
424 tmp1, xmm1);
425 bs->copy_store_at(_masm, decorators, type, 16,
426 Address(dest, qword_count, Address::times_8, 32), xmm0,
427 tmp1, tmp2, xmm1);
428 bs->copy_load_at(_masm, decorators, type, 16,
429 xmm0, Address(from, qword_count, Address::times_8, 16),
430 tmp1, xmm1);
431 bs->copy_store_at(_masm, decorators, type, 16,
432 Address(dest, qword_count, Address::times_8, 16), xmm0,
433 tmp1, tmp2, xmm1);
434 bs->copy_load_at(_masm, decorators, type, 16,
435 xmm0, Address(from, qword_count, Address::times_8, 0),
436 tmp1, xmm1);
437 bs->copy_store_at(_masm, decorators, type, 16,
438 Address(dest, qword_count, Address::times_8, 0), xmm0,
439 tmp1, tmp2, xmm1);
440 }
441
442 __ BIND(L_copy_bytes);
443 __ subptr(qword_count, 8);
444 __ jcc(Assembler::greaterEqual, L_loop);
445
446 __ addptr(qword_count, 4); // add(8) and sub(4)
447 __ jcc(Assembler::less, L_end);
448 // Copy trailing 32 bytes
449 if (UseAVX >= 2) {
450 bs->copy_load_at(_masm, decorators, type, 32,
451 xmm0, Address(from, qword_count, Address::times_8, 0),
452 tmp1, xmm1);
453 bs->copy_store_at(_masm, decorators, type, 32,
454 Address(dest, qword_count, Address::times_8, 0), xmm0,
455 tmp1, tmp2, xmm1);
456 } else {
457 bs->copy_load_at(_masm, decorators, type, 16,
458 xmm0, Address(from, qword_count, Address::times_8, 16),
459 tmp1, xmm1);
460 bs->copy_store_at(_masm, decorators, type, 16,
461 Address(dest, qword_count, Address::times_8, 16), xmm0,
462 tmp1, tmp2, xmm1);
463 bs->copy_load_at(_masm, decorators, type, 16,
464 xmm0, Address(from, qword_count, Address::times_8, 0),
465 tmp1, xmm1);
466 bs->copy_store_at(_masm, decorators, type, 16,
467 Address(dest, qword_count, Address::times_8, 0), xmm0,
468 tmp1, tmp2, xmm1);
469 }
470 __ subptr(qword_count, 4);
471 __ BIND(L_end);
472 } else {
473 // Copy 32-bytes per iteration
474 __ BIND(L_loop);
475 bs->copy_load_at(_masm, decorators, type, 8,
476 tmp1, Address(from, qword_count, Address::times_8, 24),
477 tmp2);
478 bs->copy_store_at(_masm, decorators, type, 8,
479 Address(dest, qword_count, Address::times_8, 24), tmp1,
480 tmp2);
481 bs->copy_load_at(_masm, decorators, type, 8,
482 tmp1, Address(from, qword_count, Address::times_8, 16),
483 tmp2);
484 bs->copy_store_at(_masm, decorators, type, 8,
485 Address(dest, qword_count, Address::times_8, 16), tmp1,
486 tmp2);
487 bs->copy_load_at(_masm, decorators, type, 8,
488 tmp1, Address(from, qword_count, Address::times_8, 8),
489 tmp2);
490 bs->copy_store_at(_masm, decorators, type, 8,
491 Address(dest, qword_count, Address::times_8, 8), tmp1,
492 tmp2);
493 bs->copy_load_at(_masm, decorators, type, 8,
494 tmp1, Address(from, qword_count, Address::times_8, 0),
495 tmp2);
496 bs->copy_store_at(_masm, decorators, type, 8,
497 Address(dest, qword_count, Address::times_8, 0), tmp1,
498 tmp2);
499
500 __ BIND(L_copy_bytes);
501 __ subptr(qword_count, 4);
502 __ jcc(Assembler::greaterEqual, L_loop);
503 }
504 __ addptr(qword_count, 4);
505 __ jcc(Assembler::greater, L_copy_8_bytes); // Copy trailing qwords
506 }
507
508 #if COMPILER2_OR_JVMCI
509
510 // Note: Following rules apply to AVX3 optimized arraycopy stubs:-
511 // - If target supports AVX3 features (BW+VL+F) then implementation uses 32 byte vectors (YMMs)
512 // for both special cases (various small block sizes) and aligned copy loop. This is the
513 // default configuration.
514 // - If copy length is above CopyAVX3Threshold, then implementation use 64 byte vectors (ZMMs)
515 // for main copy loop (and subsequent tail) since bulk of the cycles will be consumed in it.
516 // - If user forces MaxVectorSize=32 then above 4096 bytes its seen that REP MOVs shows a
517 // better performance for disjoint copies. For conjoint/backward copy vector based
518 // copy performs better.
519 // - If user sets CopyAVX3Threshold=0, then special cases for small blocks sizes operate over
520 // 64 byte vector registers (ZMMs).
521
522 // Inputs:
523 // c_rarg0 - source array address
524 // c_rarg1 - destination array address
525 // c_rarg2 - element count, treated as ssize_t, can be zero
526 //
527 //
528 // Side Effects:
529 // disjoint_copy_avx3_masked is set to the no-overlap entry point
530 // used by generate_conjoint_[byte/int/short/long]_copy().
531 //
532 address StubGenerator::generate_disjoint_copy_avx3_masked(StubId stub_id, address* entry) {
533 // aligned is always false -- x86_64 always uses the unaligned code
534 const bool aligned = false;
535 int shift;
536 bool is_oop;
537 bool dest_uninitialized;
538
539 switch (stub_id) {
540 case StubId::stubgen_jbyte_disjoint_arraycopy_id:
541 shift = 0;
542 is_oop = false;
543 dest_uninitialized = false;
544 break;
545 case StubId::stubgen_jshort_disjoint_arraycopy_id:
546 shift = 1;
547 is_oop = false;
548 dest_uninitialized = false;
549 break;
550 case StubId::stubgen_jint_disjoint_arraycopy_id:
551 shift = 2;
552 is_oop = false;
553 dest_uninitialized = false;
554 break;
555 case StubId::stubgen_jlong_disjoint_arraycopy_id:
556 shift = 3;
557 is_oop = false;
558 dest_uninitialized = false;
559 break;
560 case StubId::stubgen_oop_disjoint_arraycopy_id:
561 shift = (UseCompressedOops ? 2 : 3);
562 is_oop = true;
563 dest_uninitialized = false;
564 break;
565 case StubId::stubgen_oop_disjoint_arraycopy_uninit_id:
566 shift = (UseCompressedOops ? 2 : 3);
567 is_oop = true;
568 dest_uninitialized = true;
569 break;
570 default:
571 ShouldNotReachHere();
572 }
573 GrowableArray<address> entries;
574 GrowableArray<address> extras;
575 bool add_handlers = !is_oop && !aligned;
576 bool add_relocs = UseZGC && is_oop;
577 bool add_extras = add_handlers || add_relocs;
578 // The stub employs one unsafe handler region by default but has two
579 // when MaxVectorSize == 64 So we may expect 0, 3 or 6 extras.
580 int handlers_count = (MaxVectorSize == 64 ? 2 : 1);
581 int expected_entry_count = (entry != nullptr ? 2 : 1);
582 int expected_extra_count = (add_handlers ? handlers_count : 0) * UnsafeMemoryAccess::COLUMN_COUNT; // 0/1/2 x UMAM {start,end,handler}
583 int entry_count = StubInfo::entry_count(stub_id);
584 assert(entry_count == expected_entry_count, "sanity check");
585 GrowableArray<address>* entries_ptr = (entry_count == 1 ? nullptr : &entries);
586 GrowableArray<address>* extras_ptr = (add_extras ? &extras : nullptr);
587 address start = load_archive_data(stub_id, entries_ptr, extras_ptr);
588 if (start != nullptr) {
589 assert(entries.length() == expected_entry_count - 1,
590 "unexpected entry count %d", entries.length());
591 assert(!add_handlers || extras.length() == expected_extra_count,
592 "unexpected handler addresses count %d", extras.length());
593 if (entry != nullptr) {
594 *entry = entries.at(0);
595 }
596 if (add_handlers) {
597 // restore 1/2 x UMAM {start,end,handler} addresses from extras
598 register_unsafe_access_handlers(extras, 0, handlers_count);
599 }
600 #if INCLUDE_ZGC
601 // register addresses at which ZGC does colour patching
602 if (add_relocs) {
603 register_reloc_addresses(extras, 0, extras.length());
604 }
605 #endif // INCLUDE_ZGC
606 return start;
607 }
608
609 __ align(CodeEntryAlignment);
610 StubCodeMark mark(this, stub_id);
611 start = __ pc();
612
613 bool use64byteVector = (MaxVectorSize > 32) && (CopyAVX3Threshold == 0);
614 const int large_threshold = 2621440; // 2.5 MB
615 Label L_main_loop, L_main_loop_64bytes, L_tail, L_tail64, L_exit, L_entry;
616 Label L_repmovs, L_main_pre_loop, L_main_pre_loop_64bytes, L_pre_main_post_64;
617 Label L_copy_large, L_finish;
618 const Register from = rdi; // source array address
619 const Register to = rsi; // destination array address
620 const Register count = rdx; // elements count
621 const Register temp1 = r8;
622 const Register temp2 = r11;
623 const Register temp3 = rax;
624 const Register temp4 = rcx;
625 // End pointers are inclusive, and if count is not zero they point
626 // to the last unit copied: end_to[0] := end_from[0]
627
628 __ enter(); // required for proper stackwalking of RuntimeStub frame
629 assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int.
630
631 if (entry != nullptr) {
632 *entry = __ pc();
633 entries.append(*entry);
634 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
635 BLOCK_COMMENT("Entry:");
636 }
637
638 BasicType type_vec[] = { T_BYTE, T_SHORT, T_INT, T_LONG};
639 BasicType type = is_oop ? T_OBJECT : type_vec[shift];
640
641 setup_argument_regs(type);
642
643 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
644 if (dest_uninitialized) {
645 decorators |= IS_DEST_UNINITIALIZED;
646 }
647 if (aligned) {
648 decorators |= ARRAYCOPY_ALIGNED;
649 }
650 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
651 bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
652
653 {
654 // Type(shift) byte(0), short(1), int(2), long(3)
655 int loop_size[] = { 192, 96, 48, 24};
656 int threshold[] = { 4096, 2048, 1024, 512};
657
658 // UnsafeMemoryAccess page error: continue after unsafe access
659 UnsafeMemoryAccessMark umam(this, add_handlers, true);
660 // 'from', 'to' and 'count' are now valid
661
662 // temp1 holds remaining count and temp4 holds running count used to compute
663 // next address offset for start of to/from addresses (temp4 * scale).
664 __ mov64(temp4, 0);
665 __ movq(temp1, count);
666
667 // Zero length check.
668 __ BIND(L_tail);
669 __ cmpq(temp1, 0);
670 __ jcc(Assembler::lessEqual, L_exit);
671
672 // Special cases using 32 byte [masked] vector copy operations.
673 arraycopy_avx3_special_cases(xmm1, k2, from, to, temp1, shift,
674 temp4, temp3, use64byteVector, L_entry, L_exit);
675
676 // PRE-MAIN-POST loop for aligned copy.
677 __ BIND(L_entry);
678
679 if (MaxVectorSize == 64) {
680 __ movq(temp2, temp1);
681 __ shlq(temp2, shift);
682 __ cmpq(temp2, large_threshold);
683 __ jcc(Assembler::greaterEqual, L_copy_large);
684 }
685 if (CopyAVX3Threshold != 0) {
686 __ cmpq(count, threshold[shift]);
687 if (MaxVectorSize == 64) {
688 // Copy using 64 byte vectors.
689 __ jcc(Assembler::greaterEqual, L_pre_main_post_64);
690 } else {
691 assert(MaxVectorSize < 64, "vector size should be < 64 bytes");
692 // REP MOVS offer a faster copy path.
693 __ jcc(Assembler::greaterEqual, L_repmovs);
694 }
695 }
696
697 if ((MaxVectorSize < 64) || (CopyAVX3Threshold != 0)) {
698 // Partial copy to make dst address 32 byte aligned.
699 __ movq(temp2, to);
700 __ andq(temp2, 31);
701 __ jcc(Assembler::equal, L_main_pre_loop);
702
703 __ negptr(temp2);
704 __ addq(temp2, 32);
705 if (shift) {
706 __ shrq(temp2, shift);
707 }
708 __ movq(temp3, temp2);
709 copy32_masked_avx(to, from, xmm1, k2, temp3, temp4, temp1, shift);
710 __ movq(temp4, temp2);
711 __ movq(temp1, count);
712 __ subq(temp1, temp2);
713
714 __ cmpq(temp1, loop_size[shift]);
715 __ jcc(Assembler::less, L_tail);
716
717 __ BIND(L_main_pre_loop);
718 __ subq(temp1, loop_size[shift]);
719
720 // Main loop with aligned copy block size of 192 bytes at 32 byte granularity.
721 __ align32();
722 __ BIND(L_main_loop);
723 copy64_avx(to, from, temp4, xmm1, false, shift, 0);
724 copy64_avx(to, from, temp4, xmm1, false, shift, 64);
725 copy64_avx(to, from, temp4, xmm1, false, shift, 128);
726 __ addptr(temp4, loop_size[shift]);
727 __ subq(temp1, loop_size[shift]);
728 __ jcc(Assembler::greater, L_main_loop);
729
730 __ addq(temp1, loop_size[shift]);
731
732 // Tail loop.
733 __ jmp(L_tail);
734
735 __ BIND(L_repmovs);
736 __ movq(temp2, temp1);
737 // Swap to(RSI) and from(RDI) addresses to comply with REP MOVs semantics.
738 __ movq(temp3, to);
739 __ movq(to, from);
740 __ movq(from, temp3);
741 // Save to/from for restoration post rep_mov.
742 __ movq(temp1, to);
743 __ movq(temp3, from);
744 if(shift < 3) {
745 __ shrq(temp2, 3-shift); // quad word count
746 }
747 __ movq(temp4 , temp2); // move quad ward count into temp4(RCX).
748 __ rep_mov();
749 __ shlq(temp2, 3); // convert quad words into byte count.
750 if(shift) {
751 __ shrq(temp2, shift); // type specific count.
752 }
753 // Restore original addresses in to/from.
754 __ movq(to, temp3);
755 __ movq(from, temp1);
756 __ movq(temp4, temp2);
757 __ movq(temp1, count);
758 __ subq(temp1, temp2); // tailing part (less than a quad ward size).
759 __ jmp(L_tail);
760 }
761
762 if (MaxVectorSize > 32) {
763 __ BIND(L_pre_main_post_64);
764 // Partial copy to make dst address 64 byte aligned.
765 __ movq(temp2, to);
766 __ andq(temp2, 63);
767 __ jcc(Assembler::equal, L_main_pre_loop_64bytes);
768
769 __ negptr(temp2);
770 __ addq(temp2, 64);
771 if (shift) {
772 __ shrq(temp2, shift);
773 }
774 __ movq(temp3, temp2);
775 copy64_masked_avx(to, from, xmm1, k2, temp3, temp4, temp1, shift, 0 , true);
776 __ movq(temp4, temp2);
777 __ movq(temp1, count);
778 __ subq(temp1, temp2);
779
780 __ cmpq(temp1, loop_size[shift]);
781 __ jcc(Assembler::less, L_tail64);
782
783 __ BIND(L_main_pre_loop_64bytes);
784 __ subq(temp1, loop_size[shift]);
785
786 // Main loop with aligned copy block size of 192 bytes at
787 // 64 byte copy granularity.
788 __ align32();
789 __ BIND(L_main_loop_64bytes);
790 copy64_avx(to, from, temp4, xmm1, false, shift, 0 , true);
791 copy64_avx(to, from, temp4, xmm1, false, shift, 64, true);
792 copy64_avx(to, from, temp4, xmm1, false, shift, 128, true);
793 __ addptr(temp4, loop_size[shift]);
794 __ subq(temp1, loop_size[shift]);
795 __ jcc(Assembler::greater, L_main_loop_64bytes);
796
797 __ addq(temp1, loop_size[shift]);
798 // Zero length check.
799 __ jcc(Assembler::lessEqual, L_exit);
800
801 __ BIND(L_tail64);
802
803 // Tail handling using 64 byte [masked] vector copy operations.
804 use64byteVector = true;
805 arraycopy_avx3_special_cases(xmm1, k2, from, to, temp1, shift,
806 temp4, temp3, use64byteVector, L_entry, L_exit);
807 }
808 __ BIND(L_exit);
809 }
810
811 __ BIND(L_finish);
812 address ucme_exit_pc = __ pc();
813 // When called from generic_arraycopy r11 contains specific values
814 // used during arraycopy epilogue, re-initializing r11.
815 if (is_oop) {
816 __ movq(r11, shift == 3 ? count : to);
817 }
818 bs->arraycopy_epilogue(_masm, decorators, type, from, to, count);
819 restore_argument_regs(type);
820 INC_COUNTER_NP(get_profile_ctr(shift), rscratch1); // Update counter after rscratch1 is free
821 __ xorptr(rax, rax); // return 0
822 __ vzeroupper();
823 __ leave(); // required for proper stackwalking of RuntimeStub frame
824 __ ret(0);
825
826 if (MaxVectorSize == 64) {
827 __ BIND(L_copy_large);
828 UnsafeMemoryAccessMark umam(this, add_handlers, false, ucme_exit_pc);
829 arraycopy_avx3_large(to, from, temp1, temp2, temp3, temp4, count, xmm1, xmm2, xmm3, xmm4, shift);
830 __ jmp(L_finish);
831 }
832 // retrieve the registered handler addresses
833 address end = __ pc();
834 if (add_handlers) {
835 retrieve_unsafe_access_handlers(start, end, extras);
836 }
837 assert(extras.length() == expected_extra_count,
838 "unexpected handler addresses count %d", extras.length());
839 #if INCLUDE_ZGC
840 // retrieve addresses at which ZGC does colour patching
841 if (add_relocs) {
842 retrieve_reloc_addresses(start, end, extras);
843 }
844 #endif // INCLUDE_ZGC
845
846 // record the stub entry and end plus the no_push entry and any
847 // extra handler addresses
848 store_archive_data(stub_id, start, end, entries_ptr, extras_ptr);
849
850 return start;
851 }
852
853 void StubGenerator::arraycopy_avx3_large(Register to, Register from, Register temp1, Register temp2,
854 Register temp3, Register temp4, Register count,
855 XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3,
856 XMMRegister xmm4, int shift) {
857
858 // Type(shift) byte(0), short(1), int(2), long(3)
859 int loop_size[] = { 256, 128, 64, 32};
860 int threshold[] = { 4096, 2048, 1024, 512};
861
862 Label L_main_loop_large;
863 Label L_tail_large;
864 Label L_exit_large;
865 Label L_entry_large;
866 Label L_main_pre_loop_large;
867 Label L_pre_main_post_large;
868
869 assert(MaxVectorSize == 64, "vector length != 64");
870 __ BIND(L_entry_large);
871
872 __ BIND(L_pre_main_post_large);
873 // Partial copy to make dst address 64 byte aligned.
874 __ movq(temp2, to);
875 __ andq(temp2, 63);
876 __ jcc(Assembler::equal, L_main_pre_loop_large);
877
878 __ negptr(temp2);
879 __ addq(temp2, 64);
880 if (shift) {
881 __ shrq(temp2, shift);
882 }
883 __ movq(temp3, temp2);
884 copy64_masked_avx(to, from, xmm1, k2, temp3, temp4, temp1, shift, 0, true);
885 __ movq(temp4, temp2);
886 __ movq(temp1, count);
887 __ subq(temp1, temp2);
888
889 __ cmpq(temp1, loop_size[shift]);
890 __ jcc(Assembler::less, L_tail_large);
891
892 __ BIND(L_main_pre_loop_large);
893 __ subq(temp1, loop_size[shift]);
894
895 // Main loop with aligned copy block size of 256 bytes at 64 byte copy granularity.
896 __ align32();
897 __ BIND(L_main_loop_large);
898 copy256_avx3(to, from, temp4, xmm1, xmm2, xmm3, xmm4, shift, 0);
899 __ addptr(temp4, loop_size[shift]);
900 __ subq(temp1, loop_size[shift]);
901 __ jcc(Assembler::greater, L_main_loop_large);
902 // fence needed because copy256_avx3 uses non-temporal stores
903 __ sfence();
904
905 __ addq(temp1, loop_size[shift]);
906 // Zero length check.
907 __ jcc(Assembler::lessEqual, L_exit_large);
908 __ BIND(L_tail_large);
909 // Tail handling using 64 byte [masked] vector copy operations.
910 __ cmpq(temp1, 0);
911 __ jcc(Assembler::lessEqual, L_exit_large);
912 arraycopy_avx3_special_cases_256(xmm1, k2, from, to, temp1, shift,
913 temp4, temp3, L_exit_large);
914 __ BIND(L_exit_large);
915 }
916
917 // Inputs:
918 // c_rarg0 - source array address
919 // c_rarg1 - destination array address
920 // c_rarg2 - element count, treated as ssize_t, can be zero
921 //
922 //
923 address StubGenerator::generate_conjoint_copy_avx3_masked(StubId stub_id, address* entry, address nooverlap_target) {
924 // aligned is always false -- x86_64 always uses the unaligned code
925 const bool aligned = false;
926 int shift;
927 bool is_oop;
928 bool dest_uninitialized;
929
930 switch (stub_id) {
931 case StubId::stubgen_jbyte_arraycopy_id:
932 shift = 0;
933 is_oop = false;
934 dest_uninitialized = false;
935 break;
936 case StubId::stubgen_jshort_arraycopy_id:
937 shift = 1;
938 is_oop = false;
939 dest_uninitialized = false;
940 break;
941 case StubId::stubgen_jint_arraycopy_id:
942 shift = 2;
943 is_oop = false;
944 dest_uninitialized = false;
945 break;
946 case StubId::stubgen_jlong_arraycopy_id:
947 shift = 3;
948 is_oop = false;
949 dest_uninitialized = false;
950 break;
951 case StubId::stubgen_oop_arraycopy_id:
952 shift = (UseCompressedOops ? 2 : 3);
953 is_oop = true;
954 dest_uninitialized = false;
955 break;
956 case StubId::stubgen_oop_arraycopy_uninit_id:
957 shift = (UseCompressedOops ? 2 : 3);
958 is_oop = true;
959 dest_uninitialized = true;
960 break;
961 default:
962 ShouldNotReachHere();
963 }
964 GrowableArray<address> entries;
965 GrowableArray<address> extras;
966 bool add_handlers = !is_oop && !aligned;
967 bool add_relocs = UseZGC && is_oop;
968 bool add_extras = add_handlers || add_relocs;
969 int expected_entry_count = (entry != nullptr ? 2 : 1);
970 int expected_handler_count = (add_handlers ? 1 : 0) * UnsafeMemoryAccess::COLUMN_COUNT; // 0/1 x UMAM {start,end,handler}
971 int entry_count = StubInfo::entry_count(stub_id);
972 assert(entry_count == expected_entry_count, "sanity check");
973 GrowableArray<address>* entries_ptr = (entry_count == 1 ? nullptr : &entries);
974 GrowableArray<address>* extras_ptr = (add_extras ? &extras : nullptr);
975 address start = load_archive_data(stub_id, entries_ptr, extras_ptr);
976 if (start != nullptr) {
977 assert(entries.length() == expected_entry_count - 1,
978 "unexpected entry count %d", entries.length());
979 assert(!add_handlers || extras.length() == expected_handler_count,
980 "unexpected handler addresses count %d", extras.length());
981 if (entry != nullptr) {
982 *entry = entries.at(0);
983 }
984 if (add_handlers) {
985 // restore 1 x UMAM {start,end,handler} addresses from extras
986 register_unsafe_access_handlers(extras, 0, 1);
987 }
988 #if INCLUDE_ZGC
989 if (add_relocs) {
990 // register addresses at which ZGC does colour patching
991 register_reloc_addresses(extras, 0, extras.length());
992 }
993 #endif // INCLUDE_ZGC
994 return start;
995 }
996 __ align(CodeEntryAlignment);
997 StubCodeMark mark(this, stub_id);
998 start = __ pc();
999
1000 bool use64byteVector = (MaxVectorSize > 32) && (CopyAVX3Threshold == 0);
1001
1002 Label L_main_pre_loop, L_main_pre_loop_64bytes, L_pre_main_post_64;
1003 Label L_main_loop, L_main_loop_64bytes, L_tail, L_tail64, L_exit, L_entry;
1004 const Register from = rdi; // source array address
1005 const Register to = rsi; // destination array address
1006 const Register count = rdx; // elements count
1007 const Register temp1 = r8;
1008 const Register temp2 = rcx;
1009 const Register temp3 = r11;
1010 const Register temp4 = rax;
1011 // End pointers are inclusive, and if count is not zero they point
1012 // to the last unit copied: end_to[0] := end_from[0]
1013
1014 __ enter(); // required for proper stackwalking of RuntimeStub frame
1015 assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int.
1016
1017 if (entry != nullptr) {
1018 *entry = __ pc();
1019 entries.append(*entry);
1020 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1021 BLOCK_COMMENT("Entry:");
1022 }
1023
1024 array_overlap_test(nooverlap_target, (Address::ScaleFactor)(shift));
1025
1026 BasicType type_vec[] = { T_BYTE, T_SHORT, T_INT, T_LONG};
1027 BasicType type = is_oop ? T_OBJECT : type_vec[shift];
1028
1029 setup_argument_regs(type);
1030
1031 DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1032 if (dest_uninitialized) {
1033 decorators |= IS_DEST_UNINITIALIZED;
1034 }
1035 if (aligned) {
1036 decorators |= ARRAYCOPY_ALIGNED;
1037 }
1038 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1039 bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
1040 {
1041 // Type(shift) byte(0), short(1), int(2), long(3)
1042 int loop_size[] = { 192, 96, 48, 24};
1043 int threshold[] = { 4096, 2048, 1024, 512};
1044
1045 // UnsafeMemoryAccess page error: continue after unsafe access
1046 UnsafeMemoryAccessMark umam(this, add_handlers, true);
1047 // 'from', 'to' and 'count' are now valid
1048
1049 // temp1 holds remaining count.
1050 __ movq(temp1, count);
1051
1052 // Zero length check.
1053 __ BIND(L_tail);
1054 __ cmpq(temp1, 0);
1055 __ jcc(Assembler::lessEqual, L_exit);
1056
1057 __ mov64(temp2, 0);
1058 __ movq(temp3, temp1);
1059 // Special cases using 32 byte [masked] vector copy operations.
1060 arraycopy_avx3_special_cases_conjoint(xmm1, k2, from, to, temp2, temp3, temp1, shift,
1061 temp4, use64byteVector, L_entry, L_exit);
1062
1063 // PRE-MAIN-POST loop for aligned copy.
1064 __ BIND(L_entry);
1065
1066 if ((MaxVectorSize > 32) && (CopyAVX3Threshold != 0)) {
1067 __ cmpq(temp1, threshold[shift]);
1068 __ jcc(Assembler::greaterEqual, L_pre_main_post_64);
1069 }
1070
1071 if ((MaxVectorSize < 64) || (CopyAVX3Threshold != 0)) {
1072 // Partial copy to make dst address 32 byte aligned.
1073 __ leaq(temp2, Address(to, temp1, (Address::ScaleFactor)(shift), 0));
1074 __ andq(temp2, 31);
1075 __ jcc(Assembler::equal, L_main_pre_loop);
1076
1077 if (shift) {
1078 __ shrq(temp2, shift);
1079 }
1080 __ subq(temp1, temp2);
1081 copy32_masked_avx(to, from, xmm1, k2, temp2, temp1, temp3, shift);
1082
1083 __ cmpq(temp1, loop_size[shift]);
1084 __ jcc(Assembler::less, L_tail);
1085
1086 __ BIND(L_main_pre_loop);
1087
1088 // Main loop with aligned copy block size of 192 bytes at 32 byte granularity.
1089 __ align32();
1090 __ BIND(L_main_loop);
1091 copy64_avx(to, from, temp1, xmm1, true, shift, -64);
1092 copy64_avx(to, from, temp1, xmm1, true, shift, -128);
1093 copy64_avx(to, from, temp1, xmm1, true, shift, -192);
1094 __ subptr(temp1, loop_size[shift]);
1095 __ cmpq(temp1, loop_size[shift]);
1096 __ jcc(Assembler::greater, L_main_loop);
1097
1098 // Tail loop.
1099 __ jmp(L_tail);
1100 }
1101
1102 if (MaxVectorSize > 32) {
1103 __ BIND(L_pre_main_post_64);
1104 // Partial copy to make dst address 64 byte aligned.
1105 __ leaq(temp2, Address(to, temp1, (Address::ScaleFactor)(shift), 0));
1106 __ andq(temp2, 63);
1107 __ jcc(Assembler::equal, L_main_pre_loop_64bytes);
1108
1109 if (shift) {
1110 __ shrq(temp2, shift);
1111 }
1112 __ subq(temp1, temp2);
1113 copy64_masked_avx(to, from, xmm1, k2, temp2, temp1, temp3, shift, 0 , true);
1114
1115 __ cmpq(temp1, loop_size[shift]);
1116 __ jcc(Assembler::less, L_tail64);
1117
1118 __ BIND(L_main_pre_loop_64bytes);
1119
1120 // Main loop with aligned copy block size of 192 bytes at
1121 // 64 byte copy granularity.
1122 __ align32();
1123 __ BIND(L_main_loop_64bytes);
1124 copy64_avx(to, from, temp1, xmm1, true, shift, -64 , true);
1125 copy64_avx(to, from, temp1, xmm1, true, shift, -128, true);
1126 copy64_avx(to, from, temp1, xmm1, true, shift, -192, true);
1127 __ subq(temp1, loop_size[shift]);
1128 __ cmpq(temp1, loop_size[shift]);
1129 __ jcc(Assembler::greater, L_main_loop_64bytes);
1130
1131 // Zero length check.
1132 __ cmpq(temp1, 0);
1133 __ jcc(Assembler::lessEqual, L_exit);
1134
1135 __ BIND(L_tail64);
1136
1137 // Tail handling using 64 byte [masked] vector copy operations.
1138 use64byteVector = true;
1139 __ mov64(temp2, 0);
1140 __ movq(temp3, temp1);
1141 arraycopy_avx3_special_cases_conjoint(xmm1, k2, from, to, temp2, temp3, temp1, shift,
1142 temp4, use64byteVector, L_entry, L_exit);
1143 }
1144 __ BIND(L_exit);
1145 }
1146 address ucme_exit_pc = __ pc();
1147 // When called from generic_arraycopy r11 contains specific values
1148 // used during arraycopy epilogue, re-initializing r11.
1149 if(is_oop) {
1150 __ movq(r11, count);
1151 }
1152 bs->arraycopy_epilogue(_masm, decorators, type, from, to, count);
1153 restore_argument_regs(type);
1154 INC_COUNTER_NP(get_profile_ctr(shift), rscratch1); // Update counter after rscratch1 is free
1155 __ xorptr(rax, rax); // return 0
1156 __ vzeroupper();
1157 __ leave(); // required for proper stackwalking of RuntimeStub frame
1158 __ ret(0);
1159
1160 // retrieve the registered handler addresses
1161 address end = __ pc();
1162 if (add_handlers) {
1163 retrieve_unsafe_access_handlers(start, end, extras);
1164 }
1165 assert(extras.length() == expected_handler_count,
1166 "unexpected handler addresses count %d", extras.length());
1167 #if INCLUDE_ZGC
1168 // retrieve addresses at which ZGC does colour patching
1169 if (add_relocs) {
1170 retrieve_reloc_addresses(start, end, extras);
1171 }
1172 #endif // INCLUDE_ZGC
1173 // record the stub entry and end plus the no_push entry and any
1174 // extra handler addresses
1175 store_archive_data(stub_id, start, end, entries_ptr, extras_ptr);
1176
1177 return start;
1178 }
1179
1180 void StubGenerator::arraycopy_avx3_special_cases(XMMRegister xmm, KRegister mask, Register from,
1181 Register to, Register count, int shift,
1182 Register index, Register temp,
1183 bool use64byteVector, Label& L_entry, Label& L_exit) {
1184 Label L_entry_64, L_entry_96, L_entry_128;
1185 Label L_entry_160, L_entry_192;
1186
1187 int size_mat[][6] = {
1188 /* T_BYTE */ {32 , 64, 96 , 128 , 160 , 192 },
1189 /* T_SHORT*/ {16 , 32, 48 , 64 , 80 , 96 },
1190 /* T_INT */ {8 , 16, 24 , 32 , 40 , 48 },
1191 /* T_LONG */ {4 , 8, 12 , 16 , 20 , 24 }
1192 };
1193
1194 // Case A) Special case for length less than equal to 32 bytes.
1195 __ cmpq(count, size_mat[shift][0]);
1196 __ jccb(Assembler::greater, L_entry_64);
1197 copy32_masked_avx(to, from, xmm, mask, count, index, temp, shift);
1198 __ jmp(L_exit);
1199
1200 // Case B) Special case for length less than equal to 64 bytes.
1201 __ BIND(L_entry_64);
1202 __ cmpq(count, size_mat[shift][1]);
1203 __ jccb(Assembler::greater, L_entry_96);
1204 copy64_masked_avx(to, from, xmm, mask, count, index, temp, shift, 0, use64byteVector);
1205 __ jmp(L_exit);
1206
1207 // Case C) Special case for length less than equal to 96 bytes.
1208 __ BIND(L_entry_96);
1209 __ cmpq(count, size_mat[shift][2]);
1210 __ jccb(Assembler::greater, L_entry_128);
1211 copy64_avx(to, from, index, xmm, false, shift, 0, use64byteVector);
1212 __ subq(count, 64 >> shift);
1213 copy32_masked_avx(to, from, xmm, mask, count, index, temp, shift, 64);
1214 __ jmp(L_exit);
1215
1216 // Case D) Special case for length less than equal to 128 bytes.
1217 __ BIND(L_entry_128);
1218 __ cmpq(count, size_mat[shift][3]);
1219 __ jccb(Assembler::greater, L_entry_160);
1220 copy64_avx(to, from, index, xmm, false, shift, 0, use64byteVector);
1221 copy32_avx(to, from, index, xmm, shift, 64);
1222 __ subq(count, 96 >> shift);
1223 copy32_masked_avx(to, from, xmm, mask, count, index, temp, shift, 96);
1224 __ jmp(L_exit);
1225
1226 // Case E) Special case for length less than equal to 160 bytes.
1227 __ BIND(L_entry_160);
1228 __ cmpq(count, size_mat[shift][4]);
1229 __ jccb(Assembler::greater, L_entry_192);
1230 copy64_avx(to, from, index, xmm, false, shift, 0, use64byteVector);
1231 copy64_avx(to, from, index, xmm, false, shift, 64, use64byteVector);
1232 __ subq(count, 128 >> shift);
1233 copy32_masked_avx(to, from, xmm, mask, count, index, temp, shift, 128);
1234 __ jmp(L_exit);
1235
1236 // Case F) Special case for length less than equal to 192 bytes.
1237 __ BIND(L_entry_192);
1238 __ cmpq(count, size_mat[shift][5]);
1239 __ jcc(Assembler::greater, L_entry);
1240 copy64_avx(to, from, index, xmm, false, shift, 0, use64byteVector);
1241 copy64_avx(to, from, index, xmm, false, shift, 64, use64byteVector);
1242 copy32_avx(to, from, index, xmm, shift, 128);
1243 __ subq(count, 160 >> shift);
1244 copy32_masked_avx(to, from, xmm, mask, count, index, temp, shift, 160);
1245 __ jmp(L_exit);
1246 }
1247
1248 void StubGenerator::arraycopy_avx3_special_cases_256(XMMRegister xmm, KRegister mask, Register from,
1249 Register to, Register count, int shift, Register index,
1250 Register temp, Label& L_exit) {
1251 Label L_entry_64, L_entry_128, L_entry_192, L_entry_256;
1252
1253 int size_mat[][4] = {
1254 /* T_BYTE */ {64, 128, 192, 256},
1255 /* T_SHORT*/ {32, 64 , 96 , 128},
1256 /* T_INT */ {16, 32 , 48 , 64},
1257 /* T_LONG */ { 8, 16 , 24 , 32}
1258 };
1259
1260 assert(MaxVectorSize == 64, "vector length != 64");
1261 // Case A) Special case for length less than or equal to 64 bytes.
1262 __ BIND(L_entry_64);
1263 __ cmpq(count, size_mat[shift][0]);
1264 __ jccb(Assembler::greater, L_entry_128);
1265 copy64_masked_avx(to, from, xmm, mask, count, index, temp, shift, 0, true);
1266 __ jmp(L_exit);
1267
1268 // Case B) Special case for length less than or equal to 128 bytes.
1269 __ BIND(L_entry_128);
1270 __ cmpq(count, size_mat[shift][1]);
1271 __ jccb(Assembler::greater, L_entry_192);
1272 copy64_avx(to, from, index, xmm, false, shift, 0, true);
1273 __ subq(count, 64 >> shift);
1274 copy64_masked_avx(to, from, xmm, mask, count, index, temp, shift, 64, true);
1275 __ jmp(L_exit);
1276
1277 // Case C) Special case for length less than or equal to 192 bytes.
1278 __ BIND(L_entry_192);
1279 __ cmpq(count, size_mat[shift][2]);
1280 __ jcc(Assembler::greater, L_entry_256);
1281 copy64_avx(to, from, index, xmm, false, shift, 0, true);
1282 copy64_avx(to, from, index, xmm, false, shift, 64, true);
1283 __ subq(count, 128 >> shift);
1284 copy64_masked_avx(to, from, xmm, mask, count, index, temp, shift, 128, true);
1285 __ jmp(L_exit);
1286
1287 // Case D) Special case for length less than or equal to 256 bytes.
1288 __ BIND(L_entry_256);
1289 copy64_avx(to, from, index, xmm, false, shift, 0, true);
1290 copy64_avx(to, from, index, xmm, false, shift, 64, true);
1291 copy64_avx(to, from, index, xmm, false, shift, 128, true);
1292 __ subq(count, 192 >> shift);
1293 copy64_masked_avx(to, from, xmm, mask, count, index, temp, shift, 192, true);
1294 __ jmp(L_exit);
1295 }
1296
1297 void StubGenerator::arraycopy_avx3_special_cases_conjoint(XMMRegister xmm, KRegister mask, Register from,
1298 Register to, Register start_index, Register end_index,
1299 Register count, int shift, Register temp,
1300 bool use64byteVector, Label& L_entry, Label& L_exit) {
1301 Label L_entry_64, L_entry_96, L_entry_128;
1302 Label L_entry_160, L_entry_192;
1303 bool avx3 = (MaxVectorSize > 32) && (CopyAVX3Threshold == 0);
1304
1305 int size_mat[][6] = {
1306 /* T_BYTE */ {32 , 64, 96 , 128 , 160 , 192 },
1307 /* T_SHORT*/ {16 , 32, 48 , 64 , 80 , 96 },
1308 /* T_INT */ {8 , 16, 24 , 32 , 40 , 48 },
1309 /* T_LONG */ {4 , 8, 12 , 16 , 20 , 24 }
1310 };
1311
1312 // Case A) Special case for length less than equal to 32 bytes.
1313 __ cmpq(count, size_mat[shift][0]);
1314 __ jccb(Assembler::greater, L_entry_64);
1315 copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
1316 __ jmp(L_exit);
1317
1318 // Case B) Special case for length less than equal to 64 bytes.
1319 __ BIND(L_entry_64);
1320 __ cmpq(count, size_mat[shift][1]);
1321 __ jccb(Assembler::greater, L_entry_96);
1322 if (avx3) {
1323 copy64_masked_avx(to, from, xmm, mask, count, start_index, temp, shift, 0, true);
1324 } else {
1325 copy32_avx(to, from, end_index, xmm, shift, -32);
1326 __ subq(count, 32 >> shift);
1327 copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
1328 }
1329 __ jmp(L_exit);
1330
1331 // Case C) Special case for length less than equal to 96 bytes.
1332 __ BIND(L_entry_96);
1333 __ cmpq(count, size_mat[shift][2]);
1334 __ jccb(Assembler::greater, L_entry_128);
1335 copy64_avx(to, from, end_index, xmm, true, shift, -64, use64byteVector);
1336 __ subq(count, 64 >> shift);
1337 copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
1338 __ jmp(L_exit);
1339
1340 // Case D) Special case for length less than equal to 128 bytes.
1341 __ BIND(L_entry_128);
1342 __ cmpq(count, size_mat[shift][3]);
1343 __ jccb(Assembler::greater, L_entry_160);
1344 copy64_avx(to, from, end_index, xmm, true, shift, -64, use64byteVector);
1345 copy32_avx(to, from, end_index, xmm, shift, -96);
1346 __ subq(count, 96 >> shift);
1347 copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
1348 __ jmp(L_exit);
1349
1350 // Case E) Special case for length less than equal to 160 bytes.
1351 __ BIND(L_entry_160);
1352 __ cmpq(count, size_mat[shift][4]);
1353 __ jccb(Assembler::greater, L_entry_192);
1354 copy64_avx(to, from, end_index, xmm, true, shift, -64, use64byteVector);
1355 copy64_avx(to, from, end_index, xmm, true, shift, -128, use64byteVector);
1356 __ subq(count, 128 >> shift);
1357 copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
1358 __ jmp(L_exit);
1359
1360 // Case F) Special case for length less than equal to 192 bytes.
1361 __ BIND(L_entry_192);
1362 __ cmpq(count, size_mat[shift][5]);
1363 __ jcc(Assembler::greater, L_entry);
1364 copy64_avx(to, from, end_index, xmm, true, shift, -64, use64byteVector);
1365 copy64_avx(to, from, end_index, xmm, true, shift, -128, use64byteVector);
1366 copy32_avx(to, from, end_index, xmm, shift, -160);
1367 __ subq(count, 160 >> shift);
1368 copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
1369 __ jmp(L_exit);
1370 }
1371
1372 void StubGenerator::copy256_avx3(Register dst, Register src, Register index, XMMRegister xmm1,
1373 XMMRegister xmm2, XMMRegister xmm3, XMMRegister xmm4,
1374 int shift, int offset) {
1375 if (MaxVectorSize == 64) {
1376 Address::ScaleFactor scale = (Address::ScaleFactor)(shift);
1377 __ prefetcht0(Address(src, index, scale, offset + 0x200));
1378 __ prefetcht0(Address(src, index, scale, offset + 0x240));
1379 __ prefetcht0(Address(src, index, scale, offset + 0x280));
1380 __ prefetcht0(Address(src, index, scale, offset + 0x2C0));
1381
1382 __ prefetcht0(Address(src, index, scale, offset + 0x400));
1383 __ prefetcht0(Address(src, index, scale, offset + 0x440));
1384 __ prefetcht0(Address(src, index, scale, offset + 0x480));
1385 __ prefetcht0(Address(src, index, scale, offset + 0x4C0));
1386
1387 __ evmovdquq(xmm1, Address(src, index, scale, offset), Assembler::AVX_512bit);
1388 __ evmovdquq(xmm2, Address(src, index, scale, offset + 0x40), Assembler::AVX_512bit);
1389 __ evmovdquq(xmm3, Address(src, index, scale, offset + 0x80), Assembler::AVX_512bit);
1390 __ evmovdquq(xmm4, Address(src, index, scale, offset + 0xC0), Assembler::AVX_512bit);
1391
1392 __ evmovntdquq(Address(dst, index, scale, offset), xmm1, Assembler::AVX_512bit);
1393 __ evmovntdquq(Address(dst, index, scale, offset + 0x40), xmm2, Assembler::AVX_512bit);
1394 __ evmovntdquq(Address(dst, index, scale, offset + 0x80), xmm3, Assembler::AVX_512bit);
1395 __ evmovntdquq(Address(dst, index, scale, offset + 0xC0), xmm4, Assembler::AVX_512bit);
1396 }
1397 }
1398
1399 void StubGenerator::copy64_masked_avx(Register dst, Register src, XMMRegister xmm,
1400 KRegister mask, Register length, Register index,
1401 Register temp, int shift, int offset,
1402 bool use64byteVector) {
1403 BasicType type[] = { T_BYTE, T_SHORT, T_INT, T_LONG};
1404 assert(MaxVectorSize >= 32, "vector length should be >= 32");
1405 if (!use64byteVector) {
1406 copy32_avx(dst, src, index, xmm, shift, offset);
1407 __ subptr(length, 32 >> shift);
1408 copy32_masked_avx(dst, src, xmm, mask, length, index, temp, shift, offset+32);
1409 } else {
1410 Address::ScaleFactor scale = (Address::ScaleFactor)(shift);
1411 assert(MaxVectorSize == 64, "vector length != 64");
1412 __ mov64(temp, -1L);
1413 __ bzhiq(temp, temp, length);
1414 __ kmovql(mask, temp);
1415 __ evmovdqu(type[shift], mask, xmm, Address(src, index, scale, offset), false, Assembler::AVX_512bit);
1416 __ evmovdqu(type[shift], mask, Address(dst, index, scale, offset), xmm, true, Assembler::AVX_512bit);
1417 }
1418 }
1419
1420
1421 void StubGenerator::copy32_masked_avx(Register dst, Register src, XMMRegister xmm,
1422 KRegister mask, Register length, Register index,
1423 Register temp, int shift, int offset) {
1424 assert(MaxVectorSize >= 32, "vector length should be >= 32");
1425 BasicType type[] = { T_BYTE, T_SHORT, T_INT, T_LONG};
1426 Address::ScaleFactor scale = (Address::ScaleFactor)(shift);
1427 __ mov64(temp, -1L);
1428 __ bzhiq(temp, temp, length);
1429 __ kmovql(mask, temp);
1430 __ evmovdqu(type[shift], mask, xmm, Address(src, index, scale, offset), false, Assembler::AVX_256bit);
1431 __ evmovdqu(type[shift], mask, Address(dst, index, scale, offset), xmm, true, Assembler::AVX_256bit);
1432 }
1433
1434
1435 void StubGenerator::copy32_avx(Register dst, Register src, Register index, XMMRegister xmm,
1436 int shift, int offset) {
1437 assert(MaxVectorSize >= 32, "vector length should be >= 32");
1438 Address::ScaleFactor scale = (Address::ScaleFactor)(shift);
1439 __ vmovdqu(xmm, Address(src, index, scale, offset));
1440 __ vmovdqu(Address(dst, index, scale, offset), xmm);
1441 }
1442
1443
1444 void StubGenerator::copy64_avx(Register dst, Register src, Register index, XMMRegister xmm,
1445 bool conjoint, int shift, int offset, bool use64byteVector) {
1446 assert(MaxVectorSize == 64 || MaxVectorSize == 32, "vector length mismatch");
1447 if (!use64byteVector) {
1448 if (conjoint) {
1449 copy32_avx(dst, src, index, xmm, shift, offset+32);
1450 copy32_avx(dst, src, index, xmm, shift, offset);
1451 } else {
1452 copy32_avx(dst, src, index, xmm, shift, offset);
1453 copy32_avx(dst, src, index, xmm, shift, offset+32);
1454 }
1455 } else {
1456 Address::ScaleFactor scale = (Address::ScaleFactor)(shift);
1457 __ evmovdquq(xmm, Address(src, index, scale, offset), Assembler::AVX_512bit);
1458 __ evmovdquq(Address(dst, index, scale, offset), xmm, Assembler::AVX_512bit);
1459 }
1460 }
1461
1462 #endif // COMPILER2_OR_JVMCI
1463
1464
1465 // Arguments:
1466 // entry - location for return of (post-push) entry
1467 //
1468 // Inputs:
1469 // c_rarg0 - source array address
1470 // c_rarg1 - destination array address
1471 // c_rarg2 - element count, treated as ssize_t, can be zero
1472 //
1473 // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1474 // we let the hardware handle it. The one to eight bytes within words,
1475 // dwords or qwords that span cache line boundaries will still be loaded
1476 // and stored atomically.
1477 //
1478 // Side Effects:
1479 // entry is set to the no-overlap entry point
1480 // used by generate_conjoint_byte_copy().
1481 //
1482 address StubGenerator::generate_disjoint_byte_copy(address* entry) {
1483 StubId stub_id = StubId::stubgen_jbyte_disjoint_arraycopy_id;
1484 // aligned is always false -- x86_64 always uses the unaligned code
1485 const bool aligned = false;
1486 #if COMPILER2_OR_JVMCI
1487 if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) {
1488 return generate_disjoint_copy_avx3_masked(stub_id, entry);
1489 }
1490 #endif
1491 GrowableArray<address> entries;
1492 GrowableArray<address> extras;
1493 int expected_entry_count = (entry != nullptr ? 2 : 1);
1494 int expected_handler_count = (2 * UnsafeMemoryAccess::COLUMN_COUNT); // 2 x UMAM {start,end,handler}
1495 int entry_count = StubInfo::entry_count(stub_id);
1496 assert(entry_count == expected_entry_count, "sanity check");
1497 GrowableArray<address>* entries_ptr = (entry_count == 1 ? nullptr : &entries);
1498 address start = load_archive_data(stub_id, entries_ptr, &extras);
1499 if (start != nullptr) {
1500 assert(entries.length() == expected_entry_count - 1,
1501 "unexpected entry count %d", entries.length());
1502 assert(extras.length() == expected_handler_count,
1503 "unexpected handler addresses count %d", extras.length());
1504 if (entry != nullptr) {
1505 *entry = entries.at(0);
1506 }
1507 // restore 2 UMAM {start,end,handler} addresses from extras
1508 register_unsafe_access_handlers(extras, 0, 2);
1509 return start;
1510 }
1511 __ align(CodeEntryAlignment);
1512 StubCodeMark mark(this, stub_id);
1513 start = __ pc();
1514 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1515
1516 Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes;
1517 Label L_copy_byte, L_exit;
1518 const Register from = rdi; // source array address
1519 const Register to = rsi; // destination array address
1520 const Register count = rdx; // elements count
1521 const Register byte_count = rcx;
1522 const Register qword_count = count;
1523 const Register end_from = from; // source array end address
1524 const Register end_to = to; // destination array end address
1525 // End pointers are inclusive, and if count is not zero they point
1526 // to the last unit copied: end_to[0] := end_from[0]
1527
1528 __ enter(); // required for proper stackwalking of RuntimeStub frame
1529 assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int.
1530
1531 if (entry != nullptr) {
1532 *entry = __ pc();
1533 entries.append(*entry);
1534 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1535 BLOCK_COMMENT("Entry:");
1536 }
1537
1538 setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1539 // r9 and r10 may be used to save non-volatile registers
1540
1541 {
1542 // UnsafeMemoryAccess page error: continue after unsafe access
1543 UnsafeMemoryAccessMark umam(this, !aligned, true);
1544 // 'from', 'to' and 'count' are now valid
1545 __ movptr(byte_count, count);
1546 __ shrptr(count, 3); // count => qword_count
1547
1548 // Copy from low to high addresses. Use 'to' as scratch.
1549 __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
1550 __ lea(end_to, Address(to, qword_count, Address::times_8, -8));
1551 __ negptr(qword_count); // make the count negative
1552 __ jmp(L_copy_bytes);
1553
1554 // Copy trailing qwords
1555 __ BIND(L_copy_8_bytes);
1556 __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
1557 __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
1558 __ increment(qword_count);
1559 __ jcc(Assembler::notZero, L_copy_8_bytes);
1560
1561 // Check for and copy trailing dword
1562 __ BIND(L_copy_4_bytes);
1563 __ testl(byte_count, 4);
1564 __ jccb(Assembler::zero, L_copy_2_bytes);
1565 __ movl(rax, Address(end_from, 8));
1566 __ movl(Address(end_to, 8), rax);
1567
1568 __ addptr(end_from, 4);
1569 __ addptr(end_to, 4);
1570
1571 // Check for and copy trailing word
1572 __ BIND(L_copy_2_bytes);
1573 __ testl(byte_count, 2);
1574 __ jccb(Assembler::zero, L_copy_byte);
1575 __ movw(rax, Address(end_from, 8));
1576 __ movw(Address(end_to, 8), rax);
1577
1578 __ addptr(end_from, 2);
1579 __ addptr(end_to, 2);
1580
1581 // Check for and copy trailing byte
1582 __ BIND(L_copy_byte);
1583 __ testl(byte_count, 1);
1584 __ jccb(Assembler::zero, L_exit);
1585 __ movb(rax, Address(end_from, 8));
1586 __ movb(Address(end_to, 8), rax);
1587 }
1588 __ BIND(L_exit);
1589 address ucme_exit_pc = __ pc();
1590 restore_arg_regs();
1591 INC_COUNTER_NP(SharedRuntime::_jbyte_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
1592 __ xorptr(rax, rax); // return 0
1593 __ vzeroupper();
1594 __ leave(); // required for proper stackwalking of RuntimeStub frame
1595 __ ret(0);
1596
1597 {
1598 UnsafeMemoryAccessMark umam(this, !aligned, false, ucme_exit_pc);
1599 // Copy in multi-bytes chunks
1600 copy_bytes_forward(end_from, end_to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, T_BYTE);
1601 __ jmp(L_copy_4_bytes);
1602 }
1603
1604 // retrieve the registered handler addresses
1605 address end = __ pc();
1606 retrieve_unsafe_access_handlers(start, end, extras);
1607 assert(extras.length() == expected_handler_count,
1608 "unexpected handler addresses count %d", extras.length());
1609
1610 // record the stub entry and end plus the no_push entry and any
1611 // extra handler addresses
1612 store_archive_data(stub_id, start, end, entries_ptr, &extras);
1613
1614 return start;
1615 }
1616
1617
1618 // Arguments:
1619 // entry - location for return of (post-push) entry
1620 // nooverlap_target - entry to branch to if no overlap detected
1621 //
1622 // Inputs:
1623 // c_rarg0 - source array address
1624 // c_rarg1 - destination array address
1625 // c_rarg2 - element count, treated as ssize_t, can be zero
1626 //
1627 // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1628 // we let the hardware handle it. The one to eight bytes within words,
1629 // dwords or qwords that span cache line boundaries will still be loaded
1630 // and stored atomically.
1631 //
1632 address StubGenerator::generate_conjoint_byte_copy(address nooverlap_target, address* entry) {
1633 StubId stub_id = StubId::stubgen_jbyte_arraycopy_id;
1634 // aligned is always false -- x86_64 always uses the unaligned code
1635 const bool aligned = false;
1636 #if COMPILER2_OR_JVMCI
1637 if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) {
1638 return generate_conjoint_copy_avx3_masked(stub_id, entry, nooverlap_target);
1639 }
1640 #endif
1641 GrowableArray<address> entries;
1642 GrowableArray<address> extras;
1643 int expected_entry_count = (entry != nullptr ? 2 : 1);
1644 int expected_handler_count = (2 * UnsafeMemoryAccess::COLUMN_COUNT); // 2 x UMAM {start,end,handler}
1645 int entry_count = StubInfo::entry_count(stub_id);
1646 assert(entry_count == expected_entry_count, "sanity check");
1647 GrowableArray<address>* entries_ptr = (entry_count == 1 ? nullptr : &entries);
1648 address start = load_archive_data(stub_id, entries_ptr, &extras);
1649 if (start != nullptr) {
1650 assert(entries.length() == expected_entry_count - 1,
1651 "unexpected entry count %d", entries.length());
1652 assert(extras.length() == expected_handler_count,
1653 "unexpected handler addresses count %d", extras.length());
1654 if (entry != nullptr) {
1655 *entry = entries.at(0);
1656 }
1657 // restore 2 UMAM {start,end,handler} addresses from extras
1658 register_unsafe_access_handlers(extras, 0, 2);
1659 return start;
1660 }
1661 __ align(CodeEntryAlignment);
1662 StubCodeMark mark(this, stub_id);
1663 start = __ pc();
1664 DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1665
1666 Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes;
1667 const Register from = rdi; // source array address
1668 const Register to = rsi; // destination array address
1669 const Register count = rdx; // elements count
1670 const Register byte_count = rcx;
1671 const Register qword_count = count;
1672
1673 __ enter(); // required for proper stackwalking of RuntimeStub frame
1674 assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int.
1675
1676 if (entry != nullptr) {
1677 *entry = __ pc();
1678 entries.append(*entry);
1679 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1680 BLOCK_COMMENT("Entry:");
1681 }
1682
1683 array_overlap_test(nooverlap_target, Address::times_1);
1684 setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1685 // r9 and r10 may be used to save non-volatile registers
1686
1687 {
1688 // UnsafeMemoryAccess page error: continue after unsafe access
1689 UnsafeMemoryAccessMark umam(this, !aligned, true);
1690 // 'from', 'to' and 'count' are now valid
1691 __ movptr(byte_count, count);
1692 __ shrptr(count, 3); // count => qword_count
1693
1694 // Copy from high to low addresses.
1695
1696 // Check for and copy trailing byte
1697 __ testl(byte_count, 1);
1698 __ jcc(Assembler::zero, L_copy_2_bytes);
1699 __ movb(rax, Address(from, byte_count, Address::times_1, -1));
1700 __ movb(Address(to, byte_count, Address::times_1, -1), rax);
1701 __ decrement(byte_count); // Adjust for possible trailing word
1702
1703 // Check for and copy trailing word
1704 __ BIND(L_copy_2_bytes);
1705 __ testl(byte_count, 2);
1706 __ jcc(Assembler::zero, L_copy_4_bytes);
1707 __ movw(rax, Address(from, byte_count, Address::times_1, -2));
1708 __ movw(Address(to, byte_count, Address::times_1, -2), rax);
1709
1710 // Check for and copy trailing dword
1711 __ BIND(L_copy_4_bytes);
1712 __ testl(byte_count, 4);
1713 __ jcc(Assembler::zero, L_copy_bytes);
1714 __ movl(rax, Address(from, qword_count, Address::times_8));
1715 __ movl(Address(to, qword_count, Address::times_8), rax);
1716 __ jmp(L_copy_bytes);
1717
1718 // Copy trailing qwords
1719 __ BIND(L_copy_8_bytes);
1720 __ movq(rax, Address(from, qword_count, Address::times_8, -8));
1721 __ movq(Address(to, qword_count, Address::times_8, -8), rax);
1722 __ decrement(qword_count);
1723 __ jcc(Assembler::notZero, L_copy_8_bytes);
1724 }
1725 restore_arg_regs();
1726 INC_COUNTER_NP(SharedRuntime::_jbyte_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
1727 __ xorptr(rax, rax); // return 0
1728 __ vzeroupper();
1729 __ leave(); // required for proper stackwalking of RuntimeStub frame
1730 __ ret(0);
1731
1732 {
1733 // UnsafeMemoryAccess page error: continue after unsafe access
1734 UnsafeMemoryAccessMark umam(this, !aligned, true);
1735 // Copy in multi-bytes chunks
1736 copy_bytes_backward(from, to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, T_BYTE);
1737 }
1738 restore_arg_regs();
1739 INC_COUNTER_NP(SharedRuntime::_jbyte_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
1740 __ xorptr(rax, rax); // return 0
1741 __ vzeroupper();
1742 __ leave(); // required for proper stackwalking of RuntimeStub frame
1743 __ ret(0);
1744
1745 // retrieve the registered handler addresses
1746 address end = __ pc();
1747 retrieve_unsafe_access_handlers(start, end, extras);
1748 assert(extras.length() == expected_handler_count,
1749 "unexpected handler addresses count %d", extras.length());
1750
1751 // record the stub entry and end plus the no_push entry and any
1752 // extra handler addresses
1753 store_archive_data(stub_id, start, end, entries_ptr, &extras);
1754
1755 return start;
1756 }
1757
1758
1759 // Arguments:
1760 // entry - location for return of (post-push) entry
1761 //
1762 // Inputs:
1763 // c_rarg0 - source array address
1764 // c_rarg1 - destination array address
1765 // c_rarg2 - element count, treated as ssize_t, can be zero
1766 //
1767 // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1768 // let the hardware handle it. The two or four words within dwords
1769 // or qwords that span cache line boundaries will still be loaded
1770 // and stored atomically.
1771 //
1772 // Side Effects:
1773 // entry is set to the no-overlap entry point
1774 // used by generate_conjoint_short_copy().
1775 //
1776 address StubGenerator::generate_disjoint_short_copy(address *entry) {
1777 StubId stub_id = StubId::stubgen_jshort_disjoint_arraycopy_id;
1778 // aligned is always false -- x86_64 always uses the unaligned code
1779 const bool aligned = false;
1780 #if COMPILER2_OR_JVMCI
1781 if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) {
1782 return generate_disjoint_copy_avx3_masked(stub_id, entry);
1783 }
1784 #endif
1785 GrowableArray<address> entries;
1786 GrowableArray<address> extras;
1787 int expected_entry_count = (entry != nullptr ? 2 : 1);
1788 int expected_handler_count = (2 * UnsafeMemoryAccess::COLUMN_COUNT); // 2 x UMAM {start,end,handler}
1789 int entry_count = StubInfo::entry_count(stub_id);
1790 assert(entry_count == expected_entry_count, "sanity check");
1791 GrowableArray<address>* entries_ptr = (entry_count == 1 ? nullptr : &entries);
1792 address start = load_archive_data(stub_id, entries_ptr, &extras);
1793 if (start != nullptr) {
1794 assert(entries.length() == expected_entry_count - 1,
1795 "unexpected entry count %d", entries.length());
1796 assert(extras.length() == expected_handler_count,
1797 "unexpected handler addresses count %d", extras.length());
1798 if (entry != nullptr) {
1799 *entry = entries.at(0);
1800 }
1801 // restore 2 UMAM {start,end,handler} addresses from extras
1802 register_unsafe_access_handlers(extras, 0, 2);
1803 return start;
1804 }
1805 __ align(CodeEntryAlignment);
1806 StubCodeMark mark(this, stub_id);
1807 start = __ pc();
1808 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1809
1810 Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes,L_copy_2_bytes,L_exit;
1811 const Register from = rdi; // source array address
1812 const Register to = rsi; // destination array address
1813 const Register count = rdx; // elements count
1814 const Register word_count = rcx;
1815 const Register qword_count = count;
1816 const Register end_from = from; // source array end address
1817 const Register end_to = to; // destination array end address
1818 // End pointers are inclusive, and if count is not zero they point
1819 // to the last unit copied: end_to[0] := end_from[0]
1820
1821 __ enter(); // required for proper stackwalking of RuntimeStub frame
1822 assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int.
1823
1824 if (entry != nullptr) {
1825 *entry = __ pc();
1826 entries.append(*entry);
1827 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1828 BLOCK_COMMENT("Entry:");
1829 }
1830
1831 setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1832 // r9 and r10 may be used to save non-volatile registers
1833
1834 {
1835 // UnsafeMemoryAccess page error: continue after unsafe access
1836 UnsafeMemoryAccessMark umam(this, !aligned, true);
1837 // 'from', 'to' and 'count' are now valid
1838 __ movptr(word_count, count);
1839 __ shrptr(count, 2); // count => qword_count
1840
1841 // Copy from low to high addresses. Use 'to' as scratch.
1842 __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
1843 __ lea(end_to, Address(to, qword_count, Address::times_8, -8));
1844 __ negptr(qword_count);
1845 __ jmp(L_copy_bytes);
1846
1847 // Copy trailing qwords
1848 __ BIND(L_copy_8_bytes);
1849 __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
1850 __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
1851 __ increment(qword_count);
1852 __ jcc(Assembler::notZero, L_copy_8_bytes);
1853
1854 // Original 'dest' is trashed, so we can't use it as a
1855 // base register for a possible trailing word copy
1856
1857 // Check for and copy trailing dword
1858 __ BIND(L_copy_4_bytes);
1859 __ testl(word_count, 2);
1860 __ jccb(Assembler::zero, L_copy_2_bytes);
1861 __ movl(rax, Address(end_from, 8));
1862 __ movl(Address(end_to, 8), rax);
1863
1864 __ addptr(end_from, 4);
1865 __ addptr(end_to, 4);
1866
1867 // Check for and copy trailing word
1868 __ BIND(L_copy_2_bytes);
1869 __ testl(word_count, 1);
1870 __ jccb(Assembler::zero, L_exit);
1871 __ movw(rax, Address(end_from, 8));
1872 __ movw(Address(end_to, 8), rax);
1873 }
1874 __ BIND(L_exit);
1875 address ucme_exit_pc = __ pc();
1876 restore_arg_regs();
1877 INC_COUNTER_NP(SharedRuntime::_jshort_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
1878 __ xorptr(rax, rax); // return 0
1879 __ vzeroupper();
1880 __ leave(); // required for proper stackwalking of RuntimeStub frame
1881 __ ret(0);
1882
1883 {
1884 UnsafeMemoryAccessMark umam(this, !aligned, false, ucme_exit_pc);
1885 // Copy in multi-bytes chunks
1886 copy_bytes_forward(end_from, end_to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, T_SHORT);
1887 __ jmp(L_copy_4_bytes);
1888 }
1889
1890 // retrieve the registered handler addresses
1891 address end = __ pc();
1892 retrieve_unsafe_access_handlers(start, end, extras);
1893 assert(extras.length() == expected_handler_count,
1894 "unexpected handler addresses count %d", extras.length());
1895
1896 // record the stub entry and end plus the no_push entry and any
1897 // extra handler addresses
1898 store_archive_data(stub_id, start, end, entries_ptr, &extras);
1899
1900 return start;
1901 }
1902
1903
1904 address StubGenerator::generate_fill(StubId stub_id) {
1905 BasicType t;
1906 bool aligned;
1907 switch (stub_id) {
1908 case StubId::stubgen_jbyte_fill_id:
1909 t = T_BYTE;
1910 aligned = false;
1911 break;
1912 case StubId::stubgen_jshort_fill_id:
1913 t = T_SHORT;
1914 aligned = false;
1915 break;
1916 case StubId::stubgen_jint_fill_id:
1917 t = T_INT;
1918 aligned = false;
1919 break;
1920 case StubId::stubgen_arrayof_jbyte_fill_id:
1921 t = T_BYTE;
1922 aligned = true;
1923 break;
1924 case StubId::stubgen_arrayof_jshort_fill_id:
1925 t = T_SHORT;
1926 aligned = true;
1927 break;
1928 case StubId::stubgen_arrayof_jint_fill_id:
1929 t = T_INT;
1930 aligned = true;
1931 break;
1932 default:
1933 ShouldNotReachHere();
1934 }
1935 int entry_count = StubInfo::entry_count(stub_id);
1936 assert(entry_count == 1, "sanity check");
1937 GrowableArray<address> extras;
1938 bool add_handlers = ((t == T_BYTE) && !aligned);
1939 int handlers_count = (add_handlers ? 1 : 0);
1940 int expected_extras_count = (handlers_count * UnsafeMemoryAccess::COLUMN_COUNT); // 0/1 x UMAM {start,end,handler}
1941 GrowableArray<address>* extras_ptr = (add_handlers ? &extras : nullptr);
1942 address start = load_archive_data(stub_id, nullptr, extras_ptr);
1943 if (start != nullptr) {
1944 assert(extras.length() == expected_extras_count,
1945 "unexpected handler addresses count %d", extras.length());
1946 if (add_handlers) {
1947 // restore 1 x UMAM {start,end,handler} addresses from extras
1948 register_unsafe_access_handlers(extras, 0, 1);
1949 }
1950 return start;
1951 }
1952
1953 __ align(CodeEntryAlignment);
1954 StubCodeMark mark(this, stub_id);
1955 start = __ pc();
1956
1957 BLOCK_COMMENT("Entry:");
1958
1959 const Register to = c_rarg0; // destination array address
1960 const Register value = c_rarg1; // value
1961 const Register count = c_rarg2; // elements count
1962 __ mov(r11, count);
1963
1964 __ enter(); // required for proper stackwalking of RuntimeStub frame
1965
1966 {
1967 // Add set memory mark to protect against unsafe accesses faulting
1968 UnsafeMemoryAccessMark umam(this, add_handlers, true);
1969 __ generate_fill(t, aligned, to, value, r11, rax, xmm0);
1970 }
1971
1972 __ vzeroupper();
1973 __ leave(); // required for proper stackwalking of RuntimeStub frame
1974 __ ret(0);
1975
1976 address end = __ pc();
1977 if (add_handlers) {
1978 retrieve_unsafe_access_handlers(start, end, extras);
1979 }
1980 assert(extras.length() == expected_extras_count,
1981 "unexpected handler addresses count %d", extras.length());
1982 // record the stub entry and end
1983 store_archive_data(stub_id, start, end, nullptr, extras_ptr);
1984
1985 return start;
1986 }
1987
1988
1989 // Arguments:
1990 // entry - location for return of (post-push) entry
1991 // nooverlap_target - entry to branch to if no overlap detected
1992 //
1993 // Inputs:
1994 // c_rarg0 - source array address
1995 // c_rarg1 - destination array address
1996 // c_rarg2 - element count, treated as ssize_t, can be zero
1997 //
1998 // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1999 // let the hardware handle it. The two or four words within dwords
2000 // or qwords that span cache line boundaries will still be loaded
2001 // and stored atomically.
2002 //
2003 address StubGenerator::generate_conjoint_short_copy(address nooverlap_target, address *entry) {
2004 StubId stub_id = StubId::stubgen_jshort_arraycopy_id;
2005 // aligned is always false -- x86_64 always uses the unaligned code
2006 const bool aligned = false;
2007 #if COMPILER2_OR_JVMCI
2008 if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) {
2009 return generate_conjoint_copy_avx3_masked(stub_id, entry, nooverlap_target);
2010 }
2011 #endif
2012 GrowableArray<address> entries;
2013 GrowableArray<address> extras;
2014 int expected_entry_count = (entry != nullptr ? 2 : 1);
2015 int expected_handler_count = (2 * UnsafeMemoryAccess::COLUMN_COUNT); // 2 x UMAM {start,end,handler}
2016 int entry_count = StubInfo::entry_count(stub_id);
2017 assert(entry_count == expected_entry_count, "sanity check");
2018 GrowableArray<address>* entries_ptr = (entry_count == 1 ? nullptr : &entries);
2019 address start = load_archive_data(stub_id, entries_ptr, &extras);
2020 if (start != nullptr) {
2021 assert(entries.length() == expected_entry_count - 1,
2022 "unexpected entry count %d", entries.length());
2023 assert(extras.length() == expected_handler_count,
2024 "unexpected handler addresses count %d", extras.length());
2025 if (entry != nullptr) {
2026 *entry = entries.at(0);
2027 }
2028 // restore 2 UMAM {start,end,handler} addresses from extras
2029 register_unsafe_access_handlers(extras, 0, 2);
2030 return start;
2031 }
2032 __ align(CodeEntryAlignment);
2033 StubCodeMark mark(this, stub_id);
2034 start = __ pc();
2035 DecoratorSet decorators = IN_HEAP | IS_ARRAY;
2036
2037 Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes;
2038 const Register from = rdi; // source array address
2039 const Register to = rsi; // destination array address
2040 const Register count = rdx; // elements count
2041 const Register word_count = rcx;
2042 const Register qword_count = count;
2043
2044 __ enter(); // required for proper stackwalking of RuntimeStub frame
2045 assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int.
2046
2047 if (entry != nullptr) {
2048 *entry = __ pc();
2049 entries.append(*entry);
2050 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2051 BLOCK_COMMENT("Entry:");
2052 }
2053
2054 array_overlap_test(nooverlap_target, Address::times_2);
2055 setup_arg_regs(); // from => rdi, to => rsi, count => rdx
2056 // r9 and r10 may be used to save non-volatile registers
2057
2058 {
2059 // UnsafeMemoryAccess page error: continue after unsafe access
2060 UnsafeMemoryAccessMark umam(this, !aligned, true);
2061 // 'from', 'to' and 'count' are now valid
2062 __ movptr(word_count, count);
2063 __ shrptr(count, 2); // count => qword_count
2064
2065 // Copy from high to low addresses. Use 'to' as scratch.
2066
2067 // Check for and copy trailing word
2068 __ testl(word_count, 1);
2069 __ jccb(Assembler::zero, L_copy_4_bytes);
2070 __ movw(rax, Address(from, word_count, Address::times_2, -2));
2071 __ movw(Address(to, word_count, Address::times_2, -2), rax);
2072
2073 // Check for and copy trailing dword
2074 __ BIND(L_copy_4_bytes);
2075 __ testl(word_count, 2);
2076 __ jcc(Assembler::zero, L_copy_bytes);
2077 __ movl(rax, Address(from, qword_count, Address::times_8));
2078 __ movl(Address(to, qword_count, Address::times_8), rax);
2079 __ jmp(L_copy_bytes);
2080
2081 // Copy trailing qwords
2082 __ BIND(L_copy_8_bytes);
2083 __ movq(rax, Address(from, qword_count, Address::times_8, -8));
2084 __ movq(Address(to, qword_count, Address::times_8, -8), rax);
2085 __ decrement(qword_count);
2086 __ jcc(Assembler::notZero, L_copy_8_bytes);
2087 }
2088 restore_arg_regs();
2089 INC_COUNTER_NP(SharedRuntime::_jshort_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
2090 __ xorptr(rax, rax); // return 0
2091 __ vzeroupper();
2092 __ leave(); // required for proper stackwalking of RuntimeStub frame
2093 __ ret(0);
2094
2095 {
2096 // UnsafeMemoryAccess page error: continue after unsafe access
2097 UnsafeMemoryAccessMark umam(this, !aligned, true);
2098 // Copy in multi-bytes chunks
2099 copy_bytes_backward(from, to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, T_SHORT);
2100 }
2101 restore_arg_regs();
2102 INC_COUNTER_NP(SharedRuntime::_jshort_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
2103 __ xorptr(rax, rax); // return 0
2104 __ vzeroupper();
2105 __ leave(); // required for proper stackwalking of RuntimeStub frame
2106 __ ret(0);
2107
2108 // retrieve the registered handler addresses
2109 address end = __ pc();
2110 retrieve_unsafe_access_handlers(start, end, extras);
2111 assert(extras.length() == expected_handler_count,
2112 "unexpected handler addresses count %d", extras.length());
2113
2114 // record the stub entry and end plus the no_push entry and any
2115 // extra handler addresses
2116 store_archive_data(stub_id, start, end, entries_ptr, &extras);
2117
2118 return start;
2119 }
2120
2121
2122 // Arguments:
2123 // stub_id - unqiue id for stub to generate
2124 // entry - location for return of (post-push) entry
2125 // is_oop - true => oop array, so generate store check code
2126 //
2127 // Inputs:
2128 // c_rarg0 - source array address
2129 // c_rarg1 - destination array address
2130 // c_rarg2 - element count, treated as ssize_t, can be zero
2131 //
2132 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
2133 // the hardware handle it. The two dwords within qwords that span
2134 // cache line boundaries will still be loaded and stored atomically.
2135 //
2136 // Side Effects:
2137 // disjoint_int_copy_entry is set to the no-overlap entry point
2138 // used by generate_conjoint_int_oop_copy().
2139 //
2140 address StubGenerator::generate_disjoint_int_oop_copy(StubId stub_id, address* entry) {
2141 // aligned is always false -- x86_64 always uses the unaligned code
2142 const bool aligned = false;
2143 bool is_oop;
2144 bool dest_uninitialized;
2145 switch (stub_id) {
2146 case StubId::stubgen_jint_disjoint_arraycopy_id:
2147 is_oop = false;
2148 dest_uninitialized = false;
2149 break;
2150 case StubId::stubgen_oop_disjoint_arraycopy_id:
2151 assert(UseCompressedOops, "inconsistent oop copy size!");
2152 is_oop = true;
2153 dest_uninitialized = false;
2154 break;
2155 case StubId::stubgen_oop_disjoint_arraycopy_uninit_id:
2156 assert(UseCompressedOops, "inconsistent oop copy size!");
2157 is_oop = true;
2158 dest_uninitialized = true;
2159 break;
2160 default:
2161 ShouldNotReachHere();
2162 }
2163
2164 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2165 #if COMPILER2_OR_JVMCI
2166 if ((!is_oop || bs->supports_avx3_masked_arraycopy()) && VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) {
2167 return generate_disjoint_copy_avx3_masked(stub_id, entry);
2168 }
2169 #endif
2170 GrowableArray<address> entries;
2171 GrowableArray<address> extras;
2172 bool add_handlers = !is_oop && !aligned;
2173 bool add_relocs = UseZGC && is_oop;
2174 bool add_extras = add_handlers || add_relocs;
2175 int expected_entry_count = (entry != nullptr ? 2 : 1);
2176 int expected_handler_count = (add_handlers ? 2 : 0) * UnsafeMemoryAccess::COLUMN_COUNT; // 0/2 x UMAM {start,end,handler}
2177 int entry_count = StubInfo::entry_count(stub_id);
2178 assert(entry_count == expected_entry_count, "sanity check");
2179 GrowableArray<address>* entries_ptr = (entry_count == 1 ? nullptr : &entries);
2180 GrowableArray<address>* extras_ptr = (add_extras ? &extras : nullptr);
2181 address start = load_archive_data(stub_id, entries_ptr, extras_ptr);
2182 if (start != nullptr) {
2183 assert(entries.length() == expected_entry_count - 1,
2184 "unexpected entry count %d", entries.length());
2185 assert(!add_handlers || extras.length() == expected_handler_count,
2186 "unexpected handler addresses count %d", extras.length());
2187 if (entry != nullptr) {
2188 *entry = entries.at(0);
2189 }
2190 if (add_handlers) {
2191 // restore 2 UMAM {start,end,handler} addresses from extras
2192 register_unsafe_access_handlers(extras, 0, 2);
2193 }
2194 #if INCLUDE_ZGC
2195 // register addresses at which ZGC does colour patching
2196 if (add_relocs) {
2197 register_reloc_addresses(extras, 0, extras.length());
2198 }
2199 #endif // INCLUDE_ZGC
2200 return start;
2201 }
2202
2203 __ align(CodeEntryAlignment);
2204 StubCodeMark mark(this, stub_id);
2205 start = __ pc();
2206
2207 Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_exit;
2208 const Register from = rdi; // source array address
2209 const Register to = rsi; // destination array address
2210 const Register count = rdx; // elements count
2211 const Register dword_count = rcx;
2212 const Register qword_count = count;
2213 const Register end_from = from; // source array end address
2214 const Register end_to = to; // destination array end address
2215 // End pointers are inclusive, and if count is not zero they point
2216 // to the last unit copied: end_to[0] := end_from[0]
2217
2218 __ enter(); // required for proper stackwalking of RuntimeStub frame
2219 assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int.
2220
2221 if (entry != nullptr) {
2222 *entry = __ pc();
2223 entries.append(*entry);
2224 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2225 BLOCK_COMMENT("Entry:");
2226 }
2227
2228 setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
2229 // r9 is used to save r15_thread
2230
2231 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
2232 if (dest_uninitialized) {
2233 decorators |= IS_DEST_UNINITIALIZED;
2234 }
2235 if (aligned) {
2236 decorators |= ARRAYCOPY_ALIGNED;
2237 }
2238
2239 BasicType type = is_oop ? T_OBJECT : T_INT;
2240 bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
2241
2242 {
2243 // UnsafeMemoryAccess page error: continue after unsafe access
2244 UnsafeMemoryAccessMark umam(this, add_handlers, true);
2245 // 'from', 'to' and 'count' are now valid
2246 __ movptr(dword_count, count);
2247 __ shrptr(count, 1); // count => qword_count
2248
2249 // Copy from low to high addresses. Use 'to' as scratch.
2250 __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
2251 __ lea(end_to, Address(to, qword_count, Address::times_8, -8));
2252 __ negptr(qword_count);
2253 __ jmp(L_copy_bytes);
2254
2255 // Copy trailing qwords
2256 __ BIND(L_copy_8_bytes);
2257 __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
2258 __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
2259 __ increment(qword_count);
2260 __ jcc(Assembler::notZero, L_copy_8_bytes);
2261
2262 // Check for and copy trailing dword
2263 __ BIND(L_copy_4_bytes);
2264 __ testl(dword_count, 1); // Only byte test since the value is 0 or 1
2265 __ jccb(Assembler::zero, L_exit);
2266 __ movl(rax, Address(end_from, 8));
2267 __ movl(Address(end_to, 8), rax);
2268 }
2269 __ BIND(L_exit);
2270 address ucme_exit_pc = __ pc();
2271 bs->arraycopy_epilogue(_masm, decorators, type, from, to, dword_count);
2272 restore_arg_regs_using_thread();
2273 INC_COUNTER_NP(SharedRuntime::_jint_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
2274 __ vzeroupper();
2275 __ xorptr(rax, rax); // return 0
2276 __ leave(); // required for proper stackwalking of RuntimeStub frame
2277 __ ret(0);
2278
2279 {
2280 UnsafeMemoryAccessMark umam(this, add_handlers, false, ucme_exit_pc);
2281 // Copy in multi-bytes chunks
2282 copy_bytes_forward(end_from, end_to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, is_oop ? T_OBJECT : T_INT);
2283 __ jmp(L_copy_4_bytes);
2284 }
2285
2286 // retrieve the registered handler addresses
2287 address end = __ pc();
2288 if (add_handlers) {
2289 retrieve_unsafe_access_handlers(start, end, extras);
2290 }
2291 assert(extras.length() == expected_handler_count,
2292 "unexpected handler addresses count %d", extras.length());
2293 #if INCLUDE_ZGC
2294 // retrieve addresses at which ZGC does colour patching
2295 if (add_relocs) {
2296 retrieve_reloc_addresses(start, end, extras);
2297 }
2298 #endif // INCLUDE_ZGC
2299
2300 // record the stub entry and end plus the no_push entry and any
2301 // extra handler addresses
2302 store_archive_data(stub_id, start, end, entries_ptr, extras_ptr);
2303
2304 return start;
2305 }
2306
2307
2308 // Arguments:
2309 // entry - location for return of (post-push) entry
2310 // nooverlap_target - entry to branch to if no overlap detected
2311 // is_oop - true => oop array, so generate store check code
2312 //
2313 // Inputs:
2314 // c_rarg0 - source array address
2315 // c_rarg1 - destination array address
2316 // c_rarg2 - element count, treated as ssize_t, can be zero
2317 //
2318 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
2319 // the hardware handle it. The two dwords within qwords that span
2320 // cache line boundaries will still be loaded and stored atomically.
2321 //
2322 address StubGenerator::generate_conjoint_int_oop_copy(StubId stub_id, address nooverlap_target, address *entry) {
2323 // aligned is always false -- x86_64 always uses the unaligned code
2324 const bool aligned = false;
2325 bool is_oop;
2326 bool dest_uninitialized;
2327 switch (stub_id) {
2328 case StubId::stubgen_jint_arraycopy_id:
2329 is_oop = false;
2330 dest_uninitialized = false;
2331 break;
2332 case StubId::stubgen_oop_arraycopy_id:
2333 assert(UseCompressedOops, "inconsistent oop copy size!");
2334 is_oop = true;
2335 dest_uninitialized = false;
2336 break;
2337 case StubId::stubgen_oop_arraycopy_uninit_id:
2338 assert(UseCompressedOops, "inconsistent oop copy size!");
2339 is_oop = true;
2340 dest_uninitialized = true;
2341 break;
2342 default:
2343 ShouldNotReachHere();
2344 }
2345
2346 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2347 #if COMPILER2_OR_JVMCI
2348 if ((!is_oop || bs->supports_avx3_masked_arraycopy()) && VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) {
2349 return generate_conjoint_copy_avx3_masked(stub_id, entry, nooverlap_target);
2350 }
2351 #endif
2352 bool add_handlers = !is_oop && !aligned;
2353 bool add_relocs = UseZGC && is_oop;
2354 bool add_extras = add_handlers || add_relocs;
2355 GrowableArray<address> entries;
2356 GrowableArray<address> extras;
2357 int expected_entry_count = (entry != nullptr ? 2 : 1);
2358 int expected_handler_count = (add_handlers ? 2 : 0) * UnsafeMemoryAccess::COLUMN_COUNT; // 0/2 x UMAM {start,end,handler}
2359 int entry_count = StubInfo::entry_count(stub_id);
2360 assert(entry_count == expected_entry_count, "sanity check");
2361 GrowableArray<address>* entries_ptr = (entry_count == 1 ? nullptr : &entries);
2362 GrowableArray<address>* extras_ptr = (add_extras ? &extras : nullptr);
2363 address start = load_archive_data(stub_id, entries_ptr, extras_ptr);
2364 if (start != nullptr) {
2365 assert(entries.length() == expected_entry_count - 1,
2366 "unexpected entry count %d", entries.length());
2367 assert(!add_handlers || extras.length() == expected_handler_count,
2368 "unexpected handler addresses count %d", extras.length());
2369 if (entry != nullptr) {
2370 *entry = entries.at(0);
2371 }
2372 if (add_handlers) {
2373 // restore 2 UMAM {start,end,handler} addresses from extras
2374 register_unsafe_access_handlers(extras, 0, 2);
2375 }
2376 #if INCLUDE_ZGC
2377 // register addresses at which ZGC does colour patching
2378 if (add_relocs) {
2379 register_reloc_addresses(extras, 6, extras.length());
2380 }
2381 #endif // INCLUDE_ZGC
2382 return start;
2383 }
2384
2385 __ align(CodeEntryAlignment);
2386 StubCodeMark mark(this, stub_id);
2387 start = __ pc();
2388
2389 Label L_copy_bytes, L_copy_8_bytes, L_exit;
2390 const Register from = rdi; // source array address
2391 const Register to = rsi; // destination array address
2392 const Register count = rdx; // elements count
2393 const Register dword_count = rcx;
2394 const Register qword_count = count;
2395
2396 __ enter(); // required for proper stackwalking of RuntimeStub frame
2397 assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int.
2398
2399 if (entry != nullptr) {
2400 *entry = __ pc();
2401 entries.append(*entry);
2402 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2403 BLOCK_COMMENT("Entry:");
2404 }
2405
2406 array_overlap_test(nooverlap_target, Address::times_4);
2407 setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
2408 // r9 is used to save r15_thread
2409
2410 DecoratorSet decorators = IN_HEAP | IS_ARRAY;
2411 if (dest_uninitialized) {
2412 decorators |= IS_DEST_UNINITIALIZED;
2413 }
2414 if (aligned) {
2415 decorators |= ARRAYCOPY_ALIGNED;
2416 }
2417
2418 BasicType type = is_oop ? T_OBJECT : T_INT;
2419 // no registers are destroyed by this call
2420 bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
2421
2422 assert_clean_int(count, rax); // Make sure 'count' is clean int.
2423 {
2424 // UnsafeMemoryAccess page error: continue after unsafe access
2425 UnsafeMemoryAccessMark umam(this, add_handlers, true);
2426 // 'from', 'to' and 'count' are now valid
2427 __ movptr(dword_count, count);
2428 __ shrptr(count, 1); // count => qword_count
2429
2430 // Copy from high to low addresses. Use 'to' as scratch.
2431
2432 // Check for and copy trailing dword
2433 __ testl(dword_count, 1);
2434 __ jcc(Assembler::zero, L_copy_bytes);
2435 __ movl(rax, Address(from, dword_count, Address::times_4, -4));
2436 __ movl(Address(to, dword_count, Address::times_4, -4), rax);
2437 __ jmp(L_copy_bytes);
2438
2439 // Copy trailing qwords
2440 __ BIND(L_copy_8_bytes);
2441 __ movq(rax, Address(from, qword_count, Address::times_8, -8));
2442 __ movq(Address(to, qword_count, Address::times_8, -8), rax);
2443 __ decrement(qword_count);
2444 __ jcc(Assembler::notZero, L_copy_8_bytes);
2445 }
2446 if (is_oop) {
2447 __ jmp(L_exit);
2448 }
2449 restore_arg_regs_using_thread();
2450 INC_COUNTER_NP(SharedRuntime::_jint_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
2451 __ xorptr(rax, rax); // return 0
2452 __ vzeroupper();
2453 __ leave(); // required for proper stackwalking of RuntimeStub frame
2454 __ ret(0);
2455
2456 {
2457 // UnsafeMemoryAccess page error: continue after unsafe access
2458 UnsafeMemoryAccessMark umam(this, add_handlers, true);
2459 // Copy in multi-bytes chunks
2460 copy_bytes_backward(from, to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, is_oop ? T_OBJECT : T_INT);
2461 }
2462
2463 __ BIND(L_exit);
2464 bs->arraycopy_epilogue(_masm, decorators, type, from, to, dword_count);
2465 restore_arg_regs_using_thread();
2466 INC_COUNTER_NP(SharedRuntime::_jint_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
2467 __ xorptr(rax, rax); // return 0
2468 __ vzeroupper();
2469 __ leave(); // required for proper stackwalking of RuntimeStub frame
2470 __ ret(0);
2471
2472 // retrieve the registered handler addresses
2473 address end = __ pc();
2474 if (add_handlers) {
2475 retrieve_unsafe_access_handlers(start, end, extras);
2476 }
2477 assert(extras.length() == expected_handler_count,
2478 "unexpected handler addresses count %d", extras.length());
2479 #if INCLUDE_ZGC
2480 // retrieve addresses at which ZGC does colour patching
2481 if (add_relocs) {
2482 retrieve_reloc_addresses(start, end, extras);
2483 }
2484 #endif // INCLUDE_ZGC
2485 // record the stub entry and end plus the no_push entry and any
2486 // extra handler addresses
2487 store_archive_data(stub_id, start, end, entries_ptr, extras_ptr);
2488
2489 return start;
2490 }
2491
2492
2493 // Arguments:
2494 // entry - location for return of (post-push) entry
2495 //
2496 // Inputs:
2497 // c_rarg0 - source array address
2498 // c_rarg1 - destination array address
2499 // c_rarg2 - element count, treated as ssize_t, can be zero
2500 //
2501 // Side Effects:
2502 // disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
2503 // no-overlap entry point used by generate_conjoint_long_oop_copy().
2504 //
2505 address StubGenerator::generate_disjoint_long_oop_copy(StubId stub_id, address *entry) {
2506 // aligned is always false -- x86_64 always uses the unaligned code
2507 const bool aligned = false;
2508 bool is_oop;
2509 bool dest_uninitialized;
2510 switch (stub_id) {
2511 case StubId::stubgen_jlong_disjoint_arraycopy_id:
2512 is_oop = false;
2513 dest_uninitialized = false;
2514 break;
2515 case StubId::stubgen_oop_disjoint_arraycopy_id:
2516 assert(!UseCompressedOops, "inconsistent oop copy size!");
2517 is_oop = true;
2518 dest_uninitialized = false;
2519 break;
2520 case StubId::stubgen_oop_disjoint_arraycopy_uninit_id:
2521 assert(!UseCompressedOops, "inconsistent oop copy size!");
2522 is_oop = true;
2523 dest_uninitialized = true;
2524 break;
2525 default:
2526 ShouldNotReachHere();
2527 }
2528
2529 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2530 #if COMPILER2_OR_JVMCI
2531 if ((!is_oop || bs->supports_avx3_masked_arraycopy()) && VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) {
2532 return generate_disjoint_copy_avx3_masked(stub_id, entry);
2533 }
2534 #endif
2535 bool add_handlers = !is_oop && !aligned;
2536 bool add_relocs = UseZGC && is_oop;
2537 bool add_extras = add_handlers || add_relocs;
2538 GrowableArray<address> entries;
2539 GrowableArray<address> extras;
2540 int expected_entry_count = (entry != nullptr ? 2 : 1);
2541 int expected_handler_count = (add_handlers ? 2 : 0) * UnsafeMemoryAccess::COLUMN_COUNT; // 0/2 x UMAM {start,end,handler}
2542 int entry_count = StubInfo::entry_count(stub_id);
2543 assert(entry_count == expected_entry_count, "sanity check");
2544 GrowableArray<address>* entries_ptr = (entry_count == 1 ? nullptr : &entries);
2545 GrowableArray<address>* extras_ptr = (add_extras ? &extras : nullptr);
2546 address start = load_archive_data(stub_id, entries_ptr, extras_ptr);
2547 if (start != nullptr) {
2548 assert(entries.length() == expected_entry_count - 1,
2549 "unexpected entry count %d", entries.length());
2550 assert(!add_handlers || extras.length() == expected_handler_count,
2551 "unexpected handler addresses count %d", extras.length());
2552 if (entry != nullptr) {
2553 *entry = entries.at(0);
2554 }
2555 if (add_handlers) {
2556 // restore 2 UMAM {start,end,handler} addresses from extras
2557 register_unsafe_access_handlers(extras, 0, 2);
2558 }
2559 #if INCLUDE_ZGC
2560 // register addresses at which ZGC does colour patching
2561 if (add_relocs) {
2562 register_reloc_addresses(extras, 0, extras.length());
2563 }
2564 #endif // INCLUDE_ZGC
2565 return start;
2566 }
2567
2568 __ align(CodeEntryAlignment);
2569 StubCodeMark mark(this, stub_id);
2570 start = __ pc();
2571
2572 Label L_copy_bytes, L_copy_8_bytes, L_exit;
2573 const Register from = rdi; // source array address
2574 const Register to = rsi; // destination array address
2575 const Register qword_count = rdx; // elements count
2576 const Register end_from = from; // source array end address
2577 const Register end_to = rcx; // destination array end address
2578 const Register saved_count = r11;
2579 // End pointers are inclusive, and if count is not zero they point
2580 // to the last unit copied: end_to[0] := end_from[0]
2581
2582 __ enter(); // required for proper stackwalking of RuntimeStub frame
2583 // Save no-overlap entry point for generate_conjoint_long_oop_copy()
2584 assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int.
2585
2586 if (entry != nullptr) {
2587 *entry = __ pc();
2588 entries.append(*entry);
2589 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2590 BLOCK_COMMENT("Entry:");
2591 }
2592
2593 setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
2594 // r9 is used to save r15_thread
2595 // 'from', 'to' and 'qword_count' are now valid
2596
2597 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
2598 if (dest_uninitialized) {
2599 decorators |= IS_DEST_UNINITIALIZED;
2600 }
2601 if (aligned) {
2602 decorators |= ARRAYCOPY_ALIGNED;
2603 }
2604
2605 BasicType type = is_oop ? T_OBJECT : T_LONG;
2606 bs->arraycopy_prologue(_masm, decorators, type, from, to, qword_count);
2607 {
2608 // UnsafeMemoryAccess page error: continue after unsafe access
2609 UnsafeMemoryAccessMark umam(this, add_handlers, true);
2610
2611 // Copy from low to high addresses. Use 'to' as scratch.
2612 __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
2613 __ lea(end_to, Address(to, qword_count, Address::times_8, -8));
2614 __ negptr(qword_count);
2615 __ jmp(L_copy_bytes);
2616
2617 // Copy trailing qwords
2618 __ BIND(L_copy_8_bytes);
2619 bs->copy_load_at(_masm, decorators, type, 8,
2620 rax, Address(end_from, qword_count, Address::times_8, 8),
2621 r10);
2622 bs->copy_store_at(_masm, decorators, type, 8,
2623 Address(end_to, qword_count, Address::times_8, 8), rax,
2624 r10);
2625 __ increment(qword_count);
2626 __ jcc(Assembler::notZero, L_copy_8_bytes);
2627 }
2628 if (is_oop) {
2629 __ jmp(L_exit);
2630 } else {
2631 restore_arg_regs_using_thread();
2632 INC_COUNTER_NP(SharedRuntime::_jlong_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
2633 __ xorptr(rax, rax); // return 0
2634 __ vzeroupper();
2635 __ leave(); // required for proper stackwalking of RuntimeStub frame
2636 __ ret(0);
2637 }
2638
2639 {
2640 // UnsafeMemoryAccess page error: continue after unsafe access
2641 UnsafeMemoryAccessMark umam(this, add_handlers, true);
2642 // Copy in multi-bytes chunks
2643 copy_bytes_forward(end_from, end_to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, is_oop ? T_OBJECT : T_LONG);
2644 }
2645
2646 __ BIND(L_exit);
2647 bs->arraycopy_epilogue(_masm, decorators, type, from, to, qword_count);
2648 restore_arg_regs_using_thread();
2649 INC_COUNTER_NP(is_oop ? SharedRuntime::_oop_array_copy_ctr :
2650 SharedRuntime::_jlong_array_copy_ctr,
2651 rscratch1); // Update counter after rscratch1 is free
2652 __ vzeroupper();
2653 __ xorptr(rax, rax); // return 0
2654 __ leave(); // required for proper stackwalking of RuntimeStub frame
2655 __ ret(0);
2656
2657 // retrieve the registered handler addresses
2658 address end = __ pc();
2659 if (add_handlers) {
2660 retrieve_unsafe_access_handlers(start, end, extras);
2661 }
2662 assert(extras.length() == expected_handler_count,
2663 "unexpected handler addresses count %d", extras.length());
2664 #if INCLUDE_ZGC
2665 // retrieve addresses at which ZGC does colour patching
2666 if (add_relocs) {
2667 retrieve_reloc_addresses(start, end, extras);
2668 }
2669 #endif // INCLUDE_ZGC
2670 // record the stub entry and end plus the no_push entry and any
2671 // extra handler addresses
2672 store_archive_data(stub_id, start, end, entries_ptr, extras_ptr);
2673
2674 return start;
2675 }
2676
2677
2678 // Arguments:
2679 // entry - location for return of (post-push) entry
2680 // nooverlap_target - entry to branch to if no overlap detected
2681 // is_oop - true => oop array, so generate store check code
2682 //
2683 // Inputs:
2684 // c_rarg0 - source array address
2685 // c_rarg1 - destination array address
2686 // c_rarg2 - element count, treated as ssize_t, can be zero
2687 //
2688 address StubGenerator::generate_conjoint_long_oop_copy(StubId stub_id, address nooverlap_target, address *entry) {
2689 // aligned is always false -- x86_64 always uses the unaligned code
2690 const bool aligned = false;
2691 bool is_oop;
2692 bool dest_uninitialized;
2693 switch (stub_id) {
2694 case StubId::stubgen_jlong_arraycopy_id:
2695 is_oop = false;
2696 dest_uninitialized = false;
2697 break;
2698 case StubId::stubgen_oop_arraycopy_id:
2699 assert(!UseCompressedOops, "inconsistent oop copy size!");
2700 is_oop = true;
2701 dest_uninitialized = false;
2702 break;
2703 case StubId::stubgen_oop_arraycopy_uninit_id:
2704 assert(!UseCompressedOops, "inconsistent oop copy size!");
2705 is_oop = true;
2706 dest_uninitialized = true;
2707 break;
2708 default:
2709 ShouldNotReachHere();
2710 }
2711
2712 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2713 #if COMPILER2_OR_JVMCI
2714 if ((!is_oop || bs->supports_avx3_masked_arraycopy()) && VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) {
2715 return generate_conjoint_copy_avx3_masked(stub_id, entry, nooverlap_target);
2716 }
2717 #endif
2718 bool add_handlers = !is_oop && !aligned;
2719 bool add_relocs = UseZGC && is_oop;
2720 bool add_extras = add_handlers || add_relocs;
2721 GrowableArray<address> entries;
2722 GrowableArray<address> extras;
2723 int expected_entry_count = (entry != nullptr ? 2 : 1);
2724 int expected_handler_count = (add_handlers ? 2 : 0) * UnsafeMemoryAccess::COLUMN_COUNT; // 0/2 x UMAM {start,end,handler}
2725 int entry_count = StubInfo::entry_count(stub_id);
2726 assert(entry_count == expected_entry_count, "sanity check");
2727 GrowableArray<address>* entries_ptr = (entry_count == 1 ? nullptr : &entries);
2728 GrowableArray<address>* extras_ptr = (add_extras ? &extras : nullptr);
2729 address start = load_archive_data(stub_id, entries_ptr, extras_ptr);
2730 if (start != nullptr) {
2731 assert(entries.length() == expected_entry_count - 1,
2732 "unexpected entry count %d", entries.length());
2733 assert(!add_handlers || extras.length() == expected_handler_count,
2734 "unexpected handler addresses count %d", extras.length());
2735 if (entry != nullptr) {
2736 *entry = entries.at(0);
2737 }
2738 if (add_handlers) {
2739 // restore 2 UMAM {start,end,handler} addresses from extras
2740 register_unsafe_access_handlers(extras, 0, 2);
2741 }
2742 #if INCLUDE_ZGC
2743 // register addresses at which ZGC does colour patching
2744 if (add_relocs) {
2745 register_reloc_addresses(extras, 0, extras.length());
2746 }
2747 #endif // INCLUDE_ZGC
2748 return start;
2749 }
2750
2751 __ align(CodeEntryAlignment);
2752 StubCodeMark mark(this, stub_id);
2753 start = __ pc();
2754
2755 Label L_copy_bytes, L_copy_8_bytes, L_exit;
2756 const Register from = rdi; // source array address
2757 const Register to = rsi; // destination array address
2758 const Register qword_count = rdx; // elements count
2759 const Register saved_count = rcx;
2760
2761 __ enter(); // required for proper stackwalking of RuntimeStub frame
2762 assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int.
2763
2764 if (entry != nullptr) {
2765 *entry = __ pc();
2766 entries.append(*entry);
2767 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2768 BLOCK_COMMENT("Entry:");
2769 }
2770
2771 array_overlap_test(nooverlap_target, Address::times_8);
2772 setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
2773 // r9 is used to save r15_thread
2774 // 'from', 'to' and 'qword_count' are now valid
2775
2776 DecoratorSet decorators = IN_HEAP | IS_ARRAY;
2777 if (dest_uninitialized) {
2778 decorators |= IS_DEST_UNINITIALIZED;
2779 }
2780 if (aligned) {
2781 decorators |= ARRAYCOPY_ALIGNED;
2782 }
2783
2784 BasicType type = is_oop ? T_OBJECT : T_LONG;
2785 bs->arraycopy_prologue(_masm, decorators, type, from, to, qword_count);
2786 {
2787 // UnsafeMemoryAccess page error: continue after unsafe access
2788 UnsafeMemoryAccessMark umam(this, add_handlers, true);
2789
2790 __ jmp(L_copy_bytes);
2791
2792 // Copy trailing qwords
2793 __ BIND(L_copy_8_bytes);
2794 bs->copy_load_at(_masm, decorators, type, 8,
2795 rax, Address(from, qword_count, Address::times_8, -8),
2796 r10);
2797 bs->copy_store_at(_masm, decorators, type, 8,
2798 Address(to, qword_count, Address::times_8, -8), rax,
2799 r10);
2800 __ decrement(qword_count);
2801 __ jcc(Assembler::notZero, L_copy_8_bytes);
2802 }
2803 if (is_oop) {
2804 __ jmp(L_exit);
2805 } else {
2806 restore_arg_regs_using_thread();
2807 INC_COUNTER_NP(SharedRuntime::_jlong_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
2808 __ xorptr(rax, rax); // return 0
2809 __ vzeroupper();
2810 __ leave(); // required for proper stackwalking of RuntimeStub frame
2811 __ ret(0);
2812 }
2813 {
2814 // UnsafeMemoryAccess page error: continue after unsafe access
2815 UnsafeMemoryAccessMark umam(this, add_handlers, true);
2816
2817 // Copy in multi-bytes chunks
2818 copy_bytes_backward(from, to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, is_oop ? T_OBJECT : T_LONG);
2819 }
2820 __ BIND(L_exit);
2821 bs->arraycopy_epilogue(_masm, decorators, type, from, to, qword_count);
2822 restore_arg_regs_using_thread();
2823 INC_COUNTER_NP(is_oop ? SharedRuntime::_oop_array_copy_ctr :
2824 SharedRuntime::_jlong_array_copy_ctr,
2825 rscratch1); // Update counter after rscratch1 is free
2826 __ vzeroupper();
2827 __ xorptr(rax, rax); // return 0
2828 __ leave(); // required for proper stackwalking of RuntimeStub frame
2829 __ ret(0);
2830
2831
2832 // retrieve the registered handler addresses
2833 address end = __ pc();
2834 if (add_handlers) {
2835 retrieve_unsafe_access_handlers(start, end, extras);
2836 }
2837 assert(extras.length() == expected_handler_count,
2838 "unexpected handler addresses count %d", extras.length());
2839 #if INCLUDE_ZGC
2840 // retrieve addresses at which ZGC does colour patching
2841 if ((UseZGC && is_oop)) {
2842 retrieve_reloc_addresses(start, end, extras);
2843 }
2844 #endif // INCLUDE_ZGC
2845 // record the stub entry and end plus the no_push entry and any
2846 // extra handler addresses
2847 store_archive_data(stub_id, start, end, entries_ptr, extras_ptr);
2848
2849 return start;
2850 }
2851
2852
2853 // Helper for generating a dynamic type check.
2854 // Smashes no registers.
2855 void StubGenerator::generate_type_check(Register sub_klass,
2856 Register super_check_offset,
2857 Register super_klass,
2858 Label& L_success) {
2859 assert_different_registers(sub_klass, super_check_offset, super_klass);
2860
2861 BLOCK_COMMENT("type_check:");
2862
2863 Label L_miss;
2864
2865 __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, nullptr,
2866 super_check_offset);
2867 __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, nullptr);
2868
2869 // Fall through on failure!
2870 __ BIND(L_miss);
2871 }
2872
2873 //
2874 // Generate checkcasting array copy stub
2875 //
2876 // Input:
2877 // c_rarg0 - source array address
2878 // c_rarg1 - destination array address
2879 // c_rarg2 - element count, treated as ssize_t, can be zero
2880 // c_rarg3 - size_t ckoff (super_check_offset)
2881 // not Win64
2882 // c_rarg4 - oop ckval (super_klass)
2883 // Win64
2884 // rsp+40 - oop ckval (super_klass)
2885 //
2886 // Output:
2887 // rax == 0 - success
2888 // rax == -1^K - failure, where K is partial transfer count
2889 //
2890 address StubGenerator::generate_checkcast_copy(StubId stub_id, address *entry) {
2891
2892 bool dest_uninitialized;
2893 switch (stub_id) {
2894 case StubId::stubgen_checkcast_arraycopy_id:
2895 dest_uninitialized = false;
2896 break;
2897 case StubId::stubgen_checkcast_arraycopy_uninit_id:
2898 dest_uninitialized = true;
2899 break;
2900 default:
2901 ShouldNotReachHere();
2902 }
2903
2904 GrowableArray<address> entries;
2905 GrowableArray<address> extras;
2906 int expected_entry_count = (entry != nullptr ? 2 : 1);
2907 int entry_count = StubInfo::entry_count(stub_id);
2908 assert(entry_count == expected_entry_count, "sanity check");
2909 GrowableArray<address>* entries_ptr = (entry_count == 1 ? nullptr : &entries);
2910 GrowableArray<address>* extras_ptr = (UseZGC ? &extras : nullptr);
2911 address start = load_archive_data(stub_id, entries_ptr, extras_ptr);
2912 if (start != nullptr) {
2913 assert(entries.length() == expected_entry_count - 1,
2914 "unexpected addresses count %d", entries.length());
2915 if (entry != nullptr) {
2916 *entry = entries.at(0);
2917 }
2918 #if INCLUDE_ZGC
2919 if (UseZGC) {
2920 register_reloc_addresses(extras, 0, extras.length());
2921 }
2922 #endif // INCLUDE_ZGC
2923 return start;
2924 }
2925
2926 Label L_load_element, L_store_element, L_do_card_marks, L_done;
2927
2928 // Input registers (after setup_arg_regs)
2929 const Register from = rdi; // source array address
2930 const Register to = rsi; // destination array address
2931 const Register length = rdx; // elements count
2932 const Register ckoff = rcx; // super_check_offset
2933 const Register ckval = r8; // super_klass
2934
2935 // Registers used as temps (r13, r14 are save-on-entry)
2936 const Register end_from = from; // source array end address
2937 const Register end_to = r13; // destination array end address
2938 const Register count = rdx; // -(count_remaining)
2939 const Register r14_length = r14; // saved copy of length
2940 // End pointers are inclusive, and if length is not zero they point
2941 // to the last unit copied: end_to[0] := end_from[0]
2942
2943 const Register rax_oop = rax; // actual oop copied
2944 const Register r11_klass = r11; // oop._klass
2945
2946 //---------------------------------------------------------------
2947 // Assembler stub will be used for this call to arraycopy
2948 // if the two arrays are subtypes of Object[] but the
2949 // destination array type is not equal to or a supertype
2950 // of the source type. Each element must be separately
2951 // checked.
2952
2953 __ align(CodeEntryAlignment);
2954 StubCodeMark mark(this, stub_id);
2955 start = __ pc();
2956
2957 __ enter(); // required for proper stackwalking of RuntimeStub frame
2958
2959 #ifdef ASSERT
2960 // caller guarantees that the arrays really are different
2961 // otherwise, we would have to make conjoint checks
2962 { Label L;
2963 array_overlap_test(L, TIMES_OOP);
2964 __ stop("checkcast_copy within a single array");
2965 __ bind(L);
2966 }
2967 #endif //ASSERT
2968
2969 setup_arg_regs_using_thread(4); // from => rdi, to => rsi, length => rdx
2970 // ckoff => rcx, ckval => r8
2971 // r9 is used to save r15_thread
2972 #ifdef _WIN64
2973 // last argument (#4) is on stack on Win64
2974 __ movptr(ckval, Address(rsp, 6 * wordSize));
2975 #endif
2976
2977 // Caller of this entry point must set up the argument registers.
2978 if (entry != nullptr) {
2979 *entry = __ pc();
2980 entries.append(*entry);
2981 BLOCK_COMMENT("Entry:");
2982 }
2983
2984 // allocate spill slots for r13, r14
2985 enum {
2986 saved_r13_offset,
2987 saved_r14_offset,
2988 saved_r10_offset,
2989 saved_rbp_offset
2990 };
2991 __ subptr(rsp, saved_rbp_offset * wordSize);
2992 __ movptr(Address(rsp, saved_r13_offset * wordSize), r13);
2993 __ movptr(Address(rsp, saved_r14_offset * wordSize), r14);
2994 __ movptr(Address(rsp, saved_r10_offset * wordSize), r10);
2995
2996 #ifdef ASSERT
2997 Label L2;
2998 __ get_thread_slow(r14);
2999 __ cmpptr(r15_thread, r14);
3000 __ jcc(Assembler::equal, L2);
3001 __ stop("StubRoutines::call_stub: r15_thread is modified by call");
3002 __ bind(L2);
3003 #endif // ASSERT
3004
3005 // check that int operands are properly extended to size_t
3006 assert_clean_int(length, rax);
3007 assert_clean_int(ckoff, rax);
3008
3009 #ifdef ASSERT
3010 BLOCK_COMMENT("assert consistent ckoff/ckval");
3011 // The ckoff and ckval must be mutually consistent,
3012 // even though caller generates both.
3013 { Label L;
3014 int sco_offset = in_bytes(Klass::super_check_offset_offset());
3015 __ cmpl(ckoff, Address(ckval, sco_offset));
3016 __ jcc(Assembler::equal, L);
3017 __ stop("super_check_offset inconsistent");
3018 __ bind(L);
3019 }
3020 #endif //ASSERT
3021
3022 // Loop-invariant addresses. They are exclusive end pointers.
3023 Address end_from_addr(from, length, TIMES_OOP, 0);
3024 Address end_to_addr(to, length, TIMES_OOP, 0);
3025 // Loop-variant addresses. They assume post-incremented count < 0.
3026 Address from_element_addr(end_from, count, TIMES_OOP, 0);
3027 Address to_element_addr(end_to, count, TIMES_OOP, 0);
3028
3029 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
3030 if (dest_uninitialized) {
3031 decorators |= IS_DEST_UNINITIALIZED;
3032 }
3033
3034 BasicType type = T_OBJECT;
3035 size_t element_size = UseCompressedOops ? 4 : 8;
3036
3037 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
3038 bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
3039
3040 // Copy from low to high addresses, indexed from the end of each array.
3041 __ lea(end_from, end_from_addr);
3042 __ lea(end_to, end_to_addr);
3043 __ movptr(r14_length, length); // save a copy of the length
3044 assert(length == count, ""); // else fix next line:
3045 __ negptr(count); // negate and test the length
3046 __ jcc(Assembler::notZero, L_load_element);
3047
3048 // Empty array: Nothing to do.
3049 __ xorptr(rax, rax); // return 0 on (trivial) success
3050 __ jmp(L_done);
3051
3052 // ======== begin loop ========
3053 // (Loop is rotated; its entry is L_load_element.)
3054 // Loop control:
3055 // for (count = -count; count != 0; count++)
3056 // Base pointers src, dst are biased by 8*(count-1),to last element.
3057 __ align(OptoLoopAlignment);
3058
3059 __ BIND(L_store_element);
3060 bs->copy_store_at(_masm,
3061 decorators,
3062 type,
3063 element_size,
3064 to_element_addr,
3065 rax_oop,
3066 r10);
3067 __ increment(count); // increment the count toward zero
3068 __ jcc(Assembler::zero, L_do_card_marks);
3069
3070 // ======== loop entry is here ========
3071 __ BIND(L_load_element);
3072 bs->copy_load_at(_masm,
3073 decorators,
3074 type,
3075 element_size,
3076 rax_oop,
3077 from_element_addr,
3078 r10);
3079 __ testptr(rax_oop, rax_oop);
3080 __ jcc(Assembler::zero, L_store_element);
3081
3082 __ load_klass(r11_klass, rax_oop, rscratch1);// query the object klass
3083 generate_type_check(r11_klass, ckoff, ckval, L_store_element);
3084 // ======== end loop ========
3085
3086 // It was a real error; we must depend on the caller to finish the job.
3087 // Register rdx = -1 * number of *remaining* oops, r14 = *total* oops.
3088 // Emit GC store barriers for the oops we have copied (r14 + rdx),
3089 // and report their number to the caller.
3090 assert_different_registers(rax, r14_length, count, to, end_to, rcx, rscratch1);
3091 Label L_post_barrier;
3092 __ addptr(r14_length, count); // K = (original - remaining) oops
3093 __ movptr(rax, r14_length); // save the value
3094 __ notptr(rax); // report (-1^K) to caller (does not affect flags)
3095 __ jccb(Assembler::notZero, L_post_barrier);
3096 __ jmp(L_done); // K == 0, nothing was copied, skip post barrier
3097
3098 // Come here on success only.
3099 __ BIND(L_do_card_marks);
3100 __ xorptr(rax, rax); // return 0 on success
3101
3102 __ BIND(L_post_barrier);
3103 bs->arraycopy_epilogue(_masm, decorators, type, from, to, r14_length);
3104
3105 // Common exit point (success or failure).
3106 __ BIND(L_done);
3107 __ movptr(r13, Address(rsp, saved_r13_offset * wordSize));
3108 __ movptr(r14, Address(rsp, saved_r14_offset * wordSize));
3109 __ movptr(r10, Address(rsp, saved_r10_offset * wordSize));
3110 restore_arg_regs_using_thread();
3111 INC_COUNTER_NP(SharedRuntime::_checkcast_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
3112 __ leave(); // required for proper stackwalking of RuntimeStub frame
3113 __ ret(0);
3114
3115 address end = __ pc();
3116 #if INCLUDE_ZGC
3117 // retrieve addresses at which ZGC does colour patching
3118 if (UseZGC) {
3119 retrieve_reloc_addresses(start, end, extras);
3120 }
3121 #endif // INCLUDE_ZGC
3122 // record the stub entry and end plus the no_push entry
3123 store_archive_data(stub_id, start, end, entries_ptr, extras_ptr);
3124
3125 return start;
3126 }
3127
3128
3129 // Generate 'unsafe' array copy stub
3130 // Though just as safe as the other stubs, it takes an unscaled
3131 // size_t argument instead of an element count.
3132 //
3133 // Input:
3134 // c_rarg0 - source array address
3135 // c_rarg1 - destination array address
3136 // c_rarg2 - byte count, treated as ssize_t, can be zero
3137 //
3138 // Examines the alignment of the operands and dispatches
3139 // to a long, int, short, or byte copy loop.
3140 //
3141 address StubGenerator::generate_unsafe_copy(address byte_copy_entry, address short_copy_entry,
3142 address int_copy_entry, address long_copy_entry) {
3143
3144 StubId stub_id = StubId::stubgen_unsafe_arraycopy_id;
3145 int entry_count = StubInfo::entry_count(stub_id);
3146 assert(entry_count == 1, "sanity check");
3147 address start = load_archive_data(stub_id);
3148 if (start != nullptr) {
3149 return start;
3150 }
3151
3152 Label L_long_aligned, L_int_aligned, L_short_aligned;
3153
3154 // Input registers (before setup_arg_regs)
3155 const Register from = c_rarg0; // source array address
3156 const Register to = c_rarg1; // destination array address
3157 const Register size = c_rarg2; // byte count (size_t)
3158
3159 // Register used as a temp
3160 const Register bits = rax; // test copy of low bits
3161
3162 __ align(CodeEntryAlignment);
3163 StubCodeMark mark(this, stub_id);
3164 start = __ pc();
3165
3166 __ enter(); // required for proper stackwalking of RuntimeStub frame
3167
3168 // bump this on entry, not on exit:
3169 INC_COUNTER_NP(SharedRuntime::_unsafe_array_copy_ctr, rscratch1);
3170
3171 __ mov(bits, from);
3172 __ orptr(bits, to);
3173 __ orptr(bits, size);
3174
3175 __ testb(bits, BytesPerLong-1);
3176 __ jccb(Assembler::zero, L_long_aligned);
3177
3178 __ testb(bits, BytesPerInt-1);
3179 __ jccb(Assembler::zero, L_int_aligned);
3180
3181 __ testb(bits, BytesPerShort-1);
3182 __ jump_cc(Assembler::notZero, RuntimeAddress(byte_copy_entry));
3183
3184 __ BIND(L_short_aligned);
3185 __ shrptr(size, LogBytesPerShort); // size => short_count
3186 __ jump(RuntimeAddress(short_copy_entry));
3187
3188 __ BIND(L_int_aligned);
3189 __ shrptr(size, LogBytesPerInt); // size => int_count
3190 __ jump(RuntimeAddress(int_copy_entry));
3191
3192 __ BIND(L_long_aligned);
3193 __ shrptr(size, LogBytesPerLong); // size => qword_count
3194 __ jump(RuntimeAddress(long_copy_entry));
3195
3196 // record the stub entry and end plus
3197 store_archive_data(stub_id, start, __ pc());
3198
3199 return start;
3200 }
3201
3202
3203 // Static enum for helper
3204 enum USM_TYPE {USM_SHORT, USM_DWORD, USM_QUADWORD};
3205 // Helper for generate_unsafe_setmemory
3206 //
3207 // Atomically fill an array of memory using 2-, 4-, or 8-byte chunks
3208 static void do_setmemory_atomic_loop(USM_TYPE type, Register dest,
3209 Register size, Register wide_value,
3210 Register tmp, Label& L_exit,
3211 MacroAssembler *_masm) {
3212 Label L_Loop, L_Tail, L_TailLoop;
3213
3214 int shiftval = 0;
3215 int incr = 0;
3216
3217 switch (type) {
3218 case USM_SHORT:
3219 shiftval = 1;
3220 incr = 16;
3221 break;
3222 case USM_DWORD:
3223 shiftval = 2;
3224 incr = 32;
3225 break;
3226 case USM_QUADWORD:
3227 shiftval = 3;
3228 incr = 64;
3229 break;
3230 }
3231
3232 // At this point, we know the lower bits of size are zero
3233 __ shrq(size, shiftval);
3234 // size now has number of X-byte chunks (2, 4 or 8)
3235
3236 // Number of (8*X)-byte chunks into tmp
3237 __ movq(tmp, size);
3238 __ shrq(tmp, 3);
3239 __ jccb(Assembler::zero, L_Tail);
3240
3241 __ BIND(L_Loop);
3242
3243 // Unroll 8 stores
3244 for (int i = 0; i < 8; i++) {
3245 switch (type) {
3246 case USM_SHORT:
3247 __ movw(Address(dest, (2 * i)), wide_value);
3248 break;
3249 case USM_DWORD:
3250 __ movl(Address(dest, (4 * i)), wide_value);
3251 break;
3252 case USM_QUADWORD:
3253 __ movq(Address(dest, (8 * i)), wide_value);
3254 break;
3255 }
3256 }
3257 __ addq(dest, incr);
3258 __ decrementq(tmp);
3259 __ jccb(Assembler::notZero, L_Loop);
3260
3261 __ BIND(L_Tail);
3262
3263 // Find number of remaining X-byte chunks
3264 __ andq(size, 0x7);
3265
3266 // If zero, then we're done
3267 __ jccb(Assembler::zero, L_exit);
3268
3269 __ BIND(L_TailLoop);
3270
3271 switch (type) {
3272 case USM_SHORT:
3273 __ movw(Address(dest, 0), wide_value);
3274 break;
3275 case USM_DWORD:
3276 __ movl(Address(dest, 0), wide_value);
3277 break;
3278 case USM_QUADWORD:
3279 __ movq(Address(dest, 0), wide_value);
3280 break;
3281 }
3282 __ addq(dest, incr >> 3);
3283 __ decrementq(size);
3284 __ jccb(Assembler::notZero, L_TailLoop);
3285 }
3286
3287 // Generate 'unsafe' set memory stub
3288 // Though just as safe as the other stubs, it takes an unscaled
3289 // size_t (# bytes) argument instead of an element count.
3290 //
3291 // Input:
3292 // c_rarg0 - destination array address
3293 // c_rarg1 - byte count (size_t)
3294 // c_rarg2 - byte value
3295 //
3296 // Examines the alignment of the operands and dispatches
3297 // to an int, short, or byte fill loop.
3298 //
3299 address StubGenerator::generate_unsafe_setmemory(address unsafe_byte_fill) {
3300 StubId stub_id = StubId::stubgen_unsafe_setmemory_id;
3301 int entry_count = StubInfo::entry_count(stub_id);
3302 assert(entry_count == 1, "sanity check");
3303 // we expect three set of extra unsafememory access handler entries
3304 GrowableArray<address> extras;
3305 int expected_handler_count = 3 * UnsafeMemoryAccess::COLUMN_COUNT;
3306 address start = load_archive_data(stub_id, nullptr, &extras);
3307 if (start != nullptr) {
3308 assert(extras.length() == expected_handler_count,
3309 "unexpected handler addresses count %d", extras.length());
3310 register_unsafe_access_handlers(extras, 0, 3);
3311 return start;
3312 }
3313
3314 __ align(CodeEntryAlignment);
3315 StubCodeMark mark(this, stub_id);
3316 start = __ pc();
3317 __ enter(); // required for proper stackwalking of RuntimeStub frame
3318
3319 assert(unsafe_byte_fill != nullptr, "Invalid call");
3320
3321 // bump this on entry, not on exit:
3322 INC_COUNTER_NP(SharedRuntime::_unsafe_set_memory_ctr, rscratch1);
3323
3324 {
3325 Label L_exit, L_fillQuadwords, L_fillDwords, L_fillBytes;
3326
3327 const Register dest = c_rarg0;
3328 const Register size = c_rarg1;
3329 const Register byteVal = c_rarg2;
3330 const Register wide_value = rax;
3331 const Register rScratch1 = r10;
3332
3333 assert_different_registers(dest, size, byteVal, wide_value, rScratch1);
3334
3335 // fill_to_memory_atomic(unsigned char*, unsigned long, unsigned char)
3336
3337 __ testq(size, size);
3338 __ jcc(Assembler::zero, L_exit);
3339
3340 // Propagate byte to full Register
3341 __ movzbl(rScratch1, byteVal);
3342 __ mov64(wide_value, 0x0101010101010101ULL);
3343 __ imulq(wide_value, rScratch1);
3344
3345 // Check for pointer & size alignment
3346 __ movq(rScratch1, dest);
3347 __ orq(rScratch1, size);
3348
3349 __ testb(rScratch1, 7);
3350 __ jcc(Assembler::equal, L_fillQuadwords);
3351
3352 __ testb(rScratch1, 3);
3353 __ jcc(Assembler::equal, L_fillDwords);
3354
3355 __ testb(rScratch1, 1);
3356 __ jcc(Assembler::notEqual, L_fillBytes);
3357
3358 // Fill words
3359 {
3360 UnsafeMemoryAccessMark umam(this, true, true);
3361
3362 // At this point, we know the lower bit of size is zero and a
3363 // multiple of 2
3364 do_setmemory_atomic_loop(USM_SHORT, dest, size, wide_value, rScratch1,
3365 L_exit, _masm);
3366 }
3367 __ jmpb(L_exit);
3368
3369 __ BIND(L_fillQuadwords);
3370
3371 // Fill QUADWORDs
3372 {
3373 UnsafeMemoryAccessMark umam(this, true, true);
3374
3375 // At this point, we know the lower 3 bits of size are zero and a
3376 // multiple of 8
3377 do_setmemory_atomic_loop(USM_QUADWORD, dest, size, wide_value, rScratch1,
3378 L_exit, _masm);
3379 }
3380 __ BIND(L_exit);
3381
3382 __ leave(); // required for proper stackwalking of RuntimeStub frame
3383 __ ret(0);
3384
3385 __ BIND(L_fillDwords);
3386
3387 // Fill DWORDs
3388 {
3389 UnsafeMemoryAccessMark umam(this, true, true);
3390
3391 // At this point, we know the lower 2 bits of size are zero and a
3392 // multiple of 4
3393 do_setmemory_atomic_loop(USM_DWORD, dest, size, wide_value, rScratch1,
3394 L_exit, _masm);
3395 }
3396 __ jmpb(L_exit);
3397
3398 __ BIND(L_fillBytes);
3399 // Set up for tail call to previously generated byte fill routine
3400 // Parameter order is (ptr, byteVal, size)
3401 __ xchgq(c_rarg1, c_rarg2);
3402 __ leave(); // Clear effect of enter()
3403 __ jump(RuntimeAddress(unsafe_byte_fill));
3404 }
3405
3406 // retrieve the registered handler addresses
3407 address end = __ pc();
3408 retrieve_unsafe_access_handlers(start, end, extras);
3409 assert(extras.length() == expected_handler_count,
3410 "unexpected handler addresses count %d", extras.length());
3411
3412 // record the stub entry and end plus the no_push entry and any
3413 // extra handler addresses
3414 store_archive_data(stub_id, start, end, nullptr, &extras);
3415
3416 return start;
3417 }
3418
3419 // Perform range checks on the proposed arraycopy.
3420 // Kills temp, but nothing else.
3421 // Also, clean the sign bits of src_pos and dst_pos.
3422 void StubGenerator::arraycopy_range_checks(Register src, // source array oop (c_rarg0)
3423 Register src_pos, // source position (c_rarg1)
3424 Register dst, // destination array oo (c_rarg2)
3425 Register dst_pos, // destination position (c_rarg3)
3426 Register length,
3427 Register temp,
3428 Label& L_failed) {
3429 BLOCK_COMMENT("arraycopy_range_checks:");
3430
3431 // if (src_pos + length > arrayOop(src)->length()) FAIL;
3432 __ movl(temp, length);
3433 __ addl(temp, src_pos); // src_pos + length
3434 __ cmpl(temp, Address(src, arrayOopDesc::length_offset_in_bytes()));
3435 __ jcc(Assembler::above, L_failed);
3436
3437 // if (dst_pos + length > arrayOop(dst)->length()) FAIL;
3438 __ movl(temp, length);
3439 __ addl(temp, dst_pos); // dst_pos + length
3440 __ cmpl(temp, Address(dst, arrayOopDesc::length_offset_in_bytes()));
3441 __ jcc(Assembler::above, L_failed);
3442
3443 // Have to clean up high 32-bits of 'src_pos' and 'dst_pos'.
3444 // Move with sign extension can be used since they are positive.
3445 __ movslq(src_pos, src_pos);
3446 __ movslq(dst_pos, dst_pos);
3447
3448 BLOCK_COMMENT("arraycopy_range_checks done");
3449 }
3450
3451
3452 // Generate generic array copy stubs
3453 //
3454 // Input:
3455 // c_rarg0 - src oop
3456 // c_rarg1 - src_pos (32-bits)
3457 // c_rarg2 - dst oop
3458 // c_rarg3 - dst_pos (32-bits)
3459 // not Win64
3460 // c_rarg4 - element count (32-bits)
3461 // Win64
3462 // rsp+40 - element count (32-bits)
3463 //
3464 // Output:
3465 // rax == 0 - success
3466 // rax == -1^K - failure, where K is partial transfer count
3467 //
3468 address StubGenerator::generate_generic_copy(address byte_copy_entry, address short_copy_entry,
3469 address int_copy_entry, address oop_copy_entry,
3470 address long_copy_entry, address checkcast_copy_entry) {
3471
3472 StubId stub_id = StubId::stubgen_generic_arraycopy_id;
3473 int entry_count = StubInfo::entry_count(stub_id);
3474 assert(entry_count == 1, "sanity check");
3475 address start = load_archive_data(stub_id);
3476 if (start != nullptr) {
3477 return start;
3478 }
3479
3480 Label L_failed, L_failed_0, L_skip_failed_0, L_objArray;
3481 Label L_copy_shorts, L_copy_ints, L_copy_longs;
3482
3483 // Input registers
3484 const Register src = c_rarg0; // source array oop
3485 const Register src_pos = c_rarg1; // source position
3486 const Register dst = c_rarg2; // destination array oop
3487 const Register dst_pos = c_rarg3; // destination position
3488 #ifndef _WIN64
3489 const Register length = c_rarg4;
3490 const Register rklass_tmp = r9; // load_klass
3491 #else
3492 const Address length(rsp, 7 * wordSize); // elements count is on stack on Win64
3493 const Register rklass_tmp = rdi; // load_klass
3494 #endif
3495
3496 StubCodeMark mark(this, stub_id);
3497 __ align(CodeEntryAlignment);
3498 start = __ pc();
3499
3500 __ enter(); // required for proper stackwalking of RuntimeStub frame
3501
3502 #ifdef _WIN64
3503 __ push_ppx(rklass_tmp); // rdi is callee-save on Windows
3504 #endif
3505
3506 // bump this on entry, not on exit:
3507 INC_COUNTER_NP(SharedRuntime::_generic_array_copy_ctr, rscratch1);
3508
3509 //-----------------------------------------------------------------------
3510 // Assembler stub will be used for this call to arraycopy
3511 // if the following conditions are met:
3512 //
3513 // (1) src and dst must not be null.
3514 // (2) src_pos must not be negative.
3515 // (3) dst_pos must not be negative.
3516 // (4) length must not be negative.
3517 // (5) src klass and dst klass should be the same and not null.
3518 // (6) src and dst should be arrays.
3519 // (7) src_pos + length must not exceed length of src.
3520 // (8) dst_pos + length must not exceed length of dst.
3521 //
3522
3523 // if (src == nullptr) return -1;
3524 __ testptr(src, src); // src oop
3525 size_t j1off = __ offset();
3526 __ jccb(Assembler::zero, L_failed_0);
3527
3528 // if (src_pos < 0) return -1;
3529 __ testl(src_pos, src_pos); // src_pos (32-bits)
3530 __ jccb(Assembler::negative, L_failed_0);
3531
3532 // if (dst == nullptr) return -1;
3533 __ testptr(dst, dst); // dst oop
3534 __ jccb(Assembler::zero, L_failed_0);
3535
3536 // if (dst_pos < 0) return -1;
3537 __ testl(dst_pos, dst_pos); // dst_pos (32-bits)
3538 size_t j4off = __ offset();
3539 // skip over the failure trampoline
3540 __ jccb(Assembler::positive, L_skip_failed_0);
3541
3542 // The first four tests are very dense code,
3543 // but not quite dense enough to put four
3544 // jumps in a 16-byte instruction fetch buffer.
3545 // That's good, because some branch predicters
3546 // do not like jumps so close together.
3547 // Make sure of this.
3548 guarantee(((j1off ^ j4off) & ~15) != 0, "I$ line of 1st & 4th jumps");
3549
3550 // Short-hop target to L_failed. Makes for denser prologue code.
3551 __ BIND(L_failed_0);
3552 __ jmp(L_failed);
3553
3554 // continue here if first 4 checks pass
3555 __ bind(L_skip_failed_0);
3556
3557 // registers used as temp
3558 const Register r11_length = r11; // elements count to copy
3559 const Register r10_src_klass = r10; // array klass
3560
3561 // if (length < 0) return -1;
3562 __ movl(r11_length, length); // length (elements count, 32-bits value)
3563 __ testl(r11_length, r11_length);
3564 __ jccb(Assembler::negative, L_failed_0);
3565
3566 __ load_klass(r10_src_klass, src, rklass_tmp);
3567 #ifdef ASSERT
3568 // assert(src->klass() != nullptr);
3569 {
3570 BLOCK_COMMENT("assert klasses not null {");
3571 Label L1, L2;
3572 __ testptr(r10_src_klass, r10_src_klass);
3573 __ jcc(Assembler::notZero, L2); // it is broken if klass is null
3574 __ bind(L1);
3575 __ stop("broken null klass");
3576 __ bind(L2);
3577 __ load_klass(rax, dst, rklass_tmp);
3578 __ cmpq(rax, 0);
3579 __ jcc(Assembler::equal, L1); // this would be broken also
3580 BLOCK_COMMENT("} assert klasses not null done");
3581 }
3582 #endif
3583
3584 // Load layout helper (32-bits)
3585 //
3586 // |array_tag| | header_size | element_type | |log2_element_size|
3587 // 32 30 24 16 8 2 0
3588 //
3589 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
3590 //
3591
3592 const int lh_offset = in_bytes(Klass::layout_helper_offset());
3593
3594 // Handle objArrays completely differently...
3595 const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
3596 __ cmpl(Address(r10_src_klass, lh_offset), objArray_lh);
3597 __ jcc(Assembler::equal, L_objArray);
3598
3599 // if (src->klass() != dst->klass()) return -1;
3600 __ load_klass(rax, dst, rklass_tmp);
3601 __ cmpq(r10_src_klass, rax);
3602 __ jcc(Assembler::notEqual, L_failed);
3603
3604 // Check for flat inline type array -> return -1
3605 __ test_flat_array_oop(src, rax, L_failed);
3606
3607 // Check for null-free (non-flat) inline type array -> handle as object array
3608 __ test_null_free_array_oop(src, rax, L_objArray);
3609
3610 const Register rax_lh = rax; // layout helper
3611 __ movl(rax_lh, Address(r10_src_klass, lh_offset));
3612
3613 // Check for flat inline type array -> return -1
3614 __ testl(rax_lh, Klass::_lh_array_tag_flat_value_bit_inplace);
3615 __ jcc(Assembler::notZero, L_failed);
3616
3617 // if (!src->is_Array()) return -1;
3618 __ cmpl(rax_lh, Klass::_lh_neutral_value);
3619 __ jcc(Assembler::greaterEqual, L_failed);
3620
3621 // At this point, it is known to be a typeArray (array_tag 0x3).
3622 #ifdef ASSERT
3623 {
3624 BLOCK_COMMENT("assert primitive array {");
3625 Label L;
3626 __ movl(rklass_tmp, rax_lh);
3627 __ sarl(rklass_tmp, Klass::_lh_array_tag_shift);
3628 __ cmpl(rklass_tmp, Klass::_lh_array_tag_type_value);
3629 __ jcc(Assembler::equal, L);
3630 __ stop("must be a primitive array");
3631 __ bind(L);
3632 BLOCK_COMMENT("} assert primitive array done");
3633 }
3634 #endif
3635
3636 arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
3637 r10, L_failed);
3638
3639 // TypeArrayKlass
3640 //
3641 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
3642 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
3643 //
3644
3645 const Register r10_offset = r10; // array offset
3646 const Register rax_elsize = rax_lh; // element size
3647
3648 __ movl(r10_offset, rax_lh);
3649 __ shrl(r10_offset, Klass::_lh_header_size_shift);
3650 __ andptr(r10_offset, Klass::_lh_header_size_mask); // array_offset
3651 __ addptr(src, r10_offset); // src array offset
3652 __ addptr(dst, r10_offset); // dst array offset
3653 BLOCK_COMMENT("choose copy loop based on element size");
3654 __ andl(rax_lh, Klass::_lh_log2_element_size_mask); // rax_lh -> rax_elsize
3655
3656 #ifdef _WIN64
3657 __ pop_ppx(rklass_tmp); // Restore callee-save rdi
3658 #endif
3659
3660 // next registers should be set before the jump to corresponding stub
3661 const Register from = c_rarg0; // source array address
3662 const Register to = c_rarg1; // destination array address
3663 const Register count = c_rarg2; // elements count
3664
3665 // 'from', 'to', 'count' registers should be set in such order
3666 // since they are the same as 'src', 'src_pos', 'dst'.
3667
3668 __ cmpl(rax_elsize, 0);
3669 __ jccb(Assembler::notEqual, L_copy_shorts);
3670 __ lea(from, Address(src, src_pos, Address::times_1, 0));// src_addr
3671 __ lea(to, Address(dst, dst_pos, Address::times_1, 0));// dst_addr
3672 __ movl2ptr(count, r11_length); // length
3673 __ jump(RuntimeAddress(byte_copy_entry));
3674
3675 __ BIND(L_copy_shorts);
3676 __ cmpl(rax_elsize, LogBytesPerShort);
3677 __ jccb(Assembler::notEqual, L_copy_ints);
3678 __ lea(from, Address(src, src_pos, Address::times_2, 0));// src_addr
3679 __ lea(to, Address(dst, dst_pos, Address::times_2, 0));// dst_addr
3680 __ movl2ptr(count, r11_length); // length
3681 __ jump(RuntimeAddress(short_copy_entry));
3682
3683 __ BIND(L_copy_ints);
3684 __ cmpl(rax_elsize, LogBytesPerInt);
3685 __ jccb(Assembler::notEqual, L_copy_longs);
3686 __ lea(from, Address(src, src_pos, Address::times_4, 0));// src_addr
3687 __ lea(to, Address(dst, dst_pos, Address::times_4, 0));// dst_addr
3688 __ movl2ptr(count, r11_length); // length
3689 __ jump(RuntimeAddress(int_copy_entry));
3690
3691 __ BIND(L_copy_longs);
3692 #ifdef ASSERT
3693 {
3694 BLOCK_COMMENT("assert long copy {");
3695 Label L;
3696 __ cmpl(rax_elsize, LogBytesPerLong);
3697 __ jcc(Assembler::equal, L);
3698 __ stop("must be long copy, but elsize is wrong");
3699 __ bind(L);
3700 BLOCK_COMMENT("} assert long copy done");
3701 }
3702 #endif
3703 __ lea(from, Address(src, src_pos, Address::times_8, 0));// src_addr
3704 __ lea(to, Address(dst, dst_pos, Address::times_8, 0));// dst_addr
3705 __ movl2ptr(count, r11_length); // length
3706 __ jump(RuntimeAddress(long_copy_entry));
3707
3708 // ObjArrayKlass
3709 __ BIND(L_objArray);
3710 // live at this point: r10_src_klass, r11_length, src[_pos], dst[_pos]
3711
3712 Label L_plain_copy, L_checkcast_copy;
3713 // test array classes for subtyping
3714 __ load_klass(rax, dst, rklass_tmp);
3715 __ cmpq(r10_src_klass, rax); // usual case is exact equality
3716 __ jcc(Assembler::notEqual, L_checkcast_copy);
3717
3718 // Identically typed arrays can be copied without element-wise checks.
3719 arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
3720 r10, L_failed);
3721
3722 __ lea(from, Address(src, src_pos, TIMES_OOP,
3723 arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // src_addr
3724 __ lea(to, Address(dst, dst_pos, TIMES_OOP,
3725 arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // dst_addr
3726 __ movl2ptr(count, r11_length); // length
3727 __ BIND(L_plain_copy);
3728 #ifdef _WIN64
3729 __ pop_ppx(rklass_tmp); // Restore callee-save rdi
3730 #endif
3731 __ jump(RuntimeAddress(oop_copy_entry));
3732
3733 __ BIND(L_checkcast_copy);
3734 // live at this point: r10_src_klass, r11_length, rax (dst_klass)
3735 {
3736 // Before looking at dst.length, make sure dst is also an objArray.
3737 // This check also fails for flat arrays which are not supported.
3738 __ cmpl(Address(rax, lh_offset), objArray_lh);
3739 __ jcc(Assembler::notEqual, L_failed);
3740
3741 #ifdef ASSERT
3742 {
3743 BLOCK_COMMENT("assert not null-free array {");
3744 Label L;
3745 __ test_non_null_free_array_oop(dst, rklass_tmp, L);
3746 __ stop("unexpected null-free array");
3747 __ bind(L);
3748 BLOCK_COMMENT("} assert not null-free array");
3749 }
3750 #endif
3751
3752 // It is safe to examine both src.length and dst.length.
3753 arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
3754 rax, L_failed);
3755
3756 const Register r11_dst_klass = r11;
3757 __ load_klass(r11_dst_klass, dst, rklass_tmp); // reload
3758
3759 // Marshal the base address arguments now, freeing registers.
3760 __ lea(from, Address(src, src_pos, TIMES_OOP,
3761 arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
3762 __ lea(to, Address(dst, dst_pos, TIMES_OOP,
3763 arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
3764 __ movl(count, length); // length (reloaded)
3765 Register sco_temp = c_rarg3; // this register is free now
3766 assert_different_registers(from, to, count, sco_temp,
3767 r11_dst_klass, r10_src_klass);
3768 assert_clean_int(count, sco_temp);
3769
3770 // Generate the type check.
3771 const int sco_offset = in_bytes(Klass::super_check_offset_offset());
3772 __ movl(sco_temp, Address(r11_dst_klass, sco_offset));
3773 assert_clean_int(sco_temp, rax);
3774 generate_type_check(r10_src_klass, sco_temp, r11_dst_klass, L_plain_copy);
3775
3776 // Fetch destination element klass from the ObjArrayKlass header.
3777 int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
3778 __ movptr(r11_dst_klass, Address(r11_dst_klass, ek_offset));
3779 __ movl( sco_temp, Address(r11_dst_klass, sco_offset));
3780 assert_clean_int(sco_temp, rax);
3781
3782 #ifdef _WIN64
3783 __ pop_ppx(rklass_tmp); // Restore callee-save rdi
3784 #endif
3785
3786 // the checkcast_copy loop needs two extra arguments:
3787 assert(c_rarg3 == sco_temp, "#3 already in place");
3788 // Set up arguments for checkcast_copy_entry.
3789 setup_arg_regs_using_thread(4);
3790 __ movptr(r8, r11_dst_klass); // dst.klass.element_klass, r8 is c_rarg4 on Linux/Solaris
3791 __ jump(RuntimeAddress(checkcast_copy_entry));
3792 }
3793
3794 __ BIND(L_failed);
3795 #ifdef _WIN64
3796 __ pop_ppx(rklass_tmp); // Restore callee-save rdi
3797 #endif
3798 __ xorptr(rax, rax);
3799 __ notptr(rax); // return -1
3800 __ leave(); // required for proper stackwalking of RuntimeStub frame
3801 __ ret(0);
3802
3803 // record the stub entry and end
3804 store_archive_data(stub_id, start, __ pc());
3805
3806 return start;
3807 }
3808
3809 #undef __