1 /*
2 * Copyright (c) 2003, 2025, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.
8 *
9 * This code is distributed in the hope that it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12 * version 2 for more details (a copy is included in the LICENSE file that
13 * accompanied this code).
14 *
15 * You should have received a copy of the GNU General Public License version
16 * 2 along with this work; if not, write to the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18 *
19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20 * or visit www.oracle.com if you need additional information or have any
21 * questions.
22 *
23 */
24
25 #include "asm/macroAssembler.hpp"
26 #include "gc/shared/barrierSet.hpp"
27 #include "gc/shared/barrierSetAssembler.hpp"
28 #include "oops/objArrayKlass.hpp"
29 #include "runtime/sharedRuntime.hpp"
30 #include "runtime/stubRoutines.hpp"
31 #include "stubGenerator_x86_64.hpp"
32 #ifdef COMPILER2
33 #include "opto/c2_globals.hpp"
34 #endif
35 #if INCLUDE_JVMCI
36 #include "jvmci/jvmci_globals.hpp"
37 #endif
38
39 #define __ _masm->
40
41 #define TIMES_OOP (UseCompressedOops ? Address::times_4 : Address::times_8)
42
43 #ifdef PRODUCT
44 #define BLOCK_COMMENT(str) /* nothing */
45 #else
46 #define BLOCK_COMMENT(str) __ block_comment(str)
47 #endif // PRODUCT
48
49 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
50
51 #ifdef PRODUCT
52 #define INC_COUNTER_NP(counter, rscratch) ((void)0)
53 #else
54 #define INC_COUNTER_NP(counter, rscratch) \
55 BLOCK_COMMENT("inc_counter " #counter); \
56 inc_counter_np(_masm, counter, rscratch);
57
// Emit code to bump a 32-bit stub-profiling counter in place. The
// counter lives in static storage, so its address is wrapped in an
// ExternalAddress; rscratch is a scratch register available to
// MacroAssembler::incrementl (presumably for materializing the
// address when it is not directly reachable — see incrementl).
static void inc_counter_np(MacroAssembler* _masm, uint& counter, Register rscratch) {
  __ incrementl(ExternalAddress((address)&counter), rscratch);
}
61
62 #if COMPILER2_OR_JVMCI
63 static uint& get_profile_ctr(int shift) {
64 if (shift == 0) {
65 return SharedRuntime::_jbyte_array_copy_ctr;
66 } else if (shift == 1) {
67 return SharedRuntime::_jshort_array_copy_ctr;
68 } else if (shift == 2) {
69 return SharedRuntime::_jint_array_copy_ctr;
70 } else {
71 assert(shift == 3, "");
72 return SharedRuntime::_jlong_array_copy_ctr;
73 }
74 }
75 #endif // COMPILER2_OR_JVMCI
76 #endif // !PRODUCT
77
void StubGenerator::generate_arraycopy_stubs() {
  // Some copy stubs publish a normal entry and then a 2nd 'fallback'
  // entry immediately following their stack push. This can be used
  // as a post-push branch target for compatible stubs when they
  // identify a special case that can be handled by the fallback
  // stub e.g. a disjoint copy stub may be used as a special case
  // fallback for its compatible conjoint copy stub.
  //
  // A no push entry is always returned in the following local and
  // then published by assigning to the appropriate entry field in
  // class StubRoutines. The entry value is then passed to the
  // generator for the compatible stub. That means the entry must be
  // listed when saving to/restoring from the AOT cache, ensuring
  // that the inter-stub jumps are noted at AOT-cache save and
  // relocated at AOT cache load.
  address nopush_entry;

  // Primitive element copies: each conjoint stub receives its disjoint
  // counterpart's no-push entry so it can tail-branch there when the
  // ranges turn out not to overlap.
  StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(&nopush_entry);
  // disjoint nopush entry is needed by conjoint copy
  StubRoutines::_jbyte_disjoint_arraycopy_nopush = nopush_entry;
  StubRoutines::_jbyte_arraycopy = generate_conjoint_byte_copy(StubRoutines::_jbyte_disjoint_arraycopy_nopush, &nopush_entry);
  // conjoint nopush entry is needed by generic/unsafe copy
  StubRoutines::_jbyte_arraycopy_nopush = nopush_entry;

  StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(&nopush_entry);
  // disjoint nopush entry is needed by conjoint copy
  StubRoutines::_jshort_disjoint_arraycopy_nopush = nopush_entry;
  StubRoutines::_jshort_arraycopy = generate_conjoint_short_copy(StubRoutines::_jshort_disjoint_arraycopy_nopush, &nopush_entry);
  // conjoint nopush entry is needed by generic/unsafe copy
  StubRoutines::_jshort_arraycopy_nopush = nopush_entry;

  StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_int_oop_copy(StubId::stubgen_jint_disjoint_arraycopy_id, &nopush_entry);
  // disjoint nopush entry is needed by conjoint copy
  StubRoutines::_jint_disjoint_arraycopy_nopush = nopush_entry;
  StubRoutines::_jint_arraycopy = generate_conjoint_int_oop_copy(StubId::stubgen_jint_arraycopy_id, StubRoutines::_jint_disjoint_arraycopy_nopush, &nopush_entry);
  // conjoint nopush entry is needed by generic/unsafe copy
  StubRoutines::_jint_arraycopy_nopush = nopush_entry;

  StubRoutines::_jlong_disjoint_arraycopy = generate_disjoint_long_oop_copy(StubId::stubgen_jlong_disjoint_arraycopy_id, &nopush_entry);
  // disjoint nopush entry is needed by conjoint copy
  StubRoutines::_jlong_disjoint_arraycopy_nopush = nopush_entry;
  StubRoutines::_jlong_arraycopy = generate_conjoint_long_oop_copy(StubId::stubgen_jlong_arraycopy_id, StubRoutines::_jlong_disjoint_arraycopy_nopush, &nopush_entry);
  // conjoint nopush entry is needed by generic/unsafe copy
  StubRoutines::_jlong_arraycopy_nopush = nopush_entry;

  // Oop copies reuse the int-element generators when oops are compressed
  // (4-byte elements) and the long-element generators otherwise (8-byte),
  // distinguished only by the StubId passed in.
  if (UseCompressedOops) {
    StubRoutines::_oop_disjoint_arraycopy = generate_disjoint_int_oop_copy(StubId::stubgen_oop_disjoint_arraycopy_id, &nopush_entry);
    // disjoint nopush entry is needed by conjoint copy
    StubRoutines::_oop_disjoint_arraycopy_nopush = nopush_entry;
    StubRoutines::_oop_arraycopy = generate_conjoint_int_oop_copy(StubId::stubgen_oop_arraycopy_id, StubRoutines::_oop_disjoint_arraycopy_nopush, &nopush_entry);
    // conjoint nopush entry is needed by generic/unsafe copy
    StubRoutines::_oop_arraycopy_nopush = nopush_entry;
    StubRoutines::_oop_disjoint_arraycopy_uninit = generate_disjoint_int_oop_copy(StubId::stubgen_oop_disjoint_arraycopy_uninit_id, &nopush_entry);
    // disjoint nopush entry is needed by conjoint copy
    StubRoutines::_oop_disjoint_arraycopy_uninit_nopush = nopush_entry;
    // note that we don't need a returned nopush entry because the
    // generic/unsafe copy does not cater for uninit arrays.
    StubRoutines::_oop_arraycopy_uninit = generate_conjoint_int_oop_copy(StubId::stubgen_oop_arraycopy_uninit_id, StubRoutines::_oop_disjoint_arraycopy_uninit_nopush, nullptr);
  } else {
    StubRoutines::_oop_disjoint_arraycopy = generate_disjoint_long_oop_copy(StubId::stubgen_oop_disjoint_arraycopy_id, &nopush_entry);
    // disjoint nopush entry is needed by conjoint copy
    StubRoutines::_oop_disjoint_arraycopy_nopush = nopush_entry;
    StubRoutines::_oop_arraycopy = generate_conjoint_long_oop_copy(StubId::stubgen_oop_arraycopy_id, StubRoutines::_oop_disjoint_arraycopy_nopush, &nopush_entry);
    // conjoint nopush entry is needed by generic/unsafe copy
    StubRoutines::_oop_arraycopy_nopush = nopush_entry;
    StubRoutines::_oop_disjoint_arraycopy_uninit = generate_disjoint_long_oop_copy(StubId::stubgen_oop_disjoint_arraycopy_uninit_id, &nopush_entry);
    // disjoint nopush entry is needed by conjoint copy
    StubRoutines::_oop_disjoint_arraycopy_uninit_nopush = nopush_entry;
    // note that we don't need a returned nopush entry because the
    // generic/unsafe copy does not cater for uninit arrays.
    StubRoutines::_oop_arraycopy_uninit = generate_conjoint_long_oop_copy(StubId::stubgen_oop_arraycopy_uninit_id, StubRoutines::_oop_disjoint_arraycopy_uninit_nopush, nullptr);
  }

  StubRoutines::_checkcast_arraycopy = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_id, &nopush_entry);
  // checkcast nopush entry is needed by generic copy
  StubRoutines::_checkcast_arraycopy_nopush = nopush_entry;
  // note that we don't need a returned nopush entry because the
  // generic copy does not cater for uninit arrays.
  StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_uninit_id, nullptr);

  // The unsafe/generic entry points dispatch to the element-typed
  // conjoint stubs via their no-push entries, so all of the above must
  // already be generated.
  StubRoutines::_unsafe_arraycopy = generate_unsafe_copy(StubRoutines::_jbyte_arraycopy_nopush,
                                                         StubRoutines::_jshort_arraycopy_nopush,
                                                         StubRoutines::_jint_arraycopy_nopush,
                                                         StubRoutines::_jlong_arraycopy_nopush);
  StubRoutines::_generic_arraycopy = generate_generic_copy(StubRoutines::_jbyte_arraycopy_nopush,
                                                           StubRoutines::_jshort_arraycopy_nopush,
                                                           StubRoutines::_jint_arraycopy_nopush,
                                                           StubRoutines::_oop_arraycopy_nopush,
                                                           StubRoutines::_jlong_arraycopy_nopush,
                                                           StubRoutines::_checkcast_arraycopy_nopush);

  StubRoutines::_jbyte_fill = generate_fill(StubId::stubgen_jbyte_fill_id);
  StubRoutines::_jshort_fill = generate_fill(StubId::stubgen_jshort_fill_id);
  StubRoutines::_jint_fill = generate_fill(StubId::stubgen_jint_fill_id);
  StubRoutines::_arrayof_jbyte_fill = generate_fill(StubId::stubgen_arrayof_jbyte_fill_id);
  StubRoutines::_arrayof_jshort_fill = generate_fill(StubId::stubgen_arrayof_jshort_fill_id);
  StubRoutines::_arrayof_jint_fill = generate_fill(StubId::stubgen_arrayof_jint_fill_id);

  StubRoutines::_unsafe_setmemory = generate_unsafe_setmemory(StubRoutines::_jbyte_fill);

  // We don't generate specialized code for HeapWord-aligned source
  // arrays, so just use the code we've already generated
  StubRoutines::_arrayof_jbyte_disjoint_arraycopy  = StubRoutines::_jbyte_disjoint_arraycopy;
  StubRoutines::_arrayof_jbyte_arraycopy           = StubRoutines::_jbyte_arraycopy;

  StubRoutines::_arrayof_jshort_disjoint_arraycopy = StubRoutines::_jshort_disjoint_arraycopy;
  StubRoutines::_arrayof_jshort_arraycopy          = StubRoutines::_jshort_arraycopy;

  StubRoutines::_arrayof_jint_disjoint_arraycopy   = StubRoutines::_jint_disjoint_arraycopy;
  StubRoutines::_arrayof_jint_arraycopy            = StubRoutines::_jint_arraycopy;

  StubRoutines::_arrayof_jlong_disjoint_arraycopy  = StubRoutines::_jlong_disjoint_arraycopy;
  StubRoutines::_arrayof_jlong_arraycopy           = StubRoutines::_jlong_arraycopy;

  StubRoutines::_arrayof_oop_disjoint_arraycopy    = StubRoutines::_oop_disjoint_arraycopy;
  StubRoutines::_arrayof_oop_arraycopy             = StubRoutines::_oop_arraycopy;

  StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit    = StubRoutines::_oop_disjoint_arraycopy_uninit;
  StubRoutines::_arrayof_oop_arraycopy_uninit             = StubRoutines::_oop_arraycopy_uninit;
}
198
199
// Verify that a register contains clean 32-bits positive value
// (high 32-bits are 0) so it could be used in 64-bits shifts.
//
// Input:
//    Rint  -  32-bits value
//    Rtmp  -  scratch
//
// Debug-only: emits nothing in product builds.
void StubGenerator::assert_clean_int(Register Rint, Register Rtmp) {
#ifdef ASSERT
  Label L;
  assert_different_registers(Rtmp, Rint);
  // Sign-extend the low 32 bits of Rint into Rtmp and compare with the
  // full 64-bit Rint: they are equal iff bits 63..32 of Rint are zero
  // and bit 31 is clear (i.e. a clean, non-negative 32-bit value).
  __ movslq(Rtmp, Rint);
  __ cmpq(Rtmp, Rint);
  __ jcc(Assembler::equal, L);
  __ stop("high 32-bits of int value are not 0");
  __ bind(L);
#endif
}
218
219
// Generate overlap test for array copy stubs
//
// Branches to the no-overlap code when a forward (disjoint-style) copy
// is safe, i.e. when to <= from, or when to >= from + count*scale so
// the ranges do not intersect. Otherwise execution falls through to the
// (conjoint) code that follows.
//
// Input:
//   c_rarg0 - from
//   c_rarg1 - to
//   c_rarg2 - element count
//
// Output:
//   rax - from + count*scale, i.e. the exclusive end of the source
//         range (&from[element count])
//
// Exactly one of no_overlap_target (an out-of-line stub address) and
// NOLp (a local label) must be supplied as the no-overlap destination.
void StubGenerator::array_overlap_test(address no_overlap_target, Label* NOLp, Address::ScaleFactor sf) {
  const Register from     = c_rarg0;
  const Register to       = c_rarg1;
  const Register count    = c_rarg2;
  const Register end_from = rax;

  __ cmpptr(to, from);
  // lea does not disturb flags, so the cmpptr result is still live for
  // the first conditional branch below.
  __ lea(end_from, Address(from, count, sf, 0));
  if (NOLp == nullptr) {
    RuntimeAddress no_overlap(no_overlap_target);
    __ jump_cc(Assembler::belowEqual, no_overlap);
    __ cmpptr(to, end_from);
    __ jump_cc(Assembler::aboveEqual, no_overlap);
  } else {
    __ jcc(Assembler::belowEqual, (*NOLp));
    __ cmpptr(to, end_from);
    __ jcc(Assembler::aboveEqual, (*NOLp));
  }
}
249
250
// Copy big chunks forward
//
// Inputs:
//   end_from     - source arrays end address
//   end_to       - destination array end address
//   qword_count  - 64-bits element count, negative
//   tmp1         - scratch
//   L_copy_bytes - entry label
//   L_copy_8_bytes  - exit label
//
// Addressing is end-relative: elements are accessed as
// Address(end_from/end_to, qword_count, times_8, disp) with a negative
// qword_count that is incremented toward zero, so the copy proceeds
// forward through memory. On exit, any remaining (< 4) trailing qwords
// are left for the caller's L_copy_8_bytes loop.
void StubGenerator::copy_bytes_forward(Register end_from, Register end_to,
                                       Register qword_count, Register tmp1,
                                       Register tmp2, Label& L_copy_bytes,
                                       Label& L_copy_8_bytes, DecoratorSet decorators,
                                       BasicType type) {
  BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  // Callers must jump in at L_copy_bytes; falling through from above is a bug.
  DEBUG_ONLY(__ stop("enter at entry label, not here"));
  Label L_loop;
  __ align(OptoLoopAlignment);
  if (UseUnalignedLoadStores) {
    Label L_end;
    // Main loop: 64 bytes (8 qwords) per iteration.
    __ BIND(L_loop);
    if (UseAVX >= 2) {
      // Two 32-byte YMM transfers per iteration.
      bs->copy_load_at(_masm, decorators, type, 32,
                       xmm0, Address(end_from, qword_count, Address::times_8, -56),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 32,
                        Address(end_to, qword_count, Address::times_8, -56), xmm0,
                        tmp1, tmp2, xmm1);

      bs->copy_load_at(_masm, decorators, type, 32,
                       xmm0, Address(end_from, qword_count, Address::times_8, -24),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 32,
                        Address(end_to, qword_count, Address::times_8, -24), xmm0,
                        tmp1, tmp2, xmm1);
    } else {
      // Four 16-byte XMM transfers per iteration.
      bs->copy_load_at(_masm, decorators, type, 16,
                       xmm0, Address(end_from, qword_count, Address::times_8, -56),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 16,
                        Address(end_to, qword_count, Address::times_8, -56), xmm0,
                        tmp1, tmp2, xmm1);
      bs->copy_load_at(_masm, decorators, type, 16,
                       xmm0, Address(end_from, qword_count, Address::times_8, -40),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 16,
                        Address(end_to, qword_count, Address::times_8, -40), xmm0,
                        tmp1, tmp2, xmm1);
      bs->copy_load_at(_masm, decorators, type, 16,
                       xmm0, Address(end_from, qword_count, Address::times_8, -24),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 16,
                        Address(end_to, qword_count, Address::times_8, -24), xmm0,
                        tmp1, tmp2, xmm1);
      bs->copy_load_at(_masm, decorators, type, 16,
                       xmm0, Address(end_from, qword_count, Address::times_8, -8),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 16,
                        Address(end_to, qword_count, Address::times_8, -8), xmm0,
                        tmp1, tmp2, xmm1);
    }

    // Loop control: advance by 8 qwords; stay in the loop while the
    // (negative) count has not passed zero.
    __ BIND(L_copy_bytes);
    __ addptr(qword_count, 8);
    __ jcc(Assembler::lessEqual, L_loop);
    __ subptr(qword_count, 4);  // sub(8) and add(4)
    __ jcc(Assembler::greater, L_end);
    // Copy trailing 32 bytes
    if (UseAVX >= 2) {
      bs->copy_load_at(_masm, decorators, type, 32,
                       xmm0, Address(end_from, qword_count, Address::times_8, -24),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 32,
                        Address(end_to, qword_count, Address::times_8, -24), xmm0,
                        tmp1, tmp2, xmm1);
    } else {
      bs->copy_load_at(_masm, decorators, type, 16,
                       xmm0, Address(end_from, qword_count, Address::times_8, -24),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 16,
                        Address(end_to, qword_count, Address::times_8, -24), xmm0,
                        tmp1, tmp2, xmm1);
      bs->copy_load_at(_masm, decorators, type, 16,
                       xmm0, Address(end_from, qword_count, Address::times_8, -8),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 16,
                        Address(end_to, qword_count, Address::times_8, -8), xmm0,
                        tmp1, tmp2, xmm1);
    }
    __ addptr(qword_count, 4);
    __ BIND(L_end);
  } else {
    // Copy 32-bytes per iteration
    __ BIND(L_loop);
    bs->copy_load_at(_masm, decorators, type, 8,
                     tmp1, Address(end_from, qword_count, Address::times_8, -24),
                     tmp2);
    bs->copy_store_at(_masm, decorators, type, 8,
                      Address(end_to, qword_count, Address::times_8, -24), tmp1,
                      tmp2);
    bs->copy_load_at(_masm, decorators, type, 8,
                     tmp1, Address(end_from, qword_count, Address::times_8, -16),
                     tmp2);
    bs->copy_store_at(_masm, decorators, type, 8,
                      Address(end_to, qword_count, Address::times_8, -16), tmp1,
                      tmp2);
    bs->copy_load_at(_masm, decorators, type, 8,
                     tmp1, Address(end_from, qword_count, Address::times_8, -8),
                     tmp2);
    bs->copy_store_at(_masm, decorators, type, 8,
                      Address(end_to, qword_count, Address::times_8, -8), tmp1,
                      tmp2);
    bs->copy_load_at(_masm, decorators, type, 8,
                     tmp1, Address(end_from, qword_count, Address::times_8, 0),
                     tmp2);
    bs->copy_store_at(_masm, decorators, type, 8,
                      Address(end_to, qword_count, Address::times_8, 0), tmp1,
                      tmp2);

    __ BIND(L_copy_bytes);
    __ addptr(qword_count, 4);
    __ jcc(Assembler::lessEqual, L_loop);
  }
  // Undo the last advance; if any qwords remain, branch out to the
  // caller's single-qword tail loop.
  __ subptr(qword_count, 4);
  __ jcc(Assembler::less, L_copy_8_bytes); // Copy trailing qwords
}
378
379
// Copy big chunks backward
//
// Inputs:
//   from         - source arrays address
//   dest         - destination array address
//   qword_count  - 64-bits element count (positive; counted down here)
//   tmp1         - scratch
//   L_copy_bytes - entry label
//   L_copy_8_bytes  - exit label
//
// Mirror image of copy_bytes_forward: elements are addressed as
// Address(from/dest, qword_count, times_8, disp) and qword_count is
// decremented toward zero, so the copy proceeds backward through
// memory (required for overlapping conjoint copies). On exit, any
// remaining (< 4) leading qwords are left for the caller's
// L_copy_8_bytes loop.
void StubGenerator::copy_bytes_backward(Register from, Register dest,
                                        Register qword_count, Register tmp1,
                                        Register tmp2, Label& L_copy_bytes,
                                        Label& L_copy_8_bytes, DecoratorSet decorators,
                                        BasicType type) {
  BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  // Callers must jump in at L_copy_bytes; falling through from above is a bug.
  DEBUG_ONLY(__ stop("enter at entry label, not here"));
  Label L_loop;
  __ align(OptoLoopAlignment);
  if (UseUnalignedLoadStores) {
    Label L_end;
    // Main loop: 64 bytes (8 qwords) per iteration, highest address first.
    __ BIND(L_loop);
    if (UseAVX >= 2) {
      // Two 32-byte YMM transfers per iteration.
      bs->copy_load_at(_masm, decorators, type, 32,
                       xmm0, Address(from, qword_count, Address::times_8, 32),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 32,
                        Address(dest, qword_count, Address::times_8, 32), xmm0,
                        tmp1, tmp2, xmm1);
      bs->copy_load_at(_masm, decorators, type, 32,
                       xmm0, Address(from, qword_count, Address::times_8, 0),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 32,
                        Address(dest, qword_count, Address::times_8, 0), xmm0,
                        tmp1, tmp2, xmm1);
    } else {
      // Four 16-byte XMM transfers per iteration.
      bs->copy_load_at(_masm, decorators, type, 16,
                       xmm0, Address(from, qword_count, Address::times_8, 48),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 16,
                        Address(dest, qword_count, Address::times_8, 48), xmm0,
                        tmp1, tmp2, xmm1);
      bs->copy_load_at(_masm, decorators, type, 16,
                       xmm0, Address(from, qword_count, Address::times_8, 32),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 16,
                        Address(dest, qword_count, Address::times_8, 32), xmm0,
                        tmp1, tmp2, xmm1);
      bs->copy_load_at(_masm, decorators, type, 16,
                       xmm0, Address(from, qword_count, Address::times_8, 16),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 16,
                        Address(dest, qword_count, Address::times_8, 16), xmm0,
                        tmp1, tmp2, xmm1);
      bs->copy_load_at(_masm, decorators, type, 16,
                       xmm0, Address(from, qword_count, Address::times_8, 0),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 16,
                        Address(dest, qword_count, Address::times_8, 0), xmm0,
                        tmp1, tmp2, xmm1);
    }

    // Loop control: retreat by 8 qwords; stay in the loop while at
    // least 8 remain.
    __ BIND(L_copy_bytes);
    __ subptr(qword_count, 8);
    __ jcc(Assembler::greaterEqual, L_loop);

    __ addptr(qword_count, 4);  // add(8) and sub(4)
    __ jcc(Assembler::less, L_end);
    // Copy trailing 32 bytes
    if (UseAVX >= 2) {
      bs->copy_load_at(_masm, decorators, type, 32,
                       xmm0, Address(from, qword_count, Address::times_8, 0),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 32,
                        Address(dest, qword_count, Address::times_8, 0), xmm0,
                        tmp1, tmp2, xmm1);
    } else {
      bs->copy_load_at(_masm, decorators, type, 16,
                       xmm0, Address(from, qword_count, Address::times_8, 16),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 16,
                        Address(dest, qword_count, Address::times_8, 16), xmm0,
                        tmp1, tmp2, xmm1);
      bs->copy_load_at(_masm, decorators, type, 16,
                       xmm0, Address(from, qword_count, Address::times_8, 0),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 16,
                        Address(dest, qword_count, Address::times_8, 0), xmm0,
                        tmp1, tmp2, xmm1);
    }
    __ subptr(qword_count, 4);
    __ BIND(L_end);
  } else {
    // Copy 32-bytes per iteration
    __ BIND(L_loop);
    bs->copy_load_at(_masm, decorators, type, 8,
                     tmp1, Address(from, qword_count, Address::times_8, 24),
                     tmp2);
    bs->copy_store_at(_masm, decorators, type, 8,
                      Address(dest, qword_count, Address::times_8, 24), tmp1,
                      tmp2);
    bs->copy_load_at(_masm, decorators, type, 8,
                     tmp1, Address(from, qword_count, Address::times_8, 16),
                     tmp2);
    bs->copy_store_at(_masm, decorators, type, 8,
                      Address(dest, qword_count, Address::times_8, 16), tmp1,
                      tmp2);
    bs->copy_load_at(_masm, decorators, type, 8,
                     tmp1, Address(from, qword_count, Address::times_8, 8),
                     tmp2);
    bs->copy_store_at(_masm, decorators, type, 8,
                      Address(dest, qword_count, Address::times_8, 8), tmp1,
                      tmp2);
    bs->copy_load_at(_masm, decorators, type, 8,
                     tmp1, Address(from, qword_count, Address::times_8, 0),
                     tmp2);
    bs->copy_store_at(_masm, decorators, type, 8,
                      Address(dest, qword_count, Address::times_8, 0), tmp1,
                      tmp2);

    __ BIND(L_copy_bytes);
    __ subptr(qword_count, 4);
    __ jcc(Assembler::greaterEqual, L_loop);
  }
  // Undo the last retreat; if any qwords remain, branch out to the
  // caller's single-qword tail loop.
  __ addptr(qword_count, 4);
  __ jcc(Assembler::greater, L_copy_8_bytes); // Copy trailing qwords
}
507
508 #if COMPILER2_OR_JVMCI
509
510 // Note: Following rules apply to AVX3 optimized arraycopy stubs:-
511 // - If target supports AVX3 features (BW+VL+F) then implementation uses 32 byte vectors (YMMs)
512 // for both special cases (various small block sizes) and aligned copy loop. This is the
513 // default configuration.
// - If copy length is above CopyAVX3Threshold, then the implementation uses 64 byte vectors (ZMMs)
//   for the main copy loop (and subsequent tail) since the bulk of the cycles will be consumed in it.
// - If the user forces MaxVectorSize=32 then, above 4096 bytes, REP MOVS is seen to show
//   better performance for disjoint copies. For conjoint/backward copies, vector based
//   copy performs better.
519 // - If user sets CopyAVX3Threshold=0, then special cases for small blocks sizes operate over
520 // 64 byte vector registers (ZMMs).
521
// Inputs:
//   c_rarg0   - source array address
//   c_rarg1   - destination array address
//   c_rarg2   - element count, treated as ssize_t, can be zero
//
//
// Side Effects:
//   if entry is non-null, *entry is set to the no-push entry point
//   (just past the frame setup) used as the no-overlap target by
//   generate_conjoint_[byte/int/short/long]_copy().
//
address StubGenerator::generate_disjoint_copy_avx3_masked(StubId stub_id, address* entry) {
  // aligned is always false -- x86_64 always uses the unaligned code
  const bool aligned = false;
  // shift is log2 of the element size; derived per StubId below.
  int shift;
  bool is_oop;
  bool dest_uninitialized;

  switch (stub_id) {
  case StubId::stubgen_jbyte_disjoint_arraycopy_id:
    shift = 0;
    is_oop = false;
    dest_uninitialized = false;
    break;
  case StubId::stubgen_jshort_disjoint_arraycopy_id:
    shift = 1;
    is_oop = false;
    dest_uninitialized = false;
    break;
  case StubId::stubgen_jint_disjoint_arraycopy_id:
    shift = 2;
    is_oop = false;
    dest_uninitialized = false;
    break;
  case StubId::stubgen_jlong_disjoint_arraycopy_id:
    shift = 3;
    is_oop = false;
    dest_uninitialized = false;
    break;
  case StubId::stubgen_oop_disjoint_arraycopy_id:
    // oops are 4 bytes when compressed, 8 otherwise
    shift = (UseCompressedOops ? 2 : 3);
    is_oop = true;
    dest_uninitialized = false;
    break;
  case StubId::stubgen_oop_disjoint_arraycopy_uninit_id:
    shift = (UseCompressedOops ? 2 : 3);
    is_oop = true;
    dest_uninitialized = true;
    break;
  default:
    ShouldNotReachHere();
  }

  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, stub_id);
  address start = __ pc();

  bool use64byteVector = (MaxVectorSize > 32) && (CopyAVX3Threshold == 0);
  const int large_threshold = 2621440; // 2.5 MB
  Label L_main_loop, L_main_loop_64bytes, L_tail, L_tail64, L_exit, L_entry;
  Label L_repmovs, L_main_pre_loop, L_main_pre_loop_64bytes, L_pre_main_post_64;
  Label L_copy_large, L_finish;
  const Register from        = rdi;  // source array address
  const Register to          = rsi;  // destination array address
  const Register count       = rdx;  // elements count
  const Register temp1       = r8;
  const Register temp2       = r11;
  const Register temp3       = rax;
  const Register temp4       = rcx;
  // End pointers are inclusive, and if count is not zero they point
  // to the last unit copied: end_to[0] := end_from[0]

  __ enter(); // required for proper stackwalking of RuntimeStub frame
  assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int.

  if (entry != nullptr) {
    *entry = __ pc();
    // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
    BLOCK_COMMENT("Entry:");
  }

  BasicType type_vec[] = { T_BYTE, T_SHORT, T_INT, T_LONG};
  BasicType type = is_oop ? T_OBJECT : type_vec[shift];

  setup_argument_regs(type);

  DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
  if (dest_uninitialized) {
    decorators |= IS_DEST_UNINITIALIZED;
  }
  if (aligned) {
    decorators |= ARRAYCOPY_ALIGNED;
  }
  BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
  bs->arraycopy_prologue(_masm, decorators, type, from, to, count);

  {
    // Type(shift)       byte(0), short(1), int(2),   long(3)
    int loop_size[]   = { 192,     96,       48,       24};
    int threshold[]   = { 4096,    2048,     1024,     512};

    // UnsafeMemoryAccess page error: continue after unsafe access
    UnsafeMemoryAccessMark umam(this, !is_oop && !aligned, true);
    // 'from', 'to' and 'count' are now valid

    // temp1 holds remaining count and temp4 holds running count used to compute
    // next address offset for start of to/from addresses (temp4 * scale).
    __ mov64(temp4, 0);
    __ movq(temp1, count);

    // Zero length check.
    __ BIND(L_tail);
    __ cmpq(temp1, 0);
    __ jcc(Assembler::lessEqual, L_exit);

    // Special cases using 32 byte [masked] vector copy operations.
    arraycopy_avx3_special_cases(xmm1, k2, from, to, temp1, shift,
                                 temp4, temp3, use64byteVector, L_entry, L_exit);

    // PRE-MAIN-POST loop for aligned copy.
    __ BIND(L_entry);

    // Very large copies (>= large_threshold bytes) take the dedicated
    // non-temporal path when 64-byte vectors are available.
    if (MaxVectorSize == 64) {
      __ movq(temp2, temp1);
      __ shlq(temp2, shift);
      __ cmpq(temp2, large_threshold);
      __ jcc(Assembler::greaterEqual, L_copy_large);
    }
    if (CopyAVX3Threshold != 0) {
      __ cmpq(count, threshold[shift]);
      if (MaxVectorSize == 64) {
        // Copy using 64 byte vectors.
        __ jcc(Assembler::greaterEqual, L_pre_main_post_64);
      } else {
        assert(MaxVectorSize < 64, "vector size should be < 64 bytes");
        // REP MOVS offer a faster copy path.
        __ jcc(Assembler::greaterEqual, L_repmovs);
      }
    }

    if ((MaxVectorSize < 64)  || (CopyAVX3Threshold != 0)) {
      // Partial copy to make dst address 32 byte aligned.
      __ movq(temp2, to);
      __ andq(temp2, 31);
      __ jcc(Assembler::equal, L_main_pre_loop);

      // temp2 := number of elements needed to reach 32-byte alignment.
      __ negptr(temp2);
      __ addq(temp2, 32);
      if (shift) {
        __ shrq(temp2, shift);
      }
      __ movq(temp3, temp2);
      copy32_masked_avx(to, from, xmm1, k2, temp3, temp4, temp1, shift);
      __ movq(temp4, temp2);
      __ movq(temp1, count);
      __ subq(temp1, temp2);

      __ cmpq(temp1, loop_size[shift]);
      __ jcc(Assembler::less, L_tail);

      __ BIND(L_main_pre_loop);
      __ subq(temp1, loop_size[shift]);

      // Main loop with aligned copy block size of 192 bytes at 32 byte granularity.
      __ align32();
      __ BIND(L_main_loop);
      copy64_avx(to, from, temp4, xmm1, false, shift, 0);
      copy64_avx(to, from, temp4, xmm1, false, shift, 64);
      copy64_avx(to, from, temp4, xmm1, false, shift, 128);
      __ addptr(temp4, loop_size[shift]);
      __ subq(temp1, loop_size[shift]);
      __ jcc(Assembler::greater, L_main_loop);

      __ addq(temp1, loop_size[shift]);

      // Tail loop.
      __ jmp(L_tail);

      __ BIND(L_repmovs);
      __ movq(temp2, temp1);
      // Swap to(RSI) and from(RDI) addresses to comply with REP MOVs semantics.
      __ movq(temp3, to);
      __ movq(to, from);
      __ movq(from, temp3);
      // Save to/from for restoration post rep_mov.
      __ movq(temp1, to);
      __ movq(temp3, from);
      if(shift < 3) {
        __ shrq(temp2, 3-shift);     // quad word count
      }
      __ movq(temp4 , temp2);        // move quad word count into temp4 (RCX).
      __ rep_mov();
      __ shlq(temp2, 3);             // convert quad words into byte count.
      if(shift) {
        __ shrq(temp2, shift);       // type specific count.
      }
      // Restore original addresses in to/from.
      __ movq(to, temp3);
      __ movq(from, temp1);
      __ movq(temp4, temp2);
      __ movq(temp1, count);
      __ subq(temp1, temp2);         // trailing part (less than a quad word size).
      __ jmp(L_tail);
    }

    if (MaxVectorSize > 32) {
      __ BIND(L_pre_main_post_64);
      // Partial copy to make dst address 64 byte aligned.
      __ movq(temp2, to);
      __ andq(temp2, 63);
      __ jcc(Assembler::equal, L_main_pre_loop_64bytes);

      // temp2 := number of elements needed to reach 64-byte alignment.
      __ negptr(temp2);
      __ addq(temp2, 64);
      if (shift) {
        __ shrq(temp2, shift);
      }
      __ movq(temp3, temp2);
      copy64_masked_avx(to, from, xmm1, k2, temp3, temp4, temp1, shift, 0 , true);
      __ movq(temp4, temp2);
      __ movq(temp1, count);
      __ subq(temp1, temp2);

      __ cmpq(temp1, loop_size[shift]);
      __ jcc(Assembler::less, L_tail64);

      __ BIND(L_main_pre_loop_64bytes);
      __ subq(temp1, loop_size[shift]);

      // Main loop with aligned copy block size of 192 bytes at
      // 64 byte copy granularity.
      __ align32();
      __ BIND(L_main_loop_64bytes);
      copy64_avx(to, from, temp4, xmm1, false, shift, 0 , true);
      copy64_avx(to, from, temp4, xmm1, false, shift, 64, true);
      copy64_avx(to, from, temp4, xmm1, false, shift, 128, true);
      __ addptr(temp4, loop_size[shift]);
      __ subq(temp1, loop_size[shift]);
      __ jcc(Assembler::greater, L_main_loop_64bytes);

      __ addq(temp1, loop_size[shift]);
      // Zero length check.
      __ jcc(Assembler::lessEqual, L_exit);

      __ BIND(L_tail64);

      // Tail handling using 64 byte [masked] vector copy operations.
      use64byteVector = true;
      arraycopy_avx3_special_cases(xmm1, k2, from, to, temp1, shift,
                                   temp4, temp3, use64byteVector, L_entry, L_exit);
    }
    __ BIND(L_exit);
  }

  __ BIND(L_finish);
  address ucme_exit_pc = __ pc();
  // When called from generic_arraycopy r11 contains specific values
  // used during arraycopy epilogue, re-initializing r11.
  if (is_oop) {
    __ movq(r11, shift == 3 ? count : to);
  }
  bs->arraycopy_epilogue(_masm, decorators, type, from, to, count);
  restore_argument_regs(type);
  INC_COUNTER_NP(get_profile_ctr(shift), rscratch1); // Update counter after rscratch1 is free
  __ xorptr(rax, rax); // return 0
  __ vzeroupper();
  __ leave(); // required for proper stackwalking of RuntimeStub frame
  __ ret(0);

  // Out-of-line large-copy path; rejoins at L_finish for the common epilogue.
  if (MaxVectorSize == 64) {
    __ BIND(L_copy_large);
    UnsafeMemoryAccessMark umam(this, !is_oop && !aligned, false, ucme_exit_pc);
    arraycopy_avx3_large(to, from, temp1, temp2, temp3, temp4, count, xmm1, xmm2, xmm3, xmm4, shift);
    __ jmp(L_finish);
  }
  return start;
}
798
// Large-copy path for the AVX3 disjoint copy stub, reached from
// generate_disjoint_copy_avx3_masked when the byte count is at least
// large_threshold (2.5 MB). Requires MaxVectorSize == 64. The main
// loop moves 256 bytes per iteration via copy256_avx3; the caller is
// responsible for the SFENCE-protected rejoin (the one emitted after
// the loop below covers this path's own non-temporal stores).
void StubGenerator::arraycopy_avx3_large(Register to, Register from, Register temp1, Register temp2,
                                         Register temp3, Register temp4, Register count,
                                         XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3,
                                         XMMRegister xmm4, int shift) {

  // Type(shift)       byte(0), short(1), int(2),   long(3)
  int loop_size[]   = { 256,     128,      64,       32};
  // NOTE(review): threshold[] appears unused in this function.
  int threshold[]   = { 4096,    2048,     1024,     512};

  Label L_main_loop_large;
  Label L_tail_large;
  Label L_exit_large;
  Label L_entry_large;
  Label L_main_pre_loop_large;
  Label L_pre_main_post_large;

  assert(MaxVectorSize == 64, "vector length != 64");
  // NOTE(review): L_entry_large and L_pre_main_post_large are bound
  // back-to-back and nothing in this function branches to them.
  __ BIND(L_entry_large);

  __ BIND(L_pre_main_post_large);
  // Partial copy to make dst address 64 byte aligned.
  __ movq(temp2, to);
  __ andq(temp2, 63);
  __ jcc(Assembler::equal, L_main_pre_loop_large);

  // temp2 := number of elements needed to reach 64-byte alignment.
  __ negptr(temp2);
  __ addq(temp2, 64);
  if (shift) {
    __ shrq(temp2, shift);
  }
  __ movq(temp3, temp2);
  copy64_masked_avx(to, from, xmm1, k2, temp3, temp4, temp1, shift, 0, true);
  __ movq(temp4, temp2);
  __ movq(temp1, count);
  __ subq(temp1, temp2);

  __ cmpq(temp1, loop_size[shift]);
  __ jcc(Assembler::less, L_tail_large);

  __ BIND(L_main_pre_loop_large);
  __ subq(temp1, loop_size[shift]);

  // Main loop with aligned copy block size of 256 bytes at 64 byte copy granularity.
  __ align32();
  __ BIND(L_main_loop_large);
  copy256_avx3(to, from, temp4, xmm1, xmm2, xmm3, xmm4, shift, 0);
  __ addptr(temp4, loop_size[shift]);
  __ subq(temp1, loop_size[shift]);
  __ jcc(Assembler::greater, L_main_loop_large);
  // fence needed because copy256_avx3 uses non-temporal stores
  __ sfence();

  __ addq(temp1, loop_size[shift]);
  // Zero length check.
  __ jcc(Assembler::lessEqual, L_exit_large);
  __ BIND(L_tail_large);
  // Tail handling using 64 byte [masked] vector copy operations.
  __ cmpq(temp1, 0);
  __ jcc(Assembler::lessEqual, L_exit_large);
  arraycopy_avx3_special_cases_256(xmm1, k2, from, to, temp1, shift,
                                   temp4, temp3, L_exit_large);
  __ BIND(L_exit_large);
}
862
// Arguments:
//   entry            - location for return of (post-push) entry
//   nooverlap_target - entry to branch to if no overlap detected
//
// Inputs:
//   c_rarg0 - source array address
//   c_rarg1 - destination array address
//   c_rarg2 - element count, treated as ssize_t, can be zero
//
address StubGenerator::generate_conjoint_copy_avx3_masked(StubId stub_id, address* entry, address nooverlap_target) {
  // Conjoint (overlap-safe, high-to-low) arraycopy stub using AVX2/AVX512
  // masked vector moves. Element size, oop-ness and destination
  // initialization state are decoded from stub_id below. Returns the stub's
  // start address; if 'entry' is non-null it receives the post-push entry
  // point.
  // aligned is always false -- x86_64 always uses the unaligned code
  const bool aligned = false;
  int shift;                // log2(element size in bytes)
  bool is_oop;              // object array: GC barriers required
  bool dest_uninitialized;  // destination holds no valid oops yet

  switch (stub_id) {
  case StubId::stubgen_jbyte_arraycopy_id:
    shift = 0;
    is_oop = false;
    dest_uninitialized = false;
    break;
  case StubId::stubgen_jshort_arraycopy_id:
    shift = 1;
    is_oop = false;
    dest_uninitialized = false;
    break;
  case StubId::stubgen_jint_arraycopy_id:
    shift = 2;
    is_oop = false;
    dest_uninitialized = false;
    break;
  case StubId::stubgen_jlong_arraycopy_id:
    shift = 3;
    is_oop = false;
    dest_uninitialized = false;
    break;
  case StubId::stubgen_oop_arraycopy_id:
    shift = (UseCompressedOops ? 2 : 3);
    is_oop = true;
    dest_uninitialized = false;
    break;
  case StubId::stubgen_oop_arraycopy_uninit_id:
    shift = (UseCompressedOops ? 2 : 3);
    is_oop = true;
    dest_uninitialized = true;
    break;
  default:
    ShouldNotReachHere();
  }

  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, stub_id);
  address start = __ pc();

  bool use64byteVector = (MaxVectorSize > 32) && (CopyAVX3Threshold == 0);

  Label L_main_pre_loop, L_main_pre_loop_64bytes, L_pre_main_post_64;
  Label L_main_loop, L_main_loop_64bytes, L_tail, L_tail64, L_exit, L_entry;
  const Register from = rdi; // source array address
  const Register to = rsi; // destination array address
  const Register count = rdx; // elements count
  const Register temp1 = r8;
  const Register temp2 = rcx;
  const Register temp3 = r11;
  const Register temp4 = rax;
  // End pointers are inclusive, and if count is not zero they point
  // to the last unit copied: end_to[0] := end_from[0]

  __ enter(); // required for proper stackwalking of RuntimeStub frame
  assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int.

  if (entry != nullptr) {
    *entry = __ pc();
    // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
    BLOCK_COMMENT("Entry:");
  }

  // If source and destination do not overlap, branch to nooverlap_target
  // (the forward-copy stub) instead.
  array_overlap_test(nooverlap_target, (Address::ScaleFactor)(shift));

  BasicType type_vec[] = { T_BYTE, T_SHORT, T_INT, T_LONG};
  BasicType type = is_oop ? T_OBJECT : type_vec[shift];

  setup_argument_regs(type);

  DecoratorSet decorators = IN_HEAP | IS_ARRAY;
  if (dest_uninitialized) {
    decorators |= IS_DEST_UNINITIALIZED;
  }
  if (aligned) {
    decorators |= ARRAYCOPY_ALIGNED;
  }
  BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
  bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
  {
    // Type(shift) byte(0), short(1), int(2), long(3)
    int loop_size[] = { 192, 96, 48, 24};
    int threshold[] = { 4096, 2048, 1024, 512};

    // UnsafeMemoryAccess page error: continue after unsafe access
    UnsafeMemoryAccessMark umam(this, !is_oop && !aligned, true);
    // 'from', 'to' and 'count' are now valid

    // temp1 holds remaining count.
    __ movq(temp1, count);

    // Zero length check.
    __ BIND(L_tail);
    __ cmpq(temp1, 0);
    __ jcc(Assembler::lessEqual, L_exit);

    // temp2 = start index (0), temp3 = end index (remaining count) for the
    // special-case dispatcher.
    __ mov64(temp2, 0);
    __ movq(temp3, temp1);
    // Special cases using 32 byte [masked] vector copy operations.
    arraycopy_avx3_special_cases_conjoint(xmm1, k2, from, to, temp2, temp3, temp1, shift,
                                          temp4, use64byteVector, L_entry, L_exit);

    // PRE-MAIN-POST loop for aligned copy.
    __ BIND(L_entry);

    if ((MaxVectorSize > 32) && (CopyAVX3Threshold != 0)) {
      // Counts at or above the per-type threshold take the 64-byte path.
      __ cmpq(temp1, threshold[shift]);
      __ jcc(Assembler::greaterEqual, L_pre_main_post_64);
    }

    if ((MaxVectorSize < 64) || (CopyAVX3Threshold != 0)) {
      // Partial copy to make dst address 32 byte aligned.
      // The copy runs high-to-low, so alignment is computed at the top end:
      // temp2 = (to + count*elem_size) & 31.
      __ leaq(temp2, Address(to, temp1, (Address::ScaleFactor)(shift), 0));
      __ andq(temp2, 31);
      __ jcc(Assembler::equal, L_main_pre_loop);

      if (shift) {
        __ shrq(temp2, shift);  // bytes -> elements
      }
      __ subq(temp1, temp2);
      copy32_masked_avx(to, from, xmm1, k2, temp2, temp1, temp3, shift);

      __ cmpq(temp1, loop_size[shift]);
      __ jcc(Assembler::less, L_tail);

      __ BIND(L_main_pre_loop);

      // Main loop with aligned copy block size of 192 bytes at 32 byte granularity.
      // temp1 doubles as the remaining count and the (exclusive) top index;
      // each iteration copies the 192 bytes just below it, working downward.
      __ align32();
      __ BIND(L_main_loop);
      copy64_avx(to, from, temp1, xmm1, true, shift, -64);
      copy64_avx(to, from, temp1, xmm1, true, shift, -128);
      copy64_avx(to, from, temp1, xmm1, true, shift, -192);
      __ subptr(temp1, loop_size[shift]);
      __ cmpq(temp1, loop_size[shift]);
      __ jcc(Assembler::greater, L_main_loop);

      // Tail loop.
      __ jmp(L_tail);
    }

    if (MaxVectorSize > 32) {
      __ BIND(L_pre_main_post_64);
      // Partial copy to make dst address 64 byte aligned.
      __ leaq(temp2, Address(to, temp1, (Address::ScaleFactor)(shift), 0));
      __ andq(temp2, 63);
      __ jcc(Assembler::equal, L_main_pre_loop_64bytes);

      if (shift) {
        __ shrq(temp2, shift);
      }
      __ subq(temp1, temp2);
      copy64_masked_avx(to, from, xmm1, k2, temp2, temp1, temp3, shift, 0 , true);

      __ cmpq(temp1, loop_size[shift]);
      __ jcc(Assembler::less, L_tail64);

      __ BIND(L_main_pre_loop_64bytes);

      // Main loop with aligned copy block size of 192 bytes at
      // 64 byte copy granularity.
      __ align32();
      __ BIND(L_main_loop_64bytes);
      copy64_avx(to, from, temp1, xmm1, true, shift, -64 , true);
      copy64_avx(to, from, temp1, xmm1, true, shift, -128, true);
      copy64_avx(to, from, temp1, xmm1, true, shift, -192, true);
      __ subq(temp1, loop_size[shift]);
      __ cmpq(temp1, loop_size[shift]);
      __ jcc(Assembler::greater, L_main_loop_64bytes);

      // Zero length check.
      __ cmpq(temp1, 0);
      __ jcc(Assembler::lessEqual, L_exit);

      __ BIND(L_tail64);

      // Tail handling using 64 byte [masked] vector copy operations.
      use64byteVector = true;
      __ mov64(temp2, 0);
      __ movq(temp3, temp1);
      arraycopy_avx3_special_cases_conjoint(xmm1, k2, from, to, temp2, temp3, temp1, shift,
                                            temp4, use64byteVector, L_entry, L_exit);
    }
    __ BIND(L_exit);
  }
  // NOTE(review): ucme_exit_pc is unused in this stub (no trailing
  // UnsafeMemoryAccessMark / out-of-line large-copy path as in the
  // disjoint variant).
  address ucme_exit_pc = __ pc();
  // When called from generic_arraycopy r11 contains specific values
  // used during arraycopy epilogue, re-initializing r11.
  if(is_oop) {
    __ movq(r11, count);
  }
  bs->arraycopy_epilogue(_masm, decorators, type, from, to, count);
  restore_argument_regs(type);
  INC_COUNTER_NP(get_profile_ctr(shift), rscratch1); // Update counter after rscratch1 is free
  __ xorptr(rax, rax); // return 0
  __ vzeroupper();
  __ leave(); // required for proper stackwalking of RuntimeStub frame
  __ ret(0);

  return start;
}
1076
void StubGenerator::arraycopy_avx3_special_cases(XMMRegister xmm, KRegister mask, Register from,
                                                 Register to, Register count, int shift,
                                                 Register index, Register temp,
                                                 bool use64byteVector, Label& L_entry, Label& L_exit) {
  // Forward-copy dispatch for total sizes of at most 192 bytes, in 32-byte
  // buckets. size_mat[shift][i] is the element count equal to 32*(i+1)
  // bytes for the element size selected by shift. Each case copies the
  // leading part with full 32/64-byte moves, subtracts the elements already
  // copied from 'count', finishes with one masked 32-byte copy of the
  // remainder, and jumps to L_exit. Counts above 192 bytes branch to
  // L_entry (the caller's PRE-MAIN-POST loop).
  Label L_entry_64, L_entry_96, L_entry_128;
  Label L_entry_160, L_entry_192;

  int size_mat[][6] = {
  /* T_BYTE */ {32 , 64, 96 , 128 , 160 , 192 },
  /* T_SHORT*/ {16 , 32, 48 , 64 , 80 , 96 },
  /* T_INT */ {8 , 16, 24 , 32 , 40 , 48 },
  /* T_LONG */ {4 , 8, 12 , 16 , 20 , 24 }
  };

  // Case A) Special case for length less than equal to 32 bytes.
  __ cmpq(count, size_mat[shift][0]);
  __ jccb(Assembler::greater, L_entry_64);
  copy32_masked_avx(to, from, xmm, mask, count, index, temp, shift);
  __ jmp(L_exit);

  // Case B) Special case for length less than equal to 64 bytes.
  __ BIND(L_entry_64);
  __ cmpq(count, size_mat[shift][1]);
  __ jccb(Assembler::greater, L_entry_96);
  copy64_masked_avx(to, from, xmm, mask, count, index, temp, shift, 0, use64byteVector);
  __ jmp(L_exit);

  // Case C) Special case for length less than equal to 96 bytes.
  __ BIND(L_entry_96);
  __ cmpq(count, size_mat[shift][2]);
  __ jccb(Assembler::greater, L_entry_128);
  copy64_avx(to, from, index, xmm, false, shift, 0, use64byteVector);
  __ subq(count, 64 >> shift);    // 64 bytes already copied
  copy32_masked_avx(to, from, xmm, mask, count, index, temp, shift, 64);
  __ jmp(L_exit);

  // Case D) Special case for length less than equal to 128 bytes.
  __ BIND(L_entry_128);
  __ cmpq(count, size_mat[shift][3]);
  __ jccb(Assembler::greater, L_entry_160);
  copy64_avx(to, from, index, xmm, false, shift, 0, use64byteVector);
  copy32_avx(to, from, index, xmm, shift, 64);
  __ subq(count, 96 >> shift);    // 96 bytes already copied
  copy32_masked_avx(to, from, xmm, mask, count, index, temp, shift, 96);
  __ jmp(L_exit);

  // Case E) Special case for length less than equal to 160 bytes.
  __ BIND(L_entry_160);
  __ cmpq(count, size_mat[shift][4]);
  __ jccb(Assembler::greater, L_entry_192);
  copy64_avx(to, from, index, xmm, false, shift, 0, use64byteVector);
  copy64_avx(to, from, index, xmm, false, shift, 64, use64byteVector);
  __ subq(count, 128 >> shift);   // 128 bytes already copied
  copy32_masked_avx(to, from, xmm, mask, count, index, temp, shift, 128);
  __ jmp(L_exit);

  // Case F) Special case for length less than equal to 192 bytes.
  __ BIND(L_entry_192);
  __ cmpq(count, size_mat[shift][5]);
  __ jcc(Assembler::greater, L_entry);
  copy64_avx(to, from, index, xmm, false, shift, 0, use64byteVector);
  copy64_avx(to, from, index, xmm, false, shift, 64, use64byteVector);
  copy32_avx(to, from, index, xmm, shift, 128);
  __ subq(count, 160 >> shift);   // 160 bytes already copied
  copy32_masked_avx(to, from, xmm, mask, count, index, temp, shift, 160);
  __ jmp(L_exit);
}
1144
void StubGenerator::arraycopy_avx3_special_cases_256(XMMRegister xmm, KRegister mask, Register from,
                                                     Register to, Register count, int shift, Register index,
                                                     Register temp, Label& L_exit) {
  // Forward-copy dispatch for the AVX-512 large-copy tail, in 64-byte
  // buckets up to 256 bytes. Each case copies the leading part with full
  // 512-bit moves, adjusts 'count' for the elements already copied, then
  // finishes with one masked 64-byte copy and jumps to L_exit. The final
  // case (D) performs its copy unconditionally, so the caller must ensure
  // the remaining size fits in 256 bytes.
  Label L_entry_64, L_entry_128, L_entry_192, L_entry_256;

  int size_mat[][4] = {
  /* T_BYTE */ {64, 128, 192, 256},
  /* T_SHORT*/ {32, 64 , 96 , 128},
  /* T_INT */ {16, 32 , 48 , 64},
  /* T_LONG */ { 8, 16 , 24 , 32}
  };

  assert(MaxVectorSize == 64, "vector length != 64");
  // Case A) Special case for length less than or equal to 64 bytes.
  // NOTE(review): L_entry_64 is bound but has no referencing jump; Case A
  // is reached by fall-through only.
  __ BIND(L_entry_64);
  __ cmpq(count, size_mat[shift][0]);
  __ jccb(Assembler::greater, L_entry_128);
  copy64_masked_avx(to, from, xmm, mask, count, index, temp, shift, 0, true);
  __ jmp(L_exit);

  // Case B) Special case for length less than or equal to 128 bytes.
  __ BIND(L_entry_128);
  __ cmpq(count, size_mat[shift][1]);
  __ jccb(Assembler::greater, L_entry_192);
  copy64_avx(to, from, index, xmm, false, shift, 0, true);
  __ subq(count, 64 >> shift);    // 64 bytes already copied
  copy64_masked_avx(to, from, xmm, mask, count, index, temp, shift, 64, true);
  __ jmp(L_exit);

  // Case C) Special case for length less than or equal to 192 bytes.
  __ BIND(L_entry_192);
  __ cmpq(count, size_mat[shift][2]);
  __ jcc(Assembler::greater, L_entry_256);
  copy64_avx(to, from, index, xmm, false, shift, 0, true);
  copy64_avx(to, from, index, xmm, false, shift, 64, true);
  __ subq(count, 128 >> shift);   // 128 bytes already copied
  copy64_masked_avx(to, from, xmm, mask, count, index, temp, shift, 128, true);
  __ jmp(L_exit);

  // Case D) Special case for length less than or equal to 256 bytes.
  __ BIND(L_entry_256);
  copy64_avx(to, from, index, xmm, false, shift, 0, true);
  copy64_avx(to, from, index, xmm, false, shift, 64, true);
  copy64_avx(to, from, index, xmm, false, shift, 128, true);
  __ subq(count, 192 >> shift);   // 192 bytes already copied
  copy64_masked_avx(to, from, xmm, mask, count, index, temp, shift, 192, true);
  __ jmp(L_exit);
}
1193
void StubGenerator::arraycopy_avx3_special_cases_conjoint(XMMRegister xmm, KRegister mask, Register from,
                                                          Register to, Register start_index, Register end_index,
                                                          Register count, int shift, Register temp,
                                                          bool use64byteVector, Label& L_entry, Label& L_exit) {
  // Backward-copy dispatch for total sizes of at most 192 bytes, in 32-byte
  // buckets. The upper chunks are copied first (via end_index with negative
  // offsets) and the low remainder is finished with a masked copy at
  // start_index — high-to-low order keeps overlapping ranges safe. Counts
  // above 192 bytes branch to L_entry; every case exits through L_exit.
  Label L_entry_64, L_entry_96, L_entry_128;
  Label L_entry_160, L_entry_192;
  // True when full 512-bit copies are unconditionally available in Case B.
  bool avx3 = (MaxVectorSize > 32) && (CopyAVX3Threshold == 0);

  int size_mat[][6] = {
  /* T_BYTE */ {32 , 64, 96 , 128 , 160 , 192 },
  /* T_SHORT*/ {16 , 32, 48 , 64 , 80 , 96 },
  /* T_INT */ {8 , 16, 24 , 32 , 40 , 48 },
  /* T_LONG */ {4 , 8, 12 , 16 , 20 , 24 }
  };

  // Case A) Special case for length less than equal to 32 bytes.
  __ cmpq(count, size_mat[shift][0]);
  __ jccb(Assembler::greater, L_entry_64);
  copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
  __ jmp(L_exit);

  // Case B) Special case for length less than equal to 64 bytes.
  __ BIND(L_entry_64);
  __ cmpq(count, size_mat[shift][1]);
  __ jccb(Assembler::greater, L_entry_96);
  if (avx3) {
    // Single masked 512-bit copy handles the whole range.
    copy64_masked_avx(to, from, xmm, mask, count, start_index, temp, shift, 0, true);
  } else {
    // Top 32 bytes first, then masked copy of the low remainder.
    copy32_avx(to, from, end_index, xmm, shift, -32);
    __ subq(count, 32 >> shift);
    copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
  }
  __ jmp(L_exit);

  // Case C) Special case for length less than equal to 96 bytes.
  __ BIND(L_entry_96);
  __ cmpq(count, size_mat[shift][2]);
  __ jccb(Assembler::greater, L_entry_128);
  copy64_avx(to, from, end_index, xmm, true, shift, -64, use64byteVector);
  __ subq(count, 64 >> shift);    // 64 bytes copied from the top
  copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
  __ jmp(L_exit);

  // Case D) Special case for length less than equal to 128 bytes.
  __ BIND(L_entry_128);
  __ cmpq(count, size_mat[shift][3]);
  __ jccb(Assembler::greater, L_entry_160);
  copy64_avx(to, from, end_index, xmm, true, shift, -64, use64byteVector);
  copy32_avx(to, from, end_index, xmm, shift, -96);
  __ subq(count, 96 >> shift);    // 96 bytes copied from the top
  copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
  __ jmp(L_exit);

  // Case E) Special case for length less than equal to 160 bytes.
  __ BIND(L_entry_160);
  __ cmpq(count, size_mat[shift][4]);
  __ jccb(Assembler::greater, L_entry_192);
  copy64_avx(to, from, end_index, xmm, true, shift, -64, use64byteVector);
  copy64_avx(to, from, end_index, xmm, true, shift, -128, use64byteVector);
  __ subq(count, 128 >> shift);   // 128 bytes copied from the top
  copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
  __ jmp(L_exit);

  // Case F) Special case for length less than equal to 192 bytes.
  __ BIND(L_entry_192);
  __ cmpq(count, size_mat[shift][5]);
  __ jcc(Assembler::greater, L_entry);
  copy64_avx(to, from, end_index, xmm, true, shift, -64, use64byteVector);
  copy64_avx(to, from, end_index, xmm, true, shift, -128, use64byteVector);
  copy32_avx(to, from, end_index, xmm, shift, -160);
  __ subq(count, 160 >> shift);   // 160 bytes copied from the top
  copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
  __ jmp(L_exit);
}
1268
void StubGenerator::copy256_avx3(Register dst, Register src, Register index, XMMRegister xmm1,
                                 XMMRegister xmm2, XMMRegister xmm3, XMMRegister xmm4,
                                 int shift, int offset) {
  // Copies 256 bytes from src to dst at (base + index << shift + offset)
  // using four 512-bit loads followed by four non-temporal 512-bit stores.
  // The streaming stores bypass the cache, so the caller must issue an
  // sfence after the final call. Emits nothing unless MaxVectorSize == 64.
  if (MaxVectorSize == 64) {
    Address::ScaleFactor scale = (Address::ScaleFactor)(shift);
    // Prefetch source data 512 and 1024 bytes ahead of the current block.
    __ prefetcht0(Address(src, index, scale, offset + 0x200));
    __ prefetcht0(Address(src, index, scale, offset + 0x240));
    __ prefetcht0(Address(src, index, scale, offset + 0x280));
    __ prefetcht0(Address(src, index, scale, offset + 0x2C0));

    __ prefetcht0(Address(src, index, scale, offset + 0x400));
    __ prefetcht0(Address(src, index, scale, offset + 0x440));
    __ prefetcht0(Address(src, index, scale, offset + 0x480));
    __ prefetcht0(Address(src, index, scale, offset + 0x4C0));

    // Load 4 x 64 bytes into registers, ...
    __ evmovdquq(xmm1, Address(src, index, scale, offset), Assembler::AVX_512bit);
    __ evmovdquq(xmm2, Address(src, index, scale, offset + 0x40), Assembler::AVX_512bit);
    __ evmovdquq(xmm3, Address(src, index, scale, offset + 0x80), Assembler::AVX_512bit);
    __ evmovdquq(xmm4, Address(src, index, scale, offset + 0xC0), Assembler::AVX_512bit);

    // ... then stream them to the destination.
    __ evmovntdquq(Address(dst, index, scale, offset), xmm1, Assembler::AVX_512bit);
    __ evmovntdquq(Address(dst, index, scale, offset + 0x40), xmm2, Assembler::AVX_512bit);
    __ evmovntdquq(Address(dst, index, scale, offset + 0x80), xmm3, Assembler::AVX_512bit);
    __ evmovntdquq(Address(dst, index, scale, offset + 0xC0), xmm4, Assembler::AVX_512bit);
  }
}
1295
void StubGenerator::copy64_masked_avx(Register dst, Register src, XMMRegister xmm,
                                      KRegister mask, Register length, Register index,
                                      Register temp, int shift, int offset,
                                      bool use64byteVector) {
  // Copies 'length' elements (at most 64 bytes worth) from src to dst.
  // With use64byteVector: one masked 512-bit load/store where the mask
  // keeps only the low 'length' lanes. Without: a full 32-byte copy
  // followed by a masked 32-byte copy of the rest (length is reduced by
  // the 32 bytes already copied). 'temp' is clobbered as mask scratch.
  BasicType type[] = { T_BYTE, T_SHORT, T_INT, T_LONG};
  assert(MaxVectorSize >= 32, "vector length should be >= 32");
  if (!use64byteVector) {
    copy32_avx(dst, src, index, xmm, shift, offset);
    __ subptr(length, 32 >> shift);   // account for the 32 bytes just copied
    copy32_masked_avx(dst, src, xmm, mask, length, index, temp, shift, offset+32);
  } else {
    Address::ScaleFactor scale = (Address::ScaleFactor)(shift);
    assert(MaxVectorSize == 64, "vector length != 64");
    // mask = (1 << length) - 1 : selects the low 'length' elements.
    __ mov64(temp, -1L);
    __ bzhiq(temp, temp, length);
    __ kmovql(mask, temp);
    __ evmovdqu(type[shift], mask, xmm, Address(src, index, scale, offset), false, Assembler::AVX_512bit);
    __ evmovdqu(type[shift], mask, Address(dst, index, scale, offset), xmm, true, Assembler::AVX_512bit);
  }
}
1316
1317
1318 void StubGenerator::copy32_masked_avx(Register dst, Register src, XMMRegister xmm,
1319 KRegister mask, Register length, Register index,
1320 Register temp, int shift, int offset) {
1321 assert(MaxVectorSize >= 32, "vector length should be >= 32");
1322 BasicType type[] = { T_BYTE, T_SHORT, T_INT, T_LONG};
1323 Address::ScaleFactor scale = (Address::ScaleFactor)(shift);
1324 __ mov64(temp, -1L);
1325 __ bzhiq(temp, temp, length);
1326 __ kmovql(mask, temp);
1327 __ evmovdqu(type[shift], mask, xmm, Address(src, index, scale, offset), false, Assembler::AVX_256bit);
1328 __ evmovdqu(type[shift], mask, Address(dst, index, scale, offset), xmm, true, Assembler::AVX_256bit);
1329 }
1330
1331
1332 void StubGenerator::copy32_avx(Register dst, Register src, Register index, XMMRegister xmm,
1333 int shift, int offset) {
1334 assert(MaxVectorSize >= 32, "vector length should be >= 32");
1335 Address::ScaleFactor scale = (Address::ScaleFactor)(shift);
1336 __ vmovdqu(xmm, Address(src, index, scale, offset));
1337 __ vmovdqu(Address(dst, index, scale, offset), xmm);
1338 }
1339
1340
void StubGenerator::copy64_avx(Register dst, Register src, Register index, XMMRegister xmm,
                               bool conjoint, int shift, int offset, bool use64byteVector) {
  // Copies 64 bytes at (base + index << shift + offset). With
  // use64byteVector: a single 512-bit load/store. Otherwise two 32-byte
  // copies, ordered to match the copy direction: conjoint (backward) copies
  // move the upper 32 bytes first, disjoint (forward) copies the lower 32
  // bytes first.
  assert(MaxVectorSize == 64 || MaxVectorSize == 32, "vector length mismatch");
  if (!use64byteVector) {
    if (conjoint) {
      copy32_avx(dst, src, index, xmm, shift, offset+32);
      copy32_avx(dst, src, index, xmm, shift, offset);
    } else {
      copy32_avx(dst, src, index, xmm, shift, offset);
      copy32_avx(dst, src, index, xmm, shift, offset+32);
    }
  } else {
    Address::ScaleFactor scale = (Address::ScaleFactor)(shift);
    __ evmovdquq(xmm, Address(src, index, scale, offset), Assembler::AVX_512bit);
    __ evmovdquq(Address(dst, index, scale, offset), xmm, Assembler::AVX_512bit);
  }
}
1358
1359 #endif // COMPILER2_OR_JVMCI
1360
1361
1362 // Arguments:
1363 // entry - location for return of (post-push) entry
1364 //
1365 // Inputs:
1366 // c_rarg0 - source array address
1367 // c_rarg1 - destination array address
1368 // c_rarg2 - element count, treated as ssize_t, can be zero
1369 //
1370 // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1371 // we let the hardware handle it. The one to eight bytes within words,
1372 // dwords or qwords that span cache line boundaries will still be loaded
1373 // and stored atomically.
1374 //
1375 // Side Effects:
1376 // entry is set to the no-overlap entry point
1377 // used by generate_conjoint_byte_copy().
1378 //
address StubGenerator::generate_disjoint_byte_copy(address* entry) {
  StubId stub_id = StubId::stubgen_jbyte_disjoint_arraycopy_id;
  // aligned is always false -- x86_64 always uses the unaligned code
  const bool aligned = false;
#if COMPILER2_OR_JVMCI
  // Prefer the AVX3 masked-copy stub when the CPU supports it.
  if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) {
    return generate_disjoint_copy_avx3_masked(stub_id, entry);
  }
#endif
  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, stub_id);
  address start = __ pc();
  DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;

  Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes;
  Label L_copy_byte, L_exit;
  const Register from = rdi; // source array address
  const Register to = rsi; // destination array address
  const Register count = rdx; // elements count
  const Register byte_count = rcx;
  const Register qword_count = count;
  const Register end_from = from; // source array end address
  const Register end_to = to; // destination array end address
  // End pointers are inclusive, and if count is not zero they point
  // to the last unit copied: end_to[0] := end_from[0]

  __ enter(); // required for proper stackwalking of RuntimeStub frame
  assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int.

  if (entry != nullptr) {
    *entry = __ pc();
    // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
    BLOCK_COMMENT("Entry:");
  }

  setup_arg_regs(); // from => rdi, to => rsi, count => rdx
  // r9 and r10 may be used to save non-volatile registers

  {
    // UnsafeMemoryAccess page error: continue after unsafe access
    UnsafeMemoryAccessMark umam(this, !aligned, true);
    // 'from', 'to' and 'count' are now valid
    __ movptr(byte_count, count);   // keep original byte total for the tail
    __ shrptr(count, 3); // count => qword_count

    // Copy from low to high addresses. Use 'to' as scratch.
    // end_from/end_to point at the last qword and qword_count is negated,
    // so Address(end, qword_count, times_8, 8) walks forward as the count
    // is incremented toward zero.
    __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
    __ lea(end_to, Address(to, qword_count, Address::times_8, -8));
    __ negptr(qword_count); // make the count negative
    __ jmp(L_copy_bytes);

    // Copy trailing qwords
    __ BIND(L_copy_8_bytes);
    __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
    __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
    __ increment(qword_count);
    __ jcc(Assembler::notZero, L_copy_8_bytes);

    // Check for and copy trailing dword
    __ BIND(L_copy_4_bytes);
    __ testl(byte_count, 4);
    __ jccb(Assembler::zero, L_copy_2_bytes);
    __ movl(rax, Address(end_from, 8));
    __ movl(Address(end_to, 8), rax);

    __ addptr(end_from, 4);
    __ addptr(end_to, 4);

    // Check for and copy trailing word
    __ BIND(L_copy_2_bytes);
    __ testl(byte_count, 2);
    __ jccb(Assembler::zero, L_copy_byte);
    __ movw(rax, Address(end_from, 8));
    __ movw(Address(end_to, 8), rax);

    __ addptr(end_from, 2);
    __ addptr(end_to, 2);

    // Check for and copy trailing byte
    __ BIND(L_copy_byte);
    __ testl(byte_count, 1);
    __ jccb(Assembler::zero, L_exit);
    __ movb(rax, Address(end_from, 8));
    __ movb(Address(end_to, 8), rax);
  }
  __ BIND(L_exit);
  address ucme_exit_pc = __ pc();
  restore_arg_regs();
  INC_COUNTER_NP(SharedRuntime::_jbyte_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
  __ xorptr(rax, rax); // return 0
  __ vzeroupper();
  __ leave(); // required for proper stackwalking of RuntimeStub frame
  __ ret(0);

  {
    // The bulk-copy loop is emitted out of line after the epilogue; the
    // entry code above reaches it through L_copy_bytes and it falls back
    // into L_copy_8_bytes for the last qwords.
    UnsafeMemoryAccessMark umam(this, !aligned, false, ucme_exit_pc);
    // Copy in multi-bytes chunks
    copy_bytes_forward(end_from, end_to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, T_BYTE);
    __ jmp(L_copy_4_bytes);
  }
  return start;
}
1481
1482
1483 // Arguments:
1484 // entry - location for return of (post-push) entry
1485 // nooverlap_target - entry to branch to if no overlap detected
1486 //
1487 // Inputs:
1488 // c_rarg0 - source array address
1489 // c_rarg1 - destination array address
1490 // c_rarg2 - element count, treated as ssize_t, can be zero
1491 //
1492 // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1493 // we let the hardware handle it. The one to eight bytes within words,
1494 // dwords or qwords that span cache line boundaries will still be loaded
1495 // and stored atomically.
1496 //
address StubGenerator::generate_conjoint_byte_copy(address nooverlap_target, address* entry) {
  StubId stub_id = StubId::stubgen_jbyte_arraycopy_id;
  // aligned is always false -- x86_64 always uses the unaligned code
  const bool aligned = false;
#if COMPILER2_OR_JVMCI
  // Prefer the AVX3 masked-copy stub when the CPU supports it.
  if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) {
    return generate_conjoint_copy_avx3_masked(stub_id, entry, nooverlap_target);
  }
#endif
  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, stub_id);
  address start = __ pc();
  DecoratorSet decorators = IN_HEAP | IS_ARRAY;

  Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes;
  const Register from = rdi; // source array address
  const Register to = rsi; // destination array address
  const Register count = rdx; // elements count
  const Register byte_count = rcx;
  const Register qword_count = count;

  __ enter(); // required for proper stackwalking of RuntimeStub frame
  assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int.

  if (entry != nullptr) {
    *entry = __ pc();
    // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
    BLOCK_COMMENT("Entry:");
  }

  // Branch to the disjoint (forward-copy) stub when src/dst do not overlap.
  array_overlap_test(nooverlap_target, Address::times_1);
  setup_arg_regs(); // from => rdi, to => rsi, count => rdx
  // r9 and r10 may be used to save non-volatile registers

  {
    // UnsafeMemoryAccess page error: continue after unsafe access
    UnsafeMemoryAccessMark umam(this, !aligned, true);
    // 'from', 'to' and 'count' are now valid
    __ movptr(byte_count, count);   // keep original byte total for the tail
    __ shrptr(count, 3); // count => qword_count

    // Copy from high to low addresses.
    // The sub-qword tail (byte, then word, then dword) is copied first so
    // the remaining length is a whole number of qwords.

    // Check for and copy trailing byte
    __ testl(byte_count, 1);
    __ jcc(Assembler::zero, L_copy_2_bytes);
    __ movb(rax, Address(from, byte_count, Address::times_1, -1));
    __ movb(Address(to, byte_count, Address::times_1, -1), rax);
    __ decrement(byte_count); // Adjust for possible trailing word

    // Check for and copy trailing word
    __ BIND(L_copy_2_bytes);
    __ testl(byte_count, 2);
    __ jcc(Assembler::zero, L_copy_4_bytes);
    __ movw(rax, Address(from, byte_count, Address::times_1, -2));
    __ movw(Address(to, byte_count, Address::times_1, -2), rax);

    // Check for and copy trailing dword
    __ BIND(L_copy_4_bytes);
    __ testl(byte_count, 4);
    __ jcc(Assembler::zero, L_copy_bytes);
    __ movl(rax, Address(from, qword_count, Address::times_8));
    __ movl(Address(to, qword_count, Address::times_8), rax);
    __ jmp(L_copy_bytes);

    // Copy trailing qwords
    __ BIND(L_copy_8_bytes);
    __ movq(rax, Address(from, qword_count, Address::times_8, -8));
    __ movq(Address(to, qword_count, Address::times_8, -8), rax);
    __ decrement(qword_count);
    __ jcc(Assembler::notZero, L_copy_8_bytes);
  }
  restore_arg_regs();
  INC_COUNTER_NP(SharedRuntime::_jbyte_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
  __ xorptr(rax, rax); // return 0
  __ vzeroupper();
  __ leave(); // required for proper stackwalking of RuntimeStub frame
  __ ret(0);

  {
    // UnsafeMemoryAccess page error: continue after unsafe access
    UnsafeMemoryAccessMark umam(this, !aligned, true);
    // Copy in multi-bytes chunks
    // Out-of-line bulk copy, reached through L_copy_bytes; it branches back
    // to L_copy_8_bytes above for the final qwords, then falls through to
    // this second epilogue.
    copy_bytes_backward(from, to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, T_BYTE);
  }
  restore_arg_regs();
  INC_COUNTER_NP(SharedRuntime::_jbyte_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
  __ xorptr(rax, rax); // return 0
  __ vzeroupper();
  __ leave(); // required for proper stackwalking of RuntimeStub frame
  __ ret(0);

  return start;
}
1591
1592
1593 // Arguments:
1594 // entry - location for return of (post-push) entry
1595 //
1596 // Inputs:
1597 // c_rarg0 - source array address
1598 // c_rarg1 - destination array address
1599 // c_rarg2 - element count, treated as ssize_t, can be zero
1600 //
1601 // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1602 // let the hardware handle it. The two or four words within dwords
1603 // or qwords that span cache line boundaries will still be loaded
1604 // and stored atomically.
1605 //
1606 // Side Effects:
1607 // entry is set to the no-overlap entry point
1608 // used by generate_conjoint_short_copy().
1609 //
address StubGenerator::generate_disjoint_short_copy(address *entry) {
  StubId stub_id = StubId::stubgen_jshort_disjoint_arraycopy_id;
  // aligned is always false -- x86_64 always uses the unaligned code
  const bool aligned = false;
#if COMPILER2_OR_JVMCI
  // Emit the AVX512 masked-vector variant instead when the CPU supports
  // AVX512VL/BW and BMI2 and at least 32-byte vectors are allowed.
  if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) {
    return generate_disjoint_copy_avx3_masked(stub_id, entry);
  }
#endif

  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, stub_id);
  address start = __ pc();
  DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;

  Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes,L_copy_2_bytes,L_exit;
  const Register from = rdi; // source array address
  const Register to = rsi; // destination array address
  const Register count = rdx; // elements count
  const Register word_count = rcx; // original jshort element count
  const Register qword_count = count; // aliases 'count' once it holds qwords
  const Register end_from = from; // source array end address
  const Register end_to = to; // destination array end address
  // End pointers are inclusive, and if count is not zero they point
  // to the last unit copied: end_to[0] := end_from[0]

  __ enter(); // required for proper stackwalking of RuntimeStub frame
  assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int.

  if (entry != nullptr) {
    *entry = __ pc();
    // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
    BLOCK_COMMENT("Entry:");
  }

  setup_arg_regs(); // from => rdi, to => rsi, count => rdx
  // r9 and r10 may be used to save non-volatile registers

  {
    // UnsafeMemoryAccess page error: continue after unsafe access
    UnsafeMemoryAccessMark umam(this, !aligned, true);
    // 'from', 'to' and 'count' are now valid
    __ movptr(word_count, count);
    __ shrptr(count, 2); // count => qword_count (4 jshorts per qword)

    // Copy from low to high addresses. Use 'to' as scratch.
    // Point end_from/end_to at the last qword and run qword_count up from
    // -count to zero, so a single register is both index and loop counter.
    __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
    __ lea(end_to, Address(to, qword_count, Address::times_8, -8));
    __ negptr(qword_count);
    __ jmp(L_copy_bytes);

    // Copy trailing qwords
    __ BIND(L_copy_8_bytes);
    __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
    __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
    __ increment(qword_count);
    __ jcc(Assembler::notZero, L_copy_8_bytes);

    // Original 'dest' is trashed, so we can't use it as a
    // base register for a possible trailing word copy

    // Check for and copy trailing dword (two jshorts just past the qwords;
    // offset 8 is one qword past the inclusive end pointers)
    __ BIND(L_copy_4_bytes);
    __ testl(word_count, 2);
    __ jccb(Assembler::zero, L_copy_2_bytes);
    __ movl(rax, Address(end_from, 8));
    __ movl(Address(end_to, 8), rax);

    __ addptr(end_from, 4);
    __ addptr(end_to, 4);

    // Check for and copy trailing word (a single leftover jshort)
    __ BIND(L_copy_2_bytes);
    __ testl(word_count, 1);
    __ jccb(Assembler::zero, L_exit);
    __ movw(rax, Address(end_from, 8));
    __ movw(Address(end_to, 8), rax);
  }
  __ BIND(L_exit);
  // PC recorded here is where execution resumes if an unsafe access in the
  // bulk-copy code emitted below faults (passed to the second mark).
  address ucme_exit_pc = __ pc();
  restore_arg_regs();
  INC_COUNTER_NP(SharedRuntime::_jshort_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
  __ xorptr(rax, rax); // return 0
  __ vzeroupper();
  __ leave(); // required for proper stackwalking of RuntimeStub frame
  __ ret(0);

  {
    // UnsafeMemoryAccess page error: continue after unsafe access
    UnsafeMemoryAccessMark umam(this, !aligned, false, ucme_exit_pc);
    // Copy in multi-bytes chunks
    copy_bytes_forward(end_from, end_to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, T_SHORT);
    __ jmp(L_copy_4_bytes);
  }

  return start;
}
1706
1707
1708 address StubGenerator::generate_fill(StubId stub_id) {
1709 BasicType t;
1710 bool aligned;
1711
1712 switch (stub_id) {
1713 case StubId::stubgen_jbyte_fill_id:
1714 t = T_BYTE;
1715 aligned = false;
1716 break;
1717 case StubId::stubgen_jshort_fill_id:
1718 t = T_SHORT;
1719 aligned = false;
1720 break;
1721 case StubId::stubgen_jint_fill_id:
1722 t = T_INT;
1723 aligned = false;
1724 break;
1725 case StubId::stubgen_arrayof_jbyte_fill_id:
1726 t = T_BYTE;
1727 aligned = true;
1728 break;
1729 case StubId::stubgen_arrayof_jshort_fill_id:
1730 t = T_SHORT;
1731 aligned = true;
1732 break;
1733 case StubId::stubgen_arrayof_jint_fill_id:
1734 t = T_INT;
1735 aligned = true;
1736 break;
1737 default:
1738 ShouldNotReachHere();
1739 }
1740
1741 __ align(CodeEntryAlignment);
1742 StubCodeMark mark(this, stub_id);
1743 address start = __ pc();
1744
1745 BLOCK_COMMENT("Entry:");
1746
1747 const Register to = c_rarg0; // destination array address
1748 const Register value = c_rarg1; // value
1749 const Register count = c_rarg2; // elements count
1750 __ mov(r11, count);
1751
1752 __ enter(); // required for proper stackwalking of RuntimeStub frame
1753
1754 {
1755 // Add set memory mark to protect against unsafe accesses faulting
1756 UnsafeMemoryAccessMark umam(this, ((t == T_BYTE) && !aligned), true);
1757 __ generate_fill(t, aligned, to, value, r11, rax, xmm0);
1758 }
1759
1760 __ vzeroupper();
1761 __ leave(); // required for proper stackwalking of RuntimeStub frame
1762 __ ret(0);
1763
1764 return start;
1765 }
1766
1767
1768 // Arguments:
1769 // entry - location for return of (post-push) entry
1770 // nooverlap_target - entry to branch to if no overlap detected
1771 //
1772 // Inputs:
1773 // c_rarg0 - source array address
1774 // c_rarg1 - destination array address
1775 // c_rarg2 - element count, treated as ssize_t, can be zero
1776 //
1777 // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1778 // let the hardware handle it. The two or four words within dwords
1779 // or qwords that span cache line boundaries will still be loaded
1780 // and stored atomically.
1781 //
address StubGenerator::generate_conjoint_short_copy(address nooverlap_target, address *entry) {
  StubId stub_id = StubId::stubgen_jshort_arraycopy_id;
  // aligned is always false -- x86_64 always uses the unaligned code
  const bool aligned = false;
#if COMPILER2_OR_JVMCI
  // Emit the AVX512 masked-vector variant instead when the CPU supports it.
  if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) {
    return generate_conjoint_copy_avx3_masked(stub_id, entry, nooverlap_target);
  }
#endif

  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, stub_id);
  address start = __ pc();
  DecoratorSet decorators = IN_HEAP | IS_ARRAY;

  Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes;
  const Register from = rdi; // source array address
  const Register to = rsi; // destination array address
  const Register count = rdx; // elements count
  const Register word_count = rcx; // original jshort element count
  const Register qword_count = count; // aliases 'count' once it holds qwords

  __ enter(); // required for proper stackwalking of RuntimeStub frame
  assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int.

  if (entry != nullptr) {
    *entry = __ pc();
    // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
    BLOCK_COMMENT("Entry:");
  }

  // Branch to the disjoint stub when the regions do not overlap.
  array_overlap_test(nooverlap_target, Address::times_2);
  setup_arg_regs(); // from => rdi, to => rsi, count => rdx
  // r9 and r10 may be used to save non-volatile registers

  {
    // UnsafeMemoryAccess page error: continue after unsafe access
    UnsafeMemoryAccessMark umam(this, !aligned, true);
    // 'from', 'to' and 'count' are now valid
    __ movptr(word_count, count);
    __ shrptr(count, 2); // count => qword_count (4 jshorts per qword)

    // Copy from high to low addresses. Use 'to' as scratch.

    // Check for and copy trailing word (a single leftover jshort at the top)
    __ testl(word_count, 1);
    __ jccb(Assembler::zero, L_copy_4_bytes);
    __ movw(rax, Address(from, word_count, Address::times_2, -2));
    __ movw(Address(to, word_count, Address::times_2, -2), rax);

    // Check for and copy trailing dword (two jshorts just above the qwords)
    __ BIND(L_copy_4_bytes);
    __ testl(word_count, 2);
    __ jcc(Assembler::zero, L_copy_bytes);
    __ movl(rax, Address(from, qword_count, Address::times_8));
    __ movl(Address(to, qword_count, Address::times_8), rax);
    __ jmp(L_copy_bytes);

    // Copy trailing qwords
    __ BIND(L_copy_8_bytes);
    __ movq(rax, Address(from, qword_count, Address::times_8, -8));
    __ movq(Address(to, qword_count, Address::times_8, -8), rax);
    __ decrement(qword_count);
    __ jcc(Assembler::notZero, L_copy_8_bytes);
  }
  restore_arg_regs();
  INC_COUNTER_NP(SharedRuntime::_jshort_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
  __ xorptr(rax, rax); // return 0
  __ vzeroupper();
  __ leave(); // required for proper stackwalking of RuntimeStub frame
  __ ret(0);

  {
    // UnsafeMemoryAccess page error: continue after unsafe access
    UnsafeMemoryAccessMark umam(this, !aligned, true);
    // Copy in multi-bytes chunks
    copy_bytes_backward(from, to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, T_SHORT);
  }
  restore_arg_regs();
  INC_COUNTER_NP(SharedRuntime::_jshort_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
  __ xorptr(rax, rax); // return 0
  __ vzeroupper();
  __ leave(); // required for proper stackwalking of RuntimeStub frame
  __ ret(0);

  return start;
}
1869
1870
1871 // Arguments:
//   stub_id - unique id for stub to generate
1873 // entry - location for return of (post-push) entry
1874 // is_oop - true => oop array, so generate store check code
1875 //
1876 // Inputs:
1877 // c_rarg0 - source array address
1878 // c_rarg1 - destination array address
1879 // c_rarg2 - element count, treated as ssize_t, can be zero
1880 //
1881 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1882 // the hardware handle it. The two dwords within qwords that span
1883 // cache line boundaries will still be loaded and stored atomically.
1884 //
1885 // Side Effects:
1886 // disjoint_int_copy_entry is set to the no-overlap entry point
1887 // used by generate_conjoint_int_oop_copy().
1888 //
address StubGenerator::generate_disjoint_int_oop_copy(StubId stub_id, address* entry) {
  // aligned is always false -- x86_64 always uses the unaligned code
  const bool aligned = false;
  bool is_oop;
  bool dest_uninitialized;
  // Decode copy kind from the stub id. Oop variants are only valid with
  // compressed (4-byte) oops, which share the jint copy machinery.
  switch (stub_id) {
  case StubId::stubgen_jint_disjoint_arraycopy_id:
    is_oop = false;
    dest_uninitialized = false;
    break;
  case StubId::stubgen_oop_disjoint_arraycopy_id:
    assert(UseCompressedOops, "inconsistent oop copy size!");
    is_oop = true;
    dest_uninitialized = false;
    break;
  case StubId::stubgen_oop_disjoint_arraycopy_uninit_id:
    assert(UseCompressedOops, "inconsistent oop copy size!");
    is_oop = true;
    dest_uninitialized = true;
    break;
  default:
    ShouldNotReachHere();
  }

  BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
#if COMPILER2_OR_JVMCI
  // Emit the AVX512 masked variant when the CPU is capable and, for oop
  // copies, the barrier set supports it.
  if ((!is_oop || bs->supports_avx3_masked_arraycopy()) && VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) {
    return generate_disjoint_copy_avx3_masked(stub_id, entry);
  }
#endif

  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, stub_id);
  address start = __ pc();

  Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_exit;
  const Register from = rdi; // source array address
  const Register to = rsi; // destination array address
  const Register count = rdx; // elements count
  const Register dword_count = rcx; // original jint/narrow-oop element count
  const Register qword_count = count; // aliases 'count' once it holds qwords
  const Register end_from = from; // source array end address
  const Register end_to = to; // destination array end address
  // End pointers are inclusive, and if count is not zero they point
  // to the last unit copied: end_to[0] := end_from[0]

  __ enter(); // required for proper stackwalking of RuntimeStub frame
  assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int.

  if (entry != nullptr) {
    *entry = __ pc();
    // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
    BLOCK_COMMENT("Entry:");
  }

  setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
  // r9 is used to save r15_thread

  DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
  if (dest_uninitialized) {
    decorators |= IS_DEST_UNINITIALIZED;
  }
  if (aligned) {
    decorators |= ARRAYCOPY_ALIGNED;
  }

  BasicType type = is_oop ? T_OBJECT : T_INT;
  bs->arraycopy_prologue(_masm, decorators, type, from, to, count);

  {
    // UnsafeMemoryAccess page error: continue after unsafe access
    UnsafeMemoryAccessMark umam(this, !is_oop && !aligned, true);
    // 'from', 'to' and 'count' are now valid
    __ movptr(dword_count, count);
    __ shrptr(count, 1); // count => qword_count (2 dwords per qword)

    // Copy from low to high addresses. Use 'to' as scratch.
    // Point end_from/end_to at the last qword and run qword_count up from
    // -count to zero, so a single register is both index and loop counter.
    __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
    __ lea(end_to, Address(to, qword_count, Address::times_8, -8));
    __ negptr(qword_count);
    __ jmp(L_copy_bytes);

    // Copy trailing qwords
    __ BIND(L_copy_8_bytes);
    __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
    __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
    __ increment(qword_count);
    __ jcc(Assembler::notZero, L_copy_8_bytes);

    // Check for and copy trailing dword
    __ BIND(L_copy_4_bytes);
    __ testl(dword_count, 1); // low bit set <=> one leftover dword
    __ jccb(Assembler::zero, L_exit);
    __ movl(rax, Address(end_from, 8));
    __ movl(Address(end_to, 8), rax);
  }
  __ BIND(L_exit);
  // PC recorded here is where execution resumes if an unsafe access in the
  // bulk-copy code emitted below faults (passed to the second mark).
  address ucme_exit_pc = __ pc();
  bs->arraycopy_epilogue(_masm, decorators, type, from, to, dword_count);
  restore_arg_regs_using_thread();
  INC_COUNTER_NP(SharedRuntime::_jint_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
  __ vzeroupper();
  __ xorptr(rax, rax); // return 0
  __ leave(); // required for proper stackwalking of RuntimeStub frame
  __ ret(0);

  {
    // UnsafeMemoryAccess page error: continue after unsafe access
    UnsafeMemoryAccessMark umam(this, !is_oop && !aligned, false, ucme_exit_pc);
    // Copy in multi-bytes chunks
    copy_bytes_forward(end_from, end_to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, is_oop ? T_OBJECT : T_INT);
    __ jmp(L_copy_4_bytes);
  }

  return start;
}
2004
2005
2006 // Arguments:
2007 // entry - location for return of (post-push) entry
2008 // nooverlap_target - entry to branch to if no overlap detected
2009 // is_oop - true => oop array, so generate store check code
2010 //
2011 // Inputs:
2012 // c_rarg0 - source array address
2013 // c_rarg1 - destination array address
2014 // c_rarg2 - element count, treated as ssize_t, can be zero
2015 //
2016 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
2017 // the hardware handle it. The two dwords within qwords that span
2018 // cache line boundaries will still be loaded and stored atomically.
2019 //
address StubGenerator::generate_conjoint_int_oop_copy(StubId stub_id, address nooverlap_target, address *entry) {
  // aligned is always false -- x86_64 always uses the unaligned code
  const bool aligned = false;
  bool is_oop;
  bool dest_uninitialized;
  // Decode copy kind from the stub id. Oop variants are only valid with
  // compressed (4-byte) oops, which share the jint copy machinery.
  switch (stub_id) {
  case StubId::stubgen_jint_arraycopy_id:
    is_oop = false;
    dest_uninitialized = false;
    break;
  case StubId::stubgen_oop_arraycopy_id:
    assert(UseCompressedOops, "inconsistent oop copy size!");
    is_oop = true;
    dest_uninitialized = false;
    break;
  case StubId::stubgen_oop_arraycopy_uninit_id:
    assert(UseCompressedOops, "inconsistent oop copy size!");
    is_oop = true;
    dest_uninitialized = true;
    break;
  default:
    ShouldNotReachHere();
  }

  BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
#if COMPILER2_OR_JVMCI
  // Emit the AVX512 masked variant when the CPU is capable and, for oop
  // copies, the barrier set supports it.
  if ((!is_oop || bs->supports_avx3_masked_arraycopy()) && VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) {
    return generate_conjoint_copy_avx3_masked(stub_id, entry, nooverlap_target);
  }
#endif

  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, stub_id);
  address start = __ pc();

  Label L_copy_bytes, L_copy_8_bytes, L_exit;
  const Register from = rdi; // source array address
  const Register to = rsi; // destination array address
  const Register count = rdx; // elements count
  const Register dword_count = rcx; // original jint/narrow-oop element count
  const Register qword_count = count; // aliases 'count' once it holds qwords

  __ enter(); // required for proper stackwalking of RuntimeStub frame
  assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int.

  if (entry != nullptr) {
    *entry = __ pc();
    // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
    BLOCK_COMMENT("Entry:");
  }

  // Branch to the disjoint stub when the regions do not overlap.
  array_overlap_test(nooverlap_target, Address::times_4);
  setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
  // r9 is used to save r15_thread

  DecoratorSet decorators = IN_HEAP | IS_ARRAY;
  if (dest_uninitialized) {
    decorators |= IS_DEST_UNINITIALIZED;
  }
  if (aligned) {
    decorators |= ARRAYCOPY_ALIGNED;
  }

  BasicType type = is_oop ? T_OBJECT : T_INT;
  // no registers are destroyed by this call
  bs->arraycopy_prologue(_masm, decorators, type, from, to, count);

  assert_clean_int(count, rax); // Make sure 'count' is clean int.
  {
    // UnsafeMemoryAccess page error: continue after unsafe access
    UnsafeMemoryAccessMark umam(this, !is_oop && !aligned, true);
    // 'from', 'to' and 'count' are now valid
    __ movptr(dword_count, count);
    __ shrptr(count, 1); // count => qword_count (2 dwords per qword)

    // Copy from high to low addresses. Use 'to' as scratch.

    // Check for and copy trailing dword (one leftover dword at the top)
    __ testl(dword_count, 1);
    __ jcc(Assembler::zero, L_copy_bytes);
    __ movl(rax, Address(from, dword_count, Address::times_4, -4));
    __ movl(Address(to, dword_count, Address::times_4, -4), rax);
    __ jmp(L_copy_bytes);

    // Copy trailing qwords
    __ BIND(L_copy_8_bytes);
    __ movq(rax, Address(from, qword_count, Address::times_8, -8));
    __ movq(Address(to, qword_count, Address::times_8, -8), rax);
    __ decrement(qword_count);
    __ jcc(Assembler::notZero, L_copy_8_bytes);
  }
  if (is_oop) {
    // Oop copies must run the barrier epilogue at L_exit before returning.
    __ jmp(L_exit);
  }
  restore_arg_regs_using_thread();
  INC_COUNTER_NP(SharedRuntime::_jint_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
  __ xorptr(rax, rax); // return 0
  __ vzeroupper();
  __ leave(); // required for proper stackwalking of RuntimeStub frame
  __ ret(0);

  {
    // UnsafeMemoryAccess page error: continue after unsafe access
    UnsafeMemoryAccessMark umam(this, !is_oop && !aligned, true);
    // Copy in multi-bytes chunks
    copy_bytes_backward(from, to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, is_oop ? T_OBJECT : T_INT);
  }

  __ BIND(L_exit);
  bs->arraycopy_epilogue(_masm, decorators, type, from, to, dword_count);
  restore_arg_regs_using_thread();
  INC_COUNTER_NP(SharedRuntime::_jint_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
  __ xorptr(rax, rax); // return 0
  __ vzeroupper();
  __ leave(); // required for proper stackwalking of RuntimeStub frame
  __ ret(0);

  return start;
}
2139
2140
2141 // Arguments:
2142 // entry - location for return of (post-push) entry
2143 //
2144 // Inputs:
2145 // c_rarg0 - source array address
2146 // c_rarg1 - destination array address
2147 // c_rarg2 - element count, treated as ssize_t, can be zero
2148 //
2149 // Side Effects:
2150 // disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
2151 // no-overlap entry point used by generate_conjoint_long_oop_copy().
2152 //
2153 address StubGenerator::generate_disjoint_long_oop_copy(StubId stub_id, address *entry) {
2154 // aligned is always false -- x86_64 always uses the unaligned code
2155 const bool aligned = false;
2156 bool is_oop;
2157 bool dest_uninitialized;
2158 switch (stub_id) {
2159 case StubId::stubgen_jlong_disjoint_arraycopy_id:
2160 is_oop = false;
2161 dest_uninitialized = false;
2162 break;
2163 case StubId::stubgen_oop_disjoint_arraycopy_id:
2164 assert(!UseCompressedOops, "inconsistent oop copy size!");
2165 is_oop = true;
2166 dest_uninitialized = false;
2167 break;
2168 case StubId::stubgen_oop_disjoint_arraycopy_uninit_id:
2169 assert(!UseCompressedOops, "inconsistent oop copy size!");
2170 is_oop = true;
2171 dest_uninitialized = true;
2172 break;
2173 default:
2174 ShouldNotReachHere();
2175 }
2176
2177 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2178 #if COMPILER2_OR_JVMCI
2179 if ((!is_oop || bs->supports_avx3_masked_arraycopy()) && VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) {
2180 return generate_disjoint_copy_avx3_masked(stub_id, entry);
2181 }
2182 #endif
2183
2184 __ align(CodeEntryAlignment);
2185 StubCodeMark mark(this, stub_id);
2186 address start = __ pc();
2187
2188 Label L_copy_bytes, L_copy_8_bytes, L_exit;
2189 const Register from = rdi; // source array address
2190 const Register to = rsi; // destination array address
2191 const Register qword_count = rdx; // elements count
2192 const Register end_from = from; // source array end address
2193 const Register end_to = rcx; // destination array end address
2194 const Register saved_count = r11;
2195 // End pointers are inclusive, and if count is not zero they point
2196 // to the last unit copied: end_to[0] := end_from[0]
2197
2198 __ enter(); // required for proper stackwalking of RuntimeStub frame
2199 // Save no-overlap entry point for generate_conjoint_long_oop_copy()
2200 assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int.
2201
2202 if (entry != nullptr) {
2203 *entry = __ pc();
2204 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2205 BLOCK_COMMENT("Entry:");
2206 }
2207
2208 setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
2209 // r9 is used to save r15_thread
2210 // 'from', 'to' and 'qword_count' are now valid
2211
2212 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
2213 if (dest_uninitialized) {
2214 decorators |= IS_DEST_UNINITIALIZED;
2215 }
2216 if (aligned) {
2217 decorators |= ARRAYCOPY_ALIGNED;
2218 }
2219
2220 BasicType type = is_oop ? T_OBJECT : T_LONG;
2221 bs->arraycopy_prologue(_masm, decorators, type, from, to, qword_count);
2222 {
2223 // UnsafeMemoryAccess page error: continue after unsafe access
2224 UnsafeMemoryAccessMark umam(this, !is_oop && !aligned, true);
2225
2226 // Copy from low to high addresses. Use 'to' as scratch.
2227 __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
2228 __ lea(end_to, Address(to, qword_count, Address::times_8, -8));
2229 __ negptr(qword_count);
2230 __ jmp(L_copy_bytes);
2231
2232 // Copy trailing qwords
2233 __ BIND(L_copy_8_bytes);
2234 bs->copy_load_at(_masm, decorators, type, 8,
2235 rax, Address(end_from, qword_count, Address::times_8, 8),
2236 r10);
2237 bs->copy_store_at(_masm, decorators, type, 8,
2238 Address(end_to, qword_count, Address::times_8, 8), rax,
2239 r10);
2240 __ increment(qword_count);
2241 __ jcc(Assembler::notZero, L_copy_8_bytes);
2242 }
2243 if (is_oop) {
2244 __ jmp(L_exit);
2245 } else {
2246 restore_arg_regs_using_thread();
2247 INC_COUNTER_NP(SharedRuntime::_jlong_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
2248 __ xorptr(rax, rax); // return 0
2249 __ vzeroupper();
2250 __ leave(); // required for proper stackwalking of RuntimeStub frame
2251 __ ret(0);
2252 }
2253
2254 {
2255 // UnsafeMemoryAccess page error: continue after unsafe access
2256 UnsafeMemoryAccessMark umam(this, !is_oop && !aligned, true);
2257 // Copy in multi-bytes chunks
2258 copy_bytes_forward(end_from, end_to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, is_oop ? T_OBJECT : T_LONG);
2259 }
2260
2261 __ BIND(L_exit);
2262 bs->arraycopy_epilogue(_masm, decorators, type, from, to, qword_count);
2263 restore_arg_regs_using_thread();
2264 INC_COUNTER_NP(is_oop ? SharedRuntime::_oop_array_copy_ctr :
2265 SharedRuntime::_jlong_array_copy_ctr,
2266 rscratch1); // Update counter after rscratch1 is free
2267 __ vzeroupper();
2268 __ xorptr(rax, rax); // return 0
2269 __ leave(); // required for proper stackwalking of RuntimeStub frame
2270 __ ret(0);
2271
2272 return start;
2273 }
2274
2275
2276 // Arguments:
2277 // entry - location for return of (post-push) entry
2278 // nooverlap_target - entry to branch to if no overlap detected
2279 // is_oop - true => oop array, so generate store check code
2280 //
2281 // Inputs:
2282 // c_rarg0 - source array address
2283 // c_rarg1 - destination array address
2284 // c_rarg2 - element count, treated as ssize_t, can be zero
2285 //
2286 address StubGenerator::generate_conjoint_long_oop_copy(StubId stub_id, address nooverlap_target, address *entry) {
2287 // aligned is always false -- x86_64 always uses the unaligned code
2288 const bool aligned = false;
2289 bool is_oop;
2290 bool dest_uninitialized;
2291 switch (stub_id) {
2292 case StubId::stubgen_jlong_arraycopy_id:
2293 is_oop = false;
2294 dest_uninitialized = false;
2295 break;
2296 case StubId::stubgen_oop_arraycopy_id:
2297 assert(!UseCompressedOops, "inconsistent oop copy size!");
2298 is_oop = true;
2299 dest_uninitialized = false;
2300 break;
2301 case StubId::stubgen_oop_arraycopy_uninit_id:
2302 assert(!UseCompressedOops, "inconsistent oop copy size!");
2303 is_oop = true;
2304 dest_uninitialized = true;
2305 break;
2306 default:
2307 ShouldNotReachHere();
2308 }
2309
2310 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2311 #if COMPILER2_OR_JVMCI
2312 if ((!is_oop || bs->supports_avx3_masked_arraycopy()) && VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) {
2313 return generate_conjoint_copy_avx3_masked(stub_id, entry, nooverlap_target);
2314 }
2315 #endif
2316
2317 __ align(CodeEntryAlignment);
2318 StubCodeMark mark(this, stub_id);
2319 address start = __ pc();
2320
2321 Label L_copy_bytes, L_copy_8_bytes, L_exit;
2322 const Register from = rdi; // source array address
2323 const Register to = rsi; // destination array address
2324 const Register qword_count = rdx; // elements count
2325 const Register saved_count = rcx;
2326
2327 __ enter(); // required for proper stackwalking of RuntimeStub frame
2328 assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int.
2329
2330 if (entry != nullptr) {
2331 *entry = __ pc();
2332 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2333 BLOCK_COMMENT("Entry:");
2334 }
2335
2336 array_overlap_test(nooverlap_target, Address::times_8);
2337 setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
2338 // r9 is used to save r15_thread
2339 // 'from', 'to' and 'qword_count' are now valid
2340
2341 DecoratorSet decorators = IN_HEAP | IS_ARRAY;
2342 if (dest_uninitialized) {
2343 decorators |= IS_DEST_UNINITIALIZED;
2344 }
2345 if (aligned) {
2346 decorators |= ARRAYCOPY_ALIGNED;
2347 }
2348
2349 BasicType type = is_oop ? T_OBJECT : T_LONG;
2350 bs->arraycopy_prologue(_masm, decorators, type, from, to, qword_count);
2351 {
2352 // UnsafeMemoryAccess page error: continue after unsafe access
2353 UnsafeMemoryAccessMark umam(this, !is_oop && !aligned, true);
2354
2355 __ jmp(L_copy_bytes);
2356
2357 // Copy trailing qwords
2358 __ BIND(L_copy_8_bytes);
2359 bs->copy_load_at(_masm, decorators, type, 8,
2360 rax, Address(from, qword_count, Address::times_8, -8),
2361 r10);
2362 bs->copy_store_at(_masm, decorators, type, 8,
2363 Address(to, qword_count, Address::times_8, -8), rax,
2364 r10);
2365 __ decrement(qword_count);
2366 __ jcc(Assembler::notZero, L_copy_8_bytes);
2367 }
2368 if (is_oop) {
2369 __ jmp(L_exit);
2370 } else {
2371 restore_arg_regs_using_thread();
2372 INC_COUNTER_NP(SharedRuntime::_jlong_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
2373 __ xorptr(rax, rax); // return 0
2374 __ vzeroupper();
2375 __ leave(); // required for proper stackwalking of RuntimeStub frame
2376 __ ret(0);
2377 }
2378 {
2379 // UnsafeMemoryAccess page error: continue after unsafe access
2380 UnsafeMemoryAccessMark umam(this, !is_oop && !aligned, true);
2381
2382 // Copy in multi-bytes chunks
2383 copy_bytes_backward(from, to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, is_oop ? T_OBJECT : T_LONG);
2384 }
2385 __ BIND(L_exit);
2386 bs->arraycopy_epilogue(_masm, decorators, type, from, to, qword_count);
2387 restore_arg_regs_using_thread();
2388 INC_COUNTER_NP(is_oop ? SharedRuntime::_oop_array_copy_ctr :
2389 SharedRuntime::_jlong_array_copy_ctr,
2390 rscratch1); // Update counter after rscratch1 is free
2391 __ vzeroupper();
2392 __ xorptr(rax, rax); // return 0
2393 __ leave(); // required for proper stackwalking of RuntimeStub frame
2394 __ ret(0);
2395
2396 return start;
2397 }
2398
2399
2400 // Helper for generating a dynamic type check.
2401 // Smashes no registers.
// Emits a dynamic subtype check: jumps to L_success if sub_klass is a
// subtype of super_klass, falls through on failure. Smashes no registers.
void StubGenerator::generate_type_check(Register sub_klass,
                                        Register super_check_offset,
                                        Register super_klass,
                                        Label& L_success) {
  assert_different_registers(sub_klass, super_check_offset, super_klass);

  BLOCK_COMMENT("type_check:");

  Label L_miss;

  // Fast path: jumps to L_success or L_miss when it can decide cheaply;
  // otherwise falls through into the slow path below.
  __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, nullptr,
                                   super_check_offset);
  // Slow path: full subtype scan; jumps to L_success on a hit.
  __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, nullptr);

  // Fall through on failure!
  __ BIND(L_miss);
}
2419
2420 //
2421 // Generate checkcasting array copy stub
2422 //
2423 // Input:
2424 // c_rarg0 - source array address
2425 // c_rarg1 - destination array address
2426 // c_rarg2 - element count, treated as ssize_t, can be zero
2427 // c_rarg3 - size_t ckoff (super_check_offset)
2428 // not Win64
2429 // c_rarg4 - oop ckval (super_klass)
2430 // Win64
2431 // rsp+40 - oop ckval (super_klass)
2432 //
2433 // Output:
2434 // rax == 0 - success
2435 // rax == -1^K - failure, where K is partial transfer count
2436 //
address StubGenerator::generate_checkcast_copy(StubId stub_id, address *entry) {

  // The "uninit" flavor targets freshly allocated destination arrays;
  // the IS_DEST_UNINITIALIZED decorator derived from it below lets the
  // GC barrier assembler elide work for such destinations.
  bool dest_uninitialized;
  switch (stub_id) {
  case StubId::stubgen_checkcast_arraycopy_id:
    dest_uninitialized = false;
    break;
  case StubId::stubgen_checkcast_arraycopy_uninit_id:
    dest_uninitialized = true;
    break;
  default:
    ShouldNotReachHere();
  }

  Label L_load_element, L_store_element, L_do_card_marks, L_done;

  // Input registers (after setup_arg_regs)
  const Register from        = rdi;   // source array address
  const Register to          = rsi;   // destination array address
  const Register length      = rdx;   // elements count
  const Register ckoff       = rcx;   // super_check_offset
  const Register ckval       = r8;    // super_klass

  // Registers used as temps (r13, r14 are save-on-entry)
  const Register end_from    = from;  // source array end address
  const Register end_to      = r13;   // destination array end address
  const Register count       = rdx;   // -(count_remaining)
  const Register r14_length  = r14;   // saved copy of length
  // End pointers are inclusive, and if length is not zero they point
  // to the last unit copied:  end_to[0] := end_from[0]

  const Register rax_oop    = rax;    // actual oop copied
  const Register r11_klass  = r11;    // oop._klass

  //---------------------------------------------------------------
  // Assembler stub will be used for this call to arraycopy
  // if the two arrays are subtypes of Object[] but the
  // destination array type is not equal to or a supertype
  // of the source type.  Each element must be separately
  // checked.

  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, stub_id);
  address start = __ pc();

  __ enter(); // required for proper stackwalking of RuntimeStub frame

#ifdef ASSERT
  // caller guarantees that the arrays really are different
  // otherwise, we would have to make conjoint checks
  { Label L;
    array_overlap_test(L, TIMES_OOP);
    __ stop("checkcast_copy within a single array");
    __ bind(L);
  }
#endif //ASSERT

  setup_arg_regs_using_thread(4); // from => rdi, to => rsi, length => rdx
                                  // ckoff => rcx, ckval => r8
                                  // r9 is used to save r15_thread
#ifdef _WIN64
  // last argument (#4) is on stack on Win64
  __ movptr(ckval, Address(rsp, 6 * wordSize));
#endif

  // Caller of this entry point must set up the argument registers.
  if (entry != nullptr) {
    *entry = __ pc();
    BLOCK_COMMENT("Entry:");
  }

  // allocate spill slots for r13, r14
  // (saved_rbp_offset, the last enumerator, doubles as the number of
  // words reserved below)
  enum {
    saved_r13_offset,
    saved_r14_offset,
    saved_r10_offset,
    saved_rbp_offset
  };
  __ subptr(rsp, saved_rbp_offset * wordSize);
  __ movptr(Address(rsp, saved_r13_offset * wordSize), r13);
  __ movptr(Address(rsp, saved_r14_offset * wordSize), r14);
  __ movptr(Address(rsp, saved_r10_offset * wordSize), r10);

#ifdef ASSERT
  // setup_arg_regs_using_thread relies on r15 holding the current
  // thread; verify that here.
  Label L2;
  __ get_thread_slow(r14);
  __ cmpptr(r15_thread, r14);
  __ jcc(Assembler::equal, L2);
  __ stop("StubRoutines::call_stub: r15_thread is modified by call");
  __ bind(L2);
#endif // ASSERT

  // check that int operands are properly extended to size_t
  assert_clean_int(length, rax);
  assert_clean_int(ckoff, rax);

#ifdef ASSERT
  BLOCK_COMMENT("assert consistent ckoff/ckval");
  // The ckoff and ckval must be mutually consistent,
  // even though caller generates both.
  { Label L;
    int sco_offset = in_bytes(Klass::super_check_offset_offset());
    __ cmpl(ckoff, Address(ckval, sco_offset));
    __ jcc(Assembler::equal, L);
    __ stop("super_check_offset inconsistent");
    __ bind(L);
  }
#endif //ASSERT

  // Loop-invariant addresses.  They are exclusive end pointers.
  Address end_from_addr(from, length, TIMES_OOP, 0);
  Address   end_to_addr(to,   length, TIMES_OOP, 0);
  // Loop-variant addresses.  They assume post-incremented count < 0.
  Address from_element_addr(end_from, count, TIMES_OOP, 0);
  Address   to_element_addr(end_to,   count, TIMES_OOP, 0);

  DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
  if (dest_uninitialized) {
    decorators |= IS_DEST_UNINITIALIZED;
  }

  BasicType type = T_OBJECT;
  size_t element_size = UseCompressedOops ? 4 : 8;

  BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
  bs->arraycopy_prologue(_masm, decorators, type, from, to, count);

  // Copy from low to high addresses, indexed from the end of each array.
  __ lea(end_from, end_from_addr);
  __ lea(end_to,   end_to_addr);
  __ movptr(r14_length, length); // save a copy of the length
  assert(length == count, "");   // else fix next line:
  __ negptr(count);              // negate and test the length
  __ jcc(Assembler::notZero, L_load_element);

  // Empty array:  Nothing to do.
  __ xorptr(rax, rax);           // return 0 on (trivial) success
  __ jmp(L_done);

  // ======== begin loop ========
  // (Loop is rotated; its entry is L_load_element.)
  // Loop control:
  //   for (count = -count; count != 0; count++)
  // Base pointers src, dst are biased by 8*(count-1),to last element.
  __ align(OptoLoopAlignment);

  __ BIND(L_store_element);
  bs->copy_store_at(_masm,
                    decorators,
                    type,
                    element_size,
                    to_element_addr,
                    rax_oop,
                    r10);
  __ increment(count);           // increment the count toward zero
  // count reaching zero means every element passed its check: success.
  __ jcc(Assembler::zero, L_do_card_marks);

  // ======== loop entry is here ========
  __ BIND(L_load_element);
  bs->copy_load_at(_masm,
                   decorators,
                   type,
                   element_size,
                   rax_oop,
                   from_element_addr,
                   r10);
  // null oops need no type check; store them directly.
  __ testptr(rax_oop, rax_oop);
  __ jcc(Assembler::zero, L_store_element);

  __ load_klass(r11_klass, rax_oop, rscratch1);// query the object klass
  // Jumps to L_store_element if r11_klass is a subtype of ckval;
  // falls through on failure.
  generate_type_check(r11_klass, ckoff, ckval, L_store_element);
  // ======== end loop ========

  // It was a real error; we must depend on the caller to finish the job.
  // Register rdx = -1 * number of *remaining* oops, r14 = *total* oops.
  // Emit GC store barriers for the oops we have copied (r14 + rdx),
  // and report their number to the caller.
  assert_different_registers(rax, r14_length, count, to, end_to, rcx, rscratch1);
  Label L_post_barrier;
  __ addptr(r14_length, count);     // K = (original - remaining) oops
  __ movptr(rax, r14_length);       // save the value
  __ notptr(rax);                   // report (-1^K) to caller (does not affect flags)
  // Flags still reflect the addptr above: K != 0 means some oops were
  // stored and need the post barrier.
  __ jccb(Assembler::notZero, L_post_barrier);
  __ jmp(L_done); // K == 0, nothing was copied, skip post barrier

  // Come here on success only.
  __ BIND(L_do_card_marks);
  __ xorptr(rax, rax); // return 0 on success

  __ BIND(L_post_barrier);
  bs->arraycopy_epilogue(_masm, decorators, type, from, to, r14_length);

  // Common exit point (success or failure).
  __ BIND(L_done);
  __ movptr(r13, Address(rsp, saved_r13_offset * wordSize));
  __ movptr(r14, Address(rsp, saved_r14_offset * wordSize));
  __ movptr(r10, Address(rsp, saved_r10_offset * wordSize));
  restore_arg_regs_using_thread();
  INC_COUNTER_NP(SharedRuntime::_checkcast_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
  __ leave(); // required for proper stackwalking of RuntimeStub frame
  __ ret(0);

  return start;
}
2641
2642
2643 // Generate 'unsafe' array copy stub
2644 // Though just as safe as the other stubs, it takes an unscaled
2645 // size_t argument instead of an element count.
2646 //
2647 // Input:
2648 // c_rarg0 - source array address
2649 // c_rarg1 - destination array address
2650 // c_rarg2 - byte count, treated as ssize_t, can be zero
2651 //
2652 // Examines the alignment of the operands and dispatches
2653 // to a long, int, short, or byte copy loop.
2654 //
2655 address StubGenerator::generate_unsafe_copy(address byte_copy_entry, address short_copy_entry,
2656 address int_copy_entry, address long_copy_entry) {
2657
2658 Label L_long_aligned, L_int_aligned, L_short_aligned;
2659
2660 // Input registers (before setup_arg_regs)
2661 const Register from = c_rarg0; // source array address
2662 const Register to = c_rarg1; // destination array address
2663 const Register size = c_rarg2; // byte count (size_t)
2664
2665 // Register used as a temp
2666 const Register bits = rax; // test copy of low bits
2667
2668 __ align(CodeEntryAlignment);
2669 StubId stub_id = StubId::stubgen_unsafe_arraycopy_id;
2670 StubCodeMark mark(this, stub_id);
2671 address start = __ pc();
2672
2673 __ enter(); // required for proper stackwalking of RuntimeStub frame
2674
2675 // bump this on entry, not on exit:
2676 INC_COUNTER_NP(SharedRuntime::_unsafe_array_copy_ctr, rscratch1);
2677
2678 __ mov(bits, from);
2679 __ orptr(bits, to);
2680 __ orptr(bits, size);
2681
2682 __ testb(bits, BytesPerLong-1);
2683 __ jccb(Assembler::zero, L_long_aligned);
2684
2685 __ testb(bits, BytesPerInt-1);
2686 __ jccb(Assembler::zero, L_int_aligned);
2687
2688 __ testb(bits, BytesPerShort-1);
2689 __ jump_cc(Assembler::notZero, RuntimeAddress(byte_copy_entry));
2690
2691 __ BIND(L_short_aligned);
2692 __ shrptr(size, LogBytesPerShort); // size => short_count
2693 __ jump(RuntimeAddress(short_copy_entry));
2694
2695 __ BIND(L_int_aligned);
2696 __ shrptr(size, LogBytesPerInt); // size => int_count
2697 __ jump(RuntimeAddress(int_copy_entry));
2698
2699 __ BIND(L_long_aligned);
2700 __ shrptr(size, LogBytesPerLong); // size => qword_count
2701 __ jump(RuntimeAddress(long_copy_entry));
2702
2703 return start;
2704 }
2705
2706
// Element-width selector for do_setmemory_atomic_loop below
// (16-, 32- or 64-bit stores respectively).
enum USM_TYPE {USM_SHORT, USM_DWORD, USM_QUADWORD};
2709 // Helper for generate_unsafe_setmemory
2710 //
2711 // Atomically fill an array of memory using 2-, 4-, or 8-byte chunks
2712 static void do_setmemory_atomic_loop(USM_TYPE type, Register dest,
2713 Register size, Register wide_value,
2714 Register tmp, Label& L_exit,
2715 MacroAssembler *_masm) {
2716 Label L_Loop, L_Tail, L_TailLoop;
2717
2718 int shiftval = 0;
2719 int incr = 0;
2720
2721 switch (type) {
2722 case USM_SHORT:
2723 shiftval = 1;
2724 incr = 16;
2725 break;
2726 case USM_DWORD:
2727 shiftval = 2;
2728 incr = 32;
2729 break;
2730 case USM_QUADWORD:
2731 shiftval = 3;
2732 incr = 64;
2733 break;
2734 }
2735
2736 // At this point, we know the lower bits of size are zero
2737 __ shrq(size, shiftval);
2738 // size now has number of X-byte chunks (2, 4 or 8)
2739
2740 // Number of (8*X)-byte chunks into tmp
2741 __ movq(tmp, size);
2742 __ shrq(tmp, 3);
2743 __ jccb(Assembler::zero, L_Tail);
2744
2745 __ BIND(L_Loop);
2746
2747 // Unroll 8 stores
2748 for (int i = 0; i < 8; i++) {
2749 switch (type) {
2750 case USM_SHORT:
2751 __ movw(Address(dest, (2 * i)), wide_value);
2752 break;
2753 case USM_DWORD:
2754 __ movl(Address(dest, (4 * i)), wide_value);
2755 break;
2756 case USM_QUADWORD:
2757 __ movq(Address(dest, (8 * i)), wide_value);
2758 break;
2759 }
2760 }
2761 __ addq(dest, incr);
2762 __ decrementq(tmp);
2763 __ jccb(Assembler::notZero, L_Loop);
2764
2765 __ BIND(L_Tail);
2766
2767 // Find number of remaining X-byte chunks
2768 __ andq(size, 0x7);
2769
2770 // If zero, then we're done
2771 __ jccb(Assembler::zero, L_exit);
2772
2773 __ BIND(L_TailLoop);
2774
2775 switch (type) {
2776 case USM_SHORT:
2777 __ movw(Address(dest, 0), wide_value);
2778 break;
2779 case USM_DWORD:
2780 __ movl(Address(dest, 0), wide_value);
2781 break;
2782 case USM_QUADWORD:
2783 __ movq(Address(dest, 0), wide_value);
2784 break;
2785 }
2786 __ addq(dest, incr >> 3);
2787 __ decrementq(size);
2788 __ jccb(Assembler::notZero, L_TailLoop);
2789 }
2790
2791 // Generate 'unsafe' set memory stub
2792 // Though just as safe as the other stubs, it takes an unscaled
2793 // size_t (# bytes) argument instead of an element count.
2794 //
2795 // Input:
2796 // c_rarg0 - destination array address
2797 // c_rarg1 - byte count (size_t)
2798 // c_rarg2 - byte value
2799 //
// Examines the alignment of the operands and dispatches
// to a quadword, dword, short, or byte fill loop.
2802 //
2803 address StubGenerator::generate_unsafe_setmemory(address unsafe_byte_fill) {
2804 __ align(CodeEntryAlignment);
2805 StubId stub_id = StubId::stubgen_unsafe_setmemory_id;
2806 StubCodeMark mark(this, stub_id);
2807 address start = __ pc();
2808 __ enter(); // required for proper stackwalking of RuntimeStub frame
2809
2810 assert(unsafe_byte_fill != nullptr, "Invalid call");
2811
2812 // bump this on entry, not on exit:
2813 INC_COUNTER_NP(SharedRuntime::_unsafe_set_memory_ctr, rscratch1);
2814
2815 {
2816 Label L_exit, L_fillQuadwords, L_fillDwords, L_fillBytes;
2817
2818 const Register dest = c_rarg0;
2819 const Register size = c_rarg1;
2820 const Register byteVal = c_rarg2;
2821 const Register wide_value = rax;
2822 const Register rScratch1 = r10;
2823
2824 assert_different_registers(dest, size, byteVal, wide_value, rScratch1);
2825
2826 // fill_to_memory_atomic(unsigned char*, unsigned long, unsigned char)
2827
2828 __ testq(size, size);
2829 __ jcc(Assembler::zero, L_exit);
2830
2831 // Propagate byte to full Register
2832 __ movzbl(rScratch1, byteVal);
2833 __ mov64(wide_value, 0x0101010101010101ULL);
2834 __ imulq(wide_value, rScratch1);
2835
2836 // Check for pointer & size alignment
2837 __ movq(rScratch1, dest);
2838 __ orq(rScratch1, size);
2839
2840 __ testb(rScratch1, 7);
2841 __ jcc(Assembler::equal, L_fillQuadwords);
2842
2843 __ testb(rScratch1, 3);
2844 __ jcc(Assembler::equal, L_fillDwords);
2845
2846 __ testb(rScratch1, 1);
2847 __ jcc(Assembler::notEqual, L_fillBytes);
2848
2849 // Fill words
2850 {
2851 UnsafeMemoryAccessMark umam(this, true, true);
2852
2853 // At this point, we know the lower bit of size is zero and a
2854 // multiple of 2
2855 do_setmemory_atomic_loop(USM_SHORT, dest, size, wide_value, rScratch1,
2856 L_exit, _masm);
2857 }
2858 __ jmpb(L_exit);
2859
2860 __ BIND(L_fillQuadwords);
2861
2862 // Fill QUADWORDs
2863 {
2864 UnsafeMemoryAccessMark umam(this, true, true);
2865
2866 // At this point, we know the lower 3 bits of size are zero and a
2867 // multiple of 8
2868 do_setmemory_atomic_loop(USM_QUADWORD, dest, size, wide_value, rScratch1,
2869 L_exit, _masm);
2870 }
2871 __ BIND(L_exit);
2872
2873 __ leave(); // required for proper stackwalking of RuntimeStub frame
2874 __ ret(0);
2875
2876 __ BIND(L_fillDwords);
2877
2878 // Fill DWORDs
2879 {
2880 UnsafeMemoryAccessMark umam(this, true, true);
2881
2882 // At this point, we know the lower 2 bits of size are zero and a
2883 // multiple of 4
2884 do_setmemory_atomic_loop(USM_DWORD, dest, size, wide_value, rScratch1,
2885 L_exit, _masm);
2886 }
2887 __ jmpb(L_exit);
2888
2889 __ BIND(L_fillBytes);
2890 // Set up for tail call to previously generated byte fill routine
2891 // Parameter order is (ptr, byteVal, size)
2892 __ xchgq(c_rarg1, c_rarg2);
2893 __ leave(); // Clear effect of enter()
2894 __ jump(RuntimeAddress(unsafe_byte_fill));
2895 }
2896
2897 return start;
2898 }
2899
2900 // Perform range checks on the proposed arraycopy.
2901 // Kills temp, but nothing else.
2902 // Also, clean the sign bits of src_pos and dst_pos.
2903 void StubGenerator::arraycopy_range_checks(Register src, // source array oop (c_rarg0)
2904 Register src_pos, // source position (c_rarg1)
2905 Register dst, // destination array oo (c_rarg2)
2906 Register dst_pos, // destination position (c_rarg3)
2907 Register length,
2908 Register temp,
2909 Label& L_failed) {
2910 BLOCK_COMMENT("arraycopy_range_checks:");
2911
2912 // if (src_pos + length > arrayOop(src)->length()) FAIL;
2913 __ movl(temp, length);
2914 __ addl(temp, src_pos); // src_pos + length
2915 __ cmpl(temp, Address(src, arrayOopDesc::length_offset_in_bytes()));
2916 __ jcc(Assembler::above, L_failed);
2917
2918 // if (dst_pos + length > arrayOop(dst)->length()) FAIL;
2919 __ movl(temp, length);
2920 __ addl(temp, dst_pos); // dst_pos + length
2921 __ cmpl(temp, Address(dst, arrayOopDesc::length_offset_in_bytes()));
2922 __ jcc(Assembler::above, L_failed);
2923
2924 // Have to clean up high 32-bits of 'src_pos' and 'dst_pos'.
2925 // Move with sign extension can be used since they are positive.
2926 __ movslq(src_pos, src_pos);
2927 __ movslq(dst_pos, dst_pos);
2928
2929 BLOCK_COMMENT("arraycopy_range_checks done");
2930 }
2931
2932
2933 // Generate generic array copy stubs
2934 //
2935 // Input:
2936 // c_rarg0 - src oop
2937 // c_rarg1 - src_pos (32-bits)
2938 // c_rarg2 - dst oop
2939 // c_rarg3 - dst_pos (32-bits)
2940 // not Win64
2941 // c_rarg4 - element count (32-bits)
2942 // Win64
2943 // rsp+40 - element count (32-bits)
2944 //
2945 // Output:
2946 // rax == 0 - success
2947 // rax == -1^K - failure, where K is partial transfer count
2948 //
address StubGenerator::generate_generic_copy(address byte_copy_entry, address short_copy_entry,
                                             address int_copy_entry, address oop_copy_entry,
                                             address long_copy_entry, address checkcast_copy_entry) {

  Label L_failed, L_failed_0, L_objArray;
  Label L_copy_shorts, L_copy_ints, L_copy_longs;

  // Input registers
  const Register src      = c_rarg0;  // source array oop
  const Register src_pos  = c_rarg1;  // source position
  const Register dst      = c_rarg2;  // destination array oop
  const Register dst_pos  = c_rarg3;  // destination position
#ifndef _WIN64
  const Register length   = c_rarg4;
  const Register rklass_tmp = r9;  // load_klass
#else
  const Address length(rsp, 7 * wordSize); // elements count is on stack on Win64
  const Register rklass_tmp = rdi;  // load_klass
#endif

  // Pad so that the 5-byte jmp(L_failed) emitted next ends exactly on
  // an alignment boundary; the boundary itself then becomes the
  // (aligned) stub entry point.
  { int modulus = CodeEntryAlignment;
    int target = modulus - 5; // 5 = sizeof jmp(L_failed)
    int advance = target - (__ offset() % modulus);
    if (advance < 0) advance += modulus;
    if (advance > 0) __ nop(advance);
  }
  StubId stub_id = StubId::stubgen_generic_arraycopy_id;
  StubCodeMark mark(this, stub_id);

  // Short-hop target to L_failed.  Makes for denser prologue code.
  __ BIND(L_failed_0);
  __ jmp(L_failed);
  assert(__ offset() % CodeEntryAlignment == 0, "no further alignment needed");

  __ align(CodeEntryAlignment);
  address start = __ pc();

  __ enter(); // required for proper stackwalking of RuntimeStub frame

#ifdef _WIN64
  __ push_ppx(rklass_tmp); // rdi is callee-save on Windows
#endif

  // bump this on entry, not on exit:
  INC_COUNTER_NP(SharedRuntime::_generic_array_copy_ctr, rscratch1);

  //-----------------------------------------------------------------------
  // Assembler stub will be used for this call to arraycopy
  // if the following conditions are met:
  //
  // (1) src and dst must not be null.
  // (2) src_pos must not be negative.
  // (3) dst_pos must not be negative.
  // (4) length  must not be negative.
  // (5) src klass and dst klass should be the same and not null.
  // (6) src and dst should be arrays.
  // (7) src_pos + length must not exceed length of src.
  // (8) dst_pos + length must not exceed length of dst.
  //

  // if (src == nullptr) return -1;
  __ testptr(src, src);         // src oop
  size_t j1off = __ offset();
  __ jccb(Assembler::zero, L_failed_0);

  // if (src_pos < 0) return -1;
  __ testl(src_pos, src_pos); // src_pos (32-bits)
  __ jccb(Assembler::negative, L_failed_0);

  // if (dst == nullptr) return -1;
  __ testptr(dst, dst);         // dst oop
  __ jccb(Assembler::zero, L_failed_0);

  // if (dst_pos < 0) return -1;
  __ testl(dst_pos, dst_pos); // dst_pos (32-bits)
  size_t j4off = __ offset();
  __ jccb(Assembler::negative, L_failed_0);

  // The first four tests are very dense code,
  // but not quite dense enough to put four
  // jumps in a 16-byte instruction fetch buffer.
  // That's good, because some branch predicters
  // do not like jumps so close together.
  // Make sure of this.
  guarantee(((j1off ^ j4off) & ~15) != 0, "I$ line of 1st & 4th jumps");

  // registers used as temp
  const Register r11_length    = r11; // elements count to copy
  const Register r10_src_klass = r10; // array klass

  // if (length < 0) return -1;
  __ movl(r11_length, length);      // length (elements count, 32-bits value)
  __ testl(r11_length, r11_length);
  __ jccb(Assembler::negative, L_failed_0);

  __ load_klass(r10_src_klass, src, rklass_tmp);
#ifdef ASSERT
  //  assert(src->klass() != nullptr);
  {
    BLOCK_COMMENT("assert klasses not null {");
    Label L1, L2;
    __ testptr(r10_src_klass, r10_src_klass);
    __ jcc(Assembler::notZero, L2);   // it is broken if klass is null
    __ bind(L1);
    __ stop("broken null klass");
    __ bind(L2);
    __ load_klass(rax, dst, rklass_tmp);
    __ cmpq(rax, 0);
    __ jcc(Assembler::equal, L1);     // this would be broken also
    BLOCK_COMMENT("} assert klasses not null done");
  }
#endif

  // Load layout helper (32-bits)
  //
  //  |array_tag|     | header_size | element_type |     |log2_element_size|
  // 32        30    24            16              8     2                 0
  //
  //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
  //

  const int lh_offset = in_bytes(Klass::layout_helper_offset());

  // Handle objArrays completely differently...
  const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
  __ cmpl(Address(r10_src_klass, lh_offset), objArray_lh);
  __ jcc(Assembler::equal, L_objArray);

  // if (src->klass() != dst->klass()) return -1;
  __ load_klass(rax, dst, rklass_tmp);
  __ cmpq(r10_src_klass, rax);
  __ jcc(Assembler::notEqual, L_failed);

  // Check for flat inline type array -> return -1
  __ test_flat_array_oop(src, rax, L_failed);

  // Check for null-free (non-flat) inline type array -> handle as object array
  __ test_null_free_array_oop(src, rax, L_objArray);

  const Register rax_lh = rax;  // layout helper
  __ movl(rax_lh, Address(r10_src_klass, lh_offset));

  // Check for flat inline type array -> return -1
  __ testl(rax_lh, Klass::_lh_array_tag_flat_value_bit_inplace);
  __ jcc(Assembler::notZero, L_failed);

  // if (!src->is_Array()) return -1;
  // (non-array layout helpers are >= _lh_neutral_value)
  __ cmpl(rax_lh, Klass::_lh_neutral_value);
  __ jcc(Assembler::greaterEqual, L_failed);

  // At this point, it is known to be a typeArray (array_tag 0x3).
#ifdef ASSERT
  {
    BLOCK_COMMENT("assert primitive array {");
    Label L;
    __ movl(rklass_tmp, rax_lh);
    __ sarl(rklass_tmp, Klass::_lh_array_tag_shift);
    __ cmpl(rklass_tmp, Klass::_lh_array_tag_type_value);
    __ jcc(Assembler::equal, L);
    __ stop("must be a primitive array");
    __ bind(L);
    BLOCK_COMMENT("} assert primitive array done");
  }
#endif

  arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
                         r10, L_failed);

  // TypeArrayKlass
  //
  // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
  // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
  //

  const Register r10_offset = r10;    // array offset
  const Register rax_elsize = rax_lh; // element size

  // Decode the header size from the layout helper and bias both oops
  // past their headers.
  __ movl(r10_offset, rax_lh);
  __ shrl(r10_offset, Klass::_lh_header_size_shift);
  __ andptr(r10_offset, Klass::_lh_header_size_mask);   // array_offset
  __ addptr(src, r10_offset);           // src array offset
  __ addptr(dst, r10_offset);           // dst array offset
  BLOCK_COMMENT("choose copy loop based on element size");
  __ andl(rax_lh, Klass::_lh_log2_element_size_mask); // rax_lh -> rax_elsize

#ifdef _WIN64
  __ pop_ppx(rklass_tmp); // Restore callee-save rdi
#endif

  // next registers should be set before the jump to corresponding stub
  const Register from     = c_rarg0;  // source array address
  const Register to       = c_rarg1;  // destination array address
  const Register count    = c_rarg2;  // elements count

  // 'from', 'to', 'count' registers should be set in such order
  // since they are the same as 'src', 'src_pos', 'dst'.

  // Dispatch on log2(element size): 0 -> byte, 1 -> short, 2 -> int,
  // 3 -> long.  Each arm computes the element addresses and tail-calls
  // the corresponding copy stub.
  __ cmpl(rax_elsize, 0);
  __ jccb(Assembler::notEqual, L_copy_shorts);
  __ lea(from, Address(src, src_pos, Address::times_1, 0));// src_addr
  __ lea(to,   Address(dst, dst_pos, Address::times_1, 0));// dst_addr
  __ movl2ptr(count, r11_length); // length
  __ jump(RuntimeAddress(byte_copy_entry));

__ BIND(L_copy_shorts);
  __ cmpl(rax_elsize, LogBytesPerShort);
  __ jccb(Assembler::notEqual, L_copy_ints);
  __ lea(from, Address(src, src_pos, Address::times_2, 0));// src_addr
  __ lea(to,   Address(dst, dst_pos, Address::times_2, 0));// dst_addr
  __ movl2ptr(count, r11_length); // length
  __ jump(RuntimeAddress(short_copy_entry));

__ BIND(L_copy_ints);
  __ cmpl(rax_elsize, LogBytesPerInt);
  __ jccb(Assembler::notEqual, L_copy_longs);
  __ lea(from, Address(src, src_pos, Address::times_4, 0));// src_addr
  __ lea(to,   Address(dst, dst_pos, Address::times_4, 0));// dst_addr
  __ movl2ptr(count, r11_length); // length
  __ jump(RuntimeAddress(int_copy_entry));

__ BIND(L_copy_longs);
#ifdef ASSERT
  {
    BLOCK_COMMENT("assert long copy {");
    Label L;
    __ cmpl(rax_elsize, LogBytesPerLong);
    __ jcc(Assembler::equal, L);
    __ stop("must be long copy, but elsize is wrong");
    __ bind(L);
    BLOCK_COMMENT("} assert long copy done");
  }
#endif
  __ lea(from, Address(src, src_pos, Address::times_8, 0));// src_addr
  __ lea(to,   Address(dst, dst_pos, Address::times_8, 0));// dst_addr
  __ movl2ptr(count, r11_length); // length
  __ jump(RuntimeAddress(long_copy_entry));

  // ObjArrayKlass
__ BIND(L_objArray);
  // live at this point:  r10_src_klass, r11_length, src[_pos], dst[_pos]

  Label L_plain_copy, L_checkcast_copy;
  //  test array classes for subtyping
  __ load_klass(rax, dst, rklass_tmp);
  __ cmpq(r10_src_klass, rax); // usual case is exact equality
  __ jcc(Assembler::notEqual, L_checkcast_copy);

  // Identically typed arrays can be copied without element-wise checks.
  arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
                         r10, L_failed);

  __ lea(from, Address(src, src_pos, TIMES_OOP,
               arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // src_addr
  __ lea(to,   Address(dst, dst_pos, TIMES_OOP,
               arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // dst_addr
  __ movl2ptr(count, r11_length); // length
__ BIND(L_plain_copy);
#ifdef _WIN64
  __ pop_ppx(rklass_tmp); // Restore callee-save rdi
#endif
  __ jump(RuntimeAddress(oop_copy_entry));

__ BIND(L_checkcast_copy);
  // live at this point:  r10_src_klass, r11_length, rax (dst_klass)
  {
    // Before looking at dst.length, make sure dst is also an objArray.
    // This check also fails for flat arrays which are not supported.
    __ cmpl(Address(rax, lh_offset), objArray_lh);
    __ jcc(Assembler::notEqual, L_failed);

#ifdef ASSERT
    {
      BLOCK_COMMENT("assert not null-free array {");
      Label L;
      __ test_non_null_free_array_oop(dst, rklass_tmp, L);
      __ stop("unexpected null-free array");
      __ bind(L);
      BLOCK_COMMENT("} assert not null-free array");
    }
#endif

    // It is safe to examine both src.length and dst.length.
    arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
                           rax, L_failed);

    const Register r11_dst_klass = r11;
    __ load_klass(r11_dst_klass, dst, rklass_tmp); // reload

    // Marshal the base address arguments now, freeing registers.
    __ lea(from, Address(src, src_pos, TIMES_OOP,
                 arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
    __ lea(to,   Address(dst, dst_pos, TIMES_OOP,
                 arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
    __ movl(count, length);           // length (reloaded)
    Register sco_temp = c_rarg3;      // this register is free now
    assert_different_registers(from, to, count, sco_temp,
                               r11_dst_klass, r10_src_klass);
    assert_clean_int(count, sco_temp);

    // Generate the type check.
    // If src is a subtype of dst, a plain (unchecked) oop copy suffices.
    const int sco_offset = in_bytes(Klass::super_check_offset_offset());
    __ movl(sco_temp, Address(r11_dst_klass, sco_offset));
    assert_clean_int(sco_temp, rax);
    generate_type_check(r10_src_klass, sco_temp, r11_dst_klass, L_plain_copy);

    // Fetch destination element klass from the ObjArrayKlass header.
    int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
    __ movptr(r11_dst_klass, Address(r11_dst_klass, ek_offset));
    __ movl(  sco_temp,      Address(r11_dst_klass, sco_offset));
    assert_clean_int(sco_temp, rax);

#ifdef _WIN64
    __ pop_ppx(rklass_tmp); // Restore callee-save rdi
#endif

    // the checkcast_copy loop needs two extra arguments:
    assert(c_rarg3 == sco_temp, "#3 already in place");
    // Set up arguments for checkcast_copy_entry.
    setup_arg_regs_using_thread(4);
    __ movptr(r8, r11_dst_klass);  // dst.klass.element_klass, r8 is c_rarg4 on Linux/Solaris
    __ jump(RuntimeAddress(checkcast_copy_entry));
  }

__ BIND(L_failed);
#ifdef _WIN64
  __ pop_ppx(rklass_tmp); // Restore callee-save rdi
#endif
  // Return -1 to signal that the caller must finish the copy itself.
  __ xorptr(rax, rax);
  __ notptr(rax); // return -1
  __ leave();   // required for proper stackwalking of RuntimeStub frame
  __ ret(0);

  return start;
}
3283
3284 #undef __