1 /*
2 * Copyright (c) 2003, 2026, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.
8 *
9 * This code is distributed in the hope that it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12 * version 2 for more details (a copy is included in the LICENSE file that
13 * accompanied this code).
14 *
15 * You should have received a copy of the GNU General Public License version
16 * 2 along with this work; if not, write to the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18 *
19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20 * or visit www.oracle.com if you need additional information or have any
21 * questions.
22 *
23 */
24
25 #include "asm/macroAssembler.hpp"
26 #include "gc/shared/barrierSet.hpp"
27 #include "gc/shared/barrierSetAssembler.hpp"
28 #include "oops/objArrayKlass.hpp"
29 #include "runtime/sharedRuntime.hpp"
30 #include "runtime/stubRoutines.hpp"
31 #include "stubGenerator_x86_64.hpp"
32 #ifdef COMPILER2
33 #include "opto/c2_globals.hpp"
34 #endif
35
// Assembler shorthand: "__ insn(...)" emits through the MacroAssembler
// pointer named _masm that is in scope at the point of use.
#define __ _masm->

// Address scale factor chosen by UseCompressedOops: times_4 when it is set,
// times_8 otherwise.
#define TIMES_OOP (UseCompressedOops ? Address::times_4 : Address::times_8)

// BLOCK_COMMENT(str) attaches a block comment to the generated code in
// non-PRODUCT builds and expands to nothing in PRODUCT builds.
#ifndef PRODUCT
#define BLOCK_COMMENT(str) __ block_comment(str)
#else
#define BLOCK_COMMENT(str) /* nothing */
#endif // !PRODUCT

// Written as "__ BIND(label);": binds the label and then emits a block
// comment holding the label's name. The expansion is two statements, so it
// must not be used as the unbraced body of an if/else or loop.
#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
47
48 #ifdef PRODUCT
49 #define INC_COUNTER_NP(counter, rscratch) ((void)0)
50 #else
51 #define INC_COUNTER_NP(counter, rscratch) \
52 BLOCK_COMMENT("inc_counter " #counter); \
53 inc_counter_np(_masm, counter, rscratch);
54
55 static void inc_counter_np(MacroAssembler* _masm, uint& counter, Register rscratch) {
56 __ incrementl(ExternalAddress((address)&counter), rscratch);
57 }
58
59 #ifdef COMPILER2
60 static uint& get_profile_ctr(int shift) {
61 if (shift == 0) {
62 return SharedRuntime::_jbyte_array_copy_ctr;
63 } else if (shift == 1) {
64 return SharedRuntime::_jshort_array_copy_ctr;
65 } else if (shift == 2) {
66 return SharedRuntime::_jint_array_copy_ctr;
67 } else {
68 assert(shift == 3, "");
69 return SharedRuntime::_jlong_array_copy_ctr;
70 }
71 }
72 #endif // COMPILER2
73 #endif // !PRODUCT
74
75 void StubGenerator::generate_arraycopy_stubs() {
76 // Some copy stubs publish a normal entry and then a 2nd 'fallback'
77 // entry immediately following their stack push. This can be used
78 // as a post-push branch target for compatible stubs when they
79 // identify a special case that can be handled by the fallback
80 // stub e.g a disjoint copy stub may be use as a special case
81 // fallback for its compatible conjoint copy stub.
82 //
83 // A no push entry is always returned in the following local and
84 // then published by assigning to the appropriate entry field in
85 // class StubRoutines. The entry value is then passed to the
86 // generator for the compatible stub. That means the entry must be
87 // listed when saving to/restoring from the AOT cache, ensuring
88 // that the inter-stub jumps are noted at AOT-cache save and
89 // relocated at AOT cache load.
90 address nopush_entry;
91
92 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(&nopush_entry);
93 // disjoint nopush entry is needed by conjoint copy
94 StubRoutines::_jbyte_disjoint_arraycopy_nopush = nopush_entry;
95 StubRoutines::_jbyte_arraycopy = generate_conjoint_byte_copy(StubRoutines::_jbyte_disjoint_arraycopy_nopush, &nopush_entry);
96 // conjoint nopush entry is needed by generic/unsafe copy
97 StubRoutines::_jbyte_arraycopy_nopush = nopush_entry;
98
99 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(&nopush_entry);
100 // disjoint nopush entry is needed by conjoint copy
101 StubRoutines::_jshort_disjoint_arraycopy_nopush = nopush_entry;
102 StubRoutines::_jshort_arraycopy = generate_conjoint_short_copy(StubRoutines::_jshort_disjoint_arraycopy_nopush, &nopush_entry);
103 // conjoint nopush entry is needed by generic/unsafe copy
104 StubRoutines::_jshort_arraycopy_nopush = nopush_entry;
105
106 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_int_oop_copy(StubId::stubgen_jint_disjoint_arraycopy_id, &nopush_entry);
107 // disjoint nopush entry is needed by conjoint copy
108 StubRoutines::_jint_disjoint_arraycopy_nopush = nopush_entry;
109 StubRoutines::_jint_arraycopy = generate_conjoint_int_oop_copy(StubId::stubgen_jint_arraycopy_id, StubRoutines::_jint_disjoint_arraycopy_nopush, &nopush_entry);
110 // conjoint nopush entry is needed by generic/unsafe copy
111 StubRoutines::_jint_arraycopy_nopush = nopush_entry;
112
113 StubRoutines::_jlong_disjoint_arraycopy = generate_disjoint_long_oop_copy(StubId::stubgen_jlong_disjoint_arraycopy_id, &nopush_entry);
114 // disjoint nopush entry is needed by conjoint copy
115 StubRoutines::_jlong_disjoint_arraycopy_nopush = nopush_entry;
116 StubRoutines::_jlong_arraycopy = generate_conjoint_long_oop_copy(StubId::stubgen_jlong_arraycopy_id, StubRoutines::_jlong_disjoint_arraycopy_nopush, &nopush_entry);
117 // conjoint nopush entry is needed by generic/unsafe copy
118 StubRoutines::_jlong_arraycopy_nopush = nopush_entry;
119
120 if (UseCompressedOops) {
121 StubRoutines::_oop_disjoint_arraycopy = generate_disjoint_int_oop_copy(StubId::stubgen_oop_disjoint_arraycopy_id, &nopush_entry);
122 // disjoint nopush entry is needed by conjoint copy
123 StubRoutines::_oop_disjoint_arraycopy_nopush = nopush_entry;
124 StubRoutines::_oop_arraycopy = generate_conjoint_int_oop_copy(StubId::stubgen_oop_arraycopy_id, StubRoutines::_oop_disjoint_arraycopy_nopush, &nopush_entry);
125 // conjoint nopush entry is needed by generic/unsafe copy
126 StubRoutines::_oop_arraycopy_nopush = nopush_entry;
127 StubRoutines::_oop_disjoint_arraycopy_uninit = generate_disjoint_int_oop_copy(StubId::stubgen_oop_disjoint_arraycopy_uninit_id, &nopush_entry);
128 // disjoint nopush entry is needed by conjoint copy
129 StubRoutines::_oop_disjoint_arraycopy_uninit_nopush = nopush_entry;
130 // note that we don't need a returned nopush entry because the
131 // generic/unsafe copy does not cater for uninit arrays.
132 StubRoutines::_oop_arraycopy_uninit = generate_conjoint_int_oop_copy(StubId::stubgen_oop_arraycopy_uninit_id, StubRoutines::_oop_disjoint_arraycopy_uninit_nopush, nullptr);
133 } else {
134 StubRoutines::_oop_disjoint_arraycopy = generate_disjoint_long_oop_copy(StubId::stubgen_oop_disjoint_arraycopy_id, &nopush_entry);
135 // disjoint nopush entry is needed by conjoint copy
136 StubRoutines::_oop_disjoint_arraycopy_nopush = nopush_entry;
137 StubRoutines::_oop_arraycopy = generate_conjoint_long_oop_copy(StubId::stubgen_oop_arraycopy_id, StubRoutines::_oop_disjoint_arraycopy_nopush, &nopush_entry);
138 // conjoint nopush entry is needed by generic/unsafe copy
139 StubRoutines::_oop_arraycopy_nopush = nopush_entry;
140 StubRoutines::_oop_disjoint_arraycopy_uninit = generate_disjoint_long_oop_copy(StubId::stubgen_oop_disjoint_arraycopy_uninit_id, &nopush_entry);
141 // disjoint nopush entry is needed by conjoint copy
142 StubRoutines::_oop_disjoint_arraycopy_uninit_nopush = nopush_entry;
143 // note that we don't need a returned nopush entry because the
144 // generic/unsafe copy does not cater for uninit arrays.
145 StubRoutines::_oop_arraycopy_uninit = generate_conjoint_long_oop_copy(StubId::stubgen_oop_arraycopy_uninit_id, StubRoutines::_oop_disjoint_arraycopy_uninit_nopush, nullptr);
146 }
147
148 StubRoutines::_checkcast_arraycopy = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_id, &nopush_entry);
149 // checkcast nopush entry is needed by generic copy
150 StubRoutines::_checkcast_arraycopy_nopush = nopush_entry;
151 // note that we don't need a returned nopush entry because the
152 // generic copy does not cater for uninit arrays.
153 StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_uninit_id, nullptr);
154
155 StubRoutines::_unsafe_arraycopy = generate_unsafe_copy(StubRoutines::_jbyte_arraycopy_nopush,
156 StubRoutines::_jshort_arraycopy_nopush,
157 StubRoutines::_jint_arraycopy_nopush,
158 StubRoutines::_jlong_arraycopy_nopush);
159 StubRoutines::_generic_arraycopy = generate_generic_copy(StubRoutines::_jbyte_arraycopy_nopush,
160 StubRoutines::_jshort_arraycopy_nopush,
161 StubRoutines::_jint_arraycopy_nopush,
162 StubRoutines::_oop_arraycopy_nopush,
163 StubRoutines::_jlong_arraycopy_nopush,
164 StubRoutines::_checkcast_arraycopy_nopush);
165
166 StubRoutines::_jbyte_fill = generate_fill(StubId::stubgen_jbyte_fill_id);
167 StubRoutines::_jshort_fill = generate_fill(StubId::stubgen_jshort_fill_id);
168 StubRoutines::_jint_fill = generate_fill(StubId::stubgen_jint_fill_id);
169 StubRoutines::_arrayof_jbyte_fill = generate_fill(StubId::stubgen_arrayof_jbyte_fill_id);
170 StubRoutines::_arrayof_jshort_fill = generate_fill(StubId::stubgen_arrayof_jshort_fill_id);
171 StubRoutines::_arrayof_jint_fill = generate_fill(StubId::stubgen_arrayof_jint_fill_id);
172
173 StubRoutines::_unsafe_setmemory = generate_unsafe_setmemory(StubRoutines::_jbyte_fill);
174
175 // We don't generate specialized code for HeapWord-aligned source
176 // arrays, so just use the code we've already generated
177 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = StubRoutines::_jbyte_disjoint_arraycopy;
178 StubRoutines::_arrayof_jbyte_arraycopy = StubRoutines::_jbyte_arraycopy;
179
180 StubRoutines::_arrayof_jshort_disjoint_arraycopy = StubRoutines::_jshort_disjoint_arraycopy;
181 StubRoutines::_arrayof_jshort_arraycopy = StubRoutines::_jshort_arraycopy;
182
183 StubRoutines::_arrayof_jint_disjoint_arraycopy = StubRoutines::_jint_disjoint_arraycopy;
184 StubRoutines::_arrayof_jint_arraycopy = StubRoutines::_jint_arraycopy;
185
186 StubRoutines::_arrayof_jlong_disjoint_arraycopy = StubRoutines::_jlong_disjoint_arraycopy;
187 StubRoutines::_arrayof_jlong_arraycopy = StubRoutines::_jlong_arraycopy;
188
189 StubRoutines::_arrayof_oop_disjoint_arraycopy = StubRoutines::_oop_disjoint_arraycopy;
190 StubRoutines::_arrayof_oop_arraycopy = StubRoutines::_oop_arraycopy;
191
192 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit = StubRoutines::_oop_disjoint_arraycopy_uninit;
193 StubRoutines::_arrayof_oop_arraycopy_uninit = StubRoutines::_oop_arraycopy_uninit;
194 }
195
196
197 // Verify that a register contains clean 32-bits positive value
198 // (high 32-bits are 0) so it could be used in 64-bits shifts.
199 //
200 // Input:
201 // Rint - 32-bits value
202 // Rtmp - scratch
203 //
204 void StubGenerator::assert_clean_int(Register Rint, Register Rtmp) {
205 #ifdef ASSERT
206 Label L;
207 assert_different_registers(Rtmp, Rint);
208 __ movslq(Rtmp, Rint);
209 __ cmpq(Rtmp, Rint);
210 __ jcc(Assembler::equal, L);
211 __ stop("high 32-bits of int value are not 0");
212 __ bind(L);
213 #endif
214 }
215
216
217 // Generate overlap test for array copy stubs
218 //
219 // Input:
220 // c_rarg0 - from
221 // c_rarg1 - to
222 // c_rarg2 - element count
223 //
224 // Output:
225 // rax - &from[element count - 1]
226 //
227 void StubGenerator::array_overlap_test(address no_overlap_target, Label* NOLp, Address::ScaleFactor sf) {
228 const Register from = c_rarg0;
229 const Register to = c_rarg1;
230 const Register count = c_rarg2;
231 const Register end_from = rax;
232
233 __ cmpptr(to, from);
234 __ lea(end_from, Address(from, count, sf, 0));
235 if (NOLp == nullptr) {
236 RuntimeAddress no_overlap(no_overlap_target);
237 __ jump_cc(Assembler::belowEqual, no_overlap);
238 __ cmpptr(to, end_from);
239 __ jump_cc(Assembler::aboveEqual, no_overlap);
240 } else {
241 __ jcc(Assembler::belowEqual, (*NOLp));
242 __ cmpptr(to, end_from);
243 __ jcc(Assembler::aboveEqual, (*NOLp));
244 }
245 }
246
247
248 // Copy big chunks forward
249 //
250 // Inputs:
251 // end_from - source arrays end address
252 // end_to - destination array end address
253 // qword_count - 64-bits element count, negative
254 // tmp1 - scratch
255 // L_copy_bytes - entry label
256 // L_copy_8_bytes - exit label
257 //
258 void StubGenerator::copy_bytes_forward(Register end_from, Register end_to,
259 Register qword_count, Register tmp1,
260 Register tmp2, Label& L_copy_bytes,
261 Label& L_copy_8_bytes, DecoratorSet decorators,
262 BasicType type) {
263 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
264 DEBUG_ONLY(__ stop("enter at entry label, not here"));
265 Label L_loop;
266 __ align(OptoLoopAlignment);
267 if (UseUnalignedLoadStores) {
268 Label L_end;
269 __ BIND(L_loop);
270 if (UseAVX >= 2) {
271 bs->copy_load_at(_masm, decorators, type, 32,
272 xmm0, Address(end_from, qword_count, Address::times_8, -56),
273 tmp1, xmm1);
274 bs->copy_store_at(_masm, decorators, type, 32,
275 Address(end_to, qword_count, Address::times_8, -56), xmm0,
276 tmp1, tmp2, xmm1);
277
278 bs->copy_load_at(_masm, decorators, type, 32,
279 xmm0, Address(end_from, qword_count, Address::times_8, -24),
280 tmp1, xmm1);
281 bs->copy_store_at(_masm, decorators, type, 32,
282 Address(end_to, qword_count, Address::times_8, -24), xmm0,
283 tmp1, tmp2, xmm1);
284 } else {
285 bs->copy_load_at(_masm, decorators, type, 16,
286 xmm0, Address(end_from, qword_count, Address::times_8, -56),
287 tmp1, xmm1);
288 bs->copy_store_at(_masm, decorators, type, 16,
289 Address(end_to, qword_count, Address::times_8, -56), xmm0,
290 tmp1, tmp2, xmm1);
291 bs->copy_load_at(_masm, decorators, type, 16,
292 xmm0, Address(end_from, qword_count, Address::times_8, -40),
293 tmp1, xmm1);
294 bs->copy_store_at(_masm, decorators, type, 16,
295 Address(end_to, qword_count, Address::times_8, -40), xmm0,
296 tmp1, tmp2, xmm1);
297 bs->copy_load_at(_masm, decorators, type, 16,
298 xmm0, Address(end_from, qword_count, Address::times_8, -24),
299 tmp1, xmm1);
300 bs->copy_store_at(_masm, decorators, type, 16,
301 Address(end_to, qword_count, Address::times_8, -24), xmm0,
302 tmp1, tmp2, xmm1);
303 bs->copy_load_at(_masm, decorators, type, 16,
304 xmm0, Address(end_from, qword_count, Address::times_8, -8),
305 tmp1, xmm1);
306 bs->copy_store_at(_masm, decorators, type, 16,
307 Address(end_to, qword_count, Address::times_8, -8), xmm0,
308 tmp1, tmp2, xmm1);
309 }
310
311 __ BIND(L_copy_bytes);
312 __ addptr(qword_count, 8);
313 __ jcc(Assembler::lessEqual, L_loop);
314 __ subptr(qword_count, 4); // sub(8) and add(4)
315 __ jcc(Assembler::greater, L_end);
316 // Copy trailing 32 bytes
317 if (UseAVX >= 2) {
318 bs->copy_load_at(_masm, decorators, type, 32,
319 xmm0, Address(end_from, qword_count, Address::times_8, -24),
320 tmp1, xmm1);
321 bs->copy_store_at(_masm, decorators, type, 32,
322 Address(end_to, qword_count, Address::times_8, -24), xmm0,
323 tmp1, tmp2, xmm1);
324 } else {
325 bs->copy_load_at(_masm, decorators, type, 16,
326 xmm0, Address(end_from, qword_count, Address::times_8, -24),
327 tmp1, xmm1);
328 bs->copy_store_at(_masm, decorators, type, 16,
329 Address(end_to, qword_count, Address::times_8, -24), xmm0,
330 tmp1, tmp2, xmm1);
331 bs->copy_load_at(_masm, decorators, type, 16,
332 xmm0, Address(end_from, qword_count, Address::times_8, -8),
333 tmp1, xmm1);
334 bs->copy_store_at(_masm, decorators, type, 16,
335 Address(end_to, qword_count, Address::times_8, -8), xmm0,
336 tmp1, tmp2, xmm1);
337 }
338 __ addptr(qword_count, 4);
339 __ BIND(L_end);
340 } else {
341 // Copy 32-bytes per iteration
342 __ BIND(L_loop);
343 bs->copy_load_at(_masm, decorators, type, 8,
344 tmp1, Address(end_from, qword_count, Address::times_8, -24),
345 tmp2);
346 bs->copy_store_at(_masm, decorators, type, 8,
347 Address(end_to, qword_count, Address::times_8, -24), tmp1,
348 tmp2);
349 bs->copy_load_at(_masm, decorators, type, 8,
350 tmp1, Address(end_from, qword_count, Address::times_8, -16),
351 tmp2);
352 bs->copy_store_at(_masm, decorators, type, 8,
353 Address(end_to, qword_count, Address::times_8, -16), tmp1,
354 tmp2);
355 bs->copy_load_at(_masm, decorators, type, 8,
356 tmp1, Address(end_from, qword_count, Address::times_8, -8),
357 tmp2);
358 bs->copy_store_at(_masm, decorators, type, 8,
359 Address(end_to, qword_count, Address::times_8, -8), tmp1,
360 tmp2);
361 bs->copy_load_at(_masm, decorators, type, 8,
362 tmp1, Address(end_from, qword_count, Address::times_8, 0),
363 tmp2);
364 bs->copy_store_at(_masm, decorators, type, 8,
365 Address(end_to, qword_count, Address::times_8, 0), tmp1,
366 tmp2);
367
368 __ BIND(L_copy_bytes);
369 __ addptr(qword_count, 4);
370 __ jcc(Assembler::lessEqual, L_loop);
371 }
372 __ subptr(qword_count, 4);
373 __ jcc(Assembler::less, L_copy_8_bytes); // Copy trailing qwords
374 }
375
376
377 // Copy big chunks backward
378 //
379 // Inputs:
380 // from - source arrays address
381 // dest - destination array address
382 // qword_count - 64-bits element count
383 // tmp1 - scratch
384 // L_copy_bytes - entry label
385 // L_copy_8_bytes - exit label
386 //
387 void StubGenerator::copy_bytes_backward(Register from, Register dest,
388 Register qword_count, Register tmp1,
389 Register tmp2, Label& L_copy_bytes,
390 Label& L_copy_8_bytes, DecoratorSet decorators,
391 BasicType type) {
392 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
393 DEBUG_ONLY(__ stop("enter at entry label, not here"));
394 Label L_loop;
395 __ align(OptoLoopAlignment);
396 if (UseUnalignedLoadStores) {
397 Label L_end;
398 __ BIND(L_loop);
399 if (UseAVX >= 2) {
400 bs->copy_load_at(_masm, decorators, type, 32,
401 xmm0, Address(from, qword_count, Address::times_8, 32),
402 tmp1, xmm1);
403 bs->copy_store_at(_masm, decorators, type, 32,
404 Address(dest, qword_count, Address::times_8, 32), xmm0,
405 tmp1, tmp2, xmm1);
406 bs->copy_load_at(_masm, decorators, type, 32,
407 xmm0, Address(from, qword_count, Address::times_8, 0),
408 tmp1, xmm1);
409 bs->copy_store_at(_masm, decorators, type, 32,
410 Address(dest, qword_count, Address::times_8, 0), xmm0,
411 tmp1, tmp2, xmm1);
412 } else {
413 bs->copy_load_at(_masm, decorators, type, 16,
414 xmm0, Address(from, qword_count, Address::times_8, 48),
415 tmp1, xmm1);
416 bs->copy_store_at(_masm, decorators, type, 16,
417 Address(dest, qword_count, Address::times_8, 48), xmm0,
418 tmp1, tmp2, xmm1);
419 bs->copy_load_at(_masm, decorators, type, 16,
420 xmm0, Address(from, qword_count, Address::times_8, 32),
421 tmp1, xmm1);
422 bs->copy_store_at(_masm, decorators, type, 16,
423 Address(dest, qword_count, Address::times_8, 32), xmm0,
424 tmp1, tmp2, xmm1);
425 bs->copy_load_at(_masm, decorators, type, 16,
426 xmm0, Address(from, qword_count, Address::times_8, 16),
427 tmp1, xmm1);
428 bs->copy_store_at(_masm, decorators, type, 16,
429 Address(dest, qword_count, Address::times_8, 16), xmm0,
430 tmp1, tmp2, xmm1);
431 bs->copy_load_at(_masm, decorators, type, 16,
432 xmm0, Address(from, qword_count, Address::times_8, 0),
433 tmp1, xmm1);
434 bs->copy_store_at(_masm, decorators, type, 16,
435 Address(dest, qword_count, Address::times_8, 0), xmm0,
436 tmp1, tmp2, xmm1);
437 }
438
439 __ BIND(L_copy_bytes);
440 __ subptr(qword_count, 8);
441 __ jcc(Assembler::greaterEqual, L_loop);
442
443 __ addptr(qword_count, 4); // add(8) and sub(4)
444 __ jcc(Assembler::less, L_end);
445 // Copy trailing 32 bytes
446 if (UseAVX >= 2) {
447 bs->copy_load_at(_masm, decorators, type, 32,
448 xmm0, Address(from, qword_count, Address::times_8, 0),
449 tmp1, xmm1);
450 bs->copy_store_at(_masm, decorators, type, 32,
451 Address(dest, qword_count, Address::times_8, 0), xmm0,
452 tmp1, tmp2, xmm1);
453 } else {
454 bs->copy_load_at(_masm, decorators, type, 16,
455 xmm0, Address(from, qword_count, Address::times_8, 16),
456 tmp1, xmm1);
457 bs->copy_store_at(_masm, decorators, type, 16,
458 Address(dest, qword_count, Address::times_8, 16), xmm0,
459 tmp1, tmp2, xmm1);
460 bs->copy_load_at(_masm, decorators, type, 16,
461 xmm0, Address(from, qword_count, Address::times_8, 0),
462 tmp1, xmm1);
463 bs->copy_store_at(_masm, decorators, type, 16,
464 Address(dest, qword_count, Address::times_8, 0), xmm0,
465 tmp1, tmp2, xmm1);
466 }
467 __ subptr(qword_count, 4);
468 __ BIND(L_end);
469 } else {
470 // Copy 32-bytes per iteration
471 __ BIND(L_loop);
472 bs->copy_load_at(_masm, decorators, type, 8,
473 tmp1, Address(from, qword_count, Address::times_8, 24),
474 tmp2);
475 bs->copy_store_at(_masm, decorators, type, 8,
476 Address(dest, qword_count, Address::times_8, 24), tmp1,
477 tmp2);
478 bs->copy_load_at(_masm, decorators, type, 8,
479 tmp1, Address(from, qword_count, Address::times_8, 16),
480 tmp2);
481 bs->copy_store_at(_masm, decorators, type, 8,
482 Address(dest, qword_count, Address::times_8, 16), tmp1,
483 tmp2);
484 bs->copy_load_at(_masm, decorators, type, 8,
485 tmp1, Address(from, qword_count, Address::times_8, 8),
486 tmp2);
487 bs->copy_store_at(_masm, decorators, type, 8,
488 Address(dest, qword_count, Address::times_8, 8), tmp1,
489 tmp2);
490 bs->copy_load_at(_masm, decorators, type, 8,
491 tmp1, Address(from, qword_count, Address::times_8, 0),
492 tmp2);
493 bs->copy_store_at(_masm, decorators, type, 8,
494 Address(dest, qword_count, Address::times_8, 0), tmp1,
495 tmp2);
496
497 __ BIND(L_copy_bytes);
498 __ subptr(qword_count, 4);
499 __ jcc(Assembler::greaterEqual, L_loop);
500 }
501 __ addptr(qword_count, 4);
502 __ jcc(Assembler::greater, L_copy_8_bytes); // Copy trailing qwords
503 }
504
505 #ifdef COMPILER2
506
507 // Note: Following rules apply to AVX3 optimized arraycopy stubs:-
508 // - If target supports AVX3 features (BW+VL+F) then implementation uses 32 byte vectors (YMMs)
509 // for both special cases (various small block sizes) and aligned copy loop. This is the
510 // default configuration.
511 // - If copy length is above CopyAVX3Threshold, then implementation use 64 byte vectors (ZMMs)
512 // for main copy loop (and subsequent tail) since bulk of the cycles will be consumed in it.
513 // - If user forces MaxVectorSize=32 then above 4096 bytes its seen that REP MOVs shows a
514 // better performance for disjoint copies. For conjoint/backward copy vector based
515 // copy performs better.
516 // - If user sets CopyAVX3Threshold=0, then special cases for small blocks sizes operate over
517 // 64 byte vector registers (ZMMs).
518
519 // Inputs:
520 // c_rarg0 - source array address
521 // c_rarg1 - destination array address
522 // c_rarg2 - element count, treated as ssize_t, can be zero
523 //
524 //
525 // Side Effects:
526 // disjoint_copy_avx3_masked is set to the no-overlap entry point
527 // used by generate_conjoint_[byte/int/short/long]_copy().
528 //
529 address StubGenerator::generate_disjoint_copy_avx3_masked(StubId stub_id, address* entry) {
530 // aligned is always false -- x86_64 always uses the unaligned code
531 const bool aligned = false;
532 int shift;
533 bool is_oop;
534 bool dest_uninitialized;
535
536 switch (stub_id) {
537 case StubId::stubgen_jbyte_disjoint_arraycopy_id:
538 shift = 0;
539 is_oop = false;
540 dest_uninitialized = false;
541 break;
542 case StubId::stubgen_jshort_disjoint_arraycopy_id:
543 shift = 1;
544 is_oop = false;
545 dest_uninitialized = false;
546 break;
547 case StubId::stubgen_jint_disjoint_arraycopy_id:
548 shift = 2;
549 is_oop = false;
550 dest_uninitialized = false;
551 break;
552 case StubId::stubgen_jlong_disjoint_arraycopy_id:
553 shift = 3;
554 is_oop = false;
555 dest_uninitialized = false;
556 break;
557 case StubId::stubgen_oop_disjoint_arraycopy_id:
558 shift = (UseCompressedOops ? 2 : 3);
559 is_oop = true;
560 dest_uninitialized = false;
561 break;
562 case StubId::stubgen_oop_disjoint_arraycopy_uninit_id:
563 shift = (UseCompressedOops ? 2 : 3);
564 is_oop = true;
565 dest_uninitialized = true;
566 break;
567 default:
568 ShouldNotReachHere();
569 }
570 GrowableArray<address> entries;
571 GrowableArray<address> extras;
572 bool add_handlers = !is_oop && !aligned;
573 bool add_relocs = UseZGC && is_oop;
574 bool add_extras = add_handlers || add_relocs;
575 // The stub employs one unsafe handler region by default but has two
576 // when MaxVectorSize == 64 So we may expect 0, 3 or 6 extras.
577 int handlers_count = (MaxVectorSize == 64 ? 2 : 1);
578 int expected_entry_count = (entry != nullptr ? 2 : 1);
579 int expected_extra_count = (add_handlers ? handlers_count : 0) * UnsafeMemoryAccess::COLUMN_COUNT; // 0/1/2 x UMAM {start,end,handler}
580 int entry_count = StubInfo::entry_count(stub_id);
581 assert(entry_count == expected_entry_count, "sanity check");
582 GrowableArray<address>* entries_ptr = (entry_count == 1 ? nullptr : &entries);
583 GrowableArray<address>* extras_ptr = (add_extras ? &extras : nullptr);
584 address start = load_archive_data(stub_id, entries_ptr, extras_ptr);
585 if (start != nullptr) {
586 assert(entries.length() == expected_entry_count - 1,
587 "unexpected entry count %d", entries.length());
588 assert(!add_handlers || extras.length() == expected_extra_count,
589 "unexpected handler addresses count %d", extras.length());
590 if (entry != nullptr) {
591 *entry = entries.at(0);
592 }
593 if (add_handlers) {
594 // restore 1/2 x UMAM {start,end,handler} addresses from extras
595 register_unsafe_access_handlers(extras, 0, handlers_count);
596 }
597 #if INCLUDE_ZGC
598 // register addresses at which ZGC does colour patching
599 if (add_relocs) {
600 register_reloc_addresses(extras, 0, extras.length());
601 }
602 #endif // INCLUDE_ZGC
603 return start;
604 }
605
606 __ align(CodeEntryAlignment);
607 StubCodeMark mark(this, stub_id);
608 start = __ pc();
609
610 bool use64byteVector = (MaxVectorSize > 32) && (CopyAVX3Threshold == 0);
611 const int large_threshold = 2621440; // 2.5 MB
612 Label L_main_loop, L_main_loop_64bytes, L_tail, L_tail64, L_exit, L_entry;
613 Label L_repmovs, L_main_pre_loop, L_main_pre_loop_64bytes, L_pre_main_post_64;
614 Label L_copy_large, L_finish;
615 const Register from = rdi; // source array address
616 const Register to = rsi; // destination array address
617 const Register count = rdx; // elements count
618 const Register temp1 = r8;
619 const Register temp2 = r11;
620 const Register temp3 = rax;
621 const Register temp4 = rcx;
622 // End pointers are inclusive, and if count is not zero they point
623 // to the last unit copied: end_to[0] := end_from[0]
624
625 __ enter(); // required for proper stackwalking of RuntimeStub frame
626 assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int.
627
628 if (entry != nullptr) {
629 *entry = __ pc();
630 entries.append(*entry);
631 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
632 BLOCK_COMMENT("Entry:");
633 }
634
635 BasicType type_vec[] = { T_BYTE, T_SHORT, T_INT, T_LONG};
636 BasicType type = is_oop ? T_OBJECT : type_vec[shift];
637
638 setup_argument_regs(type);
639
640 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
641 if (dest_uninitialized) {
642 decorators |= IS_DEST_UNINITIALIZED;
643 }
644 if (aligned) {
645 decorators |= ARRAYCOPY_ALIGNED;
646 }
647 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
648 bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
649
650 {
651 // Type(shift) byte(0), short(1), int(2), long(3)
652 int loop_size[] = { 192, 96, 48, 24};
653 int threshold[] = { 4096, 2048, 1024, 512};
654
655 // UnsafeMemoryAccess page error: continue after unsafe access
656 UnsafeMemoryAccessMark umam(this, add_handlers, true);
657 // 'from', 'to' and 'count' are now valid
658
659 // temp1 holds remaining count and temp4 holds running count used to compute
660 // next address offset for start of to/from addresses (temp4 * scale).
661 __ mov64(temp4, 0);
662 __ movq(temp1, count);
663
664 // Zero length check.
665 __ BIND(L_tail);
666 __ cmpq(temp1, 0);
667 __ jcc(Assembler::lessEqual, L_exit);
668
669 // Special cases using 32 byte [masked] vector copy operations.
670 arraycopy_avx3_special_cases(xmm1, k2, from, to, temp1, shift,
671 temp4, temp3, use64byteVector, L_entry, L_exit);
672
673 // PRE-MAIN-POST loop for aligned copy.
674 __ BIND(L_entry);
675
676 if (MaxVectorSize == 64) {
677 __ movq(temp2, temp1);
678 __ shlq(temp2, shift);
679 __ cmpq(temp2, large_threshold);
680 __ jcc(Assembler::greaterEqual, L_copy_large);
681 }
682 if (CopyAVX3Threshold != 0) {
683 __ cmpq(count, threshold[shift]);
684 if (MaxVectorSize == 64) {
685 // Copy using 64 byte vectors.
686 __ jcc(Assembler::greaterEqual, L_pre_main_post_64);
687 } else {
688 assert(MaxVectorSize < 64, "vector size should be < 64 bytes");
689 // REP MOVS offer a faster copy path.
690 __ jcc(Assembler::greaterEqual, L_repmovs);
691 }
692 }
693
694 if ((MaxVectorSize < 64) || (CopyAVX3Threshold != 0)) {
695 // Partial copy to make dst address 32 byte aligned.
696 __ movq(temp2, to);
697 __ andq(temp2, 31);
698 __ jcc(Assembler::equal, L_main_pre_loop);
699
700 __ negptr(temp2);
701 __ addq(temp2, 32);
702 if (shift) {
703 __ shrq(temp2, shift);
704 }
705 __ movq(temp3, temp2);
706 copy32_masked_avx(to, from, xmm1, k2, temp3, temp4, temp1, shift);
707 __ movq(temp4, temp2);
708 __ movq(temp1, count);
709 __ subq(temp1, temp2);
710
711 __ cmpq(temp1, loop_size[shift]);
712 __ jcc(Assembler::less, L_tail);
713
714 __ BIND(L_main_pre_loop);
715 __ subq(temp1, loop_size[shift]);
716
717 // Main loop with aligned copy block size of 192 bytes at 32 byte granularity.
718 __ align32();
719 __ BIND(L_main_loop);
720 copy64_avx(to, from, temp4, xmm1, false, shift, 0);
721 copy64_avx(to, from, temp4, xmm1, false, shift, 64);
722 copy64_avx(to, from, temp4, xmm1, false, shift, 128);
723 __ addptr(temp4, loop_size[shift]);
724 __ subq(temp1, loop_size[shift]);
725 __ jcc(Assembler::greater, L_main_loop);
726
727 __ addq(temp1, loop_size[shift]);
728
729 // Tail loop.
730 __ jmp(L_tail);
731
732 __ BIND(L_repmovs);
733 __ movq(temp2, temp1);
734 // Swap to(RSI) and from(RDI) addresses to comply with REP MOVs semantics.
735 __ movq(temp3, to);
736 __ movq(to, from);
737 __ movq(from, temp3);
738 // Save to/from for restoration post rep_mov.
739 __ movq(temp1, to);
740 __ movq(temp3, from);
741 if(shift < 3) {
742 __ shrq(temp2, 3-shift); // quad word count
743 }
744 __ movq(temp4 , temp2); // move quad ward count into temp4(RCX).
745 __ rep_mov();
746 __ shlq(temp2, 3); // convert quad words into byte count.
747 if(shift) {
748 __ shrq(temp2, shift); // type specific count.
749 }
750 // Restore original addresses in to/from.
751 __ movq(to, temp3);
752 __ movq(from, temp1);
753 __ movq(temp4, temp2);
754 __ movq(temp1, count);
755 __ subq(temp1, temp2); // tailing part (less than a quad ward size).
756 __ jmp(L_tail);
757 }
758
759 if (MaxVectorSize > 32) {
760 __ BIND(L_pre_main_post_64);
761 // Partial copy to make dst address 64 byte aligned.
762 __ movq(temp2, to);
763 __ andq(temp2, 63);
764 __ jcc(Assembler::equal, L_main_pre_loop_64bytes);
765
766 __ negptr(temp2);
767 __ addq(temp2, 64);
768 if (shift) {
769 __ shrq(temp2, shift);
770 }
771 __ movq(temp3, temp2);
772 copy64_masked_avx(to, from, xmm1, k2, temp3, temp4, temp1, shift, 0 , true);
773 __ movq(temp4, temp2);
774 __ movq(temp1, count);
775 __ subq(temp1, temp2);
776
777 __ cmpq(temp1, loop_size[shift]);
778 __ jcc(Assembler::less, L_tail64);
779
780 __ BIND(L_main_pre_loop_64bytes);
781 __ subq(temp1, loop_size[shift]);
782
783 // Main loop with aligned copy block size of 192 bytes at
784 // 64 byte copy granularity.
785 __ align32();
786 __ BIND(L_main_loop_64bytes);
787 copy64_avx(to, from, temp4, xmm1, false, shift, 0 , true);
788 copy64_avx(to, from, temp4, xmm1, false, shift, 64, true);
789 copy64_avx(to, from, temp4, xmm1, false, shift, 128, true);
790 __ addptr(temp4, loop_size[shift]);
791 __ subq(temp1, loop_size[shift]);
792 __ jcc(Assembler::greater, L_main_loop_64bytes);
793
794 __ addq(temp1, loop_size[shift]);
795 // Zero length check.
796 __ jcc(Assembler::lessEqual, L_exit);
797
798 __ BIND(L_tail64);
799
800 // Tail handling using 64 byte [masked] vector copy operations.
801 use64byteVector = true;
802 arraycopy_avx3_special_cases(xmm1, k2, from, to, temp1, shift,
803 temp4, temp3, use64byteVector, L_entry, L_exit);
804 }
805 __ BIND(L_exit);
806 }
807
808 __ BIND(L_finish);
809 address ucme_exit_pc = __ pc();
810 // When called from generic_arraycopy r11 contains specific values
811 // used during arraycopy epilogue, re-initializing r11.
812 if (is_oop) {
813 __ movq(r11, shift == 3 ? count : to);
814 }
815 bs->arraycopy_epilogue(_masm, decorators, type, from, to, count);
816 restore_argument_regs(type);
817 INC_COUNTER_NP(get_profile_ctr(shift), rscratch1); // Update counter after rscratch1 is free
818 __ xorptr(rax, rax); // return 0
819 __ vzeroupper();
820 __ leave(); // required for proper stackwalking of RuntimeStub frame
821 __ ret(0);
822
823 if (MaxVectorSize == 64) {
824 __ BIND(L_copy_large);
825 UnsafeMemoryAccessMark umam(this, add_handlers, false, ucme_exit_pc);
826 arraycopy_avx3_large(to, from, temp1, temp2, temp3, temp4, count, xmm1, xmm2, xmm3, xmm4, shift);
827 __ jmp(L_finish);
828 }
829 // retrieve the registered handler addresses
830 address end = __ pc();
831 if (add_handlers) {
832 retrieve_unsafe_access_handlers(start, end, extras);
833 }
834 assert(extras.length() == expected_extra_count,
835 "unexpected handler addresses count %d", extras.length());
836 #if INCLUDE_ZGC
837 // retrieve addresses at which ZGC does colour patching
838 if (add_relocs) {
839 retrieve_reloc_addresses(start, end, extras);
840 }
841 #endif // INCLUDE_ZGC
842
843 // record the stub entry and end plus the no_push entry and any
844 // extra handler addresses
845 store_archive_data(stub_id, start, end, entries_ptr, extras_ptr);
846
847 return start;
848 }
849
850 void StubGenerator::arraycopy_avx3_large(Register to, Register from, Register temp1, Register temp2,
851 Register temp3, Register temp4, Register count,
852 XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3,
853 XMMRegister xmm4, int shift) {
854
855 // Type(shift) byte(0), short(1), int(2), long(3)
856 int loop_size[] = { 256, 128, 64, 32};
857 int threshold[] = { 4096, 2048, 1024, 512};
858
859 Label L_main_loop_large;
860 Label L_tail_large;
861 Label L_exit_large;
862 Label L_entry_large;
863 Label L_main_pre_loop_large;
864 Label L_pre_main_post_large;
865
866 assert(MaxVectorSize == 64, "vector length != 64");
867 __ BIND(L_entry_large);
868
869 __ BIND(L_pre_main_post_large);
870 // Partial copy to make dst address 64 byte aligned.
871 __ movq(temp2, to);
872 __ andq(temp2, 63);
873 __ jcc(Assembler::equal, L_main_pre_loop_large);
874
875 __ negptr(temp2);
876 __ addq(temp2, 64);
877 if (shift) {
878 __ shrq(temp2, shift);
879 }
880 __ movq(temp3, temp2);
881 copy64_masked_avx(to, from, xmm1, k2, temp3, temp4, temp1, shift, 0, true);
882 __ movq(temp4, temp2);
883 __ movq(temp1, count);
884 __ subq(temp1, temp2);
885
886 __ cmpq(temp1, loop_size[shift]);
887 __ jcc(Assembler::less, L_tail_large);
888
889 __ BIND(L_main_pre_loop_large);
890 __ subq(temp1, loop_size[shift]);
891
892 // Main loop with aligned copy block size of 256 bytes at 64 byte copy granularity.
893 __ align32();
894 __ BIND(L_main_loop_large);
895 copy256_avx3(to, from, temp4, xmm1, xmm2, xmm3, xmm4, shift, 0);
896 __ addptr(temp4, loop_size[shift]);
897 __ subq(temp1, loop_size[shift]);
898 __ jcc(Assembler::greater, L_main_loop_large);
899 // fence needed because copy256_avx3 uses non-temporal stores
900 __ sfence();
901
902 __ addq(temp1, loop_size[shift]);
903 // Zero length check.
904 __ jcc(Assembler::lessEqual, L_exit_large);
905 __ BIND(L_tail_large);
906 // Tail handling using 64 byte [masked] vector copy operations.
907 __ cmpq(temp1, 0);
908 __ jcc(Assembler::lessEqual, L_exit_large);
909 arraycopy_avx3_special_cases_256(xmm1, k2, from, to, temp1, shift,
910 temp4, temp3, L_exit_large);
911 __ BIND(L_exit_large);
912 }
913
// Generates (or restores from the archive) a conjoint array copy stub that uses
// AVX3 masked vector operations. The generated code copies from the end of the
// arrays towards the start: the main loops index with the remaining element
// count and use negative displacements.
//
// Arguments:
//   stub_id          - selects the element size (shift) and whether the
//                      elements are oops / the destination is uninitialized
//   entry            - if non-null, receives the pc immediately after enter()
//   nooverlap_target - passed to array_overlap_test(); presumably the disjoint
//                      stub entry that is branched to when the arrays do not
//                      overlap - confirm against array_overlap_test()
//
// Inputs:
//   c_rarg0   - source array address
//   c_rarg1   - destination array address
//   c_rarg2   - element count, treated as ssize_t, can be zero
//
// Returns the start address of the stub. The stub itself returns 0 in rax.
//
address StubGenerator::generate_conjoint_copy_avx3_masked(StubId stub_id, address* entry, address nooverlap_target) {
  // aligned is always false -- x86_64 always uses the unaligned code
  const bool aligned = false;
  int shift;
  bool is_oop;
  bool dest_uninitialized;

  // Derive element size (as a shift) and oop properties from the stub id.
  switch (stub_id) {
  case StubId::stubgen_jbyte_arraycopy_id:
    shift = 0;
    is_oop = false;
    dest_uninitialized = false;
    break;
  case StubId::stubgen_jshort_arraycopy_id:
    shift = 1;
    is_oop = false;
    dest_uninitialized = false;
    break;
  case StubId::stubgen_jint_arraycopy_id:
    shift = 2;
    is_oop = false;
    dest_uninitialized = false;
    break;
  case StubId::stubgen_jlong_arraycopy_id:
    shift = 3;
    is_oop = false;
    dest_uninitialized = false;
    break;
  case StubId::stubgen_oop_arraycopy_id:
    shift = (UseCompressedOops ? 2 : 3);
    is_oop = true;
    dest_uninitialized = false;
    break;
  case StubId::stubgen_oop_arraycopy_uninit_id:
    shift = (UseCompressedOops ? 2 : 3);
    is_oop = true;
    dest_uninitialized = true;
    break;
  default:
    ShouldNotReachHere();
  }
  GrowableArray<address> entries;
  GrowableArray<address> extras;
  // Unsafe-access handlers are only registered for primitive copies;
  // reloc addresses are only collected for oop copies under ZGC.
  bool add_handlers = !is_oop && !aligned;
  bool add_relocs = UseZGC && is_oop;
  bool add_extras = add_handlers || add_relocs;
  int expected_entry_count = (entry != nullptr ? 2 : 1);
  int expected_handler_count = (add_handlers ? 1 : 0) * UnsafeMemoryAccess::COLUMN_COUNT; // 0/1 x UMAM {start,end,handler}
  int entry_count = StubInfo::entry_count(stub_id);
  assert(entry_count == expected_entry_count, "sanity check");
  GrowableArray<address>* entries_ptr = (entry_count == 1 ? nullptr : &entries);
  GrowableArray<address>* extras_ptr = (add_extras ? &extras : nullptr);
  // A non-null result means the stub was found in the archive: restore its
  // entry and extra addresses and return without generating any code.
  address start = load_archive_data(stub_id, entries_ptr, extras_ptr);
  if (start != nullptr) {
    assert(entries.length() == expected_entry_count - 1,
           "unexpected entry count %d", entries.length());
    assert(!add_handlers || extras.length() == expected_handler_count,
           "unexpected handler addresses count %d", extras.length());
    if (entry != nullptr) {
      *entry = entries.at(0);
    }
    if (add_handlers) {
      // restore 1 x UMAM {start,end,handler} addresses from extras
      register_unsafe_access_handlers(extras, 0, 1);
    }
#if INCLUDE_ZGC
    if (add_relocs) {
      // register addresses at which ZGC does colour patching
      register_reloc_addresses(extras, 0, extras.length());
    }
#endif // INCLUDE_ZGC
    return start;
  }
  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, stub_id);
  start = __ pc();

  // Use 64-byte vectors from the start only when they are available and no
  // size threshold is configured; otherwise the 32-byte paths are used first.
  bool use64byteVector = (MaxVectorSize > 32) && (CopyAVX3Threshold == 0);

  Label L_main_pre_loop, L_main_pre_loop_64bytes, L_pre_main_post_64;
  Label L_main_loop, L_main_loop_64bytes, L_tail, L_tail64, L_exit, L_entry;
  const Register from        = rdi;  // source array address
  const Register to          = rsi;  // destination array address
  const Register count       = rdx;  // elements count
  const Register temp1       = r8;   // remaining element count
  const Register temp2       = rcx;  // scratch: start index / alignment remainder
  const Register temp3       = r11;  // scratch: end index / masked-copy temp
  const Register temp4       = rax;  // scratch for the special-case copies
  // End pointers are inclusive, and if count is not zero they point
  // to the last unit copied:  end_to[0] := end_from[0]

  __ enter(); // required for proper stackwalking of RuntimeStub frame
  assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.

  if (entry != nullptr) {
    // Publish the post-enter() entry point and record it for archiving.
    *entry = __ pc();
    entries.append(*entry);
     // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
    BLOCK_COMMENT("Entry:");
  }

  array_overlap_test(nooverlap_target, (Address::ScaleFactor)(shift));

  BasicType type_vec[] = { T_BYTE, T_SHORT, T_INT, T_LONG};
  BasicType type = is_oop ? T_OBJECT : type_vec[shift];

  setup_argument_regs(type);

  DecoratorSet decorators = IN_HEAP | IS_ARRAY;
  if (dest_uninitialized) {
    decorators |= IS_DEST_UNINITIALIZED;
  }
  if (aligned) {
    decorators |= ARRAYCOPY_ALIGNED;
  }
  BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
  bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
  {
    // Elements per main-loop iteration (192 bytes) and the element count at
    // which the 64-byte path is chosen, both indexed by shift.
    // Type(shift)       byte(0), short(1), int(2),   long(3)
    int loop_size[]   = { 192,     96,       48,      24};
    int threshold[]   = { 4096,    2048,     1024,    512};

    // UnsafeMemoryAccess page error: continue after unsafe access
    UnsafeMemoryAccessMark umam(this, add_handlers, true);
    // 'from', 'to' and 'count' are now valid

    // temp1 holds remaining count.
    __ movq(temp1, count);

    // Zero length check.
    __ BIND(L_tail);
    __ cmpq(temp1, 0);
    __ jcc(Assembler::lessEqual, L_exit);

    // start index = 0, end index = remaining count
    __ mov64(temp2, 0);
    __ movq(temp3, temp1);
    // Special cases using 32 byte [masked] vector copy operations.
    arraycopy_avx3_special_cases_conjoint(xmm1, k2, from, to, temp2, temp3, temp1, shift,
                                          temp4, use64byteVector, L_entry, L_exit);

    // PRE-MAIN-POST loop for aligned copy.
    __ BIND(L_entry);

    if ((MaxVectorSize > 32) && (CopyAVX3Threshold != 0)) {
      // At or above the threshold, switch to the 64-byte vector path.
      __ cmpq(temp1, threshold[shift]);
      __ jcc(Assembler::greaterEqual, L_pre_main_post_64);
    }

    if ((MaxVectorSize < 64)  || (CopyAVX3Threshold != 0)) {
      // Partial copy to make dst address 32 byte aligned.
      // temp2 = misalignment in bytes of the destination end address.
      __ leaq(temp2, Address(to, temp1, (Address::ScaleFactor)(shift), 0));
      __ andq(temp2, 31);
      __ jcc(Assembler::equal, L_main_pre_loop);

      // Convert the byte misalignment to elements and copy those trailing
      // elements with a masked copy starting at the new remaining count.
      if (shift) {
        __ shrq(temp2, shift);
      }
      __ subq(temp1, temp2);
      copy32_masked_avx(to, from, xmm1, k2, temp2, temp1, temp3, shift);

      __ cmpq(temp1, loop_size[shift]);
      __ jcc(Assembler::less, L_tail);

      __ BIND(L_main_pre_loop);

      // Main loop with aligned copy block size of 192 bytes at 32 byte granularity.
      __ align32();
      __ BIND(L_main_loop);
         // Copy the 192 bytes immediately below the current end index.
         copy64_avx(to, from, temp1, xmm1, true, shift, -64);
         copy64_avx(to, from, temp1, xmm1, true, shift, -128);
         copy64_avx(to, from, temp1, xmm1, true, shift, -192);
         __ subptr(temp1, loop_size[shift]);
         __ cmpq(temp1, loop_size[shift]);
         __ jcc(Assembler::greater, L_main_loop);

      // Tail loop.
      __ jmp(L_tail);
    }

    if (MaxVectorSize > 32) {
      __ BIND(L_pre_main_post_64);
      // Partial copy to make dst address 64 byte aligned.
      // temp2 = misalignment in bytes of the destination end address.
      __ leaq(temp2, Address(to, temp1, (Address::ScaleFactor)(shift), 0));
      __ andq(temp2, 63);
      __ jcc(Assembler::equal, L_main_pre_loop_64bytes);

      if (shift) {
        __ shrq(temp2, shift);
      }
      __ subq(temp1, temp2);
      copy64_masked_avx(to, from, xmm1, k2, temp2, temp1, temp3, shift, 0 , true);

      __ cmpq(temp1, loop_size[shift]);
      __ jcc(Assembler::less, L_tail64);

      __ BIND(L_main_pre_loop_64bytes);

      // Main loop with aligned copy block size of 192 bytes at
      // 64 byte copy granularity.
      __ align32();
      __ BIND(L_main_loop_64bytes);
         copy64_avx(to, from, temp1, xmm1, true, shift, -64 , true);
         copy64_avx(to, from, temp1, xmm1, true, shift, -128, true);
         copy64_avx(to, from, temp1, xmm1, true, shift, -192, true);
         __ subq(temp1, loop_size[shift]);
         __ cmpq(temp1, loop_size[shift]);
         __ jcc(Assembler::greater, L_main_loop_64bytes);

      // Zero length check.
      __ cmpq(temp1, 0);
      __ jcc(Assembler::lessEqual, L_exit);

      __ BIND(L_tail64);

      // Tail handling using 64 byte [masked] vector copy operations.
      use64byteVector = true;
      // start index = 0, end index = remaining count
      __ mov64(temp2, 0);
      __ movq(temp3, temp1);
      arraycopy_avx3_special_cases_conjoint(xmm1, k2, from, to, temp2, temp3, temp1, shift,
                                            temp4, use64byteVector, L_entry, L_exit);
    }
    __ BIND(L_exit);
  }
  // NOTE(review): captured but not used later in this function - confirm it is intentional.
  address ucme_exit_pc = __ pc();
  // When called from generic_arraycopy r11 contains specific values
  // used during arraycopy epilogue, re-initializing r11.
  if(is_oop) {
    __ movq(r11, count);
  }
  bs->arraycopy_epilogue(_masm, decorators, type, from, to, count);
  restore_argument_regs(type);
  INC_COUNTER_NP(get_profile_ctr(shift), rscratch1); // Update counter after rscratch1 is free
  __ xorptr(rax, rax); // return 0
  __ vzeroupper();
  __ leave(); // required for proper stackwalking of RuntimeStub frame
  __ ret(0);

  // retrieve the registered handler addresses
  address end = __ pc();
  if (add_handlers) {
    retrieve_unsafe_access_handlers(start, end, extras);
  }
  // Checked before reloc addresses are appended, so only handlers are counted.
  assert(extras.length() == expected_handler_count,
         "unexpected handler addresses count %d", extras.length());
#if INCLUDE_ZGC
  // retrieve addresses at which ZGC does colour patching
  if (add_relocs) {
    retrieve_reloc_addresses(start, end, extras);
  }
#endif // INCLUDE_ZGC
  // record the stub entry and end plus the no_push entry and any
  // extra handler addresses
  store_archive_data(stub_id, start, end, entries_ptr, extras_ptr);

  return start;
}
1176
1177 void StubGenerator::arraycopy_avx3_special_cases(XMMRegister xmm, KRegister mask, Register from,
1178 Register to, Register count, int shift,
1179 Register index, Register temp,
1180 bool use64byteVector, Label& L_entry, Label& L_exit) {
1181 Label L_entry_64, L_entry_96, L_entry_128;
1182 Label L_entry_160, L_entry_192;
1183
1184 int size_mat[][6] = {
1185 /* T_BYTE */ {32 , 64, 96 , 128 , 160 , 192 },
1186 /* T_SHORT*/ {16 , 32, 48 , 64 , 80 , 96 },
1187 /* T_INT */ {8 , 16, 24 , 32 , 40 , 48 },
1188 /* T_LONG */ {4 , 8, 12 , 16 , 20 , 24 }
1189 };
1190
1191 // Case A) Special case for length less than equal to 32 bytes.
1192 __ cmpq(count, size_mat[shift][0]);
1193 __ jccb(Assembler::greater, L_entry_64);
1194 copy32_masked_avx(to, from, xmm, mask, count, index, temp, shift);
1195 __ jmp(L_exit);
1196
1197 // Case B) Special case for length less than equal to 64 bytes.
1198 __ BIND(L_entry_64);
1199 __ cmpq(count, size_mat[shift][1]);
1200 __ jccb(Assembler::greater, L_entry_96);
1201 copy64_masked_avx(to, from, xmm, mask, count, index, temp, shift, 0, use64byteVector);
1202 __ jmp(L_exit);
1203
1204 // Case C) Special case for length less than equal to 96 bytes.
1205 __ BIND(L_entry_96);
1206 __ cmpq(count, size_mat[shift][2]);
1207 __ jccb(Assembler::greater, L_entry_128);
1208 copy64_avx(to, from, index, xmm, false, shift, 0, use64byteVector);
1209 __ subq(count, 64 >> shift);
1210 copy32_masked_avx(to, from, xmm, mask, count, index, temp, shift, 64);
1211 __ jmp(L_exit);
1212
1213 // Case D) Special case for length less than equal to 128 bytes.
1214 __ BIND(L_entry_128);
1215 __ cmpq(count, size_mat[shift][3]);
1216 __ jccb(Assembler::greater, L_entry_160);
1217 copy64_avx(to, from, index, xmm, false, shift, 0, use64byteVector);
1218 copy32_avx(to, from, index, xmm, shift, 64);
1219 __ subq(count, 96 >> shift);
1220 copy32_masked_avx(to, from, xmm, mask, count, index, temp, shift, 96);
1221 __ jmp(L_exit);
1222
1223 // Case E) Special case for length less than equal to 160 bytes.
1224 __ BIND(L_entry_160);
1225 __ cmpq(count, size_mat[shift][4]);
1226 __ jccb(Assembler::greater, L_entry_192);
1227 copy64_avx(to, from, index, xmm, false, shift, 0, use64byteVector);
1228 copy64_avx(to, from, index, xmm, false, shift, 64, use64byteVector);
1229 __ subq(count, 128 >> shift);
1230 copy32_masked_avx(to, from, xmm, mask, count, index, temp, shift, 128);
1231 __ jmp(L_exit);
1232
1233 // Case F) Special case for length less than equal to 192 bytes.
1234 __ BIND(L_entry_192);
1235 __ cmpq(count, size_mat[shift][5]);
1236 __ jcc(Assembler::greater, L_entry);
1237 copy64_avx(to, from, index, xmm, false, shift, 0, use64byteVector);
1238 copy64_avx(to, from, index, xmm, false, shift, 64, use64byteVector);
1239 copy32_avx(to, from, index, xmm, shift, 128);
1240 __ subq(count, 160 >> shift);
1241 copy32_masked_avx(to, from, xmm, mask, count, index, temp, shift, 160);
1242 __ jmp(L_exit);
1243 }
1244
1245 void StubGenerator::arraycopy_avx3_special_cases_256(XMMRegister xmm, KRegister mask, Register from,
1246 Register to, Register count, int shift, Register index,
1247 Register temp, Label& L_exit) {
1248 Label L_entry_64, L_entry_128, L_entry_192, L_entry_256;
1249
1250 int size_mat[][4] = {
1251 /* T_BYTE */ {64, 128, 192, 256},
1252 /* T_SHORT*/ {32, 64 , 96 , 128},
1253 /* T_INT */ {16, 32 , 48 , 64},
1254 /* T_LONG */ { 8, 16 , 24 , 32}
1255 };
1256
1257 assert(MaxVectorSize == 64, "vector length != 64");
1258 // Case A) Special case for length less than or equal to 64 bytes.
1259 __ BIND(L_entry_64);
1260 __ cmpq(count, size_mat[shift][0]);
1261 __ jccb(Assembler::greater, L_entry_128);
1262 copy64_masked_avx(to, from, xmm, mask, count, index, temp, shift, 0, true);
1263 __ jmp(L_exit);
1264
1265 // Case B) Special case for length less than or equal to 128 bytes.
1266 __ BIND(L_entry_128);
1267 __ cmpq(count, size_mat[shift][1]);
1268 __ jccb(Assembler::greater, L_entry_192);
1269 copy64_avx(to, from, index, xmm, false, shift, 0, true);
1270 __ subq(count, 64 >> shift);
1271 copy64_masked_avx(to, from, xmm, mask, count, index, temp, shift, 64, true);
1272 __ jmp(L_exit);
1273
1274 // Case C) Special case for length less than or equal to 192 bytes.
1275 __ BIND(L_entry_192);
1276 __ cmpq(count, size_mat[shift][2]);
1277 __ jcc(Assembler::greater, L_entry_256);
1278 copy64_avx(to, from, index, xmm, false, shift, 0, true);
1279 copy64_avx(to, from, index, xmm, false, shift, 64, true);
1280 __ subq(count, 128 >> shift);
1281 copy64_masked_avx(to, from, xmm, mask, count, index, temp, shift, 128, true);
1282 __ jmp(L_exit);
1283
1284 // Case D) Special case for length less than or equal to 256 bytes.
1285 __ BIND(L_entry_256);
1286 copy64_avx(to, from, index, xmm, false, shift, 0, true);
1287 copy64_avx(to, from, index, xmm, false, shift, 64, true);
1288 copy64_avx(to, from, index, xmm, false, shift, 128, true);
1289 __ subq(count, 192 >> shift);
1290 copy64_masked_avx(to, from, xmm, mask, count, index, temp, shift, 192, true);
1291 __ jmp(L_exit);
1292 }
1293
1294 void StubGenerator::arraycopy_avx3_special_cases_conjoint(XMMRegister xmm, KRegister mask, Register from,
1295 Register to, Register start_index, Register end_index,
1296 Register count, int shift, Register temp,
1297 bool use64byteVector, Label& L_entry, Label& L_exit) {
1298 Label L_entry_64, L_entry_96, L_entry_128;
1299 Label L_entry_160, L_entry_192;
1300 bool avx3 = (MaxVectorSize > 32) && (CopyAVX3Threshold == 0);
1301
1302 int size_mat[][6] = {
1303 /* T_BYTE */ {32 , 64, 96 , 128 , 160 , 192 },
1304 /* T_SHORT*/ {16 , 32, 48 , 64 , 80 , 96 },
1305 /* T_INT */ {8 , 16, 24 , 32 , 40 , 48 },
1306 /* T_LONG */ {4 , 8, 12 , 16 , 20 , 24 }
1307 };
1308
1309 // Case A) Special case for length less than equal to 32 bytes.
1310 __ cmpq(count, size_mat[shift][0]);
1311 __ jccb(Assembler::greater, L_entry_64);
1312 copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
1313 __ jmp(L_exit);
1314
1315 // Case B) Special case for length less than equal to 64 bytes.
1316 __ BIND(L_entry_64);
1317 __ cmpq(count, size_mat[shift][1]);
1318 __ jccb(Assembler::greater, L_entry_96);
1319 if (avx3) {
1320 copy64_masked_avx(to, from, xmm, mask, count, start_index, temp, shift, 0, true);
1321 } else {
1322 copy32_avx(to, from, end_index, xmm, shift, -32);
1323 __ subq(count, 32 >> shift);
1324 copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
1325 }
1326 __ jmp(L_exit);
1327
1328 // Case C) Special case for length less than equal to 96 bytes.
1329 __ BIND(L_entry_96);
1330 __ cmpq(count, size_mat[shift][2]);
1331 __ jccb(Assembler::greater, L_entry_128);
1332 copy64_avx(to, from, end_index, xmm, true, shift, -64, use64byteVector);
1333 __ subq(count, 64 >> shift);
1334 copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
1335 __ jmp(L_exit);
1336
1337 // Case D) Special case for length less than equal to 128 bytes.
1338 __ BIND(L_entry_128);
1339 __ cmpq(count, size_mat[shift][3]);
1340 __ jccb(Assembler::greater, L_entry_160);
1341 copy64_avx(to, from, end_index, xmm, true, shift, -64, use64byteVector);
1342 copy32_avx(to, from, end_index, xmm, shift, -96);
1343 __ subq(count, 96 >> shift);
1344 copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
1345 __ jmp(L_exit);
1346
1347 // Case E) Special case for length less than equal to 160 bytes.
1348 __ BIND(L_entry_160);
1349 __ cmpq(count, size_mat[shift][4]);
1350 __ jccb(Assembler::greater, L_entry_192);
1351 copy64_avx(to, from, end_index, xmm, true, shift, -64, use64byteVector);
1352 copy64_avx(to, from, end_index, xmm, true, shift, -128, use64byteVector);
1353 __ subq(count, 128 >> shift);
1354 copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
1355 __ jmp(L_exit);
1356
1357 // Case F) Special case for length less than equal to 192 bytes.
1358 __ BIND(L_entry_192);
1359 __ cmpq(count, size_mat[shift][5]);
1360 __ jcc(Assembler::greater, L_entry);
1361 copy64_avx(to, from, end_index, xmm, true, shift, -64, use64byteVector);
1362 copy64_avx(to, from, end_index, xmm, true, shift, -128, use64byteVector);
1363 copy32_avx(to, from, end_index, xmm, shift, -160);
1364 __ subq(count, 160 >> shift);
1365 copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
1366 __ jmp(L_exit);
1367 }
1368
1369 void StubGenerator::copy256_avx3(Register dst, Register src, Register index, XMMRegister xmm1,
1370 XMMRegister xmm2, XMMRegister xmm3, XMMRegister xmm4,
1371 int shift, int offset) {
1372 if (MaxVectorSize == 64) {
1373 Address::ScaleFactor scale = (Address::ScaleFactor)(shift);
1374 __ prefetcht0(Address(src, index, scale, offset + 0x200));
1375 __ prefetcht0(Address(src, index, scale, offset + 0x240));
1376 __ prefetcht0(Address(src, index, scale, offset + 0x280));
1377 __ prefetcht0(Address(src, index, scale, offset + 0x2C0));
1378
1379 __ prefetcht0(Address(src, index, scale, offset + 0x400));
1380 __ prefetcht0(Address(src, index, scale, offset + 0x440));
1381 __ prefetcht0(Address(src, index, scale, offset + 0x480));
1382 __ prefetcht0(Address(src, index, scale, offset + 0x4C0));
1383
1384 __ evmovdquq(xmm1, Address(src, index, scale, offset), Assembler::AVX_512bit);
1385 __ evmovdquq(xmm2, Address(src, index, scale, offset + 0x40), Assembler::AVX_512bit);
1386 __ evmovdquq(xmm3, Address(src, index, scale, offset + 0x80), Assembler::AVX_512bit);
1387 __ evmovdquq(xmm4, Address(src, index, scale, offset + 0xC0), Assembler::AVX_512bit);
1388
1389 __ evmovntdquq(Address(dst, index, scale, offset), xmm1, Assembler::AVX_512bit);
1390 __ evmovntdquq(Address(dst, index, scale, offset + 0x40), xmm2, Assembler::AVX_512bit);
1391 __ evmovntdquq(Address(dst, index, scale, offset + 0x80), xmm3, Assembler::AVX_512bit);
1392 __ evmovntdquq(Address(dst, index, scale, offset + 0xC0), xmm4, Assembler::AVX_512bit);
1393 }
1394 }
1395
1396 void StubGenerator::copy64_masked_avx(Register dst, Register src, XMMRegister xmm,
1397 KRegister mask, Register length, Register index,
1398 Register temp, int shift, int offset,
1399 bool use64byteVector) {
1400 BasicType type[] = { T_BYTE, T_SHORT, T_INT, T_LONG};
1401 assert(MaxVectorSize >= 32, "vector length should be >= 32");
1402 if (!use64byteVector) {
1403 copy32_avx(dst, src, index, xmm, shift, offset);
1404 __ subptr(length, 32 >> shift);
1405 copy32_masked_avx(dst, src, xmm, mask, length, index, temp, shift, offset+32);
1406 } else {
1407 Address::ScaleFactor scale = (Address::ScaleFactor)(shift);
1408 assert(MaxVectorSize == 64, "vector length != 64");
1409 __ mov64(temp, -1L);
1410 __ bzhiq(temp, temp, length);
1411 __ kmovql(mask, temp);
1412 __ evmovdqu(type[shift], mask, xmm, Address(src, index, scale, offset), false, Assembler::AVX_512bit);
1413 __ evmovdqu(type[shift], mask, Address(dst, index, scale, offset), xmm, true, Assembler::AVX_512bit);
1414 }
1415 }
1416
1417
1418 void StubGenerator::copy32_masked_avx(Register dst, Register src, XMMRegister xmm,
1419 KRegister mask, Register length, Register index,
1420 Register temp, int shift, int offset) {
1421 assert(MaxVectorSize >= 32, "vector length should be >= 32");
1422 BasicType type[] = { T_BYTE, T_SHORT, T_INT, T_LONG};
1423 Address::ScaleFactor scale = (Address::ScaleFactor)(shift);
1424 __ mov64(temp, -1L);
1425 __ bzhiq(temp, temp, length);
1426 __ kmovql(mask, temp);
1427 __ evmovdqu(type[shift], mask, xmm, Address(src, index, scale, offset), false, Assembler::AVX_256bit);
1428 __ evmovdqu(type[shift], mask, Address(dst, index, scale, offset), xmm, true, Assembler::AVX_256bit);
1429 }
1430
1431
1432 void StubGenerator::copy32_avx(Register dst, Register src, Register index, XMMRegister xmm,
1433 int shift, int offset) {
1434 assert(MaxVectorSize >= 32, "vector length should be >= 32");
1435 Address::ScaleFactor scale = (Address::ScaleFactor)(shift);
1436 __ vmovdqu(xmm, Address(src, index, scale, offset));
1437 __ vmovdqu(Address(dst, index, scale, offset), xmm);
1438 }
1439
1440
1441 void StubGenerator::copy64_avx(Register dst, Register src, Register index, XMMRegister xmm,
1442 bool conjoint, int shift, int offset, bool use64byteVector) {
1443 assert(MaxVectorSize == 64 || MaxVectorSize == 32, "vector length mismatch");
1444 if (!use64byteVector) {
1445 if (conjoint) {
1446 copy32_avx(dst, src, index, xmm, shift, offset+32);
1447 copy32_avx(dst, src, index, xmm, shift, offset);
1448 } else {
1449 copy32_avx(dst, src, index, xmm, shift, offset);
1450 copy32_avx(dst, src, index, xmm, shift, offset+32);
1451 }
1452 } else {
1453 Address::ScaleFactor scale = (Address::ScaleFactor)(shift);
1454 __ evmovdquq(xmm, Address(src, index, scale, offset), Assembler::AVX_512bit);
1455 __ evmovdquq(Address(dst, index, scale, offset), xmm, Assembler::AVX_512bit);
1456 }
1457 }
1458
1459 #endif // COMPILER2
1460
1461
// Generates (or restores from the archive) the disjoint jbyte array copy stub.
// When COMPILER2 is built in and the CPU supports AVX512VL/BW and BMI2 with
// MaxVectorSize >= 32, generation is delegated to
// generate_disjoint_copy_avx3_masked(); otherwise the scalar/qword code below
// is emitted.
//
// Arguments:
//   entry     - location for return of (post-push) entry
//
// Inputs:
//   c_rarg0   - source array address
//   c_rarg1   - destination array address
//   c_rarg2   - element count, treated as ssize_t, can be zero
//
// If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
// we let the hardware handle it.  The one to eight bytes within words,
// dwords or qwords that span cache line boundaries will still be loaded
// and stored atomically.
//
// Side Effects:
//   entry is set to the no-overlap entry point
//   used by generate_conjoint_byte_copy().
//
// Returns the start address of the stub. The stub itself returns 0 in rax.
//
address StubGenerator::generate_disjoint_byte_copy(address* entry) {
  StubId stub_id = StubId::stubgen_jbyte_disjoint_arraycopy_id;
  // aligned is always false -- x86_64 always uses the unaligned code
  const bool aligned = false;
#ifdef COMPILER2
  if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize  >= 32) {
    return generate_disjoint_copy_avx3_masked(stub_id, entry);
  }
#endif // COMPILER2
  GrowableArray<address> entries;
  GrowableArray<address> extras;
  int expected_entry_count = (entry != nullptr ? 2 : 1);
  int expected_handler_count = (2 * UnsafeMemoryAccess::COLUMN_COUNT); // 2 x UMAM {start,end,handler}
  int entry_count = StubInfo::entry_count(stub_id);
  assert(entry_count == expected_entry_count, "sanity check");
  GrowableArray<address>* entries_ptr = (entry_count == 1 ? nullptr : &entries);
  // A non-null result means the stub was found in the archive: restore its
  // entry and handler addresses and return without generating any code.
  address start = load_archive_data(stub_id, entries_ptr, &extras);
  if (start != nullptr) {
    assert(entries.length() == expected_entry_count - 1,
           "unexpected entry count %d", entries.length());
    assert(extras.length() == expected_handler_count,
           "unexpected handler addresses count %d", extras.length());
    if (entry != nullptr) {
      *entry = entries.at(0);
    }
    // restore 2 UMAM {start,end,handler} addresses from extras
    register_unsafe_access_handlers(extras, 0, 2);
    return start;
  }
  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, stub_id);
  start = __ pc();
  DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;

  Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes;
  Label L_copy_byte, L_exit;
  const Register from        = rdi;  // source array address
  const Register to          = rsi;  // destination array address
  const Register count       = rdx;  // elements count
  const Register byte_count  = rcx;
  const Register qword_count = count;
  const Register end_from    = from; // source array end address
  const Register end_to      = to;   // destination array end address
  // End pointers are inclusive, and if count is not zero they point
  // to the last unit copied:  end_to[0] := end_from[0]

  __ enter(); // required for proper stackwalking of RuntimeStub frame
  assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.

  if (entry != nullptr) {
    // Publish the post-enter() entry point and record it for archiving.
    *entry = __ pc();
    entries.append(*entry);
     // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
    BLOCK_COMMENT("Entry:");
  }

  setup_arg_regs(); // from => rdi, to => rsi, count => rdx
                    // r9 and r10 may be used to save non-volatile registers

  {
    // UnsafeMemoryAccess page error: continue after unsafe access
    UnsafeMemoryAccessMark umam(this, !aligned, true);
    // 'from', 'to' and 'count' are now valid
    // Keep the byte count for the trailing 4/2/1-byte tests below.
    __ movptr(byte_count, count);
    __ shrptr(count, 3); // count => qword_count

    // Copy from low to high addresses.  Use 'to' as scratch.
    // end_* point at the last full qword; qword_count is negated so that
    // end_* + qword_count * 8 + 8 walks forward from the array start.
    __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
    __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
    __ negptr(qword_count); // make the count negative
    // NOTE(review): L_copy_bytes is not bound in this function; it is passed
    // to copy_bytes_forward() below, which presumably binds it - confirm.
    __ jmp(L_copy_bytes);

    // Copy trailing qwords
  __ BIND(L_copy_8_bytes);
    __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
    __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
    __ increment(qword_count);
    __ jcc(Assembler::notZero, L_copy_8_bytes);

    // Check for and copy trailing dword
    // (end_* + 8 is the first byte after the last qword copied)
  __ BIND(L_copy_4_bytes);
    __ testl(byte_count, 4);
    __ jccb(Assembler::zero, L_copy_2_bytes);
    __ movl(rax, Address(end_from, 8));
    __ movl(Address(end_to, 8), rax);

    __ addptr(end_from, 4);
    __ addptr(end_to, 4);

    // Check for and copy trailing word
  __ BIND(L_copy_2_bytes);
    __ testl(byte_count, 2);
    __ jccb(Assembler::zero, L_copy_byte);
    __ movw(rax, Address(end_from, 8));
    __ movw(Address(end_to, 8), rax);

    __ addptr(end_from, 2);
    __ addptr(end_to, 2);

    // Check for and copy trailing byte
  __ BIND(L_copy_byte);
    __ testl(byte_count, 1);
    __ jccb(Assembler::zero, L_exit);
    __ movb(rax, Address(end_from, 8));
    __ movb(Address(end_to, 8), rax);
  }
__ BIND(L_exit);
  // Exit pc handed to the second UnsafeMemoryAccessMark below.
  address ucme_exit_pc = __ pc();
  restore_arg_regs();
  INC_COUNTER_NP(SharedRuntime::_jbyte_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
  __ xorptr(rax, rax); // return 0
  __ vzeroupper();
  __ leave(); // required for proper stackwalking of RuntimeStub frame
  __ ret(0);

  {
    // This code is emitted after the stub's ret and rejoins the tail handling
    // by jumping to L_copy_4_bytes.
    UnsafeMemoryAccessMark umam(this, !aligned, false, ucme_exit_pc);
    // Copy in multi-bytes chunks
    copy_bytes_forward(end_from, end_to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, T_BYTE);
    __ jmp(L_copy_4_bytes);
  }

  // retrieve the registered handler addresses
  address end = __ pc();
  retrieve_unsafe_access_handlers(start, end, extras);
  assert(extras.length() == expected_handler_count,
         "unexpected handler addresses count %d", extras.length());

  // record the stub entry and end plus the no_push entry and any
  // extra handler addresses
  store_archive_data(stub_id, start, end, entries_ptr, &extras);

  return start;
}
1613
1614
1615 // Arguments:
1616 // entry - location for return of (post-push) entry
1617 // nooverlap_target - entry to branch to if no overlap detected
1618 //
1619 // Inputs:
1620 // c_rarg0 - source array address
1621 // c_rarg1 - destination array address
1622 // c_rarg2 - element count, treated as ssize_t, can be zero
1623 //
1624 // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1625 // we let the hardware handle it. The one to eight bytes within words,
1626 // dwords or qwords that span cache line boundaries will still be loaded
1627 // and stored atomically.
1628 //
1629 address StubGenerator::generate_conjoint_byte_copy(address nooverlap_target, address* entry) {
1630 StubId stub_id = StubId::stubgen_jbyte_arraycopy_id;
1631 // aligned is always false -- x86_64 always uses the unaligned code
1632 const bool aligned = false;
1633 #ifdef COMPILER2
1634 if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) {
1635 return generate_conjoint_copy_avx3_masked(stub_id, entry, nooverlap_target);
1636 }
1637 #endif // COMPILER2
1638 GrowableArray<address> entries;
1639 GrowableArray<address> extras;
1640 int expected_entry_count = (entry != nullptr ? 2 : 1);
1641 int expected_handler_count = (2 * UnsafeMemoryAccess::COLUMN_COUNT); // 2 x UMAM {start,end,handler}
1642 int entry_count = StubInfo::entry_count(stub_id);
1643 assert(entry_count == expected_entry_count, "sanity check");
1644 GrowableArray<address>* entries_ptr = (entry_count == 1 ? nullptr : &entries);
1645 address start = load_archive_data(stub_id, entries_ptr, &extras);
1646 if (start != nullptr) {
1647 assert(entries.length() == expected_entry_count - 1,
1648 "unexpected entry count %d", entries.length());
1649 assert(extras.length() == expected_handler_count,
1650 "unexpected handler addresses count %d", extras.length());
1651 if (entry != nullptr) {
1652 *entry = entries.at(0);
1653 }
1654 // restore 2 UMAM {start,end,handler} addresses from extras
1655 register_unsafe_access_handlers(extras, 0, 2);
1656 return start;
1657 }
1658 __ align(CodeEntryAlignment);
1659 StubCodeMark mark(this, stub_id);
1660 start = __ pc();
1661 DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1662
1663 Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes;
1664 const Register from = rdi; // source array address
1665 const Register to = rsi; // destination array address
1666 const Register count = rdx; // elements count
1667 const Register byte_count = rcx;
1668 const Register qword_count = count;
1669
1670 __ enter(); // required for proper stackwalking of RuntimeStub frame
1671 assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int.
1672
1673 if (entry != nullptr) {
1674 *entry = __ pc();
1675 entries.append(*entry);
1676 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1677 BLOCK_COMMENT("Entry:");
1678 }
1679
1680 array_overlap_test(nooverlap_target, Address::times_1);
1681 setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1682 // r9 and r10 may be used to save non-volatile registers
1683
1684 {
1685 // UnsafeMemoryAccess page error: continue after unsafe access
1686 UnsafeMemoryAccessMark umam(this, !aligned, true);
1687 // 'from', 'to' and 'count' are now valid
1688 __ movptr(byte_count, count);
1689 __ shrptr(count, 3); // count => qword_count
1690
1691 // Copy from high to low addresses.
1692
1693 // Check for and copy trailing byte
1694 __ testl(byte_count, 1);
1695 __ jcc(Assembler::zero, L_copy_2_bytes);
1696 __ movb(rax, Address(from, byte_count, Address::times_1, -1));
1697 __ movb(Address(to, byte_count, Address::times_1, -1), rax);
1698 __ decrement(byte_count); // Adjust for possible trailing word
1699
1700 // Check for and copy trailing word
1701 __ BIND(L_copy_2_bytes);
1702 __ testl(byte_count, 2);
1703 __ jcc(Assembler::zero, L_copy_4_bytes);
1704 __ movw(rax, Address(from, byte_count, Address::times_1, -2));
1705 __ movw(Address(to, byte_count, Address::times_1, -2), rax);
1706
1707 // Check for and copy trailing dword
1708 __ BIND(L_copy_4_bytes);
1709 __ testl(byte_count, 4);
1710 __ jcc(Assembler::zero, L_copy_bytes);
1711 __ movl(rax, Address(from, qword_count, Address::times_8));
1712 __ movl(Address(to, qword_count, Address::times_8), rax);
1713 __ jmp(L_copy_bytes);
1714
1715 // Copy trailing qwords
1716 __ BIND(L_copy_8_bytes);
1717 __ movq(rax, Address(from, qword_count, Address::times_8, -8));
1718 __ movq(Address(to, qword_count, Address::times_8, -8), rax);
1719 __ decrement(qword_count);
1720 __ jcc(Assembler::notZero, L_copy_8_bytes);
1721 }
1722 restore_arg_regs();
1723 INC_COUNTER_NP(SharedRuntime::_jbyte_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
1724 __ xorptr(rax, rax); // return 0
1725 __ vzeroupper();
1726 __ leave(); // required for proper stackwalking of RuntimeStub frame
1727 __ ret(0);
1728
1729 {
1730 // UnsafeMemoryAccess page error: continue after unsafe access
1731 UnsafeMemoryAccessMark umam(this, !aligned, true);
1732 // Copy in multi-bytes chunks
1733 copy_bytes_backward(from, to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, T_BYTE);
1734 }
1735 restore_arg_regs();
1736 INC_COUNTER_NP(SharedRuntime::_jbyte_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
1737 __ xorptr(rax, rax); // return 0
1738 __ vzeroupper();
1739 __ leave(); // required for proper stackwalking of RuntimeStub frame
1740 __ ret(0);
1741
1742 // retrieve the registered handler addresses
1743 address end = __ pc();
1744 retrieve_unsafe_access_handlers(start, end, extras);
1745 assert(extras.length() == expected_handler_count,
1746 "unexpected handler addresses count %d", extras.length());
1747
1748 // record the stub entry and end plus the no_push entry and any
1749 // extra handler addresses
1750 store_archive_data(stub_id, start, end, entries_ptr, &extras);
1751
1752 return start;
1753 }
1754
1755
1756 // Arguments:
1757 // entry - location for return of (post-push) entry
1758 //
1759 // Inputs:
1760 // c_rarg0 - source array address
1761 // c_rarg1 - destination array address
1762 // c_rarg2 - element count, treated as ssize_t, can be zero
1763 //
1764 // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1765 // let the hardware handle it. The two or four words within dwords
1766 // or qwords that span cache line boundaries will still be loaded
1767 // and stored atomically.
1768 //
1769 // Side Effects:
1770 // entry is set to the no-overlap entry point
1771 // used by generate_conjoint_short_copy().
1772 //
1773 address StubGenerator::generate_disjoint_short_copy(address *entry) {
1774 StubId stub_id = StubId::stubgen_jshort_disjoint_arraycopy_id;
1775 // aligned is always false -- x86_64 always uses the unaligned code
1776 const bool aligned = false;
1777 #ifdef COMPILER2
1778 if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) {
1779 return generate_disjoint_copy_avx3_masked(stub_id, entry);
1780 }
1781 #endif // COMPILER2
1782 GrowableArray<address> entries;
1783 GrowableArray<address> extras;
1784 int expected_entry_count = (entry != nullptr ? 2 : 1);
1785 int expected_handler_count = (2 * UnsafeMemoryAccess::COLUMN_COUNT); // 2 x UMAM {start,end,handler}
1786 int entry_count = StubInfo::entry_count(stub_id);
1787 assert(entry_count == expected_entry_count, "sanity check");
1788 GrowableArray<address>* entries_ptr = (entry_count == 1 ? nullptr : &entries);
1789 address start = load_archive_data(stub_id, entries_ptr, &extras);
1790 if (start != nullptr) {
1791 assert(entries.length() == expected_entry_count - 1,
1792 "unexpected entry count %d", entries.length());
1793 assert(extras.length() == expected_handler_count,
1794 "unexpected handler addresses count %d", extras.length());
1795 if (entry != nullptr) {
1796 *entry = entries.at(0);
1797 }
1798 // restore 2 UMAM {start,end,handler} addresses from extras
1799 register_unsafe_access_handlers(extras, 0, 2);
1800 return start;
1801 }
1802 __ align(CodeEntryAlignment);
1803 StubCodeMark mark(this, stub_id);
1804 start = __ pc();
1805 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1806
1807 Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes,L_copy_2_bytes,L_exit;
1808 const Register from = rdi; // source array address
1809 const Register to = rsi; // destination array address
1810 const Register count = rdx; // elements count
1811 const Register word_count = rcx;
1812 const Register qword_count = count;
1813 const Register end_from = from; // source array end address
1814 const Register end_to = to; // destination array end address
1815 // End pointers are inclusive, and if count is not zero they point
1816 // to the last unit copied: end_to[0] := end_from[0]
1817
1818 __ enter(); // required for proper stackwalking of RuntimeStub frame
1819 assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int.
1820
1821 if (entry != nullptr) {
1822 *entry = __ pc();
1823 entries.append(*entry);
1824 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1825 BLOCK_COMMENT("Entry:");
1826 }
1827
1828 setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1829 // r9 and r10 may be used to save non-volatile registers
1830
1831 {
1832 // UnsafeMemoryAccess page error: continue after unsafe access
1833 UnsafeMemoryAccessMark umam(this, !aligned, true);
1834 // 'from', 'to' and 'count' are now valid
1835 __ movptr(word_count, count);
1836 __ shrptr(count, 2); // count => qword_count
1837
1838 // Copy from low to high addresses. Use 'to' as scratch.
1839 __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
1840 __ lea(end_to, Address(to, qword_count, Address::times_8, -8));
1841 __ negptr(qword_count);
1842 __ jmp(L_copy_bytes);
1843
1844 // Copy trailing qwords
1845 __ BIND(L_copy_8_bytes);
1846 __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
1847 __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
1848 __ increment(qword_count);
1849 __ jcc(Assembler::notZero, L_copy_8_bytes);
1850
1851 // Original 'dest' is trashed, so we can't use it as a
1852 // base register for a possible trailing word copy
1853
1854 // Check for and copy trailing dword
1855 __ BIND(L_copy_4_bytes);
1856 __ testl(word_count, 2);
1857 __ jccb(Assembler::zero, L_copy_2_bytes);
1858 __ movl(rax, Address(end_from, 8));
1859 __ movl(Address(end_to, 8), rax);
1860
1861 __ addptr(end_from, 4);
1862 __ addptr(end_to, 4);
1863
1864 // Check for and copy trailing word
1865 __ BIND(L_copy_2_bytes);
1866 __ testl(word_count, 1);
1867 __ jccb(Assembler::zero, L_exit);
1868 __ movw(rax, Address(end_from, 8));
1869 __ movw(Address(end_to, 8), rax);
1870 }
1871 __ BIND(L_exit);
1872 address ucme_exit_pc = __ pc();
1873 restore_arg_regs();
1874 INC_COUNTER_NP(SharedRuntime::_jshort_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
1875 __ xorptr(rax, rax); // return 0
1876 __ vzeroupper();
1877 __ leave(); // required for proper stackwalking of RuntimeStub frame
1878 __ ret(0);
1879
1880 {
1881 UnsafeMemoryAccessMark umam(this, !aligned, false, ucme_exit_pc);
1882 // Copy in multi-bytes chunks
1883 copy_bytes_forward(end_from, end_to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, T_SHORT);
1884 __ jmp(L_copy_4_bytes);
1885 }
1886
1887 // retrieve the registered handler addresses
1888 address end = __ pc();
1889 retrieve_unsafe_access_handlers(start, end, extras);
1890 assert(extras.length() == expected_handler_count,
1891 "unexpected handler addresses count %d", extras.length());
1892
1893 // record the stub entry and end plus the no_push entry and any
1894 // extra handler addresses
1895 store_archive_data(stub_id, start, end, entries_ptr, &extras);
1896
1897 return start;
1898 }
1899
1900
1901 address StubGenerator::generate_fill(StubId stub_id) {
1902 BasicType t;
1903 bool aligned;
1904 switch (stub_id) {
1905 case StubId::stubgen_jbyte_fill_id:
1906 t = T_BYTE;
1907 aligned = false;
1908 break;
1909 case StubId::stubgen_jshort_fill_id:
1910 t = T_SHORT;
1911 aligned = false;
1912 break;
1913 case StubId::stubgen_jint_fill_id:
1914 t = T_INT;
1915 aligned = false;
1916 break;
1917 case StubId::stubgen_arrayof_jbyte_fill_id:
1918 t = T_BYTE;
1919 aligned = true;
1920 break;
1921 case StubId::stubgen_arrayof_jshort_fill_id:
1922 t = T_SHORT;
1923 aligned = true;
1924 break;
1925 case StubId::stubgen_arrayof_jint_fill_id:
1926 t = T_INT;
1927 aligned = true;
1928 break;
1929 default:
1930 ShouldNotReachHere();
1931 }
1932 int entry_count = StubInfo::entry_count(stub_id);
1933 assert(entry_count == 1, "sanity check");
1934 GrowableArray<address> extras;
1935 bool add_handlers = ((t == T_BYTE) && !aligned);
1936 int handlers_count = (add_handlers ? 1 : 0);
1937 int expected_extras_count = (handlers_count * UnsafeMemoryAccess::COLUMN_COUNT); // 0/1 x UMAM {start,end,handler}
1938 GrowableArray<address>* extras_ptr = (add_handlers ? &extras : nullptr);
1939 address start = load_archive_data(stub_id, nullptr, extras_ptr);
1940 if (start != nullptr) {
1941 assert(extras.length() == expected_extras_count,
1942 "unexpected handler addresses count %d", extras.length());
1943 if (add_handlers) {
1944 // restore 1 x UMAM {start,end,handler} addresses from extras
1945 register_unsafe_access_handlers(extras, 0, 1);
1946 }
1947 return start;
1948 }
1949
1950 __ align(CodeEntryAlignment);
1951 StubCodeMark mark(this, stub_id);
1952 start = __ pc();
1953
1954 BLOCK_COMMENT("Entry:");
1955
1956 const Register to = c_rarg0; // destination array address
1957 const Register value = c_rarg1; // value
1958 const Register count = c_rarg2; // elements count
1959 __ mov(r11, count);
1960
1961 __ enter(); // required for proper stackwalking of RuntimeStub frame
1962
1963 {
1964 // Add set memory mark to protect against unsafe accesses faulting
1965 UnsafeMemoryAccessMark umam(this, add_handlers, true);
1966 __ generate_fill(t, aligned, to, value, r11, rax, xmm0);
1967 }
1968
1969 __ vzeroupper();
1970 __ leave(); // required for proper stackwalking of RuntimeStub frame
1971 __ ret(0);
1972
1973 address end = __ pc();
1974 if (add_handlers) {
1975 retrieve_unsafe_access_handlers(start, end, extras);
1976 }
1977 assert(extras.length() == expected_extras_count,
1978 "unexpected handler addresses count %d", extras.length());
1979 // record the stub entry and end
1980 store_archive_data(stub_id, start, end, nullptr, extras_ptr);
1981
1982 return start;
1983 }
1984
1985
1986 // Arguments:
1987 // entry - location for return of (post-push) entry
1988 // nooverlap_target - entry to branch to if no overlap detected
1989 //
1990 // Inputs:
1991 // c_rarg0 - source array address
1992 // c_rarg1 - destination array address
1993 // c_rarg2 - element count, treated as ssize_t, can be zero
1994 //
1995 // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1996 // let the hardware handle it. The two or four words within dwords
1997 // or qwords that span cache line boundaries will still be loaded
1998 // and stored atomically.
1999 //
2000 address StubGenerator::generate_conjoint_short_copy(address nooverlap_target, address *entry) {
2001 StubId stub_id = StubId::stubgen_jshort_arraycopy_id;
2002 // aligned is always false -- x86_64 always uses the unaligned code
2003 const bool aligned = false;
2004 #ifdef COMPILER2
2005 if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) {
2006 return generate_conjoint_copy_avx3_masked(stub_id, entry, nooverlap_target);
2007 }
2008 #endif // COMPILER2
2009 GrowableArray<address> entries;
2010 GrowableArray<address> extras;
2011 int expected_entry_count = (entry != nullptr ? 2 : 1);
2012 int expected_handler_count = (2 * UnsafeMemoryAccess::COLUMN_COUNT); // 2 x UMAM {start,end,handler}
2013 int entry_count = StubInfo::entry_count(stub_id);
2014 assert(entry_count == expected_entry_count, "sanity check");
2015 GrowableArray<address>* entries_ptr = (entry_count == 1 ? nullptr : &entries);
2016 address start = load_archive_data(stub_id, entries_ptr, &extras);
2017 if (start != nullptr) {
2018 assert(entries.length() == expected_entry_count - 1,
2019 "unexpected entry count %d", entries.length());
2020 assert(extras.length() == expected_handler_count,
2021 "unexpected handler addresses count %d", extras.length());
2022 if (entry != nullptr) {
2023 *entry = entries.at(0);
2024 }
2025 // restore 2 UMAM {start,end,handler} addresses from extras
2026 register_unsafe_access_handlers(extras, 0, 2);
2027 return start;
2028 }
2029 __ align(CodeEntryAlignment);
2030 StubCodeMark mark(this, stub_id);
2031 start = __ pc();
2032 DecoratorSet decorators = IN_HEAP | IS_ARRAY;
2033
2034 Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes;
2035 const Register from = rdi; // source array address
2036 const Register to = rsi; // destination array address
2037 const Register count = rdx; // elements count
2038 const Register word_count = rcx;
2039 const Register qword_count = count;
2040
2041 __ enter(); // required for proper stackwalking of RuntimeStub frame
2042 assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int.
2043
2044 if (entry != nullptr) {
2045 *entry = __ pc();
2046 entries.append(*entry);
2047 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2048 BLOCK_COMMENT("Entry:");
2049 }
2050
2051 array_overlap_test(nooverlap_target, Address::times_2);
2052 setup_arg_regs(); // from => rdi, to => rsi, count => rdx
2053 // r9 and r10 may be used to save non-volatile registers
2054
2055 {
2056 // UnsafeMemoryAccess page error: continue after unsafe access
2057 UnsafeMemoryAccessMark umam(this, !aligned, true);
2058 // 'from', 'to' and 'count' are now valid
2059 __ movptr(word_count, count);
2060 __ shrptr(count, 2); // count => qword_count
2061
2062 // Copy from high to low addresses. Use 'to' as scratch.
2063
2064 // Check for and copy trailing word
2065 __ testl(word_count, 1);
2066 __ jccb(Assembler::zero, L_copy_4_bytes);
2067 __ movw(rax, Address(from, word_count, Address::times_2, -2));
2068 __ movw(Address(to, word_count, Address::times_2, -2), rax);
2069
2070 // Check for and copy trailing dword
2071 __ BIND(L_copy_4_bytes);
2072 __ testl(word_count, 2);
2073 __ jcc(Assembler::zero, L_copy_bytes);
2074 __ movl(rax, Address(from, qword_count, Address::times_8));
2075 __ movl(Address(to, qword_count, Address::times_8), rax);
2076 __ jmp(L_copy_bytes);
2077
2078 // Copy trailing qwords
2079 __ BIND(L_copy_8_bytes);
2080 __ movq(rax, Address(from, qword_count, Address::times_8, -8));
2081 __ movq(Address(to, qword_count, Address::times_8, -8), rax);
2082 __ decrement(qword_count);
2083 __ jcc(Assembler::notZero, L_copy_8_bytes);
2084 }
2085 restore_arg_regs();
2086 INC_COUNTER_NP(SharedRuntime::_jshort_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
2087 __ xorptr(rax, rax); // return 0
2088 __ vzeroupper();
2089 __ leave(); // required for proper stackwalking of RuntimeStub frame
2090 __ ret(0);
2091
2092 {
2093 // UnsafeMemoryAccess page error: continue after unsafe access
2094 UnsafeMemoryAccessMark umam(this, !aligned, true);
2095 // Copy in multi-bytes chunks
2096 copy_bytes_backward(from, to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, T_SHORT);
2097 }
2098 restore_arg_regs();
2099 INC_COUNTER_NP(SharedRuntime::_jshort_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
2100 __ xorptr(rax, rax); // return 0
2101 __ vzeroupper();
2102 __ leave(); // required for proper stackwalking of RuntimeStub frame
2103 __ ret(0);
2104
2105 // retrieve the registered handler addresses
2106 address end = __ pc();
2107 retrieve_unsafe_access_handlers(start, end, extras);
2108 assert(extras.length() == expected_handler_count,
2109 "unexpected handler addresses count %d", extras.length());
2110
2111 // record the stub entry and end plus the no_push entry and any
2112 // extra handler addresses
2113 store_archive_data(stub_id, start, end, entries_ptr, &extras);
2114
2115 return start;
2116 }
2117
2118
2119 // Arguments:
// stub_id - unique id for stub to generate
// entry - location for return of (post-push) entry
// is_oop - (derived from stub_id) true => oop array, so generate store check code
2123 //
2124 // Inputs:
2125 // c_rarg0 - source array address
2126 // c_rarg1 - destination array address
2127 // c_rarg2 - element count, treated as ssize_t, can be zero
2128 //
2129 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
2130 // the hardware handle it. The two dwords within qwords that span
2131 // cache line boundaries will still be loaded and stored atomically.
2132 //
2133 // Side Effects:
// entry is set to the no-overlap entry point
2135 // used by generate_conjoint_int_oop_copy().
2136 //
2137 address StubGenerator::generate_disjoint_int_oop_copy(StubId stub_id, address* entry) {
2138 // aligned is always false -- x86_64 always uses the unaligned code
2139 const bool aligned = false;
2140 bool is_oop;
2141 bool dest_uninitialized;
2142 switch (stub_id) {
2143 case StubId::stubgen_jint_disjoint_arraycopy_id:
2144 is_oop = false;
2145 dest_uninitialized = false;
2146 break;
2147 case StubId::stubgen_oop_disjoint_arraycopy_id:
2148 assert(UseCompressedOops, "inconsistent oop copy size!");
2149 is_oop = true;
2150 dest_uninitialized = false;
2151 break;
2152 case StubId::stubgen_oop_disjoint_arraycopy_uninit_id:
2153 assert(UseCompressedOops, "inconsistent oop copy size!");
2154 is_oop = true;
2155 dest_uninitialized = true;
2156 break;
2157 default:
2158 ShouldNotReachHere();
2159 }
2160
2161 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2162 #ifdef COMPILER2
2163 if ((!is_oop || bs->supports_avx3_masked_arraycopy()) && VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) {
2164 return generate_disjoint_copy_avx3_masked(stub_id, entry);
2165 }
2166 #endif // COMPILER2
2167 GrowableArray<address> entries;
2168 GrowableArray<address> extras;
2169 bool add_handlers = !is_oop && !aligned;
2170 bool add_relocs = UseZGC && is_oop;
2171 bool add_extras = add_handlers || add_relocs;
2172 int expected_entry_count = (entry != nullptr ? 2 : 1);
2173 int expected_handler_count = (add_handlers ? 2 : 0) * UnsafeMemoryAccess::COLUMN_COUNT; // 0/2 x UMAM {start,end,handler}
2174 int entry_count = StubInfo::entry_count(stub_id);
2175 assert(entry_count == expected_entry_count, "sanity check");
2176 GrowableArray<address>* entries_ptr = (entry_count == 1 ? nullptr : &entries);
2177 GrowableArray<address>* extras_ptr = (add_extras ? &extras : nullptr);
2178 address start = load_archive_data(stub_id, entries_ptr, extras_ptr);
2179 if (start != nullptr) {
2180 assert(entries.length() == expected_entry_count - 1,
2181 "unexpected entry count %d", entries.length());
2182 assert(!add_handlers || extras.length() == expected_handler_count,
2183 "unexpected handler addresses count %d", extras.length());
2184 if (entry != nullptr) {
2185 *entry = entries.at(0);
2186 }
2187 if (add_handlers) {
2188 // restore 2 UMAM {start,end,handler} addresses from extras
2189 register_unsafe_access_handlers(extras, 0, 2);
2190 }
2191 #if INCLUDE_ZGC
2192 // register addresses at which ZGC does colour patching
2193 if (add_relocs) {
2194 register_reloc_addresses(extras, 0, extras.length());
2195 }
2196 #endif // INCLUDE_ZGC
2197 return start;
2198 }
2199
2200 __ align(CodeEntryAlignment);
2201 StubCodeMark mark(this, stub_id);
2202 start = __ pc();
2203
2204 Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_exit;
2205 const Register from = rdi; // source array address
2206 const Register to = rsi; // destination array address
2207 const Register count = rdx; // elements count
2208 const Register dword_count = rcx;
2209 const Register qword_count = count;
2210 const Register end_from = from; // source array end address
2211 const Register end_to = to; // destination array end address
2212 // End pointers are inclusive, and if count is not zero they point
2213 // to the last unit copied: end_to[0] := end_from[0]
2214
2215 __ enter(); // required for proper stackwalking of RuntimeStub frame
2216 assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int.
2217
2218 if (entry != nullptr) {
2219 *entry = __ pc();
2220 entries.append(*entry);
2221 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2222 BLOCK_COMMENT("Entry:");
2223 }
2224
2225 setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
2226 // r9 is used to save r15_thread
2227
2228 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
2229 if (dest_uninitialized) {
2230 decorators |= IS_DEST_UNINITIALIZED;
2231 }
2232 if (aligned) {
2233 decorators |= ARRAYCOPY_ALIGNED;
2234 }
2235
2236 BasicType type = is_oop ? T_OBJECT : T_INT;
2237 bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
2238
2239 {
2240 // UnsafeMemoryAccess page error: continue after unsafe access
2241 UnsafeMemoryAccessMark umam(this, add_handlers, true);
2242 // 'from', 'to' and 'count' are now valid
2243 __ movptr(dword_count, count);
2244 __ shrptr(count, 1); // count => qword_count
2245
2246 // Copy from low to high addresses. Use 'to' as scratch.
2247 __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
2248 __ lea(end_to, Address(to, qword_count, Address::times_8, -8));
2249 __ negptr(qword_count);
2250 __ jmp(L_copy_bytes);
2251
2252 // Copy trailing qwords
2253 __ BIND(L_copy_8_bytes);
2254 __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
2255 __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
2256 __ increment(qword_count);
2257 __ jcc(Assembler::notZero, L_copy_8_bytes);
2258
2259 // Check for and copy trailing dword
2260 __ BIND(L_copy_4_bytes);
2261 __ testl(dword_count, 1); // Only byte test since the value is 0 or 1
2262 __ jccb(Assembler::zero, L_exit);
2263 __ movl(rax, Address(end_from, 8));
2264 __ movl(Address(end_to, 8), rax);
2265 }
2266 __ BIND(L_exit);
2267 address ucme_exit_pc = __ pc();
2268 bs->arraycopy_epilogue(_masm, decorators, type, from, to, dword_count);
2269 restore_arg_regs_using_thread();
2270 INC_COUNTER_NP(SharedRuntime::_jint_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
2271 __ vzeroupper();
2272 __ xorptr(rax, rax); // return 0
2273 __ leave(); // required for proper stackwalking of RuntimeStub frame
2274 __ ret(0);
2275
2276 {
2277 UnsafeMemoryAccessMark umam(this, add_handlers, false, ucme_exit_pc);
2278 // Copy in multi-bytes chunks
2279 copy_bytes_forward(end_from, end_to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, is_oop ? T_OBJECT : T_INT);
2280 __ jmp(L_copy_4_bytes);
2281 }
2282
2283 // retrieve the registered handler addresses
2284 address end = __ pc();
2285 if (add_handlers) {
2286 retrieve_unsafe_access_handlers(start, end, extras);
2287 }
2288 assert(extras.length() == expected_handler_count,
2289 "unexpected handler addresses count %d", extras.length());
2290 #if INCLUDE_ZGC
2291 // retrieve addresses at which ZGC does colour patching
2292 if (add_relocs) {
2293 retrieve_reloc_addresses(start, end, extras);
2294 }
2295 #endif // INCLUDE_ZGC
2296
2297 // record the stub entry and end plus the no_push entry and any
2298 // extra handler addresses
2299 store_archive_data(stub_id, start, end, entries_ptr, extras_ptr);
2300
2301 return start;
2302 }
2303
2304
2305 // Arguments:
// stub_id - unique id for stub to generate
// entry - location for return of (post-push) entry
// nooverlap_target - entry to branch to if no overlap detected
// is_oop - (derived from stub_id) true => oop array, so generate store check code
2309 //
2310 // Inputs:
2311 // c_rarg0 - source array address
2312 // c_rarg1 - destination array address
2313 // c_rarg2 - element count, treated as ssize_t, can be zero
2314 //
2315 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
2316 // the hardware handle it. The two dwords within qwords that span
2317 // cache line boundaries will still be loaded and stored atomically.
2318 //
2319 address StubGenerator::generate_conjoint_int_oop_copy(StubId stub_id, address nooverlap_target, address *entry) {
2320 // aligned is always false -- x86_64 always uses the unaligned code
2321 const bool aligned = false;
2322 bool is_oop;
2323 bool dest_uninitialized;
2324 switch (stub_id) {
2325 case StubId::stubgen_jint_arraycopy_id:
2326 is_oop = false;
2327 dest_uninitialized = false;
2328 break;
2329 case StubId::stubgen_oop_arraycopy_id:
2330 assert(UseCompressedOops, "inconsistent oop copy size!");
2331 is_oop = true;
2332 dest_uninitialized = false;
2333 break;
2334 case StubId::stubgen_oop_arraycopy_uninit_id:
2335 assert(UseCompressedOops, "inconsistent oop copy size!");
2336 is_oop = true;
2337 dest_uninitialized = true;
2338 break;
2339 default:
2340 ShouldNotReachHere();
2341 }
2342
2343 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2344 #ifdef COMPILER2
2345 if ((!is_oop || bs->supports_avx3_masked_arraycopy()) && VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) {
2346 return generate_conjoint_copy_avx3_masked(stub_id, entry, nooverlap_target);
2347 }
2348 #endif // COMPILER2
2349 bool add_handlers = !is_oop && !aligned;
2350 bool add_relocs = UseZGC && is_oop;
2351 bool add_extras = add_handlers || add_relocs;
2352 GrowableArray<address> entries;
2353 GrowableArray<address> extras;
2354 int expected_entry_count = (entry != nullptr ? 2 : 1);
2355 int expected_handler_count = (add_handlers ? 2 : 0) * UnsafeMemoryAccess::COLUMN_COUNT; // 0/2 x UMAM {start,end,handler}
2356 int entry_count = StubInfo::entry_count(stub_id);
2357 assert(entry_count == expected_entry_count, "sanity check");
2358 GrowableArray<address>* entries_ptr = (entry_count == 1 ? nullptr : &entries);
2359 GrowableArray<address>* extras_ptr = (add_extras ? &extras : nullptr);
2360 address start = load_archive_data(stub_id, entries_ptr, extras_ptr);
2361 if (start != nullptr) {
2362 assert(entries.length() == expected_entry_count - 1,
2363 "unexpected entry count %d", entries.length());
2364 assert(!add_handlers || extras.length() == expected_handler_count,
2365 "unexpected handler addresses count %d", extras.length());
2366 if (entry != nullptr) {
2367 *entry = entries.at(0);
2368 }
2369 if (add_handlers) {
2370 // restore 2 UMAM {start,end,handler} addresses from extras
2371 register_unsafe_access_handlers(extras, 0, 2);
2372 }
2373 #if INCLUDE_ZGC
2374 // register addresses at which ZGC does colour patching
2375 if (add_relocs) {
2376 register_reloc_addresses(extras, 6, extras.length());
2377 }
2378 #endif // INCLUDE_ZGC
2379 return start;
2380 }
2381
2382 __ align(CodeEntryAlignment);
2383 StubCodeMark mark(this, stub_id);
2384 start = __ pc();
2385
2386 Label L_copy_bytes, L_copy_8_bytes, L_exit;
2387 const Register from = rdi; // source array address
2388 const Register to = rsi; // destination array address
2389 const Register count = rdx; // elements count
2390 const Register dword_count = rcx;
2391 const Register qword_count = count;
2392
2393 __ enter(); // required for proper stackwalking of RuntimeStub frame
2394 assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int.
2395
2396 if (entry != nullptr) {
2397 *entry = __ pc();
2398 entries.append(*entry);
2399 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2400 BLOCK_COMMENT("Entry:");
2401 }
2402
2403 array_overlap_test(nooverlap_target, Address::times_4);
2404 setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
2405 // r9 is used to save r15_thread
2406
2407 DecoratorSet decorators = IN_HEAP | IS_ARRAY;
2408 if (dest_uninitialized) {
2409 decorators |= IS_DEST_UNINITIALIZED;
2410 }
2411 if (aligned) {
2412 decorators |= ARRAYCOPY_ALIGNED;
2413 }
2414
2415 BasicType type = is_oop ? T_OBJECT : T_INT;
2416 // no registers are destroyed by this call
2417 bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
2418
2419 assert_clean_int(count, rax); // Make sure 'count' is clean int.
2420 {
2421 // UnsafeMemoryAccess page error: continue after unsafe access
2422 UnsafeMemoryAccessMark umam(this, add_handlers, true);
2423 // 'from', 'to' and 'count' are now valid
2424 __ movptr(dword_count, count);
2425 __ shrptr(count, 1); // count => qword_count
2426
2427 // Copy from high to low addresses. Use 'to' as scratch.
2428
2429 // Check for and copy trailing dword
2430 __ testl(dword_count, 1);
2431 __ jcc(Assembler::zero, L_copy_bytes);
2432 __ movl(rax, Address(from, dword_count, Address::times_4, -4));
2433 __ movl(Address(to, dword_count, Address::times_4, -4), rax);
2434 __ jmp(L_copy_bytes);
2435
2436 // Copy trailing qwords
2437 __ BIND(L_copy_8_bytes);
2438 __ movq(rax, Address(from, qword_count, Address::times_8, -8));
2439 __ movq(Address(to, qword_count, Address::times_8, -8), rax);
2440 __ decrement(qword_count);
2441 __ jcc(Assembler::notZero, L_copy_8_bytes);
2442 }
2443 if (is_oop) {
2444 __ jmp(L_exit);
2445 }
2446 restore_arg_regs_using_thread();
2447 INC_COUNTER_NP(SharedRuntime::_jint_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
2448 __ xorptr(rax, rax); // return 0
2449 __ vzeroupper();
2450 __ leave(); // required for proper stackwalking of RuntimeStub frame
2451 __ ret(0);
2452
2453 {
2454 // UnsafeMemoryAccess page error: continue after unsafe access
2455 UnsafeMemoryAccessMark umam(this, add_handlers, true);
2456 // Copy in multi-bytes chunks
2457 copy_bytes_backward(from, to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, is_oop ? T_OBJECT : T_INT);
2458 }
2459
2460 __ BIND(L_exit);
2461 bs->arraycopy_epilogue(_masm, decorators, type, from, to, dword_count);
2462 restore_arg_regs_using_thread();
2463 INC_COUNTER_NP(SharedRuntime::_jint_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
2464 __ xorptr(rax, rax); // return 0
2465 __ vzeroupper();
2466 __ leave(); // required for proper stackwalking of RuntimeStub frame
2467 __ ret(0);
2468
2469 // retrieve the registered handler addresses
2470 address end = __ pc();
2471 if (add_handlers) {
2472 retrieve_unsafe_access_handlers(start, end, extras);
2473 }
2474 assert(extras.length() == expected_handler_count,
2475 "unexpected handler addresses count %d", extras.length());
2476 #if INCLUDE_ZGC
2477 // retrieve addresses at which ZGC does colour patching
2478 if (add_relocs) {
2479 retrieve_reloc_addresses(start, end, extras);
2480 }
2481 #endif // INCLUDE_ZGC
2482 // record the stub entry and end plus the no_push entry and any
2483 // extra handler addresses
2484 store_archive_data(stub_id, start, end, entries_ptr, extras_ptr);
2485
2486 return start;
2487 }
2488
2489
2490 // Arguments:
2491 // entry - location for return of (post-push) entry
2492 //
2493 // Inputs:
2494 // c_rarg0 - source array address
2495 // c_rarg1 - destination array address
2496 // c_rarg2 - element count, treated as ssize_t, can be zero
2497 //
2498 // Side Effects:
2499 // disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
2500 // no-overlap entry point used by generate_conjoint_long_oop_copy().
2501 //
2502 address StubGenerator::generate_disjoint_long_oop_copy(StubId stub_id, address *entry) {
2503 // aligned is always false -- x86_64 always uses the unaligned code
2504 const bool aligned = false;
2505 bool is_oop;
2506 bool dest_uninitialized;
2507 switch (stub_id) {
2508 case StubId::stubgen_jlong_disjoint_arraycopy_id:
2509 is_oop = false;
2510 dest_uninitialized = false;
2511 break;
2512 case StubId::stubgen_oop_disjoint_arraycopy_id:
2513 assert(!UseCompressedOops, "inconsistent oop copy size!");
2514 is_oop = true;
2515 dest_uninitialized = false;
2516 break;
2517 case StubId::stubgen_oop_disjoint_arraycopy_uninit_id:
2518 assert(!UseCompressedOops, "inconsistent oop copy size!");
2519 is_oop = true;
2520 dest_uninitialized = true;
2521 break;
2522 default:
2523 ShouldNotReachHere();
2524 }
2525
2526 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2527 #ifdef COMPILER2
2528 if ((!is_oop || bs->supports_avx3_masked_arraycopy()) && VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) {
2529 return generate_disjoint_copy_avx3_masked(stub_id, entry);
2530 }
2531 #endif // COMPILER2
2532 bool add_handlers = !is_oop && !aligned;
2533 bool add_relocs = UseZGC && is_oop;
2534 bool add_extras = add_handlers || add_relocs;
2535 GrowableArray<address> entries;
2536 GrowableArray<address> extras;
2537 int expected_entry_count = (entry != nullptr ? 2 : 1);
2538 int expected_handler_count = (add_handlers ? 2 : 0) * UnsafeMemoryAccess::COLUMN_COUNT; // 0/2 x UMAM {start,end,handler}
2539 int entry_count = StubInfo::entry_count(stub_id);
2540 assert(entry_count == expected_entry_count, "sanity check");
2541 GrowableArray<address>* entries_ptr = (entry_count == 1 ? nullptr : &entries);
2542 GrowableArray<address>* extras_ptr = (add_extras ? &extras : nullptr);
2543 address start = load_archive_data(stub_id, entries_ptr, extras_ptr);
2544 if (start != nullptr) {
2545 assert(entries.length() == expected_entry_count - 1,
2546 "unexpected entry count %d", entries.length());
2547 assert(!add_handlers || extras.length() == expected_handler_count,
2548 "unexpected handler addresses count %d", extras.length());
2549 if (entry != nullptr) {
2550 *entry = entries.at(0);
2551 }
2552 if (add_handlers) {
2553 // restore 2 UMAM {start,end,handler} addresses from extras
2554 register_unsafe_access_handlers(extras, 0, 2);
2555 }
2556 #if INCLUDE_ZGC
2557 // register addresses at which ZGC does colour patching
2558 if (add_relocs) {
2559 register_reloc_addresses(extras, 0, extras.length());
2560 }
2561 #endif // INCLUDE_ZGC
2562 return start;
2563 }
2564
2565 __ align(CodeEntryAlignment);
2566 StubCodeMark mark(this, stub_id);
2567 start = __ pc();
2568
2569 Label L_copy_bytes, L_copy_8_bytes, L_exit;
2570 const Register from = rdi; // source array address
2571 const Register to = rsi; // destination array address
2572 const Register qword_count = rdx; // elements count
2573 const Register end_from = from; // source array end address
2574 const Register end_to = rcx; // destination array end address
2575 const Register saved_count = r11;
2576 // End pointers are inclusive, and if count is not zero they point
2577 // to the last unit copied: end_to[0] := end_from[0]
2578
2579 __ enter(); // required for proper stackwalking of RuntimeStub frame
2580 // Save no-overlap entry point for generate_conjoint_long_oop_copy()
2581 assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int.
2582
2583 if (entry != nullptr) {
2584 *entry = __ pc();
2585 entries.append(*entry);
2586 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2587 BLOCK_COMMENT("Entry:");
2588 }
2589
2590 setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
2591 // r9 is used to save r15_thread
2592 // 'from', 'to' and 'qword_count' are now valid
2593
2594 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
2595 if (dest_uninitialized) {
2596 decorators |= IS_DEST_UNINITIALIZED;
2597 }
2598 if (aligned) {
2599 decorators |= ARRAYCOPY_ALIGNED;
2600 }
2601
2602 BasicType type = is_oop ? T_OBJECT : T_LONG;
2603 bs->arraycopy_prologue(_masm, decorators, type, from, to, qword_count);
2604 {
2605 // UnsafeMemoryAccess page error: continue after unsafe access
2606 UnsafeMemoryAccessMark umam(this, add_handlers, true);
2607
2608 // Copy from low to high addresses. Use 'to' as scratch.
2609 __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
2610 __ lea(end_to, Address(to, qword_count, Address::times_8, -8));
2611 __ negptr(qword_count);
2612 __ jmp(L_copy_bytes);
2613
2614 // Copy trailing qwords
2615 __ BIND(L_copy_8_bytes);
2616 bs->copy_load_at(_masm, decorators, type, 8,
2617 rax, Address(end_from, qword_count, Address::times_8, 8),
2618 r10);
2619 bs->copy_store_at(_masm, decorators, type, 8,
2620 Address(end_to, qword_count, Address::times_8, 8), rax,
2621 r10);
2622 __ increment(qword_count);
2623 __ jcc(Assembler::notZero, L_copy_8_bytes);
2624 }
2625 if (is_oop) {
2626 __ jmp(L_exit);
2627 } else {
2628 restore_arg_regs_using_thread();
2629 INC_COUNTER_NP(SharedRuntime::_jlong_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
2630 __ xorptr(rax, rax); // return 0
2631 __ vzeroupper();
2632 __ leave(); // required for proper stackwalking of RuntimeStub frame
2633 __ ret(0);
2634 }
2635
2636 {
2637 // UnsafeMemoryAccess page error: continue after unsafe access
2638 UnsafeMemoryAccessMark umam(this, add_handlers, true);
2639 // Copy in multi-bytes chunks
2640 copy_bytes_forward(end_from, end_to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, is_oop ? T_OBJECT : T_LONG);
2641 }
2642
2643 __ BIND(L_exit);
2644 bs->arraycopy_epilogue(_masm, decorators, type, from, to, qword_count);
2645 restore_arg_regs_using_thread();
2646 INC_COUNTER_NP(is_oop ? SharedRuntime::_oop_array_copy_ctr :
2647 SharedRuntime::_jlong_array_copy_ctr,
2648 rscratch1); // Update counter after rscratch1 is free
2649 __ vzeroupper();
2650 __ xorptr(rax, rax); // return 0
2651 __ leave(); // required for proper stackwalking of RuntimeStub frame
2652 __ ret(0);
2653
2654 // retrieve the registered handler addresses
2655 address end = __ pc();
2656 if (add_handlers) {
2657 retrieve_unsafe_access_handlers(start, end, extras);
2658 }
2659 assert(extras.length() == expected_handler_count,
2660 "unexpected handler addresses count %d", extras.length());
2661 #if INCLUDE_ZGC
2662 // retrieve addresses at which ZGC does colour patching
2663 if (add_relocs) {
2664 retrieve_reloc_addresses(start, end, extras);
2665 }
2666 #endif // INCLUDE_ZGC
2667 // record the stub entry and end plus the no_push entry and any
2668 // extra handler addresses
2669 store_archive_data(stub_id, start, end, entries_ptr, extras_ptr);
2670
2671 return start;
2672 }
2673
2674
2675 // Arguments:
2676 // entry - location for return of (post-push) entry
2677 // nooverlap_target - entry to branch to if no overlap detected
2678 // is_oop - true => oop array, so generate store check code
2679 //
2680 // Inputs:
2681 // c_rarg0 - source array address
2682 // c_rarg1 - destination array address
2683 // c_rarg2 - element count, treated as ssize_t, can be zero
2684 //
2685 address StubGenerator::generate_conjoint_long_oop_copy(StubId stub_id, address nooverlap_target, address *entry) {
2686 // aligned is always false -- x86_64 always uses the unaligned code
2687 const bool aligned = false;
2688 bool is_oop;
2689 bool dest_uninitialized;
2690 switch (stub_id) {
2691 case StubId::stubgen_jlong_arraycopy_id:
2692 is_oop = false;
2693 dest_uninitialized = false;
2694 break;
2695 case StubId::stubgen_oop_arraycopy_id:
2696 assert(!UseCompressedOops, "inconsistent oop copy size!");
2697 is_oop = true;
2698 dest_uninitialized = false;
2699 break;
2700 case StubId::stubgen_oop_arraycopy_uninit_id:
2701 assert(!UseCompressedOops, "inconsistent oop copy size!");
2702 is_oop = true;
2703 dest_uninitialized = true;
2704 break;
2705 default:
2706 ShouldNotReachHere();
2707 }
2708
2709 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2710 #ifdef COMPILER2
2711 if ((!is_oop || bs->supports_avx3_masked_arraycopy()) && VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) {
2712 return generate_conjoint_copy_avx3_masked(stub_id, entry, nooverlap_target);
2713 }
2714 #endif // COMPILER2
2715 bool add_handlers = !is_oop && !aligned;
2716 bool add_relocs = UseZGC && is_oop;
2717 bool add_extras = add_handlers || add_relocs;
2718 GrowableArray<address> entries;
2719 GrowableArray<address> extras;
2720 int expected_entry_count = (entry != nullptr ? 2 : 1);
2721 int expected_handler_count = (add_handlers ? 2 : 0) * UnsafeMemoryAccess::COLUMN_COUNT; // 0/2 x UMAM {start,end,handler}
2722 int entry_count = StubInfo::entry_count(stub_id);
2723 assert(entry_count == expected_entry_count, "sanity check");
2724 GrowableArray<address>* entries_ptr = (entry_count == 1 ? nullptr : &entries);
2725 GrowableArray<address>* extras_ptr = (add_extras ? &extras : nullptr);
2726 address start = load_archive_data(stub_id, entries_ptr, extras_ptr);
2727 if (start != nullptr) {
2728 assert(entries.length() == expected_entry_count - 1,
2729 "unexpected entry count %d", entries.length());
2730 assert(!add_handlers || extras.length() == expected_handler_count,
2731 "unexpected handler addresses count %d", extras.length());
2732 if (entry != nullptr) {
2733 *entry = entries.at(0);
2734 }
2735 if (add_handlers) {
2736 // restore 2 UMAM {start,end,handler} addresses from extras
2737 register_unsafe_access_handlers(extras, 0, 2);
2738 }
2739 #if INCLUDE_ZGC
2740 // register addresses at which ZGC does colour patching
2741 if (add_relocs) {
2742 register_reloc_addresses(extras, 0, extras.length());
2743 }
2744 #endif // INCLUDE_ZGC
2745 return start;
2746 }
2747
2748 __ align(CodeEntryAlignment);
2749 StubCodeMark mark(this, stub_id);
2750 start = __ pc();
2751
2752 Label L_copy_bytes, L_copy_8_bytes, L_exit;
2753 const Register from = rdi; // source array address
2754 const Register to = rsi; // destination array address
2755 const Register qword_count = rdx; // elements count
2756 const Register saved_count = rcx;
2757
2758 __ enter(); // required for proper stackwalking of RuntimeStub frame
2759 assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int.
2760
2761 if (entry != nullptr) {
2762 *entry = __ pc();
2763 entries.append(*entry);
2764 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2765 BLOCK_COMMENT("Entry:");
2766 }
2767
2768 array_overlap_test(nooverlap_target, Address::times_8);
2769 setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
2770 // r9 is used to save r15_thread
2771 // 'from', 'to' and 'qword_count' are now valid
2772
2773 DecoratorSet decorators = IN_HEAP | IS_ARRAY;
2774 if (dest_uninitialized) {
2775 decorators |= IS_DEST_UNINITIALIZED;
2776 }
2777 if (aligned) {
2778 decorators |= ARRAYCOPY_ALIGNED;
2779 }
2780
2781 BasicType type = is_oop ? T_OBJECT : T_LONG;
2782 bs->arraycopy_prologue(_masm, decorators, type, from, to, qword_count);
2783 {
2784 // UnsafeMemoryAccess page error: continue after unsafe access
2785 UnsafeMemoryAccessMark umam(this, add_handlers, true);
2786
2787 __ jmp(L_copy_bytes);
2788
2789 // Copy trailing qwords
2790 __ BIND(L_copy_8_bytes);
2791 bs->copy_load_at(_masm, decorators, type, 8,
2792 rax, Address(from, qword_count, Address::times_8, -8),
2793 r10);
2794 bs->copy_store_at(_masm, decorators, type, 8,
2795 Address(to, qword_count, Address::times_8, -8), rax,
2796 r10);
2797 __ decrement(qword_count);
2798 __ jcc(Assembler::notZero, L_copy_8_bytes);
2799 }
2800 if (is_oop) {
2801 __ jmp(L_exit);
2802 } else {
2803 restore_arg_regs_using_thread();
2804 INC_COUNTER_NP(SharedRuntime::_jlong_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
2805 __ xorptr(rax, rax); // return 0
2806 __ vzeroupper();
2807 __ leave(); // required for proper stackwalking of RuntimeStub frame
2808 __ ret(0);
2809 }
2810 {
2811 // UnsafeMemoryAccess page error: continue after unsafe access
2812 UnsafeMemoryAccessMark umam(this, add_handlers, true);
2813
2814 // Copy in multi-bytes chunks
2815 copy_bytes_backward(from, to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, is_oop ? T_OBJECT : T_LONG);
2816 }
2817 __ BIND(L_exit);
2818 bs->arraycopy_epilogue(_masm, decorators, type, from, to, qword_count);
2819 restore_arg_regs_using_thread();
2820 INC_COUNTER_NP(is_oop ? SharedRuntime::_oop_array_copy_ctr :
2821 SharedRuntime::_jlong_array_copy_ctr,
2822 rscratch1); // Update counter after rscratch1 is free
2823 __ vzeroupper();
2824 __ xorptr(rax, rax); // return 0
2825 __ leave(); // required for proper stackwalking of RuntimeStub frame
2826 __ ret(0);
2827
2828
2829 // retrieve the registered handler addresses
2830 address end = __ pc();
2831 if (add_handlers) {
2832 retrieve_unsafe_access_handlers(start, end, extras);
2833 }
2834 assert(extras.length() == expected_handler_count,
2835 "unexpected handler addresses count %d", extras.length());
2836 #if INCLUDE_ZGC
2837 // retrieve addresses at which ZGC does colour patching
2838 if ((UseZGC && is_oop)) {
2839 retrieve_reloc_addresses(start, end, extras);
2840 }
2841 #endif // INCLUDE_ZGC
2842 // record the stub entry and end plus the no_push entry and any
2843 // extra handler addresses
2844 store_archive_data(stub_id, start, end, entries_ptr, extras_ptr);
2845
2846 return start;
2847 }
2848
2849
2850 // Helper for generating a dynamic type check.
2851 // Smashes no registers.
2852 void StubGenerator::generate_type_check(Register sub_klass,
2853 Register super_check_offset,
2854 Register super_klass,
2855 Label& L_success) {
2856 assert_different_registers(sub_klass, super_check_offset, super_klass);
2857
2858 BLOCK_COMMENT("type_check:");
2859
2860 Label L_miss;
2861
2862 __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, nullptr,
2863 super_check_offset);
2864 __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, nullptr);
2865
2866 // Fall through on failure!
2867 __ BIND(L_miss);
2868 }
2869
2870 //
2871 // Generate checkcasting array copy stub
2872 //
2873 // Input:
2874 // c_rarg0 - source array address
2875 // c_rarg1 - destination array address
2876 // c_rarg2 - element count, treated as ssize_t, can be zero
2877 // c_rarg3 - size_t ckoff (super_check_offset)
2878 // not Win64
2879 // c_rarg4 - oop ckval (super_klass)
2880 // Win64
2881 // rsp+40 - oop ckval (super_klass)
2882 //
2883 // Output:
2884 // rax == 0 - success
2885 // rax == -1^K - failure, where K is partial transfer count
2886 //
2887 address StubGenerator::generate_checkcast_copy(StubId stub_id, address *entry) {
2888
2889 bool dest_uninitialized;
2890 switch (stub_id) {
2891 case StubId::stubgen_checkcast_arraycopy_id:
2892 dest_uninitialized = false;
2893 break;
2894 case StubId::stubgen_checkcast_arraycopy_uninit_id:
2895 dest_uninitialized = true;
2896 break;
2897 default:
2898 ShouldNotReachHere();
2899 }
2900
2901 GrowableArray<address> entries;
2902 GrowableArray<address> extras;
2903 int expected_entry_count = (entry != nullptr ? 2 : 1);
2904 int entry_count = StubInfo::entry_count(stub_id);
2905 assert(entry_count == expected_entry_count, "sanity check");
2906 GrowableArray<address>* entries_ptr = (entry_count == 1 ? nullptr : &entries);
2907 GrowableArray<address>* extras_ptr = (UseZGC ? &extras : nullptr);
2908 address start = load_archive_data(stub_id, entries_ptr, extras_ptr);
2909 if (start != nullptr) {
2910 assert(entries.length() == expected_entry_count - 1,
2911 "unexpected addresses count %d", entries.length());
2912 if (entry != nullptr) {
2913 *entry = entries.at(0);
2914 }
2915 #if INCLUDE_ZGC
2916 if (UseZGC) {
2917 register_reloc_addresses(extras, 0, extras.length());
2918 }
2919 #endif // INCLUDE_ZGC
2920 return start;
2921 }
2922
2923 Label L_load_element, L_store_element, L_do_card_marks, L_done;
2924
2925 // Input registers (after setup_arg_regs)
2926 const Register from = rdi; // source array address
2927 const Register to = rsi; // destination array address
2928 const Register length = rdx; // elements count
2929 const Register ckoff = rcx; // super_check_offset
2930 const Register ckval = r8; // super_klass
2931
2932 // Registers used as temps (r13, r14 are save-on-entry)
2933 const Register end_from = from; // source array end address
2934 const Register end_to = r13; // destination array end address
2935 const Register count = rdx; // -(count_remaining)
2936 const Register r14_length = r14; // saved copy of length
2937 // End pointers are inclusive, and if length is not zero they point
2938 // to the last unit copied: end_to[0] := end_from[0]
2939
2940 const Register rax_oop = rax; // actual oop copied
2941 const Register r11_klass = r11; // oop._klass
2942
2943 //---------------------------------------------------------------
2944 // Assembler stub will be used for this call to arraycopy
2945 // if the two arrays are subtypes of Object[] but the
2946 // destination array type is not equal to or a supertype
2947 // of the source type. Each element must be separately
2948 // checked.
2949
2950 __ align(CodeEntryAlignment);
2951 StubCodeMark mark(this, stub_id);
2952 start = __ pc();
2953
2954 __ enter(); // required for proper stackwalking of RuntimeStub frame
2955
2956 #ifdef ASSERT
2957 // caller guarantees that the arrays really are different
2958 // otherwise, we would have to make conjoint checks
2959 { Label L;
2960 array_overlap_test(L, TIMES_OOP);
2961 __ stop("checkcast_copy within a single array");
2962 __ bind(L);
2963 }
2964 #endif //ASSERT
2965
2966 setup_arg_regs_using_thread(4); // from => rdi, to => rsi, length => rdx
2967 // ckoff => rcx, ckval => r8
2968 // r9 is used to save r15_thread
2969 #ifdef _WIN64
2970 // last argument (#4) is on stack on Win64
2971 __ movptr(ckval, Address(rsp, 6 * wordSize));
2972 #endif
2973
2974 // Caller of this entry point must set up the argument registers.
2975 if (entry != nullptr) {
2976 *entry = __ pc();
2977 entries.append(*entry);
2978 BLOCK_COMMENT("Entry:");
2979 }
2980
2981 // allocate spill slots for r13, r14
2982 enum {
2983 saved_r13_offset,
2984 saved_r14_offset,
2985 saved_r10_offset,
2986 saved_rbp_offset
2987 };
2988 __ subptr(rsp, saved_rbp_offset * wordSize);
2989 __ movptr(Address(rsp, saved_r13_offset * wordSize), r13);
2990 __ movptr(Address(rsp, saved_r14_offset * wordSize), r14);
2991 __ movptr(Address(rsp, saved_r10_offset * wordSize), r10);
2992
2993 #ifdef ASSERT
2994 Label L2;
2995 __ get_thread_slow(r14);
2996 __ cmpptr(r15_thread, r14);
2997 __ jcc(Assembler::equal, L2);
2998 __ stop("StubRoutines::call_stub: r15_thread is modified by call");
2999 __ bind(L2);
3000 #endif // ASSERT
3001
3002 // check that int operands are properly extended to size_t
3003 assert_clean_int(length, rax);
3004 assert_clean_int(ckoff, rax);
3005
3006 #ifdef ASSERT
3007 BLOCK_COMMENT("assert consistent ckoff/ckval");
3008 // The ckoff and ckval must be mutually consistent,
3009 // even though caller generates both.
3010 { Label L;
3011 int sco_offset = in_bytes(Klass::super_check_offset_offset());
3012 __ cmpl(ckoff, Address(ckval, sco_offset));
3013 __ jcc(Assembler::equal, L);
3014 __ stop("super_check_offset inconsistent");
3015 __ bind(L);
3016 }
3017 #endif //ASSERT
3018
3019 // Loop-invariant addresses. They are exclusive end pointers.
3020 Address end_from_addr(from, length, TIMES_OOP, 0);
3021 Address end_to_addr(to, length, TIMES_OOP, 0);
3022 // Loop-variant addresses. They assume post-incremented count < 0.
3023 Address from_element_addr(end_from, count, TIMES_OOP, 0);
3024 Address to_element_addr(end_to, count, TIMES_OOP, 0);
3025
3026 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
3027 if (dest_uninitialized) {
3028 decorators |= IS_DEST_UNINITIALIZED;
3029 }
3030
3031 BasicType type = T_OBJECT;
3032 size_t element_size = UseCompressedOops ? 4 : 8;
3033
3034 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
3035 bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
3036
3037 // Copy from low to high addresses, indexed from the end of each array.
3038 __ lea(end_from, end_from_addr);
3039 __ lea(end_to, end_to_addr);
3040 __ movptr(r14_length, length); // save a copy of the length
3041 assert(length == count, ""); // else fix next line:
3042 __ negptr(count); // negate and test the length
3043 __ jcc(Assembler::notZero, L_load_element);
3044
3045 // Empty array: Nothing to do.
3046 __ xorptr(rax, rax); // return 0 on (trivial) success
3047 __ jmp(L_done);
3048
3049 // ======== begin loop ========
3050 // (Loop is rotated; its entry is L_load_element.)
3051 // Loop control:
3052 // for (count = -count; count != 0; count++)
3053 // Base pointers src, dst are biased by 8*(count-1),to last element.
3054 __ align(OptoLoopAlignment);
3055
3056 __ BIND(L_store_element);
3057 bs->copy_store_at(_masm,
3058 decorators,
3059 type,
3060 element_size,
3061 to_element_addr,
3062 rax_oop,
3063 r10);
3064 __ increment(count); // increment the count toward zero
3065 __ jcc(Assembler::zero, L_do_card_marks);
3066
3067 // ======== loop entry is here ========
3068 __ BIND(L_load_element);
3069 bs->copy_load_at(_masm,
3070 decorators,
3071 type,
3072 element_size,
3073 rax_oop,
3074 from_element_addr,
3075 r10);
3076 __ testptr(rax_oop, rax_oop);
3077 __ jcc(Assembler::zero, L_store_element);
3078
3079 __ load_klass(r11_klass, rax_oop, rscratch1);// query the object klass
3080 generate_type_check(r11_klass, ckoff, ckval, L_store_element);
3081 // ======== end loop ========
3082
3083 // It was a real error; we must depend on the caller to finish the job.
3084 // Register rdx = -1 * number of *remaining* oops, r14 = *total* oops.
3085 // Emit GC store barriers for the oops we have copied (r14 + rdx),
3086 // and report their number to the caller.
3087 assert_different_registers(rax, r14_length, count, to, end_to, rcx, rscratch1);
3088 Label L_post_barrier;
3089 __ addptr(r14_length, count); // K = (original - remaining) oops
3090 __ movptr(rax, r14_length); // save the value
3091 __ notptr(rax); // report (-1^K) to caller (does not affect flags)
3092 __ jccb(Assembler::notZero, L_post_barrier);
3093 __ jmp(L_done); // K == 0, nothing was copied, skip post barrier
3094
3095 // Come here on success only.
3096 __ BIND(L_do_card_marks);
3097 __ xorptr(rax, rax); // return 0 on success
3098
3099 __ BIND(L_post_barrier);
3100 bs->arraycopy_epilogue(_masm, decorators, type, from, to, r14_length);
3101
3102 // Common exit point (success or failure).
3103 __ BIND(L_done);
3104 __ movptr(r13, Address(rsp, saved_r13_offset * wordSize));
3105 __ movptr(r14, Address(rsp, saved_r14_offset * wordSize));
3106 __ movptr(r10, Address(rsp, saved_r10_offset * wordSize));
3107 restore_arg_regs_using_thread();
3108 INC_COUNTER_NP(SharedRuntime::_checkcast_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
3109 __ leave(); // required for proper stackwalking of RuntimeStub frame
3110 __ ret(0);
3111
3112 address end = __ pc();
3113 #if INCLUDE_ZGC
3114 // retrieve addresses at which ZGC does colour patching
3115 if (UseZGC) {
3116 retrieve_reloc_addresses(start, end, extras);
3117 }
3118 #endif // INCLUDE_ZGC
3119 // record the stub entry and end plus the no_push entry
3120 store_archive_data(stub_id, start, end, entries_ptr, extras_ptr);
3121
3122 return start;
3123 }
3124
3125
3126 // Generate 'unsafe' array copy stub
3127 // Though just as safe as the other stubs, it takes an unscaled
3128 // size_t argument instead of an element count.
3129 //
3130 // Input:
3131 // c_rarg0 - source array address
3132 // c_rarg1 - destination array address
3133 // c_rarg2 - byte count, treated as ssize_t, can be zero
3134 //
3135 // Examines the alignment of the operands and dispatches
3136 // to a long, int, short, or byte copy loop.
3137 //
3138 address StubGenerator::generate_unsafe_copy(address byte_copy_entry, address short_copy_entry,
3139 address int_copy_entry, address long_copy_entry) {
3140
3141 StubId stub_id = StubId::stubgen_unsafe_arraycopy_id;
3142 int entry_count = StubInfo::entry_count(stub_id);
3143 assert(entry_count == 1, "sanity check");
3144 address start = load_archive_data(stub_id);
3145 if (start != nullptr) {
3146 return start;
3147 }
3148
3149 Label L_long_aligned, L_int_aligned, L_short_aligned;
3150
3151 // Input registers (before setup_arg_regs)
3152 const Register from = c_rarg0; // source array address
3153 const Register to = c_rarg1; // destination array address
3154 const Register size = c_rarg2; // byte count (size_t)
3155
3156 // Register used as a temp
3157 const Register bits = rax; // test copy of low bits
3158
3159 __ align(CodeEntryAlignment);
3160 StubCodeMark mark(this, stub_id);
3161 start = __ pc();
3162
3163 __ enter(); // required for proper stackwalking of RuntimeStub frame
3164
3165 // bump this on entry, not on exit:
3166 INC_COUNTER_NP(SharedRuntime::_unsafe_array_copy_ctr, rscratch1);
3167
3168 __ mov(bits, from);
3169 __ orptr(bits, to);
3170 __ orptr(bits, size);
3171
3172 __ testb(bits, BytesPerLong-1);
3173 __ jccb(Assembler::zero, L_long_aligned);
3174
3175 __ testb(bits, BytesPerInt-1);
3176 __ jccb(Assembler::zero, L_int_aligned);
3177
3178 __ testb(bits, BytesPerShort-1);
3179 __ jump_cc(Assembler::notZero, RuntimeAddress(byte_copy_entry));
3180
3181 __ BIND(L_short_aligned);
3182 __ shrptr(size, LogBytesPerShort); // size => short_count
3183 __ jump(RuntimeAddress(short_copy_entry));
3184
3185 __ BIND(L_int_aligned);
3186 __ shrptr(size, LogBytesPerInt); // size => int_count
3187 __ jump(RuntimeAddress(int_copy_entry));
3188
3189 __ BIND(L_long_aligned);
3190 __ shrptr(size, LogBytesPerLong); // size => qword_count
3191 __ jump(RuntimeAddress(long_copy_entry));
3192
3193 // record the stub entry and end plus
3194 store_archive_data(stub_id, start, __ pc());
3195
3196 return start;
3197 }
3198
3199
// Store width selector for the do_setmemory_atomic_loop helper:
// one enumerator per supported chunk size.
enum USM_TYPE {
  USM_SHORT,     // 2-byte stores
  USM_DWORD,     // 4-byte stores
  USM_QUADWORD   // 8-byte stores
};
3202 // Helper for generate_unsafe_setmemory
3203 //
3204 // Atomically fill an array of memory using 2-, 4-, or 8-byte chunks
3205 static void do_setmemory_atomic_loop(USM_TYPE type, Register dest,
3206 Register size, Register wide_value,
3207 Register tmp, Label& L_exit,
3208 MacroAssembler *_masm) {
3209 Label L_Loop, L_Tail, L_TailLoop;
3210
3211 int shiftval = 0;
3212 int incr = 0;
3213
3214 switch (type) {
3215 case USM_SHORT:
3216 shiftval = 1;
3217 incr = 16;
3218 break;
3219 case USM_DWORD:
3220 shiftval = 2;
3221 incr = 32;
3222 break;
3223 case USM_QUADWORD:
3224 shiftval = 3;
3225 incr = 64;
3226 break;
3227 }
3228
3229 // At this point, we know the lower bits of size are zero
3230 __ shrq(size, shiftval);
3231 // size now has number of X-byte chunks (2, 4 or 8)
3232
3233 // Number of (8*X)-byte chunks into tmp
3234 __ movq(tmp, size);
3235 __ shrq(tmp, 3);
3236 __ jccb(Assembler::zero, L_Tail);
3237
3238 __ BIND(L_Loop);
3239
3240 // Unroll 8 stores
3241 for (int i = 0; i < 8; i++) {
3242 switch (type) {
3243 case USM_SHORT:
3244 __ movw(Address(dest, (2 * i)), wide_value);
3245 break;
3246 case USM_DWORD:
3247 __ movl(Address(dest, (4 * i)), wide_value);
3248 break;
3249 case USM_QUADWORD:
3250 __ movq(Address(dest, (8 * i)), wide_value);
3251 break;
3252 }
3253 }
3254 __ addq(dest, incr);
3255 __ decrementq(tmp);
3256 __ jccb(Assembler::notZero, L_Loop);
3257
3258 __ BIND(L_Tail);
3259
3260 // Find number of remaining X-byte chunks
3261 __ andq(size, 0x7);
3262
3263 // If zero, then we're done
3264 __ jccb(Assembler::zero, L_exit);
3265
3266 __ BIND(L_TailLoop);
3267
3268 switch (type) {
3269 case USM_SHORT:
3270 __ movw(Address(dest, 0), wide_value);
3271 break;
3272 case USM_DWORD:
3273 __ movl(Address(dest, 0), wide_value);
3274 break;
3275 case USM_QUADWORD:
3276 __ movq(Address(dest, 0), wide_value);
3277 break;
3278 }
3279 __ addq(dest, incr >> 3);
3280 __ decrementq(size);
3281 __ jccb(Assembler::notZero, L_TailLoop);
3282 }
3283
3284 // Generate 'unsafe' set memory stub
3285 // Though just as safe as the other stubs, it takes an unscaled
3286 // size_t (# bytes) argument instead of an element count.
3287 //
3288 // Input:
3289 // c_rarg0 - destination array address
3290 // c_rarg1 - byte count (size_t)
3291 // c_rarg2 - byte value
3292 //
3293 // Examines the alignment of the operands (destination address OR-ed
3294 // with the byte count) and dispatches to a quadword, dword or word
// fill loop; anything with the low bit set is handed to the byte fill
// stub passed in as unsafe_byte_fill. A zero byte count returns at once.
3295 //
3296 address StubGenerator::generate_unsafe_setmemory(address unsafe_byte_fill) {
3297 StubId stub_id = StubId::stubgen_unsafe_setmemory_id;
3298 int entry_count = StubInfo::entry_count(stub_id);
3299 assert(entry_count == 1, "sanity check");
3300 // we expect three sets of extra unsafe memory access handler entries
// (presumably one set per UnsafeMemoryAccessMark region generated below)
3301 GrowableArray<address> extras;
3302 int expected_handler_count = 3 * UnsafeMemoryAccess::COLUMN_COUNT;
// If load_archive_data hands back an existing stub, only its handler
// addresses are registered again and no code is generated.
3303 address start = load_archive_data(stub_id, nullptr, &extras);
3304 if (start != nullptr) {
3305 assert(extras.length() == expected_handler_count,
3306 "unexpected handler addresses count %d", extras.length());
3307 register_unsafe_access_handlers(extras, 0, 3);
3308 return start;
3309 }
3310
3311 __ align(CodeEntryAlignment);
3312 StubCodeMark mark(this, stub_id);
3313 start = __ pc();
3314 __ enter(); // required for proper stackwalking of RuntimeStub frame
3315
3316 assert(unsafe_byte_fill != nullptr, "Invalid call");
3317
3318 // bump this on entry, not on exit:
3319 INC_COUNTER_NP(SharedRuntime::_unsafe_set_memory_ctr, rscratch1);
3320
3321 {
3322 Label L_exit, L_fillQuadwords, L_fillDwords, L_fillBytes;
3323
3324 const Register dest = c_rarg0;
3325 const Register size = c_rarg1;
3326 const Register byteVal = c_rarg2;
3327 const Register wide_value = rax;
3328 const Register rScratch1 = r10;
3329
3330 assert_different_registers(dest, size, byteVal, wide_value, rScratch1);
3331
3332 // fill_to_memory_atomic(unsigned char*, unsigned long, unsigned char)
3333
// Nothing to store for a zero byte count
3334 __ testq(size, size);
3335 __ jcc(Assembler::zero, L_exit);
3336
3337 // Propagate byte to full Register: zero-extend it, then multiply by
// 0x0101010101010101 so each of the 8 bytes of wide_value holds it
3338 __ movzbl(rScratch1, byteVal);
3339 __ mov64(wide_value, 0x0101010101010101ULL);
3340 __ imulq(wide_value, rScratch1);
3341
3342 // Check for pointer & size alignment
// (OR-ing dest and size lets a single test cover both)
3343 __ movq(rScratch1, dest);
3344 __ orq(rScratch1, size);
3345
3346 __ testb(rScratch1, 7);
3347 __ jcc(Assembler::equal, L_fillQuadwords); // low 3 bits clear: 8-byte stores
3348
3349 __ testb(rScratch1, 3);
3350 __ jcc(Assembler::equal, L_fillDwords); // low 2 bits clear: 4-byte stores
3351
3352 __ testb(rScratch1, 1);
3353 __ jcc(Assembler::notEqual, L_fillBytes); // low bit set: use the byte fill stub
3354
3355 // Fill words (2-byte stores)
3356 {
3357 UnsafeMemoryAccessMark umam(this, true, true);
3358
3359 // At this point, we know the low bit of both dest and size is zero,
3360 // so size is a multiple of 2
3361 do_setmemory_atomic_loop(USM_SHORT, dest, size, wide_value, rScratch1,
3362 L_exit, _masm);
3363 }
3364 __ jmpb(L_exit); // the word fill loop falls through to here when done
3365
3366 __ BIND(L_fillQuadwords);
3367
3368 // Fill QUADWORDs
3369 {
3370 UnsafeMemoryAccessMark umam(this, true, true);
3371
3372 // At this point, we know the lower 3 bits of dest and size are zero,
3373 // so size is a multiple of 8
3374 do_setmemory_atomic_loop(USM_QUADWORD, dest, size, wide_value, rScratch1,
3375 L_exit, _masm);
3376 }
// The quadword fill falls through into the common exit, so no jump is needed
3377 __ BIND(L_exit);
3378
3379 __ leave(); // required for proper stackwalking of RuntimeStub frame
3380 __ ret(0);
3381
3382 __ BIND(L_fillDwords);
3383
3384 // Fill DWORDs
3385 {
3386 UnsafeMemoryAccessMark umam(this, true, true);
3387
3388 // At this point, we know the lower 2 bits of dest and size are zero,
3389 // so size is a multiple of 4
3390 do_setmemory_atomic_loop(USM_DWORD, dest, size, wide_value, rScratch1,
3391 L_exit, _masm);
3392 }
3393 __ jmpb(L_exit); // the dword fill loop falls through to here when done
3394
3395 __ BIND(L_fillBytes);
3396 // Set up for tail call to previously generated byte fill routine
3397 // Parameter order is (ptr, byteVal, size)
3398 __ xchgq(c_rarg1, c_rarg2); // swap size and byteVal into that order
3399 __ leave(); // Clear effect of enter()
3400 __ jump(RuntimeAddress(unsafe_byte_fill));
3401 }
3402
3403 // retrieve the registered handler addresses
3404 address end = __ pc();
3405 retrieve_unsafe_access_handlers(start, end, extras);
3406 assert(extras.length() == expected_handler_count,
3407 "unexpected handler addresses count %d", extras.length());
3408
3409 // record the stub entry and end plus any extra handler addresses
3410 // (this stub has a single entry, so no additional entry is passed)
3411 store_archive_data(stub_id, start, end, nullptr, &extras);
3412
3413 return start;
3414 }
3415
3416 // Perform range checks on the proposed arraycopy.
3417 // Kills temp, but nothing else.
3418 // Also, clean the sign bits of src_pos and dst_pos.
3419 void StubGenerator::arraycopy_range_checks(Register src, // source array oop (c_rarg0)
3420 Register src_pos, // source position (c_rarg1)
3421 Register dst, // destination array oo (c_rarg2)
3422 Register dst_pos, // destination position (c_rarg3)
3423 Register length,
3424 Register temp,
3425 Label& L_failed) {
3426 BLOCK_COMMENT("arraycopy_range_checks:");
3427
3428 // if (src_pos + length > arrayOop(src)->length()) FAIL;
3429 __ movl(temp, length);
3430 __ addl(temp, src_pos); // src_pos + length
3431 __ cmpl(temp, Address(src, arrayOopDesc::length_offset_in_bytes()));
3432 __ jcc(Assembler::above, L_failed);
3433
3434 // if (dst_pos + length > arrayOop(dst)->length()) FAIL;
3435 __ movl(temp, length);
3436 __ addl(temp, dst_pos); // dst_pos + length
3437 __ cmpl(temp, Address(dst, arrayOopDesc::length_offset_in_bytes()));
3438 __ jcc(Assembler::above, L_failed);
3439
3440 // Have to clean up high 32-bits of 'src_pos' and 'dst_pos'.
3441 // Move with sign extension can be used since they are positive.
3442 __ movslq(src_pos, src_pos);
3443 __ movslq(dst_pos, dst_pos);
3444
3445 BLOCK_COMMENT("arraycopy_range_checks done");
3446 }
3447
3448
3449 // Generate generic array copy stubs
3450 //
3451 // Input:
3452 // c_rarg0 - src oop
3453 // c_rarg1 - src_pos (32-bits)
3454 // c_rarg2 - dst oop
3455 // c_rarg3 - dst_pos (32-bits)
3456 // not Win64
3457 // c_rarg4 - element count (32-bits)
3458 // Win64
3459 // rsp+40 - element count (32-bits)
3460 //
3461 // Output:
3462 // rax == 0 - success
3463 // rax == -1^K - failure, where K is partial transfer count
3464 //
// The *_copy_entry arguments are the entry points of the element-type
// specific copy stubs. Once the checks below pass, this stub marshals
// (from, to, count) and jumps to the matching entry, so the result on that
// path is presumably produced by the target stub. The failure path emitted
// here returns -1 in rax.
3465 address StubGenerator::generate_generic_copy(address byte_copy_entry, address short_copy_entry,
3466 address int_copy_entry, address oop_copy_entry,
3467 address long_copy_entry, address checkcast_copy_entry) {
3468
3469 StubId stub_id = StubId::stubgen_generic_arraycopy_id;
3470 int entry_count = StubInfo::entry_count(stub_id);
3471 assert(entry_count == 1, "sanity check");
// If load_archive_data hands back an existing stub, return it without
// generating any code.
3472 address start = load_archive_data(stub_id);
3473 if (start != nullptr) {
3474 return start;
3475 }
3476
3477 Label L_failed, L_failed_0, L_skip_failed_0, L_objArray;
3478 Label L_copy_shorts, L_copy_ints, L_copy_longs;
3479
3480 // Input registers
3481 const Register src = c_rarg0; // source array oop
3482 const Register src_pos = c_rarg1; // source position
3483 const Register dst = c_rarg2; // destination array oop
3484 const Register dst_pos = c_rarg3; // destination position
3485 #ifndef _WIN64
3486 const Register length = c_rarg4;
3487 const Register rklass_tmp = r9; // load_klass
3488 #else
// NOTE(review): presumably the rsp+40 entry offset plus the two words
// pushed by enter() and push_ppx(rklass_tmp) below - confirm
3489 const Address length(rsp, 7 * wordSize); // elements count is on stack on Win64
3490 const Register rklass_tmp = rdi; // load_klass
3491 #endif
3492
3493 StubCodeMark mark(this, stub_id);
3494 __ align(CodeEntryAlignment);
3495 start = __ pc();
3496
3497 __ enter(); // required for proper stackwalking of RuntimeStub frame
3498
3499 #ifdef _WIN64
3500 __ push_ppx(rklass_tmp); // rdi is callee-save on Windows
3501 #endif
3502
3503 // bump this on entry, not on exit:
3504 INC_COUNTER_NP(SharedRuntime::_generic_array_copy_ctr, rscratch1);
3505
3506 //-----------------------------------------------------------------------
3507 // Assembler stub will be used for this call to arraycopy
3508 // if the following conditions are met:
3509 //
3510 // (1) src and dst must not be null.
3511 // (2) src_pos must not be negative.
3512 // (3) dst_pos must not be negative.
3513 // (4) length must not be negative.
3514 // (5) src klass and dst klass should be the same and not null.
3515 // (6) src and dst should be arrays.
3516 // (7) src_pos + length must not exceed length of src.
3517 // (8) dst_pos + length must not exceed length of dst.
3518 //
3519
3520 // if (src == nullptr) return -1;
3521 __ testptr(src, src); // src oop
3522 size_t j1off = __ offset(); // code offset of the 1st jump, checked by the guarantee below
3523 __ jccb(Assembler::zero, L_failed_0);
3524
3525 // if (src_pos < 0) return -1;
3526 __ testl(src_pos, src_pos); // src_pos (32-bits)
3527 __ jccb(Assembler::negative, L_failed_0);
3528
3529 // if (dst == nullptr) return -1;
3530 __ testptr(dst, dst); // dst oop
3531 __ jccb(Assembler::zero, L_failed_0);
3532
3533 // if (dst_pos < 0) return -1;
3534 __ testl(dst_pos, dst_pos); // dst_pos (32-bits)
3535 size_t j4off = __ offset(); // code offset of the 4th jump
3536 // skip over the failure trampoline
3537 __ jccb(Assembler::positive, L_skip_failed_0);
3538
3539 // The first four tests are very dense code,
3540 // but not quite dense enough to put four
3541 // jumps in a 16-byte instruction fetch buffer.
3542 // That's good, because some branch predictors
3543 // do not like jumps so close together.
3544 // Make sure of this.
3545 guarantee(((j1off ^ j4off) & ~15) != 0, "I$ line of 1st & 4th jumps");
3546
3547 // Short-hop target to L_failed. Makes for denser prologue code.
// A negative dst_pos falls through into it from the jccb above.
3548 __ BIND(L_failed_0);
3549 __ jmp(L_failed);
3550
3551 // continue here if first 4 checks pass
3552 __ bind(L_skip_failed_0);
3553
3554 // registers used as temp
3555 const Register r11_length = r11; // elements count to copy
3556 const Register r10_src_klass = r10; // array klass
3557
3558 // if (length < 0) return -1;
3559 __ movl(r11_length, length); // length (elements count, 32-bits value)
3560 __ testl(r11_length, r11_length);
3561 __ jccb(Assembler::negative, L_failed_0);
3562
3563 __ load_klass(r10_src_klass, src, rklass_tmp);
3564 #ifdef ASSERT
3565 // assert(src->klass() != nullptr);
3566 {
3567 BLOCK_COMMENT("assert klasses not null {");
3568 Label L1, L2;
3569 __ testptr(r10_src_klass, r10_src_klass);
3570 __ jcc(Assembler::notZero, L2); // it is broken if klass is null
3571 __ bind(L1);
3572 __ stop("broken null klass");
3573 __ bind(L2);
3574 __ load_klass(rax, dst, rklass_tmp);
3575 __ cmpq(rax, 0);
3576 __ jcc(Assembler::equal, L1); // this would be broken also
3577 BLOCK_COMMENT("} assert klasses not null done");
3578 }
3579 #endif
3580
3581 // Load layout helper (32-bits)
3582 //
3583 // |array_tag| | header_size | element_type | |log2_element_size|
3584 // 32 30 24 16 8 2 0
3585 //
3586 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
3587 //
3588
3589 const int lh_offset = in_bytes(Klass::layout_helper_offset());
3590
3591 // Handle objArrays completely differently...
3592 const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
3593 __ cmpl(Address(r10_src_klass, lh_offset), objArray_lh);
3594 __ jcc(Assembler::equal, L_objArray);
3595
3596 // if (src->klass() != dst->klass()) return -1;
3597 __ load_klass(rax, dst, rklass_tmp);
3598 __ cmpq(r10_src_klass, rax);
3599 __ jcc(Assembler::notEqual, L_failed);
3600
3601 // Check for flat inline type array -> return -1
3602 __ test_flat_array_oop(src, rax, L_failed);
3603
3604 // Check for null-free (non-flat) inline type array -> handle as object array
3605 __ test_null_free_array_oop(src, rax, L_objArray);
3606
3607 const Register rax_lh = rax; // layout helper
3608 __ movl(rax_lh, Address(r10_src_klass, lh_offset));
3609
3610 // Check for flat inline type array -> return -1
3611 __ testl(rax_lh, Klass::_lh_array_tag_flat_value_bit_inplace);
3612 __ jcc(Assembler::notZero, L_failed);
3613
3614 // if (!src->is_Array()) return -1;
// (a layout helper >= _lh_neutral_value is rejected as a non-array)
3615 __ cmpl(rax_lh, Klass::_lh_neutral_value);
3616 __ jcc(Assembler::greaterEqual, L_failed);
3617
3618 // At this point, it is known to be a typeArray (array_tag 0x3).
3619 #ifdef ASSERT
3620 {
3621 BLOCK_COMMENT("assert primitive array {");
3622 Label L;
3623 __ movl(rklass_tmp, rax_lh);
3624 __ sarl(rklass_tmp, Klass::_lh_array_tag_shift);
3625 __ cmpl(rklass_tmp, Klass::_lh_array_tag_type_value);
3626 __ jcc(Assembler::equal, L);
3627 __ stop("must be a primitive array");
3628 __ bind(L);
3629 BLOCK_COMMENT("} assert primitive array done");
3630 }
3631 #endif
3632
// r10 (the src klass) is not read again on this path, so it is handed
// over as the temp register and reused as r10_offset afterwards.
3633 arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
3634 r10, L_failed);
3635
3636 // TypeArrayKlass
3637 //
3638 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
3639 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
3640 //
3641
3642 const Register r10_offset = r10; // array offset
3643 const Register rax_elsize = rax_lh; // element size
3644
3645 __ movl(r10_offset, rax_lh);
3646 __ shrl(r10_offset, Klass::_lh_header_size_shift);
3647 __ andptr(r10_offset, Klass::_lh_header_size_mask); // array_offset
3648 __ addptr(src, r10_offset); // src array offset
3649 __ addptr(dst, r10_offset); // dst array offset
3650 BLOCK_COMMENT("choose copy loop based on element size");
3651 __ andl(rax_lh, Klass::_lh_log2_element_size_mask); // rax_lh -> rax_elsize
3652
3653 #ifdef _WIN64
3654 __ pop_ppx(rklass_tmp); // Restore callee-save rdi
3655 #endif
3656
3657 // next registers should be set before the jump to corresponding stub
3658 const Register from = c_rarg0; // source array address
3659 const Register to = c_rarg1; // destination array address
3660 const Register count = c_rarg2; // elements count
3661
3662 // 'from', 'to', 'count' registers must be written in exactly this order
3663 // because they alias 'src', 'src_pos', 'dst' respectively: each lea/mov
// below overwrites an input that only the earlier instructions read.
3664
// rax_elsize holds log2 of the element size: 0 selects the byte copy stub
3665 __ cmpl(rax_elsize, 0);
3666 __ jccb(Assembler::notEqual, L_copy_shorts);
3667 __ lea(from, Address(src, src_pos, Address::times_1, 0));// src_addr
3668 __ lea(to, Address(dst, dst_pos, Address::times_1, 0));// dst_addr
3669 __ movl2ptr(count, r11_length); // length
3670 __ jump(RuntimeAddress(byte_copy_entry));
3671
3672 __ BIND(L_copy_shorts);
3673 __ cmpl(rax_elsize, LogBytesPerShort);
3674 __ jccb(Assembler::notEqual, L_copy_ints);
3675 __ lea(from, Address(src, src_pos, Address::times_2, 0));// src_addr
3676 __ lea(to, Address(dst, dst_pos, Address::times_2, 0));// dst_addr
3677 __ movl2ptr(count, r11_length); // length
3678 __ jump(RuntimeAddress(short_copy_entry));
3679
3680 __ BIND(L_copy_ints);
3681 __ cmpl(rax_elsize, LogBytesPerInt);
3682 __ jccb(Assembler::notEqual, L_copy_longs);
3683 __ lea(from, Address(src, src_pos, Address::times_4, 0));// src_addr
3684 __ lea(to, Address(dst, dst_pos, Address::times_4, 0));// dst_addr
3685 __ movl2ptr(count, r11_length); // length
3686 __ jump(RuntimeAddress(int_copy_entry));
3687
// Anything left is treated as a long copy; only debug builds verify it
3688 __ BIND(L_copy_longs);
3689 #ifdef ASSERT
3690 {
3691 BLOCK_COMMENT("assert long copy {");
3692 Label L;
3693 __ cmpl(rax_elsize, LogBytesPerLong);
3694 __ jcc(Assembler::equal, L);
3695 __ stop("must be long copy, but elsize is wrong");
3696 __ bind(L);
3697 BLOCK_COMMENT("} assert long copy done");
3698 }
3699 #endif
3700 __ lea(from, Address(src, src_pos, Address::times_8, 0));// src_addr
3701 __ lea(to, Address(dst, dst_pos, Address::times_8, 0));// dst_addr
3702 __ movl2ptr(count, r11_length); // length
3703 __ jump(RuntimeAddress(long_copy_entry));
3704
3705 // ObjArrayKlass
3706 __ BIND(L_objArray);
3707 // live at this point: r10_src_klass, r11_length, src[_pos], dst[_pos]
3708
3709 Label L_plain_copy, L_checkcast_copy;
3710 // test array classes for subtyping
3711 __ load_klass(rax, dst, rklass_tmp);
3712 __ cmpq(r10_src_klass, rax); // usual case is exact equality
3713 __ jcc(Assembler::notEqual, L_checkcast_copy);
3714
3715 // Identically typed arrays can be copied without element-wise checks.
// (r10, the src klass, is not read again on this path and serves as temp)
3716 arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
3717 r10, L_failed);
3718
3719 __ lea(from, Address(src, src_pos, TIMES_OOP,
3720 arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // src_addr
3721 __ lea(to, Address(dst, dst_pos, TIMES_OOP,
3722 arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // dst_addr
3723 __ movl2ptr(count, r11_length); // length
// L_plain_copy is also passed to generate_type_check in the checkcast
// path below, after from/to/count have been set up there.
3724 __ BIND(L_plain_copy);
3725 #ifdef _WIN64
3726 __ pop_ppx(rklass_tmp); // Restore callee-save rdi
3727 #endif
3728 __ jump(RuntimeAddress(oop_copy_entry));
3729
3730 __ BIND(L_checkcast_copy);
3731 // live at this point: r10_src_klass, r11_length, rax (dst_klass)
3732 {
3733 // Before looking at dst.length, make sure dst is also an objArray.
3734 // This check also fails for flat arrays which are not supported.
3735 __ cmpl(Address(rax, lh_offset), objArray_lh);
3736 __ jcc(Assembler::notEqual, L_failed);
3737
3738 #ifdef ASSERT
3739 {
3740 BLOCK_COMMENT("assert not null-free array {");
3741 Label L;
3742 __ test_non_null_free_array_oop(dst, rklass_tmp, L);
3743 __ stop("unexpected null-free array");
3744 __ bind(L);
3745 BLOCK_COMMENT("} assert not null-free array");
3746 }
3747 #endif
3748
3749 // It is safe to examine both src.length and dst.length.
// rax (the dst klass) is given up as temp here, hence the reload below.
3750 arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
3751 rax, L_failed);
3752
// r11 stops being the length from here on; the count is re-read from
// 'length' further down.
3753 const Register r11_dst_klass = r11;
3754 __ load_klass(r11_dst_klass, dst, rklass_tmp); // reload
3755
3756 // Marshal the base address arguments now, freeing registers.
3757 __ lea(from, Address(src, src_pos, TIMES_OOP,
3758 arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
3759 __ lea(to, Address(dst, dst_pos, TIMES_OOP,
3760 arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
3761 __ movl(count, length); // length (reloaded)
3762 Register sco_temp = c_rarg3; // this register is free now
3763 assert_different_registers(from, to, count, sco_temp,
3764 r11_dst_klass, r10_src_klass);
3765 assert_clean_int(count, sco_temp);
3766
3767 // Generate the type check.
// NOTE(review): presumably branches to L_plain_copy when the src klass
// is a subtype of the dst klass - confirm against generate_type_check
3768 const int sco_offset = in_bytes(Klass::super_check_offset_offset());
3769 __ movl(sco_temp, Address(r11_dst_klass, sco_offset));
3770 assert_clean_int(sco_temp, rax);
3771 generate_type_check(r10_src_klass, sco_temp, r11_dst_klass, L_plain_copy);
3772
3773 // Fetch destination element klass from the ObjArrayKlass header.
// sco_temp is then reloaded with the super check offset of that
// element klass.
3774 int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
3775 __ movptr(r11_dst_klass, Address(r11_dst_klass, ek_offset));
3776 __ movl( sco_temp, Address(r11_dst_klass, sco_offset));
3777 assert_clean_int(sco_temp, rax);
3778
3779 #ifdef _WIN64
3780 __ pop_ppx(rklass_tmp); // Restore callee-save rdi
3781 #endif
3782
3783 // the checkcast_copy loop needs two extra arguments:
3784 assert(c_rarg3 == sco_temp, "#3 already in place");
3785 // Set up arguments for checkcast_copy_entry.
3786 setup_arg_regs_using_thread(4);
3787 __ movptr(r8, r11_dst_klass); // dst.klass.element_klass, r8 is c_rarg4 on Linux/Solaris
3788 __ jump(RuntimeAddress(checkcast_copy_entry));
3789 }
3790
// Common failure exit: every rejected check ends up here
3791 __ BIND(L_failed);
3792 #ifdef _WIN64
3793 __ pop_ppx(rklass_tmp); // Restore callee-save rdi
3794 #endif
3795 __ xorptr(rax, rax);
3796 __ notptr(rax); // return -1
3797 __ leave(); // required for proper stackwalking of RuntimeStub frame
3798 __ ret(0);
3799
3800 // record the stub entry and end
3801 store_archive_data(stub_id, start, __ pc());
3802
3803 return start;
3804 }
3805
3806 #undef __