1 /*
2 * Copyright (c) 2003, 2025, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.
8 *
9 * This code is distributed in the hope that it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12 * version 2 for more details (a copy is included in the LICENSE file that
13 * accompanied this code).
14 *
15 * You should have received a copy of the GNU General Public License version
16 * 2 along with this work; if not, write to the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18 *
19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20 * or visit www.oracle.com if you need additional information or have any
21 * questions.
22 *
23 */
24
25 #include "asm/macroAssembler.hpp"
26 #include "gc/shared/barrierSet.hpp"
27 #include "gc/shared/barrierSetAssembler.hpp"
28 #include "oops/objArrayKlass.hpp"
29 #include "runtime/sharedRuntime.hpp"
30 #include "runtime/stubRoutines.hpp"
31 #include "stubGenerator_x86_64.hpp"
32 #ifdef COMPILER2
33 #include "opto/c2_globals.hpp"
34 #endif
35 #if INCLUDE_JVMCI
36 #include "jvmci/jvmci_globals.hpp"
37 #endif
38
39 #define __ _masm->
40
41 #define TIMES_OOP (UseCompressedOops ? Address::times_4 : Address::times_8)
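// Scale factor for indexing an oop array: 4-byte elements when compressed
// oops are in use, 8-byte elements otherwise.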
42
43 #ifdef PRODUCT
44 #define BLOCK_COMMENT(str) /* nothing */
45 #else
46 #define BLOCK_COMMENT(str) __ block_comment(str)
47 #endif // PRODUCT
48
49 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
50
51 #ifdef PRODUCT
52 #define INC_COUNTER_NP(counter, rscratch) ((void)0)
53 #else
54 #define INC_COUNTER_NP(counter, rscratch) \
55 BLOCK_COMMENT("inc_counter " #counter); \
56 inc_counter_np(_masm, counter, rscratch);
57
58 static void inc_counter_np(MacroAssembler* _masm, uint& counter, Register rscratch) {
59 __ incrementl(ExternalAddress((address)&counter), rscratch);
60 }
61
62 #if COMPILER2_OR_JVMCI
63 static uint& get_profile_ctr(int shift) {
64 if (shift == 0) {
65 return SharedRuntime::_jbyte_array_copy_ctr;
66 } else if (shift == 1) {
67 return SharedRuntime::_jshort_array_copy_ctr;
68 } else if (shift == 2) {
69 return SharedRuntime::_jint_array_copy_ctr;
70 } else {
71 assert(shift == 3, "");
72 return SharedRuntime::_jlong_array_copy_ctr;
73 }
74 }
75 #endif // COMPILER2_OR_JVMCI
76 #endif // !PRODUCT
77
78 void StubGenerator::generate_arraycopy_stubs() {
79 // Some copy stubs publish a normal entry and then a 2nd 'fallback'
80 // entry immediately following their stack push. This can be used
81 // as a post-push branch target for compatible stubs when they
82 // identify a special case that can be handled by the fallback
83 // stub, e.g. a disjoint copy stub may be used as a special-case
84 // fallback for its compatible conjoint copy stub.
85 //
86 // A no push entry is always returned in the following local and
87 // then published by assigning to the appropriate entry field in
88 // class StubRoutines. The entry value is then passed to the
89 // generator for the compatible stub. That means the entry must be
90 // listed when saving to/restoring from the AOT cache, ensuring
91 // that the inter-stub jumps are noted at AOT-cache save and
92 // relocated at AOT cache load.
93 address nopush_entry;
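// In effect, for each element type the wiring below is: the disjoint stub
// publishes its no-push entry; the conjoint stub branches to that entry when
// the regions do not overlap and publishes its own no-push entry; the
// unsafe/generic copy stubs then jump to the conjoint no-push entries.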
94
95 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(&nopush_entry);
96 // disjoint nopush entry is needed by conjoint copy
97 StubRoutines::_jbyte_disjoint_arraycopy_nopush = nopush_entry;
98 StubRoutines::_jbyte_arraycopy = generate_conjoint_byte_copy(StubRoutines::_jbyte_disjoint_arraycopy_nopush, &nopush_entry);
99 // conjoint nopush entry is needed by generic/unsafe copy
100 StubRoutines::_jbyte_arraycopy_nopush = nopush_entry;
101
102 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(&nopush_entry);
103 // disjoint nopush entry is needed by conjoint copy
104 StubRoutines::_jshort_disjoint_arraycopy_nopush = nopush_entry;
105 StubRoutines::_jshort_arraycopy = generate_conjoint_short_copy(StubRoutines::_jshort_disjoint_arraycopy_nopush, &nopush_entry);
106 // conjoint nopush entry is needed by generic/unsafe copy
107 StubRoutines::_jshort_arraycopy_nopush = nopush_entry;
108
109 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_int_oop_copy(StubId::stubgen_jint_disjoint_arraycopy_id, &nopush_entry);
110 // disjoint nopush entry is needed by conjoint copy
111 StubRoutines::_jint_disjoint_arraycopy_nopush = nopush_entry;
112 StubRoutines::_jint_arraycopy = generate_conjoint_int_oop_copy(StubId::stubgen_jint_arraycopy_id, StubRoutines::_jint_disjoint_arraycopy_nopush, &nopush_entry);
113 // conjoint nopush entry is needed by generic/unsafe copy
114 StubRoutines::_jint_arraycopy_nopush = nopush_entry;
115
116 StubRoutines::_jlong_disjoint_arraycopy = generate_disjoint_long_oop_copy(StubId::stubgen_jlong_disjoint_arraycopy_id, &nopush_entry);
117 // disjoint nopush entry is needed by conjoint copy
118 StubRoutines::_jlong_disjoint_arraycopy_nopush = nopush_entry;
119 StubRoutines::_jlong_arraycopy = generate_conjoint_long_oop_copy(StubId::stubgen_jlong_arraycopy_id, StubRoutines::_jlong_disjoint_arraycopy_nopush, &nopush_entry);
120 // conjoint nopush entry is needed by generic/unsafe copy
121 StubRoutines::_jlong_arraycopy_nopush = nopush_entry;
122
123 if (UseCompressedOops) {
124 StubRoutines::_oop_disjoint_arraycopy = generate_disjoint_int_oop_copy(StubId::stubgen_oop_disjoint_arraycopy_id, &nopush_entry);
125 // disjoint nopush entry is needed by conjoint copy
126 StubRoutines::_oop_disjoint_arraycopy_nopush = nopush_entry;
127 StubRoutines::_oop_arraycopy = generate_conjoint_int_oop_copy(StubId::stubgen_oop_arraycopy_id, StubRoutines::_oop_disjoint_arraycopy_nopush, &nopush_entry);
128 // conjoint nopush entry is needed by generic/unsafe copy
129 StubRoutines::_oop_arraycopy_nopush = nopush_entry;
130 StubRoutines::_oop_disjoint_arraycopy_uninit = generate_disjoint_int_oop_copy(StubId::stubgen_oop_disjoint_arraycopy_uninit_id, &nopush_entry);
131 // disjoint nopush entry is needed by conjoint copy
132 StubRoutines::_oop_disjoint_arraycopy_uninit_nopush = nopush_entry;
133 // note that we don't need a returned nopush entry because the
134 // generic/unsafe copy does not cater for uninit arrays.
135 StubRoutines::_oop_arraycopy_uninit = generate_conjoint_int_oop_copy(StubId::stubgen_oop_arraycopy_uninit_id, StubRoutines::_oop_disjoint_arraycopy_uninit_nopush, nullptr);
136 } else {
137 StubRoutines::_oop_disjoint_arraycopy = generate_disjoint_long_oop_copy(StubId::stubgen_oop_disjoint_arraycopy_id, &nopush_entry);
138 // disjoint nopush entry is needed by conjoint copy
139 StubRoutines::_oop_disjoint_arraycopy_nopush = nopush_entry;
140 StubRoutines::_oop_arraycopy = generate_conjoint_long_oop_copy(StubId::stubgen_oop_arraycopy_id, StubRoutines::_oop_disjoint_arraycopy_nopush, &nopush_entry);
141 // conjoint nopush entry is needed by generic/unsafe copy
142 StubRoutines::_oop_arraycopy_nopush = nopush_entry;
143 StubRoutines::_oop_disjoint_arraycopy_uninit = generate_disjoint_long_oop_copy(StubId::stubgen_oop_disjoint_arraycopy_uninit_id, &nopush_entry);
144 // disjoint nopush entry is needed by conjoint copy
145 StubRoutines::_oop_disjoint_arraycopy_uninit_nopush = nopush_entry;
146 // note that we don't need a returned nopush entry because the
147 // generic/unsafe copy does not cater for uninit arrays.
148 StubRoutines::_oop_arraycopy_uninit = generate_conjoint_long_oop_copy(StubId::stubgen_oop_arraycopy_uninit_id, StubRoutines::_oop_disjoint_arraycopy_uninit_nopush, nullptr);
149 }
150
151 StubRoutines::_checkcast_arraycopy = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_id, &nopush_entry);
152 // checkcast nopush entry is needed by generic copy
153 StubRoutines::_checkcast_arraycopy_nopush = nopush_entry;
154 // note that we don't need a returned nopush entry because the
155 // generic copy does not cater for uninit arrays.
156 StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_uninit_id, nullptr);
157
158 StubRoutines::_unsafe_arraycopy = generate_unsafe_copy(StubRoutines::_jbyte_arraycopy_nopush,
159 StubRoutines::_jshort_arraycopy_nopush,
160 StubRoutines::_jint_arraycopy_nopush,
161 StubRoutines::_jlong_arraycopy_nopush);
162 StubRoutines::_generic_arraycopy = generate_generic_copy(StubRoutines::_jbyte_arraycopy_nopush,
163 StubRoutines::_jshort_arraycopy_nopush,
164 StubRoutines::_jint_arraycopy_nopush,
165 StubRoutines::_oop_arraycopy_nopush,
166 StubRoutines::_jlong_arraycopy_nopush,
167 StubRoutines::_checkcast_arraycopy_nopush);
168
169 StubRoutines::_jbyte_fill = generate_fill(StubId::stubgen_jbyte_fill_id);
170 StubRoutines::_jshort_fill = generate_fill(StubId::stubgen_jshort_fill_id);
171 StubRoutines::_jint_fill = generate_fill(StubId::stubgen_jint_fill_id);
172 StubRoutines::_arrayof_jbyte_fill = generate_fill(StubId::stubgen_arrayof_jbyte_fill_id);
173 StubRoutines::_arrayof_jshort_fill = generate_fill(StubId::stubgen_arrayof_jshort_fill_id);
174 StubRoutines::_arrayof_jint_fill = generate_fill(StubId::stubgen_arrayof_jint_fill_id);
175
176 StubRoutines::_unsafe_setmemory = generate_unsafe_setmemory(StubRoutines::_jbyte_fill);
177
178 // We don't generate specialized code for HeapWord-aligned source
179 // arrays, so just use the code we've already generated
180 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = StubRoutines::_jbyte_disjoint_arraycopy;
181 StubRoutines::_arrayof_jbyte_arraycopy = StubRoutines::_jbyte_arraycopy;
182
183 StubRoutines::_arrayof_jshort_disjoint_arraycopy = StubRoutines::_jshort_disjoint_arraycopy;
184 StubRoutines::_arrayof_jshort_arraycopy = StubRoutines::_jshort_arraycopy;
185
186 StubRoutines::_arrayof_jint_disjoint_arraycopy = StubRoutines::_jint_disjoint_arraycopy;
187 StubRoutines::_arrayof_jint_arraycopy = StubRoutines::_jint_arraycopy;
188
189 StubRoutines::_arrayof_jlong_disjoint_arraycopy = StubRoutines::_jlong_disjoint_arraycopy;
190 StubRoutines::_arrayof_jlong_arraycopy = StubRoutines::_jlong_arraycopy;
191
192 StubRoutines::_arrayof_oop_disjoint_arraycopy = StubRoutines::_oop_disjoint_arraycopy;
193 StubRoutines::_arrayof_oop_arraycopy = StubRoutines::_oop_arraycopy;
194
195 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit = StubRoutines::_oop_disjoint_arraycopy_uninit;
196 StubRoutines::_arrayof_oop_arraycopy_uninit = StubRoutines::_oop_arraycopy_uninit;
197 }
198
199
200 // Verify that a register contains a clean 32-bit positive value
201 // (high 32 bits are 0) so it can be used in 64-bit shifts.
202 //
203 // Input:
204 // Rint - 32-bit value
205 // Rtmp - scratch
206 //
207 void StubGenerator::assert_clean_int(Register Rint, Register Rtmp) {
208 #ifdef ASSERT
209 Label L;
210 assert_different_registers(Rtmp, Rint);
211 __ movslq(Rtmp, Rint);
212 __ cmpq(Rtmp, Rint);
213 __ jcc(Assembler::equal, L);
214 __ stop("high 32-bits of int value are not 0");
215 __ bind(L);
216 #endif
217 }
218
219
220 // Generate overlap test for array copy stubs
221 //
222 // Input:
223 // c_rarg0 - from
224 // c_rarg1 - to
225 // c_rarg2 - element count
226 //
227 // Output:
228 // rax - &from[element count]
229 //
230 void StubGenerator::array_overlap_test(address no_overlap_target, Label* NOLp, Address::ScaleFactor sf) {
231 const Register from = c_rarg0;
232 const Register to = c_rarg1;
233 const Register count = c_rarg2;
234 const Register end_from = rax;
235
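// There is no overlap hazard for a forward copy when to <= from, or when
// to >= from + count * element_size (i.e. 'to' is at or beyond end_from);
// otherwise fall through to the conjoint (backward) copy.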
236 __ cmpptr(to, from);
237 __ lea(end_from, Address(from, count, sf, 0));
238 if (NOLp == nullptr) {
239 RuntimeAddress no_overlap(no_overlap_target);
240 __ jump_cc(Assembler::belowEqual, no_overlap);
241 __ cmpptr(to, end_from);
242 __ jump_cc(Assembler::aboveEqual, no_overlap);
243 } else {
244 __ jcc(Assembler::belowEqual, (*NOLp));
245 __ cmpptr(to, end_from);
246 __ jcc(Assembler::aboveEqual, (*NOLp));
247 }
248 }
249
250
251 // Copy big chunks forward
252 //
253 // Inputs:
254 // end_from - source array end address
255 // end_to - destination array end address
256 // qword_count - 64-bit element count, negative
257 // tmp1, tmp2 - scratch
258 // L_copy_bytes - entry label
259 // L_copy_8_bytes - exit label
260 //
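// Callers jump to L_copy_bytes with a negative qword_count and with
// end_from/end_to pointing at the last qword of each array. The count
// advances towards zero: each main-loop iteration copies 32 or 64 bytes
// through the barrier-set copy helpers, and control exits to the caller's
// L_copy_8_bytes code to move any remaining trailing qwords one at a time.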
261 void StubGenerator::copy_bytes_forward(Register end_from, Register end_to,
262 Register qword_count, Register tmp1,
263 Register tmp2, Label& L_copy_bytes,
264 Label& L_copy_8_bytes, DecoratorSet decorators,
265 BasicType type) {
266 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
267 DEBUG_ONLY(__ stop("enter at entry label, not here"));
268 Label L_loop;
269 __ align(OptoLoopAlignment);
270 if (UseUnalignedLoadStores) {
271 Label L_end;
272 __ BIND(L_loop);
273 if (UseAVX >= 2) {
274 bs->copy_load_at(_masm, decorators, type, 32,
275 xmm0, Address(end_from, qword_count, Address::times_8, -56),
276 tmp1, xmm1);
277 bs->copy_store_at(_masm, decorators, type, 32,
278 Address(end_to, qword_count, Address::times_8, -56), xmm0,
279 tmp1, tmp2, xmm1);
280
281 bs->copy_load_at(_masm, decorators, type, 32,
282 xmm0, Address(end_from, qword_count, Address::times_8, -24),
283 tmp1, xmm1);
284 bs->copy_store_at(_masm, decorators, type, 32,
285 Address(end_to, qword_count, Address::times_8, -24), xmm0,
286 tmp1, tmp2, xmm1);
287 } else {
288 bs->copy_load_at(_masm, decorators, type, 16,
289 xmm0, Address(end_from, qword_count, Address::times_8, -56),
290 tmp1, xmm1);
291 bs->copy_store_at(_masm, decorators, type, 16,
292 Address(end_to, qword_count, Address::times_8, -56), xmm0,
293 tmp1, tmp2, xmm1);
294 bs->copy_load_at(_masm, decorators, type, 16,
295 xmm0, Address(end_from, qword_count, Address::times_8, -40),
296 tmp1, xmm1);
297 bs->copy_store_at(_masm, decorators, type, 16,
298 Address(end_to, qword_count, Address::times_8, -40), xmm0,
299 tmp1, tmp2, xmm1);
300 bs->copy_load_at(_masm, decorators, type, 16,
301 xmm0, Address(end_from, qword_count, Address::times_8, -24),
302 tmp1, xmm1);
303 bs->copy_store_at(_masm, decorators, type, 16,
304 Address(end_to, qword_count, Address::times_8, -24), xmm0,
305 tmp1, tmp2, xmm1);
306 bs->copy_load_at(_masm, decorators, type, 16,
307 xmm0, Address(end_from, qword_count, Address::times_8, -8),
308 tmp1, xmm1);
309 bs->copy_store_at(_masm, decorators, type, 16,
310 Address(end_to, qword_count, Address::times_8, -8), xmm0,
311 tmp1, tmp2, xmm1);
312 }
313
314 __ BIND(L_copy_bytes);
315 __ addptr(qword_count, 8);
316 __ jcc(Assembler::lessEqual, L_loop);
317 __ subptr(qword_count, 4); // sub(8) and add(4)
318 __ jcc(Assembler::greater, L_end);
319 // Copy trailing 32 bytes
320 if (UseAVX >= 2) {
321 bs->copy_load_at(_masm, decorators, type, 32,
322 xmm0, Address(end_from, qword_count, Address::times_8, -24),
323 tmp1, xmm1);
324 bs->copy_store_at(_masm, decorators, type, 32,
325 Address(end_to, qword_count, Address::times_8, -24), xmm0,
326 tmp1, tmp2, xmm1);
327 } else {
328 bs->copy_load_at(_masm, decorators, type, 16,
329 xmm0, Address(end_from, qword_count, Address::times_8, -24),
330 tmp1, xmm1);
331 bs->copy_store_at(_masm, decorators, type, 16,
332 Address(end_to, qword_count, Address::times_8, -24), xmm0,
333 tmp1, tmp2, xmm1);
334 bs->copy_load_at(_masm, decorators, type, 16,
335 xmm0, Address(end_from, qword_count, Address::times_8, -8),
336 tmp1, xmm1);
337 bs->copy_store_at(_masm, decorators, type, 16,
338 Address(end_to, qword_count, Address::times_8, -8), xmm0,
339 tmp1, tmp2, xmm1);
340 }
341 __ addptr(qword_count, 4);
342 __ BIND(L_end);
343 } else {
344 // Copy 32 bytes per iteration
345 __ BIND(L_loop);
346 bs->copy_load_at(_masm, decorators, type, 8,
347 tmp1, Address(end_from, qword_count, Address::times_8, -24),
348 tmp2);
349 bs->copy_store_at(_masm, decorators, type, 8,
350 Address(end_to, qword_count, Address::times_8, -24), tmp1,
351 tmp2);
352 bs->copy_load_at(_masm, decorators, type, 8,
353 tmp1, Address(end_from, qword_count, Address::times_8, -16),
354 tmp2);
355 bs->copy_store_at(_masm, decorators, type, 8,
356 Address(end_to, qword_count, Address::times_8, -16), tmp1,
357 tmp2);
358 bs->copy_load_at(_masm, decorators, type, 8,
359 tmp1, Address(end_from, qword_count, Address::times_8, -8),
360 tmp2);
361 bs->copy_store_at(_masm, decorators, type, 8,
362 Address(end_to, qword_count, Address::times_8, -8), tmp1,
363 tmp2);
364 bs->copy_load_at(_masm, decorators, type, 8,
365 tmp1, Address(end_from, qword_count, Address::times_8, 0),
366 tmp2);
367 bs->copy_store_at(_masm, decorators, type, 8,
368 Address(end_to, qword_count, Address::times_8, 0), tmp1,
369 tmp2);
370
371 __ BIND(L_copy_bytes);
372 __ addptr(qword_count, 4);
373 __ jcc(Assembler::lessEqual, L_loop);
374 }
375 __ subptr(qword_count, 4);
376 __ jcc(Assembler::less, L_copy_8_bytes); // Copy trailing qwords
377 }
378
379
380 // Copy big chunks backward
381 //
382 // Inputs:
383 // from - source array address
384 // dest - destination array address
385 // qword_count - 64-bit element count
386 // tmp1, tmp2 - scratch
387 // L_copy_bytes - entry label
388 // L_copy_8_bytes - exit label
389 //
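// Callers jump to L_copy_bytes with a positive qword_count; here 'from' and
// 'dest' are the array base addresses and the copy proceeds from the high
// addresses down towards the base, 32 or 64 bytes per iteration, exiting to
// the caller's L_copy_8_bytes code for any remaining qwords.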
390 void StubGenerator::copy_bytes_backward(Register from, Register dest,
391 Register qword_count, Register tmp1,
392 Register tmp2, Label& L_copy_bytes,
393 Label& L_copy_8_bytes, DecoratorSet decorators,
394 BasicType type) {
395 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
396 DEBUG_ONLY(__ stop("enter at entry label, not here"));
397 Label L_loop;
398 __ align(OptoLoopAlignment);
399 if (UseUnalignedLoadStores) {
400 Label L_end;
401 __ BIND(L_loop);
402 if (UseAVX >= 2) {
403 bs->copy_load_at(_masm, decorators, type, 32,
404 xmm0, Address(from, qword_count, Address::times_8, 32),
405 tmp1, xmm1);
406 bs->copy_store_at(_masm, decorators, type, 32,
407 Address(dest, qword_count, Address::times_8, 32), xmm0,
408 tmp1, tmp2, xmm1);
409 bs->copy_load_at(_masm, decorators, type, 32,
410 xmm0, Address(from, qword_count, Address::times_8, 0),
411 tmp1, xmm1);
412 bs->copy_store_at(_masm, decorators, type, 32,
413 Address(dest, qword_count, Address::times_8, 0), xmm0,
414 tmp1, tmp2, xmm1);
415 } else {
416 bs->copy_load_at(_masm, decorators, type, 16,
417 xmm0, Address(from, qword_count, Address::times_8, 48),
418 tmp1, xmm1);
419 bs->copy_store_at(_masm, decorators, type, 16,
420 Address(dest, qword_count, Address::times_8, 48), xmm0,
421 tmp1, tmp2, xmm1);
422 bs->copy_load_at(_masm, decorators, type, 16,
423 xmm0, Address(from, qword_count, Address::times_8, 32),
424 tmp1, xmm1);
425 bs->copy_store_at(_masm, decorators, type, 16,
426 Address(dest, qword_count, Address::times_8, 32), xmm0,
427 tmp1, tmp2, xmm1);
428 bs->copy_load_at(_masm, decorators, type, 16,
429 xmm0, Address(from, qword_count, Address::times_8, 16),
430 tmp1, xmm1);
431 bs->copy_store_at(_masm, decorators, type, 16,
432 Address(dest, qword_count, Address::times_8, 16), xmm0,
433 tmp1, tmp2, xmm1);
434 bs->copy_load_at(_masm, decorators, type, 16,
435 xmm0, Address(from, qword_count, Address::times_8, 0),
436 tmp1, xmm1);
437 bs->copy_store_at(_masm, decorators, type, 16,
438 Address(dest, qword_count, Address::times_8, 0), xmm0,
439 tmp1, tmp2, xmm1);
440 }
441
442 __ BIND(L_copy_bytes);
443 __ subptr(qword_count, 8);
444 __ jcc(Assembler::greaterEqual, L_loop);
445
446 __ addptr(qword_count, 4); // add(8) and sub(4)
447 __ jcc(Assembler::less, L_end);
448 // Copy trailing 32 bytes
449 if (UseAVX >= 2) {
450 bs->copy_load_at(_masm, decorators, type, 32,
451 xmm0, Address(from, qword_count, Address::times_8, 0),
452 tmp1, xmm1);
453 bs->copy_store_at(_masm, decorators, type, 32,
454 Address(dest, qword_count, Address::times_8, 0), xmm0,
455 tmp1, tmp2, xmm1);
456 } else {
457 bs->copy_load_at(_masm, decorators, type, 16,
458 xmm0, Address(from, qword_count, Address::times_8, 16),
459 tmp1, xmm1);
460 bs->copy_store_at(_masm, decorators, type, 16,
461 Address(dest, qword_count, Address::times_8, 16), xmm0,
462 tmp1, tmp2, xmm1);
463 bs->copy_load_at(_masm, decorators, type, 16,
464 xmm0, Address(from, qword_count, Address::times_8, 0),
465 tmp1, xmm1);
466 bs->copy_store_at(_masm, decorators, type, 16,
467 Address(dest, qword_count, Address::times_8, 0), xmm0,
468 tmp1, tmp2, xmm1);
469 }
470 __ subptr(qword_count, 4);
471 __ BIND(L_end);
472 } else {
473 // Copy 32 bytes per iteration
474 __ BIND(L_loop);
475 bs->copy_load_at(_masm, decorators, type, 8,
476 tmp1, Address(from, qword_count, Address::times_8, 24),
477 tmp2);
478 bs->copy_store_at(_masm, decorators, type, 8,
479 Address(dest, qword_count, Address::times_8, 24), tmp1,
480 tmp2);
481 bs->copy_load_at(_masm, decorators, type, 8,
482 tmp1, Address(from, qword_count, Address::times_8, 16),
483 tmp2);
484 bs->copy_store_at(_masm, decorators, type, 8,
485 Address(dest, qword_count, Address::times_8, 16), tmp1,
486 tmp2);
487 bs->copy_load_at(_masm, decorators, type, 8,
488 tmp1, Address(from, qword_count, Address::times_8, 8),
489 tmp2);
490 bs->copy_store_at(_masm, decorators, type, 8,
491 Address(dest, qword_count, Address::times_8, 8), tmp1,
492 tmp2);
493 bs->copy_load_at(_masm, decorators, type, 8,
494 tmp1, Address(from, qword_count, Address::times_8, 0),
495 tmp2);
496 bs->copy_store_at(_masm, decorators, type, 8,
497 Address(dest, qword_count, Address::times_8, 0), tmp1,
498 tmp2);
499
500 __ BIND(L_copy_bytes);
501 __ subptr(qword_count, 4);
502 __ jcc(Assembler::greaterEqual, L_loop);
503 }
504 __ addptr(qword_count, 4);
505 __ jcc(Assembler::greater, L_copy_8_bytes); // Copy trailing qwords
506 }
507
508 #if COMPILER2_OR_JVMCI
509
510 // Note: The following rules apply to the AVX3-optimized arraycopy stubs:
511 // - If the target supports the AVX3 features (BW+VL+F), the implementation uses 32-byte vectors (YMMs)
512 // for both the special cases (various small block sizes) and the aligned copy loop. This is the
513 // default configuration.
514 // - If the copy length is above AVX3Threshold, the implementation uses 64-byte vectors (ZMMs)
515 // for the main copy loop (and the subsequent tail), since the bulk of the cycles are spent there.
516 // - If the user forces MaxVectorSize=32, then above 4096 bytes REP MOVS shows better
517 // performance for disjoint copies. For conjoint/backward copies the vector-based
518 // copy performs better.
519 // - If the user sets AVX3Threshold=0, the special cases for small block sizes operate on
520 // 64-byte vector registers (ZMMs).
521
522 // Inputs:
523 // c_rarg0 - source array address
524 // c_rarg1 - destination array address
525 // c_rarg2 - element count, treated as ssize_t, can be zero
526 //
527 //
528 // Side Effects:
529 // *entry is set to the no-overlap (post-push) entry point, which is
530 // used by generate_conjoint_[byte/int/short/long]_copy().
531 //
532 address StubGenerator::generate_disjoint_copy_avx3_masked(StubId stub_id, address* entry) {
533 // aligned is always false -- x86_64 always uses the unaligned code
534 const bool aligned = false;
535 int shift;
536 bool is_oop;
537 bool dest_uninitialized;
538
539 switch (stub_id) {
540 case StubId::stubgen_jbyte_disjoint_arraycopy_id:
541 shift = 0;
542 is_oop = false;
543 dest_uninitialized = false;
544 break;
545 case StubId::stubgen_jshort_disjoint_arraycopy_id:
546 shift = 1;
547 is_oop = false;
548 dest_uninitialized = false;
549 break;
550 case StubId::stubgen_jint_disjoint_arraycopy_id:
551 shift = 2;
552 is_oop = false;
553 dest_uninitialized = false;
554 break;
555 case StubId::stubgen_jlong_disjoint_arraycopy_id:
556 shift = 3;
557 is_oop = false;
558 dest_uninitialized = false;
559 break;
560 case StubId::stubgen_oop_disjoint_arraycopy_id:
561 shift = (UseCompressedOops ? 2 : 3);
562 is_oop = true;
563 dest_uninitialized = false;
564 break;
565 case StubId::stubgen_oop_disjoint_arraycopy_uninit_id:
566 shift = (UseCompressedOops ? 2 : 3);
567 is_oop = true;
568 dest_uninitialized = true;
569 break;
570 default:
571 ShouldNotReachHere();
572 }
573
574 __ align(CodeEntryAlignment);
575 StubCodeMark mark(this, stub_id);
576 address start = __ pc();
577
578 int avx3threshold = VM_Version::avx3_threshold();
579 bool use64byteVector = (MaxVectorSize > 32) && (avx3threshold == 0);
580 const int large_threshold = 2621440; // 2.5 MB
581 Label L_main_loop, L_main_loop_64bytes, L_tail, L_tail64, L_exit, L_entry;
582 Label L_repmovs, L_main_pre_loop, L_main_pre_loop_64bytes, L_pre_main_post_64;
583 Label L_copy_large, L_finish;
584 const Register from = rdi; // source array address
585 const Register to = rsi; // destination array address
586 const Register count = rdx; // elements count
587 const Register temp1 = r8;
588 const Register temp2 = r11;
589 const Register temp3 = rax;
590 const Register temp4 = rcx;
591 // End pointers are inclusive, and if count is not zero they point
592 // to the last unit copied: end_to[0] := end_from[0]
593
594 __ enter(); // required for proper stackwalking of RuntimeStub frame
595 assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int.
596
597 if (entry != nullptr) {
598 *entry = __ pc();
599 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
600 BLOCK_COMMENT("Entry:");
601 }
602
603 BasicType type_vec[] = { T_BYTE, T_SHORT, T_INT, T_LONG};
604 BasicType type = is_oop ? T_OBJECT : type_vec[shift];
605
606 setup_argument_regs(type);
607
608 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
609 if (dest_uninitialized) {
610 decorators |= IS_DEST_UNINITIALIZED;
611 }
612 if (aligned) {
613 decorators |= ARRAYCOPY_ALIGNED;
614 }
615 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
616 bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
617
618 {
619 // Type(shift) byte(0), short(1), int(2), long(3)
620 int loop_size[] = { 192, 96, 48, 24};
621 int threshold[] = { 4096, 2048, 1024, 512};
622
623 // UnsafeMemoryAccess page error: continue after unsafe access
624 UnsafeMemoryAccessMark umam(this, !is_oop && !aligned, true);
625 // 'from', 'to' and 'count' are now valid
626
627 // temp1 holds the remaining element count and temp4 holds the running count
628 // used to compute the next address offset into the to/from arrays (temp4 * scale).
629 __ mov64(temp4, 0);
630 __ movq(temp1, count);
631
632 // Zero length check.
633 __ BIND(L_tail);
634 __ cmpq(temp1, 0);
635 __ jcc(Assembler::lessEqual, L_exit);
636
637 // Special cases using 32 byte [masked] vector copy operations.
638 arraycopy_avx3_special_cases(xmm1, k2, from, to, temp1, shift,
639 temp4, temp3, use64byteVector, L_entry, L_exit);
640
641 // PRE-MAIN-POST loop for aligned copy.
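// The structure below is: a masked partial copy first brings the destination
// up to 32-byte (or 64-byte) alignment, the main loop then moves
// loop_size[shift] elements (192 bytes) per iteration, and whatever is left
// over is finished by re-entering the special-case handling via L_tail.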
642 __ BIND(L_entry);
643
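// Copies of at least large_threshold (2.5 MB) bytes are routed to
// arraycopy_avx3_large, which uses prefetching and non-temporal stores.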
644 if (MaxVectorSize == 64) {
645 __ movq(temp2, temp1);
646 __ shlq(temp2, shift);
647 __ cmpq(temp2, large_threshold);
648 __ jcc(Assembler::greaterEqual, L_copy_large);
649 }
650 if (avx3threshold != 0) {
651 __ cmpq(count, threshold[shift]);
652 if (MaxVectorSize == 64) {
653 // Copy using 64 byte vectors.
654 __ jcc(Assembler::greaterEqual, L_pre_main_post_64);
655 } else {
656 assert(MaxVectorSize < 64, "vector size should be < 64 bytes");
657 // REP MOVS offers a faster copy path.
658 __ jcc(Assembler::greaterEqual, L_repmovs);
659 }
660 }
661
662 if ((MaxVectorSize < 64) || (avx3threshold != 0)) {
663 // Partial copy to make dst address 32 byte aligned.
664 __ movq(temp2, to);
665 __ andq(temp2, 31);
666 __ jcc(Assembler::equal, L_main_pre_loop);
667
668 __ negptr(temp2);
669 __ addq(temp2, 32);
670 if (shift) {
671 __ shrq(temp2, shift);
672 }
673 __ movq(temp3, temp2);
674 copy32_masked_avx(to, from, xmm1, k2, temp3, temp4, temp1, shift);
675 __ movq(temp4, temp2);
676 __ movq(temp1, count);
677 __ subq(temp1, temp2);
678
679 __ cmpq(temp1, loop_size[shift]);
680 __ jcc(Assembler::less, L_tail);
681
682 __ BIND(L_main_pre_loop);
683 __ subq(temp1, loop_size[shift]);
684
685 // Main loop with aligned copy block size of 192 bytes at 32 byte granularity.
686 __ align32();
687 __ BIND(L_main_loop);
688 copy64_avx(to, from, temp4, xmm1, false, shift, 0);
689 copy64_avx(to, from, temp4, xmm1, false, shift, 64);
690 copy64_avx(to, from, temp4, xmm1, false, shift, 128);
691 __ addptr(temp4, loop_size[shift]);
692 __ subq(temp1, loop_size[shift]);
693 __ jcc(Assembler::greater, L_main_loop);
694
695 __ addq(temp1, loop_size[shift]);
696
697 // Tail loop.
698 __ jmp(L_tail);
699
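// REP MOVSQ path: temp2 (element count) is scaled down to a qword count for
// rep_mov(), then scaled back to the number of elements actually copied so
// that temp1 can be set to the remaining sub-qword tail, which is finished
// by the masked tail code at L_tail.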
700 __ BIND(L_repmovs);
701 __ movq(temp2, temp1);
702 // Swap to(RSI) and from(RDI) addresses to comply with REP MOVs semantics.
703 __ movq(temp3, to);
704 __ movq(to, from);
705 __ movq(from, temp3);
706 // Save to/from for restoration post rep_mov.
707 __ movq(temp1, to);
708 __ movq(temp3, from);
709 if(shift < 3) {
710 __ shrq(temp2, 3-shift); // quad word count
711 }
712 __ movq(temp4, temp2); // move quad word count into temp4(RCX).
713 __ rep_mov();
714 __ shlq(temp2, 3); // convert quad words into byte count.
715 if(shift) {
716 __ shrq(temp2, shift); // type specific count.
717 }
718 // Restore original addresses in to/from.
719 __ movq(to, temp3);
720 __ movq(from, temp1);
721 __ movq(temp4, temp2);
722 __ movq(temp1, count);
723 __ subq(temp1, temp2); // trailing part (less than a quad word size).
724 __ jmp(L_tail);
725 }
726
727 if (MaxVectorSize > 32) {
728 __ BIND(L_pre_main_post_64);
729 // Partial copy to make dst address 64 byte aligned.
730 __ movq(temp2, to);
731 __ andq(temp2, 63);
732 __ jcc(Assembler::equal, L_main_pre_loop_64bytes);
733
734 __ negptr(temp2);
735 __ addq(temp2, 64);
736 if (shift) {
737 __ shrq(temp2, shift);
738 }
739 __ movq(temp3, temp2);
740 copy64_masked_avx(to, from, xmm1, k2, temp3, temp4, temp1, shift, 0 , true);
741 __ movq(temp4, temp2);
742 __ movq(temp1, count);
743 __ subq(temp1, temp2);
744
745 __ cmpq(temp1, loop_size[shift]);
746 __ jcc(Assembler::less, L_tail64);
747
748 __ BIND(L_main_pre_loop_64bytes);
749 __ subq(temp1, loop_size[shift]);
750
751 // Main loop with aligned copy block size of 192 bytes at
752 // 64 byte copy granularity.
753 __ align32();
754 __ BIND(L_main_loop_64bytes);
755 copy64_avx(to, from, temp4, xmm1, false, shift, 0 , true);
756 copy64_avx(to, from, temp4, xmm1, false, shift, 64, true);
757 copy64_avx(to, from, temp4, xmm1, false, shift, 128, true);
758 __ addptr(temp4, loop_size[shift]);
759 __ subq(temp1, loop_size[shift]);
760 __ jcc(Assembler::greater, L_main_loop_64bytes);
761
762 __ addq(temp1, loop_size[shift]);
763 // Zero length check.
764 __ jcc(Assembler::lessEqual, L_exit);
765
766 __ BIND(L_tail64);
767
768 // Tail handling using 64 byte [masked] vector copy operations.
769 use64byteVector = true;
770 arraycopy_avx3_special_cases(xmm1, k2, from, to, temp1, shift,
771 temp4, temp3, use64byteVector, L_entry, L_exit);
772 }
773 __ BIND(L_exit);
774 }
775
776 __ BIND(L_finish);
777 address ucme_exit_pc = __ pc();
778 // When called from generic_arraycopy, r11 contains specific values
779 // used during the arraycopy epilogue, so re-initialize r11 here.
780 if (is_oop) {
781 __ movq(r11, shift == 3 ? count : to);
782 }
783 bs->arraycopy_epilogue(_masm, decorators, type, from, to, count);
784 restore_argument_regs(type);
785 INC_COUNTER_NP(get_profile_ctr(shift), rscratch1); // Update counter after rscratch1 is free
786 __ xorptr(rax, rax); // return 0
787 __ vzeroupper();
788 __ leave(); // required for proper stackwalking of RuntimeStub frame
789 __ ret(0);
790
791 if (MaxVectorSize == 64) {
792 __ BIND(L_copy_large);
793 UnsafeMemoryAccessMark umam(this, !is_oop && !aligned, false, ucme_exit_pc);
794 arraycopy_avx3_large(to, from, temp1, temp2, temp3, temp4, count, xmm1, xmm2, xmm3, xmm4, shift);
795 __ jmp(L_finish);
796 }
797 return start;
798 }
799
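// Copy routine for very large lengths (>= large_threshold bytes): align the
// destination to 64 bytes, then copy 256 bytes per iteration with ZMM loads,
// prefetches well ahead of the copy cursor, and non-temporal stores (followed
// by an sfence), so that huge copies do not displace the cache working set.
// The remainder is handled by the 256-byte special-case code.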
800 void StubGenerator::arraycopy_avx3_large(Register to, Register from, Register temp1, Register temp2,
801 Register temp3, Register temp4, Register count,
802 XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3,
803 XMMRegister xmm4, int shift) {
804
805 // Type(shift) byte(0), short(1), int(2), long(3)
806 int loop_size[] = { 256, 128, 64, 32};
807 int threshold[] = { 4096, 2048, 1024, 512};
808
809 Label L_main_loop_large;
810 Label L_tail_large;
811 Label L_exit_large;
812 Label L_entry_large;
813 Label L_main_pre_loop_large;
814 Label L_pre_main_post_large;
815
816 assert(MaxVectorSize == 64, "vector length != 64");
817 __ BIND(L_entry_large);
818
819 __ BIND(L_pre_main_post_large);
820 // Partial copy to make dst address 64 byte aligned.
821 __ movq(temp2, to);
822 __ andq(temp2, 63);
823 __ jcc(Assembler::equal, L_main_pre_loop_large);
824
825 __ negptr(temp2);
826 __ addq(temp2, 64);
827 if (shift) {
828 __ shrq(temp2, shift);
829 }
830 __ movq(temp3, temp2);
831 copy64_masked_avx(to, from, xmm1, k2, temp3, temp4, temp1, shift, 0, true);
832 __ movq(temp4, temp2);
833 __ movq(temp1, count);
834 __ subq(temp1, temp2);
835
836 __ cmpq(temp1, loop_size[shift]);
837 __ jcc(Assembler::less, L_tail_large);
838
839 __ BIND(L_main_pre_loop_large);
840 __ subq(temp1, loop_size[shift]);
841
842 // Main loop with aligned copy block size of 256 bytes at 64 byte copy granularity.
843 __ align32();
844 __ BIND(L_main_loop_large);
845 copy256_avx3(to, from, temp4, xmm1, xmm2, xmm3, xmm4, shift, 0);
846 __ addptr(temp4, loop_size[shift]);
847 __ subq(temp1, loop_size[shift]);
848 __ jcc(Assembler::greater, L_main_loop_large);
849 // fence needed because copy256_avx3 uses non-temporal stores
850 __ sfence();
851
852 __ addq(temp1, loop_size[shift]);
853 // Zero length check.
854 __ jcc(Assembler::lessEqual, L_exit_large);
855 __ BIND(L_tail_large);
856 // Tail handling using 64 byte [masked] vector copy operations.
857 __ cmpq(temp1, 0);
858 __ jcc(Assembler::lessEqual, L_exit_large);
859 arraycopy_avx3_special_cases_256(xmm1, k2, from, to, temp1, shift,
860 temp4, temp3, L_exit_large);
861 __ BIND(L_exit_large);
862 }
863
864 // Inputs:
865 // c_rarg0 - source array address
866 // c_rarg1 - destination array address
867 // c_rarg2 - element count, treated as ssize_t, can be zero
868 //
869 //
870 address StubGenerator::generate_conjoint_copy_avx3_masked(StubId stub_id, address* entry, address nooverlap_target) {
871 // aligned is always false -- x86_64 always uses the unaligned code
872 const bool aligned = false;
873 int shift;
874 bool is_oop;
875 bool dest_uninitialized;
876
877 switch (stub_id) {
878 case StubId::stubgen_jbyte_arraycopy_id:
879 shift = 0;
880 is_oop = false;
881 dest_uninitialized = false;
882 break;
883 case StubId::stubgen_jshort_arraycopy_id:
884 shift = 1;
885 is_oop = false;
886 dest_uninitialized = false;
887 break;
888 case StubId::stubgen_jint_arraycopy_id:
889 shift = 2;
890 is_oop = false;
891 dest_uninitialized = false;
892 break;
893 case StubId::stubgen_jlong_arraycopy_id:
894 shift = 3;
895 is_oop = false;
896 dest_uninitialized = false;
897 break;
898 case StubId::stubgen_oop_arraycopy_id:
899 shift = (UseCompressedOops ? 2 : 3);
900 is_oop = true;
901 dest_uninitialized = false;
902 break;
903 case StubId::stubgen_oop_arraycopy_uninit_id:
904 shift = (UseCompressedOops ? 2 : 3);
905 is_oop = true;
906 dest_uninitialized = true;
907 break;
908 default:
909 ShouldNotReachHere();
910 }
911
912 __ align(CodeEntryAlignment);
913 StubCodeMark mark(this, stub_id);
914 address start = __ pc();
915
916 int avx3threshold = VM_Version::avx3_threshold();
917 bool use64byteVector = (MaxVectorSize > 32) && (avx3threshold == 0);
918
919 Label L_main_pre_loop, L_main_pre_loop_64bytes, L_pre_main_post_64;
920 Label L_main_loop, L_main_loop_64bytes, L_tail, L_tail64, L_exit, L_entry;
921 const Register from = rdi; // source array address
922 const Register to = rsi; // destination array address
923 const Register count = rdx; // elements count
924 const Register temp1 = r8;
925 const Register temp2 = rcx;
926 const Register temp3 = r11;
927 const Register temp4 = rax;
928 // End pointers are inclusive, and if count is not zero they point
929 // to the last unit copied: end_to[0] := end_from[0]
930
931 __ enter(); // required for proper stackwalking of RuntimeStub frame
932 assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int.
933
934 if (entry != nullptr) {
935 *entry = __ pc();
936 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
937 BLOCK_COMMENT("Entry:");
938 }
939
940 array_overlap_test(nooverlap_target, (Address::ScaleFactor)(shift));
941
942 BasicType type_vec[] = { T_BYTE, T_SHORT, T_INT, T_LONG};
943 BasicType type = is_oop ? T_OBJECT : type_vec[shift];
944
945 setup_argument_regs(type);
946
947 DecoratorSet decorators = IN_HEAP | IS_ARRAY;
948 if (dest_uninitialized) {
949 decorators |= IS_DEST_UNINITIALIZED;
950 }
951 if (aligned) {
952 decorators |= ARRAYCOPY_ALIGNED;
953 }
954 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
955 bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
956 {
957 // Type(shift) byte(0), short(1), int(2), long(3)
958 int loop_size[] = { 192, 96, 48, 24};
959 int threshold[] = { 4096, 2048, 1024, 512};
960
961 // UnsafeMemoryAccess page error: continue after unsafe access
962 UnsafeMemoryAccessMark umam(this, !is_oop && !aligned, true);
963 // 'from', 'to' and 'count' are now valid
964
965 // temp1 holds remaining count.
966 __ movq(temp1, count);
967
968 // Zero length check.
969 __ BIND(L_tail);
970 __ cmpq(temp1, 0);
971 __ jcc(Assembler::lessEqual, L_exit);
972
973 __ mov64(temp2, 0);
974 __ movq(temp3, temp1);
975 // Special cases using 32 byte [masked] vector copy operations.
976 arraycopy_avx3_special_cases_conjoint(xmm1, k2, from, to, temp2, temp3, temp1, shift,
977 temp4, use64byteVector, L_entry, L_exit);
978
979 // PRE-MAIN-POST loop for aligned copy.
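// The conjoint copy works from the top of the range downwards: a masked
// partial copy first aligns the top of the remaining destination range to
// 32 (or 64) bytes, the main loop then moves loop_size[shift] elements
// (192 bytes) per iteration towards the base, and the remainder re-enters
// the special-case handling via L_tail.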
980 __ BIND(L_entry);
981
982 if ((MaxVectorSize > 32) && (avx3threshold != 0)) {
983 __ cmpq(temp1, threshold[shift]);
984 __ jcc(Assembler::greaterEqual, L_pre_main_post_64);
985 }
986
987 if ((MaxVectorSize < 64) || (avx3threshold != 0)) {
988 // Partial copy to make dst address 32 byte aligned.
989 __ leaq(temp2, Address(to, temp1, (Address::ScaleFactor)(shift), 0));
990 __ andq(temp2, 31);
991 __ jcc(Assembler::equal, L_main_pre_loop);
992
993 if (shift) {
994 __ shrq(temp2, shift);
995 }
996 __ subq(temp1, temp2);
997 copy32_masked_avx(to, from, xmm1, k2, temp2, temp1, temp3, shift);
998
999 __ cmpq(temp1, loop_size[shift]);
1000 __ jcc(Assembler::less, L_tail);
1001
1002 __ BIND(L_main_pre_loop);
1003
1004 // Main loop with aligned copy block size of 192 bytes at 32 byte granularity.
1005 __ align32();
1006 __ BIND(L_main_loop);
1007 copy64_avx(to, from, temp1, xmm1, true, shift, -64);
1008 copy64_avx(to, from, temp1, xmm1, true, shift, -128);
1009 copy64_avx(to, from, temp1, xmm1, true, shift, -192);
1010 __ subptr(temp1, loop_size[shift]);
1011 __ cmpq(temp1, loop_size[shift]);
1012 __ jcc(Assembler::greater, L_main_loop);
1013
1014 // Tail loop.
1015 __ jmp(L_tail);
1016 }
1017
1018 if (MaxVectorSize > 32) {
1019 __ BIND(L_pre_main_post_64);
1020 // Partial copy to make dst address 64 byte aligned.
1021 __ leaq(temp2, Address(to, temp1, (Address::ScaleFactor)(shift), 0));
1022 __ andq(temp2, 63);
1023 __ jcc(Assembler::equal, L_main_pre_loop_64bytes);
1024
1025 if (shift) {
1026 __ shrq(temp2, shift);
1027 }
1028 __ subq(temp1, temp2);
1029 copy64_masked_avx(to, from, xmm1, k2, temp2, temp1, temp3, shift, 0 , true);
1030
1031 __ cmpq(temp1, loop_size[shift]);
1032 __ jcc(Assembler::less, L_tail64);
1033
1034 __ BIND(L_main_pre_loop_64bytes);
1035
1036 // Main loop with aligned copy block size of 192 bytes at
1037 // 64 byte copy granularity.
1038 __ align32();
1039 __ BIND(L_main_loop_64bytes);
1040 copy64_avx(to, from, temp1, xmm1, true, shift, -64 , true);
1041 copy64_avx(to, from, temp1, xmm1, true, shift, -128, true);
1042 copy64_avx(to, from, temp1, xmm1, true, shift, -192, true);
1043 __ subq(temp1, loop_size[shift]);
1044 __ cmpq(temp1, loop_size[shift]);
1045 __ jcc(Assembler::greater, L_main_loop_64bytes);
1046
1047 // Zero length check.
1048 __ cmpq(temp1, 0);
1049 __ jcc(Assembler::lessEqual, L_exit);
1050
1051 __ BIND(L_tail64);
1052
1053 // Tail handling using 64 byte [masked] vector copy operations.
1054 use64byteVector = true;
1055 __ mov64(temp2, 0);
1056 __ movq(temp3, temp1);
1057 arraycopy_avx3_special_cases_conjoint(xmm1, k2, from, to, temp2, temp3, temp1, shift,
1058 temp4, use64byteVector, L_entry, L_exit);
1059 }
1060 __ BIND(L_exit);
1061 }
1062 address ucme_exit_pc = __ pc();
1063 // When called from generic_arraycopy, r11 contains specific values
1064 // used during the arraycopy epilogue, so re-initialize r11 here.
1065 if (is_oop) {
1066 __ movq(r11, count);
1067 }
1068 bs->arraycopy_epilogue(_masm, decorators, type, from, to, count);
1069 restore_argument_regs(type);
1070 INC_COUNTER_NP(get_profile_ctr(shift), rscratch1); // Update counter after rscratch1 is free
1071 __ xorptr(rax, rax); // return 0
1072 __ vzeroupper();
1073 __ leave(); // required for proper stackwalking of RuntimeStub frame
1074 __ ret(0);
1075
1076 return start;
1077 }
1078
1079 void StubGenerator::arraycopy_avx3_special_cases(XMMRegister xmm, KRegister mask, Register from,
1080 Register to, Register count, int shift,
1081 Register index, Register temp,
1082 bool use64byteVector, Label& L_entry, Label& L_exit) {
1083 Label L_entry_64, L_entry_96, L_entry_128;
1084 Label L_entry_160, L_entry_192;
1085
1086 int size_mat[][6] = {
1087 /* T_BYTE */ {32 , 64, 96 , 128 , 160 , 192 },
1088 /* T_SHORT*/ {16 , 32, 48 , 64 , 80 , 96 },
1089 /* T_INT */ {8 , 16, 24 , 32 , 40 , 48 },
1090 /* T_LONG */ {4 , 8, 12 , 16 , 20 , 24 }
1091 };
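// size_mat[shift][i] is the element count that corresponds to (i + 1) * 32
// bytes for the given element size, so each step of the cascade below
// compares the remaining count against the next 32-byte boundary and emits
// a copy of exactly that size.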
1092
1093 // Case A) Special case for length less than or equal to 32 bytes.
1094 __ cmpq(count, size_mat[shift][0]);
1095 __ jccb(Assembler::greater, L_entry_64);
1096 copy32_masked_avx(to, from, xmm, mask, count, index, temp, shift);
1097 __ jmp(L_exit);
1098
1099 // Case B) Special case for length less than or equal to 64 bytes.
1100 __ BIND(L_entry_64);
1101 __ cmpq(count, size_mat[shift][1]);
1102 __ jccb(Assembler::greater, L_entry_96);
1103 copy64_masked_avx(to, from, xmm, mask, count, index, temp, shift, 0, use64byteVector);
1104 __ jmp(L_exit);
1105
1106 // Case C) Special case for length less than or equal to 96 bytes.
1107 __ BIND(L_entry_96);
1108 __ cmpq(count, size_mat[shift][2]);
1109 __ jccb(Assembler::greater, L_entry_128);
1110 copy64_avx(to, from, index, xmm, false, shift, 0, use64byteVector);
1111 __ subq(count, 64 >> shift);
1112 copy32_masked_avx(to, from, xmm, mask, count, index, temp, shift, 64);
1113 __ jmp(L_exit);
1114
1115 // Case D) Special case for length less than or equal to 128 bytes.
1116 __ BIND(L_entry_128);
1117 __ cmpq(count, size_mat[shift][3]);
1118 __ jccb(Assembler::greater, L_entry_160);
1119 copy64_avx(to, from, index, xmm, false, shift, 0, use64byteVector);
1120 copy32_avx(to, from, index, xmm, shift, 64);
1121 __ subq(count, 96 >> shift);
1122 copy32_masked_avx(to, from, xmm, mask, count, index, temp, shift, 96);
1123 __ jmp(L_exit);
1124
1125 // Case E) Special case for length less than or equal to 160 bytes.
1126 __ BIND(L_entry_160);
1127 __ cmpq(count, size_mat[shift][4]);
1128 __ jccb(Assembler::greater, L_entry_192);
1129 copy64_avx(to, from, index, xmm, false, shift, 0, use64byteVector);
1130 copy64_avx(to, from, index, xmm, false, shift, 64, use64byteVector);
1131 __ subq(count, 128 >> shift);
1132 copy32_masked_avx(to, from, xmm, mask, count, index, temp, shift, 128);
1133 __ jmp(L_exit);
1134
1135 // Case F) Special case for length less than or equal to 192 bytes.
1136 __ BIND(L_entry_192);
1137 __ cmpq(count, size_mat[shift][5]);
1138 __ jcc(Assembler::greater, L_entry);
1139 copy64_avx(to, from, index, xmm, false, shift, 0, use64byteVector);
1140 copy64_avx(to, from, index, xmm, false, shift, 64, use64byteVector);
1141 copy32_avx(to, from, index, xmm, shift, 128);
1142 __ subq(count, 160 >> shift);
1143 copy32_masked_avx(to, from, xmm, mask, count, index, temp, shift, 160);
1144 __ jmp(L_exit);
1145 }
1146
1147 void StubGenerator::arraycopy_avx3_special_cases_256(XMMRegister xmm, KRegister mask, Register from,
1148 Register to, Register count, int shift, Register index,
1149 Register temp, Label& L_exit) {
1150 Label L_entry_64, L_entry_128, L_entry_192, L_entry_256;
1151
1152 int size_mat[][4] = {
1153 /* T_BYTE */ {64, 128, 192, 256},
1154 /* T_SHORT*/ {32, 64 , 96 , 128},
1155 /* T_INT */ {16, 32 , 48 , 64},
1156 /* T_LONG */ { 8, 16 , 24 , 32}
1157 };
1158
1159 assert(MaxVectorSize == 64, "vector length != 64");
1160 // Case A) Special case for length less than or equal to 64 bytes.
1161 __ BIND(L_entry_64);
1162 __ cmpq(count, size_mat[shift][0]);
1163 __ jccb(Assembler::greater, L_entry_128);
1164 copy64_masked_avx(to, from, xmm, mask, count, index, temp, shift, 0, true);
1165 __ jmp(L_exit);
1166
1167 // Case B) Special case for length less than or equal to 128 bytes.
1168 __ BIND(L_entry_128);
1169 __ cmpq(count, size_mat[shift][1]);
1170 __ jccb(Assembler::greater, L_entry_192);
1171 copy64_avx(to, from, index, xmm, false, shift, 0, true);
1172 __ subq(count, 64 >> shift);
1173 copy64_masked_avx(to, from, xmm, mask, count, index, temp, shift, 64, true);
1174 __ jmp(L_exit);
1175
1176 // Case C) Special case for length less than or equal to 192 bytes.
1177 __ BIND(L_entry_192);
1178 __ cmpq(count, size_mat[shift][2]);
1179 __ jcc(Assembler::greater, L_entry_256);
1180 copy64_avx(to, from, index, xmm, false, shift, 0, true);
1181 copy64_avx(to, from, index, xmm, false, shift, 64, true);
1182 __ subq(count, 128 >> shift);
1183 copy64_masked_avx(to, from, xmm, mask, count, index, temp, shift, 128, true);
1184 __ jmp(L_exit);
1185
1186 // Case D) Special case for length less than or equal to 256 bytes.
1187 __ BIND(L_entry_256);
1188 copy64_avx(to, from, index, xmm, false, shift, 0, true);
1189 copy64_avx(to, from, index, xmm, false, shift, 64, true);
1190 copy64_avx(to, from, index, xmm, false, shift, 128, true);
1191 __ subq(count, 192 >> shift);
1192 copy64_masked_avx(to, from, xmm, mask, count, index, temp, shift, 192, true);
1193 __ jmp(L_exit);
1194 }
1195
1196 void StubGenerator::arraycopy_avx3_special_cases_conjoint(XMMRegister xmm, KRegister mask, Register from,
1197 Register to, Register start_index, Register end_index,
1198 Register count, int shift, Register temp,
1199 bool use64byteVector, Label& L_entry, Label& L_exit) {
1200 Label L_entry_64, L_entry_96, L_entry_128;
1201 Label L_entry_160, L_entry_192;
1202 bool avx3 = (MaxVectorSize > 32) && (VM_Version::avx3_threshold() == 0);
1203
1204 int size_mat[][6] = {
1205 /* T_BYTE */ {32 , 64, 96 , 128 , 160 , 192 },
1206 /* T_SHORT*/ {16 , 32, 48 , 64 , 80 , 96 },
1207 /* T_INT */ {8 , 16, 24 , 32 , 40 , 48 },
1208 /* T_LONG */ {4 , 8, 12 , 16 , 20 , 24 }
1209 };
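// Same size-based dispatch as the disjoint special cases, but the chunks are
// copied from the top of the range downwards: whole 32/64-byte blocks are
// addressed via end_index with negative offsets, and the final sub-32-byte
// piece is a masked copy at start_index.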
1210
1211 // Case A) Special case for length less than or equal to 32 bytes.
1212 __ cmpq(count, size_mat[shift][0]);
1213 __ jccb(Assembler::greater, L_entry_64);
1214 copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
1215 __ jmp(L_exit);
1216
1217 // Case B) Special case for length less than or equal to 64 bytes.
1218 __ BIND(L_entry_64);
1219 __ cmpq(count, size_mat[shift][1]);
1220 __ jccb(Assembler::greater, L_entry_96);
1221 if (avx3) {
1222 copy64_masked_avx(to, from, xmm, mask, count, start_index, temp, shift, 0, true);
1223 } else {
1224 copy32_avx(to, from, end_index, xmm, shift, -32);
1225 __ subq(count, 32 >> shift);
1226 copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
1227 }
1228 __ jmp(L_exit);
1229
1230 // Case C) Special case for length less than or equal to 96 bytes.
1231 __ BIND(L_entry_96);
1232 __ cmpq(count, size_mat[shift][2]);
1233 __ jccb(Assembler::greater, L_entry_128);
1234 copy64_avx(to, from, end_index, xmm, true, shift, -64, use64byteVector);
1235 __ subq(count, 64 >> shift);
1236 copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
1237 __ jmp(L_exit);
1238
1239 // Case D) Special case for length less than or equal to 128 bytes.
1240 __ BIND(L_entry_128);
1241 __ cmpq(count, size_mat[shift][3]);
1242 __ jccb(Assembler::greater, L_entry_160);
1243 copy64_avx(to, from, end_index, xmm, true, shift, -64, use64byteVector);
1244 copy32_avx(to, from, end_index, xmm, shift, -96);
1245 __ subq(count, 96 >> shift);
1246 copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
1247 __ jmp(L_exit);
1248
1249 // Case E) Special case for length less than or equal to 160 bytes.
1250 __ BIND(L_entry_160);
1251 __ cmpq(count, size_mat[shift][4]);
1252 __ jccb(Assembler::greater, L_entry_192);
1253 copy64_avx(to, from, end_index, xmm, true, shift, -64, use64byteVector);
1254 copy64_avx(to, from, end_index, xmm, true, shift, -128, use64byteVector);
1255 __ subq(count, 128 >> shift);
1256 copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
1257 __ jmp(L_exit);
1258
1259 // Case F) Special case for length less than or equal to 192 bytes.
1260 __ BIND(L_entry_192);
1261 __ cmpq(count, size_mat[shift][5]);
1262 __ jcc(Assembler::greater, L_entry);
1263 copy64_avx(to, from, end_index, xmm, true, shift, -64, use64byteVector);
1264 copy64_avx(to, from, end_index, xmm, true, shift, -128, use64byteVector);
1265 copy32_avx(to, from, end_index, xmm, shift, -160);
1266 __ subq(count, 160 >> shift);
1267 copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
1268 __ jmp(L_exit);
1269 }
1270
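// Copy 256 bytes per call using four ZMM registers: prefetch the data 512
// and 1024 bytes ahead of the current position, load 4 x 64 bytes, then
// write them back with non-temporal (streaming) stores.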
1271 void StubGenerator::copy256_avx3(Register dst, Register src, Register index, XMMRegister xmm1,
1272 XMMRegister xmm2, XMMRegister xmm3, XMMRegister xmm4,
1273 int shift, int offset) {
1274 if (MaxVectorSize == 64) {
1275 Address::ScaleFactor scale = (Address::ScaleFactor)(shift);
1276 __ prefetcht0(Address(src, index, scale, offset + 0x200));
1277 __ prefetcht0(Address(src, index, scale, offset + 0x240));
1278 __ prefetcht0(Address(src, index, scale, offset + 0x280));
1279 __ prefetcht0(Address(src, index, scale, offset + 0x2C0));
1280
1281 __ prefetcht0(Address(src, index, scale, offset + 0x400));
1282 __ prefetcht0(Address(src, index, scale, offset + 0x440));
1283 __ prefetcht0(Address(src, index, scale, offset + 0x480));
1284 __ prefetcht0(Address(src, index, scale, offset + 0x4C0));
1285
1286 __ evmovdquq(xmm1, Address(src, index, scale, offset), Assembler::AVX_512bit);
1287 __ evmovdquq(xmm2, Address(src, index, scale, offset + 0x40), Assembler::AVX_512bit);
1288 __ evmovdquq(xmm3, Address(src, index, scale, offset + 0x80), Assembler::AVX_512bit);
1289 __ evmovdquq(xmm4, Address(src, index, scale, offset + 0xC0), Assembler::AVX_512bit);
1290
1291 __ evmovntdquq(Address(dst, index, scale, offset), xmm1, Assembler::AVX_512bit);
1292 __ evmovntdquq(Address(dst, index, scale, offset + 0x40), xmm2, Assembler::AVX_512bit);
1293 __ evmovntdquq(Address(dst, index, scale, offset + 0x80), xmm3, Assembler::AVX_512bit);
1294 __ evmovntdquq(Address(dst, index, scale, offset + 0xC0), xmm4, Assembler::AVX_512bit);
1295 }
1296 }
1297
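// Copy up to 64 bytes under a mask of 'length' elements. Without 64-byte
// vectors this is an unmasked 32-byte copy followed by a masked copy of the
// remaining elements; with 64-byte vectors it is a single masked ZMM
// load/store pair.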
1298 void StubGenerator::copy64_masked_avx(Register dst, Register src, XMMRegister xmm,
1299 KRegister mask, Register length, Register index,
1300 Register temp, int shift, int offset,
1301 bool use64byteVector) {
1302 BasicType type[] = { T_BYTE, T_SHORT, T_INT, T_LONG};
1303 assert(MaxVectorSize >= 32, "vector length should be >= 32");
1304 if (!use64byteVector) {
1305 copy32_avx(dst, src, index, xmm, shift, offset);
1306 __ subptr(length, 32 >> shift);
1307 copy32_masked_avx(dst, src, xmm, mask, length, index, temp, shift, offset+32);
1308 } else {
1309 Address::ScaleFactor scale = (Address::ScaleFactor)(shift);
1310 assert(MaxVectorSize == 64, "vector length != 64");
1311 __ mov64(temp, -1L);
1312 __ bzhiq(temp, temp, length);
1313 __ kmovql(mask, temp);
1314 __ evmovdqu(type[shift], mask, xmm, Address(src, index, scale, offset), false, Assembler::AVX_512bit);
1315 __ evmovdqu(type[shift], mask, Address(dst, index, scale, offset), xmm, true, Assembler::AVX_512bit);
1316 }
1317 }
1318
1319
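// Copy up to 32 bytes under a mask: build a mask with the low 'length' bits
// set (mov -1, bzhi by length, kmov into the mask register) and use it for a
// masked 256-bit load and store, so short tails are copied without branching
// on the exact length.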
1320 void StubGenerator::copy32_masked_avx(Register dst, Register src, XMMRegister xmm,
1321 KRegister mask, Register length, Register index,
1322 Register temp, int shift, int offset) {
1323 assert(MaxVectorSize >= 32, "vector length should be >= 32");
1324 BasicType type[] = { T_BYTE, T_SHORT, T_INT, T_LONG};
1325 Address::ScaleFactor scale = (Address::ScaleFactor)(shift);
1326 __ mov64(temp, -1L);
1327 __ bzhiq(temp, temp, length);
1328 __ kmovql(mask, temp);
1329 __ evmovdqu(type[shift], mask, xmm, Address(src, index, scale, offset), false, Assembler::AVX_256bit);
1330 __ evmovdqu(type[shift], mask, Address(dst, index, scale, offset), xmm, true, Assembler::AVX_256bit);
1331 }
1332
1333
1334 void StubGenerator::copy32_avx(Register dst, Register src, Register index, XMMRegister xmm,
1335 int shift, int offset) {
1336 assert(MaxVectorSize >= 32, "vector length should be >= 32");
1337 Address::ScaleFactor scale = (Address::ScaleFactor)(shift);
1338 __ vmovdqu(xmm, Address(src, index, scale, offset));
1339 __ vmovdqu(Address(dst, index, scale, offset), xmm);
1340 }
1341
1342
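// Unmasked 64-byte copy: either two 32-byte YMM copies (issued in descending
// address order in the conjoint case, so an overlapping destination cannot
// clobber source bytes that have not been read yet) or one 64-byte ZMM copy.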
1343 void StubGenerator::copy64_avx(Register dst, Register src, Register index, XMMRegister xmm,
1344 bool conjoint, int shift, int offset, bool use64byteVector) {
1345 assert(MaxVectorSize == 64 || MaxVectorSize == 32, "vector length mismatch");
1346 if (!use64byteVector) {
1347 if (conjoint) {
1348 copy32_avx(dst, src, index, xmm, shift, offset+32);
1349 copy32_avx(dst, src, index, xmm, shift, offset);
1350 } else {
1351 copy32_avx(dst, src, index, xmm, shift, offset);
1352 copy32_avx(dst, src, index, xmm, shift, offset+32);
1353 }
1354 } else {
1355 Address::ScaleFactor scale = (Address::ScaleFactor)(shift);
1356 __ evmovdquq(xmm, Address(src, index, scale, offset), Assembler::AVX_512bit);
1357 __ evmovdquq(Address(dst, index, scale, offset), xmm, Assembler::AVX_512bit);
1358 }
1359 }
1360
1361 #endif // COMPILER2_OR_JVMCI
1362
1363
1364 // Arguments:
1365 // entry - location for return of (post-push) entry
1366 //
1367 // Inputs:
1368 // c_rarg0 - source array address
1369 // c_rarg1 - destination array address
1370 // c_rarg2 - element count, treated as ssize_t, can be zero
1371 //
1372 // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1373 // we let the hardware handle it. The one to eight bytes within words,
1374 // dwords or qwords that span cache line boundaries will still be loaded
1375 // and stored atomically.
1376 //
1377 // Side Effects:
1378 // entry is set to the no-overlap entry point
1379 // used by generate_conjoint_byte_copy().
1380 //
1381 address StubGenerator::generate_disjoint_byte_copy(address* entry) {
1382 StubId stub_id = StubId::stubgen_jbyte_disjoint_arraycopy_id;
1383 // aligned is always false -- x86_64 always uses the unaligned code
1384 const bool aligned = false;
1385 #if COMPILER2_OR_JVMCI
1386 if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) {
1387 return generate_disjoint_copy_avx3_masked(stub_id, entry);
1388 }
1389 #endif
1390 __ align(CodeEntryAlignment);
1391 StubCodeMark mark(this, stub_id);
1392 address start = __ pc();
1393 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1394
1395 Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes;
1396 Label L_copy_byte, L_exit;
1397 const Register from = rdi; // source array address
1398 const Register to = rsi; // destination array address
1399 const Register count = rdx; // elements count
1400 const Register byte_count = rcx;
1401 const Register qword_count = count;
1402 const Register end_from = from; // source array end address
1403 const Register end_to = to; // destination array end address
1404 // End pointers are inclusive, and if count is not zero they point
1405 // to the last unit copied: end_to[0] := end_from[0]
1406
1407 __ enter(); // required for proper stackwalking of RuntimeStub frame
1408 assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int.
1409
1410 if (entry != nullptr) {
1411 *entry = __ pc();
1412 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1413 BLOCK_COMMENT("Entry:");
1414 }
1415
1416 setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1417 // r9 and r10 may be used to save non-volatile registers
1418
1419 {
1420 // UnsafeMemoryAccess page error: continue after unsafe access
1421 UnsafeMemoryAccessMark umam(this, !aligned, true);
1422 // 'from', 'to' and 'count' are now valid
1423 __ movptr(byte_count, count);
1424 __ shrptr(count, 3); // count => qword_count
1425
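// Strategy: copy the bulk as qwords (in 32/64-byte chunks inside
// copy_bytes_forward, then one qword at a time), and finish the remaining
// 0-7 bytes with at most one dword, one word and one byte move, using
// byte_count to test which tail pieces are present.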
1426 // Copy from low to high addresses. Use 'to' as scratch.
1427 __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
1428 __ lea(end_to, Address(to, qword_count, Address::times_8, -8));
1429 __ negptr(qword_count); // make the count negative
1430 __ jmp(L_copy_bytes);
1431
1432 // Copy trailing qwords
1433 __ BIND(L_copy_8_bytes);
1434 __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
1435 __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
1436 __ increment(qword_count);
1437 __ jcc(Assembler::notZero, L_copy_8_bytes);
1438
1439 // Check for and copy trailing dword
1440 __ BIND(L_copy_4_bytes);
1441 __ testl(byte_count, 4);
1442 __ jccb(Assembler::zero, L_copy_2_bytes);
1443 __ movl(rax, Address(end_from, 8));
1444 __ movl(Address(end_to, 8), rax);
1445
1446 __ addptr(end_from, 4);
1447 __ addptr(end_to, 4);
1448
1449 // Check for and copy trailing word
1450 __ BIND(L_copy_2_bytes);
1451 __ testl(byte_count, 2);
1452 __ jccb(Assembler::zero, L_copy_byte);
1453 __ movw(rax, Address(end_from, 8));
1454 __ movw(Address(end_to, 8), rax);
1455
1456 __ addptr(end_from, 2);
1457 __ addptr(end_to, 2);
1458
1459 // Check for and copy trailing byte
1460 __ BIND(L_copy_byte);
1461 __ testl(byte_count, 1);
1462 __ jccb(Assembler::zero, L_exit);
1463 __ movb(rax, Address(end_from, 8));
1464 __ movb(Address(end_to, 8), rax);
1465 }
1466 __ BIND(L_exit);
1467 address ucme_exit_pc = __ pc();
1468 restore_arg_regs();
1469 INC_COUNTER_NP(SharedRuntime::_jbyte_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
1470 __ xorptr(rax, rax); // return 0
1471 __ vzeroupper();
1472 __ leave(); // required for proper stackwalking of RuntimeStub frame
1473 __ ret(0);
1474
1475 {
1476 UnsafeMemoryAccessMark umam(this, !aligned, false, ucme_exit_pc);
1477 // Copy in multi-byte chunks
1478 copy_bytes_forward(end_from, end_to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, T_BYTE);
1479 __ jmp(L_copy_4_bytes);
1480 }
1481 return start;
1482 }
1483
1484
1485 // Arguments:
1486 // entry - location for return of (post-push) entry
1487 // nooverlap_target - entry to branch to if no overlap detected
1488 //
1489 // Inputs:
1490 // c_rarg0 - source array address
1491 // c_rarg1 - destination array address
1492 // c_rarg2 - element count, treated as ssize_t, can be zero
1493 //
1494 // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1495 // we let the hardware handle it. The one to eight bytes within words,
1496 // dwords or qwords that span cache line boundaries will still be loaded
1497 // and stored atomically.
1498 //
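// Illustrative sketch of the overlap-safe (high-to-low) order generated
// below (hypothetical names; the disjoint case branches to nooverlap_target):
//
//   if (ranges do not overlap) goto nooverlap_target;   // forward copy
//   if (count & 1) copy the last byte;
//   if (count & 2) copy the next trailing word;
//   if (count & 4) copy the next trailing dword;
//   copy the remaining qwords backward, 8 bytes at a time;
//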
1499 address StubGenerator::generate_conjoint_byte_copy(address nooverlap_target, address* entry) {
1500 StubId stub_id = StubId::stubgen_jbyte_arraycopy_id;
1501 // aligned is always false -- x86_64 always uses the unaligned code
1502 const bool aligned = false;
1503 #if COMPILER2_OR_JVMCI
1504 if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) {
1505 return generate_conjoint_copy_avx3_masked(stub_id, entry, nooverlap_target);
1506 }
1507 #endif
1508 __ align(CodeEntryAlignment);
1509 StubCodeMark mark(this, stub_id);
1510 address start = __ pc();
1511 DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1512
1513 Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes;
1514 const Register from = rdi; // source array address
1515 const Register to = rsi; // destination array address
1516 const Register count = rdx; // elements count
1517 const Register byte_count = rcx;
1518 const Register qword_count = count;
1519
1520 __ enter(); // required for proper stackwalking of RuntimeStub frame
1521 assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int.
1522
1523 if (entry != nullptr) {
1524 *entry = __ pc();
1525 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1526 BLOCK_COMMENT("Entry:");
1527 }
1528
1529 array_overlap_test(nooverlap_target, Address::times_1);
1530 setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1531 // r9 and r10 may be used to save non-volatile registers
1532
1533 {
1534 // UnsafeMemoryAccess page error: continue after unsafe access
1535 UnsafeMemoryAccessMark umam(this, !aligned, true);
1536 // 'from', 'to' and 'count' are now valid
1537 __ movptr(byte_count, count);
1538 __ shrptr(count, 3); // count => qword_count
1539
1540 // Copy from high to low addresses.
1541
1542 // Check for and copy trailing byte
1543 __ testl(byte_count, 1);
1544 __ jcc(Assembler::zero, L_copy_2_bytes);
1545 __ movb(rax, Address(from, byte_count, Address::times_1, -1));
1546 __ movb(Address(to, byte_count, Address::times_1, -1), rax);
1547 __ decrement(byte_count); // Adjust for possible trailing word
1548
1549 // Check for and copy trailing word
1550 __ BIND(L_copy_2_bytes);
1551 __ testl(byte_count, 2);
1552 __ jcc(Assembler::zero, L_copy_4_bytes);
1553 __ movw(rax, Address(from, byte_count, Address::times_1, -2));
1554 __ movw(Address(to, byte_count, Address::times_1, -2), rax);
1555
1556 // Check for and copy trailing dword
1557 __ BIND(L_copy_4_bytes);
1558 __ testl(byte_count, 4);
1559 __ jcc(Assembler::zero, L_copy_bytes);
1560 __ movl(rax, Address(from, qword_count, Address::times_8));
1561 __ movl(Address(to, qword_count, Address::times_8), rax);
1562 __ jmp(L_copy_bytes);
1563
1564 // Copy trailing qwords
1565 __ BIND(L_copy_8_bytes);
1566 __ movq(rax, Address(from, qword_count, Address::times_8, -8));
1567 __ movq(Address(to, qword_count, Address::times_8, -8), rax);
1568 __ decrement(qword_count);
1569 __ jcc(Assembler::notZero, L_copy_8_bytes);
1570 }
1571 restore_arg_regs();
1572 INC_COUNTER_NP(SharedRuntime::_jbyte_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
1573 __ xorptr(rax, rax); // return 0
1574 __ vzeroupper();
1575 __ leave(); // required for proper stackwalking of RuntimeStub frame
1576 __ ret(0);
1577
1578 {
1579 // UnsafeMemoryAccess page error: continue after unsafe access
1580 UnsafeMemoryAccessMark umam(this, !aligned, true);
1581 // Copy in multi-byte chunks
1582 copy_bytes_backward(from, to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, T_BYTE);
1583 }
1584 restore_arg_regs();
1585 INC_COUNTER_NP(SharedRuntime::_jbyte_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
1586 __ xorptr(rax, rax); // return 0
1587 __ vzeroupper();
1588 __ leave(); // required for proper stackwalking of RuntimeStub frame
1589 __ ret(0);
1590
1591 return start;
1592 }
1593
1594
1595 // Arguments:
1596 // entry - location for return of (post-push) entry
1597 //
1598 // Inputs:
1599 // c_rarg0 - source array address
1600 // c_rarg1 - destination array address
1601 // c_rarg2 - element count, treated as ssize_t, can be zero
1602 //
1603 // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1604 // let the hardware handle it. The two or four words within dwords
1605 // or qwords that span cache line boundaries will still be loaded
1606 // and stored atomically.
1607 //
1608 // Side Effects:
1609 // entry is set to the no-overlap entry point
1610 // used by generate_conjoint_short_copy().
1611 //
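// Illustrative sketch of the copy order generated below (hypothetical names;
// bookkeeping omitted):
//
//   qwords = count >> 2;                       // 4 shorts per qword
//   copy qwords forward, 8 bytes at a time;    // main loop
//   if (count & 2) copy one trailing dword;    // two shorts
//   if (count & 1) copy one trailing word;     // one short
//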
1612 address StubGenerator::generate_disjoint_short_copy(address *entry) {
1613 StubId stub_id = StubId::stubgen_jshort_disjoint_arraycopy_id;
1614 // aligned is always false -- x86_64 always uses the unaligned code
1615 const bool aligned = false;
1616 #if COMPILER2_OR_JVMCI
1617 if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) {
1618 return generate_disjoint_copy_avx3_masked(stub_id, entry);
1619 }
1620 #endif
1621
1622 __ align(CodeEntryAlignment);
1623 StubCodeMark mark(this, stub_id);
1624 address start = __ pc();
1625 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1626
1627 Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes, L_exit;
1628 const Register from = rdi; // source array address
1629 const Register to = rsi; // destination array address
1630 const Register count = rdx; // elements count
1631 const Register word_count = rcx;
1632 const Register qword_count = count;
1633 const Register end_from = from; // source array end address
1634 const Register end_to = to; // destination array end address
1635 // End pointers are inclusive, and if count is not zero they point
1636 // to the last unit copied: end_to[0] := end_from[0]
1637
1638 __ enter(); // required for proper stackwalking of RuntimeStub frame
1639 assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int.
1640
1641 if (entry != nullptr) {
1642 *entry = __ pc();
1643 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1644 BLOCK_COMMENT("Entry:");
1645 }
1646
1647 setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1648 // r9 and r10 may be used to save non-volatile registers
1649
1650 {
1651 // UnsafeMemoryAccess page error: continue after unsafe access
1652 UnsafeMemoryAccessMark umam(this, !aligned, true);
1653 // 'from', 'to' and 'count' are now valid
1654 __ movptr(word_count, count);
1655 __ shrptr(count, 2); // count => qword_count
1656
1657 // Copy from low to high addresses. Use 'to' as scratch.
1658 __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
1659 __ lea(end_to, Address(to, qword_count, Address::times_8, -8));
1660 __ negptr(qword_count);
1661 __ jmp(L_copy_bytes);
1662
1663 // Copy trailing qwords
1664 __ BIND(L_copy_8_bytes);
1665 __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
1666 __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
1667 __ increment(qword_count);
1668 __ jcc(Assembler::notZero, L_copy_8_bytes);
1669
1670 // Original 'dest' is trashed, so we can't use it as a
1671 // base register for a possible trailing word copy
1672
1673 // Check for and copy trailing dword
1674 __ BIND(L_copy_4_bytes);
1675 __ testl(word_count, 2);
1676 __ jccb(Assembler::zero, L_copy_2_bytes);
1677 __ movl(rax, Address(end_from, 8));
1678 __ movl(Address(end_to, 8), rax);
1679
1680 __ addptr(end_from, 4);
1681 __ addptr(end_to, 4);
1682
1683 // Check for and copy trailing word
1684 __ BIND(L_copy_2_bytes);
1685 __ testl(word_count, 1);
1686 __ jccb(Assembler::zero, L_exit);
1687 __ movw(rax, Address(end_from, 8));
1688 __ movw(Address(end_to, 8), rax);
1689 }
1690 __ BIND(L_exit);
1691 address ucme_exit_pc = __ pc();
1692 restore_arg_regs();
1693 INC_COUNTER_NP(SharedRuntime::_jshort_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
1694 __ xorptr(rax, rax); // return 0
1695 __ vzeroupper();
1696 __ leave(); // required for proper stackwalking of RuntimeStub frame
1697 __ ret(0);
1698
1699 {
1700 UnsafeMemoryAccessMark umam(this, !aligned, false, ucme_exit_pc);
1701 // Copy in multi-byte chunks
1702 copy_bytes_forward(end_from, end_to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, T_SHORT);
1703 __ jmp(L_copy_4_bytes);
1704 }
1705
1706 return start;
1707 }
1708
1709
1710 address StubGenerator::generate_fill(StubId stub_id) {
1711 BasicType t;
1712 bool aligned;
1713
1714 switch (stub_id) {
1715 case StubId::stubgen_jbyte_fill_id:
1716 t = T_BYTE;
1717 aligned = false;
1718 break;
1719 case StubId::stubgen_jshort_fill_id:
1720 t = T_SHORT;
1721 aligned = false;
1722 break;
1723 case StubId::stubgen_jint_fill_id:
1724 t = T_INT;
1725 aligned = false;
1726 break;
1727 case StubId::stubgen_arrayof_jbyte_fill_id:
1728 t = T_BYTE;
1729 aligned = true;
1730 break;
1731 case StubId::stubgen_arrayof_jshort_fill_id:
1732 t = T_SHORT;
1733 aligned = true;
1734 break;
1735 case StubId::stubgen_arrayof_jint_fill_id:
1736 t = T_INT;
1737 aligned = true;
1738 break;
1739 default:
1740 ShouldNotReachHere();
1741 }
1742
1743 __ align(CodeEntryAlignment);
1744 StubCodeMark mark(this, stub_id);
1745 address start = __ pc();
1746
1747 BLOCK_COMMENT("Entry:");
1748
1749 const Register to = c_rarg0; // destination array address
1750 const Register value = c_rarg1; // value
1751 const Register count = c_rarg2; // elements count
1752 __ mov(r11, count);
1753
1754 __ enter(); // required for proper stackwalking of RuntimeStub frame
1755
1756 {
1757 // Add set memory mark to protect against unsafe accesses faulting
1758 UnsafeMemoryAccessMark umam(this, ((t == T_BYTE) && !aligned), true);
1759 __ generate_fill(t, aligned, to, value, r11, rax, xmm0);
1760 }
1761
1762 __ vzeroupper();
1763 __ leave(); // required for proper stackwalking of RuntimeStub frame
1764 __ ret(0);
1765
1766 return start;
1767 }
1768
1769
1770 // Arguments:
1771 // entry - location for return of (post-push) entry
1772 // nooverlap_target - entry to branch to if no overlap detected
1773 //
1774 // Inputs:
1775 // c_rarg0 - source array address
1776 // c_rarg1 - destination array address
1777 // c_rarg2 - element count, treated as ssize_t, can be zero
1778 //
1779 // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1780 // let the hardware handle it. The two or four words within dwords
1781 // or qwords that span cache line boundaries will still be loaded
1782 // and stored atomically.
1783 //
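// Illustrative sketch of the overlap-safe order generated below
// (hypothetical names; the disjoint case branches to nooverlap_target):
//
//   if (ranges do not overlap) goto nooverlap_target;       // forward copy
//   if (count & 1) copy the last word (one short);
//   if (count & 2) copy the next trailing dword (two shorts);
//   copy the remaining qwords backward, 8 bytes at a time;
//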
1784 address StubGenerator::generate_conjoint_short_copy(address nooverlap_target, address *entry) {
1785 StubId stub_id = StubId::stubgen_jshort_arraycopy_id;
1786 // aligned is always false -- x86_64 always uses the unaligned code
1787 const bool aligned = false;
1788 #if COMPILER2_OR_JVMCI
1789 if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) {
1790 return generate_conjoint_copy_avx3_masked(stub_id, entry, nooverlap_target);
1791 }
1792 #endif
1793
1794 __ align(CodeEntryAlignment);
1795 StubCodeMark mark(this, stub_id);
1796 address start = __ pc();
1797 DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1798
1799 Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes;
1800 const Register from = rdi; // source array address
1801 const Register to = rsi; // destination array address
1802 const Register count = rdx; // elements count
1803 const Register word_count = rcx;
1804 const Register qword_count = count;
1805
1806 __ enter(); // required for proper stackwalking of RuntimeStub frame
1807 assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int.
1808
1809 if (entry != nullptr) {
1810 *entry = __ pc();
1811 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1812 BLOCK_COMMENT("Entry:");
1813 }
1814
1815 array_overlap_test(nooverlap_target, Address::times_2);
1816 setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1817 // r9 and r10 may be used to save non-volatile registers
1818
1819 {
1820 // UnsafeMemoryAccess page error: continue after unsafe access
1821 UnsafeMemoryAccessMark umam(this, !aligned, true);
1822 // 'from', 'to' and 'count' are now valid
1823 __ movptr(word_count, count);
1824 __ shrptr(count, 2); // count => qword_count
1825
1826 // Copy from high to low addresses. Use 'to' as scratch.
1827
1828 // Check for and copy trailing word
1829 __ testl(word_count, 1);
1830 __ jccb(Assembler::zero, L_copy_4_bytes);
1831 __ movw(rax, Address(from, word_count, Address::times_2, -2));
1832 __ movw(Address(to, word_count, Address::times_2, -2), rax);
1833
1834 // Check for and copy trailing dword
1835 __ BIND(L_copy_4_bytes);
1836 __ testl(word_count, 2);
1837 __ jcc(Assembler::zero, L_copy_bytes);
1838 __ movl(rax, Address(from, qword_count, Address::times_8));
1839 __ movl(Address(to, qword_count, Address::times_8), rax);
1840 __ jmp(L_copy_bytes);
1841
1842 // Copy trailing qwords
1843 __ BIND(L_copy_8_bytes);
1844 __ movq(rax, Address(from, qword_count, Address::times_8, -8));
1845 __ movq(Address(to, qword_count, Address::times_8, -8), rax);
1846 __ decrement(qword_count);
1847 __ jcc(Assembler::notZero, L_copy_8_bytes);
1848 }
1849 restore_arg_regs();
1850 INC_COUNTER_NP(SharedRuntime::_jshort_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
1851 __ xorptr(rax, rax); // return 0
1852 __ vzeroupper();
1853 __ leave(); // required for proper stackwalking of RuntimeStub frame
1854 __ ret(0);
1855
1856 {
1857 // UnsafeMemoryAccess page error: continue after unsafe access
1858 UnsafeMemoryAccessMark umam(this, !aligned, true);
1859 // Copy in multi-byte chunks
1860 copy_bytes_backward(from, to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, T_SHORT);
1861 }
1862 restore_arg_regs();
1863 INC_COUNTER_NP(SharedRuntime::_jshort_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
1864 __ xorptr(rax, rax); // return 0
1865 __ vzeroupper();
1866 __ leave(); // required for proper stackwalking of RuntimeStub frame
1867 __ ret(0);
1868
1869 return start;
1870 }
1871
1872
1873 // Arguments:
1874 // stub_id - unique id for stub to generate
1875 // entry - location for return of (post-push) entry
1876 // is_oop - true => oop array, so generate store check code
1877 //
1878 // Inputs:
1879 // c_rarg0 - source array address
1880 // c_rarg1 - destination array address
1881 // c_rarg2 - element count, treated as ssize_t, can be zero
1882 //
1883 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1884 // the hardware handle it. The two dwords within qwords that span
1885 // cache line boundaries will still be loaded and stored atomically.
1886 //
1887 // Side Effects:
1888 // disjoint_int_copy_entry is set to the no-overlap entry point
1889 // used by generate_conjoint_int_oop_copy().
1890 //
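// Illustrative sketch of the copy order generated below (hypothetical names;
// for oop arrays the BarrierSetAssembler prologue/epilogue brackets the copy):
//
//   qwords = count >> 1;                       // 2 dwords per qword
//   copy qwords forward, 8 bytes at a time;    // main loop
//   if (count & 1) copy one trailing dword;
//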
1891 address StubGenerator::generate_disjoint_int_oop_copy(StubId stub_id, address* entry) {
1892 // aligned is always false -- x86_64 always uses the unaligned code
1893 const bool aligned = false;
1894 bool is_oop;
1895 bool dest_uninitialized;
1896 switch (stub_id) {
1897 case StubId::stubgen_jint_disjoint_arraycopy_id:
1898 is_oop = false;
1899 dest_uninitialized = false;
1900 break;
1901 case StubId::stubgen_oop_disjoint_arraycopy_id:
1902 assert(UseCompressedOops, "inconsistent oop copy size!");
1903 is_oop = true;
1904 dest_uninitialized = false;
1905 break;
1906 case StubId::stubgen_oop_disjoint_arraycopy_uninit_id:
1907 assert(UseCompressedOops, "inconsistent oop copy size!");
1908 is_oop = true;
1909 dest_uninitialized = true;
1910 break;
1911 default:
1912 ShouldNotReachHere();
1913 }
1914
1915 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1916 #if COMPILER2_OR_JVMCI
1917 if ((!is_oop || bs->supports_avx3_masked_arraycopy()) && VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) {
1918 return generate_disjoint_copy_avx3_masked(stub_id, entry);
1919 }
1920 #endif
1921
1922 __ align(CodeEntryAlignment);
1923 StubCodeMark mark(this, stub_id);
1924 address start = __ pc();
1925
1926 Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_exit;
1927 const Register from = rdi; // source array address
1928 const Register to = rsi; // destination array address
1929 const Register count = rdx; // elements count
1930 const Register dword_count = rcx;
1931 const Register qword_count = count;
1932 const Register end_from = from; // source array end address
1933 const Register end_to = to; // destination array end address
1934 // End pointers are inclusive, and if count is not zero they point
1935 // to the last unit copied: end_to[0] := end_from[0]
1936
1937 __ enter(); // required for proper stackwalking of RuntimeStub frame
1938 assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int.
1939
1940 if (entry != nullptr) {
1941 *entry = __ pc();
1942 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1943 BLOCK_COMMENT("Entry:");
1944 }
1945
1946 setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
1947 // r9 is used to save r15_thread
1948
1949 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1950 if (dest_uninitialized) {
1951 decorators |= IS_DEST_UNINITIALIZED;
1952 }
1953 if (aligned) {
1954 decorators |= ARRAYCOPY_ALIGNED;
1955 }
1956
1957 BasicType type = is_oop ? T_OBJECT : T_INT;
1958 bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
1959
1960 {
1961 // UnsafeMemoryAccess page error: continue after unsafe access
1962 UnsafeMemoryAccessMark umam(this, !is_oop && !aligned, true);
1963 // 'from', 'to' and 'count' are now valid
1964 __ movptr(dword_count, count);
1965 __ shrptr(count, 1); // count => qword_count
1966
1967 // Copy from low to high addresses. Use 'to' as scratch.
1968 __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
1969 __ lea(end_to, Address(to, qword_count, Address::times_8, -8));
1970 __ negptr(qword_count);
1971 __ jmp(L_copy_bytes);
1972
1973 // Copy trailing qwords
1974 __ BIND(L_copy_8_bytes);
1975 __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
1976 __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
1977 __ increment(qword_count);
1978 __ jcc(Assembler::notZero, L_copy_8_bytes);
1979
1980 // Check for and copy trailing dword
1981 __ BIND(L_copy_4_bytes);
1982 __ testl(dword_count, 1); // Only byte test since the value is 0 or 1
1983 __ jccb(Assembler::zero, L_exit);
1984 __ movl(rax, Address(end_from, 8));
1985 __ movl(Address(end_to, 8), rax);
1986 }
1987 __ BIND(L_exit);
1988 address ucme_exit_pc = __ pc();
1989 bs->arraycopy_epilogue(_masm, decorators, type, from, to, dword_count);
1990 restore_arg_regs_using_thread();
1991 INC_COUNTER_NP(SharedRuntime::_jint_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
1992 __ vzeroupper();
1993 __ xorptr(rax, rax); // return 0
1994 __ leave(); // required for proper stackwalking of RuntimeStub frame
1995 __ ret(0);
1996
1997 {
1998 UnsafeMemoryAccessMark umam(this, !is_oop && !aligned, false, ucme_exit_pc);
1999 // Copy in multi-byte chunks
2000 copy_bytes_forward(end_from, end_to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, is_oop ? T_OBJECT : T_INT);
2001 __ jmp(L_copy_4_bytes);
2002 }
2003
2004 return start;
2005 }
2006
2007
2008 // Arguments:
2009 // entry - location for return of (post-push) entry
2010 // nooverlap_target - entry to branch to if no overlap detected
2011 // is_oop - true => oop array, so generate store check code
2012 //
2013 // Inputs:
2014 // c_rarg0 - source array address
2015 // c_rarg1 - destination array address
2016 // c_rarg2 - element count, treated as ssize_t, can be zero
2017 //
2018 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
2019 // the hardware handle it. The two dwords within qwords that span
2020 // cache line boundaries will still be loaded and stored atomically.
2021 //
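// Illustrative sketch of the overlap-safe order generated below
// (hypothetical names; for oop arrays the barrier epilogue runs at L_exit):
//
//   if (ranges do not overlap) goto nooverlap_target;   // forward copy
//   if (count & 1) copy the last dword;
//   copy the remaining qwords backward, 8 bytes at a time;
//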
2022 address StubGenerator::generate_conjoint_int_oop_copy(StubId stub_id, address nooverlap_target, address *entry) {
2023 // aligned is always false -- x86_64 always uses the unaligned code
2024 const bool aligned = false;
2025 bool is_oop;
2026 bool dest_uninitialized;
2027 switch (stub_id) {
2028 case StubId::stubgen_jint_arraycopy_id:
2029 is_oop = false;
2030 dest_uninitialized = false;
2031 break;
2032 case StubId::stubgen_oop_arraycopy_id:
2033 assert(UseCompressedOops, "inconsistent oop copy size!");
2034 is_oop = true;
2035 dest_uninitialized = false;
2036 break;
2037 case StubId::stubgen_oop_arraycopy_uninit_id:
2038 assert(UseCompressedOops, "inconsistent oop copy size!");
2039 is_oop = true;
2040 dest_uninitialized = true;
2041 break;
2042 default:
2043 ShouldNotReachHere();
2044 }
2045
2046 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2047 #if COMPILER2_OR_JVMCI
2048 if ((!is_oop || bs->supports_avx3_masked_arraycopy()) && VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) {
2049 return generate_conjoint_copy_avx3_masked(stub_id, entry, nooverlap_target);
2050 }
2051 #endif
2052
2053 __ align(CodeEntryAlignment);
2054 StubCodeMark mark(this, stub_id);
2055 address start = __ pc();
2056
2057 Label L_copy_bytes, L_copy_8_bytes, L_exit;
2058 const Register from = rdi; // source array address
2059 const Register to = rsi; // destination array address
2060 const Register count = rdx; // elements count
2061 const Register dword_count = rcx;
2062 const Register qword_count = count;
2063
2064 __ enter(); // required for proper stackwalking of RuntimeStub frame
2065 assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int.
2066
2067 if (entry != nullptr) {
2068 *entry = __ pc();
2069 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2070 BLOCK_COMMENT("Entry:");
2071 }
2072
2073 array_overlap_test(nooverlap_target, Address::times_4);
2074 setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
2075 // r9 is used to save r15_thread
2076
2077 DecoratorSet decorators = IN_HEAP | IS_ARRAY;
2078 if (dest_uninitialized) {
2079 decorators |= IS_DEST_UNINITIALIZED;
2080 }
2081 if (aligned) {
2082 decorators |= ARRAYCOPY_ALIGNED;
2083 }
2084
2085 BasicType type = is_oop ? T_OBJECT : T_INT;
2086 // no registers are destroyed by this call
2087 bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
2088
2089 assert_clean_int(count, rax); // Make sure 'count' is clean int.
2090 {
2091 // UnsafeMemoryAccess page error: continue after unsafe access
2092 UnsafeMemoryAccessMark umam(this, !is_oop && !aligned, true);
2093 // 'from', 'to' and 'count' are now valid
2094 __ movptr(dword_count, count);
2095 __ shrptr(count, 1); // count => qword_count
2096
2097 // Copy from high to low addresses. Use 'to' as scratch.
2098
2099 // Check for and copy trailing dword
2100 __ testl(dword_count, 1);
2101 __ jcc(Assembler::zero, L_copy_bytes);
2102 __ movl(rax, Address(from, dword_count, Address::times_4, -4));
2103 __ movl(Address(to, dword_count, Address::times_4, -4), rax);
2104 __ jmp(L_copy_bytes);
2105
2106 // Copy trailing qwords
2107 __ BIND(L_copy_8_bytes);
2108 __ movq(rax, Address(from, qword_count, Address::times_8, -8));
2109 __ movq(Address(to, qword_count, Address::times_8, -8), rax);
2110 __ decrement(qword_count);
2111 __ jcc(Assembler::notZero, L_copy_8_bytes);
2112 }
2113 if (is_oop) {
2114 __ jmp(L_exit);
2115 }
2116 restore_arg_regs_using_thread();
2117 INC_COUNTER_NP(SharedRuntime::_jint_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
2118 __ xorptr(rax, rax); // return 0
2119 __ vzeroupper();
2120 __ leave(); // required for proper stackwalking of RuntimeStub frame
2121 __ ret(0);
2122
2123 {
2124 // UnsafeMemoryAccess page error: continue after unsafe access
2125 UnsafeMemoryAccessMark umam(this, !is_oop && !aligned, true);
2126 // Copy in multi-byte chunks
2127 copy_bytes_backward(from, to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, is_oop ? T_OBJECT : T_INT);
2128 }
2129
2130 __ BIND(L_exit);
2131 bs->arraycopy_epilogue(_masm, decorators, type, from, to, dword_count);
2132 restore_arg_regs_using_thread();
2133 INC_COUNTER_NP(SharedRuntime::_jint_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
2134 __ xorptr(rax, rax); // return 0
2135 __ vzeroupper();
2136 __ leave(); // required for proper stackwalking of RuntimeStub frame
2137 __ ret(0);
2138
2139 return start;
2140 }
2141
2142
2143 // Arguments:
2144 // entry - location for return of (post-push) entry
2145 //
2146 // Inputs:
2147 // c_rarg0 - source array address
2148 // c_rarg1 - destination array address
2149 // c_rarg2 - element count, treated as ssize_t, can be zero
2150 //
2151 // Side Effects:
2152 // disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
2153 // no-overlap entry point used by generate_conjoint_long_oop_copy().
2154 //
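// Illustrative sketch of the copy generated below (hypothetical phrasing;
// each element is already qword-sized, so there is no tail):
//
//   for each qword, low to high:
//     load  via BarrierSetAssembler::copy_load_at
//     store via BarrierSetAssembler::copy_store_at
//   run the barrier epilogue when copying oops;
//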
2155 address StubGenerator::generate_disjoint_long_oop_copy(StubId stub_id, address *entry) {
2156 // aligned is always false -- x86_64 always uses the unaligned code
2157 const bool aligned = false;
2158 bool is_oop;
2159 bool dest_uninitialized;
2160 switch (stub_id) {
2161 case StubId::stubgen_jlong_disjoint_arraycopy_id:
2162 is_oop = false;
2163 dest_uninitialized = false;
2164 break;
2165 case StubId::stubgen_oop_disjoint_arraycopy_id:
2166 assert(!UseCompressedOops, "inconsistent oop copy size!");
2167 is_oop = true;
2168 dest_uninitialized = false;
2169 break;
2170 case StubId::stubgen_oop_disjoint_arraycopy_uninit_id:
2171 assert(!UseCompressedOops, "inconsistent oop copy size!");
2172 is_oop = true;
2173 dest_uninitialized = true;
2174 break;
2175 default:
2176 ShouldNotReachHere();
2177 }
2178
2179 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2180 #if COMPILER2_OR_JVMCI
2181 if ((!is_oop || bs->supports_avx3_masked_arraycopy()) && VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) {
2182 return generate_disjoint_copy_avx3_masked(stub_id, entry);
2183 }
2184 #endif
2185
2186 __ align(CodeEntryAlignment);
2187 StubCodeMark mark(this, stub_id);
2188 address start = __ pc();
2189
2190 Label L_copy_bytes, L_copy_8_bytes, L_exit;
2191 const Register from = rdi; // source array address
2192 const Register to = rsi; // destination array address
2193 const Register qword_count = rdx; // elements count
2194 const Register end_from = from; // source array end address
2195 const Register end_to = rcx; // destination array end address
2196 const Register saved_count = r11;
2197 // End pointers are inclusive, and if count is not zero they point
2198 // to the last unit copied: end_to[0] := end_from[0]
2199
2200 __ enter(); // required for proper stackwalking of RuntimeStub frame
2201 // Save no-overlap entry point for generate_conjoint_long_oop_copy()
2202 assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int.
2203
2204 if (entry != nullptr) {
2205 *entry = __ pc();
2206 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2207 BLOCK_COMMENT("Entry:");
2208 }
2209
2210 setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
2211 // r9 is used to save r15_thread
2212 // 'from', 'to' and 'qword_count' are now valid
2213
2214 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
2215 if (dest_uninitialized) {
2216 decorators |= IS_DEST_UNINITIALIZED;
2217 }
2218 if (aligned) {
2219 decorators |= ARRAYCOPY_ALIGNED;
2220 }
2221
2222 BasicType type = is_oop ? T_OBJECT : T_LONG;
2223 bs->arraycopy_prologue(_masm, decorators, type, from, to, qword_count);
2224 {
2225 // UnsafeMemoryAccess page error: continue after unsafe access
2226 UnsafeMemoryAccessMark umam(this, !is_oop && !aligned, true);
2227
2228 // Copy from low to high addresses. Use 'to' as scratch.
2229 __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
2230 __ lea(end_to, Address(to, qword_count, Address::times_8, -8));
2231 __ negptr(qword_count);
2232 __ jmp(L_copy_bytes);
2233
2234 // Copy trailing qwords
2235 __ BIND(L_copy_8_bytes);
2236 bs->copy_load_at(_masm, decorators, type, 8,
2237 rax, Address(end_from, qword_count, Address::times_8, 8),
2238 r10);
2239 bs->copy_store_at(_masm, decorators, type, 8,
2240 Address(end_to, qword_count, Address::times_8, 8), rax,
2241 r10);
2242 __ increment(qword_count);
2243 __ jcc(Assembler::notZero, L_copy_8_bytes);
2244 }
2245 if (is_oop) {
2246 __ jmp(L_exit);
2247 } else {
2248 restore_arg_regs_using_thread();
2249 INC_COUNTER_NP(SharedRuntime::_jlong_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
2250 __ xorptr(rax, rax); // return 0
2251 __ vzeroupper();
2252 __ leave(); // required for proper stackwalking of RuntimeStub frame
2253 __ ret(0);
2254 }
2255
2256 {
2257 // UnsafeMemoryAccess page error: continue after unsafe access
2258 UnsafeMemoryAccessMark umam(this, !is_oop && !aligned, true);
2259 // Copy in multi-byte chunks
2260 copy_bytes_forward(end_from, end_to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, is_oop ? T_OBJECT : T_LONG);
2261 }
2262
2263 __ BIND(L_exit);
2264 bs->arraycopy_epilogue(_masm, decorators, type, from, to, qword_count);
2265 restore_arg_regs_using_thread();
2266 INC_COUNTER_NP(is_oop ? SharedRuntime::_oop_array_copy_ctr :
2267 SharedRuntime::_jlong_array_copy_ctr,
2268 rscratch1); // Update counter after rscratch1 is free
2269 __ vzeroupper();
2270 __ xorptr(rax, rax); // return 0
2271 __ leave(); // required for proper stackwalking of RuntimeStub frame
2272 __ ret(0);
2273
2274 return start;
2275 }
2276
2277
2278 // Arguments:
2279 // entry - location for return of (post-push) entry
2280 // nooverlap_target - entry to branch to if no overlap detected
2281 // is_oop - true => oop array, so generate store check code
2282 //
2283 // Inputs:
2284 // c_rarg0 - source array address
2285 // c_rarg1 - destination array address
2286 // c_rarg2 - element count, treated as ssize_t, can be zero
2287 //
2288 address StubGenerator::generate_conjoint_long_oop_copy(StubId stub_id, address nooverlap_target, address *entry) {
2289 // aligned is always false -- x86_64 always uses the unaligned code
2290 const bool aligned = false;
2291 bool is_oop;
2292 bool dest_uninitialized;
2293 switch (stub_id) {
2294 case StubId::stubgen_jlong_arraycopy_id:
2295 is_oop = false;
2296 dest_uninitialized = false;
2297 break;
2298 case StubId::stubgen_oop_arraycopy_id:
2299 assert(!UseCompressedOops, "inconsistent oop copy size!");
2300 is_oop = true;
2301 dest_uninitialized = false;
2302 break;
2303 case StubId::stubgen_oop_arraycopy_uninit_id:
2304 assert(!UseCompressedOops, "inconsistent oop copy size!");
2305 is_oop = true;
2306 dest_uninitialized = true;
2307 break;
2308 default:
2309 ShouldNotReachHere();
2310 }
2311
2312 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2313 #if COMPILER2_OR_JVMCI
2314 if ((!is_oop || bs->supports_avx3_masked_arraycopy()) && VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) {
2315 return generate_conjoint_copy_avx3_masked(stub_id, entry, nooverlap_target);
2316 }
2317 #endif
2318
2319 __ align(CodeEntryAlignment);
2320 StubCodeMark mark(this, stub_id);
2321 address start = __ pc();
2322
2323 Label L_copy_bytes, L_copy_8_bytes, L_exit;
2324 const Register from = rdi; // source array address
2325 const Register to = rsi; // destination array address
2326 const Register qword_count = rdx; // elements count
2327 const Register saved_count = rcx;
2328
2329 __ enter(); // required for proper stackwalking of RuntimeStub frame
2330 assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int.
2331
2332 if (entry != nullptr) {
2333 *entry = __ pc();
2334 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2335 BLOCK_COMMENT("Entry:");
2336 }
2337
2338 array_overlap_test(nooverlap_target, Address::times_8);
2339 setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
2340 // r9 is used to save r15_thread
2341 // 'from', 'to' and 'qword_count' are now valid
2342
2343 DecoratorSet decorators = IN_HEAP | IS_ARRAY;
2344 if (dest_uninitialized) {
2345 decorators |= IS_DEST_UNINITIALIZED;
2346 }
2347 if (aligned) {
2348 decorators |= ARRAYCOPY_ALIGNED;
2349 }
2350
2351 BasicType type = is_oop ? T_OBJECT : T_LONG;
2352 bs->arraycopy_prologue(_masm, decorators, type, from, to, qword_count);
2353 {
2354 // UnsafeMemoryAccess page error: continue after unsafe access
2355 UnsafeMemoryAccessMark umam(this, !is_oop && !aligned, true);
2356
2357 __ jmp(L_copy_bytes);
2358
2359 // Copy trailing qwords
2360 __ BIND(L_copy_8_bytes);
2361 bs->copy_load_at(_masm, decorators, type, 8,
2362 rax, Address(from, qword_count, Address::times_8, -8),
2363 r10);
2364 bs->copy_store_at(_masm, decorators, type, 8,
2365 Address(to, qword_count, Address::times_8, -8), rax,
2366 r10);
2367 __ decrement(qword_count);
2368 __ jcc(Assembler::notZero, L_copy_8_bytes);
2369 }
2370 if (is_oop) {
2371 __ jmp(L_exit);
2372 } else {
2373 restore_arg_regs_using_thread();
2374 INC_COUNTER_NP(SharedRuntime::_jlong_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
2375 __ xorptr(rax, rax); // return 0
2376 __ vzeroupper();
2377 __ leave(); // required for proper stackwalking of RuntimeStub frame
2378 __ ret(0);
2379 }
2380 {
2381 // UnsafeMemoryAccess page error: continue after unsafe access
2382 UnsafeMemoryAccessMark umam(this, !is_oop && !aligned, true);
2383
2384 // Copy in multi-byte chunks
2385 copy_bytes_backward(from, to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, is_oop ? T_OBJECT : T_LONG);
2386 }
2387 __ BIND(L_exit);
2388 bs->arraycopy_epilogue(_masm, decorators, type, from, to, qword_count);
2389 restore_arg_regs_using_thread();
2390 INC_COUNTER_NP(is_oop ? SharedRuntime::_oop_array_copy_ctr :
2391 SharedRuntime::_jlong_array_copy_ctr,
2392 rscratch1); // Update counter after rscratch1 is free
2393 __ vzeroupper();
2394 __ xorptr(rax, rax); // return 0
2395 __ leave(); // required for proper stackwalking of RuntimeStub frame
2396 __ ret(0);
2397
2398 return start;
2399 }
2400
2401
2402 // Helper for generating a dynamic type check.
2403 // Smashes no registers.
2404 void StubGenerator::generate_type_check(Register sub_klass,
2405 Register super_check_offset,
2406 Register super_klass,
2407 Label& L_success) {
2408 assert_different_registers(sub_klass, super_check_offset, super_klass);
2409
2410 BLOCK_COMMENT("type_check:");
2411
2412 Label L_miss;
2413
2414 __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, nullptr,
2415 super_check_offset);
2416 __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, nullptr);
2417
2418 // Fall through on failure!
2419 __ BIND(L_miss);
2420 }
2421
2422 //
2423 // Generate checkcasting array copy stub
2424 //
2425 // Input:
2426 // c_rarg0 - source array address
2427 // c_rarg1 - destination array address
2428 // c_rarg2 - element count, treated as ssize_t, can be zero
2429 // c_rarg3 - size_t ckoff (super_check_offset)
2430 // not Win64
2431 // c_rarg4 - oop ckval (super_klass)
2432 // Win64
2433 // rsp+40 - oop ckval (super_klass)
2434 //
2435 // Output:
2436 // rax == 0 - success
2437 // rax == -1^K - failure, where K is partial transfer count
2438 //
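// Illustrative sketch of the checked copy generated below (hypothetical
// names; GC barriers via the BarrierSetAssembler are omitted):
//
//   for (i = 0; i < length; i++) {
//     oop o = src[i];
//     if (o != nullptr && !is_subtype_of(o->klass(), ckval)) {
//       return ~i;                 // failure: -1^K, K = i elements copied
//     }
//     dst[i] = o;
//   }
//   return 0;                      // success
//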
2439 address StubGenerator::generate_checkcast_copy(StubId stub_id, address *entry) {
2440
2441 bool dest_uninitialized;
2442 switch (stub_id) {
2443 case StubId::stubgen_checkcast_arraycopy_id:
2444 dest_uninitialized = false;
2445 break;
2446 case StubId::stubgen_checkcast_arraycopy_uninit_id:
2447 dest_uninitialized = true;
2448 break;
2449 default:
2450 ShouldNotReachHere();
2451 }
2452
2453 Label L_load_element, L_store_element, L_do_card_marks, L_done;
2454
2455 // Input registers (after setup_arg_regs)
2456 const Register from = rdi; // source array address
2457 const Register to = rsi; // destination array address
2458 const Register length = rdx; // elements count
2459 const Register ckoff = rcx; // super_check_offset
2460 const Register ckval = r8; // super_klass
2461
2462 // Registers used as temps (r13, r14 are save-on-entry)
2463 const Register end_from = from; // source array end address
2464 const Register end_to = r13; // destination array end address
2465 const Register count = rdx; // -(count_remaining)
2466 const Register r14_length = r14; // saved copy of length
2467 // End pointers are inclusive, and if length is not zero they point
2468 // to the last unit copied: end_to[0] := end_from[0]
2469
2470 const Register rax_oop = rax; // actual oop copied
2471 const Register r11_klass = r11; // oop._klass
2472
2473 //---------------------------------------------------------------
2474 // Assembler stub will be used for this call to arraycopy
2475 // if the two arrays are subtypes of Object[] but the
2476 // destination array type is not equal to or a supertype
2477 // of the source type. Each element must be separately
2478 // checked.
2479
2480 __ align(CodeEntryAlignment);
2481 StubCodeMark mark(this, stub_id);
2482 address start = __ pc();
2483
2484 __ enter(); // required for proper stackwalking of RuntimeStub frame
2485
2486 #ifdef ASSERT
2487 // caller guarantees that the arrays really are different
2488 // otherwise, we would have to make conjoint checks
2489 { Label L;
2490 array_overlap_test(L, TIMES_OOP);
2491 __ stop("checkcast_copy within a single array");
2492 __ bind(L);
2493 }
2494 #endif //ASSERT
2495
2496 setup_arg_regs_using_thread(4); // from => rdi, to => rsi, length => rdx
2497 // ckoff => rcx, ckval => r8
2498 // r9 is used to save r15_thread
2499 #ifdef _WIN64
2500 // last argument (#4) is on stack on Win64
2501 __ movptr(ckval, Address(rsp, 6 * wordSize));
2502 #endif
2503
2504 // Caller of this entry point must set up the argument registers.
2505 if (entry != nullptr) {
2506 *entry = __ pc();
2507 BLOCK_COMMENT("Entry:");
2508 }
2509
2510 // allocate spill slots for r13, r14
2511 enum {
2512 saved_r13_offset,
2513 saved_r14_offset,
2514 saved_r10_offset,
2515 saved_rbp_offset
2516 };
2517 __ subptr(rsp, saved_rbp_offset * wordSize);
2518 __ movptr(Address(rsp, saved_r13_offset * wordSize), r13);
2519 __ movptr(Address(rsp, saved_r14_offset * wordSize), r14);
2520 __ movptr(Address(rsp, saved_r10_offset * wordSize), r10);
2521
2522 #ifdef ASSERT
2523 Label L2;
2524 __ get_thread_slow(r14);
2525 __ cmpptr(r15_thread, r14);
2526 __ jcc(Assembler::equal, L2);
2527 __ stop("StubRoutines::call_stub: r15_thread is modified by call");
2528 __ bind(L2);
2529 #endif // ASSERT
2530
2531 // check that int operands are properly extended to size_t
2532 assert_clean_int(length, rax);
2533 assert_clean_int(ckoff, rax);
2534
2535 #ifdef ASSERT
2536 BLOCK_COMMENT("assert consistent ckoff/ckval");
2537 // The ckoff and ckval must be mutually consistent,
2538 // even though caller generates both.
2539 { Label L;
2540 int sco_offset = in_bytes(Klass::super_check_offset_offset());
2541 __ cmpl(ckoff, Address(ckval, sco_offset));
2542 __ jcc(Assembler::equal, L);
2543 __ stop("super_check_offset inconsistent");
2544 __ bind(L);
2545 }
2546 #endif //ASSERT
2547
2548 // Loop-invariant addresses. They are exclusive end pointers.
2549 Address end_from_addr(from, length, TIMES_OOP, 0);
2550 Address end_to_addr(to, length, TIMES_OOP, 0);
2551 // Loop-variant addresses. They assume post-incremented count < 0.
2552 Address from_element_addr(end_from, count, TIMES_OOP, 0);
2553 Address to_element_addr(end_to, count, TIMES_OOP, 0);
2554
2555 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
2556 if (dest_uninitialized) {
2557 decorators |= IS_DEST_UNINITIALIZED;
2558 }
2559
2560 BasicType type = T_OBJECT;
2561 size_t element_size = UseCompressedOops ? 4 : 8;
2562
2563 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2564 bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
2565
2566 // Copy from low to high addresses, indexed from the end of each array.
2567 __ lea(end_from, end_from_addr);
2568 __ lea(end_to, end_to_addr);
2569 __ movptr(r14_length, length); // save a copy of the length
2570 assert(length == count, ""); // else fix next line:
2571 __ negptr(count); // negate and test the length
2572 __ jcc(Assembler::notZero, L_load_element);
2573
2574 // Empty array: Nothing to do.
2575 __ xorptr(rax, rax); // return 0 on (trivial) success
2576 __ jmp(L_done);
2577
2578 // ======== begin loop ========
2579 // (Loop is rotated; its entry is L_load_element.)
2580 // Loop control:
2581 // for (count = -count; count != 0; count++)
2582 // Base pointers src, dst are biased by 8*(count-1), to last element.
2583 __ align(OptoLoopAlignment);
2584
2585 __ BIND(L_store_element);
2586 bs->copy_store_at(_masm,
2587 decorators,
2588 type,
2589 element_size,
2590 to_element_addr,
2591 rax_oop,
2592 r10);
2593 __ increment(count); // increment the count toward zero
2594 __ jcc(Assembler::zero, L_do_card_marks);
2595
2596 // ======== loop entry is here ========
2597 __ BIND(L_load_element);
2598 bs->copy_load_at(_masm,
2599 decorators,
2600 type,
2601 element_size,
2602 rax_oop,
2603 from_element_addr,
2604 r10);
2605 __ testptr(rax_oop, rax_oop);
2606 __ jcc(Assembler::zero, L_store_element);
2607
2608 __ load_klass(r11_klass, rax_oop, rscratch1);// query the object klass
2609 generate_type_check(r11_klass, ckoff, ckval, L_store_element);
2610 // ======== end loop ========
2611
2612 // It was a real error; we must depend on the caller to finish the job.
2613 // Register rdx = -1 * number of *remaining* oops, r14 = *total* oops.
2614 // Emit GC store barriers for the oops we have copied (r14 + rdx),
2615 // and report their number to the caller.
2616 assert_different_registers(rax, r14_length, count, to, end_to, rcx, rscratch1);
2617 Label L_post_barrier;
2618 __ addptr(r14_length, count); // K = (original - remaining) oops
2619 __ movptr(rax, r14_length); // save the value
2620 __ notptr(rax); // report (-1^K) to caller (does not affect flags)
2621 __ jccb(Assembler::notZero, L_post_barrier);
2622 __ jmp(L_done); // K == 0, nothing was copied, skip post barrier
2623
2624 // Come here on success only.
2625 __ BIND(L_do_card_marks);
2626 __ xorptr(rax, rax); // return 0 on success
2627
2628 __ BIND(L_post_barrier);
2629 bs->arraycopy_epilogue(_masm, decorators, type, from, to, r14_length);
2630
2631 // Common exit point (success or failure).
2632 __ BIND(L_done);
2633 __ movptr(r13, Address(rsp, saved_r13_offset * wordSize));
2634 __ movptr(r14, Address(rsp, saved_r14_offset * wordSize));
2635 __ movptr(r10, Address(rsp, saved_r10_offset * wordSize));
2636 restore_arg_regs_using_thread();
2637 INC_COUNTER_NP(SharedRuntime::_checkcast_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
2638 __ leave(); // required for proper stackwalking of RuntimeStub frame
2639 __ ret(0);
2640
2641 return start;
2642 }
2643
2644
2645 // Generate 'unsafe' array copy stub
2646 // Though just as safe as the other stubs, it takes an unscaled
2647 // size_t argument instead of an element count.
2648 //
2649 // Input:
2650 // c_rarg0 - source array address
2651 // c_rarg1 - destination array address
2652 // c_rarg2 - byte count, treated as ssize_t, can be zero
2653 //
2654 // Examines the alignment of the operands and dispatches
2655 // to a long, int, short, or byte copy loop.
2656 //
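// Illustrative sketch of the dispatch generated below (hypothetical names):
//
//   bits = (uintptr_t)from | (uintptr_t)to | size;
//   if      ((bits & 7) == 0) tail-call long_copy (count = size >> 3);
//   else if ((bits & 3) == 0) tail-call int_copy  (count = size >> 2);
//   else if ((bits & 1) == 0) tail-call short_copy(count = size >> 1);
//   else                      tail-call byte_copy (count = size);
//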
2657 address StubGenerator::generate_unsafe_copy(address byte_copy_entry, address short_copy_entry,
2658 address int_copy_entry, address long_copy_entry) {
2659
2660 Label L_long_aligned, L_int_aligned, L_short_aligned;
2661
2662 // Input registers (before setup_arg_regs)
2663 const Register from = c_rarg0; // source array address
2664 const Register to = c_rarg1; // destination array address
2665 const Register size = c_rarg2; // byte count (size_t)
2666
2667 // Register used as a temp
2668 const Register bits = rax; // test copy of low bits
2669
2670 __ align(CodeEntryAlignment);
2671 StubId stub_id = StubId::stubgen_unsafe_arraycopy_id;
2672 StubCodeMark mark(this, stub_id);
2673 address start = __ pc();
2674
2675 __ enter(); // required for proper stackwalking of RuntimeStub frame
2676
2677 // bump this on entry, not on exit:
2678 INC_COUNTER_NP(SharedRuntime::_unsafe_array_copy_ctr, rscratch1);
2679
2680 __ mov(bits, from);
2681 __ orptr(bits, to);
2682 __ orptr(bits, size);
2683
2684 __ testb(bits, BytesPerLong-1);
2685 __ jccb(Assembler::zero, L_long_aligned);
2686
2687 __ testb(bits, BytesPerInt-1);
2688 __ jccb(Assembler::zero, L_int_aligned);
2689
2690 __ testb(bits, BytesPerShort-1);
2691 __ jump_cc(Assembler::notZero, RuntimeAddress(byte_copy_entry));
2692
2693 __ BIND(L_short_aligned);
2694 __ shrptr(size, LogBytesPerShort); // size => short_count
2695 __ jump(RuntimeAddress(short_copy_entry));
2696
2697 __ BIND(L_int_aligned);
2698 __ shrptr(size, LogBytesPerInt); // size => int_count
2699 __ jump(RuntimeAddress(int_copy_entry));
2700
2701 __ BIND(L_long_aligned);
2702 __ shrptr(size, LogBytesPerLong); // size => qword_count
2703 __ jump(RuntimeAddress(long_copy_entry));
2704
2705 return start;
2706 }
2707
2708
2709 // Static enum for helper
2710 enum USM_TYPE {USM_SHORT, USM_DWORD, USM_QUADWORD};
2711 // Helper for generate_unsafe_setmemory
2712 //
2713 // Atomically fill an array of memory using 2-, 4-, or 8-byte chunks
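// Illustrative sketch (hypothetical names): with elem = 2, 4 or 8 bytes,
//
//   chunks = size >> shift;                 // number of element-sized stores
//   for (i = 0; i < (chunks >> 3); i++) {   // main loop, unrolled by 8
//     store 8 elements; dest += 8 * elem;
//   }
//   for (i = 0; i < (chunks & 7); i++) {    // 0..7 remaining stores
//     store 1 element; dest += elem;
//   }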
2714 static void do_setmemory_atomic_loop(USM_TYPE type, Register dest,
2715 Register size, Register wide_value,
2716 Register tmp, Label& L_exit,
2717 MacroAssembler *_masm) {
2718 Label L_Loop, L_Tail, L_TailLoop;
2719
2720 int shiftval = 0;
2721 int incr = 0;
2722
2723 switch (type) {
2724 case USM_SHORT:
2725 shiftval = 1;
2726 incr = 16;
2727 break;
2728 case USM_DWORD:
2729 shiftval = 2;
2730 incr = 32;
2731 break;
2732 case USM_QUADWORD:
2733 shiftval = 3;
2734 incr = 64;
2735 break;
2736 }
2737
2738 // At this point, we know the lower bits of size are zero
2739 __ shrq(size, shiftval);
2740 // size now has number of X-byte chunks (2, 4 or 8)
2741
2742 // Number of (8*X)-byte chunks into tmp
2743 __ movq(tmp, size);
2744 __ shrq(tmp, 3);
2745 __ jccb(Assembler::zero, L_Tail);
2746
2747 __ BIND(L_Loop);
2748
2749 // Unroll 8 stores
2750 for (int i = 0; i < 8; i++) {
2751 switch (type) {
2752 case USM_SHORT:
2753 __ movw(Address(dest, (2 * i)), wide_value);
2754 break;
2755 case USM_DWORD:
2756 __ movl(Address(dest, (4 * i)), wide_value);
2757 break;
2758 case USM_QUADWORD:
2759 __ movq(Address(dest, (8 * i)), wide_value);
2760 break;
2761 }
2762 }
2763 __ addq(dest, incr);
2764 __ decrementq(tmp);
2765 __ jccb(Assembler::notZero, L_Loop);
2766
2767 __ BIND(L_Tail);
2768
2769 // Find number of remaining X-byte chunks
2770 __ andq(size, 0x7);
2771
2772 // If zero, then we're done
2773 __ jccb(Assembler::zero, L_exit);
2774
2775 __ BIND(L_TailLoop);
2776
2777 switch (type) {
2778 case USM_SHORT:
2779 __ movw(Address(dest, 0), wide_value);
2780 break;
2781 case USM_DWORD:
2782 __ movl(Address(dest, 0), wide_value);
2783 break;
2784 case USM_QUADWORD:
2785 __ movq(Address(dest, 0), wide_value);
2786 break;
2787 }
2788 __ addq(dest, incr >> 3);
2789 __ decrementq(size);
2790 __ jccb(Assembler::notZero, L_TailLoop);
2791 }
2792
2793 // Generate 'unsafe' set memory stub
2794 // Though just as safe as the other stubs, it takes an unscaled
2795 // size_t (# bytes) argument instead of an element count.
2796 //
2797 // Input:
2798 // c_rarg0 - destination array address
2799 // c_rarg1 - byte count (size_t)
2800 // c_rarg2 - byte value
2801 //
2802 // Examines the alignment of the operands and dispatches
2803 // to a quadword, dword, short, or byte fill loop.
2804 //
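// Illustrative sketch of the dispatch generated below (hypothetical names):
//
//   wide_value = 0x0101010101010101 * (byteVal & 0xff);  // e.g. 0xAB -> 0xABAB...AB
//   bits = (uintptr_t)dest | size;
//   if      ((bits & 7) == 0) fill with 8-byte stores;
//   else if ((bits & 3) == 0) fill with 4-byte stores;
//   else if ((bits & 1) == 0) fill with 2-byte stores;
//   else                      tail-call the byte fill stub;
//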
2805 address StubGenerator::generate_unsafe_setmemory(address unsafe_byte_fill) {
2806 __ align(CodeEntryAlignment);
2807 StubId stub_id = StubId::stubgen_unsafe_setmemory_id;
2808 StubCodeMark mark(this, stub_id);
2809 address start = __ pc();
2810 __ enter(); // required for proper stackwalking of RuntimeStub frame
2811
2812 assert(unsafe_byte_fill != nullptr, "Invalid call");
2813
2814 // bump this on entry, not on exit:
2815 INC_COUNTER_NP(SharedRuntime::_unsafe_set_memory_ctr, rscratch1);
2816
2817 {
2818 Label L_exit, L_fillQuadwords, L_fillDwords, L_fillBytes;
2819
2820 const Register dest = c_rarg0;
2821 const Register size = c_rarg1;
2822 const Register byteVal = c_rarg2;
2823 const Register wide_value = rax;
2824 const Register rScratch1 = r10;
2825
2826 assert_different_registers(dest, size, byteVal, wide_value, rScratch1);
2827
2828 // fill_to_memory_atomic(unsigned char*, unsigned long, unsigned char)
2829
2830 __ testq(size, size);
2831 __ jcc(Assembler::zero, L_exit);
2832
2833 // Propagate byte to full Register
2834 __ movzbl(rScratch1, byteVal);
2835 __ mov64(wide_value, 0x0101010101010101ULL);
2836 __ imulq(wide_value, rScratch1);
2837
2838 // Check for pointer & size alignment
2839 __ movq(rScratch1, dest);
2840 __ orq(rScratch1, size);
2841
2842 __ testb(rScratch1, 7);
2843 __ jcc(Assembler::equal, L_fillQuadwords);
2844
2845 __ testb(rScratch1, 3);
2846 __ jcc(Assembler::equal, L_fillDwords);
2847
2848 __ testb(rScratch1, 1);
2849 __ jcc(Assembler::notEqual, L_fillBytes);
2850
2851 // Fill words
2852 {
2853 UnsafeMemoryAccessMark umam(this, true, true);
2854
2855 // At this point, we know dest and size are both multiples of 2
2856 // (the low bit of dest|size is zero)
2857 do_setmemory_atomic_loop(USM_SHORT, dest, size, wide_value, rScratch1,
2858 L_exit, _masm);
2859 }
2860 __ jmpb(L_exit);
2861
2862 __ BIND(L_fillQuadwords);
2863
2864 // Fill QUADWORDs
2865 {
2866 UnsafeMemoryAccessMark umam(this, true, true);
2867
2868 // At this point, we know dest and size are both multiples of 8
2869 // (the low 3 bits of dest|size are zero)
2870 do_setmemory_atomic_loop(USM_QUADWORD, dest, size, wide_value, rScratch1,
2871 L_exit, _masm);
2872 }
2873 __ BIND(L_exit);
2874
2875 __ leave(); // required for proper stackwalking of RuntimeStub frame
2876 __ ret(0);
2877
2878 __ BIND(L_fillDwords);
2879
2880 // Fill DWORDs
2881 {
2882 UnsafeMemoryAccessMark umam(this, true, true);
2883
2884 // At this point, we know dest and size are both multiples of 4
2885 // (the low 2 bits of dest|size are zero)
2886 do_setmemory_atomic_loop(USM_DWORD, dest, size, wide_value, rScratch1,
2887 L_exit, _masm);
2888 }
2889 __ jmpb(L_exit);
2890
2891 __ BIND(L_fillBytes);
2892 // Set up for tail call to previously generated byte fill routine
2893 // Parameter order is (ptr, byteVal, size)
2894 __ xchgq(c_rarg1, c_rarg2);
2895 __ leave(); // Clear effect of enter()
2896 __ jump(RuntimeAddress(unsafe_byte_fill));
2897 }
2898
2899 return start;
2900 }
2901
2902 // Perform range checks on the proposed arraycopy.
2903 // Kills temp, but nothing else.
2904 // Also, clean the sign bits of src_pos and dst_pos.
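// Illustrative sketch (hypothetical names; the compares are unsigned 32-bit):
//
//   if ((uint32_t)(src_pos + length) > (uint32_t)src->length()) goto L_failed;
//   if ((uint32_t)(dst_pos + length) > (uint32_t)dst->length()) goto L_failed;
//   src_pos = (int64_t)(int32_t)src_pos;   // sign-extend; values are non-negative
//   dst_pos = (int64_t)(int32_t)dst_pos;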
2905 void StubGenerator::arraycopy_range_checks(Register src, // source array oop (c_rarg0)
2906 Register src_pos, // source position (c_rarg1)
2907 Register dst, // destination array oop (c_rarg2)
2908 Register dst_pos, // destination position (c_rarg3)
2909 Register length,
2910 Register temp,
2911 Label& L_failed) {
2912 BLOCK_COMMENT("arraycopy_range_checks:");
2913
2914 // if (src_pos + length > arrayOop(src)->length()) FAIL;
2915 __ movl(temp, length);
2916 __ addl(temp, src_pos); // src_pos + length
2917 __ cmpl(temp, Address(src, arrayOopDesc::length_offset_in_bytes()));
2918 __ jcc(Assembler::above, L_failed);
2919
2920 // if (dst_pos + length > arrayOop(dst)->length()) FAIL;
2921 __ movl(temp, length);
2922 __ addl(temp, dst_pos); // dst_pos + length
2923 __ cmpl(temp, Address(dst, arrayOopDesc::length_offset_in_bytes()));
2924 __ jcc(Assembler::above, L_failed);
2925
2926 // Have to clean up the high 32 bits of 'src_pos' and 'dst_pos'.
2927 // A sign-extending move suffices since both values are non-negative.
2928 __ movslq(src_pos, src_pos);
2929 __ movslq(dst_pos, dst_pos);
2930
2931 BLOCK_COMMENT("arraycopy_range_checks done");
2932 }
2933
2934
2935 // Generate generic array copy stubs
2936 //
2937 // Input:
2938 // c_rarg0 - src oop
2939 // c_rarg1 - src_pos (32-bits)
2940 // c_rarg2 - dst oop
2941 // c_rarg3 - dst_pos (32-bits)
2942 // not Win64
2943 // c_rarg4 - element count (32-bits)
2944 // Win64
2945 // rsp+40 - element count (32-bits)
2946 //
2947 // Output:
2948 // rax == 0 - success
2949 // rax == -1^K - failure, where K is partial transfer count
2950 //
2951 address StubGenerator::generate_generic_copy(address byte_copy_entry, address short_copy_entry,
2952 address int_copy_entry, address oop_copy_entry,
2953 address long_copy_entry, address checkcast_copy_entry) {
2954
2955 Label L_failed, L_failed_0, L_objArray;
2956 Label L_copy_shorts, L_copy_ints, L_copy_longs;
2957
2958 // Input registers
2959 const Register src = c_rarg0; // source array oop
2960 const Register src_pos = c_rarg1; // source position
2961 const Register dst = c_rarg2; // destination array oop
2962 const Register dst_pos = c_rarg3; // destination position
2963 #ifndef _WIN64
2964 const Register length = c_rarg4;
2965 const Register rklass_tmp = r9; // load_klass
2966 #else
2967 const Address length(rsp, 7 * wordSize); // elements count is on stack on Win64
2968 const Register rklass_tmp = rdi; // load_klass
2969 #endif
2970
2971 { int modulus = CodeEntryAlignment;
2972 int target = modulus - 5; // 5 = sizeof jmp(L_failed)
2973 int advance = target - (__ offset() % modulus);
2974 if (advance < 0) advance += modulus;
2975 if (advance > 0) __ nop(advance);
2976 }
2977 StubId stub_id = StubId::stubgen_generic_arraycopy_id;
2978 StubCodeMark mark(this, stub_id);
2979
2980 // Short-hop target to L_failed. Makes for denser prologue code.
2981 __ BIND(L_failed_0);
2982 __ jmp(L_failed);
2983 assert(__ offset() % CodeEntryAlignment == 0, "no further alignment needed");
2984
2985 __ align(CodeEntryAlignment);
2986 address start = __ pc();
2987
2988 __ enter(); // required for proper stackwalking of RuntimeStub frame
2989
2990 #ifdef _WIN64
2991 __ push_ppx(rklass_tmp); // rdi is callee-save on Windows
2992 #endif
2993
2994 // bump this on entry, not on exit:
2995 INC_COUNTER_NP(SharedRuntime::_generic_array_copy_ctr, rscratch1);
2996
2997 //-----------------------------------------------------------------------
2998 // Assembler stub will be used for this call to arraycopy
2999 // if the following conditions are met:
3000 //
3001 // (1) src and dst must not be null.
3002 // (2) src_pos must not be negative.
3003 // (3) dst_pos must not be negative.
3004 // (4) length must not be negative.
3005 // (5) src klass and dst klass should be the same and not null.
3006 // (6) src and dst should be arrays.
3007 // (7) src_pos + length must not exceed length of src.
3008 // (8) dst_pos + length must not exceed length of dst.
3009 //
3010
3011 // if (src == nullptr) return -1;
3012 __ testptr(src, src); // src oop
3013 size_t j1off = __ offset();
3014 __ jccb(Assembler::zero, L_failed_0);
3015
3016 // if (src_pos < 0) return -1;
3017 __ testl(src_pos, src_pos); // src_pos (32-bits)
3018 __ jccb(Assembler::negative, L_failed_0);
3019
3020 // if (dst == nullptr) return -1;
3021 __ testptr(dst, dst); // dst oop
3022 __ jccb(Assembler::zero, L_failed_0);
3023
3024 // if (dst_pos < 0) return -1;
3025 __ testl(dst_pos, dst_pos); // dst_pos (32-bits)
3026 size_t j4off = __ offset();
3027 __ jccb(Assembler::negative, L_failed_0);
3028
3029 // The first four tests are very dense code,
3030 // but not quite dense enough to put four
3031 // jumps in a 16-byte instruction fetch buffer.
  // That's good, because some branch predictors
3033 // do not like jumps so close together.
3034 // Make sure of this.
3035 guarantee(((j1off ^ j4off) & ~15) != 0, "I$ line of 1st & 4th jumps");
3036
3037 // registers used as temp
3038 const Register r11_length = r11; // elements count to copy
3039 const Register r10_src_klass = r10; // array klass
3040
3041 // if (length < 0) return -1;
3042 __ movl(r11_length, length); // length (elements count, 32-bits value)
3043 __ testl(r11_length, r11_length);
3044 __ jccb(Assembler::negative, L_failed_0);
3045
3046 __ load_klass(r10_src_klass, src, rklass_tmp);
3047 #ifdef ASSERT
3048 // assert(src->klass() != nullptr);
3049 {
3050 BLOCK_COMMENT("assert klasses not null {");
3051 Label L1, L2;
3052 __ testptr(r10_src_klass, r10_src_klass);
3053 __ jcc(Assembler::notZero, L2); // it is broken if klass is null
3054 __ bind(L1);
3055 __ stop("broken null klass");
3056 __ bind(L2);
3057 __ load_klass(rax, dst, rklass_tmp);
3058 __ cmpq(rax, 0);
3059 __ jcc(Assembler::equal, L1); // this would be broken also
3060 BLOCK_COMMENT("} assert klasses not null done");
3061 }
3062 #endif
3063
3064 // Load layout helper (32-bits)
3065 //
  //  |array_tag|     | header_size | element_type |     |log2_element_size|
  // 32        30    24            16              8     2                 0
3068 //
3069 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
3070 //
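  // For illustration (exact values depend on the build): an int[] klass
  // typically packs array_tag = 0x3, a header size equal to
  // arrayOopDesc::base_offset_in_bytes(T_INT), element_type = T_INT and
  // log2_element_size = 2 into this single 32-bit word; the code below
  // only consumes the tag, header-size and log2-element-size fields.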
3071
3072 const int lh_offset = in_bytes(Klass::layout_helper_offset());
3073
3074 // Handle objArrays completely differently...
3075 const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
3076 __ cmpl(Address(r10_src_klass, lh_offset), objArray_lh);
3077 __ jcc(Assembler::equal, L_objArray);
3078
3079 // if (src->klass() != dst->klass()) return -1;
3080 __ load_klass(rax, dst, rklass_tmp);
3081 __ cmpq(r10_src_klass, rax);
3082 __ jcc(Assembler::notEqual, L_failed);
3083
3084 const Register rax_lh = rax; // layout helper
3085 __ movl(rax_lh, Address(r10_src_klass, lh_offset));
3086
3087 // if (!src->is_Array()) return -1;
3088 __ cmpl(rax_lh, Klass::_lh_neutral_value);
3089 __ jcc(Assembler::greaterEqual, L_failed);
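  // (Array layout helpers carry the array tag in their top bits and are
  // therefore negative as a jint, so a value >= _lh_neutral_value means
  // the klass is not an array.)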
3090
3091 // At this point, it is known to be a typeArray (array_tag 0x3).
3092 #ifdef ASSERT
3093 {
3094 BLOCK_COMMENT("assert primitive array {");
3095 Label L;
3096 __ cmpl(rax_lh, (Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift));
3097 __ jcc(Assembler::greaterEqual, L);
3098 __ stop("must be a primitive array");
3099 __ bind(L);
3100 BLOCK_COMMENT("} assert primitive array done");
3101 }
3102 #endif
3103
3104 arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
3105 r10, L_failed);
3106
3107 // TypeArrayKlass
3108 //
3109 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
3110 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
3111 //
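  // Illustrative example (the header size depends on the build): for an
  // int[] with a 16-byte array header and src_pos == 4, the code below
  // computes src_addr = src + 16 + (4 << 2) = src + 32.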
3112
3113 const Register r10_offset = r10; // array offset
3114 const Register rax_elsize = rax_lh; // element size
3115
3116 __ movl(r10_offset, rax_lh);
3117 __ shrl(r10_offset, Klass::_lh_header_size_shift);
3118 __ andptr(r10_offset, Klass::_lh_header_size_mask); // array_offset
3119 __ addptr(src, r10_offset); // src array offset
3120 __ addptr(dst, r10_offset); // dst array offset
3121 BLOCK_COMMENT("choose copy loop based on element size");
3122 __ andl(rax_lh, Klass::_lh_log2_element_size_mask); // rax_lh -> rax_elsize
3123
3124 #ifdef _WIN64
3125 __ pop_ppx(rklass_tmp); // Restore callee-save rdi
3126 #endif
3127
  // The following registers must be set up before jumping to the corresponding copy stub.
3129 const Register from = c_rarg0; // source array address
3130 const Register to = c_rarg1; // destination array address
3131 const Register count = c_rarg2; // elements count
3132
  // 'from', 'to' and 'count' must be written in this order: they alias
  // 'src', 'src_pos' and 'dst' (c_rarg0, c_rarg1, c_rarg2), so each input
  // is read before the register that holds it is overwritten.
3135
3136 __ cmpl(rax_elsize, 0);
3137 __ jccb(Assembler::notEqual, L_copy_shorts);
3138 __ lea(from, Address(src, src_pos, Address::times_1, 0));// src_addr
3139 __ lea(to, Address(dst, dst_pos, Address::times_1, 0));// dst_addr
3140 __ movl2ptr(count, r11_length); // length
3141 __ jump(RuntimeAddress(byte_copy_entry));
3142
3143 __ BIND(L_copy_shorts);
3144 __ cmpl(rax_elsize, LogBytesPerShort);
3145 __ jccb(Assembler::notEqual, L_copy_ints);
3146 __ lea(from, Address(src, src_pos, Address::times_2, 0));// src_addr
3147 __ lea(to, Address(dst, dst_pos, Address::times_2, 0));// dst_addr
3148 __ movl2ptr(count, r11_length); // length
3149 __ jump(RuntimeAddress(short_copy_entry));
3150
3151 __ BIND(L_copy_ints);
3152 __ cmpl(rax_elsize, LogBytesPerInt);
3153 __ jccb(Assembler::notEqual, L_copy_longs);
3154 __ lea(from, Address(src, src_pos, Address::times_4, 0));// src_addr
3155 __ lea(to, Address(dst, dst_pos, Address::times_4, 0));// dst_addr
3156 __ movl2ptr(count, r11_length); // length
3157 __ jump(RuntimeAddress(int_copy_entry));
3158
3159 __ BIND(L_copy_longs);
3160 #ifdef ASSERT
3161 {
3162 BLOCK_COMMENT("assert long copy {");
3163 Label L;
3164 __ cmpl(rax_elsize, LogBytesPerLong);
3165 __ jcc(Assembler::equal, L);
3166 __ stop("must be long copy, but elsize is wrong");
3167 __ bind(L);
3168 BLOCK_COMMENT("} assert long copy done");
3169 }
3170 #endif
3171 __ lea(from, Address(src, src_pos, Address::times_8, 0));// src_addr
3172 __ lea(to, Address(dst, dst_pos, Address::times_8, 0));// dst_addr
3173 __ movl2ptr(count, r11_length); // length
3174 __ jump(RuntimeAddress(long_copy_entry));
3175
3176 // ObjArrayKlass
3177 __ BIND(L_objArray);
3178 // live at this point: r10_src_klass, r11_length, src[_pos], dst[_pos]
3179
3180 Label L_plain_copy, L_checkcast_copy;
3181 // test array classes for subtyping
3182 __ load_klass(rax, dst, rklass_tmp);
3183 __ cmpq(r10_src_klass, rax); // usual case is exact equality
3184 __ jcc(Assembler::notEqual, L_checkcast_copy);
3185
3186 // Identically typed arrays can be copied without element-wise checks.
3187 arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
3188 r10, L_failed);
3189
3190 __ lea(from, Address(src, src_pos, TIMES_OOP,
3191 arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // src_addr
3192 __ lea(to, Address(dst, dst_pos, TIMES_OOP,
3193 arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // dst_addr
3194 __ movl2ptr(count, r11_length); // length
3195 __ BIND(L_plain_copy);
3196 #ifdef _WIN64
3197 __ pop_ppx(rklass_tmp); // Restore callee-save rdi
3198 #endif
3199 __ jump(RuntimeAddress(oop_copy_entry));
3200
3201 __ BIND(L_checkcast_copy);
3202 // live at this point: r10_src_klass, r11_length, rax (dst_klass)
3203 {
3204 // Before looking at dst.length, make sure dst is also an objArray.
3205 __ cmpl(Address(rax, lh_offset), objArray_lh);
3206 __ jcc(Assembler::notEqual, L_failed);
3207
3208 // It is safe to examine both src.length and dst.length.
3209 arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
3210 rax, L_failed);
3211
3212 const Register r11_dst_klass = r11;
3213 __ load_klass(r11_dst_klass, dst, rklass_tmp); // reload
3214
3215 // Marshal the base address arguments now, freeing registers.
3216 __ lea(from, Address(src, src_pos, TIMES_OOP,
3217 arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
3218 __ lea(to, Address(dst, dst_pos, TIMES_OOP,
3219 arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
3220 __ movl(count, length); // length (reloaded)
3221 Register sco_temp = c_rarg3; // this register is free now
3222 assert_different_registers(from, to, count, sco_temp,
3223 r11_dst_klass, r10_src_klass);
3224 assert_clean_int(count, sco_temp);
3225
3226 // Generate the type check.
3227 const int sco_offset = in_bytes(Klass::super_check_offset_offset());
3228 __ movl(sco_temp, Address(r11_dst_klass, sco_offset));
3229 assert_clean_int(sco_temp, rax);
3230 generate_type_check(r10_src_klass, sco_temp, r11_dst_klass, L_plain_copy);
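    // Roughly: generate_type_check branches to L_plain_copy when
    // r10_src_klass is a subtype of r11_dst_klass (the arrays are then
    // store-compatible and the plain oop copy above suffices); on a miss
    // it falls through to the element-wise checkcast copy set up below.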
3231
3232 // Fetch destination element klass from the ObjArrayKlass header.
3233 int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
3234 __ movptr(r11_dst_klass, Address(r11_dst_klass, ek_offset));
3235 __ movl( sco_temp, Address(r11_dst_klass, sco_offset));
3236 assert_clean_int(sco_temp, rax);
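    // At this point sco_temp (c_rarg3) holds the destination element
    // klass's super_check_offset and r11_dst_klass the element klass
    // itself; below they become the extra 'ckoff' and 'ckval' arguments
    // expected by the checkcast copy stub.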
3237
3238 #ifdef _WIN64
3239 __ pop_ppx(rklass_tmp); // Restore callee-save rdi
3240 #endif
3241
3242 // the checkcast_copy loop needs two extra arguments:
3243 assert(c_rarg3 == sco_temp, "#3 already in place");
3244 // Set up arguments for checkcast_copy_entry.
3245 setup_arg_regs_using_thread(4);
    __ movptr(r8, r11_dst_klass); // dst.klass.element_klass; r8 is c_rarg4 on non-Windows ABIs
3247 __ jump(RuntimeAddress(checkcast_copy_entry));
3248 }
3249
3250 __ BIND(L_failed);
3251 #ifdef _WIN64
3252 __ pop_ppx(rklass_tmp); // Restore callee-save rdi
3253 #endif
3254 __ xorptr(rax, rax);
3255 __ notptr(rax); // return -1
3256 __ leave(); // required for proper stackwalking of RuntimeStub frame
3257 __ ret(0);
3258
3259 return start;
3260 }
3261
3262 #undef __