/*
 * Copyright (c) 2003, 2022, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/macroAssembler.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "oops/objArrayKlass.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubRoutines.hpp"
#include "stubGenerator_x86_64.hpp"
#ifdef COMPILER2
#include "opto/c2_globals.hpp"
#endif
#if INCLUDE_JVMCI
#include "jvmci/jvmci_globals.hpp"
#endif

#define __ _masm->

#define TIMES_OOP (UseCompressedOops ? Address::times_4 : Address::times_8)

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) __ block_comment(str)
#endif // PRODUCT

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

#ifdef PRODUCT
#define INC_COUNTER_NP(counter, rscratch) ((void)0)
#else
#define INC_COUNTER_NP(counter, rscratch) \
  BLOCK_COMMENT("inc_counter " #counter); \
  inc_counter_np(_masm, counter, rscratch);

static void inc_counter_np(MacroAssembler* _masm, int& counter, Register rscratch) {
  __ incrementl(ExternalAddress((address)&counter), rscratch);
}

#if COMPILER2_OR_JVMCI
static int& get_profile_ctr(int shift) {
  if (shift == 0) {
    return SharedRuntime::_jbyte_array_copy_ctr;
  } else if (shift == 1) {
    return SharedRuntime::_jshort_array_copy_ctr;
  } else if (shift == 2) {
    return SharedRuntime::_jint_array_copy_ctr;
  } else {
    assert(shift == 3, "");
    return SharedRuntime::_jlong_array_copy_ctr;
  }
}
#endif // COMPILER2_OR_JVMCI
#endif // !PRODUCT

void StubGenerator::generate_arraycopy_stubs() {
  address entry;
  address entry_jbyte_arraycopy;
  address entry_jshort_arraycopy;
  address entry_jint_arraycopy;
  address entry_oop_arraycopy;
  address entry_jlong_arraycopy;
  address entry_checkcast_arraycopy;

  StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(false, &entry,
      "jbyte_disjoint_arraycopy");
  StubRoutines::_jbyte_arraycopy = generate_conjoint_byte_copy(false, entry, &entry_jbyte_arraycopy,
      "jbyte_arraycopy");

  StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, &entry,
      "jshort_disjoint_arraycopy");
  StubRoutines::_jshort_arraycopy = generate_conjoint_short_copy(false, entry, &entry_jshort_arraycopy,
      "jshort_arraycopy");

  StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_int_oop_copy(false, false, &entry,
      "jint_disjoint_arraycopy");
  StubRoutines::_jint_arraycopy = generate_conjoint_int_oop_copy(false, false, entry,
      &entry_jint_arraycopy, "jint_arraycopy");

  StubRoutines::_jlong_disjoint_arraycopy = generate_disjoint_long_oop_copy(false, false, &entry,
      "jlong_disjoint_arraycopy");
  StubRoutines::_jlong_arraycopy = generate_conjoint_long_oop_copy(false, false, entry,
      &entry_jlong_arraycopy, "jlong_arraycopy");
  if (UseCompressedOops) {
    StubRoutines::_oop_disjoint_arraycopy = generate_disjoint_int_oop_copy(false, true, &entry,
        "oop_disjoint_arraycopy");
    StubRoutines::_oop_arraycopy = generate_conjoint_int_oop_copy(false, true, entry,
        &entry_oop_arraycopy, "oop_arraycopy");
    StubRoutines::_oop_disjoint_arraycopy_uninit = generate_disjoint_int_oop_copy(false, true, &entry,
        "oop_disjoint_arraycopy_uninit",
        /*dest_uninitialized*/true);
    StubRoutines::_oop_arraycopy_uninit = generate_conjoint_int_oop_copy(false, true, entry,
        NULL, "oop_arraycopy_uninit",
        /*dest_uninitialized*/true);
  } else {
    StubRoutines::_oop_disjoint_arraycopy = generate_disjoint_long_oop_copy(false, true, &entry,
        "oop_disjoint_arraycopy");
    StubRoutines::_oop_arraycopy = generate_conjoint_long_oop_copy(false, true, entry,
        &entry_oop_arraycopy, "oop_arraycopy");
    StubRoutines::_oop_disjoint_arraycopy_uninit = generate_disjoint_long_oop_copy(false, true, &entry,
        "oop_disjoint_arraycopy_uninit",
        /*dest_uninitialized*/true);
    StubRoutines::_oop_arraycopy_uninit = generate_conjoint_long_oop_copy(false, true, entry,
        NULL, "oop_arraycopy_uninit",
        /*dest_uninitialized*/true);
  }

  StubRoutines::_checkcast_arraycopy = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
  StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
      /*dest_uninitialized*/true);

  StubRoutines::_unsafe_arraycopy = generate_unsafe_copy("unsafe_arraycopy",
      entry_jbyte_arraycopy,
      entry_jshort_arraycopy,
      entry_jint_arraycopy,
      entry_jlong_arraycopy);
  StubRoutines::_generic_arraycopy = generate_generic_copy("generic_arraycopy",
      entry_jbyte_arraycopy,
      entry_jshort_arraycopy,
      entry_jint_arraycopy,
      entry_oop_arraycopy,
      entry_jlong_arraycopy,
      entry_checkcast_arraycopy);

  StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
  StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
  StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
  StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
  StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
  StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");

  // We don't generate specialized code for HeapWord-aligned source
  // arrays, so just use the code we've already generated
  StubRoutines::_arrayof_jbyte_disjoint_arraycopy = StubRoutines::_jbyte_disjoint_arraycopy;
  StubRoutines::_arrayof_jbyte_arraycopy = StubRoutines::_jbyte_arraycopy;

  StubRoutines::_arrayof_jshort_disjoint_arraycopy = StubRoutines::_jshort_disjoint_arraycopy;
  StubRoutines::_arrayof_jshort_arraycopy = StubRoutines::_jshort_arraycopy;

  StubRoutines::_arrayof_jint_disjoint_arraycopy = StubRoutines::_jint_disjoint_arraycopy;
  StubRoutines::_arrayof_jint_arraycopy = StubRoutines::_jint_arraycopy;

  StubRoutines::_arrayof_jlong_disjoint_arraycopy = StubRoutines::_jlong_disjoint_arraycopy;
  StubRoutines::_arrayof_jlong_arraycopy = StubRoutines::_jlong_arraycopy;

  StubRoutines::_arrayof_oop_disjoint_arraycopy = StubRoutines::_oop_disjoint_arraycopy;
  StubRoutines::_arrayof_oop_arraycopy = StubRoutines::_oop_arraycopy;

  StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit = StubRoutines::_oop_disjoint_arraycopy_uninit;
  StubRoutines::_arrayof_oop_arraycopy_uninit = StubRoutines::_oop_arraycopy_uninit;
}


// Verify that a register contains a clean 32-bit positive value
// (high 32 bits are 0) so it can be used in 64-bit shifts.
//
// Input:
//   Rint - 32-bit value
//   Rtmp - scratch
//
void StubGenerator::assert_clean_int(Register Rint, Register Rtmp) {
#ifdef ASSERT
  Label L;
  assert_different_registers(Rtmp, Rint);
  __ movslq(Rtmp, Rint);
  __ cmpq(Rtmp, Rint);
  __ jcc(Assembler::equal, L);
  __ stop("high 32-bits of int value are not 0");
  __ bind(L);
#endif
}


// Generate overlap test for array copy stubs
//
// Input:
//   c_rarg0 - from
//   c_rarg1 - to
//   c_rarg2 - element count
//
// Output:
//   rax - &from[element count - 1]
//
void StubGenerator::array_overlap_test(address no_overlap_target, Label* NOLp, Address::ScaleFactor sf) {
  const Register from     = c_rarg0;
  const Register to       = c_rarg1;
  const Register count    = c_rarg2;
  const Register end_from = rax;

  __ cmpptr(to, from);
  __ lea(end_from, Address(from, count, sf, 0));
  if (NOLp == NULL) {
    ExternalAddress no_overlap(no_overlap_target);
    __ jump_cc(Assembler::belowEqual, no_overlap);
    __ cmpptr(to, end_from);
    __ jump_cc(Assembler::aboveEqual, no_overlap);
  } else {
    __ jcc(Assembler::belowEqual, (*NOLp));
    __ cmpptr(to, end_from);
    __ jcc(Assembler::aboveEqual, (*NOLp));
  }
}


// Copy big chunks forward
//
// Inputs:
//   end_from       - source array end address
//   end_to         - destination array end address
//   qword_count    - 64-bit element count, negative
//   to             - scratch
//   L_copy_bytes   - entry label
//   L_copy_8_bytes - exit label
//
void StubGenerator::copy_bytes_forward(Register end_from, Register end_to,
                                       Register qword_count, Register to,
                                       Label& L_copy_bytes, Label& L_copy_8_bytes) {
  DEBUG_ONLY(__ stop("enter at entry label, not here"));
  Label L_loop;
  __ align(OptoLoopAlignment);
  if (UseUnalignedLoadStores) {
    Label L_end;
    __ BIND(L_loop);
    if (UseAVX >= 2) {
      __ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56));
      __ vmovdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0);
      __ vmovdqu(xmm1, Address(end_from, qword_count, Address::times_8, -24));
      __ vmovdqu(Address(end_to, qword_count, Address::times_8, -24), xmm1);
    } else {
      __ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56));
      __ movdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0);
      __ movdqu(xmm1, Address(end_from, qword_count, Address::times_8, -40));
      __ movdqu(Address(end_to, qword_count, Address::times_8, -40), xmm1);
      __ movdqu(xmm2, Address(end_from, qword_count, Address::times_8, -24));
      __ movdqu(Address(end_to, qword_count, Address::times_8, -24), xmm2);
      __ movdqu(xmm3, Address(end_from, qword_count, Address::times_8, - 8));
      __ movdqu(Address(end_to, qword_count, Address::times_8, - 8), xmm3);
    }

    __ BIND(L_copy_bytes);
    __ addptr(qword_count, 8);
    __ jcc(Assembler::lessEqual, L_loop);
    __ subptr(qword_count, 4);  // sub(8) and add(4)
    __ jccb(Assembler::greater, L_end);
    // Copy trailing 32 bytes
    if (UseAVX >= 2) {
      __ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -24));
      __ vmovdqu(Address(end_to, qword_count, Address::times_8, -24), xmm0);
    } else {
      __ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -24));
      __ movdqu(Address(end_to, qword_count, Address::times_8, -24), xmm0);
      __ movdqu(xmm1, Address(end_from, qword_count, Address::times_8, - 8));
      __ movdqu(Address(end_to, qword_count, Address::times_8, - 8), xmm1);
    }
    __ addptr(qword_count, 4);
    __ BIND(L_end);
  } else {
    // Copy 32 bytes per iteration
    __ BIND(L_loop);
    __ movq(to, Address(end_from, qword_count, Address::times_8, -24));
    __ movq(Address(end_to, qword_count, Address::times_8, -24), to);
    __ movq(to, Address(end_from, qword_count, Address::times_8, -16));
    __ movq(Address(end_to, qword_count, Address::times_8, -16), to);
    __ movq(to, Address(end_from, qword_count, Address::times_8, - 8));
    __ movq(Address(end_to, qword_count, Address::times_8, - 8), to);
    __ movq(to, Address(end_from, qword_count, Address::times_8, - 0));
    __ movq(Address(end_to, qword_count, Address::times_8, - 0), to);

    __ BIND(L_copy_bytes);
    __ addptr(qword_count, 4);
    __ jcc(Assembler::lessEqual, L_loop);
  }
  __ subptr(qword_count, 4);
  __ jcc(Assembler::less, L_copy_8_bytes);  // Copy trailing qwords
}


// Copy big chunks backward
//
// Inputs:
//   from           - source array address
//   dest           - destination array address
//   qword_count    - 64-bit element count
//   to             - scratch
//   L_copy_bytes   - entry label
//   L_copy_8_bytes - exit label
//
void StubGenerator::copy_bytes_backward(Register from, Register dest,
                                        Register qword_count, Register to,
                                        Label& L_copy_bytes, Label& L_copy_8_bytes) {
  DEBUG_ONLY(__ stop("enter at entry label, not here"));
  Label L_loop;
  __ align(OptoLoopAlignment);
  if (UseUnalignedLoadStores) {
    Label L_end;
    __ BIND(L_loop);
    if (UseAVX >= 2) {
      __ vmovdqu(xmm0, Address(from, qword_count, Address::times_8, 32));
      __ vmovdqu(Address(dest, qword_count, Address::times_8, 32), xmm0);
      __ vmovdqu(xmm1, Address(from, qword_count, Address::times_8, 0));
      __ vmovdqu(Address(dest, qword_count, Address::times_8, 0), xmm1);
    } else {
      __ movdqu(xmm0, Address(from, qword_count, Address::times_8, 48));
      __ movdqu(Address(dest, qword_count, Address::times_8, 48), xmm0);
      __ movdqu(xmm1, Address(from, qword_count, Address::times_8, 32));
      __ movdqu(Address(dest, qword_count, Address::times_8, 32), xmm1);
      __ movdqu(xmm2, Address(from, qword_count, Address::times_8, 16));
      __ movdqu(Address(dest, qword_count, Address::times_8, 16), xmm2);
      __ movdqu(xmm3, Address(from, qword_count, Address::times_8, 0));
      __ movdqu(Address(dest, qword_count, Address::times_8, 0), xmm3);
    }

    __ BIND(L_copy_bytes);
    __ subptr(qword_count, 8);
    __ jcc(Assembler::greaterEqual, L_loop);

    __ addptr(qword_count, 4);  // add(8) and sub(4)
    __ jccb(Assembler::less, L_end);
    // Copy trailing 32 bytes
    if (UseAVX >= 2) {
      __ vmovdqu(xmm0, Address(from, qword_count, Address::times_8, 0));
      __ vmovdqu(Address(dest, qword_count, Address::times_8, 0), xmm0);
    } else {
      __ movdqu(xmm0, Address(from, qword_count, Address::times_8, 16));
      __ movdqu(Address(dest, qword_count, Address::times_8, 16), xmm0);
      __ movdqu(xmm1, Address(from, qword_count, Address::times_8, 0));
      __ movdqu(Address(dest, qword_count, Address::times_8, 0), xmm1);
    }
    __ subptr(qword_count, 4);
    __ BIND(L_end);
  } else {
    // Copy 32 bytes per iteration
    __ BIND(L_loop);
    __ movq(to, Address(from, qword_count, Address::times_8, 24));
    __ movq(Address(dest, qword_count, Address::times_8, 24), to);
    __ movq(to, Address(from, qword_count, Address::times_8, 16));
    __ movq(Address(dest, qword_count, Address::times_8, 16), to);
    __ movq(to, Address(from, qword_count, Address::times_8, 8));
    __ movq(Address(dest, qword_count, Address::times_8, 8), to);
    __ movq(to, Address(from, qword_count, Address::times_8, 0));
    __ movq(Address(dest, qword_count, Address::times_8, 0), to);

    __ BIND(L_copy_bytes);
    __ subptr(qword_count, 4);
    __ jcc(Assembler::greaterEqual, L_loop);
  }
  __ addptr(qword_count, 4);
  __ jcc(Assembler::greater, L_copy_8_bytes);  // Copy trailing qwords
}

#if COMPILER2_OR_JVMCI

// Note: the following rules apply to the AVX3-optimized arraycopy stubs:
// - If the target supports the AVX3 features (BW+VL+F), the implementation uses 32-byte vectors (YMMs)
//   for both the special cases (various small block sizes) and the aligned copy loop. This is the
//   default configuration.
// - If the copy length is above AVX3Threshold, the implementation uses 64-byte vectors (ZMMs)
//   for the main copy loop (and subsequent tail) since the bulk of the cycles will be consumed there.
// - If the user forces MaxVectorSize=32, then above 4096 bytes REP MOVS shows
//   better performance for disjoint copies. For conjoint/backward copies the vector-based
//   copy performs better.
// - If the user sets AVX3Threshold=0, then the special cases for small block sizes operate over
//   64-byte vector registers (ZMMs).

// Inputs:
//   c_rarg0 - source array address
//   c_rarg1 - destination array address
//   c_rarg2 - element count, treated as ssize_t, can be zero
//
//
// Side Effects:
//   disjoint_copy_avx3_masked is set to the no-overlap entry point
//   used by generate_conjoint_[byte/int/short/long]_copy().
//
address StubGenerator::generate_disjoint_copy_avx3_masked(address* entry, const char *name,
                                                          int shift, bool aligned, bool is_oop,
                                                          bool dest_uninitialized) {
  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, "StubRoutines", name);
  address start = __ pc();

  int avx3threshold = VM_Version::avx3_threshold();
  bool use64byteVector = (MaxVectorSize > 32) && (avx3threshold == 0);
  Label L_main_loop, L_main_loop_64bytes, L_tail, L_tail64, L_exit, L_entry;
  Label L_repmovs, L_main_pre_loop, L_main_pre_loop_64bytes, L_pre_main_post_64;
  const Register from  = rdi;  // source array address
  const Register to    = rsi;  // destination array address
  const Register count = rdx;  // elements count
  const Register temp1 = r8;
  const Register temp2 = r11;
  const Register temp3 = rax;
  const Register temp4 = rcx;
  // End pointers are inclusive, and if count is not zero they point
  // to the last unit copied: end_to[0] := end_from[0]

  __ enter();  // required for proper stackwalking of RuntimeStub frame
  assert_clean_int(c_rarg2, rax);  // Make sure 'count' is clean int.
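  // Overall flow (as laid out below): lengths up to 192 bytes are handled by the
  // masked-vector special cases; longer copies go through a pre/main/post sequence
  // (a peeled partial copy to align the destination, a 192-byte-per-iteration main
  // loop, then a masked tail), with a REP MOVS path used for large copies when
  // MaxVectorSize < 64.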

  if (entry != NULL) {
    *entry = __ pc();
    // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
    BLOCK_COMMENT("Entry:");
  }

  BasicType type_vec[] = { T_BYTE, T_SHORT, T_INT, T_LONG };
  BasicType type = is_oop ? T_OBJECT : type_vec[shift];

  setup_argument_regs(type);

  DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
  if (dest_uninitialized) {
    decorators |= IS_DEST_UNINITIALIZED;
  }
  if (aligned) {
    decorators |= ARRAYCOPY_ALIGNED;
  }
  BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
  bs->arraycopy_prologue(_masm, decorators, type, from, to, count);

  {
    // Type(shift)     byte(0), short(1), int(2), long(3)
    int loop_size[] = { 192,     96,       48,     24 };
    int threshold[] = { 4096,    2048,     1024,   512 };

    // UnsafeCopyMemory page error: continue after ucm
    UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
    // 'from', 'to' and 'count' are now valid

    // temp1 holds remaining count and temp4 holds running count used to compute
    // next address offset for start of to/from addresses (temp4 * scale).
    __ mov64(temp4, 0);
    __ movq(temp1, count);

    // Zero length check.
    __ BIND(L_tail);
    __ cmpq(temp1, 0);
    __ jcc(Assembler::lessEqual, L_exit);

    // Special cases using 32 byte [masked] vector copy operations.
    arraycopy_avx3_special_cases(xmm1, k2, from, to, temp1, shift,
                                 temp4, temp3, use64byteVector, L_entry, L_exit);

    // PRE-MAIN-POST loop for aligned copy.
    __ BIND(L_entry);

    if (avx3threshold != 0) {
      __ cmpq(count, threshold[shift]);
      if (MaxVectorSize == 64) {
        // Copy using 64 byte vectors.
        __ jcc(Assembler::greaterEqual, L_pre_main_post_64);
      } else {
        assert(MaxVectorSize < 64, "vector size should be < 64 bytes");
        // REP MOVS offers a faster copy path.
        __ jcc(Assembler::greaterEqual, L_repmovs);
      }
    }

    if ((MaxVectorSize < 64) || (avx3threshold != 0)) {
      // Partial copy to make dst address 32 byte aligned.
      __ movq(temp2, to);
      __ andq(temp2, 31);
      __ jcc(Assembler::equal, L_main_pre_loop);

      __ negptr(temp2);
      __ addq(temp2, 32);
      if (shift) {
        __ shrq(temp2, shift);
      }
      __ movq(temp3, temp2);
      copy32_masked_avx(to, from, xmm1, k2, temp3, temp4, temp1, shift);
      __ movq(temp4, temp2);
      __ movq(temp1, count);
      __ subq(temp1, temp2);

      __ cmpq(temp1, loop_size[shift]);
      __ jcc(Assembler::less, L_tail);

      __ BIND(L_main_pre_loop);
      __ subq(temp1, loop_size[shift]);

      // Main loop with aligned copy block size of 192 bytes at 32 byte granularity.
      __ align32();
      __ BIND(L_main_loop);
      copy64_avx(to, from, temp4, xmm1, false, shift, 0);
      copy64_avx(to, from, temp4, xmm1, false, shift, 64);
      copy64_avx(to, from, temp4, xmm1, false, shift, 128);
      __ addptr(temp4, loop_size[shift]);
      __ subq(temp1, loop_size[shift]);
      __ jcc(Assembler::greater, L_main_loop);

      __ addq(temp1, loop_size[shift]);

      // Tail loop.
      __ jmp(L_tail);

      __ BIND(L_repmovs);
      __ movq(temp2, temp1);
      // Swap to(RSI) and from(RDI) addresses to comply with REP MOVS semantics.
      __ movq(temp3, to);
      __ movq(to, from);
      __ movq(from, temp3);
      // Save to/from for restoration post rep_mov.
      __ movq(temp1, to);
      __ movq(temp3, from);
      if (shift < 3) {
        __ shrq(temp2, 3 - shift);  // quad word count
      }
      __ movq(temp4, temp2);  // move quad word count into temp4(RCX).
      __ rep_mov();
      __ shlq(temp2, 3);  // convert quad words into byte count.
      if (shift) {
        __ shrq(temp2, shift);  // type specific count.
      }
      // Restore original addresses in to/from.
      __ movq(to, temp3);
      __ movq(from, temp1);
      __ movq(temp4, temp2);
      __ movq(temp1, count);
      __ subq(temp1, temp2);  // trailing part (less than a quad word size).
      __ jmp(L_tail);
    }

    if (MaxVectorSize > 32) {
      __ BIND(L_pre_main_post_64);
      // Partial copy to make dst address 64 byte aligned.
      __ movq(temp2, to);
      __ andq(temp2, 63);
      __ jcc(Assembler::equal, L_main_pre_loop_64bytes);

      __ negptr(temp2);
      __ addq(temp2, 64);
      if (shift) {
        __ shrq(temp2, shift);
      }
      __ movq(temp3, temp2);
      copy64_masked_avx(to, from, xmm1, k2, temp3, temp4, temp1, shift, 0, true);
      __ movq(temp4, temp2);
      __ movq(temp1, count);
      __ subq(temp1, temp2);

      __ cmpq(temp1, loop_size[shift]);
      __ jcc(Assembler::less, L_tail64);

      __ BIND(L_main_pre_loop_64bytes);
      __ subq(temp1, loop_size[shift]);

      // Main loop with aligned copy block size of 192 bytes at
      // 64 byte copy granularity.
      __ align32();
      __ BIND(L_main_loop_64bytes);
      copy64_avx(to, from, temp4, xmm1, false, shift, 0, true);
      copy64_avx(to, from, temp4, xmm1, false, shift, 64, true);
      copy64_avx(to, from, temp4, xmm1, false, shift, 128, true);
      __ addptr(temp4, loop_size[shift]);
      __ subq(temp1, loop_size[shift]);
      __ jcc(Assembler::greater, L_main_loop_64bytes);

      __ addq(temp1, loop_size[shift]);
      // Zero length check.
      __ jcc(Assembler::lessEqual, L_exit);

      __ BIND(L_tail64);

      // Tail handling using 64 byte [masked] vector copy operations.
      use64byteVector = true;
      arraycopy_avx3_special_cases(xmm1, k2, from, to, temp1, shift,
                                   temp4, temp3, use64byteVector, L_entry, L_exit);
    }
    __ BIND(L_exit);
  }

  address ucme_exit_pc = __ pc();
  // When called from generic_arraycopy r11 contains specific values
  // used during arraycopy epilogue, re-initializing r11.
  if (is_oop) {
    __ movq(r11, shift == 3 ? count : to);
  }
  bs->arraycopy_epilogue(_masm, decorators, type, from, to, count);
  restore_argument_regs(type);
  INC_COUNTER_NP(get_profile_ctr(shift), rscratch1);  // Update counter after rscratch1 is free
  __ xorptr(rax, rax);  // return 0
  __ vzeroupper();
  __ leave();  // required for proper stackwalking of RuntimeStub frame
  __ ret(0);

  return start;
}


// Inputs:
//   c_rarg0 - source array address
//   c_rarg1 - destination array address
//   c_rarg2 - element count, treated as ssize_t, can be zero
//
//
address StubGenerator::generate_conjoint_copy_avx3_masked(address* entry, const char *name, int shift,
                                                          address nooverlap_target, bool aligned,
                                                          bool is_oop, bool dest_uninitialized) {
  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, "StubRoutines", name);
  address start = __ pc();

  int avx3threshold = VM_Version::avx3_threshold();
  bool use64byteVector = (MaxVectorSize > 32) && (avx3threshold == 0);

  Label L_main_pre_loop, L_main_pre_loop_64bytes, L_pre_main_post_64;
  Label L_main_loop, L_main_loop_64bytes, L_tail, L_tail64, L_exit, L_entry;
  const Register from  = rdi;  // source array address
  const Register to    = rsi;  // destination array address
  const Register count = rdx;  // elements count
  const Register temp1 = r8;
  const Register temp2 = rcx;
  const Register temp3 = r11;
  const Register temp4 = rax;
  // End pointers are inclusive, and if count is not zero they point
  // to the last unit copied: end_to[0] := end_from[0]

  __ enter();  // required for proper stackwalking of RuntimeStub frame
  assert_clean_int(c_rarg2, rax);  // Make sure 'count' is clean int.

  if (entry != NULL) {
    *entry = __ pc();
    // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
    BLOCK_COMMENT("Entry:");
  }

  array_overlap_test(nooverlap_target, (Address::ScaleFactor)(shift));

  BasicType type_vec[] = { T_BYTE, T_SHORT, T_INT, T_LONG };
  BasicType type = is_oop ? T_OBJECT : type_vec[shift];

  setup_argument_regs(type);

  DecoratorSet decorators = IN_HEAP | IS_ARRAY;
  if (dest_uninitialized) {
    decorators |= IS_DEST_UNINITIALIZED;
  }
  if (aligned) {
    decorators |= ARRAYCOPY_ALIGNED;
  }
  BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
  bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
  {
    // Type(shift)     byte(0), short(1), int(2), long(3)
    int loop_size[] = { 192,     96,       48,     24 };
    int threshold[] = { 4096,    2048,     1024,   512 };

    // UnsafeCopyMemory page error: continue after ucm
    UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
    // 'from', 'to' and 'count' are now valid

    // temp1 holds remaining count.
    __ movq(temp1, count);

    // Zero length check.
    __ BIND(L_tail);
    __ cmpq(temp1, 0);
    __ jcc(Assembler::lessEqual, L_exit);

    __ mov64(temp2, 0);
    __ movq(temp3, temp1);
    // Special cases using 32 byte [masked] vector copy operations.
    arraycopy_avx3_special_cases_conjoint(xmm1, k2, from, to, temp2, temp3, temp1, shift,
                                          temp4, use64byteVector, L_entry, L_exit);

    // PRE-MAIN-POST loop for aligned copy.
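    // (i.e. peel a partial copy so the destination end becomes vector-aligned, run the
    //  192-bytes-per-iteration main loop from high to low addresses, then finish with
    //  masked tail copies.)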
    __ BIND(L_entry);

    if ((MaxVectorSize > 32) && (avx3threshold != 0)) {
      __ cmpq(temp1, threshold[shift]);
      __ jcc(Assembler::greaterEqual, L_pre_main_post_64);
    }

    if ((MaxVectorSize < 64) || (avx3threshold != 0)) {
      // Partial copy to make dst address 32 byte aligned.
      __ leaq(temp2, Address(to, temp1, (Address::ScaleFactor)(shift), 0));
      __ andq(temp2, 31);
      __ jcc(Assembler::equal, L_main_pre_loop);

      if (shift) {
        __ shrq(temp2, shift);
      }
      __ subq(temp1, temp2);
      copy32_masked_avx(to, from, xmm1, k2, temp2, temp1, temp3, shift);

      __ cmpq(temp1, loop_size[shift]);
      __ jcc(Assembler::less, L_tail);

      __ BIND(L_main_pre_loop);

      // Main loop with aligned copy block size of 192 bytes at 32 byte granularity.
      __ align32();
      __ BIND(L_main_loop);
      copy64_avx(to, from, temp1, xmm1, true, shift, -64);
      copy64_avx(to, from, temp1, xmm1, true, shift, -128);
      copy64_avx(to, from, temp1, xmm1, true, shift, -192);
      __ subptr(temp1, loop_size[shift]);
      __ cmpq(temp1, loop_size[shift]);
      __ jcc(Assembler::greater, L_main_loop);

      // Tail loop.
      __ jmp(L_tail);
    }

    if (MaxVectorSize > 32) {
      __ BIND(L_pre_main_post_64);
      // Partial copy to make dst address 64 byte aligned.
      __ leaq(temp2, Address(to, temp1, (Address::ScaleFactor)(shift), 0));
      __ andq(temp2, 63);
      __ jcc(Assembler::equal, L_main_pre_loop_64bytes);

      if (shift) {
        __ shrq(temp2, shift);
      }
      __ subq(temp1, temp2);
      copy64_masked_avx(to, from, xmm1, k2, temp2, temp1, temp3, shift, 0, true);

      __ cmpq(temp1, loop_size[shift]);
      __ jcc(Assembler::less, L_tail64);

      __ BIND(L_main_pre_loop_64bytes);

      // Main loop with aligned copy block size of 192 bytes at
      // 64 byte copy granularity.
      __ align32();
      __ BIND(L_main_loop_64bytes);
      copy64_avx(to, from, temp1, xmm1, true, shift, -64, true);
      copy64_avx(to, from, temp1, xmm1, true, shift, -128, true);
      copy64_avx(to, from, temp1, xmm1, true, shift, -192, true);
      __ subq(temp1, loop_size[shift]);
      __ cmpq(temp1, loop_size[shift]);
      __ jcc(Assembler::greater, L_main_loop_64bytes);

      // Zero length check.
      __ cmpq(temp1, 0);
      __ jcc(Assembler::lessEqual, L_exit);

      __ BIND(L_tail64);

      // Tail handling using 64 byte [masked] vector copy operations.
      use64byteVector = true;
      __ mov64(temp2, 0);
      __ movq(temp3, temp1);
      arraycopy_avx3_special_cases_conjoint(xmm1, k2, from, to, temp2, temp3, temp1, shift,
                                            temp4, use64byteVector, L_entry, L_exit);
    }
    __ BIND(L_exit);
  }
  address ucme_exit_pc = __ pc();
  // When called from generic_arraycopy r11 contains specific values
  // used during arraycopy epilogue, re-initializing r11.
  if (is_oop) {
    __ movq(r11, count);
  }
  bs->arraycopy_epilogue(_masm, decorators, type, from, to, count);
  restore_argument_regs(type);
  INC_COUNTER_NP(get_profile_ctr(shift), rscratch1);  // Update counter after rscratch1 is free
  __ xorptr(rax, rax);  // return 0
  __ vzeroupper();
  __ leave();  // required for proper stackwalking of RuntimeStub frame
  __ ret(0);

  return start;
}

void StubGenerator::arraycopy_avx3_special_cases(XMMRegister xmm, KRegister mask, Register from,
                                                 Register to, Register count, int shift,
                                                 Register index, Register temp,
                                                 bool use64byteVector, Label& L_entry, Label& L_exit) {
  Label L_entry_64, L_entry_96, L_entry_128;
  Label L_entry_160, L_entry_192;

  int size_mat[][6] = {
    /* T_BYTE  */ { 32, 64, 96, 128, 160, 192 },
    /* T_SHORT */ { 16, 32, 48,  64,  80,  96 },
    /* T_INT   */ {  8, 16, 24,  32,  40,  48 },
    /* T_LONG  */ {  4,  8, 12,  16,  20,  24 }
  };

  // Case A) Special case for length less than or equal to 32 bytes.
  __ cmpq(count, size_mat[shift][0]);
  __ jccb(Assembler::greater, L_entry_64);
  copy32_masked_avx(to, from, xmm, mask, count, index, temp, shift);
  __ jmp(L_exit);

  // Case B) Special case for length less than or equal to 64 bytes.
  __ BIND(L_entry_64);
  __ cmpq(count, size_mat[shift][1]);
  __ jccb(Assembler::greater, L_entry_96);
  copy64_masked_avx(to, from, xmm, mask, count, index, temp, shift, 0, use64byteVector);
  __ jmp(L_exit);

  // Case C) Special case for length less than or equal to 96 bytes.
  __ BIND(L_entry_96);
  __ cmpq(count, size_mat[shift][2]);
  __ jccb(Assembler::greater, L_entry_128);
  copy64_avx(to, from, index, xmm, false, shift, 0, use64byteVector);
  __ subq(count, 64 >> shift);
  copy32_masked_avx(to, from, xmm, mask, count, index, temp, shift, 64);
  __ jmp(L_exit);

  // Case D) Special case for length less than or equal to 128 bytes.
  __ BIND(L_entry_128);
  __ cmpq(count, size_mat[shift][3]);
  __ jccb(Assembler::greater, L_entry_160);
  copy64_avx(to, from, index, xmm, false, shift, 0, use64byteVector);
  copy32_avx(to, from, index, xmm, shift, 64);
  __ subq(count, 96 >> shift);
  copy32_masked_avx(to, from, xmm, mask, count, index, temp, shift, 96);
  __ jmp(L_exit);

  // Case E) Special case for length less than or equal to 160 bytes.
  __ BIND(L_entry_160);
  __ cmpq(count, size_mat[shift][4]);
  __ jccb(Assembler::greater, L_entry_192);
  copy64_avx(to, from, index, xmm, false, shift, 0, use64byteVector);
  copy64_avx(to, from, index, xmm, false, shift, 64, use64byteVector);
  __ subq(count, 128 >> shift);
  copy32_masked_avx(to, from, xmm, mask, count, index, temp, shift, 128);
  __ jmp(L_exit);

  // Case F) Special case for length less than or equal to 192 bytes.
  __ BIND(L_entry_192);
  __ cmpq(count, size_mat[shift][5]);
  __ jcc(Assembler::greater, L_entry);
  copy64_avx(to, from, index, xmm, false, shift, 0, use64byteVector);
  copy64_avx(to, from, index, xmm, false, shift, 64, use64byteVector);
  copy32_avx(to, from, index, xmm, shift, 128);
  __ subq(count, 160 >> shift);
  copy32_masked_avx(to, from, xmm, mask, count, index, temp, shift, 160);
  __ jmp(L_exit);
}

void StubGenerator::arraycopy_avx3_special_cases_conjoint(XMMRegister xmm, KRegister mask, Register from,
                                                          Register to, Register start_index, Register end_index,
                                                          Register count, int shift, Register temp,
                                                          bool use64byteVector, Label& L_entry, Label& L_exit) {
  Label L_entry_64, L_entry_96, L_entry_128;
  Label L_entry_160, L_entry_192;
  bool avx3 = (MaxVectorSize > 32) && (VM_Version::avx3_threshold() == 0);

  int size_mat[][6] = {
    /* T_BYTE  */ { 32, 64, 96, 128, 160, 192 },
    /* T_SHORT */ { 16, 32, 48,  64,  80,  96 },
    /* T_INT   */ {  8, 16, 24,  32,  40,  48 },
    /* T_LONG  */ {  4,  8, 12,  16,  20,  24 }
  };

  // Case A) Special case for length less than or equal to 32 bytes.
  __ cmpq(count, size_mat[shift][0]);
  __ jccb(Assembler::greater, L_entry_64);
  copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
  __ jmp(L_exit);

  // Case B) Special case for length less than or equal to 64 bytes.
  __ BIND(L_entry_64);
  __ cmpq(count, size_mat[shift][1]);
  __ jccb(Assembler::greater, L_entry_96);
  if (avx3) {
    copy64_masked_avx(to, from, xmm, mask, count, start_index, temp, shift, 0, true);
  } else {
    copy32_avx(to, from, end_index, xmm, shift, -32);
    __ subq(count, 32 >> shift);
    copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
  }
  __ jmp(L_exit);

  // Case C) Special case for length less than or equal to 96 bytes.
  __ BIND(L_entry_96);
  __ cmpq(count, size_mat[shift][2]);
  __ jccb(Assembler::greater, L_entry_128);
  copy64_avx(to, from, end_index, xmm, true, shift, -64, use64byteVector);
  __ subq(count, 64 >> shift);
  copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
  __ jmp(L_exit);

  // Case D) Special case for length less than or equal to 128 bytes.
  __ BIND(L_entry_128);
  __ cmpq(count, size_mat[shift][3]);
  __ jccb(Assembler::greater, L_entry_160);
  copy64_avx(to, from, end_index, xmm, true, shift, -64, use64byteVector);
  copy32_avx(to, from, end_index, xmm, shift, -96);
  __ subq(count, 96 >> shift);
  copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
  __ jmp(L_exit);

  // Case E) Special case for length less than or equal to 160 bytes.
  __ BIND(L_entry_160);
  __ cmpq(count, size_mat[shift][4]);
  __ jccb(Assembler::greater, L_entry_192);
  copy64_avx(to, from, end_index, xmm, true, shift, -64, use64byteVector);
  copy64_avx(to, from, end_index, xmm, true, shift, -128, use64byteVector);
  __ subq(count, 128 >> shift);
  copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
  __ jmp(L_exit);

  // Case F) Special case for length less than or equal to 192 bytes.
  __ BIND(L_entry_192);
  __ cmpq(count, size_mat[shift][5]);
  __ jcc(Assembler::greater, L_entry);
  copy64_avx(to, from, end_index, xmm, true, shift, -64, use64byteVector);
  copy64_avx(to, from, end_index, xmm, true, shift, -128, use64byteVector);
  copy32_avx(to, from, end_index, xmm, shift, -160);
  __ subq(count, 160 >> shift);
  copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
  __ jmp(L_exit);
}

void StubGenerator::copy64_masked_avx(Register dst, Register src, XMMRegister xmm,
                                      KRegister mask, Register length, Register index,
                                      Register temp, int shift, int offset,
                                      bool use64byteVector) {
  BasicType type[] = { T_BYTE, T_SHORT, T_INT, T_LONG };
  assert(MaxVectorSize >= 32, "vector length should be >= 32");
  if (!use64byteVector) {
    copy32_avx(dst, src, index, xmm, shift, offset);
    __ subptr(length, 32 >> shift);
    copy32_masked_avx(dst, src, xmm, mask, length, index, temp, shift, offset + 32);
  } else {
    Address::ScaleFactor scale = (Address::ScaleFactor)(shift);
    assert(MaxVectorSize == 64, "vector length != 64");
    __ mov64(temp, -1L);
    __ bzhiq(temp, temp, length);
    __ kmovql(mask, temp);
    __ evmovdqu(type[shift], mask, xmm, Address(src, index, scale, offset), false, Assembler::AVX_512bit);
    __ evmovdqu(type[shift], mask, Address(dst, index, scale, offset), xmm, true, Assembler::AVX_512bit);
  }
}


void StubGenerator::copy32_masked_avx(Register dst, Register src, XMMRegister xmm,
                                      KRegister mask, Register length, Register index,
                                      Register temp, int shift, int offset) {
  assert(MaxVectorSize >= 32, "vector length should be >= 32");
  BasicType type[] = { T_BYTE, T_SHORT, T_INT, T_LONG };
  Address::ScaleFactor scale = (Address::ScaleFactor)(shift);
  __ mov64(temp, -1L);
  __ bzhiq(temp, temp, length);
  __ kmovql(mask, temp);
  __ evmovdqu(type[shift], mask, xmm, Address(src, index, scale, offset), false, Assembler::AVX_256bit);
  __ evmovdqu(type[shift], mask, Address(dst, index, scale, offset), xmm, true, Assembler::AVX_256bit);
}


void StubGenerator::copy32_avx(Register dst, Register src, Register index, XMMRegister xmm,
                               int shift, int offset) {
  assert(MaxVectorSize >= 32, "vector length should be >= 32");
  Address::ScaleFactor scale = (Address::ScaleFactor)(shift);
  __ vmovdqu(xmm, Address(src, index, scale, offset));
  __ vmovdqu(Address(dst, index, scale, offset), xmm);
}


void StubGenerator::copy64_avx(Register dst, Register src, Register index, XMMRegister xmm,
                               bool conjoint, int shift, int offset, bool use64byteVector) {
  assert(MaxVectorSize == 64 || MaxVectorSize == 32, "vector length mismatch");
  if (!use64byteVector) {
    if (conjoint) {
      copy32_avx(dst, src, index, xmm, shift, offset + 32);
      copy32_avx(dst, src, index, xmm, shift, offset);
    } else {
      copy32_avx(dst, src, index, xmm, shift, offset);
      copy32_avx(dst, src, index, xmm, shift, offset + 32);
    }
  } else {
    Address::ScaleFactor scale = (Address::ScaleFactor)(shift);
    __ evmovdquq(xmm, Address(src, index, scale, offset), Assembler::AVX_512bit);
    __ evmovdquq(Address(dst, index, scale, offset), xmm, Assembler::AVX_512bit);
  }
}

#endif // COMPILER2_OR_JVMCI


// Arguments:
//   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
//             ignored
//   name    - stub name string
//
// Inputs:
//   c_rarg0 - source array address
//   c_rarg1 - destination array address
//   c_rarg2 - element count, treated as ssize_t, can be zero
//
// If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
// we let the hardware handle it. The one to eight bytes within words,
// dwords or qwords that span cache line boundaries will still be loaded
// and stored atomically.
//
// Side Effects:
//   disjoint_byte_copy_entry is set to the no-overlap entry point
//   used by generate_conjoint_byte_copy().
//
address StubGenerator::generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
#if COMPILER2_OR_JVMCI
  if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) {
    return generate_disjoint_copy_avx3_masked(entry, "jbyte_disjoint_arraycopy_avx3", 0,
                                              aligned, false, false);
  }
#endif
  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, "StubRoutines", name);
  address start = __ pc();

  Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes;
  Label L_copy_byte, L_exit;
  const Register from        = rdi;   // source array address
  const Register to          = rsi;   // destination array address
  const Register count       = rdx;   // elements count
  const Register byte_count  = rcx;
  const Register qword_count = count;
  const Register end_from    = from;  // source array end address
  const Register end_to      = to;    // destination array end address
  // End pointers are inclusive, and if count is not zero they point
  // to the last unit copied: end_to[0] := end_from[0]

  __ enter();  // required for proper stackwalking of RuntimeStub frame
  assert_clean_int(c_rarg2, rax);  // Make sure 'count' is clean int.

  if (entry != NULL) {
    *entry = __ pc();
    // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
    BLOCK_COMMENT("Entry:");
  }

  setup_arg_regs();  // from => rdi, to => rsi, count => rdx
                     // r9 and r10 may be used to save non-volatile registers

  {
    // UnsafeCopyMemory page error: continue after ucm
    UnsafeCopyMemoryMark ucmm(this, !aligned, true);
    // 'from', 'to' and 'count' are now valid
    __ movptr(byte_count, count);
    __ shrptr(count, 3);  // count => qword_count

    // Copy from low to high addresses. Use 'to' as scratch.
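    // (end_from/end_to are set to the address of the last qword and qword_count is
    //  negated, so the copy loop can address Address(end_x, qword_count, times_8)
    //  and simply count back up toward zero.)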
    __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
    __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
    __ negptr(qword_count);  // make the count negative
    __ jmp(L_copy_bytes);

    // Copy trailing qwords
    __ BIND(L_copy_8_bytes);
    __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
    __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
    __ increment(qword_count);
    __ jcc(Assembler::notZero, L_copy_8_bytes);

    // Check for and copy trailing dword
    __ BIND(L_copy_4_bytes);
    __ testl(byte_count, 4);
    __ jccb(Assembler::zero, L_copy_2_bytes);
    __ movl(rax, Address(end_from, 8));
    __ movl(Address(end_to, 8), rax);

    __ addptr(end_from, 4);
    __ addptr(end_to, 4);

    // Check for and copy trailing word
    __ BIND(L_copy_2_bytes);
    __ testl(byte_count, 2);
    __ jccb(Assembler::zero, L_copy_byte);
    __ movw(rax, Address(end_from, 8));
    __ movw(Address(end_to, 8), rax);

    __ addptr(end_from, 2);
    __ addptr(end_to, 2);

    // Check for and copy trailing byte
    __ BIND(L_copy_byte);
    __ testl(byte_count, 1);
    __ jccb(Assembler::zero, L_exit);
    __ movb(rax, Address(end_from, 8));
    __ movb(Address(end_to, 8), rax);
  }
  __ BIND(L_exit);
  address ucme_exit_pc = __ pc();
  restore_arg_regs();
  INC_COUNTER_NP(SharedRuntime::_jbyte_array_copy_ctr, rscratch1);  // Update counter after rscratch1 is free
  __ xorptr(rax, rax);  // return 0
  __ vzeroupper();
  __ leave();  // required for proper stackwalking of RuntimeStub frame
  __ ret(0);

  {
    UnsafeCopyMemoryMark ucmm(this, !aligned, false, ucme_exit_pc);
    // Copy in multi-byte chunks
    copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
    __ jmp(L_copy_4_bytes);
  }
  return start;
}


// Arguments:
//   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
//             ignored
//   name    - stub name string
//
// Inputs:
//   c_rarg0 - source array address
//   c_rarg1 - destination array address
//   c_rarg2 - element count, treated as ssize_t, can be zero
//
// If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
// we let the hardware handle it. The one to eight bytes within words,
// dwords or qwords that span cache line boundaries will still be loaded
// and stored atomically.
//
address StubGenerator::generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
                                                   address* entry, const char *name) {
#if COMPILER2_OR_JVMCI
  if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) {
    return generate_conjoint_copy_avx3_masked(entry, "jbyte_conjoint_arraycopy_avx3", 0,
                                              nooverlap_target, aligned, false, false);
  }
#endif
  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, "StubRoutines", name);
  address start = __ pc();

  Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes;
  const Register from        = rdi;  // source array address
  const Register to          = rsi;  // destination array address
  const Register count       = rdx;  // elements count
  const Register byte_count  = rcx;
  const Register qword_count = count;

  __ enter();  // required for proper stackwalking of RuntimeStub frame
  assert_clean_int(c_rarg2, rax);  // Make sure 'count' is clean int.

  if (entry != NULL) {
    *entry = __ pc();
    // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
    BLOCK_COMMENT("Entry:");
  }

  array_overlap_test(nooverlap_target, Address::times_1);
  setup_arg_regs();  // from => rdi, to => rsi, count => rdx
                     // r9 and r10 may be used to save non-volatile registers

  {
    // UnsafeCopyMemory page error: continue after ucm
    UnsafeCopyMemoryMark ucmm(this, !aligned, true);
    // 'from', 'to' and 'count' are now valid
    __ movptr(byte_count, count);
    __ shrptr(count, 3);  // count => qword_count

    // Copy from high to low addresses.

    // Check for and copy trailing byte
    __ testl(byte_count, 1);
    __ jcc(Assembler::zero, L_copy_2_bytes);
    __ movb(rax, Address(from, byte_count, Address::times_1, -1));
    __ movb(Address(to, byte_count, Address::times_1, -1), rax);
    __ decrement(byte_count);  // Adjust for possible trailing word

    // Check for and copy trailing word
    __ BIND(L_copy_2_bytes);
    __ testl(byte_count, 2);
    __ jcc(Assembler::zero, L_copy_4_bytes);
    __ movw(rax, Address(from, byte_count, Address::times_1, -2));
    __ movw(Address(to, byte_count, Address::times_1, -2), rax);

    // Check for and copy trailing dword
    __ BIND(L_copy_4_bytes);
    __ testl(byte_count, 4);
    __ jcc(Assembler::zero, L_copy_bytes);
    __ movl(rax, Address(from, qword_count, Address::times_8));
    __ movl(Address(to, qword_count, Address::times_8), rax);
    __ jmp(L_copy_bytes);

    // Copy trailing qwords
    __ BIND(L_copy_8_bytes);
    __ movq(rax, Address(from, qword_count, Address::times_8, -8));
    __ movq(Address(to, qword_count, Address::times_8, -8), rax);
    __ decrement(qword_count);
    __ jcc(Assembler::notZero, L_copy_8_bytes);
  }
  restore_arg_regs();
  INC_COUNTER_NP(SharedRuntime::_jbyte_array_copy_ctr, rscratch1);  // Update counter after rscratch1 is free
  __ xorptr(rax, rax);  // return 0
  __ vzeroupper();
  __ leave();  // required for proper stackwalking of RuntimeStub frame
  __ ret(0);

  {
    // UnsafeCopyMemory page error: continue after ucm
    UnsafeCopyMemoryMark ucmm(this, !aligned, true);
    // Copy in multi-byte chunks
    copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
  }
  restore_arg_regs();
  INC_COUNTER_NP(SharedRuntime::_jbyte_array_copy_ctr, rscratch1);  // Update counter after rscratch1 is free
  __ xorptr(rax, rax);  // return 0
  __ vzeroupper();
  __ leave();  // required for proper stackwalking of RuntimeStub frame
  __ ret(0);

  return start;
}


// Arguments:
//   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
//             ignored
//   name    - stub name string
//
// Inputs:
//   c_rarg0 - source array address
//   c_rarg1 - destination array address
//   c_rarg2 - element count, treated as ssize_t, can be zero
//
// If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
// let the hardware handle it. The two or four words within dwords
// or qwords that span cache line boundaries will still be loaded
// and stored atomically.
//
// Side Effects:
//   disjoint_short_copy_entry is set to the no-overlap entry point
//   used by generate_conjoint_short_copy().
//
address StubGenerator::generate_disjoint_short_copy(bool aligned, address *entry, const char *name) {
#if COMPILER2_OR_JVMCI
  if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) {
    return generate_disjoint_copy_avx3_masked(entry, "jshort_disjoint_arraycopy_avx3", 1,
                                              aligned, false, false);
  }
#endif

  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, "StubRoutines", name);
  address start = __ pc();

  Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes, L_exit;
  const Register from        = rdi;   // source array address
  const Register to          = rsi;   // destination array address
  const Register count       = rdx;   // elements count
  const Register word_count  = rcx;
  const Register qword_count = count;
  const Register end_from    = from;  // source array end address
  const Register end_to      = to;    // destination array end address
  // End pointers are inclusive, and if count is not zero they point
  // to the last unit copied: end_to[0] := end_from[0]

  __ enter();  // required for proper stackwalking of RuntimeStub frame
  assert_clean_int(c_rarg2, rax);  // Make sure 'count' is clean int.

  if (entry != NULL) {
    *entry = __ pc();
    // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
    BLOCK_COMMENT("Entry:");
  }

  setup_arg_regs();  // from => rdi, to => rsi, count => rdx
                     // r9 and r10 may be used to save non-volatile registers

  {
    // UnsafeCopyMemory page error: continue after ucm
    UnsafeCopyMemoryMark ucmm(this, !aligned, true);
    // 'from', 'to' and 'count' are now valid
    __ movptr(word_count, count);
    __ shrptr(count, 2);  // count => qword_count

    // Copy from low to high addresses. Use 'to' as scratch.
    __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
    __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
    __ negptr(qword_count);
    __ jmp(L_copy_bytes);

    // Copy trailing qwords
    __ BIND(L_copy_8_bytes);
    __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
    __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
    __ increment(qword_count);
    __ jcc(Assembler::notZero, L_copy_8_bytes);

    // Original 'dest' is trashed, so we can't use it as a
    // base register for a possible trailing word copy

    // Check for and copy trailing dword
    __ BIND(L_copy_4_bytes);
    __ testl(word_count, 2);
    __ jccb(Assembler::zero, L_copy_2_bytes);
    __ movl(rax, Address(end_from, 8));
    __ movl(Address(end_to, 8), rax);

    __ addptr(end_from, 4);
    __ addptr(end_to, 4);

    // Check for and copy trailing word
    __ BIND(L_copy_2_bytes);
    __ testl(word_count, 1);
    __ jccb(Assembler::zero, L_exit);
    __ movw(rax, Address(end_from, 8));
    __ movw(Address(end_to, 8), rax);
  }
  __ BIND(L_exit);
  address ucme_exit_pc = __ pc();
  restore_arg_regs();
  INC_COUNTER_NP(SharedRuntime::_jshort_array_copy_ctr, rscratch1);  // Update counter after rscratch1 is free
  __ xorptr(rax, rax);  // return 0
  __ vzeroupper();
  __ leave();  // required for proper stackwalking of RuntimeStub frame
  __ ret(0);

  {
    UnsafeCopyMemoryMark ucmm(this, !aligned, false, ucme_exit_pc);
    // Copy in multi-byte chunks
    copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
    __ jmp(L_copy_4_bytes);
  }

  return start;
}


address StubGenerator::generate_fill(BasicType t, bool aligned, const char *name) {
  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, "StubRoutines", name);
  address start = __ pc();

  BLOCK_COMMENT("Entry:");

  const Register to    = c_rarg0;  // destination array address
  const Register value = c_rarg1;  // value
  const Register count = c_rarg2;  // elements count
  __ mov(r11, count);

  __ enter();  // required for proper stackwalking of RuntimeStub frame

  __ generate_fill(t, aligned, to, value, r11, rax, xmm0);

  __ vzeroupper();
  __ leave();  // required for proper stackwalking of RuntimeStub frame
  __ ret(0);

  return start;
}


// Arguments:
//   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
//             ignored
//   name    - stub name string
//
// Inputs:
//   c_rarg0 - source array address
//   c_rarg1 - destination array address
//   c_rarg2 - element count, treated as ssize_t, can be zero
//
// If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
// let the hardware handle it. The two or four words within dwords
// or qwords that span cache line boundaries will still be loaded
// and stored atomically.
//
address StubGenerator::generate_conjoint_short_copy(bool aligned, address nooverlap_target,
                                                    address *entry, const char *name) {
#if COMPILER2_OR_JVMCI
  if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) {
    return generate_conjoint_copy_avx3_masked(entry, "jshort_conjoint_arraycopy_avx3", 1,
                                              nooverlap_target, aligned, false, false);
  }
#endif
  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, "StubRoutines", name);
  address start = __ pc();

  Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes;
  const Register from        = rdi;  // source array address
  const Register to          = rsi;  // destination array address
  const Register count       = rdx;  // elements count
  const Register word_count  = rcx;
  const Register qword_count = count;

  __ enter();  // required for proper stackwalking of RuntimeStub frame
  assert_clean_int(c_rarg2, rax);  // Make sure 'count' is clean int.

  if (entry != NULL) {
    *entry = __ pc();
    // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
    BLOCK_COMMENT("Entry:");
  }

  array_overlap_test(nooverlap_target, Address::times_2);
  setup_arg_regs();  // from => rdi, to => rsi, count => rdx
                     // r9 and r10 may be used to save non-volatile registers

  {
    // UnsafeCopyMemory page error: continue after ucm
    UnsafeCopyMemoryMark ucmm(this, !aligned, true);
    // 'from', 'to' and 'count' are now valid
    __ movptr(word_count, count);
    __ shrptr(count, 2);  // count => qword_count

    // Copy from high to low addresses. Use 'to' as scratch.

    // Check for and copy trailing word
    __ testl(word_count, 1);
    __ jccb(Assembler::zero, L_copy_4_bytes);
    __ movw(rax, Address(from, word_count, Address::times_2, -2));
    __ movw(Address(to, word_count, Address::times_2, -2), rax);

    // Check for and copy trailing dword
    __ BIND(L_copy_4_bytes);
    __ testl(word_count, 2);
    __ jcc(Assembler::zero, L_copy_bytes);
    __ movl(rax, Address(from, qword_count, Address::times_8));
    __ movl(Address(to, qword_count, Address::times_8), rax);
    __ jmp(L_copy_bytes);

    // Copy trailing qwords
    __ BIND(L_copy_8_bytes);
    __ movq(rax, Address(from, qword_count, Address::times_8, -8));
    __ movq(Address(to, qword_count, Address::times_8, -8), rax);
    __ decrement(qword_count);
    __ jcc(Assembler::notZero, L_copy_8_bytes);
  }
  restore_arg_regs();
  INC_COUNTER_NP(SharedRuntime::_jshort_array_copy_ctr, rscratch1);  // Update counter after rscratch1 is free
  __ xorptr(rax, rax);  // return 0
  __ vzeroupper();
  __ leave();  // required for proper stackwalking of RuntimeStub frame
  __ ret(0);

  {
    // UnsafeCopyMemory page error: continue after ucm
    UnsafeCopyMemoryMark ucmm(this, !aligned, true);
    // Copy in multi-byte chunks
    copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
  }
  restore_arg_regs();
  INC_COUNTER_NP(SharedRuntime::_jshort_array_copy_ctr, rscratch1);  // Update counter after rscratch1 is free
  __ xorptr(rax, rax);  // return 0
  __ vzeroupper();
  __ leave();  // required for proper stackwalking of RuntimeStub frame
  __ ret(0);

  return start;
}


// Arguments:
//   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
//             ignored
//   is_oop  - true => oop array, so generate store check code
so generate store check code 1470 // name - stub name string 1471 // 1472 // Inputs: 1473 // c_rarg0 - source array address 1474 // c_rarg1 - destination array address 1475 // c_rarg2 - element count, treated as ssize_t, can be zero 1476 // 1477 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1478 // the hardware handle it. The two dwords within qwords that span 1479 // cache line boundaries will still be loaded and stored atomically. 1480 // 1481 // Side Effects: 1482 // disjoint_int_copy_entry is set to the no-overlap entry point 1483 // used by generate_conjoint_int_oop_copy(). 1484 // 1485 address StubGenerator::generate_disjoint_int_oop_copy(bool aligned, bool is_oop, address* entry, 1486 const char *name, bool dest_uninitialized) { 1487 #if COMPILER2_OR_JVMCI 1488 if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) { 1489 return generate_disjoint_copy_avx3_masked(entry, "jint_disjoint_arraycopy_avx3", 2, 1490 aligned, is_oop, dest_uninitialized); 1491 } 1492 #endif 1493 1494 __ align(CodeEntryAlignment); 1495 StubCodeMark mark(this, "StubRoutines", name); 1496 address start = __ pc(); 1497 1498 Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_exit; 1499 const Register from = rdi; // source array address 1500 const Register to = rsi; // destination array address 1501 const Register count = rdx; // elements count 1502 const Register dword_count = rcx; 1503 const Register qword_count = count; 1504 const Register end_from = from; // source array end address 1505 const Register end_to = to; // destination array end address 1506 // End pointers are inclusive, and if count is not zero they point 1507 // to the last unit copied: end_to[0] := end_from[0] 1508 1509 __ enter(); // required for proper stackwalking of RuntimeStub frame 1510 assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int. 1511 1512 if (entry != NULL) { 1513 *entry = __ pc(); 1514 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 1515 BLOCK_COMMENT("Entry:"); 1516 } 1517 1518 setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx 1519 // r9 is used to save r15_thread 1520 1521 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT; 1522 if (dest_uninitialized) { 1523 decorators |= IS_DEST_UNINITIALIZED; 1524 } 1525 if (aligned) { 1526 decorators |= ARRAYCOPY_ALIGNED; 1527 } 1528 1529 BasicType type = is_oop ? T_OBJECT : T_INT; 1530 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1531 bs->arraycopy_prologue(_masm, decorators, type, from, to, count); 1532 1533 { 1534 // UnsafeCopyMemory page error: continue after ucm 1535 UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true); 1536 // 'from', 'to' and 'count' are now valid 1537 __ movptr(dword_count, count); 1538 __ shrptr(count, 1); // count => qword_count 1539 1540 // Copy from low to high addresses. Use 'to' as scratch. 
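    // (Illustrative sketch, not generated code: the net effect of the forward
    //  copy below, ignoring the bulk copy_bytes_forward path that handles the
    //  middle of the array.  'end_from'/'end_to' point at the last qword, and
    //  the loop counts a negative index up toward zero so that a single
    //  increment-and-test drives it:)
    //
    //    qword_count = dword_count >> 1;
    //    end_from = from + 8*qword_count - 8;      // last qword of source
    //    end_to   = to   + 8*qword_count - 8;      // last qword of destination
    //    for (ptrdiff_t i = -qword_count; i != 0; i++)
    //      *(jlong*)(end_to + 8*i + 8) = *(jlong*)(end_from + 8*i + 8);
    //    if (dword_count & 1)                      // odd trailing 4-byte element
    //      *(jint*)(end_to + 8) = *(jint*)(end_from + 8);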
1541 __ lea(end_from, Address(from, qword_count, Address::times_8, -8)); 1542 __ lea(end_to, Address(to, qword_count, Address::times_8, -8)); 1543 __ negptr(qword_count); 1544 __ jmp(L_copy_bytes); 1545 1546 // Copy trailing qwords 1547 __ BIND(L_copy_8_bytes); 1548 __ movq(rax, Address(end_from, qword_count, Address::times_8, 8)); 1549 __ movq(Address(end_to, qword_count, Address::times_8, 8), rax); 1550 __ increment(qword_count); 1551 __ jcc(Assembler::notZero, L_copy_8_bytes); 1552 1553 // Check for and copy trailing dword 1554 __ BIND(L_copy_4_bytes); 1555 __ testl(dword_count, 1); // Only byte test since the value is 0 or 1 1556 __ jccb(Assembler::zero, L_exit); 1557 __ movl(rax, Address(end_from, 8)); 1558 __ movl(Address(end_to, 8), rax); 1559 } 1560 __ BIND(L_exit); 1561 address ucme_exit_pc = __ pc(); 1562 bs->arraycopy_epilogue(_masm, decorators, type, from, to, dword_count); 1563 restore_arg_regs_using_thread(); 1564 INC_COUNTER_NP(SharedRuntime::_jint_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free 1565 __ vzeroupper(); 1566 __ xorptr(rax, rax); // return 0 1567 __ leave(); // required for proper stackwalking of RuntimeStub frame 1568 __ ret(0); 1569 1570 { 1571 UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, false, ucme_exit_pc); 1572 // Copy in multi-bytes chunks 1573 copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes); 1574 __ jmp(L_copy_4_bytes); 1575 } 1576 1577 return start; 1578 } 1579 1580 1581 // Arguments: 1582 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1583 // ignored 1584 // is_oop - true => oop array, so generate store check code 1585 // name - stub name string 1586 // 1587 // Inputs: 1588 // c_rarg0 - source array address 1589 // c_rarg1 - destination array address 1590 // c_rarg2 - element count, treated as ssize_t, can be zero 1591 // 1592 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1593 // the hardware handle it. The two dwords within qwords that span 1594 // cache line boundaries will still be loaded and stored atomically. 1595 // 1596 address StubGenerator::generate_conjoint_int_oop_copy(bool aligned, bool is_oop, address nooverlap_target, 1597 address *entry, const char *name, 1598 bool dest_uninitialized) { 1599 #if COMPILER2_OR_JVMCI 1600 if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) { 1601 return generate_conjoint_copy_avx3_masked(entry, "jint_conjoint_arraycopy_avx3", 2, 1602 nooverlap_target, aligned, is_oop, dest_uninitialized); 1603 } 1604 #endif 1605 __ align(CodeEntryAlignment); 1606 StubCodeMark mark(this, "StubRoutines", name); 1607 address start = __ pc(); 1608 1609 Label L_copy_bytes, L_copy_8_bytes, L_exit; 1610 const Register from = rdi; // source array address 1611 const Register to = rsi; // destination array address 1612 const Register count = rdx; // elements count 1613 const Register dword_count = rcx; 1614 const Register qword_count = count; 1615 1616 __ enter(); // required for proper stackwalking of RuntimeStub frame 1617 assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int. 
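    // Note on the conjoint ("backward") variant generated here: the source and
    // destination ranges are allowed to overlap.  The array_overlap_test below
    // tail-jumps to the disjoint stub at 'nooverlap_target' whenever a forward
    // copy is safe; only the genuinely overlapping case (roughly:
    // from < to && to < from + count*4) falls through, and that case is copied
    // from high addresses down to low so that destination stores cannot
    // clobber source elements that have not been read yet.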
1618 1619 if (entry != NULL) { 1620 *entry = __ pc(); 1621 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 1622 BLOCK_COMMENT("Entry:"); 1623 } 1624 1625 array_overlap_test(nooverlap_target, Address::times_4); 1626 setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx 1627 // r9 is used to save r15_thread 1628 1629 DecoratorSet decorators = IN_HEAP | IS_ARRAY; 1630 if (dest_uninitialized) { 1631 decorators |= IS_DEST_UNINITIALIZED; 1632 } 1633 if (aligned) { 1634 decorators |= ARRAYCOPY_ALIGNED; 1635 } 1636 1637 BasicType type = is_oop ? T_OBJECT : T_INT; 1638 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1639 // no registers are destroyed by this call 1640 bs->arraycopy_prologue(_masm, decorators, type, from, to, count); 1641 1642 assert_clean_int(count, rax); // Make sure 'count' is clean int. 1643 { 1644 // UnsafeCopyMemory page error: continue after ucm 1645 UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true); 1646 // 'from', 'to' and 'count' are now valid 1647 __ movptr(dword_count, count); 1648 __ shrptr(count, 1); // count => qword_count 1649 1650 // Copy from high to low addresses. Use 'to' as scratch. 1651 1652 // Check for and copy trailing dword 1653 __ testl(dword_count, 1); 1654 __ jcc(Assembler::zero, L_copy_bytes); 1655 __ movl(rax, Address(from, dword_count, Address::times_4, -4)); 1656 __ movl(Address(to, dword_count, Address::times_4, -4), rax); 1657 __ jmp(L_copy_bytes); 1658 1659 // Copy trailing qwords 1660 __ BIND(L_copy_8_bytes); 1661 __ movq(rax, Address(from, qword_count, Address::times_8, -8)); 1662 __ movq(Address(to, qword_count, Address::times_8, -8), rax); 1663 __ decrement(qword_count); 1664 __ jcc(Assembler::notZero, L_copy_8_bytes); 1665 } 1666 if (is_oop) { 1667 __ jmp(L_exit); 1668 } 1669 restore_arg_regs_using_thread(); 1670 INC_COUNTER_NP(SharedRuntime::_jint_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free 1671 __ xorptr(rax, rax); // return 0 1672 __ vzeroupper(); 1673 __ leave(); // required for proper stackwalking of RuntimeStub frame 1674 __ ret(0); 1675 1676 { 1677 // UnsafeCopyMemory page error: continue after ucm 1678 UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true); 1679 // Copy in multi-bytes chunks 1680 copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes); 1681 } 1682 1683 __ BIND(L_exit); 1684 bs->arraycopy_epilogue(_masm, decorators, type, from, to, dword_count); 1685 restore_arg_regs_using_thread(); 1686 INC_COUNTER_NP(SharedRuntime::_jint_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free 1687 __ xorptr(rax, rax); // return 0 1688 __ vzeroupper(); 1689 __ leave(); // required for proper stackwalking of RuntimeStub frame 1690 __ ret(0); 1691 1692 return start; 1693 } 1694 1695 1696 // Arguments: 1697 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1698 // ignored 1699 // is_oop - true => oop array, so generate store check code 1700 // name - stub name string 1701 // 1702 // Inputs: 1703 // c_rarg0 - source array address 1704 // c_rarg1 - destination array address 1705 // c_rarg2 - element count, treated as ssize_t, can be zero 1706 // 1707 // Side Effects: 1708 // disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the 1709 // no-overlap entry point used by generate_conjoint_long_oop_copy(). 
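// Viewed from its caller, the code emitted below behaves roughly like a C
// function with this signature -- an illustrative declaration only, not
// something defined in this file:
//
//    void jlong_disjoint_arraycopy(const jlong* from, jlong* to, ssize_t count);
//
// with the three arguments arriving in c_rarg0..c_rarg2 as listed above.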
1710 // 1711 address StubGenerator::generate_disjoint_long_oop_copy(bool aligned, bool is_oop, address *entry, 1712 const char *name, bool dest_uninitialized) { 1713 #if COMPILER2_OR_JVMCI 1714 if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) { 1715 return generate_disjoint_copy_avx3_masked(entry, "jlong_disjoint_arraycopy_avx3", 3, 1716 aligned, is_oop, dest_uninitialized); 1717 } 1718 #endif 1719 __ align(CodeEntryAlignment); 1720 StubCodeMark mark(this, "StubRoutines", name); 1721 address start = __ pc(); 1722 1723 Label L_copy_bytes, L_copy_8_bytes, L_exit; 1724 const Register from = rdi; // source array address 1725 const Register to = rsi; // destination array address 1726 const Register qword_count = rdx; // elements count 1727 const Register end_from = from; // source array end address 1728 const Register end_to = rcx; // destination array end address 1729 const Register saved_count = r11; 1730 // End pointers are inclusive, and if count is not zero they point 1731 // to the last unit copied: end_to[0] := end_from[0] 1732 1733 __ enter(); // required for proper stackwalking of RuntimeStub frame 1734 // Save no-overlap entry point for generate_conjoint_long_oop_copy() 1735 assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int. 1736 1737 if (entry != NULL) { 1738 *entry = __ pc(); 1739 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 1740 BLOCK_COMMENT("Entry:"); 1741 } 1742 1743 setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx 1744 // r9 is used to save r15_thread 1745 // 'from', 'to' and 'qword_count' are now valid 1746 1747 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT; 1748 if (dest_uninitialized) { 1749 decorators |= IS_DEST_UNINITIALIZED; 1750 } 1751 if (aligned) { 1752 decorators |= ARRAYCOPY_ALIGNED; 1753 } 1754 1755 BasicType type = is_oop ? T_OBJECT : T_LONG; 1756 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1757 bs->arraycopy_prologue(_masm, decorators, type, from, to, qword_count); 1758 { 1759 // UnsafeCopyMemory page error: continue after ucm 1760 UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true); 1761 1762 // Copy from low to high addresses. Use 'to' as scratch. 
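    // (Same end-pointer / negative-index scheme as the int/oop copy above --
    //  see the sketch there -- but every element is a full 8-byte qword, so
    //  there is no trailing-dword case.  Note that when compressed oops are in
    //  use, oop arrays are normally routed to the 4-byte variant instead, so
    //  is_oop here implies uncompressed 8-byte oops.)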
1763 __ lea(end_from, Address(from, qword_count, Address::times_8, -8)); 1764 __ lea(end_to, Address(to, qword_count, Address::times_8, -8)); 1765 __ negptr(qword_count); 1766 __ jmp(L_copy_bytes); 1767 1768 // Copy trailing qwords 1769 __ BIND(L_copy_8_bytes); 1770 __ movq(rax, Address(end_from, qword_count, Address::times_8, 8)); 1771 __ movq(Address(end_to, qword_count, Address::times_8, 8), rax); 1772 __ increment(qword_count); 1773 __ jcc(Assembler::notZero, L_copy_8_bytes); 1774 } 1775 if (is_oop) { 1776 __ jmp(L_exit); 1777 } else { 1778 restore_arg_regs_using_thread(); 1779 INC_COUNTER_NP(SharedRuntime::_jlong_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free 1780 __ xorptr(rax, rax); // return 0 1781 __ vzeroupper(); 1782 __ leave(); // required for proper stackwalking of RuntimeStub frame 1783 __ ret(0); 1784 } 1785 1786 { 1787 // UnsafeCopyMemory page error: continue after ucm 1788 UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true); 1789 // Copy in multi-bytes chunks 1790 copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes); 1791 } 1792 1793 __ BIND(L_exit); 1794 bs->arraycopy_epilogue(_masm, decorators, type, from, to, qword_count); 1795 restore_arg_regs_using_thread(); 1796 INC_COUNTER_NP(is_oop ? SharedRuntime::_oop_array_copy_ctr : 1797 SharedRuntime::_jlong_array_copy_ctr, 1798 rscratch1); // Update counter after rscratch1 is free 1799 __ vzeroupper(); 1800 __ xorptr(rax, rax); // return 0 1801 __ leave(); // required for proper stackwalking of RuntimeStub frame 1802 __ ret(0); 1803 1804 return start; 1805 } 1806 1807 1808 // Arguments: 1809 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1810 // ignored 1811 // is_oop - true => oop array, so generate store check code 1812 // name - stub name string 1813 // 1814 // Inputs: 1815 // c_rarg0 - source array address 1816 // c_rarg1 - destination array address 1817 // c_rarg2 - element count, treated as ssize_t, can be zero 1818 // 1819 address StubGenerator::generate_conjoint_long_oop_copy(bool aligned, bool is_oop, address nooverlap_target, 1820 address *entry, const char *name, 1821 bool dest_uninitialized) { 1822 #if COMPILER2_OR_JVMCI 1823 if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) { 1824 return generate_conjoint_copy_avx3_masked(entry, "jlong_conjoint_arraycopy_avx3", 3, 1825 nooverlap_target, aligned, is_oop, dest_uninitialized); 1826 } 1827 #endif 1828 __ align(CodeEntryAlignment); 1829 StubCodeMark mark(this, "StubRoutines", name); 1830 address start = __ pc(); 1831 1832 Label L_copy_bytes, L_copy_8_bytes, L_exit; 1833 const Register from = rdi; // source array address 1834 const Register to = rsi; // destination array address 1835 const Register qword_count = rdx; // elements count 1836 const Register saved_count = rcx; 1837 1838 __ enter(); // required for proper stackwalking of RuntimeStub frame 1839 assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int. 
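  // About the UnsafeCopyMemoryMark blocks used throughout these stubs (a brief
  // summary only; the UnsafeCopyMemory machinery in the shared runtime is the
  // authoritative reference): when the copy may originate from
  // Unsafe.copyMemory (the '!is_oop && !aligned' case here), the mark records
  // the PC range of the raw copy loop so that a page fault raised inside that
  // range is not fatal; the fault handler resumes at the range's designated
  // exit and the error is surfaced to Java instead of crashing the VM.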
1840 1841 if (entry != NULL) { 1842 *entry = __ pc(); 1843 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 1844 BLOCK_COMMENT("Entry:"); 1845 } 1846 1847 array_overlap_test(nooverlap_target, Address::times_8); 1848 setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx 1849 // r9 is used to save r15_thread 1850 // 'from', 'to' and 'qword_count' are now valid 1851 1852 DecoratorSet decorators = IN_HEAP | IS_ARRAY; 1853 if (dest_uninitialized) { 1854 decorators |= IS_DEST_UNINITIALIZED; 1855 } 1856 if (aligned) { 1857 decorators |= ARRAYCOPY_ALIGNED; 1858 } 1859 1860 BasicType type = is_oop ? T_OBJECT : T_LONG; 1861 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1862 bs->arraycopy_prologue(_masm, decorators, type, from, to, qword_count); 1863 { 1864 // UnsafeCopyMemory page error: continue after ucm 1865 UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true); 1866 1867 __ jmp(L_copy_bytes); 1868 1869 // Copy trailing qwords 1870 __ BIND(L_copy_8_bytes); 1871 __ movq(rax, Address(from, qword_count, Address::times_8, -8)); 1872 __ movq(Address(to, qword_count, Address::times_8, -8), rax); 1873 __ decrement(qword_count); 1874 __ jcc(Assembler::notZero, L_copy_8_bytes); 1875 } 1876 if (is_oop) { 1877 __ jmp(L_exit); 1878 } else { 1879 restore_arg_regs_using_thread(); 1880 INC_COUNTER_NP(SharedRuntime::_jlong_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free 1881 __ xorptr(rax, rax); // return 0 1882 __ vzeroupper(); 1883 __ leave(); // required for proper stackwalking of RuntimeStub frame 1884 __ ret(0); 1885 } 1886 { 1887 // UnsafeCopyMemory page error: continue after ucm 1888 UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true); 1889 1890 // Copy in multi-bytes chunks 1891 copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes); 1892 } 1893 __ BIND(L_exit); 1894 bs->arraycopy_epilogue(_masm, decorators, type, from, to, qword_count); 1895 restore_arg_regs_using_thread(); 1896 INC_COUNTER_NP(is_oop ? SharedRuntime::_oop_array_copy_ctr : 1897 SharedRuntime::_jlong_array_copy_ctr, 1898 rscratch1); // Update counter after rscratch1 is free 1899 __ vzeroupper(); 1900 __ xorptr(rax, rax); // return 0 1901 __ leave(); // required for proper stackwalking of RuntimeStub frame 1902 __ ret(0); 1903 1904 return start; 1905 } 1906 1907 1908 // Helper for generating a dynamic type check. 1909 // Smashes no registers. 1910 void StubGenerator::generate_type_check(Register sub_klass, 1911 Register super_check_offset, 1912 Register super_klass, 1913 Label& L_success) { 1914 assert_different_registers(sub_klass, super_check_offset, super_klass); 1915 1916 BLOCK_COMMENT("type_check:"); 1917 1918 Label L_miss; 1919 1920 __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, NULL, 1921 super_check_offset); 1922 __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL); 1923 1924 // Fall through on failure! 
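  // (For reference -- an approximate rendering of what the two helpers above
  //  emit, not a literal transcription:)
  //
  //    if (sub_klass == super_klass) goto L_success;           // trivial hit
  //    if (*(sub_klass + super_check_offset) == super_klass)   // cached/primary
  //      goto L_success;                                       //   supertype hit
  //    // slow path: linear scan of sub_klass's secondary-supers array,
  //    // jumping to L_success on a match and falling through to L_miss otherwise.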
1925 __ BIND(L_miss); 1926 } 1927 1928 // 1929 // Generate checkcasting array copy stub 1930 // 1931 // Input: 1932 // c_rarg0 - source array address 1933 // c_rarg1 - destination array address 1934 // c_rarg2 - element count, treated as ssize_t, can be zero 1935 // c_rarg3 - size_t ckoff (super_check_offset) 1936 // not Win64 1937 // c_rarg4 - oop ckval (super_klass) 1938 // Win64 1939 // rsp+40 - oop ckval (super_klass) 1940 // 1941 // Output: 1942 // rax == 0 - success 1943 // rax == -1^K - failure, where K is partial transfer count 1944 // 1945 address StubGenerator::generate_checkcast_copy(const char *name, address *entry, bool dest_uninitialized) { 1946 1947 Label L_load_element, L_store_element, L_do_card_marks, L_done; 1948 1949 // Input registers (after setup_arg_regs) 1950 const Register from = rdi; // source array address 1951 const Register to = rsi; // destination array address 1952 const Register length = rdx; // elements count 1953 const Register ckoff = rcx; // super_check_offset 1954 const Register ckval = r8; // super_klass 1955 1956 // Registers used as temps (r13, r14 are save-on-entry) 1957 const Register end_from = from; // source array end address 1958 const Register end_to = r13; // destination array end address 1959 const Register count = rdx; // -(count_remaining) 1960 const Register r14_length = r14; // saved copy of length 1961 // End pointers are inclusive, and if length is not zero they point 1962 // to the last unit copied: end_to[0] := end_from[0] 1963 1964 const Register rax_oop = rax; // actual oop copied 1965 const Register r11_klass = r11; // oop._klass 1966 1967 //--------------------------------------------------------------- 1968 // Assembler stub will be used for this call to arraycopy 1969 // if the two arrays are subtypes of Object[] but the 1970 // destination array type is not equal to or a supertype 1971 // of the source type. Each element must be separately 1972 // checked. 1973 1974 __ align(CodeEntryAlignment); 1975 StubCodeMark mark(this, "StubRoutines", name); 1976 address start = __ pc(); 1977 1978 __ enter(); // required for proper stackwalking of RuntimeStub frame 1979 1980 #ifdef ASSERT 1981 // caller guarantees that the arrays really are different 1982 // otherwise, we would have to make conjoint checks 1983 { Label L; 1984 array_overlap_test(L, TIMES_OOP); 1985 __ stop("checkcast_copy within a single array"); 1986 __ bind(L); 1987 } 1988 #endif //ASSERT 1989 1990 setup_arg_regs(4); // from => rdi, to => rsi, length => rdx 1991 // ckoff => rcx, ckval => r8 1992 // r9 and r10 may be used to save non-volatile registers 1993 #ifdef _WIN64 1994 // last argument (#4) is on stack on Win64 1995 __ movptr(ckval, Address(rsp, 6 * wordSize)); 1996 #endif 1997 1998 // Caller of this entry point must set up the argument registers. 
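  // A concrete reading of the "rax == -1^K" protocol documented above: the
  // stub returns the bitwise complement of the number of elements successfully
  // copied before a type-check failure.  For example, if 2 of 10 elements were
  // stored before the failing element, K == 2 and rax == ~2 == -3; the caller
  // recovers K as ~rax.  A fully successful copy returns 0.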
1999 if (entry != NULL) { 2000 *entry = __ pc(); 2001 BLOCK_COMMENT("Entry:"); 2002 } 2003 2004 // allocate spill slots for r13, r14 2005 enum { 2006 saved_r13_offset, 2007 saved_r14_offset, 2008 saved_r10_offset, 2009 saved_rbp_offset 2010 }; 2011 __ subptr(rsp, saved_rbp_offset * wordSize); 2012 __ movptr(Address(rsp, saved_r13_offset * wordSize), r13); 2013 __ movptr(Address(rsp, saved_r14_offset * wordSize), r14); 2014 __ movptr(Address(rsp, saved_r10_offset * wordSize), r10); 2015 2016 #ifdef ASSERT 2017 Label L2; 2018 __ get_thread(r14); 2019 __ cmpptr(r15_thread, r14); 2020 __ jcc(Assembler::equal, L2); 2021 __ stop("StubRoutines::call_stub: r15_thread is modified by call"); 2022 __ bind(L2); 2023 #endif // ASSERT 2024 2025 // check that int operands are properly extended to size_t 2026 assert_clean_int(length, rax); 2027 assert_clean_int(ckoff, rax); 2028 2029 #ifdef ASSERT 2030 BLOCK_COMMENT("assert consistent ckoff/ckval"); 2031 // The ckoff and ckval must be mutually consistent, 2032 // even though caller generates both. 2033 { Label L; 2034 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 2035 __ cmpl(ckoff, Address(ckval, sco_offset)); 2036 __ jcc(Assembler::equal, L); 2037 __ stop("super_check_offset inconsistent"); 2038 __ bind(L); 2039 } 2040 #endif //ASSERT 2041 2042 // Loop-invariant addresses. They are exclusive end pointers. 2043 Address end_from_addr(from, length, TIMES_OOP, 0); 2044 Address end_to_addr(to, length, TIMES_OOP, 0); 2045 // Loop-variant addresses. They assume post-incremented count < 0. 2046 Address from_element_addr(end_from, count, TIMES_OOP, 0); 2047 Address to_element_addr(end_to, count, TIMES_OOP, 0); 2048 2049 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT; 2050 if (dest_uninitialized) { 2051 decorators |= IS_DEST_UNINITIALIZED; 2052 } 2053 2054 BasicType type = T_OBJECT; 2055 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 2056 bs->arraycopy_prologue(_masm, decorators, type, from, to, count); 2057 2058 // Copy from low to high addresses, indexed from the end of each array. 2059 __ lea(end_from, end_from_addr); 2060 __ lea(end_to, end_to_addr); 2061 __ movptr(r14_length, length); // save a copy of the length 2062 assert(length == count, ""); // else fix next line: 2063 __ negptr(count); // negate and test the length 2064 __ jcc(Assembler::notZero, L_load_element); 2065 2066 // Empty array: Nothing to do. 2067 __ xorptr(rax, rax); // return 0 on (trivial) success 2068 __ jmp(L_done); 2069 2070 // ======== begin loop ======== 2071 // (Loop is rotated; its entry is L_load_element.) 2072 // Loop control: 2073 // for (count = -count; count != 0; count++) 2074 // Base pointers src, dst are biased by 8*(count-1),to last element. 
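  // (Illustrative sketch of the rotated loop that follows, ignoring GC
  //  barriers; 'end_from'/'end_to' are the exclusive end addresses computed
  //  above, 'i' is the negative element index held in 'count', and
  //  is_subtype() is shorthand for the generate_type_check() logic:)
  //
  //    for (ptrdiff_t i = -length; i != 0; i++) {
  //      oop o = end_from[i];                         // L_load_element
  //      if (o != NULL && !is_subtype(o->klass(), ckval))
  //        break;                                     // report partial copy
  //      end_to[i] = o;                               // L_store_element
  //    }
  //    // on early break: K = length + i elements were copied, rax = ~K
  //    // on completion:  rax = 0, then card marks / epilogue for the range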
2075 __ align(OptoLoopAlignment); 2076 2077 __ BIND(L_store_element); 2078 __ store_heap_oop(to_element_addr, rax_oop, noreg, noreg, noreg, AS_RAW); // store the oop 2079 __ increment(count); // increment the count toward zero 2080 __ jcc(Assembler::zero, L_do_card_marks); 2081 2082 // ======== loop entry is here ======== 2083 __ BIND(L_load_element); 2084 __ load_heap_oop(rax_oop, from_element_addr, noreg, noreg, AS_RAW); // load the oop 2085 __ testptr(rax_oop, rax_oop); 2086 __ jcc(Assembler::zero, L_store_element); 2087 2088 __ load_klass(r11_klass, rax_oop, rscratch1);// query the object klass 2089 generate_type_check(r11_klass, ckoff, ckval, L_store_element); 2090 // ======== end loop ======== 2091 2092 // It was a real error; we must depend on the caller to finish the job. 2093 // Register rdx = -1 * number of *remaining* oops, r14 = *total* oops. 2094 // Emit GC store barriers for the oops we have copied (r14 + rdx), 2095 // and report their number to the caller. 2096 assert_different_registers(rax, r14_length, count, to, end_to, rcx, rscratch1); 2097 Label L_post_barrier; 2098 __ addptr(r14_length, count); // K = (original - remaining) oops 2099 __ movptr(rax, r14_length); // save the value 2100 __ notptr(rax); // report (-1^K) to caller (does not affect flags) 2101 __ jccb(Assembler::notZero, L_post_barrier); 2102 __ jmp(L_done); // K == 0, nothing was copied, skip post barrier 2103 2104 // Come here on success only. 2105 __ BIND(L_do_card_marks); 2106 __ xorptr(rax, rax); // return 0 on success 2107 2108 __ BIND(L_post_barrier); 2109 bs->arraycopy_epilogue(_masm, decorators, type, from, to, r14_length); 2110 2111 // Common exit point (success or failure). 2112 __ BIND(L_done); 2113 __ movptr(r13, Address(rsp, saved_r13_offset * wordSize)); 2114 __ movptr(r14, Address(rsp, saved_r14_offset * wordSize)); 2115 __ movptr(r10, Address(rsp, saved_r10_offset * wordSize)); 2116 restore_arg_regs(); 2117 INC_COUNTER_NP(SharedRuntime::_checkcast_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free 2118 __ leave(); // required for proper stackwalking of RuntimeStub frame 2119 __ ret(0); 2120 2121 return start; 2122 } 2123 2124 2125 // Generate 'unsafe' array copy stub 2126 // Though just as safe as the other stubs, it takes an unscaled 2127 // size_t argument instead of an element count. 2128 // 2129 // Input: 2130 // c_rarg0 - source array address 2131 // c_rarg1 - destination array address 2132 // c_rarg2 - byte count, treated as ssize_t, can be zero 2133 // 2134 // Examines the alignment of the operands and dispatches 2135 // to a long, int, short, or byte copy loop. 
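// In outline, the dispatch generated below is (sketch only):
//
//    bits = (uintptr_t)from | (uintptr_t)to | size;
//    if ((bits & (BytesPerLong-1))  == 0) goto long_copy;   // size >> 3 qwords
//    if ((bits & (BytesPerInt-1))   == 0) goto int_copy;    // size >> 2 dwords
//    if ((bits & (BytesPerShort-1)) == 0) goto short_copy;  // size >> 1 words
//    goto byte_copy;                                        // size bytes as-is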
 2136 // 2137 address StubGenerator::generate_unsafe_copy(const char *name, 2138 address byte_copy_entry, address short_copy_entry, 2139 address int_copy_entry, address long_copy_entry) { 2140 2141 Label L_long_aligned, L_int_aligned, L_short_aligned; 2142 2143 // Input registers (before setup_arg_regs) 2144 const Register from = c_rarg0; // source array address 2145 const Register to = c_rarg1; // destination array address 2146 const Register size = c_rarg2; // byte count (size_t) 2147 2148 // Register used as a temp 2149 const Register bits = rax; // test copy of low bits 2150 2151 __ align(CodeEntryAlignment); 2152 StubCodeMark mark(this, "StubRoutines", name); 2153 address start = __ pc(); 2154 2155 __ enter(); // required for proper stackwalking of RuntimeStub frame 2156 2157 // bump this on entry, not on exit: 2158 INC_COUNTER_NP(SharedRuntime::_unsafe_array_copy_ctr, rscratch1); 2159 2160 __ mov(bits, from); 2161 __ orptr(bits, to); 2162 __ orptr(bits, size); 2163 2164 __ testb(bits, BytesPerLong-1); 2165 __ jccb(Assembler::zero, L_long_aligned); 2166 2167 __ testb(bits, BytesPerInt-1); 2168 __ jccb(Assembler::zero, L_int_aligned); 2169 2170 __ testb(bits, BytesPerShort-1); 2171 __ jump_cc(Assembler::notZero, RuntimeAddress(byte_copy_entry)); 2172 2173 __ BIND(L_short_aligned); 2174 __ shrptr(size, LogBytesPerShort); // size => short_count 2175 __ jump(RuntimeAddress(short_copy_entry)); 2176 2177 __ BIND(L_int_aligned); 2178 __ shrptr(size, LogBytesPerInt); // size => int_count 2179 __ jump(RuntimeAddress(int_copy_entry)); 2180 2181 __ BIND(L_long_aligned); 2182 __ shrptr(size, LogBytesPerLong); // size => qword_count 2183 __ jump(RuntimeAddress(long_copy_entry)); 2184 2185 return start; 2186 } 2187 2188 2189 // Perform range checks on the proposed arraycopy. 2190 // Kills temp, but nothing else. 2191 // Also, clean the sign bits of src_pos and dst_pos. 2192 void StubGenerator::arraycopy_range_checks(Register src, // source array oop (c_rarg0) 2193 Register src_pos, // source position (c_rarg1) 2194 Register dst, // destination array oop (c_rarg2) 2195 Register dst_pos, // destination position (c_rarg3) 2196 Register length, 2197 Register temp, 2198 Label& L_failed) { 2199 BLOCK_COMMENT("arraycopy_range_checks:"); 2200 2201 // if (src_pos + length > arrayOop(src)->length()) FAIL; 2202 __ movl(temp, length); 2203 __ addl(temp, src_pos); // src_pos + length 2204 __ cmpl(temp, Address(src, arrayOopDesc::length_offset_in_bytes())); 2205 __ jcc(Assembler::above, L_failed); 2206 2207 // if (dst_pos + length > arrayOop(dst)->length()) FAIL; 2208 __ movl(temp, length); 2209 __ addl(temp, dst_pos); // dst_pos + length 2210 __ cmpl(temp, Address(dst, arrayOopDesc::length_offset_in_bytes())); 2211 __ jcc(Assembler::above, L_failed); 2212 2213 // Have to clean up high 32-bits of 'src_pos' and 'dst_pos'. 2214 // Move with sign extension can be used since they are positive.
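  // (Worked reading of the two checks above: both use an unsigned 'above'
  //  compare, so with src_pos, dst_pos and length already known to be
  //  non-negative 32-bit values, a sum that exceeds the array length -- even
  //  one that would wrap a signed int -- takes the branch to L_failed:)
  //
  //    if ((uint32_t)(src_pos + length) > (uint32_t)arrayOop(src)->length()) goto L_failed;
  //    if ((uint32_t)(dst_pos + length) > (uint32_t)arrayOop(dst)->length()) goto L_failed;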
2215 __ movslq(src_pos, src_pos); 2216 __ movslq(dst_pos, dst_pos); 2217 2218 BLOCK_COMMENT("arraycopy_range_checks done"); 2219 } 2220 2221 2222 // Generate generic array copy stubs 2223 // 2224 // Input: 2225 // c_rarg0 - src oop 2226 // c_rarg1 - src_pos (32-bits) 2227 // c_rarg2 - dst oop 2228 // c_rarg3 - dst_pos (32-bits) 2229 // not Win64 2230 // c_rarg4 - element count (32-bits) 2231 // Win64 2232 // rsp+40 - element count (32-bits) 2233 // 2234 // Output: 2235 // rax == 0 - success 2236 // rax == -1^K - failure, where K is partial transfer count 2237 // 2238 address StubGenerator::generate_generic_copy(const char *name, 2239 address byte_copy_entry, address short_copy_entry, 2240 address int_copy_entry, address oop_copy_entry, 2241 address long_copy_entry, address checkcast_copy_entry) { 2242 2243 Label L_failed, L_failed_0, L_objArray; 2244 Label L_copy_shorts, L_copy_ints, L_copy_longs; 2245 2246 // Input registers 2247 const Register src = c_rarg0; // source array oop 2248 const Register src_pos = c_rarg1; // source position 2249 const Register dst = c_rarg2; // destination array oop 2250 const Register dst_pos = c_rarg3; // destination position 2251 #ifndef _WIN64 2252 const Register length = c_rarg4; 2253 const Register rklass_tmp = r9; // load_klass 2254 #else 2255 const Address length(rsp, 7 * wordSize); // elements count is on stack on Win64 2256 const Register rklass_tmp = rdi; // load_klass 2257 #endif 2258 2259 { int modulus = CodeEntryAlignment; 2260 int target = modulus - 5; // 5 = sizeof jmp(L_failed) 2261 int advance = target - (__ offset() % modulus); 2262 if (advance < 0) advance += modulus; 2263 if (advance > 0) __ nop(advance); 2264 } 2265 StubCodeMark mark(this, "StubRoutines", name); 2266 2267 // Short-hop target to L_failed. Makes for denser prologue code. 2268 __ BIND(L_failed_0); 2269 __ jmp(L_failed); 2270 assert(__ offset() % CodeEntryAlignment == 0, "no further alignment needed"); 2271 2272 __ align(CodeEntryAlignment); 2273 address start = __ pc(); 2274 2275 __ enter(); // required for proper stackwalking of RuntimeStub frame 2276 2277 #ifdef _WIN64 2278 __ push(rklass_tmp); // rdi is callee-save on Windows 2279 #endif 2280 2281 // bump this on entry, not on exit: 2282 INC_COUNTER_NP(SharedRuntime::_generic_array_copy_ctr, rscratch1); 2283 2284 //----------------------------------------------------------------------- 2285 // Assembler stub will be used for this call to arraycopy 2286 // if the following conditions are met: 2287 // 2288 // (1) src and dst must not be null. 2289 // (2) src_pos must not be negative. 2290 // (3) dst_pos must not be negative. 2291 // (4) length must not be negative. 2292 // (5) src klass and dst klass should be the same and not NULL. 2293 // (6) src and dst should be arrays. 2294 // (7) src_pos + length must not exceed length of src. 2295 // (8) dst_pos + length must not exceed length of dst. 
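  // If any of these checks fails, control goes to L_failed (often via the
  // short-hop L_failed_0 planted above) and the stub returns the documented
  // failure value; at the end of this generator that is materialized as
  //
  //    rax = ~0;   // i.e. return -1, via xorptr(rax, rax); notptr(rax);
  //
  // leaving the caller to handle the failure (typically by taking a slower
  // arraycopy path that can raise the precise exception).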
 2296 // 2297 2298 // if (src == NULL) return -1; 2299 __ testptr(src, src); // src oop 2300 size_t j1off = __ offset(); 2301 __ jccb(Assembler::zero, L_failed_0); 2302 2303 // if (src_pos < 0) return -1; 2304 __ testl(src_pos, src_pos); // src_pos (32-bits) 2305 __ jccb(Assembler::negative, L_failed_0); 2306 2307 // if (dst == NULL) return -1; 2308 __ testptr(dst, dst); // dst oop 2309 __ jccb(Assembler::zero, L_failed_0); 2310 2311 // if (dst_pos < 0) return -1; 2312 __ testl(dst_pos, dst_pos); // dst_pos (32-bits) 2313 size_t j4off = __ offset(); 2314 __ jccb(Assembler::negative, L_failed_0); 2315 2316 // The first four tests are very dense code, 2317 // but not quite dense enough to put four 2318 // jumps in a 16-byte instruction fetch buffer. 2319 // That's good, because some branch predictors 2320 // do not like jumps so close together. 2321 // Make sure of this. 2322 guarantee(((j1off ^ j4off) & ~15) != 0, "I$ line of 1st & 4th jumps"); 2323 2324 // registers used as temp 2325 const Register r11_length = r11; // elements count to copy 2326 const Register r10_src_klass = r10; // array klass 2327 2328 // if (length < 0) return -1; 2329 __ movl(r11_length, length); // length (elements count, 32-bits value) 2330 __ testl(r11_length, r11_length); 2331 __ jccb(Assembler::negative, L_failed_0); 2332 2333 __ load_klass(r10_src_klass, src, rklass_tmp); 2334 #ifdef ASSERT 2335 // assert(src->klass() != NULL); 2336 { 2337 BLOCK_COMMENT("assert klasses not null {"); 2338 Label L1, L2; 2339 __ testptr(r10_src_klass, r10_src_klass); 2340 __ jcc(Assembler::notZero, L2); // it is broken if klass is NULL 2341 __ bind(L1); 2342 __ stop("broken null klass"); 2343 __ bind(L2); 2344 __ load_klass(rax, dst, rklass_tmp); 2345 __ cmpq(rax, 0); 2346 __ jcc(Assembler::equal, L1); // this would be broken also 2347 BLOCK_COMMENT("} assert klasses not null done"); 2348 } 2349 #endif 2350 2351 // Load layout helper (32-bits) 2352 // 2353 // |array_tag| | header_size | element_type | |log2_element_size| 2354 // 32 30 24 16 8 2 0 2355 // 2356 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0 2357 // 2358 2359 const int lh_offset = in_bytes(Klass::layout_helper_offset()); 2360 2361 // Handle objArrays completely differently... 2362 const jint objArray_lh = Klass::array_layout_helper(T_OBJECT); 2363 __ cmpl(Address(r10_src_klass, lh_offset), objArray_lh); 2364 __ jcc(Assembler::equal, L_objArray); 2365 2366 // if (src->klass() != dst->klass()) return -1; 2367 __ load_klass(rax, dst, rklass_tmp); 2368 __ cmpq(r10_src_klass, rax); 2369 __ jcc(Assembler::notEqual, L_failed); 2370 2371 const Register rax_lh = rax; // layout helper 2372 __ movl(rax_lh, Address(r10_src_klass, lh_offset)); 2373 2374 // if (!src->is_Array()) return -1; 2375 __ cmpl(rax_lh, Klass::_lh_neutral_value); 2376 __ jcc(Assembler::greaterEqual, L_failed); 2377 2378 // At this point, it is known to be a typeArray (array_tag 0x3).
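  // (Illustrative decode of the layout helper fields used further below; the
  //  exact widths come from the Klass::_lh_* constants, this is only a sketch:)
  //
  //    int tag   = lh >> Klass::_lh_array_tag_shift;        // 0x3 = typeArray, 0x2 = objArray
  //    int hsize = (lh >> Klass::_lh_header_size_shift)
  //                 & Klass::_lh_header_size_mask;           // offset of element 0, in bytes
  //    int l2es  = lh &  Klass::_lh_log2_element_size_mask;  // 0,1,2,3 selects the copy loop
  //
  //  e.g. a jint array decodes to l2es == 2, so the int copy stub is chosen.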
2379 #ifdef ASSERT 2380 { 2381 BLOCK_COMMENT("assert primitive array {"); 2382 Label L; 2383 __ cmpl(rax_lh, (Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift)); 2384 __ jcc(Assembler::greaterEqual, L); 2385 __ stop("must be a primitive array"); 2386 __ bind(L); 2387 BLOCK_COMMENT("} assert primitive array done"); 2388 } 2389 #endif 2390 2391 arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length, 2392 r10, L_failed); 2393 2394 // TypeArrayKlass 2395 // 2396 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize); 2397 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize); 2398 // 2399 2400 const Register r10_offset = r10; // array offset 2401 const Register rax_elsize = rax_lh; // element size 2402 2403 __ movl(r10_offset, rax_lh); 2404 __ shrl(r10_offset, Klass::_lh_header_size_shift); 2405 __ andptr(r10_offset, Klass::_lh_header_size_mask); // array_offset 2406 __ addptr(src, r10_offset); // src array offset 2407 __ addptr(dst, r10_offset); // dst array offset 2408 BLOCK_COMMENT("choose copy loop based on element size"); 2409 __ andl(rax_lh, Klass::_lh_log2_element_size_mask); // rax_lh -> rax_elsize 2410 2411 #ifdef _WIN64 2412 __ pop(rklass_tmp); // Restore callee-save rdi 2413 #endif 2414 2415 // next registers should be set before the jump to corresponding stub 2416 const Register from = c_rarg0; // source array address 2417 const Register to = c_rarg1; // destination array address 2418 const Register count = c_rarg2; // elements count 2419 2420 // 'from', 'to', 'count' registers should be set in such order 2421 // since they are the same as 'src', 'src_pos', 'dst'. 2422 2423 __ cmpl(rax_elsize, 0); 2424 __ jccb(Assembler::notEqual, L_copy_shorts); 2425 __ lea(from, Address(src, src_pos, Address::times_1, 0));// src_addr 2426 __ lea(to, Address(dst, dst_pos, Address::times_1, 0));// dst_addr 2427 __ movl2ptr(count, r11_length); // length 2428 __ jump(RuntimeAddress(byte_copy_entry)); 2429 2430 __ BIND(L_copy_shorts); 2431 __ cmpl(rax_elsize, LogBytesPerShort); 2432 __ jccb(Assembler::notEqual, L_copy_ints); 2433 __ lea(from, Address(src, src_pos, Address::times_2, 0));// src_addr 2434 __ lea(to, Address(dst, dst_pos, Address::times_2, 0));// dst_addr 2435 __ movl2ptr(count, r11_length); // length 2436 __ jump(RuntimeAddress(short_copy_entry)); 2437 2438 __ BIND(L_copy_ints); 2439 __ cmpl(rax_elsize, LogBytesPerInt); 2440 __ jccb(Assembler::notEqual, L_copy_longs); 2441 __ lea(from, Address(src, src_pos, Address::times_4, 0));// src_addr 2442 __ lea(to, Address(dst, dst_pos, Address::times_4, 0));// dst_addr 2443 __ movl2ptr(count, r11_length); // length 2444 __ jump(RuntimeAddress(int_copy_entry)); 2445 2446 __ BIND(L_copy_longs); 2447 #ifdef ASSERT 2448 { 2449 BLOCK_COMMENT("assert long copy {"); 2450 Label L; 2451 __ cmpl(rax_elsize, LogBytesPerLong); 2452 __ jcc(Assembler::equal, L); 2453 __ stop("must be long copy, but elsize is wrong"); 2454 __ bind(L); 2455 BLOCK_COMMENT("} assert long copy done"); 2456 } 2457 #endif 2458 __ lea(from, Address(src, src_pos, Address::times_8, 0));// src_addr 2459 __ lea(to, Address(dst, dst_pos, Address::times_8, 0));// dst_addr 2460 __ movl2ptr(count, r11_length); // length 2461 __ jump(RuntimeAddress(long_copy_entry)); 2462 2463 // ObjArrayKlass 2464 __ BIND(L_objArray); 2465 // live at this point: r10_src_klass, r11_length, src[_pos], dst[_pos] 2466 2467 Label L_plain_copy, L_checkcast_copy; 2468 // test array classes for subtyping 2469 __ load_klass(rax, dst, rklass_tmp); 2470 __ 
cmpq(r10_src_klass, rax); // usual case is exact equality 2471 __ jcc(Assembler::notEqual, L_checkcast_copy); 2472 2473 // Identically typed arrays can be copied without element-wise checks. 2474 arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length, 2475 r10, L_failed); 2476 2477 __ lea(from, Address(src, src_pos, TIMES_OOP, 2478 arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // src_addr 2479 __ lea(to, Address(dst, dst_pos, TIMES_OOP, 2480 arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // dst_addr 2481 __ movl2ptr(count, r11_length); // length 2482 __ BIND(L_plain_copy); 2483 #ifdef _WIN64 2484 __ pop(rklass_tmp); // Restore callee-save rdi 2485 #endif 2486 __ jump(RuntimeAddress(oop_copy_entry)); 2487 2488 __ BIND(L_checkcast_copy); 2489 // live at this point: r10_src_klass, r11_length, rax (dst_klass) 2490 { 2491 // Before looking at dst.length, make sure dst is also an objArray. 2492 __ cmpl(Address(rax, lh_offset), objArray_lh); 2493 __ jcc(Assembler::notEqual, L_failed); 2494 2495 // It is safe to examine both src.length and dst.length. 2496 arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length, 2497 rax, L_failed); 2498 2499 const Register r11_dst_klass = r11; 2500 __ load_klass(r11_dst_klass, dst, rklass_tmp); // reload 2501 2502 // Marshal the base address arguments now, freeing registers. 2503 __ lea(from, Address(src, src_pos, TIMES_OOP, 2504 arrayOopDesc::base_offset_in_bytes(T_OBJECT))); 2505 __ lea(to, Address(dst, dst_pos, TIMES_OOP, 2506 arrayOopDesc::base_offset_in_bytes(T_OBJECT))); 2507 __ movl(count, length); // length (reloaded) 2508 Register sco_temp = c_rarg3; // this register is free now 2509 assert_different_registers(from, to, count, sco_temp, 2510 r11_dst_klass, r10_src_klass); 2511 assert_clean_int(count, sco_temp); 2512 2513 // Generate the type check. 2514 const int sco_offset = in_bytes(Klass::super_check_offset_offset()); 2515 __ movl(sco_temp, Address(r11_dst_klass, sco_offset)); 2516 assert_clean_int(sco_temp, rax); 2517 generate_type_check(r10_src_klass, sco_temp, r11_dst_klass, L_plain_copy); 2518 2519 // Fetch destination element klass from the ObjArrayKlass header. 2520 int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset()); 2521 __ movptr(r11_dst_klass, Address(r11_dst_klass, ek_offset)); 2522 __ movl( sco_temp, Address(r11_dst_klass, sco_offset)); 2523 assert_clean_int(sco_temp, rax); 2524 2525 #ifdef _WIN64 2526 __ pop(rklass_tmp); // Restore callee-save rdi 2527 #endif 2528 2529 // the checkcast_copy loop needs two extra arguments: 2530 assert(c_rarg3 == sco_temp, "#3 already in place"); 2531 // Set up arguments for checkcast_copy_entry. 2532 setup_arg_regs(4); 2533 __ movptr(r8, r11_dst_klass); // dst.klass.element_klass, r8 is c_rarg4 on Linux/Solaris 2534 __ jump(RuntimeAddress(checkcast_copy_entry)); 2535 } 2536 2537 __ BIND(L_failed); 2538 #ifdef _WIN64 2539 __ pop(rklass_tmp); // Restore callee-save rdi 2540 #endif 2541 __ xorptr(rax, rax); 2542 __ notptr(rax); // return -1 2543 __ leave(); // required for proper stackwalking of RuntimeStub frame 2544 __ ret(0); 2545 2546 return start; 2547 } 2548 2549 #undef __