1 /* 2 * Copyright (c) 2003, 2024, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. 8 * 9 * This code is distributed in the hope that it will be useful, but WITHOUT 10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 12 * version 2 for more details (a copy is included in the LICENSE file that 13 * accompanied this code). 14 * 15 * You should have received a copy of the GNU General Public License version 16 * 2 along with this work; if not, write to the Free Software Foundation, 17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 18 * 19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 20 * or visit www.oracle.com if you need additional information or have any 21 * questions. 22 * 23 */ 24 25 #include "precompiled.hpp" 26 #include "asm/macroAssembler.hpp" 27 #include "gc/shared/barrierSet.hpp" 28 #include "gc/shared/barrierSetAssembler.hpp" 29 #include "oops/objArrayKlass.hpp" 30 #include "runtime/sharedRuntime.hpp" 31 #include "runtime/stubRoutines.hpp" 32 #include "stubGenerator_x86_64.hpp" 33 #ifdef COMPILER2 34 #include "opto/c2_globals.hpp" 35 #endif 36 #if INCLUDE_JVMCI 37 #include "jvmci/jvmci_globals.hpp" 38 #endif 39 40 #define __ _masm-> 41 42 #define TIMES_OOP (UseCompressedOops ? Address::times_4 : Address::times_8) 43 44 #ifdef PRODUCT 45 #define BLOCK_COMMENT(str) /* nothing */ 46 #else 47 #define BLOCK_COMMENT(str) __ block_comment(str) 48 #endif // PRODUCT 49 50 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":") 51 52 #ifdef PRODUCT 53 #define INC_COUNTER_NP(counter, rscratch) ((void)0) 54 #else 55 #define INC_COUNTER_NP(counter, rscratch) \ 56 BLOCK_COMMENT("inc_counter " #counter); \ 57 inc_counter_np(_masm, counter, rscratch); 58 59 static void inc_counter_np(MacroAssembler* _masm, uint& counter, Register rscratch) { 60 __ incrementl(ExternalAddress((address)&counter), rscratch); 61 } 62 63 #if COMPILER2_OR_JVMCI 64 static uint& get_profile_ctr(int shift) { 65 if (shift == 0) { 66 return SharedRuntime::_jbyte_array_copy_ctr; 67 } else if (shift == 1) { 68 return SharedRuntime::_jshort_array_copy_ctr; 69 } else if (shift == 2) { 70 return SharedRuntime::_jint_array_copy_ctr; 71 } else { 72 assert(shift == 3, ""); 73 return SharedRuntime::_jlong_array_copy_ctr; 74 } 75 } 76 #endif // COMPILER2_OR_JVMCI 77 #endif // !PRODUCT 78 79 void StubGenerator::generate_arraycopy_stubs() { 80 address entry; 81 address entry_jbyte_arraycopy; 82 address entry_jshort_arraycopy; 83 address entry_jint_arraycopy; 84 address entry_oop_arraycopy; 85 address entry_jlong_arraycopy; 86 address entry_checkcast_arraycopy; 87 88 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(false, &entry, 89 "jbyte_disjoint_arraycopy"); 90 StubRoutines::_jbyte_arraycopy = generate_conjoint_byte_copy(false, entry, &entry_jbyte_arraycopy, 91 "jbyte_arraycopy"); 92 93 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, &entry, 94 "jshort_disjoint_arraycopy"); 95 StubRoutines::_jshort_arraycopy = generate_conjoint_short_copy(false, entry, &entry_jshort_arraycopy, 96 "jshort_arraycopy"); 97 98 StubRoutines::_jint_disjoint_arraycopy = 
generate_disjoint_int_oop_copy(false, false, &entry, 99 "jint_disjoint_arraycopy"); 100 StubRoutines::_jint_arraycopy = generate_conjoint_int_oop_copy(false, false, entry, 101 &entry_jint_arraycopy, "jint_arraycopy"); 102 103 StubRoutines::_jlong_disjoint_arraycopy = generate_disjoint_long_oop_copy(false, false, &entry, 104 "jlong_disjoint_arraycopy"); 105 StubRoutines::_jlong_arraycopy = generate_conjoint_long_oop_copy(false, false, entry, 106 &entry_jlong_arraycopy, "jlong_arraycopy"); 107 if (UseCompressedOops) { 108 StubRoutines::_oop_disjoint_arraycopy = generate_disjoint_int_oop_copy(false, true, &entry, 109 "oop_disjoint_arraycopy"); 110 StubRoutines::_oop_arraycopy = generate_conjoint_int_oop_copy(false, true, entry, 111 &entry_oop_arraycopy, "oop_arraycopy"); 112 StubRoutines::_oop_disjoint_arraycopy_uninit = generate_disjoint_int_oop_copy(false, true, &entry, 113 "oop_disjoint_arraycopy_uninit", 114 /*dest_uninitialized*/true); 115 StubRoutines::_oop_arraycopy_uninit = generate_conjoint_int_oop_copy(false, true, entry, 116 nullptr, "oop_arraycopy_uninit", 117 /*dest_uninitialized*/true); 118 } else { 119 StubRoutines::_oop_disjoint_arraycopy = generate_disjoint_long_oop_copy(false, true, &entry, 120 "oop_disjoint_arraycopy"); 121 StubRoutines::_oop_arraycopy = generate_conjoint_long_oop_copy(false, true, entry, 122 &entry_oop_arraycopy, "oop_arraycopy"); 123 StubRoutines::_oop_disjoint_arraycopy_uninit = generate_disjoint_long_oop_copy(false, true, &entry, 124 "oop_disjoint_arraycopy_uninit", 125 /*dest_uninitialized*/true); 126 StubRoutines::_oop_arraycopy_uninit = generate_conjoint_long_oop_copy(false, true, entry, 127 nullptr, "oop_arraycopy_uninit", 128 /*dest_uninitialized*/true); 129 } 130 131 StubRoutines::_checkcast_arraycopy = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy); 132 StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", nullptr, 133 /*dest_uninitialized*/true); 134 135 StubRoutines::_unsafe_arraycopy = generate_unsafe_copy("unsafe_arraycopy", 136 entry_jbyte_arraycopy, 137 entry_jshort_arraycopy, 138 entry_jint_arraycopy, 139 entry_jlong_arraycopy); 140 StubRoutines::_generic_arraycopy = generate_generic_copy("generic_arraycopy", 141 entry_jbyte_arraycopy, 142 entry_jshort_arraycopy, 143 entry_jint_arraycopy, 144 entry_oop_arraycopy, 145 entry_jlong_arraycopy, 146 entry_checkcast_arraycopy); 147 148 StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill"); 149 StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill"); 150 StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill"); 151 StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill"); 152 StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill"); 153 StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill"); 154 155 StubRoutines::_unsafe_setmemory = generate_unsafe_setmemory("unsafe_setmemory", StubRoutines::_jbyte_fill); 156 157 // We don't generate specialized code for HeapWord-aligned source 158 // arrays, so just use the code we've already generated 159 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = StubRoutines::_jbyte_disjoint_arraycopy; 160 StubRoutines::_arrayof_jbyte_arraycopy = StubRoutines::_jbyte_arraycopy; 161 162 StubRoutines::_arrayof_jshort_disjoint_arraycopy = StubRoutines::_jshort_disjoint_arraycopy; 163 StubRoutines::_arrayof_jshort_arraycopy = StubRoutines::_jshort_arraycopy; 164 165 
StubRoutines::_arrayof_jint_disjoint_arraycopy = StubRoutines::_jint_disjoint_arraycopy; 166 StubRoutines::_arrayof_jint_arraycopy = StubRoutines::_jint_arraycopy; 167 168 StubRoutines::_arrayof_jlong_disjoint_arraycopy = StubRoutines::_jlong_disjoint_arraycopy; 169 StubRoutines::_arrayof_jlong_arraycopy = StubRoutines::_jlong_arraycopy; 170 171 StubRoutines::_arrayof_oop_disjoint_arraycopy = StubRoutines::_oop_disjoint_arraycopy; 172 StubRoutines::_arrayof_oop_arraycopy = StubRoutines::_oop_arraycopy; 173 174 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit = StubRoutines::_oop_disjoint_arraycopy_uninit; 175 StubRoutines::_arrayof_oop_arraycopy_uninit = StubRoutines::_oop_arraycopy_uninit; 176 } 177 178 179 // Verify that a register contains clean 32-bits positive value 180 // (high 32-bits are 0) so it could be used in 64-bits shifts. 181 // 182 // Input: 183 // Rint - 32-bits value 184 // Rtmp - scratch 185 // 186 void StubGenerator::assert_clean_int(Register Rint, Register Rtmp) { 187 #ifdef ASSERT 188 Label L; 189 assert_different_registers(Rtmp, Rint); 190 __ movslq(Rtmp, Rint); 191 __ cmpq(Rtmp, Rint); 192 __ jcc(Assembler::equal, L); 193 __ stop("high 32-bits of int value are not 0"); 194 __ bind(L); 195 #endif 196 } 197 198 199 // Generate overlap test for array copy stubs 200 // 201 // Input: 202 // c_rarg0 - from 203 // c_rarg1 - to 204 // c_rarg2 - element count 205 // 206 // Output: 207 // rax - &from[element count - 1] 208 // 209 void StubGenerator::array_overlap_test(address no_overlap_target, Label* NOLp, Address::ScaleFactor sf) { 210 const Register from = c_rarg0; 211 const Register to = c_rarg1; 212 const Register count = c_rarg2; 213 const Register end_from = rax; 214 215 __ cmpptr(to, from); 216 __ lea(end_from, Address(from, count, sf, 0)); 217 if (NOLp == nullptr) { 218 ExternalAddress no_overlap(no_overlap_target); 219 __ jump_cc(Assembler::belowEqual, no_overlap); 220 __ cmpptr(to, end_from); 221 __ jump_cc(Assembler::aboveEqual, no_overlap); 222 } else { 223 __ jcc(Assembler::belowEqual, (*NOLp)); 224 __ cmpptr(to, end_from); 225 __ jcc(Assembler::aboveEqual, (*NOLp)); 226 } 227 } 228 229 230 // Copy big chunks forward 231 // 232 // Inputs: 233 // end_from - source arrays end address 234 // end_to - destination array end address 235 // qword_count - 64-bits element count, negative 236 // tmp1 - scratch 237 // L_copy_bytes - entry label 238 // L_copy_8_bytes - exit label 239 // 240 void StubGenerator::copy_bytes_forward(Register end_from, Register end_to, 241 Register qword_count, Register tmp1, 242 Register tmp2, Label& L_copy_bytes, 243 Label& L_copy_8_bytes, DecoratorSet decorators, 244 BasicType type) { 245 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); 246 DEBUG_ONLY(__ stop("enter at entry label, not here")); 247 Label L_loop; 248 __ align(OptoLoopAlignment); 249 if (UseUnalignedLoadStores) { 250 Label L_end; 251 __ BIND(L_loop); 252 if (UseAVX >= 2) { 253 bs->copy_load_at(_masm, decorators, type, 32, 254 xmm0, Address(end_from, qword_count, Address::times_8, -56), 255 tmp1, xmm1); 256 bs->copy_store_at(_masm, decorators, type, 32, 257 Address(end_to, qword_count, Address::times_8, -56), xmm0, 258 tmp1, tmp2, xmm1); 259 260 bs->copy_load_at(_masm, decorators, type, 32, 261 xmm0, Address(end_from, qword_count, Address::times_8, -24), 262 tmp1, xmm1); 263 bs->copy_store_at(_masm, decorators, type, 32, 264 Address(end_to, qword_count, Address::times_8, -24), xmm0, 265 tmp1, tmp2, xmm1); 266 } else { 267 bs->copy_load_at(_masm, 
decorators, type, 16, 268 xmm0, Address(end_from, qword_count, Address::times_8, -56), 269 tmp1, xmm1); 270 bs->copy_store_at(_masm, decorators, type, 16, 271 Address(end_to, qword_count, Address::times_8, -56), xmm0, 272 tmp1, tmp2, xmm1); 273 bs->copy_load_at(_masm, decorators, type, 16, 274 xmm0, Address(end_from, qword_count, Address::times_8, -40), 275 tmp1, xmm1); 276 bs->copy_store_at(_masm, decorators, type, 16, 277 Address(end_to, qword_count, Address::times_8, -40), xmm0, 278 tmp1, tmp2, xmm1); 279 bs->copy_load_at(_masm, decorators, type, 16, 280 xmm0, Address(end_from, qword_count, Address::times_8, -24), 281 tmp1, xmm1); 282 bs->copy_store_at(_masm, decorators, type, 16, 283 Address(end_to, qword_count, Address::times_8, -24), xmm0, 284 tmp1, tmp2, xmm1); 285 bs->copy_load_at(_masm, decorators, type, 16, 286 xmm0, Address(end_from, qword_count, Address::times_8, -8), 287 tmp1, xmm1); 288 bs->copy_store_at(_masm, decorators, type, 16, 289 Address(end_to, qword_count, Address::times_8, -8), xmm0, 290 tmp1, tmp2, xmm1); 291 } 292 293 __ BIND(L_copy_bytes); 294 __ addptr(qword_count, 8); 295 __ jcc(Assembler::lessEqual, L_loop); 296 __ subptr(qword_count, 4); // sub(8) and add(4) 297 __ jcc(Assembler::greater, L_end); 298 // Copy trailing 32 bytes 299 if (UseAVX >= 2) { 300 bs->copy_load_at(_masm, decorators, type, 32, 301 xmm0, Address(end_from, qword_count, Address::times_8, -24), 302 tmp1, xmm1); 303 bs->copy_store_at(_masm, decorators, type, 32, 304 Address(end_to, qword_count, Address::times_8, -24), xmm0, 305 tmp1, tmp2, xmm1); 306 } else { 307 bs->copy_load_at(_masm, decorators, type, 16, 308 xmm0, Address(end_from, qword_count, Address::times_8, -24), 309 tmp1, xmm1); 310 bs->copy_store_at(_masm, decorators, type, 16, 311 Address(end_to, qword_count, Address::times_8, -24), xmm0, 312 tmp1, tmp2, xmm1); 313 bs->copy_load_at(_masm, decorators, type, 16, 314 xmm0, Address(end_from, qword_count, Address::times_8, -8), 315 tmp1, xmm1); 316 bs->copy_store_at(_masm, decorators, type, 16, 317 Address(end_to, qword_count, Address::times_8, -8), xmm0, 318 tmp1, tmp2, xmm1); 319 } 320 __ addptr(qword_count, 4); 321 __ BIND(L_end); 322 } else { 323 // Copy 32-bytes per iteration 324 __ BIND(L_loop); 325 bs->copy_load_at(_masm, decorators, type, 8, 326 tmp1, Address(end_from, qword_count, Address::times_8, -24), 327 tmp2); 328 bs->copy_store_at(_masm, decorators, type, 8, 329 Address(end_to, qword_count, Address::times_8, -24), tmp1, 330 tmp2); 331 bs->copy_load_at(_masm, decorators, type, 8, 332 tmp1, Address(end_from, qword_count, Address::times_8, -16), 333 tmp2); 334 bs->copy_store_at(_masm, decorators, type, 8, 335 Address(end_to, qword_count, Address::times_8, -16), tmp1, 336 tmp2); 337 bs->copy_load_at(_masm, decorators, type, 8, 338 tmp1, Address(end_from, qword_count, Address::times_8, -8), 339 tmp2); 340 bs->copy_store_at(_masm, decorators, type, 8, 341 Address(end_to, qword_count, Address::times_8, -8), tmp1, 342 tmp2); 343 bs->copy_load_at(_masm, decorators, type, 8, 344 tmp1, Address(end_from, qword_count, Address::times_8, 0), 345 tmp2); 346 bs->copy_store_at(_masm, decorators, type, 8, 347 Address(end_to, qword_count, Address::times_8, 0), tmp1, 348 tmp2); 349 350 __ BIND(L_copy_bytes); 351 __ addptr(qword_count, 4); 352 __ jcc(Assembler::lessEqual, L_loop); 353 } 354 __ subptr(qword_count, 4); 355 __ jcc(Assembler::less, L_copy_8_bytes); // Copy trailing qwords 356 } 357 358 359 // Copy big chunks backward 360 // 361 // Inputs: 362 // from - source arrays address 363 // 
dest - destination array address 364 // qword_count - 64-bits element count 365 // tmp1 - scratch 366 // L_copy_bytes - entry label 367 // L_copy_8_bytes - exit label 368 // 369 void StubGenerator::copy_bytes_backward(Register from, Register dest, 370 Register qword_count, Register tmp1, 371 Register tmp2, Label& L_copy_bytes, 372 Label& L_copy_8_bytes, DecoratorSet decorators, 373 BasicType type) { 374 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); 375 DEBUG_ONLY(__ stop("enter at entry label, not here")); 376 Label L_loop; 377 __ align(OptoLoopAlignment); 378 if (UseUnalignedLoadStores) { 379 Label L_end; 380 __ BIND(L_loop); 381 if (UseAVX >= 2) { 382 bs->copy_load_at(_masm, decorators, type, 32, 383 xmm0, Address(from, qword_count, Address::times_8, 32), 384 tmp1, xmm1); 385 bs->copy_store_at(_masm, decorators, type, 32, 386 Address(dest, qword_count, Address::times_8, 32), xmm0, 387 tmp1, tmp2, xmm1); 388 bs->copy_load_at(_masm, decorators, type, 32, 389 xmm0, Address(from, qword_count, Address::times_8, 0), 390 tmp1, xmm1); 391 bs->copy_store_at(_masm, decorators, type, 32, 392 Address(dest, qword_count, Address::times_8, 0), xmm0, 393 tmp1, tmp2, xmm1); 394 } else { 395 bs->copy_load_at(_masm, decorators, type, 16, 396 xmm0, Address(from, qword_count, Address::times_8, 48), 397 tmp1, xmm1); 398 bs->copy_store_at(_masm, decorators, type, 16, 399 Address(dest, qword_count, Address::times_8, 48), xmm0, 400 tmp1, tmp2, xmm1); 401 bs->copy_load_at(_masm, decorators, type, 16, 402 xmm0, Address(from, qword_count, Address::times_8, 32), 403 tmp1, xmm1); 404 bs->copy_store_at(_masm, decorators, type, 16, 405 Address(dest, qword_count, Address::times_8, 32), xmm0, 406 tmp1, tmp2, xmm1); 407 bs->copy_load_at(_masm, decorators, type, 16, 408 xmm0, Address(from, qword_count, Address::times_8, 16), 409 tmp1, xmm1); 410 bs->copy_store_at(_masm, decorators, type, 16, 411 Address(dest, qword_count, Address::times_8, 16), xmm0, 412 tmp1, tmp2, xmm1); 413 bs->copy_load_at(_masm, decorators, type, 16, 414 xmm0, Address(from, qword_count, Address::times_8, 0), 415 tmp1, xmm1); 416 bs->copy_store_at(_masm, decorators, type, 16, 417 Address(dest, qword_count, Address::times_8, 0), xmm0, 418 tmp1, tmp2, xmm1); 419 } 420 421 __ BIND(L_copy_bytes); 422 __ subptr(qword_count, 8); 423 __ jcc(Assembler::greaterEqual, L_loop); 424 425 __ addptr(qword_count, 4); // add(8) and sub(4) 426 __ jcc(Assembler::less, L_end); 427 // Copy trailing 32 bytes 428 if (UseAVX >= 2) { 429 bs->copy_load_at(_masm, decorators, type, 32, 430 xmm0, Address(from, qword_count, Address::times_8, 0), 431 tmp1, xmm1); 432 bs->copy_store_at(_masm, decorators, type, 32, 433 Address(dest, qword_count, Address::times_8, 0), xmm0, 434 tmp1, tmp2, xmm1); 435 } else { 436 bs->copy_load_at(_masm, decorators, type, 16, 437 xmm0, Address(from, qword_count, Address::times_8, 16), 438 tmp1, xmm1); 439 bs->copy_store_at(_masm, decorators, type, 16, 440 Address(dest, qword_count, Address::times_8, 16), xmm0, 441 tmp1, tmp2, xmm1); 442 bs->copy_load_at(_masm, decorators, type, 16, 443 xmm0, Address(from, qword_count, Address::times_8, 0), 444 tmp1, xmm1); 445 bs->copy_store_at(_masm, decorators, type, 16, 446 Address(dest, qword_count, Address::times_8, 0), xmm0, 447 tmp1, tmp2, xmm1); 448 } 449 __ subptr(qword_count, 4); 450 __ BIND(L_end); 451 } else { 452 // Copy 32-bytes per iteration 453 __ BIND(L_loop); 454 bs->copy_load_at(_masm, decorators, type, 8, 455 tmp1, Address(from, qword_count, Address::times_8, 24), 456 
tmp2); 457 bs->copy_store_at(_masm, decorators, type, 8, 458 Address(dest, qword_count, Address::times_8, 24), tmp1, 459 tmp2); 460 bs->copy_load_at(_masm, decorators, type, 8, 461 tmp1, Address(from, qword_count, Address::times_8, 16), 462 tmp2); 463 bs->copy_store_at(_masm, decorators, type, 8, 464 Address(dest, qword_count, Address::times_8, 16), tmp1, 465 tmp2); 466 bs->copy_load_at(_masm, decorators, type, 8, 467 tmp1, Address(from, qword_count, Address::times_8, 8), 468 tmp2); 469 bs->copy_store_at(_masm, decorators, type, 8, 470 Address(dest, qword_count, Address::times_8, 8), tmp1, 471 tmp2); 472 bs->copy_load_at(_masm, decorators, type, 8, 473 tmp1, Address(from, qword_count, Address::times_8, 0), 474 tmp2); 475 bs->copy_store_at(_masm, decorators, type, 8, 476 Address(dest, qword_count, Address::times_8, 0), tmp1, 477 tmp2); 478 479 __ BIND(L_copy_bytes); 480 __ subptr(qword_count, 4); 481 __ jcc(Assembler::greaterEqual, L_loop); 482 } 483 __ addptr(qword_count, 4); 484 __ jcc(Assembler::greater, L_copy_8_bytes); // Copy trailing qwords 485 } 486 487 #if COMPILER2_OR_JVMCI 488 489 // Note: Following rules apply to AVX3 optimized arraycopy stubs:- 490 // - If target supports AVX3 features (BW+VL+F) then implementation uses 32 byte vectors (YMMs) 491 // for both special cases (various small block sizes) and aligned copy loop. This is the 492 // default configuration. 493 // - If copy length is above AVX3Threshold, then implementation use 64 byte vectors (ZMMs) 494 // for main copy loop (and subsequent tail) since bulk of the cycles will be consumed in it. 495 // - If user forces MaxVectorSize=32 then above 4096 bytes its seen that REP MOVs shows a 496 // better performance for disjoint copies. For conjoint/backward copy vector based 497 // copy performs better. 498 // - If user sets AVX3Threshold=0, then special cases for small blocks sizes operate over 499 // 64 byte vector registers (ZMMs). 500 501 // Inputs: 502 // c_rarg0 - source array address 503 // c_rarg1 - destination array address 504 // c_rarg2 - element count, treated as ssize_t, can be zero 505 // 506 // 507 // Side Effects: 508 // disjoint_copy_avx3_masked is set to the no-overlap entry point 509 // used by generate_conjoint_[byte/int/short/long]_copy(). 510 // 511 address StubGenerator::generate_disjoint_copy_avx3_masked(address* entry, const char *name, 512 int shift, bool aligned, bool is_oop, 513 bool dest_uninitialized) { 514 __ align(CodeEntryAlignment); 515 StubCodeMark mark(this, "StubRoutines", name); 516 address start = __ pc(); 517 518 int avx3threshold = VM_Version::avx3_threshold(); 519 bool use64byteVector = (MaxVectorSize > 32) && (avx3threshold == 0); 520 const int large_threshold = 2621440; // 2.5 MB 521 Label L_main_loop, L_main_loop_64bytes, L_tail, L_tail64, L_exit, L_entry; 522 Label L_repmovs, L_main_pre_loop, L_main_pre_loop_64bytes, L_pre_main_post_64; 523 Label L_copy_large, L_finish; 524 const Register from = rdi; // source array address 525 const Register to = rsi; // destination array address 526 const Register count = rdx; // elements count 527 const Register temp1 = r8; 528 const Register temp2 = r11; 529 const Register temp3 = rax; 530 const Register temp4 = rcx; 531 // End pointers are inclusive, and if count is not zero they point 532 // to the last unit copied: end_to[0] := end_from[0] 533 534 __ enter(); // required for proper stackwalking of RuntimeStub frame 535 assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int. 
536 537 if (entry != nullptr) { 538 *entry = __ pc(); 539 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 540 BLOCK_COMMENT("Entry:"); 541 } 542 543 BasicType type_vec[] = { T_BYTE, T_SHORT, T_INT, T_LONG}; 544 BasicType type = is_oop ? T_OBJECT : type_vec[shift]; 545 546 setup_argument_regs(type); 547 548 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT; 549 if (dest_uninitialized) { 550 decorators |= IS_DEST_UNINITIALIZED; 551 } 552 if (aligned) { 553 decorators |= ARRAYCOPY_ALIGNED; 554 } 555 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 556 bs->arraycopy_prologue(_masm, decorators, type, from, to, count); 557 558 { 559 // Type(shift) byte(0), short(1), int(2), long(3) 560 int loop_size[] = { 192, 96, 48, 24}; 561 int threshold[] = { 4096, 2048, 1024, 512}; 562 563 // UnsafeMemoryAccess page error: continue after unsafe access 564 UnsafeMemoryAccessMark umam(this, !is_oop && !aligned, true); 565 // 'from', 'to' and 'count' are now valid 566 567 // temp1 holds remaining count and temp4 holds running count used to compute 568 // next address offset for start of to/from addresses (temp4 * scale). 569 __ mov64(temp4, 0); 570 __ movq(temp1, count); 571 572 // Zero length check. 573 __ BIND(L_tail); 574 __ cmpq(temp1, 0); 575 __ jcc(Assembler::lessEqual, L_exit); 576 577 // Special cases using 32 byte [masked] vector copy operations. 578 arraycopy_avx3_special_cases(xmm1, k2, from, to, temp1, shift, 579 temp4, temp3, use64byteVector, L_entry, L_exit); 580 581 // PRE-MAIN-POST loop for aligned copy. 582 __ BIND(L_entry); 583 584 if (MaxVectorSize == 64) { 585 __ movq(temp2, temp1); 586 __ shlq(temp2, shift); 587 __ cmpq(temp2, large_threshold); 588 __ jcc(Assembler::greaterEqual, L_copy_large); 589 } 590 if (avx3threshold != 0) { 591 __ cmpq(count, threshold[shift]); 592 if (MaxVectorSize == 64) { 593 // Copy using 64 byte vectors. 594 __ jcc(Assembler::greaterEqual, L_pre_main_post_64); 595 } else { 596 assert(MaxVectorSize < 64, "vector size should be < 64 bytes"); 597 // REP MOVS offer a faster copy path. 598 __ jcc(Assembler::greaterEqual, L_repmovs); 599 } 600 } 601 602 if ((MaxVectorSize < 64) || (avx3threshold != 0)) { 603 // Partial copy to make dst address 32 byte aligned. 604 __ movq(temp2, to); 605 __ andq(temp2, 31); 606 __ jcc(Assembler::equal, L_main_pre_loop); 607 608 __ negptr(temp2); 609 __ addq(temp2, 32); 610 if (shift) { 611 __ shrq(temp2, shift); 612 } 613 __ movq(temp3, temp2); 614 copy32_masked_avx(to, from, xmm1, k2, temp3, temp4, temp1, shift); 615 __ movq(temp4, temp2); 616 __ movq(temp1, count); 617 __ subq(temp1, temp2); 618 619 __ cmpq(temp1, loop_size[shift]); 620 __ jcc(Assembler::less, L_tail); 621 622 __ BIND(L_main_pre_loop); 623 __ subq(temp1, loop_size[shift]); 624 625 // Main loop with aligned copy block size of 192 bytes at 32 byte granularity. 626 __ align32(); 627 __ BIND(L_main_loop); 628 copy64_avx(to, from, temp4, xmm1, false, shift, 0); 629 copy64_avx(to, from, temp4, xmm1, false, shift, 64); 630 copy64_avx(to, from, temp4, xmm1, false, shift, 128); 631 __ addptr(temp4, loop_size[shift]); 632 __ subq(temp1, loop_size[shift]); 633 __ jcc(Assembler::greater, L_main_loop); 634 635 __ addq(temp1, loop_size[shift]); 636 637 // Tail loop. 638 __ jmp(L_tail); 639 640 __ BIND(L_repmovs); 641 __ movq(temp2, temp1); 642 // Swap to(RSI) and from(RDI) addresses to comply with REP MOVs semantics. 
643 __ movq(temp3, to); 644 __ movq(to, from); 645 __ movq(from, temp3); 646 // Save to/from for restoration post rep_mov. 647 __ movq(temp1, to); 648 __ movq(temp3, from); 649 if(shift < 3) { 650 __ shrq(temp2, 3-shift); // quad word count 651 } 652 __ movq(temp4 , temp2); // move quad ward count into temp4(RCX). 653 __ rep_mov(); 654 __ shlq(temp2, 3); // convert quad words into byte count. 655 if(shift) { 656 __ shrq(temp2, shift); // type specific count. 657 } 658 // Restore original addresses in to/from. 659 __ movq(to, temp3); 660 __ movq(from, temp1); 661 __ movq(temp4, temp2); 662 __ movq(temp1, count); 663 __ subq(temp1, temp2); // tailing part (less than a quad ward size). 664 __ jmp(L_tail); 665 } 666 667 if (MaxVectorSize > 32) { 668 __ BIND(L_pre_main_post_64); 669 // Partial copy to make dst address 64 byte aligned. 670 __ movq(temp2, to); 671 __ andq(temp2, 63); 672 __ jcc(Assembler::equal, L_main_pre_loop_64bytes); 673 674 __ negptr(temp2); 675 __ addq(temp2, 64); 676 if (shift) { 677 __ shrq(temp2, shift); 678 } 679 __ movq(temp3, temp2); 680 copy64_masked_avx(to, from, xmm1, k2, temp3, temp4, temp1, shift, 0 , true); 681 __ movq(temp4, temp2); 682 __ movq(temp1, count); 683 __ subq(temp1, temp2); 684 685 __ cmpq(temp1, loop_size[shift]); 686 __ jcc(Assembler::less, L_tail64); 687 688 __ BIND(L_main_pre_loop_64bytes); 689 __ subq(temp1, loop_size[shift]); 690 691 // Main loop with aligned copy block size of 192 bytes at 692 // 64 byte copy granularity. 693 __ align32(); 694 __ BIND(L_main_loop_64bytes); 695 copy64_avx(to, from, temp4, xmm1, false, shift, 0 , true); 696 copy64_avx(to, from, temp4, xmm1, false, shift, 64, true); 697 copy64_avx(to, from, temp4, xmm1, false, shift, 128, true); 698 __ addptr(temp4, loop_size[shift]); 699 __ subq(temp1, loop_size[shift]); 700 __ jcc(Assembler::greater, L_main_loop_64bytes); 701 702 __ addq(temp1, loop_size[shift]); 703 // Zero length check. 704 __ jcc(Assembler::lessEqual, L_exit); 705 706 __ BIND(L_tail64); 707 708 // Tail handling using 64 byte [masked] vector copy operations. 709 use64byteVector = true; 710 arraycopy_avx3_special_cases(xmm1, k2, from, to, temp1, shift, 711 temp4, temp3, use64byteVector, L_entry, L_exit); 712 } 713 __ BIND(L_exit); 714 } 715 716 __ BIND(L_finish); 717 address ucme_exit_pc = __ pc(); 718 // When called from generic_arraycopy r11 contains specific values 719 // used during arraycopy epilogue, re-initializing r11. 720 if (is_oop) { 721 __ movq(r11, shift == 3 ? 
count : to); 722 } 723 bs->arraycopy_epilogue(_masm, decorators, type, from, to, count); 724 restore_argument_regs(type); 725 INC_COUNTER_NP(get_profile_ctr(shift), rscratch1); // Update counter after rscratch1 is free 726 __ xorptr(rax, rax); // return 0 727 __ vzeroupper(); 728 __ leave(); // required for proper stackwalking of RuntimeStub frame 729 __ ret(0); 730 731 if (MaxVectorSize == 64) { 732 __ BIND(L_copy_large); 733 UnsafeMemoryAccessMark umam(this, !is_oop && !aligned, false, ucme_exit_pc); 734 arraycopy_avx3_large(to, from, temp1, temp2, temp3, temp4, count, xmm1, xmm2, xmm3, xmm4, shift); 735 __ jmp(L_finish); 736 } 737 return start; 738 } 739 740 void StubGenerator::arraycopy_avx3_large(Register to, Register from, Register temp1, Register temp2, 741 Register temp3, Register temp4, Register count, 742 XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3, 743 XMMRegister xmm4, int shift) { 744 745 // Type(shift) byte(0), short(1), int(2), long(3) 746 int loop_size[] = { 256, 128, 64, 32}; 747 int threshold[] = { 4096, 2048, 1024, 512}; 748 749 Label L_main_loop_large; 750 Label L_tail_large; 751 Label L_exit_large; 752 Label L_entry_large; 753 Label L_main_pre_loop_large; 754 Label L_pre_main_post_large; 755 756 assert(MaxVectorSize == 64, "vector length != 64"); 757 __ BIND(L_entry_large); 758 759 __ BIND(L_pre_main_post_large); 760 // Partial copy to make dst address 64 byte aligned. 761 __ movq(temp2, to); 762 __ andq(temp2, 63); 763 __ jcc(Assembler::equal, L_main_pre_loop_large); 764 765 __ negptr(temp2); 766 __ addq(temp2, 64); 767 if (shift) { 768 __ shrq(temp2, shift); 769 } 770 __ movq(temp3, temp2); 771 copy64_masked_avx(to, from, xmm1, k2, temp3, temp4, temp1, shift, 0, true); 772 __ movq(temp4, temp2); 773 __ movq(temp1, count); 774 __ subq(temp1, temp2); 775 776 __ cmpq(temp1, loop_size[shift]); 777 __ jcc(Assembler::less, L_tail_large); 778 779 __ BIND(L_main_pre_loop_large); 780 __ subq(temp1, loop_size[shift]); 781 782 // Main loop with aligned copy block size of 256 bytes at 64 byte copy granularity. 783 __ align32(); 784 __ BIND(L_main_loop_large); 785 copy256_avx3(to, from, temp4, xmm1, xmm2, xmm3, xmm4, shift, 0); 786 __ addptr(temp4, loop_size[shift]); 787 __ subq(temp1, loop_size[shift]); 788 __ jcc(Assembler::greater, L_main_loop_large); 789 // fence needed because copy256_avx3 uses non-temporal stores 790 __ sfence(); 791 792 __ addq(temp1, loop_size[shift]); 793 // Zero length check. 794 __ jcc(Assembler::lessEqual, L_exit_large); 795 __ BIND(L_tail_large); 796 // Tail handling using 64 byte [masked] vector copy operations. 
797 __ cmpq(temp1, 0); 798 __ jcc(Assembler::lessEqual, L_exit_large); 799 arraycopy_avx3_special_cases_256(xmm1, k2, from, to, temp1, shift, 800 temp4, temp3, L_exit_large); 801 __ BIND(L_exit_large); 802 } 803 804 // Inputs: 805 // c_rarg0 - source array address 806 // c_rarg1 - destination array address 807 // c_rarg2 - element count, treated as ssize_t, can be zero 808 // 809 // 810 address StubGenerator::generate_conjoint_copy_avx3_masked(address* entry, const char *name, int shift, 811 address nooverlap_target, bool aligned, 812 bool is_oop, bool dest_uninitialized) { 813 __ align(CodeEntryAlignment); 814 StubCodeMark mark(this, "StubRoutines", name); 815 address start = __ pc(); 816 817 int avx3threshold = VM_Version::avx3_threshold(); 818 bool use64byteVector = (MaxVectorSize > 32) && (avx3threshold == 0); 819 820 Label L_main_pre_loop, L_main_pre_loop_64bytes, L_pre_main_post_64; 821 Label L_main_loop, L_main_loop_64bytes, L_tail, L_tail64, L_exit, L_entry; 822 const Register from = rdi; // source array address 823 const Register to = rsi; // destination array address 824 const Register count = rdx; // elements count 825 const Register temp1 = r8; 826 const Register temp2 = rcx; 827 const Register temp3 = r11; 828 const Register temp4 = rax; 829 // End pointers are inclusive, and if count is not zero they point 830 // to the last unit copied: end_to[0] := end_from[0] 831 832 __ enter(); // required for proper stackwalking of RuntimeStub frame 833 assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int. 834 835 if (entry != nullptr) { 836 *entry = __ pc(); 837 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 838 BLOCK_COMMENT("Entry:"); 839 } 840 841 array_overlap_test(nooverlap_target, (Address::ScaleFactor)(shift)); 842 843 BasicType type_vec[] = { T_BYTE, T_SHORT, T_INT, T_LONG}; 844 BasicType type = is_oop ? T_OBJECT : type_vec[shift]; 845 846 setup_argument_regs(type); 847 848 DecoratorSet decorators = IN_HEAP | IS_ARRAY; 849 if (dest_uninitialized) { 850 decorators |= IS_DEST_UNINITIALIZED; 851 } 852 if (aligned) { 853 decorators |= ARRAYCOPY_ALIGNED; 854 } 855 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 856 bs->arraycopy_prologue(_masm, decorators, type, from, to, count); 857 { 858 // Type(shift) byte(0), short(1), int(2), long(3) 859 int loop_size[] = { 192, 96, 48, 24}; 860 int threshold[] = { 4096, 2048, 1024, 512}; 861 862 // UnsafeMemoryAccess page error: continue after unsafe access 863 UnsafeMemoryAccessMark umam(this, !is_oop && !aligned, true); 864 // 'from', 'to' and 'count' are now valid 865 866 // temp1 holds remaining count. 867 __ movq(temp1, count); 868 869 // Zero length check. 870 __ BIND(L_tail); 871 __ cmpq(temp1, 0); 872 __ jcc(Assembler::lessEqual, L_exit); 873 874 __ mov64(temp2, 0); 875 __ movq(temp3, temp1); 876 // Special cases using 32 byte [masked] vector copy operations. 877 arraycopy_avx3_special_cases_conjoint(xmm1, k2, from, to, temp2, temp3, temp1, shift, 878 temp4, use64byteVector, L_entry, L_exit); 879 880 // PRE-MAIN-POST loop for aligned copy. 881 __ BIND(L_entry); 882 883 if ((MaxVectorSize > 32) && (avx3threshold != 0)) { 884 __ cmpq(temp1, threshold[shift]); 885 __ jcc(Assembler::greaterEqual, L_pre_main_post_64); 886 } 887 888 if ((MaxVectorSize < 64) || (avx3threshold != 0)) { 889 // Partial copy to make dst address 32 byte aligned. 
890 __ leaq(temp2, Address(to, temp1, (Address::ScaleFactor)(shift), 0)); 891 __ andq(temp2, 31); 892 __ jcc(Assembler::equal, L_main_pre_loop); 893 894 if (shift) { 895 __ shrq(temp2, shift); 896 } 897 __ subq(temp1, temp2); 898 copy32_masked_avx(to, from, xmm1, k2, temp2, temp1, temp3, shift); 899 900 __ cmpq(temp1, loop_size[shift]); 901 __ jcc(Assembler::less, L_tail); 902 903 __ BIND(L_main_pre_loop); 904 905 // Main loop with aligned copy block size of 192 bytes at 32 byte granularity. 906 __ align32(); 907 __ BIND(L_main_loop); 908 copy64_avx(to, from, temp1, xmm1, true, shift, -64); 909 copy64_avx(to, from, temp1, xmm1, true, shift, -128); 910 copy64_avx(to, from, temp1, xmm1, true, shift, -192); 911 __ subptr(temp1, loop_size[shift]); 912 __ cmpq(temp1, loop_size[shift]); 913 __ jcc(Assembler::greater, L_main_loop); 914 915 // Tail loop. 916 __ jmp(L_tail); 917 } 918 919 if (MaxVectorSize > 32) { 920 __ BIND(L_pre_main_post_64); 921 // Partial copy to make dst address 64 byte aligned. 922 __ leaq(temp2, Address(to, temp1, (Address::ScaleFactor)(shift), 0)); 923 __ andq(temp2, 63); 924 __ jcc(Assembler::equal, L_main_pre_loop_64bytes); 925 926 if (shift) { 927 __ shrq(temp2, shift); 928 } 929 __ subq(temp1, temp2); 930 copy64_masked_avx(to, from, xmm1, k2, temp2, temp1, temp3, shift, 0 , true); 931 932 __ cmpq(temp1, loop_size[shift]); 933 __ jcc(Assembler::less, L_tail64); 934 935 __ BIND(L_main_pre_loop_64bytes); 936 937 // Main loop with aligned copy block size of 192 bytes at 938 // 64 byte copy granularity. 939 __ align32(); 940 __ BIND(L_main_loop_64bytes); 941 copy64_avx(to, from, temp1, xmm1, true, shift, -64 , true); 942 copy64_avx(to, from, temp1, xmm1, true, shift, -128, true); 943 copy64_avx(to, from, temp1, xmm1, true, shift, -192, true); 944 __ subq(temp1, loop_size[shift]); 945 __ cmpq(temp1, loop_size[shift]); 946 __ jcc(Assembler::greater, L_main_loop_64bytes); 947 948 // Zero length check. 949 __ cmpq(temp1, 0); 950 __ jcc(Assembler::lessEqual, L_exit); 951 952 __ BIND(L_tail64); 953 954 // Tail handling using 64 byte [masked] vector copy operations. 955 use64byteVector = true; 956 __ mov64(temp2, 0); 957 __ movq(temp3, temp1); 958 arraycopy_avx3_special_cases_conjoint(xmm1, k2, from, to, temp2, temp3, temp1, shift, 959 temp4, use64byteVector, L_entry, L_exit); 960 } 961 __ BIND(L_exit); 962 } 963 address ucme_exit_pc = __ pc(); 964 // When called from generic_arraycopy r11 contains specific values 965 // used during arraycopy epilogue, re-initializing r11. 
966 if(is_oop) { 967 __ movq(r11, count); 968 } 969 bs->arraycopy_epilogue(_masm, decorators, type, from, to, count); 970 restore_argument_regs(type); 971 INC_COUNTER_NP(get_profile_ctr(shift), rscratch1); // Update counter after rscratch1 is free 972 __ xorptr(rax, rax); // return 0 973 __ vzeroupper(); 974 __ leave(); // required for proper stackwalking of RuntimeStub frame 975 __ ret(0); 976 977 return start; 978 } 979 980 void StubGenerator::arraycopy_avx3_special_cases(XMMRegister xmm, KRegister mask, Register from, 981 Register to, Register count, int shift, 982 Register index, Register temp, 983 bool use64byteVector, Label& L_entry, Label& L_exit) { 984 Label L_entry_64, L_entry_96, L_entry_128; 985 Label L_entry_160, L_entry_192; 986 987 int size_mat[][6] = { 988 /* T_BYTE */ {32 , 64, 96 , 128 , 160 , 192 }, 989 /* T_SHORT*/ {16 , 32, 48 , 64 , 80 , 96 }, 990 /* T_INT */ {8 , 16, 24 , 32 , 40 , 48 }, 991 /* T_LONG */ {4 , 8, 12 , 16 , 20 , 24 } 992 }; 993 994 // Case A) Special case for length less than equal to 32 bytes. 995 __ cmpq(count, size_mat[shift][0]); 996 __ jccb(Assembler::greater, L_entry_64); 997 copy32_masked_avx(to, from, xmm, mask, count, index, temp, shift); 998 __ jmp(L_exit); 999 1000 // Case B) Special case for length less than equal to 64 bytes. 1001 __ BIND(L_entry_64); 1002 __ cmpq(count, size_mat[shift][1]); 1003 __ jccb(Assembler::greater, L_entry_96); 1004 copy64_masked_avx(to, from, xmm, mask, count, index, temp, shift, 0, use64byteVector); 1005 __ jmp(L_exit); 1006 1007 // Case C) Special case for length less than equal to 96 bytes. 1008 __ BIND(L_entry_96); 1009 __ cmpq(count, size_mat[shift][2]); 1010 __ jccb(Assembler::greater, L_entry_128); 1011 copy64_avx(to, from, index, xmm, false, shift, 0, use64byteVector); 1012 __ subq(count, 64 >> shift); 1013 copy32_masked_avx(to, from, xmm, mask, count, index, temp, shift, 64); 1014 __ jmp(L_exit); 1015 1016 // Case D) Special case for length less than equal to 128 bytes. 1017 __ BIND(L_entry_128); 1018 __ cmpq(count, size_mat[shift][3]); 1019 __ jccb(Assembler::greater, L_entry_160); 1020 copy64_avx(to, from, index, xmm, false, shift, 0, use64byteVector); 1021 copy32_avx(to, from, index, xmm, shift, 64); 1022 __ subq(count, 96 >> shift); 1023 copy32_masked_avx(to, from, xmm, mask, count, index, temp, shift, 96); 1024 __ jmp(L_exit); 1025 1026 // Case E) Special case for length less than equal to 160 bytes. 1027 __ BIND(L_entry_160); 1028 __ cmpq(count, size_mat[shift][4]); 1029 __ jccb(Assembler::greater, L_entry_192); 1030 copy64_avx(to, from, index, xmm, false, shift, 0, use64byteVector); 1031 copy64_avx(to, from, index, xmm, false, shift, 64, use64byteVector); 1032 __ subq(count, 128 >> shift); 1033 copy32_masked_avx(to, from, xmm, mask, count, index, temp, shift, 128); 1034 __ jmp(L_exit); 1035 1036 // Case F) Special case for length less than equal to 192 bytes. 
1037 __ BIND(L_entry_192); 1038 __ cmpq(count, size_mat[shift][5]); 1039 __ jcc(Assembler::greater, L_entry); 1040 copy64_avx(to, from, index, xmm, false, shift, 0, use64byteVector); 1041 copy64_avx(to, from, index, xmm, false, shift, 64, use64byteVector); 1042 copy32_avx(to, from, index, xmm, shift, 128); 1043 __ subq(count, 160 >> shift); 1044 copy32_masked_avx(to, from, xmm, mask, count, index, temp, shift, 160); 1045 __ jmp(L_exit); 1046 } 1047 1048 void StubGenerator::arraycopy_avx3_special_cases_256(XMMRegister xmm, KRegister mask, Register from, 1049 Register to, Register count, int shift, Register index, 1050 Register temp, Label& L_exit) { 1051 Label L_entry_64, L_entry_128, L_entry_192, L_entry_256; 1052 1053 int size_mat[][4] = { 1054 /* T_BYTE */ {64, 128, 192, 256}, 1055 /* T_SHORT*/ {32, 64 , 96 , 128}, 1056 /* T_INT */ {16, 32 , 48 , 64}, 1057 /* T_LONG */ { 8, 16 , 24 , 32} 1058 }; 1059 1060 assert(MaxVectorSize == 64, "vector length != 64"); 1061 // Case A) Special case for length less than or equal to 64 bytes. 1062 __ BIND(L_entry_64); 1063 __ cmpq(count, size_mat[shift][0]); 1064 __ jccb(Assembler::greater, L_entry_128); 1065 copy64_masked_avx(to, from, xmm, mask, count, index, temp, shift, 0, true); 1066 __ jmp(L_exit); 1067 1068 // Case B) Special case for length less than or equal to 128 bytes. 1069 __ BIND(L_entry_128); 1070 __ cmpq(count, size_mat[shift][1]); 1071 __ jccb(Assembler::greater, L_entry_192); 1072 copy64_avx(to, from, index, xmm, false, shift, 0, true); 1073 __ subq(count, 64 >> shift); 1074 copy64_masked_avx(to, from, xmm, mask, count, index, temp, shift, 64, true); 1075 __ jmp(L_exit); 1076 1077 // Case C) Special case for length less than or equal to 192 bytes. 1078 __ BIND(L_entry_192); 1079 __ cmpq(count, size_mat[shift][2]); 1080 __ jcc(Assembler::greater, L_entry_256); 1081 copy64_avx(to, from, index, xmm, false, shift, 0, true); 1082 copy64_avx(to, from, index, xmm, false, shift, 64, true); 1083 __ subq(count, 128 >> shift); 1084 copy64_masked_avx(to, from, xmm, mask, count, index, temp, shift, 128, true); 1085 __ jmp(L_exit); 1086 1087 // Case D) Special case for length less than or equal to 256 bytes. 1088 __ BIND(L_entry_256); 1089 copy64_avx(to, from, index, xmm, false, shift, 0, true); 1090 copy64_avx(to, from, index, xmm, false, shift, 64, true); 1091 copy64_avx(to, from, index, xmm, false, shift, 128, true); 1092 __ subq(count, 192 >> shift); 1093 copy64_masked_avx(to, from, xmm, mask, count, index, temp, shift, 192, true); 1094 __ jmp(L_exit); 1095 } 1096 1097 void StubGenerator::arraycopy_avx3_special_cases_conjoint(XMMRegister xmm, KRegister mask, Register from, 1098 Register to, Register start_index, Register end_index, 1099 Register count, int shift, Register temp, 1100 bool use64byteVector, Label& L_entry, Label& L_exit) { 1101 Label L_entry_64, L_entry_96, L_entry_128; 1102 Label L_entry_160, L_entry_192; 1103 bool avx3 = (MaxVectorSize > 32) && (VM_Version::avx3_threshold() == 0); 1104 1105 int size_mat[][6] = { 1106 /* T_BYTE */ {32 , 64, 96 , 128 , 160 , 192 }, 1107 /* T_SHORT*/ {16 , 32, 48 , 64 , 80 , 96 }, 1108 /* T_INT */ {8 , 16, 24 , 32 , 40 , 48 }, 1109 /* T_LONG */ {4 , 8, 12 , 16 , 20 , 24 } 1110 }; 1111 1112 // Case A) Special case for length less than equal to 32 bytes. 
1113 __ cmpq(count, size_mat[shift][0]); 1114 __ jccb(Assembler::greater, L_entry_64); 1115 copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift); 1116 __ jmp(L_exit); 1117 1118 // Case B) Special case for length less than equal to 64 bytes. 1119 __ BIND(L_entry_64); 1120 __ cmpq(count, size_mat[shift][1]); 1121 __ jccb(Assembler::greater, L_entry_96); 1122 if (avx3) { 1123 copy64_masked_avx(to, from, xmm, mask, count, start_index, temp, shift, 0, true); 1124 } else { 1125 copy32_avx(to, from, end_index, xmm, shift, -32); 1126 __ subq(count, 32 >> shift); 1127 copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift); 1128 } 1129 __ jmp(L_exit); 1130 1131 // Case C) Special case for length less than equal to 96 bytes. 1132 __ BIND(L_entry_96); 1133 __ cmpq(count, size_mat[shift][2]); 1134 __ jccb(Assembler::greater, L_entry_128); 1135 copy64_avx(to, from, end_index, xmm, true, shift, -64, use64byteVector); 1136 __ subq(count, 64 >> shift); 1137 copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift); 1138 __ jmp(L_exit); 1139 1140 // Case D) Special case for length less than equal to 128 bytes. 1141 __ BIND(L_entry_128); 1142 __ cmpq(count, size_mat[shift][3]); 1143 __ jccb(Assembler::greater, L_entry_160); 1144 copy64_avx(to, from, end_index, xmm, true, shift, -64, use64byteVector); 1145 copy32_avx(to, from, end_index, xmm, shift, -96); 1146 __ subq(count, 96 >> shift); 1147 copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift); 1148 __ jmp(L_exit); 1149 1150 // Case E) Special case for length less than equal to 160 bytes. 1151 __ BIND(L_entry_160); 1152 __ cmpq(count, size_mat[shift][4]); 1153 __ jccb(Assembler::greater, L_entry_192); 1154 copy64_avx(to, from, end_index, xmm, true, shift, -64, use64byteVector); 1155 copy64_avx(to, from, end_index, xmm, true, shift, -128, use64byteVector); 1156 __ subq(count, 128 >> shift); 1157 copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift); 1158 __ jmp(L_exit); 1159 1160 // Case F) Special case for length less than equal to 192 bytes. 
1161 __ BIND(L_entry_192); 1162 __ cmpq(count, size_mat[shift][5]); 1163 __ jcc(Assembler::greater, L_entry); 1164 copy64_avx(to, from, end_index, xmm, true, shift, -64, use64byteVector); 1165 copy64_avx(to, from, end_index, xmm, true, shift, -128, use64byteVector); 1166 copy32_avx(to, from, end_index, xmm, shift, -160); 1167 __ subq(count, 160 >> shift); 1168 copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift); 1169 __ jmp(L_exit); 1170 } 1171 1172 void StubGenerator::copy256_avx3(Register dst, Register src, Register index, XMMRegister xmm1, 1173 XMMRegister xmm2, XMMRegister xmm3, XMMRegister xmm4, 1174 int shift, int offset) { 1175 if (MaxVectorSize == 64) { 1176 Address::ScaleFactor scale = (Address::ScaleFactor)(shift); 1177 __ prefetcht0(Address(src, index, scale, offset + 0x200)); 1178 __ prefetcht0(Address(src, index, scale, offset + 0x240)); 1179 __ prefetcht0(Address(src, index, scale, offset + 0x280)); 1180 __ prefetcht0(Address(src, index, scale, offset + 0x2C0)); 1181 1182 __ prefetcht0(Address(src, index, scale, offset + 0x400)); 1183 __ prefetcht0(Address(src, index, scale, offset + 0x440)); 1184 __ prefetcht0(Address(src, index, scale, offset + 0x480)); 1185 __ prefetcht0(Address(src, index, scale, offset + 0x4C0)); 1186 1187 __ evmovdquq(xmm1, Address(src, index, scale, offset), Assembler::AVX_512bit); 1188 __ evmovdquq(xmm2, Address(src, index, scale, offset + 0x40), Assembler::AVX_512bit); 1189 __ evmovdquq(xmm3, Address(src, index, scale, offset + 0x80), Assembler::AVX_512bit); 1190 __ evmovdquq(xmm4, Address(src, index, scale, offset + 0xC0), Assembler::AVX_512bit); 1191 1192 __ evmovntdquq(Address(dst, index, scale, offset), xmm1, Assembler::AVX_512bit); 1193 __ evmovntdquq(Address(dst, index, scale, offset + 0x40), xmm2, Assembler::AVX_512bit); 1194 __ evmovntdquq(Address(dst, index, scale, offset + 0x80), xmm3, Assembler::AVX_512bit); 1195 __ evmovntdquq(Address(dst, index, scale, offset + 0xC0), xmm4, Assembler::AVX_512bit); 1196 } 1197 } 1198 1199 void StubGenerator::copy64_masked_avx(Register dst, Register src, XMMRegister xmm, 1200 KRegister mask, Register length, Register index, 1201 Register temp, int shift, int offset, 1202 bool use64byteVector) { 1203 BasicType type[] = { T_BYTE, T_SHORT, T_INT, T_LONG}; 1204 assert(MaxVectorSize >= 32, "vector length should be >= 32"); 1205 if (!use64byteVector) { 1206 copy32_avx(dst, src, index, xmm, shift, offset); 1207 __ subptr(length, 32 >> shift); 1208 copy32_masked_avx(dst, src, xmm, mask, length, index, temp, shift, offset+32); 1209 } else { 1210 Address::ScaleFactor scale = (Address::ScaleFactor)(shift); 1211 assert(MaxVectorSize == 64, "vector length != 64"); 1212 __ mov64(temp, -1L); 1213 __ bzhiq(temp, temp, length); 1214 __ kmovql(mask, temp); 1215 __ evmovdqu(type[shift], mask, xmm, Address(src, index, scale, offset), false, Assembler::AVX_512bit); 1216 __ evmovdqu(type[shift], mask, Address(dst, index, scale, offset), xmm, true, Assembler::AVX_512bit); 1217 } 1218 } 1219 1220 1221 void StubGenerator::copy32_masked_avx(Register dst, Register src, XMMRegister xmm, 1222 KRegister mask, Register length, Register index, 1223 Register temp, int shift, int offset) { 1224 assert(MaxVectorSize >= 32, "vector length should be >= 32"); 1225 BasicType type[] = { T_BYTE, T_SHORT, T_INT, T_LONG}; 1226 Address::ScaleFactor scale = (Address::ScaleFactor)(shift); 1227 __ mov64(temp, -1L); 1228 __ bzhiq(temp, temp, length); 1229 __ kmovql(mask, temp); 1230 __ evmovdqu(type[shift], mask, xmm, Address(src, 
index, scale, offset), false, Assembler::AVX_256bit); 1231 __ evmovdqu(type[shift], mask, Address(dst, index, scale, offset), xmm, true, Assembler::AVX_256bit); 1232 } 1233 1234 1235 void StubGenerator::copy32_avx(Register dst, Register src, Register index, XMMRegister xmm, 1236 int shift, int offset) { 1237 assert(MaxVectorSize >= 32, "vector length should be >= 32"); 1238 Address::ScaleFactor scale = (Address::ScaleFactor)(shift); 1239 __ vmovdqu(xmm, Address(src, index, scale, offset)); 1240 __ vmovdqu(Address(dst, index, scale, offset), xmm); 1241 } 1242 1243 1244 void StubGenerator::copy64_avx(Register dst, Register src, Register index, XMMRegister xmm, 1245 bool conjoint, int shift, int offset, bool use64byteVector) { 1246 assert(MaxVectorSize == 64 || MaxVectorSize == 32, "vector length mismatch"); 1247 if (!use64byteVector) { 1248 if (conjoint) { 1249 copy32_avx(dst, src, index, xmm, shift, offset+32); 1250 copy32_avx(dst, src, index, xmm, shift, offset); 1251 } else { 1252 copy32_avx(dst, src, index, xmm, shift, offset); 1253 copy32_avx(dst, src, index, xmm, shift, offset+32); 1254 } 1255 } else { 1256 Address::ScaleFactor scale = (Address::ScaleFactor)(shift); 1257 __ evmovdquq(xmm, Address(src, index, scale, offset), Assembler::AVX_512bit); 1258 __ evmovdquq(Address(dst, index, scale, offset), xmm, Assembler::AVX_512bit); 1259 } 1260 } 1261 1262 #endif // COMPILER2_OR_JVMCI 1263 1264 1265 // Arguments: 1266 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1267 // ignored 1268 // name - stub name string 1269 // 1270 // Inputs: 1271 // c_rarg0 - source array address 1272 // c_rarg1 - destination array address 1273 // c_rarg2 - element count, treated as ssize_t, can be zero 1274 // 1275 // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries, 1276 // we let the hardware handle it. The one to eight bytes within words, 1277 // dwords or qwords that span cache line boundaries will still be loaded 1278 // and stored atomically. 1279 // 1280 // Side Effects: 1281 // disjoint_byte_copy_entry is set to the no-overlap entry point 1282 // used by generate_conjoint_byte_copy(). 1283 // 1284 address StubGenerator::generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) { 1285 #if COMPILER2_OR_JVMCI 1286 if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) { 1287 return generate_disjoint_copy_avx3_masked(entry, "jbyte_disjoint_arraycopy_avx3", 0, 1288 aligned, false, false); 1289 } 1290 #endif 1291 __ align(CodeEntryAlignment); 1292 StubCodeMark mark(this, "StubRoutines", name); 1293 address start = __ pc(); 1294 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT; 1295 1296 Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes; 1297 Label L_copy_byte, L_exit; 1298 const Register from = rdi; // source array address 1299 const Register to = rsi; // destination array address 1300 const Register count = rdx; // elements count 1301 const Register byte_count = rcx; 1302 const Register qword_count = count; 1303 const Register end_from = from; // source array end address 1304 const Register end_to = to; // destination array end address 1305 // End pointers are inclusive, and if count is not zero they point 1306 // to the last unit copied: end_to[0] := end_from[0] 1307 1308 __ enter(); // required for proper stackwalking of RuntimeStub frame 1309 assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int. 
1310 1311 if (entry != nullptr) { 1312 *entry = __ pc(); 1313 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 1314 BLOCK_COMMENT("Entry:"); 1315 } 1316 1317 setup_arg_regs(); // from => rdi, to => rsi, count => rdx 1318 // r9 and r10 may be used to save non-volatile registers 1319 1320 { 1321 // UnsafeMemoryAccess page error: continue after unsafe access 1322 UnsafeMemoryAccessMark umam(this, !aligned, true); 1323 // 'from', 'to' and 'count' are now valid 1324 __ movptr(byte_count, count); 1325 __ shrptr(count, 3); // count => qword_count 1326 1327 // Copy from low to high addresses. Use 'to' as scratch. 1328 __ lea(end_from, Address(from, qword_count, Address::times_8, -8)); 1329 __ lea(end_to, Address(to, qword_count, Address::times_8, -8)); 1330 __ negptr(qword_count); // make the count negative 1331 __ jmp(L_copy_bytes); 1332 1333 // Copy trailing qwords 1334 __ BIND(L_copy_8_bytes); 1335 __ movq(rax, Address(end_from, qword_count, Address::times_8, 8)); 1336 __ movq(Address(end_to, qword_count, Address::times_8, 8), rax); 1337 __ increment(qword_count); 1338 __ jcc(Assembler::notZero, L_copy_8_bytes); 1339 1340 // Check for and copy trailing dword 1341 __ BIND(L_copy_4_bytes); 1342 __ testl(byte_count, 4); 1343 __ jccb(Assembler::zero, L_copy_2_bytes); 1344 __ movl(rax, Address(end_from, 8)); 1345 __ movl(Address(end_to, 8), rax); 1346 1347 __ addptr(end_from, 4); 1348 __ addptr(end_to, 4); 1349 1350 // Check for and copy trailing word 1351 __ BIND(L_copy_2_bytes); 1352 __ testl(byte_count, 2); 1353 __ jccb(Assembler::zero, L_copy_byte); 1354 __ movw(rax, Address(end_from, 8)); 1355 __ movw(Address(end_to, 8), rax); 1356 1357 __ addptr(end_from, 2); 1358 __ addptr(end_to, 2); 1359 1360 // Check for and copy trailing byte 1361 __ BIND(L_copy_byte); 1362 __ testl(byte_count, 1); 1363 __ jccb(Assembler::zero, L_exit); 1364 __ movb(rax, Address(end_from, 8)); 1365 __ movb(Address(end_to, 8), rax); 1366 } 1367 __ BIND(L_exit); 1368 address ucme_exit_pc = __ pc(); 1369 restore_arg_regs(); 1370 INC_COUNTER_NP(SharedRuntime::_jbyte_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free 1371 __ xorptr(rax, rax); // return 0 1372 __ vzeroupper(); 1373 __ leave(); // required for proper stackwalking of RuntimeStub frame 1374 __ ret(0); 1375 1376 { 1377 UnsafeMemoryAccessMark umam(this, !aligned, false, ucme_exit_pc); 1378 // Copy in multi-bytes chunks 1379 copy_bytes_forward(end_from, end_to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, T_BYTE); 1380 __ jmp(L_copy_4_bytes); 1381 } 1382 return start; 1383 } 1384 1385 1386 // Arguments: 1387 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1388 // ignored 1389 // name - stub name string 1390 // 1391 // Inputs: 1392 // c_rarg0 - source array address 1393 // c_rarg1 - destination array address 1394 // c_rarg2 - element count, treated as ssize_t, can be zero 1395 // 1396 // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries, 1397 // we let the hardware handle it. The one to eight bytes within words, 1398 // dwords or qwords that span cache line boundaries will still be loaded 1399 // and stored atomically. 
1400 // 1401 address StubGenerator::generate_conjoint_byte_copy(bool aligned, address nooverlap_target, 1402 address* entry, const char *name) { 1403 #if COMPILER2_OR_JVMCI 1404 if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) { 1405 return generate_conjoint_copy_avx3_masked(entry, "jbyte_conjoint_arraycopy_avx3", 0, 1406 nooverlap_target, aligned, false, false); 1407 } 1408 #endif 1409 __ align(CodeEntryAlignment); 1410 StubCodeMark mark(this, "StubRoutines", name); 1411 address start = __ pc(); 1412 DecoratorSet decorators = IN_HEAP | IS_ARRAY; 1413 1414 Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes; 1415 const Register from = rdi; // source array address 1416 const Register to = rsi; // destination array address 1417 const Register count = rdx; // elements count 1418 const Register byte_count = rcx; 1419 const Register qword_count = count; 1420 1421 __ enter(); // required for proper stackwalking of RuntimeStub frame 1422 assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int. 1423 1424 if (entry != nullptr) { 1425 *entry = __ pc(); 1426 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 1427 BLOCK_COMMENT("Entry:"); 1428 } 1429 1430 array_overlap_test(nooverlap_target, Address::times_1); 1431 setup_arg_regs(); // from => rdi, to => rsi, count => rdx 1432 // r9 and r10 may be used to save non-volatile registers 1433 1434 { 1435 // UnsafeMemoryAccess page error: continue after unsafe access 1436 UnsafeMemoryAccessMark umam(this, !aligned, true); 1437 // 'from', 'to' and 'count' are now valid 1438 __ movptr(byte_count, count); 1439 __ shrptr(count, 3); // count => qword_count 1440 1441 // Copy from high to low addresses. 1442 1443 // Check for and copy trailing byte 1444 __ testl(byte_count, 1); 1445 __ jcc(Assembler::zero, L_copy_2_bytes); 1446 __ movb(rax, Address(from, byte_count, Address::times_1, -1)); 1447 __ movb(Address(to, byte_count, Address::times_1, -1), rax); 1448 __ decrement(byte_count); // Adjust for possible trailing word 1449 1450 // Check for and copy trailing word 1451 __ BIND(L_copy_2_bytes); 1452 __ testl(byte_count, 2); 1453 __ jcc(Assembler::zero, L_copy_4_bytes); 1454 __ movw(rax, Address(from, byte_count, Address::times_1, -2)); 1455 __ movw(Address(to, byte_count, Address::times_1, -2), rax); 1456 1457 // Check for and copy trailing dword 1458 __ BIND(L_copy_4_bytes); 1459 __ testl(byte_count, 4); 1460 __ jcc(Assembler::zero, L_copy_bytes); 1461 __ movl(rax, Address(from, qword_count, Address::times_8)); 1462 __ movl(Address(to, qword_count, Address::times_8), rax); 1463 __ jmp(L_copy_bytes); 1464 1465 // Copy trailing qwords 1466 __ BIND(L_copy_8_bytes); 1467 __ movq(rax, Address(from, qword_count, Address::times_8, -8)); 1468 __ movq(Address(to, qword_count, Address::times_8, -8), rax); 1469 __ decrement(qword_count); 1470 __ jcc(Assembler::notZero, L_copy_8_bytes); 1471 } 1472 restore_arg_regs(); 1473 INC_COUNTER_NP(SharedRuntime::_jbyte_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free 1474 __ xorptr(rax, rax); // return 0 1475 __ vzeroupper(); 1476 __ leave(); // required for proper stackwalking of RuntimeStub frame 1477 __ ret(0); 1478 1479 { 1480 // UnsafeMemoryAccess page error: continue after unsafe access 1481 UnsafeMemoryAccessMark umam(this, !aligned, true); 1482 // Copy in multi-bytes chunks 1483 copy_bytes_backward(from, to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, T_BYTE); 1484 } 1485 restore_arg_regs(); 1486 
INC_COUNTER_NP(SharedRuntime::_jbyte_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free 1487 __ xorptr(rax, rax); // return 0 1488 __ vzeroupper(); 1489 __ leave(); // required for proper stackwalking of RuntimeStub frame 1490 __ ret(0); 1491 1492 return start; 1493 } 1494 1495 1496 // Arguments: 1497 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1498 // ignored 1499 // name - stub name string 1500 // 1501 // Inputs: 1502 // c_rarg0 - source array address 1503 // c_rarg1 - destination array address 1504 // c_rarg2 - element count, treated as ssize_t, can be zero 1505 // 1506 // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we 1507 // let the hardware handle it. The two or four words within dwords 1508 // or qwords that span cache line boundaries will still be loaded 1509 // and stored atomically. 1510 // 1511 // Side Effects: 1512 // disjoint_short_copy_entry is set to the no-overlap entry point 1513 // used by generate_conjoint_short_copy(). 1514 // 1515 address StubGenerator::generate_disjoint_short_copy(bool aligned, address *entry, const char *name) { 1516 #if COMPILER2_OR_JVMCI 1517 if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) { 1518 return generate_disjoint_copy_avx3_masked(entry, "jshort_disjoint_arraycopy_avx3", 1, 1519 aligned, false, false); 1520 } 1521 #endif 1522 1523 __ align(CodeEntryAlignment); 1524 StubCodeMark mark(this, "StubRoutines", name); 1525 address start = __ pc(); 1526 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT; 1527 1528 Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes,L_copy_2_bytes,L_exit; 1529 const Register from = rdi; // source array address 1530 const Register to = rsi; // destination array address 1531 const Register count = rdx; // elements count 1532 const Register word_count = rcx; 1533 const Register qword_count = count; 1534 const Register end_from = from; // source array end address 1535 const Register end_to = to; // destination array end address 1536 // End pointers are inclusive, and if count is not zero they point 1537 // to the last unit copied: end_to[0] := end_from[0] 1538 1539 __ enter(); // required for proper stackwalking of RuntimeStub frame 1540 assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int. 1541 1542 if (entry != nullptr) { 1543 *entry = __ pc(); 1544 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 1545 BLOCK_COMMENT("Entry:"); 1546 } 1547 1548 setup_arg_regs(); // from => rdi, to => rsi, count => rdx 1549 // r9 and r10 may be used to save non-volatile registers 1550 1551 { 1552 // UnsafeMemoryAccess page error: continue after unsafe access 1553 UnsafeMemoryAccessMark umam(this, !aligned, true); 1554 // 'from', 'to' and 'count' are now valid 1555 __ movptr(word_count, count); 1556 __ shrptr(count, 2); // count => qword_count 1557 1558 // Copy from low to high addresses. Use 'to' as scratch. 
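    // Indexing sketch (intent only, not generated code): the end pointers are
    // biased to the last qword and qword_count is negated, so the qword loop
    // below is effectively
    //   for (i = -n; i != 0; i++)  end_to[i] = end_from[i];   // one qword per step
    // i.e. it walks low-to-high while a single increment-and-test both
    // advances the index and terminates at zero.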
1559 __ lea(end_from, Address(from, qword_count, Address::times_8, -8)); 1560 __ lea(end_to, Address(to, qword_count, Address::times_8, -8)); 1561 __ negptr(qword_count); 1562 __ jmp(L_copy_bytes); 1563 1564 // Copy trailing qwords 1565 __ BIND(L_copy_8_bytes); 1566 __ movq(rax, Address(end_from, qword_count, Address::times_8, 8)); 1567 __ movq(Address(end_to, qword_count, Address::times_8, 8), rax); 1568 __ increment(qword_count); 1569 __ jcc(Assembler::notZero, L_copy_8_bytes); 1570 1571 // Original 'dest' is trashed, so we can't use it as a 1572 // base register for a possible trailing word copy 1573 1574 // Check for and copy trailing dword 1575 __ BIND(L_copy_4_bytes); 1576 __ testl(word_count, 2); 1577 __ jccb(Assembler::zero, L_copy_2_bytes); 1578 __ movl(rax, Address(end_from, 8)); 1579 __ movl(Address(end_to, 8), rax); 1580 1581 __ addptr(end_from, 4); 1582 __ addptr(end_to, 4); 1583 1584 // Check for and copy trailing word 1585 __ BIND(L_copy_2_bytes); 1586 __ testl(word_count, 1); 1587 __ jccb(Assembler::zero, L_exit); 1588 __ movw(rax, Address(end_from, 8)); 1589 __ movw(Address(end_to, 8), rax); 1590 } 1591 __ BIND(L_exit); 1592 address ucme_exit_pc = __ pc(); 1593 restore_arg_regs(); 1594 INC_COUNTER_NP(SharedRuntime::_jshort_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free 1595 __ xorptr(rax, rax); // return 0 1596 __ vzeroupper(); 1597 __ leave(); // required for proper stackwalking of RuntimeStub frame 1598 __ ret(0); 1599 1600 { 1601 UnsafeMemoryAccessMark umam(this, !aligned, false, ucme_exit_pc); 1602 // Copy in multi-bytes chunks 1603 copy_bytes_forward(end_from, end_to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, T_SHORT); 1604 __ jmp(L_copy_4_bytes); 1605 } 1606 1607 return start; 1608 } 1609 1610 1611 address StubGenerator::generate_fill(BasicType t, bool aligned, const char *name) { 1612 __ align(CodeEntryAlignment); 1613 StubCodeMark mark(this, "StubRoutines", name); 1614 address start = __ pc(); 1615 1616 BLOCK_COMMENT("Entry:"); 1617 1618 const Register to = c_rarg0; // destination array address 1619 const Register value = c_rarg1; // value 1620 const Register count = c_rarg2; // elements count 1621 __ mov(r11, count); 1622 1623 __ enter(); // required for proper stackwalking of RuntimeStub frame 1624 1625 { 1626 // Add set memory mark to protect against unsafe accesses faulting 1627 UnsafeMemoryAccessMark umam(this, ((t == T_BYTE) && !aligned), true); 1628 __ generate_fill(t, aligned, to, value, r11, rax, xmm0); 1629 } 1630 1631 __ vzeroupper(); 1632 __ leave(); // required for proper stackwalking of RuntimeStub frame 1633 __ ret(0); 1634 1635 return start; 1636 } 1637 1638 1639 // Arguments: 1640 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1641 // ignored 1642 // name - stub name string 1643 // 1644 // Inputs: 1645 // c_rarg0 - source array address 1646 // c_rarg1 - destination array address 1647 // c_rarg2 - element count, treated as ssize_t, can be zero 1648 // 1649 // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we 1650 // let the hardware handle it. The two or four words within dwords 1651 // or qwords that span cache line boundaries will still be loaded 1652 // and stored atomically. 
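// (Conjoint means the two ranges may overlap: array_overlap_test() below
// branches to the disjoint, forward-copy entry when a forward copy is safe,
// and only otherwise falls through to the backward, high-to-low copy
// generated here.)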
1653 // 1654 address StubGenerator::generate_conjoint_short_copy(bool aligned, address nooverlap_target, 1655 address *entry, const char *name) { 1656 #if COMPILER2_OR_JVMCI 1657 if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) { 1658 return generate_conjoint_copy_avx3_masked(entry, "jshort_conjoint_arraycopy_avx3", 1, 1659 nooverlap_target, aligned, false, false); 1660 } 1661 #endif 1662 __ align(CodeEntryAlignment); 1663 StubCodeMark mark(this, "StubRoutines", name); 1664 address start = __ pc(); 1665 DecoratorSet decorators = IN_HEAP | IS_ARRAY; 1666 1667 Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes; 1668 const Register from = rdi; // source array address 1669 const Register to = rsi; // destination array address 1670 const Register count = rdx; // elements count 1671 const Register word_count = rcx; 1672 const Register qword_count = count; 1673 1674 __ enter(); // required for proper stackwalking of RuntimeStub frame 1675 assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int. 1676 1677 if (entry != nullptr) { 1678 *entry = __ pc(); 1679 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 1680 BLOCK_COMMENT("Entry:"); 1681 } 1682 1683 array_overlap_test(nooverlap_target, Address::times_2); 1684 setup_arg_regs(); // from => rdi, to => rsi, count => rdx 1685 // r9 and r10 may be used to save non-volatile registers 1686 1687 { 1688 // UnsafeMemoryAccess page error: continue after unsafe access 1689 UnsafeMemoryAccessMark umam(this, !aligned, true); 1690 // 'from', 'to' and 'count' are now valid 1691 __ movptr(word_count, count); 1692 __ shrptr(count, 2); // count => qword_count 1693 1694 // Copy from high to low addresses. Use 'to' as scratch. 1695 1696 // Check for and copy trailing word 1697 __ testl(word_count, 1); 1698 __ jccb(Assembler::zero, L_copy_4_bytes); 1699 __ movw(rax, Address(from, word_count, Address::times_2, -2)); 1700 __ movw(Address(to, word_count, Address::times_2, -2), rax); 1701 1702 // Check for and copy trailing dword 1703 __ BIND(L_copy_4_bytes); 1704 __ testl(word_count, 2); 1705 __ jcc(Assembler::zero, L_copy_bytes); 1706 __ movl(rax, Address(from, qword_count, Address::times_8)); 1707 __ movl(Address(to, qword_count, Address::times_8), rax); 1708 __ jmp(L_copy_bytes); 1709 1710 // Copy trailing qwords 1711 __ BIND(L_copy_8_bytes); 1712 __ movq(rax, Address(from, qword_count, Address::times_8, -8)); 1713 __ movq(Address(to, qword_count, Address::times_8, -8), rax); 1714 __ decrement(qword_count); 1715 __ jcc(Assembler::notZero, L_copy_8_bytes); 1716 } 1717 restore_arg_regs(); 1718 INC_COUNTER_NP(SharedRuntime::_jshort_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free 1719 __ xorptr(rax, rax); // return 0 1720 __ vzeroupper(); 1721 __ leave(); // required for proper stackwalking of RuntimeStub frame 1722 __ ret(0); 1723 1724 { 1725 // UnsafeMemoryAccess page error: continue after unsafe access 1726 UnsafeMemoryAccessMark umam(this, !aligned, true); 1727 // Copy in multi-bytes chunks 1728 copy_bytes_backward(from, to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, T_SHORT); 1729 } 1730 restore_arg_regs(); 1731 INC_COUNTER_NP(SharedRuntime::_jshort_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free 1732 __ xorptr(rax, rax); // return 0 1733 __ vzeroupper(); 1734 __ leave(); // required for proper stackwalking of RuntimeStub frame 1735 __ ret(0); 1736 1737 return start; 1738 } 1739 1740 1741 // Arguments: 1742 // aligned - true => 
Input and output aligned on a HeapWord == 8-byte boundary 1743 // ignored 1744 // is_oop - true => oop array, so generate store check code 1745 // name - stub name string 1746 // 1747 // Inputs: 1748 // c_rarg0 - source array address 1749 // c_rarg1 - destination array address 1750 // c_rarg2 - element count, treated as ssize_t, can be zero 1751 // 1752 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1753 // the hardware handle it. The two dwords within qwords that span 1754 // cache line boundaries will still be loaded and stored atomically. 1755 // 1756 // Side Effects: 1757 // disjoint_int_copy_entry is set to the no-overlap entry point 1758 // used by generate_conjoint_int_oop_copy(). 1759 // 1760 address StubGenerator::generate_disjoint_int_oop_copy(bool aligned, bool is_oop, address* entry, 1761 const char *name, bool dest_uninitialized) { 1762 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1763 #if COMPILER2_OR_JVMCI 1764 if ((!is_oop || bs->supports_avx3_masked_arraycopy()) && VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) { 1765 return generate_disjoint_copy_avx3_masked(entry, "jint_disjoint_arraycopy_avx3", 2, 1766 aligned, is_oop, dest_uninitialized); 1767 } 1768 #endif 1769 1770 __ align(CodeEntryAlignment); 1771 StubCodeMark mark(this, "StubRoutines", name); 1772 address start = __ pc(); 1773 1774 Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_exit; 1775 const Register from = rdi; // source array address 1776 const Register to = rsi; // destination array address 1777 const Register count = rdx; // elements count 1778 const Register dword_count = rcx; 1779 const Register qword_count = count; 1780 const Register end_from = from; // source array end address 1781 const Register end_to = to; // destination array end address 1782 // End pointers are inclusive, and if count is not zero they point 1783 // to the last unit copied: end_to[0] := end_from[0] 1784 1785 __ enter(); // required for proper stackwalking of RuntimeStub frame 1786 assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int. 1787 1788 if (entry != nullptr) { 1789 *entry = __ pc(); 1790 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 1791 BLOCK_COMMENT("Entry:"); 1792 } 1793 1794 setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx 1795 // r9 is used to save r15_thread 1796 1797 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT; 1798 if (dest_uninitialized) { 1799 decorators |= IS_DEST_UNINITIALIZED; 1800 } 1801 if (aligned) { 1802 decorators |= ARRAYCOPY_ALIGNED; 1803 } 1804 1805 BasicType type = is_oop ? T_OBJECT : T_INT; 1806 bs->arraycopy_prologue(_masm, decorators, type, from, to, count); 1807 1808 { 1809 // UnsafeMemoryAccess page error: continue after unsafe access 1810 UnsafeMemoryAccessMark umam(this, !is_oop && !aligned, true); 1811 // 'from', 'to' and 'count' are now valid 1812 __ movptr(dword_count, count); 1813 __ shrptr(count, 1); // count => qword_count 1814 1815 // Copy from low to high addresses. Use 'to' as scratch. 
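    // Note: count/qword_count now holds the element count >> 1 while
    // dword_count still holds the original count, so the flow is, in effect:
    //   copy (dword_count >> 1) qwords forward;
    //   if (dword_count & 1) copy one trailing dword;   // test at L_copy_4_bytes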
1816 __ lea(end_from, Address(from, qword_count, Address::times_8, -8)); 1817 __ lea(end_to, Address(to, qword_count, Address::times_8, -8)); 1818 __ negptr(qword_count); 1819 __ jmp(L_copy_bytes); 1820 1821 // Copy trailing qwords 1822 __ BIND(L_copy_8_bytes); 1823 __ movq(rax, Address(end_from, qword_count, Address::times_8, 8)); 1824 __ movq(Address(end_to, qword_count, Address::times_8, 8), rax); 1825 __ increment(qword_count); 1826 __ jcc(Assembler::notZero, L_copy_8_bytes); 1827 1828 // Check for and copy trailing dword 1829 __ BIND(L_copy_4_bytes); 1830 __ testl(dword_count, 1); // Only byte test since the value is 0 or 1 1831 __ jccb(Assembler::zero, L_exit); 1832 __ movl(rax, Address(end_from, 8)); 1833 __ movl(Address(end_to, 8), rax); 1834 } 1835 __ BIND(L_exit); 1836 address ucme_exit_pc = __ pc(); 1837 bs->arraycopy_epilogue(_masm, decorators, type, from, to, dword_count); 1838 restore_arg_regs_using_thread(); 1839 INC_COUNTER_NP(SharedRuntime::_jint_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free 1840 __ vzeroupper(); 1841 __ xorptr(rax, rax); // return 0 1842 __ leave(); // required for proper stackwalking of RuntimeStub frame 1843 __ ret(0); 1844 1845 { 1846 UnsafeMemoryAccessMark umam(this, !is_oop && !aligned, false, ucme_exit_pc); 1847 // Copy in multi-bytes chunks 1848 copy_bytes_forward(end_from, end_to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, is_oop ? T_OBJECT : T_INT); 1849 __ jmp(L_copy_4_bytes); 1850 } 1851 1852 return start; 1853 } 1854 1855 1856 // Arguments: 1857 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1858 // ignored 1859 // is_oop - true => oop array, so generate store check code 1860 // name - stub name string 1861 // 1862 // Inputs: 1863 // c_rarg0 - source array address 1864 // c_rarg1 - destination array address 1865 // c_rarg2 - element count, treated as ssize_t, can be zero 1866 // 1867 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1868 // the hardware handle it. The two dwords within qwords that span 1869 // cache line boundaries will still be loaded and stored atomically. 1870 // 1871 address StubGenerator::generate_conjoint_int_oop_copy(bool aligned, bool is_oop, address nooverlap_target, 1872 address *entry, const char *name, 1873 bool dest_uninitialized) { 1874 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1875 #if COMPILER2_OR_JVMCI 1876 if ((!is_oop || bs->supports_avx3_masked_arraycopy()) && VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) { 1877 return generate_conjoint_copy_avx3_masked(entry, "jint_conjoint_arraycopy_avx3", 2, 1878 nooverlap_target, aligned, is_oop, dest_uninitialized); 1879 } 1880 #endif 1881 __ align(CodeEntryAlignment); 1882 StubCodeMark mark(this, "StubRoutines", name); 1883 address start = __ pc(); 1884 1885 Label L_copy_bytes, L_copy_8_bytes, L_exit; 1886 const Register from = rdi; // source array address 1887 const Register to = rsi; // destination array address 1888 const Register count = rdx; // elements count 1889 const Register dword_count = rcx; 1890 const Register qword_count = count; 1891 1892 __ enter(); // required for proper stackwalking of RuntimeStub frame 1893 assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int. 
1894 1895 if (entry != nullptr) { 1896 *entry = __ pc(); 1897 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 1898 BLOCK_COMMENT("Entry:"); 1899 } 1900 1901 array_overlap_test(nooverlap_target, Address::times_4); 1902 setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx 1903 // r9 is used to save r15_thread 1904 1905 DecoratorSet decorators = IN_HEAP | IS_ARRAY; 1906 if (dest_uninitialized) { 1907 decorators |= IS_DEST_UNINITIALIZED; 1908 } 1909 if (aligned) { 1910 decorators |= ARRAYCOPY_ALIGNED; 1911 } 1912 1913 BasicType type = is_oop ? T_OBJECT : T_INT; 1914 // no registers are destroyed by this call 1915 bs->arraycopy_prologue(_masm, decorators, type, from, to, count); 1916 1917 assert_clean_int(count, rax); // Make sure 'count' is clean int. 1918 { 1919 // UnsafeMemoryAccess page error: continue after unsafe access 1920 UnsafeMemoryAccessMark umam(this, !is_oop && !aligned, true); 1921 // 'from', 'to' and 'count' are now valid 1922 __ movptr(dword_count, count); 1923 __ shrptr(count, 1); // count => qword_count 1924 1925 // Copy from high to low addresses. Use 'to' as scratch. 1926 1927 // Check for and copy trailing dword 1928 __ testl(dword_count, 1); 1929 __ jcc(Assembler::zero, L_copy_bytes); 1930 __ movl(rax, Address(from, dword_count, Address::times_4, -4)); 1931 __ movl(Address(to, dword_count, Address::times_4, -4), rax); 1932 __ jmp(L_copy_bytes); 1933 1934 // Copy trailing qwords 1935 __ BIND(L_copy_8_bytes); 1936 __ movq(rax, Address(from, qword_count, Address::times_8, -8)); 1937 __ movq(Address(to, qword_count, Address::times_8, -8), rax); 1938 __ decrement(qword_count); 1939 __ jcc(Assembler::notZero, L_copy_8_bytes); 1940 } 1941 if (is_oop) { 1942 __ jmp(L_exit); 1943 } 1944 restore_arg_regs_using_thread(); 1945 INC_COUNTER_NP(SharedRuntime::_jint_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free 1946 __ xorptr(rax, rax); // return 0 1947 __ vzeroupper(); 1948 __ leave(); // required for proper stackwalking of RuntimeStub frame 1949 __ ret(0); 1950 1951 { 1952 // UnsafeMemoryAccess page error: continue after unsafe access 1953 UnsafeMemoryAccessMark umam(this, !is_oop && !aligned, true); 1954 // Copy in multi-bytes chunks 1955 copy_bytes_backward(from, to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, is_oop ? T_OBJECT : T_INT); 1956 } 1957 1958 __ BIND(L_exit); 1959 bs->arraycopy_epilogue(_masm, decorators, type, from, to, dword_count); 1960 restore_arg_regs_using_thread(); 1961 INC_COUNTER_NP(SharedRuntime::_jint_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free 1962 __ xorptr(rax, rax); // return 0 1963 __ vzeroupper(); 1964 __ leave(); // required for proper stackwalking of RuntimeStub frame 1965 __ ret(0); 1966 1967 return start; 1968 } 1969 1970 1971 // Arguments: 1972 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1973 // ignored 1974 // is_oop - true => oop array, so generate store check code 1975 // name - stub name string 1976 // 1977 // Inputs: 1978 // c_rarg0 - source array address 1979 // c_rarg1 - destination array address 1980 // c_rarg2 - element count, treated as ssize_t, can be zero 1981 // 1982 // Side Effects: 1983 // disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the 1984 // no-overlap entry point used by generate_conjoint_long_oop_copy(). 
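// Implementation note: when is_oop is true, the element moves below go
// through BarrierSetAssembler::copy_load_at()/copy_store_at() and are
// bracketed by arraycopy_prologue()/arraycopy_epilogue(), so whatever GC
// barriers the active collector requires are emitted with the copy.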
1985 // 1986 address StubGenerator::generate_disjoint_long_oop_copy(bool aligned, bool is_oop, address *entry, 1987 const char *name, bool dest_uninitialized) { 1988 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1989 #if COMPILER2_OR_JVMCI 1990 if ((!is_oop || bs->supports_avx3_masked_arraycopy()) && VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) { 1991 return generate_disjoint_copy_avx3_masked(entry, "jlong_disjoint_arraycopy_avx3", 3, 1992 aligned, is_oop, dest_uninitialized); 1993 } 1994 #endif 1995 __ align(CodeEntryAlignment); 1996 StubCodeMark mark(this, "StubRoutines", name); 1997 address start = __ pc(); 1998 1999 Label L_copy_bytes, L_copy_8_bytes, L_exit; 2000 const Register from = rdi; // source array address 2001 const Register to = rsi; // destination array address 2002 const Register qword_count = rdx; // elements count 2003 const Register end_from = from; // source array end address 2004 const Register end_to = rcx; // destination array end address 2005 const Register saved_count = r11; 2006 // End pointers are inclusive, and if count is not zero they point 2007 // to the last unit copied: end_to[0] := end_from[0] 2008 2009 __ enter(); // required for proper stackwalking of RuntimeStub frame 2010 // Save no-overlap entry point for generate_conjoint_long_oop_copy() 2011 assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int. 2012 2013 if (entry != nullptr) { 2014 *entry = __ pc(); 2015 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 2016 BLOCK_COMMENT("Entry:"); 2017 } 2018 2019 setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx 2020 // r9 is used to save r15_thread 2021 // 'from', 'to' and 'qword_count' are now valid 2022 2023 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT; 2024 if (dest_uninitialized) { 2025 decorators |= IS_DEST_UNINITIALIZED; 2026 } 2027 if (aligned) { 2028 decorators |= ARRAYCOPY_ALIGNED; 2029 } 2030 2031 BasicType type = is_oop ? T_OBJECT : T_LONG; 2032 bs->arraycopy_prologue(_masm, decorators, type, from, to, qword_count); 2033 { 2034 // UnsafeMemoryAccess page error: continue after unsafe access 2035 UnsafeMemoryAccessMark umam(this, !is_oop && !aligned, true); 2036 2037 // Copy from low to high addresses. Use 'to' as scratch. 
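    // Unlike the byte/short/int variants there is no sub-qword tail here:
    // each element is already a full 8-byte unit, so qword_count is the raw
    // element count and the qword loop below plus copy_bytes_forward() cover
    // the whole array.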
2038 __ lea(end_from, Address(from, qword_count, Address::times_8, -8)); 2039 __ lea(end_to, Address(to, qword_count, Address::times_8, -8)); 2040 __ negptr(qword_count); 2041 __ jmp(L_copy_bytes); 2042 2043 // Copy trailing qwords 2044 __ BIND(L_copy_8_bytes); 2045 bs->copy_load_at(_masm, decorators, type, 8, 2046 rax, Address(end_from, qword_count, Address::times_8, 8), 2047 r10); 2048 bs->copy_store_at(_masm, decorators, type, 8, 2049 Address(end_to, qword_count, Address::times_8, 8), rax, 2050 r10); 2051 __ increment(qword_count); 2052 __ jcc(Assembler::notZero, L_copy_8_bytes); 2053 } 2054 if (is_oop) { 2055 __ jmp(L_exit); 2056 } else { 2057 restore_arg_regs_using_thread(); 2058 INC_COUNTER_NP(SharedRuntime::_jlong_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free 2059 __ xorptr(rax, rax); // return 0 2060 __ vzeroupper(); 2061 __ leave(); // required for proper stackwalking of RuntimeStub frame 2062 __ ret(0); 2063 } 2064 2065 { 2066 // UnsafeMemoryAccess page error: continue after unsafe access 2067 UnsafeMemoryAccessMark umam(this, !is_oop && !aligned, true); 2068 // Copy in multi-bytes chunks 2069 copy_bytes_forward(end_from, end_to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, is_oop ? T_OBJECT : T_LONG); 2070 } 2071 2072 __ BIND(L_exit); 2073 bs->arraycopy_epilogue(_masm, decorators, type, from, to, qword_count); 2074 restore_arg_regs_using_thread(); 2075 INC_COUNTER_NP(is_oop ? SharedRuntime::_oop_array_copy_ctr : 2076 SharedRuntime::_jlong_array_copy_ctr, 2077 rscratch1); // Update counter after rscratch1 is free 2078 __ vzeroupper(); 2079 __ xorptr(rax, rax); // return 0 2080 __ leave(); // required for proper stackwalking of RuntimeStub frame 2081 __ ret(0); 2082 2083 return start; 2084 } 2085 2086 2087 // Arguments: 2088 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 2089 // ignored 2090 // is_oop - true => oop array, so generate store check code 2091 // name - stub name string 2092 // 2093 // Inputs: 2094 // c_rarg0 - source array address 2095 // c_rarg1 - destination array address 2096 // c_rarg2 - element count, treated as ssize_t, can be zero 2097 // 2098 address StubGenerator::generate_conjoint_long_oop_copy(bool aligned, bool is_oop, address nooverlap_target, 2099 address *entry, const char *name, 2100 bool dest_uninitialized) { 2101 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 2102 #if COMPILER2_OR_JVMCI 2103 if ((!is_oop || bs->supports_avx3_masked_arraycopy()) && VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) { 2104 return generate_conjoint_copy_avx3_masked(entry, "jlong_conjoint_arraycopy_avx3", 3, 2105 nooverlap_target, aligned, is_oop, dest_uninitialized); 2106 } 2107 #endif 2108 __ align(CodeEntryAlignment); 2109 StubCodeMark mark(this, "StubRoutines", name); 2110 address start = __ pc(); 2111 2112 Label L_copy_bytes, L_copy_8_bytes, L_exit; 2113 const Register from = rdi; // source array address 2114 const Register to = rsi; // destination array address 2115 const Register qword_count = rdx; // elements count 2116 const Register saved_count = rcx; 2117 2118 __ enter(); // required for proper stackwalking of RuntimeStub frame 2119 assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int. 
2120 2121 if (entry != nullptr) { 2122 *entry = __ pc(); 2123 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 2124 BLOCK_COMMENT("Entry:"); 2125 } 2126 2127 array_overlap_test(nooverlap_target, Address::times_8); 2128 setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx 2129 // r9 is used to save r15_thread 2130 // 'from', 'to' and 'qword_count' are now valid 2131 2132 DecoratorSet decorators = IN_HEAP | IS_ARRAY; 2133 if (dest_uninitialized) { 2134 decorators |= IS_DEST_UNINITIALIZED; 2135 } 2136 if (aligned) { 2137 decorators |= ARRAYCOPY_ALIGNED; 2138 } 2139 2140 BasicType type = is_oop ? T_OBJECT : T_LONG; 2141 bs->arraycopy_prologue(_masm, decorators, type, from, to, qword_count); 2142 { 2143 // UnsafeMemoryAccess page error: continue after unsafe access 2144 UnsafeMemoryAccessMark umam(this, !is_oop && !aligned, true); 2145 2146 __ jmp(L_copy_bytes); 2147 2148 // Copy trailing qwords 2149 __ BIND(L_copy_8_bytes); 2150 bs->copy_load_at(_masm, decorators, type, 8, 2151 rax, Address(from, qword_count, Address::times_8, -8), 2152 r10); 2153 bs->copy_store_at(_masm, decorators, type, 8, 2154 Address(to, qword_count, Address::times_8, -8), rax, 2155 r10); 2156 __ decrement(qword_count); 2157 __ jcc(Assembler::notZero, L_copy_8_bytes); 2158 } 2159 if (is_oop) { 2160 __ jmp(L_exit); 2161 } else { 2162 restore_arg_regs_using_thread(); 2163 INC_COUNTER_NP(SharedRuntime::_jlong_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free 2164 __ xorptr(rax, rax); // return 0 2165 __ vzeroupper(); 2166 __ leave(); // required for proper stackwalking of RuntimeStub frame 2167 __ ret(0); 2168 } 2169 { 2170 // UnsafeMemoryAccess page error: continue after unsafe access 2171 UnsafeMemoryAccessMark umam(this, !is_oop && !aligned, true); 2172 2173 // Copy in multi-bytes chunks 2174 copy_bytes_backward(from, to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, is_oop ? T_OBJECT : T_LONG); 2175 } 2176 __ BIND(L_exit); 2177 bs->arraycopy_epilogue(_masm, decorators, type, from, to, qword_count); 2178 restore_arg_regs_using_thread(); 2179 INC_COUNTER_NP(is_oop ? SharedRuntime::_oop_array_copy_ctr : 2180 SharedRuntime::_jlong_array_copy_ctr, 2181 rscratch1); // Update counter after rscratch1 is free 2182 __ vzeroupper(); 2183 __ xorptr(rax, rax); // return 0 2184 __ leave(); // required for proper stackwalking of RuntimeStub frame 2185 __ ret(0); 2186 2187 return start; 2188 } 2189 2190 2191 // Helper for generating a dynamic type check. 2192 // Smashes no registers. 2193 void StubGenerator::generate_type_check(Register sub_klass, 2194 Register super_check_offset, 2195 Register super_klass, 2196 Label& L_success) { 2197 assert_different_registers(sub_klass, super_check_offset, super_klass); 2198 2199 BLOCK_COMMENT("type_check:"); 2200 2201 Label L_miss; 2202 2203 __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, nullptr, 2204 super_check_offset); 2205 __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, nullptr); 2206 2207 // Fall through on failure! 
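  // The fast path above checks the superclass display/cache slot named by
  // super_check_offset and can already decide hit (L_success) or miss; only
  // the inconclusive case reaches the slow path, which scans the
  // secondary-supers list and falls through to L_miss below when the scan
  // fails.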
2208 __ BIND(L_miss); 2209 } 2210 2211 // 2212 // Generate checkcasting array copy stub 2213 // 2214 // Input: 2215 // c_rarg0 - source array address 2216 // c_rarg1 - destination array address 2217 // c_rarg2 - element count, treated as ssize_t, can be zero 2218 // c_rarg3 - size_t ckoff (super_check_offset) 2219 // not Win64 2220 // c_rarg4 - oop ckval (super_klass) 2221 // Win64 2222 // rsp+40 - oop ckval (super_klass) 2223 // 2224 // Output: 2225 // rax == 0 - success 2226 // rax == -1^K - failure, where K is partial transfer count 2227 // 2228 address StubGenerator::generate_checkcast_copy(const char *name, address *entry, bool dest_uninitialized) { 2229 2230 Label L_load_element, L_store_element, L_do_card_marks, L_done; 2231 2232 // Input registers (after setup_arg_regs) 2233 const Register from = rdi; // source array address 2234 const Register to = rsi; // destination array address 2235 const Register length = rdx; // elements count 2236 const Register ckoff = rcx; // super_check_offset 2237 const Register ckval = r8; // super_klass 2238 2239 // Registers used as temps (r13, r14 are save-on-entry) 2240 const Register end_from = from; // source array end address 2241 const Register end_to = r13; // destination array end address 2242 const Register count = rdx; // -(count_remaining) 2243 const Register r14_length = r14; // saved copy of length 2244 // End pointers are inclusive, and if length is not zero they point 2245 // to the last unit copied: end_to[0] := end_from[0] 2246 2247 const Register rax_oop = rax; // actual oop copied 2248 const Register r11_klass = r11; // oop._klass 2249 2250 //--------------------------------------------------------------- 2251 // Assembler stub will be used for this call to arraycopy 2252 // if the two arrays are subtypes of Object[] but the 2253 // destination array type is not equal to or a supertype 2254 // of the source type. Each element must be separately 2255 // checked. 2256 2257 __ align(CodeEntryAlignment); 2258 StubCodeMark mark(this, "StubRoutines", name); 2259 address start = __ pc(); 2260 2261 __ enter(); // required for proper stackwalking of RuntimeStub frame 2262 2263 #ifdef ASSERT 2264 // caller guarantees that the arrays really are different 2265 // otherwise, we would have to make conjoint checks 2266 { Label L; 2267 array_overlap_test(L, TIMES_OOP); 2268 __ stop("checkcast_copy within a single array"); 2269 __ bind(L); 2270 } 2271 #endif //ASSERT 2272 2273 setup_arg_regs_using_thread(4); // from => rdi, to => rsi, length => rdx 2274 // ckoff => rcx, ckval => r8 2275 // r9 is used to save r15_thread 2276 #ifdef _WIN64 2277 // last argument (#4) is on stack on Win64 2278 __ movptr(ckval, Address(rsp, 6 * wordSize)); 2279 #endif 2280 2281 // Caller of this entry point must set up the argument registers. 
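  // Concretely: from/to/length must already be in rdi/rsi/rdx and ckoff/ckval
  // in rcx/r8 (the assignments above); the code below only records the entry
  // address, it does not repeat the register setup.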
2282 if (entry != nullptr) { 2283 *entry = __ pc(); 2284 BLOCK_COMMENT("Entry:"); 2285 } 2286 2287 // allocate spill slots for r13, r14 2288 enum { 2289 saved_r13_offset, 2290 saved_r14_offset, 2291 saved_r10_offset, 2292 saved_rbp_offset 2293 }; 2294 __ subptr(rsp, saved_rbp_offset * wordSize); 2295 __ movptr(Address(rsp, saved_r13_offset * wordSize), r13); 2296 __ movptr(Address(rsp, saved_r14_offset * wordSize), r14); 2297 __ movptr(Address(rsp, saved_r10_offset * wordSize), r10); 2298 2299 #ifdef ASSERT 2300 Label L2; 2301 __ get_thread(r14); 2302 __ cmpptr(r15_thread, r14); 2303 __ jcc(Assembler::equal, L2); 2304 __ stop("StubRoutines::call_stub: r15_thread is modified by call"); 2305 __ bind(L2); 2306 #endif // ASSERT 2307 2308 // check that int operands are properly extended to size_t 2309 assert_clean_int(length, rax); 2310 assert_clean_int(ckoff, rax); 2311 2312 #ifdef ASSERT 2313 BLOCK_COMMENT("assert consistent ckoff/ckval"); 2314 // The ckoff and ckval must be mutually consistent, 2315 // even though caller generates both. 2316 { Label L; 2317 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 2318 __ cmpl(ckoff, Address(ckval, sco_offset)); 2319 __ jcc(Assembler::equal, L); 2320 __ stop("super_check_offset inconsistent"); 2321 __ bind(L); 2322 } 2323 #endif //ASSERT 2324 2325 // Loop-invariant addresses. They are exclusive end pointers. 2326 Address end_from_addr(from, length, TIMES_OOP, 0); 2327 Address end_to_addr(to, length, TIMES_OOP, 0); 2328 // Loop-variant addresses. They assume post-incremented count < 0. 2329 Address from_element_addr(end_from, count, TIMES_OOP, 0); 2330 Address to_element_addr(end_to, count, TIMES_OOP, 0); 2331 2332 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT; 2333 if (dest_uninitialized) { 2334 decorators |= IS_DEST_UNINITIALIZED; 2335 } 2336 2337 BasicType type = T_OBJECT; 2338 size_t element_size = UseCompressedOops ? 4 : 8; 2339 2340 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 2341 bs->arraycopy_prologue(_masm, decorators, type, from, to, count); 2342 2343 // Copy from low to high addresses, indexed from the end of each array. 2344 __ lea(end_from, end_from_addr); 2345 __ lea(end_to, end_to_addr); 2346 __ movptr(r14_length, length); // save a copy of the length 2347 assert(length == count, ""); // else fix next line: 2348 __ negptr(count); // negate and test the length 2349 __ jcc(Assembler::notZero, L_load_element); 2350 2351 // Empty array: Nothing to do. 2352 __ xorptr(rax, rax); // return 0 on (trivial) success 2353 __ jmp(L_done); 2354 2355 // ======== begin loop ======== 2356 // (Loop is rotated; its entry is L_load_element.) 2357 // Loop control: 2358 // for (count = -count; count != 0; count++) 2359 // Base pointers src, dst are biased by 8*(count-1),to last element. 
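  // In effect (a sketch, not generated code; is_subtype_of stands for the
  // ckoff/ckval check done by generate_type_check()):
  //   for (count = -count; count != 0; count++) {
  //     oop o = end_from[count];                       // L_load_element
  //     if (o != nullptr && !is_subtype_of(o->klass(), ckval)) break;
  //     end_to[count] = o;                             // L_store_element
  //   }
  //   // a break above reaches the error path, which reports the partial count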
2360 __ align(OptoLoopAlignment); 2361 2362 __ BIND(L_store_element); 2363 bs->copy_store_at(_masm, 2364 decorators, 2365 type, 2366 element_size, 2367 to_element_addr, 2368 rax_oop, 2369 r10); 2370 __ increment(count); // increment the count toward zero 2371 __ jcc(Assembler::zero, L_do_card_marks); 2372 2373 // ======== loop entry is here ======== 2374 __ BIND(L_load_element); 2375 bs->copy_load_at(_masm, 2376 decorators, 2377 type, 2378 element_size, 2379 rax_oop, 2380 from_element_addr, 2381 r10); 2382 __ testptr(rax_oop, rax_oop); 2383 __ jcc(Assembler::zero, L_store_element); 2384 2385 __ load_klass(r11_klass, rax_oop, rscratch1);// query the object klass 2386 generate_type_check(r11_klass, ckoff, ckval, L_store_element); 2387 // ======== end loop ======== 2388 2389 // It was a real error; we must depend on the caller to finish the job. 2390 // Register rdx = -1 * number of *remaining* oops, r14 = *total* oops. 2391 // Emit GC store barriers for the oops we have copied (r14 + rdx), 2392 // and report their number to the caller. 2393 assert_different_registers(rax, r14_length, count, to, end_to, rcx, rscratch1); 2394 Label L_post_barrier; 2395 __ addptr(r14_length, count); // K = (original - remaining) oops 2396 __ movptr(rax, r14_length); // save the value 2397 __ notptr(rax); // report (-1^K) to caller (does not affect flags) 2398 __ jccb(Assembler::notZero, L_post_barrier); 2399 __ jmp(L_done); // K == 0, nothing was copied, skip post barrier 2400 2401 // Come here on success only. 2402 __ BIND(L_do_card_marks); 2403 __ xorptr(rax, rax); // return 0 on success 2404 2405 __ BIND(L_post_barrier); 2406 bs->arraycopy_epilogue(_masm, decorators, type, from, to, r14_length); 2407 2408 // Common exit point (success or failure). 2409 __ BIND(L_done); 2410 __ movptr(r13, Address(rsp, saved_r13_offset * wordSize)); 2411 __ movptr(r14, Address(rsp, saved_r14_offset * wordSize)); 2412 __ movptr(r10, Address(rsp, saved_r10_offset * wordSize)); 2413 restore_arg_regs_using_thread(); 2414 INC_COUNTER_NP(SharedRuntime::_checkcast_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free 2415 __ leave(); // required for proper stackwalking of RuntimeStub frame 2416 __ ret(0); 2417 2418 return start; 2419 } 2420 2421 2422 // Generate 'unsafe' array copy stub 2423 // Though just as safe as the other stubs, it takes an unscaled 2424 // size_t argument instead of an element count. 2425 // 2426 // Input: 2427 // c_rarg0 - source array address 2428 // c_rarg1 - destination array address 2429 // c_rarg2 - byte count, treated as ssize_t, can be zero 2430 // 2431 // Examines the alignment of the operands and dispatches 2432 // to a long, int, short, or byte copy loop. 
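// Dispatch sketch (intent only; the *_copy_entry arguments are the entry
// points passed in):
//   bits = from | to | size;
//   if      ((bits & 7) == 0) long_copy (from, to, size >> 3);
//   else if ((bits & 3) == 0) int_copy  (from, to, size >> 2);
//   else if ((bits & 1) == 0) short_copy(from, to, size >> 1);
//   else                      byte_copy (from, to, size);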
2433 // 2434 address StubGenerator::generate_unsafe_copy(const char *name, 2435 address byte_copy_entry, address short_copy_entry, 2436 address int_copy_entry, address long_copy_entry) { 2437 2438 Label L_long_aligned, L_int_aligned, L_short_aligned; 2439 2440 // Input registers (before setup_arg_regs) 2441 const Register from = c_rarg0; // source array address 2442 const Register to = c_rarg1; // destination array address 2443 const Register size = c_rarg2; // byte count (size_t) 2444 2445 // Register used as a temp 2446 const Register bits = rax; // test copy of low bits 2447 2448 __ align(CodeEntryAlignment); 2449 StubCodeMark mark(this, "StubRoutines", name); 2450 address start = __ pc(); 2451 2452 __ enter(); // required for proper stackwalking of RuntimeStub frame 2453 2454 // bump this on entry, not on exit: 2455 INC_COUNTER_NP(SharedRuntime::_unsafe_array_copy_ctr, rscratch1); 2456 2457 __ mov(bits, from); 2458 __ orptr(bits, to); 2459 __ orptr(bits, size); 2460 2461 __ testb(bits, BytesPerLong-1); 2462 __ jccb(Assembler::zero, L_long_aligned); 2463 2464 __ testb(bits, BytesPerInt-1); 2465 __ jccb(Assembler::zero, L_int_aligned); 2466 2467 __ testb(bits, BytesPerShort-1); 2468 __ jump_cc(Assembler::notZero, RuntimeAddress(byte_copy_entry)); 2469 2470 __ BIND(L_short_aligned); 2471 __ shrptr(size, LogBytesPerShort); // size => short_count 2472 __ jump(RuntimeAddress(short_copy_entry)); 2473 2474 __ BIND(L_int_aligned); 2475 __ shrptr(size, LogBytesPerInt); // size => int_count 2476 __ jump(RuntimeAddress(int_copy_entry)); 2477 2478 __ BIND(L_long_aligned); 2479 __ shrptr(size, LogBytesPerLong); // size => qword_count 2480 __ jump(RuntimeAddress(long_copy_entry)); 2481 2482 return start; 2483 } 2484 2485 2486 // Static enum for helper 2487 enum USM_TYPE {USM_SHORT, USM_DWORD, USM_QUADWORD}; 2488 // Helper for generate_unsafe_setmemory 2489 // 2490 // Atomically fill an array of memory using 2-, 4-, or 8-byte chunks 2491 static void do_setmemory_atomic_loop(USM_TYPE type, Register dest, 2492 Register size, Register wide_value, 2493 Register tmp, Label& L_exit, 2494 MacroAssembler *_masm) { 2495 Label L_Loop, L_Tail, L_TailLoop; 2496 2497 int shiftval = 0; 2498 int incr = 0; 2499 2500 switch (type) { 2501 case USM_SHORT: 2502 shiftval = 1; 2503 incr = 16; 2504 break; 2505 case USM_DWORD: 2506 shiftval = 2; 2507 incr = 32; 2508 break; 2509 case USM_QUADWORD: 2510 shiftval = 3; 2511 incr = 64; 2512 break; 2513 } 2514 2515 // At this point, we know the lower bits of size are zero 2516 __ shrq(size, shiftval); 2517 // size now has number of X-byte chunks (2, 4 or 8) 2518 2519 // Number of (8*X)-byte chunks into tmp 2520 __ movq(tmp, size); 2521 __ shrq(tmp, 3); 2522 __ jccb(Assembler::zero, L_Tail); 2523 2524 __ BIND(L_Loop); 2525 2526 // Unroll 8 stores 2527 for (int i = 0; i < 8; i++) { 2528 switch (type) { 2529 case USM_SHORT: 2530 __ movw(Address(dest, (2 * i)), wide_value); 2531 break; 2532 case USM_DWORD: 2533 __ movl(Address(dest, (4 * i)), wide_value); 2534 break; 2535 case USM_QUADWORD: 2536 __ movq(Address(dest, (8 * i)), wide_value); 2537 break; 2538 } 2539 } 2540 __ addq(dest, incr); 2541 __ decrementq(tmp); 2542 __ jccb(Assembler::notZero, L_Loop); 2543 2544 __ BIND(L_Tail); 2545 2546 // Find number of remaining X-byte chunks 2547 __ andq(size, 0x7); 2548 2549 // If zero, then we're done 2550 __ jccb(Assembler::zero, L_exit); 2551 2552 __ BIND(L_TailLoop); 2553 2554 switch (type) { 2555 case USM_SHORT: 2556 __ movw(Address(dest, 0), wide_value); 2557 break; 2558 case 
USM_DWORD: 2559 __ movl(Address(dest, 0), wide_value); 2560 break; 2561 case USM_QUADWORD: 2562 __ movq(Address(dest, 0), wide_value); 2563 break; 2564 } 2565 __ addq(dest, incr >> 3); 2566 __ decrementq(size); 2567 __ jccb(Assembler::notZero, L_TailLoop); 2568 } 2569 2570 // Generate 'unsafe' set memory stub 2571 // Though just as safe as the other stubs, it takes an unscaled 2572 // size_t (# bytes) argument instead of an element count. 2573 // 2574 // Input: 2575 // c_rarg0 - destination array address 2576 // c_rarg1 - byte count (size_t) 2577 // c_rarg2 - byte value 2578 // 2579 // Examines the alignment of the operands and dispatches 2580 // to an int, short, or byte fill loop. 2581 // 2582 address StubGenerator::generate_unsafe_setmemory(const char *name, 2583 address unsafe_byte_fill) { 2584 __ align(CodeEntryAlignment); 2585 StubCodeMark mark(this, "StubRoutines", name); 2586 address start = __ pc(); 2587 __ enter(); // required for proper stackwalking of RuntimeStub frame 2588 2589 assert(unsafe_byte_fill != nullptr, "Invalid call"); 2590 2591 // bump this on entry, not on exit: 2592 INC_COUNTER_NP(SharedRuntime::_unsafe_set_memory_ctr, rscratch1); 2593 2594 { 2595 Label L_exit, L_fillQuadwords, L_fillDwords, L_fillBytes; 2596 2597 const Register dest = c_rarg0; 2598 const Register size = c_rarg1; 2599 const Register byteVal = c_rarg2; 2600 const Register wide_value = rax; 2601 const Register rScratch1 = r10; 2602 2603 assert_different_registers(dest, size, byteVal, wide_value, rScratch1); 2604 2605 // fill_to_memory_atomic(unsigned char*, unsigned long, unsigned char) 2606 2607 __ testq(size, size); 2608 __ jcc(Assembler::zero, L_exit); 2609 2610 // Propagate byte to full Register 2611 __ movzbl(rScratch1, byteVal); 2612 __ mov64(wide_value, 0x0101010101010101ULL); 2613 __ imulq(wide_value, rScratch1); 2614 2615 // Check for pointer & size alignment 2616 __ movq(rScratch1, dest); 2617 __ orq(rScratch1, size); 2618 2619 __ testb(rScratch1, 7); 2620 __ jcc(Assembler::equal, L_fillQuadwords); 2621 2622 __ testb(rScratch1, 3); 2623 __ jcc(Assembler::equal, L_fillDwords); 2624 2625 __ testb(rScratch1, 1); 2626 __ jcc(Assembler::notEqual, L_fillBytes); 2627 2628 // Fill words 2629 { 2630 Label L_wordsTail, L_wordsLoop, L_wordsTailLoop; 2631 UnsafeMemoryAccessMark umam(this, true, true); 2632 2633 // At this point, we know the lower bit of size is zero and a 2634 // multiple of 2 2635 do_setmemory_atomic_loop(USM_SHORT, dest, size, wide_value, rScratch1, 2636 L_exit, _masm); 2637 } 2638 __ jmpb(L_exit); 2639 2640 __ BIND(L_fillQuadwords); 2641 2642 // Fill QUADWORDs 2643 { 2644 Label L_qwordLoop, L_qwordsTail, L_qwordsTailLoop; 2645 UnsafeMemoryAccessMark umam(this, true, true); 2646 2647 // At this point, we know the lower 3 bits of size are zero and a 2648 // multiple of 8 2649 do_setmemory_atomic_loop(USM_QUADWORD, dest, size, wide_value, rScratch1, 2650 L_exit, _masm); 2651 } 2652 __ BIND(L_exit); 2653 2654 __ leave(); // required for proper stackwalking of RuntimeStub frame 2655 __ ret(0); 2656 2657 __ BIND(L_fillDwords); 2658 2659 // Fill DWORDs 2660 { 2661 Label L_dwordLoop, L_dwordsTail, L_dwordsTailLoop; 2662 UnsafeMemoryAccessMark umam(this, true, true); 2663 2664 // At this point, we know the lower 2 bits of size are zero and a 2665 // multiple of 4 2666 do_setmemory_atomic_loop(USM_DWORD, dest, size, wide_value, rScratch1, 2667 L_exit, _masm); 2668 } 2669 __ jmpb(L_exit); 2670 2671 __ BIND(L_fillBytes); 2672 // Set up for tail call to previously generated byte fill routine 
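    // This stub was entered with (ptr, size, byteVal) in c_rarg0..c_rarg2,
    // whereas the byte fill stub takes (ptr, byteVal, size), so swapping
    // c_rarg1 and c_rarg2 below is the only marshalling needed; leave()
    // unwinds this frame so the fill stub returns straight to our caller.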
// Parameter order is (ptr, byteVal, size) 2674 __ xchgq(c_rarg1, c_rarg2); 2675 __ leave(); // Clear effect of enter() 2676 __ jump(RuntimeAddress(unsafe_byte_fill)); 2677 } 2678 2679 return start; 2680 } 2681 2682 // Perform range checks on the proposed arraycopy. 2683 // Kills temp, but nothing else. 2684 // Also, clean the sign bits of src_pos and dst_pos. 2685 void StubGenerator::arraycopy_range_checks(Register src, // source array oop (c_rarg0) 2686 Register src_pos, // source position (c_rarg1) 2687 Register dst, // destination array oop (c_rarg2) 2688 Register dst_pos, // destination position (c_rarg3) 2689 Register length, 2690 Register temp, 2691 Label& L_failed) { 2692 BLOCK_COMMENT("arraycopy_range_checks:"); 2693 2694 // if (src_pos + length > arrayOop(src)->length()) FAIL; 2695 __ movl(temp, length); 2696 __ addl(temp, src_pos); // src_pos + length 2697 __ cmpl(temp, Address(src, arrayOopDesc::length_offset_in_bytes())); 2698 __ jcc(Assembler::above, L_failed); 2699 2700 // if (dst_pos + length > arrayOop(dst)->length()) FAIL; 2701 __ movl(temp, length); 2702 __ addl(temp, dst_pos); // dst_pos + length 2703 __ cmpl(temp, Address(dst, arrayOopDesc::length_offset_in_bytes())); 2704 __ jcc(Assembler::above, L_failed); 2705 2706 // Have to clean up high 32-bits of 'src_pos' and 'dst_pos'. 2707 // Move with sign extension can be used since they are positive. 2708 __ movslq(src_pos, src_pos); 2709 __ movslq(dst_pos, dst_pos); 2710 2711 BLOCK_COMMENT("arraycopy_range_checks done"); 2712 } 2713 2714 2715 // Generate generic array copy stubs 2716 // 2717 // Input: 2718 // c_rarg0 - src oop 2719 // c_rarg1 - src_pos (32-bits) 2720 // c_rarg2 - dst oop 2721 // c_rarg3 - dst_pos (32-bits) 2722 // not Win64 2723 // c_rarg4 - element count (32-bits) 2724 // Win64 2725 // rsp+40 - element count (32-bits) 2726 // 2727 // Output: 2728 // rax == 0 - success 2729 // rax == -1^K - failure, where K is partial transfer count 2730 // 2731 address StubGenerator::generate_generic_copy(const char *name, 2732 address byte_copy_entry, address short_copy_entry, 2733 address int_copy_entry, address oop_copy_entry, 2734 address long_copy_entry, address checkcast_copy_entry) { 2735 2736 Label L_failed, L_failed_0, L_objArray; 2737 Label L_copy_shorts, L_copy_ints, L_copy_longs; 2738 2739 // Input registers 2740 const Register src = c_rarg0; // source array oop 2741 const Register src_pos = c_rarg1; // source position 2742 const Register dst = c_rarg2; // destination array oop 2743 const Register dst_pos = c_rarg3; // destination position 2744 #ifndef _WIN64 2745 const Register length = c_rarg4; 2746 const Register rklass_tmp = r9; // load_klass 2747 #else 2748 const Address length(rsp, 7 * wordSize); // elements count is on stack on Win64 2749 const Register rklass_tmp = rdi; // load_klass 2750 #endif 2751 2752 { int modulus = CodeEntryAlignment; 2753 int target = modulus - 5; // 5 = sizeof jmp(L_failed) 2754 int advance = target - (__ offset() % modulus); 2755 if (advance < 0) advance += modulus; 2756 if (advance > 0) __ nop(advance); 2757 } 2758 StubCodeMark mark(this, "StubRoutines", name); 2759 2760 // Short-hop target to L_failed. Makes for denser prologue code.
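  // Placing this trampoline just before the aligned entry keeps all of the
  // argument-check branches in the prologue within jccb's one-byte
  // displacement range; the nop() padding above was sized so that, after the
  // 5-byte jmp, the real entry point still lands on a CodeEntryAlignment
  // boundary.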
2761 __ BIND(L_failed_0); 2762 __ jmp(L_failed); 2763 assert(__ offset() % CodeEntryAlignment == 0, "no further alignment needed"); 2764 2765 __ align(CodeEntryAlignment); 2766 address start = __ pc(); 2767 2768 __ enter(); // required for proper stackwalking of RuntimeStub frame 2769 2770 #ifdef _WIN64 2771 __ push(rklass_tmp); // rdi is callee-save on Windows 2772 #endif 2773 2774 // bump this on entry, not on exit: 2775 INC_COUNTER_NP(SharedRuntime::_generic_array_copy_ctr, rscratch1); 2776 2777 //----------------------------------------------------------------------- 2778 // Assembler stub will be used for this call to arraycopy 2779 // if the following conditions are met: 2780 // 2781 // (1) src and dst must not be null. 2782 // (2) src_pos must not be negative. 2783 // (3) dst_pos must not be negative. 2784 // (4) length must not be negative. 2785 // (5) src klass and dst klass should be the same and not null. 2786 // (6) src and dst should be arrays. 2787 // (7) src_pos + length must not exceed length of src. 2788 // (8) dst_pos + length must not exceed length of dst. 2789 // 2790 2791 // if (src == nullptr) return -1; 2792 __ testptr(src, src); // src oop 2793 size_t j1off = __ offset(); 2794 __ jccb(Assembler::zero, L_failed_0); 2795 2796 // if (src_pos < 0) return -1; 2797 __ testl(src_pos, src_pos); // src_pos (32-bits) 2798 __ jccb(Assembler::negative, L_failed_0); 2799 2800 // if (dst == nullptr) return -1; 2801 __ testptr(dst, dst); // dst oop 2802 __ jccb(Assembler::zero, L_failed_0); 2803 2804 // if (dst_pos < 0) return -1; 2805 __ testl(dst_pos, dst_pos); // dst_pos (32-bits) 2806 size_t j4off = __ offset(); 2807 __ jccb(Assembler::negative, L_failed_0); 2808 2809 // The first four tests are very dense code, 2810 // but not quite dense enough to put four 2811 // jumps in a 16-byte instruction fetch buffer. 2812 // That's good, because some branch predicters 2813 // do not like jumps so close together. 2814 // Make sure of this. 2815 guarantee(((j1off ^ j4off) & ~15) != 0, "I$ line of 1st & 4th jumps"); 2816 2817 // registers used as temp 2818 const Register r11_length = r11; // elements count to copy 2819 const Register r10_src_klass = r10; // array klass 2820 2821 // if (length < 0) return -1; 2822 __ movl(r11_length, length); // length (elements count, 32-bits value) 2823 __ testl(r11_length, r11_length); 2824 __ jccb(Assembler::negative, L_failed_0); 2825 2826 __ load_klass(r10_src_klass, src, rklass_tmp); 2827 #ifdef ASSERT 2828 // assert(src->klass() != nullptr); 2829 { 2830 BLOCK_COMMENT("assert klasses not null {"); 2831 Label L1, L2; 2832 __ testptr(r10_src_klass, r10_src_klass); 2833 __ jcc(Assembler::notZero, L2); // it is broken if klass is null 2834 __ bind(L1); 2835 __ stop("broken null klass"); 2836 __ bind(L2); 2837 __ load_klass(rax, dst, rklass_tmp); 2838 __ cmpq(rax, 0); 2839 __ jcc(Assembler::equal, L1); // this would be broken also 2840 BLOCK_COMMENT("} assert klasses not null done"); 2841 } 2842 #endif 2843 2844 // Load layout helper (32-bits) 2845 // 2846 // |array_tag| | header_size | element_type | |log2_element_size| 2847 // 32 30 24 16 8 2 0 2848 // 2849 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0 2850 // 2851 2852 const int lh_offset = in_bytes(Klass::layout_helper_offset()); 2853 2854 // Handle objArrays completely differently... 
2855 const jint objArray_lh = Klass::array_layout_helper(T_OBJECT); 2856 __ cmpl(Address(r10_src_klass, lh_offset), objArray_lh); 2857 __ jcc(Assembler::equal, L_objArray); 2858 2859 // if (src->klass() != dst->klass()) return -1; 2860 __ load_klass(rax, dst, rklass_tmp); 2861 __ cmpq(r10_src_klass, rax); 2862 __ jcc(Assembler::notEqual, L_failed); 2863 2864 // Check for flat inline type array -> return -1 2865 __ test_flat_array_oop(src, rax, L_failed); 2866 2867 // Check for null-free (non-flat) inline type array -> handle as object array 2868 __ test_null_free_array_oop(src, rax, L_objArray); 2869 2870 const Register rax_lh = rax; // layout helper 2871 __ movl(rax_lh, Address(r10_src_klass, lh_offset)); 2872 2873 // Check for flat inline type array -> return -1 2874 __ testl(rax_lh, Klass::_lh_array_tag_flat_value_bit_inplace); 2875 __ jcc(Assembler::notZero, L_failed); 2876 2877 // if (!src->is_Array()) return -1; 2878 __ cmpl(rax_lh, Klass::_lh_neutral_value); 2879 __ jcc(Assembler::greaterEqual, L_failed); 2880 2881 // At this point, it is known to be a typeArray (array_tag 0x3). 2882 #ifdef ASSERT 2883 { 2884 BLOCK_COMMENT("assert primitive array {"); 2885 Label L; 2886 __ movl(rklass_tmp, rax_lh); 2887 __ sarl(rklass_tmp, Klass::_lh_array_tag_shift); 2888 __ cmpl(rklass_tmp, Klass::_lh_array_tag_type_value); 2889 __ jcc(Assembler::equal, L); 2890 __ stop("must be a primitive array"); 2891 __ bind(L); 2892 BLOCK_COMMENT("} assert primitive array done"); 2893 } 2894 #endif 2895 2896 arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length, 2897 r10, L_failed); 2898 2899 // TypeArrayKlass 2900 // 2901 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize); 2902 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize); 2903 // 2904 2905 const Register r10_offset = r10; // array offset 2906 const Register rax_elsize = rax_lh; // element size 2907 2908 __ movl(r10_offset, rax_lh); 2909 __ shrl(r10_offset, Klass::_lh_header_size_shift); 2910 __ andptr(r10_offset, Klass::_lh_header_size_mask); // array_offset 2911 __ addptr(src, r10_offset); // src array offset 2912 __ addptr(dst, r10_offset); // dst array offset 2913 BLOCK_COMMENT("choose copy loop based on element size"); 2914 __ andl(rax_lh, Klass::_lh_log2_element_size_mask); // rax_lh -> rax_elsize 2915 2916 #ifdef _WIN64 2917 __ pop(rklass_tmp); // Restore callee-save rdi 2918 #endif 2919 2920 // next registers should be set before the jump to corresponding stub 2921 const Register from = c_rarg0; // source array address 2922 const Register to = c_rarg1; // destination array address 2923 const Register count = c_rarg2; // elements count 2924 2925 // 'from', 'to', 'count' registers should be set in such order 2926 // since they are the same as 'src', 'src_pos', 'dst'. 
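  // The dispatch below keys off rax_elsize (log2 of the element size) left in
  // rax by the layout-helper decoding above; in effect:
  //   switch (log2_elsize) {
  //     case 0: byte_copy (from, to, count); break;
  //     case 1: short_copy(from, to, count); break;
  //     case 2: int_copy  (from, to, count); break;
  //     case 3: long_copy (from, to, count); break;   // asserted, not re-tested
  //   }
  // where from/to are src/dst already advanced past the array header, plus
  // src_pos/dst_pos scaled by the element size.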
2927 2928 __ cmpl(rax_elsize, 0); 2929 __ jccb(Assembler::notEqual, L_copy_shorts); 2930 __ lea(from, Address(src, src_pos, Address::times_1, 0));// src_addr 2931 __ lea(to, Address(dst, dst_pos, Address::times_1, 0));// dst_addr 2932 __ movl2ptr(count, r11_length); // length 2933 __ jump(RuntimeAddress(byte_copy_entry)); 2934 2935 __ BIND(L_copy_shorts); 2936 __ cmpl(rax_elsize, LogBytesPerShort); 2937 __ jccb(Assembler::notEqual, L_copy_ints); 2938 __ lea(from, Address(src, src_pos, Address::times_2, 0));// src_addr 2939 __ lea(to, Address(dst, dst_pos, Address::times_2, 0));// dst_addr 2940 __ movl2ptr(count, r11_length); // length 2941 __ jump(RuntimeAddress(short_copy_entry)); 2942 2943 __ BIND(L_copy_ints); 2944 __ cmpl(rax_elsize, LogBytesPerInt); 2945 __ jccb(Assembler::notEqual, L_copy_longs); 2946 __ lea(from, Address(src, src_pos, Address::times_4, 0));// src_addr 2947 __ lea(to, Address(dst, dst_pos, Address::times_4, 0));// dst_addr 2948 __ movl2ptr(count, r11_length); // length 2949 __ jump(RuntimeAddress(int_copy_entry)); 2950 2951 __ BIND(L_copy_longs); 2952 #ifdef ASSERT 2953 { 2954 BLOCK_COMMENT("assert long copy {"); 2955 Label L; 2956 __ cmpl(rax_elsize, LogBytesPerLong); 2957 __ jcc(Assembler::equal, L); 2958 __ stop("must be long copy, but elsize is wrong"); 2959 __ bind(L); 2960 BLOCK_COMMENT("} assert long copy done"); 2961 } 2962 #endif 2963 __ lea(from, Address(src, src_pos, Address::times_8, 0));// src_addr 2964 __ lea(to, Address(dst, dst_pos, Address::times_8, 0));// dst_addr 2965 __ movl2ptr(count, r11_length); // length 2966 __ jump(RuntimeAddress(long_copy_entry)); 2967 2968 // ObjArrayKlass 2969 __ BIND(L_objArray); 2970 // live at this point: r10_src_klass, r11_length, src[_pos], dst[_pos] 2971 2972 Label L_plain_copy, L_checkcast_copy; 2973 // test array classes for subtyping 2974 __ load_klass(rax, dst, rklass_tmp); 2975 __ cmpq(r10_src_klass, rax); // usual case is exact equality 2976 __ jcc(Assembler::notEqual, L_checkcast_copy); 2977 2978 // Identically typed arrays can be copied without element-wise checks. 2979 arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length, 2980 r10, L_failed); 2981 2982 __ lea(from, Address(src, src_pos, TIMES_OOP, 2983 arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // src_addr 2984 __ lea(to, Address(dst, dst_pos, TIMES_OOP, 2985 arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // dst_addr 2986 __ movl2ptr(count, r11_length); // length 2987 __ BIND(L_plain_copy); 2988 #ifdef _WIN64 2989 __ pop(rklass_tmp); // Restore callee-save rdi 2990 #endif 2991 __ jump(RuntimeAddress(oop_copy_entry)); 2992 2993 __ BIND(L_checkcast_copy); 2994 // live at this point: r10_src_klass, r11_length, rax (dst_klass) 2995 { 2996 // Before looking at dst.length, make sure dst is also an objArray. 2997 // This check also fails for flat arrays which are not supported. 2998 __ cmpl(Address(rax, lh_offset), objArray_lh); 2999 __ jcc(Assembler::notEqual, L_failed); 3000 3001 #ifdef ASSERT 3002 { 3003 BLOCK_COMMENT("assert not null-free array {"); 3004 Label L; 3005 __ test_non_null_free_array_oop(dst, rklass_tmp, L); 3006 __ stop("unexpected null-free array"); 3007 __ bind(L); 3008 BLOCK_COMMENT("} assert not null-free array"); 3009 } 3010 #endif 3011 3012 // It is safe to examine both src.length and dst.length. 
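    // Both klasses are known to be objArrays by now: src was routed here via
    // the layout-helper compare and dst was checked just above, so reading
    // either length field cannot fault or misinterpret a non-array object.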
3013 arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length, 3014 rax, L_failed); 3015 3016 const Register r11_dst_klass = r11; 3017 __ load_klass(r11_dst_klass, dst, rklass_tmp); // reload 3018 3019 // Marshal the base address arguments now, freeing registers. 3020 __ lea(from, Address(src, src_pos, TIMES_OOP, 3021 arrayOopDesc::base_offset_in_bytes(T_OBJECT))); 3022 __ lea(to, Address(dst, dst_pos, TIMES_OOP, 3023 arrayOopDesc::base_offset_in_bytes(T_OBJECT))); 3024 __ movl(count, length); // length (reloaded) 3025 Register sco_temp = c_rarg3; // this register is free now 3026 assert_different_registers(from, to, count, sco_temp, 3027 r11_dst_klass, r10_src_klass); 3028 assert_clean_int(count, sco_temp); 3029 3030 // Generate the type check. 3031 const int sco_offset = in_bytes(Klass::super_check_offset_offset()); 3032 __ movl(sco_temp, Address(r11_dst_klass, sco_offset)); 3033 assert_clean_int(sco_temp, rax); 3034 generate_type_check(r10_src_klass, sco_temp, r11_dst_klass, L_plain_copy); 3035 3036 // Fetch destination element klass from the ObjArrayKlass header. 3037 int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset()); 3038 __ movptr(r11_dst_klass, Address(r11_dst_klass, ek_offset)); 3039 __ movl( sco_temp, Address(r11_dst_klass, sco_offset)); 3040 assert_clean_int(sco_temp, rax); 3041 3042 #ifdef _WIN64 3043 __ pop(rklass_tmp); // Restore callee-save rdi 3044 #endif 3045 3046 // the checkcast_copy loop needs two extra arguments: 3047 assert(c_rarg3 == sco_temp, "#3 already in place"); 3048 // Set up arguments for checkcast_copy_entry. 3049 setup_arg_regs_using_thread(4); 3050 __ movptr(r8, r11_dst_klass); // dst.klass.element_klass, r8 is c_rarg4 on Linux/Solaris 3051 __ jump(RuntimeAddress(checkcast_copy_entry)); 3052 } 3053 3054 __ BIND(L_failed); 3055 #ifdef _WIN64 3056 __ pop(rklass_tmp); // Restore callee-save rdi 3057 #endif 3058 __ xorptr(rax, rax); 3059 __ notptr(rax); // return -1 3060 __ leave(); // required for proper stackwalking of RuntimeStub frame 3061 __ ret(0); 3062 3063 return start; 3064 } 3065 3066 #undef __