/*
 * Copyright (c) 2003, 2025, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "asm/macroAssembler.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "oops/objArrayKlass.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubRoutines.hpp"
#include "stubGenerator_x86_64.hpp"
#ifdef COMPILER2
#include "opto/c2_globals.hpp"
#endif
#if INCLUDE_JVMCI
#include "jvmci/jvmci_globals.hpp"
#endif

#define __ _masm->

#define TIMES_OOP (UseCompressedOops ? Address::times_4 : Address::times_8)

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) __ block_comment(str)
#endif // PRODUCT

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

#ifdef PRODUCT
#define INC_COUNTER_NP(counter, rscratch) ((void)0)
#else
#define INC_COUNTER_NP(counter, rscratch) \
  BLOCK_COMMENT("inc_counter " #counter); \
  inc_counter_np(_masm, counter, rscratch);

static void inc_counter_np(MacroAssembler* _masm, uint& counter, Register rscratch) {
  __ incrementl(ExternalAddress((address)&counter), rscratch);
}

#if COMPILER2_OR_JVMCI
static uint& get_profile_ctr(int shift) {
  if (shift == 0) {
    return SharedRuntime::_jbyte_array_copy_ctr;
  } else if (shift == 1) {
    return SharedRuntime::_jshort_array_copy_ctr;
  } else if (shift == 2) {
    return SharedRuntime::_jint_array_copy_ctr;
  } else {
    assert(shift == 3, "");
    return SharedRuntime::_jlong_array_copy_ctr;
  }
}
#endif // COMPILER2_OR_JVMCI
#endif // !PRODUCT

void StubGenerator::generate_arraycopy_stubs() {
  address entry;
  address entry_jbyte_arraycopy;
  address entry_jshort_arraycopy;
  address entry_jint_arraycopy;
  address entry_oop_arraycopy;
  address entry_jlong_arraycopy;
  address entry_checkcast_arraycopy;

  StubRoutines::_jbyte_disjoint_arraycopy  = generate_disjoint_byte_copy(false, &entry,
                                                                         "jbyte_disjoint_arraycopy");
  StubRoutines::_jbyte_arraycopy           = generate_conjoint_byte_copy(false, entry, &entry_jbyte_arraycopy,
                                                                         "jbyte_arraycopy");

  StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, &entry,
                                                                          "jshort_disjoint_arraycopy");
  StubRoutines::_jshort_arraycopy          = generate_conjoint_short_copy(false, entry, &entry_jshort_arraycopy,
                                                                          "jshort_arraycopy");

  StubRoutines::_jint_disjoint_arraycopy   = generate_disjoint_int_oop_copy(false, false, &entry,
                                                                            "jint_disjoint_arraycopy");
  StubRoutines::_jint_arraycopy            = generate_conjoint_int_oop_copy(false, false, entry,
                                                                            &entry_jint_arraycopy, "jint_arraycopy");

  StubRoutines::_jlong_disjoint_arraycopy  = generate_disjoint_long_oop_copy(false, false, &entry,
                                                                             "jlong_disjoint_arraycopy");
  StubRoutines::_jlong_arraycopy           = generate_conjoint_long_oop_copy(false, false, entry,
                                                                             &entry_jlong_arraycopy, "jlong_arraycopy");
  if (UseCompressedOops) {
    StubRoutines::_oop_disjoint_arraycopy        = generate_disjoint_int_oop_copy(false, true, &entry,
                                                                                  "oop_disjoint_arraycopy");
    StubRoutines::_oop_arraycopy                 = generate_conjoint_int_oop_copy(false, true, entry,
                                                                                  &entry_oop_arraycopy, "oop_arraycopy");
    StubRoutines::_oop_disjoint_arraycopy_uninit = generate_disjoint_int_oop_copy(false, true, &entry,
                                                                                  "oop_disjoint_arraycopy_uninit",
                                                                                  /*dest_uninitialized*/true);
    StubRoutines::_oop_arraycopy_uninit          = generate_conjoint_int_oop_copy(false, true, entry,
                                                                                  nullptr, "oop_arraycopy_uninit",
                                                                                  /*dest_uninitialized*/true);
  } else {
    StubRoutines::_oop_disjoint_arraycopy        = generate_disjoint_long_oop_copy(false, true, &entry,
                                                                                   "oop_disjoint_arraycopy");
    StubRoutines::_oop_arraycopy                 = generate_conjoint_long_oop_copy(false, true, entry,
                                                                                   &entry_oop_arraycopy, "oop_arraycopy");
    StubRoutines::_oop_disjoint_arraycopy_uninit = generate_disjoint_long_oop_copy(false, true, &entry,
                                                                                   "oop_disjoint_arraycopy_uninit",
                                                                                   /*dest_uninitialized*/true);
    StubRoutines::_oop_arraycopy_uninit          = generate_conjoint_long_oop_copy(false, true, entry,
                                                                                   nullptr, "oop_arraycopy_uninit",
                                                                                   /*dest_uninitialized*/true);
  }

  StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
  StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", nullptr,
                                                                      /*dest_uninitialized*/true);

  StubRoutines::_unsafe_arraycopy = generate_unsafe_copy("unsafe_arraycopy",
                                                         entry_jbyte_arraycopy,
                                                         entry_jshort_arraycopy,
                                                         entry_jint_arraycopy,
                                                         entry_jlong_arraycopy);
  StubRoutines::_generic_arraycopy = generate_generic_copy("generic_arraycopy",
                                                           entry_jbyte_arraycopy,
                                                           entry_jshort_arraycopy,
                                                           entry_jint_arraycopy,
                                                           entry_oop_arraycopy,
                                                           entry_jlong_arraycopy,
                                                           entry_checkcast_arraycopy);

  StubRoutines::_jbyte_fill          = generate_fill(T_BYTE, false, "jbyte_fill");
  StubRoutines::_jshort_fill         = generate_fill(T_SHORT, false, "jshort_fill");
  StubRoutines::_jint_fill           = generate_fill(T_INT, false, "jint_fill");
  StubRoutines::_arrayof_jbyte_fill  = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
  StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
  StubRoutines::_arrayof_jint_fill   = generate_fill(T_INT, true, "arrayof_jint_fill");

  StubRoutines::_unsafe_setmemory = generate_unsafe_setmemory("unsafe_setmemory", StubRoutines::_jbyte_fill);

  // We don't generate specialized code for HeapWord-aligned source
  // arrays, so just use the code we've already generated
  StubRoutines::_arrayof_jbyte_disjoint_arraycopy  = StubRoutines::_jbyte_disjoint_arraycopy;
  StubRoutines::_arrayof_jbyte_arraycopy           = StubRoutines::_jbyte_arraycopy;

  StubRoutines::_arrayof_jshort_disjoint_arraycopy = StubRoutines::_jshort_disjoint_arraycopy;
  StubRoutines::_arrayof_jshort_arraycopy          = StubRoutines::_jshort_arraycopy;

  StubRoutines::_arrayof_jint_disjoint_arraycopy   =
      StubRoutines::_jint_disjoint_arraycopy;
  StubRoutines::_arrayof_jint_arraycopy            = StubRoutines::_jint_arraycopy;

  StubRoutines::_arrayof_jlong_disjoint_arraycopy  = StubRoutines::_jlong_disjoint_arraycopy;
  StubRoutines::_arrayof_jlong_arraycopy           = StubRoutines::_jlong_arraycopy;

  StubRoutines::_arrayof_oop_disjoint_arraycopy    = StubRoutines::_oop_disjoint_arraycopy;
  StubRoutines::_arrayof_oop_arraycopy             = StubRoutines::_oop_arraycopy;

  StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit = StubRoutines::_oop_disjoint_arraycopy_uninit;
  StubRoutines::_arrayof_oop_arraycopy_uninit          = StubRoutines::_oop_arraycopy_uninit;
}


// Verify that a register contains clean 32-bits positive value
// (high 32-bits are 0) so it could be used in 64-bits shifts.
//
//  Input:
//    Rint  -  32-bits value
//    Rtmp  -  scratch
//
void StubGenerator::assert_clean_int(Register Rint, Register Rtmp) {
#ifdef ASSERT
  Label L;
  assert_different_registers(Rtmp, Rint);
  __ movslq(Rtmp, Rint);
  __ cmpq(Rtmp, Rint);
  __ jcc(Assembler::equal, L);
  __ stop("high 32-bits of int value are not 0");
  __ bind(L);
#endif
}


// Generate overlap test for array copy stubs
//
//  Input:
//     c_rarg0 - from
//     c_rarg1 - to
//     c_rarg2 - element count
//
//  Output:
//     rax   - &from[element count - 1]
//
void StubGenerator::array_overlap_test(address no_overlap_target, Label* NOLp, Address::ScaleFactor sf) {
  const Register from     = c_rarg0;
  const Register to       = c_rarg1;
  const Register count    = c_rarg2;
  const Register end_from = rax;

  __ cmpptr(to, from);
  __ lea(end_from, Address(from, count, sf, 0));
  if (NOLp == nullptr) {
    RuntimeAddress no_overlap(no_overlap_target);
    __ jump_cc(Assembler::belowEqual, no_overlap);
    __ cmpptr(to, end_from);
    __ jump_cc(Assembler::aboveEqual, no_overlap);
  } else {
    __ jcc(Assembler::belowEqual, (*NOLp));
    __ cmpptr(to, end_from);
    __ jcc(Assembler::aboveEqual, (*NOLp));
  }
}


// Copy big chunks forward
//
// Inputs:
//   end_from       - source arrays end address
//   end_to         - destination array end address
//   qword_count    - 64-bits element count, negative
//   tmp1           - scratch
//   L_copy_bytes   - entry label
//   L_copy_8_bytes - exit  label
//
void StubGenerator::copy_bytes_forward(Register end_from, Register end_to,
                                       Register qword_count, Register tmp1,
                                       Register tmp2, Label& L_copy_bytes,
                                       Label& L_copy_8_bytes, DecoratorSet decorators,
                                       BasicType type) {
  BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  DEBUG_ONLY(__ stop("enter at entry label, not here"));
  Label L_loop;
  __ align(OptoLoopAlignment);
  if (UseUnalignedLoadStores) {
    Label L_end;
    __ BIND(L_loop);
    if (UseAVX >= 2) {
      bs->copy_load_at(_masm, decorators, type, 32,
                       xmm0, Address(end_from, qword_count, Address::times_8, -56),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 32,
                        Address(end_to, qword_count, Address::times_8, -56), xmm0,
                        tmp1, tmp2, xmm1);

      bs->copy_load_at(_masm, decorators, type, 32,
                       xmm0, Address(end_from, qword_count, Address::times_8, -24),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 32,
                        Address(end_to, qword_count, Address::times_8, -24), xmm0,
                        tmp1, tmp2, xmm1);
    } else {
      bs->copy_load_at(_masm, decorators, type, 16,
                       xmm0, Address(end_from,
                                     qword_count, Address::times_8, -56),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 16,
                        Address(end_to, qword_count, Address::times_8, -56), xmm0,
                        tmp1, tmp2, xmm1);
      bs->copy_load_at(_masm, decorators, type, 16,
                       xmm0, Address(end_from, qword_count, Address::times_8, -40),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 16,
                        Address(end_to, qword_count, Address::times_8, -40), xmm0,
                        tmp1, tmp2, xmm1);
      bs->copy_load_at(_masm, decorators, type, 16,
                       xmm0, Address(end_from, qword_count, Address::times_8, -24),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 16,
                        Address(end_to, qword_count, Address::times_8, -24), xmm0,
                        tmp1, tmp2, xmm1);
      bs->copy_load_at(_masm, decorators, type, 16,
                       xmm0, Address(end_from, qword_count, Address::times_8, -8),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 16,
                        Address(end_to, qword_count, Address::times_8, -8), xmm0,
                        tmp1, tmp2, xmm1);
    }

    __ BIND(L_copy_bytes);
    __ addptr(qword_count, 8);
    __ jcc(Assembler::lessEqual, L_loop);
    __ subptr(qword_count, 4);  // sub(8) and add(4)
    __ jcc(Assembler::greater, L_end);
    // Copy trailing 32 bytes
    if (UseAVX >= 2) {
      bs->copy_load_at(_masm, decorators, type, 32,
                       xmm0, Address(end_from, qword_count, Address::times_8, -24),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 32,
                        Address(end_to, qword_count, Address::times_8, -24), xmm0,
                        tmp1, tmp2, xmm1);
    } else {
      bs->copy_load_at(_masm, decorators, type, 16,
                       xmm0, Address(end_from, qword_count, Address::times_8, -24),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 16,
                        Address(end_to, qword_count, Address::times_8, -24), xmm0,
                        tmp1, tmp2, xmm1);
      bs->copy_load_at(_masm, decorators, type, 16,
                       xmm0, Address(end_from, qword_count, Address::times_8, -8),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 16,
                        Address(end_to, qword_count, Address::times_8, -8), xmm0,
                        tmp1, tmp2, xmm1);
    }
    __ addptr(qword_count, 4);
    __ BIND(L_end);
  } else {
    // Copy 32-bytes per iteration
    __ BIND(L_loop);
    bs->copy_load_at(_masm, decorators, type, 8,
                     tmp1, Address(end_from, qword_count, Address::times_8, -24),
                     tmp2);
    bs->copy_store_at(_masm, decorators, type, 8,
                      Address(end_to, qword_count, Address::times_8, -24), tmp1,
                      tmp2);
    bs->copy_load_at(_masm, decorators, type, 8,
                     tmp1, Address(end_from, qword_count, Address::times_8, -16),
                     tmp2);
    bs->copy_store_at(_masm, decorators, type, 8,
                      Address(end_to, qword_count, Address::times_8, -16), tmp1,
                      tmp2);
    bs->copy_load_at(_masm, decorators, type, 8,
                     tmp1, Address(end_from, qword_count, Address::times_8, -8),
                     tmp2);
    bs->copy_store_at(_masm, decorators, type, 8,
                      Address(end_to, qword_count, Address::times_8, -8), tmp1,
                      tmp2);
    bs->copy_load_at(_masm, decorators, type, 8,
                     tmp1, Address(end_from, qword_count, Address::times_8, 0),
                     tmp2);
    bs->copy_store_at(_masm, decorators, type, 8,
                      Address(end_to, qword_count, Address::times_8, 0), tmp1,
                      tmp2);

    __ BIND(L_copy_bytes);
    __ addptr(qword_count, 4);
    __ jcc(Assembler::lessEqual, L_loop);
  }
  __ subptr(qword_count, 4);
  __ jcc(Assembler::less, L_copy_8_bytes); // Copy trailing qwords
}


// Copy big chunks backward
//
// Inputs:
//   from           - source arrays address
//   dest           - destination array address
//   qword_count    - 64-bits element count
//   tmp1           - scratch
//   L_copy_bytes   - entry label
//   L_copy_8_bytes - exit  label
//
void StubGenerator::copy_bytes_backward(Register from, Register dest,
                                        Register qword_count, Register tmp1,
                                        Register tmp2, Label& L_copy_bytes,
                                        Label& L_copy_8_bytes, DecoratorSet decorators,
                                        BasicType type) {
  BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  DEBUG_ONLY(__ stop("enter at entry label, not here"));
  Label L_loop;
  __ align(OptoLoopAlignment);
  if (UseUnalignedLoadStores) {
    Label L_end;
    __ BIND(L_loop);
    if (UseAVX >= 2) {
      bs->copy_load_at(_masm, decorators, type, 32,
                       xmm0, Address(from, qword_count, Address::times_8, 32),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 32,
                        Address(dest, qword_count, Address::times_8, 32), xmm0,
                        tmp1, tmp2, xmm1);
      bs->copy_load_at(_masm, decorators, type, 32,
                       xmm0, Address(from, qword_count, Address::times_8, 0),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 32,
                        Address(dest, qword_count, Address::times_8, 0), xmm0,
                        tmp1, tmp2, xmm1);
    } else {
      bs->copy_load_at(_masm, decorators, type, 16,
                       xmm0, Address(from, qword_count, Address::times_8, 48),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 16,
                        Address(dest, qword_count, Address::times_8, 48), xmm0,
                        tmp1, tmp2, xmm1);
      bs->copy_load_at(_masm, decorators, type, 16,
                       xmm0, Address(from, qword_count, Address::times_8, 32),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 16,
                        Address(dest, qword_count, Address::times_8, 32), xmm0,
                        tmp1, tmp2, xmm1);
      bs->copy_load_at(_masm, decorators, type, 16,
                       xmm0, Address(from, qword_count, Address::times_8, 16),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 16,
                        Address(dest, qword_count, Address::times_8, 16), xmm0,
                        tmp1, tmp2, xmm1);
      bs->copy_load_at(_masm, decorators, type, 16,
                       xmm0, Address(from, qword_count, Address::times_8, 0),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 16,
                        Address(dest, qword_count, Address::times_8, 0), xmm0,
                        tmp1, tmp2, xmm1);
    }

    __ BIND(L_copy_bytes);
    __ subptr(qword_count, 8);
    __ jcc(Assembler::greaterEqual, L_loop);

    __ addptr(qword_count, 4);  // add(8) and sub(4)
    __ jcc(Assembler::less, L_end);
    // Copy trailing 32 bytes
    if (UseAVX >= 2) {
      bs->copy_load_at(_masm, decorators, type, 32,
                       xmm0, Address(from, qword_count, Address::times_8, 0),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 32,
                        Address(dest, qword_count, Address::times_8, 0), xmm0,
                        tmp1, tmp2, xmm1);
    } else {
      bs->copy_load_at(_masm, decorators, type, 16,
                       xmm0, Address(from, qword_count, Address::times_8, 16),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 16,
                        Address(dest, qword_count, Address::times_8, 16), xmm0,
                        tmp1, tmp2, xmm1);
      bs->copy_load_at(_masm, decorators, type, 16,
                       xmm0, Address(from, qword_count, Address::times_8, 0),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 16,
                        Address(dest, qword_count, Address::times_8, 0), xmm0,
                        tmp1, tmp2, xmm1);
    }
    __ subptr(qword_count, 4);
    __ BIND(L_end);
  } else {
    // Copy 32-bytes per iteration
    __ BIND(L_loop);
    bs->copy_load_at(_masm, decorators, type, 8,
                     tmp1, Address(from, qword_count, Address::times_8, 24),
                     tmp2);
    bs->copy_store_at(_masm,
                      decorators, type, 8,
                      Address(dest, qword_count, Address::times_8, 24), tmp1,
                      tmp2);
    bs->copy_load_at(_masm, decorators, type, 8,
                     tmp1, Address(from, qword_count, Address::times_8, 16),
                     tmp2);
    bs->copy_store_at(_masm, decorators, type, 8,
                      Address(dest, qword_count, Address::times_8, 16), tmp1,
                      tmp2);
    bs->copy_load_at(_masm, decorators, type, 8,
                     tmp1, Address(from, qword_count, Address::times_8, 8),
                     tmp2);
    bs->copy_store_at(_masm, decorators, type, 8,
                      Address(dest, qword_count, Address::times_8, 8), tmp1,
                      tmp2);
    bs->copy_load_at(_masm, decorators, type, 8,
                     tmp1, Address(from, qword_count, Address::times_8, 0),
                     tmp2);
    bs->copy_store_at(_masm, decorators, type, 8,
                      Address(dest, qword_count, Address::times_8, 0), tmp1,
                      tmp2);

    __ BIND(L_copy_bytes);
    __ subptr(qword_count, 4);
    __ jcc(Assembler::greaterEqual, L_loop);
  }
  __ addptr(qword_count, 4);
  __ jcc(Assembler::greater, L_copy_8_bytes); // Copy trailing qwords
}

#if COMPILER2_OR_JVMCI

// Note: The following rules apply to AVX3 optimized arraycopy stubs:
// - If the target supports AVX3 features (BW+VL+F) then the implementation uses 32 byte vectors (YMMs)
//   for both special cases (various small block sizes) and the aligned copy loop. This is the
//   default configuration.
// - If the copy length is above AVX3Threshold, then the implementation uses 64 byte vectors (ZMMs)
//   for the main copy loop (and subsequent tail) since the bulk of the cycles will be consumed in it.
// - If the user forces MaxVectorSize=32, then above 4096 bytes REP MOVS shows
//   better performance for disjoint copies. For conjoint/backward copies, vector based
//   copy performs better.
// - If the user sets AVX3Threshold=0, then the special cases for small block sizes operate over
//   64 byte vector registers (ZMMs).

// Inputs:
//   c_rarg0   - source array address
//   c_rarg1   - destination array address
//   c_rarg2   - element count, treated as ssize_t, can be zero
//
//
// Side Effects:
//   disjoint_copy_avx3_masked is set to the no-overlap entry point
//   used by generate_conjoint_[byte/int/short/long]_copy().
//
address StubGenerator::generate_disjoint_copy_avx3_masked(address* entry, const char *name,
                                                          int shift, bool aligned, bool is_oop,
                                                          bool dest_uninitialized) {
  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, "StubRoutines", name);
  address start = __ pc();

  int avx3threshold = VM_Version::avx3_threshold();
  bool use64byteVector = (MaxVectorSize > 32) && (avx3threshold == 0);
  const int large_threshold = 2621440; // 2.5 MB
  Label L_main_loop, L_main_loop_64bytes, L_tail, L_tail64, L_exit, L_entry;
  Label L_repmovs, L_main_pre_loop, L_main_pre_loop_64bytes, L_pre_main_post_64;
  Label L_copy_large, L_finish;
  const Register from  = rdi;  // source array address
  const Register to    = rsi;  // destination array address
  const Register count = rdx;  // elements count
  const Register temp1 = r8;
  const Register temp2 = r11;
  const Register temp3 = rax;
  const Register temp4 = rcx;
  // End pointers are inclusive, and if count is not zero they point
  // to the last unit copied: end_to[0] := end_from[0]

  __ enter(); // required for proper stackwalking of RuntimeStub frame
  assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int.
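  // Overall stub layout (see below): zero-length check, small-size special cases,
  // then a destination-aligned PRE-MAIN-POST copy loop; with 64-byte vectors,
  // lengths at or above large_threshold branch to arraycopy_avx3_large.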

  if (entry != nullptr) {
    *entry = __ pc();
    // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
    BLOCK_COMMENT("Entry:");
  }

  BasicType type_vec[] = { T_BYTE, T_SHORT, T_INT, T_LONG};
  BasicType type = is_oop ? T_OBJECT : type_vec[shift];

  setup_argument_regs(type);

  DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
  if (dest_uninitialized) {
    decorators |= IS_DEST_UNINITIALIZED;
  }
  if (aligned) {
    decorators |= ARRAYCOPY_ALIGNED;
  }
  BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
  bs->arraycopy_prologue(_masm, decorators, type, from, to, count);

  {
    // Type(shift)       byte(0), short(1), int(2), long(3)
    int loop_size[]   = { 192,     96,       48,     24 };
    int threshold[]   = { 4096,    2048,     1024,   512 };

    // UnsafeMemoryAccess page error: continue after unsafe access
    UnsafeMemoryAccessMark umam(this, !is_oop && !aligned, true);
    // 'from', 'to' and 'count' are now valid

    // temp1 holds remaining count and temp4 holds running count used to compute
    // next address offset for start of to/from addresses (temp4 * scale).
    __ mov64(temp4, 0);
    __ movq(temp1, count);

    // Zero length check.
    __ BIND(L_tail);
    __ cmpq(temp1, 0);
    __ jcc(Assembler::lessEqual, L_exit);

    // Special cases using 32 byte [masked] vector copy operations.
    arraycopy_avx3_special_cases(xmm1, k2, from, to, temp1, shift,
                                 temp4, temp3, use64byteVector, L_entry, L_exit);

    // PRE-MAIN-POST loop for aligned copy.
    __ BIND(L_entry);

    if (MaxVectorSize == 64) {
      __ movq(temp2, temp1);
      __ shlq(temp2, shift);
      __ cmpq(temp2, large_threshold);
      __ jcc(Assembler::greaterEqual, L_copy_large);
    }
    if (avx3threshold != 0) {
      __ cmpq(count, threshold[shift]);
      if (MaxVectorSize == 64) {
        // Copy using 64 byte vectors.
        __ jcc(Assembler::greaterEqual, L_pre_main_post_64);
      } else {
        assert(MaxVectorSize < 64, "vector size should be < 64 bytes");
        // REP MOVS offers a faster copy path.
        __ jcc(Assembler::greaterEqual, L_repmovs);
      }
    }

    if ((MaxVectorSize < 64) || (avx3threshold != 0)) {
      // Partial copy to make dst address 32 byte aligned.
      __ movq(temp2, to);
      __ andq(temp2, 31);
      __ jcc(Assembler::equal, L_main_pre_loop);

      __ negptr(temp2);
      __ addq(temp2, 32);
      if (shift) {
        __ shrq(temp2, shift);
      }
      __ movq(temp3, temp2);
      copy32_masked_avx(to, from, xmm1, k2, temp3, temp4, temp1, shift);
      __ movq(temp4, temp2);
      __ movq(temp1, count);
      __ subq(temp1, temp2);

      __ cmpq(temp1, loop_size[shift]);
      __ jcc(Assembler::less, L_tail);

      __ BIND(L_main_pre_loop);
      __ subq(temp1, loop_size[shift]);

      // Main loop with aligned copy block size of 192 bytes at 32 byte granularity.
      __ align32();
      __ BIND(L_main_loop);
      copy64_avx(to, from, temp4, xmm1, false, shift, 0);
      copy64_avx(to, from, temp4, xmm1, false, shift, 64);
      copy64_avx(to, from, temp4, xmm1, false, shift, 128);
      __ addptr(temp4, loop_size[shift]);
      __ subq(temp1, loop_size[shift]);
      __ jcc(Assembler::greater, L_main_loop);

      __ addq(temp1, loop_size[shift]);

      // Tail loop.
      __ jmp(L_tail);

      __ BIND(L_repmovs);
      __ movq(temp2, temp1);
      // Swap to(RSI) and from(RDI) addresses to comply with REP MOVs semantics.
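      // rep_mov() below expands to REP MOVSQ, which copies RCX quadwords from
      // [RSI] to [RDI]; that is why the operands are shuffled into those registers here.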
      __ movq(temp3, to);
      __ movq(to, from);
      __ movq(from, temp3);
      // Save to/from for restoration post rep_mov.
      __ movq(temp1, to);
      __ movq(temp3, from);
      if (shift < 3) {
        __ shrq(temp2, 3 - shift);   // quad word count
      }
      __ movq(temp4, temp2);         // move quad word count into temp4(RCX).
      __ rep_mov();
      __ shlq(temp2, 3);             // convert quad words into byte count.
      if (shift) {
        __ shrq(temp2, shift);       // type specific count.
      }
      // Restore original addresses in to/from.
      __ movq(to, temp3);
      __ movq(from, temp1);
      __ movq(temp4, temp2);
      __ movq(temp1, count);
      __ subq(temp1, temp2);         // trailing part (less than a quad word size).
      __ jmp(L_tail);
    }

    if (MaxVectorSize > 32) {
      __ BIND(L_pre_main_post_64);
      // Partial copy to make dst address 64 byte aligned.
      __ movq(temp2, to);
      __ andq(temp2, 63);
      __ jcc(Assembler::equal, L_main_pre_loop_64bytes);

      __ negptr(temp2);
      __ addq(temp2, 64);
      if (shift) {
        __ shrq(temp2, shift);
      }
      __ movq(temp3, temp2);
      copy64_masked_avx(to, from, xmm1, k2, temp3, temp4, temp1, shift, 0, true);
      __ movq(temp4, temp2);
      __ movq(temp1, count);
      __ subq(temp1, temp2);

      __ cmpq(temp1, loop_size[shift]);
      __ jcc(Assembler::less, L_tail64);

      __ BIND(L_main_pre_loop_64bytes);
      __ subq(temp1, loop_size[shift]);

      // Main loop with aligned copy block size of 192 bytes at
      // 64 byte copy granularity.
      __ align32();
      __ BIND(L_main_loop_64bytes);
      copy64_avx(to, from, temp4, xmm1, false, shift, 0, true);
      copy64_avx(to, from, temp4, xmm1, false, shift, 64, true);
      copy64_avx(to, from, temp4, xmm1, false, shift, 128, true);
      __ addptr(temp4, loop_size[shift]);
      __ subq(temp1, loop_size[shift]);
      __ jcc(Assembler::greater, L_main_loop_64bytes);

      __ addq(temp1, loop_size[shift]);
      // Zero length check.
      __ jcc(Assembler::lessEqual, L_exit);

      __ BIND(L_tail64);

      // Tail handling using 64 byte [masked] vector copy operations.
      use64byteVector = true;
      arraycopy_avx3_special_cases(xmm1, k2, from, to, temp1, shift,
                                   temp4, temp3, use64byteVector, L_entry, L_exit);
    }
    __ BIND(L_exit);
  }

  __ BIND(L_finish);
  address ucme_exit_pc = __ pc();
  // When called from generic_arraycopy r11 contains specific values
  // used during arraycopy epilogue, re-initializing r11.
  if (is_oop) {
    __ movq(r11, shift == 3 ?
                     count : to);
  }
  bs->arraycopy_epilogue(_masm, decorators, type, from, to, count);
  restore_argument_regs(type);
  INC_COUNTER_NP(get_profile_ctr(shift), rscratch1); // Update counter after rscratch1 is free
  __ xorptr(rax, rax); // return 0
  __ vzeroupper();
  __ leave(); // required for proper stackwalking of RuntimeStub frame
  __ ret(0);

  if (MaxVectorSize == 64) {
    __ BIND(L_copy_large);
    UnsafeMemoryAccessMark umam(this, !is_oop && !aligned, false, ucme_exit_pc);
    arraycopy_avx3_large(to, from, temp1, temp2, temp3, temp4, count, xmm1, xmm2, xmm3, xmm4, shift);
    __ jmp(L_finish);
  }
  return start;
}

void StubGenerator::arraycopy_avx3_large(Register to, Register from, Register temp1, Register temp2,
                                         Register temp3, Register temp4, Register count,
                                         XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3,
                                         XMMRegister xmm4, int shift) {

  // Type(shift)       byte(0), short(1), int(2), long(3)
  int loop_size[]   = { 256,     128,      64,     32 };
  int threshold[]   = { 4096,    2048,     1024,   512 };

  Label L_main_loop_large;
  Label L_tail_large;
  Label L_exit_large;
  Label L_entry_large;
  Label L_main_pre_loop_large;
  Label L_pre_main_post_large;

  assert(MaxVectorSize == 64, "vector length != 64");
  __ BIND(L_entry_large);

  __ BIND(L_pre_main_post_large);
  // Partial copy to make dst address 64 byte aligned.
  __ movq(temp2, to);
  __ andq(temp2, 63);
  __ jcc(Assembler::equal, L_main_pre_loop_large);

  __ negptr(temp2);
  __ addq(temp2, 64);
  if (shift) {
    __ shrq(temp2, shift);
  }
  __ movq(temp3, temp2);
  copy64_masked_avx(to, from, xmm1, k2, temp3, temp4, temp1, shift, 0, true);
  __ movq(temp4, temp2);
  __ movq(temp1, count);
  __ subq(temp1, temp2);

  __ cmpq(temp1, loop_size[shift]);
  __ jcc(Assembler::less, L_tail_large);

  __ BIND(L_main_pre_loop_large);
  __ subq(temp1, loop_size[shift]);

  // Main loop with aligned copy block size of 256 bytes at 64 byte copy granularity.
  __ align32();
  __ BIND(L_main_loop_large);
  copy256_avx3(to, from, temp4, xmm1, xmm2, xmm3, xmm4, shift, 0);
  __ addptr(temp4, loop_size[shift]);
  __ subq(temp1, loop_size[shift]);
  __ jcc(Assembler::greater, L_main_loop_large);
  // fence needed because copy256_avx3 uses non-temporal stores
  __ sfence();

  __ addq(temp1, loop_size[shift]);
  // Zero length check.
  __ jcc(Assembler::lessEqual, L_exit_large);
  __ BIND(L_tail_large);
  // Tail handling using 64 byte [masked] vector copy operations.
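  // The remaining count in temp1 can already be zero here; skip the tail copy in that case.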
  __ cmpq(temp1, 0);
  __ jcc(Assembler::lessEqual, L_exit_large);
  arraycopy_avx3_special_cases_256(xmm1, k2, from, to, temp1, shift,
                                   temp4, temp3, L_exit_large);
  __ BIND(L_exit_large);
}

// Inputs:
//   c_rarg0   - source array address
//   c_rarg1   - destination array address
//   c_rarg2   - element count, treated as ssize_t, can be zero
//
//
address StubGenerator::generate_conjoint_copy_avx3_masked(address* entry, const char *name, int shift,
                                                          address nooverlap_target, bool aligned,
                                                          bool is_oop, bool dest_uninitialized) {
  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, "StubRoutines", name);
  address start = __ pc();

  int avx3threshold = VM_Version::avx3_threshold();
  bool use64byteVector = (MaxVectorSize > 32) && (avx3threshold == 0);

  Label L_main_pre_loop, L_main_pre_loop_64bytes, L_pre_main_post_64;
  Label L_main_loop, L_main_loop_64bytes, L_tail, L_tail64, L_exit, L_entry;
  const Register from  = rdi;  // source array address
  const Register to    = rsi;  // destination array address
  const Register count = rdx;  // elements count
  const Register temp1 = r8;
  const Register temp2 = rcx;
  const Register temp3 = r11;
  const Register temp4 = rax;
  // End pointers are inclusive, and if count is not zero they point
  // to the last unit copied: end_to[0] := end_from[0]

  __ enter(); // required for proper stackwalking of RuntimeStub frame
  assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int.

  if (entry != nullptr) {
    *entry = __ pc();
    // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
    BLOCK_COMMENT("Entry:");
  }

  array_overlap_test(nooverlap_target, (Address::ScaleFactor)(shift));

  BasicType type_vec[] = { T_BYTE, T_SHORT, T_INT, T_LONG};
  BasicType type = is_oop ? T_OBJECT : type_vec[shift];

  setup_argument_regs(type);

  DecoratorSet decorators = IN_HEAP | IS_ARRAY;
  if (dest_uninitialized) {
    decorators |= IS_DEST_UNINITIALIZED;
  }
  if (aligned) {
    decorators |= ARRAYCOPY_ALIGNED;
  }
  BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
  bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
  {
    // Type(shift)       byte(0), short(1), int(2), long(3)
    int loop_size[]   = { 192,     96,       48,     24 };
    int threshold[]   = { 4096,    2048,     1024,   512 };

    // UnsafeMemoryAccess page error: continue after unsafe access
    UnsafeMemoryAccessMark umam(this, !is_oop && !aligned, true);
    // 'from', 'to' and 'count' are now valid

    // temp1 holds remaining count.
    __ movq(temp1, count);

    // Zero length check.
    __ BIND(L_tail);
    __ cmpq(temp1, 0);
    __ jcc(Assembler::lessEqual, L_exit);

    __ mov64(temp2, 0);
    __ movq(temp3, temp1);
    // Special cases using 32 byte [masked] vector copy operations.
    arraycopy_avx3_special_cases_conjoint(xmm1, k2, from, to, temp2, temp3, temp1, shift,
                                          temp4, use64byteVector, L_entry, L_exit);

    // PRE-MAIN-POST loop for aligned copy.
    __ BIND(L_entry);

    if ((MaxVectorSize > 32) && (avx3threshold != 0)) {
      __ cmpq(temp1, threshold[shift]);
      __ jcc(Assembler::greaterEqual, L_pre_main_post_64);
    }

    if ((MaxVectorSize < 64) || (avx3threshold != 0)) {
      // Partial copy to make dst address 32 byte aligned.
      __ leaq(temp2, Address(to, temp1, (Address::ScaleFactor)(shift), 0));
      __ andq(temp2, 31);
      __ jcc(Assembler::equal, L_main_pre_loop);

      if (shift) {
        __ shrq(temp2, shift);
      }
      __ subq(temp1, temp2);
      copy32_masked_avx(to, from, xmm1, k2, temp2, temp1, temp3, shift);

      __ cmpq(temp1, loop_size[shift]);
      __ jcc(Assembler::less, L_tail);

      __ BIND(L_main_pre_loop);

      // Main loop with aligned copy block size of 192 bytes at 32 byte granularity.
      __ align32();
      __ BIND(L_main_loop);
      copy64_avx(to, from, temp1, xmm1, true, shift, -64);
      copy64_avx(to, from, temp1, xmm1, true, shift, -128);
      copy64_avx(to, from, temp1, xmm1, true, shift, -192);
      __ subptr(temp1, loop_size[shift]);
      __ cmpq(temp1, loop_size[shift]);
      __ jcc(Assembler::greater, L_main_loop);

      // Tail loop.
      __ jmp(L_tail);
    }

    if (MaxVectorSize > 32) {
      __ BIND(L_pre_main_post_64);
      // Partial copy to make dst address 64 byte aligned.
      __ leaq(temp2, Address(to, temp1, (Address::ScaleFactor)(shift), 0));
      __ andq(temp2, 63);
      __ jcc(Assembler::equal, L_main_pre_loop_64bytes);

      if (shift) {
        __ shrq(temp2, shift);
      }
      __ subq(temp1, temp2);
      copy64_masked_avx(to, from, xmm1, k2, temp2, temp1, temp3, shift, 0, true);

      __ cmpq(temp1, loop_size[shift]);
      __ jcc(Assembler::less, L_tail64);

      __ BIND(L_main_pre_loop_64bytes);

      // Main loop with aligned copy block size of 192 bytes at
      // 64 byte copy granularity.
      __ align32();
      __ BIND(L_main_loop_64bytes);
      copy64_avx(to, from, temp1, xmm1, true, shift, -64, true);
      copy64_avx(to, from, temp1, xmm1, true, shift, -128, true);
      copy64_avx(to, from, temp1, xmm1, true, shift, -192, true);
      __ subq(temp1, loop_size[shift]);
      __ cmpq(temp1, loop_size[shift]);
      __ jcc(Assembler::greater, L_main_loop_64bytes);

      // Zero length check.
      __ cmpq(temp1, 0);
      __ jcc(Assembler::lessEqual, L_exit);

      __ BIND(L_tail64);

      // Tail handling using 64 byte [masked] vector copy operations.
      use64byteVector = true;
      __ mov64(temp2, 0);
      __ movq(temp3, temp1);
      arraycopy_avx3_special_cases_conjoint(xmm1, k2, from, to, temp2, temp3, temp1, shift,
                                            temp4, use64byteVector, L_entry, L_exit);
    }
    __ BIND(L_exit);
  }
  address ucme_exit_pc = __ pc();
  // When called from generic_arraycopy r11 contains specific values
  // used during arraycopy epilogue, re-initializing r11.
  if (is_oop) {
    __ movq(r11, count);
  }
  bs->arraycopy_epilogue(_masm, decorators, type, from, to, count);
  restore_argument_regs(type);
  INC_COUNTER_NP(get_profile_ctr(shift), rscratch1); // Update counter after rscratch1 is free
  __ xorptr(rax, rax); // return 0
  __ vzeroupper();
  __ leave(); // required for proper stackwalking of RuntimeStub frame
  __ ret(0);

  return start;
}

void StubGenerator::arraycopy_avx3_special_cases(XMMRegister xmm, KRegister mask, Register from,
                                                 Register to, Register count, int shift,
                                                 Register index, Register temp,
                                                 bool use64byteVector, Label& L_entry, Label& L_exit) {
  Label L_entry_64, L_entry_96, L_entry_128;
  Label L_entry_160, L_entry_192;

  int size_mat[][6] = {
  /* T_BYTE */ {32, 64, 96, 128, 160, 192},
  /* T_SHORT*/ {16, 32, 48,  64,  80,  96},
  /* T_INT  */ { 8, 16, 24,  32,  40,  48},
  /* T_LONG */ { 4,  8, 12,  16,  20,  24}
  };

  // Case A) Special case for length less than or equal to 32 bytes.
  __ cmpq(count, size_mat[shift][0]);
  __ jccb(Assembler::greater, L_entry_64);
  copy32_masked_avx(to, from, xmm, mask, count, index, temp, shift);
  __ jmp(L_exit);

  // Case B) Special case for length less than or equal to 64 bytes.
  __ BIND(L_entry_64);
  __ cmpq(count, size_mat[shift][1]);
  __ jccb(Assembler::greater, L_entry_96);
  copy64_masked_avx(to, from, xmm, mask, count, index, temp, shift, 0, use64byteVector);
  __ jmp(L_exit);

  // Case C) Special case for length less than or equal to 96 bytes.
  __ BIND(L_entry_96);
  __ cmpq(count, size_mat[shift][2]);
  __ jccb(Assembler::greater, L_entry_128);
  copy64_avx(to, from, index, xmm, false, shift, 0, use64byteVector);
  __ subq(count, 64 >> shift);
  copy32_masked_avx(to, from, xmm, mask, count, index, temp, shift, 64);
  __ jmp(L_exit);

  // Case D) Special case for length less than or equal to 128 bytes.
  __ BIND(L_entry_128);
  __ cmpq(count, size_mat[shift][3]);
  __ jccb(Assembler::greater, L_entry_160);
  copy64_avx(to, from, index, xmm, false, shift, 0, use64byteVector);
  copy32_avx(to, from, index, xmm, shift, 64);
  __ subq(count, 96 >> shift);
  copy32_masked_avx(to, from, xmm, mask, count, index, temp, shift, 96);
  __ jmp(L_exit);

  // Case E) Special case for length less than or equal to 160 bytes.
  __ BIND(L_entry_160);
  __ cmpq(count, size_mat[shift][4]);
  __ jccb(Assembler::greater, L_entry_192);
  copy64_avx(to, from, index, xmm, false, shift, 0, use64byteVector);
  copy64_avx(to, from, index, xmm, false, shift, 64, use64byteVector);
  __ subq(count, 128 >> shift);
  copy32_masked_avx(to, from, xmm, mask, count, index, temp, shift, 128);
  __ jmp(L_exit);

  // Case F) Special case for length less than or equal to 192 bytes.
  __ BIND(L_entry_192);
  __ cmpq(count, size_mat[shift][5]);
  __ jcc(Assembler::greater, L_entry);
  copy64_avx(to, from, index, xmm, false, shift, 0, use64byteVector);
  copy64_avx(to, from, index, xmm, false, shift, 64, use64byteVector);
  copy32_avx(to, from, index, xmm, shift, 128);
  __ subq(count, 160 >> shift);
  copy32_masked_avx(to, from, xmm, mask, count, index, temp, shift, 160);
  __ jmp(L_exit);
}

void StubGenerator::arraycopy_avx3_special_cases_256(XMMRegister xmm, KRegister mask, Register from,
                                                     Register to, Register count, int shift, Register index,
                                                     Register temp, Label& L_exit) {
  Label L_entry_64, L_entry_128, L_entry_192, L_entry_256;

  int size_mat[][4] = {
  /* T_BYTE */ {64, 128, 192, 256},
  /* T_SHORT*/ {32,  64,  96, 128},
  /* T_INT  */ {16,  32,  48,  64},
  /* T_LONG */ { 8,  16,  24,  32}
  };

  assert(MaxVectorSize == 64, "vector length != 64");
  // Case A) Special case for length less than or equal to 64 bytes.
  __ BIND(L_entry_64);
  __ cmpq(count, size_mat[shift][0]);
  __ jccb(Assembler::greater, L_entry_128);
  copy64_masked_avx(to, from, xmm, mask, count, index, temp, shift, 0, true);
  __ jmp(L_exit);

  // Case B) Special case for length less than or equal to 128 bytes.
  __ BIND(L_entry_128);
  __ cmpq(count, size_mat[shift][1]);
  __ jccb(Assembler::greater, L_entry_192);
  copy64_avx(to, from, index, xmm, false, shift, 0, true);
  __ subq(count, 64 >> shift);
  copy64_masked_avx(to, from, xmm, mask, count, index, temp, shift, 64, true);
  __ jmp(L_exit);

  // Case C) Special case for length less than or equal to 192 bytes.
  __ BIND(L_entry_192);
  __ cmpq(count, size_mat[shift][2]);
  __ jcc(Assembler::greater, L_entry_256);
  copy64_avx(to, from, index, xmm, false, shift, 0, true);
  copy64_avx(to, from, index, xmm, false, shift, 64, true);
  __ subq(count, 128 >> shift);
  copy64_masked_avx(to, from, xmm, mask, count, index, temp, shift, 128, true);
  __ jmp(L_exit);

  // Case D) Special case for length less than or equal to 256 bytes.
  __ BIND(L_entry_256);
  copy64_avx(to, from, index, xmm, false, shift, 0, true);
  copy64_avx(to, from, index, xmm, false, shift, 64, true);
  copy64_avx(to, from, index, xmm, false, shift, 128, true);
  __ subq(count, 192 >> shift);
  copy64_masked_avx(to, from, xmm, mask, count, index, temp, shift, 192, true);
  __ jmp(L_exit);
}

void StubGenerator::arraycopy_avx3_special_cases_conjoint(XMMRegister xmm, KRegister mask, Register from,
                                                          Register to, Register start_index, Register end_index,
                                                          Register count, int shift, Register temp,
                                                          bool use64byteVector, Label& L_entry, Label& L_exit) {
  Label L_entry_64, L_entry_96, L_entry_128;
  Label L_entry_160, L_entry_192;
  bool avx3 = (MaxVectorSize > 32) && (VM_Version::avx3_threshold() == 0);

  int size_mat[][6] = {
  /* T_BYTE */ {32, 64, 96, 128, 160, 192},
  /* T_SHORT*/ {16, 32, 48,  64,  80,  96},
  /* T_INT  */ { 8, 16, 24,  32,  40,  48},
  /* T_LONG */ { 4,  8, 12,  16,  20,  24}
  };

  // Case A) Special case for length less than or equal to 32 bytes.
  __ cmpq(count, size_mat[shift][0]);
  __ jccb(Assembler::greater, L_entry_64);
  copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
  __ jmp(L_exit);

  // Case B) Special case for length less than or equal to 64 bytes.
  __ BIND(L_entry_64);
  __ cmpq(count, size_mat[shift][1]);
  __ jccb(Assembler::greater, L_entry_96);
  if (avx3) {
    copy64_masked_avx(to, from, xmm, mask, count, start_index, temp, shift, 0, true);
  } else {
    copy32_avx(to, from, end_index, xmm, shift, -32);
    __ subq(count, 32 >> shift);
    copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
  }
  __ jmp(L_exit);

  // Case C) Special case for length less than or equal to 96 bytes.
  __ BIND(L_entry_96);
  __ cmpq(count, size_mat[shift][2]);
  __ jccb(Assembler::greater, L_entry_128);
  copy64_avx(to, from, end_index, xmm, true, shift, -64, use64byteVector);
  __ subq(count, 64 >> shift);
  copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
  __ jmp(L_exit);

  // Case D) Special case for length less than or equal to 128 bytes.
  __ BIND(L_entry_128);
  __ cmpq(count, size_mat[shift][3]);
  __ jccb(Assembler::greater, L_entry_160);
  copy64_avx(to, from, end_index, xmm, true, shift, -64, use64byteVector);
  copy32_avx(to, from, end_index, xmm, shift, -96);
  __ subq(count, 96 >> shift);
  copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
  __ jmp(L_exit);

  // Case E) Special case for length less than or equal to 160 bytes.
  __ BIND(L_entry_160);
  __ cmpq(count, size_mat[shift][4]);
  __ jccb(Assembler::greater, L_entry_192);
  copy64_avx(to, from, end_index, xmm, true, shift, -64, use64byteVector);
  copy64_avx(to, from, end_index, xmm, true, shift, -128, use64byteVector);
  __ subq(count, 128 >> shift);
  copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
  __ jmp(L_exit);

  // Case F) Special case for length less than or equal to 192 bytes.
  __ BIND(L_entry_192);
  __ cmpq(count, size_mat[shift][5]);
  __ jcc(Assembler::greater, L_entry);
  copy64_avx(to, from, end_index, xmm, true, shift, -64, use64byteVector);
  copy64_avx(to, from, end_index, xmm, true, shift, -128, use64byteVector);
  copy32_avx(to, from, end_index, xmm, shift, -160);
  __ subq(count, 160 >> shift);
  copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
  __ jmp(L_exit);
}

void StubGenerator::copy256_avx3(Register dst, Register src, Register index, XMMRegister xmm1,
                                 XMMRegister xmm2, XMMRegister xmm3, XMMRegister xmm4,
                                 int shift, int offset) {
  if (MaxVectorSize == 64) {
    Address::ScaleFactor scale = (Address::ScaleFactor)(shift);
    __ prefetcht0(Address(src, index, scale, offset + 0x200));
    __ prefetcht0(Address(src, index, scale, offset + 0x240));
    __ prefetcht0(Address(src, index, scale, offset + 0x280));
    __ prefetcht0(Address(src, index, scale, offset + 0x2C0));

    __ prefetcht0(Address(src, index, scale, offset + 0x400));
    __ prefetcht0(Address(src, index, scale, offset + 0x440));
    __ prefetcht0(Address(src, index, scale, offset + 0x480));
    __ prefetcht0(Address(src, index, scale, offset + 0x4C0));

    __ evmovdquq(xmm1, Address(src, index, scale, offset), Assembler::AVX_512bit);
    __ evmovdquq(xmm2, Address(src, index, scale, offset + 0x40), Assembler::AVX_512bit);
    __ evmovdquq(xmm3, Address(src, index, scale, offset + 0x80), Assembler::AVX_512bit);
    __ evmovdquq(xmm4, Address(src, index, scale, offset + 0xC0), Assembler::AVX_512bit);

    __ evmovntdquq(Address(dst, index, scale, offset), xmm1, Assembler::AVX_512bit);
    __ evmovntdquq(Address(dst, index, scale, offset + 0x40), xmm2, Assembler::AVX_512bit);
    __ evmovntdquq(Address(dst, index, scale, offset + 0x80), xmm3, Assembler::AVX_512bit);
    __ evmovntdquq(Address(dst, index, scale, offset + 0xC0), xmm4, Assembler::AVX_512bit);
  }
}

void StubGenerator::copy64_masked_avx(Register dst, Register src, XMMRegister xmm,
                                      KRegister mask, Register length, Register index,
                                      Register temp, int shift, int offset,
                                      bool use64byteVector) {
  BasicType type[] = { T_BYTE, T_SHORT, T_INT, T_LONG};
  assert(MaxVectorSize >= 32, "vector length should be >= 32");
  if (!use64byteVector) {
    copy32_avx(dst, src, index, xmm, shift, offset);
    __ subptr(length, 32 >> shift);
    copy32_masked_avx(dst, src, xmm, mask, length, index, temp, shift, offset + 32);
  } else {
    Address::ScaleFactor scale = (Address::ScaleFactor)(shift);
    assert(MaxVectorSize == 64, "vector length != 64");
    __ mov64(temp, -1L);
    __ bzhiq(temp, temp, length);
    __ kmovql(mask, temp);
    __ evmovdqu(type[shift], mask, xmm, Address(src, index, scale, offset), false, Assembler::AVX_512bit);
    __ evmovdqu(type[shift], mask, Address(dst, index, scale, offset), xmm, true, Assembler::AVX_512bit);
  }
}


void StubGenerator::copy32_masked_avx(Register dst, Register src, XMMRegister xmm,
                                      KRegister mask, Register length, Register index,
                                      Register temp, int shift, int offset) {
  assert(MaxVectorSize >= 32, "vector length should be >= 32");
  BasicType type[] = { T_BYTE, T_SHORT, T_INT, T_LONG};
  Address::ScaleFactor scale = (Address::ScaleFactor)(shift);
  __ mov64(temp, -1L);
  __ bzhiq(temp, temp, length);
  __ kmovql(mask, temp);
  __ evmovdqu(type[shift], mask, xmm, Address(src,
                                              index, scale, offset), false, Assembler::AVX_256bit);
  __ evmovdqu(type[shift], mask, Address(dst, index, scale, offset), xmm, true, Assembler::AVX_256bit);
}


void StubGenerator::copy32_avx(Register dst, Register src, Register index, XMMRegister xmm,
                               int shift, int offset) {
  assert(MaxVectorSize >= 32, "vector length should be >= 32");
  Address::ScaleFactor scale = (Address::ScaleFactor)(shift);
  __ vmovdqu(xmm, Address(src, index, scale, offset));
  __ vmovdqu(Address(dst, index, scale, offset), xmm);
}


void StubGenerator::copy64_avx(Register dst, Register src, Register index, XMMRegister xmm,
                               bool conjoint, int shift, int offset, bool use64byteVector) {
  assert(MaxVectorSize == 64 || MaxVectorSize == 32, "vector length mismatch");
  if (!use64byteVector) {
    if (conjoint) {
      copy32_avx(dst, src, index, xmm, shift, offset + 32);
      copy32_avx(dst, src, index, xmm, shift, offset);
    } else {
      copy32_avx(dst, src, index, xmm, shift, offset);
      copy32_avx(dst, src, index, xmm, shift, offset + 32);
    }
  } else {
    Address::ScaleFactor scale = (Address::ScaleFactor)(shift);
    __ evmovdquq(xmm, Address(src, index, scale, offset), Assembler::AVX_512bit);
    __ evmovdquq(Address(dst, index, scale, offset), xmm, Assembler::AVX_512bit);
  }
}

#endif // COMPILER2_OR_JVMCI


// Arguments:
//   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
//             ignored
//   name    - stub name string
//
// Inputs:
//   c_rarg0   - source array address
//   c_rarg1   - destination array address
//   c_rarg2   - element count, treated as ssize_t, can be zero
//
// If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
// we let the hardware handle it.  The one to eight bytes within words,
// dwords or qwords that span cache line boundaries will still be loaded
// and stored atomically.
//
// Side Effects:
//   disjoint_byte_copy_entry is set to the no-overlap entry point
//   used by generate_conjoint_byte_copy().
//
address StubGenerator::generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
#if COMPILER2_OR_JVMCI
  if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) {
    return generate_disjoint_copy_avx3_masked(entry, "jbyte_disjoint_arraycopy_avx3", 0,
                                              aligned, false, false);
  }
#endif
  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, "StubRoutines", name);
  address start = __ pc();
  DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;

  Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes;
  Label L_copy_byte, L_exit;
  const Register from        = rdi;  // source array address
  const Register to          = rsi;  // destination array address
  const Register count       = rdx;  // elements count
  const Register byte_count  = rcx;
  const Register qword_count = count;
  const Register end_from    = from; // source array end address
  const Register end_to      = to;   // destination array end address
  // End pointers are inclusive, and if count is not zero they point
  // to the last unit copied: end_to[0] := end_from[0]

  __ enter(); // required for proper stackwalking of RuntimeStub frame
  assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int.

  if (entry != nullptr) {
    *entry = __ pc();
    // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
    BLOCK_COMMENT("Entry:");
  }

  setup_arg_regs(); // from => rdi, to => rsi, count => rdx
                    // r9 and r10 may be used to save non-volatile registers

  {
    // UnsafeMemoryAccess page error: continue after unsafe access
    UnsafeMemoryAccessMark umam(this, !aligned, true);
    // 'from', 'to' and 'count' are now valid
    __ movptr(byte_count, count);
    __ shrptr(count, 3); // count => qword_count

    // Copy from low to high addresses.  Use 'to' as scratch.
    __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
    __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
    __ negptr(qword_count); // make the count negative
    __ jmp(L_copy_bytes);

    // Copy trailing qwords
    __ BIND(L_copy_8_bytes);
    __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
    __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
    __ increment(qword_count);
    __ jcc(Assembler::notZero, L_copy_8_bytes);

    // Check for and copy trailing dword
    __ BIND(L_copy_4_bytes);
    __ testl(byte_count, 4);
    __ jccb(Assembler::zero, L_copy_2_bytes);
    __ movl(rax, Address(end_from, 8));
    __ movl(Address(end_to, 8), rax);

    __ addptr(end_from, 4);
    __ addptr(end_to, 4);

    // Check for and copy trailing word
    __ BIND(L_copy_2_bytes);
    __ testl(byte_count, 2);
    __ jccb(Assembler::zero, L_copy_byte);
    __ movw(rax, Address(end_from, 8));
    __ movw(Address(end_to, 8), rax);

    __ addptr(end_from, 2);
    __ addptr(end_to, 2);

    // Check for and copy trailing byte
    __ BIND(L_copy_byte);
    __ testl(byte_count, 1);
    __ jccb(Assembler::zero, L_exit);
    __ movb(rax, Address(end_from, 8));
    __ movb(Address(end_to, 8), rax);
  }
  __ BIND(L_exit);
  address ucme_exit_pc = __ pc();
  restore_arg_regs();
  INC_COUNTER_NP(SharedRuntime::_jbyte_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
  __ xorptr(rax, rax); // return 0
  __ vzeroupper();
  __ leave(); // required for proper stackwalking of RuntimeStub frame
  __ ret(0);

  {
    UnsafeMemoryAccessMark umam(this, !aligned, false, ucme_exit_pc);
    // Copy in multi-byte chunks
    copy_bytes_forward(end_from, end_to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, T_BYTE);
    __ jmp(L_copy_4_bytes);
  }
  return start;
}


// Arguments:
//   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
//             ignored
//   name    - stub name string
//
// Inputs:
//   c_rarg0   - source array address
//   c_rarg1   - destination array address
//   c_rarg2   - element count, treated as ssize_t, can be zero
//
// If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
// we let the hardware handle it.  The one to eight bytes within words,
// dwords or qwords that span cache line boundaries will still be loaded
// and stored atomically.
//
address StubGenerator::generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
                                                   address* entry, const char *name) {
#if COMPILER2_OR_JVMCI
  if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) {
    return generate_conjoint_copy_avx3_masked(entry, "jbyte_conjoint_arraycopy_avx3", 0,
                                              nooverlap_target, aligned, false, false);
  }
#endif
  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, "StubRoutines", name);
  address start = __ pc();
  DecoratorSet decorators = IN_HEAP | IS_ARRAY;

  Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes;
  const Register from        = rdi;  // source array address
  const Register to          = rsi;  // destination array address
  const Register count       = rdx;  // elements count
  const Register byte_count  = rcx;
  const Register qword_count = count;

  __ enter(); // required for proper stackwalking of RuntimeStub frame
  assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int.

  if (entry != nullptr) {
    *entry = __ pc();
    // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
    BLOCK_COMMENT("Entry:");
  }

  array_overlap_test(nooverlap_target, Address::times_1);
  setup_arg_regs(); // from => rdi, to => rsi, count => rdx
                    // r9 and r10 may be used to save non-volatile registers

  {
    // UnsafeMemoryAccess page error: continue after unsafe access
    UnsafeMemoryAccessMark umam(this, !aligned, true);
    // 'from', 'to' and 'count' are now valid
    __ movptr(byte_count, count);
    __ shrptr(count, 3); // count => qword_count

    // Copy from high to low addresses.

    // Check for and copy trailing byte
    __ testl(byte_count, 1);
    __ jcc(Assembler::zero, L_copy_2_bytes);
    __ movb(rax, Address(from, byte_count, Address::times_1, -1));
    __ movb(Address(to, byte_count, Address::times_1, -1), rax);
    __ decrement(byte_count); // Adjust for possible trailing word

    // Check for and copy trailing word
    __ BIND(L_copy_2_bytes);
    __ testl(byte_count, 2);
    __ jcc(Assembler::zero, L_copy_4_bytes);
    __ movw(rax, Address(from, byte_count, Address::times_1, -2));
    __ movw(Address(to, byte_count, Address::times_1, -2), rax);

    // Check for and copy trailing dword
    __ BIND(L_copy_4_bytes);
    __ testl(byte_count, 4);
    __ jcc(Assembler::zero, L_copy_bytes);
    __ movl(rax, Address(from, qword_count, Address::times_8));
    __ movl(Address(to, qword_count, Address::times_8), rax);
    __ jmp(L_copy_bytes);

    // Copy trailing qwords
    __ BIND(L_copy_8_bytes);
    __ movq(rax, Address(from, qword_count, Address::times_8, -8));
    __ movq(Address(to, qword_count, Address::times_8, -8), rax);
    __ decrement(qword_count);
    __ jcc(Assembler::notZero, L_copy_8_bytes);
  }
  restore_arg_regs();
  INC_COUNTER_NP(SharedRuntime::_jbyte_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
  __ xorptr(rax, rax); // return 0
  __ vzeroupper();
  __ leave(); // required for proper stackwalking of RuntimeStub frame
  __ ret(0);

  {
    // UnsafeMemoryAccess page error: continue after unsafe access
    UnsafeMemoryAccessMark umam(this, !aligned, true);
    // Copy in multi-byte chunks
    copy_bytes_backward(from, to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, T_BYTE);
  }
  restore_arg_regs();
INC_COUNTER_NP(SharedRuntime::_jbyte_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free 1486 __ xorptr(rax, rax); // return 0 1487 __ vzeroupper(); 1488 __ leave(); // required for proper stackwalking of RuntimeStub frame 1489 __ ret(0); 1490 1491 return start; 1492 } 1493 1494 1495 // Arguments: 1496 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1497 // ignored 1498 // name - stub name string 1499 // 1500 // Inputs: 1501 // c_rarg0 - source array address 1502 // c_rarg1 - destination array address 1503 // c_rarg2 - element count, treated as ssize_t, can be zero 1504 // 1505 // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we 1506 // let the hardware handle it. The two or four words within dwords 1507 // or qwords that span cache line boundaries will still be loaded 1508 // and stored atomically. 1509 // 1510 // Side Effects: 1511 // disjoint_short_copy_entry is set to the no-overlap entry point 1512 // used by generate_conjoint_short_copy(). 1513 // 1514 address StubGenerator::generate_disjoint_short_copy(bool aligned, address *entry, const char *name) { 1515 #if COMPILER2_OR_JVMCI 1516 if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) { 1517 return generate_disjoint_copy_avx3_masked(entry, "jshort_disjoint_arraycopy_avx3", 1, 1518 aligned, false, false); 1519 } 1520 #endif 1521 1522 __ align(CodeEntryAlignment); 1523 StubCodeMark mark(this, "StubRoutines", name); 1524 address start = __ pc(); 1525 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT; 1526 1527 Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes,L_copy_2_bytes,L_exit; 1528 const Register from = rdi; // source array address 1529 const Register to = rsi; // destination array address 1530 const Register count = rdx; // elements count 1531 const Register word_count = rcx; 1532 const Register qword_count = count; 1533 const Register end_from = from; // source array end address 1534 const Register end_to = to; // destination array end address 1535 // End pointers are inclusive, and if count is not zero they point 1536 // to the last unit copied: end_to[0] := end_from[0] 1537 1538 __ enter(); // required for proper stackwalking of RuntimeStub frame 1539 assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int. 1540 1541 if (entry != nullptr) { 1542 *entry = __ pc(); 1543 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 1544 BLOCK_COMMENT("Entry:"); 1545 } 1546 1547 setup_arg_regs(); // from => rdi, to => rsi, count => rdx 1548 // r9 and r10 may be used to save non-volatile registers 1549 1550 { 1551 // UnsafeMemoryAccess page error: continue after unsafe access 1552 UnsafeMemoryAccessMark umam(this, !aligned, true); 1553 // 'from', 'to' and 'count' are now valid 1554 __ movptr(word_count, count); 1555 __ shrptr(count, 2); // count => qword_count 1556 1557 // Copy from low to high addresses. Use 'to' as scratch. 
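  // The forward loops below use the usual negative-index idiom: 'end_from'
  // and 'end_to' are made to point at the last qword, 'qword_count' is
  // negated, and each trailing-qword iteration loads/stores at
  // [end + 8*i + 8] and increments i until it reaches zero. Roughly
  // (a sketch only; 'qn' and 'i' are illustrative names):
  //   for (ptrdiff_t i = -qn; i != 0; i++)
  //     ((uint64_t*)end_to)[i + 1] = ((uint64_t*)end_from)[i + 1];
  // This keeps the loop body to one increment and one branch besides the move.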
1558 __ lea(end_from, Address(from, qword_count, Address::times_8, -8)); 1559 __ lea(end_to, Address(to, qword_count, Address::times_8, -8)); 1560 __ negptr(qword_count); 1561 __ jmp(L_copy_bytes); 1562 1563 // Copy trailing qwords 1564 __ BIND(L_copy_8_bytes); 1565 __ movq(rax, Address(end_from, qword_count, Address::times_8, 8)); 1566 __ movq(Address(end_to, qword_count, Address::times_8, 8), rax); 1567 __ increment(qword_count); 1568 __ jcc(Assembler::notZero, L_copy_8_bytes); 1569 1570 // Original 'dest' is trashed, so we can't use it as a 1571 // base register for a possible trailing word copy 1572 1573 // Check for and copy trailing dword 1574 __ BIND(L_copy_4_bytes); 1575 __ testl(word_count, 2); 1576 __ jccb(Assembler::zero, L_copy_2_bytes); 1577 __ movl(rax, Address(end_from, 8)); 1578 __ movl(Address(end_to, 8), rax); 1579 1580 __ addptr(end_from, 4); 1581 __ addptr(end_to, 4); 1582 1583 // Check for and copy trailing word 1584 __ BIND(L_copy_2_bytes); 1585 __ testl(word_count, 1); 1586 __ jccb(Assembler::zero, L_exit); 1587 __ movw(rax, Address(end_from, 8)); 1588 __ movw(Address(end_to, 8), rax); 1589 } 1590 __ BIND(L_exit); 1591 address ucme_exit_pc = __ pc(); 1592 restore_arg_regs(); 1593 INC_COUNTER_NP(SharedRuntime::_jshort_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free 1594 __ xorptr(rax, rax); // return 0 1595 __ vzeroupper(); 1596 __ leave(); // required for proper stackwalking of RuntimeStub frame 1597 __ ret(0); 1598 1599 { 1600 UnsafeMemoryAccessMark umam(this, !aligned, false, ucme_exit_pc); 1601 // Copy in multi-bytes chunks 1602 copy_bytes_forward(end_from, end_to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, T_SHORT); 1603 __ jmp(L_copy_4_bytes); 1604 } 1605 1606 return start; 1607 } 1608 1609 1610 address StubGenerator::generate_fill(BasicType t, bool aligned, const char *name) { 1611 __ align(CodeEntryAlignment); 1612 StubCodeMark mark(this, "StubRoutines", name); 1613 address start = __ pc(); 1614 1615 BLOCK_COMMENT("Entry:"); 1616 1617 const Register to = c_rarg0; // destination array address 1618 const Register value = c_rarg1; // value 1619 const Register count = c_rarg2; // elements count 1620 __ mov(r11, count); 1621 1622 __ enter(); // required for proper stackwalking of RuntimeStub frame 1623 1624 { 1625 // Add set memory mark to protect against unsafe accesses faulting 1626 UnsafeMemoryAccessMark umam(this, ((t == T_BYTE) && !aligned), true); 1627 __ generate_fill(t, aligned, to, value, r11, rax, xmm0); 1628 } 1629 1630 __ vzeroupper(); 1631 __ leave(); // required for proper stackwalking of RuntimeStub frame 1632 __ ret(0); 1633 1634 return start; 1635 } 1636 1637 1638 // Arguments: 1639 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1640 // ignored 1641 // name - stub name string 1642 // 1643 // Inputs: 1644 // c_rarg0 - source array address 1645 // c_rarg1 - destination array address 1646 // c_rarg2 - element count, treated as ssize_t, can be zero 1647 // 1648 // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we 1649 // let the hardware handle it. The two or four words within dwords 1650 // or qwords that span cache line boundaries will still be loaded 1651 // and stored atomically. 
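//
// As in the conjoint byte copy above, the tail (a possible odd jshort and a
// possible trailing dword) is copied from the high end first, and the
// remaining qwords are then copied from high to low addresses, which keeps
// the copy correct when the regions overlap with the destination above the
// source.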
1652 // 1653 address StubGenerator::generate_conjoint_short_copy(bool aligned, address nooverlap_target, 1654 address *entry, const char *name) { 1655 #if COMPILER2_OR_JVMCI 1656 if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) { 1657 return generate_conjoint_copy_avx3_masked(entry, "jshort_conjoint_arraycopy_avx3", 1, 1658 nooverlap_target, aligned, false, false); 1659 } 1660 #endif 1661 __ align(CodeEntryAlignment); 1662 StubCodeMark mark(this, "StubRoutines", name); 1663 address start = __ pc(); 1664 DecoratorSet decorators = IN_HEAP | IS_ARRAY; 1665 1666 Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes; 1667 const Register from = rdi; // source array address 1668 const Register to = rsi; // destination array address 1669 const Register count = rdx; // elements count 1670 const Register word_count = rcx; 1671 const Register qword_count = count; 1672 1673 __ enter(); // required for proper stackwalking of RuntimeStub frame 1674 assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int. 1675 1676 if (entry != nullptr) { 1677 *entry = __ pc(); 1678 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 1679 BLOCK_COMMENT("Entry:"); 1680 } 1681 1682 array_overlap_test(nooverlap_target, Address::times_2); 1683 setup_arg_regs(); // from => rdi, to => rsi, count => rdx 1684 // r9 and r10 may be used to save non-volatile registers 1685 1686 { 1687 // UnsafeMemoryAccess page error: continue after unsafe access 1688 UnsafeMemoryAccessMark umam(this, !aligned, true); 1689 // 'from', 'to' and 'count' are now valid 1690 __ movptr(word_count, count); 1691 __ shrptr(count, 2); // count => qword_count 1692 1693 // Copy from high to low addresses. Use 'to' as scratch. 1694 1695 // Check for and copy trailing word 1696 __ testl(word_count, 1); 1697 __ jccb(Assembler::zero, L_copy_4_bytes); 1698 __ movw(rax, Address(from, word_count, Address::times_2, -2)); 1699 __ movw(Address(to, word_count, Address::times_2, -2), rax); 1700 1701 // Check for and copy trailing dword 1702 __ BIND(L_copy_4_bytes); 1703 __ testl(word_count, 2); 1704 __ jcc(Assembler::zero, L_copy_bytes); 1705 __ movl(rax, Address(from, qword_count, Address::times_8)); 1706 __ movl(Address(to, qword_count, Address::times_8), rax); 1707 __ jmp(L_copy_bytes); 1708 1709 // Copy trailing qwords 1710 __ BIND(L_copy_8_bytes); 1711 __ movq(rax, Address(from, qword_count, Address::times_8, -8)); 1712 __ movq(Address(to, qword_count, Address::times_8, -8), rax); 1713 __ decrement(qword_count); 1714 __ jcc(Assembler::notZero, L_copy_8_bytes); 1715 } 1716 restore_arg_regs(); 1717 INC_COUNTER_NP(SharedRuntime::_jshort_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free 1718 __ xorptr(rax, rax); // return 0 1719 __ vzeroupper(); 1720 __ leave(); // required for proper stackwalking of RuntimeStub frame 1721 __ ret(0); 1722 1723 { 1724 // UnsafeMemoryAccess page error: continue after unsafe access 1725 UnsafeMemoryAccessMark umam(this, !aligned, true); 1726 // Copy in multi-bytes chunks 1727 copy_bytes_backward(from, to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, T_SHORT); 1728 } 1729 restore_arg_regs(); 1730 INC_COUNTER_NP(SharedRuntime::_jshort_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free 1731 __ xorptr(rax, rax); // return 0 1732 __ vzeroupper(); 1733 __ leave(); // required for proper stackwalking of RuntimeStub frame 1734 __ ret(0); 1735 1736 return start; 1737 } 1738 1739 1740 // Arguments: 1741 // aligned - true => 
Input and output aligned on a HeapWord == 8-byte boundary 1742 // ignored 1743 // is_oop - true => oop array, so generate store check code 1744 // name - stub name string 1745 // 1746 // Inputs: 1747 // c_rarg0 - source array address 1748 // c_rarg1 - destination array address 1749 // c_rarg2 - element count, treated as ssize_t, can be zero 1750 // 1751 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1752 // the hardware handle it. The two dwords within qwords that span 1753 // cache line boundaries will still be loaded and stored atomically. 1754 // 1755 // Side Effects: 1756 // disjoint_int_copy_entry is set to the no-overlap entry point 1757 // used by generate_conjoint_int_oop_copy(). 1758 // 1759 address StubGenerator::generate_disjoint_int_oop_copy(bool aligned, bool is_oop, address* entry, 1760 const char *name, bool dest_uninitialized) { 1761 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1762 #if COMPILER2_OR_JVMCI 1763 if ((!is_oop || bs->supports_avx3_masked_arraycopy()) && VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) { 1764 return generate_disjoint_copy_avx3_masked(entry, "jint_disjoint_arraycopy_avx3", 2, 1765 aligned, is_oop, dest_uninitialized); 1766 } 1767 #endif 1768 1769 __ align(CodeEntryAlignment); 1770 StubCodeMark mark(this, "StubRoutines", name); 1771 address start = __ pc(); 1772 1773 Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_exit; 1774 const Register from = rdi; // source array address 1775 const Register to = rsi; // destination array address 1776 const Register count = rdx; // elements count 1777 const Register dword_count = rcx; 1778 const Register qword_count = count; 1779 const Register end_from = from; // source array end address 1780 const Register end_to = to; // destination array end address 1781 // End pointers are inclusive, and if count is not zero they point 1782 // to the last unit copied: end_to[0] := end_from[0] 1783 1784 __ enter(); // required for proper stackwalking of RuntimeStub frame 1785 assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int. 1786 1787 if (entry != nullptr) { 1788 *entry = __ pc(); 1789 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 1790 BLOCK_COMMENT("Entry:"); 1791 } 1792 1793 setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx 1794 // r9 is used to save r15_thread 1795 1796 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT; 1797 if (dest_uninitialized) { 1798 decorators |= IS_DEST_UNINITIALIZED; 1799 } 1800 if (aligned) { 1801 decorators |= ARRAYCOPY_ALIGNED; 1802 } 1803 1804 BasicType type = is_oop ? T_OBJECT : T_INT; 1805 bs->arraycopy_prologue(_masm, decorators, type, from, to, count); 1806 1807 { 1808 // UnsafeMemoryAccess page error: continue after unsafe access 1809 UnsafeMemoryAccessMark umam(this, !is_oop && !aligned, true); 1810 // 'from', 'to' and 'count' are now valid 1811 __ movptr(dword_count, count); 1812 __ shrptr(count, 1); // count => qword_count 1813 1814 // Copy from low to high addresses. Use 'to' as scratch. 
1815 __ lea(end_from, Address(from, qword_count, Address::times_8, -8)); 1816 __ lea(end_to, Address(to, qword_count, Address::times_8, -8)); 1817 __ negptr(qword_count); 1818 __ jmp(L_copy_bytes); 1819 1820 // Copy trailing qwords 1821 __ BIND(L_copy_8_bytes); 1822 __ movq(rax, Address(end_from, qword_count, Address::times_8, 8)); 1823 __ movq(Address(end_to, qword_count, Address::times_8, 8), rax); 1824 __ increment(qword_count); 1825 __ jcc(Assembler::notZero, L_copy_8_bytes); 1826 1827 // Check for and copy trailing dword 1828 __ BIND(L_copy_4_bytes); 1829 __ testl(dword_count, 1); // Only byte test since the value is 0 or 1 1830 __ jccb(Assembler::zero, L_exit); 1831 __ movl(rax, Address(end_from, 8)); 1832 __ movl(Address(end_to, 8), rax); 1833 } 1834 __ BIND(L_exit); 1835 address ucme_exit_pc = __ pc(); 1836 bs->arraycopy_epilogue(_masm, decorators, type, from, to, dword_count); 1837 restore_arg_regs_using_thread(); 1838 INC_COUNTER_NP(SharedRuntime::_jint_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free 1839 __ vzeroupper(); 1840 __ xorptr(rax, rax); // return 0 1841 __ leave(); // required for proper stackwalking of RuntimeStub frame 1842 __ ret(0); 1843 1844 { 1845 UnsafeMemoryAccessMark umam(this, !is_oop && !aligned, false, ucme_exit_pc); 1846 // Copy in multi-bytes chunks 1847 copy_bytes_forward(end_from, end_to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, is_oop ? T_OBJECT : T_INT); 1848 __ jmp(L_copy_4_bytes); 1849 } 1850 1851 return start; 1852 } 1853 1854 1855 // Arguments: 1856 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1857 // ignored 1858 // is_oop - true => oop array, so generate store check code 1859 // name - stub name string 1860 // 1861 // Inputs: 1862 // c_rarg0 - source array address 1863 // c_rarg1 - destination array address 1864 // c_rarg2 - element count, treated as ssize_t, can be zero 1865 // 1866 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1867 // the hardware handle it. The two dwords within qwords that span 1868 // cache line boundaries will still be loaded and stored atomically. 1869 // 1870 address StubGenerator::generate_conjoint_int_oop_copy(bool aligned, bool is_oop, address nooverlap_target, 1871 address *entry, const char *name, 1872 bool dest_uninitialized) { 1873 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1874 #if COMPILER2_OR_JVMCI 1875 if ((!is_oop || bs->supports_avx3_masked_arraycopy()) && VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) { 1876 return generate_conjoint_copy_avx3_masked(entry, "jint_conjoint_arraycopy_avx3", 2, 1877 nooverlap_target, aligned, is_oop, dest_uninitialized); 1878 } 1879 #endif 1880 __ align(CodeEntryAlignment); 1881 StubCodeMark mark(this, "StubRoutines", name); 1882 address start = __ pc(); 1883 1884 Label L_copy_bytes, L_copy_8_bytes, L_exit; 1885 const Register from = rdi; // source array address 1886 const Register to = rsi; // destination array address 1887 const Register count = rdx; // elements count 1888 const Register dword_count = rcx; 1889 const Register qword_count = count; 1890 1891 __ enter(); // required for proper stackwalking of RuntimeStub frame 1892 assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int. 
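  // (assert_clean_int is essentially a debug-build sanity check: it verifies
  // that the 32-bit element count has been properly extended into the full
  // 64-bit register, so the count can safely participate in the 64-bit
  // address arithmetic below.)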
1893 1894 if (entry != nullptr) { 1895 *entry = __ pc(); 1896 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 1897 BLOCK_COMMENT("Entry:"); 1898 } 1899 1900 array_overlap_test(nooverlap_target, Address::times_4); 1901 setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx 1902 // r9 is used to save r15_thread 1903 1904 DecoratorSet decorators = IN_HEAP | IS_ARRAY; 1905 if (dest_uninitialized) { 1906 decorators |= IS_DEST_UNINITIALIZED; 1907 } 1908 if (aligned) { 1909 decorators |= ARRAYCOPY_ALIGNED; 1910 } 1911 1912 BasicType type = is_oop ? T_OBJECT : T_INT; 1913 // no registers are destroyed by this call 1914 bs->arraycopy_prologue(_masm, decorators, type, from, to, count); 1915 1916 assert_clean_int(count, rax); // Make sure 'count' is clean int. 1917 { 1918 // UnsafeMemoryAccess page error: continue after unsafe access 1919 UnsafeMemoryAccessMark umam(this, !is_oop && !aligned, true); 1920 // 'from', 'to' and 'count' are now valid 1921 __ movptr(dword_count, count); 1922 __ shrptr(count, 1); // count => qword_count 1923 1924 // Copy from high to low addresses. Use 'to' as scratch. 1925 1926 // Check for and copy trailing dword 1927 __ testl(dword_count, 1); 1928 __ jcc(Assembler::zero, L_copy_bytes); 1929 __ movl(rax, Address(from, dword_count, Address::times_4, -4)); 1930 __ movl(Address(to, dword_count, Address::times_4, -4), rax); 1931 __ jmp(L_copy_bytes); 1932 1933 // Copy trailing qwords 1934 __ BIND(L_copy_8_bytes); 1935 __ movq(rax, Address(from, qword_count, Address::times_8, -8)); 1936 __ movq(Address(to, qword_count, Address::times_8, -8), rax); 1937 __ decrement(qword_count); 1938 __ jcc(Assembler::notZero, L_copy_8_bytes); 1939 } 1940 if (is_oop) { 1941 __ jmp(L_exit); 1942 } 1943 restore_arg_regs_using_thread(); 1944 INC_COUNTER_NP(SharedRuntime::_jint_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free 1945 __ xorptr(rax, rax); // return 0 1946 __ vzeroupper(); 1947 __ leave(); // required for proper stackwalking of RuntimeStub frame 1948 __ ret(0); 1949 1950 { 1951 // UnsafeMemoryAccess page error: continue after unsafe access 1952 UnsafeMemoryAccessMark umam(this, !is_oop && !aligned, true); 1953 // Copy in multi-bytes chunks 1954 copy_bytes_backward(from, to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, is_oop ? T_OBJECT : T_INT); 1955 } 1956 1957 __ BIND(L_exit); 1958 bs->arraycopy_epilogue(_masm, decorators, type, from, to, dword_count); 1959 restore_arg_regs_using_thread(); 1960 INC_COUNTER_NP(SharedRuntime::_jint_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free 1961 __ xorptr(rax, rax); // return 0 1962 __ vzeroupper(); 1963 __ leave(); // required for proper stackwalking of RuntimeStub frame 1964 __ ret(0); 1965 1966 return start; 1967 } 1968 1969 1970 // Arguments: 1971 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1972 // ignored 1973 // is_oop - true => oop array, so generate store check code 1974 // name - stub name string 1975 // 1976 // Inputs: 1977 // c_rarg0 - source array address 1978 // c_rarg1 - destination array address 1979 // c_rarg2 - element count, treated as ssize_t, can be zero 1980 // 1981 // Side Effects: 1982 // disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the 1983 // no-overlap entry point used by generate_conjoint_long_oop_copy(). 
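//
// When is_oop is true, the element moves below go through the current
// BarrierSetAssembler (copy_load_at / copy_store_at), bracketed by
// arraycopy_prologue / arraycopy_epilogue, so any GC barriers are applied.
// Per element this is conceptually (a sketch only; barrier_load and
// barrier_store are illustrative names, not real calls):
//
//   oop v = barrier_load(&from[i]);   // bs->copy_load_at
//   barrier_store(&to[i], v);         // bs->copy_store_at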
1984 // 1985 address StubGenerator::generate_disjoint_long_oop_copy(bool aligned, bool is_oop, address *entry, 1986 const char *name, bool dest_uninitialized) { 1987 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1988 #if COMPILER2_OR_JVMCI 1989 if ((!is_oop || bs->supports_avx3_masked_arraycopy()) && VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) { 1990 return generate_disjoint_copy_avx3_masked(entry, "jlong_disjoint_arraycopy_avx3", 3, 1991 aligned, is_oop, dest_uninitialized); 1992 } 1993 #endif 1994 __ align(CodeEntryAlignment); 1995 StubCodeMark mark(this, "StubRoutines", name); 1996 address start = __ pc(); 1997 1998 Label L_copy_bytes, L_copy_8_bytes, L_exit; 1999 const Register from = rdi; // source array address 2000 const Register to = rsi; // destination array address 2001 const Register qword_count = rdx; // elements count 2002 const Register end_from = from; // source array end address 2003 const Register end_to = rcx; // destination array end address 2004 const Register saved_count = r11; 2005 // End pointers are inclusive, and if count is not zero they point 2006 // to the last unit copied: end_to[0] := end_from[0] 2007 2008 __ enter(); // required for proper stackwalking of RuntimeStub frame 2009 // Save no-overlap entry point for generate_conjoint_long_oop_copy() 2010 assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int. 2011 2012 if (entry != nullptr) { 2013 *entry = __ pc(); 2014 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 2015 BLOCK_COMMENT("Entry:"); 2016 } 2017 2018 setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx 2019 // r9 is used to save r15_thread 2020 // 'from', 'to' and 'qword_count' are now valid 2021 2022 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT; 2023 if (dest_uninitialized) { 2024 decorators |= IS_DEST_UNINITIALIZED; 2025 } 2026 if (aligned) { 2027 decorators |= ARRAYCOPY_ALIGNED; 2028 } 2029 2030 BasicType type = is_oop ? T_OBJECT : T_LONG; 2031 bs->arraycopy_prologue(_masm, decorators, type, from, to, qword_count); 2032 { 2033 // UnsafeMemoryAccess page error: continue after unsafe access 2034 UnsafeMemoryAccessMark umam(this, !is_oop && !aligned, true); 2035 2036 // Copy from low to high addresses. Use 'to' as scratch. 
2037 __ lea(end_from, Address(from, qword_count, Address::times_8, -8)); 2038 __ lea(end_to, Address(to, qword_count, Address::times_8, -8)); 2039 __ negptr(qword_count); 2040 __ jmp(L_copy_bytes); 2041 2042 // Copy trailing qwords 2043 __ BIND(L_copy_8_bytes); 2044 bs->copy_load_at(_masm, decorators, type, 8, 2045 rax, Address(end_from, qword_count, Address::times_8, 8), 2046 r10); 2047 bs->copy_store_at(_masm, decorators, type, 8, 2048 Address(end_to, qword_count, Address::times_8, 8), rax, 2049 r10); 2050 __ increment(qword_count); 2051 __ jcc(Assembler::notZero, L_copy_8_bytes); 2052 } 2053 if (is_oop) { 2054 __ jmp(L_exit); 2055 } else { 2056 restore_arg_regs_using_thread(); 2057 INC_COUNTER_NP(SharedRuntime::_jlong_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free 2058 __ xorptr(rax, rax); // return 0 2059 __ vzeroupper(); 2060 __ leave(); // required for proper stackwalking of RuntimeStub frame 2061 __ ret(0); 2062 } 2063 2064 { 2065 // UnsafeMemoryAccess page error: continue after unsafe access 2066 UnsafeMemoryAccessMark umam(this, !is_oop && !aligned, true); 2067 // Copy in multi-bytes chunks 2068 copy_bytes_forward(end_from, end_to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, is_oop ? T_OBJECT : T_LONG); 2069 } 2070 2071 __ BIND(L_exit); 2072 bs->arraycopy_epilogue(_masm, decorators, type, from, to, qword_count); 2073 restore_arg_regs_using_thread(); 2074 INC_COUNTER_NP(is_oop ? SharedRuntime::_oop_array_copy_ctr : 2075 SharedRuntime::_jlong_array_copy_ctr, 2076 rscratch1); // Update counter after rscratch1 is free 2077 __ vzeroupper(); 2078 __ xorptr(rax, rax); // return 0 2079 __ leave(); // required for proper stackwalking of RuntimeStub frame 2080 __ ret(0); 2081 2082 return start; 2083 } 2084 2085 2086 // Arguments: 2087 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 2088 // ignored 2089 // is_oop - true => oop array, so generate store check code 2090 // name - stub name string 2091 // 2092 // Inputs: 2093 // c_rarg0 - source array address 2094 // c_rarg1 - destination array address 2095 // c_rarg2 - element count, treated as ssize_t, can be zero 2096 // 2097 address StubGenerator::generate_conjoint_long_oop_copy(bool aligned, bool is_oop, address nooverlap_target, 2098 address *entry, const char *name, 2099 bool dest_uninitialized) { 2100 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 2101 #if COMPILER2_OR_JVMCI 2102 if ((!is_oop || bs->supports_avx3_masked_arraycopy()) && VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) { 2103 return generate_conjoint_copy_avx3_masked(entry, "jlong_conjoint_arraycopy_avx3", 3, 2104 nooverlap_target, aligned, is_oop, dest_uninitialized); 2105 } 2106 #endif 2107 __ align(CodeEntryAlignment); 2108 StubCodeMark mark(this, "StubRoutines", name); 2109 address start = __ pc(); 2110 2111 Label L_copy_bytes, L_copy_8_bytes, L_exit; 2112 const Register from = rdi; // source array address 2113 const Register to = rsi; // destination array address 2114 const Register qword_count = rdx; // elements count 2115 const Register saved_count = rcx; 2116 2117 __ enter(); // required for proper stackwalking of RuntimeStub frame 2118 assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int. 
2119 2120 if (entry != nullptr) { 2121 *entry = __ pc(); 2122 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 2123 BLOCK_COMMENT("Entry:"); 2124 } 2125 2126 array_overlap_test(nooverlap_target, Address::times_8); 2127 setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx 2128 // r9 is used to save r15_thread 2129 // 'from', 'to' and 'qword_count' are now valid 2130 2131 DecoratorSet decorators = IN_HEAP | IS_ARRAY; 2132 if (dest_uninitialized) { 2133 decorators |= IS_DEST_UNINITIALIZED; 2134 } 2135 if (aligned) { 2136 decorators |= ARRAYCOPY_ALIGNED; 2137 } 2138 2139 BasicType type = is_oop ? T_OBJECT : T_LONG; 2140 bs->arraycopy_prologue(_masm, decorators, type, from, to, qword_count); 2141 { 2142 // UnsafeMemoryAccess page error: continue after unsafe access 2143 UnsafeMemoryAccessMark umam(this, !is_oop && !aligned, true); 2144 2145 __ jmp(L_copy_bytes); 2146 2147 // Copy trailing qwords 2148 __ BIND(L_copy_8_bytes); 2149 bs->copy_load_at(_masm, decorators, type, 8, 2150 rax, Address(from, qword_count, Address::times_8, -8), 2151 r10); 2152 bs->copy_store_at(_masm, decorators, type, 8, 2153 Address(to, qword_count, Address::times_8, -8), rax, 2154 r10); 2155 __ decrement(qword_count); 2156 __ jcc(Assembler::notZero, L_copy_8_bytes); 2157 } 2158 if (is_oop) { 2159 __ jmp(L_exit); 2160 } else { 2161 restore_arg_regs_using_thread(); 2162 INC_COUNTER_NP(SharedRuntime::_jlong_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free 2163 __ xorptr(rax, rax); // return 0 2164 __ vzeroupper(); 2165 __ leave(); // required for proper stackwalking of RuntimeStub frame 2166 __ ret(0); 2167 } 2168 { 2169 // UnsafeMemoryAccess page error: continue after unsafe access 2170 UnsafeMemoryAccessMark umam(this, !is_oop && !aligned, true); 2171 2172 // Copy in multi-bytes chunks 2173 copy_bytes_backward(from, to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, is_oop ? T_OBJECT : T_LONG); 2174 } 2175 __ BIND(L_exit); 2176 bs->arraycopy_epilogue(_masm, decorators, type, from, to, qword_count); 2177 restore_arg_regs_using_thread(); 2178 INC_COUNTER_NP(is_oop ? SharedRuntime::_oop_array_copy_ctr : 2179 SharedRuntime::_jlong_array_copy_ctr, 2180 rscratch1); // Update counter after rscratch1 is free 2181 __ vzeroupper(); 2182 __ xorptr(rax, rax); // return 0 2183 __ leave(); // required for proper stackwalking of RuntimeStub frame 2184 __ ret(0); 2185 2186 return start; 2187 } 2188 2189 2190 // Helper for generating a dynamic type check. 2191 // Smashes no registers. 2192 void StubGenerator::generate_type_check(Register sub_klass, 2193 Register super_check_offset, 2194 Register super_klass, 2195 Label& L_success) { 2196 assert_different_registers(sub_klass, super_check_offset, super_klass); 2197 2198 BLOCK_COMMENT("type_check:"); 2199 2200 Label L_miss; 2201 2202 __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, nullptr, 2203 super_check_offset); 2204 __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, nullptr); 2205 2206 // Fall through on failure! 
2207 __ BIND(L_miss); 2208 } 2209 2210 // 2211 // Generate checkcasting array copy stub 2212 // 2213 // Input: 2214 // c_rarg0 - source array address 2215 // c_rarg1 - destination array address 2216 // c_rarg2 - element count, treated as ssize_t, can be zero 2217 // c_rarg3 - size_t ckoff (super_check_offset) 2218 // not Win64 2219 // c_rarg4 - oop ckval (super_klass) 2220 // Win64 2221 // rsp+40 - oop ckval (super_klass) 2222 // 2223 // Output: 2224 // rax == 0 - success 2225 // rax == -1^K - failure, where K is partial transfer count 2226 // 2227 address StubGenerator::generate_checkcast_copy(const char *name, address *entry, bool dest_uninitialized) { 2228 2229 Label L_load_element, L_store_element, L_do_card_marks, L_done; 2230 2231 // Input registers (after setup_arg_regs) 2232 const Register from = rdi; // source array address 2233 const Register to = rsi; // destination array address 2234 const Register length = rdx; // elements count 2235 const Register ckoff = rcx; // super_check_offset 2236 const Register ckval = r8; // super_klass 2237 2238 // Registers used as temps (r13, r14 are save-on-entry) 2239 const Register end_from = from; // source array end address 2240 const Register end_to = r13; // destination array end address 2241 const Register count = rdx; // -(count_remaining) 2242 const Register r14_length = r14; // saved copy of length 2243 // End pointers are inclusive, and if length is not zero they point 2244 // to the last unit copied: end_to[0] := end_from[0] 2245 2246 const Register rax_oop = rax; // actual oop copied 2247 const Register r11_klass = r11; // oop._klass 2248 2249 //--------------------------------------------------------------- 2250 // Assembler stub will be used for this call to arraycopy 2251 // if the two arrays are subtypes of Object[] but the 2252 // destination array type is not equal to or a supertype 2253 // of the source type. Each element must be separately 2254 // checked. 2255 2256 __ align(CodeEntryAlignment); 2257 StubCodeMark mark(this, "StubRoutines", name); 2258 address start = __ pc(); 2259 2260 __ enter(); // required for proper stackwalking of RuntimeStub frame 2261 2262 #ifdef ASSERT 2263 // caller guarantees that the arrays really are different 2264 // otherwise, we would have to make conjoint checks 2265 { Label L; 2266 array_overlap_test(L, TIMES_OOP); 2267 __ stop("checkcast_copy within a single array"); 2268 __ bind(L); 2269 } 2270 #endif //ASSERT 2271 2272 setup_arg_regs_using_thread(4); // from => rdi, to => rsi, length => rdx 2273 // ckoff => rcx, ckval => r8 2274 // r9 is used to save r15_thread 2275 #ifdef _WIN64 2276 // last argument (#4) is on stack on Win64 2277 __ movptr(ckval, Address(rsp, 6 * wordSize)); 2278 #endif 2279 2280 // Caller of this entry point must set up the argument registers. 
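  // At the entry point published below, the arguments are expected to be in
  // the registers named above: from == rdi, to == rsi, length == rdx,
  // ckoff == rcx and ckval == r8; this is how generate_generic_copy()
  // tail-jumps into this stub.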
2281 if (entry != nullptr) { 2282 *entry = __ pc(); 2283 BLOCK_COMMENT("Entry:"); 2284 } 2285 2286 // allocate spill slots for r13, r14 2287 enum { 2288 saved_r13_offset, 2289 saved_r14_offset, 2290 saved_r10_offset, 2291 saved_rbp_offset 2292 }; 2293 __ subptr(rsp, saved_rbp_offset * wordSize); 2294 __ movptr(Address(rsp, saved_r13_offset * wordSize), r13); 2295 __ movptr(Address(rsp, saved_r14_offset * wordSize), r14); 2296 __ movptr(Address(rsp, saved_r10_offset * wordSize), r10); 2297 2298 #ifdef ASSERT 2299 Label L2; 2300 __ get_thread(r14); 2301 __ cmpptr(r15_thread, r14); 2302 __ jcc(Assembler::equal, L2); 2303 __ stop("StubRoutines::call_stub: r15_thread is modified by call"); 2304 __ bind(L2); 2305 #endif // ASSERT 2306 2307 // check that int operands are properly extended to size_t 2308 assert_clean_int(length, rax); 2309 assert_clean_int(ckoff, rax); 2310 2311 #ifdef ASSERT 2312 BLOCK_COMMENT("assert consistent ckoff/ckval"); 2313 // The ckoff and ckval must be mutually consistent, 2314 // even though caller generates both. 2315 { Label L; 2316 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 2317 __ cmpl(ckoff, Address(ckval, sco_offset)); 2318 __ jcc(Assembler::equal, L); 2319 __ stop("super_check_offset inconsistent"); 2320 __ bind(L); 2321 } 2322 #endif //ASSERT 2323 2324 // Loop-invariant addresses. They are exclusive end pointers. 2325 Address end_from_addr(from, length, TIMES_OOP, 0); 2326 Address end_to_addr(to, length, TIMES_OOP, 0); 2327 // Loop-variant addresses. They assume post-incremented count < 0. 2328 Address from_element_addr(end_from, count, TIMES_OOP, 0); 2329 Address to_element_addr(end_to, count, TIMES_OOP, 0); 2330 2331 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT; 2332 if (dest_uninitialized) { 2333 decorators |= IS_DEST_UNINITIALIZED; 2334 } 2335 2336 BasicType type = T_OBJECT; 2337 size_t element_size = UseCompressedOops ? 4 : 8; 2338 2339 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 2340 bs->arraycopy_prologue(_masm, decorators, type, from, to, count); 2341 2342 // Copy from low to high addresses, indexed from the end of each array. 2343 __ lea(end_from, end_from_addr); 2344 __ lea(end_to, end_to_addr); 2345 __ movptr(r14_length, length); // save a copy of the length 2346 assert(length == count, ""); // else fix next line: 2347 __ negptr(count); // negate and test the length 2348 __ jcc(Assembler::notZero, L_load_element); 2349 2350 // Empty array: Nothing to do. 2351 __ xorptr(rax, rax); // return 0 on (trivial) success 2352 __ jmp(L_done); 2353 2354 // ======== begin loop ======== 2355 // (Loop is rotated; its entry is L_load_element.) 2356 // Loop control: 2357 // for (count = -count; count != 0; count++) 2358 // Base pointers src, dst are biased by 8*(count-1),to last element. 
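  // In rough C terms the rotated loop is (a sketch only; GC barriers and
  // compressed oops are omitted, 'is_subtype' is an illustrative name, and
  // end_from/end_to are the exclusive end pointers computed above):
  //
  //   for (ptrdiff_t i = -length; i != 0; i++) {
  //     oop el = end_from[i];                                 // L_load_element
  //     if (el != nullptr && !is_subtype(el->klass(), ckval))
  //       return ~(length + i);                               // -1 ^ (oops copied)
  //     end_to[i] = el;                                       // L_store_element
  //   }
  //   return 0;                                               // success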
2359 __ align(OptoLoopAlignment); 2360 2361 __ BIND(L_store_element); 2362 bs->copy_store_at(_masm, 2363 decorators, 2364 type, 2365 element_size, 2366 to_element_addr, 2367 rax_oop, 2368 r10); 2369 __ increment(count); // increment the count toward zero 2370 __ jcc(Assembler::zero, L_do_card_marks); 2371 2372 // ======== loop entry is here ======== 2373 __ BIND(L_load_element); 2374 bs->copy_load_at(_masm, 2375 decorators, 2376 type, 2377 element_size, 2378 rax_oop, 2379 from_element_addr, 2380 r10); 2381 __ testptr(rax_oop, rax_oop); 2382 __ jcc(Assembler::zero, L_store_element); 2383 2384 __ load_klass(r11_klass, rax_oop, rscratch1);// query the object klass 2385 generate_type_check(r11_klass, ckoff, ckval, L_store_element); 2386 // ======== end loop ======== 2387 2388 // It was a real error; we must depend on the caller to finish the job. 2389 // Register rdx = -1 * number of *remaining* oops, r14 = *total* oops. 2390 // Emit GC store barriers for the oops we have copied (r14 + rdx), 2391 // and report their number to the caller. 2392 assert_different_registers(rax, r14_length, count, to, end_to, rcx, rscratch1); 2393 Label L_post_barrier; 2394 __ addptr(r14_length, count); // K = (original - remaining) oops 2395 __ movptr(rax, r14_length); // save the value 2396 __ notptr(rax); // report (-1^K) to caller (does not affect flags) 2397 __ jccb(Assembler::notZero, L_post_barrier); 2398 __ jmp(L_done); // K == 0, nothing was copied, skip post barrier 2399 2400 // Come here on success only. 2401 __ BIND(L_do_card_marks); 2402 __ xorptr(rax, rax); // return 0 on success 2403 2404 __ BIND(L_post_barrier); 2405 bs->arraycopy_epilogue(_masm, decorators, type, from, to, r14_length); 2406 2407 // Common exit point (success or failure). 2408 __ BIND(L_done); 2409 __ movptr(r13, Address(rsp, saved_r13_offset * wordSize)); 2410 __ movptr(r14, Address(rsp, saved_r14_offset * wordSize)); 2411 __ movptr(r10, Address(rsp, saved_r10_offset * wordSize)); 2412 restore_arg_regs_using_thread(); 2413 INC_COUNTER_NP(SharedRuntime::_checkcast_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free 2414 __ leave(); // required for proper stackwalking of RuntimeStub frame 2415 __ ret(0); 2416 2417 return start; 2418 } 2419 2420 2421 // Generate 'unsafe' array copy stub 2422 // Though just as safe as the other stubs, it takes an unscaled 2423 // size_t argument instead of an element count. 2424 // 2425 // Input: 2426 // c_rarg0 - source array address 2427 // c_rarg1 - destination array address 2428 // c_rarg2 - byte count, treated as ssize_t, can be zero 2429 // 2430 // Examines the alignment of the operands and dispatches 2431 // to a long, int, short, or byte copy loop. 
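//
// The dispatch boils down to (a sketch only):
//
//   uintptr_t bits = (uintptr_t)from | (uintptr_t)to | size;
//   if      ((bits & 7) == 0) goto long_copy;    // count = size >> 3
//   else if ((bits & 3) == 0) goto int_copy;     // count = size >> 2
//   else if ((bits & 1) == 0) goto short_copy;   // count = size >> 1
//   else                      goto byte_copy;    // count = size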
2432 // 2433 address StubGenerator::generate_unsafe_copy(const char *name, 2434 address byte_copy_entry, address short_copy_entry, 2435 address int_copy_entry, address long_copy_entry) { 2436 2437 Label L_long_aligned, L_int_aligned, L_short_aligned; 2438 2439 // Input registers (before setup_arg_regs) 2440 const Register from = c_rarg0; // source array address 2441 const Register to = c_rarg1; // destination array address 2442 const Register size = c_rarg2; // byte count (size_t) 2443 2444 // Register used as a temp 2445 const Register bits = rax; // test copy of low bits 2446 2447 __ align(CodeEntryAlignment); 2448 StubCodeMark mark(this, "StubRoutines", name); 2449 address start = __ pc(); 2450 2451 __ enter(); // required for proper stackwalking of RuntimeStub frame 2452 2453 // bump this on entry, not on exit: 2454 INC_COUNTER_NP(SharedRuntime::_unsafe_array_copy_ctr, rscratch1); 2455 2456 __ mov(bits, from); 2457 __ orptr(bits, to); 2458 __ orptr(bits, size); 2459 2460 __ testb(bits, BytesPerLong-1); 2461 __ jccb(Assembler::zero, L_long_aligned); 2462 2463 __ testb(bits, BytesPerInt-1); 2464 __ jccb(Assembler::zero, L_int_aligned); 2465 2466 __ testb(bits, BytesPerShort-1); 2467 __ jump_cc(Assembler::notZero, RuntimeAddress(byte_copy_entry)); 2468 2469 __ BIND(L_short_aligned); 2470 __ shrptr(size, LogBytesPerShort); // size => short_count 2471 __ jump(RuntimeAddress(short_copy_entry)); 2472 2473 __ BIND(L_int_aligned); 2474 __ shrptr(size, LogBytesPerInt); // size => int_count 2475 __ jump(RuntimeAddress(int_copy_entry)); 2476 2477 __ BIND(L_long_aligned); 2478 __ shrptr(size, LogBytesPerLong); // size => qword_count 2479 __ jump(RuntimeAddress(long_copy_entry)); 2480 2481 return start; 2482 } 2483 2484 2485 // Static enum for helper 2486 enum USM_TYPE {USM_SHORT, USM_DWORD, USM_QUADWORD}; 2487 // Helper for generate_unsafe_setmemory 2488 // 2489 // Atomically fill an array of memory using 2-, 4-, or 8-byte chunks 2490 static void do_setmemory_atomic_loop(USM_TYPE type, Register dest, 2491 Register size, Register wide_value, 2492 Register tmp, Label& L_exit, 2493 MacroAssembler *_masm) { 2494 Label L_Loop, L_Tail, L_TailLoop; 2495 2496 int shiftval = 0; 2497 int incr = 0; 2498 2499 switch (type) { 2500 case USM_SHORT: 2501 shiftval = 1; 2502 incr = 16; 2503 break; 2504 case USM_DWORD: 2505 shiftval = 2; 2506 incr = 32; 2507 break; 2508 case USM_QUADWORD: 2509 shiftval = 3; 2510 incr = 64; 2511 break; 2512 } 2513 2514 // At this point, we know the lower bits of size are zero 2515 __ shrq(size, shiftval); 2516 // size now has number of X-byte chunks (2, 4 or 8) 2517 2518 // Number of (8*X)-byte chunks into tmp 2519 __ movq(tmp, size); 2520 __ shrq(tmp, 3); 2521 __ jccb(Assembler::zero, L_Tail); 2522 2523 __ BIND(L_Loop); 2524 2525 // Unroll 8 stores 2526 for (int i = 0; i < 8; i++) { 2527 switch (type) { 2528 case USM_SHORT: 2529 __ movw(Address(dest, (2 * i)), wide_value); 2530 break; 2531 case USM_DWORD: 2532 __ movl(Address(dest, (4 * i)), wide_value); 2533 break; 2534 case USM_QUADWORD: 2535 __ movq(Address(dest, (8 * i)), wide_value); 2536 break; 2537 } 2538 } 2539 __ addq(dest, incr); 2540 __ decrementq(tmp); 2541 __ jccb(Assembler::notZero, L_Loop); 2542 2543 __ BIND(L_Tail); 2544 2545 // Find number of remaining X-byte chunks 2546 __ andq(size, 0x7); 2547 2548 // If zero, then we're done 2549 __ jccb(Assembler::zero, L_exit); 2550 2551 __ BIND(L_TailLoop); 2552 2553 switch (type) { 2554 case USM_SHORT: 2555 __ movw(Address(dest, 0), wide_value); 2556 break; 2557 case 
USM_DWORD: 2558 __ movl(Address(dest, 0), wide_value); 2559 break; 2560 case USM_QUADWORD: 2561 __ movq(Address(dest, 0), wide_value); 2562 break; 2563 } 2564 __ addq(dest, incr >> 3); 2565 __ decrementq(size); 2566 __ jccb(Assembler::notZero, L_TailLoop); 2567 } 2568 2569 // Generate 'unsafe' set memory stub 2570 // Though just as safe as the other stubs, it takes an unscaled 2571 // size_t (# bytes) argument instead of an element count. 2572 // 2573 // Input: 2574 // c_rarg0 - destination array address 2575 // c_rarg1 - byte count (size_t) 2576 // c_rarg2 - byte value 2577 // 2578 // Examines the alignment of the operands and dispatches 2579 // to an int, short, or byte fill loop. 2580 // 2581 address StubGenerator::generate_unsafe_setmemory(const char *name, 2582 address unsafe_byte_fill) { 2583 __ align(CodeEntryAlignment); 2584 StubCodeMark mark(this, "StubRoutines", name); 2585 address start = __ pc(); 2586 __ enter(); // required for proper stackwalking of RuntimeStub frame 2587 2588 assert(unsafe_byte_fill != nullptr, "Invalid call"); 2589 2590 // bump this on entry, not on exit: 2591 INC_COUNTER_NP(SharedRuntime::_unsafe_set_memory_ctr, rscratch1); 2592 2593 { 2594 Label L_exit, L_fillQuadwords, L_fillDwords, L_fillBytes; 2595 2596 const Register dest = c_rarg0; 2597 const Register size = c_rarg1; 2598 const Register byteVal = c_rarg2; 2599 const Register wide_value = rax; 2600 const Register rScratch1 = r10; 2601 2602 assert_different_registers(dest, size, byteVal, wide_value, rScratch1); 2603 2604 // fill_to_memory_atomic(unsigned char*, unsigned long, unsigned char) 2605 2606 __ testq(size, size); 2607 __ jcc(Assembler::zero, L_exit); 2608 2609 // Propagate byte to full Register 2610 __ movzbl(rScratch1, byteVal); 2611 __ mov64(wide_value, 0x0101010101010101ULL); 2612 __ imulq(wide_value, rScratch1); 2613 2614 // Check for pointer & size alignment 2615 __ movq(rScratch1, dest); 2616 __ orq(rScratch1, size); 2617 2618 __ testb(rScratch1, 7); 2619 __ jcc(Assembler::equal, L_fillQuadwords); 2620 2621 __ testb(rScratch1, 3); 2622 __ jcc(Assembler::equal, L_fillDwords); 2623 2624 __ testb(rScratch1, 1); 2625 __ jcc(Assembler::notEqual, L_fillBytes); 2626 2627 // Fill words 2628 { 2629 UnsafeMemoryAccessMark umam(this, true, true); 2630 2631 // At this point, we know the lower bit of size is zero and a 2632 // multiple of 2 2633 do_setmemory_atomic_loop(USM_SHORT, dest, size, wide_value, rScratch1, 2634 L_exit, _masm); 2635 } 2636 __ jmpb(L_exit); 2637 2638 __ BIND(L_fillQuadwords); 2639 2640 // Fill QUADWORDs 2641 { 2642 UnsafeMemoryAccessMark umam(this, true, true); 2643 2644 // At this point, we know the lower 3 bits of size are zero and a 2645 // multiple of 8 2646 do_setmemory_atomic_loop(USM_QUADWORD, dest, size, wide_value, rScratch1, 2647 L_exit, _masm); 2648 } 2649 __ BIND(L_exit); 2650 2651 __ leave(); // required for proper stackwalking of RuntimeStub frame 2652 __ ret(0); 2653 2654 __ BIND(L_fillDwords); 2655 2656 // Fill DWORDs 2657 { 2658 UnsafeMemoryAccessMark umam(this, true, true); 2659 2660 // At this point, we know the lower 2 bits of size are zero and a 2661 // multiple of 4 2662 do_setmemory_atomic_loop(USM_DWORD, dest, size, wide_value, rScratch1, 2663 L_exit, _masm); 2664 } 2665 __ jmpb(L_exit); 2666 2667 __ BIND(L_fillBytes); 2668 // Set up for tail call to previously generated byte fill routine 2669 // Parameter order is (ptr, byteVal, size) 2670 __ xchgq(c_rarg1, c_rarg2); 2671 __ leave(); // Clear effect of enter() 2672 __ 
jump(RuntimeAddress(unsafe_byte_fill)); 2673 } 2674 2675 return start; 2676 } 2677 2678 // Perform range checks on the proposed arraycopy. 2679 // Kills temp, but nothing else. 2680 // Also, clean the sign bits of src_pos and dst_pos. 2681 void StubGenerator::arraycopy_range_checks(Register src, // source array oop (c_rarg0) 2682 Register src_pos, // source position (c_rarg1) 2683 Register dst, // destination array oop (c_rarg2) 2684 Register dst_pos, // destination position (c_rarg3) 2685 Register length, 2686 Register temp, 2687 Label& L_failed) { 2688 BLOCK_COMMENT("arraycopy_range_checks:"); 2689 2690 // if (src_pos + length > arrayOop(src)->length()) FAIL; 2691 __ movl(temp, length); 2692 __ addl(temp, src_pos); // src_pos + length 2693 __ cmpl(temp, Address(src, arrayOopDesc::length_offset_in_bytes())); 2694 __ jcc(Assembler::above, L_failed); 2695 2696 // if (dst_pos + length > arrayOop(dst)->length()) FAIL; 2697 __ movl(temp, length); 2698 __ addl(temp, dst_pos); // dst_pos + length 2699 __ cmpl(temp, Address(dst, arrayOopDesc::length_offset_in_bytes())); 2700 __ jcc(Assembler::above, L_failed); 2701 2702 // Have to clean up high 32-bits of 'src_pos' and 'dst_pos'. 2703 // Move with sign extension can be used since they are positive. 2704 __ movslq(src_pos, src_pos); 2705 __ movslq(dst_pos, dst_pos); 2706 2707 BLOCK_COMMENT("arraycopy_range_checks done"); 2708 } 2709 2710 2711 // Generate generic array copy stubs 2712 // 2713 // Input: 2714 // c_rarg0 - src oop 2715 // c_rarg1 - src_pos (32-bits) 2716 // c_rarg2 - dst oop 2717 // c_rarg3 - dst_pos (32-bits) 2718 // not Win64 2719 // c_rarg4 - element count (32-bits) 2720 // Win64 2721 // rsp+40 - element count (32-bits) 2722 // 2723 // Output: 2724 // rax == 0 - success 2725 // rax == -1^K - failure, where K is partial transfer count 2726 // 2727 address StubGenerator::generate_generic_copy(const char *name, 2728 address byte_copy_entry, address short_copy_entry, 2729 address int_copy_entry, address oop_copy_entry, 2730 address long_copy_entry, address checkcast_copy_entry) { 2731 2732 Label L_failed, L_failed_0, L_objArray; 2733 Label L_copy_shorts, L_copy_ints, L_copy_longs; 2734 2735 // Input registers 2736 const Register src = c_rarg0; // source array oop 2737 const Register src_pos = c_rarg1; // source position 2738 const Register dst = c_rarg2; // destination array oop 2739 const Register dst_pos = c_rarg3; // destination position 2740 #ifndef _WIN64 2741 const Register length = c_rarg4; 2742 const Register rklass_tmp = r9; // load_klass 2743 #else 2744 const Address length(rsp, 7 * wordSize); // elements count is on stack on Win64 2745 const Register rklass_tmp = rdi; // load_klass 2746 #endif 2747 2748 { int modulus = CodeEntryAlignment; 2749 int target = modulus - 5; // 5 = sizeof jmp(L_failed) 2750 int advance = target - (__ offset() % modulus); 2751 if (advance < 0) advance += modulus; 2752 if (advance > 0) __ nop(advance); 2753 } 2754 StubCodeMark mark(this, "StubRoutines", name); 2755 2756 // Short-hop target to L_failed. Makes for denser prologue code.
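  // (The argument checks in the prologue below use jccb, i.e. 2-byte short
  // jumps, which keeps the prologue dense; their 8-bit displacement could not
  // reach the real L_failed exit at the end of the stub, so they hop to this
  // nearby trampoline, which then takes the long jmp to L_failed.)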
2757 __ BIND(L_failed_0); 2758 __ jmp(L_failed); 2759 assert(__ offset() % CodeEntryAlignment == 0, "no further alignment needed"); 2760 2761 __ align(CodeEntryAlignment); 2762 address start = __ pc(); 2763 2764 __ enter(); // required for proper stackwalking of RuntimeStub frame 2765 2766 #ifdef _WIN64 2767 __ push(rklass_tmp); // rdi is callee-save on Windows 2768 #endif 2769 2770 // bump this on entry, not on exit: 2771 INC_COUNTER_NP(SharedRuntime::_generic_array_copy_ctr, rscratch1); 2772 2773 //----------------------------------------------------------------------- 2774 // Assembler stub will be used for this call to arraycopy 2775 // if the following conditions are met: 2776 // 2777 // (1) src and dst must not be null. 2778 // (2) src_pos must not be negative. 2779 // (3) dst_pos must not be negative. 2780 // (4) length must not be negative. 2781 // (5) src klass and dst klass should be the same and not null. 2782 // (6) src and dst should be arrays. 2783 // (7) src_pos + length must not exceed length of src. 2784 // (8) dst_pos + length must not exceed length of dst. 2785 // 2786 2787 // if (src == nullptr) return -1; 2788 __ testptr(src, src); // src oop 2789 size_t j1off = __ offset(); 2790 __ jccb(Assembler::zero, L_failed_0); 2791 2792 // if (src_pos < 0) return -1; 2793 __ testl(src_pos, src_pos); // src_pos (32-bits) 2794 __ jccb(Assembler::negative, L_failed_0); 2795 2796 // if (dst == nullptr) return -1; 2797 __ testptr(dst, dst); // dst oop 2798 __ jccb(Assembler::zero, L_failed_0); 2799 2800 // if (dst_pos < 0) return -1; 2801 __ testl(dst_pos, dst_pos); // dst_pos (32-bits) 2802 size_t j4off = __ offset(); 2803 __ jccb(Assembler::negative, L_failed_0); 2804 2805 // The first four tests are very dense code, 2806 // but not quite dense enough to put four 2807 // jumps in a 16-byte instruction fetch buffer. 2808 // That's good, because some branch predicters 2809 // do not like jumps so close together. 2810 // Make sure of this. 2811 guarantee(((j1off ^ j4off) & ~15) != 0, "I$ line of 1st & 4th jumps"); 2812 2813 // registers used as temp 2814 const Register r11_length = r11; // elements count to copy 2815 const Register r10_src_klass = r10; // array klass 2816 2817 // if (length < 0) return -1; 2818 __ movl(r11_length, length); // length (elements count, 32-bits value) 2819 __ testl(r11_length, r11_length); 2820 __ jccb(Assembler::negative, L_failed_0); 2821 2822 __ load_klass(r10_src_klass, src, rklass_tmp); 2823 #ifdef ASSERT 2824 // assert(src->klass() != nullptr); 2825 { 2826 BLOCK_COMMENT("assert klasses not null {"); 2827 Label L1, L2; 2828 __ testptr(r10_src_klass, r10_src_klass); 2829 __ jcc(Assembler::notZero, L2); // it is broken if klass is null 2830 __ bind(L1); 2831 __ stop("broken null klass"); 2832 __ bind(L2); 2833 __ load_klass(rax, dst, rklass_tmp); 2834 __ cmpq(rax, 0); 2835 __ jcc(Assembler::equal, L1); // this would be broken also 2836 BLOCK_COMMENT("} assert klasses not null done"); 2837 } 2838 #endif 2839 2840 // Load layout helper (32-bits) 2841 // 2842 // |array_tag| | header_size | element_type | |log2_element_size| 2843 // 32 30 24 16 8 2 0 2844 // 2845 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0 2846 // 2847 2848 const int lh_offset = in_bytes(Klass::layout_helper_offset()); 2849 2850 // Handle objArrays completely differently... 
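  // The checks that follow sort arrays into the two cases this stub handles,
  // roughly (a sketch only; is_array is an illustrative name):
  //
  //   if (src_lh == objArray_lh)       goto L_objArray;  // oop / checkcast copy
  //   if (src_klass != dst_klass)      return -1;
  //   if (!is_array(src_lh))           return -1;
  //   // else: typeArray, so range-check, then dispatch on log2_element_size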
2851 const jint objArray_lh = Klass::array_layout_helper(T_OBJECT); 2852 __ cmpl(Address(r10_src_klass, lh_offset), objArray_lh); 2853 __ jcc(Assembler::equal, L_objArray); 2854 2855 // if (src->klass() != dst->klass()) return -1; 2856 __ load_klass(rax, dst, rklass_tmp); 2857 __ cmpq(r10_src_klass, rax); 2858 __ jcc(Assembler::notEqual, L_failed); 2859 2860 const Register rax_lh = rax; // layout helper 2861 __ movl(rax_lh, Address(r10_src_klass, lh_offset)); 2862 2863 // if (!src->is_Array()) return -1; 2864 __ cmpl(rax_lh, Klass::_lh_neutral_value); 2865 __ jcc(Assembler::greaterEqual, L_failed); 2866 2867 // At this point, it is known to be a typeArray (array_tag 0x3). 2868 #ifdef ASSERT 2869 { 2870 BLOCK_COMMENT("assert primitive array {"); 2871 Label L; 2872 __ cmpl(rax_lh, (Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift)); 2873 __ jcc(Assembler::greaterEqual, L); 2874 __ stop("must be a primitive array"); 2875 __ bind(L); 2876 BLOCK_COMMENT("} assert primitive array done"); 2877 } 2878 #endif 2879 2880 arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length, 2881 r10, L_failed); 2882 2883 // TypeArrayKlass 2884 // 2885 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize); 2886 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize); 2887 // 2888 2889 const Register r10_offset = r10; // array offset 2890 const Register rax_elsize = rax_lh; // element size 2891 2892 __ movl(r10_offset, rax_lh); 2893 __ shrl(r10_offset, Klass::_lh_header_size_shift); 2894 __ andptr(r10_offset, Klass::_lh_header_size_mask); // array_offset 2895 __ addptr(src, r10_offset); // src array offset 2896 __ addptr(dst, r10_offset); // dst array offset 2897 BLOCK_COMMENT("choose copy loop based on element size"); 2898 __ andl(rax_lh, Klass::_lh_log2_element_size_mask); // rax_lh -> rax_elsize 2899 2900 #ifdef _WIN64 2901 __ pop(rklass_tmp); // Restore callee-save rdi 2902 #endif 2903 2904 // next registers should be set before the jump to corresponding stub 2905 const Register from = c_rarg0; // source array address 2906 const Register to = c_rarg1; // destination array address 2907 const Register count = c_rarg2; // elements count 2908 2909 // 'from', 'to', 'count' registers should be set in such order 2910 // since they are the same as 'src', 'src_pos', 'dst'. 
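  // The element-size dispatch below amounts to (a sketch only; the array
  // header offset has already been added to src and dst above):
  //
  //   from  = src + (src_pos << log2_elsize);
  //   to    = dst + (dst_pos << log2_elsize);
  //   count = length;
  //   switch (log2_elsize) {
  //     case 0: goto byte_copy;   case 1: goto short_copy;
  //     case 2: goto int_copy;    case 3: goto long_copy;
  //   }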
2911 2912 __ cmpl(rax_elsize, 0); 2913 __ jccb(Assembler::notEqual, L_copy_shorts); 2914 __ lea(from, Address(src, src_pos, Address::times_1, 0));// src_addr 2915 __ lea(to, Address(dst, dst_pos, Address::times_1, 0));// dst_addr 2916 __ movl2ptr(count, r11_length); // length 2917 __ jump(RuntimeAddress(byte_copy_entry)); 2918 2919 __ BIND(L_copy_shorts); 2920 __ cmpl(rax_elsize, LogBytesPerShort); 2921 __ jccb(Assembler::notEqual, L_copy_ints); 2922 __ lea(from, Address(src, src_pos, Address::times_2, 0));// src_addr 2923 __ lea(to, Address(dst, dst_pos, Address::times_2, 0));// dst_addr 2924 __ movl2ptr(count, r11_length); // length 2925 __ jump(RuntimeAddress(short_copy_entry)); 2926 2927 __ BIND(L_copy_ints); 2928 __ cmpl(rax_elsize, LogBytesPerInt); 2929 __ jccb(Assembler::notEqual, L_copy_longs); 2930 __ lea(from, Address(src, src_pos, Address::times_4, 0));// src_addr 2931 __ lea(to, Address(dst, dst_pos, Address::times_4, 0));// dst_addr 2932 __ movl2ptr(count, r11_length); // length 2933 __ jump(RuntimeAddress(int_copy_entry)); 2934 2935 __ BIND(L_copy_longs); 2936 #ifdef ASSERT 2937 { 2938 BLOCK_COMMENT("assert long copy {"); 2939 Label L; 2940 __ cmpl(rax_elsize, LogBytesPerLong); 2941 __ jcc(Assembler::equal, L); 2942 __ stop("must be long copy, but elsize is wrong"); 2943 __ bind(L); 2944 BLOCK_COMMENT("} assert long copy done"); 2945 } 2946 #endif 2947 __ lea(from, Address(src, src_pos, Address::times_8, 0));// src_addr 2948 __ lea(to, Address(dst, dst_pos, Address::times_8, 0));// dst_addr 2949 __ movl2ptr(count, r11_length); // length 2950 __ jump(RuntimeAddress(long_copy_entry)); 2951 2952 // ObjArrayKlass 2953 __ BIND(L_objArray); 2954 // live at this point: r10_src_klass, r11_length, src[_pos], dst[_pos] 2955 2956 Label L_plain_copy, L_checkcast_copy; 2957 // test array classes for subtyping 2958 __ load_klass(rax, dst, rklass_tmp); 2959 __ cmpq(r10_src_klass, rax); // usual case is exact equality 2960 __ jcc(Assembler::notEqual, L_checkcast_copy); 2961 2962 // Identically typed arrays can be copied without element-wise checks. 2963 arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length, 2964 r10, L_failed); 2965 2966 __ lea(from, Address(src, src_pos, TIMES_OOP, 2967 arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // src_addr 2968 __ lea(to, Address(dst, dst_pos, TIMES_OOP, 2969 arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // dst_addr 2970 __ movl2ptr(count, r11_length); // length 2971 __ BIND(L_plain_copy); 2972 #ifdef _WIN64 2973 __ pop(rklass_tmp); // Restore callee-save rdi 2974 #endif 2975 __ jump(RuntimeAddress(oop_copy_entry)); 2976 2977 __ BIND(L_checkcast_copy); 2978 // live at this point: r10_src_klass, r11_length, rax (dst_klass) 2979 { 2980 // Before looking at dst.length, make sure dst is also an objArray. 2981 __ cmpl(Address(rax, lh_offset), objArray_lh); 2982 __ jcc(Assembler::notEqual, L_failed); 2983 2984 // It is safe to examine both src.length and dst.length. 2985 arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length, 2986 rax, L_failed); 2987 2988 const Register r11_dst_klass = r11; 2989 __ load_klass(r11_dst_klass, dst, rklass_tmp); // reload 2990 2991 // Marshal the base address arguments now, freeing registers. 
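    // (Once 'from' and 'to' have been formed, dst_pos (c_rarg3) is dead,
    // which is what allows it to be reused as sco_temp for the type check
    // just below.)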
2992 __ lea(from, Address(src, src_pos, TIMES_OOP, 2993 arrayOopDesc::base_offset_in_bytes(T_OBJECT))); 2994 __ lea(to, Address(dst, dst_pos, TIMES_OOP, 2995 arrayOopDesc::base_offset_in_bytes(T_OBJECT))); 2996 __ movl(count, length); // length (reloaded) 2997 Register sco_temp = c_rarg3; // this register is free now 2998 assert_different_registers(from, to, count, sco_temp, 2999 r11_dst_klass, r10_src_klass); 3000 assert_clean_int(count, sco_temp); 3001 3002 // Generate the type check. 3003 const int sco_offset = in_bytes(Klass::super_check_offset_offset()); 3004 __ movl(sco_temp, Address(r11_dst_klass, sco_offset)); 3005 assert_clean_int(sco_temp, rax); 3006 generate_type_check(r10_src_klass, sco_temp, r11_dst_klass, L_plain_copy); 3007 3008 // Fetch destination element klass from the ObjArrayKlass header. 3009 int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset()); 3010 __ movptr(r11_dst_klass, Address(r11_dst_klass, ek_offset)); 3011 __ movl( sco_temp, Address(r11_dst_klass, sco_offset)); 3012 assert_clean_int(sco_temp, rax); 3013 3014 #ifdef _WIN64 3015 __ pop(rklass_tmp); // Restore callee-save rdi 3016 #endif 3017 3018 // the checkcast_copy loop needs two extra arguments: 3019 assert(c_rarg3 == sco_temp, "#3 already in place"); 3020 // Set up arguments for checkcast_copy_entry. 3021 setup_arg_regs_using_thread(4); 3022 __ movptr(r8, r11_dst_klass); // dst.klass.element_klass, r8 is c_rarg4 on Linux/Solaris 3023 __ jump(RuntimeAddress(checkcast_copy_entry)); 3024 } 3025 3026 __ BIND(L_failed); 3027 #ifdef _WIN64 3028 __ pop(rklass_tmp); // Restore callee-save rdi 3029 #endif 3030 __ xorptr(rax, rax); 3031 __ notptr(rax); // return -1 3032 __ leave(); // required for proper stackwalking of RuntimeStub frame 3033 __ ret(0); 3034 3035 return start; 3036 } 3037 3038 #undef __