/*
 * Copyright (c) 2003, 2023, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/macroAssembler.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "oops/objArrayKlass.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubRoutines.hpp"
#include "stubGenerator_x86_64.hpp"
#ifdef COMPILER2
#include "opto/c2_globals.hpp"
#endif
#if INCLUDE_JVMCI
#include "jvmci/jvmci_globals.hpp"
#endif

#define __ _masm->

#define TIMES_OOP (UseCompressedOops ? Address::times_4 : Address::times_8)

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) __ block_comment(str)
#endif // PRODUCT

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

#ifdef PRODUCT
#define INC_COUNTER_NP(counter, rscratch) ((void)0)
#else
#define INC_COUNTER_NP(counter, rscratch) \
BLOCK_COMMENT("inc_counter " #counter); \
inc_counter_np(_masm, counter, rscratch);

static void inc_counter_np(MacroAssembler* _masm, uint& counter, Register rscratch) {
  __ incrementl(ExternalAddress((address)&counter), rscratch);
}

#if COMPILER2_OR_JVMCI
static uint& get_profile_ctr(int shift) {
  if (shift == 0) {
    return SharedRuntime::_jbyte_array_copy_ctr;
  } else if (shift == 1) {
    return SharedRuntime::_jshort_array_copy_ctr;
  } else if (shift == 2) {
    return SharedRuntime::_jint_array_copy_ctr;
  } else {
    assert(shift == 3, "");
    return SharedRuntime::_jlong_array_copy_ctr;
  }
}
#endif // COMPILER2_OR_JVMCI
#endif // !PRODUCT

void StubGenerator::generate_arraycopy_stubs() {
  address entry;
  address entry_jbyte_arraycopy;
  address entry_jshort_arraycopy;
  address entry_jint_arraycopy;
  address entry_oop_arraycopy;
  address entry_jlong_arraycopy;
  address entry_checkcast_arraycopy;

  StubRoutines::_jbyte_disjoint_arraycopy  = generate_disjoint_byte_copy(false, &entry,
                                                                         "jbyte_disjoint_arraycopy");
  StubRoutines::_jbyte_arraycopy           = generate_conjoint_byte_copy(false, entry, &entry_jbyte_arraycopy,
                                                                         "jbyte_arraycopy");

  StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, &entry,
                                                                          "jshort_disjoint_arraycopy");
  StubRoutines::_jshort_arraycopy          = generate_conjoint_short_copy(false, entry, &entry_jshort_arraycopy,
                                                                          "jshort_arraycopy");

  StubRoutines::_jint_disjoint_arraycopy =
    generate_disjoint_int_oop_copy(false, false, &entry, "jint_disjoint_arraycopy");
  StubRoutines::_jint_arraycopy = generate_conjoint_int_oop_copy(false, false, entry,
                                                                 &entry_jint_arraycopy, "jint_arraycopy");

  StubRoutines::_jlong_disjoint_arraycopy = generate_disjoint_long_oop_copy(false, false, &entry,
                                                                            "jlong_disjoint_arraycopy");
  StubRoutines::_jlong_arraycopy = generate_conjoint_long_oop_copy(false, false, entry,
                                                                   &entry_jlong_arraycopy, "jlong_arraycopy");
  if (UseCompressedOops) {
    StubRoutines::_oop_disjoint_arraycopy = generate_disjoint_int_oop_copy(false, true, &entry,
                                                                           "oop_disjoint_arraycopy");
    StubRoutines::_oop_arraycopy = generate_conjoint_int_oop_copy(false, true, entry,
                                                                  &entry_oop_arraycopy, "oop_arraycopy");
    StubRoutines::_oop_disjoint_arraycopy_uninit = generate_disjoint_int_oop_copy(false, true, &entry,
                                                                                  "oop_disjoint_arraycopy_uninit",
                                                                                  /*dest_uninitialized*/true);
    StubRoutines::_oop_arraycopy_uninit = generate_conjoint_int_oop_copy(false, true, entry,
                                                                         nullptr, "oop_arraycopy_uninit",
                                                                         /*dest_uninitialized*/true);
  } else {
    StubRoutines::_oop_disjoint_arraycopy = generate_disjoint_long_oop_copy(false, true, &entry,
                                                                            "oop_disjoint_arraycopy");
    StubRoutines::_oop_arraycopy = generate_conjoint_long_oop_copy(false, true, entry,
                                                                   &entry_oop_arraycopy, "oop_arraycopy");
    StubRoutines::_oop_disjoint_arraycopy_uninit = generate_disjoint_long_oop_copy(false, true, &entry,
                                                                                   "oop_disjoint_arraycopy_uninit",
                                                                                   /*dest_uninitialized*/true);
    StubRoutines::_oop_arraycopy_uninit = generate_conjoint_long_oop_copy(false, true, entry,
                                                                          nullptr, "oop_arraycopy_uninit",
                                                                          /*dest_uninitialized*/true);
  }

  StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
  StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", nullptr,
                                                                      /*dest_uninitialized*/true);

  StubRoutines::_unsafe_arraycopy = generate_unsafe_copy("unsafe_arraycopy",
                                                         entry_jbyte_arraycopy,
                                                         entry_jshort_arraycopy,
                                                         entry_jint_arraycopy,
                                                         entry_jlong_arraycopy);
  StubRoutines::_generic_arraycopy = generate_generic_copy("generic_arraycopy",
                                                           entry_jbyte_arraycopy,
                                                           entry_jshort_arraycopy,
                                                           entry_jint_arraycopy,
                                                           entry_oop_arraycopy,
                                                           entry_jlong_arraycopy,
                                                           entry_checkcast_arraycopy);

  StubRoutines::_jbyte_fill  = generate_fill(T_BYTE,  false, "jbyte_fill");
  StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
  StubRoutines::_jint_fill   = generate_fill(T_INT,   false, "jint_fill");
  StubRoutines::_arrayof_jbyte_fill  = generate_fill(T_BYTE,  true, "arrayof_jbyte_fill");
  StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
  StubRoutines::_arrayof_jint_fill   = generate_fill(T_INT,   true, "arrayof_jint_fill");

  // We don't generate specialized code for HeapWord-aligned source
  // arrays, so just use the code we've already generated
  StubRoutines::_arrayof_jbyte_disjoint_arraycopy  = StubRoutines::_jbyte_disjoint_arraycopy;
  StubRoutines::_arrayof_jbyte_arraycopy           = StubRoutines::_jbyte_arraycopy;

  StubRoutines::_arrayof_jshort_disjoint_arraycopy = StubRoutines::_jshort_disjoint_arraycopy;
  StubRoutines::_arrayof_jshort_arraycopy          = StubRoutines::_jshort_arraycopy;

  StubRoutines::_arrayof_jint_disjoint_arraycopy   = StubRoutines::_jint_disjoint_arraycopy;
  StubRoutines::_arrayof_jint_arraycopy            = StubRoutines::_jint_arraycopy;

  StubRoutines::_arrayof_jlong_disjoint_arraycopy  = StubRoutines::_jlong_disjoint_arraycopy;
  StubRoutines::_arrayof_jlong_arraycopy           = StubRoutines::_jlong_arraycopy;

  StubRoutines::_arrayof_oop_disjoint_arraycopy    = StubRoutines::_oop_disjoint_arraycopy;
  StubRoutines::_arrayof_oop_arraycopy             = StubRoutines::_oop_arraycopy;

  StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit = StubRoutines::_oop_disjoint_arraycopy_uninit;
  StubRoutines::_arrayof_oop_arraycopy_uninit          = StubRoutines::_oop_arraycopy_uninit;
}


// Verify that a register contains a clean 32-bit positive value
// (high 32 bits are 0) so it can be used in 64-bit shifts.
//
// Input:
//   Rint  - 32-bit value
//   Rtmp  - scratch
//
void StubGenerator::assert_clean_int(Register Rint, Register Rtmp) {
#ifdef ASSERT
  Label L;
  assert_different_registers(Rtmp, Rint);
  __ movslq(Rtmp, Rint);
  __ cmpq(Rtmp, Rint);
  __ jcc(Assembler::equal, L);
  __ stop("high 32-bits of int value are not 0");
  __ bind(L);
#endif
}


// Generate overlap test for array copy stubs
//
// Input:
//   c_rarg0 - from
//   c_rarg1 - to
//   c_rarg2 - element count
//
// Output:
//   rax - &from[element count - 1]
//
void StubGenerator::array_overlap_test(address no_overlap_target, Label* NOLp, Address::ScaleFactor sf) {
  const Register from     = c_rarg0;
  const Register to       = c_rarg1;
  const Register count    = c_rarg2;
  const Register end_from = rax;

  __ cmpptr(to, from);
  __ lea(end_from, Address(from, count, sf, 0));
  if (NOLp == nullptr) {
    ExternalAddress no_overlap(no_overlap_target);
    __ jump_cc(Assembler::belowEqual, no_overlap);
    __ cmpptr(to, end_from);
    __ jump_cc(Assembler::aboveEqual, no_overlap);
  } else {
    __ jcc(Assembler::belowEqual, (*NOLp));
    __ cmpptr(to, end_from);
    __ jcc(Assembler::aboveEqual, (*NOLp));
  }
}


// Copy big chunks forward
//
// Inputs:
//   end_from       - source array end address
//   end_to         - destination array end address
//   qword_count    - 64-bit element count, negative
//   tmp1           - scratch
//   L_copy_bytes   - entry label
//   L_copy_8_bytes - exit label
//
void StubGenerator::copy_bytes_forward(Register end_from, Register end_to,
                                       Register qword_count, Register tmp1,
                                       Register tmp2, Label& L_copy_bytes,
                                       Label& L_copy_8_bytes, DecoratorSet decorators,
                                       BasicType type) {
  BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  DEBUG_ONLY(__ stop("enter at entry label, not here"));
  Label L_loop;
  __ align(OptoLoopAlignment);
  if (UseUnalignedLoadStores) {
    Label L_end;
    __ BIND(L_loop);
    if (UseAVX >= 2) {
      bs->copy_load_at(_masm, decorators, type, 32,
                       xmm0, Address(end_from, qword_count, Address::times_8, -56),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 32,
                        Address(end_to, qword_count, Address::times_8, -56), xmm0,
                        tmp1, tmp2, xmm1);

      bs->copy_load_at(_masm, decorators, type, 32,
                       xmm0, Address(end_from, qword_count, Address::times_8, -24),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 32,
                        Address(end_to, qword_count, Address::times_8, -24), xmm0,
                        tmp1, tmp2, xmm1);
    } else {
      bs->copy_load_at(_masm, decorators, type, 16,
                       xmm0, Address(end_from, qword_count, Address::times_8, -56),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 16,
                        Address(end_to, qword_count, Address::times_8, -56), xmm0,
                        tmp1, tmp2, xmm1);
      bs->copy_load_at(_masm, decorators, type, 16,
                       xmm0, Address(end_from, qword_count, Address::times_8, -40),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 16,
                        Address(end_to, qword_count, Address::times_8, -40), xmm0,
                        tmp1, tmp2, xmm1);
      bs->copy_load_at(_masm, decorators, type, 16,
                       xmm0, Address(end_from, qword_count, Address::times_8, -24),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 16,
                        Address(end_to, qword_count, Address::times_8, -24), xmm0,
                        tmp1, tmp2, xmm1);
      bs->copy_load_at(_masm, decorators, type, 16,
                       xmm0, Address(end_from, qword_count, Address::times_8, -8),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 16,
                        Address(end_to, qword_count, Address::times_8, -8), xmm0,
                        tmp1, tmp2, xmm1);
    }

    __ BIND(L_copy_bytes);
    __ addptr(qword_count, 8);
    __ jcc(Assembler::lessEqual, L_loop);
    __ subptr(qword_count, 4);  // sub(8) and add(4)
    __ jcc(Assembler::greater, L_end);
    // Copy trailing 32 bytes
    if (UseAVX >= 2) {
      bs->copy_load_at(_masm, decorators, type, 32,
                       xmm0, Address(end_from, qword_count, Address::times_8, -24),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 32,
                        Address(end_to, qword_count, Address::times_8, -24), xmm0,
                        tmp1, tmp2, xmm1);
    } else {
      bs->copy_load_at(_masm, decorators, type, 16,
                       xmm0, Address(end_from, qword_count, Address::times_8, -24),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 16,
                        Address(end_to, qword_count, Address::times_8, -24), xmm0,
                        tmp1, tmp2, xmm1);
      bs->copy_load_at(_masm, decorators, type, 16,
                       xmm0, Address(end_from, qword_count, Address::times_8, -8),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 16,
                        Address(end_to, qword_count, Address::times_8, -8), xmm0,
                        tmp1, tmp2, xmm1);
    }
    __ addptr(qword_count, 4);
    __ BIND(L_end);
  } else {
    // Copy 32 bytes per iteration
    __ BIND(L_loop);
    bs->copy_load_at(_masm, decorators, type, 8,
                     tmp1, Address(end_from, qword_count, Address::times_8, -24),
                     tmp2);
    bs->copy_store_at(_masm, decorators, type, 8,
                      Address(end_to, qword_count, Address::times_8, -24), tmp1,
                      tmp2);
    bs->copy_load_at(_masm, decorators, type, 8,
                     tmp1, Address(end_from, qword_count, Address::times_8, -16),
                     tmp2);
    bs->copy_store_at(_masm, decorators, type, 8,
                      Address(end_to, qword_count, Address::times_8, -16), tmp1,
                      tmp2);
    bs->copy_load_at(_masm, decorators, type, 8,
                     tmp1, Address(end_from, qword_count, Address::times_8, -8),
                     tmp2);
    bs->copy_store_at(_masm, decorators, type, 8,
                      Address(end_to, qword_count, Address::times_8, -8), tmp1,
                      tmp2);
    bs->copy_load_at(_masm, decorators, type, 8,
                     tmp1, Address(end_from, qword_count, Address::times_8, 0),
                     tmp2);
    bs->copy_store_at(_masm, decorators, type, 8,
                      Address(end_to, qword_count, Address::times_8, 0), tmp1,
                      tmp2);

    __ BIND(L_copy_bytes);
    __ addptr(qword_count, 4);
    __ jcc(Assembler::lessEqual, L_loop);
  }
  __ subptr(qword_count, 4);
  __ jcc(Assembler::less, L_copy_8_bytes);  // Copy trailing qwords
}


// Copy big chunks backward
//
// Inputs:
//   from           - source array address
//   dest           - destination array address
//   qword_count    - 64-bit element count
//   tmp1           -
//                    scratch
//   L_copy_bytes   - entry label
//   L_copy_8_bytes - exit label
//
void StubGenerator::copy_bytes_backward(Register from, Register dest,
                                        Register qword_count, Register tmp1,
                                        Register tmp2, Label& L_copy_bytes,
                                        Label& L_copy_8_bytes, DecoratorSet decorators,
                                        BasicType type) {
  BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  DEBUG_ONLY(__ stop("enter at entry label, not here"));
  Label L_loop;
  __ align(OptoLoopAlignment);
  if (UseUnalignedLoadStores) {
    Label L_end;
    __ BIND(L_loop);
    if (UseAVX >= 2) {
      bs->copy_load_at(_masm, decorators, type, 32,
                       xmm0, Address(from, qword_count, Address::times_8, 32),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 32,
                        Address(dest, qword_count, Address::times_8, 32), xmm0,
                        tmp1, tmp2, xmm1);
      bs->copy_load_at(_masm, decorators, type, 32,
                       xmm0, Address(from, qword_count, Address::times_8, 0),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 32,
                        Address(dest, qword_count, Address::times_8, 0), xmm0,
                        tmp1, tmp2, xmm1);
    } else {
      bs->copy_load_at(_masm, decorators, type, 16,
                       xmm0, Address(from, qword_count, Address::times_8, 48),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 16,
                        Address(dest, qword_count, Address::times_8, 48), xmm0,
                        tmp1, tmp2, xmm1);
      bs->copy_load_at(_masm, decorators, type, 16,
                       xmm0, Address(from, qword_count, Address::times_8, 32),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 16,
                        Address(dest, qword_count, Address::times_8, 32), xmm0,
                        tmp1, tmp2, xmm1);
      bs->copy_load_at(_masm, decorators, type, 16,
                       xmm0, Address(from, qword_count, Address::times_8, 16),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 16,
                        Address(dest, qword_count, Address::times_8, 16), xmm0,
                        tmp1, tmp2, xmm1);
      bs->copy_load_at(_masm, decorators, type, 16,
                       xmm0, Address(from, qword_count, Address::times_8, 0),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 16,
                        Address(dest, qword_count, Address::times_8, 0), xmm0,
                        tmp1, tmp2, xmm1);
    }

    __ BIND(L_copy_bytes);
    __ subptr(qword_count, 8);
    __ jcc(Assembler::greaterEqual, L_loop);

    __ addptr(qword_count, 4);  // add(8) and sub(4)
    __ jcc(Assembler::less, L_end);
    // Copy trailing 32 bytes
    if (UseAVX >= 2) {
      bs->copy_load_at(_masm, decorators, type, 32,
                       xmm0, Address(from, qword_count, Address::times_8, 0),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 32,
                        Address(dest, qword_count, Address::times_8, 0), xmm0,
                        tmp1, tmp2, xmm1);
    } else {
      bs->copy_load_at(_masm, decorators, type, 16,
                       xmm0, Address(from, qword_count, Address::times_8, 16),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 16,
                        Address(dest, qword_count, Address::times_8, 16), xmm0,
                        tmp1, tmp2, xmm1);
      bs->copy_load_at(_masm, decorators, type, 16,
                       xmm0, Address(from, qword_count, Address::times_8, 0),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 16,
                        Address(dest, qword_count, Address::times_8, 0), xmm0,
                        tmp1, tmp2, xmm1);
    }
    __ subptr(qword_count, 4);
    __ BIND(L_end);
  } else {
    // Copy 32 bytes per iteration
    __ BIND(L_loop);
    bs->copy_load_at(_masm, decorators, type, 8,
                     tmp1, Address(from, qword_count, Address::times_8, 24),
                     tmp2);
    bs->copy_store_at(_masm, decorators, type, 8,
                      Address(dest, qword_count, Address::times_8, 24), tmp1,
                      tmp2);
    bs->copy_load_at(_masm, decorators, type, 8,
                     tmp1, Address(from, qword_count, Address::times_8, 16),
                     tmp2);
    bs->copy_store_at(_masm, decorators, type, 8,
                      Address(dest, qword_count, Address::times_8, 16), tmp1,
                      tmp2);
    bs->copy_load_at(_masm, decorators, type, 8,
                     tmp1, Address(from, qword_count, Address::times_8, 8),
                     tmp2);
    bs->copy_store_at(_masm, decorators, type, 8,
                      Address(dest, qword_count, Address::times_8, 8), tmp1,
                      tmp2);
    bs->copy_load_at(_masm, decorators, type, 8,
                     tmp1, Address(from, qword_count, Address::times_8, 0),
                     tmp2);
    bs->copy_store_at(_masm, decorators, type, 8,
                      Address(dest, qword_count, Address::times_8, 0), tmp1,
                      tmp2);

    __ BIND(L_copy_bytes);
    __ subptr(qword_count, 4);
    __ jcc(Assembler::greaterEqual, L_loop);
  }
  __ addptr(qword_count, 4);
  __ jcc(Assembler::greater, L_copy_8_bytes);  // Copy trailing qwords
}

#if COMPILER2_OR_JVMCI

// Note: The following rules apply to the AVX3-optimized arraycopy stubs:
// - If the target supports the AVX3 features (BW+VL+F), the implementation uses
//   32-byte vectors (YMMs) for both the special cases (various small block sizes)
//   and the aligned copy loop. This is the default configuration.
// - If the copy length is above AVX3Threshold, the implementation uses 64-byte
//   vectors (ZMMs) for the main copy loop (and its subsequent tail), since the
//   bulk of the cycles are consumed there.
// - If the user forces MaxVectorSize=32, REP MOVS has been seen to perform better
//   than vector copy for disjoint copies above 4096 bytes; for conjoint/backward
//   copies, the vector-based copy performs better.
// - If the user sets AVX3Threshold=0, the special cases for small block sizes also
//   operate on 64-byte vector registers (ZMMs).

// Inputs:
//   c_rarg0   - source array address
//   c_rarg1   - destination array address
//   c_rarg2   - element count, treated as ssize_t, can be zero
//
//
// Side Effects:
//   disjoint_copy_avx3_masked is set to the no-overlap entry point
//   used by generate_conjoint_[byte/int/short/long]_copy().
//
address StubGenerator::generate_disjoint_copy_avx3_masked(address* entry, const char *name,
                                                          int shift, bool aligned, bool is_oop,
                                                          bool dest_uninitialized) {
  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, "StubRoutines", name);
  address start = __ pc();

  int avx3threshold = VM_Version::avx3_threshold();
  bool use64byteVector = (MaxVectorSize > 32) && (avx3threshold == 0);
  const int large_threshold = 2621440; // 2.5 MB
  Label L_main_loop, L_main_loop_64bytes, L_tail, L_tail64, L_exit, L_entry;
  Label L_repmovs, L_main_pre_loop, L_main_pre_loop_64bytes, L_pre_main_post_64;
  Label L_copy_large, L_finish;
  const Register from  = rdi;  // source array address
  const Register to    = rsi;  // destination array address
  const Register count = rdx;  // elements count
  const Register temp1 = r8;
  const Register temp2 = r11;
  const Register temp3 = rax;
  const Register temp4 = rcx;
  // End pointers are inclusive, and if count is not zero they point
  // to the last unit copied: end_to[0] := end_from[0]

  __ enter(); // required for proper stackwalking of RuntimeStub frame
  assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int.
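
  // Note that *entry below is recorded past the frame setup: a conjoint stub
  // that branches here through array_overlap_test() has already executed its
  // own enter(), so this stub's leave()/ret() unwinds that caller's frame.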
  if (entry != nullptr) {
    *entry = __ pc();
    // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
    BLOCK_COMMENT("Entry:");
  }

  BasicType type_vec[] = { T_BYTE, T_SHORT, T_INT, T_LONG };
  BasicType type = is_oop ? T_OBJECT : type_vec[shift];

  setup_argument_regs(type);

  DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
  if (dest_uninitialized) {
    decorators |= IS_DEST_UNINITIALIZED;
  }
  if (aligned) {
    decorators |= ARRAYCOPY_ALIGNED;
  }
  BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
  bs->arraycopy_prologue(_masm, decorators, type, from, to, count);

  {
    // Type(shift)     byte(0), short(1), int(2), long(3)
    int loop_size[] = { 192,    96,       48,     24 };
    int threshold[] = { 4096,   2048,     1024,   512 };

    // UnsafeCopyMemory page error: continue after ucm
    UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
    // 'from', 'to' and 'count' are now valid

    // temp1 holds remaining count and temp4 holds running count used to compute
    // next address offset for start of to/from addresses (temp4 * scale).
    __ mov64(temp4, 0);
    __ movq(temp1, count);

    // Zero length check.
    __ BIND(L_tail);
    __ cmpq(temp1, 0);
    __ jcc(Assembler::lessEqual, L_exit);

    // Special cases using 32 byte [masked] vector copy operations.
    arraycopy_avx3_special_cases(xmm1, k2, from, to, temp1, shift,
                                 temp4, temp3, use64byteVector, L_entry, L_exit);

    // PRE-MAIN-POST loop for aligned copy.
    __ BIND(L_entry);

    if (MaxVectorSize == 64) {
      __ movq(temp2, temp1);
      __ shlq(temp2, shift);
      __ cmpq(temp2, large_threshold);
      __ jcc(Assembler::greaterEqual, L_copy_large);
    }
    if (avx3threshold != 0) {
      __ cmpq(count, threshold[shift]);
      if (MaxVectorSize == 64) {
        // Copy using 64 byte vectors.
        __ jcc(Assembler::greaterEqual, L_pre_main_post_64);
      } else {
        assert(MaxVectorSize < 64, "vector size should be < 64 bytes");
        // REP MOVS offers a faster copy path.
        __ jcc(Assembler::greaterEqual, L_repmovs);
      }
    }

    if ((MaxVectorSize < 64) || (avx3threshold != 0)) {
      // Partial copy to make dst address 32 byte aligned.
      __ movq(temp2, to);
      __ andq(temp2, 31);
      __ jcc(Assembler::equal, L_main_pre_loop);

      __ negptr(temp2);
      __ addq(temp2, 32);
      if (shift) {
        __ shrq(temp2, shift);
      }
      __ movq(temp3, temp2);
      copy32_masked_avx(to, from, xmm1, k2, temp3, temp4, temp1, shift);
      __ movq(temp4, temp2);
      __ movq(temp1, count);
      __ subq(temp1, temp2);

      __ cmpq(temp1, loop_size[shift]);
      __ jcc(Assembler::less, L_tail);

      __ BIND(L_main_pre_loop);
      __ subq(temp1, loop_size[shift]);

      // Main loop with aligned copy block size of 192 bytes at 32 byte granularity.
      __ align32();
      __ BIND(L_main_loop);
      copy64_avx(to, from, temp4, xmm1, false, shift, 0);
      copy64_avx(to, from, temp4, xmm1, false, shift, 64);
      copy64_avx(to, from, temp4, xmm1, false, shift, 128);
      __ addptr(temp4, loop_size[shift]);
      __ subq(temp1, loop_size[shift]);
      __ jcc(Assembler::greater, L_main_loop);

      __ addq(temp1, loop_size[shift]);

      // Tail loop.
      __ jmp(L_tail);

      __ BIND(L_repmovs);
      __ movq(temp2, temp1);
      // Swap to(RSI) and from(RDI) addresses to comply with REP MOVS semantics.
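      // (rep_mov() moves RCX quadwords from [RSI] to [RDI], while the argument
      // registers were set up as from = RDI, to = RSI, hence the exchange.)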
      __ movq(temp3, to);
      __ movq(to, from);
      __ movq(from, temp3);
      // Save to/from for restoration post rep_mov.
      __ movq(temp1, to);
      __ movq(temp3, from);
      if (shift < 3) {
        __ shrq(temp2, 3 - shift); // quad word count
      }
      __ movq(temp4, temp2); // move quad word count into temp4(RCX).
      __ rep_mov();
      __ shlq(temp2, 3); // convert quad words into byte count.
      if (shift) {
        __ shrq(temp2, shift); // type specific count.
      }
      // Restore original addresses in to/from.
      __ movq(to, temp3);
      __ movq(from, temp1);
      __ movq(temp4, temp2);
      __ movq(temp1, count);
      __ subq(temp1, temp2); // trailing part (less than a quad word size).
      __ jmp(L_tail);
    }

    if (MaxVectorSize > 32) {
      __ BIND(L_pre_main_post_64);
      // Partial copy to make dst address 64 byte aligned.
      __ movq(temp2, to);
      __ andq(temp2, 63);
      __ jcc(Assembler::equal, L_main_pre_loop_64bytes);

      __ negptr(temp2);
      __ addq(temp2, 64);
      if (shift) {
        __ shrq(temp2, shift);
      }
      __ movq(temp3, temp2);
      copy64_masked_avx(to, from, xmm1, k2, temp3, temp4, temp1, shift, 0, true);
      __ movq(temp4, temp2);
      __ movq(temp1, count);
      __ subq(temp1, temp2);

      __ cmpq(temp1, loop_size[shift]);
      __ jcc(Assembler::less, L_tail64);

      __ BIND(L_main_pre_loop_64bytes);
      __ subq(temp1, loop_size[shift]);

      // Main loop with aligned copy block size of 192 bytes at
      // 64 byte copy granularity.
      __ align32();
      __ BIND(L_main_loop_64bytes);
      copy64_avx(to, from, temp4, xmm1, false, shift, 0, true);
      copy64_avx(to, from, temp4, xmm1, false, shift, 64, true);
      copy64_avx(to, from, temp4, xmm1, false, shift, 128, true);
      __ addptr(temp4, loop_size[shift]);
      __ subq(temp1, loop_size[shift]);
      __ jcc(Assembler::greater, L_main_loop_64bytes);

      __ addq(temp1, loop_size[shift]);
      // Zero length check.
      __ jcc(Assembler::lessEqual, L_exit);

      __ BIND(L_tail64);

      // Tail handling using 64 byte [masked] vector copy operations.
      use64byteVector = true;
      arraycopy_avx3_special_cases(xmm1, k2, from, to, temp1, shift,
                                   temp4, temp3, use64byteVector, L_entry, L_exit);
    }
    __ BIND(L_exit);
  }

  __ BIND(L_finish);
  address ucme_exit_pc = __ pc();
  // When called from generic_arraycopy, r11 contains values used during
  // the arraycopy epilogue; re-initialize r11 here.
  if (is_oop) {
    __ movq(r11, shift == 3 ? count : to);
  }
  bs->arraycopy_epilogue(_masm, decorators, type, from, to, count);
  restore_argument_regs(type);
  INC_COUNTER_NP(get_profile_ctr(shift), rscratch1); // Update counter after rscratch1 is free
  __ xorptr(rax, rax); // return 0
  __ vzeroupper();
  __ leave(); // required for proper stackwalking of RuntimeStub frame
  __ ret(0);

  if (MaxVectorSize == 64) {
    __ BIND(L_copy_large);
    arraycopy_avx3_large(to, from, temp1, temp2, temp3, temp4, count, xmm1, xmm2, xmm3, xmm4, shift);
    __ jmp(L_finish);
  }
  return start;
}

void StubGenerator::arraycopy_avx3_large(Register to, Register from, Register temp1, Register temp2,
                                         Register temp3, Register temp4, Register count,
                                         XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3,
                                         XMMRegister xmm4, int shift) {

  // Type(shift)     byte(0), short(1), int(2), long(3)
  int loop_size[] = { 256,    128,      64,     32 };
  int threshold[] = { 4096,   2048,     1024,   512 };

  Label L_main_loop_large;
  Label L_tail_large;
  Label L_exit_large;
  Label L_entry_large;
  Label L_main_pre_loop_large;
  Label L_pre_main_post_large;

  assert(MaxVectorSize == 64, "vector length != 64");
  __ BIND(L_entry_large);

  __ BIND(L_pre_main_post_large);
  // Partial copy to make dst address 64 byte aligned.
  __ movq(temp2, to);
  __ andq(temp2, 63);
  __ jcc(Assembler::equal, L_main_pre_loop_large);

  __ negptr(temp2);
  __ addq(temp2, 64);
  if (shift) {
    __ shrq(temp2, shift);
  }
  __ movq(temp3, temp2);
  copy64_masked_avx(to, from, xmm1, k2, temp3, temp4, temp1, shift, 0, true);
  __ movq(temp4, temp2);
  __ movq(temp1, count);
  __ subq(temp1, temp2);

  __ cmpq(temp1, loop_size[shift]);
  __ jcc(Assembler::less, L_tail_large);

  __ BIND(L_main_pre_loop_large);
  __ subq(temp1, loop_size[shift]);

  // Main loop with aligned copy block size of 256 bytes at 64 byte copy granularity.
  __ align32();
  __ BIND(L_main_loop_large);
  copy256_avx3(to, from, temp4, xmm1, xmm2, xmm3, xmm4, shift, 0);
  __ addptr(temp4, loop_size[shift]);
  __ subq(temp1, loop_size[shift]);
  __ jcc(Assembler::greater, L_main_loop_large);
  // fence needed because copy256_avx3 uses non-temporal stores
  __ sfence();

  __ addq(temp1, loop_size[shift]);
  // Zero length check.
  __ jcc(Assembler::lessEqual, L_exit_large);
  __ BIND(L_tail_large);
  // Tail handling using 64 byte [masked] vector copy operations.
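  // The main loop may have consumed the entire remaining count, so re-check
  // for zero before dispatching to the masked special cases.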
  __ cmpq(temp1, 0);
  __ jcc(Assembler::lessEqual, L_exit_large);
  arraycopy_avx3_special_cases_256(xmm1, k2, from, to, temp1, shift,
                                   temp4, temp3, L_exit_large);
  __ BIND(L_exit_large);
}

// Inputs:
//   c_rarg0   - source array address
//   c_rarg1   - destination array address
//   c_rarg2   - element count, treated as ssize_t, can be zero
//
//
address StubGenerator::generate_conjoint_copy_avx3_masked(address* entry, const char *name, int shift,
                                                          address nooverlap_target, bool aligned,
                                                          bool is_oop, bool dest_uninitialized) {
  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, "StubRoutines", name);
  address start = __ pc();

  int avx3threshold = VM_Version::avx3_threshold();
  bool use64byteVector = (MaxVectorSize > 32) && (avx3threshold == 0);

  Label L_main_pre_loop, L_main_pre_loop_64bytes, L_pre_main_post_64;
  Label L_main_loop, L_main_loop_64bytes, L_tail, L_tail64, L_exit, L_entry;
  const Register from  = rdi;  // source array address
  const Register to    = rsi;  // destination array address
  const Register count = rdx;  // elements count
  const Register temp1 = r8;
  const Register temp2 = rcx;
  const Register temp3 = r11;
  const Register temp4 = rax;
  // End pointers are inclusive, and if count is not zero they point
  // to the last unit copied: end_to[0] := end_from[0]

  __ enter(); // required for proper stackwalking of RuntimeStub frame
  assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int.

  if (entry != nullptr) {
    *entry = __ pc();
    // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
    BLOCK_COMMENT("Entry:");
  }

  array_overlap_test(nooverlap_target, (Address::ScaleFactor)(shift));

  BasicType type_vec[] = { T_BYTE, T_SHORT, T_INT, T_LONG };
  BasicType type = is_oop ? T_OBJECT : type_vec[shift];

  setup_argument_regs(type);

  DecoratorSet decorators = IN_HEAP | IS_ARRAY;
  if (dest_uninitialized) {
    decorators |= IS_DEST_UNINITIALIZED;
  }
  if (aligned) {
    decorators |= ARRAYCOPY_ALIGNED;
  }
  BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
  bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
  {
    // Type(shift)     byte(0), short(1), int(2), long(3)
    int loop_size[] = { 192,    96,       48,     24 };
    int threshold[] = { 4096,   2048,     1024,   512 };

    // UnsafeCopyMemory page error: continue after ucm
    UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
    // 'from', 'to' and 'count' are now valid

    // temp1 holds remaining count.
    __ movq(temp1, count);

    // Zero length check.
    __ BIND(L_tail);
    __ cmpq(temp1, 0);
    __ jcc(Assembler::lessEqual, L_exit);

    __ mov64(temp2, 0);
    __ movq(temp3, temp1);
    // Special cases using 32 byte [masked] vector copy operations.
    arraycopy_avx3_special_cases_conjoint(xmm1, k2, from, to, temp2, temp3, temp1, shift,
                                          temp4, use64byteVector, L_entry, L_exit);

    // PRE-MAIN-POST loop for aligned copy.
    __ BIND(L_entry);

    if ((MaxVectorSize > 32) && (avx3threshold != 0)) {
      __ cmpq(temp1, threshold[shift]);
      __ jcc(Assembler::greaterEqual, L_pre_main_post_64);
    }

    if ((MaxVectorSize < 64) || (avx3threshold != 0)) {
      // Partial copy to make dst address 32 byte aligned.
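      // This copy runs backward, so it is the end of the destination that is
      // aligned here: temp2 <- (to + count*scale) & 31, the byte size of the
      // misaligned high tail, which is copied first with a masked store.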
      __ leaq(temp2, Address(to, temp1, (Address::ScaleFactor)(shift), 0));
      __ andq(temp2, 31);
      __ jcc(Assembler::equal, L_main_pre_loop);

      if (shift) {
        __ shrq(temp2, shift);
      }
      __ subq(temp1, temp2);
      copy32_masked_avx(to, from, xmm1, k2, temp2, temp1, temp3, shift);

      __ cmpq(temp1, loop_size[shift]);
      __ jcc(Assembler::less, L_tail);

      __ BIND(L_main_pre_loop);

      // Main loop with aligned copy block size of 192 bytes at 32 byte granularity.
      __ align32();
      __ BIND(L_main_loop);
      copy64_avx(to, from, temp1, xmm1, true, shift, -64);
      copy64_avx(to, from, temp1, xmm1, true, shift, -128);
      copy64_avx(to, from, temp1, xmm1, true, shift, -192);
      __ subptr(temp1, loop_size[shift]);
      __ cmpq(temp1, loop_size[shift]);
      __ jcc(Assembler::greater, L_main_loop);

      // Tail loop.
      __ jmp(L_tail);
    }

    if (MaxVectorSize > 32) {
      __ BIND(L_pre_main_post_64);
      // Partial copy to make dst address 64 byte aligned.
      __ leaq(temp2, Address(to, temp1, (Address::ScaleFactor)(shift), 0));
      __ andq(temp2, 63);
      __ jcc(Assembler::equal, L_main_pre_loop_64bytes);

      if (shift) {
        __ shrq(temp2, shift);
      }
      __ subq(temp1, temp2);
      copy64_masked_avx(to, from, xmm1, k2, temp2, temp1, temp3, shift, 0, true);

      __ cmpq(temp1, loop_size[shift]);
      __ jcc(Assembler::less, L_tail64);

      __ BIND(L_main_pre_loop_64bytes);

      // Main loop with aligned copy block size of 192 bytes at
      // 64 byte copy granularity.
      __ align32();
      __ BIND(L_main_loop_64bytes);
      copy64_avx(to, from, temp1, xmm1, true, shift, -64, true);
      copy64_avx(to, from, temp1, xmm1, true, shift, -128, true);
      copy64_avx(to, from, temp1, xmm1, true, shift, -192, true);
      __ subq(temp1, loop_size[shift]);
      __ cmpq(temp1, loop_size[shift]);
      __ jcc(Assembler::greater, L_main_loop_64bytes);

      // Zero length check.
      __ cmpq(temp1, 0);
      __ jcc(Assembler::lessEqual, L_exit);

      __ BIND(L_tail64);

      // Tail handling using 64 byte [masked] vector copy operations.
      use64byteVector = true;
      __ mov64(temp2, 0);
      __ movq(temp3, temp1);
      arraycopy_avx3_special_cases_conjoint(xmm1, k2, from, to, temp2, temp3, temp1, shift,
                                            temp4, use64byteVector, L_entry, L_exit);
    }
    __ BIND(L_exit);
  }
  address ucme_exit_pc = __ pc();
  // When called from generic_arraycopy, r11 contains values used during
  // the arraycopy epilogue; re-initialize r11 here.
  if (is_oop) {
    __ movq(r11, count);
  }
  bs->arraycopy_epilogue(_masm, decorators, type, from, to, count);
  restore_argument_regs(type);
  INC_COUNTER_NP(get_profile_ctr(shift), rscratch1); // Update counter after rscratch1 is free
  __ xorptr(rax, rax); // return 0
  __ vzeroupper();
  __ leave(); // required for proper stackwalking of RuntimeStub frame
  __ ret(0);

  return start;
}

void StubGenerator::arraycopy_avx3_special_cases(XMMRegister xmm, KRegister mask, Register from,
                                                 Register to, Register count, int shift,
                                                 Register index, Register temp,
                                                 bool use64byteVector, Label& L_entry, Label& L_exit) {
  Label L_entry_64, L_entry_96, L_entry_128;
  Label L_entry_160, L_entry_192;

  int size_mat[][6] = {
    /* T_BYTE  */ { 32, 64, 96, 128, 160, 192 },
    /* T_SHORT */ { 16, 32, 48,  64,  80,  96 },
    /* T_INT   */ {  8, 16, 24,  32,  40,  48 },
    /* T_LONG  */ {  4,  8, 12,  16,  20,  24 }
  };

  // Case A) Special case for length less than or equal to 32 bytes.
  __ cmpq(count, size_mat[shift][0]);
  __ jccb(Assembler::greater, L_entry_64);
  copy32_masked_avx(to, from, xmm, mask, count, index, temp, shift);
  __ jmp(L_exit);

  // Case B) Special case for length less than or equal to 64 bytes.
  __ BIND(L_entry_64);
  __ cmpq(count, size_mat[shift][1]);
  __ jccb(Assembler::greater, L_entry_96);
  copy64_masked_avx(to, from, xmm, mask, count, index, temp, shift, 0, use64byteVector);
  __ jmp(L_exit);

  // Case C) Special case for length less than or equal to 96 bytes.
  __ BIND(L_entry_96);
  __ cmpq(count, size_mat[shift][2]);
  __ jccb(Assembler::greater, L_entry_128);
  copy64_avx(to, from, index, xmm, false, shift, 0, use64byteVector);
  __ subq(count, 64 >> shift);
  copy32_masked_avx(to, from, xmm, mask, count, index, temp, shift, 64);
  __ jmp(L_exit);

  // Case D) Special case for length less than or equal to 128 bytes.
  __ BIND(L_entry_128);
  __ cmpq(count, size_mat[shift][3]);
  __ jccb(Assembler::greater, L_entry_160);
  copy64_avx(to, from, index, xmm, false, shift, 0, use64byteVector);
  copy32_avx(to, from, index, xmm, shift, 64);
  __ subq(count, 96 >> shift);
  copy32_masked_avx(to, from, xmm, mask, count, index, temp, shift, 96);
  __ jmp(L_exit);

  // Case E) Special case for length less than or equal to 160 bytes.
  __ BIND(L_entry_160);
  __ cmpq(count, size_mat[shift][4]);
  __ jccb(Assembler::greater, L_entry_192);
  copy64_avx(to, from, index, xmm, false, shift, 0, use64byteVector);
  copy64_avx(to, from, index, xmm, false, shift, 64, use64byteVector);
  __ subq(count, 128 >> shift);
  copy32_masked_avx(to, from, xmm, mask, count, index, temp, shift, 128);
  __ jmp(L_exit);

  // Case F) Special case for length less than or equal to 192 bytes.
  __ BIND(L_entry_192);
  __ cmpq(count, size_mat[shift][5]);
  __ jcc(Assembler::greater, L_entry);
  copy64_avx(to, from, index, xmm, false, shift, 0, use64byteVector);
  copy64_avx(to, from, index, xmm, false, shift, 64, use64byteVector);
  copy32_avx(to, from, index, xmm, shift, 128);
  __ subq(count, 160 >> shift);
  copy32_masked_avx(to, from, xmm, mask, count, index, temp, shift, 160);
  __ jmp(L_exit);
}

void StubGenerator::arraycopy_avx3_special_cases_256(XMMRegister xmm, KRegister mask, Register from,
                                                     Register to, Register count, int shift, Register index,
                                                     Register temp, Label& L_exit) {
  Label L_entry_64, L_entry_128, L_entry_192, L_entry_256;

  int size_mat[][4] = {
    /* T_BYTE  */ { 64, 128, 192, 256 },
    /* T_SHORT */ { 32,  64,  96, 128 },
    /* T_INT   */ { 16,  32,  48,  64 },
    /* T_LONG  */ {  8,  16,  24,  32 }
  };

  assert(MaxVectorSize == 64, "vector length != 64");
  // Case A) Special case for length less than or equal to 64 bytes.
  __ BIND(L_entry_64);
  __ cmpq(count, size_mat[shift][0]);
  __ jccb(Assembler::greater, L_entry_128);
  copy64_masked_avx(to, from, xmm, mask, count, index, temp, shift, 0, true);
  __ jmp(L_exit);

  // Case B) Special case for length less than or equal to 128 bytes.
  __ BIND(L_entry_128);
  __ cmpq(count, size_mat[shift][1]);
  __ jccb(Assembler::greater, L_entry_192);
  copy64_avx(to, from, index, xmm, false, shift, 0, true);
  __ subq(count, 64 >> shift);
  copy64_masked_avx(to, from, xmm, mask, count, index, temp, shift, 64, true);
  __ jmp(L_exit);

  // Case C) Special case for length less than or equal to 192 bytes.
  __ BIND(L_entry_192);
  __ cmpq(count, size_mat[shift][2]);
  __ jcc(Assembler::greater, L_entry_256);
  copy64_avx(to, from, index, xmm, false, shift, 0, true);
  copy64_avx(to, from, index, xmm, false, shift, 64, true);
  __ subq(count, 128 >> shift);
  copy64_masked_avx(to, from, xmm, mask, count, index, temp, shift, 128, true);
  __ jmp(L_exit);

  // Case D) Special case for length less than or equal to 256 bytes.
  __ BIND(L_entry_256);
  copy64_avx(to, from, index, xmm, false, shift, 0, true);
  copy64_avx(to, from, index, xmm, false, shift, 64, true);
  copy64_avx(to, from, index, xmm, false, shift, 128, true);
  __ subq(count, 192 >> shift);
  copy64_masked_avx(to, from, xmm, mask, count, index, temp, shift, 192, true);
  __ jmp(L_exit);
}

void StubGenerator::arraycopy_avx3_special_cases_conjoint(XMMRegister xmm, KRegister mask, Register from,
                                                          Register to, Register start_index, Register end_index,
                                                          Register count, int shift, Register temp,
                                                          bool use64byteVector, Label& L_entry, Label& L_exit) {
  Label L_entry_64, L_entry_96, L_entry_128;
  Label L_entry_160, L_entry_192;
  bool avx3 = (MaxVectorSize > 32) && (VM_Version::avx3_threshold() == 0);

  int size_mat[][6] = {
    /* T_BYTE  */ { 32, 64, 96, 128, 160, 192 },
    /* T_SHORT */ { 16, 32, 48,  64,  80,  96 },
    /* T_INT   */ {  8, 16, 24,  32,  40,  48 },
    /* T_LONG  */ {  4,  8, 12,  16,  20,  24 }
  };

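  // These conjoint cases copy the higher-addressed chunks first (indexed
  // through end_index) and finish with a masked copy of the low remainder
  // at start_index, so overlapping source data is read before the
  // destination stores can overwrite it.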
  // Case A) Special case for length less than or equal to 32 bytes.
  __ cmpq(count, size_mat[shift][0]);
  __ jccb(Assembler::greater, L_entry_64);
  copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
  __ jmp(L_exit);

  // Case B) Special case for length less than or equal to 64 bytes.
  __ BIND(L_entry_64);
  __ cmpq(count, size_mat[shift][1]);
  __ jccb(Assembler::greater, L_entry_96);
  if (avx3) {
    copy64_masked_avx(to, from, xmm, mask, count, start_index, temp, shift, 0, true);
  } else {
    copy32_avx(to, from, end_index, xmm, shift, -32);
    __ subq(count, 32 >> shift);
    copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
  }
  __ jmp(L_exit);

  // Case C) Special case for length less than or equal to 96 bytes.
  __ BIND(L_entry_96);
  __ cmpq(count, size_mat[shift][2]);
  __ jccb(Assembler::greater, L_entry_128);
  copy64_avx(to, from, end_index, xmm, true, shift, -64, use64byteVector);
  __ subq(count, 64 >> shift);
  copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
  __ jmp(L_exit);

  // Case D) Special case for length less than or equal to 128 bytes.
  __ BIND(L_entry_128);
  __ cmpq(count, size_mat[shift][3]);
  __ jccb(Assembler::greater, L_entry_160);
  copy64_avx(to, from, end_index, xmm, true, shift, -64, use64byteVector);
  copy32_avx(to, from, end_index, xmm, shift, -96);
  __ subq(count, 96 >> shift);
  copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
  __ jmp(L_exit);

  // Case E) Special case for length less than or equal to 160 bytes.
  __ BIND(L_entry_160);
  __ cmpq(count, size_mat[shift][4]);
  __ jccb(Assembler::greater, L_entry_192);
  copy64_avx(to, from, end_index, xmm, true, shift, -64, use64byteVector);
  copy64_avx(to, from, end_index, xmm, true, shift, -128, use64byteVector);
  __ subq(count, 128 >> shift);
  copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
  __ jmp(L_exit);

  // Case F) Special case for length less than or equal to 192 bytes.
  __ BIND(L_entry_192);
  __ cmpq(count, size_mat[shift][5]);
  __ jcc(Assembler::greater, L_entry);
  copy64_avx(to, from, end_index, xmm, true, shift, -64, use64byteVector);
  copy64_avx(to, from, end_index, xmm, true, shift, -128, use64byteVector);
  copy32_avx(to, from, end_index, xmm, shift, -160);
  __ subq(count, 160 >> shift);
  copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
  __ jmp(L_exit);
}

void StubGenerator::copy256_avx3(Register dst, Register src, Register index, XMMRegister xmm1,
                                 XMMRegister xmm2, XMMRegister xmm3, XMMRegister xmm4,
                                 int shift, int offset) {
  if (MaxVectorSize == 64) {
    Address::ScaleFactor scale = (Address::ScaleFactor)(shift);
    __ prefetcht0(Address(src, index, scale, offset + 0x200));
    __ prefetcht0(Address(src, index, scale, offset + 0x240));
    __ prefetcht0(Address(src, index, scale, offset + 0x280));
    __ prefetcht0(Address(src, index, scale, offset + 0x2C0));

    __ prefetcht0(Address(src, index, scale, offset + 0x400));
    __ prefetcht0(Address(src, index, scale, offset + 0x440));
    __ prefetcht0(Address(src, index, scale, offset + 0x480));
    __ prefetcht0(Address(src, index, scale, offset + 0x4C0));

    __ evmovdquq(xmm1, Address(src, index, scale, offset), Assembler::AVX_512bit);
    __ evmovdquq(xmm2, Address(src, index, scale, offset + 0x40), Assembler::AVX_512bit);
    __ evmovdquq(xmm3, Address(src, index, scale, offset + 0x80), Assembler::AVX_512bit);
    __ evmovdquq(xmm4, Address(src, index, scale, offset + 0xC0), Assembler::AVX_512bit);

    __ evmovntdquq(Address(dst, index, scale, offset), xmm1, Assembler::AVX_512bit);
    __ evmovntdquq(Address(dst, index, scale, offset + 0x40), xmm2, Assembler::AVX_512bit);
    __ evmovntdquq(Address(dst, index, scale, offset + 0x80), xmm3, Assembler::AVX_512bit);
    __ evmovntdquq(Address(dst, index, scale, offset + 0xC0), xmm4, Assembler::AVX_512bit);
  }
}

void StubGenerator::copy64_masked_avx(Register dst, Register src, XMMRegister xmm,
                                      KRegister mask, Register length, Register index,
                                      Register temp, int shift, int offset,
                                      bool use64byteVector) {
  BasicType type[] = { T_BYTE, T_SHORT, T_INT, T_LONG };
  assert(MaxVectorSize >= 32, "vector length should be >= 32");
  if (!use64byteVector) {
    copy32_avx(dst, src, index, xmm, shift, offset);
    __ subptr(length, 32 >> shift);
    copy32_masked_avx(dst, src, xmm, mask, length, index, temp, shift, offset + 32);
  } else {
    Address::ScaleFactor scale = (Address::ScaleFactor)(shift);
    assert(MaxVectorSize == 64, "vector length != 64");
    __ mov64(temp, -1L);
    __ bzhiq(temp, temp, length);
    __ kmovql(mask, temp);
    __ evmovdqu(type[shift], mask, xmm, Address(src, index, scale, offset), false, Assembler::AVX_512bit);
    __ evmovdqu(type[shift], mask, Address(dst, index, scale, offset), xmm, true, Assembler::AVX_512bit);
  }
}


void StubGenerator::copy32_masked_avx(Register dst, Register src, XMMRegister xmm,
                                      KRegister mask, Register length, Register index,
                                      Register temp, int shift, int offset) {
  assert(MaxVectorSize >= 32, "vector length should be >= 32");
  BasicType type[] = { T_BYTE, T_SHORT, T_INT, T_LONG };
  Address::ScaleFactor scale = (Address::ScaleFactor)(shift);
  // Set the low `length` bits of the opmask: BZHI zeroes all bits of -1 at
  // positions >= length, so exactly `length` elements are enabled.
  __ mov64(temp, -1L);
  __ bzhiq(temp, temp, length);
  __ kmovql(mask, temp);
  __ evmovdqu(type[shift], mask, xmm, Address(src, index, scale, offset), false, Assembler::AVX_256bit);
  __ evmovdqu(type[shift], mask, Address(dst, index, scale, offset), xmm, true, Assembler::AVX_256bit);
}


void StubGenerator::copy32_avx(Register dst, Register src, Register index, XMMRegister xmm,
                               int shift, int offset) {
  assert(MaxVectorSize >= 32, "vector length should be >= 32");
  Address::ScaleFactor scale = (Address::ScaleFactor)(shift);
  __ vmovdqu(xmm, Address(src, index, scale, offset));
  __ vmovdqu(Address(dst, index, scale, offset), xmm);
}


void StubGenerator::copy64_avx(Register dst, Register src, Register index, XMMRegister xmm,
                               bool conjoint, int shift, int offset, bool use64byteVector) {
  assert(MaxVectorSize == 64 || MaxVectorSize == 32, "vector length mismatch");
  if (!use64byteVector) {
    if (conjoint) {
      copy32_avx(dst, src, index, xmm, shift, offset + 32);
      copy32_avx(dst, src, index, xmm, shift, offset);
    } else {
      copy32_avx(dst, src, index, xmm, shift, offset);
      copy32_avx(dst, src, index, xmm, shift, offset + 32);
    }
  } else {
    Address::ScaleFactor scale = (Address::ScaleFactor)(shift);
    __ evmovdquq(xmm, Address(src, index, scale, offset), Assembler::AVX_512bit);
    __ evmovdquq(Address(dst, index, scale, offset), xmm, Assembler::AVX_512bit);
  }
}

#endif // COMPILER2_OR_JVMCI


// Arguments:
//   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
//             ignored
//   name    - stub name string
//
// Inputs:
//   c_rarg0   - source array address
//   c_rarg1   - destination array address
//   c_rarg2   - element count, treated as ssize_t, can be zero
//
// If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
// we let the hardware handle it.  The one to eight bytes within words,
// dwords or qwords that span cache line boundaries will still be loaded
// and stored atomically.
//
// Side Effects:
//   disjoint_byte_copy_entry is set to the no-overlap entry point
//   used by generate_conjoint_byte_copy().
//
address StubGenerator::generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
#if COMPILER2_OR_JVMCI
  if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) {
    return generate_disjoint_copy_avx3_masked(entry, "jbyte_disjoint_arraycopy_avx3", 0,
                                              aligned, false, false);
  }
#endif
  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, "StubRoutines", name);
  address start = __ pc();
  DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;

  Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes;
  Label L_copy_byte, L_exit;
  const Register from        = rdi;  // source array address
  const Register to          = rsi;  // destination array address
  const Register count       = rdx;  // elements count
  const Register byte_count  = rcx;
  const Register qword_count = count;
  const Register end_from    = from; // source array end address
  const Register end_to      = to;   // destination array end address
  // End pointers are inclusive, and if count is not zero they point
  // to the last unit copied: end_to[0] := end_from[0]

  __ enter(); // required for proper stackwalking of RuntimeStub frame
  assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int.
  if (entry != nullptr) {
    *entry = __ pc();
    // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
    BLOCK_COMMENT("Entry:");
  }

  setup_arg_regs(); // from => rdi, to => rsi, count => rdx
                    // r9 and r10 may be used to save non-volatile registers

  {
    // UnsafeCopyMemory page error: continue after ucm
    UnsafeCopyMemoryMark ucmm(this, !aligned, true);
    // 'from', 'to' and 'count' are now valid
    __ movptr(byte_count, count);
    __ shrptr(count, 3); // count => qword_count

    // Copy from low to high addresses.  Use 'to' as scratch.
    __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
    __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
    __ negptr(qword_count); // make the count negative
    __ jmp(L_copy_bytes);

    // Copy trailing qwords
    __ BIND(L_copy_8_bytes);
    __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
    __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
    __ increment(qword_count);
    __ jcc(Assembler::notZero, L_copy_8_bytes);

    // Check for and copy trailing dword
    __ BIND(L_copy_4_bytes);
    __ testl(byte_count, 4);
    __ jccb(Assembler::zero, L_copy_2_bytes);
    __ movl(rax, Address(end_from, 8));
    __ movl(Address(end_to, 8), rax);

    __ addptr(end_from, 4);
    __ addptr(end_to, 4);

    // Check for and copy trailing word
    __ BIND(L_copy_2_bytes);
    __ testl(byte_count, 2);
    __ jccb(Assembler::zero, L_copy_byte);
    __ movw(rax, Address(end_from, 8));
    __ movw(Address(end_to, 8), rax);

    __ addptr(end_from, 2);
    __ addptr(end_to, 2);

    // Check for and copy trailing byte
    __ BIND(L_copy_byte);
    __ testl(byte_count, 1);
    __ jccb(Assembler::zero, L_exit);
    __ movb(rax, Address(end_from, 8));
    __ movb(Address(end_to, 8), rax);
  }
  __ BIND(L_exit);
  address ucme_exit_pc = __ pc();
  restore_arg_regs();
  INC_COUNTER_NP(SharedRuntime::_jbyte_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
  __ xorptr(rax, rax); // return 0
  __ vzeroupper();
  __ leave(); // required for proper stackwalking of RuntimeStub frame
  __ ret(0);

  {
    UnsafeCopyMemoryMark ucmm(this, !aligned, false, ucme_exit_pc);
    // Copy in multi-byte chunks
    copy_bytes_forward(end_from, end_to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, T_BYTE);
    __ jmp(L_copy_4_bytes);
  }
  return start;
}


// Arguments:
//   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
//             ignored
//   name    - stub name string
//
// Inputs:
//   c_rarg0   - source array address
//   c_rarg1   - destination array address
//   c_rarg2   - element count, treated as ssize_t, can be zero
//
// If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
// we let the hardware handle it.  The one to eight bytes within words,
// dwords or qwords that span cache line boundaries will still be loaded
// and stored atomically.
//
address StubGenerator::generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
                                                   address* entry, const char *name) {
#if COMPILER2_OR_JVMCI
  if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) {
    return generate_conjoint_copy_avx3_masked(entry, "jbyte_conjoint_arraycopy_avx3", 0,
                                              nooverlap_target, aligned, false, false);
  }
#endif
  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, "StubRoutines", name);
  address start = __ pc();
  DecoratorSet decorators = IN_HEAP | IS_ARRAY;

  Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes;
  const Register from        = rdi;  // source array address
  const Register to          = rsi;  // destination array address
  const Register count       = rdx;  // elements count
  const Register byte_count  = rcx;
  const Register qword_count = count;

  __ enter(); // required for proper stackwalking of RuntimeStub frame
  assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int.

  if (entry != nullptr) {
    *entry = __ pc();
    // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
    BLOCK_COMMENT("Entry:");
  }

  array_overlap_test(nooverlap_target, Address::times_1);
  setup_arg_regs(); // from => rdi, to => rsi, count => rdx
                    // r9 and r10 may be used to save non-volatile registers

  {
    // UnsafeCopyMemory page error: continue after ucm
    UnsafeCopyMemoryMark ucmm(this, !aligned, true);
    // 'from', 'to' and 'count' are now valid
    __ movptr(byte_count, count);
    __ shrptr(count, 3); // count => qword_count

    // Copy from high to low addresses.

    // Check for and copy trailing byte
    __ testl(byte_count, 1);
    __ jcc(Assembler::zero, L_copy_2_bytes);
    __ movb(rax, Address(from, byte_count, Address::times_1, -1));
    __ movb(Address(to, byte_count, Address::times_1, -1), rax);
    __ decrement(byte_count); // Adjust for possible trailing word

    // Check for and copy trailing word
    __ BIND(L_copy_2_bytes);
    __ testl(byte_count, 2);
    __ jcc(Assembler::zero, L_copy_4_bytes);
    __ movw(rax, Address(from, byte_count, Address::times_1, -2));
    __ movw(Address(to, byte_count, Address::times_1, -2), rax);

    // Check for and copy trailing dword
    __ BIND(L_copy_4_bytes);
    __ testl(byte_count, 4);
    __ jcc(Assembler::zero, L_copy_bytes);
    __ movl(rax, Address(from, qword_count, Address::times_8));
    __ movl(Address(to, qword_count, Address::times_8), rax);
    __ jmp(L_copy_bytes);

    // Copy trailing qwords
    __ BIND(L_copy_8_bytes);
    __ movq(rax, Address(from, qword_count, Address::times_8, -8));
    __ movq(Address(to, qword_count, Address::times_8, -8), rax);
    __ decrement(qword_count);
    __ jcc(Assembler::notZero, L_copy_8_bytes);
  }
  restore_arg_regs();
  INC_COUNTER_NP(SharedRuntime::_jbyte_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
  __ xorptr(rax, rax); // return 0
  __ vzeroupper();
  __ leave(); // required for proper stackwalking of RuntimeStub frame
  __ ret(0);

  {
    // UnsafeCopyMemory page error: continue after ucm
    UnsafeCopyMemoryMark ucmm(this, !aligned, true);
    // Copy in multi-byte chunks
    copy_bytes_backward(from, to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, T_BYTE);
  }
  restore_arg_regs();
  INC_COUNTER_NP(SharedRuntime::_jbyte_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
  __ xorptr(rax, rax); // return 0
  __ vzeroupper();
  __ leave(); // required for proper stackwalking of RuntimeStub frame
  __ ret(0);

  return start;
}


// Arguments:
//   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
//             ignored
//   name    - stub name string
//
// Inputs:
//   c_rarg0   - source array address
//   c_rarg1   - destination array address
//   c_rarg2   - element count, treated as ssize_t, can be zero
//
// If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
// let the hardware handle it.  The two or four words within dwords
// or qwords that span cache line boundaries will still be loaded
// and stored atomically.
//
// Side Effects:
//   disjoint_short_copy_entry is set to the no-overlap entry point
//   used by generate_conjoint_short_copy().
//
address StubGenerator::generate_disjoint_short_copy(bool aligned, address *entry, const char *name) {
#if COMPILER2_OR_JVMCI
  if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) {
    return generate_disjoint_copy_avx3_masked(entry, "jshort_disjoint_arraycopy_avx3", 1,
                                              aligned, false, false);
  }
#endif

  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, "StubRoutines", name);
  address start = __ pc();
  DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;

  Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes, L_exit;
  const Register from        = rdi; // source array address
  const Register to          = rsi; // destination array address
  const Register count       = rdx; // elements count
  const Register word_count  = rcx;
  const Register qword_count = count;
  const Register end_from    = from; // source array end address
  const Register end_to      = to;   // destination array end address
  // End pointers are inclusive, and if count is not zero they point
  // to the last unit copied:  end_to[0] := end_from[0]

  __ enter(); // required for proper stackwalking of RuntimeStub frame
  assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int.

  if (entry != nullptr) {
    *entry = __ pc();
    // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
    BLOCK_COMMENT("Entry:");
  }

  setup_arg_regs(); // from => rdi, to => rsi, count => rdx
                    // r9 and r10 may be used to save non-volatile registers

  {
    // UnsafeCopyMemory page error: continue after ucm
    UnsafeCopyMemoryMark ucmm(this, !aligned, true);
    // 'from', 'to' and 'count' are now valid
    __ movptr(word_count, count);
    __ shrptr(count, 2); // count => qword_count

    // Copy from low to high addresses.  Use 'to' as scratch.
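    // Illustrative arithmetic for the addressing below (assumed three
    // qwords to move, not emitted code): end_from = from + 3*8 - 8 points
    // at the last qword and qword_count becomes -3, so the first loop
    // load at Address(end_from, qword_count, times_8, 8) resolves to
    // end_from - 24 + 8 = from, and the last, at qword_count = -1, hits
    // end_from itself; the loop then exits when the increment reaches
    // zero, with no separate compare instruction.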
    __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
    __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
    __ negptr(qword_count);
    __ jmp(L_copy_bytes);

    // Copy trailing qwords
    __ BIND(L_copy_8_bytes);
    __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
    __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
    __ increment(qword_count);
    __ jcc(Assembler::notZero, L_copy_8_bytes);

    // Original 'dest' is trashed, so we can't use it as a
    // base register for a possible trailing word copy

    // Check for and copy trailing dword
    __ BIND(L_copy_4_bytes);
    __ testl(word_count, 2);
    __ jccb(Assembler::zero, L_copy_2_bytes);
    __ movl(rax, Address(end_from, 8));
    __ movl(Address(end_to, 8), rax);

    __ addptr(end_from, 4);
    __ addptr(end_to, 4);

    // Check for and copy trailing word
    __ BIND(L_copy_2_bytes);
    __ testl(word_count, 1);
    __ jccb(Assembler::zero, L_exit);
    __ movw(rax, Address(end_from, 8));
    __ movw(Address(end_to, 8), rax);
  }
  __ BIND(L_exit);
  address ucme_exit_pc = __ pc();
  restore_arg_regs();
  INC_COUNTER_NP(SharedRuntime::_jshort_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
  __ xorptr(rax, rax); // return 0
  __ vzeroupper();
  __ leave(); // required for proper stackwalking of RuntimeStub frame
  __ ret(0);

  {
    UnsafeCopyMemoryMark ucmm(this, !aligned, false, ucme_exit_pc);
    // Copy in multi-byte chunks
    copy_bytes_forward(end_from, end_to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, T_SHORT);
    __ jmp(L_copy_4_bytes);
  }

  return start;
}


address StubGenerator::generate_fill(BasicType t, bool aligned, const char *name) {
  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, "StubRoutines", name);
  address start = __ pc();

  BLOCK_COMMENT("Entry:");

  const Register to    = c_rarg0; // destination array address
  const Register value = c_rarg1; // value
  const Register count = c_rarg2; // elements count
  __ mov(r11, count);

  __ enter(); // required for proper stackwalking of RuntimeStub frame

  __ generate_fill(t, aligned, to, value, r11, rax, xmm0);

  __ vzeroupper();
  __ leave(); // required for proper stackwalking of RuntimeStub frame
  __ ret(0);

  return start;
}


// Arguments:
//   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
//             ignored
//   name    - stub name string
//
// Inputs:
//   c_rarg0   - source array address
//   c_rarg1   - destination array address
//   c_rarg2   - element count, treated as ssize_t, can be zero
//
// If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
// let the hardware handle it.  The two or four words within dwords
// or qwords that span cache line boundaries will still be loaded
// and stored atomically.
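//
// A note on ordering (summarizing the code below): the conjoint stubs
// copy from high addresses down, moving the trailing sub-qword pieces
// first and the bulk qwords afterwards, so a forward-overlapping pair
// (to > from) never reads a slot it has already overwritten.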
//
address StubGenerator::generate_conjoint_short_copy(bool aligned, address nooverlap_target,
                                                    address *entry, const char *name) {
#if COMPILER2_OR_JVMCI
  if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) {
    return generate_conjoint_copy_avx3_masked(entry, "jshort_conjoint_arraycopy_avx3", 1,
                                              nooverlap_target, aligned, false, false);
  }
#endif
  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, "StubRoutines", name);
  address start = __ pc();
  DecoratorSet decorators = IN_HEAP | IS_ARRAY;

  Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes;
  const Register from        = rdi; // source array address
  const Register to          = rsi; // destination array address
  const Register count       = rdx; // elements count
  const Register word_count  = rcx;
  const Register qword_count = count;

  __ enter(); // required for proper stackwalking of RuntimeStub frame
  assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int.

  if (entry != nullptr) {
    *entry = __ pc();
    // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
    BLOCK_COMMENT("Entry:");
  }

  array_overlap_test(nooverlap_target, Address::times_2);
  setup_arg_regs(); // from => rdi, to => rsi, count => rdx
                    // r9 and r10 may be used to save non-volatile registers

  {
    // UnsafeCopyMemory page error: continue after ucm
    UnsafeCopyMemoryMark ucmm(this, !aligned, true);
    // 'from', 'to' and 'count' are now valid
    __ movptr(word_count, count);
    __ shrptr(count, 2); // count => qword_count

    // Copy from high to low addresses.  Use 'to' as scratch.

    // Check for and copy trailing word
    __ testl(word_count, 1);
    __ jccb(Assembler::zero, L_copy_4_bytes);
    __ movw(rax, Address(from, word_count, Address::times_2, -2));
    __ movw(Address(to, word_count, Address::times_2, -2), rax);

    // Check for and copy trailing dword
    __ BIND(L_copy_4_bytes);
    __ testl(word_count, 2);
    __ jcc(Assembler::zero, L_copy_bytes);
    __ movl(rax, Address(from, qword_count, Address::times_8));
    __ movl(Address(to, qword_count, Address::times_8), rax);
    __ jmp(L_copy_bytes);

    // Copy trailing qwords
    __ BIND(L_copy_8_bytes);
    __ movq(rax, Address(from, qword_count, Address::times_8, -8));
    __ movq(Address(to, qword_count, Address::times_8, -8), rax);
    __ decrement(qword_count);
    __ jcc(Assembler::notZero, L_copy_8_bytes);
  }
  restore_arg_regs();
  INC_COUNTER_NP(SharedRuntime::_jshort_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
  __ xorptr(rax, rax); // return 0
  __ vzeroupper();
  __ leave(); // required for proper stackwalking of RuntimeStub frame
  __ ret(0);

  {
    // UnsafeCopyMemory page error: continue after ucm
    UnsafeCopyMemoryMark ucmm(this, !aligned, true);
    // Copy in multi-byte chunks
    copy_bytes_backward(from, to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, T_SHORT);
  }
  restore_arg_regs();
  INC_COUNTER_NP(SharedRuntime::_jshort_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
  __ xorptr(rax, rax); // return 0
  __ vzeroupper();
  __ leave(); // required for proper stackwalking of RuntimeStub frame
  __ ret(0);

  return start;
}


// Arguments:
//   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
//             ignored
//   is_oop  - true => oop array, so generate store check code
//   name    - stub name string
//
// Inputs:
//   c_rarg0   - source array address
//   c_rarg1   - destination array address
//   c_rarg2   - element count, treated as ssize_t, can be zero
//
// If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
// the hardware handle it.  The two dwords within qwords that span
// cache line boundaries will still be loaded and stored atomically.
//
// Side Effects:
//   disjoint_int_copy_entry is set to the no-overlap entry point
//   used by generate_conjoint_int_oop_copy().
//
address StubGenerator::generate_disjoint_int_oop_copy(bool aligned, bool is_oop, address* entry,
                                                      const char *name, bool dest_uninitialized) {
  BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
#if COMPILER2_OR_JVMCI
  if ((!is_oop || bs->supports_avx3_masked_arraycopy()) && VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) {
    return generate_disjoint_copy_avx3_masked(entry, "jint_disjoint_arraycopy_avx3", 2,
                                              aligned, is_oop, dest_uninitialized);
  }
#endif

  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, "StubRoutines", name);
  address start = __ pc();

  Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_exit;
  const Register from        = rdi; // source array address
  const Register to          = rsi; // destination array address
  const Register count       = rdx; // elements count
  const Register dword_count = rcx;
  const Register qword_count = count;
  const Register end_from    = from; // source array end address
  const Register end_to      = to;   // destination array end address
  // End pointers are inclusive, and if count is not zero they point
  // to the last unit copied:  end_to[0] := end_from[0]

  __ enter(); // required for proper stackwalking of RuntimeStub frame
  assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int.

  if (entry != nullptr) {
    *entry = __ pc();
    // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
    BLOCK_COMMENT("Entry:");
  }

  setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
                                 // r9 is used to save r15_thread

  DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
  if (dest_uninitialized) {
    decorators |= IS_DEST_UNINITIALIZED;
  }
  if (aligned) {
    decorators |= ARRAYCOPY_ALIGNED;
  }

  BasicType type = is_oop ? T_OBJECT : T_INT;
  bs->arraycopy_prologue(_masm, decorators, type, from, to, count);

  {
    // UnsafeCopyMemory page error: continue after ucm
    UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
    // 'from', 'to' and 'count' are now valid
    __ movptr(dword_count, count);
    __ shrptr(count, 1); // count => qword_count

    // Copy from low to high addresses.  Use 'to' as scratch.
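    // Illustrative split (an assumed example, not emitted code): for
    // count = 5 ints, dword_count = 5 and qword_count = 5 >> 1 = 2, so
    // two qwords move in bulk and the (dword_count & 1) test afterwards
    // copies the one remaining int.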
    __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
    __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
    __ negptr(qword_count);
    __ jmp(L_copy_bytes);

    // Copy trailing qwords
    __ BIND(L_copy_8_bytes);
    __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
    __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
    __ increment(qword_count);
    __ jcc(Assembler::notZero, L_copy_8_bytes);

    // Check for and copy trailing dword
    __ BIND(L_copy_4_bytes);
    __ testl(dword_count, 1); // Only byte test since the value is 0 or 1
    __ jccb(Assembler::zero, L_exit);
    __ movl(rax, Address(end_from, 8));
    __ movl(Address(end_to, 8), rax);
  }
  __ BIND(L_exit);
  address ucme_exit_pc = __ pc();
  bs->arraycopy_epilogue(_masm, decorators, type, from, to, dword_count);
  restore_arg_regs_using_thread();
  INC_COUNTER_NP(SharedRuntime::_jint_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
  __ vzeroupper();
  __ xorptr(rax, rax); // return 0
  __ leave(); // required for proper stackwalking of RuntimeStub frame
  __ ret(0);

  {
    UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, false, ucme_exit_pc);
    // Copy in multi-byte chunks
    copy_bytes_forward(end_from, end_to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, is_oop ? T_OBJECT : T_INT);
    __ jmp(L_copy_4_bytes);
  }

  return start;
}


// Arguments:
//   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
//             ignored
//   is_oop  - true => oop array, so generate store check code
//   name    - stub name string
//
// Inputs:
//   c_rarg0   - source array address
//   c_rarg1   - destination array address
//   c_rarg2   - element count, treated as ssize_t, can be zero
//
// If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
// the hardware handle it.  The two dwords within qwords that span
// cache line boundaries will still be loaded and stored atomically.
//
address StubGenerator::generate_conjoint_int_oop_copy(bool aligned, bool is_oop, address nooverlap_target,
                                                      address *entry, const char *name,
                                                      bool dest_uninitialized) {
  BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
#if COMPILER2_OR_JVMCI
  if ((!is_oop || bs->supports_avx3_masked_arraycopy()) && VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) {
    return generate_conjoint_copy_avx3_masked(entry, "jint_conjoint_arraycopy_avx3", 2,
                                              nooverlap_target, aligned, is_oop, dest_uninitialized);
  }
#endif
  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, "StubRoutines", name);
  address start = __ pc();

  Label L_copy_bytes, L_copy_8_bytes, L_exit;
  const Register from        = rdi; // source array address
  const Register to          = rsi; // destination array address
  const Register count       = rdx; // elements count
  const Register dword_count = rcx;
  const Register qword_count = count;

  __ enter(); // required for proper stackwalking of RuntimeStub frame
  assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int.

  if (entry != nullptr) {
    *entry = __ pc();
    // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
    BLOCK_COMMENT("Entry:");
  }

  array_overlap_test(nooverlap_target, Address::times_4);
  setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
                                 // r9 is used to save r15_thread

  DecoratorSet decorators = IN_HEAP | IS_ARRAY;
  if (dest_uninitialized) {
    decorators |= IS_DEST_UNINITIALIZED;
  }
  if (aligned) {
    decorators |= ARRAYCOPY_ALIGNED;
  }

  BasicType type = is_oop ? T_OBJECT : T_INT;
  // no registers are destroyed by this call
  bs->arraycopy_prologue(_masm, decorators, type, from, to, count);

  assert_clean_int(count, rax); // Make sure 'count' is clean int.
  {
    // UnsafeCopyMemory page error: continue after ucm
    UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
    // 'from', 'to' and 'count' are now valid
    __ movptr(dword_count, count);
    __ shrptr(count, 1); // count => qword_count

    // Copy from high to low addresses.  Use 'to' as scratch.

    // Check for and copy trailing dword
    __ testl(dword_count, 1);
    __ jcc(Assembler::zero, L_copy_bytes);
    __ movl(rax, Address(from, dword_count, Address::times_4, -4));
    __ movl(Address(to, dword_count, Address::times_4, -4), rax);
    __ jmp(L_copy_bytes);

    // Copy trailing qwords
    __ BIND(L_copy_8_bytes);
    __ movq(rax, Address(from, qword_count, Address::times_8, -8));
    __ movq(Address(to, qword_count, Address::times_8, -8), rax);
    __ decrement(qword_count);
    __ jcc(Assembler::notZero, L_copy_8_bytes);
  }
  if (is_oop) {
    __ jmp(L_exit);
  }
  restore_arg_regs_using_thread();
  INC_COUNTER_NP(SharedRuntime::_jint_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
  __ xorptr(rax, rax); // return 0
  __ vzeroupper();
  __ leave(); // required for proper stackwalking of RuntimeStub frame
  __ ret(0);

  {
    // UnsafeCopyMemory page error: continue after ucm
    UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
    // Copy in multi-byte chunks
    copy_bytes_backward(from, to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, is_oop ? T_OBJECT : T_INT);
  }

  __ BIND(L_exit);
  bs->arraycopy_epilogue(_masm, decorators, type, from, to, dword_count);
  restore_arg_regs_using_thread();
  INC_COUNTER_NP(SharedRuntime::_jint_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
  __ xorptr(rax, rax); // return 0
  __ vzeroupper();
  __ leave(); // required for proper stackwalking of RuntimeStub frame
  __ ret(0);

  return start;
}


// Arguments:
//   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
//             ignored
//   is_oop  - true => oop array, so generate store check code
//   name    - stub name string
//
// Inputs:
//   c_rarg0   - source array address
//   c_rarg1   - destination array address
//   c_rarg2   - element count, treated as ssize_t, can be zero
//
// Side Effects:
//   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
//   no-overlap entry point used by generate_conjoint_long_oop_copy().
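//
// A hedged note: unlike the narrower copies, the qword loop below moves
// elements through BarrierSetAssembler::copy_load_at/copy_store_at, so a
// collector that needs per-element load or store barriers on oop moves
// (not just the prologue/epilogue) can interpose; for plain T_LONG
// elements these calls are expected to reduce to ordinary movq moves.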
//
address StubGenerator::generate_disjoint_long_oop_copy(bool aligned, bool is_oop, address *entry,
                                                       const char *name, bool dest_uninitialized) {
  BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
#if COMPILER2_OR_JVMCI
  if ((!is_oop || bs->supports_avx3_masked_arraycopy()) && VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) {
    return generate_disjoint_copy_avx3_masked(entry, "jlong_disjoint_arraycopy_avx3", 3,
                                              aligned, is_oop, dest_uninitialized);
  }
#endif
  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, "StubRoutines", name);
  address start = __ pc();

  Label L_copy_bytes, L_copy_8_bytes, L_exit;
  const Register from        = rdi; // source array address
  const Register to          = rsi; // destination array address
  const Register qword_count = rdx; // elements count
  const Register end_from    = from; // source array end address
  const Register end_to      = rcx;  // destination array end address
  const Register saved_count = r11;
  // End pointers are inclusive, and if count is not zero they point
  // to the last unit copied:  end_to[0] := end_from[0]

  __ enter(); // required for proper stackwalking of RuntimeStub frame
  // Save no-overlap entry point for generate_conjoint_long_oop_copy()
  assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int.

  if (entry != nullptr) {
    *entry = __ pc();
    // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
    BLOCK_COMMENT("Entry:");
  }

  setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
                                 // r9 is used to save r15_thread
  // 'from', 'to' and 'qword_count' are now valid

  DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
  if (dest_uninitialized) {
    decorators |= IS_DEST_UNINITIALIZED;
  }
  if (aligned) {
    decorators |= ARRAYCOPY_ALIGNED;
  }

  BasicType type = is_oop ? T_OBJECT : T_LONG;
  bs->arraycopy_prologue(_masm, decorators, type, from, to, qword_count);
  {
    // UnsafeCopyMemory page error: continue after ucm
    UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);

    // Copy from low to high addresses.  Use 'to' as scratch.
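    // (No count shift is needed here: with 8-byte elements, jlong or
    //  uncompressed oop, the element count in rdx is already the qword
    //  count.)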
    __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
    __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
    __ negptr(qword_count);
    __ jmp(L_copy_bytes);

    // Copy trailing qwords
    __ BIND(L_copy_8_bytes);
    bs->copy_load_at(_masm, decorators, type, 8,
                     rax, Address(end_from, qword_count, Address::times_8, 8),
                     r10);
    bs->copy_store_at(_masm, decorators, type, 8,
                      Address(end_to, qword_count, Address::times_8, 8), rax,
                      r10);
    __ increment(qword_count);
    __ jcc(Assembler::notZero, L_copy_8_bytes);
  }
  if (is_oop) {
    __ jmp(L_exit);
  } else {
    restore_arg_regs_using_thread();
    INC_COUNTER_NP(SharedRuntime::_jlong_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
    __ xorptr(rax, rax); // return 0
    __ vzeroupper();
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);
  }

  {
    // UnsafeCopyMemory page error: continue after ucm
    UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
    // Copy in multi-byte chunks
    copy_bytes_forward(end_from, end_to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, is_oop ? T_OBJECT : T_LONG);
  }

  __ BIND(L_exit);
  bs->arraycopy_epilogue(_masm, decorators, type, from, to, qword_count);
  restore_arg_regs_using_thread();
  INC_COUNTER_NP(is_oop ? SharedRuntime::_oop_array_copy_ctr :
                          SharedRuntime::_jlong_array_copy_ctr,
                 rscratch1); // Update counter after rscratch1 is free
  __ vzeroupper();
  __ xorptr(rax, rax); // return 0
  __ leave(); // required for proper stackwalking of RuntimeStub frame
  __ ret(0);

  return start;
}


// Arguments:
//   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
//             ignored
//   is_oop  - true => oop array, so generate store check code
//   name    - stub name string
//
// Inputs:
//   c_rarg0   - source array address
//   c_rarg1   - destination array address
//   c_rarg2   - element count, treated as ssize_t, can be zero
//
address StubGenerator::generate_conjoint_long_oop_copy(bool aligned, bool is_oop, address nooverlap_target,
                                                       address *entry, const char *name,
                                                       bool dest_uninitialized) {
  BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
#if COMPILER2_OR_JVMCI
  if ((!is_oop || bs->supports_avx3_masked_arraycopy()) && VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) {
    return generate_conjoint_copy_avx3_masked(entry, "jlong_conjoint_arraycopy_avx3", 3,
                                              nooverlap_target, aligned, is_oop, dest_uninitialized);
  }
#endif
  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, "StubRoutines", name);
  address start = __ pc();

  Label L_copy_bytes, L_copy_8_bytes, L_exit;
  const Register from        = rdi; // source array address
  const Register to          = rsi; // destination array address
  const Register qword_count = rdx; // elements count
  const Register saved_count = rcx;

  __ enter(); // required for proper stackwalking of RuntimeStub frame
  assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int.

  if (entry != nullptr) {
    *entry = __ pc();
    // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
    BLOCK_COMMENT("Entry:");
  }

  array_overlap_test(nooverlap_target, Address::times_8);
  setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
                                 // r9 is used to save r15_thread
  // 'from', 'to' and 'qword_count' are now valid

  DecoratorSet decorators = IN_HEAP | IS_ARRAY;
  if (dest_uninitialized) {
    decorators |= IS_DEST_UNINITIALIZED;
  }
  if (aligned) {
    decorators |= ARRAYCOPY_ALIGNED;
  }

  BasicType type = is_oop ? T_OBJECT : T_LONG;
  bs->arraycopy_prologue(_masm, decorators, type, from, to, qword_count);
  {
    // UnsafeCopyMemory page error: continue after ucm
    UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);

    __ jmp(L_copy_bytes);

    // Copy trailing qwords
    __ BIND(L_copy_8_bytes);
    bs->copy_load_at(_masm, decorators, type, 8,
                     rax, Address(from, qword_count, Address::times_8, -8),
                     r10);
    bs->copy_store_at(_masm, decorators, type, 8,
                      Address(to, qword_count, Address::times_8, -8), rax,
                      r10);
    __ decrement(qword_count);
    __ jcc(Assembler::notZero, L_copy_8_bytes);
  }
  if (is_oop) {
    __ jmp(L_exit);
  } else {
    restore_arg_regs_using_thread();
    INC_COUNTER_NP(SharedRuntime::_jlong_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
    __ xorptr(rax, rax); // return 0
    __ vzeroupper();
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);
  }
  {
    // UnsafeCopyMemory page error: continue after ucm
    UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);

    // Copy in multi-byte chunks
    copy_bytes_backward(from, to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, is_oop ? T_OBJECT : T_LONG);
  }
  __ BIND(L_exit);
  bs->arraycopy_epilogue(_masm, decorators, type, from, to, qword_count);
  restore_arg_regs_using_thread();
  INC_COUNTER_NP(is_oop ? SharedRuntime::_oop_array_copy_ctr :
                          SharedRuntime::_jlong_array_copy_ctr,
                 rscratch1); // Update counter after rscratch1 is free
  __ vzeroupper();
  __ xorptr(rax, rax); // return 0
  __ leave(); // required for proper stackwalking of RuntimeStub frame
  __ ret(0);

  return start;
}


// Helper for generating a dynamic type check.
// Smashes no registers.
void StubGenerator::generate_type_check(Register sub_klass,
                                        Register super_check_offset,
                                        Register super_klass,
                                        Label& L_success) {
  assert_different_registers(sub_klass, super_check_offset, super_klass);

  BLOCK_COMMENT("type_check:");

  Label L_miss;

  __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, nullptr,
                                   super_check_offset);
  __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, nullptr);

  // Fall through on failure!
  __ BIND(L_miss);
}

//
// Generate checkcasting array copy stub
//
//  Input:
//    c_rarg0   - source array address
//    c_rarg1   - destination array address
//    c_rarg2   - element count, treated as ssize_t, can be zero
//    c_rarg3   - size_t ckoff (super_check_offset)
//        not Win64
//    c_rarg4   - oop ckval (super_klass)
//        Win64
//    rsp+40    - oop ckval (super_klass)
//
//  Output:
//    rax ==  0  -  success
//    rax == -1^K - failure, where K is partial transfer count
//
address StubGenerator::generate_checkcast_copy(const char *name, address *entry, bool dest_uninitialized) {

  Label L_load_element, L_store_element, L_do_card_marks, L_done;

  // Input registers (after setup_arg_regs)
  const Register from   = rdi; // source array address
  const Register to     = rsi; // destination array address
  const Register length = rdx; // elements count
  const Register ckoff  = rcx; // super_check_offset
  const Register ckval  = r8;  // super_klass

  // Registers used as temps (r13, r14 are save-on-entry)
  const Register end_from   = from; // source array end address
  const Register end_to     = r13;  // destination array end address
  const Register count      = rdx;  // -(count_remaining)
  const Register r14_length = r14;  // saved copy of length
  // End pointers are inclusive, and if length is not zero they point
  // to the last unit copied:  end_to[0] := end_from[0]

  const Register rax_oop   = rax; // actual oop copied
  const Register r11_klass = r11; // oop._klass

  //---------------------------------------------------------------
  // Assembler stub will be used for this call to arraycopy
  // if the two arrays are subtypes of Object[] but the
  // destination array type is not equal to or a supertype
  // of the source type.  Each element must be separately
  // checked.

  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, "StubRoutines", name);
  address start = __ pc();

  __ enter(); // required for proper stackwalking of RuntimeStub frame

#ifdef ASSERT
  // caller guarantees that the arrays really are different
  // otherwise, we would have to make conjoint checks
  { Label L;
    array_overlap_test(L, TIMES_OOP);
    __ stop("checkcast_copy within a single array");
    __ bind(L);
  }
#endif //ASSERT

  setup_arg_regs_using_thread(4); // from => rdi, to => rsi, length => rdx
                                  // ckoff => rcx, ckval => r8
                                  // r9 is used to save r15_thread
#ifdef _WIN64
  // last argument (#4) is on stack on Win64
  __ movptr(ckval, Address(rsp, 6 * wordSize));
#endif

  // Caller of this entry point must set up the argument registers.
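  // (Concretely, from/to/length must already be in rdi/rsi/rdx and
  //  ckoff/ckval in rcx/r8, matching the register table above; the
  //  generic copy stub further below marshals exactly these registers
  //  before jumping to checkcast_copy_entry.)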
  if (entry != nullptr) {
    *entry = __ pc();
    BLOCK_COMMENT("Entry:");
  }

  // allocate spill slots for r13, r14
  enum {
    saved_r13_offset,
    saved_r14_offset,
    saved_r10_offset,
    saved_rbp_offset
  };
  __ subptr(rsp, saved_rbp_offset * wordSize);
  __ movptr(Address(rsp, saved_r13_offset * wordSize), r13);
  __ movptr(Address(rsp, saved_r14_offset * wordSize), r14);
  __ movptr(Address(rsp, saved_r10_offset * wordSize), r10);

#ifdef ASSERT
  Label L2;
  __ get_thread(r14);
  __ cmpptr(r15_thread, r14);
  __ jcc(Assembler::equal, L2);
  __ stop("StubRoutines::call_stub: r15_thread is modified by call");
  __ bind(L2);
#endif // ASSERT

  // check that int operands are properly extended to size_t
  assert_clean_int(length, rax);
  assert_clean_int(ckoff, rax);

#ifdef ASSERT
  BLOCK_COMMENT("assert consistent ckoff/ckval");
  // The ckoff and ckval must be mutually consistent,
  // even though caller generates both.
  { Label L;
    int sco_offset = in_bytes(Klass::super_check_offset_offset());
    __ cmpl(ckoff, Address(ckval, sco_offset));
    __ jcc(Assembler::equal, L);
    __ stop("super_check_offset inconsistent");
    __ bind(L);
  }
#endif //ASSERT

  // Loop-invariant addresses.  They are exclusive end pointers.
  Address end_from_addr(from, length, TIMES_OOP, 0);
  Address   end_to_addr(to,   length, TIMES_OOP, 0);
  // Loop-variant addresses.  They assume post-incremented count < 0.
  Address from_element_addr(end_from, count, TIMES_OOP, 0);
  Address   to_element_addr(end_to,   count, TIMES_OOP, 0);

  DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
  if (dest_uninitialized) {
    decorators |= IS_DEST_UNINITIALIZED;
  }

  BasicType type = T_OBJECT;
  size_t element_size = UseCompressedOops ? 4 : 8;

  BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
  bs->arraycopy_prologue(_masm, decorators, type, from, to, count);

  // Copy from low to high addresses, indexed from the end of each array.
  __ lea(end_from, end_from_addr);
  __ lea(end_to, end_to_addr);
  __ movptr(r14_length, length); // save a copy of the length
  assert(length == count, "");   // else fix next line:
  __ negptr(count); // negate and test the length
  __ jcc(Assembler::notZero, L_load_element);

  // Empty array:  Nothing to do.
  __ xorptr(rax, rax); // return 0 on (trivial) success
  __ jmp(L_done);

  // ======== begin loop ========
  // (Loop is rotated; its entry is L_load_element.)
  // Loop control:
  //   for (count = -count; count != 0; count++)
  // Base pointers src, dst are biased by 8*(count-1), to last element.
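  // Illustrative failure accounting (an assumed example, not emitted
  // code): for 5 elements with the type check failing on the third,
  // count starts at -5 and has been incremented to -3 by then, so below
  // r14_length + count = 5 - 3 = 2 elements were actually copied, and
  // the caller receives rax = ~2 = -3, i.e. -1^K with K = 2.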
  __ align(OptoLoopAlignment);

  __ BIND(L_store_element);
  bs->copy_store_at(_masm,
                    decorators,
                    type,
                    element_size,
                    to_element_addr,
                    rax_oop,
                    r10);
  __ increment(count); // increment the count toward zero
  __ jcc(Assembler::zero, L_do_card_marks);

  // ======== loop entry is here ========
  __ BIND(L_load_element);
  bs->copy_load_at(_masm,
                   decorators,
                   type,
                   element_size,
                   rax_oop,
                   from_element_addr,
                   r10);
  __ testptr(rax_oop, rax_oop);
  __ jcc(Assembler::zero, L_store_element);

  __ load_klass(r11_klass, rax_oop, rscratch1); // query the object klass
  generate_type_check(r11_klass, ckoff, ckval, L_store_element);
  // ======== end loop ========

  // It was a real error; we must depend on the caller to finish the job.
  // Register rdx = -1 * number of *remaining* oops, r14 = *total* oops.
  // Emit GC store barriers for the oops we have copied (r14 + rdx),
  // and report their number to the caller.
  assert_different_registers(rax, r14_length, count, to, end_to, rcx, rscratch1);
  Label L_post_barrier;
  __ addptr(r14_length, count); // K = (original - remaining) oops
  __ movptr(rax, r14_length);   // save the value
  __ notptr(rax);               // report (-1^K) to caller (does not affect flags)
  __ jccb(Assembler::notZero, L_post_barrier);
  __ jmp(L_done); // K == 0, nothing was copied, skip post barrier

  // Come here on success only.
  __ BIND(L_do_card_marks);
  __ xorptr(rax, rax); // return 0 on success

  __ BIND(L_post_barrier);
  bs->arraycopy_epilogue(_masm, decorators, type, from, to, r14_length);

  // Common exit point (success or failure).
  __ BIND(L_done);
  __ movptr(r13, Address(rsp, saved_r13_offset * wordSize));
  __ movptr(r14, Address(rsp, saved_r14_offset * wordSize));
  __ movptr(r10, Address(rsp, saved_r10_offset * wordSize));
  restore_arg_regs_using_thread();
  INC_COUNTER_NP(SharedRuntime::_checkcast_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
  __ leave(); // required for proper stackwalking of RuntimeStub frame
  __ ret(0);

  return start;
}


// Generate 'unsafe' array copy stub
// Though just as safe as the other stubs, it takes an unscaled
// size_t argument instead of an element count.
//
//  Input:
//    c_rarg0   - source array address
//    c_rarg1   - destination array address
//    c_rarg2   - byte count, treated as ssize_t, can be zero
//
// Examines the alignment of the operands and dispatches
// to a long, int, short, or byte copy loop.
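//
// For example (illustrative values only): from = 0x1000, to = 0x2010 and
// size = 24 leave the low three bits of from|to|size clear, so control
// reaches L_long_aligned and 24 >> 3 = 3 qwords are copied; with
// size = 20 instead, bit 2 of the OR is set and the int copy loop is
// chosen, copying 20 >> 2 = 5 ints.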
//
address StubGenerator::generate_unsafe_copy(const char *name,
                                            address byte_copy_entry, address short_copy_entry,
                                            address int_copy_entry, address long_copy_entry) {

  Label L_long_aligned, L_int_aligned, L_short_aligned;

  // Input registers (before setup_arg_regs)
  const Register from = c_rarg0; // source array address
  const Register to   = c_rarg1; // destination array address
  const Register size = c_rarg2; // byte count (size_t)

  // Register used as a temp
  const Register bits = rax; // test copy of low bits

  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, "StubRoutines", name);
  address start = __ pc();

  __ enter(); // required for proper stackwalking of RuntimeStub frame

  // bump this on entry, not on exit:
  INC_COUNTER_NP(SharedRuntime::_unsafe_array_copy_ctr, rscratch1);

  __ mov(bits, from);
  __ orptr(bits, to);
  __ orptr(bits, size);

  __ testb(bits, BytesPerLong-1);
  __ jccb(Assembler::zero, L_long_aligned);

  __ testb(bits, BytesPerInt-1);
  __ jccb(Assembler::zero, L_int_aligned);

  __ testb(bits, BytesPerShort-1);
  __ jump_cc(Assembler::notZero, RuntimeAddress(byte_copy_entry));

  __ BIND(L_short_aligned);
  __ shrptr(size, LogBytesPerShort); // size => short_count
  __ jump(RuntimeAddress(short_copy_entry));

  __ BIND(L_int_aligned);
  __ shrptr(size, LogBytesPerInt); // size => int_count
  __ jump(RuntimeAddress(int_copy_entry));

  __ BIND(L_long_aligned);
  __ shrptr(size, LogBytesPerLong); // size => qword_count
  __ jump(RuntimeAddress(long_copy_entry));

  return start;
}


// Perform range checks on the proposed arraycopy.
// Kills temp, but nothing else.
// Also, clean the sign bits of src_pos and dst_pos.
void StubGenerator::arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
                                           Register src_pos, // source position (c_rarg1)
                                           Register dst,     // destination array oop (c_rarg2)
                                           Register dst_pos, // destination position (c_rarg3)
                                           Register length,
                                           Register temp,
                                           Label& L_failed) {
  BLOCK_COMMENT("arraycopy_range_checks:");

  //  if (src_pos + length > arrayOop(src)->length())  FAIL;
  __ movl(temp, length);
  __ addl(temp, src_pos); // src_pos + length
  __ cmpl(temp, Address(src, arrayOopDesc::length_offset_in_bytes()));
  __ jcc(Assembler::above, L_failed);

  //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
  __ movl(temp, length);
  __ addl(temp, dst_pos); // dst_pos + length
  __ cmpl(temp, Address(dst, arrayOopDesc::length_offset_in_bytes()));
  __ jcc(Assembler::above, L_failed);

  // Have to clean up high 32-bits of 'src_pos' and 'dst_pos'.
  // Move with sign extension can be used since they are positive.
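  // (Each position and the length are at most 2^31 - 1 after the callers'
  //  sign checks, so the 32-bit sums above cannot wrap as unsigned values
  //  and the unsigned 'above' compares are safe; for the same reason the
  //  sign extensions below are effectively zero extensions.)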
  __ movslq(src_pos, src_pos);
  __ movslq(dst_pos, dst_pos);

  BLOCK_COMMENT("arraycopy_range_checks done");
}


// Generate generic array copy stubs
//
//  Input:
//    c_rarg0    -  src oop
//    c_rarg1    -  src_pos (32-bits)
//    c_rarg2    -  dst oop
//    c_rarg3    -  dst_pos (32-bits)
// not Win64
//    c_rarg4    -  element count (32-bits)
// Win64
//    rsp+40     -  element count (32-bits)
//
//  Output:
//    rax ==  0  -  success
//    rax == -1^K - failure, where K is partial transfer count
//
address StubGenerator::generate_generic_copy(const char *name,
                                             address byte_copy_entry, address short_copy_entry,
                                             address int_copy_entry, address oop_copy_entry,
                                             address long_copy_entry, address checkcast_copy_entry) {

  Label L_failed, L_failed_0, L_objArray;
  Label L_copy_shorts, L_copy_ints, L_copy_longs;

  // Input registers
  const Register src     = c_rarg0; // source array oop
  const Register src_pos = c_rarg1; // source position
  const Register dst     = c_rarg2; // destination array oop
  const Register dst_pos = c_rarg3; // destination position
#ifndef _WIN64
  const Register length     = c_rarg4;
  const Register rklass_tmp = r9;   // load_klass
#else
  const Address  length(rsp, 7 * wordSize); // elements count is on stack on Win64
  const Register rklass_tmp = rdi;  // load_klass
#endif

  { int modulus = CodeEntryAlignment;
    int target  = modulus - 5; // 5 = sizeof jmp(L_failed)
    int advance = target - (__ offset() % modulus);
    if (advance < 0)  advance += modulus;
    if (advance > 0)  __ nop(advance);
  }
  StubCodeMark mark(this, "StubRoutines", name);

  // Short-hop target to L_failed.  Makes for denser prologue code.
  __ BIND(L_failed_0);
  __ jmp(L_failed);
  assert(__ offset() % CodeEntryAlignment == 0, "no further alignment needed");

  __ align(CodeEntryAlignment);
  address start = __ pc();

  __ enter(); // required for proper stackwalking of RuntimeStub frame

#ifdef _WIN64
  __ push(rklass_tmp); // rdi is callee-save on Windows
#endif

  // bump this on entry, not on exit:
  INC_COUNTER_NP(SharedRuntime::_generic_array_copy_ctr, rscratch1);

  //-----------------------------------------------------------------------
  // Assembler stub will be used for this call to arraycopy
  // if the following conditions are met:
  //
  // (1) src and dst must not be null.
  // (2) src_pos must not be negative.
  // (3) dst_pos must not be negative.
  // (4) length  must not be negative.
  // (5) src klass and dst klass should be the same and not null.
  // (6) src and dst should be arrays.
  // (7) src_pos + length must not exceed length of src.
  // (8) dst_pos + length must not exceed length of dst.
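  //
  // If any of these checks fails, the stub returns -1 without touching
  // either array, and the caller is expected to retry via the slower,
  // fully checked arraycopy path, which also raises any needed exception.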
  //

  //  if (src == nullptr) return -1;
  __ testptr(src, src); // src oop
  size_t j1off = __ offset();
  __ jccb(Assembler::zero, L_failed_0);

  //  if (src_pos < 0) return -1;
  __ testl(src_pos, src_pos); // src_pos (32-bits)
  __ jccb(Assembler::negative, L_failed_0);

  //  if (dst == nullptr) return -1;
  __ testptr(dst, dst); // dst oop
  __ jccb(Assembler::zero, L_failed_0);

  //  if (dst_pos < 0) return -1;
  __ testl(dst_pos, dst_pos); // dst_pos (32-bits)
  size_t j4off = __ offset();
  __ jccb(Assembler::negative, L_failed_0);

  // The first four tests are very dense code,
  // but not quite dense enough to put four
  // jumps in a 16-byte instruction fetch buffer.
  // That's good, because some branch predictors
  // do not like jumps so close together.
  // Make sure of this.
  guarantee(((j1off ^ j4off) & ~15) != 0, "I$ line of 1st & 4th jumps");

  // registers used as temp
  const Register r11_length    = r11; // elements count to copy
  const Register r10_src_klass = r10; // array klass

  //  if (length < 0) return -1;
  __ movl(r11_length, length); // length (elements count, 32-bits value)
  __ testl(r11_length, r11_length);
  __ jccb(Assembler::negative, L_failed_0);

  __ load_klass(r10_src_klass, src, rklass_tmp);
#ifdef ASSERT
  //  assert(src->klass() != nullptr);
  {
    BLOCK_COMMENT("assert klasses not null {");
    Label L1, L2;
    __ testptr(r10_src_klass, r10_src_klass);
    __ jcc(Assembler::notZero, L2); // it is broken if klass is null
    __ bind(L1);
    __ stop("broken null klass");
    __ bind(L2);
    __ load_klass(rax, dst, rklass_tmp);
    __ cmpq(rax, 0);
    __ jcc(Assembler::equal, L1); // this would be broken also
    BLOCK_COMMENT("} assert klasses not null done");
  }
#endif

  // Load layout helper (32-bits)
  //
  //  |array_tag|     | header_size | element_type |     |log2_element_size|
  // 32        30    24            16              8     2                 0
  //
  //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
  //

  const int lh_offset = in_bytes(Klass::layout_helper_offset());

  // Handle objArrays completely differently...
  const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
  __ cmpl(Address(r10_src_klass, lh_offset), objArray_lh);
  __ jcc(Assembler::equal, L_objArray);

  //  if (src->klass() != dst->klass()) return -1;
  __ load_klass(rax, dst, rklass_tmp);
  __ cmpq(r10_src_klass, rax);
  __ jcc(Assembler::notEqual, L_failed);

  const Register rax_lh = rax; // layout helper
  __ movl(rax_lh, Address(r10_src_klass, lh_offset));

  //  if (!src->is_Array()) return -1;
  __ cmpl(rax_lh, Klass::_lh_neutral_value);
  __ jcc(Assembler::greaterEqual, L_failed);

  // At this point, it is known to be a typeArray (array_tag 0x3).
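  // Illustrative decode of the fields used below (shape only, per the
  // bit diagram above): (lh >> _lh_header_size_shift) & _lh_header_size_mask
  // gives the byte offset of element 0, and lh & _lh_log2_element_size_mask
  // gives log2 of the element size, e.g. 2 for a jint[].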
#ifdef ASSERT
  {
    BLOCK_COMMENT("assert primitive array {");
    Label L;
    __ cmpl(rax_lh, (Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift));
    __ jcc(Assembler::greaterEqual, L);
    __ stop("must be a primitive array");
    __ bind(L);
    BLOCK_COMMENT("} assert primitive array done");
  }
#endif

  arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
                         r10, L_failed);

  // TypeArrayKlass
  //
  // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
  // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
  //

  const Register r10_offset = r10;    // array offset
  const Register rax_elsize = rax_lh; // element size

  __ movl(r10_offset, rax_lh);
  __ shrl(r10_offset, Klass::_lh_header_size_shift);
  __ andptr(r10_offset, Klass::_lh_header_size_mask); // array_offset
  __ addptr(src, r10_offset);                         // src array offset
  __ addptr(dst, r10_offset);                         // dst array offset
  BLOCK_COMMENT("choose copy loop based on element size");
  __ andl(rax_lh, Klass::_lh_log2_element_size_mask); // rax_lh -> rax_elsize

#ifdef _WIN64
  __ pop(rklass_tmp); // Restore callee-save rdi
#endif

  // The next registers should be set before the jump to the corresponding stub.
  const Register from  = c_rarg0; // source array address
  const Register to    = c_rarg1; // destination array address
  const Register count = c_rarg2; // elements count

  // 'from', 'to' and 'count' must be set in this order, since they are
  // the same registers as 'src', 'src_pos' and 'dst'.

  __ cmpl(rax_elsize, 0);
  __ jccb(Assembler::notEqual, L_copy_shorts);
  __ lea(from, Address(src, src_pos, Address::times_1, 0)); // src_addr
  __ lea(to,   Address(dst, dst_pos, Address::times_1, 0)); // dst_addr
  __ movl2ptr(count, r11_length);                           // length
  __ jump(RuntimeAddress(byte_copy_entry));

  __ BIND(L_copy_shorts);
  __ cmpl(rax_elsize, LogBytesPerShort);
  __ jccb(Assembler::notEqual, L_copy_ints);
  __ lea(from, Address(src, src_pos, Address::times_2, 0)); // src_addr
  __ lea(to,   Address(dst, dst_pos, Address::times_2, 0)); // dst_addr
  __ movl2ptr(count, r11_length);                           // length
  __ jump(RuntimeAddress(short_copy_entry));

  __ BIND(L_copy_ints);
  __ cmpl(rax_elsize, LogBytesPerInt);
  __ jccb(Assembler::notEqual, L_copy_longs);
  __ lea(from, Address(src, src_pos, Address::times_4, 0)); // src_addr
  __ lea(to,   Address(dst, dst_pos, Address::times_4, 0)); // dst_addr
  __ movl2ptr(count, r11_length);                           // length
  __ jump(RuntimeAddress(int_copy_entry));

  __ BIND(L_copy_longs);
#ifdef ASSERT
  {
    BLOCK_COMMENT("assert long copy {");
    Label L;
    __ cmpl(rax_elsize, LogBytesPerLong);
    __ jcc(Assembler::equal, L);
    __ stop("must be long copy, but elsize is wrong");
    __ bind(L);
    BLOCK_COMMENT("} assert long copy done");
  }
#endif
  __ lea(from, Address(src, src_pos, Address::times_8, 0)); // src_addr
  __ lea(to,   Address(dst, dst_pos, Address::times_8, 0)); // dst_addr
  __ movl2ptr(count, r11_length);                           // length
  __ jump(RuntimeAddress(long_copy_entry));

  // ObjArrayKlass
  __ BIND(L_objArray);
  // live at this point:  r10_src_klass, r11_length, src[_pos], dst[_pos]

  Label L_plain_copy, L_checkcast_copy;
  //  test array classes for subtyping
  __ load_klass(rax, dst, rklass_tmp);
  __ cmpq(r10_src_klass, rax); // usual case is exact equality
  __ jcc(Assembler::notEqual, L_checkcast_copy);

  // Identically typed arrays can be copied without element-wise checks.
  arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
                         r10, L_failed);

  __ lea(from, Address(src, src_pos, TIMES_OOP,
                       arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // src_addr
  __ lea(to,   Address(dst, dst_pos, TIMES_OOP,
                       arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // dst_addr
  __ movl2ptr(count, r11_length); // length
  __ BIND(L_plain_copy);
#ifdef _WIN64
  __ pop(rklass_tmp); // Restore callee-save rdi
#endif
  __ jump(RuntimeAddress(oop_copy_entry));

  __ BIND(L_checkcast_copy);
  // live at this point:  r10_src_klass, r11_length, rax (dst_klass)
  {
    // Before looking at dst.length, make sure dst is also an objArray.
    __ cmpl(Address(rax, lh_offset), objArray_lh);
    __ jcc(Assembler::notEqual, L_failed);

    // It is safe to examine both src.length and dst.length.
    arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
                           rax, L_failed);

    const Register r11_dst_klass = r11;
    __ load_klass(r11_dst_klass, dst, rklass_tmp); // reload

    // Marshal the base address arguments now, freeing registers.
    __ lea(from, Address(src, src_pos, TIMES_OOP,
                         arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
    __ lea(to,   Address(dst, dst_pos, TIMES_OOP,
                         arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
    __ movl(count, length); // length (reloaded)
    Register sco_temp = c_rarg3; // this register is free now
    assert_different_registers(from, to, count, sco_temp,
                               r11_dst_klass, r10_src_klass);
    assert_clean_int(count, sco_temp);

    // Generate the type check.
    const int sco_offset = in_bytes(Klass::super_check_offset_offset());
    __ movl(sco_temp, Address(r11_dst_klass, sco_offset));
    assert_clean_int(sco_temp, rax);
    generate_type_check(r10_src_klass, sco_temp, r11_dst_klass, L_plain_copy);

    // Fetch destination element klass from the ObjArrayKlass header.
    int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
    __ movptr(r11_dst_klass, Address(r11_dst_klass, ek_offset));
    __ movl(  sco_temp,      Address(r11_dst_klass, sco_offset));
    assert_clean_int(sco_temp, rax);

#ifdef _WIN64
    __ pop(rklass_tmp); // Restore callee-save rdi
#endif

    // the checkcast_copy loop needs two extra arguments:
    assert(c_rarg3 == sco_temp, "#3 already in place");
    // Set up arguments for checkcast_copy_entry.
    setup_arg_regs_using_thread(4);
    __ movptr(r8, r11_dst_klass); // dst.klass.element_klass, r8 is c_rarg4 on Linux/Solaris
    __ jump(RuntimeAddress(checkcast_copy_entry));
  }

  __ BIND(L_failed);
#ifdef _WIN64
  __ pop(rklass_tmp); // Restore callee-save rdi
#endif
  __ xorptr(rax, rax);
  __ notptr(rax); // return -1
  __ leave();     // required for proper stackwalking of RuntimeStub frame
  __ ret(0);

  return start;
}

#undef __