/*
 * Copyright (c) 2003, 2025, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "asm/macroAssembler.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "oops/objArrayKlass.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubRoutines.hpp"
#include "stubGenerator_x86_64.hpp"
#ifdef COMPILER2
#include "opto/c2_globals.hpp"
#endif
#if INCLUDE_JVMCI
#include "jvmci/jvmci_globals.hpp"
#endif

#define __ _masm->

#define TIMES_OOP (UseCompressedOops ? Address::times_4 : Address::times_8)

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) __ block_comment(str)
#endif // PRODUCT

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

#ifdef PRODUCT
#define INC_COUNTER_NP(counter, rscratch) ((void)0)
#else
#define INC_COUNTER_NP(counter, rscratch) \
  BLOCK_COMMENT("inc_counter " #counter); \
  inc_counter_np(_masm, counter, rscratch);

static void inc_counter_np(MacroAssembler* _masm, uint& counter, Register rscratch) {
  __ incrementl(ExternalAddress((address)&counter), rscratch);
}

#if COMPILER2_OR_JVMCI
static uint& get_profile_ctr(int shift) {
  if (shift == 0) {
    return SharedRuntime::_jbyte_array_copy_ctr;
  } else if (shift == 1) {
    return SharedRuntime::_jshort_array_copy_ctr;
  } else if (shift == 2) {
    return SharedRuntime::_jint_array_copy_ctr;
  } else {
    assert(shift == 3, "");
    return SharedRuntime::_jlong_array_copy_ctr;
  }
}
#endif // COMPILER2_OR_JVMCI
#endif // !PRODUCT

void StubGenerator::generate_arraycopy_stubs() {
  address entry;
  address entry_jbyte_arraycopy;
  address entry_jshort_arraycopy;
  address entry_jint_arraycopy;
  address entry_oop_arraycopy;
  address entry_jlong_arraycopy;
  address entry_checkcast_arraycopy;
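
  // Note: each disjoint stub also publishes, via 'entry', a no-overlap entry
  // point that skips the overlap test; the matching conjoint stub jumps there
  // from array_overlap_test when the source and destination ranges do not
  // overlap.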
  StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(&entry);
  StubRoutines::_jbyte_arraycopy = generate_conjoint_byte_copy(entry, &entry_jbyte_arraycopy);

  StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(&entry);
  StubRoutines::_jshort_arraycopy = generate_conjoint_short_copy(entry, &entry_jshort_arraycopy);

  StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_int_oop_copy(StubGenStubId::jint_disjoint_arraycopy_id, &entry);
  StubRoutines::_jint_arraycopy = generate_conjoint_int_oop_copy(StubGenStubId::jint_arraycopy_id, entry,
                                                                 &entry_jint_arraycopy);

  StubRoutines::_jlong_disjoint_arraycopy = generate_disjoint_long_oop_copy(StubGenStubId::jlong_disjoint_arraycopy_id, &entry);
  StubRoutines::_jlong_arraycopy = generate_conjoint_long_oop_copy(StubGenStubId::jlong_arraycopy_id, entry, &entry_jlong_arraycopy);
  if (UseCompressedOops) {
    StubRoutines::_oop_disjoint_arraycopy = generate_disjoint_int_oop_copy(StubGenStubId::oop_disjoint_arraycopy_id, &entry);
    StubRoutines::_oop_arraycopy = generate_conjoint_int_oop_copy(StubGenStubId::oop_arraycopy_id, entry, &entry_oop_arraycopy);
    StubRoutines::_oop_disjoint_arraycopy_uninit = generate_disjoint_int_oop_copy(StubGenStubId::oop_disjoint_arraycopy_uninit_id, &entry);
    StubRoutines::_oop_arraycopy_uninit = generate_conjoint_int_oop_copy(StubGenStubId::oop_arraycopy_uninit_id, entry, nullptr);
  } else {
    StubRoutines::_oop_disjoint_arraycopy = generate_disjoint_long_oop_copy(StubGenStubId::oop_disjoint_arraycopy_id, &entry);
    StubRoutines::_oop_arraycopy = generate_conjoint_long_oop_copy(StubGenStubId::oop_arraycopy_id, entry, &entry_oop_arraycopy);
    StubRoutines::_oop_disjoint_arraycopy_uninit = generate_disjoint_long_oop_copy(StubGenStubId::oop_disjoint_arraycopy_uninit_id, &entry);
    StubRoutines::_oop_arraycopy_uninit = generate_conjoint_long_oop_copy(StubGenStubId::oop_arraycopy_uninit_id, entry, nullptr);
  }

  StubRoutines::_checkcast_arraycopy = generate_checkcast_copy(StubGenStubId::checkcast_arraycopy_id, &entry_checkcast_arraycopy);
  StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy(StubGenStubId::checkcast_arraycopy_uninit_id, nullptr);

  StubRoutines::_unsafe_arraycopy = generate_unsafe_copy(entry_jbyte_arraycopy,
                                                         entry_jshort_arraycopy,
                                                         entry_jint_arraycopy,
                                                         entry_jlong_arraycopy);
  StubRoutines::_generic_arraycopy = generate_generic_copy(entry_jbyte_arraycopy,
                                                           entry_jshort_arraycopy,
                                                           entry_jint_arraycopy,
                                                           entry_oop_arraycopy,
                                                           entry_jlong_arraycopy,
                                                           entry_checkcast_arraycopy);

  StubRoutines::_jbyte_fill = generate_fill(StubGenStubId::jbyte_fill_id);
  StubRoutines::_jshort_fill = generate_fill(StubGenStubId::jshort_fill_id);
  StubRoutines::_jint_fill = generate_fill(StubGenStubId::jint_fill_id);
  StubRoutines::_arrayof_jbyte_fill = generate_fill(StubGenStubId::arrayof_jbyte_fill_id);
  StubRoutines::_arrayof_jshort_fill = generate_fill(StubGenStubId::arrayof_jshort_fill_id);
  StubRoutines::_arrayof_jint_fill = generate_fill(StubGenStubId::arrayof_jint_fill_id);

  StubRoutines::_unsafe_setmemory = generate_unsafe_setmemory(StubRoutines::_jbyte_fill);

  // We don't generate specialized code for HeapWord-aligned source
  // arrays, so just use the code we've already generated
  StubRoutines::_arrayof_jbyte_disjoint_arraycopy = StubRoutines::_jbyte_disjoint_arraycopy;
  StubRoutines::_arrayof_jbyte_arraycopy = StubRoutines::_jbyte_arraycopy;

  StubRoutines::_arrayof_jshort_disjoint_arraycopy = StubRoutines::_jshort_disjoint_arraycopy;
  StubRoutines::_arrayof_jshort_arraycopy = StubRoutines::_jshort_arraycopy;

  StubRoutines::_arrayof_jint_disjoint_arraycopy = StubRoutines::_jint_disjoint_arraycopy;
  StubRoutines::_arrayof_jint_arraycopy = StubRoutines::_jint_arraycopy;

  StubRoutines::_arrayof_jlong_disjoint_arraycopy = StubRoutines::_jlong_disjoint_arraycopy;
  StubRoutines::_arrayof_jlong_arraycopy = StubRoutines::_jlong_arraycopy;

  StubRoutines::_arrayof_oop_disjoint_arraycopy = StubRoutines::_oop_disjoint_arraycopy;
  StubRoutines::_arrayof_oop_arraycopy = StubRoutines::_oop_arraycopy;

  StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit = StubRoutines::_oop_disjoint_arraycopy_uninit;
  StubRoutines::_arrayof_oop_arraycopy_uninit = StubRoutines::_oop_arraycopy_uninit;
}


// Verify that a register contains a clean 32-bit positive value
// (high 32 bits are 0) so it can be used in 64-bit shifts.
//
// Input:
//    Rint  - 32-bit value
//    Rtmp  - scratch
//
void StubGenerator::assert_clean_int(Register Rint, Register Rtmp) {
#ifdef ASSERT
  Label L;
  assert_different_registers(Rtmp, Rint);
  __ movslq(Rtmp, Rint);
  __ cmpq(Rtmp, Rint);
  __ jcc(Assembler::equal, L);
  __ stop("high 32-bits of int value are not 0");
  __ bind(L);
#endif
}


// Generate overlap test for array copy stubs
//
// Input:
//    c_rarg0 - from
//    c_rarg1 - to
//    c_rarg2 - element count
//
// Output:
//    rax     - &from[element count - 1]
//
void StubGenerator::array_overlap_test(address no_overlap_target, Label* NOLp, Address::ScaleFactor sf) {
  const Register from     = c_rarg0;
  const Register to       = c_rarg1;
  const Register count    = c_rarg2;
  const Register end_from = rax;

  __ cmpptr(to, from);
  __ lea(end_from, Address(from, count, sf, 0));
  if (NOLp == nullptr) {
    RuntimeAddress no_overlap(no_overlap_target);
    __ jump_cc(Assembler::belowEqual, no_overlap);
    __ cmpptr(to, end_from);
    __ jump_cc(Assembler::aboveEqual, no_overlap);
  } else {
    __ jcc(Assembler::belowEqual, (*NOLp));
    __ cmpptr(to, end_from);
    __ jcc(Assembler::aboveEqual, (*NOLp));
  }
}
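
// Note: the fall-through (overlapping) case above is taken only when
// from < to < from + count*elem_size, i.e. the destination starts inside the
// source range; callers then copy from the high addresses downward.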

// Copy big chunks forward
//
// Inputs:
//   end_from     - source array's end address
//   end_to       - destination array end address
//   qword_count  - 64-bit element count, negative
//   tmp1         - scratch
//   L_copy_bytes - entry label
//   L_copy_8_bytes  - exit label
//
void StubGenerator::copy_bytes_forward(Register end_from, Register end_to,
                                       Register qword_count, Register tmp1,
                                       Register tmp2, Label& L_copy_bytes,
                                       Label& L_copy_8_bytes, DecoratorSet decorators,
                                       BasicType type) {
  BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  DEBUG_ONLY(__ stop("enter at entry label, not here"));
  Label L_loop;
  __ align(OptoLoopAlignment);
  if (UseUnalignedLoadStores) {
    Label L_end;
    __ BIND(L_loop);
    if (UseAVX >= 2) {
      bs->copy_load_at(_masm, decorators, type, 32,
                       xmm0, Address(end_from, qword_count, Address::times_8, -56),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 32,
                        Address(end_to, qword_count, Address::times_8, -56), xmm0,
                        tmp1, tmp2, xmm1);

      bs->copy_load_at(_masm, decorators, type, 32,
                       xmm0, Address(end_from, qword_count, Address::times_8, -24),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 32,
                        Address(end_to, qword_count, Address::times_8, -24), xmm0,
                        tmp1, tmp2, xmm1);
    } else {
      bs->copy_load_at(_masm, decorators, type, 16,
                       xmm0, Address(end_from, qword_count, Address::times_8, -56),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 16,
                        Address(end_to, qword_count, Address::times_8, -56), xmm0,
                        tmp1, tmp2, xmm1);
      bs->copy_load_at(_masm, decorators, type, 16,
                       xmm0, Address(end_from, qword_count, Address::times_8, -40),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 16,
                        Address(end_to, qword_count, Address::times_8, -40), xmm0,
                        tmp1, tmp2, xmm1);
      bs->copy_load_at(_masm, decorators, type, 16,
                       xmm0, Address(end_from, qword_count, Address::times_8, -24),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 16,
                        Address(end_to, qword_count, Address::times_8, -24), xmm0,
                        tmp1, tmp2, xmm1);
      bs->copy_load_at(_masm, decorators, type, 16,
                       xmm0, Address(end_from, qword_count, Address::times_8, -8),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 16,
                        Address(end_to, qword_count, Address::times_8, -8), xmm0,
                        tmp1, tmp2, xmm1);
    }

    __ BIND(L_copy_bytes);
    __ addptr(qword_count, 8);
    __ jcc(Assembler::lessEqual, L_loop);
    __ subptr(qword_count, 4);  // sub(8) and add(4)
    __ jcc(Assembler::greater, L_end);
    // Copy trailing 32 bytes
    if (UseAVX >= 2) {
      bs->copy_load_at(_masm, decorators, type, 32,
                       xmm0, Address(end_from, qword_count, Address::times_8, -24),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 32,
                        Address(end_to, qword_count, Address::times_8, -24), xmm0,
                        tmp1, tmp2, xmm1);
    } else {
      bs->copy_load_at(_masm, decorators, type, 16,
                       xmm0, Address(end_from, qword_count, Address::times_8, -24),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 16,
                        Address(end_to, qword_count, Address::times_8, -24), xmm0,
                        tmp1, tmp2, xmm1);
      bs->copy_load_at(_masm, decorators, type, 16,
                       xmm0, Address(end_from, qword_count, Address::times_8, -8),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 16,
                        Address(end_to, qword_count, Address::times_8, -8), xmm0,
                        tmp1, tmp2, xmm1);
    }
    __ addptr(qword_count, 4);
    __ BIND(L_end);
  } else {
    // Copy 32 bytes per iteration
    __ BIND(L_loop);
    bs->copy_load_at(_masm, decorators, type, 8,
                     tmp1, Address(end_from, qword_count, Address::times_8, -24),
                     tmp2);
    bs->copy_store_at(_masm, decorators, type, 8,
                      Address(end_to, qword_count, Address::times_8, -24), tmp1,
                      tmp2);
    bs->copy_load_at(_masm, decorators, type, 8,
                     tmp1, Address(end_from, qword_count, Address::times_8, -16),
                     tmp2);
    bs->copy_store_at(_masm, decorators, type, 8,
                      Address(end_to, qword_count, Address::times_8, -16), tmp1,
                      tmp2);
    bs->copy_load_at(_masm, decorators, type, 8,
                     tmp1, Address(end_from, qword_count, Address::times_8, -8),
                     tmp2);
    bs->copy_store_at(_masm, decorators, type, 8,
                      Address(end_to, qword_count, Address::times_8, -8), tmp1,
                      tmp2);
    bs->copy_load_at(_masm, decorators, type, 8,
                     tmp1, Address(end_from, qword_count, Address::times_8, 0),
                     tmp2);
    bs->copy_store_at(_masm, decorators, type, 8,
                      Address(end_to, qword_count, Address::times_8, 0), tmp1,
                      tmp2);

    __ BIND(L_copy_bytes);
    __ addptr(qword_count, 4);
    __ jcc(Assembler::lessEqual, L_loop);
  }
  __ subptr(qword_count, 4);
  __ jcc(Assembler::less, L_copy_8_bytes);  // Copy trailing qwords
}
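
// Note: on entry to copy_bytes_forward, qword_count is negative and is stepped
// up toward zero; each main-loop iteration moves 64 bytes (8 qwords) when
// unaligned vector loads/stores are enabled, or 32 bytes (4 qwords) otherwise.
// Any remaining qwords are finished one at a time by the caller's
// L_copy_8_bytes loop.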

// Copy big chunks backward
//
// Inputs:
//   from         - source array address
//   dest         - destination array address
//   qword_count  - 64-bit element count
//   tmp1         - scratch
//   L_copy_bytes - entry label
//   L_copy_8_bytes  - exit label
//
void StubGenerator::copy_bytes_backward(Register from, Register dest,
                                        Register qword_count, Register tmp1,
                                        Register tmp2, Label& L_copy_bytes,
                                        Label& L_copy_8_bytes, DecoratorSet decorators,
                                        BasicType type) {
  BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  DEBUG_ONLY(__ stop("enter at entry label, not here"));
  Label L_loop;
  __ align(OptoLoopAlignment);
  if (UseUnalignedLoadStores) {
    Label L_end;
    __ BIND(L_loop);
    if (UseAVX >= 2) {
      bs->copy_load_at(_masm, decorators, type, 32,
                       xmm0, Address(from, qword_count, Address::times_8, 32),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 32,
                        Address(dest, qword_count, Address::times_8, 32), xmm0,
                        tmp1, tmp2, xmm1);
      bs->copy_load_at(_masm, decorators, type, 32,
                       xmm0, Address(from, qword_count, Address::times_8, 0),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 32,
                        Address(dest, qword_count, Address::times_8, 0), xmm0,
                        tmp1, tmp2, xmm1);
    } else {
      bs->copy_load_at(_masm, decorators, type, 16,
                       xmm0, Address(from, qword_count, Address::times_8, 48),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 16,
                        Address(dest, qword_count, Address::times_8, 48), xmm0,
                        tmp1, tmp2, xmm1);
      bs->copy_load_at(_masm, decorators, type, 16,
                       xmm0, Address(from, qword_count, Address::times_8, 32),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 16,
                        Address(dest, qword_count, Address::times_8, 32), xmm0,
                        tmp1, tmp2, xmm1);
      bs->copy_load_at(_masm, decorators, type, 16,
                       xmm0, Address(from, qword_count, Address::times_8, 16),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 16,
                        Address(dest, qword_count, Address::times_8, 16), xmm0,
                        tmp1, tmp2, xmm1);
      bs->copy_load_at(_masm, decorators, type, 16,
                       xmm0, Address(from, qword_count, Address::times_8, 0),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 16,
                        Address(dest, qword_count, Address::times_8, 0), xmm0,
                        tmp1, tmp2, xmm1);
    }

    __ BIND(L_copy_bytes);
    __ subptr(qword_count, 8);
    __ jcc(Assembler::greaterEqual, L_loop);

    __ addptr(qword_count, 4);  // add(8) and sub(4)
    __ jcc(Assembler::less, L_end);
    // Copy trailing 32 bytes
    if (UseAVX >= 2) {
      bs->copy_load_at(_masm, decorators, type, 32,
                       xmm0, Address(from, qword_count, Address::times_8, 0),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 32,
                        Address(dest, qword_count, Address::times_8, 0), xmm0,
                        tmp1, tmp2, xmm1);
    } else {
      bs->copy_load_at(_masm, decorators, type, 16,
                       xmm0, Address(from, qword_count, Address::times_8, 16),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 16,
                        Address(dest, qword_count, Address::times_8, 16), xmm0,
                        tmp1, tmp2, xmm1);
      bs->copy_load_at(_masm, decorators, type, 16,
                       xmm0, Address(from, qword_count, Address::times_8, 0),
                       tmp1, xmm1);
      bs->copy_store_at(_masm, decorators, type, 16,
                        Address(dest, qword_count, Address::times_8, 0), xmm0,
                        tmp1, tmp2, xmm1);
    }
    __ subptr(qword_count, 4);
    __ BIND(L_end);
  } else {
    // Copy 32 bytes per iteration
    __ BIND(L_loop);
    bs->copy_load_at(_masm, decorators, type, 8,
                     tmp1, Address(from, qword_count, Address::times_8, 24),
                     tmp2);
    bs->copy_store_at(_masm, decorators, type, 8,
                      Address(dest, qword_count, Address::times_8, 24), tmp1,
                      tmp2);
    bs->copy_load_at(_masm, decorators, type, 8,
                     tmp1, Address(from, qword_count, Address::times_8, 16),
                     tmp2);
    bs->copy_store_at(_masm, decorators, type, 8,
                      Address(dest, qword_count, Address::times_8, 16), tmp1,
                      tmp2);
    bs->copy_load_at(_masm, decorators, type, 8,
                     tmp1, Address(from, qword_count, Address::times_8, 8),
                     tmp2);
    bs->copy_store_at(_masm, decorators, type, 8,
                      Address(dest, qword_count, Address::times_8, 8), tmp1,
                      tmp2);
    bs->copy_load_at(_masm, decorators, type, 8,
                     tmp1, Address(from, qword_count, Address::times_8, 0),
                     tmp2);
    bs->copy_store_at(_masm, decorators, type, 8,
                      Address(dest, qword_count, Address::times_8, 0), tmp1,
                      tmp2);

    __ BIND(L_copy_bytes);
    __ subptr(qword_count, 4);
    __ jcc(Assembler::greaterEqual, L_loop);
  }
  __ addptr(qword_count, 4);
  __ jcc(Assembler::greater, L_copy_8_bytes);  // Copy trailing qwords
}

#if COMPILER2_OR_JVMCI

// Note: the following rules apply to the AVX3 optimized arraycopy stubs:
// - If the target supports AVX3 features (BW+VL+F), the implementation uses
//   32 byte vectors (YMMs) both for the special cases (various small block
//   sizes) and for the aligned copy loop. This is the default configuration.
// - If the copy length is above AVX3Threshold, the implementation uses 64 byte
//   vectors (ZMMs) for the main copy loop (and its tail), since the bulk of
//   the cycles are consumed there.
// - If the user forces MaxVectorSize=32, then above 4096 bytes REP MOVS shows
//   better performance for disjoint copies; for conjoint/backward copies the
//   vector based copy performs better.
// - If the user sets AVX3Threshold=0, the special cases for small block sizes
//   also operate on 64 byte vector registers (ZMMs).
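//
// In terms of the code below: copies of at most 192 bytes are finished in the
// masked special cases; larger copies go through the PRE-MAIN-POST loop
// (192 bytes per iteration) or, depending on AVX3Threshold and MaxVectorSize,
// the REP MOVS branch; and with MaxVectorSize == 64, copies of 2.5 MB or more
// take the arraycopy_avx3_large path with its non-temporal stores.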

// Inputs:
//   c_rarg0   - source array address
//   c_rarg1   - destination array address
//   c_rarg2   - element count, treated as ssize_t, can be zero
//
//
// Side Effects:
//   *entry is set to the no-overlap entry point
//   used by generate_conjoint_[byte/int/short/long]_copy().
//

address StubGenerator::generate_disjoint_copy_avx3_masked(StubGenStubId stub_id, address* entry) {
  // aligned is always false -- x86_64 always uses the unaligned code
  const bool aligned = false;
  int shift;
  bool is_oop;
  bool dest_uninitialized;

  switch (stub_id) {
  case jbyte_disjoint_arraycopy_id:
    shift = 0;
    is_oop = false;
    dest_uninitialized = false;
    break;
  case jshort_disjoint_arraycopy_id:
    shift = 1;
    is_oop = false;
    dest_uninitialized = false;
    break;
  case jint_disjoint_arraycopy_id:
    shift = 2;
    is_oop = false;
    dest_uninitialized = false;
    break;
  case jlong_disjoint_arraycopy_id:
    shift = 3;
    is_oop = false;
    dest_uninitialized = false;
    break;
  case oop_disjoint_arraycopy_id:
    shift = (UseCompressedOops ? 2 : 3);
    is_oop = true;
    dest_uninitialized = false;
    break;
  case oop_disjoint_arraycopy_uninit_id:
    shift = (UseCompressedOops ? 2 : 3);
    is_oop = true;
    dest_uninitialized = true;
    break;
  default:
    ShouldNotReachHere();
  }

  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, stub_id);
  address start = __ pc();

  int avx3threshold = VM_Version::avx3_threshold();
  bool use64byteVector = (MaxVectorSize > 32) && (avx3threshold == 0);
  const int large_threshold = 2621440; // 2.5 MB
  Label L_main_loop, L_main_loop_64bytes, L_tail, L_tail64, L_exit, L_entry;
  Label L_repmovs, L_main_pre_loop, L_main_pre_loop_64bytes, L_pre_main_post_64;
  Label L_copy_large, L_finish;
  const Register from  = rdi;  // source array address
  const Register to    = rsi;  // destination array address
  const Register count = rdx;  // elements count
  const Register temp1 = r8;
  const Register temp2 = r11;
  const Register temp3 = rax;
  const Register temp4 = rcx;
  // End pointers are inclusive, and if count is not zero they point
  // to the last unit copied: end_to[0] := end_from[0]

  __ enter(); // required for proper stackwalking of RuntimeStub frame
  assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int.

  if (entry != nullptr) {
    *entry = __ pc();
    // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
    BLOCK_COMMENT("Entry:");
  }

  BasicType type_vec[] = { T_BYTE, T_SHORT, T_INT, T_LONG};
  BasicType type = is_oop ? T_OBJECT : type_vec[shift];

  setup_argument_regs(type);

  DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
  if (dest_uninitialized) {
    decorators |= IS_DEST_UNINITIALIZED;
  }
  if (aligned) {
    decorators |= ARRAYCOPY_ALIGNED;
  }
  BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
  bs->arraycopy_prologue(_masm, decorators, type, from, to, count);

  {
    // Type(shift)      byte(0), short(1), int(2),   long(3)
    int loop_size[]   = { 192,     96,       48,       24};
    int threshold[]   = { 4096,    2048,     1024,     512};

    // UnsafeMemoryAccess page error: continue after unsafe access
    UnsafeMemoryAccessMark umam(this, !is_oop && !aligned, true);
    // 'from', 'to' and 'count' are now valid

    // temp1 holds remaining count and temp4 holds running count used to compute
    // next address offset for start of to/from addresses (temp4 * scale).
    __ mov64(temp4, 0);
    __ movq(temp1, count);

    // Zero length check.
    __ BIND(L_tail);
    __ cmpq(temp1, 0);
    __ jcc(Assembler::lessEqual, L_exit);

    // Special cases using 32 byte [masked] vector copy operations.
    arraycopy_avx3_special_cases(xmm1, k2, from, to, temp1, shift,
                                 temp4, temp3, use64byteVector, L_entry, L_exit);

    // PRE-MAIN-POST loop for aligned copy.
    __ BIND(L_entry);

    if (MaxVectorSize == 64) {
      __ movq(temp2, temp1);
      __ shlq(temp2, shift);
      __ cmpq(temp2, large_threshold);
      __ jcc(Assembler::greaterEqual, L_copy_large);
    }
    if (avx3threshold != 0) {
      __ cmpq(count, threshold[shift]);
      if (MaxVectorSize == 64) {
        // Copy using 64 byte vectors.
        __ jcc(Assembler::greaterEqual, L_pre_main_post_64);
      } else {
        assert(MaxVectorSize < 64, "vector size should be < 64 bytes");
        // REP MOVS offers a faster copy path.
        __ jcc(Assembler::greaterEqual, L_repmovs);
      }
    }

    if ((MaxVectorSize < 64) || (avx3threshold != 0)) {
      // Partial copy to make dst address 32 byte aligned.
      __ movq(temp2, to);
      __ andq(temp2, 31);
      __ jcc(Assembler::equal, L_main_pre_loop);

      __ negptr(temp2);
      __ addq(temp2, 32);
      if (shift) {
        __ shrq(temp2, shift);
      }
      __ movq(temp3, temp2);
      copy32_masked_avx(to, from, xmm1, k2, temp3, temp4, temp1, shift);
      __ movq(temp4, temp2);
      __ movq(temp1, count);
      __ subq(temp1, temp2);

      __ cmpq(temp1, loop_size[shift]);
      __ jcc(Assembler::less, L_tail);

      __ BIND(L_main_pre_loop);
      __ subq(temp1, loop_size[shift]);

      // Main loop with aligned copy block size of 192 bytes at 32 byte granularity.
      __ align32();
      __ BIND(L_main_loop);
      copy64_avx(to, from, temp4, xmm1, false, shift, 0);
      copy64_avx(to, from, temp4, xmm1, false, shift, 64);
      copy64_avx(to, from, temp4, xmm1, false, shift, 128);
      __ addptr(temp4, loop_size[shift]);
      __ subq(temp1, loop_size[shift]);
      __ jcc(Assembler::greater, L_main_loop);

      __ addq(temp1, loop_size[shift]);

      // Tail loop.
      __ jmp(L_tail);

      __ BIND(L_repmovs);
      __ movq(temp2, temp1);
      // Swap to(RSI) and from(RDI) addresses to comply with REP MOVS semantics.
      __ movq(temp3, to);
      __ movq(to, from);
      __ movq(from, temp3);
      // Save to/from for restoration post rep_mov.
      __ movq(temp1, to);
      __ movq(temp3, from);
      if (shift < 3) {
        __ shrq(temp2, 3 - shift);  // quad word count
      }
      __ movq(temp4, temp2);        // move quad word count into temp4 (RCX).
      __ rep_mov();
      __ shlq(temp2, 3);            // convert quad words into byte count.
      if (shift) {
        __ shrq(temp2, shift);      // type specific count.
      }
      // Restore original addresses in to/from.
      __ movq(to, temp3);
      __ movq(from, temp1);
      __ movq(temp4, temp2);
      __ movq(temp1, count);
      __ subq(temp1, temp2);        // trailing part (less than a quad word size).
      __ jmp(L_tail);
    }

    if (MaxVectorSize > 32) {
      __ BIND(L_pre_main_post_64);
      // Partial copy to make dst address 64 byte aligned.
      __ movq(temp2, to);
      __ andq(temp2, 63);
      __ jcc(Assembler::equal, L_main_pre_loop_64bytes);

      __ negptr(temp2);
      __ addq(temp2, 64);
      if (shift) {
        __ shrq(temp2, shift);
      }
      __ movq(temp3, temp2);
      copy64_masked_avx(to, from, xmm1, k2, temp3, temp4, temp1, shift, 0, true);
      __ movq(temp4, temp2);
      __ movq(temp1, count);
      __ subq(temp1, temp2);

      __ cmpq(temp1, loop_size[shift]);
      __ jcc(Assembler::less, L_tail64);

      __ BIND(L_main_pre_loop_64bytes);
      __ subq(temp1, loop_size[shift]);

      // Main loop with aligned copy block size of 192 bytes at
      // 64 byte copy granularity.
      __ align32();
      __ BIND(L_main_loop_64bytes);
      copy64_avx(to, from, temp4, xmm1, false, shift, 0, true);
      copy64_avx(to, from, temp4, xmm1, false, shift, 64, true);
      copy64_avx(to, from, temp4, xmm1, false, shift, 128, true);
      __ addptr(temp4, loop_size[shift]);
      __ subq(temp1, loop_size[shift]);
      __ jcc(Assembler::greater, L_main_loop_64bytes);

      __ addq(temp1, loop_size[shift]);
      // Zero length check.
      __ jcc(Assembler::lessEqual, L_exit);

      __ BIND(L_tail64);

      // Tail handling using 64 byte [masked] vector copy operations.
      use64byteVector = true;
      arraycopy_avx3_special_cases(xmm1, k2, from, to, temp1, shift,
                                   temp4, temp3, use64byteVector, L_entry, L_exit);
    }
    __ BIND(L_exit);
  }

  __ BIND(L_finish);
  address ucme_exit_pc = __ pc();
  // When called from generic_arraycopy, r11 contains specific values used
  // during the arraycopy epilogue, so re-initialize r11 here.
  if (is_oop) {
    __ movq(r11, shift == 3 ? count : to);
  }
  bs->arraycopy_epilogue(_masm, decorators, type, from, to, count);
  restore_argument_regs(type);
  INC_COUNTER_NP(get_profile_ctr(shift), rscratch1); // Update counter after rscratch1 is free
  __ xorptr(rax, rax); // return 0
  __ vzeroupper();
  __ leave(); // required for proper stackwalking of RuntimeStub frame
  __ ret(0);

  if (MaxVectorSize == 64) {
    __ BIND(L_copy_large);
    UnsafeMemoryAccessMark umam(this, !is_oop && !aligned, false, ucme_exit_pc);
    arraycopy_avx3_large(to, from, temp1, temp2, temp3, temp4, count, xmm1, xmm2, xmm3, xmm4, shift);
    __ jmp(L_finish);
  }
  return start;
}

void StubGenerator::arraycopy_avx3_large(Register to, Register from, Register temp1, Register temp2,
                                         Register temp3, Register temp4, Register count,
                                         XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3,
                                         XMMRegister xmm4, int shift) {

  // Type(shift)      byte(0), short(1), int(2),   long(3)
  int loop_size[]   = { 256,     128,      64,       32};
  int threshold[]   = { 4096,    2048,     1024,     512};

  Label L_main_loop_large;
  Label L_tail_large;
  Label L_exit_large;
  Label L_entry_large;
  Label L_main_pre_loop_large;
  Label L_pre_main_post_large;

  assert(MaxVectorSize == 64, "vector length != 64");
  __ BIND(L_entry_large);

  __ BIND(L_pre_main_post_large);
  // Partial copy to make dst address 64 byte aligned.
  __ movq(temp2, to);
  __ andq(temp2, 63);
  __ jcc(Assembler::equal, L_main_pre_loop_large);

  __ negptr(temp2);
  __ addq(temp2, 64);
  if (shift) {
    __ shrq(temp2, shift);
  }
  __ movq(temp3, temp2);
  copy64_masked_avx(to, from, xmm1, k2, temp3, temp4, temp1, shift, 0, true);
  __ movq(temp4, temp2);
  __ movq(temp1, count);
  __ subq(temp1, temp2);

  __ cmpq(temp1, loop_size[shift]);
  __ jcc(Assembler::less, L_tail_large);

  __ BIND(L_main_pre_loop_large);
  __ subq(temp1, loop_size[shift]);

  // Main loop with aligned copy block size of 256 bytes at 64 byte copy granularity.
  __ align32();
  __ BIND(L_main_loop_large);
  copy256_avx3(to, from, temp4, xmm1, xmm2, xmm3, xmm4, shift, 0);
  __ addptr(temp4, loop_size[shift]);
  __ subq(temp1, loop_size[shift]);
  __ jcc(Assembler::greater, L_main_loop_large);
  // fence needed because copy256_avx3 uses non-temporal stores
  __ sfence();

  __ addq(temp1, loop_size[shift]);
  // Zero length check.
  __ jcc(Assembler::lessEqual, L_exit_large);
  __ BIND(L_tail_large);
  // Tail handling using 64 byte [masked] vector copy operations.
  __ cmpq(temp1, 0);
  __ jcc(Assembler::lessEqual, L_exit_large);
  arraycopy_avx3_special_cases_256(xmm1, k2, from, to, temp1, shift,
                                   temp4, temp3, L_exit_large);
  __ BIND(L_exit_large);
}
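
// Note: arraycopy_avx3_large streams 256 bytes per iteration through
// copy256_avx3, prefetching well ahead of the loads and writing with
// non-temporal 64 byte stores; the sfence after the main loop orders those
// stores before the ordinary (cacheable) tail copy.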

// Inputs:
//   c_rarg0   - source array address
//   c_rarg1   - destination array address
//   c_rarg2   - element count, treated as ssize_t, can be zero
//
//
address StubGenerator::generate_conjoint_copy_avx3_masked(StubGenStubId stub_id, address* entry, address nooverlap_target) {
  // aligned is always false -- x86_64 always uses the unaligned code
  const bool aligned = false;
  int shift;
  bool is_oop;
  bool dest_uninitialized;

  switch (stub_id) {
  case jbyte_arraycopy_id:
    shift = 0;
    is_oop = false;
    dest_uninitialized = false;
    break;
  case jshort_arraycopy_id:
    shift = 1;
    is_oop = false;
    dest_uninitialized = false;
    break;
  case jint_arraycopy_id:
    shift = 2;
    is_oop = false;
    dest_uninitialized = false;
    break;
  case jlong_arraycopy_id:
    shift = 3;
    is_oop = false;
    dest_uninitialized = false;
    break;
  case oop_arraycopy_id:
    shift = (UseCompressedOops ? 2 : 3);
    is_oop = true;
    dest_uninitialized = false;
    break;
  case oop_arraycopy_uninit_id:
    shift = (UseCompressedOops ? 2 : 3);
    is_oop = true;
    dest_uninitialized = true;
    break;
  default:
    ShouldNotReachHere();
  }

  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, stub_id);
  address start = __ pc();

  int avx3threshold = VM_Version::avx3_threshold();
  bool use64byteVector = (MaxVectorSize > 32) && (avx3threshold == 0);

  Label L_main_pre_loop, L_main_pre_loop_64bytes, L_pre_main_post_64;
  Label L_main_loop, L_main_loop_64bytes, L_tail, L_tail64, L_exit, L_entry;
  const Register from  = rdi;  // source array address
  const Register to    = rsi;  // destination array address
  const Register count = rdx;  // elements count
  const Register temp1 = r8;
  const Register temp2 = rcx;
  const Register temp3 = r11;
  const Register temp4 = rax;
  // End pointers are inclusive, and if count is not zero they point
  // to the last unit copied: end_to[0] := end_from[0]

  __ enter(); // required for proper stackwalking of RuntimeStub frame
  assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int.

  if (entry != nullptr) {
    *entry = __ pc();
    // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
    BLOCK_COMMENT("Entry:");
  }

  array_overlap_test(nooverlap_target, (Address::ScaleFactor)(shift));
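  // If array_overlap_test falls through, the destination begins inside the
  // source range, so the copy below must run from the high addresses downward.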

  BasicType type_vec[] = { T_BYTE, T_SHORT, T_INT, T_LONG};
  BasicType type = is_oop ? T_OBJECT : type_vec[shift];

  setup_argument_regs(type);

  DecoratorSet decorators = IN_HEAP | IS_ARRAY;
  if (dest_uninitialized) {
    decorators |= IS_DEST_UNINITIALIZED;
  }
  if (aligned) {
    decorators |= ARRAYCOPY_ALIGNED;
  }
  BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
  bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
  {
    // Type(shift)      byte(0), short(1), int(2),   long(3)
    int loop_size[]   = { 192,     96,       48,       24};
    int threshold[]   = { 4096,    2048,     1024,     512};

    // UnsafeMemoryAccess page error: continue after unsafe access
    UnsafeMemoryAccessMark umam(this, !is_oop && !aligned, true);
    // 'from', 'to' and 'count' are now valid

    // temp1 holds remaining count.
    __ movq(temp1, count);

    // Zero length check.
    __ BIND(L_tail);
    __ cmpq(temp1, 0);
    __ jcc(Assembler::lessEqual, L_exit);

    __ mov64(temp2, 0);
    __ movq(temp3, temp1);
    // Special cases using 32 byte [masked] vector copy operations.
    arraycopy_avx3_special_cases_conjoint(xmm1, k2, from, to, temp2, temp3, temp1, shift,
                                          temp4, use64byteVector, L_entry, L_exit);

    // PRE-MAIN-POST loop for aligned copy.
    __ BIND(L_entry);

    if ((MaxVectorSize > 32) && (avx3threshold != 0)) {
      __ cmpq(temp1, threshold[shift]);
      __ jcc(Assembler::greaterEqual, L_pre_main_post_64);
    }

    if ((MaxVectorSize < 64) || (avx3threshold != 0)) {
      // Partial copy to make dst address 32 byte aligned.
      __ leaq(temp2, Address(to, temp1, (Address::ScaleFactor)(shift), 0));
      __ andq(temp2, 31);
      __ jcc(Assembler::equal, L_main_pre_loop);

      if (shift) {
        __ shrq(temp2, shift);
      }
      __ subq(temp1, temp2);
      copy32_masked_avx(to, from, xmm1, k2, temp2, temp1, temp3, shift);

      __ cmpq(temp1, loop_size[shift]);
      __ jcc(Assembler::less, L_tail);

      __ BIND(L_main_pre_loop);

      // Main loop with aligned copy block size of 192 bytes at 32 byte granularity.
      __ align32();
      __ BIND(L_main_loop);
      copy64_avx(to, from, temp1, xmm1, true, shift, -64);
      copy64_avx(to, from, temp1, xmm1, true, shift, -128);
      copy64_avx(to, from, temp1, xmm1, true, shift, -192);
      __ subptr(temp1, loop_size[shift]);
      __ cmpq(temp1, loop_size[shift]);
      __ jcc(Assembler::greater, L_main_loop);

      // Tail loop.
      __ jmp(L_tail);
    }

    if (MaxVectorSize > 32) {
      __ BIND(L_pre_main_post_64);
      // Partial copy to make dst address 64 byte aligned.
      __ leaq(temp2, Address(to, temp1, (Address::ScaleFactor)(shift), 0));
      __ andq(temp2, 63);
      __ jcc(Assembler::equal, L_main_pre_loop_64bytes);

      if (shift) {
        __ shrq(temp2, shift);
      }
      __ subq(temp1, temp2);
      copy64_masked_avx(to, from, xmm1, k2, temp2, temp1, temp3, shift, 0, true);

      __ cmpq(temp1, loop_size[shift]);
      __ jcc(Assembler::less, L_tail64);

      __ BIND(L_main_pre_loop_64bytes);

      // Main loop with aligned copy block size of 192 bytes at
      // 64 byte copy granularity.
      __ align32();
      __ BIND(L_main_loop_64bytes);
      copy64_avx(to, from, temp1, xmm1, true, shift, -64, true);
      copy64_avx(to, from, temp1, xmm1, true, shift, -128, true);
      copy64_avx(to, from, temp1, xmm1, true, shift, -192, true);
      __ subq(temp1, loop_size[shift]);
      __ cmpq(temp1, loop_size[shift]);
      __ jcc(Assembler::greater, L_main_loop_64bytes);

      // Zero length check.
      __ cmpq(temp1, 0);
      __ jcc(Assembler::lessEqual, L_exit);

      __ BIND(L_tail64);

      // Tail handling using 64 byte [masked] vector copy operations.
      use64byteVector = true;
      __ mov64(temp2, 0);
      __ movq(temp3, temp1);
      arraycopy_avx3_special_cases_conjoint(xmm1, k2, from, to, temp2, temp3, temp1, shift,
                                            temp4, use64byteVector, L_entry, L_exit);
    }
    __ BIND(L_exit);
  }
  address ucme_exit_pc = __ pc();
  // When called from generic_arraycopy, r11 contains specific values used
  // during the arraycopy epilogue, so re-initialize r11 here.
  if (is_oop) {
    __ movq(r11, count);
  }
  bs->arraycopy_epilogue(_masm, decorators, type, from, to, count);
  restore_argument_regs(type);
  INC_COUNTER_NP(get_profile_ctr(shift), rscratch1); // Update counter after rscratch1 is free
  __ xorptr(rax, rax); // return 0
  __ vzeroupper();
  __ leave(); // required for proper stackwalking of RuntimeStub frame
  __ ret(0);

  return start;
}

void StubGenerator::arraycopy_avx3_special_cases(XMMRegister xmm, KRegister mask, Register from,
                                                 Register to, Register count, int shift,
                                                 Register index, Register temp,
                                                 bool use64byteVector, Label& L_entry, Label& L_exit) {
  Label L_entry_64, L_entry_96, L_entry_128;
  Label L_entry_160, L_entry_192;

  int size_mat[][6] = {
  /* T_BYTE  */ {32, 64, 96, 128, 160, 192},
  /* T_SHORT */ {16, 32, 48,  64,  80,  96},
  /* T_INT   */ { 8, 16, 24,  32,  40,  48},
  /* T_LONG  */ { 4,  8, 12,  16,  20,  24}
  };

  // Case A) Special case for length less than or equal to 32 bytes.
  __ cmpq(count, size_mat[shift][0]);
  __ jccb(Assembler::greater, L_entry_64);
  copy32_masked_avx(to, from, xmm, mask, count, index, temp, shift);
  __ jmp(L_exit);

  // Case B) Special case for length less than or equal to 64 bytes.
  __ BIND(L_entry_64);
  __ cmpq(count, size_mat[shift][1]);
  __ jccb(Assembler::greater, L_entry_96);
  copy64_masked_avx(to, from, xmm, mask, count, index, temp, shift, 0, use64byteVector);
  __ jmp(L_exit);

  // Case C) Special case for length less than or equal to 96 bytes.
  __ BIND(L_entry_96);
  __ cmpq(count, size_mat[shift][2]);
  __ jccb(Assembler::greater, L_entry_128);
  copy64_avx(to, from, index, xmm, false, shift, 0, use64byteVector);
  __ subq(count, 64 >> shift);
  copy32_masked_avx(to, from, xmm, mask, count, index, temp, shift, 64);
  __ jmp(L_exit);

  // Case D) Special case for length less than or equal to 128 bytes.
  __ BIND(L_entry_128);
  __ cmpq(count, size_mat[shift][3]);
  __ jccb(Assembler::greater, L_entry_160);
  copy64_avx(to, from, index, xmm, false, shift, 0, use64byteVector);
  copy32_avx(to, from, index, xmm, shift, 64);
  __ subq(count, 96 >> shift);
  copy32_masked_avx(to, from, xmm, mask, count, index, temp, shift, 96);
  __ jmp(L_exit);

  // Case E) Special case for length less than or equal to 160 bytes.
  __ BIND(L_entry_160);
  __ cmpq(count, size_mat[shift][4]);
  __ jccb(Assembler::greater, L_entry_192);
  copy64_avx(to, from, index, xmm, false, shift, 0, use64byteVector);
  copy64_avx(to, from, index, xmm, false, shift, 64, use64byteVector);
  __ subq(count, 128 >> shift);
  copy32_masked_avx(to, from, xmm, mask, count, index, temp, shift, 128);
  __ jmp(L_exit);

  // Case F) Special case for length less than or equal to 192 bytes.
  __ BIND(L_entry_192);
  __ cmpq(count, size_mat[shift][5]);
  __ jcc(Assembler::greater, L_entry);
  copy64_avx(to, from, index, xmm, false, shift, 0, use64byteVector);
  copy64_avx(to, from, index, xmm, false, shift, 64, use64byteVector);
  copy32_avx(to, from, index, xmm, shift, 128);
  __ subq(count, 160 >> shift);
  copy32_masked_avx(to, from, xmm, mask, count, index, temp, shift, 160);
  __ jmp(L_exit);
}
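
// Note: the ladder above compares the element count against 32/64/.../192 byte
// thresholds scaled to the element size (size_mat) and finishes copies of up
// to 192 bytes with at most three vector moves plus one masked move; anything
// larger branches to L_entry, i.e. back to the main PRE-MAIN-POST loop.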

void StubGenerator::arraycopy_avx3_special_cases_256(XMMRegister xmm, KRegister mask, Register from,
                                                     Register to, Register count, int shift, Register index,
                                                     Register temp, Label& L_exit) {
  Label L_entry_64, L_entry_128, L_entry_192, L_entry_256;

  int size_mat[][4] = {
  /* T_BYTE  */ {64, 128, 192, 256},
  /* T_SHORT */ {32,  64,  96, 128},
  /* T_INT   */ {16,  32,  48,  64},
  /* T_LONG  */ { 8,  16,  24,  32}
  };

  assert(MaxVectorSize == 64, "vector length != 64");
  // Case A) Special case for length less than or equal to 64 bytes.
  __ BIND(L_entry_64);
  __ cmpq(count, size_mat[shift][0]);
  __ jccb(Assembler::greater, L_entry_128);
  copy64_masked_avx(to, from, xmm, mask, count, index, temp, shift, 0, true);
  __ jmp(L_exit);

  // Case B) Special case for length less than or equal to 128 bytes.
  __ BIND(L_entry_128);
  __ cmpq(count, size_mat[shift][1]);
  __ jccb(Assembler::greater, L_entry_192);
  copy64_avx(to, from, index, xmm, false, shift, 0, true);
  __ subq(count, 64 >> shift);
  copy64_masked_avx(to, from, xmm, mask, count, index, temp, shift, 64, true);
  __ jmp(L_exit);

  // Case C) Special case for length less than or equal to 192 bytes.
  __ BIND(L_entry_192);
  __ cmpq(count, size_mat[shift][2]);
  __ jcc(Assembler::greater, L_entry_256);
  copy64_avx(to, from, index, xmm, false, shift, 0, true);
  copy64_avx(to, from, index, xmm, false, shift, 64, true);
  __ subq(count, 128 >> shift);
  copy64_masked_avx(to, from, xmm, mask, count, index, temp, shift, 128, true);
  __ jmp(L_exit);

  // Case D) Special case for length less than or equal to 256 bytes.
  __ BIND(L_entry_256);
  copy64_avx(to, from, index, xmm, false, shift, 0, true);
  copy64_avx(to, from, index, xmm, false, shift, 64, true);
  copy64_avx(to, from, index, xmm, false, shift, 128, true);
  __ subq(count, 192 >> shift);
  copy64_masked_avx(to, from, xmm, mask, count, index, temp, shift, 192, true);
  __ jmp(L_exit);
}

void StubGenerator::arraycopy_avx3_special_cases_conjoint(XMMRegister xmm, KRegister mask, Register from,
                                                          Register to, Register start_index, Register end_index,
                                                          Register count, int shift, Register temp,
                                                          bool use64byteVector, Label& L_entry, Label& L_exit) {
  Label L_entry_64, L_entry_96, L_entry_128;
  Label L_entry_160, L_entry_192;
  bool avx3 = (MaxVectorSize > 32) && (VM_Version::avx3_threshold() == 0);

  int size_mat[][6] = {
  /* T_BYTE  */ {32, 64, 96, 128, 160, 192},
  /* T_SHORT */ {16, 32, 48,  64,  80,  96},
  /* T_INT   */ { 8, 16, 24,  32,  40,  48},
  /* T_LONG  */ { 4,  8, 12,  16,  20,  24}
  };

  // Case A) Special case for length less than or equal to 32 bytes.
  __ cmpq(count, size_mat[shift][0]);
  __ jccb(Assembler::greater, L_entry_64);
  copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
  __ jmp(L_exit);

  // Case B) Special case for length less than or equal to 64 bytes.
  __ BIND(L_entry_64);
  __ cmpq(count, size_mat[shift][1]);
  __ jccb(Assembler::greater, L_entry_96);
  if (avx3) {
    copy64_masked_avx(to, from, xmm, mask, count, start_index, temp, shift, 0, true);
  } else {
    copy32_avx(to, from, end_index, xmm, shift, -32);
    __ subq(count, 32 >> shift);
    copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
  }
  __ jmp(L_exit);

  // Case C) Special case for length less than or equal to 96 bytes.
  __ BIND(L_entry_96);
  __ cmpq(count, size_mat[shift][2]);
  __ jccb(Assembler::greater, L_entry_128);
  copy64_avx(to, from, end_index, xmm, true, shift, -64, use64byteVector);
  __ subq(count, 64 >> shift);
  copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
  __ jmp(L_exit);

  // Case D) Special case for length less than or equal to 128 bytes.
  __ BIND(L_entry_128);
  __ cmpq(count, size_mat[shift][3]);
  __ jccb(Assembler::greater, L_entry_160);
  copy64_avx(to, from, end_index, xmm, true, shift, -64, use64byteVector);
  copy32_avx(to, from, end_index, xmm, shift, -96);
  __ subq(count, 96 >> shift);
  copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
  __ jmp(L_exit);

  // Case E) Special case for length less than or equal to 160 bytes.
  __ BIND(L_entry_160);
  __ cmpq(count, size_mat[shift][4]);
  __ jccb(Assembler::greater, L_entry_192);
  copy64_avx(to, from, end_index, xmm, true, shift, -64, use64byteVector);
  copy64_avx(to, from, end_index, xmm, true, shift, -128, use64byteVector);
  __ subq(count, 128 >> shift);
  copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
  __ jmp(L_exit);

  // Case F) Special case for length less than or equal to 192 bytes.
  __ BIND(L_entry_192);
  __ cmpq(count, size_mat[shift][5]);
  __ jcc(Assembler::greater, L_entry);
  copy64_avx(to, from, end_index, xmm, true, shift, -64, use64byteVector);
  copy64_avx(to, from, end_index, xmm, true, shift, -128, use64byteVector);
  copy32_avx(to, from, end_index, xmm, shift, -160);
  __ subq(count, 160 >> shift);
  copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
  __ jmp(L_exit);
}

void StubGenerator::copy256_avx3(Register dst, Register src, Register index, XMMRegister xmm1,
                                 XMMRegister xmm2, XMMRegister xmm3, XMMRegister xmm4,
                                 int shift, int offset) {
  if (MaxVectorSize == 64) {
    Address::ScaleFactor scale = (Address::ScaleFactor)(shift);
    __ prefetcht0(Address(src, index, scale, offset + 0x200));
    __ prefetcht0(Address(src, index, scale, offset + 0x240));
    __ prefetcht0(Address(src, index, scale, offset + 0x280));
    __ prefetcht0(Address(src, index, scale, offset + 0x2C0));

    __ prefetcht0(Address(src, index, scale, offset + 0x400));
    __ prefetcht0(Address(src, index, scale, offset + 0x440));
    __ prefetcht0(Address(src, index, scale, offset + 0x480));
    __ prefetcht0(Address(src, index, scale, offset + 0x4C0));

    __ evmovdquq(xmm1, Address(src, index, scale, offset), Assembler::AVX_512bit);
    __ evmovdquq(xmm2, Address(src, index, scale, offset + 0x40), Assembler::AVX_512bit);
    __ evmovdquq(xmm3, Address(src, index, scale, offset + 0x80), Assembler::AVX_512bit);
    __ evmovdquq(xmm4, Address(src, index, scale, offset + 0xC0), Assembler::AVX_512bit);

    __ evmovntdquq(Address(dst, index, scale, offset), xmm1, Assembler::AVX_512bit);
    __ evmovntdquq(Address(dst, index, scale, offset + 0x40), xmm2, Assembler::AVX_512bit);
    __ evmovntdquq(Address(dst, index, scale, offset + 0x80), xmm3, Assembler::AVX_512bit);
    __ evmovntdquq(Address(dst, index, scale, offset + 0xC0), xmm4, Assembler::AVX_512bit);
  }
}

void StubGenerator::copy64_masked_avx(Register dst, Register src, XMMRegister xmm,
                                      KRegister mask, Register length, Register index,
                                      Register temp, int shift, int offset,
                                      bool use64byteVector) {
  BasicType type[] = { T_BYTE, T_SHORT, T_INT, T_LONG};
  assert(MaxVectorSize >= 32, "vector length should be >= 32");
  if (!use64byteVector) {
    copy32_avx(dst, src, index, xmm, shift, offset);
    __ subptr(length, 32 >> shift);
    copy32_masked_avx(dst, src, xmm, mask, length, index, temp, shift, offset + 32);
  } else {
    Address::ScaleFactor scale = (Address::ScaleFactor)(shift);
    assert(MaxVectorSize == 64, "vector length != 64");
    __ mov64(temp, -1L);
    __ bzhiq(temp, temp, length);
    __ kmovql(mask, temp);
    __ evmovdqu(type[shift], mask, xmm, Address(src, index, scale, offset), false, Assembler::AVX_512bit);
    __ evmovdqu(type[shift], mask, Address(dst, index, scale, offset), xmm, true, Assembler::AVX_512bit);
  }
}


void StubGenerator::copy32_masked_avx(Register dst, Register src, XMMRegister xmm,
                                      KRegister mask, Register length, Register index,
                                      Register temp, int shift, int offset) {
  assert(MaxVectorSize >= 32, "vector length should be >= 32");
  BasicType type[] = { T_BYTE, T_SHORT, T_INT, T_LONG};
  Address::ScaleFactor scale = (Address::ScaleFactor)(shift);
  __ mov64(temp, -1L);
  __ bzhiq(temp, temp, length);
  __ kmovql(mask, temp);
  __ evmovdqu(type[shift], mask, xmm, Address(src, index, scale, offset), false, Assembler::AVX_256bit);
  __ evmovdqu(type[shift], mask, Address(dst, index, scale, offset), xmm, true, Assembler::AVX_256bit);
}
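
// Note: both masked copies above build their predicate with BZHI: temp starts
// as -1, bzhiq clears every bit at position 'length' and above, and kmovql
// turns the result into a k-mask whose low 'length' bits are set, so the
// masked evmovdqu moves exactly 'length' elements.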


void StubGenerator::copy32_avx(Register dst, Register src, Register index, XMMRegister xmm,
                               int shift, int offset) {
  assert(MaxVectorSize >= 32, "vector length should be >= 32");
  Address::ScaleFactor scale = (Address::ScaleFactor)(shift);
  __ vmovdqu(xmm, Address(src, index, scale, offset));
  __ vmovdqu(Address(dst, index, scale, offset), xmm);
}


void StubGenerator::copy64_avx(Register dst, Register src, Register index, XMMRegister xmm,
                               bool conjoint, int shift, int offset, bool use64byteVector) {
  assert(MaxVectorSize == 64 || MaxVectorSize == 32, "vector length mismatch");
  if (!use64byteVector) {
    if (conjoint) {
      copy32_avx(dst, src, index, xmm, shift, offset + 32);
      copy32_avx(dst, src, index, xmm, shift, offset);
    } else {
      copy32_avx(dst, src, index, xmm, shift, offset);
      copy32_avx(dst, src, index, xmm, shift, offset + 32);
    }
  } else {
    Address::ScaleFactor scale = (Address::ScaleFactor)(shift);
    __ evmovdquq(xmm, Address(src, index, scale, offset), Assembler::AVX_512bit);
    __ evmovdquq(Address(dst, index, scale, offset), xmm, Assembler::AVX_512bit);
  }
}

#endif // COMPILER2_OR_JVMCI


// Arguments:
//   entry - location for return of (post-push) entry
//
// Inputs:
//   c_rarg0   - source array address
//   c_rarg1   - destination array address
//   c_rarg2   - element count, treated as ssize_t, can be zero
//
// If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
// we let the hardware handle it. The one to eight bytes within words,
// dwords or qwords that span cache line boundaries will still be loaded
// and stored atomically.
//
// Side Effects:
//   entry is set to the no-overlap entry point
//   used by generate_conjoint_byte_copy().
//
address StubGenerator::generate_disjoint_byte_copy(address* entry) {
  StubGenStubId stub_id = StubGenStubId::jbyte_disjoint_arraycopy_id;
  // aligned is always false -- x86_64 always uses the unaligned code
  const bool aligned = false;
#if COMPILER2_OR_JVMCI
  if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) {
    return generate_disjoint_copy_avx3_masked(stub_id, entry);
  }
#endif
  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, stub_id);
  address start = __ pc();
  DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;

  Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes;
  Label L_copy_byte, L_exit;
  const Register from        = rdi;  // source array address
  const Register to          = rsi;  // destination array address
  const Register count       = rdx;  // elements count
  const Register byte_count  = rcx;
  const Register qword_count = count;
  const Register end_from    = from; // source array end address
  const Register end_to      = to;   // destination array end address
  // End pointers are inclusive, and if count is not zero they point
  // to the last unit copied: end_to[0] := end_from[0]

  __ enter(); // required for proper stackwalking of RuntimeStub frame
  assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int.

  if (entry != nullptr) {
    *entry = __ pc();
    // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
    BLOCK_COMMENT("Entry:");
  }

  setup_arg_regs(); // from => rdi, to => rsi, count => rdx
                    // r9 and r10 may be used to save non-volatile registers

  {
    // UnsafeMemoryAccess page error: continue after unsafe access
    UnsafeMemoryAccessMark umam(this, !aligned, true);
    // 'from', 'to' and 'count' are now valid
    __ movptr(byte_count, count);
    __ shrptr(count, 3);   // count => qword_count

    // Copy from low to high addresses.  Use 'to' as scratch.
    __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
    __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
    __ negptr(qword_count); // make the count negative
    __ jmp(L_copy_bytes);

    // Copy trailing qwords
    __ BIND(L_copy_8_bytes);
    __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
    __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
    __ increment(qword_count);
    __ jcc(Assembler::notZero, L_copy_8_bytes);

    // Check for and copy trailing dword
    __ BIND(L_copy_4_bytes);
    __ testl(byte_count, 4);
    __ jccb(Assembler::zero, L_copy_2_bytes);
    __ movl(rax, Address(end_from, 8));
    __ movl(Address(end_to, 8), rax);

    __ addptr(end_from, 4);
    __ addptr(end_to, 4);

    // Check for and copy trailing word
    __ BIND(L_copy_2_bytes);
    __ testl(byte_count, 2);
    __ jccb(Assembler::zero, L_copy_byte);
    __ movw(rax, Address(end_from, 8));
    __ movw(Address(end_to, 8), rax);

    __ addptr(end_from, 2);
    __ addptr(end_to, 2);

    // Check for and copy trailing byte
    __ BIND(L_copy_byte);
    __ testl(byte_count, 1);
    __ jccb(Assembler::zero, L_exit);
    __ movb(rax, Address(end_from, 8));
    __ movb(Address(end_to, 8), rax);
  }
  __ BIND(L_exit);
  address ucme_exit_pc = __ pc();
  restore_arg_regs();
  INC_COUNTER_NP(SharedRuntime::_jbyte_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
  __ xorptr(rax, rax); // return 0
  __ vzeroupper();
  __ leave(); // required for proper stackwalking of RuntimeStub frame
  __ ret(0);

  {
    UnsafeMemoryAccessMark umam(this, !aligned, false, ucme_exit_pc);
    // Copy in multi-byte chunks
    copy_bytes_forward(end_from, end_to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, T_BYTE);
    __ jmp(L_copy_4_bytes);
  }
  return start;
}
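
// Note: generate_disjoint_byte_copy moves the bulk of the data eight bytes at
// a time (qword_count = byte count / 8, kept negative so the copy loop counts
// up toward zero); the leftover 0-7 bytes are then handled by testing bits
// 2, 1 and 0 of byte_count (dword, word, byte tails).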

// Arguments:
//   entry - location for return of (post-push) entry
//   nooverlap_target - entry to branch to if no overlap detected
//
// Inputs:
//   c_rarg0   - source array address
//   c_rarg1   - destination array address
//   c_rarg2   - element count, treated as ssize_t, can be zero
//
// If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
// we let the hardware handle it. The one to eight bytes within words,
// dwords or qwords that span cache line boundaries will still be loaded
// and stored atomically.
//
address StubGenerator::generate_conjoint_byte_copy(address nooverlap_target, address* entry) {
  StubGenStubId stub_id = StubGenStubId::jbyte_arraycopy_id;
  // aligned is always false -- x86_64 always uses the unaligned code
  const bool aligned = false;
#if COMPILER2_OR_JVMCI
  if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) {
    return generate_conjoint_copy_avx3_masked(stub_id, entry, nooverlap_target);
  }
#endif
  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, stub_id);
  address start = __ pc();
  DecoratorSet decorators = IN_HEAP | IS_ARRAY;

  Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes;
  const Register from        = rdi;  // source array address
  const Register to          = rsi;  // destination array address
  const Register count       = rdx;  // elements count
  const Register byte_count  = rcx;
  const Register qword_count = count;

  __ enter(); // required for proper stackwalking of RuntimeStub frame
  assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int.

  if (entry != nullptr) {
    *entry = __ pc();
    // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
    BLOCK_COMMENT("Entry:");
  }

  array_overlap_test(nooverlap_target, Address::times_1);
  setup_arg_regs(); // from => rdi, to => rsi, count => rdx
                    // r9 and r10 may be used to save non-volatile registers

  {
    // UnsafeMemoryAccess page error: continue after unsafe access
    UnsafeMemoryAccessMark umam(this, !aligned, true);
    // 'from', 'to' and 'count' are now valid
    __ movptr(byte_count, count);
    __ shrptr(count, 3);   // count => qword_count

    // Copy from high to low addresses.
1496 1497 // Check for and copy trailing byte 1498 __ testl(byte_count, 1); 1499 __ jcc(Assembler::zero, L_copy_2_bytes); 1500 __ movb(rax, Address(from, byte_count, Address::times_1, -1)); 1501 __ movb(Address(to, byte_count, Address::times_1, -1), rax); 1502 __ decrement(byte_count); // Adjust for possible trailing word 1503 1504 // Check for and copy trailing word 1505 __ BIND(L_copy_2_bytes); 1506 __ testl(byte_count, 2); 1507 __ jcc(Assembler::zero, L_copy_4_bytes); 1508 __ movw(rax, Address(from, byte_count, Address::times_1, -2)); 1509 __ movw(Address(to, byte_count, Address::times_1, -2), rax); 1510 1511 // Check for and copy trailing dword 1512 __ BIND(L_copy_4_bytes); 1513 __ testl(byte_count, 4); 1514 __ jcc(Assembler::zero, L_copy_bytes); 1515 __ movl(rax, Address(from, qword_count, Address::times_8)); 1516 __ movl(Address(to, qword_count, Address::times_8), rax); 1517 __ jmp(L_copy_bytes); 1518 1519 // Copy trailing qwords 1520 __ BIND(L_copy_8_bytes); 1521 __ movq(rax, Address(from, qword_count, Address::times_8, -8)); 1522 __ movq(Address(to, qword_count, Address::times_8, -8), rax); 1523 __ decrement(qword_count); 1524 __ jcc(Assembler::notZero, L_copy_8_bytes); 1525 } 1526 restore_arg_regs(); 1527 INC_COUNTER_NP(SharedRuntime::_jbyte_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free 1528 __ xorptr(rax, rax); // return 0 1529 __ vzeroupper(); 1530 __ leave(); // required for proper stackwalking of RuntimeStub frame 1531 __ ret(0); 1532 1533 { 1534 // UnsafeMemoryAccess page error: continue after unsafe access 1535 UnsafeMemoryAccessMark umam(this, !aligned, true); 1536 // Copy in multi-bytes chunks 1537 copy_bytes_backward(from, to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, T_BYTE); 1538 } 1539 restore_arg_regs(); 1540 INC_COUNTER_NP(SharedRuntime::_jbyte_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free 1541 __ xorptr(rax, rax); // return 0 1542 __ vzeroupper(); 1543 __ leave(); // required for proper stackwalking of RuntimeStub frame 1544 __ ret(0); 1545 1546 return start; 1547 } 1548 1549 1550 // Arguments: 1551 // entry - location for return of (post-push) entry 1552 // 1553 // Inputs: 1554 // c_rarg0 - source array address 1555 // c_rarg1 - destination array address 1556 // c_rarg2 - element count, treated as ssize_t, can be zero 1557 // 1558 // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we 1559 // let the hardware handle it. The two or four words within dwords 1560 // or qwords that span cache line boundaries will still be loaded 1561 // and stored atomically. 1562 // 1563 // Side Effects: 1564 // entry is set to the no-overlap entry point 1565 // used by generate_conjoint_short_copy(). 
1566 // 1567 address StubGenerator::generate_disjoint_short_copy(address *entry) { 1568 StubGenStubId stub_id = StubGenStubId::jshort_disjoint_arraycopy_id; 1569 // aligned is always false -- x86_64 always uses the unaligned code 1570 const bool aligned = false; 1571 #if COMPILER2_OR_JVMCI 1572 if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) { 1573 return generate_disjoint_copy_avx3_masked(stub_id, entry); 1574 } 1575 #endif 1576 1577 __ align(CodeEntryAlignment); 1578 StubCodeMark mark(this, stub_id); 1579 address start = __ pc(); 1580 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT; 1581 1582 Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes,L_copy_2_bytes,L_exit; 1583 const Register from = rdi; // source array address 1584 const Register to = rsi; // destination array address 1585 const Register count = rdx; // elements count 1586 const Register word_count = rcx; 1587 const Register qword_count = count; 1588 const Register end_from = from; // source array end address 1589 const Register end_to = to; // destination array end address 1590 // End pointers are inclusive, and if count is not zero they point 1591 // to the last unit copied: end_to[0] := end_from[0] 1592 1593 __ enter(); // required for proper stackwalking of RuntimeStub frame 1594 assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int. 1595 1596 if (entry != nullptr) { 1597 *entry = __ pc(); 1598 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 1599 BLOCK_COMMENT("Entry:"); 1600 } 1601 1602 setup_arg_regs(); // from => rdi, to => rsi, count => rdx 1603 // r9 and r10 may be used to save non-volatile registers 1604 1605 { 1606 // UnsafeMemoryAccess page error: continue after unsafe access 1607 UnsafeMemoryAccessMark umam(this, !aligned, true); 1608 // 'from', 'to' and 'count' are now valid 1609 __ movptr(word_count, count); 1610 __ shrptr(count, 2); // count => qword_count 1611 1612 // Copy from low to high addresses. Use 'to' as scratch. 
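    //
    // The forward loop below uses end pointers biased to the last qword and a
    // negated index, so the loop test is just "increment until zero".  A rough
    // C model (illustration only; the multi-qword chunk loop in
    // copy_bytes_forward is elided):
    //
    //   int64_t   n  = qword_count;
    //   uint64_t* ef = (uint64_t*)from + n - 1;   // end_from
    //   uint64_t* et = (uint64_t*)to   + n - 1;   // end_to
    //   for (int64_t i = -n; i != 0; i++) {
    //     et[i + 1] = ef[i + 1];                  // i == -n copies the first qword
    //   }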
1613 __ lea(end_from, Address(from, qword_count, Address::times_8, -8)); 1614 __ lea(end_to, Address(to, qword_count, Address::times_8, -8)); 1615 __ negptr(qword_count); 1616 __ jmp(L_copy_bytes); 1617 1618 // Copy trailing qwords 1619 __ BIND(L_copy_8_bytes); 1620 __ movq(rax, Address(end_from, qword_count, Address::times_8, 8)); 1621 __ movq(Address(end_to, qword_count, Address::times_8, 8), rax); 1622 __ increment(qword_count); 1623 __ jcc(Assembler::notZero, L_copy_8_bytes); 1624 1625 // Original 'dest' is trashed, so we can't use it as a 1626 // base register for a possible trailing word copy 1627 1628 // Check for and copy trailing dword 1629 __ BIND(L_copy_4_bytes); 1630 __ testl(word_count, 2); 1631 __ jccb(Assembler::zero, L_copy_2_bytes); 1632 __ movl(rax, Address(end_from, 8)); 1633 __ movl(Address(end_to, 8), rax); 1634 1635 __ addptr(end_from, 4); 1636 __ addptr(end_to, 4); 1637 1638 // Check for and copy trailing word 1639 __ BIND(L_copy_2_bytes); 1640 __ testl(word_count, 1); 1641 __ jccb(Assembler::zero, L_exit); 1642 __ movw(rax, Address(end_from, 8)); 1643 __ movw(Address(end_to, 8), rax); 1644 } 1645 __ BIND(L_exit); 1646 address ucme_exit_pc = __ pc(); 1647 restore_arg_regs(); 1648 INC_COUNTER_NP(SharedRuntime::_jshort_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free 1649 __ xorptr(rax, rax); // return 0 1650 __ vzeroupper(); 1651 __ leave(); // required for proper stackwalking of RuntimeStub frame 1652 __ ret(0); 1653 1654 { 1655 UnsafeMemoryAccessMark umam(this, !aligned, false, ucme_exit_pc); 1656 // Copy in multi-bytes chunks 1657 copy_bytes_forward(end_from, end_to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, T_SHORT); 1658 __ jmp(L_copy_4_bytes); 1659 } 1660 1661 return start; 1662 } 1663 1664 1665 address StubGenerator::generate_fill(StubGenStubId stub_id) { 1666 BasicType t; 1667 bool aligned; 1668 1669 switch (stub_id) { 1670 case jbyte_fill_id: 1671 t = T_BYTE; 1672 aligned = false; 1673 break; 1674 case jshort_fill_id: 1675 t = T_SHORT; 1676 aligned = false; 1677 break; 1678 case jint_fill_id: 1679 t = T_INT; 1680 aligned = false; 1681 break; 1682 case arrayof_jbyte_fill_id: 1683 t = T_BYTE; 1684 aligned = true; 1685 break; 1686 case arrayof_jshort_fill_id: 1687 t = T_SHORT; 1688 aligned = true; 1689 break; 1690 case arrayof_jint_fill_id: 1691 t = T_INT; 1692 aligned = true; 1693 break; 1694 default: 1695 ShouldNotReachHere(); 1696 } 1697 1698 __ align(CodeEntryAlignment); 1699 StubCodeMark mark(this, stub_id); 1700 address start = __ pc(); 1701 1702 BLOCK_COMMENT("Entry:"); 1703 1704 const Register to = c_rarg0; // destination array address 1705 const Register value = c_rarg1; // value 1706 const Register count = c_rarg2; // elements count 1707 __ mov(r11, count); 1708 1709 __ enter(); // required for proper stackwalking of RuntimeStub frame 1710 1711 { 1712 // Add set memory mark to protect against unsafe accesses faulting 1713 UnsafeMemoryAccessMark umam(this, ((t == T_BYTE) && !aligned), true); 1714 __ generate_fill(t, aligned, to, value, r11, rax, xmm0); 1715 } 1716 1717 __ vzeroupper(); 1718 __ leave(); // required for proper stackwalking of RuntimeStub frame 1719 __ ret(0); 1720 1721 return start; 1722 } 1723 1724 1725 // Arguments: 1726 // entry - location for return of (post-push) entry 1727 // nooverlap_target - entry to branch to if no overlap detected 1728 // 1729 // Inputs: 1730 // c_rarg0 - source array address 1731 // c_rarg1 - destination array address 1732 // c_rarg2 - element count, treated as 
ssize_t, can be zero 1733 // 1734 // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we 1735 // let the hardware handle it. The two or four words within dwords 1736 // or qwords that span cache line boundaries will still be loaded 1737 // and stored atomically. 1738 // 1739 address StubGenerator::generate_conjoint_short_copy(address nooverlap_target, address *entry) { 1740 StubGenStubId stub_id = StubGenStubId::jshort_arraycopy_id; 1741 // aligned is always false -- x86_64 always uses the unaligned code 1742 const bool aligned = false; 1743 #if COMPILER2_OR_JVMCI 1744 if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) { 1745 return generate_conjoint_copy_avx3_masked(stub_id, entry, nooverlap_target); 1746 } 1747 #endif 1748 1749 __ align(CodeEntryAlignment); 1750 StubCodeMark mark(this, stub_id); 1751 address start = __ pc(); 1752 DecoratorSet decorators = IN_HEAP | IS_ARRAY; 1753 1754 Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes; 1755 const Register from = rdi; // source array address 1756 const Register to = rsi; // destination array address 1757 const Register count = rdx; // elements count 1758 const Register word_count = rcx; 1759 const Register qword_count = count; 1760 1761 __ enter(); // required for proper stackwalking of RuntimeStub frame 1762 assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int. 1763 1764 if (entry != nullptr) { 1765 *entry = __ pc(); 1766 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 1767 BLOCK_COMMENT("Entry:"); 1768 } 1769 1770 array_overlap_test(nooverlap_target, Address::times_2); 1771 setup_arg_regs(); // from => rdi, to => rsi, count => rdx 1772 // r9 and r10 may be used to save non-volatile registers 1773 1774 { 1775 // UnsafeMemoryAccess page error: continue after unsafe access 1776 UnsafeMemoryAccessMark umam(this, !aligned, true); 1777 // 'from', 'to' and 'count' are now valid 1778 __ movptr(word_count, count); 1779 __ shrptr(count, 2); // count => qword_count 1780 1781 // Copy from high to low addresses. Use 'to' as scratch. 
    // Check for and copy trailing word
    __ testl(word_count, 1);
    __ jccb(Assembler::zero, L_copy_4_bytes);
    __ movw(rax, Address(from, word_count, Address::times_2, -2));
    __ movw(Address(to, word_count, Address::times_2, -2), rax);

    // Check for and copy trailing dword
    __ BIND(L_copy_4_bytes);
    __ testl(word_count, 2);
    __ jcc(Assembler::zero, L_copy_bytes);
    __ movl(rax, Address(from, qword_count, Address::times_8));
    __ movl(Address(to, qword_count, Address::times_8), rax);
    __ jmp(L_copy_bytes);

    // Copy trailing qwords
    __ BIND(L_copy_8_bytes);
    __ movq(rax, Address(from, qword_count, Address::times_8, -8));
    __ movq(Address(to, qword_count, Address::times_8, -8), rax);
    __ decrement(qword_count);
    __ jcc(Assembler::notZero, L_copy_8_bytes);
  }
  restore_arg_regs();
  INC_COUNTER_NP(SharedRuntime::_jshort_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
  __ xorptr(rax, rax); // return 0
  __ vzeroupper();
  __ leave(); // required for proper stackwalking of RuntimeStub frame
  __ ret(0);

  {
    // UnsafeMemoryAccess page error: continue after unsafe access
    UnsafeMemoryAccessMark umam(this, !aligned, true);
    // Copy in multi-byte chunks
    copy_bytes_backward(from, to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, T_SHORT);
  }
  restore_arg_regs();
  INC_COUNTER_NP(SharedRuntime::_jshort_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free
  __ xorptr(rax, rax); // return 0
  __ vzeroupper();
  __ leave(); // required for proper stackwalking of RuntimeStub frame
  __ ret(0);

  return start;
}


// Arguments:
//   stub_id - unique id for stub to generate
//   entry   - location for return of (post-push) entry
//   is_oop  - true => oop array, so generate store check code
//
// Inputs:
//   c_rarg0   - source array address
//   c_rarg1   - destination array address
//   c_rarg2   - element count, treated as ssize_t, can be zero
//
// If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
// the hardware handle it. The two dwords within qwords that span
// cache line boundaries will still be loaded and stored atomically.
//
// Side Effects:
//   disjoint_int_copy_entry is set to the no-overlap entry point
//   used by generate_conjoint_int_oop_copy().
1845 // 1846 address StubGenerator::generate_disjoint_int_oop_copy(StubGenStubId stub_id, address* entry) { 1847 // aligned is always false -- x86_64 always uses the unaligned code 1848 const bool aligned = false; 1849 bool is_oop; 1850 bool dest_uninitialized; 1851 switch (stub_id) { 1852 case StubGenStubId::jint_disjoint_arraycopy_id: 1853 is_oop = false; 1854 dest_uninitialized = false; 1855 break; 1856 case StubGenStubId::oop_disjoint_arraycopy_id: 1857 assert(UseCompressedOops, "inconsistent oop copy size!"); 1858 is_oop = true; 1859 dest_uninitialized = false; 1860 break; 1861 case StubGenStubId::oop_disjoint_arraycopy_uninit_id: 1862 assert(UseCompressedOops, "inconsistent oop copy size!"); 1863 is_oop = true; 1864 dest_uninitialized = true; 1865 break; 1866 default: 1867 ShouldNotReachHere(); 1868 } 1869 1870 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1871 #if COMPILER2_OR_JVMCI 1872 if ((!is_oop || bs->supports_avx3_masked_arraycopy()) && VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) { 1873 return generate_disjoint_copy_avx3_masked(stub_id, entry); 1874 } 1875 #endif 1876 1877 __ align(CodeEntryAlignment); 1878 StubCodeMark mark(this, stub_id); 1879 address start = __ pc(); 1880 1881 Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_exit; 1882 const Register from = rdi; // source array address 1883 const Register to = rsi; // destination array address 1884 const Register count = rdx; // elements count 1885 const Register dword_count = rcx; 1886 const Register qword_count = count; 1887 const Register end_from = from; // source array end address 1888 const Register end_to = to; // destination array end address 1889 // End pointers are inclusive, and if count is not zero they point 1890 // to the last unit copied: end_to[0] := end_from[0] 1891 1892 __ enter(); // required for proper stackwalking of RuntimeStub frame 1893 assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int. 1894 1895 if (entry != nullptr) { 1896 *entry = __ pc(); 1897 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 1898 BLOCK_COMMENT("Entry:"); 1899 } 1900 1901 setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx 1902 // r9 is used to save r15_thread 1903 1904 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT; 1905 if (dest_uninitialized) { 1906 decorators |= IS_DEST_UNINITIALIZED; 1907 } 1908 if (aligned) { 1909 decorators |= ARRAYCOPY_ALIGNED; 1910 } 1911 1912 BasicType type = is_oop ? T_OBJECT : T_INT; 1913 bs->arraycopy_prologue(_masm, decorators, type, from, to, count); 1914 1915 { 1916 // UnsafeMemoryAccess page error: continue after unsafe access 1917 UnsafeMemoryAccessMark umam(this, !is_oop && !aligned, true); 1918 // 'from', 'to' and 'count' are now valid 1919 __ movptr(dword_count, count); 1920 __ shrptr(count, 1); // count => qword_count 1921 1922 // Copy from low to high addresses. Use 'to' as scratch. 
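    //
    // Elements here are 4 bytes (jints, or narrow oops when UseCompressedOops),
    // so after the qword loop at most one dword can remain.  Roughly
    // (illustration only):
    //
    //   size_t d = dword_count;          // element count
    //   size_t q = d >> 1;               // qwords moved by the main loop
    //   // ... forward qword copy as in the byte/short stubs ...
    //   if (d & 1) { copy the final 4 bytes at byte offset 8*q; }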
1923 __ lea(end_from, Address(from, qword_count, Address::times_8, -8)); 1924 __ lea(end_to, Address(to, qword_count, Address::times_8, -8)); 1925 __ negptr(qword_count); 1926 __ jmp(L_copy_bytes); 1927 1928 // Copy trailing qwords 1929 __ BIND(L_copy_8_bytes); 1930 __ movq(rax, Address(end_from, qword_count, Address::times_8, 8)); 1931 __ movq(Address(end_to, qword_count, Address::times_8, 8), rax); 1932 __ increment(qword_count); 1933 __ jcc(Assembler::notZero, L_copy_8_bytes); 1934 1935 // Check for and copy trailing dword 1936 __ BIND(L_copy_4_bytes); 1937 __ testl(dword_count, 1); // Only byte test since the value is 0 or 1 1938 __ jccb(Assembler::zero, L_exit); 1939 __ movl(rax, Address(end_from, 8)); 1940 __ movl(Address(end_to, 8), rax); 1941 } 1942 __ BIND(L_exit); 1943 address ucme_exit_pc = __ pc(); 1944 bs->arraycopy_epilogue(_masm, decorators, type, from, to, dword_count); 1945 restore_arg_regs_using_thread(); 1946 INC_COUNTER_NP(SharedRuntime::_jint_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free 1947 __ vzeroupper(); 1948 __ xorptr(rax, rax); // return 0 1949 __ leave(); // required for proper stackwalking of RuntimeStub frame 1950 __ ret(0); 1951 1952 { 1953 UnsafeMemoryAccessMark umam(this, !is_oop && !aligned, false, ucme_exit_pc); 1954 // Copy in multi-bytes chunks 1955 copy_bytes_forward(end_from, end_to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, is_oop ? T_OBJECT : T_INT); 1956 __ jmp(L_copy_4_bytes); 1957 } 1958 1959 return start; 1960 } 1961 1962 1963 // Arguments: 1964 // entry - location for return of (post-push) entry 1965 // nooverlap_target - entry to branch to if no overlap detected 1966 // is_oop - true => oop array, so generate store check code 1967 // 1968 // Inputs: 1969 // c_rarg0 - source array address 1970 // c_rarg1 - destination array address 1971 // c_rarg2 - element count, treated as ssize_t, can be zero 1972 // 1973 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1974 // the hardware handle it. The two dwords within qwords that span 1975 // cache line boundaries will still be loaded and stored atomically. 
1976 // 1977 address StubGenerator::generate_conjoint_int_oop_copy(StubGenStubId stub_id, address nooverlap_target, address *entry) { 1978 // aligned is always false -- x86_64 always uses the unaligned code 1979 const bool aligned = false; 1980 bool is_oop; 1981 bool dest_uninitialized; 1982 switch (stub_id) { 1983 case StubGenStubId::jint_arraycopy_id: 1984 is_oop = false; 1985 dest_uninitialized = false; 1986 break; 1987 case StubGenStubId::oop_arraycopy_id: 1988 assert(UseCompressedOops, "inconsistent oop copy size!"); 1989 is_oop = true; 1990 dest_uninitialized = false; 1991 break; 1992 case StubGenStubId::oop_arraycopy_uninit_id: 1993 assert(UseCompressedOops, "inconsistent oop copy size!"); 1994 is_oop = true; 1995 dest_uninitialized = true; 1996 break; 1997 default: 1998 ShouldNotReachHere(); 1999 } 2000 2001 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 2002 #if COMPILER2_OR_JVMCI 2003 if ((!is_oop || bs->supports_avx3_masked_arraycopy()) && VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) { 2004 return generate_conjoint_copy_avx3_masked(stub_id, entry, nooverlap_target); 2005 } 2006 #endif 2007 2008 __ align(CodeEntryAlignment); 2009 StubCodeMark mark(this, stub_id); 2010 address start = __ pc(); 2011 2012 Label L_copy_bytes, L_copy_8_bytes, L_exit; 2013 const Register from = rdi; // source array address 2014 const Register to = rsi; // destination array address 2015 const Register count = rdx; // elements count 2016 const Register dword_count = rcx; 2017 const Register qword_count = count; 2018 2019 __ enter(); // required for proper stackwalking of RuntimeStub frame 2020 assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int. 2021 2022 if (entry != nullptr) { 2023 *entry = __ pc(); 2024 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 2025 BLOCK_COMMENT("Entry:"); 2026 } 2027 2028 array_overlap_test(nooverlap_target, Address::times_4); 2029 setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx 2030 // r9 is used to save r15_thread 2031 2032 DecoratorSet decorators = IN_HEAP | IS_ARRAY; 2033 if (dest_uninitialized) { 2034 decorators |= IS_DEST_UNINITIALIZED; 2035 } 2036 if (aligned) { 2037 decorators |= ARRAYCOPY_ALIGNED; 2038 } 2039 2040 BasicType type = is_oop ? T_OBJECT : T_INT; 2041 // no registers are destroyed by this call 2042 bs->arraycopy_prologue(_masm, decorators, type, from, to, count); 2043 2044 assert_clean_int(count, rax); // Make sure 'count' is clean int. 2045 { 2046 // UnsafeMemoryAccess page error: continue after unsafe access 2047 UnsafeMemoryAccessMark umam(this, !is_oop && !aligned, true); 2048 // 'from', 'to' and 'count' are now valid 2049 __ movptr(dword_count, count); 2050 __ shrptr(count, 1); // count => qword_count 2051 2052 // Copy from high to low addresses. Use 'to' as scratch. 
2053 2054 // Check for and copy trailing dword 2055 __ testl(dword_count, 1); 2056 __ jcc(Assembler::zero, L_copy_bytes); 2057 __ movl(rax, Address(from, dword_count, Address::times_4, -4)); 2058 __ movl(Address(to, dword_count, Address::times_4, -4), rax); 2059 __ jmp(L_copy_bytes); 2060 2061 // Copy trailing qwords 2062 __ BIND(L_copy_8_bytes); 2063 __ movq(rax, Address(from, qword_count, Address::times_8, -8)); 2064 __ movq(Address(to, qword_count, Address::times_8, -8), rax); 2065 __ decrement(qword_count); 2066 __ jcc(Assembler::notZero, L_copy_8_bytes); 2067 } 2068 if (is_oop) { 2069 __ jmp(L_exit); 2070 } 2071 restore_arg_regs_using_thread(); 2072 INC_COUNTER_NP(SharedRuntime::_jint_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free 2073 __ xorptr(rax, rax); // return 0 2074 __ vzeroupper(); 2075 __ leave(); // required for proper stackwalking of RuntimeStub frame 2076 __ ret(0); 2077 2078 { 2079 // UnsafeMemoryAccess page error: continue after unsafe access 2080 UnsafeMemoryAccessMark umam(this, !is_oop && !aligned, true); 2081 // Copy in multi-bytes chunks 2082 copy_bytes_backward(from, to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, is_oop ? T_OBJECT : T_INT); 2083 } 2084 2085 __ BIND(L_exit); 2086 bs->arraycopy_epilogue(_masm, decorators, type, from, to, dword_count); 2087 restore_arg_regs_using_thread(); 2088 INC_COUNTER_NP(SharedRuntime::_jint_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free 2089 __ xorptr(rax, rax); // return 0 2090 __ vzeroupper(); 2091 __ leave(); // required for proper stackwalking of RuntimeStub frame 2092 __ ret(0); 2093 2094 return start; 2095 } 2096 2097 2098 // Arguments: 2099 // entry - location for return of (post-push) entry 2100 // 2101 // Inputs: 2102 // c_rarg0 - source array address 2103 // c_rarg1 - destination array address 2104 // c_rarg2 - element count, treated as ssize_t, can be zero 2105 // 2106 // Side Effects: 2107 // disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the 2108 // no-overlap entry point used by generate_conjoint_long_oop_copy(). 
2109 // 2110 address StubGenerator::generate_disjoint_long_oop_copy(StubGenStubId stub_id, address *entry) { 2111 // aligned is always false -- x86_64 always uses the unaligned code 2112 const bool aligned = false; 2113 bool is_oop; 2114 bool dest_uninitialized; 2115 switch (stub_id) { 2116 case StubGenStubId::jlong_disjoint_arraycopy_id: 2117 is_oop = false; 2118 dest_uninitialized = false; 2119 break; 2120 case StubGenStubId::oop_disjoint_arraycopy_id: 2121 assert(!UseCompressedOops, "inconsistent oop copy size!"); 2122 is_oop = true; 2123 dest_uninitialized = false; 2124 break; 2125 case StubGenStubId::oop_disjoint_arraycopy_uninit_id: 2126 assert(!UseCompressedOops, "inconsistent oop copy size!"); 2127 is_oop = true; 2128 dest_uninitialized = true; 2129 break; 2130 default: 2131 ShouldNotReachHere(); 2132 } 2133 2134 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 2135 #if COMPILER2_OR_JVMCI 2136 if ((!is_oop || bs->supports_avx3_masked_arraycopy()) && VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) { 2137 return generate_disjoint_copy_avx3_masked(stub_id, entry); 2138 } 2139 #endif 2140 2141 __ align(CodeEntryAlignment); 2142 StubCodeMark mark(this, stub_id); 2143 address start = __ pc(); 2144 2145 Label L_copy_bytes, L_copy_8_bytes, L_exit; 2146 const Register from = rdi; // source array address 2147 const Register to = rsi; // destination array address 2148 const Register qword_count = rdx; // elements count 2149 const Register end_from = from; // source array end address 2150 const Register end_to = rcx; // destination array end address 2151 const Register saved_count = r11; 2152 // End pointers are inclusive, and if count is not zero they point 2153 // to the last unit copied: end_to[0] := end_from[0] 2154 2155 __ enter(); // required for proper stackwalking of RuntimeStub frame 2156 // Save no-overlap entry point for generate_conjoint_long_oop_copy() 2157 assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int. 2158 2159 if (entry != nullptr) { 2160 *entry = __ pc(); 2161 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 2162 BLOCK_COMMENT("Entry:"); 2163 } 2164 2165 setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx 2166 // r9 is used to save r15_thread 2167 // 'from', 'to' and 'qword_count' are now valid 2168 2169 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT; 2170 if (dest_uninitialized) { 2171 decorators |= IS_DEST_UNINITIALIZED; 2172 } 2173 if (aligned) { 2174 decorators |= ARRAYCOPY_ALIGNED; 2175 } 2176 2177 BasicType type = is_oop ? T_OBJECT : T_LONG; 2178 bs->arraycopy_prologue(_masm, decorators, type, from, to, qword_count); 2179 { 2180 // UnsafeMemoryAccess page error: continue after unsafe access 2181 UnsafeMemoryAccessMark umam(this, !is_oop && !aligned, true); 2182 2183 // Copy from low to high addresses. Use 'to' as scratch. 
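    //
    // Elements are a full qword here (jlongs, or uncompressed oops), so there is
    // no sub-qword tail.  Each element goes through the BarrierSetAssembler's
    // copy_load_at/copy_store_at so a collector can interpose on oop accesses.
    // Roughly (illustration only; the barrier hooks are shown schematically):
    //
    //   for (int64_t i = -n; i != 0; i++) {
    //     elem_t v = load_with_gc_barrier(&end_from[i + 1]);   // copy_load_at
    //     store_with_gc_barrier(&end_to[i + 1], v);            // copy_store_at
    //   }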
2184 __ lea(end_from, Address(from, qword_count, Address::times_8, -8)); 2185 __ lea(end_to, Address(to, qword_count, Address::times_8, -8)); 2186 __ negptr(qword_count); 2187 __ jmp(L_copy_bytes); 2188 2189 // Copy trailing qwords 2190 __ BIND(L_copy_8_bytes); 2191 bs->copy_load_at(_masm, decorators, type, 8, 2192 rax, Address(end_from, qword_count, Address::times_8, 8), 2193 r10); 2194 bs->copy_store_at(_masm, decorators, type, 8, 2195 Address(end_to, qword_count, Address::times_8, 8), rax, 2196 r10); 2197 __ increment(qword_count); 2198 __ jcc(Assembler::notZero, L_copy_8_bytes); 2199 } 2200 if (is_oop) { 2201 __ jmp(L_exit); 2202 } else { 2203 restore_arg_regs_using_thread(); 2204 INC_COUNTER_NP(SharedRuntime::_jlong_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free 2205 __ xorptr(rax, rax); // return 0 2206 __ vzeroupper(); 2207 __ leave(); // required for proper stackwalking of RuntimeStub frame 2208 __ ret(0); 2209 } 2210 2211 { 2212 // UnsafeMemoryAccess page error: continue after unsafe access 2213 UnsafeMemoryAccessMark umam(this, !is_oop && !aligned, true); 2214 // Copy in multi-bytes chunks 2215 copy_bytes_forward(end_from, end_to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, is_oop ? T_OBJECT : T_LONG); 2216 } 2217 2218 __ BIND(L_exit); 2219 bs->arraycopy_epilogue(_masm, decorators, type, from, to, qword_count); 2220 restore_arg_regs_using_thread(); 2221 INC_COUNTER_NP(is_oop ? SharedRuntime::_oop_array_copy_ctr : 2222 SharedRuntime::_jlong_array_copy_ctr, 2223 rscratch1); // Update counter after rscratch1 is free 2224 __ vzeroupper(); 2225 __ xorptr(rax, rax); // return 0 2226 __ leave(); // required for proper stackwalking of RuntimeStub frame 2227 __ ret(0); 2228 2229 return start; 2230 } 2231 2232 2233 // Arguments: 2234 // entry - location for return of (post-push) entry 2235 // nooverlap_target - entry to branch to if no overlap detected 2236 // is_oop - true => oop array, so generate store check code 2237 // 2238 // Inputs: 2239 // c_rarg0 - source array address 2240 // c_rarg1 - destination array address 2241 // c_rarg2 - element count, treated as ssize_t, can be zero 2242 // 2243 address StubGenerator::generate_conjoint_long_oop_copy(StubGenStubId stub_id, address nooverlap_target, address *entry) { 2244 // aligned is always false -- x86_64 always uses the unaligned code 2245 const bool aligned = false; 2246 bool is_oop; 2247 bool dest_uninitialized; 2248 switch (stub_id) { 2249 case StubGenStubId::jlong_arraycopy_id: 2250 is_oop = false; 2251 dest_uninitialized = false; 2252 break; 2253 case StubGenStubId::oop_arraycopy_id: 2254 assert(!UseCompressedOops, "inconsistent oop copy size!"); 2255 is_oop = true; 2256 dest_uninitialized = false; 2257 break; 2258 case StubGenStubId::oop_arraycopy_uninit_id: 2259 assert(!UseCompressedOops, "inconsistent oop copy size!"); 2260 is_oop = true; 2261 dest_uninitialized = true; 2262 break; 2263 default: 2264 ShouldNotReachHere(); 2265 } 2266 2267 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 2268 #if COMPILER2_OR_JVMCI 2269 if ((!is_oop || bs->supports_avx3_masked_arraycopy()) && VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) { 2270 return generate_conjoint_copy_avx3_masked(stub_id, entry, nooverlap_target); 2271 } 2272 #endif 2273 2274 __ align(CodeEntryAlignment); 2275 StubCodeMark mark(this, stub_id); 2276 address start = __ pc(); 2277 2278 Label L_copy_bytes, L_copy_8_bytes, L_exit; 2279 const Register from = rdi; 
// source array address 2280 const Register to = rsi; // destination array address 2281 const Register qword_count = rdx; // elements count 2282 const Register saved_count = rcx; 2283 2284 __ enter(); // required for proper stackwalking of RuntimeStub frame 2285 assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int. 2286 2287 if (entry != nullptr) { 2288 *entry = __ pc(); 2289 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 2290 BLOCK_COMMENT("Entry:"); 2291 } 2292 2293 array_overlap_test(nooverlap_target, Address::times_8); 2294 setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx 2295 // r9 is used to save r15_thread 2296 // 'from', 'to' and 'qword_count' are now valid 2297 2298 DecoratorSet decorators = IN_HEAP | IS_ARRAY; 2299 if (dest_uninitialized) { 2300 decorators |= IS_DEST_UNINITIALIZED; 2301 } 2302 if (aligned) { 2303 decorators |= ARRAYCOPY_ALIGNED; 2304 } 2305 2306 BasicType type = is_oop ? T_OBJECT : T_LONG; 2307 bs->arraycopy_prologue(_masm, decorators, type, from, to, qword_count); 2308 { 2309 // UnsafeMemoryAccess page error: continue after unsafe access 2310 UnsafeMemoryAccessMark umam(this, !is_oop && !aligned, true); 2311 2312 __ jmp(L_copy_bytes); 2313 2314 // Copy trailing qwords 2315 __ BIND(L_copy_8_bytes); 2316 bs->copy_load_at(_masm, decorators, type, 8, 2317 rax, Address(from, qword_count, Address::times_8, -8), 2318 r10); 2319 bs->copy_store_at(_masm, decorators, type, 8, 2320 Address(to, qword_count, Address::times_8, -8), rax, 2321 r10); 2322 __ decrement(qword_count); 2323 __ jcc(Assembler::notZero, L_copy_8_bytes); 2324 } 2325 if (is_oop) { 2326 __ jmp(L_exit); 2327 } else { 2328 restore_arg_regs_using_thread(); 2329 INC_COUNTER_NP(SharedRuntime::_jlong_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free 2330 __ xorptr(rax, rax); // return 0 2331 __ vzeroupper(); 2332 __ leave(); // required for proper stackwalking of RuntimeStub frame 2333 __ ret(0); 2334 } 2335 { 2336 // UnsafeMemoryAccess page error: continue after unsafe access 2337 UnsafeMemoryAccessMark umam(this, !is_oop && !aligned, true); 2338 2339 // Copy in multi-bytes chunks 2340 copy_bytes_backward(from, to, qword_count, rax, r10, L_copy_bytes, L_copy_8_bytes, decorators, is_oop ? T_OBJECT : T_LONG); 2341 } 2342 __ BIND(L_exit); 2343 bs->arraycopy_epilogue(_masm, decorators, type, from, to, qword_count); 2344 restore_arg_regs_using_thread(); 2345 INC_COUNTER_NP(is_oop ? SharedRuntime::_oop_array_copy_ctr : 2346 SharedRuntime::_jlong_array_copy_ctr, 2347 rscratch1); // Update counter after rscratch1 is free 2348 __ vzeroupper(); 2349 __ xorptr(rax, rax); // return 0 2350 __ leave(); // required for proper stackwalking of RuntimeStub frame 2351 __ ret(0); 2352 2353 return start; 2354 } 2355 2356 2357 // Helper for generating a dynamic type check. 2358 // Smashes no registers. 2359 void StubGenerator::generate_type_check(Register sub_klass, 2360 Register super_check_offset, 2361 Register super_klass, 2362 Label& L_success) { 2363 assert_different_registers(sub_klass, super_check_offset, super_klass); 2364 2365 BLOCK_COMMENT("type_check:"); 2366 2367 Label L_miss; 2368 2369 __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, nullptr, 2370 super_check_offset); 2371 __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, nullptr); 2372 2373 // Fall through on failure! 
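  //
  // The fast/slow paths above amount to, approximately (illustration only):
  //
  //   if (sub_klass == super_klass)                          goto L_success;
  //   if (*(sub_klass + super_check_offset) == super_klass)  goto L_success;
  //   if (secondary_supers(sub_klass) contains super_klass)  goto L_success;
  //   // otherwise fall through to L_miss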
2374 __ BIND(L_miss); 2375 } 2376 2377 // 2378 // Generate checkcasting array copy stub 2379 // 2380 // Input: 2381 // c_rarg0 - source array address 2382 // c_rarg1 - destination array address 2383 // c_rarg2 - element count, treated as ssize_t, can be zero 2384 // c_rarg3 - size_t ckoff (super_check_offset) 2385 // not Win64 2386 // c_rarg4 - oop ckval (super_klass) 2387 // Win64 2388 // rsp+40 - oop ckval (super_klass) 2389 // 2390 // Output: 2391 // rax == 0 - success 2392 // rax == -1^K - failure, where K is partial transfer count 2393 // 2394 address StubGenerator::generate_checkcast_copy(StubGenStubId stub_id, address *entry) { 2395 2396 bool dest_uninitialized; 2397 switch (stub_id) { 2398 case StubGenStubId::checkcast_arraycopy_id: 2399 dest_uninitialized = false; 2400 break; 2401 case StubGenStubId::checkcast_arraycopy_uninit_id: 2402 dest_uninitialized = true; 2403 break; 2404 default: 2405 ShouldNotReachHere(); 2406 } 2407 2408 Label L_load_element, L_store_element, L_do_card_marks, L_done; 2409 2410 // Input registers (after setup_arg_regs) 2411 const Register from = rdi; // source array address 2412 const Register to = rsi; // destination array address 2413 const Register length = rdx; // elements count 2414 const Register ckoff = rcx; // super_check_offset 2415 const Register ckval = r8; // super_klass 2416 2417 // Registers used as temps (r13, r14 are save-on-entry) 2418 const Register end_from = from; // source array end address 2419 const Register end_to = r13; // destination array end address 2420 const Register count = rdx; // -(count_remaining) 2421 const Register r14_length = r14; // saved copy of length 2422 // End pointers are inclusive, and if length is not zero they point 2423 // to the last unit copied: end_to[0] := end_from[0] 2424 2425 const Register rax_oop = rax; // actual oop copied 2426 const Register r11_klass = r11; // oop._klass 2427 2428 //--------------------------------------------------------------- 2429 // Assembler stub will be used for this call to arraycopy 2430 // if the two arrays are subtypes of Object[] but the 2431 // destination array type is not equal to or a supertype 2432 // of the source type. Each element must be separately 2433 // checked. 2434 2435 __ align(CodeEntryAlignment); 2436 StubCodeMark mark(this, stub_id); 2437 address start = __ pc(); 2438 2439 __ enter(); // required for proper stackwalking of RuntimeStub frame 2440 2441 #ifdef ASSERT 2442 // caller guarantees that the arrays really are different 2443 // otherwise, we would have to make conjoint checks 2444 { Label L; 2445 array_overlap_test(L, TIMES_OOP); 2446 __ stop("checkcast_copy within a single array"); 2447 __ bind(L); 2448 } 2449 #endif //ASSERT 2450 2451 setup_arg_regs_using_thread(4); // from => rdi, to => rsi, length => rdx 2452 // ckoff => rcx, ckval => r8 2453 // r9 is used to save r15_thread 2454 #ifdef _WIN64 2455 // last argument (#4) is on stack on Win64 2456 __ movptr(ckval, Address(rsp, 6 * wordSize)); 2457 #endif 2458 2459 // Caller of this entry point must set up the argument registers. 
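  // As documented above, rax returns 0 on success or -1^K on failure.  A caller
  // could recover the partial transfer count roughly like this (illustration
  // only; 'res' stands for the stub's return value):
  //
  //   if (res != 0) {
  //     size_t K = ~(size_t)res;   // -1 ^ K  ==  ~K
  //     // elements [0, K) were copied and had their store barriers applied
  //   }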
2460 if (entry != nullptr) { 2461 *entry = __ pc(); 2462 BLOCK_COMMENT("Entry:"); 2463 } 2464 2465 // allocate spill slots for r13, r14 2466 enum { 2467 saved_r13_offset, 2468 saved_r14_offset, 2469 saved_r10_offset, 2470 saved_rbp_offset 2471 }; 2472 __ subptr(rsp, saved_rbp_offset * wordSize); 2473 __ movptr(Address(rsp, saved_r13_offset * wordSize), r13); 2474 __ movptr(Address(rsp, saved_r14_offset * wordSize), r14); 2475 __ movptr(Address(rsp, saved_r10_offset * wordSize), r10); 2476 2477 #ifdef ASSERT 2478 Label L2; 2479 __ get_thread(r14); 2480 __ cmpptr(r15_thread, r14); 2481 __ jcc(Assembler::equal, L2); 2482 __ stop("StubRoutines::call_stub: r15_thread is modified by call"); 2483 __ bind(L2); 2484 #endif // ASSERT 2485 2486 // check that int operands are properly extended to size_t 2487 assert_clean_int(length, rax); 2488 assert_clean_int(ckoff, rax); 2489 2490 #ifdef ASSERT 2491 BLOCK_COMMENT("assert consistent ckoff/ckval"); 2492 // The ckoff and ckval must be mutually consistent, 2493 // even though caller generates both. 2494 { Label L; 2495 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 2496 __ cmpl(ckoff, Address(ckval, sco_offset)); 2497 __ jcc(Assembler::equal, L); 2498 __ stop("super_check_offset inconsistent"); 2499 __ bind(L); 2500 } 2501 #endif //ASSERT 2502 2503 // Loop-invariant addresses. They are exclusive end pointers. 2504 Address end_from_addr(from, length, TIMES_OOP, 0); 2505 Address end_to_addr(to, length, TIMES_OOP, 0); 2506 // Loop-variant addresses. They assume post-incremented count < 0. 2507 Address from_element_addr(end_from, count, TIMES_OOP, 0); 2508 Address to_element_addr(end_to, count, TIMES_OOP, 0); 2509 2510 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT; 2511 if (dest_uninitialized) { 2512 decorators |= IS_DEST_UNINITIALIZED; 2513 } 2514 2515 BasicType type = T_OBJECT; 2516 size_t element_size = UseCompressedOops ? 4 : 8; 2517 2518 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 2519 bs->arraycopy_prologue(_masm, decorators, type, from, to, count); 2520 2521 // Copy from low to high addresses, indexed from the end of each array. 2522 __ lea(end_from, end_from_addr); 2523 __ lea(end_to, end_to_addr); 2524 __ movptr(r14_length, length); // save a copy of the length 2525 assert(length == count, ""); // else fix next line: 2526 __ negptr(count); // negate and test the length 2527 __ jcc(Assembler::notZero, L_load_element); 2528 2529 // Empty array: Nothing to do. 2530 __ xorptr(rax, rax); // return 0 on (trivial) success 2531 __ jmp(L_done); 2532 2533 // ======== begin loop ======== 2534 // (Loop is rotated; its entry is L_load_element.) 2535 // Loop control: 2536 // for (count = -count; count != 0; count++) 2537 // Base pointers src, dst are biased by 8*(count-1),to last element. 
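  //
  // A rough C model of the rotated loop (illustration only; GC barriers and
  // narrow-oop encoding elided; end_from/end_to are the exclusive end pointers):
  //
  //   for (count = -count; count != 0; count++) {
  //     oop el = end_from[count];                                  // L_load_element
  //     if (el != nullptr && !is_subtype_of(klass_of(el), super_klass))
  //       break;                                                   // partial copy, return -1^K
  //     end_to[count] = el;                                        // L_store_element
  //   }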
2538 __ align(OptoLoopAlignment); 2539 2540 __ BIND(L_store_element); 2541 bs->copy_store_at(_masm, 2542 decorators, 2543 type, 2544 element_size, 2545 to_element_addr, 2546 rax_oop, 2547 r10); 2548 __ increment(count); // increment the count toward zero 2549 __ jcc(Assembler::zero, L_do_card_marks); 2550 2551 // ======== loop entry is here ======== 2552 __ BIND(L_load_element); 2553 bs->copy_load_at(_masm, 2554 decorators, 2555 type, 2556 element_size, 2557 rax_oop, 2558 from_element_addr, 2559 r10); 2560 __ testptr(rax_oop, rax_oop); 2561 __ jcc(Assembler::zero, L_store_element); 2562 2563 __ load_klass(r11_klass, rax_oop, rscratch1);// query the object klass 2564 generate_type_check(r11_klass, ckoff, ckval, L_store_element); 2565 // ======== end loop ======== 2566 2567 // It was a real error; we must depend on the caller to finish the job. 2568 // Register rdx = -1 * number of *remaining* oops, r14 = *total* oops. 2569 // Emit GC store barriers for the oops we have copied (r14 + rdx), 2570 // and report their number to the caller. 2571 assert_different_registers(rax, r14_length, count, to, end_to, rcx, rscratch1); 2572 Label L_post_barrier; 2573 __ addptr(r14_length, count); // K = (original - remaining) oops 2574 __ movptr(rax, r14_length); // save the value 2575 __ notptr(rax); // report (-1^K) to caller (does not affect flags) 2576 __ jccb(Assembler::notZero, L_post_barrier); 2577 __ jmp(L_done); // K == 0, nothing was copied, skip post barrier 2578 2579 // Come here on success only. 2580 __ BIND(L_do_card_marks); 2581 __ xorptr(rax, rax); // return 0 on success 2582 2583 __ BIND(L_post_barrier); 2584 bs->arraycopy_epilogue(_masm, decorators, type, from, to, r14_length); 2585 2586 // Common exit point (success or failure). 2587 __ BIND(L_done); 2588 __ movptr(r13, Address(rsp, saved_r13_offset * wordSize)); 2589 __ movptr(r14, Address(rsp, saved_r14_offset * wordSize)); 2590 __ movptr(r10, Address(rsp, saved_r10_offset * wordSize)); 2591 restore_arg_regs_using_thread(); 2592 INC_COUNTER_NP(SharedRuntime::_checkcast_array_copy_ctr, rscratch1); // Update counter after rscratch1 is free 2593 __ leave(); // required for proper stackwalking of RuntimeStub frame 2594 __ ret(0); 2595 2596 return start; 2597 } 2598 2599 2600 // Generate 'unsafe' array copy stub 2601 // Though just as safe as the other stubs, it takes an unscaled 2602 // size_t argument instead of an element count. 2603 // 2604 // Input: 2605 // c_rarg0 - source array address 2606 // c_rarg1 - destination array address 2607 // c_rarg2 - byte count, treated as ssize_t, can be zero 2608 // 2609 // Examines the alignment of the operands and dispatches 2610 // to a long, int, short, or byte copy loop. 
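// In C terms the dispatch is roughly (illustration only):
//
//   uintptr_t bits = (uintptr_t)from | (uintptr_t)to | size;
//   if      ((bits & (BytesPerLong  - 1)) == 0) long_copy (from, to, size >> LogBytesPerLong);
//   else if ((bits & (BytesPerInt   - 1)) == 0) int_copy  (from, to, size >> LogBytesPerInt);
//   else if ((bits & (BytesPerShort - 1)) == 0) short_copy(from, to, size >> LogBytesPerShort);
//   else                                        byte_copy (from, to, size);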
2611 // 2612 address StubGenerator::generate_unsafe_copy(address byte_copy_entry, address short_copy_entry, 2613 address int_copy_entry, address long_copy_entry) { 2614 2615 Label L_long_aligned, L_int_aligned, L_short_aligned; 2616 2617 // Input registers (before setup_arg_regs) 2618 const Register from = c_rarg0; // source array address 2619 const Register to = c_rarg1; // destination array address 2620 const Register size = c_rarg2; // byte count (size_t) 2621 2622 // Register used as a temp 2623 const Register bits = rax; // test copy of low bits 2624 2625 __ align(CodeEntryAlignment); 2626 StubGenStubId stub_id = StubGenStubId::unsafe_arraycopy_id; 2627 StubCodeMark mark(this, stub_id); 2628 address start = __ pc(); 2629 2630 __ enter(); // required for proper stackwalking of RuntimeStub frame 2631 2632 // bump this on entry, not on exit: 2633 INC_COUNTER_NP(SharedRuntime::_unsafe_array_copy_ctr, rscratch1); 2634 2635 __ mov(bits, from); 2636 __ orptr(bits, to); 2637 __ orptr(bits, size); 2638 2639 __ testb(bits, BytesPerLong-1); 2640 __ jccb(Assembler::zero, L_long_aligned); 2641 2642 __ testb(bits, BytesPerInt-1); 2643 __ jccb(Assembler::zero, L_int_aligned); 2644 2645 __ testb(bits, BytesPerShort-1); 2646 __ jump_cc(Assembler::notZero, RuntimeAddress(byte_copy_entry)); 2647 2648 __ BIND(L_short_aligned); 2649 __ shrptr(size, LogBytesPerShort); // size => short_count 2650 __ jump(RuntimeAddress(short_copy_entry)); 2651 2652 __ BIND(L_int_aligned); 2653 __ shrptr(size, LogBytesPerInt); // size => int_count 2654 __ jump(RuntimeAddress(int_copy_entry)); 2655 2656 __ BIND(L_long_aligned); 2657 __ shrptr(size, LogBytesPerLong); // size => qword_count 2658 __ jump(RuntimeAddress(long_copy_entry)); 2659 2660 return start; 2661 } 2662 2663 2664 // Static enum for helper 2665 enum USM_TYPE {USM_SHORT, USM_DWORD, USM_QUADWORD}; 2666 // Helper for generate_unsafe_setmemory 2667 // 2668 // Atomically fill an array of memory using 2-, 4-, or 8-byte chunks 2669 static void do_setmemory_atomic_loop(USM_TYPE type, Register dest, 2670 Register size, Register wide_value, 2671 Register tmp, Label& L_exit, 2672 MacroAssembler *_masm) { 2673 Label L_Loop, L_Tail, L_TailLoop; 2674 2675 int shiftval = 0; 2676 int incr = 0; 2677 2678 switch (type) { 2679 case USM_SHORT: 2680 shiftval = 1; 2681 incr = 16; 2682 break; 2683 case USM_DWORD: 2684 shiftval = 2; 2685 incr = 32; 2686 break; 2687 case USM_QUADWORD: 2688 shiftval = 3; 2689 incr = 64; 2690 break; 2691 } 2692 2693 // At this point, we know the lower bits of size are zero 2694 __ shrq(size, shiftval); 2695 // size now has number of X-byte chunks (2, 4 or 8) 2696 2697 // Number of (8*X)-byte chunks into tmp 2698 __ movq(tmp, size); 2699 __ shrq(tmp, 3); 2700 __ jccb(Assembler::zero, L_Tail); 2701 2702 __ BIND(L_Loop); 2703 2704 // Unroll 8 stores 2705 for (int i = 0; i < 8; i++) { 2706 switch (type) { 2707 case USM_SHORT: 2708 __ movw(Address(dest, (2 * i)), wide_value); 2709 break; 2710 case USM_DWORD: 2711 __ movl(Address(dest, (4 * i)), wide_value); 2712 break; 2713 case USM_QUADWORD: 2714 __ movq(Address(dest, (8 * i)), wide_value); 2715 break; 2716 } 2717 } 2718 __ addq(dest, incr); 2719 __ decrementq(tmp); 2720 __ jccb(Assembler::notZero, L_Loop); 2721 2722 __ BIND(L_Tail); 2723 2724 // Find number of remaining X-byte chunks 2725 __ andq(size, 0x7); 2726 2727 // If zero, then we're done 2728 __ jccb(Assembler::zero, L_exit); 2729 2730 __ BIND(L_TailLoop); 2731 2732 switch (type) { 2733 case USM_SHORT: 2734 __ movw(Address(dest, 0), wide_value); 
2735 break; 2736 case USM_DWORD: 2737 __ movl(Address(dest, 0), wide_value); 2738 break; 2739 case USM_QUADWORD: 2740 __ movq(Address(dest, 0), wide_value); 2741 break; 2742 } 2743 __ addq(dest, incr >> 3); 2744 __ decrementq(size); 2745 __ jccb(Assembler::notZero, L_TailLoop); 2746 } 2747 2748 // Generate 'unsafe' set memory stub 2749 // Though just as safe as the other stubs, it takes an unscaled 2750 // size_t (# bytes) argument instead of an element count. 2751 // 2752 // Input: 2753 // c_rarg0 - destination array address 2754 // c_rarg1 - byte count (size_t) 2755 // c_rarg2 - byte value 2756 // 2757 // Examines the alignment of the operands and dispatches 2758 // to an int, short, or byte fill loop. 2759 // 2760 address StubGenerator::generate_unsafe_setmemory(address unsafe_byte_fill) { 2761 __ align(CodeEntryAlignment); 2762 StubGenStubId stub_id = StubGenStubId::unsafe_setmemory_id; 2763 StubCodeMark mark(this, stub_id); 2764 address start = __ pc(); 2765 __ enter(); // required for proper stackwalking of RuntimeStub frame 2766 2767 assert(unsafe_byte_fill != nullptr, "Invalid call"); 2768 2769 // bump this on entry, not on exit: 2770 INC_COUNTER_NP(SharedRuntime::_unsafe_set_memory_ctr, rscratch1); 2771 2772 { 2773 Label L_exit, L_fillQuadwords, L_fillDwords, L_fillBytes; 2774 2775 const Register dest = c_rarg0; 2776 const Register size = c_rarg1; 2777 const Register byteVal = c_rarg2; 2778 const Register wide_value = rax; 2779 const Register rScratch1 = r10; 2780 2781 assert_different_registers(dest, size, byteVal, wide_value, rScratch1); 2782 2783 // fill_to_memory_atomic(unsigned char*, unsigned long, unsigned char) 2784 2785 __ testq(size, size); 2786 __ jcc(Assembler::zero, L_exit); 2787 2788 // Propagate byte to full Register 2789 __ movzbl(rScratch1, byteVal); 2790 __ mov64(wide_value, 0x0101010101010101ULL); 2791 __ imulq(wide_value, rScratch1); 2792 2793 // Check for pointer & size alignment 2794 __ movq(rScratch1, dest); 2795 __ orq(rScratch1, size); 2796 2797 __ testb(rScratch1, 7); 2798 __ jcc(Assembler::equal, L_fillQuadwords); 2799 2800 __ testb(rScratch1, 3); 2801 __ jcc(Assembler::equal, L_fillDwords); 2802 2803 __ testb(rScratch1, 1); 2804 __ jcc(Assembler::notEqual, L_fillBytes); 2805 2806 // Fill words 2807 { 2808 UnsafeMemoryAccessMark umam(this, true, true); 2809 2810 // At this point, we know the lower bit of size is zero and a 2811 // multiple of 2 2812 do_setmemory_atomic_loop(USM_SHORT, dest, size, wide_value, rScratch1, 2813 L_exit, _masm); 2814 } 2815 __ jmpb(L_exit); 2816 2817 __ BIND(L_fillQuadwords); 2818 2819 // Fill QUADWORDs 2820 { 2821 UnsafeMemoryAccessMark umam(this, true, true); 2822 2823 // At this point, we know the lower 3 bits of size are zero and a 2824 // multiple of 8 2825 do_setmemory_atomic_loop(USM_QUADWORD, dest, size, wide_value, rScratch1, 2826 L_exit, _masm); 2827 } 2828 __ BIND(L_exit); 2829 2830 __ leave(); // required for proper stackwalking of RuntimeStub frame 2831 __ ret(0); 2832 2833 __ BIND(L_fillDwords); 2834 2835 // Fill DWORDs 2836 { 2837 UnsafeMemoryAccessMark umam(this, true, true); 2838 2839 // At this point, we know the lower 2 bits of size are zero and a 2840 // multiple of 4 2841 do_setmemory_atomic_loop(USM_DWORD, dest, size, wide_value, rScratch1, 2842 L_exit, _masm); 2843 } 2844 __ jmpb(L_exit); 2845 2846 __ BIND(L_fillBytes); 2847 // Set up for tail call to previously generated byte fill routine 2848 // Parameter order is (ptr, byteVal, size) 2849 __ xchgq(c_rarg1, c_rarg2); 2850 __ leave(); // Clear effect of 
enter()
    __ jump(RuntimeAddress(unsafe_byte_fill));
  }

  return start;
}

// Perform range checks on the proposed arraycopy.
// Kills temp, but nothing else.
// Also, clean the sign bits of src_pos and dst_pos.
void StubGenerator::arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
                                           Register src_pos, // source position (c_rarg1)
                                           Register dst,     // destination array oop (c_rarg2)
                                           Register dst_pos, // destination position (c_rarg3)
                                           Register length,
                                           Register temp,
                                           Label& L_failed) {
  BLOCK_COMMENT("arraycopy_range_checks:");

  //  if (src_pos + length > arrayOop(src)->length())  FAIL;
  __ movl(temp, length);
  __ addl(temp, src_pos);             // src_pos + length
  __ cmpl(temp, Address(src, arrayOopDesc::length_offset_in_bytes()));
  __ jcc(Assembler::above, L_failed);

  //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
  __ movl(temp, length);
  __ addl(temp, dst_pos);             // dst_pos + length
  __ cmpl(temp, Address(dst, arrayOopDesc::length_offset_in_bytes()));
  __ jcc(Assembler::above, L_failed);

  // Have to clean up high 32-bits of 'src_pos' and 'dst_pos'.
  // Move with sign extension can be used since they are positive.
  __ movslq(src_pos, src_pos);
  __ movslq(dst_pos, dst_pos);

  BLOCK_COMMENT("arraycopy_range_checks done");
}


// Generate generic array copy stubs
//
// Input:
//   c_rarg0    -  src oop
//   c_rarg1    -  src_pos (32-bits)
//   c_rarg2    -  dst oop
//   c_rarg3    -  dst_pos (32-bits)
// not Win64
//   c_rarg4    -  element count (32-bits)
// Win64
//   rsp+40     -  element count (32-bits)
//
// Output:
//   rax ==  0  -  success
//   rax == -1^K - failure, where K is partial transfer count
//
address StubGenerator::generate_generic_copy(address byte_copy_entry, address short_copy_entry,
                                             address int_copy_entry, address oop_copy_entry,
                                             address long_copy_entry, address checkcast_copy_entry) {

  Label L_failed, L_failed_0, L_objArray;
  Label L_copy_shorts, L_copy_ints, L_copy_longs;

  // Input registers
  const Register src        = c_rarg0;  // source array oop
  const Register src_pos    = c_rarg1;  // source position
  const Register dst        = c_rarg2;  // destination array oop
  const Register dst_pos    = c_rarg3;  // destination position
#ifndef _WIN64
  const Register length     = c_rarg4;
  const Register rklass_tmp = r9;   // load_klass
#else
  const Address  length(rsp, 7 * wordSize);  // elements count is on stack on Win64
  const Register rklass_tmp = rdi;  // load_klass
#endif

  { int modulus = CodeEntryAlignment;
    int target  = modulus - 5; // 5 = sizeof jmp(L_failed)
    int advance = target - (__ offset() % modulus);
    if (advance < 0)  advance += modulus;
    if (advance > 0)  __ nop(advance);
  }
  StubGenStubId stub_id = StubGenStubId::generic_arraycopy_id;
  StubCodeMark mark(this, stub_id);

  // Short-hop target to L_failed.  Makes for denser prologue code.
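  // The argument tests below use jccb, a two-byte short branch with a +/-127
  // byte range that cannot reach the real L_failed epilogue, so a trampoline
  // is emitted just before the aligned entry.  Layout produced by the nop
  // padding above (illustration only):
  //
  //   ...nops...                      // 'advance' bytes of padding
  //   L_failed_0:  jmp L_failed       // 5 bytes, reachable from the jccb tests
  //   start:                          // falls on a CodeEntryAlignment boundary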
  __ BIND(L_failed_0);
  __ jmp(L_failed);
  assert(__ offset() % CodeEntryAlignment == 0, "no further alignment needed");

  __ align(CodeEntryAlignment);
  address start = __ pc();

  __ enter(); // required for proper stackwalking of RuntimeStub frame

#ifdef _WIN64
  __ push(rklass_tmp); // rdi is callee-save on Windows
#endif

  // bump this on entry, not on exit:
  INC_COUNTER_NP(SharedRuntime::_generic_array_copy_ctr, rscratch1);

  //-----------------------------------------------------------------------
  // Assembler stub will be used for this call to arraycopy
  // if the following conditions are met:
  //
  // (1) src and dst must not be null.
  // (2) src_pos must not be negative.
  // (3) dst_pos must not be negative.
  // (4) length  must not be negative.
  // (5) src klass and dst klass should be the same and not null.
  // (6) src and dst should be arrays.
  // (7) src_pos + length must not exceed length of src.
  // (8) dst_pos + length must not exceed length of dst.
  //

  //  if (src == nullptr) return -1;
  __ testptr(src, src);         // src oop
  size_t j1off = __ offset();
  __ jccb(Assembler::zero, L_failed_0);

  //  if (src_pos < 0) return -1;
  __ testl(src_pos, src_pos); // src_pos (32-bits)
  __ jccb(Assembler::negative, L_failed_0);

  //  if (dst == nullptr) return -1;
  __ testptr(dst, dst);         // dst oop
  __ jccb(Assembler::zero, L_failed_0);

  //  if (dst_pos < 0) return -1;
  __ testl(dst_pos, dst_pos); // dst_pos (32-bits)
  size_t j4off = __ offset();
  __ jccb(Assembler::negative, L_failed_0);

  // The first four tests are very dense code,
  // but not quite dense enough to put four
  // jumps in a 16-byte instruction fetch buffer.
  // That's good, because some branch predictors
  // do not like jumps so close together.
  // Make sure of this.
  guarantee(((j1off ^ j4off) & ~15) != 0, "I$ line of 1st & 4th jumps");

  // registers used as temp
  const Register r11_length    = r11; // elements count to copy
  const Register r10_src_klass = r10; // array klass

  //  if (length < 0) return -1;
  __ movl(r11_length, length);        // length (elements count, 32-bits value)
  __ testl(r11_length, r11_length);
  __ jccb(Assembler::negative, L_failed_0);

  __ load_klass(r10_src_klass, src, rklass_tmp);
#ifdef ASSERT
  //  assert(src->klass() != nullptr);
  {
    BLOCK_COMMENT("assert klasses not null {");
    Label L1, L2;
    __ testptr(r10_src_klass, r10_src_klass);
    __ jcc(Assembler::notZero, L2);   // it is broken if klass is null
    __ bind(L1);
    __ stop("broken null klass");
    __ bind(L2);
    __ load_klass(rax, dst, rklass_tmp);
    __ cmpq(rax, 0);
    __ jcc(Assembler::equal, L1);     // this would be broken also
    BLOCK_COMMENT("} assert klasses not null done");
  }
#endif

  // Load layout helper (32-bits)
  //
  //  |array_tag|     | header_size | element_type |     |log2_element_size|
  //   32        30    24            16              8     2                 0
  //
  //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
  //

  const int lh_offset = in_bytes(Klass::layout_helper_offset());

  // Handle objArrays completely differently...
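  // For the typeArray path further down, the layout helper decodes roughly as
  // follows (illustration only; the authoritative shifts/masks are the
  // Klass::_lh_* constants used later in this stub):
  //
  //   int tag        =  lh >> Klass::_lh_array_tag_shift;   // 0x3 typeArray, 0x2 objArray
  //   int hdr_size   = (lh >> Klass::_lh_header_size_shift) & Klass::_lh_header_size_mask;
  //   int log2_esize =  lh & Klass::_lh_log2_element_size_mask;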
3030 const jint objArray_lh = Klass::array_layout_helper(T_OBJECT); 3031 __ cmpl(Address(r10_src_klass, lh_offset), objArray_lh); 3032 __ jcc(Assembler::equal, L_objArray); 3033 3034 // if (src->klass() != dst->klass()) return -1; 3035 __ load_klass(rax, dst, rklass_tmp); 3036 __ cmpq(r10_src_klass, rax); 3037 __ jcc(Assembler::notEqual, L_failed); 3038 3039 // Check for flat inline type array -> return -1 3040 __ test_flat_array_oop(src, rax, L_failed); 3041 3042 // Check for null-free (non-flat) inline type array -> handle as object array 3043 __ test_null_free_array_oop(src, rax, L_objArray); 3044 3045 const Register rax_lh = rax; // layout helper 3046 __ movl(rax_lh, Address(r10_src_klass, lh_offset)); 3047 3048 // Check for flat inline type array -> return -1 3049 __ testl(rax_lh, Klass::_lh_array_tag_flat_value_bit_inplace); 3050 __ jcc(Assembler::notZero, L_failed); 3051 3052 // if (!src->is_Array()) return -1; 3053 __ cmpl(rax_lh, Klass::_lh_neutral_value); 3054 __ jcc(Assembler::greaterEqual, L_failed); 3055 3056 // At this point, it is known to be a typeArray (array_tag 0x3). 3057 #ifdef ASSERT 3058 { 3059 BLOCK_COMMENT("assert primitive array {"); 3060 Label L; 3061 __ movl(rklass_tmp, rax_lh); 3062 __ sarl(rklass_tmp, Klass::_lh_array_tag_shift); 3063 __ cmpl(rklass_tmp, Klass::_lh_array_tag_type_value); 3064 __ jcc(Assembler::equal, L); 3065 __ stop("must be a primitive array"); 3066 __ bind(L); 3067 BLOCK_COMMENT("} assert primitive array done"); 3068 } 3069 #endif 3070 3071 arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length, 3072 r10, L_failed); 3073 3074 // TypeArrayKlass 3075 // 3076 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize); 3077 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize); 3078 // 3079 3080 const Register r10_offset = r10; // array offset 3081 const Register rax_elsize = rax_lh; // element size 3082 3083 __ movl(r10_offset, rax_lh); 3084 __ shrl(r10_offset, Klass::_lh_header_size_shift); 3085 __ andptr(r10_offset, Klass::_lh_header_size_mask); // array_offset 3086 __ addptr(src, r10_offset); // src array offset 3087 __ addptr(dst, r10_offset); // dst array offset 3088 BLOCK_COMMENT("choose copy loop based on element size"); 3089 __ andl(rax_lh, Klass::_lh_log2_element_size_mask); // rax_lh -> rax_elsize 3090 3091 #ifdef _WIN64 3092 __ pop(rklass_tmp); // Restore callee-save rdi 3093 #endif 3094 3095 // next registers should be set before the jump to corresponding stub 3096 const Register from = c_rarg0; // source array address 3097 const Register to = c_rarg1; // destination array address 3098 const Register count = c_rarg2; // elements count 3099 3100 // 'from', 'to', 'count' registers should be set in such order 3101 // since they are the same as 'src', 'src_pos', 'dst'. 
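  //
  // In effect the dispatch below performs (illustration only):
  //
  //   // src and dst already include the array header offset (added above)
  //   from  = src + (src_pos << log2_elsize);
  //   to    = dst + (dst_pos << log2_elsize);
  //   count = length;
  //   switch (log2_elsize) {                  // rax_elsize
  //     case 0:                jump to byte_copy_entry;
  //     case LogBytesPerShort: jump to short_copy_entry;
  //     case LogBytesPerInt:   jump to int_copy_entry;
  //     default:               jump to long_copy_entry;   // asserted to be LogBytesPerLong
  //   }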

  __ cmpl(rax_elsize, 0);
  __ jccb(Assembler::notEqual, L_copy_shorts);
  __ lea(from, Address(src, src_pos, Address::times_1, 0)); // src_addr
  __ lea(to,   Address(dst, dst_pos, Address::times_1, 0)); // dst_addr
  __ movl2ptr(count, r11_length); // length
  __ jump(RuntimeAddress(byte_copy_entry));

  __ BIND(L_copy_shorts);
  __ cmpl(rax_elsize, LogBytesPerShort);
  __ jccb(Assembler::notEqual, L_copy_ints);
  __ lea(from, Address(src, src_pos, Address::times_2, 0)); // src_addr
  __ lea(to,   Address(dst, dst_pos, Address::times_2, 0)); // dst_addr
  __ movl2ptr(count, r11_length); // length
  __ jump(RuntimeAddress(short_copy_entry));

  __ BIND(L_copy_ints);
  __ cmpl(rax_elsize, LogBytesPerInt);
  __ jccb(Assembler::notEqual, L_copy_longs);
  __ lea(from, Address(src, src_pos, Address::times_4, 0)); // src_addr
  __ lea(to,   Address(dst, dst_pos, Address::times_4, 0)); // dst_addr
  __ movl2ptr(count, r11_length); // length
  __ jump(RuntimeAddress(int_copy_entry));

  __ BIND(L_copy_longs);
#ifdef ASSERT
  {
    BLOCK_COMMENT("assert long copy {");
    Label L;
    __ cmpl(rax_elsize, LogBytesPerLong);
    __ jcc(Assembler::equal, L);
    __ stop("must be long copy, but elsize is wrong");
    __ bind(L);
    BLOCK_COMMENT("} assert long copy done");
  }
#endif
  __ lea(from, Address(src, src_pos, Address::times_8, 0)); // src_addr
  __ lea(to,   Address(dst, dst_pos, Address::times_8, 0)); // dst_addr
  __ movl2ptr(count, r11_length); // length
  __ jump(RuntimeAddress(long_copy_entry));

  // ObjArrayKlass
  __ BIND(L_objArray);
  // live at this point:  r10_src_klass, r11_length, src[_pos], dst[_pos]

  Label L_plain_copy, L_checkcast_copy;
  // test array classes for subtyping
  __ load_klass(rax, dst, rklass_tmp);
  __ cmpq(r10_src_klass, rax); // usual case is exact equality
  __ jcc(Assembler::notEqual, L_checkcast_copy);

  // Identically typed arrays can be copied without element-wise checks.
  arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
                         r10, L_failed);

  __ lea(from, Address(src, src_pos, TIMES_OOP,
                       arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // src_addr
  __ lea(to,   Address(dst, dst_pos, TIMES_OOP,
                       arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // dst_addr
  __ movl2ptr(count, r11_length); // length
  __ BIND(L_plain_copy);
#ifdef _WIN64
  __ pop(rklass_tmp); // Restore callee-save rdi
#endif
  __ jump(RuntimeAddress(oop_copy_entry));

  __ BIND(L_checkcast_copy);
  // live at this point:  r10_src_klass, r11_length, rax (dst_klass)
  {
    // Before looking at dst.length, make sure dst is also an objArray.
    // This check also fails for flat arrays which are not supported.
    __ cmpl(Address(rax, lh_offset), objArray_lh);
    __ jcc(Assembler::notEqual, L_failed);

#ifdef ASSERT
    {
      BLOCK_COMMENT("assert not null-free array {");
      Label L;
      __ test_non_null_free_array_oop(dst, rklass_tmp, L);
      __ stop("unexpected null-free array");
      __ bind(L);
      BLOCK_COMMENT("} assert not null-free array");
    }
#endif

    // It is safe to examine both src.length and dst.length.
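    // The checkcast path below first performs the range checks for this path
    // (rax, which held the dst klass, is used as a temp, hence the reload of
    // the klass into r11 right after), then tries a fast array-klass subtype
    // check; on success the copy degenerates to a plain oop copy
    // (L_plain_copy).  Otherwise the destination *element* klass and its
    // super_check_offset are handed to the checkcast copy stub, which
    // type-checks every element as it is stored.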
    arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
                           rax, L_failed);

    const Register r11_dst_klass = r11;
    __ load_klass(r11_dst_klass, dst, rklass_tmp); // reload

    // Marshal the base address arguments now, freeing registers.
    __ lea(from, Address(src, src_pos, TIMES_OOP,
                         arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
    __ lea(to,   Address(dst, dst_pos, TIMES_OOP,
                         arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
    __ movl(count, length);      // length (reloaded)
    Register sco_temp = c_rarg3; // this register is free now
    assert_different_registers(from, to, count, sco_temp,
                               r11_dst_klass, r10_src_klass);
    assert_clean_int(count, sco_temp);

    // Generate the type check.
    const int sco_offset = in_bytes(Klass::super_check_offset_offset());
    __ movl(sco_temp, Address(r11_dst_klass, sco_offset));
    assert_clean_int(sco_temp, rax);
    generate_type_check(r10_src_klass, sco_temp, r11_dst_klass, L_plain_copy);

    // Fetch destination element klass from the ObjArrayKlass header.
    int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
    __ movptr(r11_dst_klass, Address(r11_dst_klass, ek_offset));
    __ movl(  sco_temp,      Address(r11_dst_klass, sco_offset));
    assert_clean_int(sco_temp, rax);

#ifdef _WIN64
    __ pop(rklass_tmp); // Restore callee-save rdi
#endif

    // the checkcast_copy loop needs two extra arguments:
    assert(c_rarg3 == sco_temp, "#3 already in place");
    // Set up arguments for checkcast_copy_entry.
    setup_arg_regs_using_thread(4);
    __ movptr(r8, r11_dst_klass); // dst.klass.element_klass, r8 is c_rarg4 on Linux/Solaris
    __ jump(RuntimeAddress(checkcast_copy_entry));
  }

  __ BIND(L_failed);
#ifdef _WIN64
  __ pop(rklass_tmp); // Restore callee-save rdi
#endif
  __ xorptr(rax, rax);
  __ notptr(rax); // return -1
  __ leave(); // required for proper stackwalking of RuntimeStub frame
  __ ret(0);

  return start;
}

#undef __
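
// A rough sketch (not the emitted code) of the fast subtype check that
// generate_type_check() compiles for the checkcast path above; the names
// below are only for illustration, and the slow-path details vary with the
// secondary-supers lookup strategy in use:
//
//   static bool is_subtype_of(Klass* sub, Klass* super) {
//     juint sco = super->super_check_offset();
//     if (*(Klass**)((address)sub + sco) == super)  return true;   // display / cache hit
//     if (sco != in_bytes(Klass::secondary_super_cache_offset()))  return false;
//     return scan_secondary_supers(sub, super);  // hypothetical helper for the slow path
//   }
//
// A successful check branches to L_plain_copy (bulk oop copy); a miss falls
// through to the per-element checkcast copy.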