/*
 * Copyright (c) 2008, 2025, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "asm/assembler.inline.hpp"
#include "compiler/oopMap.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "gc/shared/barrierSetNMethod.hpp"
#include "interpreter/interpreter.hpp"
#include "memory/universe.hpp"
#include "nativeInst_arm.hpp"
#include "oops/instanceOop.hpp"
#include "oops/method.hpp"
#include "oops/objArrayKlass.hpp"
#include "oops/oop.inline.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/frame.inline.hpp"
#include "runtime/handles.inline.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubCodeGenerator.hpp"
#include "runtime/stubRoutines.hpp"
#include "utilities/align.hpp"
#include "utilities/powerOfTwo.hpp"
#ifdef COMPILER2
#include "opto/runtime.hpp"
#endif

// Declaration and definition of StubGenerator (no .hpp file).
// For a more detailed description of the stub routine structure
// see the comment in stubRoutines.hpp

#define __ _masm->

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) __ block_comment(str)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

// -------------------------------------------------------------------------------------------------------------------------
// Stub Code definitions

// Platform dependent parameters for array copy stubs

// Note: we have noticed a huge change in behavior on a microbenchmark
// from platform to platform depending on the configuration.

// Instead of adding a series of command line options (which
// unfortunately would have to be added in shared code and cannot appear
// only in the ARM port), the tested results are hard-coded here in a set
// of configurations, selected by specifying 'ArmCopyPlatform'.

// Currently, this 'platform' is hardcoded to a value that is a good
// enough trade-off. However, one can easily modify this file to test
// the hard-coded configurations or create new ones. If the gain is
// significant, we could decide to either add command line options or
// add code to automatically choose a configuration.
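
// For example (illustrative note, not a supported build flag): to experiment with
// the i.MX515 tuning below, one could change the hard-coded choice to
//
//   #define ArmCopyPlatform IMX515_ARRAYCOPY_CONFIG
//
// and rebuild; the selected entry of 'arraycopy_configurations' then drives the
// prefetch distance and the LDM/STM splitting used by the arraycopy loops.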

// see comments below for the various configurations created
#define DEFAULT_ARRAYCOPY_CONFIG 0
#define TEGRA2_ARRAYCOPY_CONFIG 1
#define IMX515_ARRAYCOPY_CONFIG 2

// Hard coded choices (XXX: could be changed to a command line option)
#define ArmCopyPlatform DEFAULT_ARRAYCOPY_CONFIG

#define ArmCopyCacheLineSize 32 // not worth optimizing to 64 according to measured gains

// configuration for each kind of loop
typedef struct {
  int pld_distance;       // prefetch distance (0 => no prefetch, <0: prefetch_before)
  bool split_ldm;         // if true, split each LDM into LDMs with fewer registers
  bool split_stm;         // if true, split each STM into STMs with fewer registers
} arraycopy_loop_config;

// configuration for all loops
typedef struct {
  // const char *description;
  arraycopy_loop_config forward_aligned;
  arraycopy_loop_config backward_aligned;
  arraycopy_loop_config forward_shifted;
  arraycopy_loop_config backward_shifted;
} arraycopy_platform_config;

// configured platforms
static arraycopy_platform_config arraycopy_configurations[] = {
  // configuration parameters for arraycopy loops

  // Configurations were chosen based on manual analysis of benchmark
  // results, minimizing overhead with respect to best results on the
  // different test cases.

  // Prefetch before is always favored since it avoids dirtying the
  // cache uselessly for small copies. Code for prefetch after has
  // been kept in case the difference is significant for some
  // platforms but we might consider dropping it.

  // distance, ldm, stm
  {
    // default: tradeoff tegra2/imx515/nv-tegra2,
    // Notes on benchmarking:
    // - not far from optimal configuration on nv-tegra2
    // - within 5% of optimal configuration except for backward aligned on IMX
    // - up to 40% from optimal configuration for backward shifted and backward aligned for tegra2
    //   but still on par with the operating system copy
    {-256, true, true }, // forward aligned
    {-256, true, true }, // backward aligned
    {-256, false, false }, // forward shifted
    {-256, true, true } // backward shifted
  },
  {
    // configuration tuned on tegra2-4.
    // Warning: should not be used on nv-tegra2 !
    // Notes:
    // - prefetch after gives 40% gain on backward copies on tegra2-4,
    //   resulting in better numbers than the operating system
    //   copy. However, this can lead to a 300% loss on nv-tegra and has
    //   more impact on the cache (fetches further than what is
    //   copied). Use this configuration with care, in case it improves
    //   reference benchmarks.
    {-256, true, true }, // forward aligned
    {96, false, false }, // backward aligned
    {-256, false, false }, // forward shifted
    {96, false, false } // backward shifted
  },
  {
    // configuration tuned on imx515
    // Notes:
    // - smaller prefetch distance is sufficient to get good results and might be more stable
    // - refined backward aligned options within 5% of optimal configuration except for
    //   tests where the arrays fit in the cache
    {-160, false, false }, // forward aligned
    {-160, false, false }, // backward aligned
    {-160, false, false }, // forward shifted
    {-160, true, true } // backward shifted
  }
};

class StubGenerator: public StubCodeGenerator {

#ifdef PRODUCT
#define inc_counter_np(a,b,c) ((void)0)
#else
#define inc_counter_np(counter, t1, t2) \
  BLOCK_COMMENT("inc_counter " #counter); \
  __ inc_counter(&counter, t1, t2);
#endif

 private:

  address generate_call_stub(address& return_address) {
    StubGenStubId stub_id = StubGenStubId::call_stub_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();


    assert(frame::entry_frame_call_wrapper_offset == 0, "adjust this code");

    __ mov(Rtemp, SP);
    __ push(RegisterSet(FP) | RegisterSet(LR));
    __ fpush_hardfp(FloatRegisterSet(D8, 8));
    __ stmdb(SP, RegisterSet(R0, R2) | RegisterSet(R4, R6) | RegisterSet(R8, R10) | altFP_7_11, writeback);
    __ mov(Rmethod, R3);
    __ ldmia(Rtemp, RegisterSet(R1, R3) | Rthread); // stacked arguments

    // XXX: TODO
    // Would be better with respect to native tools if the following
    // setting of FP was changed to conform to the native ABI, with FP
    // pointing to the saved FP slot (and the corresponding modifications
    // for entry_frame_call_wrapper_offset and frame::real_fp).
194 __ mov(FP, SP); 195 196 { 197 Label no_parameters, pass_parameters; 198 __ cmp(R3, 0); 199 __ b(no_parameters, eq); 200 201 __ bind(pass_parameters); 202 __ ldr(Rtemp, Address(R2, wordSize, post_indexed)); // Rtemp OK, unused and scratchable 203 __ subs(R3, R3, 1); 204 __ push(Rtemp); 205 __ b(pass_parameters, ne); 206 __ bind(no_parameters); 207 } 208 209 __ mov(Rsender_sp, SP); 210 __ blx(R1); 211 return_address = __ pc(); 212 213 __ add(SP, FP, wordSize); // Skip link to JavaCallWrapper 214 __ pop(RegisterSet(R2, R3)); 215 #ifndef __ABI_HARD__ 216 __ cmp(R3, T_LONG); 217 __ cmp(R3, T_DOUBLE, ne); 218 __ str(R0, Address(R2)); 219 __ str(R1, Address(R2, wordSize), eq); 220 #else 221 Label cont, l_float, l_double; 222 223 __ cmp(R3, T_DOUBLE); 224 __ b(l_double, eq); 225 226 __ cmp(R3, T_FLOAT); 227 __ b(l_float, eq); 228 229 __ cmp(R3, T_LONG); 230 __ str(R0, Address(R2)); 231 __ str(R1, Address(R2, wordSize), eq); 232 __ b(cont); 233 234 235 __ bind(l_double); 236 __ fstd(D0, Address(R2)); 237 __ b(cont); 238 239 __ bind(l_float); 240 __ fsts(S0, Address(R2)); 241 242 __ bind(cont); 243 #endif 244 245 __ pop(RegisterSet(R4, R6) | RegisterSet(R8, R10) | altFP_7_11); 246 __ fpop_hardfp(FloatRegisterSet(D8, 8)); 247 __ pop(RegisterSet(FP) | RegisterSet(PC)); 248 249 return start; 250 } 251 252 253 // (in) Rexception_obj: exception oop 254 address generate_catch_exception() { 255 StubGenStubId stub_id = StubGenStubId::catch_exception_id; 256 StubCodeMark mark(this, stub_id); 257 address start = __ pc(); 258 259 __ str(Rexception_obj, Address(Rthread, Thread::pending_exception_offset())); 260 __ b(StubRoutines::_call_stub_return_address); 261 262 return start; 263 } 264 265 266 // (in) Rexception_pc: return address 267 address generate_forward_exception() { 268 StubGenStubId stub_id = StubGenStubId::forward_exception_id; 269 StubCodeMark mark(this, stub_id); 270 address start = __ pc(); 271 272 __ mov(c_rarg0, Rthread); 273 __ mov(c_rarg1, Rexception_pc); 274 __ call_VM_leaf(CAST_FROM_FN_PTR(address, 275 SharedRuntime::exception_handler_for_return_address), 276 c_rarg0, c_rarg1); 277 __ ldr(Rexception_obj, Address(Rthread, Thread::pending_exception_offset())); 278 const Register Rzero = __ zero_register(Rtemp); // Rtemp OK (cleared by above call) 279 __ str(Rzero, Address(Rthread, Thread::pending_exception_offset())); 280 281 #ifdef ASSERT 282 // make sure exception is set 283 { Label L; 284 __ cbnz(Rexception_obj, L); 285 __ stop("StubRoutines::forward exception: no pending exception (2)"); 286 __ bind(L); 287 } 288 #endif 289 290 // Verify that there is really a valid exception in RAX. 
    __ verify_oop(Rexception_obj);

    __ jump(R0); // handler is returned in R0 by runtime function
    return start;
  }



  // Integer division shared routine
  //   Input:
  //     R0 - dividend
  //     R2 - divisor
  //   Output:
  //     R0 - remainder
  //     R1 - quotient
  //   Destroys:
  //     R2
  //     LR
  address generate_idiv_irem() {
    Label positive_arguments, negative_or_zero, call_slow_path;
    Register dividend  = R0;
    Register divisor   = R2;
    Register remainder = R0;
    Register quotient  = R1;
    Register tmp       = LR;
    assert(dividend == remainder, "must be");

    StubGenStubId stub_id = StubGenStubId::idiv_irem_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();

    // Check for special cases: divisor <= 0 or dividend < 0
    __ cmp(divisor, 0);
    __ orrs(quotient, dividend, divisor, ne);
    __ b(negative_or_zero, le);

    __ bind(positive_arguments);
    // Save return address on stack to free one extra register
    __ push(LR);
    // Approximate the maximum order of the quotient
    __ clz(tmp, dividend);
    __ clz(quotient, divisor);
    __ subs(tmp, quotient, tmp);
    __ mov(quotient, 0);
    // Jump to the appropriate place in the unrolled loop below
    __ ldr(PC, Address(PC, tmp, lsl, 2), pl);
    // If divisor is greater than dividend, return immediately
    __ pop(PC);

    // Offset table
    Label offset_table[32];
    int i;
    for (i = 0; i <= 31; i++) {
      __ emit_address(offset_table[i]);
    }

    // Unrolled loop of 32 division steps
    for (i = 31; i >= 0; i--) {
      __ bind(offset_table[i]);
      __ cmp(remainder, AsmOperand(divisor, lsl, i));
      __ sub(remainder, remainder, AsmOperand(divisor, lsl, i), hs);
      __ add(quotient, quotient, 1 << i, hs);
    }
    __ pop(PC);

    __ bind(negative_or_zero);
    // Find the combination of argument signs and jump to corresponding handler
    __ andr(quotient, dividend, 0x80000000, ne);
    __ orr(quotient, quotient, AsmOperand(divisor, lsr, 31), ne);
    __ add(PC, PC, AsmOperand(quotient, ror, 26), ne);
    __ str(LR, Address(Rthread, JavaThread::saved_exception_pc_offset()));

    // The leaf runtime function can destroy R0-R3 and R12 registers which are still alive
    RegisterSet saved_registers = RegisterSet(R3) | RegisterSet(R12);
#if R9_IS_SCRATCHED
    // Safer to save R9 here since callers may have been written
    // assuming R9 survives. This is suboptimal but may not be worth
    // revisiting for this slow case.

    // save also R10 for alignment
    saved_registers = saved_registers | RegisterSet(R9, R10);
#endif
    {
      // divisor == 0
      FixedSizeCodeBlock zero_divisor(_masm, 8, true);
      __ push(saved_registers);
      __ mov(R0, Rthread);
      __ mov(R1, LR);
      __ mov(R2, SharedRuntime::IMPLICIT_DIVIDE_BY_ZERO);
      __ b(call_slow_path);
    }

    {
      // divisor > 0 && dividend < 0
      FixedSizeCodeBlock positive_divisor_negative_dividend(_masm, 8, true);
      __ push(LR);
      __ rsb(dividend, dividend, 0);
      __ bl(positive_arguments);
      __ rsb(remainder, remainder, 0);
      __ rsb(quotient, quotient, 0);
      __ pop(PC);
    }

    {
      // divisor < 0 && dividend > 0
      FixedSizeCodeBlock negative_divisor_positive_dividend(_masm, 8, true);
      __ push(LR);
      __ rsb(divisor, divisor, 0);
      __ bl(positive_arguments);
      __ rsb(quotient, quotient, 0);
      __ pop(PC);
    }

    {
      // divisor < 0 && dividend < 0
      FixedSizeCodeBlock negative_divisor_negative_dividend(_masm, 8, true);
      __ push(LR);
      __ rsb(dividend, dividend, 0);
      __ rsb(divisor, divisor, 0);
      __ bl(positive_arguments);
      __ rsb(remainder, remainder, 0);
      __ pop(PC);
    }

    __ bind(call_slow_path);
    __ call(CAST_FROM_FN_PTR(address, SharedRuntime::continuation_for_implicit_exception));
    __ pop(saved_registers);
    __ bx(R0);

    return start;
  }


  // As per atomic.hpp the Atomic read-modify-write operations must be logically implemented as:
  //  <fence>; <op>; <membar StoreLoad|StoreStore>
  // But for load-linked/store-conditional based systems a fence here simply means
  // no load/store can be reordered with respect to the initial load-linked, so we have:
  //  <membar storeload|loadload> ; load-linked; <op>; store-conditional; <membar storeload|storestore>
  // There are no memory actions in <op> so nothing further is needed.
  //
  // So we define the following for convenience:
#define MEMBAR_ATOMIC_OP_PRE \
  MacroAssembler::Membar_mask_bits(MacroAssembler::StoreLoad|MacroAssembler::LoadLoad)
#define MEMBAR_ATOMIC_OP_POST \
  MacroAssembler::Membar_mask_bits(MacroAssembler::StoreLoad|MacroAssembler::StoreStore)

  // Note: JDK 9 only supports ARMv7+ so we always have ldrexd available even though the
  // code below allows for it to be otherwise. The else clause indicates an ARMv5 system
  // for which we do not support MP and so membars are not necessary. This ARMv5 code will
  // be removed in the future.
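
  // A minimal sketch (illustration only, not VM code) of how the convention above maps
  // onto the ldrex/strex stubs that follow, written as pseudo-C. The helpers
  // 'load_linked'/'store_conditional' are hypothetical placeholders for LDREX/STREX:
  //
  //   jint atomic_add_sketch(jint add_value, volatile jint* dest) {
  //     membar(MEMBAR_ATOMIC_OP_PRE);                // StoreLoad|LoadLoad before the load-linked
  //     jint result;
  //     do {
  //       result = load_linked(dest) + add_value;    // load-linked; <op>
  //     } while (!store_conditional(dest, result));  // store-conditional, retry on failure
  //     membar(MEMBAR_ATOMIC_OP_POST);               // StoreLoad|StoreStore after success
  //     return result;                               // the new value, as generate_atomic_add() returns
  //   }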
441 442 // Implementation of atomic_add(jint add_value, volatile jint* dest) 443 // used by Atomic::add(volatile jint* dest, jint add_value) 444 // 445 // Arguments : 446 // 447 // add_value: R0 448 // dest: R1 449 // 450 // Results: 451 // 452 // R0: the new stored in dest 453 // 454 // Overwrites: 455 // 456 // R1, R2, R3 457 // 458 address generate_atomic_add() { 459 address start; 460 461 StubGenStubId stub_id = StubGenStubId::atomic_add_id; 462 StubCodeMark mark(this, stub_id); 463 Label retry; 464 start = __ pc(); 465 Register addval = R0; 466 Register dest = R1; 467 Register prev = R2; 468 Register ok = R2; 469 Register newval = R3; 470 471 if (VM_Version::supports_ldrex()) { 472 __ membar(MEMBAR_ATOMIC_OP_PRE, prev); 473 __ bind(retry); 474 __ ldrex(newval, Address(dest)); 475 __ add(newval, addval, newval); 476 __ strex(ok, newval, Address(dest)); 477 __ cmp(ok, 0); 478 __ b(retry, ne); 479 __ mov (R0, newval); 480 __ membar(MEMBAR_ATOMIC_OP_POST, prev); 481 } else { 482 __ bind(retry); 483 __ ldr (prev, Address(dest)); 484 __ add(newval, addval, prev); 485 __ atomic_cas_bool(prev, newval, dest, 0, noreg/*ignored*/); 486 __ b(retry, ne); 487 __ mov (R0, newval); 488 } 489 __ bx(LR); 490 491 return start; 492 } 493 494 // Implementation of jint atomic_xchg(jint exchange_value, volatile jint* dest) 495 // used by Atomic::add(volatile jint* dest, jint exchange_value) 496 // 497 // Arguments : 498 // 499 // exchange_value: R0 500 // dest: R1 501 // 502 // Results: 503 // 504 // R0: the value previously stored in dest 505 // 506 // Overwrites: 507 // 508 // R1, R2, R3 509 // 510 address generate_atomic_xchg() { 511 address start; 512 513 StubGenStubId stub_id = StubGenStubId::atomic_xchg_id; 514 StubCodeMark mark(this, stub_id); 515 start = __ pc(); 516 Register newval = R0; 517 Register dest = R1; 518 Register prev = R2; 519 520 Label retry; 521 522 if (VM_Version::supports_ldrex()) { 523 Register ok=R3; 524 __ membar(MEMBAR_ATOMIC_OP_PRE, prev); 525 __ bind(retry); 526 __ ldrex(prev, Address(dest)); 527 __ strex(ok, newval, Address(dest)); 528 __ cmp(ok, 0); 529 __ b(retry, ne); 530 __ mov (R0, prev); 531 __ membar(MEMBAR_ATOMIC_OP_POST, prev); 532 } else { 533 __ bind(retry); 534 __ ldr (prev, Address(dest)); 535 __ atomic_cas_bool(prev, newval, dest, 0, noreg/*ignored*/); 536 __ b(retry, ne); 537 __ mov (R0, prev); 538 } 539 __ bx(LR); 540 541 return start; 542 } 543 544 // Implementation of jint atomic_cmpxchg(jint exchange_value, volatile jint *dest, jint compare_value) 545 // used by Atomic::cmpxchg(volatile jint *dest, jint compare_value, jint exchange_value) 546 // 547 // Arguments : 548 // 549 // compare_value: R0 550 // exchange_value: R1 551 // dest: R2 552 // 553 // Results: 554 // 555 // R0: the value previously stored in dest 556 // 557 // Overwrites: 558 // 559 // R0, R1, R2, R3, Rtemp 560 // 561 address generate_atomic_cmpxchg() { 562 address start; 563 564 StubGenStubId stub_id = StubGenStubId::atomic_cmpxchg_id; 565 StubCodeMark mark(this, stub_id); 566 start = __ pc(); 567 Register cmp = R0; 568 Register newval = R1; 569 Register dest = R2; 570 Register temp1 = R3; 571 Register temp2 = Rtemp; // Rtemp free (native ABI) 572 573 __ membar(MEMBAR_ATOMIC_OP_PRE, temp1); 574 575 // atomic_cas returns previous value in R0 576 __ atomic_cas(temp1, temp2, cmp, newval, dest, 0); 577 578 __ membar(MEMBAR_ATOMIC_OP_POST, temp1); 579 580 __ bx(LR); 581 582 return start; 583 } 584 585 // Support for jlong Atomic::cmpxchg(jlong exchange_value, volatile jlong *dest, jlong 
compare_value) 586 // reordered before by a wrapper to (jlong compare_value, jlong exchange_value, volatile jlong *dest) 587 // 588 // Arguments : 589 // 590 // compare_value: R1 (High), R0 (Low) 591 // exchange_value: R3 (High), R2 (Low) 592 // dest: SP+0 593 // 594 // Results: 595 // 596 // R0:R1: the value previously stored in dest 597 // 598 // Overwrites: 599 // 600 address generate_atomic_cmpxchg_long() { 601 address start; 602 603 StubGenStubId stub_id = StubGenStubId::atomic_cmpxchg_long_id; 604 StubCodeMark mark(this, stub_id); 605 start = __ pc(); 606 Register cmp_lo = R0; 607 Register cmp_hi = R1; 608 Register newval_lo = R2; 609 Register newval_hi = R3; 610 Register addr = Rtemp; /* After load from stack */ 611 Register temp_lo = R4; 612 Register temp_hi = R5; 613 Register temp_result = R8; 614 assert_different_registers(cmp_lo, newval_lo, temp_lo, addr, temp_result, R7); 615 assert_different_registers(cmp_hi, newval_hi, temp_hi, addr, temp_result, R7); 616 617 __ membar(MEMBAR_ATOMIC_OP_PRE, Rtemp); // Rtemp free (native ABI) 618 619 // Stack is unaligned, maintain double word alignment by pushing 620 // odd number of regs. 621 __ push(RegisterSet(temp_result) | RegisterSet(temp_lo, temp_hi)); 622 __ ldr(addr, Address(SP, 12)); 623 624 // atomic_cas64 returns previous value in temp_lo, temp_hi 625 __ atomic_cas64(temp_lo, temp_hi, temp_result, cmp_lo, cmp_hi, 626 newval_lo, newval_hi, addr, 0); 627 __ mov(R0, temp_lo); 628 __ mov(R1, temp_hi); 629 630 __ pop(RegisterSet(temp_result) | RegisterSet(temp_lo, temp_hi)); 631 632 __ membar(MEMBAR_ATOMIC_OP_POST, Rtemp); // Rtemp free (native ABI) 633 __ bx(LR); 634 635 return start; 636 } 637 638 address generate_atomic_load_long() { 639 address start; 640 641 StubGenStubId stub_id = StubGenStubId::atomic_load_long_id; 642 StubCodeMark mark(this, stub_id); 643 start = __ pc(); 644 Register result_lo = R0; 645 Register result_hi = R1; 646 Register src = R0; 647 648 if (VM_Version::supports_ldrexd()) { 649 __ ldrexd(result_lo, Address(src)); 650 __ clrex(); // FIXME: safe to remove? 651 } else if (!os::is_MP()) { 652 // Last-ditch attempt: we are allegedly running on uni-processor. 653 // Load the thing non-atomically and hope for the best. 654 __ ldmia(src, RegisterSet(result_lo, result_hi)); 655 } else { 656 __ stop("Atomic load(jlong) unsupported on this platform"); 657 } 658 __ bx(LR); 659 660 return start; 661 } 662 663 address generate_atomic_store_long() { 664 address start; 665 666 StubGenStubId stub_id = StubGenStubId::atomic_store_long_id; 667 StubCodeMark mark(this, stub_id); 668 start = __ pc(); 669 Register newval_lo = R0; 670 Register newval_hi = R1; 671 Register dest = R2; 672 Register scratch_lo = R2; 673 Register scratch_hi = R3; /* After load from stack */ 674 Register result = R3; 675 676 if (VM_Version::supports_ldrexd()) { 677 __ mov(Rtemp, dest); // get dest to Rtemp 678 Label retry; 679 __ bind(retry); 680 __ ldrexd(scratch_lo, Address(Rtemp)); 681 __ strexd(result, R0, Address(Rtemp)); 682 __ rsbs(result, result, 1); 683 __ b(retry, eq); 684 } else if (!os::is_MP()) { 685 // Last-ditch attempt: we are allegedly running on uni-processor. 686 // Store the thing non-atomically and hope for the best. 
687 __ stmia(dest, RegisterSet(newval_lo, newval_hi)); 688 } else { 689 __ stop("Atomic store(jlong) unsupported on this platform"); 690 } 691 __ bx(LR); 692 693 return start; 694 } 695 696 697 698 #ifdef COMPILER2 699 // Support for uint StubRoutine::Arm::partial_subtype_check( Klass sub, Klass super ); 700 // Arguments : 701 // 702 // ret : R0, returned 703 // icc/xcc: set as R0 (depending on wordSize) 704 // sub : R1, argument, not changed 705 // super: R2, argument, not changed 706 // raddr: LR, blown by call 707 address generate_partial_subtype_check() { 708 __ align(CodeEntryAlignment); 709 StubGenStubId stub_id = StubGenStubId::partial_subtype_check_id; 710 StubCodeMark mark(this, stub_id); 711 address start = __ pc(); 712 713 // based on SPARC check_klass_subtype_[fast|slow]_path (without CompressedOops) 714 715 // R0 used as tmp_reg (in addition to return reg) 716 Register sub_klass = R1; 717 Register super_klass = R2; 718 Register tmp_reg2 = R3; 719 Register tmp_reg3 = R4; 720 #define saved_set tmp_reg2, tmp_reg3 721 722 Label L_loop, L_fail; 723 724 int sc_offset = in_bytes(Klass::secondary_super_cache_offset()); 725 726 // fast check should be redundant 727 728 // slow check 729 { 730 __ raw_push(saved_set); 731 732 // a couple of useful fields in sub_klass: 733 int ss_offset = in_bytes(Klass::secondary_supers_offset()); 734 735 // Do a linear scan of the secondary super-klass chain. 736 // This code is rarely used, so simplicity is a virtue here. 737 738 inc_counter_np(SharedRuntime::_partial_subtype_ctr, tmp_reg2, tmp_reg3); 739 740 Register scan_temp = tmp_reg2; 741 Register count_temp = tmp_reg3; 742 743 // We will consult the secondary-super array. 744 __ ldr(scan_temp, Address(sub_klass, ss_offset)); 745 746 Register search_key = super_klass; 747 748 // Load the array length. 749 __ ldr_s32(count_temp, Address(scan_temp, Array<Klass*>::length_offset_in_bytes())); 750 __ add(scan_temp, scan_temp, Array<Klass*>::base_offset_in_bytes()); 751 752 __ add(count_temp, count_temp, 1); 753 754 // Top of search loop 755 __ bind(L_loop); 756 // Notes: 757 // scan_temp starts at the array elements 758 // count_temp is 1+size 759 __ subs(count_temp, count_temp, 1); 760 __ b(L_fail, eq); // not found in the array 761 762 // Load next super to check 763 // In the array of super classes elements are pointer sized. 764 int element_size = wordSize; 765 __ ldr(R0, Address(scan_temp, element_size, post_indexed)); 766 767 // Look for Rsuper_klass on Rsub_klass's secondary super-class-overflow list 768 __ subs(R0, R0, search_key); // set R0 to 0 on success (and flags to eq) 769 770 // A miss means we are NOT a subtype and need to keep looping 771 __ b(L_loop, ne); 772 773 // Falling out the bottom means we found a hit; we ARE a subtype 774 775 // Success. Cache the super we found and proceed in triumph. 
776 __ str(super_klass, Address(sub_klass, sc_offset)); 777 778 // Return success 779 // R0 is already 0 and flags are already set to eq 780 __ raw_pop(saved_set); 781 __ ret(); 782 783 // Return failure 784 __ bind(L_fail); 785 __ movs(R0, 1); // sets the flags 786 __ raw_pop(saved_set); 787 __ ret(); 788 } 789 return start; 790 } 791 #undef saved_set 792 #endif // COMPILER2 793 794 795 //---------------------------------------------------------------------------------------------------- 796 // Non-destructive plausibility checks for oops 797 798 address generate_verify_oop() { 799 StubGenStubId stub_id = StubGenStubId::verify_oop_id; 800 StubCodeMark mark(this, stub_id); 801 address start = __ pc(); 802 803 // Incoming arguments: 804 // 805 // R0: error message (char* ) 806 // R1: address of register save area 807 // R2: oop to verify 808 // 809 // All registers are saved before calling this stub. However, condition flags should be saved here. 810 811 const Register oop = R2; 812 const Register klass = R3; 813 const Register tmp1 = R6; 814 const Register tmp2 = R8; 815 816 const Register flags = Rtmp_save0; // R4/R19 817 const Register ret_addr = Rtmp_save1; // R5/R20 818 assert_different_registers(oop, klass, tmp1, tmp2, flags, ret_addr, R7); 819 820 Label exit, error; 821 InlinedAddress verify_oop_count((address) StubRoutines::verify_oop_count_addr()); 822 823 __ mrs(Assembler::CPSR, flags); 824 825 __ ldr_literal(tmp1, verify_oop_count); 826 __ ldr_s32(tmp2, Address(tmp1)); 827 __ add(tmp2, tmp2, 1); 828 __ str_32(tmp2, Address(tmp1)); 829 830 // make sure object is 'reasonable' 831 __ cbz(oop, exit); // if obj is null it is ok 832 833 // Check if the oop is in the right area of memory 834 // Note: oop_mask and oop_bits must be updated if the code is saved/reused 835 const address oop_mask = (address) Universe::verify_oop_mask(); 836 const address oop_bits = (address) Universe::verify_oop_bits(); 837 __ mov_address(tmp1, oop_mask); 838 __ andr(tmp2, oop, tmp1); 839 __ mov_address(tmp1, oop_bits); 840 __ cmp(tmp2, tmp1); 841 __ b(error, ne); 842 843 // make sure klass is 'reasonable' 844 __ load_klass(klass, oop); // get klass 845 __ cbz(klass, error); // if klass is null it is broken 846 847 // return if everything seems ok 848 __ bind(exit); 849 850 __ msr(Assembler::CPSR_f, flags); 851 852 __ ret(); 853 854 // handle errors 855 __ bind(error); 856 857 __ mov(ret_addr, LR); // save return address 858 859 // R0: error message 860 // R1: register save area 861 __ call(CAST_FROM_FN_PTR(address, MacroAssembler::debug)); 862 863 __ mov(LR, ret_addr); 864 __ b(exit); 865 866 __ bind_literal(verify_oop_count); 867 868 return start; 869 } 870 871 //---------------------------------------------------------------------------------------------------- 872 // Array copy stubs 873 874 // 875 // Generate overlap test for array copy stubs 876 // 877 // Input: 878 // R0 - array1 879 // R1 - array2 880 // R2 - element count, 32-bit int 881 // 882 // input registers are preserved 883 // 884 void array_overlap_test(address no_overlap_target, int log2_elem_size, Register tmp1, Register tmp2) { 885 assert(no_overlap_target != nullptr, "must be generated"); 886 array_overlap_test(no_overlap_target, nullptr, log2_elem_size, tmp1, tmp2); 887 } 888 void array_overlap_test(Label& L_no_overlap, int log2_elem_size, Register tmp1, Register tmp2) { 889 array_overlap_test(nullptr, &L_no_overlap, log2_elem_size, tmp1, tmp2); 890 } 891 void array_overlap_test(address no_overlap_target, Label* NOLp, int 
log2_elem_size, Register tmp1, Register tmp2) { 892 const Register from = R0; 893 const Register to = R1; 894 const Register count = R2; 895 const Register to_from = tmp1; // to - from 896 const Register byte_count = (log2_elem_size == 0) ? count : tmp2; // count << log2_elem_size 897 assert_different_registers(from, to, count, tmp1, tmp2); 898 899 // no_overlap version works if 'to' lower (unsigned) than 'from' 900 // and or 'to' more than (count*size) from 'from' 901 902 BLOCK_COMMENT("Array Overlap Test:"); 903 __ subs(to_from, to, from); 904 if (log2_elem_size != 0) { 905 __ mov(byte_count, AsmOperand(count, lsl, log2_elem_size)); 906 } 907 if (NOLp == nullptr) 908 __ b(no_overlap_target,lo); 909 else 910 __ b((*NOLp), lo); 911 __ cmp(to_from, byte_count); 912 if (NOLp == nullptr) 913 __ b(no_overlap_target, ge); 914 else 915 __ b((*NOLp), ge); 916 } 917 918 919 // probably we should choose between "prefetch-store before or after store", not "before or after load". 920 void prefetch(Register from, Register to, int offset, int to_delta = 0) { 921 __ prefetch_read(Address(from, offset)); 922 } 923 924 // Generate the inner loop for forward aligned array copy 925 // 926 // Arguments 927 // from: src address, 64 bits aligned 928 // to: dst address, wordSize aligned 929 // count: number of elements (32-bit int) 930 // bytes_per_count: number of bytes for each unit of 'count' 931 // 932 // Return the minimum initial value for count 933 // 934 // Notes: 935 // - 'from' aligned on 64-bit (recommended for 32-bit ARM in case this speeds up LDMIA) 936 // - 'to' aligned on wordSize 937 // - 'count' must be greater or equal than the returned value 938 // 939 // Increases 'from' and 'to' by count*bytes_per_count. 940 // 941 // Scratches 'count', R3. 942 // R4-R10 are preserved (saved/restored). 943 // 944 int generate_forward_aligned_copy_loop(Register from, Register to, Register count, int bytes_per_count, bool unsafe_copy = false) { 945 assert (from == R0 && to == R1 && count == R2, "adjust the implementation below"); 946 947 const int bytes_per_loop = 8*wordSize; // 8 registers are read and written on every loop iteration 948 arraycopy_loop_config *config=&arraycopy_configurations[ArmCopyPlatform].forward_aligned; 949 int pld_offset = config->pld_distance; 950 const int count_per_loop = bytes_per_loop / bytes_per_count; 951 952 bool split_read= config->split_ldm; 953 bool split_write= config->split_stm; 954 955 // XXX optim: use VLDM/VSTM when available (Neon) with PLD 956 // NEONCopyPLD 957 // PLD [r1, #0xC0] 958 // VLDM r1!,{d0-d7} 959 // VSTM r0!,{d0-d7} 960 // SUBS r2,r2,#0x40 961 // BGE NEONCopyPLD 962 963 __ push(RegisterSet(R4,R10)); 964 965 const bool prefetch_before = pld_offset < 0; 966 const bool prefetch_after = pld_offset > 0; 967 968 Label L_skip_pld; 969 970 { 971 // UnsafeMemoryAccess page error: continue after unsafe access 972 UnsafeMemoryAccessMark umam(this, unsafe_copy, true); 973 // predecrease to exit when there is less than count_per_loop 974 __ sub_32(count, count, count_per_loop); 975 976 if (pld_offset != 0) { 977 pld_offset = (pld_offset < 0) ? -pld_offset : pld_offset; 978 979 prefetch(from, to, 0); 980 981 if (prefetch_before) { 982 // If prefetch is done ahead, final PLDs that overflow the 983 // copied area can be easily avoided. 'count' is predecreased 984 // by the prefetch distance to optimize the inner loop and the 985 // outer loop skips the PLD. 
986 __ subs_32(count, count, (bytes_per_loop+pld_offset)/bytes_per_count); 987 988 // skip prefetch for small copies 989 __ b(L_skip_pld, lt); 990 } 991 992 int offset = ArmCopyCacheLineSize; 993 while (offset <= pld_offset) { 994 prefetch(from, to, offset); 995 offset += ArmCopyCacheLineSize; 996 }; 997 } 998 999 { 1000 // 32-bit ARM note: we have tried implementing loop unrolling to skip one 1001 // PLD with 64 bytes cache line but the gain was not significant. 1002 1003 Label L_copy_loop; 1004 __ align(OptoLoopAlignment); 1005 __ BIND(L_copy_loop); 1006 1007 if (prefetch_before) { 1008 prefetch(from, to, bytes_per_loop + pld_offset); 1009 __ BIND(L_skip_pld); 1010 } 1011 1012 if (split_read) { 1013 // Split the register set in two sets so that there is less 1014 // latency between LDM and STM (R3-R6 available while R7-R10 1015 // still loading) and less register locking issue when iterating 1016 // on the first LDM. 1017 __ ldmia(from, RegisterSet(R3, R6), writeback); 1018 __ ldmia(from, RegisterSet(R7, R10), writeback); 1019 } else { 1020 __ ldmia(from, RegisterSet(R3, R10), writeback); 1021 } 1022 1023 __ subs_32(count, count, count_per_loop); 1024 1025 if (prefetch_after) { 1026 prefetch(from, to, pld_offset, bytes_per_loop); 1027 } 1028 1029 if (split_write) { 1030 __ stmia(to, RegisterSet(R3, R6), writeback); 1031 __ stmia(to, RegisterSet(R7, R10), writeback); 1032 } else { 1033 __ stmia(to, RegisterSet(R3, R10), writeback); 1034 } 1035 1036 __ b(L_copy_loop, ge); 1037 1038 if (prefetch_before) { 1039 // the inner loop may end earlier, allowing to skip PLD for the last iterations 1040 __ cmn_32(count, (bytes_per_loop + pld_offset)/bytes_per_count); 1041 __ b(L_skip_pld, ge); 1042 } 1043 } 1044 BLOCK_COMMENT("Remaining bytes:"); 1045 // still 0..bytes_per_loop-1 aligned bytes to copy, count already decreased by (at least) bytes_per_loop bytes 1046 1047 // __ add(count, count, ...); // addition useless for the bit tests 1048 assert (pld_offset % bytes_per_loop == 0, "decreasing count by pld_offset before loop must not change tested bits"); 1049 1050 __ tst(count, 16 / bytes_per_count); 1051 __ ldmia(from, RegisterSet(R3, R6), writeback, ne); // copy 16 bytes 1052 __ stmia(to, RegisterSet(R3, R6), writeback, ne); 1053 1054 __ tst(count, 8 / bytes_per_count); 1055 __ ldmia(from, RegisterSet(R3, R4), writeback, ne); // copy 8 bytes 1056 __ stmia(to, RegisterSet(R3, R4), writeback, ne); 1057 1058 if (bytes_per_count <= 4) { 1059 __ tst(count, 4 / bytes_per_count); 1060 __ ldr(R3, Address(from, 4, post_indexed), ne); // copy 4 bytes 1061 __ str(R3, Address(to, 4, post_indexed), ne); 1062 } 1063 1064 if (bytes_per_count <= 2) { 1065 __ tst(count, 2 / bytes_per_count); 1066 __ ldrh(R3, Address(from, 2, post_indexed), ne); // copy 2 bytes 1067 __ strh(R3, Address(to, 2, post_indexed), ne); 1068 } 1069 1070 if (bytes_per_count == 1) { 1071 __ tst(count, 1); 1072 __ ldrb(R3, Address(from, 1, post_indexed), ne); 1073 __ strb(R3, Address(to, 1, post_indexed), ne); 1074 } 1075 } 1076 1077 __ pop(RegisterSet(R4,R10)); 1078 1079 return count_per_loop; 1080 } 1081 1082 1083 // Generate the inner loop for backward aligned array copy 1084 // 1085 // Arguments 1086 // end_from: src end address, 64 bits aligned 1087 // end_to: dst end address, wordSize aligned 1088 // count: number of elements (32-bit int) 1089 // bytes_per_count: number of bytes for each unit of 'count' 1090 // 1091 // Return the minimum initial value for count 1092 // 1093 // Notes: 1094 // - 'end_from' aligned on 64-bit (recommended 
for 32-bit ARM in case this speeds up LDMIA) 1095 // - 'end_to' aligned on wordSize 1096 // - 'count' must be greater or equal than the returned value 1097 // 1098 // Decreases 'end_from' and 'end_to' by count*bytes_per_count. 1099 // 1100 // Scratches 'count', R3. 1101 // ARM R4-R10 are preserved (saved/restored). 1102 // 1103 int generate_backward_aligned_copy_loop(Register end_from, Register end_to, Register count, int bytes_per_count, bool unsafe_copy = false) { 1104 assert (end_from == R0 && end_to == R1 && count == R2, "adjust the implementation below"); 1105 1106 const int bytes_per_loop = 8*wordSize; // 8 registers are read and written on every loop iteration 1107 const int count_per_loop = bytes_per_loop / bytes_per_count; 1108 1109 arraycopy_loop_config *config=&arraycopy_configurations[ArmCopyPlatform].backward_aligned; 1110 int pld_offset = config->pld_distance; 1111 1112 bool split_read= config->split_ldm; 1113 bool split_write= config->split_stm; 1114 1115 // See the forward copy variant for additional comments. 1116 1117 __ push(RegisterSet(R4,R10)); 1118 1119 { 1120 // UnsafeMemoryAccess page error: continue after unsafe access 1121 UnsafeMemoryAccessMark umam(this, unsafe_copy, true); 1122 __ sub_32(count, count, count_per_loop); 1123 1124 const bool prefetch_before = pld_offset < 0; 1125 const bool prefetch_after = pld_offset > 0; 1126 1127 Label L_skip_pld; 1128 1129 if (pld_offset != 0) { 1130 pld_offset = (pld_offset < 0) ? -pld_offset : pld_offset; 1131 1132 prefetch(end_from, end_to, -wordSize); 1133 1134 if (prefetch_before) { 1135 __ subs_32(count, count, (bytes_per_loop + pld_offset) / bytes_per_count); 1136 __ b(L_skip_pld, lt); 1137 } 1138 1139 int offset = ArmCopyCacheLineSize; 1140 while (offset <= pld_offset) { 1141 prefetch(end_from, end_to, -(wordSize + offset)); 1142 offset += ArmCopyCacheLineSize; 1143 }; 1144 } 1145 1146 { 1147 // 32-bit ARM note: we have tried implementing loop unrolling to skip one 1148 // PLD with 64 bytes cache line but the gain was not significant. 
1149 1150 Label L_copy_loop; 1151 __ align(OptoLoopAlignment); 1152 __ BIND(L_copy_loop); 1153 1154 if (prefetch_before) { 1155 prefetch(end_from, end_to, -(wordSize + bytes_per_loop + pld_offset)); 1156 __ BIND(L_skip_pld); 1157 } 1158 1159 if (split_read) { 1160 __ ldmdb(end_from, RegisterSet(R7, R10), writeback); 1161 __ ldmdb(end_from, RegisterSet(R3, R6), writeback); 1162 } else { 1163 __ ldmdb(end_from, RegisterSet(R3, R10), writeback); 1164 } 1165 1166 __ subs_32(count, count, count_per_loop); 1167 1168 if (prefetch_after) { 1169 prefetch(end_from, end_to, -(wordSize + pld_offset), -bytes_per_loop); 1170 } 1171 1172 if (split_write) { 1173 __ stmdb(end_to, RegisterSet(R7, R10), writeback); 1174 __ stmdb(end_to, RegisterSet(R3, R6), writeback); 1175 } else { 1176 __ stmdb(end_to, RegisterSet(R3, R10), writeback); 1177 } 1178 1179 __ b(L_copy_loop, ge); 1180 1181 if (prefetch_before) { 1182 __ cmn_32(count, (bytes_per_loop + pld_offset)/bytes_per_count); 1183 __ b(L_skip_pld, ge); 1184 } 1185 } 1186 BLOCK_COMMENT("Remaining bytes:"); 1187 // still 0..bytes_per_loop-1 aligned bytes to copy, count already decreased by (at least) bytes_per_loop bytes 1188 1189 // __ add(count, count, ...); // addition useless for the bit tests 1190 assert (pld_offset % bytes_per_loop == 0, "decreasing count by pld_offset before loop must not change tested bits"); 1191 1192 __ tst(count, 16 / bytes_per_count); 1193 __ ldmdb(end_from, RegisterSet(R3, R6), writeback, ne); // copy 16 bytes 1194 __ stmdb(end_to, RegisterSet(R3, R6), writeback, ne); 1195 1196 __ tst(count, 8 / bytes_per_count); 1197 __ ldmdb(end_from, RegisterSet(R3, R4), writeback, ne); // copy 8 bytes 1198 __ stmdb(end_to, RegisterSet(R3, R4), writeback, ne); 1199 1200 if (bytes_per_count <= 4) { 1201 __ tst(count, 4 / bytes_per_count); 1202 __ ldr(R3, Address(end_from, -4, pre_indexed), ne); // copy 4 bytes 1203 __ str(R3, Address(end_to, -4, pre_indexed), ne); 1204 } 1205 1206 if (bytes_per_count <= 2) { 1207 __ tst(count, 2 / bytes_per_count); 1208 __ ldrh(R3, Address(end_from, -2, pre_indexed), ne); // copy 2 bytes 1209 __ strh(R3, Address(end_to, -2, pre_indexed), ne); 1210 } 1211 1212 if (bytes_per_count == 1) { 1213 __ tst(count, 1); 1214 __ ldrb(R3, Address(end_from, -1, pre_indexed), ne); 1215 __ strb(R3, Address(end_to, -1, pre_indexed), ne); 1216 } 1217 } 1218 __ pop(RegisterSet(R4,R10)); 1219 1220 return count_per_loop; 1221 } 1222 1223 1224 // Generate the inner loop for shifted forward array copy (unaligned copy). 1225 // It can be used when bytes_per_count < wordSize, i.e. byte/short copy 1226 // 1227 // Arguments 1228 // from: start src address, 64 bits aligned 1229 // to: start dst address, (now) wordSize aligned 1230 // count: number of elements (32-bit int) 1231 // bytes_per_count: number of bytes for each unit of 'count' 1232 // lsr_shift: shift applied to 'old' value to skipped already written bytes 1233 // lsl_shift: shift applied to 'new' value to set the high bytes of the next write 1234 // 1235 // Return the minimum initial value for count 1236 // 1237 // Notes: 1238 // - 'from' aligned on 64-bit (recommended for 32-bit ARM in case this speeds up LDMIA) 1239 // - 'to' aligned on wordSize 1240 // - 'count' must be greater or equal than the returned value 1241 // - 'lsr_shift' + 'lsl_shift' = BitsPerWord 1242 // - 'bytes_per_count' is 1 or 2 1243 // 1244 // Increases 'to' by count*bytes_per_count. 
1245 // 1246 // Scratches 'from' and 'count', R3-R10, R12 1247 // 1248 // On entry: 1249 // - R12 is preloaded with the first 'BitsPerWord' bits read just before 'from' 1250 // - (R12 >> lsr_shift) is the part not yet written (just before 'to') 1251 // --> (*to) = (R12 >> lsr_shift) | (*from) << lsl_shift); ... 1252 // 1253 // This implementation may read more bytes than required. 1254 // Actually, it always reads exactly all data from the copied region with upper bound aligned up by wordSize, 1255 // so excessive read do not cross a word bound and is thus harmless. 1256 // 1257 int generate_forward_shifted_copy_loop(Register from, Register to, Register count, int bytes_per_count, int lsr_shift, int lsl_shift) { 1258 assert (from == R0 && to == R1 && count == R2, "adjust the implementation below"); 1259 1260 const int bytes_per_loop = 8*wordSize; // 8 registers are read and written on every loop iter 1261 const int count_per_loop = bytes_per_loop / bytes_per_count; 1262 1263 arraycopy_loop_config *config=&arraycopy_configurations[ArmCopyPlatform].forward_shifted; 1264 int pld_offset = config->pld_distance; 1265 1266 bool split_read= config->split_ldm; 1267 bool split_write= config->split_stm; 1268 1269 const bool prefetch_before = pld_offset < 0; 1270 const bool prefetch_after = pld_offset > 0; 1271 Label L_skip_pld, L_last_read, L_done; 1272 if (pld_offset != 0) { 1273 1274 pld_offset = (pld_offset < 0) ? -pld_offset : pld_offset; 1275 1276 prefetch(from, to, 0); 1277 1278 if (prefetch_before) { 1279 __ cmp_32(count, count_per_loop); 1280 __ b(L_last_read, lt); 1281 // skip prefetch for small copies 1282 // warning: count is predecreased by the prefetch distance to optimize the inner loop 1283 __ subs_32(count, count, ((bytes_per_loop + pld_offset) / bytes_per_count) + count_per_loop); 1284 __ b(L_skip_pld, lt); 1285 } 1286 1287 int offset = ArmCopyCacheLineSize; 1288 while (offset <= pld_offset) { 1289 prefetch(from, to, offset); 1290 offset += ArmCopyCacheLineSize; 1291 }; 1292 } 1293 1294 Label L_shifted_loop; 1295 1296 __ align(OptoLoopAlignment); 1297 __ BIND(L_shifted_loop); 1298 1299 if (prefetch_before) { 1300 // do it early if there might be register locking issues 1301 prefetch(from, to, bytes_per_loop + pld_offset); 1302 __ BIND(L_skip_pld); 1303 } else { 1304 __ cmp_32(count, count_per_loop); 1305 __ b(L_last_read, lt); 1306 } 1307 1308 // read 32 bytes 1309 if (split_read) { 1310 // if write is not split, use less registers in first set to reduce locking 1311 RegisterSet set1 = split_write ? RegisterSet(R4, R7) : RegisterSet(R4, R5); 1312 RegisterSet set2 = (split_write ? RegisterSet(R8, R10) : RegisterSet(R6, R10)) | R12; 1313 __ ldmia(from, set1, writeback); 1314 __ mov(R3, AsmOperand(R12, lsr, lsr_shift)); // part of R12 not yet written 1315 __ ldmia(from, set2, writeback); 1316 __ subs(count, count, count_per_loop); // XXX: should it be before the 2nd LDM ? 
(latency vs locking) 1317 } else { 1318 __ mov(R3, AsmOperand(R12, lsr, lsr_shift)); // part of R12 not yet written 1319 __ ldmia(from, RegisterSet(R4, R10) | R12, writeback); // Note: small latency on R4 1320 __ subs(count, count, count_per_loop); 1321 } 1322 1323 if (prefetch_after) { 1324 // do it after the 1st ldm/ldp anyway (no locking issues with early STM/STP) 1325 prefetch(from, to, pld_offset, bytes_per_loop); 1326 } 1327 1328 // prepare (shift) the values in R3..R10 1329 __ orr(R3, R3, AsmOperand(R4, lsl, lsl_shift)); // merged below low bytes of next val 1330 __ logical_shift_right(R4, R4, lsr_shift); // unused part of next val 1331 __ orr(R4, R4, AsmOperand(R5, lsl, lsl_shift)); // ... 1332 __ logical_shift_right(R5, R5, lsr_shift); 1333 __ orr(R5, R5, AsmOperand(R6, lsl, lsl_shift)); 1334 __ logical_shift_right(R6, R6, lsr_shift); 1335 __ orr(R6, R6, AsmOperand(R7, lsl, lsl_shift)); 1336 if (split_write) { 1337 // write the first half as soon as possible to reduce stm locking 1338 __ stmia(to, RegisterSet(R3, R6), writeback, prefetch_before ? gt : ge); 1339 } 1340 __ logical_shift_right(R7, R7, lsr_shift); 1341 __ orr(R7, R7, AsmOperand(R8, lsl, lsl_shift)); 1342 __ logical_shift_right(R8, R8, lsr_shift); 1343 __ orr(R8, R8, AsmOperand(R9, lsl, lsl_shift)); 1344 __ logical_shift_right(R9, R9, lsr_shift); 1345 __ orr(R9, R9, AsmOperand(R10, lsl, lsl_shift)); 1346 __ logical_shift_right(R10, R10, lsr_shift); 1347 __ orr(R10, R10, AsmOperand(R12, lsl, lsl_shift)); 1348 1349 if (split_write) { 1350 __ stmia(to, RegisterSet(R7, R10), writeback, prefetch_before ? gt : ge); 1351 } else { 1352 __ stmia(to, RegisterSet(R3, R10), writeback, prefetch_before ? gt : ge); 1353 } 1354 __ b(L_shifted_loop, gt); // no need to loop if 0 (when count need not be precise modulo bytes_per_loop) 1355 1356 if (prefetch_before) { 1357 // the first loop may end earlier, allowing to skip pld at the end 1358 __ cmn_32(count, (bytes_per_loop + pld_offset)/bytes_per_count); 1359 __ stmia(to, RegisterSet(R3, R10), writeback); // stmia was skipped 1360 __ b(L_skip_pld, ge); 1361 __ adds_32(count, count, ((bytes_per_loop + pld_offset) / bytes_per_count) + count_per_loop); 1362 } 1363 1364 __ BIND(L_last_read); 1365 __ b(L_done, eq); 1366 1367 switch (bytes_per_count) { 1368 case 2: 1369 __ mov(R3, AsmOperand(R12, lsr, lsr_shift)); 1370 __ tst(count, 8); 1371 __ ldmia(from, RegisterSet(R4, R7), writeback, ne); 1372 __ orr(R3, R3, AsmOperand(R4, lsl, lsl_shift), ne); // merged below low bytes of next val 1373 __ mov(R4, AsmOperand(R4, lsr, lsr_shift), ne); // unused part of next val 1374 __ orr(R4, R4, AsmOperand(R5, lsl, lsl_shift), ne); // ... 1375 __ mov(R5, AsmOperand(R5, lsr, lsr_shift), ne); 1376 __ orr(R5, R5, AsmOperand(R6, lsl, lsl_shift), ne); 1377 __ mov(R6, AsmOperand(R6, lsr, lsr_shift), ne); 1378 __ orr(R6, R6, AsmOperand(R7, lsl, lsl_shift), ne); 1379 __ stmia(to, RegisterSet(R3, R6), writeback, ne); 1380 __ mov(R3, AsmOperand(R7, lsr, lsr_shift), ne); 1381 1382 __ tst(count, 4); 1383 __ ldmia(from, RegisterSet(R4, R5), writeback, ne); 1384 __ orr(R3, R3, AsmOperand(R4, lsl, lsl_shift), ne); // merged below low bytes of next val 1385 __ mov(R4, AsmOperand(R4, lsr, lsr_shift), ne); // unused part of next val 1386 __ orr(R4, R4, AsmOperand(R5, lsl, lsl_shift), ne); // ... 
1387 __ stmia(to, RegisterSet(R3, R4), writeback, ne); 1388 __ mov(R3, AsmOperand(R5, lsr, lsr_shift), ne); 1389 1390 __ tst(count, 2); 1391 __ ldr(R4, Address(from, 4, post_indexed), ne); 1392 __ orr(R3, R3, AsmOperand(R4, lsl, lsl_shift), ne); 1393 __ str(R3, Address(to, 4, post_indexed), ne); 1394 __ mov(R3, AsmOperand(R4, lsr, lsr_shift), ne); 1395 1396 __ tst(count, 1); 1397 __ strh(R3, Address(to, 2, post_indexed), ne); // one last short 1398 break; 1399 1400 case 1: 1401 __ mov(R3, AsmOperand(R12, lsr, lsr_shift)); 1402 __ tst(count, 16); 1403 __ ldmia(from, RegisterSet(R4, R7), writeback, ne); 1404 __ orr(R3, R3, AsmOperand(R4, lsl, lsl_shift), ne); // merged below low bytes of next val 1405 __ mov(R4, AsmOperand(R4, lsr, lsr_shift), ne); // unused part of next val 1406 __ orr(R4, R4, AsmOperand(R5, lsl, lsl_shift), ne); // ... 1407 __ mov(R5, AsmOperand(R5, lsr, lsr_shift), ne); 1408 __ orr(R5, R5, AsmOperand(R6, lsl, lsl_shift), ne); 1409 __ mov(R6, AsmOperand(R6, lsr, lsr_shift), ne); 1410 __ orr(R6, R6, AsmOperand(R7, lsl, lsl_shift), ne); 1411 __ stmia(to, RegisterSet(R3, R6), writeback, ne); 1412 __ mov(R3, AsmOperand(R7, lsr, lsr_shift), ne); 1413 1414 __ tst(count, 8); 1415 __ ldmia(from, RegisterSet(R4, R5), writeback, ne); 1416 __ orr(R3, R3, AsmOperand(R4, lsl, lsl_shift), ne); // merged below low bytes of next val 1417 __ mov(R4, AsmOperand(R4, lsr, lsr_shift), ne); // unused part of next val 1418 __ orr(R4, R4, AsmOperand(R5, lsl, lsl_shift), ne); // ... 1419 __ stmia(to, RegisterSet(R3, R4), writeback, ne); 1420 __ mov(R3, AsmOperand(R5, lsr, lsr_shift), ne); 1421 1422 __ tst(count, 4); 1423 __ ldr(R4, Address(from, 4, post_indexed), ne); 1424 __ orr(R3, R3, AsmOperand(R4, lsl, lsl_shift), ne); 1425 __ str(R3, Address(to, 4, post_indexed), ne); 1426 __ mov(R3, AsmOperand(R4, lsr, lsr_shift), ne); 1427 1428 __ andr(count, count, 3); 1429 __ cmp(count, 2); 1430 1431 // Note: R3 might contain enough bytes ready to write (3 needed at most), 1432 // thus load on lsl_shift==24 is not needed (in fact forces reading 1433 // beyond source buffer end boundary) 1434 if (lsl_shift == 8) { 1435 __ ldr(R4, Address(from, 4, post_indexed), ge); 1436 __ orr(R3, R3, AsmOperand(R4, lsl, lsl_shift), ge); 1437 } else if (lsl_shift == 16) { 1438 __ ldr(R4, Address(from, 4, post_indexed), gt); 1439 __ orr(R3, R3, AsmOperand(R4, lsl, lsl_shift), gt); 1440 } 1441 1442 __ strh(R3, Address(to, 2, post_indexed), ge); // two last bytes 1443 __ mov(R3, AsmOperand(R3, lsr, 16), gt); 1444 1445 __ tst(count, 1); 1446 __ strb(R3, Address(to, 1, post_indexed), ne); // one last byte 1447 break; 1448 } 1449 1450 __ BIND(L_done); 1451 return 0; // no minimum 1452 } 1453 1454 // Generate the inner loop for shifted backward array copy (unaligned copy). 1455 // It can be used when bytes_per_count < wordSize, i.e. 
byte/short copy 1456 // 1457 // Arguments 1458 // end_from: end src address, 64 bits aligned 1459 // end_to: end dst address, (now) wordSize aligned 1460 // count: number of elements (32-bit int) 1461 // bytes_per_count: number of bytes for each unit of 'count' 1462 // lsl_shift: shift applied to 'old' value to skipped already written bytes 1463 // lsr_shift: shift applied to 'new' value to set the low bytes of the next write 1464 // 1465 // Return the minimum initial value for count 1466 // 1467 // Notes: 1468 // - 'end_from' aligned on 64-bit (recommended for 32-bit ARM in case this speeds up LDMIA) 1469 // - 'end_to' aligned on wordSize 1470 // - 'count' must be greater or equal than the returned value 1471 // - 'lsr_shift' + 'lsl_shift' = 'BitsPerWord' 1472 // - 'bytes_per_count' is 1 or 2 on 32-bit ARM 1473 // 1474 // Decreases 'end_to' by count*bytes_per_count. 1475 // 1476 // Scratches 'end_from', 'count', R3-R10, R12 1477 // 1478 // On entry: 1479 // - R3 is preloaded with the first 'BitsPerWord' bits read just after 'from' 1480 // - (R3 << lsl_shift) is the part not yet written 1481 // --> (*--to) = (R3 << lsl_shift) | (*--from) >> lsr_shift); ... 1482 // 1483 // This implementation may read more bytes than required. 1484 // Actually, it always reads exactly all data from the copied region with beginning aligned down by wordSize, 1485 // so excessive read do not cross a word bound and is thus harmless. 1486 // 1487 int generate_backward_shifted_copy_loop(Register end_from, Register end_to, Register count, int bytes_per_count, int lsr_shift, int lsl_shift) { 1488 assert (end_from == R0 && end_to == R1 && count == R2, "adjust the implementation below"); 1489 1490 const int bytes_per_loop = 8*wordSize; // 8 registers are read and written on every loop iter 1491 const int count_per_loop = bytes_per_loop / bytes_per_count; 1492 1493 arraycopy_loop_config *config=&arraycopy_configurations[ArmCopyPlatform].backward_shifted; 1494 int pld_offset = config->pld_distance; 1495 1496 bool split_read= config->split_ldm; 1497 bool split_write= config->split_stm; 1498 1499 1500 const bool prefetch_before = pld_offset < 0; 1501 const bool prefetch_after = pld_offset > 0; 1502 1503 Label L_skip_pld, L_done, L_last_read; 1504 if (pld_offset != 0) { 1505 1506 pld_offset = (pld_offset < 0) ? 
-pld_offset : pld_offset; 1507 1508 prefetch(end_from, end_to, -wordSize); 1509 1510 if (prefetch_before) { 1511 __ cmp_32(count, count_per_loop); 1512 __ b(L_last_read, lt); 1513 1514 // skip prefetch for small copies 1515 // warning: count is predecreased by the prefetch distance to optimize the inner loop 1516 __ subs_32(count, count, ((bytes_per_loop + pld_offset)/bytes_per_count) + count_per_loop); 1517 __ b(L_skip_pld, lt); 1518 } 1519 1520 int offset = ArmCopyCacheLineSize; 1521 while (offset <= pld_offset) { 1522 prefetch(end_from, end_to, -(wordSize + offset)); 1523 offset += ArmCopyCacheLineSize; 1524 }; 1525 } 1526 1527 Label L_shifted_loop; 1528 __ align(OptoLoopAlignment); 1529 __ BIND(L_shifted_loop); 1530 1531 if (prefetch_before) { 1532 // do the 1st ldm/ldp first anyway (no locking issues with early STM/STP) 1533 prefetch(end_from, end_to, -(wordSize + bytes_per_loop + pld_offset)); 1534 __ BIND(L_skip_pld); 1535 } else { 1536 __ cmp_32(count, count_per_loop); 1537 __ b(L_last_read, lt); 1538 } 1539 1540 if (split_read) { 1541 __ ldmdb(end_from, RegisterSet(R7, R10), writeback); 1542 __ mov(R12, AsmOperand(R3, lsl, lsl_shift)); // part of R3 not yet written 1543 __ ldmdb(end_from, RegisterSet(R3, R6), writeback); 1544 } else { 1545 __ mov(R12, AsmOperand(R3, lsl, lsl_shift)); // part of R3 not yet written 1546 __ ldmdb(end_from, RegisterSet(R3, R10), writeback); 1547 } 1548 1549 __ subs_32(count, count, count_per_loop); 1550 1551 if (prefetch_after) { // do prefetch during ldm/ldp latency 1552 prefetch(end_from, end_to, -(wordSize + pld_offset), -bytes_per_loop); 1553 } 1554 1555 // prepare the values in R4..R10,R12 1556 __ orr(R12, R12, AsmOperand(R10, lsr, lsr_shift)); // merged above high bytes of prev val 1557 __ logical_shift_left(R10, R10, lsl_shift); // unused part of prev val 1558 __ orr(R10, R10, AsmOperand(R9, lsr, lsr_shift)); // ... 1559 __ logical_shift_left(R9, R9, lsl_shift); 1560 __ orr(R9, R9, AsmOperand(R8, lsr, lsr_shift)); 1561 __ logical_shift_left(R8, R8, lsl_shift); 1562 __ orr(R8, R8, AsmOperand(R7, lsr, lsr_shift)); 1563 __ logical_shift_left(R7, R7, lsl_shift); 1564 __ orr(R7, R7, AsmOperand(R6, lsr, lsr_shift)); 1565 __ logical_shift_left(R6, R6, lsl_shift); 1566 __ orr(R6, R6, AsmOperand(R5, lsr, lsr_shift)); 1567 if (split_write) { 1568 // store early to reduce locking issues 1569 __ stmdb(end_to, RegisterSet(R6, R10) | R12, writeback, prefetch_before ? gt : ge); 1570 } 1571 __ logical_shift_left(R5, R5, lsl_shift); 1572 __ orr(R5, R5, AsmOperand(R4, lsr, lsr_shift)); 1573 __ logical_shift_left(R4, R4, lsl_shift); 1574 __ orr(R4, R4, AsmOperand(R3, lsr, lsr_shift)); 1575 1576 if (split_write) { 1577 __ stmdb(end_to, RegisterSet(R4, R5), writeback, prefetch_before ? gt : ge); 1578 } else { 1579 __ stmdb(end_to, RegisterSet(R4, R10) | R12, writeback, prefetch_before ? 
gt : ge); 1580 } 1581 1582 __ b(L_shifted_loop, gt); // no need to loop if 0 (when count need not be precise modulo bytes_per_loop) 1583 1584 if (prefetch_before) { 1585 // the first loop may end earlier, allowing to skip pld at the end 1586 __ cmn_32(count, ((bytes_per_loop + pld_offset)/bytes_per_count)); 1587 __ stmdb(end_to, RegisterSet(R4, R10) | R12, writeback); // stmdb was skipped 1588 __ b(L_skip_pld, ge); 1589 __ adds_32(count, count, ((bytes_per_loop + pld_offset) / bytes_per_count) + count_per_loop); 1590 } 1591 1592 __ BIND(L_last_read); 1593 __ b(L_done, eq); 1594 1595 switch(bytes_per_count) { 1596 case 2: 1597 __ mov(R12, AsmOperand(R3, lsl, lsl_shift)); // part of R3 not yet written 1598 __ tst(count, 8); 1599 __ ldmdb(end_from, RegisterSet(R7,R10), writeback, ne); 1600 __ orr(R12, R12, AsmOperand(R10, lsr, lsr_shift), ne); 1601 __ mov(R10, AsmOperand(R10, lsl, lsl_shift),ne); // unused part of prev val 1602 __ orr(R10, R10, AsmOperand(R9, lsr, lsr_shift),ne); // ... 1603 __ mov(R9, AsmOperand(R9, lsl, lsl_shift),ne); 1604 __ orr(R9, R9, AsmOperand(R8, lsr, lsr_shift),ne); 1605 __ mov(R8, AsmOperand(R8, lsl, lsl_shift),ne); 1606 __ orr(R8, R8, AsmOperand(R7, lsr, lsr_shift),ne); 1607 __ stmdb(end_to, RegisterSet(R8,R10)|R12, writeback, ne); 1608 __ mov(R12, AsmOperand(R7, lsl, lsl_shift), ne); 1609 1610 __ tst(count, 4); 1611 __ ldmdb(end_from, RegisterSet(R9, R10), writeback, ne); 1612 __ orr(R12, R12, AsmOperand(R10, lsr, lsr_shift), ne); 1613 __ mov(R10, AsmOperand(R10, lsl, lsl_shift),ne); // unused part of prev val 1614 __ orr(R10, R10, AsmOperand(R9, lsr,lsr_shift),ne); // ... 1615 __ stmdb(end_to, RegisterSet(R10)|R12, writeback, ne); 1616 __ mov(R12, AsmOperand(R9, lsl, lsl_shift), ne); 1617 1618 __ tst(count, 2); 1619 __ ldr(R10, Address(end_from, -4, pre_indexed), ne); 1620 __ orr(R12, R12, AsmOperand(R10, lsr, lsr_shift), ne); 1621 __ str(R12, Address(end_to, -4, pre_indexed), ne); 1622 __ mov(R12, AsmOperand(R10, lsl, lsl_shift), ne); 1623 1624 __ tst(count, 1); 1625 __ mov(R12, AsmOperand(R12, lsr, lsr_shift),ne); 1626 __ strh(R12, Address(end_to, -2, pre_indexed), ne); // one last short 1627 break; 1628 1629 case 1: 1630 __ mov(R12, AsmOperand(R3, lsl, lsl_shift)); // part of R3 not yet written 1631 __ tst(count, 16); 1632 __ ldmdb(end_from, RegisterSet(R7,R10), writeback, ne); 1633 __ orr(R12, R12, AsmOperand(R10, lsr, lsr_shift), ne); 1634 __ mov(R10, AsmOperand(R10, lsl, lsl_shift),ne); // unused part of prev val 1635 __ orr(R10, R10, AsmOperand(R9, lsr, lsr_shift),ne); // ... 1636 __ mov(R9, AsmOperand(R9, lsl, lsl_shift),ne); 1637 __ orr(R9, R9, AsmOperand(R8, lsr, lsr_shift),ne); 1638 __ mov(R8, AsmOperand(R8, lsl, lsl_shift),ne); 1639 __ orr(R8, R8, AsmOperand(R7, lsr, lsr_shift),ne); 1640 __ stmdb(end_to, RegisterSet(R8,R10)|R12, writeback, ne); 1641 __ mov(R12, AsmOperand(R7, lsl, lsl_shift), ne); 1642 1643 __ tst(count, 8); 1644 __ ldmdb(end_from, RegisterSet(R9,R10), writeback, ne); 1645 __ orr(R12, R12, AsmOperand(R10, lsr, lsr_shift), ne); 1646 __ mov(R10, AsmOperand(R10, lsl, lsl_shift),ne); // unused part of prev val 1647 __ orr(R10, R10, AsmOperand(R9, lsr, lsr_shift),ne); // ... 
1648 __ stmdb(end_to, RegisterSet(R10)|R12, writeback, ne); 1649 __ mov(R12, AsmOperand(R9, lsl, lsl_shift), ne); 1650 1651 __ tst(count, 4); 1652 __ ldr(R10, Address(end_from, -4, pre_indexed), ne); 1653 __ orr(R12, R12, AsmOperand(R10, lsr, lsr_shift), ne); 1654 __ str(R12, Address(end_to, -4, pre_indexed), ne); 1655 __ mov(R12, AsmOperand(R10, lsl, lsl_shift), ne); 1656 1657 __ tst(count, 2); 1658 if (lsr_shift != 24) { 1659 // avoid useless reading R10 when we already have 3 bytes ready in R12 1660 __ ldr(R10, Address(end_from, -4, pre_indexed), ne); 1661 __ orr(R12, R12, AsmOperand(R10, lsr,lsr_shift), ne); 1662 } 1663 1664 // Note: R12 contains enough bytes ready to write (3 needed at most) 1665 // write the 2 MSBs 1666 __ mov(R9, AsmOperand(R12, lsr, 16), ne); 1667 __ strh(R9, Address(end_to, -2, pre_indexed), ne); 1668 // promote remaining to MSB 1669 __ mov(R12, AsmOperand(R12, lsl, 16), ne); 1670 1671 __ tst(count, 1); 1672 // write the MSB of R12 1673 __ mov(R12, AsmOperand(R12, lsr, 24), ne); 1674 __ strb(R12, Address(end_to, -1, pre_indexed), ne); 1675 1676 break; 1677 } 1678 1679 __ BIND(L_done); 1680 return 0; // no minimum 1681 } 1682 1683 // This method is very useful for merging forward/backward implementations 1684 Address get_addr_with_indexing(Register base, int delta, bool forward) { 1685 if (forward) { 1686 return Address(base, delta, post_indexed); 1687 } else { 1688 return Address(base, -delta, pre_indexed); 1689 } 1690 } 1691 1692 void load_one(Register rd, Register from, int size_in_bytes, bool forward, AsmCondition cond = al, Register rd2 = noreg) { 1693 assert_different_registers(from, rd, rd2); 1694 if (size_in_bytes < 8) { 1695 Address addr = get_addr_with_indexing(from, size_in_bytes, forward); 1696 __ load_sized_value(rd, addr, size_in_bytes, false, cond); 1697 } else { 1698 assert (rd2 != noreg, "second value register must be specified"); 1699 assert (rd->encoding() < rd2->encoding(), "wrong value register set"); 1700 1701 if (forward) { 1702 __ ldmia(from, RegisterSet(rd) | rd2, writeback, cond); 1703 } else { 1704 __ ldmdb(from, RegisterSet(rd) | rd2, writeback, cond); 1705 } 1706 } 1707 } 1708 1709 void store_one(Register rd, Register to, int size_in_bytes, bool forward, AsmCondition cond = al, Register rd2 = noreg) { 1710 assert_different_registers(to, rd, rd2); 1711 if (size_in_bytes < 8) { 1712 Address addr = get_addr_with_indexing(to, size_in_bytes, forward); 1713 __ store_sized_value(rd, addr, size_in_bytes, cond); 1714 } else { 1715 assert (rd2 != noreg, "second value register must be specified"); 1716 assert (rd->encoding() < rd2->encoding(), "wrong value register set"); 1717 1718 if (forward) { 1719 __ stmia(to, RegisterSet(rd) | rd2, writeback, cond); 1720 } else { 1721 __ stmdb(to, RegisterSet(rd) | rd2, writeback, cond); 1722 } 1723 } 1724 } 1725 1726 // Copies data from 'from' to 'to' in specified direction to align 'from' by 64 bits. 1727 // (on 32-bit ARM 64-bit alignment is better for LDM). 
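  //
  // Illustrative sketch only (not emitted by the stub): for a forward copy with
  // bytes_per_count < 8, the alignment loop below behaves roughly like the
  // following C logic; 'copy_element' is a hypothetical helper standing for the
  // load_one/store_one pair, and the caller guarantees 'count' is large enough
  // for the loop to terminate.
  //
  //   while ((from & 7) != 0) {
  //     copy_element(&from, &to, bytes_per_count);  // advances 'from' and 'to'
  //     count--;
  //   }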
  //
  // Arguments:
  //     from:              beginning (if forward) or upper bound (if !forward) of the region to be read
  //     to:                beginning (if forward) or upper bound (if !forward) of the region to be written
  //     count:             32-bit int, maximum number of elements which can be copied
  //     bytes_per_count:   size of an element
  //     forward:           specifies copy direction
  //
  // Notes:
  //   'from' and 'to' must be aligned by 'bytes_per_count'
  //   'count' must not be less than the returned value
  //   shifts 'from' and 'to' by the number of copied bytes in corresponding direction
  //   decreases 'count' by the number of elements copied
  //
  // Returns the maximum number of elements which may be consumed by the alignment (at most 7 bytes' worth).
  int align_src(Register from, Register to, Register count, Register tmp, int bytes_per_count, bool forward) {
    assert_different_registers(from, to, count, tmp);
    if (bytes_per_count < 8) {
      Label L_align_src;
      __ BIND(L_align_src);
      __ tst(from, 7);
      // ne => not aligned: copy one element and (if bytes_per_count < 4) loop
      __ sub(count, count, 1, ne);
      load_one(tmp, from, bytes_per_count, forward, ne);
      store_one(tmp, to, bytes_per_count, forward, ne);
      if (bytes_per_count < 4) {
        __ b(L_align_src, ne); // if bytes_per_count == 4, then 0 or 1 loop iterations are enough
      }
    }
    return 7/bytes_per_count;
  }

  // Copies 'count' of 'bytes_per_count'-sized elements in the specified direction.
  //
  // Arguments:
  //     from:              beginning (if forward) or upper bound (if !forward) of the region to be read
  //     to:                beginning (if forward) or upper bound (if !forward) of the region to be written
  //     count:             32-bit int, number of elements to be copied
  //     entry:             copy loop entry point
  //     bytes_per_count:   size of an element
  //     forward:           specifies copy direction
  //
  // Notes:
  //   shifts 'from' and 'to'
  void copy_small_array(Register from, Register to, Register count, Register tmp, Register tmp2, int bytes_per_count, bool forward, Label & entry, bool unsafe_copy = false) {
    assert_different_registers(from, to, count, tmp);

    {
      // UnsafeMemoryAccess page error: continue after unsafe access
      UnsafeMemoryAccessMark umam(this, unsafe_copy, true);
      __ align(OptoLoopAlignment);
      Label L_small_loop;
      __ BIND(L_small_loop);
      store_one(tmp, to, bytes_per_count, forward, al, tmp2);
      __ BIND(entry); // entry point
      __ subs(count, count, 1);
      load_one(tmp, from, bytes_per_count, forward, ge, tmp2);
      __ b(L_small_loop, ge);
    }
  }

  // Aligns 'to' by reading one word from 'from' and writing its part to 'to'.
  //
  // Arguments:
  //     to:                beginning (if forward) or upper bound (if !forward) of the region to be written
  //     count:             32-bit int, number of elements allowed to be copied
  //     to_remainder:      remainder of dividing 'to' by wordSize
  //     bytes_per_count:   size of an element
  //     forward:           specifies copy direction
  //     Rval:              contains an already read but not yet written word;
  //                        its LSBs (if forward) or MSBs (if !forward) are to be written to align 'to'.
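  //
  // Worked example (sketch only, assuming wordSize == 4, a little-endian target,
  // forward copy and to_remainder == 1): bytes_to_write == 3, so align_dst emits
  // a 1-byte store of Rval's lowest byte followed by a 2-byte store of
  // (Rval >> 8), conceptually:
  //
  //   store_byte    (to,     val & 0xff);          // 'val' stands for the word in Rval
  //   store_halfword(to + 1, (val >> 8) & 0xffff);
  //
  // store_byte/store_halfword are hypothetical names used only for this example;
  // the real code goes through store_one() with post-indexed addressing.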
  //
  // Notes:
  //   'count' must not be less than the returned value
  //   'to' must be aligned by bytes_per_count but must not be aligned by wordSize
  //   shifts 'to' by the number of written bytes (so that it becomes the bound of memory to be written)
  //   decreases 'count' by the number of elements written
  //   Rval's MSBs or LSBs remain to be written further by generate_{forward,backward}_shifted_copy_loop
  int align_dst(Register to, Register count, Register Rval, Register tmp,
                int to_remainder, int bytes_per_count, bool forward) {
    assert_different_registers(to, count, tmp, Rval);

    assert (0 < to_remainder && to_remainder < wordSize, "to_remainder is not valid");
    assert (to_remainder % bytes_per_count == 0, "to must be aligned by bytes_per_count");

    int bytes_to_write = forward ? (wordSize - to_remainder) : to_remainder;

    int offset = 0;

    for (int l = 0; l < LogBytesPerWord; ++l) {
      int s = (1 << l);
      if (bytes_to_write & s) {
        int new_offset = offset + s*BitsPerByte;
        if (forward) {
          if (offset == 0) {
            store_one(Rval, to, s, forward);
          } else {
            __ logical_shift_right(tmp, Rval, offset);
            store_one(tmp, to, s, forward);
          }
        } else {
          __ logical_shift_right(tmp, Rval, BitsPerWord - new_offset);
          store_one(tmp, to, s, forward);
        }

        offset = new_offset;
      }
    }

    assert (offset == bytes_to_write * BitsPerByte, "all bytes must be copied");

    __ sub_32(count, count, bytes_to_write/bytes_per_count);

    return bytes_to_write / bytes_per_count;
  }

  // Copies 'count' of elements using shifted copy loop
  //
  // Arguments:
  //     from:              beginning (if forward) or upper bound (if !forward) of the region to be read
  //     to:                beginning (if forward) or upper bound (if !forward) of the region to be written
  //     count:             32-bit int, number of elements to be copied
  //     to_remainder:      remainder of dividing 'to' by wordSize
  //     bytes_per_count:   size of an element
  //     forward:           specifies copy direction
  //     Rval:              contains an already read but not yet written word
  //
  // Notes:
  //   'count' must not be less than the returned value
  //   'from' must be aligned by wordSize
  //   'to' must be aligned by bytes_per_count but must not be aligned by wordSize
  //   shifts 'to' by the number of copied bytes
  //
  // Scratches R3-R10, R12
  int align_dst_and_generate_shifted_copy_loop(Register from, Register to, Register count, Register Rval,
                                               int to_remainder, int bytes_per_count, bool forward) {

    assert (0 < to_remainder && to_remainder < wordSize, "to_remainder is invalid");

    const Register tmp = forward ? R3 : R12;
    assert_different_registers(from, to, count, Rval, tmp);

    int required_to_align = align_dst(to, count, Rval, tmp, to_remainder, bytes_per_count, forward);

    int lsr_shift = (wordSize - to_remainder) * BitsPerByte;
    int lsl_shift = to_remainder * BitsPerByte;

    int min_copy;
    if (forward) {
      min_copy = generate_forward_shifted_copy_loop(from, to, count, bytes_per_count, lsr_shift, lsl_shift);
    } else {
      min_copy = generate_backward_shifted_copy_loop(from, to, count, bytes_per_count, lsr_shift, lsl_shift);
    }

    return min_copy + required_to_align;
  }

  // Copies 'count' of elements using shifted copy loop
  //
  // Arguments:
  //     from:              beginning (if forward) or upper bound (if !forward) of the region to be read
  //     to:                beginning (if forward) or upper bound (if !forward) of the region to be written
  //     count:             32-bit int, number of elements to be copied
  //     bytes_per_count:   size of an element
  //     forward:           specifies copy direction
  //
  // Notes:
  //   'count' must not be less than the returned value
  //   'from' must be aligned by wordSize
  //   'to' must be aligned by bytes_per_count but must not be aligned by wordSize
  //   shifts 'to' by the number of copied bytes
  //
  // Scratches 'from', 'count', R3 and R12.
  // R4-R10 saved for use.
  int align_dst_and_generate_shifted_copy_loop(Register from, Register to, Register count, int bytes_per_count, bool forward, bool unsafe_copy = false) {

    const Register Rval = forward ? R12 : R3; // as generate_{forward,backward}_shifted_copy_loop expect

    int min_copy = 0;

    // Note: if {seq} is a sequence of numbers, L{seq} means that if the execution reaches this point,
    // then the remainder of 'to' divided by wordSize is one of elements of {seq}.
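    // Illustrative note (comment only): once 'to' is word aligned, the shifted
    // copy loops assemble each aligned destination word from two neighbouring
    // source words. For a forward copy on this little-endian target the merge
    // is, conceptually,
    //
    //   dst_word = (prev >> lsr_shift) | (next << lsl_shift);
    //
    // where 'prev' is the previously loaded word whose low bytes have already
    // been written, 'next' is the freshly loaded word, and lsr_shift/lsl_shift
    // are computed from to_remainder in the helper above. 'prev', 'next' and
    // 'dst_word' are names used only for this sketch.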
1911 1912 __ push(RegisterSet(R4,R10)); 1913 1914 { 1915 // UnsafeMemoryAccess page error: continue after unsafe access 1916 UnsafeMemoryAccessMark umam(this, unsafe_copy, true); 1917 load_one(Rval, from, wordSize, forward); 1918 1919 switch (bytes_per_count) { 1920 case 2: 1921 min_copy = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 2, bytes_per_count, forward); 1922 break; 1923 case 1: 1924 { 1925 Label L1, L2, L3; 1926 int min_copy1, min_copy2, min_copy3; 1927 1928 Label L_loop_finished; 1929 1930 if (forward) { 1931 __ tbz(to, 0, L2); 1932 __ tbz(to, 1, L1); 1933 1934 __ BIND(L3); 1935 min_copy3 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 3, bytes_per_count, forward); 1936 __ b(L_loop_finished); 1937 1938 __ BIND(L1); 1939 min_copy1 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 1, bytes_per_count, forward); 1940 __ b(L_loop_finished); 1941 1942 __ BIND(L2); 1943 min_copy2 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 2, bytes_per_count, forward); 1944 } else { 1945 __ tbz(to, 0, L2); 1946 __ tbnz(to, 1, L3); 1947 1948 __ BIND(L1); 1949 min_copy1 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 1, bytes_per_count, forward); 1950 __ b(L_loop_finished); 1951 1952 __ BIND(L3); 1953 min_copy3 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 3, bytes_per_count, forward); 1954 __ b(L_loop_finished); 1955 1956 __ BIND(L2); 1957 min_copy2 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 2, bytes_per_count, forward); 1958 } 1959 1960 min_copy = MAX2(MAX2(min_copy1, min_copy2), min_copy3); 1961 1962 __ BIND(L_loop_finished); 1963 1964 break; 1965 } 1966 default: 1967 ShouldNotReachHere(); 1968 break; 1969 } 1970 } 1971 __ pop(RegisterSet(R4,R10)); 1972 1973 return min_copy; 1974 } 1975 1976 #ifndef PRODUCT 1977 uint * get_arraycopy_counter(int bytes_per_count) { 1978 switch (bytes_per_count) { 1979 case 1: 1980 return &SharedRuntime::_jbyte_array_copy_ctr; 1981 case 2: 1982 return &SharedRuntime::_jshort_array_copy_ctr; 1983 case 4: 1984 return &SharedRuntime::_jint_array_copy_ctr; 1985 case 8: 1986 return &SharedRuntime::_jlong_array_copy_ctr; 1987 default: 1988 ShouldNotReachHere(); 1989 return nullptr; 1990 } 1991 } 1992 #endif // !PRODUCT 1993 1994 address generate_unsafecopy_common_error_exit() { 1995 address start_pc = __ pc(); 1996 __ mov(R0, 0); 1997 __ ret(); 1998 return start_pc; 1999 } 2000 2001 /* Internal development flag */ 2002 /* enabled by defining TEST_C2_GENERIC_ARRAYCOPY */ 2003 2004 // With this flag, the C2 stubs are tested by generating calls to 2005 // generic_arraycopy instead of Runtime1::arraycopy 2006 2007 // Runtime1::arraycopy return a status in R0 (0 if OK, else ~copied) 2008 // and the result is tested to see whether the arraycopy stub should 2009 // be called. 2010 2011 // When we test arraycopy this way, we must generate extra code in the 2012 // arraycopy methods callable from C2 generic_arraycopy to set the 2013 // status to 0 for those who always succeed (calling the slow path stub might 2014 // lead to errors since the copy has already been performed). 2015 2016 static const bool set_status; 2017 2018 // 2019 // Generate stub for primitive array copy. If "aligned" is true, the 2020 // "from" and "to" addresses are assumed to be heapword aligned. 2021 // 2022 // If "disjoint" is true, arrays are assumed to be disjoint, otherwise they may overlap and 2023 // "nooverlap_target" must be specified as the address to jump if they don't. 
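  //
  // Conceptual view (sketch only, not the emitted code): array_overlap_test
  // branches to 'nooverlap_target' when a forward copy is safe, roughly
  //
  //   if ((size_t)(to - from) >= ((size_t)count << log2_element_size)) {
  //     goto nooverlap_target;   // no harmful overlap: the disjoint stub copies forward
  //   }
  //   // otherwise fall through here and copy backward
  //
  // 'log2_element_size' is a placeholder for the exact_log2(bytes_per_count)
  // value passed to array_overlap_test below.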
2024 // 2025 // Arguments for generated stub: 2026 // from: R0 2027 // to: R1 2028 // count: R2 treated as signed 32-bit int 2029 // 2030 address generate_primitive_copy(StubGenStubId stub_id, address nooverlap_target = nullptr) { 2031 bool aligned; 2032 bool status; 2033 int bytes_per_count; 2034 bool disjoint; 2035 2036 switch (stub_id) { 2037 case jbyte_disjoint_arraycopy_id: 2038 aligned = false; 2039 status = true; 2040 bytes_per_count = 1; 2041 disjoint = true; 2042 break; 2043 case jshort_disjoint_arraycopy_id: 2044 aligned = false; 2045 status = true; 2046 bytes_per_count = 2; 2047 disjoint = true; 2048 break; 2049 case jint_disjoint_arraycopy_id: 2050 aligned = false; 2051 status = true; 2052 bytes_per_count = 4; 2053 disjoint = true; 2054 break; 2055 case jlong_disjoint_arraycopy_id: 2056 aligned = false; 2057 status = true; 2058 bytes_per_count = 8; 2059 disjoint = true; 2060 break; 2061 case arrayof_jbyte_disjoint_arraycopy_id: 2062 aligned = true; 2063 status = set_status; 2064 bytes_per_count = 1; 2065 disjoint = true; 2066 break; 2067 case arrayof_jshort_disjoint_arraycopy_id: 2068 aligned = true; 2069 status = set_status; 2070 bytes_per_count = 2; 2071 disjoint = true; 2072 break; 2073 case arrayof_jint_disjoint_arraycopy_id: 2074 aligned = true; 2075 status = set_status; 2076 bytes_per_count = 4; 2077 disjoint = true; 2078 break; 2079 case arrayof_jlong_disjoint_arraycopy_id: 2080 aligned = false; 2081 status = set_status; 2082 bytes_per_count = 8; 2083 disjoint = true; 2084 break; 2085 case jbyte_arraycopy_id: 2086 aligned = false; 2087 status = true; 2088 bytes_per_count = 1; 2089 disjoint = false; 2090 break; 2091 case jshort_arraycopy_id: 2092 aligned = false; 2093 status = true; 2094 bytes_per_count = 2; 2095 disjoint = false; 2096 break; 2097 case jint_arraycopy_id: 2098 aligned = false; 2099 status = true; 2100 bytes_per_count = 4; 2101 disjoint = false; 2102 break; 2103 case jlong_arraycopy_id: 2104 aligned = false; 2105 status = true; 2106 bytes_per_count = 8; 2107 disjoint = false; 2108 break; 2109 case arrayof_jbyte_arraycopy_id: 2110 aligned = true; 2111 status = set_status; 2112 bytes_per_count = 1; 2113 disjoint = false; 2114 break; 2115 case arrayof_jshort_arraycopy_id: 2116 aligned = true; 2117 status = set_status; 2118 bytes_per_count = 2; 2119 disjoint = false; 2120 break; 2121 case arrayof_jint_arraycopy_id: 2122 aligned = true; 2123 status = set_status; 2124 bytes_per_count = 4; 2125 disjoint = false; 2126 break; 2127 default: 2128 ShouldNotReachHere(); 2129 } 2130 2131 __ align(CodeEntryAlignment); 2132 StubCodeMark mark(this, stub_id); 2133 address start = __ pc(); 2134 2135 const Register from = R0; // source array address 2136 const Register to = R1; // destination array address 2137 const Register count = R2; // elements count 2138 const Register tmp1 = R3; 2139 const Register tmp2 = R12; 2140 2141 if (!aligned) { 2142 BLOCK_COMMENT("Entry:"); 2143 } 2144 2145 __ zap_high_non_significant_bits(R2); 2146 2147 if (!disjoint) { 2148 assert (nooverlap_target != nullptr, "must be specified for conjoint case"); 2149 array_overlap_test(nooverlap_target, exact_log2(bytes_per_count), tmp1, tmp2); 2150 } 2151 2152 inc_counter_np(*get_arraycopy_counter(bytes_per_count), tmp1, tmp2); 2153 2154 // Conjoint case: since execution reaches this point, the arrays overlap, so performing backward copy 2155 // Disjoint case: perform forward copy 2156 bool forward = disjoint; 2157 2158 2159 if (!forward) { 2160 // Set 'from' and 'to' to upper bounds 2161 int 
log_bytes_per_count = exact_log2(bytes_per_count); 2162 __ add_ptr_scaled_int32(to, to, count, log_bytes_per_count); 2163 __ add_ptr_scaled_int32(from, from, count, log_bytes_per_count); 2164 } 2165 2166 // There are two main copy loop implementations: 2167 // *) The huge and complex one applicable only for large enough arrays 2168 // *) The small and simple one applicable for any array (but not efficient for large arrays). 2169 // Currently "small" implementation is used if and only if the "large" one could not be used. 2170 // XXX optim: tune the limit higher ? 2171 // Large implementation lower applicability bound is actually determined by 2172 // aligned copy loop which require <=7 bytes for src alignment, and 8 words for aligned copy loop. 2173 const int small_copy_limit = (8*wordSize + 7) / bytes_per_count; 2174 2175 Label L_small_array; 2176 __ cmp_32(count, small_copy_limit); 2177 __ b(L_small_array, le); 2178 2179 // Otherwise proceed with large implementation. 2180 2181 bool from_is_aligned = (bytes_per_count >= 8); 2182 if (aligned && forward && (HeapWordSize % 8 == 0)) { 2183 // if 'from' is heapword aligned and HeapWordSize is divisible by 8, 2184 // then from is aligned by 8 2185 from_is_aligned = true; 2186 } 2187 2188 int count_required_to_align = 0; 2189 { 2190 // UnsafeMemoryAccessMark page error: continue at UnsafeMemoryAccess common_error_exit 2191 UnsafeMemoryAccessMark umam(this, !aligned, false); 2192 count_required_to_align = from_is_aligned ? 0 : align_src(from, to, count, tmp1, bytes_per_count, forward); 2193 assert (small_copy_limit >= count_required_to_align, "alignment could exhaust count"); 2194 } 2195 2196 // now 'from' is aligned 2197 2198 bool to_is_aligned = false; 2199 2200 if (bytes_per_count >= wordSize) { 2201 // 'to' is aligned by bytes_per_count, so it is aligned by wordSize 2202 to_is_aligned = true; 2203 } else { 2204 if (aligned && (8 % HeapWordSize == 0) && (HeapWordSize % wordSize == 0)) { 2205 // Originally 'from' and 'to' were heapword aligned; 2206 // (from - to) has not been changed, so since now 'from' is 8-byte aligned, then it is also heapword aligned, 2207 // so 'to' is also heapword aligned and thus aligned by wordSize. 2208 to_is_aligned = true; 2209 } 2210 } 2211 2212 Label L_unaligned_dst; 2213 2214 if (!to_is_aligned) { 2215 BLOCK_COMMENT("Check dst alignment:"); 2216 __ tst(to, wordSize - 1); 2217 __ b(L_unaligned_dst, ne); // 'to' is not aligned 2218 } 2219 2220 // 'from' and 'to' are properly aligned 2221 2222 int min_copy; 2223 if (forward) { 2224 min_copy = generate_forward_aligned_copy_loop(from, to, count, bytes_per_count, !aligned /*add UnsafeMemoryAccess entry*/); 2225 } else { 2226 min_copy = generate_backward_aligned_copy_loop(from, to, count, bytes_per_count, !aligned /*add UnsafeMemoryAccess entry*/); 2227 } 2228 assert(small_copy_limit >= count_required_to_align + min_copy, "first loop might exhaust count"); 2229 2230 if (status) { 2231 __ mov(R0, 0); // OK 2232 } 2233 2234 __ ret(); 2235 2236 { 2237 copy_small_array(from, to, count, tmp1, tmp2, bytes_per_count, forward, L_small_array /* entry */, !aligned /*add UnsafeMemoryAccess entry*/); 2238 2239 if (status) { 2240 __ mov(R0, 0); // OK 2241 } 2242 2243 __ ret(); 2244 } 2245 2246 if (! 
to_is_aligned) {
      __ BIND(L_unaligned_dst);
      int min_copy_shifted = align_dst_and_generate_shifted_copy_loop(from, to, count, bytes_per_count, forward, !aligned /*add UnsafeMemoryAccess entry*/);
      assert (small_copy_limit >= count_required_to_align + min_copy_shifted, "first loop might exhaust count");

      if (status) {
        __ mov(R0, 0); // OK
      }

      __ ret();
    }

    return start;
  }


  // Generates pattern of code to be placed after raw data copying in generate_oop_copy
  // Includes return from arraycopy stub.
  //
  // Arguments:
  //     to:    destination pointer after copying.
  //            if 'forward' then 'to' == upper bound, else 'to' == beginning of the modified region
  //     count: total number of copied elements, 32-bit int
  //
  // Blows all volatile registers (R0-R3, Rtemp, LR) and the 'to', 'count', 'tmp' registers.
  void oop_arraycopy_stub_epilogue_helper(Register to, Register count, Register tmp, bool status, bool forward, DecoratorSet decorators) {
    assert_different_registers(to, count, tmp);

    if (forward) {
      // 'to' is upper bound of the modified region
      // restore initial dst:
      __ sub_ptr_scaled_int32(to, to, count, LogBytesPerHeapOop);
    }

    // 'to' is the beginning of the region

    BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
    bs->arraycopy_epilogue(_masm, decorators, true, to, count, tmp);

    if (status) {
      __ mov(R0, 0); // OK
    }

    __ pop(PC);
  }


  // Generate stub for assign-compatible oop copy. If "aligned" is true, the
  // "from" and "to" addresses are assumed to be heapword aligned.
  //
  // If "disjoint" is true, arrays are assumed to be disjoint, otherwise they may overlap and
  // "nooverlap_target" must be specified as the address to jump if they don't.
2298 // 2299 // Arguments for generated stub: 2300 // from: R0 2301 // to: R1 2302 // count: R2 treated as signed 32-bit int 2303 // 2304 address generate_oop_copy(StubGenStubId stub_id, address nooverlap_target = nullptr) { 2305 bool aligned; 2306 bool status; 2307 bool disjoint; 2308 2309 switch (stub_id) { 2310 case oop_disjoint_arraycopy_id: 2311 aligned = false; 2312 status = true; 2313 disjoint = true; 2314 break; 2315 case arrayof_oop_disjoint_arraycopy_id: 2316 aligned = true; 2317 status = set_status; 2318 disjoint = true; 2319 break; 2320 case oop_arraycopy_id: 2321 aligned = false; 2322 status = true; 2323 disjoint = false; 2324 break; 2325 case arrayof_oop_arraycopy_id: 2326 aligned = true; 2327 status = set_status; 2328 disjoint = false; 2329 break; 2330 default: 2331 ShouldNotReachHere(); 2332 } 2333 2334 __ align(CodeEntryAlignment); 2335 StubCodeMark mark(this, stub_id); 2336 address start = __ pc(); 2337 2338 Register from = R0; 2339 Register to = R1; 2340 Register count = R2; 2341 Register tmp1 = R3; 2342 Register tmp2 = R12; 2343 2344 2345 if (!aligned) { 2346 BLOCK_COMMENT("Entry:"); 2347 } 2348 2349 __ zap_high_non_significant_bits(R2); 2350 2351 if (!disjoint) { 2352 assert (nooverlap_target != nullptr, "must be specified for conjoint case"); 2353 array_overlap_test(nooverlap_target, LogBytesPerHeapOop, tmp1, tmp2); 2354 } 2355 2356 inc_counter_np(SharedRuntime::_oop_array_copy_ctr, tmp1, tmp2); 2357 2358 // Conjoint case: since execution reaches this point, the arrays overlap, so performing backward copy 2359 // Disjoint case: perform forward copy 2360 bool forward = disjoint; 2361 2362 const int bytes_per_count = BytesPerHeapOop; 2363 const int log_bytes_per_count = LogBytesPerHeapOop; 2364 2365 const Register saved_count = LR; 2366 const int callee_saved_regs = 3; // R0-R2 2367 2368 // LR is used later to save barrier args 2369 __ push(LR); 2370 2371 DecoratorSet decorators = IN_HEAP | IS_ARRAY; 2372 if (disjoint) { 2373 decorators |= ARRAYCOPY_DISJOINT; 2374 } 2375 if (aligned) { 2376 decorators |= ARRAYCOPY_ALIGNED; 2377 } 2378 2379 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 2380 bs->arraycopy_prologue(_masm, decorators, true, to, count, callee_saved_regs); 2381 2382 // save arguments for barrier generation (after the pre barrier) 2383 __ mov(saved_count, count); 2384 2385 if (!forward) { 2386 __ add_ptr_scaled_int32(to, to, count, log_bytes_per_count); 2387 __ add_ptr_scaled_int32(from, from, count, log_bytes_per_count); 2388 } 2389 2390 // for short arrays, just do single element copy 2391 Label L_small_array; 2392 const int small_copy_limit = (8*wordSize + 7)/bytes_per_count; // XXX optim: tune the limit higher ? 2393 __ cmp_32(count, small_copy_limit); 2394 __ b(L_small_array, le); 2395 2396 bool from_is_aligned = (bytes_per_count >= 8); 2397 if (aligned && forward && (HeapWordSize % 8 == 0)) { 2398 // if 'from' is heapword aligned and HeapWordSize is divisible by 8, 2399 // then from is aligned by 8 2400 from_is_aligned = true; 2401 } 2402 2403 int count_required_to_align = from_is_aligned ? 
0 : align_src(from, to, count, tmp1, bytes_per_count, forward); 2404 assert (small_copy_limit >= count_required_to_align, "alignment could exhaust count"); 2405 2406 // now 'from' is aligned 2407 2408 bool to_is_aligned = false; 2409 2410 if (bytes_per_count >= wordSize) { 2411 // 'to' is aligned by bytes_per_count, so it is aligned by wordSize 2412 to_is_aligned = true; 2413 } else { 2414 if (aligned && (8 % HeapWordSize == 0) && (HeapWordSize % wordSize == 0)) { 2415 // Originally 'from' and 'to' were heapword aligned; 2416 // (from - to) has not been changed, so since now 'from' is 8-byte aligned, then it is also heapword aligned, 2417 // so 'to' is also heapword aligned and thus aligned by wordSize. 2418 to_is_aligned = true; 2419 } 2420 } 2421 2422 Label L_unaligned_dst; 2423 2424 if (!to_is_aligned) { 2425 BLOCK_COMMENT("Check dst alignment:"); 2426 __ tst(to, wordSize - 1); 2427 __ b(L_unaligned_dst, ne); // 'to' is not aligned 2428 } 2429 2430 int min_copy; 2431 if (forward) { 2432 min_copy = generate_forward_aligned_copy_loop(from, to, count, bytes_per_count); 2433 } else { 2434 min_copy = generate_backward_aligned_copy_loop(from, to, count, bytes_per_count); 2435 } 2436 assert(small_copy_limit >= count_required_to_align + min_copy, "first loop might exhaust count"); 2437 2438 oop_arraycopy_stub_epilogue_helper(to, saved_count, /* tmp */ tmp1, status, forward, decorators); 2439 2440 { 2441 copy_small_array(from, to, count, tmp1, noreg, bytes_per_count, forward, L_small_array); 2442 2443 oop_arraycopy_stub_epilogue_helper(to, saved_count, /* tmp */ tmp1, status, forward, decorators); 2444 } 2445 2446 if (!to_is_aligned) { 2447 __ BIND(L_unaligned_dst); 2448 ShouldNotReachHere(); 2449 int min_copy_shifted = align_dst_and_generate_shifted_copy_loop(from, to, count, bytes_per_count, forward); 2450 assert (small_copy_limit >= count_required_to_align + min_copy_shifted, "first loop might exhaust count"); 2451 2452 oop_arraycopy_stub_epilogue_helper(to, saved_count, /* tmp */ tmp1, status, forward, decorators); 2453 } 2454 2455 return start; 2456 } 2457 2458 // Generate 'unsafe' array copy stub 2459 // Though just as safe as the other stubs, it takes an unscaled 2460 // size_t argument instead of an element count. 2461 // 2462 // Arguments for generated stub: 2463 // from: R0 2464 // to: R1 2465 // count: R2 byte count, treated as ssize_t, can be zero 2466 // 2467 // Examines the alignment of the operands and dispatches 2468 // to a long, int, short, or byte copy loop. 
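  //
  // Dispatch sketch (illustrative only, mirroring the tests emitted below):
  //
  //   size_t bits = (size_t)from | (size_t)to | (size_t)count;
  //   if ((bits & (BytesPerLong  - 1)) == 0) return jlong_arraycopy (from, to, count >> LogBytesPerLong);
  //   if ((bits & (BytesPerInt   - 1)) == 0) return jint_arraycopy  (from, to, count >> LogBytesPerInt);
  //   if ((bits & (BytesPerShort - 1)) == 0) return jshort_arraycopy(from, to, count >> LogBytesPerShort);
  //   return jbyte_arraycopy(from, to, count);
  //
  // The jxxx_arraycopy calls stand in for the tail-jumps to the corresponding
  // StubRoutines entries; the byte count is rescaled to an element count first.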
2469 // 2470 address generate_unsafe_copy() { 2471 2472 const Register R0_from = R0; // source array address 2473 const Register R1_to = R1; // destination array address 2474 const Register R2_count = R2; // elements count 2475 2476 const Register R3_bits = R3; // test copy of low bits 2477 2478 __ align(CodeEntryAlignment); 2479 StubGenStubId stub_id = StubGenStubId::unsafe_arraycopy_id; 2480 StubCodeMark mark(this, stub_id); 2481 address start = __ pc(); 2482 const Register tmp = Rtemp; 2483 2484 // bump this on entry, not on exit: 2485 inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr, R3, tmp); 2486 2487 __ orr(R3_bits, R0_from, R1_to); 2488 __ orr(R3_bits, R2_count, R3_bits); 2489 2490 __ tst(R3_bits, BytesPerLong-1); 2491 __ mov(R2_count,AsmOperand(R2_count,asr,LogBytesPerLong), eq); 2492 __ jump(StubRoutines::_jlong_arraycopy, relocInfo::runtime_call_type, tmp, eq); 2493 2494 __ tst(R3_bits, BytesPerInt-1); 2495 __ mov(R2_count,AsmOperand(R2_count,asr,LogBytesPerInt), eq); 2496 __ jump(StubRoutines::_jint_arraycopy, relocInfo::runtime_call_type, tmp, eq); 2497 2498 __ tst(R3_bits, BytesPerShort-1); 2499 __ mov(R2_count,AsmOperand(R2_count,asr,LogBytesPerShort), eq); 2500 __ jump(StubRoutines::_jshort_arraycopy, relocInfo::runtime_call_type, tmp, eq); 2501 2502 __ jump(StubRoutines::_jbyte_arraycopy, relocInfo::runtime_call_type, tmp); 2503 return start; 2504 } 2505 2506 // Helper for generating a dynamic type check. 2507 // Smashes only the given temp registers. 2508 void generate_type_check(Register sub_klass, 2509 Register super_check_offset, 2510 Register super_klass, 2511 Register tmp1, 2512 Register tmp2, 2513 Register tmp3, 2514 Label& L_success) { 2515 assert_different_registers(sub_klass, super_check_offset, super_klass, tmp1, tmp2, tmp3); 2516 2517 BLOCK_COMMENT("type_check:"); 2518 2519 // If the pointers are equal, we are done (e.g., String[] elements). 2520 2521 __ cmp(super_klass, sub_klass); 2522 __ b(L_success, eq); // fast success 2523 2524 2525 Label L_loop, L_fail; 2526 2527 int sc_offset = in_bytes(Klass::secondary_super_cache_offset()); 2528 2529 // Check the supertype display: 2530 __ ldr(tmp1, Address(sub_klass, super_check_offset)); 2531 __ cmp(tmp1, super_klass); 2532 __ b(L_success, eq); 2533 2534 __ cmp(super_check_offset, sc_offset); 2535 __ b(L_fail, ne); // failure 2536 2537 BLOCK_COMMENT("type_check_slow_path:"); 2538 2539 // a couple of useful fields in sub_klass: 2540 int ss_offset = in_bytes(Klass::secondary_supers_offset()); 2541 2542 // Do a linear scan of the secondary super-klass chain. 2543 2544 #ifndef PRODUCT 2545 uint* pst_counter = &SharedRuntime::_partial_subtype_ctr; 2546 __ inc_counter((address) pst_counter, tmp1, tmp2); 2547 #endif 2548 2549 Register scan_temp = tmp1; 2550 Register count_temp = tmp2; 2551 2552 // We will consult the secondary-super array. 2553 __ ldr(scan_temp, Address(sub_klass, ss_offset)); 2554 2555 Register search_key = super_klass; 2556 2557 // Load the array length. 2558 __ ldr_s32(count_temp, Address(scan_temp, Array<Klass*>::length_offset_in_bytes())); 2559 __ add(scan_temp, scan_temp, Array<Klass*>::base_offset_in_bytes()); 2560 2561 __ add(count_temp, count_temp, 1); 2562 2563 // Top of search loop 2564 __ bind(L_loop); 2565 // Notes: 2566 // scan_temp starts at the array elements 2567 // count_temp is 1+size 2568 2569 __ subs(count_temp, count_temp, 1); 2570 __ b(L_fail, eq); // not found 2571 2572 // Load next super to check 2573 // In the array of super classes elements are pointer sized. 
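    // Conceptual C equivalent of this slow-path scan (sketch only; the real
    // Klass/Array layouts are accessed via the offsets loaded above):
    //
    //   for (int i = 0; i < secondary_supers->length(); i++) {
    //     if (secondary_supers->at(i) == super_klass) {
    //       sub_klass->set_secondary_super_cache(super_klass);  // cache the hit
    //       goto L_success;
    //     }
    //   }
    //   goto L_fail;
    //
    // length/at/set_secondary_super_cache are named loosely here; the emitted
    // code below walks the array with a post-indexed load and a decrementing count.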
2574 int element_size = wordSize; 2575 __ ldr(tmp3, Address(scan_temp, element_size, post_indexed)); 2576 2577 // Look for Rsuper_klass on Rsub_klass's secondary super-class-overflow list 2578 __ cmp(tmp3, search_key); 2579 2580 // A miss means we are NOT a subtype and need to keep looping 2581 __ b(L_loop, ne); 2582 2583 // Falling out the bottom means we found a hit; we ARE a subtype 2584 2585 // Success. Cache the super we found and proceed in triumph. 2586 __ str(super_klass, Address(sub_klass, sc_offset)); 2587 2588 // Jump to success 2589 __ b(L_success); 2590 2591 // Fall through on failure! 2592 __ bind(L_fail); 2593 } 2594 2595 // Generate stub for checked oop copy. 2596 // 2597 // Arguments for generated stub: 2598 // from: R0 2599 // to: R1 2600 // count: R2 treated as signed 32-bit int 2601 // ckoff: R3 (super_check_offset) 2602 // ckval: R4 (super_klass) 2603 // ret: R0 zero for success; (-1^K) where K is partial transfer count (32-bit) 2604 // 2605 address generate_checkcast_copy() { 2606 __ align(CodeEntryAlignment); 2607 StubGenStubId stub_id = StubGenStubId::checkcast_arraycopy_id; 2608 StubCodeMark mark(this, stub_id); 2609 address start = __ pc(); 2610 2611 const Register from = R0; // source array address 2612 const Register to = R1; // destination array address 2613 const Register count = R2; // elements count 2614 2615 const Register R3_ckoff = R3; // super_check_offset 2616 const Register R4_ckval = R4; // super_klass 2617 2618 const int callee_saved_regs = 4; // LR saved differently 2619 2620 Label load_element, store_element, do_epilogue, fail; 2621 2622 BLOCK_COMMENT("Entry:"); 2623 2624 __ zap_high_non_significant_bits(R2); 2625 2626 int pushed = 0; 2627 __ push(LR); 2628 pushed+=1; 2629 2630 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST; 2631 2632 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 2633 bs->arraycopy_prologue(_masm, decorators, true, to, count, callee_saved_regs); 2634 2635 const RegisterSet caller_saved_regs = RegisterSet(R4,R6) | RegisterSet(R8,R9) | altFP_7_11; 2636 __ push(caller_saved_regs); 2637 assert(caller_saved_regs.size() == 6, "check the count"); 2638 pushed+=6; 2639 2640 __ ldr(R4_ckval,Address(SP, wordSize*pushed)); // read the argument that was on the stack 2641 2642 // Save arguments for barrier generation (after the pre barrier): 2643 // - must be a caller saved register and not LR 2644 // - ARM32: avoid R10 in case RThread is needed 2645 const Register saved_count = altFP_7_11; 2646 __ movs(saved_count, count); // and test count 2647 __ b(load_element,ne); 2648 2649 // nothing to copy 2650 __ mov(R0, 0); 2651 2652 __ pop(caller_saved_regs); 2653 __ pop(PC); 2654 2655 // ======== begin loop ======== 2656 // (Loop is rotated; its entry is load_element.) 
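    // Rotated-loop sketch (comment only): in C-like pseudocode the element-wise
    // checked copy below is roughly
    //
    //   goto load_element;
    //   store_element: *to++ = oop; if (--count == 0) goto do_epilogue;
    //   load_element:  oop = *from++;
    //                  if (oop == nullptr || oop's klass is a subtype of ckval) goto store_element;
    //                  goto fail;   // partial copy; the epilogue reports ~copied
    //
    // The names above are informal; the real subtype test is generate_type_check().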
2657 __ align(OptoLoopAlignment); 2658 __ BIND(store_element); 2659 if (UseCompressedOops) { 2660 __ store_heap_oop(Address(to, BytesPerHeapOop, post_indexed), R5); // store the oop, changes flags 2661 __ subs_32(count,count,1); 2662 } else { 2663 __ subs_32(count,count,1); 2664 __ str(R5, Address(to, BytesPerHeapOop, post_indexed)); // store the oop 2665 } 2666 __ b(do_epilogue, eq); // count exhausted 2667 2668 // ======== loop entry is here ======== 2669 __ BIND(load_element); 2670 __ load_heap_oop(R5, Address(from, BytesPerHeapOop, post_indexed)); // load the oop 2671 __ cbz(R5, store_element); // null 2672 2673 __ load_klass(R6, R5); 2674 2675 generate_type_check(R6, R3_ckoff, R4_ckval, /*tmps*/ R12, R8, R9, 2676 // branch to this on success: 2677 store_element); 2678 // ======== end loop ======== 2679 2680 // It was a real error; we must depend on the caller to finish the job. 2681 // Register count has number of *remaining* oops, saved_count number of *total* oops. 2682 // Emit GC store barriers for the oops we have copied 2683 // and report their number to the caller (0 or (-1^n)) 2684 __ BIND(fail); 2685 2686 // Note: fail marked by the fact that count differs from saved_count 2687 2688 __ BIND(do_epilogue); 2689 2690 Register copied = R4; // saved 2691 Label L_not_copied; 2692 2693 __ subs_32(copied, saved_count, count); // copied count (in saved reg) 2694 __ b(L_not_copied, eq); // nothing was copied, skip post barrier 2695 __ sub(to, to, AsmOperand(copied, lsl, LogBytesPerHeapOop)); // initial to value 2696 __ mov(R12, copied); // count arg scratched by post barrier 2697 2698 bs->arraycopy_epilogue(_masm, decorators, true, to, R12, R3); 2699 2700 assert_different_registers(R3,R12,LR,copied,saved_count); 2701 inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr, R3, R12); 2702 2703 __ BIND(L_not_copied); 2704 __ cmp_32(copied, saved_count); // values preserved in saved registers 2705 2706 __ mov(R0, 0, eq); // 0 if all copied 2707 __ mvn(R0, copied, ne); // else NOT(copied) 2708 __ pop(caller_saved_regs); 2709 __ pop(PC); 2710 2711 return start; 2712 } 2713 2714 // Perform range checks on the proposed arraycopy. 2715 // Kills the two temps, but nothing else. 
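  //
  // Equivalent checks in C (sketch only; it relies on src_pos/dst_pos/length
  // having been verified as non-negative by the caller, so a single unsigned
  // compare also covers overflow of the 32-bit sum):
  //
  //   if ((juint)(src_pos + length) > (juint)src->length()) goto L_failed;
  //   if ((juint)(dst_pos + length) > (juint)dst->length()) goto L_failed;
  //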
2716 void arraycopy_range_checks(Register src, // source array oop 2717 Register src_pos, // source position (32-bit int) 2718 Register dst, // destination array oop 2719 Register dst_pos, // destination position (32-bit int) 2720 Register length, // length of copy (32-bit int) 2721 Register temp1, Register temp2, 2722 Label& L_failed) { 2723 2724 BLOCK_COMMENT("arraycopy_range_checks:"); 2725 2726 // if (src_pos + length > arrayOop(src)->length() ) FAIL; 2727 2728 const Register array_length = temp1; // scratch 2729 const Register end_pos = temp2; // scratch 2730 2731 __ add_32(end_pos, length, src_pos); // src_pos + length 2732 __ ldr_s32(array_length, Address(src, arrayOopDesc::length_offset_in_bytes())); 2733 __ cmp_32(end_pos, array_length); 2734 __ b(L_failed, hi); 2735 2736 // if (dst_pos + length > arrayOop(dst)->length() ) FAIL; 2737 __ add_32(end_pos, length, dst_pos); // dst_pos + length 2738 __ ldr_s32(array_length, Address(dst, arrayOopDesc::length_offset_in_bytes())); 2739 __ cmp_32(end_pos, array_length); 2740 __ b(L_failed, hi); 2741 2742 BLOCK_COMMENT("arraycopy_range_checks done"); 2743 } 2744 2745 // 2746 // Generate generic array copy stubs 2747 // 2748 // Input: 2749 // R0 - src oop 2750 // R1 - src_pos (32-bit int) 2751 // R2 - dst oop 2752 // R3 - dst_pos (32-bit int) 2753 // SP[0] - element count (32-bit int) 2754 // 2755 // Output: (32-bit int) 2756 // R0 == 0 - success 2757 // R0 < 0 - need to call System.arraycopy 2758 // 2759 address generate_generic_copy() { 2760 Label L_failed, L_objArray; 2761 2762 // Input registers 2763 const Register src = R0; // source array oop 2764 const Register src_pos = R1; // source position 2765 const Register dst = R2; // destination array oop 2766 const Register dst_pos = R3; // destination position 2767 2768 // registers used as temp 2769 const Register R5_src_klass = R5; // source array klass 2770 const Register R6_dst_klass = R6; // destination array klass 2771 const Register R_lh = altFP_7_11; // layout handler 2772 const Register R8_temp = R8; 2773 2774 __ align(CodeEntryAlignment); 2775 StubGenStubId stub_id = StubGenStubId::generic_arraycopy_id; 2776 StubCodeMark mark(this, stub_id); 2777 address start = __ pc(); 2778 2779 __ zap_high_non_significant_bits(R1); 2780 __ zap_high_non_significant_bits(R3); 2781 __ zap_high_non_significant_bits(R4); 2782 2783 int pushed = 0; 2784 const RegisterSet saved_regs = RegisterSet(R4,R6) | RegisterSet(R8,R9) | altFP_7_11; 2785 __ push(saved_regs); 2786 assert(saved_regs.size() == 6, "check the count"); 2787 pushed+=6; 2788 2789 // bump this on entry, not on exit: 2790 inc_counter_np(SharedRuntime::_generic_array_copy_ctr, R5, R12); 2791 2792 const Register length = R4; // elements count 2793 __ ldr(length, Address(SP,4*pushed)); 2794 2795 2796 //----------------------------------------------------------------------- 2797 // Assembler stubs will be used for this call to arraycopy 2798 // if the following conditions are met: 2799 // 2800 // (1) src and dst must not be null. 2801 // (2) src_pos must not be negative. 2802 // (3) dst_pos must not be negative. 2803 // (4) length must not be negative. 2804 // (5) src klass and dst klass should be the same and not null. 2805 // (6) src and dst should be arrays. 2806 // (7) src_pos + length must not exceed length of src. 2807 // (8) dst_pos + length must not exceed length of dst. 
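    //
    // Sketch of these initial checks in C (illustrative only; the emitted code
    // reports failure by falling through to L_failed, which sets R0 = ~0 = -1):
    //
    //   if (src == nullptr || dst == nullptr)         return -1;
    //   if (src_pos < 0 || dst_pos < 0 || length < 0) return -1;
    //   // klass equality, array-ness and bounds checks follow below
    //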
2808 BLOCK_COMMENT("arraycopy initial argument checks"); 2809 2810 // if (src == nullptr) return -1; 2811 __ cbz(src, L_failed); 2812 2813 // if (src_pos < 0) return -1; 2814 __ cmp_32(src_pos, 0); 2815 __ b(L_failed, lt); 2816 2817 // if (dst == nullptr) return -1; 2818 __ cbz(dst, L_failed); 2819 2820 // if (dst_pos < 0) return -1; 2821 __ cmp_32(dst_pos, 0); 2822 __ b(L_failed, lt); 2823 2824 // if (length < 0) return -1; 2825 __ cmp_32(length, 0); 2826 __ b(L_failed, lt); 2827 2828 BLOCK_COMMENT("arraycopy argument klass checks"); 2829 // get src->klass() 2830 __ load_klass(R5_src_klass, src); 2831 2832 // Load layout helper 2833 // 2834 // |array_tag| | header_size | element_type | |log2_element_size| 2835 // 32 30 24 16 8 2 0 2836 // 2837 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0 2838 // 2839 2840 int lh_offset = in_bytes(Klass::layout_helper_offset()); 2841 __ ldr_u32(R_lh, Address(R5_src_klass, lh_offset)); 2842 2843 __ load_klass(R6_dst_klass, dst); 2844 2845 // Handle objArrays completely differently... 2846 juint objArray_lh = Klass::array_layout_helper(T_OBJECT); 2847 __ mov_slow(R8_temp, objArray_lh); 2848 __ cmp_32(R_lh, R8_temp); 2849 __ b(L_objArray,eq); 2850 2851 // if (src->klass() != dst->klass()) return -1; 2852 __ cmp(R5_src_klass, R6_dst_klass); 2853 __ b(L_failed, ne); 2854 2855 // if (!src->is_Array()) return -1; 2856 __ cmp_32(R_lh, Klass::_lh_neutral_value); // < 0 2857 __ b(L_failed, ge); 2858 2859 arraycopy_range_checks(src, src_pos, dst, dst_pos, length, 2860 R8_temp, R6_dst_klass, L_failed); 2861 2862 { 2863 // TypeArrayKlass 2864 // 2865 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize); 2866 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize); 2867 // 2868 2869 const Register R6_offset = R6_dst_klass; // array offset 2870 const Register R12_elsize = R12; // log2 element size 2871 2872 __ logical_shift_right(R6_offset, R_lh, Klass::_lh_header_size_shift); 2873 __ andr(R6_offset, R6_offset, (unsigned int)Klass::_lh_header_size_mask); // array_offset 2874 __ add(src, src, R6_offset); // src array offset 2875 __ add(dst, dst, R6_offset); // dst array offset 2876 __ andr(R12_elsize, R_lh, (unsigned int)Klass::_lh_log2_element_size_mask); // log2 element size 2877 2878 // next registers should be set before the jump to corresponding stub 2879 const Register from = R0; // source array address 2880 const Register to = R1; // destination array address 2881 const Register count = R2; // elements count 2882 2883 // 'from', 'to', 'count' registers should be set in this order 2884 // since they are the same as 'src', 'src_pos', 'dst'. 2885 2886 2887 BLOCK_COMMENT("scale indexes to element size"); 2888 __ add(from, src, AsmOperand(src_pos, lsl, R12_elsize)); // src_addr 2889 __ add(to, dst, AsmOperand(dst_pos, lsl, R12_elsize)); // dst_addr 2890 2891 __ mov(count, length); // length 2892 2893 // XXX optim: avoid later push in arraycopy variants ? 
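      // Layout-helper decoding sketch (comment only), matching the shifts and
      // masks used above for a typeArray klass:
      //
      //   int header  = (lh >> Klass::_lh_header_size_shift) & Klass::_lh_header_size_mask;
      //   int log2_es = lh & Klass::_lh_log2_element_size_mask;
      //   from = (address)src + header + (src_pos << log2_es);
      //   to   = (address)dst + header + (dst_pos << log2_es);
      //
      // which is what the shift/andr/add sequence above computes before the
      // jump to the element-size-specific stub below.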
2894 2895 __ pop(saved_regs); 2896 2897 BLOCK_COMMENT("choose copy loop based on element size"); 2898 __ cmp(R12_elsize, 0); 2899 __ b(StubRoutines::_jbyte_arraycopy,eq); 2900 2901 __ cmp(R12_elsize, LogBytesPerShort); 2902 __ b(StubRoutines::_jshort_arraycopy,eq); 2903 2904 __ cmp(R12_elsize, LogBytesPerInt); 2905 __ b(StubRoutines::_jint_arraycopy,eq); 2906 2907 __ b(StubRoutines::_jlong_arraycopy); 2908 2909 } 2910 2911 // ObjArrayKlass 2912 __ BIND(L_objArray); 2913 // live at this point: R5_src_klass, R6_dst_klass, src[_pos], dst[_pos], length 2914 2915 Label L_plain_copy, L_checkcast_copy; 2916 // test array classes for subtyping 2917 __ cmp(R5_src_klass, R6_dst_klass); // usual case is exact equality 2918 __ b(L_checkcast_copy, ne); 2919 2920 BLOCK_COMMENT("Identically typed arrays"); 2921 { 2922 // Identically typed arrays can be copied without element-wise checks. 2923 arraycopy_range_checks(src, src_pos, dst, dst_pos, length, 2924 R8_temp, R_lh, L_failed); 2925 2926 // next registers should be set before the jump to corresponding stub 2927 const Register from = R0; // source array address 2928 const Register to = R1; // destination array address 2929 const Register count = R2; // elements count 2930 2931 __ add(src, src, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); //src offset 2932 __ add(dst, dst, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); //dst offset 2933 __ add_ptr_scaled_int32(from, src, src_pos, LogBytesPerHeapOop); // src_addr 2934 __ add_ptr_scaled_int32(to, dst, dst_pos, LogBytesPerHeapOop); // dst_addr 2935 __ BIND(L_plain_copy); 2936 __ mov(count, length); 2937 2938 __ pop(saved_regs); // XXX optim: avoid later push in oop_arraycopy ? 2939 __ b(StubRoutines::_oop_arraycopy); 2940 } 2941 2942 { 2943 __ BIND(L_checkcast_copy); 2944 // live at this point: R5_src_klass, R6_dst_klass 2945 2946 // Before looking at dst.length, make sure dst is also an objArray. 2947 __ ldr_u32(R8_temp, Address(R6_dst_klass, lh_offset)); 2948 __ cmp_32(R_lh, R8_temp); 2949 __ b(L_failed, ne); 2950 2951 // It is safe to examine both src.length and dst.length. 2952 2953 arraycopy_range_checks(src, src_pos, dst, dst_pos, length, 2954 R8_temp, R_lh, L_failed); 2955 2956 // next registers should be set before the jump to corresponding stub 2957 const Register from = R0; // source array address 2958 const Register to = R1; // destination array address 2959 const Register count = R2; // elements count 2960 2961 // Marshal the base address arguments now, freeing registers. 2962 __ add(src, src, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); //src offset 2963 __ add(dst, dst, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); //dst offset 2964 __ add_ptr_scaled_int32(from, src, src_pos, LogBytesPerHeapOop); // src_addr 2965 __ add_ptr_scaled_int32(to, dst, dst_pos, LogBytesPerHeapOop); // dst_addr 2966 2967 __ mov(count, length); // length (reloaded) 2968 2969 Register sco_temp = R3; // this register is free now 2970 assert_different_registers(from, to, count, sco_temp, 2971 R6_dst_klass, R5_src_klass); 2972 2973 // Generate the type check. 2974 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 2975 __ ldr_u32(sco_temp, Address(R6_dst_klass, sco_offset)); 2976 generate_type_check(R5_src_klass, sco_temp, R6_dst_klass, 2977 R8_temp, R9, 2978 R12, 2979 L_plain_copy); 2980 2981 // Fetch destination element klass from the ObjArrayKlass header. 
2982 int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset()); 2983 2984 // the checkcast_copy loop needs two extra arguments: 2985 const Register Rdst_elem_klass = R3; 2986 __ ldr(Rdst_elem_klass, Address(R6_dst_klass, ek_offset)); // dest elem klass 2987 __ pop(saved_regs); // XXX optim: avoid later push in oop_arraycopy ? 2988 __ str(Rdst_elem_klass, Address(SP,0)); // dest elem klass argument 2989 __ ldr_u32(R3, Address(Rdst_elem_klass, sco_offset)); // sco of elem klass 2990 __ b(StubRoutines::_checkcast_arraycopy); 2991 } 2992 2993 __ BIND(L_failed); 2994 2995 __ pop(saved_regs); 2996 __ mvn(R0, 0); // failure, with 0 copied 2997 __ ret(); 2998 2999 return start; 3000 } 3001 3002 void generate_arraycopy_stubs() { 3003 3004 // Note: the disjoint stubs must be generated first, some of 3005 // the conjoint stubs use them. 3006 3007 address ucm_common_error_exit = generate_unsafecopy_common_error_exit(); 3008 UnsafeMemoryAccess::set_common_exit_stub_pc(ucm_common_error_exit); 3009 3010 // these need always status in case they are called from generic_arraycopy 3011 StubRoutines::_jbyte_disjoint_arraycopy = generate_primitive_copy(StubGenStubId::jbyte_disjoint_arraycopy_id); 3012 StubRoutines::_jshort_disjoint_arraycopy = generate_primitive_copy(StubGenStubId::jshort_disjoint_arraycopy_id); 3013 StubRoutines::_jint_disjoint_arraycopy = generate_primitive_copy(StubGenStubId::jint_disjoint_arraycopy_id); 3014 StubRoutines::_jlong_disjoint_arraycopy = generate_primitive_copy(StubGenStubId::jlong_disjoint_arraycopy_id); 3015 StubRoutines::_oop_disjoint_arraycopy = generate_oop_copy (StubGenStubId::oop_disjoint_arraycopy_id); 3016 3017 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_primitive_copy(StubGenStubId::arrayof_jbyte_disjoint_arraycopy_id); 3018 StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_primitive_copy(StubGenStubId::arrayof_jshort_disjoint_arraycopy_id); 3019 StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_primitive_copy(StubGenStubId::arrayof_jint_disjoint_arraycopy_id); 3020 StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_primitive_copy(StubGenStubId::arrayof_jlong_disjoint_arraycopy_id); 3021 StubRoutines::_arrayof_oop_disjoint_arraycopy = generate_oop_copy (StubGenStubId::arrayof_oop_disjoint_arraycopy_id); 3022 3023 // these need always status in case they are called from generic_arraycopy 3024 StubRoutines::_jbyte_arraycopy = generate_primitive_copy(StubGenStubId::jbyte_arraycopy_id, StubRoutines::_jbyte_disjoint_arraycopy); 3025 StubRoutines::_jshort_arraycopy = generate_primitive_copy(StubGenStubId::jshort_arraycopy_id, StubRoutines::_jshort_disjoint_arraycopy); 3026 StubRoutines::_jint_arraycopy = generate_primitive_copy(StubGenStubId::jint_arraycopy_id, StubRoutines::_jint_disjoint_arraycopy); 3027 StubRoutines::_jlong_arraycopy = generate_primitive_copy(StubGenStubId::jlong_arraycopy_id, StubRoutines::_jlong_disjoint_arraycopy); 3028 StubRoutines::_oop_arraycopy = generate_oop_copy (StubGenStubId::oop_arraycopy_id, StubRoutines::_oop_disjoint_arraycopy); 3029 3030 StubRoutines::_arrayof_jbyte_arraycopy = generate_primitive_copy(StubGenStubId::arrayof_jbyte_arraycopy_id, StubRoutines::_arrayof_jbyte_disjoint_arraycopy); 3031 StubRoutines::_arrayof_jshort_arraycopy = generate_primitive_copy(StubGenStubId::arrayof_jshort_arraycopy_id, StubRoutines::_arrayof_jshort_disjoint_arraycopy); 3032 #ifdef _LP64 3033 // since sizeof(jint) < sizeof(HeapWord), there's a different flavor: 3034 StubRoutines::_arrayof_jint_arraycopy = 
generate_primitive_copy(StubGenStubId::arrayof_jint_arraycopy_id, StubRoutines::_arrayof_jint_disjoint_arraycopy); 3035 #else 3036 StubRoutines::_arrayof_jint_arraycopy = StubRoutines::_jint_arraycopy; 3037 #endif 3038 if (BytesPerHeapOop < HeapWordSize) { 3039 StubRoutines::_arrayof_oop_arraycopy = generate_oop_copy (StubGenStubId::arrayof_oop_arraycopy_id, StubRoutines::_arrayof_oop_disjoint_arraycopy); 3040 } else { 3041 StubRoutines::_arrayof_oop_arraycopy = StubRoutines::_oop_arraycopy; 3042 } 3043 StubRoutines::_arrayof_jlong_arraycopy = StubRoutines::_jlong_arraycopy; 3044 3045 StubRoutines::_checkcast_arraycopy = generate_checkcast_copy(); 3046 StubRoutines::_unsafe_arraycopy = generate_unsafe_copy(); 3047 StubRoutines::_generic_arraycopy = generate_generic_copy(); 3048 3049 3050 } 3051 3052 address generate_method_entry_barrier() { 3053 __ align(CodeEntryAlignment); 3054 StubGenStubId stub_id = StubGenStubId::method_entry_barrier_id; 3055 StubCodeMark mark(this, stub_id); 3056 3057 Label deoptimize_label; 3058 3059 address start = __ pc(); 3060 3061 // No need to save PC on Arm 3062 __ set_last_Java_frame(SP, FP, false, Rtemp); 3063 3064 __ enter(); 3065 3066 __ add(Rtemp, SP, wordSize); // Rtemp points to the saved lr 3067 __ sub(SP, SP, 4 * wordSize); // four words for the returned {sp, fp, lr, pc} 3068 3069 const RegisterSet saved_regs = RegisterSet(R0, R10); 3070 __ push(saved_regs); 3071 __ fpush(FloatRegisterSet(D0, 16)); 3072 3073 __ mov(c_rarg0, Rtemp); 3074 __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSetNMethod::nmethod_stub_entry_barrier), c_rarg0); 3075 3076 __ reset_last_Java_frame(Rtemp); 3077 3078 __ mov(Rtemp, R0); 3079 3080 __ fpop(FloatRegisterSet(D0, 16)); 3081 __ pop(saved_regs); 3082 3083 __ cbnz(Rtemp, deoptimize_label); 3084 3085 __ leave(); 3086 __ bx(LR); 3087 3088 __ BIND(deoptimize_label); 3089 3090 __ ldr(Rtemp, Address(SP, 0)); 3091 __ ldr(FP, Address(SP, wordSize)); 3092 __ ldr(LR, Address(SP, wordSize * 2)); 3093 __ ldr(R5, Address(SP, wordSize * 3)); 3094 __ mov(SP, Rtemp); 3095 __ bx(R5); 3096 3097 return start; 3098 } 3099 3100 #define COMPILE_CRYPTO 3101 #include "stubRoutinesCrypto_arm.cpp" 3102 3103 private: 3104 3105 #undef __ 3106 #define __ masm-> 3107 3108 address generate_cont_thaw(StubGenStubId stub_id) { 3109 if (!Continuations::enabled()) return nullptr; 3110 Unimplemented(); 3111 return nullptr; 3112 } 3113 3114 address generate_cont_thaw() { 3115 return generate_cont_thaw(StubGenStubId::cont_thaw_id); 3116 } 3117 3118 address generate_cont_returnBarrier() { 3119 return generate_cont_thaw(StubGenStubId::cont_returnBarrier_id); 3120 } 3121 3122 address generate_cont_returnBarrier_exception() { 3123 return generate_cont_thaw(StubGenStubId::cont_returnBarrierExc_id); 3124 } 3125 3126 //--------------------------------------------------------------------------- 3127 // Initialization 3128 3129 void generate_initial_stubs() { 3130 // Generates all stubs and initializes the entry points 3131 3132 //------------------------------------------------------------------------------------------------------------------------ 3133 // entry points that exist in all platforms 3134 // Note: This is code that could be shared among different platforms - however the benefit seems to be smaller than 3135 // the disadvantage of having a much more complicated generator structure. See also comment in stubRoutines.hpp. 
3136 StubRoutines::_forward_exception_entry = generate_forward_exception(); 3137 3138 StubRoutines::_call_stub_entry = 3139 generate_call_stub(StubRoutines::_call_stub_return_address); 3140 // is referenced by megamorphic call 3141 StubRoutines::_catch_exception_entry = generate_catch_exception(); 3142 3143 // stub for throwing stack overflow error used both by interpreter and compiler 3144 if (UnsafeMemoryAccess::_table == nullptr) { 3145 UnsafeMemoryAccess::create_table(32 + 4); // 32 for copyMemory; 4 for setMemory 3146 } 3147 3148 // integer division used both by interpreter and compiler 3149 StubRoutines::Arm::_idiv_irem_entry = generate_idiv_irem(); 3150 3151 StubRoutines::_atomic_add_entry = generate_atomic_add(); 3152 StubRoutines::_atomic_xchg_entry = generate_atomic_xchg(); 3153 StubRoutines::_atomic_cmpxchg_entry = generate_atomic_cmpxchg(); 3154 StubRoutines::_atomic_cmpxchg_long_entry = generate_atomic_cmpxchg_long(); 3155 StubRoutines::Arm::_atomic_load_long_entry = generate_atomic_load_long(); 3156 StubRoutines::Arm::_atomic_store_long_entry = generate_atomic_store_long(); 3157 3158 } 3159 3160 void generate_continuation_stubs() { 3161 // Continuation stubs: 3162 StubRoutines::_cont_thaw = generate_cont_thaw(); 3163 StubRoutines::_cont_returnBarrier = generate_cont_returnBarrier(); 3164 StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception(); 3165 } 3166 3167 void generate_final_stubs() { 3168 // Generates all stubs and initializes the entry points 3169 3170 //------------------------------------------------------------------------------------------------------------------------ 3171 // entry points that are platform specific 3172 3173 // support for verify_oop (must happen after universe_init) 3174 StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop(); 3175 3176 // arraycopy stubs used by compilers 3177 generate_arraycopy_stubs(); 3178 3179 StubRoutines::_method_entry_barrier = generate_method_entry_barrier(); 3180 } 3181 3182 void generate_compiler_stubs() { 3183 #ifdef COMPILER2 3184 // Generate partial_subtype_check first here since its code depends on 3185 // UseZeroBaseCompressedOops which is defined after heap initialization. 
3186 StubRoutines::Arm::_partial_subtype_check = generate_partial_subtype_check(); 3187 3188 #ifdef COMPILE_CRYPTO 3189 // generate AES intrinsics code 3190 if (UseAESIntrinsics) { 3191 aes_init(); 3192 StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock(); 3193 StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock(); 3194 StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt(); 3195 StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt(); 3196 } 3197 #endif // COMPILE_CRYPTO 3198 #endif // COMPILER2 3199 } 3200 3201 public: 3202 StubGenerator(CodeBuffer* code, StubGenBlobId blob_id) : StubCodeGenerator(code, blob_id) { 3203 switch(blob_id) { 3204 case initial_id: 3205 generate_initial_stubs(); 3206 break; 3207 case continuation_id: 3208 generate_continuation_stubs(); 3209 break; 3210 case compiler_id: 3211 generate_compiler_stubs(); 3212 break; 3213 case final_id: 3214 generate_final_stubs(); 3215 break; 3216 default: 3217 fatal("unexpected blob id: %d", blob_id); 3218 break; 3219 }; 3220 } 3221 }; // end class declaration 3222 3223 void StubGenerator_generate(CodeBuffer* code, StubGenBlobId blob_id) { 3224 StubGenerator g(code, blob_id); 3225 } 3226 3227 // implementation of internal development flag 3228 3229 #ifdef TEST_C2_GENERIC_ARRAYCOPY 3230 const bool StubGenerator::set_status = true; // generate a status compatible with C1 calls 3231 #else 3232 const bool StubGenerator::set_status = false; // non failing C2 stubs need not return a status in R0 3233 #endif 3234