/*
 * Copyright (c) 2013, Red Hat Inc.
 * Copyright (c) 2003, 2015, Oracle and/or its affiliates.
 * All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/macroAssembler.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "interpreter/interpreter.hpp"
#include "nativeInst_aarch64.hpp"
#include "oops/instanceOop.hpp"
#include "oops/method.hpp"
#include "oops/objArrayKlass.hpp"
#include "oops/oop.inline.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/frame.inline.hpp"
#include "runtime/handles.inline.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubCodeGenerator.hpp"
#include "runtime/stubRoutines.hpp"
#include "runtime/thread.inline.hpp"
#include "utilities/macros.hpp"
#include "utilities/top.hpp"

#include "stubRoutines_aarch64.hpp"

#ifdef COMPILER2
#include "opto/runtime.hpp"
#endif
#if INCLUDE_ALL_GCS
#include "shenandoahBarrierSetAssembler_aarch64.hpp"
#endif

// Declaration and definition of StubGenerator (no .hpp file).
// For a more detailed description of the stub routine structure
// see the comment in stubRoutines.hpp

#undef __
#define __ _masm->
#define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 4 : 8))
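// TIMES_OOP scales an array index by the in-heap oop size: the index is
// sign-extended from 32 bits and shifted by 2 (4-byte narrow oops) or 3
// (8-byte oops).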

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) __ block_comment(str)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

// Stub Code definitions

class StubGenerator: public StubCodeGenerator {
 private:

#ifdef PRODUCT
#define inc_counter_np(counter) ((void)0)
#else
  void inc_counter_np_(int& counter) {
    __ lea(rscratch2, ExternalAddress((address)&counter));
    __ ldrw(rscratch1, Address(rscratch2));
    __ addw(rscratch1, rscratch1, 1);
    __ strw(rscratch1, Address(rscratch2));
  }
#define inc_counter_np(counter) \
  BLOCK_COMMENT("inc_counter " #counter); \
  inc_counter_np_(counter);
#endif

  // Call stubs are used to call Java from C
  //
  // Arguments:
  //    c_rarg0:   call wrapper address              address
  //    c_rarg1:   result                            address
  //    c_rarg2:   result type                       BasicType
  //    c_rarg3:   method                            Method*
  //    c_rarg4:   (interpreter) entry point         address
  //    c_rarg5:   parameters                        intptr_t*
  //    c_rarg6:   parameter size (in words)         int
  //    c_rarg7:   thread                            Thread*
  //
  // There is no return from the stub itself as any Java result
  // is written to result
  //
  // we save r30 (lr) as the return PC at the base of the frame and
  // link r29 (fp) below it as the frame pointer installing sp (r31)
  // into fp.
  //
  // we save r0-r7, which accounts for all the c arguments.
  //
  // TODO: strictly do we need to save them all? they are treated as
  // volatile by C so could we omit saving the ones we are going to
  // place in global registers (thread? method?) or those we only use
  // during setup of the Java call?
  //
  // we don't need to save r8 which C uses as an indirect result location
  // return register.
  //
  // we don't need to save r9-r15 which both C and Java treat as
  // volatile
  //
  // we don't need to save r16-18 because Java does not use them
  //
  // we save r19-r28 which Java uses as scratch registers and C
  // expects to be callee-save
  //
  // we save the bottom 64 bits of each value stored in v8-v15; it is
  // the responsibility of the caller to preserve larger values.
  //
  // so the stub frame looks like this when we enter Java code
  //
  //     [ return_from_Java     ] <--- sp
  //     [ argument word n      ]
  //      ...
  // -27 [ argument word 1      ]
  // -26 [ saved v15            ] <--- sp_after_call
  // -25 [ saved v14            ]
  // -24 [ saved v13            ]
  // -23 [ saved v12            ]
  // -22 [ saved v11            ]
  // -21 [ saved v10            ]
  // -20 [ saved v9             ]
  // -19 [ saved v8             ]
  // -18 [ saved r28            ]
  // -17 [ saved r27            ]
  // -16 [ saved r26            ]
  // -15 [ saved r25            ]
  // -14 [ saved r24            ]
  // -13 [ saved r23            ]
  // -12 [ saved r22            ]
  // -11 [ saved r21            ]
  // -10 [ saved r20            ]
  //  -9 [ saved r19            ]
  //  -8 [ call wrapper   (r0)  ]
  //  -7 [ result         (r1)  ]
  //  -6 [ result type    (r2)  ]
  //  -5 [ method         (r3)  ]
  //  -4 [ entry point    (r4)  ]
  //  -3 [ parameters     (r5)  ]
  //  -2 [ parameter size (r6)  ]
  //  -1 [ thread         (r7)  ]
  //   0 [ saved fp      (r29)  ] <--- fp == saved sp (r31)
  //   1 [ saved lr      (r30)  ]

  // Call stub stack layout word offsets from fp
  enum call_stub_layout {
    sp_after_call_off = -26,

    d15_off            = -26,
    d13_off            = -24,
    d11_off            = -22,
    d9_off             = -20,

    r28_off            = -18,
    r26_off            = -16,
    r24_off            = -14,
    r22_off            = -12,
    r20_off            = -10,
    call_wrapper_off   =  -8,
    result_off         =  -7,
    result_type_off    =  -6,
    method_off         =  -5,
    entry_point_off    =  -4,
    parameter_size_off =  -2,
    thread_off         =  -1,
    fp_f               =   0,
    retaddr_off        =   1,
  };

  address generate_call_stub(address& return_address) {
    assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
           (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
           "adjust this code");

    StubCodeMark mark(this, "StubRoutines", "call_stub");
    address start = __ pc();

    const Address sp_after_call(rfp, sp_after_call_off * wordSize);

    const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
    const Address result        (rfp, result_off         * wordSize);
    const Address result_type   (rfp, result_type_off    * wordSize);
    const Address method        (rfp, method_off         * wordSize);
    const Address entry_point   (rfp, entry_point_off    * wordSize);
    const Address parameter_size(rfp, parameter_size_off * wordSize);

    const Address thread        (rfp, thread_off         * wordSize);

    const Address d15_save      (rfp, d15_off * wordSize);
    const Address d13_save      (rfp, d13_off * wordSize);
    const Address d11_save      (rfp, d11_off * wordSize);
    const Address d9_save       (rfp, d9_off * wordSize);

    const Address r28_save      (rfp, r28_off * wordSize);
    const Address r26_save      (rfp, r26_off * wordSize);
    const Address r24_save      (rfp, r24_off * wordSize);
    const Address r22_save      (rfp, r22_off * wordSize);
    const Address r20_save      (rfp, r20_off * wordSize);

    // stub code

    address aarch64_entry = __ pc();

    // set up frame and move sp to end of save area
    __ enter();
    __ sub(sp, rfp, -sp_after_call_off * wordSize);

    // save register parameters and Java scratch/global registers
    // n.b. we save thread even though it gets installed in
    // rthread because we want to sanity check rthread later
    __ str(c_rarg7,  thread);
    __ strw(c_rarg6, parameter_size);
    __ stp(c_rarg4, c_rarg5, entry_point);
    __ stp(c_rarg2, c_rarg3, result_type);
    __ stp(c_rarg0, c_rarg1, call_wrapper);

    __ stp(r20, r19,  r20_save);
    __ stp(r22, r21,  r22_save);
    __ stp(r24, r23,  r24_save);
    __ stp(r26, r25,  r26_save);
    __ stp(r28, r27,  r28_save);

    __ stpd(v9,  v8,  d9_save);
    __ stpd(v11, v10, d11_save);
    __ stpd(v13, v12, d13_save);
    __ stpd(v15, v14, d15_save);

    // install Java thread in global register now we have saved
    // whatever value it held
    __ mov(rthread, c_rarg7);
    // And method
    __ mov(rmethod, c_rarg3);

    // set up the heapbase register
    __ reinit_heapbase();

#ifdef ASSERT
    // make sure we have no pending exceptions
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
      __ cmp(rscratch1, (unsigned)NULL_WORD);
      __ br(Assembler::EQ, L);
      __ stop("StubRoutines::call_stub: entered with pending exception");
      __ BIND(L);
    }
#endif
    // pass parameters if any
    __ mov(esp, sp);
    __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
    __ andr(sp, rscratch1, -2 * wordSize);

    BLOCK_COMMENT("pass parameters if any");
    Label parameters_done;
    // parameter count is still in c_rarg6
    // and parameter pointer identifying param 1 is in c_rarg5
    __ cbzw(c_rarg6, parameters_done);

    address loop = __ pc();
    __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
    __ subsw(c_rarg6, c_rarg6, 1);
    __ push(rscratch1);
    __ br(Assembler::GT, loop);

    __ BIND(parameters_done);

    // call Java entry -- passing methodOop, and current sp
    //      rmethod: Method*
    //      r13: sender sp
    BLOCK_COMMENT("call Java function");
    __ mov(r13, sp);
    __ blr(c_rarg4);

    // we do this here because the notify will already have been done
    // if we get to the next instruction via an exception
    //
    // n.b. adding this instruction here affects the calculation of
    // whether or not a routine returns to the call stub (used when
    // doing stack walks) since the normal test is to check the return
    // pc against the address saved below. so we may need to allow for
    // this extra instruction in the check.

    // save current address for use by exception handling code

    return_address = __ pc();

    // store result depending on type (everything that is not
    // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
    // n.b. this assumes Java returns an integral result in r0
    // and a floating result in j_farg0
    __ ldr(j_rarg2, result);
    Label is_long, is_float, is_double, exit;
    __ ldr(j_rarg1, result_type);
    __ cmp(j_rarg1, T_OBJECT);
    __ br(Assembler::EQ, is_long);
    __ cmp(j_rarg1, T_LONG);
    __ br(Assembler::EQ, is_long);
    __ cmp(j_rarg1, T_FLOAT);
    __ br(Assembler::EQ, is_float);
    __ cmp(j_rarg1, T_DOUBLE);
    __ br(Assembler::EQ, is_double);

    // handle T_INT case
    __ strw(r0, Address(j_rarg2));

    __ BIND(exit);

    // pop parameters
    __ sub(esp, rfp, -sp_after_call_off * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ldr(rscratch1, thread);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::NE, S);
      __ get_thread(rscratch1);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::EQ, L);
      __ BIND(S);
      __ stop("StubRoutines::call_stub: threads must correspond");
      __ BIND(L);
    }
#endif

    // restore callee-save registers
    __ ldpd(v15, v14,  d15_save);
    __ ldpd(v13, v12,  d13_save);
    __ ldpd(v11, v10,  d11_save);
    __ ldpd(v9,  v8,   d9_save);

    __ ldp(r28, r27,  r28_save);
    __ ldp(r26, r25,  r26_save);
    __ ldp(r24, r23,  r24_save);
    __ ldp(r22, r21,  r22_save);
    __ ldp(r20, r19,  r20_save);

    __ ldp(c_rarg0, c_rarg1, call_wrapper);
    __ ldrw(c_rarg2, result_type);
    __ ldr(c_rarg3,  method);
    __ ldp(c_rarg4, c_rarg5, entry_point);
    __ ldp(c_rarg6, c_rarg7, parameter_size);

    // leave frame and return to caller
    __ leave();
    __ ret(lr);

    // handle return types different from T_INT

    __ BIND(is_long);
    __ str(r0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    __ BIND(is_float);
    __ strs(j_farg0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    __ BIND(is_double);
    __ strd(j_farg0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    return start;
  }

  // Return point for a Java call if there's an exception thrown in
  // Java code.  The exception is caught and transformed into a
  // pending exception stored in JavaThread that can be tested from
  // within the VM.
  //
  // Note: Usually the parameters are removed by the callee. In case
  // of an exception crossing an activation frame boundary, that is
  // not the case if the callee is compiled code => need to setup the
  // rsp.
  //
  // r0: exception oop

  address generate_catch_exception() {
    StubCodeMark mark(this, "StubRoutines", "catch_exception");
    address start = __ pc();

    // same as in generate_call_stub():
    const Address sp_after_call(rfp, sp_after_call_off * wordSize);
    const Address thread       (rfp, thread_off * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ldr(rscratch1, thread);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::NE, S);
      __ get_thread(rscratch1);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::EQ, L);
      __ bind(S);
      __ stop("StubRoutines::catch_exception: threads must correspond");
      __ bind(L);
    }
#endif

    // set pending exception
    __ verify_oop(r0);

    __ str(r0, Address(rthread, Thread::pending_exception_offset()));
    __ mov(rscratch1, (address)__FILE__);
    __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
    __ movw(rscratch1, (int)__LINE__);
    __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));

    // complete return to VM
    assert(StubRoutines::_call_stub_return_address != NULL,
           "_call_stub_return_address must have been generated before");
    __ b(StubRoutines::_call_stub_return_address);

    return start;
  }

  // Continuation point for runtime calls returning with a pending
  // exception.  The pending exception check happened in the runtime
  // or native call stub.  The pending exception in Thread is
  // converted into a Java-level exception.
  //
  // Contract with Java-level exception handlers:
  // r0: exception
  // r3: throwing pc
  //
  // NOTE: At entry of this stub, exception-pc must be in LR !!

  // NOTE: this is always used as a jump target within generated code
  // so it just needs to be generated code with no prolog

  address generate_forward_exception() {
    StubCodeMark mark(this, "StubRoutines", "forward exception");
    address start = __ pc();

    // Upon entry, LR points to the return address returning into
    // Java (interpreted or compiled) code; i.e., the return address
    // becomes the throwing pc.
    //
    // Arguments pushed before the runtime call are still on the stack
    // but the exception handler will reset the stack pointer ->
    // ignore them.  A potential result in registers can be ignored as
    // well.

#ifdef ASSERT
    // make sure this code is only executed if there is a pending exception
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
      __ cbnz(rscratch1, L);
      __ stop("StubRoutines::forward exception: no pending exception (1)");
      __ bind(L);
    }
#endif

    // compute exception handler into r19

    // call the VM to find the handler address associated with the
    // caller address. pass thread in r0 and caller pc (ret address)
    // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
    // the stack.
    __ mov(c_rarg1, lr);
    // lr will be trashed by the VM call so we move it to R19
    // (callee-saved) because we also need to pass it to the handler
    // returned by this call.
    __ mov(r19, lr);
    BLOCK_COMMENT("call exception_handler_for_return_address");
    __ call_VM_leaf(CAST_FROM_FN_PTR(address,
                         SharedRuntime::exception_handler_for_return_address),
                    rthread, c_rarg1);
    // we should not really care that lr is no longer the callee
    // address. we saved the value the handler needs in r19 so we can
    // just copy it to r3. however, the C2 handler will push its own
    // frame and then call into the VM, and the VM code asserts that
    // the PC for the frame above the handler belongs to a compiled
    // Java method. So, we restore lr here to satisfy that assert.
    __ mov(lr, r19);
    // setup r0 & r3 & clear pending exception
    __ mov(r3, r19);
    __ mov(r19, r0);
    __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
    __ str(zr, Address(rthread, Thread::pending_exception_offset()));

#ifdef ASSERT
    // make sure exception is set
    {
      Label L;
      __ cbnz(r0, L);
      __ stop("StubRoutines::forward exception: no pending exception (2)");
      __ bind(L);
    }
#endif

    // continue at exception handler
    // r0: exception
    // r3: throwing pc
    // r19: exception handler
    __ verify_oop(r0);
    __ br(r19);

    return start;
  }

  // Non-destructive plausibility checks for oops
  //
  // Arguments:
  //    r0: oop to verify
  //    rscratch1: error message
  //
  // Stack after saving c_rarg3:
  //    [tos + 0]: saved c_rarg3
  //    [tos + 1]: saved c_rarg2
  //    [tos + 2]: saved lr
  //    [tos + 3]: saved rscratch2
  //    [tos + 4]: saved r0
  //    [tos + 5]: saved rscratch1
  address generate_verify_oop() {

    StubCodeMark mark(this, "StubRoutines", "verify_oop");
    address start = __ pc();

    Label exit, error;

    // save c_rarg2 and c_rarg3
    __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));

    // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ ldr(c_rarg3, Address(c_rarg2));
    __ add(c_rarg3, c_rarg3, 1);
    __ str(c_rarg3, Address(c_rarg2));

    // object is in r0
    // make sure object is 'reasonable'
    __ cbz(r0, exit); // if obj is NULL it is OK

    // Check if the oop is in the right area of memory
    __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask());
    __ andr(c_rarg2, r0, c_rarg3);
    __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits());

    // Compare c_rarg2 and c_rarg3.  We don't use a compare
    // instruction here because the flags register is live.
    __ eor(c_rarg2, c_rarg2, c_rarg3);
    __ cbnz(c_rarg2, error);

    // make sure klass is 'reasonable', i.e. not zero.
    __ load_klass(r0, r0); // get klass
    __ cbz(r0, error);     // if klass is NULL it is broken

    // return if everything seems ok
    __ bind(exit);

    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
    __ ret(lr);

    // handle errors
    __ bind(error);
    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));

    __ push(RegSet::range(r0, r29), sp);
    // debug(char* msg, int64_t pc, int64_t regs[])
    __ mov(c_rarg0, rscratch1);      // pass address of error message
    __ mov(c_rarg1, lr);             // pass return address
    __ mov(c_rarg2, sp);             // pass address of regs on stack
#ifndef PRODUCT
    assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
#endif
    BLOCK_COMMENT("call MacroAssembler::debug");
    __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
    __ blr(rscratch1);

    return start;
  }

  void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); }

  // Generate code for an array write pre barrier
  //
  //     src     - source array address
  //     addr    - destination starting address
  //     count   - element count
  //
  //     Destroys no registers except rscratch1 and rscratch2
  //
  void gen_write_ref_array_pre_barrier(Register src, Register addr, Register count, bool dest_uninitialized) {
    BarrierSet* bs = Universe::heap()->barrier_set();
    switch (bs->kind()) {
    case BarrierSet::G1SATBCT:
    case BarrierSet::G1SATBCTLogging:
      // Don't generate the call if we statically know that the target is uninitialized
      if (!dest_uninitialized) {
        __ push_call_clobbered_registers();
        if (count == c_rarg0) {
          if (addr == c_rarg1) {
            // exactly backwards!!
            __ mov(rscratch1, c_rarg0);
            __ mov(c_rarg0, c_rarg1);
            __ mov(c_rarg1, rscratch1);
          } else {
            __ mov(c_rarg1, count);
            __ mov(c_rarg0, addr);
          }
        } else {
          __ mov(c_rarg0, addr);
          __ mov(c_rarg1, count);
        }
        __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_pre), 2);
        __ pop_call_clobbered_registers();
      }
      break;
    case BarrierSet::CardTableModRef:
    case BarrierSet::CardTableExtension:
    case BarrierSet::ModRef:
      break;
#if INCLUDE_ALL_GCS
    case BarrierSet::ShenandoahBarrierSet:
      ShenandoahBarrierSetAssembler::bsasm()->arraycopy_prologue(_masm, dest_uninitialized, src, addr, count);
      break;
#endif
    default:
      ShouldNotReachHere();

    }
  }

  //
  // Generate code for an array write post barrier
  //
  //  Input:
  //     start    - register containing starting address of destination array
  //     end      - register containing ending address of destination array
  //     scratch  - scratch register
  //
  //  The input registers are overwritten.
  //  The ending address is inclusive.
  void gen_write_ref_array_post_barrier(Register start, Register end, Register scratch) {
    assert_different_registers(start, end, scratch);
    Label L_done;

    // "end" is inclusive end pointer == start + (count - 1) * array_element_size
    // If count == 0, "end" is less than "start" and we need to skip card marking.
    __ cmp(end, start);
    __ br(__ LO, L_done);

    BarrierSet* bs = Universe::heap()->barrier_set();
    switch (bs->kind()) {
    case BarrierSet::G1SATBCT:
    case BarrierSet::G1SATBCTLogging:

      {
        __ push_call_clobbered_registers();
        // must compute element count unless barrier set interface is changed (other platforms supply count)
        assert_different_registers(start, end, scratch);
        __ lea(scratch, Address(end, BytesPerHeapOop));
        __ sub(scratch, scratch, start);               // subtract start to get #bytes
        __ lsr(scratch, scratch, LogBytesPerHeapOop);  // convert to element count
        __ mov(c_rarg0, start);
        __ mov(c_rarg1, scratch);
        __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post), 2);
        __ pop_call_clobbered_registers();
      }
      break;
    case BarrierSet::CardTableModRef:
    case BarrierSet::CardTableExtension:
      {
        CardTableModRefBS* ct = (CardTableModRefBS*)bs;
        assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");

        Label L_loop;

        __ lsr(start, start, CardTableModRefBS::card_shift);
        __ lsr(end, end, CardTableModRefBS::card_shift);
        __ sub(end, end, start); // number of bytes to copy

        const Register count = end; // 'end' register contains bytes count now
        __ load_byte_map_base(scratch);
        __ add(start, start, scratch);
        if (UseConcMarkSweepGC) {
          __ membar(__ StoreStore);
        }
        __ BIND(L_loop);
        __ strb(zr, Address(start, count));
        __ subs(count, count, 1);
        __ br(Assembler::GE, L_loop);
      }
      break;
#if INCLUDE_ALL_GCS
    case BarrierSet::ShenandoahBarrierSet:
      break;
#endif
    default:
      ShouldNotReachHere();

    }
    __ bind(L_done);
  }

  address generate_zero_longs(Register base, Register cnt) {
    Register tmp = rscratch1;
    Register tmp2 = rscratch2;
    int zva_length = VM_Version::zva_length();
    Label initial_table_end, loop_zva;
    Label fini;

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "zero_longs");
    address start = __ pc();

    // Base must be 16 byte aligned. If not just return and let caller handle it
    __ tst(base, 0x0f);
    __ br(Assembler::NE, fini);
    // Align base with ZVA length.
    __ neg(tmp, base);
    __ andr(tmp, tmp, zva_length - 1);

    // tmp: the number of bytes to be filled to align the base with ZVA length.
    __ add(base, base, tmp);
    __ sub(cnt, cnt, tmp, Assembler::ASR, 3);
    __ adr(tmp2, initial_table_end);
    __ sub(tmp2, tmp2, tmp, Assembler::LSR, 2);
    __ br(tmp2);

    for (int i = -zva_length + 16; i < 0; i += 16)
      __ stp(zr, zr, Address(base, i));
    __ bind(initial_table_end);

    __ sub(cnt, cnt, zva_length >> 3);
    __ bind(loop_zva);
    __ dc(Assembler::ZVA, base);
    __ subs(cnt, cnt, zva_length >> 3);
    __ add(base, base, zva_length);
    __ br(Assembler::GE, loop_zva);
    __ add(cnt, cnt, zva_length >> 3); // count not zeroed by DC ZVA
    __ bind(fini);
    __ ret(lr);

    return start;
  }

  typedef enum {
    copy_forwards = 1,
    copy_backwards = -1
  } copy_direction;

  // Bulk copy of blocks of 8 words.
  //
  // count is a count of words.
  //
  // Precondition: count >= 8
  //
  // Postconditions:
  //
  // The least significant bit of count contains the remaining count
  // of words to copy.  The rest of count is trash.
  //
  // s and d are adjusted to point to the remaining words to copy
  //
  void generate_copy_longs(Label &start, Register s, Register d, Register count,
                           copy_direction direction) {
    int unit = wordSize * direction;
    int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;

    int offset;
    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
      t4 = r7, t5 = r10, t6 = r11, t7 = r12;
    const Register stride = r13;

    assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
    assert_different_registers(s, d, count, rscratch1);

    Label again, drain;
    const char *stub_name;
    if (direction == copy_forwards)
      stub_name = "forward_copy_longs";
    else
      stub_name = "backward_copy_longs";

    __ align(CodeEntryAlignment);

    StubCodeMark mark(this, "StubRoutines", stub_name);

    __ bind(start);

    Label unaligned_copy_long;
    if (AvoidUnalignedAccesses) {
      __ tbnz(d, 3, unaligned_copy_long);
    }

    if (direction == copy_forwards) {
      __ sub(s, s, bias);
      __ sub(d, d, bias);
    }

#ifdef ASSERT
    // Make sure we are never given < 8 words
    {
      Label L;
      __ cmp(count, 8);
      __ br(Assembler::GE, L);
      __ stop("generate_copy_longs called with < 8 words");
      __ bind(L);
    }
#endif

    // Fill 8 registers
    if (UseSIMDForMemoryOps) {
      __ ldpq(v0, v1, Address(s, 4 * unit));
      __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      __ ldp(t0, t1, Address(s, 2 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 16);
    __ br(Assembler::LO, drain);

    int prefetch = PrefetchCopyIntervalInBytes;
    bool use_stride = false;
    if (direction == copy_backwards) {
      use_stride = prefetch > 256;
      prefetch = -prefetch;
      if (use_stride) __ mov(stride, prefetch);
    }

    __ bind(again);

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

    if (UseSIMDForMemoryOps) {
      __ stpq(v0, v1, Address(d, 4 * unit));
      __ ldpq(v0, v1, Address(s, 4 * unit));
      __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
      __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      __ stp(t0, t1, Address(d, 2 * unit));
      __ ldp(t0, t1, Address(s, 2 * unit));
      __ stp(t2, t3, Address(d, 4 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ stp(t4, t5, Address(d, 6 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 8);
    __ br(Assembler::HS, again);

    // Drain
    __ bind(drain);
    if (UseSIMDForMemoryOps) {
      __ stpq(v0, v1, Address(d, 4 * unit));
      __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
    } else {
      __ stp(t0, t1, Address(d, 2 * unit));
      __ stp(t2, t3, Address(d, 4 * unit));
      __ stp(t4, t5, Address(d, 6 * unit));
      __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
    }

    {
      Label L1, L2;
      __ tbz(count, exact_log2(4), L1);
      if (UseSIMDForMemoryOps) {
        __ ldpq(v0, v1, Address(__ pre(s, 4 * unit)));
        __ stpq(v0, v1, Address(__ pre(d, 4 * unit)));
      } else {
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
        __ stp(t0, t1, Address(d, 2 * unit));
        __ stp(t2, t3, Address(__ pre(d, 4 * unit)));
      }
      __ bind(L1);

      if (direction == copy_forwards) {
        __ add(s, s, bias);
        __ add(d, d, bias);
      }

      __ tbz(count, 1, L2);
      __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
      __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
      __ bind(L2);
    }

    __ ret(lr);

    if (AvoidUnalignedAccesses) {
      Label drain, again;
      // Register order for storing. Order is different for backward copy.

      __ bind(unaligned_copy_long);

      // source address is even aligned, target odd aligned
      //
      // when forward copying word pairs we read long pairs at offsets
      // {0, 2, 4, 6} (in long words). when backwards copying we read
      // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
      // address by -2 in the forwards case so we can compute the
      // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
      // or -1.
      //
      // when forward copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
      // zero offset we adjust the destination by -1 which means we
      // have to use offsets { 1, 2, 4, 6, 8} * unit for the stores.
      //
      // When backwards copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
      // offsets {1, 3, 5, 7, 8} * unit.
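      //
      // the adjustments just below apply that bias for the forwards
      // case: s moves back by 16 bytes (one load pair) and d back by
      // 8 bytes (one word).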

      if (direction == copy_forwards) {
        __ sub(s, s, 16);
        __ sub(d, d, 8);
      }

      // Fill 8 registers
      //
      // for forwards copy s was offset by -16 from the original input
      // value of s so the register contents are at these offsets
      // relative to the 64 bit block addressed by that original input
      // and so on for each successive 64 byte block when s is updated
      //
      // t0 at offset 0,  t1 at offset 8
      // t2 at offset 16, t3 at offset 24
      // t4 at offset 32, t5 at offset 40
      // t6 at offset 48, t7 at offset 56

      // for backwards copy s was not offset so the register contents
      // are at these offsets into the preceding 64 byte block
      // relative to that original input and so on for each successive
      // preceding 64 byte block when s is updated. this explains the
      // slightly counter-intuitive looking pattern of register usage
      // in the stp instructions for backwards copy.
      //
      // t0 at offset -16, t1 at offset -8
      // t2 at offset -32, t3 at offset -24
      // t4 at offset -48, t5 at offset -40
      // t6 at offset -64, t7 at offset -56

      __ ldp(t0, t1, Address(s, 2 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));

      __ subs(count, count, 16);
      __ br(Assembler::LO, drain);

      int prefetch = PrefetchCopyIntervalInBytes;
      bool use_stride = false;
      if (direction == copy_backwards) {
        use_stride = prefetch > 256;
        prefetch = -prefetch;
        if (use_stride) __ mov(stride, prefetch);
      }

      __ bind(again);

      if (PrefetchCopyIntervalInBytes > 0)
        __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

      if (direction == copy_forwards) {
        // allowing for the offset of -8 the store instructions place
        // registers into the target 64 bit block at the following
        // offsets
        //
        // t0 at offset 0
        // t1 at offset 8,  t2 at offset 16
        // t3 at offset 24, t4 at offset 32
        // t5 at offset 40, t6 at offset 48
        // t7 at offset 56

        __ str(t0, Address(d, 1 * unit));
        __ stp(t1, t2, Address(d, 2 * unit));
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ stp(t3, t4, Address(d, 4 * unit));
        __ ldp(t2, t3, Address(s, 4 * unit));
        __ stp(t5, t6, Address(d, 6 * unit));
        __ ldp(t4, t5, Address(s, 6 * unit));
        __ str(t7, Address(__ pre(d, 8 * unit)));
        __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
      } else {
        // d was not offset when we started so the registers are
        // written into the 64 bit block preceding d with the following
        // offsets
        //
        // t1 at offset -8
        // t3 at offset -24, t0 at offset -16
        // t5 at offset -40, t2 at offset -32
        // t7 at offset -56, t4 at offset -48
        // t6 at offset -64
        //
        // note that this matches the offsets previously noted for the
        // loads

        __ str(t1, Address(d, 1 * unit));
        __ stp(t3, t0, Address(d, 3 * unit));
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ stp(t5, t2, Address(d, 5 * unit));
        __ ldp(t2, t3, Address(s, 4 * unit));
        __ stp(t7, t4, Address(d, 7 * unit));
        __ ldp(t4, t5, Address(s, 6 * unit));
        __ str(t6, Address(__ pre(d, 8 * unit)));
        __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
      }

      __ subs(count, count, 8);
      __ br(Assembler::HS, again);

      // Drain
      //
      // this uses the same pattern of offsets and register arguments
      // as above
      __ bind(drain);
      if (direction == copy_forwards) {
        __ str(t0, Address(d, 1 * unit));
        __ stp(t1, t2, Address(d, 2 * unit));
        __ stp(t3, t4, Address(d, 4 * unit));
        __ stp(t5, t6, Address(d, 6 * unit));
        __ str(t7, Address(__ pre(d, 8 * unit)));
      } else {
        __ str(t1, Address(d, 1 * unit));
        __ stp(t3, t0, Address(d, 3 * unit));
        __ stp(t5, t2, Address(d, 5 * unit));
        __ stp(t7, t4, Address(d, 7 * unit));
        __ str(t6, Address(__ pre(d, 8 * unit)));
      }
      // now we need to copy any remaining part block which may
      // include a 4 word block subblock and/or a 2 word subblock.
      // bits 2 and 1 in the count are the tell-tale for whether we
      // have each such subblock
      {
        Label L1, L2;
        __ tbz(count, exact_log2(4), L1);
        // this is the same as above but copying only 4 longs hence
        // with only one intervening stp between the str instructions
        // but note that the offsets and registers still follow the
        // same pattern
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
        if (direction == copy_forwards) {
          __ str(t0, Address(d, 1 * unit));
          __ stp(t1, t2, Address(d, 2 * unit));
          __ str(t3, Address(__ pre(d, 4 * unit)));
        } else {
          __ str(t1, Address(d, 1 * unit));
          __ stp(t3, t0, Address(d, 3 * unit));
          __ str(t2, Address(__ pre(d, 4 * unit)));
        }
        __ bind(L1);

        __ tbz(count, 1, L2);
        // this is the same as above but copying only 2 longs hence
        // there is no intervening stp between the str instructions
        // but note that the offset and register patterns are still
        // the same
        __ ldp(t0, t1, Address(__ pre(s, 2 * unit)));
        if (direction == copy_forwards) {
          __ str(t0, Address(d, 1 * unit));
          __ str(t1, Address(__ pre(d, 2 * unit)));
        } else {
          __ str(t1, Address(d, 1 * unit));
          __ str(t0, Address(__ pre(d, 2 * unit)));
        }
        __ bind(L2);

        // for forwards copy we need to re-adjust the offsets we
        // applied so that s and d follow the last words written

        if (direction == copy_forwards) {
          __ add(s, s, 16);
          __ add(d, d, 8);
        }

      }

      __ ret(lr);
    }
  }

  // Small copy: less than 16 bytes.
  //
  // NB: Ignores all of the bits of count which represent more than 15
  // bytes, so a caller doesn't have to mask them.

  void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
    bool is_backwards = step < 0;
    size_t granularity = uabs(step);
    int direction = is_backwards ? -1 : 1;
    int unit = wordSize * direction;

    Label Lpair, Lword, Lint, Lshort, Lbyte;

    assert(granularity
           && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");

    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;

    // ??? I don't know if this bit-test-and-branch is the right thing
    // to do.  It does a lot of jumping, resulting in several
    // mispredicted branches.  It might make more sense to do this
    // with something like Duff's device with a single computed branch.
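    //
    // Each tbz below tests one bit of the element count: bit
    // (3 - log2(granularity)) selects an 8 byte chunk, the next lower
    // bit a 4 byte chunk, then 2 bytes, then 1, so each tail size is
    // transferred at most once.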

    __ tbz(count, 3 - exact_log2(granularity), Lword);
    __ ldr(tmp, Address(__ adjust(s, unit, is_backwards)));
    __ str(tmp, Address(__ adjust(d, unit, is_backwards)));
    __ bind(Lword);

    if (granularity <= sizeof (jint)) {
      __ tbz(count, 2 - exact_log2(granularity), Lint);
      __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
      __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
      __ bind(Lint);
    }

    if (granularity <= sizeof (jshort)) {
      __ tbz(count, 1 - exact_log2(granularity), Lshort);
      __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
      __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
      __ bind(Lshort);
    }

    if (granularity <= sizeof (jbyte)) {
      __ tbz(count, 0, Lbyte);
      __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
      __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
      __ bind(Lbyte);
    }
  }

  Label copy_f, copy_b;

  // All-singing all-dancing memory copy.
  //
  // Copy count units of memory from s to d.  The size of a unit is
  // step, which can be positive or negative depending on the direction
  // of copy.  If is_aligned is false, we align the source address.
  //

  void copy_memory(bool is_aligned, Register s, Register d,
                   Register count, Register tmp, int step) {
    copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
    bool is_backwards = step < 0;
    int granularity = uabs(step);
    const Register t0 = r3, t1 = r4;

    // <= 96 bytes do inline. Direction doesn't matter because we always
    // load all the data before writing anything
    Label copy4, copy8, copy16, copy32, copy80, copy128, copy_big, finish;
    const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8;
    const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12;
    const Register send = r17, dend = r18;

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(Address(s, 0), PLDL1KEEP);
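    // Copies of at most 96 bytes (80 without SIMD) are handled by the
    // inline cases below; anything larger falls through to copy_big.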
    __ cmp(count, (UseSIMDForMemoryOps ? 96:80)/granularity);
    __ br(Assembler::HI, copy_big);

    __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
    __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));

    __ cmp(count, 16/granularity);
    __ br(Assembler::LS, copy16);

    __ cmp(count, 64/granularity);
    __ br(Assembler::HI, copy80);

    __ cmp(count, 32/granularity);
    __ br(Assembler::LS, copy32);

    // 33..64 bytes
    if (UseSIMDForMemoryOps) {
      __ ldpq(v0, v1, Address(s, 0));
      __ ldpq(v2, v3, Address(send, -32));
      __ stpq(v0, v1, Address(d, 0));
      __ stpq(v2, v3, Address(dend, -32));
    } else {
      __ ldp(t0, t1, Address(s, 0));
      __ ldp(t2, t3, Address(s, 16));
      __ ldp(t4, t5, Address(send, -32));
      __ ldp(t6, t7, Address(send, -16));

      __ stp(t0, t1, Address(d, 0));
      __ stp(t2, t3, Address(d, 16));
      __ stp(t4, t5, Address(dend, -32));
      __ stp(t6, t7, Address(dend, -16));
    }
    __ b(finish);

    // 17..32 bytes
    __ bind(copy32);
    __ ldp(t0, t1, Address(s, 0));
    __ ldp(t2, t3, Address(send, -16));
    __ stp(t0, t1, Address(d, 0));
    __ stp(t2, t3, Address(dend, -16));
    __ b(finish);

    // 65..80/96 bytes
    // (96 bytes if SIMD because we do 32 bytes per instruction)
    __ bind(copy80);
    if (UseSIMDForMemoryOps) {
      __ ld4(v0, v1, v2, v3, __ T16B, Address(s, 0));
      __ ldpq(v4, v5, Address(send, -32));
      __ st4(v0, v1, v2, v3, __ T16B, Address(d, 0));
      __ stpq(v4, v5, Address(dend, -32));
    } else {
      __ ldp(t0, t1, Address(s, 0));
      __ ldp(t2, t3, Address(s, 16));
      __ ldp(t4, t5, Address(s, 32));
      __ ldp(t6, t7, Address(s, 48));
      __ ldp(t8, t9, Address(send, -16));

      __ stp(t0, t1, Address(d, 0));
      __ stp(t2, t3, Address(d, 16));
      __ stp(t4, t5, Address(d, 32));
      __ stp(t6, t7, Address(d, 48));
      __ stp(t8, t9, Address(dend, -16));
    }
    __ b(finish);

    // 0..16 bytes
    __ bind(copy16);
    __ cmp(count, 8/granularity);
    __ br(Assembler::LO, copy8);

    // 8..16 bytes
    __ ldr(t0, Address(s, 0));
    __ ldr(t1, Address(send, -8));
    __ str(t0, Address(d, 0));
    __ str(t1, Address(dend, -8));
    __ b(finish);

    if (granularity < 8) {
      // 4..7 bytes
      __ bind(copy8);
      __ tbz(count, 2 - exact_log2(granularity), copy4);
      __ ldrw(t0, Address(s, 0));
      __ ldrw(t1, Address(send, -4));
      __ strw(t0, Address(d, 0));
      __ strw(t1, Address(dend, -4));
      __ b(finish);
      if (granularity < 4) {
        // 0..3 bytes
        __ bind(copy4);
        __ cbz(count, finish); // get rid of 0 case
        if (granularity == 2) {
          __ ldrh(t0, Address(s, 0));
          __ strh(t0, Address(d, 0));
        } else { // granularity == 1
          // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
          // the first and last byte.
          // Handle the 3 byte case by loading and storing base + count/2
          // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
          // This means in the 1 byte case we load/store the same
          // byte 3 times.
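          // (count is halved below, giving 0 for a 1 byte copy and 1
          // otherwise, so the (s, count) load covers the middle byte
          // of a 3 byte copy.)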
          __ lsr(count, count, 1);
          __ ldrb(t0, Address(s, 0));
          __ ldrb(t1, Address(send, -1));
          __ ldrb(t2, Address(s, count));
          __ strb(t0, Address(d, 0));
          __ strb(t1, Address(dend, -1));
          __ strb(t2, Address(d, count));
        }
        __ b(finish);
      }
    }

    __ bind(copy_big);
    if (is_backwards) {
      __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
      __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
    }

    // Now we've got the small case out of the way we can align the
    // source address on a 2-word boundary.

    Label aligned;

    if (is_aligned) {
      // We may have to adjust by 1 word to get s 2-word-aligned.
      __ tbz(s, exact_log2(wordSize), aligned);
      __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards)));
      __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards)));
      __ sub(count, count, wordSize/granularity);
    } else {
      if (is_backwards) {
        __ andr(rscratch2, s, 2 * wordSize - 1);
      } else {
        __ neg(rscratch2, s);
        __ andr(rscratch2, rscratch2, 2 * wordSize - 1);
      }
      // rscratch2 is the byte adjustment needed to align s.
      __ cbz(rscratch2, aligned);
      int shift = exact_log2(granularity);
      if (shift)  __ lsr(rscratch2, rscratch2, shift);
      __ sub(count, count, rscratch2);

#if 0
      // ?? This code is only correct for a disjoint copy.  It may or
      // may not make sense to use it in that case.

      // Copy the first pair; s and d may not be aligned.
      __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
      __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));

      // Align s and d, adjust count
      if (is_backwards) {
        __ sub(s, s, rscratch2);
        __ sub(d, d, rscratch2);
      } else {
        __ add(s, s, rscratch2);
        __ add(d, d, rscratch2);
      }
#else
      copy_memory_small(s, d, rscratch2, rscratch1, step);
#endif
    }

    __ bind(aligned);

    // s is now 2-word-aligned.

    // We have a count of units and some trailing bytes.  Adjust the
    // count and do a bulk copy of words.
    __ lsr(rscratch2, count, exact_log2(wordSize/granularity));
    if (direction == copy_forwards)
      __ bl(copy_f);
    else
      __ bl(copy_b);

    // And the tail.
    copy_memory_small(s, d, count, tmp, step);

    if (granularity >= 8) __ bind(copy8);
    if (granularity >= 4) __ bind(copy4);
    __ bind(finish);
  }


  void clobber_registers() {
#ifdef ASSERT
    __ mov(rscratch1, (uint64_t)0xdeadbeef);
    __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
    for (Register r = r3; r <= r18; r++)
      if (r != rscratch1) __ mov(r, rscratch1);
#endif
  }

  // Scan over array at a for count oops, verifying each one.
  // Preserves a and count, clobbers rscratch1 and rscratch2.
  void verify_oop_array (size_t size, Register a, Register count, Register temp) {
    Label loop, end;
    __ mov(rscratch1, a);
    __ mov(rscratch2, zr);
    __ bind(loop);
    __ cmp(rscratch2, count);
    __ br(Assembler::HS, end);
    if (size == (size_t)wordSize) {
      __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
      __ verify_oop(temp);
    } else {
      __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
      __ decode_heap_oop(temp); // calls verify_oop
    }
    __ add(rscratch2, rscratch2, 1);
    __ b(loop);
    __ bind(end);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  // Side Effects:
  //   disjoint_int_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_int_oop_copy().
  //
  address generate_disjoint_copy(size_t size, bool aligned, bool is_oop, address *entry,
                                 const char *name, bool dest_uninitialized = false) {
    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    __ enter();

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    if (is_oop) {
      __ push(RegSet::of(d, count), sp);
      // no registers are destroyed by this call
      gen_write_ref_array_pre_barrier(s, d, count, dest_uninitialized);
    }
    copy_memory(aligned, s, d, count, rscratch1, size);
    if (is_oop) {
      __ pop(RegSet::of(d, count), sp);
      if (VerifyOops)
        verify_oop_array(size, d, count, r16);
      __ sub(count, count, 1); // make an inclusive end pointer
      __ lea(count, Address(d, count, Address::lsl(exact_log2(size))));
      gen_write_ref_array_post_barrier(d, count, rscratch1);
    }
    __ leave();
    __ mov(r0, zr); // return 0
    __ ret(lr);
    return start;
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  address generate_conjoint_copy(size_t size, bool aligned, bool is_oop, address nooverlap_target,
                                 address *entry, const char *name,
                                 bool dest_uninitialized = false) {
    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;

    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    __ enter();

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    // use fwd copy when (d-s) above_equal (count*size)
    __ sub(rscratch1, d, s);
    __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
    __ br(Assembler::HS, nooverlap_target);

    if (is_oop) {
      __ push(RegSet::of(d, count), sp);
      // no registers are destroyed by this call
      gen_write_ref_array_pre_barrier(s, d, count, dest_uninitialized);
    }
    copy_memory(aligned, s, d, count, rscratch1, -size);
    if (is_oop) {
      __ pop(RegSet::of(d, count), sp);
      if (VerifyOops)
        verify_oop_array(size, d, count, r16);
      __ sub(count, count, 1); // make an inclusive end pointer
      __ lea(count, Address(d, count, Address::lsl(exact_log2(size))));
      gen_write_ref_array_post_barrier(d, count, rscratch1);
    }
    __ leave();
    __ mov(r0, zr); // return 0
    __ ret(lr);
    return start;
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
  // we let the hardware handle it.  The one to eight bytes within words,
  // dwords or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  // Side Effects:
  //   disjoint_byte_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_byte_copy().
  //
  address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
  // we let the hardware handle it.  The one to eight bytes within words,
  // dwords or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
                                      address* entry, const char *name) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
  // let the hardware handle it.  The two or four words within dwords
  // or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  // Side Effects:
  //   disjoint_short_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_short_copy().
  //
  address generate_disjoint_short_copy(bool aligned,
                                       address* entry, const char *name) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
  // let the hardware handle it.  The two or four words within dwords
  // or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
                                       address *entry, const char *name) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  // Side Effects:
  //   disjoint_int_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_int_oop_copy().
  //
  address generate_disjoint_int_copy(bool aligned, address *entry,
                                     const char *name) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
                                     address *entry, const char *name,
                                     bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
  }


  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  // Side Effects:
  //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
  //   no-overlap entry point used by generate_conjoint_long_oop_copy().
  //
  address generate_disjoint_long_copy(bool aligned, address *entry,
                                      const char *name, bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  address generate_conjoint_long_copy(bool aligned,
                                      address nooverlap_target, address *entry,
                                      const char *name, bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  // Side Effects:
  //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
  //   no-overlap entry point used by generate_conjoint_long_oop_copy().
  //
  address generate_disjoint_oop_copy(bool aligned, address *entry,
                                     const char *name, bool dest_uninitialized) {
    const bool is_oop = true;
    const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
    return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  address generate_conjoint_oop_copy(bool aligned,
                                     address nooverlap_target, address *entry,
                                     const char *name, bool dest_uninitialized) {
    const bool is_oop = true;
    const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
    return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
                                  name, dest_uninitialized);
  }


  // Helper for generating a dynamic type check.
  // Smashes rscratch1.
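  // Branches to L_success when sub_klass is a subtype of super_klass
  // and falls through otherwise.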
1727 void generate_type_check(Register sub_klass, 1728 Register super_check_offset, 1729 Register super_klass, 1730 Label& L_success) { 1731 assert_different_registers(sub_klass, super_check_offset, super_klass); 1732 1733 BLOCK_COMMENT("type_check:"); 1734 1735 Label L_miss; 1736 1737 __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, NULL, 1738 super_check_offset); 1739 __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL); 1740 1741 // Fall through on failure! 1742 __ BIND(L_miss); 1743 } 1744 1745 // 1746 // Generate checkcasting array copy stub 1747 // 1748 // Input: 1749 // c_rarg0 - source array address 1750 // c_rarg1 - destination array address 1751 // c_rarg2 - element count, treated as ssize_t, can be zero 1752 // c_rarg3 - size_t ckoff (super_check_offset) 1753 // c_rarg4 - oop ckval (super_klass) 1754 // 1755 // Output: 1756 // r0 == 0 - success 1757 // r0 == -1^K - failure, where K is partial transfer count 1758 // 1759 address generate_checkcast_copy(const char *name, address *entry, 1760 bool dest_uninitialized = false) { 1761 1762 Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop; 1763 1764 // Input registers (after setup_arg_regs) 1765 const Register from = c_rarg0; // source array address 1766 const Register to = c_rarg1; // destination array address 1767 const Register count = c_rarg2; // elementscount 1768 const Register ckoff = c_rarg3; // super_check_offset 1769 const Register ckval = c_rarg4; // super_klass 1770 1771 // Registers used as temps (r18, r19, r20 are save-on-entry) 1772 const Register count_save = r21; // orig elementscount 1773 const Register start_to = r20; // destination array start address 1774 const Register copied_oop = r18; // actual oop copied 1775 const Register r19_klass = r19; // oop._klass 1776 1777 //--------------------------------------------------------------- 1778 // Assembler stub will be used for this call to arraycopy 1779 // if the two arrays are subtypes of Object[] but the 1780 // destination array type is not equal to or a supertype 1781 // of the source type. Each element must be separately 1782 // checked. 1783 1784 assert_different_registers(from, to, count, ckoff, ckval, start_to, 1785 copied_oop, r19_klass, count_save); 1786 1787 __ align(CodeEntryAlignment); 1788 StubCodeMark mark(this, "StubRoutines", name); 1789 address start = __ pc(); 1790 1791 __ enter(); // required for proper stackwalking of RuntimeStub frame 1792 1793 #ifdef ASSERT 1794 // caller guarantees that the arrays really are different 1795 // otherwise, we would have to make conjoint checks 1796 { Label L; 1797 array_overlap_test(L, TIMES_OOP); 1798 __ stop("checkcast_copy within a single array"); 1799 __ bind(L); 1800 } 1801 #endif //ASSERT 1802 1803 // Caller of this entry point must set up the argument registers. 1804 if (entry != NULL) { 1805 *entry = __ pc(); 1806 BLOCK_COMMENT("Entry:"); 1807 } 1808 1809 // Empty array: Nothing to do. 1810 __ cbz(count, L_done); 1811 1812 __ push(RegSet::of(r18, r19, r20, r21), sp); 1813 1814 #ifdef ASSERT 1815 BLOCK_COMMENT("assert consistent ckoff/ckval"); 1816 // The ckoff and ckval must be mutually consistent, 1817 // even though caller generates both. 
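    // The assert below reloads super_check_offset from ckval and stops if it
    // differs from the ckoff value passed in c_rarg3.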
1818 { Label L; 1819 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 1820 __ ldrw(start_to, Address(ckval, sco_offset)); 1821 __ cmpw(ckoff, start_to); 1822 __ br(Assembler::EQ, L); 1823 __ stop("super_check_offset inconsistent"); 1824 __ bind(L); 1825 } 1826 #endif //ASSERT 1827 1828 gen_write_ref_array_pre_barrier(from, to, count, dest_uninitialized); 1829 1830 // save the original count 1831 __ mov(count_save, count); 1832 1833 // Copy from low to high addresses 1834 __ mov(start_to, to); // Save destination array start address 1835 __ b(L_load_element); 1836 1837 // ======== begin loop ======== 1838 // (Loop is rotated; its entry is L_load_element.) 1839 // Loop control: 1840 // for (; count != 0; count--) { 1841 // copied_oop = load_heap_oop(from++); 1842 // ... generate_type_check ...; 1843 // store_heap_oop(to++, copied_oop); 1844 // } 1845 __ align(OptoLoopAlignment); 1846 1847 __ BIND(L_store_element); 1848 __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop); // store the oop 1849 __ sub(count, count, 1); 1850 __ cbz(count, L_do_card_marks); 1851 1852 // ======== loop entry is here ======== 1853 __ BIND(L_load_element); 1854 __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8)); // load the oop 1855 __ cbz(copied_oop, L_store_element); 1856 1857 __ load_klass(r19_klass, copied_oop);// query the object klass 1858 generate_type_check(r19_klass, ckoff, ckval, L_store_element); 1859 // ======== end loop ======== 1860 1861 // It was a real error; we must depend on the caller to finish the job. 1862 // Register count = remaining oops, count_orig = total oops. 1863 // Emit GC store barriers for the oops we have copied and report 1864 // their number to the caller. 1865 1866 __ subs(count, count_save, count); // K = partially copied oop count 1867 __ eon(count, count, zr); // report (-1^K) to caller 1868 __ br(Assembler::EQ, L_done_pop); 1869 1870 __ BIND(L_do_card_marks); 1871 __ add(to, to, -heapOopSize); // make an inclusive end pointer 1872 gen_write_ref_array_post_barrier(start_to, to, rscratch1); 1873 1874 __ bind(L_done_pop); 1875 __ pop(RegSet::of(r18, r19, r20, r21), sp); 1876 inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr); 1877 1878 __ bind(L_done); 1879 __ mov(r0, count); 1880 __ leave(); 1881 __ ret(lr); 1882 1883 return start; 1884 } 1885 1886 // Perform range checks on the proposed arraycopy. 1887 // Kills temp, but nothing else. 1888 // Also, clean the sign bits of src_pos and dst_pos. 1889 void arraycopy_range_checks(Register src, // source array oop (c_rarg0) 1890 Register src_pos, // source position (c_rarg1) 1891 Register dst, // destination array oo (c_rarg2) 1892 Register dst_pos, // destination position (c_rarg3) 1893 Register length, 1894 Register temp, 1895 Label& L_failed) { 1896 BLOCK_COMMENT("arraycopy_range_checks:"); 1897 1898 assert_different_registers(rscratch1, temp); 1899 1900 // if (src_pos + length > arrayOop(src)->length()) FAIL; 1901 __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes())); 1902 __ addw(temp, length, src_pos); 1903 __ cmpw(temp, rscratch1); 1904 __ br(Assembler::HI, L_failed); 1905 1906 // if (dst_pos + length > arrayOop(dst)->length()) FAIL; 1907 __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes())); 1908 __ addw(temp, length, dst_pos); 1909 __ cmpw(temp, rscratch1); 1910 __ br(Assembler::HI, L_failed); 1911 1912 // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'. 
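    // On AArch64 a 32-bit register-to-register move zero-extends into the
    // upper half, so each movw below is effectively 'reg &= 0xffffffff'.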
1913 __ movw(src_pos, src_pos); 1914 __ movw(dst_pos, dst_pos); 1915 1916 BLOCK_COMMENT("arraycopy_range_checks done"); 1917 } 1918 1919 // These stubs get called from some dumb test routine. 1920 // I'll write them properly when they're called from 1921 // something that's actually doing something. 1922 static void fake_arraycopy_stub(address src, address dst, int count) { 1923 assert(count == 0, "huh?"); 1924 } 1925 1926 1927 // 1928 // Generate stub for array fill. If "aligned" is true, the 1929 // "to" address is assumed to be heapword aligned. 1930 // 1931 // Arguments for generated stub: 1932 // to: c_rarg0 1933 // value: c_rarg1 1934 // count: c_rarg2 treated as signed 1935 // 1936 address generate_fill(BasicType t, bool aligned, const char *name) { 1937 __ align(CodeEntryAlignment); 1938 StubCodeMark mark(this, "StubRoutines", name); 1939 address start = __ pc(); 1940 1941 BLOCK_COMMENT("Entry:"); 1942 1943 const Register to = c_rarg0; // source array address 1944 const Register value = c_rarg1; // value 1945 const Register count = c_rarg2; // elements count 1946 1947 const Register bz_base = r10; // base for block_zero routine 1948 const Register cnt_words = r11; // temp register 1949 1950 __ enter(); 1951 1952 Label L_fill_elements, L_exit1; 1953 1954 int shift = -1; 1955 switch (t) { 1956 case T_BYTE: 1957 shift = 0; 1958 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 1959 __ bfi(value, value, 8, 8); // 8 bit -> 16 bit 1960 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit 1961 __ br(Assembler::LO, L_fill_elements); 1962 break; 1963 case T_SHORT: 1964 shift = 1; 1965 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 1966 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit 1967 __ br(Assembler::LO, L_fill_elements); 1968 break; 1969 case T_INT: 1970 shift = 2; 1971 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 1972 __ br(Assembler::LO, L_fill_elements); 1973 break; 1974 default: ShouldNotReachHere(); 1975 } 1976 1977 // Align source address at 8 bytes address boundary. 1978 Label L_skip_align1, L_skip_align2, L_skip_align4; 1979 if (!aligned) { 1980 switch (t) { 1981 case T_BYTE: 1982 // One byte misalignment happens only for byte arrays. 1983 __ tbz(to, 0, L_skip_align1); 1984 __ strb(value, Address(__ post(to, 1))); 1985 __ subw(count, count, 1); 1986 __ bind(L_skip_align1); 1987 // Fallthrough 1988 case T_SHORT: 1989 // Two bytes misalignment happens only for byte and short (char) arrays. 1990 __ tbz(to, 1, L_skip_align2); 1991 __ strh(value, Address(__ post(to, 2))); 1992 __ subw(count, count, 2 >> shift); 1993 __ bind(L_skip_align2); 1994 // Fallthrough 1995 case T_INT: 1996 // Align to 8 bytes, we know we are 4 byte aligned to start. 
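        // Only bit 2 of 'to' is left to test: if it is clear the address is
        // already 8-byte aligned and the 4-byte store can be skipped.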
1997 __ tbz(to, 2, L_skip_align4); 1998 __ strw(value, Address(__ post(to, 4))); 1999 __ subw(count, count, 4 >> shift); 2000 __ bind(L_skip_align4); 2001 break; 2002 default: ShouldNotReachHere(); 2003 } 2004 } 2005 2006 // 2007 // Fill large chunks 2008 // 2009 __ lsrw(cnt_words, count, 3 - shift); // number of words 2010 __ bfi(value, value, 32, 32); // 32 bit -> 64 bit 2011 __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift); 2012 if (UseBlockZeroing) { 2013 Label non_block_zeroing, rest; 2014 // count >= BlockZeroingLowLimit && value == 0 2015 __ subs(rscratch1, cnt_words, BlockZeroingLowLimit >> 3); 2016 __ ccmp(value, 0 /* comparing value */, 0 /* NZCV */, Assembler::GE); 2017 __ br(Assembler::NE, non_block_zeroing); 2018 __ mov(bz_base, to); 2019 __ block_zero(bz_base, cnt_words, true); 2020 __ mov(to, bz_base); 2021 __ b(rest); 2022 __ bind(non_block_zeroing); 2023 __ fill_words(to, cnt_words, value); 2024 __ bind(rest); 2025 } 2026 else { 2027 __ fill_words(to, cnt_words, value); 2028 } 2029 2030 // Remaining count is less than 8 bytes. Fill it by a single store. 2031 // Note that the total length is no less than 8 bytes. 2032 if (t == T_BYTE || t == T_SHORT) { 2033 Label L_exit1; 2034 __ cbzw(count, L_exit1); 2035 __ add(to, to, count, Assembler::LSL, shift); // points to the end 2036 __ str(value, Address(to, -8)); // overwrite some elements 2037 __ bind(L_exit1); 2038 __ leave(); 2039 __ ret(lr); 2040 } 2041 2042 // Handle copies less than 8 bytes. 2043 Label L_fill_2, L_fill_4, L_exit2; 2044 __ bind(L_fill_elements); 2045 switch (t) { 2046 case T_BYTE: 2047 __ tbz(count, 0, L_fill_2); 2048 __ strb(value, Address(__ post(to, 1))); 2049 __ bind(L_fill_2); 2050 __ tbz(count, 1, L_fill_4); 2051 __ strh(value, Address(__ post(to, 2))); 2052 __ bind(L_fill_4); 2053 __ tbz(count, 2, L_exit2); 2054 __ strw(value, Address(to)); 2055 break; 2056 case T_SHORT: 2057 __ tbz(count, 0, L_fill_4); 2058 __ strh(value, Address(__ post(to, 2))); 2059 __ bind(L_fill_4); 2060 __ tbz(count, 1, L_exit2); 2061 __ strw(value, Address(to)); 2062 break; 2063 case T_INT: 2064 __ cbzw(count, L_exit2); 2065 __ strw(value, Address(to)); 2066 break; 2067 default: ShouldNotReachHere(); 2068 } 2069 __ bind(L_exit2); 2070 __ leave(); 2071 __ ret(lr); 2072 return start; 2073 } 2074 2075 // 2076 // Generate 'unsafe' array copy stub 2077 // Though just as safe as the other stubs, it takes an unscaled 2078 // size_t argument instead of an element count. 2079 // 2080 // Input: 2081 // c_rarg0 - source array address 2082 // c_rarg1 - destination array address 2083 // c_rarg2 - byte count, treated as ssize_t, can be zero 2084 // 2085 // Examines the alignment of the operands and dispatches 2086 // to a long, int, short, or byte copy loop. 
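  // Roughly, in C terms (a sketch of the dispatch below, not the code
  // actually emitted):
  //
  //   size_t bits = (size_t)s | (size_t)d | (size_t)count;
  //   if      ((bits & (BytesPerLong - 1)) == 0) goto long_copy;
  //   else if ((bits & (BytesPerInt  - 1)) == 0) goto int_copy;
  //   else if ((bits & 1) == 0)                  goto short_copy;
  //   else                                       goto byte_copy;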
2087 // 2088 address generate_unsafe_copy(const char *name, 2089 address byte_copy_entry, 2090 address short_copy_entry, 2091 address int_copy_entry, 2092 address long_copy_entry) { 2093 Label L_long_aligned, L_int_aligned, L_short_aligned; 2094 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 2095 2096 __ align(CodeEntryAlignment); 2097 StubCodeMark mark(this, "StubRoutines", name); 2098 address start = __ pc(); 2099 __ enter(); // required for proper stackwalking of RuntimeStub frame 2100 2101 // bump this on entry, not on exit: 2102 inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr); 2103 2104 __ orr(rscratch1, s, d); 2105 __ orr(rscratch1, rscratch1, count); 2106 2107 __ andr(rscratch1, rscratch1, BytesPerLong-1); 2108 __ cbz(rscratch1, L_long_aligned); 2109 __ andr(rscratch1, rscratch1, BytesPerInt-1); 2110 __ cbz(rscratch1, L_int_aligned); 2111 __ tbz(rscratch1, 0, L_short_aligned); 2112 __ b(RuntimeAddress(byte_copy_entry)); 2113 2114 __ BIND(L_short_aligned); 2115 __ lsr(count, count, LogBytesPerShort); // size => short_count 2116 __ b(RuntimeAddress(short_copy_entry)); 2117 __ BIND(L_int_aligned); 2118 __ lsr(count, count, LogBytesPerInt); // size => int_count 2119 __ b(RuntimeAddress(int_copy_entry)); 2120 __ BIND(L_long_aligned); 2121 __ lsr(count, count, LogBytesPerLong); // size => long_count 2122 __ b(RuntimeAddress(long_copy_entry)); 2123 2124 return start; 2125 } 2126 2127 // 2128 // Generate generic array copy stubs 2129 // 2130 // Input: 2131 // c_rarg0 - src oop 2132 // c_rarg1 - src_pos (32-bits) 2133 // c_rarg2 - dst oop 2134 // c_rarg3 - dst_pos (32-bits) 2135 // c_rarg4 - element count (32-bits) 2136 // 2137 // Output: 2138 // r0 == 0 - success 2139 // r0 == -1^K - failure, where K is partial transfer count 2140 // 2141 address generate_generic_copy(const char *name, 2142 address byte_copy_entry, address short_copy_entry, 2143 address int_copy_entry, address oop_copy_entry, 2144 address long_copy_entry, address checkcast_copy_entry) { 2145 2146 Label L_failed, L_failed_0, L_objArray; 2147 Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs; 2148 2149 // Input registers 2150 const Register src = c_rarg0; // source array oop 2151 const Register src_pos = c_rarg1; // source position 2152 const Register dst = c_rarg2; // destination array oop 2153 const Register dst_pos = c_rarg3; // destination position 2154 const Register length = c_rarg4; 2155 2156 __ align(CodeEntryAlignment); 2157 2158 StubCodeMark mark(this, "StubRoutines", name); 2159 2160 address start = __ pc(); 2161 2162 __ enter(); // required for proper stackwalking of RuntimeStub frame 2163 2164 // bump this on entry, not on exit: 2165 inc_counter_np(SharedRuntime::_generic_array_copy_ctr); 2166 2167 //----------------------------------------------------------------------- 2168 // Assembler stub will be used for this call to arraycopy 2169 // if the following conditions are met: 2170 // 2171 // (1) src and dst must not be null. 2172 // (2) src_pos must not be negative. 2173 // (3) dst_pos must not be negative. 2174 // (4) length must not be negative. 2175 // (5) src klass and dst klass should be the same and not NULL. 2176 // (6) src and dst should be arrays. 2177 // (7) src_pos + length must not exceed length of src. 2178 // (8) dst_pos + length must not exceed length of dst. 2179 // 2180 2181 // if (src == NULL) return -1; 2182 __ cbz(src, L_failed); 2183 2184 // if (src_pos < 0) return -1; 2185 __ tbnz(src_pos, 31, L_failed); // i.e. 
sign bit set 2186 2187 // if (dst == NULL) return -1; 2188 __ cbz(dst, L_failed); 2189 2190 // if (dst_pos < 0) return -1; 2191 __ tbnz(dst_pos, 31, L_failed); // i.e. sign bit set 2192 2193 // registers used as temp 2194 const Register scratch_length = r16; // elements count to copy 2195 const Register scratch_src_klass = r17; // array klass 2196 const Register lh = r18; // layout helper 2197 2198 // if (length < 0) return -1; 2199 __ movw(scratch_length, length); // length (elements count, 32-bits value) 2200 __ tbnz(scratch_length, 31, L_failed); // i.e. sign bit set 2201 2202 __ load_klass(scratch_src_klass, src); 2203 #ifdef ASSERT 2204 // assert(src->klass() != NULL); 2205 { 2206 BLOCK_COMMENT("assert klasses not null {"); 2207 Label L1, L2; 2208 __ cbnz(scratch_src_klass, L2); // it is broken if klass is NULL 2209 __ bind(L1); 2210 __ stop("broken null klass"); 2211 __ bind(L2); 2212 __ load_klass(rscratch1, dst); 2213 __ cbz(rscratch1, L1); // this would be broken also 2214 BLOCK_COMMENT("} assert klasses not null done"); 2215 } 2216 #endif 2217 2218 // Load layout helper (32-bits) 2219 // 2220 // |array_tag| | header_size | element_type | |log2_element_size| 2221 // 32 30 24 16 8 2 0 2222 // 2223 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0 2224 // 2225 2226 const int lh_offset = in_bytes(Klass::layout_helper_offset()); 2227 2228 // Handle objArrays completely differently... 2229 const jint objArray_lh = Klass::array_layout_helper(T_OBJECT); 2230 __ ldrw(lh, Address(scratch_src_klass, lh_offset)); 2231 __ movw(rscratch1, objArray_lh); 2232 __ eorw(rscratch2, lh, rscratch1); 2233 __ cbzw(rscratch2, L_objArray); 2234 2235 // if (src->klass() != dst->klass()) return -1; 2236 __ load_klass(rscratch2, dst); 2237 __ eor(rscratch2, rscratch2, scratch_src_klass); 2238 __ cbnz(rscratch2, L_failed); 2239 2240 // if (!src->is_Array()) return -1; 2241 __ tbz(lh, 31, L_failed); // i.e. (lh >= 0) 2242 2243 // At this point, it is known to be a typeArray (array_tag 0x3). 2244 #ifdef ASSERT 2245 { 2246 BLOCK_COMMENT("assert primitive array {"); 2247 Label L; 2248 __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift); 2249 __ cmpw(lh, rscratch2); 2250 __ br(Assembler::GE, L); 2251 __ stop("must be a primitive array"); 2252 __ bind(L); 2253 BLOCK_COMMENT("} assert primitive array done"); 2254 } 2255 #endif 2256 2257 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2258 rscratch2, L_failed); 2259 2260 // TypeArrayKlass 2261 // 2262 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize); 2263 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize); 2264 // 2265 2266 const Register rscratch1_offset = rscratch1; // array offset 2267 const Register r18_elsize = lh; // element size 2268 2269 __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift, 2270 exact_log2(Klass::_lh_header_size_mask+1)); // array_offset 2271 __ add(src, src, rscratch1_offset); // src array offset 2272 __ add(dst, dst, rscratch1_offset); // dst array offset 2273 BLOCK_COMMENT("choose copy loop based on element size"); 2274 2275 // next registers should be set before the jump to corresponding stub 2276 const Register from = c_rarg0; // source array address 2277 const Register to = c_rarg1; // destination array address 2278 const Register count = c_rarg2; // elements count 2279 2280 // 'from', 'to', 'count' registers should be set in such order 2281 // since they are the same as 'src', 'src_pos', 'dst'. 
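    // (Writing 'to' clobbers src_pos and writing 'count' clobbers dst, so the
    // address computations below must happen in exactly this order.)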
2282 2283 assert(Klass::_lh_log2_element_size_shift == 0, "fix this code"); 2284 2285 // The possible values of elsize are 0-3, i.e. exact_log2(element 2286 // size in bytes). We do a simple bitwise binary search. 2287 __ BIND(L_copy_bytes); 2288 __ tbnz(r18_elsize, 1, L_copy_ints); 2289 __ tbnz(r18_elsize, 0, L_copy_shorts); 2290 __ lea(from, Address(src, src_pos));// src_addr 2291 __ lea(to, Address(dst, dst_pos));// dst_addr 2292 __ movw(count, scratch_length); // length 2293 __ b(RuntimeAddress(byte_copy_entry)); 2294 2295 __ BIND(L_copy_shorts); 2296 __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr 2297 __ lea(to, Address(dst, dst_pos, Address::lsl(1)));// dst_addr 2298 __ movw(count, scratch_length); // length 2299 __ b(RuntimeAddress(short_copy_entry)); 2300 2301 __ BIND(L_copy_ints); 2302 __ tbnz(r18_elsize, 0, L_copy_longs); 2303 __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr 2304 __ lea(to, Address(dst, dst_pos, Address::lsl(2)));// dst_addr 2305 __ movw(count, scratch_length); // length 2306 __ b(RuntimeAddress(int_copy_entry)); 2307 2308 __ BIND(L_copy_longs); 2309 #ifdef ASSERT 2310 { 2311 BLOCK_COMMENT("assert long copy {"); 2312 Label L; 2313 __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r18_elsize 2314 __ cmpw(r18_elsize, LogBytesPerLong); 2315 __ br(Assembler::EQ, L); 2316 __ stop("must be long copy, but elsize is wrong"); 2317 __ bind(L); 2318 BLOCK_COMMENT("} assert long copy done"); 2319 } 2320 #endif 2321 __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr 2322 __ lea(to, Address(dst, dst_pos, Address::lsl(3)));// dst_addr 2323 __ movw(count, scratch_length); // length 2324 __ b(RuntimeAddress(long_copy_entry)); 2325 2326 // ObjArrayKlass 2327 __ BIND(L_objArray); 2328 // live at this point: scratch_src_klass, scratch_length, src[_pos], dst[_pos] 2329 2330 Label L_plain_copy, L_checkcast_copy; 2331 // test array classes for subtyping 2332 __ load_klass(r18, dst); 2333 __ cmp(scratch_src_klass, r18); // usual case is exact equality 2334 __ br(Assembler::NE, L_checkcast_copy); 2335 2336 // Identically typed arrays can be copied without element-wise checks. 2337 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2338 rscratch2, L_failed); 2339 2340 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2341 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2342 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2343 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2344 __ movw(count, scratch_length); // length 2345 __ BIND(L_plain_copy); 2346 __ b(RuntimeAddress(oop_copy_entry)); 2347 2348 __ BIND(L_checkcast_copy); 2349 // live at this point: scratch_src_klass, scratch_length, r18 (dst_klass) 2350 { 2351 // Before looking at dst.length, make sure dst is also an objArray. 2352 __ ldrw(rscratch1, Address(r18, lh_offset)); 2353 __ movw(rscratch2, objArray_lh); 2354 __ eorw(rscratch1, rscratch1, rscratch2); 2355 __ cbnzw(rscratch1, L_failed); 2356 2357 // It is safe to examine both src.length and dst.length. 2358 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2359 r18, L_failed); 2360 2361 const Register rscratch2_dst_klass = rscratch2; 2362 __ load_klass(rscratch2_dst_klass, dst); // reload 2363 2364 // Marshal the base address arguments now, freeing registers. 
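      //   from = src + base_offset(T_OBJECT) + (src_pos << LogBytesPerHeapOop)
      //   to   = dst + base_offset(T_OBJECT) + (dst_pos << LogBytesPerHeapOop)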
2365 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2366 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2367 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2368 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2369 __ movw(count, length); // length (reloaded) 2370 Register sco_temp = c_rarg3; // this register is free now 2371 assert_different_registers(from, to, count, sco_temp, 2372 rscratch2_dst_klass, scratch_src_klass); 2373 // assert_clean_int(count, sco_temp); 2374 2375 // Generate the type check. 2376 const int sco_offset = in_bytes(Klass::super_check_offset_offset()); 2377 __ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset)); 2378 // assert_clean_int(sco_temp, r18); 2379 generate_type_check(scratch_src_klass, sco_temp, rscratch2_dst_klass, L_plain_copy); 2380 2381 // Fetch destination element klass from the ObjArrayKlass header. 2382 int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset()); 2383 __ ldr(rscratch2_dst_klass, Address(rscratch2_dst_klass, ek_offset)); 2384 __ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset)); 2385 2386 // the checkcast_copy loop needs two extra arguments: 2387 assert(c_rarg3 == sco_temp, "#3 already in place"); 2388 // Set up arguments for checkcast_copy_entry. 2389 __ mov(c_rarg4, rscratch2_dst_klass); // dst.klass.element_klass 2390 __ b(RuntimeAddress(checkcast_copy_entry)); 2391 } 2392 2393 __ BIND(L_failed); 2394 __ mov(r0, -1); 2395 __ leave(); // required for proper stackwalking of RuntimeStub frame 2396 __ ret(lr); 2397 2398 return start; 2399 } 2400 2401 void generate_arraycopy_stubs() { 2402 address entry; 2403 address entry_jbyte_arraycopy; 2404 address entry_jshort_arraycopy; 2405 address entry_jint_arraycopy; 2406 address entry_oop_arraycopy; 2407 address entry_jlong_arraycopy; 2408 address entry_checkcast_arraycopy; 2409 2410 generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards); 2411 generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards); 2412 2413 StubRoutines::aarch64::_zero_longs = generate_zero_longs(r10, r11); 2414 2415 //*** jbyte 2416 // Always need aligned and unaligned versions 2417 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(false, &entry, 2418 "jbyte_disjoint_arraycopy"); 2419 StubRoutines::_jbyte_arraycopy = generate_conjoint_byte_copy(false, entry, 2420 &entry_jbyte_arraycopy, 2421 "jbyte_arraycopy"); 2422 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry, 2423 "arrayof_jbyte_disjoint_arraycopy"); 2424 StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_byte_copy(true, entry, NULL, 2425 "arrayof_jbyte_arraycopy"); 2426 2427 //*** jshort 2428 // Always need aligned and unaligned versions 2429 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, &entry, 2430 "jshort_disjoint_arraycopy"); 2431 StubRoutines::_jshort_arraycopy = generate_conjoint_short_copy(false, entry, 2432 &entry_jshort_arraycopy, 2433 "jshort_arraycopy"); 2434 StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry, 2435 "arrayof_jshort_disjoint_arraycopy"); 2436 StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_short_copy(true, entry, NULL, 2437 "arrayof_jshort_arraycopy"); 2438 2439 //*** jint 2440 // Aligned versions 2441 StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry, 2442 "arrayof_jint_disjoint_arraycopy"); 2443 StubRoutines::_arrayof_jint_arraycopy = 
generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy, 2444 "arrayof_jint_arraycopy"); 2445 // In 64 bit we need both aligned and unaligned versions of jint arraycopy. 2446 // entry_jint_arraycopy always points to the unaligned version 2447 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_int_copy(false, &entry, 2448 "jint_disjoint_arraycopy"); 2449 StubRoutines::_jint_arraycopy = generate_conjoint_int_copy(false, entry, 2450 &entry_jint_arraycopy, 2451 "jint_arraycopy"); 2452 2453 //*** jlong 2454 // It is always aligned 2455 StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry, 2456 "arrayof_jlong_disjoint_arraycopy"); 2457 StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy, 2458 "arrayof_jlong_arraycopy"); 2459 StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy; 2460 StubRoutines::_jlong_arraycopy = StubRoutines::_arrayof_jlong_arraycopy; 2461 2462 //*** oops 2463 { 2464 // With compressed oops we need unaligned versions; notice that 2465 // we overwrite entry_oop_arraycopy. 2466 bool aligned = !UseCompressedOops; 2467 2468 StubRoutines::_arrayof_oop_disjoint_arraycopy 2469 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy", 2470 /*dest_uninitialized*/false); 2471 StubRoutines::_arrayof_oop_arraycopy 2472 = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy", 2473 /*dest_uninitialized*/false); 2474 // Aligned versions without pre-barriers 2475 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit 2476 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit", 2477 /*dest_uninitialized*/true); 2478 StubRoutines::_arrayof_oop_arraycopy_uninit 2479 = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit", 2480 /*dest_uninitialized*/true); 2481 } 2482 2483 StubRoutines::_oop_disjoint_arraycopy = StubRoutines::_arrayof_oop_disjoint_arraycopy; 2484 StubRoutines::_oop_arraycopy = StubRoutines::_arrayof_oop_arraycopy; 2485 StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit; 2486 StubRoutines::_oop_arraycopy_uninit = StubRoutines::_arrayof_oop_arraycopy_uninit; 2487 2488 StubRoutines::_checkcast_arraycopy = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy); 2489 StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL, 2490 /*dest_uninitialized*/true); 2491 2492 StubRoutines::_unsafe_arraycopy = generate_unsafe_copy("unsafe_arraycopy", 2493 entry_jbyte_arraycopy, 2494 entry_jshort_arraycopy, 2495 entry_jint_arraycopy, 2496 entry_jlong_arraycopy); 2497 2498 StubRoutines::_generic_arraycopy = generate_generic_copy("generic_arraycopy", 2499 entry_jbyte_arraycopy, 2500 entry_jshort_arraycopy, 2501 entry_jint_arraycopy, 2502 entry_oop_arraycopy, 2503 entry_jlong_arraycopy, 2504 entry_checkcast_arraycopy); 2505 2506 StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill"); 2507 StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill"); 2508 StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill"); 2509 StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill"); 2510 StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill"); 2511 StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill"); 2512 } 2513 
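  // The AES stubs below tell the key sizes apart by the length of the
  // expanded key array: 44 ints for AES-128, 52 for AES-192 and 60 for
  // AES-256, hence the comparisons of keylen against 44 and 52. Each middle
  // round is an aese (AddRoundKey + SubBytes + ShiftRows) followed by an
  // aesmc (MixColumns); the final round omits the MixColumns step.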
2514 // Arguments: 2515 // 2516 // Inputs: 2517 // c_rarg0 - source byte array address 2518 // c_rarg1 - destination byte array address 2519 // c_rarg2 - K (key) in little endian int array 2520 // 2521 address generate_aescrypt_encryptBlock() { 2522 __ align(CodeEntryAlignment); 2523 StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock"); 2524 2525 Label L_doLast; 2526 2527 const Register from = c_rarg0; // source array address 2528 const Register to = c_rarg1; // destination array address 2529 const Register key = c_rarg2; // key array address 2530 const Register keylen = rscratch1; 2531 2532 address start = __ pc(); 2533 __ enter(); 2534 2535 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2536 2537 __ ld1(v0, __ T16B, from); // get 16 bytes of input 2538 2539 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64)); 2540 __ rev32(v1, __ T16B, v1); 2541 __ rev32(v2, __ T16B, v2); 2542 __ rev32(v3, __ T16B, v3); 2543 __ rev32(v4, __ T16B, v4); 2544 __ aese(v0, v1); 2545 __ aesmc(v0, v0); 2546 __ aese(v0, v2); 2547 __ aesmc(v0, v0); 2548 __ aese(v0, v3); 2549 __ aesmc(v0, v0); 2550 __ aese(v0, v4); 2551 __ aesmc(v0, v0); 2552 2553 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64)); 2554 __ rev32(v1, __ T16B, v1); 2555 __ rev32(v2, __ T16B, v2); 2556 __ rev32(v3, __ T16B, v3); 2557 __ rev32(v4, __ T16B, v4); 2558 __ aese(v0, v1); 2559 __ aesmc(v0, v0); 2560 __ aese(v0, v2); 2561 __ aesmc(v0, v0); 2562 __ aese(v0, v3); 2563 __ aesmc(v0, v0); 2564 __ aese(v0, v4); 2565 __ aesmc(v0, v0); 2566 2567 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2568 __ rev32(v1, __ T16B, v1); 2569 __ rev32(v2, __ T16B, v2); 2570 2571 __ cmpw(keylen, 44); 2572 __ br(Assembler::EQ, L_doLast); 2573 2574 __ aese(v0, v1); 2575 __ aesmc(v0, v0); 2576 __ aese(v0, v2); 2577 __ aesmc(v0, v0); 2578 2579 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2580 __ rev32(v1, __ T16B, v1); 2581 __ rev32(v2, __ T16B, v2); 2582 2583 __ cmpw(keylen, 52); 2584 __ br(Assembler::EQ, L_doLast); 2585 2586 __ aese(v0, v1); 2587 __ aesmc(v0, v0); 2588 __ aese(v0, v2); 2589 __ aesmc(v0, v0); 2590 2591 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2592 __ rev32(v1, __ T16B, v1); 2593 __ rev32(v2, __ T16B, v2); 2594 2595 __ BIND(L_doLast); 2596 2597 __ aese(v0, v1); 2598 __ aesmc(v0, v0); 2599 __ aese(v0, v2); 2600 2601 __ ld1(v1, __ T16B, key); 2602 __ rev32(v1, __ T16B, v1); 2603 __ eor(v0, __ T16B, v0, v1); 2604 2605 __ st1(v0, __ T16B, to); 2606 2607 __ mov(r0, 0); 2608 2609 __ leave(); 2610 __ ret(lr); 2611 2612 return start; 2613 } 2614 2615 // Arguments: 2616 // 2617 // Inputs: 2618 // c_rarg0 - source byte array address 2619 // c_rarg1 - destination byte array address 2620 // c_rarg2 - K (key) in little endian int array 2621 // 2622 address generate_aescrypt_decryptBlock() { 2623 assert(UseAES, "need AES instructions and misaligned SSE support"); 2624 __ align(CodeEntryAlignment); 2625 StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock"); 2626 Label L_doLast; 2627 2628 const Register from = c_rarg0; // source array address 2629 const Register to = c_rarg1; // destination array address 2630 const Register key = c_rarg2; // key array address 2631 const Register keylen = rscratch1; 2632 2633 address start = __ pc(); 2634 __ enter(); // required for proper stackwalking of RuntimeStub frame 2635 2636 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2637 2638 __ ld1(v0, __ T16B, from); // get 16 bytes of 
input 2639 2640 __ ld1(v5, __ T16B, __ post(key, 16)); 2641 __ rev32(v5, __ T16B, v5); 2642 2643 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64)); 2644 __ rev32(v1, __ T16B, v1); 2645 __ rev32(v2, __ T16B, v2); 2646 __ rev32(v3, __ T16B, v3); 2647 __ rev32(v4, __ T16B, v4); 2648 __ aesd(v0, v1); 2649 __ aesimc(v0, v0); 2650 __ aesd(v0, v2); 2651 __ aesimc(v0, v0); 2652 __ aesd(v0, v3); 2653 __ aesimc(v0, v0); 2654 __ aesd(v0, v4); 2655 __ aesimc(v0, v0); 2656 2657 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64)); 2658 __ rev32(v1, __ T16B, v1); 2659 __ rev32(v2, __ T16B, v2); 2660 __ rev32(v3, __ T16B, v3); 2661 __ rev32(v4, __ T16B, v4); 2662 __ aesd(v0, v1); 2663 __ aesimc(v0, v0); 2664 __ aesd(v0, v2); 2665 __ aesimc(v0, v0); 2666 __ aesd(v0, v3); 2667 __ aesimc(v0, v0); 2668 __ aesd(v0, v4); 2669 __ aesimc(v0, v0); 2670 2671 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2672 __ rev32(v1, __ T16B, v1); 2673 __ rev32(v2, __ T16B, v2); 2674 2675 __ cmpw(keylen, 44); 2676 __ br(Assembler::EQ, L_doLast); 2677 2678 __ aesd(v0, v1); 2679 __ aesimc(v0, v0); 2680 __ aesd(v0, v2); 2681 __ aesimc(v0, v0); 2682 2683 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2684 __ rev32(v1, __ T16B, v1); 2685 __ rev32(v2, __ T16B, v2); 2686 2687 __ cmpw(keylen, 52); 2688 __ br(Assembler::EQ, L_doLast); 2689 2690 __ aesd(v0, v1); 2691 __ aesimc(v0, v0); 2692 __ aesd(v0, v2); 2693 __ aesimc(v0, v0); 2694 2695 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2696 __ rev32(v1, __ T16B, v1); 2697 __ rev32(v2, __ T16B, v2); 2698 2699 __ BIND(L_doLast); 2700 2701 __ aesd(v0, v1); 2702 __ aesimc(v0, v0); 2703 __ aesd(v0, v2); 2704 2705 __ eor(v0, __ T16B, v0, v5); 2706 2707 __ st1(v0, __ T16B, to); 2708 2709 __ mov(r0, 0); 2710 2711 __ leave(); 2712 __ ret(lr); 2713 2714 return start; 2715 } 2716 2717 // Arguments: 2718 // 2719 // Inputs: 2720 // c_rarg0 - source byte array address 2721 // c_rarg1 - destination byte array address 2722 // c_rarg2 - K (key) in little endian int array 2723 // c_rarg3 - r vector byte array address 2724 // c_rarg4 - input length 2725 // 2726 // Output: 2727 // x0 - input length 2728 // 2729 address generate_cipherBlockChaining_encryptAESCrypt() { 2730 assert(UseAES, "need AES instructions and misaligned SSE support"); 2731 __ align(CodeEntryAlignment); 2732 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt"); 2733 2734 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52, _L_finish; 2735 2736 const Register from = c_rarg0; // source array address 2737 const Register to = c_rarg1; // destination array address 2738 const Register key = c_rarg2; // key array address 2739 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2740 // and left with the results of the last encryption block 2741 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2742 const Register keylen = rscratch1; 2743 2744 address start = __ pc(); 2745 2746 __ enter(); 2747 2748 __ subsw(rscratch2, len_reg, zr); 2749 __ br(Assembler::LE, _L_finish); 2750 2751 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2752 2753 __ ld1(v0, __ T16B, rvec); 2754 2755 __ cmpw(keylen, 52); 2756 __ br(Assembler::CC, L_loadkeys_44); 2757 __ br(Assembler::EQ, L_loadkeys_52); 2758 2759 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2760 __ rev32(v17, __ T16B, v17); 2761 __ rev32(v18, __ T16B, v18); 2762 __ BIND(L_loadkeys_52); 2763 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2764 
__ rev32(v19, __ T16B, v19); 2765 __ rev32(v20, __ T16B, v20); 2766 __ BIND(L_loadkeys_44); 2767 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2768 __ rev32(v21, __ T16B, v21); 2769 __ rev32(v22, __ T16B, v22); 2770 __ rev32(v23, __ T16B, v23); 2771 __ rev32(v24, __ T16B, v24); 2772 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2773 __ rev32(v25, __ T16B, v25); 2774 __ rev32(v26, __ T16B, v26); 2775 __ rev32(v27, __ T16B, v27); 2776 __ rev32(v28, __ T16B, v28); 2777 __ ld1(v29, v30, v31, __ T16B, key); 2778 __ rev32(v29, __ T16B, v29); 2779 __ rev32(v30, __ T16B, v30); 2780 __ rev32(v31, __ T16B, v31); 2781 2782 __ BIND(L_aes_loop); 2783 __ ld1(v1, __ T16B, __ post(from, 16)); 2784 __ eor(v0, __ T16B, v0, v1); 2785 2786 __ br(Assembler::CC, L_rounds_44); 2787 __ br(Assembler::EQ, L_rounds_52); 2788 2789 __ aese(v0, v17); __ aesmc(v0, v0); 2790 __ aese(v0, v18); __ aesmc(v0, v0); 2791 __ BIND(L_rounds_52); 2792 __ aese(v0, v19); __ aesmc(v0, v0); 2793 __ aese(v0, v20); __ aesmc(v0, v0); 2794 __ BIND(L_rounds_44); 2795 __ aese(v0, v21); __ aesmc(v0, v0); 2796 __ aese(v0, v22); __ aesmc(v0, v0); 2797 __ aese(v0, v23); __ aesmc(v0, v0); 2798 __ aese(v0, v24); __ aesmc(v0, v0); 2799 __ aese(v0, v25); __ aesmc(v0, v0); 2800 __ aese(v0, v26); __ aesmc(v0, v0); 2801 __ aese(v0, v27); __ aesmc(v0, v0); 2802 __ aese(v0, v28); __ aesmc(v0, v0); 2803 __ aese(v0, v29); __ aesmc(v0, v0); 2804 __ aese(v0, v30); 2805 __ eor(v0, __ T16B, v0, v31); 2806 2807 __ st1(v0, __ T16B, __ post(to, 16)); 2808 2809 __ subw(len_reg, len_reg, 16); 2810 __ cbnzw(len_reg, L_aes_loop); 2811 2812 __ st1(v0, __ T16B, rvec); 2813 2814 __ BIND(_L_finish); 2815 __ mov(r0, rscratch2); 2816 2817 __ leave(); 2818 __ ret(lr); 2819 2820 return start; 2821 } 2822 2823 // Arguments: 2824 // 2825 // Inputs: 2826 // c_rarg0 - source byte array address 2827 // c_rarg1 - destination byte array address 2828 // c_rarg2 - K (key) in little endian int array 2829 // c_rarg3 - r vector byte array address 2830 // c_rarg4 - input length 2831 // 2832 // Output: 2833 // r0 - input length 2834 // 2835 address generate_cipherBlockChaining_decryptAESCrypt() { 2836 assert(UseAES, "need AES instructions and misaligned SSE support"); 2837 __ align(CodeEntryAlignment); 2838 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt"); 2839 2840 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52, _L_finish; 2841 2842 const Register from = c_rarg0; // source array address 2843 const Register to = c_rarg1; // destination array address 2844 const Register key = c_rarg2; // key array address 2845 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2846 // and left with the results of the last encryption block 2847 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2848 const Register keylen = rscratch1; 2849 2850 address start = __ pc(); 2851 2852 __ enter(); 2853 2854 __ subsw(rscratch2, len_reg, zr); 2855 __ br(Assembler::LE, _L_finish); 2856 2857 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2858 2859 __ ld1(v2, __ T16B, rvec); 2860 2861 __ ld1(v31, __ T16B, __ post(key, 16)); 2862 __ rev32(v31, __ T16B, v31); 2863 2864 __ cmpw(keylen, 52); 2865 __ br(Assembler::CC, L_loadkeys_44); 2866 __ br(Assembler::EQ, L_loadkeys_52); 2867 2868 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2869 __ rev32(v17, __ T16B, v17); 2870 __ rev32(v18, __ T16B, v18); 2871 __ 
BIND(L_loadkeys_52); 2872 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2873 __ rev32(v19, __ T16B, v19); 2874 __ rev32(v20, __ T16B, v20); 2875 __ BIND(L_loadkeys_44); 2876 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2877 __ rev32(v21, __ T16B, v21); 2878 __ rev32(v22, __ T16B, v22); 2879 __ rev32(v23, __ T16B, v23); 2880 __ rev32(v24, __ T16B, v24); 2881 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2882 __ rev32(v25, __ T16B, v25); 2883 __ rev32(v26, __ T16B, v26); 2884 __ rev32(v27, __ T16B, v27); 2885 __ rev32(v28, __ T16B, v28); 2886 __ ld1(v29, v30, __ T16B, key); 2887 __ rev32(v29, __ T16B, v29); 2888 __ rev32(v30, __ T16B, v30); 2889 2890 __ BIND(L_aes_loop); 2891 __ ld1(v0, __ T16B, __ post(from, 16)); 2892 __ orr(v1, __ T16B, v0, v0); 2893 2894 __ br(Assembler::CC, L_rounds_44); 2895 __ br(Assembler::EQ, L_rounds_52); 2896 2897 __ aesd(v0, v17); __ aesimc(v0, v0); 2898 __ aesd(v0, v18); __ aesimc(v0, v0); 2899 __ BIND(L_rounds_52); 2900 __ aesd(v0, v19); __ aesimc(v0, v0); 2901 __ aesd(v0, v20); __ aesimc(v0, v0); 2902 __ BIND(L_rounds_44); 2903 __ aesd(v0, v21); __ aesimc(v0, v0); 2904 __ aesd(v0, v22); __ aesimc(v0, v0); 2905 __ aesd(v0, v23); __ aesimc(v0, v0); 2906 __ aesd(v0, v24); __ aesimc(v0, v0); 2907 __ aesd(v0, v25); __ aesimc(v0, v0); 2908 __ aesd(v0, v26); __ aesimc(v0, v0); 2909 __ aesd(v0, v27); __ aesimc(v0, v0); 2910 __ aesd(v0, v28); __ aesimc(v0, v0); 2911 __ aesd(v0, v29); __ aesimc(v0, v0); 2912 __ aesd(v0, v30); 2913 __ eor(v0, __ T16B, v0, v31); 2914 __ eor(v0, __ T16B, v0, v2); 2915 2916 __ st1(v0, __ T16B, __ post(to, 16)); 2917 __ orr(v2, __ T16B, v1, v1); 2918 2919 __ subw(len_reg, len_reg, 16); 2920 __ cbnzw(len_reg, L_aes_loop); 2921 2922 __ st1(v2, __ T16B, rvec); 2923 2924 __ BIND(_L_finish); 2925 __ mov(r0, rscratch2); 2926 2927 __ leave(); 2928 __ ret(lr); 2929 2930 return start; 2931 } 2932 2933 // Arguments: 2934 // 2935 // Inputs: 2936 // c_rarg0 - byte[] source+offset 2937 // c_rarg1 - int[] SHA.state 2938 // c_rarg2 - int offset 2939 // c_rarg3 - int limit 2940 // 2941 address generate_sha1_implCompress(bool multi_block, const char *name) { 2942 __ align(CodeEntryAlignment); 2943 StubCodeMark mark(this, "StubRoutines", name); 2944 address start = __ pc(); 2945 2946 Register buf = c_rarg0; 2947 Register state = c_rarg1; 2948 Register ofs = c_rarg2; 2949 Register limit = c_rarg3; 2950 2951 Label keys; 2952 Label sha1_loop; 2953 2954 // load the keys into v0..v3 2955 __ adr(rscratch1, keys); 2956 __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1)); 2957 // load 5 words state into v6, v7 2958 __ ldrq(v6, Address(state, 0)); 2959 __ ldrs(v7, Address(state, 16)); 2960 2961 2962 __ BIND(sha1_loop); 2963 // load 64 bytes of data into v16..v19 2964 __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf); 2965 __ rev32(v16, __ T16B, v16); 2966 __ rev32(v17, __ T16B, v17); 2967 __ rev32(v18, __ T16B, v18); 2968 __ rev32(v19, __ T16B, v19); 2969 2970 // do the sha1 2971 __ addv(v4, __ T4S, v16, v0); 2972 __ orr(v20, __ T16B, v6, v6); 2973 2974 FloatRegister d0 = v16; 2975 FloatRegister d1 = v17; 2976 FloatRegister d2 = v18; 2977 FloatRegister d3 = v19; 2978 2979 for (int round = 0; round < 20; round++) { 2980 FloatRegister tmp1 = (round & 1) ? v4 : v5; 2981 FloatRegister tmp2 = (round & 1) ? v21 : v22; 2982 FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7; 2983 FloatRegister tmp4 = (round & 1) ? v5 : v4; 2984 FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? 
v2 : v3)); 2985 2986 if (round < 16) __ sha1su0(d0, __ T4S, d1, d2); 2987 if (round < 19) __ addv(tmp1, __ T4S, d1, key); 2988 __ sha1h(tmp2, __ T4S, v20); 2989 if (round < 5) 2990 __ sha1c(v20, __ T4S, tmp3, tmp4); 2991 else if (round < 10 || round >= 15) 2992 __ sha1p(v20, __ T4S, tmp3, tmp4); 2993 else 2994 __ sha1m(v20, __ T4S, tmp3, tmp4); 2995 if (round < 16) __ sha1su1(d0, __ T4S, d3); 2996 2997 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 2998 } 2999 3000 __ addv(v7, __ T2S, v7, v21); 3001 __ addv(v6, __ T4S, v6, v20); 3002 3003 if (multi_block) { 3004 __ add(ofs, ofs, 64); 3005 __ cmp(ofs, limit); 3006 __ br(Assembler::LE, sha1_loop); 3007 __ mov(c_rarg0, ofs); // return ofs 3008 } 3009 3010 __ strq(v6, Address(state, 0)); 3011 __ strs(v7, Address(state, 16)); 3012 3013 __ ret(lr); 3014 3015 __ bind(keys); 3016 __ emit_int32(0x5a827999); 3017 __ emit_int32(0x6ed9eba1); 3018 __ emit_int32(0x8f1bbcdc); 3019 __ emit_int32(0xca62c1d6); 3020 3021 return start; 3022 } 3023 3024 3025 // Arguments: 3026 // 3027 // Inputs: 3028 // c_rarg0 - byte[] source+offset 3029 // c_rarg1 - int[] SHA.state 3030 // c_rarg2 - int offset 3031 // c_rarg3 - int limit 3032 // 3033 address generate_sha256_implCompress(bool multi_block, const char *name) { 3034 static const uint32_t round_consts[64] = { 3035 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 3036 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, 3037 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 3038 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 3039 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 3040 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, 3041 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 3042 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, 3043 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 3044 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 3045 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 3046 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, 3047 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 3048 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, 3049 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 3050 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, 3051 }; 3052 __ align(CodeEntryAlignment); 3053 StubCodeMark mark(this, "StubRoutines", name); 3054 address start = __ pc(); 3055 3056 Register buf = c_rarg0; 3057 Register state = c_rarg1; 3058 Register ofs = c_rarg2; 3059 Register limit = c_rarg3; 3060 3061 Label sha1_loop; 3062 3063 __ stpd(v8, v9, __ pre(sp, -32)); 3064 __ stpd(v10, v11, Address(sp, 16)); 3065 3066 // dga == v0 3067 // dgb == v1 3068 // dg0 == v2 3069 // dg1 == v3 3070 // dg2 == v4 3071 // t0 == v6 3072 // t1 == v7 3073 3074 // load 16 keys to v16..v31 3075 __ lea(rscratch1, ExternalAddress((address)round_consts)); 3076 __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64)); 3077 __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64)); 3078 __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64)); 3079 __ ld1(v28, v29, v30, v31, __ T4S, rscratch1); 3080 3081 // load 8 words (256 bits) state 3082 __ ldpq(v0, v1, state); 3083 3084 __ BIND(sha1_loop); 3085 // load 64 bytes of data into v8..v11 3086 __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? 
__ post(buf, 64) : buf); 3087 __ rev32(v8, __ T16B, v8); 3088 __ rev32(v9, __ T16B, v9); 3089 __ rev32(v10, __ T16B, v10); 3090 __ rev32(v11, __ T16B, v11); 3091 3092 __ addv(v6, __ T4S, v8, v16); 3093 __ orr(v2, __ T16B, v0, v0); 3094 __ orr(v3, __ T16B, v1, v1); 3095 3096 FloatRegister d0 = v8; 3097 FloatRegister d1 = v9; 3098 FloatRegister d2 = v10; 3099 FloatRegister d3 = v11; 3100 3101 3102 for (int round = 0; round < 16; round++) { 3103 FloatRegister tmp1 = (round & 1) ? v6 : v7; 3104 FloatRegister tmp2 = (round & 1) ? v7 : v6; 3105 FloatRegister tmp3 = (round & 1) ? v2 : v4; 3106 FloatRegister tmp4 = (round & 1) ? v4 : v2; 3107 3108 if (round < 12) __ sha256su0(d0, __ T4S, d1); 3109 __ orr(v4, __ T16B, v2, v2); 3110 if (round < 15) 3111 __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17)); 3112 __ sha256h(v2, __ T4S, v3, tmp2); 3113 __ sha256h2(v3, __ T4S, v4, tmp2); 3114 if (round < 12) __ sha256su1(d0, __ T4S, d2, d3); 3115 3116 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 3117 } 3118 3119 __ addv(v0, __ T4S, v0, v2); 3120 __ addv(v1, __ T4S, v1, v3); 3121 3122 if (multi_block) { 3123 __ add(ofs, ofs, 64); 3124 __ cmp(ofs, limit); 3125 __ br(Assembler::LE, sha1_loop); 3126 __ mov(c_rarg0, ofs); // return ofs 3127 } 3128 3129 __ ldpd(v10, v11, Address(sp, 16)); 3130 __ ldpd(v8, v9, __ post(sp, 32)); 3131 3132 __ stpq(v0, v1, state); 3133 3134 __ ret(lr); 3135 3136 return start; 3137 } 3138 3139 // Safefetch stubs. 3140 void generate_safefetch(const char* name, int size, address* entry, 3141 address* fault_pc, address* continuation_pc) { 3142 // safefetch signatures: 3143 // int SafeFetch32(int* adr, int errValue); 3144 // intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue); 3145 // 3146 // arguments: 3147 // c_rarg0 = adr 3148 // c_rarg1 = errValue 3149 // 3150 // result: 3151 // PPC_RET = *adr or errValue 3152 3153 StubCodeMark mark(this, "StubRoutines", name); 3154 3155 // Entry point, pc or function descriptor. 3156 *entry = __ pc(); 3157 3158 // Load *adr into c_rarg1, may fault. 
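    // If the load does fault, the VM's signal handler is expected to resume
    // execution at *continuation_pc, leaving errValue untouched in c_rarg1.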
3159 *fault_pc = __ pc(); 3160 switch (size) { 3161 case 4: 3162 // int32_t 3163 __ ldrw(c_rarg1, Address(c_rarg0, 0)); 3164 break; 3165 case 8: 3166 // int64_t 3167 __ ldr(c_rarg1, Address(c_rarg0, 0)); 3168 break; 3169 default: 3170 ShouldNotReachHere(); 3171 } 3172 3173 // return errValue or *adr 3174 *continuation_pc = __ pc(); 3175 __ mov(r0, c_rarg1); 3176 __ ret(lr); 3177 } 3178 3179 /** 3180 * Arguments: 3181 * 3182 * Inputs: 3183 * c_rarg0 - int crc 3184 * c_rarg1 - byte* buf 3185 * c_rarg2 - int length 3186 * 3187 * Output: 3188 * r0 - int crc result 3189 * 3190 * Preserves: 3191 * r13 3192 * 3193 */ 3194 address generate_updateBytesCRC32() { 3195 assert(UseCRC32Intrinsics, "what are we doing here?"); 3196 3197 __ align(CodeEntryAlignment); 3198 StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32"); 3199 3200 address start = __ pc(); 3201 3202 const Register crc = c_rarg0; // crc 3203 const Register buf = c_rarg1; // source java byte array address 3204 const Register len = c_rarg2; // length 3205 const Register table0 = c_rarg3; // crc_table address 3206 const Register table1 = c_rarg4; 3207 const Register table2 = c_rarg5; 3208 const Register table3 = c_rarg6; 3209 const Register tmp3 = c_rarg7; 3210 3211 BLOCK_COMMENT("Entry:"); 3212 __ enter(); // required for proper stackwalking of RuntimeStub frame 3213 3214 __ kernel_crc32(crc, buf, len, 3215 table0, table1, table2, table3, rscratch1, rscratch2, tmp3); 3216 3217 __ leave(); // required for proper stackwalking of RuntimeStub frame 3218 __ ret(lr); 3219 3220 return start; 3221 } 3222 3223 void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi, 3224 FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0, 3225 FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4) { 3226 // Karatsuba multiplication performs a 128*128 -> 256-bit 3227 // multiplication in three 128-bit multiplications and a few 3228 // additions. 3229 // 3230 // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1) 3231 // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0 3232 // 3233 // Inputs: 3234 // 3235 // A0 in a.d[0] (subkey) 3236 // A1 in a.d[1] 3237 // (A1+A0) in a1_xor_a0.d[0] 3238 // 3239 // B0 in b.d[0] (state) 3240 // B1 in b.d[1] 3241 3242 __ ext(tmp1, __ T16B, b, b, 0x08); 3243 __ pmull2(result_hi, __ T1Q, b, a, __ T2D); // A1*B1 3244 __ eor(tmp1, __ T16B, tmp1, b); // (B1+B0) 3245 __ pmull(result_lo, __ T1Q, b, a, __ T1D); // A0*B0 3246 __ pmull(tmp2, __ T1Q, tmp1, a1_xor_a0, __ T1D); // (A1+A0)(B1+B0) 3247 3248 __ ext(tmp4, __ T16B, result_lo, result_hi, 0x08); 3249 __ eor(tmp3, __ T16B, result_hi, result_lo); // A1*B1+A0*B0 3250 __ eor(tmp2, __ T16B, tmp2, tmp4); 3251 __ eor(tmp2, __ T16B, tmp2, tmp3); 3252 3253 // Register pair <result_hi:result_lo> holds the result of carry-less multiplication 3254 __ ins(result_hi, __ D, tmp2, 0, 1); 3255 __ ins(result_lo, __ D, tmp2, 1, 0); 3256 } 3257 3258 void ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi, 3259 FloatRegister p, FloatRegister z, FloatRegister t1) { 3260 const FloatRegister t0 = result; 3261 3262 // The GCM field polynomial f is z^128 + p(z), where p = 3263 // z^7+z^2+z+1. 3264 // 3265 // z^128 === -p(z) (mod (z^128 + p(z))) 3266 // 3267 // so, given that the product we're reducing is 3268 // a == lo + hi * z^128 3269 // substituting, 3270 // === lo - hi * p(z) (mod (z^128 + p(z))) 3271 // 3272 // we reduce by multiplying hi by p(z) and subtracting the result 3273 // from (i.e. XORing it with) lo. 
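    // (Coefficients live in GF(2), so addition and subtraction are the same
    // operation, namely XOR.)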
Because p has no nonzero high 3274 // bits we can do this with two 64-bit multiplications, lo*p and 3275 // hi*p. 3276 3277 __ pmull2(t0, __ T1Q, hi, p, __ T2D); 3278 __ ext(t1, __ T16B, t0, z, 8); 3279 __ eor(hi, __ T16B, hi, t1); 3280 __ ext(t1, __ T16B, z, t0, 8); 3281 __ eor(lo, __ T16B, lo, t1); 3282 __ pmull(t0, __ T1Q, hi, p, __ T1D); 3283 __ eor(result, __ T16B, lo, t0); 3284 } 3285 3286 /** 3287 * Arguments: 3288 * 3289 * Input: 3290 * c_rarg0 - x address 3291 * c_rarg1 - x length 3292 * c_rarg2 - y address 3293 * c_rarg3 - y lenth 3294 * c_rarg4 - z address 3295 * c_rarg5 - z length 3296 */ 3297 address generate_multiplyToLen() { 3298 __ align(CodeEntryAlignment); 3299 StubCodeMark mark(this, "StubRoutines", "multiplyToLen"); 3300 3301 address start = __ pc(); 3302 const Register x = r0; 3303 const Register xlen = r1; 3304 const Register y = r2; 3305 const Register ylen = r3; 3306 const Register z = r4; 3307 const Register zlen = r5; 3308 3309 const Register tmp1 = r10; 3310 const Register tmp2 = r11; 3311 const Register tmp3 = r12; 3312 const Register tmp4 = r13; 3313 const Register tmp5 = r14; 3314 const Register tmp6 = r15; 3315 const Register tmp7 = r16; 3316 3317 BLOCK_COMMENT("Entry:"); 3318 __ enter(); // required for proper stackwalking of RuntimeStub frame 3319 __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 3320 __ leave(); // required for proper stackwalking of RuntimeStub frame 3321 __ ret(lr); 3322 3323 return start; 3324 } 3325 3326 /** 3327 * Arguments: 3328 * 3329 * Input: 3330 * c_rarg0 - current state address 3331 * c_rarg1 - H key address 3332 * c_rarg2 - data address 3333 * c_rarg3 - number of blocks 3334 * 3335 * Output: 3336 * Updated state at c_rarg0 3337 */ 3338 address generate_ghash_processBlocks() { 3339 // Bafflingly, GCM uses little-endian for the byte order, but 3340 // big-endian for the bit order. For example, the polynomial 1 is 3341 // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00. 3342 // 3343 // So, we must either reverse the bytes in each word and do 3344 // everything big-endian or reverse the bits in each byte and do 3345 // it little-endian. On AArch64 it's more idiomatic to reverse 3346 // the bits in each byte (we have an instruction, RBIT, to do 3347 // that) and keep the data in little-endian bit order throught the 3348 // calculation, bit-reversing the inputs and outputs. 3349 3350 StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks"); 3351 __ align(wordSize * 2); 3352 address p = __ pc(); 3353 __ emit_int64(0x87); // The low-order bits of the field 3354 // polynomial (i.e. 
p = z^7+z^2+z+1) 3355 // repeated in the low and high parts of a 3356 // 128-bit vector 3357 __ emit_int64(0x87); 3358 3359 __ align(CodeEntryAlignment); 3360 address start = __ pc(); 3361 3362 Register state = c_rarg0; 3363 Register subkeyH = c_rarg1; 3364 Register data = c_rarg2; 3365 Register blocks = c_rarg3; 3366 3367 FloatRegister vzr = v30; 3368 __ eor(vzr, __ T16B, vzr, vzr); // zero register 3369 3370 __ ldrq(v0, Address(state)); 3371 __ ldrq(v1, Address(subkeyH)); 3372 3373 __ rev64(v0, __ T16B, v0); // Bit-reverse words in state and subkeyH 3374 __ rbit(v0, __ T16B, v0); 3375 __ rev64(v1, __ T16B, v1); 3376 __ rbit(v1, __ T16B, v1); 3377 3378 __ ldrq(v26, p); 3379 3380 __ ext(v16, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1 3381 __ eor(v16, __ T16B, v16, v1); // xor subkeyH into subkeyL (Karatsuba: (A1+A0)) 3382 3383 { 3384 Label L_ghash_loop; 3385 __ bind(L_ghash_loop); 3386 3387 __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit 3388 // reversing each byte 3389 __ rbit(v2, __ T16B, v2); 3390 __ eor(v2, __ T16B, v0, v2); // bit-swapped data ^ bit-swapped state 3391 3392 // Multiply state in v2 by subkey in v1 3393 ghash_multiply(/*result_lo*/v5, /*result_hi*/v7, 3394 /*a*/v1, /*b*/v2, /*a1_xor_a0*/v16, 3395 /*temps*/v6, v20, v18, v21); 3396 // Reduce v7:v5 by the field polynomial 3397 ghash_reduce(v0, v5, v7, v26, vzr, v20); 3398 3399 __ sub(blocks, blocks, 1); 3400 __ cbnz(blocks, L_ghash_loop); 3401 } 3402 3403 // The bit-reversed result is at this point in v0 3404 __ rev64(v1, __ T16B, v0); 3405 __ rbit(v1, __ T16B, v1); 3406 3407 __ st1(v1, __ T16B, state); 3408 __ ret(lr); 3409 3410 return start; 3411 } 3412 3413 // Continuation point for throwing of implicit exceptions that are 3414 // not handled in the current activation. Fabricates an exception 3415 // oop and initiates normal exception dispatching in this 3416 // frame. Since we need to preserve callee-saved values (currently 3417 // only for C2, but done for C1 as well) we need a callee-saved oop 3418 // map and therefore have to make these stubs into RuntimeStubs 3419 // rather than BufferBlobs. If the compiler needs all registers to 3420 // be preserved between the fault point and the exception handler 3421 // then it must assume responsibility for that in 3422 // AbstractCompiler::continuation_for_implicit_null_exception or 3423 // continuation_for_implicit_division_by_zero_exception. All other 3424 // implicit exceptions (e.g., NullPointerException or 3425 // AbstractMethodError on entry) are either at call sites or 3426 // otherwise assume that stack unwinding will be initiated, so 3427 // caller saved registers were assumed volatile in the compiler. 3428 3429 #undef __ 3430 #define __ masm-> 3431 3432 address generate_throw_exception(const char* name, 3433 address runtime_entry, 3434 Register arg1 = noreg, 3435 Register arg2 = noreg) { 3436 // Information about frame layout at time of blocking runtime call. 3437 // Note that we only have to preserve callee-saved registers since 3438 // the compilers are responsible for supplying a continuation point 3439 // if they expect all registers to be preserved. 3440 // n.b. 
aarch64 asserts that frame::arg_reg_save_area_bytes == 0
3441 enum layout {
3442 rfp_off = 0,
3443 rfp_off2,
3444 return_off,
3445 return_off2,
3446 framesize // inclusive of return address
3447 };
3448
3449 int insts_size = 512;
3450 int locs_size = 64;
3451
3452 CodeBuffer code(name, insts_size, locs_size);
3453 OopMapSet* oop_maps = new OopMapSet();
3454 MacroAssembler* masm = new MacroAssembler(&code);
3455
3456 address start = __ pc();
3457
3458 // This is an inlined and slightly modified version of call_VM
3459 // which has the ability to fetch the return PC out of
3460 // thread-local storage and also sets up last_Java_sp slightly
3461 // differently than the real call_VM
3462
3463 __ enter(); // Save FP and LR before call
3464
3465 assert(is_even(framesize/2), "sp not 16-byte aligned");
3466
3467 // lr and fp are already in place
3468 __ sub(sp, rfp, ((unsigned)framesize-4) << LogBytesPerInt); // prolog
3469
3470 int frame_complete = __ pc() - start;
3471
3472 // Set up last_Java_sp and last_Java_fp
3473 address the_pc = __ pc();
3474 __ set_last_Java_frame(sp, rfp, (address)NULL, rscratch1);
3475
3476 // Call runtime
3477 if (arg1 != noreg) {
3478 assert(arg2 != c_rarg1, "clobbered");
3479 __ mov(c_rarg1, arg1);
3480 }
3481 if (arg2 != noreg) {
3482 __ mov(c_rarg2, arg2);
3483 }
3484 __ mov(c_rarg0, rthread);
3485 BLOCK_COMMENT("call runtime_entry");
3486 __ mov(rscratch1, runtime_entry);
3487 __ blr(rscratch1);
3488
3489 // Generate oop map
3490 OopMap* map = new OopMap(framesize, 0);
3491
3492 oop_maps->add_gc_map(the_pc - start, map);
3493
3494 __ reset_last_Java_frame(true);
3495 __ maybe_isb();
3496
3497 __ leave();
3498
3499 // check for pending exceptions
3500 #ifdef ASSERT
3501 Label L;
3502 __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
3503 __ cbnz(rscratch1, L);
3504 __ should_not_reach_here();
3505 __ bind(L);
3506 #endif // ASSERT
3507 __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3508
3509
3510 // codeBlob framesize is in words (not VMRegImpl::slot_size)
3511 RuntimeStub* stub =
3512 RuntimeStub::new_runtime_stub(name,
3513 &code,
3514 frame_complete,
3515 (framesize >> (LogBytesPerWord - LogBytesPerInt)),
3516 oop_maps, false);
3517 return stub->entry_point();
3518 }
3519
3520 class MontgomeryMultiplyGenerator : public MacroAssembler {
3521
3522 Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
3523 Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
3524
3525 RegSet _toSave;
3526 bool _squaring;
3527
3528 public:
3529 MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
3530 : MacroAssembler(as->code()), _squaring(squaring) {
3531
3532 // Register allocation
3533
3534 Register reg = c_rarg0;
3535 Pa_base = reg; // Argument registers
3536 if (squaring)
3537 Pb_base = Pa_base;
3538 else
3539 Pb_base = ++reg;
3540 Pn_base = ++reg;
3541 Rlen = ++reg;
3542 inv = ++reg;
3543 Pm_base = ++reg;
3544
3545 // Working registers:
3546 Ra = ++reg; // The current digit of a, b, n, and m.
3547 Rb = ++reg;
3548 Rm = ++reg;
3549 Rn = ++reg;
3550
3551 Pa = ++reg; // Pointers to the current/next digit of a, b, n, and m.
3552 Pb = ++reg;
3553 Pm = ++reg;
3554 Pn = ++reg;
3555
3556 t0 = ++reg; // Three registers which form a
3557 t1 = ++reg; // triple-precision accumulator.
3558 t2 = ++reg;
3559
3560 Ri = ++reg; // Inner and outer loop indexes.
3561 Rj = ++reg;
3562
3563 Rhi_ab = ++reg; // Product registers: low and high parts
3564 Rlo_ab = ++reg; // of a*b and m*n.
3565 Rhi_mn = ++reg; 3566 Rlo_mn = ++reg; 3567 3568 // r19 and up are callee-saved. 3569 _toSave = RegSet::range(r19, reg) + Pm_base; 3570 } 3571 3572 private: 3573 void save_regs() { 3574 push(_toSave, sp); 3575 } 3576 3577 void restore_regs() { 3578 pop(_toSave, sp); 3579 } 3580 3581 template <typename T> 3582 void unroll_2(Register count, T block) { 3583 Label loop, end, odd; 3584 tbnz(count, 0, odd); 3585 cbz(count, end); 3586 align(16); 3587 bind(loop); 3588 (this->*block)(); 3589 bind(odd); 3590 (this->*block)(); 3591 subs(count, count, 2); 3592 br(Assembler::GT, loop); 3593 bind(end); 3594 } 3595 3596 template <typename T> 3597 void unroll_2(Register count, T block, Register d, Register s, Register tmp) { 3598 Label loop, end, odd; 3599 tbnz(count, 0, odd); 3600 cbz(count, end); 3601 align(16); 3602 bind(loop); 3603 (this->*block)(d, s, tmp); 3604 bind(odd); 3605 (this->*block)(d, s, tmp); 3606 subs(count, count, 2); 3607 br(Assembler::GT, loop); 3608 bind(end); 3609 } 3610 3611 void pre1(RegisterOrConstant i) { 3612 block_comment("pre1"); 3613 // Pa = Pa_base; 3614 // Pb = Pb_base + i; 3615 // Pm = Pm_base; 3616 // Pn = Pn_base + i; 3617 // Ra = *Pa; 3618 // Rb = *Pb; 3619 // Rm = *Pm; 3620 // Rn = *Pn; 3621 ldr(Ra, Address(Pa_base)); 3622 ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); 3623 ldr(Rm, Address(Pm_base)); 3624 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 3625 lea(Pa, Address(Pa_base)); 3626 lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); 3627 lea(Pm, Address(Pm_base)); 3628 lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 3629 3630 // Zero the m*n result. 3631 mov(Rhi_mn, zr); 3632 mov(Rlo_mn, zr); 3633 } 3634 3635 // The core multiply-accumulate step of a Montgomery 3636 // multiplication. The idea is to schedule operations as a 3637 // pipeline so that instructions with long latencies (loads and 3638 // multiplies) have time to complete before their results are 3639 // used. This most benefits in-order implementations of the 3640 // architecture but out-of-order ones also benefit. 3641 void step() { 3642 block_comment("step"); 3643 // MACC(Ra, Rb, t0, t1, t2); 3644 // Ra = *++Pa; 3645 // Rb = *--Pb; 3646 umulh(Rhi_ab, Ra, Rb); 3647 mul(Rlo_ab, Ra, Rb); 3648 ldr(Ra, pre(Pa, wordSize)); 3649 ldr(Rb, pre(Pb, -wordSize)); 3650 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the 3651 // previous iteration. 
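    // In these comments, MACC(x, y, t0, t1, t2) denotes accumulating the
    // 128-bit product x*y into the triple-precision accumulator t0:t1:t2.
    // As a rough C-like sketch (illustrative only -- MACC is not a helper
    // defined in this file, and unsigned __int128 is just a convenient way
    // to write the double-word product):
    //
    //   void MACC(unsigned long x, unsigned long y,
    //             unsigned long &t0, unsigned long &t1, unsigned long &t2) {
    //     unsigned __int128 prod = (unsigned __int128)x * y;
    //     unsigned long lo = (unsigned long)prod;
    //     unsigned long hi = (unsigned long)(prod >> 64);
    //     t0 += lo;                                        // adds t0, t0, lo
    //     unsigned __int128 mid = (unsigned __int128)t1 + hi + (t0 < lo);
    //     t1 = (unsigned long)mid;                         // adcs t1, t1, hi
    //     t2 += (unsigned long)(mid >> 64);                // adc  t2, t2, zr
    //   }
    //
    // The umulh/mul pair computes the product; acc() performs the
    // three-instruction add-with-carry sequence shown on the right.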
3652 // MACC(Rm, Rn, t0, t1, t2); 3653 // Rm = *++Pm; 3654 // Rn = *--Pn; 3655 umulh(Rhi_mn, Rm, Rn); 3656 mul(Rlo_mn, Rm, Rn); 3657 ldr(Rm, pre(Pm, wordSize)); 3658 ldr(Rn, pre(Pn, -wordSize)); 3659 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 3660 } 3661 3662 void post1() { 3663 block_comment("post1"); 3664 3665 // MACC(Ra, Rb, t0, t1, t2); 3666 // Ra = *++Pa; 3667 // Rb = *--Pb; 3668 umulh(Rhi_ab, Ra, Rb); 3669 mul(Rlo_ab, Ra, Rb); 3670 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 3671 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 3672 3673 // *Pm = Rm = t0 * inv; 3674 mul(Rm, t0, inv); 3675 str(Rm, Address(Pm)); 3676 3677 // MACC(Rm, Rn, t0, t1, t2); 3678 // t0 = t1; t1 = t2; t2 = 0; 3679 umulh(Rhi_mn, Rm, Rn); 3680 3681 #ifndef PRODUCT 3682 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 3683 { 3684 mul(Rlo_mn, Rm, Rn); 3685 add(Rlo_mn, t0, Rlo_mn); 3686 Label ok; 3687 cbz(Rlo_mn, ok); { 3688 stop("broken Montgomery multiply"); 3689 } bind(ok); 3690 } 3691 #endif 3692 // We have very carefully set things up so that 3693 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 3694 // the lower half of Rm * Rn because we know the result already: 3695 // it must be -t0. t0 + (-t0) must generate a carry iff 3696 // t0 != 0. So, rather than do a mul and an adds we just set 3697 // the carry flag iff t0 is nonzero. 3698 // 3699 // mul(Rlo_mn, Rm, Rn); 3700 // adds(zr, t0, Rlo_mn); 3701 subs(zr, t0, 1); // Set carry iff t0 is nonzero 3702 adcs(t0, t1, Rhi_mn); 3703 adc(t1, t2, zr); 3704 mov(t2, zr); 3705 } 3706 3707 void pre2(RegisterOrConstant i, RegisterOrConstant len) { 3708 block_comment("pre2"); 3709 // Pa = Pa_base + i-len; 3710 // Pb = Pb_base + len; 3711 // Pm = Pm_base + i-len; 3712 // Pn = Pn_base + len; 3713 3714 if (i.is_register()) { 3715 sub(Rj, i.as_register(), len); 3716 } else { 3717 mov(Rj, i.as_constant()); 3718 sub(Rj, Rj, len); 3719 } 3720 // Rj == i-len 3721 3722 lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord))); 3723 lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord))); 3724 lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 3725 lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord))); 3726 3727 // Ra = *++Pa; 3728 // Rb = *--Pb; 3729 // Rm = *++Pm; 3730 // Rn = *--Pn; 3731 ldr(Ra, pre(Pa, wordSize)); 3732 ldr(Rb, pre(Pb, -wordSize)); 3733 ldr(Rm, pre(Pm, wordSize)); 3734 ldr(Rn, pre(Pn, -wordSize)); 3735 3736 mov(Rhi_mn, zr); 3737 mov(Rlo_mn, zr); 3738 } 3739 3740 void post2(RegisterOrConstant i, RegisterOrConstant len) { 3741 block_comment("post2"); 3742 if (i.is_constant()) { 3743 mov(Rj, i.as_constant()-len.as_constant()); 3744 } else { 3745 sub(Rj, i.as_register(), len); 3746 } 3747 3748 adds(t0, t0, Rlo_mn); // The pending m*n, low part 3749 3750 // As soon as we know the least significant digit of our result, 3751 // store it. 3752 // Pm_base[i-len] = t0; 3753 str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 3754 3755 // t0 = t1; t1 = t2; t2 = 0; 3756 adcs(t0, t1, Rhi_mn); // The pending m*n, high part 3757 adc(t1, t2, zr); 3758 mov(t2, zr); 3759 } 3760 3761 // A carry in t0 after Montgomery multiplication means that we 3762 // should subtract multiples of n from our result in m. We'll 3763 // keep doing that until there is no carry. 
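  //
  // The sub() used in the pseudocode here (and in the "In C, approximately"
  // blocks later in this file) only ever appears as a call; roughly, and
  // purely as an illustration of what the loop below implements, it is a
  // multi-word subtract with borrow whose final borrow is propagated out
  // of t0:
  //
  //   unsigned long sub(unsigned long Pm_base[], unsigned long Pn_base[],
  //                     unsigned long t0, int len) {
  //     unsigned long borrow = 0;
  //     for (int i = 0; i < len; i++) {
  //       unsigned long m = Pm_base[i], n = Pn_base[i];
  //       Pm_base[i] = m - n - borrow;                  // sbcs Rm, Rm, Rn
  //       borrow = (m < n) || (borrow && m == n);
  //     }
  //     return t0 - borrow;                             // sbc t0, t0, zr
  //   }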
3764 void normalize(RegisterOrConstant len) { 3765 block_comment("normalize"); 3766 // while (t0) 3767 // t0 = sub(Pm_base, Pn_base, t0, len); 3768 Label loop, post, again; 3769 Register cnt = t1, i = t2; // Re-use registers; we're done with them now 3770 cbz(t0, post); { 3771 bind(again); { 3772 mov(i, zr); 3773 mov(cnt, len); 3774 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 3775 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 3776 subs(zr, zr, zr); // set carry flag, i.e. no borrow 3777 align(16); 3778 bind(loop); { 3779 sbcs(Rm, Rm, Rn); 3780 str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 3781 add(i, i, 1); 3782 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 3783 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 3784 sub(cnt, cnt, 1); 3785 } cbnz(cnt, loop); 3786 sbc(t0, t0, zr); 3787 } cbnz(t0, again); 3788 } bind(post); 3789 } 3790 3791 // Move memory at s to d, reversing words. 3792 // Increments d to end of copied memory 3793 // Destroys tmp1, tmp2 3794 // Preserves len 3795 // Leaves s pointing to the address which was in d at start 3796 void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) { 3797 assert(tmp1 < r19 && tmp2 < r19, "register corruption"); 3798 3799 lea(s, Address(s, len, Address::uxtw(LogBytesPerWord))); 3800 mov(tmp1, len); 3801 unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2); 3802 sub(s, d, len, ext::uxtw, LogBytesPerWord); 3803 } 3804 // where 3805 void reverse1(Register d, Register s, Register tmp) { 3806 ldr(tmp, pre(s, -wordSize)); 3807 ror(tmp, tmp, 32); 3808 str(tmp, post(d, wordSize)); 3809 } 3810 3811 void step_squaring() { 3812 // An extra ACC 3813 step(); 3814 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 3815 } 3816 3817 void last_squaring(RegisterOrConstant i) { 3818 Label dont; 3819 // if ((i & 1) == 0) { 3820 tbnz(i.as_register(), 0, dont); { 3821 // MACC(Ra, Rb, t0, t1, t2); 3822 // Ra = *++Pa; 3823 // Rb = *--Pb; 3824 umulh(Rhi_ab, Ra, Rb); 3825 mul(Rlo_ab, Ra, Rb); 3826 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 3827 } bind(dont); 3828 } 3829 3830 void extra_step_squaring() { 3831 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 3832 3833 // MACC(Rm, Rn, t0, t1, t2); 3834 // Rm = *++Pm; 3835 // Rn = *--Pn; 3836 umulh(Rhi_mn, Rm, Rn); 3837 mul(Rlo_mn, Rm, Rn); 3838 ldr(Rm, pre(Pm, wordSize)); 3839 ldr(Rn, pre(Pn, -wordSize)); 3840 } 3841 3842 void post1_squaring() { 3843 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 3844 3845 // *Pm = Rm = t0 * inv; 3846 mul(Rm, t0, inv); 3847 str(Rm, Address(Pm)); 3848 3849 // MACC(Rm, Rn, t0, t1, t2); 3850 // t0 = t1; t1 = t2; t2 = 0; 3851 umulh(Rhi_mn, Rm, Rn); 3852 3853 #ifndef PRODUCT 3854 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 3855 { 3856 mul(Rlo_mn, Rm, Rn); 3857 add(Rlo_mn, t0, Rlo_mn); 3858 Label ok; 3859 cbz(Rlo_mn, ok); { 3860 stop("broken Montgomery multiply"); 3861 } bind(ok); 3862 } 3863 #endif 3864 // We have very carefully set things up so that 3865 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 3866 // the lower half of Rm * Rn because we know the result already: 3867 // it must be -t0. t0 + (-t0) must generate a carry iff 3868 // t0 != 0. So, rather than do a mul and an adds we just set 3869 // the carry flag iff t0 is nonzero. 
3870 // 3871 // mul(Rlo_mn, Rm, Rn); 3872 // adds(zr, t0, Rlo_mn); 3873 subs(zr, t0, 1); // Set carry iff t0 is nonzero 3874 adcs(t0, t1, Rhi_mn); 3875 adc(t1, t2, zr); 3876 mov(t2, zr); 3877 } 3878 3879 void acc(Register Rhi, Register Rlo, 3880 Register t0, Register t1, Register t2) { 3881 adds(t0, t0, Rlo); 3882 adcs(t1, t1, Rhi); 3883 adc(t2, t2, zr); 3884 } 3885 3886 public: 3887 /** 3888 * Fast Montgomery multiplication. The derivation of the 3889 * algorithm is in A Cryptographic Library for the Motorola 3890 * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237. 3891 * 3892 * Arguments: 3893 * 3894 * Inputs for multiplication: 3895 * c_rarg0 - int array elements a 3896 * c_rarg1 - int array elements b 3897 * c_rarg2 - int array elements n (the modulus) 3898 * c_rarg3 - int length 3899 * c_rarg4 - int inv 3900 * c_rarg5 - int array elements m (the result) 3901 * 3902 * Inputs for squaring: 3903 * c_rarg0 - int array elements a 3904 * c_rarg1 - int array elements n (the modulus) 3905 * c_rarg2 - int length 3906 * c_rarg3 - int inv 3907 * c_rarg4 - int array elements m (the result) 3908 * 3909 */ 3910 address generate_multiply() { 3911 Label argh, nothing; 3912 bind(argh); 3913 stop("MontgomeryMultiply total_allocation must be <= 8192"); 3914 3915 align(CodeEntryAlignment); 3916 address entry = pc(); 3917 3918 cbzw(Rlen, nothing); 3919 3920 enter(); 3921 3922 // Make room. 3923 cmpw(Rlen, 512); 3924 br(Assembler::HI, argh); 3925 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 3926 andr(sp, Ra, -2 * wordSize); 3927 3928 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 3929 3930 { 3931 // Copy input args, reversing as we go. We use Ra as a 3932 // temporary variable. 3933 reverse(Ra, Pa_base, Rlen, t0, t1); 3934 if (!_squaring) 3935 reverse(Ra, Pb_base, Rlen, t0, t1); 3936 reverse(Ra, Pn_base, Rlen, t0, t1); 3937 } 3938 3939 // Push all call-saved registers and also Pm_base which we'll need 3940 // at the end. 
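    // (save_regs()/restore_regs() push and pop the _toSave set computed in
    // the constructor: r19 up to the last allocated working register, plus
    // Pm_base.)
    //
    // Note that inv is expected to satisfy inv * n[0] == -1 (mod 2^64),
    // i.e. inv == -n[0]^-1 mod 2^64; that is exactly what the non-PRODUCT
    // check below verifies. Purely as an illustration (this is not code
    // from this file -- the caller supplies inv), such an inverse can be
    // obtained from the odd word n0 by Newton/Hensel iteration:
    //
    //   unsigned long w = n0;              // correct to 3 bits since n0 is odd
    //   for (int i = 0; i < 5; i++)
    //     w *= 2 - n0 * w;                 // each step doubles the correct bits
    //   unsigned long inv = -w;            // now inv * n0 == -1 (mod 2^64)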
3941 save_regs(); 3942 3943 #ifndef PRODUCT 3944 // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply"); 3945 { 3946 ldr(Rn, Address(Pn_base, 0)); 3947 mul(Rlo_mn, Rn, inv); 3948 cmp(Rlo_mn, -1); 3949 Label ok; 3950 br(EQ, ok); { 3951 stop("broken inverse in Montgomery multiply"); 3952 } bind(ok); 3953 } 3954 #endif 3955 3956 mov(Pm_base, Ra); 3957 3958 mov(t0, zr); 3959 mov(t1, zr); 3960 mov(t2, zr); 3961 3962 block_comment("for (int i = 0; i < len; i++) {"); 3963 mov(Ri, zr); { 3964 Label loop, end; 3965 cmpw(Ri, Rlen); 3966 br(Assembler::GE, end); 3967 3968 bind(loop); 3969 pre1(Ri); 3970 3971 block_comment(" for (j = i; j; j--) {"); { 3972 movw(Rj, Ri); 3973 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 3974 } block_comment(" } // j"); 3975 3976 post1(); 3977 addw(Ri, Ri, 1); 3978 cmpw(Ri, Rlen); 3979 br(Assembler::LT, loop); 3980 bind(end); 3981 block_comment("} // i"); 3982 } 3983 3984 block_comment("for (int i = len; i < 2*len; i++) {"); 3985 mov(Ri, Rlen); { 3986 Label loop, end; 3987 cmpw(Ri, Rlen, Assembler::LSL, 1); 3988 br(Assembler::GE, end); 3989 3990 bind(loop); 3991 pre2(Ri, Rlen); 3992 3993 block_comment(" for (j = len*2-i-1; j; j--) {"); { 3994 lslw(Rj, Rlen, 1); 3995 subw(Rj, Rj, Ri); 3996 subw(Rj, Rj, 1); 3997 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 3998 } block_comment(" } // j"); 3999 4000 post2(Ri, Rlen); 4001 addw(Ri, Ri, 1); 4002 cmpw(Ri, Rlen, Assembler::LSL, 1); 4003 br(Assembler::LT, loop); 4004 bind(end); 4005 } 4006 block_comment("} // i"); 4007 4008 normalize(Rlen); 4009 4010 mov(Ra, Pm_base); // Save Pm_base in Ra 4011 restore_regs(); // Restore caller's Pm_base 4012 4013 // Copy our result into caller's Pm_base 4014 reverse(Pm_base, Ra, Rlen, t0, t1); 4015 4016 leave(); 4017 bind(nothing); 4018 ret(lr); 4019 4020 return entry; 4021 } 4022 // In C, approximately: 4023 4024 // void 4025 // montgomery_multiply(unsigned long Pa_base[], unsigned long Pb_base[], 4026 // unsigned long Pn_base[], unsigned long Pm_base[], 4027 // unsigned long inv, int len) { 4028 // unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 4029 // unsigned long *Pa, *Pb, *Pn, *Pm; 4030 // unsigned long Ra, Rb, Rn, Rm; 4031 4032 // int i; 4033 4034 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply"); 4035 4036 // for (i = 0; i < len; i++) { 4037 // int j; 4038 4039 // Pa = Pa_base; 4040 // Pb = Pb_base + i; 4041 // Pm = Pm_base; 4042 // Pn = Pn_base + i; 4043 4044 // Ra = *Pa; 4045 // Rb = *Pb; 4046 // Rm = *Pm; 4047 // Rn = *Pn; 4048 4049 // int iters = i; 4050 // for (j = 0; iters--; j++) { 4051 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 4052 // MACC(Ra, Rb, t0, t1, t2); 4053 // Ra = *++Pa; 4054 // Rb = *--Pb; 4055 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 4056 // MACC(Rm, Rn, t0, t1, t2); 4057 // Rm = *++Pm; 4058 // Rn = *--Pn; 4059 // } 4060 4061 // assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be"); 4062 // MACC(Ra, Rb, t0, t1, t2); 4063 // *Pm = Rm = t0 * inv; 4064 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be"); 4065 // MACC(Rm, Rn, t0, t1, t2); 4066 4067 // assert(t0 == 0, "broken Montgomery multiply"); 4068 4069 // t0 = t1; t1 = t2; t2 = 0; 4070 // } 4071 4072 // for (i = len; i < 2*len; i++) { 4073 // int j; 4074 4075 // Pa = Pa_base + i-len; 4076 // Pb = Pb_base + len; 4077 // Pm = Pm_base + i-len; 4078 // Pn = Pn_base + len; 4079 4080 // Ra = *++Pa; 4081 // Rb = *--Pb; 4082 // Rm = *++Pm; 4083 // Rn = *--Pn; 4084 4085 // int iters = len*2-i-1; 4086 // 
for (j = i-len+1; iters--; j++) { 4087 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 4088 // MACC(Ra, Rb, t0, t1, t2); 4089 // Ra = *++Pa; 4090 // Rb = *--Pb; 4091 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 4092 // MACC(Rm, Rn, t0, t1, t2); 4093 // Rm = *++Pm; 4094 // Rn = *--Pn; 4095 // } 4096 4097 // Pm_base[i-len] = t0; 4098 // t0 = t1; t1 = t2; t2 = 0; 4099 // } 4100 4101 // while (t0) 4102 // t0 = sub(Pm_base, Pn_base, t0, len); 4103 // } 4104 4105 /** 4106 * Fast Montgomery squaring. This uses asymptotically 25% fewer 4107 * multiplies than Montgomery multiplication so it should be up to 4108 * 25% faster. However, its loop control is more complex and it 4109 * may actually run slower on some machines. 4110 * 4111 * Arguments: 4112 * 4113 * Inputs: 4114 * c_rarg0 - int array elements a 4115 * c_rarg1 - int array elements n (the modulus) 4116 * c_rarg2 - int length 4117 * c_rarg3 - int inv 4118 * c_rarg4 - int array elements m (the result) 4119 * 4120 */ 4121 address generate_square() { 4122 Label argh; 4123 bind(argh); 4124 stop("MontgomeryMultiply total_allocation must be <= 8192"); 4125 4126 align(CodeEntryAlignment); 4127 address entry = pc(); 4128 4129 enter(); 4130 4131 // Make room. 4132 cmpw(Rlen, 512); 4133 br(Assembler::HI, argh); 4134 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 4135 andr(sp, Ra, -2 * wordSize); 4136 4137 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 4138 4139 { 4140 // Copy input args, reversing as we go. We use Ra as a 4141 // temporary variable. 4142 reverse(Ra, Pa_base, Rlen, t0, t1); 4143 reverse(Ra, Pn_base, Rlen, t0, t1); 4144 } 4145 4146 // Push all call-saved registers and also Pm_base which we'll need 4147 // at the end. 4148 save_regs(); 4149 4150 mov(Pm_base, Ra); 4151 4152 mov(t0, zr); 4153 mov(t1, zr); 4154 mov(t2, zr); 4155 4156 block_comment("for (int i = 0; i < len; i++) {"); 4157 mov(Ri, zr); { 4158 Label loop, end; 4159 bind(loop); 4160 cmp(Ri, Rlen); 4161 br(Assembler::GE, end); 4162 4163 pre1(Ri); 4164 4165 block_comment("for (j = (i+1)/2; j; j--) {"); { 4166 add(Rj, Ri, 1); 4167 lsr(Rj, Rj, 1); 4168 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 4169 } block_comment(" } // j"); 4170 4171 last_squaring(Ri); 4172 4173 block_comment(" for (j = i/2; j; j--) {"); { 4174 lsr(Rj, Ri, 1); 4175 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 4176 } block_comment(" } // j"); 4177 4178 post1_squaring(); 4179 add(Ri, Ri, 1); 4180 cmp(Ri, Rlen); 4181 br(Assembler::LT, loop); 4182 4183 bind(end); 4184 block_comment("} // i"); 4185 } 4186 4187 block_comment("for (int i = len; i < 2*len; i++) {"); 4188 mov(Ri, Rlen); { 4189 Label loop, end; 4190 bind(loop); 4191 cmp(Ri, Rlen, Assembler::LSL, 1); 4192 br(Assembler::GE, end); 4193 4194 pre2(Ri, Rlen); 4195 4196 block_comment(" for (j = (2*len-i-1)/2; j; j--) {"); { 4197 lsl(Rj, Rlen, 1); 4198 sub(Rj, Rj, Ri); 4199 sub(Rj, Rj, 1); 4200 lsr(Rj, Rj, 1); 4201 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 4202 } block_comment(" } // j"); 4203 4204 last_squaring(Ri); 4205 4206 block_comment(" for (j = (2*len-i)/2; j; j--) {"); { 4207 lsl(Rj, Rlen, 1); 4208 sub(Rj, Rj, Ri); 4209 lsr(Rj, Rj, 1); 4210 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 4211 } block_comment(" } // j"); 4212 4213 post2(Ri, Rlen); 4214 add(Ri, Ri, 1); 4215 cmp(Ri, Rlen, Assembler::LSL, 1); 4216 4217 br(Assembler::LT, loop); 4218 bind(end); 4219 block_comment("} // i"); 4220 } 4221 4222 normalize(Rlen); 4223 4224 mov(Ra, 
Pm_base); // Save Pm_base in Ra 4225 restore_regs(); // Restore caller's Pm_base 4226 4227 // Copy our result into caller's Pm_base 4228 reverse(Pm_base, Ra, Rlen, t0, t1); 4229 4230 leave(); 4231 ret(lr); 4232 4233 return entry; 4234 } 4235 // In C, approximately: 4236 4237 // void 4238 // montgomery_square(unsigned long Pa_base[], unsigned long Pn_base[], 4239 // unsigned long Pm_base[], unsigned long inv, int len) { 4240 // unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 4241 // unsigned long *Pa, *Pb, *Pn, *Pm; 4242 // unsigned long Ra, Rb, Rn, Rm; 4243 4244 // int i; 4245 4246 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply"); 4247 4248 // for (i = 0; i < len; i++) { 4249 // int j; 4250 4251 // Pa = Pa_base; 4252 // Pb = Pa_base + i; 4253 // Pm = Pm_base; 4254 // Pn = Pn_base + i; 4255 4256 // Ra = *Pa; 4257 // Rb = *Pb; 4258 // Rm = *Pm; 4259 // Rn = *Pn; 4260 4261 // int iters = (i+1)/2; 4262 // for (j = 0; iters--; j++) { 4263 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be"); 4264 // MACC2(Ra, Rb, t0, t1, t2); 4265 // Ra = *++Pa; 4266 // Rb = *--Pb; 4267 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 4268 // MACC(Rm, Rn, t0, t1, t2); 4269 // Rm = *++Pm; 4270 // Rn = *--Pn; 4271 // } 4272 // if ((i & 1) == 0) { 4273 // assert(Ra == Pa_base[j], "must be"); 4274 // MACC(Ra, Ra, t0, t1, t2); 4275 // } 4276 // iters = i/2; 4277 // assert(iters == i-j, "must be"); 4278 // for (; iters--; j++) { 4279 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 4280 // MACC(Rm, Rn, t0, t1, t2); 4281 // Rm = *++Pm; 4282 // Rn = *--Pn; 4283 // } 4284 4285 // *Pm = Rm = t0 * inv; 4286 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be"); 4287 // MACC(Rm, Rn, t0, t1, t2); 4288 4289 // assert(t0 == 0, "broken Montgomery multiply"); 4290 4291 // t0 = t1; t1 = t2; t2 = 0; 4292 // } 4293 4294 // for (i = len; i < 2*len; i++) { 4295 // int start = i-len+1; 4296 // int end = start + (len - start)/2; 4297 // int j; 4298 4299 // Pa = Pa_base + i-len; 4300 // Pb = Pa_base + len; 4301 // Pm = Pm_base + i-len; 4302 // Pn = Pn_base + len; 4303 4304 // Ra = *++Pa; 4305 // Rb = *--Pb; 4306 // Rm = *++Pm; 4307 // Rn = *--Pn; 4308 4309 // int iters = (2*len-i-1)/2; 4310 // assert(iters == end-start, "must be"); 4311 // for (j = start; iters--; j++) { 4312 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be"); 4313 // MACC2(Ra, Rb, t0, t1, t2); 4314 // Ra = *++Pa; 4315 // Rb = *--Pb; 4316 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 4317 // MACC(Rm, Rn, t0, t1, t2); 4318 // Rm = *++Pm; 4319 // Rn = *--Pn; 4320 // } 4321 // if ((i & 1) == 0) { 4322 // assert(Ra == Pa_base[j], "must be"); 4323 // MACC(Ra, Ra, t0, t1, t2); 4324 // } 4325 // iters = (2*len-i)/2; 4326 // assert(iters == len-j, "must be"); 4327 // for (; iters--; j++) { 4328 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 4329 // MACC(Rm, Rn, t0, t1, t2); 4330 // Rm = *++Pm; 4331 // Rn = *--Pn; 4332 // } 4333 // Pm_base[i-len] = t0; 4334 // t0 = t1; t1 = t2; t2 = 0; 4335 // } 4336 4337 // while (t0) 4338 // t0 = sub(Pm_base, Pn_base, t0, len); 4339 // } 4340 }; 4341 4342 // Initialization 4343 void generate_initial() { 4344 // Generate initial stubs and initializes the entry points 4345 4346 // entry points that exist in all platforms Note: This is code 4347 // that could be shared among different platforms - however the 4348 // benefit seems to be smaller than the disadvantage of having a 4349 // much more complicated generator 
structure. See also comment in
4350 // stubRoutines.hpp.
4351
4352 StubRoutines::_forward_exception_entry = generate_forward_exception();
4353
4354 StubRoutines::_call_stub_entry =
4355 generate_call_stub(StubRoutines::_call_stub_return_address);
4356
4357 // is referenced by megamorphic call
4358 StubRoutines::_catch_exception_entry = generate_catch_exception();
4359
4360 // Build this early so it's available for the interpreter.
4361 StubRoutines::_throw_StackOverflowError_entry =
4362 generate_throw_exception("StackOverflowError throw_exception",
4363 CAST_FROM_FN_PTR(address,
4364 SharedRuntime::
4365 throw_StackOverflowError));
4366 if (UseCRC32Intrinsics) {
4367 // set table address before stub generation which uses it
4368 StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
4369 StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
4370 }
4371 }
4372
4373 void generate_all() {
4374 // support for verify_oop (must happen after universe_init)
4375 StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
4376 StubRoutines::_throw_AbstractMethodError_entry =
4377 generate_throw_exception("AbstractMethodError throw_exception",
4378 CAST_FROM_FN_PTR(address,
4379 SharedRuntime::
4380 throw_AbstractMethodError));
4381
4382 StubRoutines::_throw_IncompatibleClassChangeError_entry =
4383 generate_throw_exception("IncompatibleClassChangeError throw_exception",
4384 CAST_FROM_FN_PTR(address,
4385 SharedRuntime::
4386 throw_IncompatibleClassChangeError));
4387
4388 StubRoutines::_throw_NullPointerException_at_call_entry =
4389 generate_throw_exception("NullPointerException at call throw_exception",
4390 CAST_FROM_FN_PTR(address,
4391 SharedRuntime::
4392 throw_NullPointerException_at_call));
4393
4394 // arraycopy stubs used by compilers
4395 generate_arraycopy_stubs();
4396
4397 if (UseMultiplyToLenIntrinsic) {
4398 StubRoutines::_multiplyToLen = generate_multiplyToLen();
4399 }
4400
4401 if (UseMontgomeryMultiplyIntrinsic) {
4402 StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
4403 MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
4404 StubRoutines::_montgomeryMultiply = g.generate_multiply();
4405 }
4406
4407 if (UseMontgomerySquareIntrinsic) {
4408 StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
4409 MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
4410 // We use generate_multiply() rather than generate_square()
4411 // because it's faster for the sizes of modulus we care about.
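      // (The generator above was constructed with squaring == true, so
      // Pb_base aliases Pa_base and generate_multiply() ends up computing
      // a*a; generate_square() is available but unused here.)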
4412 StubRoutines::_montgomerySquare = g.generate_multiply(); 4413 } 4414 4415 if (UseAESIntrinsics) { 4416 StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock(); 4417 StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock(); 4418 StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt(); 4419 StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt(); 4420 } 4421 4422 // generate GHASH intrinsics code 4423 if (UseGHASHIntrinsics) { 4424 StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks(); 4425 } 4426 4427 if (UseSHA1Intrinsics) { 4428 StubRoutines::_sha1_implCompress = generate_sha1_implCompress(false, "sha1_implCompress"); 4429 StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(true, "sha1_implCompressMB"); 4430 } 4431 if (UseSHA256Intrinsics) { 4432 StubRoutines::_sha256_implCompress = generate_sha256_implCompress(false, "sha256_implCompress"); 4433 StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true, "sha256_implCompressMB"); 4434 } 4435 4436 // Safefetch stubs. 4437 generate_safefetch("SafeFetch32", sizeof(int), &StubRoutines::_safefetch32_entry, 4438 &StubRoutines::_safefetch32_fault_pc, 4439 &StubRoutines::_safefetch32_continuation_pc); 4440 generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry, 4441 &StubRoutines::_safefetchN_fault_pc, 4442 &StubRoutines::_safefetchN_continuation_pc); 4443 } 4444 4445 public: 4446 StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) { 4447 if (all) { 4448 generate_all(); 4449 } else { 4450 generate_initial(); 4451 } 4452 } 4453 }; // end class declaration 4454 4455 void StubGenerator_generate(CodeBuffer* code, bool all) { 4456 StubGenerator g(code, all); 4457 }
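
// (StubGenerator_generate is expected to be driven by the shared
// StubRoutines initialization code: once early in startup with all == false
// to produce the initial stubs registered in generate_initial(), and once
// later with all == true for the remaining stubs in generate_all().)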