/*
 * Copyright (c) 2013, Red Hat Inc.
 * Copyright (c) 2003, 2015, Oracle and/or its affiliates.
 * All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/macroAssembler.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "interpreter/interpreter.hpp"
#include "nativeInst_aarch64.hpp"
#include "oops/instanceOop.hpp"
#include "oops/method.hpp"
#include "oops/objArrayKlass.hpp"
#include "oops/oop.inline.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/frame.inline.hpp"
#include "runtime/handles.inline.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubCodeGenerator.hpp"
#include "runtime/stubRoutines.hpp"
#include "runtime/thread.inline.hpp"
#include "utilities/top.hpp"
#ifdef COMPILER2
#include "opto/runtime.hpp"
#endif

// Declaration and definition of StubGenerator (no .hpp file).
// For a more detailed description of the stub routine structure
// see the comment in stubRoutines.hpp

#undef __
#define __ _masm->
#define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 4 : 8))

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) __ block_comment(str)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

// Stub Code definitions

class StubGenerator: public StubCodeGenerator {
 private:

#ifdef PRODUCT
#define inc_counter_np(counter) ((void)0)
#else
  void inc_counter_np_(int& counter) {
    __ lea(rscratch2, ExternalAddress((address)&counter));
    __ ldrw(rscratch1, Address(rscratch2));
    __ addw(rscratch1, rscratch1, 1);
    __ strw(rscratch1, Address(rscratch2));
  }
#define inc_counter_np(counter) \
  BLOCK_COMMENT("inc_counter " #counter); \
  inc_counter_np_(counter);
#endif

  // Call stubs are used to call Java from C
  //
  // Arguments:
  //    c_rarg0:   call wrapper address                   address
  //    c_rarg1:   result                                 address
  //    c_rarg2:   result type                            BasicType
  //    c_rarg3:   method                                 Method*
  //    c_rarg4:   (interpreter) entry point              address
  //    c_rarg5:   parameters                             intptr_t*
  //    c_rarg6:   parameter size (in words)              int
  //    c_rarg7:   thread                                 Thread*
  //
  // There is no return from the stub itself as any Java result
  // is written to result
  //
  // we save r30 (lr) as the return PC at the base of the frame and
  // link r29 (fp) below it as the frame pointer installing sp (r31)
  // into fp.
  //
  // we save r0-r7, which accounts for all the c arguments.
  //
  // TODO: strictly do we need to save them all? they are treated as
  // volatile by C so could we omit saving the ones we are going to
  // place in global registers (thread? method?) or those we only use
  // during setup of the Java call?
  //
  // we don't need to save r8 which C uses as an indirect result location
  // return register.
  //
  // we don't need to save r9-r15 which both C and Java treat as
  // volatile
  //
  // we don't need to save r16-18 because Java does not use them
  //
  // we save r19-r28 which Java uses as scratch registers and C
  // expects to be callee-save
  //
  // we save the bottom 64 bits of each value stored in v8-v15; it is
  // the responsibility of the caller to preserve larger values.
  //
  // so the stub frame looks like this when we enter Java code
  //
  //     [ return_from_Java     ] <--- sp
  //     [ argument word n      ]
  //      ...
  // -27 [ argument word 1      ]
  // -26 [ saved v15            ] <--- sp_after_call
  // -25 [ saved v14            ]
  // -24 [ saved v13            ]
  // -23 [ saved v12            ]
  // -22 [ saved v11            ]
  // -21 [ saved v10            ]
  // -20 [ saved v9             ]
  // -19 [ saved v8             ]
  // -18 [ saved r28            ]
  // -17 [ saved r27            ]
  // -16 [ saved r26            ]
  // -15 [ saved r25            ]
  // -14 [ saved r24            ]
  // -13 [ saved r23            ]
  // -12 [ saved r22            ]
  // -11 [ saved r21            ]
  // -10 [ saved r20            ]
  //  -9 [ saved r19            ]
  //  -8 [ call wrapper   (r0)  ]
  //  -7 [ result         (r1)  ]
  //  -6 [ result type    (r2)  ]
  //  -5 [ method         (r3)  ]
  //  -4 [ entry point    (r4)  ]
  //  -3 [ parameters     (r5)  ]
  //  -2 [ parameter size (r6)  ]
  //  -1 [ thread         (r7)  ]
  //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
  //   1 [ saved lr       (r30) ]

  // Call stub stack layout word offsets from fp
  enum call_stub_layout {
    sp_after_call_off = -26,

    d15_off            = -26,
    d13_off            = -24,
    d11_off            = -22,
    d9_off             = -20,

    r28_off            = -18,
    r26_off            = -16,
    r24_off            = -14,
    r22_off            = -12,
    r20_off            = -10,
    call_wrapper_off   =  -8,
    result_off         =  -7,
    result_type_off    =  -6,
    method_off         =  -5,
    entry_point_off    =  -4,
    parameter_size_off =  -2,
    thread_off         =  -1,
    fp_f               =   0,
    retaddr_off        =   1,
  };

  address generate_call_stub(address& return_address) {
    assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
           (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
           "adjust this code");

    StubCodeMark mark(this, "StubRoutines", "call_stub");
    address start = __ pc();

    const Address sp_after_call(rfp, sp_after_call_off * wordSize);

    const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
    const Address result        (rfp, result_off         * wordSize);
    const Address result_type   (rfp, result_type_off    * wordSize);
    const Address method        (rfp, method_off         * wordSize);
    const Address entry_point   (rfp, entry_point_off    * wordSize);
    const Address parameter_size(rfp, parameter_size_off * wordSize);

    const Address thread        (rfp, thread_off         * wordSize);

    const Address d15_save      (rfp, d15_off * wordSize);
    const Address d13_save      (rfp, d13_off * wordSize);
    const Address d11_save      (rfp, d11_off * wordSize);
    const Address d9_save       (rfp, d9_off * wordSize);

    const Address r28_save      (rfp, r28_off * wordSize);
    const Address r26_save      (rfp, r26_off * wordSize);
    const Address r24_save      (rfp, r24_off * wordSize);
    const Address r22_save      (rfp, r22_off * wordSize);
    const Address r20_save      (rfp, r20_off * wordSize);
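
    // n.b. the registers are saved in pairs below (stp/stpd), so only
    // every other slot needs a named offset above: r20_save covers r20
    // at -10 and r19 at -9, d9_save covers v9 at -20 and v8 at -19,
    // and so on, matching the frame diagram.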

    // stub code

    address aarch64_entry = __ pc();

    // set up frame and move sp to end of save area
    __ enter();
    __ sub(sp, rfp, -sp_after_call_off * wordSize);

    // save register parameters and Java scratch/global registers
    // n.b. we save thread even though it gets installed in
    // rthread because we want to sanity check rthread later
    __ str(c_rarg7,  thread);
    __ strw(c_rarg6, parameter_size);
    __ stp(c_rarg4, c_rarg5,  entry_point);
    __ stp(c_rarg2, c_rarg3,  result_type);
    __ stp(c_rarg0, c_rarg1,  call_wrapper);

    __ stp(r20, r19,   r20_save);
    __ stp(r22, r21,   r22_save);
    __ stp(r24, r23,   r24_save);
    __ stp(r26, r25,   r26_save);
    __ stp(r28, r27,   r28_save);

    __ stpd(v9,  v8,   d9_save);
    __ stpd(v11, v10,  d11_save);
    __ stpd(v13, v12,  d13_save);
    __ stpd(v15, v14,  d15_save);

    // install Java thread in global register now that we have saved
    // whatever value it held
    __ mov(rthread, c_rarg7);
    // And method
    __ mov(rmethod, c_rarg3);

    // set up the heapbase register
    __ reinit_heapbase();

#ifdef ASSERT
    // make sure we have no pending exceptions
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
      __ cmp(rscratch1, (unsigned)NULL_WORD);
      __ br(Assembler::EQ, L);
      __ stop("StubRoutines::call_stub: entered with pending exception");
      __ BIND(L);
    }
#endif
    // pass parameters if any
    __ mov(esp, sp);
    __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
    __ andr(sp, rscratch1, -2 * wordSize);

    BLOCK_COMMENT("pass parameters if any");
    Label parameters_done;
    // parameter count is still in c_rarg6
    // and parameter pointer identifying param 1 is in c_rarg5
    __ cbzw(c_rarg6, parameters_done);

    address loop = __ pc();
    __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
    __ subsw(c_rarg6, c_rarg6, 1);
    __ push(rscratch1);
    __ br(Assembler::GT, loop);

    __ BIND(parameters_done);

    // call Java entry -- passing Method*, and current sp
    //      rmethod: Method*
    //      r13: sender sp
    BLOCK_COMMENT("call Java function");
    __ mov(r13, sp);
    __ blr(c_rarg4);

    // we do this here because the notify will already have been done
    // if we get to the next instruction via an exception
    //
    // n.b. adding this instruction here affects the calculation of
    // whether or not a routine returns to the call stub (used when
    // doing stack walks) since the normal test is to check the return
    // pc against the address saved below. so we may need to allow for
    // this extra instruction in the check.

    // save current address for use by exception handling code

    return_address = __ pc();

    // store result depending on type (everything that is not
    // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
    // n.b. this assumes Java returns an integral result in r0
    // and a floating result in j_farg0
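    // n.b. a T_OBJECT result is an uncompressed oop in r0, so it is
    // stored with a full 64-bit str and shares the is_long path below.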
    __ ldr(j_rarg2, result);
    Label is_long, is_float, is_double, exit;
    __ ldr(j_rarg1, result_type);
    __ cmp(j_rarg1, T_OBJECT);
    __ br(Assembler::EQ, is_long);
    __ cmp(j_rarg1, T_LONG);
    __ br(Assembler::EQ, is_long);
    __ cmp(j_rarg1, T_FLOAT);
    __ br(Assembler::EQ, is_float);
    __ cmp(j_rarg1, T_DOUBLE);
    __ br(Assembler::EQ, is_double);

    // handle T_INT case
    __ strw(r0, Address(j_rarg2));

    __ BIND(exit);

    // pop parameters
    __ sub(esp, rfp, -sp_after_call_off * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ldr(rscratch1, thread);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::NE, S);
      __ get_thread(rscratch1);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::EQ, L);
      __ BIND(S);
      __ stop("StubRoutines::call_stub: threads must correspond");
      __ BIND(L);
    }
#endif

    // restore callee-save registers
    __ ldpd(v15, v14,  d15_save);
    __ ldpd(v13, v12,  d13_save);
    __ ldpd(v11, v10,  d11_save);
    __ ldpd(v9,  v8,   d9_save);

    __ ldp(r28, r27,   r28_save);
    __ ldp(r26, r25,   r26_save);
    __ ldp(r24, r23,   r24_save);
    __ ldp(r22, r21,   r22_save);
    __ ldp(r20, r19,   r20_save);

    __ ldp(c_rarg0, c_rarg1,  call_wrapper);
    __ ldrw(c_rarg2, result_type);
    __ ldr(c_rarg3,  method);
    __ ldp(c_rarg4, c_rarg5,  entry_point);
    __ ldp(c_rarg6, c_rarg7,  parameter_size);

    // leave frame and return to caller
    __ leave();
    __ ret(lr);

    // handle return types different from T_INT

    __ BIND(is_long);
    __ str(r0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    __ BIND(is_float);
    __ strs(j_farg0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    __ BIND(is_double);
    __ strd(j_farg0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    return start;
  }

  // Return point for a Java call if there's an exception thrown in
  // Java code.  The exception is caught and transformed into a
  // pending exception stored in JavaThread that can be tested from
  // within the VM.
  //
  // Note: Usually the parameters are removed by the callee. In case
  // of an exception crossing an activation frame boundary, that is
  // not the case if the callee is compiled code => need to set up the
  // sp.
  //
  // r0: exception oop
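  //
  // n.b. after recording the pending exception this stub jumps back
  // into the call stub at _call_stub_return_address, so the normal
  // call stub epilogue restores the saved registers and returns to
  // the C caller with the exception left pending in the thread.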

  address generate_catch_exception() {
    StubCodeMark mark(this, "StubRoutines", "catch_exception");
    address start = __ pc();

    // same as in generate_call_stub():
    const Address sp_after_call(rfp, sp_after_call_off * wordSize);
    const Address thread        (rfp, thread_off         * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ldr(rscratch1, thread);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::NE, S);
      __ get_thread(rscratch1);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::EQ, L);
      __ bind(S);
      __ stop("StubRoutines::catch_exception: threads must correspond");
      __ bind(L);
    }
#endif

    // set pending exception
    __ verify_oop(r0);

    __ str(r0, Address(rthread, Thread::pending_exception_offset()));
    __ mov(rscratch1, (address)__FILE__);
    __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
    __ movw(rscratch1, (int)__LINE__);
    __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));

    // complete return to VM
    assert(StubRoutines::_call_stub_return_address != NULL,
           "_call_stub_return_address must have been generated before");
    __ b(StubRoutines::_call_stub_return_address);

    return start;
  }

  // Continuation point for runtime calls returning with a pending
  // exception.  The pending exception check happened in the runtime
  // or native call stub.  The pending exception in Thread is
  // converted into a Java-level exception.
  //
  // Contract with Java-level exception handlers:
  // r0: exception
  // r3: throwing pc
  //
  // NOTE: At entry of this stub, exception-pc must be in LR !!

  // NOTE: this is always used as a jump target within generated code
  // so it just needs to be generated code with no prolog

  address generate_forward_exception() {
    StubCodeMark mark(this, "StubRoutines", "forward exception");
    address start = __ pc();

    // Upon entry, LR points to the return address returning into
    // Java (interpreted or compiled) code; i.e., the return address
    // becomes the throwing pc.
    //
    // Arguments pushed before the runtime call are still on the stack
    // but the exception handler will reset the stack pointer ->
    // ignore them.  A potential result in registers can be ignored as
    // well.

#ifdef ASSERT
    // make sure this code is only executed if there is a pending exception
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
      __ cbnz(rscratch1, L);
      __ stop("StubRoutines::forward exception: no pending exception (1)");
      __ bind(L);
    }
#endif

    // compute exception handler into r19

    // call the VM to find the handler address associated with the
    // caller address. pass thread in r0 and caller pc (ret address)
    // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
    // the stack.
    __ mov(c_rarg1, lr);
    // lr will be trashed by the VM call so we move it to R19
    // (callee-saved) because we also need to pass it to the handler
    // returned by this call.
    __ mov(r19, lr);
    BLOCK_COMMENT("call exception_handler_for_return_address");
    __ call_VM_leaf(CAST_FROM_FN_PTR(address,
                         SharedRuntime::exception_handler_for_return_address),
                    rthread, c_rarg1);
    // we should not really care that lr is no longer the callee
    // address.  we saved the value the handler needs in r19 so we can
    // just copy it to r3. however, the C2 handler will push its own
    // frame and then call into the VM and the VM code asserts that
    // the PC for the frame above the handler belongs to a compiled
    // Java method. So, we restore lr here to satisfy that assert.
    __ mov(lr, r19);
    // setup r0 & r3 & clear pending exception
    __ mov(r3, r19);
    __ mov(r19, r0);
    __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
    __ str(zr, Address(rthread, Thread::pending_exception_offset()));

#ifdef ASSERT
    // make sure exception is set
    {
      Label L;
      __ cbnz(r0, L);
      __ stop("StubRoutines::forward exception: no pending exception (2)");
      __ bind(L);
    }
#endif

    // continue at exception handler
    // r0: exception
    // r3: throwing pc
    // r19: exception handler
    __ verify_oop(r0);
    __ br(r19);

    return start;
  }

  // Non-destructive plausibility checks for oops
  //
  // Arguments:
  //    r0: oop to verify
  //    rscratch1: error message
  //
  // Stack after saving c_rarg3:
  //    [tos + 0]: saved c_rarg3
  //    [tos + 1]: saved c_rarg2
  //    [tos + 2]: saved lr
  //    [tos + 3]: saved rscratch2
  //    [tos + 4]: saved r0
  //    [tos + 5]: saved rscratch1
  address generate_verify_oop() {

    StubCodeMark mark(this, "StubRoutines", "verify_oop");
    address start = __ pc();

    Label exit, error;

    // save c_rarg2 and c_rarg3
    __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));

    // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ ldr(c_rarg3, Address(c_rarg2));
    __ add(c_rarg3, c_rarg3, 1);
    __ str(c_rarg3, Address(c_rarg2));

    // object is in r0
    // make sure object is 'reasonable'
    __ cbz(r0, exit); // if obj is NULL it is OK

    // Check if the oop is in the right area of memory
    __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask());
    __ andr(c_rarg2, r0, c_rarg3);
    __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits());

    // Compare c_rarg2 and c_rarg3.  We don't use a compare
    // instruction here because the flags register is live.
    __ eor(c_rarg2, c_rarg2, c_rarg3);
    __ cbnz(c_rarg2, error);

    // make sure klass is 'reasonable', which is not zero.
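    // (n.b. load_klass below also decodes a compressed klass pointer
    // when UseCompressedClassPointers is enabled)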
    __ load_klass(r0, r0);  // get klass
    __ cbz(r0, error);      // if klass is NULL it is broken

    // return if everything seems ok
    __ bind(exit);

    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
    __ ret(lr);

    // handle errors
    __ bind(error);
    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));

    __ push(RegSet::range(r0, r29), sp);
    // debug(char* msg, int64_t pc, int64_t regs[])
    __ mov(c_rarg0, rscratch1);      // pass address of error message
    __ mov(c_rarg1, lr);             // pass return address
    __ mov(c_rarg2, sp);             // pass address of regs on stack
#ifndef PRODUCT
    assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
#endif
    BLOCK_COMMENT("call MacroAssembler::debug");
    __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
    __ blr(rscratch1);

    return start;
  }

  void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); }

  // Generate code for an array write pre barrier
  //
  //     addr  - starting address
  //     count - element count
  //     tmp   - scratch register
  //
  //     Destroys no registers except rscratch1 and rscratch2
  //
  void gen_write_ref_array_pre_barrier(Register addr, Register count, bool dest_uninitialized) {
    BarrierSet* bs = Universe::heap()->barrier_set();
    switch (bs->kind()) {
    case BarrierSet::G1SATBCT:
    case BarrierSet::G1SATBCTLogging:
      // With G1, don't generate the call if we statically know that the target is uninitialized
      if (!dest_uninitialized) {
        __ push_call_clobbered_registers();
        if (count == c_rarg0) {
          if (addr == c_rarg1) {
            // exactly backwards!!
            __ mov(rscratch1, c_rarg0);
            __ mov(c_rarg0, c_rarg1);
            __ mov(c_rarg1, rscratch1);
          } else {
            __ mov(c_rarg1, count);
            __ mov(c_rarg0, addr);
          }
        } else {
          __ mov(c_rarg0, addr);
          __ mov(c_rarg1, count);
        }
        __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_pre), 2);
        __ pop_call_clobbered_registers();
        break;
      case BarrierSet::CardTableModRef:
      case BarrierSet::CardTableExtension:
      case BarrierSet::ModRef:
        break;
      default:
        ShouldNotReachHere();

      }
    }
  }

  //
  // Generate code for an array write post barrier
  //
  //  Input:
  //     start    - register containing starting address of destination array
  //     end      - register containing ending address of destination array
  //     scratch  - scratch register
  //
  //  The input registers are overwritten.
  //  The ending address is inclusive.
  void gen_write_ref_array_post_barrier(Register start, Register end, Register scratch) {
    assert_different_registers(start, end, scratch);
    Label L_done;

    // "end" is inclusive end pointer == start + (count - 1) * array_element_size
    // If count == 0, "end" is less than "start" and we need to skip card marking.
    __ cmp(end, start);
    __ br(__ LO, L_done);

    BarrierSet* bs = Universe::heap()->barrier_set();
    switch (bs->kind()) {
    case BarrierSet::G1SATBCT:
    case BarrierSet::G1SATBCTLogging:

      {
        __ push_call_clobbered_registers();
        // must compute element count unless barrier set interface is changed (other platforms supply count)
        assert_different_registers(start, end, scratch);
        __ lea(scratch, Address(end, BytesPerHeapOop));
        __ sub(scratch, scratch, start);               // subtract start to get #bytes
        __ lsr(scratch, scratch, LogBytesPerHeapOop);  // convert to element count
        __ mov(c_rarg0, start);
        __ mov(c_rarg1, scratch);
        __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post), 2);
        __ pop_call_clobbered_registers();
      }
      break;
    case BarrierSet::CardTableModRef:
    case BarrierSet::CardTableExtension:
      {
        CardTableModRefBS* ct = (CardTableModRefBS*)bs;
        assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");

        Label L_loop;

        __ lsr(start, start, CardTableModRefBS::card_shift);
        __ lsr(end, end, CardTableModRefBS::card_shift);
        __ sub(end, end, start); // number of bytes to copy

        const Register count = end; // 'end' register contains bytes count now
        __ load_byte_map_base(scratch);
        __ add(start, start, scratch);
        if (UseConcMarkSweepGC) {
          __ membar(__ StoreStore);
        }
        __ BIND(L_loop);
        __ strb(zr, Address(start, count));
        __ subs(count, count, 1);
        __ br(Assembler::GE, L_loop);
      }
      break;
    default:
      ShouldNotReachHere();

    }
    __ bind(L_done);
  }

  address generate_zero_longs(Register base, Register cnt) {
    Register tmp = rscratch1;
    Register tmp2 = rscratch2;
    int zva_length = VM_Version::zva_length();
    Label initial_table_end, loop_zva;
    Label fini;

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "zero_longs");
    address start = __ pc();

    // Base must be 16 byte aligned. If not just return and let caller handle it
    __ tst(base, 0x0f);
    __ br(Assembler::NE, fini);
    // Align base with ZVA length.
    __ neg(tmp, base);
    __ andr(tmp, tmp, zva_length - 1);

    // tmp: the number of bytes to be filled to align the base with ZVA length.
    __ add(base, base, tmp);
    __ sub(cnt, cnt, tmp, Assembler::ASR, 3);
    __ adr(tmp2, initial_table_end);
    __ sub(tmp2, tmp2, tmp, Assembler::LSR, 2);
    __ br(tmp2);

    for (int i = -zva_length + 16; i < 0; i += 16)
      __ stp(zr, zr, Address(base, i));
    __ bind(initial_table_end);

    __ sub(cnt, cnt, zva_length >> 3);
    __ bind(loop_zva);
    __ dc(Assembler::ZVA, base);
    __ subs(cnt, cnt, zva_length >> 3);
    __ add(base, base, zva_length);
    __ br(Assembler::GE, loop_zva);
    __ add(cnt, cnt, zva_length >> 3); // count not zeroed by DC ZVA
    __ bind(fini);
    __ ret(lr);

    return start;
  }

  typedef enum {
    copy_forwards = 1,
    copy_backwards = -1
  } copy_direction;

  // Bulk copy of blocks of 8 words.
  //
  // count is a count of words.
  //
  // Precondition: count >= 8
  //
  // Postconditions:
  //
  // The least significant bit of count contains the remaining count
  // of words to copy.  The rest of count is trash.
  //
  // s and d are adjusted to point to the remaining words to copy
  //
  void generate_copy_longs(Label &start, Register s, Register d, Register count,
                           copy_direction direction) {
    int unit = wordSize * direction;
    int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;

    int offset;
    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
      t4 = r7, t5 = r10, t6 = r11, t7 = r12;
    const Register stride = r13;

    assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
    assert_different_registers(s, d, count, rscratch1);

    Label again, drain;
    const char *stub_name;
    if (direction == copy_forwards)
      stub_name = "forward_copy_longs";
    else
      stub_name = "backward_copy_longs";

    __ align(CodeEntryAlignment);

    StubCodeMark mark(this, "StubRoutines", stub_name);

    __ bind(start);

    Label unaligned_copy_long;
    if (AvoidUnalignedAccesses) {
      __ tbnz(d, 3, unaligned_copy_long);
    }

    if (direction == copy_forwards) {
      __ sub(s, s, bias);
      __ sub(d, d, bias);
    }

#ifdef ASSERT
    // Make sure we are never given < 8 words
    {
      Label L;
      __ cmp(count, 8);
      __ br(Assembler::GE, L);
      __ stop("generate_copy_longs called with < 8 words");
      __ bind(L);
    }
#endif

    // Fill 8 registers
    if (UseSIMDForMemoryOps) {
      __ ldpq(v0, v1, Address(s, 4 * unit));
      __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      __ ldp(t0, t1, Address(s, 2 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 16);
    __ br(Assembler::LO, drain);
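
    // n.b. when copying backwards the prefetch offset is negative; the
    // immediate forms of prfm cannot encode a large negative offset, so
    // for big prefetch intervals the offset is kept in the stride
    // register instead.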
    int prefetch = PrefetchCopyIntervalInBytes;
    bool use_stride = false;
    if (direction == copy_backwards) {
       use_stride = prefetch > 256;
       prefetch = -prefetch;
       if (use_stride) __ mov(stride, prefetch);
    }

    __ bind(again);

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

    if (UseSIMDForMemoryOps) {
      __ stpq(v0, v1, Address(d, 4 * unit));
      __ ldpq(v0, v1, Address(s, 4 * unit));
      __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
      __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      __ stp(t0, t1, Address(d, 2 * unit));
      __ ldp(t0, t1, Address(s, 2 * unit));
      __ stp(t2, t3, Address(d, 4 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ stp(t4, t5, Address(d, 6 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 8);
    __ br(Assembler::HS, again);

    // Drain
    __ bind(drain);
    if (UseSIMDForMemoryOps) {
      __ stpq(v0, v1, Address(d, 4 * unit));
      __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
    } else {
      __ stp(t0, t1, Address(d, 2 * unit));
      __ stp(t2, t3, Address(d, 4 * unit));
      __ stp(t4, t5, Address(d, 6 * unit));
      __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
    }

    {
      Label L1, L2;
      __ tbz(count, exact_log2(4), L1);
      if (UseSIMDForMemoryOps) {
        __ ldpq(v0, v1, Address(__ pre(s, 4 * unit)));
        __ stpq(v0, v1, Address(__ pre(d, 4 * unit)));
      } else {
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
        __ stp(t0, t1, Address(d, 2 * unit));
        __ stp(t2, t3, Address(__ pre(d, 4 * unit)));
      }
      __ bind(L1);

      if (direction == copy_forwards) {
        __ add(s, s, bias);
        __ add(d, d, bias);
      }

      __ tbz(count, 1, L2);
      __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
      __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
      __ bind(L2);
    }

    __ ret(lr);

    if (AvoidUnalignedAccesses) {
      Label drain, again;
      // Register order for storing. Order is different for backward copy.

      __ bind(unaligned_copy_long);

      // source address is even aligned, target odd aligned
      //
      // when forward copying word pairs we read long pairs at offsets
      // {0, 2, 4, 6} (in long words). when backwards copying we read
      // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
      // address by -2 in the forwards case so we can compute the
      // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
      // or -1.
      //
      // when forward copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
      // zero offset we adjust the destination by -1 which means we
      // have to use offsets { 1, 2, 4, 6, 8} * unit for the stores.
      //
      // When backwards copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
      // offsets {1, 3, 5, 7, 8} * unit.
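
      // (copy_memory has already aligned s on a 16 byte boundary, so
      // only the destination can be odd word aligned here; bit 3 of d
      // was tested at the top of this stub)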

      if (direction == copy_forwards) {
        __ sub(s, s, 16);
        __ sub(d, d, 8);
      }

      // Fill 8 registers
      //
      // for forwards copy s was offset by -16 from the original input
      // value of s so the register contents are at these offsets
      // relative to the 64 bit block addressed by that original input
      // and so on for each successive 64 byte block when s is updated
      //
      // t0 at offset 0,  t1 at offset 8
      // t2 at offset 16, t3 at offset 24
      // t4 at offset 32, t5 at offset 40
      // t6 at offset 48, t7 at offset 56

      // for backwards copy s was not offset so the register contents
      // are at these offsets into the preceding 64 byte block
      // relative to that original input and so on for each successive
      // preceding 64 byte block when s is updated. this explains the
      // slightly counter-intuitive looking pattern of register usage
      // in the stp instructions for backwards copy.
      //
      // t0 at offset -16, t1 at offset -8
      // t2 at offset -32, t3 at offset -24
      // t4 at offset -48, t5 at offset -40
      // t6 at offset -64, t7 at offset -56

      __ ldp(t0, t1, Address(s, 2 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));

      __ subs(count, count, 16);
      __ br(Assembler::LO, drain);

      int prefetch = PrefetchCopyIntervalInBytes;
      bool use_stride = false;
      if (direction == copy_backwards) {
         use_stride = prefetch > 256;
         prefetch = -prefetch;
         if (use_stride) __ mov(stride, prefetch);
      }

      __ bind(again);

      if (PrefetchCopyIntervalInBytes > 0)
        __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

      if (direction == copy_forwards) {
        // allowing for the offset of -8 the store instructions place
        // registers into the target 64 bit block at the following
        // offsets
        //
        // t0 at offset 0
        // t1 at offset 8,  t2 at offset 16
        // t3 at offset 24, t4 at offset 32
        // t5 at offset 40, t6 at offset 48
        // t7 at offset 56

        __ str(t0, Address(d, 1 * unit));
        __ stp(t1, t2, Address(d, 2 * unit));
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ stp(t3, t4, Address(d, 4 * unit));
        __ ldp(t2, t3, Address(s, 4 * unit));
        __ stp(t5, t6, Address(d, 6 * unit));
        __ ldp(t4, t5, Address(s, 6 * unit));
        __ str(t7, Address(__ pre(d, 8 * unit)));
        __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
      } else {
        // d was not offset when we started so the registers are
        // written into the 64 bit block preceding d with the following
        // offsets
        //
        // t1 at offset -8
        // t3 at offset -24, t0 at offset -16
        // t5 at offset -40, t2 at offset -32
        // t7 at offset -56, t4 at offset -48
        // t6 at offset -64
        //
        // note that this matches the offsets previously noted for the
        // loads

        __ str(t1, Address(d, 1 * unit));
        __ stp(t3, t0, Address(d, 3 * unit));
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ stp(t5, t2, Address(d, 5 * unit));
        __ ldp(t2, t3, Address(s, 4 * unit));
        __ stp(t7, t4, Address(d, 7 * unit));
        __ ldp(t4, t5, Address(s, 6 * unit));
        __ str(t6, Address(__ pre(d, 8 * unit)));
        __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
      }

      __ subs(count, count, 8);
      __ br(Assembler::HS, again);

      // Drain
      //
      // this uses the same pattern of offsets and register arguments
      // as above
      __ bind(drain);
      if (direction == copy_forwards) {
        __ str(t0, Address(d, 1 * unit));
        __ stp(t1, t2, Address(d, 2 * unit));
        __ stp(t3, t4, Address(d, 4 * unit));
        __ stp(t5, t6, Address(d, 6 * unit));
        __ str(t7, Address(__ pre(d, 8 * unit)));
      } else {
        __ str(t1, Address(d, 1 * unit));
        __ stp(t3, t0, Address(d, 3 * unit));
        __ stp(t5, t2, Address(d, 5 * unit));
        __ stp(t7, t4, Address(d, 7 * unit));
        __ str(t6, Address(__ pre(d, 8 * unit)));
      }
      // now we need to copy any remaining part block which may
      // include a 4 word block subblock and/or a 2 word subblock.
      // bits 2 and 1 in the count are the tell-tale for whether we
      // have each such subblock
      {
        Label L1, L2;
        __ tbz(count, exact_log2(4), L1);
        // this is the same as above but copying only 4 longs hence
        // with only one intervening stp between the str instructions
        // but note that the offsets and registers still follow the
        // same pattern
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
        if (direction == copy_forwards) {
          __ str(t0, Address(d, 1 * unit));
          __ stp(t1, t2, Address(d, 2 * unit));
          __ str(t3, Address(__ pre(d, 4 * unit)));
        } else {
          __ str(t1, Address(d, 1 * unit));
          __ stp(t3, t0, Address(d, 3 * unit));
          __ str(t2, Address(__ pre(d, 4 * unit)));
        }
        __ bind(L1);

        __ tbz(count, 1, L2);
        // this is the same as above but copying only 2 longs hence
        // there is no intervening stp between the str instructions
        // but note that the offset and register patterns are still
        // the same
        __ ldp(t0, t1, Address(__ pre(s, 2 * unit)));
        if (direction == copy_forwards) {
          __ str(t0, Address(d, 1 * unit));
          __ str(t1, Address(__ pre(d, 2 * unit)));
        } else {
          __ str(t1, Address(d, 1 * unit));
          __ str(t0, Address(__ pre(d, 2 * unit)));
        }
        __ bind(L2);

        // for forwards copy we need to re-adjust the offsets we
        // applied so that s and d follow the last words written

        if (direction == copy_forwards) {
          __ add(s, s, 16);
          __ add(d, d, 8);
        }

      }

      __ ret(lr);
    }
  }

  // Small copy: less than 16 bytes.
  //
  // NB: Ignores all of the bits of count which represent more than 15
  // bytes, so a caller doesn't have to mask them.

  void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
    bool is_backwards = step < 0;
    size_t granularity = uabs(step);
    int direction = is_backwards ? -1 : 1;
    int unit = wordSize * direction;

    Label Lpair, Lword, Lint, Lshort, Lbyte;

    assert(granularity
           && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");

    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;

    // ??? I don't know if this bit-test-and-branch is the right thing
    // to do.  It does a lot of jumping, resulting in several
    // mispredicted branches.  It might make more sense to do this
    // with something like Duff's device with a single computed branch.
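
    // count is in units of `granularity` bytes, so testing bit
    // (3 - log2(granularity)) of count asks whether an 8 byte chunk
    // remains, the next lower bit whether a 4 byte chunk remains, and
    // so on down to single bytes.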

    __ tbz(count, 3 - exact_log2(granularity), Lword);
    __ ldr(tmp, Address(__ adjust(s, unit, is_backwards)));
    __ str(tmp, Address(__ adjust(d, unit, is_backwards)));
    __ bind(Lword);

    if (granularity <= sizeof (jint)) {
      __ tbz(count, 2 - exact_log2(granularity), Lint);
      __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
      __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
      __ bind(Lint);
    }

    if (granularity <= sizeof (jshort)) {
      __ tbz(count, 1 - exact_log2(granularity), Lshort);
      __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
      __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
      __ bind(Lshort);
    }

    if (granularity <= sizeof (jbyte)) {
      __ tbz(count, 0, Lbyte);
      __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
      __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
      __ bind(Lbyte);
    }
  }

  Label copy_f, copy_b;

  // All-singing all-dancing memory copy.
  //
  // Copy count units of memory from s to d.  The size of a unit is
  // step, which can be positive or negative depending on the direction
  // of copy.  If is_aligned is false, we align the source address.
  //

  void copy_memory(bool is_aligned, Register s, Register d,
                   Register count, Register tmp, int step) {
    copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
    bool is_backwards = step < 0;
    int granularity = uabs(step);
    const Register t0 = r3, t1 = r4;

    // <= 96 bytes do inline. Direction doesn't matter because we always
    // load all the data before writing anything
    Label copy4, copy8, copy16, copy32, copy80, copy128, copy_big, finish;
    const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8;
    const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12;
    const Register send = r17, dend = r18;

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(Address(s, 0), PLDL1KEEP);
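
    // the inline cases below load a block from the front of the range
    // and a block back from send/dend (which point one past the end of
    // the source and destination) before storing anything, so each case
    // covers a whole range of sizes without needing a loop.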
    __ cmp(count, (UseSIMDForMemoryOps ? 96:80)/granularity);
    __ br(Assembler::HI, copy_big);

    __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
    __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));

    __ cmp(count, 16/granularity);
    __ br(Assembler::LS, copy16);

    __ cmp(count, 64/granularity);
    __ br(Assembler::HI, copy80);

    __ cmp(count, 32/granularity);
    __ br(Assembler::LS, copy32);

    // 33..64 bytes
    if (UseSIMDForMemoryOps) {
      __ ldpq(v0, v1, Address(s, 0));
      __ ldpq(v2, v3, Address(send, -32));
      __ stpq(v0, v1, Address(d, 0));
      __ stpq(v2, v3, Address(dend, -32));
    } else {
      __ ldp(t0, t1, Address(s, 0));
      __ ldp(t2, t3, Address(s, 16));
      __ ldp(t4, t5, Address(send, -32));
      __ ldp(t6, t7, Address(send, -16));

      __ stp(t0, t1, Address(d, 0));
      __ stp(t2, t3, Address(d, 16));
      __ stp(t4, t5, Address(dend, -32));
      __ stp(t6, t7, Address(dend, -16));
    }
    __ b(finish);

    // 17..32 bytes
    __ bind(copy32);
    __ ldp(t0, t1, Address(s, 0));
    __ ldp(t2, t3, Address(send, -16));
    __ stp(t0, t1, Address(d, 0));
    __ stp(t2, t3, Address(dend, -16));
    __ b(finish);

    // 65..80/96 bytes
    // (96 bytes if SIMD because we do 32 bytes per instruction)
    __ bind(copy80);
    if (UseSIMDForMemoryOps) {
      __ ld4(v0, v1, v2, v3, __ T16B, Address(s, 0));
      __ ldpq(v4, v5, Address(send, -32));
      __ st4(v0, v1, v2, v3, __ T16B, Address(d, 0));
      __ stpq(v4, v5, Address(dend, -32));
    } else {
      __ ldp(t0, t1, Address(s, 0));
      __ ldp(t2, t3, Address(s, 16));
      __ ldp(t4, t5, Address(s, 32));
      __ ldp(t6, t7, Address(s, 48));
      __ ldp(t8, t9, Address(send, -16));

      __ stp(t0, t1, Address(d, 0));
      __ stp(t2, t3, Address(d, 16));
      __ stp(t4, t5, Address(d, 32));
      __ stp(t6, t7, Address(d, 48));
      __ stp(t8, t9, Address(dend, -16));
    }
    __ b(finish);

    // 0..16 bytes
    __ bind(copy16);
    __ cmp(count, 8/granularity);
    __ br(Assembler::LO, copy8);

    // 8..16 bytes
    __ ldr(t0, Address(s, 0));
    __ ldr(t1, Address(send, -8));
    __ str(t0, Address(d, 0));
    __ str(t1, Address(dend, -8));
    __ b(finish);

    if (granularity < 8) {
      // 4..7 bytes
      __ bind(copy8);
      __ tbz(count, 2 - exact_log2(granularity), copy4);
      __ ldrw(t0, Address(s, 0));
      __ ldrw(t1, Address(send, -4));
      __ strw(t0, Address(d, 0));
      __ strw(t1, Address(dend, -4));
      __ b(finish);
      if (granularity < 4) {
        // 0..3 bytes
        __ bind(copy4);
        __ cbz(count, finish); // get rid of 0 case
        if (granularity == 2) {
          __ ldrh(t0, Address(s, 0));
          __ strh(t0, Address(d, 0));
        } else { // granularity == 1
          // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
          // the first and last byte.
          // Handle the 3 byte case by loading and storing base + count/2
          // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
          // This does mean in the 1 byte case we load/store the same
          // byte 3 times.
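          // (e.g. for count == 3: after the shift count == 1 and we copy
          // s[0] -> d[0], the last byte s[2] -> d[2] via send/dend, and
          // s[1] -> d[1]; for count == 1 all three pairs hit the same byte)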
          __ lsr(count, count, 1);
          __ ldrb(t0, Address(s, 0));
          __ ldrb(t1, Address(send, -1));
          __ ldrb(t2, Address(s, count));
          __ strb(t0, Address(d, 0));
          __ strb(t1, Address(dend, -1));
          __ strb(t2, Address(d, count));
        }
        __ b(finish);
      }
    }

    __ bind(copy_big);
    if (is_backwards) {
      __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
      __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
    }

    // Now we've got the small case out of the way we can align the
    // source address on a 2-word boundary.

    Label aligned;

    if (is_aligned) {
      // We may have to adjust by 1 word to get s 2-word-aligned.
      __ tbz(s, exact_log2(wordSize), aligned);
      __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards)));
      __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards)));
      __ sub(count, count, wordSize/granularity);
    } else {
      if (is_backwards) {
        __ andr(rscratch2, s, 2 * wordSize - 1);
      } else {
        __ neg(rscratch2, s);
        __ andr(rscratch2, rscratch2, 2 * wordSize - 1);
      }
      // rscratch2 is the byte adjustment needed to align s.
      __ cbz(rscratch2, aligned);
      int shift = exact_log2(granularity);
      if (shift)  __ lsr(rscratch2, rscratch2, shift);
      __ sub(count, count, rscratch2);

#if 0
      // ?? This code is only correct for a disjoint copy.  It may or
      // may not make sense to use it in that case.

      // Copy the first pair; s and d may not be aligned.
      __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
      __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));

      // Align s and d, adjust count
      if (is_backwards) {
        __ sub(s, s, rscratch2);
        __ sub(d, d, rscratch2);
      } else {
        __ add(s, s, rscratch2);
        __ add(d, d, rscratch2);
      }
#else
      copy_memory_small(s, d, rscratch2, rscratch1, step);
#endif
    }

    __ bind(aligned);

    // s is now 2-word-aligned.

    // We have a count of units and some trailing bytes.  Adjust the
    // count and do a bulk copy of words.
    __ lsr(rscratch2, count, exact_log2(wordSize/granularity));
    if (direction == copy_forwards)
      __ bl(copy_f);
    else
      __ bl(copy_b);

    // And the tail.
    copy_memory_small(s, d, count, tmp, step);

    if (granularity >= 8) __ bind(copy8);
    if (granularity >= 4) __ bind(copy4);
    __ bind(finish);
  }


  void clobber_registers() {
#ifdef ASSERT
    __ mov(rscratch1, (uint64_t)0xdeadbeef);
    __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
    for (Register r = r3; r <= r18; r++)
      if (r != rscratch1) __ mov(r, rscratch1);
#endif
  }

  // Scan over array at a for count oops, verifying each one.
  // Preserves a and count, clobbers rscratch1 and rscratch2.
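  // (the copy stubs below call this when VerifyOops is enabled)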
  void verify_oop_array (size_t size, Register a, Register count, Register temp) {
    Label loop, end;
    __ mov(rscratch1, a);
    __ mov(rscratch2, zr);
    __ bind(loop);
    __ cmp(rscratch2, count);
    __ br(Assembler::HS, end);
    if (size == (size_t)wordSize) {
      __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
      __ verify_oop(temp);
    } else {
      __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
      __ decode_heap_oop(temp); // calls verify_oop
    }
    __ add(rscratch2, rscratch2, 1);
    __ b(loop);
    __ bind(end);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  // Side Effects:
  //   disjoint_int_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_int_oop_copy().
  //
  address generate_disjoint_copy(size_t size, bool aligned, bool is_oop, address *entry,
                                 const char *name, bool dest_uninitialized = false) {
    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    __ enter();

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    if (is_oop) {
      __ push(RegSet::of(d, count), sp);
      // no registers are destroyed by this call
      gen_write_ref_array_pre_barrier(d, count, dest_uninitialized);
    }
    copy_memory(aligned, s, d, count, rscratch1, size);
    if (is_oop) {
      __ pop(RegSet::of(d, count), sp);
      if (VerifyOops)
        verify_oop_array(size, d, count, r16);
      __ sub(count, count, 1); // make an inclusive end pointer
      __ lea(count, Address(d, count, Address::lsl(exact_log2(size))));
      gen_write_ref_array_post_barrier(d, count, rscratch1);
    }
    __ leave();
    __ mov(r0, zr); // return 0
    __ ret(lr);
    return start;
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  address generate_conjoint_copy(size_t size, bool aligned, bool is_oop, address nooverlap_target,
                                 address *entry, const char *name,
                                 bool dest_uninitialized = false) {
    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;

    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    __ enter();

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    // use fwd copy when (d-s) above_equal (count*size)
    __ sub(rscratch1, d, s);
    __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
    __ br(Assembler::HS, nooverlap_target);

    if (is_oop) {
      __ push(RegSet::of(d, count), sp);
      // no registers are destroyed by this call
      gen_write_ref_array_pre_barrier(d, count, dest_uninitialized);
    }
    copy_memory(aligned, s, d, count, rscratch1, -size);
    if (is_oop) {
      __ pop(RegSet::of(d, count), sp);
      if (VerifyOops)
        verify_oop_array(size, d, count, r16);
      __ sub(count, count, 1); // make an inclusive end pointer
      __ lea(count, Address(d, count, Address::lsl(exact_log2(size))));
      gen_write_ref_array_post_barrier(d, count, rscratch1);
    }
    __ leave();
    __ mov(r0, zr); // return 0
    __ ret(lr);
    return start;
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
  // we let the hardware handle it.  The one to eight bytes within words,
  // dwords or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  // Side Effects:
  //   disjoint_byte_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_byte_copy().
  //
  address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
  // we let the hardware handle it.  The one to eight bytes within words,
  // dwords or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
                                      address* entry, const char *name) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
  // let the hardware handle it.  The two or four words within dwords
  // or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  // Side Effects:
  //   disjoint_short_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_short_copy().
  //
  address generate_disjoint_short_copy(bool aligned,
                                       address* entry, const char *name) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
  // let the hardware handle it.  The two or four words within dwords
  // or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
                                       address *entry, const char *name) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);

  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  // Side Effects:
  //   disjoint_int_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_int_oop_copy().
  //
  address generate_disjoint_int_copy(bool aligned, address *entry,
                                     const char *name, bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.
  // The two dwords within qwords that span cache line boundaries will
  // still be loaded and stored atomically.
  //
  address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
                                     address *entry, const char *name,
                                     bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
  }


  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  // Side Effects:
  //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
  //   no-overlap entry point used by generate_conjoint_long_oop_copy().
  //
  address generate_disjoint_long_copy(bool aligned, address *entry,
                                      const char *name, bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  address generate_conjoint_long_copy(bool aligned,
                                      address nooverlap_target, address *entry,
                                      const char *name, bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  // Side Effects:
  //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
  //   no-overlap entry point used by generate_conjoint_long_oop_copy().
  //
  address generate_disjoint_oop_copy(bool aligned, address *entry,
                                     const char *name, bool dest_uninitialized) {
    const bool is_oop = true;
    const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
    return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  address generate_conjoint_oop_copy(bool aligned,
                                     address nooverlap_target, address *entry,
                                     const char *name, bool dest_uninitialized) {
    const bool is_oop = true;
    const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
    return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
                                  name, dest_uninitialized);
  }


  // Helper for generating a dynamic type check.
  // Smashes rscratch1.
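  // (check_klass_subtype_fast_path handles the cheap cases -- an exact
  // match or a hit via super_check_offset -- and check_klass_subtype_slow_path
  // scans the secondary supers; both branch to L_success on a hit.)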
1711 void generate_type_check(Register sub_klass, 1712 Register super_check_offset, 1713 Register super_klass, 1714 Label& L_success) { 1715 assert_different_registers(sub_klass, super_check_offset, super_klass); 1716 1717 BLOCK_COMMENT("type_check:"); 1718 1719 Label L_miss; 1720 1721 __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, NULL, 1722 super_check_offset); 1723 __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL); 1724 1725 // Fall through on failure! 1726 __ BIND(L_miss); 1727 } 1728 1729 // 1730 // Generate checkcasting array copy stub 1731 // 1732 // Input: 1733 // c_rarg0 - source array address 1734 // c_rarg1 - destination array address 1735 // c_rarg2 - element count, treated as ssize_t, can be zero 1736 // c_rarg3 - size_t ckoff (super_check_offset) 1737 // c_rarg4 - oop ckval (super_klass) 1738 // 1739 // Output: 1740 // r0 == 0 - success 1741 // r0 == -1^K - failure, where K is partial transfer count 1742 // 1743 address generate_checkcast_copy(const char *name, address *entry, 1744 bool dest_uninitialized = false) { 1745 1746 Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop; 1747 1748 // Input registers (after setup_arg_regs) 1749 const Register from = c_rarg0; // source array address 1750 const Register to = c_rarg1; // destination array address 1751 const Register count = c_rarg2; // elementscount 1752 const Register ckoff = c_rarg3; // super_check_offset 1753 const Register ckval = c_rarg4; // super_klass 1754 1755 // Registers used as temps (r18, r19, r20 are save-on-entry) 1756 const Register count_save = r21; // orig elementscount 1757 const Register start_to = r20; // destination array start address 1758 const Register copied_oop = r18; // actual oop copied 1759 const Register r19_klass = r19; // oop._klass 1760 1761 //--------------------------------------------------------------- 1762 // Assembler stub will be used for this call to arraycopy 1763 // if the two arrays are subtypes of Object[] but the 1764 // destination array type is not equal to or a supertype 1765 // of the source type. Each element must be separately 1766 // checked. 1767 1768 assert_different_registers(from, to, count, ckoff, ckval, start_to, 1769 copied_oop, r19_klass, count_save); 1770 1771 __ align(CodeEntryAlignment); 1772 StubCodeMark mark(this, "StubRoutines", name); 1773 address start = __ pc(); 1774 1775 __ enter(); // required for proper stackwalking of RuntimeStub frame 1776 1777 #ifdef ASSERT 1778 // caller guarantees that the arrays really are different 1779 // otherwise, we would have to make conjoint checks 1780 { Label L; 1781 array_overlap_test(L, TIMES_OOP); 1782 __ stop("checkcast_copy within a single array"); 1783 __ bind(L); 1784 } 1785 #endif //ASSERT 1786 1787 // Caller of this entry point must set up the argument registers. 1788 if (entry != NULL) { 1789 *entry = __ pc(); 1790 BLOCK_COMMENT("Entry:"); 1791 } 1792 1793 // Empty array: Nothing to do. 1794 __ cbz(count, L_done); 1795 1796 __ push(RegSet::of(r18, r19, r20, r21), sp); 1797 1798 #ifdef ASSERT 1799 BLOCK_COMMENT("assert consistent ckoff/ckval"); 1800 // The ckoff and ckval must be mutually consistent, 1801 // even though caller generates both. 
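      // In C terms the check is roughly
      //   assert(ckoff == (int)ckval->super_check_offset(), "inconsistent ckoff/ckval");
      // i.e. the raw offset passed in c_rarg3 must match the one stored in the
      // super klass passed in c_rarg4.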
1802 { Label L; 1803 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 1804 __ ldrw(start_to, Address(ckval, sco_offset)); 1805 __ cmpw(ckoff, start_to); 1806 __ br(Assembler::EQ, L); 1807 __ stop("super_check_offset inconsistent"); 1808 __ bind(L); 1809 } 1810 #endif //ASSERT 1811 1812 gen_write_ref_array_pre_barrier(to, count, dest_uninitialized); 1813 1814 // save the original count 1815 __ mov(count_save, count); 1816 1817 // Copy from low to high addresses 1818 __ mov(start_to, to); // Save destination array start address 1819 __ b(L_load_element); 1820 1821 // ======== begin loop ======== 1822 // (Loop is rotated; its entry is L_load_element.) 1823 // Loop control: 1824 // for (; count != 0; count--) { 1825 // copied_oop = load_heap_oop(from++); 1826 // ... generate_type_check ...; 1827 // store_heap_oop(to++, copied_oop); 1828 // } 1829 __ align(OptoLoopAlignment); 1830 1831 __ BIND(L_store_element); 1832 __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop); // store the oop 1833 __ sub(count, count, 1); 1834 __ cbz(count, L_do_card_marks); 1835 1836 // ======== loop entry is here ======== 1837 __ BIND(L_load_element); 1838 __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8)); // load the oop 1839 __ cbz(copied_oop, L_store_element); 1840 1841 __ load_klass(r19_klass, copied_oop);// query the object klass 1842 generate_type_check(r19_klass, ckoff, ckval, L_store_element); 1843 // ======== end loop ======== 1844 1845 // It was a real error; we must depend on the caller to finish the job. 1846 // Register count = remaining oops, count_orig = total oops. 1847 // Emit GC store barriers for the oops we have copied and report 1848 // their number to the caller. 1849 1850 __ subs(count, count_save, count); // K = partially copied oop count 1851 __ eon(count, count, zr); // report (-1^K) to caller 1852 __ br(Assembler::EQ, L_done_pop); 1853 1854 __ BIND(L_do_card_marks); 1855 __ add(to, to, -heapOopSize); // make an inclusive end pointer 1856 gen_write_ref_array_post_barrier(start_to, to, rscratch1); 1857 1858 __ bind(L_done_pop); 1859 __ pop(RegSet::of(r18, r19, r20, r21), sp); 1860 inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr); 1861 1862 __ bind(L_done); 1863 __ mov(r0, count); 1864 __ leave(); 1865 __ ret(lr); 1866 1867 return start; 1868 } 1869 1870 // Perform range checks on the proposed arraycopy. 1871 // Kills temp, but nothing else. 1872 // Also, clean the sign bits of src_pos and dst_pos. 1873 void arraycopy_range_checks(Register src, // source array oop (c_rarg0) 1874 Register src_pos, // source position (c_rarg1) 1875 Register dst, // destination array oo (c_rarg2) 1876 Register dst_pos, // destination position (c_rarg3) 1877 Register length, 1878 Register temp, 1879 Label& L_failed) { 1880 BLOCK_COMMENT("arraycopy_range_checks:"); 1881 1882 assert_different_registers(rscratch1, temp); 1883 1884 // if (src_pos + length > arrayOop(src)->length()) FAIL; 1885 __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes())); 1886 __ addw(temp, length, src_pos); 1887 __ cmpw(temp, rscratch1); 1888 __ br(Assembler::HI, L_failed); 1889 1890 // if (dst_pos + length > arrayOop(dst)->length()) FAIL; 1891 __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes())); 1892 __ addw(temp, length, dst_pos); 1893 __ cmpw(temp, rscratch1); 1894 __ br(Assembler::HI, L_failed); 1895 1896 // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'. 
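    // (A 32-bit register write such as "mov wN, wN" zeroes bits 63:32 of the
    // X register, so the two moves below are effectively
    //   src_pos = (uint32_t)src_pos;  dst_pos = (uint32_t)dst_pos; )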
1897 __ movw(src_pos, src_pos); 1898 __ movw(dst_pos, dst_pos); 1899 1900 BLOCK_COMMENT("arraycopy_range_checks done"); 1901 } 1902 1903 // These stubs get called from some dumb test routine. 1904 // I'll write them properly when they're called from 1905 // something that's actually doing something. 1906 static void fake_arraycopy_stub(address src, address dst, int count) { 1907 assert(count == 0, "huh?"); 1908 } 1909 1910 1911 // 1912 // Generate stub for array fill. If "aligned" is true, the 1913 // "to" address is assumed to be heapword aligned. 1914 // 1915 // Arguments for generated stub: 1916 // to: c_rarg0 1917 // value: c_rarg1 1918 // count: c_rarg2 treated as signed 1919 // 1920 address generate_fill(BasicType t, bool aligned, const char *name) { 1921 __ align(CodeEntryAlignment); 1922 StubCodeMark mark(this, "StubRoutines", name); 1923 address start = __ pc(); 1924 1925 BLOCK_COMMENT("Entry:"); 1926 1927 const Register to = c_rarg0; // source array address 1928 const Register value = c_rarg1; // value 1929 const Register count = c_rarg2; // elements count 1930 1931 const Register bz_base = r10; // base for block_zero routine 1932 const Register cnt_words = r11; // temp register 1933 1934 __ enter(); 1935 1936 Label L_fill_elements, L_exit1; 1937 1938 int shift = -1; 1939 switch (t) { 1940 case T_BYTE: 1941 shift = 0; 1942 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 1943 __ bfi(value, value, 8, 8); // 8 bit -> 16 bit 1944 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit 1945 __ br(Assembler::LO, L_fill_elements); 1946 break; 1947 case T_SHORT: 1948 shift = 1; 1949 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 1950 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit 1951 __ br(Assembler::LO, L_fill_elements); 1952 break; 1953 case T_INT: 1954 shift = 2; 1955 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 1956 __ br(Assembler::LO, L_fill_elements); 1957 break; 1958 default: ShouldNotReachHere(); 1959 } 1960 1961 // Align source address at 8 bytes address boundary. 1962 Label L_skip_align1, L_skip_align2, L_skip_align4; 1963 if (!aligned) { 1964 switch (t) { 1965 case T_BYTE: 1966 // One byte misalignment happens only for byte arrays. 1967 __ tbz(to, 0, L_skip_align1); 1968 __ strb(value, Address(__ post(to, 1))); 1969 __ subw(count, count, 1); 1970 __ bind(L_skip_align1); 1971 // Fallthrough 1972 case T_SHORT: 1973 // Two bytes misalignment happens only for byte and short (char) arrays. 1974 __ tbz(to, 1, L_skip_align2); 1975 __ strh(value, Address(__ post(to, 2))); 1976 __ subw(count, count, 2 >> shift); 1977 __ bind(L_skip_align2); 1978 // Fallthrough 1979 case T_INT: 1980 // Align to 8 bytes, we know we are 4 byte aligned to start. 
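        // The whole peeling sequence (this case plus the byte/short cases that
        // fall through into it) behaves roughly like the following C, where
        // 'p'/'v' stand for the 'to'/'value' registers and 'shift' is
        // log2(element size):
        //   if (t == T_BYTE && ((uintptr_t)p & 1)) { *(uint8_t *)p = v; p += 1; count -= 1 >> shift; }
        //   if (t != T_INT  && ((uintptr_t)p & 2)) { *(uint16_t*)p = v; p += 2; count -= 2 >> shift; }
        //   if (               ((uintptr_t)p & 4)) { *(uint32_t*)p = v; p += 4; count -= 4 >> shift; }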
1981 __ tbz(to, 2, L_skip_align4); 1982 __ strw(value, Address(__ post(to, 4))); 1983 __ subw(count, count, 4 >> shift); 1984 __ bind(L_skip_align4); 1985 break; 1986 default: ShouldNotReachHere(); 1987 } 1988 } 1989 1990 // 1991 // Fill large chunks 1992 // 1993 __ lsrw(cnt_words, count, 3 - shift); // number of words 1994 __ bfi(value, value, 32, 32); // 32 bit -> 64 bit 1995 __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift); 1996 if (UseBlockZeroing) { 1997 Label non_block_zeroing, rest; 1998 // count >= BlockZeroingLowLimit && value == 0 1999 __ subs(rscratch1, cnt_words, BlockZeroingLowLimit >> 3); 2000 __ ccmp(value, 0 /* comparing value */, 0 /* NZCV */, Assembler::GE); 2001 __ br(Assembler::NE, non_block_zeroing); 2002 __ mov(bz_base, to); 2003 __ block_zero(bz_base, cnt_words, true); 2004 __ mov(to, bz_base); 2005 __ b(rest); 2006 __ bind(non_block_zeroing); 2007 __ fill_words(to, cnt_words, value); 2008 __ bind(rest); 2009 } 2010 else { 2011 __ fill_words(to, cnt_words, value); 2012 } 2013 2014 // Remaining count is less than 8 bytes. Fill it by a single store. 2015 // Note that the total length is no less than 8 bytes. 2016 if (t == T_BYTE || t == T_SHORT) { 2017 Label L_exit1; 2018 __ cbzw(count, L_exit1); 2019 __ add(to, to, count, Assembler::LSL, shift); // points to the end 2020 __ str(value, Address(to, -8)); // overwrite some elements 2021 __ bind(L_exit1); 2022 __ leave(); 2023 __ ret(lr); 2024 } 2025 2026 // Handle copies less than 8 bytes. 2027 Label L_fill_2, L_fill_4, L_exit2; 2028 __ bind(L_fill_elements); 2029 switch (t) { 2030 case T_BYTE: 2031 __ tbz(count, 0, L_fill_2); 2032 __ strb(value, Address(__ post(to, 1))); 2033 __ bind(L_fill_2); 2034 __ tbz(count, 1, L_fill_4); 2035 __ strh(value, Address(__ post(to, 2))); 2036 __ bind(L_fill_4); 2037 __ tbz(count, 2, L_exit2); 2038 __ strw(value, Address(to)); 2039 break; 2040 case T_SHORT: 2041 __ tbz(count, 0, L_fill_4); 2042 __ strh(value, Address(__ post(to, 2))); 2043 __ bind(L_fill_4); 2044 __ tbz(count, 1, L_exit2); 2045 __ strw(value, Address(to)); 2046 break; 2047 case T_INT: 2048 __ cbzw(count, L_exit2); 2049 __ strw(value, Address(to)); 2050 break; 2051 default: ShouldNotReachHere(); 2052 } 2053 __ bind(L_exit2); 2054 __ leave(); 2055 __ ret(lr); 2056 return start; 2057 } 2058 2059 // 2060 // Generate 'unsafe' array copy stub 2061 // Though just as safe as the other stubs, it takes an unscaled 2062 // size_t argument instead of an element count. 2063 // 2064 // Input: 2065 // c_rarg0 - source array address 2066 // c_rarg1 - destination array address 2067 // c_rarg2 - byte count, treated as ssize_t, can be zero 2068 // 2069 // Examines the alignment of the operands and dispatches 2070 // to a long, int, short, or byte copy loop. 
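  // The dispatch is roughly:
  //   size_t bits = (size_t)src | (size_t)dst | byte_count;
  //   if      ((bits & (BytesPerLong - 1)) == 0) long_copy (src, dst, byte_count >> 3);
  //   else if ((bits & (BytesPerInt  - 1)) == 0) int_copy  (src, dst, byte_count >> 2);
  //   else if ((bits & 1) == 0)                  short_copy(src, dst, byte_count >> 1);
  //   else                                       byte_copy (src, dst, byte_count);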
2071 // 2072 address generate_unsafe_copy(const char *name, 2073 address byte_copy_entry, 2074 address short_copy_entry, 2075 address int_copy_entry, 2076 address long_copy_entry) { 2077 Label L_long_aligned, L_int_aligned, L_short_aligned; 2078 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 2079 2080 __ align(CodeEntryAlignment); 2081 StubCodeMark mark(this, "StubRoutines", name); 2082 address start = __ pc(); 2083 __ enter(); // required for proper stackwalking of RuntimeStub frame 2084 2085 // bump this on entry, not on exit: 2086 inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr); 2087 2088 __ orr(rscratch1, s, d); 2089 __ orr(rscratch1, rscratch1, count); 2090 2091 __ andr(rscratch1, rscratch1, BytesPerLong-1); 2092 __ cbz(rscratch1, L_long_aligned); 2093 __ andr(rscratch1, rscratch1, BytesPerInt-1); 2094 __ cbz(rscratch1, L_int_aligned); 2095 __ tbz(rscratch1, 0, L_short_aligned); 2096 __ b(RuntimeAddress(byte_copy_entry)); 2097 2098 __ BIND(L_short_aligned); 2099 __ lsr(count, count, LogBytesPerShort); // size => short_count 2100 __ b(RuntimeAddress(short_copy_entry)); 2101 __ BIND(L_int_aligned); 2102 __ lsr(count, count, LogBytesPerInt); // size => int_count 2103 __ b(RuntimeAddress(int_copy_entry)); 2104 __ BIND(L_long_aligned); 2105 __ lsr(count, count, LogBytesPerLong); // size => long_count 2106 __ b(RuntimeAddress(long_copy_entry)); 2107 2108 return start; 2109 } 2110 2111 // 2112 // Generate generic array copy stubs 2113 // 2114 // Input: 2115 // c_rarg0 - src oop 2116 // c_rarg1 - src_pos (32-bits) 2117 // c_rarg2 - dst oop 2118 // c_rarg3 - dst_pos (32-bits) 2119 // c_rarg4 - element count (32-bits) 2120 // 2121 // Output: 2122 // r0 == 0 - success 2123 // r0 == -1^K - failure, where K is partial transfer count 2124 // 2125 address generate_generic_copy(const char *name, 2126 address byte_copy_entry, address short_copy_entry, 2127 address int_copy_entry, address oop_copy_entry, 2128 address long_copy_entry, address checkcast_copy_entry) { 2129 2130 Label L_failed, L_failed_0, L_objArray; 2131 Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs; 2132 2133 // Input registers 2134 const Register src = c_rarg0; // source array oop 2135 const Register src_pos = c_rarg1; // source position 2136 const Register dst = c_rarg2; // destination array oop 2137 const Register dst_pos = c_rarg3; // destination position 2138 const Register length = c_rarg4; 2139 2140 __ align(CodeEntryAlignment); 2141 2142 StubCodeMark mark(this, "StubRoutines", name); 2143 2144 address start = __ pc(); 2145 2146 __ enter(); // required for proper stackwalking of RuntimeStub frame 2147 2148 // bump this on entry, not on exit: 2149 inc_counter_np(SharedRuntime::_generic_array_copy_ctr); 2150 2151 //----------------------------------------------------------------------- 2152 // Assembler stub will be used for this call to arraycopy 2153 // if the following conditions are met: 2154 // 2155 // (1) src and dst must not be null. 2156 // (2) src_pos must not be negative. 2157 // (3) dst_pos must not be negative. 2158 // (4) length must not be negative. 2159 // (5) src klass and dst klass should be the same and not NULL. 2160 // (6) src and dst should be arrays. 2161 // (7) src_pos + length must not exceed length of src. 2162 // (8) dst_pos + length must not exceed length of dst. 2163 // 2164 2165 // if (src == NULL) return -1; 2166 __ cbz(src, L_failed); 2167 2168 // if (src_pos < 0) return -1; 2169 __ tbnz(src_pos, 31, L_failed); // i.e. 
sign bit set 2170 2171 // if (dst == NULL) return -1; 2172 __ cbz(dst, L_failed); 2173 2174 // if (dst_pos < 0) return -1; 2175 __ tbnz(dst_pos, 31, L_failed); // i.e. sign bit set 2176 2177 // registers used as temp 2178 const Register scratch_length = r16; // elements count to copy 2179 const Register scratch_src_klass = r17; // array klass 2180 const Register lh = r18; // layout helper 2181 2182 // if (length < 0) return -1; 2183 __ movw(scratch_length, length); // length (elements count, 32-bits value) 2184 __ tbnz(scratch_length, 31, L_failed); // i.e. sign bit set 2185 2186 __ load_klass(scratch_src_klass, src); 2187 #ifdef ASSERT 2188 // assert(src->klass() != NULL); 2189 { 2190 BLOCK_COMMENT("assert klasses not null {"); 2191 Label L1, L2; 2192 __ cbnz(scratch_src_klass, L2); // it is broken if klass is NULL 2193 __ bind(L1); 2194 __ stop("broken null klass"); 2195 __ bind(L2); 2196 __ load_klass(rscratch1, dst); 2197 __ cbz(rscratch1, L1); // this would be broken also 2198 BLOCK_COMMENT("} assert klasses not null done"); 2199 } 2200 #endif 2201 2202 // Load layout helper (32-bits) 2203 // 2204 // |array_tag| | header_size | element_type | |log2_element_size| 2205 // 32 30 24 16 8 2 0 2206 // 2207 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0 2208 // 2209 2210 const int lh_offset = in_bytes(Klass::layout_helper_offset()); 2211 2212 // Handle objArrays completely differently... 2213 const jint objArray_lh = Klass::array_layout_helper(T_OBJECT); 2214 __ ldrw(lh, Address(scratch_src_klass, lh_offset)); 2215 __ movw(rscratch1, objArray_lh); 2216 __ eorw(rscratch2, lh, rscratch1); 2217 __ cbzw(rscratch2, L_objArray); 2218 2219 // if (src->klass() != dst->klass()) return -1; 2220 __ load_klass(rscratch2, dst); 2221 __ eor(rscratch2, rscratch2, scratch_src_klass); 2222 __ cbnz(rscratch2, L_failed); 2223 2224 // if (!src->is_Array()) return -1; 2225 __ tbz(lh, 31, L_failed); // i.e. (lh >= 0) 2226 2227 // At this point, it is known to be a typeArray (array_tag 0x3). 2228 #ifdef ASSERT 2229 { 2230 BLOCK_COMMENT("assert primitive array {"); 2231 Label L; 2232 __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift); 2233 __ cmpw(lh, rscratch2); 2234 __ br(Assembler::GE, L); 2235 __ stop("must be a primitive array"); 2236 __ bind(L); 2237 BLOCK_COMMENT("} assert primitive array done"); 2238 } 2239 #endif 2240 2241 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2242 rscratch2, L_failed); 2243 2244 // TypeArrayKlass 2245 // 2246 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize); 2247 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize); 2248 // 2249 2250 const Register rscratch1_offset = rscratch1; // array offset 2251 const Register r18_elsize = lh; // element size 2252 2253 __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift, 2254 exact_log2(Klass::_lh_header_size_mask+1)); // array_offset 2255 __ add(src, src, rscratch1_offset); // src array offset 2256 __ add(dst, dst, rscratch1_offset); // dst array offset 2257 BLOCK_COMMENT("choose copy loop based on element size"); 2258 2259 // next registers should be set before the jump to corresponding stub 2260 const Register from = c_rarg0; // source array address 2261 const Register to = c_rarg1; // destination array address 2262 const Register count = c_rarg2; // elements count 2263 2264 // 'from', 'to', 'count' registers should be set in such order 2265 // since they are the same as 'src', 'src_pos', 'dst'. 
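    // That is, 'from' (c_rarg0) overwrites 'src', 'to' (c_rarg1) overwrites
    // 'src_pos' and 'count' (c_rarg2) overwrites 'dst', so each value must be
    // consumed before the register holding it is reused: compute 'from' first,
    // then 'to', then 'count'.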
2266 2267 assert(Klass::_lh_log2_element_size_shift == 0, "fix this code"); 2268 2269 // The possible values of elsize are 0-3, i.e. exact_log2(element 2270 // size in bytes). We do a simple bitwise binary search. 2271 __ BIND(L_copy_bytes); 2272 __ tbnz(r18_elsize, 1, L_copy_ints); 2273 __ tbnz(r18_elsize, 0, L_copy_shorts); 2274 __ lea(from, Address(src, src_pos));// src_addr 2275 __ lea(to, Address(dst, dst_pos));// dst_addr 2276 __ movw(count, scratch_length); // length 2277 __ b(RuntimeAddress(byte_copy_entry)); 2278 2279 __ BIND(L_copy_shorts); 2280 __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr 2281 __ lea(to, Address(dst, dst_pos, Address::lsl(1)));// dst_addr 2282 __ movw(count, scratch_length); // length 2283 __ b(RuntimeAddress(short_copy_entry)); 2284 2285 __ BIND(L_copy_ints); 2286 __ tbnz(r18_elsize, 0, L_copy_longs); 2287 __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr 2288 __ lea(to, Address(dst, dst_pos, Address::lsl(2)));// dst_addr 2289 __ movw(count, scratch_length); // length 2290 __ b(RuntimeAddress(int_copy_entry)); 2291 2292 __ BIND(L_copy_longs); 2293 #ifdef ASSERT 2294 { 2295 BLOCK_COMMENT("assert long copy {"); 2296 Label L; 2297 __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r18_elsize 2298 __ cmpw(r18_elsize, LogBytesPerLong); 2299 __ br(Assembler::EQ, L); 2300 __ stop("must be long copy, but elsize is wrong"); 2301 __ bind(L); 2302 BLOCK_COMMENT("} assert long copy done"); 2303 } 2304 #endif 2305 __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr 2306 __ lea(to, Address(dst, dst_pos, Address::lsl(3)));// dst_addr 2307 __ movw(count, scratch_length); // length 2308 __ b(RuntimeAddress(long_copy_entry)); 2309 2310 // ObjArrayKlass 2311 __ BIND(L_objArray); 2312 // live at this point: scratch_src_klass, scratch_length, src[_pos], dst[_pos] 2313 2314 Label L_plain_copy, L_checkcast_copy; 2315 // test array classes for subtyping 2316 __ load_klass(r18, dst); 2317 __ cmp(scratch_src_klass, r18); // usual case is exact equality 2318 __ br(Assembler::NE, L_checkcast_copy); 2319 2320 // Identically typed arrays can be copied without element-wise checks. 2321 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2322 rscratch2, L_failed); 2323 2324 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2325 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2326 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2327 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2328 __ movw(count, scratch_length); // length 2329 __ BIND(L_plain_copy); 2330 __ b(RuntimeAddress(oop_copy_entry)); 2331 2332 __ BIND(L_checkcast_copy); 2333 // live at this point: scratch_src_klass, scratch_length, r18 (dst_klass) 2334 { 2335 // Before looking at dst.length, make sure dst is also an objArray. 2336 __ ldrw(rscratch1, Address(r18, lh_offset)); 2337 __ movw(rscratch2, objArray_lh); 2338 __ eorw(rscratch1, rscratch1, rscratch2); 2339 __ cbnzw(rscratch1, L_failed); 2340 2341 // It is safe to examine both src.length and dst.length. 2342 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2343 r18, L_failed); 2344 2345 const Register rscratch2_dst_klass = rscratch2; 2346 __ load_klass(rscratch2_dst_klass, dst); // reload 2347 2348 // Marshal the base address arguments now, freeing registers. 
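      // In C terms (illustrative; 'base' stands for
      // arrayOopDesc::base_offset_in_bytes(T_OBJECT)):
      //   from  = (address)src + base + ((size_t)src_pos << LogBytesPerHeapOop);
      //   to    = (address)dst + base + ((size_t)dst_pos << LogBytesPerHeapOop);
      //   count = length;                    // element count, not bytes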
2349 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2350 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2351 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2352 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2353 __ movw(count, length); // length (reloaded) 2354 Register sco_temp = c_rarg3; // this register is free now 2355 assert_different_registers(from, to, count, sco_temp, 2356 rscratch2_dst_klass, scratch_src_klass); 2357 // assert_clean_int(count, sco_temp); 2358 2359 // Generate the type check. 2360 const int sco_offset = in_bytes(Klass::super_check_offset_offset()); 2361 __ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset)); 2362 // assert_clean_int(sco_temp, r18); 2363 generate_type_check(scratch_src_klass, sco_temp, rscratch2_dst_klass, L_plain_copy); 2364 2365 // Fetch destination element klass from the ObjArrayKlass header. 2366 int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset()); 2367 __ ldr(rscratch2_dst_klass, Address(rscratch2_dst_klass, ek_offset)); 2368 __ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset)); 2369 2370 // the checkcast_copy loop needs two extra arguments: 2371 assert(c_rarg3 == sco_temp, "#3 already in place"); 2372 // Set up arguments for checkcast_copy_entry. 2373 __ mov(c_rarg4, rscratch2_dst_klass); // dst.klass.element_klass 2374 __ b(RuntimeAddress(checkcast_copy_entry)); 2375 } 2376 2377 __ BIND(L_failed); 2378 __ mov(r0, -1); 2379 __ leave(); // required for proper stackwalking of RuntimeStub frame 2380 __ ret(lr); 2381 2382 return start; 2383 } 2384 2385 void generate_arraycopy_stubs() { 2386 address entry; 2387 address entry_jbyte_arraycopy; 2388 address entry_jshort_arraycopy; 2389 address entry_jint_arraycopy; 2390 address entry_oop_arraycopy; 2391 address entry_jlong_arraycopy; 2392 address entry_checkcast_arraycopy; 2393 2394 generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards); 2395 generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards); 2396 2397 StubRoutines::aarch64::_zero_longs = generate_zero_longs(r10, r11); 2398 2399 //*** jbyte 2400 // Always need aligned and unaligned versions 2401 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(false, &entry, 2402 "jbyte_disjoint_arraycopy"); 2403 StubRoutines::_jbyte_arraycopy = generate_conjoint_byte_copy(false, entry, 2404 &entry_jbyte_arraycopy, 2405 "jbyte_arraycopy"); 2406 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry, 2407 "arrayof_jbyte_disjoint_arraycopy"); 2408 StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_byte_copy(true, entry, NULL, 2409 "arrayof_jbyte_arraycopy"); 2410 2411 //*** jshort 2412 // Always need aligned and unaligned versions 2413 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, &entry, 2414 "jshort_disjoint_arraycopy"); 2415 StubRoutines::_jshort_arraycopy = generate_conjoint_short_copy(false, entry, 2416 &entry_jshort_arraycopy, 2417 "jshort_arraycopy"); 2418 StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry, 2419 "arrayof_jshort_disjoint_arraycopy"); 2420 StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_short_copy(true, entry, NULL, 2421 "arrayof_jshort_arraycopy"); 2422 2423 //*** jint 2424 // Aligned versions 2425 StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry, 2426 "arrayof_jint_disjoint_arraycopy"); 2427 StubRoutines::_arrayof_jint_arraycopy = 
generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy, 2428 "arrayof_jint_arraycopy"); 2429 // In 64 bit we need both aligned and unaligned versions of jint arraycopy. 2430 // entry_jint_arraycopy always points to the unaligned version 2431 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_int_copy(false, &entry, 2432 "jint_disjoint_arraycopy"); 2433 StubRoutines::_jint_arraycopy = generate_conjoint_int_copy(false, entry, 2434 &entry_jint_arraycopy, 2435 "jint_arraycopy"); 2436 2437 //*** jlong 2438 // It is always aligned 2439 StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry, 2440 "arrayof_jlong_disjoint_arraycopy"); 2441 StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy, 2442 "arrayof_jlong_arraycopy"); 2443 StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy; 2444 StubRoutines::_jlong_arraycopy = StubRoutines::_arrayof_jlong_arraycopy; 2445 2446 //*** oops 2447 { 2448 // With compressed oops we need unaligned versions; notice that 2449 // we overwrite entry_oop_arraycopy. 2450 bool aligned = !UseCompressedOops; 2451 2452 StubRoutines::_arrayof_oop_disjoint_arraycopy 2453 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy", 2454 /*dest_uninitialized*/false); 2455 StubRoutines::_arrayof_oop_arraycopy 2456 = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy", 2457 /*dest_uninitialized*/false); 2458 // Aligned versions without pre-barriers 2459 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit 2460 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit", 2461 /*dest_uninitialized*/true); 2462 StubRoutines::_arrayof_oop_arraycopy_uninit 2463 = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit", 2464 /*dest_uninitialized*/true); 2465 } 2466 2467 StubRoutines::_oop_disjoint_arraycopy = StubRoutines::_arrayof_oop_disjoint_arraycopy; 2468 StubRoutines::_oop_arraycopy = StubRoutines::_arrayof_oop_arraycopy; 2469 StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit; 2470 StubRoutines::_oop_arraycopy_uninit = StubRoutines::_arrayof_oop_arraycopy_uninit; 2471 2472 StubRoutines::_checkcast_arraycopy = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy); 2473 StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL, 2474 /*dest_uninitialized*/true); 2475 2476 StubRoutines::_unsafe_arraycopy = generate_unsafe_copy("unsafe_arraycopy", 2477 entry_jbyte_arraycopy, 2478 entry_jshort_arraycopy, 2479 entry_jint_arraycopy, 2480 entry_jlong_arraycopy); 2481 2482 StubRoutines::_generic_arraycopy = generate_generic_copy("generic_arraycopy", 2483 entry_jbyte_arraycopy, 2484 entry_jshort_arraycopy, 2485 entry_jint_arraycopy, 2486 entry_oop_arraycopy, 2487 entry_jlong_arraycopy, 2488 entry_checkcast_arraycopy); 2489 2490 StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill"); 2491 StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill"); 2492 StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill"); 2493 StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill"); 2494 StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill"); 2495 StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill"); 2496 } 2497 
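  // Note on the entry points wired up above: each "disjoint" stub assumes the
  // source and destination ranges cannot overlap and may copy in any order,
  // while the plain (conjoint) stubs receive the matching disjoint stub as
  // their nooverlap_target and branch to it whenever the overlap test passes.
  // A C model of a conjoint entry (a sketch, not any particular stub):
  //
  //   void conjoint_copy(jbyte* from, jbyte* to, size_t count) {
  //     if (to <= from || to >= from + count) {
  //       disjoint_copy(from, to, count);          // no hazard: forward copy
  //     } else {
  //       while (count--) to[count] = from[count]; // overlap: copy backwards
  //     }
  //   }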
2498 // Arguments: 2499 // 2500 // Inputs: 2501 // c_rarg0 - source byte array address 2502 // c_rarg1 - destination byte array address 2503 // c_rarg2 - K (key) in little endian int array 2504 // 2505 address generate_aescrypt_encryptBlock() { 2506 __ align(CodeEntryAlignment); 2507 StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock"); 2508 2509 Label L_doLast; 2510 2511 const Register from = c_rarg0; // source array address 2512 const Register to = c_rarg1; // destination array address 2513 const Register key = c_rarg2; // key array address 2514 const Register keylen = rscratch1; 2515 2516 address start = __ pc(); 2517 __ enter(); 2518 2519 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2520 2521 __ ld1(v0, __ T16B, from); // get 16 bytes of input 2522 2523 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64)); 2524 __ rev32(v1, __ T16B, v1); 2525 __ rev32(v2, __ T16B, v2); 2526 __ rev32(v3, __ T16B, v3); 2527 __ rev32(v4, __ T16B, v4); 2528 __ aese(v0, v1); 2529 __ aesmc(v0, v0); 2530 __ aese(v0, v2); 2531 __ aesmc(v0, v0); 2532 __ aese(v0, v3); 2533 __ aesmc(v0, v0); 2534 __ aese(v0, v4); 2535 __ aesmc(v0, v0); 2536 2537 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64)); 2538 __ rev32(v1, __ T16B, v1); 2539 __ rev32(v2, __ T16B, v2); 2540 __ rev32(v3, __ T16B, v3); 2541 __ rev32(v4, __ T16B, v4); 2542 __ aese(v0, v1); 2543 __ aesmc(v0, v0); 2544 __ aese(v0, v2); 2545 __ aesmc(v0, v0); 2546 __ aese(v0, v3); 2547 __ aesmc(v0, v0); 2548 __ aese(v0, v4); 2549 __ aesmc(v0, v0); 2550 2551 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2552 __ rev32(v1, __ T16B, v1); 2553 __ rev32(v2, __ T16B, v2); 2554 2555 __ cmpw(keylen, 44); 2556 __ br(Assembler::EQ, L_doLast); 2557 2558 __ aese(v0, v1); 2559 __ aesmc(v0, v0); 2560 __ aese(v0, v2); 2561 __ aesmc(v0, v0); 2562 2563 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2564 __ rev32(v1, __ T16B, v1); 2565 __ rev32(v2, __ T16B, v2); 2566 2567 __ cmpw(keylen, 52); 2568 __ br(Assembler::EQ, L_doLast); 2569 2570 __ aese(v0, v1); 2571 __ aesmc(v0, v0); 2572 __ aese(v0, v2); 2573 __ aesmc(v0, v0); 2574 2575 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2576 __ rev32(v1, __ T16B, v1); 2577 __ rev32(v2, __ T16B, v2); 2578 2579 __ BIND(L_doLast); 2580 2581 __ aese(v0, v1); 2582 __ aesmc(v0, v0); 2583 __ aese(v0, v2); 2584 2585 __ ld1(v1, __ T16B, key); 2586 __ rev32(v1, __ T16B, v1); 2587 __ eor(v0, __ T16B, v0, v1); 2588 2589 __ st1(v0, __ T16B, to); 2590 2591 __ mov(r0, 0); 2592 2593 __ leave(); 2594 __ ret(lr); 2595 2596 return start; 2597 } 2598 2599 // Arguments: 2600 // 2601 // Inputs: 2602 // c_rarg0 - source byte array address 2603 // c_rarg1 - destination byte array address 2604 // c_rarg2 - K (key) in little endian int array 2605 // 2606 address generate_aescrypt_decryptBlock() { 2607 assert(UseAES, "need AES instructions and misaligned SSE support"); 2608 __ align(CodeEntryAlignment); 2609 StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock"); 2610 Label L_doLast; 2611 2612 const Register from = c_rarg0; // source array address 2613 const Register to = c_rarg1; // destination array address 2614 const Register key = c_rarg2; // key array address 2615 const Register keylen = rscratch1; 2616 2617 address start = __ pc(); 2618 __ enter(); // required for proper stackwalking of RuntimeStub frame 2619 2620 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2621 2622 __ ld1(v0, __ T16B, from); // get 16 bytes of 
input 2623 2624 __ ld1(v5, __ T16B, __ post(key, 16)); 2625 __ rev32(v5, __ T16B, v5); 2626 2627 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64)); 2628 __ rev32(v1, __ T16B, v1); 2629 __ rev32(v2, __ T16B, v2); 2630 __ rev32(v3, __ T16B, v3); 2631 __ rev32(v4, __ T16B, v4); 2632 __ aesd(v0, v1); 2633 __ aesimc(v0, v0); 2634 __ aesd(v0, v2); 2635 __ aesimc(v0, v0); 2636 __ aesd(v0, v3); 2637 __ aesimc(v0, v0); 2638 __ aesd(v0, v4); 2639 __ aesimc(v0, v0); 2640 2641 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64)); 2642 __ rev32(v1, __ T16B, v1); 2643 __ rev32(v2, __ T16B, v2); 2644 __ rev32(v3, __ T16B, v3); 2645 __ rev32(v4, __ T16B, v4); 2646 __ aesd(v0, v1); 2647 __ aesimc(v0, v0); 2648 __ aesd(v0, v2); 2649 __ aesimc(v0, v0); 2650 __ aesd(v0, v3); 2651 __ aesimc(v0, v0); 2652 __ aesd(v0, v4); 2653 __ aesimc(v0, v0); 2654 2655 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2656 __ rev32(v1, __ T16B, v1); 2657 __ rev32(v2, __ T16B, v2); 2658 2659 __ cmpw(keylen, 44); 2660 __ br(Assembler::EQ, L_doLast); 2661 2662 __ aesd(v0, v1); 2663 __ aesimc(v0, v0); 2664 __ aesd(v0, v2); 2665 __ aesimc(v0, v0); 2666 2667 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2668 __ rev32(v1, __ T16B, v1); 2669 __ rev32(v2, __ T16B, v2); 2670 2671 __ cmpw(keylen, 52); 2672 __ br(Assembler::EQ, L_doLast); 2673 2674 __ aesd(v0, v1); 2675 __ aesimc(v0, v0); 2676 __ aesd(v0, v2); 2677 __ aesimc(v0, v0); 2678 2679 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2680 __ rev32(v1, __ T16B, v1); 2681 __ rev32(v2, __ T16B, v2); 2682 2683 __ BIND(L_doLast); 2684 2685 __ aesd(v0, v1); 2686 __ aesimc(v0, v0); 2687 __ aesd(v0, v2); 2688 2689 __ eor(v0, __ T16B, v0, v5); 2690 2691 __ st1(v0, __ T16B, to); 2692 2693 __ mov(r0, 0); 2694 2695 __ leave(); 2696 __ ret(lr); 2697 2698 return start; 2699 } 2700 2701 // Arguments: 2702 // 2703 // Inputs: 2704 // c_rarg0 - source byte array address 2705 // c_rarg1 - destination byte array address 2706 // c_rarg2 - K (key) in little endian int array 2707 // c_rarg3 - r vector byte array address 2708 // c_rarg4 - input length 2709 // 2710 // Output: 2711 // x0 - input length 2712 // 2713 address generate_cipherBlockChaining_encryptAESCrypt() { 2714 assert(UseAES, "need AES instructions and misaligned SSE support"); 2715 __ align(CodeEntryAlignment); 2716 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt"); 2717 2718 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52, _L_finish; 2719 2720 const Register from = c_rarg0; // source array address 2721 const Register to = c_rarg1; // destination array address 2722 const Register key = c_rarg2; // key array address 2723 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2724 // and left with the results of the last encryption block 2725 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2726 const Register keylen = rscratch1; 2727 2728 address start = __ pc(); 2729 2730 __ enter(); 2731 2732 __ subsw(rscratch2, len_reg, zr); 2733 __ br(Assembler::LE, _L_finish); 2734 2735 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2736 2737 __ ld1(v0, __ T16B, rvec); 2738 2739 __ cmpw(keylen, 52); 2740 __ br(Assembler::CC, L_loadkeys_44); 2741 __ br(Assembler::EQ, L_loadkeys_52); 2742 2743 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2744 __ rev32(v17, __ T16B, v17); 2745 __ rev32(v18, __ T16B, v18); 2746 __ BIND(L_loadkeys_52); 2747 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2748 
__ rev32(v19, __ T16B, v19); 2749 __ rev32(v20, __ T16B, v20); 2750 __ BIND(L_loadkeys_44); 2751 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2752 __ rev32(v21, __ T16B, v21); 2753 __ rev32(v22, __ T16B, v22); 2754 __ rev32(v23, __ T16B, v23); 2755 __ rev32(v24, __ T16B, v24); 2756 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2757 __ rev32(v25, __ T16B, v25); 2758 __ rev32(v26, __ T16B, v26); 2759 __ rev32(v27, __ T16B, v27); 2760 __ rev32(v28, __ T16B, v28); 2761 __ ld1(v29, v30, v31, __ T16B, key); 2762 __ rev32(v29, __ T16B, v29); 2763 __ rev32(v30, __ T16B, v30); 2764 __ rev32(v31, __ T16B, v31); 2765 2766 __ BIND(L_aes_loop); 2767 __ ld1(v1, __ T16B, __ post(from, 16)); 2768 __ eor(v0, __ T16B, v0, v1); 2769 2770 __ br(Assembler::CC, L_rounds_44); 2771 __ br(Assembler::EQ, L_rounds_52); 2772 2773 __ aese(v0, v17); __ aesmc(v0, v0); 2774 __ aese(v0, v18); __ aesmc(v0, v0); 2775 __ BIND(L_rounds_52); 2776 __ aese(v0, v19); __ aesmc(v0, v0); 2777 __ aese(v0, v20); __ aesmc(v0, v0); 2778 __ BIND(L_rounds_44); 2779 __ aese(v0, v21); __ aesmc(v0, v0); 2780 __ aese(v0, v22); __ aesmc(v0, v0); 2781 __ aese(v0, v23); __ aesmc(v0, v0); 2782 __ aese(v0, v24); __ aesmc(v0, v0); 2783 __ aese(v0, v25); __ aesmc(v0, v0); 2784 __ aese(v0, v26); __ aesmc(v0, v0); 2785 __ aese(v0, v27); __ aesmc(v0, v0); 2786 __ aese(v0, v28); __ aesmc(v0, v0); 2787 __ aese(v0, v29); __ aesmc(v0, v0); 2788 __ aese(v0, v30); 2789 __ eor(v0, __ T16B, v0, v31); 2790 2791 __ st1(v0, __ T16B, __ post(to, 16)); 2792 2793 __ subw(len_reg, len_reg, 16); 2794 __ cbnzw(len_reg, L_aes_loop); 2795 2796 __ st1(v0, __ T16B, rvec); 2797 2798 __ BIND(_L_finish); 2799 __ mov(r0, rscratch2); 2800 2801 __ leave(); 2802 __ ret(lr); 2803 2804 return start; 2805 } 2806 2807 // Arguments: 2808 // 2809 // Inputs: 2810 // c_rarg0 - source byte array address 2811 // c_rarg1 - destination byte array address 2812 // c_rarg2 - K (key) in little endian int array 2813 // c_rarg3 - r vector byte array address 2814 // c_rarg4 - input length 2815 // 2816 // Output: 2817 // r0 - input length 2818 // 2819 address generate_cipherBlockChaining_decryptAESCrypt() { 2820 assert(UseAES, "need AES instructions and misaligned SSE support"); 2821 __ align(CodeEntryAlignment); 2822 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt"); 2823 2824 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52, _L_finish; 2825 2826 const Register from = c_rarg0; // source array address 2827 const Register to = c_rarg1; // destination array address 2828 const Register key = c_rarg2; // key array address 2829 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2830 // and left with the results of the last encryption block 2831 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2832 const Register keylen = rscratch1; 2833 2834 address start = __ pc(); 2835 2836 __ enter(); 2837 2838 __ subsw(rscratch2, len_reg, zr); 2839 __ br(Assembler::LE, _L_finish); 2840 2841 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2842 2843 __ ld1(v2, __ T16B, rvec); 2844 2845 __ ld1(v31, __ T16B, __ post(key, 16)); 2846 __ rev32(v31, __ T16B, v31); 2847 2848 __ cmpw(keylen, 52); 2849 __ br(Assembler::CC, L_loadkeys_44); 2850 __ br(Assembler::EQ, L_loadkeys_52); 2851 2852 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2853 __ rev32(v17, __ T16B, v17); 2854 __ rev32(v18, __ T16B, v18); 2855 __ 
BIND(L_loadkeys_52); 2856 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2857 __ rev32(v19, __ T16B, v19); 2858 __ rev32(v20, __ T16B, v20); 2859 __ BIND(L_loadkeys_44); 2860 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2861 __ rev32(v21, __ T16B, v21); 2862 __ rev32(v22, __ T16B, v22); 2863 __ rev32(v23, __ T16B, v23); 2864 __ rev32(v24, __ T16B, v24); 2865 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2866 __ rev32(v25, __ T16B, v25); 2867 __ rev32(v26, __ T16B, v26); 2868 __ rev32(v27, __ T16B, v27); 2869 __ rev32(v28, __ T16B, v28); 2870 __ ld1(v29, v30, __ T16B, key); 2871 __ rev32(v29, __ T16B, v29); 2872 __ rev32(v30, __ T16B, v30); 2873 2874 __ BIND(L_aes_loop); 2875 __ ld1(v0, __ T16B, __ post(from, 16)); 2876 __ orr(v1, __ T16B, v0, v0); 2877 2878 __ br(Assembler::CC, L_rounds_44); 2879 __ br(Assembler::EQ, L_rounds_52); 2880 2881 __ aesd(v0, v17); __ aesimc(v0, v0); 2882 __ aesd(v0, v18); __ aesimc(v0, v0); 2883 __ BIND(L_rounds_52); 2884 __ aesd(v0, v19); __ aesimc(v0, v0); 2885 __ aesd(v0, v20); __ aesimc(v0, v0); 2886 __ BIND(L_rounds_44); 2887 __ aesd(v0, v21); __ aesimc(v0, v0); 2888 __ aesd(v0, v22); __ aesimc(v0, v0); 2889 __ aesd(v0, v23); __ aesimc(v0, v0); 2890 __ aesd(v0, v24); __ aesimc(v0, v0); 2891 __ aesd(v0, v25); __ aesimc(v0, v0); 2892 __ aesd(v0, v26); __ aesimc(v0, v0); 2893 __ aesd(v0, v27); __ aesimc(v0, v0); 2894 __ aesd(v0, v28); __ aesimc(v0, v0); 2895 __ aesd(v0, v29); __ aesimc(v0, v0); 2896 __ aesd(v0, v30); 2897 __ eor(v0, __ T16B, v0, v31); 2898 __ eor(v0, __ T16B, v0, v2); 2899 2900 __ st1(v0, __ T16B, __ post(to, 16)); 2901 __ orr(v2, __ T16B, v1, v1); 2902 2903 __ subw(len_reg, len_reg, 16); 2904 __ cbnzw(len_reg, L_aes_loop); 2905 2906 __ st1(v2, __ T16B, rvec); 2907 2908 __ BIND(_L_finish); 2909 __ mov(r0, rscratch2); 2910 2911 __ leave(); 2912 __ ret(lr); 2913 2914 return start; 2915 } 2916 2917 // Arguments: 2918 // 2919 // Inputs: 2920 // c_rarg0 - byte[] source+offset 2921 // c_rarg1 - int[] SHA.state 2922 // c_rarg2 - int offset 2923 // c_rarg3 - int limit 2924 // 2925 address generate_sha1_implCompress(bool multi_block, const char *name) { 2926 __ align(CodeEntryAlignment); 2927 StubCodeMark mark(this, "StubRoutines", name); 2928 address start = __ pc(); 2929 2930 Register buf = c_rarg0; 2931 Register state = c_rarg1; 2932 Register ofs = c_rarg2; 2933 Register limit = c_rarg3; 2934 2935 Label keys; 2936 Label sha1_loop; 2937 2938 // load the keys into v0..v3 2939 __ adr(rscratch1, keys); 2940 __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1)); 2941 // load 5 words state into v6, v7 2942 __ ldrq(v6, Address(state, 0)); 2943 __ ldrs(v7, Address(state, 16)); 2944 2945 2946 __ BIND(sha1_loop); 2947 // load 64 bytes of data into v16..v19 2948 __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf); 2949 __ rev32(v16, __ T16B, v16); 2950 __ rev32(v17, __ T16B, v17); 2951 __ rev32(v18, __ T16B, v18); 2952 __ rev32(v19, __ T16B, v19); 2953 2954 // do the sha1 2955 __ addv(v4, __ T4S, v16, v0); 2956 __ orr(v20, __ T16B, v6, v6); 2957 2958 FloatRegister d0 = v16; 2959 FloatRegister d1 = v17; 2960 FloatRegister d2 = v18; 2961 FloatRegister d3 = v19; 2962 2963 for (int round = 0; round < 20; round++) { 2964 FloatRegister tmp1 = (round & 1) ? v4 : v5; 2965 FloatRegister tmp2 = (round & 1) ? v21 : v22; 2966 FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7; 2967 FloatRegister tmp4 = (round & 1) ? v5 : v4; 2968 FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? 
v2 : v3)); 2969 2970 if (round < 16) __ sha1su0(d0, __ T4S, d1, d2); 2971 if (round < 19) __ addv(tmp1, __ T4S, d1, key); 2972 __ sha1h(tmp2, __ T4S, v20); 2973 if (round < 5) 2974 __ sha1c(v20, __ T4S, tmp3, tmp4); 2975 else if (round < 10 || round >= 15) 2976 __ sha1p(v20, __ T4S, tmp3, tmp4); 2977 else 2978 __ sha1m(v20, __ T4S, tmp3, tmp4); 2979 if (round < 16) __ sha1su1(d0, __ T4S, d3); 2980 2981 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 2982 } 2983 2984 __ addv(v7, __ T2S, v7, v21); 2985 __ addv(v6, __ T4S, v6, v20); 2986 2987 if (multi_block) { 2988 __ add(ofs, ofs, 64); 2989 __ cmp(ofs, limit); 2990 __ br(Assembler::LE, sha1_loop); 2991 __ mov(c_rarg0, ofs); // return ofs 2992 } 2993 2994 __ strq(v6, Address(state, 0)); 2995 __ strs(v7, Address(state, 16)); 2996 2997 __ ret(lr); 2998 2999 __ bind(keys); 3000 __ emit_int32(0x5a827999); 3001 __ emit_int32(0x6ed9eba1); 3002 __ emit_int32(0x8f1bbcdc); 3003 __ emit_int32(0xca62c1d6); 3004 3005 return start; 3006 } 3007 3008 3009 // Arguments: 3010 // 3011 // Inputs: 3012 // c_rarg0 - byte[] source+offset 3013 // c_rarg1 - int[] SHA.state 3014 // c_rarg2 - int offset 3015 // c_rarg3 - int limit 3016 // 3017 address generate_sha256_implCompress(bool multi_block, const char *name) { 3018 static const uint32_t round_consts[64] = { 3019 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 3020 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, 3021 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 3022 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 3023 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 3024 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, 3025 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 3026 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, 3027 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 3028 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 3029 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 3030 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, 3031 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 3032 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, 3033 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 3034 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, 3035 }; 3036 __ align(CodeEntryAlignment); 3037 StubCodeMark mark(this, "StubRoutines", name); 3038 address start = __ pc(); 3039 3040 Register buf = c_rarg0; 3041 Register state = c_rarg1; 3042 Register ofs = c_rarg2; 3043 Register limit = c_rarg3; 3044 3045 Label sha1_loop; 3046 3047 __ stpd(v8, v9, __ pre(sp, -32)); 3048 __ stpd(v10, v11, Address(sp, 16)); 3049 3050 // dga == v0 3051 // dgb == v1 3052 // dg0 == v2 3053 // dg1 == v3 3054 // dg2 == v4 3055 // t0 == v6 3056 // t1 == v7 3057 3058 // load 16 keys to v16..v31 3059 __ lea(rscratch1, ExternalAddress((address)round_consts)); 3060 __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64)); 3061 __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64)); 3062 __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64)); 3063 __ ld1(v28, v29, v30, v31, __ T4S, rscratch1); 3064 3065 // load 8 words (256 bits) state 3066 __ ldpq(v0, v1, state); 3067 3068 __ BIND(sha1_loop); 3069 // load 64 bytes of data into v8..v11 3070 __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? 
__ post(buf, 64) : buf); 3071 __ rev32(v8, __ T16B, v8); 3072 __ rev32(v9, __ T16B, v9); 3073 __ rev32(v10, __ T16B, v10); 3074 __ rev32(v11, __ T16B, v11); 3075 3076 __ addv(v6, __ T4S, v8, v16); 3077 __ orr(v2, __ T16B, v0, v0); 3078 __ orr(v3, __ T16B, v1, v1); 3079 3080 FloatRegister d0 = v8; 3081 FloatRegister d1 = v9; 3082 FloatRegister d2 = v10; 3083 FloatRegister d3 = v11; 3084 3085 3086 for (int round = 0; round < 16; round++) { 3087 FloatRegister tmp1 = (round & 1) ? v6 : v7; 3088 FloatRegister tmp2 = (round & 1) ? v7 : v6; 3089 FloatRegister tmp3 = (round & 1) ? v2 : v4; 3090 FloatRegister tmp4 = (round & 1) ? v4 : v2; 3091 3092 if (round < 12) __ sha256su0(d0, __ T4S, d1); 3093 __ orr(v4, __ T16B, v2, v2); 3094 if (round < 15) 3095 __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17)); 3096 __ sha256h(v2, __ T4S, v3, tmp2); 3097 __ sha256h2(v3, __ T4S, v4, tmp2); 3098 if (round < 12) __ sha256su1(d0, __ T4S, d2, d3); 3099 3100 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 3101 } 3102 3103 __ addv(v0, __ T4S, v0, v2); 3104 __ addv(v1, __ T4S, v1, v3); 3105 3106 if (multi_block) { 3107 __ add(ofs, ofs, 64); 3108 __ cmp(ofs, limit); 3109 __ br(Assembler::LE, sha1_loop); 3110 __ mov(c_rarg0, ofs); // return ofs 3111 } 3112 3113 __ ldpd(v10, v11, Address(sp, 16)); 3114 __ ldpd(v8, v9, __ post(sp, 32)); 3115 3116 __ stpq(v0, v1, state); 3117 3118 __ ret(lr); 3119 3120 return start; 3121 } 3122 3123 // Safefetch stubs. 3124 void generate_safefetch(const char* name, int size, address* entry, 3125 address* fault_pc, address* continuation_pc) { 3126 // safefetch signatures: 3127 // int SafeFetch32(int* adr, int errValue); 3128 // intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue); 3129 // 3130 // arguments: 3131 // c_rarg0 = adr 3132 // c_rarg1 = errValue 3133 // 3134 // result: 3135 // PPC_RET = *adr or errValue 3136 3137 StubCodeMark mark(this, "StubRoutines", name); 3138 3139 // Entry point, pc or function descriptor. 3140 *entry = __ pc(); 3141 3142 // Load *adr into c_rarg1, may fault. 
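    // The load below is allowed to take a SIGSEGV/SIGBUS.  The VM's signal
    // handler recognises *fault_pc and resumes execution at *continuation_pc,
    // where the errValue still sitting in c_rarg1 becomes the return value.
    // Typical use (illustrative):
    //   int v = SafeFetch32((int*)maybe_unmapped, -1);   // -1 instead of a crash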
3143 *fault_pc = __ pc(); 3144 switch (size) { 3145 case 4: 3146 // int32_t 3147 __ ldrw(c_rarg1, Address(c_rarg0, 0)); 3148 break; 3149 case 8: 3150 // int64_t 3151 __ ldr(c_rarg1, Address(c_rarg0, 0)); 3152 break; 3153 default: 3154 ShouldNotReachHere(); 3155 } 3156 3157 // return errValue or *adr 3158 *continuation_pc = __ pc(); 3159 __ mov(r0, c_rarg1); 3160 __ ret(lr); 3161 } 3162 3163 /** 3164 * Arguments: 3165 * 3166 * Inputs: 3167 * c_rarg0 - int crc 3168 * c_rarg1 - byte* buf 3169 * c_rarg2 - int length 3170 * 3171 * Output: 3172 * r0 - int crc result 3173 * 3174 * Preserves: 3175 * r13 3176 * 3177 */ 3178 address generate_updateBytesCRC32() { 3179 assert(UseCRC32Intrinsics, "what are we doing here?"); 3180 3181 __ align(CodeEntryAlignment); 3182 StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32"); 3183 3184 address start = __ pc(); 3185 3186 const Register crc = c_rarg0; // crc 3187 const Register buf = c_rarg1; // source java byte array address 3188 const Register len = c_rarg2; // length 3189 const Register table0 = c_rarg3; // crc_table address 3190 const Register table1 = c_rarg4; 3191 const Register table2 = c_rarg5; 3192 const Register table3 = c_rarg6; 3193 const Register tmp3 = c_rarg7; 3194 3195 BLOCK_COMMENT("Entry:"); 3196 __ enter(); // required for proper stackwalking of RuntimeStub frame 3197 3198 __ kernel_crc32(crc, buf, len, 3199 table0, table1, table2, table3, rscratch1, rscratch2, tmp3); 3200 3201 __ leave(); // required for proper stackwalking of RuntimeStub frame 3202 __ ret(lr); 3203 3204 return start; 3205 } 3206 3207 void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi, 3208 FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0, 3209 FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4) { 3210 // Karatsuba multiplication performs a 128*128 -> 256-bit 3211 // multiplication in three 128-bit multiplications and a few 3212 // additions. 3213 // 3214 // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1) 3215 // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0 3216 // 3217 // Inputs: 3218 // 3219 // A0 in a.d[0] (subkey) 3220 // A1 in a.d[1] 3221 // (A1+A0) in a1_xor_a0.d[0] 3222 // 3223 // B0 in b.d[0] (state) 3224 // B1 in b.d[1] 3225 3226 __ ext(tmp1, __ T16B, b, b, 0x08); 3227 __ pmull2(result_hi, __ T1Q, b, a, __ T2D); // A1*B1 3228 __ eor(tmp1, __ T16B, tmp1, b); // (B1+B0) 3229 __ pmull(result_lo, __ T1Q, b, a, __ T1D); // A0*B0 3230 __ pmull(tmp2, __ T1Q, tmp1, a1_xor_a0, __ T1D); // (A1+A0)(B1+B0) 3231 3232 __ ext(tmp4, __ T16B, result_lo, result_hi, 0x08); 3233 __ eor(tmp3, __ T16B, result_hi, result_lo); // A1*B1+A0*B0 3234 __ eor(tmp2, __ T16B, tmp2, tmp4); 3235 __ eor(tmp2, __ T16B, tmp2, tmp3); 3236 3237 // Register pair <result_hi:result_lo> holds the result of carry-less multiplication 3238 __ ins(result_hi, __ D, tmp2, 0, 1); 3239 __ ins(result_lo, __ D, tmp2, 1, 0); 3240 } 3241 3242 void ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi, 3243 FloatRegister p, FloatRegister z, FloatRegister t1) { 3244 const FloatRegister t0 = result; 3245 3246 // The GCM field polynomial f is z^128 + p(z), where p = 3247 // z^7+z^2+z+1. 3248 // 3249 // z^128 === -p(z) (mod (z^128 + p(z))) 3250 // 3251 // so, given that the product we're reducing is 3252 // a == lo + hi * z^128 3253 // substituting, 3254 // === lo - hi * p(z) (mod (z^128 + p(z))) 3255 // 3256 // we reduce by multiplying hi by p(z) and subtracting the result 3257 // from (i.e. XORing it with) lo. 
Because p has no nonzero high 3258 // bits we can do this with two 64-bit multiplications, lo*p and 3259 // hi*p. 3260 3261 __ pmull2(t0, __ T1Q, hi, p, __ T2D); 3262 __ ext(t1, __ T16B, t0, z, 8); 3263 __ eor(hi, __ T16B, hi, t1); 3264 __ ext(t1, __ T16B, z, t0, 8); 3265 __ eor(lo, __ T16B, lo, t1); 3266 __ pmull(t0, __ T1Q, hi, p, __ T1D); 3267 __ eor(result, __ T16B, lo, t0); 3268 } 3269 3270 /** 3271 * Arguments: 3272 * 3273 * Input: 3274 * c_rarg0 - x address 3275 * c_rarg1 - x length 3276 * c_rarg2 - y address 3277 * c_rarg3 - y lenth 3278 * c_rarg4 - z address 3279 * c_rarg5 - z length 3280 */ 3281 address generate_multiplyToLen() { 3282 __ align(CodeEntryAlignment); 3283 StubCodeMark mark(this, "StubRoutines", "multiplyToLen"); 3284 3285 address start = __ pc(); 3286 const Register x = r0; 3287 const Register xlen = r1; 3288 const Register y = r2; 3289 const Register ylen = r3; 3290 const Register z = r4; 3291 const Register zlen = r5; 3292 3293 const Register tmp1 = r10; 3294 const Register tmp2 = r11; 3295 const Register tmp3 = r12; 3296 const Register tmp4 = r13; 3297 const Register tmp5 = r14; 3298 const Register tmp6 = r15; 3299 const Register tmp7 = r16; 3300 3301 BLOCK_COMMENT("Entry:"); 3302 __ enter(); // required for proper stackwalking of RuntimeStub frame 3303 __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 3304 __ leave(); // required for proper stackwalking of RuntimeStub frame 3305 __ ret(lr); 3306 3307 return start; 3308 } 3309 3310 /** 3311 * Arguments: 3312 * 3313 * Input: 3314 * c_rarg0 - current state address 3315 * c_rarg1 - H key address 3316 * c_rarg2 - data address 3317 * c_rarg3 - number of blocks 3318 * 3319 * Output: 3320 * Updated state at c_rarg0 3321 */ 3322 address generate_ghash_processBlocks() { 3323 // Bafflingly, GCM uses little-endian for the byte order, but 3324 // big-endian for the bit order. For example, the polynomial 1 is 3325 // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00. 3326 // 3327 // So, we must either reverse the bytes in each word and do 3328 // everything big-endian or reverse the bits in each byte and do 3329 // it little-endian. On AArch64 it's more idiomatic to reverse 3330 // the bits in each byte (we have an instruction, RBIT, to do 3331 // that) and keep the data in little-endian bit order throught the 3332 // calculation, bit-reversing the inputs and outputs. 3333 3334 StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks"); 3335 __ align(wordSize * 2); 3336 address p = __ pc(); 3337 __ emit_int64(0x87); // The low-order bits of the field 3338 // polynomial (i.e. 
p = z^7+z^2+z+1) 3339 // repeated in the low and high parts of a 3340 // 128-bit vector 3341 __ emit_int64(0x87); 3342 3343 __ align(CodeEntryAlignment); 3344 address start = __ pc(); 3345 3346 Register state = c_rarg0; 3347 Register subkeyH = c_rarg1; 3348 Register data = c_rarg2; 3349 Register blocks = c_rarg3; 3350 3351 FloatRegister vzr = v30; 3352 __ eor(vzr, __ T16B, vzr, vzr); // zero register 3353 3354 __ ldrq(v0, Address(state)); 3355 __ ldrq(v1, Address(subkeyH)); 3356 3357 __ rev64(v0, __ T16B, v0); // Bit-reverse words in state and subkeyH 3358 __ rbit(v0, __ T16B, v0); 3359 __ rev64(v1, __ T16B, v1); 3360 __ rbit(v1, __ T16B, v1); 3361 3362 __ ldrq(v26, p); 3363 3364 __ ext(v16, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1 3365 __ eor(v16, __ T16B, v16, v1); // xor subkeyH into subkeyL (Karatsuba: (A1+A0)) 3366 3367 { 3368 Label L_ghash_loop; 3369 __ bind(L_ghash_loop); 3370 3371 __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit 3372 // reversing each byte 3373 __ rbit(v2, __ T16B, v2); 3374 __ eor(v2, __ T16B, v0, v2); // bit-swapped data ^ bit-swapped state 3375 3376 // Multiply state in v2 by subkey in v1 3377 ghash_multiply(/*result_lo*/v5, /*result_hi*/v7, 3378 /*a*/v1, /*b*/v2, /*a1_xor_a0*/v16, 3379 /*temps*/v6, v20, v18, v21); 3380 // Reduce v7:v5 by the field polynomial 3381 ghash_reduce(v0, v5, v7, v26, vzr, v20); 3382 3383 __ sub(blocks, blocks, 1); 3384 __ cbnz(blocks, L_ghash_loop); 3385 } 3386 3387 // The bit-reversed result is at this point in v0 3388 __ rev64(v1, __ T16B, v0); 3389 __ rbit(v1, __ T16B, v1); 3390 3391 __ st1(v1, __ T16B, state); 3392 __ ret(lr); 3393 3394 return start; 3395 } 3396 3397 // Continuation point for throwing of implicit exceptions that are 3398 // not handled in the current activation. Fabricates an exception 3399 // oop and initiates normal exception dispatching in this 3400 // frame. Since we need to preserve callee-saved values (currently 3401 // only for C2, but done for C1 as well) we need a callee-saved oop 3402 // map and therefore have to make these stubs into RuntimeStubs 3403 // rather than BufferBlobs. If the compiler needs all registers to 3404 // be preserved between the fault point and the exception handler 3405 // then it must assume responsibility for that in 3406 // AbstractCompiler::continuation_for_implicit_null_exception or 3407 // continuation_for_implicit_division_by_zero_exception. All other 3408 // implicit exceptions (e.g., NullPointerException or 3409 // AbstractMethodError on entry) are either at call sites or 3410 // otherwise assume that stack unwinding will be initiated, so 3411 // caller saved registers were assumed volatile in the compiler. 3412 3413 #undef __ 3414 #define __ masm-> 3415 3416 address generate_throw_exception(const char* name, 3417 address runtime_entry, 3418 Register arg1 = noreg, 3419 Register arg2 = noreg) { 3420 // Information about frame layout at time of blocking runtime call. 3421 // Note that we only have to preserve callee-saved registers since 3422 // the compilers are responsible for supplying a continuation point 3423 // if they expect all registers to be preserved. 3424 // n.b. 
aarch64 asserts that frame::arg_reg_save_area_bytes == 0 3425 enum layout { 3426 rfp_off = 0, 3427 rfp_off2, 3428 return_off, 3429 return_off2, 3430 framesize // inclusive of return address 3431 }; 3432 3433 int insts_size = 512; 3434 int locs_size = 64; 3435 3436 CodeBuffer code(name, insts_size, locs_size); 3437 OopMapSet* oop_maps = new OopMapSet(); 3438 MacroAssembler* masm = new MacroAssembler(&code); 3439 3440 address start = __ pc(); 3441 3442 // This is an inlined and slightly modified version of call_VM 3443 // which has the ability to fetch the return PC out of 3444 // thread-local storage and also sets up last_Java_sp slightly 3445 // differently than the real call_VM 3446 3447 __ enter(); // Save FP and LR before call 3448 3449 assert(is_even(framesize/2), "sp not 16-byte aligned"); 3450 3451 // lr and fp are already in place 3452 __ sub(sp, rfp, ((unsigned)framesize-4) << LogBytesPerInt); // prolog 3453 3454 int frame_complete = __ pc() - start; 3455 3456 // Set up last_Java_sp and last_Java_fp 3457 address the_pc = __ pc(); 3458 __ set_last_Java_frame(sp, rfp, (address)NULL, rscratch1); 3459 3460 // Call runtime 3461 if (arg1 != noreg) { 3462 assert(arg2 != c_rarg1, "clobbered"); 3463 __ mov(c_rarg1, arg1); 3464 } 3465 if (arg2 != noreg) { 3466 __ mov(c_rarg2, arg2); 3467 } 3468 __ mov(c_rarg0, rthread); 3469 BLOCK_COMMENT("call runtime_entry"); 3470 __ mov(rscratch1, runtime_entry); 3471 __ blr(rscratch1); 3472 3473 // Generate oop map 3474 OopMap* map = new OopMap(framesize, 0); 3475 3476 oop_maps->add_gc_map(the_pc - start, map); 3477 3478 __ reset_last_Java_frame(true); 3479 __ maybe_isb(); 3480 3481 __ leave(); 3482 3483 // check for pending exceptions 3484 #ifdef ASSERT 3485 Label L; 3486 __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset())); 3487 __ cbnz(rscratch1, L); 3488 __ should_not_reach_here(); 3489 __ bind(L); 3490 #endif // ASSERT 3491 __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry())); 3492 3493 3494 // codeBlob framesize is in words (not VMRegImpl::slot_size) 3495 RuntimeStub* stub = 3496 RuntimeStub::new_runtime_stub(name, 3497 &code, 3498 frame_complete, 3499 (framesize >> (LogBytesPerWord - LogBytesPerInt)), 3500 oop_maps, false); 3501 return stub->entry_point(); 3502 } 3503 3504 class MontgomeryMultiplyGenerator : public MacroAssembler { 3505 3506 Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn, 3507 Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj; 3508 3509 RegSet _toSave; 3510 bool _squaring; 3511 3512 public: 3513 MontgomeryMultiplyGenerator (Assembler *as, bool squaring) 3514 : MacroAssembler(as->code()), _squaring(squaring) { 3515 3516 // Register allocation 3517 3518 Register reg = c_rarg0; 3519 Pa_base = reg; // Argument registers 3520 if (squaring) 3521 Pb_base = Pa_base; 3522 else 3523 Pb_base = ++reg; 3524 Pn_base = ++reg; 3525 Rlen = ++reg; 3526 inv = ++reg; 3527 Pm_base = ++reg; 3528 3529 // Working registers: 3530 Ra = ++reg; // The current digit of a, b, n, and m. 3531 Rb = ++reg; 3532 Rm = ++reg; 3533 Rn = ++reg; 3534 3535 Pa = ++reg; // Pointers to the current/next digit of a, b, n, and m. 3536 Pb = ++reg; 3537 Pm = ++reg; 3538 Pn = ++reg; 3539 3540 t0 = ++reg; // Three registers which form a 3541 t1 = ++reg; // triple-precision accumulator. 3542 t2 = ++reg; 3543 3544 Ri = ++reg; // Inner and outer loop indexes. 3545 Rj = ++reg; 3546 3547 Rhi_ab = ++reg; // Product registers: low and high parts 3548 Rlo_ab = ++reg; // of a*b and m*n.
3549 Rhi_mn = ++reg; 3550 Rlo_mn = ++reg; 3551 3552 // r19 and up are callee-saved. 3553 _toSave = RegSet::range(r19, reg) + Pm_base; 3554 } 3555 3556 private: 3557 void save_regs() { 3558 push(_toSave, sp); 3559 } 3560 3561 void restore_regs() { 3562 pop(_toSave, sp); 3563 } 3564 3565 template <typename T> 3566 void unroll_2(Register count, T block) { 3567 Label loop, end, odd; 3568 tbnz(count, 0, odd); 3569 cbz(count, end); 3570 align(16); 3571 bind(loop); 3572 (this->*block)(); 3573 bind(odd); 3574 (this->*block)(); 3575 subs(count, count, 2); 3576 br(Assembler::GT, loop); 3577 bind(end); 3578 } 3579 3580 template <typename T> 3581 void unroll_2(Register count, T block, Register d, Register s, Register tmp) { 3582 Label loop, end, odd; 3583 tbnz(count, 0, odd); 3584 cbz(count, end); 3585 align(16); 3586 bind(loop); 3587 (this->*block)(d, s, tmp); 3588 bind(odd); 3589 (this->*block)(d, s, tmp); 3590 subs(count, count, 2); 3591 br(Assembler::GT, loop); 3592 bind(end); 3593 } 3594 3595 void pre1(RegisterOrConstant i) { 3596 block_comment("pre1"); 3597 // Pa = Pa_base; 3598 // Pb = Pb_base + i; 3599 // Pm = Pm_base; 3600 // Pn = Pn_base + i; 3601 // Ra = *Pa; 3602 // Rb = *Pb; 3603 // Rm = *Pm; 3604 // Rn = *Pn; 3605 ldr(Ra, Address(Pa_base)); 3606 ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); 3607 ldr(Rm, Address(Pm_base)); 3608 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 3609 lea(Pa, Address(Pa_base)); 3610 lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); 3611 lea(Pm, Address(Pm_base)); 3612 lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 3613 3614 // Zero the m*n result. 3615 mov(Rhi_mn, zr); 3616 mov(Rlo_mn, zr); 3617 } 3618 3619 // The core multiply-accumulate step of a Montgomery 3620 // multiplication. The idea is to schedule operations as a 3621 // pipeline so that instructions with long latencies (loads and 3622 // multiplies) have time to complete before their results are 3623 // used. This most benefits in-order implementations of the 3624 // architecture but out-of-order ones also benefit. 3625 void step() { 3626 block_comment("step"); 3627 // MACC(Ra, Rb, t0, t1, t2); 3628 // Ra = *++Pa; 3629 // Rb = *--Pb; 3630 umulh(Rhi_ab, Ra, Rb); 3631 mul(Rlo_ab, Ra, Rb); 3632 ldr(Ra, pre(Pa, wordSize)); 3633 ldr(Rb, pre(Pb, -wordSize)); 3634 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the 3635 // previous iteration. 
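    // (In the MACC(A, B, t0, t1, t2) shorthand used in these comments,
    // the 128-bit product A*B is added into the triple-precision
    // accumulator t2:t1:t0; see acc() below.  Note that the a*b product
    // computed above is not folded in here: it is accumulated only after
    // the m*n multiply and loads below have been issued, giving those
    // long-latency instructions time to complete.)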
3636 // MACC(Rm, Rn, t0, t1, t2); 3637 // Rm = *++Pm; 3638 // Rn = *--Pn; 3639 umulh(Rhi_mn, Rm, Rn); 3640 mul(Rlo_mn, Rm, Rn); 3641 ldr(Rm, pre(Pm, wordSize)); 3642 ldr(Rn, pre(Pn, -wordSize)); 3643 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 3644 } 3645 3646 void post1() { 3647 block_comment("post1"); 3648 3649 // MACC(Ra, Rb, t0, t1, t2); 3650 // Ra = *++Pa; 3651 // Rb = *--Pb; 3652 umulh(Rhi_ab, Ra, Rb); 3653 mul(Rlo_ab, Ra, Rb); 3654 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 3655 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 3656 3657 // *Pm = Rm = t0 * inv; 3658 mul(Rm, t0, inv); 3659 str(Rm, Address(Pm)); 3660 3661 // MACC(Rm, Rn, t0, t1, t2); 3662 // t0 = t1; t1 = t2; t2 = 0; 3663 umulh(Rhi_mn, Rm, Rn); 3664 3665 #ifndef PRODUCT 3666 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 3667 { 3668 mul(Rlo_mn, Rm, Rn); 3669 add(Rlo_mn, t0, Rlo_mn); 3670 Label ok; 3671 cbz(Rlo_mn, ok); { 3672 stop("broken Montgomery multiply"); 3673 } bind(ok); 3674 } 3675 #endif 3676 // We have very carefully set things up so that 3677 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 3678 // the lower half of Rm * Rn because we know the result already: 3679 // it must be -t0. t0 + (-t0) must generate a carry iff 3680 // t0 != 0. So, rather than do a mul and an adds we just set 3681 // the carry flag iff t0 is nonzero. 3682 // 3683 // mul(Rlo_mn, Rm, Rn); 3684 // adds(zr, t0, Rlo_mn); 3685 subs(zr, t0, 1); // Set carry iff t0 is nonzero 3686 adcs(t0, t1, Rhi_mn); 3687 adc(t1, t2, zr); 3688 mov(t2, zr); 3689 } 3690 3691 void pre2(RegisterOrConstant i, RegisterOrConstant len) { 3692 block_comment("pre2"); 3693 // Pa = Pa_base + i-len; 3694 // Pb = Pb_base + len; 3695 // Pm = Pm_base + i-len; 3696 // Pn = Pn_base + len; 3697 3698 if (i.is_register()) { 3699 sub(Rj, i.as_register(), len); 3700 } else { 3701 mov(Rj, i.as_constant()); 3702 sub(Rj, Rj, len); 3703 } 3704 // Rj == i-len 3705 3706 lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord))); 3707 lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord))); 3708 lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 3709 lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord))); 3710 3711 // Ra = *++Pa; 3712 // Rb = *--Pb; 3713 // Rm = *++Pm; 3714 // Rn = *--Pn; 3715 ldr(Ra, pre(Pa, wordSize)); 3716 ldr(Rb, pre(Pb, -wordSize)); 3717 ldr(Rm, pre(Pm, wordSize)); 3718 ldr(Rn, pre(Pn, -wordSize)); 3719 3720 mov(Rhi_mn, zr); 3721 mov(Rlo_mn, zr); 3722 } 3723 3724 void post2(RegisterOrConstant i, RegisterOrConstant len) { 3725 block_comment("post2"); 3726 if (i.is_constant()) { 3727 mov(Rj, i.as_constant()-len.as_constant()); 3728 } else { 3729 sub(Rj, i.as_register(), len); 3730 } 3731 3732 adds(t0, t0, Rlo_mn); // The pending m*n, low part 3733 3734 // As soon as we know the least significant digit of our result, 3735 // store it. 3736 // Pm_base[i-len] = t0; 3737 str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 3738 3739 // t0 = t1; t1 = t2; t2 = 0; 3740 adcs(t0, t1, Rhi_mn); // The pending m*n, high part 3741 adc(t1, t2, zr); 3742 mov(t2, zr); 3743 } 3744 3745 // A carry in t0 after Montgomery multiplication means that we 3746 // should subtract multiples of n from our result in m. We'll 3747 // keep doing that until there is no carry. 
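  // In C, the subtraction step is approximately as follows.  (A sketch
  // only: the `sub` helper named in the pseudo-code is not defined in
  // this file, so the exact signature here is illustrative.)
  //
  // static unsigned long sub(unsigned long Pm_base[], unsigned long Pn_base[],
  //                          unsigned long t0, int len) {
  //   unsigned long borrow = 0;
  //   for (int i = 0; i < len; i++) {
  //     unsigned long m = Pm_base[i], n = Pn_base[i];
  //     unsigned long d = m - n;
  //     unsigned long b = (m < n) | (d < borrow); // borrow out of this word
  //     Pm_base[i] = d - borrow;
  //     borrow = b;
  //   }
  //   return t0 - borrow;   // fold the final borrow into the carry word
  // }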
3748 void normalize(RegisterOrConstant len) { 3749 block_comment("normalize"); 3750 // while (t0) 3751 // t0 = sub(Pm_base, Pn_base, t0, len); 3752 Label loop, post, again; 3753 Register cnt = t1, i = t2; // Re-use registers; we're done with them now 3754 cbz(t0, post); { 3755 bind(again); { 3756 mov(i, zr); 3757 mov(cnt, len); 3758 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 3759 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 3760 subs(zr, zr, zr); // set carry flag, i.e. no borrow 3761 align(16); 3762 bind(loop); { 3763 sbcs(Rm, Rm, Rn); 3764 str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 3765 add(i, i, 1); 3766 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 3767 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 3768 sub(cnt, cnt, 1); 3769 } cbnz(cnt, loop); 3770 sbc(t0, t0, zr); 3771 } cbnz(t0, again); 3772 } bind(post); 3773 } 3774 3775 // Move memory at s to d, reversing words. 3776 // Increments d to end of copied memory 3777 // Destroys tmp1, tmp2 3778 // Preserves len 3779 // Leaves s pointing to the address which was in d at start 3780 void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) { 3781 assert(tmp1 < r19 && tmp2 < r19, "register corruption"); 3782 3783 lea(s, Address(s, len, Address::uxtw(LogBytesPerWord))); 3784 mov(tmp1, len); 3785 unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2); 3786 sub(s, d, len, ext::uxtw, LogBytesPerWord); 3787 } 3788 // where 3789 void reverse1(Register d, Register s, Register tmp) { 3790 ldr(tmp, pre(s, -wordSize)); 3791 ror(tmp, tmp, 32); 3792 str(tmp, post(d, wordSize)); 3793 } 3794 3795 void step_squaring() { 3796 // An extra ACC 3797 step(); 3798 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 3799 } 3800 3801 void last_squaring(RegisterOrConstant i) { 3802 Label dont; 3803 // if ((i & 1) == 0) { 3804 tbnz(i.as_register(), 0, dont); { 3805 // MACC(Ra, Rb, t0, t1, t2); 3806 // Ra = *++Pa; 3807 // Rb = *--Pb; 3808 umulh(Rhi_ab, Ra, Rb); 3809 mul(Rlo_ab, Ra, Rb); 3810 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 3811 } bind(dont); 3812 } 3813 3814 void extra_step_squaring() { 3815 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 3816 3817 // MACC(Rm, Rn, t0, t1, t2); 3818 // Rm = *++Pm; 3819 // Rn = *--Pn; 3820 umulh(Rhi_mn, Rm, Rn); 3821 mul(Rlo_mn, Rm, Rn); 3822 ldr(Rm, pre(Pm, wordSize)); 3823 ldr(Rn, pre(Pn, -wordSize)); 3824 } 3825 3826 void post1_squaring() { 3827 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 3828 3829 // *Pm = Rm = t0 * inv; 3830 mul(Rm, t0, inv); 3831 str(Rm, Address(Pm)); 3832 3833 // MACC(Rm, Rn, t0, t1, t2); 3834 // t0 = t1; t1 = t2; t2 = 0; 3835 umulh(Rhi_mn, Rm, Rn); 3836 3837 #ifndef PRODUCT 3838 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 3839 { 3840 mul(Rlo_mn, Rm, Rn); 3841 add(Rlo_mn, t0, Rlo_mn); 3842 Label ok; 3843 cbz(Rlo_mn, ok); { 3844 stop("broken Montgomery multiply"); 3845 } bind(ok); 3846 } 3847 #endif 3848 // We have very carefully set things up so that 3849 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 3850 // the lower half of Rm * Rn because we know the result already: 3851 // it must be -t0. t0 + (-t0) must generate a carry iff 3852 // t0 != 0. So, rather than do a mul and an adds we just set 3853 // the carry flag iff t0 is nonzero. 
3854 // 3855 // mul(Rlo_mn, Rm, Rn); 3856 // adds(zr, t0, Rlo_mn); 3857 subs(zr, t0, 1); // Set carry iff t0 is nonzero 3858 adcs(t0, t1, Rhi_mn); 3859 adc(t1, t2, zr); 3860 mov(t2, zr); 3861 } 3862 3863 void acc(Register Rhi, Register Rlo, 3864 Register t0, Register t1, Register t2) { 3865 adds(t0, t0, Rlo); 3866 adcs(t1, t1, Rhi); 3867 adc(t2, t2, zr); 3868 } 3869 3870 public: 3871 /** 3872 * Fast Montgomery multiplication. The derivation of the 3873 * algorithm is in A Cryptographic Library for the Motorola 3874 * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237. 3875 * 3876 * Arguments: 3877 * 3878 * Inputs for multiplication: 3879 * c_rarg0 - int array elements a 3880 * c_rarg1 - int array elements b 3881 * c_rarg2 - int array elements n (the modulus) 3882 * c_rarg3 - int length 3883 * c_rarg4 - int inv 3884 * c_rarg5 - int array elements m (the result) 3885 * 3886 * Inputs for squaring: 3887 * c_rarg0 - int array elements a 3888 * c_rarg1 - int array elements n (the modulus) 3889 * c_rarg2 - int length 3890 * c_rarg3 - int inv 3891 * c_rarg4 - int array elements m (the result) 3892 * 3893 */ 3894 address generate_multiply() { 3895 Label argh, nothing; 3896 bind(argh); 3897 stop("MontgomeryMultiply total_allocation must be <= 8192"); 3898 3899 align(CodeEntryAlignment); 3900 address entry = pc(); 3901 3902 cbzw(Rlen, nothing); 3903 3904 enter(); 3905 3906 // Make room. 3907 cmpw(Rlen, 512); 3908 br(Assembler::HI, argh); 3909 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 3910 andr(sp, Ra, -2 * wordSize); 3911 3912 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 3913 3914 { 3915 // Copy input args, reversing as we go. We use Ra as a 3916 // temporary variable. 3917 reverse(Ra, Pa_base, Rlen, t0, t1); 3918 if (!_squaring) 3919 reverse(Ra, Pb_base, Rlen, t0, t1); 3920 reverse(Ra, Pn_base, Rlen, t0, t1); 3921 } 3922 3923 // Push all call-saved registers and also Pm_base which we'll need 3924 // at the end. 
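    // (save_regs()/restore_regs() push and pop _toSave, which the
    // constructor set to RegSet::range(r19, reg) + Pm_base, i.e. the
    // callee-saved working registers plus the result pointer.)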
3925 save_regs(); 3926 3927 #ifndef PRODUCT 3928 // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply"); 3929 { 3930 ldr(Rn, Address(Pn_base, 0)); 3931 mul(Rlo_mn, Rn, inv); 3932 cmp(Rlo_mn, -1); 3933 Label ok; 3934 br(EQ, ok); { 3935 stop("broken inverse in Montgomery multiply"); 3936 } bind(ok); 3937 } 3938 #endif 3939 3940 mov(Pm_base, Ra); 3941 3942 mov(t0, zr); 3943 mov(t1, zr); 3944 mov(t2, zr); 3945 3946 block_comment("for (int i = 0; i < len; i++) {"); 3947 mov(Ri, zr); { 3948 Label loop, end; 3949 cmpw(Ri, Rlen); 3950 br(Assembler::GE, end); 3951 3952 bind(loop); 3953 pre1(Ri); 3954 3955 block_comment(" for (j = i; j; j--) {"); { 3956 movw(Rj, Ri); 3957 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 3958 } block_comment(" } // j"); 3959 3960 post1(); 3961 addw(Ri, Ri, 1); 3962 cmpw(Ri, Rlen); 3963 br(Assembler::LT, loop); 3964 bind(end); 3965 block_comment("} // i"); 3966 } 3967 3968 block_comment("for (int i = len; i < 2*len; i++) {"); 3969 mov(Ri, Rlen); { 3970 Label loop, end; 3971 cmpw(Ri, Rlen, Assembler::LSL, 1); 3972 br(Assembler::GE, end); 3973 3974 bind(loop); 3975 pre2(Ri, Rlen); 3976 3977 block_comment(" for (j = len*2-i-1; j; j--) {"); { 3978 lslw(Rj, Rlen, 1); 3979 subw(Rj, Rj, Ri); 3980 subw(Rj, Rj, 1); 3981 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 3982 } block_comment(" } // j"); 3983 3984 post2(Ri, Rlen); 3985 addw(Ri, Ri, 1); 3986 cmpw(Ri, Rlen, Assembler::LSL, 1); 3987 br(Assembler::LT, loop); 3988 bind(end); 3989 } 3990 block_comment("} // i"); 3991 3992 normalize(Rlen); 3993 3994 mov(Ra, Pm_base); // Save Pm_base in Ra 3995 restore_regs(); // Restore caller's Pm_base 3996 3997 // Copy our result into caller's Pm_base 3998 reverse(Pm_base, Ra, Rlen, t0, t1); 3999 4000 leave(); 4001 bind(nothing); 4002 ret(lr); 4003 4004 return entry; 4005 } 4006 // In C, approximately: 4007 4008 // void 4009 // montgomery_multiply(unsigned long Pa_base[], unsigned long Pb_base[], 4010 // unsigned long Pn_base[], unsigned long Pm_base[], 4011 // unsigned long inv, int len) { 4012 // unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 4013 // unsigned long *Pa, *Pb, *Pn, *Pm; 4014 // unsigned long Ra, Rb, Rn, Rm; 4015 4016 // int i; 4017 4018 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply"); 4019 4020 // for (i = 0; i < len; i++) { 4021 // int j; 4022 4023 // Pa = Pa_base; 4024 // Pb = Pb_base + i; 4025 // Pm = Pm_base; 4026 // Pn = Pn_base + i; 4027 4028 // Ra = *Pa; 4029 // Rb = *Pb; 4030 // Rm = *Pm; 4031 // Rn = *Pn; 4032 4033 // int iters = i; 4034 // for (j = 0; iters--; j++) { 4035 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 4036 // MACC(Ra, Rb, t0, t1, t2); 4037 // Ra = *++Pa; 4038 // Rb = *--Pb; 4039 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 4040 // MACC(Rm, Rn, t0, t1, t2); 4041 // Rm = *++Pm; 4042 // Rn = *--Pn; 4043 // } 4044 4045 // assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be"); 4046 // MACC(Ra, Rb, t0, t1, t2); 4047 // *Pm = Rm = t0 * inv; 4048 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be"); 4049 // MACC(Rm, Rn, t0, t1, t2); 4050 4051 // assert(t0 == 0, "broken Montgomery multiply"); 4052 4053 // t0 = t1; t1 = t2; t2 = 0; 4054 // } 4055 4056 // for (i = len; i < 2*len; i++) { 4057 // int j; 4058 4059 // Pa = Pa_base + i-len; 4060 // Pb = Pb_base + len; 4061 // Pm = Pm_base + i-len; 4062 // Pn = Pn_base + len; 4063 4064 // Ra = *++Pa; 4065 // Rb = *--Pb; 4066 // Rm = *++Pm; 4067 // Rn = *--Pn; 4068 4069 // int iters = len*2-i-1; 4070 // 
for (j = i-len+1; iters--; j++) { 4071 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 4072 // MACC(Ra, Rb, t0, t1, t2); 4073 // Ra = *++Pa; 4074 // Rb = *--Pb; 4075 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 4076 // MACC(Rm, Rn, t0, t1, t2); 4077 // Rm = *++Pm; 4078 // Rn = *--Pn; 4079 // } 4080 4081 // Pm_base[i-len] = t0; 4082 // t0 = t1; t1 = t2; t2 = 0; 4083 // } 4084 4085 // while (t0) 4086 // t0 = sub(Pm_base, Pn_base, t0, len); 4087 // } 4088 4089 /** 4090 * Fast Montgomery squaring. This uses asymptotically 25% fewer 4091 * multiplies than Montgomery multiplication so it should be up to 4092 * 25% faster. However, its loop control is more complex and it 4093 * may actually run slower on some machines. 4094 * 4095 * Arguments: 4096 * 4097 * Inputs: 4098 * c_rarg0 - int array elements a 4099 * c_rarg1 - int array elements n (the modulus) 4100 * c_rarg2 - int length 4101 * c_rarg3 - int inv 4102 * c_rarg4 - int array elements m (the result) 4103 * 4104 */ 4105 address generate_square() { 4106 Label argh; 4107 bind(argh); 4108 stop("MontgomeryMultiply total_allocation must be <= 8192"); 4109 4110 align(CodeEntryAlignment); 4111 address entry = pc(); 4112 4113 enter(); 4114 4115 // Make room. 4116 cmpw(Rlen, 512); 4117 br(Assembler::HI, argh); 4118 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 4119 andr(sp, Ra, -2 * wordSize); 4120 4121 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 4122 4123 { 4124 // Copy input args, reversing as we go. We use Ra as a 4125 // temporary variable. 4126 reverse(Ra, Pa_base, Rlen, t0, t1); 4127 reverse(Ra, Pn_base, Rlen, t0, t1); 4128 } 4129 4130 // Push all call-saved registers and also Pm_base which we'll need 4131 // at the end. 4132 save_regs(); 4133 4134 mov(Pm_base, Ra); 4135 4136 mov(t0, zr); 4137 mov(t1, zr); 4138 mov(t2, zr); 4139 4140 block_comment("for (int i = 0; i < len; i++) {"); 4141 mov(Ri, zr); { 4142 Label loop, end; 4143 bind(loop); 4144 cmp(Ri, Rlen); 4145 br(Assembler::GE, end); 4146 4147 pre1(Ri); 4148 4149 block_comment("for (j = (i+1)/2; j; j--) {"); { 4150 add(Rj, Ri, 1); 4151 lsr(Rj, Rj, 1); 4152 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 4153 } block_comment(" } // j"); 4154 4155 last_squaring(Ri); 4156 4157 block_comment(" for (j = i/2; j; j--) {"); { 4158 lsr(Rj, Ri, 1); 4159 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 4160 } block_comment(" } // j"); 4161 4162 post1_squaring(); 4163 add(Ri, Ri, 1); 4164 cmp(Ri, Rlen); 4165 br(Assembler::LT, loop); 4166 4167 bind(end); 4168 block_comment("} // i"); 4169 } 4170 4171 block_comment("for (int i = len; i < 2*len; i++) {"); 4172 mov(Ri, Rlen); { 4173 Label loop, end; 4174 bind(loop); 4175 cmp(Ri, Rlen, Assembler::LSL, 1); 4176 br(Assembler::GE, end); 4177 4178 pre2(Ri, Rlen); 4179 4180 block_comment(" for (j = (2*len-i-1)/2; j; j--) {"); { 4181 lsl(Rj, Rlen, 1); 4182 sub(Rj, Rj, Ri); 4183 sub(Rj, Rj, 1); 4184 lsr(Rj, Rj, 1); 4185 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 4186 } block_comment(" } // j"); 4187 4188 last_squaring(Ri); 4189 4190 block_comment(" for (j = (2*len-i)/2; j; j--) {"); { 4191 lsl(Rj, Rlen, 1); 4192 sub(Rj, Rj, Ri); 4193 lsr(Rj, Rj, 1); 4194 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 4195 } block_comment(" } // j"); 4196 4197 post2(Ri, Rlen); 4198 add(Ri, Ri, 1); 4199 cmp(Ri, Rlen, Assembler::LSL, 1); 4200 4201 br(Assembler::LT, loop); 4202 bind(end); 4203 block_comment("} // i"); 4204 } 4205 4206 normalize(Rlen); 4207 4208 mov(Ra, 
Pm_base); // Save Pm_base in Ra 4209 restore_regs(); // Restore caller's Pm_base 4210 4211 // Copy our result into caller's Pm_base 4212 reverse(Pm_base, Ra, Rlen, t0, t1); 4213 4214 leave(); 4215 ret(lr); 4216 4217 return entry; 4218 } 4219 // In C, approximately: 4220 4221 // void 4222 // montgomery_square(unsigned long Pa_base[], unsigned long Pn_base[], 4223 // unsigned long Pm_base[], unsigned long inv, int len) { 4224 // unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 4225 // unsigned long *Pa, *Pb, *Pn, *Pm; 4226 // unsigned long Ra, Rb, Rn, Rm; 4227 4228 // int i; 4229 4230 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply"); 4231 4232 // for (i = 0; i < len; i++) { 4233 // int j; 4234 4235 // Pa = Pa_base; 4236 // Pb = Pa_base + i; 4237 // Pm = Pm_base; 4238 // Pn = Pn_base + i; 4239 4240 // Ra = *Pa; 4241 // Rb = *Pb; 4242 // Rm = *Pm; 4243 // Rn = *Pn; 4244 4245 // int iters = (i+1)/2; 4246 // for (j = 0; iters--; j++) { 4247 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be"); 4248 // MACC2(Ra, Rb, t0, t1, t2); 4249 // Ra = *++Pa; 4250 // Rb = *--Pb; 4251 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 4252 // MACC(Rm, Rn, t0, t1, t2); 4253 // Rm = *++Pm; 4254 // Rn = *--Pn; 4255 // } 4256 // if ((i & 1) == 0) { 4257 // assert(Ra == Pa_base[j], "must be"); 4258 // MACC(Ra, Ra, t0, t1, t2); 4259 // } 4260 // iters = i/2; 4261 // assert(iters == i-j, "must be"); 4262 // for (; iters--; j++) { 4263 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 4264 // MACC(Rm, Rn, t0, t1, t2); 4265 // Rm = *++Pm; 4266 // Rn = *--Pn; 4267 // } 4268 4269 // *Pm = Rm = t0 * inv; 4270 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be"); 4271 // MACC(Rm, Rn, t0, t1, t2); 4272 4273 // assert(t0 == 0, "broken Montgomery multiply"); 4274 4275 // t0 = t1; t1 = t2; t2 = 0; 4276 // } 4277 4278 // for (i = len; i < 2*len; i++) { 4279 // int start = i-len+1; 4280 // int end = start + (len - start)/2; 4281 // int j; 4282 4283 // Pa = Pa_base + i-len; 4284 // Pb = Pa_base + len; 4285 // Pm = Pm_base + i-len; 4286 // Pn = Pn_base + len; 4287 4288 // Ra = *++Pa; 4289 // Rb = *--Pb; 4290 // Rm = *++Pm; 4291 // Rn = *--Pn; 4292 4293 // int iters = (2*len-i-1)/2; 4294 // assert(iters == end-start, "must be"); 4295 // for (j = start; iters--; j++) { 4296 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be"); 4297 // MACC2(Ra, Rb, t0, t1, t2); 4298 // Ra = *++Pa; 4299 // Rb = *--Pb; 4300 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 4301 // MACC(Rm, Rn, t0, t1, t2); 4302 // Rm = *++Pm; 4303 // Rn = *--Pn; 4304 // } 4305 // if ((i & 1) == 0) { 4306 // assert(Ra == Pa_base[j], "must be"); 4307 // MACC(Ra, Ra, t0, t1, t2); 4308 // } 4309 // iters = (2*len-i)/2; 4310 // assert(iters == len-j, "must be"); 4311 // for (; iters--; j++) { 4312 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 4313 // MACC(Rm, Rn, t0, t1, t2); 4314 // Rm = *++Pm; 4315 // Rn = *--Pn; 4316 // } 4317 // Pm_base[i-len] = t0; 4318 // t0 = t1; t1 = t2; t2 = 0; 4319 // } 4320 4321 // while (t0) 4322 // t0 = sub(Pm_base, Pn_base, t0, len); 4323 // } 4324 }; 4325 4326 // Initialization 4327 void generate_initial() { 4328 // Generate initial stubs and initialize the entry points 4329 4330 // entry points that exist in all platforms. Note: This is code 4331 // that could be shared among different platforms - however the 4332 // benefit seems to be smaller than the disadvantage of having a 4333 // much more complicated generator
structure. See also comment in 4334 // stubRoutines.hpp. 4335 4336 StubRoutines::_forward_exception_entry = generate_forward_exception(); 4337 4338 StubRoutines::_call_stub_entry = 4339 generate_call_stub(StubRoutines::_call_stub_return_address); 4340 4341 // is referenced by megamorphic call 4342 StubRoutines::_catch_exception_entry = generate_catch_exception(); 4343 4344 // Build this early so it's available for the interpreter. 4345 StubRoutines::_throw_StackOverflowError_entry = 4346 generate_throw_exception("StackOverflowError throw_exception", 4347 CAST_FROM_FN_PTR(address, 4348 SharedRuntime:: 4349 throw_StackOverflowError)); 4350 if (UseCRC32Intrinsics) { 4351 // set table address before stub generation, which uses it 4352 StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table; 4353 StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32(); 4354 } 4355 } 4356 4357 void generate_all() { 4358 // support for verify_oop (must happen after universe_init) 4359 StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop(); 4360 StubRoutines::_throw_AbstractMethodError_entry = 4361 generate_throw_exception("AbstractMethodError throw_exception", 4362 CAST_FROM_FN_PTR(address, 4363 SharedRuntime:: 4364 throw_AbstractMethodError)); 4365 4366 StubRoutines::_throw_IncompatibleClassChangeError_entry = 4367 generate_throw_exception("IncompatibleClassChangeError throw_exception", 4368 CAST_FROM_FN_PTR(address, 4369 SharedRuntime:: 4370 throw_IncompatibleClassChangeError)); 4371 4372 StubRoutines::_throw_NullPointerException_at_call_entry = 4373 generate_throw_exception("NullPointerException at call throw_exception", 4374 CAST_FROM_FN_PTR(address, 4375 SharedRuntime:: 4376 throw_NullPointerException_at_call)); 4377 4378 // arraycopy stubs used by compilers 4379 generate_arraycopy_stubs(); 4380 4381 if (UseMultiplyToLenIntrinsic) { 4382 StubRoutines::_multiplyToLen = generate_multiplyToLen(); 4383 } 4384 4385 if (UseMontgomeryMultiplyIntrinsic) { 4386 StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply"); 4387 MontgomeryMultiplyGenerator g(_masm, /*squaring*/false); 4388 StubRoutines::_montgomeryMultiply = g.generate_multiply(); 4389 } 4390 4391 if (UseMontgomerySquareIntrinsic) { 4392 StubCodeMark mark(this, "StubRoutines", "montgomerySquare"); 4393 MontgomeryMultiplyGenerator g(_masm, /*squaring*/true); 4394 // We use generate_multiply() rather than generate_square() 4395 // because it's faster for the sizes of modulus we care about.
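      // (With /*squaring*/true the generator's constructor aliases Pb_base
      // to Pa_base, so the modulus, length, inv and result arguments each
      // arrive one register earlier, matching the "Inputs for squaring"
      // list documented above generate_multiply().)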
4396 StubRoutines::_montgomerySquare = g.generate_multiply(); 4397 } 4398 4399 if (UseAESIntrinsics) { 4400 StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock(); 4401 StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock(); 4402 StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt(); 4403 StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt(); 4404 } 4405 4406 // generate GHASH intrinsics code 4407 if (UseGHASHIntrinsics) { 4408 StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks(); 4409 } 4410 4411 if (UseSHA1Intrinsics) { 4412 StubRoutines::_sha1_implCompress = generate_sha1_implCompress(false, "sha1_implCompress"); 4413 StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(true, "sha1_implCompressMB"); 4414 } 4415 if (UseSHA256Intrinsics) { 4416 StubRoutines::_sha256_implCompress = generate_sha256_implCompress(false, "sha256_implCompress"); 4417 StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true, "sha256_implCompressMB"); 4418 } 4419 4420 // Safefetch stubs. 4421 generate_safefetch("SafeFetch32", sizeof(int), &StubRoutines::_safefetch32_entry, 4422 &StubRoutines::_safefetch32_fault_pc, 4423 &StubRoutines::_safefetch32_continuation_pc); 4424 generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry, 4425 &StubRoutines::_safefetchN_fault_pc, 4426 &StubRoutines::_safefetchN_continuation_pc); 4427 } 4428 4429 public: 4430 StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) { 4431 if (all) { 4432 generate_all(); 4433 } else { 4434 generate_initial(); 4435 } 4436 } 4437 }; // end class declaration 4438 4439 void StubGenerator_generate(CodeBuffer* code, bool all) { 4440 StubGenerator g(code, all); 4441 }
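// Note on use: StubGenerator_generate is invoked in two phases from
// StubRoutines::initialize1() and StubRoutines::initialize2() (see
// stubRoutines.cpp): the first call passes all == false early in VM
// startup, and the second passes all == true later, after universe
// initialization, which is why the entry points are split between
// generate_initial() and generate_all() above.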