1 /* 2 * Copyright (c) 2003, 2023, Oracle and/or its affiliates. All rights reserved. 3 * Copyright (c) 2014, 2022, Red Hat Inc. All rights reserved. 4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 5 * 6 * This code is free software; you can redistribute it and/or modify it 7 * under the terms of the GNU General Public License version 2 only, as 8 * published by the Free Software Foundation. 9 * 10 * This code is distributed in the hope that it will be useful, but WITHOUT 11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 13 * version 2 for more details (a copy is included in the LICENSE file that 14 * accompanied this code). 15 * 16 * You should have received a copy of the GNU General Public License version 17 * 2 along with this work; if not, write to the Free Software Foundation, 18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 19 * 20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 21 * or visit www.oracle.com if you need additional information or have any 22 * questions. 23 * 24 */ 25 26 #include "precompiled.hpp" 27 #include "asm/macroAssembler.hpp" 28 #include "asm/macroAssembler.inline.hpp" 29 #include "asm/register.hpp" 30 #include "atomic_aarch64.hpp" 31 #include "compiler/oopMap.hpp" 32 #include "gc/shared/barrierSet.hpp" 33 #include "gc/shared/barrierSetAssembler.hpp" 34 #include "gc/shared/gc_globals.hpp" 35 #include "gc/shared/tlab_globals.hpp" 36 #include "interpreter/interpreter.hpp" 37 #include "memory/universe.hpp" 38 #include "nativeInst_aarch64.hpp" 39 #include "oops/instanceOop.hpp" 40 #include "oops/method.hpp" 41 #include "oops/objArrayKlass.hpp" 42 #include "oops/oop.inline.hpp" 43 #include "prims/methodHandles.hpp" 44 #include "prims/upcallLinker.hpp" 45 #include "runtime/atomic.hpp" 46 #include "runtime/continuation.hpp" 47 #include "runtime/continuationEntry.inline.hpp" 48 #include "runtime/frame.inline.hpp" 49 #include "runtime/handles.inline.hpp" 50 #include "runtime/javaThread.hpp" 51 #include "runtime/sharedRuntime.hpp" 52 #include "runtime/stubCodeGenerator.hpp" 53 #include "runtime/stubRoutines.hpp" 54 #include "utilities/align.hpp" 55 #include "utilities/checkedCast.hpp" 56 #include "utilities/globalDefinitions.hpp" 57 #include "utilities/powerOfTwo.hpp" 58 #ifdef COMPILER2 59 #include "opto/runtime.hpp" 60 #endif 61 #if INCLUDE_ZGC 62 #include "gc/z/zThreadLocalData.hpp" 63 #endif 64 65 // Declaration and definition of StubGenerator (no .hpp file). 
66 // For a more detailed description of the stub routine structure 67 // see the comment in stubRoutines.hpp 68 69 #undef __ 70 #define __ _masm-> 71 72 #ifdef PRODUCT 73 #define BLOCK_COMMENT(str) /* nothing */ 74 #else 75 #define BLOCK_COMMENT(str) __ block_comment(str) 76 #endif 77 78 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":") 79 80 // Stub Code definitions 81 82 class StubGenerator: public StubCodeGenerator { 83 private: 84 85 #ifdef PRODUCT 86 #define inc_counter_np(counter) ((void)0) 87 #else 88 void inc_counter_np_(uint& counter) { 89 __ lea(rscratch2, ExternalAddress((address)&counter)); 90 __ ldrw(rscratch1, Address(rscratch2)); 91 __ addw(rscratch1, rscratch1, 1); 92 __ strw(rscratch1, Address(rscratch2)); 93 } 94 #define inc_counter_np(counter) \ 95 BLOCK_COMMENT("inc_counter " #counter); \ 96 inc_counter_np_(counter); 97 #endif 98 99 // Call stubs are used to call Java from C 100 // 101 // Arguments: 102 // c_rarg0: call wrapper address address 103 // c_rarg1: result address 104 // c_rarg2: result type BasicType 105 // c_rarg3: method Method* 106 // c_rarg4: (interpreter) entry point address 107 // c_rarg5: parameters intptr_t* 108 // c_rarg6: parameter size (in words) int 109 // c_rarg7: thread Thread* 110 // 111 // There is no return from the stub itself as any Java result 112 // is written to result 113 // 114 // we save r30 (lr) as the return PC at the base of the frame and 115 // link r29 (fp) below it as the frame pointer installing sp (r31) 116 // into fp. 117 // 118 // we save r0-r7, which accounts for all the c arguments. 119 // 120 // TODO: strictly do we need to save them all? they are treated as 121 // volatile by C so could we omit saving the ones we are going to 122 // place in global registers (thread? method?) or those we only use 123 // during setup of the Java call? 124 // 125 // we don't need to save r8 which C uses as an indirect result location 126 // return register. 127 // 128 // we don't need to save r9-r15 which both C and Java treat as 129 // volatile 130 // 131 // we don't need to save r16-18 because Java does not use them 132 // 133 // we save r19-r28 which Java uses as scratch registers and C 134 // expects to be callee-save 135 // 136 // we save the bottom 64 bits of each value stored in v8-v15; it is 137 // the responsibility of the caller to preserve larger values. 138 // 139 // so the stub frame looks like this when we enter Java code 140 // 141 // [ return_from_Java ] <--- sp 142 // [ argument word n ] 143 // ... 
144 // -29 [ argument word 1 ] 145 // -28 [ saved Floating-point Control Register ] 146 // -26 [ saved v15 ] <--- sp_after_call 147 // -25 [ saved v14 ] 148 // -24 [ saved v13 ] 149 // -23 [ saved v12 ] 150 // -22 [ saved v11 ] 151 // -21 [ saved v10 ] 152 // -20 [ saved v9 ] 153 // -19 [ saved v8 ] 154 // -18 [ saved r28 ] 155 // -17 [ saved r27 ] 156 // -16 [ saved r26 ] 157 // -15 [ saved r25 ] 158 // -14 [ saved r24 ] 159 // -13 [ saved r23 ] 160 // -12 [ saved r22 ] 161 // -11 [ saved r21 ] 162 // -10 [ saved r20 ] 163 // -9 [ saved r19 ] 164 // -8 [ call wrapper (r0) ] 165 // -7 [ result (r1) ] 166 // -6 [ result type (r2) ] 167 // -5 [ method (r3) ] 168 // -4 [ entry point (r4) ] 169 // -3 [ parameters (r5) ] 170 // -2 [ parameter size (r6) ] 171 // -1 [ thread (r7) ] 172 // 0 [ saved fp (r29) ] <--- fp == saved sp (r31) 173 // 1 [ saved lr (r30) ] 174 175 // Call stub stack layout word offsets from fp 176 enum call_stub_layout { 177 sp_after_call_off = -28, 178 179 fpcr_off = sp_after_call_off, 180 d15_off = -26, 181 d13_off = -24, 182 d11_off = -22, 183 d9_off = -20, 184 185 r28_off = -18, 186 r26_off = -16, 187 r24_off = -14, 188 r22_off = -12, 189 r20_off = -10, 190 call_wrapper_off = -8, 191 result_off = -7, 192 result_type_off = -6, 193 method_off = -5, 194 entry_point_off = -4, 195 parameter_size_off = -2, 196 thread_off = -1, 197 fp_f = 0, 198 retaddr_off = 1, 199 }; 200 201 address generate_call_stub(address& return_address) { 202 assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 && 203 (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off, 204 "adjust this code"); 205 206 StubCodeMark mark(this, "StubRoutines", "call_stub"); 207 address start = __ pc(); 208 209 const Address sp_after_call (rfp, sp_after_call_off * wordSize); 210 211 const Address fpcr_save (rfp, fpcr_off * wordSize); 212 const Address call_wrapper (rfp, call_wrapper_off * wordSize); 213 const Address result (rfp, result_off * wordSize); 214 const Address result_type (rfp, result_type_off * wordSize); 215 const Address method (rfp, method_off * wordSize); 216 const Address entry_point (rfp, entry_point_off * wordSize); 217 const Address parameter_size(rfp, parameter_size_off * wordSize); 218 219 const Address thread (rfp, thread_off * wordSize); 220 221 const Address d15_save (rfp, d15_off * wordSize); 222 const Address d13_save (rfp, d13_off * wordSize); 223 const Address d11_save (rfp, d11_off * wordSize); 224 const Address d9_save (rfp, d9_off * wordSize); 225 226 const Address r28_save (rfp, r28_off * wordSize); 227 const Address r26_save (rfp, r26_off * wordSize); 228 const Address r24_save (rfp, r24_off * wordSize); 229 const Address r22_save (rfp, r22_off * wordSize); 230 const Address r20_save (rfp, r20_off * wordSize); 231 232 // stub code 233 234 address aarch64_entry = __ pc(); 235 236 // set up frame and move sp to end of save area 237 __ enter(); 238 __ sub(sp, rfp, -sp_after_call_off * wordSize); 239 240 // save register parameters and Java scratch/global registers 241 // n.b. 
we save thread even though it gets installed in
    // rthread because we want to sanity check rthread later
    __ str(c_rarg7,  thread);
    __ strw(c_rarg6, parameter_size);
    __ stp(c_rarg4, c_rarg5,  entry_point);
    __ stp(c_rarg2, c_rarg3,  result_type);
    __ stp(c_rarg0, c_rarg1,  call_wrapper);

    __ stp(r20, r19,   r20_save);
    __ stp(r22, r21,   r22_save);
    __ stp(r24, r23,   r24_save);
    __ stp(r26, r25,   r26_save);
    __ stp(r28, r27,   r28_save);

    __ stpd(v9,  v8,   d9_save);
    __ stpd(v11, v10,  d11_save);
    __ stpd(v13, v12,  d13_save);
    __ stpd(v15, v14,  d15_save);

    __ get_fpcr(rscratch1);
    __ str(rscratch1, fpcr_save);
    // Set FPCR to the state we need. We do want Round to Nearest. We
    // don't want non-IEEE rounding modes or floating-point traps.
    __ bfi(rscratch1, zr, 22, 4); // Clear DN, FZ, and Rmode
    __ bfi(rscratch1, zr, 8, 5);  // Clear exception-control bits (8-12)
    __ set_fpcr(rscratch1);

    // install Java thread in global register now that we have saved
    // whatever value it held
    __ mov(rthread, c_rarg7);
    // And method
    __ mov(rmethod, c_rarg3);

    // set up the heapbase register
    __ reinit_heapbase();

#ifdef ASSERT
    // make sure we have no pending exceptions
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
      __ cmp(rscratch1, (u1)NULL_WORD);
      __ br(Assembler::EQ, L);
      __ stop("StubRoutines::call_stub: entered with pending exception");
      __ BIND(L);
    }
#endif
    // pass parameters if any
    __ mov(esp, sp);
    __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
    __ andr(sp, rscratch1, -2 * wordSize);

    BLOCK_COMMENT("pass parameters if any");
    Label parameters_done;
    // parameter count is still in c_rarg6
    // and parameter pointer identifying param 1 is in c_rarg5
    __ cbzw(c_rarg6, parameters_done);

    address loop = __ pc();
    __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
    __ subsw(c_rarg6, c_rarg6, 1);
    __ push(rscratch1);
    __ br(Assembler::GT, loop);

    __ BIND(parameters_done);

    // call Java entry -- passing Method* and current sp
    //      rmethod: Method*
    //      r19_sender_sp: sender sp
    BLOCK_COMMENT("call Java function");
    __ mov(r19_sender_sp, sp);
    __ blr(c_rarg4);

    // we do this here because the notify will already have been done
    // if we get to the next instruction via an exception
    //
    // n.b. adding this instruction here affects the calculation of
    // whether or not a routine returns to the call stub (used when
    // doing stack walks) since the normal test is to check the return
    // pc against the address saved below. so we may need to allow for
    // this extra instruction in the check.

    // save current address for use by exception handling code

    return_address = __ pc();

    // store result depending on type (everything that is not
    // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
    // n.b. this assumes Java returns an integral result in r0
    // and a floating result in j_farg0
    // All of j_rargN may be used to return inline type fields so be careful
    // not to clobber those.
    // SharedRuntime::generate_buffered_inline_type_adapter() knows the register
    // assignment of Rresult below.
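    // n.b. r14 and r15 are free to use here: they are neither the
    // integral/floating return registers (r0/j_farg0) nor among the
    // j_rargN registers that may carry scalarized inline-type fields,
    // so dispatching on the result type cannot clobber the return value.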
335 Register Rresult = r14, Rresult_type = r15; 336 __ ldr(Rresult, result); 337 Label is_long, is_float, is_double, check_prim, exit; 338 __ ldr(Rresult_type, result_type); 339 __ cmp(Rresult_type, (u1)T_OBJECT); 340 __ br(Assembler::EQ, check_prim); 341 __ cmp(Rresult_type, (u1)T_LONG); 342 __ br(Assembler::EQ, is_long); 343 __ cmp(Rresult_type, (u1)T_FLOAT); 344 __ br(Assembler::EQ, is_float); 345 __ cmp(Rresult_type, (u1)T_DOUBLE); 346 __ br(Assembler::EQ, is_double); 347 348 // handle T_INT case 349 __ strw(r0, Address(Rresult)); 350 351 __ BIND(exit); 352 353 // pop parameters 354 __ sub(esp, rfp, -sp_after_call_off * wordSize); 355 356 #ifdef ASSERT 357 // verify that threads correspond 358 { 359 Label L, S; 360 __ ldr(rscratch1, thread); 361 __ cmp(rthread, rscratch1); 362 __ br(Assembler::NE, S); 363 __ get_thread(rscratch1); 364 __ cmp(rthread, rscratch1); 365 __ br(Assembler::EQ, L); 366 __ BIND(S); 367 __ stop("StubRoutines::call_stub: threads must correspond"); 368 __ BIND(L); 369 } 370 #endif 371 372 __ pop_cont_fastpath(rthread); 373 374 // restore callee-save registers 375 __ ldpd(v15, v14, d15_save); 376 __ ldpd(v13, v12, d13_save); 377 __ ldpd(v11, v10, d11_save); 378 __ ldpd(v9, v8, d9_save); 379 380 __ ldp(r28, r27, r28_save); 381 __ ldp(r26, r25, r26_save); 382 __ ldp(r24, r23, r24_save); 383 __ ldp(r22, r21, r22_save); 384 __ ldp(r20, r19, r20_save); 385 386 // restore fpcr 387 __ ldr(rscratch1, fpcr_save); 388 __ set_fpcr(rscratch1); 389 390 __ ldp(c_rarg0, c_rarg1, call_wrapper); 391 __ ldrw(c_rarg2, result_type); 392 __ ldr(c_rarg3, method); 393 __ ldp(c_rarg4, c_rarg5, entry_point); 394 __ ldp(c_rarg6, c_rarg7, parameter_size); 395 396 // leave frame and return to caller 397 __ leave(); 398 __ ret(lr); 399 400 // handle return types different from T_INT 401 __ BIND(check_prim); 402 if (InlineTypeReturnedAsFields) { 403 // Check for scalarized return value 404 __ tbz(r0, 0, is_long); 405 // Load pack handler address 406 __ andr(rscratch1, r0, -2); 407 __ ldr(rscratch1, Address(rscratch1, InstanceKlass::adr_inlineklass_fixed_block_offset())); 408 __ ldr(rscratch1, Address(rscratch1, InlineKlass::pack_handler_jobject_offset())); 409 __ blr(rscratch1); 410 __ b(exit); 411 } 412 413 __ BIND(is_long); 414 __ str(r0, Address(Rresult, 0)); 415 __ br(Assembler::AL, exit); 416 417 __ BIND(is_float); 418 __ strs(j_farg0, Address(Rresult, 0)); 419 __ br(Assembler::AL, exit); 420 421 __ BIND(is_double); 422 __ strd(j_farg0, Address(Rresult, 0)); 423 __ br(Assembler::AL, exit); 424 425 return start; 426 } 427 428 // Return point for a Java call if there's an exception thrown in 429 // Java code. The exception is caught and transformed into a 430 // pending exception stored in JavaThread that can be tested from 431 // within the VM. 432 // 433 // Note: Usually the parameters are removed by the callee. In case 434 // of an exception crossing an activation frame boundary, that is 435 // not the case if the callee is compiled code => need to setup the 436 // rsp. 
437 // 438 // r0: exception oop 439 440 address generate_catch_exception() { 441 StubCodeMark mark(this, "StubRoutines", "catch_exception"); 442 address start = __ pc(); 443 444 // same as in generate_call_stub(): 445 const Address sp_after_call(rfp, sp_after_call_off * wordSize); 446 const Address thread (rfp, thread_off * wordSize); 447 448 #ifdef ASSERT 449 // verify that threads correspond 450 { 451 Label L, S; 452 __ ldr(rscratch1, thread); 453 __ cmp(rthread, rscratch1); 454 __ br(Assembler::NE, S); 455 __ get_thread(rscratch1); 456 __ cmp(rthread, rscratch1); 457 __ br(Assembler::EQ, L); 458 __ bind(S); 459 __ stop("StubRoutines::catch_exception: threads must correspond"); 460 __ bind(L); 461 } 462 #endif 463 464 // set pending exception 465 __ verify_oop(r0); 466 467 __ str(r0, Address(rthread, Thread::pending_exception_offset())); 468 __ mov(rscratch1, (address)__FILE__); 469 __ str(rscratch1, Address(rthread, Thread::exception_file_offset())); 470 __ movw(rscratch1, (int)__LINE__); 471 __ strw(rscratch1, Address(rthread, Thread::exception_line_offset())); 472 473 // complete return to VM 474 assert(StubRoutines::_call_stub_return_address != nullptr, 475 "_call_stub_return_address must have been generated before"); 476 __ b(StubRoutines::_call_stub_return_address); 477 478 return start; 479 } 480 481 // Continuation point for runtime calls returning with a pending 482 // exception. The pending exception check happened in the runtime 483 // or native call stub. The pending exception in Thread is 484 // converted into a Java-level exception. 485 // 486 // Contract with Java-level exception handlers: 487 // r0: exception 488 // r3: throwing pc 489 // 490 // NOTE: At entry of this stub, exception-pc must be in LR !! 491 492 // NOTE: this is always used as a jump target within generated code 493 // so it just needs to be generated code with no x86 prolog 494 495 address generate_forward_exception() { 496 StubCodeMark mark(this, "StubRoutines", "forward exception"); 497 address start = __ pc(); 498 499 // Upon entry, LR points to the return address returning into 500 // Java (interpreted or compiled) code; i.e., the return address 501 // becomes the throwing pc. 502 // 503 // Arguments pushed before the runtime call are still on the stack 504 // but the exception handler will reset the stack pointer -> 505 // ignore them. A potential result in registers can be ignored as 506 // well. 507 508 #ifdef ASSERT 509 // make sure this code is only executed if there is a pending exception 510 { 511 Label L; 512 __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset())); 513 __ cbnz(rscratch1, L); 514 __ stop("StubRoutines::forward exception: no pending exception (1)"); 515 __ bind(L); 516 } 517 #endif 518 519 // compute exception handler into r19 520 521 // call the VM to find the handler address associated with the 522 // caller address. pass thread in r0 and caller pc (ret address) 523 // in r1. n.b. the caller pc is in lr, unlike x86 where it is on 524 // the stack. 525 __ mov(c_rarg1, lr); 526 // lr will be trashed by the VM call so we move it to R19 527 // (callee-saved) because we also need to pass it to the handler 528 // returned by this call. 
529 __ mov(r19, lr); 530 BLOCK_COMMENT("call exception_handler_for_return_address"); 531 __ call_VM_leaf(CAST_FROM_FN_PTR(address, 532 SharedRuntime::exception_handler_for_return_address), 533 rthread, c_rarg1); 534 // Reinitialize the ptrue predicate register, in case the external runtime 535 // call clobbers ptrue reg, as we may return to SVE compiled code. 536 __ reinitialize_ptrue(); 537 538 // we should not really care that lr is no longer the callee 539 // address. we saved the value the handler needs in r19 so we can 540 // just copy it to r3. however, the C2 handler will push its own 541 // frame and then calls into the VM and the VM code asserts that 542 // the PC for the frame above the handler belongs to a compiled 543 // Java method. So, we restore lr here to satisfy that assert. 544 __ mov(lr, r19); 545 // setup r0 & r3 & clear pending exception 546 __ mov(r3, r19); 547 __ mov(r19, r0); 548 __ ldr(r0, Address(rthread, Thread::pending_exception_offset())); 549 __ str(zr, Address(rthread, Thread::pending_exception_offset())); 550 551 #ifdef ASSERT 552 // make sure exception is set 553 { 554 Label L; 555 __ cbnz(r0, L); 556 __ stop("StubRoutines::forward exception: no pending exception (2)"); 557 __ bind(L); 558 } 559 #endif 560 561 // continue at exception handler 562 // r0: exception 563 // r3: throwing pc 564 // r19: exception handler 565 __ verify_oop(r0); 566 __ br(r19); 567 568 return start; 569 } 570 571 // Non-destructive plausibility checks for oops 572 // 573 // Arguments: 574 // r0: oop to verify 575 // rscratch1: error message 576 // 577 // Stack after saving c_rarg3: 578 // [tos + 0]: saved c_rarg3 579 // [tos + 1]: saved c_rarg2 580 // [tos + 2]: saved lr 581 // [tos + 3]: saved rscratch2 582 // [tos + 4]: saved r0 583 // [tos + 5]: saved rscratch1 584 address generate_verify_oop() { 585 586 StubCodeMark mark(this, "StubRoutines", "verify_oop"); 587 address start = __ pc(); 588 589 Label exit, error; 590 591 // save c_rarg2 and c_rarg3 592 __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16))); 593 594 // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr())); 595 __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr())); 596 __ ldr(c_rarg3, Address(c_rarg2)); 597 __ add(c_rarg3, c_rarg3, 1); 598 __ str(c_rarg3, Address(c_rarg2)); 599 600 // object is in r0 601 // make sure object is 'reasonable' 602 __ cbz(r0, exit); // if obj is null it is OK 603 604 BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler(); 605 bs_asm->check_oop(_masm, r0, c_rarg2, c_rarg3, error); 606 607 // return if everything seems ok 608 __ bind(exit); 609 610 __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16))); 611 __ ret(lr); 612 613 // handle errors 614 __ bind(error); 615 __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16))); 616 617 __ push(RegSet::range(r0, r29), sp); 618 // debug(char* msg, int64_t pc, int64_t regs[]) 619 __ mov(c_rarg0, rscratch1); // pass address of error message 620 __ mov(c_rarg1, lr); // pass return address 621 __ mov(c_rarg2, sp); // pass address of regs on stack 622 #ifndef PRODUCT 623 assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area"); 624 #endif 625 BLOCK_COMMENT("call MacroAssembler::debug"); 626 __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64)); 627 __ blr(rscratch1); 628 __ hlt(0); 629 630 return start; 631 } 632 633 // Generate indices for iota vector. 
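  // The data emitted below is laid out as one 16-byte block per element
  // type: byte indices 0..15, halfword indices 0..7, word indices 0..3,
  // doubleword indices 0..1, then the same ascending pattern encoded as
  // floats (0.0f..3.0f) and doubles (0.0..1.0). Vector stubs can load the
  // block matching their element size to materialize an index (iota) vector.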
634 address generate_iota_indices(const char *stub_name) { 635 __ align(CodeEntryAlignment); 636 StubCodeMark mark(this, "StubRoutines", stub_name); 637 address start = __ pc(); 638 // B 639 __ emit_data64(0x0706050403020100, relocInfo::none); 640 __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none); 641 // H 642 __ emit_data64(0x0003000200010000, relocInfo::none); 643 __ emit_data64(0x0007000600050004, relocInfo::none); 644 // S 645 __ emit_data64(0x0000000100000000, relocInfo::none); 646 __ emit_data64(0x0000000300000002, relocInfo::none); 647 // D 648 __ emit_data64(0x0000000000000000, relocInfo::none); 649 __ emit_data64(0x0000000000000001, relocInfo::none); 650 // S - FP 651 __ emit_data64(0x3F80000000000000, relocInfo::none); // 0.0f, 1.0f 652 __ emit_data64(0x4040000040000000, relocInfo::none); // 2.0f, 3.0f 653 // D - FP 654 __ emit_data64(0x0000000000000000, relocInfo::none); // 0.0d 655 __ emit_data64(0x3FF0000000000000, relocInfo::none); // 1.0d 656 return start; 657 } 658 659 // The inner part of zero_words(). This is the bulk operation, 660 // zeroing words in blocks, possibly using DC ZVA to do it. The 661 // caller is responsible for zeroing the last few words. 662 // 663 // Inputs: 664 // r10: the HeapWord-aligned base address of an array to zero. 665 // r11: the count in HeapWords, r11 > 0. 666 // 667 // Returns r10 and r11, adjusted for the caller to clear. 668 // r10: the base address of the tail of words left to clear. 669 // r11: the number of words in the tail. 670 // r11 < MacroAssembler::zero_words_block_size. 671 672 address generate_zero_blocks() { 673 Label done; 674 Label base_aligned; 675 676 Register base = r10, cnt = r11; 677 678 __ align(CodeEntryAlignment); 679 StubCodeMark mark(this, "StubRoutines", "zero_blocks"); 680 address start = __ pc(); 681 682 if (UseBlockZeroing) { 683 int zva_length = VM_Version::zva_length(); 684 685 // Ensure ZVA length can be divided by 16. This is required by 686 // the subsequent operations. 687 assert (zva_length % 16 == 0, "Unexpected ZVA Length"); 688 689 __ tbz(base, 3, base_aligned); 690 __ str(zr, Address(__ post(base, 8))); 691 __ sub(cnt, cnt, 1); 692 __ bind(base_aligned); 693 694 // Ensure count >= zva_length * 2 so that it still deserves a zva after 695 // alignment. 696 Label small; 697 int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit); 698 __ subs(rscratch1, cnt, low_limit >> 3); 699 __ br(Assembler::LT, small); 700 __ zero_dcache_blocks(base, cnt); 701 __ bind(small); 702 } 703 704 { 705 // Number of stp instructions we'll unroll 706 const int unroll = 707 MacroAssembler::zero_words_block_size / 2; 708 // Clear the remaining blocks. 709 Label loop; 710 __ subs(cnt, cnt, unroll * 2); 711 __ br(Assembler::LT, done); 712 __ bind(loop); 713 for (int i = 0; i < unroll; i++) 714 __ stp(zr, zr, __ post(base, 16)); 715 __ subs(cnt, cnt, unroll * 2); 716 __ br(Assembler::GE, loop); 717 __ bind(done); 718 __ add(cnt, cnt, unroll * 2); 719 } 720 721 __ ret(lr); 722 723 return start; 724 } 725 726 727 typedef enum { 728 copy_forwards = 1, 729 copy_backwards = -1 730 } copy_direction; 731 732 // Helper object to reduce noise when telling the GC barriers how to perform loads and stores 733 // for arraycopy stubs. 
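  // The helper captures the active BarrierSetAssembler together with the
  // scratch (core and vector) registers the GC barrier is allowed to use,
  // so the copy loops can simply call copy_load_at_16/copy_store_at_32 etc.
  // without repeating the temp-register plumbing at every call site.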
734 class ArrayCopyBarrierSetHelper : StackObj { 735 BarrierSetAssembler* _bs_asm; 736 MacroAssembler* _masm; 737 DecoratorSet _decorators; 738 BasicType _type; 739 Register _gct1; 740 Register _gct2; 741 Register _gct3; 742 FloatRegister _gcvt1; 743 FloatRegister _gcvt2; 744 FloatRegister _gcvt3; 745 746 public: 747 ArrayCopyBarrierSetHelper(MacroAssembler* masm, 748 DecoratorSet decorators, 749 BasicType type, 750 Register gct1, 751 Register gct2, 752 Register gct3, 753 FloatRegister gcvt1, 754 FloatRegister gcvt2, 755 FloatRegister gcvt3) 756 : _bs_asm(BarrierSet::barrier_set()->barrier_set_assembler()), 757 _masm(masm), 758 _decorators(decorators), 759 _type(type), 760 _gct1(gct1), 761 _gct2(gct2), 762 _gct3(gct3), 763 _gcvt1(gcvt1), 764 _gcvt2(gcvt2), 765 _gcvt3(gcvt3) { 766 } 767 768 void copy_load_at_32(FloatRegister dst1, FloatRegister dst2, Address src) { 769 _bs_asm->copy_load_at(_masm, _decorators, _type, 32, 770 dst1, dst2, src, 771 _gct1, _gct2, _gcvt1); 772 } 773 774 void copy_store_at_32(Address dst, FloatRegister src1, FloatRegister src2) { 775 _bs_asm->copy_store_at(_masm, _decorators, _type, 32, 776 dst, src1, src2, 777 _gct1, _gct2, _gct3, _gcvt1, _gcvt2, _gcvt3); 778 } 779 780 void copy_load_at_16(Register dst1, Register dst2, Address src) { 781 _bs_asm->copy_load_at(_masm, _decorators, _type, 16, 782 dst1, dst2, src, 783 _gct1); 784 } 785 786 void copy_store_at_16(Address dst, Register src1, Register src2) { 787 _bs_asm->copy_store_at(_masm, _decorators, _type, 16, 788 dst, src1, src2, 789 _gct1, _gct2, _gct3); 790 } 791 792 void copy_load_at_8(Register dst, Address src) { 793 _bs_asm->copy_load_at(_masm, _decorators, _type, 8, 794 dst, noreg, src, 795 _gct1); 796 } 797 798 void copy_store_at_8(Address dst, Register src) { 799 _bs_asm->copy_store_at(_masm, _decorators, _type, 8, 800 dst, src, noreg, 801 _gct1, _gct2, _gct3); 802 } 803 }; 804 805 // Bulk copy of blocks of 8 words. 806 // 807 // count is a count of words. 808 // 809 // Precondition: count >= 8 810 // 811 // Postconditions: 812 // 813 // The least significant bit of count contains the remaining count 814 // of words to copy. The rest of count is trash. 815 // 816 // s and d are adjusted to point to the remaining words to copy 817 // 818 void generate_copy_longs(DecoratorSet decorators, BasicType type, Label &start, Register s, Register d, Register count, 819 copy_direction direction) { 820 int unit = wordSize * direction; 821 int bias = (UseSIMDForMemoryOps ? 
4:2) * wordSize; 822 823 const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6, 824 t4 = r7, t5 = r11, t6 = r12, t7 = r13; 825 const Register stride = r14; 826 const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10; 827 const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved 828 ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3); 829 830 assert_different_registers(rscratch1, rscratch2, t0, t1, t2, t3, t4, t5, t6, t7); 831 assert_different_registers(s, d, count, rscratch1, rscratch2); 832 833 Label again, drain; 834 const char *stub_name; 835 if (direction == copy_forwards) 836 stub_name = "forward_copy_longs"; 837 else 838 stub_name = "backward_copy_longs"; 839 840 __ align(CodeEntryAlignment); 841 842 StubCodeMark mark(this, "StubRoutines", stub_name); 843 844 __ bind(start); 845 846 Label unaligned_copy_long; 847 if (AvoidUnalignedAccesses) { 848 __ tbnz(d, 3, unaligned_copy_long); 849 } 850 851 if (direction == copy_forwards) { 852 __ sub(s, s, bias); 853 __ sub(d, d, bias); 854 } 855 856 #ifdef ASSERT 857 // Make sure we are never given < 8 words 858 { 859 Label L; 860 __ cmp(count, (u1)8); 861 __ br(Assembler::GE, L); 862 __ stop("genrate_copy_longs called with < 8 words"); 863 __ bind(L); 864 } 865 #endif 866 867 // Fill 8 registers 868 if (UseSIMDForMemoryOps) { 869 bs.copy_load_at_32(v0, v1, Address(s, 4 * unit)); 870 bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit))); 871 } else { 872 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit)); 873 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit)); 874 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit)); 875 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit))); 876 } 877 878 __ subs(count, count, 16); 879 __ br(Assembler::LO, drain); 880 881 int prefetch = PrefetchCopyIntervalInBytes; 882 bool use_stride = false; 883 if (direction == copy_backwards) { 884 use_stride = prefetch > 256; 885 prefetch = -prefetch; 886 if (use_stride) __ mov(stride, prefetch); 887 } 888 889 __ bind(again); 890 891 if (PrefetchCopyIntervalInBytes > 0) 892 __ prfm(use_stride ? 
Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

    if (UseSIMDForMemoryOps) {
      bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
      bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
      bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
      bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
      bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
      bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
      bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
      bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
      bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
      bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
      bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 8);
    __ br(Assembler::HS, again);

    // Drain
    __ bind(drain);
    if (UseSIMDForMemoryOps) {
      bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
      bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
    } else {
      bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
      bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
      bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
      bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
    }

    {
      Label L1, L2;
      __ tbz(count, exact_log2(4), L1);
      if (UseSIMDForMemoryOps) {
        bs.copy_load_at_32(v0, v1, Address(__ pre(s, 4 * unit)));
        bs.copy_store_at_32(Address(__ pre(d, 4 * unit)), v0, v1);
      } else {
        bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
        bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
        bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
        bs.copy_store_at_16(Address(__ pre(d, 4 * unit)), t2, t3);
      }
      __ bind(L1);

      if (direction == copy_forwards) {
        __ add(s, s, bias);
        __ add(d, d, bias);
      }

      __ tbz(count, 1, L2);
      bs.copy_load_at_16(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
      bs.copy_store_at_16(Address(__ adjust(d, 2 * unit, direction == copy_backwards)), t0, t1);
      __ bind(L2);
    }

    __ ret(lr);

    if (AvoidUnalignedAccesses) {
      Label drain, again;
      // Register order for storing. Order is different for backward copy.

      __ bind(unaligned_copy_long);

      // source address is even aligned, target odd aligned
      //
      // when forward copying word pairs we read long pairs at offsets
      // {0, 2, 4, 6} (in long words). when backwards copying we read
      // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
      // address by -2 in the forwards case so we can compute the
      // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
      // or -1.
      //
      // when forward copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
      // zero offset we adjust the destination by -1 which means we
      // have to use offsets {1, 2, 4, 6, 8} * unit for the stores.
      //
      // When backwards copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
      // offsets {1, 3, 5, 7, 8} * unit.
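      // For example, in the forwards case (unit == 1): after the -16/-8
      // bias applied just below, the loads at offsets {2, 4, 6, 8} * 8
      // cover the first 64 source bytes and the stores at offsets
      // {1, 2, 4, 6, 8} * 8 cover the first 64 destination bytes -- one
      // word at each end and three pairs in between.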
      if (direction == copy_forwards) {
        __ sub(s, s, 16);
        __ sub(d, d, 8);
      }

      // Fill 8 registers
      //
      // for forwards copy s was offset by -16 from the original input
      // value of s so the register contents are at these offsets
      // relative to the 64 bit block addressed by that original input
      // and so on for each successive 64 byte block when s is updated
      //
      // t0 at offset 0,  t1 at offset 8
      // t2 at offset 16, t3 at offset 24
      // t4 at offset 32, t5 at offset 40
      // t6 at offset 48, t7 at offset 56

      // for backwards copy s was not offset so the register contents
      // are at these offsets into the preceding 64 byte block
      // relative to that original input and so on for each successive
      // preceding 64 byte block when s is updated. this explains the
      // slightly counter-intuitive looking pattern of register usage
      // in the stp instructions for backwards copy.
      //
      // t0 at offset -16, t1 at offset -8
      // t2 at offset -32, t3 at offset -24
      // t4 at offset -48, t5 at offset -40
      // t6 at offset -64, t7 at offset -56

      bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
      bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
      bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
      bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));

      __ subs(count, count, 16);
      __ br(Assembler::LO, drain);

      int prefetch = PrefetchCopyIntervalInBytes;
      bool use_stride = false;
      if (direction == copy_backwards) {
        use_stride = prefetch > 256;
        prefetch = -prefetch;
        if (use_stride) __ mov(stride, prefetch);
      }

      __ bind(again);

      if (PrefetchCopyIntervalInBytes > 0)
        __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

      if (direction == copy_forwards) {
        // allowing for the offset of -8 the store instructions place
        // registers into the target 64 bit block at the following
        // offsets
        //
        // t0 at offset 0
        // t1 at offset 8,  t2 at offset 16
        // t3 at offset 24, t4 at offset 32
        // t5 at offset 40, t6 at offset 48
        // t7 at offset 56

        bs.copy_store_at_8(Address(d, 1 * unit), t0);
        bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
        bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
        bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
        bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
        bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
        bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
        bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
        bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
      } else {
        // d was not offset when we started so the registers are
        // written into the 64 bit block preceding d with the following
        // offsets
        //
        // t1 at offset -8
        // t3 at offset -24, t0 at offset -16
        // t5 at offset -40, t2 at offset -32
        // t7 at offset -56, t4 at offset -48
        // t6 at offset -64
        //
        // note that this matches the offsets previously noted for the
        // loads

        bs.copy_store_at_8(Address(d, 1 * unit), t1);
        bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
        bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
        bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
        bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
        bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
        bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
        bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
        bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
      }

      __ subs(count, count, 8);
      __ br(Assembler::HS, again);

      // Drain
      //
      // this uses the same pattern of offsets and register arguments
      // as above
      __ bind(drain);
      if (direction == copy_forwards) {
        bs.copy_store_at_8(Address(d, 1 * unit), t0);
        bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
        bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
        bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
        bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
      } else {
        bs.copy_store_at_8(Address(d, 1 * unit), t1);
        bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
        bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
        bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
        bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
      }
      // now we need to copy any remaining part block which may
      // include a 4 word subblock and/or a 2 word subblock.
      // bits 2 and 1 in the count are the tell-tale for whether we
      // have each such subblock
      {
        Label L1, L2;
        __ tbz(count, exact_log2(4), L1);
        // this is the same as above but copying only 4 longs hence
        // with only one intervening stp between the str instructions
        // but note that the offsets and registers still follow the
        // same pattern
        bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
        bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
        if (direction == copy_forwards) {
          bs.copy_store_at_8(Address(d, 1 * unit), t0);
          bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
          bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t3);
        } else {
          bs.copy_store_at_8(Address(d, 1 * unit), t1);
          bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
          bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t2);
        }
        __ bind(L1);

        __ tbz(count, 1, L2);
        // this is the same as above but copying only 2 longs hence
        // there is no intervening stp between the str instructions
        // but note that the offset and register patterns are still
        // the same
        bs.copy_load_at_16(t0, t1, Address(__ pre(s, 2 * unit)));
        if (direction == copy_forwards) {
          bs.copy_store_at_8(Address(d, 1 * unit), t0);
          bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t1);
        } else {
          bs.copy_store_at_8(Address(d, 1 * unit), t1);
          bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t0);
        }
        __ bind(L2);

        // for forwards copy we need to re-adjust the offsets we
        // applied so that s and d follow the last words written

        if (direction == copy_forwards) {
          __ add(s, s, 16);
          __ add(d, d, 8);
        }

      }

      __ ret(lr);
    }
  }

  // Small copy: less than 16 bytes.
  //
  // NB: Ignores all of the bits of count which represent more than 15
  // bytes, so a caller doesn't have to mask them.

  void copy_memory_small(DecoratorSet decorators, BasicType type, Register s, Register d, Register count, int step) {
    bool is_backwards = step < 0;
    size_t granularity = uabs(step);
    int direction = is_backwards ?
-1 : 1; 1154 1155 Label Lword, Lint, Lshort, Lbyte; 1156 1157 assert(granularity 1158 && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small"); 1159 1160 const Register t0 = r3; 1161 const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10; 1162 ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, fnoreg, fnoreg, fnoreg); 1163 1164 // ??? I don't know if this bit-test-and-branch is the right thing 1165 // to do. It does a lot of jumping, resulting in several 1166 // mispredicted branches. It might make more sense to do this 1167 // with something like Duff's device with a single computed branch. 1168 1169 __ tbz(count, 3 - exact_log2(granularity), Lword); 1170 bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards))); 1171 bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0); 1172 __ bind(Lword); 1173 1174 if (granularity <= sizeof (jint)) { 1175 __ tbz(count, 2 - exact_log2(granularity), Lint); 1176 __ ldrw(t0, Address(__ adjust(s, sizeof (jint) * direction, is_backwards))); 1177 __ strw(t0, Address(__ adjust(d, sizeof (jint) * direction, is_backwards))); 1178 __ bind(Lint); 1179 } 1180 1181 if (granularity <= sizeof (jshort)) { 1182 __ tbz(count, 1 - exact_log2(granularity), Lshort); 1183 __ ldrh(t0, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards))); 1184 __ strh(t0, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards))); 1185 __ bind(Lshort); 1186 } 1187 1188 if (granularity <= sizeof (jbyte)) { 1189 __ tbz(count, 0, Lbyte); 1190 __ ldrb(t0, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards))); 1191 __ strb(t0, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards))); 1192 __ bind(Lbyte); 1193 } 1194 } 1195 1196 Label copy_f, copy_b; 1197 Label copy_obj_f, copy_obj_b; 1198 Label copy_obj_uninit_f, copy_obj_uninit_b; 1199 1200 // All-singing all-dancing memory copy. 1201 // 1202 // Copy count units of memory from s to d. The size of a unit is 1203 // step, which can be positive or negative depending on the direction 1204 // of copy. If is_aligned is false, we align the source address. 1205 // 1206 1207 void copy_memory(DecoratorSet decorators, BasicType type, bool is_aligned, 1208 Register s, Register d, Register count, int step) { 1209 copy_direction direction = step < 0 ? copy_backwards : copy_forwards; 1210 bool is_backwards = step < 0; 1211 unsigned int granularity = uabs(step); 1212 const Register t0 = r3, t1 = r4; 1213 1214 // <= 80 (or 96 for SIMD) bytes do inline. Direction doesn't matter because we always 1215 // load all the data before writing anything 1216 Label copy4, copy8, copy16, copy32, copy80, copy_big, finish; 1217 const Register t2 = r5, t3 = r6, t4 = r7, t5 = r11; 1218 const Register t6 = r12, t7 = r13, t8 = r14, t9 = r15; 1219 const Register send = r17, dend = r16; 1220 const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10; 1221 const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved 1222 ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3); 1223 1224 if (PrefetchCopyIntervalInBytes > 0) 1225 __ prfm(Address(s, 0), PLDL1KEEP); 1226 __ cmp(count, u1((UseSIMDForMemoryOps ? 
96:80)/granularity));
    __ br(Assembler::HI, copy_big);

    __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
    __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));

    __ cmp(count, u1(16/granularity));
    __ br(Assembler::LS, copy16);

    __ cmp(count, u1(64/granularity));
    __ br(Assembler::HI, copy80);

    __ cmp(count, u1(32/granularity));
    __ br(Assembler::LS, copy32);

    // 33..64 bytes
    if (UseSIMDForMemoryOps) {
      bs.copy_load_at_32(v0, v1, Address(s, 0));
      bs.copy_load_at_32(v2, v3, Address(send, -32));
      bs.copy_store_at_32(Address(d, 0), v0, v1);
      bs.copy_store_at_32(Address(dend, -32), v2, v3);
    } else {
      bs.copy_load_at_16(t0, t1, Address(s, 0));
      bs.copy_load_at_16(t2, t3, Address(s, 16));
      bs.copy_load_at_16(t4, t5, Address(send, -32));
      bs.copy_load_at_16(t6, t7, Address(send, -16));

      bs.copy_store_at_16(Address(d, 0), t0, t1);
      bs.copy_store_at_16(Address(d, 16), t2, t3);
      bs.copy_store_at_16(Address(dend, -32), t4, t5);
      bs.copy_store_at_16(Address(dend, -16), t6, t7);
    }
    __ b(finish);

    // 17..32 bytes
    __ bind(copy32);
    bs.copy_load_at_16(t0, t1, Address(s, 0));
    bs.copy_load_at_16(t6, t7, Address(send, -16));

    bs.copy_store_at_16(Address(d, 0), t0, t1);
    bs.copy_store_at_16(Address(dend, -16), t6, t7);
    __ b(finish);

    // 65..80/96 bytes
    // (96 bytes if SIMD because we do 32 bytes per instruction)
    __ bind(copy80);
    if (UseSIMDForMemoryOps) {
      bs.copy_load_at_32(v0, v1, Address(s, 0));
      bs.copy_load_at_32(v2, v3, Address(s, 32));
      // Unaligned pointers can be an issue for copying.
      // The issue is more likely to occur when the granularity of the data is
      // less than 4 (sizeof(jint)). Pointers to arrays of jint are at least
      // 4 byte aligned. Pointers to arrays of jlong are 8 byte aligned.
      // The largest performance drop has been seen for the range 65-80 bytes.
      // For such cases, using the pair of ldp/stp instead of the third pair of
      // ldpq/stpq fixes the performance issue.
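      // In practice that special case can only be taken for byte and short
      // element copies (granularity < 4); int and long copies always use the
      // three ldpq/stpq pairs below for this size range.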
1282 if (granularity < sizeof (jint)) { 1283 Label copy96; 1284 __ cmp(count, u1(80/granularity)); 1285 __ br(Assembler::HI, copy96); 1286 bs.copy_load_at_16(t0, t1, Address(send, -16)); 1287 1288 bs.copy_store_at_32(Address(d, 0), v0, v1); 1289 bs.copy_store_at_32(Address(d, 32), v2, v3); 1290 1291 bs.copy_store_at_16(Address(dend, -16), t0, t1); 1292 __ b(finish); 1293 1294 __ bind(copy96); 1295 } 1296 bs.copy_load_at_32(v4, v5, Address(send, -32)); 1297 1298 bs.copy_store_at_32(Address(d, 0), v0, v1); 1299 bs.copy_store_at_32(Address(d, 32), v2, v3); 1300 1301 bs.copy_store_at_32(Address(dend, -32), v4, v5); 1302 } else { 1303 bs.copy_load_at_16(t0, t1, Address(s, 0)); 1304 bs.copy_load_at_16(t2, t3, Address(s, 16)); 1305 bs.copy_load_at_16(t4, t5, Address(s, 32)); 1306 bs.copy_load_at_16(t6, t7, Address(s, 48)); 1307 bs.copy_load_at_16(t8, t9, Address(send, -16)); 1308 1309 bs.copy_store_at_16(Address(d, 0), t0, t1); 1310 bs.copy_store_at_16(Address(d, 16), t2, t3); 1311 bs.copy_store_at_16(Address(d, 32), t4, t5); 1312 bs.copy_store_at_16(Address(d, 48), t6, t7); 1313 bs.copy_store_at_16(Address(dend, -16), t8, t9); 1314 } 1315 __ b(finish); 1316 1317 // 0..16 bytes 1318 __ bind(copy16); 1319 __ cmp(count, u1(8/granularity)); 1320 __ br(Assembler::LO, copy8); 1321 1322 // 8..16 bytes 1323 bs.copy_load_at_8(t0, Address(s, 0)); 1324 bs.copy_load_at_8(t1, Address(send, -8)); 1325 bs.copy_store_at_8(Address(d, 0), t0); 1326 bs.copy_store_at_8(Address(dend, -8), t1); 1327 __ b(finish); 1328 1329 if (granularity < 8) { 1330 // 4..7 bytes 1331 __ bind(copy8); 1332 __ tbz(count, 2 - exact_log2(granularity), copy4); 1333 __ ldrw(t0, Address(s, 0)); 1334 __ ldrw(t1, Address(send, -4)); 1335 __ strw(t0, Address(d, 0)); 1336 __ strw(t1, Address(dend, -4)); 1337 __ b(finish); 1338 if (granularity < 4) { 1339 // 0..3 bytes 1340 __ bind(copy4); 1341 __ cbz(count, finish); // get rid of 0 case 1342 if (granularity == 2) { 1343 __ ldrh(t0, Address(s, 0)); 1344 __ strh(t0, Address(d, 0)); 1345 } else { // granularity == 1 1346 // Now 1..3 bytes. Handle the 1 and 2 byte case by copying 1347 // the first and last byte. 1348 // Handle the 3 byte case by loading and storing base + count/2 1349 // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1)) 1350 // This does means in the 1 byte case we load/store the same 1351 // byte 3 times. 1352 __ lsr(count, count, 1); 1353 __ ldrb(t0, Address(s, 0)); 1354 __ ldrb(t1, Address(send, -1)); 1355 __ ldrb(t2, Address(s, count)); 1356 __ strb(t0, Address(d, 0)); 1357 __ strb(t1, Address(dend, -1)); 1358 __ strb(t2, Address(d, count)); 1359 } 1360 __ b(finish); 1361 } 1362 } 1363 1364 __ bind(copy_big); 1365 if (is_backwards) { 1366 __ lea(s, Address(s, count, Address::lsl(exact_log2(-step)))); 1367 __ lea(d, Address(d, count, Address::lsl(exact_log2(-step)))); 1368 } 1369 1370 // Now we've got the small case out of the way we can align the 1371 // source address on a 2-word boundary. 1372 1373 // Here we will materialize a count in r15, which is used by copy_memory_small 1374 // and the various generate_copy_longs stubs that we use for 2 word aligned bytes. 1375 // Up until here, we have used t9, which aliases r15, but from here on, that register 1376 // can not be used as a temp register, as it contains the count. 1377 1378 Label aligned; 1379 1380 if (is_aligned) { 1381 // We may have to adjust by 1 word to get s 2-word-aligned. 
1382 __ tbz(s, exact_log2(wordSize), aligned); 1383 bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards))); 1384 bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0); 1385 __ sub(count, count, wordSize/granularity); 1386 } else { 1387 if (is_backwards) { 1388 __ andr(r15, s, 2 * wordSize - 1); 1389 } else { 1390 __ neg(r15, s); 1391 __ andr(r15, r15, 2 * wordSize - 1); 1392 } 1393 // r15 is the byte adjustment needed to align s. 1394 __ cbz(r15, aligned); 1395 int shift = exact_log2(granularity); 1396 if (shift) __ lsr(r15, r15, shift); 1397 __ sub(count, count, r15); 1398 1399 #if 0 1400 // ?? This code is only correct for a disjoint copy. It may or 1401 // may not make sense to use it in that case. 1402 1403 // Copy the first pair; s and d may not be aligned. 1404 __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0)); 1405 __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0)); 1406 1407 // Align s and d, adjust count 1408 if (is_backwards) { 1409 __ sub(s, s, r15); 1410 __ sub(d, d, r15); 1411 } else { 1412 __ add(s, s, r15); 1413 __ add(d, d, r15); 1414 } 1415 #else 1416 copy_memory_small(decorators, type, s, d, r15, step); 1417 #endif 1418 } 1419 1420 __ bind(aligned); 1421 1422 // s is now 2-word-aligned. 1423 1424 // We have a count of units and some trailing bytes. Adjust the 1425 // count and do a bulk copy of words. 1426 __ lsr(r15, count, exact_log2(wordSize/granularity)); 1427 if (direction == copy_forwards) { 1428 if (type != T_OBJECT) { 1429 __ bl(copy_f); 1430 } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) { 1431 __ bl(copy_obj_uninit_f); 1432 } else { 1433 __ bl(copy_obj_f); 1434 } 1435 } else { 1436 if (type != T_OBJECT) { 1437 __ bl(copy_b); 1438 } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) { 1439 __ bl(copy_obj_uninit_b); 1440 } else { 1441 __ bl(copy_obj_b); 1442 } 1443 } 1444 1445 // And the tail. 1446 copy_memory_small(decorators, type, s, d, count, step); 1447 1448 if (granularity >= 8) __ bind(copy8); 1449 if (granularity >= 4) __ bind(copy4); 1450 __ bind(finish); 1451 } 1452 1453 1454 void clobber_registers() { 1455 #ifdef ASSERT 1456 RegSet clobbered 1457 = MacroAssembler::call_clobbered_gp_registers() - rscratch1; 1458 __ mov(rscratch1, (uint64_t)0xdeadbeef); 1459 __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32); 1460 for (RegSetIterator<Register> it = clobbered.begin(); *it != noreg; ++it) { 1461 __ mov(*it, rscratch1); 1462 } 1463 #endif 1464 1465 } 1466 1467 // Scan over array at a for count oops, verifying each one. 1468 // Preserves a and count, clobbers rscratch1 and rscratch2. 
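  // n.b. the arraycopy stubs below only call this under VerifyOops, so the
  // extra pass over the destination array is a debug-only cost.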
1469 void verify_oop_array (int size, Register a, Register count, Register temp) { 1470 Label loop, end; 1471 __ mov(rscratch1, a); 1472 __ mov(rscratch2, zr); 1473 __ bind(loop); 1474 __ cmp(rscratch2, count); 1475 __ br(Assembler::HS, end); 1476 if (size == wordSize) { 1477 __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size)))); 1478 __ verify_oop(temp); 1479 } else { 1480 __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size)))); 1481 __ decode_heap_oop(temp); // calls verify_oop 1482 } 1483 __ add(rscratch2, rscratch2, 1); 1484 __ b(loop); 1485 __ bind(end); 1486 } 1487 1488 // Arguments: 1489 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1490 // ignored 1491 // is_oop - true => oop array, so generate store check code 1492 // name - stub name string 1493 // 1494 // Inputs: 1495 // c_rarg0 - source array address 1496 // c_rarg1 - destination array address 1497 // c_rarg2 - element count, treated as ssize_t, can be zero 1498 // 1499 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1500 // the hardware handle it. The two dwords within qwords that span 1501 // cache line boundaries will still be loaded and stored atomically. 1502 // 1503 // Side Effects: 1504 // disjoint_int_copy_entry is set to the no-overlap entry point 1505 // used by generate_conjoint_int_oop_copy(). 1506 // 1507 address generate_disjoint_copy(int size, bool aligned, bool is_oop, address *entry, 1508 const char *name, bool dest_uninitialized = false) { 1509 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 1510 RegSet saved_reg = RegSet::of(s, d, count); 1511 __ align(CodeEntryAlignment); 1512 StubCodeMark mark(this, "StubRoutines", name); 1513 address start = __ pc(); 1514 __ enter(); 1515 1516 if (entry != nullptr) { 1517 *entry = __ pc(); 1518 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 1519 BLOCK_COMMENT("Entry:"); 1520 } 1521 1522 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT; 1523 if (dest_uninitialized) { 1524 decorators |= IS_DEST_UNINITIALIZED; 1525 } 1526 if (aligned) { 1527 decorators |= ARRAYCOPY_ALIGNED; 1528 } 1529 1530 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1531 bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg); 1532 1533 if (is_oop) { 1534 // save regs before copy_memory 1535 __ push(RegSet::of(d, count), sp); 1536 } 1537 { 1538 // UnsafeCopyMemory page error: continue after ucm 1539 bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size); 1540 UnsafeCopyMemoryMark ucmm(this, add_entry, true); 1541 copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, size); 1542 } 1543 1544 if (is_oop) { 1545 __ pop(RegSet::of(d, count), sp); 1546 if (VerifyOops) 1547 verify_oop_array(size, d, count, r16); 1548 } 1549 1550 bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet()); 1551 1552 __ leave(); 1553 __ mov(r0, zr); // return 0 1554 __ ret(lr); 1555 return start; 1556 } 1557 1558 // Arguments: 1559 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1560 // ignored 1561 // is_oop - true => oop array, so generate store check code 1562 // name - stub name string 1563 // 1564 // Inputs: 1565 // c_rarg0 - source array address 1566 // c_rarg1 - destination array address 1567 // c_rarg2 - element count, treated as ssize_t, can be zero 1568 // 1569 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1570 // the hardware handle it. 
The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  address generate_conjoint_copy(int size, bool aligned, bool is_oop, address nooverlap_target,
                                 address *entry, const char *name,
                                 bool dest_uninitialized = false) {
    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
    RegSet saved_regs = RegSet::of(s, d, count);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    __ enter();

    if (entry != nullptr) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    // use fwd copy when (d-s) above_equal (count*size)
    __ sub(rscratch1, d, s);
    __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
    __ br(Assembler::HS, nooverlap_target);

    DecoratorSet decorators = IN_HEAP | IS_ARRAY;
    if (dest_uninitialized) {
      decorators |= IS_DEST_UNINITIALIZED;
    }
    if (aligned) {
      decorators |= ARRAYCOPY_ALIGNED;
    }

    BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
    bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);

    if (is_oop) {
      // save regs before copy_memory
      __ push(RegSet::of(d, count), sp);
    }
    {
      // UnsafeCopyMemory page error: continue after ucm
      bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
      UnsafeCopyMemoryMark ucmm(this, add_entry, true);
      copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, -size);
    }
    if (is_oop) {
      __ pop(RegSet::of(d, count), sp);
      if (VerifyOops)
        verify_oop_array(size, d, count, r16);
    }
    bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
    __ leave();
    __ mov(r0, zr); // return 0
    __ ret(lr);
    return start;
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
  // we let the hardware handle it.  The one to eight bytes within words,
  // dwords or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  // Side Effects:
  //   disjoint_byte_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_byte_copy().
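  // Each of the byte/short/int/long entry points that follow simply
  // delegates to generate_disjoint_copy()/generate_conjoint_copy() with the
  // element size (1, 2, 4 or 8 bytes); only the oop variants differ, by
  // passing is_oop = true so that GC barriers and oop verification apply.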
1651 // 1652 address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) { 1653 const bool not_oop = false; 1654 return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name); 1655 } 1656 1657 // Arguments: 1658 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1659 // ignored 1660 // name - stub name string 1661 // 1662 // Inputs: 1663 // c_rarg0 - source array address 1664 // c_rarg1 - destination array address 1665 // c_rarg2 - element count, treated as ssize_t, can be zero 1666 // 1667 // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries, 1668 // we let the hardware handle it. The one to eight bytes within words, 1669 // dwords or qwords that span cache line boundaries will still be loaded 1670 // and stored atomically. 1671 // 1672 address generate_conjoint_byte_copy(bool aligned, address nooverlap_target, 1673 address* entry, const char *name) { 1674 const bool not_oop = false; 1675 return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name); 1676 } 1677 1678 // Arguments: 1679 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1680 // ignored 1681 // name - stub name string 1682 // 1683 // Inputs: 1684 // c_rarg0 - source array address 1685 // c_rarg1 - destination array address 1686 // c_rarg2 - element count, treated as ssize_t, can be zero 1687 // 1688 // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we 1689 // let the hardware handle it. The two or four words within dwords 1690 // or qwords that span cache line boundaries will still be loaded 1691 // and stored atomically. 1692 // 1693 // Side Effects: 1694 // disjoint_short_copy_entry is set to the no-overlap entry point 1695 // used by generate_conjoint_short_copy(). 1696 // 1697 address generate_disjoint_short_copy(bool aligned, 1698 address* entry, const char *name) { 1699 const bool not_oop = false; 1700 return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name); 1701 } 1702 1703 // Arguments: 1704 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1705 // ignored 1706 // name - stub name string 1707 // 1708 // Inputs: 1709 // c_rarg0 - source array address 1710 // c_rarg1 - destination array address 1711 // c_rarg2 - element count, treated as ssize_t, can be zero 1712 // 1713 // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we 1714 // let the hardware handle it. The two or four words within dwords 1715 // or qwords that span cache line boundaries will still be loaded 1716 // and stored atomically. 1717 // 1718 address generate_conjoint_short_copy(bool aligned, address nooverlap_target, 1719 address *entry, const char *name) { 1720 const bool not_oop = false; 1721 return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name); 1722 1723 } 1724 // Arguments: 1725 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1726 // ignored 1727 // name - stub name string 1728 // 1729 // Inputs: 1730 // c_rarg0 - source array address 1731 // c_rarg1 - destination array address 1732 // c_rarg2 - element count, treated as ssize_t, can be zero 1733 // 1734 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1735 // the hardware handle it. The two dwords within qwords that span 1736 // cache line boundaries will still be loaded and stored atomically. 
1737 // 1738 // Side Effects: 1739 // disjoint_int_copy_entry is set to the no-overlap entry point 1740 // used by generate_conjoint_int_oop_copy(). 1741 // 1742 address generate_disjoint_int_copy(bool aligned, address *entry, 1743 const char *name, bool dest_uninitialized = false) { 1744 const bool not_oop = false; 1745 return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name); 1746 } 1747 1748 // Arguments: 1749 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1750 // ignored 1751 // name - stub name string 1752 // 1753 // Inputs: 1754 // c_rarg0 - source array address 1755 // c_rarg1 - destination array address 1756 // c_rarg2 - element count, treated as ssize_t, can be zero 1757 // 1758 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1759 // the hardware handle it. The two dwords within qwords that span 1760 // cache line boundaries will still be loaded and stored atomically. 1761 // 1762 address generate_conjoint_int_copy(bool aligned, address nooverlap_target, 1763 address *entry, const char *name, 1764 bool dest_uninitialized = false) { 1765 const bool not_oop = false; 1766 return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name); 1767 } 1768 1769 1770 // Arguments: 1771 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1772 // ignored 1773 // name - stub name string 1774 // 1775 // Inputs: 1776 // c_rarg0 - source array address 1777 // c_rarg1 - destination array address 1778 // c_rarg2 - element count, treated as size_t, can be zero 1779 // 1780 // Side Effects: 1781 // disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the 1782 // no-overlap entry point used by generate_conjoint_long_oop_copy(). 1783 // 1784 address generate_disjoint_long_copy(bool aligned, address *entry, 1785 const char *name, bool dest_uninitialized = false) { 1786 const bool not_oop = false; 1787 return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name); 1788 } 1789 1790 // Arguments: 1791 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1792 // ignored 1793 // name - stub name string 1794 // 1795 // Inputs: 1796 // c_rarg0 - source array address 1797 // c_rarg1 - destination array address 1798 // c_rarg2 - element count, treated as size_t, can be zero 1799 // 1800 address generate_conjoint_long_copy(bool aligned, 1801 address nooverlap_target, address *entry, 1802 const char *name, bool dest_uninitialized = false) { 1803 const bool not_oop = false; 1804 return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name); 1805 } 1806 1807 // Arguments: 1808 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1809 // ignored 1810 // name - stub name string 1811 // 1812 // Inputs: 1813 // c_rarg0 - source array address 1814 // c_rarg1 - destination array address 1815 // c_rarg2 - element count, treated as size_t, can be zero 1816 // 1817 // Side Effects: 1818 // disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the 1819 // no-overlap entry point used by generate_conjoint_long_oop_copy(). 1820 // 1821 address generate_disjoint_oop_copy(bool aligned, address *entry, 1822 const char *name, bool dest_uninitialized) { 1823 const bool is_oop = true; 1824 const int size = UseCompressedOops ? 
sizeof (jint) : sizeof (jlong); 1825 return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized); 1826 } 1827 1828 // Arguments: 1829 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1830 // ignored 1831 // name - stub name string 1832 // 1833 // Inputs: 1834 // c_rarg0 - source array address 1835 // c_rarg1 - destination array address 1836 // c_rarg2 - element count, treated as size_t, can be zero 1837 // 1838 address generate_conjoint_oop_copy(bool aligned, 1839 address nooverlap_target, address *entry, 1840 const char *name, bool dest_uninitialized) { 1841 const bool is_oop = true; 1842 const int size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1843 return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry, 1844 name, dest_uninitialized); 1845 } 1846 1847 1848 // Helper for generating a dynamic type check. 1849 // Smashes rscratch1, rscratch2. 1850 void generate_type_check(Register sub_klass, 1851 Register super_check_offset, 1852 Register super_klass, 1853 Label& L_success) { 1854 assert_different_registers(sub_klass, super_check_offset, super_klass); 1855 1856 BLOCK_COMMENT("type_check:"); 1857 1858 Label L_miss; 1859 1860 __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, nullptr, 1861 super_check_offset); 1862 __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, nullptr); 1863 1864 // Fall through on failure! 1865 __ BIND(L_miss); 1866 } 1867 1868 // 1869 // Generate checkcasting array copy stub 1870 // 1871 // Input: 1872 // c_rarg0 - source array address 1873 // c_rarg1 - destination array address 1874 // c_rarg2 - element count, treated as ssize_t, can be zero 1875 // c_rarg3 - size_t ckoff (super_check_offset) 1876 // c_rarg4 - oop ckval (super_klass) 1877 // 1878 // Output: 1879 // r0 == 0 - success 1880 // r0 == -1^K - failure, where K is partial transfer count 1881 // 1882 address generate_checkcast_copy(const char *name, address *entry, 1883 bool dest_uninitialized = false) { 1884 1885 Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop; 1886 1887 // Input registers (after setup_arg_regs) 1888 const Register from = c_rarg0; // source array address 1889 const Register to = c_rarg1; // destination array address 1890 const Register count = c_rarg2; // elements count 1891 const Register ckoff = c_rarg3; // super_check_offset 1892 const Register ckval = c_rarg4; // super_klass 1893 1894 RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4); 1895 RegSet wb_post_saved_regs = RegSet::of(count); 1896 1897 // Registers used as temps (r19, r20, r21, r22 are save-on-entry) 1898 const Register copied_oop = r22; // actual oop copied 1899 const Register count_save = r21; // orig elements count 1900 const Register start_to = r20; // destination array start address 1901 const Register r19_klass = r19; // oop._klass 1902 1903 // Registers used as gc temps (r5, r6, r7 are save-on-call) 1904 const Register gct1 = r5, gct2 = r6, gct3 = r7; 1905 1906 //--------------------------------------------------------------- 1907 // Assembler stub will be used for this call to arraycopy 1908 // if the two arrays are subtypes of Object[] but the 1909 // destination array type is not equal to or a supertype 1910 // of the source type. Each element must be separately 1911 // checked.
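// For example, a System.arraycopy from an Object[] that happens to hold only
// Strings into a String[] lands here: the static types give no assignability
// guarantee, so every element is type-checked as it is copied.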
1912 1913 assert_different_registers(from, to, count, ckoff, ckval, start_to, 1914 copied_oop, r19_klass, count_save); 1915 1916 __ align(CodeEntryAlignment); 1917 StubCodeMark mark(this, "StubRoutines", name); 1918 address start = __ pc(); 1919 1920 __ enter(); // required for proper stackwalking of RuntimeStub frame 1921 1922 #ifdef ASSERT 1923 // caller guarantees that the arrays really are different 1924 // otherwise, we would have to make conjoint checks 1925 { Label L; 1926 __ b(L); // conjoint check not yet implemented 1927 __ stop("checkcast_copy within a single array"); 1928 __ bind(L); 1929 } 1930 #endif //ASSERT 1931 1932 // Caller of this entry point must set up the argument registers. 1933 if (entry != nullptr) { 1934 *entry = __ pc(); 1935 BLOCK_COMMENT("Entry:"); 1936 } 1937 1938 // Empty array: Nothing to do. 1939 __ cbz(count, L_done); 1940 __ push(RegSet::of(r19, r20, r21, r22), sp); 1941 1942 #ifdef ASSERT 1943 BLOCK_COMMENT("assert consistent ckoff/ckval"); 1944 // The ckoff and ckval must be mutually consistent, 1945 // even though caller generates both. 1946 { Label L; 1947 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 1948 __ ldrw(start_to, Address(ckval, sco_offset)); 1949 __ cmpw(ckoff, start_to); 1950 __ br(Assembler::EQ, L); 1951 __ stop("super_check_offset inconsistent"); 1952 __ bind(L); 1953 } 1954 #endif //ASSERT 1955 1956 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT; 1957 bool is_oop = true; 1958 int element_size = UseCompressedOops ? 4 : 8; 1959 if (dest_uninitialized) { 1960 decorators |= IS_DEST_UNINITIALIZED; 1961 } 1962 1963 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1964 bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs); 1965 1966 // save the original count 1967 __ mov(count_save, count); 1968 1969 // Copy from low to high addresses 1970 __ mov(start_to, to); // Save destination array start address 1971 __ b(L_load_element); 1972 1973 // ======== begin loop ======== 1974 // (Loop is rotated; its entry is L_load_element.) 1975 // Loop control: 1976 // for (; count != 0; count--) { 1977 // copied_oop = load_heap_oop(from++); 1978 // ... generate_type_check ...; 1979 // store_heap_oop(to++, copied_oop); 1980 // } 1981 __ align(OptoLoopAlignment); 1982 1983 __ BIND(L_store_element); 1984 bs->copy_store_at(_masm, decorators, T_OBJECT, element_size, 1985 __ post(to, element_size), copied_oop, noreg, 1986 gct1, gct2, gct3); 1987 __ sub(count, count, 1); 1988 __ cbz(count, L_do_card_marks); 1989 1990 // ======== loop entry is here ======== 1991 __ BIND(L_load_element); 1992 bs->copy_load_at(_masm, decorators, T_OBJECT, element_size, 1993 copied_oop, noreg, __ post(from, element_size), 1994 gct1); 1995 __ cbz(copied_oop, L_store_element); 1996 1997 __ load_klass(r19_klass, copied_oop);// query the object klass 1998 generate_type_check(r19_klass, ckoff, ckval, L_store_element); 1999 // ======== end loop ======== 2000 2001 // It was a real error; we must depend on the caller to finish the job. 2002 // Register count = remaining oops, count_orig = total oops. 2003 // Emit GC store barriers for the oops we have copied and report 2004 // their number to the caller. 
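// For example, if 3 of 10 elements were stored before the type check failed,
// r0 == -1 ^ 3 == ~3 == -4, and the caller recovers the number of copied
// elements as ~r0.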
2005 2006 __ subs(count, count_save, count); // K = partially copied oop count 2007 __ eon(count, count, zr); // report (-1^K) to caller 2008 __ br(Assembler::EQ, L_done_pop); 2009 2010 __ BIND(L_do_card_marks); 2011 bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs); 2012 2013 __ bind(L_done_pop); 2014 __ pop(RegSet::of(r19, r20, r21, r22), sp); 2015 inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr); 2016 2017 __ bind(L_done); 2018 __ mov(r0, count); 2019 __ leave(); 2020 __ ret(lr); 2021 2022 return start; 2023 } 2024 2025 // Perform range checks on the proposed arraycopy. 2026 // Kills temp, but nothing else. 2027 // Also, clean the sign bits of src_pos and dst_pos. 2028 void arraycopy_range_checks(Register src, // source array oop (c_rarg0) 2029 Register src_pos, // source position (c_rarg1) 2030 Register dst, // destination array oop (c_rarg2) 2031 Register dst_pos, // destination position (c_rarg3) 2032 Register length, 2033 Register temp, 2034 Label& L_failed) { 2035 BLOCK_COMMENT("arraycopy_range_checks:"); 2036 2037 assert_different_registers(rscratch1, temp); 2038 2039 // if (src_pos + length > arrayOop(src)->length()) FAIL; 2040 __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes())); 2041 __ addw(temp, length, src_pos); 2042 __ cmpw(temp, rscratch1); 2043 __ br(Assembler::HI, L_failed); 2044 2045 // if (dst_pos + length > arrayOop(dst)->length()) FAIL; 2046 __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes())); 2047 __ addw(temp, length, dst_pos); 2048 __ cmpw(temp, rscratch1); 2049 __ br(Assembler::HI, L_failed); 2050 2051 // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'. 2052 __ movw(src_pos, src_pos); 2053 __ movw(dst_pos, dst_pos); 2054 2055 BLOCK_COMMENT("arraycopy_range_checks done"); 2056 } 2057 2058 // These stubs get called from some dumb test routine. 2059 // I'll write them properly when they're called from 2060 // something that's actually doing something. 2061 static void fake_arraycopy_stub(address src, address dst, int count) { 2062 assert(count == 0, "huh?"); 2063 } 2064 2065 2066 // 2067 // Generate 'unsafe' array copy stub 2068 // Though just as safe as the other stubs, it takes an unscaled 2069 // size_t argument instead of an element count. 2070 // 2071 // Input: 2072 // c_rarg0 - source array address 2073 // c_rarg1 - destination array address 2074 // c_rarg2 - byte count, treated as ssize_t, can be zero 2075 // 2076 // Examines the alignment of the operands and dispatches 2077 // to a long, int, short, or byte copy loop.
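// Roughly, in C-like pseudocode (illustrative only):
//   bits = (uintptr_t)src | (uintptr_t)dst | (uintptr_t)byte_count;
//   if ((bits & 7) == 0)      { count >>= 3; goto long_copy;  }
//   else if ((bits & 3) == 0) { count >>= 2; goto int_copy;   }
//   else if ((bits & 1) == 0) { count >>= 1; goto short_copy; }
//   else                      {              goto byte_copy;  }
// e.g. two 8-byte-aligned addresses and a byte count of 40 go to the long
// copy entry with an element count of 40 >> 3 == 5.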
2078 // 2079 address generate_unsafe_copy(const char *name, 2080 address byte_copy_entry, 2081 address short_copy_entry, 2082 address int_copy_entry, 2083 address long_copy_entry) { 2084 Label L_long_aligned, L_int_aligned, L_short_aligned; 2085 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 2086 2087 __ align(CodeEntryAlignment); 2088 StubCodeMark mark(this, "StubRoutines", name); 2089 address start = __ pc(); 2090 __ enter(); // required for proper stackwalking of RuntimeStub frame 2091 2092 // bump this on entry, not on exit: 2093 inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr); 2094 2095 __ orr(rscratch1, s, d); 2096 __ orr(rscratch1, rscratch1, count); 2097 2098 __ andr(rscratch1, rscratch1, BytesPerLong-1); 2099 __ cbz(rscratch1, L_long_aligned); 2100 __ andr(rscratch1, rscratch1, BytesPerInt-1); 2101 __ cbz(rscratch1, L_int_aligned); 2102 __ tbz(rscratch1, 0, L_short_aligned); 2103 __ b(RuntimeAddress(byte_copy_entry)); 2104 2105 __ BIND(L_short_aligned); 2106 __ lsr(count, count, LogBytesPerShort); // size => short_count 2107 __ b(RuntimeAddress(short_copy_entry)); 2108 __ BIND(L_int_aligned); 2109 __ lsr(count, count, LogBytesPerInt); // size => int_count 2110 __ b(RuntimeAddress(int_copy_entry)); 2111 __ BIND(L_long_aligned); 2112 __ lsr(count, count, LogBytesPerLong); // size => long_count 2113 __ b(RuntimeAddress(long_copy_entry)); 2114 2115 return start; 2116 } 2117 2118 // 2119 // Generate generic array copy stubs 2120 // 2121 // Input: 2122 // c_rarg0 - src oop 2123 // c_rarg1 - src_pos (32-bits) 2124 // c_rarg2 - dst oop 2125 // c_rarg3 - dst_pos (32-bits) 2126 // c_rarg4 - element count (32-bits) 2127 // 2128 // Output: 2129 // r0 == 0 - success 2130 // r0 == -1^K - failure, where K is partial transfer count 2131 // 2132 address generate_generic_copy(const char *name, 2133 address byte_copy_entry, address short_copy_entry, 2134 address int_copy_entry, address oop_copy_entry, 2135 address long_copy_entry, address checkcast_copy_entry) { 2136 2137 Label L_failed, L_objArray; 2138 Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs; 2139 2140 // Input registers 2141 const Register src = c_rarg0; // source array oop 2142 const Register src_pos = c_rarg1; // source position 2143 const Register dst = c_rarg2; // destination array oop 2144 const Register dst_pos = c_rarg3; // destination position 2145 const Register length = c_rarg4; 2146 2147 2148 // Registers used as temps 2149 const Register dst_klass = c_rarg5; 2150 2151 __ align(CodeEntryAlignment); 2152 2153 StubCodeMark mark(this, "StubRoutines", name); 2154 2155 address start = __ pc(); 2156 2157 __ enter(); // required for proper stackwalking of RuntimeStub frame 2158 2159 // bump this on entry, not on exit: 2160 inc_counter_np(SharedRuntime::_generic_array_copy_ctr); 2161 2162 //----------------------------------------------------------------------- 2163 // Assembler stub will be used for this call to arraycopy 2164 // if the following conditions are met: 2165 // 2166 // (1) src and dst must not be null. 2167 // (2) src_pos must not be negative. 2168 // (3) dst_pos must not be negative. 2169 // (4) length must not be negative. 2170 // (5) src klass and dst klass should be the same and not null. 2171 // (6) src and dst should be arrays. 2172 // (7) src_pos + length must not exceed length of src. 2173 // (8) dst_pos + length must not exceed length of dst. 
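// These mirror the checks System.arraycopy performs; if any of them fails
// the stub returns -1 and the caller is expected to fall back to the slower
// runtime path, which re-checks the arguments and raises the appropriate
// exception.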
2174 // 2175 2176 // if (src == nullptr) return -1; 2177 __ cbz(src, L_failed); 2178 2179 // if (src_pos < 0) return -1; 2180 __ tbnz(src_pos, 31, L_failed); // i.e. sign bit set 2181 2182 // if (dst == nullptr) return -1; 2183 __ cbz(dst, L_failed); 2184 2185 // if (dst_pos < 0) return -1; 2186 __ tbnz(dst_pos, 31, L_failed); // i.e. sign bit set 2187 2188 // registers used as temp 2189 const Register scratch_length = r16; // elements count to copy 2190 const Register scratch_src_klass = r17; // array klass 2191 const Register lh = r15; // layout helper 2192 2193 // if (length < 0) return -1; 2194 __ movw(scratch_length, length); // length (elements count, 32-bits value) 2195 __ tbnz(scratch_length, 31, L_failed); // i.e. sign bit set 2196 2197 __ load_klass(scratch_src_klass, src); 2198 #ifdef ASSERT 2199 // assert(src->klass() != nullptr); 2200 { 2201 BLOCK_COMMENT("assert klasses not null {"); 2202 Label L1, L2; 2203 __ cbnz(scratch_src_klass, L2); // it is broken if klass is null 2204 __ bind(L1); 2205 __ stop("broken null klass"); 2206 __ bind(L2); 2207 __ load_klass(rscratch1, dst); 2208 __ cbz(rscratch1, L1); // this would be broken also 2209 BLOCK_COMMENT("} assert klasses not null done"); 2210 } 2211 #endif 2212 2213 // Load layout helper (32-bits) 2214 // 2215 // |array_tag| | header_size | element_type | |log2_element_size| 2216 // 32 30 24 16 8 2 0 2217 // 2218 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0 2219 // 2220 2221 const int lh_offset = in_bytes(Klass::layout_helper_offset()); 2222 2223 // Handle objArrays completely differently... 2224 const jint objArray_lh = Klass::array_layout_helper(T_OBJECT); 2225 __ ldrw(lh, Address(scratch_src_klass, lh_offset)); 2226 __ movw(rscratch1, objArray_lh); 2227 __ eorw(rscratch2, lh, rscratch1); 2228 __ cbzw(rscratch2, L_objArray); 2229 2230 // if (src->klass() != dst->klass()) return -1; 2231 __ load_klass(rscratch2, dst); 2232 __ eor(rscratch2, rscratch2, scratch_src_klass); 2233 __ cbnz(rscratch2, L_failed); 2234 2235 // Check for flat inline type array -> return -1 2236 __ tst(lh, Klass::_lh_array_tag_flat_value_bit_inplace); 2237 __ br(Assembler::NE, L_failed); 2238 2239 // Check for null-free (non-flat) inline type array -> handle as object array 2240 __ tst(lh, Klass::_lh_null_free_array_bit_inplace); 2241 __ br(Assembler::NE, L_failed); 2242 2243 // if (!src->is_Array()) return -1; 2244 __ tbz(lh, 31, L_failed); // i.e. (lh >= 0) 2245 2246 // At this point, it is known to be a typeArray (array_tag 0x3). 
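// The low byte of lh holds log2(element size): 0 for byte[]/boolean[], 1 for
// short[]/char[], 2 for int[]/float[], 3 for long[]/double[]. The bitwise
// binary search below (starting at L_copy_bytes) keys off bits 0 and 1 of
// that value to pick the byte, short, int or long copy loop.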
2247 #ifdef ASSERT 2248 { 2249 BLOCK_COMMENT("assert primitive array {"); 2250 Label L; 2251 __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift); 2252 __ cmpw(lh, rscratch2); 2253 __ br(Assembler::GE, L); 2254 __ stop("must be a primitive array"); 2255 __ bind(L); 2256 BLOCK_COMMENT("} assert primitive array done"); 2257 } 2258 #endif 2259 2260 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2261 rscratch2, L_failed); 2262 2263 // TypeArrayKlass 2264 // 2265 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize); 2266 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize); 2267 // 2268 2269 const Register rscratch1_offset = rscratch1; // array offset 2270 const Register r15_elsize = lh; // element size 2271 2272 __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift, 2273 exact_log2(Klass::_lh_header_size_mask+1)); // array_offset 2274 __ add(src, src, rscratch1_offset); // src array offset 2275 __ add(dst, dst, rscratch1_offset); // dst array offset 2276 BLOCK_COMMENT("choose copy loop based on element size"); 2277 2278 // next registers should be set before the jump to corresponding stub 2279 const Register from = c_rarg0; // source array address 2280 const Register to = c_rarg1; // destination array address 2281 const Register count = c_rarg2; // elements count 2282 2283 // 'from', 'to', 'count' registers should be set in such order 2284 // since they are the same as 'src', 'src_pos', 'dst'. 2285 2286 assert(Klass::_lh_log2_element_size_shift == 0, "fix this code"); 2287 2288 // The possible values of elsize are 0-3, i.e. exact_log2(element 2289 // size in bytes). We do a simple bitwise binary search. 2290 __ BIND(L_copy_bytes); 2291 __ tbnz(r15_elsize, 1, L_copy_ints); 2292 __ tbnz(r15_elsize, 0, L_copy_shorts); 2293 __ lea(from, Address(src, src_pos));// src_addr 2294 __ lea(to, Address(dst, dst_pos));// dst_addr 2295 __ movw(count, scratch_length); // length 2296 __ b(RuntimeAddress(byte_copy_entry)); 2297 2298 __ BIND(L_copy_shorts); 2299 __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr 2300 __ lea(to, Address(dst, dst_pos, Address::lsl(1)));// dst_addr 2301 __ movw(count, scratch_length); // length 2302 __ b(RuntimeAddress(short_copy_entry)); 2303 2304 __ BIND(L_copy_ints); 2305 __ tbnz(r15_elsize, 0, L_copy_longs); 2306 __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr 2307 __ lea(to, Address(dst, dst_pos, Address::lsl(2)));// dst_addr 2308 __ movw(count, scratch_length); // length 2309 __ b(RuntimeAddress(int_copy_entry)); 2310 2311 __ BIND(L_copy_longs); 2312 #ifdef ASSERT 2313 { 2314 BLOCK_COMMENT("assert long copy {"); 2315 Label L; 2316 __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize 2317 __ cmpw(r15_elsize, LogBytesPerLong); 2318 __ br(Assembler::EQ, L); 2319 __ stop("must be long copy, but elsize is wrong"); 2320 __ bind(L); 2321 BLOCK_COMMENT("} assert long copy done"); 2322 } 2323 #endif 2324 __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr 2325 __ lea(to, Address(dst, dst_pos, Address::lsl(3)));// dst_addr 2326 __ movw(count, scratch_length); // length 2327 __ b(RuntimeAddress(long_copy_entry)); 2328 2329 // ObjArrayKlass 2330 __ BIND(L_objArray); 2331 // live at this point: scratch_src_klass, scratch_length, src[_pos], dst[_pos] 2332 2333 Label L_plain_copy, L_checkcast_copy; 2334 // test array classes for subtyping 2335 __ load_klass(r15, dst); 2336 __ cmp(scratch_src_klass, r15); // usual case is exact 
equality 2337 __ br(Assembler::NE, L_checkcast_copy); 2338 2339 // Identically typed arrays can be copied without element-wise checks. 2340 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2341 rscratch2, L_failed); 2342 2343 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2344 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2345 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2346 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2347 __ movw(count, scratch_length); // length 2348 __ BIND(L_plain_copy); 2349 __ b(RuntimeAddress(oop_copy_entry)); 2350 2351 __ BIND(L_checkcast_copy); 2352 // live at this point: scratch_src_klass, scratch_length, r15 (dst_klass) 2353 { 2354 // Before looking at dst.length, make sure dst is also an objArray. 2355 __ ldrw(rscratch1, Address(r15, lh_offset)); 2356 __ movw(rscratch2, objArray_lh); 2357 __ eorw(rscratch1, rscratch1, rscratch2); 2358 __ cbnzw(rscratch1, L_failed); 2359 2360 // It is safe to examine both src.length and dst.length. 2361 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2362 r15, L_failed); 2363 2364 __ load_klass(dst_klass, dst); // reload 2365 2366 // Marshal the base address arguments now, freeing registers. 2367 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2368 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2369 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2370 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2371 __ movw(count, length); // length (reloaded) 2372 Register sco_temp = c_rarg3; // this register is free now 2373 assert_different_registers(from, to, count, sco_temp, 2374 dst_klass, scratch_src_klass); 2375 // assert_clean_int(count, sco_temp); 2376 2377 // Generate the type check. 2378 const int sco_offset = in_bytes(Klass::super_check_offset_offset()); 2379 __ ldrw(sco_temp, Address(dst_klass, sco_offset)); 2380 2381 // Smashes rscratch1, rscratch2 2382 generate_type_check(scratch_src_klass, sco_temp, dst_klass, L_plain_copy); 2383 2384 // Fetch destination element klass from the ObjArrayKlass header. 2385 int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset()); 2386 __ ldr(dst_klass, Address(dst_klass, ek_offset)); 2387 __ ldrw(sco_temp, Address(dst_klass, sco_offset)); 2388 2389 // the checkcast_copy loop needs two extra arguments: 2390 assert(c_rarg3 == sco_temp, "#3 already in place"); 2391 // Set up arguments for checkcast_copy_entry. 2392 __ mov(c_rarg4, dst_klass); // dst.klass.element_klass 2393 __ b(RuntimeAddress(checkcast_copy_entry)); 2394 } 2395 2396 __ BIND(L_failed); 2397 __ mov(r0, -1); 2398 __ leave(); // required for proper stackwalking of RuntimeStub frame 2399 __ ret(lr); 2400 2401 return start; 2402 } 2403 2404 // 2405 // Generate stub for array fill. If "aligned" is true, the 2406 // "to" address is assumed to be heapword aligned. 
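// Only T_BYTE, T_SHORT and T_INT fills are generated here (see the switch
// below); the fill value is replicated to 32 bits, and later to 64 bits, so
// that long runs can be written one 8-byte word at a time.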
2407 // 2408 // Arguments for generated stub: 2409 // to: c_rarg0 2410 // value: c_rarg1 2411 // count: c_rarg2 treated as signed 2412 // 2413 address generate_fill(BasicType t, bool aligned, const char *name) { 2414 __ align(CodeEntryAlignment); 2415 StubCodeMark mark(this, "StubRoutines", name); 2416 address start = __ pc(); 2417 2418 BLOCK_COMMENT("Entry:"); 2419 2420 const Register to = c_rarg0; // destination array address 2421 const Register value = c_rarg1; // value 2422 const Register count = c_rarg2; // elements count 2423 2424 const Register bz_base = r10; // base for block_zero routine 2425 const Register cnt_words = r11; // temp register 2426 2427 __ enter(); 2428 2429 Label L_fill_elements, L_exit1; 2430 2431 int shift = -1; 2432 switch (t) { 2433 case T_BYTE: 2434 shift = 0; 2435 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2436 __ bfi(value, value, 8, 8); // 8 bit -> 16 bit 2437 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit 2438 __ br(Assembler::LO, L_fill_elements); 2439 break; 2440 case T_SHORT: 2441 shift = 1; 2442 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2443 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit 2444 __ br(Assembler::LO, L_fill_elements); 2445 break; 2446 case T_INT: 2447 shift = 2; 2448 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2449 __ br(Assembler::LO, L_fill_elements); 2450 break; 2451 default: ShouldNotReachHere(); 2452 } 2453 2454 // Align destination address at 8-byte boundary. 2455 Label L_skip_align1, L_skip_align2, L_skip_align4; 2456 if (!aligned) { 2457 switch (t) { 2458 case T_BYTE: 2459 // One byte misalignment happens only for byte arrays. 2460 __ tbz(to, 0, L_skip_align1); 2461 __ strb(value, Address(__ post(to, 1))); 2462 __ subw(count, count, 1); 2463 __ bind(L_skip_align1); 2464 // Fallthrough 2465 case T_SHORT: 2466 // Two bytes misalignment happens only for byte and short (char) arrays. 2467 __ tbz(to, 1, L_skip_align2); 2468 __ strh(value, Address(__ post(to, 2))); 2469 __ subw(count, count, 2 >> shift); 2470 __ bind(L_skip_align2); 2471 // Fallthrough 2472 case T_INT: 2473 // Align to 8 bytes, we know we are 4 byte aligned to start. 2474 __ tbz(to, 2, L_skip_align4); 2475 __ strw(value, Address(__ post(to, 4))); 2476 __ subw(count, count, 4 >> shift); 2477 __ bind(L_skip_align4); 2478 break; 2479 default: ShouldNotReachHere(); 2480 } 2481 } 2482 2483 // 2484 // Fill large chunks 2485 // 2486 __ lsrw(cnt_words, count, 3 - shift); // number of words 2487 __ bfi(value, value, 32, 32); // 32 bit -> 64 bit 2488 __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift); 2489 if (UseBlockZeroing) { 2490 Label non_block_zeroing, rest; 2491 // If the fill value is zero we can use the fast zero_words(). 2492 __ cbnz(value, non_block_zeroing); 2493 __ mov(bz_base, to); 2494 __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord); 2495 address tpc = __ zero_words(bz_base, cnt_words); 2496 if (tpc == nullptr) { 2497 fatal("CodeCache is full at generate_fill"); 2498 } 2499 __ b(rest); 2500 __ bind(non_block_zeroing); 2501 __ fill_words(to, cnt_words, value); 2502 __ bind(rest); 2503 } else { 2504 __ fill_words(to, cnt_words, value); 2505 } 2506 2507 // Remaining count is less than 8 bytes. Fill it by a single store. 2508 // Note that the total length is no less than 8 bytes.
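// For example (illustrative), filling 13 shorts with an aligned base: the
// word loop above writes 3 words (12 shorts), leaving count == 1; 'to' is
// then advanced to the end of the region and the single str below rewrites
// the last 8 bytes, harmlessly re-filling three already-written shorts along
// with the remaining one.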
2509 if (t == T_BYTE || t == T_SHORT) { 2510 Label L_exit1; 2511 __ cbzw(count, L_exit1); 2512 __ add(to, to, count, Assembler::LSL, shift); // points to the end 2513 __ str(value, Address(to, -8)); // overwrite some elements 2514 __ bind(L_exit1); 2515 __ leave(); 2516 __ ret(lr); 2517 } 2518 2519 // Handle copies less than 8 bytes. 2520 Label L_fill_2, L_fill_4, L_exit2; 2521 __ bind(L_fill_elements); 2522 switch (t) { 2523 case T_BYTE: 2524 __ tbz(count, 0, L_fill_2); 2525 __ strb(value, Address(__ post(to, 1))); 2526 __ bind(L_fill_2); 2527 __ tbz(count, 1, L_fill_4); 2528 __ strh(value, Address(__ post(to, 2))); 2529 __ bind(L_fill_4); 2530 __ tbz(count, 2, L_exit2); 2531 __ strw(value, Address(to)); 2532 break; 2533 case T_SHORT: 2534 __ tbz(count, 0, L_fill_4); 2535 __ strh(value, Address(__ post(to, 2))); 2536 __ bind(L_fill_4); 2537 __ tbz(count, 1, L_exit2); 2538 __ strw(value, Address(to)); 2539 break; 2540 case T_INT: 2541 __ cbzw(count, L_exit2); 2542 __ strw(value, Address(to)); 2543 break; 2544 default: ShouldNotReachHere(); 2545 } 2546 __ bind(L_exit2); 2547 __ leave(); 2548 __ ret(lr); 2549 return start; 2550 } 2551 2552 address generate_data_cache_writeback() { 2553 const Register line = c_rarg0; // address of line to write back 2554 2555 __ align(CodeEntryAlignment); 2556 2557 StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback"); 2558 2559 address start = __ pc(); 2560 __ enter(); 2561 __ cache_wb(Address(line, 0)); 2562 __ leave(); 2563 __ ret(lr); 2564 2565 return start; 2566 } 2567 2568 address generate_data_cache_writeback_sync() { 2569 const Register is_pre = c_rarg0; // pre or post sync 2570 2571 __ align(CodeEntryAlignment); 2572 2573 StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback_sync"); 2574 2575 // pre wbsync is a no-op 2576 // post wbsync translates to an sfence 2577 2578 Label skip; 2579 address start = __ pc(); 2580 __ enter(); 2581 __ cbnz(is_pre, skip); 2582 __ cache_wbsync(false); 2583 __ bind(skip); 2584 __ leave(); 2585 __ ret(lr); 2586 2587 return start; 2588 } 2589 2590 void generate_arraycopy_stubs() { 2591 address entry; 2592 address entry_jbyte_arraycopy; 2593 address entry_jshort_arraycopy; 2594 address entry_jint_arraycopy; 2595 address entry_oop_arraycopy; 2596 address entry_jlong_arraycopy; 2597 address entry_checkcast_arraycopy; 2598 2599 generate_copy_longs(IN_HEAP | IS_ARRAY, T_BYTE, copy_f, r0, r1, r15, copy_forwards); 2600 generate_copy_longs(IN_HEAP | IS_ARRAY, T_BYTE, copy_b, r0, r1, r15, copy_backwards); 2601 2602 generate_copy_longs(IN_HEAP | IS_ARRAY, T_OBJECT, copy_obj_f, r0, r1, r15, copy_forwards); 2603 generate_copy_longs(IN_HEAP | IS_ARRAY, T_OBJECT, copy_obj_b, r0, r1, r15, copy_backwards); 2604 2605 generate_copy_longs(IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, T_OBJECT, copy_obj_uninit_f, r0, r1, r15, copy_forwards); 2606 generate_copy_longs(IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, T_OBJECT, copy_obj_uninit_b, r0, r1, r15, copy_backwards); 2607 2608 StubRoutines::aarch64::_zero_blocks = generate_zero_blocks(); 2609 2610 //*** jbyte 2611 // Always need aligned and unaligned versions 2612 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(false, &entry, 2613 "jbyte_disjoint_arraycopy"); 2614 StubRoutines::_jbyte_arraycopy = generate_conjoint_byte_copy(false, entry, 2615 &entry_jbyte_arraycopy, 2616 "jbyte_arraycopy"); 2617 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry, 2618 "arrayof_jbyte_disjoint_arraycopy"); 2619 
StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_byte_copy(true, entry, nullptr, 2620 "arrayof_jbyte_arraycopy"); 2621 2622 //*** jshort 2623 // Always need aligned and unaligned versions 2624 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, &entry, 2625 "jshort_disjoint_arraycopy"); 2626 StubRoutines::_jshort_arraycopy = generate_conjoint_short_copy(false, entry, 2627 &entry_jshort_arraycopy, 2628 "jshort_arraycopy"); 2629 StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry, 2630 "arrayof_jshort_disjoint_arraycopy"); 2631 StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_short_copy(true, entry, nullptr, 2632 "arrayof_jshort_arraycopy"); 2633 2634 //*** jint 2635 // Aligned versions 2636 StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry, 2637 "arrayof_jint_disjoint_arraycopy"); 2638 StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy, 2639 "arrayof_jint_arraycopy"); 2640 // In 64 bit we need both aligned and unaligned versions of jint arraycopy. 2641 // entry_jint_arraycopy always points to the unaligned version 2642 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_int_copy(false, &entry, 2643 "jint_disjoint_arraycopy"); 2644 StubRoutines::_jint_arraycopy = generate_conjoint_int_copy(false, entry, 2645 &entry_jint_arraycopy, 2646 "jint_arraycopy"); 2647 2648 //*** jlong 2649 // It is always aligned 2650 StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry, 2651 "arrayof_jlong_disjoint_arraycopy"); 2652 StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy, 2653 "arrayof_jlong_arraycopy"); 2654 StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy; 2655 StubRoutines::_jlong_arraycopy = StubRoutines::_arrayof_jlong_arraycopy; 2656 2657 //*** oops 2658 { 2659 // With compressed oops we need unaligned versions; notice that 2660 // we overwrite entry_oop_arraycopy. 
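// (With 8-byte uncompressed oops every element boundary is also a HeapWord
// boundary, so the aligned variants can serve both entry points; with 4-byte
// narrow oops that guarantee is lost, hence aligned == false.)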
2661 bool aligned = !UseCompressedOops; 2662 2663 StubRoutines::_arrayof_oop_disjoint_arraycopy 2664 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy", 2665 /*dest_uninitialized*/false); 2666 StubRoutines::_arrayof_oop_arraycopy 2667 = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy", 2668 /*dest_uninitialized*/false); 2669 // Aligned versions without pre-barriers 2670 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit 2671 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit", 2672 /*dest_uninitialized*/true); 2673 StubRoutines::_arrayof_oop_arraycopy_uninit 2674 = generate_conjoint_oop_copy(aligned, entry, nullptr, "arrayof_oop_arraycopy_uninit", 2675 /*dest_uninitialized*/true); 2676 } 2677 2678 StubRoutines::_oop_disjoint_arraycopy = StubRoutines::_arrayof_oop_disjoint_arraycopy; 2679 StubRoutines::_oop_arraycopy = StubRoutines::_arrayof_oop_arraycopy; 2680 StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit; 2681 StubRoutines::_oop_arraycopy_uninit = StubRoutines::_arrayof_oop_arraycopy_uninit; 2682 2683 StubRoutines::_checkcast_arraycopy = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy); 2684 StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", nullptr, 2685 /*dest_uninitialized*/true); 2686 2687 StubRoutines::_unsafe_arraycopy = generate_unsafe_copy("unsafe_arraycopy", 2688 entry_jbyte_arraycopy, 2689 entry_jshort_arraycopy, 2690 entry_jint_arraycopy, 2691 entry_jlong_arraycopy); 2692 2693 StubRoutines::_generic_arraycopy = generate_generic_copy("generic_arraycopy", 2694 entry_jbyte_arraycopy, 2695 entry_jshort_arraycopy, 2696 entry_jint_arraycopy, 2697 entry_oop_arraycopy, 2698 entry_jlong_arraycopy, 2699 entry_checkcast_arraycopy); 2700 2701 StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill"); 2702 StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill"); 2703 StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill"); 2704 StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill"); 2705 StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill"); 2706 StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill"); 2707 } 2708 2709 void generate_math_stubs() { Unimplemented(); } 2710 2711 // Arguments: 2712 // 2713 // Inputs: 2714 // c_rarg0 - source byte array address 2715 // c_rarg1 - destination byte array address 2716 // c_rarg2 - K (key) in little endian int array 2717 // 2718 address generate_aescrypt_encryptBlock() { 2719 __ align(CodeEntryAlignment); 2720 StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock"); 2721 2722 const Register from = c_rarg0; // source array address 2723 const Register to = c_rarg1; // destination array address 2724 const Register key = c_rarg2; // key array address 2725 const Register keylen = rscratch1; 2726 2727 address start = __ pc(); 2728 __ enter(); 2729 2730 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2731 2732 __ aesenc_loadkeys(key, keylen); 2733 __ aesecb_encrypt(from, to, keylen); 2734 2735 __ mov(r0, 0); 2736 2737 __ leave(); 2738 __ ret(lr); 2739 2740 return start; 2741 } 2742 2743 // Arguments: 2744 // 2745 // Inputs: 2746 // c_rarg0 - source byte array address 2747 // c_rarg1 - destination byte array address 2748 // 
c_rarg2 - K (key) in little endian int array 2749 // 2750 address generate_aescrypt_decryptBlock() { 2751 assert(UseAES, "need AES cryptographic extension support"); 2752 __ align(CodeEntryAlignment); 2753 StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock"); 2754 Label L_doLast; 2755 2756 const Register from = c_rarg0; // source array address 2757 const Register to = c_rarg1; // destination array address 2758 const Register key = c_rarg2; // key array address 2759 const Register keylen = rscratch1; 2760 2761 address start = __ pc(); 2762 __ enter(); // required for proper stackwalking of RuntimeStub frame 2763 2764 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2765 2766 __ aesecb_decrypt(from, to, key, keylen); 2767 2768 __ mov(r0, 0); 2769 2770 __ leave(); 2771 __ ret(lr); 2772 2773 return start; 2774 } 2775 2776 // Arguments: 2777 // 2778 // Inputs: 2779 // c_rarg0 - source byte array address 2780 // c_rarg1 - destination byte array address 2781 // c_rarg2 - K (key) in little endian int array 2782 // c_rarg3 - r vector byte array address 2783 // c_rarg4 - input length 2784 // 2785 // Output: 2786 // x0 - input length 2787 // 2788 address generate_cipherBlockChaining_encryptAESCrypt() { 2789 assert(UseAES, "need AES cryptographic extension support"); 2790 __ align(CodeEntryAlignment); 2791 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt"); 2792 2793 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 2794 2795 const Register from = c_rarg0; // source array address 2796 const Register to = c_rarg1; // destination array address 2797 const Register key = c_rarg2; // key array address 2798 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2799 // and left with the results of the last encryption block 2800 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2801 const Register keylen = rscratch1; 2802 2803 address start = __ pc(); 2804 2805 __ enter(); 2806 2807 __ movw(rscratch2, len_reg); 2808 2809 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2810 2811 __ ld1(v0, __ T16B, rvec); 2812 2813 __ cmpw(keylen, 52); 2814 __ br(Assembler::CC, L_loadkeys_44); 2815 __ br(Assembler::EQ, L_loadkeys_52); 2816 2817 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2818 __ rev32(v17, __ T16B, v17); 2819 __ rev32(v18, __ T16B, v18); 2820 __ BIND(L_loadkeys_52); 2821 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2822 __ rev32(v19, __ T16B, v19); 2823 __ rev32(v20, __ T16B, v20); 2824 __ BIND(L_loadkeys_44); 2825 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2826 __ rev32(v21, __ T16B, v21); 2827 __ rev32(v22, __ T16B, v22); 2828 __ rev32(v23, __ T16B, v23); 2829 __ rev32(v24, __ T16B, v24); 2830 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2831 __ rev32(v25, __ T16B, v25); 2832 __ rev32(v26, __ T16B, v26); 2833 __ rev32(v27, __ T16B, v27); 2834 __ rev32(v28, __ T16B, v28); 2835 __ ld1(v29, v30, v31, __ T16B, key); 2836 __ rev32(v29, __ T16B, v29); 2837 __ rev32(v30, __ T16B, v30); 2838 __ rev32(v31, __ T16B, v31); 2839 2840 __ BIND(L_aes_loop); 2841 __ ld1(v1, __ T16B, __ post(from, 16)); 2842 __ eor(v0, __ T16B, v0, v1); 2843 2844 __ br(Assembler::CC, L_rounds_44); 2845 __ br(Assembler::EQ, L_rounds_52); 2846 2847 __ aese(v0, v17); __ aesmc(v0, v0); 2848 __ aese(v0, v18); __ aesmc(v0, v0); 2849 __ BIND(L_rounds_52); 2850 __ 
aese(v0, v19); __ aesmc(v0, v0); 2851 __ aese(v0, v20); __ aesmc(v0, v0); 2852 __ BIND(L_rounds_44); 2853 __ aese(v0, v21); __ aesmc(v0, v0); 2854 __ aese(v0, v22); __ aesmc(v0, v0); 2855 __ aese(v0, v23); __ aesmc(v0, v0); 2856 __ aese(v0, v24); __ aesmc(v0, v0); 2857 __ aese(v0, v25); __ aesmc(v0, v0); 2858 __ aese(v0, v26); __ aesmc(v0, v0); 2859 __ aese(v0, v27); __ aesmc(v0, v0); 2860 __ aese(v0, v28); __ aesmc(v0, v0); 2861 __ aese(v0, v29); __ aesmc(v0, v0); 2862 __ aese(v0, v30); 2863 __ eor(v0, __ T16B, v0, v31); 2864 2865 __ st1(v0, __ T16B, __ post(to, 16)); 2866 2867 __ subw(len_reg, len_reg, 16); 2868 __ cbnzw(len_reg, L_aes_loop); 2869 2870 __ st1(v0, __ T16B, rvec); 2871 2872 __ mov(r0, rscratch2); 2873 2874 __ leave(); 2875 __ ret(lr); 2876 2877 return start; 2878 } 2879 2880 // Arguments: 2881 // 2882 // Inputs: 2883 // c_rarg0 - source byte array address 2884 // c_rarg1 - destination byte array address 2885 // c_rarg2 - K (key) in little endian int array 2886 // c_rarg3 - r vector byte array address 2887 // c_rarg4 - input length 2888 // 2889 // Output: 2890 // r0 - input length 2891 // 2892 address generate_cipherBlockChaining_decryptAESCrypt() { 2893 assert(UseAES, "need AES cryptographic extension support"); 2894 __ align(CodeEntryAlignment); 2895 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt"); 2896 2897 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 2898 2899 const Register from = c_rarg0; // source array address 2900 const Register to = c_rarg1; // destination array address 2901 const Register key = c_rarg2; // key array address 2902 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2903 // and left with the results of the last encryption block 2904 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2905 const Register keylen = rscratch1; 2906 2907 address start = __ pc(); 2908 2909 __ enter(); 2910 2911 __ movw(rscratch2, len_reg); 2912 2913 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2914 2915 __ ld1(v2, __ T16B, rvec); 2916 2917 __ ld1(v31, __ T16B, __ post(key, 16)); 2918 __ rev32(v31, __ T16B, v31); 2919 2920 __ cmpw(keylen, 52); 2921 __ br(Assembler::CC, L_loadkeys_44); 2922 __ br(Assembler::EQ, L_loadkeys_52); 2923 2924 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2925 __ rev32(v17, __ T16B, v17); 2926 __ rev32(v18, __ T16B, v18); 2927 __ BIND(L_loadkeys_52); 2928 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2929 __ rev32(v19, __ T16B, v19); 2930 __ rev32(v20, __ T16B, v20); 2931 __ BIND(L_loadkeys_44); 2932 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2933 __ rev32(v21, __ T16B, v21); 2934 __ rev32(v22, __ T16B, v22); 2935 __ rev32(v23, __ T16B, v23); 2936 __ rev32(v24, __ T16B, v24); 2937 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2938 __ rev32(v25, __ T16B, v25); 2939 __ rev32(v26, __ T16B, v26); 2940 __ rev32(v27, __ T16B, v27); 2941 __ rev32(v28, __ T16B, v28); 2942 __ ld1(v29, v30, __ T16B, key); 2943 __ rev32(v29, __ T16B, v29); 2944 __ rev32(v30, __ T16B, v30); 2945 2946 __ BIND(L_aes_loop); 2947 __ ld1(v0, __ T16B, __ post(from, 16)); 2948 __ orr(v1, __ T16B, v0, v0); 2949 2950 __ br(Assembler::CC, L_rounds_44); 2951 __ br(Assembler::EQ, L_rounds_52); 2952 2953 __ aesd(v0, v17); __ aesimc(v0, v0); 2954 __ aesd(v0, v18); __ aesimc(v0, v0); 2955 __ BIND(L_rounds_52); 2956 __ aesd(v0, v19); __ aesimc(v0, v0); 2957 __ aesd(v0, v20); __ 
aesimc(v0, v0); 2958 __ BIND(L_rounds_44); 2959 __ aesd(v0, v21); __ aesimc(v0, v0); 2960 __ aesd(v0, v22); __ aesimc(v0, v0); 2961 __ aesd(v0, v23); __ aesimc(v0, v0); 2962 __ aesd(v0, v24); __ aesimc(v0, v0); 2963 __ aesd(v0, v25); __ aesimc(v0, v0); 2964 __ aesd(v0, v26); __ aesimc(v0, v0); 2965 __ aesd(v0, v27); __ aesimc(v0, v0); 2966 __ aesd(v0, v28); __ aesimc(v0, v0); 2967 __ aesd(v0, v29); __ aesimc(v0, v0); 2968 __ aesd(v0, v30); 2969 __ eor(v0, __ T16B, v0, v31); 2970 __ eor(v0, __ T16B, v0, v2); 2971 2972 __ st1(v0, __ T16B, __ post(to, 16)); 2973 __ orr(v2, __ T16B, v1, v1); 2974 2975 __ subw(len_reg, len_reg, 16); 2976 __ cbnzw(len_reg, L_aes_loop); 2977 2978 __ st1(v2, __ T16B, rvec); 2979 2980 __ mov(r0, rscratch2); 2981 2982 __ leave(); 2983 __ ret(lr); 2984 2985 return start; 2986 } 2987 2988 // Big-endian 128-bit + 64-bit -> 128-bit addition. 2989 // Inputs: 128-bits. in is preserved. 2990 // The least-significant 64-bit word is in the upper dword of each vector. 2991 // inc (the 64-bit increment) is preserved. Its lower dword must be zero. 2992 // Output: result 2993 void be_add_128_64(FloatRegister result, FloatRegister in, 2994 FloatRegister inc, FloatRegister tmp) { 2995 assert_different_registers(result, tmp, inc); 2996 2997 __ addv(result, __ T2D, in, inc); // Add inc to the least-significant dword of 2998 // input 2999 __ cm(__ HI, tmp, __ T2D, inc, result);// Check for result overflowing 3000 __ ext(tmp, __ T16B, tmp, tmp, 0x08); // Swap LSD of comparison result to MSD and 3001 // MSD == 0 (must be!) to LSD 3002 __ subv(result, __ T2D, result, tmp); // Subtract -1 from MSD if there was an overflow 3003 } 3004 3005 // CTR AES crypt. 3006 // Arguments: 3007 // 3008 // Inputs: 3009 // c_rarg0 - source byte array address 3010 // c_rarg1 - destination byte array address 3011 // c_rarg2 - K (key) in little endian int array 3012 // c_rarg3 - counter vector byte array address 3013 // c_rarg4 - input length 3014 // c_rarg5 - saved encryptedCounter start 3015 // c_rarg6 - saved used length 3016 // 3017 // Output: 3018 // r0 - input length 3019 // 3020 address generate_counterMode_AESCrypt() { 3021 const Register in = c_rarg0; 3022 const Register out = c_rarg1; 3023 const Register key = c_rarg2; 3024 const Register counter = c_rarg3; 3025 const Register saved_len = c_rarg4, len = r10; 3026 const Register saved_encrypted_ctr = c_rarg5; 3027 const Register used_ptr = c_rarg6, used = r12; 3028 3029 const Register offset = r7; 3030 const Register keylen = r11; 3031 3032 const unsigned char block_size = 16; 3033 const int bulk_width = 4; 3034 // NB: bulk_width can be 4 or 8. 8 gives slightly faster 3035 // performance with larger data sizes, but it also means that the 3036 // fast path isn't used until you have at least 8 blocks, and up 3037 // to 127 bytes of data will be executed on the slow path. For 3038 // that reason, and also so as not to blow away too much icache, 4 3039 // blocks seems like a sensible compromise. 
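// (With bulk_width == 4 the bulk path requires at least 4 * 16 == 64 bytes of
// input; shorter requests are handled one 16-byte block, or one byte, at a
// time by the loop below.)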
3040 3041 // Algorithm: 3042 // 3043 // if (len == 0) { 3044 // goto DONE; 3045 // } 3046 // int result = len; 3047 // do { 3048 // if (used >= blockSize) { 3049 // if (len >= bulk_width * blockSize) { 3050 // CTR_large_block(); 3051 // if (len == 0) 3052 // goto DONE; 3053 // } 3054 // for (;;) { 3055 // 16ByteVector v0 = counter; 3056 // embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0); 3057 // used = 0; 3058 // if (len < blockSize) 3059 // break; /* goto NEXT */ 3060 // 16ByteVector v1 = load16Bytes(in, offset); 3061 // v1 = v1 ^ encryptedCounter; 3062 // store16Bytes(out, offset); 3063 // used = blockSize; 3064 // offset += blockSize; 3065 // len -= blockSize; 3066 // if (len == 0) 3067 // goto DONE; 3068 // } 3069 // } 3070 // NEXT: 3071 // out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]); 3072 // len--; 3073 // } while (len != 0); 3074 // DONE: 3075 // return result; 3076 // 3077 // CTR_large_block() 3078 // Wide bulk encryption of whole blocks. 3079 3080 __ align(CodeEntryAlignment); 3081 StubCodeMark mark(this, "StubRoutines", "counterMode_AESCrypt"); 3082 const address start = __ pc(); 3083 __ enter(); 3084 3085 Label DONE, CTR_large_block, large_block_return; 3086 __ ldrw(used, Address(used_ptr)); 3087 __ cbzw(saved_len, DONE); 3088 3089 __ mov(len, saved_len); 3090 __ mov(offset, 0); 3091 3092 // Compute #rounds for AES based on the length of the key array 3093 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 3094 3095 __ aesenc_loadkeys(key, keylen); 3096 3097 { 3098 Label L_CTR_loop, NEXT; 3099 3100 __ bind(L_CTR_loop); 3101 3102 __ cmp(used, block_size); 3103 __ br(__ LO, NEXT); 3104 3105 // Maybe we have a lot of data 3106 __ subsw(rscratch1, len, bulk_width * block_size); 3107 __ br(__ HS, CTR_large_block); 3108 __ BIND(large_block_return); 3109 __ cbzw(len, DONE); 3110 3111 // Setup the counter 3112 __ movi(v4, __ T4S, 0); 3113 __ movi(v5, __ T4S, 1); 3114 __ ins(v4, __ S, v5, 2, 2); // v4 contains { 0, 1 } 3115 3116 // 128-bit big-endian increment 3117 __ ld1(v0, __ T16B, counter); 3118 __ rev64(v16, __ T16B, v0); 3119 be_add_128_64(v16, v16, v4, /*tmp*/v5); 3120 __ rev64(v16, __ T16B, v16); 3121 __ st1(v16, __ T16B, counter); 3122 // Previous counter value is in v0 3123 // v4 contains { 0, 1 } 3124 3125 { 3126 // We have fewer than bulk_width blocks of data left. Encrypt 3127 // them one by one until there is less than a full block 3128 // remaining, being careful to save both the encrypted counter 3129 // and the counter. 3130 3131 Label inner_loop; 3132 __ bind(inner_loop); 3133 // Counter to encrypt is in v0 3134 __ aesecb_encrypt(noreg, noreg, keylen); 3135 __ st1(v0, __ T16B, saved_encrypted_ctr); 3136 3137 // Do we have a remaining full block? 3138 3139 __ mov(used, 0); 3140 __ cmp(len, block_size); 3141 __ br(__ LO, NEXT); 3142 3143 // Yes, we have a full block 3144 __ ldrq(v1, Address(in, offset)); 3145 __ eor(v1, __ T16B, v1, v0); 3146 __ strq(v1, Address(out, offset)); 3147 __ mov(used, block_size); 3148 __ add(offset, offset, block_size); 3149 3150 __ subw(len, len, block_size); 3151 __ cbzw(len, DONE); 3152 3153 // Increment the counter, store it back 3154 __ orr(v0, __ T16B, v16, v16); 3155 __ rev64(v16, __ T16B, v16); 3156 be_add_128_64(v16, v16, v4, /*tmp*/v5); 3157 __ rev64(v16, __ T16B, v16); 3158 __ st1(v16, __ T16B, counter); // Save the incremented counter back 3159 3160 __ b(inner_loop); 3161 } 3162 3163 __ BIND(NEXT); 3164 3165 // Encrypt a single byte, and loop. 
3166 // We expect this to be a rare event. 3167 __ ldrb(rscratch1, Address(in, offset)); 3168 __ ldrb(rscratch2, Address(saved_encrypted_ctr, used)); 3169 __ eor(rscratch1, rscratch1, rscratch2); 3170 __ strb(rscratch1, Address(out, offset)); 3171 __ add(offset, offset, 1); 3172 __ add(used, used, 1); 3173 __ subw(len, len,1); 3174 __ cbnzw(len, L_CTR_loop); 3175 } 3176 3177 __ bind(DONE); 3178 __ strw(used, Address(used_ptr)); 3179 __ mov(r0, saved_len); 3180 3181 __ leave(); // required for proper stackwalking of RuntimeStub frame 3182 __ ret(lr); 3183 3184 // Bulk encryption 3185 3186 __ BIND (CTR_large_block); 3187 assert(bulk_width == 4 || bulk_width == 8, "must be"); 3188 3189 if (bulk_width == 8) { 3190 __ sub(sp, sp, 4 * 16); 3191 __ st1(v12, v13, v14, v15, __ T16B, Address(sp)); 3192 } 3193 __ sub(sp, sp, 4 * 16); 3194 __ st1(v8, v9, v10, v11, __ T16B, Address(sp)); 3195 RegSet saved_regs = (RegSet::of(in, out, offset) 3196 + RegSet::of(saved_encrypted_ctr, used_ptr, len)); 3197 __ push(saved_regs, sp); 3198 __ andr(len, len, -16 * bulk_width); // 8/4 encryptions, 16 bytes per encryption 3199 __ add(in, in, offset); 3200 __ add(out, out, offset); 3201 3202 // Keys should already be loaded into the correct registers 3203 3204 __ ld1(v0, __ T16B, counter); // v0 contains the first counter 3205 __ rev64(v16, __ T16B, v0); // v16 contains byte-reversed counter 3206 3207 // AES/CTR loop 3208 { 3209 Label L_CTR_loop; 3210 __ BIND(L_CTR_loop); 3211 3212 // Setup the counters 3213 __ movi(v8, __ T4S, 0); 3214 __ movi(v9, __ T4S, 1); 3215 __ ins(v8, __ S, v9, 2, 2); // v8 contains { 0, 1 } 3216 3217 for (int i = 0; i < bulk_width; i++) { 3218 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i); 3219 __ rev64(v0_ofs, __ T16B, v16); 3220 be_add_128_64(v16, v16, v8, /*tmp*/v9); 3221 } 3222 3223 __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16)); 3224 3225 // Encrypt the counters 3226 __ aesecb_encrypt(noreg, noreg, keylen, v0, bulk_width); 3227 3228 if (bulk_width == 8) { 3229 __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16)); 3230 } 3231 3232 // XOR the encrypted counters with the inputs 3233 for (int i = 0; i < bulk_width; i++) { 3234 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i); 3235 FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i); 3236 __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs); 3237 } 3238 3239 // Write the encrypted data 3240 __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16)); 3241 if (bulk_width == 8) { 3242 __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16)); 3243 } 3244 3245 __ subw(len, len, 16 * bulk_width); 3246 __ cbnzw(len, L_CTR_loop); 3247 } 3248 3249 // Save the counter back where it goes 3250 __ rev64(v16, __ T16B, v16); 3251 __ st1(v16, __ T16B, counter); 3252 3253 __ pop(saved_regs, sp); 3254 3255 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16)); 3256 if (bulk_width == 8) { 3257 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16)); 3258 } 3259 3260 __ andr(rscratch1, len, -16 * bulk_width); 3261 __ sub(len, len, rscratch1); 3262 __ add(offset, offset, rscratch1); 3263 __ mov(used, 16); 3264 __ strw(used, Address(used_ptr)); 3265 __ b(large_block_return); 3266 3267 return start; 3268 } 3269 3270 // Vector AES Galois Counter Mode implementation. 
Parameters: 3271 // 3272 // in = c_rarg0 3273 // len = c_rarg1 3274 // ct = c_rarg2 - ciphertext that ghash will read (in for encrypt, out for decrypt) 3275 // out = c_rarg3 3276 // key = c_rarg4 3277 // state = c_rarg5 - GHASH.state 3278 // subkeyHtbl = c_rarg6 - powers of H 3279 // counter = c_rarg7 - 16 bytes of CTR 3280 // return - number of processed bytes 3281 address generate_galoisCounterMode_AESCrypt() { 3282 address ghash_polynomial = __ pc(); 3283 __ emit_int64(0x87); // The low-order bits of the field 3284 // polynomial (i.e. p = z^7+z^2+z+1) 3285 // repeated in the low and high parts of a 3286 // 128-bit vector 3287 __ emit_int64(0x87); 3288 3289 __ align(CodeEntryAlignment); 3290 StubCodeMark mark(this, "StubRoutines", "galoisCounterMode_AESCrypt"); 3291 address start = __ pc(); 3292 __ enter(); 3293 3294 const Register in = c_rarg0; 3295 const Register len = c_rarg1; 3296 const Register ct = c_rarg2; 3297 const Register out = c_rarg3; 3298 // and updated with the incremented counter in the end 3299 3300 const Register key = c_rarg4; 3301 const Register state = c_rarg5; 3302 3303 const Register subkeyHtbl = c_rarg6; 3304 3305 const Register counter = c_rarg7; 3306 3307 const Register keylen = r10; 3308 // Save state before entering routine 3309 __ sub(sp, sp, 4 * 16); 3310 __ st1(v12, v13, v14, v15, __ T16B, Address(sp)); 3311 __ sub(sp, sp, 4 * 16); 3312 __ st1(v8, v9, v10, v11, __ T16B, Address(sp)); 3313 3314 // __ andr(len, len, -512); 3315 __ andr(len, len, -16 * 8); // 8 encryptions, 16 bytes per encryption 3316 __ str(len, __ pre(sp, -2 * wordSize)); 3317 3318 Label DONE; 3319 __ cbz(len, DONE); 3320 3321 // Compute #rounds for AES based on the length of the key array 3322 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 3323 3324 __ aesenc_loadkeys(key, keylen); 3325 __ ld1(v0, __ T16B, counter); // v0 contains the first counter 3326 __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter 3327 3328 // AES/CTR loop 3329 { 3330 Label L_CTR_loop; 3331 __ BIND(L_CTR_loop); 3332 3333 // Setup the counters 3334 __ movi(v8, __ T4S, 0); 3335 __ movi(v9, __ T4S, 1); 3336 __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 } 3337 3338 assert(v0->encoding() < v8->encoding(), ""); 3339 for (int i = v0->encoding(); i < v8->encoding(); i++) { 3340 FloatRegister f = as_FloatRegister(i); 3341 __ rev32(f, __ T16B, v16); 3342 __ addv(v16, __ T4S, v16, v8); 3343 } 3344 3345 __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16)); 3346 3347 // Encrypt the counters 3348 __ aesecb_encrypt(noreg, noreg, keylen, v0, /*unrolls*/8); 3349 3350 __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16)); 3351 3352 // XOR the encrypted counters with the inputs 3353 for (int i = 0; i < 8; i++) { 3354 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i); 3355 FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i); 3356 __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs); 3357 } 3358 __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16)); 3359 __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16)); 3360 3361 __ subw(len, len, 16 * 8); 3362 __ cbnzw(len, L_CTR_loop); 3363 } 3364 3365 __ rev32(v16, __ T16B, v16); 3366 __ st1(v16, __ T16B, counter); 3367 3368 __ ldr(len, Address(sp)); 3369 __ lsr(len, len, exact_log2(16)); // We want the count of blocks 3370 3371 // GHASH/CTR loop 3372 __ ghash_processBlocks_wide(ghash_polynomial, state, subkeyHtbl, ct, 3373 len, /*unrolls*/4); 3374 3375 #ifdef ASSERT 3376 { Label L; 3377 __ 
cmp(len, (unsigned char)0); 3378 __ br(Assembler::EQ, L); 3379 __ stop("stubGenerator: abort"); 3380 __ bind(L); 3381 } 3382 #endif 3383 3384 __ bind(DONE); 3385 // Return the number of bytes processed 3386 __ ldr(r0, __ post(sp, 2 * wordSize)); 3387 3388 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16)); 3389 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16)); 3390 3391 __ leave(); // required for proper stackwalking of RuntimeStub frame 3392 __ ret(lr); 3393 return start; 3394 } 3395 3396 class Cached64Bytes { 3397 private: 3398 MacroAssembler *_masm; 3399 Register _regs[8]; 3400 3401 public: 3402 Cached64Bytes(MacroAssembler *masm, RegSet rs): _masm(masm) { 3403 assert(rs.size() == 8, "%u registers are used to cache 16 4-byte data", rs.size()); 3404 auto it = rs.begin(); 3405 for (auto &r: _regs) { 3406 r = *it; 3407 ++it; 3408 } 3409 } 3410 3411 void gen_loads(Register base) { 3412 for (int i = 0; i < 8; i += 2) { 3413 __ ldp(_regs[i], _regs[i + 1], Address(base, 8 * i)); 3414 } 3415 } 3416 3417 // Generate code extracting i-th unsigned word (4 bytes) from cached 64 bytes. 3418 void extract_u32(Register dest, int i) { 3419 __ ubfx(dest, _regs[i / 2], 32 * (i % 2), 32); 3420 } 3421 }; 3422 3423 // Utility routines for md5. 3424 // Clobbers r10 and r11. 3425 void md5_FF(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4, 3426 int k, int s, int t) { 3427 Register rscratch3 = r10; 3428 Register rscratch4 = r11; 3429 3430 __ eorw(rscratch3, r3, r4); 3431 __ movw(rscratch2, t); 3432 __ andw(rscratch3, rscratch3, r2); 3433 __ addw(rscratch4, r1, rscratch2); 3434 reg_cache.extract_u32(rscratch1, k); 3435 __ eorw(rscratch3, rscratch3, r4); 3436 __ addw(rscratch4, rscratch4, rscratch1); 3437 __ addw(rscratch3, rscratch3, rscratch4); 3438 __ rorw(rscratch2, rscratch3, 32 - s); 3439 __ addw(r1, rscratch2, r2); 3440 } 3441 3442 void md5_GG(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4, 3443 int k, int s, int t) { 3444 Register rscratch3 = r10; 3445 Register rscratch4 = r11; 3446 3447 __ andw(rscratch3, r2, r4); 3448 __ bicw(rscratch4, r3, r4); 3449 reg_cache.extract_u32(rscratch1, k); 3450 __ movw(rscratch2, t); 3451 __ orrw(rscratch3, rscratch3, rscratch4); 3452 __ addw(rscratch4, r1, rscratch2); 3453 __ addw(rscratch4, rscratch4, rscratch1); 3454 __ addw(rscratch3, rscratch3, rscratch4); 3455 __ rorw(rscratch2, rscratch3, 32 - s); 3456 __ addw(r1, rscratch2, r2); 3457 } 3458 3459 void md5_HH(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4, 3460 int k, int s, int t) { 3461 Register rscratch3 = r10; 3462 Register rscratch4 = r11; 3463 3464 __ eorw(rscratch3, r3, r4); 3465 __ movw(rscratch2, t); 3466 __ addw(rscratch4, r1, rscratch2); 3467 reg_cache.extract_u32(rscratch1, k); 3468 __ eorw(rscratch3, rscratch3, r2); 3469 __ addw(rscratch4, rscratch4, rscratch1); 3470 __ addw(rscratch3, rscratch3, rscratch4); 3471 __ rorw(rscratch2, rscratch3, 32 - s); 3472 __ addw(r1, rscratch2, r2); 3473 } 3474 3475 void md5_II(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4, 3476 int k, int s, int t) { 3477 Register rscratch3 = r10; 3478 Register rscratch4 = r11; 3479 3480 __ movw(rscratch3, t); 3481 __ ornw(rscratch2, r2, r4); 3482 __ addw(rscratch4, r1, rscratch3); 3483 reg_cache.extract_u32(rscratch1, k); 3484 __ eorw(rscratch3, rscratch2, r3); 3485 __ addw(rscratch4, rscratch4, rscratch1); 3486 __ addw(rscratch3, rscratch3, rscratch4); 3487 __ rorw(rscratch2, rscratch3, 32 - s); 3488 __ 
addw(r1, rscratch2, r2); 3489 } 3490 3491 // Arguments: 3492 // 3493 // Inputs: 3494 // c_rarg0 - byte[] source+offset 3495 // c_rarg1 - int[] SHA.state 3496 // c_rarg2 - int offset 3497 // c_rarg3 - int limit 3498 // 3499 address generate_md5_implCompress(bool multi_block, const char *name) { 3500 __ align(CodeEntryAlignment); 3501 StubCodeMark mark(this, "StubRoutines", name); 3502 address start = __ pc(); 3503 3504 Register buf = c_rarg0; 3505 Register state = c_rarg1; 3506 Register ofs = c_rarg2; 3507 Register limit = c_rarg3; 3508 Register a = r4; 3509 Register b = r5; 3510 Register c = r6; 3511 Register d = r7; 3512 Register rscratch3 = r10; 3513 Register rscratch4 = r11; 3514 3515 Register state_regs[2] = { r12, r13 }; 3516 RegSet saved_regs = RegSet::range(r16, r22) - r18_tls; 3517 Cached64Bytes reg_cache(_masm, RegSet::of(r14, r15) + saved_regs); // using 8 registers 3518 3519 __ push(saved_regs, sp); 3520 3521 __ ldp(state_regs[0], state_regs[1], Address(state)); 3522 __ ubfx(a, state_regs[0], 0, 32); 3523 __ ubfx(b, state_regs[0], 32, 32); 3524 __ ubfx(c, state_regs[1], 0, 32); 3525 __ ubfx(d, state_regs[1], 32, 32); 3526 3527 Label md5_loop; 3528 __ BIND(md5_loop); 3529 3530 reg_cache.gen_loads(buf); 3531 3532 // Round 1 3533 md5_FF(reg_cache, a, b, c, d, 0, 7, 0xd76aa478); 3534 md5_FF(reg_cache, d, a, b, c, 1, 12, 0xe8c7b756); 3535 md5_FF(reg_cache, c, d, a, b, 2, 17, 0x242070db); 3536 md5_FF(reg_cache, b, c, d, a, 3, 22, 0xc1bdceee); 3537 md5_FF(reg_cache, a, b, c, d, 4, 7, 0xf57c0faf); 3538 md5_FF(reg_cache, d, a, b, c, 5, 12, 0x4787c62a); 3539 md5_FF(reg_cache, c, d, a, b, 6, 17, 0xa8304613); 3540 md5_FF(reg_cache, b, c, d, a, 7, 22, 0xfd469501); 3541 md5_FF(reg_cache, a, b, c, d, 8, 7, 0x698098d8); 3542 md5_FF(reg_cache, d, a, b, c, 9, 12, 0x8b44f7af); 3543 md5_FF(reg_cache, c, d, a, b, 10, 17, 0xffff5bb1); 3544 md5_FF(reg_cache, b, c, d, a, 11, 22, 0x895cd7be); 3545 md5_FF(reg_cache, a, b, c, d, 12, 7, 0x6b901122); 3546 md5_FF(reg_cache, d, a, b, c, 13, 12, 0xfd987193); 3547 md5_FF(reg_cache, c, d, a, b, 14, 17, 0xa679438e); 3548 md5_FF(reg_cache, b, c, d, a, 15, 22, 0x49b40821); 3549 3550 // Round 2 3551 md5_GG(reg_cache, a, b, c, d, 1, 5, 0xf61e2562); 3552 md5_GG(reg_cache, d, a, b, c, 6, 9, 0xc040b340); 3553 md5_GG(reg_cache, c, d, a, b, 11, 14, 0x265e5a51); 3554 md5_GG(reg_cache, b, c, d, a, 0, 20, 0xe9b6c7aa); 3555 md5_GG(reg_cache, a, b, c, d, 5, 5, 0xd62f105d); 3556 md5_GG(reg_cache, d, a, b, c, 10, 9, 0x02441453); 3557 md5_GG(reg_cache, c, d, a, b, 15, 14, 0xd8a1e681); 3558 md5_GG(reg_cache, b, c, d, a, 4, 20, 0xe7d3fbc8); 3559 md5_GG(reg_cache, a, b, c, d, 9, 5, 0x21e1cde6); 3560 md5_GG(reg_cache, d, a, b, c, 14, 9, 0xc33707d6); 3561 md5_GG(reg_cache, c, d, a, b, 3, 14, 0xf4d50d87); 3562 md5_GG(reg_cache, b, c, d, a, 8, 20, 0x455a14ed); 3563 md5_GG(reg_cache, a, b, c, d, 13, 5, 0xa9e3e905); 3564 md5_GG(reg_cache, d, a, b, c, 2, 9, 0xfcefa3f8); 3565 md5_GG(reg_cache, c, d, a, b, 7, 14, 0x676f02d9); 3566 md5_GG(reg_cache, b, c, d, a, 12, 20, 0x8d2a4c8a); 3567 3568 // Round 3 3569 md5_HH(reg_cache, a, b, c, d, 5, 4, 0xfffa3942); 3570 md5_HH(reg_cache, d, a, b, c, 8, 11, 0x8771f681); 3571 md5_HH(reg_cache, c, d, a, b, 11, 16, 0x6d9d6122); 3572 md5_HH(reg_cache, b, c, d, a, 14, 23, 0xfde5380c); 3573 md5_HH(reg_cache, a, b, c, d, 1, 4, 0xa4beea44); 3574 md5_HH(reg_cache, d, a, b, c, 4, 11, 0x4bdecfa9); 3575 md5_HH(reg_cache, c, d, a, b, 7, 16, 0xf6bb4b60); 3576 md5_HH(reg_cache, b, c, d, a, 10, 23, 0xbebfbc70); 3577 md5_HH(reg_cache, a, b, c, d, 13, 4, 0x289b7ec6); 
3578 md5_HH(reg_cache, d, a, b, c, 0, 11, 0xeaa127fa); 3579 md5_HH(reg_cache, c, d, a, b, 3, 16, 0xd4ef3085); 3580 md5_HH(reg_cache, b, c, d, a, 6, 23, 0x04881d05); 3581 md5_HH(reg_cache, a, b, c, d, 9, 4, 0xd9d4d039); 3582 md5_HH(reg_cache, d, a, b, c, 12, 11, 0xe6db99e5); 3583 md5_HH(reg_cache, c, d, a, b, 15, 16, 0x1fa27cf8); 3584 md5_HH(reg_cache, b, c, d, a, 2, 23, 0xc4ac5665); 3585 3586 // Round 4 3587 md5_II(reg_cache, a, b, c, d, 0, 6, 0xf4292244); 3588 md5_II(reg_cache, d, a, b, c, 7, 10, 0x432aff97); 3589 md5_II(reg_cache, c, d, a, b, 14, 15, 0xab9423a7); 3590 md5_II(reg_cache, b, c, d, a, 5, 21, 0xfc93a039); 3591 md5_II(reg_cache, a, b, c, d, 12, 6, 0x655b59c3); 3592 md5_II(reg_cache, d, a, b, c, 3, 10, 0x8f0ccc92); 3593 md5_II(reg_cache, c, d, a, b, 10, 15, 0xffeff47d); 3594 md5_II(reg_cache, b, c, d, a, 1, 21, 0x85845dd1); 3595 md5_II(reg_cache, a, b, c, d, 8, 6, 0x6fa87e4f); 3596 md5_II(reg_cache, d, a, b, c, 15, 10, 0xfe2ce6e0); 3597 md5_II(reg_cache, c, d, a, b, 6, 15, 0xa3014314); 3598 md5_II(reg_cache, b, c, d, a, 13, 21, 0x4e0811a1); 3599 md5_II(reg_cache, a, b, c, d, 4, 6, 0xf7537e82); 3600 md5_II(reg_cache, d, a, b, c, 11, 10, 0xbd3af235); 3601 md5_II(reg_cache, c, d, a, b, 2, 15, 0x2ad7d2bb); 3602 md5_II(reg_cache, b, c, d, a, 9, 21, 0xeb86d391); 3603 3604 __ addw(a, state_regs[0], a); 3605 __ ubfx(rscratch2, state_regs[0], 32, 32); 3606 __ addw(b, rscratch2, b); 3607 __ addw(c, state_regs[1], c); 3608 __ ubfx(rscratch4, state_regs[1], 32, 32); 3609 __ addw(d, rscratch4, d); 3610 3611 __ orr(state_regs[0], a, b, Assembler::LSL, 32); 3612 __ orr(state_regs[1], c, d, Assembler::LSL, 32); 3613 3614 if (multi_block) { 3615 __ add(buf, buf, 64); 3616 __ add(ofs, ofs, 64); 3617 __ cmp(ofs, limit); 3618 __ br(Assembler::LE, md5_loop); 3619 __ mov(c_rarg0, ofs); // return ofs 3620 } 3621 3622 // write hash values back in the correct order 3623 __ stp(state_regs[0], state_regs[1], Address(state)); 3624 3625 __ pop(saved_regs, sp); 3626 3627 __ ret(lr); 3628 3629 return start; 3630 } 3631 3632 // Arguments: 3633 // 3634 // Inputs: 3635 // c_rarg0 - byte[] source+offset 3636 // c_rarg1 - int[] SHA.state 3637 // c_rarg2 - int offset 3638 // c_rarg3 - int limit 3639 // 3640 address generate_sha1_implCompress(bool multi_block, const char *name) { 3641 __ align(CodeEntryAlignment); 3642 StubCodeMark mark(this, "StubRoutines", name); 3643 address start = __ pc(); 3644 3645 Register buf = c_rarg0; 3646 Register state = c_rarg1; 3647 Register ofs = c_rarg2; 3648 Register limit = c_rarg3; 3649 3650 Label keys; 3651 Label sha1_loop; 3652 3653 // load the keys into v0..v3 3654 __ adr(rscratch1, keys); 3655 __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1)); 3656 // load 5 words state into v6, v7 3657 __ ldrq(v6, Address(state, 0)); 3658 __ ldrs(v7, Address(state, 16)); 3659 3660 3661 __ BIND(sha1_loop); 3662 // load 64 bytes of data into v16..v19 3663 __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf); 3664 __ rev32(v16, __ T16B, v16); 3665 __ rev32(v17, __ T16B, v17); 3666 __ rev32(v18, __ T16B, v18); 3667 __ rev32(v19, __ T16B, v19); 3668 3669 // do the sha1 3670 __ addv(v4, __ T4S, v16, v0); 3671 __ orr(v20, __ T16B, v6, v6); 3672 3673 FloatRegister d0 = v16; 3674 FloatRegister d1 = v17; 3675 FloatRegister d2 = v18; 3676 FloatRegister d3 = v19; 3677 3678 for (int round = 0; round < 20; round++) { 3679 FloatRegister tmp1 = (round & 1) ? v4 : v5; 3680 FloatRegister tmp2 = (round & 1) ? v21 : v22; 3681 FloatRegister tmp3 = round ? ((round & 1) ? 
v22 : v21) : v7; 3682 FloatRegister tmp4 = (round & 1) ? v5 : v4; 3683 FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3)); 3684 3685 if (round < 16) __ sha1su0(d0, __ T4S, d1, d2); 3686 if (round < 19) __ addv(tmp1, __ T4S, d1, key); 3687 __ sha1h(tmp2, __ T4S, v20); 3688 if (round < 5) 3689 __ sha1c(v20, __ T4S, tmp3, tmp4); 3690 else if (round < 10 || round >= 15) 3691 __ sha1p(v20, __ T4S, tmp3, tmp4); 3692 else 3693 __ sha1m(v20, __ T4S, tmp3, tmp4); 3694 if (round < 16) __ sha1su1(d0, __ T4S, d3); 3695 3696 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 3697 } 3698 3699 __ addv(v7, __ T2S, v7, v21); 3700 __ addv(v6, __ T4S, v6, v20); 3701 3702 if (multi_block) { 3703 __ add(ofs, ofs, 64); 3704 __ cmp(ofs, limit); 3705 __ br(Assembler::LE, sha1_loop); 3706 __ mov(c_rarg0, ofs); // return ofs 3707 } 3708 3709 __ strq(v6, Address(state, 0)); 3710 __ strs(v7, Address(state, 16)); 3711 3712 __ ret(lr); 3713 3714 __ bind(keys); 3715 __ emit_int32(0x5a827999); 3716 __ emit_int32(0x6ed9eba1); 3717 __ emit_int32(0x8f1bbcdc); 3718 __ emit_int32(0xca62c1d6); 3719 3720 return start; 3721 } 3722 3723 3724 // Arguments: 3725 // 3726 // Inputs: 3727 // c_rarg0 - byte[] source+offset 3728 // c_rarg1 - int[] SHA.state 3729 // c_rarg2 - int offset 3730 // c_rarg3 - int limit 3731 // 3732 address generate_sha256_implCompress(bool multi_block, const char *name) { 3733 static const uint32_t round_consts[64] = { 3734 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 3735 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, 3736 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 3737 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 3738 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 3739 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, 3740 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 3741 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, 3742 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 3743 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 3744 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 3745 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, 3746 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 3747 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, 3748 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 3749 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, 3750 }; 3751 __ align(CodeEntryAlignment); 3752 StubCodeMark mark(this, "StubRoutines", name); 3753 address start = __ pc(); 3754 3755 Register buf = c_rarg0; 3756 Register state = c_rarg1; 3757 Register ofs = c_rarg2; 3758 Register limit = c_rarg3; 3759 3760 Label sha1_loop; 3761 3762 __ stpd(v8, v9, __ pre(sp, -32)); 3763 __ stpd(v10, v11, Address(sp, 16)); 3764 3765 // dga == v0 3766 // dgb == v1 3767 // dg0 == v2 3768 // dg1 == v3 3769 // dg2 == v4 3770 // t0 == v6 3771 // t1 == v7 3772 3773 // load 16 keys to v16..v31 3774 __ lea(rscratch1, ExternalAddress((address)round_consts)); 3775 __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64)); 3776 __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64)); 3777 __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64)); 3778 __ ld1(v28, v29, v30, v31, __ T4S, rscratch1); 3779 3780 // load 8 words (256 bits) state 3781 __ ldpq(v0, v1, state); 3782 3783 __ BIND(sha1_loop); 3784 // load 64 bytes of data into v8..v11 3785 __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? 
__ post(buf, 64) : buf); 3786 __ rev32(v8, __ T16B, v8); 3787 __ rev32(v9, __ T16B, v9); 3788 __ rev32(v10, __ T16B, v10); 3789 __ rev32(v11, __ T16B, v11); 3790 3791 __ addv(v6, __ T4S, v8, v16); 3792 __ orr(v2, __ T16B, v0, v0); 3793 __ orr(v3, __ T16B, v1, v1); 3794 3795 FloatRegister d0 = v8; 3796 FloatRegister d1 = v9; 3797 FloatRegister d2 = v10; 3798 FloatRegister d3 = v11; 3799 3800 3801 for (int round = 0; round < 16; round++) { 3802 FloatRegister tmp1 = (round & 1) ? v6 : v7; 3803 FloatRegister tmp2 = (round & 1) ? v7 : v6; 3804 FloatRegister tmp3 = (round & 1) ? v2 : v4; 3805 FloatRegister tmp4 = (round & 1) ? v4 : v2; 3806 3807 if (round < 12) __ sha256su0(d0, __ T4S, d1); 3808 __ orr(v4, __ T16B, v2, v2); 3809 if (round < 15) 3810 __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17)); 3811 __ sha256h(v2, __ T4S, v3, tmp2); 3812 __ sha256h2(v3, __ T4S, v4, tmp2); 3813 if (round < 12) __ sha256su1(d0, __ T4S, d2, d3); 3814 3815 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 3816 } 3817 3818 __ addv(v0, __ T4S, v0, v2); 3819 __ addv(v1, __ T4S, v1, v3); 3820 3821 if (multi_block) { 3822 __ add(ofs, ofs, 64); 3823 __ cmp(ofs, limit); 3824 __ br(Assembler::LE, sha1_loop); 3825 __ mov(c_rarg0, ofs); // return ofs 3826 } 3827 3828 __ ldpd(v10, v11, Address(sp, 16)); 3829 __ ldpd(v8, v9, __ post(sp, 32)); 3830 3831 __ stpq(v0, v1, state); 3832 3833 __ ret(lr); 3834 3835 return start; 3836 } 3837 3838 // Double rounds for sha512. 3839 void sha512_dround(int dr, 3840 FloatRegister vi0, FloatRegister vi1, 3841 FloatRegister vi2, FloatRegister vi3, 3842 FloatRegister vi4, FloatRegister vrc0, 3843 FloatRegister vrc1, FloatRegister vin0, 3844 FloatRegister vin1, FloatRegister vin2, 3845 FloatRegister vin3, FloatRegister vin4) { 3846 if (dr < 36) { 3847 __ ld1(vrc1, __ T2D, __ post(rscratch2, 16)); 3848 } 3849 __ addv(v5, __ T2D, vrc0, vin0); 3850 __ ext(v6, __ T16B, vi2, vi3, 8); 3851 __ ext(v5, __ T16B, v5, v5, 8); 3852 __ ext(v7, __ T16B, vi1, vi2, 8); 3853 __ addv(vi3, __ T2D, vi3, v5); 3854 if (dr < 32) { 3855 __ ext(v5, __ T16B, vin3, vin4, 8); 3856 __ sha512su0(vin0, __ T2D, vin1); 3857 } 3858 __ sha512h(vi3, __ T2D, v6, v7); 3859 if (dr < 32) { 3860 __ sha512su1(vin0, __ T2D, vin2, v5); 3861 } 3862 __ addv(vi4, __ T2D, vi1, vi3); 3863 __ sha512h2(vi3, __ T2D, vi1, vi0); 3864 } 3865 3866 // Arguments: 3867 // 3868 // Inputs: 3869 // c_rarg0 - byte[] source+offset 3870 // c_rarg1 - int[] SHA.state 3871 // c_rarg2 - int offset 3872 // c_rarg3 - int limit 3873 // 3874 address generate_sha512_implCompress(bool multi_block, const char *name) { 3875 static const uint64_t round_consts[80] = { 3876 0x428A2F98D728AE22L, 0x7137449123EF65CDL, 0xB5C0FBCFEC4D3B2FL, 3877 0xE9B5DBA58189DBBCL, 0x3956C25BF348B538L, 0x59F111F1B605D019L, 3878 0x923F82A4AF194F9BL, 0xAB1C5ED5DA6D8118L, 0xD807AA98A3030242L, 3879 0x12835B0145706FBEL, 0x243185BE4EE4B28CL, 0x550C7DC3D5FFB4E2L, 3880 0x72BE5D74F27B896FL, 0x80DEB1FE3B1696B1L, 0x9BDC06A725C71235L, 3881 0xC19BF174CF692694L, 0xE49B69C19EF14AD2L, 0xEFBE4786384F25E3L, 3882 0x0FC19DC68B8CD5B5L, 0x240CA1CC77AC9C65L, 0x2DE92C6F592B0275L, 3883 0x4A7484AA6EA6E483L, 0x5CB0A9DCBD41FBD4L, 0x76F988DA831153B5L, 3884 0x983E5152EE66DFABL, 0xA831C66D2DB43210L, 0xB00327C898FB213FL, 3885 0xBF597FC7BEEF0EE4L, 0xC6E00BF33DA88FC2L, 0xD5A79147930AA725L, 3886 0x06CA6351E003826FL, 0x142929670A0E6E70L, 0x27B70A8546D22FFCL, 3887 0x2E1B21385C26C926L, 0x4D2C6DFC5AC42AEDL, 0x53380D139D95B3DFL, 3888 0x650A73548BAF63DEL, 0x766A0ABB3C77B2A8L, 0x81C2C92E47EDAEE6L, 3889 0x92722C851482353BL, 
0xA2BFE8A14CF10364L, 0xA81A664BBC423001L, 3890 0xC24B8B70D0F89791L, 0xC76C51A30654BE30L, 0xD192E819D6EF5218L, 3891 0xD69906245565A910L, 0xF40E35855771202AL, 0x106AA07032BBD1B8L, 3892 0x19A4C116B8D2D0C8L, 0x1E376C085141AB53L, 0x2748774CDF8EEB99L, 3893 0x34B0BCB5E19B48A8L, 0x391C0CB3C5C95A63L, 0x4ED8AA4AE3418ACBL, 3894 0x5B9CCA4F7763E373L, 0x682E6FF3D6B2B8A3L, 0x748F82EE5DEFB2FCL, 3895 0x78A5636F43172F60L, 0x84C87814A1F0AB72L, 0x8CC702081A6439ECL, 3896 0x90BEFFFA23631E28L, 0xA4506CEBDE82BDE9L, 0xBEF9A3F7B2C67915L, 3897 0xC67178F2E372532BL, 0xCA273ECEEA26619CL, 0xD186B8C721C0C207L, 3898 0xEADA7DD6CDE0EB1EL, 0xF57D4F7FEE6ED178L, 0x06F067AA72176FBAL, 3899 0x0A637DC5A2C898A6L, 0x113F9804BEF90DAEL, 0x1B710B35131C471BL, 3900 0x28DB77F523047D84L, 0x32CAAB7B40C72493L, 0x3C9EBE0A15C9BEBCL, 3901 0x431D67C49C100D4CL, 0x4CC5D4BECB3E42B6L, 0x597F299CFC657E2AL, 3902 0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L 3903 }; 3904 3905 __ align(CodeEntryAlignment); 3906 StubCodeMark mark(this, "StubRoutines", name); 3907 address start = __ pc(); 3908 3909 Register buf = c_rarg0; 3910 Register state = c_rarg1; 3911 Register ofs = c_rarg2; 3912 Register limit = c_rarg3; 3913 3914 __ stpd(v8, v9, __ pre(sp, -64)); 3915 __ stpd(v10, v11, Address(sp, 16)); 3916 __ stpd(v12, v13, Address(sp, 32)); 3917 __ stpd(v14, v15, Address(sp, 48)); 3918 3919 Label sha512_loop; 3920 3921 // load state 3922 __ ld1(v8, v9, v10, v11, __ T2D, state); 3923 3924 // load first 4 round constants 3925 __ lea(rscratch1, ExternalAddress((address)round_consts)); 3926 __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64)); 3927 3928 __ BIND(sha512_loop); 3929 // load 128B of data into v12..v19 3930 __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64)); 3931 __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64)); 3932 __ rev64(v12, __ T16B, v12); 3933 __ rev64(v13, __ T16B, v13); 3934 __ rev64(v14, __ T16B, v14); 3935 __ rev64(v15, __ T16B, v15); 3936 __ rev64(v16, __ T16B, v16); 3937 __ rev64(v17, __ T16B, v17); 3938 __ rev64(v18, __ T16B, v18); 3939 __ rev64(v19, __ T16B, v19); 3940 3941 __ mov(rscratch2, rscratch1); 3942 3943 __ mov(v0, __ T16B, v8); 3944 __ mov(v1, __ T16B, v9); 3945 __ mov(v2, __ T16B, v10); 3946 __ mov(v3, __ T16B, v11); 3947 3948 sha512_dround( 0, v0, v1, v2, v3, v4, v20, v24, v12, v13, v19, v16, v17); 3949 sha512_dround( 1, v3, v0, v4, v2, v1, v21, v25, v13, v14, v12, v17, v18); 3950 sha512_dround( 2, v2, v3, v1, v4, v0, v22, v26, v14, v15, v13, v18, v19); 3951 sha512_dround( 3, v4, v2, v0, v1, v3, v23, v27, v15, v16, v14, v19, v12); 3952 sha512_dround( 4, v1, v4, v3, v0, v2, v24, v28, v16, v17, v15, v12, v13); 3953 sha512_dround( 5, v0, v1, v2, v3, v4, v25, v29, v17, v18, v16, v13, v14); 3954 sha512_dround( 6, v3, v0, v4, v2, v1, v26, v30, v18, v19, v17, v14, v15); 3955 sha512_dround( 7, v2, v3, v1, v4, v0, v27, v31, v19, v12, v18, v15, v16); 3956 sha512_dround( 8, v4, v2, v0, v1, v3, v28, v24, v12, v13, v19, v16, v17); 3957 sha512_dround( 9, v1, v4, v3, v0, v2, v29, v25, v13, v14, v12, v17, v18); 3958 sha512_dround(10, v0, v1, v2, v3, v4, v30, v26, v14, v15, v13, v18, v19); 3959 sha512_dround(11, v3, v0, v4, v2, v1, v31, v27, v15, v16, v14, v19, v12); 3960 sha512_dround(12, v2, v3, v1, v4, v0, v24, v28, v16, v17, v15, v12, v13); 3961 sha512_dround(13, v4, v2, v0, v1, v3, v25, v29, v17, v18, v16, v13, v14); 3962 sha512_dround(14, v1, v4, v3, v0, v2, v26, v30, v18, v19, v17, v14, v15); 3963 sha512_dround(15, v0, v1, v2, v3, v4, v27, v31, v19, v12, v18, v15, v16); 3964 sha512_dround(16, v3, v0, v4, v2, v1, v28, v24, v12, 
v13, v19, v16, v17); 3965 sha512_dround(17, v2, v3, v1, v4, v0, v29, v25, v13, v14, v12, v17, v18); 3966 sha512_dround(18, v4, v2, v0, v1, v3, v30, v26, v14, v15, v13, v18, v19); 3967 sha512_dround(19, v1, v4, v3, v0, v2, v31, v27, v15, v16, v14, v19, v12); 3968 sha512_dround(20, v0, v1, v2, v3, v4, v24, v28, v16, v17, v15, v12, v13); 3969 sha512_dround(21, v3, v0, v4, v2, v1, v25, v29, v17, v18, v16, v13, v14); 3970 sha512_dround(22, v2, v3, v1, v4, v0, v26, v30, v18, v19, v17, v14, v15); 3971 sha512_dround(23, v4, v2, v0, v1, v3, v27, v31, v19, v12, v18, v15, v16); 3972 sha512_dround(24, v1, v4, v3, v0, v2, v28, v24, v12, v13, v19, v16, v17); 3973 sha512_dround(25, v0, v1, v2, v3, v4, v29, v25, v13, v14, v12, v17, v18); 3974 sha512_dround(26, v3, v0, v4, v2, v1, v30, v26, v14, v15, v13, v18, v19); 3975 sha512_dround(27, v2, v3, v1, v4, v0, v31, v27, v15, v16, v14, v19, v12); 3976 sha512_dround(28, v4, v2, v0, v1, v3, v24, v28, v16, v17, v15, v12, v13); 3977 sha512_dround(29, v1, v4, v3, v0, v2, v25, v29, v17, v18, v16, v13, v14); 3978 sha512_dround(30, v0, v1, v2, v3, v4, v26, v30, v18, v19, v17, v14, v15); 3979 sha512_dround(31, v3, v0, v4, v2, v1, v27, v31, v19, v12, v18, v15, v16); 3980 sha512_dround(32, v2, v3, v1, v4, v0, v28, v24, v12, v0, v0, v0, v0); 3981 sha512_dround(33, v4, v2, v0, v1, v3, v29, v25, v13, v0, v0, v0, v0); 3982 sha512_dround(34, v1, v4, v3, v0, v2, v30, v26, v14, v0, v0, v0, v0); 3983 sha512_dround(35, v0, v1, v2, v3, v4, v31, v27, v15, v0, v0, v0, v0); 3984 sha512_dround(36, v3, v0, v4, v2, v1, v24, v0, v16, v0, v0, v0, v0); 3985 sha512_dround(37, v2, v3, v1, v4, v0, v25, v0, v17, v0, v0, v0, v0); 3986 sha512_dround(38, v4, v2, v0, v1, v3, v26, v0, v18, v0, v0, v0, v0); 3987 sha512_dround(39, v1, v4, v3, v0, v2, v27, v0, v19, v0, v0, v0, v0); 3988 3989 __ addv(v8, __ T2D, v8, v0); 3990 __ addv(v9, __ T2D, v9, v1); 3991 __ addv(v10, __ T2D, v10, v2); 3992 __ addv(v11, __ T2D, v11, v3); 3993 3994 if (multi_block) { 3995 __ add(ofs, ofs, 128); 3996 __ cmp(ofs, limit); 3997 __ br(Assembler::LE, sha512_loop); 3998 __ mov(c_rarg0, ofs); // return ofs 3999 } 4000 4001 __ st1(v8, v9, v10, v11, __ T2D, state); 4002 4003 __ ldpd(v14, v15, Address(sp, 48)); 4004 __ ldpd(v12, v13, Address(sp, 32)); 4005 __ ldpd(v10, v11, Address(sp, 16)); 4006 __ ldpd(v8, v9, __ post(sp, 64)); 4007 4008 __ ret(lr); 4009 4010 return start; 4011 } 4012 4013 // Arguments: 4014 // 4015 // Inputs: 4016 // c_rarg0 - byte[] source+offset 4017 // c_rarg1 - byte[] SHA.state 4018 // c_rarg2 - int block_size 4019 // c_rarg3 - int offset 4020 // c_rarg4 - int limit 4021 // 4022 address generate_sha3_implCompress(bool multi_block, const char *name) { 4023 static const uint64_t round_consts[24] = { 4024 0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL, 4025 0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L, 4026 0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL, 4027 0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL, 4028 0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L, 4029 0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L, 4030 0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L, 4031 0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L 4032 }; 4033 4034 __ align(CodeEntryAlignment); 4035 StubCodeMark mark(this, "StubRoutines", name); 4036 address start = __ pc(); 4037 4038 Register buf = c_rarg0; 4039 Register state = c_rarg1; 4040 Register block_size = c_rarg2; 4041 Register ofs = c_rarg3; 4042 Register 
limit = c_rarg4; 4043 4044 Label sha3_loop, rounds24_loop; 4045 Label sha3_512_or_sha3_384, shake128; 4046 4047 __ stpd(v8, v9, __ pre(sp, -64)); 4048 __ stpd(v10, v11, Address(sp, 16)); 4049 __ stpd(v12, v13, Address(sp, 32)); 4050 __ stpd(v14, v15, Address(sp, 48)); 4051 4052 // load state 4053 __ add(rscratch1, state, 32); 4054 __ ld1(v0, v1, v2, v3, __ T1D, state); 4055 __ ld1(v4, v5, v6, v7, __ T1D, __ post(rscratch1, 32)); 4056 __ ld1(v8, v9, v10, v11, __ T1D, __ post(rscratch1, 32)); 4057 __ ld1(v12, v13, v14, v15, __ T1D, __ post(rscratch1, 32)); 4058 __ ld1(v16, v17, v18, v19, __ T1D, __ post(rscratch1, 32)); 4059 __ ld1(v20, v21, v22, v23, __ T1D, __ post(rscratch1, 32)); 4060 __ ld1(v24, __ T1D, rscratch1); 4061 4062 __ BIND(sha3_loop); 4063 4064 // 24 keccak rounds 4065 __ movw(rscratch2, 24); 4066 4067 // load round_constants base 4068 __ lea(rscratch1, ExternalAddress((address) round_consts)); 4069 4070 // load input 4071 __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32)); 4072 __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24)); 4073 __ eor(v0, __ T8B, v0, v25); 4074 __ eor(v1, __ T8B, v1, v26); 4075 __ eor(v2, __ T8B, v2, v27); 4076 __ eor(v3, __ T8B, v3, v28); 4077 __ eor(v4, __ T8B, v4, v29); 4078 __ eor(v5, __ T8B, v5, v30); 4079 __ eor(v6, __ T8B, v6, v31); 4080 4081 // block_size == 72, SHA3-512; block_size == 104, SHA3-384 4082 __ tbz(block_size, 7, sha3_512_or_sha3_384); 4083 4084 __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32)); 4085 __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24)); 4086 __ eor(v7, __ T8B, v7, v25); 4087 __ eor(v8, __ T8B, v8, v26); 4088 __ eor(v9, __ T8B, v9, v27); 4089 __ eor(v10, __ T8B, v10, v28); 4090 __ eor(v11, __ T8B, v11, v29); 4091 __ eor(v12, __ T8B, v12, v30); 4092 __ eor(v13, __ T8B, v13, v31); 4093 4094 __ ld1(v25, v26, v27, __ T8B, __ post(buf, 24)); 4095 __ eor(v14, __ T8B, v14, v25); 4096 __ eor(v15, __ T8B, v15, v26); 4097 __ eor(v16, __ T8B, v16, v27); 4098 4099 // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256 4100 __ andw(c_rarg5, block_size, 48); 4101 __ cbzw(c_rarg5, rounds24_loop); 4102 4103 __ tbnz(block_size, 5, shake128); 4104 // block_size == 144, bit5 == 0, SHA3-224 4105 __ ldrd(v28, __ post(buf, 8)); 4106 __ eor(v17, __ T8B, v17, v28); 4107 __ b(rounds24_loop); 4108 4109 __ BIND(shake128); 4110 __ ld1(v28, v29, v30, v31, __ T8B, __ post(buf, 32)); 4111 __ eor(v17, __ T8B, v17, v28); 4112 __ eor(v18, __ T8B, v18, v29); 4113 __ eor(v19, __ T8B, v19, v30); 4114 __ eor(v20, __ T8B, v20, v31); 4115 __ b(rounds24_loop); // block_size == 168, SHAKE128 4116 4117 __ BIND(sha3_512_or_sha3_384); 4118 __ ld1(v25, v26, __ T8B, __ post(buf, 16)); 4119 __ eor(v7, __ T8B, v7, v25); 4120 __ eor(v8, __ T8B, v8, v26); 4121 __ tbz(block_size, 5, rounds24_loop); // SHA3-512 4122 4123 // SHA3-384 4124 __ ld1(v27, v28, v29, v30, __ T8B, __ post(buf, 32)); 4125 __ eor(v9, __ T8B, v9, v27); 4126 __ eor(v10, __ T8B, v10, v28); 4127 __ eor(v11, __ T8B, v11, v29); 4128 __ eor(v12, __ T8B, v12, v30); 4129 4130 __ BIND(rounds24_loop); 4131 __ subw(rscratch2, rscratch2, 1); 4132 4133 __ eor3(v29, __ T16B, v4, v9, v14); 4134 __ eor3(v26, __ T16B, v1, v6, v11); 4135 __ eor3(v28, __ T16B, v3, v8, v13); 4136 __ eor3(v25, __ T16B, v0, v5, v10); 4137 __ eor3(v27, __ T16B, v2, v7, v12); 4138 __ eor3(v29, __ T16B, v29, v19, v24); 4139 __ eor3(v26, __ T16B, v26, v16, v21); 4140 __ eor3(v28, __ T16B, v28, v18, v23); 4141 __ eor3(v25, __ T16B, v25, v15, v20); 4142 __ eor3(v27, __ T16B, v27, v17, v22); 4143 4144 __ rax1(v30, __ T2D, v29, v26);
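// (For reference: the eor3 ops above compute the Keccak theta column parities
// C[x] = A[x,0] ^ A[x,1] ^ A[x,2] ^ A[x,3] ^ A[x,4]; the rax1 above and the
// four that follow combine them into the theta D values, since
// rax1(d, T2D, a, b) computes d = a ^ rol64(b, 1). The xar ops further down
// apply D together with the rho rotations, and the bcax ops implement chi.)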
4145 __ rax1(v26, __ T2D, v26, v28); 4146 __ rax1(v28, __ T2D, v28, v25); 4147 __ rax1(v25, __ T2D, v25, v27); 4148 __ rax1(v27, __ T2D, v27, v29); 4149 4150 __ eor(v0, __ T16B, v0, v30); 4151 __ xar(v29, __ T2D, v1, v25, (64 - 1)); 4152 __ xar(v1, __ T2D, v6, v25, (64 - 44)); 4153 __ xar(v6, __ T2D, v9, v28, (64 - 20)); 4154 __ xar(v9, __ T2D, v22, v26, (64 - 61)); 4155 __ xar(v22, __ T2D, v14, v28, (64 - 39)); 4156 __ xar(v14, __ T2D, v20, v30, (64 - 18)); 4157 __ xar(v31, __ T2D, v2, v26, (64 - 62)); 4158 __ xar(v2, __ T2D, v12, v26, (64 - 43)); 4159 __ xar(v12, __ T2D, v13, v27, (64 - 25)); 4160 __ xar(v13, __ T2D, v19, v28, (64 - 8)); 4161 __ xar(v19, __ T2D, v23, v27, (64 - 56)); 4162 __ xar(v23, __ T2D, v15, v30, (64 - 41)); 4163 __ xar(v15, __ T2D, v4, v28, (64 - 27)); 4164 __ xar(v28, __ T2D, v24, v28, (64 - 14)); 4165 __ xar(v24, __ T2D, v21, v25, (64 - 2)); 4166 __ xar(v8, __ T2D, v8, v27, (64 - 55)); 4167 __ xar(v4, __ T2D, v16, v25, (64 - 45)); 4168 __ xar(v16, __ T2D, v5, v30, (64 - 36)); 4169 __ xar(v5, __ T2D, v3, v27, (64 - 28)); 4170 __ xar(v27, __ T2D, v18, v27, (64 - 21)); 4171 __ xar(v3, __ T2D, v17, v26, (64 - 15)); 4172 __ xar(v25, __ T2D, v11, v25, (64 - 10)); 4173 __ xar(v26, __ T2D, v7, v26, (64 - 6)); 4174 __ xar(v30, __ T2D, v10, v30, (64 - 3)); 4175 4176 __ bcax(v20, __ T16B, v31, v22, v8); 4177 __ bcax(v21, __ T16B, v8, v23, v22); 4178 __ bcax(v22, __ T16B, v22, v24, v23); 4179 __ bcax(v23, __ T16B, v23, v31, v24); 4180 __ bcax(v24, __ T16B, v24, v8, v31); 4181 4182 __ ld1r(v31, __ T2D, __ post(rscratch1, 8)); 4183 4184 __ bcax(v17, __ T16B, v25, v19, v3); 4185 __ bcax(v18, __ T16B, v3, v15, v19); 4186 __ bcax(v19, __ T16B, v19, v16, v15); 4187 __ bcax(v15, __ T16B, v15, v25, v16); 4188 __ bcax(v16, __ T16B, v16, v3, v25); 4189 4190 __ bcax(v10, __ T16B, v29, v12, v26); 4191 __ bcax(v11, __ T16B, v26, v13, v12); 4192 __ bcax(v12, __ T16B, v12, v14, v13); 4193 __ bcax(v13, __ T16B, v13, v29, v14); 4194 __ bcax(v14, __ T16B, v14, v26, v29); 4195 4196 __ bcax(v7, __ T16B, v30, v9, v4); 4197 __ bcax(v8, __ T16B, v4, v5, v9); 4198 __ bcax(v9, __ T16B, v9, v6, v5); 4199 __ bcax(v5, __ T16B, v5, v30, v6); 4200 __ bcax(v6, __ T16B, v6, v4, v30); 4201 4202 __ bcax(v3, __ T16B, v27, v0, v28); 4203 __ bcax(v4, __ T16B, v28, v1, v0); 4204 __ bcax(v0, __ T16B, v0, v2, v1); 4205 __ bcax(v1, __ T16B, v1, v27, v2); 4206 __ bcax(v2, __ T16B, v2, v28, v27); 4207 4208 __ eor(v0, __ T16B, v0, v31); 4209 4210 __ cbnzw(rscratch2, rounds24_loop); 4211 4212 if (multi_block) { 4213 __ add(ofs, ofs, block_size); 4214 __ cmp(ofs, limit); 4215 __ br(Assembler::LE, sha3_loop); 4216 __ mov(c_rarg0, ofs); // return ofs 4217 } 4218 4219 __ st1(v0, v1, v2, v3, __ T1D, __ post(state, 32)); 4220 __ st1(v4, v5, v6, v7, __ T1D, __ post(state, 32)); 4221 __ st1(v8, v9, v10, v11, __ T1D, __ post(state, 32)); 4222 __ st1(v12, v13, v14, v15, __ T1D, __ post(state, 32)); 4223 __ st1(v16, v17, v18, v19, __ T1D, __ post(state, 32)); 4224 __ st1(v20, v21, v22, v23, __ T1D, __ post(state, 32)); 4225 __ st1(v24, __ T1D, state); 4226 4227 __ ldpd(v14, v15, Address(sp, 48)); 4228 __ ldpd(v12, v13, Address(sp, 32)); 4229 __ ldpd(v10, v11, Address(sp, 16)); 4230 __ ldpd(v8, v9, __ post(sp, 64)); 4231 4232 __ ret(lr); 4233 4234 return start; 4235 } 4236 4237 /** 4238 * Arguments: 4239 * 4240 * Inputs: 4241 * c_rarg0 - int crc 4242 * c_rarg1 - byte* buf 4243 * c_rarg2 - int length 4244 * 4245 * Output: 4246 * rax - int crc result 4247 */ 4248 address generate_updateBytesCRC32() { 4249 assert(UseCRC32Intrinsics, 
"what are we doing here?"); 4250 4251 __ align(CodeEntryAlignment); 4252 StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32"); 4253 4254 address start = __ pc(); 4255 4256 const Register crc = c_rarg0; // crc 4257 const Register buf = c_rarg1; // source java byte array address 4258 const Register len = c_rarg2; // length 4259 const Register table0 = c_rarg3; // crc_table address 4260 const Register table1 = c_rarg4; 4261 const Register table2 = c_rarg5; 4262 const Register table3 = c_rarg6; 4263 const Register tmp3 = c_rarg7; 4264 4265 BLOCK_COMMENT("Entry:"); 4266 __ enter(); // required for proper stackwalking of RuntimeStub frame 4267 4268 __ kernel_crc32(crc, buf, len, 4269 table0, table1, table2, table3, rscratch1, rscratch2, tmp3); 4270 4271 __ leave(); // required for proper stackwalking of RuntimeStub frame 4272 __ ret(lr); 4273 4274 return start; 4275 } 4276 4277 // ChaCha20 block function. This version parallelizes by loading 4278 // individual 32-bit state elements into vectors for four blocks 4279 // (e.g. all four blocks' worth of state[0] in one register, etc.) 4280 // 4281 // state (int[16]) = c_rarg0 4282 // keystream (byte[1024]) = c_rarg1 4283 // return - number of bytes of keystream (always 256) 4284 address generate_chacha20Block_blockpar() { 4285 Label L_twoRounds, L_cc20_const; 4286 // The constant data is broken into two 128-bit segments to be loaded 4287 // onto FloatRegisters. The first 128 bits are a counter add overlay 4288 // that adds +0/+1/+2/+3 to the vector holding replicated state[12]. 4289 // The second 128-bits is a table constant used for 8-bit left rotations. 4290 __ BIND(L_cc20_const); 4291 __ emit_int64(0x0000000100000000UL); 4292 __ emit_int64(0x0000000300000002UL); 4293 __ emit_int64(0x0605040702010003UL); 4294 __ emit_int64(0x0E0D0C0F0A09080BUL); 4295 4296 __ align(CodeEntryAlignment); 4297 StubCodeMark mark(this, "StubRoutines", "chacha20Block"); 4298 address start = __ pc(); 4299 __ enter(); 4300 4301 int i, j; 4302 const Register state = c_rarg0; 4303 const Register keystream = c_rarg1; 4304 const Register loopCtr = r10; 4305 const Register tmpAddr = r11; 4306 4307 const FloatRegister stateFirst = v0; 4308 const FloatRegister stateSecond = v1; 4309 const FloatRegister stateThird = v2; 4310 const FloatRegister stateFourth = v3; 4311 const FloatRegister origCtrState = v28; 4312 const FloatRegister scratch = v29; 4313 const FloatRegister lrot8Tbl = v30; 4314 4315 // Organize SIMD registers in an array that facilitates 4316 // putting repetitive opcodes into loop structures. It is 4317 // important that each grouping of 4 registers is monotonically 4318 // increasing to support the requirements of multi-register 4319 // instructions (e.g. ld4r, st4, etc.) 4320 const FloatRegister workSt[16] = { 4321 v4, v5, v6, v7, v16, v17, v18, v19, 4322 v20, v21, v22, v23, v24, v25, v26, v27 4323 }; 4324 4325 // Load from memory and interlace across 16 SIMD registers, 4326 // With each word from memory being broadcast to all lanes of 4327 // each successive SIMD register. 4328 // Addr(0) -> All lanes in workSt[i] 4329 // Addr(4) -> All lanes workSt[i + 1], etc. 4330 __ mov(tmpAddr, state); 4331 for (i = 0; i < 16; i += 4) { 4332 __ ld4r(workSt[i], workSt[i + 1], workSt[i + 2], workSt[i + 3], __ T4S, 4333 __ post(tmpAddr, 16)); 4334 } 4335 4336 // Pull in constant data. The first 16 bytes are the add overlay 4337 // which is applied to the vector holding the counter (state[12]). 
4338 // The second 16 bytes is the index register for the 8-bit left 4339 // rotation tbl instruction. 4340 __ adr(tmpAddr, L_cc20_const); 4341 __ ldpq(origCtrState, lrot8Tbl, Address(tmpAddr)); 4342 __ addv(workSt[12], __ T4S, workSt[12], origCtrState); 4343 4344 // Set up the 10 iteration loop and perform all 8 quarter round ops 4345 __ mov(loopCtr, 10); 4346 __ BIND(L_twoRounds); 4347 4348 __ cc20_quarter_round(workSt[0], workSt[4], workSt[8], workSt[12], 4349 scratch, lrot8Tbl); 4350 __ cc20_quarter_round(workSt[1], workSt[5], workSt[9], workSt[13], 4351 scratch, lrot8Tbl); 4352 __ cc20_quarter_round(workSt[2], workSt[6], workSt[10], workSt[14], 4353 scratch, lrot8Tbl); 4354 __ cc20_quarter_round(workSt[3], workSt[7], workSt[11], workSt[15], 4355 scratch, lrot8Tbl); 4356 4357 __ cc20_quarter_round(workSt[0], workSt[5], workSt[10], workSt[15], 4358 scratch, lrot8Tbl); 4359 __ cc20_quarter_round(workSt[1], workSt[6], workSt[11], workSt[12], 4360 scratch, lrot8Tbl); 4361 __ cc20_quarter_round(workSt[2], workSt[7], workSt[8], workSt[13], 4362 scratch, lrot8Tbl); 4363 __ cc20_quarter_round(workSt[3], workSt[4], workSt[9], workSt[14], 4364 scratch, lrot8Tbl); 4365 4366 // Decrement and iterate 4367 __ sub(loopCtr, loopCtr, 1); 4368 __ cbnz(loopCtr, L_twoRounds); 4369 4370 __ mov(tmpAddr, state); 4371 4372 // Add the starting state back to the post-loop keystream 4373 // state. We read/interlace the state array from memory into 4374 // 4 registers similar to what we did in the beginning. Then 4375 // add the counter overlay onto workSt[12] at the end. 4376 for (i = 0; i < 16; i += 4) { 4377 __ ld4r(stateFirst, stateSecond, stateThird, stateFourth, __ T4S, 4378 __ post(tmpAddr, 16)); 4379 __ addv(workSt[i], __ T4S, workSt[i], stateFirst); 4380 __ addv(workSt[i + 1], __ T4S, workSt[i + 1], stateSecond); 4381 __ addv(workSt[i + 2], __ T4S, workSt[i + 2], stateThird); 4382 __ addv(workSt[i + 3], __ T4S, workSt[i + 3], stateFourth); 4383 } 4384 __ addv(workSt[12], __ T4S, workSt[12], origCtrState); // Add ctr mask 4385 4386 // Write to key stream, storing the same element out of workSt[0..15] 4387 // to consecutive 4-byte offsets in the key stream buffer, then repeating 4388 // for the next element position. 
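// (Lane i of workSt[j] holds word j of parallel block i, so each pass of the
// outer loop below emits one complete 64-byte keystream block; four passes
// produce the full 256 bytes.)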
4389 for (i = 0; i < 4; i++) { 4390 for (j = 0; j < 16; j += 4) { 4391 __ st4(workSt[j], workSt[j + 1], workSt[j + 2], workSt[j + 3], __ S, i, 4392 __ post(keystream, 16)); 4393 } 4394 } 4395 4396 __ mov(r0, 256); // Return length of output keystream 4397 __ leave(); 4398 __ ret(lr); 4399 4400 return start; 4401 } 4402 4403 /** 4404 * Arguments: 4405 * 4406 * Inputs: 4407 * c_rarg0 - int crc 4408 * c_rarg1 - byte* buf 4409 * c_rarg2 - int length 4410 * c_rarg3 - int* table 4411 * 4412 * Output: 4413 * r0 - int crc result 4414 */ 4415 address generate_updateBytesCRC32C() { 4416 assert(UseCRC32CIntrinsics, "what are we doing here?"); 4417 4418 __ align(CodeEntryAlignment); 4419 StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C"); 4420 4421 address start = __ pc(); 4422 4423 const Register crc = c_rarg0; // crc 4424 const Register buf = c_rarg1; // source java byte array address 4425 const Register len = c_rarg2; // length 4426 const Register table0 = c_rarg3; // crc_table address 4427 const Register table1 = c_rarg4; 4428 const Register table2 = c_rarg5; 4429 const Register table3 = c_rarg6; 4430 const Register tmp3 = c_rarg7; 4431 4432 BLOCK_COMMENT("Entry:"); 4433 __ enter(); // required for proper stackwalking of RuntimeStub frame 4434 4435 __ kernel_crc32c(crc, buf, len, 4436 table0, table1, table2, table3, rscratch1, rscratch2, tmp3); 4437 4438 __ leave(); // required for proper stackwalking of RuntimeStub frame 4439 __ ret(lr); 4440 4441 return start; 4442 } 4443 4444 /*** 4445 * Arguments: 4446 * 4447 * Inputs: 4448 * c_rarg0 - int adler 4449 * c_rarg1 - byte* buff 4450 * c_rarg2 - int len 4451 * 4452 * Output: 4453 * c_rarg0 - int adler result 4454 */ 4455 address generate_updateBytesAdler32() { 4456 __ align(CodeEntryAlignment); 4457 StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32"); 4458 address start = __ pc(); 4459 4460 Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1; 4461 4462 // Aliases 4463 Register adler = c_rarg0; 4464 Register s1 = c_rarg0; 4465 Register s2 = c_rarg3; 4466 Register buff = c_rarg1; 4467 Register len = c_rarg2; 4468 Register nmax = r4; 4469 Register base = r5; 4470 Register count = r6; 4471 Register temp0 = rscratch1; 4472 Register temp1 = rscratch2; 4473 FloatRegister vbytes = v0; 4474 FloatRegister vs1acc = v1; 4475 FloatRegister vs2acc = v2; 4476 FloatRegister vtable = v3; 4477 4478 // Max number of bytes we can process before having to take the mod 4479 // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 4480 uint64_t BASE = 0xfff1; 4481 uint64_t NMAX = 0x15B0; 4482 4483 __ mov(base, BASE); 4484 __ mov(nmax, NMAX); 4485 4486 // Load accumulation coefficients for the upper 16 bits 4487 __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table)); 4488 __ ld1(vtable, __ T16B, Address(temp0)); 4489 4490 // s1 is initialized to the lower 16 bits of adler 4491 // s2 is initialized to the upper 16 bits of adler 4492 __ ubfx(s2, adler, 16, 16); // s2 = ((adler >> 16) & 0xffff) 4493 __ uxth(s1, adler); // s1 = (adler & 0xffff) 4494 4495 // The pipelined loop needs at least 16 elements for 1 iteration 4496 // It does check this, but it is more effective to skip to the cleanup loop 4497 __ cmp(len, (u1)16); 4498 __ br(Assembler::HS, L_nmax); 4499 __ cbz(len, L_combine); 4500 4501 __ bind(L_simple_by1_loop); 4502 __ ldrb(temp0, Address(__ post(buff, 1))); 4503 __ add(s1, s1, temp0); 4504 __ add(s2, s2, s1); 4505 __ subs(len, len, 
1); 4506 __ br(Assembler::HI, L_simple_by1_loop); 4507 4508 // s1 = s1 % BASE 4509 __ subs(temp0, s1, base); 4510 __ csel(s1, temp0, s1, Assembler::HS); 4511 4512 // s2 = s2 % BASE 4513 __ lsr(temp0, s2, 16); 4514 __ lsl(temp1, temp0, 4); 4515 __ sub(temp1, temp1, temp0); 4516 __ add(s2, temp1, s2, ext::uxth); 4517 4518 __ subs(temp0, s2, base); 4519 __ csel(s2, temp0, s2, Assembler::HS); 4520 4521 __ b(L_combine); 4522 4523 __ bind(L_nmax); 4524 __ subs(len, len, nmax); 4525 __ sub(count, nmax, 16); 4526 __ br(Assembler::LO, L_by16); 4527 4528 __ bind(L_nmax_loop); 4529 4530 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1, 4531 vbytes, vs1acc, vs2acc, vtable); 4532 4533 __ subs(count, count, 16); 4534 __ br(Assembler::HS, L_nmax_loop); 4535 4536 // s1 = s1 % BASE 4537 __ lsr(temp0, s1, 16); 4538 __ lsl(temp1, temp0, 4); 4539 __ sub(temp1, temp1, temp0); 4540 __ add(temp1, temp1, s1, ext::uxth); 4541 4542 __ lsr(temp0, temp1, 16); 4543 __ lsl(s1, temp0, 4); 4544 __ sub(s1, s1, temp0); 4545 __ add(s1, s1, temp1, ext:: uxth); 4546 4547 __ subs(temp0, s1, base); 4548 __ csel(s1, temp0, s1, Assembler::HS); 4549 4550 // s2 = s2 % BASE 4551 __ lsr(temp0, s2, 16); 4552 __ lsl(temp1, temp0, 4); 4553 __ sub(temp1, temp1, temp0); 4554 __ add(temp1, temp1, s2, ext::uxth); 4555 4556 __ lsr(temp0, temp1, 16); 4557 __ lsl(s2, temp0, 4); 4558 __ sub(s2, s2, temp0); 4559 __ add(s2, s2, temp1, ext:: uxth); 4560 4561 __ subs(temp0, s2, base); 4562 __ csel(s2, temp0, s2, Assembler::HS); 4563 4564 __ subs(len, len, nmax); 4565 __ sub(count, nmax, 16); 4566 __ br(Assembler::HS, L_nmax_loop); 4567 4568 __ bind(L_by16); 4569 __ adds(len, len, count); 4570 __ br(Assembler::LO, L_by1); 4571 4572 __ bind(L_by16_loop); 4573 4574 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1, 4575 vbytes, vs1acc, vs2acc, vtable); 4576 4577 __ subs(len, len, 16); 4578 __ br(Assembler::HS, L_by16_loop); 4579 4580 __ bind(L_by1); 4581 __ adds(len, len, 15); 4582 __ br(Assembler::LO, L_do_mod); 4583 4584 __ bind(L_by1_loop); 4585 __ ldrb(temp0, Address(__ post(buff, 1))); 4586 __ add(s1, temp0, s1); 4587 __ add(s2, s2, s1); 4588 __ subs(len, len, 1); 4589 __ br(Assembler::HS, L_by1_loop); 4590 4591 __ bind(L_do_mod); 4592 // s1 = s1 % BASE 4593 __ lsr(temp0, s1, 16); 4594 __ lsl(temp1, temp0, 4); 4595 __ sub(temp1, temp1, temp0); 4596 __ add(temp1, temp1, s1, ext::uxth); 4597 4598 __ lsr(temp0, temp1, 16); 4599 __ lsl(s1, temp0, 4); 4600 __ sub(s1, s1, temp0); 4601 __ add(s1, s1, temp1, ext:: uxth); 4602 4603 __ subs(temp0, s1, base); 4604 __ csel(s1, temp0, s1, Assembler::HS); 4605 4606 // s2 = s2 % BASE 4607 __ lsr(temp0, s2, 16); 4608 __ lsl(temp1, temp0, 4); 4609 __ sub(temp1, temp1, temp0); 4610 __ add(temp1, temp1, s2, ext::uxth); 4611 4612 __ lsr(temp0, temp1, 16); 4613 __ lsl(s2, temp0, 4); 4614 __ sub(s2, s2, temp0); 4615 __ add(s2, s2, temp1, ext:: uxth); 4616 4617 __ subs(temp0, s2, base); 4618 __ csel(s2, temp0, s2, Assembler::HS); 4619 4620 // Combine lower bits and higher bits 4621 __ bind(L_combine); 4622 __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16) 4623 4624 __ ret(lr); 4625 4626 return start; 4627 } 4628 4629 void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff, 4630 Register temp0, Register temp1, FloatRegister vbytes, 4631 FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) { 4632 // Below is a vectorized implementation of updating s1 and s2 for 16 bytes. 
4633 // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration. 4634 // In non-vectorized code, we update s1 and s2 as: 4635 // s1 <- s1 + b1 4636 // s2 <- s2 + s1 4637 // s1 <- s1 + b2 4638 // s2 <- s2 + s1 4639 // ... 4640 // s1 <- s1 + b16 4641 // s2 <- s2 + s1 4642 // Putting the above assignments together, we have: 4643 // s1_new = s1 + b1 + b2 + ... + b16 4644 // s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16) 4645 // = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1) 4646 // = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1) 4647 __ ld1(vbytes, __ T16B, Address(__ post(buff, 16))); 4648 4649 // s2 = s2 + s1 * 16 4650 __ add(s2, s2, s1, Assembler::LSL, 4); 4651 4652 // vs1acc = b1 + b2 + b3 + ... + b16 4653 // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1) 4654 __ umullv(vs2acc, __ T8B, vtable, vbytes); 4655 __ umlalv(vs2acc, __ T16B, vtable, vbytes); 4656 __ uaddlv(vs1acc, __ T16B, vbytes); 4657 __ uaddlv(vs2acc, __ T8H, vs2acc); 4658 4659 // s1 = s1 + vs1acc, s2 = s2 + vs2acc 4660 __ fmovd(temp0, vs1acc); 4661 __ fmovd(temp1, vs2acc); 4662 __ add(s1, s1, temp0); 4663 __ add(s2, s2, temp1); 4664 } 4665 4666 /** 4667 * Arguments: 4668 * 4669 * Input: 4670 * c_rarg0 - x address 4671 * c_rarg1 - x length 4672 * c_rarg2 - y address 4673 * c_rarg3 - y length 4674 * c_rarg4 - z address 4675 * c_rarg5 - z length 4676 */ 4677 address generate_multiplyToLen() { 4678 __ align(CodeEntryAlignment); 4679 StubCodeMark mark(this, "StubRoutines", "multiplyToLen"); 4680 4681 address start = __ pc(); 4682 const Register x = r0; 4683 const Register xlen = r1; 4684 const Register y = r2; 4685 const Register ylen = r3; 4686 const Register z = r4; 4687 const Register zlen = r5; 4688 4689 const Register tmp1 = r10; 4690 const Register tmp2 = r11; 4691 const Register tmp3 = r12; 4692 const Register tmp4 = r13; 4693 const Register tmp5 = r14; 4694 const Register tmp6 = r15; 4695 const Register tmp7 = r16; 4696 4697 BLOCK_COMMENT("Entry:"); 4698 __ enter(); // required for proper stackwalking of RuntimeStub frame 4699 __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 4700 __ leave(); // required for proper stackwalking of RuntimeStub frame 4701 __ ret(lr); 4702 4703 return start; 4704 } 4705 4706 address generate_squareToLen() { 4707 // The squareToLen algorithm for sizes 1..127 described in the Java code works 4708 // faster than multiply_to_len on some CPUs and slower on others, but 4709 // multiply_to_len shows slightly better results overall 4710 __ align(CodeEntryAlignment); 4711 StubCodeMark mark(this, "StubRoutines", "squareToLen"); 4712 address start = __ pc(); 4713 4714 const Register x = r0; 4715 const Register xlen = r1; 4716 const Register z = r2; 4717 const Register zlen = r3; 4718 const Register y = r4; // == x 4719 const Register ylen = r5; // == xlen 4720 4721 const Register tmp1 = r10; 4722 const Register tmp2 = r11; 4723 const Register tmp3 = r12; 4724 const Register tmp4 = r13; 4725 const Register tmp5 = r14; 4726 const Register tmp6 = r15; 4727 const Register tmp7 = r16; 4728 4729 RegSet spilled_regs = RegSet::of(y, ylen); 4730 BLOCK_COMMENT("Entry:"); 4731 __ enter(); 4732 __ push(spilled_regs, sp); 4733 __ mov(y, x); 4734 __ mov(ylen, xlen); 4735 __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 4736 __ pop(spilled_regs, sp); 4737 __ leave(); 4738 __ ret(lr); 4739 return start; 4740 } 4741 4742 address generate_mulAdd() { 4743 __ align(CodeEntryAlignment); 4744
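// (For reference: this stub backs the BigInteger mulAdd primitive - roughly,
// it multiplies the int[] 'in' (len 32-bit words) by the unsigned 32-bit value
// k, adds the product into 'out' at the given offset, and returns the final
// carry; the actual work is done by MacroAssembler::mul_add().)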
StubCodeMark mark(this, "StubRoutines", "mulAdd"); 4745 4746 address start = __ pc(); 4747 4748 const Register out = r0; 4749 const Register in = r1; 4750 const Register offset = r2; 4751 const Register len = r3; 4752 const Register k = r4; 4753 4754 BLOCK_COMMENT("Entry:"); 4755 __ enter(); 4756 __ mul_add(out, in, offset, len, k); 4757 __ leave(); 4758 __ ret(lr); 4759 4760 return start; 4761 } 4762 4763 // Arguments: 4764 // 4765 // Input: 4766 // c_rarg0 - newArr address 4767 // c_rarg1 - oldArr address 4768 // c_rarg2 - newIdx 4769 // c_rarg3 - shiftCount 4770 // c_rarg4 - numIter 4771 // 4772 address generate_bigIntegerRightShift() { 4773 __ align(CodeEntryAlignment); 4774 StubCodeMark mark(this, "StubRoutines", "bigIntegerRightShiftWorker"); 4775 address start = __ pc(); 4776 4777 Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit; 4778 4779 Register newArr = c_rarg0; 4780 Register oldArr = c_rarg1; 4781 Register newIdx = c_rarg2; 4782 Register shiftCount = c_rarg3; 4783 Register numIter = c_rarg4; 4784 Register idx = numIter; 4785 4786 Register newArrCur = rscratch1; 4787 Register shiftRevCount = rscratch2; 4788 Register oldArrCur = r13; 4789 Register oldArrNext = r14; 4790 4791 FloatRegister oldElem0 = v0; 4792 FloatRegister oldElem1 = v1; 4793 FloatRegister newElem = v2; 4794 FloatRegister shiftVCount = v3; 4795 FloatRegister shiftVRevCount = v4; 4796 4797 __ cbz(idx, Exit); 4798 4799 __ add(newArr, newArr, newIdx, Assembler::LSL, 2); 4800 4801 // left shift count 4802 __ movw(shiftRevCount, 32); 4803 __ subw(shiftRevCount, shiftRevCount, shiftCount); 4804 4805 // numIter too small to allow a 4-words SIMD loop, rolling back 4806 __ cmp(numIter, (u1)4); 4807 __ br(Assembler::LT, ShiftThree); 4808 4809 __ dup(shiftVCount, __ T4S, shiftCount); 4810 __ dup(shiftVRevCount, __ T4S, shiftRevCount); 4811 __ negr(shiftVCount, __ T4S, shiftVCount); 4812 4813 __ BIND(ShiftSIMDLoop); 4814 4815 // Calculate the load addresses 4816 __ sub(idx, idx, 4); 4817 __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2); 4818 __ add(newArrCur, newArr, idx, Assembler::LSL, 2); 4819 __ add(oldArrCur, oldArrNext, 4); 4820 4821 // Load 4 words and process 4822 __ ld1(oldElem0, __ T4S, Address(oldArrCur)); 4823 __ ld1(oldElem1, __ T4S, Address(oldArrNext)); 4824 __ ushl(oldElem0, __ T4S, oldElem0, shiftVCount); 4825 __ ushl(oldElem1, __ T4S, oldElem1, shiftVRevCount); 4826 __ orr(newElem, __ T16B, oldElem0, oldElem1); 4827 __ st1(newElem, __ T4S, Address(newArrCur)); 4828 4829 __ cmp(idx, (u1)4); 4830 __ br(Assembler::LT, ShiftTwoLoop); 4831 __ b(ShiftSIMDLoop); 4832 4833 __ BIND(ShiftTwoLoop); 4834 __ cbz(idx, Exit); 4835 __ cmp(idx, (u1)1); 4836 __ br(Assembler::EQ, ShiftOne); 4837 4838 // Calculate the load addresses 4839 __ sub(idx, idx, 2); 4840 __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2); 4841 __ add(newArrCur, newArr, idx, Assembler::LSL, 2); 4842 __ add(oldArrCur, oldArrNext, 4); 4843 4844 // Load 2 words and process 4845 __ ld1(oldElem0, __ T2S, Address(oldArrCur)); 4846 __ ld1(oldElem1, __ T2S, Address(oldArrNext)); 4847 __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount); 4848 __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount); 4849 __ orr(newElem, __ T8B, oldElem0, oldElem1); 4850 __ st1(newElem, __ T2S, Address(newArrCur)); 4851 __ b(ShiftTwoLoop); 4852 4853 __ BIND(ShiftThree); 4854 __ tbz(idx, 1, ShiftOne); 4855 __ tbz(idx, 0, ShiftTwo); 4856 __ ldrw(r10, Address(oldArr, 12)); 4857 __ ldrw(r11, Address(oldArr, 8)); 4858 __ lsrvw(r10, r10, shiftCount); 4859 __ lslvw(r11, 
r11, shiftRevCount); 4860 __ orrw(r12, r10, r11); 4861 __ strw(r12, Address(newArr, 8)); 4862 4863 __ BIND(ShiftTwo); 4864 __ ldrw(r10, Address(oldArr, 8)); 4865 __ ldrw(r11, Address(oldArr, 4)); 4866 __ lsrvw(r10, r10, shiftCount); 4867 __ lslvw(r11, r11, shiftRevCount); 4868 __ orrw(r12, r10, r11); 4869 __ strw(r12, Address(newArr, 4)); 4870 4871 __ BIND(ShiftOne); 4872 __ ldrw(r10, Address(oldArr, 4)); 4873 __ ldrw(r11, Address(oldArr)); 4874 __ lsrvw(r10, r10, shiftCount); 4875 __ lslvw(r11, r11, shiftRevCount); 4876 __ orrw(r12, r10, r11); 4877 __ strw(r12, Address(newArr)); 4878 4879 __ BIND(Exit); 4880 __ ret(lr); 4881 4882 return start; 4883 } 4884 4885 // Arguments: 4886 // 4887 // Input: 4888 // c_rarg0 - newArr address 4889 // c_rarg1 - oldArr address 4890 // c_rarg2 - newIdx 4891 // c_rarg3 - shiftCount 4892 // c_rarg4 - numIter 4893 // 4894 address generate_bigIntegerLeftShift() { 4895 __ align(CodeEntryAlignment); 4896 StubCodeMark mark(this, "StubRoutines", "bigIntegerLeftShiftWorker"); 4897 address start = __ pc(); 4898 4899 Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit; 4900 4901 Register newArr = c_rarg0; 4902 Register oldArr = c_rarg1; 4903 Register newIdx = c_rarg2; 4904 Register shiftCount = c_rarg3; 4905 Register numIter = c_rarg4; 4906 4907 Register shiftRevCount = rscratch1; 4908 Register oldArrNext = rscratch2; 4909 4910 FloatRegister oldElem0 = v0; 4911 FloatRegister oldElem1 = v1; 4912 FloatRegister newElem = v2; 4913 FloatRegister shiftVCount = v3; 4914 FloatRegister shiftVRevCount = v4; 4915 4916 __ cbz(numIter, Exit); 4917 4918 __ add(oldArrNext, oldArr, 4); 4919 __ add(newArr, newArr, newIdx, Assembler::LSL, 2); 4920 4921 // right shift count 4922 __ movw(shiftRevCount, 32); 4923 __ subw(shiftRevCount, shiftRevCount, shiftCount); 4924 4925 // numIter too small to allow a 4-words SIMD loop, rolling back 4926 __ cmp(numIter, (u1)4); 4927 __ br(Assembler::LT, ShiftThree); 4928 4929 __ dup(shiftVCount, __ T4S, shiftCount); 4930 __ dup(shiftVRevCount, __ T4S, shiftRevCount); 4931 __ negr(shiftVRevCount, __ T4S, shiftVRevCount); 4932 4933 __ BIND(ShiftSIMDLoop); 4934 4935 // load 4 words and process 4936 __ ld1(oldElem0, __ T4S, __ post(oldArr, 16)); 4937 __ ld1(oldElem1, __ T4S, __ post(oldArrNext, 16)); 4938 __ ushl(oldElem0, __ T4S, oldElem0, shiftVCount); 4939 __ ushl(oldElem1, __ T4S, oldElem1, shiftVRevCount); 4940 __ orr(newElem, __ T16B, oldElem0, oldElem1); 4941 __ st1(newElem, __ T4S, __ post(newArr, 16)); 4942 __ sub(numIter, numIter, 4); 4943 4944 __ cmp(numIter, (u1)4); 4945 __ br(Assembler::LT, ShiftTwoLoop); 4946 __ b(ShiftSIMDLoop); 4947 4948 __ BIND(ShiftTwoLoop); 4949 __ cbz(numIter, Exit); 4950 __ cmp(numIter, (u1)1); 4951 __ br(Assembler::EQ, ShiftOne); 4952 4953 // load 2 words and process 4954 __ ld1(oldElem0, __ T2S, __ post(oldArr, 8)); 4955 __ ld1(oldElem1, __ T2S, __ post(oldArrNext, 8)); 4956 __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount); 4957 __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount); 4958 __ orr(newElem, __ T8B, oldElem0, oldElem1); 4959 __ st1(newElem, __ T2S, __ post(newArr, 8)); 4960 __ sub(numIter, numIter, 2); 4961 __ b(ShiftTwoLoop); 4962 4963 __ BIND(ShiftThree); 4964 __ ldrw(r10, __ post(oldArr, 4)); 4965 __ ldrw(r11, __ post(oldArrNext, 4)); 4966 __ lslvw(r10, r10, shiftCount); 4967 __ lsrvw(r11, r11, shiftRevCount); 4968 __ orrw(r12, r10, r11); 4969 __ strw(r12, __ post(newArr, 4)); 4970 __ tbz(numIter, 1, Exit); 4971 __ tbz(numIter, 0, ShiftOne); 4972 4973 __ BIND(ShiftTwo); 4974 __ 
ldrw(r10, __ post(oldArr, 4)); 4975 __ ldrw(r11, __ post(oldArrNext, 4)); 4976 __ lslvw(r10, r10, shiftCount); 4977 __ lsrvw(r11, r11, shiftRevCount); 4978 __ orrw(r12, r10, r11); 4979 __ strw(r12, __ post(newArr, 4)); 4980 4981 __ BIND(ShiftOne); 4982 __ ldrw(r10, Address(oldArr)); 4983 __ ldrw(r11, Address(oldArrNext)); 4984 __ lslvw(r10, r10, shiftCount); 4985 __ lsrvw(r11, r11, shiftRevCount); 4986 __ orrw(r12, r10, r11); 4987 __ strw(r12, Address(newArr)); 4988 4989 __ BIND(Exit); 4990 __ ret(lr); 4991 4992 return start; 4993 } 4994 4995 address generate_count_positives(address &count_positives_long) { 4996 const u1 large_loop_size = 64; 4997 const uint64_t UPPER_BIT_MASK=0x8080808080808080; 4998 int dcache_line = VM_Version::dcache_line_size(); 4999 5000 Register ary1 = r1, len = r2, result = r0; 5001 5002 __ align(CodeEntryAlignment); 5003 5004 StubCodeMark mark(this, "StubRoutines", "count_positives"); 5005 5006 address entry = __ pc(); 5007 5008 __ enter(); 5009 // precondition: a copy of len is already in result 5010 // __ mov(result, len); 5011 5012 Label RET_ADJUST, RET_ADJUST_16, RET_ADJUST_LONG, RET_NO_POP, RET_LEN, ALIGNED, LOOP16, CHECK_16, 5013 LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL; 5014 5015 __ cmp(len, (u1)15); 5016 __ br(Assembler::GT, LEN_OVER_15); 5017 // The only case when execution falls into this code is when pointer is near 5018 // the end of memory page and we have to avoid reading next page 5019 __ add(ary1, ary1, len); 5020 __ subs(len, len, 8); 5021 __ br(Assembler::GT, LEN_OVER_8); 5022 __ ldr(rscratch2, Address(ary1, -8)); 5023 __ sub(rscratch1, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes. 5024 __ lsrv(rscratch2, rscratch2, rscratch1); 5025 __ tst(rscratch2, UPPER_BIT_MASK); 5026 __ csel(result, zr, result, Assembler::NE); 5027 __ leave(); 5028 __ ret(lr); 5029 __ bind(LEN_OVER_8); 5030 __ ldp(rscratch1, rscratch2, Address(ary1, -16)); 5031 __ sub(len, len, 8); // no data dep., then sub can be executed while loading 5032 __ tst(rscratch2, UPPER_BIT_MASK); 5033 __ br(Assembler::NE, RET_NO_POP); 5034 __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes 5035 __ lsrv(rscratch1, rscratch1, rscratch2); 5036 __ tst(rscratch1, UPPER_BIT_MASK); 5037 __ bind(RET_NO_POP); 5038 __ csel(result, zr, result, Assembler::NE); 5039 __ leave(); 5040 __ ret(lr); 5041 5042 Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10; 5043 const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6; 5044 5045 count_positives_long = __ pc(); // 2nd entry point 5046 5047 __ enter(); 5048 5049 __ bind(LEN_OVER_15); 5050 __ push(spilled_regs, sp); 5051 __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment 5052 __ cbz(rscratch2, ALIGNED); 5053 __ ldp(tmp6, tmp1, Address(ary1)); 5054 __ mov(tmp5, 16); 5055 __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address 5056 __ add(ary1, ary1, rscratch1); 5057 __ orr(tmp6, tmp6, tmp1); 5058 __ tst(tmp6, UPPER_BIT_MASK); 5059 __ br(Assembler::NE, RET_ADJUST); 5060 __ sub(len, len, rscratch1); 5061 5062 __ bind(ALIGNED); 5063 __ cmp(len, large_loop_size); 5064 __ br(Assembler::LT, CHECK_16); 5065 // Perform 16-byte load as early return in pre-loop to handle situation 5066 // when initially aligned large array has negative values at starting bytes, 5067 // so LARGE_LOOP would do 4 reads instead of 1 (in worst case), which is 5068 // slower. Cases with negative bytes further ahead won't be affected that 5069 // much. 
In fact, it'll be faster due to early loads, less instructions and 5070 // less branches in LARGE_LOOP. 5071 __ ldp(tmp6, tmp1, Address(__ post(ary1, 16))); 5072 __ sub(len, len, 16); 5073 __ orr(tmp6, tmp6, tmp1); 5074 __ tst(tmp6, UPPER_BIT_MASK); 5075 __ br(Assembler::NE, RET_ADJUST_16); 5076 __ cmp(len, large_loop_size); 5077 __ br(Assembler::LT, CHECK_16); 5078 5079 if (SoftwarePrefetchHintDistance >= 0 5080 && SoftwarePrefetchHintDistance >= dcache_line) { 5081 // initial prefetch 5082 __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line)); 5083 } 5084 __ bind(LARGE_LOOP); 5085 if (SoftwarePrefetchHintDistance >= 0) { 5086 __ prfm(Address(ary1, SoftwarePrefetchHintDistance)); 5087 } 5088 // Issue load instructions first, since it can save few CPU/MEM cycles, also 5089 // instead of 4 triples of "orr(...), addr(...);cbnz(...);" (for each ldp) 5090 // better generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...) which saves 3 5091 // instructions per cycle and have less branches, but this approach disables 5092 // early return, thus, all 64 bytes are loaded and checked every time. 5093 __ ldp(tmp2, tmp3, Address(ary1)); 5094 __ ldp(tmp4, tmp5, Address(ary1, 16)); 5095 __ ldp(rscratch1, rscratch2, Address(ary1, 32)); 5096 __ ldp(tmp6, tmp1, Address(ary1, 48)); 5097 __ add(ary1, ary1, large_loop_size); 5098 __ sub(len, len, large_loop_size); 5099 __ orr(tmp2, tmp2, tmp3); 5100 __ orr(tmp4, tmp4, tmp5); 5101 __ orr(rscratch1, rscratch1, rscratch2); 5102 __ orr(tmp6, tmp6, tmp1); 5103 __ orr(tmp2, tmp2, tmp4); 5104 __ orr(rscratch1, rscratch1, tmp6); 5105 __ orr(tmp2, tmp2, rscratch1); 5106 __ tst(tmp2, UPPER_BIT_MASK); 5107 __ br(Assembler::NE, RET_ADJUST_LONG); 5108 __ cmp(len, large_loop_size); 5109 __ br(Assembler::GE, LARGE_LOOP); 5110 5111 __ bind(CHECK_16); // small 16-byte load pre-loop 5112 __ cmp(len, (u1)16); 5113 __ br(Assembler::LT, POST_LOOP16); 5114 5115 __ bind(LOOP16); // small 16-byte load loop 5116 __ ldp(tmp2, tmp3, Address(__ post(ary1, 16))); 5117 __ sub(len, len, 16); 5118 __ orr(tmp2, tmp2, tmp3); 5119 __ tst(tmp2, UPPER_BIT_MASK); 5120 __ br(Assembler::NE, RET_ADJUST_16); 5121 __ cmp(len, (u1)16); 5122 __ br(Assembler::GE, LOOP16); // 16-byte load loop end 5123 5124 __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally 5125 __ cmp(len, (u1)8); 5126 __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL); 5127 __ ldr(tmp3, Address(__ post(ary1, 8))); 5128 __ tst(tmp3, UPPER_BIT_MASK); 5129 __ br(Assembler::NE, RET_ADJUST); 5130 __ sub(len, len, 8); 5131 5132 __ bind(POST_LOOP16_LOAD_TAIL); 5133 __ cbz(len, RET_LEN); // Can't shift left by 64 when len==0 5134 __ ldr(tmp1, Address(ary1)); 5135 __ mov(tmp2, 64); 5136 __ sub(tmp4, tmp2, len, __ LSL, 3); 5137 __ lslv(tmp1, tmp1, tmp4); 5138 __ tst(tmp1, UPPER_BIT_MASK); 5139 __ br(Assembler::NE, RET_ADJUST); 5140 // Fallthrough 5141 5142 __ bind(RET_LEN); 5143 __ pop(spilled_regs, sp); 5144 __ leave(); 5145 __ ret(lr); 5146 5147 // difference result - len is the count of guaranteed to be 5148 // positive bytes 5149 5150 __ bind(RET_ADJUST_LONG); 5151 __ add(len, len, (u1)(large_loop_size - 16)); 5152 __ bind(RET_ADJUST_16); 5153 __ add(len, len, 16); 5154 __ bind(RET_ADJUST); 5155 __ pop(spilled_regs, sp); 5156 __ leave(); 5157 __ sub(result, result, len); 5158 __ ret(lr); 5159 5160 return entry; 5161 } 5162 5163 void generate_large_array_equals_loop_nonsimd(int loopThreshold, 5164 bool usePrefetch, Label &NOT_EQUAL) { 5165 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 5166 tmp2 = 
rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11, 5167 tmp7 = r12, tmp8 = r13; 5168 Label LOOP; 5169 5170 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 5171 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 5172 __ bind(LOOP); 5173 if (usePrefetch) { 5174 __ prfm(Address(a1, SoftwarePrefetchHintDistance)); 5175 __ prfm(Address(a2, SoftwarePrefetchHintDistance)); 5176 } 5177 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize))); 5178 __ eor(tmp1, tmp1, tmp2); 5179 __ eor(tmp3, tmp3, tmp4); 5180 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize))); 5181 __ orr(tmp1, tmp1, tmp3); 5182 __ cbnz(tmp1, NOT_EQUAL); 5183 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 5184 __ eor(tmp5, tmp5, tmp6); 5185 __ eor(tmp7, tmp7, tmp8); 5186 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 5187 __ orr(tmp5, tmp5, tmp7); 5188 __ cbnz(tmp5, NOT_EQUAL); 5189 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize))); 5190 __ eor(tmp1, tmp1, tmp2); 5191 __ eor(tmp3, tmp3, tmp4); 5192 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize))); 5193 __ orr(tmp1, tmp1, tmp3); 5194 __ cbnz(tmp1, NOT_EQUAL); 5195 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 5196 __ eor(tmp5, tmp5, tmp6); 5197 __ sub(cnt1, cnt1, 8 * wordSize); 5198 __ eor(tmp7, tmp7, tmp8); 5199 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 5200 // tmp6 is not used. MacroAssembler::subs is used here (rather than 5201 // cmp) because subs allows an unlimited range of immediate operand. 5202 __ subs(tmp6, cnt1, loopThreshold); 5203 __ orr(tmp5, tmp5, tmp7); 5204 __ cbnz(tmp5, NOT_EQUAL); 5205 __ br(__ GE, LOOP); 5206 // post-loop 5207 __ eor(tmp1, tmp1, tmp2); 5208 __ eor(tmp3, tmp3, tmp4); 5209 __ orr(tmp1, tmp1, tmp3); 5210 __ sub(cnt1, cnt1, 2 * wordSize); 5211 __ cbnz(tmp1, NOT_EQUAL); 5212 } 5213 5214 void generate_large_array_equals_loop_simd(int loopThreshold, 5215 bool usePrefetch, Label &NOT_EQUAL) { 5216 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 5217 tmp2 = rscratch2; 5218 Label LOOP; 5219 5220 __ bind(LOOP); 5221 if (usePrefetch) { 5222 __ prfm(Address(a1, SoftwarePrefetchHintDistance)); 5223 __ prfm(Address(a2, SoftwarePrefetchHintDistance)); 5224 } 5225 __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize))); 5226 __ sub(cnt1, cnt1, 8 * wordSize); 5227 __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize))); 5228 __ subs(tmp1, cnt1, loopThreshold); 5229 __ eor(v0, __ T16B, v0, v4); 5230 __ eor(v1, __ T16B, v1, v5); 5231 __ eor(v2, __ T16B, v2, v6); 5232 __ eor(v3, __ T16B, v3, v7); 5233 __ orr(v0, __ T16B, v0, v1); 5234 __ orr(v1, __ T16B, v2, v3); 5235 __ orr(v0, __ T16B, v0, v1); 5236 __ umov(tmp1, v0, __ D, 0); 5237 __ umov(tmp2, v0, __ D, 1); 5238 __ orr(tmp1, tmp1, tmp2); 5239 __ cbnz(tmp1, NOT_EQUAL); 5240 __ br(__ GE, LOOP); 5241 } 5242 5243 // a1 = r1 - array1 address 5244 // a2 = r2 - array2 address 5245 // result = r0 - return value. Already contains "false" 5246 // cnt1 = r10 - amount of elements left to check, reduced by wordSize 5247 // r3-r5 are reserved temporary registers 5248 // Clobbers: v0-v7 when UseSIMDForArrayEquals, rscratch1, rscratch2 5249 address generate_large_array_equals() { 5250 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 5251 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11, 5252 tmp7 = r12, tmp8 = r13; 5253 Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP, 5254 SMALL_LOOP, POST_LOOP; 5255 const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 
0 : 16; 5256 // calculate if at least 32 prefetched bytes are used 5257 int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32; 5258 int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE); 5259 RegSet spilled_regs = RegSet::range(tmp6, tmp8); 5260 assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4, 5261 tmp5, tmp6, tmp7, tmp8); 5262 5263 __ align(CodeEntryAlignment); 5264 5265 StubCodeMark mark(this, "StubRoutines", "large_array_equals"); 5266 5267 address entry = __ pc(); 5268 __ enter(); 5269 __ sub(cnt1, cnt1, wordSize); // first 8 bytes were loaded outside of stub 5270 // also advance pointers to use post-increment instead of pre-increment 5271 __ add(a1, a1, wordSize); 5272 __ add(a2, a2, wordSize); 5273 if (AvoidUnalignedAccesses) { 5274 // both implementations (SIMD/nonSIMD) are using relatively large load 5275 // instructions (ld1/ldp), which has huge penalty (up to x2 exec time) 5276 // on some CPUs in case of address is not at least 16-byte aligned. 5277 // Arrays are 8-byte aligned currently, so, we can make additional 8-byte 5278 // load if needed at least for 1st address and make if 16-byte aligned. 5279 Label ALIGNED16; 5280 __ tbz(a1, 3, ALIGNED16); 5281 __ ldr(tmp1, Address(__ post(a1, wordSize))); 5282 __ ldr(tmp2, Address(__ post(a2, wordSize))); 5283 __ sub(cnt1, cnt1, wordSize); 5284 __ eor(tmp1, tmp1, tmp2); 5285 __ cbnz(tmp1, NOT_EQUAL_NO_POP); 5286 __ bind(ALIGNED16); 5287 } 5288 if (UseSIMDForArrayEquals) { 5289 if (SoftwarePrefetchHintDistance >= 0) { 5290 __ subs(tmp1, cnt1, prefetchLoopThreshold); 5291 __ br(__ LE, NO_PREFETCH_LARGE_LOOP); 5292 generate_large_array_equals_loop_simd(prefetchLoopThreshold, 5293 /* prfm = */ true, NOT_EQUAL); 5294 __ subs(zr, cnt1, nonPrefetchLoopThreshold); 5295 __ br(__ LT, TAIL); 5296 } 5297 __ bind(NO_PREFETCH_LARGE_LOOP); 5298 generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold, 5299 /* prfm = */ false, NOT_EQUAL); 5300 } else { 5301 __ push(spilled_regs, sp); 5302 if (SoftwarePrefetchHintDistance >= 0) { 5303 __ subs(tmp1, cnt1, prefetchLoopThreshold); 5304 __ br(__ LE, NO_PREFETCH_LARGE_LOOP); 5305 generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold, 5306 /* prfm = */ true, NOT_EQUAL); 5307 __ subs(zr, cnt1, nonPrefetchLoopThreshold); 5308 __ br(__ LT, TAIL); 5309 } 5310 __ bind(NO_PREFETCH_LARGE_LOOP); 5311 generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold, 5312 /* prfm = */ false, NOT_EQUAL); 5313 } 5314 __ bind(TAIL); 5315 __ cbz(cnt1, EQUAL); 5316 __ subs(cnt1, cnt1, wordSize); 5317 __ br(__ LE, POST_LOOP); 5318 __ bind(SMALL_LOOP); 5319 __ ldr(tmp1, Address(__ post(a1, wordSize))); 5320 __ ldr(tmp2, Address(__ post(a2, wordSize))); 5321 __ subs(cnt1, cnt1, wordSize); 5322 __ eor(tmp1, tmp1, tmp2); 5323 __ cbnz(tmp1, NOT_EQUAL); 5324 __ br(__ GT, SMALL_LOOP); 5325 __ bind(POST_LOOP); 5326 __ ldr(tmp1, Address(a1, cnt1)); 5327 __ ldr(tmp2, Address(a2, cnt1)); 5328 __ eor(tmp1, tmp1, tmp2); 5329 __ cbnz(tmp1, NOT_EQUAL); 5330 __ bind(EQUAL); 5331 __ mov(result, true); 5332 __ bind(NOT_EQUAL); 5333 if (!UseSIMDForArrayEquals) { 5334 __ pop(spilled_regs, sp); 5335 } 5336 __ bind(NOT_EQUAL_NO_POP); 5337 __ leave(); 5338 __ ret(lr); 5339 return entry; 5340 } 5341 5342 address generate_dsin_dcos(bool isCos) { 5343 __ align(CodeEntryAlignment); 5344 StubCodeMark mark(this, "StubRoutines", isCos ? 
"libmDcos" : "libmDsin"); 5345 address start = __ pc(); 5346 __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw, 5347 (address)StubRoutines::aarch64::_two_over_pi, 5348 (address)StubRoutines::aarch64::_pio2, 5349 (address)StubRoutines::aarch64::_dsin_coef, 5350 (address)StubRoutines::aarch64::_dcos_coef); 5351 return start; 5352 } 5353 5354 address generate_dlog() { 5355 __ align(CodeEntryAlignment); 5356 StubCodeMark mark(this, "StubRoutines", "dlog"); 5357 address entry = __ pc(); 5358 FloatRegister vtmp0 = v0, vtmp1 = v1, vtmp2 = v2, vtmp3 = v3, vtmp4 = v4, 5359 vtmp5 = v5, tmpC1 = v16, tmpC2 = v17, tmpC3 = v18, tmpC4 = v19; 5360 Register tmp1 = r0, tmp2 = r1, tmp3 = r2, tmp4 = r3, tmp5 = r4; 5361 __ fast_log(vtmp0, vtmp1, vtmp2, vtmp3, vtmp4, vtmp5, tmpC1, tmpC2, tmpC3, 5362 tmpC4, tmp1, tmp2, tmp3, tmp4, tmp5); 5363 return entry; 5364 } 5365 5366 5367 // code for comparing 16 characters of strings with Latin1 and Utf16 encoding 5368 void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1, 5369 Label &DIFF2) { 5370 Register cnt1 = r2, tmp2 = r11, tmp3 = r12; 5371 FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2; 5372 5373 __ ldrq(vtmp, Address(__ post(tmp2, 16))); 5374 __ ldr(tmpU, Address(__ post(cnt1, 8))); 5375 __ zip1(vtmp3, __ T16B, vtmp, vtmpZ); 5376 // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3 5377 5378 __ fmovd(tmpL, vtmp3); 5379 __ eor(rscratch2, tmp3, tmpL); 5380 __ cbnz(rscratch2, DIFF2); 5381 5382 __ ldr(tmp3, Address(__ post(cnt1, 8))); 5383 __ umov(tmpL, vtmp3, __ D, 1); 5384 __ eor(rscratch2, tmpU, tmpL); 5385 __ cbnz(rscratch2, DIFF1); 5386 5387 __ zip2(vtmp, __ T16B, vtmp, vtmpZ); 5388 __ ldr(tmpU, Address(__ post(cnt1, 8))); 5389 __ fmovd(tmpL, vtmp); 5390 __ eor(rscratch2, tmp3, tmpL); 5391 __ cbnz(rscratch2, DIFF2); 5392 5393 __ ldr(tmp3, Address(__ post(cnt1, 8))); 5394 __ umov(tmpL, vtmp, __ D, 1); 5395 __ eor(rscratch2, tmpU, tmpL); 5396 __ cbnz(rscratch2, DIFF1); 5397 } 5398 5399 // r0 = result 5400 // r1 = str1 5401 // r2 = cnt1 5402 // r3 = str2 5403 // r4 = cnt2 5404 // r10 = tmp1 5405 // r11 = tmp2 5406 address generate_compare_long_string_different_encoding(bool isLU) { 5407 __ align(CodeEntryAlignment); 5408 StubCodeMark mark(this, "StubRoutines", isLU 5409 ? "compare_long_string_different_encoding LU" 5410 : "compare_long_string_different_encoding UL"); 5411 address entry = __ pc(); 5412 Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2, 5413 DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH, 5414 LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2; 5415 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 5416 tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14; 5417 FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2; 5418 RegSet spilled_regs = RegSet::of(tmp3, tmp4); 5419 5420 int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2); 5421 5422 __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ); 5423 // cnt2 == amount of characters left to compare 5424 // Check already loaded first 4 symbols(vtmp and tmp2(LU)/tmp1(UL)) 5425 __ zip1(vtmp, __ T8B, vtmp, vtmpZ); 5426 __ add(str1, str1, isLU ? wordSize/2 : wordSize); 5427 __ add(str2, str2, isLU ? wordSize : wordSize/2); 5428 __ fmovd(isLU ? tmp1 : tmp2, vtmp); 5429 __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case. 5430 __ eor(rscratch2, tmp1, tmp2); 5431 __ mov(rscratch1, tmp2); 5432 __ cbnz(rscratch2, CALCULATE_DIFFERENCE); 5433 Register tmpU = isLU ? 
rscratch1 : tmp1, // where to keep U for comparison 5434 tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison 5435 __ push(spilled_regs, sp); 5436 __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load 5437 __ mov(cnt1, isLU ? str2 : str1); // init the pointer to U next load 5438 5439 __ ldr(tmp3, Address(__ post(cnt1, 8))); 5440 5441 if (SoftwarePrefetchHintDistance >= 0) { 5442 __ subs(rscratch2, cnt2, prefetchLoopExitCondition); 5443 __ br(__ LT, NO_PREFETCH); 5444 __ bind(LARGE_LOOP_PREFETCH); 5445 __ prfm(Address(tmp2, SoftwarePrefetchHintDistance)); 5446 __ mov(tmp4, 2); 5447 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance)); 5448 __ bind(LARGE_LOOP_PREFETCH_REPEAT1); 5449 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 5450 __ subs(tmp4, tmp4, 1); 5451 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1); 5452 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance)); 5453 __ mov(tmp4, 2); 5454 __ bind(LARGE_LOOP_PREFETCH_REPEAT2); 5455 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 5456 __ subs(tmp4, tmp4, 1); 5457 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2); 5458 __ sub(cnt2, cnt2, 64); 5459 __ subs(rscratch2, cnt2, prefetchLoopExitCondition); 5460 __ br(__ GE, LARGE_LOOP_PREFETCH); 5461 } 5462 __ cbz(cnt2, LOAD_LAST); // no characters left except last load 5463 __ bind(NO_PREFETCH); 5464 __ subs(cnt2, cnt2, 16); 5465 __ br(__ LT, TAIL); 5466 __ align(OptoLoopAlignment); 5467 __ bind(SMALL_LOOP); // smaller loop 5468 __ subs(cnt2, cnt2, 16); 5469 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 5470 __ br(__ GE, SMALL_LOOP); 5471 __ cmn(cnt2, (u1)16); 5472 __ br(__ EQ, LOAD_LAST); 5473 __ bind(TAIL); // 1..15 characters left until last load (last 4 characters) 5474 __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string 5475 __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string 5476 __ ldr(tmp3, Address(cnt1, -8)); 5477 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load 5478 __ b(LOAD_LAST); 5479 __ bind(DIFF2); 5480 __ mov(tmpU, tmp3); 5481 __ bind(DIFF1); 5482 __ pop(spilled_regs, sp); 5483 __ b(CALCULATE_DIFFERENCE); 5484 __ bind(LOAD_LAST); 5485 // Last 4 UTF-16 characters are already pre-loaded into tmp3 by compare_string_16_x_LU. 5486 // No need to load it again 5487 __ mov(tmpU, tmp3); 5488 __ pop(spilled_regs, sp); 5489 5490 // tmp2 points to the address of the last 4 Latin1 characters right now 5491 __ ldrs(vtmp, Address(tmp2)); 5492 __ zip1(vtmp, __ T8B, vtmp, vtmpZ); 5493 __ fmovd(tmpL, vtmp); 5494 5495 __ eor(rscratch2, tmpU, tmpL); 5496 __ cbz(rscratch2, DONE); 5497 5498 // Find the first different characters in the longwords and 5499 // compute their difference. 
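    // Illustrative aside (not generated code): the CALCULATE_DIFFERENCE block
    // below is the vector-register form of the following scalar C sketch, where
    // a and b each hold four UTF-16 characters in little-endian byte order and
    // a != b is guaranteed by the preceding check (helper name and builtins are
    // hypothetical, GCC/Clang style):
    //
    //   int first_diff_utf16(uint64_t a, uint64_t b) {
    //     uint64_t x = a ^ b;                                      // non-zero here
    //     int bit = __builtin_clzll(__builtin_bswap64(x)) & ~15;   // bit offset of first differing char
    //     return (int)((a >> bit) & 0xffff) - (int)((b >> bit) & 0xffff);
    //   }
    //
    // rev+clz play the role of bswap+clz, and the andr with -16 rounds the bit
    // index down to a whole character.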
5500 __ bind(CALCULATE_DIFFERENCE); 5501 __ rev(rscratch2, rscratch2); 5502 __ clz(rscratch2, rscratch2); 5503 __ andr(rscratch2, rscratch2, -16); 5504 __ lsrv(tmp1, tmp1, rscratch2); 5505 __ uxthw(tmp1, tmp1); 5506 __ lsrv(rscratch1, rscratch1, rscratch2); 5507 __ uxthw(rscratch1, rscratch1); 5508 __ subw(result, tmp1, rscratch1); 5509 __ bind(DONE); 5510 __ ret(lr); 5511 return entry; 5512 } 5513 5514 address generate_method_entry_barrier() { 5515 __ align(CodeEntryAlignment); 5516 StubCodeMark mark(this, "StubRoutines", "nmethod_entry_barrier"); 5517 5518 Label deoptimize_label; 5519 5520 address start = __ pc(); 5521 5522 BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler(); 5523 5524 if (bs_asm->nmethod_patching_type() == NMethodPatchingType::conc_instruction_and_data_patch) { 5525 BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod(); 5526 // We can get here despite the nmethod being good, if we have not 5527 // yet applied our cross modification fence (or data fence). 5528 Address thread_epoch_addr(rthread, in_bytes(bs_nm->thread_disarmed_guard_value_offset()) + 4); 5529 __ lea(rscratch2, ExternalAddress(bs_asm->patching_epoch_addr())); 5530 __ ldrw(rscratch2, rscratch2); 5531 __ strw(rscratch2, thread_epoch_addr); 5532 __ isb(); 5533 __ membar(__ LoadLoad); 5534 } 5535 5536 __ set_last_Java_frame(sp, rfp, lr, rscratch1); 5537 5538 __ enter(); 5539 __ add(rscratch2, sp, wordSize); // rscratch2 points to the saved lr 5540 5541 __ sub(sp, sp, 4 * wordSize); // four words for the returned {sp, fp, lr, pc} 5542 5543 __ push_call_clobbered_registers(); 5544 5545 __ mov(c_rarg0, rscratch2); 5546 __ call_VM_leaf 5547 (CAST_FROM_FN_PTR 5548 (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1); 5549 5550 __ reset_last_Java_frame(true); 5551 5552 __ mov(rscratch1, r0); 5553 5554 __ pop_call_clobbered_registers(); 5555 5556 __ cbnz(rscratch1, deoptimize_label); 5557 5558 __ leave(); 5559 __ ret(lr); 5560 5561 __ BIND(deoptimize_label); 5562 5563 __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize)); 5564 __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize)); 5565 5566 __ mov(sp, rscratch1); 5567 __ br(rscratch2); 5568 5569 return start; 5570 } 5571 5572 // r0 = result 5573 // r1 = str1 5574 // r2 = cnt1 5575 // r3 = str2 5576 // r4 = cnt2 5577 // r10 = tmp1 5578 // r11 = tmp2 5579 address generate_compare_long_string_same_encoding(bool isLL) { 5580 __ align(CodeEntryAlignment); 5581 StubCodeMark mark(this, "StubRoutines", isLL 5582 ? "compare_long_string_same_encoding LL" 5583 : "compare_long_string_same_encoding UU"); 5584 address entry = __ pc(); 5585 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 5586 tmp1 = r10, tmp2 = r11, tmp1h = rscratch1, tmp2h = rscratch2; 5587 5588 Label LARGE_LOOP_PREFETCH, LOOP_COMPARE16, DIFF, LESS16, LESS8, CAL_DIFFERENCE, LENGTH_DIFF; 5589 5590 // exit from large loop when less than 64 bytes left to read or we're about 5591 // to prefetch memory behind array border 5592 int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2); 5593 5594 // before jumping to stub, pre-load 8 bytes already, so do comparison directly 5595 __ eor(rscratch2, tmp1, tmp2); 5596 __ cbnz(rscratch2, CAL_DIFFERENCE); 5597 5598 __ sub(cnt2, cnt2, wordSize/(isLL ? 
1 : 2)); 5599 // update pointers, because of previous read 5600 __ add(str1, str1, wordSize); 5601 __ add(str2, str2, wordSize); 5602 if (SoftwarePrefetchHintDistance >= 0) { 5603 __ align(OptoLoopAlignment); 5604 __ bind(LARGE_LOOP_PREFETCH); 5605 __ prfm(Address(str1, SoftwarePrefetchHintDistance)); 5606 __ prfm(Address(str2, SoftwarePrefetchHintDistance)); 5607 5608 for (int i = 0; i < 4; i++) { 5609 __ ldp(tmp1, tmp1h, Address(str1, i * 16)); 5610 __ ldp(tmp2, tmp2h, Address(str2, i * 16)); 5611 __ cmp(tmp1, tmp2); 5612 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ); 5613 __ br(Assembler::NE, DIFF); 5614 } 5615 __ sub(cnt2, cnt2, isLL ? 64 : 32); 5616 __ add(str1, str1, 64); 5617 __ add(str2, str2, 64); 5618 __ subs(rscratch2, cnt2, largeLoopExitCondition); 5619 __ br(Assembler::GE, LARGE_LOOP_PREFETCH); 5620 __ cbz(cnt2, LENGTH_DIFF); // no more chars left? 5621 } 5622 5623 __ subs(rscratch1, cnt2, isLL ? 16 : 8); 5624 __ br(Assembler::LE, LESS16); 5625 __ align(OptoLoopAlignment); 5626 __ bind(LOOP_COMPARE16); 5627 __ ldp(tmp1, tmp1h, Address(__ post(str1, 16))); 5628 __ ldp(tmp2, tmp2h, Address(__ post(str2, 16))); 5629 __ cmp(tmp1, tmp2); 5630 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ); 5631 __ br(Assembler::NE, DIFF); 5632 __ sub(cnt2, cnt2, isLL ? 16 : 8); 5633 __ subs(rscratch2, cnt2, isLL ? 16 : 8); 5634 __ br(Assembler::LT, LESS16); 5635 5636 __ ldp(tmp1, tmp1h, Address(__ post(str1, 16))); 5637 __ ldp(tmp2, tmp2h, Address(__ post(str2, 16))); 5638 __ cmp(tmp1, tmp2); 5639 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ); 5640 __ br(Assembler::NE, DIFF); 5641 __ sub(cnt2, cnt2, isLL ? 16 : 8); 5642 __ subs(rscratch2, cnt2, isLL ? 16 : 8); 5643 __ br(Assembler::GE, LOOP_COMPARE16); 5644 __ cbz(cnt2, LENGTH_DIFF); 5645 5646 __ bind(LESS16); 5647 // each 8 compare 5648 __ subs(cnt2, cnt2, isLL ? 8 : 4); 5649 __ br(Assembler::LE, LESS8); 5650 __ ldr(tmp1, Address(__ post(str1, 8))); 5651 __ ldr(tmp2, Address(__ post(str2, 8))); 5652 __ eor(rscratch2, tmp1, tmp2); 5653 __ cbnz(rscratch2, CAL_DIFFERENCE); 5654 __ sub(cnt2, cnt2, isLL ? 8 : 4); 5655 5656 __ bind(LESS8); // directly load last 8 bytes 5657 if (!isLL) { 5658 __ add(cnt2, cnt2, cnt2); 5659 } 5660 __ ldr(tmp1, Address(str1, cnt2)); 5661 __ ldr(tmp2, Address(str2, cnt2)); 5662 __ eor(rscratch2, tmp1, tmp2); 5663 __ cbz(rscratch2, LENGTH_DIFF); 5664 __ b(CAL_DIFFERENCE); 5665 5666 __ bind(DIFF); 5667 __ cmp(tmp1, tmp2); 5668 __ csel(tmp1, tmp1, tmp1h, Assembler::NE); 5669 __ csel(tmp2, tmp2, tmp2h, Assembler::NE); 5670 // reuse rscratch2 register for the result of eor instruction 5671 __ eor(rscratch2, tmp1, tmp2); 5672 5673 __ bind(CAL_DIFFERENCE); 5674 __ rev(rscratch2, rscratch2); 5675 __ clz(rscratch2, rscratch2); 5676 __ andr(rscratch2, rscratch2, isLL ? 
-8 : -16); 5677 __ lsrv(tmp1, tmp1, rscratch2); 5678 __ lsrv(tmp2, tmp2, rscratch2); 5679 if (isLL) { 5680 __ uxtbw(tmp1, tmp1); 5681 __ uxtbw(tmp2, tmp2); 5682 } else { 5683 __ uxthw(tmp1, tmp1); 5684 __ uxthw(tmp2, tmp2); 5685 } 5686 __ subw(result, tmp1, tmp2); 5687 5688 __ bind(LENGTH_DIFF); 5689 __ ret(lr); 5690 return entry; 5691 } 5692 5693 enum string_compare_mode { 5694 LL, 5695 LU, 5696 UL, 5697 UU, 5698 }; 5699 5700 // The following registers are declared in aarch64.ad 5701 // r0 = result 5702 // r1 = str1 5703 // r2 = cnt1 5704 // r3 = str2 5705 // r4 = cnt2 5706 // r10 = tmp1 5707 // r11 = tmp2 5708 // z0 = ztmp1 5709 // z1 = ztmp2 5710 // p0 = pgtmp1 5711 // p1 = pgtmp2 5712 address generate_compare_long_string_sve(string_compare_mode mode) { 5713 __ align(CodeEntryAlignment); 5714 address entry = __ pc(); 5715 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 5716 tmp1 = r10, tmp2 = r11; 5717 5718 Label LOOP, DONE, MISMATCH; 5719 Register vec_len = tmp1; 5720 Register idx = tmp2; 5721 // The minimum of the string lengths has been stored in cnt2. 5722 Register cnt = cnt2; 5723 FloatRegister ztmp1 = z0, ztmp2 = z1; 5724 PRegister pgtmp1 = p0, pgtmp2 = p1; 5725 5726 #define LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx) \ 5727 switch (mode) { \ 5728 case LL: \ 5729 __ sve_ld1b(ztmp1, __ B, pgtmp1, Address(str1, idx)); \ 5730 __ sve_ld1b(ztmp2, __ B, pgtmp1, Address(str2, idx)); \ 5731 break; \ 5732 case LU: \ 5733 __ sve_ld1b(ztmp1, __ H, pgtmp1, Address(str1, idx)); \ 5734 __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \ 5735 break; \ 5736 case UL: \ 5737 __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \ 5738 __ sve_ld1b(ztmp2, __ H, pgtmp1, Address(str2, idx)); \ 5739 break; \ 5740 case UU: \ 5741 __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \ 5742 __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \ 5743 break; \ 5744 default: \ 5745 ShouldNotReachHere(); \ 5746 } 5747 5748 const char* stubname; 5749 switch (mode) { 5750 case LL: stubname = "compare_long_string_same_encoding LL"; break; 5751 case LU: stubname = "compare_long_string_different_encoding LU"; break; 5752 case UL: stubname = "compare_long_string_different_encoding UL"; break; 5753 case UU: stubname = "compare_long_string_same_encoding UU"; break; 5754 default: ShouldNotReachHere(); 5755 } 5756 5757 StubCodeMark mark(this, "StubRoutines", stubname); 5758 5759 __ mov(idx, 0); 5760 __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt); 5761 5762 if (mode == LL) { 5763 __ sve_cntb(vec_len); 5764 } else { 5765 __ sve_cnth(vec_len); 5766 } 5767 5768 __ sub(rscratch1, cnt, vec_len); 5769 5770 __ bind(LOOP); 5771 5772 // main loop 5773 LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx); 5774 __ add(idx, idx, vec_len); 5775 // Compare strings. 5776 __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2); 5777 __ br(__ NE, MISMATCH); 5778 __ cmp(idx, rscratch1); 5779 __ br(__ LT, LOOP); 5780 5781 // post loop, last iteration 5782 __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt); 5783 5784 LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx); 5785 __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2); 5786 __ br(__ EQ, DONE); 5787 5788 __ bind(MISMATCH); 5789 5790 // Crop the vector to find its location. 5791 __ sve_brkb(pgtmp2, pgtmp1, pgtmp2, false /* isMerge */); 5792 // Extract the first different characters of each string. 
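    // Conceptually, the sve_brkb/sve_lasta pair below is the predicated form of
    // this scalar loop over the lanes that were active in the last comparison
    // (illustrative sketch only, not generated code):
    //
    //   for (int i = 0; i < active_lanes; i++) {
    //     if (s1[i] != s2[i]) return (int)s1[i] - (int)s2[i];
    //   }
    //
    // brkb limits the predicate to the lanes before the first mismatch, and
    // lasta then picks the element just past the last active lane, i.e. the
    // first mismatching one.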
5793 __ sve_lasta(rscratch1, mode == LL ? __ B : __ H, pgtmp2, ztmp1); 5794 __ sve_lasta(rscratch2, mode == LL ? __ B : __ H, pgtmp2, ztmp2); 5795 5796 // Compute the difference of the first different characters. 5797 __ sub(result, rscratch1, rscratch2); 5798 5799 __ bind(DONE); 5800 __ ret(lr); 5801 #undef LOAD_PAIR 5802 return entry; 5803 } 5804 5805 void generate_compare_long_strings() { 5806 if (UseSVE == 0) { 5807 StubRoutines::aarch64::_compare_long_string_LL 5808 = generate_compare_long_string_same_encoding(true); 5809 StubRoutines::aarch64::_compare_long_string_UU 5810 = generate_compare_long_string_same_encoding(false); 5811 StubRoutines::aarch64::_compare_long_string_LU 5812 = generate_compare_long_string_different_encoding(true); 5813 StubRoutines::aarch64::_compare_long_string_UL 5814 = generate_compare_long_string_different_encoding(false); 5815 } else { 5816 StubRoutines::aarch64::_compare_long_string_LL 5817 = generate_compare_long_string_sve(LL); 5818 StubRoutines::aarch64::_compare_long_string_UU 5819 = generate_compare_long_string_sve(UU); 5820 StubRoutines::aarch64::_compare_long_string_LU 5821 = generate_compare_long_string_sve(LU); 5822 StubRoutines::aarch64::_compare_long_string_UL 5823 = generate_compare_long_string_sve(UL); 5824 } 5825 } 5826 5827 // R0 = result 5828 // R1 = str2 5829 // R2 = cnt1 5830 // R3 = str1 5831 // R4 = cnt2 5832 // Clobbers: rscratch1, rscratch2, v0, v1, rflags 5833 // 5834 // This generic linear code use few additional ideas, which makes it faster: 5835 // 1) we can safely keep at least 1st register of pattern(since length >= 8) 5836 // in order to skip initial loading(help in systems with 1 ld pipeline) 5837 // 2) we can use "fast" algorithm of finding single character to search for 5838 // first symbol with less branches(1 branch per each loaded register instead 5839 // of branch for each symbol), so, this is where constants like 5840 // 0x0101...01, 0x00010001...0001, 0x7f7f...7f, 0x7fff7fff...7fff comes from 5841 // 3) after loading and analyzing 1st register of source string, it can be 5842 // used to search for every 1st character entry, saving few loads in 5843 // comparison with "simplier-but-slower" implementation 5844 // 4) in order to avoid lots of push/pop operations, code below is heavily 5845 // re-using/re-initializing/compressing register values, which makes code 5846 // larger and a bit less readable, however, most of extra operations are 5847 // issued during loads or branches, so, penalty is minimal 5848 address generate_string_indexof_linear(bool str1_isL, bool str2_isL) { 5849 const char* stubName = str1_isL 5850 ? (str2_isL ? "indexof_linear_ll" : "indexof_linear_ul") 5851 : "indexof_linear_uu"; 5852 __ align(CodeEntryAlignment); 5853 StubCodeMark mark(this, "StubRoutines", stubName); 5854 address entry = __ pc(); 5855 5856 int str1_chr_size = str1_isL ? 1 : 2; 5857 int str2_chr_size = str2_isL ? 1 : 2; 5858 int str1_chr_shift = str1_isL ? 0 : 1; 5859 int str2_chr_shift = str2_isL ? 
0 : 1; 5860 bool isL = str1_isL && str2_isL; 5861 // parameters 5862 Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4; 5863 // temporary registers 5864 Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23; 5865 RegSet spilled_regs = RegSet::range(tmp1, tmp4); 5866 // redefinitions 5867 Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3; 5868 5869 __ push(spilled_regs, sp); 5870 Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO, 5871 L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED, 5872 L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP, 5873 L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH, 5874 L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2, 5875 L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH; 5876 // Read whole register from str1. It is safe, because length >=8 here 5877 __ ldr(ch1, Address(str1)); 5878 // Read whole register from str2. It is safe, because length >=8 here 5879 __ ldr(ch2, Address(str2)); 5880 __ sub(cnt2, cnt2, cnt1); 5881 __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF); 5882 if (str1_isL != str2_isL) { 5883 __ eor(v0, __ T16B, v0, v0); 5884 } 5885 __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001); 5886 __ mul(first, first, tmp1); 5887 // check if we have less than 1 register to check 5888 __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1); 5889 if (str1_isL != str2_isL) { 5890 __ fmovd(v1, ch1); 5891 } 5892 __ br(__ LE, L_SMALL); 5893 __ eor(ch2, first, ch2); 5894 if (str1_isL != str2_isL) { 5895 __ zip1(v1, __ T16B, v1, v0); 5896 } 5897 __ sub(tmp2, ch2, tmp1); 5898 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 5899 __ bics(tmp2, tmp2, ch2); 5900 if (str1_isL != str2_isL) { 5901 __ fmovd(ch1, v1); 5902 } 5903 __ br(__ NE, L_HAS_ZERO); 5904 __ subs(cnt2, cnt2, wordSize/str2_chr_size); 5905 __ add(result, result, wordSize/str2_chr_size); 5906 __ add(str2, str2, wordSize); 5907 __ br(__ LT, L_POST_LOOP); 5908 __ BIND(L_LOOP); 5909 __ ldr(ch2, Address(str2)); 5910 __ eor(ch2, first, ch2); 5911 __ sub(tmp2, ch2, tmp1); 5912 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 5913 __ bics(tmp2, tmp2, ch2); 5914 __ br(__ NE, L_HAS_ZERO); 5915 __ BIND(L_LOOP_PROCEED); 5916 __ subs(cnt2, cnt2, wordSize/str2_chr_size); 5917 __ add(str2, str2, wordSize); 5918 __ add(result, result, wordSize/str2_chr_size); 5919 __ br(__ GE, L_LOOP); 5920 __ BIND(L_POST_LOOP); 5921 __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check 5922 __ br(__ LE, NOMATCH); 5923 __ ldr(ch2, Address(str2)); 5924 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift); 5925 __ eor(ch2, first, ch2); 5926 __ sub(tmp2, ch2, tmp1); 5927 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 5928 __ mov(tmp4, -1); // all bits set 5929 __ b(L_SMALL_PROCEED); 5930 __ align(OptoLoopAlignment); 5931 __ BIND(L_SMALL); 5932 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift); 5933 __ eor(ch2, first, ch2); 5934 if (str1_isL != str2_isL) { 5935 __ zip1(v1, __ T16B, v1, v0); 5936 } 5937 __ sub(tmp2, ch2, tmp1); 5938 __ mov(tmp4, -1); // all bits set 5939 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 5940 if (str1_isL != str2_isL) { 5941 __ fmovd(ch1, v1); // move converted 4 symbols 5942 } 5943 __ BIND(L_SMALL_PROCEED); 5944 __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits. 
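    // Illustrative aside (not generated code): the bic/ands just below finish
    // the classic SWAR zero-byte test this stub uses to flag every position of
    // str2 that matches the first pattern character. With
    // x = str2_chunk ^ broadcast(first), the LL variant computes, in scalar C
    // (hypothetical helper):
    //
    //   static inline uint64_t zero_byte_mask(uint64_t x) {
    //     // 0x80 appears in each byte of the result iff that byte of x is 0x00
    //     return (x - 0x0101010101010101ULL) & ~(x | 0x7f7f7f7f7f7f7f7fULL);
    //   }
    //
    // The UU/UL variants use the 16-bit constants 0x0001...0001 and 0x7fff...7fff
    // instead, and tmp4 additionally masks off positions beyond the string's end.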
5945 __ bic(tmp2, tmp2, ch2); 5946 __ ands(tmp2, tmp2, tmp4); // clear useless bits and check 5947 __ rbit(tmp2, tmp2); 5948 __ br(__ EQ, NOMATCH); 5949 __ BIND(L_SMALL_HAS_ZERO_LOOP); 5950 __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some cpu's 5951 __ cmp(cnt1, u1(wordSize/str2_chr_size)); 5952 __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2); 5953 if (str2_isL) { // LL 5954 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index" 5955 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe. 5956 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info 5957 __ add(result, result, tmp4, __ LSR, LogBitsPerByte); 5958 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 5959 } else { 5960 __ mov(ch2, 0xE); // all bits in byte set except last one 5961 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 5962 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 5963 __ lslv(tmp2, tmp2, tmp4); 5964 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 5965 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 5966 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 5967 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 5968 } 5969 __ cmp(ch1, ch2); 5970 __ mov(tmp4, wordSize/str2_chr_size); 5971 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 5972 __ BIND(L_SMALL_CMP_LOOP); 5973 str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift))) 5974 : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift))); 5975 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))) 5976 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))); 5977 __ add(tmp4, tmp4, 1); 5978 __ cmp(tmp4, cnt1); 5979 __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP); 5980 __ cmp(first, ch2); 5981 __ br(__ EQ, L_SMALL_CMP_LOOP); 5982 __ BIND(L_SMALL_CMP_LOOP_NOMATCH); 5983 __ cbz(tmp2, NOMATCH); // no more matches. exit 5984 __ clz(tmp4, tmp2); 5985 __ add(result, result, 1); // advance index 5986 __ add(str2, str2, str2_chr_size); // advance pointer 5987 __ b(L_SMALL_HAS_ZERO_LOOP); 5988 __ align(OptoLoopAlignment); 5989 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP); 5990 __ cmp(first, ch2); 5991 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 5992 __ b(DONE); 5993 __ align(OptoLoopAlignment); 5994 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2); 5995 if (str2_isL) { // LL 5996 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index" 5997 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe. 5998 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info 5999 __ add(result, result, tmp4, __ LSR, LogBitsPerByte); 6000 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 6001 } else { 6002 __ mov(ch2, 0xE); // all bits in byte set except last one 6003 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 6004 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 6005 __ lslv(tmp2, tmp2, tmp4); 6006 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 6007 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 6008 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 6009 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 6010 } 6011 __ cmp(ch1, ch2); 6012 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 6013 __ b(DONE); 6014 __ align(OptoLoopAlignment); 6015 __ BIND(L_HAS_ZERO); 6016 __ rbit(tmp2, tmp2); 6017 __ clz(tmp4, tmp2); // potentially long. 
Up to 4 cycles on some CPU's 6018 // Now, perform compression of counters(cnt2 and cnt1) into one register. 6019 // It's fine because both counters are 32bit and are not changed in this 6020 // loop. Just restore it on exit. So, cnt1 can be re-used in this loop. 6021 __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2); 6022 __ sub(result, result, 1); 6023 __ BIND(L_HAS_ZERO_LOOP); 6024 __ mov(cnt1, wordSize/str2_chr_size); 6025 __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2); 6026 __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare 6027 if (str2_isL) { 6028 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index 6029 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 6030 __ lslv(tmp2, tmp2, tmp4); 6031 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 6032 __ add(tmp4, tmp4, 1); 6033 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 6034 __ lsl(tmp2, tmp2, 1); 6035 __ mov(tmp4, wordSize/str2_chr_size); 6036 } else { 6037 __ mov(ch2, 0xE); 6038 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 6039 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 6040 __ lslv(tmp2, tmp2, tmp4); 6041 __ add(tmp4, tmp4, 1); 6042 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 6043 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); 6044 __ lsl(tmp2, tmp2, 1); 6045 __ mov(tmp4, wordSize/str2_chr_size); 6046 __ sub(str2, str2, str2_chr_size); 6047 } 6048 __ cmp(ch1, ch2); 6049 __ mov(tmp4, wordSize/str2_chr_size); 6050 __ br(__ NE, L_CMP_LOOP_NOMATCH); 6051 __ BIND(L_CMP_LOOP); 6052 str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift))) 6053 : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift))); 6054 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))) 6055 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))); 6056 __ add(tmp4, tmp4, 1); 6057 __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2); 6058 __ br(__ GE, L_CMP_LOOP_LAST_CMP); 6059 __ cmp(cnt1, ch2); 6060 __ br(__ EQ, L_CMP_LOOP); 6061 __ BIND(L_CMP_LOOP_NOMATCH); 6062 // here we're not matched 6063 __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop 6064 __ clz(tmp4, tmp2); 6065 __ add(str2, str2, str2_chr_size); // advance pointer 6066 __ b(L_HAS_ZERO_LOOP); 6067 __ align(OptoLoopAlignment); 6068 __ BIND(L_CMP_LOOP_LAST_CMP); 6069 __ cmp(cnt1, ch2); 6070 __ br(__ NE, L_CMP_LOOP_NOMATCH); 6071 __ b(DONE); 6072 __ align(OptoLoopAlignment); 6073 __ BIND(L_CMP_LOOP_LAST_CMP2); 6074 if (str2_isL) { 6075 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index 6076 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 6077 __ lslv(tmp2, tmp2, tmp4); 6078 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 6079 __ add(tmp4, tmp4, 1); 6080 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 6081 __ lsl(tmp2, tmp2, 1); 6082 } else { 6083 __ mov(ch2, 0xE); 6084 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 6085 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 
6086 __ lslv(tmp2, tmp2, tmp4); 6087 __ add(tmp4, tmp4, 1); 6088 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 6089 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); 6090 __ lsl(tmp2, tmp2, 1); 6091 __ sub(str2, str2, str2_chr_size); 6092 } 6093 __ cmp(ch1, ch2); 6094 __ br(__ NE, L_CMP_LOOP_NOMATCH); 6095 __ b(DONE); 6096 __ align(OptoLoopAlignment); 6097 __ BIND(L_HAS_ZERO_LOOP_NOMATCH); 6098 // 1) Restore "result" index. Index was wordSize/str2_chr_size * N until 6099 // L_HAS_ZERO block. Byte octet was analyzed in L_HAS_ZERO_LOOP, 6100 // so, result was increased at max by wordSize/str2_chr_size - 1, so, 6101 // respective high bit wasn't changed. L_LOOP_PROCEED will increase 6102 // result by analyzed characters value, so, we can just reset lower bits 6103 // in result here. Clear 2 lower bits for UU/UL and 3 bits for LL 6104 // 2) restore cnt1 and cnt2 values from "compressed" cnt2 6105 // 3) advance str2 value to represent next str2 octet. result & 7/3 is 6106 // index of last analyzed substring inside current octet. So, str2 in at 6107 // respective start address. We need to advance it to next octet 6108 __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed 6109 __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2); 6110 __ bfm(result, zr, 0, 2 - str2_chr_shift); 6111 __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2 6112 __ movw(cnt2, cnt2); 6113 __ b(L_LOOP_PROCEED); 6114 __ align(OptoLoopAlignment); 6115 __ BIND(NOMATCH); 6116 __ mov(result, -1); 6117 __ BIND(DONE); 6118 __ pop(spilled_regs, sp); 6119 __ ret(lr); 6120 return entry; 6121 } 6122 6123 void generate_string_indexof_stubs() { 6124 StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true); 6125 StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false); 6126 StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false); 6127 } 6128 6129 void inflate_and_store_2_fp_registers(bool generatePrfm, 6130 FloatRegister src1, FloatRegister src2) { 6131 Register dst = r1; 6132 __ zip1(v1, __ T16B, src1, v0); 6133 __ zip2(v2, __ T16B, src1, v0); 6134 if (generatePrfm) { 6135 __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM); 6136 } 6137 __ zip1(v3, __ T16B, src2, v0); 6138 __ zip2(v4, __ T16B, src2, v0); 6139 __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64))); 6140 } 6141 6142 // R0 = src 6143 // R1 = dst 6144 // R2 = len 6145 // R3 = len >> 3 6146 // V0 = 0 6147 // v1 = loaded 8 bytes 6148 // Clobbers: r0, r1, r3, rscratch1, rflags, v0-v6 6149 address generate_large_byte_array_inflate() { 6150 __ align(CodeEntryAlignment); 6151 StubCodeMark mark(this, "StubRoutines", "large_byte_array_inflate"); 6152 address entry = __ pc(); 6153 Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE; 6154 Register src = r0, dst = r1, len = r2, octetCounter = r3; 6155 const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4; 6156 6157 // do one more 8-byte read to have address 16-byte aligned in most cases 6158 // also use single store instruction 6159 __ ldrd(v2, __ post(src, 8)); 6160 __ sub(octetCounter, octetCounter, 2); 6161 __ zip1(v1, __ T16B, v1, v0); 6162 __ zip1(v2, __ T16B, v2, v0); 6163 __ st1(v1, v2, __ T16B, __ post(dst, 32)); 6164 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 6165 __ subs(rscratch1, octetCounter, large_loop_threshold); 6166 __ br(__ LE, LOOP_START); 6167 __ b(LOOP_PRFM_START); 6168 __ bind(LOOP_PRFM); 
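    // Each iteration of this loop (and of LOOP below) inflates 64 Latin-1 bytes
    // into 64 UTF-16 characters by interleaving the input with the zero vector
    // v0 (zip1/zip2). The scalar equivalent is simply (illustrative only):
    //
    //   for (size_t i = 0; i < len; i++) {
    //     dst[i] = (jchar)(src[i] & 0xff);   // zero-extend each byte to 16 bits
    //   }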
6169 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 6170 __ bind(LOOP_PRFM_START); 6171 __ prfm(Address(src, SoftwarePrefetchHintDistance)); 6172 __ sub(octetCounter, octetCounter, 8); 6173 __ subs(rscratch1, octetCounter, large_loop_threshold); 6174 inflate_and_store_2_fp_registers(true, v3, v4); 6175 inflate_and_store_2_fp_registers(true, v5, v6); 6176 __ br(__ GT, LOOP_PRFM); 6177 __ cmp(octetCounter, (u1)8); 6178 __ br(__ LT, DONE); 6179 __ bind(LOOP); 6180 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 6181 __ bind(LOOP_START); 6182 __ sub(octetCounter, octetCounter, 8); 6183 __ cmp(octetCounter, (u1)8); 6184 inflate_and_store_2_fp_registers(false, v3, v4); 6185 inflate_and_store_2_fp_registers(false, v5, v6); 6186 __ br(__ GE, LOOP); 6187 __ bind(DONE); 6188 __ ret(lr); 6189 return entry; 6190 } 6191 6192 /** 6193 * Arguments: 6194 * 6195 * Input: 6196 * c_rarg0 - current state address 6197 * c_rarg1 - H key address 6198 * c_rarg2 - data address 6199 * c_rarg3 - number of blocks 6200 * 6201 * Output: 6202 * Updated state at c_rarg0 6203 */ 6204 address generate_ghash_processBlocks() { 6205 // Bafflingly, GCM uses little-endian for the byte order, but 6206 // big-endian for the bit order. For example, the polynomial 1 is 6207 // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00. 6208 // 6209 // So, we must either reverse the bytes in each word and do 6210 // everything big-endian or reverse the bits in each byte and do 6211 // it little-endian. On AArch64 it's more idiomatic to reverse 6212 // the bits in each byte (we have an instruction, RBIT, to do 6213 // that) and keep the data in little-endian bit order through the 6214 // calculation, bit-reversing the inputs and outputs. 6215 6216 StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks"); 6217 __ align(wordSize * 2); 6218 address p = __ pc(); 6219 __ emit_int64(0x87); // The low-order bits of the field 6220 // polynomial (i.e. 
p = z^7+z^2+z+1) 6221 // repeated in the low and high parts of a 6222 // 128-bit vector 6223 __ emit_int64(0x87); 6224 6225 __ align(CodeEntryAlignment); 6226 address start = __ pc(); 6227 6228 Register state = c_rarg0; 6229 Register subkeyH = c_rarg1; 6230 Register data = c_rarg2; 6231 Register blocks = c_rarg3; 6232 6233 FloatRegister vzr = v30; 6234 __ eor(vzr, __ T16B, vzr, vzr); // zero register 6235 6236 __ ldrq(v24, p); // The field polynomial 6237 6238 __ ldrq(v0, Address(state)); 6239 __ ldrq(v1, Address(subkeyH)); 6240 6241 __ rev64(v0, __ T16B, v0); // Bit-reverse words in state and subkeyH 6242 __ rbit(v0, __ T16B, v0); 6243 __ rev64(v1, __ T16B, v1); 6244 __ rbit(v1, __ T16B, v1); 6245 6246 __ ext(v4, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1 6247 __ eor(v4, __ T16B, v4, v1); // xor subkeyH into subkeyL (Karatsuba: (A1+A0)) 6248 6249 { 6250 Label L_ghash_loop; 6251 __ bind(L_ghash_loop); 6252 6253 __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit 6254 // reversing each byte 6255 __ rbit(v2, __ T16B, v2); 6256 __ eor(v2, __ T16B, v0, v2); // bit-swapped data ^ bit-swapped state 6257 6258 // Multiply state in v2 by subkey in v1 6259 __ ghash_multiply(/*result_lo*/v5, /*result_hi*/v7, 6260 /*a*/v1, /*b*/v2, /*a1_xor_a0*/v4, 6261 /*temps*/v6, v3, /*reuse/clobber b*/v2); 6262 // Reduce v7:v5 by the field polynomial 6263 __ ghash_reduce(/*result*/v0, /*lo*/v5, /*hi*/v7, /*p*/v24, vzr, /*temp*/v3); 6264 6265 __ sub(blocks, blocks, 1); 6266 __ cbnz(blocks, L_ghash_loop); 6267 } 6268 6269 // The bit-reversed result is at this point in v0 6270 __ rev64(v0, __ T16B, v0); 6271 __ rbit(v0, __ T16B, v0); 6272 6273 __ st1(v0, __ T16B, state); 6274 __ ret(lr); 6275 6276 return start; 6277 } 6278 6279 address generate_ghash_processBlocks_wide() { 6280 address small = generate_ghash_processBlocks(); 6281 6282 StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks_wide"); 6283 __ align(wordSize * 2); 6284 address p = __ pc(); 6285 __ emit_int64(0x87); // The low-order bits of the field 6286 // polynomial (i.e. p = z^7+z^2+z+1) 6287 // repeated in the low and high parts of a 6288 // 128-bit vector 6289 __ emit_int64(0x87); 6290 6291 __ align(CodeEntryAlignment); 6292 address start = __ pc(); 6293 6294 Register state = c_rarg0; 6295 Register subkeyH = c_rarg1; 6296 Register data = c_rarg2; 6297 Register blocks = c_rarg3; 6298 6299 const int unroll = 4; 6300 6301 __ cmp(blocks, (unsigned char)(unroll * 2)); 6302 __ br(__ LT, small); 6303 6304 if (unroll > 1) { 6305 // Save state before entering routine 6306 __ sub(sp, sp, 4 * 16); 6307 __ st1(v12, v13, v14, v15, __ T16B, Address(sp)); 6308 __ sub(sp, sp, 4 * 16); 6309 __ st1(v8, v9, v10, v11, __ T16B, Address(sp)); 6310 } 6311 6312 __ ghash_processBlocks_wide(p, state, subkeyH, data, blocks, unroll); 6313 6314 if (unroll > 1) { 6315 // And restore state 6316 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16)); 6317 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16)); 6318 } 6319 6320 __ cmp(blocks, (unsigned char)0); 6321 __ br(__ GT, small); 6322 6323 __ ret(lr); 6324 6325 return start; 6326 } 6327 6328 void generate_base64_encode_simdround(Register src, Register dst, 6329 FloatRegister codec, u8 size) { 6330 6331 FloatRegister in0 = v4, in1 = v5, in2 = v6; 6332 FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19; 6333 FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23; 6334 6335 Assembler::SIMD_Arrangement arrangement = size == 16 ? 
__ T16B : __ T8B; 6336 6337 __ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size)); 6338 6339 __ ushr(ind0, arrangement, in0, 2); 6340 6341 __ ushr(ind1, arrangement, in1, 2); 6342 __ shl(in0, arrangement, in0, 6); 6343 __ orr(ind1, arrangement, ind1, in0); 6344 __ ushr(ind1, arrangement, ind1, 2); 6345 6346 __ ushr(ind2, arrangement, in2, 4); 6347 __ shl(in1, arrangement, in1, 4); 6348 __ orr(ind2, arrangement, in1, ind2); 6349 __ ushr(ind2, arrangement, ind2, 2); 6350 6351 __ shl(ind3, arrangement, in2, 2); 6352 __ ushr(ind3, arrangement, ind3, 2); 6353 6354 __ tbl(out0, arrangement, codec, 4, ind0); 6355 __ tbl(out1, arrangement, codec, 4, ind1); 6356 __ tbl(out2, arrangement, codec, 4, ind2); 6357 __ tbl(out3, arrangement, codec, 4, ind3); 6358 6359 __ st4(out0, out1, out2, out3, arrangement, __ post(dst, 4 * size)); 6360 } 6361 6362 /** 6363 * Arguments: 6364 * 6365 * Input: 6366 * c_rarg0 - src_start 6367 * c_rarg1 - src_offset 6368 * c_rarg2 - src_length 6369 * c_rarg3 - dest_start 6370 * c_rarg4 - dest_offset 6371 * c_rarg5 - isURL 6372 * 6373 */ 6374 address generate_base64_encodeBlock() { 6375 6376 static const char toBase64[64] = { 6377 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 6378 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 6379 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 6380 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 6381 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/' 6382 }; 6383 6384 static const char toBase64URL[64] = { 6385 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 6386 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 6387 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 6388 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 6389 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_' 6390 }; 6391 6392 __ align(CodeEntryAlignment); 6393 StubCodeMark mark(this, "StubRoutines", "encodeBlock"); 6394 address start = __ pc(); 6395 6396 Register src = c_rarg0; // source array 6397 Register soff = c_rarg1; // source start offset 6398 Register send = c_rarg2; // source end offset 6399 Register dst = c_rarg3; // dest array 6400 Register doff = c_rarg4; // position for writing to dest array 6401 Register isURL = c_rarg5; // Base64 or URL character set 6402 6403 // c_rarg6 and c_rarg7 are free to use as temps 6404 Register codec = c_rarg6; 6405 Register length = c_rarg7; 6406 6407 Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit; 6408 6409 __ add(src, src, soff); 6410 __ add(dst, dst, doff); 6411 __ sub(length, send, soff); 6412 6413 // load the codec base address 6414 __ lea(codec, ExternalAddress((address) toBase64)); 6415 __ cbz(isURL, ProcessData); 6416 __ lea(codec, ExternalAddress((address) toBase64URL)); 6417 6418 __ BIND(ProcessData); 6419 6420 // too short to formup a SIMD loop, roll back 6421 __ cmp(length, (u1)24); 6422 __ br(Assembler::LT, Process3B); 6423 6424 __ ld1(v0, v1, v2, v3, __ T16B, Address(codec)); 6425 6426 __ BIND(Process48B); 6427 __ cmp(length, (u1)48); 6428 __ br(Assembler::LT, Process24B); 6429 generate_base64_encode_simdround(src, dst, v0, 16); 6430 __ sub(length, length, 48); 6431 __ b(Process48B); 6432 6433 __ BIND(Process24B); 6434 __ cmp(length, (u1)24); 6435 __ br(Assembler::LT, SIMDExit); 6436 generate_base64_encode_simdround(src, dst, v0, 8); 6437 __ sub(length, length, 24); 6438 6439 __ BIND(SIMDExit); 6440 __ cbz(length, Exit); 6441 6442 __ 
BIND(Process3B); 6443 // 3 src bytes, 24 bits 6444 __ ldrb(r10, __ post(src, 1)); 6445 __ ldrb(r11, __ post(src, 1)); 6446 __ ldrb(r12, __ post(src, 1)); 6447 __ orrw(r11, r11, r10, Assembler::LSL, 8); 6448 __ orrw(r12, r12, r11, Assembler::LSL, 8); 6449 // codec index 6450 __ ubfmw(r15, r12, 18, 23); 6451 __ ubfmw(r14, r12, 12, 17); 6452 __ ubfmw(r13, r12, 6, 11); 6453 __ andw(r12, r12, 63); 6454 // get the code based on the codec 6455 __ ldrb(r15, Address(codec, r15, Address::uxtw(0))); 6456 __ ldrb(r14, Address(codec, r14, Address::uxtw(0))); 6457 __ ldrb(r13, Address(codec, r13, Address::uxtw(0))); 6458 __ ldrb(r12, Address(codec, r12, Address::uxtw(0))); 6459 __ strb(r15, __ post(dst, 1)); 6460 __ strb(r14, __ post(dst, 1)); 6461 __ strb(r13, __ post(dst, 1)); 6462 __ strb(r12, __ post(dst, 1)); 6463 __ sub(length, length, 3); 6464 __ cbnz(length, Process3B); 6465 6466 __ BIND(Exit); 6467 __ ret(lr); 6468 6469 return start; 6470 } 6471 6472 void generate_base64_decode_simdround(Register src, Register dst, 6473 FloatRegister codecL, FloatRegister codecH, int size, Label& Exit) { 6474 6475 FloatRegister in0 = v16, in1 = v17, in2 = v18, in3 = v19; 6476 FloatRegister out0 = v20, out1 = v21, out2 = v22; 6477 6478 FloatRegister decL0 = v23, decL1 = v24, decL2 = v25, decL3 = v26; 6479 FloatRegister decH0 = v28, decH1 = v29, decH2 = v30, decH3 = v31; 6480 6481 Label NoIllegalData, ErrorInLowerHalf, StoreLegalData; 6482 6483 Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B; 6484 6485 __ ld4(in0, in1, in2, in3, arrangement, __ post(src, 4 * size)); 6486 6487 // we need unsigned saturating subtract, to make sure all input values 6488 // in range [0, 63] will have 0U value in the higher half lookup 6489 __ uqsubv(decH0, __ T16B, in0, v27); 6490 __ uqsubv(decH1, __ T16B, in1, v27); 6491 __ uqsubv(decH2, __ T16B, in2, v27); 6492 __ uqsubv(decH3, __ T16B, in3, v27); 6493 6494 // lower half lookup 6495 __ tbl(decL0, arrangement, codecL, 4, in0); 6496 __ tbl(decL1, arrangement, codecL, 4, in1); 6497 __ tbl(decL2, arrangement, codecL, 4, in2); 6498 __ tbl(decL3, arrangement, codecL, 4, in3); 6499 6500 // higher half lookup 6501 __ tbx(decH0, arrangement, codecH, 4, decH0); 6502 __ tbx(decH1, arrangement, codecH, 4, decH1); 6503 __ tbx(decH2, arrangement, codecH, 4, decH2); 6504 __ tbx(decH3, arrangement, codecH, 4, decH3); 6505 6506 // combine lower and higher 6507 __ orr(decL0, arrangement, decL0, decH0); 6508 __ orr(decL1, arrangement, decL1, decH1); 6509 __ orr(decL2, arrangement, decL2, decH2); 6510 __ orr(decL3, arrangement, decL3, decH3); 6511 6512 // check illegal inputs, value larger than 63 (maximum of 6 bits) 6513 __ cm(Assembler::HI, decH0, arrangement, decL0, v27); 6514 __ cm(Assembler::HI, decH1, arrangement, decL1, v27); 6515 __ cm(Assembler::HI, decH2, arrangement, decL2, v27); 6516 __ cm(Assembler::HI, decH3, arrangement, decL3, v27); 6517 __ orr(in0, arrangement, decH0, decH1); 6518 __ orr(in1, arrangement, decH2, decH3); 6519 __ orr(in2, arrangement, in0, in1); 6520 __ umaxv(in3, arrangement, in2); 6521 __ umov(rscratch2, in3, __ B, 0); 6522 6523 // get the data to output 6524 __ shl(out0, arrangement, decL0, 2); 6525 __ ushr(out1, arrangement, decL1, 4); 6526 __ orr(out0, arrangement, out0, out1); 6527 __ shl(out1, arrangement, decL1, 4); 6528 __ ushr(out2, arrangement, decL2, 2); 6529 __ orr(out1, arrangement, out1, out2); 6530 __ shl(out2, arrangement, decL2, 6); 6531 __ orr(out2, arrangement, out2, decL3); 6532 6533 __ cbz(rscratch2, NoIllegalData); 6534 
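    // For reference, the shl/ushr/orr sequence above packs each group of four
    // decoded 6-bit symbols s0..s3 into three output bytes exactly as the
    // scalar code would (illustrative only; symbols that decoded to a value
    // above 63 are dealt with on the illegal-input path below):
    //
    //   out0 = (uint8_t)((s0 << 2) | (s1 >> 4));
    //   out1 = (uint8_t)((s1 << 4) | (s2 >> 2));
    //   out2 = (uint8_t)((s2 << 6) |  s3);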
6535 // handle illegal input 6536 __ umov(r10, in2, __ D, 0); 6537 if (size == 16) { 6538 __ cbnz(r10, ErrorInLowerHalf); 6539 6540 // illegal input is in higher half, store the lower half now. 6541 __ st3(out0, out1, out2, __ T8B, __ post(dst, 24)); 6542 6543 __ umov(r10, in2, __ D, 1); 6544 __ umov(r11, out0, __ D, 1); 6545 __ umov(r12, out1, __ D, 1); 6546 __ umov(r13, out2, __ D, 1); 6547 __ b(StoreLegalData); 6548 6549 __ BIND(ErrorInLowerHalf); 6550 } 6551 __ umov(r11, out0, __ D, 0); 6552 __ umov(r12, out1, __ D, 0); 6553 __ umov(r13, out2, __ D, 0); 6554 6555 __ BIND(StoreLegalData); 6556 __ tbnz(r10, 5, Exit); // 0xff indicates illegal input 6557 __ strb(r11, __ post(dst, 1)); 6558 __ strb(r12, __ post(dst, 1)); 6559 __ strb(r13, __ post(dst, 1)); 6560 __ lsr(r10, r10, 8); 6561 __ lsr(r11, r11, 8); 6562 __ lsr(r12, r12, 8); 6563 __ lsr(r13, r13, 8); 6564 __ b(StoreLegalData); 6565 6566 __ BIND(NoIllegalData); 6567 __ st3(out0, out1, out2, arrangement, __ post(dst, 3 * size)); 6568 } 6569 6570 6571 /** 6572 * Arguments: 6573 * 6574 * Input: 6575 * c_rarg0 - src_start 6576 * c_rarg1 - src_offset 6577 * c_rarg2 - src_length 6578 * c_rarg3 - dest_start 6579 * c_rarg4 - dest_offset 6580 * c_rarg5 - isURL 6581 * c_rarg6 - isMIME 6582 * 6583 */ 6584 address generate_base64_decodeBlock() { 6585 6586 // The SIMD part of this Base64 decode intrinsic is based on the algorithm outlined 6587 // on http://0x80.pl/articles/base64-simd-neon.html#encoding-quadwords, in section 6588 // titled "Base64 decoding". 6589 6590 // Non-SIMD lookup tables are mostly dumped from the fromBase64 array used in java.util.Base64, 6591 // except that the trailing character '=' is also treated as an illegal value in this intrinsic. That 6592 // is, java.util.Base64.fromBase64['='] = -2, while fromBase(URL)64ForNoSIMD['='] = 255 here.
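// As a rough illustration of how these 256-entry tables drive the scalar
// (non-SIMD) path further down, decoding one 4-character group in C looks
// approximately like this (variable names are illustrative only):
//
//   uint8_t b0 = codec[src[0]], b1 = codec[src[1]],
//           b2 = codec[src[2]], b3 = codec[src[3]];
//   if ((b0 | b1 | b2 | b3) & 0x80) goto exit;     // 255u marks an illegal character
//   dst[0] = (uint8_t)((b0 << 2) | (b1 >> 4));
//   dst[1] = (uint8_t)((b1 << 4) | (b2 >> 2));
//   dst[2] = (uint8_t)((b2 << 6) | b3);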
6593 static const uint8_t fromBase64ForNoSIMD[256] = { 6594 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6595 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6596 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 255u, 63u, 6597 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 6598 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u, 6599 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 255u, 6600 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u, 6601 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u, 6602 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6603 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6604 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6605 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6606 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6607 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6608 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6609 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6610 }; 6611 6612 static const uint8_t fromBase64URLForNoSIMD[256] = { 6613 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6614 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6615 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 6616 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 6617 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u, 6618 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 63u, 6619 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u, 6620 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u, 6621 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6622 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6623 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6624 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6625 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6626 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6627 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6628 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6629 }; 6630 6631 // A legal value of base64 code is in range [0, 127]. We need two lookups 6632 // with tbl/tbx and combine them to get the decode data. The 1st table vector 6633 // lookup use tbl, out of range indices are set to 0 in destination. 
The 2nd 6634 // table vector lookup use tbx, out of range indices are unchanged in 6635 // destination. Input [64..126] is mapped to index [65, 127] in second lookup. 6636 // The value of index 64 is set to 0, so that we know that we already get the 6637 // decoded data with the 1st lookup. 6638 static const uint8_t fromBase64ForSIMD[128] = { 6639 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6640 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6641 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 255u, 63u, 6642 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 6643 0u, 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 6644 14u, 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 6645 255u, 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 6646 40u, 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 6647 }; 6648 6649 static const uint8_t fromBase64URLForSIMD[128] = { 6650 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6651 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6652 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 6653 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 6654 0u, 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 6655 14u, 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 6656 63u, 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 6657 40u, 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 6658 }; 6659 6660 __ align(CodeEntryAlignment); 6661 StubCodeMark mark(this, "StubRoutines", "decodeBlock"); 6662 address start = __ pc(); 6663 6664 Register src = c_rarg0; // source array 6665 Register soff = c_rarg1; // source start offset 6666 Register send = c_rarg2; // source end offset 6667 Register dst = c_rarg3; // dest array 6668 Register doff = c_rarg4; // position for writing to dest array 6669 Register isURL = c_rarg5; // Base64 or URL character set 6670 Register isMIME = c_rarg6; // Decoding MIME block - unused in this implementation 6671 6672 Register length = send; // reuse send as length of source data to process 6673 6674 Register simd_codec = c_rarg6; 6675 Register nosimd_codec = c_rarg7; 6676 6677 Label ProcessData, Process64B, Process32B, Process4B, SIMDEnter, SIMDExit, Exit; 6678 6679 __ enter(); 6680 6681 __ add(src, src, soff); 6682 __ add(dst, dst, doff); 6683 6684 __ mov(doff, dst); 6685 6686 __ sub(length, send, soff); 6687 __ bfm(length, zr, 0, 1); 6688 6689 __ lea(nosimd_codec, ExternalAddress((address) fromBase64ForNoSIMD)); 6690 __ cbz(isURL, ProcessData); 6691 __ lea(nosimd_codec, ExternalAddress((address) fromBase64URLForNoSIMD)); 6692 6693 __ BIND(ProcessData); 6694 __ mov(rscratch1, length); 6695 __ cmp(length, (u1)144); // 144 = 80 + 64 6696 __ br(Assembler::LT, Process4B); 6697 6698 // In the MIME case, the line length cannot be more than 76 6699 // bytes (see RFC 2045). This is too short a block for SIMD 6700 // to be worthwhile, so we use non-SIMD here. 
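// A note on the constants here (deduced from the code below, spelled out for
// clarity): starting the counter at 79 makes the 4-byte scalar loop run twenty
// times (79, 75, ..., 3, then -1), i.e. it consumes exactly 80 input bytes; and
// since this path is only taken when length >= 144, at least 64 bytes are left
// over for the SIMD loop afterwards. This initial 80-byte scalar pass is the
// "PreProcess80B" referred to in a comment further down.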
6701 __ movw(rscratch1, 79); 6702 6703 __ BIND(Process4B); 6704 __ ldrw(r14, __ post(src, 4)); 6705 __ ubfxw(r10, r14, 0, 8); 6706 __ ubfxw(r11, r14, 8, 8); 6707 __ ubfxw(r12, r14, 16, 8); 6708 __ ubfxw(r13, r14, 24, 8); 6709 // get the de-code 6710 __ ldrb(r10, Address(nosimd_codec, r10, Address::uxtw(0))); 6711 __ ldrb(r11, Address(nosimd_codec, r11, Address::uxtw(0))); 6712 __ ldrb(r12, Address(nosimd_codec, r12, Address::uxtw(0))); 6713 __ ldrb(r13, Address(nosimd_codec, r13, Address::uxtw(0))); 6714 // error detection, 255u indicates an illegal input 6715 __ orrw(r14, r10, r11); 6716 __ orrw(r15, r12, r13); 6717 __ orrw(r14, r14, r15); 6718 __ tbnz(r14, 7, Exit); 6719 // recover the data 6720 __ lslw(r14, r10, 10); 6721 __ bfiw(r14, r11, 4, 6); 6722 __ bfmw(r14, r12, 2, 5); 6723 __ rev16w(r14, r14); 6724 __ bfiw(r13, r12, 6, 2); 6725 __ strh(r14, __ post(dst, 2)); 6726 __ strb(r13, __ post(dst, 1)); 6727 // non-simd loop 6728 __ subsw(rscratch1, rscratch1, 4); 6729 __ br(Assembler::GT, Process4B); 6730 6731 // if exiting from PreProcess80B, rscratch1 == -1; 6732 // otherwise, rscratch1 == 0. 6733 __ cbzw(rscratch1, Exit); 6734 __ sub(length, length, 80); 6735 6736 __ lea(simd_codec, ExternalAddress((address) fromBase64ForSIMD)); 6737 __ cbz(isURL, SIMDEnter); 6738 __ lea(simd_codec, ExternalAddress((address) fromBase64URLForSIMD)); 6739 6740 __ BIND(SIMDEnter); 6741 __ ld1(v0, v1, v2, v3, __ T16B, __ post(simd_codec, 64)); 6742 __ ld1(v4, v5, v6, v7, __ T16B, Address(simd_codec)); 6743 __ mov(rscratch1, 63); 6744 __ dup(v27, __ T16B, rscratch1); 6745 6746 __ BIND(Process64B); 6747 __ cmp(length, (u1)64); 6748 __ br(Assembler::LT, Process32B); 6749 generate_base64_decode_simdround(src, dst, v0, v4, 16, Exit); 6750 __ sub(length, length, 64); 6751 __ b(Process64B); 6752 6753 __ BIND(Process32B); 6754 __ cmp(length, (u1)32); 6755 __ br(Assembler::LT, SIMDExit); 6756 generate_base64_decode_simdround(src, dst, v0, v4, 8, Exit); 6757 __ sub(length, length, 32); 6758 __ b(Process32B); 6759 6760 __ BIND(SIMDExit); 6761 __ cbz(length, Exit); 6762 __ movw(rscratch1, length); 6763 __ b(Process4B); 6764 6765 __ BIND(Exit); 6766 __ sub(c_rarg0, dst, doff); 6767 6768 __ leave(); 6769 __ ret(lr); 6770 6771 return start; 6772 } 6773 6774 // Support for spin waits. 6775 address generate_spin_wait() { 6776 __ align(CodeEntryAlignment); 6777 StubCodeMark mark(this, "StubRoutines", "spin_wait"); 6778 address start = __ pc(); 6779 6780 __ spin_wait(); 6781 __ ret(lr); 6782 6783 return start; 6784 } 6785 6786 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS) 6787 6788 // ARMv8.1 LSE versions of the atomic stubs used by Atomic::PlatformXX. 6789 // 6790 // If LSE is in use, generate LSE versions of all the stubs. The 6791 // non-LSE versions are in atomic_aarch64.S. 6792 6793 // class AtomicStubMark records the entry point of a stub and the 6794 // stub pointer which will point to it. The stub pointer is set to 6795 // the entry point when ~AtomicStubMark() is called, which must be 6796 // after ICache::invalidate_range. This ensures safe publication of 6797 // the generated code. 
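// A minimal usage sketch (the real pattern is in generate_atomic_entry_points()
// below; repeated here only to make the destructor ordering explicit):
//
//   {
//     AtomicStubMark mark(_masm, &aarch64_atomic_fetch_add_4_impl);
//     gen_ldadd_entry(Assembler::word, memory_order_conservative);
//     ...
//     ICache::invalidate_range(first_entry, __ pc() - first_entry);
//   } // ~AtomicStubMark() runs here, publishing the entry point only after
//     // the instruction cache has been invalidated.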
6798 class AtomicStubMark { 6799 address _entry_point; 6800 aarch64_atomic_stub_t *_stub; 6801 MacroAssembler *_masm; 6802 public: 6803 AtomicStubMark(MacroAssembler *masm, aarch64_atomic_stub_t *stub) { 6804 _masm = masm; 6805 __ align(32); 6806 _entry_point = __ pc(); 6807 _stub = stub; 6808 } 6809 ~AtomicStubMark() { 6810 *_stub = (aarch64_atomic_stub_t)_entry_point; 6811 } 6812 }; 6813 6814 // NB: For memory_order_conservative we need a trailing membar after 6815 // LSE atomic operations but not a leading membar. 6816 // 6817 // We don't need a leading membar because a clause in the Arm ARM 6818 // says: 6819 // 6820 // Barrier-ordered-before 6821 // 6822 // Barrier instructions order prior Memory effects before subsequent 6823 // Memory effects generated by the same Observer. A read or a write 6824 // RW1 is Barrier-ordered-before a read or a write RW 2 from the same 6825 // Observer if and only if RW1 appears in program order before RW 2 6826 // and [ ... ] at least one of RW 1 and RW 2 is generated by an atomic 6827 // instruction with both Acquire and Release semantics. 6828 // 6829 // All the atomic instructions {ldaddal, swapal, casal} have Acquire 6830 // and Release semantics, therefore we don't need a leading 6831 // barrier. However, there is no corresponding Barrier-ordered-after 6832 // relationship, therefore we need a trailing membar to prevent a 6833 // later store or load from being reordered with the store in an 6834 // atomic instruction. 6835 // 6836 // This was checked by using the herd7 consistency model simulator 6837 // (http://diy.inria.fr/) with this test case: 6838 // 6839 // AArch64 LseCas 6840 // { 0:X1=x; 0:X2=y; 1:X1=x; 1:X2=y; } 6841 // P0 | P1; 6842 // LDR W4, [X2] | MOV W3, #0; 6843 // DMB LD | MOV W4, #1; 6844 // LDR W3, [X1] | CASAL W3, W4, [X1]; 6845 // | DMB ISH; 6846 // | STR W4, [X2]; 6847 // exists 6848 // (0:X3=0 /\ 0:X4=1) 6849 // 6850 // If X3 == 0 && X4 == 1, the store to y in P1 has been reordered 6851 // with the store to x in P1. Without the DMB in P1 this may happen. 6852 // 6853 // At the time of writing we don't know of any AArch64 hardware that 6854 // reorders stores in this way, but the Reference Manual permits it. 6855 6856 void gen_cas_entry(Assembler::operand_size size, 6857 atomic_memory_order order) { 6858 Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1, 6859 exchange_val = c_rarg2; 6860 bool acquire, release; 6861 switch (order) { 6862 case memory_order_relaxed: 6863 acquire = false; 6864 release = false; 6865 break; 6866 case memory_order_release: 6867 acquire = false; 6868 release = true; 6869 break; 6870 default: 6871 acquire = true; 6872 release = true; 6873 break; 6874 } 6875 __ mov(prev, compare_val); 6876 __ lse_cas(prev, exchange_val, ptr, size, acquire, release, /*not_pair*/true); 6877 if (order == memory_order_conservative) { 6878 __ membar(Assembler::StoreStore|Assembler::StoreLoad); 6879 } 6880 if (size == Assembler::xword) { 6881 __ mov(r0, prev); 6882 } else { 6883 __ movw(r0, prev); 6884 } 6885 __ ret(lr); 6886 } 6887 6888 void gen_ldadd_entry(Assembler::operand_size size, atomic_memory_order order) { 6889 Register prev = r2, addr = c_rarg0, incr = c_rarg1; 6890 // If not relaxed, then default to conservative. Relaxed is the only 6891 // case we use enough to be worth specializing. 
6892 if (order == memory_order_relaxed) { 6893 __ ldadd(size, incr, prev, addr); 6894 } else { 6895 __ ldaddal(size, incr, prev, addr); 6896 __ membar(Assembler::StoreStore|Assembler::StoreLoad); 6897 } 6898 if (size == Assembler::xword) { 6899 __ mov(r0, prev); 6900 } else { 6901 __ movw(r0, prev); 6902 } 6903 __ ret(lr); 6904 } 6905 6906 void gen_swpal_entry(Assembler::operand_size size) { 6907 Register prev = r2, addr = c_rarg0, incr = c_rarg1; 6908 __ swpal(size, incr, prev, addr); 6909 __ membar(Assembler::StoreStore|Assembler::StoreLoad); 6910 if (size == Assembler::xword) { 6911 __ mov(r0, prev); 6912 } else { 6913 __ movw(r0, prev); 6914 } 6915 __ ret(lr); 6916 } 6917 6918 void generate_atomic_entry_points() { 6919 if (! UseLSE) { 6920 return; 6921 } 6922 6923 __ align(CodeEntryAlignment); 6924 StubCodeMark mark(this, "StubRoutines", "atomic entry points"); 6925 address first_entry = __ pc(); 6926 6927 // ADD, memory_order_conservative 6928 AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl); 6929 gen_ldadd_entry(Assembler::word, memory_order_conservative); 6930 AtomicStubMark mark_fetch_add_8(_masm, &aarch64_atomic_fetch_add_8_impl); 6931 gen_ldadd_entry(Assembler::xword, memory_order_conservative); 6932 6933 // ADD, memory_order_relaxed 6934 AtomicStubMark mark_fetch_add_4_relaxed 6935 (_masm, &aarch64_atomic_fetch_add_4_relaxed_impl); 6936 gen_ldadd_entry(MacroAssembler::word, memory_order_relaxed); 6937 AtomicStubMark mark_fetch_add_8_relaxed 6938 (_masm, &aarch64_atomic_fetch_add_8_relaxed_impl); 6939 gen_ldadd_entry(MacroAssembler::xword, memory_order_relaxed); 6940 6941 // XCHG, memory_order_conservative 6942 AtomicStubMark mark_xchg_4(_masm, &aarch64_atomic_xchg_4_impl); 6943 gen_swpal_entry(Assembler::word); 6944 AtomicStubMark mark_xchg_8_impl(_masm, &aarch64_atomic_xchg_8_impl); 6945 gen_swpal_entry(Assembler::xword); 6946 6947 // CAS, memory_order_conservative 6948 AtomicStubMark mark_cmpxchg_1(_masm, &aarch64_atomic_cmpxchg_1_impl); 6949 gen_cas_entry(MacroAssembler::byte, memory_order_conservative); 6950 AtomicStubMark mark_cmpxchg_4(_masm, &aarch64_atomic_cmpxchg_4_impl); 6951 gen_cas_entry(MacroAssembler::word, memory_order_conservative); 6952 AtomicStubMark mark_cmpxchg_8(_masm, &aarch64_atomic_cmpxchg_8_impl); 6953 gen_cas_entry(MacroAssembler::xword, memory_order_conservative); 6954 6955 // CAS, memory_order_relaxed 6956 AtomicStubMark mark_cmpxchg_1_relaxed 6957 (_masm, &aarch64_atomic_cmpxchg_1_relaxed_impl); 6958 gen_cas_entry(MacroAssembler::byte, memory_order_relaxed); 6959 AtomicStubMark mark_cmpxchg_4_relaxed 6960 (_masm, &aarch64_atomic_cmpxchg_4_relaxed_impl); 6961 gen_cas_entry(MacroAssembler::word, memory_order_relaxed); 6962 AtomicStubMark mark_cmpxchg_8_relaxed 6963 (_masm, &aarch64_atomic_cmpxchg_8_relaxed_impl); 6964 gen_cas_entry(MacroAssembler::xword, memory_order_relaxed); 6965 6966 AtomicStubMark mark_cmpxchg_4_release 6967 (_masm, &aarch64_atomic_cmpxchg_4_release_impl); 6968 gen_cas_entry(MacroAssembler::word, memory_order_release); 6969 AtomicStubMark mark_cmpxchg_8_release 6970 (_masm, &aarch64_atomic_cmpxchg_8_release_impl); 6971 gen_cas_entry(MacroAssembler::xword, memory_order_release); 6972 6973 AtomicStubMark mark_cmpxchg_4_seq_cst 6974 (_masm, &aarch64_atomic_cmpxchg_4_seq_cst_impl); 6975 gen_cas_entry(MacroAssembler::word, memory_order_seq_cst); 6976 AtomicStubMark mark_cmpxchg_8_seq_cst 6977 (_masm, &aarch64_atomic_cmpxchg_8_seq_cst_impl); 6978 gen_cas_entry(MacroAssembler::xword, memory_order_seq_cst); 6979 
6980 ICache::invalidate_range(first_entry, __ pc() - first_entry); 6981 } 6982 #endif // LINUX 6983 6984 address generate_cont_thaw(Continuation::thaw_kind kind) { 6985 bool return_barrier = Continuation::is_thaw_return_barrier(kind); 6986 bool return_barrier_exception = Continuation::is_thaw_return_barrier_exception(kind); 6987 6988 address start = __ pc(); 6989 6990 if (return_barrier) { 6991 __ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())); 6992 __ mov(sp, rscratch1); 6993 } 6994 assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp"); 6995 6996 if (return_barrier) { 6997 // preserve possible return value from a method returning to the return barrier 6998 __ fmovd(rscratch1, v0); 6999 __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize))); 7000 } 7001 7002 __ movw(c_rarg1, (return_barrier ? 1 : 0)); 7003 __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), rthread, c_rarg1); 7004 __ mov(rscratch2, r0); // r0 contains the size of the frames to thaw, 0 if overflow or no more frames 7005 7006 if (return_barrier) { 7007 // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK) 7008 __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize))); 7009 __ fmovd(v0, rscratch1); 7010 } 7011 assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp"); 7012 7013 7014 Label thaw_success; 7015 // rscratch2 contains the size of the frames to thaw, 0 if overflow or no more frames 7016 __ cbnz(rscratch2, thaw_success); 7017 __ lea(rscratch1, ExternalAddress(StubRoutines::throw_StackOverflowError_entry())); 7018 __ br(rscratch1); 7019 __ bind(thaw_success); 7020 7021 // make room for the thawed frames 7022 __ sub(rscratch1, sp, rscratch2); 7023 __ andr(rscratch1, rscratch1, -16); // align 7024 __ mov(sp, rscratch1); 7025 7026 if (return_barrier) { 7027 // save original return value -- again 7028 __ fmovd(rscratch1, v0); 7029 __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize))); 7030 } 7031 7032 // If we want, we can templatize thaw by kind, and have three different entries 7033 __ movw(c_rarg1, (uint32_t)kind); 7034 7035 __ call_VM_leaf(Continuation::thaw_entry(), rthread, c_rarg1); 7036 __ mov(rscratch2, r0); // r0 is the sp of the yielding frame 7037 7038 if (return_barrier) { 7039 // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK) 7040 __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize))); 7041 __ fmovd(v0, rscratch1); 7042 } else { 7043 __ mov(r0, zr); // return 0 (success) from doYield 7044 } 7045 7046 // we're now on the yield frame (which is in an address above us b/c rsp has been pushed down) 7047 __ sub(sp, rscratch2, 2*wordSize); // now pointing to rfp spill 7048 __ mov(rfp, sp); 7049 7050 if (return_barrier_exception) { 7051 __ ldr(c_rarg1, Address(rfp, wordSize)); // return address 7052 __ authenticate_return_address(c_rarg1); 7053 __ verify_oop(r0); 7054 // save return value containing the exception oop in callee-saved R19 7055 __ mov(r19, r0); 7056 7057 __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), rthread, c_rarg1); 7058 7059 // Reinitialize the ptrue predicate register, in case the external runtime call clobbers ptrue reg, as we may return to SVE compiled code. 
7060 // __ reinitialize_ptrue(); 7061 7062 // see OptoRuntime::generate_exception_blob: r0 -- exception oop, r3 -- exception pc 7063 7064 __ mov(r1, r0); // the exception handler 7065 __ mov(r0, r19); // restore return value containing the exception oop 7066 __ verify_oop(r0); 7067 7068 __ leave(); 7069 __ mov(r3, lr); 7070 __ br(r1); // the exception handler 7071 } else { 7072 // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame 7073 __ leave(); 7074 __ ret(lr); 7075 } 7076 7077 return start; 7078 } 7079 7080 address generate_cont_thaw() { 7081 if (!Continuations::enabled()) return nullptr; 7082 7083 StubCodeMark mark(this, "StubRoutines", "Cont thaw"); 7084 address start = __ pc(); 7085 generate_cont_thaw(Continuation::thaw_top); 7086 return start; 7087 } 7088 7089 address generate_cont_returnBarrier() { 7090 if (!Continuations::enabled()) return nullptr; 7091 7092 // TODO: will probably need multiple return barriers depending on return type 7093 StubCodeMark mark(this, "StubRoutines", "cont return barrier"); 7094 address start = __ pc(); 7095 7096 generate_cont_thaw(Continuation::thaw_return_barrier); 7097 7098 return start; 7099 } 7100 7101 address generate_cont_returnBarrier_exception() { 7102 if (!Continuations::enabled()) return nullptr; 7103 7104 StubCodeMark mark(this, "StubRoutines", "cont return barrier exception handler"); 7105 address start = __ pc(); 7106 7107 generate_cont_thaw(Continuation::thaw_return_barrier_exception); 7108 7109 return start; 7110 } 7111 7112 // In sun.security.util.math.intpoly.IntegerPolynomial1305, integers 7113 // are represented as long[5], with BITS_PER_LIMB = 26. 7114 // Pack five 26-bit limbs into three 64-bit registers. 7115 void pack_26(Register dest0, Register dest1, Register dest2, Register src) { 7116 __ ldp(dest0, rscratch1, Address(src, 0)); // 26 bits 7117 __ add(dest0, dest0, rscratch1, Assembler::LSL, 26); // 26 bits 7118 __ ldp(rscratch1, rscratch2, Address(src, 2 * sizeof (jlong))); 7119 __ add(dest0, dest0, rscratch1, Assembler::LSL, 52); // 12 bits 7120 7121 __ add(dest1, zr, rscratch1, Assembler::LSR, 12); // 14 bits 7122 __ add(dest1, dest1, rscratch2, Assembler::LSL, 14); // 26 bits 7123 __ ldr(rscratch1, Address(src, 4 * sizeof (jlong))); 7124 __ add(dest1, dest1, rscratch1, Assembler::LSL, 40); // 24 bits 7125 7126 if (dest2->is_valid()) { 7127 __ add(dest2, zr, rscratch1, Assembler::LSR, 24); // 2 bits 7128 } else { 7129 #ifdef ASSERT 7130 Label OK; 7131 __ cmp(zr, rscratch1, Assembler::LSR, 24); // 2 bits 7132 __ br(__ EQ, OK); 7133 __ stop("high bits of Poly1305 integer should be zero"); 7134 __ should_not_reach_here(); 7135 __ bind(OK); 7136 #endif 7137 } 7138 } 7139 7140 // As above, but return only a 128-bit integer, packed into two 7141 // 64-bit registers. 7142 void pack_26(Register dest0, Register dest1, Register src) { 7143 pack_26(dest0, dest1, noreg, src); 7144 } 7145 7146 // Multiply and multiply-accumulate unsigned 64-bit registers. 7147 void wide_mul(Register prod_lo, Register prod_hi, Register n, Register m) { 7148 __ mul(prod_lo, n, m); 7149 __ umulh(prod_hi, n, m); 7150 } 7151 void wide_madd(Register sum_lo, Register sum_hi, Register n, Register m) { 7152 wide_mul(rscratch1, rscratch2, n, m); 7153 __ adds(sum_lo, sum_lo, rscratch1); 7154 __ adc(sum_hi, sum_hi, rscratch2); 7155 } 7156 7157 // Poly1305, RFC 7539 7158 7159 // See https://loup-vaillant.fr/tutorials/poly1305-design for a 7160 // description of the tricks used to simplify and accelerate this 7161 // computation. 
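// In C, the per-block update performed by the stub below is approximately
// (an illustrative sketch only; U is the accumulator and R the clamped key,
// both held in 64-bit limbs after the pack_26 calls above):
//
//   while (length >= 16) {
//     U += (next 16 bytes of input) + 2^128;   // the extra bit is the "+ 1" added into S_2
//     U  = (U * R) mod (2^130 - 5);            // wide_mul/wide_madd, then reduction
//     length -= 16;
//   }
//   // finally U is written back to acc_start as five 26-bit limbs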
7162 7163 address generate_poly1305_processBlocks() { 7164 __ align(CodeEntryAlignment); 7165 StubCodeMark mark(this, "StubRoutines", "poly1305_processBlocks"); 7166 address start = __ pc(); 7167 Label here; 7168 __ enter(); 7169 RegSet callee_saved = RegSet::range(r19, r28); 7170 __ push(callee_saved, sp); 7171 7172 RegSetIterator<Register> regs = (RegSet::range(c_rarg0, r28) - r18_tls - rscratch1 - rscratch2).begin(); 7173 7174 // Arguments 7175 const Register input_start = *regs, length = *++regs, acc_start = *++regs, r_start = *++regs; 7176 7177 // R_n is the 128-bit randomly-generated key, packed into two 7178 // registers. The caller passes this key to us as long[5], with 7179 // BITS_PER_LIMB = 26. 7180 const Register R_0 = *++regs, R_1 = *++regs; 7181 pack_26(R_0, R_1, r_start); 7182 7183 // RR_n is (R_n >> 2) * 5 7184 const Register RR_0 = *++regs, RR_1 = *++regs; 7185 __ lsr(RR_0, R_0, 2); 7186 __ add(RR_0, RR_0, RR_0, Assembler::LSL, 2); 7187 __ lsr(RR_1, R_1, 2); 7188 __ add(RR_1, RR_1, RR_1, Assembler::LSL, 2); 7189 7190 // U_n is the current checksum 7191 const Register U_0 = *++regs, U_1 = *++regs, U_2 = *++regs; 7192 pack_26(U_0, U_1, U_2, acc_start); 7193 7194 static constexpr int BLOCK_LENGTH = 16; 7195 Label DONE, LOOP; 7196 7197 __ cmp(length, checked_cast<u1>(BLOCK_LENGTH)); 7198 __ br(Assembler::LT, DONE); { 7199 __ bind(LOOP); 7200 7201 // S_n is to be the sum of U_n and the next block of data 7202 const Register S_0 = *++regs, S_1 = *++regs, S_2 = *++regs; 7203 __ ldp(S_0, S_1, __ post(input_start, 2 * wordSize)); 7204 __ adds(S_0, U_0, S_0); 7205 __ adcs(S_1, U_1, S_1); 7206 __ adc(S_2, U_2, zr); 7207 __ add(S_2, S_2, 1); 7208 7209 const Register U_0HI = *++regs, U_1HI = *++regs; 7210 7211 // NB: this logic depends on some of the special properties of 7212 // Poly1305 keys. In particular, because we know that the top 7213 // four bits of R_0 and R_1 are zero, we can add together 7214 // partial products without any risk of needing to propagate a 7215 // carry out. 7216 wide_mul(U_0, U_0HI, S_0, R_0); wide_madd(U_0, U_0HI, S_1, RR_1); wide_madd(U_0, U_0HI, S_2, RR_0); 7217 wide_mul(U_1, U_1HI, S_0, R_1); wide_madd(U_1, U_1HI, S_1, R_0); wide_madd(U_1, U_1HI, S_2, RR_1); 7218 __ andr(U_2, R_0, 3); 7219 __ mul(U_2, S_2, U_2); 7220 7221 // Recycle registers S_0, S_1, S_2 7222 regs = (regs.remaining() + S_0 + S_1 + S_2).begin(); 7223 7224 // Partial reduction mod 2**130 - 5 7225 __ adds(U_1, U_0HI, U_1); 7226 __ adc(U_2, U_1HI, U_2); 7227 // Sum now in U_2:U_1:U_0. 7228 // Dead: U_0HI, U_1HI. 
7229 regs = (regs.remaining() + U_0HI + U_1HI).begin(); 7230 7231 // U_2:U_1:U_0 += (U_2 >> 2) * 5 in two steps 7232 7233 // First, U_2:U_1:U_0 += (U_2 >> 2) 7234 __ lsr(rscratch1, U_2, 2); 7235 __ andr(U_2, U_2, (u8)3); 7236 __ adds(U_0, U_0, rscratch1); 7237 __ adcs(U_1, U_1, zr); 7238 __ adc(U_2, U_2, zr); 7239 // Second, U_2:U_1:U_0 += (U_2 >> 2) << 2 7240 __ adds(U_0, U_0, rscratch1, Assembler::LSL, 2); 7241 __ adcs(U_1, U_1, zr); 7242 __ adc(U_2, U_2, zr); 7243 7244 __ sub(length, length, checked_cast<u1>(BLOCK_LENGTH)); 7245 __ cmp(length, checked_cast<u1>(BLOCK_LENGTH)); 7246 __ br(~ Assembler::LT, LOOP); 7247 } 7248 7249 // Further reduce modulo 2^130 - 5 7250 __ lsr(rscratch1, U_2, 2); 7251 __ add(rscratch1, rscratch1, rscratch1, Assembler::LSL, 2); // rscratch1 = U_2 * 5 7252 __ adds(U_0, U_0, rscratch1); // U_0 += U_2 * 5 7253 __ adcs(U_1, U_1, zr); 7254 __ andr(U_2, U_2, (u1)3); 7255 __ adc(U_2, U_2, zr); 7256 7257 // Unpack the sum into five 26-bit limbs and write to memory. 7258 __ ubfiz(rscratch1, U_0, 0, 26); 7259 __ ubfx(rscratch2, U_0, 26, 26); 7260 __ stp(rscratch1, rscratch2, Address(acc_start)); 7261 __ ubfx(rscratch1, U_0, 52, 12); 7262 __ bfi(rscratch1, U_1, 12, 14); 7263 __ ubfx(rscratch2, U_1, 14, 26); 7264 __ stp(rscratch1, rscratch2, Address(acc_start, 2 * sizeof (jlong))); 7265 __ ubfx(rscratch1, U_1, 40, 24); 7266 __ bfi(rscratch1, U_2, 24, 3); 7267 __ str(rscratch1, Address(acc_start, 4 * sizeof (jlong))); 7268 7269 __ bind(DONE); 7270 __ pop(callee_saved, sp); 7271 __ leave(); 7272 __ ret(lr); 7273 7274 return start; 7275 } 7276 7277 #if INCLUDE_JFR 7278 7279 static void jfr_prologue(address the_pc, MacroAssembler* _masm, Register thread) { 7280 __ set_last_Java_frame(sp, rfp, the_pc, rscratch1); 7281 __ mov(c_rarg0, thread); 7282 } 7283 7284 // The handle is dereferenced through a load barrier. 7285 static void jfr_epilogue(MacroAssembler* _masm) { 7286 __ reset_last_Java_frame(true); 7287 } 7288 7289 // For c2: c_rarg0 is junk, call to runtime to write a checkpoint. 7290 // It returns a jobject handle to the event writer. 7291 // The handle is dereferenced and the return value is the event writer oop. 7292 static RuntimeStub* generate_jfr_write_checkpoint() { 7293 enum layout { 7294 rbp_off, 7295 rbpH_off, 7296 return_off, 7297 return_off2, 7298 framesize // inclusive of return address 7299 }; 7300 7301 int insts_size = 1024; 7302 int locs_size = 64; 7303 CodeBuffer code("jfr_write_checkpoint", insts_size, locs_size); 7304 OopMapSet* oop_maps = new OopMapSet(); 7305 MacroAssembler* masm = new MacroAssembler(&code); 7306 MacroAssembler* _masm = masm; 7307 7308 address start = __ pc(); 7309 __ enter(); 7310 int frame_complete = __ pc() - start; 7311 address the_pc = __ pc(); 7312 jfr_prologue(the_pc, _masm, rthread); 7313 __ call_VM_leaf(CAST_FROM_FN_PTR(address, JfrIntrinsicSupport::write_checkpoint), 1); 7314 jfr_epilogue(_masm); 7315 __ resolve_global_jobject(r0, rscratch1, rscratch2); 7316 __ leave(); 7317 __ ret(lr); 7318 7319 OopMap* map = new OopMap(framesize, 1); // rfp 7320 oop_maps->add_gc_map(the_pc - start, map); 7321 7322 RuntimeStub* stub = // codeBlob framesize is in words (not VMRegImpl::slot_size) 7323 RuntimeStub::new_runtime_stub("jfr_write_checkpoint", &code, frame_complete, 7324 (framesize >> (LogBytesPerWord - LogBytesPerInt)), 7325 oop_maps, false); 7326 return stub; 7327 } 7328 7329 // For c2: call to return a leased buffer. 
7330 static RuntimeStub* generate_jfr_return_lease() { 7331 enum layout { 7332 rbp_off, 7333 rbpH_off, 7334 return_off, 7335 return_off2, 7336 framesize // inclusive of return address 7337 }; 7338 7339 int insts_size = 1024; 7340 int locs_size = 64; 7341 CodeBuffer code("jfr_return_lease", insts_size, locs_size); 7342 OopMapSet* oop_maps = new OopMapSet(); 7343 MacroAssembler* masm = new MacroAssembler(&code); 7344 MacroAssembler* _masm = masm; 7345 7346 address start = __ pc(); 7347 __ enter(); 7348 int frame_complete = __ pc() - start; 7349 address the_pc = __ pc(); 7350 jfr_prologue(the_pc, _masm, rthread); 7351 __ call_VM_leaf(CAST_FROM_FN_PTR(address, JfrIntrinsicSupport::return_lease), 1); 7352 jfr_epilogue(_masm); 7353 7354 __ leave(); 7355 __ ret(lr); 7356 7357 OopMap* map = new OopMap(framesize, 1); // rfp 7358 oop_maps->add_gc_map(the_pc - start, map); 7359 7360 RuntimeStub* stub = // codeBlob framesize is in words (not VMRegImpl::slot_size) 7361 RuntimeStub::new_runtime_stub("jfr_return_lease", &code, frame_complete, 7362 (framesize >> (LogBytesPerWord - LogBytesPerInt)), 7363 oop_maps, false); 7364 return stub; 7365 } 7366 7367 #endif // INCLUDE_JFR 7368 7369 // exception handler for upcall stubs 7370 address generate_upcall_stub_exception_handler() { 7371 StubCodeMark mark(this, "StubRoutines", "upcall stub exception handler"); 7372 address start = __ pc(); 7373 7374 // Native caller has no idea how to handle exceptions, 7375 // so we just crash here. Up to callee to catch exceptions. 7376 __ verify_oop(r0); 7377 __ movptr(rscratch1, CAST_FROM_FN_PTR(uint64_t, UpcallLinker::handle_uncaught_exception)); 7378 __ blr(rscratch1); 7379 __ should_not_reach_here(); 7380 7381 return start; 7382 } 7383 7384 // Continuation point for throwing of implicit exceptions that are 7385 // not handled in the current activation. Fabricates an exception 7386 // oop and initiates normal exception dispatching in this 7387 // frame. Since we need to preserve callee-saved values (currently 7388 // only for C2, but done for C1 as well) we need a callee-saved oop 7389 // map and therefore have to make these stubs into RuntimeStubs 7390 // rather than BufferBlobs. If the compiler needs all registers to 7391 // be preserved between the fault point and the exception handler 7392 // then it must assume responsibility for that in 7393 // AbstractCompiler::continuation_for_implicit_null_exception or 7394 // continuation_for_implicit_division_by_zero_exception. All other 7395 // implicit exceptions (e.g., NullPointerException or 7396 // AbstractMethodError on entry) are either at call sites or 7397 // otherwise assume that stack unwinding will be initiated, so 7398 // caller saved registers were assumed volatile in the compiler. 7399 7400 #undef __ 7401 #define __ masm-> 7402 7403 address generate_throw_exception(const char* name, 7404 address runtime_entry, 7405 Register arg1 = noreg, 7406 Register arg2 = noreg) { 7407 // Information about frame layout at time of blocking runtime call. 7408 // Note that we only have to preserve callee-saved registers since 7409 // the compilers are responsible for supplying a continuation point 7410 // if they expect all registers to be preserved. 7411 // n.b. 
aarch64 asserts that frame::arg_reg_save_area_bytes == 0 7412 enum layout { 7413 rfp_off = 0, 7414 rfp_off2, 7415 return_off, 7416 return_off2, 7417 framesize // inclusive of return address 7418 }; 7419 7420 int insts_size = 512; 7421 int locs_size = 64; 7422 7423 CodeBuffer code(name, insts_size, locs_size); 7424 OopMapSet* oop_maps = new OopMapSet(); 7425 MacroAssembler* masm = new MacroAssembler(&code); 7426 7427 address start = __ pc(); 7428 7429 // This is an inlined and slightly modified version of call_VM 7430 // which has the ability to fetch the return PC out of 7431 // thread-local storage and also sets up last_Java_sp slightly 7432 // differently than the real call_VM 7433 7434 __ enter(); // Save FP and LR before call 7435 7436 assert(is_even(framesize/2), "sp not 16-byte aligned"); 7437 7438 // lr and fp are already in place 7439 __ sub(sp, rfp, ((uint64_t)framesize-4) << LogBytesPerInt); // prolog 7440 7441 int frame_complete = __ pc() - start; 7442 7443 // Set up last_Java_sp and last_Java_fp 7444 address the_pc = __ pc(); 7445 __ set_last_Java_frame(sp, rfp, the_pc, rscratch1); 7446 7447 // Call runtime 7448 if (arg1 != noreg) { 7449 assert(arg2 != c_rarg1, "clobbered"); 7450 __ mov(c_rarg1, arg1); 7451 } 7452 if (arg2 != noreg) { 7453 __ mov(c_rarg2, arg2); 7454 } 7455 __ mov(c_rarg0, rthread); 7456 BLOCK_COMMENT("call runtime_entry"); 7457 __ mov(rscratch1, runtime_entry); 7458 __ blr(rscratch1); 7459 7460 // Generate oop map 7461 OopMap* map = new OopMap(framesize, 0); 7462 7463 oop_maps->add_gc_map(the_pc - start, map); 7464 7465 __ reset_last_Java_frame(true); 7466 7467 // Reinitialize the ptrue predicate register, in case the external runtime 7468 // call clobbers ptrue reg, as we may return to SVE compiled code. 7469 __ reinitialize_ptrue(); 7470 7471 __ leave(); 7472 7473 // check for pending exceptions 7474 #ifdef ASSERT 7475 Label L; 7476 __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset())); 7477 __ cbnz(rscratch1, L); 7478 __ should_not_reach_here(); 7479 __ bind(L); 7480 #endif // ASSERT 7481 __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry())); 7482 7483 // codeBlob framesize is in words (not VMRegImpl::slot_size) 7484 RuntimeStub* stub = 7485 RuntimeStub::new_runtime_stub(name, 7486 &code, 7487 frame_complete, 7488 (framesize >> (LogBytesPerWord - LogBytesPerInt)), 7489 oop_maps, false); 7490 return stub->entry_point(); 7491 } 7492 7493 class MontgomeryMultiplyGenerator : public MacroAssembler { 7494 7495 Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn, 7496 Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj; 7497 7498 RegSet _toSave; 7499 bool _squaring; 7500 7501 public: 7502 MontgomeryMultiplyGenerator (Assembler *as, bool squaring) 7503 : MacroAssembler(as->code()), _squaring(squaring) { 7504 7505 // Register allocation 7506 7507 RegSetIterator<Register> regs = (RegSet::range(r0, r26) - r18_tls).begin(); 7508 Pa_base = *regs; // Argument registers 7509 if (squaring) 7510 Pb_base = Pa_base; 7511 else 7512 Pb_base = *++regs; 7513 Pn_base = *++regs; 7514 Rlen= *++regs; 7515 inv = *++regs; 7516 Pm_base = *++regs; 7517 7518 // Working registers: 7519 Ra = *++regs; // The current digit of a, b, n, and m. 7520 Rb = *++regs; 7521 Rm = *++regs; 7522 Rn = *++regs; 7523 7524 Pa = *++regs; // Pointers to the current/next digit of a, b, n, and m. 
7525 Pb = *++regs; 7526 Pm = *++regs; 7527 Pn = *++regs; 7528 7529 t0 = *++regs; // Three registers which form a 7530 t1 = *++regs; // triple-precision accumuator. 7531 t2 = *++regs; 7532 7533 Ri = *++regs; // Inner and outer loop indexes. 7534 Rj = *++regs; 7535 7536 Rhi_ab = *++regs; // Product registers: low and high parts 7537 Rlo_ab = *++regs; // of a*b and m*n. 7538 Rhi_mn = *++regs; 7539 Rlo_mn = *++regs; 7540 7541 // r19 and up are callee-saved. 7542 _toSave = RegSet::range(r19, *regs) + Pm_base; 7543 } 7544 7545 private: 7546 void save_regs() { 7547 push(_toSave, sp); 7548 } 7549 7550 void restore_regs() { 7551 pop(_toSave, sp); 7552 } 7553 7554 template <typename T> 7555 void unroll_2(Register count, T block) { 7556 Label loop, end, odd; 7557 tbnz(count, 0, odd); 7558 cbz(count, end); 7559 align(16); 7560 bind(loop); 7561 (this->*block)(); 7562 bind(odd); 7563 (this->*block)(); 7564 subs(count, count, 2); 7565 br(Assembler::GT, loop); 7566 bind(end); 7567 } 7568 7569 template <typename T> 7570 void unroll_2(Register count, T block, Register d, Register s, Register tmp) { 7571 Label loop, end, odd; 7572 tbnz(count, 0, odd); 7573 cbz(count, end); 7574 align(16); 7575 bind(loop); 7576 (this->*block)(d, s, tmp); 7577 bind(odd); 7578 (this->*block)(d, s, tmp); 7579 subs(count, count, 2); 7580 br(Assembler::GT, loop); 7581 bind(end); 7582 } 7583 7584 void pre1(RegisterOrConstant i) { 7585 block_comment("pre1"); 7586 // Pa = Pa_base; 7587 // Pb = Pb_base + i; 7588 // Pm = Pm_base; 7589 // Pn = Pn_base + i; 7590 // Ra = *Pa; 7591 // Rb = *Pb; 7592 // Rm = *Pm; 7593 // Rn = *Pn; 7594 ldr(Ra, Address(Pa_base)); 7595 ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); 7596 ldr(Rm, Address(Pm_base)); 7597 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 7598 lea(Pa, Address(Pa_base)); 7599 lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); 7600 lea(Pm, Address(Pm_base)); 7601 lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 7602 7603 // Zero the m*n result. 7604 mov(Rhi_mn, zr); 7605 mov(Rlo_mn, zr); 7606 } 7607 7608 // The core multiply-accumulate step of a Montgomery 7609 // multiplication. The idea is to schedule operations as a 7610 // pipeline so that instructions with long latencies (loads and 7611 // multiplies) have time to complete before their results are 7612 // used. This most benefits in-order implementations of the 7613 // architecture but out-of-order ones also benefit. 7614 void step() { 7615 block_comment("step"); 7616 // MACC(Ra, Rb, t0, t1, t2); 7617 // Ra = *++Pa; 7618 // Rb = *--Pb; 7619 umulh(Rhi_ab, Ra, Rb); 7620 mul(Rlo_ab, Ra, Rb); 7621 ldr(Ra, pre(Pa, wordSize)); 7622 ldr(Rb, pre(Pb, -wordSize)); 7623 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the 7624 // previous iteration. 
7625 // MACC(Rm, Rn, t0, t1, t2); 7626 // Rm = *++Pm; 7627 // Rn = *--Pn; 7628 umulh(Rhi_mn, Rm, Rn); 7629 mul(Rlo_mn, Rm, Rn); 7630 ldr(Rm, pre(Pm, wordSize)); 7631 ldr(Rn, pre(Pn, -wordSize)); 7632 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 7633 } 7634 7635 void post1() { 7636 block_comment("post1"); 7637 7638 // MACC(Ra, Rb, t0, t1, t2); 7639 // Ra = *++Pa; 7640 // Rb = *--Pb; 7641 umulh(Rhi_ab, Ra, Rb); 7642 mul(Rlo_ab, Ra, Rb); 7643 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 7644 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 7645 7646 // *Pm = Rm = t0 * inv; 7647 mul(Rm, t0, inv); 7648 str(Rm, Address(Pm)); 7649 7650 // MACC(Rm, Rn, t0, t1, t2); 7651 // t0 = t1; t1 = t2; t2 = 0; 7652 umulh(Rhi_mn, Rm, Rn); 7653 7654 #ifndef PRODUCT 7655 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 7656 { 7657 mul(Rlo_mn, Rm, Rn); 7658 add(Rlo_mn, t0, Rlo_mn); 7659 Label ok; 7660 cbz(Rlo_mn, ok); { 7661 stop("broken Montgomery multiply"); 7662 } bind(ok); 7663 } 7664 #endif 7665 // We have very carefully set things up so that 7666 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 7667 // the lower half of Rm * Rn because we know the result already: 7668 // it must be -t0. t0 + (-t0) must generate a carry iff 7669 // t0 != 0. So, rather than do a mul and an adds we just set 7670 // the carry flag iff t0 is nonzero. 7671 // 7672 // mul(Rlo_mn, Rm, Rn); 7673 // adds(zr, t0, Rlo_mn); 7674 subs(zr, t0, 1); // Set carry iff t0 is nonzero 7675 adcs(t0, t1, Rhi_mn); 7676 adc(t1, t2, zr); 7677 mov(t2, zr); 7678 } 7679 7680 void pre2(RegisterOrConstant i, RegisterOrConstant len) { 7681 block_comment("pre2"); 7682 // Pa = Pa_base + i-len; 7683 // Pb = Pb_base + len; 7684 // Pm = Pm_base + i-len; 7685 // Pn = Pn_base + len; 7686 7687 if (i.is_register()) { 7688 sub(Rj, i.as_register(), len); 7689 } else { 7690 mov(Rj, i.as_constant()); 7691 sub(Rj, Rj, len); 7692 } 7693 // Rj == i-len 7694 7695 lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord))); 7696 lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord))); 7697 lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 7698 lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord))); 7699 7700 // Ra = *++Pa; 7701 // Rb = *--Pb; 7702 // Rm = *++Pm; 7703 // Rn = *--Pn; 7704 ldr(Ra, pre(Pa, wordSize)); 7705 ldr(Rb, pre(Pb, -wordSize)); 7706 ldr(Rm, pre(Pm, wordSize)); 7707 ldr(Rn, pre(Pn, -wordSize)); 7708 7709 mov(Rhi_mn, zr); 7710 mov(Rlo_mn, zr); 7711 } 7712 7713 void post2(RegisterOrConstant i, RegisterOrConstant len) { 7714 block_comment("post2"); 7715 if (i.is_constant()) { 7716 mov(Rj, i.as_constant()-len.as_constant()); 7717 } else { 7718 sub(Rj, i.as_register(), len); 7719 } 7720 7721 adds(t0, t0, Rlo_mn); // The pending m*n, low part 7722 7723 // As soon as we know the least significant digit of our result, 7724 // store it. 7725 // Pm_base[i-len] = t0; 7726 str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 7727 7728 // t0 = t1; t1 = t2; t2 = 0; 7729 adcs(t0, t1, Rhi_mn); // The pending m*n, high part 7730 adc(t1, t2, zr); 7731 mov(t2, zr); 7732 } 7733 7734 // A carry in t0 after Montgomery multiplication means that we 7735 // should subtract multiples of n from our result in m. We'll 7736 // keep doing that until there is no carry. 
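// In pseudo-C, approximately (an illustration; subtract_with_borrow is just
// shorthand for the sbcs chain below; the value accumulated so far is
// t0 * 2^(64*len) + m, and each pass of the outer loop subtracts the modulus
// n from it once):
//
//   while (t0 != 0) {
//     borrow = 0;
//     for (i = 0; i < len; i++)
//       m[i] = subtract_with_borrow(m[i], n[i], &borrow);
//     t0 -= borrow;               // the sbc(t0, t0, zr) in the code below
//   }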
7737 void normalize(RegisterOrConstant len) { 7738 block_comment("normalize"); 7739 // while (t0) 7740 // t0 = sub(Pm_base, Pn_base, t0, len); 7741 Label loop, post, again; 7742 Register cnt = t1, i = t2; // Re-use registers; we're done with them now 7743 cbz(t0, post); { 7744 bind(again); { 7745 mov(i, zr); 7746 mov(cnt, len); 7747 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 7748 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 7749 subs(zr, zr, zr); // set carry flag, i.e. no borrow 7750 align(16); 7751 bind(loop); { 7752 sbcs(Rm, Rm, Rn); 7753 str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 7754 add(i, i, 1); 7755 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 7756 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 7757 sub(cnt, cnt, 1); 7758 } cbnz(cnt, loop); 7759 sbc(t0, t0, zr); 7760 } cbnz(t0, again); 7761 } bind(post); 7762 } 7763 7764 // Move memory at s to d, reversing words. 7765 // Increments d to end of copied memory 7766 // Destroys tmp1, tmp2 7767 // Preserves len 7768 // Leaves s pointing to the address which was in d at start 7769 void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) { 7770 assert(tmp1->encoding() < r19->encoding(), "register corruption"); 7771 assert(tmp2->encoding() < r19->encoding(), "register corruption"); 7772 7773 lea(s, Address(s, len, Address::uxtw(LogBytesPerWord))); 7774 mov(tmp1, len); 7775 unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2); 7776 sub(s, d, len, ext::uxtw, LogBytesPerWord); 7777 } 7778 // where 7779 void reverse1(Register d, Register s, Register tmp) { 7780 ldr(tmp, pre(s, -wordSize)); 7781 ror(tmp, tmp, 32); 7782 str(tmp, post(d, wordSize)); 7783 } 7784 7785 void step_squaring() { 7786 // An extra ACC 7787 step(); 7788 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 7789 } 7790 7791 void last_squaring(RegisterOrConstant i) { 7792 Label dont; 7793 // if ((i & 1) == 0) { 7794 tbnz(i.as_register(), 0, dont); { 7795 // MACC(Ra, Rb, t0, t1, t2); 7796 // Ra = *++Pa; 7797 // Rb = *--Pb; 7798 umulh(Rhi_ab, Ra, Rb); 7799 mul(Rlo_ab, Ra, Rb); 7800 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 7801 } bind(dont); 7802 } 7803 7804 void extra_step_squaring() { 7805 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 7806 7807 // MACC(Rm, Rn, t0, t1, t2); 7808 // Rm = *++Pm; 7809 // Rn = *--Pn; 7810 umulh(Rhi_mn, Rm, Rn); 7811 mul(Rlo_mn, Rm, Rn); 7812 ldr(Rm, pre(Pm, wordSize)); 7813 ldr(Rn, pre(Pn, -wordSize)); 7814 } 7815 7816 void post1_squaring() { 7817 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 7818 7819 // *Pm = Rm = t0 * inv; 7820 mul(Rm, t0, inv); 7821 str(Rm, Address(Pm)); 7822 7823 // MACC(Rm, Rn, t0, t1, t2); 7824 // t0 = t1; t1 = t2; t2 = 0; 7825 umulh(Rhi_mn, Rm, Rn); 7826 7827 #ifndef PRODUCT 7828 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 7829 { 7830 mul(Rlo_mn, Rm, Rn); 7831 add(Rlo_mn, t0, Rlo_mn); 7832 Label ok; 7833 cbz(Rlo_mn, ok); { 7834 stop("broken Montgomery multiply"); 7835 } bind(ok); 7836 } 7837 #endif 7838 // We have very carefully set things up so that 7839 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 7840 // the lower half of Rm * Rn because we know the result already: 7841 // it must be -t0. t0 + (-t0) must generate a carry iff 7842 // t0 != 0. So, rather than do a mul and an adds we just set 7843 // the carry flag iff t0 is nonzero. 
7844 // 7845 // mul(Rlo_mn, Rm, Rn); 7846 // adds(zr, t0, Rlo_mn); 7847 subs(zr, t0, 1); // Set carry iff t0 is nonzero 7848 adcs(t0, t1, Rhi_mn); 7849 adc(t1, t2, zr); 7850 mov(t2, zr); 7851 } 7852 7853 void acc(Register Rhi, Register Rlo, 7854 Register t0, Register t1, Register t2) { 7855 adds(t0, t0, Rlo); 7856 adcs(t1, t1, Rhi); 7857 adc(t2, t2, zr); 7858 } 7859 7860 public: 7861 /** 7862 * Fast Montgomery multiplication. The derivation of the 7863 * algorithm is in A Cryptographic Library for the Motorola 7864 * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237. 7865 * 7866 * Arguments: 7867 * 7868 * Inputs for multiplication: 7869 * c_rarg0 - int array elements a 7870 * c_rarg1 - int array elements b 7871 * c_rarg2 - int array elements n (the modulus) 7872 * c_rarg3 - int length 7873 * c_rarg4 - int inv 7874 * c_rarg5 - int array elements m (the result) 7875 * 7876 * Inputs for squaring: 7877 * c_rarg0 - int array elements a 7878 * c_rarg1 - int array elements n (the modulus) 7879 * c_rarg2 - int length 7880 * c_rarg3 - int inv 7881 * c_rarg4 - int array elements m (the result) 7882 * 7883 */ 7884 address generate_multiply() { 7885 Label argh, nothing; 7886 bind(argh); 7887 stop("MontgomeryMultiply total_allocation must be <= 8192"); 7888 7889 align(CodeEntryAlignment); 7890 address entry = pc(); 7891 7892 cbzw(Rlen, nothing); 7893 7894 enter(); 7895 7896 // Make room. 7897 cmpw(Rlen, 512); 7898 br(Assembler::HI, argh); 7899 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 7900 andr(sp, Ra, -2 * wordSize); 7901 7902 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 7903 7904 { 7905 // Copy input args, reversing as we go. We use Ra as a 7906 // temporary variable. 7907 reverse(Ra, Pa_base, Rlen, t0, t1); 7908 if (!_squaring) 7909 reverse(Ra, Pb_base, Rlen, t0, t1); 7910 reverse(Ra, Pn_base, Rlen, t0, t1); 7911 } 7912 7913 // Push all call-saved registers and also Pm_base which we'll need 7914 // at the end. 
7915 save_regs(); 7916 7917 #ifndef PRODUCT 7918 // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply"); 7919 { 7920 ldr(Rn, Address(Pn_base, 0)); 7921 mul(Rlo_mn, Rn, inv); 7922 subs(zr, Rlo_mn, -1); 7923 Label ok; 7924 br(EQ, ok); { 7925 stop("broken inverse in Montgomery multiply"); 7926 } bind(ok); 7927 } 7928 #endif 7929 7930 mov(Pm_base, Ra); 7931 7932 mov(t0, zr); 7933 mov(t1, zr); 7934 mov(t2, zr); 7935 7936 block_comment("for (int i = 0; i < len; i++) {"); 7937 mov(Ri, zr); { 7938 Label loop, end; 7939 cmpw(Ri, Rlen); 7940 br(Assembler::GE, end); 7941 7942 bind(loop); 7943 pre1(Ri); 7944 7945 block_comment(" for (j = i; j; j--) {"); { 7946 movw(Rj, Ri); 7947 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 7948 } block_comment(" } // j"); 7949 7950 post1(); 7951 addw(Ri, Ri, 1); 7952 cmpw(Ri, Rlen); 7953 br(Assembler::LT, loop); 7954 bind(end); 7955 block_comment("} // i"); 7956 } 7957 7958 block_comment("for (int i = len; i < 2*len; i++) {"); 7959 mov(Ri, Rlen); { 7960 Label loop, end; 7961 cmpw(Ri, Rlen, Assembler::LSL, 1); 7962 br(Assembler::GE, end); 7963 7964 bind(loop); 7965 pre2(Ri, Rlen); 7966 7967 block_comment(" for (j = len*2-i-1; j; j--) {"); { 7968 lslw(Rj, Rlen, 1); 7969 subw(Rj, Rj, Ri); 7970 subw(Rj, Rj, 1); 7971 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 7972 } block_comment(" } // j"); 7973 7974 post2(Ri, Rlen); 7975 addw(Ri, Ri, 1); 7976 cmpw(Ri, Rlen, Assembler::LSL, 1); 7977 br(Assembler::LT, loop); 7978 bind(end); 7979 } 7980 block_comment("} // i"); 7981 7982 normalize(Rlen); 7983 7984 mov(Ra, Pm_base); // Save Pm_base in Ra 7985 restore_regs(); // Restore caller's Pm_base 7986 7987 // Copy our result into caller's Pm_base 7988 reverse(Pm_base, Ra, Rlen, t0, t1); 7989 7990 leave(); 7991 bind(nothing); 7992 ret(lr); 7993 7994 return entry; 7995 } 7996 // In C, approximately: 7997 7998 // void 7999 // montgomery_multiply(julong Pa_base[], julong Pb_base[], 8000 // julong Pn_base[], julong Pm_base[], 8001 // julong inv, int len) { 8002 // julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 8003 // julong *Pa, *Pb, *Pn, *Pm; 8004 // julong Ra, Rb, Rn, Rm; 8005 8006 // int i; 8007 8008 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply"); 8009 8010 // for (i = 0; i < len; i++) { 8011 // int j; 8012 8013 // Pa = Pa_base; 8014 // Pb = Pb_base + i; 8015 // Pm = Pm_base; 8016 // Pn = Pn_base + i; 8017 8018 // Ra = *Pa; 8019 // Rb = *Pb; 8020 // Rm = *Pm; 8021 // Rn = *Pn; 8022 8023 // int iters = i; 8024 // for (j = 0; iters--; j++) { 8025 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 8026 // MACC(Ra, Rb, t0, t1, t2); 8027 // Ra = *++Pa; 8028 // Rb = *--Pb; 8029 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 8030 // MACC(Rm, Rn, t0, t1, t2); 8031 // Rm = *++Pm; 8032 // Rn = *--Pn; 8033 // } 8034 8035 // assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be"); 8036 // MACC(Ra, Rb, t0, t1, t2); 8037 // *Pm = Rm = t0 * inv; 8038 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be"); 8039 // MACC(Rm, Rn, t0, t1, t2); 8040 8041 // assert(t0 == 0, "broken Montgomery multiply"); 8042 8043 // t0 = t1; t1 = t2; t2 = 0; 8044 // } 8045 8046 // for (i = len; i < 2*len; i++) { 8047 // int j; 8048 8049 // Pa = Pa_base + i-len; 8050 // Pb = Pb_base + len; 8051 // Pm = Pm_base + i-len; 8052 // Pn = Pn_base + len; 8053 8054 // Ra = *++Pa; 8055 // Rb = *--Pb; 8056 // Rm = *++Pm; 8057 // Rn = *--Pn; 8058 8059 // int iters = len*2-i-1; 8060 // for (j = i-len+1; iters--; j++) { 8061 // 
assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 8062 // MACC(Ra, Rb, t0, t1, t2); 8063 // Ra = *++Pa; 8064 // Rb = *--Pb; 8065 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 8066 // MACC(Rm, Rn, t0, t1, t2); 8067 // Rm = *++Pm; 8068 // Rn = *--Pn; 8069 // } 8070 8071 // Pm_base[i-len] = t0; 8072 // t0 = t1; t1 = t2; t2 = 0; 8073 // } 8074 8075 // while (t0) 8076 // t0 = sub(Pm_base, Pn_base, t0, len); 8077 // } 8078 8079 /** 8080 * Fast Montgomery squaring. This uses asymptotically 25% fewer 8081 * multiplies than Montgomery multiplication so it should be up to 8082 * 25% faster. However, its loop control is more complex and it 8083 * may actually run slower on some machines. 8084 * 8085 * Arguments: 8086 * 8087 * Inputs: 8088 * c_rarg0 - int array elements a 8089 * c_rarg1 - int array elements n (the modulus) 8090 * c_rarg2 - int length 8091 * c_rarg3 - int inv 8092 * c_rarg4 - int array elements m (the result) 8093 * 8094 */ 8095 address generate_square() { 8096 Label argh; 8097 bind(argh); 8098 stop("MontgomeryMultiply total_allocation must be <= 8192"); 8099 8100 align(CodeEntryAlignment); 8101 address entry = pc(); 8102 8103 enter(); 8104 8105 // Make room. 8106 cmpw(Rlen, 512); 8107 br(Assembler::HI, argh); 8108 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 8109 andr(sp, Ra, -2 * wordSize); 8110 8111 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 8112 8113 { 8114 // Copy input args, reversing as we go. We use Ra as a 8115 // temporary variable. 8116 reverse(Ra, Pa_base, Rlen, t0, t1); 8117 reverse(Ra, Pn_base, Rlen, t0, t1); 8118 } 8119 8120 // Push all call-saved registers and also Pm_base which we'll need 8121 // at the end. 8122 save_regs(); 8123 8124 mov(Pm_base, Ra); 8125 8126 mov(t0, zr); 8127 mov(t1, zr); 8128 mov(t2, zr); 8129 8130 block_comment("for (int i = 0; i < len; i++) {"); 8131 mov(Ri, zr); { 8132 Label loop, end; 8133 bind(loop); 8134 cmp(Ri, Rlen); 8135 br(Assembler::GE, end); 8136 8137 pre1(Ri); 8138 8139 block_comment("for (j = (i+1)/2; j; j--) {"); { 8140 add(Rj, Ri, 1); 8141 lsr(Rj, Rj, 1); 8142 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 8143 } block_comment(" } // j"); 8144 8145 last_squaring(Ri); 8146 8147 block_comment(" for (j = i/2; j; j--) {"); { 8148 lsr(Rj, Ri, 1); 8149 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 8150 } block_comment(" } // j"); 8151 8152 post1_squaring(); 8153 add(Ri, Ri, 1); 8154 cmp(Ri, Rlen); 8155 br(Assembler::LT, loop); 8156 8157 bind(end); 8158 block_comment("} // i"); 8159 } 8160 8161 block_comment("for (int i = len; i < 2*len; i++) {"); 8162 mov(Ri, Rlen); { 8163 Label loop, end; 8164 bind(loop); 8165 cmp(Ri, Rlen, Assembler::LSL, 1); 8166 br(Assembler::GE, end); 8167 8168 pre2(Ri, Rlen); 8169 8170 block_comment(" for (j = (2*len-i-1)/2; j; j--) {"); { 8171 lsl(Rj, Rlen, 1); 8172 sub(Rj, Rj, Ri); 8173 sub(Rj, Rj, 1); 8174 lsr(Rj, Rj, 1); 8175 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 8176 } block_comment(" } // j"); 8177 8178 last_squaring(Ri); 8179 8180 block_comment(" for (j = (2*len-i)/2; j; j--) {"); { 8181 lsl(Rj, Rlen, 1); 8182 sub(Rj, Rj, Ri); 8183 lsr(Rj, Rj, 1); 8184 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 8185 } block_comment(" } // j"); 8186 8187 post2(Ri, Rlen); 8188 add(Ri, Ri, 1); 8189 cmp(Ri, Rlen, Assembler::LSL, 1); 8190 8191 br(Assembler::LT, loop); 8192 bind(end); 8193 block_comment("} // i"); 8194 } 8195 8196 normalize(Rlen); 8197 8198 mov(Ra, Pm_base); // Save Pm_base in Ra 8199 
  address generate_square() {
    Label argh;
    bind(argh);
    stop("MontgomeryMultiply total_allocation must be <= 8192");

    align(CodeEntryAlignment);
    address entry = pc();

    enter();

    // Make room.
    cmpw(Rlen, 512);
    br(Assembler::HI, argh);
    sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
    andr(sp, Ra, -2 * wordSize);

    lsrw(Rlen, Rlen, 1);  // length in longwords = len/2

    {
      // Copy input args, reversing as we go. We use Ra as a
      // temporary variable.
      reverse(Ra, Pa_base, Rlen, t0, t1);
      reverse(Ra, Pn_base, Rlen, t0, t1);
    }

    // Push all call-saved registers and also Pm_base which we'll need
    // at the end.
    save_regs();

    mov(Pm_base, Ra);

    mov(t0, zr);
    mov(t1, zr);
    mov(t2, zr);

    block_comment("for (int i = 0; i < len; i++) {");
    mov(Ri, zr); {
      Label loop, end;
      bind(loop);
      cmp(Ri, Rlen);
      br(Assembler::GE, end);

      pre1(Ri);

      block_comment("for (j = (i+1)/2; j; j--) {"); {
        add(Rj, Ri, 1);
        lsr(Rj, Rj, 1);
        unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
      } block_comment(" } // j");

      last_squaring(Ri);

      block_comment(" for (j = i/2; j; j--) {"); {
        lsr(Rj, Ri, 1);
        unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
      } block_comment(" } // j");

      post1_squaring();
      add(Ri, Ri, 1);
      cmp(Ri, Rlen);
      br(Assembler::LT, loop);

      bind(end);
      block_comment("} // i");
    }

    block_comment("for (int i = len; i < 2*len; i++) {");
    mov(Ri, Rlen); {
      Label loop, end;
      bind(loop);
      cmp(Ri, Rlen, Assembler::LSL, 1);
      br(Assembler::GE, end);

      pre2(Ri, Rlen);

      block_comment(" for (j = (2*len-i-1)/2; j; j--) {"); {
        lsl(Rj, Rlen, 1);
        sub(Rj, Rj, Ri);
        sub(Rj, Rj, 1);
        lsr(Rj, Rj, 1);
        unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
      } block_comment(" } // j");

      last_squaring(Ri);

      block_comment(" for (j = (2*len-i)/2; j; j--) {"); {
        lsl(Rj, Rlen, 1);
        sub(Rj, Rj, Ri);
        lsr(Rj, Rj, 1);
        unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
      } block_comment(" } // j");

      post2(Ri, Rlen);
      add(Ri, Ri, 1);
      cmp(Ri, Rlen, Assembler::LSL, 1);

      br(Assembler::LT, loop);
      bind(end);
      block_comment("} // i");
    }

    normalize(Rlen);

    mov(Ra, Pm_base);  // Save Pm_base in Ra
    restore_regs();    // Restore caller's Pm_base

    // Copy our result into caller's Pm_base
    reverse(Pm_base, Ra, Rlen, t0, t1);

    leave();
    ret(lr);

    return entry;
  }
  // In C, approximately:

  // void
  // montgomery_square(julong Pa_base[], julong Pn_base[],
  //                   julong Pm_base[], julong inv, int len) {
  //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
  //   julong *Pa, *Pb, *Pn, *Pm;
  //   julong Ra, Rb, Rn, Rm;

  //   int i;

  //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");

  //   for (i = 0; i < len; i++) {
  //     int j;

  //     Pa = Pa_base;
  //     Pb = Pa_base + i;
  //     Pm = Pm_base;
  //     Pn = Pn_base + i;

  //     Ra = *Pa;
  //     Rb = *Pb;
  //     Rm = *Pm;
  //     Rn = *Pn;

  //     int iters = (i+1)/2;
  //     for (j = 0; iters--; j++) {
  //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
  //       MACC2(Ra, Rb, t0, t1, t2);
  //       Ra = *++Pa;
  //       Rb = *--Pb;
  //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
  //       MACC(Rm, Rn, t0, t1, t2);
  //       Rm = *++Pm;
  //       Rn = *--Pn;
  //     }
  //     if ((i & 1) == 0) {
  //       assert(Ra == Pa_base[j], "must be");
  //       MACC(Ra, Ra, t0, t1, t2);
  //     }
  //     iters = i/2;
  //     assert(iters == i-j, "must be");
  //     for (; iters--; j++) {
  //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
  //       MACC(Rm, Rn, t0, t1, t2);
  //       Rm = *++Pm;
  //       Rn = *--Pn;
  //     }

  //     *Pm = Rm = t0 * inv;
  //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
  //     MACC(Rm, Rn, t0, t1, t2);

  //     assert(t0 == 0, "broken Montgomery multiply");

  //     t0 = t1; t1 = t2; t2 = 0;
  //   }

  //   for (i = len; i < 2*len; i++) {
  //     int start = i-len+1;
  //     int end = start + (len - start)/2;
  //     int j;

  //     Pa = Pa_base + i-len;
  //     Pb = Pa_base + len;
  //     Pm = Pm_base + i-len;
  //     Pn = Pn_base + len;

  //     Ra = *++Pa;
  //     Rb = *--Pb;
  //     Rm = *++Pm;
  //     Rn = *--Pn;

  //     int iters = (2*len-i-1)/2;
  //     assert(iters == end-start, "must be");
  //     for (j = start; iters--; j++) {
  //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
  //       MACC2(Ra, Rb, t0, t1, t2);
  //       Ra = *++Pa;
  //       Rb = *--Pb;
  //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
  //       MACC(Rm, Rn, t0, t1, t2);
  //       Rm = *++Pm;
  //       Rn = *--Pn;
  //     }
  //     if ((i & 1) == 0) {
  //       assert(Ra == Pa_base[j], "must be");
  //       MACC(Ra, Ra, t0, t1, t2);
  //     }
  //     iters = (2*len-i)/2;
  //     assert(iters == len-j, "must be");
  //     for (; iters--; j++) {
  //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
  //       MACC(Rm, Rn, t0, t1, t2);
  //       Rm = *++Pm;
  //       Rn = *--Pn;
  //     }
  //     Pm_base[i-len] = t0;
  //     t0 = t1; t1 = t2; t2 = 0;
  //   }

  //   while (t0)
  //     t0 = sub(Pm_base, Pn_base, t0, len);
  // }
};


  // Call here from the interpreter or compiled code to either load
  // multiple returned values from the inline type instance being
  // returned to registers or to store returned values to a newly
  // allocated inline type instance.
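  // This stub is the common code path for the two runtime entries registered
  // below in generate_initial_stubs() when InlineTypeReturnedAsFields is set:
  // SharedRuntime::load_inline_type_fields_in_regs (has_res == false) and
  // SharedRuntime::store_inline_type_fields_to_buf (has_res == true).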
  address generate_return_value_stub(address destination, const char* name, bool has_res) {
    // We need to save all registers the calling convention may use so
    // that the runtime calls can read or update those registers. This
    // needs to be in sync with SharedRuntime::java_return_convention().
    // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
    enum layout {
      j_rarg7_off = 0, j_rarg7_2,    // j_rarg7 is r0
      j_rarg6_off, j_rarg6_2,
      j_rarg5_off, j_rarg5_2,
      j_rarg4_off, j_rarg4_2,
      j_rarg3_off, j_rarg3_2,
      j_rarg2_off, j_rarg2_2,
      j_rarg1_off, j_rarg1_2,
      j_rarg0_off, j_rarg0_2,

      j_farg7_off, j_farg7_2,
      j_farg6_off, j_farg6_2,
      j_farg5_off, j_farg5_2,
      j_farg4_off, j_farg4_2,
      j_farg3_off, j_farg3_2,
      j_farg2_off, j_farg2_2,
      j_farg1_off, j_farg1_2,
      j_farg0_off, j_farg0_2,

      rfp_off, rfp_off2,
      return_off, return_off2,

      framesize // inclusive of return address
    };

    CodeBuffer code(name, 512, 64);
    MacroAssembler* masm = new MacroAssembler(&code);

    int frame_size_in_bytes = align_up(framesize*BytesPerInt, 16);
    assert(frame_size_in_bytes == framesize*BytesPerInt, "misaligned");
    int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
    int frame_size_in_words = frame_size_in_bytes / wordSize;

    OopMapSet* oop_maps = new OopMapSet();
    OopMap* map = new OopMap(frame_size_in_slots, 0);

    map->set_callee_saved(VMRegImpl::stack2reg(j_rarg7_off), j_rarg7->as_VMReg());
    map->set_callee_saved(VMRegImpl::stack2reg(j_rarg6_off), j_rarg6->as_VMReg());
    map->set_callee_saved(VMRegImpl::stack2reg(j_rarg5_off), j_rarg5->as_VMReg());
    map->set_callee_saved(VMRegImpl::stack2reg(j_rarg4_off), j_rarg4->as_VMReg());
    map->set_callee_saved(VMRegImpl::stack2reg(j_rarg3_off), j_rarg3->as_VMReg());
    map->set_callee_saved(VMRegImpl::stack2reg(j_rarg2_off), j_rarg2->as_VMReg());
    map->set_callee_saved(VMRegImpl::stack2reg(j_rarg1_off), j_rarg1->as_VMReg());
    map->set_callee_saved(VMRegImpl::stack2reg(j_rarg0_off), j_rarg0->as_VMReg());

    map->set_callee_saved(VMRegImpl::stack2reg(j_farg0_off), j_farg0->as_VMReg());
    map->set_callee_saved(VMRegImpl::stack2reg(j_farg1_off), j_farg1->as_VMReg());
    map->set_callee_saved(VMRegImpl::stack2reg(j_farg2_off), j_farg2->as_VMReg());
    map->set_callee_saved(VMRegImpl::stack2reg(j_farg3_off), j_farg3->as_VMReg());
    map->set_callee_saved(VMRegImpl::stack2reg(j_farg4_off), j_farg4->as_VMReg());
    map->set_callee_saved(VMRegImpl::stack2reg(j_farg5_off), j_farg5->as_VMReg());
    map->set_callee_saved(VMRegImpl::stack2reg(j_farg6_off), j_farg6->as_VMReg());
    map->set_callee_saved(VMRegImpl::stack2reg(j_farg7_off), j_farg7->as_VMReg());

    address start = __ pc();

    __ enter(); // Save FP and LR before call

    __ stpd(j_farg1, j_farg0, Address(__ pre(sp, -2 * wordSize)));
    __ stpd(j_farg3, j_farg2, Address(__ pre(sp, -2 * wordSize)));
    __ stpd(j_farg5, j_farg4, Address(__ pre(sp, -2 * wordSize)));
    __ stpd(j_farg7, j_farg6, Address(__ pre(sp, -2 * wordSize)));

    __ stp(j_rarg1, j_rarg0, Address(__ pre(sp, -2 * wordSize)));
    __ stp(j_rarg3, j_rarg2, Address(__ pre(sp, -2 * wordSize)));
    __ stp(j_rarg5, j_rarg4, Address(__ pre(sp, -2 * wordSize)));
    __ stp(j_rarg7, j_rarg6, Address(__ pre(sp, -2 * wordSize)));

    int frame_complete = __ offset();
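
    // frame_complete records the code offset at which the stub frame is fully
    // set up; it is passed to RuntimeStub::new_runtime_stub() at the end of
    // this method.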

    // Set up last_Java_sp and last_Java_fp
    address the_pc = __ pc();
    __ set_last_Java_frame(sp, noreg, the_pc, rscratch1);

    // Call runtime
    __ mov(c_rarg1, r0);
    __ mov(c_rarg0, rthread);

    __ mov(rscratch1, destination);
    __ blr(rscratch1);

    oop_maps->add_gc_map(the_pc - start, map);

    __ reset_last_Java_frame(false);

    __ ldp(j_rarg7, j_rarg6, Address(__ post(sp, 2 * wordSize)));
    __ ldp(j_rarg5, j_rarg4, Address(__ post(sp, 2 * wordSize)));
    __ ldp(j_rarg3, j_rarg2, Address(__ post(sp, 2 * wordSize)));
    __ ldp(j_rarg1, j_rarg0, Address(__ post(sp, 2 * wordSize)));

    __ ldpd(j_farg7, j_farg6, Address(__ post(sp, 2 * wordSize)));
    __ ldpd(j_farg5, j_farg4, Address(__ post(sp, 2 * wordSize)));
    __ ldpd(j_farg3, j_farg2, Address(__ post(sp, 2 * wordSize)));
    __ ldpd(j_farg1, j_farg0, Address(__ post(sp, 2 * wordSize)));

    __ leave();

    // check for pending exceptions
    Label pending;
    __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
    __ cbnz(rscratch1, pending);

    if (has_res) {
      __ get_vm_result(r0, rthread);
    }

    __ ret(lr);

    __ bind(pending);
    __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));

    // -------------
    // make sure all code is generated
    masm->flush();

    RuntimeStub* stub = RuntimeStub::new_runtime_stub(name, &code, frame_complete, frame_size_in_words, oop_maps, false);
    return stub->entry_point();
  }

  // Initialization
  void generate_initial_stubs() {
    // Generate initial stubs and initialize the entry points

    // Entry points that exist on all platforms. Note: This is code
    // that could be shared among different platforms - however the
    // benefit seems to be smaller than the disadvantage of having a
    // much more complicated generator structure. See also comment in
    // stubRoutines.hpp.

    StubRoutines::_forward_exception_entry = generate_forward_exception();

    StubRoutines::_call_stub_entry =
      generate_call_stub(StubRoutines::_call_stub_return_address);

    // is referenced by megamorphic call
    StubRoutines::_catch_exception_entry = generate_catch_exception();

    // Build this early so it's available for the interpreter.
    StubRoutines::_throw_StackOverflowError_entry =
      generate_throw_exception("StackOverflowError throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::throw_StackOverflowError));
    StubRoutines::_throw_delayed_StackOverflowError_entry =
      generate_throw_exception("delayed StackOverflowError throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::throw_delayed_StackOverflowError));

    // Initialize table for copy memory (arraycopy) check.
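    // The table records the PC ranges of the unsafe-copy sections of the
    // arraycopy stubs so that a memory fault raised inside one of them can
    // be handled rather than crashing the VM; the argument to create_table()
    // is the maximum number of such ranges expected on this platform.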
    if (UnsafeCopyMemory::_table == nullptr) {
      UnsafeCopyMemory::create_table(8);
    }

    if (UseCRC32Intrinsics) {
      // set table address before stub generation which uses it
      StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
      StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
    }

    if (UseCRC32CIntrinsics) {
      StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
    }

    // Disabled until JDK-8210858 is fixed
    // if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog)) {
    //   StubRoutines::_dlog = generate_dlog();
    // }

    if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
      StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
    }

    if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
      StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
    }

    if (InlineTypeReturnedAsFields) {
      StubRoutines::_load_inline_type_fields_in_regs =
        generate_return_value_stub(CAST_FROM_FN_PTR(address, SharedRuntime::load_inline_type_fields_in_regs), "load_inline_type_fields_in_regs", false);
      StubRoutines::_store_inline_type_fields_to_buf =
        generate_return_value_stub(CAST_FROM_FN_PTR(address, SharedRuntime::store_inline_type_fields_to_buf), "store_inline_type_fields_to_buf", true);
    }
  }

  void generate_continuation_stubs() {
    // Continuation stubs:
    StubRoutines::_cont_thaw = generate_cont_thaw();
    StubRoutines::_cont_returnBarrier = generate_cont_returnBarrier();
    StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception();

    JFR_ONLY(generate_jfr_stubs();)
  }

#if INCLUDE_JFR
  void generate_jfr_stubs() {
    StubRoutines::_jfr_write_checkpoint_stub = generate_jfr_write_checkpoint();
    StubRoutines::_jfr_write_checkpoint = StubRoutines::_jfr_write_checkpoint_stub->entry_point();
    StubRoutines::_jfr_return_lease_stub = generate_jfr_return_lease();
    StubRoutines::_jfr_return_lease = StubRoutines::_jfr_return_lease_stub->entry_point();
  }
#endif // INCLUDE_JFR

  void generate_final_stubs() {
    // support for verify_oop (must happen after universe_init)
    if (VerifyOops) {
      StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
    }
    StubRoutines::_throw_AbstractMethodError_entry =
      generate_throw_exception("AbstractMethodError throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::
                                                throw_AbstractMethodError));

    StubRoutines::_throw_IncompatibleClassChangeError_entry =
      generate_throw_exception("IncompatibleClassChangeError throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::
                                                throw_IncompatibleClassChangeError));

    StubRoutines::_throw_NullPointerException_at_call_entry =
      generate_throw_exception("NullPointerException at call throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::
                                                throw_NullPointerException_at_call));

    // arraycopy stubs used by compilers
    generate_arraycopy_stubs();

    BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
    if (bs_nm != nullptr) {
      StubRoutines::_method_entry_barrier = generate_method_entry_barrier();
    }

    StubRoutines::aarch64::_spin_wait = generate_spin_wait();

    if (UsePoly1305Intrinsics) {
      StubRoutines::_poly1305_processBlocks = generate_poly1305_processBlocks();
    }
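
    // Note: the block below is only compiled in on Linux builds that do not
    // already get LSE atomics from the compiler (__ARM_FEATURE_ATOMICS).
    // generate_atomic_entry_points() is expected to install CPU-appropriate
    // implementations behind the aarch64_atomic_*_impl pointers that are
    // given default implementations at the end of this file.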

#if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)

    generate_atomic_entry_points();

#endif // LINUX

    StubRoutines::_upcall_stub_exception_handler = generate_upcall_stub_exception_handler();

    StubRoutines::aarch64::set_completed(); // Indicate that arraycopy and zero_blocks stubs are generated
  }

  void generate_compiler_stubs() {
#if COMPILER2_OR_JVMCI

    if (UseSVE == 0) {
      StubRoutines::aarch64::_vector_iota_indices = generate_iota_indices("iota_indices");
    }

    // array equals stub for large arrays.
    if (!UseSimpleArrayEquals) {
      StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
    }

    // byte_array_inflate stub for large arrays.
    StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();

    // countPositives stub for large arrays.
    StubRoutines::aarch64::_count_positives = generate_count_positives(StubRoutines::aarch64::_count_positives_long);

    generate_compare_long_strings();

    generate_string_indexof_stubs();

#ifdef COMPILER2
    if (UseMultiplyToLenIntrinsic) {
      StubRoutines::_multiplyToLen = generate_multiplyToLen();
    }

    if (UseSquareToLenIntrinsic) {
      StubRoutines::_squareToLen = generate_squareToLen();
    }

    if (UseMulAddIntrinsic) {
      StubRoutines::_mulAdd = generate_mulAdd();
    }

    if (UseSIMDForBigIntegerShiftIntrinsics) {
      StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
      StubRoutines::_bigIntegerLeftShiftWorker = generate_bigIntegerLeftShift();
    }

    if (UseMontgomeryMultiplyIntrinsic) {
      StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
      MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
      StubRoutines::_montgomeryMultiply = g.generate_multiply();
    }

    if (UseMontgomerySquareIntrinsic) {
      StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
      MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
      // We use generate_multiply() rather than generate_square()
      // because it's faster for the sizes of modulus we care about.
      StubRoutines::_montgomerySquare = g.generate_multiply();
    }
#endif // COMPILER2

    if (UseChaCha20Intrinsics) {
      StubRoutines::_chacha20Block = generate_chacha20Block_blockpar();
    }

    if (UseBASE64Intrinsics) {
      StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
      StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
    }

    // data cache line writeback
    StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
    StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();

    if (UseAESIntrinsics) {
      StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
      StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
      StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
      StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
      StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt();
    }
    if (UseGHASHIntrinsics) {
      // StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
      StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks_wide();
    }
    if (UseAESIntrinsics && UseGHASHIntrinsics) {
      StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt();
    }

    if (UseMD5Intrinsics) {
      StubRoutines::_md5_implCompress = generate_md5_implCompress(false, "md5_implCompress");
      StubRoutines::_md5_implCompressMB = generate_md5_implCompress(true, "md5_implCompressMB");
    }
    if (UseSHA1Intrinsics) {
      StubRoutines::_sha1_implCompress = generate_sha1_implCompress(false, "sha1_implCompress");
      StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(true, "sha1_implCompressMB");
    }
    if (UseSHA256Intrinsics) {
      StubRoutines::_sha256_implCompress = generate_sha256_implCompress(false, "sha256_implCompress");
      StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true, "sha256_implCompressMB");
    }
    if (UseSHA512Intrinsics) {
      StubRoutines::_sha512_implCompress = generate_sha512_implCompress(false, "sha512_implCompress");
      StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true, "sha512_implCompressMB");
    }
    if (UseSHA3Intrinsics) {
      StubRoutines::_sha3_implCompress = generate_sha3_implCompress(false, "sha3_implCompress");
      StubRoutines::_sha3_implCompressMB = generate_sha3_implCompress(true, "sha3_implCompressMB");
    }

    // generate Adler32 intrinsics code
    if (UseAdler32Intrinsics) {
      StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
    }
#endif // COMPILER2_OR_JVMCI
  }

 public:
  StubGenerator(CodeBuffer* code, StubsKind kind) : StubCodeGenerator(code) {
    switch(kind) {
    case Initial_stubs:
      generate_initial_stubs();
      break;
    case Continuation_stubs:
      generate_continuation_stubs();
      break;
    case Compiler_stubs:
      generate_compiler_stubs();
      break;
    case Final_stubs:
      generate_final_stubs();
      break;
    default:
      fatal("unexpected stubs kind: %d", kind);
      break;
    };
  }
}; // end class declaration

void StubGenerator_generate(CodeBuffer* code, StubCodeGenerator::StubsKind kind) {
  StubGenerator g(code, kind);
}


#if defined (LINUX)

// Define pointers to atomic stubs and initialize them to point to the
// code in atomic_aarch64.S.

#define DEFAULT_ATOMIC_OP(OPNAME, SIZE, RELAXED)                                          \
  extern "C" uint64_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl  \
    (volatile void *ptr, uint64_t arg1, uint64_t arg2);                                   \
  aarch64_atomic_stub_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _impl        \
    = aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl;

DEFAULT_ATOMIC_OP(fetch_add, 4, )
DEFAULT_ATOMIC_OP(fetch_add, 8, )
DEFAULT_ATOMIC_OP(fetch_add, 4, _relaxed)
DEFAULT_ATOMIC_OP(fetch_add, 8, _relaxed)
DEFAULT_ATOMIC_OP(xchg, 4, )
DEFAULT_ATOMIC_OP(xchg, 8, )
DEFAULT_ATOMIC_OP(cmpxchg, 1, )
DEFAULT_ATOMIC_OP(cmpxchg, 4, )
DEFAULT_ATOMIC_OP(cmpxchg, 8, )
DEFAULT_ATOMIC_OP(cmpxchg, 1, _relaxed)
DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed)
DEFAULT_ATOMIC_OP(cmpxchg, 8, _relaxed)
DEFAULT_ATOMIC_OP(cmpxchg, 4, _release)
DEFAULT_ATOMIC_OP(cmpxchg, 8, _release)
DEFAULT_ATOMIC_OP(cmpxchg, 4, _seq_cst)
DEFAULT_ATOMIC_OP(cmpxchg, 8, _seq_cst)

#undef DEFAULT_ATOMIC_OP

#endif // LINUX
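
// For reference, a single expansion of the DEFAULT_ATOMIC_OP macro above --
// e.g. DEFAULT_ATOMIC_OP(fetch_add, 4, ) -- declares the assembly fallback
// provided by atomic_aarch64.S and defines the dispatch pointer that
// initially refers to it:
//
//   extern "C" uint64_t aarch64_atomic_fetch_add_4_default_impl
//     (volatile void *ptr, uint64_t arg1, uint64_t arg2);
//   aarch64_atomic_stub_t aarch64_atomic_fetch_add_4_impl
//     = aarch64_atomic_fetch_add_4_default_impl;
//
// generate_atomic_entry_points() (called from generate_final_stubs() above)
// can later repoint aarch64_atomic_fetch_add_4_impl at generated stub code.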