/*
 * Copyright (c) 2003, 2024, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2014, 2022, Red Hat Inc. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/macroAssembler.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "asm/register.hpp"
#include "atomic_aarch64.hpp"
#include "compiler/oopMap.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "gc/shared/gc_globals.hpp"
#include "gc/shared/tlab_globals.hpp"
#include "interpreter/interpreter.hpp"
#include "memory/universe.hpp"
#include "nativeInst_aarch64.hpp"
#include "oops/instanceOop.hpp"
#include "oops/method.hpp"
#include "oops/objArrayKlass.hpp"
#include "oops/oop.inline.hpp"
#include "prims/methodHandles.hpp"
#include "prims/upcallLinker.hpp"
#include "runtime/atomic.hpp"
#include "runtime/continuation.hpp"
#include "runtime/continuationEntry.inline.hpp"
#include "runtime/frame.inline.hpp"
#include "runtime/handles.inline.hpp"
#include "runtime/javaThread.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubCodeGenerator.hpp"
#include "runtime/stubRoutines.hpp"
#include "utilities/align.hpp"
#include "utilities/checkedCast.hpp"
#include "utilities/globalDefinitions.hpp"
#include "utilities/powerOfTwo.hpp"
#ifdef COMPILER2
#include "opto/runtime.hpp"
#endif
#if INCLUDE_ZGC
#include "gc/z/zThreadLocalData.hpp"
#endif

// Declaration and definition of StubGenerator (no .hpp file).
// For a more detailed description of the stub routine structure
// see the comment in stubRoutines.hpp

#undef __
#define __ _masm->

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) __ block_comment(str)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

// Stub Code definitions

class StubGenerator: public StubCodeGenerator {
 private:

#ifdef PRODUCT
#define inc_counter_np(counter) ((void)0)
#else
  void inc_counter_np_(uint& counter) {
    __ lea(rscratch2, ExternalAddress((address)&counter));
    __ ldrw(rscratch1, Address(rscratch2));
    __ addw(rscratch1, rscratch1, 1);
    __ strw(rscratch1, Address(rscratch2));
  }
#define inc_counter_np(counter) \
  BLOCK_COMMENT("inc_counter " #counter); \
  inc_counter_np_(counter);
#endif

  // Call stubs are used to call Java from C
  //
  // Arguments:
  //    c_rarg0:   call wrapper address              address
  //    c_rarg1:   result                            address
  //    c_rarg2:   result type                       BasicType
  //    c_rarg3:   method                            Method*
  //    c_rarg4:   (interpreter) entry point         address
  //    c_rarg5:   parameters                        intptr_t*
  //    c_rarg6:   parameter size (in words)         int
  //    c_rarg7:   thread                            Thread*
  //
  // There is no return from the stub itself as any Java result
  // is written to result
  //
  // we save r30 (lr) as the return PC at the base of the frame and
  // link r29 (fp) below it as the frame pointer installing sp (r31)
  // into fp.
  //
  // we save r0-r7, which accounts for all the c arguments.
  //
  // TODO: strictly do we need to save them all? they are treated as
  // volatile by C so could we omit saving the ones we are going to
  // place in global registers (thread? method?) or those we only use
  // during setup of the Java call?
  //
  // we don't need to save r8 which C uses as an indirect result location
  // return register.
  //
  // we don't need to save r9-r15 which both C and Java treat as
  // volatile
  //
  // we don't need to save r16-r18 because Java does not use them
  //
  // we save r19-r28 which Java uses as scratch registers and C
  // expects to be callee-save
  //
  // we save the bottom 64 bits of each value stored in v8-v15; it is
  // the responsibility of the caller to preserve larger values.
  //
  // so the stub frame looks like this when we enter Java code
  //
  //     [ return_from_Java     ] <--- sp
  //     [ argument word n      ]
  //      ...
  // -29 [ argument word 1      ]
  // -28 [ saved Floating-point Control Register ]
  // -26 [ saved v15            ] <--- sp_after_call
  // -25 [ saved v14            ]
  // -24 [ saved v13            ]
  // -23 [ saved v12            ]
  // -22 [ saved v11            ]
  // -21 [ saved v10            ]
  // -20 [ saved v9             ]
  // -19 [ saved v8             ]
  // -18 [ saved r28            ]
  // -17 [ saved r27            ]
  // -16 [ saved r26            ]
  // -15 [ saved r25            ]
  // -14 [ saved r24            ]
  // -13 [ saved r23            ]
  // -12 [ saved r22            ]
  // -11 [ saved r21            ]
  // -10 [ saved r20            ]
  //  -9 [ saved r19            ]
  //  -8 [ call wrapper   (r0)  ]
  //  -7 [ result         (r1)  ]
  //  -6 [ result type    (r2)  ]
  //  -5 [ method         (r3)  ]
  //  -4 [ entry point    (r4)  ]
  //  -3 [ parameters     (r5)  ]
  //  -2 [ parameter size (r6)  ]
  //  -1 [ thread         (r7)  ]
  //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
  //   1 [ saved lr       (r30) ]

  // Call stub stack layout word offsets from fp
  enum call_stub_layout {
    sp_after_call_off  = -28,

    fpcr_off           = sp_after_call_off,
    d15_off            = -26,
    d13_off            = -24,
    d11_off            = -22,
    d9_off             = -20,

    r28_off            = -18,
    r26_off            = -16,
    r24_off            = -14,
    r22_off            = -12,
    r20_off            = -10,
    call_wrapper_off   =  -8,
    result_off         =  -7,
    result_type_off    =  -6,
    method_off         =  -5,
    entry_point_off    =  -4,
    parameter_size_off =  -2,
    thread_off         =  -1,
    fp_f               =   0,
    retaddr_off        =   1,
  };

  address generate_call_stub(address& return_address) {
    assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
           (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
           "adjust this code");

    StubCodeMark mark(this, "StubRoutines", "call_stub");
    address start = __ pc();

    const Address sp_after_call (rfp, sp_after_call_off  * wordSize);

    const Address fpcr_save     (rfp, fpcr_off           * wordSize);
    const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
    const Address result        (rfp, result_off         * wordSize);
    const Address result_type   (rfp, result_type_off    * wordSize);
    const Address method        (rfp, method_off         * wordSize);
    const Address entry_point   (rfp, entry_point_off    * wordSize);
    const Address parameter_size(rfp, parameter_size_off * wordSize);

    const Address thread        (rfp, thread_off         * wordSize);

    const Address d15_save      (rfp, d15_off            * wordSize);
    const Address d13_save      (rfp, d13_off            * wordSize);
    const Address d11_save      (rfp, d11_off            * wordSize);
    const Address d9_save       (rfp, d9_off             * wordSize);

    const Address r28_save      (rfp, r28_off            * wordSize);
    const Address r26_save      (rfp, r26_off            * wordSize);
    const Address r24_save      (rfp, r24_off            * wordSize);
    const Address r22_save      (rfp, r22_off            * wordSize);
    const Address r20_save      (rfp, r20_off            * wordSize);

    // stub code

    address aarch64_entry = __ pc();

    // set up frame and move sp to end of save area
    __ enter();
    __ sub(sp, rfp, -sp_after_call_off * wordSize);
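
    // Worked example for the layout above (added note): with wordSize == 8,
    // d15_off == -26 places the (v15, v14) pair at rfp - 26 * 8 = rfp - 208,
    // so the stpd(v15, v14, d15_save) below writes v15 to [rfp - 208] and
    // v14 to [rfp - 200], matching the -26/-25 rows of the frame diagram.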

    // save register parameters and Java scratch/global registers
    // n.b. we save thread even though it gets installed in
    // rthread because we want to sanity check rthread later
    __ str(c_rarg7,  thread);
    __ strw(c_rarg6, parameter_size);
    __ stp(c_rarg4, c_rarg5,  entry_point);
    __ stp(c_rarg2, c_rarg3,  result_type);
    __ stp(c_rarg0, c_rarg1,  call_wrapper);

    __ stp(r20, r19,   r20_save);
    __ stp(r22, r21,   r22_save);
    __ stp(r24, r23,   r24_save);
    __ stp(r26, r25,   r26_save);
    __ stp(r28, r27,   r28_save);

    __ stpd(v9,  v8,   d9_save);
    __ stpd(v11, v10,  d11_save);
    __ stpd(v13, v12,  d13_save);
    __ stpd(v15, v14,  d15_save);

    __ get_fpcr(rscratch1);
    __ str(rscratch1, fpcr_save);
    // Set FPCR to the state we need. We do want Round to Nearest. We
    // don't want non-IEEE rounding modes or floating-point traps.
    __ bfi(rscratch1, zr, 22, 4); // Clear DN, FZ, and Rmode
    __ bfi(rscratch1, zr, 8, 5);  // Clear exception-control bits (8-12)
    __ set_fpcr(rscratch1);

    // install Java thread in global register now we have saved
    // whatever value it held
    __ mov(rthread, c_rarg7);
    // And method
    __ mov(rmethod, c_rarg3);

    // set up the heapbase register
    __ reinit_heapbase();

#ifdef ASSERT
    // make sure we have no pending exceptions
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
      __ cmp(rscratch1, (u1)NULL_WORD);
      __ br(Assembler::EQ, L);
      __ stop("StubRoutines::call_stub: entered with pending exception");
      __ BIND(L);
    }
#endif
    // pass parameters if any
    __ mov(esp, sp);
    __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
    __ andr(sp, rscratch1, -2 * wordSize);

    BLOCK_COMMENT("pass parameters if any");
    Label parameters_done;
    // parameter count is still in c_rarg6
    // and parameter pointer identifying param 1 is in c_rarg5
    __ cbzw(c_rarg6, parameters_done);

    address loop = __ pc();
    __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
    __ subsw(c_rarg6, c_rarg6, 1);
    __ push(rscratch1);
    __ br(Assembler::GT, loop);

    __ BIND(parameters_done);
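
    // Note on the loop above (added; assumes MacroAssembler::push(Register)
    // is the esp-relative pre-decrement store used elsewhere by the
    // interpreter): sp has already been dropped below the parameter area and
    // 16-byte aligned, so the pushes walk esp down from the old sp. For
    // parameter_size == 2, param 1 lands at old_sp - 8 and param 2 at
    // old_sp - 16, leaving esp pointing at the last parameter word.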

    // call Java entry -- passing Method* and current sp
    //      rmethod: Method*
    //      r19_sender_sp: sender sp
    BLOCK_COMMENT("call Java function");
    __ mov(r19_sender_sp, sp);
    __ blr(c_rarg4);

    // we do this here because the notify will already have been done
    // if we get to the next instruction via an exception
    //
    // n.b. adding this instruction here affects the calculation of
    // whether or not a routine returns to the call stub (used when
    // doing stack walks) since the normal test is to check the return
    // pc against the address saved below. so we may need to allow for
    // this extra instruction in the check.

    // save current address for use by exception handling code

    return_address = __ pc();

    // store result depending on type (everything that is not
    // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
    // n.b. this assumes Java returns an integral result in r0
    // and a floating result in j_farg0
    // All of j_rargN may be used to return inline type fields so be careful
    // not to clobber those.
    // SharedRuntime::generate_buffered_inline_type_adapter() knows the register
    // assignment of Rresult below.
    Register Rresult = r14, Rresult_type = r15;
    __ ldr(Rresult, result);
    Label is_long, is_float, is_double, check_prim, exit;
    __ ldr(Rresult_type, result_type);
    __ cmp(Rresult_type, (u1)T_OBJECT);
    __ br(Assembler::EQ, check_prim);
    __ cmp(Rresult_type, (u1)T_LONG);
    __ br(Assembler::EQ, is_long);
    __ cmp(Rresult_type, (u1)T_FLOAT);
    __ br(Assembler::EQ, is_float);
    __ cmp(Rresult_type, (u1)T_DOUBLE);
    __ br(Assembler::EQ, is_double);

    // handle T_INT case
    __ strw(r0, Address(Rresult));

    __ BIND(exit);

    // pop parameters
    __ sub(esp, rfp, -sp_after_call_off * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ldr(rscratch1, thread);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::NE, S);
      __ get_thread(rscratch1);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::EQ, L);
      __ BIND(S);
      __ stop("StubRoutines::call_stub: threads must correspond");
      __ BIND(L);
    }
#endif

    __ pop_cont_fastpath(rthread);

    // restore callee-save registers
    __ ldpd(v15, v14,   d15_save);
    __ ldpd(v13, v12,   d13_save);
    __ ldpd(v11, v10,   d11_save);
    __ ldpd(v9,  v8,    d9_save);

    __ ldp(r28, r27,   r28_save);
    __ ldp(r26, r25,   r26_save);
    __ ldp(r24, r23,   r24_save);
    __ ldp(r22, r21,   r22_save);
    __ ldp(r20, r19,   r20_save);

    // restore fpcr
    __ ldr(rscratch1,  fpcr_save);
    __ set_fpcr(rscratch1);

    __ ldp(c_rarg0, c_rarg1,  call_wrapper);
    __ ldrw(c_rarg2, result_type);
    __ ldr(c_rarg3,  method);
    __ ldp(c_rarg4, c_rarg5,  entry_point);
    __ ldp(c_rarg6, c_rarg7,  parameter_size);

    // leave frame and return to caller
    __ leave();
    __ ret(lr);

    // handle return types different from T_INT
    __ BIND(check_prim);
    if (InlineTypeReturnedAsFields) {
      // Check for scalarized return value
      __ tbz(r0, 0, is_long);
      // Load pack handler address
      __ andr(rscratch1, r0, -2);
      __ ldr(rscratch1, Address(rscratch1, InstanceKlass::adr_inlineklass_fixed_block_offset()));
      __ ldr(rscratch1, Address(rscratch1, InlineKlass::pack_handler_jobject_offset()));
      __ blr(rscratch1);
      __ b(exit);
    }

    __ BIND(is_long);
    __ str(r0, Address(Rresult, 0));
    __ br(Assembler::AL, exit);

    __ BIND(is_float);
    __ strs(j_farg0, Address(Rresult, 0));
    __ br(Assembler::AL, exit);

    __ BIND(is_double);
    __ strd(j_farg0, Address(Rresult, 0));
    __ br(Assembler::AL, exit);

    return start;
  }
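
  // Illustrative C-side view of the stub above (added sketch; the real
  // typedef is CallStub in stubRoutines.hpp and the real caller is
  // JavaCalls::call_helper, so treat the exact signature as an assumption):
  //
  //   typedef void (*CallStub)(address link, intptr_t* result,
  //                            int result_type, Method* method,
  //                            address entry_point, intptr_t* parameters,
  //                            int size_of_parameters, TRAPS);
  //
  //   // e.g. StubRoutines::call_stub()(wrapper, &value, T_INT, method,
  //   //                                entry, parms, 2, thread);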
437 // 438 // r0: exception oop 439 440 address generate_catch_exception() { 441 StubCodeMark mark(this, "StubRoutines", "catch_exception"); 442 address start = __ pc(); 443 444 // same as in generate_call_stub(): 445 const Address sp_after_call(rfp, sp_after_call_off * wordSize); 446 const Address thread (rfp, thread_off * wordSize); 447 448 #ifdef ASSERT 449 // verify that threads correspond 450 { 451 Label L, S; 452 __ ldr(rscratch1, thread); 453 __ cmp(rthread, rscratch1); 454 __ br(Assembler::NE, S); 455 __ get_thread(rscratch1); 456 __ cmp(rthread, rscratch1); 457 __ br(Assembler::EQ, L); 458 __ bind(S); 459 __ stop("StubRoutines::catch_exception: threads must correspond"); 460 __ bind(L); 461 } 462 #endif 463 464 // set pending exception 465 __ verify_oop(r0); 466 467 __ str(r0, Address(rthread, Thread::pending_exception_offset())); 468 __ mov(rscratch1, (address)__FILE__); 469 __ str(rscratch1, Address(rthread, Thread::exception_file_offset())); 470 __ movw(rscratch1, (int)__LINE__); 471 __ strw(rscratch1, Address(rthread, Thread::exception_line_offset())); 472 473 // complete return to VM 474 assert(StubRoutines::_call_stub_return_address != nullptr, 475 "_call_stub_return_address must have been generated before"); 476 __ b(StubRoutines::_call_stub_return_address); 477 478 return start; 479 } 480 481 // Continuation point for runtime calls returning with a pending 482 // exception. The pending exception check happened in the runtime 483 // or native call stub. The pending exception in Thread is 484 // converted into a Java-level exception. 485 // 486 // Contract with Java-level exception handlers: 487 // r0: exception 488 // r3: throwing pc 489 // 490 // NOTE: At entry of this stub, exception-pc must be in LR !! 491 492 // NOTE: this is always used as a jump target within generated code 493 // so it just needs to be generated code with no x86 prolog 494 495 address generate_forward_exception() { 496 StubCodeMark mark(this, "StubRoutines", "forward exception"); 497 address start = __ pc(); 498 499 // Upon entry, LR points to the return address returning into 500 // Java (interpreted or compiled) code; i.e., the return address 501 // becomes the throwing pc. 502 // 503 // Arguments pushed before the runtime call are still on the stack 504 // but the exception handler will reset the stack pointer -> 505 // ignore them. A potential result in registers can be ignored as 506 // well. 507 508 #ifdef ASSERT 509 // make sure this code is only executed if there is a pending exception 510 { 511 Label L; 512 __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset())); 513 __ cbnz(rscratch1, L); 514 __ stop("StubRoutines::forward exception: no pending exception (1)"); 515 __ bind(L); 516 } 517 #endif 518 519 // compute exception handler into r19 520 521 // call the VM to find the handler address associated with the 522 // caller address. pass thread in r0 and caller pc (ret address) 523 // in r1. n.b. the caller pc is in lr, unlike x86 where it is on 524 // the stack. 525 __ mov(c_rarg1, lr); 526 // lr will be trashed by the VM call so we move it to R19 527 // (callee-saved) because we also need to pass it to the handler 528 // returned by this call. 

  // Continuation point for runtime calls returning with a pending
  // exception.  The pending exception check happened in the runtime
  // or native call stub.  The pending exception in Thread is
  // converted into a Java-level exception.
  //
  // Contract with Java-level exception handlers:
  // r0: exception
  // r3: throwing pc
  //
  // NOTE: At entry of this stub, exception-pc must be in LR !!

  // NOTE: this is always used as a jump target within generated code
  // so it just needs to be generated code with no x86 prolog

  address generate_forward_exception() {
    StubCodeMark mark(this, "StubRoutines", "forward exception");
    address start = __ pc();

    // Upon entry, LR points to the return address returning into
    // Java (interpreted or compiled) code; i.e., the return address
    // becomes the throwing pc.
    //
    // Arguments pushed before the runtime call are still on the stack
    // but the exception handler will reset the stack pointer ->
    // ignore them.  A potential result in registers can be ignored as
    // well.

#ifdef ASSERT
    // make sure this code is only executed if there is a pending exception
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
      __ cbnz(rscratch1, L);
      __ stop("StubRoutines::forward exception: no pending exception (1)");
      __ bind(L);
    }
#endif

    // compute exception handler into r19

    // call the VM to find the handler address associated with the
    // caller address. pass thread in r0 and caller pc (ret address)
    // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
    // the stack.
    __ mov(c_rarg1, lr);
    // lr will be trashed by the VM call so we move it to R19
    // (callee-saved) because we also need to pass it to the handler
    // returned by this call.
    __ mov(r19, lr);
    BLOCK_COMMENT("call exception_handler_for_return_address");
    __ call_VM_leaf(CAST_FROM_FN_PTR(address,
                         SharedRuntime::exception_handler_for_return_address),
                    rthread, c_rarg1);
    // Reinitialize the ptrue predicate register, in case the external runtime
    // call clobbers ptrue reg, as we may return to SVE compiled code.
    __ reinitialize_ptrue();

    // we should not really care that lr is no longer the callee
    // address. we saved the value the handler needs in r19 so we can
    // just copy it to r3. however, the C2 handler will push its own
    // frame and then calls into the VM and the VM code asserts that
    // the PC for the frame above the handler belongs to a compiled
    // Java method. So, we restore lr here to satisfy that assert.
    __ mov(lr, r19);
    // setup r0 & r3 & clear pending exception
    __ mov(r3, r19);
    __ mov(r19, r0);
    __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
    __ str(zr, Address(rthread, Thread::pending_exception_offset()));

#ifdef ASSERT
    // make sure exception is set
    {
      Label L;
      __ cbnz(r0, L);
      __ stop("StubRoutines::forward exception: no pending exception (2)");
      __ bind(L);
    }
#endif

    // continue at exception handler
    // r0: exception
    // r3: throwing pc
    // r19: exception handler
    __ verify_oop(r0);
    __ br(r19);

    return start;
  }

  // Non-destructive plausibility checks for oops
  //
  // Arguments:
  //    r0: oop to verify
  //    rscratch1: error message
  //
  // Stack after saving c_rarg3:
  //    [tos + 0]: saved c_rarg3
  //    [tos + 1]: saved c_rarg2
  //    [tos + 2]: saved lr
  //    [tos + 3]: saved rscratch2
  //    [tos + 4]: saved r0
  //    [tos + 5]: saved rscratch1
  address generate_verify_oop() {

    StubCodeMark mark(this, "StubRoutines", "verify_oop");
    address start = __ pc();

    Label exit, error;

    // save c_rarg2 and c_rarg3
    __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));

    // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ ldr(c_rarg3, Address(c_rarg2));
    __ add(c_rarg3, c_rarg3, 1);
    __ str(c_rarg3, Address(c_rarg2));

    // object is in r0
    // make sure object is 'reasonable'
    __ cbz(r0, exit); // if obj is null it is OK

    BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
    bs_asm->check_oop(_masm, r0, c_rarg2, c_rarg3, error);

    // return if everything seems ok
    __ bind(exit);

    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
    __ ret(lr);

    // handle errors
    __ bind(error);
    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));

    __ push(RegSet::range(r0, r29), sp);
    // debug(char* msg, int64_t pc, int64_t regs[])
    __ mov(c_rarg0, rscratch1);      // pass address of error message
    __ mov(c_rarg1, lr);             // pass return address
    __ mov(c_rarg2, sp);             // pass address of regs on stack
#ifndef PRODUCT
    assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
#endif
    BLOCK_COMMENT("call MacroAssembler::debug");
    __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
    __ blr(rscratch1);
    __ hlt(0);

    return start;
  }
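
  // How the stub above is reached (added note, inferred from the argument
  // contract it documents): MacroAssembler::verify_oop moves the oop to r0,
  // materializes the error-message address in rscratch1, and calls the
  // entry recorded for this stub (StubRoutines::_verify_oop_subroutine_entry
  // in shared code), so __ verify_oop(r10) expands to that calling sequence
  // when oop verification is enabled.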

  // Generate indices for iota vector.
  address generate_iota_indices(const char *stub_name) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", stub_name);
    address start = __ pc();
    // B
    __ emit_data64(0x0706050403020100, relocInfo::none);
    __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none);
    // H
    __ emit_data64(0x0003000200010000, relocInfo::none);
    __ emit_data64(0x0007000600050004, relocInfo::none);
    // S
    __ emit_data64(0x0000000100000000, relocInfo::none);
    __ emit_data64(0x0000000300000002, relocInfo::none);
    // D
    __ emit_data64(0x0000000000000000, relocInfo::none);
    __ emit_data64(0x0000000000000001, relocInfo::none);
    // S - FP
    __ emit_data64(0x3F80000000000000, relocInfo::none); // 0.0f, 1.0f
    __ emit_data64(0x4040000040000000, relocInfo::none); // 2.0f, 3.0f
    // D - FP
    __ emit_data64(0x0000000000000000, relocInfo::none); // 0.0d
    __ emit_data64(0x3FF0000000000000, relocInfo::none); // 1.0d
    return start;
  }
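
  // Added note on the table above: each pair of emit_data64 calls is one
  // 16-byte row holding the sequence 0, 1, 2, ... for one element size.
  // The data is little-endian, so the B row 0x0706050403020100 lays out in
  // memory as bytes 00 01 02 ... 07, and a 16-byte vector load of a row
  // yields lane i == i. The FP rows encode the same iota as floats and
  // doubles, e.g. 0x3F800000 (1.0f) sits in the upper half of the first
  // S - FP word.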

  // The inner part of zero_words().  This is the bulk operation,
  // zeroing words in blocks, possibly using DC ZVA to do it.  The
  // caller is responsible for zeroing the last few words.
  //
  // Inputs:
  // r10: the HeapWord-aligned base address of an array to zero.
  // r11: the count in HeapWords, r11 > 0.
  //
  // Returns r10 and r11, adjusted for the caller to clear.
  // r10: the base address of the tail of words left to clear.
  // r11: the number of words in the tail.
  //      r11 < MacroAssembler::zero_words_block_size.

  address generate_zero_blocks() {
    Label done;
    Label base_aligned;

    Register base = r10, cnt = r11;

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "zero_blocks");
    address start = __ pc();

    if (UseBlockZeroing) {
      int zva_length = VM_Version::zva_length();

      // Ensure ZVA length can be divided by 16. This is required by
      // the subsequent operations.
      assert (zva_length % 16 == 0, "Unexpected ZVA Length");

      __ tbz(base, 3, base_aligned);
      __ str(zr, Address(__ post(base, 8)));
      __ sub(cnt, cnt, 1);
      __ bind(base_aligned);

      // Ensure count >= zva_length * 2 so that it still deserves a zva after
      // alignment.
      Label small;
      int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
      __ subs(rscratch1, cnt, low_limit >> 3);
      __ br(Assembler::LT, small);
      __ zero_dcache_blocks(base, cnt);
      __ bind(small);
    }

    {
      // Number of stp instructions we'll unroll
      const int unroll =
        MacroAssembler::zero_words_block_size / 2;
      // Clear the remaining blocks.
      Label loop;
      __ subs(cnt, cnt, unroll * 2);
      __ br(Assembler::LT, done);
      __ bind(loop);
      for (int i = 0; i < unroll; i++)
        __ stp(zr, zr, __ post(base, 16));
      __ subs(cnt, cnt, unroll * 2);
      __ br(Assembler::GE, loop);
      __ bind(done);
      __ add(cnt, cnt, unroll * 2);
    }

    __ ret(lr);

    return start;
  }
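
  // Sketch of the intended calling convention (added; assumes the usual
  // MacroAssembler::zero_words pattern and the StubRoutines::aarch64
  // accessor name):
  //
  //   __ mov(r10, base);   // HeapWord-aligned base
  //   __ mov(r11, cnt);    // word count, cnt > 0
  //   __ far_call(RuntimeAddress(StubRoutines::aarch64::zero_blocks()));
  //   // on return r10/r11 describe a tail of fewer than
  //   // MacroAssembler::zero_words_block_size words, zeroed by the caller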

  typedef enum {
    copy_forwards = 1,
    copy_backwards = -1
  } copy_direction;

  // Helper object to reduce noise when telling the GC barriers how to perform loads and stores
  // for arraycopy stubs.
  class ArrayCopyBarrierSetHelper : StackObj {
    BarrierSetAssembler* _bs_asm;
    MacroAssembler* _masm;
    DecoratorSet _decorators;
    BasicType _type;
    Register _gct1;
    Register _gct2;
    Register _gct3;
    FloatRegister _gcvt1;
    FloatRegister _gcvt2;
    FloatRegister _gcvt3;

  public:
    ArrayCopyBarrierSetHelper(MacroAssembler* masm,
                              DecoratorSet decorators,
                              BasicType type,
                              Register gct1,
                              Register gct2,
                              Register gct3,
                              FloatRegister gcvt1,
                              FloatRegister gcvt2,
                              FloatRegister gcvt3)
      : _bs_asm(BarrierSet::barrier_set()->barrier_set_assembler()),
        _masm(masm),
        _decorators(decorators),
        _type(type),
        _gct1(gct1),
        _gct2(gct2),
        _gct3(gct3),
        _gcvt1(gcvt1),
        _gcvt2(gcvt2),
        _gcvt3(gcvt3) {
    }

    void copy_load_at_32(FloatRegister dst1, FloatRegister dst2, Address src) {
      _bs_asm->copy_load_at(_masm, _decorators, _type, 32,
                            dst1, dst2, src,
                            _gct1, _gct2, _gcvt1);
    }

    void copy_store_at_32(Address dst, FloatRegister src1, FloatRegister src2) {
      _bs_asm->copy_store_at(_masm, _decorators, _type, 32,
                             dst, src1, src2,
                             _gct1, _gct2, _gct3, _gcvt1, _gcvt2, _gcvt3);
    }

    void copy_load_at_16(Register dst1, Register dst2, Address src) {
      _bs_asm->copy_load_at(_masm, _decorators, _type, 16,
                            dst1, dst2, src,
                            _gct1);
    }

    void copy_store_at_16(Address dst, Register src1, Register src2) {
      _bs_asm->copy_store_at(_masm, _decorators, _type, 16,
                             dst, src1, src2,
                             _gct1, _gct2, _gct3);
    }

    void copy_load_at_8(Register dst, Address src) {
      _bs_asm->copy_load_at(_masm, _decorators, _type, 8,
                            dst, noreg, src,
                            _gct1);
    }

    void copy_store_at_8(Address dst, Register src) {
      _bs_asm->copy_store_at(_masm, _decorators, _type, 8,
                             dst, src, noreg,
                             _gct1, _gct2, _gct3);
    }
  };

  // Bulk copy of blocks of 8 words.
  //
  // count is a count of words.
  //
  // Precondition: count >= 8
  //
  // Postconditions:
  //
  // The least significant bit of count contains the remaining count
  // of words to copy.  The rest of count is trash.
  //
  // s and d are adjusted to point to the remaining words to copy
  //
  void generate_copy_longs(DecoratorSet decorators, BasicType type, Label &start, Register s, Register d, Register count,
                           copy_direction direction) {
    int unit = wordSize * direction;
    int bias = (UseSIMDForMemoryOps ? 4 : 2) * wordSize;

    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
      t4 = r7, t5 = r11, t6 = r12, t7 = r13;
    const Register stride = r14;
    const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
    const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
    ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);

    assert_different_registers(rscratch1, rscratch2, t0, t1, t2, t3, t4, t5, t6, t7);
    assert_different_registers(s, d, count, rscratch1, rscratch2);

    Label again, drain;
    const char *stub_name;
    if (direction == copy_forwards)
      stub_name = "forward_copy_longs";
    else
      stub_name = "backward_copy_longs";

    __ align(CodeEntryAlignment);

    StubCodeMark mark(this, "StubRoutines", stub_name);

    __ bind(start);

    Label unaligned_copy_long;
    if (AvoidUnalignedAccesses) {
      __ tbnz(d, 3, unaligned_copy_long);
    }

    if (direction == copy_forwards) {
      __ sub(s, s, bias);
      __ sub(d, d, bias);
    }

#ifdef ASSERT
    // Make sure we are never given < 8 words
    {
      Label L;
      __ cmp(count, (u1)8);
      __ br(Assembler::GE, L);
      __ stop("generate_copy_longs called with < 8 words");
      __ bind(L);
    }
#endif

    // Fill 8 registers
    if (UseSIMDForMemoryOps) {
      bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
      bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
      bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
      bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
      bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 16);
    __ br(Assembler::LO, drain);

    int prefetch = PrefetchCopyIntervalInBytes;
    bool use_stride = false;
    if (direction == copy_backwards) {
      use_stride = prefetch > 256;
      prefetch = -prefetch;
      if (use_stride) __ mov(stride, prefetch);
    }
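
    // Added note: the loop below is software-pipelined -- each iteration
    // stores the eight words loaded by the previous one while loading the
    // next eight. That is why count was tested against 16 above (8 words
    // already in flight plus 8 more for a full iteration) but is only
    // decremented by 8 per trip, with the final 8 written in the drain.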

    __ bind(again);

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

    if (UseSIMDForMemoryOps) {
      bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
      bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
      bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
      bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
      bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
      bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
      bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
      bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
      bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
      bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
      bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 8);
    __ br(Assembler::HS, again);

    // Drain
    __ bind(drain);
    if (UseSIMDForMemoryOps) {
      bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
      bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
    } else {
      bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
      bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
      bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
      bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
    }

    {
      Label L1, L2;
      __ tbz(count, exact_log2(4), L1);
      if (UseSIMDForMemoryOps) {
        bs.copy_load_at_32(v0, v1, Address(__ pre(s, 4 * unit)));
        bs.copy_store_at_32(Address(__ pre(d, 4 * unit)), v0, v1);
      } else {
        bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
        bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
        bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
        bs.copy_store_at_16(Address(__ pre(d, 4 * unit)), t2, t3);
      }
      __ bind(L1);

      if (direction == copy_forwards) {
        __ add(s, s, bias);
        __ add(d, d, bias);
      }

      __ tbz(count, 1, L2);
      bs.copy_load_at_16(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
      bs.copy_store_at_16(Address(__ adjust(d, 2 * unit, direction == copy_backwards)), t0, t1);
      __ bind(L2);
    }

    __ ret(lr);

    if (AvoidUnalignedAccesses) {
      Label drain, again;
      // Register order for storing. Order is different for backward copy.

      __ bind(unaligned_copy_long);

      // source address is even aligned, target odd aligned
      //
      // when forward copying word pairs we read long pairs at offsets
      // {0, 2, 4, 6} (in long words). when backwards copying we read
      // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
      // address by -2 in the forwards case so we can compute the
      // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
      // or -1.
      //
      // when forward copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
      // zero offset we adjust the destination by -1 which means we
      // have to use offsets {1, 2, 4, 6, 8} * unit for the stores.
      //
      // when backwards copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
      // offsets {1, 3, 5, 7, 8} * unit.
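      //
      // Added illustration: in the forwards case d has bit 3 set (odd word
      // alignment), so after the one-word destination bias the leading
      // single str covers the odd word and every stp that follows lands on
      // a 16-byte boundary -- the whole point of this slow path when
      // AvoidUnalignedAccesses is true.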

      if (direction == copy_forwards) {
        __ sub(s, s, 16);
        __ sub(d, d, 8);
      }

      // Fill 8 registers
      //
      // for forwards copy s was offset by -16 from the original input
      // value of s so the register contents are at these offsets
      // relative to the 64 bit block addressed by that original input
      // and so on for each successive 64 byte block when s is updated
      //
      // t0 at offset 0,  t1 at offset 8
      // t2 at offset 16, t3 at offset 24
      // t4 at offset 32, t5 at offset 40
      // t6 at offset 48, t7 at offset 56

      // for backwards copy s was not offset so the register contents
      // are at these offsets into the preceding 64 byte block
      // relative to that original input and so on for each successive
      // preceding 64 byte block when s is updated. this explains the
      // slightly counter-intuitive looking pattern of register usage
      // in the stp instructions for backwards copy.
      //
      // t0 at offset -16, t1 at offset -8
      // t2 at offset -32, t3 at offset -24
      // t4 at offset -48, t5 at offset -40
      // t6 at offset -64, t7 at offset -56

      bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
      bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
      bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
      bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));

      __ subs(count, count, 16);
      __ br(Assembler::LO, drain);

      int prefetch = PrefetchCopyIntervalInBytes;
      bool use_stride = false;
      if (direction == copy_backwards) {
        use_stride = prefetch > 256;
        prefetch = -prefetch;
        if (use_stride) __ mov(stride, prefetch);
      }

      __ bind(again);

      if (PrefetchCopyIntervalInBytes > 0)
        __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

      if (direction == copy_forwards) {
        // allowing for the offset of -8 the store instructions place
        // registers into the target 64 bit block at the following
        // offsets
        //
        // t0 at offset 0
        // t1 at offset 8,  t2 at offset 16
        // t3 at offset 24, t4 at offset 32
        // t5 at offset 40, t6 at offset 48
        // t7 at offset 56

        bs.copy_store_at_8(Address(d, 1 * unit), t0);
        bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
        bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
        bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
        bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
        bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
        bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
        bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
        bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
      } else {
        // d was not offset when we started so the registers are
        // written into the 64 bit block preceding d with the following
        // offsets
        //
        // t1 at offset -8
        // t3 at offset -24, t0 at offset -16
        // t5 at offset -40, t2 at offset -32
        // t7 at offset -56, t4 at offset -48
        // t6 at offset -64
        //
        // note that this matches the offsets previously noted for the
        // loads

        bs.copy_store_at_8(Address(d, 1 * unit), t1);
        bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
        bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
        bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
        bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
        bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
        bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
        bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
        bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
      }

      __ subs(count, count, 8);
      __ br(Assembler::HS, again);

      // Drain
      //
      // this uses the same pattern of offsets and register arguments
      // as above
      __ bind(drain);
      if (direction == copy_forwards) {
        bs.copy_store_at_8(Address(d, 1 * unit), t0);
        bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
        bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
        bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
        bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
      } else {
        bs.copy_store_at_8(Address(d, 1 * unit), t1);
        bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
        bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
        bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
        bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
      }
      // now we need to copy any remaining part block which may
      // include a 4 word block subblock and/or a 2 word subblock.
      // bits 2 and 1 in the count are the tell-tale for whether we
      // have each such subblock
      {
        Label L1, L2;
        __ tbz(count, exact_log2(4), L1);
        // this is the same as above but copying only 4 longs hence
        // with only one intervening stp between the str instructions
        // but note that the offsets and registers still follow the
        // same pattern
        bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
        bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
        if (direction == copy_forwards) {
          bs.copy_store_at_8(Address(d, 1 * unit), t0);
          bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
          bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t3);
        } else {
          bs.copy_store_at_8(Address(d, 1 * unit), t1);
          bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
          bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t2);
        }
        __ bind(L1);

        __ tbz(count, 1, L2);
        // this is the same as above but copying only 2 longs hence
        // there is no intervening stp between the str instructions
        // but note that the offset and register patterns are still
        // the same
        bs.copy_load_at_16(t0, t1, Address(__ pre(s, 2 * unit)));
        if (direction == copy_forwards) {
          bs.copy_store_at_8(Address(d, 1 * unit), t0);
          bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t1);
        } else {
          bs.copy_store_at_8(Address(d, 1 * unit), t1);
          bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t0);
        }
        __ bind(L2);

        // for forwards copy we need to re-adjust the offsets we
        // applied so that s and d follow the last words written

        if (direction == copy_forwards) {
          __ add(s, s, 16);
          __ add(d, d, 8);
        }

      }

      __ ret(lr);
    }
  }

  // Small copy: less than 16 bytes.
  //
  // NB: Ignores all of the bits of count which represent more than 15
  // bytes, so a caller doesn't have to mask them.

  void copy_memory_small(DecoratorSet decorators, BasicType type, Register s, Register d, Register count, int step) {
    bool is_backwards = step < 0;
    size_t granularity = uabs(step);
    int direction = is_backwards ? -1 : 1;

    Label Lword, Lint, Lshort, Lbyte;

    assert(granularity
           && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");

    const Register t0 = r3;
    const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
    ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, fnoreg, fnoreg, fnoreg);

    // ??? I don't know if this bit-test-and-branch is the right thing
    // to do.  It does a lot of jumping, resulting in several
    // mispredicted branches.  It might make more sense to do this
    // with something like Duff's device with a single computed branch.

    __ tbz(count, 3 - exact_log2(granularity), Lword);
    bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
    bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
    __ bind(Lword);

    if (granularity <= sizeof (jint)) {
      __ tbz(count, 2 - exact_log2(granularity), Lint);
      __ ldrw(t0, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
      __ strw(t0, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
      __ bind(Lint);
    }

    if (granularity <= sizeof (jshort)) {
      __ tbz(count, 1 - exact_log2(granularity), Lshort);
      __ ldrh(t0, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
      __ strh(t0, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
      __ bind(Lshort);
    }

    if (granularity <= sizeof (jbyte)) {
      __ tbz(count, 0, Lbyte);
      __ ldrb(t0, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
      __ strb(t0, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
      __ bind(Lbyte);
    }
  }
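
  // Added worked example for the bit tests above: for a byte copy
  // (granularity 1) with count == 13 (0b1101), bit 3 moves one 8-byte word,
  // bit 2 one jint, bit 1 is clear so no jshort, and bit 0 one byte:
  // 8 + 4 + 0 + 1 == 13 bytes copied. For wider elements the same bits are
  // tested at positions shifted down by exact_log2(granularity).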

  Label copy_f, copy_b;
  Label copy_obj_f, copy_obj_b;
  Label copy_obj_uninit_f, copy_obj_uninit_b;

  // All-singing all-dancing memory copy.
  //
  // Copy count units of memory from s to d.  The size of a unit is
  // step, which can be positive or negative depending on the direction
  // of copy.  If is_aligned is false, we align the source address.
  //

  void copy_memory(DecoratorSet decorators, BasicType type, bool is_aligned,
                   Register s, Register d, Register count, int step) {
    copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
    bool is_backwards = step < 0;
    unsigned int granularity = uabs(step);
    const Register t0 = r3, t1 = r4;

    // <= 80 (or 96 for SIMD) bytes do inline. Direction doesn't matter because we always
    // load all the data before writing anything
    Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
    const Register t2 = r5, t3 = r6, t4 = r7, t5 = r11;
    const Register t6 = r12, t7 = r13, t8 = r14, t9 = r15;
    const Register send = r17, dend = r16;
    const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
    const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
    ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(Address(s, 0), PLDL1KEEP);
    __ cmp(count, u1((UseSIMDForMemoryOps ? 96 : 80) / granularity));
    __ br(Assembler::HI, copy_big);

    __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
    __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));

    __ cmp(count, u1(16/granularity));
    __ br(Assembler::LS, copy16);

    __ cmp(count, u1(64/granularity));
    __ br(Assembler::HI, copy80);

    __ cmp(count, u1(32/granularity));
    __ br(Assembler::LS, copy32);

    // 33..64 bytes
    if (UseSIMDForMemoryOps) {
      bs.copy_load_at_32(v0, v1, Address(s, 0));
      bs.copy_load_at_32(v2, v3, Address(send, -32));
      bs.copy_store_at_32(Address(d, 0), v0, v1);
      bs.copy_store_at_32(Address(dend, -32), v2, v3);
    } else {
      bs.copy_load_at_16(t0, t1, Address(s, 0));
      bs.copy_load_at_16(t2, t3, Address(s, 16));
      bs.copy_load_at_16(t4, t5, Address(send, -32));
      bs.copy_load_at_16(t6, t7, Address(send, -16));

      bs.copy_store_at_16(Address(d, 0), t0, t1);
      bs.copy_store_at_16(Address(d, 16), t2, t3);
      bs.copy_store_at_16(Address(dend, -32), t4, t5);
      bs.copy_store_at_16(Address(dend, -16), t6, t7);
    }
    __ b(finish);

    // 17..32 bytes
    __ bind(copy32);
    bs.copy_load_at_16(t0, t1, Address(s, 0));
    bs.copy_load_at_16(t6, t7, Address(send, -16));

    bs.copy_store_at_16(Address(d, 0), t0, t1);
    bs.copy_store_at_16(Address(dend, -16), t6, t7);
    __ b(finish);

    // 65..80/96 bytes
    // (96 bytes if SIMD because we do 32 bytes per instruction)
    __ bind(copy80);
    if (UseSIMDForMemoryOps) {
      bs.copy_load_at_32(v0, v1, Address(s, 0));
      bs.copy_load_at_32(v2, v3, Address(s, 32));
      // Unaligned pointers can be an issue for copying.
      // The issue has more chances to happen when granularity of data is
      // less than 4 (sizeof(jint)). Pointers for arrays of jint are at least
      // 4 byte aligned. Pointers for arrays of jlong are 8 byte aligned.
      // The most performance drop has been seen for the range 65-80 bytes.
      // For such cases using the pair of ldp/stp instead of the third pair of
      // ldpq/stpq fixes the performance issue.
      if (granularity < sizeof (jint)) {
        Label copy96;
        __ cmp(count, u1(80/granularity));
        __ br(Assembler::HI, copy96);
        bs.copy_load_at_16(t0, t1, Address(send, -16));

        bs.copy_store_at_32(Address(d, 0), v0, v1);
        bs.copy_store_at_32(Address(d, 32), v2, v3);

        bs.copy_store_at_16(Address(dend, -16), t0, t1);
        __ b(finish);

        __ bind(copy96);
      }
      bs.copy_load_at_32(v4, v5, Address(send, -32));

      bs.copy_store_at_32(Address(d, 0), v0, v1);
      bs.copy_store_at_32(Address(d, 32), v2, v3);

      bs.copy_store_at_32(Address(dend, -32), v4, v5);
    } else {
      bs.copy_load_at_16(t0, t1, Address(s, 0));
      bs.copy_load_at_16(t2, t3, Address(s, 16));
      bs.copy_load_at_16(t4, t5, Address(s, 32));
      bs.copy_load_at_16(t6, t7, Address(s, 48));
      bs.copy_load_at_16(t8, t9, Address(send, -16));

      bs.copy_store_at_16(Address(d, 0), t0, t1);
      bs.copy_store_at_16(Address(d, 16), t2, t3);
      bs.copy_store_at_16(Address(d, 32), t4, t5);
      bs.copy_store_at_16(Address(d, 48), t6, t7);
      bs.copy_store_at_16(Address(dend, -16), t8, t9);
    }
    __ b(finish);

    // 0..16 bytes
    __ bind(copy16);
    __ cmp(count, u1(8/granularity));
    __ br(Assembler::LO, copy8);

    // 8..16 bytes
    bs.copy_load_at_8(t0, Address(s, 0));
    bs.copy_load_at_8(t1, Address(send, -8));
    bs.copy_store_at_8(Address(d, 0), t0);
    bs.copy_store_at_8(Address(dend, -8), t1);
    __ b(finish);

    if (granularity < 8) {
      // 4..7 bytes
      __ bind(copy8);
      __ tbz(count, 2 - exact_log2(granularity), copy4);
      __ ldrw(t0, Address(s, 0));
      __ ldrw(t1, Address(send, -4));
      __ strw(t0, Address(d, 0));
      __ strw(t1, Address(dend, -4));
      __ b(finish);
      if (granularity < 4) {
        // 0..3 bytes
        __ bind(copy4);
        __ cbz(count, finish); // get rid of 0 case
        if (granularity == 2) {
          __ ldrh(t0, Address(s, 0));
          __ strh(t0, Address(d, 0));
        } else { // granularity == 1
          // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
          // the first and last byte.
          // Handle the 3 byte case by loading and storing base + count/2
          // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
          // This does mean in the 1 byte case we load/store the same
          // byte 3 times.
          __ lsr(count, count, 1);
          __ ldrb(t0, Address(s, 0));
          __ ldrb(t1, Address(send, -1));
          __ ldrb(t2, Address(s, count));
          __ strb(t0, Address(d, 0));
          __ strb(t1, Address(dend, -1));
          __ strb(t2, Address(d, count));
        }
        __ b(finish);
      }
    }

    __ bind(copy_big);
    if (is_backwards) {
      __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
      __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
    }

    // Now we've got the small case out of the way we can align the
    // source address on a 2-word boundary.

    // Here we will materialize a count in r15, which is used by copy_memory_small
    // and the various generate_copy_longs stubs that we use for 2 word aligned bytes.
    // Up until here, we have used t9, which aliases r15, but from here on, that register
    // can not be used as a temp register, as it contains the count.

    Label aligned;

    if (is_aligned) {
      // We may have to adjust by 1 word to get s 2-word-aligned.
      __ tbz(s, exact_log2(wordSize), aligned);
      bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
      bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
      __ sub(count, count, wordSize/granularity);
    } else {
      if (is_backwards) {
        __ andr(r15, s, 2 * wordSize - 1);
      } else {
        __ neg(r15, s);
        __ andr(r15, r15, 2 * wordSize - 1);
      }
      // r15 is the byte adjustment needed to align s.
      __ cbz(r15, aligned);
      int shift = exact_log2(granularity);
      if (shift) __ lsr(r15, r15, shift);
      __ sub(count, count, r15);

#if 0
      // ?? This code is only correct for a disjoint copy.  It may or
      // may not make sense to use it in that case.

      // Copy the first pair; s and d may not be aligned.
      __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
      __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));

      // Align s and d, adjust count
      if (is_backwards) {
        __ sub(s, s, r15);
        __ sub(d, d, r15);
      } else {
        __ add(s, s, r15);
        __ add(d, d, r15);
      }
#else
      copy_memory_small(decorators, type, s, d, r15, step);
#endif
    }

    __ bind(aligned);

    // s is now 2-word-aligned.

    // We have a count of units and some trailing bytes.  Adjust the
    // count and do a bulk copy of words.
    __ lsr(r15, count, exact_log2(wordSize/granularity));
    if (direction == copy_forwards) {
      if (type != T_OBJECT) {
        __ bl(copy_f);
      } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
        __ bl(copy_obj_uninit_f);
      } else {
        __ bl(copy_obj_f);
      }
    } else {
      if (type != T_OBJECT) {
        __ bl(copy_b);
      } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
        __ bl(copy_obj_uninit_b);
      } else {
        __ bl(copy_obj_b);
      }
    }

    // And the tail.
    copy_memory_small(decorators, type, s, d, count, step);

    if (granularity >= 8) __ bind(copy8);
    if (granularity >= 4) __ bind(copy4);
    __ bind(finish);
  }

  void clobber_registers() {
#ifdef ASSERT
    RegSet clobbered
      = MacroAssembler::call_clobbered_gp_registers() - rscratch1;
    __ mov(rscratch1, (uint64_t)0xdeadbeef);
    __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
    for (RegSetIterator<Register> it = clobbered.begin(); *it != noreg; ++it) {
      __ mov(*it, rscratch1);
    }
#endif
  }

  // Scan over array at a for count oops, verifying each one.
  // Preserves a and count, clobbers rscratch1 and rscratch2.
  void verify_oop_array (int size, Register a, Register count, Register temp) {
    Label loop, end;
    __ mov(rscratch1, a);
    __ mov(rscratch2, zr);
    __ bind(loop);
    __ cmp(rscratch2, count);
    __ br(Assembler::HS, end);
    if (size == wordSize) {
      __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
      __ verify_oop(temp);
    } else {
      __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
      __ decode_heap_oop(temp); // calls verify_oop
    }
    __ add(rscratch2, rscratch2, 1);
    __ b(loop);
    __ bind(end);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  // Side Effects:
  //   disjoint_int_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_int_oop_copy().
  //
  address generate_disjoint_copy(int size, bool aligned, bool is_oop, address *entry,
                                 const char *name, bool dest_uninitialized = false) {
    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
    RegSet saved_reg = RegSet::of(s, d, count);
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    __ enter();

    if (entry != nullptr) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
    if (dest_uninitialized) {
      decorators |= IS_DEST_UNINITIALIZED;
    }
    if (aligned) {
      decorators |= ARRAYCOPY_ALIGNED;
    }

    BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
    bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);

    if (is_oop) {
      // save regs before copy_memory
      __ push(RegSet::of(d, count), sp);
    }
    {
      // UnsafeCopyMemory page error: continue after ucm
      bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
      UnsafeCopyMemoryMark ucmm(this, add_entry, true);
      copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, size);
    }

    if (is_oop) {
      __ pop(RegSet::of(d, count), sp);
      if (VerifyOops)
        verify_oop_array(size, d, count, r16);
    }

    bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());

    __ leave();
    __ mov(r0, zr); // return 0
    __ ret(lr);
    return start;
  }
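
  // Added illustration of how the helper above is typically wired up when
  // the arraycopy stubs are generated (identifier names here follow the
  // usual generate_arraycopy_stubs() pattern and are assumptions):
  //
  //   address entry_jint_arraycopy = nullptr;
  //   StubRoutines::_jint_disjoint_arraycopy =
  //       generate_disjoint_int_copy(false, &entry_jint_arraycopy,
  //                                  "jint_disjoint_arraycopy");
  //   // entry_jint_arraycopy is then passed to the conjoint generator as
  //   // its nooverlap_target.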

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  address generate_conjoint_copy(int size, bool aligned, bool is_oop, address nooverlap_target,
                                 address *entry, const char *name,
                                 bool dest_uninitialized = false) {
    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
    RegSet saved_regs = RegSet::of(s, d, count);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    __ enter();

    if (entry != nullptr) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    // use fwd copy when (d-s) above_equal (count*size); the compare is
    // unsigned, so when d is below s the subtraction wraps to a large
    // value and HS also routes that (non-overlapping) case forward
    __ sub(rscratch1, d, s);
    __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
    __ br(Assembler::HS, nooverlap_target);

    DecoratorSet decorators = IN_HEAP | IS_ARRAY;
    if (dest_uninitialized) {
      decorators |= IS_DEST_UNINITIALIZED;
    }
    if (aligned) {
      decorators |= ARRAYCOPY_ALIGNED;
    }

    BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
    bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);

    if (is_oop) {
      // save regs before copy_memory
      __ push(RegSet::of(d, count), sp);
    }
    {
      // UnsafeCopyMemory page error: continue after ucm
      bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
      UnsafeCopyMemoryMark ucmm(this, add_entry, true);
      copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, -size);
    }
    if (is_oop) {
      __ pop(RegSet::of(d, count), sp);
      if (VerifyOops)
        verify_oop_array(size, d, count, r16);
    }
    bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
    __ leave();
    __ mov(r0, zr); // return 0
    __ ret(lr);
    return start;
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
  // we let the hardware handle it.  The one to eight bytes within words,
  // dwords or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  // Side Effects:
  //   disjoint_byte_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_byte_copy().
1651 // 1652 address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) { 1653 const bool not_oop = false; 1654 return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name); 1655 } 1656 1657 // Arguments: 1658 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1659 // ignored 1660 // name - stub name string 1661 // 1662 // Inputs: 1663 // c_rarg0 - source array address 1664 // c_rarg1 - destination array address 1665 // c_rarg2 - element count, treated as ssize_t, can be zero 1666 // 1667 // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries, 1668 // we let the hardware handle it. The one to eight bytes within words, 1669 // dwords or qwords that span cache line boundaries will still be loaded 1670 // and stored atomically. 1671 // 1672 address generate_conjoint_byte_copy(bool aligned, address nooverlap_target, 1673 address* entry, const char *name) { 1674 const bool not_oop = false; 1675 return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name); 1676 } 1677 1678 // Arguments: 1679 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1680 // ignored 1681 // name - stub name string 1682 // 1683 // Inputs: 1684 // c_rarg0 - source array address 1685 // c_rarg1 - destination array address 1686 // c_rarg2 - element count, treated as ssize_t, can be zero 1687 // 1688 // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we 1689 // let the hardware handle it. The two or four words within dwords 1690 // or qwords that span cache line boundaries will still be loaded 1691 // and stored atomically. 1692 // 1693 // Side Effects: 1694 // disjoint_short_copy_entry is set to the no-overlap entry point 1695 // used by generate_conjoint_short_copy(). 1696 // 1697 address generate_disjoint_short_copy(bool aligned, 1698 address* entry, const char *name) { 1699 const bool not_oop = false; 1700 return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name); 1701 } 1702 1703 // Arguments: 1704 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1705 // ignored 1706 // name - stub name string 1707 // 1708 // Inputs: 1709 // c_rarg0 - source array address 1710 // c_rarg1 - destination array address 1711 // c_rarg2 - element count, treated as ssize_t, can be zero 1712 // 1713 // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we 1714 // let the hardware handle it. The two or four words within dwords 1715 // or qwords that span cache line boundaries will still be loaded 1716 // and stored atomically. 1717 // 1718 address generate_conjoint_short_copy(bool aligned, address nooverlap_target, 1719 address *entry, const char *name) { 1720 const bool not_oop = false; 1721 return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name); 1722 1723 } 1724 // Arguments: 1725 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1726 // ignored 1727 // name - stub name string 1728 // 1729 // Inputs: 1730 // c_rarg0 - source array address 1731 // c_rarg1 - destination array address 1732 // c_rarg2 - element count, treated as ssize_t, can be zero 1733 // 1734 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1735 // the hardware handle it. The two dwords within qwords that span 1736 // cache line boundaries will still be loaded and stored atomically. 
1737 // 1738 // Side Effects: 1739 // disjoint_int_copy_entry is set to the no-overlap entry point 1740 // used by generate_conjoint_int_oop_copy(). 1741 // 1742 address generate_disjoint_int_copy(bool aligned, address *entry, 1743 const char *name, bool dest_uninitialized = false) { 1744 const bool not_oop = false; 1745 return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name); 1746 } 1747 1748 // Arguments: 1749 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1750 // ignored 1751 // name - stub name string 1752 // 1753 // Inputs: 1754 // c_rarg0 - source array address 1755 // c_rarg1 - destination array address 1756 // c_rarg2 - element count, treated as ssize_t, can be zero 1757 // 1758 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1759 // the hardware handle it. The two dwords within qwords that span 1760 // cache line boundaries will still be loaded and stored atomically. 1761 // 1762 address generate_conjoint_int_copy(bool aligned, address nooverlap_target, 1763 address *entry, const char *name, 1764 bool dest_uninitialized = false) { 1765 const bool not_oop = false; 1766 return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name); 1767 } 1768 1769 1770 // Arguments: 1771 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1772 // ignored 1773 // name - stub name string 1774 // 1775 // Inputs: 1776 // c_rarg0 - source array address 1777 // c_rarg1 - destination array address 1778 // c_rarg2 - element count, treated as size_t, can be zero 1779 // 1780 // Side Effects: 1781 // disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the 1782 // no-overlap entry point used by generate_conjoint_long_oop_copy(). 1783 // 1784 address generate_disjoint_long_copy(bool aligned, address *entry, 1785 const char *name, bool dest_uninitialized = false) { 1786 const bool not_oop = false; 1787 return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name); 1788 } 1789 1790 // Arguments: 1791 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1792 // ignored 1793 // name - stub name string 1794 // 1795 // Inputs: 1796 // c_rarg0 - source array address 1797 // c_rarg1 - destination array address 1798 // c_rarg2 - element count, treated as size_t, can be zero 1799 // 1800 address generate_conjoint_long_copy(bool aligned, 1801 address nooverlap_target, address *entry, 1802 const char *name, bool dest_uninitialized = false) { 1803 const bool not_oop = false; 1804 return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name); 1805 } 1806 1807 // Arguments: 1808 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1809 // ignored 1810 // name - stub name string 1811 // 1812 // Inputs: 1813 // c_rarg0 - source array address 1814 // c_rarg1 - destination array address 1815 // c_rarg2 - element count, treated as size_t, can be zero 1816 // 1817 // Side Effects: 1818 // disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the 1819 // no-overlap entry point used by generate_conjoint_long_oop_copy(). 1820 // 1821 address generate_disjoint_oop_copy(bool aligned, address *entry, 1822 const char *name, bool dest_uninitialized) { 1823 const bool is_oop = true; 1824 const int size = UseCompressedOops ? 
sizeof (jint) : sizeof (jlong); 1825 return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized); 1826 } 1827 1828 // Arguments: 1829 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1830 // ignored 1831 // name - stub name string 1832 // 1833 // Inputs: 1834 // c_rarg0 - source array address 1835 // c_rarg1 - destination array address 1836 // c_rarg2 - element count, treated as size_t, can be zero 1837 // 1838 address generate_conjoint_oop_copy(bool aligned, 1839 address nooverlap_target, address *entry, 1840 const char *name, bool dest_uninitialized) { 1841 const bool is_oop = true; 1842 const int size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1843 return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry, 1844 name, dest_uninitialized); 1845 } 1846 1847 1848 // Helper for generating a dynamic type check. 1849 // Smashes rscratch1, rscratch2. 1850 void generate_type_check(Register sub_klass, 1851 Register super_check_offset, 1852 Register super_klass, 1853 Label& L_success) { 1854 assert_different_registers(sub_klass, super_check_offset, super_klass); 1855 1856 BLOCK_COMMENT("type_check:"); 1857 1858 Label L_miss; 1859 1860 __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, nullptr, 1861 super_check_offset); 1862 __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, nullptr); 1863 1864 // Fall through on failure! 1865 __ BIND(L_miss); 1866 } 1867 1868 // 1869 // Generate checkcasting array copy stub 1870 // 1871 // Input: 1872 // c_rarg0 - source array address 1873 // c_rarg1 - destination array address 1874 // c_rarg2 - element count, treated as ssize_t, can be zero 1875 // c_rarg3 - size_t ckoff (super_check_offset) 1876 // c_rarg4 - oop ckval (super_klass) 1877 // 1878 // Output: 1879 // r0 == 0 - success 1880 // r0 == -1^K - failure, where K is partial transfer count 1881 // 1882 address generate_checkcast_copy(const char *name, address *entry, 1883 bool dest_uninitialized = false) { 1884 1885 Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop; 1886 1887 // Input registers (after setup_arg_regs) 1888 const Register from = c_rarg0; // source array address 1889 const Register to = c_rarg1; // destination array address 1890 const Register count = c_rarg2; // elements count 1891 const Register ckoff = c_rarg3; // super_check_offset 1892 const Register ckval = c_rarg4; // super_klass 1893 1894 RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4); 1895 RegSet wb_post_saved_regs = RegSet::of(count); 1896 1897 // Registers used as temps (r19, r20, r21, r22 are save-on-entry) 1898 const Register copied_oop = r22; // actual oop copied 1899 const Register count_save = r21; // orig elements count 1900 const Register start_to = r20; // destination array start address 1901 const Register r19_klass = r19; // oop._klass 1902 1903 // Registers used as gc temps (r5, r6, r7 are save-on-call) 1904 const Register gct1 = r5, gct2 = r6, gct3 = r7; 1905 1906 //--------------------------------------------------------------- 1907 // Assembler stub will be used for this call to arraycopy 1908 // if the two arrays are subtypes of Object[] but the 1909 // destination array type is not equal to or a supertype 1910 // of the source type. Each element must be separately 1911 // checked.
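// (Illustrative note: this is the stub behind System.arraycopy when the
// destination's element type is a proper subtype of the source's element
// type, e.g. copying from an Object[] into a String[]; every element must
// pass a dynamic subtype check before it may be stored.)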
1912 1913 assert_different_registers(from, to, count, ckoff, ckval, start_to, 1914 copied_oop, r19_klass, count_save); 1915 1916 __ align(CodeEntryAlignment); 1917 StubCodeMark mark(this, "StubRoutines", name); 1918 address start = __ pc(); 1919 1920 __ enter(); // required for proper stackwalking of RuntimeStub frame 1921 1922 #ifdef ASSERT 1923 // caller guarantees that the arrays really are different 1924 // otherwise, we would have to make conjoint checks 1925 { Label L; 1926 __ b(L); // conjoint check not yet implemented 1927 __ stop("checkcast_copy within a single array"); 1928 __ bind(L); 1929 } 1930 #endif //ASSERT 1931 1932 // Caller of this entry point must set up the argument registers. 1933 if (entry != nullptr) { 1934 *entry = __ pc(); 1935 BLOCK_COMMENT("Entry:"); 1936 } 1937 1938 // Empty array: Nothing to do. 1939 __ cbz(count, L_done); 1940 __ push(RegSet::of(r19, r20, r21, r22), sp); 1941 1942 #ifdef ASSERT 1943 BLOCK_COMMENT("assert consistent ckoff/ckval"); 1944 // The ckoff and ckval must be mutually consistent, 1945 // even though caller generates both. 1946 { Label L; 1947 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 1948 __ ldrw(start_to, Address(ckval, sco_offset)); 1949 __ cmpw(ckoff, start_to); 1950 __ br(Assembler::EQ, L); 1951 __ stop("super_check_offset inconsistent"); 1952 __ bind(L); 1953 } 1954 #endif //ASSERT 1955 1956 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT; 1957 bool is_oop = true; 1958 int element_size = UseCompressedOops ? 4 : 8; 1959 if (dest_uninitialized) { 1960 decorators |= IS_DEST_UNINITIALIZED; 1961 } 1962 1963 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1964 bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs); 1965 1966 // save the original count 1967 __ mov(count_save, count); 1968 1969 // Copy from low to high addresses 1970 __ mov(start_to, to); // Save destination array start address 1971 __ b(L_load_element); 1972 1973 // ======== begin loop ======== 1974 // (Loop is rotated; its entry is L_load_element.) 1975 // Loop control: 1976 // for (; count != 0; count--) { 1977 // copied_oop = load_heap_oop(from++); 1978 // ... generate_type_check ...; 1979 // store_heap_oop(to++, copied_oop); 1980 // } 1981 __ align(OptoLoopAlignment); 1982 1983 __ BIND(L_store_element); 1984 bs->copy_store_at(_masm, decorators, T_OBJECT, element_size, 1985 __ post(to, element_size), copied_oop, noreg, 1986 gct1, gct2, gct3); 1987 __ sub(count, count, 1); 1988 __ cbz(count, L_do_card_marks); 1989 1990 // ======== loop entry is here ======== 1991 __ BIND(L_load_element); 1992 bs->copy_load_at(_masm, decorators, T_OBJECT, element_size, 1993 copied_oop, noreg, __ post(from, element_size), 1994 gct1); 1995 __ cbz(copied_oop, L_store_element); 1996 1997 __ load_klass(r19_klass, copied_oop);// query the object klass 1998 generate_type_check(r19_klass, ckoff, ckval, L_store_element); 1999 // ======== end loop ======== 2000 2001 // It was a real error; we must depend on the caller to finish the job. 2002 // Register count = remaining oops, count_orig = total oops. 2003 // Emit GC store barriers for the oops we have copied and report 2004 // their number to the caller. 
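// A minimal C sketch of the encoding computed below (illustrative only;
// at this point 'count' holds the number of oops not yet copied):
//
//   size_t K = count_save - count;  // oops copied before the failure
//   r0 = ~K;                        // == -1 ^ K; the caller recovers K as ~r0
//   if (K == 0) goto L_done_pop;    // nothing was copied: skip card marks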
2005 2006 __ subs(count, count_save, count); // K = partially copied oop count 2007 __ eon(count, count, zr); // report (-1^K) to caller 2008 __ br(Assembler::EQ, L_done_pop); 2009 2010 __ BIND(L_do_card_marks); 2011 bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs); 2012 2013 __ bind(L_done_pop); 2014 __ pop(RegSet::of(r19, r20, r21, r22), sp); 2015 inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr); 2016 2017 __ bind(L_done); 2018 __ mov(r0, count); 2019 __ leave(); 2020 __ ret(lr); 2021 2022 return start; 2023 } 2024 2025 // Perform range checks on the proposed arraycopy. 2026 // Kills temp, but nothing else. 2027 // Also, clean the sign bits of src_pos and dst_pos. 2028 void arraycopy_range_checks(Register src, // source array oop (c_rarg0) 2029 Register src_pos, // source position (c_rarg1) 2030 Register dst, // destination array oop (c_rarg2) 2031 Register dst_pos, // destination position (c_rarg3) 2032 Register length, 2033 Register temp, 2034 Label& L_failed) { 2035 BLOCK_COMMENT("arraycopy_range_checks:"); 2036 2037 assert_different_registers(rscratch1, temp); 2038 2039 // if (src_pos + length > arrayOop(src)->length()) FAIL; 2040 __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes())); 2041 __ addw(temp, length, src_pos); 2042 __ cmpw(temp, rscratch1); 2043 __ br(Assembler::HI, L_failed); 2044 2045 // if (dst_pos + length > arrayOop(dst)->length()) FAIL; 2046 __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes())); 2047 __ addw(temp, length, dst_pos); 2048 __ cmpw(temp, rscratch1); 2049 __ br(Assembler::HI, L_failed); 2050 2051 // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'. 2052 __ movw(src_pos, src_pos); 2053 __ movw(dst_pos, dst_pos); 2054 2055 BLOCK_COMMENT("arraycopy_range_checks done"); 2056 } 2057 2058 // These stubs get called from some dumb test routine. 2059 // I'll write them properly when they're called from 2060 // something that's actually doing something. 2061 static void fake_arraycopy_stub(address src, address dst, int count) { 2062 assert(count == 0, "huh?"); 2063 } 2064 2065 2066 // 2067 // Generate 'unsafe' array copy stub 2068 // Though just as safe as the other stubs, it takes an unscaled 2069 // size_t argument instead of an element count. 2070 // 2071 // Input: 2072 // c_rarg0 - source array address 2073 // c_rarg1 - destination array address 2074 // c_rarg2 - byte count, treated as ssize_t, can be zero 2075 // 2076 // Examines the alignment of the operands and dispatches 2077 // to a long, int, short, or byte copy loop.
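// A minimal C sketch of the dispatch below (illustrative only):
//
//   uintptr_t bits = (uintptr_t)s | (uintptr_t)d | (uintptr_t)count;
//   if ((bits & (BytesPerLong - 1)) == 0)     long_copy (s, d, count >> 3);
//   else if ((bits & (BytesPerInt - 1)) == 0) int_copy  (s, d, count >> 2);
//   else if ((bits & 1) == 0)                 short_copy(s, d, count >> 1);
//   else                                      byte_copy (s, d, count);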
2078 // 2079 address generate_unsafe_copy(const char *name, 2080 address byte_copy_entry, 2081 address short_copy_entry, 2082 address int_copy_entry, 2083 address long_copy_entry) { 2084 Label L_long_aligned, L_int_aligned, L_short_aligned; 2085 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 2086 2087 __ align(CodeEntryAlignment); 2088 StubCodeMark mark(this, "StubRoutines", name); 2089 address start = __ pc(); 2090 __ enter(); // required for proper stackwalking of RuntimeStub frame 2091 2092 // bump this on entry, not on exit: 2093 inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr); 2094 2095 __ orr(rscratch1, s, d); 2096 __ orr(rscratch1, rscratch1, count); 2097 2098 __ andr(rscratch1, rscratch1, BytesPerLong-1); 2099 __ cbz(rscratch1, L_long_aligned); 2100 __ andr(rscratch1, rscratch1, BytesPerInt-1); 2101 __ cbz(rscratch1, L_int_aligned); 2102 __ tbz(rscratch1, 0, L_short_aligned); 2103 __ b(RuntimeAddress(byte_copy_entry)); 2104 2105 __ BIND(L_short_aligned); 2106 __ lsr(count, count, LogBytesPerShort); // size => short_count 2107 __ b(RuntimeAddress(short_copy_entry)); 2108 __ BIND(L_int_aligned); 2109 __ lsr(count, count, LogBytesPerInt); // size => int_count 2110 __ b(RuntimeAddress(int_copy_entry)); 2111 __ BIND(L_long_aligned); 2112 __ lsr(count, count, LogBytesPerLong); // size => long_count 2113 __ b(RuntimeAddress(long_copy_entry)); 2114 2115 return start; 2116 } 2117 2118 // 2119 // Generate generic array copy stubs 2120 // 2121 // Input: 2122 // c_rarg0 - src oop 2123 // c_rarg1 - src_pos (32-bits) 2124 // c_rarg2 - dst oop 2125 // c_rarg3 - dst_pos (32-bits) 2126 // c_rarg4 - element count (32-bits) 2127 // 2128 // Output: 2129 // r0 == 0 - success 2130 // r0 == -1^K - failure, where K is partial transfer count 2131 // 2132 address generate_generic_copy(const char *name, 2133 address byte_copy_entry, address short_copy_entry, 2134 address int_copy_entry, address oop_copy_entry, 2135 address long_copy_entry, address checkcast_copy_entry) { 2136 2137 Label L_failed, L_objArray; 2138 Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs; 2139 2140 // Input registers 2141 const Register src = c_rarg0; // source array oop 2142 const Register src_pos = c_rarg1; // source position 2143 const Register dst = c_rarg2; // destination array oop 2144 const Register dst_pos = c_rarg3; // destination position 2145 const Register length = c_rarg4; 2146 2147 2148 // Registers used as temps 2149 const Register dst_klass = c_rarg5; 2150 2151 __ align(CodeEntryAlignment); 2152 2153 StubCodeMark mark(this, "StubRoutines", name); 2154 2155 address start = __ pc(); 2156 2157 __ enter(); // required for proper stackwalking of RuntimeStub frame 2158 2159 // bump this on entry, not on exit: 2160 inc_counter_np(SharedRuntime::_generic_array_copy_ctr); 2161 2162 //----------------------------------------------------------------------- 2163 // Assembler stub will be used for this call to arraycopy 2164 // if the following conditions are met: 2165 // 2166 // (1) src and dst must not be null. 2167 // (2) src_pos must not be negative. 2168 // (3) dst_pos must not be negative. 2169 // (4) length must not be negative. 2170 // (5) src klass and dst klass should be the same and not null. 2171 // (6) src and dst should be arrays. 2172 // (7) src_pos + length must not exceed length of src. 2173 // (8) dst_pos + length must not exceed length of dst. 
2174 // 2175 2176 // if (src == nullptr) return -1; 2177 __ cbz(src, L_failed); 2178 2179 // if (src_pos < 0) return -1; 2180 __ tbnz(src_pos, 31, L_failed); // i.e. sign bit set 2181 2182 // if (dst == nullptr) return -1; 2183 __ cbz(dst, L_failed); 2184 2185 // if (dst_pos < 0) return -1; 2186 __ tbnz(dst_pos, 31, L_failed); // i.e. sign bit set 2187 2188 // registers used as temp 2189 const Register scratch_length = r16; // elements count to copy 2190 const Register scratch_src_klass = r17; // array klass 2191 const Register lh = r15; // layout helper 2192 2193 // if (length < 0) return -1; 2194 __ movw(scratch_length, length); // length (elements count, 32-bits value) 2195 __ tbnz(scratch_length, 31, L_failed); // i.e. sign bit set 2196 2197 __ load_klass(scratch_src_klass, src); 2198 #ifdef ASSERT 2199 // assert(src->klass() != nullptr); 2200 { 2201 BLOCK_COMMENT("assert klasses not null {"); 2202 Label L1, L2; 2203 __ cbnz(scratch_src_klass, L2); // it is broken if klass is null 2204 __ bind(L1); 2205 __ stop("broken null klass"); 2206 __ bind(L2); 2207 __ load_klass(rscratch1, dst); 2208 __ cbz(rscratch1, L1); // this would be broken also 2209 BLOCK_COMMENT("} assert klasses not null done"); 2210 } 2211 #endif 2212 2213 // Load layout helper (32-bits) 2214 // 2215 // |array_tag| | header_size | element_type | |log2_element_size| 2216 // 32 30 24 16 8 2 0 2217 // 2218 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0 2219 // 2220 2221 const int lh_offset = in_bytes(Klass::layout_helper_offset()); 2222 2223 // Handle objArrays completely differently... 2224 const jint objArray_lh = Klass::array_layout_helper(T_OBJECT); 2225 __ ldrw(lh, Address(scratch_src_klass, lh_offset)); 2226 __ movw(rscratch1, objArray_lh); 2227 __ eorw(rscratch2, lh, rscratch1); 2228 __ cbzw(rscratch2, L_objArray); 2229 2230 // if (src->klass() != dst->klass()) return -1; 2231 __ load_klass(rscratch2, dst); 2232 __ eor(rscratch2, rscratch2, scratch_src_klass); 2233 __ cbnz(rscratch2, L_failed); 2234 2235 // Check for flat inline type array -> return -1 2236 __ test_flat_array_oop(src, rscratch2, L_failed); 2237 2238 // Check for null-free (non-flat) inline type array -> handle as object array 2239 __ test_null_free_array_oop(src, rscratch2, L_objArray); 2240 2241 // if (!src->is_Array()) return -1; 2242 __ tbz(lh, 31, L_failed); // i.e. (lh >= 0) 2243 2244 // At this point, it is known to be a typeArray (array_tag 0x3). 
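// A minimal C sketch of the layout-helper decoding performed below
// (illustrative only; uses the Klass::_lh_* shifts and masks that match
// the diagram above):
//
//   int hdr_bytes = (lh >> Klass::_lh_header_size_shift) & Klass::_lh_header_size_mask;
//   int log2es    = lh & Klass::_lh_log2_element_size_mask;
//   src_addr = (address)src + hdr_bytes + ((size_t)src_pos << log2es);
//   dst_addr = (address)dst + hdr_bytes + ((size_t)dst_pos << log2es);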
2245 #ifdef ASSERT 2246 { 2247 BLOCK_COMMENT("assert primitive array {"); 2248 Label L; 2249 __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift); 2250 __ cmpw(lh, rscratch2); 2251 __ br(Assembler::GE, L); 2252 __ stop("must be a primitive array"); 2253 __ bind(L); 2254 BLOCK_COMMENT("} assert primitive array done"); 2255 } 2256 #endif 2257 2258 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2259 rscratch2, L_failed); 2260 2261 // TypeArrayKlass 2262 // 2263 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize); 2264 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize); 2265 // 2266 2267 const Register rscratch1_offset = rscratch1; // array offset 2268 const Register r15_elsize = lh; // element size 2269 2270 __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift, 2271 exact_log2(Klass::_lh_header_size_mask+1)); // array_offset 2272 __ add(src, src, rscratch1_offset); // src array offset 2273 __ add(dst, dst, rscratch1_offset); // dst array offset 2274 BLOCK_COMMENT("choose copy loop based on element size"); 2275 2276 // next registers should be set before the jump to corresponding stub 2277 const Register from = c_rarg0; // source array address 2278 const Register to = c_rarg1; // destination array address 2279 const Register count = c_rarg2; // elements count 2280 2281 // 'from', 'to', 'count' registers should be set in such order 2282 // since they are the same as 'src', 'src_pos', 'dst'. 2283 2284 assert(Klass::_lh_log2_element_size_shift == 0, "fix this code"); 2285 2286 // The possible values of elsize are 0-3, i.e. exact_log2(element 2287 // size in bytes). We do a simple bitwise binary search. 2288 __ BIND(L_copy_bytes); 2289 __ tbnz(r15_elsize, 1, L_copy_ints); 2290 __ tbnz(r15_elsize, 0, L_copy_shorts); 2291 __ lea(from, Address(src, src_pos));// src_addr 2292 __ lea(to, Address(dst, dst_pos));// dst_addr 2293 __ movw(count, scratch_length); // length 2294 __ b(RuntimeAddress(byte_copy_entry)); 2295 2296 __ BIND(L_copy_shorts); 2297 __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr 2298 __ lea(to, Address(dst, dst_pos, Address::lsl(1)));// dst_addr 2299 __ movw(count, scratch_length); // length 2300 __ b(RuntimeAddress(short_copy_entry)); 2301 2302 __ BIND(L_copy_ints); 2303 __ tbnz(r15_elsize, 0, L_copy_longs); 2304 __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr 2305 __ lea(to, Address(dst, dst_pos, Address::lsl(2)));// dst_addr 2306 __ movw(count, scratch_length); // length 2307 __ b(RuntimeAddress(int_copy_entry)); 2308 2309 __ BIND(L_copy_longs); 2310 #ifdef ASSERT 2311 { 2312 BLOCK_COMMENT("assert long copy {"); 2313 Label L; 2314 __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize 2315 __ cmpw(r15_elsize, LogBytesPerLong); 2316 __ br(Assembler::EQ, L); 2317 __ stop("must be long copy, but elsize is wrong"); 2318 __ bind(L); 2319 BLOCK_COMMENT("} assert long copy done"); 2320 } 2321 #endif 2322 __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr 2323 __ lea(to, Address(dst, dst_pos, Address::lsl(3)));// dst_addr 2324 __ movw(count, scratch_length); // length 2325 __ b(RuntimeAddress(long_copy_entry)); 2326 2327 // ObjArrayKlass 2328 __ BIND(L_objArray); 2329 // live at this point: scratch_src_klass, scratch_length, src[_pos], dst[_pos] 2330 2331 Label L_plain_copy, L_checkcast_copy; 2332 // test array classes for subtyping 2333 __ load_klass(r15, dst); 2334 __ cmp(scratch_src_klass, r15); // usual case is exact 
equality 2335 __ br(Assembler::NE, L_checkcast_copy); 2336 2337 // Identically typed arrays can be copied without element-wise checks. 2338 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2339 rscratch2, L_failed); 2340 2341 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2342 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2343 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2344 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2345 __ movw(count, scratch_length); // length 2346 __ BIND(L_plain_copy); 2347 __ b(RuntimeAddress(oop_copy_entry)); 2348 2349 __ BIND(L_checkcast_copy); 2350 // live at this point: scratch_src_klass, scratch_length, r15 (dst_klass) 2351 { 2352 // Before looking at dst.length, make sure dst is also an objArray. 2353 __ ldrw(rscratch1, Address(r15, lh_offset)); 2354 __ movw(rscratch2, objArray_lh); 2355 __ eorw(rscratch1, rscratch1, rscratch2); 2356 __ cbnzw(rscratch1, L_failed); 2357 2358 // It is safe to examine both src.length and dst.length. 2359 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2360 r15, L_failed); 2361 2362 __ load_klass(dst_klass, dst); // reload 2363 2364 // Marshal the base address arguments now, freeing registers. 2365 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2366 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2367 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2368 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2369 __ movw(count, length); // length (reloaded) 2370 Register sco_temp = c_rarg3; // this register is free now 2371 assert_different_registers(from, to, count, sco_temp, 2372 dst_klass, scratch_src_klass); 2373 // assert_clean_int(count, sco_temp); 2374 2375 // Generate the type check. 2376 const int sco_offset = in_bytes(Klass::super_check_offset_offset()); 2377 __ ldrw(sco_temp, Address(dst_klass, sco_offset)); 2378 2379 // Smashes rscratch1, rscratch2 2380 generate_type_check(scratch_src_klass, sco_temp, dst_klass, L_plain_copy); 2381 2382 // Fetch destination element klass from the ObjArrayKlass header. 2383 int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset()); 2384 __ ldr(dst_klass, Address(dst_klass, ek_offset)); 2385 __ ldrw(sco_temp, Address(dst_klass, sco_offset)); 2386 2387 // the checkcast_copy loop needs two extra arguments: 2388 assert(c_rarg3 == sco_temp, "#3 already in place"); 2389 // Set up arguments for checkcast_copy_entry. 2390 __ mov(c_rarg4, dst_klass); // dst.klass.element_klass 2391 __ b(RuntimeAddress(checkcast_copy_entry)); 2392 } 2393 2394 __ BIND(L_failed); 2395 __ mov(r0, -1); 2396 __ leave(); // required for proper stackwalking of RuntimeStub frame 2397 __ ret(lr); 2398 2399 return start; 2400 } 2401 2402 // 2403 // Generate stub for array fill. If "aligned" is true, the 2404 // "to" address is assumed to be heapword aligned. 
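// A minimal C sketch of what the generated stub implements (illustrative
// only; the stub itself widens 'value' and fills 8-byte words where it can):
//
//   void fill(jint* to, jint value, jint count) {
//     for (jint i = 0; i < count; i++) to[i] = value;
//   }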
2405 // 2406 // Arguments for generated stub: 2407 // to: c_rarg0 2408 // value: c_rarg1 2409 // count: c_rarg2 treated as signed 2410 // 2411 address generate_fill(BasicType t, bool aligned, const char *name) { 2412 __ align(CodeEntryAlignment); 2413 StubCodeMark mark(this, "StubRoutines", name); 2414 address start = __ pc(); 2415 2416 BLOCK_COMMENT("Entry:"); 2417 2418 const Register to = c_rarg0; // destination array address 2419 const Register value = c_rarg1; // value 2420 const Register count = c_rarg2; // elements count 2421 2422 const Register bz_base = r10; // base for block_zero routine 2423 const Register cnt_words = r11; // temp register 2424 2425 __ enter(); 2426 2427 Label L_fill_elements, L_exit1; 2428 2429 int shift = -1; 2430 switch (t) { 2431 case T_BYTE: 2432 shift = 0; 2433 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2434 __ bfi(value, value, 8, 8); // 8 bit -> 16 bit 2435 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit 2436 __ br(Assembler::LO, L_fill_elements); 2437 break; 2438 case T_SHORT: 2439 shift = 1; 2440 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2441 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit 2442 __ br(Assembler::LO, L_fill_elements); 2443 break; 2444 case T_INT: 2445 shift = 2; 2446 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2447 __ br(Assembler::LO, L_fill_elements); 2448 break; 2449 default: ShouldNotReachHere(); 2450 } 2451 2452 // Align destination address at an 8-byte boundary. 2453 Label L_skip_align1, L_skip_align2, L_skip_align4; 2454 if (!aligned) { 2455 switch (t) { 2456 case T_BYTE: 2457 // One byte misalignment happens only for byte arrays. 2458 __ tbz(to, 0, L_skip_align1); 2459 __ strb(value, Address(__ post(to, 1))); 2460 __ subw(count, count, 1); 2461 __ bind(L_skip_align1); 2462 // Fallthrough 2463 case T_SHORT: 2464 // Two bytes misalignment happens only for byte and short (char) arrays. 2465 __ tbz(to, 1, L_skip_align2); 2466 __ strh(value, Address(__ post(to, 2))); 2467 __ subw(count, count, 2 >> shift); 2468 __ bind(L_skip_align2); 2469 // Fallthrough 2470 case T_INT: 2471 // Align to 8 bytes, we know we are 4 byte aligned to start. 2472 __ tbz(to, 2, L_skip_align4); 2473 __ strw(value, Address(__ post(to, 4))); 2474 __ subw(count, count, 4 >> shift); 2475 __ bind(L_skip_align4); 2476 break; 2477 default: ShouldNotReachHere(); 2478 } 2479 } 2480 2481 // 2482 // Fill large chunks 2483 // 2484 __ lsrw(cnt_words, count, 3 - shift); // number of words 2485 __ bfi(value, value, 32, 32); // 32 bit -> 64 bit 2486 __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift); 2487 if (UseBlockZeroing) { 2488 Label non_block_zeroing, rest; 2489 // If the fill value is zero we can use the fast zero_words(). 2490 __ cbnz(value, non_block_zeroing); 2491 __ mov(bz_base, to); 2492 __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord); 2493 address tpc = __ zero_words(bz_base, cnt_words); 2494 if (tpc == nullptr) { 2495 fatal("CodeCache is full at generate_fill"); 2496 } 2497 __ b(rest); 2498 __ bind(non_block_zeroing); 2499 __ fill_words(to, cnt_words, value); 2500 __ bind(rest); 2501 } else { 2502 __ fill_words(to, cnt_words, value); 2503 } 2504 2505 // Remaining count is less than 8 bytes. Fill it by a single store. 2506 // Note that the total length is no less than 8 bytes.
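// A minimal sketch of the overlapping tail store below (illustrative
// only): with t == T_BYTE and, say, count == 5 elements left, one
// unaligned 64-bit store ending exactly at the end of the array fills
// the remaining 5 bytes and harmlessly rewrites 3 already-filled ones:
//
//   to += count << shift;                  // one past the last element
//   *(uint64_t*)((char*)to - 8) = value;   // 8 bytes ending at the array end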
2507 if (t == T_BYTE || t == T_SHORT) { 2508 Label L_exit1; 2509 __ cbzw(count, L_exit1); 2510 __ add(to, to, count, Assembler::LSL, shift); // points to the end 2511 __ str(value, Address(to, -8)); // overwrite some elements 2512 __ bind(L_exit1); 2513 __ leave(); 2514 __ ret(lr); 2515 } 2516 2517 // Handle fills of less than 8 bytes. 2518 Label L_fill_2, L_fill_4, L_exit2; 2519 __ bind(L_fill_elements); 2520 switch (t) { 2521 case T_BYTE: 2522 __ tbz(count, 0, L_fill_2); 2523 __ strb(value, Address(__ post(to, 1))); 2524 __ bind(L_fill_2); 2525 __ tbz(count, 1, L_fill_4); 2526 __ strh(value, Address(__ post(to, 2))); 2527 __ bind(L_fill_4); 2528 __ tbz(count, 2, L_exit2); 2529 __ strw(value, Address(to)); 2530 break; 2531 case T_SHORT: 2532 __ tbz(count, 0, L_fill_4); 2533 __ strh(value, Address(__ post(to, 2))); 2534 __ bind(L_fill_4); 2535 __ tbz(count, 1, L_exit2); 2536 __ strw(value, Address(to)); 2537 break; 2538 case T_INT: 2539 __ cbzw(count, L_exit2); 2540 __ strw(value, Address(to)); 2541 break; 2542 default: ShouldNotReachHere(); 2543 } 2544 __ bind(L_exit2); 2545 __ leave(); 2546 __ ret(lr); 2547 return start; 2548 } 2549 2550 address generate_data_cache_writeback() { 2551 const Register line = c_rarg0; // address of line to write back 2552 2553 __ align(CodeEntryAlignment); 2554 2555 StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback"); 2556 2557 address start = __ pc(); 2558 __ enter(); 2559 __ cache_wb(Address(line, 0)); 2560 __ leave(); 2561 __ ret(lr); 2562 2563 return start; 2564 } 2565 2566 address generate_data_cache_writeback_sync() { 2567 const Register is_pre = c_rarg0; // pre or post sync 2568 2569 __ align(CodeEntryAlignment); 2570 2571 StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback_sync"); 2572 2573 // pre wbsync is a no-op 2574 // post wbsync translates to an sfence 2575 2576 Label skip; 2577 address start = __ pc(); 2578 __ enter(); 2579 __ cbnz(is_pre, skip); 2580 __ cache_wbsync(false); 2581 __ bind(skip); 2582 __ leave(); 2583 __ ret(lr); 2584 2585 return start; 2586 } 2587 2588 void generate_arraycopy_stubs() { 2589 address entry; 2590 address entry_jbyte_arraycopy; 2591 address entry_jshort_arraycopy; 2592 address entry_jint_arraycopy; 2593 address entry_oop_arraycopy; 2594 address entry_jlong_arraycopy; 2595 address entry_checkcast_arraycopy; 2596 2597 generate_copy_longs(IN_HEAP | IS_ARRAY, T_BYTE, copy_f, r0, r1, r15, copy_forwards); 2598 generate_copy_longs(IN_HEAP | IS_ARRAY, T_BYTE, copy_b, r0, r1, r15, copy_backwards); 2599 2600 generate_copy_longs(IN_HEAP | IS_ARRAY, T_OBJECT, copy_obj_f, r0, r1, r15, copy_forwards); 2601 generate_copy_longs(IN_HEAP | IS_ARRAY, T_OBJECT, copy_obj_b, r0, r1, r15, copy_backwards); 2602 2603 generate_copy_longs(IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, T_OBJECT, copy_obj_uninit_f, r0, r1, r15, copy_forwards); 2604 generate_copy_longs(IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, T_OBJECT, copy_obj_uninit_b, r0, r1, r15, copy_backwards); 2605 2606 StubRoutines::aarch64::_zero_blocks = generate_zero_blocks(); 2607 2608 //*** jbyte 2609 // Always need aligned and unaligned versions 2610 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(false, &entry, 2611 "jbyte_disjoint_arraycopy"); 2612 StubRoutines::_jbyte_arraycopy = generate_conjoint_byte_copy(false, entry, 2613 &entry_jbyte_arraycopy, 2614 "jbyte_arraycopy"); 2615 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry, 2616 "arrayof_jbyte_disjoint_arraycopy"); 2617
StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_byte_copy(true, entry, nullptr, 2618 "arrayof_jbyte_arraycopy"); 2619 2620 //*** jshort 2621 // Always need aligned and unaligned versions 2622 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, &entry, 2623 "jshort_disjoint_arraycopy"); 2624 StubRoutines::_jshort_arraycopy = generate_conjoint_short_copy(false, entry, 2625 &entry_jshort_arraycopy, 2626 "jshort_arraycopy"); 2627 StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry, 2628 "arrayof_jshort_disjoint_arraycopy"); 2629 StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_short_copy(true, entry, nullptr, 2630 "arrayof_jshort_arraycopy"); 2631 2632 //*** jint 2633 // Aligned versions 2634 StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry, 2635 "arrayof_jint_disjoint_arraycopy"); 2636 StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy, 2637 "arrayof_jint_arraycopy"); 2638 // In 64 bit we need both aligned and unaligned versions of jint arraycopy. 2639 // entry_jint_arraycopy always points to the unaligned version 2640 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_int_copy(false, &entry, 2641 "jint_disjoint_arraycopy"); 2642 StubRoutines::_jint_arraycopy = generate_conjoint_int_copy(false, entry, 2643 &entry_jint_arraycopy, 2644 "jint_arraycopy"); 2645 2646 //*** jlong 2647 // It is always aligned 2648 StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry, 2649 "arrayof_jlong_disjoint_arraycopy"); 2650 StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy, 2651 "arrayof_jlong_arraycopy"); 2652 StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy; 2653 StubRoutines::_jlong_arraycopy = StubRoutines::_arrayof_jlong_arraycopy; 2654 2655 //*** oops 2656 { 2657 // With compressed oops we need unaligned versions; notice that 2658 // we overwrite entry_oop_arraycopy. 
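// (Illustrative note: with compressed oops each element is a 4-byte
// narrowOop, so only 4-byte element alignment can be assumed; with
// uncompressed oops every element is an 8-byte-aligned machine word.)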
2659 bool aligned = !UseCompressedOops; 2660 2661 StubRoutines::_arrayof_oop_disjoint_arraycopy 2662 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy", 2663 /*dest_uninitialized*/false); 2664 StubRoutines::_arrayof_oop_arraycopy 2665 = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy", 2666 /*dest_uninitialized*/false); 2667 // Aligned versions without pre-barriers 2668 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit 2669 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit", 2670 /*dest_uninitialized*/true); 2671 StubRoutines::_arrayof_oop_arraycopy_uninit 2672 = generate_conjoint_oop_copy(aligned, entry, nullptr, "arrayof_oop_arraycopy_uninit", 2673 /*dest_uninitialized*/true); 2674 } 2675 2676 StubRoutines::_oop_disjoint_arraycopy = StubRoutines::_arrayof_oop_disjoint_arraycopy; 2677 StubRoutines::_oop_arraycopy = StubRoutines::_arrayof_oop_arraycopy; 2678 StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit; 2679 StubRoutines::_oop_arraycopy_uninit = StubRoutines::_arrayof_oop_arraycopy_uninit; 2680 2681 StubRoutines::_checkcast_arraycopy = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy); 2682 StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", nullptr, 2683 /*dest_uninitialized*/true); 2684 2685 StubRoutines::_unsafe_arraycopy = generate_unsafe_copy("unsafe_arraycopy", 2686 entry_jbyte_arraycopy, 2687 entry_jshort_arraycopy, 2688 entry_jint_arraycopy, 2689 entry_jlong_arraycopy); 2690 2691 StubRoutines::_generic_arraycopy = generate_generic_copy("generic_arraycopy", 2692 entry_jbyte_arraycopy, 2693 entry_jshort_arraycopy, 2694 entry_jint_arraycopy, 2695 entry_oop_arraycopy, 2696 entry_jlong_arraycopy, 2697 entry_checkcast_arraycopy); 2698 2699 StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill"); 2700 StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill"); 2701 StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill"); 2702 StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill"); 2703 StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill"); 2704 StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill"); 2705 } 2706 2707 void generate_math_stubs() { Unimplemented(); } 2708 2709 // Arguments: 2710 // 2711 // Inputs: 2712 // c_rarg0 - source byte array address 2713 // c_rarg1 - destination byte array address 2714 // c_rarg2 - K (key) in little endian int array 2715 // 2716 address generate_aescrypt_encryptBlock() { 2717 __ align(CodeEntryAlignment); 2718 StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock"); 2719 2720 const Register from = c_rarg0; // source array address 2721 const Register to = c_rarg1; // destination array address 2722 const Register key = c_rarg2; // key array address 2723 const Register keylen = rscratch1; 2724 2725 address start = __ pc(); 2726 __ enter(); 2727 2728 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2729 2730 __ aesenc_loadkeys(key, keylen); 2731 __ aesecb_encrypt(from, to, keylen); 2732 2733 __ mov(r0, 0); 2734 2735 __ leave(); 2736 __ ret(lr); 2737 2738 return start; 2739 } 2740 2741 // Arguments: 2742 // 2743 // Inputs: 2744 // c_rarg0 - source byte array address 2745 // c_rarg1 - destination byte array address 2746 // 
c_rarg2 - K (key) in little endian int array 2747 // 2748 address generate_aescrypt_decryptBlock() { 2749 assert(UseAES, "need AES cryptographic extension support"); 2750 __ align(CodeEntryAlignment); 2751 StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock"); 2752 Label L_doLast; 2753 2754 const Register from = c_rarg0; // source array address 2755 const Register to = c_rarg1; // destination array address 2756 const Register key = c_rarg2; // key array address 2757 const Register keylen = rscratch1; 2758 2759 address start = __ pc(); 2760 __ enter(); // required for proper stackwalking of RuntimeStub frame 2761 2762 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2763 2764 __ aesecb_decrypt(from, to, key, keylen); 2765 2766 __ mov(r0, 0); 2767 2768 __ leave(); 2769 __ ret(lr); 2770 2771 return start; 2772 } 2773 2774 // Arguments: 2775 // 2776 // Inputs: 2777 // c_rarg0 - source byte array address 2778 // c_rarg1 - destination byte array address 2779 // c_rarg2 - K (key) in little endian int array 2780 // c_rarg3 - r vector byte array address 2781 // c_rarg4 - input length 2782 // 2783 // Output: 2784 // x0 - input length 2785 // 2786 address generate_cipherBlockChaining_encryptAESCrypt() { 2787 assert(UseAES, "need AES cryptographic extension support"); 2788 __ align(CodeEntryAlignment); 2789 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt"); 2790 2791 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 2792 2793 const Register from = c_rarg0; // source array address 2794 const Register to = c_rarg1; // destination array address 2795 const Register key = c_rarg2; // key array address 2796 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2797 // and left with the results of the last encryption block 2798 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2799 const Register keylen = rscratch1; 2800 2801 address start = __ pc(); 2802 2803 __ enter(); 2804 2805 __ movw(rscratch2, len_reg); 2806 2807 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2808 2809 __ ld1(v0, __ T16B, rvec); 2810 2811 __ cmpw(keylen, 52); 2812 __ br(Assembler::CC, L_loadkeys_44); 2813 __ br(Assembler::EQ, L_loadkeys_52); 2814 2815 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2816 __ rev32(v17, __ T16B, v17); 2817 __ rev32(v18, __ T16B, v18); 2818 __ BIND(L_loadkeys_52); 2819 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2820 __ rev32(v19, __ T16B, v19); 2821 __ rev32(v20, __ T16B, v20); 2822 __ BIND(L_loadkeys_44); 2823 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2824 __ rev32(v21, __ T16B, v21); 2825 __ rev32(v22, __ T16B, v22); 2826 __ rev32(v23, __ T16B, v23); 2827 __ rev32(v24, __ T16B, v24); 2828 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2829 __ rev32(v25, __ T16B, v25); 2830 __ rev32(v26, __ T16B, v26); 2831 __ rev32(v27, __ T16B, v27); 2832 __ rev32(v28, __ T16B, v28); 2833 __ ld1(v29, v30, v31, __ T16B, key); 2834 __ rev32(v29, __ T16B, v29); 2835 __ rev32(v30, __ T16B, v30); 2836 __ rev32(v31, __ T16B, v31); 2837 2838 __ BIND(L_aes_loop); 2839 __ ld1(v1, __ T16B, __ post(from, 16)); 2840 __ eor(v0, __ T16B, v0, v1); 2841 2842 __ br(Assembler::CC, L_rounds_44); 2843 __ br(Assembler::EQ, L_rounds_52); 2844 2845 __ aese(v0, v17); __ aesmc(v0, v0); 2846 __ aese(v0, v18); __ aesmc(v0, v0); 2847 __ BIND(L_rounds_52); 2848 __ 
aese(v0, v19); __ aesmc(v0, v0); 2849 __ aese(v0, v20); __ aesmc(v0, v0); 2850 __ BIND(L_rounds_44); 2851 __ aese(v0, v21); __ aesmc(v0, v0); 2852 __ aese(v0, v22); __ aesmc(v0, v0); 2853 __ aese(v0, v23); __ aesmc(v0, v0); 2854 __ aese(v0, v24); __ aesmc(v0, v0); 2855 __ aese(v0, v25); __ aesmc(v0, v0); 2856 __ aese(v0, v26); __ aesmc(v0, v0); 2857 __ aese(v0, v27); __ aesmc(v0, v0); 2858 __ aese(v0, v28); __ aesmc(v0, v0); 2859 __ aese(v0, v29); __ aesmc(v0, v0); 2860 __ aese(v0, v30); 2861 __ eor(v0, __ T16B, v0, v31); 2862 2863 __ st1(v0, __ T16B, __ post(to, 16)); 2864 2865 __ subw(len_reg, len_reg, 16); 2866 __ cbnzw(len_reg, L_aes_loop); 2867 2868 __ st1(v0, __ T16B, rvec); 2869 2870 __ mov(r0, rscratch2); 2871 2872 __ leave(); 2873 __ ret(lr); 2874 2875 return start; 2876 } 2877 2878 // Arguments: 2879 // 2880 // Inputs: 2881 // c_rarg0 - source byte array address 2882 // c_rarg1 - destination byte array address 2883 // c_rarg2 - K (key) in little endian int array 2884 // c_rarg3 - r vector byte array address 2885 // c_rarg4 - input length 2886 // 2887 // Output: 2888 // r0 - input length 2889 // 2890 address generate_cipherBlockChaining_decryptAESCrypt() { 2891 assert(UseAES, "need AES cryptographic extension support"); 2892 __ align(CodeEntryAlignment); 2893 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt"); 2894 2895 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 2896 2897 const Register from = c_rarg0; // source array address 2898 const Register to = c_rarg1; // destination array address 2899 const Register key = c_rarg2; // key array address 2900 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2901 // and left with the results of the last encryption block 2902 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2903 const Register keylen = rscratch1; 2904 2905 address start = __ pc(); 2906 2907 __ enter(); 2908 2909 __ movw(rscratch2, len_reg); 2910 2911 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2912 2913 __ ld1(v2, __ T16B, rvec); 2914 2915 __ ld1(v31, __ T16B, __ post(key, 16)); 2916 __ rev32(v31, __ T16B, v31); 2917 2918 __ cmpw(keylen, 52); 2919 __ br(Assembler::CC, L_loadkeys_44); 2920 __ br(Assembler::EQ, L_loadkeys_52); 2921 2922 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2923 __ rev32(v17, __ T16B, v17); 2924 __ rev32(v18, __ T16B, v18); 2925 __ BIND(L_loadkeys_52); 2926 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2927 __ rev32(v19, __ T16B, v19); 2928 __ rev32(v20, __ T16B, v20); 2929 __ BIND(L_loadkeys_44); 2930 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2931 __ rev32(v21, __ T16B, v21); 2932 __ rev32(v22, __ T16B, v22); 2933 __ rev32(v23, __ T16B, v23); 2934 __ rev32(v24, __ T16B, v24); 2935 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2936 __ rev32(v25, __ T16B, v25); 2937 __ rev32(v26, __ T16B, v26); 2938 __ rev32(v27, __ T16B, v27); 2939 __ rev32(v28, __ T16B, v28); 2940 __ ld1(v29, v30, __ T16B, key); 2941 __ rev32(v29, __ T16B, v29); 2942 __ rev32(v30, __ T16B, v30); 2943 2944 __ BIND(L_aes_loop); 2945 __ ld1(v0, __ T16B, __ post(from, 16)); 2946 __ orr(v1, __ T16B, v0, v0); 2947 2948 __ br(Assembler::CC, L_rounds_44); 2949 __ br(Assembler::EQ, L_rounds_52); 2950 2951 __ aesd(v0, v17); __ aesimc(v0, v0); 2952 __ aesd(v0, v18); __ aesimc(v0, v0); 2953 __ BIND(L_rounds_52); 2954 __ aesd(v0, v19); __ aesimc(v0, v0); 2955 __ aesd(v0, v20); __ 
aesimc(v0, v0); 2956 __ BIND(L_rounds_44); 2957 __ aesd(v0, v21); __ aesimc(v0, v0); 2958 __ aesd(v0, v22); __ aesimc(v0, v0); 2959 __ aesd(v0, v23); __ aesimc(v0, v0); 2960 __ aesd(v0, v24); __ aesimc(v0, v0); 2961 __ aesd(v0, v25); __ aesimc(v0, v0); 2962 __ aesd(v0, v26); __ aesimc(v0, v0); 2963 __ aesd(v0, v27); __ aesimc(v0, v0); 2964 __ aesd(v0, v28); __ aesimc(v0, v0); 2965 __ aesd(v0, v29); __ aesimc(v0, v0); 2966 __ aesd(v0, v30); 2967 __ eor(v0, __ T16B, v0, v31); 2968 __ eor(v0, __ T16B, v0, v2); 2969 2970 __ st1(v0, __ T16B, __ post(to, 16)); 2971 __ orr(v2, __ T16B, v1, v1); 2972 2973 __ subw(len_reg, len_reg, 16); 2974 __ cbnzw(len_reg, L_aes_loop); 2975 2976 __ st1(v2, __ T16B, rvec); 2977 2978 __ mov(r0, rscratch2); 2979 2980 __ leave(); 2981 __ ret(lr); 2982 2983 return start; 2984 } 2985 2986 // Big-endian 128-bit + 64-bit -> 128-bit addition. 2987 // Inputs: 128-bits. in is preserved. 2988 // The least-significant 64-bit word is in the upper dword of each vector. 2989 // inc (the 64-bit increment) is preserved. Its lower dword must be zero. 2990 // Output: result 2991 void be_add_128_64(FloatRegister result, FloatRegister in, 2992 FloatRegister inc, FloatRegister tmp) { 2993 assert_different_registers(result, tmp, inc); 2994 2995 __ addv(result, __ T2D, in, inc); // Add inc to the least-significant dword of 2996 // input 2997 __ cm(__ HI, tmp, __ T2D, inc, result); // Check for result overflowing 2998 __ ext(tmp, __ T16B, tmp, tmp, 0x08); // Swap LSD of comparison result to MSD and 2999 // MSD == 0 (must be!) to LSD 3000 __ subv(result, __ T2D, result, tmp); // Subtract -1 from MSD if there was an overflow 3001 } 3002 3003 // CTR AES crypt. 3004 // Arguments: 3005 // 3006 // Inputs: 3007 // c_rarg0 - source byte array address 3008 // c_rarg1 - destination byte array address 3009 // c_rarg2 - K (key) in little endian int array 3010 // c_rarg3 - counter vector byte array address 3011 // c_rarg4 - input length 3012 // c_rarg5 - saved encryptedCounter start 3013 // c_rarg6 - saved used length 3014 // 3015 // Output: 3016 // r0 - input length 3017 // 3018 address generate_counterMode_AESCrypt() { 3019 const Register in = c_rarg0; 3020 const Register out = c_rarg1; 3021 const Register key = c_rarg2; 3022 const Register counter = c_rarg3; 3023 const Register saved_len = c_rarg4, len = r10; 3024 const Register saved_encrypted_ctr = c_rarg5; 3025 const Register used_ptr = c_rarg6, used = r12; 3026 3027 const Register offset = r7; 3028 const Register keylen = r11; 3029 3030 const unsigned char block_size = 16; 3031 const int bulk_width = 4; 3032 // NB: bulk_width can be 4 or 8. 8 gives slightly faster 3033 // performance with larger data sizes, but it also means that the 3034 // fast path isn't used until you have at least 8 blocks, and up 3035 // to 127 bytes of data will be processed on the slow path. For 3036 // that reason, and also so as not to blow away too much icache, 4 3037 // blocks seems like a sensible compromise.
3038 3039 // Algorithm: 3040 // 3041 // if (len == 0) { 3042 // goto DONE; 3043 // } 3044 // int result = len; 3045 // do { 3046 // if (used >= blockSize) { 3047 // if (len >= bulk_width * blockSize) { 3048 // CTR_large_block(); 3049 // if (len == 0) 3050 // goto DONE; 3051 // } 3052 // for (;;) { 3053 // 16ByteVector v0 = counter; 3054 // embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0); 3055 // used = 0; 3056 // if (len < blockSize) 3057 // break; /* goto NEXT */ 3058 // 16ByteVector v1 = load16Bytes(in, offset); 3059 // v1 = v1 ^ encryptedCounter; 3060 // store16Bytes(out, offset); 3061 // used = blockSize; 3062 // offset += blockSize; 3063 // len -= blockSize; 3064 // if (len == 0) 3065 // goto DONE; 3066 // } 3067 // } 3068 // NEXT: 3069 // out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]); 3070 // len--; 3071 // } while (len != 0); 3072 // DONE: 3073 // return result; 3074 // 3075 // CTR_large_block() 3076 // Wide bulk encryption of whole blocks. 3077 3078 __ align(CodeEntryAlignment); 3079 StubCodeMark mark(this, "StubRoutines", "counterMode_AESCrypt"); 3080 const address start = __ pc(); 3081 __ enter(); 3082 3083 Label DONE, CTR_large_block, large_block_return; 3084 __ ldrw(used, Address(used_ptr)); 3085 __ cbzw(saved_len, DONE); 3086 3087 __ mov(len, saved_len); 3088 __ mov(offset, 0); 3089 3090 // Compute #rounds for AES based on the length of the key array 3091 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 3092 3093 __ aesenc_loadkeys(key, keylen); 3094 3095 { 3096 Label L_CTR_loop, NEXT; 3097 3098 __ bind(L_CTR_loop); 3099 3100 __ cmp(used, block_size); 3101 __ br(__ LO, NEXT); 3102 3103 // Maybe we have a lot of data 3104 __ subsw(rscratch1, len, bulk_width * block_size); 3105 __ br(__ HS, CTR_large_block); 3106 __ BIND(large_block_return); 3107 __ cbzw(len, DONE); 3108 3109 // Setup the counter 3110 __ movi(v4, __ T4S, 0); 3111 __ movi(v5, __ T4S, 1); 3112 __ ins(v4, __ S, v5, 2, 2); // v4 contains { 0, 1 } 3113 3114 // 128-bit big-endian increment 3115 __ ld1(v0, __ T16B, counter); 3116 __ rev64(v16, __ T16B, v0); 3117 be_add_128_64(v16, v16, v4, /*tmp*/v5); 3118 __ rev64(v16, __ T16B, v16); 3119 __ st1(v16, __ T16B, counter); 3120 // Previous counter value is in v0 3121 // v4 contains { 0, 1 } 3122 3123 { 3124 // We have fewer than bulk_width blocks of data left. Encrypt 3125 // them one by one until there is less than a full block 3126 // remaining, being careful to save both the encrypted counter 3127 // and the counter. 3128 3129 Label inner_loop; 3130 __ bind(inner_loop); 3131 // Counter to encrypt is in v0 3132 __ aesecb_encrypt(noreg, noreg, keylen); 3133 __ st1(v0, __ T16B, saved_encrypted_ctr); 3134 3135 // Do we have a remaining full block? 3136 3137 __ mov(used, 0); 3138 __ cmp(len, block_size); 3139 __ br(__ LO, NEXT); 3140 3141 // Yes, we have a full block 3142 __ ldrq(v1, Address(in, offset)); 3143 __ eor(v1, __ T16B, v1, v0); 3144 __ strq(v1, Address(out, offset)); 3145 __ mov(used, block_size); 3146 __ add(offset, offset, block_size); 3147 3148 __ subw(len, len, block_size); 3149 __ cbzw(len, DONE); 3150 3151 // Increment the counter, store it back 3152 __ orr(v0, __ T16B, v16, v16); 3153 __ rev64(v16, __ T16B, v16); 3154 be_add_128_64(v16, v16, v4, /*tmp*/v5); 3155 __ rev64(v16, __ T16B, v16); 3156 __ st1(v16, __ T16B, counter); // Save the incremented counter back 3157 3158 __ b(inner_loop); 3159 } 3160 3161 __ BIND(NEXT); 3162 3163 // Encrypt a single byte, and loop. 
3164 // We expect this to be a rare event. 3165 __ ldrb(rscratch1, Address(in, offset)); 3166 __ ldrb(rscratch2, Address(saved_encrypted_ctr, used)); 3167 __ eor(rscratch1, rscratch1, rscratch2); 3168 __ strb(rscratch1, Address(out, offset)); 3169 __ add(offset, offset, 1); 3170 __ add(used, used, 1); 3171 __ subw(len, len, 1); 3172 __ cbnzw(len, L_CTR_loop); 3173 } 3174 3175 __ bind(DONE); 3176 __ strw(used, Address(used_ptr)); 3177 __ mov(r0, saved_len); 3178 3179 __ leave(); // required for proper stackwalking of RuntimeStub frame 3180 __ ret(lr); 3181 3182 // Bulk encryption 3183 3184 __ BIND(CTR_large_block); 3185 assert(bulk_width == 4 || bulk_width == 8, "must be"); 3186 3187 if (bulk_width == 8) { 3188 __ sub(sp, sp, 4 * 16); 3189 __ st1(v12, v13, v14, v15, __ T16B, Address(sp)); 3190 } 3191 __ sub(sp, sp, 4 * 16); 3192 __ st1(v8, v9, v10, v11, __ T16B, Address(sp)); 3193 RegSet saved_regs = (RegSet::of(in, out, offset) 3194 + RegSet::of(saved_encrypted_ctr, used_ptr, len)); 3195 __ push(saved_regs, sp); 3196 __ andr(len, len, -16 * bulk_width); // 8/4 encryptions, 16 bytes per encryption 3197 __ add(in, in, offset); 3198 __ add(out, out, offset); 3199 3200 // Keys should already be loaded into the correct registers 3201 3202 __ ld1(v0, __ T16B, counter); // v0 contains the first counter 3203 __ rev64(v16, __ T16B, v0); // v16 contains byte-reversed counter 3204 3205 // AES/CTR loop 3206 { 3207 Label L_CTR_loop; 3208 __ BIND(L_CTR_loop); 3209 3210 // Setup the counters 3211 __ movi(v8, __ T4S, 0); 3212 __ movi(v9, __ T4S, 1); 3213 __ ins(v8, __ S, v9, 2, 2); // v8 contains { 0, 1 } 3214 3215 for (int i = 0; i < bulk_width; i++) { 3216 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i); 3217 __ rev64(v0_ofs, __ T16B, v16); 3218 be_add_128_64(v16, v16, v8, /*tmp*/v9); 3219 } 3220 3221 __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16)); 3222 3223 // Encrypt the counters 3224 __ aesecb_encrypt(noreg, noreg, keylen, v0, bulk_width); 3225 3226 if (bulk_width == 8) { 3227 __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16)); 3228 } 3229 3230 // XOR the encrypted counters with the inputs 3231 for (int i = 0; i < bulk_width; i++) { 3232 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i); 3233 FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i); 3234 __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs); 3235 } 3236 3237 // Write the encrypted data 3238 __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16)); 3239 if (bulk_width == 8) { 3240 __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16)); 3241 } 3242 3243 __ subw(len, len, 16 * bulk_width); 3244 __ cbnzw(len, L_CTR_loop); 3245 } 3246 3247 // Save the counter back where it goes 3248 __ rev64(v16, __ T16B, v16); 3249 __ st1(v16, __ T16B, counter); 3250 3251 __ pop(saved_regs, sp); 3252 3253 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16)); 3254 if (bulk_width == 8) { 3255 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16)); 3256 } 3257 3258 __ andr(rscratch1, len, -16 * bulk_width); 3259 __ sub(len, len, rscratch1); 3260 __ add(offset, offset, rscratch1); 3261 __ mov(used, 16); 3262 __ strw(used, Address(used_ptr)); 3263 __ b(large_block_return); 3264 3265 return start; 3266 } 3267 3268 // Vector AES Galois Counter Mode implementation.
  // Parameters:
  //
  //   in = c_rarg0
  //   len = c_rarg1
  //   ct = c_rarg2 - ciphertext that ghash will read (in for encrypt, out for decrypt)
  //   out = c_rarg3
  //   key = c_rarg4
  //   state = c_rarg5 - GHASH.state
  //   subkeyHtbl = c_rarg6 - powers of H
  //   counter = c_rarg7 - 16 bytes of CTR
  //   return - number of processed bytes
  address generate_galoisCounterMode_AESCrypt() {
    address ghash_polynomial = __ pc();
    __ emit_int64(0x87);  // The low-order bits of the field
                          // polynomial (i.e. p = z^7+z^2+z+1)
                          // repeated in the low and high parts of a
                          // 128-bit vector
    __ emit_int64(0x87);

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "galoisCounterMode_AESCrypt");
    address start = __ pc();
    __ enter();

    const Register in = c_rarg0;
    const Register len = c_rarg1;
    const Register ct = c_rarg2;
    const Register out = c_rarg3;

    const Register key = c_rarg4;
    const Register state = c_rarg5;

    const Register subkeyHtbl = c_rarg6;

    // 16 bytes of CTR, updated with the incremented counter before we return
    const Register counter = c_rarg7;

    const Register keylen = r10;
    // Save state before entering routine
    __ sub(sp, sp, 4 * 16);
    __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
    __ sub(sp, sp, 4 * 16);
    __ st1(v8, v9, v10, v11, __ T16B, Address(sp));

    __ andr(len, len, -16 * 8);  // 8 encryptions, 16 bytes per encryption
    __ str(len, __ pre(sp, -2 * wordSize));

    Label DONE;
    __ cbz(len, DONE);

    // Compute #rounds for AES based on the length of the key array
    __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));

    __ aesenc_loadkeys(key, keylen);
    __ ld1(v0, __ T16B, counter);  // v0 contains the first counter
    __ rev32(v16, __ T16B, v0);    // v16 contains the counter with each 32-bit word byte-reversed

    // AES/CTR loop
    {
      Label L_CTR_loop;
      __ BIND(L_CTR_loop);

      // Setup the counters
      __ movi(v8, __ T4S, 0);
      __ movi(v9, __ T4S, 1);
      __ ins(v8, __ S, v9, 3, 3);  // v8 contains { 0, 0, 0, 1 }

      assert(v0->encoding() < v8->encoding(), "counter blocks must be generated into v0..v7");
      for (int i = v0->encoding(); i < v8->encoding(); i++) {
        FloatRegister f = as_FloatRegister(i);
        __ rev32(f, __ T16B, v16);
        __ addv(v16, __ T4S, v16, v8);
      }

      __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));

      // Encrypt the counters
      __ aesecb_encrypt(noreg, noreg, keylen, v0, /*unrolls*/8);

      __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));

      // XOR the encrypted counters with the inputs
      for (int i = 0; i < 8; i++) {
        FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
        FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
        __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
      }
      __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
      __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));

      __ subw(len, len, 16 * 8);
      __ cbnzw(len, L_CTR_loop);
    }

    __ rev32(v16, __ T16B, v16);
    __ st1(v16, __ T16B, counter);

    __ ldr(len, Address(sp));
    __ lsr(len, len, exact_log2(16));  // We want the count of blocks
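    // GHASH/CTR loop. A sketch of what the wide GHASH is expected to
    // compute, assuming subkeyHtbl holds the precomputed powers of H that
    // its name suggests: each iteration folds four ciphertext blocks
    // c0..c3 into the accumulator in a single pass,
    //
    //   state = (state ^ c0) * H^4  ^  c1 * H^3  ^  c2 * H^2  ^  c3 * H
    //
    // where * is carry-less multiplication in GF(2^128), reduced modulo
    // the field polynomial emitted at ghash_polynomial above.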
    __ ghash_processBlocks_wide(ghash_polynomial, state, subkeyHtbl, ct,
                                len, /*unrolls*/4);

#ifdef ASSERT
    { Label L;
      __ cmp(len, (unsigned char)0);
      __ br(Assembler::EQ, L);
      __ stop("stubGenerator: abort");
      __ bind(L);
    }
#endif

    __ bind(DONE);
    // Return the number of bytes processed
    __ ldr(r0, __ post(sp, 2 * wordSize));

    __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
    __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));

    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(lr);
    return start;
  }

  class Cached64Bytes {
  private:
    MacroAssembler *_masm;
    Register _regs[8];

  public:
    Cached64Bytes(MacroAssembler *masm, RegSet rs): _masm(masm) {
      assert(rs.size() == 8, "expected 8 registers to cache 64 bytes (16 4-byte words), got %u", rs.size());
      auto it = rs.begin();
      for (auto &r: _regs) {
        r = *it;
        ++it;
      }
    }

    void gen_loads(Register base) {
      for (int i = 0; i < 8; i += 2) {
        __ ldp(_regs[i], _regs[i + 1], Address(base, 8 * i));
      }
    }

    // Generate code extracting the i-th unsigned word (4 bytes) from the
    // cached 64 bytes: word i lives in bits [32 * (i % 2), 32 * (i % 2) + 31]
    // of _regs[i / 2], e.g. extract_u32(dest, 5) takes the high word of _regs[2].
    void extract_u32(Register dest, int i) {
      __ ubfx(dest, _regs[i / 2], 32 * (i % 2), 32);
    }
  };

  // Utility routines for md5.
  // Clobbers r10 and r11.

  // F(b, c, d) = (b & c) | (~b & d), computed here as ((c ^ d) & b) ^ d.
  void md5_FF(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
              int k, int s, int t) {
    Register rscratch3 = r10;
    Register rscratch4 = r11;

    __ eorw(rscratch3, r3, r4);
    __ movw(rscratch2, t);
    __ andw(rscratch3, rscratch3, r2);
    __ addw(rscratch4, r1, rscratch2);
    reg_cache.extract_u32(rscratch1, k);
    __ eorw(rscratch3, rscratch3, r4);
    __ addw(rscratch4, rscratch4, rscratch1);
    __ addw(rscratch3, rscratch3, rscratch4);
    __ rorw(rscratch2, rscratch3, 32 - s);
    __ addw(r1, rscratch2, r2);
  }

  // G(b, c, d) = (b & d) | (c & ~d).
  void md5_GG(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
              int k, int s, int t) {
    Register rscratch3 = r10;
    Register rscratch4 = r11;

    __ andw(rscratch3, r2, r4);
    __ bicw(rscratch4, r3, r4);
    reg_cache.extract_u32(rscratch1, k);
    __ movw(rscratch2, t);
    __ orrw(rscratch3, rscratch3, rscratch4);
    __ addw(rscratch4, r1, rscratch2);
    __ addw(rscratch4, rscratch4, rscratch1);
    __ addw(rscratch3, rscratch3, rscratch4);
    __ rorw(rscratch2, rscratch3, 32 - s);
    __ addw(r1, rscratch2, r2);
  }

  // H(b, c, d) = b ^ c ^ d.
  void md5_HH(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
              int k, int s, int t) {
    Register rscratch3 = r10;
    Register rscratch4 = r11;

    __ eorw(rscratch3, r3, r4);
    __ movw(rscratch2, t);
    __ addw(rscratch4, r1, rscratch2);
    reg_cache.extract_u32(rscratch1, k);
    __ eorw(rscratch3, rscratch3, r2);
    __ addw(rscratch4, rscratch4, rscratch1);
    __ addw(rscratch3, rscratch3, rscratch4);
    __ rorw(rscratch2, rscratch3, 32 - s);
    __ addw(r1, rscratch2, r2);
  }

  // I(b, c, d) = c ^ (b | ~d).
  void md5_II(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
              int k, int s, int t) {
    Register rscratch3 = r10;
    Register rscratch4 = r11;

    __ movw(rscratch3, t);
    __ ornw(rscratch2, r2, r4);
    __ addw(rscratch4, r1, rscratch3);
    reg_cache.extract_u32(rscratch1, k);
    __ eorw(rscratch3, rscratch2, r3);
    __ addw(rscratch4, rscratch4, rscratch1);
    __ addw(rscratch3, rscratch3, rscratch4);
    __ rorw(rscratch2, rscratch3, 32 - s);
    __ addw(r1, rscratch2, r2);
  }
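  // Each of the four helpers above computes one step of RFC 1321:
  //
  //   a = b + rotl32(a + f(b, c, d) + x[k] + t, s)
  //
  // where f is F, G, H or I respectively (rorw by 32 - s is rotl by s), and
  // the message word x[k] comes from the 64-byte register cache rather than
  // from memory.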
  // Arguments:
  //
  // Inputs:
  //   c_rarg0 - byte[] source+offset
  //   c_rarg1 - int[] MD5.state
  //   c_rarg2 - int offset
  //   c_rarg3 - int limit
  //
  address generate_md5_implCompress(bool multi_block, const char *name) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    Register buf = c_rarg0;
    Register state = c_rarg1;
    Register ofs = c_rarg2;
    Register limit = c_rarg3;
    Register a = r4;
    Register b = r5;
    Register c = r6;
    Register d = r7;
    Register rscratch3 = r10;
    Register rscratch4 = r11;

    Register state_regs[2] = { r12, r13 };
    RegSet saved_regs = RegSet::range(r16, r22) - r18_tls;
    Cached64Bytes reg_cache(_masm, RegSet::of(r14, r15) + saved_regs);  // using 8 registers

    __ push(saved_regs, sp);

    __ ldp(state_regs[0], state_regs[1], Address(state));
    __ ubfx(a, state_regs[0],  0, 32);
    __ ubfx(b, state_regs[0], 32, 32);
    __ ubfx(c, state_regs[1],  0, 32);
    __ ubfx(d, state_regs[1], 32, 32);

    Label md5_loop;
    __ BIND(md5_loop);

    reg_cache.gen_loads(buf);

    // Round 1
    md5_FF(reg_cache, a, b, c, d,  0,  7, 0xd76aa478);
    md5_FF(reg_cache, d, a, b, c,  1, 12, 0xe8c7b756);
    md5_FF(reg_cache, c, d, a, b,  2, 17, 0x242070db);
    md5_FF(reg_cache, b, c, d, a,  3, 22, 0xc1bdceee);
    md5_FF(reg_cache, a, b, c, d,  4,  7, 0xf57c0faf);
    md5_FF(reg_cache, d, a, b, c,  5, 12, 0x4787c62a);
    md5_FF(reg_cache, c, d, a, b,  6, 17, 0xa8304613);
    md5_FF(reg_cache, b, c, d, a,  7, 22, 0xfd469501);
    md5_FF(reg_cache, a, b, c, d,  8,  7, 0x698098d8);
    md5_FF(reg_cache, d, a, b, c,  9, 12, 0x8b44f7af);
    md5_FF(reg_cache, c, d, a, b, 10, 17, 0xffff5bb1);
    md5_FF(reg_cache, b, c, d, a, 11, 22, 0x895cd7be);
    md5_FF(reg_cache, a, b, c, d, 12,  7, 0x6b901122);
    md5_FF(reg_cache, d, a, b, c, 13, 12, 0xfd987193);
    md5_FF(reg_cache, c, d, a, b, 14, 17, 0xa679438e);
    md5_FF(reg_cache, b, c, d, a, 15, 22, 0x49b40821);

    // Round 2
    md5_GG(reg_cache, a, b, c, d,  1,  5, 0xf61e2562);
    md5_GG(reg_cache, d, a, b, c,  6,  9, 0xc040b340);
    md5_GG(reg_cache, c, d, a, b, 11, 14, 0x265e5a51);
    md5_GG(reg_cache, b, c, d, a,  0, 20, 0xe9b6c7aa);
    md5_GG(reg_cache, a, b, c, d,  5,  5, 0xd62f105d);
    md5_GG(reg_cache, d, a, b, c, 10,  9, 0x02441453);
    md5_GG(reg_cache, c, d, a, b, 15, 14, 0xd8a1e681);
    md5_GG(reg_cache, b, c, d, a,  4, 20, 0xe7d3fbc8);
    md5_GG(reg_cache, a, b, c, d,  9,  5, 0x21e1cde6);
    md5_GG(reg_cache, d, a, b, c, 14,  9, 0xc33707d6);
    md5_GG(reg_cache, c, d, a, b,  3, 14, 0xf4d50d87);
    md5_GG(reg_cache, b, c, d, a,  8, 20, 0x455a14ed);
    md5_GG(reg_cache, a, b, c, d, 13,  5, 0xa9e3e905);
    md5_GG(reg_cache, d, a, b, c,  2,  9, 0xfcefa3f8);
    md5_GG(reg_cache, c, d, a, b,  7, 14, 0x676f02d9);
    md5_GG(reg_cache, b, c, d, a, 12, 20, 0x8d2a4c8a);

    // Round 3
    md5_HH(reg_cache, a, b, c, d,  5,  4, 0xfffa3942);
    md5_HH(reg_cache, d, a, b, c,  8, 11, 0x8771f681);
    md5_HH(reg_cache, c, d, a, b, 11, 16, 0x6d9d6122);
    md5_HH(reg_cache, b, c, d, a, 14, 23, 0xfde5380c);
    md5_HH(reg_cache, a, b, c, d,  1,  4, 0xa4beea44);
    md5_HH(reg_cache, d, a, b, c,  4, 11, 0x4bdecfa9);
    md5_HH(reg_cache, c, d, a, b,  7, 16, 0xf6bb4b60);
    md5_HH(reg_cache, b, c, d, a, 10, 23, 0xbebfbc70);
    md5_HH(reg_cache, a, b, c, d, 13,  4, 0x289b7ec6);
3576 md5_HH(reg_cache, d, a, b, c, 0, 11, 0xeaa127fa); 3577 md5_HH(reg_cache, c, d, a, b, 3, 16, 0xd4ef3085); 3578 md5_HH(reg_cache, b, c, d, a, 6, 23, 0x04881d05); 3579 md5_HH(reg_cache, a, b, c, d, 9, 4, 0xd9d4d039); 3580 md5_HH(reg_cache, d, a, b, c, 12, 11, 0xe6db99e5); 3581 md5_HH(reg_cache, c, d, a, b, 15, 16, 0x1fa27cf8); 3582 md5_HH(reg_cache, b, c, d, a, 2, 23, 0xc4ac5665); 3583 3584 // Round 4 3585 md5_II(reg_cache, a, b, c, d, 0, 6, 0xf4292244); 3586 md5_II(reg_cache, d, a, b, c, 7, 10, 0x432aff97); 3587 md5_II(reg_cache, c, d, a, b, 14, 15, 0xab9423a7); 3588 md5_II(reg_cache, b, c, d, a, 5, 21, 0xfc93a039); 3589 md5_II(reg_cache, a, b, c, d, 12, 6, 0x655b59c3); 3590 md5_II(reg_cache, d, a, b, c, 3, 10, 0x8f0ccc92); 3591 md5_II(reg_cache, c, d, a, b, 10, 15, 0xffeff47d); 3592 md5_II(reg_cache, b, c, d, a, 1, 21, 0x85845dd1); 3593 md5_II(reg_cache, a, b, c, d, 8, 6, 0x6fa87e4f); 3594 md5_II(reg_cache, d, a, b, c, 15, 10, 0xfe2ce6e0); 3595 md5_II(reg_cache, c, d, a, b, 6, 15, 0xa3014314); 3596 md5_II(reg_cache, b, c, d, a, 13, 21, 0x4e0811a1); 3597 md5_II(reg_cache, a, b, c, d, 4, 6, 0xf7537e82); 3598 md5_II(reg_cache, d, a, b, c, 11, 10, 0xbd3af235); 3599 md5_II(reg_cache, c, d, a, b, 2, 15, 0x2ad7d2bb); 3600 md5_II(reg_cache, b, c, d, a, 9, 21, 0xeb86d391); 3601 3602 __ addw(a, state_regs[0], a); 3603 __ ubfx(rscratch2, state_regs[0], 32, 32); 3604 __ addw(b, rscratch2, b); 3605 __ addw(c, state_regs[1], c); 3606 __ ubfx(rscratch4, state_regs[1], 32, 32); 3607 __ addw(d, rscratch4, d); 3608 3609 __ orr(state_regs[0], a, b, Assembler::LSL, 32); 3610 __ orr(state_regs[1], c, d, Assembler::LSL, 32); 3611 3612 if (multi_block) { 3613 __ add(buf, buf, 64); 3614 __ add(ofs, ofs, 64); 3615 __ cmp(ofs, limit); 3616 __ br(Assembler::LE, md5_loop); 3617 __ mov(c_rarg0, ofs); // return ofs 3618 } 3619 3620 // write hash values back in the correct order 3621 __ stp(state_regs[0], state_regs[1], Address(state)); 3622 3623 __ pop(saved_regs, sp); 3624 3625 __ ret(lr); 3626 3627 return start; 3628 } 3629 3630 // Arguments: 3631 // 3632 // Inputs: 3633 // c_rarg0 - byte[] source+offset 3634 // c_rarg1 - int[] SHA.state 3635 // c_rarg2 - int offset 3636 // c_rarg3 - int limit 3637 // 3638 address generate_sha1_implCompress(bool multi_block, const char *name) { 3639 __ align(CodeEntryAlignment); 3640 StubCodeMark mark(this, "StubRoutines", name); 3641 address start = __ pc(); 3642 3643 Register buf = c_rarg0; 3644 Register state = c_rarg1; 3645 Register ofs = c_rarg2; 3646 Register limit = c_rarg3; 3647 3648 Label keys; 3649 Label sha1_loop; 3650 3651 // load the keys into v0..v3 3652 __ adr(rscratch1, keys); 3653 __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1)); 3654 // load 5 words state into v6, v7 3655 __ ldrq(v6, Address(state, 0)); 3656 __ ldrs(v7, Address(state, 16)); 3657 3658 3659 __ BIND(sha1_loop); 3660 // load 64 bytes of data into v16..v19 3661 __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf); 3662 __ rev32(v16, __ T16B, v16); 3663 __ rev32(v17, __ T16B, v17); 3664 __ rev32(v18, __ T16B, v18); 3665 __ rev32(v19, __ T16B, v19); 3666 3667 // do the sha1 3668 __ addv(v4, __ T4S, v16, v0); 3669 __ orr(v20, __ T16B, v6, v6); 3670 3671 FloatRegister d0 = v16; 3672 FloatRegister d1 = v17; 3673 FloatRegister d2 = v18; 3674 FloatRegister d3 = v19; 3675 3676 for (int round = 0; round < 20; round++) { 3677 FloatRegister tmp1 = (round & 1) ? v4 : v5; 3678 FloatRegister tmp2 = (round & 1) ? v21 : v22; 3679 FloatRegister tmp3 = round ? ((round & 1) ? 
v22 : v21) : v7; 3680 FloatRegister tmp4 = (round & 1) ? v5 : v4; 3681 FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3)); 3682 3683 if (round < 16) __ sha1su0(d0, __ T4S, d1, d2); 3684 if (round < 19) __ addv(tmp1, __ T4S, d1, key); 3685 __ sha1h(tmp2, __ T4S, v20); 3686 if (round < 5) 3687 __ sha1c(v20, __ T4S, tmp3, tmp4); 3688 else if (round < 10 || round >= 15) 3689 __ sha1p(v20, __ T4S, tmp3, tmp4); 3690 else 3691 __ sha1m(v20, __ T4S, tmp3, tmp4); 3692 if (round < 16) __ sha1su1(d0, __ T4S, d3); 3693 3694 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 3695 } 3696 3697 __ addv(v7, __ T2S, v7, v21); 3698 __ addv(v6, __ T4S, v6, v20); 3699 3700 if (multi_block) { 3701 __ add(ofs, ofs, 64); 3702 __ cmp(ofs, limit); 3703 __ br(Assembler::LE, sha1_loop); 3704 __ mov(c_rarg0, ofs); // return ofs 3705 } 3706 3707 __ strq(v6, Address(state, 0)); 3708 __ strs(v7, Address(state, 16)); 3709 3710 __ ret(lr); 3711 3712 __ bind(keys); 3713 __ emit_int32(0x5a827999); 3714 __ emit_int32(0x6ed9eba1); 3715 __ emit_int32(0x8f1bbcdc); 3716 __ emit_int32(0xca62c1d6); 3717 3718 return start; 3719 } 3720 3721 3722 // Arguments: 3723 // 3724 // Inputs: 3725 // c_rarg0 - byte[] source+offset 3726 // c_rarg1 - int[] SHA.state 3727 // c_rarg2 - int offset 3728 // c_rarg3 - int limit 3729 // 3730 address generate_sha256_implCompress(bool multi_block, const char *name) { 3731 static const uint32_t round_consts[64] = { 3732 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 3733 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, 3734 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 3735 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 3736 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 3737 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, 3738 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 3739 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, 3740 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 3741 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 3742 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 3743 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, 3744 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 3745 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, 3746 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 3747 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, 3748 }; 3749 __ align(CodeEntryAlignment); 3750 StubCodeMark mark(this, "StubRoutines", name); 3751 address start = __ pc(); 3752 3753 Register buf = c_rarg0; 3754 Register state = c_rarg1; 3755 Register ofs = c_rarg2; 3756 Register limit = c_rarg3; 3757 3758 Label sha1_loop; 3759 3760 __ stpd(v8, v9, __ pre(sp, -32)); 3761 __ stpd(v10, v11, Address(sp, 16)); 3762 3763 // dga == v0 3764 // dgb == v1 3765 // dg0 == v2 3766 // dg1 == v3 3767 // dg2 == v4 3768 // t0 == v6 3769 // t1 == v7 3770 3771 // load 16 keys to v16..v31 3772 __ lea(rscratch1, ExternalAddress((address)round_consts)); 3773 __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64)); 3774 __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64)); 3775 __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64)); 3776 __ ld1(v28, v29, v30, v31, __ T4S, rscratch1); 3777 3778 // load 8 words (256 bits) state 3779 __ ldpq(v0, v1, state); 3780 3781 __ BIND(sha1_loop); 3782 // load 64 bytes of data into v8..v11 3783 __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? 
__ post(buf, 64) : buf); 3784 __ rev32(v8, __ T16B, v8); 3785 __ rev32(v9, __ T16B, v9); 3786 __ rev32(v10, __ T16B, v10); 3787 __ rev32(v11, __ T16B, v11); 3788 3789 __ addv(v6, __ T4S, v8, v16); 3790 __ orr(v2, __ T16B, v0, v0); 3791 __ orr(v3, __ T16B, v1, v1); 3792 3793 FloatRegister d0 = v8; 3794 FloatRegister d1 = v9; 3795 FloatRegister d2 = v10; 3796 FloatRegister d3 = v11; 3797 3798 3799 for (int round = 0; round < 16; round++) { 3800 FloatRegister tmp1 = (round & 1) ? v6 : v7; 3801 FloatRegister tmp2 = (round & 1) ? v7 : v6; 3802 FloatRegister tmp3 = (round & 1) ? v2 : v4; 3803 FloatRegister tmp4 = (round & 1) ? v4 : v2; 3804 3805 if (round < 12) __ sha256su0(d0, __ T4S, d1); 3806 __ orr(v4, __ T16B, v2, v2); 3807 if (round < 15) 3808 __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17)); 3809 __ sha256h(v2, __ T4S, v3, tmp2); 3810 __ sha256h2(v3, __ T4S, v4, tmp2); 3811 if (round < 12) __ sha256su1(d0, __ T4S, d2, d3); 3812 3813 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 3814 } 3815 3816 __ addv(v0, __ T4S, v0, v2); 3817 __ addv(v1, __ T4S, v1, v3); 3818 3819 if (multi_block) { 3820 __ add(ofs, ofs, 64); 3821 __ cmp(ofs, limit); 3822 __ br(Assembler::LE, sha1_loop); 3823 __ mov(c_rarg0, ofs); // return ofs 3824 } 3825 3826 __ ldpd(v10, v11, Address(sp, 16)); 3827 __ ldpd(v8, v9, __ post(sp, 32)); 3828 3829 __ stpq(v0, v1, state); 3830 3831 __ ret(lr); 3832 3833 return start; 3834 } 3835 3836 // Double rounds for sha512. 3837 void sha512_dround(int dr, 3838 FloatRegister vi0, FloatRegister vi1, 3839 FloatRegister vi2, FloatRegister vi3, 3840 FloatRegister vi4, FloatRegister vrc0, 3841 FloatRegister vrc1, FloatRegister vin0, 3842 FloatRegister vin1, FloatRegister vin2, 3843 FloatRegister vin3, FloatRegister vin4) { 3844 if (dr < 36) { 3845 __ ld1(vrc1, __ T2D, __ post(rscratch2, 16)); 3846 } 3847 __ addv(v5, __ T2D, vrc0, vin0); 3848 __ ext(v6, __ T16B, vi2, vi3, 8); 3849 __ ext(v5, __ T16B, v5, v5, 8); 3850 __ ext(v7, __ T16B, vi1, vi2, 8); 3851 __ addv(vi3, __ T2D, vi3, v5); 3852 if (dr < 32) { 3853 __ ext(v5, __ T16B, vin3, vin4, 8); 3854 __ sha512su0(vin0, __ T2D, vin1); 3855 } 3856 __ sha512h(vi3, __ T2D, v6, v7); 3857 if (dr < 32) { 3858 __ sha512su1(vin0, __ T2D, vin2, v5); 3859 } 3860 __ addv(vi4, __ T2D, vi1, vi3); 3861 __ sha512h2(vi3, __ T2D, vi1, vi0); 3862 } 3863 3864 // Arguments: 3865 // 3866 // Inputs: 3867 // c_rarg0 - byte[] source+offset 3868 // c_rarg1 - int[] SHA.state 3869 // c_rarg2 - int offset 3870 // c_rarg3 - int limit 3871 // 3872 address generate_sha512_implCompress(bool multi_block, const char *name) { 3873 static const uint64_t round_consts[80] = { 3874 0x428A2F98D728AE22L, 0x7137449123EF65CDL, 0xB5C0FBCFEC4D3B2FL, 3875 0xE9B5DBA58189DBBCL, 0x3956C25BF348B538L, 0x59F111F1B605D019L, 3876 0x923F82A4AF194F9BL, 0xAB1C5ED5DA6D8118L, 0xD807AA98A3030242L, 3877 0x12835B0145706FBEL, 0x243185BE4EE4B28CL, 0x550C7DC3D5FFB4E2L, 3878 0x72BE5D74F27B896FL, 0x80DEB1FE3B1696B1L, 0x9BDC06A725C71235L, 3879 0xC19BF174CF692694L, 0xE49B69C19EF14AD2L, 0xEFBE4786384F25E3L, 3880 0x0FC19DC68B8CD5B5L, 0x240CA1CC77AC9C65L, 0x2DE92C6F592B0275L, 3881 0x4A7484AA6EA6E483L, 0x5CB0A9DCBD41FBD4L, 0x76F988DA831153B5L, 3882 0x983E5152EE66DFABL, 0xA831C66D2DB43210L, 0xB00327C898FB213FL, 3883 0xBF597FC7BEEF0EE4L, 0xC6E00BF33DA88FC2L, 0xD5A79147930AA725L, 3884 0x06CA6351E003826FL, 0x142929670A0E6E70L, 0x27B70A8546D22FFCL, 3885 0x2E1B21385C26C926L, 0x4D2C6DFC5AC42AEDL, 0x53380D139D95B3DFL, 3886 0x650A73548BAF63DEL, 0x766A0ABB3C77B2A8L, 0x81C2C92E47EDAEE6L, 3887 0x92722C851482353BL, 
0xA2BFE8A14CF10364L, 0xA81A664BBC423001L, 3888 0xC24B8B70D0F89791L, 0xC76C51A30654BE30L, 0xD192E819D6EF5218L, 3889 0xD69906245565A910L, 0xF40E35855771202AL, 0x106AA07032BBD1B8L, 3890 0x19A4C116B8D2D0C8L, 0x1E376C085141AB53L, 0x2748774CDF8EEB99L, 3891 0x34B0BCB5E19B48A8L, 0x391C0CB3C5C95A63L, 0x4ED8AA4AE3418ACBL, 3892 0x5B9CCA4F7763E373L, 0x682E6FF3D6B2B8A3L, 0x748F82EE5DEFB2FCL, 3893 0x78A5636F43172F60L, 0x84C87814A1F0AB72L, 0x8CC702081A6439ECL, 3894 0x90BEFFFA23631E28L, 0xA4506CEBDE82BDE9L, 0xBEF9A3F7B2C67915L, 3895 0xC67178F2E372532BL, 0xCA273ECEEA26619CL, 0xD186B8C721C0C207L, 3896 0xEADA7DD6CDE0EB1EL, 0xF57D4F7FEE6ED178L, 0x06F067AA72176FBAL, 3897 0x0A637DC5A2C898A6L, 0x113F9804BEF90DAEL, 0x1B710B35131C471BL, 3898 0x28DB77F523047D84L, 0x32CAAB7B40C72493L, 0x3C9EBE0A15C9BEBCL, 3899 0x431D67C49C100D4CL, 0x4CC5D4BECB3E42B6L, 0x597F299CFC657E2AL, 3900 0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L 3901 }; 3902 3903 __ align(CodeEntryAlignment); 3904 StubCodeMark mark(this, "StubRoutines", name); 3905 address start = __ pc(); 3906 3907 Register buf = c_rarg0; 3908 Register state = c_rarg1; 3909 Register ofs = c_rarg2; 3910 Register limit = c_rarg3; 3911 3912 __ stpd(v8, v9, __ pre(sp, -64)); 3913 __ stpd(v10, v11, Address(sp, 16)); 3914 __ stpd(v12, v13, Address(sp, 32)); 3915 __ stpd(v14, v15, Address(sp, 48)); 3916 3917 Label sha512_loop; 3918 3919 // load state 3920 __ ld1(v8, v9, v10, v11, __ T2D, state); 3921 3922 // load first 4 round constants 3923 __ lea(rscratch1, ExternalAddress((address)round_consts)); 3924 __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64)); 3925 3926 __ BIND(sha512_loop); 3927 // load 128B of data into v12..v19 3928 __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64)); 3929 __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64)); 3930 __ rev64(v12, __ T16B, v12); 3931 __ rev64(v13, __ T16B, v13); 3932 __ rev64(v14, __ T16B, v14); 3933 __ rev64(v15, __ T16B, v15); 3934 __ rev64(v16, __ T16B, v16); 3935 __ rev64(v17, __ T16B, v17); 3936 __ rev64(v18, __ T16B, v18); 3937 __ rev64(v19, __ T16B, v19); 3938 3939 __ mov(rscratch2, rscratch1); 3940 3941 __ mov(v0, __ T16B, v8); 3942 __ mov(v1, __ T16B, v9); 3943 __ mov(v2, __ T16B, v10); 3944 __ mov(v3, __ T16B, v11); 3945 3946 sha512_dround( 0, v0, v1, v2, v3, v4, v20, v24, v12, v13, v19, v16, v17); 3947 sha512_dround( 1, v3, v0, v4, v2, v1, v21, v25, v13, v14, v12, v17, v18); 3948 sha512_dround( 2, v2, v3, v1, v4, v0, v22, v26, v14, v15, v13, v18, v19); 3949 sha512_dround( 3, v4, v2, v0, v1, v3, v23, v27, v15, v16, v14, v19, v12); 3950 sha512_dround( 4, v1, v4, v3, v0, v2, v24, v28, v16, v17, v15, v12, v13); 3951 sha512_dround( 5, v0, v1, v2, v3, v4, v25, v29, v17, v18, v16, v13, v14); 3952 sha512_dround( 6, v3, v0, v4, v2, v1, v26, v30, v18, v19, v17, v14, v15); 3953 sha512_dround( 7, v2, v3, v1, v4, v0, v27, v31, v19, v12, v18, v15, v16); 3954 sha512_dround( 8, v4, v2, v0, v1, v3, v28, v24, v12, v13, v19, v16, v17); 3955 sha512_dround( 9, v1, v4, v3, v0, v2, v29, v25, v13, v14, v12, v17, v18); 3956 sha512_dround(10, v0, v1, v2, v3, v4, v30, v26, v14, v15, v13, v18, v19); 3957 sha512_dround(11, v3, v0, v4, v2, v1, v31, v27, v15, v16, v14, v19, v12); 3958 sha512_dround(12, v2, v3, v1, v4, v0, v24, v28, v16, v17, v15, v12, v13); 3959 sha512_dround(13, v4, v2, v0, v1, v3, v25, v29, v17, v18, v16, v13, v14); 3960 sha512_dround(14, v1, v4, v3, v0, v2, v26, v30, v18, v19, v17, v14, v15); 3961 sha512_dround(15, v0, v1, v2, v3, v4, v27, v31, v19, v12, v18, v15, v16); 3962 sha512_dround(16, v3, v0, v4, v2, v1, v28, v24, v12, 
v13, v19, v16, v17); 3963 sha512_dround(17, v2, v3, v1, v4, v0, v29, v25, v13, v14, v12, v17, v18); 3964 sha512_dround(18, v4, v2, v0, v1, v3, v30, v26, v14, v15, v13, v18, v19); 3965 sha512_dround(19, v1, v4, v3, v0, v2, v31, v27, v15, v16, v14, v19, v12); 3966 sha512_dround(20, v0, v1, v2, v3, v4, v24, v28, v16, v17, v15, v12, v13); 3967 sha512_dround(21, v3, v0, v4, v2, v1, v25, v29, v17, v18, v16, v13, v14); 3968 sha512_dround(22, v2, v3, v1, v4, v0, v26, v30, v18, v19, v17, v14, v15); 3969 sha512_dround(23, v4, v2, v0, v1, v3, v27, v31, v19, v12, v18, v15, v16); 3970 sha512_dround(24, v1, v4, v3, v0, v2, v28, v24, v12, v13, v19, v16, v17); 3971 sha512_dround(25, v0, v1, v2, v3, v4, v29, v25, v13, v14, v12, v17, v18); 3972 sha512_dround(26, v3, v0, v4, v2, v1, v30, v26, v14, v15, v13, v18, v19); 3973 sha512_dround(27, v2, v3, v1, v4, v0, v31, v27, v15, v16, v14, v19, v12); 3974 sha512_dround(28, v4, v2, v0, v1, v3, v24, v28, v16, v17, v15, v12, v13); 3975 sha512_dround(29, v1, v4, v3, v0, v2, v25, v29, v17, v18, v16, v13, v14); 3976 sha512_dround(30, v0, v1, v2, v3, v4, v26, v30, v18, v19, v17, v14, v15); 3977 sha512_dround(31, v3, v0, v4, v2, v1, v27, v31, v19, v12, v18, v15, v16); 3978 sha512_dround(32, v2, v3, v1, v4, v0, v28, v24, v12, v0, v0, v0, v0); 3979 sha512_dround(33, v4, v2, v0, v1, v3, v29, v25, v13, v0, v0, v0, v0); 3980 sha512_dround(34, v1, v4, v3, v0, v2, v30, v26, v14, v0, v0, v0, v0); 3981 sha512_dround(35, v0, v1, v2, v3, v4, v31, v27, v15, v0, v0, v0, v0); 3982 sha512_dround(36, v3, v0, v4, v2, v1, v24, v0, v16, v0, v0, v0, v0); 3983 sha512_dround(37, v2, v3, v1, v4, v0, v25, v0, v17, v0, v0, v0, v0); 3984 sha512_dround(38, v4, v2, v0, v1, v3, v26, v0, v18, v0, v0, v0, v0); 3985 sha512_dround(39, v1, v4, v3, v0, v2, v27, v0, v19, v0, v0, v0, v0); 3986 3987 __ addv(v8, __ T2D, v8, v0); 3988 __ addv(v9, __ T2D, v9, v1); 3989 __ addv(v10, __ T2D, v10, v2); 3990 __ addv(v11, __ T2D, v11, v3); 3991 3992 if (multi_block) { 3993 __ add(ofs, ofs, 128); 3994 __ cmp(ofs, limit); 3995 __ br(Assembler::LE, sha512_loop); 3996 __ mov(c_rarg0, ofs); // return ofs 3997 } 3998 3999 __ st1(v8, v9, v10, v11, __ T2D, state); 4000 4001 __ ldpd(v14, v15, Address(sp, 48)); 4002 __ ldpd(v12, v13, Address(sp, 32)); 4003 __ ldpd(v10, v11, Address(sp, 16)); 4004 __ ldpd(v8, v9, __ post(sp, 64)); 4005 4006 __ ret(lr); 4007 4008 return start; 4009 } 4010 4011 // Arguments: 4012 // 4013 // Inputs: 4014 // c_rarg0 - byte[] source+offset 4015 // c_rarg1 - byte[] SHA.state 4016 // c_rarg2 - int block_size 4017 // c_rarg3 - int offset 4018 // c_rarg4 - int limit 4019 // 4020 address generate_sha3_implCompress(bool multi_block, const char *name) { 4021 static const uint64_t round_consts[24] = { 4022 0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL, 4023 0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L, 4024 0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL, 4025 0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL, 4026 0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L, 4027 0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L, 4028 0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L, 4029 0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L 4030 }; 4031 4032 __ align(CodeEntryAlignment); 4033 StubCodeMark mark(this, "StubRoutines", name); 4034 address start = __ pc(); 4035 4036 Register buf = c_rarg0; 4037 Register state = c_rarg1; 4038 Register block_size = c_rarg2; 4039 Register ofs = c_rarg3; 4040 Register 
limit = c_rarg4; 4041 4042 Label sha3_loop, rounds24_loop; 4043 Label sha3_512_or_sha3_384, shake128; 4044 4045 __ stpd(v8, v9, __ pre(sp, -64)); 4046 __ stpd(v10, v11, Address(sp, 16)); 4047 __ stpd(v12, v13, Address(sp, 32)); 4048 __ stpd(v14, v15, Address(sp, 48)); 4049 4050 // load state 4051 __ add(rscratch1, state, 32); 4052 __ ld1(v0, v1, v2, v3, __ T1D, state); 4053 __ ld1(v4, v5, v6, v7, __ T1D, __ post(rscratch1, 32)); 4054 __ ld1(v8, v9, v10, v11, __ T1D, __ post(rscratch1, 32)); 4055 __ ld1(v12, v13, v14, v15, __ T1D, __ post(rscratch1, 32)); 4056 __ ld1(v16, v17, v18, v19, __ T1D, __ post(rscratch1, 32)); 4057 __ ld1(v20, v21, v22, v23, __ T1D, __ post(rscratch1, 32)); 4058 __ ld1(v24, __ T1D, rscratch1); 4059 4060 __ BIND(sha3_loop); 4061 4062 // 24 keccak rounds 4063 __ movw(rscratch2, 24); 4064 4065 // load round_constants base 4066 __ lea(rscratch1, ExternalAddress((address) round_consts)); 4067 4068 // load input 4069 __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32)); 4070 __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24)); 4071 __ eor(v0, __ T8B, v0, v25); 4072 __ eor(v1, __ T8B, v1, v26); 4073 __ eor(v2, __ T8B, v2, v27); 4074 __ eor(v3, __ T8B, v3, v28); 4075 __ eor(v4, __ T8B, v4, v29); 4076 __ eor(v5, __ T8B, v5, v30); 4077 __ eor(v6, __ T8B, v6, v31); 4078 4079 // block_size == 72, SHA3-512; block_size == 104, SHA3-384 4080 __ tbz(block_size, 7, sha3_512_or_sha3_384); 4081 4082 __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32)); 4083 __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24)); 4084 __ eor(v7, __ T8B, v7, v25); 4085 __ eor(v8, __ T8B, v8, v26); 4086 __ eor(v9, __ T8B, v9, v27); 4087 __ eor(v10, __ T8B, v10, v28); 4088 __ eor(v11, __ T8B, v11, v29); 4089 __ eor(v12, __ T8B, v12, v30); 4090 __ eor(v13, __ T8B, v13, v31); 4091 4092 __ ld1(v25, v26, v27, __ T8B, __ post(buf, 24)); 4093 __ eor(v14, __ T8B, v14, v25); 4094 __ eor(v15, __ T8B, v15, v26); 4095 __ eor(v16, __ T8B, v16, v27); 4096 4097 // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256 4098 __ andw(c_rarg5, block_size, 48); 4099 __ cbzw(c_rarg5, rounds24_loop); 4100 4101 __ tbnz(block_size, 5, shake128); 4102 // block_size == 144, bit5 == 0, SHA3-244 4103 __ ldrd(v28, __ post(buf, 8)); 4104 __ eor(v17, __ T8B, v17, v28); 4105 __ b(rounds24_loop); 4106 4107 __ BIND(shake128); 4108 __ ld1(v28, v29, v30, v31, __ T8B, __ post(buf, 32)); 4109 __ eor(v17, __ T8B, v17, v28); 4110 __ eor(v18, __ T8B, v18, v29); 4111 __ eor(v19, __ T8B, v19, v30); 4112 __ eor(v20, __ T8B, v20, v31); 4113 __ b(rounds24_loop); // block_size == 168, SHAKE128 4114 4115 __ BIND(sha3_512_or_sha3_384); 4116 __ ld1(v25, v26, __ T8B, __ post(buf, 16)); 4117 __ eor(v7, __ T8B, v7, v25); 4118 __ eor(v8, __ T8B, v8, v26); 4119 __ tbz(block_size, 5, rounds24_loop); // SHA3-512 4120 4121 // SHA3-384 4122 __ ld1(v27, v28, v29, v30, __ T8B, __ post(buf, 32)); 4123 __ eor(v9, __ T8B, v9, v27); 4124 __ eor(v10, __ T8B, v10, v28); 4125 __ eor(v11, __ T8B, v11, v29); 4126 __ eor(v12, __ T8B, v12, v30); 4127 4128 __ BIND(rounds24_loop); 4129 __ subw(rscratch2, rscratch2, 1); 4130 4131 __ eor3(v29, __ T16B, v4, v9, v14); 4132 __ eor3(v26, __ T16B, v1, v6, v11); 4133 __ eor3(v28, __ T16B, v3, v8, v13); 4134 __ eor3(v25, __ T16B, v0, v5, v10); 4135 __ eor3(v27, __ T16B, v2, v7, v12); 4136 __ eor3(v29, __ T16B, v29, v19, v24); 4137 __ eor3(v26, __ T16B, v26, v16, v21); 4138 __ eor3(v28, __ T16B, v28, v18, v23); 4139 __ eor3(v25, __ T16B, v25, v15, v20); 4140 __ eor3(v27, __ T16B, v27, v17, v22); 4141 4142 __ rax1(v30, __ T2D, v29, v26); 
4143 __ rax1(v26, __ T2D, v26, v28); 4144 __ rax1(v28, __ T2D, v28, v25); 4145 __ rax1(v25, __ T2D, v25, v27); 4146 __ rax1(v27, __ T2D, v27, v29); 4147 4148 __ eor(v0, __ T16B, v0, v30); 4149 __ xar(v29, __ T2D, v1, v25, (64 - 1)); 4150 __ xar(v1, __ T2D, v6, v25, (64 - 44)); 4151 __ xar(v6, __ T2D, v9, v28, (64 - 20)); 4152 __ xar(v9, __ T2D, v22, v26, (64 - 61)); 4153 __ xar(v22, __ T2D, v14, v28, (64 - 39)); 4154 __ xar(v14, __ T2D, v20, v30, (64 - 18)); 4155 __ xar(v31, __ T2D, v2, v26, (64 - 62)); 4156 __ xar(v2, __ T2D, v12, v26, (64 - 43)); 4157 __ xar(v12, __ T2D, v13, v27, (64 - 25)); 4158 __ xar(v13, __ T2D, v19, v28, (64 - 8)); 4159 __ xar(v19, __ T2D, v23, v27, (64 - 56)); 4160 __ xar(v23, __ T2D, v15, v30, (64 - 41)); 4161 __ xar(v15, __ T2D, v4, v28, (64 - 27)); 4162 __ xar(v28, __ T2D, v24, v28, (64 - 14)); 4163 __ xar(v24, __ T2D, v21, v25, (64 - 2)); 4164 __ xar(v8, __ T2D, v8, v27, (64 - 55)); 4165 __ xar(v4, __ T2D, v16, v25, (64 - 45)); 4166 __ xar(v16, __ T2D, v5, v30, (64 - 36)); 4167 __ xar(v5, __ T2D, v3, v27, (64 - 28)); 4168 __ xar(v27, __ T2D, v18, v27, (64 - 21)); 4169 __ xar(v3, __ T2D, v17, v26, (64 - 15)); 4170 __ xar(v25, __ T2D, v11, v25, (64 - 10)); 4171 __ xar(v26, __ T2D, v7, v26, (64 - 6)); 4172 __ xar(v30, __ T2D, v10, v30, (64 - 3)); 4173 4174 __ bcax(v20, __ T16B, v31, v22, v8); 4175 __ bcax(v21, __ T16B, v8, v23, v22); 4176 __ bcax(v22, __ T16B, v22, v24, v23); 4177 __ bcax(v23, __ T16B, v23, v31, v24); 4178 __ bcax(v24, __ T16B, v24, v8, v31); 4179 4180 __ ld1r(v31, __ T2D, __ post(rscratch1, 8)); 4181 4182 __ bcax(v17, __ T16B, v25, v19, v3); 4183 __ bcax(v18, __ T16B, v3, v15, v19); 4184 __ bcax(v19, __ T16B, v19, v16, v15); 4185 __ bcax(v15, __ T16B, v15, v25, v16); 4186 __ bcax(v16, __ T16B, v16, v3, v25); 4187 4188 __ bcax(v10, __ T16B, v29, v12, v26); 4189 __ bcax(v11, __ T16B, v26, v13, v12); 4190 __ bcax(v12, __ T16B, v12, v14, v13); 4191 __ bcax(v13, __ T16B, v13, v29, v14); 4192 __ bcax(v14, __ T16B, v14, v26, v29); 4193 4194 __ bcax(v7, __ T16B, v30, v9, v4); 4195 __ bcax(v8, __ T16B, v4, v5, v9); 4196 __ bcax(v9, __ T16B, v9, v6, v5); 4197 __ bcax(v5, __ T16B, v5, v30, v6); 4198 __ bcax(v6, __ T16B, v6, v4, v30); 4199 4200 __ bcax(v3, __ T16B, v27, v0, v28); 4201 __ bcax(v4, __ T16B, v28, v1, v0); 4202 __ bcax(v0, __ T16B, v0, v2, v1); 4203 __ bcax(v1, __ T16B, v1, v27, v2); 4204 __ bcax(v2, __ T16B, v2, v28, v27); 4205 4206 __ eor(v0, __ T16B, v0, v31); 4207 4208 __ cbnzw(rscratch2, rounds24_loop); 4209 4210 if (multi_block) { 4211 __ add(ofs, ofs, block_size); 4212 __ cmp(ofs, limit); 4213 __ br(Assembler::LE, sha3_loop); 4214 __ mov(c_rarg0, ofs); // return ofs 4215 } 4216 4217 __ st1(v0, v1, v2, v3, __ T1D, __ post(state, 32)); 4218 __ st1(v4, v5, v6, v7, __ T1D, __ post(state, 32)); 4219 __ st1(v8, v9, v10, v11, __ T1D, __ post(state, 32)); 4220 __ st1(v12, v13, v14, v15, __ T1D, __ post(state, 32)); 4221 __ st1(v16, v17, v18, v19, __ T1D, __ post(state, 32)); 4222 __ st1(v20, v21, v22, v23, __ T1D, __ post(state, 32)); 4223 __ st1(v24, __ T1D, state); 4224 4225 __ ldpd(v14, v15, Address(sp, 48)); 4226 __ ldpd(v12, v13, Address(sp, 32)); 4227 __ ldpd(v10, v11, Address(sp, 16)); 4228 __ ldpd(v8, v9, __ post(sp, 64)); 4229 4230 __ ret(lr); 4231 4232 return start; 4233 } 4234 4235 /** 4236 * Arguments: 4237 * 4238 * Inputs: 4239 * c_rarg0 - int crc 4240 * c_rarg1 - byte* buf 4241 * c_rarg2 - int length 4242 * 4243 * Output: 4244 * rax - int crc result 4245 */ 4246 address generate_updateBytesCRC32() { 4247 assert(UseCRC32Intrinsics, 
"what are we doing here?"); 4248 4249 __ align(CodeEntryAlignment); 4250 StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32"); 4251 4252 address start = __ pc(); 4253 4254 const Register crc = c_rarg0; // crc 4255 const Register buf = c_rarg1; // source java byte array address 4256 const Register len = c_rarg2; // length 4257 const Register table0 = c_rarg3; // crc_table address 4258 const Register table1 = c_rarg4; 4259 const Register table2 = c_rarg5; 4260 const Register table3 = c_rarg6; 4261 const Register tmp3 = c_rarg7; 4262 4263 BLOCK_COMMENT("Entry:"); 4264 __ enter(); // required for proper stackwalking of RuntimeStub frame 4265 4266 __ kernel_crc32(crc, buf, len, 4267 table0, table1, table2, table3, rscratch1, rscratch2, tmp3); 4268 4269 __ leave(); // required for proper stackwalking of RuntimeStub frame 4270 __ ret(lr); 4271 4272 return start; 4273 } 4274 4275 // ChaCha20 block function. This version parallelizes by loading 4276 // individual 32-bit state elements into vectors for four blocks 4277 // (e.g. all four blocks' worth of state[0] in one register, etc.) 4278 // 4279 // state (int[16]) = c_rarg0 4280 // keystream (byte[1024]) = c_rarg1 4281 // return - number of bytes of keystream (always 256) 4282 address generate_chacha20Block_blockpar() { 4283 Label L_twoRounds, L_cc20_const; 4284 // The constant data is broken into two 128-bit segments to be loaded 4285 // onto FloatRegisters. The first 128 bits are a counter add overlay 4286 // that adds +0/+1/+2/+3 to the vector holding replicated state[12]. 4287 // The second 128-bits is a table constant used for 8-bit left rotations. 4288 __ BIND(L_cc20_const); 4289 __ emit_int64(0x0000000100000000UL); 4290 __ emit_int64(0x0000000300000002UL); 4291 __ emit_int64(0x0605040702010003UL); 4292 __ emit_int64(0x0E0D0C0F0A09080BUL); 4293 4294 __ align(CodeEntryAlignment); 4295 StubCodeMark mark(this, "StubRoutines", "chacha20Block"); 4296 address start = __ pc(); 4297 __ enter(); 4298 4299 int i, j; 4300 const Register state = c_rarg0; 4301 const Register keystream = c_rarg1; 4302 const Register loopCtr = r10; 4303 const Register tmpAddr = r11; 4304 4305 const FloatRegister stateFirst = v0; 4306 const FloatRegister stateSecond = v1; 4307 const FloatRegister stateThird = v2; 4308 const FloatRegister stateFourth = v3; 4309 const FloatRegister origCtrState = v28; 4310 const FloatRegister scratch = v29; 4311 const FloatRegister lrot8Tbl = v30; 4312 4313 // Organize SIMD registers in an array that facilitates 4314 // putting repetitive opcodes into loop structures. It is 4315 // important that each grouping of 4 registers is monotonically 4316 // increasing to support the requirements of multi-register 4317 // instructions (e.g. ld4r, st4, etc.) 4318 const FloatRegister workSt[16] = { 4319 v4, v5, v6, v7, v16, v17, v18, v19, 4320 v20, v21, v22, v23, v24, v25, v26, v27 4321 }; 4322 4323 // Load from memory and interlace across 16 SIMD registers, 4324 // With each word from memory being broadcast to all lanes of 4325 // each successive SIMD register. 4326 // Addr(0) -> All lanes in workSt[i] 4327 // Addr(4) -> All lanes workSt[i + 1], etc. 4328 __ mov(tmpAddr, state); 4329 for (i = 0; i < 16; i += 4) { 4330 __ ld4r(workSt[i], workSt[i + 1], workSt[i + 2], workSt[i + 3], __ T4S, 4331 __ post(tmpAddr, 16)); 4332 } 4333 4334 // Pull in constant data. The first 16 bytes are the add overlay 4335 // which is applied to the vector holding the counter (state[12]). 
4336 // The second 16 bytes is the index register for the 8-bit left 4337 // rotation tbl instruction. 4338 __ adr(tmpAddr, L_cc20_const); 4339 __ ldpq(origCtrState, lrot8Tbl, Address(tmpAddr)); 4340 __ addv(workSt[12], __ T4S, workSt[12], origCtrState); 4341 4342 // Set up the 10 iteration loop and perform all 8 quarter round ops 4343 __ mov(loopCtr, 10); 4344 __ BIND(L_twoRounds); 4345 4346 __ cc20_quarter_round(workSt[0], workSt[4], workSt[8], workSt[12], 4347 scratch, lrot8Tbl); 4348 __ cc20_quarter_round(workSt[1], workSt[5], workSt[9], workSt[13], 4349 scratch, lrot8Tbl); 4350 __ cc20_quarter_round(workSt[2], workSt[6], workSt[10], workSt[14], 4351 scratch, lrot8Tbl); 4352 __ cc20_quarter_round(workSt[3], workSt[7], workSt[11], workSt[15], 4353 scratch, lrot8Tbl); 4354 4355 __ cc20_quarter_round(workSt[0], workSt[5], workSt[10], workSt[15], 4356 scratch, lrot8Tbl); 4357 __ cc20_quarter_round(workSt[1], workSt[6], workSt[11], workSt[12], 4358 scratch, lrot8Tbl); 4359 __ cc20_quarter_round(workSt[2], workSt[7], workSt[8], workSt[13], 4360 scratch, lrot8Tbl); 4361 __ cc20_quarter_round(workSt[3], workSt[4], workSt[9], workSt[14], 4362 scratch, lrot8Tbl); 4363 4364 // Decrement and iterate 4365 __ sub(loopCtr, loopCtr, 1); 4366 __ cbnz(loopCtr, L_twoRounds); 4367 4368 __ mov(tmpAddr, state); 4369 4370 // Add the starting state back to the post-loop keystream 4371 // state. We read/interlace the state array from memory into 4372 // 4 registers similar to what we did in the beginning. Then 4373 // add the counter overlay onto workSt[12] at the end. 4374 for (i = 0; i < 16; i += 4) { 4375 __ ld4r(stateFirst, stateSecond, stateThird, stateFourth, __ T4S, 4376 __ post(tmpAddr, 16)); 4377 __ addv(workSt[i], __ T4S, workSt[i], stateFirst); 4378 __ addv(workSt[i + 1], __ T4S, workSt[i + 1], stateSecond); 4379 __ addv(workSt[i + 2], __ T4S, workSt[i + 2], stateThird); 4380 __ addv(workSt[i + 3], __ T4S, workSt[i + 3], stateFourth); 4381 } 4382 __ addv(workSt[12], __ T4S, workSt[12], origCtrState); // Add ctr mask 4383 4384 // Write to key stream, storing the same element out of workSt[0..15] 4385 // to consecutive 4-byte offsets in the key stream buffer, then repeating 4386 // for the next element position. 
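    // A note on the resulting layout: lane b of every workSt register
    // belongs to parallel block b (the ld4r broadcast plus the +0/+1/+2/+3
    // counter overlay set this up). For each lane i, the st4 below stores
    // element i of four consecutive registers into 16 contiguous bytes, so
    // bytes [64 * i, 64 * i + 64) of the keystream are
    //
    //   workSt[0][i], workSt[1][i], ..., workSt[15][i]
    //
    // i.e. four complete 64-byte ChaCha20 blocks laid out back to back.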
4387 for (i = 0; i < 4; i++) { 4388 for (j = 0; j < 16; j += 4) { 4389 __ st4(workSt[j], workSt[j + 1], workSt[j + 2], workSt[j + 3], __ S, i, 4390 __ post(keystream, 16)); 4391 } 4392 } 4393 4394 __ mov(r0, 256); // Return length of output keystream 4395 __ leave(); 4396 __ ret(lr); 4397 4398 return start; 4399 } 4400 4401 /** 4402 * Arguments: 4403 * 4404 * Inputs: 4405 * c_rarg0 - int crc 4406 * c_rarg1 - byte* buf 4407 * c_rarg2 - int length 4408 * c_rarg3 - int* table 4409 * 4410 * Output: 4411 * r0 - int crc result 4412 */ 4413 address generate_updateBytesCRC32C() { 4414 assert(UseCRC32CIntrinsics, "what are we doing here?"); 4415 4416 __ align(CodeEntryAlignment); 4417 StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C"); 4418 4419 address start = __ pc(); 4420 4421 const Register crc = c_rarg0; // crc 4422 const Register buf = c_rarg1; // source java byte array address 4423 const Register len = c_rarg2; // length 4424 const Register table0 = c_rarg3; // crc_table address 4425 const Register table1 = c_rarg4; 4426 const Register table2 = c_rarg5; 4427 const Register table3 = c_rarg6; 4428 const Register tmp3 = c_rarg7; 4429 4430 BLOCK_COMMENT("Entry:"); 4431 __ enter(); // required for proper stackwalking of RuntimeStub frame 4432 4433 __ kernel_crc32c(crc, buf, len, 4434 table0, table1, table2, table3, rscratch1, rscratch2, tmp3); 4435 4436 __ leave(); // required for proper stackwalking of RuntimeStub frame 4437 __ ret(lr); 4438 4439 return start; 4440 } 4441 4442 /*** 4443 * Arguments: 4444 * 4445 * Inputs: 4446 * c_rarg0 - int adler 4447 * c_rarg1 - byte* buff 4448 * c_rarg2 - int len 4449 * 4450 * Output: 4451 * c_rarg0 - int adler result 4452 */ 4453 address generate_updateBytesAdler32() { 4454 __ align(CodeEntryAlignment); 4455 StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32"); 4456 address start = __ pc(); 4457 4458 Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1; 4459 4460 // Aliases 4461 Register adler = c_rarg0; 4462 Register s1 = c_rarg0; 4463 Register s2 = c_rarg3; 4464 Register buff = c_rarg1; 4465 Register len = c_rarg2; 4466 Register nmax = r4; 4467 Register base = r5; 4468 Register count = r6; 4469 Register temp0 = rscratch1; 4470 Register temp1 = rscratch2; 4471 FloatRegister vbytes = v0; 4472 FloatRegister vs1acc = v1; 4473 FloatRegister vs2acc = v2; 4474 FloatRegister vtable = v3; 4475 4476 // Max number of bytes we can process before having to take the mod 4477 // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 4478 uint64_t BASE = 0xfff1; 4479 uint64_t NMAX = 0x15B0; 4480 4481 __ mov(base, BASE); 4482 __ mov(nmax, NMAX); 4483 4484 // Load accumulation coefficients for the upper 16 bits 4485 __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table)); 4486 __ ld1(vtable, __ T16B, Address(temp0)); 4487 4488 // s1 is initialized to the lower 16 bits of adler 4489 // s2 is initialized to the upper 16 bits of adler 4490 __ ubfx(s2, adler, 16, 16); // s2 = ((adler >> 16) & 0xffff) 4491 __ uxth(s1, adler); // s1 = (adler & 0xffff) 4492 4493 // The pipelined loop needs at least 16 elements for 1 iteration 4494 // It does check this, but it is more effective to skip to the cleanup loop 4495 __ cmp(len, (u1)16); 4496 __ br(Assembler::HS, L_nmax); 4497 __ cbz(len, L_combine); 4498 4499 __ bind(L_simple_by1_loop); 4500 __ ldrb(temp0, Address(__ post(buff, 1))); 4501 __ add(s1, s1, temp0); 4502 __ add(s2, s2, s1); 4503 __ subs(len, len, 
1);
    __ br(Assembler::HI, L_simple_by1_loop);

    // The reductions in this stub use 2^16 == 15 (mod BASE), i.e.
    // s == (s >> 16) * 15 + (s & 0xffff) (mod BASE). One or two folding
    // steps bring the value below 2 * BASE, and a conditional subtract
    // (subs/csel) completes the modulo.

    // s1 = s1 % BASE
    __ subs(temp0, s1, base);
    __ csel(s1, temp0, s1, Assembler::HS);

    // s2 = s2 % BASE
    __ lsr(temp0, s2, 16);
    __ lsl(temp1, temp0, 4);
    __ sub(temp1, temp1, temp0);
    __ add(s2, temp1, s2, ext::uxth);

    __ subs(temp0, s2, base);
    __ csel(s2, temp0, s2, Assembler::HS);

    __ b(L_combine);

    __ bind(L_nmax);
    __ subs(len, len, nmax);
    __ sub(count, nmax, 16);
    __ br(Assembler::LO, L_by16);

    __ bind(L_nmax_loop);

    generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
                                      vbytes, vs1acc, vs2acc, vtable);

    __ subs(count, count, 16);
    __ br(Assembler::HS, L_nmax_loop);

    // s1 = s1 % BASE
    __ lsr(temp0, s1, 16);
    __ lsl(temp1, temp0, 4);
    __ sub(temp1, temp1, temp0);
    __ add(temp1, temp1, s1, ext::uxth);

    __ lsr(temp0, temp1, 16);
    __ lsl(s1, temp0, 4);
    __ sub(s1, s1, temp0);
    __ add(s1, s1, temp1, ext::uxth);

    __ subs(temp0, s1, base);
    __ csel(s1, temp0, s1, Assembler::HS);

    // s2 = s2 % BASE
    __ lsr(temp0, s2, 16);
    __ lsl(temp1, temp0, 4);
    __ sub(temp1, temp1, temp0);
    __ add(temp1, temp1, s2, ext::uxth);

    __ lsr(temp0, temp1, 16);
    __ lsl(s2, temp0, 4);
    __ sub(s2, s2, temp0);
    __ add(s2, s2, temp1, ext::uxth);

    __ subs(temp0, s2, base);
    __ csel(s2, temp0, s2, Assembler::HS);

    __ subs(len, len, nmax);
    __ sub(count, nmax, 16);
    __ br(Assembler::HS, L_nmax_loop);

    __ bind(L_by16);
    __ adds(len, len, count);
    __ br(Assembler::LO, L_by1);

    __ bind(L_by16_loop);

    generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
                                      vbytes, vs1acc, vs2acc, vtable);

    __ subs(len, len, 16);
    __ br(Assembler::HS, L_by16_loop);

    __ bind(L_by1);
    __ adds(len, len, 15);
    __ br(Assembler::LO, L_do_mod);

    __ bind(L_by1_loop);
    __ ldrb(temp0, Address(__ post(buff, 1)));
    __ add(s1, temp0, s1);
    __ add(s2, s2, s1);
    __ subs(len, len, 1);
    __ br(Assembler::HS, L_by1_loop);

    __ bind(L_do_mod);
    // s1 = s1 % BASE
    __ lsr(temp0, s1, 16);
    __ lsl(temp1, temp0, 4);
    __ sub(temp1, temp1, temp0);
    __ add(temp1, temp1, s1, ext::uxth);

    __ lsr(temp0, temp1, 16);
    __ lsl(s1, temp0, 4);
    __ sub(s1, s1, temp0);
    __ add(s1, s1, temp1, ext::uxth);

    __ subs(temp0, s1, base);
    __ csel(s1, temp0, s1, Assembler::HS);

    // s2 = s2 % BASE
    __ lsr(temp0, s2, 16);
    __ lsl(temp1, temp0, 4);
    __ sub(temp1, temp1, temp0);
    __ add(temp1, temp1, s2, ext::uxth);

    __ lsr(temp0, temp1, 16);
    __ lsl(s2, temp0, 4);
    __ sub(s2, s2, temp0);
    __ add(s2, s2, temp1, ext::uxth);

    __ subs(temp0, s2, base);
    __ csel(s2, temp0, s2, Assembler::HS);

    // Combine lower bits and higher bits
    __ bind(L_combine);
    __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)

    __ ret(lr);

    return start;
  }
  void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff,
                                         Register temp0, Register temp1, FloatRegister vbytes,
                                         FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) {
    // Below is a vectorized implementation of updating s1 and s2 for 16 bytes.
    // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration.
    // In non-vectorized code, we update s1 and s2 as:
    //   s1 <- s1 + b1
    //   s2 <- s2 + s1
    //   s1 <- s1 + b2
    //   s2 <- s2 + s1
    //   ...
    //   s1 <- s1 + b16
    //   s2 <- s2 + s1
    // Putting the above assignments together, we have:
    //   s1_new = s1 + b1 + b2 + ... + b16
    //   s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16)
    //          = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1)
    //          = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1)
    __ ld1(vbytes, __ T16B, Address(__ post(buff, 16)));

    // s2 = s2 + s1 * 16
    __ add(s2, s2, s1, Assembler::LSL, 4);

    // vs1acc = b1 + b2 + b3 + ... + b16
    // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1)
    __ umullv(vs2acc, __ T8B, vtable, vbytes);
    __ umlalv(vs2acc, __ T16B, vtable, vbytes);
    __ uaddlv(vs1acc, __ T16B, vbytes);
    __ uaddlv(vs2acc, __ T8H, vs2acc);

    // s1 = s1 + vs1acc, s2 = s2 + vs2acc
    __ fmovd(temp0, vs1acc);
    __ fmovd(temp1, vs2acc);
    __ add(s1, s1, temp0);
    __ add(s2, s2, temp1);
  }

  /**
   * Arguments:
   *
   * Input:
   *   c_rarg0 - x address
   *   c_rarg1 - x length
   *   c_rarg2 - y address
   *   c_rarg3 - y length
   *   c_rarg4 - z address
   *   c_rarg5 - z length
   */
  address generate_multiplyToLen() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "multiplyToLen");

    address start = __ pc();
    const Register x = r0;
    const Register xlen = r1;
    const Register y = r2;
    const Register ylen = r3;
    const Register z = r4;
    const Register zlen = r5;

    const Register tmp1 = r10;
    const Register tmp2 = r11;
    const Register tmp3 = r12;
    const Register tmp4 = r13;
    const Register tmp5 = r14;
    const Register tmp6 = r15;
    const Register tmp7 = r16;

    BLOCK_COMMENT("Entry:");
    __ enter(); // required for proper stackwalking of RuntimeStub frame
    __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(lr);

    return start;
  }

  address generate_squareToLen() {
    // The dedicated squareToLen algorithm for sizes 1..127 (described in the
    // Java code) is faster than multiply_to_len on some CPUs and slower on
    // others, but multiply_to_len shows slightly better results overall, so
    // we reuse it here with y == x and ylen == xlen.
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "squareToLen");
    address start = __ pc();

    const Register x = r0;
    const Register xlen = r1;
    const Register z = r2;
    const Register zlen = r3;
    const Register y = r4;    // == x
    const Register ylen = r5; // == xlen

    const Register tmp1 = r10;
    const Register tmp2 = r11;
    const Register tmp3 = r12;
    const Register tmp4 = r13;
    const Register tmp5 = r14;
    const Register tmp6 = r15;
    const Register tmp7 = r16;

    RegSet spilled_regs = RegSet::of(y, ylen);
    BLOCK_COMMENT("Entry:");
    __ enter();
    __ push(spilled_regs, sp);
    __ mov(y, x);
    __ mov(ylen, xlen);
    __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
    __ pop(spilled_regs, sp);
    __ leave();
    __ ret(lr);
    return start;
  }

  // Scalar multiply-accumulate over 32-bit limbs (the intrinsic counterpart
  // of java.math.BigInteger.implMulAdd); the actual work is done by
  // MacroAssembler::mul_add.
  address generate_mulAdd() {
    __ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", "mulAdd"); 4743 4744 address start = __ pc(); 4745 4746 const Register out = r0; 4747 const Register in = r1; 4748 const Register offset = r2; 4749 const Register len = r3; 4750 const Register k = r4; 4751 4752 BLOCK_COMMENT("Entry:"); 4753 __ enter(); 4754 __ mul_add(out, in, offset, len, k); 4755 __ leave(); 4756 __ ret(lr); 4757 4758 return start; 4759 } 4760 4761 // Arguments: 4762 // 4763 // Input: 4764 // c_rarg0 - newArr address 4765 // c_rarg1 - oldArr address 4766 // c_rarg2 - newIdx 4767 // c_rarg3 - shiftCount 4768 // c_rarg4 - numIter 4769 // 4770 address generate_bigIntegerRightShift() { 4771 __ align(CodeEntryAlignment); 4772 StubCodeMark mark(this, "StubRoutines", "bigIntegerRightShiftWorker"); 4773 address start = __ pc(); 4774 4775 Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit; 4776 4777 Register newArr = c_rarg0; 4778 Register oldArr = c_rarg1; 4779 Register newIdx = c_rarg2; 4780 Register shiftCount = c_rarg3; 4781 Register numIter = c_rarg4; 4782 Register idx = numIter; 4783 4784 Register newArrCur = rscratch1; 4785 Register shiftRevCount = rscratch2; 4786 Register oldArrCur = r13; 4787 Register oldArrNext = r14; 4788 4789 FloatRegister oldElem0 = v0; 4790 FloatRegister oldElem1 = v1; 4791 FloatRegister newElem = v2; 4792 FloatRegister shiftVCount = v3; 4793 FloatRegister shiftVRevCount = v4; 4794 4795 __ cbz(idx, Exit); 4796 4797 __ add(newArr, newArr, newIdx, Assembler::LSL, 2); 4798 4799 // left shift count 4800 __ movw(shiftRevCount, 32); 4801 __ subw(shiftRevCount, shiftRevCount, shiftCount); 4802 4803 // numIter too small to allow a 4-words SIMD loop, rolling back 4804 __ cmp(numIter, (u1)4); 4805 __ br(Assembler::LT, ShiftThree); 4806 4807 __ dup(shiftVCount, __ T4S, shiftCount); 4808 __ dup(shiftVRevCount, __ T4S, shiftRevCount); 4809 __ negr(shiftVCount, __ T4S, shiftVCount); 4810 4811 __ BIND(ShiftSIMDLoop); 4812 4813 // Calculate the load addresses 4814 __ sub(idx, idx, 4); 4815 __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2); 4816 __ add(newArrCur, newArr, idx, Assembler::LSL, 2); 4817 __ add(oldArrCur, oldArrNext, 4); 4818 4819 // Load 4 words and process 4820 __ ld1(oldElem0, __ T4S, Address(oldArrCur)); 4821 __ ld1(oldElem1, __ T4S, Address(oldArrNext)); 4822 __ ushl(oldElem0, __ T4S, oldElem0, shiftVCount); 4823 __ ushl(oldElem1, __ T4S, oldElem1, shiftVRevCount); 4824 __ orr(newElem, __ T16B, oldElem0, oldElem1); 4825 __ st1(newElem, __ T4S, Address(newArrCur)); 4826 4827 __ cmp(idx, (u1)4); 4828 __ br(Assembler::LT, ShiftTwoLoop); 4829 __ b(ShiftSIMDLoop); 4830 4831 __ BIND(ShiftTwoLoop); 4832 __ cbz(idx, Exit); 4833 __ cmp(idx, (u1)1); 4834 __ br(Assembler::EQ, ShiftOne); 4835 4836 // Calculate the load addresses 4837 __ sub(idx, idx, 2); 4838 __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2); 4839 __ add(newArrCur, newArr, idx, Assembler::LSL, 2); 4840 __ add(oldArrCur, oldArrNext, 4); 4841 4842 // Load 2 words and process 4843 __ ld1(oldElem0, __ T2S, Address(oldArrCur)); 4844 __ ld1(oldElem1, __ T2S, Address(oldArrNext)); 4845 __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount); 4846 __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount); 4847 __ orr(newElem, __ T8B, oldElem0, oldElem1); 4848 __ st1(newElem, __ T2S, Address(newArrCur)); 4849 __ b(ShiftTwoLoop); 4850 4851 __ BIND(ShiftThree); 4852 __ tbz(idx, 1, ShiftOne); 4853 __ tbz(idx, 0, ShiftTwo); 4854 __ ldrw(r10, Address(oldArr, 12)); 4855 __ ldrw(r11, Address(oldArr, 8)); 4856 __ lsrvw(r10, r10, shiftCount); 4857 __ lslvw(r11, 
r11, shiftRevCount); 4858 __ orrw(r12, r10, r11); 4859 __ strw(r12, Address(newArr, 8)); 4860 4861 __ BIND(ShiftTwo); 4862 __ ldrw(r10, Address(oldArr, 8)); 4863 __ ldrw(r11, Address(oldArr, 4)); 4864 __ lsrvw(r10, r10, shiftCount); 4865 __ lslvw(r11, r11, shiftRevCount); 4866 __ orrw(r12, r10, r11); 4867 __ strw(r12, Address(newArr, 4)); 4868 4869 __ BIND(ShiftOne); 4870 __ ldrw(r10, Address(oldArr, 4)); 4871 __ ldrw(r11, Address(oldArr)); 4872 __ lsrvw(r10, r10, shiftCount); 4873 __ lslvw(r11, r11, shiftRevCount); 4874 __ orrw(r12, r10, r11); 4875 __ strw(r12, Address(newArr)); 4876 4877 __ BIND(Exit); 4878 __ ret(lr); 4879 4880 return start; 4881 } 4882 4883 // Arguments: 4884 // 4885 // Input: 4886 // c_rarg0 - newArr address 4887 // c_rarg1 - oldArr address 4888 // c_rarg2 - newIdx 4889 // c_rarg3 - shiftCount 4890 // c_rarg4 - numIter 4891 // 4892 address generate_bigIntegerLeftShift() { 4893 __ align(CodeEntryAlignment); 4894 StubCodeMark mark(this, "StubRoutines", "bigIntegerLeftShiftWorker"); 4895 address start = __ pc(); 4896 4897 Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit; 4898 4899 Register newArr = c_rarg0; 4900 Register oldArr = c_rarg1; 4901 Register newIdx = c_rarg2; 4902 Register shiftCount = c_rarg3; 4903 Register numIter = c_rarg4; 4904 4905 Register shiftRevCount = rscratch1; 4906 Register oldArrNext = rscratch2; 4907 4908 FloatRegister oldElem0 = v0; 4909 FloatRegister oldElem1 = v1; 4910 FloatRegister newElem = v2; 4911 FloatRegister shiftVCount = v3; 4912 FloatRegister shiftVRevCount = v4; 4913 4914 __ cbz(numIter, Exit); 4915 4916 __ add(oldArrNext, oldArr, 4); 4917 __ add(newArr, newArr, newIdx, Assembler::LSL, 2); 4918 4919 // right shift count 4920 __ movw(shiftRevCount, 32); 4921 __ subw(shiftRevCount, shiftRevCount, shiftCount); 4922 4923 // numIter too small to allow a 4-words SIMD loop, rolling back 4924 __ cmp(numIter, (u1)4); 4925 __ br(Assembler::LT, ShiftThree); 4926 4927 __ dup(shiftVCount, __ T4S, shiftCount); 4928 __ dup(shiftVRevCount, __ T4S, shiftRevCount); 4929 __ negr(shiftVRevCount, __ T4S, shiftVRevCount); 4930 4931 __ BIND(ShiftSIMDLoop); 4932 4933 // load 4 words and process 4934 __ ld1(oldElem0, __ T4S, __ post(oldArr, 16)); 4935 __ ld1(oldElem1, __ T4S, __ post(oldArrNext, 16)); 4936 __ ushl(oldElem0, __ T4S, oldElem0, shiftVCount); 4937 __ ushl(oldElem1, __ T4S, oldElem1, shiftVRevCount); 4938 __ orr(newElem, __ T16B, oldElem0, oldElem1); 4939 __ st1(newElem, __ T4S, __ post(newArr, 16)); 4940 __ sub(numIter, numIter, 4); 4941 4942 __ cmp(numIter, (u1)4); 4943 __ br(Assembler::LT, ShiftTwoLoop); 4944 __ b(ShiftSIMDLoop); 4945 4946 __ BIND(ShiftTwoLoop); 4947 __ cbz(numIter, Exit); 4948 __ cmp(numIter, (u1)1); 4949 __ br(Assembler::EQ, ShiftOne); 4950 4951 // load 2 words and process 4952 __ ld1(oldElem0, __ T2S, __ post(oldArr, 8)); 4953 __ ld1(oldElem1, __ T2S, __ post(oldArrNext, 8)); 4954 __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount); 4955 __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount); 4956 __ orr(newElem, __ T8B, oldElem0, oldElem1); 4957 __ st1(newElem, __ T2S, __ post(newArr, 8)); 4958 __ sub(numIter, numIter, 2); 4959 __ b(ShiftTwoLoop); 4960 4961 __ BIND(ShiftThree); 4962 __ ldrw(r10, __ post(oldArr, 4)); 4963 __ ldrw(r11, __ post(oldArrNext, 4)); 4964 __ lslvw(r10, r10, shiftCount); 4965 __ lsrvw(r11, r11, shiftRevCount); 4966 __ orrw(r12, r10, r11); 4967 __ strw(r12, __ post(newArr, 4)); 4968 __ tbz(numIter, 1, Exit); 4969 __ tbz(numIter, 0, ShiftOne); 4970 4971 __ BIND(ShiftTwo); 4972 __ 
  address generate_bigIntegerLeftShift() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "bigIntegerLeftShiftWorker");
    address start = __ pc();

    Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;

    Register newArr = c_rarg0;
    Register oldArr = c_rarg1;
    Register newIdx = c_rarg2;
    Register shiftCount = c_rarg3;
    Register numIter = c_rarg4;

    Register shiftRevCount = rscratch1;
    Register oldArrNext = rscratch2;

    FloatRegister oldElem0 = v0;
    FloatRegister oldElem1 = v1;
    FloatRegister newElem = v2;
    FloatRegister shiftVCount = v3;
    FloatRegister shiftVRevCount = v4;

    __ cbz(numIter, Exit);

    __ add(oldArrNext, oldArr, 4);
    __ add(newArr, newArr, newIdx, Assembler::LSL, 2);

    // right shift count
    __ movw(shiftRevCount, 32);
    __ subw(shiftRevCount, shiftRevCount, shiftCount);

    // numIter is too small to allow a 4-word SIMD loop; fall back
    __ cmp(numIter, (u1)4);
    __ br(Assembler::LT, ShiftThree);

    __ dup(shiftVCount, __ T4S, shiftCount);
    __ dup(shiftVRevCount, __ T4S, shiftRevCount);
    __ negr(shiftVRevCount, __ T4S, shiftVRevCount);

    __ BIND(ShiftSIMDLoop);

    // load 4 words and process
    __ ld1(oldElem0, __ T4S, __ post(oldArr, 16));
    __ ld1(oldElem1, __ T4S, __ post(oldArrNext, 16));
    __ ushl(oldElem0, __ T4S, oldElem0, shiftVCount);
    __ ushl(oldElem1, __ T4S, oldElem1, shiftVRevCount);
    __ orr(newElem, __ T16B, oldElem0, oldElem1);
    __ st1(newElem, __ T4S, __ post(newArr, 16));
    __ sub(numIter, numIter, 4);

    __ cmp(numIter, (u1)4);
    __ br(Assembler::LT, ShiftTwoLoop);
    __ b(ShiftSIMDLoop);

    __ BIND(ShiftTwoLoop);
    __ cbz(numIter, Exit);
    __ cmp(numIter, (u1)1);
    __ br(Assembler::EQ, ShiftOne);

    // load 2 words and process
    __ ld1(oldElem0, __ T2S, __ post(oldArr, 8));
    __ ld1(oldElem1, __ T2S, __ post(oldArrNext, 8));
    __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount);
    __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount);
    __ orr(newElem, __ T8B, oldElem0, oldElem1);
    __ st1(newElem, __ T2S, __ post(newArr, 8));
    __ sub(numIter, numIter, 2);
    __ b(ShiftTwoLoop);

    __ BIND(ShiftThree);
    __ ldrw(r10, __ post(oldArr, 4));
    __ ldrw(r11, __ post(oldArrNext, 4));
    __ lslvw(r10, r10, shiftCount);
    __ lsrvw(r11, r11, shiftRevCount);
    __ orrw(r12, r10, r11);
    __ strw(r12, __ post(newArr, 4));
    __ tbz(numIter, 1, Exit);
    __ tbz(numIter, 0, ShiftOne);

    __ BIND(ShiftTwo);
    __ ldrw(r10, __ post(oldArr, 4));
    __ ldrw(r11, __ post(oldArrNext, 4));
    __ lslvw(r10, r10, shiftCount);
    __ lsrvw(r11, r11, shiftRevCount);
    __ orrw(r12, r10, r11);
    __ strw(r12, __ post(newArr, 4));

    __ BIND(ShiftOne);
    __ ldrw(r10, Address(oldArr));
    __ ldrw(r11, Address(oldArrNext));
    __ lslvw(r10, r10, shiftCount);
    __ lsrvw(r11, r11, shiftRevCount);
    __ orrw(r12, r10, r11);
    __ strw(r12, Address(newArr));

    __ BIND(Exit);
    __ ret(lr);

    return start;
  }
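  // count_positives(ary1, len) returns len in r0 when no negative byte is
  // found, and otherwise a (possibly conservative) count of the leading
  // bytes guaranteed to be non-negative. A byte is negative exactly when
  // its top bit is set, so instead of inspecting bytes one at a time the
  // code below ORs whole words together and tests the result against
  // 0x8080808080808080. A plain-C sketch of the idea (illustrative only):
  //
  //   uint64_t w; memcpy(&w, p, 8);
  //   if (w & 0x8080808080808080ull) { /* some byte in w is negative */ }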
  address generate_count_positives(address &count_positives_long) {
    const u1 large_loop_size = 64;
    const uint64_t UPPER_BIT_MASK = 0x8080808080808080;
    int dcache_line = VM_Version::dcache_line_size();

    Register ary1 = r1, len = r2, result = r0;

    __ align(CodeEntryAlignment);

    StubCodeMark mark(this, "StubRoutines", "count_positives");

    address entry = __ pc();

    __ enter();
    // precondition: a copy of len is already in result
    // __ mov(result, len);

    Label RET_ADJUST, RET_ADJUST_16, RET_ADJUST_LONG, RET_NO_POP, RET_LEN,
          ALIGNED, LOOP16, CHECK_16, LARGE_LOOP, POST_LOOP16, LEN_OVER_15,
          LEN_OVER_8, POST_LOOP16_LOAD_TAIL;

    __ cmp(len, (u1)15);
    __ br(Assembler::GT, LEN_OVER_15);
    // The only case when execution falls into this code is when the pointer
    // is near the end of a memory page and we must avoid reading the next page
    __ add(ary1, ary1, len);
    __ subs(len, len, 8);
    __ br(Assembler::GT, LEN_OVER_8);
    __ ldr(rscratch2, Address(ary1, -8));
    __ sub(rscratch1, zr, len, __ LSL, 3);  // LSL 3 is to get bits from bytes.
    __ lsrv(rscratch2, rscratch2, rscratch1);
    __ tst(rscratch2, UPPER_BIT_MASK);
    __ csel(result, zr, result, Assembler::NE);
    __ leave();
    __ ret(lr);
    __ bind(LEN_OVER_8);
    __ ldp(rscratch1, rscratch2, Address(ary1, -16));
    __ sub(len, len, 8); // no data dependency, so the sub can execute while loading
    __ tst(rscratch2, UPPER_BIT_MASK);
    __ br(Assembler::NE, RET_NO_POP);
    __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
    __ lsrv(rscratch1, rscratch1, rscratch2);
    __ tst(rscratch1, UPPER_BIT_MASK);
    __ bind(RET_NO_POP);
    __ csel(result, zr, result, Assembler::NE);
    __ leave();
    __ ret(lr);

    Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
    const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;

    count_positives_long = __ pc(); // 2nd entry point

    __ enter();

    __ bind(LEN_OVER_15);
    __ push(spilled_regs, sp);
    __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
    __ cbz(rscratch2, ALIGNED);
    __ ldp(tmp6, tmp1, Address(ary1));
    __ mov(tmp5, 16);
    __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address
    __ add(ary1, ary1, rscratch1);
    __ orr(tmp6, tmp6, tmp1);
    __ tst(tmp6, UPPER_BIT_MASK);
    __ br(Assembler::NE, RET_ADJUST);
    __ sub(len, len, rscratch1);

    __ bind(ALIGNED);
    __ cmp(len, large_loop_size);
    __ br(Assembler::LT, CHECK_16);
    // Perform a 16-byte load as an early return in the pre-loop, to handle
    // the situation when an initially aligned large array has negative
    // values at its starting bytes: LARGE_LOOP would do 4 reads instead of 1
    // (in the worst case), which is slower. Cases with negative bytes
    // further ahead won't be affected that much; in fact, they'll be faster
    // due to early loads, fewer instructions and fewer branches in
    // LARGE_LOOP.
    __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
    __ sub(len, len, 16);
    __ orr(tmp6, tmp6, tmp1);
    __ tst(tmp6, UPPER_BIT_MASK);
    __ br(Assembler::NE, RET_ADJUST_16);
    __ cmp(len, large_loop_size);
    __ br(Assembler::LT, CHECK_16);

    if (SoftwarePrefetchHintDistance >= 0
        && SoftwarePrefetchHintDistance >= dcache_line) {
      // initial prefetch
      __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
    }
    __ bind(LARGE_LOOP);
    if (SoftwarePrefetchHintDistance >= 0) {
      __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
    }
    // Issue the load instructions first, since this can save a few CPU/memory
    // cycles. Also, instead of 4 triples of "orr(...); andr(...); cbnz(...)"
    // (one per ldp) it is better to generate 7 * orr(...) + 1 andr(...) +
    // 1 cbnz(...), which saves instructions and has fewer branches; the
    // trade-off is that this approach disables early return, so all 64 bytes
    // are loaded and checked every time.
    __ ldp(tmp2, tmp3, Address(ary1));
    __ ldp(tmp4, tmp5, Address(ary1, 16));
    __ ldp(rscratch1, rscratch2, Address(ary1, 32));
    __ ldp(tmp6, tmp1, Address(ary1, 48));
    __ add(ary1, ary1, large_loop_size);
    __ sub(len, len, large_loop_size);
    __ orr(tmp2, tmp2, tmp3);
    __ orr(tmp4, tmp4, tmp5);
    __ orr(rscratch1, rscratch1, rscratch2);
    __ orr(tmp6, tmp6, tmp1);
    __ orr(tmp2, tmp2, tmp4);
    __ orr(rscratch1, rscratch1, tmp6);
    __ orr(tmp2, tmp2, rscratch1);
    __ tst(tmp2, UPPER_BIT_MASK);
    __ br(Assembler::NE, RET_ADJUST_LONG);
    __ cmp(len, large_loop_size);
    __ br(Assembler::GE, LARGE_LOOP);

    __ bind(CHECK_16); // small 16-byte load pre-loop
    __ cmp(len, (u1)16);
    __ br(Assembler::LT, POST_LOOP16);

    __ bind(LOOP16); // small 16-byte load loop
    __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
    __ sub(len, len, 16);
    __ orr(tmp2, tmp2, tmp3);
    __ tst(tmp2, UPPER_BIT_MASK);
    __ br(Assembler::NE, RET_ADJUST_16);
    __ cmp(len, (u1)16);
    __ br(Assembler::GE, LOOP16); // 16-byte load loop end

    __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
    __ cmp(len, (u1)8);
    __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
    __ ldr(tmp3, Address(__ post(ary1, 8)));
    __ tst(tmp3, UPPER_BIT_MASK);
    __ br(Assembler::NE, RET_ADJUST);
    __ sub(len, len, 8);

    __ bind(POST_LOOP16_LOAD_TAIL);
    __ cbz(len, RET_LEN); // Can't shift left by 64 when len==0
    __ ldr(tmp1, Address(ary1));
    __ mov(tmp2, 64);
    __ sub(tmp4, tmp2, len, __ LSL, 3);
    __ lslv(tmp1, tmp1, tmp4);
    __ tst(tmp1, UPPER_BIT_MASK);
    __ br(Assembler::NE, RET_ADJUST);
    // Fallthrough

    __ bind(RET_LEN);
    __ pop(spilled_regs, sp);
    __ leave();
    __ ret(lr);

    // The difference result - len is the count of bytes that are
    // guaranteed to be positive.

    __ bind(RET_ADJUST_LONG);
    __ add(len, len, (u1)(large_loop_size - 16));
    __ bind(RET_ADJUST_16);
    __ add(len, len, 16);
    __ bind(RET_ADJUST);
    __ pop(spilled_regs, sp);
    __ leave();
    __ sub(result, result, len);
    __ ret(lr);

    return entry;
  }
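  // Both array-equality loop flavors below rely on the same reduction: for
  // an equality check it is enough to XOR corresponding words and OR the
  // XOR results together, so a single conditional branch covers several
  // words. A sketch of one step (illustrative only, not the stub's code):
  //
  //   uint64_t x1, x2;  // two words loaded from a1
  //   uint64_t y1, y2;  // the corresponding words from a2
  //   if (((x1 ^ y1) | (x2 ^ y2)) != 0) goto NOT_EQUAL;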
  void generate_large_array_equals_loop_nonsimd(int loopThreshold,
                                                bool usePrefetch, Label &NOT_EQUAL) {
    Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
        tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
        tmp7 = r12, tmp8 = r13;
    Label LOOP;

    __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
    __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
    __ bind(LOOP);
    if (usePrefetch) {
      __ prfm(Address(a1, SoftwarePrefetchHintDistance));
      __ prfm(Address(a2, SoftwarePrefetchHintDistance));
    }
    __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
    __ eor(tmp1, tmp1, tmp2);
    __ eor(tmp3, tmp3, tmp4);
    __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
    __ orr(tmp1, tmp1, tmp3);
    __ cbnz(tmp1, NOT_EQUAL);
    __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
    __ eor(tmp5, tmp5, tmp6);
    __ eor(tmp7, tmp7, tmp8);
    __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
    __ orr(tmp5, tmp5, tmp7);
    __ cbnz(tmp5, NOT_EQUAL);
    __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
    __ eor(tmp1, tmp1, tmp2);
    __ eor(tmp3, tmp3, tmp4);
    __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
    __ orr(tmp1, tmp1, tmp3);
    __ cbnz(tmp1, NOT_EQUAL);
    __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
    __ eor(tmp5, tmp5, tmp6);
    __ sub(cnt1, cnt1, 8 * wordSize);
    __ eor(tmp7, tmp7, tmp8);
    __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
    // tmp6 is not used. MacroAssembler::subs is used here (rather than
    // cmp) because subs allows an unlimited range of immediate operand.
    __ subs(tmp6, cnt1, loopThreshold);
    __ orr(tmp5, tmp5, tmp7);
    __ cbnz(tmp5, NOT_EQUAL);
    __ br(__ GE, LOOP);
    // post-loop
    __ eor(tmp1, tmp1, tmp2);
    __ eor(tmp3, tmp3, tmp4);
    __ orr(tmp1, tmp1, tmp3);
    __ sub(cnt1, cnt1, 2 * wordSize);
    __ cbnz(tmp1, NOT_EQUAL);
  }

  void generate_large_array_equals_loop_simd(int loopThreshold,
                                             bool usePrefetch, Label &NOT_EQUAL) {
    Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
        tmp2 = rscratch2;
    Label LOOP;

    __ bind(LOOP);
    if (usePrefetch) {
      __ prfm(Address(a1, SoftwarePrefetchHintDistance));
      __ prfm(Address(a2, SoftwarePrefetchHintDistance));
    }
    __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
    __ sub(cnt1, cnt1, 8 * wordSize);
    __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
    __ subs(tmp1, cnt1, loopThreshold);
    __ eor(v0, __ T16B, v0, v4);
    __ eor(v1, __ T16B, v1, v5);
    __ eor(v2, __ T16B, v2, v6);
    __ eor(v3, __ T16B, v3, v7);
    __ orr(v0, __ T16B, v0, v1);
    __ orr(v1, __ T16B, v2, v3);
    __ orr(v0, __ T16B, v0, v1);
    __ umov(tmp1, v0, __ D, 0);
    __ umov(tmp2, v0, __ D, 1);
    __ orr(tmp1, tmp1, tmp2);
    __ cbnz(tmp1, NOT_EQUAL);
    __ br(__ GE, LOOP);
  }
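  // Semantically, large_array_equals is a word-wise equality check: per the
  // register comments below, the first word has already been loaded and
  // checked outside the stub and result pre-set to "false"; the stub scans
  // the remaining cnt1 bytes 8 (or 64) at a time and flips result to "true"
  // only when no difference is found.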
  // a1 = r1 - array1 address
  // a2 = r2 - array2 address
  // result = r0 - return value. Already contains "false"
  // cnt1 = r10 - amount of elements left to check, reduced by wordSize
  // r3-r5 are reserved temporary registers
  // Clobbers: v0-v7 when UseSIMDForArrayEquals, rscratch1, rscratch2
  address generate_large_array_equals() {
    Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
        tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
        tmp7 = r12, tmp8 = r13;
    Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
        SMALL_LOOP, POST_LOOP;
    const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
    // calculate if at least 32 prefetched bytes are used
    int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
    int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
    RegSet spilled_regs = RegSet::range(tmp6, tmp8);
    assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
                               tmp5, tmp6, tmp7, tmp8);

    __ align(CodeEntryAlignment);

    StubCodeMark mark(this, "StubRoutines", "large_array_equals");

    address entry = __ pc();
    __ enter();
    __ sub(cnt1, cnt1, wordSize);  // first 8 bytes were loaded outside of stub
    // also advance pointers to use post-increment instead of pre-increment
    __ add(a1, a1, wordSize);
    __ add(a2, a2, wordSize);
    if (AvoidUnalignedAccesses) {
      // Both implementations (SIMD/non-SIMD) use relatively large load
      // instructions (ld1/ldp), which carry a huge penalty (up to 2x
      // execution time) on some CPUs when the address is not at least
      // 16-byte aligned. Arrays are currently 8-byte aligned, so we can do
      // an additional 8-byte load, if needed, at least for the 1st address,
      // to make it 16-byte aligned.
      Label ALIGNED16;
      __ tbz(a1, 3, ALIGNED16);
      __ ldr(tmp1, Address(__ post(a1, wordSize)));
      __ ldr(tmp2, Address(__ post(a2, wordSize)));
      __ sub(cnt1, cnt1, wordSize);
      __ eor(tmp1, tmp1, tmp2);
      __ cbnz(tmp1, NOT_EQUAL_NO_POP);
      __ bind(ALIGNED16);
    }
    if (UseSIMDForArrayEquals) {
      if (SoftwarePrefetchHintDistance >= 0) {
        __ subs(tmp1, cnt1, prefetchLoopThreshold);
        __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
        generate_large_array_equals_loop_simd(prefetchLoopThreshold,
                                              /* prfm = */ true, NOT_EQUAL);
        __ subs(zr, cnt1, nonPrefetchLoopThreshold);
        __ br(__ LT, TAIL);
      }
      __ bind(NO_PREFETCH_LARGE_LOOP);
      generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
                                            /* prfm = */ false, NOT_EQUAL);
    } else {
      __ push(spilled_regs, sp);
      if (SoftwarePrefetchHintDistance >= 0) {
        __ subs(tmp1, cnt1, prefetchLoopThreshold);
        __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
        generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
                                                 /* prfm = */ true, NOT_EQUAL);
        __ subs(zr, cnt1, nonPrefetchLoopThreshold);
        __ br(__ LT, TAIL);
      }
      __ bind(NO_PREFETCH_LARGE_LOOP);
      generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
                                               /* prfm = */ false, NOT_EQUAL);
    }
    __ bind(TAIL);
    __ cbz(cnt1, EQUAL);
    __ subs(cnt1, cnt1, wordSize);
    __ br(__ LE, POST_LOOP);
    __ bind(SMALL_LOOP);
    __ ldr(tmp1, Address(__ post(a1, wordSize)));
    __ ldr(tmp2, Address(__ post(a2, wordSize)));
    __ subs(cnt1, cnt1, wordSize);
    __ eor(tmp1, tmp1, tmp2);
    __ cbnz(tmp1, NOT_EQUAL);
    __ br(__ GT, SMALL_LOOP);
    __ bind(POST_LOOP);
    __ ldr(tmp1, Address(a1, cnt1));
    __ ldr(tmp2, Address(a2, cnt1));
    __ eor(tmp1, tmp1, tmp2);
    __ cbnz(tmp1, NOT_EQUAL);
    __ bind(EQUAL);
    __ mov(result, true);
    __ bind(NOT_EQUAL);
    if (!UseSIMDForArrayEquals) {
      __ pop(spilled_regs, sp);
    }
    __ bind(NOT_EQUAL_NO_POP);
    __ leave();
    __ ret(lr);
    return entry;
  }

  address generate_dsin_dcos(bool isCos) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", isCos ? "libmDcos" : "libmDsin");
    address start = __ pc();
    __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw,
                          (address)StubRoutines::aarch64::_two_over_pi,
                          (address)StubRoutines::aarch64::_pio2,
                          (address)StubRoutines::aarch64::_dsin_coef,
                          (address)StubRoutines::aarch64::_dcos_coef);
    return start;
  }
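  // The LU/UL comparison helper below widens Latin1 bytes to UTF-16 on the
  // fly by interleaving them with a zero vector: zip1(v, T16B, src, vzr)
  // turns bytes b0 b1 ... b7 into the halfwords b0 b1 ... b7 (each stored
  // little-endian as the pair bN 00), which is exactly the UTF-16 encoding
  // of those Latin1 characters.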
"libmDcos" : "libmDsin"); 5343 address start = __ pc(); 5344 __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw, 5345 (address)StubRoutines::aarch64::_two_over_pi, 5346 (address)StubRoutines::aarch64::_pio2, 5347 (address)StubRoutines::aarch64::_dsin_coef, 5348 (address)StubRoutines::aarch64::_dcos_coef); 5349 return start; 5350 } 5351 5352 // code for comparing 16 characters of strings with Latin1 and Utf16 encoding 5353 void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1, 5354 Label &DIFF2) { 5355 Register cnt1 = r2, tmp2 = r11, tmp3 = r12; 5356 FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2; 5357 5358 __ ldrq(vtmp, Address(__ post(tmp2, 16))); 5359 __ ldr(tmpU, Address(__ post(cnt1, 8))); 5360 __ zip1(vtmp3, __ T16B, vtmp, vtmpZ); 5361 // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3 5362 5363 __ fmovd(tmpL, vtmp3); 5364 __ eor(rscratch2, tmp3, tmpL); 5365 __ cbnz(rscratch2, DIFF2); 5366 5367 __ ldr(tmp3, Address(__ post(cnt1, 8))); 5368 __ umov(tmpL, vtmp3, __ D, 1); 5369 __ eor(rscratch2, tmpU, tmpL); 5370 __ cbnz(rscratch2, DIFF1); 5371 5372 __ zip2(vtmp, __ T16B, vtmp, vtmpZ); 5373 __ ldr(tmpU, Address(__ post(cnt1, 8))); 5374 __ fmovd(tmpL, vtmp); 5375 __ eor(rscratch2, tmp3, tmpL); 5376 __ cbnz(rscratch2, DIFF2); 5377 5378 __ ldr(tmp3, Address(__ post(cnt1, 8))); 5379 __ umov(tmpL, vtmp, __ D, 1); 5380 __ eor(rscratch2, tmpU, tmpL); 5381 __ cbnz(rscratch2, DIFF1); 5382 } 5383 5384 // r0 = result 5385 // r1 = str1 5386 // r2 = cnt1 5387 // r3 = str2 5388 // r4 = cnt2 5389 // r10 = tmp1 5390 // r11 = tmp2 5391 address generate_compare_long_string_different_encoding(bool isLU) { 5392 __ align(CodeEntryAlignment); 5393 StubCodeMark mark(this, "StubRoutines", isLU 5394 ? "compare_long_string_different_encoding LU" 5395 : "compare_long_string_different_encoding UL"); 5396 address entry = __ pc(); 5397 Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2, 5398 DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH, 5399 LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2; 5400 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 5401 tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14; 5402 FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2; 5403 RegSet spilled_regs = RegSet::of(tmp3, tmp4); 5404 5405 int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2); 5406 5407 __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ); 5408 // cnt2 == amount of characters left to compare 5409 // Check already loaded first 4 symbols(vtmp and tmp2(LU)/tmp1(UL)) 5410 __ zip1(vtmp, __ T8B, vtmp, vtmpZ); 5411 __ add(str1, str1, isLU ? wordSize/2 : wordSize); 5412 __ add(str2, str2, isLU ? wordSize : wordSize/2); 5413 __ fmovd(isLU ? tmp1 : tmp2, vtmp); 5414 __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case. 5415 __ eor(rscratch2, tmp1, tmp2); 5416 __ mov(rscratch1, tmp2); 5417 __ cbnz(rscratch2, CALCULATE_DIFFERENCE); 5418 Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison 5419 tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison 5420 __ push(spilled_regs, sp); 5421 __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load 5422 __ mov(cnt1, isLU ? 
  // r0 = result
  // r1 = str1
  // r2 = cnt1
  // r3 = str2
  // r4 = cnt2
  // r10 = tmp1
  // r11 = tmp2
  address generate_compare_long_string_different_encoding(bool isLU) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", isLU
        ? "compare_long_string_different_encoding LU"
        : "compare_long_string_different_encoding UL");
    address entry = __ pc();
    Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2,
        DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH,
        LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2;
    Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
        tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14;
    FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2;
    RegSet spilled_regs = RegSet::of(tmp3, tmp4);

    int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2);

    __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ);
    // cnt2 == number of characters left to compare
    // Check the first 4 symbols, which are already loaded (vtmp and
    // tmp2 (LU) / tmp1 (UL))
    __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
    __ add(str1, str1, isLU ? wordSize/2 : wordSize);
    __ add(str2, str2, isLU ? wordSize : wordSize/2);
    __ fmovd(isLU ? tmp1 : tmp2, vtmp);
    __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols; the last 4 are a special case.
    __ eor(rscratch2, tmp1, tmp2);
    __ mov(rscratch1, tmp2);
    __ cbnz(rscratch2, CALCULATE_DIFFERENCE);
    Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison
             tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison
    __ push(spilled_regs, sp);
    __ mov(tmp2, isLU ? str1 : str2); // init the pointer to the next L load
    __ mov(cnt1, isLU ? str2 : str1); // init the pointer to the next U load

    __ ldr(tmp3, Address(__ post(cnt1, 8)));

    if (SoftwarePrefetchHintDistance >= 0) {
      __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
      __ br(__ LT, NO_PREFETCH);
      __ bind(LARGE_LOOP_PREFETCH);
      __ prfm(Address(tmp2, SoftwarePrefetchHintDistance));
      __ mov(tmp4, 2);
      __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
      __ bind(LARGE_LOOP_PREFETCH_REPEAT1);
      compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
      __ subs(tmp4, tmp4, 1);
      __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1);
      __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
      __ mov(tmp4, 2);
      __ bind(LARGE_LOOP_PREFETCH_REPEAT2);
      compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
      __ subs(tmp4, tmp4, 1);
      __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2);
      __ sub(cnt2, cnt2, 64);
      __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
      __ br(__ GE, LARGE_LOOP_PREFETCH);
    }
    __ cbz(cnt2, LOAD_LAST); // no characters left except the last load
    __ bind(NO_PREFETCH);
    __ subs(cnt2, cnt2, 16);
    __ br(__ LT, TAIL);
    __ align(OptoLoopAlignment);
    __ bind(SMALL_LOOP); // smaller loop
    __ subs(cnt2, cnt2, 16);
    compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
    __ br(__ GE, SMALL_LOOP);
    __ cmn(cnt2, (u1)16);
    __ br(__ EQ, LOAD_LAST);
    __ bind(TAIL); // 1..15 characters left until the last load (last 4 characters)
    __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before the last 4 characters in the UTF-16 string
    __ add(tmp2, tmp2, cnt2);            // Address of 16 bytes before the last 4 characters in the Latin1 string
    __ ldr(tmp3, Address(cnt1, -8));
    compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before the last load
    __ b(LOAD_LAST);
    __ bind(DIFF2);
    __ mov(tmpU, tmp3);
    __ bind(DIFF1);
    __ pop(spilled_regs, sp);
    __ b(CALCULATE_DIFFERENCE);
    __ bind(LOAD_LAST);
    // The last 4 UTF-16 characters are already pre-loaded into tmp3 by
    // compare_string_16_x_LU. No need to load them again.
    __ mov(tmpU, tmp3);
    __ pop(spilled_regs, sp);

    // tmp2 points to the address of the last 4 Latin1 characters right now
    __ ldrs(vtmp, Address(tmp2));
    __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
    __ fmovd(tmpL, vtmp);

    __ eor(rscratch2, tmpU, tmpL);
    __ cbz(rscratch2, DONE);

    // Find the first different characters in the longwords and
    // compute their difference.
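    // A sketch of the trick used below (illustrative): rscratch2 holds the
    // XOR of the two 8-byte chunks, so its lowest-order non-zero byte marks
    // the first difference in memory order. rev byte-reverses the XOR so
    // that clz counts, in bits, how far that byte is from the start of the
    // word; masking the count with -16 rounds it down to a whole-character
    // (16-bit) boundary, giving the shift amount used to extract the first
    // differing character of each string.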
    __ bind(CALCULATE_DIFFERENCE);
    __ rev(rscratch2, rscratch2);
    __ clz(rscratch2, rscratch2);
    __ andr(rscratch2, rscratch2, -16);
    __ lsrv(tmp1, tmp1, rscratch2);
    __ uxthw(tmp1, tmp1);
    __ lsrv(rscratch1, rscratch1, rscratch2);
    __ uxthw(rscratch1, rscratch1);
    __ subw(result, tmp1, rscratch1);
    __ bind(DONE);
    __ ret(lr);
    return entry;
  }

  // r0 = input (float16)
  // v0 = result (float)
  // v1 = temporary float register
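  // For reference: a float16 value (as in Float.float16ToFloat) consists of
  // 1 sign bit, 5 exponent bits and 10 mantissa bits. The stub does not
  // manipulate these fields itself; it delegates to
  // MacroAssembler::flt16_to_flt, which performs the widening conversion in
  // hardware.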
"compare_long_string_same_encoding LL" 5594 : "compare_long_string_same_encoding UU"); 5595 address entry = __ pc(); 5596 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 5597 tmp1 = r10, tmp2 = r11, tmp1h = rscratch1, tmp2h = rscratch2; 5598 5599 Label LARGE_LOOP_PREFETCH, LOOP_COMPARE16, DIFF, LESS16, LESS8, CAL_DIFFERENCE, LENGTH_DIFF; 5600 5601 // exit from large loop when less than 64 bytes left to read or we're about 5602 // to prefetch memory behind array border 5603 int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2); 5604 5605 // before jumping to stub, pre-load 8 bytes already, so do comparison directly 5606 __ eor(rscratch2, tmp1, tmp2); 5607 __ cbnz(rscratch2, CAL_DIFFERENCE); 5608 5609 __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2)); 5610 // update pointers, because of previous read 5611 __ add(str1, str1, wordSize); 5612 __ add(str2, str2, wordSize); 5613 if (SoftwarePrefetchHintDistance >= 0) { 5614 __ align(OptoLoopAlignment); 5615 __ bind(LARGE_LOOP_PREFETCH); 5616 __ prfm(Address(str1, SoftwarePrefetchHintDistance)); 5617 __ prfm(Address(str2, SoftwarePrefetchHintDistance)); 5618 5619 for (int i = 0; i < 4; i++) { 5620 __ ldp(tmp1, tmp1h, Address(str1, i * 16)); 5621 __ ldp(tmp2, tmp2h, Address(str2, i * 16)); 5622 __ cmp(tmp1, tmp2); 5623 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ); 5624 __ br(Assembler::NE, DIFF); 5625 } 5626 __ sub(cnt2, cnt2, isLL ? 64 : 32); 5627 __ add(str1, str1, 64); 5628 __ add(str2, str2, 64); 5629 __ subs(rscratch2, cnt2, largeLoopExitCondition); 5630 __ br(Assembler::GE, LARGE_LOOP_PREFETCH); 5631 __ cbz(cnt2, LENGTH_DIFF); // no more chars left? 5632 } 5633 5634 __ subs(rscratch1, cnt2, isLL ? 16 : 8); 5635 __ br(Assembler::LE, LESS16); 5636 __ align(OptoLoopAlignment); 5637 __ bind(LOOP_COMPARE16); 5638 __ ldp(tmp1, tmp1h, Address(__ post(str1, 16))); 5639 __ ldp(tmp2, tmp2h, Address(__ post(str2, 16))); 5640 __ cmp(tmp1, tmp2); 5641 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ); 5642 __ br(Assembler::NE, DIFF); 5643 __ sub(cnt2, cnt2, isLL ? 16 : 8); 5644 __ subs(rscratch2, cnt2, isLL ? 16 : 8); 5645 __ br(Assembler::LT, LESS16); 5646 5647 __ ldp(tmp1, tmp1h, Address(__ post(str1, 16))); 5648 __ ldp(tmp2, tmp2h, Address(__ post(str2, 16))); 5649 __ cmp(tmp1, tmp2); 5650 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ); 5651 __ br(Assembler::NE, DIFF); 5652 __ sub(cnt2, cnt2, isLL ? 16 : 8); 5653 __ subs(rscratch2, cnt2, isLL ? 16 : 8); 5654 __ br(Assembler::GE, LOOP_COMPARE16); 5655 __ cbz(cnt2, LENGTH_DIFF); 5656 5657 __ bind(LESS16); 5658 // each 8 compare 5659 __ subs(cnt2, cnt2, isLL ? 8 : 4); 5660 __ br(Assembler::LE, LESS8); 5661 __ ldr(tmp1, Address(__ post(str1, 8))); 5662 __ ldr(tmp2, Address(__ post(str2, 8))); 5663 __ eor(rscratch2, tmp1, tmp2); 5664 __ cbnz(rscratch2, CAL_DIFFERENCE); 5665 __ sub(cnt2, cnt2, isLL ? 8 : 4); 5666 5667 __ bind(LESS8); // directly load last 8 bytes 5668 if (!isLL) { 5669 __ add(cnt2, cnt2, cnt2); 5670 } 5671 __ ldr(tmp1, Address(str1, cnt2)); 5672 __ ldr(tmp2, Address(str2, cnt2)); 5673 __ eor(rscratch2, tmp1, tmp2); 5674 __ cbz(rscratch2, LENGTH_DIFF); 5675 __ b(CAL_DIFFERENCE); 5676 5677 __ bind(DIFF); 5678 __ cmp(tmp1, tmp2); 5679 __ csel(tmp1, tmp1, tmp1h, Assembler::NE); 5680 __ csel(tmp2, tmp2, tmp2h, Assembler::NE); 5681 // reuse rscratch2 register for the result of eor instruction 5682 __ eor(rscratch2, tmp1, tmp2); 5683 5684 __ bind(CAL_DIFFERENCE); 5685 __ rev(rscratch2, rscratch2); 5686 __ clz(rscratch2, rscratch2); 5687 __ andr(rscratch2, rscratch2, isLL ? 
  enum string_compare_mode {
    LL,
    LU,
    UL,
    UU,
  };

  // The following registers are declared in aarch64.ad
  // r0 = result
  // r1 = str1
  // r2 = cnt1
  // r3 = str2
  // r4 = cnt2
  // r10 = tmp1
  // r11 = tmp2
  // z0 = ztmp1
  // z1 = ztmp2
  // p0 = pgtmp1
  // p1 = pgtmp2
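  // The SVE variant below is vector-length agnostic: sve_whilelt builds a
  // predicate covering min(VL, cnt - idx) elements, so the same loop works
  // on any hardware vector length VL. A C-like sketch of the pattern
  // (illustrative only, not the stub's code):
  //
  //   idx = 0;
  //   while (idx < cnt - VL) {   // full-width iterations
  //     if (str1[idx .. idx+VL) != str2[idx .. idx+VL)) goto MISMATCH;
  //     idx += VL;
  //   }
  //   // compare the remaining cnt - idx elements under a partial predicate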
  address generate_compare_long_string_sve(string_compare_mode mode) {
    __ align(CodeEntryAlignment);
    address entry = __ pc();
    Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
        tmp1 = r10, tmp2 = r11;

    Label LOOP, DONE, MISMATCH;
    Register vec_len = tmp1;
    Register idx = tmp2;
    // The minimum of the string lengths has been stored in cnt2.
    Register cnt = cnt2;
    FloatRegister ztmp1 = z0, ztmp2 = z1;
    PRegister pgtmp1 = p0, pgtmp2 = p1;

#define LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx)                       \
    switch (mode) {                                                            \
      case LL:                                                                 \
        __ sve_ld1b(ztmp1, __ B, pgtmp1, Address(str1, idx));                  \
        __ sve_ld1b(ztmp2, __ B, pgtmp1, Address(str2, idx));                  \
        break;                                                                 \
      case LU:                                                                 \
        __ sve_ld1b(ztmp1, __ H, pgtmp1, Address(str1, idx));                  \
        __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
        break;                                                                 \
      case UL:                                                                 \
        __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
        __ sve_ld1b(ztmp2, __ H, pgtmp1, Address(str2, idx));                  \
        break;                                                                 \
      case UU:                                                                 \
        __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
        __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
        break;                                                                 \
      default:                                                                 \
        ShouldNotReachHere();                                                  \
    }

    const char* stubname;
    switch (mode) {
      case LL: stubname = "compare_long_string_same_encoding LL";      break;
      case LU: stubname = "compare_long_string_different_encoding LU"; break;
      case UL: stubname = "compare_long_string_different_encoding UL"; break;
      case UU: stubname = "compare_long_string_same_encoding UU";      break;
      default: ShouldNotReachHere();
    }

    StubCodeMark mark(this, "StubRoutines", stubname);

    __ mov(idx, 0);
    __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);

    if (mode == LL) {
      __ sve_cntb(vec_len);
    } else {
      __ sve_cnth(vec_len);
    }

    __ sub(rscratch1, cnt, vec_len);

    __ bind(LOOP);

    // main loop
    LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
    __ add(idx, idx, vec_len);
    // Compare strings.
    __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
    __ br(__ NE, MISMATCH);
    __ cmp(idx, rscratch1);
    __ br(__ LT, LOOP);

    // post loop, last iteration
    __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);

    LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
    __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
    __ br(__ EQ, DONE);

    __ bind(MISMATCH);

    // Crop the predicate at the first mismatch to find its location.
    __ sve_brkb(pgtmp2, pgtmp1, pgtmp2, false /* isMerge */);
    // Extract the first different characters of each string.
    __ sve_lasta(rscratch1, mode == LL ? __ B : __ H, pgtmp2, ztmp1);
    __ sve_lasta(rscratch2, mode == LL ? __ B : __ H, pgtmp2, ztmp2);

    // Compute the difference of the first different characters.
    __ sub(result, rscratch1, rscratch2);

    __ bind(DONE);
    __ ret(lr);
#undef LOAD_PAIR
    return entry;
  }

  void generate_compare_long_strings() {
    if (UseSVE == 0) {
      StubRoutines::aarch64::_compare_long_string_LL
          = generate_compare_long_string_same_encoding(true);
      StubRoutines::aarch64::_compare_long_string_UU
          = generate_compare_long_string_same_encoding(false);
      StubRoutines::aarch64::_compare_long_string_LU
          = generate_compare_long_string_different_encoding(true);
      StubRoutines::aarch64::_compare_long_string_UL
          = generate_compare_long_string_different_encoding(false);
    } else {
      StubRoutines::aarch64::_compare_long_string_LL
          = generate_compare_long_string_sve(LL);
      StubRoutines::aarch64::_compare_long_string_UU
          = generate_compare_long_string_sve(UU);
      StubRoutines::aarch64::_compare_long_string_LU
          = generate_compare_long_string_sve(LU);
      StubRoutines::aarch64::_compare_long_string_UL
          = generate_compare_long_string_sve(UL);
    }
  }

  // R0 = result
  // R1 = str2
  // R2 = cnt1
  // R3 = str1
  // R4 = cnt2
  // Clobbers: rscratch1, rscratch2, v0, v1, rflags
  //
  // This generic linear code uses a few additional ideas that make it faster:
  // 1) we can safely keep at least the 1st register of the pattern (since
  //    length >= 8), in order to skip initial loading (this helps on systems
  //    with one load pipeline)
  // 2) we can use a "fast" algorithm for finding the single character to
  //    search for the first symbol with fewer branches (one branch per loaded
  //    register instead of one branch per symbol); this is where constants
  //    like 0x0101...01, 0x00010001...0001, 0x7f7f...7f and 0x7fff7fff...7fff
  //    come from
  // 3) after the 1st register of the source string is loaded and analyzed, it
  //    can be used to search for every 1st-character entry, saving a few
  //    loads compared with a simpler-but-slower implementation
  // 4) in order to avoid lots of push/pop operations, the code below heavily
  //    re-uses/re-initializes/compresses register values, which makes the
  //    code larger and a bit less readable; however, most of the extra
  //    operations are issued during loads or branches, so the penalty is
  //    minimal
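  // Idea 2) above is the classic SWAR zero-byte test: a loaded word is XORed
  // with the first pattern character replicated into every byte (the mul by
  // 0x0101010101010101 does the replication), so a zero byte marks a
  // candidate match position. A C sketch of the detector for the byte case
  // (illustrative only, not the stub's code):
  //
  //   uint64_t x = loaded ^ (first * 0x0101010101010101ull);
  //   uint64_t t = (x - 0x0101010101010101ull) & ~x & 0x8080808080808080ull;
  //   // t != 0  iff  some byte of x is zero, i.e. a candidate position
  //
  // The stub expresses the same test with sub/orr/bics so that the condition
  // flags are set directly; 0x0001000100010001 and 0x7fff7fff7fff7fff are
  // the 16-bit-character analogues.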
  address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
    const char* stubName = str1_isL
        ? (str2_isL ? "indexof_linear_ll" : "indexof_linear_ul")
        : "indexof_linear_uu";
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", stubName);
    address entry = __ pc();

    int str1_chr_size = str1_isL ? 1 : 2;
    int str2_chr_size = str2_isL ? 1 : 2;
    int str1_chr_shift = str1_isL ? 0 : 1;
    int str2_chr_shift = str2_isL ? 0 : 1;
    bool isL = str1_isL && str2_isL;
    // parameters
    Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4;
    // temporary registers
    Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23;
    RegSet spilled_regs = RegSet::range(tmp1, tmp4);
    // redefinitions
    Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3;

    __ push(spilled_regs, sp);
    Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
        L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
        L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
        L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
        L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
        L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
    // Read a whole register from str1. It is safe, because length >= 8 here
    __ ldr(ch1, Address(str1));
    // Read a whole register from str2. It is safe, because length >= 8 here
    __ ldr(ch2, Address(str2));
    __ sub(cnt2, cnt2, cnt1);
    __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF);
    if (str1_isL != str2_isL) {
      __ eor(v0, __ T16B, v0, v0);
    }
    __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
    __ mul(first, first, tmp1);
    // check if we have less than 1 register to check
    __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1);
    if (str1_isL != str2_isL) {
      __ fmovd(v1, ch1);
    }
    __ br(__ LE, L_SMALL);
    __ eor(ch2, first, ch2);
    if (str1_isL != str2_isL) {
      __ zip1(v1, __ T16B, v1, v0);
    }
    __ sub(tmp2, ch2, tmp1);
    __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
    __ bics(tmp2, tmp2, ch2);
    if (str1_isL != str2_isL) {
      __ fmovd(ch1, v1);
    }
    __ br(__ NE, L_HAS_ZERO);
    __ subs(cnt2, cnt2, wordSize/str2_chr_size);
    __ add(result, result, wordSize/str2_chr_size);
    __ add(str2, str2, wordSize);
    __ br(__ LT, L_POST_LOOP);
    __ BIND(L_LOOP);
    __ ldr(ch2, Address(str2));
    __ eor(ch2, first, ch2);
    __ sub(tmp2, ch2, tmp1);
    __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
    __ bics(tmp2, tmp2, ch2);
    __ br(__ NE, L_HAS_ZERO);
    __ BIND(L_LOOP_PROCEED);
    __ subs(cnt2, cnt2, wordSize/str2_chr_size);
    __ add(str2, str2, wordSize);
    __ add(result, result, wordSize/str2_chr_size);
    __ br(__ GE, L_LOOP);
    __ BIND(L_POST_LOOP);
    __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check
    __ br(__ LE, NOMATCH);
    __ ldr(ch2, Address(str2));
    __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
    __ eor(ch2, first, ch2);
    __ sub(tmp2, ch2, tmp1);
    __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
    __ mov(tmp4, -1); // all bits set
    __ b(L_SMALL_PROCEED);
    __ align(OptoLoopAlignment);
    __ BIND(L_SMALL);
    __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
    __ eor(ch2, first, ch2);
    if (str1_isL != str2_isL) {
      __ zip1(v1, __ T16B, v1, v0);
    }
    __ sub(tmp2, ch2, tmp1);
    __ mov(tmp4, -1); // all bits set
    __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
    if (str1_isL != str2_isL) {
      __ fmovd(ch1, v1); // move converted 4 symbols
    }
    __ BIND(L_SMALL_PROCEED);
    __ lsrv(tmp4, tmp4, cnt2); // mask: zeroes in the useless bits
    __ bic(tmp2, tmp2, ch2);
    __ ands(tmp2, tmp2, tmp4); // clear useless bits and check
    __ rbit(tmp2, tmp2);
    __ br(__ EQ, NOMATCH);
    __ BIND(L_SMALL_HAS_ZERO_LOOP);
    __ clz(tmp4, tmp2); // potentially long; up to 4 cycles on some CPUs
    __ cmp(cnt1, u1(wordSize/str2_chr_size));
    __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2);
    if (str2_isL) { // LL
      __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
      __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
      __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
      __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
      __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
    } else {
      __ mov(ch2, 0xE); // all bits in byte set except last one
      __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
      __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
      __ lslv(tmp2, tmp2, tmp4);
      __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
      __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
      __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
      __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
    }
    __ cmp(ch1, ch2);
    __ mov(tmp4, wordSize/str2_chr_size);
    __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
    __ BIND(L_SMALL_CMP_LOOP);
    str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
             : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
    str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
             : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
    __ add(tmp4, tmp4, 1);
    __ cmp(tmp4, cnt1);
    __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP);
    __ cmp(first, ch2);
    __ br(__ EQ, L_SMALL_CMP_LOOP);
    __ BIND(L_SMALL_CMP_LOOP_NOMATCH);
    __ cbz(tmp2, NOMATCH); // no more matches. exit
    __ clz(tmp4, tmp2);
    __ add(result, result, 1); // advance index
    __ add(str2, str2, str2_chr_size); // advance pointer
    __ b(L_SMALL_HAS_ZERO_LOOP);
    __ align(OptoLoopAlignment);
    __ BIND(L_SMALL_CMP_LOOP_LAST_CMP);
    __ cmp(first, ch2);
    __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
    __ b(DONE);
    __ align(OptoLoopAlignment);
    __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2);
    if (str2_isL) { // LL
      __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
      __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
      __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
      __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
      __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
    } else {
      __ mov(ch2, 0xE); // all bits in byte set except last one
      __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
      __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
      __ lslv(tmp2, tmp2, tmp4);
      __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
      __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
      __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
      __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
    }
    __ cmp(ch1, ch2);
    __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
    __ b(DONE);
    __ align(OptoLoopAlignment);
    __ BIND(L_HAS_ZERO);
    __ rbit(tmp2, tmp2);
    __ clz(tmp4, tmp2); // potentially long; up to 4 cycles on some CPUs
    // Now, compress the counters (cnt2 and cnt1) into one register. This is
    // fine because both counters are 32-bit and are not changed in this
    // loop; they are restored on exit, so cnt1 can be re-used in this loop.
    __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2);
    __ sub(result, result, 1);
    __ BIND(L_HAS_ZERO_LOOP);
    __ mov(cnt1, wordSize/str2_chr_size);
    __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2);
    __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare
    if (str2_isL) {
      __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
      __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
      __ lslv(tmp2, tmp2, tmp4);
      __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
      __ add(tmp4, tmp4, 1);
      __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
      __ lsl(tmp2, tmp2, 1);
      __ mov(tmp4, wordSize/str2_chr_size);
    } else {
      __ mov(ch2, 0xE);
      __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
      __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
      __ lslv(tmp2, tmp2, tmp4);
      __ add(tmp4, tmp4, 1);
      __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
      __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
      __ lsl(tmp2, tmp2, 1);
      __ mov(tmp4, wordSize/str2_chr_size);
      __ sub(str2, str2, str2_chr_size);
    }
    __ cmp(ch1, ch2);
    __ mov(tmp4, wordSize/str2_chr_size);
    __ br(__ NE, L_CMP_LOOP_NOMATCH);
    __ BIND(L_CMP_LOOP);
    str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
             : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
    str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
             : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
    __ add(tmp4, tmp4, 1);
    __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2);
    __ br(__ GE, L_CMP_LOOP_LAST_CMP);
    __ cmp(cnt1, ch2);
    __ br(__ EQ, L_CMP_LOOP);
    __ BIND(L_CMP_LOOP_NOMATCH);
    // here we're not matched
    __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop
    __ clz(tmp4, tmp2);
    __ add(str2, str2, str2_chr_size); // advance pointer
    __ b(L_HAS_ZERO_LOOP);
    __ align(OptoLoopAlignment);
    __ BIND(L_CMP_LOOP_LAST_CMP);
    __ cmp(cnt1, ch2);
    __ br(__ NE, L_CMP_LOOP_NOMATCH);
    __ b(DONE);
    __ align(OptoLoopAlignment);
    __ BIND(L_CMP_LOOP_LAST_CMP2);
    if (str2_isL) {
      __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
      __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
      __ lslv(tmp2, tmp2, tmp4);
      __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
      __ add(tmp4, tmp4, 1);
      __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
      __ lsl(tmp2, tmp2, 1);
    } else {
      __ mov(ch2, 0xE);
      __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
      __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
      __ lslv(tmp2, tmp2, tmp4);
      __ add(tmp4, tmp4, 1);
      __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
      __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
      __ lsl(tmp2, tmp2, 1);
      __ sub(str2, str2, str2_chr_size);
    }
    __ cmp(ch1, ch2);
    __ br(__ NE, L_CMP_LOOP_NOMATCH);
    __ b(DONE);
    __ align(OptoLoopAlignment);
    __ BIND(L_HAS_ZERO_LOOP_NOMATCH);
    // 1) Restore the "result" index. The index was wordSize/str2_chr_size * N
    //    until the L_HAS_ZERO block. The byte octet was analyzed in
    //    L_HAS_ZERO_LOOP, so result was increased by at most
    //    wordSize/str2_chr_size - 1 and the respective high bit wasn't
    //    changed. L_LOOP_PROCEED will increase result by the number of
    //    analyzed characters, so we can just reset the lower bits of result
    //    here: clear the 2 lower bits for UU/UL and 3 bits for LL.
    // 2) Restore the cnt1 and cnt2 values from the "compressed" cnt2.
    // 3) Advance str2 to the next str2 octet. result & 7 (or & 3) is the
    //    index of the last analyzed substring inside the current octet, so
    //    str2 is at the respective start address; we need to advance it to
    //    the next octet.
    __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed
    __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2);
    __ bfm(result, zr, 0, 2 - str2_chr_shift);
    __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2
    __ movw(cnt2, cnt2);
    __ b(L_LOOP_PROCEED);
    __ align(OptoLoopAlignment);
    __ BIND(NOMATCH);
    __ mov(result, -1);
    __ BIND(DONE);
    __ pop(spilled_regs, sp);
    __ ret(lr);
    return entry;
  }

  void generate_string_indexof_stubs() {
    StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
    StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
    StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
  }
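  // Byte-array inflation widens Latin1 bytes to UTF-16 characters, i.e.
  // conceptually (illustrative pseudocode only):
  //
  //   for (int i = 0; i < len; i++) { dst16[i] = src8[i] & 0xff; }
  //
  // The helper below handles 32 source bytes at a time: zip1/zip2 interleave
  // each source vector with the zero vector kept in v0, producing the low
  // and high halves of the widened data, which are stored with a single st1.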
  void inflate_and_store_2_fp_registers(bool generatePrfm,
                                        FloatRegister src1, FloatRegister src2) {
    Register dst = r1;
    __ zip1(v1, __ T16B, src1, v0);
    __ zip2(v2, __ T16B, src1, v0);
    if (generatePrfm) {
      __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM);
    }
    __ zip1(v3, __ T16B, src2, v0);
    __ zip2(v4, __ T16B, src2, v0);
    __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64)));
  }

  // R0 = src
  // R1 = dst
  // R2 = len
  // R3 = len >> 3
  // V0 = 0
  // v1 = loaded 8 bytes
  // Clobbers: r0, r1, r3, rscratch1, rflags, v0-v6
  address generate_large_byte_array_inflate() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "large_byte_array_inflate");
    address entry = __ pc();
    Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
    Register src = r0, dst = r1, len = r2, octetCounter = r3;
    const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4;

    // do one more 8-byte read so the address is 16-byte aligned in most
    // cases; this also allows using a single store instruction
    __ ldrd(v2, __ post(src, 8));
    __ sub(octetCounter, octetCounter, 2);
    __ zip1(v1, __ T16B, v1, v0);
    __ zip1(v2, __ T16B, v2, v0);
    __ st1(v1, v2, __ T16B, __ post(dst, 32));
    __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
    __ subs(rscratch1, octetCounter, large_loop_threshold);
    __ br(__ LE, LOOP_START);
    __ b(LOOP_PRFM_START);
    __ bind(LOOP_PRFM);
    __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
    __ bind(LOOP_PRFM_START);
    __ prfm(Address(src, SoftwarePrefetchHintDistance));
    __ sub(octetCounter, octetCounter, 8);
    __ subs(rscratch1, octetCounter, large_loop_threshold);
    inflate_and_store_2_fp_registers(true, v3, v4);
    inflate_and_store_2_fp_registers(true, v5, v6);
    __ br(__ GT, LOOP_PRFM);
    __ cmp(octetCounter, (u1)8);
    __ br(__ LT, DONE);
    __ bind(LOOP);
    __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
    __ bind(LOOP_START);
    __ sub(octetCounter, octetCounter, 8);
    __ cmp(octetCounter, (u1)8);
    inflate_and_store_2_fp_registers(false, v3, v4);
    inflate_and_store_2_fp_registers(false, v5, v6);
    __ br(__ GE, LOOP);
    __ bind(DONE);
    __ ret(lr);
    return entry;
  }

  /**
   *  Arguments:
   *
   *  Input:
   *  c_rarg0   - current state address
   *  c_rarg1   - H key address
   *  c_rarg2   - data address
   *  c_rarg3   - number of blocks
   *
   *  Output:
   *  Updated state at c_rarg0
   */
  address generate_ghash_processBlocks() {
    // Bafflingly, GCM uses little-endian for the byte order, but
    // big-endian for the bit order. For example, the polynomial 1 is
    // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
    //
    // So, we must either reverse the bytes in each word and do
    // everything big-endian or reverse the bits in each byte and do
    // it little-endian. On AArch64 it's more idiomatic to reverse
    // the bits in each byte (we have an instruction, RBIT, to do
    // that) and keep the data in little-endian bit order through the
    // calculation, bit-reversing the inputs and outputs.
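    //
    // For reference (a fact about GF(2^128), not this stub's code): the
    // constant 0x87 emitted below encodes the low-order terms of the GHASH
    // reduction polynomial. Multiplication here is carry-less multiplication
    // in GF(2)[z] reduced modulo z^128 + z^7 + z^2 + z + 1, so any overflow
    // at z^128 folds back in as z^7 + z^2 + z + 1, whose coefficient bits
    // are 10000111 in binary, i.e. 0x87.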

    StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
    __ align(wordSize * 2);
    address p = __ pc();
    __ emit_int64(0x87);  // The low-order bits of the field
                          // polynomial (i.e. p = z^7+z^2+z+1)
                          // repeated in the low and high parts of a
                          // 128-bit vector
    __ emit_int64(0x87);

    __ align(CodeEntryAlignment);
    address start = __ pc();

    Register state   = c_rarg0;
    Register subkeyH = c_rarg1;
    Register data    = c_rarg2;
    Register blocks  = c_rarg3;

    FloatRegister vzr = v30;
    __ eor(vzr, __ T16B, vzr, vzr); // zero register

    __ ldrq(v24, p); // The field polynomial

    __ ldrq(v0, Address(state));
    __ ldrq(v1, Address(subkeyH));

    __ rev64(v0, __ T16B, v0); // Bit-reverse words in state and subkeyH
    __ rbit(v0, __ T16B, v0);
    __ rev64(v1, __ T16B, v1);
    __ rbit(v1, __ T16B, v1);

    __ ext(v4, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1
    __ eor(v4, __ T16B, v4, v1);       // xor subkeyH into subkeyL (Karatsuba: (A1+A0))

    {
      Label L_ghash_loop;
      __ bind(L_ghash_loop);

      __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
                                                 // reversing each byte
      __ rbit(v2, __ T16B, v2);
      __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state

      // Multiply state in v2 by subkey in v1
      __ ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
                        /*a*/v1, /*b*/v2, /*a1_xor_a0*/v4,
                        /*temps*/v6, v3, /*reuse/clobber b*/v2);
      // Reduce v7:v5 by the field polynomial
      __ ghash_reduce(/*result*/v0, /*lo*/v5, /*hi*/v7, /*p*/v24, vzr, /*temp*/v3);

      __ sub(blocks, blocks, 1);
      __ cbnz(blocks, L_ghash_loop);
    }

    // The bit-reversed result is at this point in v0
    __ rev64(v0, __ T16B, v0);
    __ rbit(v0, __ T16B, v0);

    __ st1(v0, __ T16B, state);
    __ ret(lr);

    return start;
  }

  address generate_ghash_processBlocks_wide() {
    address small = generate_ghash_processBlocks();

    StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks_wide");
    __ align(wordSize * 2);
    address p = __ pc();
    __ emit_int64(0x87);  // The low-order bits of the field
                          // polynomial (i.e. p = z^7+z^2+z+1)
                          // repeated in the low and high parts of a
                          // 128-bit vector
    __ emit_int64(0x87);

    __ align(CodeEntryAlignment);
    address start = __ pc();

    Register state   = c_rarg0;
    Register subkeyH = c_rarg1;
    Register data    = c_rarg2;
    Register blocks  = c_rarg3;

    const int unroll = 4;

    __ cmp(blocks, (unsigned char)(unroll * 2));
    __ br(__ LT, small);

    if (unroll > 1) {
      // Save state before entering routine
      __ sub(sp, sp, 4 * 16);
      __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
      __ sub(sp, sp, 4 * 16);
      __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
    }

    __ ghash_processBlocks_wide(p, state, subkeyH, data, blocks, unroll);

    if (unroll > 1) {
      // And restore state
      __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
      __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
    }

    __ cmp(blocks, (unsigned char)0);
    __ br(__ GT, small);

    __ ret(lr);

    return start;
  }
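  // Base64 encoding regroups each 3 input bytes into 4 six-bit indices into
  // the codec table. A worked example (the standard one, not from this
  // file): "Man" = 0x4D 0x61 0x6E = 010011 010110 000101 101110 in bit
  // order, i.e. indices 19, 22, 5, 46, which encode to "TWFu". The SIMD
  // round below performs this regrouping for 16 (or 8) triples at once:
  // ld3 de-interleaves the input bytes, the shl/ushr/orr sequence assembles
  // the four index vectors, and tbl maps them through the 64-entry table.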
  void generate_base64_encode_simdround(Register src, Register dst,
                                        FloatRegister codec, u8 size) {

    FloatRegister in0  = v4,  in1  = v5,  in2  = v6;
    FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19;
    FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23;

    Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;

    __ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size));

    __ ushr(ind0, arrangement, in0, 2);

    __ ushr(ind1, arrangement, in1, 2);
    __ shl(in0, arrangement, in0, 6);
    __ orr(ind1, arrangement, ind1, in0);
    __ ushr(ind1, arrangement, ind1, 2);

    __ ushr(ind2, arrangement, in2, 4);
    __ shl(in1, arrangement, in1, 4);
    __ orr(ind2, arrangement, in1, ind2);
    __ ushr(ind2, arrangement, ind2, 2);

    __ shl(ind3, arrangement, in2, 2);
    __ ushr(ind3, arrangement, ind3, 2);

    __ tbl(out0, arrangement, codec, 4, ind0);
    __ tbl(out1, arrangement, codec, 4, ind1);
    __ tbl(out2, arrangement, codec, 4, ind2);
    __ tbl(out3, arrangement, codec, 4, ind3);

    __ st4(out0, out1, out2, out3, arrangement, __ post(dst, 4 * size));
  }

  /**
   *  Arguments:
   *
   *  Input:
   *  c_rarg0   - src_start
   *  c_rarg1   - src_offset
   *  c_rarg2   - src_length
   *  c_rarg3   - dest_start
   *  c_rarg4   - dest_offset
   *  c_rarg5   - isURL
   *
   */
  address generate_base64_encodeBlock() {

    static const char toBase64[64] = {
      'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
      'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
      'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
      'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
      '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
    };

    static const char toBase64URL[64] = {
      'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
      'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
      'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
      'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
      '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_'
    };

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "encodeBlock");
    address start = __ pc();

    Register src    = c_rarg0;  // source array
    Register soff   = c_rarg1;  // source start offset
    Register send   = c_rarg2;  // source end offset
    Register dst    = c_rarg3;  // dest array
    Register doff   = c_rarg4;  // position for writing to dest array
    Register isURL  = c_rarg5;  // Base64 or URL character set

    // c_rarg6 and c_rarg7 are free to use as temps
    Register codec  = c_rarg6;
    Register length = c_rarg7;

    Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit;

    __ add(src, src, soff);
    __ add(dst, dst, doff);
    __ sub(length, send, soff);

    // load the codec base address
    __ lea(codec, ExternalAddress((address) toBase64));
    __ cbz(isURL, ProcessData);
    __ lea(codec, ExternalAddress((address) toBase64URL));

    __ BIND(ProcessData);

    // too short to form up a SIMD loop; fall back
    __ cmp(length, (u1)24);
    __ br(Assembler::LT, Process3B);

    __ ld1(v0, v1, v2, v3, __ T16B, Address(codec));

    __ BIND(Process48B);
    __ cmp(length, (u1)48);
    __ br(Assembler::LT, Process24B);
    generate_base64_encode_simdround(src, dst, v0, 16);
    __ sub(length, length, 48);
    __ b(Process48B);

    __ BIND(Process24B);
    __ cmp(length, (u1)24);
    __ br(Assembler::LT, SIMDExit);
    generate_base64_encode_simdround(src, dst, v0, 8);
    __ sub(length, length, 24);

    __ BIND(SIMDExit);
    __ cbz(length, Exit);

    __ BIND(Process3B);
    //  3 src bytes, 24 bits
    __ ldrb(r10, __ post(src, 1));
    __ ldrb(r11, __ post(src, 1));
    __ ldrb(r12, __ post(src, 1));
    __ orrw(r11, r11, r10, Assembler::LSL, 8);
    __ orrw(r12, r12, r11, Assembler::LSL, 8);
    // codec index
    __ ubfmw(r15, r12, 18, 23);
    __ ubfmw(r14, r12, 12, 17);
    __ ubfmw(r13, r12,  6, 11);
    __ andw(r12, r12, 63);
    // get the code based on the codec
    __ ldrb(r15, Address(codec, r15, Address::uxtw(0)));
    __ ldrb(r14, Address(codec, r14, Address::uxtw(0)));
    __ ldrb(r13, Address(codec, r13, Address::uxtw(0)));
    __ ldrb(r12, Address(codec, r12, Address::uxtw(0)));
    __ strb(r15, __ post(dst, 1));
    __ strb(r14, __ post(dst, 1));
    __ strb(r13, __ post(dst, 1));
    __ strb(r12, __ post(dst, 1));
    __ sub(length, length, 3);
    __ cbnz(length, Process3B);

    __ BIND(Exit);
    __ ret(lr);

    return start;
  }
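  // The decode round below splits the table lookup in two because a single
  // tbl/tbx can index at most 64 table bytes while legal Base64 codes span
  // [0, 127]: indices 0..63 are looked up in the lower-half table with tbl,
  // and a second lookup uses tbx with an index formed by an unsigned
  // saturating subtract, so that (per the comment in the code) lanes already
  // covered by the lower half read a 0 entry from the upper half. ORing the
  // two partial results yields the 6-bit values; any lane whose combined
  // value still exceeds 63 (compared against the constant kept in v27) is
  // flagged as illegal input.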
6546 // handle illegal input 6547 __ umov(r10, in2, __ D, 0); 6548 if (size == 16) { 6549 __ cbnz(r10, ErrorInLowerHalf); 6550 6551 // illegal input is in higher half, store the lower half now. 6552 __ st3(out0, out1, out2, __ T8B, __ post(dst, 24)); 6553 6554 __ umov(r10, in2, __ D, 1); 6555 __ umov(r11, out0, __ D, 1); 6556 __ umov(r12, out1, __ D, 1); 6557 __ umov(r13, out2, __ D, 1); 6558 __ b(StoreLegalData); 6559 6560 __ BIND(ErrorInLowerHalf); 6561 } 6562 __ umov(r11, out0, __ D, 0); 6563 __ umov(r12, out1, __ D, 0); 6564 __ umov(r13, out2, __ D, 0); 6565 6566 __ BIND(StoreLegalData); 6567 __ tbnz(r10, 5, Exit); // 0xff indicates illegal input 6568 __ strb(r11, __ post(dst, 1)); 6569 __ strb(r12, __ post(dst, 1)); 6570 __ strb(r13, __ post(dst, 1)); 6571 __ lsr(r10, r10, 8); 6572 __ lsr(r11, r11, 8); 6573 __ lsr(r12, r12, 8); 6574 __ lsr(r13, r13, 8); 6575 __ b(StoreLegalData); 6576 6577 __ BIND(NoIllegalData); 6578 __ st3(out0, out1, out2, arrangement, __ post(dst, 3 * size)); 6579 } 6580 6581 6582 /** 6583 * Arguments: 6584 * 6585 * Input: 6586 * c_rarg0 - src_start 6587 * c_rarg1 - src_offset 6588 * c_rarg2 - src_length 6589 * c_rarg3 - dest_start 6590 * c_rarg4 - dest_offset 6591 * c_rarg5 - isURL 6592 * c_rarg6 - isMIME 6593 * 6594 */ 6595 address generate_base64_decodeBlock() { 6596 6597 // The SIMD part of this Base64 decode intrinsic is based on the algorithm outlined 6598 // on http://0x80.pl/articles/base64-simd-neon.html#encoding-quadwords, in the section 6599 // titled "Base64 decoding". 6600 6601 // Non-SIMD lookup tables are mostly dumped from the fromBase64 array used in java.util.Base64, 6602 // except that the trailing character '=' is also treated as an illegal value in this intrinsic. That 6603 // is, java.util.Base64.fromBase64['='] = -2, while fromBase(URL)64ForNoSIMD['='] = 255 here.
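// (Illustrative aside, assuming the tables below.) A scalar decode
// step against these tables behaves approximately as:
//
//   uint8_t v = fromBase64ForNoSIMD[(uint8_t)ch];
//   if (v & 0x80) break;   // 255u: illegal, including '=' padding
//
// so the stub stops at the first illegal character, reports how many
// bytes it produced, and leaves any '=' padding to the Java caller.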
6604 static const uint8_t fromBase64ForNoSIMD[256] = { 6605 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6606 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6607 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 255u, 63u, 6608 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 6609 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u, 6610 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 255u, 6611 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u, 6612 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u, 6613 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6614 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6615 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6616 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6617 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6618 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6619 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6620 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6621 }; 6622 6623 static const uint8_t fromBase64URLForNoSIMD[256] = { 6624 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6625 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6626 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 6627 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 6628 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u, 6629 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 63u, 6630 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u, 6631 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u, 6632 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6633 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6634 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6635 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6636 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6637 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6638 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6639 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6640 }; 6641 6642 // A legal value of base64 code is in range [0, 127]. We need two lookups 6643 // with tbl/tbx and combine them to get the decode data. The 1st table vector 6644 // lookup use tbl, out of range indices are set to 0 in destination. 
The 2nd 6645 // table vector lookup use tbx, out of range indices are unchanged in 6646 // destination. Input [64..126] is mapped to index [65, 127] in second lookup. 6647 // The value of index 64 is set to 0, so that we know that we already get the 6648 // decoded data with the 1st lookup. 6649 static const uint8_t fromBase64ForSIMD[128] = { 6650 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6651 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6652 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 255u, 63u, 6653 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 6654 0u, 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 6655 14u, 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 6656 255u, 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 6657 40u, 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 6658 }; 6659 6660 static const uint8_t fromBase64URLForSIMD[128] = { 6661 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6662 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6663 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 6664 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 6665 0u, 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 6666 14u, 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 6667 63u, 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 6668 40u, 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 6669 }; 6670 6671 __ align(CodeEntryAlignment); 6672 StubCodeMark mark(this, "StubRoutines", "decodeBlock"); 6673 address start = __ pc(); 6674 6675 Register src = c_rarg0; // source array 6676 Register soff = c_rarg1; // source start offset 6677 Register send = c_rarg2; // source end offset 6678 Register dst = c_rarg3; // dest array 6679 Register doff = c_rarg4; // position for writing to dest array 6680 Register isURL = c_rarg5; // Base64 or URL character set 6681 Register isMIME = c_rarg6; // Decoding MIME block - unused in this implementation 6682 6683 Register length = send; // reuse send as length of source data to process 6684 6685 Register simd_codec = c_rarg6; 6686 Register nosimd_codec = c_rarg7; 6687 6688 Label ProcessData, Process64B, Process32B, Process4B, SIMDEnter, SIMDExit, Exit; 6689 6690 __ enter(); 6691 6692 __ add(src, src, soff); 6693 __ add(dst, dst, doff); 6694 6695 __ mov(doff, dst); 6696 6697 __ sub(length, send, soff); 6698 __ bfm(length, zr, 0, 1); 6699 6700 __ lea(nosimd_codec, ExternalAddress((address) fromBase64ForNoSIMD)); 6701 __ cbz(isURL, ProcessData); 6702 __ lea(nosimd_codec, ExternalAddress((address) fromBase64URLForNoSIMD)); 6703 6704 __ BIND(ProcessData); 6705 __ mov(rscratch1, length); 6706 __ cmp(length, (u1)144); // 144 = 80 + 64 6707 __ br(Assembler::LT, Process4B); 6708 6709 // In the MIME case, the line length cannot be more than 76 6710 // bytes (see RFC 2045). This is too short a block for SIMD 6711 // to be worthwhile, so we use non-SIMD here. 
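// (Clarifying note.) The movw below seeds the scalar loop with 79 so
// that Process4B runs exactly 20 times (subsw by 4: 75, 71, ..., 3,
// then -1), consuming 80 bytes; afterwards rscratch1 == -1 marks this
// pre-pass. A short tail instead enters Process4B with rscratch1 ==
// length (a multiple of 4 after the bfm above) and counts down to 0.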
6712 __ movw(rscratch1, 79); 6713 6714 __ BIND(Process4B); 6715 __ ldrw(r14, __ post(src, 4)); 6716 __ ubfxw(r10, r14, 0, 8); 6717 __ ubfxw(r11, r14, 8, 8); 6718 __ ubfxw(r12, r14, 16, 8); 6719 __ ubfxw(r13, r14, 24, 8); 6720 // get the de-code 6721 __ ldrb(r10, Address(nosimd_codec, r10, Address::uxtw(0))); 6722 __ ldrb(r11, Address(nosimd_codec, r11, Address::uxtw(0))); 6723 __ ldrb(r12, Address(nosimd_codec, r12, Address::uxtw(0))); 6724 __ ldrb(r13, Address(nosimd_codec, r13, Address::uxtw(0))); 6725 // error detection, 255u indicates an illegal input 6726 __ orrw(r14, r10, r11); 6727 __ orrw(r15, r12, r13); 6728 __ orrw(r14, r14, r15); 6729 __ tbnz(r14, 7, Exit); 6730 // recover the data 6731 __ lslw(r14, r10, 10); 6732 __ bfiw(r14, r11, 4, 6); 6733 __ bfmw(r14, r12, 2, 5); 6734 __ rev16w(r14, r14); 6735 __ bfiw(r13, r12, 6, 2); 6736 __ strh(r14, __ post(dst, 2)); 6737 __ strb(r13, __ post(dst, 1)); 6738 // non-simd loop 6739 __ subsw(rscratch1, rscratch1, 4); 6740 __ br(Assembler::GT, Process4B); 6741 6742 // if exiting from PreProcess80B, rscratch1 == -1; 6743 // otherwise, rscratch1 == 0. 6744 __ cbzw(rscratch1, Exit); 6745 __ sub(length, length, 80); 6746 6747 __ lea(simd_codec, ExternalAddress((address) fromBase64ForSIMD)); 6748 __ cbz(isURL, SIMDEnter); 6749 __ lea(simd_codec, ExternalAddress((address) fromBase64URLForSIMD)); 6750 6751 __ BIND(SIMDEnter); 6752 __ ld1(v0, v1, v2, v3, __ T16B, __ post(simd_codec, 64)); 6753 __ ld1(v4, v5, v6, v7, __ T16B, Address(simd_codec)); 6754 __ mov(rscratch1, 63); 6755 __ dup(v27, __ T16B, rscratch1); 6756 6757 __ BIND(Process64B); 6758 __ cmp(length, (u1)64); 6759 __ br(Assembler::LT, Process32B); 6760 generate_base64_decode_simdround(src, dst, v0, v4, 16, Exit); 6761 __ sub(length, length, 64); 6762 __ b(Process64B); 6763 6764 __ BIND(Process32B); 6765 __ cmp(length, (u1)32); 6766 __ br(Assembler::LT, SIMDExit); 6767 generate_base64_decode_simdround(src, dst, v0, v4, 8, Exit); 6768 __ sub(length, length, 32); 6769 __ b(Process32B); 6770 6771 __ BIND(SIMDExit); 6772 __ cbz(length, Exit); 6773 __ movw(rscratch1, length); 6774 __ b(Process4B); 6775 6776 __ BIND(Exit); 6777 __ sub(c_rarg0, dst, doff); 6778 6779 __ leave(); 6780 __ ret(lr); 6781 6782 return start; 6783 } 6784 6785 // Support for spin waits. 6786 address generate_spin_wait() { 6787 __ align(CodeEntryAlignment); 6788 StubCodeMark mark(this, "StubRoutines", "spin_wait"); 6789 address start = __ pc(); 6790 6791 __ spin_wait(); 6792 __ ret(lr); 6793 6794 return start; 6795 } 6796 6797 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS) 6798 6799 // ARMv8.1 LSE versions of the atomic stubs used by Atomic::PlatformXX. 6800 // 6801 // If LSE is in use, generate LSE versions of all the stubs. The 6802 // non-LSE versions are in atomic_aarch64.S. 6803 6804 // class AtomicStubMark records the entry point of a stub and the 6805 // stub pointer which will point to it. The stub pointer is set to 6806 // the entry point when ~AtomicStubMark() is called, which must be 6807 // after ICache::invalidate_range. This ensures safe publication of 6808 // the generated code. 
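// Typical use, mirroring generate_atomic_entry_points() below:
//
//   AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl);
//   gen_ldadd_entry(Assembler::word, memory_order_conservative);
//   ...
//   ICache::invalidate_range(first_entry, __ pc() - first_entry);
//   // the AtomicStubMark destructors run when the marks go out of
//   // scope, publishing the entry points only after the invalidate.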
6809 class AtomicStubMark { 6810 address _entry_point; 6811 aarch64_atomic_stub_t *_stub; 6812 MacroAssembler *_masm; 6813 public: 6814 AtomicStubMark(MacroAssembler *masm, aarch64_atomic_stub_t *stub) { 6815 _masm = masm; 6816 __ align(32); 6817 _entry_point = __ pc(); 6818 _stub = stub; 6819 } 6820 ~AtomicStubMark() { 6821 *_stub = (aarch64_atomic_stub_t)_entry_point; 6822 } 6823 }; 6824 6825 // NB: For memory_order_conservative we need a trailing membar after 6826 // LSE atomic operations but not a leading membar. 6827 // 6828 // We don't need a leading membar because a clause in the Arm ARM 6829 // says: 6830 // 6831 // Barrier-ordered-before 6832 // 6833 // Barrier instructions order prior Memory effects before subsequent 6834 // Memory effects generated by the same Observer. A read or a write 6835 // RW1 is Barrier-ordered-before a read or a write RW 2 from the same 6836 // Observer if and only if RW1 appears in program order before RW 2 6837 // and [ ... ] at least one of RW 1 and RW 2 is generated by an atomic 6838 // instruction with both Acquire and Release semantics. 6839 // 6840 // All the atomic instructions {ldaddal, swapal, casal} have Acquire 6841 // and Release semantics, therefore we don't need a leading 6842 // barrier. However, there is no corresponding Barrier-ordered-after 6843 // relationship, therefore we need a trailing membar to prevent a 6844 // later store or load from being reordered with the store in an 6845 // atomic instruction. 6846 // 6847 // This was checked by using the herd7 consistency model simulator 6848 // (http://diy.inria.fr/) with this test case: 6849 // 6850 // AArch64 LseCas 6851 // { 0:X1=x; 0:X2=y; 1:X1=x; 1:X2=y; } 6852 // P0 | P1; 6853 // LDR W4, [X2] | MOV W3, #0; 6854 // DMB LD | MOV W4, #1; 6855 // LDR W3, [X1] | CASAL W3, W4, [X1]; 6856 // | DMB ISH; 6857 // | STR W4, [X2]; 6858 // exists 6859 // (0:X3=0 /\ 0:X4=1) 6860 // 6861 // If X3 == 0 && X4 == 1, the store to y in P1 has been reordered 6862 // with the store to x in P1. Without the DMB in P1 this may happen. 6863 // 6864 // At the time of writing we don't know of any AArch64 hardware that 6865 // reorders stores in this way, but the Reference Manual permits it. 6866 6867 void gen_cas_entry(Assembler::operand_size size, 6868 atomic_memory_order order) { 6869 Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1, 6870 exchange_val = c_rarg2; 6871 bool acquire, release; 6872 switch (order) { 6873 case memory_order_relaxed: 6874 acquire = false; 6875 release = false; 6876 break; 6877 case memory_order_release: 6878 acquire = false; 6879 release = true; 6880 break; 6881 default: 6882 acquire = true; 6883 release = true; 6884 break; 6885 } 6886 __ mov(prev, compare_val); 6887 __ lse_cas(prev, exchange_val, ptr, size, acquire, release, /*not_pair*/true); 6888 if (order == memory_order_conservative) { 6889 __ membar(Assembler::StoreStore|Assembler::StoreLoad); 6890 } 6891 if (size == Assembler::xword) { 6892 __ mov(r0, prev); 6893 } else { 6894 __ movw(r0, prev); 6895 } 6896 __ ret(lr); 6897 } 6898 6899 void gen_ldadd_entry(Assembler::operand_size size, atomic_memory_order order) { 6900 Register prev = r2, addr = c_rarg0, incr = c_rarg1; 6901 // If not relaxed, then default to conservative. Relaxed is the only 6902 // case we use enough to be worth specializing. 
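// (Rough C11 analogy, illustrative only:
//
//   relaxed:      old = atomic_fetch_add_explicit(p, incr, memory_order_relaxed);
//   conservative: old = atomic_fetch_add_explicit(p, incr, memory_order_acq_rel);
//                 atomic_thread_fence(memory_order_seq_cst);
//
// where the fence stands in for the trailing StoreStore|StoreLoad
// membar emitted below.)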
6903 if (order == memory_order_relaxed) { 6904 __ ldadd(size, incr, prev, addr); 6905 } else { 6906 __ ldaddal(size, incr, prev, addr); 6907 __ membar(Assembler::StoreStore|Assembler::StoreLoad); 6908 } 6909 if (size == Assembler::xword) { 6910 __ mov(r0, prev); 6911 } else { 6912 __ movw(r0, prev); 6913 } 6914 __ ret(lr); 6915 } 6916 6917 void gen_swpal_entry(Assembler::operand_size size) { 6918 Register prev = r2, addr = c_rarg0, incr = c_rarg1; 6919 __ swpal(size, incr, prev, addr); 6920 __ membar(Assembler::StoreStore|Assembler::StoreLoad); 6921 if (size == Assembler::xword) { 6922 __ mov(r0, prev); 6923 } else { 6924 __ movw(r0, prev); 6925 } 6926 __ ret(lr); 6927 } 6928 6929 void generate_atomic_entry_points() { 6930 if (! UseLSE) { 6931 return; 6932 } 6933 6934 __ align(CodeEntryAlignment); 6935 StubCodeMark mark(this, "StubRoutines", "atomic entry points"); 6936 address first_entry = __ pc(); 6937 6938 // ADD, memory_order_conservative 6939 AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl); 6940 gen_ldadd_entry(Assembler::word, memory_order_conservative); 6941 AtomicStubMark mark_fetch_add_8(_masm, &aarch64_atomic_fetch_add_8_impl); 6942 gen_ldadd_entry(Assembler::xword, memory_order_conservative); 6943 6944 // ADD, memory_order_relaxed 6945 AtomicStubMark mark_fetch_add_4_relaxed 6946 (_masm, &aarch64_atomic_fetch_add_4_relaxed_impl); 6947 gen_ldadd_entry(MacroAssembler::word, memory_order_relaxed); 6948 AtomicStubMark mark_fetch_add_8_relaxed 6949 (_masm, &aarch64_atomic_fetch_add_8_relaxed_impl); 6950 gen_ldadd_entry(MacroAssembler::xword, memory_order_relaxed); 6951 6952 // XCHG, memory_order_conservative 6953 AtomicStubMark mark_xchg_4(_masm, &aarch64_atomic_xchg_4_impl); 6954 gen_swpal_entry(Assembler::word); 6955 AtomicStubMark mark_xchg_8_impl(_masm, &aarch64_atomic_xchg_8_impl); 6956 gen_swpal_entry(Assembler::xword); 6957 6958 // CAS, memory_order_conservative 6959 AtomicStubMark mark_cmpxchg_1(_masm, &aarch64_atomic_cmpxchg_1_impl); 6960 gen_cas_entry(MacroAssembler::byte, memory_order_conservative); 6961 AtomicStubMark mark_cmpxchg_4(_masm, &aarch64_atomic_cmpxchg_4_impl); 6962 gen_cas_entry(MacroAssembler::word, memory_order_conservative); 6963 AtomicStubMark mark_cmpxchg_8(_masm, &aarch64_atomic_cmpxchg_8_impl); 6964 gen_cas_entry(MacroAssembler::xword, memory_order_conservative); 6965 6966 // CAS, memory_order_relaxed 6967 AtomicStubMark mark_cmpxchg_1_relaxed 6968 (_masm, &aarch64_atomic_cmpxchg_1_relaxed_impl); 6969 gen_cas_entry(MacroAssembler::byte, memory_order_relaxed); 6970 AtomicStubMark mark_cmpxchg_4_relaxed 6971 (_masm, &aarch64_atomic_cmpxchg_4_relaxed_impl); 6972 gen_cas_entry(MacroAssembler::word, memory_order_relaxed); 6973 AtomicStubMark mark_cmpxchg_8_relaxed 6974 (_masm, &aarch64_atomic_cmpxchg_8_relaxed_impl); 6975 gen_cas_entry(MacroAssembler::xword, memory_order_relaxed); 6976 6977 AtomicStubMark mark_cmpxchg_4_release 6978 (_masm, &aarch64_atomic_cmpxchg_4_release_impl); 6979 gen_cas_entry(MacroAssembler::word, memory_order_release); 6980 AtomicStubMark mark_cmpxchg_8_release 6981 (_masm, &aarch64_atomic_cmpxchg_8_release_impl); 6982 gen_cas_entry(MacroAssembler::xword, memory_order_release); 6983 6984 AtomicStubMark mark_cmpxchg_4_seq_cst 6985 (_masm, &aarch64_atomic_cmpxchg_4_seq_cst_impl); 6986 gen_cas_entry(MacroAssembler::word, memory_order_seq_cst); 6987 AtomicStubMark mark_cmpxchg_8_seq_cst 6988 (_masm, &aarch64_atomic_cmpxchg_8_seq_cst_impl); 6989 gen_cas_entry(MacroAssembler::xword, memory_order_seq_cst); 6990 
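// Publication order matters here: ICache::invalidate_range below must
// complete before the AtomicStubMark destructors run at the end of
// this scope and publish the entry points (see the AtomicStubMark
// comment above).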
6991 ICache::invalidate_range(first_entry, __ pc() - first_entry); 6992 } 6993 #endif // LINUX 6994 6995 address generate_cont_thaw(Continuation::thaw_kind kind) { 6996 bool return_barrier = Continuation::is_thaw_return_barrier(kind); 6997 bool return_barrier_exception = Continuation::is_thaw_return_barrier_exception(kind); 6998 6999 address start = __ pc(); 7000 7001 if (return_barrier) { 7002 __ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())); 7003 __ mov(sp, rscratch1); 7004 } 7005 assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp"); 7006 7007 if (return_barrier) { 7008 // preserve possible return value from a method returning to the return barrier 7009 __ fmovd(rscratch1, v0); 7010 __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize))); 7011 } 7012 7013 __ movw(c_rarg1, (return_barrier ? 1 : 0)); 7014 __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), rthread, c_rarg1); 7015 __ mov(rscratch2, r0); // r0 contains the size of the frames to thaw, 0 if overflow or no more frames 7016 7017 if (return_barrier) { 7018 // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK) 7019 __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize))); 7020 __ fmovd(v0, rscratch1); 7021 } 7022 assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp"); 7023 7024 7025 Label thaw_success; 7026 // rscratch2 contains the size of the frames to thaw, 0 if overflow or no more frames 7027 __ cbnz(rscratch2, thaw_success); 7028 __ lea(rscratch1, ExternalAddress(StubRoutines::throw_StackOverflowError_entry())); 7029 __ br(rscratch1); 7030 __ bind(thaw_success); 7031 7032 // make room for the thawed frames 7033 __ sub(rscratch1, sp, rscratch2); 7034 __ andr(rscratch1, rscratch1, -16); // align 7035 __ mov(sp, rscratch1); 7036 7037 if (return_barrier) { 7038 // save original return value -- again 7039 __ fmovd(rscratch1, v0); 7040 __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize))); 7041 } 7042 7043 // If we want, we can templatize thaw by kind, and have three different entries 7044 __ movw(c_rarg1, (uint32_t)kind); 7045 7046 __ call_VM_leaf(Continuation::thaw_entry(), rthread, c_rarg1); 7047 __ mov(rscratch2, r0); // r0 is the sp of the yielding frame 7048 7049 if (return_barrier) { 7050 // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK) 7051 __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize))); 7052 __ fmovd(v0, rscratch1); 7053 } else { 7054 __ mov(r0, zr); // return 0 (success) from doYield 7055 } 7056 7057 // we're now on the yield frame (which is in an address above us b/c rsp has been pushed down) 7058 __ sub(sp, rscratch2, 2*wordSize); // now pointing to rfp spill 7059 __ mov(rfp, sp); 7060 7061 if (return_barrier_exception) { 7062 __ ldr(c_rarg1, Address(rfp, wordSize)); // return address 7063 __ authenticate_return_address(c_rarg1); 7064 __ verify_oop(r0); 7065 // save return value containing the exception oop in callee-saved R19 7066 __ mov(r19, r0); 7067 7068 __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), rthread, c_rarg1); 7069 7070 // Reinitialize the ptrue predicate register, in case the external runtime call clobbers ptrue reg, as we may return to SVE compiled code. 
7071 // __ reinitialize_ptrue(); 7072 7073 // see OptoRuntime::generate_exception_blob: r0 -- exception oop, r3 -- exception pc 7074 7075 __ mov(r1, r0); // the exception handler 7076 __ mov(r0, r19); // restore return value containing the exception oop 7077 __ verify_oop(r0); 7078 7079 __ leave(); 7080 __ mov(r3, lr); 7081 __ br(r1); // the exception handler 7082 } else { 7083 // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame 7084 __ leave(); 7085 __ ret(lr); 7086 } 7087 7088 return start; 7089 } 7090 7091 address generate_cont_thaw() { 7092 if (!Continuations::enabled()) return nullptr; 7093 7094 StubCodeMark mark(this, "StubRoutines", "Cont thaw"); 7095 address start = __ pc(); 7096 generate_cont_thaw(Continuation::thaw_top); 7097 return start; 7098 } 7099 7100 address generate_cont_returnBarrier() { 7101 if (!Continuations::enabled()) return nullptr; 7102 7103 // TODO: will probably need multiple return barriers depending on return type 7104 StubCodeMark mark(this, "StubRoutines", "cont return barrier"); 7105 address start = __ pc(); 7106 7107 generate_cont_thaw(Continuation::thaw_return_barrier); 7108 7109 return start; 7110 } 7111 7112 address generate_cont_returnBarrier_exception() { 7113 if (!Continuations::enabled()) return nullptr; 7114 7115 StubCodeMark mark(this, "StubRoutines", "cont return barrier exception handler"); 7116 address start = __ pc(); 7117 7118 generate_cont_thaw(Continuation::thaw_return_barrier_exception); 7119 7120 return start; 7121 } 7122 7123 // In sun.security.util.math.intpoly.IntegerPolynomial1305, integers 7124 // are represented as long[5], with BITS_PER_LIMB = 26. 7125 // Pack five 26-bit limbs into three 64-bit registers. 7126 void pack_26(Register dest0, Register dest1, Register dest2, Register src) { 7127 __ ldp(dest0, rscratch1, Address(src, 0)); // 26 bits 7128 __ add(dest0, dest0, rscratch1, Assembler::LSL, 26); // 26 bits 7129 __ ldp(rscratch1, rscratch2, Address(src, 2 * sizeof (jlong))); 7130 __ add(dest0, dest0, rscratch1, Assembler::LSL, 52); // 12 bits 7131 7132 __ add(dest1, zr, rscratch1, Assembler::LSR, 12); // 14 bits 7133 __ add(dest1, dest1, rscratch2, Assembler::LSL, 14); // 26 bits 7134 __ ldr(rscratch1, Address(src, 4 * sizeof (jlong))); 7135 __ add(dest1, dest1, rscratch1, Assembler::LSL, 40); // 24 bits 7136 7137 if (dest2->is_valid()) { 7138 __ add(dest2, zr, rscratch1, Assembler::LSR, 24); // 2 bits 7139 } else { 7140 #ifdef ASSERT 7141 Label OK; 7142 __ cmp(zr, rscratch1, Assembler::LSR, 24); // 2 bits 7143 __ br(__ EQ, OK); 7144 __ stop("high bits of Poly1305 integer should be zero"); 7145 __ should_not_reach_here(); 7146 __ bind(OK); 7147 #endif 7148 } 7149 } 7150 7151 // As above, but return only a 128-bit integer, packed into two 7152 // 64-bit registers. 7153 void pack_26(Register dest0, Register dest1, Register src) { 7154 pack_26(dest0, dest1, noreg, src); 7155 } 7156 7157 // Multiply and multiply-accumulate unsigned 64-bit registers. 7158 void wide_mul(Register prod_lo, Register prod_hi, Register n, Register m) { 7159 __ mul(prod_lo, n, m); 7160 __ umulh(prod_hi, n, m); 7161 } 7162 void wide_madd(Register sum_lo, Register sum_hi, Register n, Register m) { 7163 wide_mul(rscratch1, rscratch2, n, m); 7164 __ adds(sum_lo, sum_lo, rscratch1); 7165 __ adc(sum_hi, sum_hi, rscratch2); 7166 } 7167 7168 // Poly1305, RFC 7539 7169 7170 // See https://loup-vaillant.fr/tutorials/poly1305-design for a 7171 // description of the tricks used to simplify and accelerate this 7172 // computation. 
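// In outline (RFC 7539): for each 16-byte block m, the accumulator is
// updated as
//
//   U <- ((U + m + 2^128) * R) mod (2^130 - 5)
//
// Below, U is kept in three 64-bit limbs U_2:U_1:U_0 and is only
// partially reduced between blocks; the "+ 2^128" shows up as the add
// of 1 to S_2 after each block is loaded.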
7173 7174 address generate_poly1305_processBlocks() { 7175 __ align(CodeEntryAlignment); 7176 StubCodeMark mark(this, "StubRoutines", "poly1305_processBlocks"); 7177 address start = __ pc(); 7178 Label here; 7179 __ enter(); 7180 RegSet callee_saved = RegSet::range(r19, r28); 7181 __ push(callee_saved, sp); 7182 7183 RegSetIterator<Register> regs = (RegSet::range(c_rarg0, r28) - r18_tls - rscratch1 - rscratch2).begin(); 7184 7185 // Arguments 7186 const Register input_start = *regs, length = *++regs, acc_start = *++regs, r_start = *++regs; 7187 7188 // R_n is the 128-bit randomly-generated key, packed into two 7189 // registers. The caller passes this key to us as long[5], with 7190 // BITS_PER_LIMB = 26. 7191 const Register R_0 = *++regs, R_1 = *++regs; 7192 pack_26(R_0, R_1, r_start); 7193 7194 // RR_n is (R_n >> 2) * 5 7195 const Register RR_0 = *++regs, RR_1 = *++regs; 7196 __ lsr(RR_0, R_0, 2); 7197 __ add(RR_0, RR_0, RR_0, Assembler::LSL, 2); 7198 __ lsr(RR_1, R_1, 2); 7199 __ add(RR_1, RR_1, RR_1, Assembler::LSL, 2); 7200 7201 // U_n is the current checksum 7202 const Register U_0 = *++regs, U_1 = *++regs, U_2 = *++regs; 7203 pack_26(U_0, U_1, U_2, acc_start); 7204 7205 static constexpr int BLOCK_LENGTH = 16; 7206 Label DONE, LOOP; 7207 7208 __ cmp(length, checked_cast<u1>(BLOCK_LENGTH)); 7209 __ br(Assembler::LT, DONE); { 7210 __ bind(LOOP); 7211 7212 // S_n is to be the sum of U_n and the next block of data 7213 const Register S_0 = *++regs, S_1 = *++regs, S_2 = *++regs; 7214 __ ldp(S_0, S_1, __ post(input_start, 2 * wordSize)); 7215 __ adds(S_0, U_0, S_0); 7216 __ adcs(S_1, U_1, S_1); 7217 __ adc(S_2, U_2, zr); 7218 __ add(S_2, S_2, 1); 7219 7220 const Register U_0HI = *++regs, U_1HI = *++regs; 7221 7222 // NB: this logic depends on some of the special properties of 7223 // Poly1305 keys. In particular, because we know that the top 7224 // four bits of R_0 and R_1 are zero, we can add together 7225 // partial products without any risk of needing to propagate a 7226 // carry out. 7227 wide_mul(U_0, U_0HI, S_0, R_0); wide_madd(U_0, U_0HI, S_1, RR_1); wide_madd(U_0, U_0HI, S_2, RR_0); 7228 wide_mul(U_1, U_1HI, S_0, R_1); wide_madd(U_1, U_1HI, S_1, R_0); wide_madd(U_1, U_1HI, S_2, RR_1); 7229 __ andr(U_2, R_0, 3); 7230 __ mul(U_2, S_2, U_2); 7231 7232 // Recycle registers S_0, S_1, S_2 7233 regs = (regs.remaining() + S_0 + S_1 + S_2).begin(); 7234 7235 // Partial reduction mod 2**130 - 5 7236 __ adds(U_1, U_0HI, U_1); 7237 __ adc(U_2, U_1HI, U_2); 7238 // Sum now in U_2:U_1:U_0. 7239 // Dead: U_0HI, U_1HI. 
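// The folding below is sound because 2^130 == 5 (mod 2^130 - 5):
// everything in U_2 above its low two bits can be shifted down and
// multiplied by 5 before being added back in, with the * 5 computed
// as x + (x << 2) across the two add sequences.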
7240 regs = (regs.remaining() + U_0HI + U_1HI).begin(); 7241 7242 // U_2:U_1:U_0 += (U_2 >> 2) * 5 in two steps 7243 7244 // First, U_2:U_1:U_0 += (U_2 >> 2) 7245 __ lsr(rscratch1, U_2, 2); 7246 __ andr(U_2, U_2, (u8)3); 7247 __ adds(U_0, U_0, rscratch1); 7248 __ adcs(U_1, U_1, zr); 7249 __ adc(U_2, U_2, zr); 7250 // Second, U_2:U_1:U_0 += (U_2 >> 2) << 2 7251 __ adds(U_0, U_0, rscratch1, Assembler::LSL, 2); 7252 __ adcs(U_1, U_1, zr); 7253 __ adc(U_2, U_2, zr); 7254 7255 __ sub(length, length, checked_cast<u1>(BLOCK_LENGTH)); 7256 __ cmp(length, checked_cast<u1>(BLOCK_LENGTH)); 7257 __ br(~ Assembler::LT, LOOP); 7258 } 7259 7260 // Further reduce modulo 2^130 - 5 7261 __ lsr(rscratch1, U_2, 2); 7262 __ add(rscratch1, rscratch1, rscratch1, Assembler::LSL, 2); // rscratch1 = U_2 * 5 7263 __ adds(U_0, U_0, rscratch1); // U_0 += U_2 * 5 7264 __ adcs(U_1, U_1, zr); 7265 __ andr(U_2, U_2, (u1)3); 7266 __ adc(U_2, U_2, zr); 7267 7268 // Unpack the sum into five 26-bit limbs and write to memory. 7269 __ ubfiz(rscratch1, U_0, 0, 26); 7270 __ ubfx(rscratch2, U_0, 26, 26); 7271 __ stp(rscratch1, rscratch2, Address(acc_start)); 7272 __ ubfx(rscratch1, U_0, 52, 12); 7273 __ bfi(rscratch1, U_1, 12, 14); 7274 __ ubfx(rscratch2, U_1, 14, 26); 7275 __ stp(rscratch1, rscratch2, Address(acc_start, 2 * sizeof (jlong))); 7276 __ ubfx(rscratch1, U_1, 40, 24); 7277 __ bfi(rscratch1, U_2, 24, 3); 7278 __ str(rscratch1, Address(acc_start, 4 * sizeof (jlong))); 7279 7280 __ bind(DONE); 7281 __ pop(callee_saved, sp); 7282 __ leave(); 7283 __ ret(lr); 7284 7285 return start; 7286 } 7287 7288 #if INCLUDE_JFR 7289 7290 static void jfr_prologue(address the_pc, MacroAssembler* _masm, Register thread) { 7291 __ set_last_Java_frame(sp, rfp, the_pc, rscratch1); 7292 __ mov(c_rarg0, thread); 7293 } 7294 7295 // The handle is dereferenced through a load barrier. 7296 static void jfr_epilogue(MacroAssembler* _masm) { 7297 __ reset_last_Java_frame(true); 7298 } 7299 7300 // For c2: c_rarg0 is junk, call to runtime to write a checkpoint. 7301 // It returns a jobject handle to the event writer. 7302 // The handle is dereferenced and the return value is the event writer oop. 7303 static RuntimeStub* generate_jfr_write_checkpoint() { 7304 enum layout { 7305 rbp_off, 7306 rbpH_off, 7307 return_off, 7308 return_off2, 7309 framesize // inclusive of return address 7310 }; 7311 7312 int insts_size = 1024; 7313 int locs_size = 64; 7314 CodeBuffer code("jfr_write_checkpoint", insts_size, locs_size); 7315 OopMapSet* oop_maps = new OopMapSet(); 7316 MacroAssembler* masm = new MacroAssembler(&code); 7317 MacroAssembler* _masm = masm; 7318 7319 address start = __ pc(); 7320 __ enter(); 7321 int frame_complete = __ pc() - start; 7322 address the_pc = __ pc(); 7323 jfr_prologue(the_pc, _masm, rthread); 7324 __ call_VM_leaf(CAST_FROM_FN_PTR(address, JfrIntrinsicSupport::write_checkpoint), 1); 7325 jfr_epilogue(_masm); 7326 __ resolve_global_jobject(r0, rscratch1, rscratch2); 7327 __ leave(); 7328 __ ret(lr); 7329 7330 OopMap* map = new OopMap(framesize, 1); // rfp 7331 oop_maps->add_gc_map(the_pc - start, map); 7332 7333 RuntimeStub* stub = // codeBlob framesize is in words (not VMRegImpl::slot_size) 7334 RuntimeStub::new_runtime_stub("jfr_write_checkpoint", &code, frame_complete, 7335 (framesize >> (LogBytesPerWord - LogBytesPerInt)), 7336 oop_maps, false); 7337 return stub; 7338 } 7339 7340 // For c2: call to return a leased buffer. 
7341 static RuntimeStub* generate_jfr_return_lease() { 7342 enum layout { 7343 rbp_off, 7344 rbpH_off, 7345 return_off, 7346 return_off2, 7347 framesize // inclusive of return address 7348 }; 7349 7350 int insts_size = 1024; 7351 int locs_size = 64; 7352 CodeBuffer code("jfr_return_lease", insts_size, locs_size); 7353 OopMapSet* oop_maps = new OopMapSet(); 7354 MacroAssembler* masm = new MacroAssembler(&code); 7355 MacroAssembler* _masm = masm; 7356 7357 address start = __ pc(); 7358 __ enter(); 7359 int frame_complete = __ pc() - start; 7360 address the_pc = __ pc(); 7361 jfr_prologue(the_pc, _masm, rthread); 7362 __ call_VM_leaf(CAST_FROM_FN_PTR(address, JfrIntrinsicSupport::return_lease), 1); 7363 jfr_epilogue(_masm); 7364 7365 __ leave(); 7366 __ ret(lr); 7367 7368 OopMap* map = new OopMap(framesize, 1); // rfp 7369 oop_maps->add_gc_map(the_pc - start, map); 7370 7371 RuntimeStub* stub = // codeBlob framesize is in words (not VMRegImpl::slot_size) 7372 RuntimeStub::new_runtime_stub("jfr_return_lease", &code, frame_complete, 7373 (framesize >> (LogBytesPerWord - LogBytesPerInt)), 7374 oop_maps, false); 7375 return stub; 7376 } 7377 7378 #endif // INCLUDE_JFR 7379 7380 // exception handler for upcall stubs 7381 address generate_upcall_stub_exception_handler() { 7382 StubCodeMark mark(this, "StubRoutines", "upcall stub exception handler"); 7383 address start = __ pc(); 7384 7385 // Native caller has no idea how to handle exceptions, 7386 // so we just crash here. Up to callee to catch exceptions. 7387 __ verify_oop(r0); 7388 __ movptr(rscratch1, CAST_FROM_FN_PTR(uint64_t, UpcallLinker::handle_uncaught_exception)); 7389 __ blr(rscratch1); 7390 __ should_not_reach_here(); 7391 7392 return start; 7393 } 7394 7395 // Continuation point for throwing of implicit exceptions that are 7396 // not handled in the current activation. Fabricates an exception 7397 // oop and initiates normal exception dispatching in this 7398 // frame. Since we need to preserve callee-saved values (currently 7399 // only for C2, but done for C1 as well) we need a callee-saved oop 7400 // map and therefore have to make these stubs into RuntimeStubs 7401 // rather than BufferBlobs. If the compiler needs all registers to 7402 // be preserved between the fault point and the exception handler 7403 // then it must assume responsibility for that in 7404 // AbstractCompiler::continuation_for_implicit_null_exception or 7405 // continuation_for_implicit_division_by_zero_exception. All other 7406 // implicit exceptions (e.g., NullPointerException or 7407 // AbstractMethodError on entry) are either at call sites or 7408 // otherwise assume that stack unwinding will be initiated, so 7409 // caller saved registers were assumed volatile in the compiler. 7410 7411 #undef __ 7412 #define __ masm-> 7413 7414 address generate_throw_exception(const char* name, 7415 address runtime_entry, 7416 Register arg1 = noreg, 7417 Register arg2 = noreg) { 7418 // Information about frame layout at time of blocking runtime call. 7419 // Note that we only have to preserve callee-saved registers since 7420 // the compilers are responsible for supplying a continuation point 7421 // if they expect all registers to be preserved. 7422 // n.b. 
aarch64 asserts that frame::arg_reg_save_area_bytes == 0 7423 enum layout { 7424 rfp_off = 0, 7425 rfp_off2, 7426 return_off, 7427 return_off2, 7428 framesize // inclusive of return address 7429 }; 7430 7431 int insts_size = 512; 7432 int locs_size = 64; 7433 7434 CodeBuffer code(name, insts_size, locs_size); 7435 OopMapSet* oop_maps = new OopMapSet(); 7436 MacroAssembler* masm = new MacroAssembler(&code); 7437 7438 address start = __ pc(); 7439 7440 // This is an inlined and slightly modified version of call_VM 7441 // which has the ability to fetch the return PC out of 7442 // thread-local storage and also sets up last_Java_sp slightly 7443 // differently than the real call_VM 7444 7445 __ enter(); // Save FP and LR before call 7446 7447 assert(is_even(framesize/2), "sp not 16-byte aligned"); 7448 7449 // lr and fp are already in place 7450 __ sub(sp, rfp, ((uint64_t)framesize-4) << LogBytesPerInt); // prolog 7451 7452 int frame_complete = __ pc() - start; 7453 7454 // Set up last_Java_sp and last_Java_fp 7455 address the_pc = __ pc(); 7456 __ set_last_Java_frame(sp, rfp, the_pc, rscratch1); 7457 7458 // Call runtime 7459 if (arg1 != noreg) { 7460 assert(arg2 != c_rarg1, "clobbered"); 7461 __ mov(c_rarg1, arg1); 7462 } 7463 if (arg2 != noreg) { 7464 __ mov(c_rarg2, arg2); 7465 } 7466 __ mov(c_rarg0, rthread); 7467 BLOCK_COMMENT("call runtime_entry"); 7468 __ mov(rscratch1, runtime_entry); 7469 __ blr(rscratch1); 7470 7471 // Generate oop map 7472 OopMap* map = new OopMap(framesize, 0); 7473 7474 oop_maps->add_gc_map(the_pc - start, map); 7475 7476 __ reset_last_Java_frame(true); 7477 7478 // Reinitialize the ptrue predicate register, in case the external runtime 7479 // call clobbers ptrue reg, as we may return to SVE compiled code. 7480 __ reinitialize_ptrue(); 7481 7482 __ leave(); 7483 7484 // check for pending exceptions 7485 #ifdef ASSERT 7486 Label L; 7487 __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset())); 7488 __ cbnz(rscratch1, L); 7489 __ should_not_reach_here(); 7490 __ bind(L); 7491 #endif // ASSERT 7492 __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry())); 7493 7494 // codeBlob framesize is in words (not VMRegImpl::slot_size) 7495 RuntimeStub* stub = 7496 RuntimeStub::new_runtime_stub(name, 7497 &code, 7498 frame_complete, 7499 (framesize >> (LogBytesPerWord - LogBytesPerInt)), 7500 oop_maps, false); 7501 return stub->entry_point(); 7502 } 7503 7504 class MontgomeryMultiplyGenerator : public MacroAssembler { 7505 7506 Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn, 7507 Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj; 7508 7509 RegSet _toSave; 7510 bool _squaring; 7511 7512 public: 7513 MontgomeryMultiplyGenerator (Assembler *as, bool squaring) 7514 : MacroAssembler(as->code()), _squaring(squaring) { 7515 7516 // Register allocation 7517 7518 RegSetIterator<Register> regs = (RegSet::range(r0, r26) - r18_tls).begin(); 7519 Pa_base = *regs; // Argument registers 7520 if (squaring) 7521 Pb_base = Pa_base; 7522 else 7523 Pb_base = *++regs; 7524 Pn_base = *++regs; 7525 Rlen= *++regs; 7526 inv = *++regs; 7527 Pm_base = *++regs; 7528 7529 // Working registers: 7530 Ra = *++regs; // The current digit of a, b, n, and m. 7531 Rb = *++regs; 7532 Rm = *++regs; 7533 Rn = *++regs; 7534 7535 Pa = *++regs; // Pointers to the current/next digit of a, b, n, and m. 
7536 Pb = *++regs; 7537 Pm = *++regs; 7538 Pn = *++regs; 7539 7540 t0 = *++regs; // Three registers which form a 7541 t1 = *++regs; // triple-precision accumulator. 7542 t2 = *++regs; 7543 7544 Ri = *++regs; // Inner and outer loop indexes. 7545 Rj = *++regs; 7546 7547 Rhi_ab = *++regs; // Product registers: low and high parts 7548 Rlo_ab = *++regs; // of a*b and m*n. 7549 Rhi_mn = *++regs; 7550 Rlo_mn = *++regs; 7551 7552 // r19 and up are callee-saved. 7553 _toSave = RegSet::range(r19, *regs) + Pm_base; 7554 } 7555 7556 private: 7557 void save_regs() { 7558 push(_toSave, sp); 7559 } 7560 7561 void restore_regs() { 7562 pop(_toSave, sp); 7563 } 7564 7565 template <typename T> 7566 void unroll_2(Register count, T block) { 7567 Label loop, end, odd; 7568 tbnz(count, 0, odd); 7569 cbz(count, end); 7570 align(16); 7571 bind(loop); 7572 (this->*block)(); 7573 bind(odd); 7574 (this->*block)(); 7575 subs(count, count, 2); 7576 br(Assembler::GT, loop); 7577 bind(end); 7578 } 7579 7580 template <typename T> 7581 void unroll_2(Register count, T block, Register d, Register s, Register tmp) { 7582 Label loop, end, odd; 7583 tbnz(count, 0, odd); 7584 cbz(count, end); 7585 align(16); 7586 bind(loop); 7587 (this->*block)(d, s, tmp); 7588 bind(odd); 7589 (this->*block)(d, s, tmp); 7590 subs(count, count, 2); 7591 br(Assembler::GT, loop); 7592 bind(end); 7593 } 7594 7595 void pre1(RegisterOrConstant i) { 7596 block_comment("pre1"); 7597 // Pa = Pa_base; 7598 // Pb = Pb_base + i; 7599 // Pm = Pm_base; 7600 // Pn = Pn_base + i; 7601 // Ra = *Pa; 7602 // Rb = *Pb; 7603 // Rm = *Pm; 7604 // Rn = *Pn; 7605 ldr(Ra, Address(Pa_base)); 7606 ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); 7607 ldr(Rm, Address(Pm_base)); 7608 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 7609 lea(Pa, Address(Pa_base)); 7610 lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); 7611 lea(Pm, Address(Pm_base)); 7612 lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 7613 7614 // Zero the m*n result. 7615 mov(Rhi_mn, zr); 7616 mov(Rlo_mn, zr); 7617 } 7618 7619 // The core multiply-accumulate step of a Montgomery 7620 // multiplication. The idea is to schedule operations as a 7621 // pipeline so that instructions with long latencies (loads and 7622 // multiplies) have time to complete before their results are 7623 // used. This most benefits in-order implementations of the 7624 // architecture but out-of-order ones also benefit. 7625 void step() { 7626 block_comment("step"); 7627 // MACC(Ra, Rb, t0, t1, t2); 7628 // Ra = *++Pa; 7629 // Rb = *--Pb; 7630 umulh(Rhi_ab, Ra, Rb); 7631 mul(Rlo_ab, Ra, Rb); 7632 ldr(Ra, pre(Pa, wordSize)); 7633 ldr(Rb, pre(Pb, -wordSize)); 7634 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the 7635 // previous iteration.
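// (Notation, a reading aid only: in these comments and in the C
// sketches after generate_multiply() and generate_square(),
// MACC(A, B, T0, T1, T2) multiplies two 64-bit values and adds the
// 128-bit product into the triple-precision accumulator T2:T1:T0,
// approximately
//
//   unsigned __int128 p = (unsigned __int128)A * B;  // umulh + mul
//   // then p is added into T2:T1:T0 with carry propagation, as in
//   // acc() below; MACC2 accumulates the product twice.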
7636 // MACC(Rm, Rn, t0, t1, t2); 7637 // Rm = *++Pm; 7638 // Rn = *--Pn; 7639 umulh(Rhi_mn, Rm, Rn); 7640 mul(Rlo_mn, Rm, Rn); 7641 ldr(Rm, pre(Pm, wordSize)); 7642 ldr(Rn, pre(Pn, -wordSize)); 7643 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 7644 } 7645 7646 void post1() { 7647 block_comment("post1"); 7648 7649 // MACC(Ra, Rb, t0, t1, t2); 7650 // Ra = *++Pa; 7651 // Rb = *--Pb; 7652 umulh(Rhi_ab, Ra, Rb); 7653 mul(Rlo_ab, Ra, Rb); 7654 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 7655 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 7656 7657 // *Pm = Rm = t0 * inv; 7658 mul(Rm, t0, inv); 7659 str(Rm, Address(Pm)); 7660 7661 // MACC(Rm, Rn, t0, t1, t2); 7662 // t0 = t1; t1 = t2; t2 = 0; 7663 umulh(Rhi_mn, Rm, Rn); 7664 7665 #ifndef PRODUCT 7666 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 7667 { 7668 mul(Rlo_mn, Rm, Rn); 7669 add(Rlo_mn, t0, Rlo_mn); 7670 Label ok; 7671 cbz(Rlo_mn, ok); { 7672 stop("broken Montgomery multiply"); 7673 } bind(ok); 7674 } 7675 #endif 7676 // We have very carefully set things up so that 7677 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 7678 // the lower half of Rm * Rn because we know the result already: 7679 // it must be -t0. t0 + (-t0) must generate a carry iff 7680 // t0 != 0. So, rather than do a mul and an adds we just set 7681 // the carry flag iff t0 is nonzero. 7682 // 7683 // mul(Rlo_mn, Rm, Rn); 7684 // adds(zr, t0, Rlo_mn); 7685 subs(zr, t0, 1); // Set carry iff t0 is nonzero 7686 adcs(t0, t1, Rhi_mn); 7687 adc(t1, t2, zr); 7688 mov(t2, zr); 7689 } 7690 7691 void pre2(RegisterOrConstant i, RegisterOrConstant len) { 7692 block_comment("pre2"); 7693 // Pa = Pa_base + i-len; 7694 // Pb = Pb_base + len; 7695 // Pm = Pm_base + i-len; 7696 // Pn = Pn_base + len; 7697 7698 if (i.is_register()) { 7699 sub(Rj, i.as_register(), len); 7700 } else { 7701 mov(Rj, i.as_constant()); 7702 sub(Rj, Rj, len); 7703 } 7704 // Rj == i-len 7705 7706 lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord))); 7707 lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord))); 7708 lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 7709 lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord))); 7710 7711 // Ra = *++Pa; 7712 // Rb = *--Pb; 7713 // Rm = *++Pm; 7714 // Rn = *--Pn; 7715 ldr(Ra, pre(Pa, wordSize)); 7716 ldr(Rb, pre(Pb, -wordSize)); 7717 ldr(Rm, pre(Pm, wordSize)); 7718 ldr(Rn, pre(Pn, -wordSize)); 7719 7720 mov(Rhi_mn, zr); 7721 mov(Rlo_mn, zr); 7722 } 7723 7724 void post2(RegisterOrConstant i, RegisterOrConstant len) { 7725 block_comment("post2"); 7726 if (i.is_constant()) { 7727 mov(Rj, i.as_constant()-len.as_constant()); 7728 } else { 7729 sub(Rj, i.as_register(), len); 7730 } 7731 7732 adds(t0, t0, Rlo_mn); // The pending m*n, low part 7733 7734 // As soon as we know the least significant digit of our result, 7735 // store it. 7736 // Pm_base[i-len] = t0; 7737 str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 7738 7739 // t0 = t1; t1 = t2; t2 = 0; 7740 adcs(t0, t1, Rhi_mn); // The pending m*n, high part 7741 adc(t1, t2, zr); 7742 mov(t2, zr); 7743 } 7744 7745 // A carry in t0 after Montgomery multiplication means that we 7746 // should subtract multiples of n from our result in m. We'll 7747 // keep doing that until there is no carry. 
7748 void normalize(RegisterOrConstant len) { 7749 block_comment("normalize"); 7750 // while (t0) 7751 // t0 = sub(Pm_base, Pn_base, t0, len); 7752 Label loop, post, again; 7753 Register cnt = t1, i = t2; // Re-use registers; we're done with them now 7754 cbz(t0, post); { 7755 bind(again); { 7756 mov(i, zr); 7757 mov(cnt, len); 7758 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 7759 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 7760 subs(zr, zr, zr); // set carry flag, i.e. no borrow 7761 align(16); 7762 bind(loop); { 7763 sbcs(Rm, Rm, Rn); 7764 str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 7765 add(i, i, 1); 7766 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 7767 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 7768 sub(cnt, cnt, 1); 7769 } cbnz(cnt, loop); 7770 sbc(t0, t0, zr); 7771 } cbnz(t0, again); 7772 } bind(post); 7773 } 7774 7775 // Move memory at s to d, reversing words. 7776 // Increments d to end of copied memory 7777 // Destroys tmp1, tmp2 7778 // Preserves len 7779 // Leaves s pointing to the address which was in d at start 7780 void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) { 7781 assert(tmp1->encoding() < r19->encoding(), "register corruption"); 7782 assert(tmp2->encoding() < r19->encoding(), "register corruption"); 7783 7784 lea(s, Address(s, len, Address::uxtw(LogBytesPerWord))); 7785 mov(tmp1, len); 7786 unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2); 7787 sub(s, d, len, ext::uxtw, LogBytesPerWord); 7788 } 7789 // where 7790 void reverse1(Register d, Register s, Register tmp) { 7791 ldr(tmp, pre(s, -wordSize)); 7792 ror(tmp, tmp, 32); 7793 str(tmp, post(d, wordSize)); 7794 } 7795 7796 void step_squaring() { 7797 // An extra ACC 7798 step(); 7799 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 7800 } 7801 7802 void last_squaring(RegisterOrConstant i) { 7803 Label dont; 7804 // if ((i & 1) == 0) { 7805 tbnz(i.as_register(), 0, dont); { 7806 // MACC(Ra, Rb, t0, t1, t2); 7807 // Ra = *++Pa; 7808 // Rb = *--Pb; 7809 umulh(Rhi_ab, Ra, Rb); 7810 mul(Rlo_ab, Ra, Rb); 7811 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 7812 } bind(dont); 7813 } 7814 7815 void extra_step_squaring() { 7816 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 7817 7818 // MACC(Rm, Rn, t0, t1, t2); 7819 // Rm = *++Pm; 7820 // Rn = *--Pn; 7821 umulh(Rhi_mn, Rm, Rn); 7822 mul(Rlo_mn, Rm, Rn); 7823 ldr(Rm, pre(Pm, wordSize)); 7824 ldr(Rn, pre(Pn, -wordSize)); 7825 } 7826 7827 void post1_squaring() { 7828 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 7829 7830 // *Pm = Rm = t0 * inv; 7831 mul(Rm, t0, inv); 7832 str(Rm, Address(Pm)); 7833 7834 // MACC(Rm, Rn, t0, t1, t2); 7835 // t0 = t1; t1 = t2; t2 = 0; 7836 umulh(Rhi_mn, Rm, Rn); 7837 7838 #ifndef PRODUCT 7839 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 7840 { 7841 mul(Rlo_mn, Rm, Rn); 7842 add(Rlo_mn, t0, Rlo_mn); 7843 Label ok; 7844 cbz(Rlo_mn, ok); { 7845 stop("broken Montgomery multiply"); 7846 } bind(ok); 7847 } 7848 #endif 7849 // We have very carefully set things up so that 7850 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 7851 // the lower half of Rm * Rn because we know the result already: 7852 // it must be -t0. t0 + (-t0) must generate a carry iff 7853 // t0 != 0. So, rather than do a mul and an adds we just set 7854 // the carry flag iff t0 is nonzero. 
7855 // 7856 // mul(Rlo_mn, Rm, Rn); 7857 // adds(zr, t0, Rlo_mn); 7858 subs(zr, t0, 1); // Set carry iff t0 is nonzero 7859 adcs(t0, t1, Rhi_mn); 7860 adc(t1, t2, zr); 7861 mov(t2, zr); 7862 } 7863 7864 void acc(Register Rhi, Register Rlo, 7865 Register t0, Register t1, Register t2) { 7866 adds(t0, t0, Rlo); 7867 adcs(t1, t1, Rhi); 7868 adc(t2, t2, zr); 7869 } 7870 7871 public: 7872 /** 7873 * Fast Montgomery multiplication. The derivation of the 7874 * algorithm is in A Cryptographic Library for the Motorola 7875 * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237. 7876 * 7877 * Arguments: 7878 * 7879 * Inputs for multiplication: 7880 * c_rarg0 - int array elements a 7881 * c_rarg1 - int array elements b 7882 * c_rarg2 - int array elements n (the modulus) 7883 * c_rarg3 - int length 7884 * c_rarg4 - int inv 7885 * c_rarg5 - int array elements m (the result) 7886 * 7887 * Inputs for squaring: 7888 * c_rarg0 - int array elements a 7889 * c_rarg1 - int array elements n (the modulus) 7890 * c_rarg2 - int length 7891 * c_rarg3 - int inv 7892 * c_rarg4 - int array elements m (the result) 7893 * 7894 */ 7895 address generate_multiply() { 7896 Label argh, nothing; 7897 bind(argh); 7898 stop("MontgomeryMultiply total_allocation must be <= 8192"); 7899 7900 align(CodeEntryAlignment); 7901 address entry = pc(); 7902 7903 cbzw(Rlen, nothing); 7904 7905 enter(); 7906 7907 // Make room. 7908 cmpw(Rlen, 512); 7909 br(Assembler::HI, argh); 7910 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 7911 andr(sp, Ra, -2 * wordSize); 7912 7913 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 7914 7915 { 7916 // Copy input args, reversing as we go. We use Ra as a 7917 // temporary variable. 7918 reverse(Ra, Pa_base, Rlen, t0, t1); 7919 if (!_squaring) 7920 reverse(Ra, Pb_base, Rlen, t0, t1); 7921 reverse(Ra, Pn_base, Rlen, t0, t1); 7922 } 7923 7924 // Push all call-saved registers and also Pm_base which we'll need 7925 // at the end. 
7926 save_regs(); 7927 7928 #ifndef PRODUCT 7929 // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply"); 7930 { 7931 ldr(Rn, Address(Pn_base, 0)); 7932 mul(Rlo_mn, Rn, inv); 7933 subs(zr, Rlo_mn, -1); 7934 Label ok; 7935 br(EQ, ok); { 7936 stop("broken inverse in Montgomery multiply"); 7937 } bind(ok); 7938 } 7939 #endif 7940 7941 mov(Pm_base, Ra); 7942 7943 mov(t0, zr); 7944 mov(t1, zr); 7945 mov(t2, zr); 7946 7947 block_comment("for (int i = 0; i < len; i++) {"); 7948 mov(Ri, zr); { 7949 Label loop, end; 7950 cmpw(Ri, Rlen); 7951 br(Assembler::GE, end); 7952 7953 bind(loop); 7954 pre1(Ri); 7955 7956 block_comment(" for (j = i; j; j--) {"); { 7957 movw(Rj, Ri); 7958 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 7959 } block_comment(" } // j"); 7960 7961 post1(); 7962 addw(Ri, Ri, 1); 7963 cmpw(Ri, Rlen); 7964 br(Assembler::LT, loop); 7965 bind(end); 7966 block_comment("} // i"); 7967 } 7968 7969 block_comment("for (int i = len; i < 2*len; i++) {"); 7970 mov(Ri, Rlen); { 7971 Label loop, end; 7972 cmpw(Ri, Rlen, Assembler::LSL, 1); 7973 br(Assembler::GE, end); 7974 7975 bind(loop); 7976 pre2(Ri, Rlen); 7977 7978 block_comment(" for (j = len*2-i-1; j; j--) {"); { 7979 lslw(Rj, Rlen, 1); 7980 subw(Rj, Rj, Ri); 7981 subw(Rj, Rj, 1); 7982 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 7983 } block_comment(" } // j"); 7984 7985 post2(Ri, Rlen); 7986 addw(Ri, Ri, 1); 7987 cmpw(Ri, Rlen, Assembler::LSL, 1); 7988 br(Assembler::LT, loop); 7989 bind(end); 7990 } 7991 block_comment("} // i"); 7992 7993 normalize(Rlen); 7994 7995 mov(Ra, Pm_base); // Save Pm_base in Ra 7996 restore_regs(); // Restore caller's Pm_base 7997 7998 // Copy our result into caller's Pm_base 7999 reverse(Pm_base, Ra, Rlen, t0, t1); 8000 8001 leave(); 8002 bind(nothing); 8003 ret(lr); 8004 8005 return entry; 8006 } 8007 // In C, approximately: 8008 8009 // void 8010 // montgomery_multiply(julong Pa_base[], julong Pb_base[], 8011 // julong Pn_base[], julong Pm_base[], 8012 // julong inv, int len) { 8013 // julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 8014 // julong *Pa, *Pb, *Pn, *Pm; 8015 // julong Ra, Rb, Rn, Rm; 8016 8017 // int i; 8018 8019 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply"); 8020 8021 // for (i = 0; i < len; i++) { 8022 // int j; 8023 8024 // Pa = Pa_base; 8025 // Pb = Pb_base + i; 8026 // Pm = Pm_base; 8027 // Pn = Pn_base + i; 8028 8029 // Ra = *Pa; 8030 // Rb = *Pb; 8031 // Rm = *Pm; 8032 // Rn = *Pn; 8033 8034 // int iters = i; 8035 // for (j = 0; iters--; j++) { 8036 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 8037 // MACC(Ra, Rb, t0, t1, t2); 8038 // Ra = *++Pa; 8039 // Rb = *--Pb; 8040 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 8041 // MACC(Rm, Rn, t0, t1, t2); 8042 // Rm = *++Pm; 8043 // Rn = *--Pn; 8044 // } 8045 8046 // assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be"); 8047 // MACC(Ra, Rb, t0, t1, t2); 8048 // *Pm = Rm = t0 * inv; 8049 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be"); 8050 // MACC(Rm, Rn, t0, t1, t2); 8051 8052 // assert(t0 == 0, "broken Montgomery multiply"); 8053 8054 // t0 = t1; t1 = t2; t2 = 0; 8055 // } 8056 8057 // for (i = len; i < 2*len; i++) { 8058 // int j; 8059 8060 // Pa = Pa_base + i-len; 8061 // Pb = Pb_base + len; 8062 // Pm = Pm_base + i-len; 8063 // Pn = Pn_base + len; 8064 8065 // Ra = *++Pa; 8066 // Rb = *--Pb; 8067 // Rm = *++Pm; 8068 // Rn = *--Pn; 8069 8070 // int iters = len*2-i-1; 8071 // for (j = i-len+1; iters--; j++) { 8072 // 
assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 8073 // MACC(Ra, Rb, t0, t1, t2); 8074 // Ra = *++Pa; 8075 // Rb = *--Pb; 8076 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 8077 // MACC(Rm, Rn, t0, t1, t2); 8078 // Rm = *++Pm; 8079 // Rn = *--Pn; 8080 // } 8081 8082 // Pm_base[i-len] = t0; 8083 // t0 = t1; t1 = t2; t2 = 0; 8084 // } 8085 8086 // while (t0) 8087 // t0 = sub(Pm_base, Pn_base, t0, len); 8088 // } 8089 8090 /** 8091 * Fast Montgomery squaring. This uses asymptotically 25% fewer 8092 * multiplies than Montgomery multiplication so it should be up to 8093 * 25% faster. However, its loop control is more complex and it 8094 * may actually run slower on some machines. 8095 * 8096 * Arguments: 8097 * 8098 * Inputs: 8099 * c_rarg0 - int array elements a 8100 * c_rarg1 - int array elements n (the modulus) 8101 * c_rarg2 - int length 8102 * c_rarg3 - int inv 8103 * c_rarg4 - int array elements m (the result) 8104 * 8105 */ 8106 address generate_square() { 8107 Label argh; 8108 bind(argh); 8109 stop("MontgomeryMultiply total_allocation must be <= 8192"); 8110 8111 align(CodeEntryAlignment); 8112 address entry = pc(); 8113 8114 enter(); 8115 8116 // Make room. 8117 cmpw(Rlen, 512); 8118 br(Assembler::HI, argh); 8119 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 8120 andr(sp, Ra, -2 * wordSize); 8121 8122 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 8123 8124 { 8125 // Copy input args, reversing as we go. We use Ra as a 8126 // temporary variable. 8127 reverse(Ra, Pa_base, Rlen, t0, t1); 8128 reverse(Ra, Pn_base, Rlen, t0, t1); 8129 } 8130 8131 // Push all call-saved registers and also Pm_base which we'll need 8132 // at the end. 8133 save_regs(); 8134 8135 mov(Pm_base, Ra); 8136 8137 mov(t0, zr); 8138 mov(t1, zr); 8139 mov(t2, zr); 8140 8141 block_comment("for (int i = 0; i < len; i++) {"); 8142 mov(Ri, zr); { 8143 Label loop, end; 8144 bind(loop); 8145 cmp(Ri, Rlen); 8146 br(Assembler::GE, end); 8147 8148 pre1(Ri); 8149 8150 block_comment("for (j = (i+1)/2; j; j--) {"); { 8151 add(Rj, Ri, 1); 8152 lsr(Rj, Rj, 1); 8153 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 8154 } block_comment(" } // j"); 8155 8156 last_squaring(Ri); 8157 8158 block_comment(" for (j = i/2; j; j--) {"); { 8159 lsr(Rj, Ri, 1); 8160 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 8161 } block_comment(" } // j"); 8162 8163 post1_squaring(); 8164 add(Ri, Ri, 1); 8165 cmp(Ri, Rlen); 8166 br(Assembler::LT, loop); 8167 8168 bind(end); 8169 block_comment("} // i"); 8170 } 8171 8172 block_comment("for (int i = len; i < 2*len; i++) {"); 8173 mov(Ri, Rlen); { 8174 Label loop, end; 8175 bind(loop); 8176 cmp(Ri, Rlen, Assembler::LSL, 1); 8177 br(Assembler::GE, end); 8178 8179 pre2(Ri, Rlen); 8180 8181 block_comment(" for (j = (2*len-i-1)/2; j; j--) {"); { 8182 lsl(Rj, Rlen, 1); 8183 sub(Rj, Rj, Ri); 8184 sub(Rj, Rj, 1); 8185 lsr(Rj, Rj, 1); 8186 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 8187 } block_comment(" } // j"); 8188 8189 last_squaring(Ri); 8190 8191 block_comment(" for (j = (2*len-i)/2; j; j--) {"); { 8192 lsl(Rj, Rlen, 1); 8193 sub(Rj, Rj, Ri); 8194 lsr(Rj, Rj, 1); 8195 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 8196 } block_comment(" } // j"); 8197 8198 post2(Ri, Rlen); 8199 add(Ri, Ri, 1); 8200 cmp(Ri, Rlen, Assembler::LSL, 1); 8201 8202 br(Assembler::LT, loop); 8203 bind(end); 8204 block_comment("} // i"); 8205 } 8206 8207 normalize(Rlen); 8208 8209 mov(Ra, Pm_base); // Save Pm_base in Ra 8210 
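// (Pm_base is part of _toSave, so restore_regs() below reinstates the
// caller's value; our result pointer survives the pop in Ra.)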
      lsrw(Rlen, Rlen, 1); // length in longwords = len/2

      {
        // Copy input args, reversing as we go. We use Ra as a
        // temporary variable.
        reverse(Ra, Pa_base, Rlen, t0, t1);
        reverse(Ra, Pn_base, Rlen, t0, t1);
      }

      // Push all call-saved registers and also Pm_base which we'll need
      // at the end.
      save_regs();

      mov(Pm_base, Ra);

      mov(t0, zr);
      mov(t1, zr);
      mov(t2, zr);

      block_comment("for (int i = 0; i < len; i++) {");
      mov(Ri, zr); {
        Label loop, end;
        bind(loop);
        cmp(Ri, Rlen);
        br(Assembler::GE, end);

        pre1(Ri);

        block_comment("  for (j = (i+1)/2; j; j--) {"); {
          add(Rj, Ri, 1);
          lsr(Rj, Rj, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
        } block_comment("  } // j");

        last_squaring(Ri);

        block_comment("  for (j = i/2; j; j--) {"); {
          lsr(Rj, Ri, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
        } block_comment("  } // j");

        post1_squaring();
        add(Ri, Ri, 1);
        cmp(Ri, Rlen);
        br(Assembler::LT, loop);

        bind(end);
        block_comment("} // i");
      }

      block_comment("for (int i = len; i < 2*len; i++) {");
      mov(Ri, Rlen); {
        Label loop, end;
        bind(loop);
        cmp(Ri, Rlen, Assembler::LSL, 1);
        br(Assembler::GE, end);

        pre2(Ri, Rlen);

        block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
          lsl(Rj, Rlen, 1);
          sub(Rj, Rj, Ri);
          sub(Rj, Rj, 1);
          lsr(Rj, Rj, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
        } block_comment("  } // j");

        last_squaring(Ri);

        block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
          lsl(Rj, Rlen, 1);
          sub(Rj, Rj, Ri);
          lsr(Rj, Rj, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
        } block_comment("  } // j");

        post2(Ri, Rlen);
        add(Ri, Ri, 1);
        cmp(Ri, Rlen, Assembler::LSL, 1);

        br(Assembler::LT, loop);
        bind(end);
        block_comment("} // i");
      }

      normalize(Rlen);

      mov(Ra, Pm_base);  // Save Pm_base in Ra
      restore_regs();    // Restore caller's Pm_base

      // Copy our result into caller's Pm_base
      reverse(Pm_base, Ra, Rlen, t0, t1);

      leave();
      ret(lr);

      return entry;
    }
    // In C, approximately:

    // void
    // montgomery_square(julong Pa_base[], julong Pn_base[],
    //                   julong Pm_base[], julong inv, int len) {
    //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
    //   julong *Pa, *Pb, *Pn, *Pm;
    //   julong Ra, Rb, Rn, Rm;

    //   int i;

    //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");

    //   for (i = 0; i < len; i++) {
    //     int j;

    //     Pa = Pa_base;
    //     Pb = Pa_base + i;
    //     Pm = Pm_base;
    //     Pn = Pn_base + i;

    //     Ra = *Pa;
    //     Rb = *Pb;
    //     Rm = *Pm;
    //     Rn = *Pn;

    //     int iters = (i+1)/2;
    //     for (j = 0; iters--; j++) {
    //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
    //       MACC2(Ra, Rb, t0, t1, t2);
    //       Ra = *++Pa;
    //       Rb = *--Pb;
    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
    //       MACC(Rm, Rn, t0, t1, t2);
    //       Rm = *++Pm;
    //       Rn = *--Pn;
    //     }
    //     if ((i & 1) == 0) {
    //       assert(Ra == Pa_base[j], "must be");
    //       MACC(Ra, Ra, t0, t1, t2);
    //     }
    //     iters = i/2;
    //     assert(iters == i-j, "must be");
    //     for (; iters--; j++) {
    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
    //       MACC(Rm, Rn, t0, t1, t2);
    //       Rm = *++Pm;
    //       Rn = *--Pn;
    //     }

    //     *Pm = Rm = t0 * inv;
    //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
    //     MACC(Rm, Rn, t0, t1, t2);

    //     assert(t0 == 0, "broken Montgomery multiply");

    //     t0 = t1; t1 = t2; t2 = 0;
    //   }

    //   for (i = len; i < 2*len; i++) {
    //     int start = i-len+1;
    //     int end = start + (len - start)/2;
    //     int j;

    //     Pa = Pa_base + i-len;
    //     Pb = Pa_base + len;
    //     Pm = Pm_base + i-len;
    //     Pn = Pn_base + len;

    //     Ra = *++Pa;
    //     Rb = *--Pb;
    //     Rm = *++Pm;
    //     Rn = *--Pn;

    //     int iters = (2*len-i-1)/2;
    //     assert(iters == end-start, "must be");
    //     for (j = start; iters--; j++) {
    //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
    //       MACC2(Ra, Rb, t0, t1, t2);
    //       Ra = *++Pa;
    //       Rb = *--Pb;
    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
    //       MACC(Rm, Rn, t0, t1, t2);
    //       Rm = *++Pm;
    //       Rn = *--Pn;
    //     }
    //     if ((i & 1) == 0) {
    //       assert(Ra == Pa_base[j], "must be");
    //       MACC(Ra, Ra, t0, t1, t2);
    //     }
    //     iters = (2*len-i)/2;
    //     assert(iters == len-j, "must be");
    //     for (; iters--; j++) {
    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
    //       MACC(Rm, Rn, t0, t1, t2);
    //       Rm = *++Pm;
    //       Rn = *--Pn;
    //     }
    //     Pm_base[i-len] = t0;
    //     t0 = t1; t1 = t2; t2 = 0;
    //   }

    //   while (t0)
    //     t0 = sub(Pm_base, Pn_base, t0, len);
    // }
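    // MACC2 in the squaring pseudocode is the doubling variant of MACC: it
    // accumulates 2*a*b into t0:t1:t2. Squaring exploits the symmetry
    // a[j]*a[i-j] == a[i-j]*a[j], so each off-diagonal product is computed
    // once and doubled, and only the diagonal terms (the MACC(Ra, Ra, ...)
    // calls above) are added singly. The reduction half (the Rm*Rn MACCs)
    // is unchanged, which is where the asymptotic 25% saving in multiplies
    // over the general multiply loop comes from.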
  };


  // Called from the interpreter or from compiled code, either to load
  // multiple returned values from the inline type instance being returned
  // into registers, or to store returned values into a newly allocated
  // inline type instance.
  address generate_return_value_stub(address destination, const char* name, bool has_res) {
    // We need to save all registers that the calling convention may use,
    // so that the runtime call can read or update those registers. This
    // needs to be in sync with SharedRuntime::java_return_convention().
    // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
    enum layout {
      j_rarg7_off = 0, j_rarg7_2,    // j_rarg7 is r0
      j_rarg6_off, j_rarg6_2,
      j_rarg5_off, j_rarg5_2,
      j_rarg4_off, j_rarg4_2,
      j_rarg3_off, j_rarg3_2,
      j_rarg2_off, j_rarg2_2,
      j_rarg1_off, j_rarg1_2,
      j_rarg0_off, j_rarg0_2,

      j_farg7_off, j_farg7_2,
      j_farg6_off, j_farg6_2,
      j_farg5_off, j_farg5_2,
      j_farg4_off, j_farg4_2,
      j_farg3_off, j_farg3_2,
      j_farg2_off, j_farg2_2,
      j_farg1_off, j_farg1_2,
      j_farg0_off, j_farg0_2,

      rfp_off, rfp_off2,
      return_off, return_off2,

      framesize // inclusive of return address
    };

    CodeBuffer code(name, 512, 64);
    MacroAssembler* masm = new MacroAssembler(&code);

    int frame_size_in_bytes = align_up(framesize*BytesPerInt, 16);
    assert(frame_size_in_bytes == framesize*BytesPerInt, "misaligned");
    int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
    int frame_size_in_words = frame_size_in_bytes / wordSize;

    OopMapSet* oop_maps = new OopMapSet();
    OopMap* map = new OopMap(frame_size_in_slots, 0);

    map->set_callee_saved(VMRegImpl::stack2reg(j_rarg7_off), j_rarg7->as_VMReg());
    map->set_callee_saved(VMRegImpl::stack2reg(j_rarg6_off), j_rarg6->as_VMReg());
    map->set_callee_saved(VMRegImpl::stack2reg(j_rarg5_off), j_rarg5->as_VMReg());
    map->set_callee_saved(VMRegImpl::stack2reg(j_rarg4_off), j_rarg4->as_VMReg());
    map->set_callee_saved(VMRegImpl::stack2reg(j_rarg3_off), j_rarg3->as_VMReg());
    map->set_callee_saved(VMRegImpl::stack2reg(j_rarg2_off), j_rarg2->as_VMReg());
    map->set_callee_saved(VMRegImpl::stack2reg(j_rarg1_off), j_rarg1->as_VMReg());
    map->set_callee_saved(VMRegImpl::stack2reg(j_rarg0_off), j_rarg0->as_VMReg());

    map->set_callee_saved(VMRegImpl::stack2reg(j_farg0_off), j_farg0->as_VMReg());
    map->set_callee_saved(VMRegImpl::stack2reg(j_farg1_off), j_farg1->as_VMReg());
    map->set_callee_saved(VMRegImpl::stack2reg(j_farg2_off), j_farg2->as_VMReg());
    map->set_callee_saved(VMRegImpl::stack2reg(j_farg3_off), j_farg3->as_VMReg());
    map->set_callee_saved(VMRegImpl::stack2reg(j_farg4_off), j_farg4->as_VMReg());
    map->set_callee_saved(VMRegImpl::stack2reg(j_farg5_off), j_farg5->as_VMReg());
    map->set_callee_saved(VMRegImpl::stack2reg(j_farg6_off), j_farg6->as_VMReg());
    map->set_callee_saved(VMRegImpl::stack2reg(j_farg7_off), j_farg7->as_VMReg());

    address start = __ pc();

    __ enter(); // Save FP and LR before call

    __ stpd(j_farg1, j_farg0, Address(__ pre(sp, -2 * wordSize)));
    __ stpd(j_farg3, j_farg2, Address(__ pre(sp, -2 * wordSize)));
    __ stpd(j_farg5, j_farg4, Address(__ pre(sp, -2 * wordSize)));
    __ stpd(j_farg7, j_farg6, Address(__ pre(sp, -2 * wordSize)));

    __ stp(j_rarg1, j_rarg0, Address(__ pre(sp, -2 * wordSize)));
    __ stp(j_rarg3, j_rarg2, Address(__ pre(sp, -2 * wordSize)));
    __ stp(j_rarg5, j_rarg4, Address(__ pre(sp, -2 * wordSize)));
    __ stp(j_rarg7, j_rarg6, Address(__ pre(sp, -2 * wordSize)));

    int frame_complete = __ offset();
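    // At this point every Java return register is on the stack in the order
    // given by the layout enum above. frame_complete marks that offset so
    // that RuntimeStub::new_runtime_stub() (below) knows where the frame
    // becomes walkable, and the oop map registered at the call site tells
    // the GC where to find (and update) any oops held in the saved
    // registers.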
    // Set up last_Java_sp and last_Java_fp
    address the_pc = __ pc();
    __ set_last_Java_frame(sp, noreg, the_pc, rscratch1);

    // Call runtime
    __ mov(c_rarg1, r0);
    __ mov(c_rarg0, rthread);

    __ mov(rscratch1, destination);
    __ blr(rscratch1);

    oop_maps->add_gc_map(the_pc - start, map);

    __ reset_last_Java_frame(false);

    __ ldp(j_rarg7, j_rarg6, Address(__ post(sp, 2 * wordSize)));
    __ ldp(j_rarg5, j_rarg4, Address(__ post(sp, 2 * wordSize)));
    __ ldp(j_rarg3, j_rarg2, Address(__ post(sp, 2 * wordSize)));
    __ ldp(j_rarg1, j_rarg0, Address(__ post(sp, 2 * wordSize)));

    __ ldpd(j_farg7, j_farg6, Address(__ post(sp, 2 * wordSize)));
    __ ldpd(j_farg5, j_farg4, Address(__ post(sp, 2 * wordSize)));
    __ ldpd(j_farg3, j_farg2, Address(__ post(sp, 2 * wordSize)));
    __ ldpd(j_farg1, j_farg0, Address(__ post(sp, 2 * wordSize)));

    __ leave();

    // Check for pending exceptions.
    Label pending;
    __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
    __ cbnz(rscratch1, pending);

    if (has_res) {
      __ get_vm_result(r0, rthread);
    }

    __ ret(lr);

    __ bind(pending);
    __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));

    // -------------
    // Make sure all code is generated.
    masm->flush();

    RuntimeStub* stub = RuntimeStub::new_runtime_stub(name, &code, frame_complete, frame_size_in_words, oop_maps, false);
    return stub->entry_point();
  }

  // Initialization
  void generate_initial_stubs() {
    // Generate the initial stubs and initialize the entry points.

    // Entry points that exist on all platforms. Note: this is code
    // that could be shared among different platforms - however the
    // benefit seems to be smaller than the disadvantage of having a
    // much more complicated generator structure. See also the comment
    // in stubRoutines.hpp.

    StubRoutines::_forward_exception_entry = generate_forward_exception();

    StubRoutines::_call_stub_entry =
      generate_call_stub(StubRoutines::_call_stub_return_address);

    // Referenced by megamorphic calls.
    StubRoutines::_catch_exception_entry = generate_catch_exception();

    // Build this early so it's available for the interpreter.
    StubRoutines::_throw_StackOverflowError_entry =
      generate_throw_exception("StackOverflowError throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::throw_StackOverflowError));
    StubRoutines::_throw_delayed_StackOverflowError_entry =
      generate_throw_exception("delayed StackOverflowError throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::throw_delayed_StackOverflowError));

    // Initialize the table for the copy memory (arraycopy) check.
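    // (The table records the PC ranges of the unsafe copy stubs so that a
    // fault taken inside one of them can be recognized and unwound to a
    // safe continuation point instead of crashing the VM.)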
    if (UnsafeCopyMemory::_table == nullptr) {
      UnsafeCopyMemory::create_table(8);
    }

    if (UseCRC32Intrinsics) {
      // Set the table address before generating the stubs that use it.
      StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
      StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
    }

    if (UseCRC32CIntrinsics) {
      StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
    }

    if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
      StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
    }

    if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
      StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
    }

    if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_float16ToFloat) &&
        vmIntrinsics::is_intrinsic_available(vmIntrinsics::_floatToFloat16)) {
      StubRoutines::_hf2f = generate_float16ToFloat();
      StubRoutines::_f2hf = generate_floatToFloat16();
    }

    if (InlineTypeReturnedAsFields) {
      StubRoutines::_load_inline_type_fields_in_regs =
        generate_return_value_stub(CAST_FROM_FN_PTR(address, SharedRuntime::load_inline_type_fields_in_regs),
                                   "load_inline_type_fields_in_regs", false);
      StubRoutines::_store_inline_type_fields_to_buf =
        generate_return_value_stub(CAST_FROM_FN_PTR(address, SharedRuntime::store_inline_type_fields_to_buf),
                                   "store_inline_type_fields_to_buf", true);
    }
  }

  void generate_continuation_stubs() {
    // Continuation stubs:
    StubRoutines::_cont_thaw             = generate_cont_thaw();
    StubRoutines::_cont_returnBarrier    = generate_cont_returnBarrier();
    StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception();

    JFR_ONLY(generate_jfr_stubs();)
  }

#if INCLUDE_JFR
  void generate_jfr_stubs() {
    StubRoutines::_jfr_write_checkpoint_stub = generate_jfr_write_checkpoint();
    StubRoutines::_jfr_write_checkpoint = StubRoutines::_jfr_write_checkpoint_stub->entry_point();
    StubRoutines::_jfr_return_lease_stub = generate_jfr_return_lease();
    StubRoutines::_jfr_return_lease = StubRoutines::_jfr_return_lease_stub->entry_point();
  }
#endif // INCLUDE_JFR

  void generate_final_stubs() {
    // Support for verify_oop (must happen after universe_init).
    if (VerifyOops) {
      StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
    }

    StubRoutines::_throw_AbstractMethodError_entry =
      generate_throw_exception("AbstractMethodError throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::throw_AbstractMethodError));

    StubRoutines::_throw_IncompatibleClassChangeError_entry =
      generate_throw_exception("IncompatibleClassChangeError throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::throw_IncompatibleClassChangeError));

    StubRoutines::_throw_NullPointerException_at_call_entry =
      generate_throw_exception("NullPointerException at call throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::throw_NullPointerException_at_call));

    // Arraycopy stubs used by compilers.
    generate_arraycopy_stubs();

    BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
    if (bs_nm != nullptr) {
      StubRoutines::_method_entry_barrier = generate_method_entry_barrier();
    }

    StubRoutines::aarch64::_spin_wait = generate_spin_wait();
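    // generate_spin_wait() backs Thread.onSpinWait(); on aarch64 the emitted
    // instruction sequence (e.g. nop, isb, or yield, or nothing at all) is
    // selected by the OnSpinWaitInst and OnSpinWaitInstCount options.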
    if (UsePoly1305Intrinsics) {
      StubRoutines::_poly1305_processBlocks = generate_poly1305_processBlocks();
    }

#if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)

    generate_atomic_entry_points();

#endif // LINUX

    StubRoutines::_upcall_stub_exception_handler = generate_upcall_stub_exception_handler();

    StubRoutines::aarch64::set_completed(); // Indicate that arraycopy and zero_blocks stubs are generated
  }

  void generate_compiler_stubs() {
#if COMPILER2_OR_JVMCI

    if (UseSVE == 0) {
      StubRoutines::aarch64::_vector_iota_indices = generate_iota_indices("iota_indices");
    }

    // Array equals stub for large arrays.
    if (!UseSimpleArrayEquals) {
      StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
    }

    // byte_array_inflate stub for large arrays.
    StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();

    // countPositives stub for large arrays.
    StubRoutines::aarch64::_count_positives = generate_count_positives(StubRoutines::aarch64::_count_positives_long);

    generate_compare_long_strings();

    generate_string_indexof_stubs();

#ifdef COMPILER2
    if (UseMultiplyToLenIntrinsic) {
      StubRoutines::_multiplyToLen = generate_multiplyToLen();
    }

    if (UseSquareToLenIntrinsic) {
      StubRoutines::_squareToLen = generate_squareToLen();
    }

    if (UseMulAddIntrinsic) {
      StubRoutines::_mulAdd = generate_mulAdd();
    }

    if (UseSIMDForBigIntegerShiftIntrinsics) {
      StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
      StubRoutines::_bigIntegerLeftShiftWorker  = generate_bigIntegerLeftShift();
    }

    if (UseMontgomeryMultiplyIntrinsic) {
      StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
      MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
      StubRoutines::_montgomeryMultiply = g.generate_multiply();
    }

    if (UseMontgomerySquareIntrinsic) {
      StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
      MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
      // We use generate_multiply() rather than generate_square()
      // because it's faster for the sizes of modulus we care about.
      StubRoutines::_montgomerySquare = g.generate_multiply();
    }
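    // (These stub entries are assumed to back the C2 intrinsics for the
    // package-private BigInteger.montgomeryMultiply/montgomerySquare
    // methods; the intrinsic expansion lives in library_call.cpp.)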
#endif // COMPILER2

    if (UseChaCha20Intrinsics) {
      StubRoutines::_chacha20Block = generate_chacha20Block_blockpar();
    }

    if (UseBASE64Intrinsics) {
      StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
      StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
    }

    // Data cache line writeback.
    StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
    StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();

    if (UseAESIntrinsics) {
      StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
      StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
      StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
      StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
      StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt();
    }
    if (UseGHASHIntrinsics) {
      // StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
      StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks_wide();
    }
    if (UseAESIntrinsics && UseGHASHIntrinsics) {
      StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt();
    }

    if (UseMD5Intrinsics) {
      StubRoutines::_md5_implCompress   = generate_md5_implCompress(false, "md5_implCompress");
      StubRoutines::_md5_implCompressMB = generate_md5_implCompress(true,  "md5_implCompressMB");
    }
    if (UseSHA1Intrinsics) {
      StubRoutines::_sha1_implCompress   = generate_sha1_implCompress(false, "sha1_implCompress");
      StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(true,  "sha1_implCompressMB");
    }
    if (UseSHA256Intrinsics) {
      StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
      StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
    }
    if (UseSHA512Intrinsics) {
      StubRoutines::_sha512_implCompress   = generate_sha512_implCompress(false, "sha512_implCompress");
      StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true,  "sha512_implCompressMB");
    }
    if (UseSHA3Intrinsics) {
      StubRoutines::_sha3_implCompress   = generate_sha3_implCompress(false, "sha3_implCompress");
      StubRoutines::_sha3_implCompressMB = generate_sha3_implCompress(true,  "sha3_implCompressMB");
    }

    // Generate Adler32 intrinsics code.
    if (UseAdler32Intrinsics) {
      StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
    }
#endif // COMPILER2_OR_JVMCI
  }

 public:
  StubGenerator(CodeBuffer* code, StubsKind kind) : StubCodeGenerator(code) {
    switch(kind) {
    case Initial_stubs:
      generate_initial_stubs();
      break;
    case Continuation_stubs:
      generate_continuation_stubs();
      break;
    case Compiler_stubs:
      generate_compiler_stubs();
      break;
    case Final_stubs:
      generate_final_stubs();
      break;
    default:
      fatal("unexpected stubs kind: %d", kind);
      break;
    };
  }
}; // end class declaration

void StubGenerator_generate(CodeBuffer* code, StubCodeGenerator::StubsKind kind) {
  StubGenerator g(code, kind);
}


#if defined (LINUX)

// Define pointers to the atomic stubs and initialize them to point to the
// code in atomic_aarch64.S.

#define DEFAULT_ATOMIC_OP(OPNAME, SIZE, RELAXED)                                          \
  extern "C" uint64_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl \
    (volatile void *ptr, uint64_t arg1, uint64_t arg2);                                  \
  aarch64_atomic_stub_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _impl       \
    = aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl;
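// For example, DEFAULT_ATOMIC_OP(fetch_add, 4, ) expands (roughly) to:
//
//   extern "C" uint64_t aarch64_atomic_fetch_add_4_default_impl
//     (volatile void *ptr, uint64_t arg1, uint64_t arg2);
//   aarch64_atomic_stub_t aarch64_atomic_fetch_add_4_impl
//     = aarch64_atomic_fetch_add_4_default_impl;
//
// i.e. each function pointer starts out at the hand-written assembly
// default and may later be repointed at a generated stub by
// generate_atomic_entry_points() above.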

DEFAULT_ATOMIC_OP(fetch_add, 4, )
DEFAULT_ATOMIC_OP(fetch_add, 8, )
DEFAULT_ATOMIC_OP(fetch_add, 4, _relaxed)
DEFAULT_ATOMIC_OP(fetch_add, 8, _relaxed)
DEFAULT_ATOMIC_OP(xchg, 4, )
DEFAULT_ATOMIC_OP(xchg, 8, )
DEFAULT_ATOMIC_OP(cmpxchg, 1, )
DEFAULT_ATOMIC_OP(cmpxchg, 4, )
DEFAULT_ATOMIC_OP(cmpxchg, 8, )
DEFAULT_ATOMIC_OP(cmpxchg, 1, _relaxed)
DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed)
DEFAULT_ATOMIC_OP(cmpxchg, 8, _relaxed)
DEFAULT_ATOMIC_OP(cmpxchg, 4, _release)
DEFAULT_ATOMIC_OP(cmpxchg, 8, _release)
DEFAULT_ATOMIC_OP(cmpxchg, 4, _seq_cst)
DEFAULT_ATOMIC_OP(cmpxchg, 8, _seq_cst)

#undef DEFAULT_ATOMIC_OP

#endif // LINUX