/*
 * Copyright (c) 2003, 2024, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2014, 2024, Red Hat Inc. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/macroAssembler.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "asm/register.hpp"
#include "atomic_aarch64.hpp"
#include "compiler/oopMap.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "gc/shared/gc_globals.hpp"
#include "gc/shared/tlab_globals.hpp"
#include "interpreter/interpreter.hpp"
#include "memory/universe.hpp"
#include "nativeInst_aarch64.hpp"
#include "oops/instanceOop.hpp"
#include "oops/method.hpp"
#include "oops/objArrayKlass.hpp"
#include "oops/oop.inline.hpp"
#include "prims/methodHandles.hpp"
#include "prims/upcallLinker.hpp"
#include "runtime/atomic.hpp"
#include "runtime/continuation.hpp"
#include "runtime/continuationEntry.inline.hpp"
#include "runtime/frame.inline.hpp"
#include "runtime/handles.inline.hpp"
#include "runtime/javaThread.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubCodeGenerator.hpp"
#include "runtime/stubRoutines.hpp"
#include "utilities/align.hpp"
#include "utilities/checkedCast.hpp"
#include "utilities/globalDefinitions.hpp"
#include "utilities/powerOfTwo.hpp"
#ifdef COMPILER2
#include "opto/runtime.hpp"
#endif
#if INCLUDE_ZGC
#include "gc/z/zThreadLocalData.hpp"
#endif

// Declaration and definition of StubGenerator (no .hpp file).
// For a more detailed description of the stub routine structure
// see the comment in stubRoutines.hpp

#undef __
#define __ _masm->

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) __ block_comment(str)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

// Stub Code definitions

class StubGenerator: public StubCodeGenerator {
 private:

#ifdef PRODUCT
#define inc_counter_np(counter) ((void)0)
#else
  void inc_counter_np_(uint& counter) {
    __ incrementw(ExternalAddress((address)&counter));
  }
#define inc_counter_np(counter) \
  BLOCK_COMMENT("inc_counter " #counter); \
  inc_counter_np_(counter);
#endif

  // Call stubs are used to call Java from C
  //
  // Arguments:
  //    c_rarg0:   call wrapper address                   address
  //    c_rarg1:   result                                 address
  //    c_rarg2:   result type                            BasicType
  //    c_rarg3:   method                                 Method*
  //    c_rarg4:   (interpreter) entry point              address
  //    c_rarg5:   parameters                             intptr_t*
  //    c_rarg6:   parameter size (in words)              int
  //    c_rarg7:   thread                                 Thread*
  //
  // There is no return from the stub itself as any Java result
  // is written to result
  //
  // we save r30 (lr) as the return PC at the base of the frame and
  // link r29 (fp) below it as the frame pointer installing sp (r31)
  // into fp.
  //
  // we save r0-r7, which accounts for all the c arguments.
  //
  // TODO: strictly do we need to save them all? they are treated as
  // volatile by C so could we omit saving the ones we are going to
  // place in global registers (thread? method?) or those we only use
  // during setup of the Java call?
  //
  // we don't need to save r8 which C uses as an indirect result location
  // return register.
  //
  // we don't need to save r9-r15 which both C and Java treat as
  // volatile
  //
  // we don't need to save r16-r18 because Java does not use them
  //
  // we save r19-r28 which Java uses as scratch registers and C
  // expects to be callee-save
  //
  // we save the bottom 64 bits of each value stored in v8-v15; it is
  // the responsibility of the caller to preserve larger values.
  //
  // so the stub frame looks like this when we enter Java code
  //
  //     [ return_from_Java     ] <--- sp
  //     [ argument word n      ]
  //      ...
  // -29 [ argument word 1      ]
  // -28 [ saved Floating-point Control Register ]
  // -26 [ saved v15            ] <--- sp_after_call
  // -25 [ saved v14            ]
  // -24 [ saved v13            ]
  // -23 [ saved v12            ]
  // -22 [ saved v11            ]
  // -21 [ saved v10            ]
  // -20 [ saved v9             ]
  // -19 [ saved v8             ]
  // -18 [ saved r28            ]
  // -17 [ saved r27            ]
  // -16 [ saved r26            ]
  // -15 [ saved r25            ]
  // -14 [ saved r24            ]
  // -13 [ saved r23            ]
  // -12 [ saved r22            ]
  // -11 [ saved r21            ]
  // -10 [ saved r20            ]
  //  -9 [ saved r19            ]
  //  -8 [ call wrapper   (r0)  ]
  //  -7 [ result         (r1)  ]
  //  -6 [ result type    (r2)  ]
  //  -5 [ method         (r3)  ]
  //  -4 [ entry point    (r4)  ]
  //  -3 [ parameters     (r5)  ]
  //  -2 [ parameter size (r6)  ]
  //  -1 [ thread         (r7)  ]
  //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
  //   1 [ saved lr       (r30) ]

  // Call stub stack layout word offsets from fp
  enum call_stub_layout {
    sp_after_call_off  = -28,

    fpcr_off           = sp_after_call_off,
    d15_off            = -26,
    d13_off            = -24,
    d11_off            = -22,
    d9_off             = -20,

    r28_off            = -18,
    r26_off            = -16,
    r24_off            = -14,
    r22_off            = -12,
    r20_off            = -10,
    call_wrapper_off   =  -8,
    result_off         =  -7,
    result_type_off    =  -6,
    method_off         =  -5,
    entry_point_off    =  -4,
    parameter_size_off =  -2,
    thread_off         =  -1,
    fp_f               =   0,
    retaddr_off        =   1,
  };

  address generate_call_stub(address& return_address) {
    assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
           (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
           "adjust this code");

    StubCodeMark mark(this, "StubRoutines", "call_stub");
    address start = __ pc();

    const Address sp_after_call (rfp, sp_after_call_off * wordSize);

    const Address fpcr_save     (rfp, fpcr_off           * wordSize);
    const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
    const Address result        (rfp, result_off         * wordSize);
    const Address result_type   (rfp, result_type_off    * wordSize);
    const Address method        (rfp, method_off         * wordSize);
    const Address entry_point   (rfp, entry_point_off    * wordSize);
    const Address parameter_size(rfp, parameter_size_off * wordSize);

    const Address thread        (rfp, thread_off         * wordSize);

    const Address d15_save      (rfp, d15_off * wordSize);
    const Address d13_save      (rfp, d13_off * wordSize);
    const Address d11_save      (rfp, d11_off * wordSize);
    const Address d9_save       (rfp, d9_off  * wordSize);

    const Address r28_save      (rfp, r28_off * wordSize);
    const Address r26_save      (rfp, r26_off * wordSize);
    const Address r24_save      (rfp, r24_off * wordSize);
    const Address r22_save      (rfp, r22_off * wordSize);
    const Address r20_save      (rfp, r20_off * wordSize);

    // stub code

    address aarch64_entry = __ pc();

    // set up frame and move sp to end of save area
    __ enter();
    __ sub(sp, rfp, -sp_after_call_off * wordSize);

    // save register parameters and Java scratch/global registers
    // n.b. we save thread even though it gets installed in
    // rthread because we want to sanity check rthread later
    __ str(c_rarg7,  thread);
    __ strw(c_rarg6, parameter_size);
    __ stp(c_rarg4, c_rarg5,  entry_point);
    __ stp(c_rarg2, c_rarg3,  result_type);
    __ stp(c_rarg0, c_rarg1,  call_wrapper);

    __ stp(r20, r19,   r20_save);
    __ stp(r22, r21,   r22_save);
    __ stp(r24, r23,   r24_save);
    __ stp(r26, r25,   r26_save);
    __ stp(r28, r27,   r28_save);

    __ stpd(v9,  v8,   d9_save);
    __ stpd(v11, v10,  d11_save);
    __ stpd(v13, v12,  d13_save);
    __ stpd(v15, v14,  d15_save);

    __ get_fpcr(rscratch1);
    __ str(rscratch1, fpcr_save);
    // Set FPCR to the state we need. We do want Round to Nearest. We
    // don't want non-IEEE rounding modes or floating-point traps.
    __ bfi(rscratch1, zr, 22, 4); // Clear DN, FZ, and Rmode
    __ bfi(rscratch1, zr, 8, 5);  // Clear exception-control bits (8-12)
    __ set_fpcr(rscratch1);

    // install Java thread in global register now we have saved
    // whatever value it held
    __ mov(rthread, c_rarg7);
    // And method
    __ mov(rmethod, c_rarg3);

    // set up the heapbase register
    __ reinit_heapbase();

#ifdef ASSERT
    // make sure we have no pending exceptions
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
      __ cmp(rscratch1, (u1)NULL_WORD);
      __ br(Assembler::EQ, L);
      __ stop("StubRoutines::call_stub: entered with pending exception");
      __ BIND(L);
    }
#endif
    // pass parameters if any
    __ mov(esp, sp);
    __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
    __ andr(sp, rscratch1, -2 * wordSize);

    BLOCK_COMMENT("pass parameters if any");
    Label parameters_done;
    // parameter count is still in c_rarg6
    // and parameter pointer identifying param 1 is in c_rarg5
    __ cbzw(c_rarg6, parameters_done);

    address loop = __ pc();
    __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
    __ subsw(c_rarg6, c_rarg6, 1);
    __ push(rscratch1);
    __ br(Assembler::GT, loop);

    __ BIND(parameters_done);

    // call Java entry -- passing methodOop, and current sp
    //      rmethod: Method*
    //      r19_sender_sp: sender sp
    BLOCK_COMMENT("call Java function");
    __ mov(r19_sender_sp, sp);
    __ blr(c_rarg4);

    // we do this here because the notify will already have been done
    // if we get to the next instruction via an exception
    //
    // n.b. adding this instruction here affects the calculation of
    // whether or not a routine returns to the call stub (used when
    // doing stack walks) since the normal test is to check the return
    // pc against the address saved below. so we may need to allow for
    // this extra instruction in the check.

    // save current address for use by exception handling code

    return_address = __ pc();

    // store result depending on type (everything that is not
    // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
    // n.b. this assumes Java returns an integral result in r0
    // and a floating result in j_farg0
    // All of j_rargN may be used to return inline type fields so be careful
    // not to clobber those.
    // SharedRuntime::generate_buffered_inline_type_adapter() knows the register
    // assignment of Rresult below.
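    // n.b. r14 and r15 are outside the j_rarg*/j_farg* sets, so they can
    // carry the result address and type across the dispatch below without
    // clobbering a scalarized inline type return.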
    Register Rresult = r14, Rresult_type = r15;
    __ ldr(Rresult, result);
    Label is_long, is_float, is_double, check_prim, exit;
    __ ldr(Rresult_type, result_type);
    __ cmp(Rresult_type, (u1)T_OBJECT);
    __ br(Assembler::EQ, check_prim);
    __ cmp(Rresult_type, (u1)T_LONG);
    __ br(Assembler::EQ, is_long);
    __ cmp(Rresult_type, (u1)T_FLOAT);
    __ br(Assembler::EQ, is_float);
    __ cmp(Rresult_type, (u1)T_DOUBLE);
    __ br(Assembler::EQ, is_double);

    // handle T_INT case
    __ strw(r0, Address(Rresult));

    __ BIND(exit);

    // pop parameters
    __ sub(esp, rfp, -sp_after_call_off * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ldr(rscratch1, thread);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::NE, S);
      __ get_thread(rscratch1);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::EQ, L);
      __ BIND(S);
      __ stop("StubRoutines::call_stub: threads must correspond");
      __ BIND(L);
    }
#endif

    __ pop_cont_fastpath(rthread);

    // restore callee-save registers
    __ ldpd(v15, v14,   d15_save);
    __ ldpd(v13, v12,   d13_save);
    __ ldpd(v11, v10,   d11_save);
    __ ldpd(v9,  v8,    d9_save);

    __ ldp(r28, r27,   r28_save);
    __ ldp(r26, r25,   r26_save);
    __ ldp(r24, r23,   r24_save);
    __ ldp(r22, r21,   r22_save);
    __ ldp(r20, r19,   r20_save);

    // restore fpcr
    __ ldr(rscratch1,  fpcr_save);
    __ set_fpcr(rscratch1);

    __ ldp(c_rarg0, c_rarg1,  call_wrapper);
    __ ldrw(c_rarg2, result_type);
    __ ldr(c_rarg3,  method);
    __ ldp(c_rarg4, c_rarg5,  entry_point);
    __ ldp(c_rarg6, c_rarg7,  parameter_size);

    // leave frame and return to caller
    __ leave();
    __ ret(lr);

    // handle return types different from T_INT
    __ BIND(check_prim);
    if (InlineTypeReturnedAsFields) {
      // Check for scalarized return value
      __ tbz(r0, 0, is_long);
      // Load pack handler address
      __ andr(rscratch1, r0, -2);
      __ ldr(rscratch1, Address(rscratch1, InstanceKlass::adr_inlineklass_fixed_block_offset()));
      __ ldr(rscratch1, Address(rscratch1, InlineKlass::pack_handler_jobject_offset()));
      __ blr(rscratch1);
      __ b(exit);
    }

    __ BIND(is_long);
    __ str(r0, Address(Rresult, 0));
    __ br(Assembler::AL, exit);

    __ BIND(is_float);
    __ strs(j_farg0, Address(Rresult, 0));
    __ br(Assembler::AL, exit);

    __ BIND(is_double);
    __ strd(j_farg0, Address(Rresult, 0));
    __ br(Assembler::AL, exit);

    return start;
  }

  // Return point for a Java call if there's an exception thrown in
  // Java code.  The exception is caught and transformed into a
  // pending exception stored in JavaThread that can be tested from
  // within the VM.
  //
  // Note: Usually the parameters are removed by the callee. In case
  // of an exception crossing an activation frame boundary, that is
  // not the case if the callee is compiled code => need to setup the
  // rsp.
  //
  // r0: exception oop

  address generate_catch_exception() {
    StubCodeMark mark(this, "StubRoutines", "catch_exception");
    address start = __ pc();

    // same as in generate_call_stub():
    const Address sp_after_call(rfp, sp_after_call_off * wordSize);
    const Address thread       (rfp, thread_off        * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ldr(rscratch1, thread);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::NE, S);
      __ get_thread(rscratch1);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::EQ, L);
      __ bind(S);
      __ stop("StubRoutines::catch_exception: threads must correspond");
      __ bind(L);
    }
#endif

    // set pending exception
    __ verify_oop(r0);

    __ str(r0, Address(rthread, Thread::pending_exception_offset()));
    __ mov(rscratch1, (address)__FILE__);
    __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
    __ movw(rscratch1, (int)__LINE__);
    __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));

    // complete return to VM
    assert(StubRoutines::_call_stub_return_address != nullptr,
           "_call_stub_return_address must have been generated before");
    __ b(StubRoutines::_call_stub_return_address);

    return start;
  }

  // Continuation point for runtime calls returning with a pending
  // exception.  The pending exception check happened in the runtime
  // or native call stub.  The pending exception in Thread is
  // converted into a Java-level exception.
  //
  // Contract with Java-level exception handlers:
  // r0: exception
  // r3: throwing pc
  //
  // NOTE: At entry of this stub, exception-pc must be in LR !!

  // NOTE: this is always used as a jump target within generated code
  // so it just needs to be generated code with no prolog

  address generate_forward_exception() {
    StubCodeMark mark(this, "StubRoutines", "forward exception");
    address start = __ pc();

    // Upon entry, LR points to the return address returning into
    // Java (interpreted or compiled) code; i.e., the return address
    // becomes the throwing pc.
    //
    // Arguments pushed before the runtime call are still on the stack
    // but the exception handler will reset the stack pointer ->
    // ignore them.  A potential result in registers can be ignored as
    // well.

#ifdef ASSERT
    // make sure this code is only executed if there is a pending exception
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
      __ cbnz(rscratch1, L);
      __ stop("StubRoutines::forward exception: no pending exception (1)");
      __ bind(L);
    }
#endif

    // compute exception handler into r19

    // call the VM to find the handler address associated with the
    // caller address. pass thread in r0 and caller pc (ret address)
    // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
    // the stack.
    __ mov(c_rarg1, lr);
    // lr will be trashed by the VM call so we move it to R19
    // (callee-saved) because we also need to pass it to the handler
    // returned by this call.
    __ mov(r19, lr);
    BLOCK_COMMENT("call exception_handler_for_return_address");
    __ call_VM_leaf(CAST_FROM_FN_PTR(address,
                         SharedRuntime::exception_handler_for_return_address),
                    rthread, c_rarg1);
    // Reinitialize the ptrue predicate register, in case the external runtime
    // call clobbers ptrue reg, as we may return to SVE compiled code.
    __ reinitialize_ptrue();

    // we should not really care that lr is no longer the callee
    // address. we saved the value the handler needs in r19 so we can
    // just copy it to r3. however, the C2 handler will push its own
    // frame and then calls into the VM and the VM code asserts that
    // the PC for the frame above the handler belongs to a compiled
    // Java method. So, we restore lr here to satisfy that assert.
    __ mov(lr, r19);
    // setup r0 & r3 & clear pending exception
    __ mov(r3, r19);
    __ mov(r19, r0);
    __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
    __ str(zr, Address(rthread, Thread::pending_exception_offset()));

#ifdef ASSERT
    // make sure exception is set
    {
      Label L;
      __ cbnz(r0, L);
      __ stop("StubRoutines::forward exception: no pending exception (2)");
      __ bind(L);
    }
#endif

    // continue at exception handler
    // r0: exception
    // r3: throwing pc
    // r19: exception handler
    __ verify_oop(r0);
    __ br(r19);

    return start;
  }

  // Non-destructive plausibility checks for oops
  //
  // Arguments:
  //    r0: oop to verify
  //    rscratch1: error message
  //
  // Stack after saving c_rarg3:
  //    [tos + 0]: saved c_rarg3
  //    [tos + 1]: saved c_rarg2
  //    [tos + 2]: saved lr
  //    [tos + 3]: saved rscratch2
  //    [tos + 4]: saved r0
  //    [tos + 5]: saved rscratch1
  address generate_verify_oop() {

    StubCodeMark mark(this, "StubRoutines", "verify_oop");
    address start = __ pc();

    Label exit, error;

    // save c_rarg2 and c_rarg3
    __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));

    // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ ldr(c_rarg3, Address(c_rarg2));
    __ add(c_rarg3, c_rarg3, 1);
    __ str(c_rarg3, Address(c_rarg2));

    // object is in r0
    // make sure object is 'reasonable'
    __ cbz(r0, exit); // if obj is null it is OK

    BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
    bs_asm->check_oop(_masm, r0, c_rarg2, c_rarg3, error);

    // return if everything seems ok
    __ bind(exit);

    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
    __ ret(lr);

    // handle errors
    __ bind(error);
    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));

    __ push(RegSet::range(r0, r29), sp);
    // debug(char* msg, int64_t pc, int64_t regs[])
    __ mov(c_rarg0, rscratch1);      // pass address of error message
    __ mov(c_rarg1, lr);             // pass return address
    __ mov(c_rarg2, sp);             // pass address of regs on stack
#ifndef PRODUCT
    assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
#endif
    BLOCK_COMMENT("call MacroAssembler::debug");
    __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
    __ blr(rscratch1);
    __ hlt(0);

    return start;
  }

  // Generate indices for iota vector.
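  // The data emitted below is a table of per-lane index constants, one
  // 16-byte row per element size: byte lanes 0..15, halfword lanes 0..7,
  // word lanes 0..3 and doubleword lanes 0..1, followed by the same
  // indices as float and double constants.  n.b. vector code needing an
  // iota vector is expected to load the row matching its element type;
  // the exact consumers are not spelled out here.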
  address generate_iota_indices(const char *stub_name) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", stub_name);
    address start = __ pc();
    // B
    __ emit_data64(0x0706050403020100, relocInfo::none);
    __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none);
    // H
    __ emit_data64(0x0003000200010000, relocInfo::none);
    __ emit_data64(0x0007000600050004, relocInfo::none);
    // S
    __ emit_data64(0x0000000100000000, relocInfo::none);
    __ emit_data64(0x0000000300000002, relocInfo::none);
    // D
    __ emit_data64(0x0000000000000000, relocInfo::none);
    __ emit_data64(0x0000000000000001, relocInfo::none);
    // S - FP
    __ emit_data64(0x3F80000000000000, relocInfo::none); // 0.0f, 1.0f
    __ emit_data64(0x4040000040000000, relocInfo::none); // 2.0f, 3.0f
    // D - FP
    __ emit_data64(0x0000000000000000, relocInfo::none); // 0.0d
    __ emit_data64(0x3FF0000000000000, relocInfo::none); // 1.0d
    return start;
  }

  // The inner part of zero_words().  This is the bulk operation,
  // zeroing words in blocks, possibly using DC ZVA to do it.  The
  // caller is responsible for zeroing the last few words.
  //
  // Inputs:
  // r10: the HeapWord-aligned base address of an array to zero.
  // r11: the count in HeapWords, r11 > 0.
  //
  // Returns r10 and r11, adjusted for the caller to clear.
  // r10: the base address of the tail of words left to clear.
  // r11: the number of words in the tail.
  //      r11 < MacroAssembler::zero_words_block_size.

  address generate_zero_blocks() {
    Label done;
    Label base_aligned;

    Register base = r10, cnt = r11;

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "zero_blocks");
    address start = __ pc();

    if (UseBlockZeroing) {
      int zva_length = VM_Version::zva_length();

      // Ensure ZVA length can be divided by 16. This is required by
      // the subsequent operations.
      assert (zva_length % 16 == 0, "Unexpected ZVA Length");

      __ tbz(base, 3, base_aligned);
      __ str(zr, Address(__ post(base, 8)));
      __ sub(cnt, cnt, 1);
      __ bind(base_aligned);

      // Ensure count >= zva_length * 2 so that it still deserves a zva after
      // alignment.
      Label small;
      int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
      __ subs(rscratch1, cnt, low_limit >> 3);
      __ br(Assembler::LT, small);
      __ zero_dcache_blocks(base, cnt);
      __ bind(small);
    }

    {
      // Number of stp instructions we'll unroll
      const int unroll =
        MacroAssembler::zero_words_block_size / 2;
      // Clear the remaining blocks.
      Label loop;
      __ subs(cnt, cnt, unroll * 2);
      __ br(Assembler::LT, done);
      __ bind(loop);
      for (int i = 0; i < unroll; i++)
        __ stp(zr, zr, __ post(base, 16));
      __ subs(cnt, cnt, unroll * 2);
      __ br(Assembler::GE, loop);
      __ bind(done);
      __ add(cnt, cnt, unroll * 2);
    }

    __ ret(lr);

    return start;
  }


  typedef enum {
    copy_forwards = 1,
    copy_backwards = -1
  } copy_direction;

  // Helper object to reduce noise when telling the GC barriers how to perform loads and stores
  // for arraycopy stubs.
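  // n.b. the gct*/gcvt* registers are scratch temporaries handed through to
  // the BarrierSetAssembler; the _8/_16/_32 suffixes below name the access
  // width in bytes (a single X register, an X pair, or a Q pair).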
  class ArrayCopyBarrierSetHelper : StackObj {
    BarrierSetAssembler* _bs_asm;
    MacroAssembler* _masm;
    DecoratorSet _decorators;
    BasicType _type;
    Register _gct1;
    Register _gct2;
    Register _gct3;
    FloatRegister _gcvt1;
    FloatRegister _gcvt2;
    FloatRegister _gcvt3;

  public:
    ArrayCopyBarrierSetHelper(MacroAssembler* masm,
                              DecoratorSet decorators,
                              BasicType type,
                              Register gct1,
                              Register gct2,
                              Register gct3,
                              FloatRegister gcvt1,
                              FloatRegister gcvt2,
                              FloatRegister gcvt3)
      : _bs_asm(BarrierSet::barrier_set()->barrier_set_assembler()),
        _masm(masm),
        _decorators(decorators),
        _type(type),
        _gct1(gct1),
        _gct2(gct2),
        _gct3(gct3),
        _gcvt1(gcvt1),
        _gcvt2(gcvt2),
        _gcvt3(gcvt3) {
    }

    void copy_load_at_32(FloatRegister dst1, FloatRegister dst2, Address src) {
      _bs_asm->copy_load_at(_masm, _decorators, _type, 32,
                            dst1, dst2, src,
                            _gct1, _gct2, _gcvt1);
    }

    void copy_store_at_32(Address dst, FloatRegister src1, FloatRegister src2) {
      _bs_asm->copy_store_at(_masm, _decorators, _type, 32,
                             dst, src1, src2,
                             _gct1, _gct2, _gct3, _gcvt1, _gcvt2, _gcvt3);
    }

    void copy_load_at_16(Register dst1, Register dst2, Address src) {
      _bs_asm->copy_load_at(_masm, _decorators, _type, 16,
                            dst1, dst2, src,
                            _gct1);
    }

    void copy_store_at_16(Address dst, Register src1, Register src2) {
      _bs_asm->copy_store_at(_masm, _decorators, _type, 16,
                             dst, src1, src2,
                             _gct1, _gct2, _gct3);
    }

    void copy_load_at_8(Register dst, Address src) {
      _bs_asm->copy_load_at(_masm, _decorators, _type, 8,
                            dst, noreg, src,
                            _gct1);
    }

    void copy_store_at_8(Address dst, Register src) {
      _bs_asm->copy_store_at(_masm, _decorators, _type, 8,
                             dst, src, noreg,
                             _gct1, _gct2, _gct3);
    }
  };

  // Bulk copy of blocks of 8 words.
  //
  // count is a count of words.
  //
  // Precondition: count >= 8
  //
  // Postconditions:
  //
  // The least significant bit of count contains the remaining count
  // of words to copy.  The rest of count is trash.
  //
  // s and d are adjusted to point to the remaining words to copy
  //
  void generate_copy_longs(DecoratorSet decorators, BasicType type, Label &start, Register s, Register d, Register count,
                           copy_direction direction) {
    int unit = wordSize * direction;
    int bias = (UseSIMDForMemoryOps ? 4 : 2) * wordSize;
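    // n.b. unit is the direction-signed word size used by the pre-indexed
    // addresses below.  For a forward copy s and d are biased down by
    // 'bias' so that the first access of each 8-word block, issued at
    // offset 2 * unit (4 * unit with SIMD), lands at the start of the
    // block while the pre-indexed write-back advances the pointer by
    // 8 words.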

    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
      t4 = r7, t5 = r11, t6 = r12, t7 = r13;
    const Register stride = r14;
    const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
    const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
    ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);

    assert_different_registers(rscratch1, rscratch2, t0, t1, t2, t3, t4, t5, t6, t7);
    assert_different_registers(s, d, count, rscratch1, rscratch2);

    Label again, drain;
    const char *stub_name;
    if (direction == copy_forwards)
      stub_name = "forward_copy_longs";
    else
      stub_name = "backward_copy_longs";

    __ align(CodeEntryAlignment);

    StubCodeMark mark(this, "StubRoutines", stub_name);

    __ bind(start);

    Label unaligned_copy_long;
    if (AvoidUnalignedAccesses) {
      __ tbnz(d, 3, unaligned_copy_long);
    }

    if (direction == copy_forwards) {
      __ sub(s, s, bias);
      __ sub(d, d, bias);
    }

#ifdef ASSERT
    // Make sure we are never given < 8 words
    {
      Label L;
      __ cmp(count, (u1)8);
      __ br(Assembler::GE, L);
      __ stop("generate_copy_longs called with < 8 words");
      __ bind(L);
    }
#endif

    // Fill 8 registers
    if (UseSIMDForMemoryOps) {
      bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
      bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
      bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
      bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
      bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 16);
    __ br(Assembler::LO, drain);

    int prefetch = PrefetchCopyIntervalInBytes;
    bool use_stride = false;
    if (direction == copy_backwards) {
      use_stride = prefetch > 256;
      prefetch = -prefetch;
      if (use_stride) __ mov(stride, prefetch);
    }

    __ bind(again);

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
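    // The main loop is software-pipelined: each iteration stores the eight
    // words loaded by the previous iteration and loads the next eight, so
    // the registers still live at loop exit are flushed in the drain block.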

    if (UseSIMDForMemoryOps) {
      bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
      bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
      bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
      bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
      bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
      bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
      bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
      bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
      bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
      bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
      bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 8);
    __ br(Assembler::HS, again);

    // Drain
    __ bind(drain);
    if (UseSIMDForMemoryOps) {
      bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
      bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
    } else {
      bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
      bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
      bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
      bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
    }

    {
      Label L1, L2;
      __ tbz(count, exact_log2(4), L1);
      if (UseSIMDForMemoryOps) {
        bs.copy_load_at_32(v0, v1, Address(__ pre(s, 4 * unit)));
        bs.copy_store_at_32(Address(__ pre(d, 4 * unit)), v0, v1);
      } else {
        bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
        bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
        bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
        bs.copy_store_at_16(Address(__ pre(d, 4 * unit)), t2, t3);
      }
      __ bind(L1);

      if (direction == copy_forwards) {
        __ add(s, s, bias);
        __ add(d, d, bias);
      }

      __ tbz(count, 1, L2);
      bs.copy_load_at_16(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
      bs.copy_store_at_16(Address(__ adjust(d, 2 * unit, direction == copy_backwards)), t0, t1);
      __ bind(L2);
    }

    __ ret(lr);

    if (AvoidUnalignedAccesses) {
      Label drain, again;
      // Register order for storing. Order is different for backward copy.

      __ bind(unaligned_copy_long);

      // source address is even aligned, target odd aligned
      //
      // when forward copying word pairs we read long pairs at offsets
      // {0, 2, 4, 6} (in long words). when backwards copying we read
      // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
      // address by -2 in the forwards case so we can compute the
      // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
      // or -1.
      //
      // when forward copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
      // zero offset we adjust the destination by -1 which means we
      // have to use offsets { 1, 2, 4, 6, 8} * unit for the stores.
      //
      // When backwards copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
      // offsets {1, 3, 5, 7, 8} * unit.
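      //
      // n.b. as a concrete example, for a forwards copy (unit == wordSize)
      // the first 64-byte block is written with an str at destination byte
      // offset 0, three stps at byte offsets 8, 24 and 40, and a final str
      // at byte offset 56, once the -1 word bias applied to d is taken
      // into account.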

      if (direction == copy_forwards) {
        __ sub(s, s, 16);
        __ sub(d, d, 8);
      }

      // Fill 8 registers
      //
      // for forwards copy s was offset by -16 from the original input
      // value of s so the register contents are at these offsets
      // relative to the 64 bit block addressed by that original input
      // and so on for each successive 64 byte block when s is updated
      //
      // t0 at offset 0,  t1 at offset 8
      // t2 at offset 16, t3 at offset 24
      // t4 at offset 32, t5 at offset 40
      // t6 at offset 48, t7 at offset 56

      // for backwards copy s was not offset so the register contents
      // are at these offsets into the preceding 64 byte block
      // relative to that original input and so on for each successive
      // preceding 64 byte block when s is updated. this explains the
      // slightly counter-intuitive looking pattern of register usage
      // in the stp instructions for backwards copy.
      //
      // t0 at offset -16, t1 at offset -8
      // t2 at offset -32, t3 at offset -24
      // t4 at offset -48, t5 at offset -40
      // t6 at offset -64, t7 at offset -56

      bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
      bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
      bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
      bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));

      __ subs(count, count, 16);
      __ br(Assembler::LO, drain);

      int prefetch = PrefetchCopyIntervalInBytes;
      bool use_stride = false;
      if (direction == copy_backwards) {
        use_stride = prefetch > 256;
        prefetch = -prefetch;
        if (use_stride) __ mov(stride, prefetch);
      }

      __ bind(again);

      if (PrefetchCopyIntervalInBytes > 0)
        __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

      if (direction == copy_forwards) {
        // allowing for the offset of -8 the store instructions place
        // registers into the target 64 bit block at the following
        // offsets
        //
        // t0 at offset 0
        // t1 at offset 8,  t2 at offset 16
        // t3 at offset 24, t4 at offset 32
        // t5 at offset 40, t6 at offset 48
        // t7 at offset 56

        bs.copy_store_at_8(Address(d, 1 * unit), t0);
        bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
        bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
        bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
        bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
        bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
        bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
        bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
        bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
      } else {
        // d was not offset when we started so the registers are
        // written into the 64 bit block preceding d with the following
        // offsets
        //
        // t1 at offset -8
        // t3 at offset -24, t0 at offset -16
        // t5 at offset -40, t2 at offset -32
        // t7 at offset -56, t4 at offset -48
        // t6 at offset -64
        //
        // note that this matches the offsets previously noted for the
        // loads

        bs.copy_store_at_8(Address(d, 1 * unit), t1);
        bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
        bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
        bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
        bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
        bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
        bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
        bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
        bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
      }

      __ subs(count, count, 8);
      __ br(Assembler::HS, again);

      // Drain
      //
      // this uses the same pattern of offsets and register arguments
      // as above
      __ bind(drain);
      if (direction == copy_forwards) {
        bs.copy_store_at_8(Address(d, 1 * unit), t0);
        bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
        bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
        bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
        bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
      } else {
        bs.copy_store_at_8(Address(d, 1 * unit), t1);
        bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
        bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
        bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
        bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
      }
      // now we need to copy any remaining part block which may
      // include a 4 word block subblock and/or a 2 word subblock.
      // bits 2 and 1 in the count are the tell-tale for whether we
      // have each such subblock
      {
        Label L1, L2;
        __ tbz(count, exact_log2(4), L1);
        // this is the same as above but copying only 4 longs hence
        // with only one intervening stp between the str instructions
        // but note that the offsets and registers still follow the
        // same pattern
        bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
        bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
        if (direction == copy_forwards) {
          bs.copy_store_at_8(Address(d, 1 * unit), t0);
          bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
          bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t3);
        } else {
          bs.copy_store_at_8(Address(d, 1 * unit), t1);
          bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
          bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t2);
        }
        __ bind(L1);

        __ tbz(count, 1, L2);
        // this is the same as above but copying only 2 longs hence
        // there is no intervening stp between the str instructions
        // but note that the offset and register patterns are still
        // the same
        bs.copy_load_at_16(t0, t1, Address(__ pre(s, 2 * unit)));
        if (direction == copy_forwards) {
          bs.copy_store_at_8(Address(d, 1 * unit), t0);
          bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t1);
        } else {
          bs.copy_store_at_8(Address(d, 1 * unit), t1);
          bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t0);
        }
        __ bind(L2);

        // for forwards copy we need to re-adjust the offsets we
        // applied so that s and d follow the last words written

        if (direction == copy_forwards) {
          __ add(s, s, 16);
          __ add(d, d, 8);
        }

      }

      __ ret(lr);
    }
  }

  // Small copy: less than 16 bytes.
  //
  // NB: Ignores all of the bits of count which represent more than 15
  // bytes, so a caller doesn't have to mask them.

  void copy_memory_small(DecoratorSet decorators, BasicType type, Register s, Register d, Register count, int step) {
    bool is_backwards = step < 0;
    size_t granularity = uabs(step);
    int direction = is_backwards ? -1 : 1;
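    // count is in elements, so bit k of count stands for 2^k elements,
    // i.e. 2^k * granularity bytes.  Each tbz below tests the bit that
    // corresponds to an 8, 4, 2 or 1 byte tail chunk and copies that
    // chunk if it is present.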

    Label Lword, Lint, Lshort, Lbyte;

    assert(granularity
           && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");

    const Register t0 = r3;
    const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
    ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, fnoreg, fnoreg, fnoreg);

    // ??? I don't know if this bit-test-and-branch is the right thing
    // to do.  It does a lot of jumping, resulting in several
    // mispredicted branches.  It might make more sense to do this
    // with something like Duff's device with a single computed branch.

    __ tbz(count, 3 - exact_log2(granularity), Lword);
    bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
    bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
    __ bind(Lword);

    if (granularity <= sizeof (jint)) {
      __ tbz(count, 2 - exact_log2(granularity), Lint);
      __ ldrw(t0, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
      __ strw(t0, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
      __ bind(Lint);
    }

    if (granularity <= sizeof (jshort)) {
      __ tbz(count, 1 - exact_log2(granularity), Lshort);
      __ ldrh(t0, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
      __ strh(t0, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
      __ bind(Lshort);
    }

    if (granularity <= sizeof (jbyte)) {
      __ tbz(count, 0, Lbyte);
      __ ldrb(t0, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
      __ strb(t0, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
      __ bind(Lbyte);
    }
  }

  Label copy_f, copy_b;
  Label copy_obj_f, copy_obj_b;
  Label copy_obj_uninit_f, copy_obj_uninit_b;

  // All-singing all-dancing memory copy.
  //
  // Copy count units of memory from s to d.  The size of a unit is
  // step, which can be positive or negative depending on the direction
  // of copy.  If is_aligned is false, we align the source address.
  //

  void copy_memory(DecoratorSet decorators, BasicType type, bool is_aligned,
                   Register s, Register d, Register count, int step) {
    copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
    bool is_backwards = step < 0;
    unsigned int granularity = uabs(step);
    const Register t0 = r3, t1 = r4;

    // <= 80 (or 96 for SIMD) bytes do inline. Direction doesn't matter because we always
    // load all the data before writing anything
    Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
    const Register t2 = r5, t3 = r6, t4 = r7, t5 = r11;
    const Register t6 = r12, t7 = r13, t8 = r14, t9 = r15;
    const Register send = r17, dend = r16;
    const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
    const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
    ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(Address(s, 0), PLDL1KEEP);
    __ cmp(count, u1((UseSIMDForMemoryOps ? 96 : 80) / granularity));
    __ br(Assembler::HI, copy_big);

    __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
    __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));

    __ cmp(count, u1(16/granularity));
    __ br(Assembler::LS, copy16);

    __ cmp(count, u1(64/granularity));
    __ br(Assembler::HI, copy80);

    __ cmp(count, u1(32/granularity));
    __ br(Assembler::LS, copy32);

    // 33..64 bytes
    if (UseSIMDForMemoryOps) {
      bs.copy_load_at_32(v0, v1, Address(s, 0));
      bs.copy_load_at_32(v2, v3, Address(send, -32));
      bs.copy_store_at_32(Address(d, 0), v0, v1);
      bs.copy_store_at_32(Address(dend, -32), v2, v3);
    } else {
      bs.copy_load_at_16(t0, t1, Address(s, 0));
      bs.copy_load_at_16(t2, t3, Address(s, 16));
      bs.copy_load_at_16(t4, t5, Address(send, -32));
      bs.copy_load_at_16(t6, t7, Address(send, -16));

      bs.copy_store_at_16(Address(d, 0), t0, t1);
      bs.copy_store_at_16(Address(d, 16), t2, t3);
      bs.copy_store_at_16(Address(dend, -32), t4, t5);
      bs.copy_store_at_16(Address(dend, -16), t6, t7);
    }
    __ b(finish);

    // 17..32 bytes
    __ bind(copy32);
    bs.copy_load_at_16(t0, t1, Address(s, 0));
    bs.copy_load_at_16(t6, t7, Address(send, -16));

    bs.copy_store_at_16(Address(d, 0), t0, t1);
    bs.copy_store_at_16(Address(dend, -16), t6, t7);
    __ b(finish);

    // 65..80/96 bytes
    // (96 bytes if SIMD because we do 32 bytes per instruction)
    __ bind(copy80);
    if (UseSIMDForMemoryOps) {
      bs.copy_load_at_32(v0, v1, Address(s, 0));
      bs.copy_load_at_32(v2, v3, Address(s, 32));
      // Unaligned pointers can be an issue for copying.
      // The issue has more chances to happen when granularity of data is
      // less than 4 (sizeof(jint)).  Pointers for arrays of jint are at least
      // 4 byte aligned.  Pointers for arrays of jlong are 8 byte aligned.
      // The most performance drop has been seen for the range 65-80 bytes.
      // For such cases using the pair of ldp/stp instead of the third pair of
      // ldpq/stpq fixes the performance issue.
      if (granularity < sizeof (jint)) {
        Label copy96;
        __ cmp(count, u1(80/granularity));
        __ br(Assembler::HI, copy96);
        bs.copy_load_at_16(t0, t1, Address(send, -16));

        bs.copy_store_at_32(Address(d, 0), v0, v1);
        bs.copy_store_at_32(Address(d, 32), v2, v3);

        bs.copy_store_at_16(Address(dend, -16), t0, t1);
        __ b(finish);

        __ bind(copy96);
      }
      bs.copy_load_at_32(v4, v5, Address(send, -32));

      bs.copy_store_at_32(Address(d, 0), v0, v1);
      bs.copy_store_at_32(Address(d, 32), v2, v3);

      bs.copy_store_at_32(Address(dend, -32), v4, v5);
    } else {
      bs.copy_load_at_16(t0, t1, Address(s, 0));
      bs.copy_load_at_16(t2, t3, Address(s, 16));
      bs.copy_load_at_16(t4, t5, Address(s, 32));
      bs.copy_load_at_16(t6, t7, Address(s, 48));
      bs.copy_load_at_16(t8, t9, Address(send, -16));

      bs.copy_store_at_16(Address(d, 0), t0, t1);
      bs.copy_store_at_16(Address(d, 16), t2, t3);
      bs.copy_store_at_16(Address(d, 32), t4, t5);
      bs.copy_store_at_16(Address(d, 48), t6, t7);
      bs.copy_store_at_16(Address(dend, -16), t8, t9);
    }
    __ b(finish);

    // 0..16 bytes
    __ bind(copy16);
    __ cmp(count, u1(8/granularity));
    __ br(Assembler::LO, copy8);

    // 8..16 bytes
    bs.copy_load_at_8(t0, Address(s, 0));
    bs.copy_load_at_8(t1, Address(send, -8));
    bs.copy_store_at_8(Address(d, 0), t0);
    bs.copy_store_at_8(Address(dend, -8), t1);
    __ b(finish);

    if (granularity < 8) {
      // 4..7 bytes
      __ bind(copy8);
      __ tbz(count, 2 - exact_log2(granularity), copy4);
      __ ldrw(t0, Address(s, 0));
      __ ldrw(t1, Address(send, -4));
      __ strw(t0, Address(d, 0));
      __ strw(t1, Address(dend, -4));
      __ b(finish);
      if (granularity < 4) {
        // 0..3 bytes
        __ bind(copy4);
        __ cbz(count, finish); // get rid of 0 case
        if (granularity == 2) {
          __ ldrh(t0, Address(s, 0));
          __ strh(t0, Address(d, 0));
        } else { // granularity == 1
          // Now 1..3 bytes.  Handle the 1 and 2 byte case by copying
          // the first and last byte.
          // Handle the 3 byte case by loading and storing base + count/2
          // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
          // This does mean in the 1 byte case we load/store the same
          // byte 3 times.
          __ lsr(count, count, 1);
          __ ldrb(t0, Address(s, 0));
          __ ldrb(t1, Address(send, -1));
          __ ldrb(t2, Address(s, count));
          __ strb(t0, Address(d, 0));
          __ strb(t1, Address(dend, -1));
          __ strb(t2, Address(d, count));
        }
        __ b(finish);
      }
    }

    __ bind(copy_big);
    if (is_backwards) {
      __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
      __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
    }

    // Now we've got the small case out of the way we can align the
    // source address on a 2-word boundary.

    // Here we will materialize a count in r15, which is used by copy_memory_small
    // and the various generate_copy_longs stubs that we use for 2 word aligned bytes.
    // Up until here, we have used t9, which aliases r15, but from here on, that register
    // can not be used as a temp register, as it contains the count.

    Label aligned;

    if (is_aligned) {
      // We may have to adjust by 1 word to get s 2-word-aligned.
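      // (is_aligned only promises HeapWord (8-byte) alignment, so at most
      // one word needs to be copied by hand before the bulk loop can rely
      // on 16-byte alignment of s.)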
      __ tbz(s, exact_log2(wordSize), aligned);
      bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
      bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
      __ sub(count, count, wordSize/granularity);
    } else {
      if (is_backwards) {
        __ andr(r15, s, 2 * wordSize - 1);
      } else {
        __ neg(r15, s);
        __ andr(r15, r15, 2 * wordSize - 1);
      }
      // r15 is the byte adjustment needed to align s.
      __ cbz(r15, aligned);
      int shift = exact_log2(granularity);
      if (shift)  __ lsr(r15, r15, shift);
      __ sub(count, count, r15);

#if 0
      // ?? This code is only correct for a disjoint copy.  It may or
      // may not make sense to use it in that case.

      // Copy the first pair; s and d may not be aligned.
      __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
      __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));

      // Align s and d, adjust count
      if (is_backwards) {
        __ sub(s, s, r15);
        __ sub(d, d, r15);
      } else {
        __ add(s, s, r15);
        __ add(d, d, r15);
      }
#else
      copy_memory_small(decorators, type, s, d, r15, step);
#endif
    }

    __ bind(aligned);

    // s is now 2-word-aligned.

    // We have a count of units and some trailing bytes.  Adjust the
    // count and do a bulk copy of words.
    __ lsr(r15, count, exact_log2(wordSize/granularity));
    if (direction == copy_forwards) {
      if (type != T_OBJECT) {
        __ bl(copy_f);
      } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
        __ bl(copy_obj_uninit_f);
      } else {
        __ bl(copy_obj_f);
      }
    } else {
      if (type != T_OBJECT) {
        __ bl(copy_b);
      } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
        __ bl(copy_obj_uninit_b);
      } else {
        __ bl(copy_obj_b);
      }
    }

    // And the tail.
    copy_memory_small(decorators, type, s, d, count, step);

    if (granularity >= 8) __ bind(copy8);
    if (granularity >= 4) __ bind(copy4);
    __ bind(finish);
  }


  void clobber_registers() {
#ifdef ASSERT
    RegSet clobbered
      = MacroAssembler::call_clobbered_gp_registers() - rscratch1;
    __ mov(rscratch1, (uint64_t)0xdeadbeef);
    __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
    for (RegSetIterator<Register> it = clobbered.begin(); *it != noreg; ++it) {
      __ mov(*it, rscratch1);
    }
#endif

  }

  // Scan over array at a for count oops, verifying each one.
  // Preserves a and count, clobbers rscratch1 and rscratch2.
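  // 'size' is the element size in bytes; anything other than wordSize is
  // loaded as a 32-bit narrowOop and decoded before verification.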
  void verify_oop_array (int size, Register a, Register count, Register temp) {
    Label loop, end;
    __ mov(rscratch1, a);
    __ mov(rscratch2, zr);
    __ bind(loop);
    __ cmp(rscratch2, count);
    __ br(Assembler::HS, end);
    if (size == wordSize) {
      __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
      __ verify_oop(temp);
    } else {
      __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
      __ decode_heap_oop(temp); // calls verify_oop
    }
    __ add(rscratch2, rscratch2, 1);
    __ b(loop);
    __ bind(end);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  // Side Effects:
  //   disjoint_int_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_int_oop_copy().
  //
  address generate_disjoint_copy(int size, bool aligned, bool is_oop, address *entry,
                                 const char *name, bool dest_uninitialized = false) {
    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
    RegSet saved_reg = RegSet::of(s, d, count);
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    __ enter();

    if (entry != nullptr) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
    if (dest_uninitialized) {
      decorators |= IS_DEST_UNINITIALIZED;
    }
    if (aligned) {
      decorators |= ARRAYCOPY_ALIGNED;
    }

    BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
    bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);

    if (is_oop) {
      // save regs before copy_memory
      __ push(RegSet::of(d, count), sp);
    }
    {
      // UnsafeMemoryAccess page error: continue after unsafe access
      bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
      UnsafeMemoryAccessMark umam(this, add_entry, true);
      copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, size);
    }

    if (is_oop) {
      __ pop(RegSet::of(d, count), sp);
      if (VerifyOops)
        verify_oop_array(size, d, count, r16);
    }

    bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());

    __ leave();
    __ mov(r0, zr); // return 0
    __ ret(lr);
    return start;
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  address generate_conjoint_copy(int size, bool aligned, bool is_oop, address nooverlap_target,
                                 address *entry, const char *name,
                                 bool dest_uninitialized = false) {
    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
    RegSet saved_regs = RegSet::of(s, d, count);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    __ enter();

    if (entry != nullptr) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    // use fwd copy when (d-s) above_equal (count*size)
    __ sub(rscratch1, d, s);
    __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
    __ br(Assembler::HS, nooverlap_target);

    DecoratorSet decorators = IN_HEAP | IS_ARRAY;
    if (dest_uninitialized) {
      decorators |= IS_DEST_UNINITIALIZED;
    }
    if (aligned) {
      decorators |= ARRAYCOPY_ALIGNED;
    }

    BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
    bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);

    if (is_oop) {
      // save regs before copy_memory
      __ push(RegSet::of(d, count), sp);
    }
    {
      // UnsafeMemoryAccess page error: continue after unsafe access
      bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
      UnsafeMemoryAccessMark umam(this, add_entry, true);
      copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, -size);
    }
    if (is_oop) {
      __ pop(RegSet::of(d, count), sp);
      if (VerifyOops)
        verify_oop_array(size, d, count, r16);
    }
    bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
    __ leave();
    __ mov(r0, zr); // return 0
    __ ret(lr);
    return start;
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
  // we let the hardware handle it.  The one to eight bytes within words,
  // dwords or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  // Side Effects:
  //   disjoint_byte_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_byte_copy().
1648 // 1649 address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) { 1650 const bool not_oop = false; 1651 return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name); 1652 } 1653 1654 // Arguments: 1655 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1656 // ignored 1657 // name - stub name string 1658 // 1659 // Inputs: 1660 // c_rarg0 - source array address 1661 // c_rarg1 - destination array address 1662 // c_rarg2 - element count, treated as ssize_t, can be zero 1663 // 1664 // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries, 1665 // we let the hardware handle it. The one to eight bytes within words, 1666 // dwords or qwords that span cache line boundaries will still be loaded 1667 // and stored atomically. 1668 // 1669 address generate_conjoint_byte_copy(bool aligned, address nooverlap_target, 1670 address* entry, const char *name) { 1671 const bool not_oop = false; 1672 return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name); 1673 } 1674 1675 // Arguments: 1676 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1677 // ignored 1678 // name - stub name string 1679 // 1680 // Inputs: 1681 // c_rarg0 - source array address 1682 // c_rarg1 - destination array address 1683 // c_rarg2 - element count, treated as ssize_t, can be zero 1684 // 1685 // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we 1686 // let the hardware handle it. The two or four words within dwords 1687 // or qwords that span cache line boundaries will still be loaded 1688 // and stored atomically. 1689 // 1690 // Side Effects: 1691 // disjoint_short_copy_entry is set to the no-overlap entry point 1692 // used by generate_conjoint_short_copy(). 1693 // 1694 address generate_disjoint_short_copy(bool aligned, 1695 address* entry, const char *name) { 1696 const bool not_oop = false; 1697 return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name); 1698 } 1699 1700 // Arguments: 1701 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1702 // ignored 1703 // name - stub name string 1704 // 1705 // Inputs: 1706 // c_rarg0 - source array address 1707 // c_rarg1 - destination array address 1708 // c_rarg2 - element count, treated as ssize_t, can be zero 1709 // 1710 // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we 1711 // let the hardware handle it. The two or four words within dwords 1712 // or qwords that span cache line boundaries will still be loaded 1713 // and stored atomically. 1714 // 1715 address generate_conjoint_short_copy(bool aligned, address nooverlap_target, 1716 address *entry, const char *name) { 1717 const bool not_oop = false; 1718 return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name); 1719 1720 } 1721 // Arguments: 1722 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1723 // ignored 1724 // name - stub name string 1725 // 1726 // Inputs: 1727 // c_rarg0 - source array address 1728 // c_rarg1 - destination array address 1729 // c_rarg2 - element count, treated as ssize_t, can be zero 1730 // 1731 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1732 // the hardware handle it. The two dwords within qwords that span 1733 // cache line boundaries will still be loaded and stored atomically. 
1734 // 1735 // Side Effects: 1736 // disjoint_int_copy_entry is set to the no-overlap entry point 1737 // used by generate_conjoint_int_oop_copy(). 1738 // 1739 address generate_disjoint_int_copy(bool aligned, address *entry, 1740 const char *name, bool dest_uninitialized = false) { 1741 const bool not_oop = false; 1742 return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name); 1743 } 1744 1745 // Arguments: 1746 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1747 // ignored 1748 // name - stub name string 1749 // 1750 // Inputs: 1751 // c_rarg0 - source array address 1752 // c_rarg1 - destination array address 1753 // c_rarg2 - element count, treated as ssize_t, can be zero 1754 // 1755 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1756 // the hardware handle it. The two dwords within qwords that span 1757 // cache line boundaries will still be loaded and stored atomically. 1758 // 1759 address generate_conjoint_int_copy(bool aligned, address nooverlap_target, 1760 address *entry, const char *name, 1761 bool dest_uninitialized = false) { 1762 const bool not_oop = false; 1763 return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name); 1764 } 1765 1766 1767 // Arguments: 1768 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1769 // ignored 1770 // name - stub name string 1771 // 1772 // Inputs: 1773 // c_rarg0 - source array address 1774 // c_rarg1 - destination array address 1775 // c_rarg2 - element count, treated as size_t, can be zero 1776 // 1777 // Side Effects: 1778 // disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the 1779 // no-overlap entry point used by generate_conjoint_long_oop_copy(). 1780 // 1781 address generate_disjoint_long_copy(bool aligned, address *entry, 1782 const char *name, bool dest_uninitialized = false) { 1783 const bool not_oop = false; 1784 return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name); 1785 } 1786 1787 // Arguments: 1788 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1789 // ignored 1790 // name - stub name string 1791 // 1792 // Inputs: 1793 // c_rarg0 - source array address 1794 // c_rarg1 - destination array address 1795 // c_rarg2 - element count, treated as size_t, can be zero 1796 // 1797 address generate_conjoint_long_copy(bool aligned, 1798 address nooverlap_target, address *entry, 1799 const char *name, bool dest_uninitialized = false) { 1800 const bool not_oop = false; 1801 return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name); 1802 } 1803 1804 // Arguments: 1805 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1806 // ignored 1807 // name - stub name string 1808 // 1809 // Inputs: 1810 // c_rarg0 - source array address 1811 // c_rarg1 - destination array address 1812 // c_rarg2 - element count, treated as size_t, can be zero 1813 // 1814 // Side Effects: 1815 // disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the 1816 // no-overlap entry point used by generate_conjoint_long_oop_copy(). 1817 // 1818 address generate_disjoint_oop_copy(bool aligned, address *entry, 1819 const char *name, bool dest_uninitialized) { 1820 const bool is_oop = true; 1821 const int size = UseCompressedOops ? 
sizeof (jint) : sizeof (jlong); 1822 return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized); 1823 } 1824 1825 // Arguments: 1826 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1827 // ignored 1828 // name - stub name string 1829 // 1830 // Inputs: 1831 // c_rarg0 - source array address 1832 // c_rarg1 - destination array address 1833 // c_rarg2 - element count, treated as size_t, can be zero 1834 // 1835 address generate_conjoint_oop_copy(bool aligned, 1836 address nooverlap_target, address *entry, 1837 const char *name, bool dest_uninitialized) { 1838 const bool is_oop = true; 1839 const int size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1840 return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry, 1841 name, dest_uninitialized); 1842 } 1843 1844 1845 // Helper for generating a dynamic type check. 1846 // Smashes rscratch1, rscratch2. 1847 void generate_type_check(Register sub_klass, 1848 Register super_check_offset, 1849 Register super_klass, 1850 Label& L_success) { 1851 assert_different_registers(sub_klass, super_check_offset, super_klass); 1852 1853 BLOCK_COMMENT("type_check:"); 1854 1855 Label L_miss; 1856 1857 __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, nullptr, 1858 super_check_offset); 1859 __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, nullptr); 1860 1861 // Fall through on failure! 1862 __ BIND(L_miss); 1863 } 1864 1865 // 1866 // Generate checkcasting array copy stub 1867 // 1868 // Input: 1869 // c_rarg0 - source array address 1870 // c_rarg1 - destination array address 1871 // c_rarg2 - element count, treated as ssize_t, can be zero 1872 // c_rarg3 - size_t ckoff (super_check_offset) 1873 // c_rarg4 - oop ckval (super_klass) 1874 // 1875 // Output: 1876 // r0 == 0 - success 1877 // r0 == -1^K - failure, where K is partial transfer count 1878 // 1879 address generate_checkcast_copy(const char *name, address *entry, 1880 bool dest_uninitialized = false) { 1881 1882 Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop; 1883 1884 // Input registers (after setup_arg_regs) 1885 const Register from = c_rarg0; // source array address 1886 const Register to = c_rarg1; // destination array address 1887 const Register count = c_rarg2; // elementscount 1888 const Register ckoff = c_rarg3; // super_check_offset 1889 const Register ckval = c_rarg4; // super_klass 1890 1891 RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4); 1892 RegSet wb_post_saved_regs = RegSet::of(count); 1893 1894 // Registers used as temps (r19, r20, r21, r22 are save-on-entry) 1895 const Register copied_oop = r22; // actual oop copied 1896 const Register count_save = r21; // orig elementscount 1897 const Register start_to = r20; // destination array start address 1898 const Register r19_klass = r19; // oop._klass 1899 1900 // Registers used as gc temps (r5, r6, r7 are save-on-call) 1901 const Register gct1 = r5, gct2 = r6, gct3 = r7; 1902 1903 //--------------------------------------------------------------- 1904 // Assembler stub will be used for this call to arraycopy 1905 // if the two arrays are subtypes of Object[] but the 1906 // destination array type is not equal to or a supertype 1907 // of the source type. Each element must be separately 1908 // checked. 
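// Here ckoff/ckval describe the element type the destination requires:
// ckval is the super klass to check against (for the generic stub this is
// the destination array's element klass) and ckoff is that klass's
// super_check_offset. Every non-null oop loaded in the loop below must
// pass generate_type_check against this pair before it is stored.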
1909 1910 assert_different_registers(from, to, count, ckoff, ckval, start_to, 1911 copied_oop, r19_klass, count_save); 1912 1913 __ align(CodeEntryAlignment); 1914 StubCodeMark mark(this, "StubRoutines", name); 1915 address start = __ pc(); 1916 1917 __ enter(); // required for proper stackwalking of RuntimeStub frame 1918 1919 #ifdef ASSERT 1920 // caller guarantees that the arrays really are different 1921 // otherwise, we would have to make conjoint checks 1922 { Label L; 1923 __ b(L); // conjoint check not yet implemented 1924 __ stop("checkcast_copy within a single array"); 1925 __ bind(L); 1926 } 1927 #endif //ASSERT 1928 1929 // Caller of this entry point must set up the argument registers. 1930 if (entry != nullptr) { 1931 *entry = __ pc(); 1932 BLOCK_COMMENT("Entry:"); 1933 } 1934 1935 // Empty array: Nothing to do. 1936 __ cbz(count, L_done); 1937 __ push(RegSet::of(r19, r20, r21, r22), sp); 1938 1939 #ifdef ASSERT 1940 BLOCK_COMMENT("assert consistent ckoff/ckval"); 1941 // The ckoff and ckval must be mutually consistent, 1942 // even though caller generates both. 1943 { Label L; 1944 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 1945 __ ldrw(start_to, Address(ckval, sco_offset)); 1946 __ cmpw(ckoff, start_to); 1947 __ br(Assembler::EQ, L); 1948 __ stop("super_check_offset inconsistent"); 1949 __ bind(L); 1950 } 1951 #endif //ASSERT 1952 1953 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT; 1954 bool is_oop = true; 1955 int element_size = UseCompressedOops ? 4 : 8; 1956 if (dest_uninitialized) { 1957 decorators |= IS_DEST_UNINITIALIZED; 1958 } 1959 1960 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1961 bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs); 1962 1963 // save the original count 1964 __ mov(count_save, count); 1965 1966 // Copy from low to high addresses 1967 __ mov(start_to, to); // Save destination array start address 1968 __ b(L_load_element); 1969 1970 // ======== begin loop ======== 1971 // (Loop is rotated; its entry is L_load_element.) 1972 // Loop control: 1973 // for (; count != 0; count--) { 1974 // copied_oop = load_heap_oop(from++); 1975 // ... generate_type_check ...; 1976 // store_heap_oop(to++, copied_oop); 1977 // } 1978 __ align(OptoLoopAlignment); 1979 1980 __ BIND(L_store_element); 1981 bs->copy_store_at(_masm, decorators, T_OBJECT, element_size, 1982 __ post(to, element_size), copied_oop, noreg, 1983 gct1, gct2, gct3); 1984 __ sub(count, count, 1); 1985 __ cbz(count, L_do_card_marks); 1986 1987 // ======== loop entry is here ======== 1988 __ BIND(L_load_element); 1989 bs->copy_load_at(_masm, decorators, T_OBJECT, element_size, 1990 copied_oop, noreg, __ post(from, element_size), 1991 gct1); 1992 __ cbz(copied_oop, L_store_element); 1993 1994 __ load_klass(r19_klass, copied_oop);// query the object klass 1995 generate_type_check(r19_klass, ckoff, ckval, L_store_element); 1996 // ======== end loop ======== 1997 1998 // It was a real error; we must depend on the caller to finish the job. 1999 // Register count = remaining oops, count_orig = total oops. 2000 // Emit GC store barriers for the oops we have copied and report 2001 // their number to the caller. 
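// For example: if 3 of 10 oops were stored before an element failed the
// type check, then count_save - count == 3 below, and eon-ing that with zr
// (a bitwise NOT) leaves ~3 == -4 in r0; the caller recovers K as ~r0.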
2002 2003 __ subs(count, count_save, count); // K = partially copied oop count 2004 __ eon(count, count, zr); // report (-1^K) to caller 2005 __ br(Assembler::EQ, L_done_pop); 2006 2007 __ BIND(L_do_card_marks); 2008 bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs); 2009 2010 __ bind(L_done_pop); 2011 __ pop(RegSet::of(r19, r20, r21, r22), sp); 2012 inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr); 2013 2014 __ bind(L_done); 2015 __ mov(r0, count); 2016 __ leave(); 2017 __ ret(lr); 2018 2019 return start; 2020 } 2021 2022 // Perform range checks on the proposed arraycopy. 2023 // Kills temp, but nothing else. 2024 // Also, clean the sign bits of src_pos and dst_pos. 2025 void arraycopy_range_checks(Register src, // source array oop (c_rarg0) 2026 Register src_pos, // source position (c_rarg1) 2027 Register dst, // destination array oo (c_rarg2) 2028 Register dst_pos, // destination position (c_rarg3) 2029 Register length, 2030 Register temp, 2031 Label& L_failed) { 2032 BLOCK_COMMENT("arraycopy_range_checks:"); 2033 2034 assert_different_registers(rscratch1, temp); 2035 2036 // if (src_pos + length > arrayOop(src)->length()) FAIL; 2037 __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes())); 2038 __ addw(temp, length, src_pos); 2039 __ cmpw(temp, rscratch1); 2040 __ br(Assembler::HI, L_failed); 2041 2042 // if (dst_pos + length > arrayOop(dst)->length()) FAIL; 2043 __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes())); 2044 __ addw(temp, length, dst_pos); 2045 __ cmpw(temp, rscratch1); 2046 __ br(Assembler::HI, L_failed); 2047 2048 // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'. 2049 __ movw(src_pos, src_pos); 2050 __ movw(dst_pos, dst_pos); 2051 2052 BLOCK_COMMENT("arraycopy_range_checks done"); 2053 } 2054 2055 // These stubs get called from some dumb test routine. 2056 // I'll write them properly when they're called from 2057 // something that's actually doing something. 2058 static void fake_arraycopy_stub(address src, address dst, int count) { 2059 assert(count == 0, "huh?"); 2060 } 2061 2062 2063 // 2064 // Generate 'unsafe' array copy stub 2065 // Though just as safe as the other stubs, it takes an unscaled 2066 // size_t argument instead of an element count. 2067 // 2068 // Input: 2069 // c_rarg0 - source array address 2070 // c_rarg1 - destination array address 2071 // c_rarg2 - byte count, treated as ssize_t, can be zero 2072 // 2073 // Examines the alignment of the operands and dispatches 2074 // to a long, int, short, or byte copy loop. 
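// For example, with both addresses 8-byte aligned and a byte count of 0x40,
// (s | d | count) has its low three bits clear, so control reaches the long
// copy loop and count is scaled to 8 longwords; if instead bit 0 of any of
// the three values is set, the byte copy loop is taken.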
2075 // 2076 address generate_unsafe_copy(const char *name, 2077 address byte_copy_entry, 2078 address short_copy_entry, 2079 address int_copy_entry, 2080 address long_copy_entry) { 2081 Label L_long_aligned, L_int_aligned, L_short_aligned; 2082 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 2083 2084 __ align(CodeEntryAlignment); 2085 StubCodeMark mark(this, "StubRoutines", name); 2086 address start = __ pc(); 2087 __ enter(); // required for proper stackwalking of RuntimeStub frame 2088 2089 // bump this on entry, not on exit: 2090 inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr); 2091 2092 __ orr(rscratch1, s, d); 2093 __ orr(rscratch1, rscratch1, count); 2094 2095 __ andr(rscratch1, rscratch1, BytesPerLong-1); 2096 __ cbz(rscratch1, L_long_aligned); 2097 __ andr(rscratch1, rscratch1, BytesPerInt-1); 2098 __ cbz(rscratch1, L_int_aligned); 2099 __ tbz(rscratch1, 0, L_short_aligned); 2100 __ b(RuntimeAddress(byte_copy_entry)); 2101 2102 __ BIND(L_short_aligned); 2103 __ lsr(count, count, LogBytesPerShort); // size => short_count 2104 __ b(RuntimeAddress(short_copy_entry)); 2105 __ BIND(L_int_aligned); 2106 __ lsr(count, count, LogBytesPerInt); // size => int_count 2107 __ b(RuntimeAddress(int_copy_entry)); 2108 __ BIND(L_long_aligned); 2109 __ lsr(count, count, LogBytesPerLong); // size => long_count 2110 __ b(RuntimeAddress(long_copy_entry)); 2111 2112 return start; 2113 } 2114 2115 // 2116 // Generate generic array copy stubs 2117 // 2118 // Input: 2119 // c_rarg0 - src oop 2120 // c_rarg1 - src_pos (32-bits) 2121 // c_rarg2 - dst oop 2122 // c_rarg3 - dst_pos (32-bits) 2123 // c_rarg4 - element count (32-bits) 2124 // 2125 // Output: 2126 // r0 == 0 - success 2127 // r0 == -1^K - failure, where K is partial transfer count 2128 // 2129 address generate_generic_copy(const char *name, 2130 address byte_copy_entry, address short_copy_entry, 2131 address int_copy_entry, address oop_copy_entry, 2132 address long_copy_entry, address checkcast_copy_entry) { 2133 2134 Label L_failed, L_objArray; 2135 Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs; 2136 2137 // Input registers 2138 const Register src = c_rarg0; // source array oop 2139 const Register src_pos = c_rarg1; // source position 2140 const Register dst = c_rarg2; // destination array oop 2141 const Register dst_pos = c_rarg3; // destination position 2142 const Register length = c_rarg4; 2143 2144 2145 // Registers used as temps 2146 const Register dst_klass = c_rarg5; 2147 2148 __ align(CodeEntryAlignment); 2149 2150 StubCodeMark mark(this, "StubRoutines", name); 2151 2152 address start = __ pc(); 2153 2154 __ enter(); // required for proper stackwalking of RuntimeStub frame 2155 2156 // bump this on entry, not on exit: 2157 inc_counter_np(SharedRuntime::_generic_array_copy_ctr); 2158 2159 //----------------------------------------------------------------------- 2160 // Assembler stub will be used for this call to arraycopy 2161 // if the following conditions are met: 2162 // 2163 // (1) src and dst must not be null. 2164 // (2) src_pos must not be negative. 2165 // (3) dst_pos must not be negative. 2166 // (4) length must not be negative. 2167 // (5) src klass and dst klass should be the same and not null. 2168 // (6) src and dst should be arrays. 2169 // (7) src_pos + length must not exceed length of src. 2170 // (8) dst_pos + length must not exceed length of dst. 
2171 // 2172 2173 // if (src == nullptr) return -1; 2174 __ cbz(src, L_failed); 2175 2176 // if (src_pos < 0) return -1; 2177 __ tbnz(src_pos, 31, L_failed); // i.e. sign bit set 2178 2179 // if (dst == nullptr) return -1; 2180 __ cbz(dst, L_failed); 2181 2182 // if (dst_pos < 0) return -1; 2183 __ tbnz(dst_pos, 31, L_failed); // i.e. sign bit set 2184 2185 // registers used as temp 2186 const Register scratch_length = r16; // elements count to copy 2187 const Register scratch_src_klass = r17; // array klass 2188 const Register lh = r15; // layout helper 2189 2190 // if (length < 0) return -1; 2191 __ movw(scratch_length, length); // length (elements count, 32-bits value) 2192 __ tbnz(scratch_length, 31, L_failed); // i.e. sign bit set 2193 2194 __ load_klass(scratch_src_klass, src); 2195 #ifdef ASSERT 2196 // assert(src->klass() != nullptr); 2197 { 2198 BLOCK_COMMENT("assert klasses not null {"); 2199 Label L1, L2; 2200 __ cbnz(scratch_src_klass, L2); // it is broken if klass is null 2201 __ bind(L1); 2202 __ stop("broken null klass"); 2203 __ bind(L2); 2204 __ load_klass(rscratch1, dst); 2205 __ cbz(rscratch1, L1); // this would be broken also 2206 BLOCK_COMMENT("} assert klasses not null done"); 2207 } 2208 #endif 2209 2210 // Load layout helper (32-bits) 2211 // 2212 // |array_tag| | header_size | element_type | |log2_element_size| 2213 // 32 30 24 16 8 2 0 2214 // 2215 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0 2216 // 2217 2218 const int lh_offset = in_bytes(Klass::layout_helper_offset()); 2219 2220 // Handle objArrays completely differently... 2221 const jint objArray_lh = Klass::array_layout_helper(T_OBJECT); 2222 __ ldrw(lh, Address(scratch_src_klass, lh_offset)); 2223 __ movw(rscratch1, objArray_lh); 2224 __ eorw(rscratch2, lh, rscratch1); 2225 __ cbzw(rscratch2, L_objArray); 2226 2227 // if (src->klass() != dst->klass()) return -1; 2228 __ load_klass(rscratch2, dst); 2229 __ eor(rscratch2, rscratch2, scratch_src_klass); 2230 __ cbnz(rscratch2, L_failed); 2231 2232 // Check for flat inline type array -> return -1 2233 __ test_flat_array_oop(src, rscratch2, L_failed); 2234 2235 // Check for null-free (non-flat) inline type array -> handle as object array 2236 __ test_null_free_array_oop(src, rscratch2, L_objArray); 2237 2238 // if (!src->is_Array()) return -1; 2239 __ tbz(lh, 31, L_failed); // i.e. (lh >= 0) 2240 2241 // At this point, it is known to be a typeArray (array_tag 0x3). 
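// As a concrete illustration (assuming the usual layout helper encoding):
// for a jint[] source the header field gives
// arrayOopDesc::base_offset_in_bytes(T_INT) and log2_element_size is 2, so
// the int branch below computes src_addr = src + header_offset +
// (src_pos << 2), matching the TypeArrayKlass formulas further down.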
2242 #ifdef ASSERT 2243 { 2244 BLOCK_COMMENT("assert primitive array {"); 2245 Label L; 2246 __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift); 2247 __ cmpw(lh, rscratch2); 2248 __ br(Assembler::GE, L); 2249 __ stop("must be a primitive array"); 2250 __ bind(L); 2251 BLOCK_COMMENT("} assert primitive array done"); 2252 } 2253 #endif 2254 2255 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2256 rscratch2, L_failed); 2257 2258 // TypeArrayKlass 2259 // 2260 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize); 2261 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize); 2262 // 2263 2264 const Register rscratch1_offset = rscratch1; // array offset 2265 const Register r15_elsize = lh; // element size 2266 2267 __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift, 2268 exact_log2(Klass::_lh_header_size_mask+1)); // array_offset 2269 __ add(src, src, rscratch1_offset); // src array offset 2270 __ add(dst, dst, rscratch1_offset); // dst array offset 2271 BLOCK_COMMENT("choose copy loop based on element size"); 2272 2273 // next registers should be set before the jump to corresponding stub 2274 const Register from = c_rarg0; // source array address 2275 const Register to = c_rarg1; // destination array address 2276 const Register count = c_rarg2; // elements count 2277 2278 // 'from', 'to', 'count' registers should be set in such order 2279 // since they are the same as 'src', 'src_pos', 'dst'. 2280 2281 assert(Klass::_lh_log2_element_size_shift == 0, "fix this code"); 2282 2283 // The possible values of elsize are 0-3, i.e. exact_log2(element 2284 // size in bytes). We do a simple bitwise binary search. 2285 __ BIND(L_copy_bytes); 2286 __ tbnz(r15_elsize, 1, L_copy_ints); 2287 __ tbnz(r15_elsize, 0, L_copy_shorts); 2288 __ lea(from, Address(src, src_pos));// src_addr 2289 __ lea(to, Address(dst, dst_pos));// dst_addr 2290 __ movw(count, scratch_length); // length 2291 __ b(RuntimeAddress(byte_copy_entry)); 2292 2293 __ BIND(L_copy_shorts); 2294 __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr 2295 __ lea(to, Address(dst, dst_pos, Address::lsl(1)));// dst_addr 2296 __ movw(count, scratch_length); // length 2297 __ b(RuntimeAddress(short_copy_entry)); 2298 2299 __ BIND(L_copy_ints); 2300 __ tbnz(r15_elsize, 0, L_copy_longs); 2301 __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr 2302 __ lea(to, Address(dst, dst_pos, Address::lsl(2)));// dst_addr 2303 __ movw(count, scratch_length); // length 2304 __ b(RuntimeAddress(int_copy_entry)); 2305 2306 __ BIND(L_copy_longs); 2307 #ifdef ASSERT 2308 { 2309 BLOCK_COMMENT("assert long copy {"); 2310 Label L; 2311 __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize 2312 __ cmpw(r15_elsize, LogBytesPerLong); 2313 __ br(Assembler::EQ, L); 2314 __ stop("must be long copy, but elsize is wrong"); 2315 __ bind(L); 2316 BLOCK_COMMENT("} assert long copy done"); 2317 } 2318 #endif 2319 __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr 2320 __ lea(to, Address(dst, dst_pos, Address::lsl(3)));// dst_addr 2321 __ movw(count, scratch_length); // length 2322 __ b(RuntimeAddress(long_copy_entry)); 2323 2324 // ObjArrayKlass 2325 __ BIND(L_objArray); 2326 // live at this point: scratch_src_klass, scratch_length, src[_pos], dst[_pos] 2327 2328 Label L_plain_copy, L_checkcast_copy; 2329 // test array classes for subtyping 2330 __ load_klass(r15, dst); 2331 __ cmp(scratch_src_klass, r15); // usual case is exact 
equality 2332 __ br(Assembler::NE, L_checkcast_copy); 2333 2334 // Identically typed arrays can be copied without element-wise checks. 2335 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2336 rscratch2, L_failed); 2337 2338 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2339 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2340 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2341 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2342 __ movw(count, scratch_length); // length 2343 __ BIND(L_plain_copy); 2344 __ b(RuntimeAddress(oop_copy_entry)); 2345 2346 __ BIND(L_checkcast_copy); 2347 // live at this point: scratch_src_klass, scratch_length, r15 (dst_klass) 2348 { 2349 // Before looking at dst.length, make sure dst is also an objArray. 2350 __ ldrw(rscratch1, Address(r15, lh_offset)); 2351 __ movw(rscratch2, objArray_lh); 2352 __ eorw(rscratch1, rscratch1, rscratch2); 2353 __ cbnzw(rscratch1, L_failed); 2354 2355 // It is safe to examine both src.length and dst.length. 2356 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2357 r15, L_failed); 2358 2359 __ load_klass(dst_klass, dst); // reload 2360 2361 // Marshal the base address arguments now, freeing registers. 2362 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2363 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2364 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2365 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2366 __ movw(count, length); // length (reloaded) 2367 Register sco_temp = c_rarg3; // this register is free now 2368 assert_different_registers(from, to, count, sco_temp, 2369 dst_klass, scratch_src_klass); 2370 // assert_clean_int(count, sco_temp); 2371 2372 // Generate the type check. 2373 const int sco_offset = in_bytes(Klass::super_check_offset_offset()); 2374 __ ldrw(sco_temp, Address(dst_klass, sco_offset)); 2375 2376 // Smashes rscratch1, rscratch2 2377 generate_type_check(scratch_src_klass, sco_temp, dst_klass, L_plain_copy); 2378 2379 // Fetch destination element klass from the ObjArrayKlass header. 2380 int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset()); 2381 __ ldr(dst_klass, Address(dst_klass, ek_offset)); 2382 __ ldrw(sco_temp, Address(dst_klass, sco_offset)); 2383 2384 // the checkcast_copy loop needs two extra arguments: 2385 assert(c_rarg3 == sco_temp, "#3 already in place"); 2386 // Set up arguments for checkcast_copy_entry. 2387 __ mov(c_rarg4, dst_klass); // dst.klass.element_klass 2388 __ b(RuntimeAddress(checkcast_copy_entry)); 2389 } 2390 2391 __ BIND(L_failed); 2392 __ mov(r0, -1); 2393 __ leave(); // required for proper stackwalking of RuntimeStub frame 2394 __ ret(lr); 2395 2396 return start; 2397 } 2398 2399 // 2400 // Generate stub for array fill. If "aligned" is true, the 2401 // "to" address is assumed to be heapword aligned. 
2402 // 2403 // Arguments for generated stub: 2404 // to: c_rarg0 2405 // value: c_rarg1 2406 // count: c_rarg2 treated as signed 2407 // 2408 address generate_fill(BasicType t, bool aligned, const char *name) { 2409 __ align(CodeEntryAlignment); 2410 StubCodeMark mark(this, "StubRoutines", name); 2411 address start = __ pc(); 2412 2413 BLOCK_COMMENT("Entry:"); 2414 2415 const Register to = c_rarg0; // source array address 2416 const Register value = c_rarg1; // value 2417 const Register count = c_rarg2; // elements count 2418 2419 const Register bz_base = r10; // base for block_zero routine 2420 const Register cnt_words = r11; // temp register 2421 2422 __ enter(); 2423 2424 Label L_fill_elements, L_exit1; 2425 2426 int shift = -1; 2427 switch (t) { 2428 case T_BYTE: 2429 shift = 0; 2430 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2431 __ bfi(value, value, 8, 8); // 8 bit -> 16 bit 2432 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit 2433 __ br(Assembler::LO, L_fill_elements); 2434 break; 2435 case T_SHORT: 2436 shift = 1; 2437 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2438 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit 2439 __ br(Assembler::LO, L_fill_elements); 2440 break; 2441 case T_INT: 2442 shift = 2; 2443 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2444 __ br(Assembler::LO, L_fill_elements); 2445 break; 2446 default: ShouldNotReachHere(); 2447 } 2448 2449 // Align source address at 8 bytes address boundary. 2450 Label L_skip_align1, L_skip_align2, L_skip_align4; 2451 if (!aligned) { 2452 switch (t) { 2453 case T_BYTE: 2454 // One byte misalignment happens only for byte arrays. 2455 __ tbz(to, 0, L_skip_align1); 2456 __ strb(value, Address(__ post(to, 1))); 2457 __ subw(count, count, 1); 2458 __ bind(L_skip_align1); 2459 // Fallthrough 2460 case T_SHORT: 2461 // Two bytes misalignment happens only for byte and short (char) arrays. 2462 __ tbz(to, 1, L_skip_align2); 2463 __ strh(value, Address(__ post(to, 2))); 2464 __ subw(count, count, 2 >> shift); 2465 __ bind(L_skip_align2); 2466 // Fallthrough 2467 case T_INT: 2468 // Align to 8 bytes, we know we are 4 byte aligned to start. 2469 __ tbz(to, 2, L_skip_align4); 2470 __ strw(value, Address(__ post(to, 4))); 2471 __ subw(count, count, 4 >> shift); 2472 __ bind(L_skip_align4); 2473 break; 2474 default: ShouldNotReachHere(); 2475 } 2476 } 2477 2478 // 2479 // Fill large chunks 2480 // 2481 __ lsrw(cnt_words, count, 3 - shift); // number of words 2482 __ bfi(value, value, 32, 32); // 32 bit -> 64 bit 2483 __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift); 2484 if (UseBlockZeroing) { 2485 Label non_block_zeroing, rest; 2486 // If the fill value is zero we can use the fast zero_words(). 2487 __ cbnz(value, non_block_zeroing); 2488 __ mov(bz_base, to); 2489 __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord); 2490 address tpc = __ zero_words(bz_base, cnt_words); 2491 if (tpc == nullptr) { 2492 fatal("CodeCache is full at generate_fill"); 2493 } 2494 __ b(rest); 2495 __ bind(non_block_zeroing); 2496 __ fill_words(to, cnt_words, value); 2497 __ bind(rest); 2498 } else { 2499 __ fill_words(to, cnt_words, value); 2500 } 2501 2502 // Remaining count is less than 8 bytes. Fill it by a single store. 2503 // Note that the total length is no less than 8 bytes. 
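// For example (jshort fill, count == 11, destination already 8-byte
// aligned): the word fill above covers 8 elements and leaves count == 3;
// 'to' already points just past the filled words, the add below moves it to
// the end of the array, and the 8-byte store rewrites the last four shorts,
// harmlessly re-writing one element that already holds the fill value.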
2504 if (t == T_BYTE || t == T_SHORT) { 2505 Label L_exit1; 2506 __ cbzw(count, L_exit1); 2507 __ add(to, to, count, Assembler::LSL, shift); // points to the end 2508 __ str(value, Address(to, -8)); // overwrite some elements 2509 __ bind(L_exit1); 2510 __ leave(); 2511 __ ret(lr); 2512 } 2513 2514 // Handle copies less than 8 bytes. 2515 Label L_fill_2, L_fill_4, L_exit2; 2516 __ bind(L_fill_elements); 2517 switch (t) { 2518 case T_BYTE: 2519 __ tbz(count, 0, L_fill_2); 2520 __ strb(value, Address(__ post(to, 1))); 2521 __ bind(L_fill_2); 2522 __ tbz(count, 1, L_fill_4); 2523 __ strh(value, Address(__ post(to, 2))); 2524 __ bind(L_fill_4); 2525 __ tbz(count, 2, L_exit2); 2526 __ strw(value, Address(to)); 2527 break; 2528 case T_SHORT: 2529 __ tbz(count, 0, L_fill_4); 2530 __ strh(value, Address(__ post(to, 2))); 2531 __ bind(L_fill_4); 2532 __ tbz(count, 1, L_exit2); 2533 __ strw(value, Address(to)); 2534 break; 2535 case T_INT: 2536 __ cbzw(count, L_exit2); 2537 __ strw(value, Address(to)); 2538 break; 2539 default: ShouldNotReachHere(); 2540 } 2541 __ bind(L_exit2); 2542 __ leave(); 2543 __ ret(lr); 2544 return start; 2545 } 2546 2547 address generate_data_cache_writeback() { 2548 const Register line = c_rarg0; // address of line to write back 2549 2550 __ align(CodeEntryAlignment); 2551 2552 StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback"); 2553 2554 address start = __ pc(); 2555 __ enter(); 2556 __ cache_wb(Address(line, 0)); 2557 __ leave(); 2558 __ ret(lr); 2559 2560 return start; 2561 } 2562 2563 address generate_data_cache_writeback_sync() { 2564 const Register is_pre = c_rarg0; // pre or post sync 2565 2566 __ align(CodeEntryAlignment); 2567 2568 StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback_sync"); 2569 2570 // pre wbsync is a no-op 2571 // post wbsync translates to an sfence 2572 2573 Label skip; 2574 address start = __ pc(); 2575 __ enter(); 2576 __ cbnz(is_pre, skip); 2577 __ cache_wbsync(false); 2578 __ bind(skip); 2579 __ leave(); 2580 __ ret(lr); 2581 2582 return start; 2583 } 2584 2585 void generate_arraycopy_stubs() { 2586 address entry; 2587 address entry_jbyte_arraycopy; 2588 address entry_jshort_arraycopy; 2589 address entry_jint_arraycopy; 2590 address entry_oop_arraycopy; 2591 address entry_jlong_arraycopy; 2592 address entry_checkcast_arraycopy; 2593 2594 generate_copy_longs(IN_HEAP | IS_ARRAY, T_BYTE, copy_f, r0, r1, r15, copy_forwards); 2595 generate_copy_longs(IN_HEAP | IS_ARRAY, T_BYTE, copy_b, r0, r1, r15, copy_backwards); 2596 2597 generate_copy_longs(IN_HEAP | IS_ARRAY, T_OBJECT, copy_obj_f, r0, r1, r15, copy_forwards); 2598 generate_copy_longs(IN_HEAP | IS_ARRAY, T_OBJECT, copy_obj_b, r0, r1, r15, copy_backwards); 2599 2600 generate_copy_longs(IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, T_OBJECT, copy_obj_uninit_f, r0, r1, r15, copy_forwards); 2601 generate_copy_longs(IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, T_OBJECT, copy_obj_uninit_b, r0, r1, r15, copy_backwards); 2602 2603 StubRoutines::aarch64::_zero_blocks = generate_zero_blocks(); 2604 2605 //*** jbyte 2606 // Always need aligned and unaligned versions 2607 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(false, &entry, 2608 "jbyte_disjoint_arraycopy"); 2609 StubRoutines::_jbyte_arraycopy = generate_conjoint_byte_copy(false, entry, 2610 &entry_jbyte_arraycopy, 2611 "jbyte_arraycopy"); 2612 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry, 2613 "arrayof_jbyte_disjoint_arraycopy"); 2614 
StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_byte_copy(true, entry, nullptr, 2615 "arrayof_jbyte_arraycopy"); 2616 2617 //*** jshort 2618 // Always need aligned and unaligned versions 2619 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, &entry, 2620 "jshort_disjoint_arraycopy"); 2621 StubRoutines::_jshort_arraycopy = generate_conjoint_short_copy(false, entry, 2622 &entry_jshort_arraycopy, 2623 "jshort_arraycopy"); 2624 StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry, 2625 "arrayof_jshort_disjoint_arraycopy"); 2626 StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_short_copy(true, entry, nullptr, 2627 "arrayof_jshort_arraycopy"); 2628 2629 //*** jint 2630 // Aligned versions 2631 StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry, 2632 "arrayof_jint_disjoint_arraycopy"); 2633 StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy, 2634 "arrayof_jint_arraycopy"); 2635 // In 64 bit we need both aligned and unaligned versions of jint arraycopy. 2636 // entry_jint_arraycopy always points to the unaligned version 2637 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_int_copy(false, &entry, 2638 "jint_disjoint_arraycopy"); 2639 StubRoutines::_jint_arraycopy = generate_conjoint_int_copy(false, entry, 2640 &entry_jint_arraycopy, 2641 "jint_arraycopy"); 2642 2643 //*** jlong 2644 // It is always aligned 2645 StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry, 2646 "arrayof_jlong_disjoint_arraycopy"); 2647 StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy, 2648 "arrayof_jlong_arraycopy"); 2649 StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy; 2650 StubRoutines::_jlong_arraycopy = StubRoutines::_arrayof_jlong_arraycopy; 2651 2652 //*** oops 2653 { 2654 // With compressed oops we need unaligned versions; notice that 2655 // we overwrite entry_oop_arraycopy. 
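// (Passing aligned == false drops ARRAYCOPY_ALIGNED from the decorators and
// asks copy_memory not to assume 8-byte alignment; the same generated stubs
// are then installed for both the plain and the 'arrayof' oop entry points
// just below.)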
2656 bool aligned = !UseCompressedOops; 2657 2658 StubRoutines::_arrayof_oop_disjoint_arraycopy 2659 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy", 2660 /*dest_uninitialized*/false); 2661 StubRoutines::_arrayof_oop_arraycopy 2662 = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy", 2663 /*dest_uninitialized*/false); 2664 // Aligned versions without pre-barriers 2665 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit 2666 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit", 2667 /*dest_uninitialized*/true); 2668 StubRoutines::_arrayof_oop_arraycopy_uninit 2669 = generate_conjoint_oop_copy(aligned, entry, nullptr, "arrayof_oop_arraycopy_uninit", 2670 /*dest_uninitialized*/true); 2671 } 2672 2673 StubRoutines::_oop_disjoint_arraycopy = StubRoutines::_arrayof_oop_disjoint_arraycopy; 2674 StubRoutines::_oop_arraycopy = StubRoutines::_arrayof_oop_arraycopy; 2675 StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit; 2676 StubRoutines::_oop_arraycopy_uninit = StubRoutines::_arrayof_oop_arraycopy_uninit; 2677 2678 StubRoutines::_checkcast_arraycopy = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy); 2679 StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", nullptr, 2680 /*dest_uninitialized*/true); 2681 2682 StubRoutines::_unsafe_arraycopy = generate_unsafe_copy("unsafe_arraycopy", 2683 entry_jbyte_arraycopy, 2684 entry_jshort_arraycopy, 2685 entry_jint_arraycopy, 2686 entry_jlong_arraycopy); 2687 2688 StubRoutines::_generic_arraycopy = generate_generic_copy("generic_arraycopy", 2689 entry_jbyte_arraycopy, 2690 entry_jshort_arraycopy, 2691 entry_jint_arraycopy, 2692 entry_oop_arraycopy, 2693 entry_jlong_arraycopy, 2694 entry_checkcast_arraycopy); 2695 2696 StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill"); 2697 StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill"); 2698 StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill"); 2699 StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill"); 2700 StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill"); 2701 StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill"); 2702 } 2703 2704 void generate_math_stubs() { Unimplemented(); } 2705 2706 // Arguments: 2707 // 2708 // Inputs: 2709 // c_rarg0 - source byte array address 2710 // c_rarg1 - destination byte array address 2711 // c_rarg2 - K (key) in little endian int array 2712 // 2713 address generate_aescrypt_encryptBlock() { 2714 __ align(CodeEntryAlignment); 2715 StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock"); 2716 2717 const Register from = c_rarg0; // source array address 2718 const Register to = c_rarg1; // destination array address 2719 const Register key = c_rarg2; // key array address 2720 const Register keylen = rscratch1; 2721 2722 address start = __ pc(); 2723 __ enter(); 2724 2725 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2726 2727 __ aesenc_loadkeys(key, keylen); 2728 __ aesecb_encrypt(from, to, keylen); 2729 2730 __ mov(r0, 0); 2731 2732 __ leave(); 2733 __ ret(lr); 2734 2735 return start; 2736 } 2737 2738 // Arguments: 2739 // 2740 // Inputs: 2741 // c_rarg0 - source byte array address 2742 // c_rarg1 - destination byte array address 2743 // 
c_rarg2 - K (key) in little endian int array 2744 // 2745 address generate_aescrypt_decryptBlock() { 2746 assert(UseAES, "need AES cryptographic extension support"); 2747 __ align(CodeEntryAlignment); 2748 StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock"); 2749 Label L_doLast; 2750 2751 const Register from = c_rarg0; // source array address 2752 const Register to = c_rarg1; // destination array address 2753 const Register key = c_rarg2; // key array address 2754 const Register keylen = rscratch1; 2755 2756 address start = __ pc(); 2757 __ enter(); // required for proper stackwalking of RuntimeStub frame 2758 2759 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2760 2761 __ aesecb_decrypt(from, to, key, keylen); 2762 2763 __ mov(r0, 0); 2764 2765 __ leave(); 2766 __ ret(lr); 2767 2768 return start; 2769 } 2770 2771 // Arguments: 2772 // 2773 // Inputs: 2774 // c_rarg0 - source byte array address 2775 // c_rarg1 - destination byte array address 2776 // c_rarg2 - K (key) in little endian int array 2777 // c_rarg3 - r vector byte array address 2778 // c_rarg4 - input length 2779 // 2780 // Output: 2781 // x0 - input length 2782 // 2783 address generate_cipherBlockChaining_encryptAESCrypt() { 2784 assert(UseAES, "need AES cryptographic extension support"); 2785 __ align(CodeEntryAlignment); 2786 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt"); 2787 2788 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 2789 2790 const Register from = c_rarg0; // source array address 2791 const Register to = c_rarg1; // destination array address 2792 const Register key = c_rarg2; // key array address 2793 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2794 // and left with the results of the last encryption block 2795 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2796 const Register keylen = rscratch1; 2797 2798 address start = __ pc(); 2799 2800 __ enter(); 2801 2802 __ movw(rscratch2, len_reg); 2803 2804 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2805 2806 __ ld1(v0, __ T16B, rvec); 2807 2808 __ cmpw(keylen, 52); 2809 __ br(Assembler::CC, L_loadkeys_44); 2810 __ br(Assembler::EQ, L_loadkeys_52); 2811 2812 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2813 __ rev32(v17, __ T16B, v17); 2814 __ rev32(v18, __ T16B, v18); 2815 __ BIND(L_loadkeys_52); 2816 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2817 __ rev32(v19, __ T16B, v19); 2818 __ rev32(v20, __ T16B, v20); 2819 __ BIND(L_loadkeys_44); 2820 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2821 __ rev32(v21, __ T16B, v21); 2822 __ rev32(v22, __ T16B, v22); 2823 __ rev32(v23, __ T16B, v23); 2824 __ rev32(v24, __ T16B, v24); 2825 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2826 __ rev32(v25, __ T16B, v25); 2827 __ rev32(v26, __ T16B, v26); 2828 __ rev32(v27, __ T16B, v27); 2829 __ rev32(v28, __ T16B, v28); 2830 __ ld1(v29, v30, v31, __ T16B, key); 2831 __ rev32(v29, __ T16B, v29); 2832 __ rev32(v30, __ T16B, v30); 2833 __ rev32(v31, __ T16B, v31); 2834 2835 __ BIND(L_aes_loop); 2836 __ ld1(v1, __ T16B, __ post(from, 16)); 2837 __ eor(v0, __ T16B, v0, v1); 2838 2839 __ br(Assembler::CC, L_rounds_44); 2840 __ br(Assembler::EQ, L_rounds_52); 2841 2842 __ aese(v0, v17); __ aesmc(v0, v0); 2843 __ aese(v0, v18); __ aesmc(v0, v0); 2844 __ BIND(L_rounds_52); 2845 __ 
aese(v0, v19); __ aesmc(v0, v0); 2846 __ aese(v0, v20); __ aesmc(v0, v0); 2847 __ BIND(L_rounds_44); 2848 __ aese(v0, v21); __ aesmc(v0, v0); 2849 __ aese(v0, v22); __ aesmc(v0, v0); 2850 __ aese(v0, v23); __ aesmc(v0, v0); 2851 __ aese(v0, v24); __ aesmc(v0, v0); 2852 __ aese(v0, v25); __ aesmc(v0, v0); 2853 __ aese(v0, v26); __ aesmc(v0, v0); 2854 __ aese(v0, v27); __ aesmc(v0, v0); 2855 __ aese(v0, v28); __ aesmc(v0, v0); 2856 __ aese(v0, v29); __ aesmc(v0, v0); 2857 __ aese(v0, v30); 2858 __ eor(v0, __ T16B, v0, v31); 2859 2860 __ st1(v0, __ T16B, __ post(to, 16)); 2861 2862 __ subw(len_reg, len_reg, 16); 2863 __ cbnzw(len_reg, L_aes_loop); 2864 2865 __ st1(v0, __ T16B, rvec); 2866 2867 __ mov(r0, rscratch2); 2868 2869 __ leave(); 2870 __ ret(lr); 2871 2872 return start; 2873 } 2874 2875 // Arguments: 2876 // 2877 // Inputs: 2878 // c_rarg0 - source byte array address 2879 // c_rarg1 - destination byte array address 2880 // c_rarg2 - K (key) in little endian int array 2881 // c_rarg3 - r vector byte array address 2882 // c_rarg4 - input length 2883 // 2884 // Output: 2885 // r0 - input length 2886 // 2887 address generate_cipherBlockChaining_decryptAESCrypt() { 2888 assert(UseAES, "need AES cryptographic extension support"); 2889 __ align(CodeEntryAlignment); 2890 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt"); 2891 2892 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 2893 2894 const Register from = c_rarg0; // source array address 2895 const Register to = c_rarg1; // destination array address 2896 const Register key = c_rarg2; // key array address 2897 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2898 // and left with the results of the last encryption block 2899 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2900 const Register keylen = rscratch1; 2901 2902 address start = __ pc(); 2903 2904 __ enter(); 2905 2906 __ movw(rscratch2, len_reg); 2907 2908 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2909 2910 __ ld1(v2, __ T16B, rvec); 2911 2912 __ ld1(v31, __ T16B, __ post(key, 16)); 2913 __ rev32(v31, __ T16B, v31); 2914 2915 __ cmpw(keylen, 52); 2916 __ br(Assembler::CC, L_loadkeys_44); 2917 __ br(Assembler::EQ, L_loadkeys_52); 2918 2919 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2920 __ rev32(v17, __ T16B, v17); 2921 __ rev32(v18, __ T16B, v18); 2922 __ BIND(L_loadkeys_52); 2923 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2924 __ rev32(v19, __ T16B, v19); 2925 __ rev32(v20, __ T16B, v20); 2926 __ BIND(L_loadkeys_44); 2927 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2928 __ rev32(v21, __ T16B, v21); 2929 __ rev32(v22, __ T16B, v22); 2930 __ rev32(v23, __ T16B, v23); 2931 __ rev32(v24, __ T16B, v24); 2932 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2933 __ rev32(v25, __ T16B, v25); 2934 __ rev32(v26, __ T16B, v26); 2935 __ rev32(v27, __ T16B, v27); 2936 __ rev32(v28, __ T16B, v28); 2937 __ ld1(v29, v30, __ T16B, key); 2938 __ rev32(v29, __ T16B, v29); 2939 __ rev32(v30, __ T16B, v30); 2940 2941 __ BIND(L_aes_loop); 2942 __ ld1(v0, __ T16B, __ post(from, 16)); 2943 __ orr(v1, __ T16B, v0, v0); 2944 2945 __ br(Assembler::CC, L_rounds_44); 2946 __ br(Assembler::EQ, L_rounds_52); 2947 2948 __ aesd(v0, v17); __ aesimc(v0, v0); 2949 __ aesd(v0, v18); __ aesimc(v0, v0); 2950 __ BIND(L_rounds_52); 2951 __ aesd(v0, v19); __ aesimc(v0, v0); 2952 __ aesd(v0, v20); __ 
aesimc(v0, v0); 2953 __ BIND(L_rounds_44); 2954 __ aesd(v0, v21); __ aesimc(v0, v0); 2955 __ aesd(v0, v22); __ aesimc(v0, v0); 2956 __ aesd(v0, v23); __ aesimc(v0, v0); 2957 __ aesd(v0, v24); __ aesimc(v0, v0); 2958 __ aesd(v0, v25); __ aesimc(v0, v0); 2959 __ aesd(v0, v26); __ aesimc(v0, v0); 2960 __ aesd(v0, v27); __ aesimc(v0, v0); 2961 __ aesd(v0, v28); __ aesimc(v0, v0); 2962 __ aesd(v0, v29); __ aesimc(v0, v0); 2963 __ aesd(v0, v30); 2964 __ eor(v0, __ T16B, v0, v31); 2965 __ eor(v0, __ T16B, v0, v2); 2966 2967 __ st1(v0, __ T16B, __ post(to, 16)); 2968 __ orr(v2, __ T16B, v1, v1); 2969 2970 __ subw(len_reg, len_reg, 16); 2971 __ cbnzw(len_reg, L_aes_loop); 2972 2973 __ st1(v2, __ T16B, rvec); 2974 2975 __ mov(r0, rscratch2); 2976 2977 __ leave(); 2978 __ ret(lr); 2979 2980 return start; 2981 } 2982 2983 // Big-endian 128-bit + 64-bit -> 128-bit addition. 2984 // Inputs: 128-bits. in is preserved. 2985 // The least-significant 64-bit word is in the upper dword of each vector. 2986 // inc (the 64-bit increment) is preserved. Its lower dword must be zero. 2987 // Output: result 2988 void be_add_128_64(FloatRegister result, FloatRegister in, 2989 FloatRegister inc, FloatRegister tmp) { 2990 assert_different_registers(result, tmp, inc); 2991 2992 __ addv(result, __ T2D, in, inc); // Add inc to the least-significant dword of 2993 // input 2994 __ cm(__ HI, tmp, __ T2D, inc, result);// Check for result overflowing 2995 __ ext(tmp, __ T16B, tmp, tmp, 0x08); // Swap LSD of comparison result to MSD and 2996 // MSD == 0 (must be!) to LSD 2997 __ subv(result, __ T2D, result, tmp); // Subtract -1 from MSD if there was an overflow 2998 } 2999 3000 // CTR AES crypt. 3001 // Arguments: 3002 // 3003 // Inputs: 3004 // c_rarg0 - source byte array address 3005 // c_rarg1 - destination byte array address 3006 // c_rarg2 - K (key) in little endian int array 3007 // c_rarg3 - counter vector byte array address 3008 // c_rarg4 - input length 3009 // c_rarg5 - saved encryptedCounter start 3010 // c_rarg6 - saved used length 3011 // 3012 // Output: 3013 // r0 - input length 3014 // 3015 address generate_counterMode_AESCrypt() { 3016 const Register in = c_rarg0; 3017 const Register out = c_rarg1; 3018 const Register key = c_rarg2; 3019 const Register counter = c_rarg3; 3020 const Register saved_len = c_rarg4, len = r10; 3021 const Register saved_encrypted_ctr = c_rarg5; 3022 const Register used_ptr = c_rarg6, used = r12; 3023 3024 const Register offset = r7; 3025 const Register keylen = r11; 3026 3027 const unsigned char block_size = 16; 3028 const int bulk_width = 4; 3029 // NB: bulk_width can be 4 or 8. 8 gives slightly faster 3030 // performance with larger data sizes, but it also means that the 3031 // fast path isn't used until you have at least 8 blocks, and up 3032 // to 127 bytes of data will be executed on the slow path. For 3033 // that reason, and also so as not to blow away too much icache, 4 3034 // blocks seems like a sensible compromise. 
3035 3036 // Algorithm: 3037 // 3038 // if (len == 0) { 3039 // goto DONE; 3040 // } 3041 // int result = len; 3042 // do { 3043 // if (used >= blockSize) { 3044 // if (len >= bulk_width * blockSize) { 3045 // CTR_large_block(); 3046 // if (len == 0) 3047 // goto DONE; 3048 // } 3049 // for (;;) { 3050 // 16ByteVector v0 = counter; 3051 // embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0); 3052 // used = 0; 3053 // if (len < blockSize) 3054 // break; /* goto NEXT */ 3055 // 16ByteVector v1 = load16Bytes(in, offset); 3056 // v1 = v1 ^ encryptedCounter; 3057 // store16Bytes(out, offset); 3058 // used = blockSize; 3059 // offset += blockSize; 3060 // len -= blockSize; 3061 // if (len == 0) 3062 // goto DONE; 3063 // } 3064 // } 3065 // NEXT: 3066 // out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]); 3067 // len--; 3068 // } while (len != 0); 3069 // DONE: 3070 // return result; 3071 // 3072 // CTR_large_block() 3073 // Wide bulk encryption of whole blocks. 3074 3075 __ align(CodeEntryAlignment); 3076 StubCodeMark mark(this, "StubRoutines", "counterMode_AESCrypt"); 3077 const address start = __ pc(); 3078 __ enter(); 3079 3080 Label DONE, CTR_large_block, large_block_return; 3081 __ ldrw(used, Address(used_ptr)); 3082 __ cbzw(saved_len, DONE); 3083 3084 __ mov(len, saved_len); 3085 __ mov(offset, 0); 3086 3087 // Compute #rounds for AES based on the length of the key array 3088 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 3089 3090 __ aesenc_loadkeys(key, keylen); 3091 3092 { 3093 Label L_CTR_loop, NEXT; 3094 3095 __ bind(L_CTR_loop); 3096 3097 __ cmp(used, block_size); 3098 __ br(__ LO, NEXT); 3099 3100 // Maybe we have a lot of data 3101 __ subsw(rscratch1, len, bulk_width * block_size); 3102 __ br(__ HS, CTR_large_block); 3103 __ BIND(large_block_return); 3104 __ cbzw(len, DONE); 3105 3106 // Setup the counter 3107 __ movi(v4, __ T4S, 0); 3108 __ movi(v5, __ T4S, 1); 3109 __ ins(v4, __ S, v5, 2, 2); // v4 contains { 0, 1 } 3110 3111 // 128-bit big-endian increment 3112 __ ld1(v0, __ T16B, counter); 3113 __ rev64(v16, __ T16B, v0); 3114 be_add_128_64(v16, v16, v4, /*tmp*/v5); 3115 __ rev64(v16, __ T16B, v16); 3116 __ st1(v16, __ T16B, counter); 3117 // Previous counter value is in v0 3118 // v4 contains { 0, 1 } 3119 3120 { 3121 // We have fewer than bulk_width blocks of data left. Encrypt 3122 // them one by one until there is less than a full block 3123 // remaining, being careful to save both the encrypted counter 3124 // and the counter. 3125 3126 Label inner_loop; 3127 __ bind(inner_loop); 3128 // Counter to encrypt is in v0 3129 __ aesecb_encrypt(noreg, noreg, keylen); 3130 __ st1(v0, __ T16B, saved_encrypted_ctr); 3131 3132 // Do we have a remaining full block? 3133 3134 __ mov(used, 0); 3135 __ cmp(len, block_size); 3136 __ br(__ LO, NEXT); 3137 3138 // Yes, we have a full block 3139 __ ldrq(v1, Address(in, offset)); 3140 __ eor(v1, __ T16B, v1, v0); 3141 __ strq(v1, Address(out, offset)); 3142 __ mov(used, block_size); 3143 __ add(offset, offset, block_size); 3144 3145 __ subw(len, len, block_size); 3146 __ cbzw(len, DONE); 3147 3148 // Increment the counter, store it back 3149 __ orr(v0, __ T16B, v16, v16); 3150 __ rev64(v16, __ T16B, v16); 3151 be_add_128_64(v16, v16, v4, /*tmp*/v5); 3152 __ rev64(v16, __ T16B, v16); 3153 __ st1(v16, __ T16B, counter); // Save the incremented counter back 3154 3155 __ b(inner_loop); 3156 } 3157 3158 __ BIND(NEXT); 3159 3160 // Encrypt a single byte, and loop. 
3161 // We expect this to be a rare event. 3162 __ ldrb(rscratch1, Address(in, offset)); 3163 __ ldrb(rscratch2, Address(saved_encrypted_ctr, used)); 3164 __ eor(rscratch1, rscratch1, rscratch2); 3165 __ strb(rscratch1, Address(out, offset)); 3166 __ add(offset, offset, 1); 3167 __ add(used, used, 1); 3168 __ subw(len, len,1); 3169 __ cbnzw(len, L_CTR_loop); 3170 } 3171 3172 __ bind(DONE); 3173 __ strw(used, Address(used_ptr)); 3174 __ mov(r0, saved_len); 3175 3176 __ leave(); // required for proper stackwalking of RuntimeStub frame 3177 __ ret(lr); 3178 3179 // Bulk encryption 3180 3181 __ BIND (CTR_large_block); 3182 assert(bulk_width == 4 || bulk_width == 8, "must be"); 3183 3184 if (bulk_width == 8) { 3185 __ sub(sp, sp, 4 * 16); 3186 __ st1(v12, v13, v14, v15, __ T16B, Address(sp)); 3187 } 3188 __ sub(sp, sp, 4 * 16); 3189 __ st1(v8, v9, v10, v11, __ T16B, Address(sp)); 3190 RegSet saved_regs = (RegSet::of(in, out, offset) 3191 + RegSet::of(saved_encrypted_ctr, used_ptr, len)); 3192 __ push(saved_regs, sp); 3193 __ andr(len, len, -16 * bulk_width); // 8/4 encryptions, 16 bytes per encryption 3194 __ add(in, in, offset); 3195 __ add(out, out, offset); 3196 3197 // Keys should already be loaded into the correct registers 3198 3199 __ ld1(v0, __ T16B, counter); // v0 contains the first counter 3200 __ rev64(v16, __ T16B, v0); // v16 contains byte-reversed counter 3201 3202 // AES/CTR loop 3203 { 3204 Label L_CTR_loop; 3205 __ BIND(L_CTR_loop); 3206 3207 // Setup the counters 3208 __ movi(v8, __ T4S, 0); 3209 __ movi(v9, __ T4S, 1); 3210 __ ins(v8, __ S, v9, 2, 2); // v8 contains { 0, 1 } 3211 3212 for (int i = 0; i < bulk_width; i++) { 3213 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i); 3214 __ rev64(v0_ofs, __ T16B, v16); 3215 be_add_128_64(v16, v16, v8, /*tmp*/v9); 3216 } 3217 3218 __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16)); 3219 3220 // Encrypt the counters 3221 __ aesecb_encrypt(noreg, noreg, keylen, v0, bulk_width); 3222 3223 if (bulk_width == 8) { 3224 __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16)); 3225 } 3226 3227 // XOR the encrypted counters with the inputs 3228 for (int i = 0; i < bulk_width; i++) { 3229 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i); 3230 FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i); 3231 __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs); 3232 } 3233 3234 // Write the encrypted data 3235 __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16)); 3236 if (bulk_width == 8) { 3237 __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16)); 3238 } 3239 3240 __ subw(len, len, 16 * bulk_width); 3241 __ cbnzw(len, L_CTR_loop); 3242 } 3243 3244 // Save the counter back where it goes 3245 __ rev64(v16, __ T16B, v16); 3246 __ st1(v16, __ T16B, counter); 3247 3248 __ pop(saved_regs, sp); 3249 3250 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16)); 3251 if (bulk_width == 8) { 3252 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16)); 3253 } 3254 3255 __ andr(rscratch1, len, -16 * bulk_width); 3256 __ sub(len, len, rscratch1); 3257 __ add(offset, offset, rscratch1); 3258 __ mov(used, 16); 3259 __ strw(used, Address(used_ptr)); 3260 __ b(large_block_return); 3261 3262 return start; 3263 } 3264 3265 // Vector AES Galois Counter Mode implementation. 
Parameters: 3266 // 3267 // in = c_rarg0 3268 // len = c_rarg1 3269 // ct = c_rarg2 - ciphertext that ghash will read (in for encrypt, out for decrypt) 3270 // out = c_rarg3 3271 // key = c_rarg4 3272 // state = c_rarg5 - GHASH.state 3273 // subkeyHtbl = c_rarg6 - powers of H 3274 // counter = c_rarg7 - 16 bytes of CTR 3275 // return - number of processed bytes 3276 address generate_galoisCounterMode_AESCrypt() { 3277 address ghash_polynomial = __ pc(); 3278 __ emit_int64(0x87); // The low-order bits of the field 3279 // polynomial (i.e. p = z^7+z^2+z+1) 3280 // repeated in the low and high parts of a 3281 // 128-bit vector 3282 __ emit_int64(0x87); 3283 3284 __ align(CodeEntryAlignment); 3285 StubCodeMark mark(this, "StubRoutines", "galoisCounterMode_AESCrypt"); 3286 address start = __ pc(); 3287 __ enter(); 3288 3289 const Register in = c_rarg0; 3290 const Register len = c_rarg1; 3291 const Register ct = c_rarg2; 3292 const Register out = c_rarg3; 3293 // and updated with the incremented counter in the end 3294 3295 const Register key = c_rarg4; 3296 const Register state = c_rarg5; 3297 3298 const Register subkeyHtbl = c_rarg6; 3299 3300 const Register counter = c_rarg7; 3301 3302 const Register keylen = r10; 3303 // Save state before entering routine 3304 __ sub(sp, sp, 4 * 16); 3305 __ st1(v12, v13, v14, v15, __ T16B, Address(sp)); 3306 __ sub(sp, sp, 4 * 16); 3307 __ st1(v8, v9, v10, v11, __ T16B, Address(sp)); 3308 3309 // __ andr(len, len, -512); 3310 __ andr(len, len, -16 * 8); // 8 encryptions, 16 bytes per encryption 3311 __ str(len, __ pre(sp, -2 * wordSize)); 3312 3313 Label DONE; 3314 __ cbz(len, DONE); 3315 3316 // Compute #rounds for AES based on the length of the key array 3317 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 3318 3319 __ aesenc_loadkeys(key, keylen); 3320 __ ld1(v0, __ T16B, counter); // v0 contains the first counter 3321 __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter 3322 3323 // AES/CTR loop 3324 { 3325 Label L_CTR_loop; 3326 __ BIND(L_CTR_loop); 3327 3328 // Setup the counters 3329 __ movi(v8, __ T4S, 0); 3330 __ movi(v9, __ T4S, 1); 3331 __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 } 3332 3333 assert(v0->encoding() < v8->encoding(), ""); 3334 for (int i = v0->encoding(); i < v8->encoding(); i++) { 3335 FloatRegister f = as_FloatRegister(i); 3336 __ rev32(f, __ T16B, v16); 3337 __ addv(v16, __ T4S, v16, v8); 3338 } 3339 3340 __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16)); 3341 3342 // Encrypt the counters 3343 __ aesecb_encrypt(noreg, noreg, keylen, v0, /*unrolls*/8); 3344 3345 __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16)); 3346 3347 // XOR the encrypted counters with the inputs 3348 for (int i = 0; i < 8; i++) { 3349 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i); 3350 FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i); 3351 __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs); 3352 } 3353 __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16)); 3354 __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16)); 3355 3356 __ subw(len, len, 16 * 8); 3357 __ cbnzw(len, L_CTR_loop); 3358 } 3359 3360 __ rev32(v16, __ T16B, v16); 3361 __ st1(v16, __ T16B, counter); 3362 3363 __ ldr(len, Address(sp)); 3364 __ lsr(len, len, exact_log2(16)); // We want the count of blocks 3365 3366 // GHASH/CTR loop 3367 __ ghash_processBlocks_wide(ghash_polynomial, state, subkeyHtbl, ct, 3368 len, /*unrolls*/4); 3369 3370 #ifdef ASSERT 3371 { Label L; 3372 __ 
cmp(len, (unsigned char)0); 3373 __ br(Assembler::EQ, L); 3374 __ stop("stubGenerator: abort"); 3375 __ bind(L); 3376 } 3377 #endif 3378 3379 __ bind(DONE); 3380 // Return the number of bytes processed 3381 __ ldr(r0, __ post(sp, 2 * wordSize)); 3382 3383 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16)); 3384 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16)); 3385 3386 __ leave(); // required for proper stackwalking of RuntimeStub frame 3387 __ ret(lr); 3388 return start; 3389 } 3390 3391 class Cached64Bytes { 3392 private: 3393 MacroAssembler *_masm; 3394 Register _regs[8]; 3395 3396 public: 3397 Cached64Bytes(MacroAssembler *masm, RegSet rs): _masm(masm) { 3398 assert(rs.size() == 8, "%u registers are used to cache 16 4-byte data", rs.size()); 3399 auto it = rs.begin(); 3400 for (auto &r: _regs) { 3401 r = *it; 3402 ++it; 3403 } 3404 } 3405 3406 void gen_loads(Register base) { 3407 for (int i = 0; i < 8; i += 2) { 3408 __ ldp(_regs[i], _regs[i + 1], Address(base, 8 * i)); 3409 } 3410 } 3411 3412 // Generate code extracting i-th unsigned word (4 bytes) from cached 64 bytes. 3413 void extract_u32(Register dest, int i) { 3414 __ ubfx(dest, _regs[i / 2], 32 * (i % 2), 32); 3415 } 3416 }; 3417 3418 // Utility routines for md5. 3419 // Clobbers r10 and r11. 3420 void md5_FF(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4, 3421 int k, int s, int t) { 3422 Register rscratch3 = r10; 3423 Register rscratch4 = r11; 3424 3425 __ eorw(rscratch3, r3, r4); 3426 __ movw(rscratch2, t); 3427 __ andw(rscratch3, rscratch3, r2); 3428 __ addw(rscratch4, r1, rscratch2); 3429 reg_cache.extract_u32(rscratch1, k); 3430 __ eorw(rscratch3, rscratch3, r4); 3431 __ addw(rscratch4, rscratch4, rscratch1); 3432 __ addw(rscratch3, rscratch3, rscratch4); 3433 __ rorw(rscratch2, rscratch3, 32 - s); 3434 __ addw(r1, rscratch2, r2); 3435 } 3436 3437 void md5_GG(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4, 3438 int k, int s, int t) { 3439 Register rscratch3 = r10; 3440 Register rscratch4 = r11; 3441 3442 reg_cache.extract_u32(rscratch1, k); 3443 __ movw(rscratch2, t); 3444 __ addw(rscratch4, r1, rscratch2); 3445 __ addw(rscratch4, rscratch4, rscratch1); 3446 __ bicw(rscratch2, r3, r4); 3447 __ andw(rscratch3, r2, r4); 3448 __ addw(rscratch2, rscratch2, rscratch4); 3449 __ addw(rscratch2, rscratch2, rscratch3); 3450 __ rorw(rscratch2, rscratch2, 32 - s); 3451 __ addw(r1, rscratch2, r2); 3452 } 3453 3454 void md5_HH(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4, 3455 int k, int s, int t) { 3456 Register rscratch3 = r10; 3457 Register rscratch4 = r11; 3458 3459 __ eorw(rscratch3, r3, r4); 3460 __ movw(rscratch2, t); 3461 __ addw(rscratch4, r1, rscratch2); 3462 reg_cache.extract_u32(rscratch1, k); 3463 __ eorw(rscratch3, rscratch3, r2); 3464 __ addw(rscratch4, rscratch4, rscratch1); 3465 __ addw(rscratch3, rscratch3, rscratch4); 3466 __ rorw(rscratch2, rscratch3, 32 - s); 3467 __ addw(r1, rscratch2, r2); 3468 } 3469 3470 void md5_II(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4, 3471 int k, int s, int t) { 3472 Register rscratch3 = r10; 3473 Register rscratch4 = r11; 3474 3475 __ movw(rscratch3, t); 3476 __ ornw(rscratch2, r2, r4); 3477 __ addw(rscratch4, r1, rscratch3); 3478 reg_cache.extract_u32(rscratch1, k); 3479 __ eorw(rscratch3, rscratch2, r3); 3480 __ addw(rscratch4, rscratch4, rscratch1); 3481 __ addw(rscratch3, rscratch3, rscratch4); 3482 __ rorw(rscratch2, rscratch3, 32 - s); 3483 __ 
addw(r1, rscratch2, r2); 3484 } 3485 3486 // Arguments: 3487 // 3488 // Inputs: 3489 // c_rarg0 - byte[] source+offset 3490 // c_rarg1 - int[] SHA.state 3491 // c_rarg2 - int offset 3492 // c_rarg3 - int limit 3493 // 3494 address generate_md5_implCompress(bool multi_block, const char *name) { 3495 __ align(CodeEntryAlignment); 3496 StubCodeMark mark(this, "StubRoutines", name); 3497 address start = __ pc(); 3498 3499 Register buf = c_rarg0; 3500 Register state = c_rarg1; 3501 Register ofs = c_rarg2; 3502 Register limit = c_rarg3; 3503 Register a = r4; 3504 Register b = r5; 3505 Register c = r6; 3506 Register d = r7; 3507 Register rscratch3 = r10; 3508 Register rscratch4 = r11; 3509 3510 Register state_regs[2] = { r12, r13 }; 3511 RegSet saved_regs = RegSet::range(r16, r22) - r18_tls; 3512 Cached64Bytes reg_cache(_masm, RegSet::of(r14, r15) + saved_regs); // using 8 registers 3513 3514 __ push(saved_regs, sp); 3515 3516 __ ldp(state_regs[0], state_regs[1], Address(state)); 3517 __ ubfx(a, state_regs[0], 0, 32); 3518 __ ubfx(b, state_regs[0], 32, 32); 3519 __ ubfx(c, state_regs[1], 0, 32); 3520 __ ubfx(d, state_regs[1], 32, 32); 3521 3522 Label md5_loop; 3523 __ BIND(md5_loop); 3524 3525 reg_cache.gen_loads(buf); 3526 3527 // Round 1 3528 md5_FF(reg_cache, a, b, c, d, 0, 7, 0xd76aa478); 3529 md5_FF(reg_cache, d, a, b, c, 1, 12, 0xe8c7b756); 3530 md5_FF(reg_cache, c, d, a, b, 2, 17, 0x242070db); 3531 md5_FF(reg_cache, b, c, d, a, 3, 22, 0xc1bdceee); 3532 md5_FF(reg_cache, a, b, c, d, 4, 7, 0xf57c0faf); 3533 md5_FF(reg_cache, d, a, b, c, 5, 12, 0x4787c62a); 3534 md5_FF(reg_cache, c, d, a, b, 6, 17, 0xa8304613); 3535 md5_FF(reg_cache, b, c, d, a, 7, 22, 0xfd469501); 3536 md5_FF(reg_cache, a, b, c, d, 8, 7, 0x698098d8); 3537 md5_FF(reg_cache, d, a, b, c, 9, 12, 0x8b44f7af); 3538 md5_FF(reg_cache, c, d, a, b, 10, 17, 0xffff5bb1); 3539 md5_FF(reg_cache, b, c, d, a, 11, 22, 0x895cd7be); 3540 md5_FF(reg_cache, a, b, c, d, 12, 7, 0x6b901122); 3541 md5_FF(reg_cache, d, a, b, c, 13, 12, 0xfd987193); 3542 md5_FF(reg_cache, c, d, a, b, 14, 17, 0xa679438e); 3543 md5_FF(reg_cache, b, c, d, a, 15, 22, 0x49b40821); 3544 3545 // Round 2 3546 md5_GG(reg_cache, a, b, c, d, 1, 5, 0xf61e2562); 3547 md5_GG(reg_cache, d, a, b, c, 6, 9, 0xc040b340); 3548 md5_GG(reg_cache, c, d, a, b, 11, 14, 0x265e5a51); 3549 md5_GG(reg_cache, b, c, d, a, 0, 20, 0xe9b6c7aa); 3550 md5_GG(reg_cache, a, b, c, d, 5, 5, 0xd62f105d); 3551 md5_GG(reg_cache, d, a, b, c, 10, 9, 0x02441453); 3552 md5_GG(reg_cache, c, d, a, b, 15, 14, 0xd8a1e681); 3553 md5_GG(reg_cache, b, c, d, a, 4, 20, 0xe7d3fbc8); 3554 md5_GG(reg_cache, a, b, c, d, 9, 5, 0x21e1cde6); 3555 md5_GG(reg_cache, d, a, b, c, 14, 9, 0xc33707d6); 3556 md5_GG(reg_cache, c, d, a, b, 3, 14, 0xf4d50d87); 3557 md5_GG(reg_cache, b, c, d, a, 8, 20, 0x455a14ed); 3558 md5_GG(reg_cache, a, b, c, d, 13, 5, 0xa9e3e905); 3559 md5_GG(reg_cache, d, a, b, c, 2, 9, 0xfcefa3f8); 3560 md5_GG(reg_cache, c, d, a, b, 7, 14, 0x676f02d9); 3561 md5_GG(reg_cache, b, c, d, a, 12, 20, 0x8d2a4c8a); 3562 3563 // Round 3 3564 md5_HH(reg_cache, a, b, c, d, 5, 4, 0xfffa3942); 3565 md5_HH(reg_cache, d, a, b, c, 8, 11, 0x8771f681); 3566 md5_HH(reg_cache, c, d, a, b, 11, 16, 0x6d9d6122); 3567 md5_HH(reg_cache, b, c, d, a, 14, 23, 0xfde5380c); 3568 md5_HH(reg_cache, a, b, c, d, 1, 4, 0xa4beea44); 3569 md5_HH(reg_cache, d, a, b, c, 4, 11, 0x4bdecfa9); 3570 md5_HH(reg_cache, c, d, a, b, 7, 16, 0xf6bb4b60); 3571 md5_HH(reg_cache, b, c, d, a, 10, 23, 0xbebfbc70); 3572 md5_HH(reg_cache, a, b, c, d, 13, 4, 0x289b7ec6); 
3573 md5_HH(reg_cache, d, a, b, c, 0, 11, 0xeaa127fa); 3574 md5_HH(reg_cache, c, d, a, b, 3, 16, 0xd4ef3085); 3575 md5_HH(reg_cache, b, c, d, a, 6, 23, 0x04881d05); 3576 md5_HH(reg_cache, a, b, c, d, 9, 4, 0xd9d4d039); 3577 md5_HH(reg_cache, d, a, b, c, 12, 11, 0xe6db99e5); 3578 md5_HH(reg_cache, c, d, a, b, 15, 16, 0x1fa27cf8); 3579 md5_HH(reg_cache, b, c, d, a, 2, 23, 0xc4ac5665); 3580 3581 // Round 4 3582 md5_II(reg_cache, a, b, c, d, 0, 6, 0xf4292244); 3583 md5_II(reg_cache, d, a, b, c, 7, 10, 0x432aff97); 3584 md5_II(reg_cache, c, d, a, b, 14, 15, 0xab9423a7); 3585 md5_II(reg_cache, b, c, d, a, 5, 21, 0xfc93a039); 3586 md5_II(reg_cache, a, b, c, d, 12, 6, 0x655b59c3); 3587 md5_II(reg_cache, d, a, b, c, 3, 10, 0x8f0ccc92); 3588 md5_II(reg_cache, c, d, a, b, 10, 15, 0xffeff47d); 3589 md5_II(reg_cache, b, c, d, a, 1, 21, 0x85845dd1); 3590 md5_II(reg_cache, a, b, c, d, 8, 6, 0x6fa87e4f); 3591 md5_II(reg_cache, d, a, b, c, 15, 10, 0xfe2ce6e0); 3592 md5_II(reg_cache, c, d, a, b, 6, 15, 0xa3014314); 3593 md5_II(reg_cache, b, c, d, a, 13, 21, 0x4e0811a1); 3594 md5_II(reg_cache, a, b, c, d, 4, 6, 0xf7537e82); 3595 md5_II(reg_cache, d, a, b, c, 11, 10, 0xbd3af235); 3596 md5_II(reg_cache, c, d, a, b, 2, 15, 0x2ad7d2bb); 3597 md5_II(reg_cache, b, c, d, a, 9, 21, 0xeb86d391); 3598 3599 __ addw(a, state_regs[0], a); 3600 __ ubfx(rscratch2, state_regs[0], 32, 32); 3601 __ addw(b, rscratch2, b); 3602 __ addw(c, state_regs[1], c); 3603 __ ubfx(rscratch4, state_regs[1], 32, 32); 3604 __ addw(d, rscratch4, d); 3605 3606 __ orr(state_regs[0], a, b, Assembler::LSL, 32); 3607 __ orr(state_regs[1], c, d, Assembler::LSL, 32); 3608 3609 if (multi_block) { 3610 __ add(buf, buf, 64); 3611 __ add(ofs, ofs, 64); 3612 __ cmp(ofs, limit); 3613 __ br(Assembler::LE, md5_loop); 3614 __ mov(c_rarg0, ofs); // return ofs 3615 } 3616 3617 // write hash values back in the correct order 3618 __ stp(state_regs[0], state_regs[1], Address(state)); 3619 3620 __ pop(saved_regs, sp); 3621 3622 __ ret(lr); 3623 3624 return start; 3625 } 3626 3627 // Arguments: 3628 // 3629 // Inputs: 3630 // c_rarg0 - byte[] source+offset 3631 // c_rarg1 - int[] SHA.state 3632 // c_rarg2 - int offset 3633 // c_rarg3 - int limit 3634 // 3635 address generate_sha1_implCompress(bool multi_block, const char *name) { 3636 __ align(CodeEntryAlignment); 3637 StubCodeMark mark(this, "StubRoutines", name); 3638 address start = __ pc(); 3639 3640 Register buf = c_rarg0; 3641 Register state = c_rarg1; 3642 Register ofs = c_rarg2; 3643 Register limit = c_rarg3; 3644 3645 Label keys; 3646 Label sha1_loop; 3647 3648 // load the keys into v0..v3 3649 __ adr(rscratch1, keys); 3650 __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1)); 3651 // load 5 words state into v6, v7 3652 __ ldrq(v6, Address(state, 0)); 3653 __ ldrs(v7, Address(state, 16)); 3654 3655 3656 __ BIND(sha1_loop); 3657 // load 64 bytes of data into v16..v19 3658 __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf); 3659 __ rev32(v16, __ T16B, v16); 3660 __ rev32(v17, __ T16B, v17); 3661 __ rev32(v18, __ T16B, v18); 3662 __ rev32(v19, __ T16B, v19); 3663 3664 // do the sha1 3665 __ addv(v4, __ T4S, v16, v0); 3666 __ orr(v20, __ T16B, v6, v6); 3667 3668 FloatRegister d0 = v16; 3669 FloatRegister d1 = v17; 3670 FloatRegister d2 = v18; 3671 FloatRegister d3 = v19; 3672 3673 for (int round = 0; round < 20; round++) { 3674 FloatRegister tmp1 = (round & 1) ? v4 : v5; 3675 FloatRegister tmp2 = (round & 1) ? v21 : v22; 3676 FloatRegister tmp3 = round ? ((round & 1) ? 
v22 : v21) : v7; 3677 FloatRegister tmp4 = (round & 1) ? v5 : v4; 3678 FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3)); 3679 3680 if (round < 16) __ sha1su0(d0, __ T4S, d1, d2); 3681 if (round < 19) __ addv(tmp1, __ T4S, d1, key); 3682 __ sha1h(tmp2, __ T4S, v20); 3683 if (round < 5) 3684 __ sha1c(v20, __ T4S, tmp3, tmp4); 3685 else if (round < 10 || round >= 15) 3686 __ sha1p(v20, __ T4S, tmp3, tmp4); 3687 else 3688 __ sha1m(v20, __ T4S, tmp3, tmp4); 3689 if (round < 16) __ sha1su1(d0, __ T4S, d3); 3690 3691 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 3692 } 3693 3694 __ addv(v7, __ T2S, v7, v21); 3695 __ addv(v6, __ T4S, v6, v20); 3696 3697 if (multi_block) { 3698 __ add(ofs, ofs, 64); 3699 __ cmp(ofs, limit); 3700 __ br(Assembler::LE, sha1_loop); 3701 __ mov(c_rarg0, ofs); // return ofs 3702 } 3703 3704 __ strq(v6, Address(state, 0)); 3705 __ strs(v7, Address(state, 16)); 3706 3707 __ ret(lr); 3708 3709 __ bind(keys); 3710 __ emit_int32(0x5a827999); 3711 __ emit_int32(0x6ed9eba1); 3712 __ emit_int32(0x8f1bbcdc); 3713 __ emit_int32(0xca62c1d6); 3714 3715 return start; 3716 } 3717 3718 3719 // Arguments: 3720 // 3721 // Inputs: 3722 // c_rarg0 - byte[] source+offset 3723 // c_rarg1 - int[] SHA.state 3724 // c_rarg2 - int offset 3725 // c_rarg3 - int limit 3726 // 3727 address generate_sha256_implCompress(bool multi_block, const char *name) { 3728 static const uint32_t round_consts[64] = { 3729 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 3730 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, 3731 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 3732 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 3733 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 3734 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, 3735 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 3736 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, 3737 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 3738 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 3739 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 3740 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, 3741 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 3742 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, 3743 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 3744 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, 3745 }; 3746 __ align(CodeEntryAlignment); 3747 StubCodeMark mark(this, "StubRoutines", name); 3748 address start = __ pc(); 3749 3750 Register buf = c_rarg0; 3751 Register state = c_rarg1; 3752 Register ofs = c_rarg2; 3753 Register limit = c_rarg3; 3754 3755 Label sha1_loop; 3756 3757 __ stpd(v8, v9, __ pre(sp, -32)); 3758 __ stpd(v10, v11, Address(sp, 16)); 3759 3760 // dga == v0 3761 // dgb == v1 3762 // dg0 == v2 3763 // dg1 == v3 3764 // dg2 == v4 3765 // t0 == v6 3766 // t1 == v7 3767 3768 // load 16 keys to v16..v31 3769 __ lea(rscratch1, ExternalAddress((address)round_consts)); 3770 __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64)); 3771 __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64)); 3772 __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64)); 3773 __ ld1(v28, v29, v30, v31, __ T4S, rscratch1); 3774 3775 // load 8 words (256 bits) state 3776 __ ldpq(v0, v1, state); 3777 3778 __ BIND(sha1_loop); 3779 // load 64 bytes of data into v8..v11 3780 __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? 
__ post(buf, 64) : buf); 3781 __ rev32(v8, __ T16B, v8); 3782 __ rev32(v9, __ T16B, v9); 3783 __ rev32(v10, __ T16B, v10); 3784 __ rev32(v11, __ T16B, v11); 3785 3786 __ addv(v6, __ T4S, v8, v16); 3787 __ orr(v2, __ T16B, v0, v0); 3788 __ orr(v3, __ T16B, v1, v1); 3789 3790 FloatRegister d0 = v8; 3791 FloatRegister d1 = v9; 3792 FloatRegister d2 = v10; 3793 FloatRegister d3 = v11; 3794 3795 3796 for (int round = 0; round < 16; round++) { 3797 FloatRegister tmp1 = (round & 1) ? v6 : v7; 3798 FloatRegister tmp2 = (round & 1) ? v7 : v6; 3799 FloatRegister tmp3 = (round & 1) ? v2 : v4; 3800 FloatRegister tmp4 = (round & 1) ? v4 : v2; 3801 3802 if (round < 12) __ sha256su0(d0, __ T4S, d1); 3803 __ orr(v4, __ T16B, v2, v2); 3804 if (round < 15) 3805 __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17)); 3806 __ sha256h(v2, __ T4S, v3, tmp2); 3807 __ sha256h2(v3, __ T4S, v4, tmp2); 3808 if (round < 12) __ sha256su1(d0, __ T4S, d2, d3); 3809 3810 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 3811 } 3812 3813 __ addv(v0, __ T4S, v0, v2); 3814 __ addv(v1, __ T4S, v1, v3); 3815 3816 if (multi_block) { 3817 __ add(ofs, ofs, 64); 3818 __ cmp(ofs, limit); 3819 __ br(Assembler::LE, sha1_loop); 3820 __ mov(c_rarg0, ofs); // return ofs 3821 } 3822 3823 __ ldpd(v10, v11, Address(sp, 16)); 3824 __ ldpd(v8, v9, __ post(sp, 32)); 3825 3826 __ stpq(v0, v1, state); 3827 3828 __ ret(lr); 3829 3830 return start; 3831 } 3832 3833 // Double rounds for sha512. 3834 void sha512_dround(int dr, 3835 FloatRegister vi0, FloatRegister vi1, 3836 FloatRegister vi2, FloatRegister vi3, 3837 FloatRegister vi4, FloatRegister vrc0, 3838 FloatRegister vrc1, FloatRegister vin0, 3839 FloatRegister vin1, FloatRegister vin2, 3840 FloatRegister vin3, FloatRegister vin4) { 3841 if (dr < 36) { 3842 __ ld1(vrc1, __ T2D, __ post(rscratch2, 16)); 3843 } 3844 __ addv(v5, __ T2D, vrc0, vin0); 3845 __ ext(v6, __ T16B, vi2, vi3, 8); 3846 __ ext(v5, __ T16B, v5, v5, 8); 3847 __ ext(v7, __ T16B, vi1, vi2, 8); 3848 __ addv(vi3, __ T2D, vi3, v5); 3849 if (dr < 32) { 3850 __ ext(v5, __ T16B, vin3, vin4, 8); 3851 __ sha512su0(vin0, __ T2D, vin1); 3852 } 3853 __ sha512h(vi3, __ T2D, v6, v7); 3854 if (dr < 32) { 3855 __ sha512su1(vin0, __ T2D, vin2, v5); 3856 } 3857 __ addv(vi4, __ T2D, vi1, vi3); 3858 __ sha512h2(vi3, __ T2D, vi1, vi0); 3859 } 3860 3861 // Arguments: 3862 // 3863 // Inputs: 3864 // c_rarg0 - byte[] source+offset 3865 // c_rarg1 - int[] SHA.state 3866 // c_rarg2 - int offset 3867 // c_rarg3 - int limit 3868 // 3869 address generate_sha512_implCompress(bool multi_block, const char *name) { 3870 static const uint64_t round_consts[80] = { 3871 0x428A2F98D728AE22L, 0x7137449123EF65CDL, 0xB5C0FBCFEC4D3B2FL, 3872 0xE9B5DBA58189DBBCL, 0x3956C25BF348B538L, 0x59F111F1B605D019L, 3873 0x923F82A4AF194F9BL, 0xAB1C5ED5DA6D8118L, 0xD807AA98A3030242L, 3874 0x12835B0145706FBEL, 0x243185BE4EE4B28CL, 0x550C7DC3D5FFB4E2L, 3875 0x72BE5D74F27B896FL, 0x80DEB1FE3B1696B1L, 0x9BDC06A725C71235L, 3876 0xC19BF174CF692694L, 0xE49B69C19EF14AD2L, 0xEFBE4786384F25E3L, 3877 0x0FC19DC68B8CD5B5L, 0x240CA1CC77AC9C65L, 0x2DE92C6F592B0275L, 3878 0x4A7484AA6EA6E483L, 0x5CB0A9DCBD41FBD4L, 0x76F988DA831153B5L, 3879 0x983E5152EE66DFABL, 0xA831C66D2DB43210L, 0xB00327C898FB213FL, 3880 0xBF597FC7BEEF0EE4L, 0xC6E00BF33DA88FC2L, 0xD5A79147930AA725L, 3881 0x06CA6351E003826FL, 0x142929670A0E6E70L, 0x27B70A8546D22FFCL, 3882 0x2E1B21385C26C926L, 0x4D2C6DFC5AC42AEDL, 0x53380D139D95B3DFL, 3883 0x650A73548BAF63DEL, 0x766A0ABB3C77B2A8L, 0x81C2C92E47EDAEE6L, 3884 0x92722C851482353BL, 
0xA2BFE8A14CF10364L, 0xA81A664BBC423001L, 3885 0xC24B8B70D0F89791L, 0xC76C51A30654BE30L, 0xD192E819D6EF5218L, 3886 0xD69906245565A910L, 0xF40E35855771202AL, 0x106AA07032BBD1B8L, 3887 0x19A4C116B8D2D0C8L, 0x1E376C085141AB53L, 0x2748774CDF8EEB99L, 3888 0x34B0BCB5E19B48A8L, 0x391C0CB3C5C95A63L, 0x4ED8AA4AE3418ACBL, 3889 0x5B9CCA4F7763E373L, 0x682E6FF3D6B2B8A3L, 0x748F82EE5DEFB2FCL, 3890 0x78A5636F43172F60L, 0x84C87814A1F0AB72L, 0x8CC702081A6439ECL, 3891 0x90BEFFFA23631E28L, 0xA4506CEBDE82BDE9L, 0xBEF9A3F7B2C67915L, 3892 0xC67178F2E372532BL, 0xCA273ECEEA26619CL, 0xD186B8C721C0C207L, 3893 0xEADA7DD6CDE0EB1EL, 0xF57D4F7FEE6ED178L, 0x06F067AA72176FBAL, 3894 0x0A637DC5A2C898A6L, 0x113F9804BEF90DAEL, 0x1B710B35131C471BL, 3895 0x28DB77F523047D84L, 0x32CAAB7B40C72493L, 0x3C9EBE0A15C9BEBCL, 3896 0x431D67C49C100D4CL, 0x4CC5D4BECB3E42B6L, 0x597F299CFC657E2AL, 3897 0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L 3898 }; 3899 3900 __ align(CodeEntryAlignment); 3901 StubCodeMark mark(this, "StubRoutines", name); 3902 address start = __ pc(); 3903 3904 Register buf = c_rarg0; 3905 Register state = c_rarg1; 3906 Register ofs = c_rarg2; 3907 Register limit = c_rarg3; 3908 3909 __ stpd(v8, v9, __ pre(sp, -64)); 3910 __ stpd(v10, v11, Address(sp, 16)); 3911 __ stpd(v12, v13, Address(sp, 32)); 3912 __ stpd(v14, v15, Address(sp, 48)); 3913 3914 Label sha512_loop; 3915 3916 // load state 3917 __ ld1(v8, v9, v10, v11, __ T2D, state); 3918 3919 // load first 4 round constants 3920 __ lea(rscratch1, ExternalAddress((address)round_consts)); 3921 __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64)); 3922 3923 __ BIND(sha512_loop); 3924 // load 128B of data into v12..v19 3925 __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64)); 3926 __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64)); 3927 __ rev64(v12, __ T16B, v12); 3928 __ rev64(v13, __ T16B, v13); 3929 __ rev64(v14, __ T16B, v14); 3930 __ rev64(v15, __ T16B, v15); 3931 __ rev64(v16, __ T16B, v16); 3932 __ rev64(v17, __ T16B, v17); 3933 __ rev64(v18, __ T16B, v18); 3934 __ rev64(v19, __ T16B, v19); 3935 3936 __ mov(rscratch2, rscratch1); 3937 3938 __ mov(v0, __ T16B, v8); 3939 __ mov(v1, __ T16B, v9); 3940 __ mov(v2, __ T16B, v10); 3941 __ mov(v3, __ T16B, v11); 3942 3943 sha512_dround( 0, v0, v1, v2, v3, v4, v20, v24, v12, v13, v19, v16, v17); 3944 sha512_dround( 1, v3, v0, v4, v2, v1, v21, v25, v13, v14, v12, v17, v18); 3945 sha512_dround( 2, v2, v3, v1, v4, v0, v22, v26, v14, v15, v13, v18, v19); 3946 sha512_dround( 3, v4, v2, v0, v1, v3, v23, v27, v15, v16, v14, v19, v12); 3947 sha512_dround( 4, v1, v4, v3, v0, v2, v24, v28, v16, v17, v15, v12, v13); 3948 sha512_dround( 5, v0, v1, v2, v3, v4, v25, v29, v17, v18, v16, v13, v14); 3949 sha512_dround( 6, v3, v0, v4, v2, v1, v26, v30, v18, v19, v17, v14, v15); 3950 sha512_dround( 7, v2, v3, v1, v4, v0, v27, v31, v19, v12, v18, v15, v16); 3951 sha512_dround( 8, v4, v2, v0, v1, v3, v28, v24, v12, v13, v19, v16, v17); 3952 sha512_dround( 9, v1, v4, v3, v0, v2, v29, v25, v13, v14, v12, v17, v18); 3953 sha512_dround(10, v0, v1, v2, v3, v4, v30, v26, v14, v15, v13, v18, v19); 3954 sha512_dround(11, v3, v0, v4, v2, v1, v31, v27, v15, v16, v14, v19, v12); 3955 sha512_dround(12, v2, v3, v1, v4, v0, v24, v28, v16, v17, v15, v12, v13); 3956 sha512_dround(13, v4, v2, v0, v1, v3, v25, v29, v17, v18, v16, v13, v14); 3957 sha512_dround(14, v1, v4, v3, v0, v2, v26, v30, v18, v19, v17, v14, v15); 3958 sha512_dround(15, v0, v1, v2, v3, v4, v27, v31, v19, v12, v18, v15, v16); 3959 sha512_dround(16, v3, v0, v4, v2, v1, v28, v24, v12, 
v13, v19, v16, v17); 3960 sha512_dround(17, v2, v3, v1, v4, v0, v29, v25, v13, v14, v12, v17, v18); 3961 sha512_dround(18, v4, v2, v0, v1, v3, v30, v26, v14, v15, v13, v18, v19); 3962 sha512_dround(19, v1, v4, v3, v0, v2, v31, v27, v15, v16, v14, v19, v12); 3963 sha512_dround(20, v0, v1, v2, v3, v4, v24, v28, v16, v17, v15, v12, v13); 3964 sha512_dround(21, v3, v0, v4, v2, v1, v25, v29, v17, v18, v16, v13, v14); 3965 sha512_dround(22, v2, v3, v1, v4, v0, v26, v30, v18, v19, v17, v14, v15); 3966 sha512_dround(23, v4, v2, v0, v1, v3, v27, v31, v19, v12, v18, v15, v16); 3967 sha512_dround(24, v1, v4, v3, v0, v2, v28, v24, v12, v13, v19, v16, v17); 3968 sha512_dround(25, v0, v1, v2, v3, v4, v29, v25, v13, v14, v12, v17, v18); 3969 sha512_dround(26, v3, v0, v4, v2, v1, v30, v26, v14, v15, v13, v18, v19); 3970 sha512_dround(27, v2, v3, v1, v4, v0, v31, v27, v15, v16, v14, v19, v12); 3971 sha512_dround(28, v4, v2, v0, v1, v3, v24, v28, v16, v17, v15, v12, v13); 3972 sha512_dround(29, v1, v4, v3, v0, v2, v25, v29, v17, v18, v16, v13, v14); 3973 sha512_dround(30, v0, v1, v2, v3, v4, v26, v30, v18, v19, v17, v14, v15); 3974 sha512_dround(31, v3, v0, v4, v2, v1, v27, v31, v19, v12, v18, v15, v16); 3975 sha512_dround(32, v2, v3, v1, v4, v0, v28, v24, v12, v0, v0, v0, v0); 3976 sha512_dround(33, v4, v2, v0, v1, v3, v29, v25, v13, v0, v0, v0, v0); 3977 sha512_dround(34, v1, v4, v3, v0, v2, v30, v26, v14, v0, v0, v0, v0); 3978 sha512_dround(35, v0, v1, v2, v3, v4, v31, v27, v15, v0, v0, v0, v0); 3979 sha512_dround(36, v3, v0, v4, v2, v1, v24, v0, v16, v0, v0, v0, v0); 3980 sha512_dround(37, v2, v3, v1, v4, v0, v25, v0, v17, v0, v0, v0, v0); 3981 sha512_dround(38, v4, v2, v0, v1, v3, v26, v0, v18, v0, v0, v0, v0); 3982 sha512_dround(39, v1, v4, v3, v0, v2, v27, v0, v19, v0, v0, v0, v0); 3983 3984 __ addv(v8, __ T2D, v8, v0); 3985 __ addv(v9, __ T2D, v9, v1); 3986 __ addv(v10, __ T2D, v10, v2); 3987 __ addv(v11, __ T2D, v11, v3); 3988 3989 if (multi_block) { 3990 __ add(ofs, ofs, 128); 3991 __ cmp(ofs, limit); 3992 __ br(Assembler::LE, sha512_loop); 3993 __ mov(c_rarg0, ofs); // return ofs 3994 } 3995 3996 __ st1(v8, v9, v10, v11, __ T2D, state); 3997 3998 __ ldpd(v14, v15, Address(sp, 48)); 3999 __ ldpd(v12, v13, Address(sp, 32)); 4000 __ ldpd(v10, v11, Address(sp, 16)); 4001 __ ldpd(v8, v9, __ post(sp, 64)); 4002 4003 __ ret(lr); 4004 4005 return start; 4006 } 4007 4008 // Arguments: 4009 // 4010 // Inputs: 4011 // c_rarg0 - byte[] source+offset 4012 // c_rarg1 - byte[] SHA.state 4013 // c_rarg2 - int block_size 4014 // c_rarg3 - int offset 4015 // c_rarg4 - int limit 4016 // 4017 address generate_sha3_implCompress(bool multi_block, const char *name) { 4018 static const uint64_t round_consts[24] = { 4019 0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL, 4020 0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L, 4021 0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL, 4022 0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL, 4023 0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L, 4024 0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L, 4025 0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L, 4026 0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L 4027 }; 4028 4029 __ align(CodeEntryAlignment); 4030 StubCodeMark mark(this, "StubRoutines", name); 4031 address start = __ pc(); 4032 4033 Register buf = c_rarg0; 4034 Register state = c_rarg1; 4035 Register block_size = c_rarg2; 4036 Register ofs = c_rarg3; 4037 Register 
limit = c_rarg4; 4038 4039 Label sha3_loop, rounds24_loop; 4040 Label sha3_512_or_sha3_384, shake128; 4041 4042 __ stpd(v8, v9, __ pre(sp, -64)); 4043 __ stpd(v10, v11, Address(sp, 16)); 4044 __ stpd(v12, v13, Address(sp, 32)); 4045 __ stpd(v14, v15, Address(sp, 48)); 4046 4047 // load state 4048 __ add(rscratch1, state, 32); 4049 __ ld1(v0, v1, v2, v3, __ T1D, state); 4050 __ ld1(v4, v5, v6, v7, __ T1D, __ post(rscratch1, 32)); 4051 __ ld1(v8, v9, v10, v11, __ T1D, __ post(rscratch1, 32)); 4052 __ ld1(v12, v13, v14, v15, __ T1D, __ post(rscratch1, 32)); 4053 __ ld1(v16, v17, v18, v19, __ T1D, __ post(rscratch1, 32)); 4054 __ ld1(v20, v21, v22, v23, __ T1D, __ post(rscratch1, 32)); 4055 __ ld1(v24, __ T1D, rscratch1); 4056 4057 __ BIND(sha3_loop); 4058 4059 // 24 keccak rounds 4060 __ movw(rscratch2, 24); 4061 4062 // load round_constants base 4063 __ lea(rscratch1, ExternalAddress((address) round_consts)); 4064 4065 // load input 4066 __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32)); 4067 __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24)); 4068 __ eor(v0, __ T8B, v0, v25); 4069 __ eor(v1, __ T8B, v1, v26); 4070 __ eor(v2, __ T8B, v2, v27); 4071 __ eor(v3, __ T8B, v3, v28); 4072 __ eor(v4, __ T8B, v4, v29); 4073 __ eor(v5, __ T8B, v5, v30); 4074 __ eor(v6, __ T8B, v6, v31); 4075 4076 // block_size == 72, SHA3-512; block_size == 104, SHA3-384 4077 __ tbz(block_size, 7, sha3_512_or_sha3_384); 4078 4079 __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32)); 4080 __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24)); 4081 __ eor(v7, __ T8B, v7, v25); 4082 __ eor(v8, __ T8B, v8, v26); 4083 __ eor(v9, __ T8B, v9, v27); 4084 __ eor(v10, __ T8B, v10, v28); 4085 __ eor(v11, __ T8B, v11, v29); 4086 __ eor(v12, __ T8B, v12, v30); 4087 __ eor(v13, __ T8B, v13, v31); 4088 4089 __ ld1(v25, v26, v27, __ T8B, __ post(buf, 24)); 4090 __ eor(v14, __ T8B, v14, v25); 4091 __ eor(v15, __ T8B, v15, v26); 4092 __ eor(v16, __ T8B, v16, v27); 4093 4094 // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256 4095 __ andw(c_rarg5, block_size, 48); 4096 __ cbzw(c_rarg5, rounds24_loop); 4097 4098 __ tbnz(block_size, 5, shake128); 4099 // block_size == 144, bit5 == 0, SHA3-244 4100 __ ldrd(v28, __ post(buf, 8)); 4101 __ eor(v17, __ T8B, v17, v28); 4102 __ b(rounds24_loop); 4103 4104 __ BIND(shake128); 4105 __ ld1(v28, v29, v30, v31, __ T8B, __ post(buf, 32)); 4106 __ eor(v17, __ T8B, v17, v28); 4107 __ eor(v18, __ T8B, v18, v29); 4108 __ eor(v19, __ T8B, v19, v30); 4109 __ eor(v20, __ T8B, v20, v31); 4110 __ b(rounds24_loop); // block_size == 168, SHAKE128 4111 4112 __ BIND(sha3_512_or_sha3_384); 4113 __ ld1(v25, v26, __ T8B, __ post(buf, 16)); 4114 __ eor(v7, __ T8B, v7, v25); 4115 __ eor(v8, __ T8B, v8, v26); 4116 __ tbz(block_size, 5, rounds24_loop); // SHA3-512 4117 4118 // SHA3-384 4119 __ ld1(v27, v28, v29, v30, __ T8B, __ post(buf, 32)); 4120 __ eor(v9, __ T8B, v9, v27); 4121 __ eor(v10, __ T8B, v10, v28); 4122 __ eor(v11, __ T8B, v11, v29); 4123 __ eor(v12, __ T8B, v12, v30); 4124 4125 __ BIND(rounds24_loop); 4126 __ subw(rscratch2, rscratch2, 1); 4127 4128 __ eor3(v29, __ T16B, v4, v9, v14); 4129 __ eor3(v26, __ T16B, v1, v6, v11); 4130 __ eor3(v28, __ T16B, v3, v8, v13); 4131 __ eor3(v25, __ T16B, v0, v5, v10); 4132 __ eor3(v27, __ T16B, v2, v7, v12); 4133 __ eor3(v29, __ T16B, v29, v19, v24); 4134 __ eor3(v26, __ T16B, v26, v16, v21); 4135 __ eor3(v28, __ T16B, v28, v18, v23); 4136 __ eor3(v25, __ T16B, v25, v15, v20); 4137 __ eor3(v27, __ T16B, v27, v17, v22); 4138 4139 __ rax1(v30, __ T2D, v29, v26); 
4140 __ rax1(v26, __ T2D, v26, v28); 4141 __ rax1(v28, __ T2D, v28, v25); 4142 __ rax1(v25, __ T2D, v25, v27); 4143 __ rax1(v27, __ T2D, v27, v29); 4144 4145 __ eor(v0, __ T16B, v0, v30); 4146 __ xar(v29, __ T2D, v1, v25, (64 - 1)); 4147 __ xar(v1, __ T2D, v6, v25, (64 - 44)); 4148 __ xar(v6, __ T2D, v9, v28, (64 - 20)); 4149 __ xar(v9, __ T2D, v22, v26, (64 - 61)); 4150 __ xar(v22, __ T2D, v14, v28, (64 - 39)); 4151 __ xar(v14, __ T2D, v20, v30, (64 - 18)); 4152 __ xar(v31, __ T2D, v2, v26, (64 - 62)); 4153 __ xar(v2, __ T2D, v12, v26, (64 - 43)); 4154 __ xar(v12, __ T2D, v13, v27, (64 - 25)); 4155 __ xar(v13, __ T2D, v19, v28, (64 - 8)); 4156 __ xar(v19, __ T2D, v23, v27, (64 - 56)); 4157 __ xar(v23, __ T2D, v15, v30, (64 - 41)); 4158 __ xar(v15, __ T2D, v4, v28, (64 - 27)); 4159 __ xar(v28, __ T2D, v24, v28, (64 - 14)); 4160 __ xar(v24, __ T2D, v21, v25, (64 - 2)); 4161 __ xar(v8, __ T2D, v8, v27, (64 - 55)); 4162 __ xar(v4, __ T2D, v16, v25, (64 - 45)); 4163 __ xar(v16, __ T2D, v5, v30, (64 - 36)); 4164 __ xar(v5, __ T2D, v3, v27, (64 - 28)); 4165 __ xar(v27, __ T2D, v18, v27, (64 - 21)); 4166 __ xar(v3, __ T2D, v17, v26, (64 - 15)); 4167 __ xar(v25, __ T2D, v11, v25, (64 - 10)); 4168 __ xar(v26, __ T2D, v7, v26, (64 - 6)); 4169 __ xar(v30, __ T2D, v10, v30, (64 - 3)); 4170 4171 __ bcax(v20, __ T16B, v31, v22, v8); 4172 __ bcax(v21, __ T16B, v8, v23, v22); 4173 __ bcax(v22, __ T16B, v22, v24, v23); 4174 __ bcax(v23, __ T16B, v23, v31, v24); 4175 __ bcax(v24, __ T16B, v24, v8, v31); 4176 4177 __ ld1r(v31, __ T2D, __ post(rscratch1, 8)); 4178 4179 __ bcax(v17, __ T16B, v25, v19, v3); 4180 __ bcax(v18, __ T16B, v3, v15, v19); 4181 __ bcax(v19, __ T16B, v19, v16, v15); 4182 __ bcax(v15, __ T16B, v15, v25, v16); 4183 __ bcax(v16, __ T16B, v16, v3, v25); 4184 4185 __ bcax(v10, __ T16B, v29, v12, v26); 4186 __ bcax(v11, __ T16B, v26, v13, v12); 4187 __ bcax(v12, __ T16B, v12, v14, v13); 4188 __ bcax(v13, __ T16B, v13, v29, v14); 4189 __ bcax(v14, __ T16B, v14, v26, v29); 4190 4191 __ bcax(v7, __ T16B, v30, v9, v4); 4192 __ bcax(v8, __ T16B, v4, v5, v9); 4193 __ bcax(v9, __ T16B, v9, v6, v5); 4194 __ bcax(v5, __ T16B, v5, v30, v6); 4195 __ bcax(v6, __ T16B, v6, v4, v30); 4196 4197 __ bcax(v3, __ T16B, v27, v0, v28); 4198 __ bcax(v4, __ T16B, v28, v1, v0); 4199 __ bcax(v0, __ T16B, v0, v2, v1); 4200 __ bcax(v1, __ T16B, v1, v27, v2); 4201 __ bcax(v2, __ T16B, v2, v28, v27); 4202 4203 __ eor(v0, __ T16B, v0, v31); 4204 4205 __ cbnzw(rscratch2, rounds24_loop); 4206 4207 if (multi_block) { 4208 __ add(ofs, ofs, block_size); 4209 __ cmp(ofs, limit); 4210 __ br(Assembler::LE, sha3_loop); 4211 __ mov(c_rarg0, ofs); // return ofs 4212 } 4213 4214 __ st1(v0, v1, v2, v3, __ T1D, __ post(state, 32)); 4215 __ st1(v4, v5, v6, v7, __ T1D, __ post(state, 32)); 4216 __ st1(v8, v9, v10, v11, __ T1D, __ post(state, 32)); 4217 __ st1(v12, v13, v14, v15, __ T1D, __ post(state, 32)); 4218 __ st1(v16, v17, v18, v19, __ T1D, __ post(state, 32)); 4219 __ st1(v20, v21, v22, v23, __ T1D, __ post(state, 32)); 4220 __ st1(v24, __ T1D, state); 4221 4222 __ ldpd(v14, v15, Address(sp, 48)); 4223 __ ldpd(v12, v13, Address(sp, 32)); 4224 __ ldpd(v10, v11, Address(sp, 16)); 4225 __ ldpd(v8, v9, __ post(sp, 64)); 4226 4227 __ ret(lr); 4228 4229 return start; 4230 } 4231 4232 /** 4233 * Arguments: 4234 * 4235 * Inputs: 4236 * c_rarg0 - int crc 4237 * c_rarg1 - byte* buf 4238 * c_rarg2 - int length 4239 * 4240 * Output: 4241 * rax - int crc result 4242 */ 4243 address generate_updateBytesCRC32() { 4244 assert(UseCRC32Intrinsics, 
"what are we doing here?"); 4245 4246 __ align(CodeEntryAlignment); 4247 StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32"); 4248 4249 address start = __ pc(); 4250 4251 const Register crc = c_rarg0; // crc 4252 const Register buf = c_rarg1; // source java byte array address 4253 const Register len = c_rarg2; // length 4254 const Register table0 = c_rarg3; // crc_table address 4255 const Register table1 = c_rarg4; 4256 const Register table2 = c_rarg5; 4257 const Register table3 = c_rarg6; 4258 const Register tmp3 = c_rarg7; 4259 4260 BLOCK_COMMENT("Entry:"); 4261 __ enter(); // required for proper stackwalking of RuntimeStub frame 4262 4263 __ kernel_crc32(crc, buf, len, 4264 table0, table1, table2, table3, rscratch1, rscratch2, tmp3); 4265 4266 __ leave(); // required for proper stackwalking of RuntimeStub frame 4267 __ ret(lr); 4268 4269 return start; 4270 } 4271 4272 // ChaCha20 block function. This version parallelizes by loading 4273 // individual 32-bit state elements into vectors for four blocks 4274 // (e.g. all four blocks' worth of state[0] in one register, etc.) 4275 // 4276 // state (int[16]) = c_rarg0 4277 // keystream (byte[1024]) = c_rarg1 4278 // return - number of bytes of keystream (always 256) 4279 address generate_chacha20Block_blockpar() { 4280 Label L_twoRounds, L_cc20_const; 4281 // The constant data is broken into two 128-bit segments to be loaded 4282 // onto FloatRegisters. The first 128 bits are a counter add overlay 4283 // that adds +0/+1/+2/+3 to the vector holding replicated state[12]. 4284 // The second 128-bits is a table constant used for 8-bit left rotations. 4285 __ BIND(L_cc20_const); 4286 __ emit_int64(0x0000000100000000UL); 4287 __ emit_int64(0x0000000300000002UL); 4288 __ emit_int64(0x0605040702010003UL); 4289 __ emit_int64(0x0E0D0C0F0A09080BUL); 4290 4291 __ align(CodeEntryAlignment); 4292 StubCodeMark mark(this, "StubRoutines", "chacha20Block"); 4293 address start = __ pc(); 4294 __ enter(); 4295 4296 int i, j; 4297 const Register state = c_rarg0; 4298 const Register keystream = c_rarg1; 4299 const Register loopCtr = r10; 4300 const Register tmpAddr = r11; 4301 4302 const FloatRegister stateFirst = v0; 4303 const FloatRegister stateSecond = v1; 4304 const FloatRegister stateThird = v2; 4305 const FloatRegister stateFourth = v3; 4306 const FloatRegister origCtrState = v28; 4307 const FloatRegister scratch = v29; 4308 const FloatRegister lrot8Tbl = v30; 4309 4310 // Organize SIMD registers in an array that facilitates 4311 // putting repetitive opcodes into loop structures. It is 4312 // important that each grouping of 4 registers is monotonically 4313 // increasing to support the requirements of multi-register 4314 // instructions (e.g. ld4r, st4, etc.) 4315 const FloatRegister workSt[16] = { 4316 v4, v5, v6, v7, v16, v17, v18, v19, 4317 v20, v21, v22, v23, v24, v25, v26, v27 4318 }; 4319 4320 // Load from memory and interlace across 16 SIMD registers, 4321 // With each word from memory being broadcast to all lanes of 4322 // each successive SIMD register. 4323 // Addr(0) -> All lanes in workSt[i] 4324 // Addr(4) -> All lanes workSt[i + 1], etc. 4325 __ mov(tmpAddr, state); 4326 for (i = 0; i < 16; i += 4) { 4327 __ ld4r(workSt[i], workSt[i + 1], workSt[i + 2], workSt[i + 3], __ T4S, 4328 __ post(tmpAddr, 16)); 4329 } 4330 4331 // Pull in constant data. The first 16 bytes are the add overlay 4332 // which is applied to the vector holding the counter (state[12]). 
4333 // The second 16 bytes is the index register for the 8-bit left 4334 // rotation tbl instruction. 4335 __ adr(tmpAddr, L_cc20_const); 4336 __ ldpq(origCtrState, lrot8Tbl, Address(tmpAddr)); 4337 __ addv(workSt[12], __ T4S, workSt[12], origCtrState); 4338 4339 // Set up the 10 iteration loop and perform all 8 quarter round ops 4340 __ mov(loopCtr, 10); 4341 __ BIND(L_twoRounds); 4342 4343 __ cc20_quarter_round(workSt[0], workSt[4], workSt[8], workSt[12], 4344 scratch, lrot8Tbl); 4345 __ cc20_quarter_round(workSt[1], workSt[5], workSt[9], workSt[13], 4346 scratch, lrot8Tbl); 4347 __ cc20_quarter_round(workSt[2], workSt[6], workSt[10], workSt[14], 4348 scratch, lrot8Tbl); 4349 __ cc20_quarter_round(workSt[3], workSt[7], workSt[11], workSt[15], 4350 scratch, lrot8Tbl); 4351 4352 __ cc20_quarter_round(workSt[0], workSt[5], workSt[10], workSt[15], 4353 scratch, lrot8Tbl); 4354 __ cc20_quarter_round(workSt[1], workSt[6], workSt[11], workSt[12], 4355 scratch, lrot8Tbl); 4356 __ cc20_quarter_round(workSt[2], workSt[7], workSt[8], workSt[13], 4357 scratch, lrot8Tbl); 4358 __ cc20_quarter_round(workSt[3], workSt[4], workSt[9], workSt[14], 4359 scratch, lrot8Tbl); 4360 4361 // Decrement and iterate 4362 __ sub(loopCtr, loopCtr, 1); 4363 __ cbnz(loopCtr, L_twoRounds); 4364 4365 __ mov(tmpAddr, state); 4366 4367 // Add the starting state back to the post-loop keystream 4368 // state. We read/interlace the state array from memory into 4369 // 4 registers similar to what we did in the beginning. Then 4370 // add the counter overlay onto workSt[12] at the end. 4371 for (i = 0; i < 16; i += 4) { 4372 __ ld4r(stateFirst, stateSecond, stateThird, stateFourth, __ T4S, 4373 __ post(tmpAddr, 16)); 4374 __ addv(workSt[i], __ T4S, workSt[i], stateFirst); 4375 __ addv(workSt[i + 1], __ T4S, workSt[i + 1], stateSecond); 4376 __ addv(workSt[i + 2], __ T4S, workSt[i + 2], stateThird); 4377 __ addv(workSt[i + 3], __ T4S, workSt[i + 3], stateFourth); 4378 } 4379 __ addv(workSt[12], __ T4S, workSt[12], origCtrState); // Add ctr mask 4380 4381 // Write to key stream, storing the same element out of workSt[0..15] 4382 // to consecutive 4-byte offsets in the key stream buffer, then repeating 4383 // for the next element position. 
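    // Roughly equivalent C for the store pattern below (a sketch only; the
    // uint32_t view of keystream and the per-lane indexing notation are
    // illustrative):
    //   for (int lane = 0; lane < 4; lane++)      // one ChaCha20 block per lane
    //     for (int reg = 0; reg < 16; reg++)
    //       ((uint32_t*)keystream)[lane * 16 + reg] = workSt[reg][lane];
    // so the buffer ends up holding four consecutive 64-byte keystream blocks.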
4384 for (i = 0; i < 4; i++) { 4385 for (j = 0; j < 16; j += 4) { 4386 __ st4(workSt[j], workSt[j + 1], workSt[j + 2], workSt[j + 3], __ S, i, 4387 __ post(keystream, 16)); 4388 } 4389 } 4390 4391 __ mov(r0, 256); // Return length of output keystream 4392 __ leave(); 4393 __ ret(lr); 4394 4395 return start; 4396 } 4397 4398 /** 4399 * Arguments: 4400 * 4401 * Inputs: 4402 * c_rarg0 - int crc 4403 * c_rarg1 - byte* buf 4404 * c_rarg2 - int length 4405 * c_rarg3 - int* table 4406 * 4407 * Output: 4408 * r0 - int crc result 4409 */ 4410 address generate_updateBytesCRC32C() { 4411 assert(UseCRC32CIntrinsics, "what are we doing here?"); 4412 4413 __ align(CodeEntryAlignment); 4414 StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C"); 4415 4416 address start = __ pc(); 4417 4418 const Register crc = c_rarg0; // crc 4419 const Register buf = c_rarg1; // source java byte array address 4420 const Register len = c_rarg2; // length 4421 const Register table0 = c_rarg3; // crc_table address 4422 const Register table1 = c_rarg4; 4423 const Register table2 = c_rarg5; 4424 const Register table3 = c_rarg6; 4425 const Register tmp3 = c_rarg7; 4426 4427 BLOCK_COMMENT("Entry:"); 4428 __ enter(); // required for proper stackwalking of RuntimeStub frame 4429 4430 __ kernel_crc32c(crc, buf, len, 4431 table0, table1, table2, table3, rscratch1, rscratch2, tmp3); 4432 4433 __ leave(); // required for proper stackwalking of RuntimeStub frame 4434 __ ret(lr); 4435 4436 return start; 4437 } 4438 4439 /*** 4440 * Arguments: 4441 * 4442 * Inputs: 4443 * c_rarg0 - int adler 4444 * c_rarg1 - byte* buff 4445 * c_rarg2 - int len 4446 * 4447 * Output: 4448 * c_rarg0 - int adler result 4449 */ 4450 address generate_updateBytesAdler32() { 4451 __ align(CodeEntryAlignment); 4452 StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32"); 4453 address start = __ pc(); 4454 4455 Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1; 4456 4457 // Aliases 4458 Register adler = c_rarg0; 4459 Register s1 = c_rarg0; 4460 Register s2 = c_rarg3; 4461 Register buff = c_rarg1; 4462 Register len = c_rarg2; 4463 Register nmax = r4; 4464 Register base = r5; 4465 Register count = r6; 4466 Register temp0 = rscratch1; 4467 Register temp1 = rscratch2; 4468 FloatRegister vbytes = v0; 4469 FloatRegister vs1acc = v1; 4470 FloatRegister vs2acc = v2; 4471 FloatRegister vtable = v3; 4472 4473 // Max number of bytes we can process before having to take the mod 4474 // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 4475 uint64_t BASE = 0xfff1; 4476 uint64_t NMAX = 0x15B0; 4477 4478 __ mov(base, BASE); 4479 __ mov(nmax, NMAX); 4480 4481 // Load accumulation coefficients for the upper 16 bits 4482 __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table)); 4483 __ ld1(vtable, __ T16B, Address(temp0)); 4484 4485 // s1 is initialized to the lower 16 bits of adler 4486 // s2 is initialized to the upper 16 bits of adler 4487 __ ubfx(s2, adler, 16, 16); // s2 = ((adler >> 16) & 0xffff) 4488 __ uxth(s1, adler); // s1 = (adler & 0xffff) 4489 4490 // The pipelined loop needs at least 16 elements for 1 iteration 4491 // It does check this, but it is more effective to skip to the cleanup loop 4492 __ cmp(len, (u1)16); 4493 __ br(Assembler::HS, L_nmax); 4494 __ cbz(len, L_combine); 4495 4496 __ bind(L_simple_by1_loop); 4497 __ ldrb(temp0, Address(__ post(buff, 1))); 4498 __ add(s1, s1, temp0); 4499 __ add(s2, s2, s1); 4500 __ subs(len, len, 
1); 4501 __ br(Assembler::HI, L_simple_by1_loop); 4502 4503 // s1 = s1 % BASE 4504 __ subs(temp0, s1, base); 4505 __ csel(s1, temp0, s1, Assembler::HS); 4506 4507 // s2 = s2 % BASE 4508 __ lsr(temp0, s2, 16); 4509 __ lsl(temp1, temp0, 4); 4510 __ sub(temp1, temp1, temp0); 4511 __ add(s2, temp1, s2, ext::uxth); 4512 4513 __ subs(temp0, s2, base); 4514 __ csel(s2, temp0, s2, Assembler::HS); 4515 4516 __ b(L_combine); 4517 4518 __ bind(L_nmax); 4519 __ subs(len, len, nmax); 4520 __ sub(count, nmax, 16); 4521 __ br(Assembler::LO, L_by16); 4522 4523 __ bind(L_nmax_loop); 4524 4525 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1, 4526 vbytes, vs1acc, vs2acc, vtable); 4527 4528 __ subs(count, count, 16); 4529 __ br(Assembler::HS, L_nmax_loop); 4530 4531 // s1 = s1 % BASE 4532 __ lsr(temp0, s1, 16); 4533 __ lsl(temp1, temp0, 4); 4534 __ sub(temp1, temp1, temp0); 4535 __ add(temp1, temp1, s1, ext::uxth); 4536 4537 __ lsr(temp0, temp1, 16); 4538 __ lsl(s1, temp0, 4); 4539 __ sub(s1, s1, temp0); 4540 __ add(s1, s1, temp1, ext:: uxth); 4541 4542 __ subs(temp0, s1, base); 4543 __ csel(s1, temp0, s1, Assembler::HS); 4544 4545 // s2 = s2 % BASE 4546 __ lsr(temp0, s2, 16); 4547 __ lsl(temp1, temp0, 4); 4548 __ sub(temp1, temp1, temp0); 4549 __ add(temp1, temp1, s2, ext::uxth); 4550 4551 __ lsr(temp0, temp1, 16); 4552 __ lsl(s2, temp0, 4); 4553 __ sub(s2, s2, temp0); 4554 __ add(s2, s2, temp1, ext:: uxth); 4555 4556 __ subs(temp0, s2, base); 4557 __ csel(s2, temp0, s2, Assembler::HS); 4558 4559 __ subs(len, len, nmax); 4560 __ sub(count, nmax, 16); 4561 __ br(Assembler::HS, L_nmax_loop); 4562 4563 __ bind(L_by16); 4564 __ adds(len, len, count); 4565 __ br(Assembler::LO, L_by1); 4566 4567 __ bind(L_by16_loop); 4568 4569 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1, 4570 vbytes, vs1acc, vs2acc, vtable); 4571 4572 __ subs(len, len, 16); 4573 __ br(Assembler::HS, L_by16_loop); 4574 4575 __ bind(L_by1); 4576 __ adds(len, len, 15); 4577 __ br(Assembler::LO, L_do_mod); 4578 4579 __ bind(L_by1_loop); 4580 __ ldrb(temp0, Address(__ post(buff, 1))); 4581 __ add(s1, temp0, s1); 4582 __ add(s2, s2, s1); 4583 __ subs(len, len, 1); 4584 __ br(Assembler::HS, L_by1_loop); 4585 4586 __ bind(L_do_mod); 4587 // s1 = s1 % BASE 4588 __ lsr(temp0, s1, 16); 4589 __ lsl(temp1, temp0, 4); 4590 __ sub(temp1, temp1, temp0); 4591 __ add(temp1, temp1, s1, ext::uxth); 4592 4593 __ lsr(temp0, temp1, 16); 4594 __ lsl(s1, temp0, 4); 4595 __ sub(s1, s1, temp0); 4596 __ add(s1, s1, temp1, ext:: uxth); 4597 4598 __ subs(temp0, s1, base); 4599 __ csel(s1, temp0, s1, Assembler::HS); 4600 4601 // s2 = s2 % BASE 4602 __ lsr(temp0, s2, 16); 4603 __ lsl(temp1, temp0, 4); 4604 __ sub(temp1, temp1, temp0); 4605 __ add(temp1, temp1, s2, ext::uxth); 4606 4607 __ lsr(temp0, temp1, 16); 4608 __ lsl(s2, temp0, 4); 4609 __ sub(s2, s2, temp0); 4610 __ add(s2, s2, temp1, ext:: uxth); 4611 4612 __ subs(temp0, s2, base); 4613 __ csel(s2, temp0, s2, Assembler::HS); 4614 4615 // Combine lower bits and higher bits 4616 __ bind(L_combine); 4617 __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16) 4618 4619 __ ret(lr); 4620 4621 return start; 4622 } 4623 4624 void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff, 4625 Register temp0, Register temp1, FloatRegister vbytes, 4626 FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) { 4627 // Below is a vectorized implementation of updating s1 and s2 for 16 bytes. 
4628 // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration. 4629 // In non-vectorized code, we update s1 and s2 as: 4630 // s1 <- s1 + b1 4631 // s2 <- s2 + s1 4632 // s1 <- s1 + b2 4633 // s2 <- s2 + b1 4634 // ... 4635 // s1 <- s1 + b16 4636 // s2 <- s2 + s1 4637 // Putting above assignments together, we have: 4638 // s1_new = s1 + b1 + b2 + ... + b16 4639 // s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16) 4640 // = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1) 4641 // = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1) 4642 __ ld1(vbytes, __ T16B, Address(__ post(buff, 16))); 4643 4644 // s2 = s2 + s1 * 16 4645 __ add(s2, s2, s1, Assembler::LSL, 4); 4646 4647 // vs1acc = b1 + b2 + b3 + ... + b16 4648 // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1) 4649 __ umullv(vs2acc, __ T8B, vtable, vbytes); 4650 __ umlalv(vs2acc, __ T16B, vtable, vbytes); 4651 __ uaddlv(vs1acc, __ T16B, vbytes); 4652 __ uaddlv(vs2acc, __ T8H, vs2acc); 4653 4654 // s1 = s1 + vs1acc, s2 = s2 + vs2acc 4655 __ fmovd(temp0, vs1acc); 4656 __ fmovd(temp1, vs2acc); 4657 __ add(s1, s1, temp0); 4658 __ add(s2, s2, temp1); 4659 } 4660 4661 /** 4662 * Arguments: 4663 * 4664 * Input: 4665 * c_rarg0 - x address 4666 * c_rarg1 - x length 4667 * c_rarg2 - y address 4668 * c_rarg3 - y length 4669 * c_rarg4 - z address 4670 */ 4671 address generate_multiplyToLen() { 4672 __ align(CodeEntryAlignment); 4673 StubCodeMark mark(this, "StubRoutines", "multiplyToLen"); 4674 4675 address start = __ pc(); 4676 const Register x = r0; 4677 const Register xlen = r1; 4678 const Register y = r2; 4679 const Register ylen = r3; 4680 const Register z = r4; 4681 4682 const Register tmp0 = r5; 4683 const Register tmp1 = r10; 4684 const Register tmp2 = r11; 4685 const Register tmp3 = r12; 4686 const Register tmp4 = r13; 4687 const Register tmp5 = r14; 4688 const Register tmp6 = r15; 4689 const Register tmp7 = r16; 4690 4691 BLOCK_COMMENT("Entry:"); 4692 __ enter(); // required for proper stackwalking of RuntimeStub frame 4693 __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 4694 __ leave(); // required for proper stackwalking of RuntimeStub frame 4695 __ ret(lr); 4696 4697 return start; 4698 } 4699 4700 address generate_squareToLen() { 4701 // squareToLen algorithm for sizes 1..127 described in java code works 4702 // faster than multiply_to_len on some CPUs and slower on others, but 4703 // multiply_to_len shows a bit better overall results 4704 __ align(CodeEntryAlignment); 4705 StubCodeMark mark(this, "StubRoutines", "squareToLen"); 4706 address start = __ pc(); 4707 4708 const Register x = r0; 4709 const Register xlen = r1; 4710 const Register z = r2; 4711 const Register y = r4; // == x 4712 const Register ylen = r5; // == xlen 4713 4714 const Register tmp0 = r3; 4715 const Register tmp1 = r10; 4716 const Register tmp2 = r11; 4717 const Register tmp3 = r12; 4718 const Register tmp4 = r13; 4719 const Register tmp5 = r14; 4720 const Register tmp6 = r15; 4721 const Register tmp7 = r16; 4722 4723 RegSet spilled_regs = RegSet::of(y, ylen); 4724 BLOCK_COMMENT("Entry:"); 4725 __ enter(); 4726 __ push(spilled_regs, sp); 4727 __ mov(y, x); 4728 __ mov(ylen, xlen); 4729 __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 4730 __ pop(spilled_regs, sp); 4731 __ leave(); 4732 __ ret(lr); 4733 return start; 4734 } 4735 4736 address generate_mulAdd() { 4737 __ align(CodeEntryAlignment); 4738 StubCodeMark mark(this, 
"StubRoutines", "mulAdd"); 4739 4740 address start = __ pc(); 4741 4742 const Register out = r0; 4743 const Register in = r1; 4744 const Register offset = r2; 4745 const Register len = r3; 4746 const Register k = r4; 4747 4748 BLOCK_COMMENT("Entry:"); 4749 __ enter(); 4750 __ mul_add(out, in, offset, len, k); 4751 __ leave(); 4752 __ ret(lr); 4753 4754 return start; 4755 } 4756 4757 // Arguments: 4758 // 4759 // Input: 4760 // c_rarg0 - newArr address 4761 // c_rarg1 - oldArr address 4762 // c_rarg2 - newIdx 4763 // c_rarg3 - shiftCount 4764 // c_rarg4 - numIter 4765 // 4766 address generate_bigIntegerRightShift() { 4767 __ align(CodeEntryAlignment); 4768 StubCodeMark mark(this, "StubRoutines", "bigIntegerRightShiftWorker"); 4769 address start = __ pc(); 4770 4771 Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit; 4772 4773 Register newArr = c_rarg0; 4774 Register oldArr = c_rarg1; 4775 Register newIdx = c_rarg2; 4776 Register shiftCount = c_rarg3; 4777 Register numIter = c_rarg4; 4778 Register idx = numIter; 4779 4780 Register newArrCur = rscratch1; 4781 Register shiftRevCount = rscratch2; 4782 Register oldArrCur = r13; 4783 Register oldArrNext = r14; 4784 4785 FloatRegister oldElem0 = v0; 4786 FloatRegister oldElem1 = v1; 4787 FloatRegister newElem = v2; 4788 FloatRegister shiftVCount = v3; 4789 FloatRegister shiftVRevCount = v4; 4790 4791 __ cbz(idx, Exit); 4792 4793 __ add(newArr, newArr, newIdx, Assembler::LSL, 2); 4794 4795 // left shift count 4796 __ movw(shiftRevCount, 32); 4797 __ subw(shiftRevCount, shiftRevCount, shiftCount); 4798 4799 // numIter too small to allow a 4-words SIMD loop, rolling back 4800 __ cmp(numIter, (u1)4); 4801 __ br(Assembler::LT, ShiftThree); 4802 4803 __ dup(shiftVCount, __ T4S, shiftCount); 4804 __ dup(shiftVRevCount, __ T4S, shiftRevCount); 4805 __ negr(shiftVCount, __ T4S, shiftVCount); 4806 4807 __ BIND(ShiftSIMDLoop); 4808 4809 // Calculate the load addresses 4810 __ sub(idx, idx, 4); 4811 __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2); 4812 __ add(newArrCur, newArr, idx, Assembler::LSL, 2); 4813 __ add(oldArrCur, oldArrNext, 4); 4814 4815 // Load 4 words and process 4816 __ ld1(oldElem0, __ T4S, Address(oldArrCur)); 4817 __ ld1(oldElem1, __ T4S, Address(oldArrNext)); 4818 __ ushl(oldElem0, __ T4S, oldElem0, shiftVCount); 4819 __ ushl(oldElem1, __ T4S, oldElem1, shiftVRevCount); 4820 __ orr(newElem, __ T16B, oldElem0, oldElem1); 4821 __ st1(newElem, __ T4S, Address(newArrCur)); 4822 4823 __ cmp(idx, (u1)4); 4824 __ br(Assembler::LT, ShiftTwoLoop); 4825 __ b(ShiftSIMDLoop); 4826 4827 __ BIND(ShiftTwoLoop); 4828 __ cbz(idx, Exit); 4829 __ cmp(idx, (u1)1); 4830 __ br(Assembler::EQ, ShiftOne); 4831 4832 // Calculate the load addresses 4833 __ sub(idx, idx, 2); 4834 __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2); 4835 __ add(newArrCur, newArr, idx, Assembler::LSL, 2); 4836 __ add(oldArrCur, oldArrNext, 4); 4837 4838 // Load 2 words and process 4839 __ ld1(oldElem0, __ T2S, Address(oldArrCur)); 4840 __ ld1(oldElem1, __ T2S, Address(oldArrNext)); 4841 __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount); 4842 __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount); 4843 __ orr(newElem, __ T8B, oldElem0, oldElem1); 4844 __ st1(newElem, __ T2S, Address(newArrCur)); 4845 __ b(ShiftTwoLoop); 4846 4847 __ BIND(ShiftThree); 4848 __ tbz(idx, 1, ShiftOne); 4849 __ tbz(idx, 0, ShiftTwo); 4850 __ ldrw(r10, Address(oldArr, 12)); 4851 __ ldrw(r11, Address(oldArr, 8)); 4852 __ lsrvw(r10, r10, shiftCount); 4853 __ lslvw(r11, r11, shiftRevCount); 
4854 __ orrw(r12, r10, r11); 4855 __ strw(r12, Address(newArr, 8)); 4856 4857 __ BIND(ShiftTwo); 4858 __ ldrw(r10, Address(oldArr, 8)); 4859 __ ldrw(r11, Address(oldArr, 4)); 4860 __ lsrvw(r10, r10, shiftCount); 4861 __ lslvw(r11, r11, shiftRevCount); 4862 __ orrw(r12, r10, r11); 4863 __ strw(r12, Address(newArr, 4)); 4864 4865 __ BIND(ShiftOne); 4866 __ ldrw(r10, Address(oldArr, 4)); 4867 __ ldrw(r11, Address(oldArr)); 4868 __ lsrvw(r10, r10, shiftCount); 4869 __ lslvw(r11, r11, shiftRevCount); 4870 __ orrw(r12, r10, r11); 4871 __ strw(r12, Address(newArr)); 4872 4873 __ BIND(Exit); 4874 __ ret(lr); 4875 4876 return start; 4877 } 4878 4879 // Arguments: 4880 // 4881 // Input: 4882 // c_rarg0 - newArr address 4883 // c_rarg1 - oldArr address 4884 // c_rarg2 - newIdx 4885 // c_rarg3 - shiftCount 4886 // c_rarg4 - numIter 4887 // 4888 address generate_bigIntegerLeftShift() { 4889 __ align(CodeEntryAlignment); 4890 StubCodeMark mark(this, "StubRoutines", "bigIntegerLeftShiftWorker"); 4891 address start = __ pc(); 4892 4893 Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit; 4894 4895 Register newArr = c_rarg0; 4896 Register oldArr = c_rarg1; 4897 Register newIdx = c_rarg2; 4898 Register shiftCount = c_rarg3; 4899 Register numIter = c_rarg4; 4900 4901 Register shiftRevCount = rscratch1; 4902 Register oldArrNext = rscratch2; 4903 4904 FloatRegister oldElem0 = v0; 4905 FloatRegister oldElem1 = v1; 4906 FloatRegister newElem = v2; 4907 FloatRegister shiftVCount = v3; 4908 FloatRegister shiftVRevCount = v4; 4909 4910 __ cbz(numIter, Exit); 4911 4912 __ add(oldArrNext, oldArr, 4); 4913 __ add(newArr, newArr, newIdx, Assembler::LSL, 2); 4914 4915 // right shift count 4916 __ movw(shiftRevCount, 32); 4917 __ subw(shiftRevCount, shiftRevCount, shiftCount); 4918 4919 // numIter too small to allow a 4-words SIMD loop, rolling back 4920 __ cmp(numIter, (u1)4); 4921 __ br(Assembler::LT, ShiftThree); 4922 4923 __ dup(shiftVCount, __ T4S, shiftCount); 4924 __ dup(shiftVRevCount, __ T4S, shiftRevCount); 4925 __ negr(shiftVRevCount, __ T4S, shiftVRevCount); 4926 4927 __ BIND(ShiftSIMDLoop); 4928 4929 // load 4 words and process 4930 __ ld1(oldElem0, __ T4S, __ post(oldArr, 16)); 4931 __ ld1(oldElem1, __ T4S, __ post(oldArrNext, 16)); 4932 __ ushl(oldElem0, __ T4S, oldElem0, shiftVCount); 4933 __ ushl(oldElem1, __ T4S, oldElem1, shiftVRevCount); 4934 __ orr(newElem, __ T16B, oldElem0, oldElem1); 4935 __ st1(newElem, __ T4S, __ post(newArr, 16)); 4936 __ sub(numIter, numIter, 4); 4937 4938 __ cmp(numIter, (u1)4); 4939 __ br(Assembler::LT, ShiftTwoLoop); 4940 __ b(ShiftSIMDLoop); 4941 4942 __ BIND(ShiftTwoLoop); 4943 __ cbz(numIter, Exit); 4944 __ cmp(numIter, (u1)1); 4945 __ br(Assembler::EQ, ShiftOne); 4946 4947 // load 2 words and process 4948 __ ld1(oldElem0, __ T2S, __ post(oldArr, 8)); 4949 __ ld1(oldElem1, __ T2S, __ post(oldArrNext, 8)); 4950 __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount); 4951 __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount); 4952 __ orr(newElem, __ T8B, oldElem0, oldElem1); 4953 __ st1(newElem, __ T2S, __ post(newArr, 8)); 4954 __ sub(numIter, numIter, 2); 4955 __ b(ShiftTwoLoop); 4956 4957 __ BIND(ShiftThree); 4958 __ ldrw(r10, __ post(oldArr, 4)); 4959 __ ldrw(r11, __ post(oldArrNext, 4)); 4960 __ lslvw(r10, r10, shiftCount); 4961 __ lsrvw(r11, r11, shiftRevCount); 4962 __ orrw(r12, r10, r11); 4963 __ strw(r12, __ post(newArr, 4)); 4964 __ tbz(numIter, 1, Exit); 4965 __ tbz(numIter, 0, ShiftOne); 4966 4967 __ BIND(ShiftTwo); 4968 __ ldrw(r10, __ post(oldArr, 
4)); 4969 __ ldrw(r11, __ post(oldArrNext, 4)); 4970 __ lslvw(r10, r10, shiftCount); 4971 __ lsrvw(r11, r11, shiftRevCount); 4972 __ orrw(r12, r10, r11); 4973 __ strw(r12, __ post(newArr, 4)); 4974 4975 __ BIND(ShiftOne); 4976 __ ldrw(r10, Address(oldArr)); 4977 __ ldrw(r11, Address(oldArrNext)); 4978 __ lslvw(r10, r10, shiftCount); 4979 __ lsrvw(r11, r11, shiftRevCount); 4980 __ orrw(r12, r10, r11); 4981 __ strw(r12, Address(newArr)); 4982 4983 __ BIND(Exit); 4984 __ ret(lr); 4985 4986 return start; 4987 } 4988 4989 address generate_count_positives(address &count_positives_long) { 4990 const u1 large_loop_size = 64; 4991 const uint64_t UPPER_BIT_MASK=0x8080808080808080; 4992 int dcache_line = VM_Version::dcache_line_size(); 4993 4994 Register ary1 = r1, len = r2, result = r0; 4995 4996 __ align(CodeEntryAlignment); 4997 4998 StubCodeMark mark(this, "StubRoutines", "count_positives"); 4999 5000 address entry = __ pc(); 5001 5002 __ enter(); 5003 // precondition: a copy of len is already in result 5004 // __ mov(result, len); 5005 5006 Label RET_ADJUST, RET_ADJUST_16, RET_ADJUST_LONG, RET_NO_POP, RET_LEN, ALIGNED, LOOP16, CHECK_16, 5007 LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL; 5008 5009 __ cmp(len, (u1)15); 5010 __ br(Assembler::GT, LEN_OVER_15); 5011 // The only case when execution falls into this code is when pointer is near 5012 // the end of memory page and we have to avoid reading next page 5013 __ add(ary1, ary1, len); 5014 __ subs(len, len, 8); 5015 __ br(Assembler::GT, LEN_OVER_8); 5016 __ ldr(rscratch2, Address(ary1, -8)); 5017 __ sub(rscratch1, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes. 5018 __ lsrv(rscratch2, rscratch2, rscratch1); 5019 __ tst(rscratch2, UPPER_BIT_MASK); 5020 __ csel(result, zr, result, Assembler::NE); 5021 __ leave(); 5022 __ ret(lr); 5023 __ bind(LEN_OVER_8); 5024 __ ldp(rscratch1, rscratch2, Address(ary1, -16)); 5025 __ sub(len, len, 8); // no data dep., then sub can be executed while loading 5026 __ tst(rscratch2, UPPER_BIT_MASK); 5027 __ br(Assembler::NE, RET_NO_POP); 5028 __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes 5029 __ lsrv(rscratch1, rscratch1, rscratch2); 5030 __ tst(rscratch1, UPPER_BIT_MASK); 5031 __ bind(RET_NO_POP); 5032 __ csel(result, zr, result, Assembler::NE); 5033 __ leave(); 5034 __ ret(lr); 5035 5036 Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10; 5037 const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6; 5038 5039 count_positives_long = __ pc(); // 2nd entry point 5040 5041 __ enter(); 5042 5043 __ bind(LEN_OVER_15); 5044 __ push(spilled_regs, sp); 5045 __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment 5046 __ cbz(rscratch2, ALIGNED); 5047 __ ldp(tmp6, tmp1, Address(ary1)); 5048 __ mov(tmp5, 16); 5049 __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address 5050 __ add(ary1, ary1, rscratch1); 5051 __ orr(tmp6, tmp6, tmp1); 5052 __ tst(tmp6, UPPER_BIT_MASK); 5053 __ br(Assembler::NE, RET_ADJUST); 5054 __ sub(len, len, rscratch1); 5055 5056 __ bind(ALIGNED); 5057 __ cmp(len, large_loop_size); 5058 __ br(Assembler::LT, CHECK_16); 5059 // Perform 16-byte load as early return in pre-loop to handle situation 5060 // when initially aligned large array has negative values at starting bytes, 5061 // so LARGE_LOOP would do 4 reads instead of 1 (in worst case), which is 5062 // slower. Cases with negative bytes further ahead won't be affected that 5063 // much. 
In fact, it'll be faster due to early loads, fewer instructions and 5064 // fewer branches in LARGE_LOOP. 5065 __ ldp(tmp6, tmp1, Address(__ post(ary1, 16))); 5066 __ sub(len, len, 16); 5067 __ orr(tmp6, tmp6, tmp1); 5068 __ tst(tmp6, UPPER_BIT_MASK); 5069 __ br(Assembler::NE, RET_ADJUST_16); 5070 __ cmp(len, large_loop_size); 5071 __ br(Assembler::LT, CHECK_16); 5072 5073 if (SoftwarePrefetchHintDistance >= 0 5074 && SoftwarePrefetchHintDistance >= dcache_line) { 5075 // initial prefetch 5076 __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line)); 5077 } 5078 __ bind(LARGE_LOOP); 5079 if (SoftwarePrefetchHintDistance >= 0) { 5080 __ prfm(Address(ary1, SoftwarePrefetchHintDistance)); 5081 } 5082 // Issue the load instructions first, since this can save a few CPU/MEM cycles; also, 5083 // instead of 4 triples of "orr(...), addr(...);cbnz(...);" (one per ldp) it is 5084 // better to generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...), which saves 3 5085 // instructions per iteration and has fewer branches, but this approach disables 5086 // early return, so all 64 bytes are loaded and checked every time. 5087 __ ldp(tmp2, tmp3, Address(ary1)); 5088 __ ldp(tmp4, tmp5, Address(ary1, 16)); 5089 __ ldp(rscratch1, rscratch2, Address(ary1, 32)); 5090 __ ldp(tmp6, tmp1, Address(ary1, 48)); 5091 __ add(ary1, ary1, large_loop_size); 5092 __ sub(len, len, large_loop_size); 5093 __ orr(tmp2, tmp2, tmp3); 5094 __ orr(tmp4, tmp4, tmp5); 5095 __ orr(rscratch1, rscratch1, rscratch2); 5096 __ orr(tmp6, tmp6, tmp1); 5097 __ orr(tmp2, tmp2, tmp4); 5098 __ orr(rscratch1, rscratch1, tmp6); 5099 __ orr(tmp2, tmp2, rscratch1); 5100 __ tst(tmp2, UPPER_BIT_MASK); 5101 __ br(Assembler::NE, RET_ADJUST_LONG); 5102 __ cmp(len, large_loop_size); 5103 __ br(Assembler::GE, LARGE_LOOP); 5104 5105 __ bind(CHECK_16); // small 16-byte load pre-loop 5106 __ cmp(len, (u1)16); 5107 __ br(Assembler::LT, POST_LOOP16); 5108 5109 __ bind(LOOP16); // small 16-byte load loop 5110 __ ldp(tmp2, tmp3, Address(__ post(ary1, 16))); 5111 __ sub(len, len, 16); 5112 __ orr(tmp2, tmp2, tmp3); 5113 __ tst(tmp2, UPPER_BIT_MASK); 5114 __ br(Assembler::NE, RET_ADJUST_16); 5115 __ cmp(len, (u1)16); 5116 __ br(Assembler::GE, LOOP16); // 16-byte load loop end 5117 5118 __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally 5119 __ cmp(len, (u1)8); 5120 __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL); 5121 __ ldr(tmp3, Address(__ post(ary1, 8))); 5122 __ tst(tmp3, UPPER_BIT_MASK); 5123 __ br(Assembler::NE, RET_ADJUST); 5124 __ sub(len, len, 8); 5125 5126 __ bind(POST_LOOP16_LOAD_TAIL); 5127 __ cbz(len, RET_LEN); // Can't shift left by 64 when len==0 5128 __ ldr(tmp1, Address(ary1)); 5129 __ mov(tmp2, 64); 5130 __ sub(tmp4, tmp2, len, __ LSL, 3); 5131 __ lslv(tmp1, tmp1, tmp4); 5132 __ tst(tmp1, UPPER_BIT_MASK); 5133 __ br(Assembler::NE, RET_ADJUST); 5134 // Fallthrough 5135 5136 __ bind(RET_LEN); 5137 __ pop(spilled_regs, sp); 5138 __ leave(); 5139 __ ret(lr); 5140 5141 // The difference, result - len, is the count of bytes that are 5142 // guaranteed to be positive. 5143 5144 __ bind(RET_ADJUST_LONG); 5145 __ add(len, len, (u1)(large_loop_size - 16)); 5146 __ bind(RET_ADJUST_16); 5147 __ add(len, len, 16); 5148 __ bind(RET_ADJUST); 5149 __ pop(spilled_regs, sp); 5150 __ leave(); 5151 __ sub(result, result, len); 5152 __ ret(lr); 5153 5154 return entry; 5155 } 5156 5157 void generate_large_array_equals_loop_nonsimd(int loopThreshold, 5158 bool usePrefetch, Label &NOT_EQUAL) { 5159 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 5160 tmp2 =
rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11, 5161 tmp7 = r12, tmp8 = r13; 5162 Label LOOP; 5163 5164 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 5165 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 5166 __ bind(LOOP); 5167 if (usePrefetch) { 5168 __ prfm(Address(a1, SoftwarePrefetchHintDistance)); 5169 __ prfm(Address(a2, SoftwarePrefetchHintDistance)); 5170 } 5171 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize))); 5172 __ eor(tmp1, tmp1, tmp2); 5173 __ eor(tmp3, tmp3, tmp4); 5174 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize))); 5175 __ orr(tmp1, tmp1, tmp3); 5176 __ cbnz(tmp1, NOT_EQUAL); 5177 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 5178 __ eor(tmp5, tmp5, tmp6); 5179 __ eor(tmp7, tmp7, tmp8); 5180 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 5181 __ orr(tmp5, tmp5, tmp7); 5182 __ cbnz(tmp5, NOT_EQUAL); 5183 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize))); 5184 __ eor(tmp1, tmp1, tmp2); 5185 __ eor(tmp3, tmp3, tmp4); 5186 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize))); 5187 __ orr(tmp1, tmp1, tmp3); 5188 __ cbnz(tmp1, NOT_EQUAL); 5189 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 5190 __ eor(tmp5, tmp5, tmp6); 5191 __ sub(cnt1, cnt1, 8 * wordSize); 5192 __ eor(tmp7, tmp7, tmp8); 5193 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 5194 // tmp6 is not used. MacroAssembler::subs is used here (rather than 5195 // cmp) because subs allows an unlimited range of immediate operand. 5196 __ subs(tmp6, cnt1, loopThreshold); 5197 __ orr(tmp5, tmp5, tmp7); 5198 __ cbnz(tmp5, NOT_EQUAL); 5199 __ br(__ GE, LOOP); 5200 // post-loop 5201 __ eor(tmp1, tmp1, tmp2); 5202 __ eor(tmp3, tmp3, tmp4); 5203 __ orr(tmp1, tmp1, tmp3); 5204 __ sub(cnt1, cnt1, 2 * wordSize); 5205 __ cbnz(tmp1, NOT_EQUAL); 5206 } 5207 5208 void generate_large_array_equals_loop_simd(int loopThreshold, 5209 bool usePrefetch, Label &NOT_EQUAL) { 5210 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 5211 tmp2 = rscratch2; 5212 Label LOOP; 5213 5214 __ bind(LOOP); 5215 if (usePrefetch) { 5216 __ prfm(Address(a1, SoftwarePrefetchHintDistance)); 5217 __ prfm(Address(a2, SoftwarePrefetchHintDistance)); 5218 } 5219 __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize))); 5220 __ sub(cnt1, cnt1, 8 * wordSize); 5221 __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize))); 5222 __ subs(tmp1, cnt1, loopThreshold); 5223 __ eor(v0, __ T16B, v0, v4); 5224 __ eor(v1, __ T16B, v1, v5); 5225 __ eor(v2, __ T16B, v2, v6); 5226 __ eor(v3, __ T16B, v3, v7); 5227 __ orr(v0, __ T16B, v0, v1); 5228 __ orr(v1, __ T16B, v2, v3); 5229 __ orr(v0, __ T16B, v0, v1); 5230 __ umov(tmp1, v0, __ D, 0); 5231 __ umov(tmp2, v0, __ D, 1); 5232 __ orr(tmp1, tmp1, tmp2); 5233 __ cbnz(tmp1, NOT_EQUAL); 5234 __ br(__ GE, LOOP); 5235 } 5236 5237 // a1 = r1 - array1 address 5238 // a2 = r2 - array2 address 5239 // result = r0 - return value. Already contains "false" 5240 // cnt1 = r10 - amount of elements left to check, reduced by wordSize 5241 // r3-r5 are reserved temporary registers 5242 // Clobbers: v0-v7 when UseSIMDForArrayEquals, rscratch1, rscratch2 5243 address generate_large_array_equals() { 5244 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 5245 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11, 5246 tmp7 = r12, tmp8 = r13; 5247 Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP, 5248 SMALL_LOOP, POST_LOOP; 5249 const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 
0 : 16; 5250 // loop threshold is chosen so that at least 32 of the prefetched bytes are used 5251 int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32; 5252 int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE); 5253 RegSet spilled_regs = RegSet::range(tmp6, tmp8); 5254 assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4, 5255 tmp5, tmp6, tmp7, tmp8); 5256 5257 __ align(CodeEntryAlignment); 5258 5259 StubCodeMark mark(this, "StubRoutines", "large_array_equals"); 5260 5261 address entry = __ pc(); 5262 __ enter(); 5263 __ sub(cnt1, cnt1, wordSize); // first 8 bytes were loaded outside of the stub 5264 // also advance pointers to use post-increment instead of pre-increment 5265 __ add(a1, a1, wordSize); 5266 __ add(a2, a2, wordSize); 5267 if (AvoidUnalignedAccesses) { 5268 // Both implementations (SIMD/non-SIMD) use relatively large load 5269 // instructions (ld1/ldp), which have a huge penalty (up to 2x execution time) 5270 // on some CPUs when the address is not at least 16-byte aligned. 5271 // Arrays are currently 8-byte aligned, so, if needed, we do one additional 5272 // 8-byte load to make at least the 1st address 16-byte aligned. 5273 Label ALIGNED16; 5274 __ tbz(a1, 3, ALIGNED16); 5275 __ ldr(tmp1, Address(__ post(a1, wordSize))); 5276 __ ldr(tmp2, Address(__ post(a2, wordSize))); 5277 __ sub(cnt1, cnt1, wordSize); 5278 __ eor(tmp1, tmp1, tmp2); 5279 __ cbnz(tmp1, NOT_EQUAL_NO_POP); 5280 __ bind(ALIGNED16); 5281 } 5282 if (UseSIMDForArrayEquals) { 5283 if (SoftwarePrefetchHintDistance >= 0) { 5284 __ subs(tmp1, cnt1, prefetchLoopThreshold); 5285 __ br(__ LE, NO_PREFETCH_LARGE_LOOP); 5286 generate_large_array_equals_loop_simd(prefetchLoopThreshold, 5287 /* prfm = */ true, NOT_EQUAL); 5288 __ subs(zr, cnt1, nonPrefetchLoopThreshold); 5289 __ br(__ LT, TAIL); 5290 } 5291 __ bind(NO_PREFETCH_LARGE_LOOP); 5292 generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold, 5293 /* prfm = */ false, NOT_EQUAL); 5294 } else { 5295 __ push(spilled_regs, sp); 5296 if (SoftwarePrefetchHintDistance >= 0) { 5297 __ subs(tmp1, cnt1, prefetchLoopThreshold); 5298 __ br(__ LE, NO_PREFETCH_LARGE_LOOP); 5299 generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold, 5300 /* prfm = */ true, NOT_EQUAL); 5301 __ subs(zr, cnt1, nonPrefetchLoopThreshold); 5302 __ br(__ LT, TAIL); 5303 } 5304 __ bind(NO_PREFETCH_LARGE_LOOP); 5305 generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold, 5306 /* prfm = */ false, NOT_EQUAL); 5307 } 5308 __ bind(TAIL); 5309 __ cbz(cnt1, EQUAL); 5310 __ subs(cnt1, cnt1, wordSize); 5311 __ br(__ LE, POST_LOOP); 5312 __ bind(SMALL_LOOP); 5313 __ ldr(tmp1, Address(__ post(a1, wordSize))); 5314 __ ldr(tmp2, Address(__ post(a2, wordSize))); 5315 __ subs(cnt1, cnt1, wordSize); 5316 __ eor(tmp1, tmp1, tmp2); 5317 __ cbnz(tmp1, NOT_EQUAL); 5318 __ br(__ GT, SMALL_LOOP); 5319 __ bind(POST_LOOP); 5320 __ ldr(tmp1, Address(a1, cnt1)); 5321 __ ldr(tmp2, Address(a2, cnt1)); 5322 __ eor(tmp1, tmp1, tmp2); 5323 __ cbnz(tmp1, NOT_EQUAL); 5324 __ bind(EQUAL); 5325 __ mov(result, true); 5326 __ bind(NOT_EQUAL); 5327 if (!UseSIMDForArrayEquals) { 5328 __ pop(spilled_regs, sp); 5329 } 5330 __ bind(NOT_EQUAL_NO_POP); 5331 __ leave(); 5332 __ ret(lr); 5333 return entry; 5334 } 5335 5336 address generate_dsin_dcos(bool isCos) { 5337 __ align(CodeEntryAlignment); 5338 StubCodeMark mark(this, "StubRoutines", isCos ?
"libmDcos" : "libmDsin"); 5339 address start = __ pc(); 5340 __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw, 5341 (address)StubRoutines::aarch64::_two_over_pi, 5342 (address)StubRoutines::aarch64::_pio2, 5343 (address)StubRoutines::aarch64::_dsin_coef, 5344 (address)StubRoutines::aarch64::_dcos_coef); 5345 return start; 5346 } 5347 5348 // code for comparing 16 characters of strings with Latin1 and Utf16 encoding 5349 void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1, 5350 Label &DIFF2) { 5351 Register cnt1 = r2, tmp2 = r11, tmp3 = r12; 5352 FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2; 5353 5354 __ ldrq(vtmp, Address(__ post(tmp2, 16))); 5355 __ ldr(tmpU, Address(__ post(cnt1, 8))); 5356 __ zip1(vtmp3, __ T16B, vtmp, vtmpZ); 5357 // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3 5358 5359 __ fmovd(tmpL, vtmp3); 5360 __ eor(rscratch2, tmp3, tmpL); 5361 __ cbnz(rscratch2, DIFF2); 5362 5363 __ ldr(tmp3, Address(__ post(cnt1, 8))); 5364 __ umov(tmpL, vtmp3, __ D, 1); 5365 __ eor(rscratch2, tmpU, tmpL); 5366 __ cbnz(rscratch2, DIFF1); 5367 5368 __ zip2(vtmp, __ T16B, vtmp, vtmpZ); 5369 __ ldr(tmpU, Address(__ post(cnt1, 8))); 5370 __ fmovd(tmpL, vtmp); 5371 __ eor(rscratch2, tmp3, tmpL); 5372 __ cbnz(rscratch2, DIFF2); 5373 5374 __ ldr(tmp3, Address(__ post(cnt1, 8))); 5375 __ umov(tmpL, vtmp, __ D, 1); 5376 __ eor(rscratch2, tmpU, tmpL); 5377 __ cbnz(rscratch2, DIFF1); 5378 } 5379 5380 // r0 = result 5381 // r1 = str1 5382 // r2 = cnt1 5383 // r3 = str2 5384 // r4 = cnt2 5385 // r10 = tmp1 5386 // r11 = tmp2 5387 address generate_compare_long_string_different_encoding(bool isLU) { 5388 __ align(CodeEntryAlignment); 5389 StubCodeMark mark(this, "StubRoutines", isLU 5390 ? "compare_long_string_different_encoding LU" 5391 : "compare_long_string_different_encoding UL"); 5392 address entry = __ pc(); 5393 Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2, 5394 DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH, 5395 LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2; 5396 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 5397 tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14; 5398 FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2; 5399 RegSet spilled_regs = RegSet::of(tmp3, tmp4); 5400 5401 int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2); 5402 5403 __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ); 5404 // cnt2 == amount of characters left to compare 5405 // Check already loaded first 4 symbols(vtmp and tmp2(LU)/tmp1(UL)) 5406 __ zip1(vtmp, __ T8B, vtmp, vtmpZ); 5407 __ add(str1, str1, isLU ? wordSize/2 : wordSize); 5408 __ add(str2, str2, isLU ? wordSize : wordSize/2); 5409 __ fmovd(isLU ? tmp1 : tmp2, vtmp); 5410 __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case. 5411 __ eor(rscratch2, tmp1, tmp2); 5412 __ mov(rscratch1, tmp2); 5413 __ cbnz(rscratch2, CALCULATE_DIFFERENCE); 5414 Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison 5415 tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison 5416 __ push(spilled_regs, sp); 5417 __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load 5418 __ mov(cnt1, isLU ? 
str2 : str1); // init the pointer to U next load 5419 5420 __ ldr(tmp3, Address(__ post(cnt1, 8))); 5421 5422 if (SoftwarePrefetchHintDistance >= 0) { 5423 __ subs(rscratch2, cnt2, prefetchLoopExitCondition); 5424 __ br(__ LT, NO_PREFETCH); 5425 __ bind(LARGE_LOOP_PREFETCH); 5426 __ prfm(Address(tmp2, SoftwarePrefetchHintDistance)); 5427 __ mov(tmp4, 2); 5428 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance)); 5429 __ bind(LARGE_LOOP_PREFETCH_REPEAT1); 5430 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 5431 __ subs(tmp4, tmp4, 1); 5432 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1); 5433 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance)); 5434 __ mov(tmp4, 2); 5435 __ bind(LARGE_LOOP_PREFETCH_REPEAT2); 5436 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 5437 __ subs(tmp4, tmp4, 1); 5438 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2); 5439 __ sub(cnt2, cnt2, 64); 5440 __ subs(rscratch2, cnt2, prefetchLoopExitCondition); 5441 __ br(__ GE, LARGE_LOOP_PREFETCH); 5442 } 5443 __ cbz(cnt2, LOAD_LAST); // no characters left except last load 5444 __ bind(NO_PREFETCH); 5445 __ subs(cnt2, cnt2, 16); 5446 __ br(__ LT, TAIL); 5447 __ align(OptoLoopAlignment); 5448 __ bind(SMALL_LOOP); // smaller loop 5449 __ subs(cnt2, cnt2, 16); 5450 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 5451 __ br(__ GE, SMALL_LOOP); 5452 __ cmn(cnt2, (u1)16); 5453 __ br(__ EQ, LOAD_LAST); 5454 __ bind(TAIL); // 1..15 characters left until last load (last 4 characters) 5455 __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string 5456 __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string 5457 __ ldr(tmp3, Address(cnt1, -8)); 5458 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load 5459 __ b(LOAD_LAST); 5460 __ bind(DIFF2); 5461 __ mov(tmpU, tmp3); 5462 __ bind(DIFF1); 5463 __ pop(spilled_regs, sp); 5464 __ b(CALCULATE_DIFFERENCE); 5465 __ bind(LOAD_LAST); 5466 // Last 4 UTF-16 characters are already pre-loaded into tmp3 by compare_string_16_x_LU. 5467 // No need to load it again 5468 __ mov(tmpU, tmp3); 5469 __ pop(spilled_regs, sp); 5470 5471 // tmp2 points to the address of the last 4 Latin1 characters right now 5472 __ ldrs(vtmp, Address(tmp2)); 5473 __ zip1(vtmp, __ T8B, vtmp, vtmpZ); 5474 __ fmovd(tmpL, vtmp); 5475 5476 __ eor(rscratch2, tmpU, tmpL); 5477 __ cbz(rscratch2, DONE); 5478 5479 // Find the first different characters in the longwords and 5480 // compute their difference. 
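// At this point rscratch2 holds the XOR of the two 8-byte groups being
// compared (both already widened to UTF-16), so its lowest-order non-zero
// byte marks the first mismatch. rev + clz turn that into the bit offset of
// the mismatch counted from the least-significant end, andr(..., -16) rounds
// the offset down to a 16-bit character boundary, and lsrv + uxthw extract
// that character from each operand so subw can produce the signed difference.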
5481 __ bind(CALCULATE_DIFFERENCE); 5482 __ rev(rscratch2, rscratch2); 5483 __ clz(rscratch2, rscratch2); 5484 __ andr(rscratch2, rscratch2, -16); 5485 __ lsrv(tmp1, tmp1, rscratch2); 5486 __ uxthw(tmp1, tmp1); 5487 __ lsrv(rscratch1, rscratch1, rscratch2); 5488 __ uxthw(rscratch1, rscratch1); 5489 __ subw(result, tmp1, rscratch1); 5490 __ bind(DONE); 5491 __ ret(lr); 5492 return entry; 5493 } 5494 5495 // r0 = input (float16) 5496 // v0 = result (float) 5497 // v1 = temporary float register 5498 address generate_float16ToFloat() { 5499 __ align(CodeEntryAlignment); 5500 StubCodeMark mark(this, "StubRoutines", "float16ToFloat"); 5501 address entry = __ pc(); 5502 BLOCK_COMMENT("Entry:"); 5503 __ flt16_to_flt(v0, r0, v1); 5504 __ ret(lr); 5505 return entry; 5506 } 5507 5508 // v0 = input (float) 5509 // r0 = result (float16) 5510 // v1 = temporary float register 5511 address generate_floatToFloat16() { 5512 __ align(CodeEntryAlignment); 5513 StubCodeMark mark(this, "StubRoutines", "floatToFloat16"); 5514 address entry = __ pc(); 5515 BLOCK_COMMENT("Entry:"); 5516 __ flt_to_flt16(r0, v0, v1); 5517 __ ret(lr); 5518 return entry; 5519 } 5520 5521 address generate_method_entry_barrier() { 5522 __ align(CodeEntryAlignment); 5523 StubCodeMark mark(this, "StubRoutines", "nmethod_entry_barrier"); 5524 5525 Label deoptimize_label; 5526 5527 address start = __ pc(); 5528 5529 BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler(); 5530 5531 if (bs_asm->nmethod_patching_type() == NMethodPatchingType::conc_instruction_and_data_patch) { 5532 BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod(); 5533 // We can get here despite the nmethod being good, if we have not 5534 // yet applied our cross modification fence (or data fence). 5535 Address thread_epoch_addr(rthread, in_bytes(bs_nm->thread_disarmed_guard_value_offset()) + 4); 5536 __ lea(rscratch2, ExternalAddress(bs_asm->patching_epoch_addr())); 5537 __ ldrw(rscratch2, rscratch2); 5538 __ strw(rscratch2, thread_epoch_addr); 5539 __ isb(); 5540 __ membar(__ LoadLoad); 5541 } 5542 5543 __ set_last_Java_frame(sp, rfp, lr, rscratch1); 5544 5545 __ enter(); 5546 __ add(rscratch2, sp, wordSize); // rscratch2 points to the saved lr 5547 5548 __ sub(sp, sp, 4 * wordSize); // four words for the returned {sp, fp, lr, pc} 5549 5550 __ push_call_clobbered_registers(); 5551 5552 __ mov(c_rarg0, rscratch2); 5553 __ call_VM_leaf 5554 (CAST_FROM_FN_PTR 5555 (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1); 5556 5557 __ reset_last_Java_frame(true); 5558 5559 __ mov(rscratch1, r0); 5560 5561 __ pop_call_clobbered_registers(); 5562 5563 __ cbnz(rscratch1, deoptimize_label); 5564 5565 __ leave(); 5566 __ ret(lr); 5567 5568 __ BIND(deoptimize_label); 5569 5570 __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize)); 5571 __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize)); 5572 5573 __ mov(sp, rscratch1); 5574 __ br(rscratch2); 5575 5576 return start; 5577 } 5578 5579 // r0 = result 5580 // r1 = str1 5581 // r2 = cnt1 5582 // r3 = str2 5583 // r4 = cnt2 5584 // r10 = tmp1 5585 // r11 = tmp2 5586 address generate_compare_long_string_same_encoding(bool isLL) { 5587 __ align(CodeEntryAlignment); 5588 StubCodeMark mark(this, "StubRoutines", isLL 5589 ? 
"compare_long_string_same_encoding LL" 5590 : "compare_long_string_same_encoding UU"); 5591 address entry = __ pc(); 5592 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 5593 tmp1 = r10, tmp2 = r11, tmp1h = rscratch1, tmp2h = rscratch2; 5594 5595 Label LARGE_LOOP_PREFETCH, LOOP_COMPARE16, DIFF, LESS16, LESS8, CAL_DIFFERENCE, LENGTH_DIFF; 5596 5597 // exit from large loop when less than 64 bytes left to read or we're about 5598 // to prefetch memory behind array border 5599 int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2); 5600 5601 // before jumping to stub, pre-load 8 bytes already, so do comparison directly 5602 __ eor(rscratch2, tmp1, tmp2); 5603 __ cbnz(rscratch2, CAL_DIFFERENCE); 5604 5605 __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2)); 5606 // update pointers, because of previous read 5607 __ add(str1, str1, wordSize); 5608 __ add(str2, str2, wordSize); 5609 if (SoftwarePrefetchHintDistance >= 0) { 5610 __ align(OptoLoopAlignment); 5611 __ bind(LARGE_LOOP_PREFETCH); 5612 __ prfm(Address(str1, SoftwarePrefetchHintDistance)); 5613 __ prfm(Address(str2, SoftwarePrefetchHintDistance)); 5614 5615 for (int i = 0; i < 4; i++) { 5616 __ ldp(tmp1, tmp1h, Address(str1, i * 16)); 5617 __ ldp(tmp2, tmp2h, Address(str2, i * 16)); 5618 __ cmp(tmp1, tmp2); 5619 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ); 5620 __ br(Assembler::NE, DIFF); 5621 } 5622 __ sub(cnt2, cnt2, isLL ? 64 : 32); 5623 __ add(str1, str1, 64); 5624 __ add(str2, str2, 64); 5625 __ subs(rscratch2, cnt2, largeLoopExitCondition); 5626 __ br(Assembler::GE, LARGE_LOOP_PREFETCH); 5627 __ cbz(cnt2, LENGTH_DIFF); // no more chars left? 5628 } 5629 5630 __ subs(rscratch1, cnt2, isLL ? 16 : 8); 5631 __ br(Assembler::LE, LESS16); 5632 __ align(OptoLoopAlignment); 5633 __ bind(LOOP_COMPARE16); 5634 __ ldp(tmp1, tmp1h, Address(__ post(str1, 16))); 5635 __ ldp(tmp2, tmp2h, Address(__ post(str2, 16))); 5636 __ cmp(tmp1, tmp2); 5637 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ); 5638 __ br(Assembler::NE, DIFF); 5639 __ sub(cnt2, cnt2, isLL ? 16 : 8); 5640 __ subs(rscratch2, cnt2, isLL ? 16 : 8); 5641 __ br(Assembler::LT, LESS16); 5642 5643 __ ldp(tmp1, tmp1h, Address(__ post(str1, 16))); 5644 __ ldp(tmp2, tmp2h, Address(__ post(str2, 16))); 5645 __ cmp(tmp1, tmp2); 5646 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ); 5647 __ br(Assembler::NE, DIFF); 5648 __ sub(cnt2, cnt2, isLL ? 16 : 8); 5649 __ subs(rscratch2, cnt2, isLL ? 16 : 8); 5650 __ br(Assembler::GE, LOOP_COMPARE16); 5651 __ cbz(cnt2, LENGTH_DIFF); 5652 5653 __ bind(LESS16); 5654 // each 8 compare 5655 __ subs(cnt2, cnt2, isLL ? 8 : 4); 5656 __ br(Assembler::LE, LESS8); 5657 __ ldr(tmp1, Address(__ post(str1, 8))); 5658 __ ldr(tmp2, Address(__ post(str2, 8))); 5659 __ eor(rscratch2, tmp1, tmp2); 5660 __ cbnz(rscratch2, CAL_DIFFERENCE); 5661 __ sub(cnt2, cnt2, isLL ? 8 : 4); 5662 5663 __ bind(LESS8); // directly load last 8 bytes 5664 if (!isLL) { 5665 __ add(cnt2, cnt2, cnt2); 5666 } 5667 __ ldr(tmp1, Address(str1, cnt2)); 5668 __ ldr(tmp2, Address(str2, cnt2)); 5669 __ eor(rscratch2, tmp1, tmp2); 5670 __ cbz(rscratch2, LENGTH_DIFF); 5671 __ b(CAL_DIFFERENCE); 5672 5673 __ bind(DIFF); 5674 __ cmp(tmp1, tmp2); 5675 __ csel(tmp1, tmp1, tmp1h, Assembler::NE); 5676 __ csel(tmp2, tmp2, tmp2h, Assembler::NE); 5677 // reuse rscratch2 register for the result of eor instruction 5678 __ eor(rscratch2, tmp1, tmp2); 5679 5680 __ bind(CAL_DIFFERENCE); 5681 __ rev(rscratch2, rscratch2); 5682 __ clz(rscratch2, rscratch2); 5683 __ andr(rscratch2, rscratch2, isLL ? 
-8 : -16); 5684 __ lsrv(tmp1, tmp1, rscratch2); 5685 __ lsrv(tmp2, tmp2, rscratch2); 5686 if (isLL) { 5687 __ uxtbw(tmp1, tmp1); 5688 __ uxtbw(tmp2, tmp2); 5689 } else { 5690 __ uxthw(tmp1, tmp1); 5691 __ uxthw(tmp2, tmp2); 5692 } 5693 __ subw(result, tmp1, tmp2); 5694 5695 __ bind(LENGTH_DIFF); 5696 __ ret(lr); 5697 return entry; 5698 } 5699 5700 enum string_compare_mode { 5701 LL, 5702 LU, 5703 UL, 5704 UU, 5705 }; 5706 5707 // The following registers are declared in aarch64.ad 5708 // r0 = result 5709 // r1 = str1 5710 // r2 = cnt1 5711 // r3 = str2 5712 // r4 = cnt2 5713 // r10 = tmp1 5714 // r11 = tmp2 5715 // z0 = ztmp1 5716 // z1 = ztmp2 5717 // p0 = pgtmp1 5718 // p1 = pgtmp2 5719 address generate_compare_long_string_sve(string_compare_mode mode) { 5720 __ align(CodeEntryAlignment); 5721 address entry = __ pc(); 5722 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 5723 tmp1 = r10, tmp2 = r11; 5724 5725 Label LOOP, DONE, MISMATCH; 5726 Register vec_len = tmp1; 5727 Register idx = tmp2; 5728 // The minimum of the string lengths has been stored in cnt2. 5729 Register cnt = cnt2; 5730 FloatRegister ztmp1 = z0, ztmp2 = z1; 5731 PRegister pgtmp1 = p0, pgtmp2 = p1; 5732 5733 #define LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx) \ 5734 switch (mode) { \ 5735 case LL: \ 5736 __ sve_ld1b(ztmp1, __ B, pgtmp1, Address(str1, idx)); \ 5737 __ sve_ld1b(ztmp2, __ B, pgtmp1, Address(str2, idx)); \ 5738 break; \ 5739 case LU: \ 5740 __ sve_ld1b(ztmp1, __ H, pgtmp1, Address(str1, idx)); \ 5741 __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \ 5742 break; \ 5743 case UL: \ 5744 __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \ 5745 __ sve_ld1b(ztmp2, __ H, pgtmp1, Address(str2, idx)); \ 5746 break; \ 5747 case UU: \ 5748 __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \ 5749 __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \ 5750 break; \ 5751 default: \ 5752 ShouldNotReachHere(); \ 5753 } 5754 5755 const char* stubname; 5756 switch (mode) { 5757 case LL: stubname = "compare_long_string_same_encoding LL"; break; 5758 case LU: stubname = "compare_long_string_different_encoding LU"; break; 5759 case UL: stubname = "compare_long_string_different_encoding UL"; break; 5760 case UU: stubname = "compare_long_string_same_encoding UU"; break; 5761 default: ShouldNotReachHere(); 5762 } 5763 5764 StubCodeMark mark(this, "StubRoutines", stubname); 5765 5766 __ mov(idx, 0); 5767 __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt); 5768 5769 if (mode == LL) { 5770 __ sve_cntb(vec_len); 5771 } else { 5772 __ sve_cnth(vec_len); 5773 } 5774 5775 __ sub(rscratch1, cnt, vec_len); 5776 5777 __ bind(LOOP); 5778 5779 // main loop 5780 LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx); 5781 __ add(idx, idx, vec_len); 5782 // Compare strings. 5783 __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2); 5784 __ br(__ NE, MISMATCH); 5785 __ cmp(idx, rscratch1); 5786 __ br(__ LT, LOOP); 5787 5788 // post loop, last iteration 5789 __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt); 5790 5791 LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx); 5792 __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2); 5793 __ br(__ EQ, DONE); 5794 5795 __ bind(MISMATCH); 5796 5797 // Crop the vector to find its location. 5798 __ sve_brkb(pgtmp2, pgtmp1, pgtmp2, false /* isMerge */); 5799 // Extract the first different characters of each string. 
5800 __ sve_lasta(rscratch1, mode == LL ? __ B : __ H, pgtmp2, ztmp1); 5801 __ sve_lasta(rscratch2, mode == LL ? __ B : __ H, pgtmp2, ztmp2); 5802 5803 // Compute the difference of the first different characters. 5804 __ sub(result, rscratch1, rscratch2); 5805 5806 __ bind(DONE); 5807 __ ret(lr); 5808 #undef LOAD_PAIR 5809 return entry; 5810 } 5811 5812 void generate_compare_long_strings() { 5813 if (UseSVE == 0) { 5814 StubRoutines::aarch64::_compare_long_string_LL 5815 = generate_compare_long_string_same_encoding(true); 5816 StubRoutines::aarch64::_compare_long_string_UU 5817 = generate_compare_long_string_same_encoding(false); 5818 StubRoutines::aarch64::_compare_long_string_LU 5819 = generate_compare_long_string_different_encoding(true); 5820 StubRoutines::aarch64::_compare_long_string_UL 5821 = generate_compare_long_string_different_encoding(false); 5822 } else { 5823 StubRoutines::aarch64::_compare_long_string_LL 5824 = generate_compare_long_string_sve(LL); 5825 StubRoutines::aarch64::_compare_long_string_UU 5826 = generate_compare_long_string_sve(UU); 5827 StubRoutines::aarch64::_compare_long_string_LU 5828 = generate_compare_long_string_sve(LU); 5829 StubRoutines::aarch64::_compare_long_string_UL 5830 = generate_compare_long_string_sve(UL); 5831 } 5832 } 5833 5834 // R0 = result 5835 // R1 = str2 5836 // R2 = cnt1 5837 // R3 = str1 5838 // R4 = cnt2 5839 // Clobbers: rscratch1, rscratch2, v0, v1, rflags 5840 // 5841 // This generic linear code uses a few additional ideas that make it faster: 5842 // 1) we can safely keep at least the 1st register of the pattern (since length >= 8) 5843 // in order to skip the initial load (helps on systems with a single load pipeline) 5844 // 2) we can use a "fast" single-character search algorithm to find the 5845 // first symbol with fewer branches (1 branch per loaded register instead 5846 // of one branch per symbol); this is where constants like 5847 // 0x0101...01, 0x00010001...0001, 0x7f7f...7f, 0x7fff7fff...7fff come from 5848 // 3) after loading and analyzing the 1st register of the source string, it can be 5849 // used to search for every occurrence of the 1st character, saving a few loads 5850 // compared with a "simpler-but-slower" implementation 5851 // 4) in order to avoid lots of push/pop operations, the code below heavily 5852 // re-uses/re-initializes/compresses register values, which makes the code 5853 // larger and a bit less readable; however, most of the extra operations are 5854 // issued during loads or branches, so the penalty is minimal 5855 address generate_string_indexof_linear(bool str1_isL, bool str2_isL) { 5856 const char* stubName = str1_isL 5857 ? (str2_isL ? "indexof_linear_ll" : "indexof_linear_ul") 5858 : "indexof_linear_uu"; 5859 __ align(CodeEntryAlignment); 5860 StubCodeMark mark(this, "StubRoutines", stubName); 5861 address entry = __ pc(); 5862 5863 int str1_chr_size = str1_isL ? 1 : 2; 5864 int str2_chr_size = str2_isL ? 1 : 2; 5865 int str1_chr_shift = str1_isL ? 0 : 1; 5866 int str2_chr_shift = str2_isL ?
0 : 1; 5867 bool isL = str1_isL && str2_isL; 5868 // parameters 5869 Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4; 5870 // temporary registers 5871 Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23; 5872 RegSet spilled_regs = RegSet::range(tmp1, tmp4); 5873 // redefinitions 5874 Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3; 5875 5876 __ push(spilled_regs, sp); 5877 Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO, 5878 L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED, 5879 L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP, 5880 L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH, 5881 L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2, 5882 L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH; 5883 // Read whole register from str1. It is safe, because length >=8 here 5884 __ ldr(ch1, Address(str1)); 5885 // Read whole register from str2. It is safe, because length >=8 here 5886 __ ldr(ch2, Address(str2)); 5887 __ sub(cnt2, cnt2, cnt1); 5888 __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF); 5889 if (str1_isL != str2_isL) { 5890 __ eor(v0, __ T16B, v0, v0); 5891 } 5892 __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001); 5893 __ mul(first, first, tmp1); 5894 // check if we have less than 1 register to check 5895 __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1); 5896 if (str1_isL != str2_isL) { 5897 __ fmovd(v1, ch1); 5898 } 5899 __ br(__ LE, L_SMALL); 5900 __ eor(ch2, first, ch2); 5901 if (str1_isL != str2_isL) { 5902 __ zip1(v1, __ T16B, v1, v0); 5903 } 5904 __ sub(tmp2, ch2, tmp1); 5905 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 5906 __ bics(tmp2, tmp2, ch2); 5907 if (str1_isL != str2_isL) { 5908 __ fmovd(ch1, v1); 5909 } 5910 __ br(__ NE, L_HAS_ZERO); 5911 __ subs(cnt2, cnt2, wordSize/str2_chr_size); 5912 __ add(result, result, wordSize/str2_chr_size); 5913 __ add(str2, str2, wordSize); 5914 __ br(__ LT, L_POST_LOOP); 5915 __ BIND(L_LOOP); 5916 __ ldr(ch2, Address(str2)); 5917 __ eor(ch2, first, ch2); 5918 __ sub(tmp2, ch2, tmp1); 5919 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 5920 __ bics(tmp2, tmp2, ch2); 5921 __ br(__ NE, L_HAS_ZERO); 5922 __ BIND(L_LOOP_PROCEED); 5923 __ subs(cnt2, cnt2, wordSize/str2_chr_size); 5924 __ add(str2, str2, wordSize); 5925 __ add(result, result, wordSize/str2_chr_size); 5926 __ br(__ GE, L_LOOP); 5927 __ BIND(L_POST_LOOP); 5928 __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check 5929 __ br(__ LE, NOMATCH); 5930 __ ldr(ch2, Address(str2)); 5931 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift); 5932 __ eor(ch2, first, ch2); 5933 __ sub(tmp2, ch2, tmp1); 5934 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 5935 __ mov(tmp4, -1); // all bits set 5936 __ b(L_SMALL_PROCEED); 5937 __ align(OptoLoopAlignment); 5938 __ BIND(L_SMALL); 5939 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift); 5940 __ eor(ch2, first, ch2); 5941 if (str1_isL != str2_isL) { 5942 __ zip1(v1, __ T16B, v1, v0); 5943 } 5944 __ sub(tmp2, ch2, tmp1); 5945 __ mov(tmp4, -1); // all bits set 5946 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 5947 if (str1_isL != str2_isL) { 5948 __ fmovd(ch1, v1); // move converted 4 symbols 5949 } 5950 __ BIND(L_SMALL_PROCEED); 5951 __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits. 
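// The bic/ands that follow complete the classic SWAR zero-lane test on
// x = (loaded str2 chunk) ^ (first pattern character broadcast to all lanes):
// at this point tmp2 = x - 0x0101..01 and ch2 = x | 0x7f7f..7f (the
// 0x0001..0001 / 0x7fff..7fff variants are used when str2 is UTF-16), so
// tmp2 & ~ch2 marks, via each lane's top bit, the lanes where x is zero,
// i.e. where str2 holds the first pattern character. The lowest marker is
// always exact; possible spurious markers in higher lanes are rejected later
// by the full comparison loops. The tmp4 mask clears lanes beyond the valid
// data, and rbit lets the subsequent clz pick out the lowest candidate.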
5952 __ bic(tmp2, tmp2, ch2); 5953 __ ands(tmp2, tmp2, tmp4); // clear useless bits and check 5954 __ rbit(tmp2, tmp2); 5955 __ br(__ EQ, NOMATCH); 5956 __ BIND(L_SMALL_HAS_ZERO_LOOP); 5957 __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some cpu's 5958 __ cmp(cnt1, u1(wordSize/str2_chr_size)); 5959 __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2); 5960 if (str2_isL) { // LL 5961 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index" 5962 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe. 5963 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info 5964 __ add(result, result, tmp4, __ LSR, LogBitsPerByte); 5965 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 5966 } else { 5967 __ mov(ch2, 0xE); // all bits in byte set except last one 5968 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 5969 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 5970 __ lslv(tmp2, tmp2, tmp4); 5971 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 5972 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 5973 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 5974 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 5975 } 5976 __ cmp(ch1, ch2); 5977 __ mov(tmp4, wordSize/str2_chr_size); 5978 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 5979 __ BIND(L_SMALL_CMP_LOOP); 5980 str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift))) 5981 : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift))); 5982 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))) 5983 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))); 5984 __ add(tmp4, tmp4, 1); 5985 __ cmp(tmp4, cnt1); 5986 __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP); 5987 __ cmp(first, ch2); 5988 __ br(__ EQ, L_SMALL_CMP_LOOP); 5989 __ BIND(L_SMALL_CMP_LOOP_NOMATCH); 5990 __ cbz(tmp2, NOMATCH); // no more matches. exit 5991 __ clz(tmp4, tmp2); 5992 __ add(result, result, 1); // advance index 5993 __ add(str2, str2, str2_chr_size); // advance pointer 5994 __ b(L_SMALL_HAS_ZERO_LOOP); 5995 __ align(OptoLoopAlignment); 5996 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP); 5997 __ cmp(first, ch2); 5998 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 5999 __ b(DONE); 6000 __ align(OptoLoopAlignment); 6001 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2); 6002 if (str2_isL) { // LL 6003 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index" 6004 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe. 6005 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info 6006 __ add(result, result, tmp4, __ LSR, LogBitsPerByte); 6007 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 6008 } else { 6009 __ mov(ch2, 0xE); // all bits in byte set except last one 6010 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 6011 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 6012 __ lslv(tmp2, tmp2, tmp4); 6013 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 6014 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 6015 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 6016 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 6017 } 6018 __ cmp(ch1, ch2); 6019 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 6020 __ b(DONE); 6021 __ align(OptoLoopAlignment); 6022 __ BIND(L_HAS_ZERO); 6023 __ rbit(tmp2, tmp2); 6024 __ clz(tmp4, tmp2); // potentially long. 
Up to 4 cycles on some CPU's 6025 // Now, perform compression of counters(cnt2 and cnt1) into one register. 6026 // It's fine because both counters are 32bit and are not changed in this 6027 // loop. Just restore it on exit. So, cnt1 can be re-used in this loop. 6028 __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2); 6029 __ sub(result, result, 1); 6030 __ BIND(L_HAS_ZERO_LOOP); 6031 __ mov(cnt1, wordSize/str2_chr_size); 6032 __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2); 6033 __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare 6034 if (str2_isL) { 6035 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index 6036 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 6037 __ lslv(tmp2, tmp2, tmp4); 6038 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 6039 __ add(tmp4, tmp4, 1); 6040 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 6041 __ lsl(tmp2, tmp2, 1); 6042 __ mov(tmp4, wordSize/str2_chr_size); 6043 } else { 6044 __ mov(ch2, 0xE); 6045 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 6046 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 6047 __ lslv(tmp2, tmp2, tmp4); 6048 __ add(tmp4, tmp4, 1); 6049 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 6050 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); 6051 __ lsl(tmp2, tmp2, 1); 6052 __ mov(tmp4, wordSize/str2_chr_size); 6053 __ sub(str2, str2, str2_chr_size); 6054 } 6055 __ cmp(ch1, ch2); 6056 __ mov(tmp4, wordSize/str2_chr_size); 6057 __ br(__ NE, L_CMP_LOOP_NOMATCH); 6058 __ BIND(L_CMP_LOOP); 6059 str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift))) 6060 : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift))); 6061 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))) 6062 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))); 6063 __ add(tmp4, tmp4, 1); 6064 __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2); 6065 __ br(__ GE, L_CMP_LOOP_LAST_CMP); 6066 __ cmp(cnt1, ch2); 6067 __ br(__ EQ, L_CMP_LOOP); 6068 __ BIND(L_CMP_LOOP_NOMATCH); 6069 // here we're not matched 6070 __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop 6071 __ clz(tmp4, tmp2); 6072 __ add(str2, str2, str2_chr_size); // advance pointer 6073 __ b(L_HAS_ZERO_LOOP); 6074 __ align(OptoLoopAlignment); 6075 __ BIND(L_CMP_LOOP_LAST_CMP); 6076 __ cmp(cnt1, ch2); 6077 __ br(__ NE, L_CMP_LOOP_NOMATCH); 6078 __ b(DONE); 6079 __ align(OptoLoopAlignment); 6080 __ BIND(L_CMP_LOOP_LAST_CMP2); 6081 if (str2_isL) { 6082 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index 6083 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 6084 __ lslv(tmp2, tmp2, tmp4); 6085 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 6086 __ add(tmp4, tmp4, 1); 6087 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 6088 __ lsl(tmp2, tmp2, 1); 6089 } else { 6090 __ mov(ch2, 0xE); 6091 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 6092 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 
6093 __ lslv(tmp2, tmp2, tmp4); 6094 __ add(tmp4, tmp4, 1); 6095 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 6096 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); 6097 __ lsl(tmp2, tmp2, 1); 6098 __ sub(str2, str2, str2_chr_size); 6099 } 6100 __ cmp(ch1, ch2); 6101 __ br(__ NE, L_CMP_LOOP_NOMATCH); 6102 __ b(DONE); 6103 __ align(OptoLoopAlignment); 6104 __ BIND(L_HAS_ZERO_LOOP_NOMATCH); 6105 // 1) Restore "result" index. Index was wordSize/str2_chr_size * N until 6106 // L_HAS_ZERO block. Byte octet was analyzed in L_HAS_ZERO_LOOP, 6107 // so, result was increased at max by wordSize/str2_chr_size - 1, so, 6108 // respective high bit wasn't changed. L_LOOP_PROCEED will increase 6109 // result by analyzed characters value, so, we can just reset lower bits 6110 // in result here. Clear 2 lower bits for UU/UL and 3 bits for LL 6111 // 2) restore cnt1 and cnt2 values from "compressed" cnt2 6112 // 3) advance str2 value to represent next str2 octet. result & 7/3 is 6113 // index of last analyzed substring inside current octet. So, str2 in at 6114 // respective start address. We need to advance it to next octet 6115 __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed 6116 __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2); 6117 __ bfm(result, zr, 0, 2 - str2_chr_shift); 6118 __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2 6119 __ movw(cnt2, cnt2); 6120 __ b(L_LOOP_PROCEED); 6121 __ align(OptoLoopAlignment); 6122 __ BIND(NOMATCH); 6123 __ mov(result, -1); 6124 __ BIND(DONE); 6125 __ pop(spilled_regs, sp); 6126 __ ret(lr); 6127 return entry; 6128 } 6129 6130 void generate_string_indexof_stubs() { 6131 StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true); 6132 StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false); 6133 StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false); 6134 } 6135 6136 void inflate_and_store_2_fp_registers(bool generatePrfm, 6137 FloatRegister src1, FloatRegister src2) { 6138 Register dst = r1; 6139 __ zip1(v1, __ T16B, src1, v0); 6140 __ zip2(v2, __ T16B, src1, v0); 6141 if (generatePrfm) { 6142 __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM); 6143 } 6144 __ zip1(v3, __ T16B, src2, v0); 6145 __ zip2(v4, __ T16B, src2, v0); 6146 __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64))); 6147 } 6148 6149 // R0 = src 6150 // R1 = dst 6151 // R2 = len 6152 // R3 = len >> 3 6153 // V0 = 0 6154 // v1 = loaded 8 bytes 6155 // Clobbers: r0, r1, r3, rscratch1, rflags, v0-v6 6156 address generate_large_byte_array_inflate() { 6157 __ align(CodeEntryAlignment); 6158 StubCodeMark mark(this, "StubRoutines", "large_byte_array_inflate"); 6159 address entry = __ pc(); 6160 Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE; 6161 Register src = r0, dst = r1, len = r2, octetCounter = r3; 6162 const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4; 6163 6164 // do one more 8-byte read to have address 16-byte aligned in most cases 6165 // also use single store instruction 6166 __ ldrd(v2, __ post(src, 8)); 6167 __ sub(octetCounter, octetCounter, 2); 6168 __ zip1(v1, __ T16B, v1, v0); 6169 __ zip1(v2, __ T16B, v2, v0); 6170 __ st1(v1, v2, __ T16B, __ post(dst, 32)); 6171 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 6172 __ subs(rscratch1, octetCounter, large_loop_threshold); 6173 __ br(__ LE, LOOP_START); 6174 __ b(LOOP_PRFM_START); 6175 __ bind(LOOP_PRFM); 
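// Prefetching main loop: each iteration loads 64 source bytes (8 octets),
// prefetches ahead of both the source and the destination, and widens the
// bytes to 16-bit chars by interleaving them with the zero vector v0
// (the zip1/zip2 in inflate_and_store_2_fp_registers), storing 128 bytes
// per iteration.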
6176 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 6177 __ bind(LOOP_PRFM_START); 6178 __ prfm(Address(src, SoftwarePrefetchHintDistance)); 6179 __ sub(octetCounter, octetCounter, 8); 6180 __ subs(rscratch1, octetCounter, large_loop_threshold); 6181 inflate_and_store_2_fp_registers(true, v3, v4); 6182 inflate_and_store_2_fp_registers(true, v5, v6); 6183 __ br(__ GT, LOOP_PRFM); 6184 __ cmp(octetCounter, (u1)8); 6185 __ br(__ LT, DONE); 6186 __ bind(LOOP); 6187 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 6188 __ bind(LOOP_START); 6189 __ sub(octetCounter, octetCounter, 8); 6190 __ cmp(octetCounter, (u1)8); 6191 inflate_and_store_2_fp_registers(false, v3, v4); 6192 inflate_and_store_2_fp_registers(false, v5, v6); 6193 __ br(__ GE, LOOP); 6194 __ bind(DONE); 6195 __ ret(lr); 6196 return entry; 6197 } 6198 6199 /** 6200 * Arguments: 6201 * 6202 * Input: 6203 * c_rarg0 - current state address 6204 * c_rarg1 - H key address 6205 * c_rarg2 - data address 6206 * c_rarg3 - number of blocks 6207 * 6208 * Output: 6209 * Updated state at c_rarg0 6210 */ 6211 address generate_ghash_processBlocks() { 6212 // Bafflingly, GCM uses little-endian for the byte order, but 6213 // big-endian for the bit order. For example, the polynomial 1 is 6214 // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00. 6215 // 6216 // So, we must either reverse the bytes in each word and do 6217 // everything big-endian or reverse the bits in each byte and do 6218 // it little-endian. On AArch64 it's more idiomatic to reverse 6219 // the bits in each byte (we have an instruction, RBIT, to do 6220 // that) and keep the data in little-endian bit order through the 6221 // calculation, bit-reversing the inputs and outputs. 6222 6223 StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks"); 6224 __ align(wordSize * 2); 6225 address p = __ pc(); 6226 __ emit_int64(0x87); // The low-order bits of the field 6227 // polynomial (i.e. 
p = z^7+z^2+z+1) 6228 // repeated in the low and high parts of a 6229 // 128-bit vector 6230 __ emit_int64(0x87); 6231 6232 __ align(CodeEntryAlignment); 6233 address start = __ pc(); 6234 6235 Register state = c_rarg0; 6236 Register subkeyH = c_rarg1; 6237 Register data = c_rarg2; 6238 Register blocks = c_rarg3; 6239 6240 FloatRegister vzr = v30; 6241 __ eor(vzr, __ T16B, vzr, vzr); // zero register 6242 6243 __ ldrq(v24, p); // The field polynomial 6244 6245 __ ldrq(v0, Address(state)); 6246 __ ldrq(v1, Address(subkeyH)); 6247 6248 __ rev64(v0, __ T16B, v0); // Bit-reverse words in state and subkeyH 6249 __ rbit(v0, __ T16B, v0); 6250 __ rev64(v1, __ T16B, v1); 6251 __ rbit(v1, __ T16B, v1); 6252 6253 __ ext(v4, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1 6254 __ eor(v4, __ T16B, v4, v1); // xor subkeyH into subkeyL (Karatsuba: (A1+A0)) 6255 6256 { 6257 Label L_ghash_loop; 6258 __ bind(L_ghash_loop); 6259 6260 __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit 6261 // reversing each byte 6262 __ rbit(v2, __ T16B, v2); 6263 __ eor(v2, __ T16B, v0, v2); // bit-swapped data ^ bit-swapped state 6264 6265 // Multiply state in v2 by subkey in v1 6266 __ ghash_multiply(/*result_lo*/v5, /*result_hi*/v7, 6267 /*a*/v1, /*b*/v2, /*a1_xor_a0*/v4, 6268 /*temps*/v6, v3, /*reuse/clobber b*/v2); 6269 // Reduce v7:v5 by the field polynomial 6270 __ ghash_reduce(/*result*/v0, /*lo*/v5, /*hi*/v7, /*p*/v24, vzr, /*temp*/v3); 6271 6272 __ sub(blocks, blocks, 1); 6273 __ cbnz(blocks, L_ghash_loop); 6274 } 6275 6276 // The bit-reversed result is at this point in v0 6277 __ rev64(v0, __ T16B, v0); 6278 __ rbit(v0, __ T16B, v0); 6279 6280 __ st1(v0, __ T16B, state); 6281 __ ret(lr); 6282 6283 return start; 6284 } 6285 6286 address generate_ghash_processBlocks_wide() { 6287 address small = generate_ghash_processBlocks(); 6288 6289 StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks_wide"); 6290 __ align(wordSize * 2); 6291 address p = __ pc(); 6292 __ emit_int64(0x87); // The low-order bits of the field 6293 // polynomial (i.e. p = z^7+z^2+z+1) 6294 // repeated in the low and high parts of a 6295 // 128-bit vector 6296 __ emit_int64(0x87); 6297 6298 __ align(CodeEntryAlignment); 6299 address start = __ pc(); 6300 6301 Register state = c_rarg0; 6302 Register subkeyH = c_rarg1; 6303 Register data = c_rarg2; 6304 Register blocks = c_rarg3; 6305 6306 const int unroll = 4; 6307 6308 __ cmp(blocks, (unsigned char)(unroll * 2)); 6309 __ br(__ LT, small); 6310 6311 if (unroll > 1) { 6312 // Save state before entering routine 6313 __ sub(sp, sp, 4 * 16); 6314 __ st1(v12, v13, v14, v15, __ T16B, Address(sp)); 6315 __ sub(sp, sp, 4 * 16); 6316 __ st1(v8, v9, v10, v11, __ T16B, Address(sp)); 6317 } 6318 6319 __ ghash_processBlocks_wide(p, state, subkeyH, data, blocks, unroll); 6320 6321 if (unroll > 1) { 6322 // And restore state 6323 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16)); 6324 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16)); 6325 } 6326 6327 __ cmp(blocks, (unsigned char)0); 6328 __ br(__ GT, small); 6329 6330 __ ret(lr); 6331 6332 return start; 6333 } 6334 6335 void generate_base64_encode_simdround(Register src, Register dst, 6336 FloatRegister codec, u8 size) { 6337 6338 FloatRegister in0 = v4, in1 = v5, in2 = v6; 6339 FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19; 6340 FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23; 6341 6342 Assembler::SIMD_Arrangement arrangement = size == 16 ? 
__ T16B : __ T8B; 6343 6344 __ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size)); 6345 6346 __ ushr(ind0, arrangement, in0, 2); 6347 6348 __ ushr(ind1, arrangement, in1, 2); 6349 __ shl(in0, arrangement, in0, 6); 6350 __ orr(ind1, arrangement, ind1, in0); 6351 __ ushr(ind1, arrangement, ind1, 2); 6352 6353 __ ushr(ind2, arrangement, in2, 4); 6354 __ shl(in1, arrangement, in1, 4); 6355 __ orr(ind2, arrangement, in1, ind2); 6356 __ ushr(ind2, arrangement, ind2, 2); 6357 6358 __ shl(ind3, arrangement, in2, 2); 6359 __ ushr(ind3, arrangement, ind3, 2); 6360 6361 __ tbl(out0, arrangement, codec, 4, ind0); 6362 __ tbl(out1, arrangement, codec, 4, ind1); 6363 __ tbl(out2, arrangement, codec, 4, ind2); 6364 __ tbl(out3, arrangement, codec, 4, ind3); 6365 6366 __ st4(out0, out1, out2, out3, arrangement, __ post(dst, 4 * size)); 6367 } 6368 6369 /** 6370 * Arguments: 6371 * 6372 * Input: 6373 * c_rarg0 - src_start 6374 * c_rarg1 - src_offset 6375 * c_rarg2 - src_length 6376 * c_rarg3 - dest_start 6377 * c_rarg4 - dest_offset 6378 * c_rarg5 - isURL 6379 * 6380 */ 6381 address generate_base64_encodeBlock() { 6382 6383 static const char toBase64[64] = { 6384 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 6385 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 6386 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 6387 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 6388 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/' 6389 }; 6390 6391 static const char toBase64URL[64] = { 6392 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 6393 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 6394 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 6395 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 6396 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_' 6397 }; 6398 6399 __ align(CodeEntryAlignment); 6400 StubCodeMark mark(this, "StubRoutines", "encodeBlock"); 6401 address start = __ pc(); 6402 6403 Register src = c_rarg0; // source array 6404 Register soff = c_rarg1; // source start offset 6405 Register send = c_rarg2; // source end offset 6406 Register dst = c_rarg3; // dest array 6407 Register doff = c_rarg4; // position for writing to dest array 6408 Register isURL = c_rarg5; // Base64 or URL character set 6409 6410 // c_rarg6 and c_rarg7 are free to use as temps 6411 Register codec = c_rarg6; 6412 Register length = c_rarg7; 6413 6414 Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit; 6415 6416 __ add(src, src, soff); 6417 __ add(dst, dst, doff); 6418 __ sub(length, send, soff); 6419 6420 // load the codec base address 6421 __ lea(codec, ExternalAddress((address) toBase64)); 6422 __ cbz(isURL, ProcessData); 6423 __ lea(codec, ExternalAddress((address) toBase64URL)); 6424 6425 __ BIND(ProcessData); 6426 6427 // too short to formup a SIMD loop, roll back 6428 __ cmp(length, (u1)24); 6429 __ br(Assembler::LT, Process3B); 6430 6431 __ ld1(v0, v1, v2, v3, __ T16B, Address(codec)); 6432 6433 __ BIND(Process48B); 6434 __ cmp(length, (u1)48); 6435 __ br(Assembler::LT, Process24B); 6436 generate_base64_encode_simdround(src, dst, v0, 16); 6437 __ sub(length, length, 48); 6438 __ b(Process48B); 6439 6440 __ BIND(Process24B); 6441 __ cmp(length, (u1)24); 6442 __ br(Assembler::LT, SIMDExit); 6443 generate_base64_encode_simdround(src, dst, v0, 8); 6444 __ sub(length, length, 24); 6445 6446 __ BIND(SIMDExit); 6447 __ cbz(length, Exit); 6448 6449 __ 
BIND(Process3B); 6450 // 3 src bytes, 24 bits 6451 __ ldrb(r10, __ post(src, 1)); 6452 __ ldrb(r11, __ post(src, 1)); 6453 __ ldrb(r12, __ post(src, 1)); 6454 __ orrw(r11, r11, r10, Assembler::LSL, 8); 6455 __ orrw(r12, r12, r11, Assembler::LSL, 8); 6456 // codec index 6457 __ ubfmw(r15, r12, 18, 23); 6458 __ ubfmw(r14, r12, 12, 17); 6459 __ ubfmw(r13, r12, 6, 11); 6460 __ andw(r12, r12, 63); 6461 // get the code based on the codec 6462 __ ldrb(r15, Address(codec, r15, Address::uxtw(0))); 6463 __ ldrb(r14, Address(codec, r14, Address::uxtw(0))); 6464 __ ldrb(r13, Address(codec, r13, Address::uxtw(0))); 6465 __ ldrb(r12, Address(codec, r12, Address::uxtw(0))); 6466 __ strb(r15, __ post(dst, 1)); 6467 __ strb(r14, __ post(dst, 1)); 6468 __ strb(r13, __ post(dst, 1)); 6469 __ strb(r12, __ post(dst, 1)); 6470 __ sub(length, length, 3); 6471 __ cbnz(length, Process3B); 6472 6473 __ BIND(Exit); 6474 __ ret(lr); 6475 6476 return start; 6477 } 6478 6479 void generate_base64_decode_simdround(Register src, Register dst, 6480 FloatRegister codecL, FloatRegister codecH, int size, Label& Exit) { 6481 6482 FloatRegister in0 = v16, in1 = v17, in2 = v18, in3 = v19; 6483 FloatRegister out0 = v20, out1 = v21, out2 = v22; 6484 6485 FloatRegister decL0 = v23, decL1 = v24, decL2 = v25, decL3 = v26; 6486 FloatRegister decH0 = v28, decH1 = v29, decH2 = v30, decH3 = v31; 6487 6488 Label NoIllegalData, ErrorInLowerHalf, StoreLegalData; 6489 6490 Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B; 6491 6492 __ ld4(in0, in1, in2, in3, arrangement, __ post(src, 4 * size)); 6493 6494 // we need unsigned saturating subtract, to make sure all input values 6495 // in range [0, 63] will have 0U value in the higher half lookup 6496 __ uqsubv(decH0, __ T16B, in0, v27); 6497 __ uqsubv(decH1, __ T16B, in1, v27); 6498 __ uqsubv(decH2, __ T16B, in2, v27); 6499 __ uqsubv(decH3, __ T16B, in3, v27); 6500 6501 // lower half lookup 6502 __ tbl(decL0, arrangement, codecL, 4, in0); 6503 __ tbl(decL1, arrangement, codecL, 4, in1); 6504 __ tbl(decL2, arrangement, codecL, 4, in2); 6505 __ tbl(decL3, arrangement, codecL, 4, in3); 6506 6507 // higher half lookup 6508 __ tbx(decH0, arrangement, codecH, 4, decH0); 6509 __ tbx(decH1, arrangement, codecH, 4, decH1); 6510 __ tbx(decH2, arrangement, codecH, 4, decH2); 6511 __ tbx(decH3, arrangement, codecH, 4, decH3); 6512 6513 // combine lower and higher 6514 __ orr(decL0, arrangement, decL0, decH0); 6515 __ orr(decL1, arrangement, decL1, decH1); 6516 __ orr(decL2, arrangement, decL2, decH2); 6517 __ orr(decL3, arrangement, decL3, decH3); 6518 6519 // check illegal inputs, value larger than 63 (maximum of 6 bits) 6520 __ cm(Assembler::HI, decH0, arrangement, decL0, v27); 6521 __ cm(Assembler::HI, decH1, arrangement, decL1, v27); 6522 __ cm(Assembler::HI, decH2, arrangement, decL2, v27); 6523 __ cm(Assembler::HI, decH3, arrangement, decL3, v27); 6524 __ orr(in0, arrangement, decH0, decH1); 6525 __ orr(in1, arrangement, decH2, decH3); 6526 __ orr(in2, arrangement, in0, in1); 6527 __ umaxv(in3, arrangement, in2); 6528 __ umov(rscratch2, in3, __ B, 0); 6529 6530 // get the data to output 6531 __ shl(out0, arrangement, decL0, 2); 6532 __ ushr(out1, arrangement, decL1, 4); 6533 __ orr(out0, arrangement, out0, out1); 6534 __ shl(out1, arrangement, decL1, 4); 6535 __ ushr(out2, arrangement, decL2, 2); 6536 __ orr(out1, arrangement, out1, out2); 6537 __ shl(out2, arrangement, decL2, 6); 6538 __ orr(out2, arrangement, out2, decL3); 6539 6540 __ cbz(rscratch2, NoIllegalData); 6541 
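// rscratch2 now holds the maximum byte of the combined HI-compare results,
// so it is non-zero exactly when some input byte decoded to a value above 63,
// i.e. when this block contains a character outside the Base64 alphabet
// (the lookup tables map such bytes to 255u).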
6542 // handle illegal input 6543 __ umov(r10, in2, __ D, 0); 6544 if (size == 16) { 6545 __ cbnz(r10, ErrorInLowerHalf); 6546 6547 // illegal input is in higher half, store the lower half now. 6548 __ st3(out0, out1, out2, __ T8B, __ post(dst, 24)); 6549 6550 __ umov(r10, in2, __ D, 1); 6551 __ umov(r11, out0, __ D, 1); 6552 __ umov(r12, out1, __ D, 1); 6553 __ umov(r13, out2, __ D, 1); 6554 __ b(StoreLegalData); 6555 6556 __ BIND(ErrorInLowerHalf); 6557 } 6558 __ umov(r11, out0, __ D, 0); 6559 __ umov(r12, out1, __ D, 0); 6560 __ umov(r13, out2, __ D, 0); 6561 6562 __ BIND(StoreLegalData); 6563 __ tbnz(r10, 5, Exit); // 0xff indicates illegal input 6564 __ strb(r11, __ post(dst, 1)); 6565 __ strb(r12, __ post(dst, 1)); 6566 __ strb(r13, __ post(dst, 1)); 6567 __ lsr(r10, r10, 8); 6568 __ lsr(r11, r11, 8); 6569 __ lsr(r12, r12, 8); 6570 __ lsr(r13, r13, 8); 6571 __ b(StoreLegalData); 6572 6573 __ BIND(NoIllegalData); 6574 __ st3(out0, out1, out2, arrangement, __ post(dst, 3 * size)); 6575 } 6576 6577 6578 /** 6579 * Arguments: 6580 * 6581 * Input: 6582 * c_rarg0 - src_start 6583 * c_rarg1 - src_offset 6584 * c_rarg2 - src_length 6585 * c_rarg3 - dest_start 6586 * c_rarg4 - dest_offset 6587 * c_rarg5 - isURL 6588 * c_rarg6 - isMIME 6589 * 6590 */ 6591 address generate_base64_decodeBlock() { 6592 6593 // The SIMD part of this Base64 decode intrinsic is based on the algorithm outlined 6594 // on http://0x80.pl/articles/base64-simd-neon.html#encoding-quadwords, in the section 6595 // titled "Base64 decoding". 6596 6597 // The non-SIMD lookup tables are mostly dumped from the fromBase64 array used in java.util.Base64, 6598 // except that the trailing pad character '=' is also treated as an illegal value in this intrinsic: 6599 // java.util.Base64.fromBase64['='] is -2, while fromBase64(URL)ForNoSIMD['='] is 255u here.
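// For example, in the table below index 'A' (65) maps to 0, 'Z' (90) to 25,
// 'a' (97) to 26, '0' (48) to 52, '+' (43) to 62 and '/' (47) to 63, while
// every other byte value, including '=' (61), maps to 255u (illegal).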
6600 static const uint8_t fromBase64ForNoSIMD[256] = { 6601 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6602 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6603 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 255u, 63u, 6604 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 6605 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u, 6606 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 255u, 6607 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u, 6608 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u, 6609 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6610 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6611 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6612 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6613 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6614 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6615 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6616 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6617 }; 6618 6619 static const uint8_t fromBase64URLForNoSIMD[256] = { 6620 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6621 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6622 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 6623 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 6624 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u, 6625 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 63u, 6626 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u, 6627 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u, 6628 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6629 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6630 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6631 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6632 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6633 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6634 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6635 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6636 }; 6637 6638 // A legal value of base64 code is in range [0, 127]. We need two lookups 6639 // with tbl/tbx and combine them to get the decode data. The 1st table vector 6640 // lookup use tbl, out of range indices are set to 0 in destination. 
The 2nd 6641 // table vector lookup use tbx, out of range indices are unchanged in 6642 // destination. Input [64..126] is mapped to index [65, 127] in second lookup. 6643 // The value of index 64 is set to 0, so that we know that we already get the 6644 // decoded data with the 1st lookup. 6645 static const uint8_t fromBase64ForSIMD[128] = { 6646 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6647 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6648 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 255u, 63u, 6649 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 6650 0u, 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 6651 14u, 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 6652 255u, 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 6653 40u, 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 6654 }; 6655 6656 static const uint8_t fromBase64URLForSIMD[128] = { 6657 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6658 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6659 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 6660 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 6661 0u, 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 6662 14u, 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 6663 63u, 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 6664 40u, 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 6665 }; 6666 6667 __ align(CodeEntryAlignment); 6668 StubCodeMark mark(this, "StubRoutines", "decodeBlock"); 6669 address start = __ pc(); 6670 6671 Register src = c_rarg0; // source array 6672 Register soff = c_rarg1; // source start offset 6673 Register send = c_rarg2; // source end offset 6674 Register dst = c_rarg3; // dest array 6675 Register doff = c_rarg4; // position for writing to dest array 6676 Register isURL = c_rarg5; // Base64 or URL character set 6677 Register isMIME = c_rarg6; // Decoding MIME block - unused in this implementation 6678 6679 Register length = send; // reuse send as length of source data to process 6680 6681 Register simd_codec = c_rarg6; 6682 Register nosimd_codec = c_rarg7; 6683 6684 Label ProcessData, Process64B, Process32B, Process4B, SIMDEnter, SIMDExit, Exit; 6685 6686 __ enter(); 6687 6688 __ add(src, src, soff); 6689 __ add(dst, dst, doff); 6690 6691 __ mov(doff, dst); 6692 6693 __ sub(length, send, soff); 6694 __ bfm(length, zr, 0, 1); 6695 6696 __ lea(nosimd_codec, ExternalAddress((address) fromBase64ForNoSIMD)); 6697 __ cbz(isURL, ProcessData); 6698 __ lea(nosimd_codec, ExternalAddress((address) fromBase64URLForNoSIMD)); 6699 6700 __ BIND(ProcessData); 6701 __ mov(rscratch1, length); 6702 __ cmp(length, (u1)144); // 144 = 80 + 64 6703 __ br(Assembler::LT, Process4B); 6704 6705 // In the MIME case, the line length cannot be more than 76 6706 // bytes (see RFC 2045). This is too short a block for SIMD 6707 // to be worthwhile, so we use non-SIMD here. 
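// As a rough C sketch (illustration only; the scalar temporaries are ad hoc
// names), each pass of the Process4B loop below turns four input characters
// into three output bytes:
//
//   uint8_t a = codec[src[0]], b = codec[src[1]],
//           c = codec[src[2]], d = codec[src[3]];
//   if ((a | b | c | d) & 0x80) goto Exit;   // 255u marks illegal input
//   dst[0] = (a << 2) | (b >> 4);
//   dst[1] = (b << 4) | (c >> 2);
//   dst[2] = (c << 6) | d;
//   src += 4; dst += 3;
//
// The stub below gets the same effect with bitfield insert/extract (ubfxw,
// bfiw, bfmw) and a halfword plus a byte store instead of per-byte shifts.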
6708 __ movw(rscratch1, 79); 6709 6710 __ BIND(Process4B); 6711 __ ldrw(r14, __ post(src, 4)); 6712 __ ubfxw(r10, r14, 0, 8); 6713 __ ubfxw(r11, r14, 8, 8); 6714 __ ubfxw(r12, r14, 16, 8); 6715 __ ubfxw(r13, r14, 24, 8); 6716 // get the de-code 6717 __ ldrb(r10, Address(nosimd_codec, r10, Address::uxtw(0))); 6718 __ ldrb(r11, Address(nosimd_codec, r11, Address::uxtw(0))); 6719 __ ldrb(r12, Address(nosimd_codec, r12, Address::uxtw(0))); 6720 __ ldrb(r13, Address(nosimd_codec, r13, Address::uxtw(0))); 6721 // error detection, 255u indicates an illegal input 6722 __ orrw(r14, r10, r11); 6723 __ orrw(r15, r12, r13); 6724 __ orrw(r14, r14, r15); 6725 __ tbnz(r14, 7, Exit); 6726 // recover the data 6727 __ lslw(r14, r10, 10); 6728 __ bfiw(r14, r11, 4, 6); 6729 __ bfmw(r14, r12, 2, 5); 6730 __ rev16w(r14, r14); 6731 __ bfiw(r13, r12, 6, 2); 6732 __ strh(r14, __ post(dst, 2)); 6733 __ strb(r13, __ post(dst, 1)); 6734 // non-simd loop 6735 __ subsw(rscratch1, rscratch1, 4); 6736 __ br(Assembler::GT, Process4B); 6737 6738 // if exiting from PreProcess80B, rscratch1 == -1; 6739 // otherwise, rscratch1 == 0. 6740 __ cbzw(rscratch1, Exit); 6741 __ sub(length, length, 80); 6742 6743 __ lea(simd_codec, ExternalAddress((address) fromBase64ForSIMD)); 6744 __ cbz(isURL, SIMDEnter); 6745 __ lea(simd_codec, ExternalAddress((address) fromBase64URLForSIMD)); 6746 6747 __ BIND(SIMDEnter); 6748 __ ld1(v0, v1, v2, v3, __ T16B, __ post(simd_codec, 64)); 6749 __ ld1(v4, v5, v6, v7, __ T16B, Address(simd_codec)); 6750 __ mov(rscratch1, 63); 6751 __ dup(v27, __ T16B, rscratch1); 6752 6753 __ BIND(Process64B); 6754 __ cmp(length, (u1)64); 6755 __ br(Assembler::LT, Process32B); 6756 generate_base64_decode_simdround(src, dst, v0, v4, 16, Exit); 6757 __ sub(length, length, 64); 6758 __ b(Process64B); 6759 6760 __ BIND(Process32B); 6761 __ cmp(length, (u1)32); 6762 __ br(Assembler::LT, SIMDExit); 6763 generate_base64_decode_simdround(src, dst, v0, v4, 8, Exit); 6764 __ sub(length, length, 32); 6765 __ b(Process32B); 6766 6767 __ BIND(SIMDExit); 6768 __ cbz(length, Exit); 6769 __ movw(rscratch1, length); 6770 __ b(Process4B); 6771 6772 __ BIND(Exit); 6773 __ sub(c_rarg0, dst, doff); 6774 6775 __ leave(); 6776 __ ret(lr); 6777 6778 return start; 6779 } 6780 6781 // Support for spin waits. 6782 address generate_spin_wait() { 6783 __ align(CodeEntryAlignment); 6784 StubCodeMark mark(this, "StubRoutines", "spin_wait"); 6785 address start = __ pc(); 6786 6787 __ spin_wait(); 6788 __ ret(lr); 6789 6790 return start; 6791 } 6792 6793 address generate_lookup_secondary_supers_table_stub(u1 super_klass_index) { 6794 StubCodeMark mark(this, "StubRoutines", "lookup_secondary_supers_table"); 6795 6796 address start = __ pc(); 6797 const Register 6798 r_super_klass = r0, 6799 r_array_base = r1, 6800 r_array_length = r2, 6801 r_array_index = r3, 6802 r_sub_klass = r4, 6803 r_bitmap = rscratch2, 6804 result = r5; 6805 const FloatRegister 6806 vtemp = v0; 6807 6808 Label L_success; 6809 __ enter(); 6810 __ lookup_secondary_supers_table(r_sub_klass, r_super_klass, 6811 r_array_base, r_array_length, r_array_index, 6812 vtemp, result, super_klass_index, 6813 /*stub_is_near*/true); 6814 __ leave(); 6815 __ ret(lr); 6816 6817 return start; 6818 } 6819 6820 // Slow path implementation for UseSecondarySupersTable. 
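// (Roughly: the fast-path stub above probes a single hashed slot of the
// packed secondary supers array, guided by the per-class 64-bit bitmap; the
// slow path below is entered when that probe is inconclusive and scans the
// remaining candidates.)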
6821 address generate_lookup_secondary_supers_table_slow_path_stub() { 6822 StubCodeMark mark(this, "StubRoutines", "lookup_secondary_supers_table_slow_path"); 6823 6824 address start = __ pc(); 6825 const Register 6826 r_super_klass = r0, // argument 6827 r_array_base = r1, // argument 6828 temp1 = r2, // temp 6829 r_array_index = r3, // argument 6830 r_bitmap = rscratch2, // argument 6831 result = r5; // argument 6832 6833 __ lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base, r_array_index, r_bitmap, temp1, result); 6834 __ ret(lr); 6835 6836 return start; 6837 } 6838 6839 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS) 6840 6841 // ARMv8.1 LSE versions of the atomic stubs used by Atomic::PlatformXX. 6842 // 6843 // If LSE is in use, generate LSE versions of all the stubs. The 6844 // non-LSE versions are in atomic_aarch64.S. 6845 6846 // class AtomicStubMark records the entry point of a stub and the 6847 // stub pointer which will point to it. The stub pointer is set to 6848 // the entry point when ~AtomicStubMark() is called, which must be 6849 // after ICache::invalidate_range. This ensures safe publication of 6850 // the generated code. 6851 class AtomicStubMark { 6852 address _entry_point; 6853 aarch64_atomic_stub_t *_stub; 6854 MacroAssembler *_masm; 6855 public: 6856 AtomicStubMark(MacroAssembler *masm, aarch64_atomic_stub_t *stub) { 6857 _masm = masm; 6858 __ align(32); 6859 _entry_point = __ pc(); 6860 _stub = stub; 6861 } 6862 ~AtomicStubMark() { 6863 *_stub = (aarch64_atomic_stub_t)_entry_point; 6864 } 6865 }; 6866 6867 // NB: For memory_order_conservative we need a trailing membar after 6868 // LSE atomic operations but not a leading membar. 6869 // 6870 // We don't need a leading membar because a clause in the Arm ARM 6871 // says: 6872 // 6873 // Barrier-ordered-before 6874 // 6875 // Barrier instructions order prior Memory effects before subsequent 6876 // Memory effects generated by the same Observer. A read or a write 6877 // RW1 is Barrier-ordered-before a read or a write RW 2 from the same 6878 // Observer if and only if RW1 appears in program order before RW 2 6879 // and [ ... ] at least one of RW 1 and RW 2 is generated by an atomic 6880 // instruction with both Acquire and Release semantics. 6881 // 6882 // All the atomic instructions {ldaddal, swapal, casal} have Acquire 6883 // and Release semantics, therefore we don't need a leading 6884 // barrier. However, there is no corresponding Barrier-ordered-after 6885 // relationship, therefore we need a trailing membar to prevent a 6886 // later store or load from being reordered with the store in an 6887 // atomic instruction. 6888 // 6889 // This was checked by using the herd7 consistency model simulator 6890 // (http://diy.inria.fr/) with this test case: 6891 // 6892 // AArch64 LseCas 6893 // { 0:X1=x; 0:X2=y; 1:X1=x; 1:X2=y; } 6894 // P0 | P1; 6895 // LDR W4, [X2] | MOV W3, #0; 6896 // DMB LD | MOV W4, #1; 6897 // LDR W3, [X1] | CASAL W3, W4, [X1]; 6898 // | DMB ISH; 6899 // | STR W4, [X2]; 6900 // exists 6901 // (0:X3=0 /\ 0:X4=1) 6902 // 6903 // If X3 == 0 && X4 == 1, the store to y in P1 has been reordered 6904 // with the store to x in P1. Without the DMB in P1 this may happen. 6905 // 6906 // At the time of writing we don't know of any AArch64 hardware that 6907 // reorders stores in this way, but the Reference Manual permits it. 
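// In pseudocode, the memory_order_conservative stubs generated below are
// therefore roughly
//
//   prev = LSE_OP_AL(ptr, ...);   // ldaddal / swpal / casal: Acquire + Release
//   DMB ISH;                      // trailing full barrier
//   return prev;
//
// The trailing barrier is omitted for the relaxed variants, and the
// release/seq_cst CAS variants rely on the instruction's own acquire/release
// flags without a trailing barrier.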
6908 6909 void gen_cas_entry(Assembler::operand_size size, 6910 atomic_memory_order order) { 6911 Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1, 6912 exchange_val = c_rarg2; 6913 bool acquire, release; 6914 switch (order) { 6915 case memory_order_relaxed: 6916 acquire = false; 6917 release = false; 6918 break; 6919 case memory_order_release: 6920 acquire = false; 6921 release = true; 6922 break; 6923 default: 6924 acquire = true; 6925 release = true; 6926 break; 6927 } 6928 __ mov(prev, compare_val); 6929 __ lse_cas(prev, exchange_val, ptr, size, acquire, release, /*not_pair*/true); 6930 if (order == memory_order_conservative) { 6931 __ membar(Assembler::StoreStore|Assembler::StoreLoad); 6932 } 6933 if (size == Assembler::xword) { 6934 __ mov(r0, prev); 6935 } else { 6936 __ movw(r0, prev); 6937 } 6938 __ ret(lr); 6939 } 6940 6941 void gen_ldadd_entry(Assembler::operand_size size, atomic_memory_order order) { 6942 Register prev = r2, addr = c_rarg0, incr = c_rarg1; 6943 // If not relaxed, then default to conservative. Relaxed is the only 6944 // case we use enough to be worth specializing. 6945 if (order == memory_order_relaxed) { 6946 __ ldadd(size, incr, prev, addr); 6947 } else { 6948 __ ldaddal(size, incr, prev, addr); 6949 __ membar(Assembler::StoreStore|Assembler::StoreLoad); 6950 } 6951 if (size == Assembler::xword) { 6952 __ mov(r0, prev); 6953 } else { 6954 __ movw(r0, prev); 6955 } 6956 __ ret(lr); 6957 } 6958 6959 void gen_swpal_entry(Assembler::operand_size size) { 6960 Register prev = r2, addr = c_rarg0, incr = c_rarg1; 6961 __ swpal(size, incr, prev, addr); 6962 __ membar(Assembler::StoreStore|Assembler::StoreLoad); 6963 if (size == Assembler::xword) { 6964 __ mov(r0, prev); 6965 } else { 6966 __ movw(r0, prev); 6967 } 6968 __ ret(lr); 6969 } 6970 6971 void generate_atomic_entry_points() { 6972 if (! 
UseLSE) { 6973 return; 6974 } 6975 6976 __ align(CodeEntryAlignment); 6977 StubCodeMark mark(this, "StubRoutines", "atomic entry points"); 6978 address first_entry = __ pc(); 6979 6980 // ADD, memory_order_conservative 6981 AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl); 6982 gen_ldadd_entry(Assembler::word, memory_order_conservative); 6983 AtomicStubMark mark_fetch_add_8(_masm, &aarch64_atomic_fetch_add_8_impl); 6984 gen_ldadd_entry(Assembler::xword, memory_order_conservative); 6985 6986 // ADD, memory_order_relaxed 6987 AtomicStubMark mark_fetch_add_4_relaxed 6988 (_masm, &aarch64_atomic_fetch_add_4_relaxed_impl); 6989 gen_ldadd_entry(MacroAssembler::word, memory_order_relaxed); 6990 AtomicStubMark mark_fetch_add_8_relaxed 6991 (_masm, &aarch64_atomic_fetch_add_8_relaxed_impl); 6992 gen_ldadd_entry(MacroAssembler::xword, memory_order_relaxed); 6993 6994 // XCHG, memory_order_conservative 6995 AtomicStubMark mark_xchg_4(_masm, &aarch64_atomic_xchg_4_impl); 6996 gen_swpal_entry(Assembler::word); 6997 AtomicStubMark mark_xchg_8_impl(_masm, &aarch64_atomic_xchg_8_impl); 6998 gen_swpal_entry(Assembler::xword); 6999 7000 // CAS, memory_order_conservative 7001 AtomicStubMark mark_cmpxchg_1(_masm, &aarch64_atomic_cmpxchg_1_impl); 7002 gen_cas_entry(MacroAssembler::byte, memory_order_conservative); 7003 AtomicStubMark mark_cmpxchg_4(_masm, &aarch64_atomic_cmpxchg_4_impl); 7004 gen_cas_entry(MacroAssembler::word, memory_order_conservative); 7005 AtomicStubMark mark_cmpxchg_8(_masm, &aarch64_atomic_cmpxchg_8_impl); 7006 gen_cas_entry(MacroAssembler::xword, memory_order_conservative); 7007 7008 // CAS, memory_order_relaxed 7009 AtomicStubMark mark_cmpxchg_1_relaxed 7010 (_masm, &aarch64_atomic_cmpxchg_1_relaxed_impl); 7011 gen_cas_entry(MacroAssembler::byte, memory_order_relaxed); 7012 AtomicStubMark mark_cmpxchg_4_relaxed 7013 (_masm, &aarch64_atomic_cmpxchg_4_relaxed_impl); 7014 gen_cas_entry(MacroAssembler::word, memory_order_relaxed); 7015 AtomicStubMark mark_cmpxchg_8_relaxed 7016 (_masm, &aarch64_atomic_cmpxchg_8_relaxed_impl); 7017 gen_cas_entry(MacroAssembler::xword, memory_order_relaxed); 7018 7019 AtomicStubMark mark_cmpxchg_4_release 7020 (_masm, &aarch64_atomic_cmpxchg_4_release_impl); 7021 gen_cas_entry(MacroAssembler::word, memory_order_release); 7022 AtomicStubMark mark_cmpxchg_8_release 7023 (_masm, &aarch64_atomic_cmpxchg_8_release_impl); 7024 gen_cas_entry(MacroAssembler::xword, memory_order_release); 7025 7026 AtomicStubMark mark_cmpxchg_4_seq_cst 7027 (_masm, &aarch64_atomic_cmpxchg_4_seq_cst_impl); 7028 gen_cas_entry(MacroAssembler::word, memory_order_seq_cst); 7029 AtomicStubMark mark_cmpxchg_8_seq_cst 7030 (_masm, &aarch64_atomic_cmpxchg_8_seq_cst_impl); 7031 gen_cas_entry(MacroAssembler::xword, memory_order_seq_cst); 7032 7033 ICache::invalidate_range(first_entry, __ pc() - first_entry); 7034 } 7035 #endif // LINUX 7036 7037 address generate_cont_thaw(Continuation::thaw_kind kind) { 7038 bool return_barrier = Continuation::is_thaw_return_barrier(kind); 7039 bool return_barrier_exception = Continuation::is_thaw_return_barrier_exception(kind); 7040 7041 address start = __ pc(); 7042 7043 if (return_barrier) { 7044 __ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())); 7045 __ mov(sp, rscratch1); 7046 } 7047 assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp"); 7048 7049 if (return_barrier) { 7050 // preserve possible return value from a method 
returning to the return barrier 7051 __ fmovd(rscratch1, v0); 7052 __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize))); 7053 } 7054 7055 __ movw(c_rarg1, (return_barrier ? 1 : 0)); 7056 __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), rthread, c_rarg1); 7057 __ mov(rscratch2, r0); // r0 contains the size of the frames to thaw, 0 if overflow or no more frames 7058 7059 if (return_barrier) { 7060 // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK) 7061 __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize))); 7062 __ fmovd(v0, rscratch1); 7063 } 7064 assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp"); 7065 7066 7067 Label thaw_success; 7068 // rscratch2 contains the size of the frames to thaw, 0 if overflow or no more frames 7069 __ cbnz(rscratch2, thaw_success); 7070 __ lea(rscratch1, RuntimeAddress(SharedRuntime::throw_StackOverflowError_entry())); 7071 __ br(rscratch1); 7072 __ bind(thaw_success); 7073 7074 // make room for the thawed frames 7075 __ sub(rscratch1, sp, rscratch2); 7076 __ andr(rscratch1, rscratch1, -16); // align 7077 __ mov(sp, rscratch1); 7078 7079 if (return_barrier) { 7080 // save original return value -- again 7081 __ fmovd(rscratch1, v0); 7082 __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize))); 7083 } 7084 7085 // If we want, we can templatize thaw by kind, and have three different entries 7086 __ movw(c_rarg1, (uint32_t)kind); 7087 7088 __ call_VM_leaf(Continuation::thaw_entry(), rthread, c_rarg1); 7089 __ mov(rscratch2, r0); // r0 is the sp of the yielding frame 7090 7091 if (return_barrier) { 7092 // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK) 7093 __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize))); 7094 __ fmovd(v0, rscratch1); 7095 } else { 7096 __ mov(r0, zr); // return 0 (success) from doYield 7097 } 7098 7099 // we're now on the yield frame (which is in an address above us b/c rsp has been pushed down) 7100 __ sub(sp, rscratch2, 2*wordSize); // now pointing to rfp spill 7101 __ mov(rfp, sp); 7102 7103 if (return_barrier_exception) { 7104 __ ldr(c_rarg1, Address(rfp, wordSize)); // return address 7105 __ authenticate_return_address(c_rarg1); 7106 __ verify_oop(r0); 7107 // save return value containing the exception oop in callee-saved R19 7108 __ mov(r19, r0); 7109 7110 __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), rthread, c_rarg1); 7111 7112 // Reinitialize the ptrue predicate register, in case the external runtime call clobbers ptrue reg, as we may return to SVE compiled code. 
7113 // __ reinitialize_ptrue(); 7114 7115 // see OptoRuntime::generate_exception_blob: r0 -- exception oop, r3 -- exception pc 7116 7117 __ mov(r1, r0); // the exception handler 7118 __ mov(r0, r19); // restore return value containing the exception oop 7119 __ verify_oop(r0); 7120 7121 __ leave(); 7122 __ mov(r3, lr); 7123 __ br(r1); // the exception handler 7124 } else { 7125 // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame 7126 __ leave(); 7127 __ ret(lr); 7128 } 7129 7130 return start; 7131 } 7132 7133 address generate_cont_thaw() { 7134 if (!Continuations::enabled()) return nullptr; 7135 7136 StubCodeMark mark(this, "StubRoutines", "Cont thaw"); 7137 address start = __ pc(); 7138 generate_cont_thaw(Continuation::thaw_top); 7139 return start; 7140 } 7141 7142 address generate_cont_returnBarrier() { 7143 if (!Continuations::enabled()) return nullptr; 7144 7145 // TODO: will probably need multiple return barriers depending on return type 7146 StubCodeMark mark(this, "StubRoutines", "cont return barrier"); 7147 address start = __ pc(); 7148 7149 generate_cont_thaw(Continuation::thaw_return_barrier); 7150 7151 return start; 7152 } 7153 7154 address generate_cont_returnBarrier_exception() { 7155 if (!Continuations::enabled()) return nullptr; 7156 7157 StubCodeMark mark(this, "StubRoutines", "cont return barrier exception handler"); 7158 address start = __ pc(); 7159 7160 generate_cont_thaw(Continuation::thaw_return_barrier_exception); 7161 7162 return start; 7163 } 7164 7165 // In sun.security.util.math.intpoly.IntegerPolynomial1305, integers 7166 // are represented as long[5], with BITS_PER_LIMB = 26. 7167 // Pack five 26-bit limbs into three 64-bit registers. 7168 void pack_26(Register dest0, Register dest1, Register dest2, Register src) { 7169 __ ldp(dest0, rscratch1, Address(src, 0)); // 26 bits 7170 __ add(dest0, dest0, rscratch1, Assembler::LSL, 26); // 26 bits 7171 __ ldp(rscratch1, rscratch2, Address(src, 2 * sizeof (jlong))); 7172 __ add(dest0, dest0, rscratch1, Assembler::LSL, 52); // 12 bits 7173 7174 __ add(dest1, zr, rscratch1, Assembler::LSR, 12); // 14 bits 7175 __ add(dest1, dest1, rscratch2, Assembler::LSL, 14); // 26 bits 7176 __ ldr(rscratch1, Address(src, 4 * sizeof (jlong))); 7177 __ add(dest1, dest1, rscratch1, Assembler::LSL, 40); // 24 bits 7178 7179 if (dest2->is_valid()) { 7180 __ add(dest2, zr, rscratch1, Assembler::LSR, 24); // 2 bits 7181 } else { 7182 #ifdef ASSERT 7183 Label OK; 7184 __ cmp(zr, rscratch1, Assembler::LSR, 24); // 2 bits 7185 __ br(__ EQ, OK); 7186 __ stop("high bits of Poly1305 integer should be zero"); 7187 __ should_not_reach_here(); 7188 __ bind(OK); 7189 #endif 7190 } 7191 } 7192 7193 // As above, but return only a 128-bit integer, packed into two 7194 // 64-bit registers. 7195 void pack_26(Register dest0, Register dest1, Register src) { 7196 pack_26(dest0, dest1, noreg, src); 7197 } 7198 7199 // Multiply and multiply-accumulate unsigned 64-bit registers. 7200 void wide_mul(Register prod_lo, Register prod_hi, Register n, Register m) { 7201 __ mul(prod_lo, n, m); 7202 __ umulh(prod_hi, n, m); 7203 } 7204 void wide_madd(Register sum_lo, Register sum_hi, Register n, Register m) { 7205 wide_mul(rscratch1, rscratch2, n, m); 7206 __ adds(sum_lo, sum_lo, rscratch1); 7207 __ adc(sum_hi, sum_hi, rscratch2); 7208 } 7209 7210 // Poly1305, RFC 7539 7211 7212 // See https://loup-vaillant.fr/tutorials/poly1305-design for a 7213 // description of the tricks used to simplify and accelerate this 7214 // computation. 
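// In outline (a sketch, not the exact limb schedule used below): for each
// 16-byte block m the accumulator h and the clamped key r are updated as
//
//   h = ((h + m + 2^128) * r) mod (2^130 - 5)
//
// Below, the three 64-bit limbs U_2:U_1:U_0 hold h, R_1:R_0 hold r, and
// RR_n = (R_n >> 2) * 5 pre-scales the key so that partial products with
// weight 2^128 can be folded back into the low limbs via 2^130 == 5 (mod p).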
7215 7216 address generate_poly1305_processBlocks() { 7217 __ align(CodeEntryAlignment); 7218 StubCodeMark mark(this, "StubRoutines", "poly1305_processBlocks"); 7219 address start = __ pc(); 7220 Label here; 7221 __ enter(); 7222 RegSet callee_saved = RegSet::range(r19, r28); 7223 __ push(callee_saved, sp); 7224 7225 RegSetIterator<Register> regs = (RegSet::range(c_rarg0, r28) - r18_tls - rscratch1 - rscratch2).begin(); 7226 7227 // Arguments 7228 const Register input_start = *regs, length = *++regs, acc_start = *++regs, r_start = *++regs; 7229 7230 // R_n is the 128-bit randomly-generated key, packed into two 7231 // registers. The caller passes this key to us as long[5], with 7232 // BITS_PER_LIMB = 26. 7233 const Register R_0 = *++regs, R_1 = *++regs; 7234 pack_26(R_0, R_1, r_start); 7235 7236 // RR_n is (R_n >> 2) * 5 7237 const Register RR_0 = *++regs, RR_1 = *++regs; 7238 __ lsr(RR_0, R_0, 2); 7239 __ add(RR_0, RR_0, RR_0, Assembler::LSL, 2); 7240 __ lsr(RR_1, R_1, 2); 7241 __ add(RR_1, RR_1, RR_1, Assembler::LSL, 2); 7242 7243 // U_n is the current checksum 7244 const Register U_0 = *++regs, U_1 = *++regs, U_2 = *++regs; 7245 pack_26(U_0, U_1, U_2, acc_start); 7246 7247 static constexpr int BLOCK_LENGTH = 16; 7248 Label DONE, LOOP; 7249 7250 __ cmp(length, checked_cast<u1>(BLOCK_LENGTH)); 7251 __ br(Assembler::LT, DONE); { 7252 __ bind(LOOP); 7253 7254 // S_n is to be the sum of U_n and the next block of data 7255 const Register S_0 = *++regs, S_1 = *++regs, S_2 = *++regs; 7256 __ ldp(S_0, S_1, __ post(input_start, 2 * wordSize)); 7257 __ adds(S_0, U_0, S_0); 7258 __ adcs(S_1, U_1, S_1); 7259 __ adc(S_2, U_2, zr); 7260 __ add(S_2, S_2, 1); 7261 7262 const Register U_0HI = *++regs, U_1HI = *++regs; 7263 7264 // NB: this logic depends on some of the special properties of 7265 // Poly1305 keys. In particular, because we know that the top 7266 // four bits of R_0 and R_1 are zero, we can add together 7267 // partial products without any risk of needing to propagate a 7268 // carry out. 7269 wide_mul(U_0, U_0HI, S_0, R_0); wide_madd(U_0, U_0HI, S_1, RR_1); wide_madd(U_0, U_0HI, S_2, RR_0); 7270 wide_mul(U_1, U_1HI, S_0, R_1); wide_madd(U_1, U_1HI, S_1, R_0); wide_madd(U_1, U_1HI, S_2, RR_1); 7271 __ andr(U_2, R_0, 3); 7272 __ mul(U_2, S_2, U_2); 7273 7274 // Recycle registers S_0, S_1, S_2 7275 regs = (regs.remaining() + S_0 + S_1 + S_2).begin(); 7276 7277 // Partial reduction mod 2**130 - 5 7278 __ adds(U_1, U_0HI, U_1); 7279 __ adc(U_2, U_1HI, U_2); 7280 // Sum now in U_2:U_1:U_0. 7281 // Dead: U_0HI, U_1HI. 
7282 regs = (regs.remaining() + U_0HI + U_1HI).begin(); 7283 7284 // U_2:U_1:U_0 += (U_2 >> 2) * 5 in two steps 7285 7286 // First, U_2:U_1:U_0 += (U_2 >> 2) 7287 __ lsr(rscratch1, U_2, 2); 7288 __ andr(U_2, U_2, (u8)3); 7289 __ adds(U_0, U_0, rscratch1); 7290 __ adcs(U_1, U_1, zr); 7291 __ adc(U_2, U_2, zr); 7292 // Second, U_2:U_1:U_0 += (U_2 >> 2) << 2 7293 __ adds(U_0, U_0, rscratch1, Assembler::LSL, 2); 7294 __ adcs(U_1, U_1, zr); 7295 __ adc(U_2, U_2, zr); 7296 7297 __ sub(length, length, checked_cast<u1>(BLOCK_LENGTH)); 7298 __ cmp(length, checked_cast<u1>(BLOCK_LENGTH)); 7299 __ br(~ Assembler::LT, LOOP); 7300 } 7301 7302 // Further reduce modulo 2^130 - 5 7303 __ lsr(rscratch1, U_2, 2); 7304 __ add(rscratch1, rscratch1, rscratch1, Assembler::LSL, 2); // rscratch1 = U_2 * 5 7305 __ adds(U_0, U_0, rscratch1); // U_0 += U_2 * 5 7306 __ adcs(U_1, U_1, zr); 7307 __ andr(U_2, U_2, (u1)3); 7308 __ adc(U_2, U_2, zr); 7309 7310 // Unpack the sum into five 26-bit limbs and write to memory. 7311 __ ubfiz(rscratch1, U_0, 0, 26); 7312 __ ubfx(rscratch2, U_0, 26, 26); 7313 __ stp(rscratch1, rscratch2, Address(acc_start)); 7314 __ ubfx(rscratch1, U_0, 52, 12); 7315 __ bfi(rscratch1, U_1, 12, 14); 7316 __ ubfx(rscratch2, U_1, 14, 26); 7317 __ stp(rscratch1, rscratch2, Address(acc_start, 2 * sizeof (jlong))); 7318 __ ubfx(rscratch1, U_1, 40, 24); 7319 __ bfi(rscratch1, U_2, 24, 3); 7320 __ str(rscratch1, Address(acc_start, 4 * sizeof (jlong))); 7321 7322 __ bind(DONE); 7323 __ pop(callee_saved, sp); 7324 __ leave(); 7325 __ ret(lr); 7326 7327 return start; 7328 } 7329 7330 // exception handler for upcall stubs 7331 address generate_upcall_stub_exception_handler() { 7332 StubCodeMark mark(this, "StubRoutines", "upcall stub exception handler"); 7333 address start = __ pc(); 7334 7335 // Native caller has no idea how to handle exceptions, 7336 // so we just crash here. Up to callee to catch exceptions. 
7337 __ verify_oop(r0); 7338 __ movptr(rscratch1, CAST_FROM_FN_PTR(uint64_t, UpcallLinker::handle_uncaught_exception)); 7339 __ blr(rscratch1); 7340 __ should_not_reach_here(); 7341 7342 return start; 7343 } 7344 7345 // load Method* target of MethodHandle 7346 // j_rarg0 = jobject receiver 7347 // rmethod = result 7348 address generate_upcall_stub_load_target() { 7349 StubCodeMark mark(this, "StubRoutines", "upcall_stub_load_target"); 7350 address start = __ pc(); 7351 7352 __ resolve_global_jobject(j_rarg0, rscratch1, rscratch2); 7353 // Load target method from receiver 7354 __ load_heap_oop(rmethod, Address(j_rarg0, java_lang_invoke_MethodHandle::form_offset()), rscratch1, rscratch2); 7355 __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_LambdaForm::vmentry_offset()), rscratch1, rscratch2); 7356 __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_MemberName::method_offset()), rscratch1, rscratch2); 7357 __ access_load_at(T_ADDRESS, IN_HEAP, rmethod, 7358 Address(rmethod, java_lang_invoke_ResolvedMethodName::vmtarget_offset()), 7359 noreg, noreg); 7360 __ str(rmethod, Address(rthread, JavaThread::callee_target_offset())); // just in case callee is deoptimized 7361 7362 __ ret(lr); 7363 7364 return start; 7365 } 7366 7367 #undef __ 7368 #define __ masm-> 7369 7370 class MontgomeryMultiplyGenerator : public MacroAssembler { 7371 7372 Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn, 7373 Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj; 7374 7375 RegSet _toSave; 7376 bool _squaring; 7377 7378 public: 7379 MontgomeryMultiplyGenerator (Assembler *as, bool squaring) 7380 : MacroAssembler(as->code()), _squaring(squaring) { 7381 7382 // Register allocation 7383 7384 RegSetIterator<Register> regs = (RegSet::range(r0, r26) - r18_tls).begin(); 7385 Pa_base = *regs; // Argument registers 7386 if (squaring) 7387 Pb_base = Pa_base; 7388 else 7389 Pb_base = *++regs; 7390 Pn_base = *++regs; 7391 Rlen= *++regs; 7392 inv = *++regs; 7393 Pm_base = *++regs; 7394 7395 // Working registers: 7396 Ra = *++regs; // The current digit of a, b, n, and m. 7397 Rb = *++regs; 7398 Rm = *++regs; 7399 Rn = *++regs; 7400 7401 Pa = *++regs; // Pointers to the current/next digit of a, b, n, and m. 7402 Pb = *++regs; 7403 Pm = *++regs; 7404 Pn = *++regs; 7405 7406 t0 = *++regs; // Three registers which form a 7407 t1 = *++regs; // triple-precision accumuator. 7408 t2 = *++regs; 7409 7410 Ri = *++regs; // Inner and outer loop indexes. 7411 Rj = *++regs; 7412 7413 Rhi_ab = *++regs; // Product registers: low and high parts 7414 Rlo_ab = *++regs; // of a*b and m*n. 7415 Rhi_mn = *++regs; 7416 Rlo_mn = *++regs; 7417 7418 // r19 and up are callee-saved. 
7419 _toSave = RegSet::range(r19, *regs) + Pm_base; 7420 } 7421 7422 private: 7423 void save_regs() { 7424 push(_toSave, sp); 7425 } 7426 7427 void restore_regs() { 7428 pop(_toSave, sp); 7429 } 7430 7431 template <typename T> 7432 void unroll_2(Register count, T block) { 7433 Label loop, end, odd; 7434 tbnz(count, 0, odd); 7435 cbz(count, end); 7436 align(16); 7437 bind(loop); 7438 (this->*block)(); 7439 bind(odd); 7440 (this->*block)(); 7441 subs(count, count, 2); 7442 br(Assembler::GT, loop); 7443 bind(end); 7444 } 7445 7446 template <typename T> 7447 void unroll_2(Register count, T block, Register d, Register s, Register tmp) { 7448 Label loop, end, odd; 7449 tbnz(count, 0, odd); 7450 cbz(count, end); 7451 align(16); 7452 bind(loop); 7453 (this->*block)(d, s, tmp); 7454 bind(odd); 7455 (this->*block)(d, s, tmp); 7456 subs(count, count, 2); 7457 br(Assembler::GT, loop); 7458 bind(end); 7459 } 7460 7461 void pre1(RegisterOrConstant i) { 7462 block_comment("pre1"); 7463 // Pa = Pa_base; 7464 // Pb = Pb_base + i; 7465 // Pm = Pm_base; 7466 // Pn = Pn_base + i; 7467 // Ra = *Pa; 7468 // Rb = *Pb; 7469 // Rm = *Pm; 7470 // Rn = *Pn; 7471 ldr(Ra, Address(Pa_base)); 7472 ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); 7473 ldr(Rm, Address(Pm_base)); 7474 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 7475 lea(Pa, Address(Pa_base)); 7476 lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); 7477 lea(Pm, Address(Pm_base)); 7478 lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 7479 7480 // Zero the m*n result. 7481 mov(Rhi_mn, zr); 7482 mov(Rlo_mn, zr); 7483 } 7484 7485 // The core multiply-accumulate step of a Montgomery 7486 // multiplication. The idea is to schedule operations as a 7487 // pipeline so that instructions with long latencies (loads and 7488 // multiplies) have time to complete before their results are 7489 // used. This most benefits in-order implementations of the 7490 // architecture but out-of-order ones also benefit. 7491 void step() { 7492 block_comment("step"); 7493 // MACC(Ra, Rb, t0, t1, t2); 7494 // Ra = *++Pa; 7495 // Rb = *--Pb; 7496 umulh(Rhi_ab, Ra, Rb); 7497 mul(Rlo_ab, Ra, Rb); 7498 ldr(Ra, pre(Pa, wordSize)); 7499 ldr(Rb, pre(Pb, -wordSize)); 7500 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the 7501 // previous iteration. 7502 // MACC(Rm, Rn, t0, t1, t2); 7503 // Rm = *++Pm; 7504 // Rn = *--Pn; 7505 umulh(Rhi_mn, Rm, Rn); 7506 mul(Rlo_mn, Rm, Rn); 7507 ldr(Rm, pre(Pm, wordSize)); 7508 ldr(Rn, pre(Pn, -wordSize)); 7509 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 7510 } 7511 7512 void post1() { 7513 block_comment("post1"); 7514 7515 // MACC(Ra, Rb, t0, t1, t2); 7516 // Ra = *++Pa; 7517 // Rb = *--Pb; 7518 umulh(Rhi_ab, Ra, Rb); 7519 mul(Rlo_ab, Ra, Rb); 7520 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 7521 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 7522 7523 // *Pm = Rm = t0 * inv; 7524 mul(Rm, t0, inv); 7525 str(Rm, Address(Pm)); 7526 7527 // MACC(Rm, Rn, t0, t1, t2); 7528 // t0 = t1; t1 = t2; t2 = 0; 7529 umulh(Rhi_mn, Rm, Rn); 7530 7531 #ifndef PRODUCT 7532 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 7533 { 7534 mul(Rlo_mn, Rm, Rn); 7535 add(Rlo_mn, t0, Rlo_mn); 7536 Label ok; 7537 cbz(Rlo_mn, ok); { 7538 stop("broken Montgomery multiply"); 7539 } bind(ok); 7540 } 7541 #endif 7542 // We have very carefully set things up so that 7543 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 7544 // the lower half of Rm * Rn because we know the result already: 7545 // it must be -t0. 
t0 + (-t0) must generate a carry iff 7546 // t0 != 0. So, rather than do a mul and an adds we just set 7547 // the carry flag iff t0 is nonzero. 7548 // 7549 // mul(Rlo_mn, Rm, Rn); 7550 // adds(zr, t0, Rlo_mn); 7551 subs(zr, t0, 1); // Set carry iff t0 is nonzero 7552 adcs(t0, t1, Rhi_mn); 7553 adc(t1, t2, zr); 7554 mov(t2, zr); 7555 } 7556 7557 void pre2(RegisterOrConstant i, RegisterOrConstant len) { 7558 block_comment("pre2"); 7559 // Pa = Pa_base + i-len; 7560 // Pb = Pb_base + len; 7561 // Pm = Pm_base + i-len; 7562 // Pn = Pn_base + len; 7563 7564 if (i.is_register()) { 7565 sub(Rj, i.as_register(), len); 7566 } else { 7567 mov(Rj, i.as_constant()); 7568 sub(Rj, Rj, len); 7569 } 7570 // Rj == i-len 7571 7572 lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord))); 7573 lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord))); 7574 lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 7575 lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord))); 7576 7577 // Ra = *++Pa; 7578 // Rb = *--Pb; 7579 // Rm = *++Pm; 7580 // Rn = *--Pn; 7581 ldr(Ra, pre(Pa, wordSize)); 7582 ldr(Rb, pre(Pb, -wordSize)); 7583 ldr(Rm, pre(Pm, wordSize)); 7584 ldr(Rn, pre(Pn, -wordSize)); 7585 7586 mov(Rhi_mn, zr); 7587 mov(Rlo_mn, zr); 7588 } 7589 7590 void post2(RegisterOrConstant i, RegisterOrConstant len) { 7591 block_comment("post2"); 7592 if (i.is_constant()) { 7593 mov(Rj, i.as_constant()-len.as_constant()); 7594 } else { 7595 sub(Rj, i.as_register(), len); 7596 } 7597 7598 adds(t0, t0, Rlo_mn); // The pending m*n, low part 7599 7600 // As soon as we know the least significant digit of our result, 7601 // store it. 7602 // Pm_base[i-len] = t0; 7603 str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 7604 7605 // t0 = t1; t1 = t2; t2 = 0; 7606 adcs(t0, t1, Rhi_mn); // The pending m*n, high part 7607 adc(t1, t2, zr); 7608 mov(t2, zr); 7609 } 7610 7611 // A carry in t0 after Montgomery multiplication means that we 7612 // should subtract multiples of n from our result in m. We'll 7613 // keep doing that until there is no carry. 7614 void normalize(RegisterOrConstant len) { 7615 block_comment("normalize"); 7616 // while (t0) 7617 // t0 = sub(Pm_base, Pn_base, t0, len); 7618 Label loop, post, again; 7619 Register cnt = t1, i = t2; // Re-use registers; we're done with them now 7620 cbz(t0, post); { 7621 bind(again); { 7622 mov(i, zr); 7623 mov(cnt, len); 7624 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 7625 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 7626 subs(zr, zr, zr); // set carry flag, i.e. no borrow 7627 align(16); 7628 bind(loop); { 7629 sbcs(Rm, Rm, Rn); 7630 str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 7631 add(i, i, 1); 7632 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 7633 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 7634 sub(cnt, cnt, 1); 7635 } cbnz(cnt, loop); 7636 sbc(t0, t0, zr); 7637 } cbnz(t0, again); 7638 } bind(post); 7639 } 7640 7641 // Move memory at s to d, reversing words. 
7642 // Increments d to end of copied memory 7643 // Destroys tmp1, tmp2 7644 // Preserves len 7645 // Leaves s pointing to the address which was in d at start 7646 void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) { 7647 assert(tmp1->encoding() < r19->encoding(), "register corruption"); 7648 assert(tmp2->encoding() < r19->encoding(), "register corruption"); 7649 7650 lea(s, Address(s, len, Address::uxtw(LogBytesPerWord))); 7651 mov(tmp1, len); 7652 unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2); 7653 sub(s, d, len, ext::uxtw, LogBytesPerWord); 7654 } 7655 // where 7656 void reverse1(Register d, Register s, Register tmp) { 7657 ldr(tmp, pre(s, -wordSize)); 7658 ror(tmp, tmp, 32); 7659 str(tmp, post(d, wordSize)); 7660 } 7661 7662 void step_squaring() { 7663 // An extra ACC 7664 step(); 7665 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 7666 } 7667 7668 void last_squaring(RegisterOrConstant i) { 7669 Label dont; 7670 // if ((i & 1) == 0) { 7671 tbnz(i.as_register(), 0, dont); { 7672 // MACC(Ra, Rb, t0, t1, t2); 7673 // Ra = *++Pa; 7674 // Rb = *--Pb; 7675 umulh(Rhi_ab, Ra, Rb); 7676 mul(Rlo_ab, Ra, Rb); 7677 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 7678 } bind(dont); 7679 } 7680 7681 void extra_step_squaring() { 7682 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 7683 7684 // MACC(Rm, Rn, t0, t1, t2); 7685 // Rm = *++Pm; 7686 // Rn = *--Pn; 7687 umulh(Rhi_mn, Rm, Rn); 7688 mul(Rlo_mn, Rm, Rn); 7689 ldr(Rm, pre(Pm, wordSize)); 7690 ldr(Rn, pre(Pn, -wordSize)); 7691 } 7692 7693 void post1_squaring() { 7694 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 7695 7696 // *Pm = Rm = t0 * inv; 7697 mul(Rm, t0, inv); 7698 str(Rm, Address(Pm)); 7699 7700 // MACC(Rm, Rn, t0, t1, t2); 7701 // t0 = t1; t1 = t2; t2 = 0; 7702 umulh(Rhi_mn, Rm, Rn); 7703 7704 #ifndef PRODUCT 7705 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 7706 { 7707 mul(Rlo_mn, Rm, Rn); 7708 add(Rlo_mn, t0, Rlo_mn); 7709 Label ok; 7710 cbz(Rlo_mn, ok); { 7711 stop("broken Montgomery multiply"); 7712 } bind(ok); 7713 } 7714 #endif 7715 // We have very carefully set things up so that 7716 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 7717 // the lower half of Rm * Rn because we know the result already: 7718 // it must be -t0. t0 + (-t0) must generate a carry iff 7719 // t0 != 0. So, rather than do a mul and an adds we just set 7720 // the carry flag iff t0 is nonzero. 7721 // 7722 // mul(Rlo_mn, Rm, Rn); 7723 // adds(zr, t0, Rlo_mn); 7724 subs(zr, t0, 1); // Set carry iff t0 is nonzero 7725 adcs(t0, t1, Rhi_mn); 7726 adc(t1, t2, zr); 7727 mov(t2, zr); 7728 } 7729 7730 void acc(Register Rhi, Register Rlo, 7731 Register t0, Register t1, Register t2) { 7732 adds(t0, t0, Rlo); 7733 adcs(t1, t1, Rhi); 7734 adc(t2, t2, zr); 7735 } 7736 7737 public: 7738 /** 7739 * Fast Montgomery multiplication. The derivation of the 7740 * algorithm is in A Cryptographic Library for the Motorola 7741 * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237. 
7742 * 7743 * Arguments: 7744 * 7745 * Inputs for multiplication: 7746 * c_rarg0 - int array elements a 7747 * c_rarg1 - int array elements b 7748 * c_rarg2 - int array elements n (the modulus) 7749 * c_rarg3 - int length 7750 * c_rarg4 - int inv 7751 * c_rarg5 - int array elements m (the result) 7752 * 7753 * Inputs for squaring: 7754 * c_rarg0 - int array elements a 7755 * c_rarg1 - int array elements n (the modulus) 7756 * c_rarg2 - int length 7757 * c_rarg3 - int inv 7758 * c_rarg4 - int array elements m (the result) 7759 * 7760 */ 7761 address generate_multiply() { 7762 Label argh, nothing; 7763 bind(argh); 7764 stop("MontgomeryMultiply total_allocation must be <= 8192"); 7765 7766 align(CodeEntryAlignment); 7767 address entry = pc(); 7768 7769 cbzw(Rlen, nothing); 7770 7771 enter(); 7772 7773 // Make room. 7774 cmpw(Rlen, 512); 7775 br(Assembler::HI, argh); 7776 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 7777 andr(sp, Ra, -2 * wordSize); 7778 7779 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 7780 7781 { 7782 // Copy input args, reversing as we go. We use Ra as a 7783 // temporary variable. 7784 reverse(Ra, Pa_base, Rlen, t0, t1); 7785 if (!_squaring) 7786 reverse(Ra, Pb_base, Rlen, t0, t1); 7787 reverse(Ra, Pn_base, Rlen, t0, t1); 7788 } 7789 7790 // Push all call-saved registers and also Pm_base which we'll need 7791 // at the end. 7792 save_regs(); 7793 7794 #ifndef PRODUCT 7795 // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply"); 7796 { 7797 ldr(Rn, Address(Pn_base, 0)); 7798 mul(Rlo_mn, Rn, inv); 7799 subs(zr, Rlo_mn, -1); 7800 Label ok; 7801 br(EQ, ok); { 7802 stop("broken inverse in Montgomery multiply"); 7803 } bind(ok); 7804 } 7805 #endif 7806 7807 mov(Pm_base, Ra); 7808 7809 mov(t0, zr); 7810 mov(t1, zr); 7811 mov(t2, zr); 7812 7813 block_comment("for (int i = 0; i < len; i++) {"); 7814 mov(Ri, zr); { 7815 Label loop, end; 7816 cmpw(Ri, Rlen); 7817 br(Assembler::GE, end); 7818 7819 bind(loop); 7820 pre1(Ri); 7821 7822 block_comment(" for (j = i; j; j--) {"); { 7823 movw(Rj, Ri); 7824 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 7825 } block_comment(" } // j"); 7826 7827 post1(); 7828 addw(Ri, Ri, 1); 7829 cmpw(Ri, Rlen); 7830 br(Assembler::LT, loop); 7831 bind(end); 7832 block_comment("} // i"); 7833 } 7834 7835 block_comment("for (int i = len; i < 2*len; i++) {"); 7836 mov(Ri, Rlen); { 7837 Label loop, end; 7838 cmpw(Ri, Rlen, Assembler::LSL, 1); 7839 br(Assembler::GE, end); 7840 7841 bind(loop); 7842 pre2(Ri, Rlen); 7843 7844 block_comment(" for (j = len*2-i-1; j; j--) {"); { 7845 lslw(Rj, Rlen, 1); 7846 subw(Rj, Rj, Ri); 7847 subw(Rj, Rj, 1); 7848 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 7849 } block_comment(" } // j"); 7850 7851 post2(Ri, Rlen); 7852 addw(Ri, Ri, 1); 7853 cmpw(Ri, Rlen, Assembler::LSL, 1); 7854 br(Assembler::LT, loop); 7855 bind(end); 7856 } 7857 block_comment("} // i"); 7858 7859 normalize(Rlen); 7860 7861 mov(Ra, Pm_base); // Save Pm_base in Ra 7862 restore_regs(); // Restore caller's Pm_base 7863 7864 // Copy our result into caller's Pm_base 7865 reverse(Pm_base, Ra, Rlen, t0, t1); 7866 7867 leave(); 7868 bind(nothing); 7869 ret(lr); 7870 7871 return entry; 7872 } 7873 // In C, approximately: 7874 7875 // void 7876 // montgomery_multiply(julong Pa_base[], julong Pb_base[], 7877 // julong Pn_base[], julong Pm_base[], 7878 // julong inv, int len) { 7879 // julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 7880 // julong *Pa, *Pb, *Pn, *Pm; 7881 // julong Ra, Rb, Rn, Rm; 7882 7883 // 
int i; 7884 7885 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply"); 7886 7887 // for (i = 0; i < len; i++) { 7888 // int j; 7889 7890 // Pa = Pa_base; 7891 // Pb = Pb_base + i; 7892 // Pm = Pm_base; 7893 // Pn = Pn_base + i; 7894 7895 // Ra = *Pa; 7896 // Rb = *Pb; 7897 // Rm = *Pm; 7898 // Rn = *Pn; 7899 7900 // int iters = i; 7901 // for (j = 0; iters--; j++) { 7902 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 7903 // MACC(Ra, Rb, t0, t1, t2); 7904 // Ra = *++Pa; 7905 // Rb = *--Pb; 7906 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 7907 // MACC(Rm, Rn, t0, t1, t2); 7908 // Rm = *++Pm; 7909 // Rn = *--Pn; 7910 // } 7911 7912 // assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be"); 7913 // MACC(Ra, Rb, t0, t1, t2); 7914 // *Pm = Rm = t0 * inv; 7915 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be"); 7916 // MACC(Rm, Rn, t0, t1, t2); 7917 7918 // assert(t0 == 0, "broken Montgomery multiply"); 7919 7920 // t0 = t1; t1 = t2; t2 = 0; 7921 // } 7922 7923 // for (i = len; i < 2*len; i++) { 7924 // int j; 7925 7926 // Pa = Pa_base + i-len; 7927 // Pb = Pb_base + len; 7928 // Pm = Pm_base + i-len; 7929 // Pn = Pn_base + len; 7930 7931 // Ra = *++Pa; 7932 // Rb = *--Pb; 7933 // Rm = *++Pm; 7934 // Rn = *--Pn; 7935 7936 // int iters = len*2-i-1; 7937 // for (j = i-len+1; iters--; j++) { 7938 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 7939 // MACC(Ra, Rb, t0, t1, t2); 7940 // Ra = *++Pa; 7941 // Rb = *--Pb; 7942 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 7943 // MACC(Rm, Rn, t0, t1, t2); 7944 // Rm = *++Pm; 7945 // Rn = *--Pn; 7946 // } 7947 7948 // Pm_base[i-len] = t0; 7949 // t0 = t1; t1 = t2; t2 = 0; 7950 // } 7951 7952 // while (t0) 7953 // t0 = sub(Pm_base, Pn_base, t0, len); 7954 // } 7955 7956 /** 7957 * Fast Montgomery squaring. This uses asymptotically 25% fewer 7958 * multiplies than Montgomery multiplication so it should be up to 7959 * 25% faster. However, its loop control is more complex and it 7960 * may actually run slower on some machines. 7961 * 7962 * Arguments: 7963 * 7964 * Inputs: 7965 * c_rarg0 - int array elements a 7966 * c_rarg1 - int array elements n (the modulus) 7967 * c_rarg2 - int length 7968 * c_rarg3 - int inv 7969 * c_rarg4 - int array elements m (the result) 7970 * 7971 */ 7972 address generate_square() { 7973 Label argh; 7974 bind(argh); 7975 stop("MontgomeryMultiply total_allocation must be <= 8192"); 7976 7977 align(CodeEntryAlignment); 7978 address entry = pc(); 7979 7980 enter(); 7981 7982 // Make room. 7983 cmpw(Rlen, 512); 7984 br(Assembler::HI, argh); 7985 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 7986 andr(sp, Ra, -2 * wordSize); 7987 7988 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 7989 7990 { 7991 // Copy input args, reversing as we go. We use Ra as a 7992 // temporary variable. 7993 reverse(Ra, Pa_base, Rlen, t0, t1); 7994 reverse(Ra, Pn_base, Rlen, t0, t1); 7995 } 7996 7997 // Push all call-saved registers and also Pm_base which we'll need 7998 // at the end. 
7999 save_regs(); 8000 8001 mov(Pm_base, Ra); 8002 8003 mov(t0, zr); 8004 mov(t1, zr); 8005 mov(t2, zr); 8006 8007 block_comment("for (int i = 0; i < len; i++) {"); 8008 mov(Ri, zr); { 8009 Label loop, end; 8010 bind(loop); 8011 cmp(Ri, Rlen); 8012 br(Assembler::GE, end); 8013 8014 pre1(Ri); 8015 8016 block_comment("for (j = (i+1)/2; j; j--) {"); { 8017 add(Rj, Ri, 1); 8018 lsr(Rj, Rj, 1); 8019 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 8020 } block_comment(" } // j"); 8021 8022 last_squaring(Ri); 8023 8024 block_comment(" for (j = i/2; j; j--) {"); { 8025 lsr(Rj, Ri, 1); 8026 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 8027 } block_comment(" } // j"); 8028 8029 post1_squaring(); 8030 add(Ri, Ri, 1); 8031 cmp(Ri, Rlen); 8032 br(Assembler::LT, loop); 8033 8034 bind(end); 8035 block_comment("} // i"); 8036 } 8037 8038 block_comment("for (int i = len; i < 2*len; i++) {"); 8039 mov(Ri, Rlen); { 8040 Label loop, end; 8041 bind(loop); 8042 cmp(Ri, Rlen, Assembler::LSL, 1); 8043 br(Assembler::GE, end); 8044 8045 pre2(Ri, Rlen); 8046 8047 block_comment(" for (j = (2*len-i-1)/2; j; j--) {"); { 8048 lsl(Rj, Rlen, 1); 8049 sub(Rj, Rj, Ri); 8050 sub(Rj, Rj, 1); 8051 lsr(Rj, Rj, 1); 8052 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 8053 } block_comment(" } // j"); 8054 8055 last_squaring(Ri); 8056 8057 block_comment(" for (j = (2*len-i)/2; j; j--) {"); { 8058 lsl(Rj, Rlen, 1); 8059 sub(Rj, Rj, Ri); 8060 lsr(Rj, Rj, 1); 8061 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 8062 } block_comment(" } // j"); 8063 8064 post2(Ri, Rlen); 8065 add(Ri, Ri, 1); 8066 cmp(Ri, Rlen, Assembler::LSL, 1); 8067 8068 br(Assembler::LT, loop); 8069 bind(end); 8070 block_comment("} // i"); 8071 } 8072 8073 normalize(Rlen); 8074 8075 mov(Ra, Pm_base); // Save Pm_base in Ra 8076 restore_regs(); // Restore caller's Pm_base 8077 8078 // Copy our result into caller's Pm_base 8079 reverse(Pm_base, Ra, Rlen, t0, t1); 8080 8081 leave(); 8082 ret(lr); 8083 8084 return entry; 8085 } 8086 // In C, approximately: 8087 8088 // void 8089 // montgomery_square(julong Pa_base[], julong Pn_base[], 8090 // julong Pm_base[], julong inv, int len) { 8091 // julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 8092 // julong *Pa, *Pb, *Pn, *Pm; 8093 // julong Ra, Rb, Rn, Rm; 8094 8095 // int i; 8096 8097 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply"); 8098 8099 // for (i = 0; i < len; i++) { 8100 // int j; 8101 8102 // Pa = Pa_base; 8103 // Pb = Pa_base + i; 8104 // Pm = Pm_base; 8105 // Pn = Pn_base + i; 8106 8107 // Ra = *Pa; 8108 // Rb = *Pb; 8109 // Rm = *Pm; 8110 // Rn = *Pn; 8111 8112 // int iters = (i+1)/2; 8113 // for (j = 0; iters--; j++) { 8114 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be"); 8115 // MACC2(Ra, Rb, t0, t1, t2); 8116 // Ra = *++Pa; 8117 // Rb = *--Pb; 8118 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 8119 // MACC(Rm, Rn, t0, t1, t2); 8120 // Rm = *++Pm; 8121 // Rn = *--Pn; 8122 // } 8123 // if ((i & 1) == 0) { 8124 // assert(Ra == Pa_base[j], "must be"); 8125 // MACC(Ra, Ra, t0, t1, t2); 8126 // } 8127 // iters = i/2; 8128 // assert(iters == i-j, "must be"); 8129 // for (; iters--; j++) { 8130 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 8131 // MACC(Rm, Rn, t0, t1, t2); 8132 // Rm = *++Pm; 8133 // Rn = *--Pn; 8134 // } 8135 8136 // *Pm = Rm = t0 * inv; 8137 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be"); 8138 // MACC(Rm, Rn, t0, t1, t2); 8139 8140 // 
assert(t0 == 0, "broken Montgomery multiply"); 8141 8142 // t0 = t1; t1 = t2; t2 = 0; 8143 // } 8144 8145 // for (i = len; i < 2*len; i++) { 8146 // int start = i-len+1; 8147 // int end = start + (len - start)/2; 8148 // int j; 8149 8150 // Pa = Pa_base + i-len; 8151 // Pb = Pa_base + len; 8152 // Pm = Pm_base + i-len; 8153 // Pn = Pn_base + len; 8154 8155 // Ra = *++Pa; 8156 // Rb = *--Pb; 8157 // Rm = *++Pm; 8158 // Rn = *--Pn; 8159 8160 // int iters = (2*len-i-1)/2; 8161 // assert(iters == end-start, "must be"); 8162 // for (j = start; iters--; j++) { 8163 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be"); 8164 // MACC2(Ra, Rb, t0, t1, t2); 8165 // Ra = *++Pa; 8166 // Rb = *--Pb; 8167 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 8168 // MACC(Rm, Rn, t0, t1, t2); 8169 // Rm = *++Pm; 8170 // Rn = *--Pn; 8171 // } 8172 // if ((i & 1) == 0) { 8173 // assert(Ra == Pa_base[j], "must be"); 8174 // MACC(Ra, Ra, t0, t1, t2); 8175 // } 8176 // iters = (2*len-i)/2; 8177 // assert(iters == len-j, "must be"); 8178 // for (; iters--; j++) { 8179 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 8180 // MACC(Rm, Rn, t0, t1, t2); 8181 // Rm = *++Pm; 8182 // Rn = *--Pn; 8183 // } 8184 // Pm_base[i-len] = t0; 8185 // t0 = t1; t1 = t2; t2 = 0; 8186 // } 8187 8188 // while (t0) 8189 // t0 = sub(Pm_base, Pn_base, t0, len); 8190 // } 8191 }; 8192 8193 8194 // Call here from the interpreter or compiled code to either load 8195 // multiple returned values from the inline type instance being 8196 // returned to registers or to store returned values to a newly 8197 // allocated inline type instance. 8198 address generate_return_value_stub(address destination, const char* name, bool has_res) { 8199 // We need to save all registers the calling convention may use so 8200 // the runtime calls read or update those registers. This needs to 8201 // be in sync with SharedRuntime::java_return_convention(). 8202 // n.b. 
aarch64 asserts that frame::arg_reg_save_area_bytes == 0 8203 enum layout { 8204 j_rarg7_off = 0, j_rarg7_2, // j_rarg7 is r0 8205 j_rarg6_off, j_rarg6_2, 8206 j_rarg5_off, j_rarg5_2, 8207 j_rarg4_off, j_rarg4_2, 8208 j_rarg3_off, j_rarg3_2, 8209 j_rarg2_off, j_rarg2_2, 8210 j_rarg1_off, j_rarg1_2, 8211 j_rarg0_off, j_rarg0_2, 8212 8213 j_farg7_off, j_farg7_2, 8214 j_farg6_off, j_farg6_2, 8215 j_farg5_off, j_farg5_2, 8216 j_farg4_off, j_farg4_2, 8217 j_farg3_off, j_farg3_2, 8218 j_farg2_off, j_farg2_2, 8219 j_farg1_off, j_farg1_2, 8220 j_farg0_off, j_farg0_2, 8221 8222 rfp_off, rfp_off2, 8223 return_off, return_off2, 8224 8225 framesize // inclusive of return address 8226 }; 8227 8228 CodeBuffer code(name, 512, 64); 8229 MacroAssembler* masm = new MacroAssembler(&code); 8230 8231 int frame_size_in_bytes = align_up(framesize*BytesPerInt, 16); 8232 assert(frame_size_in_bytes == framesize*BytesPerInt, "misaligned"); 8233 int frame_size_in_slots = frame_size_in_bytes / BytesPerInt; 8234 int frame_size_in_words = frame_size_in_bytes / wordSize; 8235 8236 OopMapSet* oop_maps = new OopMapSet(); 8237 OopMap* map = new OopMap(frame_size_in_slots, 0); 8238 8239 map->set_callee_saved(VMRegImpl::stack2reg(j_rarg7_off), j_rarg7->as_VMReg()); 8240 map->set_callee_saved(VMRegImpl::stack2reg(j_rarg6_off), j_rarg6->as_VMReg()); 8241 map->set_callee_saved(VMRegImpl::stack2reg(j_rarg5_off), j_rarg5->as_VMReg()); 8242 map->set_callee_saved(VMRegImpl::stack2reg(j_rarg4_off), j_rarg4->as_VMReg()); 8243 map->set_callee_saved(VMRegImpl::stack2reg(j_rarg3_off), j_rarg3->as_VMReg()); 8244 map->set_callee_saved(VMRegImpl::stack2reg(j_rarg2_off), j_rarg2->as_VMReg()); 8245 map->set_callee_saved(VMRegImpl::stack2reg(j_rarg1_off), j_rarg1->as_VMReg()); 8246 map->set_callee_saved(VMRegImpl::stack2reg(j_rarg0_off), j_rarg0->as_VMReg()); 8247 8248 map->set_callee_saved(VMRegImpl::stack2reg(j_farg0_off), j_farg0->as_VMReg()); 8249 map->set_callee_saved(VMRegImpl::stack2reg(j_farg1_off), j_farg1->as_VMReg()); 8250 map->set_callee_saved(VMRegImpl::stack2reg(j_farg2_off), j_farg2->as_VMReg()); 8251 map->set_callee_saved(VMRegImpl::stack2reg(j_farg3_off), j_farg3->as_VMReg()); 8252 map->set_callee_saved(VMRegImpl::stack2reg(j_farg4_off), j_farg4->as_VMReg()); 8253 map->set_callee_saved(VMRegImpl::stack2reg(j_farg5_off), j_farg5->as_VMReg()); 8254 map->set_callee_saved(VMRegImpl::stack2reg(j_farg6_off), j_farg6->as_VMReg()); 8255 map->set_callee_saved(VMRegImpl::stack2reg(j_farg7_off), j_farg7->as_VMReg()); 8256 8257 address start = __ pc(); 8258 8259 __ enter(); // Save FP and LR before call 8260 8261 __ stpd(j_farg1, j_farg0, Address(__ pre(sp, -2 * wordSize))); 8262 __ stpd(j_farg3, j_farg2, Address(__ pre(sp, -2 * wordSize))); 8263 __ stpd(j_farg5, j_farg4, Address(__ pre(sp, -2 * wordSize))); 8264 __ stpd(j_farg7, j_farg6, Address(__ pre(sp, -2 * wordSize))); 8265 8266 __ stp(j_rarg1, j_rarg0, Address(__ pre(sp, -2 * wordSize))); 8267 __ stp(j_rarg3, j_rarg2, Address(__ pre(sp, -2 * wordSize))); 8268 __ stp(j_rarg5, j_rarg4, Address(__ pre(sp, -2 * wordSize))); 8269 __ stp(j_rarg7, j_rarg6, Address(__ pre(sp, -2 * wordSize))); 8270 8271 int frame_complete = __ offset(); 8272 8273 // Set up last_Java_sp and last_Java_fp 8274 address the_pc = __ pc(); 8275 __ set_last_Java_frame(sp, noreg, the_pc, rscratch1); 8276 8277 // Call runtime 8278 __ mov(c_rarg1, r0); 8279 __ mov(c_rarg0, rthread); 8280 8281 __ mov(rscratch1, destination); 8282 __ blr(rscratch1); 8283 8284 oop_maps->add_gc_map(the_pc - start, map); 8285 8286 __ 

    CodeBuffer code(name, 512, 64);
    MacroAssembler* masm = new MacroAssembler(&code);

    int frame_size_in_bytes = align_up(framesize*BytesPerInt, 16);
    assert(frame_size_in_bytes == framesize*BytesPerInt, "misaligned");
    int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
    int frame_size_in_words = frame_size_in_bytes / wordSize;

    OopMapSet* oop_maps = new OopMapSet();
    OopMap* map = new OopMap(frame_size_in_slots, 0);

    map->set_callee_saved(VMRegImpl::stack2reg(j_rarg7_off), j_rarg7->as_VMReg());
    map->set_callee_saved(VMRegImpl::stack2reg(j_rarg6_off), j_rarg6->as_VMReg());
    map->set_callee_saved(VMRegImpl::stack2reg(j_rarg5_off), j_rarg5->as_VMReg());
    map->set_callee_saved(VMRegImpl::stack2reg(j_rarg4_off), j_rarg4->as_VMReg());
    map->set_callee_saved(VMRegImpl::stack2reg(j_rarg3_off), j_rarg3->as_VMReg());
    map->set_callee_saved(VMRegImpl::stack2reg(j_rarg2_off), j_rarg2->as_VMReg());
    map->set_callee_saved(VMRegImpl::stack2reg(j_rarg1_off), j_rarg1->as_VMReg());
    map->set_callee_saved(VMRegImpl::stack2reg(j_rarg0_off), j_rarg0->as_VMReg());

    map->set_callee_saved(VMRegImpl::stack2reg(j_farg0_off), j_farg0->as_VMReg());
    map->set_callee_saved(VMRegImpl::stack2reg(j_farg1_off), j_farg1->as_VMReg());
    map->set_callee_saved(VMRegImpl::stack2reg(j_farg2_off), j_farg2->as_VMReg());
    map->set_callee_saved(VMRegImpl::stack2reg(j_farg3_off), j_farg3->as_VMReg());
    map->set_callee_saved(VMRegImpl::stack2reg(j_farg4_off), j_farg4->as_VMReg());
    map->set_callee_saved(VMRegImpl::stack2reg(j_farg5_off), j_farg5->as_VMReg());
    map->set_callee_saved(VMRegImpl::stack2reg(j_farg6_off), j_farg6->as_VMReg());
    map->set_callee_saved(VMRegImpl::stack2reg(j_farg7_off), j_farg7->as_VMReg());

    address start = __ pc();

    __ enter(); // Save FP and LR before call

    __ stpd(j_farg1, j_farg0, Address(__ pre(sp, -2 * wordSize)));
    __ stpd(j_farg3, j_farg2, Address(__ pre(sp, -2 * wordSize)));
    __ stpd(j_farg5, j_farg4, Address(__ pre(sp, -2 * wordSize)));
    __ stpd(j_farg7, j_farg6, Address(__ pre(sp, -2 * wordSize)));

    __ stp(j_rarg1, j_rarg0, Address(__ pre(sp, -2 * wordSize)));
    __ stp(j_rarg3, j_rarg2, Address(__ pre(sp, -2 * wordSize)));
    __ stp(j_rarg5, j_rarg4, Address(__ pre(sp, -2 * wordSize)));
    __ stp(j_rarg7, j_rarg6, Address(__ pre(sp, -2 * wordSize)));

    int frame_complete = __ offset();

    // Set up last_Java_sp and last_Java_fp
    address the_pc = __ pc();
    __ set_last_Java_frame(sp, noreg, the_pc, rscratch1);

    // Call runtime
    __ mov(c_rarg1, r0);
    __ mov(c_rarg0, rthread);

    __ mov(rscratch1, destination);
    __ blr(rscratch1);

    oop_maps->add_gc_map(the_pc - start, map);

    __ reset_last_Java_frame(false);

    __ ldp(j_rarg7, j_rarg6, Address(__ post(sp, 2 * wordSize)));
    __ ldp(j_rarg5, j_rarg4, Address(__ post(sp, 2 * wordSize)));
    __ ldp(j_rarg3, j_rarg2, Address(__ post(sp, 2 * wordSize)));
    __ ldp(j_rarg1, j_rarg0, Address(__ post(sp, 2 * wordSize)));

    __ ldpd(j_farg7, j_farg6, Address(__ post(sp, 2 * wordSize)));
    __ ldpd(j_farg5, j_farg4, Address(__ post(sp, 2 * wordSize)));
    __ ldpd(j_farg3, j_farg2, Address(__ post(sp, 2 * wordSize)));
    __ ldpd(j_farg1, j_farg0, Address(__ post(sp, 2 * wordSize)));

    __ leave();

    // check for pending exceptions
    Label pending;
    __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
    __ cbnz(rscratch1, pending);

    if (has_res) {
      __ get_vm_result(r0, rthread);
    }

    __ ret(lr);

    __ bind(pending);
    __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));

    // -------------
    // make sure all code is generated
    masm->flush();

    RuntimeStub* stub = RuntimeStub::new_runtime_stub(name, &code, frame_complete, frame_size_in_words, oop_maps, false);
    return stub->entry_point();
  }

  // Initialization
  void generate_initial_stubs() {
    // Generate initial stubs and initialize the entry points

    // Entry points that exist on all platforms. Note: this is code
    // that could be shared among different platforms - however the
    // benefit seems to be smaller than the disadvantage of having a
    // much more complicated generator structure. See also the comment
    // in stubRoutines.hpp.

    StubRoutines::_forward_exception_entry = generate_forward_exception();

    StubRoutines::_call_stub_entry =
      generate_call_stub(StubRoutines::_call_stub_return_address);

    // is referenced by megamorphic call
    StubRoutines::_catch_exception_entry = generate_catch_exception();

    // Initialize table for copy memory (arraycopy) check.
    if (UnsafeMemoryAccess::_table == nullptr) {
      UnsafeMemoryAccess::create_table(8 + 4); // 8 for copyMemory; 4 for setMemory
    }

    if (UseCRC32Intrinsics) {
      // Set the table address before generating the stubs that use it.
      StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
      StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
    }

    if (UseCRC32CIntrinsics) {
      StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
    }

    if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
      StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
    }

    if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
      StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
    }

    if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_float16ToFloat) &&
        vmIntrinsics::is_intrinsic_available(vmIntrinsics::_floatToFloat16)) {
      StubRoutines::_hf2f = generate_float16ToFloat();
      StubRoutines::_f2hf = generate_floatToFloat16();
    }

    if (InlineTypeReturnedAsFields) {
      StubRoutines::_load_inline_type_fields_in_regs =
        generate_return_value_stub(CAST_FROM_FN_PTR(address, SharedRuntime::load_inline_type_fields_in_regs), "load_inline_type_fields_in_regs", false);
      StubRoutines::_store_inline_type_fields_to_buf =
        generate_return_value_stub(CAST_FROM_FN_PTR(address, SharedRuntime::store_inline_type_fields_to_buf), "store_inline_type_fields_to_buf", true);
    }

  }

  void generate_continuation_stubs() {
    // Continuation stubs:
    StubRoutines::_cont_thaw = generate_cont_thaw();
    StubRoutines::_cont_returnBarrier = generate_cont_returnBarrier();
    StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception();
  }

  void generate_final_stubs() {
    // support for verify_oop (must happen after universe_init)
    if (VerifyOops) {
      StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
    }

    // arraycopy stubs used by compilers
    generate_arraycopy_stubs();

    BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
    if (bs_nm != nullptr) {
      StubRoutines::_method_entry_barrier = generate_method_entry_barrier();
    }

    StubRoutines::aarch64::_spin_wait = generate_spin_wait();

    if (UsePoly1305Intrinsics) {
      StubRoutines::_poly1305_processBlocks = generate_poly1305_processBlocks();
    }

#if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)

    generate_atomic_entry_points();

#endif // LINUX

#ifdef COMPILER2
    if (UseSecondarySupersTable) {
      StubRoutines::_lookup_secondary_supers_table_slow_path_stub = generate_lookup_secondary_supers_table_slow_path_stub();
      if (!InlineSecondarySupersTest) {
        for (int slot = 0; slot < Klass::SECONDARY_SUPERS_TABLE_SIZE; slot++) {
          StubRoutines::_lookup_secondary_supers_table_stubs[slot]
            = generate_lookup_secondary_supers_table_stub(slot);
        }
      }
    }
#endif

    StubRoutines::_upcall_stub_exception_handler = generate_upcall_stub_exception_handler();
    StubRoutines::_upcall_stub_load_target = generate_upcall_stub_load_target();

    StubRoutines::aarch64::set_completed(); // Indicate that arraycopy and zero_blocks stubs have been generated
  }

  void generate_compiler_stubs() {
#if COMPILER2_OR_JVMCI

    if (UseSVE == 0) {
      StubRoutines::aarch64::_vector_iota_indices = generate_iota_indices("iota_indices");
    }

    // array equals stub for large arrays.
    if (!UseSimpleArrayEquals) {
      StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
    }

    // byte_array_inflate stub for large arrays.
    StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();

    // countPositives stub for large arrays.
    StubRoutines::aarch64::_count_positives = generate_count_positives(StubRoutines::aarch64::_count_positives_long);

    generate_compare_long_strings();

    generate_string_indexof_stubs();

#ifdef COMPILER2
    if (UseMultiplyToLenIntrinsic) {
      StubRoutines::_multiplyToLen = generate_multiplyToLen();
    }

    if (UseSquareToLenIntrinsic) {
      StubRoutines::_squareToLen = generate_squareToLen();
    }

    if (UseMulAddIntrinsic) {
      StubRoutines::_mulAdd = generate_mulAdd();
    }

    if (UseSIMDForBigIntegerShiftIntrinsics) {
      StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
      StubRoutines::_bigIntegerLeftShiftWorker = generate_bigIntegerLeftShift();
    }

    if (UseMontgomeryMultiplyIntrinsic) {
      StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
      MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
      StubRoutines::_montgomeryMultiply = g.generate_multiply();
    }

    if (UseMontgomerySquareIntrinsic) {
      StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
      MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
      // We use generate_multiply() rather than generate_square()
      // because it's faster for the sizes of modulus we care about.
      StubRoutines::_montgomerySquare = g.generate_multiply();
    }
#endif // COMPILER2

    if (UseChaCha20Intrinsics) {
      StubRoutines::_chacha20Block = generate_chacha20Block_blockpar();
    }

    if (UseBASE64Intrinsics) {
      StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
      StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
    }

    // data cache line writeback
    StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
    StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();

    if (UseAESIntrinsics) {
      StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
      StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
      StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
      StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
      StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt();
    }
    if (UseGHASHIntrinsics) {
      // StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
      StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks_wide();
    }
    if (UseAESIntrinsics && UseGHASHIntrinsics) {
      StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt();
    }

    if (UseMD5Intrinsics) {
      StubRoutines::_md5_implCompress = generate_md5_implCompress(false, "md5_implCompress");
      StubRoutines::_md5_implCompressMB = generate_md5_implCompress(true, "md5_implCompressMB");
    }
    if (UseSHA1Intrinsics) {
      StubRoutines::_sha1_implCompress = generate_sha1_implCompress(false, "sha1_implCompress");
      StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(true, "sha1_implCompressMB");
    }
    if (UseSHA256Intrinsics) {
      StubRoutines::_sha256_implCompress = generate_sha256_implCompress(false, "sha256_implCompress");
      StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true, "sha256_implCompressMB");
    }
    if (UseSHA512Intrinsics) {
      StubRoutines::_sha512_implCompress = generate_sha512_implCompress(false, "sha512_implCompress");
      StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true, "sha512_implCompressMB");
    }
    if (UseSHA3Intrinsics) {
      StubRoutines::_sha3_implCompress = generate_sha3_implCompress(false, "sha3_implCompress");
      StubRoutines::_sha3_implCompressMB = generate_sha3_implCompress(true, "sha3_implCompressMB");
    }

    // generate Adler32 intrinsics code
    if (UseAdler32Intrinsics) {
      StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
    }
#endif // COMPILER2_OR_JVMCI
  }

 public:
  StubGenerator(CodeBuffer* code, StubsKind kind) : StubCodeGenerator(code) {
    switch(kind) {
    case Initial_stubs:
      generate_initial_stubs();
      break;
    case Continuation_stubs:
      generate_continuation_stubs();
      break;
    case Compiler_stubs:
      generate_compiler_stubs();
      break;
    case Final_stubs:
      generate_final_stubs();
      break;
    default:
      fatal("unexpected stubs kind: %d", kind);
      break;
    };
  }
}; // end class declaration

void StubGenerator_generate(CodeBuffer* code, StubCodeGenerator::StubsKind kind) {
  StubGenerator g(code, kind);
}

#if defined (LINUX)

// Define pointers to atomic stubs and initialize them to point to the
// code in atomic_aarch64.S.

#define DEFAULT_ATOMIC_OP(OPNAME, SIZE, RELAXED)                                \
  extern "C" uint64_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl \
    (volatile void *ptr, uint64_t arg1, uint64_t arg2);                         \
  aarch64_atomic_stub_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _impl \
    = aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl;

DEFAULT_ATOMIC_OP(fetch_add, 4, )
DEFAULT_ATOMIC_OP(fetch_add, 8, )
DEFAULT_ATOMIC_OP(fetch_add, 4, _relaxed)
DEFAULT_ATOMIC_OP(fetch_add, 8, _relaxed)
DEFAULT_ATOMIC_OP(xchg, 4, )
DEFAULT_ATOMIC_OP(xchg, 8, )
DEFAULT_ATOMIC_OP(cmpxchg, 1, )
DEFAULT_ATOMIC_OP(cmpxchg, 4, )
DEFAULT_ATOMIC_OP(cmpxchg, 8, )
DEFAULT_ATOMIC_OP(cmpxchg, 1, _relaxed)
DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed)
DEFAULT_ATOMIC_OP(cmpxchg, 8, _relaxed)
DEFAULT_ATOMIC_OP(cmpxchg, 4, _release)
DEFAULT_ATOMIC_OP(cmpxchg, 8, _release)
DEFAULT_ATOMIC_OP(cmpxchg, 4, _seq_cst)
DEFAULT_ATOMIC_OP(cmpxchg, 8, _seq_cst)

#undef DEFAULT_ATOMIC_OP

#endif // LINUX
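
// For reference only: one expansion of the DEFAULT_ATOMIC_OP macro above.
// The invocation DEFAULT_ATOMIC_OP(cmpxchg, 4, _release) pastes the tokens
// together into (approximately):
//
//   extern "C" uint64_t aarch64_atomic_cmpxchg_4_release_default_impl
//     (volatile void *ptr, uint64_t arg1, uint64_t arg2);
//   aarch64_atomic_stub_t aarch64_atomic_cmpxchg_4_release_impl
//     = aarch64_atomic_cmpxchg_4_release_default_impl;
//
// i.e. each *_impl function pointer starts out pointing at the default
// implementation, the intent being that generate_atomic_entry_points()
// (called from generate_final_stubs() above) can later repoint it at a
// generated stub.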