1 /* 2 * Copyright (c) 2003, 2025, Oracle and/or its affiliates. All rights reserved. 3 * Copyright (c) 2014, 2025, Red Hat Inc. All rights reserved. 4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 5 * 6 * This code is free software; you can redistribute it and/or modify it 7 * under the terms of the GNU General Public License version 2 only, as 8 * published by the Free Software Foundation. 9 * 10 * This code is distributed in the hope that it will be useful, but WITHOUT 11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 13 * version 2 for more details (a copy is included in the LICENSE file that 14 * accompanied this code). 15 * 16 * You should have received a copy of the GNU General Public License version 17 * 2 along with this work; if not, write to the Free Software Foundation, 18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 19 * 20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 21 * or visit www.oracle.com if you need additional information or have any 22 * questions. 23 * 24 */ 25 26 #include "asm/macroAssembler.hpp" 27 #include "asm/macroAssembler.inline.hpp" 28 #include "asm/register.hpp" 29 #include "atomic_aarch64.hpp" 30 #include "compiler/oopMap.hpp" 31 #include "gc/shared/barrierSet.hpp" 32 #include "gc/shared/barrierSetAssembler.hpp" 33 #include "gc/shared/gc_globals.hpp" 34 #include "gc/shared/tlab_globals.hpp" 35 #include "interpreter/interpreter.hpp" 36 #include "memory/universe.hpp" 37 #include "nativeInst_aarch64.hpp" 38 #include "oops/instanceOop.hpp" 39 #include "oops/method.hpp" 40 #include "oops/objArrayKlass.hpp" 41 #include "oops/oop.inline.hpp" 42 #include "prims/methodHandles.hpp" 43 #include "prims/upcallLinker.hpp" 44 #include "runtime/arguments.hpp" 45 #include "runtime/atomic.hpp" 46 #include "runtime/continuation.hpp" 47 #include "runtime/continuationEntry.inline.hpp" 48 #include "runtime/frame.inline.hpp" 49 #include "runtime/handles.inline.hpp" 50 #include "runtime/javaThread.hpp" 51 #include "runtime/sharedRuntime.hpp" 52 #include "runtime/stubCodeGenerator.hpp" 53 #include "runtime/stubRoutines.hpp" 54 #include "utilities/align.hpp" 55 #include "utilities/checkedCast.hpp" 56 #include "utilities/debug.hpp" 57 #include "utilities/globalDefinitions.hpp" 58 #include "utilities/intpow.hpp" 59 #include "utilities/powerOfTwo.hpp" 60 #ifdef COMPILER2 61 #include "opto/runtime.hpp" 62 #endif 63 #if INCLUDE_ZGC 64 #include "gc/z/zThreadLocalData.hpp" 65 #endif 66 67 // Declaration and definition of StubGenerator (no .hpp file). 
68 // For a more detailed description of the stub routine structure 69 // see the comment in stubRoutines.hpp 70 71 #undef __ 72 #define __ _masm-> 73 74 #ifdef PRODUCT 75 #define BLOCK_COMMENT(str) /* nothing */ 76 #else 77 #define BLOCK_COMMENT(str) __ block_comment(str) 78 #endif 79 80 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":") 81 82 // Stub Code definitions 83 84 class StubGenerator: public StubCodeGenerator { 85 private: 86 87 #ifdef PRODUCT 88 #define inc_counter_np(counter) ((void)0) 89 #else 90 void inc_counter_np_(uint& counter) { 91 __ incrementw(ExternalAddress((address)&counter)); 92 } 93 #define inc_counter_np(counter) \ 94 BLOCK_COMMENT("inc_counter " #counter); \ 95 inc_counter_np_(counter); 96 #endif 97 98 // Call stubs are used to call Java from C 99 // 100 // Arguments: 101 // c_rarg0: call wrapper address address 102 // c_rarg1: result address 103 // c_rarg2: result type BasicType 104 // c_rarg3: method Method* 105 // c_rarg4: (interpreter) entry point address 106 // c_rarg5: parameters intptr_t* 107 // c_rarg6: parameter size (in words) int 108 // c_rarg7: thread Thread* 109 // 110 // There is no return from the stub itself as any Java result 111 // is written to result 112 // 113 // we save r30 (lr) as the return PC at the base of the frame and 114 // link r29 (fp) below it as the frame pointer installing sp (r31) 115 // into fp. 116 // 117 // we save r0-r7, which accounts for all the c arguments. 118 // 119 // TODO: strictly do we need to save them all? they are treated as 120 // volatile by C so could we omit saving the ones we are going to 121 // place in global registers (thread? method?) or those we only use 122 // during setup of the Java call? 123 // 124 // we don't need to save r8 which C uses as an indirect result location 125 // return register. 126 // 127 // we don't need to save r9-r15 which both C and Java treat as 128 // volatile 129 // 130 // we don't need to save r16-18 because Java does not use them 131 // 132 // we save r19-r28 which Java uses as scratch registers and C 133 // expects to be callee-save 134 // 135 // we save the bottom 64 bits of each value stored in v8-v15; it is 136 // the responsibility of the caller to preserve larger values. 137 // 138 // so the stub frame looks like this when we enter Java code 139 // 140 // [ return_from_Java ] <--- sp 141 // [ argument word n ] 142 // ... 
143 // -29 [ argument word 1 ] 144 // -28 [ saved Floating-point Control Register ] 145 // -26 [ saved v15 ] <--- sp_after_call 146 // -25 [ saved v14 ] 147 // -24 [ saved v13 ] 148 // -23 [ saved v12 ] 149 // -22 [ saved v11 ] 150 // -21 [ saved v10 ] 151 // -20 [ saved v9 ] 152 // -19 [ saved v8 ] 153 // -18 [ saved r28 ] 154 // -17 [ saved r27 ] 155 // -16 [ saved r26 ] 156 // -15 [ saved r25 ] 157 // -14 [ saved r24 ] 158 // -13 [ saved r23 ] 159 // -12 [ saved r22 ] 160 // -11 [ saved r21 ] 161 // -10 [ saved r20 ] 162 // -9 [ saved r19 ] 163 // -8 [ call wrapper (r0) ] 164 // -7 [ result (r1) ] 165 // -6 [ result type (r2) ] 166 // -5 [ method (r3) ] 167 // -4 [ entry point (r4) ] 168 // -3 [ parameters (r5) ] 169 // -2 [ parameter size (r6) ] 170 // -1 [ thread (r7) ] 171 // 0 [ saved fp (r29) ] <--- fp == saved sp (r31) 172 // 1 [ saved lr (r30) ] 173 174 // Call stub stack layout word offsets from fp 175 enum call_stub_layout { 176 sp_after_call_off = -28, 177 178 fpcr_off = sp_after_call_off, 179 d15_off = -26, 180 d13_off = -24, 181 d11_off = -22, 182 d9_off = -20, 183 184 r28_off = -18, 185 r26_off = -16, 186 r24_off = -14, 187 r22_off = -12, 188 r20_off = -10, 189 call_wrapper_off = -8, 190 result_off = -7, 191 result_type_off = -6, 192 method_off = -5, 193 entry_point_off = -4, 194 parameter_size_off = -2, 195 thread_off = -1, 196 fp_f = 0, 197 retaddr_off = 1, 198 }; 199 200 address generate_call_stub(address& return_address) { 201 assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 && 202 (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off, 203 "adjust this code"); 204 205 StubGenStubId stub_id = StubGenStubId::call_stub_id; 206 StubCodeMark mark(this, stub_id); 207 address start = __ pc(); 208 209 const Address sp_after_call (rfp, sp_after_call_off * wordSize); 210 211 const Address fpcr_save (rfp, fpcr_off * wordSize); 212 const Address call_wrapper (rfp, call_wrapper_off * wordSize); 213 const Address result (rfp, result_off * wordSize); 214 const Address result_type (rfp, result_type_off * wordSize); 215 const Address method (rfp, method_off * wordSize); 216 const Address entry_point (rfp, entry_point_off * wordSize); 217 const Address parameter_size(rfp, parameter_size_off * wordSize); 218 219 const Address thread (rfp, thread_off * wordSize); 220 221 const Address d15_save (rfp, d15_off * wordSize); 222 const Address d13_save (rfp, d13_off * wordSize); 223 const Address d11_save (rfp, d11_off * wordSize); 224 const Address d9_save (rfp, d9_off * wordSize); 225 226 const Address r28_save (rfp, r28_off * wordSize); 227 const Address r26_save (rfp, r26_off * wordSize); 228 const Address r24_save (rfp, r24_off * wordSize); 229 const Address r22_save (rfp, r22_off * wordSize); 230 const Address r20_save (rfp, r20_off * wordSize); 231 232 // stub code 233 234 address aarch64_entry = __ pc(); 235 236 // set up frame and move sp to end of save area 237 __ enter(); 238 __ sub(sp, rfp, -sp_after_call_off * wordSize); 239 240 // save register parameters and Java scratch/global registers 241 // n.b. 
    // we save thread even though it gets installed in
    // rthread because we want to sanity check rthread later
    __ str(c_rarg7, thread);
    __ strw(c_rarg6, parameter_size);
    __ stp(c_rarg4, c_rarg5, entry_point);
    __ stp(c_rarg2, c_rarg3, result_type);
    __ stp(c_rarg0, c_rarg1, call_wrapper);

    __ stp(r20, r19, r20_save);
    __ stp(r22, r21, r22_save);
    __ stp(r24, r23, r24_save);
    __ stp(r26, r25, r26_save);
    __ stp(r28, r27, r28_save);

    __ stpd(v9, v8, d9_save);
    __ stpd(v11, v10, d11_save);
    __ stpd(v13, v12, d13_save);
    __ stpd(v15, v14, d15_save);

    __ get_fpcr(rscratch1);
    __ str(rscratch1, fpcr_save);
    // Set FPCR to the state we need. We do want Round to Nearest. We
    // don't want non-IEEE rounding modes or floating-point traps.
    __ bfi(rscratch1, zr, 22, 4); // Clear DN, FZ, and Rmode
    __ bfi(rscratch1, zr, 8, 5);  // Clear exception-control bits (8-12)
    __ set_fpcr(rscratch1);

    // install Java thread in global register now we have saved
    // whatever value it held
    __ mov(rthread, c_rarg7);
    // And method
    __ mov(rmethod, c_rarg3);

    // set up the heapbase register
    __ reinit_heapbase();

#ifdef ASSERT
    // make sure we have no pending exceptions
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
      __ cmp(rscratch1, (u1)NULL_WORD);
      __ br(Assembler::EQ, L);
      __ stop("StubRoutines::call_stub: entered with pending exception");
      __ BIND(L);
    }
#endif
    // pass parameters if any
    __ mov(esp, sp);
    __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
    __ andr(sp, rscratch1, -2 * wordSize);

    BLOCK_COMMENT("pass parameters if any");
    Label parameters_done;
    // parameter count is still in c_rarg6
    // and parameter pointer identifying param 1 is in c_rarg5
    __ cbzw(c_rarg6, parameters_done);

    address loop = __ pc();
    __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
    __ subsw(c_rarg6, c_rarg6, 1);
    __ push(rscratch1);
    __ br(Assembler::GT, loop);

    __ BIND(parameters_done);

    // call Java entry -- passing Method* and current sp
    // rmethod: Method*
    // r19_sender_sp: sender sp
    BLOCK_COMMENT("call Java function");
    __ mov(r19_sender_sp, sp);
    __ blr(c_rarg4);

    // we do this here because the notify will already have been done
    // if we get to the next instruction via an exception
    //
    // n.b. adding this instruction here affects the calculation of
    // whether or not a routine returns to the call stub (used when
    // doing stack walks) since the normal test is to check the return
    // pc against the address saved below. so we may need to allow for
    // this extra instruction in the check.

    // save current address for use by exception handling code

    return_address = __ pc();

    // store result depending on type (everything that is not
    // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
    // n.b. this assumes Java returns an integral result in r0
    // and a floating result in j_farg0
    // All of j_rargN may be used to return inline type fields so be careful
    // not to clobber those.
    // SharedRuntime::generate_buffered_inline_type_adapter() knows the register
    // assignment of Rresult below.
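    //
    // As a reading aid only (not generated code), the stores below behave
    // roughly like this C-style sketch, assuming the integral result arrives
    // in r0 and the floating-point result in j_farg0:
    //
    //   switch (result_type) {
    //     case T_OBJECT:  // possibly a scalarized inline type, see check_prim
    //     case T_LONG:    *(jlong*)  result = r0;      break;
    //     case T_FLOAT:   *(jfloat*) result = j_farg0; break;
    //     case T_DOUBLE:  *(jdouble*)result = j_farg0; break;
    //     default:        *(jint*)   result = r0;      break;  // everything else
    //   }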
335 Register Rresult = r14, Rresult_type = r15; 336 __ ldr(Rresult, result); 337 Label is_long, is_float, is_double, check_prim, exit; 338 __ ldr(Rresult_type, result_type); 339 __ cmp(Rresult_type, (u1)T_OBJECT); 340 __ br(Assembler::EQ, check_prim); 341 __ cmp(Rresult_type, (u1)T_LONG); 342 __ br(Assembler::EQ, is_long); 343 __ cmp(Rresult_type, (u1)T_FLOAT); 344 __ br(Assembler::EQ, is_float); 345 __ cmp(Rresult_type, (u1)T_DOUBLE); 346 __ br(Assembler::EQ, is_double); 347 348 // handle T_INT case 349 __ strw(r0, Address(Rresult)); 350 351 __ BIND(exit); 352 353 // pop parameters 354 __ sub(esp, rfp, -sp_after_call_off * wordSize); 355 356 #ifdef ASSERT 357 // verify that threads correspond 358 { 359 Label L, S; 360 __ ldr(rscratch1, thread); 361 __ cmp(rthread, rscratch1); 362 __ br(Assembler::NE, S); 363 __ get_thread(rscratch1); 364 __ cmp(rthread, rscratch1); 365 __ br(Assembler::EQ, L); 366 __ BIND(S); 367 __ stop("StubRoutines::call_stub: threads must correspond"); 368 __ BIND(L); 369 } 370 #endif 371 372 __ pop_cont_fastpath(rthread); 373 374 // restore callee-save registers 375 __ ldpd(v15, v14, d15_save); 376 __ ldpd(v13, v12, d13_save); 377 __ ldpd(v11, v10, d11_save); 378 __ ldpd(v9, v8, d9_save); 379 380 __ ldp(r28, r27, r28_save); 381 __ ldp(r26, r25, r26_save); 382 __ ldp(r24, r23, r24_save); 383 __ ldp(r22, r21, r22_save); 384 __ ldp(r20, r19, r20_save); 385 386 // restore fpcr 387 __ ldr(rscratch1, fpcr_save); 388 __ set_fpcr(rscratch1); 389 390 __ ldp(c_rarg0, c_rarg1, call_wrapper); 391 __ ldrw(c_rarg2, result_type); 392 __ ldr(c_rarg3, method); 393 __ ldp(c_rarg4, c_rarg5, entry_point); 394 __ ldp(c_rarg6, c_rarg7, parameter_size); 395 396 // leave frame and return to caller 397 __ leave(); 398 __ ret(lr); 399 400 // handle return types different from T_INT 401 __ BIND(check_prim); 402 if (InlineTypeReturnedAsFields) { 403 // Check for scalarized return value 404 __ tbz(r0, 0, is_long); 405 // Load pack handler address 406 __ andr(rscratch1, r0, -2); 407 __ ldr(rscratch1, Address(rscratch1, InstanceKlass::adr_inlineklass_fixed_block_offset())); 408 __ ldr(rscratch1, Address(rscratch1, InlineKlass::pack_handler_jobject_offset())); 409 __ blr(rscratch1); 410 __ b(exit); 411 } 412 413 __ BIND(is_long); 414 __ str(r0, Address(Rresult, 0)); 415 __ br(Assembler::AL, exit); 416 417 __ BIND(is_float); 418 __ strs(j_farg0, Address(Rresult, 0)); 419 __ br(Assembler::AL, exit); 420 421 __ BIND(is_double); 422 __ strd(j_farg0, Address(Rresult, 0)); 423 __ br(Assembler::AL, exit); 424 425 return start; 426 } 427 428 // Return point for a Java call if there's an exception thrown in 429 // Java code. The exception is caught and transformed into a 430 // pending exception stored in JavaThread that can be tested from 431 // within the VM. 432 // 433 // Note: Usually the parameters are removed by the callee. In case 434 // of an exception crossing an activation frame boundary, that is 435 // not the case if the callee is compiled code => need to setup the 436 // rsp. 
437 // 438 // r0: exception oop 439 440 address generate_catch_exception() { 441 StubGenStubId stub_id = StubGenStubId::catch_exception_id; 442 StubCodeMark mark(this, stub_id); 443 address start = __ pc(); 444 445 // same as in generate_call_stub(): 446 const Address sp_after_call(rfp, sp_after_call_off * wordSize); 447 const Address thread (rfp, thread_off * wordSize); 448 449 #ifdef ASSERT 450 // verify that threads correspond 451 { 452 Label L, S; 453 __ ldr(rscratch1, thread); 454 __ cmp(rthread, rscratch1); 455 __ br(Assembler::NE, S); 456 __ get_thread(rscratch1); 457 __ cmp(rthread, rscratch1); 458 __ br(Assembler::EQ, L); 459 __ bind(S); 460 __ stop("StubRoutines::catch_exception: threads must correspond"); 461 __ bind(L); 462 } 463 #endif 464 465 // set pending exception 466 __ verify_oop(r0); 467 468 __ str(r0, Address(rthread, Thread::pending_exception_offset())); 469 __ mov(rscratch1, (address)__FILE__); 470 __ str(rscratch1, Address(rthread, Thread::exception_file_offset())); 471 __ movw(rscratch1, (int)__LINE__); 472 __ strw(rscratch1, Address(rthread, Thread::exception_line_offset())); 473 474 // complete return to VM 475 assert(StubRoutines::_call_stub_return_address != nullptr, 476 "_call_stub_return_address must have been generated before"); 477 __ b(StubRoutines::_call_stub_return_address); 478 479 return start; 480 } 481 482 // Continuation point for runtime calls returning with a pending 483 // exception. The pending exception check happened in the runtime 484 // or native call stub. The pending exception in Thread is 485 // converted into a Java-level exception. 486 // 487 // Contract with Java-level exception handlers: 488 // r0: exception 489 // r3: throwing pc 490 // 491 // NOTE: At entry of this stub, exception-pc must be in LR !! 492 493 // NOTE: this is always used as a jump target within generated code 494 // so it just needs to be generated code with no x86 prolog 495 496 address generate_forward_exception() { 497 StubGenStubId stub_id = StubGenStubId::forward_exception_id; 498 StubCodeMark mark(this, stub_id); 499 address start = __ pc(); 500 501 // Upon entry, LR points to the return address returning into 502 // Java (interpreted or compiled) code; i.e., the return address 503 // becomes the throwing pc. 504 // 505 // Arguments pushed before the runtime call are still on the stack 506 // but the exception handler will reset the stack pointer -> 507 // ignore them. A potential result in registers can be ignored as 508 // well. 509 510 #ifdef ASSERT 511 // make sure this code is only executed if there is a pending exception 512 { 513 Label L; 514 __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset())); 515 __ cbnz(rscratch1, L); 516 __ stop("StubRoutines::forward exception: no pending exception (1)"); 517 __ bind(L); 518 } 519 #endif 520 521 // compute exception handler into r19 522 523 // call the VM to find the handler address associated with the 524 // caller address. pass thread in r0 and caller pc (ret address) 525 // in r1. n.b. the caller pc is in lr, unlike x86 where it is on 526 // the stack. 527 __ mov(c_rarg1, lr); 528 // lr will be trashed by the VM call so we move it to R19 529 // (callee-saved) because we also need to pass it to the handler 530 // returned by this call. 
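    // In outline (a summary of the code below, not additional semantics):
    //   r19 <- lr                   // preserve the throwing pc across the VM call
    //   r0  <- handler address      // result of exception_handler_for_return_address
    //   r3  <- throwing pc, r19 <- handler, r0 <- pending exception
    //   br(r19)                     // continue at the Java-level handler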
531 __ mov(r19, lr); 532 BLOCK_COMMENT("call exception_handler_for_return_address"); 533 __ call_VM_leaf(CAST_FROM_FN_PTR(address, 534 SharedRuntime::exception_handler_for_return_address), 535 rthread, c_rarg1); 536 // Reinitialize the ptrue predicate register, in case the external runtime 537 // call clobbers ptrue reg, as we may return to SVE compiled code. 538 __ reinitialize_ptrue(); 539 540 // we should not really care that lr is no longer the callee 541 // address. we saved the value the handler needs in r19 so we can 542 // just copy it to r3. however, the C2 handler will push its own 543 // frame and then calls into the VM and the VM code asserts that 544 // the PC for the frame above the handler belongs to a compiled 545 // Java method. So, we restore lr here to satisfy that assert. 546 __ mov(lr, r19); 547 // setup r0 & r3 & clear pending exception 548 __ mov(r3, r19); 549 __ mov(r19, r0); 550 __ ldr(r0, Address(rthread, Thread::pending_exception_offset())); 551 __ str(zr, Address(rthread, Thread::pending_exception_offset())); 552 553 #ifdef ASSERT 554 // make sure exception is set 555 { 556 Label L; 557 __ cbnz(r0, L); 558 __ stop("StubRoutines::forward exception: no pending exception (2)"); 559 __ bind(L); 560 } 561 #endif 562 563 // continue at exception handler 564 // r0: exception 565 // r3: throwing pc 566 // r19: exception handler 567 __ verify_oop(r0); 568 __ br(r19); 569 570 return start; 571 } 572 573 // Non-destructive plausibility checks for oops 574 // 575 // Arguments: 576 // r0: oop to verify 577 // rscratch1: error message 578 // 579 // Stack after saving c_rarg3: 580 // [tos + 0]: saved c_rarg3 581 // [tos + 1]: saved c_rarg2 582 // [tos + 2]: saved lr 583 // [tos + 3]: saved rscratch2 584 // [tos + 4]: saved r0 585 // [tos + 5]: saved rscratch1 586 address generate_verify_oop() { 587 StubGenStubId stub_id = StubGenStubId::verify_oop_id; 588 StubCodeMark mark(this, stub_id); 589 address start = __ pc(); 590 591 Label exit, error; 592 593 // save c_rarg2 and c_rarg3 594 __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16))); 595 596 // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr())); 597 __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr())); 598 __ ldr(c_rarg3, Address(c_rarg2)); 599 __ add(c_rarg3, c_rarg3, 1); 600 __ str(c_rarg3, Address(c_rarg2)); 601 602 // object is in r0 603 // make sure object is 'reasonable' 604 __ cbz(r0, exit); // if obj is null it is OK 605 606 BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler(); 607 bs_asm->check_oop(_masm, r0, c_rarg2, c_rarg3, error); 608 609 // return if everything seems ok 610 __ bind(exit); 611 612 __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16))); 613 __ ret(lr); 614 615 // handle errors 616 __ bind(error); 617 __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16))); 618 619 __ push(RegSet::range(r0, r29), sp); 620 // debug(char* msg, int64_t pc, int64_t regs[]) 621 __ mov(c_rarg0, rscratch1); // pass address of error message 622 __ mov(c_rarg1, lr); // pass return address 623 __ mov(c_rarg2, sp); // pass address of regs on stack 624 #ifndef PRODUCT 625 assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area"); 626 #endif 627 BLOCK_COMMENT("call MacroAssembler::debug"); 628 __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64)); 629 __ blr(rscratch1); 630 __ hlt(0); 631 632 return start; 633 } 634 635 // Generate indices for iota vector. 
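  // The data words emitted below form per-lane index tables (plus their
  // floating-point counterparts), read as little-endian lanes:
  //   B:      bytes       0, 1, 2, ..., 15
  //   H:      halfwords   0, 1, ..., 7
  //   S:      words       0, 1, 2, 3
  //   D:      doublewords 0, 1
  //   S - FP: 0.0f, 1.0f, 2.0f, 3.0f
  //   D - FP: 0.0d, 1.0d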
636 address generate_iota_indices(StubGenStubId stub_id) { 637 __ align(CodeEntryAlignment); 638 StubCodeMark mark(this, stub_id); 639 address start = __ pc(); 640 // B 641 __ emit_data64(0x0706050403020100, relocInfo::none); 642 __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none); 643 // H 644 __ emit_data64(0x0003000200010000, relocInfo::none); 645 __ emit_data64(0x0007000600050004, relocInfo::none); 646 // S 647 __ emit_data64(0x0000000100000000, relocInfo::none); 648 __ emit_data64(0x0000000300000002, relocInfo::none); 649 // D 650 __ emit_data64(0x0000000000000000, relocInfo::none); 651 __ emit_data64(0x0000000000000001, relocInfo::none); 652 // S - FP 653 __ emit_data64(0x3F80000000000000, relocInfo::none); // 0.0f, 1.0f 654 __ emit_data64(0x4040000040000000, relocInfo::none); // 2.0f, 3.0f 655 // D - FP 656 __ emit_data64(0x0000000000000000, relocInfo::none); // 0.0d 657 __ emit_data64(0x3FF0000000000000, relocInfo::none); // 1.0d 658 return start; 659 } 660 661 // The inner part of zero_words(). This is the bulk operation, 662 // zeroing words in blocks, possibly using DC ZVA to do it. The 663 // caller is responsible for zeroing the last few words. 664 // 665 // Inputs: 666 // r10: the HeapWord-aligned base address of an array to zero. 667 // r11: the count in HeapWords, r11 > 0. 668 // 669 // Returns r10 and r11, adjusted for the caller to clear. 670 // r10: the base address of the tail of words left to clear. 671 // r11: the number of words in the tail. 672 // r11 < MacroAssembler::zero_words_block_size. 673 674 address generate_zero_blocks() { 675 Label done; 676 Label base_aligned; 677 678 Register base = r10, cnt = r11; 679 680 __ align(CodeEntryAlignment); 681 StubGenStubId stub_id = StubGenStubId::zero_blocks_id; 682 StubCodeMark mark(this, stub_id); 683 address start = __ pc(); 684 685 if (UseBlockZeroing) { 686 int zva_length = VM_Version::zva_length(); 687 688 // Ensure ZVA length can be divided by 16. This is required by 689 // the subsequent operations. 690 assert (zva_length % 16 == 0, "Unexpected ZVA Length"); 691 692 __ tbz(base, 3, base_aligned); 693 __ str(zr, Address(__ post(base, 8))); 694 __ sub(cnt, cnt, 1); 695 __ bind(base_aligned); 696 697 // Ensure count >= zva_length * 2 so that it still deserves a zva after 698 // alignment. 699 Label small; 700 int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit); 701 __ subs(rscratch1, cnt, low_limit >> 3); 702 __ br(Assembler::LT, small); 703 __ zero_dcache_blocks(base, cnt); 704 __ bind(small); 705 } 706 707 { 708 // Number of stp instructions we'll unroll 709 const int unroll = 710 MacroAssembler::zero_words_block_size / 2; 711 // Clear the remaining blocks. 712 Label loop; 713 __ subs(cnt, cnt, unroll * 2); 714 __ br(Assembler::LT, done); 715 __ bind(loop); 716 for (int i = 0; i < unroll; i++) 717 __ stp(zr, zr, __ post(base, 16)); 718 __ subs(cnt, cnt, unroll * 2); 719 __ br(Assembler::GE, loop); 720 __ bind(done); 721 __ add(cnt, cnt, unroll * 2); 722 } 723 724 __ ret(lr); 725 726 return start; 727 } 728 729 730 typedef enum { 731 copy_forwards = 1, 732 copy_backwards = -1 733 } copy_direction; 734 735 // Helper object to reduce noise when telling the GC barriers how to perform loads and stores 736 // for arraycopy stubs. 
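  //
  // Typical use in the copy loops below (a sketch; the register choices
  // mirror those in generate_copy_longs):
  //
  //   ArrayCopyBarrierSetHelper bs(_masm, decorators, type,
  //                                gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);
  //   bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));   // GC-aware 16-byte load
  //   bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);  // matching 16-byte store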
737 class ArrayCopyBarrierSetHelper : StackObj { 738 BarrierSetAssembler* _bs_asm; 739 MacroAssembler* _masm; 740 DecoratorSet _decorators; 741 BasicType _type; 742 Register _gct1; 743 Register _gct2; 744 Register _gct3; 745 FloatRegister _gcvt1; 746 FloatRegister _gcvt2; 747 FloatRegister _gcvt3; 748 749 public: 750 ArrayCopyBarrierSetHelper(MacroAssembler* masm, 751 DecoratorSet decorators, 752 BasicType type, 753 Register gct1, 754 Register gct2, 755 Register gct3, 756 FloatRegister gcvt1, 757 FloatRegister gcvt2, 758 FloatRegister gcvt3) 759 : _bs_asm(BarrierSet::barrier_set()->barrier_set_assembler()), 760 _masm(masm), 761 _decorators(decorators), 762 _type(type), 763 _gct1(gct1), 764 _gct2(gct2), 765 _gct3(gct3), 766 _gcvt1(gcvt1), 767 _gcvt2(gcvt2), 768 _gcvt3(gcvt3) { 769 } 770 771 void copy_load_at_32(FloatRegister dst1, FloatRegister dst2, Address src) { 772 _bs_asm->copy_load_at(_masm, _decorators, _type, 32, 773 dst1, dst2, src, 774 _gct1, _gct2, _gcvt1); 775 } 776 777 void copy_store_at_32(Address dst, FloatRegister src1, FloatRegister src2) { 778 _bs_asm->copy_store_at(_masm, _decorators, _type, 32, 779 dst, src1, src2, 780 _gct1, _gct2, _gct3, _gcvt1, _gcvt2, _gcvt3); 781 } 782 783 void copy_load_at_16(Register dst1, Register dst2, Address src) { 784 _bs_asm->copy_load_at(_masm, _decorators, _type, 16, 785 dst1, dst2, src, 786 _gct1); 787 } 788 789 void copy_store_at_16(Address dst, Register src1, Register src2) { 790 _bs_asm->copy_store_at(_masm, _decorators, _type, 16, 791 dst, src1, src2, 792 _gct1, _gct2, _gct3); 793 } 794 795 void copy_load_at_8(Register dst, Address src) { 796 _bs_asm->copy_load_at(_masm, _decorators, _type, 8, 797 dst, noreg, src, 798 _gct1); 799 } 800 801 void copy_store_at_8(Address dst, Register src) { 802 _bs_asm->copy_store_at(_masm, _decorators, _type, 8, 803 dst, src, noreg, 804 _gct1, _gct2, _gct3); 805 } 806 }; 807 808 // Bulk copy of blocks of 8 words. 809 // 810 // count is a count of words. 811 // 812 // Precondition: count >= 8 813 // 814 // Postconditions: 815 // 816 // The least significant bit of count contains the remaining count 817 // of words to copy. The rest of count is trash. 818 // 819 // s and d are adjusted to point to the remaining words to copy 820 // 821 void generate_copy_longs(StubGenStubId stub_id, DecoratorSet decorators, Label &start, Register s, Register d, Register count) { 822 BasicType type; 823 copy_direction direction; 824 825 switch (stub_id) { 826 case copy_byte_f_id: 827 direction = copy_forwards; 828 type = T_BYTE; 829 break; 830 case copy_byte_b_id: 831 direction = copy_backwards; 832 type = T_BYTE; 833 break; 834 case copy_oop_f_id: 835 direction = copy_forwards; 836 type = T_OBJECT; 837 break; 838 case copy_oop_b_id: 839 direction = copy_backwards; 840 type = T_OBJECT; 841 break; 842 case copy_oop_uninit_f_id: 843 direction = copy_forwards; 844 type = T_OBJECT; 845 break; 846 case copy_oop_uninit_b_id: 847 direction = copy_backwards; 848 type = T_OBJECT; 849 break; 850 default: 851 ShouldNotReachHere(); 852 } 853 854 int unit = wordSize * direction; 855 int bias = (UseSIMDForMemoryOps ? 
        4 : 2) * wordSize;

    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
                   t4 = r7, t5 = r11, t6 = r12, t7 = r13;
    const Register stride = r14;
    const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
    const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
    ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);

    assert_different_registers(rscratch1, rscratch2, t0, t1, t2, t3, t4, t5, t6, t7);
    assert_different_registers(s, d, count, rscratch1, rscratch2);

    Label again, drain;

    __ align(CodeEntryAlignment);

    StubCodeMark mark(this, stub_id);

    __ bind(start);

    Label unaligned_copy_long;
    if (AvoidUnalignedAccesses) {
      __ tbnz(d, 3, unaligned_copy_long);
    }

    if (direction == copy_forwards) {
      __ sub(s, s, bias);
      __ sub(d, d, bias);
    }

#ifdef ASSERT
    // Make sure we are never given < 8 words
    {
      Label L;
      __ cmp(count, (u1)8);
      __ br(Assembler::GE, L);
      __ stop("generate_copy_longs called with < 8 words");
      __ bind(L);
    }
#endif

    // Fill 8 registers
    if (UseSIMDForMemoryOps) {
      bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
      bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
      bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
      bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
      bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 16);
    __ br(Assembler::LO, drain);

    int prefetch = PrefetchCopyIntervalInBytes;
    bool use_stride = false;
    if (direction == copy_backwards) {
      use_stride = prefetch > 256;
      prefetch = -prefetch;
      if (use_stride) __ mov(stride, prefetch);
    }

    __ bind(again);

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

    if (UseSIMDForMemoryOps) {
      bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
      bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
      bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
      bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
      bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
      bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
      bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
      bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
      bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
      bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
      bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 8);
    __ br(Assembler::HS, again);

    // Drain
    __ bind(drain);
    if (UseSIMDForMemoryOps) {
      bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
      bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
    } else {
      bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
      bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
      bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
      bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
    }

    {
      Label L1, L2;
      __ tbz(count, exact_log2(4), L1);
      if (UseSIMDForMemoryOps) {
        bs.copy_load_at_32(v0, v1, Address(__ pre(s, 4 * unit)));
        bs.copy_store_at_32(Address(__ pre(d, 4 * unit)), v0, v1);
      } else {
        bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
        bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
        bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
        bs.copy_store_at_16(Address(__ pre(d, 4 * unit)), t2, t3);
      }
      __ bind(L1);

      if (direction == copy_forwards) {
        __ add(s, s, bias);
        __ add(d, d, bias);
      }

      __ tbz(count, 1, L2);
      bs.copy_load_at_16(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
      bs.copy_store_at_16(Address(__ adjust(d, 2 * unit, direction == copy_backwards)), t0, t1);
      __ bind(L2);
    }

    __ ret(lr);

    if (AvoidUnalignedAccesses) {
      Label drain, again;
      // Register order for storing. Order is different for backward copy.

      __ bind(unaligned_copy_long);

      // source address is even aligned, target odd aligned
      //
      // when forward copying word pairs we read long pairs at offsets
      // {0, 2, 4, 6} (in long words). when backwards copying we read
      // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
      // address by -2 in the forwards case so we can compute the
      // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
      // or -1.
      //
      // when forward copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
      // zero offset we adjust the destination by -1 which means we
      // have to use offsets {1, 2, 4, 6, 8} * unit for the stores.
      //
      // When backwards copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
      // offsets {1, 3, 5, 7, 8} * unit.
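      //
      // As a concrete illustration (not generated code): for a forwards copy,
      // after the adjustments below s sits at -16 and d at -8 relative to the
      // original addresses, so one iteration effectively performs
      //   str  t0,     [d, #8]     ->  original d + 0
      //   stp  t1, t2, [d, #16]    ->  original d + 8  .. +23
      //   stp  t3, t4, [d, #32]    ->  original d + 24 .. +39
      //   stp  t5, t6, [d, #48]    ->  original d + 40 .. +55
      //   str  t7,     [d, #64]!   ->  original d + 56
      // which is exactly the {0, 1, 3, 5, 7} word pattern described above.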
1004 1005 if (direction == copy_forwards) { 1006 __ sub(s, s, 16); 1007 __ sub(d, d, 8); 1008 } 1009 1010 // Fill 8 registers 1011 // 1012 // for forwards copy s was offset by -16 from the original input 1013 // value of s so the register contents are at these offsets 1014 // relative to the 64 bit block addressed by that original input 1015 // and so on for each successive 64 byte block when s is updated 1016 // 1017 // t0 at offset 0, t1 at offset 8 1018 // t2 at offset 16, t3 at offset 24 1019 // t4 at offset 32, t5 at offset 40 1020 // t6 at offset 48, t7 at offset 56 1021 1022 // for backwards copy s was not offset so the register contents 1023 // are at these offsets into the preceding 64 byte block 1024 // relative to that original input and so on for each successive 1025 // preceding 64 byte block when s is updated. this explains the 1026 // slightly counter-intuitive looking pattern of register usage 1027 // in the stp instructions for backwards copy. 1028 // 1029 // t0 at offset -16, t1 at offset -8 1030 // t2 at offset -32, t3 at offset -24 1031 // t4 at offset -48, t5 at offset -40 1032 // t6 at offset -64, t7 at offset -56 1033 1034 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit)); 1035 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit)); 1036 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit)); 1037 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit))); 1038 1039 __ subs(count, count, 16); 1040 __ br(Assembler::LO, drain); 1041 1042 int prefetch = PrefetchCopyIntervalInBytes; 1043 bool use_stride = false; 1044 if (direction == copy_backwards) { 1045 use_stride = prefetch > 256; 1046 prefetch = -prefetch; 1047 if (use_stride) __ mov(stride, prefetch); 1048 } 1049 1050 __ bind(again); 1051 1052 if (PrefetchCopyIntervalInBytes > 0) 1053 __ prfm(use_stride ? 
      Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

      if (direction == copy_forwards) {
        // allowing for the offset of -8 the store instructions place
        // registers into the target 64 bit block at the following
        // offsets
        //
        // t0 at offset 0
        // t1 at offset 8,  t2 at offset 16
        // t3 at offset 24, t4 at offset 32
        // t5 at offset 40, t6 at offset 48
        // t7 at offset 56

        bs.copy_store_at_8(Address(d, 1 * unit), t0);
        bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
        bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
        bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
        bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
        bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
        bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
        bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
        bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
      } else {
        // d was not offset when we started so the registers are
        // written into the 64 bit block preceding d with the following
        // offsets
        //
        // t1 at offset -8
        // t3 at offset -24, t0 at offset -16
        // t5 at offset -40, t2 at offset -32
        // t7 at offset -56, t4 at offset -48
        // t6 at offset -64
        //
        // note that this matches the offsets previously noted for the
        // loads

        bs.copy_store_at_8(Address(d, 1 * unit), t1);
        bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
        bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
        bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
        bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
        bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
        bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
        bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
        bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
      }

      __ subs(count, count, 8);
      __ br(Assembler::HS, again);

      // Drain
      //
      // this uses the same pattern of offsets and register arguments
      // as above
      __ bind(drain);
      if (direction == copy_forwards) {
        bs.copy_store_at_8(Address(d, 1 * unit), t0);
        bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
        bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
        bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
        bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
      } else {
        bs.copy_store_at_8(Address(d, 1 * unit), t1);
        bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
        bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
        bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
        bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
      }
      // now we need to copy any remaining part block which may
      // include a 4 word subblock and/or a 2 word subblock.
      // bits 2 and 1 in the count are the tell-tale for whether we
      // have each such subblock
      {
        Label L1, L2;
        __ tbz(count, exact_log2(4), L1);
        // this is the same as above but copying only 4 longs hence
        // with only one intervening stp between the str instructions
        // but note that the offsets and registers still follow the
        // same pattern
        bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
        bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
        if (direction == copy_forwards) {
          bs.copy_store_at_8(Address(d, 1 * unit), t0);
          bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
          bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t3);
        } else {
          bs.copy_store_at_8(Address(d, 1 * unit), t1);
          bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
          bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t2);
        }
        __ bind(L1);

        __ tbz(count, 1, L2);
        // this is the same as above but copying only 2 longs hence
        // there is no intervening stp between the str instructions
        // but note that the offset and register patterns are still
        // the same
        bs.copy_load_at_16(t0, t1, Address(__ pre(s, 2 * unit)));
        if (direction == copy_forwards) {
          bs.copy_store_at_8(Address(d, 1 * unit), t0);
          bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t1);
        } else {
          bs.copy_store_at_8(Address(d, 1 * unit), t1);
          bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t0);
        }
        __ bind(L2);

        // for forwards copy we need to re-adjust the offsets we
        // applied so that s and d follow the last words written

        if (direction == copy_forwards) {
          __ add(s, s, 16);
          __ add(d, d, 8);
        }

      }

      __ ret(lr);
    }
  }

  // Small copy: less than 16 bytes.
  //
  // NB: Ignores all of the bits of count which represent more than 15
  // bytes, so a caller doesn't have to mask them.

  void copy_memory_small(DecoratorSet decorators, BasicType type, Register s, Register d, Register count, int step) {
    bool is_backwards = step < 0;
    size_t granularity = g_uabs(step);
    int direction = is_backwards ? -1 : 1;

    Label Lword, Lint, Lshort, Lbyte;

    assert(granularity
           && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");

    const Register t0 = r3;
    const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
    ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, fnoreg, fnoreg, fnoreg);

    // ??? I don't know if this bit-test-and-branch is the right thing
    // to do. It does a lot of jumping, resulting in several
    // mispredicted branches. It might make more sense to do this
    // with something like Duff's device with a single computed branch.
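    // In C terms the generated sequence behaves roughly like the following
    // sketch (forwards case, granularity == 1 byte); each low bit of count
    // selects one progressively smaller copy:
    //
    //   if (count & 8) { *(jlong*) d = *(jlong*) s; s += 8; d += 8; }
    //   if (count & 4) { *(jint*)  d = *(jint*)  s; s += 4; d += 4; }
    //   if (count & 2) { *(jshort*)d = *(jshort*)s; s += 2; d += 2; }
    //   if (count & 1) { *(jbyte*) d = *(jbyte*) s;                 }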
1197 1198 __ tbz(count, 3 - exact_log2(granularity), Lword); 1199 bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards))); 1200 bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0); 1201 __ bind(Lword); 1202 1203 if (granularity <= sizeof (jint)) { 1204 __ tbz(count, 2 - exact_log2(granularity), Lint); 1205 __ ldrw(t0, Address(__ adjust(s, sizeof (jint) * direction, is_backwards))); 1206 __ strw(t0, Address(__ adjust(d, sizeof (jint) * direction, is_backwards))); 1207 __ bind(Lint); 1208 } 1209 1210 if (granularity <= sizeof (jshort)) { 1211 __ tbz(count, 1 - exact_log2(granularity), Lshort); 1212 __ ldrh(t0, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards))); 1213 __ strh(t0, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards))); 1214 __ bind(Lshort); 1215 } 1216 1217 if (granularity <= sizeof (jbyte)) { 1218 __ tbz(count, 0, Lbyte); 1219 __ ldrb(t0, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards))); 1220 __ strb(t0, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards))); 1221 __ bind(Lbyte); 1222 } 1223 } 1224 1225 Label copy_f, copy_b; 1226 Label copy_obj_f, copy_obj_b; 1227 Label copy_obj_uninit_f, copy_obj_uninit_b; 1228 1229 // All-singing all-dancing memory copy. 1230 // 1231 // Copy count units of memory from s to d. The size of a unit is 1232 // step, which can be positive or negative depending on the direction 1233 // of copy. If is_aligned is false, we align the source address. 1234 // 1235 1236 void copy_memory(DecoratorSet decorators, BasicType type, bool is_aligned, 1237 Register s, Register d, Register count, int step) { 1238 copy_direction direction = step < 0 ? copy_backwards : copy_forwards; 1239 bool is_backwards = step < 0; 1240 unsigned int granularity = g_uabs(step); 1241 const Register t0 = r3, t1 = r4; 1242 1243 // <= 80 (or 96 for SIMD) bytes do inline. Direction doesn't matter because we always 1244 // load all the data before writing anything 1245 Label copy4, copy8, copy16, copy32, copy80, copy_big, finish; 1246 const Register t2 = r5, t3 = r6, t4 = r7, t5 = r11; 1247 const Register t6 = r12, t7 = r13, t8 = r14, t9 = r15; 1248 const Register send = r17, dend = r16; 1249 const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10; 1250 const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved 1251 ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3); 1252 1253 if (PrefetchCopyIntervalInBytes > 0) 1254 __ prfm(Address(s, 0), PLDL1KEEP); 1255 __ cmp(count, u1((UseSIMDForMemoryOps ? 
        96 : 80) / granularity));
    __ br(Assembler::HI, copy_big);

    __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
    __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));

    __ cmp(count, u1(16/granularity));
    __ br(Assembler::LS, copy16);

    __ cmp(count, u1(64/granularity));
    __ br(Assembler::HI, copy80);

    __ cmp(count, u1(32/granularity));
    __ br(Assembler::LS, copy32);

    // 33..64 bytes
    if (UseSIMDForMemoryOps) {
      bs.copy_load_at_32(v0, v1, Address(s, 0));
      bs.copy_load_at_32(v2, v3, Address(send, -32));
      bs.copy_store_at_32(Address(d, 0), v0, v1);
      bs.copy_store_at_32(Address(dend, -32), v2, v3);
    } else {
      bs.copy_load_at_16(t0, t1, Address(s, 0));
      bs.copy_load_at_16(t2, t3, Address(s, 16));
      bs.copy_load_at_16(t4, t5, Address(send, -32));
      bs.copy_load_at_16(t6, t7, Address(send, -16));

      bs.copy_store_at_16(Address(d, 0), t0, t1);
      bs.copy_store_at_16(Address(d, 16), t2, t3);
      bs.copy_store_at_16(Address(dend, -32), t4, t5);
      bs.copy_store_at_16(Address(dend, -16), t6, t7);
    }
    __ b(finish);

    // 17..32 bytes
    __ bind(copy32);
    bs.copy_load_at_16(t0, t1, Address(s, 0));
    bs.copy_load_at_16(t6, t7, Address(send, -16));

    bs.copy_store_at_16(Address(d, 0), t0, t1);
    bs.copy_store_at_16(Address(dend, -16), t6, t7);
    __ b(finish);

    // 65..80/96 bytes
    // (96 bytes if SIMD because we do 32 bytes per instruction)
    __ bind(copy80);
    if (UseSIMDForMemoryOps) {
      bs.copy_load_at_32(v0, v1, Address(s, 0));
      bs.copy_load_at_32(v2, v3, Address(s, 32));
      // Unaligned pointers can be an issue for copying.
      // The issue is more likely when the granularity of the data is
      // less than 4 (sizeof(jint)): pointers to arrays of jint are at least
      // 4 byte aligned, and pointers to arrays of jlong are 8 byte aligned.
      // The largest performance drop has been seen for the 65-80 byte range.
      // For such cases, using a ldp/stp pair instead of the third
      // ldpq/stpq pair fixes the performance issue.
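      // Sketch of the split performed below for sub-int granularity (a reading
      // aid, not generated code), with g the element size in bytes:
      //
      //   if (count * g <= 80) {     // 65..80 bytes
      //     two 32-byte ldpq/stpq pairs from the front,
      //     then a 16-byte GPR ldp/stp anchored at (send - 16, dend - 16)
      //   } else {                   // 81..96 bytes
      //     two 32-byte pairs from the front,
      //     then a third 32-byte pair anchored at (send - 32, dend - 32)
      //   }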
1311 if (granularity < sizeof (jint)) { 1312 Label copy96; 1313 __ cmp(count, u1(80/granularity)); 1314 __ br(Assembler::HI, copy96); 1315 bs.copy_load_at_16(t0, t1, Address(send, -16)); 1316 1317 bs.copy_store_at_32(Address(d, 0), v0, v1); 1318 bs.copy_store_at_32(Address(d, 32), v2, v3); 1319 1320 bs.copy_store_at_16(Address(dend, -16), t0, t1); 1321 __ b(finish); 1322 1323 __ bind(copy96); 1324 } 1325 bs.copy_load_at_32(v4, v5, Address(send, -32)); 1326 1327 bs.copy_store_at_32(Address(d, 0), v0, v1); 1328 bs.copy_store_at_32(Address(d, 32), v2, v3); 1329 1330 bs.copy_store_at_32(Address(dend, -32), v4, v5); 1331 } else { 1332 bs.copy_load_at_16(t0, t1, Address(s, 0)); 1333 bs.copy_load_at_16(t2, t3, Address(s, 16)); 1334 bs.copy_load_at_16(t4, t5, Address(s, 32)); 1335 bs.copy_load_at_16(t6, t7, Address(s, 48)); 1336 bs.copy_load_at_16(t8, t9, Address(send, -16)); 1337 1338 bs.copy_store_at_16(Address(d, 0), t0, t1); 1339 bs.copy_store_at_16(Address(d, 16), t2, t3); 1340 bs.copy_store_at_16(Address(d, 32), t4, t5); 1341 bs.copy_store_at_16(Address(d, 48), t6, t7); 1342 bs.copy_store_at_16(Address(dend, -16), t8, t9); 1343 } 1344 __ b(finish); 1345 1346 // 0..16 bytes 1347 __ bind(copy16); 1348 __ cmp(count, u1(8/granularity)); 1349 __ br(Assembler::LO, copy8); 1350 1351 // 8..16 bytes 1352 bs.copy_load_at_8(t0, Address(s, 0)); 1353 bs.copy_load_at_8(t1, Address(send, -8)); 1354 bs.copy_store_at_8(Address(d, 0), t0); 1355 bs.copy_store_at_8(Address(dend, -8), t1); 1356 __ b(finish); 1357 1358 if (granularity < 8) { 1359 // 4..7 bytes 1360 __ bind(copy8); 1361 __ tbz(count, 2 - exact_log2(granularity), copy4); 1362 __ ldrw(t0, Address(s, 0)); 1363 __ ldrw(t1, Address(send, -4)); 1364 __ strw(t0, Address(d, 0)); 1365 __ strw(t1, Address(dend, -4)); 1366 __ b(finish); 1367 if (granularity < 4) { 1368 // 0..3 bytes 1369 __ bind(copy4); 1370 __ cbz(count, finish); // get rid of 0 case 1371 if (granularity == 2) { 1372 __ ldrh(t0, Address(s, 0)); 1373 __ strh(t0, Address(d, 0)); 1374 } else { // granularity == 1 1375 // Now 1..3 bytes. Handle the 1 and 2 byte case by copying 1376 // the first and last byte. 1377 // Handle the 3 byte case by loading and storing base + count/2 1378 // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1)) 1379 // This does means in the 1 byte case we load/store the same 1380 // byte 3 times. 1381 __ lsr(count, count, 1); 1382 __ ldrb(t0, Address(s, 0)); 1383 __ ldrb(t1, Address(send, -1)); 1384 __ ldrb(t2, Address(s, count)); 1385 __ strb(t0, Address(d, 0)); 1386 __ strb(t1, Address(dend, -1)); 1387 __ strb(t2, Address(d, count)); 1388 } 1389 __ b(finish); 1390 } 1391 } 1392 1393 __ bind(copy_big); 1394 if (is_backwards) { 1395 __ lea(s, Address(s, count, Address::lsl(exact_log2(-step)))); 1396 __ lea(d, Address(d, count, Address::lsl(exact_log2(-step)))); 1397 } 1398 1399 // Now we've got the small case out of the way we can align the 1400 // source address on a 2-word boundary. 1401 1402 // Here we will materialize a count in r15, which is used by copy_memory_small 1403 // and the various generate_copy_longs stubs that we use for 2 word aligned bytes. 1404 // Up until here, we have used t9, which aliases r15, but from here on, that register 1405 // can not be used as a temp register, as it contains the count. 1406 1407 Label aligned; 1408 1409 if (is_aligned) { 1410 // We may have to adjust by 1 word to get s 2-word-aligned. 
1411 __ tbz(s, exact_log2(wordSize), aligned); 1412 bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards))); 1413 bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0); 1414 __ sub(count, count, wordSize/granularity); 1415 } else { 1416 if (is_backwards) { 1417 __ andr(r15, s, 2 * wordSize - 1); 1418 } else { 1419 __ neg(r15, s); 1420 __ andr(r15, r15, 2 * wordSize - 1); 1421 } 1422 // r15 is the byte adjustment needed to align s. 1423 __ cbz(r15, aligned); 1424 int shift = exact_log2(granularity); 1425 if (shift > 0) { 1426 __ lsr(r15, r15, shift); 1427 } 1428 __ sub(count, count, r15); 1429 1430 #if 0 1431 // ?? This code is only correct for a disjoint copy. It may or 1432 // may not make sense to use it in that case. 1433 1434 // Copy the first pair; s and d may not be aligned. 1435 __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0)); 1436 __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0)); 1437 1438 // Align s and d, adjust count 1439 if (is_backwards) { 1440 __ sub(s, s, r15); 1441 __ sub(d, d, r15); 1442 } else { 1443 __ add(s, s, r15); 1444 __ add(d, d, r15); 1445 } 1446 #else 1447 copy_memory_small(decorators, type, s, d, r15, step); 1448 #endif 1449 } 1450 1451 __ bind(aligned); 1452 1453 // s is now 2-word-aligned. 1454 1455 // We have a count of units and some trailing bytes. Adjust the 1456 // count and do a bulk copy of words. If the shift is zero 1457 // perform a move instead to benefit from zero latency moves. 1458 int shift = exact_log2(wordSize/granularity); 1459 if (shift > 0) { 1460 __ lsr(r15, count, shift); 1461 } else { 1462 __ mov(r15, count); 1463 } 1464 if (direction == copy_forwards) { 1465 if (type != T_OBJECT) { 1466 __ bl(copy_f); 1467 } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) { 1468 __ bl(copy_obj_uninit_f); 1469 } else { 1470 __ bl(copy_obj_f); 1471 } 1472 } else { 1473 if (type != T_OBJECT) { 1474 __ bl(copy_b); 1475 } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) { 1476 __ bl(copy_obj_uninit_b); 1477 } else { 1478 __ bl(copy_obj_b); 1479 } 1480 } 1481 1482 // And the tail. 1483 copy_memory_small(decorators, type, s, d, count, step); 1484 1485 if (granularity >= 8) __ bind(copy8); 1486 if (granularity >= 4) __ bind(copy4); 1487 __ bind(finish); 1488 } 1489 1490 1491 void clobber_registers() { 1492 #ifdef ASSERT 1493 RegSet clobbered 1494 = MacroAssembler::call_clobbered_gp_registers() - rscratch1; 1495 __ mov(rscratch1, (uint64_t)0xdeadbeef); 1496 __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32); 1497 for (RegSetIterator<Register> it = clobbered.begin(); *it != noreg; ++it) { 1498 __ mov(*it, rscratch1); 1499 } 1500 #endif 1501 1502 } 1503 1504 // Scan over array at a for count oops, verifying each one. 1505 // Preserves a and count, clobbers rscratch1 and rscratch2. 
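  //
  // Roughly equivalent to this sketch (an illustration only; 'a' is the base
  // address and 'size' the element size used below):
  //
  //   for (size_t i = 0; i < count; i++) {
  //     oop o = (size == wordSize) ? *(oop*)(a + i * size)
  //                                : decode_heap_oop(*(narrowOop*)(a + i * size));
  //     verify_oop(o);
  //   }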
1506 void verify_oop_array (int size, Register a, Register count, Register temp) { 1507 Label loop, end; 1508 __ mov(rscratch1, a); 1509 __ mov(rscratch2, zr); 1510 __ bind(loop); 1511 __ cmp(rscratch2, count); 1512 __ br(Assembler::HS, end); 1513 if (size == wordSize) { 1514 __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size)))); 1515 __ verify_oop(temp); 1516 } else { 1517 __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size)))); 1518 __ decode_heap_oop(temp); // calls verify_oop 1519 } 1520 __ add(rscratch2, rscratch2, 1); 1521 __ b(loop); 1522 __ bind(end); 1523 } 1524 1525 // Arguments: 1526 // stub_id - is used to name the stub and identify all details of 1527 // how to perform the copy. 1528 // 1529 // entry - is assigned to the stub's post push entry point unless 1530 // it is null 1531 // 1532 // Inputs: 1533 // c_rarg0 - source array address 1534 // c_rarg1 - destination array address 1535 // c_rarg2 - element count, treated as ssize_t, can be zero 1536 // 1537 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1538 // the hardware handle it. The two dwords within qwords that span 1539 // cache line boundaries will still be loaded and stored atomically. 1540 // 1541 // Side Effects: entry is set to the (post push) entry point so it 1542 // can be used by the corresponding conjoint copy 1543 // method 1544 // 1545 address generate_disjoint_copy(StubGenStubId stub_id, address *entry) { 1546 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 1547 RegSet saved_reg = RegSet::of(s, d, count); 1548 int size; 1549 bool aligned; 1550 bool is_oop; 1551 bool dest_uninitialized; 1552 switch (stub_id) { 1553 case jbyte_disjoint_arraycopy_id: 1554 size = sizeof(jbyte); 1555 aligned = false; 1556 is_oop = false; 1557 dest_uninitialized = false; 1558 break; 1559 case arrayof_jbyte_disjoint_arraycopy_id: 1560 size = sizeof(jbyte); 1561 aligned = true; 1562 is_oop = false; 1563 dest_uninitialized = false; 1564 break; 1565 case jshort_disjoint_arraycopy_id: 1566 size = sizeof(jshort); 1567 aligned = false; 1568 is_oop = false; 1569 dest_uninitialized = false; 1570 break; 1571 case arrayof_jshort_disjoint_arraycopy_id: 1572 size = sizeof(jshort); 1573 aligned = true; 1574 is_oop = false; 1575 dest_uninitialized = false; 1576 break; 1577 case jint_disjoint_arraycopy_id: 1578 size = sizeof(jint); 1579 aligned = false; 1580 is_oop = false; 1581 dest_uninitialized = false; 1582 break; 1583 case arrayof_jint_disjoint_arraycopy_id: 1584 size = sizeof(jint); 1585 aligned = true; 1586 is_oop = false; 1587 dest_uninitialized = false; 1588 break; 1589 case jlong_disjoint_arraycopy_id: 1590 // since this is always aligned we can (should!) use the same 1591 // stub as for case arrayof_jlong_disjoint_arraycopy 1592 ShouldNotReachHere(); 1593 break; 1594 case arrayof_jlong_disjoint_arraycopy_id: 1595 size = sizeof(jlong); 1596 aligned = true; 1597 is_oop = false; 1598 dest_uninitialized = false; 1599 break; 1600 case oop_disjoint_arraycopy_id: 1601 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1602 aligned = !UseCompressedOops; 1603 is_oop = true; 1604 dest_uninitialized = false; 1605 break; 1606 case arrayof_oop_disjoint_arraycopy_id: 1607 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1608 aligned = !UseCompressedOops; 1609 is_oop = true; 1610 dest_uninitialized = false; 1611 break; 1612 case oop_disjoint_arraycopy_uninit_id: 1613 size = UseCompressedOops ? 
      sizeof (jint) : sizeof (jlong);
      aligned = !UseCompressedOops;
      is_oop = true;
      dest_uninitialized = true;
      break;
    case arrayof_oop_disjoint_arraycopy_uninit_id:
      size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
      aligned = !UseCompressedOops;
      is_oop = true;
      dest_uninitialized = true;
      break;
    default:
      ShouldNotReachHere();
      break;
    }

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, stub_id);
    address start = __ pc();
    __ enter();

    if (entry != nullptr) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
    if (dest_uninitialized) {
      decorators |= IS_DEST_UNINITIALIZED;
    }
    if (aligned) {
      decorators |= ARRAYCOPY_ALIGNED;
    }

    BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
    bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);

    if (is_oop) {
      // save regs before copy_memory
      __ push(RegSet::of(d, count), sp);
    }
    {
      // UnsafeMemoryAccess page error: continue after unsafe access
      bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
      UnsafeMemoryAccessMark umam(this, add_entry, true);
      copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, size);
    }

    if (is_oop) {
      __ pop(RegSet::of(d, count), sp);
      if (VerifyOops)
        verify_oop_array(size, d, count, r16);
    }

    bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());

    __ leave();
    __ mov(r0, zr); // return 0
    __ ret(lr);
    return start;
  }

  // Arguments:
  //   stub_id - is used to name the stub and identify all details of
  //             how to perform the copy.
  //
  //   nooverlap_target - identifies the (post push) entry for the
  //             corresponding disjoint copy routine which can be
  //             jumped to if the ranges do not actually overlap
  //
  //   entry - is assigned to the stub's post push entry point unless
  //           it is null
  //
  //
  // Inputs:
  //   c_rarg0 - source array address
  //   c_rarg1 - destination array address
  //   c_rarg2 - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it. The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
1696 // 1697 // Side Effects: 1698 // entry is set to the no-overlap entry point so it can be used by 1699 // some other conjoint copy method 1700 // 1701 address generate_conjoint_copy(StubGenStubId stub_id, address nooverlap_target, address *entry) { 1702 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 1703 RegSet saved_regs = RegSet::of(s, d, count); 1704 int size; 1705 bool aligned; 1706 bool is_oop; 1707 bool dest_uninitialized; 1708 switch (stub_id) { 1709 case jbyte_arraycopy_id: 1710 size = sizeof(jbyte); 1711 aligned = false; 1712 is_oop = false; 1713 dest_uninitialized = false; 1714 break; 1715 case arrayof_jbyte_arraycopy_id: 1716 size = sizeof(jbyte); 1717 aligned = true; 1718 is_oop = false; 1719 dest_uninitialized = false; 1720 break; 1721 case jshort_arraycopy_id: 1722 size = sizeof(jshort); 1723 aligned = false; 1724 is_oop = false; 1725 dest_uninitialized = false; 1726 break; 1727 case arrayof_jshort_arraycopy_id: 1728 size = sizeof(jshort); 1729 aligned = true; 1730 is_oop = false; 1731 dest_uninitialized = false; 1732 break; 1733 case jint_arraycopy_id: 1734 size = sizeof(jint); 1735 aligned = false; 1736 is_oop = false; 1737 dest_uninitialized = false; 1738 break; 1739 case arrayof_jint_arraycopy_id: 1740 size = sizeof(jint); 1741 aligned = true; 1742 is_oop = false; 1743 dest_uninitialized = false; 1744 break; 1745 case jlong_arraycopy_id: 1746 // since this is always aligned we can (should!) use the same 1747 // stub as for case arrayof_jlong_disjoint_arraycopy 1748 ShouldNotReachHere(); 1749 break; 1750 case arrayof_jlong_arraycopy_id: 1751 size = sizeof(jlong); 1752 aligned = true; 1753 is_oop = false; 1754 dest_uninitialized = false; 1755 break; 1756 case oop_arraycopy_id: 1757 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1758 aligned = !UseCompressedOops; 1759 is_oop = true; 1760 dest_uninitialized = false; 1761 break; 1762 case arrayof_oop_arraycopy_id: 1763 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1764 aligned = !UseCompressedOops; 1765 is_oop = true; 1766 dest_uninitialized = false; 1767 break; 1768 case oop_arraycopy_uninit_id: 1769 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1770 aligned = !UseCompressedOops; 1771 is_oop = true; 1772 dest_uninitialized = true; 1773 break; 1774 case arrayof_oop_arraycopy_uninit_id: 1775 size = UseCompressedOops ? 
sizeof (jint) : sizeof (jlong); 1776 aligned = !UseCompressedOops; 1777 is_oop = true; 1778 dest_uninitialized = true; 1779 break; 1780 default: 1781 ShouldNotReachHere(); 1782 } 1783 1784 StubCodeMark mark(this, stub_id); 1785 address start = __ pc(); 1786 __ enter(); 1787 1788 if (entry != nullptr) { 1789 *entry = __ pc(); 1790 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 1791 BLOCK_COMMENT("Entry:"); 1792 } 1793 1794 // use fwd copy when (d-s) above_equal (count*size) 1795 __ sub(rscratch1, d, s); 1796 __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size)); 1797 __ br(Assembler::HS, nooverlap_target); 1798 1799 DecoratorSet decorators = IN_HEAP | IS_ARRAY; 1800 if (dest_uninitialized) { 1801 decorators |= IS_DEST_UNINITIALIZED; 1802 } 1803 if (aligned) { 1804 decorators |= ARRAYCOPY_ALIGNED; 1805 } 1806 1807 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1808 bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs); 1809 1810 if (is_oop) { 1811 // save regs before copy_memory 1812 __ push(RegSet::of(d, count), sp); 1813 } 1814 { 1815 // UnsafeMemoryAccess page error: continue after unsafe access 1816 bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size); 1817 UnsafeMemoryAccessMark umam(this, add_entry, true); 1818 copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, -size); 1819 } 1820 if (is_oop) { 1821 __ pop(RegSet::of(d, count), sp); 1822 if (VerifyOops) 1823 verify_oop_array(size, d, count, r16); 1824 } 1825 bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet()); 1826 __ leave(); 1827 __ mov(r0, zr); // return 0 1828 __ ret(lr); 1829 return start; 1830 } 1831 1832 // Helper for generating a dynamic type check. 1833 // Smashes rscratch1, rscratch2. 1834 void generate_type_check(Register sub_klass, 1835 Register super_check_offset, 1836 Register super_klass, 1837 Register temp1, 1838 Register temp2, 1839 Register result, 1840 Label& L_success) { 1841 assert_different_registers(sub_klass, super_check_offset, super_klass); 1842 1843 BLOCK_COMMENT("type_check:"); 1844 1845 Label L_miss; 1846 1847 __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, nullptr, 1848 super_check_offset); 1849 __ check_klass_subtype_slow_path(sub_klass, super_klass, temp1, temp2, &L_success, nullptr); 1850 1851 // Fall through on failure! 
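    // For reference, the fast-path/slow-path protocol emitted above
    // corresponds roughly to the following sketch (illustrative only, not
    // generated code; see MacroAssembler::check_klass_subtype_fast_path and
    // check_klass_subtype_slow_path for the authoritative logic):
    //
    //   if (sub_klass == super_klass)                              goto L_success;
    //   if (*(Klass**)((address)sub_klass + super_check_offset) == super_klass)
    //                                                              goto L_success;
    //   if (super_check_offset != secondary_super_cache_offset)    goto L_miss;
    //   if (secondary supers of sub_klass contain super_klass)     goto L_success;
    //   // otherwise fall through to L_miss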
1852 __ BIND(L_miss); 1853 } 1854 1855 // 1856 // Generate checkcasting array copy stub 1857 // 1858 // Input: 1859 // c_rarg0 - source array address 1860 // c_rarg1 - destination array address 1861 // c_rarg2 - element count, treated as ssize_t, can be zero 1862 // c_rarg3 - size_t ckoff (super_check_offset) 1863 // c_rarg4 - oop ckval (super_klass) 1864 // 1865 // Output: 1866 // r0 == 0 - success 1867 // r0 == -1^K - failure, where K is partial transfer count 1868 // 1869 address generate_checkcast_copy(StubGenStubId stub_id, address *entry) { 1870 bool dest_uninitialized; 1871 switch (stub_id) { 1872 case checkcast_arraycopy_id: 1873 dest_uninitialized = false; 1874 break; 1875 case checkcast_arraycopy_uninit_id: 1876 dest_uninitialized = true; 1877 break; 1878 default: 1879 ShouldNotReachHere(); 1880 } 1881 1882 Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop; 1883 1884 // Input registers (after setup_arg_regs) 1885 const Register from = c_rarg0; // source array address 1886 const Register to = c_rarg1; // destination array address 1887 const Register count = c_rarg2; // elementscount 1888 const Register ckoff = c_rarg3; // super_check_offset 1889 const Register ckval = c_rarg4; // super_klass 1890 1891 RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4); 1892 RegSet wb_post_saved_regs = RegSet::of(count); 1893 1894 // Registers used as temps (r19, r20, r21, r22 are save-on-entry) 1895 const Register copied_oop = r22; // actual oop copied 1896 const Register count_save = r21; // orig elementscount 1897 const Register start_to = r20; // destination array start address 1898 const Register r19_klass = r19; // oop._klass 1899 1900 // Registers used as gc temps (r5, r6, r7 are save-on-call) 1901 const Register gct1 = r5, gct2 = r6, gct3 = r7; 1902 1903 //--------------------------------------------------------------- 1904 // Assembler stub will be used for this call to arraycopy 1905 // if the two arrays are subtypes of Object[] but the 1906 // destination array type is not equal to or a supertype 1907 // of the source type. Each element must be separately 1908 // checked. 1909 1910 assert_different_registers(from, to, count, ckoff, ckval, start_to, 1911 copied_oop, r19_klass, count_save); 1912 1913 __ align(CodeEntryAlignment); 1914 StubCodeMark mark(this, stub_id); 1915 address start = __ pc(); 1916 1917 __ enter(); // required for proper stackwalking of RuntimeStub frame 1918 1919 #ifdef ASSERT 1920 // caller guarantees that the arrays really are different 1921 // otherwise, we would have to make conjoint checks 1922 { Label L; 1923 __ b(L); // conjoint check not yet implemented 1924 __ stop("checkcast_copy within a single array"); 1925 __ bind(L); 1926 } 1927 #endif //ASSERT 1928 1929 // Caller of this entry point must set up the argument registers. 1930 if (entry != nullptr) { 1931 *entry = __ pc(); 1932 BLOCK_COMMENT("Entry:"); 1933 } 1934 1935 // Empty array: Nothing to do. 1936 __ cbz(count, L_done); 1937 __ push(RegSet::of(r19, r20, r21, r22), sp); 1938 1939 #ifdef ASSERT 1940 BLOCK_COMMENT("assert consistent ckoff/ckval"); 1941 // The ckoff and ckval must be mutually consistent, 1942 // even though caller generates both. 
1943 { Label L; 1944 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 1945 __ ldrw(start_to, Address(ckval, sco_offset)); 1946 __ cmpw(ckoff, start_to); 1947 __ br(Assembler::EQ, L); 1948 __ stop("super_check_offset inconsistent"); 1949 __ bind(L); 1950 } 1951 #endif //ASSERT 1952 1953 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT; 1954 bool is_oop = true; 1955 int element_size = UseCompressedOops ? 4 : 8; 1956 if (dest_uninitialized) { 1957 decorators |= IS_DEST_UNINITIALIZED; 1958 } 1959 1960 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1961 bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs); 1962 1963 // save the original count 1964 __ mov(count_save, count); 1965 1966 // Copy from low to high addresses 1967 __ mov(start_to, to); // Save destination array start address 1968 __ b(L_load_element); 1969 1970 // ======== begin loop ======== 1971 // (Loop is rotated; its entry is L_load_element.) 1972 // Loop control: 1973 // for (; count != 0; count--) { 1974 // copied_oop = load_heap_oop(from++); 1975 // ... generate_type_check ...; 1976 // store_heap_oop(to++, copied_oop); 1977 // } 1978 __ align(OptoLoopAlignment); 1979 1980 __ BIND(L_store_element); 1981 bs->copy_store_at(_masm, decorators, T_OBJECT, element_size, 1982 __ post(to, element_size), copied_oop, noreg, 1983 gct1, gct2, gct3); 1984 __ sub(count, count, 1); 1985 __ cbz(count, L_do_card_marks); 1986 1987 // ======== loop entry is here ======== 1988 __ BIND(L_load_element); 1989 bs->copy_load_at(_masm, decorators, T_OBJECT, element_size, 1990 copied_oop, noreg, __ post(from, element_size), 1991 gct1); 1992 __ cbz(copied_oop, L_store_element); 1993 1994 __ load_klass(r19_klass, copied_oop);// query the object klass 1995 1996 BLOCK_COMMENT("type_check:"); 1997 generate_type_check(/*sub_klass*/r19_klass, 1998 /*super_check_offset*/ckoff, 1999 /*super_klass*/ckval, 2000 /*r_array_base*/gct1, 2001 /*temp2*/gct2, 2002 /*result*/r10, L_store_element); 2003 2004 // Fall through on failure! 2005 2006 // ======== end loop ======== 2007 2008 // It was a real error; we must depend on the caller to finish the job. 2009 // Register count = remaining oops, count_orig = total oops. 2010 // Emit GC store barriers for the oops we have copied and report 2011 // their number to the caller. 2012 2013 __ subs(count, count_save, count); // K = partially copied oop count 2014 __ eon(count, count, zr); // report (-1^K) to caller 2015 __ br(Assembler::EQ, L_done_pop); 2016 2017 __ BIND(L_do_card_marks); 2018 bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs); 2019 2020 __ bind(L_done_pop); 2021 __ pop(RegSet::of(r19, r20, r21, r22), sp); 2022 inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr); 2023 2024 __ bind(L_done); 2025 __ mov(r0, count); 2026 __ leave(); 2027 __ ret(lr); 2028 2029 return start; 2030 } 2031 2032 // Perform range checks on the proposed arraycopy. 2033 // Kills temp, but nothing else. 2034 // Also, clean the sign bits of src_pos and dst_pos. 
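  // Note on the sign-bit cleaning mentioned above: src_pos and dst_pos
  // arrive as 32-bit ints in 64-bit registers, so bits 63:32 may contain
  // stale data.  A 32-bit register move (movw Rx, Rx) zero-extends, which
  // keeps the scaled address arithmetic performed later (lea with lsl)
  // from being polluted by those high bits.  Roughly (illustrative sketch
  // only, not generated code):
  //
  //   src_pos = (uint32_t)src_pos;   // movw(src_pos, src_pos)
  //   dst_pos = (uint32_t)dst_pos;   // movw(dst_pos, dst_pos)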
  void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
                              Register src_pos, // source position (c_rarg1)
                              Register dst,     // destination array oop (c_rarg2)
                              Register dst_pos, // destination position (c_rarg3)
                              Register length,
                              Register temp,
                              Label& L_failed) {
    BLOCK_COMMENT("arraycopy_range_checks:");

    assert_different_registers(rscratch1, temp);

    // if (src_pos + length > arrayOop(src)->length()) FAIL;
    __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
    __ addw(temp, length, src_pos);
    __ cmpw(temp, rscratch1);
    __ br(Assembler::HI, L_failed);

    // if (dst_pos + length > arrayOop(dst)->length()) FAIL;
    __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
    __ addw(temp, length, dst_pos);
    __ cmpw(temp, rscratch1);
    __ br(Assembler::HI, L_failed);

    // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
    __ movw(src_pos, src_pos);
    __ movw(dst_pos, dst_pos);

    BLOCK_COMMENT("arraycopy_range_checks done");
  }

  // These stubs get called from some dumb test routine.
  // I'll write them properly when they're called from
  // something that's actually doing something.
  static void fake_arraycopy_stub(address src, address dst, int count) {
    assert(count == 0, "huh?");
  }


  //
  // Generate 'unsafe' array copy stub
  // Though just as safe as the other stubs, it takes an unscaled
  // size_t argument instead of an element count.
  //
  // Input:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - byte count, treated as ssize_t, can be zero
  //
  // Examines the alignment of the operands and dispatches
  // to a long, int, short, or byte copy loop.
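  // The dispatch can be read as the following sketch (illustrative only;
  // the generated code ORs the three operands together so one test covers
  // the source address, the destination address and the byte count):
  //
  //   uintptr_t bits = (uintptr_t)s | (uintptr_t)d | (uintptr_t)count;
  //   if ((bits & (BytesPerLong - 1)) == 0) { count >>= LogBytesPerLong;  goto long_copy;  }
  //   if ((bits & (BytesPerInt - 1)) == 0)  { count >>= LogBytesPerInt;   goto int_copy;   }
  //   if ((bits & 1) == 0)                  { count >>= LogBytesPerShort; goto short_copy; }
  //   goto byte_copy;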
2085 // 2086 address generate_unsafe_copy(address byte_copy_entry, 2087 address short_copy_entry, 2088 address int_copy_entry, 2089 address long_copy_entry) { 2090 StubGenStubId stub_id = StubGenStubId::unsafe_arraycopy_id; 2091 2092 Label L_long_aligned, L_int_aligned, L_short_aligned; 2093 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 2094 2095 __ align(CodeEntryAlignment); 2096 StubCodeMark mark(this, stub_id); 2097 address start = __ pc(); 2098 __ enter(); // required for proper stackwalking of RuntimeStub frame 2099 2100 // bump this on entry, not on exit: 2101 inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr); 2102 2103 __ orr(rscratch1, s, d); 2104 __ orr(rscratch1, rscratch1, count); 2105 2106 __ andr(rscratch1, rscratch1, BytesPerLong-1); 2107 __ cbz(rscratch1, L_long_aligned); 2108 __ andr(rscratch1, rscratch1, BytesPerInt-1); 2109 __ cbz(rscratch1, L_int_aligned); 2110 __ tbz(rscratch1, 0, L_short_aligned); 2111 __ b(RuntimeAddress(byte_copy_entry)); 2112 2113 __ BIND(L_short_aligned); 2114 __ lsr(count, count, LogBytesPerShort); // size => short_count 2115 __ b(RuntimeAddress(short_copy_entry)); 2116 __ BIND(L_int_aligned); 2117 __ lsr(count, count, LogBytesPerInt); // size => int_count 2118 __ b(RuntimeAddress(int_copy_entry)); 2119 __ BIND(L_long_aligned); 2120 __ lsr(count, count, LogBytesPerLong); // size => long_count 2121 __ b(RuntimeAddress(long_copy_entry)); 2122 2123 return start; 2124 } 2125 2126 // 2127 // Generate generic array copy stubs 2128 // 2129 // Input: 2130 // c_rarg0 - src oop 2131 // c_rarg1 - src_pos (32-bits) 2132 // c_rarg2 - dst oop 2133 // c_rarg3 - dst_pos (32-bits) 2134 // c_rarg4 - element count (32-bits) 2135 // 2136 // Output: 2137 // r0 == 0 - success 2138 // r0 == -1^K - failure, where K is partial transfer count 2139 // 2140 address generate_generic_copy(address byte_copy_entry, address short_copy_entry, 2141 address int_copy_entry, address oop_copy_entry, 2142 address long_copy_entry, address checkcast_copy_entry) { 2143 StubGenStubId stub_id = StubGenStubId::generic_arraycopy_id; 2144 2145 Label L_failed, L_objArray; 2146 Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs; 2147 2148 // Input registers 2149 const Register src = c_rarg0; // source array oop 2150 const Register src_pos = c_rarg1; // source position 2151 const Register dst = c_rarg2; // destination array oop 2152 const Register dst_pos = c_rarg3; // destination position 2153 const Register length = c_rarg4; 2154 2155 2156 // Registers used as temps 2157 const Register dst_klass = c_rarg5; 2158 2159 __ align(CodeEntryAlignment); 2160 2161 StubCodeMark mark(this, stub_id); 2162 2163 address start = __ pc(); 2164 2165 __ enter(); // required for proper stackwalking of RuntimeStub frame 2166 2167 // bump this on entry, not on exit: 2168 inc_counter_np(SharedRuntime::_generic_array_copy_ctr); 2169 2170 //----------------------------------------------------------------------- 2171 // Assembler stub will be used for this call to arraycopy 2172 // if the following conditions are met: 2173 // 2174 // (1) src and dst must not be null. 2175 // (2) src_pos must not be negative. 2176 // (3) dst_pos must not be negative. 2177 // (4) length must not be negative. 2178 // (5) src klass and dst klass should be the same and not null. 2179 // (6) src and dst should be arrays. 2180 // (7) src_pos + length must not exceed length of src. 2181 // (8) dst_pos + length must not exceed length of dst. 
2182 // 2183 2184 // if (src == nullptr) return -1; 2185 __ cbz(src, L_failed); 2186 2187 // if (src_pos < 0) return -1; 2188 __ tbnz(src_pos, 31, L_failed); // i.e. sign bit set 2189 2190 // if (dst == nullptr) return -1; 2191 __ cbz(dst, L_failed); 2192 2193 // if (dst_pos < 0) return -1; 2194 __ tbnz(dst_pos, 31, L_failed); // i.e. sign bit set 2195 2196 // registers used as temp 2197 const Register scratch_length = r16; // elements count to copy 2198 const Register scratch_src_klass = r17; // array klass 2199 const Register lh = r15; // layout helper 2200 2201 // if (length < 0) return -1; 2202 __ movw(scratch_length, length); // length (elements count, 32-bits value) 2203 __ tbnz(scratch_length, 31, L_failed); // i.e. sign bit set 2204 2205 __ load_klass(scratch_src_klass, src); 2206 #ifdef ASSERT 2207 // assert(src->klass() != nullptr); 2208 { 2209 BLOCK_COMMENT("assert klasses not null {"); 2210 Label L1, L2; 2211 __ cbnz(scratch_src_klass, L2); // it is broken if klass is null 2212 __ bind(L1); 2213 __ stop("broken null klass"); 2214 __ bind(L2); 2215 __ load_klass(rscratch1, dst); 2216 __ cbz(rscratch1, L1); // this would be broken also 2217 BLOCK_COMMENT("} assert klasses not null done"); 2218 } 2219 #endif 2220 2221 // Load layout helper (32-bits) 2222 // 2223 // |array_tag| | header_size | element_type | |log2_element_size| 2224 // 32 30 24 16 8 2 0 2225 // 2226 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0 2227 // 2228 2229 const int lh_offset = in_bytes(Klass::layout_helper_offset()); 2230 2231 // Handle objArrays completely differently... 2232 const jint objArray_lh = Klass::array_layout_helper(T_OBJECT); 2233 __ ldrw(lh, Address(scratch_src_klass, lh_offset)); 2234 __ movw(rscratch1, objArray_lh); 2235 __ eorw(rscratch2, lh, rscratch1); 2236 __ cbzw(rscratch2, L_objArray); 2237 2238 // if (src->klass() != dst->klass()) return -1; 2239 __ load_klass(rscratch2, dst); 2240 __ eor(rscratch2, rscratch2, scratch_src_klass); 2241 __ cbnz(rscratch2, L_failed); 2242 2243 // Check for flat inline type array -> return -1 2244 __ test_flat_array_oop(src, rscratch2, L_failed); 2245 2246 // Check for null-free (non-flat) inline type array -> handle as object array 2247 __ test_null_free_array_oop(src, rscratch2, L_objArray); 2248 2249 // if (!src->is_Array()) return -1; 2250 __ tbz(lh, 31, L_failed); // i.e. (lh >= 0) 2251 2252 // At this point, it is known to be a typeArray (array_tag 0x3). 
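    // For reference, the typeArray path below decodes the layout helper
    // and dispatches on element size roughly as follows (illustrative
    // sketch only, not generated code):
    //
    //   int header = (lh >> Klass::_lh_header_size_shift) & Klass::_lh_header_size_mask;
    //   int l2es   = lh & Klass::_lh_log2_element_size_mask;   // 0..3: byte..long
    //   from  = src + header + (src_pos << l2es);
    //   to    = dst + header + (dst_pos << l2es);
    //   count = length;
    //   // then jump to the byte/short/int/long copy entry selected by l2es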
2253 #ifdef ASSERT 2254 { 2255 BLOCK_COMMENT("assert primitive array {"); 2256 Label L; 2257 __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift); 2258 __ cmpw(lh, rscratch2); 2259 __ br(Assembler::GE, L); 2260 __ stop("must be a primitive array"); 2261 __ bind(L); 2262 BLOCK_COMMENT("} assert primitive array done"); 2263 } 2264 #endif 2265 2266 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2267 rscratch2, L_failed); 2268 2269 // TypeArrayKlass 2270 // 2271 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize); 2272 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize); 2273 // 2274 2275 const Register rscratch1_offset = rscratch1; // array offset 2276 const Register r15_elsize = lh; // element size 2277 2278 __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift, 2279 exact_log2(Klass::_lh_header_size_mask+1)); // array_offset 2280 __ add(src, src, rscratch1_offset); // src array offset 2281 __ add(dst, dst, rscratch1_offset); // dst array offset 2282 BLOCK_COMMENT("choose copy loop based on element size"); 2283 2284 // next registers should be set before the jump to corresponding stub 2285 const Register from = c_rarg0; // source array address 2286 const Register to = c_rarg1; // destination array address 2287 const Register count = c_rarg2; // elements count 2288 2289 // 'from', 'to', 'count' registers should be set in such order 2290 // since they are the same as 'src', 'src_pos', 'dst'. 2291 2292 assert(Klass::_lh_log2_element_size_shift == 0, "fix this code"); 2293 2294 // The possible values of elsize are 0-3, i.e. exact_log2(element 2295 // size in bytes). We do a simple bitwise binary search. 2296 __ BIND(L_copy_bytes); 2297 __ tbnz(r15_elsize, 1, L_copy_ints); 2298 __ tbnz(r15_elsize, 0, L_copy_shorts); 2299 __ lea(from, Address(src, src_pos));// src_addr 2300 __ lea(to, Address(dst, dst_pos));// dst_addr 2301 __ movw(count, scratch_length); // length 2302 __ b(RuntimeAddress(byte_copy_entry)); 2303 2304 __ BIND(L_copy_shorts); 2305 __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr 2306 __ lea(to, Address(dst, dst_pos, Address::lsl(1)));// dst_addr 2307 __ movw(count, scratch_length); // length 2308 __ b(RuntimeAddress(short_copy_entry)); 2309 2310 __ BIND(L_copy_ints); 2311 __ tbnz(r15_elsize, 0, L_copy_longs); 2312 __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr 2313 __ lea(to, Address(dst, dst_pos, Address::lsl(2)));// dst_addr 2314 __ movw(count, scratch_length); // length 2315 __ b(RuntimeAddress(int_copy_entry)); 2316 2317 __ BIND(L_copy_longs); 2318 #ifdef ASSERT 2319 { 2320 BLOCK_COMMENT("assert long copy {"); 2321 Label L; 2322 __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize 2323 __ cmpw(r15_elsize, LogBytesPerLong); 2324 __ br(Assembler::EQ, L); 2325 __ stop("must be long copy, but elsize is wrong"); 2326 __ bind(L); 2327 BLOCK_COMMENT("} assert long copy done"); 2328 } 2329 #endif 2330 __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr 2331 __ lea(to, Address(dst, dst_pos, Address::lsl(3)));// dst_addr 2332 __ movw(count, scratch_length); // length 2333 __ b(RuntimeAddress(long_copy_entry)); 2334 2335 // ObjArrayKlass 2336 __ BIND(L_objArray); 2337 // live at this point: scratch_src_klass, scratch_length, src[_pos], dst[_pos] 2338 2339 Label L_plain_copy, L_checkcast_copy; 2340 // test array classes for subtyping 2341 __ load_klass(r15, dst); 2342 __ cmp(scratch_src_klass, r15); // usual case is exact 
equality 2343 __ br(Assembler::NE, L_checkcast_copy); 2344 2345 // Identically typed arrays can be copied without element-wise checks. 2346 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2347 rscratch2, L_failed); 2348 2349 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2350 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2351 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2352 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2353 __ movw(count, scratch_length); // length 2354 __ BIND(L_plain_copy); 2355 __ b(RuntimeAddress(oop_copy_entry)); 2356 2357 __ BIND(L_checkcast_copy); 2358 // live at this point: scratch_src_klass, scratch_length, r15 (dst_klass) 2359 { 2360 // Before looking at dst.length, make sure dst is also an objArray. 2361 __ ldrw(rscratch1, Address(r15, lh_offset)); 2362 __ movw(rscratch2, objArray_lh); 2363 __ eorw(rscratch1, rscratch1, rscratch2); 2364 __ cbnzw(rscratch1, L_failed); 2365 2366 // It is safe to examine both src.length and dst.length. 2367 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2368 r15, L_failed); 2369 2370 __ load_klass(dst_klass, dst); // reload 2371 2372 // Marshal the base address arguments now, freeing registers. 2373 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2374 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2375 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2376 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2377 __ movw(count, length); // length (reloaded) 2378 Register sco_temp = c_rarg3; // this register is free now 2379 assert_different_registers(from, to, count, sco_temp, 2380 dst_klass, scratch_src_klass); 2381 // assert_clean_int(count, sco_temp); 2382 2383 // Generate the type check. 2384 const int sco_offset = in_bytes(Klass::super_check_offset_offset()); 2385 __ ldrw(sco_temp, Address(dst_klass, sco_offset)); 2386 2387 // Smashes rscratch1, rscratch2 2388 generate_type_check(scratch_src_klass, sco_temp, dst_klass, /*temps*/ noreg, noreg, noreg, 2389 L_plain_copy); 2390 2391 // Fetch destination element klass from the ObjArrayKlass header. 2392 int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset()); 2393 __ ldr(dst_klass, Address(dst_klass, ek_offset)); 2394 __ ldrw(sco_temp, Address(dst_klass, sco_offset)); 2395 2396 // the checkcast_copy loop needs two extra arguments: 2397 assert(c_rarg3 == sco_temp, "#3 already in place"); 2398 // Set up arguments for checkcast_copy_entry. 2399 __ mov(c_rarg4, dst_klass); // dst.klass.element_klass 2400 __ b(RuntimeAddress(checkcast_copy_entry)); 2401 } 2402 2403 __ BIND(L_failed); 2404 __ mov(r0, -1); 2405 __ leave(); // required for proper stackwalking of RuntimeStub frame 2406 __ ret(lr); 2407 2408 return start; 2409 } 2410 2411 // 2412 // Generate stub for array fill. If "aligned" is true, the 2413 // "to" address is assumed to be heapword aligned. 
2414 // 2415 // Arguments for generated stub: 2416 // to: c_rarg0 2417 // value: c_rarg1 2418 // count: c_rarg2 treated as signed 2419 // 2420 address generate_fill(StubGenStubId stub_id) { 2421 BasicType t; 2422 bool aligned; 2423 2424 switch (stub_id) { 2425 case jbyte_fill_id: 2426 t = T_BYTE; 2427 aligned = false; 2428 break; 2429 case jshort_fill_id: 2430 t = T_SHORT; 2431 aligned = false; 2432 break; 2433 case jint_fill_id: 2434 t = T_INT; 2435 aligned = false; 2436 break; 2437 case arrayof_jbyte_fill_id: 2438 t = T_BYTE; 2439 aligned = true; 2440 break; 2441 case arrayof_jshort_fill_id: 2442 t = T_SHORT; 2443 aligned = true; 2444 break; 2445 case arrayof_jint_fill_id: 2446 t = T_INT; 2447 aligned = true; 2448 break; 2449 default: 2450 ShouldNotReachHere(); 2451 }; 2452 2453 __ align(CodeEntryAlignment); 2454 StubCodeMark mark(this, stub_id); 2455 address start = __ pc(); 2456 2457 BLOCK_COMMENT("Entry:"); 2458 2459 const Register to = c_rarg0; // source array address 2460 const Register value = c_rarg1; // value 2461 const Register count = c_rarg2; // elements count 2462 2463 const Register bz_base = r10; // base for block_zero routine 2464 const Register cnt_words = r11; // temp register 2465 2466 __ enter(); 2467 2468 Label L_fill_elements, L_exit1; 2469 2470 int shift = -1; 2471 switch (t) { 2472 case T_BYTE: 2473 shift = 0; 2474 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2475 __ bfi(value, value, 8, 8); // 8 bit -> 16 bit 2476 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit 2477 __ br(Assembler::LO, L_fill_elements); 2478 break; 2479 case T_SHORT: 2480 shift = 1; 2481 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2482 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit 2483 __ br(Assembler::LO, L_fill_elements); 2484 break; 2485 case T_INT: 2486 shift = 2; 2487 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2488 __ br(Assembler::LO, L_fill_elements); 2489 break; 2490 default: ShouldNotReachHere(); 2491 } 2492 2493 // Align source address at 8 bytes address boundary. 2494 Label L_skip_align1, L_skip_align2, L_skip_align4; 2495 if (!aligned) { 2496 switch (t) { 2497 case T_BYTE: 2498 // One byte misalignment happens only for byte arrays. 2499 __ tbz(to, 0, L_skip_align1); 2500 __ strb(value, Address(__ post(to, 1))); 2501 __ subw(count, count, 1); 2502 __ bind(L_skip_align1); 2503 // Fallthrough 2504 case T_SHORT: 2505 // Two bytes misalignment happens only for byte and short (char) arrays. 2506 __ tbz(to, 1, L_skip_align2); 2507 __ strh(value, Address(__ post(to, 2))); 2508 __ subw(count, count, 2 >> shift); 2509 __ bind(L_skip_align2); 2510 // Fallthrough 2511 case T_INT: 2512 // Align to 8 bytes, we know we are 4 byte aligned to start. 2513 __ tbz(to, 2, L_skip_align4); 2514 __ strw(value, Address(__ post(to, 4))); 2515 __ subw(count, count, 4 >> shift); 2516 __ bind(L_skip_align4); 2517 break; 2518 default: ShouldNotReachHere(); 2519 } 2520 } 2521 2522 // 2523 // Fill large chunks 2524 // 2525 __ lsrw(cnt_words, count, 3 - shift); // number of words 2526 __ bfi(value, value, 32, 32); // 32 bit -> 64 bit 2527 __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift); 2528 if (UseBlockZeroing) { 2529 Label non_block_zeroing, rest; 2530 // If the fill value is zero we can use the fast zero_words(). 
2531 __ cbnz(value, non_block_zeroing); 2532 __ mov(bz_base, to); 2533 __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord); 2534 address tpc = __ zero_words(bz_base, cnt_words); 2535 if (tpc == nullptr) { 2536 fatal("CodeCache is full at generate_fill"); 2537 } 2538 __ b(rest); 2539 __ bind(non_block_zeroing); 2540 __ fill_words(to, cnt_words, value); 2541 __ bind(rest); 2542 } else { 2543 __ fill_words(to, cnt_words, value); 2544 } 2545 2546 // Remaining count is less than 8 bytes. Fill it by a single store. 2547 // Note that the total length is no less than 8 bytes. 2548 if (t == T_BYTE || t == T_SHORT) { 2549 Label L_exit1; 2550 __ cbzw(count, L_exit1); 2551 __ add(to, to, count, Assembler::LSL, shift); // points to the end 2552 __ str(value, Address(to, -8)); // overwrite some elements 2553 __ bind(L_exit1); 2554 __ leave(); 2555 __ ret(lr); 2556 } 2557 2558 // Handle copies less than 8 bytes. 2559 Label L_fill_2, L_fill_4, L_exit2; 2560 __ bind(L_fill_elements); 2561 switch (t) { 2562 case T_BYTE: 2563 __ tbz(count, 0, L_fill_2); 2564 __ strb(value, Address(__ post(to, 1))); 2565 __ bind(L_fill_2); 2566 __ tbz(count, 1, L_fill_4); 2567 __ strh(value, Address(__ post(to, 2))); 2568 __ bind(L_fill_4); 2569 __ tbz(count, 2, L_exit2); 2570 __ strw(value, Address(to)); 2571 break; 2572 case T_SHORT: 2573 __ tbz(count, 0, L_fill_4); 2574 __ strh(value, Address(__ post(to, 2))); 2575 __ bind(L_fill_4); 2576 __ tbz(count, 1, L_exit2); 2577 __ strw(value, Address(to)); 2578 break; 2579 case T_INT: 2580 __ cbzw(count, L_exit2); 2581 __ strw(value, Address(to)); 2582 break; 2583 default: ShouldNotReachHere(); 2584 } 2585 __ bind(L_exit2); 2586 __ leave(); 2587 __ ret(lr); 2588 return start; 2589 } 2590 2591 address generate_unsafecopy_common_error_exit() { 2592 address start_pc = __ pc(); 2593 __ leave(); 2594 __ mov(r0, 0); 2595 __ ret(lr); 2596 return start_pc; 2597 } 2598 2599 // 2600 // Generate 'unsafe' set memory stub 2601 // Though just as safe as the other stubs, it takes an unscaled 2602 // size_t (# bytes) argument instead of an element count. 2603 // 2604 // This fill operation is atomicity preserving: as long as the 2605 // address supplied is sufficiently aligned, all writes of up to 64 2606 // bits in size are single-copy atomic. 
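  //
  // The stub below broadcasts the byte value into a 16-byte vector and,
  // after the 64-byte main loop, peels the remaining count bit by bit so
  // each tail store covers the largest remaining power-of-two chunk.
  // Roughly (illustrative sketch only, not generated code):
  //
  //   while (count >= 64) { store 64 bytes; count -= 64; }
  //   if (count & 32) store 32 bytes;
  //   if (count & 16) store 16 bytes;
  //   if (count &  8) store  8 bytes;
  //   if (count &  4) store  4 bytes;
  //   if (count &  2) store  2 bytes;
  //   if (count &  1) store  1 byte;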
2607 // 2608 // Input: 2609 // c_rarg0 - destination array address 2610 // c_rarg1 - byte count (size_t) 2611 // c_rarg2 - byte value 2612 // 2613 address generate_unsafe_setmemory() { 2614 __ align(CodeEntryAlignment); 2615 StubCodeMark mark(this, StubGenStubId::unsafe_setmemory_id); 2616 address start = __ pc(); 2617 2618 Register dest = c_rarg0, count = c_rarg1, value = c_rarg2; 2619 Label tail; 2620 2621 UnsafeMemoryAccessMark umam(this, true, false); 2622 2623 __ enter(); // required for proper stackwalking of RuntimeStub frame 2624 2625 __ dup(v0, __ T16B, value); 2626 2627 if (AvoidUnalignedAccesses) { 2628 __ cmp(count, (u1)16); 2629 __ br(__ LO, tail); 2630 2631 __ mov(rscratch1, 16); 2632 __ andr(rscratch2, dest, 15); 2633 __ sub(rscratch1, rscratch1, rscratch2); // Bytes needed to 16-align dest 2634 __ strq(v0, Address(dest)); 2635 __ sub(count, count, rscratch1); 2636 __ add(dest, dest, rscratch1); 2637 } 2638 2639 __ subs(count, count, (u1)64); 2640 __ br(__ LO, tail); 2641 { 2642 Label again; 2643 __ bind(again); 2644 __ stpq(v0, v0, Address(dest)); 2645 __ stpq(v0, v0, Address(dest, 32)); 2646 2647 __ subs(count, count, 64); 2648 __ add(dest, dest, 64); 2649 __ br(__ HS, again); 2650 } 2651 2652 __ bind(tail); 2653 // The count of bytes is off by 64, but we don't need to correct 2654 // it because we're only going to use the least-significant few 2655 // count bits from here on. 2656 // __ add(count, count, 64); 2657 2658 { 2659 Label dont; 2660 __ tbz(count, exact_log2(32), dont); 2661 __ stpq(v0, v0, __ post(dest, 32)); 2662 __ bind(dont); 2663 } 2664 { 2665 Label dont; 2666 __ tbz(count, exact_log2(16), dont); 2667 __ strq(v0, __ post(dest, 16)); 2668 __ bind(dont); 2669 } 2670 { 2671 Label dont; 2672 __ tbz(count, exact_log2(8), dont); 2673 __ strd(v0, __ post(dest, 8)); 2674 __ bind(dont); 2675 } 2676 2677 Label finished; 2678 __ tst(count, 7); 2679 __ br(__ EQ, finished); 2680 2681 { 2682 Label dont; 2683 __ tbz(count, exact_log2(4), dont); 2684 __ strs(v0, __ post(dest, 4)); 2685 __ bind(dont); 2686 } 2687 { 2688 Label dont; 2689 __ tbz(count, exact_log2(2), dont); 2690 __ bfi(value, value, 8, 8); 2691 __ strh(value, __ post(dest, 2)); 2692 __ bind(dont); 2693 } 2694 { 2695 Label dont; 2696 __ tbz(count, exact_log2(1), dont); 2697 __ strb(value, Address(dest)); 2698 __ bind(dont); 2699 } 2700 2701 __ bind(finished); 2702 __ leave(); 2703 __ ret(lr); 2704 2705 return start; 2706 } 2707 2708 address generate_data_cache_writeback() { 2709 const Register line = c_rarg0; // address of line to write back 2710 2711 __ align(CodeEntryAlignment); 2712 2713 StubGenStubId stub_id = StubGenStubId::data_cache_writeback_id; 2714 StubCodeMark mark(this, stub_id); 2715 2716 address start = __ pc(); 2717 __ enter(); 2718 __ cache_wb(Address(line, 0)); 2719 __ leave(); 2720 __ ret(lr); 2721 2722 return start; 2723 } 2724 2725 address generate_data_cache_writeback_sync() { 2726 const Register is_pre = c_rarg0; // pre or post sync 2727 2728 __ align(CodeEntryAlignment); 2729 2730 StubGenStubId stub_id = StubGenStubId::data_cache_writeback_sync_id; 2731 StubCodeMark mark(this, stub_id); 2732 2733 // pre wbsync is a no-op 2734 // post wbsync translates to an sfence 2735 2736 Label skip; 2737 address start = __ pc(); 2738 __ enter(); 2739 __ cbnz(is_pre, skip); 2740 __ cache_wbsync(false); 2741 __ bind(skip); 2742 __ leave(); 2743 __ ret(lr); 2744 2745 return start; 2746 } 2747 2748 void generate_arraycopy_stubs() { 2749 address entry; 2750 address entry_jbyte_arraycopy; 2751 address 
entry_jshort_arraycopy; 2752 address entry_jint_arraycopy; 2753 address entry_oop_arraycopy; 2754 address entry_jlong_arraycopy; 2755 address entry_checkcast_arraycopy; 2756 2757 address ucm_common_error_exit = generate_unsafecopy_common_error_exit(); 2758 UnsafeMemoryAccess::set_common_exit_stub_pc(ucm_common_error_exit); 2759 2760 generate_copy_longs(StubGenStubId::copy_byte_f_id, IN_HEAP | IS_ARRAY, copy_f, r0, r1, r15); 2761 generate_copy_longs(StubGenStubId::copy_byte_b_id, IN_HEAP | IS_ARRAY, copy_b, r0, r1, r15); 2762 2763 generate_copy_longs(StubGenStubId::copy_oop_f_id, IN_HEAP | IS_ARRAY, copy_obj_f, r0, r1, r15); 2764 generate_copy_longs(StubGenStubId::copy_oop_b_id, IN_HEAP | IS_ARRAY, copy_obj_b, r0, r1, r15); 2765 2766 generate_copy_longs(StubGenStubId::copy_oop_uninit_f_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, copy_obj_uninit_f, r0, r1, r15); 2767 generate_copy_longs(StubGenStubId::copy_oop_uninit_b_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, copy_obj_uninit_b, r0, r1, r15); 2768 2769 StubRoutines::aarch64::_zero_blocks = generate_zero_blocks(); 2770 2771 //*** jbyte 2772 // Always need aligned and unaligned versions 2773 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::jbyte_disjoint_arraycopy_id, &entry); 2774 StubRoutines::_jbyte_arraycopy = generate_conjoint_copy(StubGenStubId::jbyte_arraycopy_id, entry, &entry_jbyte_arraycopy); 2775 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::arrayof_jbyte_disjoint_arraycopy_id, &entry); 2776 StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_copy(StubGenStubId::arrayof_jbyte_arraycopy_id, entry, nullptr); 2777 2778 //*** jshort 2779 // Always need aligned and unaligned versions 2780 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::jshort_disjoint_arraycopy_id, &entry); 2781 StubRoutines::_jshort_arraycopy = generate_conjoint_copy(StubGenStubId::jshort_arraycopy_id, entry, &entry_jshort_arraycopy); 2782 StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::arrayof_jshort_disjoint_arraycopy_id, &entry); 2783 StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_copy(StubGenStubId::arrayof_jshort_arraycopy_id, entry, nullptr); 2784 2785 //*** jint 2786 // Aligned versions 2787 StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::arrayof_jint_disjoint_arraycopy_id, &entry); 2788 StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_copy(StubGenStubId::arrayof_jint_arraycopy_id, entry, &entry_jint_arraycopy); 2789 // In 64 bit we need both aligned and unaligned versions of jint arraycopy. 
2790 // entry_jint_arraycopy always points to the unaligned version 2791 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::jint_disjoint_arraycopy_id, &entry); 2792 StubRoutines::_jint_arraycopy = generate_conjoint_copy(StubGenStubId::jint_arraycopy_id, entry, &entry_jint_arraycopy); 2793 2794 //*** jlong 2795 // It is always aligned 2796 StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::arrayof_jlong_disjoint_arraycopy_id, &entry); 2797 StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_copy(StubGenStubId::arrayof_jlong_arraycopy_id, entry, &entry_jlong_arraycopy); 2798 StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy; 2799 StubRoutines::_jlong_arraycopy = StubRoutines::_arrayof_jlong_arraycopy; 2800 2801 //*** oops 2802 { 2803 // With compressed oops we need unaligned versions; notice that 2804 // we overwrite entry_oop_arraycopy. 2805 bool aligned = !UseCompressedOops; 2806 2807 StubRoutines::_arrayof_oop_disjoint_arraycopy 2808 = generate_disjoint_copy(StubGenStubId::arrayof_oop_disjoint_arraycopy_id, &entry); 2809 StubRoutines::_arrayof_oop_arraycopy 2810 = generate_conjoint_copy(StubGenStubId::arrayof_oop_arraycopy_id, entry, &entry_oop_arraycopy); 2811 // Aligned versions without pre-barriers 2812 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit 2813 = generate_disjoint_copy(StubGenStubId::arrayof_oop_disjoint_arraycopy_uninit_id, &entry); 2814 StubRoutines::_arrayof_oop_arraycopy_uninit 2815 = generate_conjoint_copy(StubGenStubId::arrayof_oop_arraycopy_uninit_id, entry, nullptr); 2816 } 2817 2818 StubRoutines::_oop_disjoint_arraycopy = StubRoutines::_arrayof_oop_disjoint_arraycopy; 2819 StubRoutines::_oop_arraycopy = StubRoutines::_arrayof_oop_arraycopy; 2820 StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit; 2821 StubRoutines::_oop_arraycopy_uninit = StubRoutines::_arrayof_oop_arraycopy_uninit; 2822 2823 StubRoutines::_checkcast_arraycopy = generate_checkcast_copy(StubGenStubId::checkcast_arraycopy_id, &entry_checkcast_arraycopy); 2824 StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy(StubGenStubId::checkcast_arraycopy_uninit_id, nullptr); 2825 2826 StubRoutines::_unsafe_arraycopy = generate_unsafe_copy(entry_jbyte_arraycopy, 2827 entry_jshort_arraycopy, 2828 entry_jint_arraycopy, 2829 entry_jlong_arraycopy); 2830 2831 StubRoutines::_generic_arraycopy = generate_generic_copy(entry_jbyte_arraycopy, 2832 entry_jshort_arraycopy, 2833 entry_jint_arraycopy, 2834 entry_oop_arraycopy, 2835 entry_jlong_arraycopy, 2836 entry_checkcast_arraycopy); 2837 2838 StubRoutines::_jbyte_fill = generate_fill(StubGenStubId::jbyte_fill_id); 2839 StubRoutines::_jshort_fill = generate_fill(StubGenStubId::jshort_fill_id); 2840 StubRoutines::_jint_fill = generate_fill(StubGenStubId::jint_fill_id); 2841 StubRoutines::_arrayof_jbyte_fill = generate_fill(StubGenStubId::arrayof_jbyte_fill_id); 2842 StubRoutines::_arrayof_jshort_fill = generate_fill(StubGenStubId::arrayof_jshort_fill_id); 2843 StubRoutines::_arrayof_jint_fill = generate_fill(StubGenStubId::arrayof_jint_fill_id); 2844 } 2845 2846 void generate_math_stubs() { Unimplemented(); } 2847 2848 // Arguments: 2849 // 2850 // Inputs: 2851 // c_rarg0 - source byte array address 2852 // c_rarg1 - destination byte array address 2853 // c_rarg2 - K (key) in little endian int array 2854 // 2855 address generate_aescrypt_encryptBlock() { 2856 __ align(CodeEntryAlignment); 2857 
StubGenStubId stub_id = StubGenStubId::aescrypt_encryptBlock_id; 2858 StubCodeMark mark(this, stub_id); 2859 2860 const Register from = c_rarg0; // source array address 2861 const Register to = c_rarg1; // destination array address 2862 const Register key = c_rarg2; // key array address 2863 const Register keylen = rscratch1; 2864 2865 address start = __ pc(); 2866 __ enter(); 2867 2868 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2869 2870 __ aesenc_loadkeys(key, keylen); 2871 __ aesecb_encrypt(from, to, keylen); 2872 2873 __ mov(r0, 0); 2874 2875 __ leave(); 2876 __ ret(lr); 2877 2878 return start; 2879 } 2880 2881 // Arguments: 2882 // 2883 // Inputs: 2884 // c_rarg0 - source byte array address 2885 // c_rarg1 - destination byte array address 2886 // c_rarg2 - K (key) in little endian int array 2887 // 2888 address generate_aescrypt_decryptBlock() { 2889 assert(UseAES, "need AES cryptographic extension support"); 2890 __ align(CodeEntryAlignment); 2891 StubGenStubId stub_id = StubGenStubId::aescrypt_decryptBlock_id; 2892 StubCodeMark mark(this, stub_id); 2893 Label L_doLast; 2894 2895 const Register from = c_rarg0; // source array address 2896 const Register to = c_rarg1; // destination array address 2897 const Register key = c_rarg2; // key array address 2898 const Register keylen = rscratch1; 2899 2900 address start = __ pc(); 2901 __ enter(); // required for proper stackwalking of RuntimeStub frame 2902 2903 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2904 2905 __ aesecb_decrypt(from, to, key, keylen); 2906 2907 __ mov(r0, 0); 2908 2909 __ leave(); 2910 __ ret(lr); 2911 2912 return start; 2913 } 2914 2915 // Arguments: 2916 // 2917 // Inputs: 2918 // c_rarg0 - source byte array address 2919 // c_rarg1 - destination byte array address 2920 // c_rarg2 - K (key) in little endian int array 2921 // c_rarg3 - r vector byte array address 2922 // c_rarg4 - input length 2923 // 2924 // Output: 2925 // x0 - input length 2926 // 2927 address generate_cipherBlockChaining_encryptAESCrypt() { 2928 assert(UseAES, "need AES cryptographic extension support"); 2929 __ align(CodeEntryAlignment); 2930 StubGenStubId stub_id = StubGenStubId::cipherBlockChaining_encryptAESCrypt_id; 2931 StubCodeMark mark(this, stub_id); 2932 2933 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 2934 2935 const Register from = c_rarg0; // source array address 2936 const Register to = c_rarg1; // destination array address 2937 const Register key = c_rarg2; // key array address 2938 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2939 // and left with the results of the last encryption block 2940 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2941 const Register keylen = rscratch1; 2942 2943 address start = __ pc(); 2944 2945 __ enter(); 2946 2947 __ movw(rscratch2, len_reg); 2948 2949 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2950 2951 __ ld1(v0, __ T16B, rvec); 2952 2953 __ cmpw(keylen, 52); 2954 __ br(Assembler::CC, L_loadkeys_44); 2955 __ br(Assembler::EQ, L_loadkeys_52); 2956 2957 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2958 __ rev32(v17, __ T16B, v17); 2959 __ rev32(v18, __ T16B, v18); 2960 __ BIND(L_loadkeys_52); 2961 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2962 __ rev32(v19, __ T16B, v19); 2963 __ 
rev32(v20, __ T16B, v20); 2964 __ BIND(L_loadkeys_44); 2965 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2966 __ rev32(v21, __ T16B, v21); 2967 __ rev32(v22, __ T16B, v22); 2968 __ rev32(v23, __ T16B, v23); 2969 __ rev32(v24, __ T16B, v24); 2970 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2971 __ rev32(v25, __ T16B, v25); 2972 __ rev32(v26, __ T16B, v26); 2973 __ rev32(v27, __ T16B, v27); 2974 __ rev32(v28, __ T16B, v28); 2975 __ ld1(v29, v30, v31, __ T16B, key); 2976 __ rev32(v29, __ T16B, v29); 2977 __ rev32(v30, __ T16B, v30); 2978 __ rev32(v31, __ T16B, v31); 2979 2980 __ BIND(L_aes_loop); 2981 __ ld1(v1, __ T16B, __ post(from, 16)); 2982 __ eor(v0, __ T16B, v0, v1); 2983 2984 __ br(Assembler::CC, L_rounds_44); 2985 __ br(Assembler::EQ, L_rounds_52); 2986 2987 __ aese(v0, v17); __ aesmc(v0, v0); 2988 __ aese(v0, v18); __ aesmc(v0, v0); 2989 __ BIND(L_rounds_52); 2990 __ aese(v0, v19); __ aesmc(v0, v0); 2991 __ aese(v0, v20); __ aesmc(v0, v0); 2992 __ BIND(L_rounds_44); 2993 __ aese(v0, v21); __ aesmc(v0, v0); 2994 __ aese(v0, v22); __ aesmc(v0, v0); 2995 __ aese(v0, v23); __ aesmc(v0, v0); 2996 __ aese(v0, v24); __ aesmc(v0, v0); 2997 __ aese(v0, v25); __ aesmc(v0, v0); 2998 __ aese(v0, v26); __ aesmc(v0, v0); 2999 __ aese(v0, v27); __ aesmc(v0, v0); 3000 __ aese(v0, v28); __ aesmc(v0, v0); 3001 __ aese(v0, v29); __ aesmc(v0, v0); 3002 __ aese(v0, v30); 3003 __ eor(v0, __ T16B, v0, v31); 3004 3005 __ st1(v0, __ T16B, __ post(to, 16)); 3006 3007 __ subw(len_reg, len_reg, 16); 3008 __ cbnzw(len_reg, L_aes_loop); 3009 3010 __ st1(v0, __ T16B, rvec); 3011 3012 __ mov(r0, rscratch2); 3013 3014 __ leave(); 3015 __ ret(lr); 3016 3017 return start; 3018 } 3019 3020 // Arguments: 3021 // 3022 // Inputs: 3023 // c_rarg0 - source byte array address 3024 // c_rarg1 - destination byte array address 3025 // c_rarg2 - K (key) in little endian int array 3026 // c_rarg3 - r vector byte array address 3027 // c_rarg4 - input length 3028 // 3029 // Output: 3030 // r0 - input length 3031 // 3032 address generate_cipherBlockChaining_decryptAESCrypt() { 3033 assert(UseAES, "need AES cryptographic extension support"); 3034 __ align(CodeEntryAlignment); 3035 StubGenStubId stub_id = StubGenStubId::cipherBlockChaining_decryptAESCrypt_id; 3036 StubCodeMark mark(this, stub_id); 3037 3038 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 3039 3040 const Register from = c_rarg0; // source array address 3041 const Register to = c_rarg1; // destination array address 3042 const Register key = c_rarg2; // key array address 3043 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 3044 // and left with the results of the last encryption block 3045 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 3046 const Register keylen = rscratch1; 3047 3048 address start = __ pc(); 3049 3050 __ enter(); 3051 3052 __ movw(rscratch2, len_reg); 3053 3054 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 3055 3056 __ ld1(v2, __ T16B, rvec); 3057 3058 __ ld1(v31, __ T16B, __ post(key, 16)); 3059 __ rev32(v31, __ T16B, v31); 3060 3061 __ cmpw(keylen, 52); 3062 __ br(Assembler::CC, L_loadkeys_44); 3063 __ br(Assembler::EQ, L_loadkeys_52); 3064 3065 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 3066 __ rev32(v17, __ T16B, v17); 3067 __ rev32(v18, __ T16B, v18); 3068 __ BIND(L_loadkeys_52); 3069 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 3070 __ rev32(v19, __ T16B, 
v19); 3071 __ rev32(v20, __ T16B, v20); 3072 __ BIND(L_loadkeys_44); 3073 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 3074 __ rev32(v21, __ T16B, v21); 3075 __ rev32(v22, __ T16B, v22); 3076 __ rev32(v23, __ T16B, v23); 3077 __ rev32(v24, __ T16B, v24); 3078 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 3079 __ rev32(v25, __ T16B, v25); 3080 __ rev32(v26, __ T16B, v26); 3081 __ rev32(v27, __ T16B, v27); 3082 __ rev32(v28, __ T16B, v28); 3083 __ ld1(v29, v30, __ T16B, key); 3084 __ rev32(v29, __ T16B, v29); 3085 __ rev32(v30, __ T16B, v30); 3086 3087 __ BIND(L_aes_loop); 3088 __ ld1(v0, __ T16B, __ post(from, 16)); 3089 __ orr(v1, __ T16B, v0, v0); 3090 3091 __ br(Assembler::CC, L_rounds_44); 3092 __ br(Assembler::EQ, L_rounds_52); 3093 3094 __ aesd(v0, v17); __ aesimc(v0, v0); 3095 __ aesd(v0, v18); __ aesimc(v0, v0); 3096 __ BIND(L_rounds_52); 3097 __ aesd(v0, v19); __ aesimc(v0, v0); 3098 __ aesd(v0, v20); __ aesimc(v0, v0); 3099 __ BIND(L_rounds_44); 3100 __ aesd(v0, v21); __ aesimc(v0, v0); 3101 __ aesd(v0, v22); __ aesimc(v0, v0); 3102 __ aesd(v0, v23); __ aesimc(v0, v0); 3103 __ aesd(v0, v24); __ aesimc(v0, v0); 3104 __ aesd(v0, v25); __ aesimc(v0, v0); 3105 __ aesd(v0, v26); __ aesimc(v0, v0); 3106 __ aesd(v0, v27); __ aesimc(v0, v0); 3107 __ aesd(v0, v28); __ aesimc(v0, v0); 3108 __ aesd(v0, v29); __ aesimc(v0, v0); 3109 __ aesd(v0, v30); 3110 __ eor(v0, __ T16B, v0, v31); 3111 __ eor(v0, __ T16B, v0, v2); 3112 3113 __ st1(v0, __ T16B, __ post(to, 16)); 3114 __ orr(v2, __ T16B, v1, v1); 3115 3116 __ subw(len_reg, len_reg, 16); 3117 __ cbnzw(len_reg, L_aes_loop); 3118 3119 __ st1(v2, __ T16B, rvec); 3120 3121 __ mov(r0, rscratch2); 3122 3123 __ leave(); 3124 __ ret(lr); 3125 3126 return start; 3127 } 3128 3129 // Big-endian 128-bit + 64-bit -> 128-bit addition. 3130 // Inputs: 128-bits. in is preserved. 3131 // The least-significant 64-bit word is in the upper dword of each vector. 3132 // inc (the 64-bit increment) is preserved. Its lower dword must be zero. 3133 // Output: result 3134 void be_add_128_64(FloatRegister result, FloatRegister in, 3135 FloatRegister inc, FloatRegister tmp) { 3136 assert_different_registers(result, tmp, inc); 3137 3138 __ addv(result, __ T2D, in, inc); // Add inc to the least-significant dword of 3139 // input 3140 __ cm(__ HI, tmp, __ T2D, inc, result);// Check for result overflowing 3141 __ ext(tmp, __ T16B, tmp, tmp, 0x08); // Swap LSD of comparison result to MSD and 3142 // MSD == 0 (must be!) to LSD 3143 __ subv(result, __ T2D, result, tmp); // Subtract -1 from MSD if there was an overflow 3144 } 3145 3146 // CTR AES crypt. 
3147 // Arguments: 3148 // 3149 // Inputs: 3150 // c_rarg0 - source byte array address 3151 // c_rarg1 - destination byte array address 3152 // c_rarg2 - K (key) in little endian int array 3153 // c_rarg3 - counter vector byte array address 3154 // c_rarg4 - input length 3155 // c_rarg5 - saved encryptedCounter start 3156 // c_rarg6 - saved used length 3157 // 3158 // Output: 3159 // r0 - input length 3160 // 3161 address generate_counterMode_AESCrypt() { 3162 const Register in = c_rarg0; 3163 const Register out = c_rarg1; 3164 const Register key = c_rarg2; 3165 const Register counter = c_rarg3; 3166 const Register saved_len = c_rarg4, len = r10; 3167 const Register saved_encrypted_ctr = c_rarg5; 3168 const Register used_ptr = c_rarg6, used = r12; 3169 3170 const Register offset = r7; 3171 const Register keylen = r11; 3172 3173 const unsigned char block_size = 16; 3174 const int bulk_width = 4; 3175 // NB: bulk_width can be 4 or 8. 8 gives slightly faster 3176 // performance with larger data sizes, but it also means that the 3177 // fast path isn't used until you have at least 8 blocks, and up 3178 // to 127 bytes of data will be executed on the slow path. For 3179 // that reason, and also so as not to blow away too much icache, 4 3180 // blocks seems like a sensible compromise. 3181 3182 // Algorithm: 3183 // 3184 // if (len == 0) { 3185 // goto DONE; 3186 // } 3187 // int result = len; 3188 // do { 3189 // if (used >= blockSize) { 3190 // if (len >= bulk_width * blockSize) { 3191 // CTR_large_block(); 3192 // if (len == 0) 3193 // goto DONE; 3194 // } 3195 // for (;;) { 3196 // 16ByteVector v0 = counter; 3197 // embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0); 3198 // used = 0; 3199 // if (len < blockSize) 3200 // break; /* goto NEXT */ 3201 // 16ByteVector v1 = load16Bytes(in, offset); 3202 // v1 = v1 ^ encryptedCounter; 3203 // store16Bytes(out, offset); 3204 // used = blockSize; 3205 // offset += blockSize; 3206 // len -= blockSize; 3207 // if (len == 0) 3208 // goto DONE; 3209 // } 3210 // } 3211 // NEXT: 3212 // out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]); 3213 // len--; 3214 // } while (len != 0); 3215 // DONE: 3216 // return result; 3217 // 3218 // CTR_large_block() 3219 // Wide bulk encryption of whole blocks. 
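    //
    // The bulk path (CTR_large_block) handles the largest multiple of
    // bulk_width * 16 bytes and then returns to the block-at-a-time loop
    // above for the remainder.  Roughly (illustrative sketch only, not
    // generated code):
    //
    //   size_t bulk = len & -(bulk_width * 16);  // bytes taken by the wide loop
    //   wide_ctr_encrypt(in + offset, out + offset, bulk);
    //   offset += bulk;
    //   len    -= bulk;
    //   used    = 16;   // forces a fresh encrypted counter for the next block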
3220 3221 __ align(CodeEntryAlignment); 3222 StubGenStubId stub_id = StubGenStubId::counterMode_AESCrypt_id; 3223 StubCodeMark mark(this, stub_id); 3224 const address start = __ pc(); 3225 __ enter(); 3226 3227 Label DONE, CTR_large_block, large_block_return; 3228 __ ldrw(used, Address(used_ptr)); 3229 __ cbzw(saved_len, DONE); 3230 3231 __ mov(len, saved_len); 3232 __ mov(offset, 0); 3233 3234 // Compute #rounds for AES based on the length of the key array 3235 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 3236 3237 __ aesenc_loadkeys(key, keylen); 3238 3239 { 3240 Label L_CTR_loop, NEXT; 3241 3242 __ bind(L_CTR_loop); 3243 3244 __ cmp(used, block_size); 3245 __ br(__ LO, NEXT); 3246 3247 // Maybe we have a lot of data 3248 __ subsw(rscratch1, len, bulk_width * block_size); 3249 __ br(__ HS, CTR_large_block); 3250 __ BIND(large_block_return); 3251 __ cbzw(len, DONE); 3252 3253 // Setup the counter 3254 __ movi(v4, __ T4S, 0); 3255 __ movi(v5, __ T4S, 1); 3256 __ ins(v4, __ S, v5, 2, 2); // v4 contains { 0, 1 } 3257 3258 // 128-bit big-endian increment 3259 __ ld1(v0, __ T16B, counter); 3260 __ rev64(v16, __ T16B, v0); 3261 be_add_128_64(v16, v16, v4, /*tmp*/v5); 3262 __ rev64(v16, __ T16B, v16); 3263 __ st1(v16, __ T16B, counter); 3264 // Previous counter value is in v0 3265 // v4 contains { 0, 1 } 3266 3267 { 3268 // We have fewer than bulk_width blocks of data left. Encrypt 3269 // them one by one until there is less than a full block 3270 // remaining, being careful to save both the encrypted counter 3271 // and the counter. 3272 3273 Label inner_loop; 3274 __ bind(inner_loop); 3275 // Counter to encrypt is in v0 3276 __ aesecb_encrypt(noreg, noreg, keylen); 3277 __ st1(v0, __ T16B, saved_encrypted_ctr); 3278 3279 // Do we have a remaining full block? 3280 3281 __ mov(used, 0); 3282 __ cmp(len, block_size); 3283 __ br(__ LO, NEXT); 3284 3285 // Yes, we have a full block 3286 __ ldrq(v1, Address(in, offset)); 3287 __ eor(v1, __ T16B, v1, v0); 3288 __ strq(v1, Address(out, offset)); 3289 __ mov(used, block_size); 3290 __ add(offset, offset, block_size); 3291 3292 __ subw(len, len, block_size); 3293 __ cbzw(len, DONE); 3294 3295 // Increment the counter, store it back 3296 __ orr(v0, __ T16B, v16, v16); 3297 __ rev64(v16, __ T16B, v16); 3298 be_add_128_64(v16, v16, v4, /*tmp*/v5); 3299 __ rev64(v16, __ T16B, v16); 3300 __ st1(v16, __ T16B, counter); // Save the incremented counter back 3301 3302 __ b(inner_loop); 3303 } 3304 3305 __ BIND(NEXT); 3306 3307 // Encrypt a single byte, and loop. 3308 // We expect this to be a rare event. 
3309 __ ldrb(rscratch1, Address(in, offset)); 3310 __ ldrb(rscratch2, Address(saved_encrypted_ctr, used)); 3311 __ eor(rscratch1, rscratch1, rscratch2); 3312 __ strb(rscratch1, Address(out, offset)); 3313 __ add(offset, offset, 1); 3314 __ add(used, used, 1); 3315 __ subw(len, len,1); 3316 __ cbnzw(len, L_CTR_loop); 3317 } 3318 3319 __ bind(DONE); 3320 __ strw(used, Address(used_ptr)); 3321 __ mov(r0, saved_len); 3322 3323 __ leave(); // required for proper stackwalking of RuntimeStub frame 3324 __ ret(lr); 3325 3326 // Bulk encryption 3327 3328 __ BIND (CTR_large_block); 3329 assert(bulk_width == 4 || bulk_width == 8, "must be"); 3330 3331 if (bulk_width == 8) { 3332 __ sub(sp, sp, 4 * 16); 3333 __ st1(v12, v13, v14, v15, __ T16B, Address(sp)); 3334 } 3335 __ sub(sp, sp, 4 * 16); 3336 __ st1(v8, v9, v10, v11, __ T16B, Address(sp)); 3337 RegSet saved_regs = (RegSet::of(in, out, offset) 3338 + RegSet::of(saved_encrypted_ctr, used_ptr, len)); 3339 __ push(saved_regs, sp); 3340 __ andr(len, len, -16 * bulk_width); // 8/4 encryptions, 16 bytes per encryption 3341 __ add(in, in, offset); 3342 __ add(out, out, offset); 3343 3344 // Keys should already be loaded into the correct registers 3345 3346 __ ld1(v0, __ T16B, counter); // v0 contains the first counter 3347 __ rev64(v16, __ T16B, v0); // v16 contains byte-reversed counter 3348 3349 // AES/CTR loop 3350 { 3351 Label L_CTR_loop; 3352 __ BIND(L_CTR_loop); 3353 3354 // Setup the counters 3355 __ movi(v8, __ T4S, 0); 3356 __ movi(v9, __ T4S, 1); 3357 __ ins(v8, __ S, v9, 2, 2); // v8 contains { 0, 1 } 3358 3359 for (int i = 0; i < bulk_width; i++) { 3360 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i); 3361 __ rev64(v0_ofs, __ T16B, v16); 3362 be_add_128_64(v16, v16, v8, /*tmp*/v9); 3363 } 3364 3365 __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16)); 3366 3367 // Encrypt the counters 3368 __ aesecb_encrypt(noreg, noreg, keylen, v0, bulk_width); 3369 3370 if (bulk_width == 8) { 3371 __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16)); 3372 } 3373 3374 // XOR the encrypted counters with the inputs 3375 for (int i = 0; i < bulk_width; i++) { 3376 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i); 3377 FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i); 3378 __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs); 3379 } 3380 3381 // Write the encrypted data 3382 __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16)); 3383 if (bulk_width == 8) { 3384 __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16)); 3385 } 3386 3387 __ subw(len, len, 16 * bulk_width); 3388 __ cbnzw(len, L_CTR_loop); 3389 } 3390 3391 // Save the counter back where it goes 3392 __ rev64(v16, __ T16B, v16); 3393 __ st1(v16, __ T16B, counter); 3394 3395 __ pop(saved_regs, sp); 3396 3397 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16)); 3398 if (bulk_width == 8) { 3399 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16)); 3400 } 3401 3402 __ andr(rscratch1, len, -16 * bulk_width); 3403 __ sub(len, len, rscratch1); 3404 __ add(offset, offset, rscratch1); 3405 __ mov(used, 16); 3406 __ strw(used, Address(used_ptr)); 3407 __ b(large_block_return); 3408 3409 return start; 3410 } 3411 3412 // Vector AES Galois Counter Mode implementation. 
Parameters: 3413 // 3414 // in = c_rarg0 3415 // len = c_rarg1 3416 // ct = c_rarg2 - ciphertext that ghash will read (in for encrypt, out for decrypt) 3417 // out = c_rarg3 3418 // key = c_rarg4 3419 // state = c_rarg5 - GHASH.state 3420 // subkeyHtbl = c_rarg6 - powers of H 3421 // counter = c_rarg7 - 16 bytes of CTR 3422 // return - number of processed bytes 3423 address generate_galoisCounterMode_AESCrypt() { 3424 address ghash_polynomial = __ pc(); 3425 __ emit_int64(0x87); // The low-order bits of the field 3426 // polynomial (i.e. p = z^7+z^2+z+1) 3427 // repeated in the low and high parts of a 3428 // 128-bit vector 3429 __ emit_int64(0x87); 3430 3431 __ align(CodeEntryAlignment); 3432 StubGenStubId stub_id = StubGenStubId::galoisCounterMode_AESCrypt_id; 3433 StubCodeMark mark(this, stub_id); 3434 address start = __ pc(); 3435 __ enter(); 3436 3437 const Register in = c_rarg0; 3438 const Register len = c_rarg1; 3439 const Register ct = c_rarg2; 3440 const Register out = c_rarg3; 3441 // and updated with the incremented counter in the end 3442 3443 const Register key = c_rarg4; 3444 const Register state = c_rarg5; 3445 3446 const Register subkeyHtbl = c_rarg6; 3447 3448 const Register counter = c_rarg7; 3449 3450 const Register keylen = r10; 3451 // Save state before entering routine 3452 __ sub(sp, sp, 4 * 16); 3453 __ st1(v12, v13, v14, v15, __ T16B, Address(sp)); 3454 __ sub(sp, sp, 4 * 16); 3455 __ st1(v8, v9, v10, v11, __ T16B, Address(sp)); 3456 3457 // __ andr(len, len, -512); 3458 __ andr(len, len, -16 * 8); // 8 encryptions, 16 bytes per encryption 3459 __ str(len, __ pre(sp, -2 * wordSize)); 3460 3461 Label DONE; 3462 __ cbz(len, DONE); 3463 3464 // Compute #rounds for AES based on the length of the key array 3465 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 3466 3467 __ aesenc_loadkeys(key, keylen); 3468 __ ld1(v0, __ T16B, counter); // v0 contains the first counter 3469 __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter 3470 3471 // AES/CTR loop 3472 { 3473 Label L_CTR_loop; 3474 __ BIND(L_CTR_loop); 3475 3476 // Setup the counters 3477 __ movi(v8, __ T4S, 0); 3478 __ movi(v9, __ T4S, 1); 3479 __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 } 3480 3481 assert(v0->encoding() < v8->encoding(), ""); 3482 for (int i = v0->encoding(); i < v8->encoding(); i++) { 3483 FloatRegister f = as_FloatRegister(i); 3484 __ rev32(f, __ T16B, v16); 3485 __ addv(v16, __ T4S, v16, v8); 3486 } 3487 3488 __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16)); 3489 3490 // Encrypt the counters 3491 __ aesecb_encrypt(noreg, noreg, keylen, v0, /*unrolls*/8); 3492 3493 __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16)); 3494 3495 // XOR the encrypted counters with the inputs 3496 for (int i = 0; i < 8; i++) { 3497 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i); 3498 FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i); 3499 __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs); 3500 } 3501 __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16)); 3502 __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16)); 3503 3504 __ subw(len, len, 16 * 8); 3505 __ cbnzw(len, L_CTR_loop); 3506 } 3507 3508 __ rev32(v16, __ T16B, v16); 3509 __ st1(v16, __ T16B, counter); 3510 3511 __ ldr(len, Address(sp)); 3512 __ lsr(len, len, exact_log2(16)); // We want the count of blocks 3513 3514 // GHASH/CTR loop 3515 __ ghash_processBlocks_wide(ghash_polynomial, state, subkeyHtbl, ct, 3516 len, /*unrolls*/4); 3517 3518 
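// The GHASH loop above uses len as its running block count and should have
// counted it down to zero; the debug-only check below enforces that invariant.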
#ifdef ASSERT 3519 { Label L; 3520 __ cmp(len, (unsigned char)0); 3521 __ br(Assembler::EQ, L); 3522 __ stop("stubGenerator: abort"); 3523 __ bind(L); 3524 } 3525 #endif 3526 3527 __ bind(DONE); 3528 // Return the number of bytes processed 3529 __ ldr(r0, __ post(sp, 2 * wordSize)); 3530 3531 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16)); 3532 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16)); 3533 3534 __ leave(); // required for proper stackwalking of RuntimeStub frame 3535 __ ret(lr); 3536 return start; 3537 } 3538 3539 class Cached64Bytes { 3540 private: 3541 MacroAssembler *_masm; 3542 Register _regs[8]; 3543 3544 public: 3545 Cached64Bytes(MacroAssembler *masm, RegSet rs): _masm(masm) { 3546 assert(rs.size() == 8, "%u registers are used to cache 16 4-byte data", rs.size()); 3547 auto it = rs.begin(); 3548 for (auto &r: _regs) { 3549 r = *it; 3550 ++it; 3551 } 3552 } 3553 3554 void gen_loads(Register base) { 3555 for (int i = 0; i < 8; i += 2) { 3556 __ ldp(_regs[i], _regs[i + 1], Address(base, 8 * i)); 3557 } 3558 } 3559 3560 // Generate code extracting i-th unsigned word (4 bytes) from cached 64 bytes. 3561 void extract_u32(Register dest, int i) { 3562 __ ubfx(dest, _regs[i / 2], 32 * (i % 2), 32); 3563 } 3564 }; 3565 3566 // Utility routines for md5. 3567 // Clobbers r10 and r11. 3568 void md5_FF(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4, 3569 int k, int s, int t) { 3570 Register rscratch3 = r10; 3571 Register rscratch4 = r11; 3572 3573 __ eorw(rscratch3, r3, r4); 3574 __ movw(rscratch2, t); 3575 __ andw(rscratch3, rscratch3, r2); 3576 __ addw(rscratch4, r1, rscratch2); 3577 reg_cache.extract_u32(rscratch1, k); 3578 __ eorw(rscratch3, rscratch3, r4); 3579 __ addw(rscratch4, rscratch4, rscratch1); 3580 __ addw(rscratch3, rscratch3, rscratch4); 3581 __ rorw(rscratch2, rscratch3, 32 - s); 3582 __ addw(r1, rscratch2, r2); 3583 } 3584 3585 void md5_GG(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4, 3586 int k, int s, int t) { 3587 Register rscratch3 = r10; 3588 Register rscratch4 = r11; 3589 3590 reg_cache.extract_u32(rscratch1, k); 3591 __ movw(rscratch2, t); 3592 __ addw(rscratch4, r1, rscratch2); 3593 __ addw(rscratch4, rscratch4, rscratch1); 3594 __ bicw(rscratch2, r3, r4); 3595 __ andw(rscratch3, r2, r4); 3596 __ addw(rscratch2, rscratch2, rscratch4); 3597 __ addw(rscratch2, rscratch2, rscratch3); 3598 __ rorw(rscratch2, rscratch2, 32 - s); 3599 __ addw(r1, rscratch2, r2); 3600 } 3601 3602 void md5_HH(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4, 3603 int k, int s, int t) { 3604 Register rscratch3 = r10; 3605 Register rscratch4 = r11; 3606 3607 __ eorw(rscratch3, r3, r4); 3608 __ movw(rscratch2, t); 3609 __ addw(rscratch4, r1, rscratch2); 3610 reg_cache.extract_u32(rscratch1, k); 3611 __ eorw(rscratch3, rscratch3, r2); 3612 __ addw(rscratch4, rscratch4, rscratch1); 3613 __ addw(rscratch3, rscratch3, rscratch4); 3614 __ rorw(rscratch2, rscratch3, 32 - s); 3615 __ addw(r1, rscratch2, r2); 3616 } 3617 3618 void md5_II(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4, 3619 int k, int s, int t) { 3620 Register rscratch3 = r10; 3621 Register rscratch4 = r11; 3622 3623 __ movw(rscratch3, t); 3624 __ ornw(rscratch2, r2, r4); 3625 __ addw(rscratch4, r1, rscratch3); 3626 reg_cache.extract_u32(rscratch1, k); 3627 __ eorw(rscratch3, rscratch2, r3); 3628 __ addw(rscratch4, rscratch4, rscratch1); 3629 __ addw(rscratch3, rscratch3, rscratch4); 3630 __ 
rorw(rscratch2, rscratch3, 32 - s); 3631 __ addw(r1, rscratch2, r2); 3632 } 3633 3634 // Arguments: 3635 // 3636 // Inputs: 3637 // c_rarg0 - byte[] source+offset 3638 // c_rarg1 - int[] SHA.state 3639 // c_rarg2 - int offset 3640 // c_rarg3 - int limit 3641 // 3642 address generate_md5_implCompress(StubGenStubId stub_id) { 3643 bool multi_block; 3644 switch (stub_id) { 3645 case md5_implCompress_id: 3646 multi_block = false; 3647 break; 3648 case md5_implCompressMB_id: 3649 multi_block = true; 3650 break; 3651 default: 3652 ShouldNotReachHere(); 3653 } 3654 __ align(CodeEntryAlignment); 3655 3656 StubCodeMark mark(this, stub_id); 3657 address start = __ pc(); 3658 3659 Register buf = c_rarg0; 3660 Register state = c_rarg1; 3661 Register ofs = c_rarg2; 3662 Register limit = c_rarg3; 3663 Register a = r4; 3664 Register b = r5; 3665 Register c = r6; 3666 Register d = r7; 3667 Register rscratch3 = r10; 3668 Register rscratch4 = r11; 3669 3670 Register state_regs[2] = { r12, r13 }; 3671 RegSet saved_regs = RegSet::range(r16, r22) - r18_tls; 3672 Cached64Bytes reg_cache(_masm, RegSet::of(r14, r15) + saved_regs); // using 8 registers 3673 3674 __ push(saved_regs, sp); 3675 3676 __ ldp(state_regs[0], state_regs[1], Address(state)); 3677 __ ubfx(a, state_regs[0], 0, 32); 3678 __ ubfx(b, state_regs[0], 32, 32); 3679 __ ubfx(c, state_regs[1], 0, 32); 3680 __ ubfx(d, state_regs[1], 32, 32); 3681 3682 Label md5_loop; 3683 __ BIND(md5_loop); 3684 3685 reg_cache.gen_loads(buf); 3686 3687 // Round 1 3688 md5_FF(reg_cache, a, b, c, d, 0, 7, 0xd76aa478); 3689 md5_FF(reg_cache, d, a, b, c, 1, 12, 0xe8c7b756); 3690 md5_FF(reg_cache, c, d, a, b, 2, 17, 0x242070db); 3691 md5_FF(reg_cache, b, c, d, a, 3, 22, 0xc1bdceee); 3692 md5_FF(reg_cache, a, b, c, d, 4, 7, 0xf57c0faf); 3693 md5_FF(reg_cache, d, a, b, c, 5, 12, 0x4787c62a); 3694 md5_FF(reg_cache, c, d, a, b, 6, 17, 0xa8304613); 3695 md5_FF(reg_cache, b, c, d, a, 7, 22, 0xfd469501); 3696 md5_FF(reg_cache, a, b, c, d, 8, 7, 0x698098d8); 3697 md5_FF(reg_cache, d, a, b, c, 9, 12, 0x8b44f7af); 3698 md5_FF(reg_cache, c, d, a, b, 10, 17, 0xffff5bb1); 3699 md5_FF(reg_cache, b, c, d, a, 11, 22, 0x895cd7be); 3700 md5_FF(reg_cache, a, b, c, d, 12, 7, 0x6b901122); 3701 md5_FF(reg_cache, d, a, b, c, 13, 12, 0xfd987193); 3702 md5_FF(reg_cache, c, d, a, b, 14, 17, 0xa679438e); 3703 md5_FF(reg_cache, b, c, d, a, 15, 22, 0x49b40821); 3704 3705 // Round 2 3706 md5_GG(reg_cache, a, b, c, d, 1, 5, 0xf61e2562); 3707 md5_GG(reg_cache, d, a, b, c, 6, 9, 0xc040b340); 3708 md5_GG(reg_cache, c, d, a, b, 11, 14, 0x265e5a51); 3709 md5_GG(reg_cache, b, c, d, a, 0, 20, 0xe9b6c7aa); 3710 md5_GG(reg_cache, a, b, c, d, 5, 5, 0xd62f105d); 3711 md5_GG(reg_cache, d, a, b, c, 10, 9, 0x02441453); 3712 md5_GG(reg_cache, c, d, a, b, 15, 14, 0xd8a1e681); 3713 md5_GG(reg_cache, b, c, d, a, 4, 20, 0xe7d3fbc8); 3714 md5_GG(reg_cache, a, b, c, d, 9, 5, 0x21e1cde6); 3715 md5_GG(reg_cache, d, a, b, c, 14, 9, 0xc33707d6); 3716 md5_GG(reg_cache, c, d, a, b, 3, 14, 0xf4d50d87); 3717 md5_GG(reg_cache, b, c, d, a, 8, 20, 0x455a14ed); 3718 md5_GG(reg_cache, a, b, c, d, 13, 5, 0xa9e3e905); 3719 md5_GG(reg_cache, d, a, b, c, 2, 9, 0xfcefa3f8); 3720 md5_GG(reg_cache, c, d, a, b, 7, 14, 0x676f02d9); 3721 md5_GG(reg_cache, b, c, d, a, 12, 20, 0x8d2a4c8a); 3722 3723 // Round 3 3724 md5_HH(reg_cache, a, b, c, d, 5, 4, 0xfffa3942); 3725 md5_HH(reg_cache, d, a, b, c, 8, 11, 0x8771f681); 3726 md5_HH(reg_cache, c, d, a, b, 11, 16, 0x6d9d6122); 3727 md5_HH(reg_cache, b, c, d, a, 14, 23, 0xfde5380c); 3728 
md5_HH(reg_cache, a, b, c, d, 1, 4, 0xa4beea44); 3729 md5_HH(reg_cache, d, a, b, c, 4, 11, 0x4bdecfa9); 3730 md5_HH(reg_cache, c, d, a, b, 7, 16, 0xf6bb4b60); 3731 md5_HH(reg_cache, b, c, d, a, 10, 23, 0xbebfbc70); 3732 md5_HH(reg_cache, a, b, c, d, 13, 4, 0x289b7ec6); 3733 md5_HH(reg_cache, d, a, b, c, 0, 11, 0xeaa127fa); 3734 md5_HH(reg_cache, c, d, a, b, 3, 16, 0xd4ef3085); 3735 md5_HH(reg_cache, b, c, d, a, 6, 23, 0x04881d05); 3736 md5_HH(reg_cache, a, b, c, d, 9, 4, 0xd9d4d039); 3737 md5_HH(reg_cache, d, a, b, c, 12, 11, 0xe6db99e5); 3738 md5_HH(reg_cache, c, d, a, b, 15, 16, 0x1fa27cf8); 3739 md5_HH(reg_cache, b, c, d, a, 2, 23, 0xc4ac5665); 3740 3741 // Round 4 3742 md5_II(reg_cache, a, b, c, d, 0, 6, 0xf4292244); 3743 md5_II(reg_cache, d, a, b, c, 7, 10, 0x432aff97); 3744 md5_II(reg_cache, c, d, a, b, 14, 15, 0xab9423a7); 3745 md5_II(reg_cache, b, c, d, a, 5, 21, 0xfc93a039); 3746 md5_II(reg_cache, a, b, c, d, 12, 6, 0x655b59c3); 3747 md5_II(reg_cache, d, a, b, c, 3, 10, 0x8f0ccc92); 3748 md5_II(reg_cache, c, d, a, b, 10, 15, 0xffeff47d); 3749 md5_II(reg_cache, b, c, d, a, 1, 21, 0x85845dd1); 3750 md5_II(reg_cache, a, b, c, d, 8, 6, 0x6fa87e4f); 3751 md5_II(reg_cache, d, a, b, c, 15, 10, 0xfe2ce6e0); 3752 md5_II(reg_cache, c, d, a, b, 6, 15, 0xa3014314); 3753 md5_II(reg_cache, b, c, d, a, 13, 21, 0x4e0811a1); 3754 md5_II(reg_cache, a, b, c, d, 4, 6, 0xf7537e82); 3755 md5_II(reg_cache, d, a, b, c, 11, 10, 0xbd3af235); 3756 md5_II(reg_cache, c, d, a, b, 2, 15, 0x2ad7d2bb); 3757 md5_II(reg_cache, b, c, d, a, 9, 21, 0xeb86d391); 3758 3759 __ addw(a, state_regs[0], a); 3760 __ ubfx(rscratch2, state_regs[0], 32, 32); 3761 __ addw(b, rscratch2, b); 3762 __ addw(c, state_regs[1], c); 3763 __ ubfx(rscratch4, state_regs[1], 32, 32); 3764 __ addw(d, rscratch4, d); 3765 3766 __ orr(state_regs[0], a, b, Assembler::LSL, 32); 3767 __ orr(state_regs[1], c, d, Assembler::LSL, 32); 3768 3769 if (multi_block) { 3770 __ add(buf, buf, 64); 3771 __ add(ofs, ofs, 64); 3772 __ cmp(ofs, limit); 3773 __ br(Assembler::LE, md5_loop); 3774 __ mov(c_rarg0, ofs); // return ofs 3775 } 3776 3777 // write hash values back in the correct order 3778 __ stp(state_regs[0], state_regs[1], Address(state)); 3779 3780 __ pop(saved_regs, sp); 3781 3782 __ ret(lr); 3783 3784 return start; 3785 } 3786 3787 // Arguments: 3788 // 3789 // Inputs: 3790 // c_rarg0 - byte[] source+offset 3791 // c_rarg1 - int[] SHA.state 3792 // c_rarg2 - int offset 3793 // c_rarg3 - int limit 3794 // 3795 address generate_sha1_implCompress(StubGenStubId stub_id) { 3796 bool multi_block; 3797 switch (stub_id) { 3798 case sha1_implCompress_id: 3799 multi_block = false; 3800 break; 3801 case sha1_implCompressMB_id: 3802 multi_block = true; 3803 break; 3804 default: 3805 ShouldNotReachHere(); 3806 } 3807 3808 __ align(CodeEntryAlignment); 3809 3810 StubCodeMark mark(this, stub_id); 3811 address start = __ pc(); 3812 3813 Register buf = c_rarg0; 3814 Register state = c_rarg1; 3815 Register ofs = c_rarg2; 3816 Register limit = c_rarg3; 3817 3818 Label keys; 3819 Label sha1_loop; 3820 3821 // load the keys into v0..v3 3822 __ adr(rscratch1, keys); 3823 __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1)); 3824 // load 5 words state into v6, v7 3825 __ ldrq(v6, Address(state, 0)); 3826 __ ldrs(v7, Address(state, 16)); 3827 3828 3829 __ BIND(sha1_loop); 3830 // load 64 bytes of data into v16..v19 3831 __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? 
__ post(buf, 64) : buf); 3832 __ rev32(v16, __ T16B, v16); 3833 __ rev32(v17, __ T16B, v17); 3834 __ rev32(v18, __ T16B, v18); 3835 __ rev32(v19, __ T16B, v19); 3836 3837 // do the sha1 3838 __ addv(v4, __ T4S, v16, v0); 3839 __ orr(v20, __ T16B, v6, v6); 3840 3841 FloatRegister d0 = v16; 3842 FloatRegister d1 = v17; 3843 FloatRegister d2 = v18; 3844 FloatRegister d3 = v19; 3845 3846 for (int round = 0; round < 20; round++) { 3847 FloatRegister tmp1 = (round & 1) ? v4 : v5; 3848 FloatRegister tmp2 = (round & 1) ? v21 : v22; 3849 FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7; 3850 FloatRegister tmp4 = (round & 1) ? v5 : v4; 3851 FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3)); 3852 3853 if (round < 16) __ sha1su0(d0, __ T4S, d1, d2); 3854 if (round < 19) __ addv(tmp1, __ T4S, d1, key); 3855 __ sha1h(tmp2, __ T4S, v20); 3856 if (round < 5) 3857 __ sha1c(v20, __ T4S, tmp3, tmp4); 3858 else if (round < 10 || round >= 15) 3859 __ sha1p(v20, __ T4S, tmp3, tmp4); 3860 else 3861 __ sha1m(v20, __ T4S, tmp3, tmp4); 3862 if (round < 16) __ sha1su1(d0, __ T4S, d3); 3863 3864 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 3865 } 3866 3867 __ addv(v7, __ T2S, v7, v21); 3868 __ addv(v6, __ T4S, v6, v20); 3869 3870 if (multi_block) { 3871 __ add(ofs, ofs, 64); 3872 __ cmp(ofs, limit); 3873 __ br(Assembler::LE, sha1_loop); 3874 __ mov(c_rarg0, ofs); // return ofs 3875 } 3876 3877 __ strq(v6, Address(state, 0)); 3878 __ strs(v7, Address(state, 16)); 3879 3880 __ ret(lr); 3881 3882 __ bind(keys); 3883 __ emit_int32(0x5a827999); 3884 __ emit_int32(0x6ed9eba1); 3885 __ emit_int32(0x8f1bbcdc); 3886 __ emit_int32(0xca62c1d6); 3887 3888 return start; 3889 } 3890 3891 3892 // Arguments: 3893 // 3894 // Inputs: 3895 // c_rarg0 - byte[] source+offset 3896 // c_rarg1 - int[] SHA.state 3897 // c_rarg2 - int offset 3898 // c_rarg3 - int limit 3899 // 3900 address generate_sha256_implCompress(StubGenStubId stub_id) { 3901 bool multi_block; 3902 switch (stub_id) { 3903 case sha256_implCompress_id: 3904 multi_block = false; 3905 break; 3906 case sha256_implCompressMB_id: 3907 multi_block = true; 3908 break; 3909 default: 3910 ShouldNotReachHere(); 3911 } 3912 3913 static const uint32_t round_consts[64] = { 3914 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 3915 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, 3916 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 3917 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 3918 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 3919 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, 3920 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 3921 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, 3922 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 3923 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 3924 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 3925 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, 3926 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 3927 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, 3928 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 3929 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, 3930 }; 3931 3932 __ align(CodeEntryAlignment); 3933 3934 StubCodeMark mark(this, stub_id); 3935 address start = __ pc(); 3936 3937 Register buf = c_rarg0; 3938 Register state = c_rarg1; 3939 Register ofs = c_rarg2; 3940 Register limit = c_rarg3; 3941 3942 Label sha1_loop; 3943 3944 __ stpd(v8, v9, __ pre(sp, -32)); 3945 __ stpd(v10, v11, Address(sp, 16)); 3946 3947 // dga == v0 3948 // dgb == v1 3949 // dg0 == v2 3950 // dg1 == v3 3951 
// dg2 == v4 3952 // t0 == v6 3953 // t1 == v7 3954 3955 // load 16 keys to v16..v31 3956 __ lea(rscratch1, ExternalAddress((address)round_consts)); 3957 __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64)); 3958 __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64)); 3959 __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64)); 3960 __ ld1(v28, v29, v30, v31, __ T4S, rscratch1); 3961 3962 // load 8 words (256 bits) state 3963 __ ldpq(v0, v1, state); 3964 3965 __ BIND(sha1_loop); 3966 // load 64 bytes of data into v8..v11 3967 __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf); 3968 __ rev32(v8, __ T16B, v8); 3969 __ rev32(v9, __ T16B, v9); 3970 __ rev32(v10, __ T16B, v10); 3971 __ rev32(v11, __ T16B, v11); 3972 3973 __ addv(v6, __ T4S, v8, v16); 3974 __ orr(v2, __ T16B, v0, v0); 3975 __ orr(v3, __ T16B, v1, v1); 3976 3977 FloatRegister d0 = v8; 3978 FloatRegister d1 = v9; 3979 FloatRegister d2 = v10; 3980 FloatRegister d3 = v11; 3981 3982 3983 for (int round = 0; round < 16; round++) { 3984 FloatRegister tmp1 = (round & 1) ? v6 : v7; 3985 FloatRegister tmp2 = (round & 1) ? v7 : v6; 3986 FloatRegister tmp3 = (round & 1) ? v2 : v4; 3987 FloatRegister tmp4 = (round & 1) ? v4 : v2; 3988 3989 if (round < 12) __ sha256su0(d0, __ T4S, d1); 3990 __ orr(v4, __ T16B, v2, v2); 3991 if (round < 15) 3992 __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17)); 3993 __ sha256h(v2, __ T4S, v3, tmp2); 3994 __ sha256h2(v3, __ T4S, v4, tmp2); 3995 if (round < 12) __ sha256su1(d0, __ T4S, d2, d3); 3996 3997 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 3998 } 3999 4000 __ addv(v0, __ T4S, v0, v2); 4001 __ addv(v1, __ T4S, v1, v3); 4002 4003 if (multi_block) { 4004 __ add(ofs, ofs, 64); 4005 __ cmp(ofs, limit); 4006 __ br(Assembler::LE, sha1_loop); 4007 __ mov(c_rarg0, ofs); // return ofs 4008 } 4009 4010 __ ldpd(v10, v11, Address(sp, 16)); 4011 __ ldpd(v8, v9, __ post(sp, 32)); 4012 4013 __ stpq(v0, v1, state); 4014 4015 __ ret(lr); 4016 4017 return start; 4018 } 4019 4020 // Double rounds for sha512. 
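// vi0..vi4 carry the working state in rotating roles, vrc0 holds the current
// pair of round constants, and vin0..vin4 are message schedule words. Rounds
// below 32 still extend the schedule with sha512su0/su1; rounds below 36
// prefetch the next constant pair into vrc1.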
4021 void sha512_dround(int dr, 4022 FloatRegister vi0, FloatRegister vi1, 4023 FloatRegister vi2, FloatRegister vi3, 4024 FloatRegister vi4, FloatRegister vrc0, 4025 FloatRegister vrc1, FloatRegister vin0, 4026 FloatRegister vin1, FloatRegister vin2, 4027 FloatRegister vin3, FloatRegister vin4) { 4028 if (dr < 36) { 4029 __ ld1(vrc1, __ T2D, __ post(rscratch2, 16)); 4030 } 4031 __ addv(v5, __ T2D, vrc0, vin0); 4032 __ ext(v6, __ T16B, vi2, vi3, 8); 4033 __ ext(v5, __ T16B, v5, v5, 8); 4034 __ ext(v7, __ T16B, vi1, vi2, 8); 4035 __ addv(vi3, __ T2D, vi3, v5); 4036 if (dr < 32) { 4037 __ ext(v5, __ T16B, vin3, vin4, 8); 4038 __ sha512su0(vin0, __ T2D, vin1); 4039 } 4040 __ sha512h(vi3, __ T2D, v6, v7); 4041 if (dr < 32) { 4042 __ sha512su1(vin0, __ T2D, vin2, v5); 4043 } 4044 __ addv(vi4, __ T2D, vi1, vi3); 4045 __ sha512h2(vi3, __ T2D, vi1, vi0); 4046 } 4047 4048 // Arguments: 4049 // 4050 // Inputs: 4051 // c_rarg0 - byte[] source+offset 4052 // c_rarg1 - int[] SHA.state 4053 // c_rarg2 - int offset 4054 // c_rarg3 - int limit 4055 // 4056 address generate_sha512_implCompress(StubGenStubId stub_id) { 4057 bool multi_block; 4058 switch (stub_id) { 4059 case sha512_implCompress_id: 4060 multi_block = false; 4061 break; 4062 case sha512_implCompressMB_id: 4063 multi_block = true; 4064 break; 4065 default: 4066 ShouldNotReachHere(); 4067 } 4068 4069 static const uint64_t round_consts[80] = { 4070 0x428A2F98D728AE22L, 0x7137449123EF65CDL, 0xB5C0FBCFEC4D3B2FL, 4071 0xE9B5DBA58189DBBCL, 0x3956C25BF348B538L, 0x59F111F1B605D019L, 4072 0x923F82A4AF194F9BL, 0xAB1C5ED5DA6D8118L, 0xD807AA98A3030242L, 4073 0x12835B0145706FBEL, 0x243185BE4EE4B28CL, 0x550C7DC3D5FFB4E2L, 4074 0x72BE5D74F27B896FL, 0x80DEB1FE3B1696B1L, 0x9BDC06A725C71235L, 4075 0xC19BF174CF692694L, 0xE49B69C19EF14AD2L, 0xEFBE4786384F25E3L, 4076 0x0FC19DC68B8CD5B5L, 0x240CA1CC77AC9C65L, 0x2DE92C6F592B0275L, 4077 0x4A7484AA6EA6E483L, 0x5CB0A9DCBD41FBD4L, 0x76F988DA831153B5L, 4078 0x983E5152EE66DFABL, 0xA831C66D2DB43210L, 0xB00327C898FB213FL, 4079 0xBF597FC7BEEF0EE4L, 0xC6E00BF33DA88FC2L, 0xD5A79147930AA725L, 4080 0x06CA6351E003826FL, 0x142929670A0E6E70L, 0x27B70A8546D22FFCL, 4081 0x2E1B21385C26C926L, 0x4D2C6DFC5AC42AEDL, 0x53380D139D95B3DFL, 4082 0x650A73548BAF63DEL, 0x766A0ABB3C77B2A8L, 0x81C2C92E47EDAEE6L, 4083 0x92722C851482353BL, 0xA2BFE8A14CF10364L, 0xA81A664BBC423001L, 4084 0xC24B8B70D0F89791L, 0xC76C51A30654BE30L, 0xD192E819D6EF5218L, 4085 0xD69906245565A910L, 0xF40E35855771202AL, 0x106AA07032BBD1B8L, 4086 0x19A4C116B8D2D0C8L, 0x1E376C085141AB53L, 0x2748774CDF8EEB99L, 4087 0x34B0BCB5E19B48A8L, 0x391C0CB3C5C95A63L, 0x4ED8AA4AE3418ACBL, 4088 0x5B9CCA4F7763E373L, 0x682E6FF3D6B2B8A3L, 0x748F82EE5DEFB2FCL, 4089 0x78A5636F43172F60L, 0x84C87814A1F0AB72L, 0x8CC702081A6439ECL, 4090 0x90BEFFFA23631E28L, 0xA4506CEBDE82BDE9L, 0xBEF9A3F7B2C67915L, 4091 0xC67178F2E372532BL, 0xCA273ECEEA26619CL, 0xD186B8C721C0C207L, 4092 0xEADA7DD6CDE0EB1EL, 0xF57D4F7FEE6ED178L, 0x06F067AA72176FBAL, 4093 0x0A637DC5A2C898A6L, 0x113F9804BEF90DAEL, 0x1B710B35131C471BL, 4094 0x28DB77F523047D84L, 0x32CAAB7B40C72493L, 0x3C9EBE0A15C9BEBCL, 4095 0x431D67C49C100D4CL, 0x4CC5D4BECB3E42B6L, 0x597F299CFC657E2AL, 4096 0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L 4097 }; 4098 4099 __ align(CodeEntryAlignment); 4100 4101 StubCodeMark mark(this, stub_id); 4102 address start = __ pc(); 4103 4104 Register buf = c_rarg0; 4105 Register state = c_rarg1; 4106 Register ofs = c_rarg2; 4107 Register limit = c_rarg3; 4108 4109 __ stpd(v8, v9, __ pre(sp, -64)); 4110 __ stpd(v10, v11, Address(sp, 
16)); 4111 __ stpd(v12, v13, Address(sp, 32)); 4112 __ stpd(v14, v15, Address(sp, 48)); 4113 4114 Label sha512_loop; 4115 4116 // load state 4117 __ ld1(v8, v9, v10, v11, __ T2D, state); 4118 4119 // load first 4 round constants 4120 __ lea(rscratch1, ExternalAddress((address)round_consts)); 4121 __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64)); 4122 4123 __ BIND(sha512_loop); 4124 // load 128B of data into v12..v19 4125 __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64)); 4126 __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64)); 4127 __ rev64(v12, __ T16B, v12); 4128 __ rev64(v13, __ T16B, v13); 4129 __ rev64(v14, __ T16B, v14); 4130 __ rev64(v15, __ T16B, v15); 4131 __ rev64(v16, __ T16B, v16); 4132 __ rev64(v17, __ T16B, v17); 4133 __ rev64(v18, __ T16B, v18); 4134 __ rev64(v19, __ T16B, v19); 4135 4136 __ mov(rscratch2, rscratch1); 4137 4138 __ mov(v0, __ T16B, v8); 4139 __ mov(v1, __ T16B, v9); 4140 __ mov(v2, __ T16B, v10); 4141 __ mov(v3, __ T16B, v11); 4142 4143 sha512_dround( 0, v0, v1, v2, v3, v4, v20, v24, v12, v13, v19, v16, v17); 4144 sha512_dround( 1, v3, v0, v4, v2, v1, v21, v25, v13, v14, v12, v17, v18); 4145 sha512_dround( 2, v2, v3, v1, v4, v0, v22, v26, v14, v15, v13, v18, v19); 4146 sha512_dround( 3, v4, v2, v0, v1, v3, v23, v27, v15, v16, v14, v19, v12); 4147 sha512_dround( 4, v1, v4, v3, v0, v2, v24, v28, v16, v17, v15, v12, v13); 4148 sha512_dround( 5, v0, v1, v2, v3, v4, v25, v29, v17, v18, v16, v13, v14); 4149 sha512_dround( 6, v3, v0, v4, v2, v1, v26, v30, v18, v19, v17, v14, v15); 4150 sha512_dround( 7, v2, v3, v1, v4, v0, v27, v31, v19, v12, v18, v15, v16); 4151 sha512_dround( 8, v4, v2, v0, v1, v3, v28, v24, v12, v13, v19, v16, v17); 4152 sha512_dround( 9, v1, v4, v3, v0, v2, v29, v25, v13, v14, v12, v17, v18); 4153 sha512_dround(10, v0, v1, v2, v3, v4, v30, v26, v14, v15, v13, v18, v19); 4154 sha512_dround(11, v3, v0, v4, v2, v1, v31, v27, v15, v16, v14, v19, v12); 4155 sha512_dround(12, v2, v3, v1, v4, v0, v24, v28, v16, v17, v15, v12, v13); 4156 sha512_dround(13, v4, v2, v0, v1, v3, v25, v29, v17, v18, v16, v13, v14); 4157 sha512_dround(14, v1, v4, v3, v0, v2, v26, v30, v18, v19, v17, v14, v15); 4158 sha512_dround(15, v0, v1, v2, v3, v4, v27, v31, v19, v12, v18, v15, v16); 4159 sha512_dround(16, v3, v0, v4, v2, v1, v28, v24, v12, v13, v19, v16, v17); 4160 sha512_dround(17, v2, v3, v1, v4, v0, v29, v25, v13, v14, v12, v17, v18); 4161 sha512_dround(18, v4, v2, v0, v1, v3, v30, v26, v14, v15, v13, v18, v19); 4162 sha512_dround(19, v1, v4, v3, v0, v2, v31, v27, v15, v16, v14, v19, v12); 4163 sha512_dround(20, v0, v1, v2, v3, v4, v24, v28, v16, v17, v15, v12, v13); 4164 sha512_dround(21, v3, v0, v4, v2, v1, v25, v29, v17, v18, v16, v13, v14); 4165 sha512_dround(22, v2, v3, v1, v4, v0, v26, v30, v18, v19, v17, v14, v15); 4166 sha512_dround(23, v4, v2, v0, v1, v3, v27, v31, v19, v12, v18, v15, v16); 4167 sha512_dround(24, v1, v4, v3, v0, v2, v28, v24, v12, v13, v19, v16, v17); 4168 sha512_dround(25, v0, v1, v2, v3, v4, v29, v25, v13, v14, v12, v17, v18); 4169 sha512_dround(26, v3, v0, v4, v2, v1, v30, v26, v14, v15, v13, v18, v19); 4170 sha512_dround(27, v2, v3, v1, v4, v0, v31, v27, v15, v16, v14, v19, v12); 4171 sha512_dround(28, v4, v2, v0, v1, v3, v24, v28, v16, v17, v15, v12, v13); 4172 sha512_dround(29, v1, v4, v3, v0, v2, v25, v29, v17, v18, v16, v13, v14); 4173 sha512_dround(30, v0, v1, v2, v3, v4, v26, v30, v18, v19, v17, v14, v15); 4174 sha512_dround(31, v3, v0, v4, v2, v1, v27, v31, v19, v12, v18, v15, v16); 4175 sha512_dround(32, v2, v3, v1, 
v4, v0, v28, v24, v12, v0, v0, v0, v0);
4176 sha512_dround(33, v4, v2, v0, v1, v3, v29, v25, v13, v0, v0, v0, v0);
4177 sha512_dround(34, v1, v4, v3, v0, v2, v30, v26, v14, v0, v0, v0, v0);
4178 sha512_dround(35, v0, v1, v2, v3, v4, v31, v27, v15, v0, v0, v0, v0);
4179 sha512_dround(36, v3, v0, v4, v2, v1, v24, v0, v16, v0, v0, v0, v0);
4180 sha512_dround(37, v2, v3, v1, v4, v0, v25, v0, v17, v0, v0, v0, v0);
4181 sha512_dround(38, v4, v2, v0, v1, v3, v26, v0, v18, v0, v0, v0, v0);
4182 sha512_dround(39, v1, v4, v3, v0, v2, v27, v0, v19, v0, v0, v0, v0);
4183
4184 __ addv(v8, __ T2D, v8, v0);
4185 __ addv(v9, __ T2D, v9, v1);
4186 __ addv(v10, __ T2D, v10, v2);
4187 __ addv(v11, __ T2D, v11, v3);
4188
4189 if (multi_block) {
4190 __ add(ofs, ofs, 128);
4191 __ cmp(ofs, limit);
4192 __ br(Assembler::LE, sha512_loop);
4193 __ mov(c_rarg0, ofs); // return ofs
4194 }
4195
4196 __ st1(v8, v9, v10, v11, __ T2D, state);
4197
4198 __ ldpd(v14, v15, Address(sp, 48));
4199 __ ldpd(v12, v13, Address(sp, 32));
4200 __ ldpd(v10, v11, Address(sp, 16));
4201 __ ldpd(v8, v9, __ post(sp, 64));
4202
4203 __ ret(lr);
4204
4205 return start;
4206 }
4207
4208 // Execute one round of keccak for two computations in parallel.
4209 // One of the states should be loaded into the lower halves of
4210 // the vector registers v0-v24, the other should be loaded into
4211 // the upper halves of those registers. The ld1r instruction loads
4212 // the round constant into both halves of register v31.
4213 // Intermediate results c0...c4 and d0...d4 are computed
4214 // in registers v25...v30.
4215 // All vector instructions that are used operate on both register
4216 // halves in parallel.
4217 // If only a single computation is needed, one can load only the lower halves.
4218 void keccak_round(Register rscratch1) {
4219 __ eor3(v29, __ T16B, v4, v9, v14); // c4 = a4 ^ a9 ^ a14
4220 __ eor3(v26, __ T16B, v1, v6, v11); // c1 = a1 ^ a6 ^ a11
4221 __ eor3(v28, __ T16B, v3, v8, v13); // c3 = a3 ^ a8 ^ a13
4222 __ eor3(v25, __ T16B, v0, v5, v10); // c0 = a0 ^ a5 ^ a10
4223 __ eor3(v27, __ T16B, v2, v7, v12); // c2 = a2 ^ a7 ^ a12
4224 __ eor3(v29, __ T16B, v29, v19, v24); // c4 ^= a19 ^ a24
4225 __ eor3(v26, __ T16B, v26, v16, v21); // c1 ^= a16 ^ a21
4226 __ eor3(v28, __ T16B, v28, v18, v23); // c3 ^= a18 ^ a23
4227 __ eor3(v25, __ T16B, v25, v15, v20); // c0 ^= a15 ^ a20
4228 __ eor3(v27, __ T16B, v27, v17, v22); // c2 ^= a17 ^ a22
4229
4230 __ rax1(v30, __ T2D, v29, v26); // d0 = c4 ^ rol(c1, 1)
4231 __ rax1(v26, __ T2D, v26, v28); // d2 = c1 ^ rol(c3, 1)
4232 __ rax1(v28, __ T2D, v28, v25); // d4 = c3 ^ rol(c0, 1)
4233 __ rax1(v25, __ T2D, v25, v27); // d1 = c0 ^ rol(c2, 1)
4234 __ rax1(v27, __ T2D, v27, v29); // d3 = c2 ^ rol(c4, 1)
4235
4236 __ eor(v0, __ T16B, v0, v30); // a0 = a0 ^ d0
4237 __ xar(v29, __ T2D, v1, v25, (64 - 1)); // a10' = rol((a1^d1), 1)
4238 __ xar(v1, __ T2D, v6, v25, (64 - 44)); // a1 = rol((a6^d1), 44)
4239 __ xar(v6, __ T2D, v9, v28, (64 - 20)); // a6 = rol((a9^d4), 20)
4240 __ xar(v9, __ T2D, v22, v26, (64 - 61)); // a9 = rol((a22^d2), 61)
4241 __ xar(v22, __ T2D, v14, v28, (64 - 39)); // a22 = rol((a14^d4), 39)
4242 __ xar(v14, __ T2D, v20, v30, (64 - 18)); // a14 = rol((a20^d0), 18)
4243 __ xar(v31, __ T2D, v2, v26, (64 - 62)); // a20' = rol((a2^d2), 62)
4244 __ xar(v2, __ T2D, v12, v26, (64 - 43)); // a2 = rol((a12^d2), 43)
4245 __ xar(v12, __ T2D, v13, v27, (64 - 25)); // a12 = rol((a13^d3), 25)
4246 __ xar(v13, __ T2D, v19, v28, (64 - 8)); // a13 = rol((a19^d4), 8)
4247 __ xar(v19, __ T2D, v23,
v27, (64 - 56)); // a19 = rol((a23^d3), 56) 4248 __ xar(v23, __ T2D, v15, v30, (64 - 41)); // a23 = rol((a15^d0), 41) 4249 __ xar(v15, __ T2D, v4, v28, (64 - 27)); // a15 = rol((a4^d4), 27) 4250 __ xar(v28, __ T2D, v24, v28, (64 - 14)); // a4' = rol((a24^d4), 14) 4251 __ xar(v24, __ T2D, v21, v25, (64 - 2)); // a24 = rol((a21^d1), 2) 4252 __ xar(v8, __ T2D, v8, v27, (64 - 55)); // a21' = rol((a8^d3), 55) 4253 __ xar(v4, __ T2D, v16, v25, (64 - 45)); // a8' = rol((a16^d1), 45) 4254 __ xar(v16, __ T2D, v5, v30, (64 - 36)); // a16 = rol((a5^d0), 36) 4255 __ xar(v5, __ T2D, v3, v27, (64 - 28)); // a5 = rol((a3^d3), 28) 4256 __ xar(v27, __ T2D, v18, v27, (64 - 21)); // a3' = rol((a18^d3), 21) 4257 __ xar(v3, __ T2D, v17, v26, (64 - 15)); // a18' = rol((a17^d2), 15) 4258 __ xar(v25, __ T2D, v11, v25, (64 - 10)); // a17' = rol((a11^d1), 10) 4259 __ xar(v26, __ T2D, v7, v26, (64 - 6)); // a11' = rol((a7^d2), 6) 4260 __ xar(v30, __ T2D, v10, v30, (64 - 3)); // a7' = rol((a10^d0), 3) 4261 4262 __ bcax(v20, __ T16B, v31, v22, v8); // a20 = a20' ^ (~a21 & a22') 4263 __ bcax(v21, __ T16B, v8, v23, v22); // a21 = a21' ^ (~a22 & a23) 4264 __ bcax(v22, __ T16B, v22, v24, v23); // a22 = a22 ^ (~a23 & a24) 4265 __ bcax(v23, __ T16B, v23, v31, v24); // a23 = a23 ^ (~a24 & a20') 4266 __ bcax(v24, __ T16B, v24, v8, v31); // a24 = a24 ^ (~a20' & a21') 4267 4268 __ ld1r(v31, __ T2D, __ post(rscratch1, 8)); // rc = round_constants[i] 4269 4270 __ bcax(v17, __ T16B, v25, v19, v3); // a17 = a17' ^ (~a18' & a19) 4271 __ bcax(v18, __ T16B, v3, v15, v19); // a18 = a18' ^ (~a19 & a15') 4272 __ bcax(v19, __ T16B, v19, v16, v15); // a19 = a19 ^ (~a15 & a16) 4273 __ bcax(v15, __ T16B, v15, v25, v16); // a15 = a15 ^ (~a16 & a17') 4274 __ bcax(v16, __ T16B, v16, v3, v25); // a16 = a16 ^ (~a17' & a18') 4275 4276 __ bcax(v10, __ T16B, v29, v12, v26); // a10 = a10' ^ (~a11' & a12) 4277 __ bcax(v11, __ T16B, v26, v13, v12); // a11 = a11' ^ (~a12 & a13) 4278 __ bcax(v12, __ T16B, v12, v14, v13); // a12 = a12 ^ (~a13 & a14) 4279 __ bcax(v13, __ T16B, v13, v29, v14); // a13 = a13 ^ (~a14 & a10') 4280 __ bcax(v14, __ T16B, v14, v26, v29); // a14 = a14 ^ (~a10' & a11') 4281 4282 __ bcax(v7, __ T16B, v30, v9, v4); // a7 = a7' ^ (~a8' & a9) 4283 __ bcax(v8, __ T16B, v4, v5, v9); // a8 = a8' ^ (~a9 & a5) 4284 __ bcax(v9, __ T16B, v9, v6, v5); // a9 = a9 ^ (~a5 & a6) 4285 __ bcax(v5, __ T16B, v5, v30, v6); // a5 = a5 ^ (~a6 & a7) 4286 __ bcax(v6, __ T16B, v6, v4, v30); // a6 = a6 ^ (~a7 & a8') 4287 4288 __ bcax(v3, __ T16B, v27, v0, v28); // a3 = a3' ^ (~a4' & a0) 4289 __ bcax(v4, __ T16B, v28, v1, v0); // a4 = a4' ^ (~a0 & a1) 4290 __ bcax(v0, __ T16B, v0, v2, v1); // a0 = a0 ^ (~a1 & a2) 4291 __ bcax(v1, __ T16B, v1, v27, v2); // a1 = a1 ^ (~a2 & a3) 4292 __ bcax(v2, __ T16B, v2, v28, v27); // a2 = a2 ^ (~a3 & a4') 4293 4294 __ eor(v0, __ T16B, v0, v31); // a0 = a0 ^ rc 4295 } 4296 4297 // Arguments: 4298 // 4299 // Inputs: 4300 // c_rarg0 - byte[] source+offset 4301 // c_rarg1 - byte[] SHA.state 4302 // c_rarg2 - int block_size 4303 // c_rarg3 - int offset 4304 // c_rarg4 - int limit 4305 // 4306 address generate_sha3_implCompress(StubGenStubId stub_id) { 4307 bool multi_block; 4308 switch (stub_id) { 4309 case sha3_implCompress_id: 4310 multi_block = false; 4311 break; 4312 case sha3_implCompressMB_id: 4313 multi_block = true; 4314 break; 4315 default: 4316 ShouldNotReachHere(); 4317 } 4318 4319 static const uint64_t round_consts[24] = { 4320 0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL, 4321 0x8000000080008000L, 
0x000000000000808BL, 0x0000000080000001L, 4322 0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL, 4323 0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL, 4324 0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L, 4325 0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L, 4326 0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L, 4327 0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L 4328 }; 4329 4330 __ align(CodeEntryAlignment); 4331 4332 StubCodeMark mark(this, stub_id); 4333 address start = __ pc(); 4334 4335 Register buf = c_rarg0; 4336 Register state = c_rarg1; 4337 Register block_size = c_rarg2; 4338 Register ofs = c_rarg3; 4339 Register limit = c_rarg4; 4340 4341 Label sha3_loop, rounds24_loop; 4342 Label sha3_512_or_sha3_384, shake128; 4343 4344 __ stpd(v8, v9, __ pre(sp, -64)); 4345 __ stpd(v10, v11, Address(sp, 16)); 4346 __ stpd(v12, v13, Address(sp, 32)); 4347 __ stpd(v14, v15, Address(sp, 48)); 4348 4349 // load state 4350 __ add(rscratch1, state, 32); 4351 __ ld1(v0, v1, v2, v3, __ T1D, state); 4352 __ ld1(v4, v5, v6, v7, __ T1D, __ post(rscratch1, 32)); 4353 __ ld1(v8, v9, v10, v11, __ T1D, __ post(rscratch1, 32)); 4354 __ ld1(v12, v13, v14, v15, __ T1D, __ post(rscratch1, 32)); 4355 __ ld1(v16, v17, v18, v19, __ T1D, __ post(rscratch1, 32)); 4356 __ ld1(v20, v21, v22, v23, __ T1D, __ post(rscratch1, 32)); 4357 __ ld1(v24, __ T1D, rscratch1); 4358 4359 __ BIND(sha3_loop); 4360 4361 // 24 keccak rounds 4362 __ movw(rscratch2, 24); 4363 4364 // load round_constants base 4365 __ lea(rscratch1, ExternalAddress((address) round_consts)); 4366 4367 // load input 4368 __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32)); 4369 __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24)); 4370 __ eor(v0, __ T8B, v0, v25); 4371 __ eor(v1, __ T8B, v1, v26); 4372 __ eor(v2, __ T8B, v2, v27); 4373 __ eor(v3, __ T8B, v3, v28); 4374 __ eor(v4, __ T8B, v4, v29); 4375 __ eor(v5, __ T8B, v5, v30); 4376 __ eor(v6, __ T8B, v6, v31); 4377 4378 // block_size == 72, SHA3-512; block_size == 104, SHA3-384 4379 __ tbz(block_size, 7, sha3_512_or_sha3_384); 4380 4381 __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32)); 4382 __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24)); 4383 __ eor(v7, __ T8B, v7, v25); 4384 __ eor(v8, __ T8B, v8, v26); 4385 __ eor(v9, __ T8B, v9, v27); 4386 __ eor(v10, __ T8B, v10, v28); 4387 __ eor(v11, __ T8B, v11, v29); 4388 __ eor(v12, __ T8B, v12, v30); 4389 __ eor(v13, __ T8B, v13, v31); 4390 4391 __ ld1(v25, v26, v27, __ T8B, __ post(buf, 24)); 4392 __ eor(v14, __ T8B, v14, v25); 4393 __ eor(v15, __ T8B, v15, v26); 4394 __ eor(v16, __ T8B, v16, v27); 4395 4396 // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256 4397 __ andw(c_rarg5, block_size, 48); 4398 __ cbzw(c_rarg5, rounds24_loop); 4399 4400 __ tbnz(block_size, 5, shake128); 4401 // block_size == 144, bit5 == 0, SHA3-224 4402 __ ldrd(v28, __ post(buf, 8)); 4403 __ eor(v17, __ T8B, v17, v28); 4404 __ b(rounds24_loop); 4405 4406 __ BIND(shake128); 4407 __ ld1(v28, v29, v30, v31, __ T8B, __ post(buf, 32)); 4408 __ eor(v17, __ T8B, v17, v28); 4409 __ eor(v18, __ T8B, v18, v29); 4410 __ eor(v19, __ T8B, v19, v30); 4411 __ eor(v20, __ T8B, v20, v31); 4412 __ b(rounds24_loop); // block_size == 168, SHAKE128 4413 4414 __ BIND(sha3_512_or_sha3_384); 4415 __ ld1(v25, v26, __ T8B, __ post(buf, 16)); 4416 __ eor(v7, __ T8B, v7, v25); 4417 __ eor(v8, __ T8B, v8, v26); 4418 __ tbz(block_size, 5, rounds24_loop); // SHA3-512 4419 4420 // SHA3-384 4421 __ ld1(v27, v28, 
v29, v30, __ T8B, __ post(buf, 32)); 4422 __ eor(v9, __ T8B, v9, v27); 4423 __ eor(v10, __ T8B, v10, v28); 4424 __ eor(v11, __ T8B, v11, v29); 4425 __ eor(v12, __ T8B, v12, v30); 4426 4427 __ BIND(rounds24_loop); 4428 __ subw(rscratch2, rscratch2, 1); 4429 4430 keccak_round(rscratch1); 4431 4432 __ cbnzw(rscratch2, rounds24_loop); 4433 4434 if (multi_block) { 4435 __ add(ofs, ofs, block_size); 4436 __ cmp(ofs, limit); 4437 __ br(Assembler::LE, sha3_loop); 4438 __ mov(c_rarg0, ofs); // return ofs 4439 } 4440 4441 __ st1(v0, v1, v2, v3, __ T1D, __ post(state, 32)); 4442 __ st1(v4, v5, v6, v7, __ T1D, __ post(state, 32)); 4443 __ st1(v8, v9, v10, v11, __ T1D, __ post(state, 32)); 4444 __ st1(v12, v13, v14, v15, __ T1D, __ post(state, 32)); 4445 __ st1(v16, v17, v18, v19, __ T1D, __ post(state, 32)); 4446 __ st1(v20, v21, v22, v23, __ T1D, __ post(state, 32)); 4447 __ st1(v24, __ T1D, state); 4448 4449 // restore callee-saved registers 4450 __ ldpd(v14, v15, Address(sp, 48)); 4451 __ ldpd(v12, v13, Address(sp, 32)); 4452 __ ldpd(v10, v11, Address(sp, 16)); 4453 __ ldpd(v8, v9, __ post(sp, 64)); 4454 4455 __ ret(lr); 4456 4457 return start; 4458 } 4459 4460 // Inputs: 4461 // c_rarg0 - long[] state0 4462 // c_rarg1 - long[] state1 4463 address generate_double_keccak() { 4464 static const uint64_t round_consts[24] = { 4465 0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL, 4466 0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L, 4467 0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL, 4468 0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL, 4469 0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L, 4470 0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L, 4471 0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L, 4472 0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L 4473 }; 4474 4475 // Implements the double_keccak() method of the 4476 // sun.secyrity.provider.SHA3Parallel class 4477 __ align(CodeEntryAlignment); 4478 StubCodeMark mark(this, "StubRoutines", "double_keccak"); 4479 address start = __ pc(); 4480 __ enter(); 4481 4482 Register state0 = c_rarg0; 4483 Register state1 = c_rarg1; 4484 4485 Label rounds24_loop; 4486 4487 // save callee-saved registers 4488 __ stpd(v8, v9, __ pre(sp, -64)); 4489 __ stpd(v10, v11, Address(sp, 16)); 4490 __ stpd(v12, v13, Address(sp, 32)); 4491 __ stpd(v14, v15, Address(sp, 48)); 4492 4493 // load states 4494 __ add(rscratch1, state0, 32); 4495 __ ld4(v0, v1, v2, v3, __ D, 0, state0); 4496 __ ld4(v4, v5, v6, v7, __ D, 0, __ post(rscratch1, 32)); 4497 __ ld4(v8, v9, v10, v11, __ D, 0, __ post(rscratch1, 32)); 4498 __ ld4(v12, v13, v14, v15, __ D, 0, __ post(rscratch1, 32)); 4499 __ ld4(v16, v17, v18, v19, __ D, 0, __ post(rscratch1, 32)); 4500 __ ld4(v20, v21, v22, v23, __ D, 0, __ post(rscratch1, 32)); 4501 __ ld1(v24, __ D, 0, rscratch1); 4502 __ add(rscratch1, state1, 32); 4503 __ ld4(v0, v1, v2, v3, __ D, 1, state1); 4504 __ ld4(v4, v5, v6, v7, __ D, 1, __ post(rscratch1, 32)); 4505 __ ld4(v8, v9, v10, v11, __ D, 1, __ post(rscratch1, 32)); 4506 __ ld4(v12, v13, v14, v15, __ D, 1, __ post(rscratch1, 32)); 4507 __ ld4(v16, v17, v18, v19, __ D, 1, __ post(rscratch1, 32)); 4508 __ ld4(v20, v21, v22, v23, __ D, 1, __ post(rscratch1, 32)); 4509 __ ld1(v24, __ D, 1, rscratch1); 4510 4511 // 24 keccak rounds 4512 __ movw(rscratch2, 24); 4513 4514 // load round_constants base 4515 __ lea(rscratch1, ExternalAddress((address) round_consts)); 4516 4517 __ BIND(rounds24_loop); 4518 
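// Each iteration applies one Keccak-f[1600] round to both interleaved states
// at once; keccak_round also post-increments rscratch1 past the round
// constant it loads into v31.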
__ subw(rscratch2, rscratch2, 1); 4519 keccak_round(rscratch1); 4520 __ cbnzw(rscratch2, rounds24_loop); 4521 4522 __ st4(v0, v1, v2, v3, __ D, 0, __ post(state0, 32)); 4523 __ st4(v4, v5, v6, v7, __ D, 0, __ post(state0, 32)); 4524 __ st4(v8, v9, v10, v11, __ D, 0, __ post(state0, 32)); 4525 __ st4(v12, v13, v14, v15, __ D, 0, __ post(state0, 32)); 4526 __ st4(v16, v17, v18, v19, __ D, 0, __ post(state0, 32)); 4527 __ st4(v20, v21, v22, v23, __ D, 0, __ post(state0, 32)); 4528 __ st1(v24, __ D, 0, state0); 4529 __ st4(v0, v1, v2, v3, __ D, 1, __ post(state1, 32)); 4530 __ st4(v4, v5, v6, v7, __ D, 1, __ post(state1, 32)); 4531 __ st4(v8, v9, v10, v11, __ D, 1, __ post(state1, 32)); 4532 __ st4(v12, v13, v14, v15, __ D, 1, __ post(state1, 32)); 4533 __ st4(v16, v17, v18, v19, __ D, 1, __ post(state1, 32)); 4534 __ st4(v20, v21, v22, v23, __ D, 1, __ post(state1, 32)); 4535 __ st1(v24, __ D, 1, state1); 4536 4537 // restore callee-saved vector registers 4538 __ ldpd(v14, v15, Address(sp, 48)); 4539 __ ldpd(v12, v13, Address(sp, 32)); 4540 __ ldpd(v10, v11, Address(sp, 16)); 4541 __ ldpd(v8, v9, __ post(sp, 64)); 4542 4543 __ leave(); // required for proper stackwalking of RuntimeStub frame 4544 __ mov(r0, zr); // return 0 4545 __ ret(lr); 4546 4547 return start; 4548 } 4549 4550 // ChaCha20 block function. This version parallelizes the 32-bit 4551 // state elements on each of 16 vectors, producing 4 blocks of 4552 // keystream at a time. 4553 // 4554 // state (int[16]) = c_rarg0 4555 // keystream (byte[256]) = c_rarg1 4556 // return - number of bytes of produced keystream (always 256) 4557 // 4558 // This implementation takes each 32-bit integer from the state 4559 // array and broadcasts it across all 4 32-bit lanes of a vector register 4560 // (e.g. state[0] is replicated on all 4 lanes of v4, state[1] to all 4 lanes 4561 // of v5, etc.). Once all 16 elements have been broadcast onto 16 vectors, 4562 // the quarter round schedule is implemented as outlined in RFC 7539 section 4563 // 2.3. However, instead of sequentially processing the 3 quarter round 4564 // operations represented by one QUARTERROUND function, we instead stack all 4565 // the adds, xors and left-rotations from the first 4 quarter rounds together 4566 // and then do the same for the second set of 4 quarter rounds. This removes 4567 // some latency that would otherwise be incurred by waiting for an add to 4568 // complete before performing an xor (which depends on the result of the 4569 // add), etc. An adjustment happens between the first and second groups of 4 4570 // quarter rounds, but this is done only in the inputs to the macro functions 4571 // that generate the assembly instructions - these adjustments themselves are 4572 // not part of the resulting assembly. 4573 // The 4 registers v0-v3 are used during the quarter round operations as 4574 // scratch registers. Once the 20 rounds are complete, these 4 scratch 4575 // registers become the vectors involved in adding the start state back onto 4576 // the post-QR working state. After the adds are complete, each of the 16 4577 // vectors write their first lane back to the keystream buffer, followed 4578 // by the second lane from all vectors and so on. 4579 address generate_chacha20Block_blockpar() { 4580 Label L_twoRounds, L_cc20_const; 4581 // The constant data is broken into two 128-bit segments to be loaded 4582 // onto FloatRegisters. The first 128 bits are a counter add overlay 4583 // that adds +0/+1/+2/+3 to the vector holding replicated state[12]. 
4584 // The second 128 bits are a table constant used for 8-bit left rotations.
4585 __ BIND(L_cc20_const);
4586 __ emit_int64(0x0000000100000000UL);
4587 __ emit_int64(0x0000000300000002UL);
4588 __ emit_int64(0x0605040702010003UL);
4589 __ emit_int64(0x0E0D0C0F0A09080BUL);
4590
4591 __ align(CodeEntryAlignment);
4592 StubGenStubId stub_id = StubGenStubId::chacha20Block_id;
4593 StubCodeMark mark(this, stub_id);
4594 address start = __ pc();
4595 __ enter();
4596
4597 int i, j;
4598 const Register state = c_rarg0;
4599 const Register keystream = c_rarg1;
4600 const Register loopCtr = r10;
4601 const Register tmpAddr = r11;
4602 const FloatRegister ctrAddOverlay = v28;
4603 const FloatRegister lrot8Tbl = v29;
4604
4605 // Organize SIMD registers in an array that facilitates
4606 // putting repetitive opcodes into loop structures. It is
4607 // important that each grouping of 4 registers is monotonically
4608 // increasing to support the requirements of multi-register
4609 // instructions (e.g. ld4r, st4, etc.)
4610 const FloatRegister workSt[16] = {
4611 v4, v5, v6, v7, v16, v17, v18, v19,
4612 v20, v21, v22, v23, v24, v25, v26, v27
4613 };
4614
4615 // Pull in constant data. The first 16 bytes are the add overlay
4616 // which is applied to the vector holding the counter (state[12]).
4617 // The second 16 bytes are the index register for the 8-bit left
4618 // rotation tbl instruction.
4619 __ adr(tmpAddr, L_cc20_const);
4620 __ ldpq(ctrAddOverlay, lrot8Tbl, Address(tmpAddr));
4621
4622 // Load from memory and interlace across 16 SIMD registers,
4623 // with each word from memory being broadcast to all lanes of
4624 // each successive SIMD register.
4625 // Addr(0) -> All lanes in workSt[i]
4626 // Addr(4) -> All lanes in workSt[i + 1], etc.
4627 __ mov(tmpAddr, state);
4628 for (i = 0; i < 16; i += 4) {
4629 __ ld4r(workSt[i], workSt[i + 1], workSt[i + 2], workSt[i + 3], __ T4S,
4630 __ post(tmpAddr, 16));
4631 }
4632 __ addv(workSt[12], __ T4S, workSt[12], ctrAddOverlay); // Add ctr overlay
4633
4634 // Before entering the loop, create 5 4-register arrays. These
4635 // will hold the 4 registers that represent the a/b/c/d fields
4636 // in the quarter round operation. For instance, the "b" field
4637 // for the first 4 quarter round operations is the set of v16/v17/v18/v19,
4638 // but in the second 4 quarter rounds it gets adjusted to v17/v18/v19/v16
4639 // since it is part of a diagonal organization. The aSet and scratch
4640 // register sets are defined at declaration time because they do not change
4641 // organization at any point during the 20-round processing.
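// For reference, the scalar quarter round from RFC 7539 section 2.3 that
// each vectorized group of operations below applies four times in parallel
// (once per QUARTERROUND listed in the comments):
//   a += b; d ^= a; d <<<= 16;
//   c += d; b ^= c; b <<<= 12;
//   a += b; d ^= a; d <<<= 8;
//   c += d; b ^= c; b <<<= 7;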
4642 FloatRegister aSet[4] = { v4, v5, v6, v7 };
4643 FloatRegister bSet[4];
4644 FloatRegister cSet[4];
4645 FloatRegister dSet[4];
4646 FloatRegister scratch[4] = { v0, v1, v2, v3 };
4647
4648 // Set up the 10 iteration loop and perform all 8 quarter round ops
4649 __ mov(loopCtr, 10);
4650 __ BIND(L_twoRounds);
4651
4652 // Set to columnar organization and do the following 4 quarter-rounds:
4653 // QUARTERROUND(0, 4, 8, 12)
4654 // QUARTERROUND(1, 5, 9, 13)
4655 // QUARTERROUND(2, 6, 10, 14)
4656 // QUARTERROUND(3, 7, 11, 15)
4657 __ cc20_set_qr_registers(bSet, workSt, 4, 5, 6, 7);
4658 __ cc20_set_qr_registers(cSet, workSt, 8, 9, 10, 11);
4659 __ cc20_set_qr_registers(dSet, workSt, 12, 13, 14, 15);
4660
4661 __ cc20_qr_add4(aSet, bSet); // a += b
4662 __ cc20_qr_xor4(dSet, aSet, dSet); // d ^= a
4663 __ cc20_qr_lrot4(dSet, dSet, 16, lrot8Tbl); // d <<<= 16
4664
4665 __ cc20_qr_add4(cSet, dSet); // c += d
4666 __ cc20_qr_xor4(bSet, cSet, scratch); // b ^= c (scratch)
4667 __ cc20_qr_lrot4(scratch, bSet, 12, lrot8Tbl); // b <<<= 12
4668
4669 __ cc20_qr_add4(aSet, bSet); // a += b
4670 __ cc20_qr_xor4(dSet, aSet, dSet); // d ^= a
4671 __ cc20_qr_lrot4(dSet, dSet, 8, lrot8Tbl); // d <<<= 8
4672
4673 __ cc20_qr_add4(cSet, dSet); // c += d
4674 __ cc20_qr_xor4(bSet, cSet, scratch); // b ^= c (scratch)
4675 __ cc20_qr_lrot4(scratch, bSet, 7, lrot8Tbl); // b <<<= 7
4676
4677 // Set to diagonal organization and do the next 4 quarter-rounds:
4678 // QUARTERROUND(0, 5, 10, 15)
4679 // QUARTERROUND(1, 6, 11, 12)
4680 // QUARTERROUND(2, 7, 8, 13)
4681 // QUARTERROUND(3, 4, 9, 14)
4682 __ cc20_set_qr_registers(bSet, workSt, 5, 6, 7, 4);
4683 __ cc20_set_qr_registers(cSet, workSt, 10, 11, 8, 9);
4684 __ cc20_set_qr_registers(dSet, workSt, 15, 12, 13, 14);
4685
4686 __ cc20_qr_add4(aSet, bSet); // a += b
4687 __ cc20_qr_xor4(dSet, aSet, dSet); // d ^= a
4688 __ cc20_qr_lrot4(dSet, dSet, 16, lrot8Tbl); // d <<<= 16
4689
4690 __ cc20_qr_add4(cSet, dSet); // c += d
4691 __ cc20_qr_xor4(bSet, cSet, scratch); // b ^= c (scratch)
4692 __ cc20_qr_lrot4(scratch, bSet, 12, lrot8Tbl); // b <<<= 12
4693
4694 __ cc20_qr_add4(aSet, bSet); // a += b
4695 __ cc20_qr_xor4(dSet, aSet, dSet); // d ^= a
4696 __ cc20_qr_lrot4(dSet, dSet, 8, lrot8Tbl); // d <<<= 8
4697
4698 __ cc20_qr_add4(cSet, dSet); // c += d
4699 __ cc20_qr_xor4(bSet, cSet, scratch); // b ^= c (scratch)
4700 __ cc20_qr_lrot4(scratch, bSet, 7, lrot8Tbl); // b <<<= 7
4701
4702 // Decrement and iterate
4703 __ sub(loopCtr, loopCtr, 1);
4704 __ cbnz(loopCtr, L_twoRounds);
4705
4706 __ mov(tmpAddr, state);
4707
4708 // Add the starting state back to the post-loop keystream
4709 // state. We read/interlace the state array from memory into
4710 // 4 registers similar to what we did in the beginning. Then
4711 // add the counter overlay onto workSt[12] at the end.
4712 for (i = 0; i < 16; i += 4) {
4713 __ ld4r(v0, v1, v2, v3, __ T4S, __ post(tmpAddr, 16));
4714 __ addv(workSt[i], __ T4S, workSt[i], v0);
4715 __ addv(workSt[i + 1], __ T4S, workSt[i + 1], v1);
4716 __ addv(workSt[i + 2], __ T4S, workSt[i + 2], v2);
4717 __ addv(workSt[i + 3], __ T4S, workSt[i + 3], v3);
4718 }
4719 __ addv(workSt[12], __ T4S, workSt[12], ctrAddOverlay); // Add ctr overlay
4720
4721 // Write working state into the keystream buffer. This is accomplished
4722 // by taking the lane "i" from each of the four vectors and writing
4723 // it to consecutive 4-byte offsets, then post-incrementing by 16 and
4724 // repeating with the next 4 vectors until all 16 vectors have been used.
4725 // Then move to the next lane and repeat the process until all lanes have 4726 // been written. 4727 for (i = 0; i < 4; i++) { 4728 for (j = 0; j < 16; j += 4) { 4729 __ st4(workSt[j], workSt[j + 1], workSt[j + 2], workSt[j + 3], __ S, i, 4730 __ post(keystream, 16)); 4731 } 4732 } 4733 4734 __ mov(r0, 256); // Return length of output keystream 4735 __ leave(); 4736 __ ret(lr); 4737 4738 return start; 4739 } 4740 4741 // Helpers to schedule parallel operation bundles across vector 4742 // register sequences of size 2, 4 or 8. 4743 4744 // Implement various primitive computations across vector sequences 4745 4746 template<int N> 4747 void vs_addv(const VSeq<N>& v, Assembler::SIMD_Arrangement T, 4748 const VSeq<N>& v1, const VSeq<N>& v2) { 4749 // output must not be constant 4750 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4751 // output cannot overwrite pending inputs 4752 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4753 assert(!vs_write_before_read(v, v2), "output overwrites input"); 4754 for (int i = 0; i < N; i++) { 4755 __ addv(v[i], T, v1[i], v2[i]); 4756 } 4757 } 4758 4759 template<int N> 4760 void vs_subv(const VSeq<N>& v, Assembler::SIMD_Arrangement T, 4761 const VSeq<N>& v1, const VSeq<N>& v2) { 4762 // output must not be constant 4763 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4764 // output cannot overwrite pending inputs 4765 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4766 assert(!vs_write_before_read(v, v2), "output overwrites input"); 4767 for (int i = 0; i < N; i++) { 4768 __ subv(v[i], T, v1[i], v2[i]); 4769 } 4770 } 4771 4772 template<int N> 4773 void vs_mulv(const VSeq<N>& v, Assembler::SIMD_Arrangement T, 4774 const VSeq<N>& v1, const VSeq<N>& v2) { 4775 // output must not be constant 4776 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4777 // output cannot overwrite pending inputs 4778 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4779 assert(!vs_write_before_read(v, v2), "output overwrites input"); 4780 for (int i = 0; i < N; i++) { 4781 __ mulv(v[i], T, v1[i], v2[i]); 4782 } 4783 } 4784 4785 template<int N> 4786 void vs_negr(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1) { 4787 // output must not be constant 4788 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4789 // output cannot overwrite pending inputs 4790 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4791 for (int i = 0; i < N; i++) { 4792 __ negr(v[i], T, v1[i]); 4793 } 4794 } 4795 4796 template<int N> 4797 void vs_sshr(const VSeq<N>& v, Assembler::SIMD_Arrangement T, 4798 const VSeq<N>& v1, int shift) { 4799 // output must not be constant 4800 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4801 // output cannot overwrite pending inputs 4802 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4803 for (int i = 0; i < N; i++) { 4804 __ sshr(v[i], T, v1[i], shift); 4805 } 4806 } 4807 4808 template<int N> 4809 void vs_andr(const VSeq<N>& v, const VSeq<N>& v1, const VSeq<N>& v2) { 4810 // output must not be constant 4811 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4812 // output cannot overwrite pending inputs 4813 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4814 assert(!vs_write_before_read(v, v2), "output 
overwrites input"); 4815 for (int i = 0; i < N; i++) { 4816 __ andr(v[i], __ T16B, v1[i], v2[i]); 4817 } 4818 } 4819 4820 template<int N> 4821 void vs_orr(const VSeq<N>& v, const VSeq<N>& v1, const VSeq<N>& v2) { 4822 // output must not be constant 4823 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4824 // output cannot overwrite pending inputs 4825 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4826 assert(!vs_write_before_read(v, v2), "output overwrites input"); 4827 for (int i = 0; i < N; i++) { 4828 __ orr(v[i], __ T16B, v1[i], v2[i]); 4829 } 4830 } 4831 4832 template<int N> 4833 void vs_notr(const VSeq<N>& v, const VSeq<N>& v1) { 4834 // output must not be constant 4835 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4836 // output cannot overwrite pending inputs 4837 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4838 for (int i = 0; i < N; i++) { 4839 __ notr(v[i], __ T16B, v1[i]); 4840 } 4841 } 4842 4843 template<int N> 4844 void vs_sqdmulh(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1, const VSeq<N>& v2) { 4845 // output must not be constant 4846 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4847 // output cannot overwrite pending inputs 4848 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4849 assert(!vs_write_before_read(v, v2), "output overwrites input"); 4850 for (int i = 0; i < N; i++) { 4851 __ sqdmulh(v[i], T, v1[i], v2[i]); 4852 } 4853 } 4854 4855 template<int N> 4856 void vs_mlsv(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1, VSeq<N>& v2) { 4857 // output must not be constant 4858 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4859 // output cannot overwrite pending inputs 4860 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4861 assert(!vs_write_before_read(v, v2), "output overwrites input"); 4862 for (int i = 0; i < N; i++) { 4863 __ mlsv(v[i], T, v1[i], v2[i]); 4864 } 4865 } 4866 4867 // load N/2 successive pairs of quadword values from memory in order 4868 // into N successive vector registers of the sequence via the 4869 // address supplied in base. 
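// Since each ldpq fills a pair of registers of the sequence, N must be even.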
4870 template<int N> 4871 void vs_ldpq(const VSeq<N>& v, Register base) { 4872 for (int i = 0; i < N; i += 2) { 4873 __ ldpq(v[i], v[i+1], Address(base, 32 * i)); 4874 } 4875 } 4876 4877 // load N/2 successive pairs of quadword values from memory in order 4878 // into N vector registers of the sequence via the address supplied 4879 // in base using post-increment addressing 4880 template<int N> 4881 void vs_ldpq_post(const VSeq<N>& v, Register base) { 4882 static_assert((N & (N - 1)) == 0, "sequence length must be even"); 4883 for (int i = 0; i < N; i += 2) { 4884 __ ldpq(v[i], v[i+1], __ post(base, 32)); 4885 } 4886 } 4887 4888 // store N successive vector registers of the sequence into N/2 4889 // successive pairs of quadword memory locations via the address 4890 // supplied in base using post-increment addressing 4891 template<int N> 4892 void vs_stpq_post(const VSeq<N>& v, Register base) { 4893 static_assert((N & (N - 1)) == 0, "sequence length must be even"); 4894 for (int i = 0; i < N; i += 2) { 4895 __ stpq(v[i], v[i+1], __ post(base, 32)); 4896 } 4897 } 4898 4899 // load N/2 pairs of quadword values from memory de-interleaved into 4900 // N vector registers 2 at a time via the address supplied in base 4901 // using post-increment addressing. 4902 template<int N> 4903 void vs_ld2_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) { 4904 static_assert((N & (N - 1)) == 0, "sequence length must be even"); 4905 for (int i = 0; i < N; i += 2) { 4906 __ ld2(v[i], v[i+1], T, __ post(base, 32)); 4907 } 4908 } 4909 4910 // store N vector registers interleaved into N/2 pairs of quadword 4911 // memory locations via the address supplied in base using 4912 // post-increment addressing. 4913 template<int N> 4914 void vs_st2_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) { 4915 static_assert((N & (N - 1)) == 0, "sequence length must be even"); 4916 for (int i = 0; i < N; i += 2) { 4917 __ st2(v[i], v[i+1], T, __ post(base, 32)); 4918 } 4919 } 4920 4921 // load N quadword values from memory de-interleaved into N vector 4922 // registers 3 elements at a time via the address supplied in base. 4923 template<int N> 4924 void vs_ld3(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) { 4925 static_assert(N == ((N / 3) * 3), "sequence length must be multiple of 3"); 4926 for (int i = 0; i < N; i += 3) { 4927 __ ld3(v[i], v[i+1], v[i+2], T, base); 4928 } 4929 } 4930 4931 // load N quadword values from memory de-interleaved into N vector 4932 // registers 3 elements at a time via the address supplied in base 4933 // using post-increment addressing. 
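// Each ld3 group consumes 48 bytes (three quadwords), hence the
// post-increment of 48, and N must be a multiple of 3.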
4934 template<int N> 4935 void vs_ld3_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) { 4936 static_assert(N == ((N / 3) * 3), "sequence length must be multiple of 3"); 4937 for (int i = 0; i < N; i += 3) { 4938 __ ld3(v[i], v[i+1], v[i+2], T, __ post(base, 48)); 4939 } 4940 } 4941 4942 // load N/2 pairs of quadword values from memory into N vector 4943 // registers via the address supplied in base with each pair indexed 4944 // using the the start offset plus the corresponding entry in the 4945 // offsets array 4946 template<int N> 4947 void vs_ldpq_indexed(const VSeq<N>& v, Register base, int start, int (&offsets)[N/2]) { 4948 for (int i = 0; i < N/2; i++) { 4949 __ ldpq(v[2*i], v[2*i+1], Address(base, start + offsets[i])); 4950 } 4951 } 4952 4953 // store N vector registers into N/2 pairs of quadword memory 4954 // locations via the address supplied in base with each pair indexed 4955 // using the the start offset plus the corresponding entry in the 4956 // offsets array 4957 template<int N> 4958 void vs_stpq_indexed(const VSeq<N>& v, Register base, int start, int offsets[N/2]) { 4959 for (int i = 0; i < N/2; i++) { 4960 __ stpq(v[2*i], v[2*i+1], Address(base, start + offsets[i])); 4961 } 4962 } 4963 4964 // load N single quadword values from memory into N vector registers 4965 // via the address supplied in base with each value indexed using 4966 // the the start offset plus the corresponding entry in the offsets 4967 // array 4968 template<int N> 4969 void vs_ldr_indexed(const VSeq<N>& v, Assembler::SIMD_RegVariant T, Register base, 4970 int start, int (&offsets)[N]) { 4971 for (int i = 0; i < N; i++) { 4972 __ ldr(v[i], T, Address(base, start + offsets[i])); 4973 } 4974 } 4975 4976 // store N vector registers into N single quadword memory locations 4977 // via the address supplied in base with each value indexed using 4978 // the the start offset plus the corresponding entry in the offsets 4979 // array 4980 template<int N> 4981 void vs_str_indexed(const VSeq<N>& v, Assembler::SIMD_RegVariant T, Register base, 4982 int start, int (&offsets)[N]) { 4983 for (int i = 0; i < N; i++) { 4984 __ str(v[i], T, Address(base, start + offsets[i])); 4985 } 4986 } 4987 4988 // load N/2 pairs of quadword values from memory de-interleaved into 4989 // N vector registers 2 at a time via the address supplied in base 4990 // with each pair indexed using the the start offset plus the 4991 // corresponding entry in the offsets array 4992 template<int N> 4993 void vs_ld2_indexed(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base, 4994 Register tmp, int start, int (&offsets)[N/2]) { 4995 for (int i = 0; i < N/2; i++) { 4996 __ add(tmp, base, start + offsets[i]); 4997 __ ld2(v[2*i], v[2*i+1], T, tmp); 4998 } 4999 } 5000 5001 // store N vector registers 2 at a time interleaved into N/2 pairs 5002 // of quadword memory locations via the address supplied in base 5003 // with each pair indexed using the the start offset plus the 5004 // corresponding entry in the offsets array 5005 template<int N> 5006 void vs_st2_indexed(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base, 5007 Register tmp, int start, int (&offsets)[N/2]) { 5008 for (int i = 0; i < N/2; i++) { 5009 __ add(tmp, base, start + offsets[i]); 5010 __ st2(v[2*i], v[2*i+1], T, tmp); 5011 } 5012 } 5013 5014 // Helper routines for various flavours of Montgomery multiply 5015 5016 // Perform 16 32-bit (4x4S) or 32 16-bit (4 x 8H) Montgomery 5017 // multiplications in parallel 5018 // 5019 5020 // See the 
montMul() method of the sun.security.provider.ML_DSA
5021 // class.
5022 //
5023 // Computes 4x4S results or 4x8H results
5024 // a = b * c * 2^MONT_R_BITS mod MONT_Q
5025 // Inputs: vb, vc - 4x4S or 4x8H vector register sequences
5026 // vq - 2x4S or 2x8H constants <MONT_Q, MONT_Q_INV_MOD_R>
5027 // Temps: vtmp - 4x4S or 4x8H vector sequence trashed after call
5028 // Outputs: va - 4x4S or 4x8H vector register sequences
5029 // vb, vc, vtmp and vq must all be disjoint
5030 // va must be disjoint from all other inputs/temps or must equal vc
5031 // va must have a non-zero delta i.e. it must not be a constant vseq.
5032 // n.b. MONT_R_BITS is 16 or 32, so the right shift by it is implicit.
5033 void vs_montmul4(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
5034 Assembler::SIMD_Arrangement T,
5035 const VSeq<4>& vtmp, const VSeq<2>& vq) {
5036 assert (T == __ T4S || T == __ T8H, "invalid arrangement for montmul");
5037 assert(vs_disjoint(vb, vc), "vb and vc overlap");
5038 assert(vs_disjoint(vb, vq), "vb and vq overlap");
5039 assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
5040
5041 assert(vs_disjoint(vc, vq), "vc and vq overlap");
5042 assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
5043
5044 assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
5045
5046 assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
5047 assert(vs_disjoint(va, vb), "va and vb overlap");
5048 assert(vs_disjoint(va, vq), "va and vq overlap");
5049 assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
5050 assert(!va.is_constant(), "output vector must identify 4 different registers");
5051
5052 // schedule 4 streams of instructions across the vector sequences
5053 for (int i = 0; i < 4; i++) {
5054 __ sqdmulh(vtmp[i], T, vb[i], vc[i]); // aHigh = hi32(2 * b * c)
5055 __ mulv(va[i], T, vb[i], vc[i]); // aLow = lo32(b * c)
5056 }
5057
5058 for (int i = 0; i < 4; i++) {
5059 __ mulv(va[i], T, va[i], vq[0]); // m = aLow * qinv
5060 }
5061
5062 for (int i = 0; i < 4; i++) {
5063 __ sqdmulh(va[i], T, va[i], vq[1]); // n = hi32(2 * m * q)
5064 }
5065
5066 for (int i = 0; i < 4; i++) {
5067 __ shsubv(va[i], T, vtmp[i], va[i]); // a = (aHigh - n) / 2
5068 }
5069 }
5070
5071 // Perform 8 32-bit (2x4S) or 16 16-bit (2 x 8H) Montgomery
5072 // multiplications in parallel
5073 //
5074
5075 // See the montMul() method of the sun.security.provider.ML_DSA
5076 // class.
5077 //
5078 // Computes 2x4S results or 2x8H results
5079 // a = b * c * 2^MONT_R_BITS mod MONT_Q
5080 // Inputs: vb, vc - 2x4S or 2x8H vector register sequences
5081 // vq - 2x4S or 2x8H constants <MONT_Q, MONT_Q_INV_MOD_R>
5082 // Temps: vtmp - 2x4S or 2x8H vector sequence trashed after call
5083 // Outputs: va - 2x4S or 2x8H vector register sequences
5084 // vb, vc, vtmp and vq must all be disjoint
5085 // va must be disjoint from all other inputs/temps or must equal vc
5086 // va must have a non-zero delta i.e. it must not be a constant vseq.
5087 // n.b. MONT_R_BITS is 16 or 32, so the right shift by it is implicit.
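// For orientation, a scalar sketch of the per-lane arithmetic these
// montmul helpers schedule (shown for the 32-bit case; the 16-bit case
// is analogous with 16-bit shifts). This is an illustration of the
// instruction comments above, not generated code:
//
//   int32_t montmul(int32_t b, int32_t c, int32_t q, int32_t qinv) {
//     int32_t aHigh = (int32_t)(((int64_t)b * c * 2) >> 32); // sqdmulh
//     int32_t aLow  = b * c;                                 // mulv
//     int32_t m     = aLow * qinv;                           // mulv
//     int32_t n     = (int32_t)(((int64_t)m * q * 2) >> 32); // sqdmulh
//     return (aHigh - n) >> 1;                               // shsubv
//   }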
5088 void vs_montmul2(const VSeq<2>& va, const VSeq<2>& vb, const VSeq<2>& vc, 5089 Assembler::SIMD_Arrangement T, 5090 const VSeq<2>& vtmp, const VSeq<2>& vq) { 5091 assert (T == __ T4S || T == __ T8H, "invalid arrangement for montmul"); 5092 assert(vs_disjoint(vb, vc), "vb and vc overlap"); 5093 assert(vs_disjoint(vb, vq), "vb and vq overlap"); 5094 assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap"); 5095 5096 assert(vs_disjoint(vc, vq), "vc and vq overlap"); 5097 assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap"); 5098 5099 assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap"); 5100 5101 assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal"); 5102 assert(vs_disjoint(va, vb), "va and vb overlap"); 5103 assert(vs_disjoint(va, vq), "va and vq overlap"); 5104 assert(vs_disjoint(va, vtmp), "va and vtmp overlap"); 5105 assert(!va.is_constant(), "output vector must identify 2 different registers"); 5106 5107 // schedule 2 streams of instructions across the vector sequences 5108 for (int i = 0; i < 2; i++) { 5109 __ sqdmulh(vtmp[i], T, vb[i], vc[i]); // aHigh = hi32(2 * b * c) 5110 __ mulv(va[i], T, vb[i], vc[i]); // aLow = lo32(b * c) 5111 } 5112 5113 for (int i = 0; i < 2; i++) { 5114 __ mulv(va[i], T, va[i], vq[0]); // m = aLow * qinv 5115 } 5116 5117 for (int i = 0; i < 2; i++) { 5118 __ sqdmulh(va[i], T, va[i], vq[1]); // n = hi32(2 * m * q) 5119 } 5120 5121 for (int i = 0; i < 2; i++) { 5122 __ shsubv(va[i], T, vtmp[i], va[i]); // a = (aHigh - n) / 2 5123 } 5124 } 5125 5126 // Perform 16 16-bit Montgomery multiplications in parallel. 5127 void kyber_montmul16(const VSeq<2>& va, const VSeq<2>& vb, const VSeq<2>& vc, 5128 const VSeq<2>& vtmp, const VSeq<2>& vq) { 5129 // Use the helper routine to schedule a 2x8H Montgomery multiply. 5130 // It will assert that the register use is valid 5131 vs_montmul2(va, vb, vc, __ T8H, vtmp, vq); 5132 } 5133 5134 // Perform 32 16-bit Montgomery multiplications in parallel. 5135 void kyber_montmul32(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc, 5136 const VSeq<4>& vtmp, const VSeq<2>& vq) { 5137 // Use the helper routine to schedule a 4x8H Montgomery multiply. 5138 // It will assert that the register use is valid 5139 vs_montmul4(va, vb, vc, __ T8H, vtmp, vq); 5140 } 5141 5142 // Perform 64 16-bit Montgomery multiplications in parallel. 5143 void kyber_montmul64(const VSeq<8>& va, const VSeq<8>& vb, const VSeq<8>& vc, 5144 const VSeq<4>& vtmp, const VSeq<2>& vq) { 5145 // Schedule two successive 4x8H multiplies via the montmul helper 5146 // on the front and back halves of va, vb and vc. The helper will 5147 // assert that the register use has no overlap conflicts on each 5148 // individual call but we also need to ensure that the necessary 5149 // disjoint/equality constraints are met across both calls. 5150 5151 // vb, vc, vtmp and vq must be disjoint. 
va must either be 5152 // disjoint from all other registers or equal vc 5153 5154 assert(vs_disjoint(vb, vc), "vb and vc overlap"); 5155 assert(vs_disjoint(vb, vq), "vb and vq overlap"); 5156 assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap"); 5157 5158 assert(vs_disjoint(vc, vq), "vc and vq overlap"); 5159 assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap"); 5160 5161 assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap"); 5162 5163 assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal"); 5164 assert(vs_disjoint(va, vb), "va and vb overlap"); 5165 assert(vs_disjoint(va, vq), "va and vq overlap"); 5166 assert(vs_disjoint(va, vtmp), "va and vtmp overlap"); 5167 5168 // we multiply the front and back halves of each sequence 4 at a 5169 // time because 5170 // 5171 // 1) we are currently only able to get 4-way instruction 5172 // parallelism at best 5173 // 5174 // 2) we need registers for the constants in vq and temporary 5175 // scratch registers to hold intermediate results so vtmp can only 5176 // be a VSeq<4> which means we only have 4 scratch slots 5177 5178 vs_montmul4(vs_front(va), vs_front(vb), vs_front(vc), __ T8H, vtmp, vq); 5179 vs_montmul4(vs_back(va), vs_back(vb), vs_back(vc), __ T8H, vtmp, vq); 5180 } 5181 5182 void kyber_montmul32_sub_add(const VSeq<4>& va0, const VSeq<4>& va1, 5183 const VSeq<4>& vc, 5184 const VSeq<4>& vtmp, 5185 const VSeq<2>& vq) { 5186 // compute a = montmul(a1, c) 5187 kyber_montmul32(vc, va1, vc, vtmp, vq); 5188 // ouptut a1 = a0 - a 5189 vs_subv(va1, __ T8H, va0, vc); 5190 // and a0 = a0 + a 5191 vs_addv(va0, __ T8H, va0, vc); 5192 } 5193 5194 void kyber_sub_add_montmul32(const VSeq<4>& va0, const VSeq<4>& va1, 5195 const VSeq<4>& vb, 5196 const VSeq<4>& vtmp1, 5197 const VSeq<4>& vtmp2, 5198 const VSeq<2>& vq) { 5199 // compute c = a0 - a1 5200 vs_subv(vtmp1, __ T8H, va0, va1); 5201 // output a0 = a0 + a1 5202 vs_addv(va0, __ T8H, va0, va1); 5203 // output a1 = b montmul c 5204 kyber_montmul32(va1, vtmp1, vb, vtmp2, vq); 5205 } 5206 5207 void load64shorts(const VSeq<8>& v, Register shorts) { 5208 vs_ldpq_post(v, shorts); 5209 } 5210 5211 void load32shorts(const VSeq<4>& v, Register shorts) { 5212 vs_ldpq_post(v, shorts); 5213 } 5214 5215 void store64shorts(VSeq<8> v, Register tmpAddr) { 5216 vs_stpq_post(v, tmpAddr); 5217 } 5218 5219 // Kyber NTT function. 5220 // Implements 5221 // static int implKyberNtt(short[] poly, short[] ntt_zetas) {} 5222 // 5223 // coeffs (short[256]) = c_rarg0 5224 // ntt_zetas (short[256]) = c_rarg1 5225 address generate_kyberNtt() { 5226 5227 __ align(CodeEntryAlignment); 5228 StubGenStubId stub_id = StubGenStubId::kyberNtt_id; 5229 StubCodeMark mark(this, stub_id); 5230 address start = __ pc(); 5231 __ enter(); 5232 5233 const Register coeffs = c_rarg0; 5234 const Register zetas = c_rarg1; 5235 5236 const Register kyberConsts = r10; 5237 const Register tmpAddr = r11; 5238 5239 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x8H inputs/outputs 5240 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3 5241 VSeq<2> vq(30); // n.b. constants overlap vs3 5242 5243 __ lea(kyberConsts, ExternalAddress((address) StubRoutines::aarch64::_kyberConsts)); 5244 // load the montmul constants 5245 vs_ldpq(vq, kyberConsts); 5246 5247 // Each level corresponds to an iteration of the outermost loop of the 5248 // Java method seilerNTT(int[] coeffs). There are some differences 5249 // from what is done in the seilerNTT() method, though: 5250 // 1. 
The computation uses 16-bit signed values; we do not convert them
5251 // to ints here.
5252 // 2. The zetas are delivered in a bigger array: 128 zetas are stored in
5253 // this array for each level, which makes it easier to fill up the vector
5254 // registers.
5255 // 3. In the seilerNTT() method we use R = 2^20 for the Montgomery
5256 // multiplications (this is because that way there should not be any
5257 // overflow during the inverse NTT computation), here we use R = 2^16 so
5258 // that we can use the 16-bit arithmetic in the vector unit.
5259 //
5260 // On each level, we fill up the vector registers in such a way that the
5261 // array elements that need to be multiplied by the zetas go into one
5262 // set of vector registers while the corresponding ones that don't need to
5263 // be multiplied go into another set.
5264 // We can do 32 Montgomery multiplications in parallel, using 12 vector
5265 // registers interleaving the steps of 4 identical computations,
5266 // each done on 8 16-bit values per register.
5267
5268 // At levels 0-3 the coefficients multiplied by or added/subtracted
5269 // to the zetas occur in discrete blocks whose size is some multiple
5270 // of 32.
5271
5272 // level 0
5273 __ add(tmpAddr, coeffs, 256);
5274 load64shorts(vs1, tmpAddr);
5275 load64shorts(vs2, zetas);
5276 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5277 __ add(tmpAddr, coeffs, 0);
5278 load64shorts(vs1, tmpAddr);
5279 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5280 vs_addv(vs1, __ T8H, vs1, vs2);
5281 __ add(tmpAddr, coeffs, 0);
5282 vs_stpq_post(vs1, tmpAddr);
5283 __ add(tmpAddr, coeffs, 256);
5284 vs_stpq_post(vs3, tmpAddr);
5285 // restore montmul constants
5286 vs_ldpq(vq, kyberConsts);
5287 load64shorts(vs1, tmpAddr);
5288 load64shorts(vs2, zetas);
5289 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5290 __ add(tmpAddr, coeffs, 128);
5291 load64shorts(vs1, tmpAddr);
5292 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5293 vs_addv(vs1, __ T8H, vs1, vs2);
5294 __ add(tmpAddr, coeffs, 128);
5295 store64shorts(vs1, tmpAddr);
5296 __ add(tmpAddr, coeffs, 384);
5297 store64shorts(vs3, tmpAddr);
5298
5299 // level 1
5300 // restore montmul constants
5301 vs_ldpq(vq, kyberConsts);
5302 __ add(tmpAddr, coeffs, 128);
5303 load64shorts(vs1, tmpAddr);
5304 load64shorts(vs2, zetas);
5305 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5306 __ add(tmpAddr, coeffs, 0);
5307 load64shorts(vs1, tmpAddr);
5308 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5309 vs_addv(vs1, __ T8H, vs1, vs2);
5310 __ add(tmpAddr, coeffs, 0);
5311 store64shorts(vs1, tmpAddr);
5312 store64shorts(vs3, tmpAddr);
5313 vs_ldpq(vq, kyberConsts);
5314 __ add(tmpAddr, coeffs, 384);
5315 load64shorts(vs1, tmpAddr);
5316 load64shorts(vs2, zetas);
5317 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5318 __ add(tmpAddr, coeffs, 256);
5319 load64shorts(vs1, tmpAddr);
5320 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5321 vs_addv(vs1, __ T8H, vs1, vs2);
5322 __ add(tmpAddr, coeffs, 256);
5323 store64shorts(vs1, tmpAddr);
5324 store64shorts(vs3, tmpAddr);
5325
5326 // level 2
5327 vs_ldpq(vq, kyberConsts);
5328 int offsets1[4] = { 0, 32, 128, 160 };
5329 vs_ldpq_indexed(vs1, coeffs, 64, offsets1);
5330 load64shorts(vs2, zetas);
5331 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5332 vs_ldpq_indexed(vs1, coeffs, 0, offsets1);
5333 // kyber_subv_addv64();
5334 vs_subv(vs3, __ T8H, vs1, vs2); // n.b.
trashes vq 5335 vs_addv(vs1, __ T8H, vs1, vs2); 5336 __ add(tmpAddr, coeffs, 0); 5337 vs_stpq_post(vs_front(vs1), tmpAddr); 5338 vs_stpq_post(vs_front(vs3), tmpAddr); 5339 vs_stpq_post(vs_back(vs1), tmpAddr); 5340 vs_stpq_post(vs_back(vs3), tmpAddr); 5341 vs_ldpq(vq, kyberConsts); 5342 vs_ldpq_indexed(vs1, tmpAddr, 64, offsets1); 5343 load64shorts(vs2, zetas); 5344 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5345 vs_ldpq_indexed(vs1, coeffs, 256, offsets1); 5346 // kyber_subv_addv64(); 5347 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5348 vs_addv(vs1, __ T8H, vs1, vs2); 5349 __ add(tmpAddr, coeffs, 256); 5350 vs_stpq_post(vs_front(vs1), tmpAddr); 5351 vs_stpq_post(vs_front(vs3), tmpAddr); 5352 vs_stpq_post(vs_back(vs1), tmpAddr); 5353 vs_stpq_post(vs_back(vs3), tmpAddr); 5354 5355 // level 3 5356 vs_ldpq(vq, kyberConsts); 5357 int offsets2[4] = { 0, 64, 128, 192 }; 5358 vs_ldpq_indexed(vs1, coeffs, 32, offsets2); 5359 load64shorts(vs2, zetas); 5360 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5361 vs_ldpq_indexed(vs1, coeffs, 0, offsets2); 5362 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5363 vs_addv(vs1, __ T8H, vs1, vs2); 5364 vs_stpq_indexed(vs1, coeffs, 0, offsets2); 5365 vs_stpq_indexed(vs3, coeffs, 32, offsets2); 5366 5367 vs_ldpq(vq, kyberConsts); 5368 vs_ldpq_indexed(vs1, coeffs, 256 + 32, offsets2); 5369 load64shorts(vs2, zetas); 5370 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5371 vs_ldpq_indexed(vs1, coeffs, 256, offsets2); 5372 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5373 vs_addv(vs1, __ T8H, vs1, vs2); 5374 vs_stpq_indexed(vs1, coeffs, 256, offsets2); 5375 vs_stpq_indexed(vs3, coeffs, 256 + 32, offsets2); 5376 5377 // level 4 5378 // At level 4 coefficients occur in 8 discrete blocks of size 16 5379 // so they are loaded using employing an ldr at 8 distinct offsets. 5380 5381 vs_ldpq(vq, kyberConsts); 5382 int offsets3[8] = { 0, 32, 64, 96, 128, 160, 192, 224 }; 5383 vs_ldr_indexed(vs1, __ Q, coeffs, 16, offsets3); 5384 load64shorts(vs2, zetas); 5385 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5386 vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3); 5387 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5388 vs_addv(vs1, __ T8H, vs1, vs2); 5389 vs_str_indexed(vs1, __ Q, coeffs, 0, offsets3); 5390 vs_str_indexed(vs3, __ Q, coeffs, 16, offsets3); 5391 5392 vs_ldpq(vq, kyberConsts); 5393 vs_ldr_indexed(vs1, __ Q, coeffs, 256 + 16, offsets3); 5394 load64shorts(vs2, zetas); 5395 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5396 vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3); 5397 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5398 vs_addv(vs1, __ T8H, vs1, vs2); 5399 vs_str_indexed(vs1, __ Q, coeffs, 256, offsets3); 5400 vs_str_indexed(vs3, __ Q, coeffs, 256 + 16, offsets3); 5401 5402 // level 5 5403 // At level 5 related coefficients occur in discrete blocks of size 8 so 5404 // need to be loaded interleaved using an ld2 operation with arrangement 2D. 
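// n.b. ld2 with arrangement 2D de-interleaves alternate 64-bit blocks:
// the first block of each 16-byte structure lands in an even register
// of the destination sequence and the second block in the following odd
// register, which is what lets vs_even/vs_odd below be passed directly
// as the add/sub and montmul operand sequences.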
5405 5406 vs_ldpq(vq, kyberConsts); 5407 int offsets4[4] = { 0, 32, 64, 96 }; 5408 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4); 5409 load32shorts(vs_front(vs2), zetas); 5410 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq); 5411 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4); 5412 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4); 5413 load32shorts(vs_front(vs2), zetas); 5414 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq); 5415 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4); 5416 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4); 5417 load32shorts(vs_front(vs2), zetas); 5418 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq); 5419 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4); 5420 5421 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4); 5422 load32shorts(vs_front(vs2), zetas); 5423 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq); 5424 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4); 5425 5426 // level 6 5427 // At level 6 related coefficients occur in discrete blocks of size 4 so 5428 // need to be loaded interleaved using an ld2 operation with arrangement 4S. 5429 5430 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4); 5431 load32shorts(vs_front(vs2), zetas); 5432 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq); 5433 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4); 5434 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4); 5435 // __ ldpq(v18, v19, __ post(zetas, 32)); 5436 load32shorts(vs_front(vs2), zetas); 5437 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq); 5438 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4); 5439 5440 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4); 5441 load32shorts(vs_front(vs2), zetas); 5442 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq); 5443 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4); 5444 5445 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4); 5446 load32shorts(vs_front(vs2), zetas); 5447 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq); 5448 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4); 5449 5450 __ leave(); // required for proper stackwalking of RuntimeStub frame 5451 __ mov(r0, zr); // return 0 5452 __ ret(lr); 5453 5454 return start; 5455 } 5456 5457 // Kyber Inverse NTT function 5458 // Implements 5459 // static int implKyberInverseNtt(short[] poly, short[] zetas) {} 5460 // 5461 // coeffs (short[256]) = c_rarg0 5462 // ntt_zetas (short[256]) = c_rarg1 5463 address generate_kyberInverseNtt() { 5464 5465 __ align(CodeEntryAlignment); 5466 StubGenStubId stub_id = StubGenStubId::kyberInverseNtt_id; 5467 StubCodeMark mark(this, stub_id); 5468 address start = __ pc(); 5469 __ enter(); 5470 5471 const Register coeffs = c_rarg0; 5472 const Register zetas = c_rarg1; 5473 5474 const Register kyberConsts = r10; 5475 const Register tmpAddr = r11; 5476 const Register tmpAddr2 = c_rarg2; 5477 5478 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x8H inputs/outputs 5479 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3 5480 VSeq<2> vq(30); // n.b. 
constants overlap vs3 5481 5482 __ lea(kyberConsts, 5483 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts)); 5484 5485 // level 0 5486 // At level 0 related coefficients occur in discrete blocks of size 4 so 5487 // need to be loaded interleaved using an ld2 operation with arrangement 4S. 5488 5489 vs_ldpq(vq, kyberConsts); 5490 int offsets4[4] = { 0, 32, 64, 96 }; 5491 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4); 5492 load32shorts(vs_front(vs2), zetas); 5493 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1), 5494 vs_front(vs2), vs_back(vs2), vtmp, vq); 5495 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4); 5496 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4); 5497 load32shorts(vs_front(vs2), zetas); 5498 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1), 5499 vs_front(vs2), vs_back(vs2), vtmp, vq); 5500 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4); 5501 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4); 5502 load32shorts(vs_front(vs2), zetas); 5503 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1), 5504 vs_front(vs2), vs_back(vs2), vtmp, vq); 5505 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4); 5506 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4); 5507 load32shorts(vs_front(vs2), zetas); 5508 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1), 5509 vs_front(vs2), vs_back(vs2), vtmp, vq); 5510 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4); 5511 5512 // level 1 5513 // At level 1 related coefficients occur in discrete blocks of size 8 so 5514 // need to be loaded interleaved using an ld2 operation with arrangement 2D. 5515 5516 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4); 5517 load32shorts(vs_front(vs2), zetas); 5518 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1), 5519 vs_front(vs2), vs_back(vs2), vtmp, vq); 5520 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4); 5521 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4); 5522 load32shorts(vs_front(vs2), zetas); 5523 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1), 5524 vs_front(vs2), vs_back(vs2), vtmp, vq); 5525 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4); 5526 5527 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4); 5528 load32shorts(vs_front(vs2), zetas); 5529 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1), 5530 vs_front(vs2), vs_back(vs2), vtmp, vq); 5531 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4); 5532 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4); 5533 load32shorts(vs_front(vs2), zetas); 5534 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1), 5535 vs_front(vs2), vs_back(vs2), vtmp, vq); 5536 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4); 5537 5538 // level 2 5539 // At level 2 coefficients occur in 8 discrete blocks of size 16 5540 // so they are loaded using employing an ldr at 8 distinct offsets. 5541 5542 int offsets3[8] = { 0, 32, 64, 96, 128, 160, 192, 224 }; 5543 vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3); 5544 vs_ldr_indexed(vs2, __ Q, coeffs, 16, offsets3); 5545 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. 
trashes vq 5546 vs_subv(vs1, __ T8H, vs1, vs2); 5547 vs_str_indexed(vs3, __ Q, coeffs, 0, offsets3); 5548 load64shorts(vs2, zetas); 5549 vs_ldpq(vq, kyberConsts); 5550 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5551 vs_str_indexed(vs2, __ Q, coeffs, 16, offsets3); 5552 5553 vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3); 5554 vs_ldr_indexed(vs2, __ Q, coeffs, 256 + 16, offsets3); 5555 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5556 vs_subv(vs1, __ T8H, vs1, vs2); 5557 vs_str_indexed(vs3, __ Q, coeffs, 256, offsets3); 5558 load64shorts(vs2, zetas); 5559 vs_ldpq(vq, kyberConsts); 5560 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5561 vs_str_indexed(vs2, __ Q, coeffs, 256 + 16, offsets3); 5562 5563 // Barrett reduction at indexes where overflow may happen 5564 5565 // load q and the multiplier for the Barrett reduction 5566 __ add(tmpAddr, kyberConsts, 16); 5567 vs_ldpq(vq, tmpAddr); 5568 5569 VSeq<8> vq1 = VSeq<8>(vq[0], 0); // 2 constant 8 sequences 5570 VSeq<8> vq2 = VSeq<8>(vq[1], 0); // for above two kyber constants 5571 VSeq<8> vq3 = VSeq<8>(v29, 0); // 3rd sequence for const montmul 5572 vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3); 5573 vs_sqdmulh(vs2, __ T8H, vs1, vq2); 5574 vs_sshr(vs2, __ T8H, vs2, 11); 5575 vs_mlsv(vs1, __ T8H, vs2, vq1); 5576 vs_str_indexed(vs1, __ Q, coeffs, 0, offsets3); 5577 vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3); 5578 vs_sqdmulh(vs2, __ T8H, vs1, vq2); 5579 vs_sshr(vs2, __ T8H, vs2, 11); 5580 vs_mlsv(vs1, __ T8H, vs2, vq1); 5581 vs_str_indexed(vs1, __ Q, coeffs, 256, offsets3); 5582 5583 // level 3 5584 // From level 3 upwards coefficients occur in discrete blocks whose size is 5585 // some multiple of 32 so can be loaded using ldpq and suitable indexes. 5586 5587 int offsets2[4] = { 0, 64, 128, 192 }; 5588 vs_ldpq_indexed(vs1, coeffs, 0, offsets2); 5589 vs_ldpq_indexed(vs2, coeffs, 32, offsets2); 5590 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5591 vs_subv(vs1, __ T8H, vs1, vs2); 5592 vs_stpq_indexed(vs3, coeffs, 0, offsets2); 5593 load64shorts(vs2, zetas); 5594 vs_ldpq(vq, kyberConsts); 5595 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5596 vs_stpq_indexed(vs2, coeffs, 32, offsets2); 5597 5598 vs_ldpq_indexed(vs1, coeffs, 256, offsets2); 5599 vs_ldpq_indexed(vs2, coeffs, 256 + 32, offsets2); 5600 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5601 vs_subv(vs1, __ T8H, vs1, vs2); 5602 vs_stpq_indexed(vs3, coeffs, 256, offsets2); 5603 load64shorts(vs2, zetas); 5604 vs_ldpq(vq, kyberConsts); 5605 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5606 vs_stpq_indexed(vs2, coeffs, 256 + 32, offsets2); 5607 5608 // level 4 5609 5610 int offsets1[4] = { 0, 32, 128, 160 }; 5611 vs_ldpq_indexed(vs1, coeffs, 0, offsets1); 5612 vs_ldpq_indexed(vs2, coeffs, 64, offsets1); 5613 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5614 vs_subv(vs1, __ T8H, vs1, vs2); 5615 vs_stpq_indexed(vs3, coeffs, 0, offsets1); 5616 load64shorts(vs2, zetas); 5617 vs_ldpq(vq, kyberConsts); 5618 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5619 vs_stpq_indexed(vs2, coeffs, 64, offsets1); 5620 5621 vs_ldpq_indexed(vs1, coeffs, 256, offsets1); 5622 vs_ldpq_indexed(vs2, coeffs, 256 + 64, offsets1); 5623 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. 
trashes vq 5624 vs_subv(vs1, __ T8H, vs1, vs2); 5625 vs_stpq_indexed(vs3, coeffs, 256, offsets1); 5626 load64shorts(vs2, zetas); 5627 vs_ldpq(vq, kyberConsts); 5628 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5629 vs_stpq_indexed(vs2, coeffs, 256 + 64, offsets1); 5630 5631 // level 5 5632 5633 __ add(tmpAddr, coeffs, 0); 5634 load64shorts(vs1, tmpAddr); 5635 __ add(tmpAddr, coeffs, 128); 5636 load64shorts(vs2, tmpAddr); 5637 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5638 vs_subv(vs1, __ T8H, vs1, vs2); 5639 __ add(tmpAddr, coeffs, 0); 5640 store64shorts(vs3, tmpAddr); 5641 load64shorts(vs2, zetas); 5642 vs_ldpq(vq, kyberConsts); 5643 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5644 __ add(tmpAddr, coeffs, 128); 5645 store64shorts(vs2, tmpAddr); 5646 5647 load64shorts(vs1, tmpAddr); 5648 __ add(tmpAddr, coeffs, 384); 5649 load64shorts(vs2, tmpAddr); 5650 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5651 vs_subv(vs1, __ T8H, vs1, vs2); 5652 __ add(tmpAddr, coeffs, 256); 5653 store64shorts(vs3, tmpAddr); 5654 load64shorts(vs2, zetas); 5655 vs_ldpq(vq, kyberConsts); 5656 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5657 __ add(tmpAddr, coeffs, 384); 5658 store64shorts(vs2, tmpAddr); 5659 5660 // Barrett reduction at indexes where overflow may happen 5661 5662 // load q and the multiplier for the Barrett reduction 5663 __ add(tmpAddr, kyberConsts, 16); 5664 vs_ldpq(vq, tmpAddr); 5665 5666 int offsets0[2] = { 0, 256 }; 5667 vs_ldpq_indexed(vs_front(vs1), coeffs, 0, offsets0); 5668 vs_sqdmulh(vs2, __ T8H, vs1, vq2); 5669 vs_sshr(vs2, __ T8H, vs2, 11); 5670 vs_mlsv(vs1, __ T8H, vs2, vq1); 5671 vs_stpq_indexed(vs_front(vs1), coeffs, 0, offsets0); 5672 5673 // level 6 5674 5675 __ add(tmpAddr, coeffs, 0); 5676 load64shorts(vs1, tmpAddr); 5677 __ add(tmpAddr, coeffs, 256); 5678 load64shorts(vs2, tmpAddr); 5679 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5680 vs_subv(vs1, __ T8H, vs1, vs2); 5681 __ add(tmpAddr, coeffs, 0); 5682 store64shorts(vs3, tmpAddr); 5683 load64shorts(vs2, zetas); 5684 vs_ldpq(vq, kyberConsts); 5685 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5686 __ add(tmpAddr, coeffs, 256); 5687 store64shorts(vs2, tmpAddr); 5688 5689 __ add(tmpAddr, coeffs, 128); 5690 load64shorts(vs1, tmpAddr); 5691 __ add(tmpAddr, coeffs, 384); 5692 load64shorts(vs2, tmpAddr); 5693 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. 
trashes vq 5694 vs_subv(vs1, __ T8H, vs1, vs2); 5695 __ add(tmpAddr, coeffs, 128); 5696 store64shorts(vs3, tmpAddr); 5697 load64shorts(vs2, zetas); 5698 vs_ldpq(vq, kyberConsts); 5699 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5700 __ add(tmpAddr, coeffs, 384); 5701 store64shorts(vs2, tmpAddr); 5702 5703 // multiply by 2^-n 5704 5705 // load toMont(2^-n mod q) 5706 __ add(tmpAddr, kyberConsts, 48); 5707 __ ldr(v29, __ Q, tmpAddr); 5708 5709 vs_ldpq(vq, kyberConsts); 5710 __ add(tmpAddr, coeffs, 0); 5711 load64shorts(vs1, tmpAddr); 5712 kyber_montmul64(vs2, vs1, vq3, vtmp, vq); 5713 __ add(tmpAddr, coeffs, 0); 5714 store64shorts(vs2, tmpAddr); 5715 5716 // now tmpAddr contains coeffs + 128 because store64shorts adjusted it so 5717 load64shorts(vs1, tmpAddr); 5718 kyber_montmul64(vs2, vs1, vq3, vtmp, vq); 5719 __ add(tmpAddr, coeffs, 128); 5720 store64shorts(vs2, tmpAddr); 5721 5722 // now tmpAddr contains coeffs + 256 5723 load64shorts(vs1, tmpAddr); 5724 kyber_montmul64(vs2, vs1, vq3, vtmp, vq); 5725 __ add(tmpAddr, coeffs, 256); 5726 store64shorts(vs2, tmpAddr); 5727 5728 // now tmpAddr contains coeffs + 384 5729 load64shorts(vs1, tmpAddr); 5730 kyber_montmul64(vs2, vs1, vq3, vtmp, vq); 5731 __ add(tmpAddr, coeffs, 384); 5732 store64shorts(vs2, tmpAddr); 5733 5734 __ leave(); // required for proper stackwalking of RuntimeStub frame 5735 __ mov(r0, zr); // return 0 5736 __ ret(lr); 5737 5738 return start; 5739 } 5740 5741 // Kyber multiply polynomials in the NTT domain. 5742 // Implements 5743 // static int implKyberNttMult( 5744 // short[] result, short[] ntta, short[] nttb, short[] zetas) {} 5745 // 5746 // result (short[256]) = c_rarg0 5747 // ntta (short[256]) = c_rarg1 5748 // nttb (short[256]) = c_rarg2 5749 // zetas (short[128]) = c_rarg3 5750 address generate_kyberNttMult() { 5751 5752 __ align(CodeEntryAlignment); 5753 StubGenStubId stub_id = StubGenStubId::kyberNttMult_id; 5754 StubCodeMark mark(this, stub_id); 5755 address start = __ pc(); 5756 __ enter(); 5757 5758 const Register result = c_rarg0; 5759 const Register ntta = c_rarg1; 5760 const Register nttb = c_rarg2; 5761 const Register zetas = c_rarg3; 5762 5763 const Register kyberConsts = r10; 5764 const Register limit = r11; 5765 5766 VSeq<4> vs1(0), vs2(4); // 4 sets of 8x8H inputs/outputs/tmps 5767 VSeq<4> vs3(16), vs4(20); 5768 VSeq<2> vq(30); // pair of constants for montmul: q, qinv 5769 VSeq<2> vz(28); // pair of zetas 5770 VSeq<4> vc(27, 0); // constant sequence for montmul: montRSquareModQ 5771 5772 __ lea(kyberConsts, 5773 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts)); 5774 5775 Label kyberNttMult_loop; 5776 5777 __ add(limit, result, 512); 5778 5779 // load q and qinv 5780 vs_ldpq(vq, kyberConsts); 5781 5782 // load R^2 mod q (to convert back from Montgomery representation) 5783 __ add(kyberConsts, kyberConsts, 64); 5784 __ ldr(v27, __ Q, kyberConsts); 5785 5786 __ BIND(kyberNttMult_loop); 5787 5788 // load 16 zetas 5789 vs_ldpq_post(vz, zetas); 5790 5791 // load 2 sets of 32 coefficients from the two input arrays 5792 // interleaved as shorts. i.e. pairs of shorts adjacent in memory 5793 // are striped across pairs of vector registers 5794 vs_ld2_post(vs_front(vs1), __ T8H, ntta); // <a0, a1> x 8H 5795 vs_ld2_post(vs_back(vs1), __ T8H, nttb); // <b0, b1> x 8H 5796 vs_ld2_post(vs_front(vs4), __ T8H, ntta); // <a2, a3> x 8H 5797 vs_ld2_post(vs_back(vs4), __ T8H, nttb); // <b2, b3> x 8H 5798 5799 // compute 4 montmul cross-products for pairs (a0,a1) and (b0,b1) 5800 // i.e. 
montmul the first and second halves of vs1 in order and 5801 // then with one sequence reversed storing the two results in vs3 5802 // 5803 // vs3[0] <- montmul(a0, b0) 5804 // vs3[1] <- montmul(a1, b1) 5805 // vs3[2] <- montmul(a0, b1) 5806 // vs3[3] <- montmul(a1, b0) 5807 kyber_montmul16(vs_front(vs3), vs_front(vs1), vs_back(vs1), vs_front(vs2), vq); 5808 kyber_montmul16(vs_back(vs3), 5809 vs_front(vs1), vs_reverse(vs_back(vs1)), vs_back(vs2), vq); 5810 5811 // compute 4 montmul cross-products for pairs (a2,a3) and (b2,b3) 5812 // i.e. montmul the first and second halves of vs4 in order and 5813 // then with one sequence reversed storing the two results in vs1 5814 // 5815 // vs1[0] <- montmul(a2, b2) 5816 // vs1[1] <- montmul(a3, b3) 5817 // vs1[2] <- montmul(a2, b3) 5818 // vs1[3] <- montmul(a3, b2) 5819 kyber_montmul16(vs_front(vs1), vs_front(vs4), vs_back(vs4), vs_front(vs2), vq); 5820 kyber_montmul16(vs_back(vs1), 5821 vs_front(vs4), vs_reverse(vs_back(vs4)), vs_back(vs2), vq); 5822 5823 // montmul result 2 of each cross-product i.e. (a1*b1, a3*b3) by a zeta. 5824 // We can schedule two montmuls at a time if we use a suitable vector 5825 // sequence <vs3[1], vs1[1]>. 5826 int delta = vs1[1]->encoding() - vs3[1]->encoding(); 5827 VSeq<2> vs5(vs3[1], delta); 5828 5829 // vs3[1] <- montmul(montmul(a1, b1), z0) 5830 // vs1[1] <- montmul(montmul(a3, b3), z1) 5831 kyber_montmul16(vs5, vz, vs5, vs_front(vs2), vq); 5832 5833 // add results in pairs storing in vs3 5834 // vs3[0] <- montmul(a0, b0) + montmul(montmul(a1, b1), z0); 5835 // vs3[1] <- montmul(a0, b1) + montmul(a1, b0); 5836 vs_addv(vs_front(vs3), __ T8H, vs_even(vs3), vs_odd(vs3)); 5837 5838 // vs3[2] <- montmul(a2, b2) + montmul(montmul(a3, b3), z1); 5839 // vs3[3] <- montmul(a2, b3) + montmul(a3, b2); 5840 vs_addv(vs_back(vs3), __ T8H, vs_even(vs1), vs_odd(vs1)); 5841 5842 // vs1 <- montmul(vs3, montRSquareModQ) 5843 kyber_montmul32(vs1, vs3, vc, vs2, vq); 5844 5845 // store back the two pairs of result vectors de-interleaved as 8H elements 5846 // i.e. storing each pairs of shorts striped across a register pair adjacent 5847 // in memory 5848 vs_st2_post(vs1, __ T8H, result); 5849 5850 __ cmp(result, limit); 5851 __ br(Assembler::NE, kyberNttMult_loop); 5852 5853 __ leave(); // required for proper stackwalking of RuntimeStub frame 5854 __ mov(r0, zr); // return 0 5855 __ ret(lr); 5856 5857 return start; 5858 } 5859 5860 // Kyber add 2 polynomials. 5861 // Implements 5862 // static int implKyberAddPoly(short[] result, short[] a, short[] b) {} 5863 // 5864 // result (short[256]) = c_rarg0 5865 // a (short[256]) = c_rarg1 5866 // b (short[256]) = c_rarg2 5867 address generate_kyberAddPoly_2() { 5868 5869 __ align(CodeEntryAlignment); 5870 StubGenStubId stub_id = StubGenStubId::kyberAddPoly_2_id; 5871 StubCodeMark mark(this, stub_id); 5872 address start = __ pc(); 5873 __ enter(); 5874 5875 const Register result = c_rarg0; 5876 const Register a = c_rarg1; 5877 const Register b = c_rarg2; 5878 5879 const Register kyberConsts = r11; 5880 5881 // We sum 256 sets of values in total i.e. 32 x 8H quadwords. 5882 // So, we can load, add and store the data in 3 groups of 11, 5883 // 11 and 10 at a time i.e. we need to map sets of 10 or 11 5884 // registers. A further constraint is that the mapping needs 5885 // to skip callee saves. So, we allocate the register 5886 // sequences using two 8 sequences, two 2 sequences and two 5887 // single registers. 
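// For orientation, each pass of the loop below behaves like the scalar
// code
//   result[i] = (short)(a[i] + b[i] + q);
// where q is the constant loaded from the kyberConsts block, presumably
// added to keep the lazily reduced sums non-negative. Illustrative only.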
5888 VSeq<8> vs1_1(0); 5889 VSeq<2> vs1_2(16); 5890 FloatRegister vs1_3 = v28; 5891 VSeq<8> vs2_1(18); 5892 VSeq<2> vs2_2(26); 5893 FloatRegister vs2_3 = v29; 5894 5895 // two constant vector sequences 5896 VSeq<8> vc_1(31, 0); 5897 VSeq<2> vc_2(31, 0); 5898 5899 FloatRegister vc_3 = v31; 5900 __ lea(kyberConsts, 5901 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts)); 5902 5903 __ ldr(vc_3, __ Q, Address(kyberConsts, 16)); // q 5904 for (int i = 0; i < 3; i++) { 5905 // load 80 or 88 values from a into vs1_1/2/3 5906 vs_ldpq_post(vs1_1, a); 5907 vs_ldpq_post(vs1_2, a); 5908 if (i < 2) { 5909 __ ldr(vs1_3, __ Q, __ post(a, 16)); 5910 } 5911 // load 80 or 88 values from b into vs2_1/2/3 5912 vs_ldpq_post(vs2_1, b); 5913 vs_ldpq_post(vs2_2, b); 5914 if (i < 2) { 5915 __ ldr(vs2_3, __ Q, __ post(b, 16)); 5916 } 5917 // sum 80 or 88 values across vs1 and vs2 into vs1 5918 vs_addv(vs1_1, __ T8H, vs1_1, vs2_1); 5919 vs_addv(vs1_2, __ T8H, vs1_2, vs2_2); 5920 if (i < 2) { 5921 __ addv(vs1_3, __ T8H, vs1_3, vs2_3); 5922 } 5923 // add constant to all 80 or 88 results 5924 vs_addv(vs1_1, __ T8H, vs1_1, vc_1); 5925 vs_addv(vs1_2, __ T8H, vs1_2, vc_2); 5926 if (i < 2) { 5927 __ addv(vs1_3, __ T8H, vs1_3, vc_3); 5928 } 5929 // store 80 or 88 values 5930 vs_stpq_post(vs1_1, result); 5931 vs_stpq_post(vs1_2, result); 5932 if (i < 2) { 5933 __ str(vs1_3, __ Q, __ post(result, 16)); 5934 } 5935 } 5936 5937 __ leave(); // required for proper stackwalking of RuntimeStub frame 5938 __ mov(r0, zr); // return 0 5939 __ ret(lr); 5940 5941 return start; 5942 } 5943 5944 // Kyber add 3 polynomials. 5945 // Implements 5946 // static int implKyberAddPoly(short[] result, short[] a, short[] b, short[] c) {} 5947 // 5948 // result (short[256]) = c_rarg0 5949 // a (short[256]) = c_rarg1 5950 // b (short[256]) = c_rarg2 5951 // c (short[256]) = c_rarg3 5952 address generate_kyberAddPoly_3() { 5953 5954 __ align(CodeEntryAlignment); 5955 StubGenStubId stub_id = StubGenStubId::kyberAddPoly_3_id; 5956 StubCodeMark mark(this, stub_id); 5957 address start = __ pc(); 5958 __ enter(); 5959 5960 const Register result = c_rarg0; 5961 const Register a = c_rarg1; 5962 const Register b = c_rarg2; 5963 const Register c = c_rarg3; 5964 5965 const Register kyberConsts = r11; 5966 5967 // As above we sum 256 sets of values in total i.e. 32 x 8H 5968 // quadwords. So, we can load, add and store the data in 3 5969 // groups of 11, 11 and 10 at a time i.e. we need to map sets 5970 // of 10 or 11 registers. A further constraint is that the 5971 // mapping needs to skip callee saves. So, we allocate the 5972 // register sequences using two 8 sequences, two 2 sequences 5973 // and two single registers. 
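// As for kyberAddPoly_2, a rough scalar equivalent of the loop below is
//   result[i] = (short)(a[i] + b[i] + c[i] + q);
// with q again taken from the kyberConsts block. Illustrative only.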
5974 VSeq<8> vs1_1(0); 5975 VSeq<2> vs1_2(16); 5976 FloatRegister vs1_3 = v28; 5977 VSeq<8> vs2_1(18); 5978 VSeq<2> vs2_2(26); 5979 FloatRegister vs2_3 = v29; 5980 5981 // two constant vector sequences 5982 VSeq<8> vc_1(31, 0); 5983 VSeq<2> vc_2(31, 0); 5984 5985 FloatRegister vc_3 = v31; 5986 5987 __ lea(kyberConsts, 5988 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts)); 5989 5990 __ ldr(vc_3, __ Q, Address(kyberConsts, 16)); // q 5991 for (int i = 0; i < 3; i++) { 5992 // load 80 or 88 values from a into vs1_1/2/3 5993 vs_ldpq_post(vs1_1, a); 5994 vs_ldpq_post(vs1_2, a); 5995 if (i < 2) { 5996 __ ldr(vs1_3, __ Q, __ post(a, 16)); 5997 } 5998 // load 80 or 88 values from b into vs2_1/2/3 5999 vs_ldpq_post(vs2_1, b); 6000 vs_ldpq_post(vs2_2, b); 6001 if (i < 2) { 6002 __ ldr(vs2_3, __ Q, __ post(b, 16)); 6003 } 6004 // sum 80 or 88 values across vs1 and vs2 into vs1 6005 vs_addv(vs1_1, __ T8H, vs1_1, vs2_1); 6006 vs_addv(vs1_2, __ T8H, vs1_2, vs2_2); 6007 if (i < 2) { 6008 __ addv(vs1_3, __ T8H, vs1_3, vs2_3); 6009 } 6010 // load 80 or 88 values from c into vs2_1/2/3 6011 vs_ldpq_post(vs2_1, c); 6012 vs_ldpq_post(vs2_2, c); 6013 if (i < 2) { 6014 __ ldr(vs2_3, __ Q, __ post(c, 16)); 6015 } 6016 // sum 80 or 88 values across vs1 and vs2 into vs1 6017 vs_addv(vs1_1, __ T8H, vs1_1, vs2_1); 6018 vs_addv(vs1_2, __ T8H, vs1_2, vs2_2); 6019 if (i < 2) { 6020 __ addv(vs1_3, __ T8H, vs1_3, vs2_3); 6021 } 6022 // add constant to all 80 or 88 results 6023 vs_addv(vs1_1, __ T8H, vs1_1, vc_1); 6024 vs_addv(vs1_2, __ T8H, vs1_2, vc_2); 6025 if (i < 2) { 6026 __ addv(vs1_3, __ T8H, vs1_3, vc_3); 6027 } 6028 // store 80 or 88 values 6029 vs_stpq_post(vs1_1, result); 6030 vs_stpq_post(vs1_2, result); 6031 if (i < 2) { 6032 __ str(vs1_3, __ Q, __ post(result, 16)); 6033 } 6034 } 6035 6036 __ leave(); // required for proper stackwalking of RuntimeStub frame 6037 __ mov(r0, zr); // return 0 6038 __ ret(lr); 6039 6040 return start; 6041 } 6042 6043 // Kyber parse XOF output to polynomial coefficient candidates 6044 // or decodePoly(12, ...). 6045 // Implements 6046 // static int implKyber12To16( 6047 // byte[] condensed, int index, short[] parsed, int parsedLength) {} 6048 // 6049 // (parsedLength or (parsedLength - 48) must be divisible by 64.) 6050 // 6051 // condensed (byte[]) = c_rarg0 6052 // condensedIndex = c_rarg1 6053 // parsed (short[112 or 256]) = c_rarg2 6054 // parsedLength (112 or 256) = c_rarg3 6055 address generate_kyber12To16() { 6056 Label L_F00, L_loop, L_end; 6057 6058 __ BIND(L_F00); 6059 __ emit_int64(0x0f000f000f000f00); 6060 __ emit_int64(0x0f000f000f000f00); 6061 6062 __ align(CodeEntryAlignment); 6063 StubGenStubId stub_id = StubGenStubId::kyber12To16_id; 6064 StubCodeMark mark(this, stub_id); 6065 address start = __ pc(); 6066 __ enter(); 6067 6068 const Register condensed = c_rarg0; 6069 const Register condensedOffs = c_rarg1; 6070 const Register parsed = c_rarg2; 6071 const Register parsedLength = c_rarg3; 6072 6073 const Register tmpAddr = r11; 6074 6075 // Data is input 96 bytes at a time i.e. in groups of 6 x 16B 6076 // quadwords so we need a 6 vector sequence for the inputs. 6077 // Parsing produces 64 shorts, employing two 8 vector 6078 // sequences to store and combine the intermediate data. 
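// For reference, each 3-byte group (b0, b1, b2) of the condensed input
// encodes two 12-bit values which the code below reconstructs as
//   s0 = b0 | ((b1 & 0x0f) << 8);
//   s1 = (b1 >> 4) | (b2 << 4);
// This is just a scalar restatement of the masking and shifting
// described in the comments that follow.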
6079 VSeq<6> vin(24);
6080 VSeq<8> va(0), vb(16);
6081
6082 __ adr(tmpAddr, L_F00);
6083 __ ldr(v31, __ Q, tmpAddr); // 8H times 0x0f00
6084 __ add(condensed, condensed, condensedOffs);
6085
6086 __ BIND(L_loop);
6087 // load 96 (6 x 16B) byte values
6088 vs_ld3_post(vin, __ T16B, condensed);
6089
6090 // The front half of sequence vin (vin[0], vin[1] and vin[2])
6091 // holds 48 (16x3) contiguous bytes from memory striped
6092 // horizontally across each of the 16 byte lanes. Equivalently,
6093 // that is 16 pairs of 12-bit integers. Likewise the back half
6094 // holds the next 48 bytes in the same arrangement.
6095
6096 // Each vector in the front half can also be viewed as a vertical
6097 // strip across the 16 pairs of 12 bit integers. Each byte in
6098 // vin[0] stores the low 8 bits of the first int in a pair. Each
6099 // byte in vin[1] stores the high 4 bits of the first int and the
6100 // low 4 bits of the second int. Each byte in vin[2] stores the
6101 // high 8 bits of the second int. Likewise for the vectors in the
6102 // second half.
6103
6104 // Converting the data to 16-bit shorts requires first of all
6105 // expanding each of the 6 x 16B vectors into 6 corresponding
6106 // pairs of 8H vectors. Mask, shift and add operations on the
6107 // resulting vector pairs can be used to combine 4 and 8 bit
6108 // parts of related 8H vector elements.
6109 //
6110 // The middle vectors (vin[1] and vin[4]) are actually expanded
6111 // twice, one copy manipulated to provide their low 4 bits, which
6112 // belong to the first short in a pair, and another copy
6113 // manipulated to provide their high 4 bits, which belong to the
6114 // second short in a pair. This is why the vector sequences va
6115 // and vb used to hold the expanded 8H elements are of length 8.
6116
6117 // Expand vin[0] into va[0:1], and vin[1] into va[2:3] and va[4:5]
6118 // n.b. target elements 2 and 3 duplicate elements 4 and 5
6119 __ ushll(va[0], __ T8H, vin[0], __ T8B, 0);
6120 __ ushll2(va[1], __ T8H, vin[0], __ T16B, 0);
6121 __ ushll(va[2], __ T8H, vin[1], __ T8B, 0);
6122 __ ushll2(va[3], __ T8H, vin[1], __ T16B, 0);
6123 __ ushll(va[4], __ T8H, vin[1], __ T8B, 0);
6124 __ ushll2(va[5], __ T8H, vin[1], __ T16B, 0);
6125
6126 // likewise expand vin[3] into vb[0:1], and vin[4] into vb[2:3]
6127 // and vb[4:5]
6128 __ ushll(vb[0], __ T8H, vin[3], __ T8B, 0);
6129 __ ushll2(vb[1], __ T8H, vin[3], __ T16B, 0);
6130 __ ushll(vb[2], __ T8H, vin[4], __ T8B, 0);
6131 __ ushll2(vb[3], __ T8H, vin[4], __ T16B, 0);
6132 __ ushll(vb[4], __ T8H, vin[4], __ T8B, 0);
6133 __ ushll2(vb[5], __ T8H, vin[4], __ T16B, 0);
6134
6135 // shift lo byte of copy 1 of the middle stripe into the high byte
6136 __ shl(va[2], __ T8H, va[2], 8);
6137 __ shl(va[3], __ T8H, va[3], 8);
6138 __ shl(vb[2], __ T8H, vb[2], 8);
6139 __ shl(vb[3], __ T8H, vb[3], 8);
6140
6141 // expand vin[2] into va[6:7] and vin[5] into vb[6:7] but this
6142 // time pre-shifted by 4 to ensure top bits of input 12-bit int
6143 // are in bit positions [4..11].
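// n.b. once the masks and shifts below have been applied, the adds at
// the end of the loop assemble each result pair as
//   first  = b0 + ((b1 << 8) & 0x0f00)   // va[0:1] + va[2:3]
//   second = (b1 >> 4) + (b2 << 4)       // va[4:5] + va[6:7]
// (and likewise for vb); illustrative per-lane arithmetic only.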
6144 __ ushll(va[6], __ T8H, vin[2], __ T8B, 4); 6145 __ ushll2(va[7], __ T8H, vin[2], __ T16B, 4); 6146 __ ushll(vb[6], __ T8H, vin[5], __ T8B, 4); 6147 __ ushll2(vb[7], __ T8H, vin[5], __ T16B, 4); 6148 6149 // mask hi 4 bits of the 1st 12-bit int in a pair from copy1 and 6150 // shift lo 4 bits of the 2nd 12-bit int in a pair to the bottom of 6151 // copy2 6152 __ andr(va[2], __ T16B, va[2], v31); 6153 __ andr(va[3], __ T16B, va[3], v31); 6154 __ ushr(va[4], __ T8H, va[4], 4); 6155 __ ushr(va[5], __ T8H, va[5], 4); 6156 __ andr(vb[2], __ T16B, vb[2], v31); 6157 __ andr(vb[3], __ T16B, vb[3], v31); 6158 __ ushr(vb[4], __ T8H, vb[4], 4); 6159 __ ushr(vb[5], __ T8H, vb[5], 4); 6160 6161 // sum hi 4 bits and lo 8 bits of the 1st 12-bit int in each pair and 6162 // hi 8 bits plus lo 4 bits of the 2nd 12-bit int in each pair 6163 // n.b. the ordering ensures: i) inputs are consumed before they 6164 // are overwritten ii) the order of 16-bit results across successive 6165 // pairs of vectors in va and then vb reflects the order of the 6166 // corresponding 12-bit inputs 6167 __ addv(va[0], __ T8H, va[0], va[2]); 6168 __ addv(va[2], __ T8H, va[1], va[3]); 6169 __ addv(va[1], __ T8H, va[4], va[6]); 6170 __ addv(va[3], __ T8H, va[5], va[7]); 6171 __ addv(vb[0], __ T8H, vb[0], vb[2]); 6172 __ addv(vb[2], __ T8H, vb[1], vb[3]); 6173 __ addv(vb[1], __ T8H, vb[4], vb[6]); 6174 __ addv(vb[3], __ T8H, vb[5], vb[7]); 6175 6176 // store 64 results interleaved as shorts 6177 vs_st2_post(vs_front(va), __ T8H, parsed); 6178 vs_st2_post(vs_front(vb), __ T8H, parsed); 6179 6180 __ sub(parsedLength, parsedLength, 64); 6181 __ cmp(parsedLength, (u1)64); 6182 __ br(Assembler::GE, L_loop); 6183 __ cbz(parsedLength, L_end); 6184 6185 // if anything is left it should be a final 72 bytes of input 6186 // i.e. a final 48 12-bit values. so we handle this by loading 6187 // 48 bytes into all 16B lanes of front(vin) and only 24 6188 // bytes into the lower 8B lane of back(vin) 6189 vs_ld3_post(vs_front(vin), __ T16B, condensed); 6190 vs_ld3(vs_back(vin), __ T8B, condensed); 6191 6192 // Expand vin[0] into va[0:1], and vin[1] into va[2:3] and va[4:5] 6193 // n.b. target elements 2 and 3 of va duplicate elements 4 and 6194 // 5 and target element 2 of vb duplicates element 4. 6195 __ ushll(va[0], __ T8H, vin[0], __ T8B, 0); 6196 __ ushll2(va[1], __ T8H, vin[0], __ T16B, 0); 6197 __ ushll(va[2], __ T8H, vin[1], __ T8B, 0); 6198 __ ushll2(va[3], __ T8H, vin[1], __ T16B, 0); 6199 __ ushll(va[4], __ T8H, vin[1], __ T8B, 0); 6200 __ ushll2(va[5], __ T8H, vin[1], __ T16B, 0); 6201 6202 // This time expand just the lower 8 lanes 6203 __ ushll(vb[0], __ T8H, vin[3], __ T8B, 0); 6204 __ ushll(vb[2], __ T8H, vin[4], __ T8B, 0); 6205 __ ushll(vb[4], __ T8H, vin[4], __ T8B, 0); 6206 6207 // shift lo byte of copy 1 of the middle stripe into the high byte 6208 __ shl(va[2], __ T8H, va[2], 8); 6209 __ shl(va[3], __ T8H, va[3], 8); 6210 __ shl(vb[2], __ T8H, vb[2], 8); 6211 6212 // expand vin[2] into va[6:7] and lower 8 lanes of vin[5] into 6213 // vb[6] pre-shifted by 4 to ensure top bits of the input 12-bit 6214 // int are in bit positions [4..11]. 
6215 __ ushll(va[6], __ T8H, vin[2], __ T8B, 4);
6216 __ ushll2(va[7], __ T8H, vin[2], __ T16B, 4);
6217 __ ushll(vb[6], __ T8H, vin[5], __ T8B, 4);
6218
6219 // mask hi 4 bits of each 1st 12-bit int in pair from copy1 and
6220 // shift lo 4 bits of each 2nd 12-bit int in pair to bottom of
6221 // copy2
6222 __ andr(va[2], __ T16B, va[2], v31);
6223 __ andr(va[3], __ T16B, va[3], v31);
6224 __ ushr(va[4], __ T8H, va[4], 4);
6225 __ ushr(va[5], __ T8H, va[5], 4);
6226 __ andr(vb[2], __ T16B, vb[2], v31);
6227 __ ushr(vb[4], __ T8H, vb[4], 4);
6228
6229
6230
6231 // sum hi 4 bits and lo 8 bits of each 1st 12-bit int in pair and
6232 // hi 8 bits plus lo 4 bits of each 2nd 12-bit int in pair
6233
6234 // n.b. ordering ensures: i) inputs are consumed before they are
6235 // overwritten ii) order of 16-bit results across successive
6236 // pairs of vectors in va and then lower half of vb reflects order
6237 // of corresponding 12-bit inputs
6238 __ addv(va[0], __ T8H, va[0], va[2]);
6239 __ addv(va[2], __ T8H, va[1], va[3]);
6240 __ addv(va[1], __ T8H, va[4], va[6]);
6241 __ addv(va[3], __ T8H, va[5], va[7]);
6242 __ addv(vb[0], __ T8H, vb[0], vb[2]);
6243 __ addv(vb[1], __ T8H, vb[4], vb[6]);
6244
6245 // store 48 results interleaved as shorts
6246 vs_st2_post(vs_front(va), __ T8H, parsed);
6247 vs_st2_post(vs_front(vs_front(vb)), __ T8H, parsed);
6248
6249 __ BIND(L_end);
6250
6251 __ leave(); // required for proper stackwalking of RuntimeStub frame
6252 __ mov(r0, zr); // return 0
6253 __ ret(lr);
6254
6255 return start;
6256 }
6257
6258 // Kyber Barrett reduce function.
6259 // Implements
6260 // static int implKyberBarrettReduce(short[] coeffs) {}
6261 //
6262 // coeffs (short[256]) = c_rarg0
6263 address generate_kyberBarrettReduce() {
6264
6265 __ align(CodeEntryAlignment);
6266 StubGenStubId stub_id = StubGenStubId::kyberBarrettReduce_id;
6267 StubCodeMark mark(this, stub_id);
6268 address start = __ pc();
6269 __ enter();
6270
6271 const Register coeffs = c_rarg0;
6272
6273 const Register kyberConsts = r10;
6274 const Register result = r11;
6275
6276 // As above we process 256 sets of values in total i.e. 32 x
6277 // 8H quadwords. So, we can load, process and store the data in 3
6278 // groups of 11, 11 and 10 at a time i.e. we need to map sets
6279 // of 10 or 11 registers. A further constraint is that the
6280 // mapping needs to skip callee saves. So, we allocate the
6281 // register sequences using two 8 sequences, two 2 sequences
6282 // and two single registers.
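// For orientation, the Barrett reduction applied to each 16-bit lane in
// the loop below corresponds to the scalar steps
//   int16_t t = (int16_t)(((int32_t)a * barrettMultiplier) >> 26);
//   a = (int16_t)(a - t * q);
// with q = 3329 and barrettMultiplier taken from the kyberConsts block
// (approximately 2^26 / q). Illustrative only, not generated code.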
6283 VSeq<8> vs1_1(0); 6284 VSeq<2> vs1_2(16); 6285 FloatRegister vs1_3 = v28; 6286 VSeq<8> vs2_1(18); 6287 VSeq<2> vs2_2(26); 6288 FloatRegister vs2_3 = v29; 6289 6290 // we also need a pair of corresponding constant sequences 6291 6292 VSeq<8> vc1_1(30, 0); 6293 VSeq<2> vc1_2(30, 0); 6294 FloatRegister vc1_3 = v30; // for kyber_q 6295 6296 VSeq<8> vc2_1(31, 0); 6297 VSeq<2> vc2_2(31, 0); 6298 FloatRegister vc2_3 = v31; // for kyberBarrettMultiplier 6299 6300 __ add(result, coeffs, 0); 6301 __ lea(kyberConsts, 6302 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts)); 6303 6304 // load q and the multiplier for the Barrett reduction 6305 __ add(kyberConsts, kyberConsts, 16); 6306 __ ldpq(vc1_3, vc2_3, kyberConsts); 6307 6308 for (int i = 0; i < 3; i++) { 6309 // load 80 or 88 coefficients 6310 vs_ldpq_post(vs1_1, coeffs); 6311 vs_ldpq_post(vs1_2, coeffs); 6312 if (i < 2) { 6313 __ ldr(vs1_3, __ Q, __ post(coeffs, 16)); 6314 } 6315 6316 // vs2 <- (2 * vs1 * kyberBarrettMultiplier) >> 16 6317 vs_sqdmulh(vs2_1, __ T8H, vs1_1, vc2_1); 6318 vs_sqdmulh(vs2_2, __ T8H, vs1_2, vc2_2); 6319 if (i < 2) { 6320 __ sqdmulh(vs2_3, __ T8H, vs1_3, vc2_3); 6321 } 6322 6323 // vs2 <- (vs1 * kyberBarrettMultiplier) >> 26 6324 vs_sshr(vs2_1, __ T8H, vs2_1, 11); 6325 vs_sshr(vs2_2, __ T8H, vs2_2, 11); 6326 if (i < 2) { 6327 __ sshr(vs2_3, __ T8H, vs2_3, 11); 6328 } 6329 6330 // vs1 <- vs1 - vs2 * kyber_q 6331 vs_mlsv(vs1_1, __ T8H, vs2_1, vc1_1); 6332 vs_mlsv(vs1_2, __ T8H, vs2_2, vc1_2); 6333 if (i < 2) { 6334 __ mlsv(vs1_3, __ T8H, vs2_3, vc1_3); 6335 } 6336 6337 vs_stpq_post(vs1_1, result); 6338 vs_stpq_post(vs1_2, result); 6339 if (i < 2) { 6340 __ str(vs1_3, __ Q, __ post(result, 16)); 6341 } 6342 } 6343 6344 __ leave(); // required for proper stackwalking of RuntimeStub frame 6345 __ mov(r0, zr); // return 0 6346 __ ret(lr); 6347 6348 return start; 6349 } 6350 6351 6352 // Dilithium-specific montmul helper routines that generate parallel 6353 // code for, respectively, a single 4x4s vector sequence montmul or 6354 // two such multiplies in a row. 6355 6356 // Perform 16 32-bit Montgomery multiplications in parallel 6357 void dilithium_montmul16(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc, 6358 const VSeq<4>& vtmp, const VSeq<2>& vq) { 6359 // Use the helper routine to schedule a 4x4S Montgomery multiply. 6360 // It will assert that the register use is valid 6361 vs_montmul4(va, vb, vc, __ T4S, vtmp, vq); 6362 } 6363 6364 // Perform 2x16 32-bit Montgomery multiplications in parallel 6365 void dilithium_montmul32(const VSeq<8>& va, const VSeq<8>& vb, const VSeq<8>& vc, 6366 const VSeq<4>& vtmp, const VSeq<2>& vq) { 6367 // Schedule two successive 4x4S multiplies via the montmul helper 6368 // on the front and back halves of va, vb and vc. The helper will 6369 // assert that the register use has no overlap conflicts on each 6370 // individual call but we also need to ensure that the necessary 6371 // disjoint/equality constraints are met across both calls. 6372 6373 // vb, vc, vtmp and vq must be disjoint. 
va must either be
6374 // disjoint from all other registers or equal vc
6375
6376 assert(vs_disjoint(vb, vc), "vb and vc overlap");
6377 assert(vs_disjoint(vb, vq), "vb and vq overlap");
6378 assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
6379
6380 assert(vs_disjoint(vc, vq), "vc and vq overlap");
6381 assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
6382
6383 assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
6384
6385 assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
6386 assert(vs_disjoint(va, vb), "va and vb overlap");
6387 assert(vs_disjoint(va, vq), "va and vq overlap");
6388 assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
6389
6390 // We multiply the front and back halves of each sequence 4 at a
6391 // time because
6392 //
6393 // 1) we are currently only able to get 4-way instruction
6394 // parallelism at best
6395 //
6396 // 2) we need registers for the constants in vq and temporary
6397 // scratch registers to hold intermediate results so vtmp can only
6398 // be a VSeq<4> which means we only have 4 scratch slots.
6399
6400 vs_montmul4(vs_front(va), vs_front(vb), vs_front(vc), __ T4S, vtmp, vq);
6401 vs_montmul4(vs_back(va), vs_back(vb), vs_back(vc), __ T4S, vtmp, vq);
6402 }
6403
6404 // Perform combined montmul then add/sub on 4x4S vectors.
6405 void dilithium_montmul16_sub_add(
6406 const VSeq<4>& va0, const VSeq<4>& va1, const VSeq<4>& vc,
6407 const VSeq<4>& vtmp, const VSeq<2>& vq) {
6408 // compute a = montmul(a1, c)
6409 dilithium_montmul16(vc, va1, vc, vtmp, vq);
6410 // output a1 = a0 - a
6411 vs_subv(va1, __ T4S, va0, vc);
6412 // and a0 = a0 + a
6413 vs_addv(va0, __ T4S, va0, vc);
6414 }
6415
6416 // Perform combined add/sub then montmul on 4x4S vectors.
6417 void dilithium_sub_add_montmul16(
6418 const VSeq<4>& va0, const VSeq<4>& va1, const VSeq<4>& vb,
6419 const VSeq<4>& vtmp1, const VSeq<4>& vtmp2, const VSeq<2>& vq) {
6420 // compute c = a0 - a1
6421 vs_subv(vtmp1, __ T4S, va0, va1);
6422 // output a0 = a0 + a1
6423 vs_addv(va0, __ T4S, va0, va1);
6424 // output a1 = b montmul c
6425 dilithium_montmul16(va1, vtmp1, vb, vtmp2, vq);
6426 }
6427
6428 // At these levels, the indices that correspond to the 'j's (and 'j+l's)
6429 // in the Java implementation come in sequences of at least 8, so we
6430 // can use ldpq to collect the corresponding data into pairs of vector
6431 // registers.
6432 // We collect the coefficients corresponding to the 'j+l' indexes into
6433 // the vector registers v0-v7, the zetas into the vector registers v16-v23,
6434 // then we do the (Montgomery) multiplications by the zetas in parallel
6435 // into v16-v23, load the coeffs corresponding to the 'j' indexes into
6436 // v0-v7, then do the additions into v24-v31 and the subtractions into
6437 // v0-v7 and finally save the results back to the coeffs array.
6438 void dilithiumNttLevel0_4(const Register dilithiumConsts,
6439 const Register coeffs, const Register zetas) {
6440 int c1 = 0;
6441 int c2 = 512;
6442 int startIncr;
6443 // don't use callee save registers v8 - v15
6444 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4S inputs/outputs
6445 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
6446 VSeq<2> vq(30); // n.b.
6447 int offsets[4] = { 0, 32, 64, 96 };
6448
6449 for (int level = 0; level < 5; level++) {
6450 int c1Start = c1;
6451 int c2Start = c2;
6452 if (level == 3) {
6453 offsets[1] = 32;
6454 offsets[2] = 128;
6455 offsets[3] = 160;
6456 } else if (level == 4) {
6457 offsets[1] = 64;
6458 offsets[2] = 128;
6459 offsets[3] = 192;
6460 }
6461
6462 // For levels 0 - 4 we simply load 2 x 4 adjacent values at a
6463 // time at 4 different offsets and multiply them in order by the
6464 // next set of input values. So we employ indexed load and store
6465 // pair instructions with arrangement 4S.
6466 for (int i = 0; i < 4; i++) {
6467 // reload q and qinv
6468 vs_ldpq(vq, dilithiumConsts); // qInv, q
6469 // load 8x4S coefficients via second start pos == c2
6470 vs_ldpq_indexed(vs1, coeffs, c2Start, offsets);
6471 // load next 8x4S inputs == b
6472 vs_ldpq_post(vs2, zetas);
6473 // compute a == c2 * b mod MONT_Q
6474 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
6475 // load 8x4S coefficients via first start pos == c1
6476 vs_ldpq_indexed(vs1, coeffs, c1Start, offsets);
6477 // compute a1 = c1 + a
6478 vs_addv(vs3, __ T4S, vs1, vs2);
6479 // compute a2 = c1 - a
6480 vs_subv(vs1, __ T4S, vs1, vs2);
6481 // output a1 and a2
6482 vs_stpq_indexed(vs3, coeffs, c1Start, offsets);
6483 vs_stpq_indexed(vs1, coeffs, c2Start, offsets);
6484
6485 int k = 4 * level + i;
6486
6487 if (k > 7) {
6488 startIncr = 256;
6489 } else if (k == 5) {
6490 startIncr = 384;
6491 } else {
6492 startIncr = 128;
6493 }
6494
6495 c1Start += startIncr;
6496 c2Start += startIncr;
6497 }
6498
6499 c2 /= 2;
6500 }
6501 }
6502
6503 // Dilithium NTT function except for the final "normalization" to |coeff| < Q.
6504 // Implements the method
6505 // static int implDilithiumAlmostNtt(int[] coeffs, int zetas[]) {}
6506 // of the Java class sun.security.provider.ML_DSA
6507 //
6508 // coeffs (int[256]) = c_rarg0
6509 // zetas (int[256]) = c_rarg1
6510 address generate_dilithiumAlmostNtt() {
6511
6512 __ align(CodeEntryAlignment);
6513 StubGenStubId stub_id = StubGenStubId::dilithiumAlmostNtt_id;
6514 StubCodeMark mark(this, stub_id);
6515 address start = __ pc();
6516 __ enter();
6517
6518 const Register coeffs = c_rarg0;
6519 const Register zetas = c_rarg1;
6520
6521 const Register tmpAddr = r9;
6522 const Register dilithiumConsts = r10;
6523 const Register result = r11;
6524 // don't use callee save registers v8 - v15
6525 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4S inputs/outputs
6526 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
6527 VSeq<2> vq(30); // n.b. constants overlap vs3
6528 int offsets[4] = { 0, 32, 64, 96 };
6529 int offsets1[8] = { 16, 48, 80, 112, 144, 176, 208, 240 };
6530 int offsets2[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
6531 __ add(result, coeffs, 0);
6532 __ lea(dilithiumConsts,
6533 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
6534
6535 // Each level represents one iteration of the outer for loop of the Java version.
6536
6537 // level 0-4
6538 dilithiumNttLevel0_4(dilithiumConsts, coeffs, zetas);
6539
6540 // level 5
6541
6542 // At level 5 the coefficients we need to combine with the zetas
6543 // are grouped in memory in blocks of size 4. So, for both sets of
6544 // coefficients we load 4 adjacent values at 8 different offsets
6545 // using an indexed ldr with register variant Q and multiply them
6546 // in sequence order by the next set of inputs. Likewise we store
6547 // the results using an indexed str with register variant Q.
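// Editor's note: as a rough scalar sketch of what each level-5
// butterfly amounts to (illustration only -- montMul and the zeta
// indexing below are schematic, not the actual Java helper names):
//
//   for (int j = start; j < start + 4; j++) {
//     int t = montMul(zetas[k++], coeffs[j + 4]); // zetas consumed in order
//     coeffs[j + 4] = coeffs[j] - t;              // difference -> 'j+l' slot
//     coeffs[j]     = coeffs[j] + t;              // sum        -> 'j'   slot
//   }
//
// with 32 such coefficients processed per pass of the loop below.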
6548 for (int i = 0; i < 1024; i += 256) {
6549 // reload constants q, qinv each iteration as they get clobbered later
6550 vs_ldpq(vq, dilithiumConsts); // qInv, q
6551 // load 32 (8x4S) coefficients via first offsets = c1
6552 vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets1);
6553 // load next 32 (8x4S) inputs = b
6554 vs_ldpq_post(vs2, zetas);
6555 // a = b montmul c1
6556 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
6557 // load 32 (8x4S) coefficients via second offsets = c2
6558 vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets2);
6559 // add/sub with result of multiply
6560 vs_addv(vs3, __ T4S, vs1, vs2); // a1 = c2 + a
6561 vs_subv(vs1, __ T4S, vs1, vs2); // a0 = c2 - a
6562 // write back new coefficients using same offsets
6563 vs_str_indexed(vs3, __ Q, coeffs, i, offsets2);
6564 vs_str_indexed(vs1, __ Q, coeffs, i, offsets1);
6565 }
6566
6567 // level 6
6568 // At level 6 the coefficients we need to combine with the zetas
6569 // are grouped in memory in pairs, the second pair of each quartet
6570 // being the montmul inputs and the first pair the add/sub inputs.
6571 // We can still implement
6572 // the montmul+sub+add using 4-way parallelism but only if we
6573 // combine the coefficients with the zetas 16 at a time. We load 8
6574 // adjacent values at 4 different offsets using an ld2 load with
6575 // arrangement 2D. That interleaves the lower and upper halves of
6576 // each pair of quadwords into successive vector registers. We
6577 // then need to montmul the 4 odd elements of the coefficients
6578 // register sequence by the zetas in order and then add/sub the 4
6579 // even elements of the coefficients register sequence. We use an
6580 // equivalent st2 operation to store the results back into memory
6581 // de-interleaved.
6582 for (int i = 0; i < 1024; i += 128) {
6583 // reload constants q, qinv each iteration as they get clobbered later
6584 vs_ldpq(vq, dilithiumConsts); // qInv, q
6585 // load interleaved 16 (4x2D) coefficients via offsets
6586 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
6587 // load next 16 (4x4S) inputs
6588 vs_ldpq_post(vs_front(vs2), zetas);
6589 // mont multiply odd elements of vs1 by vs2 and add/sub into odds/evens
6590 dilithium_montmul16_sub_add(vs_even(vs1), vs_odd(vs1),
6591 vs_front(vs2), vtmp, vq);
6592 // store interleaved 16 (4x2D) coefficients via offsets
6593 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
6594 }
6595
6596 // level 7
6597 // At level 7 the coefficients we need to combine with the zetas
6598 // occur singly, with montmul inputs alternating with add/sub
6599 // inputs. Once again we can use 4-way parallelism to combine 16
6600 // zetas at a time. However, we have to load 8 adjacent values at
6601 // 4 different offsets using an ld2 load with arrangement 4S. That
6602 // de-interleaves each pair, placing the even words in one
6603 // coefficients vector register and the odd words of the pair
6604 // into the next register. We then need to montmul the 4 odd
6605 // elements of the coefficients register sequence by the zetas in
6606 // order and then add/sub the 4 even elements of the coefficients
6607 // register sequence. We use an equivalent st2 operation to store
6608 // the results back into memory de-interleaved.
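// Editor's note: a sketch of the level-7 dataflow, assuming 8
// consecutive coefficients c0..c7 at one of the four offsets
// (illustration only, register names here are arbitrary):
//
//   ld2 {va.4S, vb.4S}, [addr]   // va = {c0,c2,c4,c6} -- add/sub inputs
//                                // vb = {c1,c3,c5,c7} -- montmul inputs
//   t  = montmul(vb, zetas)      // multiply the odd words by the zetas
//   vb = va - t                  // new odd-position coefficients
//   va = va + t                  // new even-position coefficients
//   st2 {va.4S, vb.4S}, [addr]   // st2 re-interleaves on the way out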
6608 6609 for (int i = 0; i < 1024; i += 128) { 6610 // reload constants q, qinv each iteration as they get clobbered later 6611 vs_ldpq(vq, dilithiumConsts); // qInv, q 6612 // load interleaved 16 (4x4S) coefficients via offsets 6613 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets); 6614 // load next 16 (4x4S) inputs 6615 vs_ldpq_post(vs_front(vs2), zetas); 6616 // mont multiply odd elements of vs1 by vs2 and add/sub into odds/evens 6617 dilithium_montmul16_sub_add(vs_even(vs1), vs_odd(vs1), 6618 vs_front(vs2), vtmp, vq); 6619 // store interleaved 16 (4x4S) coefficients via offsets 6620 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets); 6621 } 6622 __ leave(); // required for proper stackwalking of RuntimeStub frame 6623 __ mov(r0, zr); // return 0 6624 __ ret(lr); 6625 6626 return start; 6627 } 6628 6629 // At these levels, the indices that correspond to the 'j's (and 'j+l's) 6630 // in the Java implementation come in sequences of at least 8, so we 6631 // can use ldpq to collect the corresponding data into pairs of vector 6632 // registers 6633 // We collect the coefficients that correspond to the 'j's into vs1 6634 // the coefficiets that correspond to the 'j+l's into vs2 then 6635 // do the additions into vs3 and the subtractions into vs1 then 6636 // save the result of the additions, load the zetas into vs2 6637 // do the (Montgomery) multiplications by zeta in parallel into vs2 6638 // finally save the results back to the coeffs array 6639 void dilithiumInverseNttLevel3_7(const Register dilithiumConsts, 6640 const Register coeffs, const Register zetas) { 6641 int c1 = 0; 6642 int c2 = 32; 6643 int startIncr; 6644 int offsets[4]; 6645 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs 6646 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3 6647 VSeq<2> vq(30); // n.b. constants overlap vs3 6648 6649 offsets[0] = 0; 6650 6651 for (int level = 3; level < 8; level++) { 6652 int c1Start = c1; 6653 int c2Start = c2; 6654 if (level == 3) { 6655 offsets[1] = 64; 6656 offsets[2] = 128; 6657 offsets[3] = 192; 6658 } else if (level == 4) { 6659 offsets[1] = 32; 6660 offsets[2] = 128; 6661 offsets[3] = 160; 6662 } else { 6663 offsets[1] = 32; 6664 offsets[2] = 64; 6665 offsets[3] = 96; 6666 } 6667 6668 // For levels 3 - 7 we simply load 2 x 4 adjacent values at a 6669 // time at 4 different offsets and multiply them in order by the 6670 // next set of input values. So we employ indexed load and store 6671 // pair instructions with arrangement 4S. 6672 for (int i = 0; i < 4; i++) { 6673 // load v1 32 (8x4S) coefficients relative to first start index 6674 vs_ldpq_indexed(vs1, coeffs, c1Start, offsets); 6675 // load v2 32 (8x4S) coefficients relative to second start index 6676 vs_ldpq_indexed(vs2, coeffs, c2Start, offsets); 6677 // a0 = v1 + v2 -- n.b. 
clobbers vqs 6678 vs_addv(vs3, __ T4S, vs1, vs2); 6679 // a1 = v1 - v2 6680 vs_subv(vs1, __ T4S, vs1, vs2); 6681 // save a1 relative to first start index 6682 vs_stpq_indexed(vs3, coeffs, c1Start, offsets); 6683 // load constants q, qinv each iteration as they get clobbered above 6684 vs_ldpq(vq, dilithiumConsts); // qInv, q 6685 // load b next 32 (8x4S) inputs 6686 vs_ldpq_post(vs2, zetas); 6687 // a = a1 montmul b 6688 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq); 6689 // save a relative to second start index 6690 vs_stpq_indexed(vs2, coeffs, c2Start, offsets); 6691 6692 int k = 4 * level + i; 6693 6694 if (k < 24) { 6695 startIncr = 256; 6696 } else if (k == 25) { 6697 startIncr = 384; 6698 } else { 6699 startIncr = 128; 6700 } 6701 6702 c1Start += startIncr; 6703 c2Start += startIncr; 6704 } 6705 6706 c2 *= 2; 6707 } 6708 } 6709 6710 // Dilithium Inverse NTT function except the final mod Q division by 2^256. 6711 // Implements the method 6712 // static int implDilithiumAlmostInverseNtt(int[] coeffs, int[] zetas) {} of 6713 // the sun.security.provider.ML_DSA class. 6714 // 6715 // coeffs (int[256]) = c_rarg0 6716 // zetas (int[256]) = c_rarg1 6717 address generate_dilithiumAlmostInverseNtt() { 6718 6719 __ align(CodeEntryAlignment); 6720 StubGenStubId stub_id = StubGenStubId::dilithiumAlmostInverseNtt_id; 6721 StubCodeMark mark(this, stub_id); 6722 address start = __ pc(); 6723 __ enter(); 6724 6725 const Register coeffs = c_rarg0; 6726 const Register zetas = c_rarg1; 6727 6728 const Register tmpAddr = r9; 6729 const Register dilithiumConsts = r10; 6730 const Register result = r11; 6731 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs 6732 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3 6733 VSeq<2> vq(30); // n.b. constants overlap vs3 6734 int offsets[4] = { 0, 32, 64, 96 }; 6735 int offsets1[8] = { 0, 32, 64, 96, 128, 160, 192, 224 }; 6736 int offsets2[8] = { 16, 48, 80, 112, 144, 176, 208, 240 }; 6737 6738 __ add(result, coeffs, 0); 6739 __ lea(dilithiumConsts, 6740 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts)); 6741 6742 // Each level represents one iteration of the outer for loop of the Java version 6743 6744 // level 0 6745 // At level 0 we need to interleave adjacent quartets of 6746 // coefficients before we multiply and add/sub by the next 16 6747 // zetas just as we did for level 7 in the multiply code. So we 6748 // load and store the values using an ld2/st2 with arrangement 4S. 6749 for (int i = 0; i < 1024; i += 128) { 6750 // load constants q, qinv 6751 // n.b. this can be moved out of the loop as they do not get 6752 // clobbered by first two loops 6753 vs_ldpq(vq, dilithiumConsts); // qInv, q 6754 // a0/a1 load interleaved 32 (8x4S) coefficients 6755 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets); 6756 // b load next 32 (8x4S) inputs 6757 vs_ldpq_post(vs_front(vs2), zetas); 6758 // compute in parallel (a0, a1) = (a0 + a1, (a0 - a1) montmul b) 6759 // n.b. second half of vs2 provides temporary register storage 6760 dilithium_sub_add_montmul16(vs_even(vs1), vs_odd(vs1), 6761 vs_front(vs2), vs_back(vs2), vtmp, vq); 6762 // a0/a1 store interleaved 32 (8x4S) coefficients 6763 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets); 6764 } 6765 6766 // level 1 6767 // At level 1 we need to interleave pairs of adjacent pairs of 6768 // coefficients before we multiply by the next 16 zetas just as we 6769 // did for level 6 in the multiply code. 
So we load and store the 6770 // values an ld2/st2 with arrangement 2D. 6771 for (int i = 0; i < 1024; i += 128) { 6772 // a0/a1 load interleaved 32 (8x2D) coefficients 6773 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets); 6774 // b load next 16 (4x4S) inputs 6775 vs_ldpq_post(vs_front(vs2), zetas); 6776 // compute in parallel (a0, a1) = (a0 + a1, (a0 - a1) montmul b) 6777 // n.b. second half of vs2 provides temporary register storage 6778 dilithium_sub_add_montmul16(vs_even(vs1), vs_odd(vs1), 6779 vs_front(vs2), vs_back(vs2), vtmp, vq); 6780 // a0/a1 store interleaved 32 (8x2D) coefficients 6781 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets); 6782 } 6783 6784 // level 2 6785 // At level 2 coefficients come in blocks of 4. So, we load 4 6786 // adjacent coefficients at 8 distinct offsets for both the first 6787 // and second coefficient sequences, using an ldr with register 6788 // variant Q then combine them with next set of 32 zetas. Likewise 6789 // we store the results using an str with register variant Q. 6790 for (int i = 0; i < 1024; i += 256) { 6791 // c0 load 32 (8x4S) coefficients via first offsets 6792 vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets1); 6793 // c1 load 32 (8x4S) coefficients via second offsets 6794 vs_ldr_indexed(vs2, __ Q,coeffs, i, offsets2); 6795 // a0 = c0 + c1 n.b. clobbers vq which overlaps vs3 6796 vs_addv(vs3, __ T4S, vs1, vs2); 6797 // c = c0 - c1 6798 vs_subv(vs1, __ T4S, vs1, vs2); 6799 // store a0 32 (8x4S) coefficients via first offsets 6800 vs_str_indexed(vs3, __ Q, coeffs, i, offsets1); 6801 // b load 32 (8x4S) next inputs 6802 vs_ldpq_post(vs2, zetas); 6803 // reload constants q, qinv -- they were clobbered earlier 6804 vs_ldpq(vq, dilithiumConsts); // qInv, q 6805 // compute a1 = b montmul c 6806 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq); 6807 // store a1 32 (8x4S) coefficients via second offsets 6808 vs_str_indexed(vs2, __ Q, coeffs, i, offsets2); 6809 } 6810 6811 // level 3-7 6812 dilithiumInverseNttLevel3_7(dilithiumConsts, coeffs, zetas); 6813 6814 __ leave(); // required for proper stackwalking of RuntimeStub frame 6815 __ mov(r0, zr); // return 0 6816 __ ret(lr); 6817 6818 return start; 6819 } 6820 6821 // Dilithium multiply polynomials in the NTT domain. 6822 // Straightforward implementation of the method 6823 // static int implDilithiumNttMult( 6824 // int[] result, int[] ntta, int[] nttb {} of 6825 // the sun.security.provider.ML_DSA class. 6826 // 6827 // result (int[256]) = c_rarg0 6828 // poly1 (int[256]) = c_rarg1 6829 // poly2 (int[256]) = c_rarg2 6830 address generate_dilithiumNttMult() { 6831 6832 __ align(CodeEntryAlignment); 6833 StubGenStubId stub_id = StubGenStubId::dilithiumNttMult_id; 6834 StubCodeMark mark(this, stub_id); 6835 address start = __ pc(); 6836 __ enter(); 6837 6838 Label L_loop; 6839 6840 const Register result = c_rarg0; 6841 const Register poly1 = c_rarg1; 6842 const Register poly2 = c_rarg2; 6843 6844 const Register dilithiumConsts = r10; 6845 const Register len = r11; 6846 6847 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs 6848 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3 6849 VSeq<2> vq(30); // n.b. 
constants overlap vs3 6850 VSeq<8> vrsquare(29, 0); // for montmul by constant RSQUARE 6851 6852 __ lea(dilithiumConsts, 6853 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts)); 6854 6855 // load constants q, qinv 6856 vs_ldpq(vq, dilithiumConsts); // qInv, q 6857 // load constant rSquare into v29 6858 __ ldr(v29, __ Q, Address(dilithiumConsts, 48)); // rSquare 6859 6860 __ mov(len, zr); 6861 __ add(len, len, 1024); 6862 6863 __ BIND(L_loop); 6864 6865 // b load 32 (8x4S) next inputs from poly1 6866 vs_ldpq_post(vs1, poly1); 6867 // c load 32 (8x4S) next inputs from poly2 6868 vs_ldpq_post(vs2, poly2); 6869 // compute a = b montmul c 6870 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq); 6871 // compute a = rsquare montmul a 6872 dilithium_montmul32(vs2, vrsquare, vs2, vtmp, vq); 6873 // save a 32 (8x4S) results 6874 vs_stpq_post(vs2, result); 6875 6876 __ sub(len, len, 128); 6877 __ cmp(len, (u1)128); 6878 __ br(Assembler::GE, L_loop); 6879 6880 __ leave(); // required for proper stackwalking of RuntimeStub frame 6881 __ mov(r0, zr); // return 0 6882 __ ret(lr); 6883 6884 return start; 6885 } 6886 6887 // Dilithium Motgomery multiply an array by a constant. 6888 // A straightforward implementation of the method 6889 // static int implDilithiumMontMulByConstant(int[] coeffs, int constant) {} 6890 // of the sun.security.provider.MLDSA class 6891 // 6892 // coeffs (int[256]) = c_rarg0 6893 // constant (int) = c_rarg1 6894 address generate_dilithiumMontMulByConstant() { 6895 6896 __ align(CodeEntryAlignment); 6897 StubGenStubId stub_id = StubGenStubId::dilithiumMontMulByConstant_id; 6898 StubCodeMark mark(this, stub_id); 6899 address start = __ pc(); 6900 __ enter(); 6901 6902 Label L_loop; 6903 6904 const Register coeffs = c_rarg0; 6905 const Register constant = c_rarg1; 6906 6907 const Register dilithiumConsts = r10; 6908 const Register result = r11; 6909 const Register len = r12; 6910 6911 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs 6912 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3 6913 VSeq<2> vq(30); // n.b. constants overlap vs3 6914 VSeq<8> vconst(29, 0); // for montmul by constant 6915 6916 // results track inputs 6917 __ add(result, coeffs, 0); 6918 __ lea(dilithiumConsts, 6919 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts)); 6920 6921 // load constants q, qinv -- they do not get clobbered by first two loops 6922 vs_ldpq(vq, dilithiumConsts); // qInv, q 6923 // copy caller supplied constant across vconst 6924 __ dup(vconst[0], __ T4S, constant); 6925 __ mov(len, zr); 6926 __ add(len, len, 1024); 6927 6928 __ BIND(L_loop); 6929 6930 // load next 32 inputs 6931 vs_ldpq_post(vs2, coeffs); 6932 // mont mul by constant 6933 dilithium_montmul32(vs2, vconst, vs2, vtmp, vq); 6934 // write next 32 results 6935 vs_stpq_post(vs2, result); 6936 6937 __ sub(len, len, 128); 6938 __ cmp(len, (u1)128); 6939 __ br(Assembler::GE, L_loop); 6940 6941 __ leave(); // required for proper stackwalking of RuntimeStub frame 6942 __ mov(r0, zr); // return 0 6943 __ ret(lr); 6944 6945 return start; 6946 } 6947 6948 // Dilithium decompose poly. 
6949 // Implements the method 6950 // static int implDilithiumDecomposePoly(int[] coeffs, int constant) {} 6951 // of the sun.security.provider.ML_DSA class 6952 // 6953 // input (int[256]) = c_rarg0 6954 // lowPart (int[256]) = c_rarg1 6955 // highPart (int[256]) = c_rarg2 6956 // twoGamma2 (int) = c_rarg3 6957 // multiplier (int) = c_rarg4 6958 address generate_dilithiumDecomposePoly() { 6959 6960 __ align(CodeEntryAlignment); 6961 StubGenStubId stub_id = StubGenStubId::dilithiumDecomposePoly_id; 6962 StubCodeMark mark(this, stub_id); 6963 address start = __ pc(); 6964 Label L_loop; 6965 6966 const Register input = c_rarg0; 6967 const Register lowPart = c_rarg1; 6968 const Register highPart = c_rarg2; 6969 const Register twoGamma2 = c_rarg3; 6970 const Register multiplier = c_rarg4; 6971 6972 const Register len = r9; 6973 const Register dilithiumConsts = r10; 6974 const Register tmp = r11; 6975 6976 // 6 independent sets of 4x4s values 6977 VSeq<4> vs1(0), vs2(4), vs3(8); 6978 VSeq<4> vs4(12), vs5(16), vtmp(20); 6979 6980 // 7 constants for cross-multiplying 6981 VSeq<4> one(25, 0); 6982 VSeq<4> qminus1(26, 0); 6983 VSeq<4> g2(27, 0); 6984 VSeq<4> twog2(28, 0); 6985 VSeq<4> mult(29, 0); 6986 VSeq<4> q(30, 0); 6987 VSeq<4> qadd(31, 0); 6988 6989 __ enter(); 6990 6991 __ lea(dilithiumConsts, 6992 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts)); 6993 6994 // save callee-saved registers 6995 __ stpd(v8, v9, __ pre(sp, -64)); 6996 __ stpd(v10, v11, Address(sp, 16)); 6997 __ stpd(v12, v13, Address(sp, 32)); 6998 __ stpd(v14, v15, Address(sp, 48)); 6999 7000 // populate constant registers 7001 __ mov(tmp, zr); 7002 __ add(tmp, tmp, 1); 7003 __ dup(one[0], __ T4S, tmp); // 1 7004 __ ldr(q[0], __ Q, Address(dilithiumConsts, 16)); // q 7005 __ ldr(qadd[0], __ Q, Address(dilithiumConsts, 64)); // addend for mod q reduce 7006 __ dup(twog2[0], __ T4S, twoGamma2); // 2 * gamma2 7007 __ dup(mult[0], __ T4S, multiplier); // multiplier for mod 2 * gamma reduce 7008 __ subv(qminus1[0], __ T4S, v30, v25); // q - 1 7009 __ sshr(g2[0], __ T4S, v28, 1); // gamma2 7010 7011 __ mov(len, zr); 7012 __ add(len, len, 1024); 7013 7014 __ BIND(L_loop); 7015 7016 // load next 4x4S inputs interleaved: rplus --> vs1 7017 __ ld4(vs1[0], vs1[1], vs1[2], vs1[3], __ T4S, __ post(input, 64)); 7018 7019 // rplus = rplus - ((rplus + qadd) >> 23) * q 7020 vs_addv(vtmp, __ T4S, vs1, qadd); 7021 vs_sshr(vtmp, __ T4S, vtmp, 23); 7022 vs_mulv(vtmp, __ T4S, vtmp, q); 7023 vs_subv(vs1, __ T4S, vs1, vtmp); 7024 7025 // rplus = rplus + ((rplus >> 31) & dilithium_q); 7026 vs_sshr(vtmp, __ T4S, vs1, 31); 7027 vs_andr(vtmp, vtmp, q); 7028 vs_addv(vs1, __ T4S, vs1, vtmp); 7029 7030 // quotient --> vs2 7031 // int quotient = (rplus * multiplier) >> 22; 7032 vs_mulv(vtmp, __ T4S, vs1, mult); 7033 vs_sshr(vs2, __ T4S, vtmp, 22); 7034 7035 // r0 --> vs3 7036 // int r0 = rplus - quotient * twoGamma2; 7037 vs_mulv(vtmp, __ T4S, vs2, twog2); 7038 vs_subv(vs3, __ T4S, vs1, vtmp); 7039 7040 // mask --> vs4 7041 // int mask = (twoGamma2 - r0) >> 22; 7042 vs_subv(vtmp, __ T4S, twog2, vs3); 7043 vs_sshr(vs4, __ T4S, vtmp, 22); 7044 7045 // r0 -= (mask & twoGamma2); 7046 vs_andr(vtmp, vs4, twog2); 7047 vs_subv(vs3, __ T4S, vs3, vtmp); 7048 7049 // quotient += (mask & 1); 7050 vs_andr(vtmp, vs4, one); 7051 vs_addv(vs2, __ T4S, vs2, vtmp); 7052 7053 // mask = (twoGamma2 / 2 - r0) >> 31; 7054 vs_subv(vtmp, __ T4S, g2, vs3); 7055 vs_sshr(vs4, __ T4S, vtmp, 31); 7056 7057 // r0 -= (mask & twoGamma2); 7058 vs_andr(vtmp, vs4, twog2); 7059 
vs_subv(vs3, __ T4S, vs3, vtmp); 7060 7061 // quotient += (mask & 1); 7062 vs_andr(vtmp, vs4, one); 7063 vs_addv(vs2, __ T4S, vs2, vtmp); 7064 7065 // r1 --> vs5 7066 // int r1 = rplus - r0 - (dilithium_q - 1); 7067 vs_subv(vtmp, __ T4S, vs1, vs3); 7068 vs_subv(vs5, __ T4S, vtmp, qminus1); 7069 7070 // r1 --> vs1 (overwriting rplus) 7071 // r1 = (r1 | (-r1)) >> 31; // 0 if rplus - r0 == (dilithium_q - 1), -1 otherwise 7072 vs_negr(vtmp, __ T4S, vs5); 7073 vs_orr(vtmp, vs5, vtmp); 7074 vs_sshr(vs1, __ T4S, vtmp, 31); 7075 7076 // r0 += ~r1; 7077 vs_notr(vtmp, vs1); 7078 vs_addv(vs3, __ T4S, vs3, vtmp); 7079 7080 // r1 = r1 & quotient; 7081 vs_andr(vs1, vs2, vs1); 7082 7083 // store results inteleaved 7084 // lowPart[m] = r0; 7085 // highPart[m] = r1; 7086 __ st4(vs3[0], vs3[1], vs3[2], vs3[3], __ T4S, __ post(lowPart, 64)); 7087 __ st4(vs1[0], vs1[1], vs1[2], vs1[3], __ T4S, __ post(highPart, 64)); 7088 7089 __ sub(len, len, 64); 7090 __ cmp(len, (u1)64); 7091 __ br(Assembler::GE, L_loop); 7092 7093 // restore callee-saved vector registers 7094 __ ldpd(v14, v15, Address(sp, 48)); 7095 __ ldpd(v12, v13, Address(sp, 32)); 7096 __ ldpd(v10, v11, Address(sp, 16)); 7097 __ ldpd(v8, v9, __ post(sp, 64)); 7098 7099 __ leave(); // required for proper stackwalking of RuntimeStub frame 7100 __ mov(r0, zr); // return 0 7101 __ ret(lr); 7102 7103 return start; 7104 } 7105 7106 void bcax5(Register a0, Register a1, Register a2, Register a3, Register a4, 7107 Register tmp0, Register tmp1, Register tmp2) { 7108 __ bic(tmp0, a2, a1); // for a0 7109 __ bic(tmp1, a3, a2); // for a1 7110 __ bic(tmp2, a4, a3); // for a2 7111 __ eor(a2, a2, tmp2); 7112 __ bic(tmp2, a0, a4); // for a3 7113 __ eor(a3, a3, tmp2); 7114 __ bic(tmp2, a1, a0); // for a4 7115 __ eor(a0, a0, tmp0); 7116 __ eor(a1, a1, tmp1); 7117 __ eor(a4, a4, tmp2); 7118 } 7119 7120 void keccak_round_gpr(bool can_use_fp, bool can_use_r18, Register rc, 7121 Register a0, Register a1, Register a2, Register a3, Register a4, 7122 Register a5, Register a6, Register a7, Register a8, Register a9, 7123 Register a10, Register a11, Register a12, Register a13, Register a14, 7124 Register a15, Register a16, Register a17, Register a18, Register a19, 7125 Register a20, Register a21, Register a22, Register a23, Register a24, 7126 Register tmp0, Register tmp1, Register tmp2) { 7127 __ eor3(tmp1, a4, a9, a14); 7128 __ eor3(tmp0, tmp1, a19, a24); // tmp0 = a4^a9^a14^a19^a24 = c4 7129 __ eor3(tmp2, a1, a6, a11); 7130 __ eor3(tmp1, tmp2, a16, a21); // tmp1 = a1^a6^a11^a16^a21 = c1 7131 __ rax1(tmp2, tmp0, tmp1); // d0 7132 { 7133 7134 Register tmp3, tmp4; 7135 if (can_use_fp && can_use_r18) { 7136 tmp3 = rfp; 7137 tmp4 = r18_tls; 7138 } else { 7139 tmp3 = a4; 7140 tmp4 = a9; 7141 __ stp(tmp3, tmp4, __ pre(sp, -16)); 7142 } 7143 7144 __ eor3(tmp3, a0, a5, a10); 7145 __ eor3(tmp4, tmp3, a15, a20); // tmp4 = a0^a5^a10^a15^a20 = c0 7146 __ eor(a0, a0, tmp2); 7147 __ eor(a5, a5, tmp2); 7148 __ eor(a10, a10, tmp2); 7149 __ eor(a15, a15, tmp2); 7150 __ eor(a20, a20, tmp2); // d0(tmp2) 7151 __ eor3(tmp3, a2, a7, a12); 7152 __ eor3(tmp2, tmp3, a17, a22); // tmp2 = a2^a7^a12^a17^a22 = c2 7153 __ rax1(tmp3, tmp4, tmp2); // d1 7154 __ eor(a1, a1, tmp3); 7155 __ eor(a6, a6, tmp3); 7156 __ eor(a11, a11, tmp3); 7157 __ eor(a16, a16, tmp3); 7158 __ eor(a21, a21, tmp3); // d1(tmp3) 7159 __ rax1(tmp3, tmp2, tmp0); // d3 7160 __ eor3(tmp2, a3, a8, a13); 7161 __ eor3(tmp0, tmp2, a18, a23); // tmp0 = a3^a8^a13^a18^a23 = c3 7162 __ eor(a3, a3, tmp3); 7163 __ eor(a8, a8, tmp3); 7164 __ eor(a13, 
a13, tmp3); 7165 __ eor(a18, a18, tmp3); 7166 __ eor(a23, a23, tmp3); 7167 __ rax1(tmp2, tmp1, tmp0); // d2 7168 __ eor(a2, a2, tmp2); 7169 __ eor(a7, a7, tmp2); 7170 __ eor(a12, a12, tmp2); 7171 __ rax1(tmp0, tmp0, tmp4); // d4 7172 if (!can_use_fp || !can_use_r18) { 7173 __ ldp(tmp3, tmp4, __ post(sp, 16)); 7174 } 7175 __ eor(a17, a17, tmp2); 7176 __ eor(a22, a22, tmp2); 7177 __ eor(a4, a4, tmp0); 7178 __ eor(a9, a9, tmp0); 7179 __ eor(a14, a14, tmp0); 7180 __ eor(a19, a19, tmp0); 7181 __ eor(a24, a24, tmp0); 7182 } 7183 7184 __ rol(tmp0, a10, 3); 7185 __ rol(a10, a1, 1); 7186 __ rol(a1, a6, 44); 7187 __ rol(a6, a9, 20); 7188 __ rol(a9, a22, 61); 7189 __ rol(a22, a14, 39); 7190 __ rol(a14, a20, 18); 7191 __ rol(a20, a2, 62); 7192 __ rol(a2, a12, 43); 7193 __ rol(a12, a13, 25); 7194 __ rol(a13, a19, 8) ; 7195 __ rol(a19, a23, 56); 7196 __ rol(a23, a15, 41); 7197 __ rol(a15, a4, 27); 7198 __ rol(a4, a24, 14); 7199 __ rol(a24, a21, 2); 7200 __ rol(a21, a8, 55); 7201 __ rol(a8, a16, 45); 7202 __ rol(a16, a5, 36); 7203 __ rol(a5, a3, 28); 7204 __ rol(a3, a18, 21); 7205 __ rol(a18, a17, 15); 7206 __ rol(a17, a11, 10); 7207 __ rol(a11, a7, 6); 7208 __ mov(a7, tmp0); 7209 7210 bcax5(a0, a1, a2, a3, a4, tmp0, tmp1, tmp2); 7211 bcax5(a5, a6, a7, a8, a9, tmp0, tmp1, tmp2); 7212 bcax5(a10, a11, a12, a13, a14, tmp0, tmp1, tmp2); 7213 bcax5(a15, a16, a17, a18, a19, tmp0, tmp1, tmp2); 7214 bcax5(a20, a21, a22, a23, a24, tmp0, tmp1, tmp2); 7215 7216 __ ldr(tmp1, __ post(rc, 8)); 7217 __ eor(a0, a0, tmp1); 7218 7219 } 7220 7221 // Arguments: 7222 // 7223 // Inputs: 7224 // c_rarg0 - byte[] source+offset 7225 // c_rarg1 - byte[] SHA.state 7226 // c_rarg2 - int block_size 7227 // c_rarg3 - int offset 7228 // c_rarg4 - int limit 7229 // 7230 address generate_sha3_implCompress_gpr(StubGenStubId stub_id) { 7231 bool multi_block; 7232 switch (stub_id) { 7233 case sha3_implCompress_id: 7234 multi_block = false; 7235 break; 7236 case sha3_implCompressMB_id: 7237 multi_block = true; 7238 break; 7239 default: 7240 ShouldNotReachHere(); 7241 } 7242 7243 static const uint64_t round_consts[24] = { 7244 0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL, 7245 0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L, 7246 0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL, 7247 0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL, 7248 0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L, 7249 0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L, 7250 0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L, 7251 0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L 7252 }; 7253 7254 __ align(CodeEntryAlignment); 7255 StubCodeMark mark(this, stub_id); 7256 address start = __ pc(); 7257 7258 Register buf = c_rarg0; 7259 Register state = c_rarg1; 7260 Register block_size = c_rarg2; 7261 Register ofs = c_rarg3; 7262 Register limit = c_rarg4; 7263 7264 // use r3.r17,r19..r28 to keep a0..a24. 
7265 // a0..a24 are respective locals from SHA3.java 7266 Register a0 = r25, 7267 a1 = r26, 7268 a2 = r27, 7269 a3 = r3, 7270 a4 = r4, 7271 a5 = r5, 7272 a6 = r6, 7273 a7 = r7, 7274 a8 = rscratch1, // r8 7275 a9 = rscratch2, // r9 7276 a10 = r10, 7277 a11 = r11, 7278 a12 = r12, 7279 a13 = r13, 7280 a14 = r14, 7281 a15 = r15, 7282 a16 = r16, 7283 a17 = r17, 7284 a18 = r28, 7285 a19 = r19, 7286 a20 = r20, 7287 a21 = r21, 7288 a22 = r22, 7289 a23 = r23, 7290 a24 = r24; 7291 7292 Register tmp0 = block_size, tmp1 = buf, tmp2 = state, tmp3 = r30; 7293 7294 Label sha3_loop, rounds24_preloop, loop_body; 7295 Label sha3_512_or_sha3_384, shake128; 7296 7297 bool can_use_r18 = false; 7298 #ifndef R18_RESERVED 7299 can_use_r18 = true; 7300 #endif 7301 bool can_use_fp = !PreserveFramePointer; 7302 7303 __ enter(); 7304 7305 // save almost all yet unsaved gpr registers on stack 7306 __ str(block_size, __ pre(sp, -128)); 7307 if (multi_block) { 7308 __ stpw(ofs, limit, Address(sp, 8)); 7309 } 7310 // 8 bytes at sp+16 will be used to keep buf 7311 __ stp(r19, r20, Address(sp, 32)); 7312 __ stp(r21, r22, Address(sp, 48)); 7313 __ stp(r23, r24, Address(sp, 64)); 7314 __ stp(r25, r26, Address(sp, 80)); 7315 __ stp(r27, r28, Address(sp, 96)); 7316 if (can_use_r18 && can_use_fp) { 7317 __ stp(r18_tls, state, Address(sp, 112)); 7318 } else { 7319 __ str(state, Address(sp, 112)); 7320 } 7321 7322 // begin sha3 calculations: loading a0..a24 from state arrary 7323 __ ldp(a0, a1, state); 7324 __ ldp(a2, a3, Address(state, 16)); 7325 __ ldp(a4, a5, Address(state, 32)); 7326 __ ldp(a6, a7, Address(state, 48)); 7327 __ ldp(a8, a9, Address(state, 64)); 7328 __ ldp(a10, a11, Address(state, 80)); 7329 __ ldp(a12, a13, Address(state, 96)); 7330 __ ldp(a14, a15, Address(state, 112)); 7331 __ ldp(a16, a17, Address(state, 128)); 7332 __ ldp(a18, a19, Address(state, 144)); 7333 __ ldp(a20, a21, Address(state, 160)); 7334 __ ldp(a22, a23, Address(state, 176)); 7335 __ ldr(a24, Address(state, 192)); 7336 7337 __ BIND(sha3_loop); 7338 7339 // load input 7340 __ ldp(tmp3, tmp2, __ post(buf, 16)); 7341 __ eor(a0, a0, tmp3); 7342 __ eor(a1, a1, tmp2); 7343 __ ldp(tmp3, tmp2, __ post(buf, 16)); 7344 __ eor(a2, a2, tmp3); 7345 __ eor(a3, a3, tmp2); 7346 __ ldp(tmp3, tmp2, __ post(buf, 16)); 7347 __ eor(a4, a4, tmp3); 7348 __ eor(a5, a5, tmp2); 7349 __ ldr(tmp3, __ post(buf, 8)); 7350 __ eor(a6, a6, tmp3); 7351 7352 // block_size == 72, SHA3-512; block_size == 104, SHA3-384 7353 __ tbz(block_size, 7, sha3_512_or_sha3_384); 7354 7355 __ ldp(tmp3, tmp2, __ post(buf, 16)); 7356 __ eor(a7, a7, tmp3); 7357 __ eor(a8, a8, tmp2); 7358 __ ldp(tmp3, tmp2, __ post(buf, 16)); 7359 __ eor(a9, a9, tmp3); 7360 __ eor(a10, a10, tmp2); 7361 __ ldp(tmp3, tmp2, __ post(buf, 16)); 7362 __ eor(a11, a11, tmp3); 7363 __ eor(a12, a12, tmp2); 7364 __ ldp(tmp3, tmp2, __ post(buf, 16)); 7365 __ eor(a13, a13, tmp3); 7366 __ eor(a14, a14, tmp2); 7367 __ ldp(tmp3, tmp2, __ post(buf, 16)); 7368 __ eor(a15, a15, tmp3); 7369 __ eor(a16, a16, tmp2); 7370 7371 // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256 7372 __ andw(tmp2, block_size, 48); 7373 __ cbzw(tmp2, rounds24_preloop); 7374 __ tbnz(block_size, 5, shake128); 7375 // block_size == 144, bit5 == 0, SHA3-244 7376 __ ldr(tmp3, __ post(buf, 8)); 7377 __ eor(a17, a17, tmp3); 7378 __ b(rounds24_preloop); 7379 7380 __ BIND(shake128); 7381 __ ldp(tmp3, tmp2, __ post(buf, 16)); 7382 __ eor(a17, a17, tmp3); 7383 __ eor(a18, a18, tmp2); 7384 __ ldp(tmp3, tmp2, __ post(buf, 16)); 7385 __ eor(a19, a19, 
tmp3); 7386 __ eor(a20, a20, tmp2); 7387 __ b(rounds24_preloop); // block_size == 168, SHAKE128 7388 7389 __ BIND(sha3_512_or_sha3_384); 7390 __ ldp(tmp3, tmp2, __ post(buf, 16)); 7391 __ eor(a7, a7, tmp3); 7392 __ eor(a8, a8, tmp2); 7393 __ tbz(block_size, 5, rounds24_preloop); // SHA3-512 7394 7395 // SHA3-384 7396 __ ldp(tmp3, tmp2, __ post(buf, 16)); 7397 __ eor(a9, a9, tmp3); 7398 __ eor(a10, a10, tmp2); 7399 __ ldp(tmp3, tmp2, __ post(buf, 16)); 7400 __ eor(a11, a11, tmp3); 7401 __ eor(a12, a12, tmp2); 7402 7403 __ BIND(rounds24_preloop); 7404 __ fmovs(v0, 24.0); // float loop counter, 7405 __ fmovs(v1, 1.0); // exact representation 7406 7407 __ str(buf, Address(sp, 16)); 7408 __ lea(tmp3, ExternalAddress((address) round_consts)); 7409 7410 __ BIND(loop_body); 7411 keccak_round_gpr(can_use_fp, can_use_r18, tmp3, 7412 a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, 7413 a13, a14, a15, a16, a17, a18, a19, a20, a21, a22, a23, a24, 7414 tmp0, tmp1, tmp2); 7415 __ fsubs(v0, v0, v1); 7416 __ fcmps(v0, 0.0); 7417 __ br(__ NE, loop_body); 7418 7419 if (multi_block) { 7420 __ ldrw(block_size, sp); // block_size 7421 __ ldpw(tmp2, tmp1, Address(sp, 8)); // offset, limit 7422 __ addw(tmp2, tmp2, block_size); 7423 __ cmpw(tmp2, tmp1); 7424 __ strw(tmp2, Address(sp, 8)); // store offset in case we're jumping 7425 __ ldr(buf, Address(sp, 16)); // restore buf in case we're jumping 7426 __ br(Assembler::LE, sha3_loop); 7427 __ movw(c_rarg0, tmp2); // return offset 7428 } 7429 if (can_use_fp && can_use_r18) { 7430 __ ldp(r18_tls, state, Address(sp, 112)); 7431 } else { 7432 __ ldr(state, Address(sp, 112)); 7433 } 7434 // save calculated sha3 state 7435 __ stp(a0, a1, Address(state)); 7436 __ stp(a2, a3, Address(state, 16)); 7437 __ stp(a4, a5, Address(state, 32)); 7438 __ stp(a6, a7, Address(state, 48)); 7439 __ stp(a8, a9, Address(state, 64)); 7440 __ stp(a10, a11, Address(state, 80)); 7441 __ stp(a12, a13, Address(state, 96)); 7442 __ stp(a14, a15, Address(state, 112)); 7443 __ stp(a16, a17, Address(state, 128)); 7444 __ stp(a18, a19, Address(state, 144)); 7445 __ stp(a20, a21, Address(state, 160)); 7446 __ stp(a22, a23, Address(state, 176)); 7447 __ str(a24, Address(state, 192)); 7448 7449 // restore required registers from stack 7450 __ ldp(r19, r20, Address(sp, 32)); 7451 __ ldp(r21, r22, Address(sp, 48)); 7452 __ ldp(r23, r24, Address(sp, 64)); 7453 __ ldp(r25, r26, Address(sp, 80)); 7454 __ ldp(r27, r28, Address(sp, 96)); 7455 if (can_use_fp && can_use_r18) { 7456 __ add(rfp, sp, 128); // leave() will copy rfp to sp below 7457 } // else no need to recalculate rfp, since it wasn't changed 7458 7459 __ leave(); 7460 7461 __ ret(lr); 7462 7463 return start; 7464 } 7465 7466 /** 7467 * Arguments: 7468 * 7469 * Inputs: 7470 * c_rarg0 - int crc 7471 * c_rarg1 - byte* buf 7472 * c_rarg2 - int length 7473 * 7474 * Output: 7475 * rax - int crc result 7476 */ 7477 address generate_updateBytesCRC32() { 7478 assert(UseCRC32Intrinsics, "what are we doing here?"); 7479 7480 __ align(CodeEntryAlignment); 7481 StubGenStubId stub_id = StubGenStubId::updateBytesCRC32_id; 7482 StubCodeMark mark(this, stub_id); 7483 7484 address start = __ pc(); 7485 7486 const Register crc = c_rarg0; // crc 7487 const Register buf = c_rarg1; // source java byte array address 7488 const Register len = c_rarg2; // length 7489 const Register table0 = c_rarg3; // crc_table address 7490 const Register table1 = c_rarg4; 7491 const Register table2 = c_rarg5; 7492 const Register table3 = c_rarg6; 7493 const Register tmp3 = 
c_rarg7; 7494 7495 BLOCK_COMMENT("Entry:"); 7496 __ enter(); // required for proper stackwalking of RuntimeStub frame 7497 7498 __ kernel_crc32(crc, buf, len, 7499 table0, table1, table2, table3, rscratch1, rscratch2, tmp3); 7500 7501 __ leave(); // required for proper stackwalking of RuntimeStub frame 7502 __ ret(lr); 7503 7504 return start; 7505 } 7506 7507 /** 7508 * Arguments: 7509 * 7510 * Inputs: 7511 * c_rarg0 - int crc 7512 * c_rarg1 - byte* buf 7513 * c_rarg2 - int length 7514 * c_rarg3 - int* table 7515 * 7516 * Output: 7517 * r0 - int crc result 7518 */ 7519 address generate_updateBytesCRC32C() { 7520 assert(UseCRC32CIntrinsics, "what are we doing here?"); 7521 7522 __ align(CodeEntryAlignment); 7523 StubGenStubId stub_id = StubGenStubId::updateBytesCRC32C_id; 7524 StubCodeMark mark(this, stub_id); 7525 7526 address start = __ pc(); 7527 7528 const Register crc = c_rarg0; // crc 7529 const Register buf = c_rarg1; // source java byte array address 7530 const Register len = c_rarg2; // length 7531 const Register table0 = c_rarg3; // crc_table address 7532 const Register table1 = c_rarg4; 7533 const Register table2 = c_rarg5; 7534 const Register table3 = c_rarg6; 7535 const Register tmp3 = c_rarg7; 7536 7537 BLOCK_COMMENT("Entry:"); 7538 __ enter(); // required for proper stackwalking of RuntimeStub frame 7539 7540 __ kernel_crc32c(crc, buf, len, 7541 table0, table1, table2, table3, rscratch1, rscratch2, tmp3); 7542 7543 __ leave(); // required for proper stackwalking of RuntimeStub frame 7544 __ ret(lr); 7545 7546 return start; 7547 } 7548 7549 /*** 7550 * Arguments: 7551 * 7552 * Inputs: 7553 * c_rarg0 - int adler 7554 * c_rarg1 - byte* buff 7555 * c_rarg2 - int len 7556 * 7557 * Output: 7558 * c_rarg0 - int adler result 7559 */ 7560 address generate_updateBytesAdler32() { 7561 __ align(CodeEntryAlignment); 7562 StubGenStubId stub_id = StubGenStubId::updateBytesAdler32_id; 7563 StubCodeMark mark(this, stub_id); 7564 address start = __ pc(); 7565 7566 Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1; 7567 7568 // Aliases 7569 Register adler = c_rarg0; 7570 Register s1 = c_rarg0; 7571 Register s2 = c_rarg3; 7572 Register buff = c_rarg1; 7573 Register len = c_rarg2; 7574 Register nmax = r4; 7575 Register base = r5; 7576 Register count = r6; 7577 Register temp0 = rscratch1; 7578 Register temp1 = rscratch2; 7579 FloatRegister vbytes = v0; 7580 FloatRegister vs1acc = v1; 7581 FloatRegister vs2acc = v2; 7582 FloatRegister vtable = v3; 7583 7584 // Max number of bytes we can process before having to take the mod 7585 // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 7586 uint64_t BASE = 0xfff1; 7587 uint64_t NMAX = 0x15B0; 7588 7589 __ mov(base, BASE); 7590 __ mov(nmax, NMAX); 7591 7592 // Load accumulation coefficients for the upper 16 bits 7593 __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table)); 7594 __ ld1(vtable, __ T16B, Address(temp0)); 7595 7596 // s1 is initialized to the lower 16 bits of adler 7597 // s2 is initialized to the upper 16 bits of adler 7598 __ ubfx(s2, adler, 16, 16); // s2 = ((adler >> 16) & 0xffff) 7599 __ uxth(s1, adler); // s1 = (adler & 0xffff) 7600 7601 // The pipelined loop needs at least 16 elements for 1 iteration 7602 // It does check this, but it is more effective to skip to the cleanup loop 7603 __ cmp(len, (u1)16); 7604 __ br(Assembler::HS, L_nmax); 7605 __ cbz(len, L_combine); 7606 7607 __ bind(L_simple_by1_loop); 7608 __ 
ldrb(temp0, Address(__ post(buff, 1))); 7609 __ add(s1, s1, temp0); 7610 __ add(s2, s2, s1); 7611 __ subs(len, len, 1); 7612 __ br(Assembler::HI, L_simple_by1_loop); 7613 7614 // s1 = s1 % BASE 7615 __ subs(temp0, s1, base); 7616 __ csel(s1, temp0, s1, Assembler::HS); 7617 7618 // s2 = s2 % BASE 7619 __ lsr(temp0, s2, 16); 7620 __ lsl(temp1, temp0, 4); 7621 __ sub(temp1, temp1, temp0); 7622 __ add(s2, temp1, s2, ext::uxth); 7623 7624 __ subs(temp0, s2, base); 7625 __ csel(s2, temp0, s2, Assembler::HS); 7626 7627 __ b(L_combine); 7628 7629 __ bind(L_nmax); 7630 __ subs(len, len, nmax); 7631 __ sub(count, nmax, 16); 7632 __ br(Assembler::LO, L_by16); 7633 7634 __ bind(L_nmax_loop); 7635 7636 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1, 7637 vbytes, vs1acc, vs2acc, vtable); 7638 7639 __ subs(count, count, 16); 7640 __ br(Assembler::HS, L_nmax_loop); 7641 7642 // s1 = s1 % BASE 7643 __ lsr(temp0, s1, 16); 7644 __ lsl(temp1, temp0, 4); 7645 __ sub(temp1, temp1, temp0); 7646 __ add(temp1, temp1, s1, ext::uxth); 7647 7648 __ lsr(temp0, temp1, 16); 7649 __ lsl(s1, temp0, 4); 7650 __ sub(s1, s1, temp0); 7651 __ add(s1, s1, temp1, ext:: uxth); 7652 7653 __ subs(temp0, s1, base); 7654 __ csel(s1, temp0, s1, Assembler::HS); 7655 7656 // s2 = s2 % BASE 7657 __ lsr(temp0, s2, 16); 7658 __ lsl(temp1, temp0, 4); 7659 __ sub(temp1, temp1, temp0); 7660 __ add(temp1, temp1, s2, ext::uxth); 7661 7662 __ lsr(temp0, temp1, 16); 7663 __ lsl(s2, temp0, 4); 7664 __ sub(s2, s2, temp0); 7665 __ add(s2, s2, temp1, ext:: uxth); 7666 7667 __ subs(temp0, s2, base); 7668 __ csel(s2, temp0, s2, Assembler::HS); 7669 7670 __ subs(len, len, nmax); 7671 __ sub(count, nmax, 16); 7672 __ br(Assembler::HS, L_nmax_loop); 7673 7674 __ bind(L_by16); 7675 __ adds(len, len, count); 7676 __ br(Assembler::LO, L_by1); 7677 7678 __ bind(L_by16_loop); 7679 7680 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1, 7681 vbytes, vs1acc, vs2acc, vtable); 7682 7683 __ subs(len, len, 16); 7684 __ br(Assembler::HS, L_by16_loop); 7685 7686 __ bind(L_by1); 7687 __ adds(len, len, 15); 7688 __ br(Assembler::LO, L_do_mod); 7689 7690 __ bind(L_by1_loop); 7691 __ ldrb(temp0, Address(__ post(buff, 1))); 7692 __ add(s1, temp0, s1); 7693 __ add(s2, s2, s1); 7694 __ subs(len, len, 1); 7695 __ br(Assembler::HS, L_by1_loop); 7696 7697 __ bind(L_do_mod); 7698 // s1 = s1 % BASE 7699 __ lsr(temp0, s1, 16); 7700 __ lsl(temp1, temp0, 4); 7701 __ sub(temp1, temp1, temp0); 7702 __ add(temp1, temp1, s1, ext::uxth); 7703 7704 __ lsr(temp0, temp1, 16); 7705 __ lsl(s1, temp0, 4); 7706 __ sub(s1, s1, temp0); 7707 __ add(s1, s1, temp1, ext:: uxth); 7708 7709 __ subs(temp0, s1, base); 7710 __ csel(s1, temp0, s1, Assembler::HS); 7711 7712 // s2 = s2 % BASE 7713 __ lsr(temp0, s2, 16); 7714 __ lsl(temp1, temp0, 4); 7715 __ sub(temp1, temp1, temp0); 7716 __ add(temp1, temp1, s2, ext::uxth); 7717 7718 __ lsr(temp0, temp1, 16); 7719 __ lsl(s2, temp0, 4); 7720 __ sub(s2, s2, temp0); 7721 __ add(s2, s2, temp1, ext:: uxth); 7722 7723 __ subs(temp0, s2, base); 7724 __ csel(s2, temp0, s2, Assembler::HS); 7725 7726 // Combine lower bits and higher bits 7727 __ bind(L_combine); 7728 __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16) 7729 7730 __ ret(lr); 7731 7732 return start; 7733 } 7734 7735 void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff, 7736 Register temp0, Register temp1, FloatRegister vbytes, 7737 FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) { 7738 // Below is a vectorized 
implementation of updating s1 and s2 for 16 bytes. 7739 // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration. 7740 // In non-vectorized code, we update s1 and s2 as: 7741 // s1 <- s1 + b1 7742 // s2 <- s2 + s1 7743 // s1 <- s1 + b2 7744 // s2 <- s2 + b1 7745 // ... 7746 // s1 <- s1 + b16 7747 // s2 <- s2 + s1 7748 // Putting above assignments together, we have: 7749 // s1_new = s1 + b1 + b2 + ... + b16 7750 // s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16) 7751 // = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1) 7752 // = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1) 7753 __ ld1(vbytes, __ T16B, Address(__ post(buff, 16))); 7754 7755 // s2 = s2 + s1 * 16 7756 __ add(s2, s2, s1, Assembler::LSL, 4); 7757 7758 // vs1acc = b1 + b2 + b3 + ... + b16 7759 // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1) 7760 __ umullv(vs2acc, __ T8B, vtable, vbytes); 7761 __ umlalv(vs2acc, __ T16B, vtable, vbytes); 7762 __ uaddlv(vs1acc, __ T16B, vbytes); 7763 __ uaddlv(vs2acc, __ T8H, vs2acc); 7764 7765 // s1 = s1 + vs1acc, s2 = s2 + vs2acc 7766 __ fmovd(temp0, vs1acc); 7767 __ fmovd(temp1, vs2acc); 7768 __ add(s1, s1, temp0); 7769 __ add(s2, s2, temp1); 7770 } 7771 7772 /** 7773 * Arguments: 7774 * 7775 * Input: 7776 * c_rarg0 - x address 7777 * c_rarg1 - x length 7778 * c_rarg2 - y address 7779 * c_rarg3 - y length 7780 * c_rarg4 - z address 7781 */ 7782 address generate_multiplyToLen() { 7783 __ align(CodeEntryAlignment); 7784 StubGenStubId stub_id = StubGenStubId::multiplyToLen_id; 7785 StubCodeMark mark(this, stub_id); 7786 7787 address start = __ pc(); 7788 const Register x = r0; 7789 const Register xlen = r1; 7790 const Register y = r2; 7791 const Register ylen = r3; 7792 const Register z = r4; 7793 7794 const Register tmp0 = r5; 7795 const Register tmp1 = r10; 7796 const Register tmp2 = r11; 7797 const Register tmp3 = r12; 7798 const Register tmp4 = r13; 7799 const Register tmp5 = r14; 7800 const Register tmp6 = r15; 7801 const Register tmp7 = r16; 7802 7803 BLOCK_COMMENT("Entry:"); 7804 __ enter(); // required for proper stackwalking of RuntimeStub frame 7805 __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 7806 __ leave(); // required for proper stackwalking of RuntimeStub frame 7807 __ ret(lr); 7808 7809 return start; 7810 } 7811 7812 address generate_squareToLen() { 7813 // squareToLen algorithm for sizes 1..127 described in java code works 7814 // faster than multiply_to_len on some CPUs and slower on others, but 7815 // multiply_to_len shows a bit better overall results 7816 __ align(CodeEntryAlignment); 7817 StubGenStubId stub_id = StubGenStubId::squareToLen_id; 7818 StubCodeMark mark(this, stub_id); 7819 address start = __ pc(); 7820 7821 const Register x = r0; 7822 const Register xlen = r1; 7823 const Register z = r2; 7824 const Register y = r4; // == x 7825 const Register ylen = r5; // == xlen 7826 7827 const Register tmp0 = r3; 7828 const Register tmp1 = r10; 7829 const Register tmp2 = r11; 7830 const Register tmp3 = r12; 7831 const Register tmp4 = r13; 7832 const Register tmp5 = r14; 7833 const Register tmp6 = r15; 7834 const Register tmp7 = r16; 7835 7836 RegSet spilled_regs = RegSet::of(y, ylen); 7837 BLOCK_COMMENT("Entry:"); 7838 __ enter(); 7839 __ push(spilled_regs, sp); 7840 __ mov(y, x); 7841 __ mov(ylen, xlen); 7842 __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 7843 __ pop(spilled_regs, sp); 7844 __ leave(); 7845 __ ret(lr); 
7846 return start; 7847 } 7848 7849 address generate_mulAdd() { 7850 __ align(CodeEntryAlignment); 7851 StubGenStubId stub_id = StubGenStubId::mulAdd_id; 7852 StubCodeMark mark(this, stub_id); 7853 7854 address start = __ pc(); 7855 7856 const Register out = r0; 7857 const Register in = r1; 7858 const Register offset = r2; 7859 const Register len = r3; 7860 const Register k = r4; 7861 7862 BLOCK_COMMENT("Entry:"); 7863 __ enter(); 7864 __ mul_add(out, in, offset, len, k); 7865 __ leave(); 7866 __ ret(lr); 7867 7868 return start; 7869 } 7870 7871 // Arguments: 7872 // 7873 // Input: 7874 // c_rarg0 - newArr address 7875 // c_rarg1 - oldArr address 7876 // c_rarg2 - newIdx 7877 // c_rarg3 - shiftCount 7878 // c_rarg4 - numIter 7879 // 7880 address generate_bigIntegerRightShift() { 7881 __ align(CodeEntryAlignment); 7882 StubGenStubId stub_id = StubGenStubId::bigIntegerRightShiftWorker_id; 7883 StubCodeMark mark(this, stub_id); 7884 address start = __ pc(); 7885 7886 Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit; 7887 7888 Register newArr = c_rarg0; 7889 Register oldArr = c_rarg1; 7890 Register newIdx = c_rarg2; 7891 Register shiftCount = c_rarg3; 7892 Register numIter = c_rarg4; 7893 Register idx = numIter; 7894 7895 Register newArrCur = rscratch1; 7896 Register shiftRevCount = rscratch2; 7897 Register oldArrCur = r13; 7898 Register oldArrNext = r14; 7899 7900 FloatRegister oldElem0 = v0; 7901 FloatRegister oldElem1 = v1; 7902 FloatRegister newElem = v2; 7903 FloatRegister shiftVCount = v3; 7904 FloatRegister shiftVRevCount = v4; 7905 7906 __ cbz(idx, Exit); 7907 7908 __ add(newArr, newArr, newIdx, Assembler::LSL, 2); 7909 7910 // left shift count 7911 __ movw(shiftRevCount, 32); 7912 __ subw(shiftRevCount, shiftRevCount, shiftCount); 7913 7914 // numIter too small to allow a 4-words SIMD loop, rolling back 7915 __ cmp(numIter, (u1)4); 7916 __ br(Assembler::LT, ShiftThree); 7917 7918 __ dup(shiftVCount, __ T4S, shiftCount); 7919 __ dup(shiftVRevCount, __ T4S, shiftRevCount); 7920 __ negr(shiftVCount, __ T4S, shiftVCount); 7921 7922 __ BIND(ShiftSIMDLoop); 7923 7924 // Calculate the load addresses 7925 __ sub(idx, idx, 4); 7926 __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2); 7927 __ add(newArrCur, newArr, idx, Assembler::LSL, 2); 7928 __ add(oldArrCur, oldArrNext, 4); 7929 7930 // Load 4 words and process 7931 __ ld1(oldElem0, __ T4S, Address(oldArrCur)); 7932 __ ld1(oldElem1, __ T4S, Address(oldArrNext)); 7933 __ ushl(oldElem0, __ T4S, oldElem0, shiftVCount); 7934 __ ushl(oldElem1, __ T4S, oldElem1, shiftVRevCount); 7935 __ orr(newElem, __ T16B, oldElem0, oldElem1); 7936 __ st1(newElem, __ T4S, Address(newArrCur)); 7937 7938 __ cmp(idx, (u1)4); 7939 __ br(Assembler::LT, ShiftTwoLoop); 7940 __ b(ShiftSIMDLoop); 7941 7942 __ BIND(ShiftTwoLoop); 7943 __ cbz(idx, Exit); 7944 __ cmp(idx, (u1)1); 7945 __ br(Assembler::EQ, ShiftOne); 7946 7947 // Calculate the load addresses 7948 __ sub(idx, idx, 2); 7949 __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2); 7950 __ add(newArrCur, newArr, idx, Assembler::LSL, 2); 7951 __ add(oldArrCur, oldArrNext, 4); 7952 7953 // Load 2 words and process 7954 __ ld1(oldElem0, __ T2S, Address(oldArrCur)); 7955 __ ld1(oldElem1, __ T2S, Address(oldArrNext)); 7956 __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount); 7957 __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount); 7958 __ orr(newElem, __ T8B, oldElem0, oldElem1); 7959 __ st1(newElem, __ T2S, Address(newArrCur)); 7960 __ b(ShiftTwoLoop); 7961 7962 __ BIND(ShiftThree); 7963 __ tbz(idx, 
1, ShiftOne); 7964 __ tbz(idx, 0, ShiftTwo); 7965 __ ldrw(r10, Address(oldArr, 12)); 7966 __ ldrw(r11, Address(oldArr, 8)); 7967 __ lsrvw(r10, r10, shiftCount); 7968 __ lslvw(r11, r11, shiftRevCount); 7969 __ orrw(r12, r10, r11); 7970 __ strw(r12, Address(newArr, 8)); 7971 7972 __ BIND(ShiftTwo); 7973 __ ldrw(r10, Address(oldArr, 8)); 7974 __ ldrw(r11, Address(oldArr, 4)); 7975 __ lsrvw(r10, r10, shiftCount); 7976 __ lslvw(r11, r11, shiftRevCount); 7977 __ orrw(r12, r10, r11); 7978 __ strw(r12, Address(newArr, 4)); 7979 7980 __ BIND(ShiftOne); 7981 __ ldrw(r10, Address(oldArr, 4)); 7982 __ ldrw(r11, Address(oldArr)); 7983 __ lsrvw(r10, r10, shiftCount); 7984 __ lslvw(r11, r11, shiftRevCount); 7985 __ orrw(r12, r10, r11); 7986 __ strw(r12, Address(newArr)); 7987 7988 __ BIND(Exit); 7989 __ ret(lr); 7990 7991 return start; 7992 } 7993 7994 // Arguments: 7995 // 7996 // Input: 7997 // c_rarg0 - newArr address 7998 // c_rarg1 - oldArr address 7999 // c_rarg2 - newIdx 8000 // c_rarg3 - shiftCount 8001 // c_rarg4 - numIter 8002 // 8003 address generate_bigIntegerLeftShift() { 8004 __ align(CodeEntryAlignment); 8005 StubGenStubId stub_id = StubGenStubId::bigIntegerLeftShiftWorker_id; 8006 StubCodeMark mark(this, stub_id); 8007 address start = __ pc(); 8008 8009 Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit; 8010 8011 Register newArr = c_rarg0; 8012 Register oldArr = c_rarg1; 8013 Register newIdx = c_rarg2; 8014 Register shiftCount = c_rarg3; 8015 Register numIter = c_rarg4; 8016 8017 Register shiftRevCount = rscratch1; 8018 Register oldArrNext = rscratch2; 8019 8020 FloatRegister oldElem0 = v0; 8021 FloatRegister oldElem1 = v1; 8022 FloatRegister newElem = v2; 8023 FloatRegister shiftVCount = v3; 8024 FloatRegister shiftVRevCount = v4; 8025 8026 __ cbz(numIter, Exit); 8027 8028 __ add(oldArrNext, oldArr, 4); 8029 __ add(newArr, newArr, newIdx, Assembler::LSL, 2); 8030 8031 // right shift count 8032 __ movw(shiftRevCount, 32); 8033 __ subw(shiftRevCount, shiftRevCount, shiftCount); 8034 8035 // numIter too small to allow a 4-words SIMD loop, rolling back 8036 __ cmp(numIter, (u1)4); 8037 __ br(Assembler::LT, ShiftThree); 8038 8039 __ dup(shiftVCount, __ T4S, shiftCount); 8040 __ dup(shiftVRevCount, __ T4S, shiftRevCount); 8041 __ negr(shiftVRevCount, __ T4S, shiftVRevCount); 8042 8043 __ BIND(ShiftSIMDLoop); 8044 8045 // load 4 words and process 8046 __ ld1(oldElem0, __ T4S, __ post(oldArr, 16)); 8047 __ ld1(oldElem1, __ T4S, __ post(oldArrNext, 16)); 8048 __ ushl(oldElem0, __ T4S, oldElem0, shiftVCount); 8049 __ ushl(oldElem1, __ T4S, oldElem1, shiftVRevCount); 8050 __ orr(newElem, __ T16B, oldElem0, oldElem1); 8051 __ st1(newElem, __ T4S, __ post(newArr, 16)); 8052 __ sub(numIter, numIter, 4); 8053 8054 __ cmp(numIter, (u1)4); 8055 __ br(Assembler::LT, ShiftTwoLoop); 8056 __ b(ShiftSIMDLoop); 8057 8058 __ BIND(ShiftTwoLoop); 8059 __ cbz(numIter, Exit); 8060 __ cmp(numIter, (u1)1); 8061 __ br(Assembler::EQ, ShiftOne); 8062 8063 // load 2 words and process 8064 __ ld1(oldElem0, __ T2S, __ post(oldArr, 8)); 8065 __ ld1(oldElem1, __ T2S, __ post(oldArrNext, 8)); 8066 __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount); 8067 __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount); 8068 __ orr(newElem, __ T8B, oldElem0, oldElem1); 8069 __ st1(newElem, __ T2S, __ post(newArr, 8)); 8070 __ sub(numIter, numIter, 2); 8071 __ b(ShiftTwoLoop); 8072 8073 __ BIND(ShiftThree); 8074 __ ldrw(r10, __ post(oldArr, 4)); 8075 __ ldrw(r11, __ post(oldArrNext, 4)); 8076 __ lslvw(r10, r10, 
shiftCount); 8077 __ lsrvw(r11, r11, shiftRevCount); 8078 __ orrw(r12, r10, r11); 8079 __ strw(r12, __ post(newArr, 4)); 8080 __ tbz(numIter, 1, Exit); 8081 __ tbz(numIter, 0, ShiftOne); 8082 8083 __ BIND(ShiftTwo); 8084 __ ldrw(r10, __ post(oldArr, 4)); 8085 __ ldrw(r11, __ post(oldArrNext, 4)); 8086 __ lslvw(r10, r10, shiftCount); 8087 __ lsrvw(r11, r11, shiftRevCount); 8088 __ orrw(r12, r10, r11); 8089 __ strw(r12, __ post(newArr, 4)); 8090 8091 __ BIND(ShiftOne); 8092 __ ldrw(r10, Address(oldArr)); 8093 __ ldrw(r11, Address(oldArrNext)); 8094 __ lslvw(r10, r10, shiftCount); 8095 __ lsrvw(r11, r11, shiftRevCount); 8096 __ orrw(r12, r10, r11); 8097 __ strw(r12, Address(newArr)); 8098 8099 __ BIND(Exit); 8100 __ ret(lr); 8101 8102 return start; 8103 } 8104 8105 address generate_count_positives(address &count_positives_long) { 8106 const u1 large_loop_size = 64; 8107 const uint64_t UPPER_BIT_MASK=0x8080808080808080; 8108 int dcache_line = VM_Version::dcache_line_size(); 8109 8110 Register ary1 = r1, len = r2, result = r0; 8111 8112 __ align(CodeEntryAlignment); 8113 8114 StubGenStubId stub_id = StubGenStubId::count_positives_id; 8115 StubCodeMark mark(this, stub_id); 8116 8117 address entry = __ pc(); 8118 8119 __ enter(); 8120 // precondition: a copy of len is already in result 8121 // __ mov(result, len); 8122 8123 Label RET_ADJUST, RET_ADJUST_16, RET_ADJUST_LONG, RET_NO_POP, RET_LEN, ALIGNED, LOOP16, CHECK_16, 8124 LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL; 8125 8126 __ cmp(len, (u1)15); 8127 __ br(Assembler::GT, LEN_OVER_15); 8128 // The only case when execution falls into this code is when pointer is near 8129 // the end of memory page and we have to avoid reading next page 8130 __ add(ary1, ary1, len); 8131 __ subs(len, len, 8); 8132 __ br(Assembler::GT, LEN_OVER_8); 8133 __ ldr(rscratch2, Address(ary1, -8)); 8134 __ sub(rscratch1, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes. 
8135 __ lsrv(rscratch2, rscratch2, rscratch1); 8136 __ tst(rscratch2, UPPER_BIT_MASK); 8137 __ csel(result, zr, result, Assembler::NE); 8138 __ leave(); 8139 __ ret(lr); 8140 __ bind(LEN_OVER_8); 8141 __ ldp(rscratch1, rscratch2, Address(ary1, -16)); 8142 __ sub(len, len, 8); // no data dep., then sub can be executed while loading 8143 __ tst(rscratch2, UPPER_BIT_MASK); 8144 __ br(Assembler::NE, RET_NO_POP); 8145 __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes 8146 __ lsrv(rscratch1, rscratch1, rscratch2); 8147 __ tst(rscratch1, UPPER_BIT_MASK); 8148 __ bind(RET_NO_POP); 8149 __ csel(result, zr, result, Assembler::NE); 8150 __ leave(); 8151 __ ret(lr); 8152 8153 Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10; 8154 const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6; 8155 8156 count_positives_long = __ pc(); // 2nd entry point 8157 8158 __ enter(); 8159 8160 __ bind(LEN_OVER_15); 8161 __ push(spilled_regs, sp); 8162 __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment 8163 __ cbz(rscratch2, ALIGNED); 8164 __ ldp(tmp6, tmp1, Address(ary1)); 8165 __ mov(tmp5, 16); 8166 __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address 8167 __ add(ary1, ary1, rscratch1); 8168 __ orr(tmp6, tmp6, tmp1); 8169 __ tst(tmp6, UPPER_BIT_MASK); 8170 __ br(Assembler::NE, RET_ADJUST); 8171 __ sub(len, len, rscratch1); 8172 8173 __ bind(ALIGNED); 8174 __ cmp(len, large_loop_size); 8175 __ br(Assembler::LT, CHECK_16); 8176 // Perform 16-byte load as early return in pre-loop to handle situation 8177 // when initially aligned large array has negative values at starting bytes, 8178 // so LARGE_LOOP would do 4 reads instead of 1 (in worst case), which is 8179 // slower. Cases with negative bytes further ahead won't be affected that 8180 // much. In fact, it'll be faster due to early loads, less instructions and 8181 // less branches in LARGE_LOOP. 8182 __ ldp(tmp6, tmp1, Address(__ post(ary1, 16))); 8183 __ sub(len, len, 16); 8184 __ orr(tmp6, tmp6, tmp1); 8185 __ tst(tmp6, UPPER_BIT_MASK); 8186 __ br(Assembler::NE, RET_ADJUST_16); 8187 __ cmp(len, large_loop_size); 8188 __ br(Assembler::LT, CHECK_16); 8189 8190 if (SoftwarePrefetchHintDistance >= 0 8191 && SoftwarePrefetchHintDistance >= dcache_line) { 8192 // initial prefetch 8193 __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line)); 8194 } 8195 __ bind(LARGE_LOOP); 8196 if (SoftwarePrefetchHintDistance >= 0) { 8197 __ prfm(Address(ary1, SoftwarePrefetchHintDistance)); 8198 } 8199 // Issue load instructions first, since it can save few CPU/MEM cycles, also 8200 // instead of 4 triples of "orr(...), addr(...);cbnz(...);" (for each ldp) 8201 // better generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...) which saves 3 8202 // instructions per cycle and have less branches, but this approach disables 8203 // early return, thus, all 64 bytes are loaded and checked every time. 
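// Editor's note: in scalar terms the 64-byte check below is roughly
// equivalent to the following (illustration only; acc and k are
// schematic names, not registers used by the stub):
//
//   uint64_t acc = 0;
//   for (int k = 0; k < 8; k++) {      // 4 ldp's == 8 dwords == 64 bytes
//     acc |= ((uint64_t*)ary1)[k];
//   }
//   if (acc & UPPER_BIT_MASK) {        // some byte has its top bit set,
//     goto RET_ADJUST_LONG;            // i.e. a negative byte was seen
//   }
//   ary1 += 64; len -= 64;             // otherwise keep looping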
8204 __ ldp(tmp2, tmp3, Address(ary1)); 8205 __ ldp(tmp4, tmp5, Address(ary1, 16)); 8206 __ ldp(rscratch1, rscratch2, Address(ary1, 32)); 8207 __ ldp(tmp6, tmp1, Address(ary1, 48)); 8208 __ add(ary1, ary1, large_loop_size); 8209 __ sub(len, len, large_loop_size); 8210 __ orr(tmp2, tmp2, tmp3); 8211 __ orr(tmp4, tmp4, tmp5); 8212 __ orr(rscratch1, rscratch1, rscratch2); 8213 __ orr(tmp6, tmp6, tmp1); 8214 __ orr(tmp2, tmp2, tmp4); 8215 __ orr(rscratch1, rscratch1, tmp6); 8216 __ orr(tmp2, tmp2, rscratch1); 8217 __ tst(tmp2, UPPER_BIT_MASK); 8218 __ br(Assembler::NE, RET_ADJUST_LONG); 8219 __ cmp(len, large_loop_size); 8220 __ br(Assembler::GE, LARGE_LOOP); 8221 8222 __ bind(CHECK_16); // small 16-byte load pre-loop 8223 __ cmp(len, (u1)16); 8224 __ br(Assembler::LT, POST_LOOP16); 8225 8226 __ bind(LOOP16); // small 16-byte load loop 8227 __ ldp(tmp2, tmp3, Address(__ post(ary1, 16))); 8228 __ sub(len, len, 16); 8229 __ orr(tmp2, tmp2, tmp3); 8230 __ tst(tmp2, UPPER_BIT_MASK); 8231 __ br(Assembler::NE, RET_ADJUST_16); 8232 __ cmp(len, (u1)16); 8233 __ br(Assembler::GE, LOOP16); // 16-byte load loop end 8234 8235 __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally 8236 __ cmp(len, (u1)8); 8237 __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL); 8238 __ ldr(tmp3, Address(__ post(ary1, 8))); 8239 __ tst(tmp3, UPPER_BIT_MASK); 8240 __ br(Assembler::NE, RET_ADJUST); 8241 __ sub(len, len, 8); 8242 8243 __ bind(POST_LOOP16_LOAD_TAIL); 8244 __ cbz(len, RET_LEN); // Can't shift left by 64 when len==0 8245 __ ldr(tmp1, Address(ary1)); 8246 __ mov(tmp2, 64); 8247 __ sub(tmp4, tmp2, len, __ LSL, 3); 8248 __ lslv(tmp1, tmp1, tmp4); 8249 __ tst(tmp1, UPPER_BIT_MASK); 8250 __ br(Assembler::NE, RET_ADJUST); 8251 // Fallthrough 8252 8253 __ bind(RET_LEN); 8254 __ pop(spilled_regs, sp); 8255 __ leave(); 8256 __ ret(lr); 8257 8258 // difference result - len is the count of guaranteed to be 8259 // positive bytes 8260 8261 __ bind(RET_ADJUST_LONG); 8262 __ add(len, len, (u1)(large_loop_size - 16)); 8263 __ bind(RET_ADJUST_16); 8264 __ add(len, len, 16); 8265 __ bind(RET_ADJUST); 8266 __ pop(spilled_regs, sp); 8267 __ leave(); 8268 __ sub(result, result, len); 8269 __ ret(lr); 8270 8271 return entry; 8272 } 8273 8274 void generate_large_array_equals_loop_nonsimd(int loopThreshold, 8275 bool usePrefetch, Label &NOT_EQUAL) { 8276 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 8277 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11, 8278 tmp7 = r12, tmp8 = r13; 8279 Label LOOP; 8280 8281 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 8282 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 8283 __ bind(LOOP); 8284 if (usePrefetch) { 8285 __ prfm(Address(a1, SoftwarePrefetchHintDistance)); 8286 __ prfm(Address(a2, SoftwarePrefetchHintDistance)); 8287 } 8288 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize))); 8289 __ eor(tmp1, tmp1, tmp2); 8290 __ eor(tmp3, tmp3, tmp4); 8291 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize))); 8292 __ orr(tmp1, tmp1, tmp3); 8293 __ cbnz(tmp1, NOT_EQUAL); 8294 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 8295 __ eor(tmp5, tmp5, tmp6); 8296 __ eor(tmp7, tmp7, tmp8); 8297 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 8298 __ orr(tmp5, tmp5, tmp7); 8299 __ cbnz(tmp5, NOT_EQUAL); 8300 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize))); 8301 __ eor(tmp1, tmp1, tmp2); 8302 __ eor(tmp3, tmp3, tmp4); 8303 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize))); 8304 __ orr(tmp1, tmp1, tmp3); 8305 __ 
cbnz(tmp1, NOT_EQUAL); 8306 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 8307 __ eor(tmp5, tmp5, tmp6); 8308 __ sub(cnt1, cnt1, 8 * wordSize); 8309 __ eor(tmp7, tmp7, tmp8); 8310 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 8311 // tmp6 is not used. MacroAssembler::subs is used here (rather than 8312 // cmp) because subs allows an unlimited range of immediate operand. 8313 __ subs(tmp6, cnt1, loopThreshold); 8314 __ orr(tmp5, tmp5, tmp7); 8315 __ cbnz(tmp5, NOT_EQUAL); 8316 __ br(__ GE, LOOP); 8317 // post-loop 8318 __ eor(tmp1, tmp1, tmp2); 8319 __ eor(tmp3, tmp3, tmp4); 8320 __ orr(tmp1, tmp1, tmp3); 8321 __ sub(cnt1, cnt1, 2 * wordSize); 8322 __ cbnz(tmp1, NOT_EQUAL); 8323 } 8324 8325 void generate_large_array_equals_loop_simd(int loopThreshold, 8326 bool usePrefetch, Label &NOT_EQUAL) { 8327 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 8328 tmp2 = rscratch2; 8329 Label LOOP; 8330 8331 __ bind(LOOP); 8332 if (usePrefetch) { 8333 __ prfm(Address(a1, SoftwarePrefetchHintDistance)); 8334 __ prfm(Address(a2, SoftwarePrefetchHintDistance)); 8335 } 8336 __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize))); 8337 __ sub(cnt1, cnt1, 8 * wordSize); 8338 __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize))); 8339 __ subs(tmp1, cnt1, loopThreshold); 8340 __ eor(v0, __ T16B, v0, v4); 8341 __ eor(v1, __ T16B, v1, v5); 8342 __ eor(v2, __ T16B, v2, v6); 8343 __ eor(v3, __ T16B, v3, v7); 8344 __ orr(v0, __ T16B, v0, v1); 8345 __ orr(v1, __ T16B, v2, v3); 8346 __ orr(v0, __ T16B, v0, v1); 8347 __ umov(tmp1, v0, __ D, 0); 8348 __ umov(tmp2, v0, __ D, 1); 8349 __ orr(tmp1, tmp1, tmp2); 8350 __ cbnz(tmp1, NOT_EQUAL); 8351 __ br(__ GE, LOOP); 8352 } 8353 8354 // a1 = r1 - array1 address 8355 // a2 = r2 - array2 address 8356 // result = r0 - return value. Already contains "false" 8357 // cnt1 = r10 - amount of elements left to check, reduced by wordSize 8358 // r3-r5 are reserved temporary registers 8359 // Clobbers: v0-v7 when UseSIMDForArrayEquals, rscratch1, rscratch2 8360 address generate_large_array_equals() { 8361 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 8362 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11, 8363 tmp7 = r12, tmp8 = r13; 8364 Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP, 8365 SMALL_LOOP, POST_LOOP; 8366 const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16; 8367 // calculate if at least 32 prefetched bytes are used 8368 int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32; 8369 int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE); 8370 RegSet spilled_regs = RegSet::range(tmp6, tmp8); 8371 assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4, 8372 tmp5, tmp6, tmp7, tmp8); 8373 8374 __ align(CodeEntryAlignment); 8375 8376 StubGenStubId stub_id = StubGenStubId::large_array_equals_id; 8377 StubCodeMark mark(this, stub_id); 8378 8379 address entry = __ pc(); 8380 __ enter(); 8381 __ sub(cnt1, cnt1, wordSize); // first 8 bytes were loaded outside of stub 8382 // also advance pointers to use post-increment instead of pre-increment 8383 __ add(a1, a1, wordSize); 8384 __ add(a2, a2, wordSize); 8385 if (AvoidUnalignedAccesses) { 8386 // both implementations (SIMD/nonSIMD) are using relatively large load 8387 // instructions (ld1/ldp), which has huge penalty (up to x2 exec time) 8388 // on some CPUs in case of address is not at least 16-byte aligned. 
8389 // Arrays are 8-byte aligned currently, so, we can make additional 8-byte 8390 // load if needed at least for 1st address and make if 16-byte aligned. 8391 Label ALIGNED16; 8392 __ tbz(a1, 3, ALIGNED16); 8393 __ ldr(tmp1, Address(__ post(a1, wordSize))); 8394 __ ldr(tmp2, Address(__ post(a2, wordSize))); 8395 __ sub(cnt1, cnt1, wordSize); 8396 __ eor(tmp1, tmp1, tmp2); 8397 __ cbnz(tmp1, NOT_EQUAL_NO_POP); 8398 __ bind(ALIGNED16); 8399 } 8400 if (UseSIMDForArrayEquals) { 8401 if (SoftwarePrefetchHintDistance >= 0) { 8402 __ subs(tmp1, cnt1, prefetchLoopThreshold); 8403 __ br(__ LE, NO_PREFETCH_LARGE_LOOP); 8404 generate_large_array_equals_loop_simd(prefetchLoopThreshold, 8405 /* prfm = */ true, NOT_EQUAL); 8406 __ subs(zr, cnt1, nonPrefetchLoopThreshold); 8407 __ br(__ LT, TAIL); 8408 } 8409 __ bind(NO_PREFETCH_LARGE_LOOP); 8410 generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold, 8411 /* prfm = */ false, NOT_EQUAL); 8412 } else { 8413 __ push(spilled_regs, sp); 8414 if (SoftwarePrefetchHintDistance >= 0) { 8415 __ subs(tmp1, cnt1, prefetchLoopThreshold); 8416 __ br(__ LE, NO_PREFETCH_LARGE_LOOP); 8417 generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold, 8418 /* prfm = */ true, NOT_EQUAL); 8419 __ subs(zr, cnt1, nonPrefetchLoopThreshold); 8420 __ br(__ LT, TAIL); 8421 } 8422 __ bind(NO_PREFETCH_LARGE_LOOP); 8423 generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold, 8424 /* prfm = */ false, NOT_EQUAL); 8425 } 8426 __ bind(TAIL); 8427 __ cbz(cnt1, EQUAL); 8428 __ subs(cnt1, cnt1, wordSize); 8429 __ br(__ LE, POST_LOOP); 8430 __ bind(SMALL_LOOP); 8431 __ ldr(tmp1, Address(__ post(a1, wordSize))); 8432 __ ldr(tmp2, Address(__ post(a2, wordSize))); 8433 __ subs(cnt1, cnt1, wordSize); 8434 __ eor(tmp1, tmp1, tmp2); 8435 __ cbnz(tmp1, NOT_EQUAL); 8436 __ br(__ GT, SMALL_LOOP); 8437 __ bind(POST_LOOP); 8438 __ ldr(tmp1, Address(a1, cnt1)); 8439 __ ldr(tmp2, Address(a2, cnt1)); 8440 __ eor(tmp1, tmp1, tmp2); 8441 __ cbnz(tmp1, NOT_EQUAL); 8442 __ bind(EQUAL); 8443 __ mov(result, true); 8444 __ bind(NOT_EQUAL); 8445 if (!UseSIMDForArrayEquals) { 8446 __ pop(spilled_regs, sp); 8447 } 8448 __ bind(NOT_EQUAL_NO_POP); 8449 __ leave(); 8450 __ ret(lr); 8451 return entry; 8452 } 8453 8454 // result = r0 - return value. Contains initial hashcode value on entry. 
8455 // ary = r1 - array address 8456 // cnt = r2 - elements count 8457 // Clobbers: v0-v13, rscratch1, rscratch2 8458 address generate_large_arrays_hashcode(BasicType eltype) { 8459 const Register result = r0, ary = r1, cnt = r2; 8460 const FloatRegister vdata0 = v3, vdata1 = v2, vdata2 = v1, vdata3 = v0; 8461 const FloatRegister vmul0 = v4, vmul1 = v5, vmul2 = v6, vmul3 = v7; 8462 const FloatRegister vpow = v12; // powers of 31: <31^3, ..., 31^0> 8463 const FloatRegister vpowm = v13; 8464 8465 ARRAYS_HASHCODE_REGISTERS; 8466 8467 Label SMALL_LOOP, LARGE_LOOP_PREHEADER, LARGE_LOOP, TAIL, TAIL_SHORTCUT, BR_BASE; 8468 8469 unsigned int vf; // vectorization factor 8470 bool multiply_by_halves; 8471 Assembler::SIMD_Arrangement load_arrangement; 8472 switch (eltype) { 8473 case T_BOOLEAN: 8474 case T_BYTE: 8475 load_arrangement = Assembler::T8B; 8476 multiply_by_halves = true; 8477 vf = 8; 8478 break; 8479 case T_CHAR: 8480 case T_SHORT: 8481 load_arrangement = Assembler::T8H; 8482 multiply_by_halves = true; 8483 vf = 8; 8484 break; 8485 case T_INT: 8486 load_arrangement = Assembler::T4S; 8487 multiply_by_halves = false; 8488 vf = 4; 8489 break; 8490 default: 8491 ShouldNotReachHere(); 8492 } 8493 8494 // Unroll factor 8495 const unsigned uf = 4; 8496 8497 // Effective vectorization factor 8498 const unsigned evf = vf * uf; 8499 8500 __ align(CodeEntryAlignment); 8501 8502 StubGenStubId stub_id; 8503 switch (eltype) { 8504 case T_BOOLEAN: 8505 stub_id = StubGenStubId::large_arrays_hashcode_boolean_id; 8506 break; 8507 case T_BYTE: 8508 stub_id = StubGenStubId::large_arrays_hashcode_byte_id; 8509 break; 8510 case T_CHAR: 8511 stub_id = StubGenStubId::large_arrays_hashcode_char_id; 8512 break; 8513 case T_SHORT: 8514 stub_id = StubGenStubId::large_arrays_hashcode_short_id; 8515 break; 8516 case T_INT: 8517 stub_id = StubGenStubId::large_arrays_hashcode_int_id; 8518 break; 8519 default: 8520 stub_id = StubGenStubId::NO_STUBID; 8521 ShouldNotReachHere(); 8522 }; 8523 8524 StubCodeMark mark(this, stub_id); 8525 8526 address entry = __ pc(); 8527 __ enter(); 8528 8529 // Put 0-3'th powers of 31 into a single SIMD register together. The register will be used in 8530 // the SMALL and LARGE LOOPS' epilogues. The initialization is hoisted here and the register's 8531 // value shouldn't change throughout both loops. 8532 __ movw(rscratch1, intpow(31U, 3)); 8533 __ mov(vpow, Assembler::S, 0, rscratch1); 8534 __ movw(rscratch1, intpow(31U, 2)); 8535 __ mov(vpow, Assembler::S, 1, rscratch1); 8536 __ movw(rscratch1, intpow(31U, 1)); 8537 __ mov(vpow, Assembler::S, 2, rscratch1); 8538 __ movw(rscratch1, intpow(31U, 0)); 8539 __ mov(vpow, Assembler::S, 3, rscratch1); 8540 8541 __ mov(vmul0, Assembler::T16B, 0); 8542 __ mov(vmul0, Assembler::S, 3, result); 8543 8544 __ andr(rscratch2, cnt, (uf - 1) * vf); 8545 __ cbz(rscratch2, LARGE_LOOP_PREHEADER); 8546 8547 __ movw(rscratch1, intpow(31U, multiply_by_halves ? 
vf / 2 : vf)); 8548 __ mov(vpowm, Assembler::S, 0, rscratch1); 8549 8550 // SMALL LOOP 8551 __ bind(SMALL_LOOP); 8552 8553 __ ld1(vdata0, load_arrangement, Address(__ post(ary, vf * type2aelembytes(eltype)))); 8554 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0); 8555 __ subsw(rscratch2, rscratch2, vf); 8556 8557 if (load_arrangement == Assembler::T8B) { 8558 // Extend 8B to 8H to be able to use vector multiply 8559 // instructions 8560 assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H"); 8561 if (is_signed_subword_type(eltype)) { 8562 __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement); 8563 } else { 8564 __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement); 8565 } 8566 } 8567 8568 switch (load_arrangement) { 8569 case Assembler::T4S: 8570 __ addv(vmul0, load_arrangement, vmul0, vdata0); 8571 break; 8572 case Assembler::T8B: 8573 case Assembler::T8H: 8574 assert(is_subword_type(eltype), "subword type expected"); 8575 if (is_signed_subword_type(eltype)) { 8576 __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H); 8577 } else { 8578 __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H); 8579 } 8580 break; 8581 default: 8582 __ should_not_reach_here(); 8583 } 8584 8585 // Process the upper half of a vector 8586 if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) { 8587 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0); 8588 if (is_signed_subword_type(eltype)) { 8589 __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H); 8590 } else { 8591 __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H); 8592 } 8593 } 8594 8595 __ br(Assembler::HI, SMALL_LOOP); 8596 8597 // SMALL LOOP'S EPILOQUE 8598 __ lsr(rscratch2, cnt, exact_log2(evf)); 8599 __ cbnz(rscratch2, LARGE_LOOP_PREHEADER); 8600 8601 __ mulv(vmul0, Assembler::T4S, vmul0, vpow); 8602 __ addv(vmul0, Assembler::T4S, vmul0); 8603 __ umov(result, vmul0, Assembler::S, 0); 8604 8605 // TAIL 8606 __ bind(TAIL); 8607 8608 // The andr performs cnt % vf. The subtract shifted by 3 offsets past vf - 1 - (cnt % vf) pairs 8609 // of load + madd insns i.e. it only executes cnt % vf load + madd pairs. 8610 assert(is_power_of_2(vf), "can't use this value to calculate the jump target PC"); 8611 __ andr(rscratch2, cnt, vf - 1); 8612 __ bind(TAIL_SHORTCUT); 8613 __ adr(rscratch1, BR_BASE); 8614 // For Cortex-A53 offset is 4 because 2 nops are generated. 8615 __ sub(rscratch1, rscratch1, rscratch2, ext::uxtw, VM_Version::supports_a53mac() ? 4 : 3); 8616 __ movw(rscratch2, 0x1f); 8617 __ br(rscratch1); 8618 8619 for (size_t i = 0; i < vf - 1; ++i) { 8620 __ load(rscratch1, Address(__ post(ary, type2aelembytes(eltype))), 8621 eltype); 8622 __ maddw(result, result, rscratch2, rscratch1); 8623 // maddw generates an extra nop for Cortex-A53 (see maddw definition in macroAssembler). 8624 // Generate 2nd nop to have 4 instructions per iteration. 
8625 if (VM_Version::supports_a53mac()) { 8626 __ nop(); 8627 } 8628 } 8629 __ bind(BR_BASE); 8630 8631 __ leave(); 8632 __ ret(lr); 8633 8634 // LARGE LOOP 8635 __ bind(LARGE_LOOP_PREHEADER); 8636 8637 __ lsr(rscratch2, cnt, exact_log2(evf)); 8638 8639 if (multiply_by_halves) { 8640 // 31^4 - multiplier between lower and upper parts of a register 8641 __ movw(rscratch1, intpow(31U, vf / 2)); 8642 __ mov(vpowm, Assembler::S, 1, rscratch1); 8643 // 31^28 - remainder of the iteraion multiplier, 28 = 32 - 4 8644 __ movw(rscratch1, intpow(31U, evf - vf / 2)); 8645 __ mov(vpowm, Assembler::S, 0, rscratch1); 8646 } else { 8647 // 31^16 8648 __ movw(rscratch1, intpow(31U, evf)); 8649 __ mov(vpowm, Assembler::S, 0, rscratch1); 8650 } 8651 8652 __ mov(vmul3, Assembler::T16B, 0); 8653 __ mov(vmul2, Assembler::T16B, 0); 8654 __ mov(vmul1, Assembler::T16B, 0); 8655 8656 __ bind(LARGE_LOOP); 8657 8658 __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 0); 8659 __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 0); 8660 __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 0); 8661 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0); 8662 8663 __ ld1(vdata3, vdata2, vdata1, vdata0, load_arrangement, 8664 Address(__ post(ary, evf * type2aelembytes(eltype)))); 8665 8666 if (load_arrangement == Assembler::T8B) { 8667 // Extend 8B to 8H to be able to use vector multiply 8668 // instructions 8669 assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H"); 8670 if (is_signed_subword_type(eltype)) { 8671 __ sxtl(vdata3, Assembler::T8H, vdata3, load_arrangement); 8672 __ sxtl(vdata2, Assembler::T8H, vdata2, load_arrangement); 8673 __ sxtl(vdata1, Assembler::T8H, vdata1, load_arrangement); 8674 __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement); 8675 } else { 8676 __ uxtl(vdata3, Assembler::T8H, vdata3, load_arrangement); 8677 __ uxtl(vdata2, Assembler::T8H, vdata2, load_arrangement); 8678 __ uxtl(vdata1, Assembler::T8H, vdata1, load_arrangement); 8679 __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement); 8680 } 8681 } 8682 8683 switch (load_arrangement) { 8684 case Assembler::T4S: 8685 __ addv(vmul3, load_arrangement, vmul3, vdata3); 8686 __ addv(vmul2, load_arrangement, vmul2, vdata2); 8687 __ addv(vmul1, load_arrangement, vmul1, vdata1); 8688 __ addv(vmul0, load_arrangement, vmul0, vdata0); 8689 break; 8690 case Assembler::T8B: 8691 case Assembler::T8H: 8692 assert(is_subword_type(eltype), "subword type expected"); 8693 if (is_signed_subword_type(eltype)) { 8694 __ saddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H); 8695 __ saddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H); 8696 __ saddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H); 8697 __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H); 8698 } else { 8699 __ uaddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H); 8700 __ uaddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H); 8701 __ uaddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H); 8702 __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H); 8703 } 8704 break; 8705 default: 8706 __ should_not_reach_here(); 8707 } 8708 8709 // Process the upper half of a vector 8710 if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) { 8711 __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 1); 8712 __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 1); 8713 __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 1); 8714 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 1); 8715 if (is_signed_subword_type(eltype)) { 8716 __ saddwv2(vmul3, 
vmul3, Assembler::T4S, vdata3, Assembler::T8H); 8717 __ saddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H); 8718 __ saddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H); 8719 __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H); 8720 } else { 8721 __ uaddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H); 8722 __ uaddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H); 8723 __ uaddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H); 8724 __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H); 8725 } 8726 } 8727 8728 __ subsw(rscratch2, rscratch2, 1); 8729 __ br(Assembler::HI, LARGE_LOOP); 8730 8731 __ mulv(vmul3, Assembler::T4S, vmul3, vpow); 8732 __ addv(vmul3, Assembler::T4S, vmul3); 8733 __ umov(result, vmul3, Assembler::S, 0); 8734 8735 __ mov(rscratch2, intpow(31U, vf)); 8736 8737 __ mulv(vmul2, Assembler::T4S, vmul2, vpow); 8738 __ addv(vmul2, Assembler::T4S, vmul2); 8739 __ umov(rscratch1, vmul2, Assembler::S, 0); 8740 __ maddw(result, result, rscratch2, rscratch1); 8741 8742 __ mulv(vmul1, Assembler::T4S, vmul1, vpow); 8743 __ addv(vmul1, Assembler::T4S, vmul1); 8744 __ umov(rscratch1, vmul1, Assembler::S, 0); 8745 __ maddw(result, result, rscratch2, rscratch1); 8746 8747 __ mulv(vmul0, Assembler::T4S, vmul0, vpow); 8748 __ addv(vmul0, Assembler::T4S, vmul0); 8749 __ umov(rscratch1, vmul0, Assembler::S, 0); 8750 __ maddw(result, result, rscratch2, rscratch1); 8751 8752 __ andr(rscratch2, cnt, vf - 1); 8753 __ cbnz(rscratch2, TAIL_SHORTCUT); 8754 8755 __ leave(); 8756 __ ret(lr); 8757 8758 return entry; 8759 } 8760 8761 address generate_dsin_dcos(bool isCos) { 8762 __ align(CodeEntryAlignment); 8763 StubGenStubId stub_id = (isCos ? StubGenStubId::dcos_id : StubGenStubId::dsin_id); 8764 StubCodeMark mark(this, stub_id); 8765 address start = __ pc(); 8766 __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw, 8767 (address)StubRoutines::aarch64::_two_over_pi, 8768 (address)StubRoutines::aarch64::_pio2, 8769 (address)StubRoutines::aarch64::_dsin_coef, 8770 (address)StubRoutines::aarch64::_dcos_coef); 8771 return start; 8772 } 8773 8774 // code for comparing 16 characters of strings with Latin1 and Utf16 encoding 8775 void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1, 8776 Label &DIFF2) { 8777 Register cnt1 = r2, tmp2 = r11, tmp3 = r12; 8778 FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2; 8779 8780 __ ldrq(vtmp, Address(__ post(tmp2, 16))); 8781 __ ldr(tmpU, Address(__ post(cnt1, 8))); 8782 __ zip1(vtmp3, __ T16B, vtmp, vtmpZ); 8783 // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3 8784 8785 __ fmovd(tmpL, vtmp3); 8786 __ eor(rscratch2, tmp3, tmpL); 8787 __ cbnz(rscratch2, DIFF2); 8788 8789 __ ldr(tmp3, Address(__ post(cnt1, 8))); 8790 __ umov(tmpL, vtmp3, __ D, 1); 8791 __ eor(rscratch2, tmpU, tmpL); 8792 __ cbnz(rscratch2, DIFF1); 8793 8794 __ zip2(vtmp, __ T16B, vtmp, vtmpZ); 8795 __ ldr(tmpU, Address(__ post(cnt1, 8))); 8796 __ fmovd(tmpL, vtmp); 8797 __ eor(rscratch2, tmp3, tmpL); 8798 __ cbnz(rscratch2, DIFF2); 8799 8800 __ ldr(tmp3, Address(__ post(cnt1, 8))); 8801 __ umov(tmpL, vtmp, __ D, 1); 8802 __ eor(rscratch2, tmpU, tmpL); 8803 __ cbnz(rscratch2, DIFF1); 8804 } 8805 8806 // r0 = result 8807 // r1 = str1 8808 // r2 = cnt1 8809 // r3 = str2 8810 // r4 = cnt2 8811 // r10 = tmp1 8812 // r11 = tmp2 8813 address generate_compare_long_string_different_encoding(bool isLU) { 8814 __ align(CodeEntryAlignment); 8815 StubGenStubId stub_id = (isLU ? 
StubGenStubId::compare_long_string_LU_id : StubGenStubId::compare_long_string_UL_id); 8816 StubCodeMark mark(this, stub_id); 8817 address entry = __ pc(); 8818 Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2, 8819 DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH, 8820 LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2; 8821 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 8822 tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14; 8823 FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2; 8824 RegSet spilled_regs = RegSet::of(tmp3, tmp4); 8825 8826 int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2); 8827 8828 __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ); 8829 // cnt2 == amount of characters left to compare 8830 // Check already loaded first 4 symbols(vtmp and tmp2(LU)/tmp1(UL)) 8831 __ zip1(vtmp, __ T8B, vtmp, vtmpZ); 8832 __ add(str1, str1, isLU ? wordSize/2 : wordSize); 8833 __ add(str2, str2, isLU ? wordSize : wordSize/2); 8834 __ fmovd(isLU ? tmp1 : tmp2, vtmp); 8835 __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case. 8836 __ eor(rscratch2, tmp1, tmp2); 8837 __ mov(rscratch1, tmp2); 8838 __ cbnz(rscratch2, CALCULATE_DIFFERENCE); 8839 Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison 8840 tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison 8841 __ push(spilled_regs, sp); 8842 __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load 8843 __ mov(cnt1, isLU ? str2 : str1); // init the pointer to U next load 8844 8845 __ ldr(tmp3, Address(__ post(cnt1, 8))); 8846 8847 if (SoftwarePrefetchHintDistance >= 0) { 8848 __ subs(rscratch2, cnt2, prefetchLoopExitCondition); 8849 __ br(__ LT, NO_PREFETCH); 8850 __ bind(LARGE_LOOP_PREFETCH); 8851 __ prfm(Address(tmp2, SoftwarePrefetchHintDistance)); 8852 __ mov(tmp4, 2); 8853 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance)); 8854 __ bind(LARGE_LOOP_PREFETCH_REPEAT1); 8855 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 8856 __ subs(tmp4, tmp4, 1); 8857 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1); 8858 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance)); 8859 __ mov(tmp4, 2); 8860 __ bind(LARGE_LOOP_PREFETCH_REPEAT2); 8861 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 8862 __ subs(tmp4, tmp4, 1); 8863 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2); 8864 __ sub(cnt2, cnt2, 64); 8865 __ subs(rscratch2, cnt2, prefetchLoopExitCondition); 8866 __ br(__ GE, LARGE_LOOP_PREFETCH); 8867 } 8868 __ cbz(cnt2, LOAD_LAST); // no characters left except last load 8869 __ bind(NO_PREFETCH); 8870 __ subs(cnt2, cnt2, 16); 8871 __ br(__ LT, TAIL); 8872 __ align(OptoLoopAlignment); 8873 __ bind(SMALL_LOOP); // smaller loop 8874 __ subs(cnt2, cnt2, 16); 8875 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 8876 __ br(__ GE, SMALL_LOOP); 8877 __ cmn(cnt2, (u1)16); 8878 __ br(__ EQ, LOAD_LAST); 8879 __ bind(TAIL); // 1..15 characters left until last load (last 4 characters) 8880 __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string 8881 __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string 8882 __ ldr(tmp3, Address(cnt1, -8)); 8883 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load 8884 __ b(LOAD_LAST); 8885 __ bind(DIFF2); 8886 __ mov(tmpU, tmp3); 8887 __ bind(DIFF1); 8888 __ pop(spilled_regs, sp); 8889 __ b(CALCULATE_DIFFERENCE); 8890 __ bind(LOAD_LAST); 8891 // Last 4 UTF-16 characters are already pre-loaded into tmp3 by 
compare_string_16_x_LU. 8892 // No need to load it again 8893 __ mov(tmpU, tmp3); 8894 __ pop(spilled_regs, sp); 8895 8896 // tmp2 points to the address of the last 4 Latin1 characters right now 8897 __ ldrs(vtmp, Address(tmp2)); 8898 __ zip1(vtmp, __ T8B, vtmp, vtmpZ); 8899 __ fmovd(tmpL, vtmp); 8900 8901 __ eor(rscratch2, tmpU, tmpL); 8902 __ cbz(rscratch2, DONE); 8903 8904 // Find the first different characters in the longwords and 8905 // compute their difference. 8906 __ bind(CALCULATE_DIFFERENCE); 8907 __ rev(rscratch2, rscratch2); 8908 __ clz(rscratch2, rscratch2); 8909 __ andr(rscratch2, rscratch2, -16); 8910 __ lsrv(tmp1, tmp1, rscratch2); 8911 __ uxthw(tmp1, tmp1); 8912 __ lsrv(rscratch1, rscratch1, rscratch2); 8913 __ uxthw(rscratch1, rscratch1); 8914 __ subw(result, tmp1, rscratch1); 8915 __ bind(DONE); 8916 __ ret(lr); 8917 return entry; 8918 } 8919 8920 // r0 = input (float16) 8921 // v0 = result (float) 8922 // v1 = temporary float register 8923 address generate_float16ToFloat() { 8924 __ align(CodeEntryAlignment); 8925 StubGenStubId stub_id = StubGenStubId::hf2f_id; 8926 StubCodeMark mark(this, stub_id); 8927 address entry = __ pc(); 8928 BLOCK_COMMENT("Entry:"); 8929 __ flt16_to_flt(v0, r0, v1); 8930 __ ret(lr); 8931 return entry; 8932 } 8933 8934 // v0 = input (float) 8935 // r0 = result (float16) 8936 // v1 = temporary float register 8937 address generate_floatToFloat16() { 8938 __ align(CodeEntryAlignment); 8939 StubGenStubId stub_id = StubGenStubId::f2hf_id; 8940 StubCodeMark mark(this, stub_id); 8941 address entry = __ pc(); 8942 BLOCK_COMMENT("Entry:"); 8943 __ flt_to_flt16(r0, v0, v1); 8944 __ ret(lr); 8945 return entry; 8946 } 8947 8948 address generate_method_entry_barrier() { 8949 __ align(CodeEntryAlignment); 8950 StubGenStubId stub_id = StubGenStubId::method_entry_barrier_id; 8951 StubCodeMark mark(this, stub_id); 8952 8953 Label deoptimize_label; 8954 8955 address start = __ pc(); 8956 8957 BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler(); 8958 8959 if (bs_asm->nmethod_patching_type() == NMethodPatchingType::conc_instruction_and_data_patch) { 8960 BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod(); 8961 // We can get here despite the nmethod being good, if we have not 8962 // yet applied our cross modification fence (or data fence). 
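      // The instructions below implement that fence (a brief summary of what the
      // following code visibly does): the global patching epoch is copied into the
      // thread-local word next to the disarmed guard value, then an isb and a
      // LoadLoad barrier are issued so this thread observes any concurrently
      // patched instructions and data before the runtime call below decides
      // whether the nmethod must be deoptimized.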
8963 Address thread_epoch_addr(rthread, in_bytes(bs_nm->thread_disarmed_guard_value_offset()) + 4); 8964 __ lea(rscratch2, ExternalAddress(bs_asm->patching_epoch_addr())); 8965 __ ldrw(rscratch2, rscratch2); 8966 __ strw(rscratch2, thread_epoch_addr); 8967 __ isb(); 8968 __ membar(__ LoadLoad); 8969 } 8970 8971 __ set_last_Java_frame(sp, rfp, lr, rscratch1); 8972 8973 __ enter(); 8974 __ add(rscratch2, sp, wordSize); // rscratch2 points to the saved lr 8975 8976 __ sub(sp, sp, 4 * wordSize); // four words for the returned {sp, fp, lr, pc} 8977 8978 __ push_call_clobbered_registers(); 8979 8980 __ mov(c_rarg0, rscratch2); 8981 __ call_VM_leaf 8982 (CAST_FROM_FN_PTR 8983 (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1); 8984 8985 __ reset_last_Java_frame(true); 8986 8987 __ mov(rscratch1, r0); 8988 8989 __ pop_call_clobbered_registers(); 8990 8991 __ cbnz(rscratch1, deoptimize_label); 8992 8993 __ leave(); 8994 __ ret(lr); 8995 8996 __ BIND(deoptimize_label); 8997 8998 __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize)); 8999 __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize)); 9000 9001 __ mov(sp, rscratch1); 9002 __ br(rscratch2); 9003 9004 return start; 9005 } 9006 9007 // r0 = result 9008 // r1 = str1 9009 // r2 = cnt1 9010 // r3 = str2 9011 // r4 = cnt2 9012 // r10 = tmp1 9013 // r11 = tmp2 9014 address generate_compare_long_string_same_encoding(bool isLL) { 9015 __ align(CodeEntryAlignment); 9016 StubGenStubId stub_id = (isLL ? StubGenStubId::compare_long_string_LL_id : StubGenStubId::compare_long_string_UU_id); 9017 StubCodeMark mark(this, stub_id); 9018 address entry = __ pc(); 9019 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 9020 tmp1 = r10, tmp2 = r11, tmp1h = rscratch1, tmp2h = rscratch2; 9021 9022 Label LARGE_LOOP_PREFETCH, LOOP_COMPARE16, DIFF, LESS16, LESS8, CAL_DIFFERENCE, LENGTH_DIFF; 9023 9024 // exit from large loop when less than 64 bytes left to read or we're about 9025 // to prefetch memory behind array border 9026 int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2); 9027 9028 // before jumping to stub, pre-load 8 bytes already, so do comparison directly 9029 __ eor(rscratch2, tmp1, tmp2); 9030 __ cbnz(rscratch2, CAL_DIFFERENCE); 9031 9032 __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2)); 9033 // update pointers, because of previous read 9034 __ add(str1, str1, wordSize); 9035 __ add(str2, str2, wordSize); 9036 if (SoftwarePrefetchHintDistance >= 0) { 9037 __ align(OptoLoopAlignment); 9038 __ bind(LARGE_LOOP_PREFETCH); 9039 __ prfm(Address(str1, SoftwarePrefetchHintDistance)); 9040 __ prfm(Address(str2, SoftwarePrefetchHintDistance)); 9041 9042 for (int i = 0; i < 4; i++) { 9043 __ ldp(tmp1, tmp1h, Address(str1, i * 16)); 9044 __ ldp(tmp2, tmp2h, Address(str2, i * 16)); 9045 __ cmp(tmp1, tmp2); 9046 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ); 9047 __ br(Assembler::NE, DIFF); 9048 } 9049 __ sub(cnt2, cnt2, isLL ? 64 : 32); 9050 __ add(str1, str1, 64); 9051 __ add(str2, str2, 64); 9052 __ subs(rscratch2, cnt2, largeLoopExitCondition); 9053 __ br(Assembler::GE, LARGE_LOOP_PREFETCH); 9054 __ cbz(cnt2, LENGTH_DIFF); // no more chars left? 9055 } 9056 9057 __ subs(rscratch1, cnt2, isLL ? 
16 : 8); 9058 __ br(Assembler::LE, LESS16); 9059 __ align(OptoLoopAlignment); 9060 __ bind(LOOP_COMPARE16); 9061 __ ldp(tmp1, tmp1h, Address(__ post(str1, 16))); 9062 __ ldp(tmp2, tmp2h, Address(__ post(str2, 16))); 9063 __ cmp(tmp1, tmp2); 9064 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ); 9065 __ br(Assembler::NE, DIFF); 9066 __ sub(cnt2, cnt2, isLL ? 16 : 8); 9067 __ subs(rscratch2, cnt2, isLL ? 16 : 8); 9068 __ br(Assembler::LT, LESS16); 9069 9070 __ ldp(tmp1, tmp1h, Address(__ post(str1, 16))); 9071 __ ldp(tmp2, tmp2h, Address(__ post(str2, 16))); 9072 __ cmp(tmp1, tmp2); 9073 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ); 9074 __ br(Assembler::NE, DIFF); 9075 __ sub(cnt2, cnt2, isLL ? 16 : 8); 9076 __ subs(rscratch2, cnt2, isLL ? 16 : 8); 9077 __ br(Assembler::GE, LOOP_COMPARE16); 9078 __ cbz(cnt2, LENGTH_DIFF); 9079 9080 __ bind(LESS16); 9081 // each 8 compare 9082 __ subs(cnt2, cnt2, isLL ? 8 : 4); 9083 __ br(Assembler::LE, LESS8); 9084 __ ldr(tmp1, Address(__ post(str1, 8))); 9085 __ ldr(tmp2, Address(__ post(str2, 8))); 9086 __ eor(rscratch2, tmp1, tmp2); 9087 __ cbnz(rscratch2, CAL_DIFFERENCE); 9088 __ sub(cnt2, cnt2, isLL ? 8 : 4); 9089 9090 __ bind(LESS8); // directly load last 8 bytes 9091 if (!isLL) { 9092 __ add(cnt2, cnt2, cnt2); 9093 } 9094 __ ldr(tmp1, Address(str1, cnt2)); 9095 __ ldr(tmp2, Address(str2, cnt2)); 9096 __ eor(rscratch2, tmp1, tmp2); 9097 __ cbz(rscratch2, LENGTH_DIFF); 9098 __ b(CAL_DIFFERENCE); 9099 9100 __ bind(DIFF); 9101 __ cmp(tmp1, tmp2); 9102 __ csel(tmp1, tmp1, tmp1h, Assembler::NE); 9103 __ csel(tmp2, tmp2, tmp2h, Assembler::NE); 9104 // reuse rscratch2 register for the result of eor instruction 9105 __ eor(rscratch2, tmp1, tmp2); 9106 9107 __ bind(CAL_DIFFERENCE); 9108 __ rev(rscratch2, rscratch2); 9109 __ clz(rscratch2, rscratch2); 9110 __ andr(rscratch2, rscratch2, isLL ? -8 : -16); 9111 __ lsrv(tmp1, tmp1, rscratch2); 9112 __ lsrv(tmp2, tmp2, rscratch2); 9113 if (isLL) { 9114 __ uxtbw(tmp1, tmp1); 9115 __ uxtbw(tmp2, tmp2); 9116 } else { 9117 __ uxthw(tmp1, tmp1); 9118 __ uxthw(tmp2, tmp2); 9119 } 9120 __ subw(result, tmp1, tmp2); 9121 9122 __ bind(LENGTH_DIFF); 9123 __ ret(lr); 9124 return entry; 9125 } 9126 9127 enum string_compare_mode { 9128 LL, 9129 LU, 9130 UL, 9131 UU, 9132 }; 9133 9134 // The following registers are declared in aarch64.ad 9135 // r0 = result 9136 // r1 = str1 9137 // r2 = cnt1 9138 // r3 = str2 9139 // r4 = cnt2 9140 // r10 = tmp1 9141 // r11 = tmp2 9142 // z0 = ztmp1 9143 // z1 = ztmp2 9144 // p0 = pgtmp1 9145 // p1 = pgtmp2 9146 address generate_compare_long_string_sve(string_compare_mode mode) { 9147 StubGenStubId stub_id; 9148 switch (mode) { 9149 case LL: stub_id = StubGenStubId::compare_long_string_LL_id; break; 9150 case LU: stub_id = StubGenStubId::compare_long_string_LU_id; break; 9151 case UL: stub_id = StubGenStubId::compare_long_string_UL_id; break; 9152 case UU: stub_id = StubGenStubId::compare_long_string_UU_id; break; 9153 default: ShouldNotReachHere(); 9154 } 9155 9156 __ align(CodeEntryAlignment); 9157 address entry = __ pc(); 9158 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 9159 tmp1 = r10, tmp2 = r11; 9160 9161 Label LOOP, DONE, MISMATCH; 9162 Register vec_len = tmp1; 9163 Register idx = tmp2; 9164 // The minimum of the string lengths has been stored in cnt2. 
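    // It is aliased to 'cnt' just below and bounds the SVE loop, which compares
    // vec_len characters per iteration under a governing predicate. Roughly
    // (an illustrative sketch only, not code from this file):
    //
    //   idx = 0; pg = whilelt(idx, cnt);       // all-true while far from the end
    //   do {
    //     za = load_str1(idx, pg); zb = load_str2(idx, pg);
    //     if (any_ne(za, zb, pg)) goto MISMATCH;
    //     idx += vec_len;
    //   } while (idx < cnt - vec_len);
    //   pg = whilelt(idx, cnt);                // partial predicate for the tail
    //   ... one final guarded compare, then DONE or MISMATCH.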
9165 Register cnt = cnt2; 9166 FloatRegister ztmp1 = z0, ztmp2 = z1; 9167 PRegister pgtmp1 = p0, pgtmp2 = p1; 9168 9169 #define LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx) \ 9170 switch (mode) { \ 9171 case LL: \ 9172 __ sve_ld1b(ztmp1, __ B, pgtmp1, Address(str1, idx)); \ 9173 __ sve_ld1b(ztmp2, __ B, pgtmp1, Address(str2, idx)); \ 9174 break; \ 9175 case LU: \ 9176 __ sve_ld1b(ztmp1, __ H, pgtmp1, Address(str1, idx)); \ 9177 __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \ 9178 break; \ 9179 case UL: \ 9180 __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \ 9181 __ sve_ld1b(ztmp2, __ H, pgtmp1, Address(str2, idx)); \ 9182 break; \ 9183 case UU: \ 9184 __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \ 9185 __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \ 9186 break; \ 9187 default: \ 9188 ShouldNotReachHere(); \ 9189 } 9190 9191 StubCodeMark mark(this, stub_id); 9192 9193 __ mov(idx, 0); 9194 __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt); 9195 9196 if (mode == LL) { 9197 __ sve_cntb(vec_len); 9198 } else { 9199 __ sve_cnth(vec_len); 9200 } 9201 9202 __ sub(rscratch1, cnt, vec_len); 9203 9204 __ bind(LOOP); 9205 9206 // main loop 9207 LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx); 9208 __ add(idx, idx, vec_len); 9209 // Compare strings. 9210 __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2); 9211 __ br(__ NE, MISMATCH); 9212 __ cmp(idx, rscratch1); 9213 __ br(__ LT, LOOP); 9214 9215 // post loop, last iteration 9216 __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt); 9217 9218 LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx); 9219 __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2); 9220 __ br(__ EQ, DONE); 9221 9222 __ bind(MISMATCH); 9223 9224 // Crop the vector to find its location. 9225 __ sve_brkb(pgtmp2, pgtmp1, pgtmp2, false /* isMerge */); 9226 // Extract the first different characters of each string. 9227 __ sve_lasta(rscratch1, mode == LL ? __ B : __ H, pgtmp2, ztmp1); 9228 __ sve_lasta(rscratch2, mode == LL ? __ B : __ H, pgtmp2, ztmp2); 9229 9230 // Compute the difference of the first different characters. 
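    // Both extracted elements are unsigned character values (Latin1 bytes or
    // UTF-16 code units), so a plain subtraction yields the same signed result
    // as the scalar compare_long_string stubs.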
9231 __ sub(result, rscratch1, rscratch2); 9232 9233 __ bind(DONE); 9234 __ ret(lr); 9235 #undef LOAD_PAIR 9236 return entry; 9237 } 9238 9239 void generate_compare_long_strings() { 9240 if (UseSVE == 0) { 9241 StubRoutines::aarch64::_compare_long_string_LL 9242 = generate_compare_long_string_same_encoding(true); 9243 StubRoutines::aarch64::_compare_long_string_UU 9244 = generate_compare_long_string_same_encoding(false); 9245 StubRoutines::aarch64::_compare_long_string_LU 9246 = generate_compare_long_string_different_encoding(true); 9247 StubRoutines::aarch64::_compare_long_string_UL 9248 = generate_compare_long_string_different_encoding(false); 9249 } else { 9250 StubRoutines::aarch64::_compare_long_string_LL 9251 = generate_compare_long_string_sve(LL); 9252 StubRoutines::aarch64::_compare_long_string_UU 9253 = generate_compare_long_string_sve(UU); 9254 StubRoutines::aarch64::_compare_long_string_LU 9255 = generate_compare_long_string_sve(LU); 9256 StubRoutines::aarch64::_compare_long_string_UL 9257 = generate_compare_long_string_sve(UL); 9258 } 9259 } 9260 9261 // R0 = result 9262 // R1 = str2 9263 // R2 = cnt1 9264 // R3 = str1 9265 // R4 = cnt2 9266 // Clobbers: rscratch1, rscratch2, v0, v1, rflags 9267 // 9268 // This generic linear code use few additional ideas, which makes it faster: 9269 // 1) we can safely keep at least 1st register of pattern(since length >= 8) 9270 // in order to skip initial loading(help in systems with 1 ld pipeline) 9271 // 2) we can use "fast" algorithm of finding single character to search for 9272 // first symbol with less branches(1 branch per each loaded register instead 9273 // of branch for each symbol), so, this is where constants like 9274 // 0x0101...01, 0x00010001...0001, 0x7f7f...7f, 0x7fff7fff...7fff comes from 9275 // 3) after loading and analyzing 1st register of source string, it can be 9276 // used to search for every 1st character entry, saving few loads in 9277 // comparison with "simplier-but-slower" implementation 9278 // 4) in order to avoid lots of push/pop operations, code below is heavily 9279 // re-using/re-initializing/compressing register values, which makes code 9280 // larger and a bit less readable, however, most of extra operations are 9281 // issued during loads or branches, so, penalty is minimal 9282 address generate_string_indexof_linear(bool str1_isL, bool str2_isL) { 9283 StubGenStubId stub_id; 9284 if (str1_isL) { 9285 if (str2_isL) { 9286 stub_id = StubGenStubId::string_indexof_linear_ll_id; 9287 } else { 9288 stub_id = StubGenStubId::string_indexof_linear_ul_id; 9289 } 9290 } else { 9291 if (str2_isL) { 9292 ShouldNotReachHere(); 9293 } else { 9294 stub_id = StubGenStubId::string_indexof_linear_uu_id; 9295 } 9296 } 9297 __ align(CodeEntryAlignment); 9298 StubCodeMark mark(this, stub_id); 9299 address entry = __ pc(); 9300 9301 int str1_chr_size = str1_isL ? 1 : 2; 9302 int str2_chr_size = str2_isL ? 1 : 2; 9303 int str1_chr_shift = str1_isL ? 0 : 1; 9304 int str2_chr_shift = str2_isL ? 
0 : 1; 9305 bool isL = str1_isL && str2_isL; 9306 // parameters 9307 Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4; 9308 // temporary registers 9309 Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23; 9310 RegSet spilled_regs = RegSet::range(tmp1, tmp4); 9311 // redefinitions 9312 Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3; 9313 9314 __ push(spilled_regs, sp); 9315 Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO, 9316 L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED, 9317 L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP, 9318 L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH, 9319 L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2, 9320 L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH; 9321 // Read whole register from str1. It is safe, because length >=8 here 9322 __ ldr(ch1, Address(str1)); 9323 // Read whole register from str2. It is safe, because length >=8 here 9324 __ ldr(ch2, Address(str2)); 9325 __ sub(cnt2, cnt2, cnt1); 9326 __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF); 9327 if (str1_isL != str2_isL) { 9328 __ eor(v0, __ T16B, v0, v0); 9329 } 9330 __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001); 9331 __ mul(first, first, tmp1); 9332 // check if we have less than 1 register to check 9333 __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1); 9334 if (str1_isL != str2_isL) { 9335 __ fmovd(v1, ch1); 9336 } 9337 __ br(__ LE, L_SMALL); 9338 __ eor(ch2, first, ch2); 9339 if (str1_isL != str2_isL) { 9340 __ zip1(v1, __ T16B, v1, v0); 9341 } 9342 __ sub(tmp2, ch2, tmp1); 9343 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 9344 __ bics(tmp2, tmp2, ch2); 9345 if (str1_isL != str2_isL) { 9346 __ fmovd(ch1, v1); 9347 } 9348 __ br(__ NE, L_HAS_ZERO); 9349 __ subs(cnt2, cnt2, wordSize/str2_chr_size); 9350 __ add(result, result, wordSize/str2_chr_size); 9351 __ add(str2, str2, wordSize); 9352 __ br(__ LT, L_POST_LOOP); 9353 __ BIND(L_LOOP); 9354 __ ldr(ch2, Address(str2)); 9355 __ eor(ch2, first, ch2); 9356 __ sub(tmp2, ch2, tmp1); 9357 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 9358 __ bics(tmp2, tmp2, ch2); 9359 __ br(__ NE, L_HAS_ZERO); 9360 __ BIND(L_LOOP_PROCEED); 9361 __ subs(cnt2, cnt2, wordSize/str2_chr_size); 9362 __ add(str2, str2, wordSize); 9363 __ add(result, result, wordSize/str2_chr_size); 9364 __ br(__ GE, L_LOOP); 9365 __ BIND(L_POST_LOOP); 9366 __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check 9367 __ br(__ LE, NOMATCH); 9368 __ ldr(ch2, Address(str2)); 9369 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift); 9370 __ eor(ch2, first, ch2); 9371 __ sub(tmp2, ch2, tmp1); 9372 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 9373 __ mov(tmp4, -1); // all bits set 9374 __ b(L_SMALL_PROCEED); 9375 __ align(OptoLoopAlignment); 9376 __ BIND(L_SMALL); 9377 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift); 9378 __ eor(ch2, first, ch2); 9379 if (str1_isL != str2_isL) { 9380 __ zip1(v1, __ T16B, v1, v0); 9381 } 9382 __ sub(tmp2, ch2, tmp1); 9383 __ mov(tmp4, -1); // all bits set 9384 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 9385 if (str1_isL != str2_isL) { 9386 __ fmovd(ch1, v1); // move converted 4 symbols 9387 } 9388 __ BIND(L_SMALL_PROCEED); 9389 __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits. 
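    // Classic SWAR trick (explanatory note for the constants mentioned above):
    // with v = first ^ loaded, the expression
    //   (v - 0x01...01) & ~v & 0x80...80
    // is non-zero iff some lane of v is zero, i.e. iff the leading pattern
    // character occurs somewhere in the loaded word, and its lowest set bit
    // marks the first such position. tmp2 already holds v - 0x01...01 and ch2
    // holds v | 0x7f...7f (note ~(v | 0x7f...7f) == ~v & 0x80...80), so the bic
    // below computes exactly that expression; the ands then masks off lanes
    // beyond the valid data and sets the flags for the NOMATCH branch.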
9390 __ bic(tmp2, tmp2, ch2); 9391 __ ands(tmp2, tmp2, tmp4); // clear useless bits and check 9392 __ rbit(tmp2, tmp2); 9393 __ br(__ EQ, NOMATCH); 9394 __ BIND(L_SMALL_HAS_ZERO_LOOP); 9395 __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some cpu's 9396 __ cmp(cnt1, u1(wordSize/str2_chr_size)); 9397 __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2); 9398 if (str2_isL) { // LL 9399 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index" 9400 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe. 9401 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info 9402 __ add(result, result, tmp4, __ LSR, LogBitsPerByte); 9403 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 9404 } else { 9405 __ mov(ch2, 0xE); // all bits in byte set except last one 9406 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 9407 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 9408 __ lslv(tmp2, tmp2, tmp4); 9409 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 9410 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 9411 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 9412 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 9413 } 9414 __ cmp(ch1, ch2); 9415 __ mov(tmp4, wordSize/str2_chr_size); 9416 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 9417 __ BIND(L_SMALL_CMP_LOOP); 9418 str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift))) 9419 : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift))); 9420 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))) 9421 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))); 9422 __ add(tmp4, tmp4, 1); 9423 __ cmp(tmp4, cnt1); 9424 __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP); 9425 __ cmp(first, ch2); 9426 __ br(__ EQ, L_SMALL_CMP_LOOP); 9427 __ BIND(L_SMALL_CMP_LOOP_NOMATCH); 9428 __ cbz(tmp2, NOMATCH); // no more matches. exit 9429 __ clz(tmp4, tmp2); 9430 __ add(result, result, 1); // advance index 9431 __ add(str2, str2, str2_chr_size); // advance pointer 9432 __ b(L_SMALL_HAS_ZERO_LOOP); 9433 __ align(OptoLoopAlignment); 9434 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP); 9435 __ cmp(first, ch2); 9436 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 9437 __ b(DONE); 9438 __ align(OptoLoopAlignment); 9439 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2); 9440 if (str2_isL) { // LL 9441 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index" 9442 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe. 9443 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info 9444 __ add(result, result, tmp4, __ LSR, LogBitsPerByte); 9445 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 9446 } else { 9447 __ mov(ch2, 0xE); // all bits in byte set except last one 9448 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 9449 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 9450 __ lslv(tmp2, tmp2, tmp4); 9451 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 9452 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 9453 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 9454 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 9455 } 9456 __ cmp(ch1, ch2); 9457 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 9458 __ b(DONE); 9459 __ align(OptoLoopAlignment); 9460 __ BIND(L_HAS_ZERO); 9461 __ rbit(tmp2, tmp2); 9462 __ clz(tmp4, tmp2); // potentially long. 
Up to 4 cycles on some CPU's 9463 // Now, perform compression of counters(cnt2 and cnt1) into one register. 9464 // It's fine because both counters are 32bit and are not changed in this 9465 // loop. Just restore it on exit. So, cnt1 can be re-used in this loop. 9466 __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2); 9467 __ sub(result, result, 1); 9468 __ BIND(L_HAS_ZERO_LOOP); 9469 __ mov(cnt1, wordSize/str2_chr_size); 9470 __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2); 9471 __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare 9472 if (str2_isL) { 9473 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index 9474 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 9475 __ lslv(tmp2, tmp2, tmp4); 9476 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 9477 __ add(tmp4, tmp4, 1); 9478 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 9479 __ lsl(tmp2, tmp2, 1); 9480 __ mov(tmp4, wordSize/str2_chr_size); 9481 } else { 9482 __ mov(ch2, 0xE); 9483 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 9484 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 9485 __ lslv(tmp2, tmp2, tmp4); 9486 __ add(tmp4, tmp4, 1); 9487 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 9488 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); 9489 __ lsl(tmp2, tmp2, 1); 9490 __ mov(tmp4, wordSize/str2_chr_size); 9491 __ sub(str2, str2, str2_chr_size); 9492 } 9493 __ cmp(ch1, ch2); 9494 __ mov(tmp4, wordSize/str2_chr_size); 9495 __ br(__ NE, L_CMP_LOOP_NOMATCH); 9496 __ BIND(L_CMP_LOOP); 9497 str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift))) 9498 : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift))); 9499 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))) 9500 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))); 9501 __ add(tmp4, tmp4, 1); 9502 __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2); 9503 __ br(__ GE, L_CMP_LOOP_LAST_CMP); 9504 __ cmp(cnt1, ch2); 9505 __ br(__ EQ, L_CMP_LOOP); 9506 __ BIND(L_CMP_LOOP_NOMATCH); 9507 // here we're not matched 9508 __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop 9509 __ clz(tmp4, tmp2); 9510 __ add(str2, str2, str2_chr_size); // advance pointer 9511 __ b(L_HAS_ZERO_LOOP); 9512 __ align(OptoLoopAlignment); 9513 __ BIND(L_CMP_LOOP_LAST_CMP); 9514 __ cmp(cnt1, ch2); 9515 __ br(__ NE, L_CMP_LOOP_NOMATCH); 9516 __ b(DONE); 9517 __ align(OptoLoopAlignment); 9518 __ BIND(L_CMP_LOOP_LAST_CMP2); 9519 if (str2_isL) { 9520 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index 9521 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 9522 __ lslv(tmp2, tmp2, tmp4); 9523 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 9524 __ add(tmp4, tmp4, 1); 9525 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 9526 __ lsl(tmp2, tmp2, 1); 9527 } else { 9528 __ mov(ch2, 0xE); 9529 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 9530 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 
9531 __ lslv(tmp2, tmp2, tmp4); 9532 __ add(tmp4, tmp4, 1); 9533 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 9534 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); 9535 __ lsl(tmp2, tmp2, 1); 9536 __ sub(str2, str2, str2_chr_size); 9537 } 9538 __ cmp(ch1, ch2); 9539 __ br(__ NE, L_CMP_LOOP_NOMATCH); 9540 __ b(DONE); 9541 __ align(OptoLoopAlignment); 9542 __ BIND(L_HAS_ZERO_LOOP_NOMATCH); 9543 // 1) Restore "result" index. Index was wordSize/str2_chr_size * N until 9544 // L_HAS_ZERO block. Byte octet was analyzed in L_HAS_ZERO_LOOP, 9545 // so, result was increased at max by wordSize/str2_chr_size - 1, so, 9546 // respective high bit wasn't changed. L_LOOP_PROCEED will increase 9547 // result by analyzed characters value, so, we can just reset lower bits 9548 // in result here. Clear 2 lower bits for UU/UL and 3 bits for LL 9549 // 2) restore cnt1 and cnt2 values from "compressed" cnt2 9550 // 3) advance str2 value to represent next str2 octet. result & 7/3 is 9551 // index of last analyzed substring inside current octet. So, str2 in at 9552 // respective start address. We need to advance it to next octet 9553 __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed 9554 __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2); 9555 __ bfm(result, zr, 0, 2 - str2_chr_shift); 9556 __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2 9557 __ movw(cnt2, cnt2); 9558 __ b(L_LOOP_PROCEED); 9559 __ align(OptoLoopAlignment); 9560 __ BIND(NOMATCH); 9561 __ mov(result, -1); 9562 __ BIND(DONE); 9563 __ pop(spilled_regs, sp); 9564 __ ret(lr); 9565 return entry; 9566 } 9567 9568 void generate_string_indexof_stubs() { 9569 StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true); 9570 StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false); 9571 StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false); 9572 } 9573 9574 void inflate_and_store_2_fp_registers(bool generatePrfm, 9575 FloatRegister src1, FloatRegister src2) { 9576 Register dst = r1; 9577 __ zip1(v1, __ T16B, src1, v0); 9578 __ zip2(v2, __ T16B, src1, v0); 9579 if (generatePrfm) { 9580 __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM); 9581 } 9582 __ zip1(v3, __ T16B, src2, v0); 9583 __ zip2(v4, __ T16B, src2, v0); 9584 __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64))); 9585 } 9586 9587 // R0 = src 9588 // R1 = dst 9589 // R2 = len 9590 // R3 = len >> 3 9591 // V0 = 0 9592 // v1 = loaded 8 bytes 9593 // Clobbers: r0, r1, r3, rscratch1, rflags, v0-v6 9594 address generate_large_byte_array_inflate() { 9595 __ align(CodeEntryAlignment); 9596 StubGenStubId stub_id = StubGenStubId::large_byte_array_inflate_id; 9597 StubCodeMark mark(this, stub_id); 9598 address entry = __ pc(); 9599 Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE; 9600 Register src = r0, dst = r1, len = r2, octetCounter = r3; 9601 const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4; 9602 9603 // do one more 8-byte read to have address 16-byte aligned in most cases 9604 // also use single store instruction 9605 __ ldrd(v2, __ post(src, 8)); 9606 __ sub(octetCounter, octetCounter, 2); 9607 __ zip1(v1, __ T16B, v1, v0); 9608 __ zip1(v2, __ T16B, v2, v0); 9609 __ st1(v1, v2, __ T16B, __ post(dst, 32)); 9610 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 9611 __ subs(rscratch1, octetCounter, large_loop_threshold); 9612 __ br(__ LE, LOOP_START); 9613 __ 
b(LOOP_PRFM_START); 9614 __ bind(LOOP_PRFM); 9615 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 9616 __ bind(LOOP_PRFM_START); 9617 __ prfm(Address(src, SoftwarePrefetchHintDistance)); 9618 __ sub(octetCounter, octetCounter, 8); 9619 __ subs(rscratch1, octetCounter, large_loop_threshold); 9620 inflate_and_store_2_fp_registers(true, v3, v4); 9621 inflate_and_store_2_fp_registers(true, v5, v6); 9622 __ br(__ GT, LOOP_PRFM); 9623 __ cmp(octetCounter, (u1)8); 9624 __ br(__ LT, DONE); 9625 __ bind(LOOP); 9626 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 9627 __ bind(LOOP_START); 9628 __ sub(octetCounter, octetCounter, 8); 9629 __ cmp(octetCounter, (u1)8); 9630 inflate_and_store_2_fp_registers(false, v3, v4); 9631 inflate_and_store_2_fp_registers(false, v5, v6); 9632 __ br(__ GE, LOOP); 9633 __ bind(DONE); 9634 __ ret(lr); 9635 return entry; 9636 } 9637 9638 /** 9639 * Arguments: 9640 * 9641 * Input: 9642 * c_rarg0 - current state address 9643 * c_rarg1 - H key address 9644 * c_rarg2 - data address 9645 * c_rarg3 - number of blocks 9646 * 9647 * Output: 9648 * Updated state at c_rarg0 9649 */ 9650 address generate_ghash_processBlocks() { 9651 // Bafflingly, GCM uses little-endian for the byte order, but 9652 // big-endian for the bit order. For example, the polynomial 1 is 9653 // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00. 9654 // 9655 // So, we must either reverse the bytes in each word and do 9656 // everything big-endian or reverse the bits in each byte and do 9657 // it little-endian. On AArch64 it's more idiomatic to reverse 9658 // the bits in each byte (we have an instruction, RBIT, to do 9659 // that) and keep the data in little-endian bit order through the 9660 // calculation, bit-reversing the inputs and outputs. 9661 9662 StubGenStubId stub_id = StubGenStubId::ghash_processBlocks_id; 9663 StubCodeMark mark(this, stub_id); 9664 __ align(wordSize * 2); 9665 address p = __ pc(); 9666 __ emit_int64(0x87); // The low-order bits of the field 9667 // polynomial (i.e. 
p = z^7+z^2+z+1) 9668 // repeated in the low and high parts of a 9669 // 128-bit vector 9670 __ emit_int64(0x87); 9671 9672 __ align(CodeEntryAlignment); 9673 address start = __ pc(); 9674 9675 Register state = c_rarg0; 9676 Register subkeyH = c_rarg1; 9677 Register data = c_rarg2; 9678 Register blocks = c_rarg3; 9679 9680 FloatRegister vzr = v30; 9681 __ eor(vzr, __ T16B, vzr, vzr); // zero register 9682 9683 __ ldrq(v24, p); // The field polynomial 9684 9685 __ ldrq(v0, Address(state)); 9686 __ ldrq(v1, Address(subkeyH)); 9687 9688 __ rev64(v0, __ T16B, v0); // Bit-reverse words in state and subkeyH 9689 __ rbit(v0, __ T16B, v0); 9690 __ rev64(v1, __ T16B, v1); 9691 __ rbit(v1, __ T16B, v1); 9692 9693 __ ext(v4, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1 9694 __ eor(v4, __ T16B, v4, v1); // xor subkeyH into subkeyL (Karatsuba: (A1+A0)) 9695 9696 { 9697 Label L_ghash_loop; 9698 __ bind(L_ghash_loop); 9699 9700 __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit 9701 // reversing each byte 9702 __ rbit(v2, __ T16B, v2); 9703 __ eor(v2, __ T16B, v0, v2); // bit-swapped data ^ bit-swapped state 9704 9705 // Multiply state in v2 by subkey in v1 9706 __ ghash_multiply(/*result_lo*/v5, /*result_hi*/v7, 9707 /*a*/v1, /*b*/v2, /*a1_xor_a0*/v4, 9708 /*temps*/v6, v3, /*reuse/clobber b*/v2); 9709 // Reduce v7:v5 by the field polynomial 9710 __ ghash_reduce(/*result*/v0, /*lo*/v5, /*hi*/v7, /*p*/v24, vzr, /*temp*/v3); 9711 9712 __ sub(blocks, blocks, 1); 9713 __ cbnz(blocks, L_ghash_loop); 9714 } 9715 9716 // The bit-reversed result is at this point in v0 9717 __ rev64(v0, __ T16B, v0); 9718 __ rbit(v0, __ T16B, v0); 9719 9720 __ st1(v0, __ T16B, state); 9721 __ ret(lr); 9722 9723 return start; 9724 } 9725 9726 address generate_ghash_processBlocks_wide() { 9727 address small = generate_ghash_processBlocks(); 9728 9729 StubGenStubId stub_id = StubGenStubId::ghash_processBlocks_wide_id; 9730 StubCodeMark mark(this, stub_id); 9731 __ align(wordSize * 2); 9732 address p = __ pc(); 9733 __ emit_int64(0x87); // The low-order bits of the field 9734 // polynomial (i.e. p = z^7+z^2+z+1) 9735 // repeated in the low and high parts of a 9736 // 128-bit vector 9737 __ emit_int64(0x87); 9738 9739 __ align(CodeEntryAlignment); 9740 address start = __ pc(); 9741 9742 Register state = c_rarg0; 9743 Register subkeyH = c_rarg1; 9744 Register data = c_rarg2; 9745 Register blocks = c_rarg3; 9746 9747 const int unroll = 4; 9748 9749 __ cmp(blocks, (unsigned char)(unroll * 2)); 9750 __ br(__ LT, small); 9751 9752 if (unroll > 1) { 9753 // Save state before entering routine 9754 __ sub(sp, sp, 4 * 16); 9755 __ st1(v12, v13, v14, v15, __ T16B, Address(sp)); 9756 __ sub(sp, sp, 4 * 16); 9757 __ st1(v8, v9, v10, v11, __ T16B, Address(sp)); 9758 } 9759 9760 __ ghash_processBlocks_wide(p, state, subkeyH, data, blocks, unroll); 9761 9762 if (unroll > 1) { 9763 // And restore state 9764 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16)); 9765 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16)); 9766 } 9767 9768 __ cmp(blocks, (unsigned char)0); 9769 __ br(__ GT, small); 9770 9771 __ ret(lr); 9772 9773 return start; 9774 } 9775 9776 void generate_base64_encode_simdround(Register src, Register dst, 9777 FloatRegister codec, u8 size) { 9778 9779 FloatRegister in0 = v4, in1 = v5, in2 = v6; 9780 FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19; 9781 FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23; 9782 9783 Assembler::SIMD_Arrangement arrangement = size == 16 ? 
__ T16B : __ T8B; 9784 9785 __ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size)); 9786 9787 __ ushr(ind0, arrangement, in0, 2); 9788 9789 __ ushr(ind1, arrangement, in1, 2); 9790 __ shl(in0, arrangement, in0, 6); 9791 __ orr(ind1, arrangement, ind1, in0); 9792 __ ushr(ind1, arrangement, ind1, 2); 9793 9794 __ ushr(ind2, arrangement, in2, 4); 9795 __ shl(in1, arrangement, in1, 4); 9796 __ orr(ind2, arrangement, in1, ind2); 9797 __ ushr(ind2, arrangement, ind2, 2); 9798 9799 __ shl(ind3, arrangement, in2, 2); 9800 __ ushr(ind3, arrangement, ind3, 2); 9801 9802 __ tbl(out0, arrangement, codec, 4, ind0); 9803 __ tbl(out1, arrangement, codec, 4, ind1); 9804 __ tbl(out2, arrangement, codec, 4, ind2); 9805 __ tbl(out3, arrangement, codec, 4, ind3); 9806 9807 __ st4(out0, out1, out2, out3, arrangement, __ post(dst, 4 * size)); 9808 } 9809 9810 /** 9811 * Arguments: 9812 * 9813 * Input: 9814 * c_rarg0 - src_start 9815 * c_rarg1 - src_offset 9816 * c_rarg2 - src_length 9817 * c_rarg3 - dest_start 9818 * c_rarg4 - dest_offset 9819 * c_rarg5 - isURL 9820 * 9821 */ 9822 address generate_base64_encodeBlock() { 9823 9824 static const char toBase64[64] = { 9825 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 9826 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 9827 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 9828 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 9829 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/' 9830 }; 9831 9832 static const char toBase64URL[64] = { 9833 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 9834 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 9835 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 9836 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 9837 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_' 9838 }; 9839 9840 __ align(CodeEntryAlignment); 9841 StubGenStubId stub_id = StubGenStubId::base64_encodeBlock_id; 9842 StubCodeMark mark(this, stub_id); 9843 address start = __ pc(); 9844 9845 Register src = c_rarg0; // source array 9846 Register soff = c_rarg1; // source start offset 9847 Register send = c_rarg2; // source end offset 9848 Register dst = c_rarg3; // dest array 9849 Register doff = c_rarg4; // position for writing to dest array 9850 Register isURL = c_rarg5; // Base64 or URL character set 9851 9852 // c_rarg6 and c_rarg7 are free to use as temps 9853 Register codec = c_rarg6; 9854 Register length = c_rarg7; 9855 9856 Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit; 9857 9858 __ add(src, src, soff); 9859 __ add(dst, dst, doff); 9860 __ sub(length, send, soff); 9861 9862 // load the codec base address 9863 __ lea(codec, ExternalAddress((address) toBase64)); 9864 __ cbz(isURL, ProcessData); 9865 __ lea(codec, ExternalAddress((address) toBase64URL)); 9866 9867 __ BIND(ProcessData); 9868 9869 // too short to formup a SIMD loop, roll back 9870 __ cmp(length, (u1)24); 9871 __ br(Assembler::LT, Process3B); 9872 9873 __ ld1(v0, v1, v2, v3, __ T16B, Address(codec)); 9874 9875 __ BIND(Process48B); 9876 __ cmp(length, (u1)48); 9877 __ br(Assembler::LT, Process24B); 9878 generate_base64_encode_simdround(src, dst, v0, 16); 9879 __ sub(length, length, 48); 9880 __ b(Process48B); 9881 9882 __ BIND(Process24B); 9883 __ cmp(length, (u1)24); 9884 __ br(Assembler::LT, SIMDExit); 9885 generate_base64_encode_simdround(src, dst, v0, 8); 9886 __ sub(length, length, 24); 9887 9888 __ BIND(SIMDExit); 9889 
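    // In C, approximately, the scalar tail at Process3B below does the
    // following (an illustrative sketch only, not generated code; the
    // variable names are ours):
    //
    //   while (length > 0) {
    //     uint32_t bits = src[0] << 16 | src[1] << 8 | src[2];
    //     dst[0] = codec[(bits >> 18) & 63];
    //     dst[1] = codec[(bits >> 12) & 63];
    //     dst[2] = codec[(bits >>  6) & 63];
    //     dst[3] = codec[ bits        & 63];
    //     src += 3; dst += 4; length -= 3;
    //   }
    //
    // For example, the three bytes 'M','a','n' (0x4D616E) split into the
    // sextets 19, 22, 5, 46 and encode to "TWFu".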
__ cbz(length, Exit); 9890 9891 __ BIND(Process3B); 9892 // 3 src bytes, 24 bits 9893 __ ldrb(r10, __ post(src, 1)); 9894 __ ldrb(r11, __ post(src, 1)); 9895 __ ldrb(r12, __ post(src, 1)); 9896 __ orrw(r11, r11, r10, Assembler::LSL, 8); 9897 __ orrw(r12, r12, r11, Assembler::LSL, 8); 9898 // codec index 9899 __ ubfmw(r15, r12, 18, 23); 9900 __ ubfmw(r14, r12, 12, 17); 9901 __ ubfmw(r13, r12, 6, 11); 9902 __ andw(r12, r12, 63); 9903 // get the code based on the codec 9904 __ ldrb(r15, Address(codec, r15, Address::uxtw(0))); 9905 __ ldrb(r14, Address(codec, r14, Address::uxtw(0))); 9906 __ ldrb(r13, Address(codec, r13, Address::uxtw(0))); 9907 __ ldrb(r12, Address(codec, r12, Address::uxtw(0))); 9908 __ strb(r15, __ post(dst, 1)); 9909 __ strb(r14, __ post(dst, 1)); 9910 __ strb(r13, __ post(dst, 1)); 9911 __ strb(r12, __ post(dst, 1)); 9912 __ sub(length, length, 3); 9913 __ cbnz(length, Process3B); 9914 9915 __ BIND(Exit); 9916 __ ret(lr); 9917 9918 return start; 9919 } 9920 9921 void generate_base64_decode_simdround(Register src, Register dst, 9922 FloatRegister codecL, FloatRegister codecH, int size, Label& Exit) { 9923 9924 FloatRegister in0 = v16, in1 = v17, in2 = v18, in3 = v19; 9925 FloatRegister out0 = v20, out1 = v21, out2 = v22; 9926 9927 FloatRegister decL0 = v23, decL1 = v24, decL2 = v25, decL3 = v26; 9928 FloatRegister decH0 = v28, decH1 = v29, decH2 = v30, decH3 = v31; 9929 9930 Label NoIllegalData, ErrorInLowerHalf, StoreLegalData; 9931 9932 Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B; 9933 9934 __ ld4(in0, in1, in2, in3, arrangement, __ post(src, 4 * size)); 9935 9936 // we need unsigned saturating subtract, to make sure all input values 9937 // in range [0, 63] will have 0U value in the higher half lookup 9938 __ uqsubv(decH0, __ T16B, in0, v27); 9939 __ uqsubv(decH1, __ T16B, in1, v27); 9940 __ uqsubv(decH2, __ T16B, in2, v27); 9941 __ uqsubv(decH3, __ T16B, in3, v27); 9942 9943 // lower half lookup 9944 __ tbl(decL0, arrangement, codecL, 4, in0); 9945 __ tbl(decL1, arrangement, codecL, 4, in1); 9946 __ tbl(decL2, arrangement, codecL, 4, in2); 9947 __ tbl(decL3, arrangement, codecL, 4, in3); 9948 9949 // higher half lookup 9950 __ tbx(decH0, arrangement, codecH, 4, decH0); 9951 __ tbx(decH1, arrangement, codecH, 4, decH1); 9952 __ tbx(decH2, arrangement, codecH, 4, decH2); 9953 __ tbx(decH3, arrangement, codecH, 4, decH3); 9954 9955 // combine lower and higher 9956 __ orr(decL0, arrangement, decL0, decH0); 9957 __ orr(decL1, arrangement, decL1, decH1); 9958 __ orr(decL2, arrangement, decL2, decH2); 9959 __ orr(decL3, arrangement, decL3, decH3); 9960 9961 // check illegal inputs, value larger than 63 (maximum of 6 bits) 9962 __ cm(Assembler::HI, decH0, arrangement, decL0, v27); 9963 __ cm(Assembler::HI, decH1, arrangement, decL1, v27); 9964 __ cm(Assembler::HI, decH2, arrangement, decL2, v27); 9965 __ cm(Assembler::HI, decH3, arrangement, decL3, v27); 9966 __ orr(in0, arrangement, decH0, decH1); 9967 __ orr(in1, arrangement, decH2, decH3); 9968 __ orr(in2, arrangement, in0, in1); 9969 __ umaxv(in3, arrangement, in2); 9970 __ umov(rscratch2, in3, __ B, 0); 9971 9972 // get the data to output 9973 __ shl(out0, arrangement, decL0, 2); 9974 __ ushr(out1, arrangement, decL1, 4); 9975 __ orr(out0, arrangement, out0, out1); 9976 __ shl(out1, arrangement, decL1, 4); 9977 __ ushr(out2, arrangement, decL2, 2); 9978 __ orr(out1, arrangement, out1, out2); 9979 __ shl(out2, arrangement, decL2, 6); 9980 __ orr(out2, arrangement, out2, decL3); 9981 9982 __ 
cbz(rscratch2, NoIllegalData); 9983 9984 // handle illegal input 9985 __ umov(r10, in2, __ D, 0); 9986 if (size == 16) { 9987 __ cbnz(r10, ErrorInLowerHalf); 9988 9989 // illegal input is in higher half, store the lower half now. 9990 __ st3(out0, out1, out2, __ T8B, __ post(dst, 24)); 9991 9992 __ umov(r10, in2, __ D, 1); 9993 __ umov(r11, out0, __ D, 1); 9994 __ umov(r12, out1, __ D, 1); 9995 __ umov(r13, out2, __ D, 1); 9996 __ b(StoreLegalData); 9997 9998 __ BIND(ErrorInLowerHalf); 9999 } 10000 __ umov(r11, out0, __ D, 0); 10001 __ umov(r12, out1, __ D, 0); 10002 __ umov(r13, out2, __ D, 0); 10003 10004 __ BIND(StoreLegalData); 10005 __ tbnz(r10, 5, Exit); // 0xff indicates illegal input 10006 __ strb(r11, __ post(dst, 1)); 10007 __ strb(r12, __ post(dst, 1)); 10008 __ strb(r13, __ post(dst, 1)); 10009 __ lsr(r10, r10, 8); 10010 __ lsr(r11, r11, 8); 10011 __ lsr(r12, r12, 8); 10012 __ lsr(r13, r13, 8); 10013 __ b(StoreLegalData); 10014 10015 __ BIND(NoIllegalData); 10016 __ st3(out0, out1, out2, arrangement, __ post(dst, 3 * size)); 10017 } 10018 10019 10020 /** 10021 * Arguments: 10022 * 10023 * Input: 10024 * c_rarg0 - src_start 10025 * c_rarg1 - src_offset 10026 * c_rarg2 - src_length 10027 * c_rarg3 - dest_start 10028 * c_rarg4 - dest_offset 10029 * c_rarg5 - isURL 10030 * c_rarg6 - isMIME 10031 * 10032 */ 10033 address generate_base64_decodeBlock() { 10034 10035 // The SIMD part of this Base64 decode intrinsic is based on the algorithm outlined 10036 // on http://0x80.pl/articles/base64-simd-neon.html#encoding-quadwords, in section 10037 // titled "Base64 decoding". 10038 10039 // Non-SIMD lookup tables are mostly dumped from fromBase64 array used in java.util.Base64, 10040 // except the trailing character '=' is also treated illegal value in this intrinsic. That 10041 // is java.util.Base64.fromBase64['='] = -2, while fromBase(URL)64ForNoSIMD['='] = 255 here. 
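    // A few spot checks of the tables below (illustrative only, assuming
    // ASCII input, which is what the intrinsic operates on):
    //
    //   assert(fromBase64ForNoSIMD['A'] == 0);
    //   assert(fromBase64ForNoSIMD['+'] == 62);
    //   assert(fromBase64ForNoSIMD['='] == 255); // java.util.Base64.fromBase64['='] == -2
    //
    // Any byte that maps to 255u sends the stub to its Exit path, so the
    // remaining input (for instance the '=' padding) is left to the Java
    // caller to handle.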
10042 static const uint8_t fromBase64ForNoSIMD[256] = { 10043 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10044 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10045 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 255u, 63u, 10046 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 10047 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u, 10048 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 255u, 10049 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u, 10050 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u, 10051 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10052 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10053 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10054 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10055 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10056 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10057 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10058 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10059 }; 10060 10061 static const uint8_t fromBase64URLForNoSIMD[256] = { 10062 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10063 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10064 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 10065 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 10066 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u, 10067 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 63u, 10068 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u, 10069 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u, 10070 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10071 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10072 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10073 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10074 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10075 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10076 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10077 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10078 }; 10079 10080 // A legal value of base64 code is in range [0, 127]. We need two lookups 10081 // with tbl/tbx and combine them to get the decode data. The 1st table vector 10082 // lookup use tbl, out of range indices are set to 0 in destination. 
The 2nd 10083 // table vector lookup use tbx, out of range indices are unchanged in 10084 // destination. Input [64..126] is mapped to index [65, 127] in second lookup. 10085 // The value of index 64 is set to 0, so that we know that we already get the 10086 // decoded data with the 1st lookup. 10087 static const uint8_t fromBase64ForSIMD[128] = { 10088 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10089 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10090 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 255u, 63u, 10091 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 10092 0u, 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 10093 14u, 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 10094 255u, 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 10095 40u, 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 10096 }; 10097 10098 static const uint8_t fromBase64URLForSIMD[128] = { 10099 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10100 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10101 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 10102 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 10103 0u, 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 10104 14u, 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 10105 63u, 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 10106 40u, 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 10107 }; 10108 10109 __ align(CodeEntryAlignment); 10110 StubGenStubId stub_id = StubGenStubId::base64_decodeBlock_id; 10111 StubCodeMark mark(this, stub_id); 10112 address start = __ pc(); 10113 10114 Register src = c_rarg0; // source array 10115 Register soff = c_rarg1; // source start offset 10116 Register send = c_rarg2; // source end offset 10117 Register dst = c_rarg3; // dest array 10118 Register doff = c_rarg4; // position for writing to dest array 10119 Register isURL = c_rarg5; // Base64 or URL character set 10120 Register isMIME = c_rarg6; // Decoding MIME block - unused in this implementation 10121 10122 Register length = send; // reuse send as length of source data to process 10123 10124 Register simd_codec = c_rarg6; 10125 Register nosimd_codec = c_rarg7; 10126 10127 Label ProcessData, Process64B, Process32B, Process4B, SIMDEnter, SIMDExit, Exit; 10128 10129 __ enter(); 10130 10131 __ add(src, src, soff); 10132 __ add(dst, dst, doff); 10133 10134 __ mov(doff, dst); 10135 10136 __ sub(length, send, soff); 10137 __ bfm(length, zr, 0, 1); 10138 10139 __ lea(nosimd_codec, ExternalAddress((address) fromBase64ForNoSIMD)); 10140 __ cbz(isURL, ProcessData); 10141 __ lea(nosimd_codec, ExternalAddress((address) fromBase64URLForNoSIMD)); 10142 10143 __ BIND(ProcessData); 10144 __ mov(rscratch1, length); 10145 __ cmp(length, (u1)144); // 144 = 80 + 64 10146 __ br(Assembler::LT, Process4B); 10147 10148 // In the MIME case, the line length cannot be more than 76 10149 // bytes (see RFC 2045). This is too short a block for SIMD 10150 // to be worthwhile, so we use non-SIMD here. 
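    // In C, approximately, the scalar loop at Process4B below does the
    // following (an illustrative sketch only, not generated code; the
    // variable names are ours and "count" plays the role of rscratch1):
    //
    //   while (count > 0) {
    //     uint32_t b0 = codec[src[0]], b1 = codec[src[1]],
    //              b2 = codec[src[2]], b3 = codec[src[3]];
    //     if ((b0 | b1 | b2 | b3) & 0x80) break;   // 255u marks illegal input
    //     uint32_t bits = b0 << 18 | b1 << 12 | b2 << 6 | b3;
    //     dst[0] = (uint8_t)(bits >> 16);
    //     dst[1] = (uint8_t)(bits >> 8);
    //     dst[2] = (uint8_t)bits;
    //     src += 4; dst += 3; count -= 4;
    //   }
    //
    // For example, "TWFu" looks up 19, 22, 5, 46 and reassembles to the
    // three bytes 0x4D 0x61 0x6E, i.e. "Man".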
10151 __ movw(rscratch1, 79); 10152 10153 __ BIND(Process4B); 10154 __ ldrw(r14, __ post(src, 4)); 10155 __ ubfxw(r10, r14, 0, 8); 10156 __ ubfxw(r11, r14, 8, 8); 10157 __ ubfxw(r12, r14, 16, 8); 10158 __ ubfxw(r13, r14, 24, 8); 10159 // get the de-code 10160 __ ldrb(r10, Address(nosimd_codec, r10, Address::uxtw(0))); 10161 __ ldrb(r11, Address(nosimd_codec, r11, Address::uxtw(0))); 10162 __ ldrb(r12, Address(nosimd_codec, r12, Address::uxtw(0))); 10163 __ ldrb(r13, Address(nosimd_codec, r13, Address::uxtw(0))); 10164 // error detection, 255u indicates an illegal input 10165 __ orrw(r14, r10, r11); 10166 __ orrw(r15, r12, r13); 10167 __ orrw(r14, r14, r15); 10168 __ tbnz(r14, 7, Exit); 10169 // recover the data 10170 __ lslw(r14, r10, 10); 10171 __ bfiw(r14, r11, 4, 6); 10172 __ bfmw(r14, r12, 2, 5); 10173 __ rev16w(r14, r14); 10174 __ bfiw(r13, r12, 6, 2); 10175 __ strh(r14, __ post(dst, 2)); 10176 __ strb(r13, __ post(dst, 1)); 10177 // non-simd loop 10178 __ subsw(rscratch1, rscratch1, 4); 10179 __ br(Assembler::GT, Process4B); 10180 10181 // if exiting from PreProcess80B, rscratch1 == -1; 10182 // otherwise, rscratch1 == 0. 10183 __ cbzw(rscratch1, Exit); 10184 __ sub(length, length, 80); 10185 10186 __ lea(simd_codec, ExternalAddress((address) fromBase64ForSIMD)); 10187 __ cbz(isURL, SIMDEnter); 10188 __ lea(simd_codec, ExternalAddress((address) fromBase64URLForSIMD)); 10189 10190 __ BIND(SIMDEnter); 10191 __ ld1(v0, v1, v2, v3, __ T16B, __ post(simd_codec, 64)); 10192 __ ld1(v4, v5, v6, v7, __ T16B, Address(simd_codec)); 10193 __ mov(rscratch1, 63); 10194 __ dup(v27, __ T16B, rscratch1); 10195 10196 __ BIND(Process64B); 10197 __ cmp(length, (u1)64); 10198 __ br(Assembler::LT, Process32B); 10199 generate_base64_decode_simdround(src, dst, v0, v4, 16, Exit); 10200 __ sub(length, length, 64); 10201 __ b(Process64B); 10202 10203 __ BIND(Process32B); 10204 __ cmp(length, (u1)32); 10205 __ br(Assembler::LT, SIMDExit); 10206 generate_base64_decode_simdround(src, dst, v0, v4, 8, Exit); 10207 __ sub(length, length, 32); 10208 __ b(Process32B); 10209 10210 __ BIND(SIMDExit); 10211 __ cbz(length, Exit); 10212 __ movw(rscratch1, length); 10213 __ b(Process4B); 10214 10215 __ BIND(Exit); 10216 __ sub(c_rarg0, dst, doff); 10217 10218 __ leave(); 10219 __ ret(lr); 10220 10221 return start; 10222 } 10223 10224 // Support for spin waits. 
10225 address generate_spin_wait() { 10226 __ align(CodeEntryAlignment); 10227 StubGenStubId stub_id = StubGenStubId::spin_wait_id; 10228 StubCodeMark mark(this, stub_id); 10229 address start = __ pc(); 10230 10231 __ spin_wait(); 10232 __ ret(lr); 10233 10234 return start; 10235 } 10236 10237 void generate_lookup_secondary_supers_table_stub() { 10238 StubGenStubId stub_id = StubGenStubId::lookup_secondary_supers_table_id; 10239 StubCodeMark mark(this, stub_id); 10240 10241 const Register 10242 r_super_klass = r0, 10243 r_array_base = r1, 10244 r_array_length = r2, 10245 r_array_index = r3, 10246 r_sub_klass = r4, 10247 r_bitmap = rscratch2, 10248 result = r5; 10249 const FloatRegister 10250 vtemp = v0; 10251 10252 for (int slot = 0; slot < Klass::SECONDARY_SUPERS_TABLE_SIZE; slot++) { 10253 StubRoutines::_lookup_secondary_supers_table_stubs[slot] = __ pc(); 10254 Label L_success; 10255 __ enter(); 10256 __ lookup_secondary_supers_table_const(r_sub_klass, r_super_klass, 10257 r_array_base, r_array_length, r_array_index, 10258 vtemp, result, slot, 10259 /*stub_is_near*/true); 10260 __ leave(); 10261 __ ret(lr); 10262 } 10263 } 10264 10265 // Slow path implementation for UseSecondarySupersTable. 10266 address generate_lookup_secondary_supers_table_slow_path_stub() { 10267 StubGenStubId stub_id = StubGenStubId::lookup_secondary_supers_table_slow_path_id; 10268 StubCodeMark mark(this, stub_id); 10269 10270 address start = __ pc(); 10271 const Register 10272 r_super_klass = r0, // argument 10273 r_array_base = r1, // argument 10274 temp1 = r2, // temp 10275 r_array_index = r3, // argument 10276 r_bitmap = rscratch2, // argument 10277 result = r5; // argument 10278 10279 __ lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base, r_array_index, r_bitmap, temp1, result); 10280 __ ret(lr); 10281 10282 return start; 10283 } 10284 10285 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS) 10286 10287 // ARMv8.1 LSE versions of the atomic stubs used by Atomic::PlatformXX. 10288 // 10289 // If LSE is in use, generate LSE versions of all the stubs. The 10290 // non-LSE versions are in atomic_aarch64.S. 10291 10292 // class AtomicStubMark records the entry point of a stub and the 10293 // stub pointer which will point to it. The stub pointer is set to 10294 // the entry point when ~AtomicStubMark() is called, which must be 10295 // after ICache::invalidate_range. This ensures safe publication of 10296 // the generated code. 10297 class AtomicStubMark { 10298 address _entry_point; 10299 aarch64_atomic_stub_t *_stub; 10300 MacroAssembler *_masm; 10301 public: 10302 AtomicStubMark(MacroAssembler *masm, aarch64_atomic_stub_t *stub) { 10303 _masm = masm; 10304 __ align(32); 10305 _entry_point = __ pc(); 10306 _stub = stub; 10307 } 10308 ~AtomicStubMark() { 10309 *_stub = (aarch64_atomic_stub_t)_entry_point; 10310 } 10311 }; 10312 10313 // NB: For memory_order_conservative we need a trailing membar after 10314 // LSE atomic operations but not a leading membar. 10315 // 10316 // We don't need a leading membar because a clause in the Arm ARM 10317 // says: 10318 // 10319 // Barrier-ordered-before 10320 // 10321 // Barrier instructions order prior Memory effects before subsequent 10322 // Memory effects generated by the same Observer. A read or a write 10323 // RW1 is Barrier-ordered-before a read or a write RW 2 from the same 10324 // Observer if and only if RW1 appears in program order before RW 2 10325 // and [ ... 
] at least one of RW 1 and RW 2 is generated by an atomic 10326 // instruction with both Acquire and Release semantics. 10327 // 10328 // All the atomic instructions {ldaddal, swapal, casal} have Acquire 10329 // and Release semantics, therefore we don't need a leading 10330 // barrier. However, there is no corresponding Barrier-ordered-after 10331 // relationship, therefore we need a trailing membar to prevent a 10332 // later store or load from being reordered with the store in an 10333 // atomic instruction. 10334 // 10335 // This was checked by using the herd7 consistency model simulator 10336 // (http://diy.inria.fr/) with this test case: 10337 // 10338 // AArch64 LseCas 10339 // { 0:X1=x; 0:X2=y; 1:X1=x; 1:X2=y; } 10340 // P0 | P1; 10341 // LDR W4, [X2] | MOV W3, #0; 10342 // DMB LD | MOV W4, #1; 10343 // LDR W3, [X1] | CASAL W3, W4, [X1]; 10344 // | DMB ISH; 10345 // | STR W4, [X2]; 10346 // exists 10347 // (0:X3=0 /\ 0:X4=1) 10348 // 10349 // If X3 == 0 && X4 == 1, the store to y in P1 has been reordered 10350 // with the store to x in P1. Without the DMB in P1 this may happen. 10351 // 10352 // At the time of writing we don't know of any AArch64 hardware that 10353 // reorders stores in this way, but the Reference Manual permits it. 10354 10355 void gen_cas_entry(Assembler::operand_size size, 10356 atomic_memory_order order) { 10357 Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1, 10358 exchange_val = c_rarg2; 10359 bool acquire, release; 10360 switch (order) { 10361 case memory_order_relaxed: 10362 acquire = false; 10363 release = false; 10364 break; 10365 case memory_order_release: 10366 acquire = false; 10367 release = true; 10368 break; 10369 default: 10370 acquire = true; 10371 release = true; 10372 break; 10373 } 10374 __ mov(prev, compare_val); 10375 __ lse_cas(prev, exchange_val, ptr, size, acquire, release, /*not_pair*/true); 10376 if (order == memory_order_conservative) { 10377 __ membar(Assembler::StoreStore|Assembler::StoreLoad); 10378 } 10379 if (size == Assembler::xword) { 10380 __ mov(r0, prev); 10381 } else { 10382 __ movw(r0, prev); 10383 } 10384 __ ret(lr); 10385 } 10386 10387 void gen_ldadd_entry(Assembler::operand_size size, atomic_memory_order order) { 10388 Register prev = r2, addr = c_rarg0, incr = c_rarg1; 10389 // If not relaxed, then default to conservative. Relaxed is the only 10390 // case we use enough to be worth specializing. 10391 if (order == memory_order_relaxed) { 10392 __ ldadd(size, incr, prev, addr); 10393 } else { 10394 __ ldaddal(size, incr, prev, addr); 10395 __ membar(Assembler::StoreStore|Assembler::StoreLoad); 10396 } 10397 if (size == Assembler::xword) { 10398 __ mov(r0, prev); 10399 } else { 10400 __ movw(r0, prev); 10401 } 10402 __ ret(lr); 10403 } 10404 10405 void gen_swpal_entry(Assembler::operand_size size) { 10406 Register prev = r2, addr = c_rarg0, incr = c_rarg1; 10407 __ swpal(size, incr, prev, addr); 10408 __ membar(Assembler::StoreStore|Assembler::StoreLoad); 10409 if (size == Assembler::xword) { 10410 __ mov(r0, prev); 10411 } else { 10412 __ movw(r0, prev); 10413 } 10414 __ ret(lr); 10415 } 10416 10417 void generate_atomic_entry_points() { 10418 if (! 
UseLSE) { 10419 return; 10420 } 10421 __ align(CodeEntryAlignment); 10422 StubGenStubId stub_id = StubGenStubId::atomic_entry_points_id; 10423 StubCodeMark mark(this, stub_id); 10424 address first_entry = __ pc(); 10425 10426 // ADD, memory_order_conservative 10427 AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl); 10428 gen_ldadd_entry(Assembler::word, memory_order_conservative); 10429 AtomicStubMark mark_fetch_add_8(_masm, &aarch64_atomic_fetch_add_8_impl); 10430 gen_ldadd_entry(Assembler::xword, memory_order_conservative); 10431 10432 // ADD, memory_order_relaxed 10433 AtomicStubMark mark_fetch_add_4_relaxed 10434 (_masm, &aarch64_atomic_fetch_add_4_relaxed_impl); 10435 gen_ldadd_entry(MacroAssembler::word, memory_order_relaxed); 10436 AtomicStubMark mark_fetch_add_8_relaxed 10437 (_masm, &aarch64_atomic_fetch_add_8_relaxed_impl); 10438 gen_ldadd_entry(MacroAssembler::xword, memory_order_relaxed); 10439 10440 // XCHG, memory_order_conservative 10441 AtomicStubMark mark_xchg_4(_masm, &aarch64_atomic_xchg_4_impl); 10442 gen_swpal_entry(Assembler::word); 10443 AtomicStubMark mark_xchg_8_impl(_masm, &aarch64_atomic_xchg_8_impl); 10444 gen_swpal_entry(Assembler::xword); 10445 10446 // CAS, memory_order_conservative 10447 AtomicStubMark mark_cmpxchg_1(_masm, &aarch64_atomic_cmpxchg_1_impl); 10448 gen_cas_entry(MacroAssembler::byte, memory_order_conservative); 10449 AtomicStubMark mark_cmpxchg_4(_masm, &aarch64_atomic_cmpxchg_4_impl); 10450 gen_cas_entry(MacroAssembler::word, memory_order_conservative); 10451 AtomicStubMark mark_cmpxchg_8(_masm, &aarch64_atomic_cmpxchg_8_impl); 10452 gen_cas_entry(MacroAssembler::xword, memory_order_conservative); 10453 10454 // CAS, memory_order_relaxed 10455 AtomicStubMark mark_cmpxchg_1_relaxed 10456 (_masm, &aarch64_atomic_cmpxchg_1_relaxed_impl); 10457 gen_cas_entry(MacroAssembler::byte, memory_order_relaxed); 10458 AtomicStubMark mark_cmpxchg_4_relaxed 10459 (_masm, &aarch64_atomic_cmpxchg_4_relaxed_impl); 10460 gen_cas_entry(MacroAssembler::word, memory_order_relaxed); 10461 AtomicStubMark mark_cmpxchg_8_relaxed 10462 (_masm, &aarch64_atomic_cmpxchg_8_relaxed_impl); 10463 gen_cas_entry(MacroAssembler::xword, memory_order_relaxed); 10464 10465 AtomicStubMark mark_cmpxchg_4_release 10466 (_masm, &aarch64_atomic_cmpxchg_4_release_impl); 10467 gen_cas_entry(MacroAssembler::word, memory_order_release); 10468 AtomicStubMark mark_cmpxchg_8_release 10469 (_masm, &aarch64_atomic_cmpxchg_8_release_impl); 10470 gen_cas_entry(MacroAssembler::xword, memory_order_release); 10471 10472 AtomicStubMark mark_cmpxchg_4_seq_cst 10473 (_masm, &aarch64_atomic_cmpxchg_4_seq_cst_impl); 10474 gen_cas_entry(MacroAssembler::word, memory_order_seq_cst); 10475 AtomicStubMark mark_cmpxchg_8_seq_cst 10476 (_masm, &aarch64_atomic_cmpxchg_8_seq_cst_impl); 10477 gen_cas_entry(MacroAssembler::xword, memory_order_seq_cst); 10478 10479 ICache::invalidate_range(first_entry, __ pc() - first_entry); 10480 } 10481 #endif // LINUX 10482 10483 static void save_return_registers(MacroAssembler* masm) { 10484 if (InlineTypeReturnedAsFields) { 10485 masm->push(RegSet::range(r0, r7), sp); 10486 masm->sub(sp, sp, 4 * wordSize); 10487 masm->st1(v0, v1, v2, v3, masm->T1D, Address(sp)); 10488 masm->sub(sp, sp, 4 * wordSize); 10489 masm->st1(v4, v5, v6, v7, masm->T1D, Address(sp)); 10490 } else { 10491 masm->fmovd(rscratch1, v0); 10492 masm->stp(rscratch1, r0, Address(masm->pre(sp, -2 * wordSize))); 10493 } 10494 } 10495 10496 static void restore_return_registers(MacroAssembler* 
masm) { 10497 if (InlineTypeReturnedAsFields) { 10498 masm->ld1(v4, v5, v6, v7, masm->T1D, Address(masm->post(sp, 4 * wordSize))); 10499 masm->ld1(v0, v1, v2, v3, masm->T1D, Address(masm->post(sp, 4 * wordSize))); 10500 masm->pop(RegSet::range(r0, r7), sp); 10501 } else { 10502 masm->ldp(rscratch1, r0, Address(masm->post(sp, 2 * wordSize))); 10503 masm->fmovd(v0, rscratch1); 10504 } 10505 } 10506 10507 address generate_cont_thaw(Continuation::thaw_kind kind) { 10508 bool return_barrier = Continuation::is_thaw_return_barrier(kind); 10509 bool return_barrier_exception = Continuation::is_thaw_return_barrier_exception(kind); 10510 10511 address start = __ pc(); 10512 10513 if (return_barrier) { 10514 __ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())); 10515 __ mov(sp, rscratch1); 10516 } 10517 assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp"); 10518 10519 if (return_barrier) { 10520 // preserve possible return value from a method returning to the return barrier 10521 save_return_registers(_masm); 10522 } 10523 10524 __ movw(c_rarg1, (return_barrier ? 1 : 0)); 10525 __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), rthread, c_rarg1); 10526 __ mov(rscratch2, r0); // r0 contains the size of the frames to thaw, 0 if overflow or no more frames 10527 10528 if (return_barrier) { 10529 // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK) 10530 restore_return_registers(_masm); 10531 } 10532 assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp"); 10533 10534 10535 Label thaw_success; 10536 // rscratch2 contains the size of the frames to thaw, 0 if overflow or no more frames 10537 __ cbnz(rscratch2, thaw_success); 10538 __ lea(rscratch1, RuntimeAddress(SharedRuntime::throw_StackOverflowError_entry())); 10539 __ br(rscratch1); 10540 __ bind(thaw_success); 10541 10542 // make room for the thawed frames 10543 __ sub(rscratch1, sp, rscratch2); 10544 __ andr(rscratch1, rscratch1, -16); // align 10545 __ mov(sp, rscratch1); 10546 10547 if (return_barrier) { 10548 // save original return value -- again 10549 save_return_registers(_masm); 10550 } 10551 10552 // If we want, we can templatize thaw by kind, and have three different entries 10553 __ movw(c_rarg1, (uint32_t)kind); 10554 10555 __ call_VM_leaf(Continuation::thaw_entry(), rthread, c_rarg1); 10556 __ mov(rscratch2, r0); // r0 is the sp of the yielding frame 10557 10558 if (return_barrier) { 10559 // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK) 10560 restore_return_registers(_masm); 10561 } else { 10562 __ mov(r0, zr); // return 0 (success) from doYield 10563 } 10564 10565 // we're now on the yield frame (which is in an address above us b/c rsp has been pushed down) 10566 __ sub(sp, rscratch2, 2*wordSize); // now pointing to rfp spill 10567 __ mov(rfp, sp); 10568 10569 if (return_barrier_exception) { 10570 __ ldr(c_rarg1, Address(rfp, wordSize)); // return address 10571 __ authenticate_return_address(c_rarg1); 10572 __ verify_oop(r0); 10573 // save return value containing the exception oop in callee-saved R19 10574 __ mov(r19, r0); 10575 10576 __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), rthread, c_rarg1); 10577 10578 // Reinitialize the ptrue predicate register, in case the external 
runtime call clobbers ptrue reg, as we may return to SVE compiled code. 10579 // __ reinitialize_ptrue(); 10580 10581 // see OptoRuntime::generate_exception_blob: r0 -- exception oop, r3 -- exception pc 10582 10583 __ mov(r1, r0); // the exception handler 10584 __ mov(r0, r19); // restore return value containing the exception oop 10585 __ verify_oop(r0); 10586 10587 __ leave(); 10588 __ mov(r3, lr); 10589 __ br(r1); // the exception handler 10590 } else { 10591 // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame 10592 __ leave(); 10593 __ ret(lr); 10594 } 10595 10596 return start; 10597 } 10598 10599 address generate_cont_thaw() { 10600 if (!Continuations::enabled()) return nullptr; 10601 10602 StubGenStubId stub_id = StubGenStubId::cont_thaw_id; 10603 StubCodeMark mark(this, stub_id); 10604 address start = __ pc(); 10605 generate_cont_thaw(Continuation::thaw_top); 10606 return start; 10607 } 10608 10609 address generate_cont_returnBarrier() { 10610 if (!Continuations::enabled()) return nullptr; 10611 10612 // TODO: will probably need multiple return barriers depending on return type 10613 StubGenStubId stub_id = StubGenStubId::cont_returnBarrier_id; 10614 StubCodeMark mark(this, stub_id); 10615 address start = __ pc(); 10616 10617 generate_cont_thaw(Continuation::thaw_return_barrier); 10618 10619 return start; 10620 } 10621 10622 address generate_cont_returnBarrier_exception() { 10623 if (!Continuations::enabled()) return nullptr; 10624 10625 StubGenStubId stub_id = StubGenStubId::cont_returnBarrierExc_id; 10626 StubCodeMark mark(this, stub_id); 10627 address start = __ pc(); 10628 10629 generate_cont_thaw(Continuation::thaw_return_barrier_exception); 10630 10631 return start; 10632 } 10633 10634 address generate_cont_preempt_stub() { 10635 if (!Continuations::enabled()) return nullptr; 10636 StubGenStubId stub_id = StubGenStubId::cont_preempt_id; 10637 StubCodeMark mark(this, stub_id); 10638 address start = __ pc(); 10639 10640 __ reset_last_Java_frame(true); 10641 10642 // Set sp to enterSpecial frame, i.e. remove all frames copied into the heap. 10643 __ ldr(rscratch2, Address(rthread, JavaThread::cont_entry_offset())); 10644 __ mov(sp, rscratch2); 10645 10646 Label preemption_cancelled; 10647 __ ldrb(rscratch1, Address(rthread, JavaThread::preemption_cancelled_offset())); 10648 __ cbnz(rscratch1, preemption_cancelled); 10649 10650 // Remove enterSpecial frame from the stack and return to Continuation.run() to unmount. 10651 SharedRuntime::continuation_enter_cleanup(_masm); 10652 __ leave(); 10653 __ ret(lr); 10654 10655 // We acquired the monitor after freezing the frames so call thaw to continue execution. 10656 __ bind(preemption_cancelled); 10657 __ strb(zr, Address(rthread, JavaThread::preemption_cancelled_offset())); 10658 __ lea(rfp, Address(sp, checked_cast<int32_t>(ContinuationEntry::size()))); 10659 __ lea(rscratch1, ExternalAddress(ContinuationEntry::thaw_call_pc_address())); 10660 __ ldr(rscratch1, Address(rscratch1)); 10661 __ br(rscratch1); 10662 10663 return start; 10664 } 10665 10666 // In sun.security.util.math.intpoly.IntegerPolynomial1305, integers 10667 // are represented as long[5], with BITS_PER_LIMB = 26. 10668 // Pack five 26-bit limbs into three 64-bit registers. 
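  // In C, approximately (an illustrative sketch only; limbs[] stands for
  // the caller's long[5]):
  //
  //   dest0 = limbs[0] | limbs[1] << 26 | limbs[2] << 52;        // low 64 bits
  //   dest1 = limbs[2] >> 12 | limbs[3] << 14 | limbs[4] << 40;  // middle 64 bits
  //   dest2 = limbs[4] >> 24;                                    // top 2 bits
  //
  // i.e. the 5 x 26 = 130 significant bits end up contiguous in
  // dest2:dest1:dest0.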
10669 void pack_26(Register dest0, Register dest1, Register dest2, Register src) { 10670 __ ldp(dest0, rscratch1, Address(src, 0)); // 26 bits 10671 __ add(dest0, dest0, rscratch1, Assembler::LSL, 26); // 26 bits 10672 __ ldp(rscratch1, rscratch2, Address(src, 2 * sizeof (jlong))); 10673 __ add(dest0, dest0, rscratch1, Assembler::LSL, 52); // 12 bits 10674 10675 __ add(dest1, zr, rscratch1, Assembler::LSR, 12); // 14 bits 10676 __ add(dest1, dest1, rscratch2, Assembler::LSL, 14); // 26 bits 10677 __ ldr(rscratch1, Address(src, 4 * sizeof (jlong))); 10678 __ add(dest1, dest1, rscratch1, Assembler::LSL, 40); // 24 bits 10679 10680 if (dest2->is_valid()) { 10681 __ add(dest2, zr, rscratch1, Assembler::LSR, 24); // 2 bits 10682 } else { 10683 #ifdef ASSERT 10684 Label OK; 10685 __ cmp(zr, rscratch1, Assembler::LSR, 24); // 2 bits 10686 __ br(__ EQ, OK); 10687 __ stop("high bits of Poly1305 integer should be zero"); 10688 __ should_not_reach_here(); 10689 __ bind(OK); 10690 #endif 10691 } 10692 } 10693 10694 // As above, but return only a 128-bit integer, packed into two 10695 // 64-bit registers. 10696 void pack_26(Register dest0, Register dest1, Register src) { 10697 pack_26(dest0, dest1, noreg, src); 10698 } 10699 10700 // Multiply and multiply-accumulate unsigned 64-bit registers. 10701 void wide_mul(Register prod_lo, Register prod_hi, Register n, Register m) { 10702 __ mul(prod_lo, n, m); 10703 __ umulh(prod_hi, n, m); 10704 } 10705 void wide_madd(Register sum_lo, Register sum_hi, Register n, Register m) { 10706 wide_mul(rscratch1, rscratch2, n, m); 10707 __ adds(sum_lo, sum_lo, rscratch1); 10708 __ adc(sum_hi, sum_hi, rscratch2); 10709 } 10710 10711 // Poly1305, RFC 7539 10712 10713 // See https://loup-vaillant.fr/tutorials/poly1305-design for a 10714 // description of the tricks used to simplify and accelerate this 10715 // computation. 10716 10717 address generate_poly1305_processBlocks() { 10718 __ align(CodeEntryAlignment); 10719 StubGenStubId stub_id = StubGenStubId::poly1305_processBlocks_id; 10720 StubCodeMark mark(this, stub_id); 10721 address start = __ pc(); 10722 Label here; 10723 __ enter(); 10724 RegSet callee_saved = RegSet::range(r19, r28); 10725 __ push(callee_saved, sp); 10726 10727 RegSetIterator<Register> regs = (RegSet::range(c_rarg0, r28) - r18_tls - rscratch1 - rscratch2).begin(); 10728 10729 // Arguments 10730 const Register input_start = *regs, length = *++regs, acc_start = *++regs, r_start = *++regs; 10731 10732 // R_n is the 128-bit randomly-generated key, packed into two 10733 // registers. The caller passes this key to us as long[5], with 10734 // BITS_PER_LIMB = 26. 
10735 const Register R_0 = *++regs, R_1 = *++regs; 10736 pack_26(R_0, R_1, r_start); 10737 10738 // RR_n is (R_n >> 2) * 5 10739 const Register RR_0 = *++regs, RR_1 = *++regs; 10740 __ lsr(RR_0, R_0, 2); 10741 __ add(RR_0, RR_0, RR_0, Assembler::LSL, 2); 10742 __ lsr(RR_1, R_1, 2); 10743 __ add(RR_1, RR_1, RR_1, Assembler::LSL, 2); 10744 10745 // U_n is the current checksum 10746 const Register U_0 = *++regs, U_1 = *++regs, U_2 = *++regs; 10747 pack_26(U_0, U_1, U_2, acc_start); 10748 10749 static constexpr int BLOCK_LENGTH = 16; 10750 Label DONE, LOOP; 10751 10752 __ cmp(length, checked_cast<u1>(BLOCK_LENGTH)); 10753 __ br(Assembler::LT, DONE); { 10754 __ bind(LOOP); 10755 10756 // S_n is to be the sum of U_n and the next block of data 10757 const Register S_0 = *++regs, S_1 = *++regs, S_2 = *++regs; 10758 __ ldp(S_0, S_1, __ post(input_start, 2 * wordSize)); 10759 __ adds(S_0, U_0, S_0); 10760 __ adcs(S_1, U_1, S_1); 10761 __ adc(S_2, U_2, zr); 10762 __ add(S_2, S_2, 1); 10763 10764 const Register U_0HI = *++regs, U_1HI = *++regs; 10765 10766 // NB: this logic depends on some of the special properties of 10767 // Poly1305 keys. In particular, because we know that the top 10768 // four bits of R_0 and R_1 are zero, we can add together 10769 // partial products without any risk of needing to propagate a 10770 // carry out. 10771 wide_mul(U_0, U_0HI, S_0, R_0); wide_madd(U_0, U_0HI, S_1, RR_1); wide_madd(U_0, U_0HI, S_2, RR_0); 10772 wide_mul(U_1, U_1HI, S_0, R_1); wide_madd(U_1, U_1HI, S_1, R_0); wide_madd(U_1, U_1HI, S_2, RR_1); 10773 __ andr(U_2, R_0, 3); 10774 __ mul(U_2, S_2, U_2); 10775 10776 // Recycle registers S_0, S_1, S_2 10777 regs = (regs.remaining() + S_0 + S_1 + S_2).begin(); 10778 10779 // Partial reduction mod 2**130 - 5 10780 __ adds(U_1, U_0HI, U_1); 10781 __ adc(U_2, U_1HI, U_2); 10782 // Sum now in U_2:U_1:U_0. 10783 // Dead: U_0HI, U_1HI. 10784 regs = (regs.remaining() + U_0HI + U_1HI).begin(); 10785 10786 // U_2:U_1:U_0 += (U_2 >> 2) * 5 in two steps 10787 10788 // First, U_2:U_1:U_0 += (U_2 >> 2) 10789 __ lsr(rscratch1, U_2, 2); 10790 __ andr(U_2, U_2, (u8)3); 10791 __ adds(U_0, U_0, rscratch1); 10792 __ adcs(U_1, U_1, zr); 10793 __ adc(U_2, U_2, zr); 10794 // Second, U_2:U_1:U_0 += (U_2 >> 2) << 2 10795 __ adds(U_0, U_0, rscratch1, Assembler::LSL, 2); 10796 __ adcs(U_1, U_1, zr); 10797 __ adc(U_2, U_2, zr); 10798 10799 __ sub(length, length, checked_cast<u1>(BLOCK_LENGTH)); 10800 __ cmp(length, checked_cast<u1>(BLOCK_LENGTH)); 10801 __ br(~ Assembler::LT, LOOP); 10802 } 10803 10804 // Further reduce modulo 2^130 - 5 10805 __ lsr(rscratch1, U_2, 2); 10806 __ add(rscratch1, rscratch1, rscratch1, Assembler::LSL, 2); // rscratch1 = U_2 * 5 10807 __ adds(U_0, U_0, rscratch1); // U_0 += U_2 * 5 10808 __ adcs(U_1, U_1, zr); 10809 __ andr(U_2, U_2, (u1)3); 10810 __ adc(U_2, U_2, zr); 10811 10812 // Unpack the sum into five 26-bit limbs and write to memory. 
10813 __ ubfiz(rscratch1, U_0, 0, 26); 10814 __ ubfx(rscratch2, U_0, 26, 26); 10815 __ stp(rscratch1, rscratch2, Address(acc_start)); 10816 __ ubfx(rscratch1, U_0, 52, 12); 10817 __ bfi(rscratch1, U_1, 12, 14); 10818 __ ubfx(rscratch2, U_1, 14, 26); 10819 __ stp(rscratch1, rscratch2, Address(acc_start, 2 * sizeof (jlong))); 10820 __ ubfx(rscratch1, U_1, 40, 24); 10821 __ bfi(rscratch1, U_2, 24, 3); 10822 __ str(rscratch1, Address(acc_start, 4 * sizeof (jlong))); 10823 10824 __ bind(DONE); 10825 __ pop(callee_saved, sp); 10826 __ leave(); 10827 __ ret(lr); 10828 10829 return start; 10830 } 10831 10832 // exception handler for upcall stubs 10833 address generate_upcall_stub_exception_handler() { 10834 StubGenStubId stub_id = StubGenStubId::upcall_stub_exception_handler_id; 10835 StubCodeMark mark(this, stub_id); 10836 address start = __ pc(); 10837 10838 // Native caller has no idea how to handle exceptions, 10839 // so we just crash here. Up to callee to catch exceptions. 10840 __ verify_oop(r0); 10841 __ movptr(rscratch1, CAST_FROM_FN_PTR(uint64_t, UpcallLinker::handle_uncaught_exception)); 10842 __ blr(rscratch1); 10843 __ should_not_reach_here(); 10844 10845 return start; 10846 } 10847 10848 // load Method* target of MethodHandle 10849 // j_rarg0 = jobject receiver 10850 // rmethod = result 10851 address generate_upcall_stub_load_target() { 10852 StubGenStubId stub_id = StubGenStubId::upcall_stub_load_target_id; 10853 StubCodeMark mark(this, stub_id); 10854 address start = __ pc(); 10855 10856 __ resolve_global_jobject(j_rarg0, rscratch1, rscratch2); 10857 // Load target method from receiver 10858 __ load_heap_oop(rmethod, Address(j_rarg0, java_lang_invoke_MethodHandle::form_offset()), rscratch1, rscratch2); 10859 __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_LambdaForm::vmentry_offset()), rscratch1, rscratch2); 10860 __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_MemberName::method_offset()), rscratch1, rscratch2); 10861 __ access_load_at(T_ADDRESS, IN_HEAP, rmethod, 10862 Address(rmethod, java_lang_invoke_ResolvedMethodName::vmtarget_offset()), 10863 noreg, noreg); 10864 __ str(rmethod, Address(rthread, JavaThread::callee_target_offset())); // just in case callee is deoptimized 10865 10866 __ ret(lr); 10867 10868 return start; 10869 } 10870 10871 #undef __ 10872 #define __ masm-> 10873 10874 class MontgomeryMultiplyGenerator : public MacroAssembler { 10875 10876 Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn, 10877 Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj; 10878 10879 RegSet _toSave; 10880 bool _squaring; 10881 10882 public: 10883 MontgomeryMultiplyGenerator (Assembler *as, bool squaring) 10884 : MacroAssembler(as->code()), _squaring(squaring) { 10885 10886 // Register allocation 10887 10888 RegSetIterator<Register> regs = (RegSet::range(r0, r26) - r18_tls).begin(); 10889 Pa_base = *regs; // Argument registers 10890 if (squaring) 10891 Pb_base = Pa_base; 10892 else 10893 Pb_base = *++regs; 10894 Pn_base = *++regs; 10895 Rlen= *++regs; 10896 inv = *++regs; 10897 Pm_base = *++regs; 10898 10899 // Working registers: 10900 Ra = *++regs; // The current digit of a, b, n, and m. 10901 Rb = *++regs; 10902 Rm = *++regs; 10903 Rn = *++regs; 10904 10905 Pa = *++regs; // Pointers to the current/next digit of a, b, n, and m. 10906 Pb = *++regs; 10907 Pm = *++regs; 10908 Pn = *++regs; 10909 10910 t0 = *++regs; // Three registers which form a 10911 t1 = *++regs; // triple-precision accumuator. 
10912 t2 = *++regs; 10913 10914 Ri = *++regs; // Inner and outer loop indexes. 10915 Rj = *++regs; 10916 10917 Rhi_ab = *++regs; // Product registers: low and high parts 10918 Rlo_ab = *++regs; // of a*b and m*n. 10919 Rhi_mn = *++regs; 10920 Rlo_mn = *++regs; 10921 10922 // r19 and up are callee-saved. 10923 _toSave = RegSet::range(r19, *regs) + Pm_base; 10924 } 10925 10926 private: 10927 void save_regs() { 10928 push(_toSave, sp); 10929 } 10930 10931 void restore_regs() { 10932 pop(_toSave, sp); 10933 } 10934 10935 template <typename T> 10936 void unroll_2(Register count, T block) { 10937 Label loop, end, odd; 10938 tbnz(count, 0, odd); 10939 cbz(count, end); 10940 align(16); 10941 bind(loop); 10942 (this->*block)(); 10943 bind(odd); 10944 (this->*block)(); 10945 subs(count, count, 2); 10946 br(Assembler::GT, loop); 10947 bind(end); 10948 } 10949 10950 template <typename T> 10951 void unroll_2(Register count, T block, Register d, Register s, Register tmp) { 10952 Label loop, end, odd; 10953 tbnz(count, 0, odd); 10954 cbz(count, end); 10955 align(16); 10956 bind(loop); 10957 (this->*block)(d, s, tmp); 10958 bind(odd); 10959 (this->*block)(d, s, tmp); 10960 subs(count, count, 2); 10961 br(Assembler::GT, loop); 10962 bind(end); 10963 } 10964 10965 void pre1(RegisterOrConstant i) { 10966 block_comment("pre1"); 10967 // Pa = Pa_base; 10968 // Pb = Pb_base + i; 10969 // Pm = Pm_base; 10970 // Pn = Pn_base + i; 10971 // Ra = *Pa; 10972 // Rb = *Pb; 10973 // Rm = *Pm; 10974 // Rn = *Pn; 10975 ldr(Ra, Address(Pa_base)); 10976 ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); 10977 ldr(Rm, Address(Pm_base)); 10978 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 10979 lea(Pa, Address(Pa_base)); 10980 lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); 10981 lea(Pm, Address(Pm_base)); 10982 lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 10983 10984 // Zero the m*n result. 10985 mov(Rhi_mn, zr); 10986 mov(Rlo_mn, zr); 10987 } 10988 10989 // The core multiply-accumulate step of a Montgomery 10990 // multiplication. The idea is to schedule operations as a 10991 // pipeline so that instructions with long latencies (loads and 10992 // multiplies) have time to complete before their results are 10993 // used. This most benefits in-order implementations of the 10994 // architecture but out-of-order ones also benefit. 10995 void step() { 10996 block_comment("step"); 10997 // MACC(Ra, Rb, t0, t1, t2); 10998 // Ra = *++Pa; 10999 // Rb = *--Pb; 11000 umulh(Rhi_ab, Ra, Rb); 11001 mul(Rlo_ab, Ra, Rb); 11002 ldr(Ra, pre(Pa, wordSize)); 11003 ldr(Rb, pre(Pb, -wordSize)); 11004 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the 11005 // previous iteration. 
11006 // MACC(Rm, Rn, t0, t1, t2); 11007 // Rm = *++Pm; 11008 // Rn = *--Pn; 11009 umulh(Rhi_mn, Rm, Rn); 11010 mul(Rlo_mn, Rm, Rn); 11011 ldr(Rm, pre(Pm, wordSize)); 11012 ldr(Rn, pre(Pn, -wordSize)); 11013 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 11014 } 11015 11016 void post1() { 11017 block_comment("post1"); 11018 11019 // MACC(Ra, Rb, t0, t1, t2); 11020 // Ra = *++Pa; 11021 // Rb = *--Pb; 11022 umulh(Rhi_ab, Ra, Rb); 11023 mul(Rlo_ab, Ra, Rb); 11024 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 11025 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 11026 11027 // *Pm = Rm = t0 * inv; 11028 mul(Rm, t0, inv); 11029 str(Rm, Address(Pm)); 11030 11031 // MACC(Rm, Rn, t0, t1, t2); 11032 // t0 = t1; t1 = t2; t2 = 0; 11033 umulh(Rhi_mn, Rm, Rn); 11034 11035 #ifndef PRODUCT 11036 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 11037 { 11038 mul(Rlo_mn, Rm, Rn); 11039 add(Rlo_mn, t0, Rlo_mn); 11040 Label ok; 11041 cbz(Rlo_mn, ok); { 11042 stop("broken Montgomery multiply"); 11043 } bind(ok); 11044 } 11045 #endif 11046 // We have very carefully set things up so that 11047 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 11048 // the lower half of Rm * Rn because we know the result already: 11049 // it must be -t0. t0 + (-t0) must generate a carry iff 11050 // t0 != 0. So, rather than do a mul and an adds we just set 11051 // the carry flag iff t0 is nonzero. 11052 // 11053 // mul(Rlo_mn, Rm, Rn); 11054 // adds(zr, t0, Rlo_mn); 11055 subs(zr, t0, 1); // Set carry iff t0 is nonzero 11056 adcs(t0, t1, Rhi_mn); 11057 adc(t1, t2, zr); 11058 mov(t2, zr); 11059 } 11060 11061 void pre2(RegisterOrConstant i, RegisterOrConstant len) { 11062 block_comment("pre2"); 11063 // Pa = Pa_base + i-len; 11064 // Pb = Pb_base + len; 11065 // Pm = Pm_base + i-len; 11066 // Pn = Pn_base + len; 11067 11068 if (i.is_register()) { 11069 sub(Rj, i.as_register(), len); 11070 } else { 11071 mov(Rj, i.as_constant()); 11072 sub(Rj, Rj, len); 11073 } 11074 // Rj == i-len 11075 11076 lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord))); 11077 lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord))); 11078 lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 11079 lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord))); 11080 11081 // Ra = *++Pa; 11082 // Rb = *--Pb; 11083 // Rm = *++Pm; 11084 // Rn = *--Pn; 11085 ldr(Ra, pre(Pa, wordSize)); 11086 ldr(Rb, pre(Pb, -wordSize)); 11087 ldr(Rm, pre(Pm, wordSize)); 11088 ldr(Rn, pre(Pn, -wordSize)); 11089 11090 mov(Rhi_mn, zr); 11091 mov(Rlo_mn, zr); 11092 } 11093 11094 void post2(RegisterOrConstant i, RegisterOrConstant len) { 11095 block_comment("post2"); 11096 if (i.is_constant()) { 11097 mov(Rj, i.as_constant()-len.as_constant()); 11098 } else { 11099 sub(Rj, i.as_register(), len); 11100 } 11101 11102 adds(t0, t0, Rlo_mn); // The pending m*n, low part 11103 11104 // As soon as we know the least significant digit of our result, 11105 // store it. 11106 // Pm_base[i-len] = t0; 11107 str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 11108 11109 // t0 = t1; t1 = t2; t2 = 0; 11110 adcs(t0, t1, Rhi_mn); // The pending m*n, high part 11111 adc(t1, t2, zr); 11112 mov(t2, zr); 11113 } 11114 11115 // A carry in t0 after Montgomery multiplication means that we 11116 // should subtract multiples of n from our result in m. We'll 11117 // keep doing that until there is no carry. 
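  // In C, approximately (an illustrative sketch of normalize() below,
  // using the names from the C outline that follows generate_multiply()):
  //
  //   while (t0) {
  //     julong borrow = 0;
  //     for (int i = 0; i < len; i++) {    // Pm_base -= Pn_base
  //       julong mi = Pm_base[i], ni = Pn_base[i];
  //       Pm_base[i] = mi - ni - borrow;
  //       borrow = (mi < ni) || (borrow && mi == ni);
  //     }
  //     t0 -= borrow;                      // borrow out of the top limb
  //   }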
11118 void normalize(RegisterOrConstant len) { 11119 block_comment("normalize"); 11120 // while (t0) 11121 // t0 = sub(Pm_base, Pn_base, t0, len); 11122 Label loop, post, again; 11123 Register cnt = t1, i = t2; // Re-use registers; we're done with them now 11124 cbz(t0, post); { 11125 bind(again); { 11126 mov(i, zr); 11127 mov(cnt, len); 11128 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 11129 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 11130 subs(zr, zr, zr); // set carry flag, i.e. no borrow 11131 align(16); 11132 bind(loop); { 11133 sbcs(Rm, Rm, Rn); 11134 str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 11135 add(i, i, 1); 11136 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 11137 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 11138 sub(cnt, cnt, 1); 11139 } cbnz(cnt, loop); 11140 sbc(t0, t0, zr); 11141 } cbnz(t0, again); 11142 } bind(post); 11143 } 11144 11145 // Move memory at s to d, reversing words. 11146 // Increments d to end of copied memory 11147 // Destroys tmp1, tmp2 11148 // Preserves len 11149 // Leaves s pointing to the address which was in d at start 11150 void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) { 11151 assert(tmp1->encoding() < r19->encoding(), "register corruption"); 11152 assert(tmp2->encoding() < r19->encoding(), "register corruption"); 11153 11154 lea(s, Address(s, len, Address::uxtw(LogBytesPerWord))); 11155 mov(tmp1, len); 11156 unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2); 11157 sub(s, d, len, ext::uxtw, LogBytesPerWord); 11158 } 11159 // where 11160 void reverse1(Register d, Register s, Register tmp) { 11161 ldr(tmp, pre(s, -wordSize)); 11162 ror(tmp, tmp, 32); 11163 str(tmp, post(d, wordSize)); 11164 } 11165 11166 void step_squaring() { 11167 // An extra ACC 11168 step(); 11169 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 11170 } 11171 11172 void last_squaring(RegisterOrConstant i) { 11173 Label dont; 11174 // if ((i & 1) == 0) { 11175 tbnz(i.as_register(), 0, dont); { 11176 // MACC(Ra, Rb, t0, t1, t2); 11177 // Ra = *++Pa; 11178 // Rb = *--Pb; 11179 umulh(Rhi_ab, Ra, Rb); 11180 mul(Rlo_ab, Ra, Rb); 11181 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 11182 } bind(dont); 11183 } 11184 11185 void extra_step_squaring() { 11186 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 11187 11188 // MACC(Rm, Rn, t0, t1, t2); 11189 // Rm = *++Pm; 11190 // Rn = *--Pn; 11191 umulh(Rhi_mn, Rm, Rn); 11192 mul(Rlo_mn, Rm, Rn); 11193 ldr(Rm, pre(Pm, wordSize)); 11194 ldr(Rn, pre(Pn, -wordSize)); 11195 } 11196 11197 void post1_squaring() { 11198 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 11199 11200 // *Pm = Rm = t0 * inv; 11201 mul(Rm, t0, inv); 11202 str(Rm, Address(Pm)); 11203 11204 // MACC(Rm, Rn, t0, t1, t2); 11205 // t0 = t1; t1 = t2; t2 = 0; 11206 umulh(Rhi_mn, Rm, Rn); 11207 11208 #ifndef PRODUCT 11209 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 11210 { 11211 mul(Rlo_mn, Rm, Rn); 11212 add(Rlo_mn, t0, Rlo_mn); 11213 Label ok; 11214 cbz(Rlo_mn, ok); { 11215 stop("broken Montgomery multiply"); 11216 } bind(ok); 11217 } 11218 #endif 11219 // We have very carefully set things up so that 11220 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 11221 // the lower half of Rm * Rn because we know the result already: 11222 // it must be -t0. t0 + (-t0) must generate a carry iff 11223 // t0 != 0. So, rather than do a mul and an adds we just set 11224 // the carry flag iff t0 is nonzero. 
11225 // 11226 // mul(Rlo_mn, Rm, Rn); 11227 // adds(zr, t0, Rlo_mn); 11228 subs(zr, t0, 1); // Set carry iff t0 is nonzero 11229 adcs(t0, t1, Rhi_mn); 11230 adc(t1, t2, zr); 11231 mov(t2, zr); 11232 } 11233 11234 void acc(Register Rhi, Register Rlo, 11235 Register t0, Register t1, Register t2) { 11236 adds(t0, t0, Rlo); 11237 adcs(t1, t1, Rhi); 11238 adc(t2, t2, zr); 11239 } 11240 11241 public: 11242 /** 11243 * Fast Montgomery multiplication. The derivation of the 11244 * algorithm is in A Cryptographic Library for the Motorola 11245 * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237. 11246 * 11247 * Arguments: 11248 * 11249 * Inputs for multiplication: 11250 * c_rarg0 - int array elements a 11251 * c_rarg1 - int array elements b 11252 * c_rarg2 - int array elements n (the modulus) 11253 * c_rarg3 - int length 11254 * c_rarg4 - int inv 11255 * c_rarg5 - int array elements m (the result) 11256 * 11257 * Inputs for squaring: 11258 * c_rarg0 - int array elements a 11259 * c_rarg1 - int array elements n (the modulus) 11260 * c_rarg2 - int length 11261 * c_rarg3 - int inv 11262 * c_rarg4 - int array elements m (the result) 11263 * 11264 */ 11265 address generate_multiply() { 11266 Label argh, nothing; 11267 bind(argh); 11268 stop("MontgomeryMultiply total_allocation must be <= 8192"); 11269 11270 align(CodeEntryAlignment); 11271 address entry = pc(); 11272 11273 cbzw(Rlen, nothing); 11274 11275 enter(); 11276 11277 // Make room. 11278 cmpw(Rlen, 512); 11279 br(Assembler::HI, argh); 11280 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 11281 andr(sp, Ra, -2 * wordSize); 11282 11283 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 11284 11285 { 11286 // Copy input args, reversing as we go. We use Ra as a 11287 // temporary variable. 11288 reverse(Ra, Pa_base, Rlen, t0, t1); 11289 if (!_squaring) 11290 reverse(Ra, Pb_base, Rlen, t0, t1); 11291 reverse(Ra, Pn_base, Rlen, t0, t1); 11292 } 11293 11294 // Push all call-saved registers and also Pm_base which we'll need 11295 // at the end. 
11296 save_regs(); 11297 11298 #ifndef PRODUCT 11299 // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply"); 11300 { 11301 ldr(Rn, Address(Pn_base, 0)); 11302 mul(Rlo_mn, Rn, inv); 11303 subs(zr, Rlo_mn, -1); 11304 Label ok; 11305 br(EQ, ok); { 11306 stop("broken inverse in Montgomery multiply"); 11307 } bind(ok); 11308 } 11309 #endif 11310 11311 mov(Pm_base, Ra); 11312 11313 mov(t0, zr); 11314 mov(t1, zr); 11315 mov(t2, zr); 11316 11317 block_comment("for (int i = 0; i < len; i++) {"); 11318 mov(Ri, zr); { 11319 Label loop, end; 11320 cmpw(Ri, Rlen); 11321 br(Assembler::GE, end); 11322 11323 bind(loop); 11324 pre1(Ri); 11325 11326 block_comment(" for (j = i; j; j--) {"); { 11327 movw(Rj, Ri); 11328 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 11329 } block_comment(" } // j"); 11330 11331 post1(); 11332 addw(Ri, Ri, 1); 11333 cmpw(Ri, Rlen); 11334 br(Assembler::LT, loop); 11335 bind(end); 11336 block_comment("} // i"); 11337 } 11338 11339 block_comment("for (int i = len; i < 2*len; i++) {"); 11340 mov(Ri, Rlen); { 11341 Label loop, end; 11342 cmpw(Ri, Rlen, Assembler::LSL, 1); 11343 br(Assembler::GE, end); 11344 11345 bind(loop); 11346 pre2(Ri, Rlen); 11347 11348 block_comment(" for (j = len*2-i-1; j; j--) {"); { 11349 lslw(Rj, Rlen, 1); 11350 subw(Rj, Rj, Ri); 11351 subw(Rj, Rj, 1); 11352 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 11353 } block_comment(" } // j"); 11354 11355 post2(Ri, Rlen); 11356 addw(Ri, Ri, 1); 11357 cmpw(Ri, Rlen, Assembler::LSL, 1); 11358 br(Assembler::LT, loop); 11359 bind(end); 11360 } 11361 block_comment("} // i"); 11362 11363 normalize(Rlen); 11364 11365 mov(Ra, Pm_base); // Save Pm_base in Ra 11366 restore_regs(); // Restore caller's Pm_base 11367 11368 // Copy our result into caller's Pm_base 11369 reverse(Pm_base, Ra, Rlen, t0, t1); 11370 11371 leave(); 11372 bind(nothing); 11373 ret(lr); 11374 11375 return entry; 11376 } 11377 // In C, approximately: 11378 11379 // void 11380 // montgomery_multiply(julong Pa_base[], julong Pb_base[], 11381 // julong Pn_base[], julong Pm_base[], 11382 // julong inv, int len) { 11383 // julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 11384 // julong *Pa, *Pb, *Pn, *Pm; 11385 // julong Ra, Rb, Rn, Rm; 11386 11387 // int i; 11388 11389 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply"); 11390 11391 // for (i = 0; i < len; i++) { 11392 // int j; 11393 11394 // Pa = Pa_base; 11395 // Pb = Pb_base + i; 11396 // Pm = Pm_base; 11397 // Pn = Pn_base + i; 11398 11399 // Ra = *Pa; 11400 // Rb = *Pb; 11401 // Rm = *Pm; 11402 // Rn = *Pn; 11403 11404 // int iters = i; 11405 // for (j = 0; iters--; j++) { 11406 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 11407 // MACC(Ra, Rb, t0, t1, t2); 11408 // Ra = *++Pa; 11409 // Rb = *--Pb; 11410 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 11411 // MACC(Rm, Rn, t0, t1, t2); 11412 // Rm = *++Pm; 11413 // Rn = *--Pn; 11414 // } 11415 11416 // assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be"); 11417 // MACC(Ra, Rb, t0, t1, t2); 11418 // *Pm = Rm = t0 * inv; 11419 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be"); 11420 // MACC(Rm, Rn, t0, t1, t2); 11421 11422 // assert(t0 == 0, "broken Montgomery multiply"); 11423 11424 // t0 = t1; t1 = t2; t2 = 0; 11425 // } 11426 11427 // for (i = len; i < 2*len; i++) { 11428 // int j; 11429 11430 // Pa = Pa_base + i-len; 11431 // Pb = Pb_base + len; 11432 // Pm = Pm_base + i-len; 11433 // Pn = Pn_base + len; 11434 11435 // Ra = *++Pa; 11436 // Rb = 
*--Pb; 11437 // Rm = *++Pm; 11438 // Rn = *--Pn; 11439 11440 // int iters = len*2-i-1; 11441 // for (j = i-len+1; iters--; j++) { 11442 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 11443 // MACC(Ra, Rb, t0, t1, t2); 11444 // Ra = *++Pa; 11445 // Rb = *--Pb; 11446 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 11447 // MACC(Rm, Rn, t0, t1, t2); 11448 // Rm = *++Pm; 11449 // Rn = *--Pn; 11450 // } 11451 11452 // Pm_base[i-len] = t0; 11453 // t0 = t1; t1 = t2; t2 = 0; 11454 // } 11455 11456 // while (t0) 11457 // t0 = sub(Pm_base, Pn_base, t0, len); 11458 // } 11459 11460 /** 11461 * Fast Montgomery squaring. This uses asymptotically 25% fewer 11462 * multiplies than Montgomery multiplication so it should be up to 11463 * 25% faster. However, its loop control is more complex and it 11464 * may actually run slower on some machines. 11465 * 11466 * Arguments: 11467 * 11468 * Inputs: 11469 * c_rarg0 - int array elements a 11470 * c_rarg1 - int array elements n (the modulus) 11471 * c_rarg2 - int length 11472 * c_rarg3 - int inv 11473 * c_rarg4 - int array elements m (the result) 11474 * 11475 */ 11476 address generate_square() { 11477 Label argh; 11478 bind(argh); 11479 stop("MontgomeryMultiply total_allocation must be <= 8192"); 11480 11481 align(CodeEntryAlignment); 11482 address entry = pc(); 11483 11484 enter(); 11485 11486 // Make room. 11487 cmpw(Rlen, 512); 11488 br(Assembler::HI, argh); 11489 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 11490 andr(sp, Ra, -2 * wordSize); 11491 11492 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 11493 11494 { 11495 // Copy input args, reversing as we go. We use Ra as a 11496 // temporary variable. 11497 reverse(Ra, Pa_base, Rlen, t0, t1); 11498 reverse(Ra, Pn_base, Rlen, t0, t1); 11499 } 11500 11501 // Push all call-saved registers and also Pm_base which we'll need 11502 // at the end. 
11503 save_regs(); 11504 11505 mov(Pm_base, Ra); 11506 11507 mov(t0, zr); 11508 mov(t1, zr); 11509 mov(t2, zr); 11510 11511 block_comment("for (int i = 0; i < len; i++) {"); 11512 mov(Ri, zr); { 11513 Label loop, end; 11514 bind(loop); 11515 cmp(Ri, Rlen); 11516 br(Assembler::GE, end); 11517 11518 pre1(Ri); 11519 11520 block_comment("for (j = (i+1)/2; j; j--) {"); { 11521 add(Rj, Ri, 1); 11522 lsr(Rj, Rj, 1); 11523 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 11524 } block_comment(" } // j"); 11525 11526 last_squaring(Ri); 11527 11528 block_comment(" for (j = i/2; j; j--) {"); { 11529 lsr(Rj, Ri, 1); 11530 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 11531 } block_comment(" } // j"); 11532 11533 post1_squaring(); 11534 add(Ri, Ri, 1); 11535 cmp(Ri, Rlen); 11536 br(Assembler::LT, loop); 11537 11538 bind(end); 11539 block_comment("} // i"); 11540 } 11541 11542 block_comment("for (int i = len; i < 2*len; i++) {"); 11543 mov(Ri, Rlen); { 11544 Label loop, end; 11545 bind(loop); 11546 cmp(Ri, Rlen, Assembler::LSL, 1); 11547 br(Assembler::GE, end); 11548 11549 pre2(Ri, Rlen); 11550 11551 block_comment(" for (j = (2*len-i-1)/2; j; j--) {"); { 11552 lsl(Rj, Rlen, 1); 11553 sub(Rj, Rj, Ri); 11554 sub(Rj, Rj, 1); 11555 lsr(Rj, Rj, 1); 11556 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 11557 } block_comment(" } // j"); 11558 11559 last_squaring(Ri); 11560 11561 block_comment(" for (j = (2*len-i)/2; j; j--) {"); { 11562 lsl(Rj, Rlen, 1); 11563 sub(Rj, Rj, Ri); 11564 lsr(Rj, Rj, 1); 11565 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 11566 } block_comment(" } // j"); 11567 11568 post2(Ri, Rlen); 11569 add(Ri, Ri, 1); 11570 cmp(Ri, Rlen, Assembler::LSL, 1); 11571 11572 br(Assembler::LT, loop); 11573 bind(end); 11574 block_comment("} // i"); 11575 } 11576 11577 normalize(Rlen); 11578 11579 mov(Ra, Pm_base); // Save Pm_base in Ra 11580 restore_regs(); // Restore caller's Pm_base 11581 11582 // Copy our result into caller's Pm_base 11583 reverse(Pm_base, Ra, Rlen, t0, t1); 11584 11585 leave(); 11586 ret(lr); 11587 11588 return entry; 11589 } 11590 // In C, approximately: 11591 11592 // void 11593 // montgomery_square(julong Pa_base[], julong Pn_base[], 11594 // julong Pm_base[], julong inv, int len) { 11595 // julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 11596 // julong *Pa, *Pb, *Pn, *Pm; 11597 // julong Ra, Rb, Rn, Rm; 11598 11599 // int i; 11600 11601 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply"); 11602 11603 // for (i = 0; i < len; i++) { 11604 // int j; 11605 11606 // Pa = Pa_base; 11607 // Pb = Pa_base + i; 11608 // Pm = Pm_base; 11609 // Pn = Pn_base + i; 11610 11611 // Ra = *Pa; 11612 // Rb = *Pb; 11613 // Rm = *Pm; 11614 // Rn = *Pn; 11615 11616 // int iters = (i+1)/2; 11617 // for (j = 0; iters--; j++) { 11618 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be"); 11619 // MACC2(Ra, Rb, t0, t1, t2); 11620 // Ra = *++Pa; 11621 // Rb = *--Pb; 11622 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 11623 // MACC(Rm, Rn, t0, t1, t2); 11624 // Rm = *++Pm; 11625 // Rn = *--Pn; 11626 // } 11627 // if ((i & 1) == 0) { 11628 // assert(Ra == Pa_base[j], "must be"); 11629 // MACC(Ra, Ra, t0, t1, t2); 11630 // } 11631 // iters = i/2; 11632 // assert(iters == i-j, "must be"); 11633 // for (; iters--; j++) { 11634 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 11635 // MACC(Rm, Rn, t0, t1, t2); 11636 // Rm = *++Pm; 11637 // Rn = *--Pn; 11638 // } 11639 11640 // 
*Pm = Rm = t0 * inv; 11641 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be"); 11642 // MACC(Rm, Rn, t0, t1, t2); 11643 11644 // assert(t0 == 0, "broken Montgomery multiply"); 11645 11646 // t0 = t1; t1 = t2; t2 = 0; 11647 // } 11648 11649 // for (i = len; i < 2*len; i++) { 11650 // int start = i-len+1; 11651 // int end = start + (len - start)/2; 11652 // int j; 11653 11654 // Pa = Pa_base + i-len; 11655 // Pb = Pa_base + len; 11656 // Pm = Pm_base + i-len; 11657 // Pn = Pn_base + len; 11658 11659 // Ra = *++Pa; 11660 // Rb = *--Pb; 11661 // Rm = *++Pm; 11662 // Rn = *--Pn; 11663 11664 // int iters = (2*len-i-1)/2; 11665 // assert(iters == end-start, "must be"); 11666 // for (j = start; iters--; j++) { 11667 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be"); 11668 // MACC2(Ra, Rb, t0, t1, t2); 11669 // Ra = *++Pa; 11670 // Rb = *--Pb; 11671 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 11672 // MACC(Rm, Rn, t0, t1, t2); 11673 // Rm = *++Pm; 11674 // Rn = *--Pn; 11675 // } 11676 // if ((i & 1) == 0) { 11677 // assert(Ra == Pa_base[j], "must be"); 11678 // MACC(Ra, Ra, t0, t1, t2); 11679 // } 11680 // iters = (2*len-i)/2; 11681 // assert(iters == len-j, "must be"); 11682 // for (; iters--; j++) { 11683 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 11684 // MACC(Rm, Rn, t0, t1, t2); 11685 // Rm = *++Pm; 11686 // Rn = *--Pn; 11687 // } 11688 // Pm_base[i-len] = t0; 11689 // t0 = t1; t1 = t2; t2 = 0; 11690 // } 11691 11692 // while (t0) 11693 // t0 = sub(Pm_base, Pn_base, t0, len); 11694 // } 11695 }; 11696 11697 // Call here from the interpreter or compiled code to either load 11698 // multiple returned values from the inline type instance being 11699 // returned to registers or to store returned values to a newly 11700 // allocated inline type instance. 11701 address generate_return_value_stub(address destination, const char* name, bool has_res) { 11702 // We need to save all registers the calling convention may use so 11703 // the runtime calls read or update those registers. This needs to 11704 // be in sync with SharedRuntime::java_return_convention(). 11705 // n.b. 
aarch64 asserts that frame::arg_reg_save_area_bytes == 0 11706 enum layout { 11707 j_rarg7_off = 0, j_rarg7_2, // j_rarg7 is r0 11708 j_rarg6_off, j_rarg6_2, 11709 j_rarg5_off, j_rarg5_2, 11710 j_rarg4_off, j_rarg4_2, 11711 j_rarg3_off, j_rarg3_2, 11712 j_rarg2_off, j_rarg2_2, 11713 j_rarg1_off, j_rarg1_2, 11714 j_rarg0_off, j_rarg0_2, 11715 11716 j_farg7_off, j_farg7_2, 11717 j_farg6_off, j_farg6_2, 11718 j_farg5_off, j_farg5_2, 11719 j_farg4_off, j_farg4_2, 11720 j_farg3_off, j_farg3_2, 11721 j_farg2_off, j_farg2_2, 11722 j_farg1_off, j_farg1_2, 11723 j_farg0_off, j_farg0_2, 11724 11725 rfp_off, rfp_off2, 11726 return_off, return_off2, 11727 11728 framesize // inclusive of return address 11729 }; 11730 11731 CodeBuffer code(name, 512, 64); 11732 MacroAssembler* masm = new MacroAssembler(&code); 11733 11734 int frame_size_in_bytes = align_up(framesize*BytesPerInt, 16); 11735 assert(frame_size_in_bytes == framesize*BytesPerInt, "misaligned"); 11736 int frame_size_in_slots = frame_size_in_bytes / BytesPerInt; 11737 int frame_size_in_words = frame_size_in_bytes / wordSize; 11738 11739 OopMapSet* oop_maps = new OopMapSet(); 11740 OopMap* map = new OopMap(frame_size_in_slots, 0); 11741 11742 map->set_callee_saved(VMRegImpl::stack2reg(j_rarg7_off), j_rarg7->as_VMReg()); 11743 map->set_callee_saved(VMRegImpl::stack2reg(j_rarg6_off), j_rarg6->as_VMReg()); 11744 map->set_callee_saved(VMRegImpl::stack2reg(j_rarg5_off), j_rarg5->as_VMReg()); 11745 map->set_callee_saved(VMRegImpl::stack2reg(j_rarg4_off), j_rarg4->as_VMReg()); 11746 map->set_callee_saved(VMRegImpl::stack2reg(j_rarg3_off), j_rarg3->as_VMReg()); 11747 map->set_callee_saved(VMRegImpl::stack2reg(j_rarg2_off), j_rarg2->as_VMReg()); 11748 map->set_callee_saved(VMRegImpl::stack2reg(j_rarg1_off), j_rarg1->as_VMReg()); 11749 map->set_callee_saved(VMRegImpl::stack2reg(j_rarg0_off), j_rarg0->as_VMReg()); 11750 11751 map->set_callee_saved(VMRegImpl::stack2reg(j_farg0_off), j_farg0->as_VMReg()); 11752 map->set_callee_saved(VMRegImpl::stack2reg(j_farg1_off), j_farg1->as_VMReg()); 11753 map->set_callee_saved(VMRegImpl::stack2reg(j_farg2_off), j_farg2->as_VMReg()); 11754 map->set_callee_saved(VMRegImpl::stack2reg(j_farg3_off), j_farg3->as_VMReg()); 11755 map->set_callee_saved(VMRegImpl::stack2reg(j_farg4_off), j_farg4->as_VMReg()); 11756 map->set_callee_saved(VMRegImpl::stack2reg(j_farg5_off), j_farg5->as_VMReg()); 11757 map->set_callee_saved(VMRegImpl::stack2reg(j_farg6_off), j_farg6->as_VMReg()); 11758 map->set_callee_saved(VMRegImpl::stack2reg(j_farg7_off), j_farg7->as_VMReg()); 11759 11760 address start = __ pc(); 11761 11762 __ enter(); // Save FP and LR before call 11763 11764 __ stpd(j_farg1, j_farg0, Address(__ pre(sp, -2 * wordSize))); 11765 __ stpd(j_farg3, j_farg2, Address(__ pre(sp, -2 * wordSize))); 11766 __ stpd(j_farg5, j_farg4, Address(__ pre(sp, -2 * wordSize))); 11767 __ stpd(j_farg7, j_farg6, Address(__ pre(sp, -2 * wordSize))); 11768 11769 __ stp(j_rarg1, j_rarg0, Address(__ pre(sp, -2 * wordSize))); 11770 __ stp(j_rarg3, j_rarg2, Address(__ pre(sp, -2 * wordSize))); 11771 __ stp(j_rarg5, j_rarg4, Address(__ pre(sp, -2 * wordSize))); 11772 __ stp(j_rarg7, j_rarg6, Address(__ pre(sp, -2 * wordSize))); 11773 11774 int frame_complete = __ offset(); 11775 11776 // Set up last_Java_sp and last_Java_fp 11777 address the_pc = __ pc(); 11778 __ set_last_Java_frame(sp, noreg, the_pc, rscratch1); 11779 11780 // Call runtime 11781 __ mov(c_rarg1, r0); 11782 __ mov(c_rarg0, rthread); 11783 11784 __ mov(rscratch1, destination); 11785 __ 
blr(rscratch1); 11786 11787 oop_maps->add_gc_map(the_pc - start, map); 11788 11789 __ reset_last_Java_frame(false); 11790 11791 __ ldp(j_rarg7, j_rarg6, Address(__ post(sp, 2 * wordSize))); 11792 __ ldp(j_rarg5, j_rarg4, Address(__ post(sp, 2 * wordSize))); 11793 __ ldp(j_rarg3, j_rarg2, Address(__ post(sp, 2 * wordSize))); 11794 __ ldp(j_rarg1, j_rarg0, Address(__ post(sp, 2 * wordSize))); 11795 11796 __ ldpd(j_farg7, j_farg6, Address(__ post(sp, 2 * wordSize))); 11797 __ ldpd(j_farg5, j_farg4, Address(__ post(sp, 2 * wordSize))); 11798 __ ldpd(j_farg3, j_farg2, Address(__ post(sp, 2 * wordSize))); 11799 __ ldpd(j_farg1, j_farg0, Address(__ post(sp, 2 * wordSize))); 11800 11801 __ leave(); 11802 11803 // check for pending exceptions 11804 Label pending; 11805 __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset()))); 11806 __ cbnz(rscratch1, pending); 11807 11808 if (has_res) { 11809 __ get_vm_result_oop(r0, rthread); 11810 } 11811 11812 __ ret(lr); 11813 11814 __ bind(pending); 11815 __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry())); 11816 11817 // ------------- 11818 // make sure all code is generated 11819 masm->flush(); 11820 11821 RuntimeStub* stub = RuntimeStub::new_runtime_stub(name, &code, frame_complete, frame_size_in_words, oop_maps, false); 11822 return stub->entry_point(); 11823 } 11824 11825 // Initialization 11826 void generate_preuniverse_stubs() { 11827 // preuniverse stubs are not needed for aarch64 11828 } 11829 11830 void generate_initial_stubs() { 11831 // Generate initial stubs and initialize the entry points 11832 11833 // Entry points that exist on all platforms. Note: This is code 11834 // that could be shared among different platforms - however the 11835 // benefit seems to be smaller than the disadvantage of having a 11836 // much more complicated generator structure. See also the comment in 11837 // stubRoutines.hpp. 11838 11839 StubRoutines::_forward_exception_entry = generate_forward_exception(); 11840 11841 StubRoutines::_call_stub_entry = 11842 generate_call_stub(StubRoutines::_call_stub_return_address); 11843 11844 // Referenced by megamorphic calls. 11845 StubRoutines::_catch_exception_entry = generate_catch_exception(); 11846 11847 // Initialize table for copy memory (arraycopy) check.
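// (Descriptive note: the table is assumed to record the code ranges inside the
// unsafe copyMemory/setMemory stubs that may fault, so that a fault taken there
// can be routed to the stub's error exit instead of crashing the VM.)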
11848 if (UnsafeMemoryAccess::_table == nullptr) { 11849 UnsafeMemoryAccess::create_table(8 + 4); // 8 for copyMemory; 4 for setMemory 11850 } 11851 11852 if (UseCRC32Intrinsics) { 11853 // set table address before stub generation, which uses it 11854 StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table; 11855 StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32(); 11856 } 11857 11858 if (UseCRC32CIntrinsics) { 11859 StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C(); 11860 } 11861 11862 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) { 11863 StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false); 11864 } 11865 11866 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) { 11867 StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true); 11868 } 11869 11870 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_float16ToFloat) && 11871 vmIntrinsics::is_intrinsic_available(vmIntrinsics::_floatToFloat16)) { 11872 StubRoutines::_hf2f = generate_float16ToFloat(); 11873 StubRoutines::_f2hf = generate_floatToFloat16(); 11874 } 11875 11876 if (InlineTypeReturnedAsFields) { 11877 StubRoutines::_load_inline_type_fields_in_regs = 11878 generate_return_value_stub(CAST_FROM_FN_PTR(address, SharedRuntime::load_inline_type_fields_in_regs), "load_inline_type_fields_in_regs", false); 11879 StubRoutines::_store_inline_type_fields_to_buf = 11880 generate_return_value_stub(CAST_FROM_FN_PTR(address, SharedRuntime::store_inline_type_fields_to_buf), "store_inline_type_fields_to_buf", true); 11881 } 11882 11883 } 11884 11885 void generate_continuation_stubs() { 11886 // Continuation stubs: 11887 StubRoutines::_cont_thaw = generate_cont_thaw(); 11888 StubRoutines::_cont_returnBarrier = generate_cont_returnBarrier(); 11889 StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception(); 11890 StubRoutines::_cont_preempt_stub = generate_cont_preempt_stub(); 11891 } 11892 11893 void generate_final_stubs() { 11894 // support for verify_oop (must happen after universe_init) 11895 if (VerifyOops) { 11896 StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop(); 11897 } 11898 11899 // arraycopy stubs used by compilers 11900 generate_arraycopy_stubs(); 11901 11902 StubRoutines::_method_entry_barrier = generate_method_entry_barrier(); 11903 11904 StubRoutines::aarch64::_spin_wait = generate_spin_wait(); 11905 11906 StubRoutines::_upcall_stub_exception_handler = generate_upcall_stub_exception_handler(); 11907 StubRoutines::_upcall_stub_load_target = generate_upcall_stub_load_target(); 11908 11909 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS) 11910 11911 generate_atomic_entry_points(); 11912 11913 #endif // LINUX 11914 11915 #ifdef COMPILER2 11916 if (UseSecondarySupersTable) { 11917 StubRoutines::_lookup_secondary_supers_table_slow_path_stub = generate_lookup_secondary_supers_table_slow_path_stub(); 11918 if (! InlineSecondarySupersTest) { 11919 generate_lookup_secondary_supers_table_stub(); 11920 } 11921 } 11922 #endif 11923 11924 StubRoutines::_unsafe_setmemory = generate_unsafe_setmemory(); 11925 11926 StubRoutines::aarch64::set_completed(); // Indicate that arraycopy and zero_blocks stubs are generated 11927 } 11928 11929 void generate_compiler_stubs() { 11930 #if COMPILER2_OR_JVMCI 11931 11932 if (UseSVE == 0) { 11933 StubRoutines::aarch64::_vector_iota_indices = generate_iota_indices(StubGenStubId::vector_iota_indices_id); 11934 } 11935 11936 // array equals stub for large arrays.
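// (Generated only when UseSimpleArrayEquals is disabled; otherwise the
// compiler is expected to emit its own inline comparison sequence.)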
if (!UseSimpleArrayEquals) { 11938 StubRoutines::aarch64::_large_array_equals = generate_large_array_equals(); 11939 } 11940 11941 // arrays_hashcode stub for large arrays. 11942 StubRoutines::aarch64::_large_arrays_hashcode_boolean = generate_large_arrays_hashcode(T_BOOLEAN); 11943 StubRoutines::aarch64::_large_arrays_hashcode_byte = generate_large_arrays_hashcode(T_BYTE); 11944 StubRoutines::aarch64::_large_arrays_hashcode_char = generate_large_arrays_hashcode(T_CHAR); 11945 StubRoutines::aarch64::_large_arrays_hashcode_int = generate_large_arrays_hashcode(T_INT); 11946 StubRoutines::aarch64::_large_arrays_hashcode_short = generate_large_arrays_hashcode(T_SHORT); 11947 11948 // byte_array_inflate stub for large arrays. 11949 StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate(); 11950 11951 // countPositives stub for large arrays. 11952 StubRoutines::aarch64::_count_positives = generate_count_positives(StubRoutines::aarch64::_count_positives_long); 11953 11954 generate_compare_long_strings(); 11955 11956 generate_string_indexof_stubs(); 11957 11958 #ifdef COMPILER2 11959 if (UseMultiplyToLenIntrinsic) { 11960 StubRoutines::_multiplyToLen = generate_multiplyToLen(); 11961 } 11962 11963 if (UseSquareToLenIntrinsic) { 11964 StubRoutines::_squareToLen = generate_squareToLen(); 11965 } 11966 11967 if (UseMulAddIntrinsic) { 11968 StubRoutines::_mulAdd = generate_mulAdd(); 11969 } 11970 11971 if (UseSIMDForBigIntegerShiftIntrinsics) { 11972 StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift(); 11973 StubRoutines::_bigIntegerLeftShiftWorker = generate_bigIntegerLeftShift(); 11974 } 11975 11976 if (UseMontgomeryMultiplyIntrinsic) { 11977 StubGenStubId stub_id = StubGenStubId::montgomeryMultiply_id; 11978 StubCodeMark mark(this, stub_id); 11979 MontgomeryMultiplyGenerator g(_masm, /*squaring*/false); 11980 StubRoutines::_montgomeryMultiply = g.generate_multiply(); 11981 } 11982 11983 if (UseMontgomerySquareIntrinsic) { 11984 StubGenStubId stub_id = StubGenStubId::montgomerySquare_id; 11985 StubCodeMark mark(this, stub_id); 11986 MontgomeryMultiplyGenerator g(_masm, /*squaring*/true); 11987 // We use generate_multiply() rather than generate_square() 11988 // because it's faster for the sizes of modulus we care about.
11989 StubRoutines::_montgomerySquare = g.generate_multiply(); 11990 } 11991 11992 #endif // COMPILER2 11993 11994 if (UseChaCha20Intrinsics) { 11995 StubRoutines::_chacha20Block = generate_chacha20Block_blockpar(); 11996 } 11997 11998 if (UseKyberIntrinsics) { 11999 StubRoutines::_kyberNtt = generate_kyberNtt(); 12000 StubRoutines::_kyberInverseNtt = generate_kyberInverseNtt(); 12001 StubRoutines::_kyberNttMult = generate_kyberNttMult(); 12002 StubRoutines::_kyberAddPoly_2 = generate_kyberAddPoly_2(); 12003 StubRoutines::_kyberAddPoly_3 = generate_kyberAddPoly_3(); 12004 StubRoutines::_kyber12To16 = generate_kyber12To16(); 12005 StubRoutines::_kyberBarrettReduce = generate_kyberBarrettReduce(); 12006 } 12007 12008 if (UseDilithiumIntrinsics) { 12009 StubRoutines::_dilithiumAlmostNtt = generate_dilithiumAlmostNtt(); 12010 StubRoutines::_dilithiumAlmostInverseNtt = generate_dilithiumAlmostInverseNtt(); 12011 StubRoutines::_dilithiumNttMult = generate_dilithiumNttMult(); 12012 StubRoutines::_dilithiumMontMulByConstant = generate_dilithiumMontMulByConstant(); 12013 StubRoutines::_dilithiumDecomposePoly = generate_dilithiumDecomposePoly(); 12014 } 12015 12016 if (UseBASE64Intrinsics) { 12017 StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock(); 12018 StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock(); 12019 } 12020 12021 // data cache line writeback 12022 StubRoutines::_data_cache_writeback = generate_data_cache_writeback(); 12023 StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync(); 12024 12025 if (UseAESIntrinsics) { 12026 StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock(); 12027 StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock(); 12028 StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt(); 12029 StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt(); 12030 StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt(); 12031 } 12032 if (UseGHASHIntrinsics) { 12033 // StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks(); 12034 StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks_wide(); 12035 } 12036 if (UseAESIntrinsics && UseGHASHIntrinsics) { 12037 StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt(); 12038 } 12039 12040 if (UseMD5Intrinsics) { 12041 StubRoutines::_md5_implCompress = generate_md5_implCompress(StubGenStubId::md5_implCompress_id); 12042 StubRoutines::_md5_implCompressMB = generate_md5_implCompress(StubGenStubId::md5_implCompressMB_id); 12043 } 12044 if (UseSHA1Intrinsics) { 12045 StubRoutines::_sha1_implCompress = generate_sha1_implCompress(StubGenStubId::sha1_implCompress_id); 12046 StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(StubGenStubId::sha1_implCompressMB_id); 12047 } 12048 if (UseSHA256Intrinsics) { 12049 StubRoutines::_sha256_implCompress = generate_sha256_implCompress(StubGenStubId::sha256_implCompress_id); 12050 StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(StubGenStubId::sha256_implCompressMB_id); 12051 } 12052 if (UseSHA512Intrinsics) { 12053 StubRoutines::_sha512_implCompress = generate_sha512_implCompress(StubGenStubId::sha512_implCompress_id); 12054 StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(StubGenStubId::sha512_implCompressMB_id); 12055 } 12056 if (UseSHA3Intrinsics) { 12057 12058 StubRoutines::_double_keccak = generate_double_keccak(); 12059 
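// Pick between the SIMD-based and the general-purpose-register SHA3
// compress implementations.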
if (UseSIMDForSHA3Intrinsic) { 12060 StubRoutines::_sha3_implCompress = generate_sha3_implCompress(StubGenStubId::sha3_implCompress_id); 12061 StubRoutines::_sha3_implCompressMB = generate_sha3_implCompress(StubGenStubId::sha3_implCompressMB_id); 12062 } else { 12063 StubRoutines::_sha3_implCompress = generate_sha3_implCompress_gpr(StubGenStubId::sha3_implCompress_id); 12064 StubRoutines::_sha3_implCompressMB = generate_sha3_implCompress_gpr(StubGenStubId::sha3_implCompressMB_id); 12065 } 12066 } 12067 12068 if (UsePoly1305Intrinsics) { 12069 StubRoutines::_poly1305_processBlocks = generate_poly1305_processBlocks(); 12070 } 12071 12072 // generate Adler32 intrinsics code 12073 if (UseAdler32Intrinsics) { 12074 StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32(); 12075 } 12076 12077 #endif // COMPILER2_OR_JVMCI 12078 } 12079 12080 public: 12081 StubGenerator(CodeBuffer* code, StubGenBlobId blob_id) : StubCodeGenerator(code, blob_id) { 12082 switch(blob_id) { 12083 case preuniverse_id: 12084 generate_preuniverse_stubs(); 12085 break; 12086 case initial_id: 12087 generate_initial_stubs(); 12088 break; 12089 case continuation_id: 12090 generate_continuation_stubs(); 12091 break; 12092 case compiler_id: 12093 generate_compiler_stubs(); 12094 break; 12095 case final_id: 12096 generate_final_stubs(); 12097 break; 12098 default: 12099 fatal("unexpected blob id: %d", blob_id); 12100 break; 12101 }; 12102 } 12103 }; // end class declaration 12104 12105 void StubGenerator_generate(CodeBuffer* code, StubGenBlobId blob_id) { 12106 StubGenerator g(code, blob_id); 12107 } 12108 12109 12110 #if defined (LINUX) 12111 12112 // Define pointers to atomic stubs and initialize them to point to the 12113 // code in atomic_aarch64.S. 12114 12115 #define DEFAULT_ATOMIC_OP(OPNAME, SIZE, RELAXED) \ 12116 extern "C" uint64_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl \ 12117 (volatile void *ptr, uint64_t arg1, uint64_t arg2); \ 12118 aarch64_atomic_stub_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _impl \ 12119 = aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl; 12120 12121 DEFAULT_ATOMIC_OP(fetch_add, 4, ) 12122 DEFAULT_ATOMIC_OP(fetch_add, 8, ) 12123 DEFAULT_ATOMIC_OP(fetch_add, 4, _relaxed) 12124 DEFAULT_ATOMIC_OP(fetch_add, 8, _relaxed) 12125 DEFAULT_ATOMIC_OP(xchg, 4, ) 12126 DEFAULT_ATOMIC_OP(xchg, 8, ) 12127 DEFAULT_ATOMIC_OP(cmpxchg, 1, ) 12128 DEFAULT_ATOMIC_OP(cmpxchg, 4, ) 12129 DEFAULT_ATOMIC_OP(cmpxchg, 8, ) 12130 DEFAULT_ATOMIC_OP(cmpxchg, 1, _relaxed) 12131 DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed) 12132 DEFAULT_ATOMIC_OP(cmpxchg, 8, _relaxed) 12133 DEFAULT_ATOMIC_OP(cmpxchg, 4, _release) 12134 DEFAULT_ATOMIC_OP(cmpxchg, 8, _release) 12135 DEFAULT_ATOMIC_OP(cmpxchg, 4, _seq_cst) 12136 DEFAULT_ATOMIC_OP(cmpxchg, 8, _seq_cst) 12137 12138 #undef DEFAULT_ATOMIC_OP 12139 12140 #endif // LINUX
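
// A minimal usage sketch for the atomic stub pointers above (illustrative
// only; the real call sites live in the platform Atomic implementation, and
// the argument roles shown are assumptions based on the aarch64_atomic_stub_t
// signature used above, uint64_t (*)(volatile void* ptr, uint64_t arg1, uint64_t arg2)):
//
//   volatile uint32_t cell = 0;
//   // For cmpxchg the compare value is assumed to travel in arg1 and the
//   // new value in arg2, with the value found at ptr returned.
//   uint64_t witnessed = aarch64_atomic_cmpxchg_4_impl(&cell, /*compare*/ 0, /*exchange*/ 42);
//
// On Linux builds without __ARM_FEATURE_ATOMICS, generate_atomic_entry_points()
// (called from generate_final_stubs()) is expected to repoint these _impl
// pointers from the atomic_aarch64.S defaults to freshly generated stubs.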