1 /* 2 * Copyright (c) 2003, 2025, Oracle and/or its affiliates. All rights reserved. 3 * Copyright (c) 2014, 2025, Red Hat Inc. All rights reserved. 4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 5 * 6 * This code is free software; you can redistribute it and/or modify it 7 * under the terms of the GNU General Public License version 2 only, as 8 * published by the Free Software Foundation. 9 * 10 * This code is distributed in the hope that it will be useful, but WITHOUT 11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 13 * version 2 for more details (a copy is included in the LICENSE file that 14 * accompanied this code). 15 * 16 * You should have received a copy of the GNU General Public License version 17 * 2 along with this work; if not, write to the Free Software Foundation, 18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 19 * 20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 21 * or visit www.oracle.com if you need additional information or have any 22 * questions. 23 * 24 */ 25 26 #include "asm/macroAssembler.hpp" 27 #include "asm/macroAssembler.inline.hpp" 28 #include "asm/register.hpp" 29 #include "atomic_aarch64.hpp" 30 #include "compiler/oopMap.hpp" 31 #include "gc/shared/barrierSet.hpp" 32 #include "gc/shared/barrierSetAssembler.hpp" 33 #include "gc/shared/gc_globals.hpp" 34 #include "gc/shared/tlab_globals.hpp" 35 #include "interpreter/interpreter.hpp" 36 #include "memory/universe.hpp" 37 #include "nativeInst_aarch64.hpp" 38 #include "oops/instanceOop.hpp" 39 #include "oops/method.hpp" 40 #include "oops/objArrayKlass.hpp" 41 #include "oops/oop.inline.hpp" 42 #include "prims/methodHandles.hpp" 43 #include "prims/upcallLinker.hpp" 44 #include "runtime/arguments.hpp" 45 #include "runtime/atomic.hpp" 46 #include "runtime/continuation.hpp" 47 #include "runtime/continuationEntry.inline.hpp" 48 #include "runtime/frame.inline.hpp" 49 #include "runtime/handles.inline.hpp" 50 #include "runtime/javaThread.hpp" 51 #include "runtime/sharedRuntime.hpp" 52 #include "runtime/stubCodeGenerator.hpp" 53 #include "runtime/stubRoutines.hpp" 54 #include "utilities/align.hpp" 55 #include "utilities/checkedCast.hpp" 56 #include "utilities/debug.hpp" 57 #include "utilities/globalDefinitions.hpp" 58 #include "utilities/intpow.hpp" 59 #include "utilities/powerOfTwo.hpp" 60 #ifdef COMPILER2 61 #include "opto/runtime.hpp" 62 #endif 63 #if INCLUDE_ZGC 64 #include "gc/z/zThreadLocalData.hpp" 65 #endif 66 67 // Declaration and definition of StubGenerator (no .hpp file). 
68 // For a more detailed description of the stub routine structure 69 // see the comment in stubRoutines.hpp 70 71 #undef __ 72 #define __ _masm-> 73 74 #ifdef PRODUCT 75 #define BLOCK_COMMENT(str) /* nothing */ 76 #else 77 #define BLOCK_COMMENT(str) __ block_comment(str) 78 #endif 79 80 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":") 81 82 // Stub Code definitions 83 84 class StubGenerator: public StubCodeGenerator { 85 private: 86 87 #ifdef PRODUCT 88 #define inc_counter_np(counter) ((void)0) 89 #else 90 void inc_counter_np_(uint& counter) { 91 __ incrementw(ExternalAddress((address)&counter)); 92 } 93 #define inc_counter_np(counter) \ 94 BLOCK_COMMENT("inc_counter " #counter); \ 95 inc_counter_np_(counter); 96 #endif 97 98 // Call stubs are used to call Java from C 99 // 100 // Arguments: 101 // c_rarg0: call wrapper address address 102 // c_rarg1: result address 103 // c_rarg2: result type BasicType 104 // c_rarg3: method Method* 105 // c_rarg4: (interpreter) entry point address 106 // c_rarg5: parameters intptr_t* 107 // c_rarg6: parameter size (in words) int 108 // c_rarg7: thread Thread* 109 // 110 // There is no return from the stub itself as any Java result 111 // is written to result 112 // 113 // we save r30 (lr) as the return PC at the base of the frame and 114 // link r29 (fp) below it as the frame pointer installing sp (r31) 115 // into fp. 116 // 117 // we save r0-r7, which accounts for all the c arguments. 118 // 119 // TODO: strictly do we need to save them all? they are treated as 120 // volatile by C so could we omit saving the ones we are going to 121 // place in global registers (thread? method?) or those we only use 122 // during setup of the Java call? 123 // 124 // we don't need to save r8 which C uses as an indirect result location 125 // return register. 126 // 127 // we don't need to save r9-r15 which both C and Java treat as 128 // volatile 129 // 130 // we don't need to save r16-18 because Java does not use them 131 // 132 // we save r19-r28 which Java uses as scratch registers and C 133 // expects to be callee-save 134 // 135 // we save the bottom 64 bits of each value stored in v8-v15; it is 136 // the responsibility of the caller to preserve larger values. 137 // 138 // so the stub frame looks like this when we enter Java code 139 // 140 // [ return_from_Java ] <--- sp 141 // [ argument word n ] 142 // ... 
143 // -29 [ argument word 1 ] 144 // -28 [ saved Floating-point Control Register ] 145 // -26 [ saved v15 ] <--- sp_after_call 146 // -25 [ saved v14 ] 147 // -24 [ saved v13 ] 148 // -23 [ saved v12 ] 149 // -22 [ saved v11 ] 150 // -21 [ saved v10 ] 151 // -20 [ saved v9 ] 152 // -19 [ saved v8 ] 153 // -18 [ saved r28 ] 154 // -17 [ saved r27 ] 155 // -16 [ saved r26 ] 156 // -15 [ saved r25 ] 157 // -14 [ saved r24 ] 158 // -13 [ saved r23 ] 159 // -12 [ saved r22 ] 160 // -11 [ saved r21 ] 161 // -10 [ saved r20 ] 162 // -9 [ saved r19 ] 163 // -8 [ call wrapper (r0) ] 164 // -7 [ result (r1) ] 165 // -6 [ result type (r2) ] 166 // -5 [ method (r3) ] 167 // -4 [ entry point (r4) ] 168 // -3 [ parameters (r5) ] 169 // -2 [ parameter size (r6) ] 170 // -1 [ thread (r7) ] 171 // 0 [ saved fp (r29) ] <--- fp == saved sp (r31) 172 // 1 [ saved lr (r30) ] 173 174 // Call stub stack layout word offsets from fp 175 enum call_stub_layout { 176 sp_after_call_off = -28, 177 178 fpcr_off = sp_after_call_off, 179 d15_off = -26, 180 d13_off = -24, 181 d11_off = -22, 182 d9_off = -20, 183 184 r28_off = -18, 185 r26_off = -16, 186 r24_off = -14, 187 r22_off = -12, 188 r20_off = -10, 189 call_wrapper_off = -8, 190 result_off = -7, 191 result_type_off = -6, 192 method_off = -5, 193 entry_point_off = -4, 194 parameter_size_off = -2, 195 thread_off = -1, 196 fp_f = 0, 197 retaddr_off = 1, 198 }; 199 200 address generate_call_stub(address& return_address) { 201 assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 && 202 (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off, 203 "adjust this code"); 204 205 StubGenStubId stub_id = StubGenStubId::call_stub_id; 206 StubCodeMark mark(this, stub_id); 207 address start = __ pc(); 208 209 const Address sp_after_call (rfp, sp_after_call_off * wordSize); 210 211 const Address fpcr_save (rfp, fpcr_off * wordSize); 212 const Address call_wrapper (rfp, call_wrapper_off * wordSize); 213 const Address result (rfp, result_off * wordSize); 214 const Address result_type (rfp, result_type_off * wordSize); 215 const Address method (rfp, method_off * wordSize); 216 const Address entry_point (rfp, entry_point_off * wordSize); 217 const Address parameter_size(rfp, parameter_size_off * wordSize); 218 219 const Address thread (rfp, thread_off * wordSize); 220 221 const Address d15_save (rfp, d15_off * wordSize); 222 const Address d13_save (rfp, d13_off * wordSize); 223 const Address d11_save (rfp, d11_off * wordSize); 224 const Address d9_save (rfp, d9_off * wordSize); 225 226 const Address r28_save (rfp, r28_off * wordSize); 227 const Address r26_save (rfp, r26_off * wordSize); 228 const Address r24_save (rfp, r24_off * wordSize); 229 const Address r22_save (rfp, r22_off * wordSize); 230 const Address r20_save (rfp, r20_off * wordSize); 231 232 // stub code 233 234 address aarch64_entry = __ pc(); 235 236 // set up frame and move sp to end of save area 237 __ enter(); 238 __ sub(sp, rfp, -sp_after_call_off * wordSize); 239 240 // save register parameters and Java scratch/global registers 241 // n.b. 
we save thread even though it gets installed in
  // rthread because we want to sanity check rthread later
  __ str(c_rarg7,  thread);
  __ strw(c_rarg6, parameter_size);
  __ stp(c_rarg4, c_rarg5,  entry_point);
  __ stp(c_rarg2, c_rarg3,  result_type);
  __ stp(c_rarg0, c_rarg1,  call_wrapper);

  __ stp(r20, r19,   r20_save);
  __ stp(r22, r21,   r22_save);
  __ stp(r24, r23,   r24_save);
  __ stp(r26, r25,   r26_save);
  __ stp(r28, r27,   r28_save);

  __ stpd(v9,  v8,   d9_save);
  __ stpd(v11, v10,  d11_save);
  __ stpd(v13, v12,  d13_save);
  __ stpd(v15, v14,  d15_save);

  __ get_fpcr(rscratch1);
  __ str(rscratch1, fpcr_save);
  // Set FPCR to the state we need. We do want Round to Nearest. We
  // don't want non-IEEE rounding modes or floating-point traps.
  __ bfi(rscratch1, zr, 22, 4); // Clear DN, FZ, and Rmode
  __ bfi(rscratch1, zr, 8, 5);  // Clear exception-control bits (8-12)
  __ set_fpcr(rscratch1);

  // install Java thread in global register now we have saved
  // whatever value it held
  __ mov(rthread, c_rarg7);
  // And method
  __ mov(rmethod, c_rarg3);

  // set up the heapbase register
  __ reinit_heapbase();

#ifdef ASSERT
  // make sure we have no pending exceptions
  {
    Label L;
    __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
    __ cmp(rscratch1, (u1)NULL_WORD);
    __ br(Assembler::EQ, L);
    __ stop("StubRoutines::call_stub: entered with pending exception");
    __ BIND(L);
  }
#endif
  // pass parameters if any
  __ mov(esp, sp);
  __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
  __ andr(sp, rscratch1, -2 * wordSize);

  BLOCK_COMMENT("pass parameters if any");
  Label parameters_done;
  // parameter count is still in c_rarg6
  // and parameter pointer identifying param 1 is in c_rarg5
  __ cbzw(c_rarg6, parameters_done);

  address loop = __ pc();
  __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
  __ subsw(c_rarg6, c_rarg6, 1);
  __ push(rscratch1);
  __ br(Assembler::GT, loop);

  __ BIND(parameters_done);

  // call Java entry -- passing Method* and current sp
  //      rmethod: Method*
  //      r19_sender_sp: sender sp
  BLOCK_COMMENT("call Java function");
  __ mov(r19_sender_sp, sp);
  __ blr(c_rarg4);

  // we do this here because the notify will already have been done
  // if we get to the next instruction via an exception
  //
  // n.b. adding this instruction here affects the calculation of
  // whether or not a routine returns to the call stub (used when
  // doing stack walks) since the normal test is to check the return
  // pc against the address saved below. so we may need to allow for
  // this extra instruction in the check.

  // save current address for use by exception handling code

  return_address = __ pc();

  // store result depending on type (everything that is not
  // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
  // n.b.
this assumes Java returns an integral result in r0 330 // and a floating result in j_farg0 331 __ ldr(j_rarg2, result); 332 Label is_long, is_float, is_double, exit; 333 __ ldr(j_rarg1, result_type); 334 __ cmp(j_rarg1, (u1)T_OBJECT); 335 __ br(Assembler::EQ, is_long); 336 __ cmp(j_rarg1, (u1)T_LONG); 337 __ br(Assembler::EQ, is_long); 338 __ cmp(j_rarg1, (u1)T_FLOAT); 339 __ br(Assembler::EQ, is_float); 340 __ cmp(j_rarg1, (u1)T_DOUBLE); 341 __ br(Assembler::EQ, is_double); 342 343 // handle T_INT case 344 __ strw(r0, Address(j_rarg2)); 345 346 __ BIND(exit); 347 348 // pop parameters 349 __ sub(esp, rfp, -sp_after_call_off * wordSize); 350 351 #ifdef ASSERT 352 // verify that threads correspond 353 { 354 Label L, S; 355 __ ldr(rscratch1, thread); 356 __ cmp(rthread, rscratch1); 357 __ br(Assembler::NE, S); 358 __ get_thread(rscratch1); 359 __ cmp(rthread, rscratch1); 360 __ br(Assembler::EQ, L); 361 __ BIND(S); 362 __ stop("StubRoutines::call_stub: threads must correspond"); 363 __ BIND(L); 364 } 365 #endif 366 367 __ pop_cont_fastpath(rthread); 368 369 // restore callee-save registers 370 __ ldpd(v15, v14, d15_save); 371 __ ldpd(v13, v12, d13_save); 372 __ ldpd(v11, v10, d11_save); 373 __ ldpd(v9, v8, d9_save); 374 375 __ ldp(r28, r27, r28_save); 376 __ ldp(r26, r25, r26_save); 377 __ ldp(r24, r23, r24_save); 378 __ ldp(r22, r21, r22_save); 379 __ ldp(r20, r19, r20_save); 380 381 // restore fpcr 382 __ ldr(rscratch1, fpcr_save); 383 __ set_fpcr(rscratch1); 384 385 __ ldp(c_rarg0, c_rarg1, call_wrapper); 386 __ ldrw(c_rarg2, result_type); 387 __ ldr(c_rarg3, method); 388 __ ldp(c_rarg4, c_rarg5, entry_point); 389 __ ldp(c_rarg6, c_rarg7, parameter_size); 390 391 // leave frame and return to caller 392 __ leave(); 393 __ ret(lr); 394 395 // handle return types different from T_INT 396 397 __ BIND(is_long); 398 __ str(r0, Address(j_rarg2, 0)); 399 __ br(Assembler::AL, exit); 400 401 __ BIND(is_float); 402 __ strs(j_farg0, Address(j_rarg2, 0)); 403 __ br(Assembler::AL, exit); 404 405 __ BIND(is_double); 406 __ strd(j_farg0, Address(j_rarg2, 0)); 407 __ br(Assembler::AL, exit); 408 409 return start; 410 } 411 412 // Return point for a Java call if there's an exception thrown in 413 // Java code. The exception is caught and transformed into a 414 // pending exception stored in JavaThread that can be tested from 415 // within the VM. 416 // 417 // Note: Usually the parameters are removed by the callee. In case 418 // of an exception crossing an activation frame boundary, that is 419 // not the case if the callee is compiled code => need to setup the 420 // rsp. 
421 // 422 // r0: exception oop 423 424 address generate_catch_exception() { 425 StubGenStubId stub_id = StubGenStubId::catch_exception_id; 426 StubCodeMark mark(this, stub_id); 427 address start = __ pc(); 428 429 // same as in generate_call_stub(): 430 const Address sp_after_call(rfp, sp_after_call_off * wordSize); 431 const Address thread (rfp, thread_off * wordSize); 432 433 #ifdef ASSERT 434 // verify that threads correspond 435 { 436 Label L, S; 437 __ ldr(rscratch1, thread); 438 __ cmp(rthread, rscratch1); 439 __ br(Assembler::NE, S); 440 __ get_thread(rscratch1); 441 __ cmp(rthread, rscratch1); 442 __ br(Assembler::EQ, L); 443 __ bind(S); 444 __ stop("StubRoutines::catch_exception: threads must correspond"); 445 __ bind(L); 446 } 447 #endif 448 449 // set pending exception 450 __ verify_oop(r0); 451 452 __ str(r0, Address(rthread, Thread::pending_exception_offset())); 453 __ mov(rscratch1, (address)__FILE__); 454 __ str(rscratch1, Address(rthread, Thread::exception_file_offset())); 455 __ movw(rscratch1, (int)__LINE__); 456 __ strw(rscratch1, Address(rthread, Thread::exception_line_offset())); 457 458 // complete return to VM 459 assert(StubRoutines::_call_stub_return_address != nullptr, 460 "_call_stub_return_address must have been generated before"); 461 __ b(StubRoutines::_call_stub_return_address); 462 463 return start; 464 } 465 466 // Continuation point for runtime calls returning with a pending 467 // exception. The pending exception check happened in the runtime 468 // or native call stub. The pending exception in Thread is 469 // converted into a Java-level exception. 470 // 471 // Contract with Java-level exception handlers: 472 // r0: exception 473 // r3: throwing pc 474 // 475 // NOTE: At entry of this stub, exception-pc must be in LR !! 476 477 // NOTE: this is always used as a jump target within generated code 478 // so it just needs to be generated code with no x86 prolog 479 480 address generate_forward_exception() { 481 StubGenStubId stub_id = StubGenStubId::forward_exception_id; 482 StubCodeMark mark(this, stub_id); 483 address start = __ pc(); 484 485 // Upon entry, LR points to the return address returning into 486 // Java (interpreted or compiled) code; i.e., the return address 487 // becomes the throwing pc. 488 // 489 // Arguments pushed before the runtime call are still on the stack 490 // but the exception handler will reset the stack pointer -> 491 // ignore them. A potential result in registers can be ignored as 492 // well. 493 494 #ifdef ASSERT 495 // make sure this code is only executed if there is a pending exception 496 { 497 Label L; 498 __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset())); 499 __ cbnz(rscratch1, L); 500 __ stop("StubRoutines::forward exception: no pending exception (1)"); 501 __ bind(L); 502 } 503 #endif 504 505 // compute exception handler into r19 506 507 // call the VM to find the handler address associated with the 508 // caller address. pass thread in r0 and caller pc (ret address) 509 // in r1. n.b. the caller pc is in lr, unlike x86 where it is on 510 // the stack. 511 __ mov(c_rarg1, lr); 512 // lr will be trashed by the VM call so we move it to R19 513 // (callee-saved) because we also need to pass it to the handler 514 // returned by this call. 
515 __ mov(r19, lr); 516 BLOCK_COMMENT("call exception_handler_for_return_address"); 517 __ call_VM_leaf(CAST_FROM_FN_PTR(address, 518 SharedRuntime::exception_handler_for_return_address), 519 rthread, c_rarg1); 520 // Reinitialize the ptrue predicate register, in case the external runtime 521 // call clobbers ptrue reg, as we may return to SVE compiled code. 522 __ reinitialize_ptrue(); 523 524 // we should not really care that lr is no longer the callee 525 // address. we saved the value the handler needs in r19 so we can 526 // just copy it to r3. however, the C2 handler will push its own 527 // frame and then calls into the VM and the VM code asserts that 528 // the PC for the frame above the handler belongs to a compiled 529 // Java method. So, we restore lr here to satisfy that assert. 530 __ mov(lr, r19); 531 // setup r0 & r3 & clear pending exception 532 __ mov(r3, r19); 533 __ mov(r19, r0); 534 __ ldr(r0, Address(rthread, Thread::pending_exception_offset())); 535 __ str(zr, Address(rthread, Thread::pending_exception_offset())); 536 537 #ifdef ASSERT 538 // make sure exception is set 539 { 540 Label L; 541 __ cbnz(r0, L); 542 __ stop("StubRoutines::forward exception: no pending exception (2)"); 543 __ bind(L); 544 } 545 #endif 546 547 // continue at exception handler 548 // r0: exception 549 // r3: throwing pc 550 // r19: exception handler 551 __ verify_oop(r0); 552 __ br(r19); 553 554 return start; 555 } 556 557 // Non-destructive plausibility checks for oops 558 // 559 // Arguments: 560 // r0: oop to verify 561 // rscratch1: error message 562 // 563 // Stack after saving c_rarg3: 564 // [tos + 0]: saved c_rarg3 565 // [tos + 1]: saved c_rarg2 566 // [tos + 2]: saved lr 567 // [tos + 3]: saved rscratch2 568 // [tos + 4]: saved r0 569 // [tos + 5]: saved rscratch1 570 address generate_verify_oop() { 571 StubGenStubId stub_id = StubGenStubId::verify_oop_id; 572 StubCodeMark mark(this, stub_id); 573 address start = __ pc(); 574 575 Label exit, error; 576 577 // save c_rarg2 and c_rarg3 578 __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16))); 579 580 // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr())); 581 __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr())); 582 __ ldr(c_rarg3, Address(c_rarg2)); 583 __ add(c_rarg3, c_rarg3, 1); 584 __ str(c_rarg3, Address(c_rarg2)); 585 586 // object is in r0 587 // make sure object is 'reasonable' 588 __ cbz(r0, exit); // if obj is null it is OK 589 590 BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler(); 591 bs_asm->check_oop(_masm, r0, c_rarg2, c_rarg3, error); 592 593 // return if everything seems ok 594 __ bind(exit); 595 596 __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16))); 597 __ ret(lr); 598 599 // handle errors 600 __ bind(error); 601 __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16))); 602 603 __ push(RegSet::range(r0, r29), sp); 604 // debug(char* msg, int64_t pc, int64_t regs[]) 605 __ mov(c_rarg0, rscratch1); // pass address of error message 606 __ mov(c_rarg1, lr); // pass return address 607 __ mov(c_rarg2, sp); // pass address of regs on stack 608 #ifndef PRODUCT 609 assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area"); 610 #endif 611 BLOCK_COMMENT("call MacroAssembler::debug"); 612 __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64)); 613 __ blr(rscratch1); 614 __ hlt(0); 615 616 return start; 617 } 618 619 // Generate indices for iota vector. 
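  // Each block of the table below packs the lane indices 0, 1, 2, ...
  // for one element size in little-endian order, so a single vector
  // load of the right width yields an "iota" vector. As an
  // illustrative reading of the data: the two B words give bytes
  // 0x00..0x0F, the H words give halfwords 0x0000..0x0007, the S words
  // give 0..3 and the D words give 0..1; the trailing entries repeat
  // the S and D sequences as float (0.0f..3.0f) and double (0.0d,
  // 1.0d) constants.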
620 address generate_iota_indices(StubGenStubId stub_id) { 621 __ align(CodeEntryAlignment); 622 StubCodeMark mark(this, stub_id); 623 address start = __ pc(); 624 // B 625 __ emit_data64(0x0706050403020100, relocInfo::none); 626 __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none); 627 // H 628 __ emit_data64(0x0003000200010000, relocInfo::none); 629 __ emit_data64(0x0007000600050004, relocInfo::none); 630 // S 631 __ emit_data64(0x0000000100000000, relocInfo::none); 632 __ emit_data64(0x0000000300000002, relocInfo::none); 633 // D 634 __ emit_data64(0x0000000000000000, relocInfo::none); 635 __ emit_data64(0x0000000000000001, relocInfo::none); 636 // S - FP 637 __ emit_data64(0x3F80000000000000, relocInfo::none); // 0.0f, 1.0f 638 __ emit_data64(0x4040000040000000, relocInfo::none); // 2.0f, 3.0f 639 // D - FP 640 __ emit_data64(0x0000000000000000, relocInfo::none); // 0.0d 641 __ emit_data64(0x3FF0000000000000, relocInfo::none); // 1.0d 642 return start; 643 } 644 645 // The inner part of zero_words(). This is the bulk operation, 646 // zeroing words in blocks, possibly using DC ZVA to do it. The 647 // caller is responsible for zeroing the last few words. 648 // 649 // Inputs: 650 // r10: the HeapWord-aligned base address of an array to zero. 651 // r11: the count in HeapWords, r11 > 0. 652 // 653 // Returns r10 and r11, adjusted for the caller to clear. 654 // r10: the base address of the tail of words left to clear. 655 // r11: the number of words in the tail. 656 // r11 < MacroAssembler::zero_words_block_size. 657 658 address generate_zero_blocks() { 659 Label done; 660 Label base_aligned; 661 662 Register base = r10, cnt = r11; 663 664 __ align(CodeEntryAlignment); 665 StubGenStubId stub_id = StubGenStubId::zero_blocks_id; 666 StubCodeMark mark(this, stub_id); 667 address start = __ pc(); 668 669 if (UseBlockZeroing) { 670 int zva_length = VM_Version::zva_length(); 671 672 // Ensure ZVA length can be divided by 16. This is required by 673 // the subsequent operations. 674 assert (zva_length % 16 == 0, "Unexpected ZVA Length"); 675 676 __ tbz(base, 3, base_aligned); 677 __ str(zr, Address(__ post(base, 8))); 678 __ sub(cnt, cnt, 1); 679 __ bind(base_aligned); 680 681 // Ensure count >= zva_length * 2 so that it still deserves a zva after 682 // alignment. 683 Label small; 684 int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit); 685 __ subs(rscratch1, cnt, low_limit >> 3); 686 __ br(Assembler::LT, small); 687 __ zero_dcache_blocks(base, cnt); 688 __ bind(small); 689 } 690 691 { 692 // Number of stp instructions we'll unroll 693 const int unroll = 694 MacroAssembler::zero_words_block_size / 2; 695 // Clear the remaining blocks. 696 Label loop; 697 __ subs(cnt, cnt, unroll * 2); 698 __ br(Assembler::LT, done); 699 __ bind(loop); 700 for (int i = 0; i < unroll; i++) 701 __ stp(zr, zr, __ post(base, 16)); 702 __ subs(cnt, cnt, unroll * 2); 703 __ br(Assembler::GE, loop); 704 __ bind(done); 705 __ add(cnt, cnt, unroll * 2); 706 } 707 708 __ ret(lr); 709 710 return start; 711 } 712 713 714 typedef enum { 715 copy_forwards = 1, 716 copy_backwards = -1 717 } copy_direction; 718 719 // Helper object to reduce noise when telling the GC barriers how to perform loads and stores 720 // for arraycopy stubs. 
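  // For illustration: with this helper a call site such as
  //   bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
  // stands for a BarrierSetAssembler::copy_load_at of 16 bytes into
  // the register pair (t0, t1) using the temporaries fixed once in the
  // constructor, so the copy loops below read almost like plain
  // ldp/stp sequences (the exact expansion depends on the active
  // barrier set).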
721 class ArrayCopyBarrierSetHelper : StackObj { 722 BarrierSetAssembler* _bs_asm; 723 MacroAssembler* _masm; 724 DecoratorSet _decorators; 725 BasicType _type; 726 Register _gct1; 727 Register _gct2; 728 Register _gct3; 729 FloatRegister _gcvt1; 730 FloatRegister _gcvt2; 731 FloatRegister _gcvt3; 732 733 public: 734 ArrayCopyBarrierSetHelper(MacroAssembler* masm, 735 DecoratorSet decorators, 736 BasicType type, 737 Register gct1, 738 Register gct2, 739 Register gct3, 740 FloatRegister gcvt1, 741 FloatRegister gcvt2, 742 FloatRegister gcvt3) 743 : _bs_asm(BarrierSet::barrier_set()->barrier_set_assembler()), 744 _masm(masm), 745 _decorators(decorators), 746 _type(type), 747 _gct1(gct1), 748 _gct2(gct2), 749 _gct3(gct3), 750 _gcvt1(gcvt1), 751 _gcvt2(gcvt2), 752 _gcvt3(gcvt3) { 753 } 754 755 void copy_load_at_32(FloatRegister dst1, FloatRegister dst2, Address src) { 756 _bs_asm->copy_load_at(_masm, _decorators, _type, 32, 757 dst1, dst2, src, 758 _gct1, _gct2, _gcvt1); 759 } 760 761 void copy_store_at_32(Address dst, FloatRegister src1, FloatRegister src2) { 762 _bs_asm->copy_store_at(_masm, _decorators, _type, 32, 763 dst, src1, src2, 764 _gct1, _gct2, _gct3, _gcvt1, _gcvt2, _gcvt3); 765 } 766 767 void copy_load_at_16(Register dst1, Register dst2, Address src) { 768 _bs_asm->copy_load_at(_masm, _decorators, _type, 16, 769 dst1, dst2, src, 770 _gct1); 771 } 772 773 void copy_store_at_16(Address dst, Register src1, Register src2) { 774 _bs_asm->copy_store_at(_masm, _decorators, _type, 16, 775 dst, src1, src2, 776 _gct1, _gct2, _gct3); 777 } 778 779 void copy_load_at_8(Register dst, Address src) { 780 _bs_asm->copy_load_at(_masm, _decorators, _type, 8, 781 dst, noreg, src, 782 _gct1); 783 } 784 785 void copy_store_at_8(Address dst, Register src) { 786 _bs_asm->copy_store_at(_masm, _decorators, _type, 8, 787 dst, src, noreg, 788 _gct1, _gct2, _gct3); 789 } 790 }; 791 792 // Bulk copy of blocks of 8 words. 793 // 794 // count is a count of words. 795 // 796 // Precondition: count >= 8 797 // 798 // Postconditions: 799 // 800 // The least significant bit of count contains the remaining count 801 // of words to copy. The rest of count is trash. 802 // 803 // s and d are adjusted to point to the remaining words to copy 804 // 805 void generate_copy_longs(StubGenStubId stub_id, DecoratorSet decorators, Label &start, Register s, Register d, Register count) { 806 BasicType type; 807 copy_direction direction; 808 809 switch (stub_id) { 810 case copy_byte_f_id: 811 direction = copy_forwards; 812 type = T_BYTE; 813 break; 814 case copy_byte_b_id: 815 direction = copy_backwards; 816 type = T_BYTE; 817 break; 818 case copy_oop_f_id: 819 direction = copy_forwards; 820 type = T_OBJECT; 821 break; 822 case copy_oop_b_id: 823 direction = copy_backwards; 824 type = T_OBJECT; 825 break; 826 case copy_oop_uninit_f_id: 827 direction = copy_forwards; 828 type = T_OBJECT; 829 break; 830 case copy_oop_uninit_b_id: 831 direction = copy_backwards; 832 type = T_OBJECT; 833 break; 834 default: 835 ShouldNotReachHere(); 836 } 837 838 int unit = wordSize * direction; 839 int bias = (UseSIMDForMemoryOps ? 
4:2) * wordSize;

    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
      t4 = r7, t5 = r11, t6 = r12, t7 = r13;
    const Register stride = r14;
    const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
    const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
    ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);

    assert_different_registers(rscratch1, rscratch2, t0, t1, t2, t3, t4, t5, t6, t7);
    assert_different_registers(s, d, count, rscratch1, rscratch2);

    Label again, drain;

    __ align(CodeEntryAlignment);

    StubCodeMark mark(this, stub_id);

    __ bind(start);

    Label unaligned_copy_long;
    if (AvoidUnalignedAccesses) {
      __ tbnz(d, 3, unaligned_copy_long);
    }

    if (direction == copy_forwards) {
      __ sub(s, s, bias);
      __ sub(d, d, bias);
    }

#ifdef ASSERT
    // Make sure we are never given < 8 words
    {
      Label L;
      __ cmp(count, (u1)8);
      __ br(Assembler::GE, L);
      __ stop("generate_copy_longs called with < 8 words");
      __ bind(L);
    }
#endif

    // Fill 8 registers
    if (UseSIMDForMemoryOps) {
      bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
      bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
      bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
      bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
      bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 16);
    __ br(Assembler::LO, drain);

    int prefetch = PrefetchCopyIntervalInBytes;
    bool use_stride = false;
    if (direction == copy_backwards) {
      use_stride = prefetch > 256;
      prefetch = -prefetch;
      if (use_stride) __ mov(stride, prefetch);
    }

    __ bind(again);

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
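    // At this point the eight registers (or v0..v3 in the SIMD case)
    // still hold the block loaded just above or on the previous
    // iteration. Each pass below stores that block and immediately
    // reloads the next one, so the loads run one iteration ahead of
    // the stores (a simple software pipeline); the pre-indexed forms
    // advance s and d by 8 words per pass and count drops by 8.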
    if (UseSIMDForMemoryOps) {
      bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
      bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
      bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
      bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
      bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
      bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
      bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
      bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
      bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
      bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
      bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 8);
    __ br(Assembler::HS, again);

    // Drain
    __ bind(drain);
    if (UseSIMDForMemoryOps) {
      bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
      bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
    } else {
      bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
      bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
      bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
      bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
    }

    {
      Label L1, L2;
      __ tbz(count, exact_log2(4), L1);
      if (UseSIMDForMemoryOps) {
        bs.copy_load_at_32(v0, v1, Address(__ pre(s, 4 * unit)));
        bs.copy_store_at_32(Address(__ pre(d, 4 * unit)), v0, v1);
      } else {
        bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
        bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
        bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
        bs.copy_store_at_16(Address(__ pre(d, 4 * unit)), t2, t3);
      }
      __ bind(L1);

      if (direction == copy_forwards) {
        __ add(s, s, bias);
        __ add(d, d, bias);
      }

      __ tbz(count, 1, L2);
      bs.copy_load_at_16(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
      bs.copy_store_at_16(Address(__ adjust(d, 2 * unit, direction == copy_backwards)), t0, t1);
      __ bind(L2);
    }

    __ ret(lr);

    if (AvoidUnalignedAccesses) {
      Label drain, again;
      // Register order for storing. Order is different for backward copy.

      __ bind(unaligned_copy_long);

      // source address is even aligned, target odd aligned
      //
      // when forward copying word pairs we read long pairs at offsets
      // {0, 2, 4, 6} (in long words). when backwards copying we read
      // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
      // address by -2 in the forwards case so we can compute the
      // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
      // or -1.
      //
      // when forward copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
      // zero offset we adjust the destination by -1 which means we
      // have to use offsets { 1, 2, 4, 6, 8} * unit for the stores.
      //
      // When backwards copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
      // offsets {1, 3, 5, 7, 8} * unit.
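      //
      // Illustrative forwards example in bytes: after the two
      // subtractions below s' = s - 16 and d' = d - 8, so the initial
      // 8-register fill reads the original bytes s .. s+63 and the
      // first pass of stores covers the original bytes d .. d+63;
      // both pointers then advance by 64 bytes per iteration.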
      if (direction == copy_forwards) {
        __ sub(s, s, 16);
        __ sub(d, d, 8);
      }

      // Fill 8 registers
      //
      // for forwards copy s was offset by -16 from the original input
      // value of s so the register contents are at these offsets
      // relative to the 64 bit block addressed by that original input
      // and so on for each successive 64 byte block when s is updated
      //
      // t0 at offset 0,  t1 at offset 8
      // t2 at offset 16, t3 at offset 24
      // t4 at offset 32, t5 at offset 40
      // t6 at offset 48, t7 at offset 56

      // for backwards copy s was not offset so the register contents
      // are at these offsets into the preceding 64 byte block
      // relative to that original input and so on for each successive
      // preceding 64 byte block when s is updated. this explains the
      // slightly counter-intuitive looking pattern of register usage
      // in the stp instructions for backwards copy.
      //
      // t0 at offset -16, t1 at offset -8
      // t2 at offset -32, t3 at offset -24
      // t4 at offset -48, t5 at offset -40
      // t6 at offset -64, t7 at offset -56

      bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
      bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
      bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
      bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));

      __ subs(count, count, 16);
      __ br(Assembler::LO, drain);

      int prefetch = PrefetchCopyIntervalInBytes;
      bool use_stride = false;
      if (direction == copy_backwards) {
        use_stride = prefetch > 256;
        prefetch = -prefetch;
        if (use_stride) __ mov(stride, prefetch);
      }

      __ bind(again);

      if (PrefetchCopyIntervalInBytes > 0)
        __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
      if (direction == copy_forwards) {
        // allowing for the offset of -8 the store instructions place
        // registers into the target 64 bit block at the following
        // offsets
        //
        // t0 at offset 0
        // t1 at offset 8,  t2 at offset 16
        // t3 at offset 24, t4 at offset 32
        // t5 at offset 40, t6 at offset 48
        // t7 at offset 56

        bs.copy_store_at_8(Address(d, 1 * unit), t0);
        bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
        bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
        bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
        bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
        bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
        bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
        bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
        bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
      } else {
        // d was not offset when we started so the registers are
        // written into the 64 bit block preceding d with the following
        // offsets
        //
        // t1 at offset -8
        // t3 at offset -24, t0 at offset -16
        // t5 at offset -40, t2 at offset -32
        // t7 at offset -56, t4 at offset -48
        // t6 at offset -64
        //
        // note that this matches the offsets previously noted for the
        // loads

        bs.copy_store_at_8(Address(d, 1 * unit), t1);
        bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
        bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
        bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
        bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
        bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
        bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
        bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
        bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
      }

      __ subs(count, count, 8);
      __ br(Assembler::HS, again);

      // Drain
      //
      // this uses the same pattern of offsets and register arguments
      // as above
      __ bind(drain);
      if (direction == copy_forwards) {
        bs.copy_store_at_8(Address(d, 1 * unit), t0);
        bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
        bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
        bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
        bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
      } else {
        bs.copy_store_at_8(Address(d, 1 * unit), t1);
        bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
        bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
        bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
        bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
      }
      // now we need to copy any remaining part block which may
      // include a 4 word block subblock and/or a 2 word subblock.
      // bits 2 and 1 in the count are the tell-tale for whether we
      // have each such subblock
      {
        Label L1, L2;
        __ tbz(count, exact_log2(4), L1);
        // this is the same as above but copying only 4 longs hence
        // with only one intervening stp between the str instructions
        // but note that the offsets and registers still follow the
        // same pattern
        bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
        bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
        if (direction == copy_forwards) {
          bs.copy_store_at_8(Address(d, 1 * unit), t0);
          bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
          bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t3);
        } else {
          bs.copy_store_at_8(Address(d, 1 * unit), t1);
          bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
          bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t2);
        }
        __ bind(L1);

        __ tbz(count, 1, L2);
        // this is the same as above but copying only 2 longs hence
        // there is no intervening stp between the str instructions
        // but note that the offset and register patterns are still
        // the same
        bs.copy_load_at_16(t0, t1, Address(__ pre(s, 2 * unit)));
        if (direction == copy_forwards) {
          bs.copy_store_at_8(Address(d, 1 * unit), t0);
          bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t1);
        } else {
          bs.copy_store_at_8(Address(d, 1 * unit), t1);
          bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t0);
        }
        __ bind(L2);

        // for forwards copy we need to re-adjust the offsets we
        // applied so that s and d follow the last words written

        if (direction == copy_forwards) {
          __ add(s, s, 16);
          __ add(d, d, 8);
        }

      }

      __ ret(lr);
    }
  }

  // Small copy: less than 16 bytes.
  //
  // NB: Ignores all of the bits of count which represent more than 15
  // bytes, so a caller doesn't have to mask them.

  void copy_memory_small(DecoratorSet decorators, BasicType type, Register s, Register d, Register count, int step) {
    bool is_backwards = step < 0;
    size_t granularity = g_uabs(step);
    int direction = is_backwards ? -1 : 1;

    Label Lword, Lint, Lshort, Lbyte;

    assert(granularity
           && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");

    const Register t0 = r3;
    const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
    ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, fnoreg, fnoreg, fnoreg);

    // ??? I don't know if this bit-test-and-branch is the right thing
    // to do. It does a lot of jumping, resulting in several
    // mispredicted branches. It might make more sense to do this
    // with something like Duff's device with a single computed branch.
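    // Worked example (illustrative): for a byte copy (granularity == 1)
    // with count == 13 == 0b1101, the tests below move 8 bytes (bit 3
    // set), then 4 bytes (bit 2 set), skip the 2-byte move (bit 1
    // clear) and finish with a single byte (bit 0 set). For wider
    // elements the same bits are tested at correspondingly lower
    // positions, e.g. bit (3 - log2(granularity)) selects the 8-byte
    // move.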
1181 1182 __ tbz(count, 3 - exact_log2(granularity), Lword); 1183 bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards))); 1184 bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0); 1185 __ bind(Lword); 1186 1187 if (granularity <= sizeof (jint)) { 1188 __ tbz(count, 2 - exact_log2(granularity), Lint); 1189 __ ldrw(t0, Address(__ adjust(s, sizeof (jint) * direction, is_backwards))); 1190 __ strw(t0, Address(__ adjust(d, sizeof (jint) * direction, is_backwards))); 1191 __ bind(Lint); 1192 } 1193 1194 if (granularity <= sizeof (jshort)) { 1195 __ tbz(count, 1 - exact_log2(granularity), Lshort); 1196 __ ldrh(t0, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards))); 1197 __ strh(t0, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards))); 1198 __ bind(Lshort); 1199 } 1200 1201 if (granularity <= sizeof (jbyte)) { 1202 __ tbz(count, 0, Lbyte); 1203 __ ldrb(t0, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards))); 1204 __ strb(t0, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards))); 1205 __ bind(Lbyte); 1206 } 1207 } 1208 1209 Label copy_f, copy_b; 1210 Label copy_obj_f, copy_obj_b; 1211 Label copy_obj_uninit_f, copy_obj_uninit_b; 1212 1213 // All-singing all-dancing memory copy. 1214 // 1215 // Copy count units of memory from s to d. The size of a unit is 1216 // step, which can be positive or negative depending on the direction 1217 // of copy. If is_aligned is false, we align the source address. 1218 // 1219 1220 void copy_memory(DecoratorSet decorators, BasicType type, bool is_aligned, 1221 Register s, Register d, Register count, int step) { 1222 copy_direction direction = step < 0 ? copy_backwards : copy_forwards; 1223 bool is_backwards = step < 0; 1224 unsigned int granularity = g_uabs(step); 1225 const Register t0 = r3, t1 = r4; 1226 1227 // <= 80 (or 96 for SIMD) bytes do inline. Direction doesn't matter because we always 1228 // load all the data before writing anything 1229 Label copy4, copy8, copy16, copy32, copy80, copy_big, finish; 1230 const Register t2 = r5, t3 = r6, t4 = r7, t5 = r11; 1231 const Register t6 = r12, t7 = r13, t8 = r14, t9 = r15; 1232 const Register send = r17, dend = r16; 1233 const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10; 1234 const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved 1235 ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3); 1236 1237 if (PrefetchCopyIntervalInBytes > 0) 1238 __ prfm(Address(s, 0), PLDL1KEEP); 1239 __ cmp(count, u1((UseSIMDForMemoryOps ? 
96:80)/granularity));
    __ br(Assembler::HI, copy_big);

    __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
    __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));

    __ cmp(count, u1(16/granularity));
    __ br(Assembler::LS, copy16);

    __ cmp(count, u1(64/granularity));
    __ br(Assembler::HI, copy80);

    __ cmp(count, u1(32/granularity));
    __ br(Assembler::LS, copy32);

    // 33..64 bytes
    if (UseSIMDForMemoryOps) {
      bs.copy_load_at_32(v0, v1, Address(s, 0));
      bs.copy_load_at_32(v2, v3, Address(send, -32));
      bs.copy_store_at_32(Address(d, 0), v0, v1);
      bs.copy_store_at_32(Address(dend, -32), v2, v3);
    } else {
      bs.copy_load_at_16(t0, t1, Address(s, 0));
      bs.copy_load_at_16(t2, t3, Address(s, 16));
      bs.copy_load_at_16(t4, t5, Address(send, -32));
      bs.copy_load_at_16(t6, t7, Address(send, -16));

      bs.copy_store_at_16(Address(d, 0), t0, t1);
      bs.copy_store_at_16(Address(d, 16), t2, t3);
      bs.copy_store_at_16(Address(dend, -32), t4, t5);
      bs.copy_store_at_16(Address(dend, -16), t6, t7);
    }
    __ b(finish);

    // 17..32 bytes
    __ bind(copy32);
    bs.copy_load_at_16(t0, t1, Address(s, 0));
    bs.copy_load_at_16(t6, t7, Address(send, -16));

    bs.copy_store_at_16(Address(d, 0), t0, t1);
    bs.copy_store_at_16(Address(dend, -16), t6, t7);
    __ b(finish);

    // 65..80/96 bytes
    // (96 bytes if SIMD because we do 32 bytes per instruction)
    __ bind(copy80);
    if (UseSIMDForMemoryOps) {
      bs.copy_load_at_32(v0, v1, Address(s, 0));
      bs.copy_load_at_32(v2, v3, Address(s, 32));
      // Unaligned pointers can be an issue for copying.
      // The issue has more chances to happen when granularity of data is
      // less than 4(sizeof(jint)). Pointers for arrays of jint are at least
      // 4 byte aligned. Pointers for arrays of jlong are 8 byte aligned.
      // The most performance drop has been seen for the range 65-80 bytes.
      // For such cases using the pair of ldp/stp instead of the third pair of
      // ldpq/stpq fixes the performance issue.
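      // For example (illustrative): a 72-byte jbyte copy stays on this
      // path (two 32-byte stores plus a 16-byte ldp/stp tail ending at
      // dend - 16), while an 88-byte copy branches to copy96 below and
      // instead uses a third ldpq/stpq pair anchored at send - 32 /
      // dend - 32, overlapping part of what the middle stores wrote.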
1295 if (granularity < sizeof (jint)) { 1296 Label copy96; 1297 __ cmp(count, u1(80/granularity)); 1298 __ br(Assembler::HI, copy96); 1299 bs.copy_load_at_16(t0, t1, Address(send, -16)); 1300 1301 bs.copy_store_at_32(Address(d, 0), v0, v1); 1302 bs.copy_store_at_32(Address(d, 32), v2, v3); 1303 1304 bs.copy_store_at_16(Address(dend, -16), t0, t1); 1305 __ b(finish); 1306 1307 __ bind(copy96); 1308 } 1309 bs.copy_load_at_32(v4, v5, Address(send, -32)); 1310 1311 bs.copy_store_at_32(Address(d, 0), v0, v1); 1312 bs.copy_store_at_32(Address(d, 32), v2, v3); 1313 1314 bs.copy_store_at_32(Address(dend, -32), v4, v5); 1315 } else { 1316 bs.copy_load_at_16(t0, t1, Address(s, 0)); 1317 bs.copy_load_at_16(t2, t3, Address(s, 16)); 1318 bs.copy_load_at_16(t4, t5, Address(s, 32)); 1319 bs.copy_load_at_16(t6, t7, Address(s, 48)); 1320 bs.copy_load_at_16(t8, t9, Address(send, -16)); 1321 1322 bs.copy_store_at_16(Address(d, 0), t0, t1); 1323 bs.copy_store_at_16(Address(d, 16), t2, t3); 1324 bs.copy_store_at_16(Address(d, 32), t4, t5); 1325 bs.copy_store_at_16(Address(d, 48), t6, t7); 1326 bs.copy_store_at_16(Address(dend, -16), t8, t9); 1327 } 1328 __ b(finish); 1329 1330 // 0..16 bytes 1331 __ bind(copy16); 1332 __ cmp(count, u1(8/granularity)); 1333 __ br(Assembler::LO, copy8); 1334 1335 // 8..16 bytes 1336 bs.copy_load_at_8(t0, Address(s, 0)); 1337 bs.copy_load_at_8(t1, Address(send, -8)); 1338 bs.copy_store_at_8(Address(d, 0), t0); 1339 bs.copy_store_at_8(Address(dend, -8), t1); 1340 __ b(finish); 1341 1342 if (granularity < 8) { 1343 // 4..7 bytes 1344 __ bind(copy8); 1345 __ tbz(count, 2 - exact_log2(granularity), copy4); 1346 __ ldrw(t0, Address(s, 0)); 1347 __ ldrw(t1, Address(send, -4)); 1348 __ strw(t0, Address(d, 0)); 1349 __ strw(t1, Address(dend, -4)); 1350 __ b(finish); 1351 if (granularity < 4) { 1352 // 0..3 bytes 1353 __ bind(copy4); 1354 __ cbz(count, finish); // get rid of 0 case 1355 if (granularity == 2) { 1356 __ ldrh(t0, Address(s, 0)); 1357 __ strh(t0, Address(d, 0)); 1358 } else { // granularity == 1 1359 // Now 1..3 bytes. Handle the 1 and 2 byte case by copying 1360 // the first and last byte. 1361 // Handle the 3 byte case by loading and storing base + count/2 1362 // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1)) 1363 // This does means in the 1 byte case we load/store the same 1364 // byte 3 times. 1365 __ lsr(count, count, 1); 1366 __ ldrb(t0, Address(s, 0)); 1367 __ ldrb(t1, Address(send, -1)); 1368 __ ldrb(t2, Address(s, count)); 1369 __ strb(t0, Address(d, 0)); 1370 __ strb(t1, Address(dend, -1)); 1371 __ strb(t2, Address(d, count)); 1372 } 1373 __ b(finish); 1374 } 1375 } 1376 1377 __ bind(copy_big); 1378 if (is_backwards) { 1379 __ lea(s, Address(s, count, Address::lsl(exact_log2(-step)))); 1380 __ lea(d, Address(d, count, Address::lsl(exact_log2(-step)))); 1381 } 1382 1383 // Now we've got the small case out of the way we can align the 1384 // source address on a 2-word boundary. 1385 1386 // Here we will materialize a count in r15, which is used by copy_memory_small 1387 // and the various generate_copy_longs stubs that we use for 2 word aligned bytes. 1388 // Up until here, we have used t9, which aliases r15, but from here on, that register 1389 // can not be used as a temp register, as it contains the count. 1390 1391 Label aligned; 1392 1393 if (is_aligned) { 1394 // We may have to adjust by 1 word to get s 2-word-aligned. 
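      // For example (illustrative): with byte elements (granularity 1)
      // and s % 16 == 8, the tbz below falls through, one word is
      // copied here and count drops by wordSize/granularity == 8,
      // leaving s 2-word-aligned for the bulk loop.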
1395 __ tbz(s, exact_log2(wordSize), aligned); 1396 bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards))); 1397 bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0); 1398 __ sub(count, count, wordSize/granularity); 1399 } else { 1400 if (is_backwards) { 1401 __ andr(r15, s, 2 * wordSize - 1); 1402 } else { 1403 __ neg(r15, s); 1404 __ andr(r15, r15, 2 * wordSize - 1); 1405 } 1406 // r15 is the byte adjustment needed to align s. 1407 __ cbz(r15, aligned); 1408 int shift = exact_log2(granularity); 1409 if (shift > 0) { 1410 __ lsr(r15, r15, shift); 1411 } 1412 __ sub(count, count, r15); 1413 1414 #if 0 1415 // ?? This code is only correct for a disjoint copy. It may or 1416 // may not make sense to use it in that case. 1417 1418 // Copy the first pair; s and d may not be aligned. 1419 __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0)); 1420 __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0)); 1421 1422 // Align s and d, adjust count 1423 if (is_backwards) { 1424 __ sub(s, s, r15); 1425 __ sub(d, d, r15); 1426 } else { 1427 __ add(s, s, r15); 1428 __ add(d, d, r15); 1429 } 1430 #else 1431 copy_memory_small(decorators, type, s, d, r15, step); 1432 #endif 1433 } 1434 1435 __ bind(aligned); 1436 1437 // s is now 2-word-aligned. 1438 1439 // We have a count of units and some trailing bytes. Adjust the 1440 // count and do a bulk copy of words. If the shift is zero 1441 // perform a move instead to benefit from zero latency moves. 1442 int shift = exact_log2(wordSize/granularity); 1443 if (shift > 0) { 1444 __ lsr(r15, count, shift); 1445 } else { 1446 __ mov(r15, count); 1447 } 1448 if (direction == copy_forwards) { 1449 if (type != T_OBJECT) { 1450 __ bl(copy_f); 1451 } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) { 1452 __ bl(copy_obj_uninit_f); 1453 } else { 1454 __ bl(copy_obj_f); 1455 } 1456 } else { 1457 if (type != T_OBJECT) { 1458 __ bl(copy_b); 1459 } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) { 1460 __ bl(copy_obj_uninit_b); 1461 } else { 1462 __ bl(copy_obj_b); 1463 } 1464 } 1465 1466 // And the tail. 1467 copy_memory_small(decorators, type, s, d, count, step); 1468 1469 if (granularity >= 8) __ bind(copy8); 1470 if (granularity >= 4) __ bind(copy4); 1471 __ bind(finish); 1472 } 1473 1474 1475 void clobber_registers() { 1476 #ifdef ASSERT 1477 RegSet clobbered 1478 = MacroAssembler::call_clobbered_gp_registers() - rscratch1; 1479 __ mov(rscratch1, (uint64_t)0xdeadbeef); 1480 __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32); 1481 for (RegSetIterator<Register> it = clobbered.begin(); *it != noreg; ++it) { 1482 __ mov(*it, rscratch1); 1483 } 1484 #endif 1485 1486 } 1487 1488 // Scan over array at a for count oops, verifying each one. 1489 // Preserves a and count, clobbers rscratch1 and rscratch2. 
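  // For example (illustrative): the copy stubs below pass size ==
  // sizeof(jint) when UseCompressedOops is set, so each narrow oop is
  // loaded with ldrw and decoded (decode_heap_oop also verifies it);
  // otherwise size == wordSize and the oop is loaded with ldr and
  // verified directly.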
1490 void verify_oop_array (int size, Register a, Register count, Register temp) { 1491 Label loop, end; 1492 __ mov(rscratch1, a); 1493 __ mov(rscratch2, zr); 1494 __ bind(loop); 1495 __ cmp(rscratch2, count); 1496 __ br(Assembler::HS, end); 1497 if (size == wordSize) { 1498 __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size)))); 1499 __ verify_oop(temp); 1500 } else { 1501 __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size)))); 1502 __ decode_heap_oop(temp); // calls verify_oop 1503 } 1504 __ add(rscratch2, rscratch2, 1); 1505 __ b(loop); 1506 __ bind(end); 1507 } 1508 1509 // Arguments: 1510 // stub_id - is used to name the stub and identify all details of 1511 // how to perform the copy. 1512 // 1513 // entry - is assigned to the stub's post push entry point unless 1514 // it is null 1515 // 1516 // Inputs: 1517 // c_rarg0 - source array address 1518 // c_rarg1 - destination array address 1519 // c_rarg2 - element count, treated as ssize_t, can be zero 1520 // 1521 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1522 // the hardware handle it. The two dwords within qwords that span 1523 // cache line boundaries will still be loaded and stored atomically. 1524 // 1525 // Side Effects: entry is set to the (post push) entry point so it 1526 // can be used by the corresponding conjoint copy 1527 // method 1528 // 1529 address generate_disjoint_copy(StubGenStubId stub_id, address *entry) { 1530 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 1531 RegSet saved_reg = RegSet::of(s, d, count); 1532 int size; 1533 bool aligned; 1534 bool is_oop; 1535 bool dest_uninitialized; 1536 switch (stub_id) { 1537 case jbyte_disjoint_arraycopy_id: 1538 size = sizeof(jbyte); 1539 aligned = false; 1540 is_oop = false; 1541 dest_uninitialized = false; 1542 break; 1543 case arrayof_jbyte_disjoint_arraycopy_id: 1544 size = sizeof(jbyte); 1545 aligned = true; 1546 is_oop = false; 1547 dest_uninitialized = false; 1548 break; 1549 case jshort_disjoint_arraycopy_id: 1550 size = sizeof(jshort); 1551 aligned = false; 1552 is_oop = false; 1553 dest_uninitialized = false; 1554 break; 1555 case arrayof_jshort_disjoint_arraycopy_id: 1556 size = sizeof(jshort); 1557 aligned = true; 1558 is_oop = false; 1559 dest_uninitialized = false; 1560 break; 1561 case jint_disjoint_arraycopy_id: 1562 size = sizeof(jint); 1563 aligned = false; 1564 is_oop = false; 1565 dest_uninitialized = false; 1566 break; 1567 case arrayof_jint_disjoint_arraycopy_id: 1568 size = sizeof(jint); 1569 aligned = true; 1570 is_oop = false; 1571 dest_uninitialized = false; 1572 break; 1573 case jlong_disjoint_arraycopy_id: 1574 // since this is always aligned we can (should!) use the same 1575 // stub as for case arrayof_jlong_disjoint_arraycopy 1576 ShouldNotReachHere(); 1577 break; 1578 case arrayof_jlong_disjoint_arraycopy_id: 1579 size = sizeof(jlong); 1580 aligned = true; 1581 is_oop = false; 1582 dest_uninitialized = false; 1583 break; 1584 case oop_disjoint_arraycopy_id: 1585 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1586 aligned = !UseCompressedOops; 1587 is_oop = true; 1588 dest_uninitialized = false; 1589 break; 1590 case arrayof_oop_disjoint_arraycopy_id: 1591 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1592 aligned = !UseCompressedOops; 1593 is_oop = true; 1594 dest_uninitialized = false; 1595 break; 1596 case oop_disjoint_arraycopy_uninit_id: 1597 size = UseCompressedOops ? 
sizeof (jint) : sizeof (jlong); 1598 aligned = !UseCompressedOops; 1599 is_oop = true; 1600 dest_uninitialized = true; 1601 break; 1602 case arrayof_oop_disjoint_arraycopy_uninit_id: 1603 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1604 aligned = !UseCompressedOops; 1605 is_oop = true; 1606 dest_uninitialized = true; 1607 break; 1608 default: 1609 ShouldNotReachHere(); 1610 break; 1611 } 1612 1613 __ align(CodeEntryAlignment); 1614 StubCodeMark mark(this, stub_id); 1615 address start = __ pc(); 1616 __ enter(); 1617 1618 if (entry != nullptr) { 1619 *entry = __ pc(); 1620 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 1621 BLOCK_COMMENT("Entry:"); 1622 } 1623 1624 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT; 1625 if (dest_uninitialized) { 1626 decorators |= IS_DEST_UNINITIALIZED; 1627 } 1628 if (aligned) { 1629 decorators |= ARRAYCOPY_ALIGNED; 1630 } 1631 1632 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1633 bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg); 1634 1635 if (is_oop) { 1636 // save regs before copy_memory 1637 __ push(RegSet::of(d, count), sp); 1638 } 1639 { 1640 // UnsafeMemoryAccess page error: continue after unsafe access 1641 bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size); 1642 UnsafeMemoryAccessMark umam(this, add_entry, true); 1643 copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, size); 1644 } 1645 1646 if (is_oop) { 1647 __ pop(RegSet::of(d, count), sp); 1648 if (VerifyOops) 1649 verify_oop_array(size, d, count, r16); 1650 } 1651 1652 bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet()); 1653 1654 __ leave(); 1655 __ mov(r0, zr); // return 0 1656 __ ret(lr); 1657 return start; 1658 } 1659 1660 // Arguments: 1661 // stub_id - is used to name the stub and identify all details of 1662 // how to perform the copy. 1663 // 1664 // nooverlap_target - identifes the (post push) entry for the 1665 // corresponding disjoint copy routine which can be 1666 // jumped to if the ranges do not actually overlap 1667 // 1668 // entry - is assigned to the stub's post push entry point unless 1669 // it is null 1670 // 1671 // 1672 // Inputs: 1673 // c_rarg0 - source array address 1674 // c_rarg1 - destination array address 1675 // c_rarg2 - element count, treated as ssize_t, can be zero 1676 // 1677 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1678 // the hardware handle it. The two dwords within qwords that span 1679 // cache line boundaries will still be loaded and stored atomically. 
1680 // 1681 // Side Effects: 1682 // entry is set to the no-overlap entry point so it can be used by 1683 // some other conjoint copy method 1684 // 1685 address generate_conjoint_copy(StubGenStubId stub_id, address nooverlap_target, address *entry) { 1686 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 1687 RegSet saved_regs = RegSet::of(s, d, count); 1688 int size; 1689 bool aligned; 1690 bool is_oop; 1691 bool dest_uninitialized; 1692 switch (stub_id) { 1693 case jbyte_arraycopy_id: 1694 size = sizeof(jbyte); 1695 aligned = false; 1696 is_oop = false; 1697 dest_uninitialized = false; 1698 break; 1699 case arrayof_jbyte_arraycopy_id: 1700 size = sizeof(jbyte); 1701 aligned = true; 1702 is_oop = false; 1703 dest_uninitialized = false; 1704 break; 1705 case jshort_arraycopy_id: 1706 size = sizeof(jshort); 1707 aligned = false; 1708 is_oop = false; 1709 dest_uninitialized = false; 1710 break; 1711 case arrayof_jshort_arraycopy_id: 1712 size = sizeof(jshort); 1713 aligned = true; 1714 is_oop = false; 1715 dest_uninitialized = false; 1716 break; 1717 case jint_arraycopy_id: 1718 size = sizeof(jint); 1719 aligned = false; 1720 is_oop = false; 1721 dest_uninitialized = false; 1722 break; 1723 case arrayof_jint_arraycopy_id: 1724 size = sizeof(jint); 1725 aligned = true; 1726 is_oop = false; 1727 dest_uninitialized = false; 1728 break; 1729 case jlong_arraycopy_id: 1730 // since this is always aligned we can (should!) use the same 1731 // stub as for case arrayof_jlong_disjoint_arraycopy 1732 ShouldNotReachHere(); 1733 break; 1734 case arrayof_jlong_arraycopy_id: 1735 size = sizeof(jlong); 1736 aligned = true; 1737 is_oop = false; 1738 dest_uninitialized = false; 1739 break; 1740 case oop_arraycopy_id: 1741 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1742 aligned = !UseCompressedOops; 1743 is_oop = true; 1744 dest_uninitialized = false; 1745 break; 1746 case arrayof_oop_arraycopy_id: 1747 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1748 aligned = !UseCompressedOops; 1749 is_oop = true; 1750 dest_uninitialized = false; 1751 break; 1752 case oop_arraycopy_uninit_id: 1753 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1754 aligned = !UseCompressedOops; 1755 is_oop = true; 1756 dest_uninitialized = true; 1757 break; 1758 case arrayof_oop_arraycopy_uninit_id: 1759 size = UseCompressedOops ? 
sizeof (jint) : sizeof (jlong); 1760 aligned = !UseCompressedOops; 1761 is_oop = true; 1762 dest_uninitialized = true; 1763 break; 1764 default: 1765 ShouldNotReachHere(); 1766 } 1767 1768 StubCodeMark mark(this, stub_id); 1769 address start = __ pc(); 1770 __ enter(); 1771 1772 if (entry != nullptr) { 1773 *entry = __ pc(); 1774 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 1775 BLOCK_COMMENT("Entry:"); 1776 } 1777 1778 // use fwd copy when (d-s) above_equal (count*size) 1779 __ sub(rscratch1, d, s); 1780 __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size)); 1781 __ br(Assembler::HS, nooverlap_target); 1782 1783 DecoratorSet decorators = IN_HEAP | IS_ARRAY; 1784 if (dest_uninitialized) { 1785 decorators |= IS_DEST_UNINITIALIZED; 1786 } 1787 if (aligned) { 1788 decorators |= ARRAYCOPY_ALIGNED; 1789 } 1790 1791 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1792 bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs); 1793 1794 if (is_oop) { 1795 // save regs before copy_memory 1796 __ push(RegSet::of(d, count), sp); 1797 } 1798 { 1799 // UnsafeMemoryAccess page error: continue after unsafe access 1800 bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size); 1801 UnsafeMemoryAccessMark umam(this, add_entry, true); 1802 copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, -size); 1803 } 1804 if (is_oop) { 1805 __ pop(RegSet::of(d, count), sp); 1806 if (VerifyOops) 1807 verify_oop_array(size, d, count, r16); 1808 } 1809 bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet()); 1810 __ leave(); 1811 __ mov(r0, zr); // return 0 1812 __ ret(lr); 1813 return start; 1814 } 1815 1816 // Helper for generating a dynamic type check. 1817 // Smashes rscratch1, rscratch2. 1818 void generate_type_check(Register sub_klass, 1819 Register super_check_offset, 1820 Register super_klass, 1821 Register temp1, 1822 Register temp2, 1823 Register result, 1824 Label& L_success) { 1825 assert_different_registers(sub_klass, super_check_offset, super_klass); 1826 1827 BLOCK_COMMENT("type_check:"); 1828 1829 Label L_miss; 1830 1831 __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, nullptr, 1832 super_check_offset); 1833 __ check_klass_subtype_slow_path(sub_klass, super_klass, temp1, temp2, &L_success, nullptr); 1834 1835 // Fall through on failure! 
1836 __ BIND(L_miss); 1837 } 1838 1839 // 1840 // Generate checkcasting array copy stub 1841 // 1842 // Input: 1843 // c_rarg0 - source array address 1844 // c_rarg1 - destination array address 1845 // c_rarg2 - element count, treated as ssize_t, can be zero 1846 // c_rarg3 - size_t ckoff (super_check_offset) 1847 // c_rarg4 - oop ckval (super_klass) 1848 // 1849 // Output: 1850 // r0 == 0 - success 1851 // r0 == -1^K - failure, where K is partial transfer count 1852 // 1853 address generate_checkcast_copy(StubGenStubId stub_id, address *entry) { 1854 bool dest_uninitialized; 1855 switch (stub_id) { 1856 case checkcast_arraycopy_id: 1857 dest_uninitialized = false; 1858 break; 1859 case checkcast_arraycopy_uninit_id: 1860 dest_uninitialized = true; 1861 break; 1862 default: 1863 ShouldNotReachHere(); 1864 } 1865 1866 Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop; 1867 1868 // Input registers (after setup_arg_regs) 1869 const Register from = c_rarg0; // source array address 1870 const Register to = c_rarg1; // destination array address 1871 const Register count = c_rarg2; // elements count 1872 const Register ckoff = c_rarg3; // super_check_offset 1873 const Register ckval = c_rarg4; // super_klass 1874 1875 RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4); 1876 RegSet wb_post_saved_regs = RegSet::of(count); 1877 1878 // Registers used as temps (r19, r20, r21, r22 are save-on-entry) 1879 const Register copied_oop = r22; // actual oop copied 1880 const Register count_save = r21; // orig elements count 1881 const Register start_to = r20; // destination array start address 1882 const Register r19_klass = r19; // oop._klass 1883 1884 // Registers used as gc temps (r5, r6, r7 are save-on-call) 1885 const Register gct1 = r5, gct2 = r6, gct3 = r7; 1886 1887 //--------------------------------------------------------------- 1888 // Assembler stub will be used for this call to arraycopy 1889 // if the two arrays are subtypes of Object[] but the 1890 // destination array type is not equal to or a supertype 1891 // of the source type. Each element must be separately 1892 // checked. 1893 1894 assert_different_registers(from, to, count, ckoff, ckval, start_to, 1895 copied_oop, r19_klass, count_save); 1896 1897 __ align(CodeEntryAlignment); 1898 StubCodeMark mark(this, stub_id); 1899 address start = __ pc(); 1900 1901 __ enter(); // required for proper stackwalking of RuntimeStub frame 1902 1903 #ifdef ASSERT 1904 // caller guarantees that the arrays really are different 1905 // otherwise, we would have to make conjoint checks 1906 { Label L; 1907 __ b(L); // conjoint check not yet implemented 1908 __ stop("checkcast_copy within a single array"); 1909 __ bind(L); 1910 } 1911 #endif //ASSERT 1912 1913 // Caller of this entry point must set up the argument registers. 1914 if (entry != nullptr) { 1915 *entry = __ pc(); 1916 BLOCK_COMMENT("Entry:"); 1917 } 1918 1919 // Empty array: Nothing to do. 1920 __ cbz(count, L_done); 1921 __ push(RegSet::of(r19, r20, r21, r22), sp); 1922 1923 #ifdef ASSERT 1924 BLOCK_COMMENT("assert consistent ckoff/ckval"); 1925 // The ckoff and ckval must be mutually consistent, 1926 // even though caller generates both.
1927 { Label L; 1928 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 1929 __ ldrw(start_to, Address(ckval, sco_offset)); 1930 __ cmpw(ckoff, start_to); 1931 __ br(Assembler::EQ, L); 1932 __ stop("super_check_offset inconsistent"); 1933 __ bind(L); 1934 } 1935 #endif //ASSERT 1936 1937 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT; 1938 bool is_oop = true; 1939 int element_size = UseCompressedOops ? 4 : 8; 1940 if (dest_uninitialized) { 1941 decorators |= IS_DEST_UNINITIALIZED; 1942 } 1943 1944 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1945 bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs); 1946 1947 // save the original count 1948 __ mov(count_save, count); 1949 1950 // Copy from low to high addresses 1951 __ mov(start_to, to); // Save destination array start address 1952 __ b(L_load_element); 1953 1954 // ======== begin loop ======== 1955 // (Loop is rotated; its entry is L_load_element.) 1956 // Loop control: 1957 // for (; count != 0; count--) { 1958 // copied_oop = load_heap_oop(from++); 1959 // ... generate_type_check ...; 1960 // store_heap_oop(to++, copied_oop); 1961 // } 1962 __ align(OptoLoopAlignment); 1963 1964 __ BIND(L_store_element); 1965 bs->copy_store_at(_masm, decorators, T_OBJECT, element_size, 1966 __ post(to, element_size), copied_oop, noreg, 1967 gct1, gct2, gct3); 1968 __ sub(count, count, 1); 1969 __ cbz(count, L_do_card_marks); 1970 1971 // ======== loop entry is here ======== 1972 __ BIND(L_load_element); 1973 bs->copy_load_at(_masm, decorators, T_OBJECT, element_size, 1974 copied_oop, noreg, __ post(from, element_size), 1975 gct1); 1976 __ cbz(copied_oop, L_store_element); 1977 1978 __ load_klass(r19_klass, copied_oop);// query the object klass 1979 1980 BLOCK_COMMENT("type_check:"); 1981 generate_type_check(/*sub_klass*/r19_klass, 1982 /*super_check_offset*/ckoff, 1983 /*super_klass*/ckval, 1984 /*r_array_base*/gct1, 1985 /*temp2*/gct2, 1986 /*result*/r10, L_store_element); 1987 1988 // Fall through on failure! 1989 1990 // ======== end loop ======== 1991 1992 // It was a real error; we must depend on the caller to finish the job. 1993 // Register count = remaining oops, count_orig = total oops. 1994 // Emit GC store barriers for the oops we have copied and report 1995 // their number to the caller. 1996 1997 __ subs(count, count_save, count); // K = partially copied oop count 1998 __ eon(count, count, zr); // report (-1^K) to caller 1999 __ br(Assembler::EQ, L_done_pop); 2000 2001 __ BIND(L_do_card_marks); 2002 bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs); 2003 2004 __ bind(L_done_pop); 2005 __ pop(RegSet::of(r19, r20, r21, r22), sp); 2006 inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr); 2007 2008 __ bind(L_done); 2009 __ mov(r0, count); 2010 __ leave(); 2011 __ ret(lr); 2012 2013 return start; 2014 } 2015 2016 // Perform range checks on the proposed arraycopy. 2017 // Kills temp, but nothing else. 2018 // Also, clean the sign bits of src_pos and dst_pos. 
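// Illustrative sketch of the checks emitted here (pseudocode, not generated
// code; 'length(a)' stands for the arrayOop length field):
//
//   if ((uint32_t)(src_pos + length) > (uint32_t)length(src)) goto L_failed;
//   if ((uint32_t)(dst_pos + length) > (uint32_t)length(dst)) goto L_failed;
//   src_pos = (uint32_t)src_pos;   // the 32-bit register moves at the end
//   dst_pos = (uint32_t)dst_pos;   // zero-extend the positions for addressing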
2019 void arraycopy_range_checks(Register src, // source array oop (c_rarg0) 2020 Register src_pos, // source position (c_rarg1) 2021 Register dst, // destination array oop (c_rarg2) 2022 Register dst_pos, // destination position (c_rarg3) 2023 Register length, 2024 Register temp, 2025 Label& L_failed) { 2026 BLOCK_COMMENT("arraycopy_range_checks:"); 2027 2028 assert_different_registers(rscratch1, temp); 2029 2030 // if (src_pos + length > arrayOop(src)->length()) FAIL; 2031 __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes())); 2032 __ addw(temp, length, src_pos); 2033 __ cmpw(temp, rscratch1); 2034 __ br(Assembler::HI, L_failed); 2035 2036 // if (dst_pos + length > arrayOop(dst)->length()) FAIL; 2037 __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes())); 2038 __ addw(temp, length, dst_pos); 2039 __ cmpw(temp, rscratch1); 2040 __ br(Assembler::HI, L_failed); 2041 2042 // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'. 2043 __ movw(src_pos, src_pos); 2044 __ movw(dst_pos, dst_pos); 2045 2046 BLOCK_COMMENT("arraycopy_range_checks done"); 2047 } 2048 2049 // These stubs get called from some dumb test routine. 2050 // I'll write them properly when they're called from 2051 // something that's actually doing something. 2052 static void fake_arraycopy_stub(address src, address dst, int count) { 2053 assert(count == 0, "huh?"); 2054 } 2055 2056 2057 // 2058 // Generate 'unsafe' array copy stub 2059 // Though just as safe as the other stubs, it takes an unscaled 2060 // size_t argument instead of an element count. 2061 // 2062 // Input: 2063 // c_rarg0 - source array address 2064 // c_rarg1 - destination array address 2065 // c_rarg2 - byte count, treated as ssize_t, can be zero 2066 // 2067 // Examines the alignment of the operands and dispatches 2068 // to a long, int, short, or byte copy loop.
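// An equivalent C sketch of the dispatch below (illustrative only; the
// *_copy names stand for the corresponding entry points passed in):
//
//   uintptr_t bits = (uintptr_t)s | (uintptr_t)d | (uintptr_t)count;
//   if      ((bits & (BytesPerLong  - 1)) == 0) long_copy (s, d, count >> LogBytesPerLong);
//   else if ((bits & (BytesPerInt   - 1)) == 0) int_copy  (s, d, count >> LogBytesPerInt);
//   else if ((bits & (BytesPerShort - 1)) == 0) short_copy(s, d, count >> LogBytesPerShort);
//   else                                        byte_copy (s, d, count);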
2069 // 2070 address generate_unsafe_copy(address byte_copy_entry, 2071 address short_copy_entry, 2072 address int_copy_entry, 2073 address long_copy_entry) { 2074 StubGenStubId stub_id = StubGenStubId::unsafe_arraycopy_id; 2075 2076 Label L_long_aligned, L_int_aligned, L_short_aligned; 2077 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 2078 2079 __ align(CodeEntryAlignment); 2080 StubCodeMark mark(this, stub_id); 2081 address start = __ pc(); 2082 __ enter(); // required for proper stackwalking of RuntimeStub frame 2083 2084 // bump this on entry, not on exit: 2085 inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr); 2086 2087 __ orr(rscratch1, s, d); 2088 __ orr(rscratch1, rscratch1, count); 2089 2090 __ andr(rscratch1, rscratch1, BytesPerLong-1); 2091 __ cbz(rscratch1, L_long_aligned); 2092 __ andr(rscratch1, rscratch1, BytesPerInt-1); 2093 __ cbz(rscratch1, L_int_aligned); 2094 __ tbz(rscratch1, 0, L_short_aligned); 2095 __ b(RuntimeAddress(byte_copy_entry)); 2096 2097 __ BIND(L_short_aligned); 2098 __ lsr(count, count, LogBytesPerShort); // size => short_count 2099 __ b(RuntimeAddress(short_copy_entry)); 2100 __ BIND(L_int_aligned); 2101 __ lsr(count, count, LogBytesPerInt); // size => int_count 2102 __ b(RuntimeAddress(int_copy_entry)); 2103 __ BIND(L_long_aligned); 2104 __ lsr(count, count, LogBytesPerLong); // size => long_count 2105 __ b(RuntimeAddress(long_copy_entry)); 2106 2107 return start; 2108 } 2109 2110 // 2111 // Generate generic array copy stubs 2112 // 2113 // Input: 2114 // c_rarg0 - src oop 2115 // c_rarg1 - src_pos (32-bits) 2116 // c_rarg2 - dst oop 2117 // c_rarg3 - dst_pos (32-bits) 2118 // c_rarg4 - element count (32-bits) 2119 // 2120 // Output: 2121 // r0 == 0 - success 2122 // r0 == -1^K - failure, where K is partial transfer count 2123 // 2124 address generate_generic_copy(address byte_copy_entry, address short_copy_entry, 2125 address int_copy_entry, address oop_copy_entry, 2126 address long_copy_entry, address checkcast_copy_entry) { 2127 StubGenStubId stub_id = StubGenStubId::generic_arraycopy_id; 2128 2129 Label L_failed, L_objArray; 2130 Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs; 2131 2132 // Input registers 2133 const Register src = c_rarg0; // source array oop 2134 const Register src_pos = c_rarg1; // source position 2135 const Register dst = c_rarg2; // destination array oop 2136 const Register dst_pos = c_rarg3; // destination position 2137 const Register length = c_rarg4; 2138 2139 2140 // Registers used as temps 2141 const Register dst_klass = c_rarg5; 2142 2143 __ align(CodeEntryAlignment); 2144 2145 StubCodeMark mark(this, stub_id); 2146 2147 address start = __ pc(); 2148 2149 __ enter(); // required for proper stackwalking of RuntimeStub frame 2150 2151 // bump this on entry, not on exit: 2152 inc_counter_np(SharedRuntime::_generic_array_copy_ctr); 2153 2154 //----------------------------------------------------------------------- 2155 // Assembler stub will be used for this call to arraycopy 2156 // if the following conditions are met: 2157 // 2158 // (1) src and dst must not be null. 2159 // (2) src_pos must not be negative. 2160 // (3) dst_pos must not be negative. 2161 // (4) length must not be negative. 2162 // (5) src klass and dst klass should be the same and not null. 2163 // (6) src and dst should be arrays. 2164 // (7) src_pos + length must not exceed length of src. 2165 // (8) dst_pos + length must not exceed length of dst. 
2166 // 2167 2168 // if (src == nullptr) return -1; 2169 __ cbz(src, L_failed); 2170 2171 // if (src_pos < 0) return -1; 2172 __ tbnz(src_pos, 31, L_failed); // i.e. sign bit set 2173 2174 // if (dst == nullptr) return -1; 2175 __ cbz(dst, L_failed); 2176 2177 // if (dst_pos < 0) return -1; 2178 __ tbnz(dst_pos, 31, L_failed); // i.e. sign bit set 2179 2180 // registers used as temp 2181 const Register scratch_length = r16; // elements count to copy 2182 const Register scratch_src_klass = r17; // array klass 2183 const Register lh = r15; // layout helper 2184 2185 // if (length < 0) return -1; 2186 __ movw(scratch_length, length); // length (elements count, 32-bits value) 2187 __ tbnz(scratch_length, 31, L_failed); // i.e. sign bit set 2188 2189 __ load_klass(scratch_src_klass, src); 2190 #ifdef ASSERT 2191 // assert(src->klass() != nullptr); 2192 { 2193 BLOCK_COMMENT("assert klasses not null {"); 2194 Label L1, L2; 2195 __ cbnz(scratch_src_klass, L2); // it is broken if klass is null 2196 __ bind(L1); 2197 __ stop("broken null klass"); 2198 __ bind(L2); 2199 __ load_klass(rscratch1, dst); 2200 __ cbz(rscratch1, L1); // this would be broken also 2201 BLOCK_COMMENT("} assert klasses not null done"); 2202 } 2203 #endif 2204 2205 // Load layout helper (32-bits) 2206 // 2207 // |array_tag| | header_size | element_type | |log2_element_size| 2208 // 32 30 24 16 8 2 0 2209 // 2210 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0 2211 // 2212 2213 const int lh_offset = in_bytes(Klass::layout_helper_offset()); 2214 2215 // Handle objArrays completely differently... 2216 const jint objArray_lh = Klass::array_layout_helper(T_OBJECT); 2217 __ ldrw(lh, Address(scratch_src_klass, lh_offset)); 2218 __ movw(rscratch1, objArray_lh); 2219 __ eorw(rscratch2, lh, rscratch1); 2220 __ cbzw(rscratch2, L_objArray); 2221 2222 // if (src->klass() != dst->klass()) return -1; 2223 __ load_klass(rscratch2, dst); 2224 __ eor(rscratch2, rscratch2, scratch_src_klass); 2225 __ cbnz(rscratch2, L_failed); 2226 2227 // if (!src->is_Array()) return -1; 2228 __ tbz(lh, 31, L_failed); // i.e. (lh >= 0) 2229 2230 // At this point, it is known to be a typeArray (array_tag 0x3). 
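// An illustrative sketch of the element-size dispatch that follows (the
// emitted code performs the same selection with tbnz bit tests on the low
// bits of r15_elsize rather than a switch):
//
//   switch (lh & Klass::_lh_log2_element_size_mask) {  // log2(element size)
//     case 0: goto byte_copy_entry;    // jbyte / jboolean
//     case 1: goto short_copy_entry;   // jshort / jchar
//     case 2: goto int_copy_entry;     // jint / jfloat
//     case 3: goto long_copy_entry;    // jlong / jdouble
//   }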
2231 #ifdef ASSERT 2232 { 2233 BLOCK_COMMENT("assert primitive array {"); 2234 Label L; 2235 __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift); 2236 __ cmpw(lh, rscratch2); 2237 __ br(Assembler::GE, L); 2238 __ stop("must be a primitive array"); 2239 __ bind(L); 2240 BLOCK_COMMENT("} assert primitive array done"); 2241 } 2242 #endif 2243 2244 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2245 rscratch2, L_failed); 2246 2247 // TypeArrayKlass 2248 // 2249 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize); 2250 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize); 2251 // 2252 2253 const Register rscratch1_offset = rscratch1; // array offset 2254 const Register r15_elsize = lh; // element size 2255 2256 __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift, 2257 exact_log2(Klass::_lh_header_size_mask+1)); // array_offset 2258 __ add(src, src, rscratch1_offset); // src array offset 2259 __ add(dst, dst, rscratch1_offset); // dst array offset 2260 BLOCK_COMMENT("choose copy loop based on element size"); 2261 2262 // next registers should be set before the jump to corresponding stub 2263 const Register from = c_rarg0; // source array address 2264 const Register to = c_rarg1; // destination array address 2265 const Register count = c_rarg2; // elements count 2266 2267 // 'from', 'to', 'count' registers should be set in such order 2268 // since they are the same as 'src', 'src_pos', 'dst'. 2269 2270 assert(Klass::_lh_log2_element_size_shift == 0, "fix this code"); 2271 2272 // The possible values of elsize are 0-3, i.e. exact_log2(element 2273 // size in bytes). We do a simple bitwise binary search. 2274 __ BIND(L_copy_bytes); 2275 __ tbnz(r15_elsize, 1, L_copy_ints); 2276 __ tbnz(r15_elsize, 0, L_copy_shorts); 2277 __ lea(from, Address(src, src_pos));// src_addr 2278 __ lea(to, Address(dst, dst_pos));// dst_addr 2279 __ movw(count, scratch_length); // length 2280 __ b(RuntimeAddress(byte_copy_entry)); 2281 2282 __ BIND(L_copy_shorts); 2283 __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr 2284 __ lea(to, Address(dst, dst_pos, Address::lsl(1)));// dst_addr 2285 __ movw(count, scratch_length); // length 2286 __ b(RuntimeAddress(short_copy_entry)); 2287 2288 __ BIND(L_copy_ints); 2289 __ tbnz(r15_elsize, 0, L_copy_longs); 2290 __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr 2291 __ lea(to, Address(dst, dst_pos, Address::lsl(2)));// dst_addr 2292 __ movw(count, scratch_length); // length 2293 __ b(RuntimeAddress(int_copy_entry)); 2294 2295 __ BIND(L_copy_longs); 2296 #ifdef ASSERT 2297 { 2298 BLOCK_COMMENT("assert long copy {"); 2299 Label L; 2300 __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize 2301 __ cmpw(r15_elsize, LogBytesPerLong); 2302 __ br(Assembler::EQ, L); 2303 __ stop("must be long copy, but elsize is wrong"); 2304 __ bind(L); 2305 BLOCK_COMMENT("} assert long copy done"); 2306 } 2307 #endif 2308 __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr 2309 __ lea(to, Address(dst, dst_pos, Address::lsl(3)));// dst_addr 2310 __ movw(count, scratch_length); // length 2311 __ b(RuntimeAddress(long_copy_entry)); 2312 2313 // ObjArrayKlass 2314 __ BIND(L_objArray); 2315 // live at this point: scratch_src_klass, scratch_length, src[_pos], dst[_pos] 2316 2317 Label L_plain_copy, L_checkcast_copy; 2318 // test array classes for subtyping 2319 __ load_klass(r15, dst); 2320 __ cmp(scratch_src_klass, r15); // usual case is exact 
equality 2321 __ br(Assembler::NE, L_checkcast_copy); 2322 2323 // Identically typed arrays can be copied without element-wise checks. 2324 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2325 rscratch2, L_failed); 2326 2327 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2328 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2329 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2330 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2331 __ movw(count, scratch_length); // length 2332 __ BIND(L_plain_copy); 2333 __ b(RuntimeAddress(oop_copy_entry)); 2334 2335 __ BIND(L_checkcast_copy); 2336 // live at this point: scratch_src_klass, scratch_length, r15 (dst_klass) 2337 { 2338 // Before looking at dst.length, make sure dst is also an objArray. 2339 __ ldrw(rscratch1, Address(r15, lh_offset)); 2340 __ movw(rscratch2, objArray_lh); 2341 __ eorw(rscratch1, rscratch1, rscratch2); 2342 __ cbnzw(rscratch1, L_failed); 2343 2344 // It is safe to examine both src.length and dst.length. 2345 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2346 r15, L_failed); 2347 2348 __ load_klass(dst_klass, dst); // reload 2349 2350 // Marshal the base address arguments now, freeing registers. 2351 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2352 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2353 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2354 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2355 __ movw(count, length); // length (reloaded) 2356 Register sco_temp = c_rarg3; // this register is free now 2357 assert_different_registers(from, to, count, sco_temp, 2358 dst_klass, scratch_src_klass); 2359 // assert_clean_int(count, sco_temp); 2360 2361 // Generate the type check. 2362 const int sco_offset = in_bytes(Klass::super_check_offset_offset()); 2363 __ ldrw(sco_temp, Address(dst_klass, sco_offset)); 2364 2365 // Smashes rscratch1, rscratch2 2366 generate_type_check(scratch_src_klass, sco_temp, dst_klass, /*temps*/ noreg, noreg, noreg, 2367 L_plain_copy); 2368 2369 // Fetch destination element klass from the ObjArrayKlass header. 2370 int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset()); 2371 __ ldr(dst_klass, Address(dst_klass, ek_offset)); 2372 __ ldrw(sco_temp, Address(dst_klass, sco_offset)); 2373 2374 // the checkcast_copy loop needs two extra arguments: 2375 assert(c_rarg3 == sco_temp, "#3 already in place"); 2376 // Set up arguments for checkcast_copy_entry. 2377 __ mov(c_rarg4, dst_klass); // dst.klass.element_klass 2378 __ b(RuntimeAddress(checkcast_copy_entry)); 2379 } 2380 2381 __ BIND(L_failed); 2382 __ mov(r0, -1); 2383 __ leave(); // required for proper stackwalking of RuntimeStub frame 2384 __ ret(lr); 2385 2386 return start; 2387 } 2388 2389 // 2390 // Generate stub for array fill. If "aligned" is true, the 2391 // "to" address is assumed to be heapword aligned. 
2392 // 2393 // Arguments for generated stub: 2394 // to: c_rarg0 2395 // value: c_rarg1 2396 // count: c_rarg2 treated as signed 2397 // 2398 address generate_fill(StubGenStubId stub_id) { 2399 BasicType t; 2400 bool aligned; 2401 2402 switch (stub_id) { 2403 case jbyte_fill_id: 2404 t = T_BYTE; 2405 aligned = false; 2406 break; 2407 case jshort_fill_id: 2408 t = T_SHORT; 2409 aligned = false; 2410 break; 2411 case jint_fill_id: 2412 t = T_INT; 2413 aligned = false; 2414 break; 2415 case arrayof_jbyte_fill_id: 2416 t = T_BYTE; 2417 aligned = true; 2418 break; 2419 case arrayof_jshort_fill_id: 2420 t = T_SHORT; 2421 aligned = true; 2422 break; 2423 case arrayof_jint_fill_id: 2424 t = T_INT; 2425 aligned = true; 2426 break; 2427 default: 2428 ShouldNotReachHere(); 2429 }; 2430 2431 __ align(CodeEntryAlignment); 2432 StubCodeMark mark(this, stub_id); 2433 address start = __ pc(); 2434 2435 BLOCK_COMMENT("Entry:"); 2436 2437 const Register to = c_rarg0; // destination array address 2438 const Register value = c_rarg1; // value 2439 const Register count = c_rarg2; // elements count 2440 2441 const Register bz_base = r10; // base for block_zero routine 2442 const Register cnt_words = r11; // temp register 2443 2444 __ enter(); 2445 2446 Label L_fill_elements, L_exit1; 2447 2448 int shift = -1; 2449 switch (t) { 2450 case T_BYTE: 2451 shift = 0; 2452 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2453 __ bfi(value, value, 8, 8); // 8 bit -> 16 bit 2454 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit 2455 __ br(Assembler::LO, L_fill_elements); 2456 break; 2457 case T_SHORT: 2458 shift = 1; 2459 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2460 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit 2461 __ br(Assembler::LO, L_fill_elements); 2462 break; 2463 case T_INT: 2464 shift = 2; 2465 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2466 __ br(Assembler::LO, L_fill_elements); 2467 break; 2468 default: ShouldNotReachHere(); 2469 } 2470 2471 // Align destination address on an 8-byte boundary. 2472 Label L_skip_align1, L_skip_align2, L_skip_align4; 2473 if (!aligned) { 2474 switch (t) { 2475 case T_BYTE: 2476 // One byte misalignment happens only for byte arrays. 2477 __ tbz(to, 0, L_skip_align1); 2478 __ strb(value, Address(__ post(to, 1))); 2479 __ subw(count, count, 1); 2480 __ bind(L_skip_align1); 2481 // Fallthrough 2482 case T_SHORT: 2483 // Two bytes misalignment happens only for byte and short (char) arrays. 2484 __ tbz(to, 1, L_skip_align2); 2485 __ strh(value, Address(__ post(to, 2))); 2486 __ subw(count, count, 2 >> shift); 2487 __ bind(L_skip_align2); 2488 // Fallthrough 2489 case T_INT: 2490 // Align to 8 bytes, we know we are 4 byte aligned to start. 2491 __ tbz(to, 2, L_skip_align4); 2492 __ strw(value, Address(__ post(to, 4))); 2493 __ subw(count, count, 4 >> shift); 2494 __ bind(L_skip_align4); 2495 break; 2496 default: ShouldNotReachHere(); 2497 } 2498 } 2499 2500 // 2501 // Fill large chunks 2502 // 2503 __ lsrw(cnt_words, count, 3 - shift); // number of words 2504 __ bfi(value, value, 32, 32); // 32 bit -> 64 bit 2505 __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift); 2506 if (UseBlockZeroing) { 2507 Label non_block_zeroing, rest; 2508 // If the fill value is zero we can use the fast zero_words().
2509 __ cbnz(value, non_block_zeroing); 2510 __ mov(bz_base, to); 2511 __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord); 2512 address tpc = __ zero_words(bz_base, cnt_words); 2513 if (tpc == nullptr) { 2514 fatal("CodeCache is full at generate_fill"); 2515 } 2516 __ b(rest); 2517 __ bind(non_block_zeroing); 2518 __ fill_words(to, cnt_words, value); 2519 __ bind(rest); 2520 } else { 2521 __ fill_words(to, cnt_words, value); 2522 } 2523 2524 // Remaining count is less than 8 bytes. Fill it by a single store. 2525 // Note that the total length is no less than 8 bytes. 2526 if (t == T_BYTE || t == T_SHORT) { 2527 Label L_exit1; 2528 __ cbzw(count, L_exit1); 2529 __ add(to, to, count, Assembler::LSL, shift); // points to the end 2530 __ str(value, Address(to, -8)); // overwrite some elements 2531 __ bind(L_exit1); 2532 __ leave(); 2533 __ ret(lr); 2534 } 2535 2536 // Handle copies less than 8 bytes. 2537 Label L_fill_2, L_fill_4, L_exit2; 2538 __ bind(L_fill_elements); 2539 switch (t) { 2540 case T_BYTE: 2541 __ tbz(count, 0, L_fill_2); 2542 __ strb(value, Address(__ post(to, 1))); 2543 __ bind(L_fill_2); 2544 __ tbz(count, 1, L_fill_4); 2545 __ strh(value, Address(__ post(to, 2))); 2546 __ bind(L_fill_4); 2547 __ tbz(count, 2, L_exit2); 2548 __ strw(value, Address(to)); 2549 break; 2550 case T_SHORT: 2551 __ tbz(count, 0, L_fill_4); 2552 __ strh(value, Address(__ post(to, 2))); 2553 __ bind(L_fill_4); 2554 __ tbz(count, 1, L_exit2); 2555 __ strw(value, Address(to)); 2556 break; 2557 case T_INT: 2558 __ cbzw(count, L_exit2); 2559 __ strw(value, Address(to)); 2560 break; 2561 default: ShouldNotReachHere(); 2562 } 2563 __ bind(L_exit2); 2564 __ leave(); 2565 __ ret(lr); 2566 return start; 2567 } 2568 2569 address generate_unsafecopy_common_error_exit() { 2570 address start_pc = __ pc(); 2571 __ leave(); 2572 __ mov(r0, 0); 2573 __ ret(lr); 2574 return start_pc; 2575 } 2576 2577 // 2578 // Generate 'unsafe' set memory stub 2579 // Though just as safe as the other stubs, it takes an unscaled 2580 // size_t (# bytes) argument instead of an element count. 2581 // 2582 // This fill operation is atomicity preserving: as long as the 2583 // address supplied is sufficiently aligned, all writes of up to 64 2584 // bits in size are single-copy atomic. 
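// The tail below the 64-byte loop is a binary decomposition of the remaining
// byte count; an illustrative sketch (not emitted code; 'rem' is the low six
// bits of count, i.e. the bytes still to set):
//
//   if (rem & 32) { store 32 bytes of the replicated value; dest += 32; }
//   if (rem & 16) { store 16 bytes;                          dest += 16; }
//   if (rem &  8) { store  8 bytes;                          dest +=  8; }
//   if (rem &  4) { store  4 bytes;                          dest +=  4; }
//   if (rem &  2) { store  2 bytes;                          dest +=  2; }
//   if (rem &  1) { store  1 byte;                                       }
//
// Each unit is written with a single store of the replicated value, which is
// what makes the aligned sub-64-bit writes single-copy atomic as claimed.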
2585 // 2586 // Input: 2587 // c_rarg0 - destination array address 2588 // c_rarg1 - byte count (size_t) 2589 // c_rarg2 - byte value 2590 // 2591 address generate_unsafe_setmemory() { 2592 __ align(CodeEntryAlignment); 2593 StubCodeMark mark(this, StubGenStubId::unsafe_setmemory_id); 2594 address start = __ pc(); 2595 2596 Register dest = c_rarg0, count = c_rarg1, value = c_rarg2; 2597 Label tail; 2598 2599 UnsafeMemoryAccessMark umam(this, true, false); 2600 2601 __ enter(); // required for proper stackwalking of RuntimeStub frame 2602 2603 __ dup(v0, __ T16B, value); 2604 2605 if (AvoidUnalignedAccesses) { 2606 __ cmp(count, (u1)16); 2607 __ br(__ LO, tail); 2608 2609 __ mov(rscratch1, 16); 2610 __ andr(rscratch2, dest, 15); 2611 __ sub(rscratch1, rscratch1, rscratch2); // Bytes needed to 16-align dest 2612 __ strq(v0, Address(dest)); 2613 __ sub(count, count, rscratch1); 2614 __ add(dest, dest, rscratch1); 2615 } 2616 2617 __ subs(count, count, (u1)64); 2618 __ br(__ LO, tail); 2619 { 2620 Label again; 2621 __ bind(again); 2622 __ stpq(v0, v0, Address(dest)); 2623 __ stpq(v0, v0, Address(dest, 32)); 2624 2625 __ subs(count, count, 64); 2626 __ add(dest, dest, 64); 2627 __ br(__ HS, again); 2628 } 2629 2630 __ bind(tail); 2631 // The count of bytes is off by 64, but we don't need to correct 2632 // it because we're only going to use the least-significant few 2633 // count bits from here on. 2634 // __ add(count, count, 64); 2635 2636 { 2637 Label dont; 2638 __ tbz(count, exact_log2(32), dont); 2639 __ stpq(v0, v0, __ post(dest, 32)); 2640 __ bind(dont); 2641 } 2642 { 2643 Label dont; 2644 __ tbz(count, exact_log2(16), dont); 2645 __ strq(v0, __ post(dest, 16)); 2646 __ bind(dont); 2647 } 2648 { 2649 Label dont; 2650 __ tbz(count, exact_log2(8), dont); 2651 __ strd(v0, __ post(dest, 8)); 2652 __ bind(dont); 2653 } 2654 2655 Label finished; 2656 __ tst(count, 7); 2657 __ br(__ EQ, finished); 2658 2659 { 2660 Label dont; 2661 __ tbz(count, exact_log2(4), dont); 2662 __ strs(v0, __ post(dest, 4)); 2663 __ bind(dont); 2664 } 2665 { 2666 Label dont; 2667 __ tbz(count, exact_log2(2), dont); 2668 __ bfi(value, value, 8, 8); 2669 __ strh(value, __ post(dest, 2)); 2670 __ bind(dont); 2671 } 2672 { 2673 Label dont; 2674 __ tbz(count, exact_log2(1), dont); 2675 __ strb(value, Address(dest)); 2676 __ bind(dont); 2677 } 2678 2679 __ bind(finished); 2680 __ leave(); 2681 __ ret(lr); 2682 2683 return start; 2684 } 2685 2686 address generate_data_cache_writeback() { 2687 const Register line = c_rarg0; // address of line to write back 2688 2689 __ align(CodeEntryAlignment); 2690 2691 StubGenStubId stub_id = StubGenStubId::data_cache_writeback_id; 2692 StubCodeMark mark(this, stub_id); 2693 2694 address start = __ pc(); 2695 __ enter(); 2696 __ cache_wb(Address(line, 0)); 2697 __ leave(); 2698 __ ret(lr); 2699 2700 return start; 2701 } 2702 2703 address generate_data_cache_writeback_sync() { 2704 const Register is_pre = c_rarg0; // pre or post sync 2705 2706 __ align(CodeEntryAlignment); 2707 2708 StubGenStubId stub_id = StubGenStubId::data_cache_writeback_sync_id; 2709 StubCodeMark mark(this, stub_id); 2710 2711 // pre wbsync is a no-op 2712 // post wbsync translates to an sfence 2713 2714 Label skip; 2715 address start = __ pc(); 2716 __ enter(); 2717 __ cbnz(is_pre, skip); 2718 __ cache_wbsync(false); 2719 __ bind(skip); 2720 __ leave(); 2721 __ ret(lr); 2722 2723 return start; 2724 } 2725 2726 void generate_arraycopy_stubs() { 2727 address entry; 2728 address entry_jbyte_arraycopy; 2729 address 
entry_jshort_arraycopy; 2730 address entry_jint_arraycopy; 2731 address entry_oop_arraycopy; 2732 address entry_jlong_arraycopy; 2733 address entry_checkcast_arraycopy; 2734 2735 address ucm_common_error_exit = generate_unsafecopy_common_error_exit(); 2736 UnsafeMemoryAccess::set_common_exit_stub_pc(ucm_common_error_exit); 2737 2738 generate_copy_longs(StubGenStubId::copy_byte_f_id, IN_HEAP | IS_ARRAY, copy_f, r0, r1, r15); 2739 generate_copy_longs(StubGenStubId::copy_byte_b_id, IN_HEAP | IS_ARRAY, copy_b, r0, r1, r15); 2740 2741 generate_copy_longs(StubGenStubId::copy_oop_f_id, IN_HEAP | IS_ARRAY, copy_obj_f, r0, r1, r15); 2742 generate_copy_longs(StubGenStubId::copy_oop_b_id, IN_HEAP | IS_ARRAY, copy_obj_b, r0, r1, r15); 2743 2744 generate_copy_longs(StubGenStubId::copy_oop_uninit_f_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, copy_obj_uninit_f, r0, r1, r15); 2745 generate_copy_longs(StubGenStubId::copy_oop_uninit_b_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, copy_obj_uninit_b, r0, r1, r15); 2746 2747 StubRoutines::aarch64::_zero_blocks = generate_zero_blocks(); 2748 2749 //*** jbyte 2750 // Always need aligned and unaligned versions 2751 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::jbyte_disjoint_arraycopy_id, &entry); 2752 StubRoutines::_jbyte_arraycopy = generate_conjoint_copy(StubGenStubId::jbyte_arraycopy_id, entry, &entry_jbyte_arraycopy); 2753 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::arrayof_jbyte_disjoint_arraycopy_id, &entry); 2754 StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_copy(StubGenStubId::arrayof_jbyte_arraycopy_id, entry, nullptr); 2755 2756 //*** jshort 2757 // Always need aligned and unaligned versions 2758 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::jshort_disjoint_arraycopy_id, &entry); 2759 StubRoutines::_jshort_arraycopy = generate_conjoint_copy(StubGenStubId::jshort_arraycopy_id, entry, &entry_jshort_arraycopy); 2760 StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::arrayof_jshort_disjoint_arraycopy_id, &entry); 2761 StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_copy(StubGenStubId::arrayof_jshort_arraycopy_id, entry, nullptr); 2762 2763 //*** jint 2764 // Aligned versions 2765 StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::arrayof_jint_disjoint_arraycopy_id, &entry); 2766 StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_copy(StubGenStubId::arrayof_jint_arraycopy_id, entry, &entry_jint_arraycopy); 2767 // In 64 bit we need both aligned and unaligned versions of jint arraycopy. 
2768 // entry_jint_arraycopy always points to the unaligned version 2769 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::jint_disjoint_arraycopy_id, &entry); 2770 StubRoutines::_jint_arraycopy = generate_conjoint_copy(StubGenStubId::jint_arraycopy_id, entry, &entry_jint_arraycopy); 2771 2772 //*** jlong 2773 // It is always aligned 2774 StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::arrayof_jlong_disjoint_arraycopy_id, &entry); 2775 StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_copy(StubGenStubId::arrayof_jlong_arraycopy_id, entry, &entry_jlong_arraycopy); 2776 StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy; 2777 StubRoutines::_jlong_arraycopy = StubRoutines::_arrayof_jlong_arraycopy; 2778 2779 //*** oops 2780 { 2781 // With compressed oops we need unaligned versions; notice that 2782 // we overwrite entry_oop_arraycopy. 2783 bool aligned = !UseCompressedOops; 2784 2785 StubRoutines::_arrayof_oop_disjoint_arraycopy 2786 = generate_disjoint_copy(StubGenStubId::arrayof_oop_disjoint_arraycopy_id, &entry); 2787 StubRoutines::_arrayof_oop_arraycopy 2788 = generate_conjoint_copy(StubGenStubId::arrayof_oop_arraycopy_id, entry, &entry_oop_arraycopy); 2789 // Aligned versions without pre-barriers 2790 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit 2791 = generate_disjoint_copy(StubGenStubId::arrayof_oop_disjoint_arraycopy_uninit_id, &entry); 2792 StubRoutines::_arrayof_oop_arraycopy_uninit 2793 = generate_conjoint_copy(StubGenStubId::arrayof_oop_arraycopy_uninit_id, entry, nullptr); 2794 } 2795 2796 StubRoutines::_oop_disjoint_arraycopy = StubRoutines::_arrayof_oop_disjoint_arraycopy; 2797 StubRoutines::_oop_arraycopy = StubRoutines::_arrayof_oop_arraycopy; 2798 StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit; 2799 StubRoutines::_oop_arraycopy_uninit = StubRoutines::_arrayof_oop_arraycopy_uninit; 2800 2801 StubRoutines::_checkcast_arraycopy = generate_checkcast_copy(StubGenStubId::checkcast_arraycopy_id, &entry_checkcast_arraycopy); 2802 StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy(StubGenStubId::checkcast_arraycopy_uninit_id, nullptr); 2803 2804 StubRoutines::_unsafe_arraycopy = generate_unsafe_copy(entry_jbyte_arraycopy, 2805 entry_jshort_arraycopy, 2806 entry_jint_arraycopy, 2807 entry_jlong_arraycopy); 2808 2809 StubRoutines::_generic_arraycopy = generate_generic_copy(entry_jbyte_arraycopy, 2810 entry_jshort_arraycopy, 2811 entry_jint_arraycopy, 2812 entry_oop_arraycopy, 2813 entry_jlong_arraycopy, 2814 entry_checkcast_arraycopy); 2815 2816 StubRoutines::_jbyte_fill = generate_fill(StubGenStubId::jbyte_fill_id); 2817 StubRoutines::_jshort_fill = generate_fill(StubGenStubId::jshort_fill_id); 2818 StubRoutines::_jint_fill = generate_fill(StubGenStubId::jint_fill_id); 2819 StubRoutines::_arrayof_jbyte_fill = generate_fill(StubGenStubId::arrayof_jbyte_fill_id); 2820 StubRoutines::_arrayof_jshort_fill = generate_fill(StubGenStubId::arrayof_jshort_fill_id); 2821 StubRoutines::_arrayof_jint_fill = generate_fill(StubGenStubId::arrayof_jint_fill_id); 2822 } 2823 2824 void generate_math_stubs() { Unimplemented(); } 2825 2826 // Arguments: 2827 // 2828 // Inputs: 2829 // c_rarg0 - source byte array address 2830 // c_rarg1 - destination byte array address 2831 // c_rarg2 - K (key) in little endian int array 2832 // 2833 address generate_aescrypt_encryptBlock() { 2834 __ align(CodeEntryAlignment); 2835 
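// Note: 'keylen' below is the length in ints of the expanded key schedule
// (44, 52 or 60 ints for AES-128/192/256). For reference, an illustrative
// mapping to the AES round count (not used literally by the emitted code):
//
//   int rounds = keylen / 4 - 1;   // 44 -> 10, 52 -> 12, 60 -> 14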
StubGenStubId stub_id = StubGenStubId::aescrypt_encryptBlock_id; 2836 StubCodeMark mark(this, stub_id); 2837 2838 const Register from = c_rarg0; // source array address 2839 const Register to = c_rarg1; // destination array address 2840 const Register key = c_rarg2; // key array address 2841 const Register keylen = rscratch1; 2842 2843 address start = __ pc(); 2844 __ enter(); 2845 2846 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2847 2848 __ aesenc_loadkeys(key, keylen); 2849 __ aesecb_encrypt(from, to, keylen); 2850 2851 __ mov(r0, 0); 2852 2853 __ leave(); 2854 __ ret(lr); 2855 2856 return start; 2857 } 2858 2859 // Arguments: 2860 // 2861 // Inputs: 2862 // c_rarg0 - source byte array address 2863 // c_rarg1 - destination byte array address 2864 // c_rarg2 - K (key) in little endian int array 2865 // 2866 address generate_aescrypt_decryptBlock() { 2867 assert(UseAES, "need AES cryptographic extension support"); 2868 __ align(CodeEntryAlignment); 2869 StubGenStubId stub_id = StubGenStubId::aescrypt_decryptBlock_id; 2870 StubCodeMark mark(this, stub_id); 2871 Label L_doLast; 2872 2873 const Register from = c_rarg0; // source array address 2874 const Register to = c_rarg1; // destination array address 2875 const Register key = c_rarg2; // key array address 2876 const Register keylen = rscratch1; 2877 2878 address start = __ pc(); 2879 __ enter(); // required for proper stackwalking of RuntimeStub frame 2880 2881 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2882 2883 __ aesecb_decrypt(from, to, key, keylen); 2884 2885 __ mov(r0, 0); 2886 2887 __ leave(); 2888 __ ret(lr); 2889 2890 return start; 2891 } 2892 2893 // Arguments: 2894 // 2895 // Inputs: 2896 // c_rarg0 - source byte array address 2897 // c_rarg1 - destination byte array address 2898 // c_rarg2 - K (key) in little endian int array 2899 // c_rarg3 - r vector byte array address 2900 // c_rarg4 - input length 2901 // 2902 // Output: 2903 // x0 - input length 2904 // 2905 address generate_cipherBlockChaining_encryptAESCrypt() { 2906 assert(UseAES, "need AES cryptographic extension support"); 2907 __ align(CodeEntryAlignment); 2908 StubGenStubId stub_id = StubGenStubId::cipherBlockChaining_encryptAESCrypt_id; 2909 StubCodeMark mark(this, stub_id); 2910 2911 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 2912 2913 const Register from = c_rarg0; // source array address 2914 const Register to = c_rarg1; // destination array address 2915 const Register key = c_rarg2; // key array address 2916 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2917 // and left with the results of the last encryption block 2918 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2919 const Register keylen = rscratch1; 2920 2921 address start = __ pc(); 2922 2923 __ enter(); 2924 2925 __ movw(rscratch2, len_reg); 2926 2927 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2928 2929 __ ld1(v0, __ T16B, rvec); 2930 2931 __ cmpw(keylen, 52); 2932 __ br(Assembler::CC, L_loadkeys_44); 2933 __ br(Assembler::EQ, L_loadkeys_52); 2934 2935 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2936 __ rev32(v17, __ T16B, v17); 2937 __ rev32(v18, __ T16B, v18); 2938 __ BIND(L_loadkeys_52); 2939 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2940 __ rev32(v19, __ T16B, v19); 2941 __ 
rev32(v20, __ T16B, v20); 2942 __ BIND(L_loadkeys_44); 2943 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2944 __ rev32(v21, __ T16B, v21); 2945 __ rev32(v22, __ T16B, v22); 2946 __ rev32(v23, __ T16B, v23); 2947 __ rev32(v24, __ T16B, v24); 2948 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2949 __ rev32(v25, __ T16B, v25); 2950 __ rev32(v26, __ T16B, v26); 2951 __ rev32(v27, __ T16B, v27); 2952 __ rev32(v28, __ T16B, v28); 2953 __ ld1(v29, v30, v31, __ T16B, key); 2954 __ rev32(v29, __ T16B, v29); 2955 __ rev32(v30, __ T16B, v30); 2956 __ rev32(v31, __ T16B, v31); 2957 2958 __ BIND(L_aes_loop); 2959 __ ld1(v1, __ T16B, __ post(from, 16)); 2960 __ eor(v0, __ T16B, v0, v1); 2961 2962 __ br(Assembler::CC, L_rounds_44); 2963 __ br(Assembler::EQ, L_rounds_52); 2964 2965 __ aese(v0, v17); __ aesmc(v0, v0); 2966 __ aese(v0, v18); __ aesmc(v0, v0); 2967 __ BIND(L_rounds_52); 2968 __ aese(v0, v19); __ aesmc(v0, v0); 2969 __ aese(v0, v20); __ aesmc(v0, v0); 2970 __ BIND(L_rounds_44); 2971 __ aese(v0, v21); __ aesmc(v0, v0); 2972 __ aese(v0, v22); __ aesmc(v0, v0); 2973 __ aese(v0, v23); __ aesmc(v0, v0); 2974 __ aese(v0, v24); __ aesmc(v0, v0); 2975 __ aese(v0, v25); __ aesmc(v0, v0); 2976 __ aese(v0, v26); __ aesmc(v0, v0); 2977 __ aese(v0, v27); __ aesmc(v0, v0); 2978 __ aese(v0, v28); __ aesmc(v0, v0); 2979 __ aese(v0, v29); __ aesmc(v0, v0); 2980 __ aese(v0, v30); 2981 __ eor(v0, __ T16B, v0, v31); 2982 2983 __ st1(v0, __ T16B, __ post(to, 16)); 2984 2985 __ subw(len_reg, len_reg, 16); 2986 __ cbnzw(len_reg, L_aes_loop); 2987 2988 __ st1(v0, __ T16B, rvec); 2989 2990 __ mov(r0, rscratch2); 2991 2992 __ leave(); 2993 __ ret(lr); 2994 2995 return start; 2996 } 2997 2998 // Arguments: 2999 // 3000 // Inputs: 3001 // c_rarg0 - source byte array address 3002 // c_rarg1 - destination byte array address 3003 // c_rarg2 - K (key) in little endian int array 3004 // c_rarg3 - r vector byte array address 3005 // c_rarg4 - input length 3006 // 3007 // Output: 3008 // r0 - input length 3009 // 3010 address generate_cipherBlockChaining_decryptAESCrypt() { 3011 assert(UseAES, "need AES cryptographic extension support"); 3012 __ align(CodeEntryAlignment); 3013 StubGenStubId stub_id = StubGenStubId::cipherBlockChaining_decryptAESCrypt_id; 3014 StubCodeMark mark(this, stub_id); 3015 3016 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 3017 3018 const Register from = c_rarg0; // source array address 3019 const Register to = c_rarg1; // destination array address 3020 const Register key = c_rarg2; // key array address 3021 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 3022 // and left with the results of the last encryption block 3023 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 3024 const Register keylen = rscratch1; 3025 3026 address start = __ pc(); 3027 3028 __ enter(); 3029 3030 __ movw(rscratch2, len_reg); 3031 3032 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 3033 3034 __ ld1(v2, __ T16B, rvec); 3035 3036 __ ld1(v31, __ T16B, __ post(key, 16)); 3037 __ rev32(v31, __ T16B, v31); 3038 3039 __ cmpw(keylen, 52); 3040 __ br(Assembler::CC, L_loadkeys_44); 3041 __ br(Assembler::EQ, L_loadkeys_52); 3042 3043 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 3044 __ rev32(v17, __ T16B, v17); 3045 __ rev32(v18, __ T16B, v18); 3046 __ BIND(L_loadkeys_52); 3047 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 3048 __ rev32(v19, __ T16B, 
v19); 3049 __ rev32(v20, __ T16B, v20); 3050 __ BIND(L_loadkeys_44); 3051 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 3052 __ rev32(v21, __ T16B, v21); 3053 __ rev32(v22, __ T16B, v22); 3054 __ rev32(v23, __ T16B, v23); 3055 __ rev32(v24, __ T16B, v24); 3056 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 3057 __ rev32(v25, __ T16B, v25); 3058 __ rev32(v26, __ T16B, v26); 3059 __ rev32(v27, __ T16B, v27); 3060 __ rev32(v28, __ T16B, v28); 3061 __ ld1(v29, v30, __ T16B, key); 3062 __ rev32(v29, __ T16B, v29); 3063 __ rev32(v30, __ T16B, v30); 3064 3065 __ BIND(L_aes_loop); 3066 __ ld1(v0, __ T16B, __ post(from, 16)); 3067 __ orr(v1, __ T16B, v0, v0); 3068 3069 __ br(Assembler::CC, L_rounds_44); 3070 __ br(Assembler::EQ, L_rounds_52); 3071 3072 __ aesd(v0, v17); __ aesimc(v0, v0); 3073 __ aesd(v0, v18); __ aesimc(v0, v0); 3074 __ BIND(L_rounds_52); 3075 __ aesd(v0, v19); __ aesimc(v0, v0); 3076 __ aesd(v0, v20); __ aesimc(v0, v0); 3077 __ BIND(L_rounds_44); 3078 __ aesd(v0, v21); __ aesimc(v0, v0); 3079 __ aesd(v0, v22); __ aesimc(v0, v0); 3080 __ aesd(v0, v23); __ aesimc(v0, v0); 3081 __ aesd(v0, v24); __ aesimc(v0, v0); 3082 __ aesd(v0, v25); __ aesimc(v0, v0); 3083 __ aesd(v0, v26); __ aesimc(v0, v0); 3084 __ aesd(v0, v27); __ aesimc(v0, v0); 3085 __ aesd(v0, v28); __ aesimc(v0, v0); 3086 __ aesd(v0, v29); __ aesimc(v0, v0); 3087 __ aesd(v0, v30); 3088 __ eor(v0, __ T16B, v0, v31); 3089 __ eor(v0, __ T16B, v0, v2); 3090 3091 __ st1(v0, __ T16B, __ post(to, 16)); 3092 __ orr(v2, __ T16B, v1, v1); 3093 3094 __ subw(len_reg, len_reg, 16); 3095 __ cbnzw(len_reg, L_aes_loop); 3096 3097 __ st1(v2, __ T16B, rvec); 3098 3099 __ mov(r0, rscratch2); 3100 3101 __ leave(); 3102 __ ret(lr); 3103 3104 return start; 3105 } 3106 3107 // Big-endian 128-bit + 64-bit -> 128-bit addition. 3108 // Inputs: 128-bits. in is preserved. 3109 // The least-significant 64-bit word is in the upper dword of each vector. 3110 // inc (the 64-bit increment) is preserved. Its lower dword must be zero. 3111 // Output: result 3112 void be_add_128_64(FloatRegister result, FloatRegister in, 3113 FloatRegister inc, FloatRegister tmp) { 3114 assert_different_registers(result, tmp, inc); 3115 3116 __ addv(result, __ T2D, in, inc); // Add inc to the least-significant dword of 3117 // input 3118 __ cm(__ HI, tmp, __ T2D, inc, result);// Check for result overflowing 3119 __ ext(tmp, __ T16B, tmp, tmp, 0x08); // Swap LSD of comparison result to MSD and 3120 // MSD == 0 (must be!) to LSD 3121 __ subv(result, __ T2D, result, tmp); // Subtract -1 from MSD if there was an overflow 3122 } 3123 3124 // CTR AES crypt. 
3125 // Arguments: 3126 // 3127 // Inputs: 3128 // c_rarg0 - source byte array address 3129 // c_rarg1 - destination byte array address 3130 // c_rarg2 - K (key) in little endian int array 3131 // c_rarg3 - counter vector byte array address 3132 // c_rarg4 - input length 3133 // c_rarg5 - saved encryptedCounter start 3134 // c_rarg6 - saved used length 3135 // 3136 // Output: 3137 // r0 - input length 3138 // 3139 address generate_counterMode_AESCrypt() { 3140 const Register in = c_rarg0; 3141 const Register out = c_rarg1; 3142 const Register key = c_rarg2; 3143 const Register counter = c_rarg3; 3144 const Register saved_len = c_rarg4, len = r10; 3145 const Register saved_encrypted_ctr = c_rarg5; 3146 const Register used_ptr = c_rarg6, used = r12; 3147 3148 const Register offset = r7; 3149 const Register keylen = r11; 3150 3151 const unsigned char block_size = 16; 3152 const int bulk_width = 4; 3153 // NB: bulk_width can be 4 or 8. 8 gives slightly faster 3154 // performance with larger data sizes, but it also means that the 3155 // fast path isn't used until you have at least 8 blocks, and up 3156 // to 127 bytes of data will be executed on the slow path. For 3157 // that reason, and also so as not to blow away too much icache, 4 3158 // blocks seems like a sensible compromise. 3159 3160 // Algorithm: 3161 // 3162 // if (len == 0) { 3163 // goto DONE; 3164 // } 3165 // int result = len; 3166 // do { 3167 // if (used >= blockSize) { 3168 // if (len >= bulk_width * blockSize) { 3169 // CTR_large_block(); 3170 // if (len == 0) 3171 // goto DONE; 3172 // } 3173 // for (;;) { 3174 // 16ByteVector v0 = counter; 3175 // embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0); 3176 // used = 0; 3177 // if (len < blockSize) 3178 // break; /* goto NEXT */ 3179 // 16ByteVector v1 = load16Bytes(in, offset); 3180 // v1 = v1 ^ encryptedCounter; 3181 // store16Bytes(out, offset); 3182 // used = blockSize; 3183 // offset += blockSize; 3184 // len -= blockSize; 3185 // if (len == 0) 3186 // goto DONE; 3187 // } 3188 // } 3189 // NEXT: 3190 // out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]); 3191 // len--; 3192 // } while (len != 0); 3193 // DONE: 3194 // return result; 3195 // 3196 // CTR_large_block() 3197 // Wide bulk encryption of whole blocks. 
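// The 128-bit big-endian counter increment performed below (via rev64 and
// be_add_128_64 on vector registers) is equivalent to this scalar sketch
// (illustrative only; load_be64/store_be64 are assumed helper names):
//
//   uint64_t hi = load_be64(counter), lo = load_be64(counter + 8);
//   lo += 1;
//   if (lo == 0) hi += 1;                 // carry out of the low dword
//   store_be64(counter, hi); store_be64(counter + 8, lo);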
3198 3199 __ align(CodeEntryAlignment); 3200 StubGenStubId stub_id = StubGenStubId::counterMode_AESCrypt_id; 3201 StubCodeMark mark(this, stub_id); 3202 const address start = __ pc(); 3203 __ enter(); 3204 3205 Label DONE, CTR_large_block, large_block_return; 3206 __ ldrw(used, Address(used_ptr)); 3207 __ cbzw(saved_len, DONE); 3208 3209 __ mov(len, saved_len); 3210 __ mov(offset, 0); 3211 3212 // Compute #rounds for AES based on the length of the key array 3213 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 3214 3215 __ aesenc_loadkeys(key, keylen); 3216 3217 { 3218 Label L_CTR_loop, NEXT; 3219 3220 __ bind(L_CTR_loop); 3221 3222 __ cmp(used, block_size); 3223 __ br(__ LO, NEXT); 3224 3225 // Maybe we have a lot of data 3226 __ subsw(rscratch1, len, bulk_width * block_size); 3227 __ br(__ HS, CTR_large_block); 3228 __ BIND(large_block_return); 3229 __ cbzw(len, DONE); 3230 3231 // Setup the counter 3232 __ movi(v4, __ T4S, 0); 3233 __ movi(v5, __ T4S, 1); 3234 __ ins(v4, __ S, v5, 2, 2); // v4 contains { 0, 1 } 3235 3236 // 128-bit big-endian increment 3237 __ ld1(v0, __ T16B, counter); 3238 __ rev64(v16, __ T16B, v0); 3239 be_add_128_64(v16, v16, v4, /*tmp*/v5); 3240 __ rev64(v16, __ T16B, v16); 3241 __ st1(v16, __ T16B, counter); 3242 // Previous counter value is in v0 3243 // v4 contains { 0, 1 } 3244 3245 { 3246 // We have fewer than bulk_width blocks of data left. Encrypt 3247 // them one by one until there is less than a full block 3248 // remaining, being careful to save both the encrypted counter 3249 // and the counter. 3250 3251 Label inner_loop; 3252 __ bind(inner_loop); 3253 // Counter to encrypt is in v0 3254 __ aesecb_encrypt(noreg, noreg, keylen); 3255 __ st1(v0, __ T16B, saved_encrypted_ctr); 3256 3257 // Do we have a remaining full block? 3258 3259 __ mov(used, 0); 3260 __ cmp(len, block_size); 3261 __ br(__ LO, NEXT); 3262 3263 // Yes, we have a full block 3264 __ ldrq(v1, Address(in, offset)); 3265 __ eor(v1, __ T16B, v1, v0); 3266 __ strq(v1, Address(out, offset)); 3267 __ mov(used, block_size); 3268 __ add(offset, offset, block_size); 3269 3270 __ subw(len, len, block_size); 3271 __ cbzw(len, DONE); 3272 3273 // Increment the counter, store it back 3274 __ orr(v0, __ T16B, v16, v16); 3275 __ rev64(v16, __ T16B, v16); 3276 be_add_128_64(v16, v16, v4, /*tmp*/v5); 3277 __ rev64(v16, __ T16B, v16); 3278 __ st1(v16, __ T16B, counter); // Save the incremented counter back 3279 3280 __ b(inner_loop); 3281 } 3282 3283 __ BIND(NEXT); 3284 3285 // Encrypt a single byte, and loop. 3286 // We expect this to be a rare event. 
3287 __ ldrb(rscratch1, Address(in, offset)); 3288 __ ldrb(rscratch2, Address(saved_encrypted_ctr, used)); 3289 __ eor(rscratch1, rscratch1, rscratch2); 3290 __ strb(rscratch1, Address(out, offset)); 3291 __ add(offset, offset, 1); 3292 __ add(used, used, 1); 3293 __ subw(len, len,1); 3294 __ cbnzw(len, L_CTR_loop); 3295 } 3296 3297 __ bind(DONE); 3298 __ strw(used, Address(used_ptr)); 3299 __ mov(r0, saved_len); 3300 3301 __ leave(); // required for proper stackwalking of RuntimeStub frame 3302 __ ret(lr); 3303 3304 // Bulk encryption 3305 3306 __ BIND (CTR_large_block); 3307 assert(bulk_width == 4 || bulk_width == 8, "must be"); 3308 3309 if (bulk_width == 8) { 3310 __ sub(sp, sp, 4 * 16); 3311 __ st1(v12, v13, v14, v15, __ T16B, Address(sp)); 3312 } 3313 __ sub(sp, sp, 4 * 16); 3314 __ st1(v8, v9, v10, v11, __ T16B, Address(sp)); 3315 RegSet saved_regs = (RegSet::of(in, out, offset) 3316 + RegSet::of(saved_encrypted_ctr, used_ptr, len)); 3317 __ push(saved_regs, sp); 3318 __ andr(len, len, -16 * bulk_width); // 8/4 encryptions, 16 bytes per encryption 3319 __ add(in, in, offset); 3320 __ add(out, out, offset); 3321 3322 // Keys should already be loaded into the correct registers 3323 3324 __ ld1(v0, __ T16B, counter); // v0 contains the first counter 3325 __ rev64(v16, __ T16B, v0); // v16 contains byte-reversed counter 3326 3327 // AES/CTR loop 3328 { 3329 Label L_CTR_loop; 3330 __ BIND(L_CTR_loop); 3331 3332 // Setup the counters 3333 __ movi(v8, __ T4S, 0); 3334 __ movi(v9, __ T4S, 1); 3335 __ ins(v8, __ S, v9, 2, 2); // v8 contains { 0, 1 } 3336 3337 for (int i = 0; i < bulk_width; i++) { 3338 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i); 3339 __ rev64(v0_ofs, __ T16B, v16); 3340 be_add_128_64(v16, v16, v8, /*tmp*/v9); 3341 } 3342 3343 __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16)); 3344 3345 // Encrypt the counters 3346 __ aesecb_encrypt(noreg, noreg, keylen, v0, bulk_width); 3347 3348 if (bulk_width == 8) { 3349 __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16)); 3350 } 3351 3352 // XOR the encrypted counters with the inputs 3353 for (int i = 0; i < bulk_width; i++) { 3354 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i); 3355 FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i); 3356 __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs); 3357 } 3358 3359 // Write the encrypted data 3360 __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16)); 3361 if (bulk_width == 8) { 3362 __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16)); 3363 } 3364 3365 __ subw(len, len, 16 * bulk_width); 3366 __ cbnzw(len, L_CTR_loop); 3367 } 3368 3369 // Save the counter back where it goes 3370 __ rev64(v16, __ T16B, v16); 3371 __ st1(v16, __ T16B, counter); 3372 3373 __ pop(saved_regs, sp); 3374 3375 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16)); 3376 if (bulk_width == 8) { 3377 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16)); 3378 } 3379 3380 __ andr(rscratch1, len, -16 * bulk_width); 3381 __ sub(len, len, rscratch1); 3382 __ add(offset, offset, rscratch1); 3383 __ mov(used, 16); 3384 __ strw(used, Address(used_ptr)); 3385 __ b(large_block_return); 3386 3387 return start; 3388 } 3389 3390 // Vector AES Galois Counter Mode implementation. 
Parameters: 3391 // 3392 // in = c_rarg0 3393 // len = c_rarg1 3394 // ct = c_rarg2 - ciphertext that ghash will read (in for encrypt, out for decrypt) 3395 // out = c_rarg3 3396 // key = c_rarg4 3397 // state = c_rarg5 - GHASH.state 3398 // subkeyHtbl = c_rarg6 - powers of H 3399 // counter = c_rarg7 - 16 bytes of CTR 3400 // return - number of processed bytes 3401 address generate_galoisCounterMode_AESCrypt() { 3402 address ghash_polynomial = __ pc(); 3403 __ emit_int64(0x87); // The low-order bits of the field 3404 // polynomial (i.e. p = z^7+z^2+z+1) 3405 // repeated in the low and high parts of a 3406 // 128-bit vector 3407 __ emit_int64(0x87); 3408 3409 __ align(CodeEntryAlignment); 3410 StubGenStubId stub_id = StubGenStubId::galoisCounterMode_AESCrypt_id; 3411 StubCodeMark mark(this, stub_id); 3412 address start = __ pc(); 3413 __ enter(); 3414 3415 const Register in = c_rarg0; 3416 const Register len = c_rarg1; 3417 const Register ct = c_rarg2; 3418 const Register out = c_rarg3; 3419 // and updated with the incremented counter in the end 3420 3421 const Register key = c_rarg4; 3422 const Register state = c_rarg5; 3423 3424 const Register subkeyHtbl = c_rarg6; 3425 3426 const Register counter = c_rarg7; 3427 3428 const Register keylen = r10; 3429 // Save state before entering routine 3430 __ sub(sp, sp, 4 * 16); 3431 __ st1(v12, v13, v14, v15, __ T16B, Address(sp)); 3432 __ sub(sp, sp, 4 * 16); 3433 __ st1(v8, v9, v10, v11, __ T16B, Address(sp)); 3434 3435 // __ andr(len, len, -512); 3436 __ andr(len, len, -16 * 8); // 8 encryptions, 16 bytes per encryption 3437 __ str(len, __ pre(sp, -2 * wordSize)); 3438 3439 Label DONE; 3440 __ cbz(len, DONE); 3441 3442 // Compute #rounds for AES based on the length of the key array 3443 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 3444 3445 __ aesenc_loadkeys(key, keylen); 3446 __ ld1(v0, __ T16B, counter); // v0 contains the first counter 3447 __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter 3448 3449 // AES/CTR loop 3450 { 3451 Label L_CTR_loop; 3452 __ BIND(L_CTR_loop); 3453 3454 // Setup the counters 3455 __ movi(v8, __ T4S, 0); 3456 __ movi(v9, __ T4S, 1); 3457 __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 } 3458 3459 assert(v0->encoding() < v8->encoding(), ""); 3460 for (int i = v0->encoding(); i < v8->encoding(); i++) { 3461 FloatRegister f = as_FloatRegister(i); 3462 __ rev32(f, __ T16B, v16); 3463 __ addv(v16, __ T4S, v16, v8); 3464 } 3465 3466 __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16)); 3467 3468 // Encrypt the counters 3469 __ aesecb_encrypt(noreg, noreg, keylen, v0, /*unrolls*/8); 3470 3471 __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16)); 3472 3473 // XOR the encrypted counters with the inputs 3474 for (int i = 0; i < 8; i++) { 3475 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i); 3476 FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i); 3477 __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs); 3478 } 3479 __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16)); 3480 __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16)); 3481 3482 __ subw(len, len, 16 * 8); 3483 __ cbnzw(len, L_CTR_loop); 3484 } 3485 3486 __ rev32(v16, __ T16B, v16); 3487 __ st1(v16, __ T16B, counter); 3488 3489 __ ldr(len, Address(sp)); 3490 __ lsr(len, len, exact_log2(16)); // We want the count of blocks 3491 3492 // GHASH/CTR loop 3493 __ ghash_processBlocks_wide(ghash_polynomial, state, subkeyHtbl, ct, 3494 len, /*unrolls*/4); 3495 3496 
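// For reference, the GHASH update performed by ghash_processBlocks_wide above
// is, per 16-byte ciphertext block, the following (illustrative sketch only;
// gf128_mul and load_be128 are assumed helper names, H is the hash subkey):
//
//   for (int i = 0; i < blocks; i++) {
//     state = gf128_mul(state ^ load_be128(ct + 16 * i), H);
//   }
//
// i.e. a carry-less multiply-accumulate in GF(2^128) reduced by the
// polynomial emitted at ghash_polynomial.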
#ifdef ASSERT 3497 { Label L; 3498 __ cmp(len, (unsigned char)0); 3499 __ br(Assembler::EQ, L); 3500 __ stop("stubGenerator: abort"); 3501 __ bind(L); 3502 } 3503 #endif 3504 3505 __ bind(DONE); 3506 // Return the number of bytes processed 3507 __ ldr(r0, __ post(sp, 2 * wordSize)); 3508 3509 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16)); 3510 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16)); 3511 3512 __ leave(); // required for proper stackwalking of RuntimeStub frame 3513 __ ret(lr); 3514 return start; 3515 } 3516 3517 class Cached64Bytes { 3518 private: 3519 MacroAssembler *_masm; 3520 Register _regs[8]; 3521 3522 public: 3523 Cached64Bytes(MacroAssembler *masm, RegSet rs): _masm(masm) { 3524 assert(rs.size() == 8, "%u registers are used to cache 16 4-byte data", rs.size()); 3525 auto it = rs.begin(); 3526 for (auto &r: _regs) { 3527 r = *it; 3528 ++it; 3529 } 3530 } 3531 3532 void gen_loads(Register base) { 3533 for (int i = 0; i < 8; i += 2) { 3534 __ ldp(_regs[i], _regs[i + 1], Address(base, 8 * i)); 3535 } 3536 } 3537 3538 // Generate code extracting i-th unsigned word (4 bytes) from cached 64 bytes. 3539 void extract_u32(Register dest, int i) { 3540 __ ubfx(dest, _regs[i / 2], 32 * (i % 2), 32); 3541 } 3542 }; 3543 3544 // Utility routines for md5. 3545 // Clobbers r10 and r11. 3546 void md5_FF(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4, 3547 int k, int s, int t) { 3548 Register rscratch3 = r10; 3549 Register rscratch4 = r11; 3550 3551 __ eorw(rscratch3, r3, r4); 3552 __ movw(rscratch2, t); 3553 __ andw(rscratch3, rscratch3, r2); 3554 __ addw(rscratch4, r1, rscratch2); 3555 reg_cache.extract_u32(rscratch1, k); 3556 __ eorw(rscratch3, rscratch3, r4); 3557 __ addw(rscratch4, rscratch4, rscratch1); 3558 __ addw(rscratch3, rscratch3, rscratch4); 3559 __ rorw(rscratch2, rscratch3, 32 - s); 3560 __ addw(r1, rscratch2, r2); 3561 } 3562 3563 void md5_GG(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4, 3564 int k, int s, int t) { 3565 Register rscratch3 = r10; 3566 Register rscratch4 = r11; 3567 3568 reg_cache.extract_u32(rscratch1, k); 3569 __ movw(rscratch2, t); 3570 __ addw(rscratch4, r1, rscratch2); 3571 __ addw(rscratch4, rscratch4, rscratch1); 3572 __ bicw(rscratch2, r3, r4); 3573 __ andw(rscratch3, r2, r4); 3574 __ addw(rscratch2, rscratch2, rscratch4); 3575 __ addw(rscratch2, rscratch2, rscratch3); 3576 __ rorw(rscratch2, rscratch2, 32 - s); 3577 __ addw(r1, rscratch2, r2); 3578 } 3579 3580 void md5_HH(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4, 3581 int k, int s, int t) { 3582 Register rscratch3 = r10; 3583 Register rscratch4 = r11; 3584 3585 __ eorw(rscratch3, r3, r4); 3586 __ movw(rscratch2, t); 3587 __ addw(rscratch4, r1, rscratch2); 3588 reg_cache.extract_u32(rscratch1, k); 3589 __ eorw(rscratch3, rscratch3, r2); 3590 __ addw(rscratch4, rscratch4, rscratch1); 3591 __ addw(rscratch3, rscratch3, rscratch4); 3592 __ rorw(rscratch2, rscratch3, 32 - s); 3593 __ addw(r1, rscratch2, r2); 3594 } 3595 3596 void md5_II(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4, 3597 int k, int s, int t) { 3598 Register rscratch3 = r10; 3599 Register rscratch4 = r11; 3600 3601 __ movw(rscratch3, t); 3602 __ ornw(rscratch2, r2, r4); 3603 __ addw(rscratch4, r1, rscratch3); 3604 reg_cache.extract_u32(rscratch1, k); 3605 __ eorw(rscratch3, rscratch2, r3); 3606 __ addw(rscratch4, rscratch4, rscratch1); 3607 __ addw(rscratch3, rscratch3, rscratch4); 3608 __ 
rorw(rscratch2, rscratch3, 32 - s); 3609 __ addw(r1, rscratch2, r2); 3610 } 3611 3612 // Arguments: 3613 // 3614 // Inputs: 3615 // c_rarg0 - byte[] source+offset 3616 // c_rarg1 - int[] SHA.state 3617 // c_rarg2 - int offset 3618 // c_rarg3 - int limit 3619 // 3620 address generate_md5_implCompress(StubGenStubId stub_id) { 3621 bool multi_block; 3622 switch (stub_id) { 3623 case md5_implCompress_id: 3624 multi_block = false; 3625 break; 3626 case md5_implCompressMB_id: 3627 multi_block = true; 3628 break; 3629 default: 3630 ShouldNotReachHere(); 3631 } 3632 __ align(CodeEntryAlignment); 3633 3634 StubCodeMark mark(this, stub_id); 3635 address start = __ pc(); 3636 3637 Register buf = c_rarg0; 3638 Register state = c_rarg1; 3639 Register ofs = c_rarg2; 3640 Register limit = c_rarg3; 3641 Register a = r4; 3642 Register b = r5; 3643 Register c = r6; 3644 Register d = r7; 3645 Register rscratch3 = r10; 3646 Register rscratch4 = r11; 3647 3648 Register state_regs[2] = { r12, r13 }; 3649 RegSet saved_regs = RegSet::range(r16, r22) - r18_tls; 3650 Cached64Bytes reg_cache(_masm, RegSet::of(r14, r15) + saved_regs); // using 8 registers 3651 3652 __ push(saved_regs, sp); 3653 3654 __ ldp(state_regs[0], state_regs[1], Address(state)); 3655 __ ubfx(a, state_regs[0], 0, 32); 3656 __ ubfx(b, state_regs[0], 32, 32); 3657 __ ubfx(c, state_regs[1], 0, 32); 3658 __ ubfx(d, state_regs[1], 32, 32); 3659 3660 Label md5_loop; 3661 __ BIND(md5_loop); 3662 3663 reg_cache.gen_loads(buf); 3664 3665 // Round 1 3666 md5_FF(reg_cache, a, b, c, d, 0, 7, 0xd76aa478); 3667 md5_FF(reg_cache, d, a, b, c, 1, 12, 0xe8c7b756); 3668 md5_FF(reg_cache, c, d, a, b, 2, 17, 0x242070db); 3669 md5_FF(reg_cache, b, c, d, a, 3, 22, 0xc1bdceee); 3670 md5_FF(reg_cache, a, b, c, d, 4, 7, 0xf57c0faf); 3671 md5_FF(reg_cache, d, a, b, c, 5, 12, 0x4787c62a); 3672 md5_FF(reg_cache, c, d, a, b, 6, 17, 0xa8304613); 3673 md5_FF(reg_cache, b, c, d, a, 7, 22, 0xfd469501); 3674 md5_FF(reg_cache, a, b, c, d, 8, 7, 0x698098d8); 3675 md5_FF(reg_cache, d, a, b, c, 9, 12, 0x8b44f7af); 3676 md5_FF(reg_cache, c, d, a, b, 10, 17, 0xffff5bb1); 3677 md5_FF(reg_cache, b, c, d, a, 11, 22, 0x895cd7be); 3678 md5_FF(reg_cache, a, b, c, d, 12, 7, 0x6b901122); 3679 md5_FF(reg_cache, d, a, b, c, 13, 12, 0xfd987193); 3680 md5_FF(reg_cache, c, d, a, b, 14, 17, 0xa679438e); 3681 md5_FF(reg_cache, b, c, d, a, 15, 22, 0x49b40821); 3682 3683 // Round 2 3684 md5_GG(reg_cache, a, b, c, d, 1, 5, 0xf61e2562); 3685 md5_GG(reg_cache, d, a, b, c, 6, 9, 0xc040b340); 3686 md5_GG(reg_cache, c, d, a, b, 11, 14, 0x265e5a51); 3687 md5_GG(reg_cache, b, c, d, a, 0, 20, 0xe9b6c7aa); 3688 md5_GG(reg_cache, a, b, c, d, 5, 5, 0xd62f105d); 3689 md5_GG(reg_cache, d, a, b, c, 10, 9, 0x02441453); 3690 md5_GG(reg_cache, c, d, a, b, 15, 14, 0xd8a1e681); 3691 md5_GG(reg_cache, b, c, d, a, 4, 20, 0xe7d3fbc8); 3692 md5_GG(reg_cache, a, b, c, d, 9, 5, 0x21e1cde6); 3693 md5_GG(reg_cache, d, a, b, c, 14, 9, 0xc33707d6); 3694 md5_GG(reg_cache, c, d, a, b, 3, 14, 0xf4d50d87); 3695 md5_GG(reg_cache, b, c, d, a, 8, 20, 0x455a14ed); 3696 md5_GG(reg_cache, a, b, c, d, 13, 5, 0xa9e3e905); 3697 md5_GG(reg_cache, d, a, b, c, 2, 9, 0xfcefa3f8); 3698 md5_GG(reg_cache, c, d, a, b, 7, 14, 0x676f02d9); 3699 md5_GG(reg_cache, b, c, d, a, 12, 20, 0x8d2a4c8a); 3700 3701 // Round 3 3702 md5_HH(reg_cache, a, b, c, d, 5, 4, 0xfffa3942); 3703 md5_HH(reg_cache, d, a, b, c, 8, 11, 0x8771f681); 3704 md5_HH(reg_cache, c, d, a, b, 11, 16, 0x6d9d6122); 3705 md5_HH(reg_cache, b, c, d, a, 14, 23, 0xfde5380c); 3706 
md5_HH(reg_cache, a, b, c, d, 1, 4, 0xa4beea44); 3707 md5_HH(reg_cache, d, a, b, c, 4, 11, 0x4bdecfa9); 3708 md5_HH(reg_cache, c, d, a, b, 7, 16, 0xf6bb4b60); 3709 md5_HH(reg_cache, b, c, d, a, 10, 23, 0xbebfbc70); 3710 md5_HH(reg_cache, a, b, c, d, 13, 4, 0x289b7ec6); 3711 md5_HH(reg_cache, d, a, b, c, 0, 11, 0xeaa127fa); 3712 md5_HH(reg_cache, c, d, a, b, 3, 16, 0xd4ef3085); 3713 md5_HH(reg_cache, b, c, d, a, 6, 23, 0x04881d05); 3714 md5_HH(reg_cache, a, b, c, d, 9, 4, 0xd9d4d039); 3715 md5_HH(reg_cache, d, a, b, c, 12, 11, 0xe6db99e5); 3716 md5_HH(reg_cache, c, d, a, b, 15, 16, 0x1fa27cf8); 3717 md5_HH(reg_cache, b, c, d, a, 2, 23, 0xc4ac5665); 3718 3719 // Round 4 3720 md5_II(reg_cache, a, b, c, d, 0, 6, 0xf4292244); 3721 md5_II(reg_cache, d, a, b, c, 7, 10, 0x432aff97); 3722 md5_II(reg_cache, c, d, a, b, 14, 15, 0xab9423a7); 3723 md5_II(reg_cache, b, c, d, a, 5, 21, 0xfc93a039); 3724 md5_II(reg_cache, a, b, c, d, 12, 6, 0x655b59c3); 3725 md5_II(reg_cache, d, a, b, c, 3, 10, 0x8f0ccc92); 3726 md5_II(reg_cache, c, d, a, b, 10, 15, 0xffeff47d); 3727 md5_II(reg_cache, b, c, d, a, 1, 21, 0x85845dd1); 3728 md5_II(reg_cache, a, b, c, d, 8, 6, 0x6fa87e4f); 3729 md5_II(reg_cache, d, a, b, c, 15, 10, 0xfe2ce6e0); 3730 md5_II(reg_cache, c, d, a, b, 6, 15, 0xa3014314); 3731 md5_II(reg_cache, b, c, d, a, 13, 21, 0x4e0811a1); 3732 md5_II(reg_cache, a, b, c, d, 4, 6, 0xf7537e82); 3733 md5_II(reg_cache, d, a, b, c, 11, 10, 0xbd3af235); 3734 md5_II(reg_cache, c, d, a, b, 2, 15, 0x2ad7d2bb); 3735 md5_II(reg_cache, b, c, d, a, 9, 21, 0xeb86d391); 3736 3737 __ addw(a, state_regs[0], a); 3738 __ ubfx(rscratch2, state_regs[0], 32, 32); 3739 __ addw(b, rscratch2, b); 3740 __ addw(c, state_regs[1], c); 3741 __ ubfx(rscratch4, state_regs[1], 32, 32); 3742 __ addw(d, rscratch4, d); 3743 3744 __ orr(state_regs[0], a, b, Assembler::LSL, 32); 3745 __ orr(state_regs[1], c, d, Assembler::LSL, 32); 3746 3747 if (multi_block) { 3748 __ add(buf, buf, 64); 3749 __ add(ofs, ofs, 64); 3750 __ cmp(ofs, limit); 3751 __ br(Assembler::LE, md5_loop); 3752 __ mov(c_rarg0, ofs); // return ofs 3753 } 3754 3755 // write hash values back in the correct order 3756 __ stp(state_regs[0], state_regs[1], Address(state)); 3757 3758 __ pop(saved_regs, sp); 3759 3760 __ ret(lr); 3761 3762 return start; 3763 } 3764 3765 // Arguments: 3766 // 3767 // Inputs: 3768 // c_rarg0 - byte[] source+offset 3769 // c_rarg1 - int[] SHA.state 3770 // c_rarg2 - int offset 3771 // c_rarg3 - int limit 3772 // 3773 address generate_sha1_implCompress(StubGenStubId stub_id) { 3774 bool multi_block; 3775 switch (stub_id) { 3776 case sha1_implCompress_id: 3777 multi_block = false; 3778 break; 3779 case sha1_implCompressMB_id: 3780 multi_block = true; 3781 break; 3782 default: 3783 ShouldNotReachHere(); 3784 } 3785 3786 __ align(CodeEntryAlignment); 3787 3788 StubCodeMark mark(this, stub_id); 3789 address start = __ pc(); 3790 3791 Register buf = c_rarg0; 3792 Register state = c_rarg1; 3793 Register ofs = c_rarg2; 3794 Register limit = c_rarg3; 3795 3796 Label keys; 3797 Label sha1_loop; 3798 3799 // load the keys into v0..v3 3800 __ adr(rscratch1, keys); 3801 __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1)); 3802 // load 5 words state into v6, v7 3803 __ ldrq(v6, Address(state, 0)); 3804 __ ldrs(v7, Address(state, 16)); 3805 3806 3807 __ BIND(sha1_loop); 3808 // load 64 bytes of data into v16..v19 3809 __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? 
__ post(buf, 64) : buf); 3810 __ rev32(v16, __ T16B, v16); 3811 __ rev32(v17, __ T16B, v17); 3812 __ rev32(v18, __ T16B, v18); 3813 __ rev32(v19, __ T16B, v19); 3814 3815 // do the sha1 3816 __ addv(v4, __ T4S, v16, v0); 3817 __ orr(v20, __ T16B, v6, v6); 3818 3819 FloatRegister d0 = v16; 3820 FloatRegister d1 = v17; 3821 FloatRegister d2 = v18; 3822 FloatRegister d3 = v19; 3823 3824 for (int round = 0; round < 20; round++) { 3825 FloatRegister tmp1 = (round & 1) ? v4 : v5; 3826 FloatRegister tmp2 = (round & 1) ? v21 : v22; 3827 FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7; 3828 FloatRegister tmp4 = (round & 1) ? v5 : v4; 3829 FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3)); 3830 3831 if (round < 16) __ sha1su0(d0, __ T4S, d1, d2); 3832 if (round < 19) __ addv(tmp1, __ T4S, d1, key); 3833 __ sha1h(tmp2, __ T4S, v20); 3834 if (round < 5) 3835 __ sha1c(v20, __ T4S, tmp3, tmp4); 3836 else if (round < 10 || round >= 15) 3837 __ sha1p(v20, __ T4S, tmp3, tmp4); 3838 else 3839 __ sha1m(v20, __ T4S, tmp3, tmp4); 3840 if (round < 16) __ sha1su1(d0, __ T4S, d3); 3841 3842 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 3843 } 3844 3845 __ addv(v7, __ T2S, v7, v21); 3846 __ addv(v6, __ T4S, v6, v20); 3847 3848 if (multi_block) { 3849 __ add(ofs, ofs, 64); 3850 __ cmp(ofs, limit); 3851 __ br(Assembler::LE, sha1_loop); 3852 __ mov(c_rarg0, ofs); // return ofs 3853 } 3854 3855 __ strq(v6, Address(state, 0)); 3856 __ strs(v7, Address(state, 16)); 3857 3858 __ ret(lr); 3859 3860 __ bind(keys); 3861 __ emit_int32(0x5a827999); 3862 __ emit_int32(0x6ed9eba1); 3863 __ emit_int32(0x8f1bbcdc); 3864 __ emit_int32(0xca62c1d6); 3865 3866 return start; 3867 } 3868 3869 3870 // Arguments: 3871 // 3872 // Inputs: 3873 // c_rarg0 - byte[] source+offset 3874 // c_rarg1 - int[] SHA.state 3875 // c_rarg2 - int offset 3876 // c_rarg3 - int limit 3877 // 3878 address generate_sha256_implCompress(StubGenStubId stub_id) { 3879 bool multi_block; 3880 switch (stub_id) { 3881 case sha256_implCompress_id: 3882 multi_block = false; 3883 break; 3884 case sha256_implCompressMB_id: 3885 multi_block = true; 3886 break; 3887 default: 3888 ShouldNotReachHere(); 3889 } 3890 3891 static const uint32_t round_consts[64] = { 3892 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 3893 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, 3894 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 3895 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 3896 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 3897 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, 3898 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 3899 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, 3900 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 3901 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 3902 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 3903 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, 3904 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 3905 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, 3906 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 3907 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, 3908 }; 3909 3910 __ align(CodeEntryAlignment); 3911 3912 StubCodeMark mark(this, stub_id); 3913 address start = __ pc(); 3914 3915 Register buf = c_rarg0; 3916 Register state = c_rarg1; 3917 Register ofs = c_rarg2; 3918 Register limit = c_rarg3; 3919 3920 Label sha1_loop; 3921 3922 __ stpd(v8, v9, __ pre(sp, -32)); 3923 __ stpd(v10, v11, Address(sp, 16)); 3924 3925 // dga == v0 3926 // dgb == v1 3927 // dg0 == v2 3928 // dg1 == v3 3929 
// dg2 == v4 3930 // t0 == v6 3931 // t1 == v7 3932 3933 // load 16 keys to v16..v31 3934 __ lea(rscratch1, ExternalAddress((address)round_consts)); 3935 __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64)); 3936 __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64)); 3937 __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64)); 3938 __ ld1(v28, v29, v30, v31, __ T4S, rscratch1); 3939 3940 // load 8 words (256 bits) state 3941 __ ldpq(v0, v1, state); 3942 3943 __ BIND(sha1_loop); 3944 // load 64 bytes of data into v8..v11 3945 __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf); 3946 __ rev32(v8, __ T16B, v8); 3947 __ rev32(v9, __ T16B, v9); 3948 __ rev32(v10, __ T16B, v10); 3949 __ rev32(v11, __ T16B, v11); 3950 3951 __ addv(v6, __ T4S, v8, v16); 3952 __ orr(v2, __ T16B, v0, v0); 3953 __ orr(v3, __ T16B, v1, v1); 3954 3955 FloatRegister d0 = v8; 3956 FloatRegister d1 = v9; 3957 FloatRegister d2 = v10; 3958 FloatRegister d3 = v11; 3959 3960 3961 for (int round = 0; round < 16; round++) { 3962 FloatRegister tmp1 = (round & 1) ? v6 : v7; 3963 FloatRegister tmp2 = (round & 1) ? v7 : v6; 3964 FloatRegister tmp3 = (round & 1) ? v2 : v4; 3965 FloatRegister tmp4 = (round & 1) ? v4 : v2; 3966 3967 if (round < 12) __ sha256su0(d0, __ T4S, d1); 3968 __ orr(v4, __ T16B, v2, v2); 3969 if (round < 15) 3970 __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17)); 3971 __ sha256h(v2, __ T4S, v3, tmp2); 3972 __ sha256h2(v3, __ T4S, v4, tmp2); 3973 if (round < 12) __ sha256su1(d0, __ T4S, d2, d3); 3974 3975 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 3976 } 3977 3978 __ addv(v0, __ T4S, v0, v2); 3979 __ addv(v1, __ T4S, v1, v3); 3980 3981 if (multi_block) { 3982 __ add(ofs, ofs, 64); 3983 __ cmp(ofs, limit); 3984 __ br(Assembler::LE, sha1_loop); 3985 __ mov(c_rarg0, ofs); // return ofs 3986 } 3987 3988 __ ldpd(v10, v11, Address(sp, 16)); 3989 __ ldpd(v8, v9, __ post(sp, 32)); 3990 3991 __ stpq(v0, v1, state); 3992 3993 __ ret(lr); 3994 3995 return start; 3996 } 3997 3998 // Double rounds for sha512. 
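// Each sha512_dround() call below performs two of the 80 SHA-512 rounds using the sha512h/sha512h2 instructions. dr is the double-round index (0..39); vi0..vi4 rotate the intermediate hash state, vrc0 holds the current round-constant pair while vrc1 receives the next one (no further constants are loaded once dr >= 36), and vin0..vin4 are the message schedule registers, which stop being updated via sha512su0/sha512su1 once dr >= 32.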
3999 void sha512_dround(int dr, 4000 FloatRegister vi0, FloatRegister vi1, 4001 FloatRegister vi2, FloatRegister vi3, 4002 FloatRegister vi4, FloatRegister vrc0, 4003 FloatRegister vrc1, FloatRegister vin0, 4004 FloatRegister vin1, FloatRegister vin2, 4005 FloatRegister vin3, FloatRegister vin4) { 4006 if (dr < 36) { 4007 __ ld1(vrc1, __ T2D, __ post(rscratch2, 16)); 4008 } 4009 __ addv(v5, __ T2D, vrc0, vin0); 4010 __ ext(v6, __ T16B, vi2, vi3, 8); 4011 __ ext(v5, __ T16B, v5, v5, 8); 4012 __ ext(v7, __ T16B, vi1, vi2, 8); 4013 __ addv(vi3, __ T2D, vi3, v5); 4014 if (dr < 32) { 4015 __ ext(v5, __ T16B, vin3, vin4, 8); 4016 __ sha512su0(vin0, __ T2D, vin1); 4017 } 4018 __ sha512h(vi3, __ T2D, v6, v7); 4019 if (dr < 32) { 4020 __ sha512su1(vin0, __ T2D, vin2, v5); 4021 } 4022 __ addv(vi4, __ T2D, vi1, vi3); 4023 __ sha512h2(vi3, __ T2D, vi1, vi0); 4024 } 4025 4026 // Arguments: 4027 // 4028 // Inputs: 4029 // c_rarg0 - byte[] source+offset 4030 // c_rarg1 - int[] SHA.state 4031 // c_rarg2 - int offset 4032 // c_rarg3 - int limit 4033 // 4034 address generate_sha512_implCompress(StubGenStubId stub_id) { 4035 bool multi_block; 4036 switch (stub_id) { 4037 case sha512_implCompress_id: 4038 multi_block = false; 4039 break; 4040 case sha512_implCompressMB_id: 4041 multi_block = true; 4042 break; 4043 default: 4044 ShouldNotReachHere(); 4045 } 4046 4047 static const uint64_t round_consts[80] = { 4048 0x428A2F98D728AE22L, 0x7137449123EF65CDL, 0xB5C0FBCFEC4D3B2FL, 4049 0xE9B5DBA58189DBBCL, 0x3956C25BF348B538L, 0x59F111F1B605D019L, 4050 0x923F82A4AF194F9BL, 0xAB1C5ED5DA6D8118L, 0xD807AA98A3030242L, 4051 0x12835B0145706FBEL, 0x243185BE4EE4B28CL, 0x550C7DC3D5FFB4E2L, 4052 0x72BE5D74F27B896FL, 0x80DEB1FE3B1696B1L, 0x9BDC06A725C71235L, 4053 0xC19BF174CF692694L, 0xE49B69C19EF14AD2L, 0xEFBE4786384F25E3L, 4054 0x0FC19DC68B8CD5B5L, 0x240CA1CC77AC9C65L, 0x2DE92C6F592B0275L, 4055 0x4A7484AA6EA6E483L, 0x5CB0A9DCBD41FBD4L, 0x76F988DA831153B5L, 4056 0x983E5152EE66DFABL, 0xA831C66D2DB43210L, 0xB00327C898FB213FL, 4057 0xBF597FC7BEEF0EE4L, 0xC6E00BF33DA88FC2L, 0xD5A79147930AA725L, 4058 0x06CA6351E003826FL, 0x142929670A0E6E70L, 0x27B70A8546D22FFCL, 4059 0x2E1B21385C26C926L, 0x4D2C6DFC5AC42AEDL, 0x53380D139D95B3DFL, 4060 0x650A73548BAF63DEL, 0x766A0ABB3C77B2A8L, 0x81C2C92E47EDAEE6L, 4061 0x92722C851482353BL, 0xA2BFE8A14CF10364L, 0xA81A664BBC423001L, 4062 0xC24B8B70D0F89791L, 0xC76C51A30654BE30L, 0xD192E819D6EF5218L, 4063 0xD69906245565A910L, 0xF40E35855771202AL, 0x106AA07032BBD1B8L, 4064 0x19A4C116B8D2D0C8L, 0x1E376C085141AB53L, 0x2748774CDF8EEB99L, 4065 0x34B0BCB5E19B48A8L, 0x391C0CB3C5C95A63L, 0x4ED8AA4AE3418ACBL, 4066 0x5B9CCA4F7763E373L, 0x682E6FF3D6B2B8A3L, 0x748F82EE5DEFB2FCL, 4067 0x78A5636F43172F60L, 0x84C87814A1F0AB72L, 0x8CC702081A6439ECL, 4068 0x90BEFFFA23631E28L, 0xA4506CEBDE82BDE9L, 0xBEF9A3F7B2C67915L, 4069 0xC67178F2E372532BL, 0xCA273ECEEA26619CL, 0xD186B8C721C0C207L, 4070 0xEADA7DD6CDE0EB1EL, 0xF57D4F7FEE6ED178L, 0x06F067AA72176FBAL, 4071 0x0A637DC5A2C898A6L, 0x113F9804BEF90DAEL, 0x1B710B35131C471BL, 4072 0x28DB77F523047D84L, 0x32CAAB7B40C72493L, 0x3C9EBE0A15C9BEBCL, 4073 0x431D67C49C100D4CL, 0x4CC5D4BECB3E42B6L, 0x597F299CFC657E2AL, 4074 0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L 4075 }; 4076 4077 __ align(CodeEntryAlignment); 4078 4079 StubCodeMark mark(this, stub_id); 4080 address start = __ pc(); 4081 4082 Register buf = c_rarg0; 4083 Register state = c_rarg1; 4084 Register ofs = c_rarg2; 4085 Register limit = c_rarg3; 4086 4087 __ stpd(v8, v9, __ pre(sp, -64)); 4088 __ stpd(v10, v11, Address(sp, 
16)); 4089 __ stpd(v12, v13, Address(sp, 32)); 4090 __ stpd(v14, v15, Address(sp, 48)); 4091 4092 Label sha512_loop; 4093 4094 // load state 4095 __ ld1(v8, v9, v10, v11, __ T2D, state); 4096 4097 // load first 4 round constants 4098 __ lea(rscratch1, ExternalAddress((address)round_consts)); 4099 __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64)); 4100 4101 __ BIND(sha512_loop); 4102 // load 128B of data into v12..v19 4103 __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64)); 4104 __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64)); 4105 __ rev64(v12, __ T16B, v12); 4106 __ rev64(v13, __ T16B, v13); 4107 __ rev64(v14, __ T16B, v14); 4108 __ rev64(v15, __ T16B, v15); 4109 __ rev64(v16, __ T16B, v16); 4110 __ rev64(v17, __ T16B, v17); 4111 __ rev64(v18, __ T16B, v18); 4112 __ rev64(v19, __ T16B, v19); 4113 4114 __ mov(rscratch2, rscratch1); 4115 4116 __ mov(v0, __ T16B, v8); 4117 __ mov(v1, __ T16B, v9); 4118 __ mov(v2, __ T16B, v10); 4119 __ mov(v3, __ T16B, v11); 4120 4121 sha512_dround( 0, v0, v1, v2, v3, v4, v20, v24, v12, v13, v19, v16, v17); 4122 sha512_dround( 1, v3, v0, v4, v2, v1, v21, v25, v13, v14, v12, v17, v18); 4123 sha512_dround( 2, v2, v3, v1, v4, v0, v22, v26, v14, v15, v13, v18, v19); 4124 sha512_dround( 3, v4, v2, v0, v1, v3, v23, v27, v15, v16, v14, v19, v12); 4125 sha512_dround( 4, v1, v4, v3, v0, v2, v24, v28, v16, v17, v15, v12, v13); 4126 sha512_dround( 5, v0, v1, v2, v3, v4, v25, v29, v17, v18, v16, v13, v14); 4127 sha512_dround( 6, v3, v0, v4, v2, v1, v26, v30, v18, v19, v17, v14, v15); 4128 sha512_dround( 7, v2, v3, v1, v4, v0, v27, v31, v19, v12, v18, v15, v16); 4129 sha512_dround( 8, v4, v2, v0, v1, v3, v28, v24, v12, v13, v19, v16, v17); 4130 sha512_dround( 9, v1, v4, v3, v0, v2, v29, v25, v13, v14, v12, v17, v18); 4131 sha512_dround(10, v0, v1, v2, v3, v4, v30, v26, v14, v15, v13, v18, v19); 4132 sha512_dround(11, v3, v0, v4, v2, v1, v31, v27, v15, v16, v14, v19, v12); 4133 sha512_dround(12, v2, v3, v1, v4, v0, v24, v28, v16, v17, v15, v12, v13); 4134 sha512_dround(13, v4, v2, v0, v1, v3, v25, v29, v17, v18, v16, v13, v14); 4135 sha512_dround(14, v1, v4, v3, v0, v2, v26, v30, v18, v19, v17, v14, v15); 4136 sha512_dround(15, v0, v1, v2, v3, v4, v27, v31, v19, v12, v18, v15, v16); 4137 sha512_dround(16, v3, v0, v4, v2, v1, v28, v24, v12, v13, v19, v16, v17); 4138 sha512_dround(17, v2, v3, v1, v4, v0, v29, v25, v13, v14, v12, v17, v18); 4139 sha512_dround(18, v4, v2, v0, v1, v3, v30, v26, v14, v15, v13, v18, v19); 4140 sha512_dround(19, v1, v4, v3, v0, v2, v31, v27, v15, v16, v14, v19, v12); 4141 sha512_dround(20, v0, v1, v2, v3, v4, v24, v28, v16, v17, v15, v12, v13); 4142 sha512_dround(21, v3, v0, v4, v2, v1, v25, v29, v17, v18, v16, v13, v14); 4143 sha512_dround(22, v2, v3, v1, v4, v0, v26, v30, v18, v19, v17, v14, v15); 4144 sha512_dround(23, v4, v2, v0, v1, v3, v27, v31, v19, v12, v18, v15, v16); 4145 sha512_dround(24, v1, v4, v3, v0, v2, v28, v24, v12, v13, v19, v16, v17); 4146 sha512_dround(25, v0, v1, v2, v3, v4, v29, v25, v13, v14, v12, v17, v18); 4147 sha512_dround(26, v3, v0, v4, v2, v1, v30, v26, v14, v15, v13, v18, v19); 4148 sha512_dround(27, v2, v3, v1, v4, v0, v31, v27, v15, v16, v14, v19, v12); 4149 sha512_dround(28, v4, v2, v0, v1, v3, v24, v28, v16, v17, v15, v12, v13); 4150 sha512_dround(29, v1, v4, v3, v0, v2, v25, v29, v17, v18, v16, v13, v14); 4151 sha512_dround(30, v0, v1, v2, v3, v4, v26, v30, v18, v19, v17, v14, v15); 4152 sha512_dround(31, v3, v0, v4, v2, v1, v27, v31, v19, v12, v18, v15, v16); 4153 sha512_dround(32, v2, v3, v1, 
v4, v0, v28, v24, v12, v0, v0, v0, v0); 4154 sha512_dround(33, v4, v2, v0, v1, v3, v29, v25, v13, v0, v0, v0, v0); 4155 sha512_dround(34, v1, v4, v3, v0, v2, v30, v26, v14, v0, v0, v0, v0); 4156 sha512_dround(35, v0, v1, v2, v3, v4, v31, v27, v15, v0, v0, v0, v0); 4157 sha512_dround(36, v3, v0, v4, v2, v1, v24, v0, v16, v0, v0, v0, v0); 4158 sha512_dround(37, v2, v3, v1, v4, v0, v25, v0, v17, v0, v0, v0, v0); 4159 sha512_dround(38, v4, v2, v0, v1, v3, v26, v0, v18, v0, v0, v0, v0); 4160 sha512_dround(39, v1, v4, v3, v0, v2, v27, v0, v19, v0, v0, v0, v0); 4161 4162 __ addv(v8, __ T2D, v8, v0); 4163 __ addv(v9, __ T2D, v9, v1); 4164 __ addv(v10, __ T2D, v10, v2); 4165 __ addv(v11, __ T2D, v11, v3); 4166 4167 if (multi_block) { 4168 __ add(ofs, ofs, 128); 4169 __ cmp(ofs, limit); 4170 __ br(Assembler::LE, sha512_loop); 4171 __ mov(c_rarg0, ofs); // return ofs 4172 } 4173 4174 __ st1(v8, v9, v10, v11, __ T2D, state); 4175 4176 __ ldpd(v14, v15, Address(sp, 48)); 4177 __ ldpd(v12, v13, Address(sp, 32)); 4178 __ ldpd(v10, v11, Address(sp, 16)); 4179 __ ldpd(v8, v9, __ post(sp, 64)); 4180 4181 __ ret(lr); 4182 4183 return start; 4184 } 4185 4186 // Execute one round of keccak of two computations in parallel. 4187 // One of the states should be loaded into the lower halves of 4188 // the vector registers v0-v24, the other should be loaded into 4189 // the upper halves of those registers. The ld1r instruction loads 4190 // the round constant into both halves of register v31. 4191 // Intermediate results c0...c4 and d0...d4 are computed 4192 // in registers v25...v30. 4193 // All vector instructions that are used operate on both register 4194 // halves in parallel. 4195 // If only a single computation is needed, one can only load the lower halves. 4196 void keccak_round(Register rscratch1) { 4197 __ eor3(v29, __ T16B, v4, v9, v14); // c4 = a4 ^ a9 ^ a14 4198 __ eor3(v26, __ T16B, v1, v6, v11); // c1 = a1 ^ a6 ^ a11 4199 __ eor3(v28, __ T16B, v3, v8, v13); // c3 = a3 ^ a8 ^ a13 4200 __ eor3(v25, __ T16B, v0, v5, v10); // c0 = a0 ^ a5 ^ a10 4201 __ eor3(v27, __ T16B, v2, v7, v12); // c2 = a2 ^ a7 ^ a12 4202 __ eor3(v29, __ T16B, v29, v19, v24); // c4 ^= a19 ^ a24 4203 __ eor3(v26, __ T16B, v26, v16, v21); // c1 ^= a16 ^ a21 4204 __ eor3(v28, __ T16B, v28, v18, v23); // c3 ^= a18 ^ a23 4205 __ eor3(v25, __ T16B, v25, v15, v20); // c0 ^= a15 ^ a20 4206 __ eor3(v27, __ T16B, v27, v17, v22); // c2 ^= a17 ^ a22 4207 4208 __ rax1(v30, __ T2D, v29, v26); // d0 = c4 ^ rol(c1, 1) 4209 __ rax1(v26, __ T2D, v26, v28); // d2 = c1 ^ rol(c3, 1) 4210 __ rax1(v28, __ T2D, v28, v25); // d4 = c3 ^ rol(c0, 1) 4211 __ rax1(v25, __ T2D, v25, v27); // d1 = c0 ^ rol(c2, 1) 4212 __ rax1(v27, __ T2D, v27, v29); // d3 = c2 ^ rol(c4, 1) 4213 4214 __ eor(v0, __ T16B, v0, v30); // a0 = a0 ^ d0 4215 __ xar(v29, __ T2D, v1, v25, (64 - 1)); // a10' = rol((a1^d1), 1) 4216 __ xar(v1, __ T2D, v6, v25, (64 - 44)); // a1 = rol((a6^d1), 44) 4217 __ xar(v6, __ T2D, v9, v28, (64 - 20)); // a6 = rol((a9^d4), 20) 4218 __ xar(v9, __ T2D, v22, v26, (64 - 61)); // a9 = rol((a22^d2), 61) 4219 __ xar(v22, __ T2D, v14, v28, (64 - 39)); // a22 = rol((a14^d4), 39) 4220 __ xar(v14, __ T2D, v20, v30, (64 - 18)); // a14 = rol((a20^d0), 18) 4221 __ xar(v31, __ T2D, v2, v26, (64 - 62)); // a20' = rol((a2^d2), 62) 4222 __ xar(v2, __ T2D, v12, v26, (64 - 43)); // a2 = rol((a12^d2), 43) 4223 __ xar(v12, __ T2D, v13, v27, (64 - 25)); // a12 = rol((a13^d3), 25) 4224 __ xar(v13, __ T2D, v19, v28, (64 - 8)); // a13 = rol((a19^d4), 8) 4225 __ xar(v19, __ T2D, v23,
v27, (64 - 56)); // a19 = rol((a23^d3), 56) 4226 __ xar(v23, __ T2D, v15, v30, (64 - 41)); // a23 = rol((a15^d0), 41) 4227 __ xar(v15, __ T2D, v4, v28, (64 - 27)); // a15 = rol((a4^d4), 27) 4228 __ xar(v28, __ T2D, v24, v28, (64 - 14)); // a4' = rol((a24^d4), 14) 4229 __ xar(v24, __ T2D, v21, v25, (64 - 2)); // a24 = rol((a21^d1), 2) 4230 __ xar(v8, __ T2D, v8, v27, (64 - 55)); // a21' = rol((a8^d3), 55) 4231 __ xar(v4, __ T2D, v16, v25, (64 - 45)); // a8' = rol((a16^d1), 45) 4232 __ xar(v16, __ T2D, v5, v30, (64 - 36)); // a16 = rol((a5^d0), 36) 4233 __ xar(v5, __ T2D, v3, v27, (64 - 28)); // a5 = rol((a3^d3), 28) 4234 __ xar(v27, __ T2D, v18, v27, (64 - 21)); // a3' = rol((a18^d3), 21) 4235 __ xar(v3, __ T2D, v17, v26, (64 - 15)); // a18' = rol((a17^d2), 15) 4236 __ xar(v25, __ T2D, v11, v25, (64 - 10)); // a17' = rol((a11^d1), 10) 4237 __ xar(v26, __ T2D, v7, v26, (64 - 6)); // a11' = rol((a7^d2), 6) 4238 __ xar(v30, __ T2D, v10, v30, (64 - 3)); // a7' = rol((a10^d0), 3) 4239 4240 __ bcax(v20, __ T16B, v31, v22, v8); // a20 = a20' ^ (~a21 & a22') 4241 __ bcax(v21, __ T16B, v8, v23, v22); // a21 = a21' ^ (~a22 & a23) 4242 __ bcax(v22, __ T16B, v22, v24, v23); // a22 = a22 ^ (~a23 & a24) 4243 __ bcax(v23, __ T16B, v23, v31, v24); // a23 = a23 ^ (~a24 & a20') 4244 __ bcax(v24, __ T16B, v24, v8, v31); // a24 = a24 ^ (~a20' & a21') 4245 4246 __ ld1r(v31, __ T2D, __ post(rscratch1, 8)); // rc = round_constants[i] 4247 4248 __ bcax(v17, __ T16B, v25, v19, v3); // a17 = a17' ^ (~a18' & a19) 4249 __ bcax(v18, __ T16B, v3, v15, v19); // a18 = a18' ^ (~a19 & a15') 4250 __ bcax(v19, __ T16B, v19, v16, v15); // a19 = a19 ^ (~a15 & a16) 4251 __ bcax(v15, __ T16B, v15, v25, v16); // a15 = a15 ^ (~a16 & a17') 4252 __ bcax(v16, __ T16B, v16, v3, v25); // a16 = a16 ^ (~a17' & a18') 4253 4254 __ bcax(v10, __ T16B, v29, v12, v26); // a10 = a10' ^ (~a11' & a12) 4255 __ bcax(v11, __ T16B, v26, v13, v12); // a11 = a11' ^ (~a12 & a13) 4256 __ bcax(v12, __ T16B, v12, v14, v13); // a12 = a12 ^ (~a13 & a14) 4257 __ bcax(v13, __ T16B, v13, v29, v14); // a13 = a13 ^ (~a14 & a10') 4258 __ bcax(v14, __ T16B, v14, v26, v29); // a14 = a14 ^ (~a10' & a11') 4259 4260 __ bcax(v7, __ T16B, v30, v9, v4); // a7 = a7' ^ (~a8' & a9) 4261 __ bcax(v8, __ T16B, v4, v5, v9); // a8 = a8' ^ (~a9 & a5) 4262 __ bcax(v9, __ T16B, v9, v6, v5); // a9 = a9 ^ (~a5 & a6) 4263 __ bcax(v5, __ T16B, v5, v30, v6); // a5 = a5 ^ (~a6 & a7) 4264 __ bcax(v6, __ T16B, v6, v4, v30); // a6 = a6 ^ (~a7 & a8') 4265 4266 __ bcax(v3, __ T16B, v27, v0, v28); // a3 = a3' ^ (~a4' & a0) 4267 __ bcax(v4, __ T16B, v28, v1, v0); // a4 = a4' ^ (~a0 & a1) 4268 __ bcax(v0, __ T16B, v0, v2, v1); // a0 = a0 ^ (~a1 & a2) 4269 __ bcax(v1, __ T16B, v1, v27, v2); // a1 = a1 ^ (~a2 & a3) 4270 __ bcax(v2, __ T16B, v2, v28, v27); // a2 = a2 ^ (~a3 & a4') 4271 4272 __ eor(v0, __ T16B, v0, v31); // a0 = a0 ^ rc 4273 } 4274 4275 // Arguments: 4276 // 4277 // Inputs: 4278 // c_rarg0 - byte[] source+offset 4279 // c_rarg1 - byte[] SHA.state 4280 // c_rarg2 - int block_size 4281 // c_rarg3 - int offset 4282 // c_rarg4 - int limit 4283 // 4284 address generate_sha3_implCompress(StubGenStubId stub_id) { 4285 bool multi_block; 4286 switch (stub_id) { 4287 case sha3_implCompress_id: 4288 multi_block = false; 4289 break; 4290 case sha3_implCompressMB_id: 4291 multi_block = true; 4292 break; 4293 default: 4294 ShouldNotReachHere(); 4295 } 4296 4297 static const uint64_t round_consts[24] = { 4298 0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL, 4299 0x8000000080008000L, 
0x000000000000808BL, 0x0000000080000001L, 4300 0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL, 4301 0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL, 4302 0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L, 4303 0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L, 4304 0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L, 4305 0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L 4306 }; 4307 4308 __ align(CodeEntryAlignment); 4309 4310 StubCodeMark mark(this, stub_id); 4311 address start = __ pc(); 4312 4313 Register buf = c_rarg0; 4314 Register state = c_rarg1; 4315 Register block_size = c_rarg2; 4316 Register ofs = c_rarg3; 4317 Register limit = c_rarg4; 4318 4319 Label sha3_loop, rounds24_loop; 4320 Label sha3_512_or_sha3_384, shake128; 4321 4322 __ stpd(v8, v9, __ pre(sp, -64)); 4323 __ stpd(v10, v11, Address(sp, 16)); 4324 __ stpd(v12, v13, Address(sp, 32)); 4325 __ stpd(v14, v15, Address(sp, 48)); 4326 4327 // load state 4328 __ add(rscratch1, state, 32); 4329 __ ld1(v0, v1, v2, v3, __ T1D, state); 4330 __ ld1(v4, v5, v6, v7, __ T1D, __ post(rscratch1, 32)); 4331 __ ld1(v8, v9, v10, v11, __ T1D, __ post(rscratch1, 32)); 4332 __ ld1(v12, v13, v14, v15, __ T1D, __ post(rscratch1, 32)); 4333 __ ld1(v16, v17, v18, v19, __ T1D, __ post(rscratch1, 32)); 4334 __ ld1(v20, v21, v22, v23, __ T1D, __ post(rscratch1, 32)); 4335 __ ld1(v24, __ T1D, rscratch1); 4336 4337 __ BIND(sha3_loop); 4338 4339 // 24 keccak rounds 4340 __ movw(rscratch2, 24); 4341 4342 // load round_constants base 4343 __ lea(rscratch1, ExternalAddress((address) round_consts)); 4344 4345 // load input 4346 __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32)); 4347 __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24)); 4348 __ eor(v0, __ T8B, v0, v25); 4349 __ eor(v1, __ T8B, v1, v26); 4350 __ eor(v2, __ T8B, v2, v27); 4351 __ eor(v3, __ T8B, v3, v28); 4352 __ eor(v4, __ T8B, v4, v29); 4353 __ eor(v5, __ T8B, v5, v30); 4354 __ eor(v6, __ T8B, v6, v31); 4355 4356 // block_size == 72, SHA3-512; block_size == 104, SHA3-384 4357 __ tbz(block_size, 7, sha3_512_or_sha3_384); 4358 4359 __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32)); 4360 __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24)); 4361 __ eor(v7, __ T8B, v7, v25); 4362 __ eor(v8, __ T8B, v8, v26); 4363 __ eor(v9, __ T8B, v9, v27); 4364 __ eor(v10, __ T8B, v10, v28); 4365 __ eor(v11, __ T8B, v11, v29); 4366 __ eor(v12, __ T8B, v12, v30); 4367 __ eor(v13, __ T8B, v13, v31); 4368 4369 __ ld1(v25, v26, v27, __ T8B, __ post(buf, 24)); 4370 __ eor(v14, __ T8B, v14, v25); 4371 __ eor(v15, __ T8B, v15, v26); 4372 __ eor(v16, __ T8B, v16, v27); 4373 4374 // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256 4375 __ andw(c_rarg5, block_size, 48); 4376 __ cbzw(c_rarg5, rounds24_loop); 4377 4378 __ tbnz(block_size, 5, shake128); 4379 // block_size == 144, bit5 == 0, SHA3-224 4380 __ ldrd(v28, __ post(buf, 8)); 4381 __ eor(v17, __ T8B, v17, v28); 4382 __ b(rounds24_loop); 4383 4384 __ BIND(shake128); 4385 __ ld1(v28, v29, v30, v31, __ T8B, __ post(buf, 32)); 4386 __ eor(v17, __ T8B, v17, v28); 4387 __ eor(v18, __ T8B, v18, v29); 4388 __ eor(v19, __ T8B, v19, v30); 4389 __ eor(v20, __ T8B, v20, v31); 4390 __ b(rounds24_loop); // block_size == 168, SHAKE128 4391 4392 __ BIND(sha3_512_or_sha3_384); 4393 __ ld1(v25, v26, __ T8B, __ post(buf, 16)); 4394 __ eor(v7, __ T8B, v7, v25); 4395 __ eor(v8, __ T8B, v8, v26); 4396 __ tbz(block_size, 5, rounds24_loop); // SHA3-512 4397 4398 // SHA3-384 4399 __ ld1(v27, v28, 
v29, v30, __ T8B, __ post(buf, 32)); 4400 __ eor(v9, __ T8B, v9, v27); 4401 __ eor(v10, __ T8B, v10, v28); 4402 __ eor(v11, __ T8B, v11, v29); 4403 __ eor(v12, __ T8B, v12, v30); 4404 4405 __ BIND(rounds24_loop); 4406 __ subw(rscratch2, rscratch2, 1); 4407 4408 keccak_round(rscratch1); 4409 4410 __ cbnzw(rscratch2, rounds24_loop); 4411 4412 if (multi_block) { 4413 __ add(ofs, ofs, block_size); 4414 __ cmp(ofs, limit); 4415 __ br(Assembler::LE, sha3_loop); 4416 __ mov(c_rarg0, ofs); // return ofs 4417 } 4418 4419 __ st1(v0, v1, v2, v3, __ T1D, __ post(state, 32)); 4420 __ st1(v4, v5, v6, v7, __ T1D, __ post(state, 32)); 4421 __ st1(v8, v9, v10, v11, __ T1D, __ post(state, 32)); 4422 __ st1(v12, v13, v14, v15, __ T1D, __ post(state, 32)); 4423 __ st1(v16, v17, v18, v19, __ T1D, __ post(state, 32)); 4424 __ st1(v20, v21, v22, v23, __ T1D, __ post(state, 32)); 4425 __ st1(v24, __ T1D, state); 4426 4427 // restore callee-saved registers 4428 __ ldpd(v14, v15, Address(sp, 48)); 4429 __ ldpd(v12, v13, Address(sp, 32)); 4430 __ ldpd(v10, v11, Address(sp, 16)); 4431 __ ldpd(v8, v9, __ post(sp, 64)); 4432 4433 __ ret(lr); 4434 4435 return start; 4436 } 4437 4438 // Inputs: 4439 // c_rarg0 - long[] state0 4440 // c_rarg1 - long[] state1 4441 address generate_double_keccak() { 4442 static const uint64_t round_consts[24] = { 4443 0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL, 4444 0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L, 4445 0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL, 4446 0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL, 4447 0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L, 4448 0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L, 4449 0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L, 4450 0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L 4451 }; 4452 4453 // Implements the double_keccak() method of the 4454 // sun.secyrity.provider.SHA3Parallel class 4455 __ align(CodeEntryAlignment); 4456 StubCodeMark mark(this, "StubRoutines", "double_keccak"); 4457 address start = __ pc(); 4458 __ enter(); 4459 4460 Register state0 = c_rarg0; 4461 Register state1 = c_rarg1; 4462 4463 Label rounds24_loop; 4464 4465 // save callee-saved registers 4466 __ stpd(v8, v9, __ pre(sp, -64)); 4467 __ stpd(v10, v11, Address(sp, 16)); 4468 __ stpd(v12, v13, Address(sp, 32)); 4469 __ stpd(v14, v15, Address(sp, 48)); 4470 4471 // load states 4472 __ add(rscratch1, state0, 32); 4473 __ ld4(v0, v1, v2, v3, __ D, 0, state0); 4474 __ ld4(v4, v5, v6, v7, __ D, 0, __ post(rscratch1, 32)); 4475 __ ld4(v8, v9, v10, v11, __ D, 0, __ post(rscratch1, 32)); 4476 __ ld4(v12, v13, v14, v15, __ D, 0, __ post(rscratch1, 32)); 4477 __ ld4(v16, v17, v18, v19, __ D, 0, __ post(rscratch1, 32)); 4478 __ ld4(v20, v21, v22, v23, __ D, 0, __ post(rscratch1, 32)); 4479 __ ld1(v24, __ D, 0, rscratch1); 4480 __ add(rscratch1, state1, 32); 4481 __ ld4(v0, v1, v2, v3, __ D, 1, state1); 4482 __ ld4(v4, v5, v6, v7, __ D, 1, __ post(rscratch1, 32)); 4483 __ ld4(v8, v9, v10, v11, __ D, 1, __ post(rscratch1, 32)); 4484 __ ld4(v12, v13, v14, v15, __ D, 1, __ post(rscratch1, 32)); 4485 __ ld4(v16, v17, v18, v19, __ D, 1, __ post(rscratch1, 32)); 4486 __ ld4(v20, v21, v22, v23, __ D, 1, __ post(rscratch1, 32)); 4487 __ ld1(v24, __ D, 1, rscratch1); 4488 4489 // 24 keccak rounds 4490 __ movw(rscratch2, 24); 4491 4492 // load round_constants base 4493 __ lea(rscratch1, ExternalAddress((address) round_consts)); 4494 4495 __ BIND(rounds24_loop); 4496 
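// Each pass through this loop applies one Keccak-f[1600] round to both states at once: keccak_round() operates on the two 64-bit halves of v0..v24 in parallel and fetches the next round constant via rscratch1.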
__ subw(rscratch2, rscratch2, 1); 4497 keccak_round(rscratch1); 4498 __ cbnzw(rscratch2, rounds24_loop); 4499 4500 __ st4(v0, v1, v2, v3, __ D, 0, __ post(state0, 32)); 4501 __ st4(v4, v5, v6, v7, __ D, 0, __ post(state0, 32)); 4502 __ st4(v8, v9, v10, v11, __ D, 0, __ post(state0, 32)); 4503 __ st4(v12, v13, v14, v15, __ D, 0, __ post(state0, 32)); 4504 __ st4(v16, v17, v18, v19, __ D, 0, __ post(state0, 32)); 4505 __ st4(v20, v21, v22, v23, __ D, 0, __ post(state0, 32)); 4506 __ st1(v24, __ D, 0, state0); 4507 __ st4(v0, v1, v2, v3, __ D, 1, __ post(state1, 32)); 4508 __ st4(v4, v5, v6, v7, __ D, 1, __ post(state1, 32)); 4509 __ st4(v8, v9, v10, v11, __ D, 1, __ post(state1, 32)); 4510 __ st4(v12, v13, v14, v15, __ D, 1, __ post(state1, 32)); 4511 __ st4(v16, v17, v18, v19, __ D, 1, __ post(state1, 32)); 4512 __ st4(v20, v21, v22, v23, __ D, 1, __ post(state1, 32)); 4513 __ st1(v24, __ D, 1, state1); 4514 4515 // restore callee-saved vector registers 4516 __ ldpd(v14, v15, Address(sp, 48)); 4517 __ ldpd(v12, v13, Address(sp, 32)); 4518 __ ldpd(v10, v11, Address(sp, 16)); 4519 __ ldpd(v8, v9, __ post(sp, 64)); 4520 4521 __ leave(); // required for proper stackwalking of RuntimeStub frame 4522 __ mov(r0, zr); // return 0 4523 __ ret(lr); 4524 4525 return start; 4526 } 4527 4528 // ChaCha20 block function. This version parallelizes the 32-bit 4529 // state elements on each of 16 vectors, producing 4 blocks of 4530 // keystream at a time. 4531 // 4532 // state (int[16]) = c_rarg0 4533 // keystream (byte[256]) = c_rarg1 4534 // return - number of bytes of produced keystream (always 256) 4535 // 4536 // This implementation takes each 32-bit integer from the state 4537 // array and broadcasts it across all 4 32-bit lanes of a vector register 4538 // (e.g. state[0] is replicated on all 4 lanes of v4, state[1] to all 4 lanes 4539 // of v5, etc.). Once all 16 elements have been broadcast onto 16 vectors, 4540 // the quarter round schedule is implemented as outlined in RFC 7539 section 4541 // 2.3. However, instead of sequentially processing the 3 quarter round 4542 // operations represented by one QUARTERROUND function, we instead stack all 4543 // the adds, xors and left-rotations from the first 4 quarter rounds together 4544 // and then do the same for the second set of 4 quarter rounds. This removes 4545 // some latency that would otherwise be incurred by waiting for an add to 4546 // complete before performing an xor (which depends on the result of the 4547 // add), etc. An adjustment happens between the first and second groups of 4 4548 // quarter rounds, but this is done only in the inputs to the macro functions 4549 // that generate the assembly instructions - these adjustments themselves are 4550 // not part of the resulting assembly. 4551 // The 4 registers v0-v3 are used during the quarter round operations as 4552 // scratch registers. Once the 20 rounds are complete, these 4 scratch 4553 // registers become the vectors involved in adding the start state back onto 4554 // the post-QR working state. After the adds are complete, each of the 16 4555 // vectors write their first lane back to the keystream buffer, followed 4556 // by the second lane from all vectors and so on. 4557 address generate_chacha20Block_blockpar() { 4558 Label L_twoRounds, L_cc20_const; 4559 // The constant data is broken into two 128-bit segments to be loaded 4560 // onto FloatRegisters. The first 128 bits are a counter add overlay 4561 // that adds +0/+1/+2/+3 to the vector holding replicated state[12]. 
4562 // The second 128-bits is a table constant used for 8-bit left rotations. 4563 __ BIND(L_cc20_const); 4564 __ emit_int64(0x0000000100000000UL); 4565 __ emit_int64(0x0000000300000002UL); 4566 __ emit_int64(0x0605040702010003UL); 4567 __ emit_int64(0x0E0D0C0F0A09080BUL); 4568 4569 __ align(CodeEntryAlignment); 4570 StubGenStubId stub_id = StubGenStubId::chacha20Block_id; 4571 StubCodeMark mark(this, stub_id); 4572 address start = __ pc(); 4573 __ enter(); 4574 4575 int i, j; 4576 const Register state = c_rarg0; 4577 const Register keystream = c_rarg1; 4578 const Register loopCtr = r10; 4579 const Register tmpAddr = r11; 4580 const FloatRegister ctrAddOverlay = v28; 4581 const FloatRegister lrot8Tbl = v29; 4582 4583 // Organize SIMD registers in an array that facilitates 4584 // putting repetitive opcodes into loop structures. It is 4585 // important that each grouping of 4 registers is monotonically 4586 // increasing to support the requirements of multi-register 4587 // instructions (e.g. ld4r, st4, etc.) 4588 const FloatRegister workSt[16] = { 4589 v4, v5, v6, v7, v16, v17, v18, v19, 4590 v20, v21, v22, v23, v24, v25, v26, v27 4591 }; 4592 4593 // Pull in constant data. The first 16 bytes are the add overlay 4594 // which is applied to the vector holding the counter (state[12]). 4595 // The second 16 bytes is the index register for the 8-bit left 4596 // rotation tbl instruction. 4597 __ adr(tmpAddr, L_cc20_const); 4598 __ ldpq(ctrAddOverlay, lrot8Tbl, Address(tmpAddr)); 4599 4600 // Load from memory and interlace across 16 SIMD registers, 4601 // With each word from memory being broadcast to all lanes of 4602 // each successive SIMD register. 4603 // Addr(0) -> All lanes in workSt[i] 4604 // Addr(4) -> All lanes workSt[i + 1], etc. 4605 __ mov(tmpAddr, state); 4606 for (i = 0; i < 16; i += 4) { 4607 __ ld4r(workSt[i], workSt[i + 1], workSt[i + 2], workSt[i + 3], __ T4S, 4608 __ post(tmpAddr, 16)); 4609 } 4610 __ addv(workSt[12], __ T4S, workSt[12], ctrAddOverlay); // Add ctr overlay 4611 4612 // Before entering the loop, create 5 4-register arrays. These 4613 // will hold the 4 registers that represent the a/b/c/d fields 4614 // in the quarter round operation. For instance the "b" field 4615 // for the first 4 quarter round operations is the set of v16/v17/v18/v19, 4616 // but in the second 4 quarter rounds it gets adjusted to v17/v18/v19/v16 4617 // since it is part of a diagonal organization. The aSet and scratch 4618 // register sets are defined at declaration time because they do not change 4619 // organization at any point during the 20-round processing. 
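// For reference, this is the scalar ChaCha20 quarter round from RFC 7539 that each group of four vectorized operations below applies lane-wise (rotl32 here just denotes a 32-bit left rotation):
//
//   a += b; d ^= a; d = rotl32(d, 16);
//   c += d; b ^= c; b = rotl32(b, 12);
//   a += b; d ^= a; d = rotl32(d, 8);
//   c += d; b ^= c; b = rotl32(b, 7);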
4620 FloatRegister aSet[4] = { v4, v5, v6, v7 }; 4621 FloatRegister bSet[4]; 4622 FloatRegister cSet[4]; 4623 FloatRegister dSet[4]; 4624 FloatRegister scratch[4] = { v0, v1, v2, v3 }; 4625 4626 // Set up the 10 iteration loop and perform all 8 quarter round ops 4627 __ mov(loopCtr, 10); 4628 __ BIND(L_twoRounds); 4629 4630 // Set to columnar organization and do the following 4 quarter-rounds: 4631 // QUARTERROUND(0, 4, 8, 12) 4632 // QUARTERROUND(1, 5, 9, 13) 4633 // QUARTERROUND(2, 6, 10, 14) 4634 // QUARTERROUND(3, 7, 11, 15) 4635 __ cc20_set_qr_registers(bSet, workSt, 4, 5, 6, 7); 4636 __ cc20_set_qr_registers(cSet, workSt, 8, 9, 10, 11); 4637 __ cc20_set_qr_registers(dSet, workSt, 12, 13, 14, 15); 4638 4639 __ cc20_qr_add4(aSet, bSet); // a += b 4640 __ cc20_qr_xor4(dSet, aSet, dSet); // d ^= a 4641 __ cc20_qr_lrot4(dSet, dSet, 16, lrot8Tbl); // d <<<= 16 4642 4643 __ cc20_qr_add4(cSet, dSet); // c += d 4644 __ cc20_qr_xor4(bSet, cSet, scratch); // b ^= c (scratch) 4645 __ cc20_qr_lrot4(scratch, bSet, 12, lrot8Tbl); // b <<<= 12 4646 4647 __ cc20_qr_add4(aSet, bSet); // a += b 4648 __ cc20_qr_xor4(dSet, aSet, dSet); // d ^= a 4649 __ cc20_qr_lrot4(dSet, dSet, 8, lrot8Tbl); // d <<<= 8 4650 4651 __ cc20_qr_add4(cSet, dSet); // c += d 4652 __ cc20_qr_xor4(bSet, cSet, scratch); // b ^= c (scratch) 4653 __ cc20_qr_lrot4(scratch, bSet, 7, lrot8Tbl); // b <<<= 7 4654 4655 // Set to diagonal organization and do the next 4 quarter-rounds: 4656 // QUARTERROUND(0, 5, 10, 15) 4657 // QUARTERROUND(1, 6, 11, 12) 4658 // QUARTERROUND(2, 7, 8, 13) 4659 // QUARTERROUND(3, 4, 9, 14) 4660 __ cc20_set_qr_registers(bSet, workSt, 5, 6, 7, 4); 4661 __ cc20_set_qr_registers(cSet, workSt, 10, 11, 8, 9); 4662 __ cc20_set_qr_registers(dSet, workSt, 15, 12, 13, 14); 4663 4664 __ cc20_qr_add4(aSet, bSet); // a += b 4665 __ cc20_qr_xor4(dSet, aSet, dSet); // d ^= a 4666 __ cc20_qr_lrot4(dSet, dSet, 16, lrot8Tbl); // d <<<= 16 4667 4668 __ cc20_qr_add4(cSet, dSet); // c += d 4669 __ cc20_qr_xor4(bSet, cSet, scratch); // b ^= c (scratch) 4670 __ cc20_qr_lrot4(scratch, bSet, 12, lrot8Tbl); // b <<<= 12 4671 4672 __ cc20_qr_add4(aSet, bSet); // a += b 4673 __ cc20_qr_xor4(dSet, aSet, dSet); // d ^= a 4674 __ cc20_qr_lrot4(dSet, dSet, 8, lrot8Tbl); // d <<<= 8 4675 4676 __ cc20_qr_add4(cSet, dSet); // c += d 4677 __ cc20_qr_xor4(bSet, cSet, scratch); // b ^= c (scratch) 4678 __ cc20_qr_lrot4(scratch, bSet, 7, lrot8Tbl); // b <<<= 7 4679 4680 // Decrement and iterate 4681 __ sub(loopCtr, loopCtr, 1); 4682 __ cbnz(loopCtr, L_twoRounds); 4683 4684 __ mov(tmpAddr, state); 4685 4686 // Add the starting state back to the post-loop keystream 4687 // state. We read/interlace the state array from memory into 4688 // 4 registers similar to what we did in the beginning. Then 4689 // add the counter overlay onto workSt[12] at the end. 4690 for (i = 0; i < 16; i += 4) { 4691 __ ld4r(v0, v1, v2, v3, __ T4S, __ post(tmpAddr, 16)); 4692 __ addv(workSt[i], __ T4S, workSt[i], v0); 4693 __ addv(workSt[i + 1], __ T4S, workSt[i + 1], v1); 4694 __ addv(workSt[i + 2], __ T4S, workSt[i + 2], v2); 4695 __ addv(workSt[i + 3], __ T4S, workSt[i + 3], v3); 4696 } 4697 __ addv(workSt[12], __ T4S, workSt[12], ctrAddOverlay); // Add ctr overlay 4698 4699 // Write working state into the keystream buffer. This is accomplished 4700 // by taking the lane "i" from each of the four vectors and writing 4701 // it to consecutive 4-byte offsets, then post-incrementing by 16 and 4702 // repeating with the next 4 vectors until all 16 vectors have been used.
4703 // Then move to the next lane and repeat the process until all lanes have 4704 // been written. 4705 for (i = 0; i < 4; i++) { 4706 for (j = 0; j < 16; j += 4) { 4707 __ st4(workSt[j], workSt[j + 1], workSt[j + 2], workSt[j + 3], __ S, i, 4708 __ post(keystream, 16)); 4709 } 4710 } 4711 4712 __ mov(r0, 256); // Return length of output keystream 4713 __ leave(); 4714 __ ret(lr); 4715 4716 return start; 4717 } 4718 4719 // Helpers to schedule parallel operation bundles across vector 4720 // register sequences of size 2, 4 or 8. 4721 4722 // Implement various primitive computations across vector sequences 4723 4724 template<int N> 4725 void vs_addv(const VSeq<N>& v, Assembler::SIMD_Arrangement T, 4726 const VSeq<N>& v1, const VSeq<N>& v2) { 4727 // output must not be constant 4728 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4729 // output cannot overwrite pending inputs 4730 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4731 assert(!vs_write_before_read(v, v2), "output overwrites input"); 4732 for (int i = 0; i < N; i++) { 4733 __ addv(v[i], T, v1[i], v2[i]); 4734 } 4735 } 4736 4737 template<int N> 4738 void vs_subv(const VSeq<N>& v, Assembler::SIMD_Arrangement T, 4739 const VSeq<N>& v1, const VSeq<N>& v2) { 4740 // output must not be constant 4741 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4742 // output cannot overwrite pending inputs 4743 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4744 assert(!vs_write_before_read(v, v2), "output overwrites input"); 4745 for (int i = 0; i < N; i++) { 4746 __ subv(v[i], T, v1[i], v2[i]); 4747 } 4748 } 4749 4750 template<int N> 4751 void vs_mulv(const VSeq<N>& v, Assembler::SIMD_Arrangement T, 4752 const VSeq<N>& v1, const VSeq<N>& v2) { 4753 // output must not be constant 4754 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4755 // output cannot overwrite pending inputs 4756 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4757 assert(!vs_write_before_read(v, v2), "output overwrites input"); 4758 for (int i = 0; i < N; i++) { 4759 __ mulv(v[i], T, v1[i], v2[i]); 4760 } 4761 } 4762 4763 template<int N> 4764 void vs_negr(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1) { 4765 // output must not be constant 4766 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4767 // output cannot overwrite pending inputs 4768 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4769 for (int i = 0; i < N; i++) { 4770 __ negr(v[i], T, v1[i]); 4771 } 4772 } 4773 4774 template<int N> 4775 void vs_sshr(const VSeq<N>& v, Assembler::SIMD_Arrangement T, 4776 const VSeq<N>& v1, int shift) { 4777 // output must not be constant 4778 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4779 // output cannot overwrite pending inputs 4780 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4781 for (int i = 0; i < N; i++) { 4782 __ sshr(v[i], T, v1[i], shift); 4783 } 4784 } 4785 4786 template<int N> 4787 void vs_andr(const VSeq<N>& v, const VSeq<N>& v1, const VSeq<N>& v2) { 4788 // output must not be constant 4789 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4790 // output cannot overwrite pending inputs 4791 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4792 assert(!vs_write_before_read(v, v2), "output 
overwrites input"); 4793 for (int i = 0; i < N; i++) { 4794 __ andr(v[i], __ T16B, v1[i], v2[i]); 4795 } 4796 } 4797 4798 template<int N> 4799 void vs_orr(const VSeq<N>& v, const VSeq<N>& v1, const VSeq<N>& v2) { 4800 // output must not be constant 4801 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4802 // output cannot overwrite pending inputs 4803 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4804 assert(!vs_write_before_read(v, v2), "output overwrites input"); 4805 for (int i = 0; i < N; i++) { 4806 __ orr(v[i], __ T16B, v1[i], v2[i]); 4807 } 4808 } 4809 4810 template<int N> 4811 void vs_notr(const VSeq<N>& v, const VSeq<N>& v1) { 4812 // output must not be constant 4813 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4814 // output cannot overwrite pending inputs 4815 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4816 for (int i = 0; i < N; i++) { 4817 __ notr(v[i], __ T16B, v1[i]); 4818 } 4819 } 4820 4821 template<int N> 4822 void vs_sqdmulh(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1, const VSeq<N>& v2) { 4823 // output must not be constant 4824 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4825 // output cannot overwrite pending inputs 4826 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4827 assert(!vs_write_before_read(v, v2), "output overwrites input"); 4828 for (int i = 0; i < N; i++) { 4829 __ sqdmulh(v[i], T, v1[i], v2[i]); 4830 } 4831 } 4832 4833 template<int N> 4834 void vs_mlsv(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1, VSeq<N>& v2) { 4835 // output must not be constant 4836 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4837 // output cannot overwrite pending inputs 4838 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4839 assert(!vs_write_before_read(v, v2), "output overwrites input"); 4840 for (int i = 0; i < N; i++) { 4841 __ mlsv(v[i], T, v1[i], v2[i]); 4842 } 4843 } 4844 4845 // load N/2 successive pairs of quadword values from memory in order 4846 // into N successive vector registers of the sequence via the 4847 // address supplied in base. 
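// (For example, with a two-register sequence this expands to a single ldpq of v[0] and v[1] at offset 0 from base; longer sequences emit one ldpq per successive register pair.)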
4848 template<int N> 4849 void vs_ldpq(const VSeq<N>& v, Register base) { 4850 for (int i = 0; i < N; i += 2) { 4851 __ ldpq(v[i], v[i+1], Address(base, 32 * i)); 4852 } 4853 } 4854 4855 // load N/2 successive pairs of quadword values from memory in order 4856 // into N vector registers of the sequence via the address supplied 4857 // in base using post-increment addressing 4858 template<int N> 4859 void vs_ldpq_post(const VSeq<N>& v, Register base) { 4860 static_assert((N & (N - 1)) == 0, "sequence length must be even"); 4861 for (int i = 0; i < N; i += 2) { 4862 __ ldpq(v[i], v[i+1], __ post(base, 32)); 4863 } 4864 } 4865 4866 // store N successive vector registers of the sequence into N/2 4867 // successive pairs of quadword memory locations via the address 4868 // supplied in base using post-increment addressing 4869 template<int N> 4870 void vs_stpq_post(const VSeq<N>& v, Register base) { 4871 static_assert((N & (N - 1)) == 0, "sequence length must be even"); 4872 for (int i = 0; i < N; i += 2) { 4873 __ stpq(v[i], v[i+1], __ post(base, 32)); 4874 } 4875 } 4876 4877 // load N/2 pairs of quadword values from memory de-interleaved into 4878 // N vector registers 2 at a time via the address supplied in base 4879 // using post-increment addressing. 4880 template<int N> 4881 void vs_ld2_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) { 4882 static_assert((N & (N - 1)) == 0, "sequence length must be even"); 4883 for (int i = 0; i < N; i += 2) { 4884 __ ld2(v[i], v[i+1], T, __ post(base, 32)); 4885 } 4886 } 4887 4888 // store N vector registers interleaved into N/2 pairs of quadword 4889 // memory locations via the address supplied in base using 4890 // post-increment addressing. 4891 template<int N> 4892 void vs_st2_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) { 4893 static_assert((N & (N - 1)) == 0, "sequence length must be even"); 4894 for (int i = 0; i < N; i += 2) { 4895 __ st2(v[i], v[i+1], T, __ post(base, 32)); 4896 } 4897 } 4898 4899 // load N quadword values from memory de-interleaved into N vector 4900 // registers 3 elements at a time via the address supplied in base. 4901 template<int N> 4902 void vs_ld3(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) { 4903 static_assert(N == ((N / 3) * 3), "sequence length must be multiple of 3"); 4904 for (int i = 0; i < N; i += 3) { 4905 __ ld3(v[i], v[i+1], v[i+2], T, base); 4906 } 4907 } 4908 4909 // load N quadword values from memory de-interleaved into N vector 4910 // registers 3 elements at a time via the address supplied in base 4911 // using post-increment addressing. 
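// (For example, a three-register sequence becomes a single ld3 of v[0], v[1] and v[2] from __ post(base, 48), consuming 48 bytes per triple.)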
4912 template<int N>
4913 void vs_ld3_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
4914 static_assert(N == ((N / 3) * 3), "sequence length must be multiple of 3");
4915 for (int i = 0; i < N; i += 3) {
4916 __ ld3(v[i], v[i+1], v[i+2], T, __ post(base, 48));
4917 }
4918 }
4919
4920 // load N/2 pairs of quadword values from memory into N vector
4921 // registers via the address supplied in base with each pair indexed
4922 // using the start offset plus the corresponding entry in the
4923 // offsets array
4924 template<int N>
4925 void vs_ldpq_indexed(const VSeq<N>& v, Register base, int start, int (&offsets)[N/2]) {
4926 for (int i = 0; i < N/2; i++) {
4927 __ ldpq(v[2*i], v[2*i+1], Address(base, start + offsets[i]));
4928 }
4929 }
4930
4931 // store N vector registers into N/2 pairs of quadword memory
4932 // locations via the address supplied in base with each pair indexed
4933 // using the start offset plus the corresponding entry in the
4934 // offsets array
4935 template<int N>
4936 void vs_stpq_indexed(const VSeq<N>& v, Register base, int start, int offsets[N/2]) {
4937 for (int i = 0; i < N/2; i++) {
4938 __ stpq(v[2*i], v[2*i+1], Address(base, start + offsets[i]));
4939 }
4940 }
4941
4942 // load N single quadword values from memory into N vector registers
4943 // via the address supplied in base with each value indexed using
4944 // the start offset plus the corresponding entry in the offsets
4945 // array
4946 template<int N>
4947 void vs_ldr_indexed(const VSeq<N>& v, Assembler::SIMD_RegVariant T, Register base,
4948 int start, int (&offsets)[N]) {
4949 for (int i = 0; i < N; i++) {
4950 __ ldr(v[i], T, Address(base, start + offsets[i]));
4951 }
4952 }
4953
4954 // store N vector registers into N single quadword memory locations
4955 // via the address supplied in base with each value indexed using
4956 // the start offset plus the corresponding entry in the offsets
4957 // array
4958 template<int N>
4959 void vs_str_indexed(const VSeq<N>& v, Assembler::SIMD_RegVariant T, Register base,
4960 int start, int (&offsets)[N]) {
4961 for (int i = 0; i < N; i++) {
4962 __ str(v[i], T, Address(base, start + offsets[i]));
4963 }
4964 }
4965
4966 // load N/2 pairs of quadword values from memory de-interleaved into
4967 // N vector registers 2 at a time via the address supplied in base
4968 // with each pair indexed using the start offset plus the
4969 // corresponding entry in the offsets array
4970 template<int N>
4971 void vs_ld2_indexed(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base,
4972 Register tmp, int start, int (&offsets)[N/2]) {
4973 for (int i = 0; i < N/2; i++) {
4974 __ add(tmp, base, start + offsets[i]);
4975 __ ld2(v[2*i], v[2*i+1], T, tmp);
4976 }
4977 }
4978
4979 // store N vector registers 2 at a time interleaved into N/2 pairs
4980 // of quadword memory locations via the address supplied in base
4981 // with each pair indexed using the start offset plus the
4982 // corresponding entry in the offsets array
4983 template<int N>
4984 void vs_st2_indexed(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base,
4985 Register tmp, int start, int (&offsets)[N/2]) {
4986 for (int i = 0; i < N/2; i++) {
4987 __ add(tmp, base, start + offsets[i]);
4988 __ st2(v[2*i], v[2*i+1], T, tmp);
4989 }
4990 }
4991
4992 // Helper routines for various flavours of Montgomery multiply
4993
4994 // Perform 16 32-bit (4x4S) or 32 16-bit (4 x 8H) Montgomery
4995 // multiplications in parallel
4996 //
4997
4998 // See the montMul() method of the sun.security.provider.ML_DSA
4999 // class.
5000 //
5001 // Computes 4x4S results or 4x8H results
5002 // a = b * c * 2^-MONT_R_BITS mod MONT_Q
5003 // Inputs: vb, vc - 4x4S or 4x8H vector register sequences
5004 // vq - 2x4S or 2x8H constants <MONT_Q, MONT_Q_INV_MOD_R>
5005 // Temps: vtmp - 4x4S or 4x8H vector sequence trashed after call
5006 // Outputs: va - 4x4S or 4x8H vector register sequences
5007 // vb, vc, vtmp and vq must all be disjoint
5008 // va must be disjoint from all other inputs/temps or must equal vc
5009 // va must have a non-zero delta i.e. it must not be a constant vseq.
5010 // n.b. MONT_R_BITS is 16 or 32, so the right shift by it is implicit.
5011 void vs_montmul4(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
5012 Assembler::SIMD_Arrangement T,
5013 const VSeq<4>& vtmp, const VSeq<2>& vq) {
5014 assert (T == __ T4S || T == __ T8H, "invalid arrangement for montmul");
5015 assert(vs_disjoint(vb, vc), "vb and vc overlap");
5016 assert(vs_disjoint(vb, vq), "vb and vq overlap");
5017 assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
5018
5019 assert(vs_disjoint(vc, vq), "vc and vq overlap");
5020 assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
5021
5022 assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
5023
5024 assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
5025 assert(vs_disjoint(va, vb), "va and vb overlap");
5026 assert(vs_disjoint(va, vq), "va and vq overlap");
5027 assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
5028 assert(!va.is_constant(), "output vector must identify 4 different registers");
5029
5030 // schedule 4 streams of instructions across the vector sequences
5031 for (int i = 0; i < 4; i++) {
5032 __ sqdmulh(vtmp[i], T, vb[i], vc[i]); // aHigh = hi32(2 * b * c)
5033 __ mulv(va[i], T, vb[i], vc[i]); // aLow = lo32(b * c)
5034 }
5035
5036 for (int i = 0; i < 4; i++) {
5037 __ mulv(va[i], T, va[i], vq[0]); // m = aLow * qinv
5038 }
5039
5040 for (int i = 0; i < 4; i++) {
5041 __ sqdmulh(va[i], T, va[i], vq[1]); // n = hi32(2 * m * q)
5042 }
5043
5044 for (int i = 0; i < 4; i++) {
5045 __ shsubv(va[i], T, vtmp[i], va[i]); // a = (aHigh - n) / 2
5046 }
5047 }
5048
5049 // Perform 8 32-bit (2x4S) or 16 16-bit (2 x 8H) Montgomery
5050 // multiplications in parallel
5051 //
5052
5053 // See the montMul() method of the sun.security.provider.ML_DSA
5054 // class.
5055 //
5056 // Computes 2x4S results or 2x8H results
5057 // a = b * c * 2^-MONT_R_BITS mod MONT_Q
5058 // Inputs: vb, vc - 2x4S or 2x8H vector register sequences
5059 // vq - 2x4S or 2x8H constants <MONT_Q, MONT_Q_INV_MOD_R>
5060 // Temps: vtmp - 2x4S or 2x8H vector sequence trashed after call
5061 // Outputs: va - 2x4S or 2x8H vector register sequences
5062 // vb, vc, vtmp and vq must all be disjoint
5063 // va must be disjoint from all other inputs/temps or must equal vc
5064 // va must have a non-zero delta i.e. it must not be a constant vseq.
5065 // n.b. MONT_R_BITS is 16 or 32, so the right shift by it is implicit.
5066 void vs_montmul2(const VSeq<2>& va, const VSeq<2>& vb, const VSeq<2>& vc, 5067 Assembler::SIMD_Arrangement T, 5068 const VSeq<2>& vtmp, const VSeq<2>& vq) { 5069 assert (T == __ T4S || T == __ T8H, "invalid arrangement for montmul"); 5070 assert(vs_disjoint(vb, vc), "vb and vc overlap"); 5071 assert(vs_disjoint(vb, vq), "vb and vq overlap"); 5072 assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap"); 5073 5074 assert(vs_disjoint(vc, vq), "vc and vq overlap"); 5075 assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap"); 5076 5077 assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap"); 5078 5079 assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal"); 5080 assert(vs_disjoint(va, vb), "va and vb overlap"); 5081 assert(vs_disjoint(va, vq), "va and vq overlap"); 5082 assert(vs_disjoint(va, vtmp), "va and vtmp overlap"); 5083 assert(!va.is_constant(), "output vector must identify 2 different registers"); 5084 5085 // schedule 2 streams of instructions across the vector sequences 5086 for (int i = 0; i < 2; i++) { 5087 __ sqdmulh(vtmp[i], T, vb[i], vc[i]); // aHigh = hi32(2 * b * c) 5088 __ mulv(va[i], T, vb[i], vc[i]); // aLow = lo32(b * c) 5089 } 5090 5091 for (int i = 0; i < 2; i++) { 5092 __ mulv(va[i], T, va[i], vq[0]); // m = aLow * qinv 5093 } 5094 5095 for (int i = 0; i < 2; i++) { 5096 __ sqdmulh(va[i], T, va[i], vq[1]); // n = hi32(2 * m * q) 5097 } 5098 5099 for (int i = 0; i < 2; i++) { 5100 __ shsubv(va[i], T, vtmp[i], va[i]); // a = (aHigh - n) / 2 5101 } 5102 } 5103 5104 // Perform 16 16-bit Montgomery multiplications in parallel. 5105 void kyber_montmul16(const VSeq<2>& va, const VSeq<2>& vb, const VSeq<2>& vc, 5106 const VSeq<2>& vtmp, const VSeq<2>& vq) { 5107 // Use the helper routine to schedule a 2x8H Montgomery multiply. 5108 // It will assert that the register use is valid 5109 vs_montmul2(va, vb, vc, __ T8H, vtmp, vq); 5110 } 5111 5112 // Perform 32 16-bit Montgomery multiplications in parallel. 5113 void kyber_montmul32(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc, 5114 const VSeq<4>& vtmp, const VSeq<2>& vq) { 5115 // Use the helper routine to schedule a 4x8H Montgomery multiply. 5116 // It will assert that the register use is valid 5117 vs_montmul4(va, vb, vc, __ T8H, vtmp, vq); 5118 } 5119 5120 // Perform 64 16-bit Montgomery multiplications in parallel. 5121 void kyber_montmul64(const VSeq<8>& va, const VSeq<8>& vb, const VSeq<8>& vc, 5122 const VSeq<4>& vtmp, const VSeq<2>& vq) { 5123 // Schedule two successive 4x8H multiplies via the montmul helper 5124 // on the front and back halves of va, vb and vc. The helper will 5125 // assert that the register use has no overlap conflicts on each 5126 // individual call but we also need to ensure that the necessary 5127 // disjoint/equality constraints are met across both calls. 5128 5129 // vb, vc, vtmp and vq must be disjoint. 
va must either be
5130 // disjoint from all other registers or equal vc
5131
5132 assert(vs_disjoint(vb, vc), "vb and vc overlap");
5133 assert(vs_disjoint(vb, vq), "vb and vq overlap");
5134 assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
5135
5136 assert(vs_disjoint(vc, vq), "vc and vq overlap");
5137 assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
5138
5139 assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
5140
5141 assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
5142 assert(vs_disjoint(va, vb), "va and vb overlap");
5143 assert(vs_disjoint(va, vq), "va and vq overlap");
5144 assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
5145
5146 // we multiply the front and back halves of each sequence 4 at a
5147 // time because
5148 //
5149 // 1) we are currently only able to get 4-way instruction
5150 // parallelism at best
5151 //
5152 // 2) we need registers for the constants in vq and temporary
5153 // scratch registers to hold intermediate results so vtmp can only
5154 // be a VSeq<4> which means we only have 4 scratch slots
5155
5156 vs_montmul4(vs_front(va), vs_front(vb), vs_front(vc), __ T8H, vtmp, vq);
5157 vs_montmul4(vs_back(va), vs_back(vb), vs_back(vc), __ T8H, vtmp, vq);
5158 }
5159
5160 void kyber_montmul32_sub_add(const VSeq<4>& va0, const VSeq<4>& va1,
5161 const VSeq<4>& vc,
5162 const VSeq<4>& vtmp,
5163 const VSeq<2>& vq) {
5164 // compute a = montmul(a1, c)
5165 kyber_montmul32(vc, va1, vc, vtmp, vq);
5166 // output a1 = a0 - a
5167 vs_subv(va1, __ T8H, va0, vc);
5168 // and a0 = a0 + a
5169 vs_addv(va0, __ T8H, va0, vc);
5170 }
5171
5172 void kyber_sub_add_montmul32(const VSeq<4>& va0, const VSeq<4>& va1,
5173 const VSeq<4>& vb,
5174 const VSeq<4>& vtmp1,
5175 const VSeq<4>& vtmp2,
5176 const VSeq<2>& vq) {
5177 // compute c = a0 - a1
5178 vs_subv(vtmp1, __ T8H, va0, va1);
5179 // output a0 = a0 + a1
5180 vs_addv(va0, __ T8H, va0, va1);
5181 // output a1 = b montmul c
5182 kyber_montmul32(va1, vtmp1, vb, vtmp2, vq);
5183 }
5184
5185 void load64shorts(const VSeq<8>& v, Register shorts) {
5186 vs_ldpq_post(v, shorts);
5187 }
5188
5189 void load32shorts(const VSeq<4>& v, Register shorts) {
5190 vs_ldpq_post(v, shorts);
5191 }
5192
5193 void store64shorts(VSeq<8> v, Register tmpAddr) {
5194 vs_stpq_post(v, tmpAddr);
5195 }
5196
5197 // Kyber NTT function.
5198 // Implements
5199 // static int implKyberNtt(short[] poly, short[] ntt_zetas) {}
5200 //
5201 // coeffs (short[256]) = c_rarg0
5202 // ntt_zetas (short[256]) = c_rarg1
5203 address generate_kyberNtt() {
5204
5205 __ align(CodeEntryAlignment);
5206 StubGenStubId stub_id = StubGenStubId::kyberNtt_id;
5207 StubCodeMark mark(this, stub_id);
5208 address start = __ pc();
5209 __ enter();
5210
5211 const Register coeffs = c_rarg0;
5212 const Register zetas = c_rarg1;
5213
5214 const Register kyberConsts = r10;
5215 const Register tmpAddr = r11;
5216
5217 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x8H inputs/outputs
5218 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
5219 VSeq<2> vq(30); // n.b. constants overlap vs3
5220
5221 __ lea(kyberConsts, ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
5222 // load the montmul constants
5223 vs_ldpq(vq, kyberConsts);
5224
5225 // Each level corresponds to an iteration of the outermost loop of the
5226 // Java method seilerNTT(int[] coeffs). There are some differences
5227 // from what is done in the seilerNTT() method, though:
5228 // 1. The computation uses 16-bit signed values; we do not convert them
5229 // to ints here.
5230 // 2. The zetas are delivered in a bigger array: 128 zetas are stored in
5231 // this array for each level, which makes it easier to fill up the
5232 // vector registers.
5233 // 3. In the seilerNTT() method we use R = 2^20 for the Montgomery
5234 // multiplications (this is because that way there should not be any
5235 // overflow during the inverse NTT computation); here we use R = 2^16 so
5236 // that we can use the 16-bit arithmetic in the vector unit.
5237 //
5238 // On each level, we fill up the vector registers in such a way that the
5239 // array elements that need to be multiplied by the zetas go into one
5240 // set of vector registers while the corresponding ones that don't need
5241 // to be multiplied go into another set.
5242 // We can do 32 Montgomery multiplications in parallel, using 12 vector
5243 // registers interleaving the steps of 4 identical computations,
5244 // each done on 8 16-bit values per register.
5245
5246 // At levels 0-3 the coefficients multiplied by or added/subtracted
5247 // to the zetas occur in discrete blocks whose size is some multiple
5248 // of 32.
5249
5250 // level 0
5251 __ add(tmpAddr, coeffs, 256);
5252 load64shorts(vs1, tmpAddr);
5253 load64shorts(vs2, zetas);
5254 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5255 __ add(tmpAddr, coeffs, 0);
5256 load64shorts(vs1, tmpAddr);
5257 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5258 vs_addv(vs1, __ T8H, vs1, vs2);
5259 __ add(tmpAddr, coeffs, 0);
5260 vs_stpq_post(vs1, tmpAddr);
5261 __ add(tmpAddr, coeffs, 256);
5262 vs_stpq_post(vs3, tmpAddr);
5263 // restore montmul constants
5264 vs_ldpq(vq, kyberConsts);
5265 load64shorts(vs1, tmpAddr);
5266 load64shorts(vs2, zetas);
5267 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5268 __ add(tmpAddr, coeffs, 128);
5269 load64shorts(vs1, tmpAddr);
5270 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5271 vs_addv(vs1, __ T8H, vs1, vs2);
5272 __ add(tmpAddr, coeffs, 128);
5273 store64shorts(vs1, tmpAddr);
5274 __ add(tmpAddr, coeffs, 384);
5275 store64shorts(vs3, tmpAddr);
5276
5277 // level 1
5278 // restore montmul constants
5279 vs_ldpq(vq, kyberConsts);
5280 __ add(tmpAddr, coeffs, 128);
5281 load64shorts(vs1, tmpAddr);
5282 load64shorts(vs2, zetas);
5283 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5284 __ add(tmpAddr, coeffs, 0);
5285 load64shorts(vs1, tmpAddr);
5286 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5287 vs_addv(vs1, __ T8H, vs1, vs2);
5288 __ add(tmpAddr, coeffs, 0);
5289 store64shorts(vs1, tmpAddr);
5290 store64shorts(vs3, tmpAddr);
5291 vs_ldpq(vq, kyberConsts);
5292 __ add(tmpAddr, coeffs, 384);
5293 load64shorts(vs1, tmpAddr);
5294 load64shorts(vs2, zetas);
5295 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5296 __ add(tmpAddr, coeffs, 256);
5297 load64shorts(vs1, tmpAddr);
5298 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5299 vs_addv(vs1, __ T8H, vs1, vs2);
5300 __ add(tmpAddr, coeffs, 256);
5301 store64shorts(vs1, tmpAddr);
5302 store64shorts(vs3, tmpAddr);
5303
5304 // level 2
5305 vs_ldpq(vq, kyberConsts);
5306 int offsets1[4] = { 0, 32, 128, 160 };
5307 vs_ldpq_indexed(vs1, coeffs, 64, offsets1);
5308 load64shorts(vs2, zetas);
5309 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5310 vs_ldpq_indexed(vs1, coeffs, 0, offsets1);
5311 // kyber_subv_addv64();
5312 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5313 vs_addv(vs1, __ T8H, vs1, vs2);
5314 __ add(tmpAddr, coeffs, 0);
5315 vs_stpq_post(vs_front(vs1), tmpAddr);
5316 vs_stpq_post(vs_front(vs3), tmpAddr);
5317 vs_stpq_post(vs_back(vs1), tmpAddr);
5318 vs_stpq_post(vs_back(vs3), tmpAddr);
5319 vs_ldpq(vq, kyberConsts);
5320 vs_ldpq_indexed(vs1, tmpAddr, 64, offsets1);
5321 load64shorts(vs2, zetas);
5322 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5323 vs_ldpq_indexed(vs1, coeffs, 256, offsets1);
5324 // kyber_subv_addv64();
5325 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5326 vs_addv(vs1, __ T8H, vs1, vs2);
5327 __ add(tmpAddr, coeffs, 256);
5328 vs_stpq_post(vs_front(vs1), tmpAddr);
5329 vs_stpq_post(vs_front(vs3), tmpAddr);
5330 vs_stpq_post(vs_back(vs1), tmpAddr);
5331 vs_stpq_post(vs_back(vs3), tmpAddr);
5332
5333 // level 3
5334 vs_ldpq(vq, kyberConsts);
5335 int offsets2[4] = { 0, 64, 128, 192 };
5336 vs_ldpq_indexed(vs1, coeffs, 32, offsets2);
5337 load64shorts(vs2, zetas);
5338 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5339 vs_ldpq_indexed(vs1, coeffs, 0, offsets2);
5340 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5341 vs_addv(vs1, __ T8H, vs1, vs2);
5342 vs_stpq_indexed(vs1, coeffs, 0, offsets2);
5343 vs_stpq_indexed(vs3, coeffs, 32, offsets2);
5344
5345 vs_ldpq(vq, kyberConsts);
5346 vs_ldpq_indexed(vs1, coeffs, 256 + 32, offsets2);
5347 load64shorts(vs2, zetas);
5348 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5349 vs_ldpq_indexed(vs1, coeffs, 256, offsets2);
5350 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5351 vs_addv(vs1, __ T8H, vs1, vs2);
5352 vs_stpq_indexed(vs1, coeffs, 256, offsets2);
5353 vs_stpq_indexed(vs3, coeffs, 256 + 32, offsets2);
5354
5355 // level 4
5356 // At level 4 coefficients occur in 8 discrete blocks of size 16
5357 // so they are loaded using an ldr at 8 distinct offsets.
5358
5359 vs_ldpq(vq, kyberConsts);
5360 int offsets3[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
5361 vs_ldr_indexed(vs1, __ Q, coeffs, 16, offsets3);
5362 load64shorts(vs2, zetas);
5363 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5364 vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
5365 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5366 vs_addv(vs1, __ T8H, vs1, vs2);
5367 vs_str_indexed(vs1, __ Q, coeffs, 0, offsets3);
5368 vs_str_indexed(vs3, __ Q, coeffs, 16, offsets3);
5369
5370 vs_ldpq(vq, kyberConsts);
5371 vs_ldr_indexed(vs1, __ Q, coeffs, 256 + 16, offsets3);
5372 load64shorts(vs2, zetas);
5373 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5374 vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3);
5375 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5376 vs_addv(vs1, __ T8H, vs1, vs2);
5377 vs_str_indexed(vs1, __ Q, coeffs, 256, offsets3);
5378 vs_str_indexed(vs3, __ Q, coeffs, 256 + 16, offsets3);
5379
5380 // level 5
5381 // At level 5 related coefficients occur in discrete blocks of size 8 so they
5382 // need to be loaded interleaved using an ld2 operation with arrangement 2D.
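// (Reading aid, not generated code: an ld2 load with arrangement 2D
// de-interleaves alternate doublewords, i.e. alternate blocks of 4
// shorts. For each of the indexed ld2 loads below the assumed mapping
// is
//
//   memory:      d0 d1 d2 d3          (each d_i is 8 bytes = 4 shorts)
//   vs1[2*i]   <- { d0, d2 }          (leading block of each pair)
//   vs1[2*i+1] <- { d1, d3 }          (trailing block of each pair)
//
// so vs_even(vs1) collects the blocks that only take part in the
// add/sub while vs_odd(vs1) collects the blocks that get montmul'ed
// by the zetas.)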
5383 5384 vs_ldpq(vq, kyberConsts); 5385 int offsets4[4] = { 0, 32, 64, 96 }; 5386 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4); 5387 load32shorts(vs_front(vs2), zetas); 5388 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq); 5389 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4); 5390 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4); 5391 load32shorts(vs_front(vs2), zetas); 5392 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq); 5393 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4); 5394 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4); 5395 load32shorts(vs_front(vs2), zetas); 5396 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq); 5397 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4); 5398 5399 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4); 5400 load32shorts(vs_front(vs2), zetas); 5401 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq); 5402 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4); 5403 5404 // level 6 5405 // At level 6 related coefficients occur in discrete blocks of size 4 so 5406 // need to be loaded interleaved using an ld2 operation with arrangement 4S. 5407 5408 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4); 5409 load32shorts(vs_front(vs2), zetas); 5410 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq); 5411 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4); 5412 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4); 5413 // __ ldpq(v18, v19, __ post(zetas, 32)); 5414 load32shorts(vs_front(vs2), zetas); 5415 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq); 5416 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4); 5417 5418 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4); 5419 load32shorts(vs_front(vs2), zetas); 5420 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq); 5421 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4); 5422 5423 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4); 5424 load32shorts(vs_front(vs2), zetas); 5425 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq); 5426 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4); 5427 5428 __ leave(); // required for proper stackwalking of RuntimeStub frame 5429 __ mov(r0, zr); // return 0 5430 __ ret(lr); 5431 5432 return start; 5433 } 5434 5435 // Kyber Inverse NTT function 5436 // Implements 5437 // static int implKyberInverseNtt(short[] poly, short[] zetas) {} 5438 // 5439 // coeffs (short[256]) = c_rarg0 5440 // ntt_zetas (short[256]) = c_rarg1 5441 address generate_kyberInverseNtt() { 5442 5443 __ align(CodeEntryAlignment); 5444 StubGenStubId stub_id = StubGenStubId::kyberInverseNtt_id; 5445 StubCodeMark mark(this, stub_id); 5446 address start = __ pc(); 5447 __ enter(); 5448 5449 const Register coeffs = c_rarg0; 5450 const Register zetas = c_rarg1; 5451 5452 const Register kyberConsts = r10; 5453 const Register tmpAddr = r11; 5454 const Register tmpAddr2 = c_rarg2; 5455 5456 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x8H inputs/outputs 5457 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3 5458 VSeq<2> vq(30); // n.b. 
constants overlap vs3
5459
5460 __ lea(kyberConsts,
5461 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
5462
5463 // level 0
5464 // At level 0 related coefficients occur in discrete blocks of size 4 so they
5465 // need to be loaded interleaved using an ld2 operation with arrangement 4S.
5466
5467 vs_ldpq(vq, kyberConsts);
5468 int offsets4[4] = { 0, 32, 64, 96 };
5469 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
5470 load32shorts(vs_front(vs2), zetas);
5471 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
5472 vs_front(vs2), vs_back(vs2), vtmp, vq);
5473 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
5474 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
5475 load32shorts(vs_front(vs2), zetas);
5476 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
5477 vs_front(vs2), vs_back(vs2), vtmp, vq);
5478 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
5479 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
5480 load32shorts(vs_front(vs2), zetas);
5481 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
5482 vs_front(vs2), vs_back(vs2), vtmp, vq);
5483 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
5484 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
5485 load32shorts(vs_front(vs2), zetas);
5486 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
5487 vs_front(vs2), vs_back(vs2), vtmp, vq);
5488 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
5489
5490 // level 1
5491 // At level 1 related coefficients occur in discrete blocks of size 8 so they
5492 // need to be loaded interleaved using an ld2 operation with arrangement 2D.
5493
5494 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
5495 load32shorts(vs_front(vs2), zetas);
5496 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
5497 vs_front(vs2), vs_back(vs2), vtmp, vq);
5498 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
5499 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
5500 load32shorts(vs_front(vs2), zetas);
5501 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
5502 vs_front(vs2), vs_back(vs2), vtmp, vq);
5503 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
5504
5505 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
5506 load32shorts(vs_front(vs2), zetas);
5507 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
5508 vs_front(vs2), vs_back(vs2), vtmp, vq);
5509 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
5510 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
5511 load32shorts(vs_front(vs2), zetas);
5512 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
5513 vs_front(vs2), vs_back(vs2), vtmp, vq);
5514 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
5515
5516 // level 2
5517 // At level 2 coefficients occur in 8 discrete blocks of size 16
5518 // so they are loaded using an ldr at 8 distinct offsets.
5519
5520 int offsets3[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
5521 vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
5522 vs_ldr_indexed(vs2, __ Q, coeffs, 16, offsets3);
5523 vs_addv(vs3, __ T8H, vs1, vs2); // n.b.
trashes vq 5524 vs_subv(vs1, __ T8H, vs1, vs2); 5525 vs_str_indexed(vs3, __ Q, coeffs, 0, offsets3); 5526 load64shorts(vs2, zetas); 5527 vs_ldpq(vq, kyberConsts); 5528 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5529 vs_str_indexed(vs2, __ Q, coeffs, 16, offsets3); 5530 5531 vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3); 5532 vs_ldr_indexed(vs2, __ Q, coeffs, 256 + 16, offsets3); 5533 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5534 vs_subv(vs1, __ T8H, vs1, vs2); 5535 vs_str_indexed(vs3, __ Q, coeffs, 256, offsets3); 5536 load64shorts(vs2, zetas); 5537 vs_ldpq(vq, kyberConsts); 5538 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5539 vs_str_indexed(vs2, __ Q, coeffs, 256 + 16, offsets3); 5540 5541 // Barrett reduction at indexes where overflow may happen 5542 5543 // load q and the multiplier for the Barrett reduction 5544 __ add(tmpAddr, kyberConsts, 16); 5545 vs_ldpq(vq, tmpAddr); 5546 5547 VSeq<8> vq1 = VSeq<8>(vq[0], 0); // 2 constant 8 sequences 5548 VSeq<8> vq2 = VSeq<8>(vq[1], 0); // for above two kyber constants 5549 VSeq<8> vq3 = VSeq<8>(v29, 0); // 3rd sequence for const montmul 5550 vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3); 5551 vs_sqdmulh(vs2, __ T8H, vs1, vq2); 5552 vs_sshr(vs2, __ T8H, vs2, 11); 5553 vs_mlsv(vs1, __ T8H, vs2, vq1); 5554 vs_str_indexed(vs1, __ Q, coeffs, 0, offsets3); 5555 vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3); 5556 vs_sqdmulh(vs2, __ T8H, vs1, vq2); 5557 vs_sshr(vs2, __ T8H, vs2, 11); 5558 vs_mlsv(vs1, __ T8H, vs2, vq1); 5559 vs_str_indexed(vs1, __ Q, coeffs, 256, offsets3); 5560 5561 // level 3 5562 // From level 3 upwards coefficients occur in discrete blocks whose size is 5563 // some multiple of 32 so can be loaded using ldpq and suitable indexes. 5564 5565 int offsets2[4] = { 0, 64, 128, 192 }; 5566 vs_ldpq_indexed(vs1, coeffs, 0, offsets2); 5567 vs_ldpq_indexed(vs2, coeffs, 32, offsets2); 5568 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5569 vs_subv(vs1, __ T8H, vs1, vs2); 5570 vs_stpq_indexed(vs3, coeffs, 0, offsets2); 5571 load64shorts(vs2, zetas); 5572 vs_ldpq(vq, kyberConsts); 5573 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5574 vs_stpq_indexed(vs2, coeffs, 32, offsets2); 5575 5576 vs_ldpq_indexed(vs1, coeffs, 256, offsets2); 5577 vs_ldpq_indexed(vs2, coeffs, 256 + 32, offsets2); 5578 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5579 vs_subv(vs1, __ T8H, vs1, vs2); 5580 vs_stpq_indexed(vs3, coeffs, 256, offsets2); 5581 load64shorts(vs2, zetas); 5582 vs_ldpq(vq, kyberConsts); 5583 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5584 vs_stpq_indexed(vs2, coeffs, 256 + 32, offsets2); 5585 5586 // level 4 5587 5588 int offsets1[4] = { 0, 32, 128, 160 }; 5589 vs_ldpq_indexed(vs1, coeffs, 0, offsets1); 5590 vs_ldpq_indexed(vs2, coeffs, 64, offsets1); 5591 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5592 vs_subv(vs1, __ T8H, vs1, vs2); 5593 vs_stpq_indexed(vs3, coeffs, 0, offsets1); 5594 load64shorts(vs2, zetas); 5595 vs_ldpq(vq, kyberConsts); 5596 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5597 vs_stpq_indexed(vs2, coeffs, 64, offsets1); 5598 5599 vs_ldpq_indexed(vs1, coeffs, 256, offsets1); 5600 vs_ldpq_indexed(vs2, coeffs, 256 + 64, offsets1); 5601 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. 
trashes vq 5602 vs_subv(vs1, __ T8H, vs1, vs2); 5603 vs_stpq_indexed(vs3, coeffs, 256, offsets1); 5604 load64shorts(vs2, zetas); 5605 vs_ldpq(vq, kyberConsts); 5606 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5607 vs_stpq_indexed(vs2, coeffs, 256 + 64, offsets1); 5608 5609 // level 5 5610 5611 __ add(tmpAddr, coeffs, 0); 5612 load64shorts(vs1, tmpAddr); 5613 __ add(tmpAddr, coeffs, 128); 5614 load64shorts(vs2, tmpAddr); 5615 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5616 vs_subv(vs1, __ T8H, vs1, vs2); 5617 __ add(tmpAddr, coeffs, 0); 5618 store64shorts(vs3, tmpAddr); 5619 load64shorts(vs2, zetas); 5620 vs_ldpq(vq, kyberConsts); 5621 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5622 __ add(tmpAddr, coeffs, 128); 5623 store64shorts(vs2, tmpAddr); 5624 5625 load64shorts(vs1, tmpAddr); 5626 __ add(tmpAddr, coeffs, 384); 5627 load64shorts(vs2, tmpAddr); 5628 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5629 vs_subv(vs1, __ T8H, vs1, vs2); 5630 __ add(tmpAddr, coeffs, 256); 5631 store64shorts(vs3, tmpAddr); 5632 load64shorts(vs2, zetas); 5633 vs_ldpq(vq, kyberConsts); 5634 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5635 __ add(tmpAddr, coeffs, 384); 5636 store64shorts(vs2, tmpAddr); 5637 5638 // Barrett reduction at indexes where overflow may happen 5639 5640 // load q and the multiplier for the Barrett reduction 5641 __ add(tmpAddr, kyberConsts, 16); 5642 vs_ldpq(vq, tmpAddr); 5643 5644 int offsets0[2] = { 0, 256 }; 5645 vs_ldpq_indexed(vs_front(vs1), coeffs, 0, offsets0); 5646 vs_sqdmulh(vs2, __ T8H, vs1, vq2); 5647 vs_sshr(vs2, __ T8H, vs2, 11); 5648 vs_mlsv(vs1, __ T8H, vs2, vq1); 5649 vs_stpq_indexed(vs_front(vs1), coeffs, 0, offsets0); 5650 5651 // level 6 5652 5653 __ add(tmpAddr, coeffs, 0); 5654 load64shorts(vs1, tmpAddr); 5655 __ add(tmpAddr, coeffs, 256); 5656 load64shorts(vs2, tmpAddr); 5657 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5658 vs_subv(vs1, __ T8H, vs1, vs2); 5659 __ add(tmpAddr, coeffs, 0); 5660 store64shorts(vs3, tmpAddr); 5661 load64shorts(vs2, zetas); 5662 vs_ldpq(vq, kyberConsts); 5663 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5664 __ add(tmpAddr, coeffs, 256); 5665 store64shorts(vs2, tmpAddr); 5666 5667 __ add(tmpAddr, coeffs, 128); 5668 load64shorts(vs1, tmpAddr); 5669 __ add(tmpAddr, coeffs, 384); 5670 load64shorts(vs2, tmpAddr); 5671 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. 
trashes vq 5672 vs_subv(vs1, __ T8H, vs1, vs2); 5673 __ add(tmpAddr, coeffs, 128); 5674 store64shorts(vs3, tmpAddr); 5675 load64shorts(vs2, zetas); 5676 vs_ldpq(vq, kyberConsts); 5677 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5678 __ add(tmpAddr, coeffs, 384); 5679 store64shorts(vs2, tmpAddr); 5680 5681 // multiply by 2^-n 5682 5683 // load toMont(2^-n mod q) 5684 __ add(tmpAddr, kyberConsts, 48); 5685 __ ldr(v29, __ Q, tmpAddr); 5686 5687 vs_ldpq(vq, kyberConsts); 5688 __ add(tmpAddr, coeffs, 0); 5689 load64shorts(vs1, tmpAddr); 5690 kyber_montmul64(vs2, vs1, vq3, vtmp, vq); 5691 __ add(tmpAddr, coeffs, 0); 5692 store64shorts(vs2, tmpAddr); 5693 5694 // now tmpAddr contains coeffs + 128 because store64shorts adjusted it so 5695 load64shorts(vs1, tmpAddr); 5696 kyber_montmul64(vs2, vs1, vq3, vtmp, vq); 5697 __ add(tmpAddr, coeffs, 128); 5698 store64shorts(vs2, tmpAddr); 5699 5700 // now tmpAddr contains coeffs + 256 5701 load64shorts(vs1, tmpAddr); 5702 kyber_montmul64(vs2, vs1, vq3, vtmp, vq); 5703 __ add(tmpAddr, coeffs, 256); 5704 store64shorts(vs2, tmpAddr); 5705 5706 // now tmpAddr contains coeffs + 384 5707 load64shorts(vs1, tmpAddr); 5708 kyber_montmul64(vs2, vs1, vq3, vtmp, vq); 5709 __ add(tmpAddr, coeffs, 384); 5710 store64shorts(vs2, tmpAddr); 5711 5712 __ leave(); // required for proper stackwalking of RuntimeStub frame 5713 __ mov(r0, zr); // return 0 5714 __ ret(lr); 5715 5716 return start; 5717 } 5718 5719 // Kyber multiply polynomials in the NTT domain. 5720 // Implements 5721 // static int implKyberNttMult( 5722 // short[] result, short[] ntta, short[] nttb, short[] zetas) {} 5723 // 5724 // result (short[256]) = c_rarg0 5725 // ntta (short[256]) = c_rarg1 5726 // nttb (short[256]) = c_rarg2 5727 // zetas (short[128]) = c_rarg3 5728 address generate_kyberNttMult() { 5729 5730 __ align(CodeEntryAlignment); 5731 StubGenStubId stub_id = StubGenStubId::kyberNttMult_id; 5732 StubCodeMark mark(this, stub_id); 5733 address start = __ pc(); 5734 __ enter(); 5735 5736 const Register result = c_rarg0; 5737 const Register ntta = c_rarg1; 5738 const Register nttb = c_rarg2; 5739 const Register zetas = c_rarg3; 5740 5741 const Register kyberConsts = r10; 5742 const Register limit = r11; 5743 5744 VSeq<4> vs1(0), vs2(4); // 4 sets of 8x8H inputs/outputs/tmps 5745 VSeq<4> vs3(16), vs4(20); 5746 VSeq<2> vq(30); // pair of constants for montmul: q, qinv 5747 VSeq<2> vz(28); // pair of zetas 5748 VSeq<4> vc(27, 0); // constant sequence for montmul: montRSquareModQ 5749 5750 __ lea(kyberConsts, 5751 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts)); 5752 5753 Label kyberNttMult_loop; 5754 5755 __ add(limit, result, 512); 5756 5757 // load q and qinv 5758 vs_ldpq(vq, kyberConsts); 5759 5760 // load R^2 mod q (to convert back from Montgomery representation) 5761 __ add(kyberConsts, kyberConsts, 64); 5762 __ ldr(v27, __ Q, kyberConsts); 5763 5764 __ BIND(kyberNttMult_loop); 5765 5766 // load 16 zetas 5767 vs_ldpq_post(vz, zetas); 5768 5769 // load 2 sets of 32 coefficients from the two input arrays 5770 // interleaved as shorts. i.e. pairs of shorts adjacent in memory 5771 // are striped across pairs of vector registers 5772 vs_ld2_post(vs_front(vs1), __ T8H, ntta); // <a0, a1> x 8H 5773 vs_ld2_post(vs_back(vs1), __ T8H, nttb); // <b0, b1> x 8H 5774 vs_ld2_post(vs_front(vs4), __ T8H, ntta); // <a2, a3> x 8H 5775 vs_ld2_post(vs_back(vs4), __ T8H, nttb); // <b2, b3> x 8H 5776 5777 // compute 4 montmul cross-products for pairs (a0,a1) and (b0,b1) 5778 // i.e. 
montmul the first and second halves of vs1 in order and 5779 // then with one sequence reversed storing the two results in vs3 5780 // 5781 // vs3[0] <- montmul(a0, b0) 5782 // vs3[1] <- montmul(a1, b1) 5783 // vs3[2] <- montmul(a0, b1) 5784 // vs3[3] <- montmul(a1, b0) 5785 kyber_montmul16(vs_front(vs3), vs_front(vs1), vs_back(vs1), vs_front(vs2), vq); 5786 kyber_montmul16(vs_back(vs3), 5787 vs_front(vs1), vs_reverse(vs_back(vs1)), vs_back(vs2), vq); 5788 5789 // compute 4 montmul cross-products for pairs (a2,a3) and (b2,b3) 5790 // i.e. montmul the first and second halves of vs4 in order and 5791 // then with one sequence reversed storing the two results in vs1 5792 // 5793 // vs1[0] <- montmul(a2, b2) 5794 // vs1[1] <- montmul(a3, b3) 5795 // vs1[2] <- montmul(a2, b3) 5796 // vs1[3] <- montmul(a3, b2) 5797 kyber_montmul16(vs_front(vs1), vs_front(vs4), vs_back(vs4), vs_front(vs2), vq); 5798 kyber_montmul16(vs_back(vs1), 5799 vs_front(vs4), vs_reverse(vs_back(vs4)), vs_back(vs2), vq); 5800 5801 // montmul result 2 of each cross-product i.e. (a1*b1, a3*b3) by a zeta. 5802 // We can schedule two montmuls at a time if we use a suitable vector 5803 // sequence <vs3[1], vs1[1]>. 5804 int delta = vs1[1]->encoding() - vs3[1]->encoding(); 5805 VSeq<2> vs5(vs3[1], delta); 5806 5807 // vs3[1] <- montmul(montmul(a1, b1), z0) 5808 // vs1[1] <- montmul(montmul(a3, b3), z1) 5809 kyber_montmul16(vs5, vz, vs5, vs_front(vs2), vq); 5810 5811 // add results in pairs storing in vs3 5812 // vs3[0] <- montmul(a0, b0) + montmul(montmul(a1, b1), z0); 5813 // vs3[1] <- montmul(a0, b1) + montmul(a1, b0); 5814 vs_addv(vs_front(vs3), __ T8H, vs_even(vs3), vs_odd(vs3)); 5815 5816 // vs3[2] <- montmul(a2, b2) + montmul(montmul(a3, b3), z1); 5817 // vs3[3] <- montmul(a2, b3) + montmul(a3, b2); 5818 vs_addv(vs_back(vs3), __ T8H, vs_even(vs1), vs_odd(vs1)); 5819 5820 // vs1 <- montmul(vs3, montRSquareModQ) 5821 kyber_montmul32(vs1, vs3, vc, vs2, vq); 5822 5823 // store back the two pairs of result vectors de-interleaved as 8H elements 5824 // i.e. storing each pairs of shorts striped across a register pair adjacent 5825 // in memory 5826 vs_st2_post(vs1, __ T8H, result); 5827 5828 __ cmp(result, limit); 5829 __ br(Assembler::NE, kyberNttMult_loop); 5830 5831 __ leave(); // required for proper stackwalking of RuntimeStub frame 5832 __ mov(r0, zr); // return 0 5833 __ ret(lr); 5834 5835 return start; 5836 } 5837 5838 // Kyber add 2 polynomials. 5839 // Implements 5840 // static int implKyberAddPoly(short[] result, short[] a, short[] b) {} 5841 // 5842 // result (short[256]) = c_rarg0 5843 // a (short[256]) = c_rarg1 5844 // b (short[256]) = c_rarg2 5845 address generate_kyberAddPoly_2() { 5846 5847 __ align(CodeEntryAlignment); 5848 StubGenStubId stub_id = StubGenStubId::kyberAddPoly_2_id; 5849 StubCodeMark mark(this, stub_id); 5850 address start = __ pc(); 5851 __ enter(); 5852 5853 const Register result = c_rarg0; 5854 const Register a = c_rarg1; 5855 const Register b = c_rarg2; 5856 5857 const Register kyberConsts = r11; 5858 5859 // We sum 256 sets of values in total i.e. 32 x 8H quadwords. 5860 // So, we can load, add and store the data in 3 groups of 11, 5861 // 11 and 10 at a time i.e. we need to map sets of 10 or 11 5862 // registers. A further constraint is that the mapping needs 5863 // to skip callee saves. So, we allocate the register 5864 // sequences using two 8 sequences, two 2 sequences and two 5865 // single registers. 
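// To make the grouping concrete (a reading aid only): each of the
// three loop iterations below moves 8 quadwords through vs1_1/vs2_1,
// 2 through vs1_2/vs2_2 and, on the first two iterations only, one
// more through vs1_3/vs2_3, i.e. 11 + 11 + 10 = 32 quadwords = 256
// shorts. Per 16-bit lane the effect is the scalar sum (sketch only)
//
//   result[i] = (short)(a[i] + b[i] + q);
//
// where q is the constant loaded into v31 from the constants table
// below.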
5866 VSeq<8> vs1_1(0); 5867 VSeq<2> vs1_2(16); 5868 FloatRegister vs1_3 = v28; 5869 VSeq<8> vs2_1(18); 5870 VSeq<2> vs2_2(26); 5871 FloatRegister vs2_3 = v29; 5872 5873 // two constant vector sequences 5874 VSeq<8> vc_1(31, 0); 5875 VSeq<2> vc_2(31, 0); 5876 5877 FloatRegister vc_3 = v31; 5878 __ lea(kyberConsts, 5879 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts)); 5880 5881 __ ldr(vc_3, __ Q, Address(kyberConsts, 16)); // q 5882 for (int i = 0; i < 3; i++) { 5883 // load 80 or 88 values from a into vs1_1/2/3 5884 vs_ldpq_post(vs1_1, a); 5885 vs_ldpq_post(vs1_2, a); 5886 if (i < 2) { 5887 __ ldr(vs1_3, __ Q, __ post(a, 16)); 5888 } 5889 // load 80 or 88 values from b into vs2_1/2/3 5890 vs_ldpq_post(vs2_1, b); 5891 vs_ldpq_post(vs2_2, b); 5892 if (i < 2) { 5893 __ ldr(vs2_3, __ Q, __ post(b, 16)); 5894 } 5895 // sum 80 or 88 values across vs1 and vs2 into vs1 5896 vs_addv(vs1_1, __ T8H, vs1_1, vs2_1); 5897 vs_addv(vs1_2, __ T8H, vs1_2, vs2_2); 5898 if (i < 2) { 5899 __ addv(vs1_3, __ T8H, vs1_3, vs2_3); 5900 } 5901 // add constant to all 80 or 88 results 5902 vs_addv(vs1_1, __ T8H, vs1_1, vc_1); 5903 vs_addv(vs1_2, __ T8H, vs1_2, vc_2); 5904 if (i < 2) { 5905 __ addv(vs1_3, __ T8H, vs1_3, vc_3); 5906 } 5907 // store 80 or 88 values 5908 vs_stpq_post(vs1_1, result); 5909 vs_stpq_post(vs1_2, result); 5910 if (i < 2) { 5911 __ str(vs1_3, __ Q, __ post(result, 16)); 5912 } 5913 } 5914 5915 __ leave(); // required for proper stackwalking of RuntimeStub frame 5916 __ mov(r0, zr); // return 0 5917 __ ret(lr); 5918 5919 return start; 5920 } 5921 5922 // Kyber add 3 polynomials. 5923 // Implements 5924 // static int implKyberAddPoly(short[] result, short[] a, short[] b, short[] c) {} 5925 // 5926 // result (short[256]) = c_rarg0 5927 // a (short[256]) = c_rarg1 5928 // b (short[256]) = c_rarg2 5929 // c (short[256]) = c_rarg3 5930 address generate_kyberAddPoly_3() { 5931 5932 __ align(CodeEntryAlignment); 5933 StubGenStubId stub_id = StubGenStubId::kyberAddPoly_3_id; 5934 StubCodeMark mark(this, stub_id); 5935 address start = __ pc(); 5936 __ enter(); 5937 5938 const Register result = c_rarg0; 5939 const Register a = c_rarg1; 5940 const Register b = c_rarg2; 5941 const Register c = c_rarg3; 5942 5943 const Register kyberConsts = r11; 5944 5945 // As above we sum 256 sets of values in total i.e. 32 x 8H 5946 // quadwords. So, we can load, add and store the data in 3 5947 // groups of 11, 11 and 10 at a time i.e. we need to map sets 5948 // of 10 or 11 registers. A further constraint is that the 5949 // mapping needs to skip callee saves. So, we allocate the 5950 // register sequences using two 8 sequences, two 2 sequences 5951 // and two single registers. 
5952 VSeq<8> vs1_1(0); 5953 VSeq<2> vs1_2(16); 5954 FloatRegister vs1_3 = v28; 5955 VSeq<8> vs2_1(18); 5956 VSeq<2> vs2_2(26); 5957 FloatRegister vs2_3 = v29; 5958 5959 // two constant vector sequences 5960 VSeq<8> vc_1(31, 0); 5961 VSeq<2> vc_2(31, 0); 5962 5963 FloatRegister vc_3 = v31; 5964 5965 __ lea(kyberConsts, 5966 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts)); 5967 5968 __ ldr(vc_3, __ Q, Address(kyberConsts, 16)); // q 5969 for (int i = 0; i < 3; i++) { 5970 // load 80 or 88 values from a into vs1_1/2/3 5971 vs_ldpq_post(vs1_1, a); 5972 vs_ldpq_post(vs1_2, a); 5973 if (i < 2) { 5974 __ ldr(vs1_3, __ Q, __ post(a, 16)); 5975 } 5976 // load 80 or 88 values from b into vs2_1/2/3 5977 vs_ldpq_post(vs2_1, b); 5978 vs_ldpq_post(vs2_2, b); 5979 if (i < 2) { 5980 __ ldr(vs2_3, __ Q, __ post(b, 16)); 5981 } 5982 // sum 80 or 88 values across vs1 and vs2 into vs1 5983 vs_addv(vs1_1, __ T8H, vs1_1, vs2_1); 5984 vs_addv(vs1_2, __ T8H, vs1_2, vs2_2); 5985 if (i < 2) { 5986 __ addv(vs1_3, __ T8H, vs1_3, vs2_3); 5987 } 5988 // load 80 or 88 values from c into vs2_1/2/3 5989 vs_ldpq_post(vs2_1, c); 5990 vs_ldpq_post(vs2_2, c); 5991 if (i < 2) { 5992 __ ldr(vs2_3, __ Q, __ post(c, 16)); 5993 } 5994 // sum 80 or 88 values across vs1 and vs2 into vs1 5995 vs_addv(vs1_1, __ T8H, vs1_1, vs2_1); 5996 vs_addv(vs1_2, __ T8H, vs1_2, vs2_2); 5997 if (i < 2) { 5998 __ addv(vs1_3, __ T8H, vs1_3, vs2_3); 5999 } 6000 // add constant to all 80 or 88 results 6001 vs_addv(vs1_1, __ T8H, vs1_1, vc_1); 6002 vs_addv(vs1_2, __ T8H, vs1_2, vc_2); 6003 if (i < 2) { 6004 __ addv(vs1_3, __ T8H, vs1_3, vc_3); 6005 } 6006 // store 80 or 88 values 6007 vs_stpq_post(vs1_1, result); 6008 vs_stpq_post(vs1_2, result); 6009 if (i < 2) { 6010 __ str(vs1_3, __ Q, __ post(result, 16)); 6011 } 6012 } 6013 6014 __ leave(); // required for proper stackwalking of RuntimeStub frame 6015 __ mov(r0, zr); // return 0 6016 __ ret(lr); 6017 6018 return start; 6019 } 6020 6021 // Kyber parse XOF output to polynomial coefficient candidates 6022 // or decodePoly(12, ...). 6023 // Implements 6024 // static int implKyber12To16( 6025 // byte[] condensed, int index, short[] parsed, int parsedLength) {} 6026 // 6027 // (parsedLength or (parsedLength - 48) must be divisible by 64.) 6028 // 6029 // condensed (byte[]) = c_rarg0 6030 // condensedIndex = c_rarg1 6031 // parsed (short[112 or 256]) = c_rarg2 6032 // parsedLength (112 or 256) = c_rarg3 6033 address generate_kyber12To16() { 6034 Label L_F00, L_loop, L_end; 6035 6036 __ BIND(L_F00); 6037 __ emit_int64(0x0f000f000f000f00); 6038 __ emit_int64(0x0f000f000f000f00); 6039 6040 __ align(CodeEntryAlignment); 6041 StubGenStubId stub_id = StubGenStubId::kyber12To16_id; 6042 StubCodeMark mark(this, stub_id); 6043 address start = __ pc(); 6044 __ enter(); 6045 6046 const Register condensed = c_rarg0; 6047 const Register condensedOffs = c_rarg1; 6048 const Register parsed = c_rarg2; 6049 const Register parsedLength = c_rarg3; 6050 6051 const Register tmpAddr = r11; 6052 6053 // Data is input 96 bytes at a time i.e. in groups of 6 x 16B 6054 // quadwords so we need a 6 vector sequence for the inputs. 6055 // Parsing produces 64 shorts, employing two 8 vector 6056 // sequences to store and combine the intermediate data. 
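// (Reading aid, not generated code: vs_ld3_post with arrangement 16B
// performs LD3 structure loads, so each group of 48 input bytes is
// de-interleaved by its position within a 3-byte group:
//
//   vin[0] <- bytes 0, 3, 6, ...   (b0 of each group)
//   vin[1] <- bytes 1, 4, 7, ...   (b1 of each group)
//   vin[2] <- bytes 2, 5, 8, ...   (b2 of each group)
//
// and likewise vin[3..5] for the next 48 bytes.)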
6057 VSeq<6> vin(24);
6058 VSeq<8> va(0), vb(16);
6059
6060 __ adr(tmpAddr, L_F00);
6061 __ ldr(v31, __ Q, tmpAddr); // 8H times 0x0f00
6062 __ add(condensed, condensed, condensedOffs);
6063
6064 __ BIND(L_loop);
6065 // load 96 (6 x 16B) byte values
6066 vs_ld3_post(vin, __ T16B, condensed);
6067
6068 // The front half of sequence vin (vin[0], vin[1] and vin[2])
6069 // holds 48 (16x3) contiguous bytes from memory striped
6070 // horizontally across each of the 16 byte lanes. Equivalently,
6071 // that is 16 pairs of 12-bit integers. Likewise the back half
6072 // holds the next 48 bytes in the same arrangement.
6073
6074 // Each vector in the front half can also be viewed as a vertical
6075 // strip across the 16 pairs of 12 bit integers. Each byte in
6076 // vin[0] stores the low 8 bits of the first int in a pair. Each
6077 // byte in vin[1] stores the high 4 bits of the first int and the
6078 // low 4 bits of the second int. Each byte in vin[2] stores the
6079 // high 8 bits of the second int. Likewise the vectors in the
6080 // second half.
6081
6082 // Converting the data to 16-bit shorts requires first of all
6083 // expanding each of the 6 x 16B vectors into 6 corresponding
6084 // pairs of 8H vectors. Mask, shift and add operations on the
6085 // resulting vector pairs can be used to combine 4 and 8 bit
6086 // parts of related 8H vector elements.
6087 //
6088 // The middle vectors (vin[1] and vin[4]) are actually expanded
6089 // twice, one copy manipulated to provide the low 4 bits of the
6090 // middle byte, which belong to the first short in a pair, and
6091 // another copy manipulated to provide its high 4 bits, which
6092 // belong to the second short in a pair. This is why the vector
6093 // sequences va and vb used to hold the expanded 8H elements are of length 8.
6094
6095 // Expand vin[0] into va[0:1], and vin[1] into va[2:3] and va[4:5]
6096 // n.b. target elements 2 and 3 duplicate elements 4 and 5
6097 __ ushll(va[0], __ T8H, vin[0], __ T8B, 0);
6098 __ ushll2(va[1], __ T8H, vin[0], __ T16B, 0);
6099 __ ushll(va[2], __ T8H, vin[1], __ T8B, 0);
6100 __ ushll2(va[3], __ T8H, vin[1], __ T16B, 0);
6101 __ ushll(va[4], __ T8H, vin[1], __ T8B, 0);
6102 __ ushll2(va[5], __ T8H, vin[1], __ T16B, 0);
6103
6104 // likewise expand vin[3] into vb[0:1], and vin[4] into vb[2:3]
6105 // and vb[4:5]
6106 __ ushll(vb[0], __ T8H, vin[3], __ T8B, 0);
6107 __ ushll2(vb[1], __ T8H, vin[3], __ T16B, 0);
6108 __ ushll(vb[2], __ T8H, vin[4], __ T8B, 0);
6109 __ ushll2(vb[3], __ T8H, vin[4], __ T16B, 0);
6110 __ ushll(vb[4], __ T8H, vin[4], __ T8B, 0);
6111 __ ushll2(vb[5], __ T8H, vin[4], __ T16B, 0);
6112
6113 // shift lo byte of copy 1 of the middle stripe into the high byte
6114 __ shl(va[2], __ T8H, va[2], 8);
6115 __ shl(va[3], __ T8H, va[3], 8);
6116 __ shl(vb[2], __ T8H, vb[2], 8);
6117 __ shl(vb[3], __ T8H, vb[3], 8);
6118
6119 // expand vin[2] into va[6:7] and vin[5] into vb[6:7] but this
6120 // time pre-shifted by 4 to ensure top bits of input 12-bit int
6121 // are in bit positions [4..11].
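// (Scalar sketch of the unpacking performed by the expands, shifts,
// masks and adds used here -- for reference only. Each byte triple
// (b0, b1, b2) yields two 12-bit values:
//
//   first  = b0 | ((b1 & 0x0f) << 8);   // lo 8 bits + hi 4 bits
//   second = (b1 >> 4) | (b2 << 4);     // lo 4 bits + hi 8 bits
//
// The masked copy of the middle stripe supplies (b1 & 0x0f) << 8 and
// the pre-shifted expansion of vin[2]/vin[5] supplies b2 << 4.)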
6122 __ ushll(va[6], __ T8H, vin[2], __ T8B, 4); 6123 __ ushll2(va[7], __ T8H, vin[2], __ T16B, 4); 6124 __ ushll(vb[6], __ T8H, vin[5], __ T8B, 4); 6125 __ ushll2(vb[7], __ T8H, vin[5], __ T16B, 4); 6126 6127 // mask hi 4 bits of the 1st 12-bit int in a pair from copy1 and 6128 // shift lo 4 bits of the 2nd 12-bit int in a pair to the bottom of 6129 // copy2 6130 __ andr(va[2], __ T16B, va[2], v31); 6131 __ andr(va[3], __ T16B, va[3], v31); 6132 __ ushr(va[4], __ T8H, va[4], 4); 6133 __ ushr(va[5], __ T8H, va[5], 4); 6134 __ andr(vb[2], __ T16B, vb[2], v31); 6135 __ andr(vb[3], __ T16B, vb[3], v31); 6136 __ ushr(vb[4], __ T8H, vb[4], 4); 6137 __ ushr(vb[5], __ T8H, vb[5], 4); 6138 6139 // sum hi 4 bits and lo 8 bits of the 1st 12-bit int in each pair and 6140 // hi 8 bits plus lo 4 bits of the 2nd 12-bit int in each pair 6141 // n.b. the ordering ensures: i) inputs are consumed before they 6142 // are overwritten ii) the order of 16-bit results across successive 6143 // pairs of vectors in va and then vb reflects the order of the 6144 // corresponding 12-bit inputs 6145 __ addv(va[0], __ T8H, va[0], va[2]); 6146 __ addv(va[2], __ T8H, va[1], va[3]); 6147 __ addv(va[1], __ T8H, va[4], va[6]); 6148 __ addv(va[3], __ T8H, va[5], va[7]); 6149 __ addv(vb[0], __ T8H, vb[0], vb[2]); 6150 __ addv(vb[2], __ T8H, vb[1], vb[3]); 6151 __ addv(vb[1], __ T8H, vb[4], vb[6]); 6152 __ addv(vb[3], __ T8H, vb[5], vb[7]); 6153 6154 // store 64 results interleaved as shorts 6155 vs_st2_post(vs_front(va), __ T8H, parsed); 6156 vs_st2_post(vs_front(vb), __ T8H, parsed); 6157 6158 __ sub(parsedLength, parsedLength, 64); 6159 __ cmp(parsedLength, (u1)64); 6160 __ br(Assembler::GE, L_loop); 6161 __ cbz(parsedLength, L_end); 6162 6163 // if anything is left it should be a final 72 bytes of input 6164 // i.e. a final 48 12-bit values. so we handle this by loading 6165 // 48 bytes into all 16B lanes of front(vin) and only 24 6166 // bytes into the lower 8B lane of back(vin) 6167 vs_ld3_post(vs_front(vin), __ T16B, condensed); 6168 vs_ld3(vs_back(vin), __ T8B, condensed); 6169 6170 // Expand vin[0] into va[0:1], and vin[1] into va[2:3] and va[4:5] 6171 // n.b. target elements 2 and 3 of va duplicate elements 4 and 6172 // 5 and target element 2 of vb duplicates element 4. 6173 __ ushll(va[0], __ T8H, vin[0], __ T8B, 0); 6174 __ ushll2(va[1], __ T8H, vin[0], __ T16B, 0); 6175 __ ushll(va[2], __ T8H, vin[1], __ T8B, 0); 6176 __ ushll2(va[3], __ T8H, vin[1], __ T16B, 0); 6177 __ ushll(va[4], __ T8H, vin[1], __ T8B, 0); 6178 __ ushll2(va[5], __ T8H, vin[1], __ T16B, 0); 6179 6180 // This time expand just the lower 8 lanes 6181 __ ushll(vb[0], __ T8H, vin[3], __ T8B, 0); 6182 __ ushll(vb[2], __ T8H, vin[4], __ T8B, 0); 6183 __ ushll(vb[4], __ T8H, vin[4], __ T8B, 0); 6184 6185 // shift lo byte of copy 1 of the middle stripe into the high byte 6186 __ shl(va[2], __ T8H, va[2], 8); 6187 __ shl(va[3], __ T8H, va[3], 8); 6188 __ shl(vb[2], __ T8H, vb[2], 8); 6189 6190 // expand vin[2] into va[6:7] and lower 8 lanes of vin[5] into 6191 // vb[6] pre-shifted by 4 to ensure top bits of the input 12-bit 6192 // int are in bit positions [4..11]. 
6193 __ ushll(va[6], __ T8H, vin[2], __ T8B, 4);
6194 __ ushll2(va[7], __ T8H, vin[2], __ T16B, 4);
6195 __ ushll(vb[6], __ T8H, vin[5], __ T8B, 4);
6196
6197 // mask hi 4 bits of each 1st 12-bit int in pair from copy1 and
6198 // shift lo 4 bits of each 2nd 12-bit int in pair to bottom of
6199 // copy2
6200 __ andr(va[2], __ T16B, va[2], v31);
6201 __ andr(va[3], __ T16B, va[3], v31);
6202 __ ushr(va[4], __ T8H, va[4], 4);
6203 __ ushr(va[5], __ T8H, va[5], 4);
6204 __ andr(vb[2], __ T16B, vb[2], v31);
6205 __ ushr(vb[4], __ T8H, vb[4], 4);
6206
6207
6208
6209 // sum hi 4 bits and lo 8 bits of each 1st 12-bit int in pair and
6210 // hi 8 bits plus lo 4 bits of each 2nd 12-bit int in pair
6211
6212 // n.b. ordering ensures: i) inputs are consumed before they are
6213 // overwritten ii) order of 16-bit results across successive
6214 // pairs of vectors in va and then lower half of vb reflects order
6215 // of corresponding 12-bit inputs
6216 __ addv(va[0], __ T8H, va[0], va[2]);
6217 __ addv(va[2], __ T8H, va[1], va[3]);
6218 __ addv(va[1], __ T8H, va[4], va[6]);
6219 __ addv(va[3], __ T8H, va[5], va[7]);
6220 __ addv(vb[0], __ T8H, vb[0], vb[2]);
6221 __ addv(vb[1], __ T8H, vb[4], vb[6]);
6222
6223 // store 48 results interleaved as shorts
6224 vs_st2_post(vs_front(va), __ T8H, parsed);
6225 vs_st2_post(vs_front(vs_front(vb)), __ T8H, parsed);
6226
6227 __ BIND(L_end);
6228
6229 __ leave(); // required for proper stackwalking of RuntimeStub frame
6230 __ mov(r0, zr); // return 0
6231 __ ret(lr);
6232
6233 return start;
6234 }
6235
6236 // Kyber Barrett reduce function.
6237 // Implements
6238 // static int implKyberBarrettReduce(short[] coeffs) {}
6239 //
6240 // coeffs (short[256]) = c_rarg0
6241 address generate_kyberBarrettReduce() {
6242
6243 __ align(CodeEntryAlignment);
6244 StubGenStubId stub_id = StubGenStubId::kyberBarrettReduce_id;
6245 StubCodeMark mark(this, stub_id);
6246 address start = __ pc();
6247 __ enter();
6248
6249 const Register coeffs = c_rarg0;
6250
6251 const Register kyberConsts = r10;
6252 const Register result = r11;
6253
6254 // As above we process 256 sets of values in total i.e. 32 x
6255 // 8H quadwords. So, we can load, reduce and store the data in 3
6256 // groups of 11, 11 and 10 at a time i.e. we need to map sets
6257 // of 10 or 11 registers. A further constraint is that the
6258 // mapping needs to skip callee saves. So, we allocate the
6259 // register sequences using two 8 sequences, two 2 sequences
6260 // and two single registers.
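// (Scalar sketch of the Barrett step the loop below applies to each
// 16-bit lane -- for reference only; the constant names simply follow
// the comments attached to the constant loads:
//
//   t = (a * kyberBarrettMultiplier) >> 26;   // sqdmulh + sshr #11
//   a = a - t * kyber_q;                      // mlsv
//
// which leaves a congruent to the input mod q.)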
6261 VSeq<8> vs1_1(0); 6262 VSeq<2> vs1_2(16); 6263 FloatRegister vs1_3 = v28; 6264 VSeq<8> vs2_1(18); 6265 VSeq<2> vs2_2(26); 6266 FloatRegister vs2_3 = v29; 6267 6268 // we also need a pair of corresponding constant sequences 6269 6270 VSeq<8> vc1_1(30, 0); 6271 VSeq<2> vc1_2(30, 0); 6272 FloatRegister vc1_3 = v30; // for kyber_q 6273 6274 VSeq<8> vc2_1(31, 0); 6275 VSeq<2> vc2_2(31, 0); 6276 FloatRegister vc2_3 = v31; // for kyberBarrettMultiplier 6277 6278 __ add(result, coeffs, 0); 6279 __ lea(kyberConsts, 6280 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts)); 6281 6282 // load q and the multiplier for the Barrett reduction 6283 __ add(kyberConsts, kyberConsts, 16); 6284 __ ldpq(vc1_3, vc2_3, kyberConsts); 6285 6286 for (int i = 0; i < 3; i++) { 6287 // load 80 or 88 coefficients 6288 vs_ldpq_post(vs1_1, coeffs); 6289 vs_ldpq_post(vs1_2, coeffs); 6290 if (i < 2) { 6291 __ ldr(vs1_3, __ Q, __ post(coeffs, 16)); 6292 } 6293 6294 // vs2 <- (2 * vs1 * kyberBarrettMultiplier) >> 16 6295 vs_sqdmulh(vs2_1, __ T8H, vs1_1, vc2_1); 6296 vs_sqdmulh(vs2_2, __ T8H, vs1_2, vc2_2); 6297 if (i < 2) { 6298 __ sqdmulh(vs2_3, __ T8H, vs1_3, vc2_3); 6299 } 6300 6301 // vs2 <- (vs1 * kyberBarrettMultiplier) >> 26 6302 vs_sshr(vs2_1, __ T8H, vs2_1, 11); 6303 vs_sshr(vs2_2, __ T8H, vs2_2, 11); 6304 if (i < 2) { 6305 __ sshr(vs2_3, __ T8H, vs2_3, 11); 6306 } 6307 6308 // vs1 <- vs1 - vs2 * kyber_q 6309 vs_mlsv(vs1_1, __ T8H, vs2_1, vc1_1); 6310 vs_mlsv(vs1_2, __ T8H, vs2_2, vc1_2); 6311 if (i < 2) { 6312 __ mlsv(vs1_3, __ T8H, vs2_3, vc1_3); 6313 } 6314 6315 vs_stpq_post(vs1_1, result); 6316 vs_stpq_post(vs1_2, result); 6317 if (i < 2) { 6318 __ str(vs1_3, __ Q, __ post(result, 16)); 6319 } 6320 } 6321 6322 __ leave(); // required for proper stackwalking of RuntimeStub frame 6323 __ mov(r0, zr); // return 0 6324 __ ret(lr); 6325 6326 return start; 6327 } 6328 6329 6330 // Dilithium-specific montmul helper routines that generate parallel 6331 // code for, respectively, a single 4x4s vector sequence montmul or 6332 // two such multiplies in a row. 6333 6334 // Perform 16 32-bit Montgomery multiplications in parallel 6335 void dilithium_montmul16(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc, 6336 const VSeq<4>& vtmp, const VSeq<2>& vq) { 6337 // Use the helper routine to schedule a 4x4S Montgomery multiply. 6338 // It will assert that the register use is valid 6339 vs_montmul4(va, vb, vc, __ T4S, vtmp, vq); 6340 } 6341 6342 // Perform 2x16 32-bit Montgomery multiplications in parallel 6343 void dilithium_montmul32(const VSeq<8>& va, const VSeq<8>& vb, const VSeq<8>& vc, 6344 const VSeq<4>& vtmp, const VSeq<2>& vq) { 6345 // Schedule two successive 4x4S multiplies via the montmul helper 6346 // on the front and back halves of va, vb and vc. The helper will 6347 // assert that the register use has no overlap conflicts on each 6348 // individual call but we also need to ensure that the necessary 6349 // disjoint/equality constraints are met across both calls. 6350 6351 // vb, vc, vtmp and vq must be disjoint. 
va must either be
6352 // disjoint from all other registers or equal vc
6353
6354 assert(vs_disjoint(vb, vc), "vb and vc overlap");
6355 assert(vs_disjoint(vb, vq), "vb and vq overlap");
6356 assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
6357
6358 assert(vs_disjoint(vc, vq), "vc and vq overlap");
6359 assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
6360
6361 assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
6362
6363 assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
6364 assert(vs_disjoint(va, vb), "va and vb overlap");
6365 assert(vs_disjoint(va, vq), "va and vq overlap");
6366 assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
6367
6368 // We multiply the front and back halves of each sequence 4 at a
6369 // time because
6370 //
6371 // 1) we are currently only able to get 4-way instruction
6372 // parallelism at best
6373 //
6374 // 2) we need registers for the constants in vq and temporary
6375 // scratch registers to hold intermediate results so vtmp can only
6376 // be a VSeq<4> which means we only have 4 scratch slots.
6377
6378 vs_montmul4(vs_front(va), vs_front(vb), vs_front(vc), __ T4S, vtmp, vq);
6379 vs_montmul4(vs_back(va), vs_back(vb), vs_back(vc), __ T4S, vtmp, vq);
6380 }
6381
6382 // Perform combined montmul then add/sub on 4x4S vectors.
6383 void dilithium_montmul16_sub_add(
6384 const VSeq<4>& va0, const VSeq<4>& va1, const VSeq<4>& vc,
6385 const VSeq<4>& vtmp, const VSeq<2>& vq) {
6386 // compute a = montmul(a1, c)
6387 dilithium_montmul16(vc, va1, vc, vtmp, vq);
6388 // output a1 = a0 - a
6389 vs_subv(va1, __ T4S, va0, vc);
6390 // and a0 = a0 + a
6391 vs_addv(va0, __ T4S, va0, vc);
6392 }
6393
6394 // Perform combined add/sub then montmul on 4x4S vectors.
6395 void dilithium_sub_add_montmul16(
6396 const VSeq<4>& va0, const VSeq<4>& va1, const VSeq<4>& vb,
6397 const VSeq<4>& vtmp1, const VSeq<4>& vtmp2, const VSeq<2>& vq) {
6398 // compute c = a0 - a1
6399 vs_subv(vtmp1, __ T4S, va0, va1);
6400 // output a0 = a0 + a1
6401 vs_addv(va0, __ T4S, va0, va1);
6402 // output a1 = b montmul c
6403 dilithium_montmul16(va1, vtmp1, vb, vtmp2, vq);
6404 }
6405
6406 // At these levels, the indices that correspond to the 'j's (and 'j+l's)
6407 // in the Java implementation come in sequences of at least 8, so we
6408 // can use ldpq to collect the corresponding data into pairs of vector
6409 // registers.
6410 // We collect the coefficients corresponding to the 'j+l' indexes into
6411 // the vector registers v0-v7, the zetas into the vector registers v16-v23,
6412 // then we do the (Montgomery) multiplications by the zetas in parallel
6413 // into v16-v23, load the coeffs corresponding to the 'j' indexes into
6414 // v0-v7, then do the additions into v24-v31 and the subtractions into
6415 // v0-v7 and finally save the results back to the coeffs array.
6416 void dilithiumNttLevel0_4(const Register dilithiumConsts,
6417 const Register coeffs, const Register zetas) {
6418 int c1 = 0;
6419 int c2 = 512;
6420 int startIncr;
6421 // don't use callee save registers v8 - v15
6422 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs
6423 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
6424 VSeq<2> vq(30); // n.b.
constants overlap vs3
6425 int offsets[4] = { 0, 32, 64, 96 };
6426
6427 for (int level = 0; level < 5; level++) {
6428 int c1Start = c1;
6429 int c2Start = c2;
6430 if (level == 3) {
6431 offsets[1] = 32;
6432 offsets[2] = 128;
6433 offsets[3] = 160;
6434 } else if (level == 4) {
6435 offsets[1] = 64;
6436 offsets[2] = 128;
6437 offsets[3] = 192;
6438 }
6439
6440 // For levels 0 - 4 we simply load 2 x 4 adjacent values at a
6441 // time at 4 different offsets and multiply them in order by the
6442 // next set of input values. So we employ indexed load and store
6443 // pair instructions with arrangement 4S.
6444 for (int i = 0; i < 4; i++) {
6445 // reload q and qinv
6446 vs_ldpq(vq, dilithiumConsts); // qInv, q
6447 // load 8x4S coefficients via second start pos == c2
6448 vs_ldpq_indexed(vs1, coeffs, c2Start, offsets);
6449 // load next 8x4S inputs == b
6450 vs_ldpq_post(vs2, zetas);
6451 // compute a == c2 * b mod MONT_Q
6452 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
6453 // load 8x4s coefficients via first start pos == c1
6454 vs_ldpq_indexed(vs1, coeffs, c1Start, offsets);
6455 // compute a1 = c1 + a
6456 vs_addv(vs3, __ T4S, vs1, vs2);
6457 // compute a2 = c1 - a
6458 vs_subv(vs1, __ T4S, vs1, vs2);
6459 // output a1 and a2
6460 vs_stpq_indexed(vs3, coeffs, c1Start, offsets);
6461 vs_stpq_indexed(vs1, coeffs, c2Start, offsets);
6462
6463 int k = 4 * level + i;
6464
6465 if (k > 7) {
6466 startIncr = 256;
6467 } else if (k == 5) {
6468 startIncr = 384;
6469 } else {
6470 startIncr = 128;
6471 }
6472
6473 c1Start += startIncr;
6474 c2Start += startIncr;
6475 }
6476
6477 c2 /= 2;
6478 }
6479 }
6480
6481 // Dilithium NTT function except for the final "normalization" to |coeff| < Q.
6482 // Implements the method
6483 // static int implDilithiumAlmostNtt(int[] coeffs, int zetas[]) {}
6484 // of the Java class sun.security.provider.ML_DSA
6485 //
6486 // coeffs (int[256]) = c_rarg0
6487 // zetas (int[256]) = c_rarg1
6488 address generate_dilithiumAlmostNtt() {
6489
6490 __ align(CodeEntryAlignment);
6491 StubGenStubId stub_id = StubGenStubId::dilithiumAlmostNtt_id;
6492 StubCodeMark mark(this, stub_id);
6493 address start = __ pc();
6494 __ enter();
6495
6496 const Register coeffs = c_rarg0;
6497 const Register zetas = c_rarg1;
6498
6499 const Register tmpAddr = r9;
6500 const Register dilithiumConsts = r10;
6501 const Register result = r11;
6502 // don't use callee save registers v8 - v15
6503 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs
6504 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
6505 VSeq<2> vq(30); // n.b. constants overlap vs3
6506 int offsets[4] = { 0, 32, 64, 96};
6507 int offsets1[8] = { 16, 48, 80, 112, 144, 176, 208, 240 };
6508 int offsets2[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
6509 __ add(result, coeffs, 0);
6510 __ lea(dilithiumConsts,
6511 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
6512
6513 // Each level represents one iteration of the outer for loop of the Java version.
6514
6515 // level 0-4
6516 dilithiumNttLevel0_4(dilithiumConsts, coeffs, zetas);
6517
6518 // level 5
6519
6520 // At level 5 the coefficients we need to combine with the zetas
6521 // are grouped in memory in blocks of size 4. So, for both sets of
6522 // coefficients we load 4 adjacent values at 8 different offsets
6523 // using an indexed ldr with register variant Q and multiply them
6524 // in sequence order by the next set of inputs. Likewise we store
6525 // the results using an indexed str with register variant Q.
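// (Reading aid, not generated code: offsets1 picks out the odd
// 16-byte blocks (16, 48, ...), i.e. the coefficients that get
// montmul'ed by the zetas, while offsets2 picks out the even blocks
// (0, 32, ...) that only feed the add/sub, matching the 'j+l' and 'j'
// indices of the Java loops described above.)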
6526 for (int i = 0; i < 1024; i += 256) { 6527 // reload constants q, qinv each iteration as they get clobbered later 6528 vs_ldpq(vq, dilithiumConsts); // qInv, q 6529 // load 32 (8x4S) coefficients via first offsets = c1 6530 vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets1); 6531 // load next 32 (8x4S) inputs = b 6532 vs_ldpq_post(vs2, zetas); 6533 // a = b montmul c1 6534 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq); 6535 // load 32 (8x4S) coefficients via second offsets = c2 6536 vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets2); 6537 // add/sub with result of multiply 6538 vs_addv(vs3, __ T4S, vs1, vs2); // a1 = c2 + a 6539 vs_subv(vs1, __ T4S, vs1, vs2); // a0 = c2 - a 6540 // write back new coefficients using same offsets 6541 vs_str_indexed(vs3, __ Q, coeffs, i, offsets2); 6542 vs_str_indexed(vs1, __ Q, coeffs, i, offsets1); 6543 } 6544 6545 // level 6 6546 // At level 6 the coefficients we need to combine with the zetas 6547 // are grouped in memory in pairs, the first two being montmul 6548 // inputs and the second add/sub inputs. We can still implement 6549 // the montmul+sub+add using 4-way parallelism but only if we 6550 // combine the coefficients with the zetas 16 at a time. We load 8 6551 // adjacent values at 4 different offsets using an ld2 load with 6552 // arrangement 2D. That interleaves the lower and upper halves of 6553 // each pair of quadwords into successive vector registers. We 6554 // then need to montmul the 4 even elements of the coefficients 6555 // register sequence by the zetas in order and then add/sub the 4 6556 // odd elements of the coefficients register sequence. We use an 6557 // equivalent st2 operation to store the results back into memory 6558 // de-interleaved. 6559 for (int i = 0; i < 1024; i += 128) { 6560 // reload constants q, qinv each iteration as they get clobbered later 6561 vs_ldpq(vq, dilithiumConsts); // qInv, q 6562 // load interleaved 16 (4x2D) coefficients via offsets 6563 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets); 6564 // load next 16 (4x4S) inputs 6565 vs_ldpq_post(vs_front(vs2), zetas); 6566 // mont multiply odd elements of vs1 by vs2 and add/sub into odds/evens 6567 dilithium_montmul16_sub_add(vs_even(vs1), vs_odd(vs1), 6568 vs_front(vs2), vtmp, vq); 6569 // store interleaved 16 (4x2D) coefficients via offsets 6570 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets); 6571 } 6572 6573 // level 7 6574 // At level 7 the coefficients we need to combine with the zetas 6575 // occur singly with montmul inputs alternating with add/sub 6576 // inputs. Once again we can use 4-way parallelism to combine 16 6577 // zetas at a time. However, we have to load 8 adjacent values at 6578 // 4 different offsets using an ld2 load with arrangement 4S. That 6579 // interleaves the odd words of each pair into one 6580 // coefficients vector register and the even words of the pair 6581 // into the next register. We then need to montmul the 4 even 6582 // elements of the coefficients register sequence by the zetas in 6583 // order and then add/sub the 4 odd elements of the coefficients 6584 // register sequence. We use an equivalent st2 operation to store 6585 // the results back into memory de-interleaved.
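// Level 7 is the l == 1 instance of the same butterfly, i.e. each
// adjacent pair (c[2m], c[2m+1]) is combined with its own zeta. A
// scalar sketch (names are illustrative assumptions, not copied from
// the Java source):
//
//   int a      = montMul(zeta[m], c[2*m + 1]);
//   c[2*m + 1] = c[2*m] - a;
//   c[2*m]     = c[2*m] + a;
//
// The ld2/st2 pair in the loop below de-interleaves and re-interleaves
// the even and odd words so that the 4-way montmul and add/sub can
// operate on whole vector registers.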
6586 6587 for (int i = 0; i < 1024; i += 128) { 6588 // reload constants q, qinv each iteration as they get clobbered later 6589 vs_ldpq(vq, dilithiumConsts); // qInv, q 6590 // load interleaved 16 (4x4S) coefficients via offsets 6591 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets); 6592 // load next 16 (4x4S) inputs 6593 vs_ldpq_post(vs_front(vs2), zetas); 6594 // mont multiply odd elements of vs1 by vs2 and add/sub into odds/evens 6595 dilithium_montmul16_sub_add(vs_even(vs1), vs_odd(vs1), 6596 vs_front(vs2), vtmp, vq); 6597 // store interleaved 16 (4x4S) coefficients via offsets 6598 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets); 6599 } 6600 __ leave(); // required for proper stackwalking of RuntimeStub frame 6601 __ mov(r0, zr); // return 0 6602 __ ret(lr); 6603 6604 return start; 6605 } 6606 6607 // At these levels, the indices that correspond to the 'j's (and 'j+l's) 6608 // in the Java implementation come in sequences of at least 8, so we 6609 // can use ldpq to collect the corresponding data into pairs of vector 6610 // registers. 6611 // We collect the coefficients that correspond to the 'j's into vs1 and 6612 // the coefficients that correspond to the 'j+l's into vs2, then 6613 // do the additions into vs3 and the subtractions into vs1, then 6614 // save the result of the additions, load the zetas into vs2, 6615 // do the (Montgomery) multiplications by zeta in parallel into vs2 6616 // and finally save the results back to the coeffs array. 6617 void dilithiumInverseNttLevel3_7(const Register dilithiumConsts, 6618 const Register coeffs, const Register zetas) { 6619 int c1 = 0; 6620 int c2 = 32; 6621 int startIncr; 6622 int offsets[4]; 6623 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs 6624 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3 6625 VSeq<2> vq(30); // n.b. constants overlap vs3 6626 6627 offsets[0] = 0; 6628 6629 for (int level = 3; level < 8; level++) { 6630 int c1Start = c1; 6631 int c2Start = c2; 6632 if (level == 3) { 6633 offsets[1] = 64; 6634 offsets[2] = 128; 6635 offsets[3] = 192; 6636 } else if (level == 4) { 6637 offsets[1] = 32; 6638 offsets[2] = 128; 6639 offsets[3] = 160; 6640 } else { 6641 offsets[1] = 32; 6642 offsets[2] = 64; 6643 offsets[3] = 96; 6644 } 6645 6646 // For levels 3 - 7 we simply load 2 x 4 adjacent values at a 6647 // time at 4 different offsets and multiply them in order by the 6648 // next set of input values. So we employ indexed load and store 6649 // pair instructions with arrangement 4S. 6650 for (int i = 0; i < 4; i++) { 6651 // load v1 32 (8x4S) coefficients relative to first start index 6652 vs_ldpq_indexed(vs1, coeffs, c1Start, offsets); 6653 // load v2 32 (8x4S) coefficients relative to second start index 6654 vs_ldpq_indexed(vs2, coeffs, c2Start, offsets); 6655 // a0 = v1 + v2 -- n.b.
clobbers vq 6656 vs_addv(vs3, __ T4S, vs1, vs2); 6657 // a1 = v1 - v2 6658 vs_subv(vs1, __ T4S, vs1, vs2); 6659 // save a0 relative to first start index 6660 vs_stpq_indexed(vs3, coeffs, c1Start, offsets); 6661 // load constants q, qinv each iteration as they get clobbered above 6662 vs_ldpq(vq, dilithiumConsts); // qInv, q 6663 // load b next 32 (8x4S) inputs 6664 vs_ldpq_post(vs2, zetas); 6665 // a = a1 montmul b 6666 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq); 6667 // save a relative to second start index 6668 vs_stpq_indexed(vs2, coeffs, c2Start, offsets); 6669 6670 int k = 4 * level + i; 6671 6672 if (k < 24) { 6673 startIncr = 256; 6674 } else if (k == 25) { 6675 startIncr = 384; 6676 } else { 6677 startIncr = 128; 6678 } 6679 6680 c1Start += startIncr; 6681 c2Start += startIncr; 6682 } 6683 6684 c2 *= 2; 6685 } 6686 } 6687 6688 // Dilithium Inverse NTT function except the final mod Q division by 2^256. 6689 // Implements the method 6690 // static int implDilithiumAlmostInverseNtt(int[] coeffs, int[] zetas) {} of 6691 // the sun.security.provider.ML_DSA class. 6692 // 6693 // coeffs (int[256]) = c_rarg0 6694 // zetas (int[256]) = c_rarg1 6695 address generate_dilithiumAlmostInverseNtt() { 6696 6697 __ align(CodeEntryAlignment); 6698 StubGenStubId stub_id = StubGenStubId::dilithiumAlmostInverseNtt_id; 6699 StubCodeMark mark(this, stub_id); 6700 address start = __ pc(); 6701 __ enter(); 6702 6703 const Register coeffs = c_rarg0; 6704 const Register zetas = c_rarg1; 6705 6706 const Register tmpAddr = r9; 6707 const Register dilithiumConsts = r10; 6708 const Register result = r11; 6709 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs 6710 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3 6711 VSeq<2> vq(30); // n.b. constants overlap vs3 6712 int offsets[4] = { 0, 32, 64, 96 }; 6713 int offsets1[8] = { 0, 32, 64, 96, 128, 160, 192, 224 }; 6714 int offsets2[8] = { 16, 48, 80, 112, 144, 176, 208, 240 }; 6715 6716 __ add(result, coeffs, 0); 6717 __ lea(dilithiumConsts, 6718 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts)); 6719 6720 // Each level represents one iteration of the outer for loop of the Java version 6721 6722 // level 0 6723 // At level 0 we need to interleave adjacent quartets of 6724 // coefficients before we multiply and add/sub by the next 16 6725 // zetas just as we did for level 7 in the multiply code. So we 6726 // load and store the values using an ld2/st2 with arrangement 4S. 6727 for (int i = 0; i < 1024; i += 128) { 6728 // load constants q, qinv 6729 // n.b. this can be moved out of the loop as they do not get 6730 // clobbered by first two loops 6731 vs_ldpq(vq, dilithiumConsts); // qInv, q 6732 // a0/a1 load interleaved 32 (8x4S) coefficients 6733 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets); 6734 // b load next 16 (4x4S) inputs 6735 vs_ldpq_post(vs_front(vs2), zetas); 6736 // compute in parallel (a0, a1) = (a0 + a1, (a0 - a1) montmul b) 6737 // n.b. second half of vs2 provides temporary register storage 6738 dilithium_sub_add_montmul16(vs_even(vs1), vs_odd(vs1), 6739 vs_front(vs2), vs_back(vs2), vtmp, vq); 6740 // a0/a1 store interleaved 32 (8x4S) coefficients 6741 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets); 6742 } 6743 6744 // level 1 6745 // At level 1 we need to interleave pairs of adjacent pairs of 6746 // coefficients before we multiply by the next 16 zetas just as we 6747 // did for level 6 in the multiply code.
So we load and store the 6748 // values an ld2/st2 with arrangement 2D. 6749 for (int i = 0; i < 1024; i += 128) { 6750 // a0/a1 load interleaved 32 (8x2D) coefficients 6751 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets); 6752 // b load next 16 (4x4S) inputs 6753 vs_ldpq_post(vs_front(vs2), zetas); 6754 // compute in parallel (a0, a1) = (a0 + a1, (a0 - a1) montmul b) 6755 // n.b. second half of vs2 provides temporary register storage 6756 dilithium_sub_add_montmul16(vs_even(vs1), vs_odd(vs1), 6757 vs_front(vs2), vs_back(vs2), vtmp, vq); 6758 // a0/a1 store interleaved 32 (8x2D) coefficients 6759 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets); 6760 } 6761 6762 // level 2 6763 // At level 2 coefficients come in blocks of 4. So, we load 4 6764 // adjacent coefficients at 8 distinct offsets for both the first 6765 // and second coefficient sequences, using an ldr with register 6766 // variant Q then combine them with next set of 32 zetas. Likewise 6767 // we store the results using an str with register variant Q. 6768 for (int i = 0; i < 1024; i += 256) { 6769 // c0 load 32 (8x4S) coefficients via first offsets 6770 vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets1); 6771 // c1 load 32 (8x4S) coefficients via second offsets 6772 vs_ldr_indexed(vs2, __ Q,coeffs, i, offsets2); 6773 // a0 = c0 + c1 n.b. clobbers vq which overlaps vs3 6774 vs_addv(vs3, __ T4S, vs1, vs2); 6775 // c = c0 - c1 6776 vs_subv(vs1, __ T4S, vs1, vs2); 6777 // store a0 32 (8x4S) coefficients via first offsets 6778 vs_str_indexed(vs3, __ Q, coeffs, i, offsets1); 6779 // b load 32 (8x4S) next inputs 6780 vs_ldpq_post(vs2, zetas); 6781 // reload constants q, qinv -- they were clobbered earlier 6782 vs_ldpq(vq, dilithiumConsts); // qInv, q 6783 // compute a1 = b montmul c 6784 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq); 6785 // store a1 32 (8x4S) coefficients via second offsets 6786 vs_str_indexed(vs2, __ Q, coeffs, i, offsets2); 6787 } 6788 6789 // level 3-7 6790 dilithiumInverseNttLevel3_7(dilithiumConsts, coeffs, zetas); 6791 6792 __ leave(); // required for proper stackwalking of RuntimeStub frame 6793 __ mov(r0, zr); // return 0 6794 __ ret(lr); 6795 6796 return start; 6797 } 6798 6799 // Dilithium multiply polynomials in the NTT domain. 6800 // Straightforward implementation of the method 6801 // static int implDilithiumNttMult( 6802 // int[] result, int[] ntta, int[] nttb {} of 6803 // the sun.security.provider.ML_DSA class. 6804 // 6805 // result (int[256]) = c_rarg0 6806 // poly1 (int[256]) = c_rarg1 6807 // poly2 (int[256]) = c_rarg2 6808 address generate_dilithiumNttMult() { 6809 6810 __ align(CodeEntryAlignment); 6811 StubGenStubId stub_id = StubGenStubId::dilithiumNttMult_id; 6812 StubCodeMark mark(this, stub_id); 6813 address start = __ pc(); 6814 __ enter(); 6815 6816 Label L_loop; 6817 6818 const Register result = c_rarg0; 6819 const Register poly1 = c_rarg1; 6820 const Register poly2 = c_rarg2; 6821 6822 const Register dilithiumConsts = r10; 6823 const Register len = r11; 6824 6825 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs 6826 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3 6827 VSeq<2> vq(30); // n.b. 
constants overlap vs3 6828 VSeq<8> vrsquare(29, 0); // for montmul by constant RSQUARE 6829 6830 __ lea(dilithiumConsts, 6831 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts)); 6832 6833 // load constants q, qinv 6834 vs_ldpq(vq, dilithiumConsts); // qInv, q 6835 // load constant rSquare into v29 6836 __ ldr(v29, __ Q, Address(dilithiumConsts, 48)); // rSquare 6837 6838 __ mov(len, zr); 6839 __ add(len, len, 1024); 6840 6841 __ BIND(L_loop); 6842 6843 // b load 32 (8x4S) next inputs from poly1 6844 vs_ldpq_post(vs1, poly1); 6845 // c load 32 (8x4S) next inputs from poly2 6846 vs_ldpq_post(vs2, poly2); 6847 // compute a = b montmul c 6848 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq); 6849 // compute a = rsquare montmul a 6850 dilithium_montmul32(vs2, vrsquare, vs2, vtmp, vq); 6851 // save a 32 (8x4S) results 6852 vs_stpq_post(vs2, result); 6853 6854 __ sub(len, len, 128); 6855 __ cmp(len, (u1)128); 6856 __ br(Assembler::GE, L_loop); 6857 6858 __ leave(); // required for proper stackwalking of RuntimeStub frame 6859 __ mov(r0, zr); // return 0 6860 __ ret(lr); 6861 6862 return start; 6863 } 6864 6865 // Dilithium Montgomery multiply an array by a constant. 6866 // A straightforward implementation of the method 6867 // static int implDilithiumMontMulByConstant(int[] coeffs, int constant) {} 6868 // of the sun.security.provider.ML_DSA class 6869 // 6870 // coeffs (int[256]) = c_rarg0 6871 // constant (int) = c_rarg1 6872 address generate_dilithiumMontMulByConstant() { 6873 6874 __ align(CodeEntryAlignment); 6875 StubGenStubId stub_id = StubGenStubId::dilithiumMontMulByConstant_id; 6876 StubCodeMark mark(this, stub_id); 6877 address start = __ pc(); 6878 __ enter(); 6879 6880 Label L_loop; 6881 6882 const Register coeffs = c_rarg0; 6883 const Register constant = c_rarg1; 6884 6885 const Register dilithiumConsts = r10; 6886 const Register result = r11; 6887 const Register len = r12; 6888 6889 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs 6890 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3 6891 VSeq<2> vq(30); // n.b. constants overlap vs3 6892 VSeq<8> vconst(29, 0); // for montmul by constant 6893 6894 // results track inputs 6895 __ add(result, coeffs, 0); 6896 __ lea(dilithiumConsts, 6897 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts)); 6898 6899 // load constants q, qinv -- they do not get clobbered by first two loops 6900 vs_ldpq(vq, dilithiumConsts); // qInv, q 6901 // copy caller supplied constant across vconst 6902 __ dup(vconst[0], __ T4S, constant); 6903 __ mov(len, zr); 6904 __ add(len, len, 1024); 6905 6906 __ BIND(L_loop); 6907 6908 // load next 32 inputs 6909 vs_ldpq_post(vs2, coeffs); 6910 // mont mul by constant 6911 dilithium_montmul32(vs2, vconst, vs2, vtmp, vq); 6912 // write next 32 results 6913 vs_stpq_post(vs2, result); 6914 6915 __ sub(len, len, 128); 6916 __ cmp(len, (u1)128); 6917 __ br(Assembler::GE, L_loop); 6918 6919 __ leave(); // required for proper stackwalking of RuntimeStub frame 6920 __ mov(r0, zr); // return 0 6921 __ ret(lr); 6922 6923 return start; 6924 } 6925 6926 // Dilithium decompose poly.
6927 // Implements the method 6928 // static int implDilithiumDecomposePoly(int[] input, int[] lowPart, int[] highPart, int twoGamma2, int multiplier) {} 6929 // of the sun.security.provider.ML_DSA class 6930 // 6931 // input (int[256]) = c_rarg0 6932 // lowPart (int[256]) = c_rarg1 6933 // highPart (int[256]) = c_rarg2 6934 // twoGamma2 (int) = c_rarg3 6935 // multiplier (int) = c_rarg4 6936 address generate_dilithiumDecomposePoly() { 6937 6938 __ align(CodeEntryAlignment); 6939 StubGenStubId stub_id = StubGenStubId::dilithiumDecomposePoly_id; 6940 StubCodeMark mark(this, stub_id); 6941 address start = __ pc(); 6942 Label L_loop; 6943 6944 const Register input = c_rarg0; 6945 const Register lowPart = c_rarg1; 6946 const Register highPart = c_rarg2; 6947 const Register twoGamma2 = c_rarg3; 6948 const Register multiplier = c_rarg4; 6949 6950 const Register len = r9; 6951 const Register dilithiumConsts = r10; 6952 const Register tmp = r11; 6953 6954 // 6 independent sets of 4x4s values 6955 VSeq<4> vs1(0), vs2(4), vs3(8); 6956 VSeq<4> vs4(12), vs5(16), vtmp(20); 6957 6958 // 7 constants for cross-multiplying 6959 VSeq<4> one(25, 0); 6960 VSeq<4> qminus1(26, 0); 6961 VSeq<4> g2(27, 0); 6962 VSeq<4> twog2(28, 0); 6963 VSeq<4> mult(29, 0); 6964 VSeq<4> q(30, 0); 6965 VSeq<4> qadd(31, 0); 6966 6967 __ enter(); 6968 6969 __ lea(dilithiumConsts, 6970 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts)); 6971 6972 // save callee-saved registers 6973 __ stpd(v8, v9, __ pre(sp, -64)); 6974 __ stpd(v10, v11, Address(sp, 16)); 6975 __ stpd(v12, v13, Address(sp, 32)); 6976 __ stpd(v14, v15, Address(sp, 48)); 6977 6978 // populate constant registers 6979 __ mov(tmp, zr); 6980 __ add(tmp, tmp, 1); 6981 __ dup(one[0], __ T4S, tmp); // 1 6982 __ ldr(q[0], __ Q, Address(dilithiumConsts, 16)); // q 6983 __ ldr(qadd[0], __ Q, Address(dilithiumConsts, 64)); // addend for mod q reduce 6984 __ dup(twog2[0], __ T4S, twoGamma2); // 2 * gamma2 6985 __ dup(mult[0], __ T4S, multiplier); // multiplier for mod 2 * gamma2 reduce 6986 __ subv(qminus1[0], __ T4S, v30, v25); // q - 1 6987 __ sshr(g2[0], __ T4S, v28, 1); // gamma2 6988 6989 __ mov(len, zr); 6990 __ add(len, len, 1024); 6991 6992 __ BIND(L_loop); 6993 6994 // load next 4x4S inputs interleaved: rplus --> vs1 6995 __ ld4(vs1[0], vs1[1], vs1[2], vs1[3], __ T4S, __ post(input, 64)); 6996 6997 // rplus = rplus - ((rplus + qadd) >> 23) * q 6998 vs_addv(vtmp, __ T4S, vs1, qadd); 6999 vs_sshr(vtmp, __ T4S, vtmp, 23); 7000 vs_mulv(vtmp, __ T4S, vtmp, q); 7001 vs_subv(vs1, __ T4S, vs1, vtmp); 7002 7003 // rplus = rplus + ((rplus >> 31) & dilithium_q); 7004 vs_sshr(vtmp, __ T4S, vs1, 31); 7005 vs_andr(vtmp, vtmp, q); 7006 vs_addv(vs1, __ T4S, vs1, vtmp); 7007 7008 // quotient --> vs2 7009 // int quotient = (rplus * multiplier) >> 22; 7010 vs_mulv(vtmp, __ T4S, vs1, mult); 7011 vs_sshr(vs2, __ T4S, vtmp, 22); 7012 7013 // r0 --> vs3 7014 // int r0 = rplus - quotient * twoGamma2; 7015 vs_mulv(vtmp, __ T4S, vs2, twog2); 7016 vs_subv(vs3, __ T4S, vs1, vtmp); 7017 7018 // mask --> vs4 7019 // int mask = (twoGamma2 - r0) >> 22; 7020 vs_subv(vtmp, __ T4S, twog2, vs3); 7021 vs_sshr(vs4, __ T4S, vtmp, 22); 7022 7023 // r0 -= (mask & twoGamma2); 7024 vs_andr(vtmp, vs4, twog2); 7025 vs_subv(vs3, __ T4S, vs3, vtmp); 7026 7027 // quotient += (mask & 1); 7028 vs_andr(vtmp, vs4, one); 7029 vs_addv(vs2, __ T4S, vs2, vtmp); 7030 7031 // mask = (twoGamma2 / 2 - r0) >> 31; 7032 vs_subv(vtmp, __ T4S, g2, vs3); 7033 vs_sshr(vs4, __ T4S, vtmp, 31); 7034 7035 // r0 -= (mask & twoGamma2); 7036 vs_andr(vtmp, vs4, twog2); 7037
vs_subv(vs3, __ T4S, vs3, vtmp); 7038 7039 // quotient += (mask & 1); 7040 vs_andr(vtmp, vs4, one); 7041 vs_addv(vs2, __ T4S, vs2, vtmp); 7042 7043 // r1 --> vs5 7044 // int r1 = rplus - r0 - (dilithium_q - 1); 7045 vs_subv(vtmp, __ T4S, vs1, vs3); 7046 vs_subv(vs5, __ T4S, vtmp, qminus1); 7047 7048 // r1 --> vs1 (overwriting rplus) 7049 // r1 = (r1 | (-r1)) >> 31; // 0 if rplus - r0 == (dilithium_q - 1), -1 otherwise 7050 vs_negr(vtmp, __ T4S, vs5); 7051 vs_orr(vtmp, vs5, vtmp); 7052 vs_sshr(vs1, __ T4S, vtmp, 31); 7053 7054 // r0 += ~r1; 7055 vs_notr(vtmp, vs1); 7056 vs_addv(vs3, __ T4S, vs3, vtmp); 7057 7058 // r1 = r1 & quotient; 7059 vs_andr(vs1, vs2, vs1); 7060 7061 // store results inteleaved 7062 // lowPart[m] = r0; 7063 // highPart[m] = r1; 7064 __ st4(vs3[0], vs3[1], vs3[2], vs3[3], __ T4S, __ post(lowPart, 64)); 7065 __ st4(vs1[0], vs1[1], vs1[2], vs1[3], __ T4S, __ post(highPart, 64)); 7066 7067 __ sub(len, len, 64); 7068 __ cmp(len, (u1)64); 7069 __ br(Assembler::GE, L_loop); 7070 7071 // restore callee-saved vector registers 7072 __ ldpd(v14, v15, Address(sp, 48)); 7073 __ ldpd(v12, v13, Address(sp, 32)); 7074 __ ldpd(v10, v11, Address(sp, 16)); 7075 __ ldpd(v8, v9, __ post(sp, 64)); 7076 7077 __ leave(); // required for proper stackwalking of RuntimeStub frame 7078 __ mov(r0, zr); // return 0 7079 __ ret(lr); 7080 7081 return start; 7082 } 7083 7084 void bcax5(Register a0, Register a1, Register a2, Register a3, Register a4, 7085 Register tmp0, Register tmp1, Register tmp2) { 7086 __ bic(tmp0, a2, a1); // for a0 7087 __ bic(tmp1, a3, a2); // for a1 7088 __ bic(tmp2, a4, a3); // for a2 7089 __ eor(a2, a2, tmp2); 7090 __ bic(tmp2, a0, a4); // for a3 7091 __ eor(a3, a3, tmp2); 7092 __ bic(tmp2, a1, a0); // for a4 7093 __ eor(a0, a0, tmp0); 7094 __ eor(a1, a1, tmp1); 7095 __ eor(a4, a4, tmp2); 7096 } 7097 7098 void keccak_round_gpr(bool can_use_fp, bool can_use_r18, Register rc, 7099 Register a0, Register a1, Register a2, Register a3, Register a4, 7100 Register a5, Register a6, Register a7, Register a8, Register a9, 7101 Register a10, Register a11, Register a12, Register a13, Register a14, 7102 Register a15, Register a16, Register a17, Register a18, Register a19, 7103 Register a20, Register a21, Register a22, Register a23, Register a24, 7104 Register tmp0, Register tmp1, Register tmp2) { 7105 __ eor3(tmp1, a4, a9, a14); 7106 __ eor3(tmp0, tmp1, a19, a24); // tmp0 = a4^a9^a14^a19^a24 = c4 7107 __ eor3(tmp2, a1, a6, a11); 7108 __ eor3(tmp1, tmp2, a16, a21); // tmp1 = a1^a6^a11^a16^a21 = c1 7109 __ rax1(tmp2, tmp0, tmp1); // d0 7110 { 7111 7112 Register tmp3, tmp4; 7113 if (can_use_fp && can_use_r18) { 7114 tmp3 = rfp; 7115 tmp4 = r18_tls; 7116 } else { 7117 tmp3 = a4; 7118 tmp4 = a9; 7119 __ stp(tmp3, tmp4, __ pre(sp, -16)); 7120 } 7121 7122 __ eor3(tmp3, a0, a5, a10); 7123 __ eor3(tmp4, tmp3, a15, a20); // tmp4 = a0^a5^a10^a15^a20 = c0 7124 __ eor(a0, a0, tmp2); 7125 __ eor(a5, a5, tmp2); 7126 __ eor(a10, a10, tmp2); 7127 __ eor(a15, a15, tmp2); 7128 __ eor(a20, a20, tmp2); // d0(tmp2) 7129 __ eor3(tmp3, a2, a7, a12); 7130 __ eor3(tmp2, tmp3, a17, a22); // tmp2 = a2^a7^a12^a17^a22 = c2 7131 __ rax1(tmp3, tmp4, tmp2); // d1 7132 __ eor(a1, a1, tmp3); 7133 __ eor(a6, a6, tmp3); 7134 __ eor(a11, a11, tmp3); 7135 __ eor(a16, a16, tmp3); 7136 __ eor(a21, a21, tmp3); // d1(tmp3) 7137 __ rax1(tmp3, tmp2, tmp0); // d3 7138 __ eor3(tmp2, a3, a8, a13); 7139 __ eor3(tmp0, tmp2, a18, a23); // tmp0 = a3^a8^a13^a18^a23 = c3 7140 __ eor(a3, a3, tmp3); 7141 __ eor(a8, a8, tmp3); 7142 __ eor(a13, 
a13, tmp3); 7143 __ eor(a18, a18, tmp3); 7144 __ eor(a23, a23, tmp3); 7145 __ rax1(tmp2, tmp1, tmp0); // d2 7146 __ eor(a2, a2, tmp2); 7147 __ eor(a7, a7, tmp2); 7148 __ eor(a12, a12, tmp2); 7149 __ rax1(tmp0, tmp0, tmp4); // d4 7150 if (!can_use_fp || !can_use_r18) { 7151 __ ldp(tmp3, tmp4, __ post(sp, 16)); 7152 } 7153 __ eor(a17, a17, tmp2); 7154 __ eor(a22, a22, tmp2); 7155 __ eor(a4, a4, tmp0); 7156 __ eor(a9, a9, tmp0); 7157 __ eor(a14, a14, tmp0); 7158 __ eor(a19, a19, tmp0); 7159 __ eor(a24, a24, tmp0); 7160 } 7161 7162 __ rol(tmp0, a10, 3); 7163 __ rol(a10, a1, 1); 7164 __ rol(a1, a6, 44); 7165 __ rol(a6, a9, 20); 7166 __ rol(a9, a22, 61); 7167 __ rol(a22, a14, 39); 7168 __ rol(a14, a20, 18); 7169 __ rol(a20, a2, 62); 7170 __ rol(a2, a12, 43); 7171 __ rol(a12, a13, 25); 7172 __ rol(a13, a19, 8) ; 7173 __ rol(a19, a23, 56); 7174 __ rol(a23, a15, 41); 7175 __ rol(a15, a4, 27); 7176 __ rol(a4, a24, 14); 7177 __ rol(a24, a21, 2); 7178 __ rol(a21, a8, 55); 7179 __ rol(a8, a16, 45); 7180 __ rol(a16, a5, 36); 7181 __ rol(a5, a3, 28); 7182 __ rol(a3, a18, 21); 7183 __ rol(a18, a17, 15); 7184 __ rol(a17, a11, 10); 7185 __ rol(a11, a7, 6); 7186 __ mov(a7, tmp0); 7187 7188 bcax5(a0, a1, a2, a3, a4, tmp0, tmp1, tmp2); 7189 bcax5(a5, a6, a7, a8, a9, tmp0, tmp1, tmp2); 7190 bcax5(a10, a11, a12, a13, a14, tmp0, tmp1, tmp2); 7191 bcax5(a15, a16, a17, a18, a19, tmp0, tmp1, tmp2); 7192 bcax5(a20, a21, a22, a23, a24, tmp0, tmp1, tmp2); 7193 7194 __ ldr(tmp1, __ post(rc, 8)); 7195 __ eor(a0, a0, tmp1); 7196 7197 } 7198 7199 // Arguments: 7200 // 7201 // Inputs: 7202 // c_rarg0 - byte[] source+offset 7203 // c_rarg1 - byte[] SHA.state 7204 // c_rarg2 - int block_size 7205 // c_rarg3 - int offset 7206 // c_rarg4 - int limit 7207 // 7208 address generate_sha3_implCompress_gpr(StubGenStubId stub_id) { 7209 bool multi_block; 7210 switch (stub_id) { 7211 case sha3_implCompress_id: 7212 multi_block = false; 7213 break; 7214 case sha3_implCompressMB_id: 7215 multi_block = true; 7216 break; 7217 default: 7218 ShouldNotReachHere(); 7219 } 7220 7221 static const uint64_t round_consts[24] = { 7222 0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL, 7223 0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L, 7224 0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL, 7225 0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL, 7226 0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L, 7227 0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L, 7228 0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L, 7229 0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L 7230 }; 7231 7232 __ align(CodeEntryAlignment); 7233 StubCodeMark mark(this, stub_id); 7234 address start = __ pc(); 7235 7236 Register buf = c_rarg0; 7237 Register state = c_rarg1; 7238 Register block_size = c_rarg2; 7239 Register ofs = c_rarg3; 7240 Register limit = c_rarg4; 7241 7242 // use r3.r17,r19..r28 to keep a0..a24. 
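// Lane-numbering sketch for the round function below (standard
// Keccak-f[1600] indexing, inferred from the code rather than quoted
// from SHA3.java): lane a[x + 5*y] holds state word A[x][y] for
// 0 <= x, y < 5, so the theta column parities computed first are
//
//   c[x] = a[x] ^ a[x+5] ^ a[x+10] ^ a[x+15] ^ a[x+20]
//
// and d[x] = c[x-1] ^ rotl64(c[x+1], 1) is then XORed into every lane
// of column x.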
7243 // a0..a24 are respective locals from SHA3.java 7244 Register a0 = r25, 7245 a1 = r26, 7246 a2 = r27, 7247 a3 = r3, 7248 a4 = r4, 7249 a5 = r5, 7250 a6 = r6, 7251 a7 = r7, 7252 a8 = rscratch1, // r8 7253 a9 = rscratch2, // r9 7254 a10 = r10, 7255 a11 = r11, 7256 a12 = r12, 7257 a13 = r13, 7258 a14 = r14, 7259 a15 = r15, 7260 a16 = r16, 7261 a17 = r17, 7262 a18 = r28, 7263 a19 = r19, 7264 a20 = r20, 7265 a21 = r21, 7266 a22 = r22, 7267 a23 = r23, 7268 a24 = r24; 7269 7270 Register tmp0 = block_size, tmp1 = buf, tmp2 = state, tmp3 = r30; 7271 7272 Label sha3_loop, rounds24_preloop, loop_body; 7273 Label sha3_512_or_sha3_384, shake128; 7274 7275 bool can_use_r18 = false; 7276 #ifndef R18_RESERVED 7277 can_use_r18 = true; 7278 #endif 7279 bool can_use_fp = !PreserveFramePointer; 7280 7281 __ enter(); 7282 7283 // save almost all yet unsaved gpr registers on stack 7284 __ str(block_size, __ pre(sp, -128)); 7285 if (multi_block) { 7286 __ stpw(ofs, limit, Address(sp, 8)); 7287 } 7288 // 8 bytes at sp+16 will be used to keep buf 7289 __ stp(r19, r20, Address(sp, 32)); 7290 __ stp(r21, r22, Address(sp, 48)); 7291 __ stp(r23, r24, Address(sp, 64)); 7292 __ stp(r25, r26, Address(sp, 80)); 7293 __ stp(r27, r28, Address(sp, 96)); 7294 if (can_use_r18 && can_use_fp) { 7295 __ stp(r18_tls, state, Address(sp, 112)); 7296 } else { 7297 __ str(state, Address(sp, 112)); 7298 } 7299 7300 // begin sha3 calculations: loading a0..a24 from state array 7301 __ ldp(a0, a1, state); 7302 __ ldp(a2, a3, Address(state, 16)); 7303 __ ldp(a4, a5, Address(state, 32)); 7304 __ ldp(a6, a7, Address(state, 48)); 7305 __ ldp(a8, a9, Address(state, 64)); 7306 __ ldp(a10, a11, Address(state, 80)); 7307 __ ldp(a12, a13, Address(state, 96)); 7308 __ ldp(a14, a15, Address(state, 112)); 7309 __ ldp(a16, a17, Address(state, 128)); 7310 __ ldp(a18, a19, Address(state, 144)); 7311 __ ldp(a20, a21, Address(state, 160)); 7312 __ ldp(a22, a23, Address(state, 176)); 7313 __ ldr(a24, Address(state, 192)); 7314 7315 __ BIND(sha3_loop); 7316 7317 // load input 7318 __ ldp(tmp3, tmp2, __ post(buf, 16)); 7319 __ eor(a0, a0, tmp3); 7320 __ eor(a1, a1, tmp2); 7321 __ ldp(tmp3, tmp2, __ post(buf, 16)); 7322 __ eor(a2, a2, tmp3); 7323 __ eor(a3, a3, tmp2); 7324 __ ldp(tmp3, tmp2, __ post(buf, 16)); 7325 __ eor(a4, a4, tmp3); 7326 __ eor(a5, a5, tmp2); 7327 __ ldr(tmp3, __ post(buf, 8)); 7328 __ eor(a6, a6, tmp3); 7329 7330 // block_size == 72, SHA3-512; block_size == 104, SHA3-384 7331 __ tbz(block_size, 7, sha3_512_or_sha3_384); 7332 7333 __ ldp(tmp3, tmp2, __ post(buf, 16)); 7334 __ eor(a7, a7, tmp3); 7335 __ eor(a8, a8, tmp2); 7336 __ ldp(tmp3, tmp2, __ post(buf, 16)); 7337 __ eor(a9, a9, tmp3); 7338 __ eor(a10, a10, tmp2); 7339 __ ldp(tmp3, tmp2, __ post(buf, 16)); 7340 __ eor(a11, a11, tmp3); 7341 __ eor(a12, a12, tmp2); 7342 __ ldp(tmp3, tmp2, __ post(buf, 16)); 7343 __ eor(a13, a13, tmp3); 7344 __ eor(a14, a14, tmp2); 7345 __ ldp(tmp3, tmp2, __ post(buf, 16)); 7346 __ eor(a15, a15, tmp3); 7347 __ eor(a16, a16, tmp2); 7348 7349 // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256 7350 __ andw(tmp2, block_size, 48); 7351 __ cbzw(tmp2, rounds24_preloop); 7352 __ tbnz(block_size, 5, shake128); 7353 // block_size == 144, bit5 == 0, SHA3-224 7354 __ ldr(tmp3, __ post(buf, 8)); 7355 __ eor(a17, a17, tmp3); 7356 __ b(rounds24_preloop); 7357 7358 __ BIND(shake128); 7359 __ ldp(tmp3, tmp2, __ post(buf, 16)); 7360 __ eor(a17, a17, tmp3); 7361 __ eor(a18, a18, tmp2); 7362 __ ldp(tmp3, tmp2, __ post(buf, 16)); 7363 __ eor(a19, a19,
tmp3); 7364 __ eor(a20, a20, tmp2); 7365 __ b(rounds24_preloop); // block_size == 168, SHAKE128 7366 7367 __ BIND(sha3_512_or_sha3_384); 7368 __ ldp(tmp3, tmp2, __ post(buf, 16)); 7369 __ eor(a7, a7, tmp3); 7370 __ eor(a8, a8, tmp2); 7371 __ tbz(block_size, 5, rounds24_preloop); // SHA3-512 7372 7373 // SHA3-384 7374 __ ldp(tmp3, tmp2, __ post(buf, 16)); 7375 __ eor(a9, a9, tmp3); 7376 __ eor(a10, a10, tmp2); 7377 __ ldp(tmp3, tmp2, __ post(buf, 16)); 7378 __ eor(a11, a11, tmp3); 7379 __ eor(a12, a12, tmp2); 7380 7381 __ BIND(rounds24_preloop); 7382 __ fmovs(v0, 24.0); // float loop counter, 7383 __ fmovs(v1, 1.0); // exact representation 7384 7385 __ str(buf, Address(sp, 16)); 7386 __ lea(tmp3, ExternalAddress((address) round_consts)); 7387 7388 __ BIND(loop_body); 7389 keccak_round_gpr(can_use_fp, can_use_r18, tmp3, 7390 a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, 7391 a13, a14, a15, a16, a17, a18, a19, a20, a21, a22, a23, a24, 7392 tmp0, tmp1, tmp2); 7393 __ fsubs(v0, v0, v1); 7394 __ fcmps(v0, 0.0); 7395 __ br(__ NE, loop_body); 7396 7397 if (multi_block) { 7398 __ ldrw(block_size, sp); // block_size 7399 __ ldpw(tmp2, tmp1, Address(sp, 8)); // offset, limit 7400 __ addw(tmp2, tmp2, block_size); 7401 __ cmpw(tmp2, tmp1); 7402 __ strw(tmp2, Address(sp, 8)); // store offset in case we're jumping 7403 __ ldr(buf, Address(sp, 16)); // restore buf in case we're jumping 7404 __ br(Assembler::LE, sha3_loop); 7405 __ movw(c_rarg0, tmp2); // return offset 7406 } 7407 if (can_use_fp && can_use_r18) { 7408 __ ldp(r18_tls, state, Address(sp, 112)); 7409 } else { 7410 __ ldr(state, Address(sp, 112)); 7411 } 7412 // save calculated sha3 state 7413 __ stp(a0, a1, Address(state)); 7414 __ stp(a2, a3, Address(state, 16)); 7415 __ stp(a4, a5, Address(state, 32)); 7416 __ stp(a6, a7, Address(state, 48)); 7417 __ stp(a8, a9, Address(state, 64)); 7418 __ stp(a10, a11, Address(state, 80)); 7419 __ stp(a12, a13, Address(state, 96)); 7420 __ stp(a14, a15, Address(state, 112)); 7421 __ stp(a16, a17, Address(state, 128)); 7422 __ stp(a18, a19, Address(state, 144)); 7423 __ stp(a20, a21, Address(state, 160)); 7424 __ stp(a22, a23, Address(state, 176)); 7425 __ str(a24, Address(state, 192)); 7426 7427 // restore required registers from stack 7428 __ ldp(r19, r20, Address(sp, 32)); 7429 __ ldp(r21, r22, Address(sp, 48)); 7430 __ ldp(r23, r24, Address(sp, 64)); 7431 __ ldp(r25, r26, Address(sp, 80)); 7432 __ ldp(r27, r28, Address(sp, 96)); 7433 if (can_use_fp && can_use_r18) { 7434 __ add(rfp, sp, 128); // leave() will copy rfp to sp below 7435 } // else no need to recalculate rfp, since it wasn't changed 7436 7437 __ leave(); 7438 7439 __ ret(lr); 7440 7441 return start; 7442 } 7443 7444 /** 7445 * Arguments: 7446 * 7447 * Inputs: 7448 * c_rarg0 - int crc 7449 * c_rarg1 - byte* buf 7450 * c_rarg2 - int length 7451 * 7452 * Output: 7453 * r0 - int crc result 7454 */ 7455 address generate_updateBytesCRC32() { 7456 assert(UseCRC32Intrinsics, "what are we doing here?"); 7457 7458 __ align(CodeEntryAlignment); 7459 StubGenStubId stub_id = StubGenStubId::updateBytesCRC32_id; 7460 StubCodeMark mark(this, stub_id); 7461 7462 address start = __ pc(); 7463 7464 const Register crc = c_rarg0; // crc 7465 const Register buf = c_rarg1; // source java byte array address 7466 const Register len = c_rarg2; // length 7467 const Register table0 = c_rarg3; // crc_table address 7468 const Register table1 = c_rarg4; 7469 const Register table2 = c_rarg5; 7470 const Register table3 = c_rarg6; 7471 const Register tmp3 =
c_rarg7; 7472 7473 BLOCK_COMMENT("Entry:"); 7474 __ enter(); // required for proper stackwalking of RuntimeStub frame 7475 7476 __ kernel_crc32(crc, buf, len, 7477 table0, table1, table2, table3, rscratch1, rscratch2, tmp3); 7478 7479 __ leave(); // required for proper stackwalking of RuntimeStub frame 7480 __ ret(lr); 7481 7482 return start; 7483 } 7484 7485 /** 7486 * Arguments: 7487 * 7488 * Inputs: 7489 * c_rarg0 - int crc 7490 * c_rarg1 - byte* buf 7491 * c_rarg2 - int length 7492 * c_rarg3 - int* table 7493 * 7494 * Output: 7495 * r0 - int crc result 7496 */ 7497 address generate_updateBytesCRC32C() { 7498 assert(UseCRC32CIntrinsics, "what are we doing here?"); 7499 7500 __ align(CodeEntryAlignment); 7501 StubGenStubId stub_id = StubGenStubId::updateBytesCRC32C_id; 7502 StubCodeMark mark(this, stub_id); 7503 7504 address start = __ pc(); 7505 7506 const Register crc = c_rarg0; // crc 7507 const Register buf = c_rarg1; // source java byte array address 7508 const Register len = c_rarg2; // length 7509 const Register table0 = c_rarg3; // crc_table address 7510 const Register table1 = c_rarg4; 7511 const Register table2 = c_rarg5; 7512 const Register table3 = c_rarg6; 7513 const Register tmp3 = c_rarg7; 7514 7515 BLOCK_COMMENT("Entry:"); 7516 __ enter(); // required for proper stackwalking of RuntimeStub frame 7517 7518 __ kernel_crc32c(crc, buf, len, 7519 table0, table1, table2, table3, rscratch1, rscratch2, tmp3); 7520 7521 __ leave(); // required for proper stackwalking of RuntimeStub frame 7522 __ ret(lr); 7523 7524 return start; 7525 } 7526 7527 /*** 7528 * Arguments: 7529 * 7530 * Inputs: 7531 * c_rarg0 - int adler 7532 * c_rarg1 - byte* buff 7533 * c_rarg2 - int len 7534 * 7535 * Output: 7536 * c_rarg0 - int adler result 7537 */ 7538 address generate_updateBytesAdler32() { 7539 __ align(CodeEntryAlignment); 7540 StubGenStubId stub_id = StubGenStubId::updateBytesAdler32_id; 7541 StubCodeMark mark(this, stub_id); 7542 address start = __ pc(); 7543 7544 Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1; 7545 7546 // Aliases 7547 Register adler = c_rarg0; 7548 Register s1 = c_rarg0; 7549 Register s2 = c_rarg3; 7550 Register buff = c_rarg1; 7551 Register len = c_rarg2; 7552 Register nmax = r4; 7553 Register base = r5; 7554 Register count = r6; 7555 Register temp0 = rscratch1; 7556 Register temp1 = rscratch2; 7557 FloatRegister vbytes = v0; 7558 FloatRegister vs1acc = v1; 7559 FloatRegister vs2acc = v2; 7560 FloatRegister vtable = v3; 7561 7562 // Max number of bytes we can process before having to take the mod 7563 // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 7564 uint64_t BASE = 0xfff1; 7565 uint64_t NMAX = 0x15B0; 7566 7567 __ mov(base, BASE); 7568 __ mov(nmax, NMAX); 7569 7570 // Load accumulation coefficients for the upper 16 bits 7571 __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table)); 7572 __ ld1(vtable, __ T16B, Address(temp0)); 7573 7574 // s1 is initialized to the lower 16 bits of adler 7575 // s2 is initialized to the upper 16 bits of adler 7576 __ ubfx(s2, adler, 16, 16); // s2 = ((adler >> 16) & 0xffff) 7577 __ uxth(s1, adler); // s1 = (adler & 0xffff) 7578 7579 // The pipelined loop needs at least 16 elements for 1 iteration 7580 // It does check this, but it is more effective to skip to the cleanup loop 7581 __ cmp(len, (u1)16); 7582 __ br(Assembler::HS, L_nmax); 7583 __ cbz(len, L_combine); 7584 7585 __ bind(L_simple_by1_loop); 7586 __ 
ldrb(temp0, Address(__ post(buff, 1))); 7587 __ add(s1, s1, temp0); 7588 __ add(s2, s2, s1); 7589 __ subs(len, len, 1); 7590 __ br(Assembler::HI, L_simple_by1_loop); 7591 7592 // s1 = s1 % BASE 7593 __ subs(temp0, s1, base); 7594 __ csel(s1, temp0, s1, Assembler::HS); 7595 7596 // s2 = s2 % BASE 7597 __ lsr(temp0, s2, 16); 7598 __ lsl(temp1, temp0, 4); 7599 __ sub(temp1, temp1, temp0); 7600 __ add(s2, temp1, s2, ext::uxth); 7601 7602 __ subs(temp0, s2, base); 7603 __ csel(s2, temp0, s2, Assembler::HS); 7604 7605 __ b(L_combine); 7606 7607 __ bind(L_nmax); 7608 __ subs(len, len, nmax); 7609 __ sub(count, nmax, 16); 7610 __ br(Assembler::LO, L_by16); 7611 7612 __ bind(L_nmax_loop); 7613 7614 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1, 7615 vbytes, vs1acc, vs2acc, vtable); 7616 7617 __ subs(count, count, 16); 7618 __ br(Assembler::HS, L_nmax_loop); 7619 7620 // s1 = s1 % BASE 7621 __ lsr(temp0, s1, 16); 7622 __ lsl(temp1, temp0, 4); 7623 __ sub(temp1, temp1, temp0); 7624 __ add(temp1, temp1, s1, ext::uxth); 7625 7626 __ lsr(temp0, temp1, 16); 7627 __ lsl(s1, temp0, 4); 7628 __ sub(s1, s1, temp0); 7629 __ add(s1, s1, temp1, ext:: uxth); 7630 7631 __ subs(temp0, s1, base); 7632 __ csel(s1, temp0, s1, Assembler::HS); 7633 7634 // s2 = s2 % BASE 7635 __ lsr(temp0, s2, 16); 7636 __ lsl(temp1, temp0, 4); 7637 __ sub(temp1, temp1, temp0); 7638 __ add(temp1, temp1, s2, ext::uxth); 7639 7640 __ lsr(temp0, temp1, 16); 7641 __ lsl(s2, temp0, 4); 7642 __ sub(s2, s2, temp0); 7643 __ add(s2, s2, temp1, ext:: uxth); 7644 7645 __ subs(temp0, s2, base); 7646 __ csel(s2, temp0, s2, Assembler::HS); 7647 7648 __ subs(len, len, nmax); 7649 __ sub(count, nmax, 16); 7650 __ br(Assembler::HS, L_nmax_loop); 7651 7652 __ bind(L_by16); 7653 __ adds(len, len, count); 7654 __ br(Assembler::LO, L_by1); 7655 7656 __ bind(L_by16_loop); 7657 7658 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1, 7659 vbytes, vs1acc, vs2acc, vtable); 7660 7661 __ subs(len, len, 16); 7662 __ br(Assembler::HS, L_by16_loop); 7663 7664 __ bind(L_by1); 7665 __ adds(len, len, 15); 7666 __ br(Assembler::LO, L_do_mod); 7667 7668 __ bind(L_by1_loop); 7669 __ ldrb(temp0, Address(__ post(buff, 1))); 7670 __ add(s1, temp0, s1); 7671 __ add(s2, s2, s1); 7672 __ subs(len, len, 1); 7673 __ br(Assembler::HS, L_by1_loop); 7674 7675 __ bind(L_do_mod); 7676 // s1 = s1 % BASE 7677 __ lsr(temp0, s1, 16); 7678 __ lsl(temp1, temp0, 4); 7679 __ sub(temp1, temp1, temp0); 7680 __ add(temp1, temp1, s1, ext::uxth); 7681 7682 __ lsr(temp0, temp1, 16); 7683 __ lsl(s1, temp0, 4); 7684 __ sub(s1, s1, temp0); 7685 __ add(s1, s1, temp1, ext:: uxth); 7686 7687 __ subs(temp0, s1, base); 7688 __ csel(s1, temp0, s1, Assembler::HS); 7689 7690 // s2 = s2 % BASE 7691 __ lsr(temp0, s2, 16); 7692 __ lsl(temp1, temp0, 4); 7693 __ sub(temp1, temp1, temp0); 7694 __ add(temp1, temp1, s2, ext::uxth); 7695 7696 __ lsr(temp0, temp1, 16); 7697 __ lsl(s2, temp0, 4); 7698 __ sub(s2, s2, temp0); 7699 __ add(s2, s2, temp1, ext:: uxth); 7700 7701 __ subs(temp0, s2, base); 7702 __ csel(s2, temp0, s2, Assembler::HS); 7703 7704 // Combine lower bits and higher bits 7705 __ bind(L_combine); 7706 __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16) 7707 7708 __ ret(lr); 7709 7710 return start; 7711 } 7712 7713 void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff, 7714 Register temp0, Register temp1, FloatRegister vbytes, 7715 FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) { 7716 // Below is a vectorized 
implementation of updating s1 and s2 for 16 bytes. 7717 // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration. 7718 // In non-vectorized code, we update s1 and s2 as: 7719 // s1 <- s1 + b1 7720 // s2 <- s2 + s1 7721 // s1 <- s1 + b2 7722 // s2 <- s2 + s1 7723 // ... 7724 // s1 <- s1 + b16 7725 // s2 <- s2 + s1 7726 // Putting above assignments together, we have: 7727 // s1_new = s1 + b1 + b2 + ... + b16 7728 // s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16) 7729 // = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1) 7730 // = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1) 7731 __ ld1(vbytes, __ T16B, Address(__ post(buff, 16))); 7732 7733 // s2 = s2 + s1 * 16 7734 __ add(s2, s2, s1, Assembler::LSL, 4); 7735 7736 // vs1acc = b1 + b2 + b3 + ... + b16 7737 // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1) 7738 __ umullv(vs2acc, __ T8B, vtable, vbytes); 7739 __ umlalv(vs2acc, __ T16B, vtable, vbytes); 7740 __ uaddlv(vs1acc, __ T16B, vbytes); 7741 __ uaddlv(vs2acc, __ T8H, vs2acc); 7742 7743 // s1 = s1 + vs1acc, s2 = s2 + vs2acc 7744 __ fmovd(temp0, vs1acc); 7745 __ fmovd(temp1, vs2acc); 7746 __ add(s1, s1, temp0); 7747 __ add(s2, s2, temp1); 7748 } 7749 7750 /** 7751 * Arguments: 7752 * 7753 * Input: 7754 * c_rarg0 - x address 7755 * c_rarg1 - x length 7756 * c_rarg2 - y address 7757 * c_rarg3 - y length 7758 * c_rarg4 - z address 7759 */ 7760 address generate_multiplyToLen() { 7761 __ align(CodeEntryAlignment); 7762 StubGenStubId stub_id = StubGenStubId::multiplyToLen_id; 7763 StubCodeMark mark(this, stub_id); 7764 7765 address start = __ pc(); 7766 const Register x = r0; 7767 const Register xlen = r1; 7768 const Register y = r2; 7769 const Register ylen = r3; 7770 const Register z = r4; 7771 7772 const Register tmp0 = r5; 7773 const Register tmp1 = r10; 7774 const Register tmp2 = r11; 7775 const Register tmp3 = r12; 7776 const Register tmp4 = r13; 7777 const Register tmp5 = r14; 7778 const Register tmp6 = r15; 7779 const Register tmp7 = r16; 7780 7781 BLOCK_COMMENT("Entry:"); 7782 __ enter(); // required for proper stackwalking of RuntimeStub frame 7783 __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 7784 __ leave(); // required for proper stackwalking of RuntimeStub frame 7785 __ ret(lr); 7786 7787 return start; 7788 } 7789 7790 address generate_squareToLen() { 7791 // the squareToLen algorithm for sizes 1..127 described in the Java code works 7792 // faster than multiply_to_len on some CPUs and slower on others, but 7793 // multiply_to_len shows slightly better results overall 7794 __ align(CodeEntryAlignment); 7795 StubGenStubId stub_id = StubGenStubId::squareToLen_id; 7796 StubCodeMark mark(this, stub_id); 7797 address start = __ pc(); 7798 7799 const Register x = r0; 7800 const Register xlen = r1; 7801 const Register z = r2; 7802 const Register y = r4; // == x 7803 const Register ylen = r5; // == xlen 7804 7805 const Register tmp0 = r3; 7806 const Register tmp1 = r10; 7807 const Register tmp2 = r11; 7808 const Register tmp3 = r12; 7809 const Register tmp4 = r13; 7810 const Register tmp5 = r14; 7811 const Register tmp6 = r15; 7812 const Register tmp7 = r16; 7813 7814 RegSet spilled_regs = RegSet::of(y, ylen); 7815 BLOCK_COMMENT("Entry:"); 7816 __ enter(); 7817 __ push(spilled_regs, sp); 7818 __ mov(y, x); 7819 __ mov(ylen, xlen); 7820 __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 7821 __ pop(spilled_regs, sp); 7822 __ leave(); 7823 __ ret(lr);
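// n.b. no dedicated squaring kernel is emitted here: the stub aliases
// y = x and ylen = xlen and reuses the general multiply, so in effect
//
//   squareToLen(x, xlen, z)  ==  multiplyToLen(x, xlen, x, xlen, z)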
7824 return start; 7825 } 7826 7827 address generate_mulAdd() { 7828 __ align(CodeEntryAlignment); 7829 StubGenStubId stub_id = StubGenStubId::mulAdd_id; 7830 StubCodeMark mark(this, stub_id); 7831 7832 address start = __ pc(); 7833 7834 const Register out = r0; 7835 const Register in = r1; 7836 const Register offset = r2; 7837 const Register len = r3; 7838 const Register k = r4; 7839 7840 BLOCK_COMMENT("Entry:"); 7841 __ enter(); 7842 __ mul_add(out, in, offset, len, k); 7843 __ leave(); 7844 __ ret(lr); 7845 7846 return start; 7847 } 7848 7849 // Arguments: 7850 // 7851 // Input: 7852 // c_rarg0 - newArr address 7853 // c_rarg1 - oldArr address 7854 // c_rarg2 - newIdx 7855 // c_rarg3 - shiftCount 7856 // c_rarg4 - numIter 7857 // 7858 address generate_bigIntegerRightShift() { 7859 __ align(CodeEntryAlignment); 7860 StubGenStubId stub_id = StubGenStubId::bigIntegerRightShiftWorker_id; 7861 StubCodeMark mark(this, stub_id); 7862 address start = __ pc(); 7863 7864 Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit; 7865 7866 Register newArr = c_rarg0; 7867 Register oldArr = c_rarg1; 7868 Register newIdx = c_rarg2; 7869 Register shiftCount = c_rarg3; 7870 Register numIter = c_rarg4; 7871 Register idx = numIter; 7872 7873 Register newArrCur = rscratch1; 7874 Register shiftRevCount = rscratch2; 7875 Register oldArrCur = r13; 7876 Register oldArrNext = r14; 7877 7878 FloatRegister oldElem0 = v0; 7879 FloatRegister oldElem1 = v1; 7880 FloatRegister newElem = v2; 7881 FloatRegister shiftVCount = v3; 7882 FloatRegister shiftVRevCount = v4; 7883 7884 __ cbz(idx, Exit); 7885 7886 __ add(newArr, newArr, newIdx, Assembler::LSL, 2); 7887 7888 // left shift count 7889 __ movw(shiftRevCount, 32); 7890 __ subw(shiftRevCount, shiftRevCount, shiftCount); 7891 7892 // numIter too small to allow a 4-words SIMD loop, rolling back 7893 __ cmp(numIter, (u1)4); 7894 __ br(Assembler::LT, ShiftThree); 7895 7896 __ dup(shiftVCount, __ T4S, shiftCount); 7897 __ dup(shiftVRevCount, __ T4S, shiftRevCount); 7898 __ negr(shiftVCount, __ T4S, shiftVCount); 7899 7900 __ BIND(ShiftSIMDLoop); 7901 7902 // Calculate the load addresses 7903 __ sub(idx, idx, 4); 7904 __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2); 7905 __ add(newArrCur, newArr, idx, Assembler::LSL, 2); 7906 __ add(oldArrCur, oldArrNext, 4); 7907 7908 // Load 4 words and process 7909 __ ld1(oldElem0, __ T4S, Address(oldArrCur)); 7910 __ ld1(oldElem1, __ T4S, Address(oldArrNext)); 7911 __ ushl(oldElem0, __ T4S, oldElem0, shiftVCount); 7912 __ ushl(oldElem1, __ T4S, oldElem1, shiftVRevCount); 7913 __ orr(newElem, __ T16B, oldElem0, oldElem1); 7914 __ st1(newElem, __ T4S, Address(newArrCur)); 7915 7916 __ cmp(idx, (u1)4); 7917 __ br(Assembler::LT, ShiftTwoLoop); 7918 __ b(ShiftSIMDLoop); 7919 7920 __ BIND(ShiftTwoLoop); 7921 __ cbz(idx, Exit); 7922 __ cmp(idx, (u1)1); 7923 __ br(Assembler::EQ, ShiftOne); 7924 7925 // Calculate the load addresses 7926 __ sub(idx, idx, 2); 7927 __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2); 7928 __ add(newArrCur, newArr, idx, Assembler::LSL, 2); 7929 __ add(oldArrCur, oldArrNext, 4); 7930 7931 // Load 2 words and process 7932 __ ld1(oldElem0, __ T2S, Address(oldArrCur)); 7933 __ ld1(oldElem1, __ T2S, Address(oldArrNext)); 7934 __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount); 7935 __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount); 7936 __ orr(newElem, __ T8B, oldElem0, oldElem1); 7937 __ st1(newElem, __ T2S, Address(newArrCur)); 7938 __ b(ShiftTwoLoop); 7939 7940 __ BIND(ShiftThree); 7941 __ tbz(idx, 
1, ShiftOne); 7942 __ tbz(idx, 0, ShiftTwo); 7943 __ ldrw(r10, Address(oldArr, 12)); 7944 __ ldrw(r11, Address(oldArr, 8)); 7945 __ lsrvw(r10, r10, shiftCount); 7946 __ lslvw(r11, r11, shiftRevCount); 7947 __ orrw(r12, r10, r11); 7948 __ strw(r12, Address(newArr, 8)); 7949 7950 __ BIND(ShiftTwo); 7951 __ ldrw(r10, Address(oldArr, 8)); 7952 __ ldrw(r11, Address(oldArr, 4)); 7953 __ lsrvw(r10, r10, shiftCount); 7954 __ lslvw(r11, r11, shiftRevCount); 7955 __ orrw(r12, r10, r11); 7956 __ strw(r12, Address(newArr, 4)); 7957 7958 __ BIND(ShiftOne); 7959 __ ldrw(r10, Address(oldArr, 4)); 7960 __ ldrw(r11, Address(oldArr)); 7961 __ lsrvw(r10, r10, shiftCount); 7962 __ lslvw(r11, r11, shiftRevCount); 7963 __ orrw(r12, r10, r11); 7964 __ strw(r12, Address(newArr)); 7965 7966 __ BIND(Exit); 7967 __ ret(lr); 7968 7969 return start; 7970 } 7971 7972 // Arguments: 7973 // 7974 // Input: 7975 // c_rarg0 - newArr address 7976 // c_rarg1 - oldArr address 7977 // c_rarg2 - newIdx 7978 // c_rarg3 - shiftCount 7979 // c_rarg4 - numIter 7980 // 7981 address generate_bigIntegerLeftShift() { 7982 __ align(CodeEntryAlignment); 7983 StubGenStubId stub_id = StubGenStubId::bigIntegerLeftShiftWorker_id; 7984 StubCodeMark mark(this, stub_id); 7985 address start = __ pc(); 7986 7987 Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit; 7988 7989 Register newArr = c_rarg0; 7990 Register oldArr = c_rarg1; 7991 Register newIdx = c_rarg2; 7992 Register shiftCount = c_rarg3; 7993 Register numIter = c_rarg4; 7994 7995 Register shiftRevCount = rscratch1; 7996 Register oldArrNext = rscratch2; 7997 7998 FloatRegister oldElem0 = v0; 7999 FloatRegister oldElem1 = v1; 8000 FloatRegister newElem = v2; 8001 FloatRegister shiftVCount = v3; 8002 FloatRegister shiftVRevCount = v4; 8003 8004 __ cbz(numIter, Exit); 8005 8006 __ add(oldArrNext, oldArr, 4); 8007 __ add(newArr, newArr, newIdx, Assembler::LSL, 2); 8008 8009 // right shift count 8010 __ movw(shiftRevCount, 32); 8011 __ subw(shiftRevCount, shiftRevCount, shiftCount); 8012 8013 // numIter too small to allow a 4-words SIMD loop, rolling back 8014 __ cmp(numIter, (u1)4); 8015 __ br(Assembler::LT, ShiftThree); 8016 8017 __ dup(shiftVCount, __ T4S, shiftCount); 8018 __ dup(shiftVRevCount, __ T4S, shiftRevCount); 8019 __ negr(shiftVRevCount, __ T4S, shiftVRevCount); 8020 8021 __ BIND(ShiftSIMDLoop); 8022 8023 // load 4 words and process 8024 __ ld1(oldElem0, __ T4S, __ post(oldArr, 16)); 8025 __ ld1(oldElem1, __ T4S, __ post(oldArrNext, 16)); 8026 __ ushl(oldElem0, __ T4S, oldElem0, shiftVCount); 8027 __ ushl(oldElem1, __ T4S, oldElem1, shiftVRevCount); 8028 __ orr(newElem, __ T16B, oldElem0, oldElem1); 8029 __ st1(newElem, __ T4S, __ post(newArr, 16)); 8030 __ sub(numIter, numIter, 4); 8031 8032 __ cmp(numIter, (u1)4); 8033 __ br(Assembler::LT, ShiftTwoLoop); 8034 __ b(ShiftSIMDLoop); 8035 8036 __ BIND(ShiftTwoLoop); 8037 __ cbz(numIter, Exit); 8038 __ cmp(numIter, (u1)1); 8039 __ br(Assembler::EQ, ShiftOne); 8040 8041 // load 2 words and process 8042 __ ld1(oldElem0, __ T2S, __ post(oldArr, 8)); 8043 __ ld1(oldElem1, __ T2S, __ post(oldArrNext, 8)); 8044 __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount); 8045 __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount); 8046 __ orr(newElem, __ T8B, oldElem0, oldElem1); 8047 __ st1(newElem, __ T2S, __ post(newArr, 8)); 8048 __ sub(numIter, numIter, 2); 8049 __ b(ShiftTwoLoop); 8050 8051 __ BIND(ShiftThree); 8052 __ ldrw(r10, __ post(oldArr, 4)); 8053 __ ldrw(r11, __ post(oldArrNext, 4)); 8054 __ lslvw(r10, r10, 
shiftCount); 8055 __ lsrvw(r11, r11, shiftRevCount); 8056 __ orrw(r12, r10, r11); 8057 __ strw(r12, __ post(newArr, 4)); 8058 __ tbz(numIter, 1, Exit); 8059 __ tbz(numIter, 0, ShiftOne); 8060 8061 __ BIND(ShiftTwo); 8062 __ ldrw(r10, __ post(oldArr, 4)); 8063 __ ldrw(r11, __ post(oldArrNext, 4)); 8064 __ lslvw(r10, r10, shiftCount); 8065 __ lsrvw(r11, r11, shiftRevCount); 8066 __ orrw(r12, r10, r11); 8067 __ strw(r12, __ post(newArr, 4)); 8068 8069 __ BIND(ShiftOne); 8070 __ ldrw(r10, Address(oldArr)); 8071 __ ldrw(r11, Address(oldArrNext)); 8072 __ lslvw(r10, r10, shiftCount); 8073 __ lsrvw(r11, r11, shiftRevCount); 8074 __ orrw(r12, r10, r11); 8075 __ strw(r12, Address(newArr)); 8076 8077 __ BIND(Exit); 8078 __ ret(lr); 8079 8080 return start; 8081 } 8082 8083 address generate_count_positives(address &count_positives_long) { 8084 const u1 large_loop_size = 64; 8085 const uint64_t UPPER_BIT_MASK=0x8080808080808080; 8086 int dcache_line = VM_Version::dcache_line_size(); 8087 8088 Register ary1 = r1, len = r2, result = r0; 8089 8090 __ align(CodeEntryAlignment); 8091 8092 StubGenStubId stub_id = StubGenStubId::count_positives_id; 8093 StubCodeMark mark(this, stub_id); 8094 8095 address entry = __ pc(); 8096 8097 __ enter(); 8098 // precondition: a copy of len is already in result 8099 // __ mov(result, len); 8100 8101 Label RET_ADJUST, RET_ADJUST_16, RET_ADJUST_LONG, RET_NO_POP, RET_LEN, ALIGNED, LOOP16, CHECK_16, 8102 LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL; 8103 8104 __ cmp(len, (u1)15); 8105 __ br(Assembler::GT, LEN_OVER_15); 8106 // The only case when execution falls into this code is when pointer is near 8107 // the end of memory page and we have to avoid reading next page 8108 __ add(ary1, ary1, len); 8109 __ subs(len, len, 8); 8110 __ br(Assembler::GT, LEN_OVER_8); 8111 __ ldr(rscratch2, Address(ary1, -8)); 8112 __ sub(rscratch1, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes. 
8113 __ lsrv(rscratch2, rscratch2, rscratch1); 8114 __ tst(rscratch2, UPPER_BIT_MASK); 8115 __ csel(result, zr, result, Assembler::NE); 8116 __ leave(); 8117 __ ret(lr); 8118 __ bind(LEN_OVER_8); 8119 __ ldp(rscratch1, rscratch2, Address(ary1, -16)); 8120 __ sub(len, len, 8); // no data dep., then sub can be executed while loading 8121 __ tst(rscratch2, UPPER_BIT_MASK); 8122 __ br(Assembler::NE, RET_NO_POP); 8123 __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes 8124 __ lsrv(rscratch1, rscratch1, rscratch2); 8125 __ tst(rscratch1, UPPER_BIT_MASK); 8126 __ bind(RET_NO_POP); 8127 __ csel(result, zr, result, Assembler::NE); 8128 __ leave(); 8129 __ ret(lr); 8130 8131 Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10; 8132 const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6; 8133 8134 count_positives_long = __ pc(); // 2nd entry point 8135 8136 __ enter(); 8137 8138 __ bind(LEN_OVER_15); 8139 __ push(spilled_regs, sp); 8140 __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment 8141 __ cbz(rscratch2, ALIGNED); 8142 __ ldp(tmp6, tmp1, Address(ary1)); 8143 __ mov(tmp5, 16); 8144 __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address 8145 __ add(ary1, ary1, rscratch1); 8146 __ orr(tmp6, tmp6, tmp1); 8147 __ tst(tmp6, UPPER_BIT_MASK); 8148 __ br(Assembler::NE, RET_ADJUST); 8149 __ sub(len, len, rscratch1); 8150 8151 __ bind(ALIGNED); 8152 __ cmp(len, large_loop_size); 8153 __ br(Assembler::LT, CHECK_16); 8154 // Perform 16-byte load as early return in pre-loop to handle situation 8155 // when initially aligned large array has negative values at starting bytes, 8156 // so LARGE_LOOP would do 4 reads instead of 1 (in worst case), which is 8157 // slower. Cases with negative bytes further ahead won't be affected that 8158 // much. In fact, it'll be faster due to early loads, less instructions and 8159 // less branches in LARGE_LOOP. 8160 __ ldp(tmp6, tmp1, Address(__ post(ary1, 16))); 8161 __ sub(len, len, 16); 8162 __ orr(tmp6, tmp6, tmp1); 8163 __ tst(tmp6, UPPER_BIT_MASK); 8164 __ br(Assembler::NE, RET_ADJUST_16); 8165 __ cmp(len, large_loop_size); 8166 __ br(Assembler::LT, CHECK_16); 8167 8168 if (SoftwarePrefetchHintDistance >= 0 8169 && SoftwarePrefetchHintDistance >= dcache_line) { 8170 // initial prefetch 8171 __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line)); 8172 } 8173 __ bind(LARGE_LOOP); 8174 if (SoftwarePrefetchHintDistance >= 0) { 8175 __ prfm(Address(ary1, SoftwarePrefetchHintDistance)); 8176 } 8177 // Issue load instructions first, since it can save few CPU/MEM cycles, also 8178 // instead of 4 triples of "orr(...), addr(...);cbnz(...);" (for each ldp) 8179 // better generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...) which saves 3 8180 // instructions per cycle and have less branches, but this approach disables 8181 // early return, thus, all 64 bytes are loaded and checked every time. 
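// Reduction sketch for one 64-byte iteration, matching the code below:
// the eight loaded words are OR-folded into a single word and the
// per-byte sign bits are tested once, e.g.
//
//   t = (w0|w1) | (w2|w3) | (w4|w5) | (w6|w7);
//   if (t & 0x8080808080808080) goto RET_ADJUST_LONG; // some byte was negative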
8182 __ ldp(tmp2, tmp3, Address(ary1)); 8183 __ ldp(tmp4, tmp5, Address(ary1, 16)); 8184 __ ldp(rscratch1, rscratch2, Address(ary1, 32)); 8185 __ ldp(tmp6, tmp1, Address(ary1, 48)); 8186 __ add(ary1, ary1, large_loop_size); 8187 __ sub(len, len, large_loop_size); 8188 __ orr(tmp2, tmp2, tmp3); 8189 __ orr(tmp4, tmp4, tmp5); 8190 __ orr(rscratch1, rscratch1, rscratch2); 8191 __ orr(tmp6, tmp6, tmp1); 8192 __ orr(tmp2, tmp2, tmp4); 8193 __ orr(rscratch1, rscratch1, tmp6); 8194 __ orr(tmp2, tmp2, rscratch1); 8195 __ tst(tmp2, UPPER_BIT_MASK); 8196 __ br(Assembler::NE, RET_ADJUST_LONG); 8197 __ cmp(len, large_loop_size); 8198 __ br(Assembler::GE, LARGE_LOOP); 8199 8200 __ bind(CHECK_16); // small 16-byte load pre-loop 8201 __ cmp(len, (u1)16); 8202 __ br(Assembler::LT, POST_LOOP16); 8203 8204 __ bind(LOOP16); // small 16-byte load loop 8205 __ ldp(tmp2, tmp3, Address(__ post(ary1, 16))); 8206 __ sub(len, len, 16); 8207 __ orr(tmp2, tmp2, tmp3); 8208 __ tst(tmp2, UPPER_BIT_MASK); 8209 __ br(Assembler::NE, RET_ADJUST_16); 8210 __ cmp(len, (u1)16); 8211 __ br(Assembler::GE, LOOP16); // 16-byte load loop end 8212 8213 __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally 8214 __ cmp(len, (u1)8); 8215 __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL); 8216 __ ldr(tmp3, Address(__ post(ary1, 8))); 8217 __ tst(tmp3, UPPER_BIT_MASK); 8218 __ br(Assembler::NE, RET_ADJUST); 8219 __ sub(len, len, 8); 8220 8221 __ bind(POST_LOOP16_LOAD_TAIL); 8222 __ cbz(len, RET_LEN); // Can't shift left by 64 when len==0 8223 __ ldr(tmp1, Address(ary1)); 8224 __ mov(tmp2, 64); 8225 __ sub(tmp4, tmp2, len, __ LSL, 3); 8226 __ lslv(tmp1, tmp1, tmp4); 8227 __ tst(tmp1, UPPER_BIT_MASK); 8228 __ br(Assembler::NE, RET_ADJUST); 8229 // Fallthrough 8230 8231 __ bind(RET_LEN); 8232 __ pop(spilled_regs, sp); 8233 __ leave(); 8234 __ ret(lr); 8235 8236 // difference result - len is the count of guaranteed to be 8237 // positive bytes 8238 8239 __ bind(RET_ADJUST_LONG); 8240 __ add(len, len, (u1)(large_loop_size - 16)); 8241 __ bind(RET_ADJUST_16); 8242 __ add(len, len, 16); 8243 __ bind(RET_ADJUST); 8244 __ pop(spilled_regs, sp); 8245 __ leave(); 8246 __ sub(result, result, len); 8247 __ ret(lr); 8248 8249 return entry; 8250 } 8251 8252 void generate_large_array_equals_loop_nonsimd(int loopThreshold, 8253 bool usePrefetch, Label &NOT_EQUAL) { 8254 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 8255 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11, 8256 tmp7 = r12, tmp8 = r13; 8257 Label LOOP; 8258 8259 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 8260 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 8261 __ bind(LOOP); 8262 if (usePrefetch) { 8263 __ prfm(Address(a1, SoftwarePrefetchHintDistance)); 8264 __ prfm(Address(a2, SoftwarePrefetchHintDistance)); 8265 } 8266 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize))); 8267 __ eor(tmp1, tmp1, tmp2); 8268 __ eor(tmp3, tmp3, tmp4); 8269 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize))); 8270 __ orr(tmp1, tmp1, tmp3); 8271 __ cbnz(tmp1, NOT_EQUAL); 8272 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 8273 __ eor(tmp5, tmp5, tmp6); 8274 __ eor(tmp7, tmp7, tmp8); 8275 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 8276 __ orr(tmp5, tmp5, tmp7); 8277 __ cbnz(tmp5, NOT_EQUAL); 8278 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize))); 8279 __ eor(tmp1, tmp1, tmp2); 8280 __ eor(tmp3, tmp3, tmp4); 8281 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize))); 8282 __ orr(tmp1, tmp1, tmp3); 8283 __ 
cbnz(tmp1, NOT_EQUAL); 8284 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 8285 __ eor(tmp5, tmp5, tmp6); 8286 __ sub(cnt1, cnt1, 8 * wordSize); 8287 __ eor(tmp7, tmp7, tmp8); 8288 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 8289 // tmp6 is not used. MacroAssembler::subs is used here (rather than 8290 // cmp) because subs allows an unlimited range of immediate operand. 8291 __ subs(tmp6, cnt1, loopThreshold); 8292 __ orr(tmp5, tmp5, tmp7); 8293 __ cbnz(tmp5, NOT_EQUAL); 8294 __ br(__ GE, LOOP); 8295 // post-loop 8296 __ eor(tmp1, tmp1, tmp2); 8297 __ eor(tmp3, tmp3, tmp4); 8298 __ orr(tmp1, tmp1, tmp3); 8299 __ sub(cnt1, cnt1, 2 * wordSize); 8300 __ cbnz(tmp1, NOT_EQUAL); 8301 } 8302 8303 void generate_large_array_equals_loop_simd(int loopThreshold, 8304 bool usePrefetch, Label &NOT_EQUAL) { 8305 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 8306 tmp2 = rscratch2; 8307 Label LOOP; 8308 8309 __ bind(LOOP); 8310 if (usePrefetch) { 8311 __ prfm(Address(a1, SoftwarePrefetchHintDistance)); 8312 __ prfm(Address(a2, SoftwarePrefetchHintDistance)); 8313 } 8314 __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize))); 8315 __ sub(cnt1, cnt1, 8 * wordSize); 8316 __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize))); 8317 __ subs(tmp1, cnt1, loopThreshold); 8318 __ eor(v0, __ T16B, v0, v4); 8319 __ eor(v1, __ T16B, v1, v5); 8320 __ eor(v2, __ T16B, v2, v6); 8321 __ eor(v3, __ T16B, v3, v7); 8322 __ orr(v0, __ T16B, v0, v1); 8323 __ orr(v1, __ T16B, v2, v3); 8324 __ orr(v0, __ T16B, v0, v1); 8325 __ umov(tmp1, v0, __ D, 0); 8326 __ umov(tmp2, v0, __ D, 1); 8327 __ orr(tmp1, tmp1, tmp2); 8328 __ cbnz(tmp1, NOT_EQUAL); 8329 __ br(__ GE, LOOP); 8330 } 8331 8332 // a1 = r1 - array1 address 8333 // a2 = r2 - array2 address 8334 // result = r0 - return value. Already contains "false" 8335 // cnt1 = r10 - amount of elements left to check, reduced by wordSize 8336 // r3-r5 are reserved temporary registers 8337 // Clobbers: v0-v7 when UseSIMDForArrayEquals, rscratch1, rscratch2 8338 address generate_large_array_equals() { 8339 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 8340 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11, 8341 tmp7 = r12, tmp8 = r13; 8342 Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP, 8343 SMALL_LOOP, POST_LOOP; 8344 const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16; 8345 // calculate if at least 32 prefetched bytes are used 8346 int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32; 8347 int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE); 8348 RegSet spilled_regs = RegSet::range(tmp6, tmp8); 8349 assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4, 8350 tmp5, tmp6, tmp7, tmp8); 8351 8352 __ align(CodeEntryAlignment); 8353 8354 StubGenStubId stub_id = StubGenStubId::large_array_equals_id; 8355 StubCodeMark mark(this, stub_id); 8356 8357 address entry = __ pc(); 8358 __ enter(); 8359 __ sub(cnt1, cnt1, wordSize); // first 8 bytes were loaded outside of stub 8360 // also advance pointers to use post-increment instead of pre-increment 8361 __ add(a1, a1, wordSize); 8362 __ add(a2, a2, wordSize); 8363 if (AvoidUnalignedAccesses) { 8364 // both implementations (SIMD/nonSIMD) are using relatively large load 8365 // instructions (ld1/ldp), which has huge penalty (up to x2 exec time) 8366 // on some CPUs in case of address is not at least 16-byte aligned. 
8367 // Arrays are 8-byte aligned currently, so, we can make additional 8-byte 8368 // load if needed at least for 1st address and make if 16-byte aligned. 8369 Label ALIGNED16; 8370 __ tbz(a1, 3, ALIGNED16); 8371 __ ldr(tmp1, Address(__ post(a1, wordSize))); 8372 __ ldr(tmp2, Address(__ post(a2, wordSize))); 8373 __ sub(cnt1, cnt1, wordSize); 8374 __ eor(tmp1, tmp1, tmp2); 8375 __ cbnz(tmp1, NOT_EQUAL_NO_POP); 8376 __ bind(ALIGNED16); 8377 } 8378 if (UseSIMDForArrayEquals) { 8379 if (SoftwarePrefetchHintDistance >= 0) { 8380 __ subs(tmp1, cnt1, prefetchLoopThreshold); 8381 __ br(__ LE, NO_PREFETCH_LARGE_LOOP); 8382 generate_large_array_equals_loop_simd(prefetchLoopThreshold, 8383 /* prfm = */ true, NOT_EQUAL); 8384 __ subs(zr, cnt1, nonPrefetchLoopThreshold); 8385 __ br(__ LT, TAIL); 8386 } 8387 __ bind(NO_PREFETCH_LARGE_LOOP); 8388 generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold, 8389 /* prfm = */ false, NOT_EQUAL); 8390 } else { 8391 __ push(spilled_regs, sp); 8392 if (SoftwarePrefetchHintDistance >= 0) { 8393 __ subs(tmp1, cnt1, prefetchLoopThreshold); 8394 __ br(__ LE, NO_PREFETCH_LARGE_LOOP); 8395 generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold, 8396 /* prfm = */ true, NOT_EQUAL); 8397 __ subs(zr, cnt1, nonPrefetchLoopThreshold); 8398 __ br(__ LT, TAIL); 8399 } 8400 __ bind(NO_PREFETCH_LARGE_LOOP); 8401 generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold, 8402 /* prfm = */ false, NOT_EQUAL); 8403 } 8404 __ bind(TAIL); 8405 __ cbz(cnt1, EQUAL); 8406 __ subs(cnt1, cnt1, wordSize); 8407 __ br(__ LE, POST_LOOP); 8408 __ bind(SMALL_LOOP); 8409 __ ldr(tmp1, Address(__ post(a1, wordSize))); 8410 __ ldr(tmp2, Address(__ post(a2, wordSize))); 8411 __ subs(cnt1, cnt1, wordSize); 8412 __ eor(tmp1, tmp1, tmp2); 8413 __ cbnz(tmp1, NOT_EQUAL); 8414 __ br(__ GT, SMALL_LOOP); 8415 __ bind(POST_LOOP); 8416 __ ldr(tmp1, Address(a1, cnt1)); 8417 __ ldr(tmp2, Address(a2, cnt1)); 8418 __ eor(tmp1, tmp1, tmp2); 8419 __ cbnz(tmp1, NOT_EQUAL); 8420 __ bind(EQUAL); 8421 __ mov(result, true); 8422 __ bind(NOT_EQUAL); 8423 if (!UseSIMDForArrayEquals) { 8424 __ pop(spilled_regs, sp); 8425 } 8426 __ bind(NOT_EQUAL_NO_POP); 8427 __ leave(); 8428 __ ret(lr); 8429 return entry; 8430 } 8431 8432 // result = r0 - return value. Contains initial hashcode value on entry. 
8433 // ary = r1 - array address 8434 // cnt = r2 - elements count 8435 // Clobbers: v0-v13, rscratch1, rscratch2 8436 address generate_large_arrays_hashcode(BasicType eltype) { 8437 const Register result = r0, ary = r1, cnt = r2; 8438 const FloatRegister vdata0 = v3, vdata1 = v2, vdata2 = v1, vdata3 = v0; 8439 const FloatRegister vmul0 = v4, vmul1 = v5, vmul2 = v6, vmul3 = v7; 8440 const FloatRegister vpow = v12; // powers of 31: <31^3, ..., 31^0> 8441 const FloatRegister vpowm = v13; 8442 8443 ARRAYS_HASHCODE_REGISTERS; 8444 8445 Label SMALL_LOOP, LARGE_LOOP_PREHEADER, LARGE_LOOP, TAIL, TAIL_SHORTCUT, BR_BASE; 8446 8447 unsigned int vf; // vectorization factor 8448 bool multiply_by_halves; 8449 Assembler::SIMD_Arrangement load_arrangement; 8450 switch (eltype) { 8451 case T_BOOLEAN: 8452 case T_BYTE: 8453 load_arrangement = Assembler::T8B; 8454 multiply_by_halves = true; 8455 vf = 8; 8456 break; 8457 case T_CHAR: 8458 case T_SHORT: 8459 load_arrangement = Assembler::T8H; 8460 multiply_by_halves = true; 8461 vf = 8; 8462 break; 8463 case T_INT: 8464 load_arrangement = Assembler::T4S; 8465 multiply_by_halves = false; 8466 vf = 4; 8467 break; 8468 default: 8469 ShouldNotReachHere(); 8470 } 8471 8472 // Unroll factor 8473 const unsigned uf = 4; 8474 8475 // Effective vectorization factor 8476 const unsigned evf = vf * uf; 8477 8478 __ align(CodeEntryAlignment); 8479 8480 StubGenStubId stub_id; 8481 switch (eltype) { 8482 case T_BOOLEAN: 8483 stub_id = StubGenStubId::large_arrays_hashcode_boolean_id; 8484 break; 8485 case T_BYTE: 8486 stub_id = StubGenStubId::large_arrays_hashcode_byte_id; 8487 break; 8488 case T_CHAR: 8489 stub_id = StubGenStubId::large_arrays_hashcode_char_id; 8490 break; 8491 case T_SHORT: 8492 stub_id = StubGenStubId::large_arrays_hashcode_short_id; 8493 break; 8494 case T_INT: 8495 stub_id = StubGenStubId::large_arrays_hashcode_int_id; 8496 break; 8497 default: 8498 stub_id = StubGenStubId::NO_STUBID; 8499 ShouldNotReachHere(); 8500 }; 8501 8502 StubCodeMark mark(this, stub_id); 8503 8504 address entry = __ pc(); 8505 __ enter(); 8506 8507 // Put 0-3'th powers of 31 into a single SIMD register together. The register will be used in 8508 // the SMALL and LARGE LOOPS' epilogues. The initialization is hoisted here and the register's 8509 // value shouldn't change throughout both loops. 8510 __ movw(rscratch1, intpow(31U, 3)); 8511 __ mov(vpow, Assembler::S, 0, rscratch1); 8512 __ movw(rscratch1, intpow(31U, 2)); 8513 __ mov(vpow, Assembler::S, 1, rscratch1); 8514 __ movw(rscratch1, intpow(31U, 1)); 8515 __ mov(vpow, Assembler::S, 2, rscratch1); 8516 __ movw(rscratch1, intpow(31U, 0)); 8517 __ mov(vpow, Assembler::S, 3, rscratch1); 8518 8519 __ mov(vmul0, Assembler::T16B, 0); 8520 __ mov(vmul0, Assembler::S, 3, result); 8521 8522 __ andr(rscratch2, cnt, (uf - 1) * vf); 8523 __ cbz(rscratch2, LARGE_LOOP_PREHEADER); 8524 8525 __ movw(rscratch1, intpow(31U, multiply_by_halves ? 
vf / 2 : vf)); 8526 __ mov(vpowm, Assembler::S, 0, rscratch1); 8527 8528 // SMALL LOOP 8529 __ bind(SMALL_LOOP); 8530 8531 __ ld1(vdata0, load_arrangement, Address(__ post(ary, vf * type2aelembytes(eltype)))); 8532 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0); 8533 __ subsw(rscratch2, rscratch2, vf); 8534 8535 if (load_arrangement == Assembler::T8B) { 8536 // Extend 8B to 8H to be able to use vector multiply 8537 // instructions 8538 assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H"); 8539 if (is_signed_subword_type(eltype)) { 8540 __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement); 8541 } else { 8542 __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement); 8543 } 8544 } 8545 8546 switch (load_arrangement) { 8547 case Assembler::T4S: 8548 __ addv(vmul0, load_arrangement, vmul0, vdata0); 8549 break; 8550 case Assembler::T8B: 8551 case Assembler::T8H: 8552 assert(is_subword_type(eltype), "subword type expected"); 8553 if (is_signed_subword_type(eltype)) { 8554 __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H); 8555 } else { 8556 __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H); 8557 } 8558 break; 8559 default: 8560 __ should_not_reach_here(); 8561 } 8562 8563 // Process the upper half of a vector 8564 if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) { 8565 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0); 8566 if (is_signed_subword_type(eltype)) { 8567 __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H); 8568 } else { 8569 __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H); 8570 } 8571 } 8572 8573 __ br(Assembler::HI, SMALL_LOOP); 8574 8575 // SMALL LOOP'S EPILOQUE 8576 __ lsr(rscratch2, cnt, exact_log2(evf)); 8577 __ cbnz(rscratch2, LARGE_LOOP_PREHEADER); 8578 8579 __ mulv(vmul0, Assembler::T4S, vmul0, vpow); 8580 __ addv(vmul0, Assembler::T4S, vmul0); 8581 __ umov(result, vmul0, Assembler::S, 0); 8582 8583 // TAIL 8584 __ bind(TAIL); 8585 8586 // The andr performs cnt % vf. The subtract shifted by 3 offsets past vf - 1 - (cnt % vf) pairs 8587 // of load + madd insns i.e. it only executes cnt % vf load + madd pairs. 8588 assert(is_power_of_2(vf), "can't use this value to calculate the jump target PC"); 8589 __ andr(rscratch2, cnt, vf - 1); 8590 __ bind(TAIL_SHORTCUT); 8591 __ adr(rscratch1, BR_BASE); 8592 // For Cortex-A53 offset is 4 because 2 nops are generated. 8593 __ sub(rscratch1, rscratch1, rscratch2, ext::uxtw, VM_Version::supports_a53mac() ? 4 : 3); 8594 __ movw(rscratch2, 0x1f); 8595 __ br(rscratch1); 8596 8597 for (size_t i = 0; i < vf - 1; ++i) { 8598 __ load(rscratch1, Address(__ post(ary, type2aelembytes(eltype))), 8599 eltype); 8600 __ maddw(result, result, rscratch2, rscratch1); 8601 // maddw generates an extra nop for Cortex-A53 (see maddw definition in macroAssembler). 8602 // Generate 2nd nop to have 4 instructions per iteration. 
8603 if (VM_Version::supports_a53mac()) { 8604 __ nop(); 8605 } 8606 } 8607 __ bind(BR_BASE); 8608 8609 __ leave(); 8610 __ ret(lr); 8611 8612 // LARGE LOOP 8613 __ bind(LARGE_LOOP_PREHEADER); 8614 8615 __ lsr(rscratch2, cnt, exact_log2(evf)); 8616 8617 if (multiply_by_halves) { 8618 // 31^4 - multiplier between lower and upper parts of a register 8619 __ movw(rscratch1, intpow(31U, vf / 2)); 8620 __ mov(vpowm, Assembler::S, 1, rscratch1); 8621 // 31^28 - remainder of the iteraion multiplier, 28 = 32 - 4 8622 __ movw(rscratch1, intpow(31U, evf - vf / 2)); 8623 __ mov(vpowm, Assembler::S, 0, rscratch1); 8624 } else { 8625 // 31^16 8626 __ movw(rscratch1, intpow(31U, evf)); 8627 __ mov(vpowm, Assembler::S, 0, rscratch1); 8628 } 8629 8630 __ mov(vmul3, Assembler::T16B, 0); 8631 __ mov(vmul2, Assembler::T16B, 0); 8632 __ mov(vmul1, Assembler::T16B, 0); 8633 8634 __ bind(LARGE_LOOP); 8635 8636 __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 0); 8637 __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 0); 8638 __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 0); 8639 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0); 8640 8641 __ ld1(vdata3, vdata2, vdata1, vdata0, load_arrangement, 8642 Address(__ post(ary, evf * type2aelembytes(eltype)))); 8643 8644 if (load_arrangement == Assembler::T8B) { 8645 // Extend 8B to 8H to be able to use vector multiply 8646 // instructions 8647 assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H"); 8648 if (is_signed_subword_type(eltype)) { 8649 __ sxtl(vdata3, Assembler::T8H, vdata3, load_arrangement); 8650 __ sxtl(vdata2, Assembler::T8H, vdata2, load_arrangement); 8651 __ sxtl(vdata1, Assembler::T8H, vdata1, load_arrangement); 8652 __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement); 8653 } else { 8654 __ uxtl(vdata3, Assembler::T8H, vdata3, load_arrangement); 8655 __ uxtl(vdata2, Assembler::T8H, vdata2, load_arrangement); 8656 __ uxtl(vdata1, Assembler::T8H, vdata1, load_arrangement); 8657 __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement); 8658 } 8659 } 8660 8661 switch (load_arrangement) { 8662 case Assembler::T4S: 8663 __ addv(vmul3, load_arrangement, vmul3, vdata3); 8664 __ addv(vmul2, load_arrangement, vmul2, vdata2); 8665 __ addv(vmul1, load_arrangement, vmul1, vdata1); 8666 __ addv(vmul0, load_arrangement, vmul0, vdata0); 8667 break; 8668 case Assembler::T8B: 8669 case Assembler::T8H: 8670 assert(is_subword_type(eltype), "subword type expected"); 8671 if (is_signed_subword_type(eltype)) { 8672 __ saddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H); 8673 __ saddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H); 8674 __ saddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H); 8675 __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H); 8676 } else { 8677 __ uaddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H); 8678 __ uaddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H); 8679 __ uaddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H); 8680 __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H); 8681 } 8682 break; 8683 default: 8684 __ should_not_reach_here(); 8685 } 8686 8687 // Process the upper half of a vector 8688 if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) { 8689 __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 1); 8690 __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 1); 8691 __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 1); 8692 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 1); 8693 if (is_signed_subword_type(eltype)) { 8694 __ saddwv2(vmul3, 
vmul3, Assembler::T4S, vdata3, Assembler::T8H); 8695 __ saddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H); 8696 __ saddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H); 8697 __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H); 8698 } else { 8699 __ uaddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H); 8700 __ uaddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H); 8701 __ uaddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H); 8702 __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H); 8703 } 8704 } 8705 8706 __ subsw(rscratch2, rscratch2, 1); 8707 __ br(Assembler::HI, LARGE_LOOP); 8708 8709 __ mulv(vmul3, Assembler::T4S, vmul3, vpow); 8710 __ addv(vmul3, Assembler::T4S, vmul3); 8711 __ umov(result, vmul3, Assembler::S, 0); 8712 8713 __ mov(rscratch2, intpow(31U, vf)); 8714 8715 __ mulv(vmul2, Assembler::T4S, vmul2, vpow); 8716 __ addv(vmul2, Assembler::T4S, vmul2); 8717 __ umov(rscratch1, vmul2, Assembler::S, 0); 8718 __ maddw(result, result, rscratch2, rscratch1); 8719 8720 __ mulv(vmul1, Assembler::T4S, vmul1, vpow); 8721 __ addv(vmul1, Assembler::T4S, vmul1); 8722 __ umov(rscratch1, vmul1, Assembler::S, 0); 8723 __ maddw(result, result, rscratch2, rscratch1); 8724 8725 __ mulv(vmul0, Assembler::T4S, vmul0, vpow); 8726 __ addv(vmul0, Assembler::T4S, vmul0); 8727 __ umov(rscratch1, vmul0, Assembler::S, 0); 8728 __ maddw(result, result, rscratch2, rscratch1); 8729 8730 __ andr(rscratch2, cnt, vf - 1); 8731 __ cbnz(rscratch2, TAIL_SHORTCUT); 8732 8733 __ leave(); 8734 __ ret(lr); 8735 8736 return entry; 8737 } 8738 8739 address generate_dsin_dcos(bool isCos) { 8740 __ align(CodeEntryAlignment); 8741 StubGenStubId stub_id = (isCos ? StubGenStubId::dcos_id : StubGenStubId::dsin_id); 8742 StubCodeMark mark(this, stub_id); 8743 address start = __ pc(); 8744 __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw, 8745 (address)StubRoutines::aarch64::_two_over_pi, 8746 (address)StubRoutines::aarch64::_pio2, 8747 (address)StubRoutines::aarch64::_dsin_coef, 8748 (address)StubRoutines::aarch64::_dcos_coef); 8749 return start; 8750 } 8751 8752 // code for comparing 16 characters of strings with Latin1 and Utf16 encoding 8753 void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1, 8754 Label &DIFF2) { 8755 Register cnt1 = r2, tmp2 = r11, tmp3 = r12; 8756 FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2; 8757 8758 __ ldrq(vtmp, Address(__ post(tmp2, 16))); 8759 __ ldr(tmpU, Address(__ post(cnt1, 8))); 8760 __ zip1(vtmp3, __ T16B, vtmp, vtmpZ); 8761 // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3 8762 8763 __ fmovd(tmpL, vtmp3); 8764 __ eor(rscratch2, tmp3, tmpL); 8765 __ cbnz(rscratch2, DIFF2); 8766 8767 __ ldr(tmp3, Address(__ post(cnt1, 8))); 8768 __ umov(tmpL, vtmp3, __ D, 1); 8769 __ eor(rscratch2, tmpU, tmpL); 8770 __ cbnz(rscratch2, DIFF1); 8771 8772 __ zip2(vtmp, __ T16B, vtmp, vtmpZ); 8773 __ ldr(tmpU, Address(__ post(cnt1, 8))); 8774 __ fmovd(tmpL, vtmp); 8775 __ eor(rscratch2, tmp3, tmpL); 8776 __ cbnz(rscratch2, DIFF2); 8777 8778 __ ldr(tmp3, Address(__ post(cnt1, 8))); 8779 __ umov(tmpL, vtmp, __ D, 1); 8780 __ eor(rscratch2, tmpU, tmpL); 8781 __ cbnz(rscratch2, DIFF1); 8782 } 8783 8784 // r0 = result 8785 // r1 = str1 8786 // r2 = cnt1 8787 // r3 = str2 8788 // r4 = cnt2 8789 // r10 = tmp1 8790 // r11 = tmp2 8791 address generate_compare_long_string_different_encoding(bool isLU) { 8792 __ align(CodeEntryAlignment); 8793 StubGenStubId stub_id = (isLU ? 
StubGenStubId::compare_long_string_LU_id : StubGenStubId::compare_long_string_UL_id); 8794 StubCodeMark mark(this, stub_id); 8795 address entry = __ pc(); 8796 Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2, 8797 DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH, 8798 LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2; 8799 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 8800 tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14; 8801 FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2; 8802 RegSet spilled_regs = RegSet::of(tmp3, tmp4); 8803 8804 int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2); 8805 8806 __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ); 8807 // cnt2 == amount of characters left to compare 8808 // Check already loaded first 4 symbols(vtmp and tmp2(LU)/tmp1(UL)) 8809 __ zip1(vtmp, __ T8B, vtmp, vtmpZ); 8810 __ add(str1, str1, isLU ? wordSize/2 : wordSize); 8811 __ add(str2, str2, isLU ? wordSize : wordSize/2); 8812 __ fmovd(isLU ? tmp1 : tmp2, vtmp); 8813 __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case. 8814 __ eor(rscratch2, tmp1, tmp2); 8815 __ mov(rscratch1, tmp2); 8816 __ cbnz(rscratch2, CALCULATE_DIFFERENCE); 8817 Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison 8818 tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison 8819 __ push(spilled_regs, sp); 8820 __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load 8821 __ mov(cnt1, isLU ? str2 : str1); // init the pointer to U next load 8822 8823 __ ldr(tmp3, Address(__ post(cnt1, 8))); 8824 8825 if (SoftwarePrefetchHintDistance >= 0) { 8826 __ subs(rscratch2, cnt2, prefetchLoopExitCondition); 8827 __ br(__ LT, NO_PREFETCH); 8828 __ bind(LARGE_LOOP_PREFETCH); 8829 __ prfm(Address(tmp2, SoftwarePrefetchHintDistance)); 8830 __ mov(tmp4, 2); 8831 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance)); 8832 __ bind(LARGE_LOOP_PREFETCH_REPEAT1); 8833 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 8834 __ subs(tmp4, tmp4, 1); 8835 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1); 8836 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance)); 8837 __ mov(tmp4, 2); 8838 __ bind(LARGE_LOOP_PREFETCH_REPEAT2); 8839 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 8840 __ subs(tmp4, tmp4, 1); 8841 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2); 8842 __ sub(cnt2, cnt2, 64); 8843 __ subs(rscratch2, cnt2, prefetchLoopExitCondition); 8844 __ br(__ GE, LARGE_LOOP_PREFETCH); 8845 } 8846 __ cbz(cnt2, LOAD_LAST); // no characters left except last load 8847 __ bind(NO_PREFETCH); 8848 __ subs(cnt2, cnt2, 16); 8849 __ br(__ LT, TAIL); 8850 __ align(OptoLoopAlignment); 8851 __ bind(SMALL_LOOP); // smaller loop 8852 __ subs(cnt2, cnt2, 16); 8853 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 8854 __ br(__ GE, SMALL_LOOP); 8855 __ cmn(cnt2, (u1)16); 8856 __ br(__ EQ, LOAD_LAST); 8857 __ bind(TAIL); // 1..15 characters left until last load (last 4 characters) 8858 __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string 8859 __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string 8860 __ ldr(tmp3, Address(cnt1, -8)); 8861 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load 8862 __ b(LOAD_LAST); 8863 __ bind(DIFF2); 8864 __ mov(tmpU, tmp3); 8865 __ bind(DIFF1); 8866 __ pop(spilled_regs, sp); 8867 __ b(CALCULATE_DIFFERENCE); 8868 __ bind(LOAD_LAST); 8869 // Last 4 UTF-16 characters are already pre-loaded into tmp3 by 
compare_string_16_x_LU. 8870 // No need to load it again 8871 __ mov(tmpU, tmp3); 8872 __ pop(spilled_regs, sp); 8873 8874 // tmp2 points to the address of the last 4 Latin1 characters right now 8875 __ ldrs(vtmp, Address(tmp2)); 8876 __ zip1(vtmp, __ T8B, vtmp, vtmpZ); 8877 __ fmovd(tmpL, vtmp); 8878 8879 __ eor(rscratch2, tmpU, tmpL); 8880 __ cbz(rscratch2, DONE); 8881 8882 // Find the first different characters in the longwords and 8883 // compute their difference. 8884 __ bind(CALCULATE_DIFFERENCE); 8885 __ rev(rscratch2, rscratch2); 8886 __ clz(rscratch2, rscratch2); 8887 __ andr(rscratch2, rscratch2, -16); 8888 __ lsrv(tmp1, tmp1, rscratch2); 8889 __ uxthw(tmp1, tmp1); 8890 __ lsrv(rscratch1, rscratch1, rscratch2); 8891 __ uxthw(rscratch1, rscratch1); 8892 __ subw(result, tmp1, rscratch1); 8893 __ bind(DONE); 8894 __ ret(lr); 8895 return entry; 8896 } 8897 8898 // r0 = input (float16) 8899 // v0 = result (float) 8900 // v1 = temporary float register 8901 address generate_float16ToFloat() { 8902 __ align(CodeEntryAlignment); 8903 StubGenStubId stub_id = StubGenStubId::hf2f_id; 8904 StubCodeMark mark(this, stub_id); 8905 address entry = __ pc(); 8906 BLOCK_COMMENT("Entry:"); 8907 __ flt16_to_flt(v0, r0, v1); 8908 __ ret(lr); 8909 return entry; 8910 } 8911 8912 // v0 = input (float) 8913 // r0 = result (float16) 8914 // v1 = temporary float register 8915 address generate_floatToFloat16() { 8916 __ align(CodeEntryAlignment); 8917 StubGenStubId stub_id = StubGenStubId::f2hf_id; 8918 StubCodeMark mark(this, stub_id); 8919 address entry = __ pc(); 8920 BLOCK_COMMENT("Entry:"); 8921 __ flt_to_flt16(r0, v0, v1); 8922 __ ret(lr); 8923 return entry; 8924 } 8925 8926 address generate_method_entry_barrier() { 8927 __ align(CodeEntryAlignment); 8928 StubGenStubId stub_id = StubGenStubId::method_entry_barrier_id; 8929 StubCodeMark mark(this, stub_id); 8930 8931 Label deoptimize_label; 8932 8933 address start = __ pc(); 8934 8935 BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler(); 8936 8937 if (bs_asm->nmethod_patching_type() == NMethodPatchingType::conc_instruction_and_data_patch) { 8938 BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod(); 8939 // We can get here despite the nmethod being good, if we have not 8940 // yet applied our cross modification fence (or data fence). 
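      // Our reading of the sequence below: it publishes the global patching
      // epoch into the thread-local slot next to the disarmed guard value
      // (so this thread is known to have synchronized with that epoch), then
      // issues an isb as the cross-modification fence and a LoadLoad barrier
      // so later loads cannot be satisfied with pre-patch data.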
8941 Address thread_epoch_addr(rthread, in_bytes(bs_nm->thread_disarmed_guard_value_offset()) + 4); 8942 __ lea(rscratch2, ExternalAddress(bs_asm->patching_epoch_addr())); 8943 __ ldrw(rscratch2, rscratch2); 8944 __ strw(rscratch2, thread_epoch_addr); 8945 __ isb(); 8946 __ membar(__ LoadLoad); 8947 } 8948 8949 __ set_last_Java_frame(sp, rfp, lr, rscratch1); 8950 8951 __ enter(); 8952 __ add(rscratch2, sp, wordSize); // rscratch2 points to the saved lr 8953 8954 __ sub(sp, sp, 4 * wordSize); // four words for the returned {sp, fp, lr, pc} 8955 8956 __ push_call_clobbered_registers(); 8957 8958 __ mov(c_rarg0, rscratch2); 8959 __ call_VM_leaf 8960 (CAST_FROM_FN_PTR 8961 (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1); 8962 8963 __ reset_last_Java_frame(true); 8964 8965 __ mov(rscratch1, r0); 8966 8967 __ pop_call_clobbered_registers(); 8968 8969 __ cbnz(rscratch1, deoptimize_label); 8970 8971 __ leave(); 8972 __ ret(lr); 8973 8974 __ BIND(deoptimize_label); 8975 8976 __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize)); 8977 __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize)); 8978 8979 __ mov(sp, rscratch1); 8980 __ br(rscratch2); 8981 8982 return start; 8983 } 8984 8985 // r0 = result 8986 // r1 = str1 8987 // r2 = cnt1 8988 // r3 = str2 8989 // r4 = cnt2 8990 // r10 = tmp1 8991 // r11 = tmp2 8992 address generate_compare_long_string_same_encoding(bool isLL) { 8993 __ align(CodeEntryAlignment); 8994 StubGenStubId stub_id = (isLL ? StubGenStubId::compare_long_string_LL_id : StubGenStubId::compare_long_string_UU_id); 8995 StubCodeMark mark(this, stub_id); 8996 address entry = __ pc(); 8997 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 8998 tmp1 = r10, tmp2 = r11, tmp1h = rscratch1, tmp2h = rscratch2; 8999 9000 Label LARGE_LOOP_PREFETCH, LOOP_COMPARE16, DIFF, LESS16, LESS8, CAL_DIFFERENCE, LENGTH_DIFF; 9001 9002 // exit from large loop when less than 64 bytes left to read or we're about 9003 // to prefetch memory behind array border 9004 int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2); 9005 9006 // before jumping to stub, pre-load 8 bytes already, so do comparison directly 9007 __ eor(rscratch2, tmp1, tmp2); 9008 __ cbnz(rscratch2, CAL_DIFFERENCE); 9009 9010 __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2)); 9011 // update pointers, because of previous read 9012 __ add(str1, str1, wordSize); 9013 __ add(str2, str2, wordSize); 9014 if (SoftwarePrefetchHintDistance >= 0) { 9015 __ align(OptoLoopAlignment); 9016 __ bind(LARGE_LOOP_PREFETCH); 9017 __ prfm(Address(str1, SoftwarePrefetchHintDistance)); 9018 __ prfm(Address(str2, SoftwarePrefetchHintDistance)); 9019 9020 for (int i = 0; i < 4; i++) { 9021 __ ldp(tmp1, tmp1h, Address(str1, i * 16)); 9022 __ ldp(tmp2, tmp2h, Address(str2, i * 16)); 9023 __ cmp(tmp1, tmp2); 9024 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ); 9025 __ br(Assembler::NE, DIFF); 9026 } 9027 __ sub(cnt2, cnt2, isLL ? 64 : 32); 9028 __ add(str1, str1, 64); 9029 __ add(str2, str2, 64); 9030 __ subs(rscratch2, cnt2, largeLoopExitCondition); 9031 __ br(Assembler::GE, LARGE_LOOP_PREFETCH); 9032 __ cbz(cnt2, LENGTH_DIFF); // no more chars left? 9033 } 9034 9035 __ subs(rscratch1, cnt2, isLL ? 
16 : 8); 9036 __ br(Assembler::LE, LESS16); 9037 __ align(OptoLoopAlignment); 9038 __ bind(LOOP_COMPARE16); 9039 __ ldp(tmp1, tmp1h, Address(__ post(str1, 16))); 9040 __ ldp(tmp2, tmp2h, Address(__ post(str2, 16))); 9041 __ cmp(tmp1, tmp2); 9042 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ); 9043 __ br(Assembler::NE, DIFF); 9044 __ sub(cnt2, cnt2, isLL ? 16 : 8); 9045 __ subs(rscratch2, cnt2, isLL ? 16 : 8); 9046 __ br(Assembler::LT, LESS16); 9047 9048 __ ldp(tmp1, tmp1h, Address(__ post(str1, 16))); 9049 __ ldp(tmp2, tmp2h, Address(__ post(str2, 16))); 9050 __ cmp(tmp1, tmp2); 9051 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ); 9052 __ br(Assembler::NE, DIFF); 9053 __ sub(cnt2, cnt2, isLL ? 16 : 8); 9054 __ subs(rscratch2, cnt2, isLL ? 16 : 8); 9055 __ br(Assembler::GE, LOOP_COMPARE16); 9056 __ cbz(cnt2, LENGTH_DIFF); 9057 9058 __ bind(LESS16); 9059 // each 8 compare 9060 __ subs(cnt2, cnt2, isLL ? 8 : 4); 9061 __ br(Assembler::LE, LESS8); 9062 __ ldr(tmp1, Address(__ post(str1, 8))); 9063 __ ldr(tmp2, Address(__ post(str2, 8))); 9064 __ eor(rscratch2, tmp1, tmp2); 9065 __ cbnz(rscratch2, CAL_DIFFERENCE); 9066 __ sub(cnt2, cnt2, isLL ? 8 : 4); 9067 9068 __ bind(LESS8); // directly load last 8 bytes 9069 if (!isLL) { 9070 __ add(cnt2, cnt2, cnt2); 9071 } 9072 __ ldr(tmp1, Address(str1, cnt2)); 9073 __ ldr(tmp2, Address(str2, cnt2)); 9074 __ eor(rscratch2, tmp1, tmp2); 9075 __ cbz(rscratch2, LENGTH_DIFF); 9076 __ b(CAL_DIFFERENCE); 9077 9078 __ bind(DIFF); 9079 __ cmp(tmp1, tmp2); 9080 __ csel(tmp1, tmp1, tmp1h, Assembler::NE); 9081 __ csel(tmp2, tmp2, tmp2h, Assembler::NE); 9082 // reuse rscratch2 register for the result of eor instruction 9083 __ eor(rscratch2, tmp1, tmp2); 9084 9085 __ bind(CAL_DIFFERENCE); 9086 __ rev(rscratch2, rscratch2); 9087 __ clz(rscratch2, rscratch2); 9088 __ andr(rscratch2, rscratch2, isLL ? -8 : -16); 9089 __ lsrv(tmp1, tmp1, rscratch2); 9090 __ lsrv(tmp2, tmp2, rscratch2); 9091 if (isLL) { 9092 __ uxtbw(tmp1, tmp1); 9093 __ uxtbw(tmp2, tmp2); 9094 } else { 9095 __ uxthw(tmp1, tmp1); 9096 __ uxthw(tmp2, tmp2); 9097 } 9098 __ subw(result, tmp1, tmp2); 9099 9100 __ bind(LENGTH_DIFF); 9101 __ ret(lr); 9102 return entry; 9103 } 9104 9105 enum string_compare_mode { 9106 LL, 9107 LU, 9108 UL, 9109 UU, 9110 }; 9111 9112 // The following registers are declared in aarch64.ad 9113 // r0 = result 9114 // r1 = str1 9115 // r2 = cnt1 9116 // r3 = str2 9117 // r4 = cnt2 9118 // r10 = tmp1 9119 // r11 = tmp2 9120 // z0 = ztmp1 9121 // z1 = ztmp2 9122 // p0 = pgtmp1 9123 // p1 = pgtmp2 9124 address generate_compare_long_string_sve(string_compare_mode mode) { 9125 StubGenStubId stub_id; 9126 switch (mode) { 9127 case LL: stub_id = StubGenStubId::compare_long_string_LL_id; break; 9128 case LU: stub_id = StubGenStubId::compare_long_string_LU_id; break; 9129 case UL: stub_id = StubGenStubId::compare_long_string_UL_id; break; 9130 case UU: stub_id = StubGenStubId::compare_long_string_UU_id; break; 9131 default: ShouldNotReachHere(); 9132 } 9133 9134 __ align(CodeEntryAlignment); 9135 address entry = __ pc(); 9136 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 9137 tmp1 = r10, tmp2 = r11; 9138 9139 Label LOOP, DONE, MISMATCH; 9140 Register vec_len = tmp1; 9141 Register idx = tmp2; 9142 // The minimum of the string lengths has been stored in cnt2. 
9143 Register cnt = cnt2; 9144 FloatRegister ztmp1 = z0, ztmp2 = z1; 9145 PRegister pgtmp1 = p0, pgtmp2 = p1; 9146 9147 #define LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx) \ 9148 switch (mode) { \ 9149 case LL: \ 9150 __ sve_ld1b(ztmp1, __ B, pgtmp1, Address(str1, idx)); \ 9151 __ sve_ld1b(ztmp2, __ B, pgtmp1, Address(str2, idx)); \ 9152 break; \ 9153 case LU: \ 9154 __ sve_ld1b(ztmp1, __ H, pgtmp1, Address(str1, idx)); \ 9155 __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \ 9156 break; \ 9157 case UL: \ 9158 __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \ 9159 __ sve_ld1b(ztmp2, __ H, pgtmp1, Address(str2, idx)); \ 9160 break; \ 9161 case UU: \ 9162 __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \ 9163 __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \ 9164 break; \ 9165 default: \ 9166 ShouldNotReachHere(); \ 9167 } 9168 9169 StubCodeMark mark(this, stub_id); 9170 9171 __ mov(idx, 0); 9172 __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt); 9173 9174 if (mode == LL) { 9175 __ sve_cntb(vec_len); 9176 } else { 9177 __ sve_cnth(vec_len); 9178 } 9179 9180 __ sub(rscratch1, cnt, vec_len); 9181 9182 __ bind(LOOP); 9183 9184 // main loop 9185 LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx); 9186 __ add(idx, idx, vec_len); 9187 // Compare strings. 9188 __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2); 9189 __ br(__ NE, MISMATCH); 9190 __ cmp(idx, rscratch1); 9191 __ br(__ LT, LOOP); 9192 9193 // post loop, last iteration 9194 __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt); 9195 9196 LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx); 9197 __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2); 9198 __ br(__ EQ, DONE); 9199 9200 __ bind(MISMATCH); 9201 9202 // Crop the vector to find its location. 9203 __ sve_brkb(pgtmp2, pgtmp1, pgtmp2, false /* isMerge */); 9204 // Extract the first different characters of each string. 9205 __ sve_lasta(rscratch1, mode == LL ? __ B : __ H, pgtmp2, ztmp1); 9206 __ sve_lasta(rscratch2, mode == LL ? __ B : __ H, pgtmp2, ztmp2); 9207 9208 // Compute the difference of the first different characters. 
9209 __ sub(result, rscratch1, rscratch2); 9210 9211 __ bind(DONE); 9212 __ ret(lr); 9213 #undef LOAD_PAIR 9214 return entry; 9215 } 9216 9217 void generate_compare_long_strings() { 9218 if (UseSVE == 0) { 9219 StubRoutines::aarch64::_compare_long_string_LL 9220 = generate_compare_long_string_same_encoding(true); 9221 StubRoutines::aarch64::_compare_long_string_UU 9222 = generate_compare_long_string_same_encoding(false); 9223 StubRoutines::aarch64::_compare_long_string_LU 9224 = generate_compare_long_string_different_encoding(true); 9225 StubRoutines::aarch64::_compare_long_string_UL 9226 = generate_compare_long_string_different_encoding(false); 9227 } else { 9228 StubRoutines::aarch64::_compare_long_string_LL 9229 = generate_compare_long_string_sve(LL); 9230 StubRoutines::aarch64::_compare_long_string_UU 9231 = generate_compare_long_string_sve(UU); 9232 StubRoutines::aarch64::_compare_long_string_LU 9233 = generate_compare_long_string_sve(LU); 9234 StubRoutines::aarch64::_compare_long_string_UL 9235 = generate_compare_long_string_sve(UL); 9236 } 9237 } 9238 9239 // R0 = result 9240 // R1 = str2 9241 // R2 = cnt1 9242 // R3 = str1 9243 // R4 = cnt2 9244 // Clobbers: rscratch1, rscratch2, v0, v1, rflags 9245 // 9246 // This generic linear code use few additional ideas, which makes it faster: 9247 // 1) we can safely keep at least 1st register of pattern(since length >= 8) 9248 // in order to skip initial loading(help in systems with 1 ld pipeline) 9249 // 2) we can use "fast" algorithm of finding single character to search for 9250 // first symbol with less branches(1 branch per each loaded register instead 9251 // of branch for each symbol), so, this is where constants like 9252 // 0x0101...01, 0x00010001...0001, 0x7f7f...7f, 0x7fff7fff...7fff comes from 9253 // 3) after loading and analyzing 1st register of source string, it can be 9254 // used to search for every 1st character entry, saving few loads in 9255 // comparison with "simplier-but-slower" implementation 9256 // 4) in order to avoid lots of push/pop operations, code below is heavily 9257 // re-using/re-initializing/compressing register values, which makes code 9258 // larger and a bit less readable, however, most of extra operations are 9259 // issued during loads or branches, so, penalty is minimal 9260 address generate_string_indexof_linear(bool str1_isL, bool str2_isL) { 9261 StubGenStubId stub_id; 9262 if (str1_isL) { 9263 if (str2_isL) { 9264 stub_id = StubGenStubId::string_indexof_linear_ll_id; 9265 } else { 9266 stub_id = StubGenStubId::string_indexof_linear_ul_id; 9267 } 9268 } else { 9269 if (str2_isL) { 9270 ShouldNotReachHere(); 9271 } else { 9272 stub_id = StubGenStubId::string_indexof_linear_uu_id; 9273 } 9274 } 9275 __ align(CodeEntryAlignment); 9276 StubCodeMark mark(this, stub_id); 9277 address entry = __ pc(); 9278 9279 int str1_chr_size = str1_isL ? 1 : 2; 9280 int str2_chr_size = str2_isL ? 1 : 2; 9281 int str1_chr_shift = str1_isL ? 0 : 1; 9282 int str2_chr_shift = str2_isL ? 
0 : 1; 9283 bool isL = str1_isL && str2_isL; 9284 // parameters 9285 Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4; 9286 // temporary registers 9287 Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23; 9288 RegSet spilled_regs = RegSet::range(tmp1, tmp4); 9289 // redefinitions 9290 Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3; 9291 9292 __ push(spilled_regs, sp); 9293 Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO, 9294 L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED, 9295 L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP, 9296 L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH, 9297 L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2, 9298 L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH; 9299 // Read whole register from str1. It is safe, because length >=8 here 9300 __ ldr(ch1, Address(str1)); 9301 // Read whole register from str2. It is safe, because length >=8 here 9302 __ ldr(ch2, Address(str2)); 9303 __ sub(cnt2, cnt2, cnt1); 9304 __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF); 9305 if (str1_isL != str2_isL) { 9306 __ eor(v0, __ T16B, v0, v0); 9307 } 9308 __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001); 9309 __ mul(first, first, tmp1); 9310 // check if we have less than 1 register to check 9311 __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1); 9312 if (str1_isL != str2_isL) { 9313 __ fmovd(v1, ch1); 9314 } 9315 __ br(__ LE, L_SMALL); 9316 __ eor(ch2, first, ch2); 9317 if (str1_isL != str2_isL) { 9318 __ zip1(v1, __ T16B, v1, v0); 9319 } 9320 __ sub(tmp2, ch2, tmp1); 9321 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 9322 __ bics(tmp2, tmp2, ch2); 9323 if (str1_isL != str2_isL) { 9324 __ fmovd(ch1, v1); 9325 } 9326 __ br(__ NE, L_HAS_ZERO); 9327 __ subs(cnt2, cnt2, wordSize/str2_chr_size); 9328 __ add(result, result, wordSize/str2_chr_size); 9329 __ add(str2, str2, wordSize); 9330 __ br(__ LT, L_POST_LOOP); 9331 __ BIND(L_LOOP); 9332 __ ldr(ch2, Address(str2)); 9333 __ eor(ch2, first, ch2); 9334 __ sub(tmp2, ch2, tmp1); 9335 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 9336 __ bics(tmp2, tmp2, ch2); 9337 __ br(__ NE, L_HAS_ZERO); 9338 __ BIND(L_LOOP_PROCEED); 9339 __ subs(cnt2, cnt2, wordSize/str2_chr_size); 9340 __ add(str2, str2, wordSize); 9341 __ add(result, result, wordSize/str2_chr_size); 9342 __ br(__ GE, L_LOOP); 9343 __ BIND(L_POST_LOOP); 9344 __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check 9345 __ br(__ LE, NOMATCH); 9346 __ ldr(ch2, Address(str2)); 9347 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift); 9348 __ eor(ch2, first, ch2); 9349 __ sub(tmp2, ch2, tmp1); 9350 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 9351 __ mov(tmp4, -1); // all bits set 9352 __ b(L_SMALL_PROCEED); 9353 __ align(OptoLoopAlignment); 9354 __ BIND(L_SMALL); 9355 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift); 9356 __ eor(ch2, first, ch2); 9357 if (str1_isL != str2_isL) { 9358 __ zip1(v1, __ T16B, v1, v0); 9359 } 9360 __ sub(tmp2, ch2, tmp1); 9361 __ mov(tmp4, -1); // all bits set 9362 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 9363 if (str1_isL != str2_isL) { 9364 __ fmovd(ch1, v1); // move converted 4 symbols 9365 } 9366 __ BIND(L_SMALL_PROCEED); 9367 __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits. 
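    // The sub/orr above and the bic/ands below are the usual SWAR trick for
    // locating candidate occurrences of the first pattern symbol: after
    // eor-ing str2 with the replicated first character, a byte (halfword for
    // UTF-16) lane is zero exactly at a match, and
    //   (x - 0x0101..01) & ~(x | 0x7f7f..7f)
    // sets the top bit of the lowest such lane (higher lanes can be flagged
    // spuriously by the borrow, which is why every candidate is re-checked
    // by the compare loops). Illustrative C sketch for the Latin1 case
    // (names are ours, not from this file):
    //
    //   uint64_t candidate_bits(uint64_t chunk, uint8_t c) {
    //     uint64_t x = chunk ^ (0x0101010101010101ULL * c);
    //     return (x - 0x0101010101010101ULL) & ~x & 0x8080808080808080ULL;
    //   }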
9368 __ bic(tmp2, tmp2, ch2); 9369 __ ands(tmp2, tmp2, tmp4); // clear useless bits and check 9370 __ rbit(tmp2, tmp2); 9371 __ br(__ EQ, NOMATCH); 9372 __ BIND(L_SMALL_HAS_ZERO_LOOP); 9373 __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some cpu's 9374 __ cmp(cnt1, u1(wordSize/str2_chr_size)); 9375 __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2); 9376 if (str2_isL) { // LL 9377 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index" 9378 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe. 9379 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info 9380 __ add(result, result, tmp4, __ LSR, LogBitsPerByte); 9381 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 9382 } else { 9383 __ mov(ch2, 0xE); // all bits in byte set except last one 9384 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 9385 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 9386 __ lslv(tmp2, tmp2, tmp4); 9387 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 9388 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 9389 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 9390 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 9391 } 9392 __ cmp(ch1, ch2); 9393 __ mov(tmp4, wordSize/str2_chr_size); 9394 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 9395 __ BIND(L_SMALL_CMP_LOOP); 9396 str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift))) 9397 : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift))); 9398 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))) 9399 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))); 9400 __ add(tmp4, tmp4, 1); 9401 __ cmp(tmp4, cnt1); 9402 __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP); 9403 __ cmp(first, ch2); 9404 __ br(__ EQ, L_SMALL_CMP_LOOP); 9405 __ BIND(L_SMALL_CMP_LOOP_NOMATCH); 9406 __ cbz(tmp2, NOMATCH); // no more matches. exit 9407 __ clz(tmp4, tmp2); 9408 __ add(result, result, 1); // advance index 9409 __ add(str2, str2, str2_chr_size); // advance pointer 9410 __ b(L_SMALL_HAS_ZERO_LOOP); 9411 __ align(OptoLoopAlignment); 9412 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP); 9413 __ cmp(first, ch2); 9414 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 9415 __ b(DONE); 9416 __ align(OptoLoopAlignment); 9417 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2); 9418 if (str2_isL) { // LL 9419 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index" 9420 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe. 9421 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info 9422 __ add(result, result, tmp4, __ LSR, LogBitsPerByte); 9423 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 9424 } else { 9425 __ mov(ch2, 0xE); // all bits in byte set except last one 9426 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 9427 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 9428 __ lslv(tmp2, tmp2, tmp4); 9429 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 9430 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 9431 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 9432 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 9433 } 9434 __ cmp(ch1, ch2); 9435 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 9436 __ b(DONE); 9437 __ align(OptoLoopAlignment); 9438 __ BIND(L_HAS_ZERO); 9439 __ rbit(tmp2, tmp2); 9440 __ clz(tmp4, tmp2); // potentially long. 
Up to 4 cycles on some CPU's 9441 // Now, perform compression of counters(cnt2 and cnt1) into one register. 9442 // It's fine because both counters are 32bit and are not changed in this 9443 // loop. Just restore it on exit. So, cnt1 can be re-used in this loop. 9444 __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2); 9445 __ sub(result, result, 1); 9446 __ BIND(L_HAS_ZERO_LOOP); 9447 __ mov(cnt1, wordSize/str2_chr_size); 9448 __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2); 9449 __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare 9450 if (str2_isL) { 9451 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index 9452 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 9453 __ lslv(tmp2, tmp2, tmp4); 9454 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 9455 __ add(tmp4, tmp4, 1); 9456 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 9457 __ lsl(tmp2, tmp2, 1); 9458 __ mov(tmp4, wordSize/str2_chr_size); 9459 } else { 9460 __ mov(ch2, 0xE); 9461 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 9462 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 9463 __ lslv(tmp2, tmp2, tmp4); 9464 __ add(tmp4, tmp4, 1); 9465 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 9466 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); 9467 __ lsl(tmp2, tmp2, 1); 9468 __ mov(tmp4, wordSize/str2_chr_size); 9469 __ sub(str2, str2, str2_chr_size); 9470 } 9471 __ cmp(ch1, ch2); 9472 __ mov(tmp4, wordSize/str2_chr_size); 9473 __ br(__ NE, L_CMP_LOOP_NOMATCH); 9474 __ BIND(L_CMP_LOOP); 9475 str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift))) 9476 : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift))); 9477 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))) 9478 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))); 9479 __ add(tmp4, tmp4, 1); 9480 __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2); 9481 __ br(__ GE, L_CMP_LOOP_LAST_CMP); 9482 __ cmp(cnt1, ch2); 9483 __ br(__ EQ, L_CMP_LOOP); 9484 __ BIND(L_CMP_LOOP_NOMATCH); 9485 // here we're not matched 9486 __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop 9487 __ clz(tmp4, tmp2); 9488 __ add(str2, str2, str2_chr_size); // advance pointer 9489 __ b(L_HAS_ZERO_LOOP); 9490 __ align(OptoLoopAlignment); 9491 __ BIND(L_CMP_LOOP_LAST_CMP); 9492 __ cmp(cnt1, ch2); 9493 __ br(__ NE, L_CMP_LOOP_NOMATCH); 9494 __ b(DONE); 9495 __ align(OptoLoopAlignment); 9496 __ BIND(L_CMP_LOOP_LAST_CMP2); 9497 if (str2_isL) { 9498 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index 9499 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 9500 __ lslv(tmp2, tmp2, tmp4); 9501 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 9502 __ add(tmp4, tmp4, 1); 9503 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 9504 __ lsl(tmp2, tmp2, 1); 9505 } else { 9506 __ mov(ch2, 0xE); 9507 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 9508 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 
9509 __ lslv(tmp2, tmp2, tmp4); 9510 __ add(tmp4, tmp4, 1); 9511 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 9512 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); 9513 __ lsl(tmp2, tmp2, 1); 9514 __ sub(str2, str2, str2_chr_size); 9515 } 9516 __ cmp(ch1, ch2); 9517 __ br(__ NE, L_CMP_LOOP_NOMATCH); 9518 __ b(DONE); 9519 __ align(OptoLoopAlignment); 9520 __ BIND(L_HAS_ZERO_LOOP_NOMATCH); 9521 // 1) Restore "result" index. Index was wordSize/str2_chr_size * N until 9522 // L_HAS_ZERO block. Byte octet was analyzed in L_HAS_ZERO_LOOP, 9523 // so, result was increased at max by wordSize/str2_chr_size - 1, so, 9524 // respective high bit wasn't changed. L_LOOP_PROCEED will increase 9525 // result by analyzed characters value, so, we can just reset lower bits 9526 // in result here. Clear 2 lower bits for UU/UL and 3 bits for LL 9527 // 2) restore cnt1 and cnt2 values from "compressed" cnt2 9528 // 3) advance str2 value to represent next str2 octet. result & 7/3 is 9529 // index of last analyzed substring inside current octet. So, str2 in at 9530 // respective start address. We need to advance it to next octet 9531 __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed 9532 __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2); 9533 __ bfm(result, zr, 0, 2 - str2_chr_shift); 9534 __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2 9535 __ movw(cnt2, cnt2); 9536 __ b(L_LOOP_PROCEED); 9537 __ align(OptoLoopAlignment); 9538 __ BIND(NOMATCH); 9539 __ mov(result, -1); 9540 __ BIND(DONE); 9541 __ pop(spilled_regs, sp); 9542 __ ret(lr); 9543 return entry; 9544 } 9545 9546 void generate_string_indexof_stubs() { 9547 StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true); 9548 StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false); 9549 StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false); 9550 } 9551 9552 void inflate_and_store_2_fp_registers(bool generatePrfm, 9553 FloatRegister src1, FloatRegister src2) { 9554 Register dst = r1; 9555 __ zip1(v1, __ T16B, src1, v0); 9556 __ zip2(v2, __ T16B, src1, v0); 9557 if (generatePrfm) { 9558 __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM); 9559 } 9560 __ zip1(v3, __ T16B, src2, v0); 9561 __ zip2(v4, __ T16B, src2, v0); 9562 __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64))); 9563 } 9564 9565 // R0 = src 9566 // R1 = dst 9567 // R2 = len 9568 // R3 = len >> 3 9569 // V0 = 0 9570 // v1 = loaded 8 bytes 9571 // Clobbers: r0, r1, r3, rscratch1, rflags, v0-v6 9572 address generate_large_byte_array_inflate() { 9573 __ align(CodeEntryAlignment); 9574 StubGenStubId stub_id = StubGenStubId::large_byte_array_inflate_id; 9575 StubCodeMark mark(this, stub_id); 9576 address entry = __ pc(); 9577 Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE; 9578 Register src = r0, dst = r1, len = r2, octetCounter = r3; 9579 const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4; 9580 9581 // do one more 8-byte read to have address 16-byte aligned in most cases 9582 // also use single store instruction 9583 __ ldrd(v2, __ post(src, 8)); 9584 __ sub(octetCounter, octetCounter, 2); 9585 __ zip1(v1, __ T16B, v1, v0); 9586 __ zip1(v2, __ T16B, v2, v0); 9587 __ st1(v1, v2, __ T16B, __ post(dst, 32)); 9588 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 9589 __ subs(rscratch1, octetCounter, large_loop_threshold); 9590 __ br(__ LE, LOOP_START); 9591 __ 
b(LOOP_PRFM_START); 9592 __ bind(LOOP_PRFM); 9593 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 9594 __ bind(LOOP_PRFM_START); 9595 __ prfm(Address(src, SoftwarePrefetchHintDistance)); 9596 __ sub(octetCounter, octetCounter, 8); 9597 __ subs(rscratch1, octetCounter, large_loop_threshold); 9598 inflate_and_store_2_fp_registers(true, v3, v4); 9599 inflate_and_store_2_fp_registers(true, v5, v6); 9600 __ br(__ GT, LOOP_PRFM); 9601 __ cmp(octetCounter, (u1)8); 9602 __ br(__ LT, DONE); 9603 __ bind(LOOP); 9604 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 9605 __ bind(LOOP_START); 9606 __ sub(octetCounter, octetCounter, 8); 9607 __ cmp(octetCounter, (u1)8); 9608 inflate_and_store_2_fp_registers(false, v3, v4); 9609 inflate_and_store_2_fp_registers(false, v5, v6); 9610 __ br(__ GE, LOOP); 9611 __ bind(DONE); 9612 __ ret(lr); 9613 return entry; 9614 } 9615 9616 /** 9617 * Arguments: 9618 * 9619 * Input: 9620 * c_rarg0 - current state address 9621 * c_rarg1 - H key address 9622 * c_rarg2 - data address 9623 * c_rarg3 - number of blocks 9624 * 9625 * Output: 9626 * Updated state at c_rarg0 9627 */ 9628 address generate_ghash_processBlocks() { 9629 // Bafflingly, GCM uses little-endian for the byte order, but 9630 // big-endian for the bit order. For example, the polynomial 1 is 9631 // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00. 9632 // 9633 // So, we must either reverse the bytes in each word and do 9634 // everything big-endian or reverse the bits in each byte and do 9635 // it little-endian. On AArch64 it's more idiomatic to reverse 9636 // the bits in each byte (we have an instruction, RBIT, to do 9637 // that) and keep the data in little-endian bit order through the 9638 // calculation, bit-reversing the inputs and outputs. 9639 9640 StubGenStubId stub_id = StubGenStubId::ghash_processBlocks_id; 9641 StubCodeMark mark(this, stub_id); 9642 __ align(wordSize * 2); 9643 address p = __ pc(); 9644 __ emit_int64(0x87); // The low-order bits of the field 9645 // polynomial (i.e. 
p = z^7+z^2+z+1) 9646 // repeated in the low and high parts of a 9647 // 128-bit vector 9648 __ emit_int64(0x87); 9649 9650 __ align(CodeEntryAlignment); 9651 address start = __ pc(); 9652 9653 Register state = c_rarg0; 9654 Register subkeyH = c_rarg1; 9655 Register data = c_rarg2; 9656 Register blocks = c_rarg3; 9657 9658 FloatRegister vzr = v30; 9659 __ eor(vzr, __ T16B, vzr, vzr); // zero register 9660 9661 __ ldrq(v24, p); // The field polynomial 9662 9663 __ ldrq(v0, Address(state)); 9664 __ ldrq(v1, Address(subkeyH)); 9665 9666 __ rev64(v0, __ T16B, v0); // Bit-reverse words in state and subkeyH 9667 __ rbit(v0, __ T16B, v0); 9668 __ rev64(v1, __ T16B, v1); 9669 __ rbit(v1, __ T16B, v1); 9670 9671 __ ext(v4, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1 9672 __ eor(v4, __ T16B, v4, v1); // xor subkeyH into subkeyL (Karatsuba: (A1+A0)) 9673 9674 { 9675 Label L_ghash_loop; 9676 __ bind(L_ghash_loop); 9677 9678 __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit 9679 // reversing each byte 9680 __ rbit(v2, __ T16B, v2); 9681 __ eor(v2, __ T16B, v0, v2); // bit-swapped data ^ bit-swapped state 9682 9683 // Multiply state in v2 by subkey in v1 9684 __ ghash_multiply(/*result_lo*/v5, /*result_hi*/v7, 9685 /*a*/v1, /*b*/v2, /*a1_xor_a0*/v4, 9686 /*temps*/v6, v3, /*reuse/clobber b*/v2); 9687 // Reduce v7:v5 by the field polynomial 9688 __ ghash_reduce(/*result*/v0, /*lo*/v5, /*hi*/v7, /*p*/v24, vzr, /*temp*/v3); 9689 9690 __ sub(blocks, blocks, 1); 9691 __ cbnz(blocks, L_ghash_loop); 9692 } 9693 9694 // The bit-reversed result is at this point in v0 9695 __ rev64(v0, __ T16B, v0); 9696 __ rbit(v0, __ T16B, v0); 9697 9698 __ st1(v0, __ T16B, state); 9699 __ ret(lr); 9700 9701 return start; 9702 } 9703 9704 address generate_ghash_processBlocks_wide() { 9705 address small = generate_ghash_processBlocks(); 9706 9707 StubGenStubId stub_id = StubGenStubId::ghash_processBlocks_wide_id; 9708 StubCodeMark mark(this, stub_id); 9709 __ align(wordSize * 2); 9710 address p = __ pc(); 9711 __ emit_int64(0x87); // The low-order bits of the field 9712 // polynomial (i.e. p = z^7+z^2+z+1) 9713 // repeated in the low and high parts of a 9714 // 128-bit vector 9715 __ emit_int64(0x87); 9716 9717 __ align(CodeEntryAlignment); 9718 address start = __ pc(); 9719 9720 Register state = c_rarg0; 9721 Register subkeyH = c_rarg1; 9722 Register data = c_rarg2; 9723 Register blocks = c_rarg3; 9724 9725 const int unroll = 4; 9726 9727 __ cmp(blocks, (unsigned char)(unroll * 2)); 9728 __ br(__ LT, small); 9729 9730 if (unroll > 1) { 9731 // Save state before entering routine 9732 __ sub(sp, sp, 4 * 16); 9733 __ st1(v12, v13, v14, v15, __ T16B, Address(sp)); 9734 __ sub(sp, sp, 4 * 16); 9735 __ st1(v8, v9, v10, v11, __ T16B, Address(sp)); 9736 } 9737 9738 __ ghash_processBlocks_wide(p, state, subkeyH, data, blocks, unroll); 9739 9740 if (unroll > 1) { 9741 // And restore state 9742 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16)); 9743 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16)); 9744 } 9745 9746 __ cmp(blocks, (unsigned char)0); 9747 __ br(__ GT, small); 9748 9749 __ ret(lr); 9750 9751 return start; 9752 } 9753 9754 void generate_base64_encode_simdround(Register src, Register dst, 9755 FloatRegister codec, u8 size) { 9756 9757 FloatRegister in0 = v4, in1 = v5, in2 = v6; 9758 FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19; 9759 FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23; 9760 9761 Assembler::SIMD_Arrangement arrangement = size == 16 ? 
__ T16B : __ T8B; 9762 9763 __ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size)); 9764 9765 __ ushr(ind0, arrangement, in0, 2); 9766 9767 __ ushr(ind1, arrangement, in1, 2); 9768 __ shl(in0, arrangement, in0, 6); 9769 __ orr(ind1, arrangement, ind1, in0); 9770 __ ushr(ind1, arrangement, ind1, 2); 9771 9772 __ ushr(ind2, arrangement, in2, 4); 9773 __ shl(in1, arrangement, in1, 4); 9774 __ orr(ind2, arrangement, in1, ind2); 9775 __ ushr(ind2, arrangement, ind2, 2); 9776 9777 __ shl(ind3, arrangement, in2, 2); 9778 __ ushr(ind3, arrangement, ind3, 2); 9779 9780 __ tbl(out0, arrangement, codec, 4, ind0); 9781 __ tbl(out1, arrangement, codec, 4, ind1); 9782 __ tbl(out2, arrangement, codec, 4, ind2); 9783 __ tbl(out3, arrangement, codec, 4, ind3); 9784 9785 __ st4(out0, out1, out2, out3, arrangement, __ post(dst, 4 * size)); 9786 } 9787 9788 /** 9789 * Arguments: 9790 * 9791 * Input: 9792 * c_rarg0 - src_start 9793 * c_rarg1 - src_offset 9794 * c_rarg2 - src_length 9795 * c_rarg3 - dest_start 9796 * c_rarg4 - dest_offset 9797 * c_rarg5 - isURL 9798 * 9799 */ 9800 address generate_base64_encodeBlock() { 9801 9802 static const char toBase64[64] = { 9803 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 9804 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 9805 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 9806 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 9807 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/' 9808 }; 9809 9810 static const char toBase64URL[64] = { 9811 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 9812 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 9813 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 9814 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 9815 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_' 9816 }; 9817 9818 __ align(CodeEntryAlignment); 9819 StubGenStubId stub_id = StubGenStubId::base64_encodeBlock_id; 9820 StubCodeMark mark(this, stub_id); 9821 address start = __ pc(); 9822 9823 Register src = c_rarg0; // source array 9824 Register soff = c_rarg1; // source start offset 9825 Register send = c_rarg2; // source end offset 9826 Register dst = c_rarg3; // dest array 9827 Register doff = c_rarg4; // position for writing to dest array 9828 Register isURL = c_rarg5; // Base64 or URL character set 9829 9830 // c_rarg6 and c_rarg7 are free to use as temps 9831 Register codec = c_rarg6; 9832 Register length = c_rarg7; 9833 9834 Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit; 9835 9836 __ add(src, src, soff); 9837 __ add(dst, dst, doff); 9838 __ sub(length, send, soff); 9839 9840 // load the codec base address 9841 __ lea(codec, ExternalAddress((address) toBase64)); 9842 __ cbz(isURL, ProcessData); 9843 __ lea(codec, ExternalAddress((address) toBase64URL)); 9844 9845 __ BIND(ProcessData); 9846 9847 // too short to formup a SIMD loop, roll back 9848 __ cmp(length, (u1)24); 9849 __ br(Assembler::LT, Process3B); 9850 9851 __ ld1(v0, v1, v2, v3, __ T16B, Address(codec)); 9852 9853 __ BIND(Process48B); 9854 __ cmp(length, (u1)48); 9855 __ br(Assembler::LT, Process24B); 9856 generate_base64_encode_simdround(src, dst, v0, 16); 9857 __ sub(length, length, 48); 9858 __ b(Process48B); 9859 9860 __ BIND(Process24B); 9861 __ cmp(length, (u1)24); 9862 __ br(Assembler::LT, SIMDExit); 9863 generate_base64_encode_simdround(src, dst, v0, 8); 9864 __ sub(length, length, 24); 9865 9866 __ BIND(SIMDExit); 9867 
__ cbz(length, Exit); 9868 9869 __ BIND(Process3B); 9870 // 3 src bytes, 24 bits 9871 __ ldrb(r10, __ post(src, 1)); 9872 __ ldrb(r11, __ post(src, 1)); 9873 __ ldrb(r12, __ post(src, 1)); 9874 __ orrw(r11, r11, r10, Assembler::LSL, 8); 9875 __ orrw(r12, r12, r11, Assembler::LSL, 8); 9876 // codec index 9877 __ ubfmw(r15, r12, 18, 23); 9878 __ ubfmw(r14, r12, 12, 17); 9879 __ ubfmw(r13, r12, 6, 11); 9880 __ andw(r12, r12, 63); 9881 // get the code based on the codec 9882 __ ldrb(r15, Address(codec, r15, Address::uxtw(0))); 9883 __ ldrb(r14, Address(codec, r14, Address::uxtw(0))); 9884 __ ldrb(r13, Address(codec, r13, Address::uxtw(0))); 9885 __ ldrb(r12, Address(codec, r12, Address::uxtw(0))); 9886 __ strb(r15, __ post(dst, 1)); 9887 __ strb(r14, __ post(dst, 1)); 9888 __ strb(r13, __ post(dst, 1)); 9889 __ strb(r12, __ post(dst, 1)); 9890 __ sub(length, length, 3); 9891 __ cbnz(length, Process3B); 9892 9893 __ BIND(Exit); 9894 __ ret(lr); 9895 9896 return start; 9897 } 9898 9899 void generate_base64_decode_simdround(Register src, Register dst, 9900 FloatRegister codecL, FloatRegister codecH, int size, Label& Exit) { 9901 9902 FloatRegister in0 = v16, in1 = v17, in2 = v18, in3 = v19; 9903 FloatRegister out0 = v20, out1 = v21, out2 = v22; 9904 9905 FloatRegister decL0 = v23, decL1 = v24, decL2 = v25, decL3 = v26; 9906 FloatRegister decH0 = v28, decH1 = v29, decH2 = v30, decH3 = v31; 9907 9908 Label NoIllegalData, ErrorInLowerHalf, StoreLegalData; 9909 9910 Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B; 9911 9912 __ ld4(in0, in1, in2, in3, arrangement, __ post(src, 4 * size)); 9913 9914 // we need unsigned saturating subtract, to make sure all input values 9915 // in range [0, 63] will have 0U value in the higher half lookup 9916 __ uqsubv(decH0, __ T16B, in0, v27); 9917 __ uqsubv(decH1, __ T16B, in1, v27); 9918 __ uqsubv(decH2, __ T16B, in2, v27); 9919 __ uqsubv(decH3, __ T16B, in3, v27); 9920 9921 // lower half lookup 9922 __ tbl(decL0, arrangement, codecL, 4, in0); 9923 __ tbl(decL1, arrangement, codecL, 4, in1); 9924 __ tbl(decL2, arrangement, codecL, 4, in2); 9925 __ tbl(decL3, arrangement, codecL, 4, in3); 9926 9927 // higher half lookup 9928 __ tbx(decH0, arrangement, codecH, 4, decH0); 9929 __ tbx(decH1, arrangement, codecH, 4, decH1); 9930 __ tbx(decH2, arrangement, codecH, 4, decH2); 9931 __ tbx(decH3, arrangement, codecH, 4, decH3); 9932 9933 // combine lower and higher 9934 __ orr(decL0, arrangement, decL0, decH0); 9935 __ orr(decL1, arrangement, decL1, decH1); 9936 __ orr(decL2, arrangement, decL2, decH2); 9937 __ orr(decL3, arrangement, decL3, decH3); 9938 9939 // check illegal inputs, value larger than 63 (maximum of 6 bits) 9940 __ cm(Assembler::HI, decH0, arrangement, decL0, v27); 9941 __ cm(Assembler::HI, decH1, arrangement, decL1, v27); 9942 __ cm(Assembler::HI, decH2, arrangement, decL2, v27); 9943 __ cm(Assembler::HI, decH3, arrangement, decL3, v27); 9944 __ orr(in0, arrangement, decH0, decH1); 9945 __ orr(in1, arrangement, decH2, decH3); 9946 __ orr(in2, arrangement, in0, in1); 9947 __ umaxv(in3, arrangement, in2); 9948 __ umov(rscratch2, in3, __ B, 0); 9949 9950 // get the data to output 9951 __ shl(out0, arrangement, decL0, 2); 9952 __ ushr(out1, arrangement, decL1, 4); 9953 __ orr(out0, arrangement, out0, out1); 9954 __ shl(out1, arrangement, decL1, 4); 9955 __ ushr(out2, arrangement, decL2, 2); 9956 __ orr(out1, arrangement, out1, out2); 9957 __ shl(out2, arrangement, decL2, 6); 9958 __ orr(out2, arrangement, out2, decL3); 9959 9960 __ 
cbz(rscratch2, NoIllegalData); 9961 9962 // handle illegal input 9963 __ umov(r10, in2, __ D, 0); 9964 if (size == 16) { 9965 __ cbnz(r10, ErrorInLowerHalf); 9966 9967 // illegal input is in higher half, store the lower half now. 9968 __ st3(out0, out1, out2, __ T8B, __ post(dst, 24)); 9969 9970 __ umov(r10, in2, __ D, 1); 9971 __ umov(r11, out0, __ D, 1); 9972 __ umov(r12, out1, __ D, 1); 9973 __ umov(r13, out2, __ D, 1); 9974 __ b(StoreLegalData); 9975 9976 __ BIND(ErrorInLowerHalf); 9977 } 9978 __ umov(r11, out0, __ D, 0); 9979 __ umov(r12, out1, __ D, 0); 9980 __ umov(r13, out2, __ D, 0); 9981 9982 __ BIND(StoreLegalData); 9983 __ tbnz(r10, 5, Exit); // 0xff indicates illegal input 9984 __ strb(r11, __ post(dst, 1)); 9985 __ strb(r12, __ post(dst, 1)); 9986 __ strb(r13, __ post(dst, 1)); 9987 __ lsr(r10, r10, 8); 9988 __ lsr(r11, r11, 8); 9989 __ lsr(r12, r12, 8); 9990 __ lsr(r13, r13, 8); 9991 __ b(StoreLegalData); 9992 9993 __ BIND(NoIllegalData); 9994 __ st3(out0, out1, out2, arrangement, __ post(dst, 3 * size)); 9995 } 9996 9997 9998 /** 9999 * Arguments: 10000 * 10001 * Input: 10002 * c_rarg0 - src_start 10003 * c_rarg1 - src_offset 10004 * c_rarg2 - src_length 10005 * c_rarg3 - dest_start 10006 * c_rarg4 - dest_offset 10007 * c_rarg5 - isURL 10008 * c_rarg6 - isMIME 10009 * 10010 */ 10011 address generate_base64_decodeBlock() { 10012 10013 // The SIMD part of this Base64 decode intrinsic is based on the algorithm outlined 10014 // on http://0x80.pl/articles/base64-simd-neon.html#encoding-quadwords, in section 10015 // titled "Base64 decoding". 10016 10017 // Non-SIMD lookup tables are mostly dumped from fromBase64 array used in java.util.Base64, 10018 // except the trailing character '=' is also treated illegal value in this intrinsic. That 10019 // is java.util.Base64.fromBase64['='] = -2, while fromBase(URL)64ForNoSIMD['='] = 255 here. 
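    // For example, fromBase64ForNoSIMD['A'] == 0u and fromBase64ForNoSIMD['+'] == 62u,
    // but fromBase64ForNoSIMD['='] == 255u, so a padding character makes this stub
    // stop early and leave the remaining input to the caller.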
10020 static const uint8_t fromBase64ForNoSIMD[256] = { 10021 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10022 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10023 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 255u, 63u, 10024 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 10025 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u, 10026 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 255u, 10027 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u, 10028 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u, 10029 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10030 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10031 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10032 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10033 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10034 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10035 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10036 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10037 }; 10038 10039 static const uint8_t fromBase64URLForNoSIMD[256] = { 10040 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10041 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10042 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 10043 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 10044 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u, 10045 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 63u, 10046 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u, 10047 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u, 10048 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10049 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10050 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10051 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10052 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10053 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10054 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10055 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10056 }; 10057 10058 // A legal value of base64 code is in range [0, 127]. We need two lookups 10059 // with tbl/tbx and combine them to get the decode data. The 1st table vector 10060 // lookup use tbl, out of range indices are set to 0 in destination. 
The 2nd 10061 // table vector lookup use tbx, out of range indices are unchanged in 10062 // destination. Input [64..126] is mapped to index [65, 127] in second lookup. 10063 // The value of index 64 is set to 0, so that we know that we already get the 10064 // decoded data with the 1st lookup. 10065 static const uint8_t fromBase64ForSIMD[128] = { 10066 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10067 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10068 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 255u, 63u, 10069 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 10070 0u, 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 10071 14u, 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 10072 255u, 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 10073 40u, 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 10074 }; 10075 10076 static const uint8_t fromBase64URLForSIMD[128] = { 10077 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10078 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10079 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 10080 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 10081 0u, 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 10082 14u, 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 10083 63u, 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 10084 40u, 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 10085 }; 10086 10087 __ align(CodeEntryAlignment); 10088 StubGenStubId stub_id = StubGenStubId::base64_decodeBlock_id; 10089 StubCodeMark mark(this, stub_id); 10090 address start = __ pc(); 10091 10092 Register src = c_rarg0; // source array 10093 Register soff = c_rarg1; // source start offset 10094 Register send = c_rarg2; // source end offset 10095 Register dst = c_rarg3; // dest array 10096 Register doff = c_rarg4; // position for writing to dest array 10097 Register isURL = c_rarg5; // Base64 or URL character set 10098 Register isMIME = c_rarg6; // Decoding MIME block - unused in this implementation 10099 10100 Register length = send; // reuse send as length of source data to process 10101 10102 Register simd_codec = c_rarg6; 10103 Register nosimd_codec = c_rarg7; 10104 10105 Label ProcessData, Process64B, Process32B, Process4B, SIMDEnter, SIMDExit, Exit; 10106 10107 __ enter(); 10108 10109 __ add(src, src, soff); 10110 __ add(dst, dst, doff); 10111 10112 __ mov(doff, dst); 10113 10114 __ sub(length, send, soff); 10115 __ bfm(length, zr, 0, 1); 10116 10117 __ lea(nosimd_codec, ExternalAddress((address) fromBase64ForNoSIMD)); 10118 __ cbz(isURL, ProcessData); 10119 __ lea(nosimd_codec, ExternalAddress((address) fromBase64URLForNoSIMD)); 10120 10121 __ BIND(ProcessData); 10122 __ mov(rscratch1, length); 10123 __ cmp(length, (u1)144); // 144 = 80 + 64 10124 __ br(Assembler::LT, Process4B); 10125 10126 // In the MIME case, the line length cannot be more than 76 10127 // bytes (see RFC 2045). This is too short a block for SIMD 10128 // to be worthwhile, so we use non-SIMD here. 
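    // Loading 79 here (rather than the remaining length) makes the Process4B
    // loop below run exactly 20 times, i.e. decode 80 bytes, before subsw
    // drives rscratch1 negative; the cbzw test after the loop then tells this
    // pre-processing pass (rscratch1 == -1) apart from the plain scalar tail
    // (rscratch1 == 0).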
10129 __ movw(rscratch1, 79); 10130 10131 __ BIND(Process4B); 10132 __ ldrw(r14, __ post(src, 4)); 10133 __ ubfxw(r10, r14, 0, 8); 10134 __ ubfxw(r11, r14, 8, 8); 10135 __ ubfxw(r12, r14, 16, 8); 10136 __ ubfxw(r13, r14, 24, 8); 10137 // get the de-code 10138 __ ldrb(r10, Address(nosimd_codec, r10, Address::uxtw(0))); 10139 __ ldrb(r11, Address(nosimd_codec, r11, Address::uxtw(0))); 10140 __ ldrb(r12, Address(nosimd_codec, r12, Address::uxtw(0))); 10141 __ ldrb(r13, Address(nosimd_codec, r13, Address::uxtw(0))); 10142 // error detection, 255u indicates an illegal input 10143 __ orrw(r14, r10, r11); 10144 __ orrw(r15, r12, r13); 10145 __ orrw(r14, r14, r15); 10146 __ tbnz(r14, 7, Exit); 10147 // recover the data 10148 __ lslw(r14, r10, 10); 10149 __ bfiw(r14, r11, 4, 6); 10150 __ bfmw(r14, r12, 2, 5); 10151 __ rev16w(r14, r14); 10152 __ bfiw(r13, r12, 6, 2); 10153 __ strh(r14, __ post(dst, 2)); 10154 __ strb(r13, __ post(dst, 1)); 10155 // non-simd loop 10156 __ subsw(rscratch1, rscratch1, 4); 10157 __ br(Assembler::GT, Process4B); 10158 10159 // if exiting from PreProcess80B, rscratch1 == -1; 10160 // otherwise, rscratch1 == 0. 10161 __ cbzw(rscratch1, Exit); 10162 __ sub(length, length, 80); 10163 10164 __ lea(simd_codec, ExternalAddress((address) fromBase64ForSIMD)); 10165 __ cbz(isURL, SIMDEnter); 10166 __ lea(simd_codec, ExternalAddress((address) fromBase64URLForSIMD)); 10167 10168 __ BIND(SIMDEnter); 10169 __ ld1(v0, v1, v2, v3, __ T16B, __ post(simd_codec, 64)); 10170 __ ld1(v4, v5, v6, v7, __ T16B, Address(simd_codec)); 10171 __ mov(rscratch1, 63); 10172 __ dup(v27, __ T16B, rscratch1); 10173 10174 __ BIND(Process64B); 10175 __ cmp(length, (u1)64); 10176 __ br(Assembler::LT, Process32B); 10177 generate_base64_decode_simdround(src, dst, v0, v4, 16, Exit); 10178 __ sub(length, length, 64); 10179 __ b(Process64B); 10180 10181 __ BIND(Process32B); 10182 __ cmp(length, (u1)32); 10183 __ br(Assembler::LT, SIMDExit); 10184 generate_base64_decode_simdround(src, dst, v0, v4, 8, Exit); 10185 __ sub(length, length, 32); 10186 __ b(Process32B); 10187 10188 __ BIND(SIMDExit); 10189 __ cbz(length, Exit); 10190 __ movw(rscratch1, length); 10191 __ b(Process4B); 10192 10193 __ BIND(Exit); 10194 __ sub(c_rarg0, dst, doff); 10195 10196 __ leave(); 10197 __ ret(lr); 10198 10199 return start; 10200 } 10201 10202 // Support for spin waits. 
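  // The stub body is just MacroAssembler::spin_wait(); the actual pause
  // sequence it expands to (for example nop, isb or yield) is selected
  // elsewhere from the VM's spin-wait configuration.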
10203 address generate_spin_wait() { 10204 __ align(CodeEntryAlignment); 10205 StubGenStubId stub_id = StubGenStubId::spin_wait_id; 10206 StubCodeMark mark(this, stub_id); 10207 address start = __ pc(); 10208 10209 __ spin_wait(); 10210 __ ret(lr); 10211 10212 return start; 10213 } 10214 10215 void generate_lookup_secondary_supers_table_stub() { 10216 StubGenStubId stub_id = StubGenStubId::lookup_secondary_supers_table_id; 10217 StubCodeMark mark(this, stub_id); 10218 10219 const Register 10220 r_super_klass = r0, 10221 r_array_base = r1, 10222 r_array_length = r2, 10223 r_array_index = r3, 10224 r_sub_klass = r4, 10225 r_bitmap = rscratch2, 10226 result = r5; 10227 const FloatRegister 10228 vtemp = v0; 10229 10230 for (int slot = 0; slot < Klass::SECONDARY_SUPERS_TABLE_SIZE; slot++) { 10231 StubRoutines::_lookup_secondary_supers_table_stubs[slot] = __ pc(); 10232 Label L_success; 10233 __ enter(); 10234 __ lookup_secondary_supers_table_const(r_sub_klass, r_super_klass, 10235 r_array_base, r_array_length, r_array_index, 10236 vtemp, result, slot, 10237 /*stub_is_near*/true); 10238 __ leave(); 10239 __ ret(lr); 10240 } 10241 } 10242 10243 // Slow path implementation for UseSecondarySupersTable. 10244 address generate_lookup_secondary_supers_table_slow_path_stub() { 10245 StubGenStubId stub_id = StubGenStubId::lookup_secondary_supers_table_slow_path_id; 10246 StubCodeMark mark(this, stub_id); 10247 10248 address start = __ pc(); 10249 const Register 10250 r_super_klass = r0, // argument 10251 r_array_base = r1, // argument 10252 temp1 = r2, // temp 10253 r_array_index = r3, // argument 10254 r_bitmap = rscratch2, // argument 10255 result = r5; // argument 10256 10257 __ lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base, r_array_index, r_bitmap, temp1, result); 10258 __ ret(lr); 10259 10260 return start; 10261 } 10262 10263 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS) 10264 10265 // ARMv8.1 LSE versions of the atomic stubs used by Atomic::PlatformXX. 10266 // 10267 // If LSE is in use, generate LSE versions of all the stubs. The 10268 // non-LSE versions are in atomic_aarch64.S. 10269 10270 // class AtomicStubMark records the entry point of a stub and the 10271 // stub pointer which will point to it. The stub pointer is set to 10272 // the entry point when ~AtomicStubMark() is called, which must be 10273 // after ICache::invalidate_range. This ensures safe publication of 10274 // the generated code. 10275 class AtomicStubMark { 10276 address _entry_point; 10277 aarch64_atomic_stub_t *_stub; 10278 MacroAssembler *_masm; 10279 public: 10280 AtomicStubMark(MacroAssembler *masm, aarch64_atomic_stub_t *stub) { 10281 _masm = masm; 10282 __ align(32); 10283 _entry_point = __ pc(); 10284 _stub = stub; 10285 } 10286 ~AtomicStubMark() { 10287 *_stub = (aarch64_atomic_stub_t)_entry_point; 10288 } 10289 }; 10290 10291 // NB: For memory_order_conservative we need a trailing membar after 10292 // LSE atomic operations but not a leading membar. 10293 // 10294 // We don't need a leading membar because a clause in the Arm ARM 10295 // says: 10296 // 10297 // Barrier-ordered-before 10298 // 10299 // Barrier instructions order prior Memory effects before subsequent 10300 // Memory effects generated by the same Observer. A read or a write 10301 // RW1 is Barrier-ordered-before a read or a write RW 2 from the same 10302 // Observer if and only if RW1 appears in program order before RW 2 10303 // and [ ... 
] at least one of RW 1 and RW 2 is generated by an atomic 10304 // instruction with both Acquire and Release semantics. 10305 // 10306 // All the atomic instructions {ldaddal, swapal, casal} have Acquire 10307 // and Release semantics, therefore we don't need a leading 10308 // barrier. However, there is no corresponding Barrier-ordered-after 10309 // relationship, therefore we need a trailing membar to prevent a 10310 // later store or load from being reordered with the store in an 10311 // atomic instruction. 10312 // 10313 // This was checked by using the herd7 consistency model simulator 10314 // (http://diy.inria.fr/) with this test case: 10315 // 10316 // AArch64 LseCas 10317 // { 0:X1=x; 0:X2=y; 1:X1=x; 1:X2=y; } 10318 // P0 | P1; 10319 // LDR W4, [X2] | MOV W3, #0; 10320 // DMB LD | MOV W4, #1; 10321 // LDR W3, [X1] | CASAL W3, W4, [X1]; 10322 // | DMB ISH; 10323 // | STR W4, [X2]; 10324 // exists 10325 // (0:X3=0 /\ 0:X4=1) 10326 // 10327 // If X3 == 0 && X4 == 1, the store to y in P1 has been reordered 10328 // with the store to x in P1. Without the DMB in P1 this may happen. 10329 // 10330 // At the time of writing we don't know of any AArch64 hardware that 10331 // reorders stores in this way, but the Reference Manual permits it. 10332 10333 void gen_cas_entry(Assembler::operand_size size, 10334 atomic_memory_order order) { 10335 Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1, 10336 exchange_val = c_rarg2; 10337 bool acquire, release; 10338 switch (order) { 10339 case memory_order_relaxed: 10340 acquire = false; 10341 release = false; 10342 break; 10343 case memory_order_release: 10344 acquire = false; 10345 release = true; 10346 break; 10347 default: 10348 acquire = true; 10349 release = true; 10350 break; 10351 } 10352 __ mov(prev, compare_val); 10353 __ lse_cas(prev, exchange_val, ptr, size, acquire, release, /*not_pair*/true); 10354 if (order == memory_order_conservative) { 10355 __ membar(Assembler::StoreStore|Assembler::StoreLoad); 10356 } 10357 if (size == Assembler::xword) { 10358 __ mov(r0, prev); 10359 } else { 10360 __ movw(r0, prev); 10361 } 10362 __ ret(lr); 10363 } 10364 10365 void gen_ldadd_entry(Assembler::operand_size size, atomic_memory_order order) { 10366 Register prev = r2, addr = c_rarg0, incr = c_rarg1; 10367 // If not relaxed, then default to conservative. Relaxed is the only 10368 // case we use enough to be worth specializing. 10369 if (order == memory_order_relaxed) { 10370 __ ldadd(size, incr, prev, addr); 10371 } else { 10372 __ ldaddal(size, incr, prev, addr); 10373 __ membar(Assembler::StoreStore|Assembler::StoreLoad); 10374 } 10375 if (size == Assembler::xword) { 10376 __ mov(r0, prev); 10377 } else { 10378 __ movw(r0, prev); 10379 } 10380 __ ret(lr); 10381 } 10382 10383 void gen_swpal_entry(Assembler::operand_size size) { 10384 Register prev = r2, addr = c_rarg0, incr = c_rarg1; 10385 __ swpal(size, incr, prev, addr); 10386 __ membar(Assembler::StoreStore|Assembler::StoreLoad); 10387 if (size == Assembler::xword) { 10388 __ mov(r0, prev); 10389 } else { 10390 __ movw(r0, prev); 10391 } 10392 __ ret(lr); 10393 } 10394 10395 void generate_atomic_entry_points() { 10396 if (! 
UseLSE) { 10397 return; 10398 } 10399 __ align(CodeEntryAlignment); 10400 StubGenStubId stub_id = StubGenStubId::atomic_entry_points_id; 10401 StubCodeMark mark(this, stub_id); 10402 address first_entry = __ pc(); 10403 10404 // ADD, memory_order_conservative 10405 AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl); 10406 gen_ldadd_entry(Assembler::word, memory_order_conservative); 10407 AtomicStubMark mark_fetch_add_8(_masm, &aarch64_atomic_fetch_add_8_impl); 10408 gen_ldadd_entry(Assembler::xword, memory_order_conservative); 10409 10410 // ADD, memory_order_relaxed 10411 AtomicStubMark mark_fetch_add_4_relaxed 10412 (_masm, &aarch64_atomic_fetch_add_4_relaxed_impl); 10413 gen_ldadd_entry(MacroAssembler::word, memory_order_relaxed); 10414 AtomicStubMark mark_fetch_add_8_relaxed 10415 (_masm, &aarch64_atomic_fetch_add_8_relaxed_impl); 10416 gen_ldadd_entry(MacroAssembler::xword, memory_order_relaxed); 10417 10418 // XCHG, memory_order_conservative 10419 AtomicStubMark mark_xchg_4(_masm, &aarch64_atomic_xchg_4_impl); 10420 gen_swpal_entry(Assembler::word); 10421 AtomicStubMark mark_xchg_8_impl(_masm, &aarch64_atomic_xchg_8_impl); 10422 gen_swpal_entry(Assembler::xword); 10423 10424 // CAS, memory_order_conservative 10425 AtomicStubMark mark_cmpxchg_1(_masm, &aarch64_atomic_cmpxchg_1_impl); 10426 gen_cas_entry(MacroAssembler::byte, memory_order_conservative); 10427 AtomicStubMark mark_cmpxchg_4(_masm, &aarch64_atomic_cmpxchg_4_impl); 10428 gen_cas_entry(MacroAssembler::word, memory_order_conservative); 10429 AtomicStubMark mark_cmpxchg_8(_masm, &aarch64_atomic_cmpxchg_8_impl); 10430 gen_cas_entry(MacroAssembler::xword, memory_order_conservative); 10431 10432 // CAS, memory_order_relaxed 10433 AtomicStubMark mark_cmpxchg_1_relaxed 10434 (_masm, &aarch64_atomic_cmpxchg_1_relaxed_impl); 10435 gen_cas_entry(MacroAssembler::byte, memory_order_relaxed); 10436 AtomicStubMark mark_cmpxchg_4_relaxed 10437 (_masm, &aarch64_atomic_cmpxchg_4_relaxed_impl); 10438 gen_cas_entry(MacroAssembler::word, memory_order_relaxed); 10439 AtomicStubMark mark_cmpxchg_8_relaxed 10440 (_masm, &aarch64_atomic_cmpxchg_8_relaxed_impl); 10441 gen_cas_entry(MacroAssembler::xword, memory_order_relaxed); 10442 10443 AtomicStubMark mark_cmpxchg_4_release 10444 (_masm, &aarch64_atomic_cmpxchg_4_release_impl); 10445 gen_cas_entry(MacroAssembler::word, memory_order_release); 10446 AtomicStubMark mark_cmpxchg_8_release 10447 (_masm, &aarch64_atomic_cmpxchg_8_release_impl); 10448 gen_cas_entry(MacroAssembler::xword, memory_order_release); 10449 10450 AtomicStubMark mark_cmpxchg_4_seq_cst 10451 (_masm, &aarch64_atomic_cmpxchg_4_seq_cst_impl); 10452 gen_cas_entry(MacroAssembler::word, memory_order_seq_cst); 10453 AtomicStubMark mark_cmpxchg_8_seq_cst 10454 (_masm, &aarch64_atomic_cmpxchg_8_seq_cst_impl); 10455 gen_cas_entry(MacroAssembler::xword, memory_order_seq_cst); 10456 10457 ICache::invalidate_range(first_entry, __ pc() - first_entry); 10458 } 10459 #endif // LINUX 10460 10461 address generate_cont_thaw(Continuation::thaw_kind kind) { 10462 bool return_barrier = Continuation::is_thaw_return_barrier(kind); 10463 bool return_barrier_exception = Continuation::is_thaw_return_barrier_exception(kind); 10464 10465 address start = __ pc(); 10466 10467 if (return_barrier) { 10468 __ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())); 10469 __ mov(sp, rscratch1); 10470 } 10471 assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), 
Assembler::EQ, "incorrect sp"); 10472 10473 if (return_barrier) { 10474 // preserve possible return value from a method returning to the return barrier 10475 __ fmovd(rscratch1, v0); 10476 __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize))); 10477 } 10478 10479 __ movw(c_rarg1, (return_barrier ? 1 : 0)); 10480 __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), rthread, c_rarg1); 10481 __ mov(rscratch2, r0); // r0 contains the size of the frames to thaw, 0 if overflow or no more frames 10482 10483 if (return_barrier) { 10484 // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK) 10485 __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize))); 10486 __ fmovd(v0, rscratch1); 10487 } 10488 assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp"); 10489 10490 10491 Label thaw_success; 10492 // rscratch2 contains the size of the frames to thaw, 0 if overflow or no more frames 10493 __ cbnz(rscratch2, thaw_success); 10494 __ lea(rscratch1, RuntimeAddress(SharedRuntime::throw_StackOverflowError_entry())); 10495 __ br(rscratch1); 10496 __ bind(thaw_success); 10497 10498 // make room for the thawed frames 10499 __ sub(rscratch1, sp, rscratch2); 10500 __ andr(rscratch1, rscratch1, -16); // align 10501 __ mov(sp, rscratch1); 10502 10503 if (return_barrier) { 10504 // save original return value -- again 10505 __ fmovd(rscratch1, v0); 10506 __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize))); 10507 } 10508 10509 // If we want, we can templatize thaw by kind, and have three different entries 10510 __ movw(c_rarg1, (uint32_t)kind); 10511 10512 __ call_VM_leaf(Continuation::thaw_entry(), rthread, c_rarg1); 10513 __ mov(rscratch2, r0); // r0 is the sp of the yielding frame 10514 10515 if (return_barrier) { 10516 // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK) 10517 __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize))); 10518 __ fmovd(v0, rscratch1); 10519 } else { 10520 __ mov(r0, zr); // return 0 (success) from doYield 10521 } 10522 10523 // we're now on the yield frame (which is in an address above us b/c rsp has been pushed down) 10524 __ sub(sp, rscratch2, 2*wordSize); // now pointing to rfp spill 10525 __ mov(rfp, sp); 10526 10527 if (return_barrier_exception) { 10528 __ ldr(c_rarg1, Address(rfp, wordSize)); // return address 10529 __ authenticate_return_address(c_rarg1); 10530 __ verify_oop(r0); 10531 // save return value containing the exception oop in callee-saved R19 10532 __ mov(r19, r0); 10533 10534 __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), rthread, c_rarg1); 10535 10536 // Reinitialize the ptrue predicate register, in case the external runtime call clobbers ptrue reg, as we may return to SVE compiled code. 
10537 // __ reinitialize_ptrue(); 10538 10539 // see OptoRuntime::generate_exception_blob: r0 -- exception oop, r3 -- exception pc 10540 10541 __ mov(r1, r0); // the exception handler 10542 __ mov(r0, r19); // restore return value containing the exception oop 10543 __ verify_oop(r0); 10544 10545 __ leave(); 10546 __ mov(r3, lr); 10547 __ br(r1); // the exception handler 10548 } else { 10549 // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame 10550 __ leave(); 10551 __ ret(lr); 10552 } 10553 10554 return start; 10555 } 10556 10557 address generate_cont_thaw() { 10558 if (!Continuations::enabled()) return nullptr; 10559 10560 StubGenStubId stub_id = StubGenStubId::cont_thaw_id; 10561 StubCodeMark mark(this, stub_id); 10562 address start = __ pc(); 10563 generate_cont_thaw(Continuation::thaw_top); 10564 return start; 10565 } 10566 10567 address generate_cont_returnBarrier() { 10568 if (!Continuations::enabled()) return nullptr; 10569 10570 // TODO: will probably need multiple return barriers depending on return type 10571 StubGenStubId stub_id = StubGenStubId::cont_returnBarrier_id; 10572 StubCodeMark mark(this, stub_id); 10573 address start = __ pc(); 10574 10575 generate_cont_thaw(Continuation::thaw_return_barrier); 10576 10577 return start; 10578 } 10579 10580 address generate_cont_returnBarrier_exception() { 10581 if (!Continuations::enabled()) return nullptr; 10582 10583 StubGenStubId stub_id = StubGenStubId::cont_returnBarrierExc_id; 10584 StubCodeMark mark(this, stub_id); 10585 address start = __ pc(); 10586 10587 generate_cont_thaw(Continuation::thaw_return_barrier_exception); 10588 10589 return start; 10590 } 10591 10592 address generate_cont_preempt_stub() { 10593 if (!Continuations::enabled()) return nullptr; 10594 StubGenStubId stub_id = StubGenStubId::cont_preempt_id; 10595 StubCodeMark mark(this, stub_id); 10596 address start = __ pc(); 10597 10598 __ reset_last_Java_frame(true); 10599 10600 // Set sp to enterSpecial frame, i.e. remove all frames copied into the heap. 10601 __ ldr(rscratch2, Address(rthread, JavaThread::cont_entry_offset())); 10602 __ mov(sp, rscratch2); 10603 10604 Label preemption_cancelled; 10605 __ ldrb(rscratch1, Address(rthread, JavaThread::preemption_cancelled_offset())); 10606 __ cbnz(rscratch1, preemption_cancelled); 10607 10608 // Remove enterSpecial frame from the stack and return to Continuation.run() to unmount. 10609 SharedRuntime::continuation_enter_cleanup(_masm); 10610 __ leave(); 10611 __ ret(lr); 10612 10613 // We acquired the monitor after freezing the frames so call thaw to continue execution. 10614 __ bind(preemption_cancelled); 10615 __ strb(zr, Address(rthread, JavaThread::preemption_cancelled_offset())); 10616 __ lea(rfp, Address(sp, checked_cast<int32_t>(ContinuationEntry::size()))); 10617 __ lea(rscratch1, ExternalAddress(ContinuationEntry::thaw_call_pc_address())); 10618 __ ldr(rscratch1, Address(rscratch1)); 10619 __ br(rscratch1); 10620 10621 return start; 10622 } 10623 10624 // In sun.security.util.math.intpoly.IntegerPolynomial1305, integers 10625 // are represented as long[5], with BITS_PER_LIMB = 26. 10626 // Pack five 26-bit limbs into three 64-bit registers. 
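  // In C, approximately (a sketch of the packing; the limbs are src[0..4],
  // each nominally 26 bits):
  //
  //   dest0 =  src[0]        + (src[1] << 26) + (src[2] << 52);  // bits 0..63
  //   dest1 = (src[2] >> 12) + (src[3] << 14) + (src[4] << 40);  // bits 64..127
  //   dest2 =  src[4] >> 24;                                     // bits 128..129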
10627 void pack_26(Register dest0, Register dest1, Register dest2, Register src) { 10628 __ ldp(dest0, rscratch1, Address(src, 0)); // 26 bits 10629 __ add(dest0, dest0, rscratch1, Assembler::LSL, 26); // 26 bits 10630 __ ldp(rscratch1, rscratch2, Address(src, 2 * sizeof (jlong))); 10631 __ add(dest0, dest0, rscratch1, Assembler::LSL, 52); // 12 bits 10632 10633 __ add(dest1, zr, rscratch1, Assembler::LSR, 12); // 14 bits 10634 __ add(dest1, dest1, rscratch2, Assembler::LSL, 14); // 26 bits 10635 __ ldr(rscratch1, Address(src, 4 * sizeof (jlong))); 10636 __ add(dest1, dest1, rscratch1, Assembler::LSL, 40); // 24 bits 10637 10638 if (dest2->is_valid()) { 10639 __ add(dest2, zr, rscratch1, Assembler::LSR, 24); // 2 bits 10640 } else { 10641 #ifdef ASSERT 10642 Label OK; 10643 __ cmp(zr, rscratch1, Assembler::LSR, 24); // 2 bits 10644 __ br(__ EQ, OK); 10645 __ stop("high bits of Poly1305 integer should be zero"); 10646 __ should_not_reach_here(); 10647 __ bind(OK); 10648 #endif 10649 } 10650 } 10651 10652 // As above, but return only a 128-bit integer, packed into two 10653 // 64-bit registers. 10654 void pack_26(Register dest0, Register dest1, Register src) { 10655 pack_26(dest0, dest1, noreg, src); 10656 } 10657 10658 // Multiply and multiply-accumulate unsigned 64-bit registers. 10659 void wide_mul(Register prod_lo, Register prod_hi, Register n, Register m) { 10660 __ mul(prod_lo, n, m); 10661 __ umulh(prod_hi, n, m); 10662 } 10663 void wide_madd(Register sum_lo, Register sum_hi, Register n, Register m) { 10664 wide_mul(rscratch1, rscratch2, n, m); 10665 __ adds(sum_lo, sum_lo, rscratch1); 10666 __ adc(sum_hi, sum_hi, rscratch2); 10667 } 10668 10669 // Poly1305, RFC 7539 10670 10671 // See https://loup-vaillant.fr/tutorials/poly1305-design for a 10672 // description of the tricks used to simplify and accelerate this 10673 // computation. 10674 10675 address generate_poly1305_processBlocks() { 10676 __ align(CodeEntryAlignment); 10677 StubGenStubId stub_id = StubGenStubId::poly1305_processBlocks_id; 10678 StubCodeMark mark(this, stub_id); 10679 address start = __ pc(); 10680 Label here; 10681 __ enter(); 10682 RegSet callee_saved = RegSet::range(r19, r28); 10683 __ push(callee_saved, sp); 10684 10685 RegSetIterator<Register> regs = (RegSet::range(c_rarg0, r28) - r18_tls - rscratch1 - rscratch2).begin(); 10686 10687 // Arguments 10688 const Register input_start = *regs, length = *++regs, acc_start = *++regs, r_start = *++regs; 10689 10690 // R_n is the 128-bit randomly-generated key, packed into two 10691 // registers. The caller passes this key to us as long[5], with 10692 // BITS_PER_LIMB = 26. 
10693 const Register R_0 = *++regs, R_1 = *++regs; 10694 pack_26(R_0, R_1, r_start); 10695 10696 // RR_n is (R_n >> 2) * 5 10697 const Register RR_0 = *++regs, RR_1 = *++regs; 10698 __ lsr(RR_0, R_0, 2); 10699 __ add(RR_0, RR_0, RR_0, Assembler::LSL, 2); 10700 __ lsr(RR_1, R_1, 2); 10701 __ add(RR_1, RR_1, RR_1, Assembler::LSL, 2); 10702 10703 // U_n is the current checksum 10704 const Register U_0 = *++regs, U_1 = *++regs, U_2 = *++regs; 10705 pack_26(U_0, U_1, U_2, acc_start); 10706 10707 static constexpr int BLOCK_LENGTH = 16; 10708 Label DONE, LOOP; 10709 10710 __ cmp(length, checked_cast<u1>(BLOCK_LENGTH)); 10711 __ br(Assembler::LT, DONE); { 10712 __ bind(LOOP); 10713 10714 // S_n is to be the sum of U_n and the next block of data 10715 const Register S_0 = *++regs, S_1 = *++regs, S_2 = *++regs; 10716 __ ldp(S_0, S_1, __ post(input_start, 2 * wordSize)); 10717 __ adds(S_0, U_0, S_0); 10718 __ adcs(S_1, U_1, S_1); 10719 __ adc(S_2, U_2, zr); 10720 __ add(S_2, S_2, 1); 10721 10722 const Register U_0HI = *++regs, U_1HI = *++regs; 10723 10724 // NB: this logic depends on some of the special properties of 10725 // Poly1305 keys. In particular, because we know that the top 10726 // four bits of R_0 and R_1 are zero, we can add together 10727 // partial products without any risk of needing to propagate a 10728 // carry out. 10729 wide_mul(U_0, U_0HI, S_0, R_0); wide_madd(U_0, U_0HI, S_1, RR_1); wide_madd(U_0, U_0HI, S_2, RR_0); 10730 wide_mul(U_1, U_1HI, S_0, R_1); wide_madd(U_1, U_1HI, S_1, R_0); wide_madd(U_1, U_1HI, S_2, RR_1); 10731 __ andr(U_2, R_0, 3); 10732 __ mul(U_2, S_2, U_2); 10733 10734 // Recycle registers S_0, S_1, S_2 10735 regs = (regs.remaining() + S_0 + S_1 + S_2).begin(); 10736 10737 // Partial reduction mod 2**130 - 5 10738 __ adds(U_1, U_0HI, U_1); 10739 __ adc(U_2, U_1HI, U_2); 10740 // Sum now in U_2:U_1:U_0. 10741 // Dead: U_0HI, U_1HI. 10742 regs = (regs.remaining() + U_0HI + U_1HI).begin(); 10743 10744 // U_2:U_1:U_0 += (U_2 >> 2) * 5 in two steps 10745 10746 // First, U_2:U_1:U_0 += (U_2 >> 2) 10747 __ lsr(rscratch1, U_2, 2); 10748 __ andr(U_2, U_2, (u8)3); 10749 __ adds(U_0, U_0, rscratch1); 10750 __ adcs(U_1, U_1, zr); 10751 __ adc(U_2, U_2, zr); 10752 // Second, U_2:U_1:U_0 += (U_2 >> 2) << 2 10753 __ adds(U_0, U_0, rscratch1, Assembler::LSL, 2); 10754 __ adcs(U_1, U_1, zr); 10755 __ adc(U_2, U_2, zr); 10756 10757 __ sub(length, length, checked_cast<u1>(BLOCK_LENGTH)); 10758 __ cmp(length, checked_cast<u1>(BLOCK_LENGTH)); 10759 __ br(~ Assembler::LT, LOOP); 10760 } 10761 10762 // Further reduce modulo 2^130 - 5 10763 __ lsr(rscratch1, U_2, 2); 10764 __ add(rscratch1, rscratch1, rscratch1, Assembler::LSL, 2); // rscratch1 = U_2 * 5 10765 __ adds(U_0, U_0, rscratch1); // U_0 += U_2 * 5 10766 __ adcs(U_1, U_1, zr); 10767 __ andr(U_2, U_2, (u1)3); 10768 __ adc(U_2, U_2, zr); 10769 10770 // Unpack the sum into five 26-bit limbs and write to memory. 
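    // In C, approximately (the inverse of pack_26 above):
    //
    //   acc_start[0] =  U_0        & ((1 << 26) - 1);
    //   acc_start[1] = (U_0 >> 26) & ((1 << 26) - 1);
    //   acc_start[2] = (U_0 >> 52) | ((U_1 & ((1 << 14) - 1)) << 12);
    //   acc_start[3] = (U_1 >> 14) & ((1 << 26) - 1);
    //   acc_start[4] = (U_1 >> 40) | ((U_2 & 7) << 24);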
10771 __ ubfiz(rscratch1, U_0, 0, 26); 10772 __ ubfx(rscratch2, U_0, 26, 26); 10773 __ stp(rscratch1, rscratch2, Address(acc_start)); 10774 __ ubfx(rscratch1, U_0, 52, 12); 10775 __ bfi(rscratch1, U_1, 12, 14); 10776 __ ubfx(rscratch2, U_1, 14, 26); 10777 __ stp(rscratch1, rscratch2, Address(acc_start, 2 * sizeof (jlong))); 10778 __ ubfx(rscratch1, U_1, 40, 24); 10779 __ bfi(rscratch1, U_2, 24, 3); 10780 __ str(rscratch1, Address(acc_start, 4 * sizeof (jlong))); 10781 10782 __ bind(DONE); 10783 __ pop(callee_saved, sp); 10784 __ leave(); 10785 __ ret(lr); 10786 10787 return start; 10788 } 10789 10790 // exception handler for upcall stubs 10791 address generate_upcall_stub_exception_handler() { 10792 StubGenStubId stub_id = StubGenStubId::upcall_stub_exception_handler_id; 10793 StubCodeMark mark(this, stub_id); 10794 address start = __ pc(); 10795 10796 // Native caller has no idea how to handle exceptions, 10797 // so we just crash here. Up to callee to catch exceptions. 10798 __ verify_oop(r0); 10799 __ movptr(rscratch1, CAST_FROM_FN_PTR(uint64_t, UpcallLinker::handle_uncaught_exception)); 10800 __ blr(rscratch1); 10801 __ should_not_reach_here(); 10802 10803 return start; 10804 } 10805 10806 // load Method* target of MethodHandle 10807 // j_rarg0 = jobject receiver 10808 // rmethod = result 10809 address generate_upcall_stub_load_target() { 10810 StubGenStubId stub_id = StubGenStubId::upcall_stub_load_target_id; 10811 StubCodeMark mark(this, stub_id); 10812 address start = __ pc(); 10813 10814 __ resolve_global_jobject(j_rarg0, rscratch1, rscratch2); 10815 // Load target method from receiver 10816 __ load_heap_oop(rmethod, Address(j_rarg0, java_lang_invoke_MethodHandle::form_offset()), rscratch1, rscratch2); 10817 __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_LambdaForm::vmentry_offset()), rscratch1, rscratch2); 10818 __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_MemberName::method_offset()), rscratch1, rscratch2); 10819 __ access_load_at(T_ADDRESS, IN_HEAP, rmethod, 10820 Address(rmethod, java_lang_invoke_ResolvedMethodName::vmtarget_offset()), 10821 noreg, noreg); 10822 __ str(rmethod, Address(rthread, JavaThread::callee_target_offset())); // just in case callee is deoptimized 10823 10824 __ ret(lr); 10825 10826 return start; 10827 } 10828 10829 #undef __ 10830 #define __ masm-> 10831 10832 class MontgomeryMultiplyGenerator : public MacroAssembler { 10833 10834 Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn, 10835 Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj; 10836 10837 RegSet _toSave; 10838 bool _squaring; 10839 10840 public: 10841 MontgomeryMultiplyGenerator (Assembler *as, bool squaring) 10842 : MacroAssembler(as->code()), _squaring(squaring) { 10843 10844 // Register allocation 10845 10846 RegSetIterator<Register> regs = (RegSet::range(r0, r26) - r18_tls).begin(); 10847 Pa_base = *regs; // Argument registers 10848 if (squaring) 10849 Pb_base = Pa_base; 10850 else 10851 Pb_base = *++regs; 10852 Pn_base = *++regs; 10853 Rlen= *++regs; 10854 inv = *++regs; 10855 Pm_base = *++regs; 10856 10857 // Working registers: 10858 Ra = *++regs; // The current digit of a, b, n, and m. 10859 Rb = *++regs; 10860 Rm = *++regs; 10861 Rn = *++regs; 10862 10863 Pa = *++regs; // Pointers to the current/next digit of a, b, n, and m. 10864 Pb = *++regs; 10865 Pm = *++regs; 10866 Pn = *++regs; 10867 10868 t0 = *++regs; // Three registers which form a 10869 t1 = *++regs; // triple-precision accumuator. 
10870 t2 = *++regs; 10871 10872 Ri = *++regs; // Inner and outer loop indexes. 10873 Rj = *++regs; 10874 10875 Rhi_ab = *++regs; // Product registers: low and high parts 10876 Rlo_ab = *++regs; // of a*b and m*n. 10877 Rhi_mn = *++regs; 10878 Rlo_mn = *++regs; 10879 10880 // r19 and up are callee-saved. 10881 _toSave = RegSet::range(r19, *regs) + Pm_base; 10882 } 10883 10884 private: 10885 void save_regs() { 10886 push(_toSave, sp); 10887 } 10888 10889 void restore_regs() { 10890 pop(_toSave, sp); 10891 } 10892 10893 template <typename T> 10894 void unroll_2(Register count, T block) { 10895 Label loop, end, odd; 10896 tbnz(count, 0, odd); 10897 cbz(count, end); 10898 align(16); 10899 bind(loop); 10900 (this->*block)(); 10901 bind(odd); 10902 (this->*block)(); 10903 subs(count, count, 2); 10904 br(Assembler::GT, loop); 10905 bind(end); 10906 } 10907 10908 template <typename T> 10909 void unroll_2(Register count, T block, Register d, Register s, Register tmp) { 10910 Label loop, end, odd; 10911 tbnz(count, 0, odd); 10912 cbz(count, end); 10913 align(16); 10914 bind(loop); 10915 (this->*block)(d, s, tmp); 10916 bind(odd); 10917 (this->*block)(d, s, tmp); 10918 subs(count, count, 2); 10919 br(Assembler::GT, loop); 10920 bind(end); 10921 } 10922 10923 void pre1(RegisterOrConstant i) { 10924 block_comment("pre1"); 10925 // Pa = Pa_base; 10926 // Pb = Pb_base + i; 10927 // Pm = Pm_base; 10928 // Pn = Pn_base + i; 10929 // Ra = *Pa; 10930 // Rb = *Pb; 10931 // Rm = *Pm; 10932 // Rn = *Pn; 10933 ldr(Ra, Address(Pa_base)); 10934 ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); 10935 ldr(Rm, Address(Pm_base)); 10936 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 10937 lea(Pa, Address(Pa_base)); 10938 lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); 10939 lea(Pm, Address(Pm_base)); 10940 lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 10941 10942 // Zero the m*n result. 10943 mov(Rhi_mn, zr); 10944 mov(Rlo_mn, zr); 10945 } 10946 10947 // The core multiply-accumulate step of a Montgomery 10948 // multiplication. The idea is to schedule operations as a 10949 // pipeline so that instructions with long latencies (loads and 10950 // multiplies) have time to complete before their results are 10951 // used. This most benefits in-order implementations of the 10952 // architecture but out-of-order ones also benefit. 10953 void step() { 10954 block_comment("step"); 10955 // MACC(Ra, Rb, t0, t1, t2); 10956 // Ra = *++Pa; 10957 // Rb = *--Pb; 10958 umulh(Rhi_ab, Ra, Rb); 10959 mul(Rlo_ab, Ra, Rb); 10960 ldr(Ra, pre(Pa, wordSize)); 10961 ldr(Rb, pre(Pb, -wordSize)); 10962 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the 10963 // previous iteration. 
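    // Start the m*n product for this iteration but leave it pending in
    // Rhi_mn:Rlo_mn; it is accumulated by the next step() (or by post1/post2),
    // which keeps the multiply latency off the critical path.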
10964 // MACC(Rm, Rn, t0, t1, t2); 10965 // Rm = *++Pm; 10966 // Rn = *--Pn; 10967 umulh(Rhi_mn, Rm, Rn); 10968 mul(Rlo_mn, Rm, Rn); 10969 ldr(Rm, pre(Pm, wordSize)); 10970 ldr(Rn, pre(Pn, -wordSize)); 10971 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 10972 } 10973 10974 void post1() { 10975 block_comment("post1"); 10976 10977 // MACC(Ra, Rb, t0, t1, t2); 10978 // Ra = *++Pa; 10979 // Rb = *--Pb; 10980 umulh(Rhi_ab, Ra, Rb); 10981 mul(Rlo_ab, Ra, Rb); 10982 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 10983 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 10984 10985 // *Pm = Rm = t0 * inv; 10986 mul(Rm, t0, inv); 10987 str(Rm, Address(Pm)); 10988 10989 // MACC(Rm, Rn, t0, t1, t2); 10990 // t0 = t1; t1 = t2; t2 = 0; 10991 umulh(Rhi_mn, Rm, Rn); 10992 10993 #ifndef PRODUCT 10994 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 10995 { 10996 mul(Rlo_mn, Rm, Rn); 10997 add(Rlo_mn, t0, Rlo_mn); 10998 Label ok; 10999 cbz(Rlo_mn, ok); { 11000 stop("broken Montgomery multiply"); 11001 } bind(ok); 11002 } 11003 #endif 11004 // We have very carefully set things up so that 11005 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 11006 // the lower half of Rm * Rn because we know the result already: 11007 // it must be -t0. t0 + (-t0) must generate a carry iff 11008 // t0 != 0. So, rather than do a mul and an adds we just set 11009 // the carry flag iff t0 is nonzero. 11010 // 11011 // mul(Rlo_mn, Rm, Rn); 11012 // adds(zr, t0, Rlo_mn); 11013 subs(zr, t0, 1); // Set carry iff t0 is nonzero 11014 adcs(t0, t1, Rhi_mn); 11015 adc(t1, t2, zr); 11016 mov(t2, zr); 11017 } 11018 11019 void pre2(RegisterOrConstant i, RegisterOrConstant len) { 11020 block_comment("pre2"); 11021 // Pa = Pa_base + i-len; 11022 // Pb = Pb_base + len; 11023 // Pm = Pm_base + i-len; 11024 // Pn = Pn_base + len; 11025 11026 if (i.is_register()) { 11027 sub(Rj, i.as_register(), len); 11028 } else { 11029 mov(Rj, i.as_constant()); 11030 sub(Rj, Rj, len); 11031 } 11032 // Rj == i-len 11033 11034 lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord))); 11035 lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord))); 11036 lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 11037 lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord))); 11038 11039 // Ra = *++Pa; 11040 // Rb = *--Pb; 11041 // Rm = *++Pm; 11042 // Rn = *--Pn; 11043 ldr(Ra, pre(Pa, wordSize)); 11044 ldr(Rb, pre(Pb, -wordSize)); 11045 ldr(Rm, pre(Pm, wordSize)); 11046 ldr(Rn, pre(Pn, -wordSize)); 11047 11048 mov(Rhi_mn, zr); 11049 mov(Rlo_mn, zr); 11050 } 11051 11052 void post2(RegisterOrConstant i, RegisterOrConstant len) { 11053 block_comment("post2"); 11054 if (i.is_constant()) { 11055 mov(Rj, i.as_constant()-len.as_constant()); 11056 } else { 11057 sub(Rj, i.as_register(), len); 11058 } 11059 11060 adds(t0, t0, Rlo_mn); // The pending m*n, low part 11061 11062 // As soon as we know the least significant digit of our result, 11063 // store it. 11064 // Pm_base[i-len] = t0; 11065 str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 11066 11067 // t0 = t1; t1 = t2; t2 = 0; 11068 adcs(t0, t1, Rhi_mn); // The pending m*n, high part 11069 adc(t1, t2, zr); 11070 mov(t2, zr); 11071 } 11072 11073 // A carry in t0 after Montgomery multiplication means that we 11074 // should subtract multiples of n from our result in m. We'll 11075 // keep doing that until there is no carry. 
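  // In C, approximately (a sketch of the sub() referred to in the comments
  // below; it subtracts n from m once and propagates the final borrow into t0):
  //
  //   static julong sub(julong Pm_base[], julong Pn_base[], julong t0, int len) {
  //     julong borrow = 0;
  //     for (int i = 0; i < len; i++) {
  //       julong a = Pm_base[i], b = Pn_base[i];
  //       Pm_base[i] = a - b - borrow;
  //       borrow = (a < b) || (a == b && borrow);
  //     }
  //     return t0 - borrow;
  //   }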
11076 void normalize(RegisterOrConstant len) { 11077 block_comment("normalize"); 11078 // while (t0) 11079 // t0 = sub(Pm_base, Pn_base, t0, len); 11080 Label loop, post, again; 11081 Register cnt = t1, i = t2; // Re-use registers; we're done with them now 11082 cbz(t0, post); { 11083 bind(again); { 11084 mov(i, zr); 11085 mov(cnt, len); 11086 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 11087 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 11088 subs(zr, zr, zr); // set carry flag, i.e. no borrow 11089 align(16); 11090 bind(loop); { 11091 sbcs(Rm, Rm, Rn); 11092 str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 11093 add(i, i, 1); 11094 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 11095 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 11096 sub(cnt, cnt, 1); 11097 } cbnz(cnt, loop); 11098 sbc(t0, t0, zr); 11099 } cbnz(t0, again); 11100 } bind(post); 11101 } 11102 11103 // Move memory at s to d, reversing words. 11104 // Increments d to end of copied memory 11105 // Destroys tmp1, tmp2 11106 // Preserves len 11107 // Leaves s pointing to the address which was in d at start 11108 void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) { 11109 assert(tmp1->encoding() < r19->encoding(), "register corruption"); 11110 assert(tmp2->encoding() < r19->encoding(), "register corruption"); 11111 11112 lea(s, Address(s, len, Address::uxtw(LogBytesPerWord))); 11113 mov(tmp1, len); 11114 unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2); 11115 sub(s, d, len, ext::uxtw, LogBytesPerWord); 11116 } 11117 // where 11118 void reverse1(Register d, Register s, Register tmp) { 11119 ldr(tmp, pre(s, -wordSize)); 11120 ror(tmp, tmp, 32); 11121 str(tmp, post(d, wordSize)); 11122 } 11123 11124 void step_squaring() { 11125 // An extra ACC 11126 step(); 11127 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 11128 } 11129 11130 void last_squaring(RegisterOrConstant i) { 11131 Label dont; 11132 // if ((i & 1) == 0) { 11133 tbnz(i.as_register(), 0, dont); { 11134 // MACC(Ra, Rb, t0, t1, t2); 11135 // Ra = *++Pa; 11136 // Rb = *--Pb; 11137 umulh(Rhi_ab, Ra, Rb); 11138 mul(Rlo_ab, Ra, Rb); 11139 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 11140 } bind(dont); 11141 } 11142 11143 void extra_step_squaring() { 11144 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 11145 11146 // MACC(Rm, Rn, t0, t1, t2); 11147 // Rm = *++Pm; 11148 // Rn = *--Pn; 11149 umulh(Rhi_mn, Rm, Rn); 11150 mul(Rlo_mn, Rm, Rn); 11151 ldr(Rm, pre(Pm, wordSize)); 11152 ldr(Rn, pre(Pn, -wordSize)); 11153 } 11154 11155 void post1_squaring() { 11156 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 11157 11158 // *Pm = Rm = t0 * inv; 11159 mul(Rm, t0, inv); 11160 str(Rm, Address(Pm)); 11161 11162 // MACC(Rm, Rn, t0, t1, t2); 11163 // t0 = t1; t1 = t2; t2 = 0; 11164 umulh(Rhi_mn, Rm, Rn); 11165 11166 #ifndef PRODUCT 11167 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 11168 { 11169 mul(Rlo_mn, Rm, Rn); 11170 add(Rlo_mn, t0, Rlo_mn); 11171 Label ok; 11172 cbz(Rlo_mn, ok); { 11173 stop("broken Montgomery multiply"); 11174 } bind(ok); 11175 } 11176 #endif 11177 // We have very carefully set things up so that 11178 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 11179 // the lower half of Rm * Rn because we know the result already: 11180 // it must be -t0. t0 + (-t0) must generate a carry iff 11181 // t0 != 0. So, rather than do a mul and an adds we just set 11182 // the carry flag iff t0 is nonzero. 
11183 // 11184 // mul(Rlo_mn, Rm, Rn); 11185 // adds(zr, t0, Rlo_mn); 11186 subs(zr, t0, 1); // Set carry iff t0 is nonzero 11187 adcs(t0, t1, Rhi_mn); 11188 adc(t1, t2, zr); 11189 mov(t2, zr); 11190 } 11191 11192 void acc(Register Rhi, Register Rlo, 11193 Register t0, Register t1, Register t2) { 11194 adds(t0, t0, Rlo); 11195 adcs(t1, t1, Rhi); 11196 adc(t2, t2, zr); 11197 } 11198 11199 public: 11200 /** 11201 * Fast Montgomery multiplication. The derivation of the 11202 * algorithm is in A Cryptographic Library for the Motorola 11203 * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237. 11204 * 11205 * Arguments: 11206 * 11207 * Inputs for multiplication: 11208 * c_rarg0 - int array elements a 11209 * c_rarg1 - int array elements b 11210 * c_rarg2 - int array elements n (the modulus) 11211 * c_rarg3 - int length 11212 * c_rarg4 - int inv 11213 * c_rarg5 - int array elements m (the result) 11214 * 11215 * Inputs for squaring: 11216 * c_rarg0 - int array elements a 11217 * c_rarg1 - int array elements n (the modulus) 11218 * c_rarg2 - int length 11219 * c_rarg3 - int inv 11220 * c_rarg4 - int array elements m (the result) 11221 * 11222 */ 11223 address generate_multiply() { 11224 Label argh, nothing; 11225 bind(argh); 11226 stop("MontgomeryMultiply total_allocation must be <= 8192"); 11227 11228 align(CodeEntryAlignment); 11229 address entry = pc(); 11230 11231 cbzw(Rlen, nothing); 11232 11233 enter(); 11234 11235 // Make room. 11236 cmpw(Rlen, 512); 11237 br(Assembler::HI, argh); 11238 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 11239 andr(sp, Ra, -2 * wordSize); 11240 11241 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 11242 11243 { 11244 // Copy input args, reversing as we go. We use Ra as a 11245 // temporary variable. 11246 reverse(Ra, Pa_base, Rlen, t0, t1); 11247 if (!_squaring) 11248 reverse(Ra, Pb_base, Rlen, t0, t1); 11249 reverse(Ra, Pn_base, Rlen, t0, t1); 11250 } 11251 11252 // Push all call-saved registers and also Pm_base which we'll need 11253 // at the end. 
11254 save_regs(); 11255 11256 #ifndef PRODUCT 11257 // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply"); 11258 { 11259 ldr(Rn, Address(Pn_base, 0)); 11260 mul(Rlo_mn, Rn, inv); 11261 subs(zr, Rlo_mn, -1); 11262 Label ok; 11263 br(EQ, ok); { 11264 stop("broken inverse in Montgomery multiply"); 11265 } bind(ok); 11266 } 11267 #endif 11268 11269 mov(Pm_base, Ra); 11270 11271 mov(t0, zr); 11272 mov(t1, zr); 11273 mov(t2, zr); 11274 11275 block_comment("for (int i = 0; i < len; i++) {"); 11276 mov(Ri, zr); { 11277 Label loop, end; 11278 cmpw(Ri, Rlen); 11279 br(Assembler::GE, end); 11280 11281 bind(loop); 11282 pre1(Ri); 11283 11284 block_comment(" for (j = i; j; j--) {"); { 11285 movw(Rj, Ri); 11286 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 11287 } block_comment(" } // j"); 11288 11289 post1(); 11290 addw(Ri, Ri, 1); 11291 cmpw(Ri, Rlen); 11292 br(Assembler::LT, loop); 11293 bind(end); 11294 block_comment("} // i"); 11295 } 11296 11297 block_comment("for (int i = len; i < 2*len; i++) {"); 11298 mov(Ri, Rlen); { 11299 Label loop, end; 11300 cmpw(Ri, Rlen, Assembler::LSL, 1); 11301 br(Assembler::GE, end); 11302 11303 bind(loop); 11304 pre2(Ri, Rlen); 11305 11306 block_comment(" for (j = len*2-i-1; j; j--) {"); { 11307 lslw(Rj, Rlen, 1); 11308 subw(Rj, Rj, Ri); 11309 subw(Rj, Rj, 1); 11310 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 11311 } block_comment(" } // j"); 11312 11313 post2(Ri, Rlen); 11314 addw(Ri, Ri, 1); 11315 cmpw(Ri, Rlen, Assembler::LSL, 1); 11316 br(Assembler::LT, loop); 11317 bind(end); 11318 } 11319 block_comment("} // i"); 11320 11321 normalize(Rlen); 11322 11323 mov(Ra, Pm_base); // Save Pm_base in Ra 11324 restore_regs(); // Restore caller's Pm_base 11325 11326 // Copy our result into caller's Pm_base 11327 reverse(Pm_base, Ra, Rlen, t0, t1); 11328 11329 leave(); 11330 bind(nothing); 11331 ret(lr); 11332 11333 return entry; 11334 } 11335 // In C, approximately: 11336 11337 // void 11338 // montgomery_multiply(julong Pa_base[], julong Pb_base[], 11339 // julong Pn_base[], julong Pm_base[], 11340 // julong inv, int len) { 11341 // julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 11342 // julong *Pa, *Pb, *Pn, *Pm; 11343 // julong Ra, Rb, Rn, Rm; 11344 11345 // int i; 11346 11347 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply"); 11348 11349 // for (i = 0; i < len; i++) { 11350 // int j; 11351 11352 // Pa = Pa_base; 11353 // Pb = Pb_base + i; 11354 // Pm = Pm_base; 11355 // Pn = Pn_base + i; 11356 11357 // Ra = *Pa; 11358 // Rb = *Pb; 11359 // Rm = *Pm; 11360 // Rn = *Pn; 11361 11362 // int iters = i; 11363 // for (j = 0; iters--; j++) { 11364 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 11365 // MACC(Ra, Rb, t0, t1, t2); 11366 // Ra = *++Pa; 11367 // Rb = *--Pb; 11368 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 11369 // MACC(Rm, Rn, t0, t1, t2); 11370 // Rm = *++Pm; 11371 // Rn = *--Pn; 11372 // } 11373 11374 // assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be"); 11375 // MACC(Ra, Rb, t0, t1, t2); 11376 // *Pm = Rm = t0 * inv; 11377 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be"); 11378 // MACC(Rm, Rn, t0, t1, t2); 11379 11380 // assert(t0 == 0, "broken Montgomery multiply"); 11381 11382 // t0 = t1; t1 = t2; t2 = 0; 11383 // } 11384 11385 // for (i = len; i < 2*len; i++) { 11386 // int j; 11387 11388 // Pa = Pa_base + i-len; 11389 // Pb = Pb_base + len; 11390 // Pm = Pm_base + i-len; 11391 // Pn = Pn_base + len; 11392 11393 // Ra = *++Pa; 11394 // Rb = 
*--Pb; 11395 // Rm = *++Pm; 11396 // Rn = *--Pn; 11397 11398 // int iters = len*2-i-1; 11399 // for (j = i-len+1; iters--; j++) { 11400 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 11401 // MACC(Ra, Rb, t0, t1, t2); 11402 // Ra = *++Pa; 11403 // Rb = *--Pb; 11404 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 11405 // MACC(Rm, Rn, t0, t1, t2); 11406 // Rm = *++Pm; 11407 // Rn = *--Pn; 11408 // } 11409 11410 // Pm_base[i-len] = t0; 11411 // t0 = t1; t1 = t2; t2 = 0; 11412 // } 11413 11414 // while (t0) 11415 // t0 = sub(Pm_base, Pn_base, t0, len); 11416 // } 11417 11418 /** 11419 * Fast Montgomery squaring. This uses asymptotically 25% fewer 11420 * multiplies than Montgomery multiplication so it should be up to 11421 * 25% faster. However, its loop control is more complex and it 11422 * may actually run slower on some machines. 11423 * 11424 * Arguments: 11425 * 11426 * Inputs: 11427 * c_rarg0 - int array elements a 11428 * c_rarg1 - int array elements n (the modulus) 11429 * c_rarg2 - int length 11430 * c_rarg3 - int inv 11431 * c_rarg4 - int array elements m (the result) 11432 * 11433 */ 11434 address generate_square() { 11435 Label argh; 11436 bind(argh); 11437 stop("MontgomeryMultiply total_allocation must be <= 8192"); 11438 11439 align(CodeEntryAlignment); 11440 address entry = pc(); 11441 11442 enter(); 11443 11444 // Make room. 11445 cmpw(Rlen, 512); 11446 br(Assembler::HI, argh); 11447 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 11448 andr(sp, Ra, -2 * wordSize); 11449 11450 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 11451 11452 { 11453 // Copy input args, reversing as we go. We use Ra as a 11454 // temporary variable. 11455 reverse(Ra, Pa_base, Rlen, t0, t1); 11456 reverse(Ra, Pn_base, Rlen, t0, t1); 11457 } 11458 11459 // Push all call-saved registers and also Pm_base which we'll need 11460 // at the end. 
    /**
     * Fast Montgomery squaring. This uses asymptotically 25% fewer
     * multiplies than Montgomery multiplication so it should be up to
     * 25% faster. However, its loop control is more complex and it
     * may actually run slower on some machines.
     *
     * Arguments:
     *
     * Inputs:
     *   c_rarg0   - int array elements a
     *   c_rarg1   - int array elements n (the modulus)
     *   c_rarg2   - int length
     *   c_rarg3   - int inv
     *   c_rarg4   - int array elements m (the result)
     *
     */
    address generate_square() {
      Label argh;
      bind(argh);
      stop("MontgomeryMultiply total_allocation must be <= 8192");

      align(CodeEntryAlignment);
      address entry = pc();

      enter();

      // Make room.
      cmpw(Rlen, 512);
      br(Assembler::HI, argh);
      sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
      andr(sp, Ra, -2 * wordSize);

      lsrw(Rlen, Rlen, 1);  // length in longwords = len/2

      {
        // Copy input args, reversing as we go. We use Ra as a
        // temporary variable.
        reverse(Ra, Pa_base, Rlen, t0, t1);
        reverse(Ra, Pn_base, Rlen, t0, t1);
      }

      // Push all call-saved registers and also Pm_base which we'll need
      // at the end.
      save_regs();

      mov(Pm_base, Ra);

      mov(t0, zr);
      mov(t1, zr);
      mov(t2, zr);

      block_comment("for (int i = 0; i < len; i++) {");
      mov(Ri, zr); {
        Label loop, end;
        bind(loop);
        cmp(Ri, Rlen);
        br(Assembler::GE, end);

        pre1(Ri);

        block_comment("for (j = (i+1)/2; j; j--) {"); {
          add(Rj, Ri, 1);
          lsr(Rj, Rj, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
        } block_comment(" } // j");

        last_squaring(Ri);

        block_comment(" for (j = i/2; j; j--) {"); {
          lsr(Rj, Ri, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
        } block_comment(" } // j");

        post1_squaring();
        add(Ri, Ri, 1);
        cmp(Ri, Rlen);
        br(Assembler::LT, loop);

        bind(end);
        block_comment("} // i");
      }

      block_comment("for (int i = len; i < 2*len; i++) {");
      mov(Ri, Rlen); {
        Label loop, end;
        bind(loop);
        cmp(Ri, Rlen, Assembler::LSL, 1);
        br(Assembler::GE, end);

        pre2(Ri, Rlen);

        block_comment(" for (j = (2*len-i-1)/2; j; j--) {"); {
          lsl(Rj, Rlen, 1);
          sub(Rj, Rj, Ri);
          sub(Rj, Rj, 1);
          lsr(Rj, Rj, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
        } block_comment(" } // j");

        last_squaring(Ri);

        block_comment(" for (j = (2*len-i)/2; j; j--) {"); {
          lsl(Rj, Rlen, 1);
          sub(Rj, Rj, Ri);
          lsr(Rj, Rj, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
        } block_comment(" } // j");

        post2(Ri, Rlen);
        add(Ri, Ri, 1);
        cmp(Ri, Rlen, Assembler::LSL, 1);

        br(Assembler::LT, loop);
        bind(end);
        block_comment("} // i");
      }

      normalize(Rlen);

      mov(Ra, Pm_base);  // Save Pm_base in Ra
      restore_regs();    // Restore caller's Pm_base

      // Copy our result into caller's Pm_base
      reverse(Pm_base, Ra, Rlen, t0, t1);

      leave();
      ret(lr);

      return entry;
    }
    // In C, approximately:

    // void
    // montgomery_square(julong Pa_base[], julong Pn_base[],
    //                   julong Pm_base[], julong inv, int len) {
    //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
    //   julong *Pa, *Pb, *Pn, *Pm;
    //   julong Ra, Rb, Rn, Rm;

    //   int i;

    //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");

    //   for (i = 0; i < len; i++) {
    //     int j;

    //     Pa = Pa_base;
    //     Pb = Pa_base + i;
    //     Pm = Pm_base;
    //     Pn = Pn_base + i;

    //     Ra = *Pa;
    //     Rb = *Pb;
    //     Rm = *Pm;
    //     Rn = *Pn;

    //     int iters = (i+1)/2;
    //     for (j = 0; iters--; j++) {
    //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
    //       MACC2(Ra, Rb, t0, t1, t2);
    //       Ra = *++Pa;
    //       Rb = *--Pb;
    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
    //       MACC(Rm, Rn, t0, t1, t2);
    //       Rm = *++Pm;
    //       Rn = *--Pn;
    //     }
    //     if ((i & 1) == 0) {
    //       assert(Ra == Pa_base[j], "must be");
    //       MACC(Ra, Ra, t0, t1, t2);
    //     }
    //     iters = i/2;
    //     assert(iters == i-j, "must be");
    //     for (; iters--; j++) {
    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
    //       MACC(Rm, Rn, t0, t1, t2);
    //       Rm = *++Pm;
    //       Rn = *--Pn;
    //     }

    //     *Pm = Rm = t0 * inv;
    //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
    //     MACC(Rm, Rn, t0, t1, t2);

    //     assert(t0 == 0, "broken Montgomery multiply");

    //     t0 = t1; t1 = t2; t2 = 0;
    //   }

    //   for (i = len; i < 2*len; i++) {
    //     int start = i-len+1;
    //     int end = start + (len - start)/2;
    //     int j;

    //     Pa = Pa_base + i-len;
    //     Pb = Pa_base + len;
    //     Pm = Pm_base + i-len;
    //     Pn = Pn_base + len;

    //     Ra = *++Pa;
    //     Rb = *--Pb;
    //     Rm = *++Pm;
    //     Rn = *--Pn;

    //     int iters = (2*len-i-1)/2;
    //     assert(iters == end-start, "must be");
    //     for (j = start; iters--; j++) {
    //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
    //       MACC2(Ra, Rb, t0, t1, t2);
    //       Ra = *++Pa;
    //       Rb = *--Pb;
    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
    //       MACC(Rm, Rn, t0, t1, t2);
    //       Rm = *++Pm;
    //       Rn = *--Pn;
    //     }
    //     if ((i & 1) == 0) {
    //       assert(Ra == Pa_base[j], "must be");
    //       MACC(Ra, Ra, t0, t1, t2);
    //     }
    //     iters = (2*len-i)/2;
    //     assert(iters == len-j, "must be");
    //     for (; iters--; j++) {
    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
    //       MACC(Rm, Rn, t0, t1, t2);
    //       Rm = *++Pm;
    //       Rn = *--Pn;
    //     }
    //     Pm_base[i-len] = t0;
    //     t0 = t1; t1 = t2; t2 = 0;
    //   }

    //   while (t0)
    //     t0 = sub(Pm_base, Pn_base, t0, len);
    // }
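    // Editorial note (not part of the original sources): sub() in the two
    // pseudocode listings above is assumed to be the final reduction step:
    // it subtracts the modulus n from the (len+1)-word value formed by the
    // carry word t0 on top of Pm, propagating the borrow, and returns the
    // updated top word, so the while loop keeps subtracting n until the
    // carry word is clear. A rough sketch of the assumed semantics:

    // static julong sub(julong Pm_base[], julong Pn_base[], julong t, int len) {
    //   unsigned __int128 borrow = 0;
    //   for (int i = 0; i < len; i++) {
    //     unsigned __int128 d = (unsigned __int128)Pm_base[i] - Pn_base[i] - (julong)borrow;
    //     Pm_base[i] = (julong)d;
    //     borrow = (d >> 64) & 1;   // 1 if the word subtraction wrapped
    //   }
    //   return t - (julong)borrow;  // fold the final borrow into the top word
    // }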
  };

  // Initialization
  void generate_preuniverse_stubs() {
    // preuniverse stubs are not needed for aarch64
  }

  void generate_initial_stubs() {
    // Generate initial stubs and initialize the entry points

    // entry points that exist in all platforms. Note: This is code
    // that could be shared among different platforms - however the
    // benefit seems to be smaller than the disadvantage of having a
    // much more complicated generator structure. See also comment in
    // stubRoutines.hpp.

    StubRoutines::_forward_exception_entry = generate_forward_exception();

    StubRoutines::_call_stub_entry =
      generate_call_stub(StubRoutines::_call_stub_return_address);

    // is referenced by megamorphic call
    StubRoutines::_catch_exception_entry = generate_catch_exception();

    // Initialize table for copy memory (arraycopy) check.
    if (UnsafeMemoryAccess::_table == nullptr) {
      UnsafeMemoryAccess::create_table(8 + 4); // 8 for copyMemory; 4 for setMemory
    }

    if (UseCRC32Intrinsics) {
      // set table address before generating the stub which uses it
      StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
      StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
    }

    if (UseCRC32CIntrinsics) {
      StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
    }

    if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
      StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
    }

    if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
      StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
    }

    if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_float16ToFloat) &&
        vmIntrinsics::is_intrinsic_available(vmIntrinsics::_floatToFloat16)) {
      StubRoutines::_hf2f = generate_float16ToFloat();
      StubRoutines::_f2hf = generate_floatToFloat16();
    }
  }

  void generate_continuation_stubs() {
    // Continuation stubs:
    StubRoutines::_cont_thaw = generate_cont_thaw();
    StubRoutines::_cont_returnBarrier = generate_cont_returnBarrier();
    StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception();
    StubRoutines::_cont_preempt_stub = generate_cont_preempt_stub();
  }

  void generate_final_stubs() {
    // support for verify_oop (must happen after universe_init)
    if (VerifyOops) {
      StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
    }

    // arraycopy stubs used by compilers
    generate_arraycopy_stubs();

    StubRoutines::_method_entry_barrier = generate_method_entry_barrier();

    StubRoutines::aarch64::_spin_wait = generate_spin_wait();

    StubRoutines::_upcall_stub_exception_handler = generate_upcall_stub_exception_handler();
    StubRoutines::_upcall_stub_load_target = generate_upcall_stub_load_target();

#if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)

    generate_atomic_entry_points();

#endif // LINUX

#ifdef COMPILER2
    if (UseSecondarySupersTable) {
      StubRoutines::_lookup_secondary_supers_table_slow_path_stub = generate_lookup_secondary_supers_table_slow_path_stub();
      if (! InlineSecondarySupersTest) {
        generate_lookup_secondary_supers_table_stub();
      }
    }
#endif

    StubRoutines::_unsafe_setmemory = generate_unsafe_setmemory();

    StubRoutines::aarch64::set_completed(); // Indicate that arraycopy and zero_blocks stubs are generated
  }

  void generate_compiler_stubs() {
#if COMPILER2_OR_JVMCI

    if (UseSVE == 0) {
      StubRoutines::aarch64::_vector_iota_indices = generate_iota_indices(StubGenStubId::vector_iota_indices_id);
    }

    // array equals stub for large arrays.
    if (!UseSimpleArrayEquals) {
      StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
    }

    // arrays_hashcode stub for large arrays.
    StubRoutines::aarch64::_large_arrays_hashcode_boolean = generate_large_arrays_hashcode(T_BOOLEAN);
    StubRoutines::aarch64::_large_arrays_hashcode_byte = generate_large_arrays_hashcode(T_BYTE);
    StubRoutines::aarch64::_large_arrays_hashcode_char = generate_large_arrays_hashcode(T_CHAR);
    StubRoutines::aarch64::_large_arrays_hashcode_int = generate_large_arrays_hashcode(T_INT);
    StubRoutines::aarch64::_large_arrays_hashcode_short = generate_large_arrays_hashcode(T_SHORT);

    // byte_array_inflate stub for large arrays.
    StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();

    // countPositives stub for large arrays.
    StubRoutines::aarch64::_count_positives = generate_count_positives(StubRoutines::aarch64::_count_positives_long);

    generate_compare_long_strings();

    generate_string_indexof_stubs();

#ifdef COMPILER2
    if (UseMultiplyToLenIntrinsic) {
      StubRoutines::_multiplyToLen = generate_multiplyToLen();
    }

    if (UseSquareToLenIntrinsic) {
      StubRoutines::_squareToLen = generate_squareToLen();
    }

    if (UseMulAddIntrinsic) {
      StubRoutines::_mulAdd = generate_mulAdd();
    }

    if (UseSIMDForBigIntegerShiftIntrinsics) {
      StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
      StubRoutines::_bigIntegerLeftShiftWorker = generate_bigIntegerLeftShift();
    }

    if (UseMontgomeryMultiplyIntrinsic) {
      StubGenStubId stub_id = StubGenStubId::montgomeryMultiply_id;
      StubCodeMark mark(this, stub_id);
      MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
      StubRoutines::_montgomeryMultiply = g.generate_multiply();
    }

    if (UseMontgomerySquareIntrinsic) {
      StubGenStubId stub_id = StubGenStubId::montgomerySquare_id;
      StubCodeMark mark(this, stub_id);
      MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
      // We use generate_multiply() rather than generate_square()
      // because it's faster for the sizes of modulus we care about.
      StubRoutines::_montgomerySquare = g.generate_multiply();
    }

#endif // COMPILER2

    if (UseChaCha20Intrinsics) {
      StubRoutines::_chacha20Block = generate_chacha20Block_blockpar();
    }

    if (UseKyberIntrinsics) {
      StubRoutines::_kyberNtt = generate_kyberNtt();
      StubRoutines::_kyberInverseNtt = generate_kyberInverseNtt();
      StubRoutines::_kyberNttMult = generate_kyberNttMult();
      StubRoutines::_kyberAddPoly_2 = generate_kyberAddPoly_2();
      StubRoutines::_kyberAddPoly_3 = generate_kyberAddPoly_3();
      StubRoutines::_kyber12To16 = generate_kyber12To16();
      StubRoutines::_kyberBarrettReduce = generate_kyberBarrettReduce();
    }

    if (UseDilithiumIntrinsics) {
      StubRoutines::_dilithiumAlmostNtt = generate_dilithiumAlmostNtt();
      StubRoutines::_dilithiumAlmostInverseNtt = generate_dilithiumAlmostInverseNtt();
      StubRoutines::_dilithiumNttMult = generate_dilithiumNttMult();
      StubRoutines::_dilithiumMontMulByConstant = generate_dilithiumMontMulByConstant();
      StubRoutines::_dilithiumDecomposePoly = generate_dilithiumDecomposePoly();
    }

    if (UseBASE64Intrinsics) {
      StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
      StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
    }

    // data cache line writeback
    StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
    StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();

    if (UseAESIntrinsics) {
      StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
      StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
      StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
      StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
      StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt();
    }
    if (UseGHASHIntrinsics) {
      // StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
      StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks_wide();
    }
    if (UseAESIntrinsics && UseGHASHIntrinsics) {
      StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt();
    }

    if (UseMD5Intrinsics) {
      StubRoutines::_md5_implCompress = generate_md5_implCompress(StubGenStubId::md5_implCompress_id);
      StubRoutines::_md5_implCompressMB = generate_md5_implCompress(StubGenStubId::md5_implCompressMB_id);
    }
    if (UseSHA1Intrinsics) {
      StubRoutines::_sha1_implCompress = generate_sha1_implCompress(StubGenStubId::sha1_implCompress_id);
      StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(StubGenStubId::sha1_implCompressMB_id);
    }
    if (UseSHA256Intrinsics) {
      StubRoutines::_sha256_implCompress = generate_sha256_implCompress(StubGenStubId::sha256_implCompress_id);
      StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(StubGenStubId::sha256_implCompressMB_id);
    }
    if (UseSHA512Intrinsics) {
      StubRoutines::_sha512_implCompress = generate_sha512_implCompress(StubGenStubId::sha512_implCompress_id);
      StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(StubGenStubId::sha512_implCompressMB_id);
    }
    if (UseSHA3Intrinsics) {

      StubRoutines::_double_keccak = generate_double_keccak();
      if (UseSIMDForSHA3Intrinsic) {
        StubRoutines::_sha3_implCompress = generate_sha3_implCompress(StubGenStubId::sha3_implCompress_id);
        StubRoutines::_sha3_implCompressMB = generate_sha3_implCompress(StubGenStubId::sha3_implCompressMB_id);
      } else {
        StubRoutines::_sha3_implCompress = generate_sha3_implCompress_gpr(StubGenStubId::sha3_implCompress_id);
        StubRoutines::_sha3_implCompressMB = generate_sha3_implCompress_gpr(StubGenStubId::sha3_implCompressMB_id);
      }
    }

    if (UsePoly1305Intrinsics) {
      StubRoutines::_poly1305_processBlocks = generate_poly1305_processBlocks();
    }

    // generate Adler32 intrinsics code
    if (UseAdler32Intrinsics) {
      StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
    }

#endif // COMPILER2_OR_JVMCI
  }

 public:
  StubGenerator(CodeBuffer* code, StubGenBlobId blob_id) : StubCodeGenerator(code, blob_id) {
    switch(blob_id) {
    case preuniverse_id:
      generate_preuniverse_stubs();
      break;
    case initial_id:
      generate_initial_stubs();
      break;
    case continuation_id:
      generate_continuation_stubs();
      break;
    case compiler_id:
      generate_compiler_stubs();
      break;
    case final_id:
      generate_final_stubs();
      break;
    default:
      fatal("unexpected blob id: %d", blob_id);
      break;
    };
  }
}; // end class declaration

void StubGenerator_generate(CodeBuffer* code, StubGenBlobId blob_id) {
  StubGenerator g(code, blob_id);
}


#if defined (LINUX)

// Define pointers to atomic stubs and initialize them to point to the
// code in atomic_aarch64.S.

#define DEFAULT_ATOMIC_OP(OPNAME, SIZE, RELAXED)                                          \
  extern "C" uint64_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl  \
    (volatile void *ptr, uint64_t arg1, uint64_t arg2);                                   \
  aarch64_atomic_stub_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _impl        \
    = aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl;

DEFAULT_ATOMIC_OP(fetch_add, 4, )
DEFAULT_ATOMIC_OP(fetch_add, 8, )
DEFAULT_ATOMIC_OP(fetch_add, 4, _relaxed)
DEFAULT_ATOMIC_OP(fetch_add, 8, _relaxed)
DEFAULT_ATOMIC_OP(xchg, 4, )
DEFAULT_ATOMIC_OP(xchg, 8, )
DEFAULT_ATOMIC_OP(cmpxchg, 1, )
DEFAULT_ATOMIC_OP(cmpxchg, 4, )
DEFAULT_ATOMIC_OP(cmpxchg, 8, )
DEFAULT_ATOMIC_OP(cmpxchg, 1, _relaxed)
DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed)
DEFAULT_ATOMIC_OP(cmpxchg, 8, _relaxed)
DEFAULT_ATOMIC_OP(cmpxchg, 4, _release)
DEFAULT_ATOMIC_OP(cmpxchg, 8, _release)
DEFAULT_ATOMIC_OP(cmpxchg, 4, _seq_cst)
DEFAULT_ATOMIC_OP(cmpxchg, 8, _seq_cst)

#undef DEFAULT_ATOMIC_OP

#endif // LINUX
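// Editorial note (not part of the original sources): for reference, a single
// expansion of the DEFAULT_ATOMIC_OP macro above, e.g.
// DEFAULT_ATOMIC_OP(fetch_add, 4, ), produces the declaration
//
//   extern "C" uint64_t aarch64_atomic_fetch_add_4_default_impl
//     (volatile void *ptr, uint64_t arg1, uint64_t arg2);
//
// and the function-pointer definition
//
//   aarch64_atomic_stub_t aarch64_atomic_fetch_add_4_impl
//     = aarch64_atomic_fetch_add_4_default_impl;
//
// Each pointer starts out referring to the fallback code in atomic_aarch64.S
// and is presumably repointed at generated stub code by
// generate_atomic_entry_points() when that path is taken.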