1 /* 2 * Copyright (c) 2003, 2025, Oracle and/or its affiliates. All rights reserved. 3 * Copyright (c) 2014, 2025, Red Hat Inc. All rights reserved. 4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 5 * 6 * This code is free software; you can redistribute it and/or modify it 7 * under the terms of the GNU General Public License version 2 only, as 8 * published by the Free Software Foundation. 9 * 10 * This code is distributed in the hope that it will be useful, but WITHOUT 11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 13 * version 2 for more details (a copy is included in the LICENSE file that 14 * accompanied this code). 15 * 16 * You should have received a copy of the GNU General Public License version 17 * 2 along with this work; if not, write to the Free Software Foundation, 18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 19 * 20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 21 * or visit www.oracle.com if you need additional information or have any 22 * questions. 23 * 24 */ 25 26 #include "asm/macroAssembler.hpp" 27 #include "asm/macroAssembler.inline.hpp" 28 #include "asm/register.hpp" 29 #include "atomic_aarch64.hpp" 30 #include "code/aotCodeCache.hpp" 31 #include "compiler/oopMap.hpp" 32 #include "gc/shared/barrierSet.hpp" 33 #include "gc/shared/barrierSetAssembler.hpp" 34 #include "gc/shared/gc_globals.hpp" 35 #include "gc/shared/tlab_globals.hpp" 36 #include "interpreter/interpreter.hpp" 37 #include "memory/universe.hpp" 38 #include "nativeInst_aarch64.hpp" 39 #include "oops/instanceOop.hpp" 40 #include "oops/method.hpp" 41 #include "oops/objArrayKlass.hpp" 42 #include "oops/oop.inline.hpp" 43 #include "prims/methodHandles.hpp" 44 #include "prims/upcallLinker.hpp" 45 #include "runtime/arguments.hpp" 46 #include "runtime/atomic.hpp" 47 #include "runtime/continuation.hpp" 48 #include "runtime/continuationEntry.inline.hpp" 49 #include "runtime/frame.inline.hpp" 50 #include "runtime/handles.inline.hpp" 51 #include "runtime/javaThread.hpp" 52 #include "runtime/sharedRuntime.hpp" 53 #include "runtime/stubCodeGenerator.hpp" 54 #include "runtime/stubRoutines.hpp" 55 #include "utilities/align.hpp" 56 #include "utilities/checkedCast.hpp" 57 #include "utilities/debug.hpp" 58 #include "utilities/globalDefinitions.hpp" 59 #include "utilities/intpow.hpp" 60 #include "utilities/powerOfTwo.hpp" 61 #ifdef COMPILER2 62 #include "opto/runtime.hpp" 63 #endif 64 #if INCLUDE_ZGC 65 #include "gc/z/zThreadLocalData.hpp" 66 #endif 67 68 // Declaration and definition of StubGenerator (no .hpp file). 
69 // For a more detailed description of the stub routine structure 70 // see the comment in stubRoutines.hpp 71 72 #undef __ 73 #define __ _masm-> 74 75 #ifdef PRODUCT 76 #define BLOCK_COMMENT(str) /* nothing */ 77 #else 78 #define BLOCK_COMMENT(str) __ block_comment(str) 79 #endif 80 81 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":") 82 83 // Stub Code definitions 84 85 class StubGenerator: public StubCodeGenerator { 86 private: 87 88 #ifdef PRODUCT 89 #define inc_counter_np(counter) ((void)0) 90 #else 91 void inc_counter_np_(uint& counter) { 92 __ incrementw(ExternalAddress((address)&counter)); 93 } 94 #define inc_counter_np(counter) \ 95 BLOCK_COMMENT("inc_counter " #counter); \ 96 inc_counter_np_(counter); 97 #endif 98 99 // Call stubs are used to call Java from C 100 // 101 // Arguments: 102 // c_rarg0: call wrapper address address 103 // c_rarg1: result address 104 // c_rarg2: result type BasicType 105 // c_rarg3: method Method* 106 // c_rarg4: (interpreter) entry point address 107 // c_rarg5: parameters intptr_t* 108 // c_rarg6: parameter size (in words) int 109 // c_rarg7: thread Thread* 110 // 111 // There is no return from the stub itself as any Java result 112 // is written to result 113 // 114 // we save r30 (lr) as the return PC at the base of the frame and 115 // link r29 (fp) below it as the frame pointer installing sp (r31) 116 // into fp. 117 // 118 // we save r0-r7, which accounts for all the c arguments. 119 // 120 // TODO: strictly do we need to save them all? they are treated as 121 // volatile by C so could we omit saving the ones we are going to 122 // place in global registers (thread? method?) or those we only use 123 // during setup of the Java call? 124 // 125 // we don't need to save r8 which C uses as an indirect result location 126 // return register. 127 // 128 // we don't need to save r9-r15 which both C and Java treat as 129 // volatile 130 // 131 // we don't need to save r16-18 because Java does not use them 132 // 133 // we save r19-r28 which Java uses as scratch registers and C 134 // expects to be callee-save 135 // 136 // we save the bottom 64 bits of each value stored in v8-v15; it is 137 // the responsibility of the caller to preserve larger values. 138 // 139 // so the stub frame looks like this when we enter Java code 140 // 141 // [ return_from_Java ] <--- sp 142 // [ argument word n ] 143 // ... 
144 // -29 [ argument word 1 ] 145 // -28 [ saved Floating-point Control Register ] 146 // -26 [ saved v15 ] <--- sp_after_call 147 // -25 [ saved v14 ] 148 // -24 [ saved v13 ] 149 // -23 [ saved v12 ] 150 // -22 [ saved v11 ] 151 // -21 [ saved v10 ] 152 // -20 [ saved v9 ] 153 // -19 [ saved v8 ] 154 // -18 [ saved r28 ] 155 // -17 [ saved r27 ] 156 // -16 [ saved r26 ] 157 // -15 [ saved r25 ] 158 // -14 [ saved r24 ] 159 // -13 [ saved r23 ] 160 // -12 [ saved r22 ] 161 // -11 [ saved r21 ] 162 // -10 [ saved r20 ] 163 // -9 [ saved r19 ] 164 // -8 [ call wrapper (r0) ] 165 // -7 [ result (r1) ] 166 // -6 [ result type (r2) ] 167 // -5 [ method (r3) ] 168 // -4 [ entry point (r4) ] 169 // -3 [ parameters (r5) ] 170 // -2 [ parameter size (r6) ] 171 // -1 [ thread (r7) ] 172 // 0 [ saved fp (r29) ] <--- fp == saved sp (r31) 173 // 1 [ saved lr (r30) ] 174 175 // Call stub stack layout word offsets from fp 176 enum call_stub_layout { 177 sp_after_call_off = -28, 178 179 fpcr_off = sp_after_call_off, 180 d15_off = -26, 181 d13_off = -24, 182 d11_off = -22, 183 d9_off = -20, 184 185 r28_off = -18, 186 r26_off = -16, 187 r24_off = -14, 188 r22_off = -12, 189 r20_off = -10, 190 call_wrapper_off = -8, 191 result_off = -7, 192 result_type_off = -6, 193 method_off = -5, 194 entry_point_off = -4, 195 parameter_size_off = -2, 196 thread_off = -1, 197 fp_f = 0, 198 retaddr_off = 1, 199 }; 200 201 address generate_call_stub(address& return_address) { 202 assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 && 203 (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off, 204 "adjust this code"); 205 206 StubId stub_id = StubId::stubgen_call_stub_id; 207 StubCodeMark mark(this, stub_id); 208 address start = __ pc(); 209 210 const Address sp_after_call (rfp, sp_after_call_off * wordSize); 211 212 const Address fpcr_save (rfp, fpcr_off * wordSize); 213 const Address call_wrapper (rfp, call_wrapper_off * wordSize); 214 const Address result (rfp, result_off * wordSize); 215 const Address result_type (rfp, result_type_off * wordSize); 216 const Address method (rfp, method_off * wordSize); 217 const Address entry_point (rfp, entry_point_off * wordSize); 218 const Address parameter_size(rfp, parameter_size_off * wordSize); 219 220 const Address thread (rfp, thread_off * wordSize); 221 222 const Address d15_save (rfp, d15_off * wordSize); 223 const Address d13_save (rfp, d13_off * wordSize); 224 const Address d11_save (rfp, d11_off * wordSize); 225 const Address d9_save (rfp, d9_off * wordSize); 226 227 const Address r28_save (rfp, r28_off * wordSize); 228 const Address r26_save (rfp, r26_off * wordSize); 229 const Address r24_save (rfp, r24_off * wordSize); 230 const Address r22_save (rfp, r22_off * wordSize); 231 const Address r20_save (rfp, r20_off * wordSize); 232 233 // stub code 234 235 address aarch64_entry = __ pc(); 236 237 // set up frame and move sp to end of save area 238 __ enter(); 239 __ sub(sp, rfp, -sp_after_call_off * wordSize); 240 241 // save register parameters and Java scratch/global registers 242 // n.b. 
    // we save thread even though it gets installed in
    // rthread because we want to sanity check rthread later
    __ str(c_rarg7,  thread);
    __ strw(c_rarg6, parameter_size);
    __ stp(c_rarg4, c_rarg5, entry_point);
    __ stp(c_rarg2, c_rarg3, result_type);
    __ stp(c_rarg0, c_rarg1, call_wrapper);

    __ stp(r20, r19, r20_save);
    __ stp(r22, r21, r22_save);
    __ stp(r24, r23, r24_save);
    __ stp(r26, r25, r26_save);
    __ stp(r28, r27, r28_save);

    __ stpd(v9,  v8,  d9_save);
    __ stpd(v11, v10, d11_save);
    __ stpd(v13, v12, d13_save);
    __ stpd(v15, v14, d15_save);

    __ get_fpcr(rscratch1);
    __ str(rscratch1, fpcr_save);
    // Set FPCR to the state we need. We do want Round to Nearest. We
    // don't want non-IEEE rounding modes or floating-point traps.
    __ bfi(rscratch1, zr, 22, 4); // Clear DN, FZ, and Rmode
    __ bfi(rscratch1, zr, 8, 5);  // Clear exception-control bits (8-12)
    __ set_fpcr(rscratch1);

    // install Java thread in global register now we have saved
    // whatever value it held
    __ mov(rthread, c_rarg7);
    // And method
    __ mov(rmethod, c_rarg3);

    // set up the heapbase register
    __ reinit_heapbase();

#ifdef ASSERT
    // make sure we have no pending exceptions
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
      __ cmp(rscratch1, (u1)NULL_WORD);
      __ br(Assembler::EQ, L);
      __ stop("StubRoutines::call_stub: entered with pending exception");
      __ BIND(L);
    }
#endif
    // pass parameters if any
    __ mov(esp, sp);
    __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
    __ andr(sp, rscratch1, -2 * wordSize);

    BLOCK_COMMENT("pass parameters if any");
    Label parameters_done;
    // parameter count is still in c_rarg6
    // and parameter pointer identifying param 1 is in c_rarg5
    __ cbzw(c_rarg6, parameters_done);

    address loop = __ pc();
    __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
    __ subsw(c_rarg6, c_rarg6, 1);
    __ push(rscratch1);
    __ br(Assembler::GT, loop);

    __ BIND(parameters_done);

    // call Java entry -- passing Method*, and current sp
    //      rmethod: Method*
    //      r19_sender_sp: sender sp
    BLOCK_COMMENT("call Java function");
    __ mov(r19_sender_sp, sp);
    __ blr(c_rarg4);

    // we do this here because the notify will already have been done
    // if we get to the next instruction via an exception
    //
    // n.b. adding this instruction here affects the calculation of
    // whether or not a routine returns to the call stub (used when
    // doing stack walks) since the normal test is to check the return
    // pc against the address saved below. so we may need to allow for
    // this extra instruction in the check.

    // save current address for use by exception handling code

    return_address = __ pc();

    // store result depending on type (everything that is not
    // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
    // n.b.
this assumes Java returns an integral result in r0 331 // and a floating result in j_farg0 332 __ ldr(j_rarg2, result); 333 Label is_long, is_float, is_double, exit; 334 __ ldr(j_rarg1, result_type); 335 __ cmp(j_rarg1, (u1)T_OBJECT); 336 __ br(Assembler::EQ, is_long); 337 __ cmp(j_rarg1, (u1)T_LONG); 338 __ br(Assembler::EQ, is_long); 339 __ cmp(j_rarg1, (u1)T_FLOAT); 340 __ br(Assembler::EQ, is_float); 341 __ cmp(j_rarg1, (u1)T_DOUBLE); 342 __ br(Assembler::EQ, is_double); 343 344 // handle T_INT case 345 __ strw(r0, Address(j_rarg2)); 346 347 __ BIND(exit); 348 349 // pop parameters 350 __ sub(esp, rfp, -sp_after_call_off * wordSize); 351 352 #ifdef ASSERT 353 // verify that threads correspond 354 { 355 Label L, S; 356 __ ldr(rscratch1, thread); 357 __ cmp(rthread, rscratch1); 358 __ br(Assembler::NE, S); 359 __ get_thread(rscratch1); 360 __ cmp(rthread, rscratch1); 361 __ br(Assembler::EQ, L); 362 __ BIND(S); 363 __ stop("StubRoutines::call_stub: threads must correspond"); 364 __ BIND(L); 365 } 366 #endif 367 368 __ pop_cont_fastpath(rthread); 369 370 // restore callee-save registers 371 __ ldpd(v15, v14, d15_save); 372 __ ldpd(v13, v12, d13_save); 373 __ ldpd(v11, v10, d11_save); 374 __ ldpd(v9, v8, d9_save); 375 376 __ ldp(r28, r27, r28_save); 377 __ ldp(r26, r25, r26_save); 378 __ ldp(r24, r23, r24_save); 379 __ ldp(r22, r21, r22_save); 380 __ ldp(r20, r19, r20_save); 381 382 // restore fpcr 383 __ ldr(rscratch1, fpcr_save); 384 __ set_fpcr(rscratch1); 385 386 __ ldp(c_rarg0, c_rarg1, call_wrapper); 387 __ ldrw(c_rarg2, result_type); 388 __ ldr(c_rarg3, method); 389 __ ldp(c_rarg4, c_rarg5, entry_point); 390 __ ldp(c_rarg6, c_rarg7, parameter_size); 391 392 // leave frame and return to caller 393 __ leave(); 394 __ ret(lr); 395 396 // handle return types different from T_INT 397 398 __ BIND(is_long); 399 __ str(r0, Address(j_rarg2, 0)); 400 __ br(Assembler::AL, exit); 401 402 __ BIND(is_float); 403 __ strs(j_farg0, Address(j_rarg2, 0)); 404 __ br(Assembler::AL, exit); 405 406 __ BIND(is_double); 407 __ strd(j_farg0, Address(j_rarg2, 0)); 408 __ br(Assembler::AL, exit); 409 410 return start; 411 } 412 413 // Return point for a Java call if there's an exception thrown in 414 // Java code. The exception is caught and transformed into a 415 // pending exception stored in JavaThread that can be tested from 416 // within the VM. 417 // 418 // Note: Usually the parameters are removed by the callee. In case 419 // of an exception crossing an activation frame boundary, that is 420 // not the case if the callee is compiled code => need to setup the 421 // rsp. 
422 // 423 // r0: exception oop 424 425 address generate_catch_exception() { 426 StubId stub_id = StubId::stubgen_catch_exception_id; 427 StubCodeMark mark(this, stub_id); 428 address start = __ pc(); 429 430 // same as in generate_call_stub(): 431 const Address sp_after_call(rfp, sp_after_call_off * wordSize); 432 const Address thread (rfp, thread_off * wordSize); 433 434 #ifdef ASSERT 435 // verify that threads correspond 436 { 437 Label L, S; 438 __ ldr(rscratch1, thread); 439 __ cmp(rthread, rscratch1); 440 __ br(Assembler::NE, S); 441 __ get_thread(rscratch1); 442 __ cmp(rthread, rscratch1); 443 __ br(Assembler::EQ, L); 444 __ bind(S); 445 __ stop("StubRoutines::catch_exception: threads must correspond"); 446 __ bind(L); 447 } 448 #endif 449 450 // set pending exception 451 __ verify_oop(r0); 452 453 __ str(r0, Address(rthread, Thread::pending_exception_offset())); 454 __ mov(rscratch1, (address)__FILE__); 455 __ str(rscratch1, Address(rthread, Thread::exception_file_offset())); 456 __ movw(rscratch1, (int)__LINE__); 457 __ strw(rscratch1, Address(rthread, Thread::exception_line_offset())); 458 459 // complete return to VM 460 assert(StubRoutines::_call_stub_return_address != nullptr, 461 "_call_stub_return_address must have been generated before"); 462 __ b(StubRoutines::_call_stub_return_address); 463 464 return start; 465 } 466 467 // Continuation point for runtime calls returning with a pending 468 // exception. The pending exception check happened in the runtime 469 // or native call stub. The pending exception in Thread is 470 // converted into a Java-level exception. 471 // 472 // Contract with Java-level exception handlers: 473 // r0: exception 474 // r3: throwing pc 475 // 476 // NOTE: At entry of this stub, exception-pc must be in LR !! 477 478 // NOTE: this is always used as a jump target within generated code 479 // so it just needs to be generated code with no x86 prolog 480 481 address generate_forward_exception() { 482 StubId stub_id = StubId::stubgen_forward_exception_id; 483 StubCodeMark mark(this, stub_id); 484 address start = __ pc(); 485 486 // Upon entry, LR points to the return address returning into 487 // Java (interpreted or compiled) code; i.e., the return address 488 // becomes the throwing pc. 489 // 490 // Arguments pushed before the runtime call are still on the stack 491 // but the exception handler will reset the stack pointer -> 492 // ignore them. A potential result in registers can be ignored as 493 // well. 494 495 #ifdef ASSERT 496 // make sure this code is only executed if there is a pending exception 497 { 498 Label L; 499 __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset())); 500 __ cbnz(rscratch1, L); 501 __ stop("StubRoutines::forward exception: no pending exception (1)"); 502 __ bind(L); 503 } 504 #endif 505 506 // compute exception handler into r19 507 508 // call the VM to find the handler address associated with the 509 // caller address. pass thread in r0 and caller pc (ret address) 510 // in r1. n.b. the caller pc is in lr, unlike x86 where it is on 511 // the stack. 512 __ mov(c_rarg1, lr); 513 // lr will be trashed by the VM call so we move it to R19 514 // (callee-saved) because we also need to pass it to the handler 515 // returned by this call. 
516 __ mov(r19, lr); 517 BLOCK_COMMENT("call exception_handler_for_return_address"); 518 __ call_VM_leaf(CAST_FROM_FN_PTR(address, 519 SharedRuntime::exception_handler_for_return_address), 520 rthread, c_rarg1); 521 // Reinitialize the ptrue predicate register, in case the external runtime 522 // call clobbers ptrue reg, as we may return to SVE compiled code. 523 __ reinitialize_ptrue(); 524 525 // we should not really care that lr is no longer the callee 526 // address. we saved the value the handler needs in r19 so we can 527 // just copy it to r3. however, the C2 handler will push its own 528 // frame and then calls into the VM and the VM code asserts that 529 // the PC for the frame above the handler belongs to a compiled 530 // Java method. So, we restore lr here to satisfy that assert. 531 __ mov(lr, r19); 532 // setup r0 & r3 & clear pending exception 533 __ mov(r3, r19); 534 __ mov(r19, r0); 535 __ ldr(r0, Address(rthread, Thread::pending_exception_offset())); 536 __ str(zr, Address(rthread, Thread::pending_exception_offset())); 537 538 #ifdef ASSERT 539 // make sure exception is set 540 { 541 Label L; 542 __ cbnz(r0, L); 543 __ stop("StubRoutines::forward exception: no pending exception (2)"); 544 __ bind(L); 545 } 546 #endif 547 548 // continue at exception handler 549 // r0: exception 550 // r3: throwing pc 551 // r19: exception handler 552 __ verify_oop(r0); 553 __ br(r19); 554 555 return start; 556 } 557 558 // Non-destructive plausibility checks for oops 559 // 560 // Arguments: 561 // r0: oop to verify 562 // rscratch1: error message 563 // 564 // Stack after saving c_rarg3: 565 // [tos + 0]: saved c_rarg3 566 // [tos + 1]: saved c_rarg2 567 // [tos + 2]: saved lr 568 // [tos + 3]: saved rscratch2 569 // [tos + 4]: saved r0 570 // [tos + 5]: saved rscratch1 571 address generate_verify_oop() { 572 StubId stub_id = StubId::stubgen_verify_oop_id; 573 StubCodeMark mark(this, stub_id); 574 address start = __ pc(); 575 576 Label exit, error; 577 578 // save c_rarg2 and c_rarg3 579 __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16))); 580 581 // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr())); 582 __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr())); 583 __ ldr(c_rarg3, Address(c_rarg2)); 584 __ add(c_rarg3, c_rarg3, 1); 585 __ str(c_rarg3, Address(c_rarg2)); 586 587 // object is in r0 588 // make sure object is 'reasonable' 589 __ cbz(r0, exit); // if obj is null it is OK 590 591 BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler(); 592 bs_asm->check_oop(_masm, r0, c_rarg2, c_rarg3, error); 593 594 // return if everything seems ok 595 __ bind(exit); 596 597 __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16))); 598 __ ret(lr); 599 600 // handle errors 601 __ bind(error); 602 __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16))); 603 604 __ push(RegSet::range(r0, r29), sp); 605 // debug(char* msg, int64_t pc, int64_t regs[]) 606 __ mov(c_rarg0, rscratch1); // pass address of error message 607 __ mov(c_rarg1, lr); // pass return address 608 __ mov(c_rarg2, sp); // pass address of regs on stack 609 #ifndef PRODUCT 610 assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area"); 611 #endif 612 BLOCK_COMMENT("call MacroAssembler::debug"); 613 __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64)); 614 __ blr(rscratch1); 615 __ hlt(0); 616 617 return start; 618 } 619 620 // Generate indices for iota vector. 
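  //
  // The 64-bit constants emitted below are little-endian lane patterns:
  // the B rows hold bytes 0..15, the H rows hold shorts 0..7, and so on,
  // while the FP rows are IEEE-754 bit patterns (0x3F800000 is 1.0f,
  // 0x3FF0000000000000 is 1.0d). As a hedged illustration only (not part
  // of the stub, not compiled here), a byte-lane constant could be built
  // like this:
  //
  //   uint64_t iota_bytes(int lo) {          // pack lo .. lo+7 into byte lanes
  //     uint64_t v = 0;
  //     for (int i = 7; i >= 0; i--) {
  //       v = (v << 8) | (uint8_t)(lo + i);
  //     }
  //     return v;                            // iota_bytes(0) == 0x0706050403020100
  //   }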
621 address generate_iota_indices(StubId stub_id) { 622 __ align(CodeEntryAlignment); 623 StubCodeMark mark(this, stub_id); 624 address start = __ pc(); 625 // B 626 __ emit_data64(0x0706050403020100, relocInfo::none); 627 __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none); 628 // H 629 __ emit_data64(0x0003000200010000, relocInfo::none); 630 __ emit_data64(0x0007000600050004, relocInfo::none); 631 // S 632 __ emit_data64(0x0000000100000000, relocInfo::none); 633 __ emit_data64(0x0000000300000002, relocInfo::none); 634 // D 635 __ emit_data64(0x0000000000000000, relocInfo::none); 636 __ emit_data64(0x0000000000000001, relocInfo::none); 637 // S - FP 638 __ emit_data64(0x3F80000000000000, relocInfo::none); // 0.0f, 1.0f 639 __ emit_data64(0x4040000040000000, relocInfo::none); // 2.0f, 3.0f 640 // D - FP 641 __ emit_data64(0x0000000000000000, relocInfo::none); // 0.0d 642 __ emit_data64(0x3FF0000000000000, relocInfo::none); // 1.0d 643 return start; 644 } 645 646 // The inner part of zero_words(). This is the bulk operation, 647 // zeroing words in blocks, possibly using DC ZVA to do it. The 648 // caller is responsible for zeroing the last few words. 649 // 650 // Inputs: 651 // r10: the HeapWord-aligned base address of an array to zero. 652 // r11: the count in HeapWords, r11 > 0. 653 // 654 // Returns r10 and r11, adjusted for the caller to clear. 655 // r10: the base address of the tail of words left to clear. 656 // r11: the number of words in the tail. 657 // r11 < MacroAssembler::zero_words_block_size. 658 659 address generate_zero_blocks() { 660 Label done; 661 Label base_aligned; 662 663 Register base = r10, cnt = r11; 664 665 __ align(CodeEntryAlignment); 666 StubId stub_id = StubId::stubgen_zero_blocks_id; 667 StubCodeMark mark(this, stub_id); 668 address start = __ pc(); 669 670 if (UseBlockZeroing) { 671 int zva_length = VM_Version::zva_length(); 672 673 // Ensure ZVA length can be divided by 16. This is required by 674 // the subsequent operations. 675 assert (zva_length % 16 == 0, "Unexpected ZVA Length"); 676 677 __ tbz(base, 3, base_aligned); 678 __ str(zr, Address(__ post(base, 8))); 679 __ sub(cnt, cnt, 1); 680 __ bind(base_aligned); 681 682 // Ensure count >= zva_length * 2 so that it still deserves a zva after 683 // alignment. 684 Label small; 685 int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit); 686 __ subs(rscratch1, cnt, low_limit >> 3); 687 __ br(Assembler::LT, small); 688 __ zero_dcache_blocks(base, cnt); 689 __ bind(small); 690 } 691 692 { 693 // Number of stp instructions we'll unroll 694 const int unroll = 695 MacroAssembler::zero_words_block_size / 2; 696 // Clear the remaining blocks. 697 Label loop; 698 __ subs(cnt, cnt, unroll * 2); 699 __ br(Assembler::LT, done); 700 __ bind(loop); 701 for (int i = 0; i < unroll; i++) 702 __ stp(zr, zr, __ post(base, 16)); 703 __ subs(cnt, cnt, unroll * 2); 704 __ br(Assembler::GE, loop); 705 __ bind(done); 706 __ add(cnt, cnt, unroll * 2); 707 } 708 709 __ ret(lr); 710 711 return start; 712 } 713 714 715 typedef enum { 716 copy_forwards = 1, 717 copy_backwards = -1 718 } copy_direction; 719 720 // Helper object to reduce noise when telling the GC barriers how to perform loads and stores 721 // for arraycopy stubs. 
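  //
  // Roughly, instead of spelling out the full copy_load_at/copy_store_at
  // argument lists at every call site, the copy loops below construct one
  // helper and use it like this (illustrative sketch of the call pattern
  // that appears later in this file):
  //
  //   ArrayCopyBarrierSetHelper bs(_masm, decorators, type,
  //                                gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);
  //   bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));   // barrier-aware 16-byte load
  //   bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);  // barrier-aware 16-byte store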
722 class ArrayCopyBarrierSetHelper : StackObj { 723 BarrierSetAssembler* _bs_asm; 724 MacroAssembler* _masm; 725 DecoratorSet _decorators; 726 BasicType _type; 727 Register _gct1; 728 Register _gct2; 729 Register _gct3; 730 FloatRegister _gcvt1; 731 FloatRegister _gcvt2; 732 FloatRegister _gcvt3; 733 734 public: 735 ArrayCopyBarrierSetHelper(MacroAssembler* masm, 736 DecoratorSet decorators, 737 BasicType type, 738 Register gct1, 739 Register gct2, 740 Register gct3, 741 FloatRegister gcvt1, 742 FloatRegister gcvt2, 743 FloatRegister gcvt3) 744 : _bs_asm(BarrierSet::barrier_set()->barrier_set_assembler()), 745 _masm(masm), 746 _decorators(decorators), 747 _type(type), 748 _gct1(gct1), 749 _gct2(gct2), 750 _gct3(gct3), 751 _gcvt1(gcvt1), 752 _gcvt2(gcvt2), 753 _gcvt3(gcvt3) { 754 } 755 756 void copy_load_at_32(FloatRegister dst1, FloatRegister dst2, Address src) { 757 _bs_asm->copy_load_at(_masm, _decorators, _type, 32, 758 dst1, dst2, src, 759 _gct1, _gct2, _gcvt1); 760 } 761 762 void copy_store_at_32(Address dst, FloatRegister src1, FloatRegister src2) { 763 _bs_asm->copy_store_at(_masm, _decorators, _type, 32, 764 dst, src1, src2, 765 _gct1, _gct2, _gct3, _gcvt1, _gcvt2, _gcvt3); 766 } 767 768 void copy_load_at_16(Register dst1, Register dst2, Address src) { 769 _bs_asm->copy_load_at(_masm, _decorators, _type, 16, 770 dst1, dst2, src, 771 _gct1); 772 } 773 774 void copy_store_at_16(Address dst, Register src1, Register src2) { 775 _bs_asm->copy_store_at(_masm, _decorators, _type, 16, 776 dst, src1, src2, 777 _gct1, _gct2, _gct3); 778 } 779 780 void copy_load_at_8(Register dst, Address src) { 781 _bs_asm->copy_load_at(_masm, _decorators, _type, 8, 782 dst, noreg, src, 783 _gct1); 784 } 785 786 void copy_store_at_8(Address dst, Register src) { 787 _bs_asm->copy_store_at(_masm, _decorators, _type, 8, 788 dst, src, noreg, 789 _gct1, _gct2, _gct3); 790 } 791 }; 792 793 // Bulk copy of blocks of 8 words. 794 // 795 // count is a count of words. 796 // 797 // Precondition: count >= 8 798 // 799 // Postconditions: 800 // 801 // The least significant bit of count contains the remaining count 802 // of words to copy. The rest of count is trash. 803 // 804 // s and d are adjusted to point to the remaining words to copy 805 // 806 void generate_copy_longs(StubId stub_id, DecoratorSet decorators, Label &start, Register s, Register d, Register count) { 807 BasicType type; 808 copy_direction direction; 809 810 switch (stub_id) { 811 case StubId::stubgen_copy_byte_f_id: 812 direction = copy_forwards; 813 type = T_BYTE; 814 break; 815 case StubId::stubgen_copy_byte_b_id: 816 direction = copy_backwards; 817 type = T_BYTE; 818 break; 819 case StubId::stubgen_copy_oop_f_id: 820 direction = copy_forwards; 821 type = T_OBJECT; 822 break; 823 case StubId::stubgen_copy_oop_b_id: 824 direction = copy_backwards; 825 type = T_OBJECT; 826 break; 827 case StubId::stubgen_copy_oop_uninit_f_id: 828 direction = copy_forwards; 829 type = T_OBJECT; 830 break; 831 case StubId::stubgen_copy_oop_uninit_b_id: 832 direction = copy_backwards; 833 type = T_OBJECT; 834 break; 835 default: 836 ShouldNotReachHere(); 837 } 838 839 int unit = wordSize * direction; 840 int bias = (UseSIMDForMemoryOps ? 
4:2) * wordSize;

    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
                   t4 = r7, t5 = r11, t6 = r12, t7 = r13;
    const Register stride = r14;
    const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
    const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
    ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);

    assert_different_registers(rscratch1, rscratch2, t0, t1, t2, t3, t4, t5, t6, t7);
    assert_different_registers(s, d, count, rscratch1, rscratch2);

    Label again, drain;

    __ align(CodeEntryAlignment);

    StubCodeMark mark(this, stub_id);

    __ bind(start);

    Label unaligned_copy_long;
    if (AvoidUnalignedAccesses) {
      __ tbnz(d, 3, unaligned_copy_long);
    }

    if (direction == copy_forwards) {
      __ sub(s, s, bias);
      __ sub(d, d, bias);
    }

#ifdef ASSERT
    // Make sure we are never given < 8 words
    {
      Label L;
      __ cmp(count, (u1)8);
      __ br(Assembler::GE, L);
      __ stop("generate_copy_longs called with < 8 words");
      __ bind(L);
    }
#endif

    // Fill 8 registers
    if (UseSIMDForMemoryOps) {
      bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
      bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
      bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
      bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
      bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 16);
    __ br(Assembler::LO, drain);

    int prefetch = PrefetchCopyIntervalInBytes;
    bool use_stride = false;
    if (direction == copy_backwards) {
      use_stride = prefetch > 256;
      prefetch = -prefetch;
      if (use_stride) __ mov(stride, prefetch);
    }

    __ bind(again);

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(use_stride ?
Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

    if (UseSIMDForMemoryOps) {
      bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
      bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
      bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
      bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
      bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
      bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
      bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
      bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
      bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
      bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
      bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 8);
    __ br(Assembler::HS, again);

    // Drain
    __ bind(drain);
    if (UseSIMDForMemoryOps) {
      bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
      bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
    } else {
      bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
      bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
      bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
      bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
    }

    {
      Label L1, L2;
      __ tbz(count, exact_log2(4), L1);
      if (UseSIMDForMemoryOps) {
        bs.copy_load_at_32(v0, v1, Address(__ pre(s, 4 * unit)));
        bs.copy_store_at_32(Address(__ pre(d, 4 * unit)), v0, v1);
      } else {
        bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
        bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
        bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
        bs.copy_store_at_16(Address(__ pre(d, 4 * unit)), t2, t3);
      }
      __ bind(L1);

      if (direction == copy_forwards) {
        __ add(s, s, bias);
        __ add(d, d, bias);
      }

      __ tbz(count, 1, L2);
      bs.copy_load_at_16(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
      bs.copy_store_at_16(Address(__ adjust(d, 2 * unit, direction == copy_backwards)), t0, t1);
      __ bind(L2);
    }

    __ ret(lr);

    if (AvoidUnalignedAccesses) {
      Label drain, again;
      // Register order for storing. Order is different for backward copy.

      __ bind(unaligned_copy_long);

      // source address is even aligned, target odd aligned
      //
      // when forward copying word pairs we read long pairs at offsets
      // {0, 2, 4, 6} (in long words). when backwards copying we read
      // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
      // address by -2 in the forwards case so we can compute the
      // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
      // or -1.
      //
      // when forward copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
      // zero offset we adjust the destination by -1 which means we
      // have to use offsets { 1, 2, 4, 6, 8} * unit for the stores.
      //
      // When backwards copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
      // offsets {1, 3, 5, 7, 8} * unit.
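      //
      // As a concrete, illustrative example for the forwards case, where
      // unit == wordSize == 8: after the two subtractions below, s has
      // been moved back 16 bytes and d back 8 bytes, so the first load at
      // Address(s, 2 * unit) reads the original s + 0 and the first store
      // at Address(d, 1 * unit) writes the original d + 0, i.e.
      //
      //   s' = s - 16;  d' = d - 8;
      //   load  at s' + 16  == original s + 0
      //   store at d' +  8  == original d + 0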
989 990 if (direction == copy_forwards) { 991 __ sub(s, s, 16); 992 __ sub(d, d, 8); 993 } 994 995 // Fill 8 registers 996 // 997 // for forwards copy s was offset by -16 from the original input 998 // value of s so the register contents are at these offsets 999 // relative to the 64 bit block addressed by that original input 1000 // and so on for each successive 64 byte block when s is updated 1001 // 1002 // t0 at offset 0, t1 at offset 8 1003 // t2 at offset 16, t3 at offset 24 1004 // t4 at offset 32, t5 at offset 40 1005 // t6 at offset 48, t7 at offset 56 1006 1007 // for backwards copy s was not offset so the register contents 1008 // are at these offsets into the preceding 64 byte block 1009 // relative to that original input and so on for each successive 1010 // preceding 64 byte block when s is updated. this explains the 1011 // slightly counter-intuitive looking pattern of register usage 1012 // in the stp instructions for backwards copy. 1013 // 1014 // t0 at offset -16, t1 at offset -8 1015 // t2 at offset -32, t3 at offset -24 1016 // t4 at offset -48, t5 at offset -40 1017 // t6 at offset -64, t7 at offset -56 1018 1019 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit)); 1020 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit)); 1021 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit)); 1022 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit))); 1023 1024 __ subs(count, count, 16); 1025 __ br(Assembler::LO, drain); 1026 1027 int prefetch = PrefetchCopyIntervalInBytes; 1028 bool use_stride = false; 1029 if (direction == copy_backwards) { 1030 use_stride = prefetch > 256; 1031 prefetch = -prefetch; 1032 if (use_stride) __ mov(stride, prefetch); 1033 } 1034 1035 __ bind(again); 1036 1037 if (PrefetchCopyIntervalInBytes > 0) 1038 __ prfm(use_stride ? 
Address(s, stride) : Address(s, prefetch), PLDL1KEEP); 1039 1040 if (direction == copy_forwards) { 1041 // allowing for the offset of -8 the store instructions place 1042 // registers into the target 64 bit block at the following 1043 // offsets 1044 // 1045 // t0 at offset 0 1046 // t1 at offset 8, t2 at offset 16 1047 // t3 at offset 24, t4 at offset 32 1048 // t5 at offset 40, t6 at offset 48 1049 // t7 at offset 56 1050 1051 bs.copy_store_at_8(Address(d, 1 * unit), t0); 1052 bs.copy_store_at_16(Address(d, 2 * unit), t1, t2); 1053 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit)); 1054 bs.copy_store_at_16(Address(d, 4 * unit), t3, t4); 1055 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit)); 1056 bs.copy_store_at_16(Address(d, 6 * unit), t5, t6); 1057 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit)); 1058 bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7); 1059 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit))); 1060 } else { 1061 // d was not offset when we started so the registers are 1062 // written into the 64 bit block preceding d with the following 1063 // offsets 1064 // 1065 // t1 at offset -8 1066 // t3 at offset -24, t0 at offset -16 1067 // t5 at offset -48, t2 at offset -32 1068 // t7 at offset -56, t4 at offset -48 1069 // t6 at offset -64 1070 // 1071 // note that this matches the offsets previously noted for the 1072 // loads 1073 1074 bs.copy_store_at_8(Address(d, 1 * unit), t1); 1075 bs.copy_store_at_16(Address(d, 3 * unit), t3, t0); 1076 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit)); 1077 bs.copy_store_at_16(Address(d, 5 * unit), t5, t2); 1078 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit)); 1079 bs.copy_store_at_16(Address(d, 7 * unit), t7, t4); 1080 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit)); 1081 bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6); 1082 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit))); 1083 } 1084 1085 __ subs(count, count, 8); 1086 __ br(Assembler::HS, again); 1087 1088 // Drain 1089 // 1090 // this uses the same pattern of offsets and register arguments 1091 // as above 1092 __ bind(drain); 1093 if (direction == copy_forwards) { 1094 bs.copy_store_at_8(Address(d, 1 * unit), t0); 1095 bs.copy_store_at_16(Address(d, 2 * unit), t1, t2); 1096 bs.copy_store_at_16(Address(d, 4 * unit), t3, t4); 1097 bs.copy_store_at_16(Address(d, 6 * unit), t5, t6); 1098 bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7); 1099 } else { 1100 bs.copy_store_at_8(Address(d, 1 * unit), t1); 1101 bs.copy_store_at_16(Address(d, 3 * unit), t3, t0); 1102 bs.copy_store_at_16(Address(d, 5 * unit), t5, t2); 1103 bs.copy_store_at_16(Address(d, 7 * unit), t7, t4); 1104 bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6); 1105 } 1106 // now we need to copy any remaining part block which may 1107 // include a 4 word block subblock and/or a 2 word subblock. 
      // bits 2 and 1 in the count are the tell-tale for whether we
      // have each such subblock
      {
        Label L1, L2;
        __ tbz(count, exact_log2(4), L1);
        // this is the same as above but copying only 4 longs hence
        // with only one intervening stp between the str instructions
        // but note that the offsets and registers still follow the
        // same pattern
        bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
        bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
        if (direction == copy_forwards) {
          bs.copy_store_at_8(Address(d, 1 * unit), t0);
          bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
          bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t3);
        } else {
          bs.copy_store_at_8(Address(d, 1 * unit), t1);
          bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
          bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t2);
        }
        __ bind(L1);

        __ tbz(count, 1, L2);
        // this is the same as above but copying only 2 longs hence
        // there is no intervening stp between the str instructions
        // but note that the offset and register patterns are still
        // the same
        bs.copy_load_at_16(t0, t1, Address(__ pre(s, 2 * unit)));
        if (direction == copy_forwards) {
          bs.copy_store_at_8(Address(d, 1 * unit), t0);
          bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t1);
        } else {
          bs.copy_store_at_8(Address(d, 1 * unit), t1);
          bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t0);
        }
        __ bind(L2);

        // for forwards copy we need to re-adjust the offsets we
        // applied so that s and d follow the last words written

        if (direction == copy_forwards) {
          __ add(s, s, 16);
          __ add(d, d, 8);
        }

      }

      __ ret(lr);
    }
  }

  // Small copy: less than 16 bytes.
  //
  // NB: Ignores all of the bits of count which represent more than 15
  // bytes, so a caller doesn't have to mask them.

  void copy_memory_small(DecoratorSet decorators, BasicType type, Register s, Register d, Register count, int step) {
    bool is_backwards = step < 0;
    size_t granularity = g_uabs(step);
    int direction = is_backwards ? -1 : 1;

    Label Lword, Lint, Lshort, Lbyte;

    assert(granularity
           && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");

    const Register t0 = r3;
    const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
    ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, fnoreg, fnoreg, fnoreg);

    // ??? I don't know if this bit-test-and-branch is the right thing
    // to do. It does a lot of jumping, resulting in several
    // mispredicted branches. It might make more sense to do this
    // with something like Duff's device with a single computed branch.
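    //
    // For reference, with byte granularity (granularity == 1, step == +1)
    // the bit tests below behave roughly like this C sketch (illustrative
    // only; it ignores the GC barrier helper and the backwards case):
    //
    //   if (count & 8) { copy 8 bytes; s += 8; d += 8; }
    //   if (count & 4) { copy 4 bytes; s += 4; d += 4; }
    //   if (count & 2) { copy 2 bytes; s += 2; d += 2; }
    //   if (count & 1) { copy 1 byte;  s += 1; d += 1; }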
1182 1183 __ tbz(count, 3 - exact_log2(granularity), Lword); 1184 bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards))); 1185 bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0); 1186 __ bind(Lword); 1187 1188 if (granularity <= sizeof (jint)) { 1189 __ tbz(count, 2 - exact_log2(granularity), Lint); 1190 __ ldrw(t0, Address(__ adjust(s, sizeof (jint) * direction, is_backwards))); 1191 __ strw(t0, Address(__ adjust(d, sizeof (jint) * direction, is_backwards))); 1192 __ bind(Lint); 1193 } 1194 1195 if (granularity <= sizeof (jshort)) { 1196 __ tbz(count, 1 - exact_log2(granularity), Lshort); 1197 __ ldrh(t0, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards))); 1198 __ strh(t0, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards))); 1199 __ bind(Lshort); 1200 } 1201 1202 if (granularity <= sizeof (jbyte)) { 1203 __ tbz(count, 0, Lbyte); 1204 __ ldrb(t0, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards))); 1205 __ strb(t0, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards))); 1206 __ bind(Lbyte); 1207 } 1208 } 1209 1210 Label copy_f, copy_b; 1211 Label copy_obj_f, copy_obj_b; 1212 Label copy_obj_uninit_f, copy_obj_uninit_b; 1213 1214 // All-singing all-dancing memory copy. 1215 // 1216 // Copy count units of memory from s to d. The size of a unit is 1217 // step, which can be positive or negative depending on the direction 1218 // of copy. If is_aligned is false, we align the source address. 1219 // 1220 1221 void copy_memory(DecoratorSet decorators, BasicType type, bool is_aligned, 1222 Register s, Register d, Register count, int step) { 1223 copy_direction direction = step < 0 ? copy_backwards : copy_forwards; 1224 bool is_backwards = step < 0; 1225 unsigned int granularity = g_uabs(step); 1226 const Register t0 = r3, t1 = r4; 1227 1228 // <= 80 (or 96 for SIMD) bytes do inline. Direction doesn't matter because we always 1229 // load all the data before writing anything 1230 Label copy4, copy8, copy16, copy32, copy80, copy_big, finish; 1231 const Register t2 = r5, t3 = r6, t4 = r7, t5 = r11; 1232 const Register t6 = r12, t7 = r13, t8 = r14, t9 = r15; 1233 const Register send = r17, dend = r16; 1234 const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10; 1235 const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved 1236 ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3); 1237 1238 if (PrefetchCopyIntervalInBytes > 0) 1239 __ prfm(Address(s, 0), PLDL1KEEP); 1240 __ cmp(count, u1((UseSIMDForMemoryOps ? 
96:80)/granularity));
    __ br(Assembler::HI, copy_big);

    __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
    __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));

    __ cmp(count, u1(16/granularity));
    __ br(Assembler::LS, copy16);

    __ cmp(count, u1(64/granularity));
    __ br(Assembler::HI, copy80);

    __ cmp(count, u1(32/granularity));
    __ br(Assembler::LS, copy32);

    // 33..64 bytes
    if (UseSIMDForMemoryOps) {
      bs.copy_load_at_32(v0, v1, Address(s, 0));
      bs.copy_load_at_32(v2, v3, Address(send, -32));
      bs.copy_store_at_32(Address(d, 0), v0, v1);
      bs.copy_store_at_32(Address(dend, -32), v2, v3);
    } else {
      bs.copy_load_at_16(t0, t1, Address(s, 0));
      bs.copy_load_at_16(t2, t3, Address(s, 16));
      bs.copy_load_at_16(t4, t5, Address(send, -32));
      bs.copy_load_at_16(t6, t7, Address(send, -16));

      bs.copy_store_at_16(Address(d, 0), t0, t1);
      bs.copy_store_at_16(Address(d, 16), t2, t3);
      bs.copy_store_at_16(Address(dend, -32), t4, t5);
      bs.copy_store_at_16(Address(dend, -16), t6, t7);
    }
    __ b(finish);

    // 17..32 bytes
    __ bind(copy32);
    bs.copy_load_at_16(t0, t1, Address(s, 0));
    bs.copy_load_at_16(t6, t7, Address(send, -16));

    bs.copy_store_at_16(Address(d, 0), t0, t1);
    bs.copy_store_at_16(Address(dend, -16), t6, t7);
    __ b(finish);

    // 65..80/96 bytes
    // (96 bytes if SIMD because we do 32 bytes per instruction)
    __ bind(copy80);
    if (UseSIMDForMemoryOps) {
      bs.copy_load_at_32(v0, v1, Address(s, 0));
      bs.copy_load_at_32(v2, v3, Address(s, 32));
      // Unaligned pointers can be an issue for copying.
      // The issue is more likely to occur when the granularity of the data
      // is less than 4 (sizeof(jint)). Pointers for arrays of jint are at
      // least 4 byte aligned. Pointers for arrays of jlong are 8 byte aligned.
      // The biggest performance drop has been seen for the range 65-80 bytes.
      // For such cases using the pair of ldp/stp instead of the third pair of
      // ldpq/stpq fixes the performance issue.
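      //
      // For example (illustrative only): for a 72-byte copy the two 32-byte
      // loads above cover bytes 0..63 and the extra 16-byte load below
      // covers bytes 56..71 (the last 16 bytes, overlapping the middle),
      // so one code path handles every length in the 65..80 byte bucket.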
1296 if (granularity < sizeof (jint)) { 1297 Label copy96; 1298 __ cmp(count, u1(80/granularity)); 1299 __ br(Assembler::HI, copy96); 1300 bs.copy_load_at_16(t0, t1, Address(send, -16)); 1301 1302 bs.copy_store_at_32(Address(d, 0), v0, v1); 1303 bs.copy_store_at_32(Address(d, 32), v2, v3); 1304 1305 bs.copy_store_at_16(Address(dend, -16), t0, t1); 1306 __ b(finish); 1307 1308 __ bind(copy96); 1309 } 1310 bs.copy_load_at_32(v4, v5, Address(send, -32)); 1311 1312 bs.copy_store_at_32(Address(d, 0), v0, v1); 1313 bs.copy_store_at_32(Address(d, 32), v2, v3); 1314 1315 bs.copy_store_at_32(Address(dend, -32), v4, v5); 1316 } else { 1317 bs.copy_load_at_16(t0, t1, Address(s, 0)); 1318 bs.copy_load_at_16(t2, t3, Address(s, 16)); 1319 bs.copy_load_at_16(t4, t5, Address(s, 32)); 1320 bs.copy_load_at_16(t6, t7, Address(s, 48)); 1321 bs.copy_load_at_16(t8, t9, Address(send, -16)); 1322 1323 bs.copy_store_at_16(Address(d, 0), t0, t1); 1324 bs.copy_store_at_16(Address(d, 16), t2, t3); 1325 bs.copy_store_at_16(Address(d, 32), t4, t5); 1326 bs.copy_store_at_16(Address(d, 48), t6, t7); 1327 bs.copy_store_at_16(Address(dend, -16), t8, t9); 1328 } 1329 __ b(finish); 1330 1331 // 0..16 bytes 1332 __ bind(copy16); 1333 __ cmp(count, u1(8/granularity)); 1334 __ br(Assembler::LO, copy8); 1335 1336 // 8..16 bytes 1337 bs.copy_load_at_8(t0, Address(s, 0)); 1338 bs.copy_load_at_8(t1, Address(send, -8)); 1339 bs.copy_store_at_8(Address(d, 0), t0); 1340 bs.copy_store_at_8(Address(dend, -8), t1); 1341 __ b(finish); 1342 1343 if (granularity < 8) { 1344 // 4..7 bytes 1345 __ bind(copy8); 1346 __ tbz(count, 2 - exact_log2(granularity), copy4); 1347 __ ldrw(t0, Address(s, 0)); 1348 __ ldrw(t1, Address(send, -4)); 1349 __ strw(t0, Address(d, 0)); 1350 __ strw(t1, Address(dend, -4)); 1351 __ b(finish); 1352 if (granularity < 4) { 1353 // 0..3 bytes 1354 __ bind(copy4); 1355 __ cbz(count, finish); // get rid of 0 case 1356 if (granularity == 2) { 1357 __ ldrh(t0, Address(s, 0)); 1358 __ strh(t0, Address(d, 0)); 1359 } else { // granularity == 1 1360 // Now 1..3 bytes. Handle the 1 and 2 byte case by copying 1361 // the first and last byte. 1362 // Handle the 3 byte case by loading and storing base + count/2 1363 // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1)) 1364 // This does means in the 1 byte case we load/store the same 1365 // byte 3 times. 1366 __ lsr(count, count, 1); 1367 __ ldrb(t0, Address(s, 0)); 1368 __ ldrb(t1, Address(send, -1)); 1369 __ ldrb(t2, Address(s, count)); 1370 __ strb(t0, Address(d, 0)); 1371 __ strb(t1, Address(dend, -1)); 1372 __ strb(t2, Address(d, count)); 1373 } 1374 __ b(finish); 1375 } 1376 } 1377 1378 __ bind(copy_big); 1379 if (is_backwards) { 1380 __ lea(s, Address(s, count, Address::lsl(exact_log2(-step)))); 1381 __ lea(d, Address(d, count, Address::lsl(exact_log2(-step)))); 1382 } 1383 1384 // Now we've got the small case out of the way we can align the 1385 // source address on a 2-word boundary. 1386 1387 // Here we will materialize a count in r15, which is used by copy_memory_small 1388 // and the various generate_copy_longs stubs that we use for 2 word aligned bytes. 1389 // Up until here, we have used t9, which aliases r15, but from here on, that register 1390 // can not be used as a temp register, as it contains the count. 1391 1392 Label aligned; 1393 1394 if (is_aligned) { 1395 // We may have to adjust by 1 word to get s 2-word-aligned. 
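      //
      // Illustrative example (forwards case): with wordSize == 8, if
      // s == 0x1008 then bit 3 of s is set, so one word is copied and
      // s/d advance by 8, leaving s == 0x1010 (16-byte aligned); count
      // drops by wordSize/granularity.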
1396 __ tbz(s, exact_log2(wordSize), aligned); 1397 bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards))); 1398 bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0); 1399 __ sub(count, count, wordSize/granularity); 1400 } else { 1401 if (is_backwards) { 1402 __ andr(r15, s, 2 * wordSize - 1); 1403 } else { 1404 __ neg(r15, s); 1405 __ andr(r15, r15, 2 * wordSize - 1); 1406 } 1407 // r15 is the byte adjustment needed to align s. 1408 __ cbz(r15, aligned); 1409 int shift = exact_log2(granularity); 1410 if (shift > 0) { 1411 __ lsr(r15, r15, shift); 1412 } 1413 __ sub(count, count, r15); 1414 1415 #if 0 1416 // ?? This code is only correct for a disjoint copy. It may or 1417 // may not make sense to use it in that case. 1418 1419 // Copy the first pair; s and d may not be aligned. 1420 __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0)); 1421 __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0)); 1422 1423 // Align s and d, adjust count 1424 if (is_backwards) { 1425 __ sub(s, s, r15); 1426 __ sub(d, d, r15); 1427 } else { 1428 __ add(s, s, r15); 1429 __ add(d, d, r15); 1430 } 1431 #else 1432 copy_memory_small(decorators, type, s, d, r15, step); 1433 #endif 1434 } 1435 1436 __ bind(aligned); 1437 1438 // s is now 2-word-aligned. 1439 1440 // We have a count of units and some trailing bytes. Adjust the 1441 // count and do a bulk copy of words. If the shift is zero 1442 // perform a move instead to benefit from zero latency moves. 1443 int shift = exact_log2(wordSize/granularity); 1444 if (shift > 0) { 1445 __ lsr(r15, count, shift); 1446 } else { 1447 __ mov(r15, count); 1448 } 1449 if (direction == copy_forwards) { 1450 if (type != T_OBJECT) { 1451 __ bl(copy_f); 1452 } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) { 1453 __ bl(copy_obj_uninit_f); 1454 } else { 1455 __ bl(copy_obj_f); 1456 } 1457 } else { 1458 if (type != T_OBJECT) { 1459 __ bl(copy_b); 1460 } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) { 1461 __ bl(copy_obj_uninit_b); 1462 } else { 1463 __ bl(copy_obj_b); 1464 } 1465 } 1466 1467 // And the tail. 1468 copy_memory_small(decorators, type, s, d, count, step); 1469 1470 if (granularity >= 8) __ bind(copy8); 1471 if (granularity >= 4) __ bind(copy4); 1472 __ bind(finish); 1473 } 1474 1475 1476 void clobber_registers() { 1477 #ifdef ASSERT 1478 RegSet clobbered 1479 = MacroAssembler::call_clobbered_gp_registers() - rscratch1; 1480 __ mov(rscratch1, (uint64_t)0xdeadbeef); 1481 __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32); 1482 for (RegSetIterator<Register> it = clobbered.begin(); *it != noreg; ++it) { 1483 __ mov(*it, rscratch1); 1484 } 1485 #endif 1486 1487 } 1488 1489 // Scan over array at a for count oops, verifying each one. 1490 // Preserves a and count, clobbers rscratch1 and rscratch2. 
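  //
  // In C-like terms the generated loop is roughly (illustrative only):
  //
  //   for (i = 0; i < count; i++) {
  //     oop o = (size == wordSize) ? ((oop*)a)[i]                 // uncompressed oops
  //                                : decode(((narrowOop*)a)[i]);  // compressed; decode verifies
  //     verify_oop(o);
  //   }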
1491 void verify_oop_array (int size, Register a, Register count, Register temp) { 1492 Label loop, end; 1493 __ mov(rscratch1, a); 1494 __ mov(rscratch2, zr); 1495 __ bind(loop); 1496 __ cmp(rscratch2, count); 1497 __ br(Assembler::HS, end); 1498 if (size == wordSize) { 1499 __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size)))); 1500 __ verify_oop(temp); 1501 } else { 1502 __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size)))); 1503 __ decode_heap_oop(temp); // calls verify_oop 1504 } 1505 __ add(rscratch2, rscratch2, 1); 1506 __ b(loop); 1507 __ bind(end); 1508 } 1509 1510 // Arguments: 1511 // stub_id - is used to name the stub and identify all details of 1512 // how to perform the copy. 1513 // 1514 // entry - is assigned to the stub's post push entry point unless 1515 // it is null 1516 // 1517 // Inputs: 1518 // c_rarg0 - source array address 1519 // c_rarg1 - destination array address 1520 // c_rarg2 - element count, treated as ssize_t, can be zero 1521 // 1522 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1523 // the hardware handle it. The two dwords within qwords that span 1524 // cache line boundaries will still be loaded and stored atomically. 1525 // 1526 // Side Effects: entry is set to the (post push) entry point so it 1527 // can be used by the corresponding conjoint copy 1528 // method 1529 // 1530 address generate_disjoint_copy(StubId stub_id, address *entry) { 1531 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 1532 RegSet saved_reg = RegSet::of(s, d, count); 1533 int size; 1534 bool aligned; 1535 bool is_oop; 1536 bool dest_uninitialized; 1537 switch (stub_id) { 1538 case StubId::stubgen_jbyte_disjoint_arraycopy_id: 1539 size = sizeof(jbyte); 1540 aligned = false; 1541 is_oop = false; 1542 dest_uninitialized = false; 1543 break; 1544 case StubId::stubgen_arrayof_jbyte_disjoint_arraycopy_id: 1545 size = sizeof(jbyte); 1546 aligned = true; 1547 is_oop = false; 1548 dest_uninitialized = false; 1549 break; 1550 case StubId::stubgen_jshort_disjoint_arraycopy_id: 1551 size = sizeof(jshort); 1552 aligned = false; 1553 is_oop = false; 1554 dest_uninitialized = false; 1555 break; 1556 case StubId::stubgen_arrayof_jshort_disjoint_arraycopy_id: 1557 size = sizeof(jshort); 1558 aligned = true; 1559 is_oop = false; 1560 dest_uninitialized = false; 1561 break; 1562 case StubId::stubgen_jint_disjoint_arraycopy_id: 1563 size = sizeof(jint); 1564 aligned = false; 1565 is_oop = false; 1566 dest_uninitialized = false; 1567 break; 1568 case StubId::stubgen_arrayof_jint_disjoint_arraycopy_id: 1569 size = sizeof(jint); 1570 aligned = true; 1571 is_oop = false; 1572 dest_uninitialized = false; 1573 break; 1574 case StubId::stubgen_jlong_disjoint_arraycopy_id: 1575 // since this is always aligned we can (should!) use the same 1576 // stub as for case StubId::stubgen_arrayof_jlong_disjoint_arraycopy 1577 ShouldNotReachHere(); 1578 break; 1579 case StubId::stubgen_arrayof_jlong_disjoint_arraycopy_id: 1580 size = sizeof(jlong); 1581 aligned = true; 1582 is_oop = false; 1583 dest_uninitialized = false; 1584 break; 1585 case StubId::stubgen_oop_disjoint_arraycopy_id: 1586 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1587 aligned = !UseCompressedOops; 1588 is_oop = true; 1589 dest_uninitialized = false; 1590 break; 1591 case StubId::stubgen_arrayof_oop_disjoint_arraycopy_id: 1592 size = UseCompressedOops ? 
sizeof (jint) : sizeof (jlong);
      aligned = !UseCompressedOops;
      is_oop = true;
      dest_uninitialized = false;
      break;
    case StubId::stubgen_oop_disjoint_arraycopy_uninit_id:
      size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
      aligned = !UseCompressedOops;
      is_oop = true;
      dest_uninitialized = true;
      break;
    case StubId::stubgen_arrayof_oop_disjoint_arraycopy_uninit_id:
      size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
      aligned = !UseCompressedOops;
      is_oop = true;
      dest_uninitialized = true;
      break;
    default:
      ShouldNotReachHere();
      break;
    }

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, stub_id);
    address start = __ pc();
    __ enter();

    if (entry != nullptr) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
    if (dest_uninitialized) {
      decorators |= IS_DEST_UNINITIALIZED;
    }
    if (aligned) {
      decorators |= ARRAYCOPY_ALIGNED;
    }

    BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
    bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);

    if (is_oop) {
      // save regs before copy_memory
      __ push(RegSet::of(d, count), sp);
    }
    {
      // UnsafeMemoryAccess page error: continue after unsafe access
      bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
      UnsafeMemoryAccessMark umam(this, add_entry, true);
      copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, size);
    }

    if (is_oop) {
      __ pop(RegSet::of(d, count), sp);
      if (VerifyOops)
        verify_oop_array(size, d, count, r16);
    }

    bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());

    __ leave();
    __ mov(r0, zr); // return 0
    __ ret(lr);
    return start;
  }

  // Arguments:
  //   stub_id - is used to name the stub and identify all details of
  //             how to perform the copy.
  //
  //   nooverlap_target - identifies the (post push) entry for the
  //             corresponding disjoint copy routine which can be
  //             jumped to if the ranges do not actually overlap
  //
  //   entry - is assigned to the stub's post push entry point unless
  //           it is null
  //
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it. The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
1681 // 1682 // Side Effects: 1683 // entry is set to the no-overlap entry point so it can be used by 1684 // some other conjoint copy method 1685 // 1686 address generate_conjoint_copy(StubId stub_id, address nooverlap_target, address *entry) { 1687 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 1688 RegSet saved_regs = RegSet::of(s, d, count); 1689 int size; 1690 bool aligned; 1691 bool is_oop; 1692 bool dest_uninitialized; 1693 switch (stub_id) { 1694 case StubId::stubgen_jbyte_arraycopy_id: 1695 size = sizeof(jbyte); 1696 aligned = false; 1697 is_oop = false; 1698 dest_uninitialized = false; 1699 break; 1700 case StubId::stubgen_arrayof_jbyte_arraycopy_id: 1701 size = sizeof(jbyte); 1702 aligned = true; 1703 is_oop = false; 1704 dest_uninitialized = false; 1705 break; 1706 case StubId::stubgen_jshort_arraycopy_id: 1707 size = sizeof(jshort); 1708 aligned = false; 1709 is_oop = false; 1710 dest_uninitialized = false; 1711 break; 1712 case StubId::stubgen_arrayof_jshort_arraycopy_id: 1713 size = sizeof(jshort); 1714 aligned = true; 1715 is_oop = false; 1716 dest_uninitialized = false; 1717 break; 1718 case StubId::stubgen_jint_arraycopy_id: 1719 size = sizeof(jint); 1720 aligned = false; 1721 is_oop = false; 1722 dest_uninitialized = false; 1723 break; 1724 case StubId::stubgen_arrayof_jint_arraycopy_id: 1725 size = sizeof(jint); 1726 aligned = true; 1727 is_oop = false; 1728 dest_uninitialized = false; 1729 break; 1730 case StubId::stubgen_jlong_arraycopy_id: 1731 // since this is always aligned we can (should!) use the same 1732 // stub as for case StubId::stubgen_arrayof_jlong_disjoint_arraycopy 1733 ShouldNotReachHere(); 1734 break; 1735 case StubId::stubgen_arrayof_jlong_arraycopy_id: 1736 size = sizeof(jlong); 1737 aligned = true; 1738 is_oop = false; 1739 dest_uninitialized = false; 1740 break; 1741 case StubId::stubgen_oop_arraycopy_id: 1742 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1743 aligned = !UseCompressedOops; 1744 is_oop = true; 1745 dest_uninitialized = false; 1746 break; 1747 case StubId::stubgen_arrayof_oop_arraycopy_id: 1748 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1749 aligned = !UseCompressedOops; 1750 is_oop = true; 1751 dest_uninitialized = false; 1752 break; 1753 case StubId::stubgen_oop_arraycopy_uninit_id: 1754 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1755 aligned = !UseCompressedOops; 1756 is_oop = true; 1757 dest_uninitialized = true; 1758 break; 1759 case StubId::stubgen_arrayof_oop_arraycopy_uninit_id: 1760 size = UseCompressedOops ? 
sizeof (jint) : sizeof (jlong); 1761 aligned = !UseCompressedOops; 1762 is_oop = true; 1763 dest_uninitialized = true; 1764 break; 1765 default: 1766 ShouldNotReachHere(); 1767 } 1768 1769 StubCodeMark mark(this, stub_id); 1770 address start = __ pc(); 1771 __ enter(); 1772 1773 if (entry != nullptr) { 1774 *entry = __ pc(); 1775 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 1776 BLOCK_COMMENT("Entry:"); 1777 } 1778 1779 // use fwd copy when (d-s) above_equal (count*size) 1780 __ sub(rscratch1, d, s); 1781 __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size)); 1782 __ br(Assembler::HS, nooverlap_target); 1783 1784 DecoratorSet decorators = IN_HEAP | IS_ARRAY; 1785 if (dest_uninitialized) { 1786 decorators |= IS_DEST_UNINITIALIZED; 1787 } 1788 if (aligned) { 1789 decorators |= ARRAYCOPY_ALIGNED; 1790 } 1791 1792 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1793 bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs); 1794 1795 if (is_oop) { 1796 // save regs before copy_memory 1797 __ push(RegSet::of(d, count), sp); 1798 } 1799 { 1800 // UnsafeMemoryAccess page error: continue after unsafe access 1801 bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size); 1802 UnsafeMemoryAccessMark umam(this, add_entry, true); 1803 copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, -size); 1804 } 1805 if (is_oop) { 1806 __ pop(RegSet::of(d, count), sp); 1807 if (VerifyOops) 1808 verify_oop_array(size, d, count, r16); 1809 } 1810 bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet()); 1811 __ leave(); 1812 __ mov(r0, zr); // return 0 1813 __ ret(lr); 1814 return start; 1815 } 1816 1817 // Helper for generating a dynamic type check. 1818 // Smashes rscratch1, rscratch2. 1819 void generate_type_check(Register sub_klass, 1820 Register super_check_offset, 1821 Register super_klass, 1822 Register temp1, 1823 Register temp2, 1824 Register result, 1825 Label& L_success) { 1826 assert_different_registers(sub_klass, super_check_offset, super_klass); 1827 1828 BLOCK_COMMENT("type_check:"); 1829 1830 Label L_miss; 1831 1832 __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, nullptr, 1833 super_check_offset); 1834 __ check_klass_subtype_slow_path(sub_klass, super_klass, temp1, temp2, &L_success, nullptr); 1835 1836 // Fall through on failure! 
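    // Rough shape of the combined check above (a sketch of the generic
    // HotSpot subtype protocol, not a literal transcription):
    //
    //   if (sub_klass == super_klass)                           goto L_success;
    //   if (*(sub_klass + super_check_offset) == super_klass)   goto L_success;  // fast path
    //   if (secondary supers of sub_klass contain super_klass)  goto L_success;  // slow path
    //   // otherwise fall through to L_miss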
1837 __ BIND(L_miss); 1838 } 1839 1840 // 1841 // Generate checkcasting array copy stub 1842 // 1843 // Input: 1844 // c_rarg0 - source array address 1845 // c_rarg1 - destination array address 1846 // c_rarg2 - element count, treated as ssize_t, can be zero 1847 // c_rarg3 - size_t ckoff (super_check_offset) 1848 // c_rarg4 - oop ckval (super_klass) 1849 // 1850 // Output: 1851 // r0 == 0 - success 1852 // r0 == -1^K - failure, where K is partial transfer count 1853 // 1854 address generate_checkcast_copy(StubId stub_id, address *entry) { 1855 bool dest_uninitialized; 1856 switch (stub_id) { 1857 case StubId::stubgen_checkcast_arraycopy_id: 1858 dest_uninitialized = false; 1859 break; 1860 case StubId::stubgen_checkcast_arraycopy_uninit_id: 1861 dest_uninitialized = true; 1862 break; 1863 default: 1864 ShouldNotReachHere(); 1865 } 1866 1867 Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop; 1868 1869 // Input registers (after setup_arg_regs) 1870 const Register from = c_rarg0; // source array address 1871 const Register to = c_rarg1; // destination array address 1872 const Register count = c_rarg2; // elementscount 1873 const Register ckoff = c_rarg3; // super_check_offset 1874 const Register ckval = c_rarg4; // super_klass 1875 1876 RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4); 1877 RegSet wb_post_saved_regs = RegSet::of(count); 1878 1879 // Registers used as temps (r19, r20, r21, r22 are save-on-entry) 1880 const Register copied_oop = r22; // actual oop copied 1881 const Register count_save = r21; // orig elementscount 1882 const Register start_to = r20; // destination array start address 1883 const Register r19_klass = r19; // oop._klass 1884 1885 // Registers used as gc temps (r5, r6, r7 are save-on-call) 1886 const Register gct1 = r5, gct2 = r6, gct3 = r7; 1887 1888 //--------------------------------------------------------------- 1889 // Assembler stub will be used for this call to arraycopy 1890 // if the two arrays are subtypes of Object[] but the 1891 // destination array type is not equal to or a supertype 1892 // of the source type. Each element must be separately 1893 // checked. 1894 1895 assert_different_registers(from, to, count, ckoff, ckval, start_to, 1896 copied_oop, r19_klass, count_save); 1897 1898 __ align(CodeEntryAlignment); 1899 StubCodeMark mark(this, stub_id); 1900 address start = __ pc(); 1901 1902 __ enter(); // required for proper stackwalking of RuntimeStub frame 1903 1904 #ifdef ASSERT 1905 // caller guarantees that the arrays really are different 1906 // otherwise, we would have to make conjoint checks 1907 { Label L; 1908 __ b(L); // conjoint check not yet implemented 1909 __ stop("checkcast_copy within a single array"); 1910 __ bind(L); 1911 } 1912 #endif //ASSERT 1913 1914 // Caller of this entry point must set up the argument registers. 1915 if (entry != nullptr) { 1916 *entry = __ pc(); 1917 BLOCK_COMMENT("Entry:"); 1918 } 1919 1920 // Empty array: Nothing to do. 1921 __ cbz(count, L_done); 1922 __ push(RegSet::of(r19, r20, r21, r22), sp); 1923 1924 #ifdef ASSERT 1925 BLOCK_COMMENT("assert consistent ckoff/ckval"); 1926 // The ckoff and ckval must be mutually consistent, 1927 // even though caller generates both. 
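    // ckval is the element klass of the destination array and ckoff is
    // that klass's Klass::super_check_offset() value; the per-element
    // subtype check below uses both, so they must describe the same
    // klass.  The assert simply reloads the offset from ckval and
    // compares it with the ckoff argument.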
1928 { Label L; 1929 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 1930 __ ldrw(start_to, Address(ckval, sco_offset)); 1931 __ cmpw(ckoff, start_to); 1932 __ br(Assembler::EQ, L); 1933 __ stop("super_check_offset inconsistent"); 1934 __ bind(L); 1935 } 1936 #endif //ASSERT 1937 1938 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT; 1939 bool is_oop = true; 1940 int element_size = UseCompressedOops ? 4 : 8; 1941 if (dest_uninitialized) { 1942 decorators |= IS_DEST_UNINITIALIZED; 1943 } 1944 1945 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1946 bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs); 1947 1948 // save the original count 1949 __ mov(count_save, count); 1950 1951 // Copy from low to high addresses 1952 __ mov(start_to, to); // Save destination array start address 1953 __ b(L_load_element); 1954 1955 // ======== begin loop ======== 1956 // (Loop is rotated; its entry is L_load_element.) 1957 // Loop control: 1958 // for (; count != 0; count--) { 1959 // copied_oop = load_heap_oop(from++); 1960 // ... generate_type_check ...; 1961 // store_heap_oop(to++, copied_oop); 1962 // } 1963 __ align(OptoLoopAlignment); 1964 1965 __ BIND(L_store_element); 1966 bs->copy_store_at(_masm, decorators, T_OBJECT, element_size, 1967 __ post(to, element_size), copied_oop, noreg, 1968 gct1, gct2, gct3); 1969 __ sub(count, count, 1); 1970 __ cbz(count, L_do_card_marks); 1971 1972 // ======== loop entry is here ======== 1973 __ BIND(L_load_element); 1974 bs->copy_load_at(_masm, decorators, T_OBJECT, element_size, 1975 copied_oop, noreg, __ post(from, element_size), 1976 gct1); 1977 __ cbz(copied_oop, L_store_element); 1978 1979 __ load_klass(r19_klass, copied_oop);// query the object klass 1980 1981 BLOCK_COMMENT("type_check:"); 1982 generate_type_check(/*sub_klass*/r19_klass, 1983 /*super_check_offset*/ckoff, 1984 /*super_klass*/ckval, 1985 /*r_array_base*/gct1, 1986 /*temp2*/gct2, 1987 /*result*/r10, L_store_element); 1988 1989 // Fall through on failure! 1990 1991 // ======== end loop ======== 1992 1993 // It was a real error; we must depend on the caller to finish the job. 1994 // Register count = remaining oops, count_orig = total oops. 1995 // Emit GC store barriers for the oops we have copied and report 1996 // their number to the caller. 1997 1998 __ subs(count, count_save, count); // K = partially copied oop count 1999 __ eon(count, count, zr); // report (-1^K) to caller 2000 __ br(Assembler::EQ, L_done_pop); 2001 2002 __ BIND(L_do_card_marks); 2003 bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs); 2004 2005 __ bind(L_done_pop); 2006 __ pop(RegSet::of(r19, r20, r21, r22), sp); 2007 inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr); 2008 2009 __ bind(L_done); 2010 __ mov(r0, count); 2011 __ leave(); 2012 __ ret(lr); 2013 2014 return start; 2015 } 2016 2017 // Perform range checks on the proposed arraycopy. 2018 // Kills temp, but nothing else. 2019 // Also, clean the sign bits of src_pos and dst_pos. 
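  // In scalar form the checks are (illustrative sketch):
  //
  //   if ((juint)(src_pos + length) > (juint)arrayOop(src)->length())  goto L_failed;
  //   if ((juint)(dst_pos + length) > (juint)arrayOop(dst)->length())  goto L_failed;
  //   src_pos = (juint)src_pos;   // movw zero-extends, clearing bits 63..32
  //   dst_pos = (juint)dst_pos;
  //
  // The callers have already rejected negative positions and lengths,
  // so the 32-bit unsigned sums cannot wrap.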
2020   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
2021                               Register src_pos, // source position (c_rarg1)
2022                               Register dst,     // destination array oop (c_rarg2)
2023                               Register dst_pos, // destination position (c_rarg3)
2024                               Register length,
2025                               Register temp,
2026                               Label& L_failed) {
2027     BLOCK_COMMENT("arraycopy_range_checks:");
2028
2029     assert_different_registers(rscratch1, temp);
2030
2031     // if (src_pos + length > arrayOop(src)->length()) FAIL;
2032     __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
2033     __ addw(temp, length, src_pos);
2034     __ cmpw(temp, rscratch1);
2035     __ br(Assembler::HI, L_failed);
2036
2037     // if (dst_pos + length > arrayOop(dst)->length()) FAIL;
2038     __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
2039     __ addw(temp, length, dst_pos);
2040     __ cmpw(temp, rscratch1);
2041     __ br(Assembler::HI, L_failed);
2042
2043     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
2044     __ movw(src_pos, src_pos);
2045     __ movw(dst_pos, dst_pos);
2046
2047     BLOCK_COMMENT("arraycopy_range_checks done");
2048   }
2049
2050   // These stubs get called from some dumb test routine.
2051   // I'll write them properly when they're called from
2052   // something that's actually doing something.
2053   static void fake_arraycopy_stub(address src, address dst, int count) {
2054     assert(count == 0, "huh?");
2055   }
2056
2057
2058   //
2059   // Generate 'unsafe' array copy stub
2060   // Though just as safe as the other stubs, it takes an unscaled
2061   // size_t argument instead of an element count.
2062   //
2063   // Input:
2064   //   c_rarg0   - source array address
2065   //   c_rarg1   - destination array address
2066   //   c_rarg2   - byte count, treated as ssize_t, can be zero
2067   //
2068   // Examines the alignment of the operands and dispatches
2069   // to a long, int, short, or byte copy loop.
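  //
  // Only the combined low bits of the three operands matter for the
  // dispatch, e.g. (illustrative values):
  //
  //   s = ..1000, d = ..2008, count = 24 : (s|d|count) & 7 == 0
  //     -> long copy of 24 >> 3 == 3 longs
  //   s = ..1002, d = ..2006, count = 10 : (s|d|count) & 7 == 6, & 3 == 2, bit 0 clear
  //     -> short copy of 10 >> 1 == 5 shorts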
2070 // 2071 address generate_unsafe_copy(address byte_copy_entry, 2072 address short_copy_entry, 2073 address int_copy_entry, 2074 address long_copy_entry) { 2075 StubId stub_id = StubId::stubgen_unsafe_arraycopy_id; 2076 2077 Label L_long_aligned, L_int_aligned, L_short_aligned; 2078 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 2079 2080 __ align(CodeEntryAlignment); 2081 StubCodeMark mark(this, stub_id); 2082 address start = __ pc(); 2083 __ enter(); // required for proper stackwalking of RuntimeStub frame 2084 2085 // bump this on entry, not on exit: 2086 inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr); 2087 2088 __ orr(rscratch1, s, d); 2089 __ orr(rscratch1, rscratch1, count); 2090 2091 __ andr(rscratch1, rscratch1, BytesPerLong-1); 2092 __ cbz(rscratch1, L_long_aligned); 2093 __ andr(rscratch1, rscratch1, BytesPerInt-1); 2094 __ cbz(rscratch1, L_int_aligned); 2095 __ tbz(rscratch1, 0, L_short_aligned); 2096 __ b(RuntimeAddress(byte_copy_entry)); 2097 2098 __ BIND(L_short_aligned); 2099 __ lsr(count, count, LogBytesPerShort); // size => short_count 2100 __ b(RuntimeAddress(short_copy_entry)); 2101 __ BIND(L_int_aligned); 2102 __ lsr(count, count, LogBytesPerInt); // size => int_count 2103 __ b(RuntimeAddress(int_copy_entry)); 2104 __ BIND(L_long_aligned); 2105 __ lsr(count, count, LogBytesPerLong); // size => long_count 2106 __ b(RuntimeAddress(long_copy_entry)); 2107 2108 return start; 2109 } 2110 2111 // 2112 // Generate generic array copy stubs 2113 // 2114 // Input: 2115 // c_rarg0 - src oop 2116 // c_rarg1 - src_pos (32-bits) 2117 // c_rarg2 - dst oop 2118 // c_rarg3 - dst_pos (32-bits) 2119 // c_rarg4 - element count (32-bits) 2120 // 2121 // Output: 2122 // r0 == 0 - success 2123 // r0 == -1^K - failure, where K is partial transfer count 2124 // 2125 address generate_generic_copy(address byte_copy_entry, address short_copy_entry, 2126 address int_copy_entry, address oop_copy_entry, 2127 address long_copy_entry, address checkcast_copy_entry) { 2128 StubId stub_id = StubId::stubgen_generic_arraycopy_id; 2129 2130 Label L_failed, L_objArray; 2131 Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs; 2132 2133 // Input registers 2134 const Register src = c_rarg0; // source array oop 2135 const Register src_pos = c_rarg1; // source position 2136 const Register dst = c_rarg2; // destination array oop 2137 const Register dst_pos = c_rarg3; // destination position 2138 const Register length = c_rarg4; 2139 2140 2141 // Registers used as temps 2142 const Register dst_klass = c_rarg5; 2143 2144 __ align(CodeEntryAlignment); 2145 2146 StubCodeMark mark(this, stub_id); 2147 2148 address start = __ pc(); 2149 2150 __ enter(); // required for proper stackwalking of RuntimeStub frame 2151 2152 // bump this on entry, not on exit: 2153 inc_counter_np(SharedRuntime::_generic_array_copy_ctr); 2154 2155 //----------------------------------------------------------------------- 2156 // Assembler stub will be used for this call to arraycopy 2157 // if the following conditions are met: 2158 // 2159 // (1) src and dst must not be null. 2160 // (2) src_pos must not be negative. 2161 // (3) dst_pos must not be negative. 2162 // (4) length must not be negative. 2163 // (5) src klass and dst klass should be the same and not null. 2164 // (6) src and dst should be arrays. 2165 // (7) src_pos + length must not exceed length of src. 2166 // (8) dst_pos + length must not exceed length of dst. 
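  // The code below tests these conditions in roughly this order and
  // branches to L_failed (returning -1) as soon as one fails; the sign
  // checks for (2)-(4) are done by testing bit 31 of the 32-bit values
  // with tbnz.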
2167 // 2168 2169 // if (src == nullptr) return -1; 2170 __ cbz(src, L_failed); 2171 2172 // if (src_pos < 0) return -1; 2173 __ tbnz(src_pos, 31, L_failed); // i.e. sign bit set 2174 2175 // if (dst == nullptr) return -1; 2176 __ cbz(dst, L_failed); 2177 2178 // if (dst_pos < 0) return -1; 2179 __ tbnz(dst_pos, 31, L_failed); // i.e. sign bit set 2180 2181 // registers used as temp 2182 const Register scratch_length = r16; // elements count to copy 2183 const Register scratch_src_klass = r17; // array klass 2184 const Register lh = r15; // layout helper 2185 2186 // if (length < 0) return -1; 2187 __ movw(scratch_length, length); // length (elements count, 32-bits value) 2188 __ tbnz(scratch_length, 31, L_failed); // i.e. sign bit set 2189 2190 __ load_klass(scratch_src_klass, src); 2191 #ifdef ASSERT 2192 // assert(src->klass() != nullptr); 2193 { 2194 BLOCK_COMMENT("assert klasses not null {"); 2195 Label L1, L2; 2196 __ cbnz(scratch_src_klass, L2); // it is broken if klass is null 2197 __ bind(L1); 2198 __ stop("broken null klass"); 2199 __ bind(L2); 2200 __ load_klass(rscratch1, dst); 2201 __ cbz(rscratch1, L1); // this would be broken also 2202 BLOCK_COMMENT("} assert klasses not null done"); 2203 } 2204 #endif 2205 2206 // Load layout helper (32-bits) 2207 // 2208 // |array_tag| | header_size | element_type | |log2_element_size| 2209 // 32 30 24 16 8 2 0 2210 // 2211 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0 2212 // 2213 2214 const int lh_offset = in_bytes(Klass::layout_helper_offset()); 2215 2216 // Handle objArrays completely differently... 2217 const jint objArray_lh = Klass::array_layout_helper(T_OBJECT); 2218 __ ldrw(lh, Address(scratch_src_klass, lh_offset)); 2219 __ movw(rscratch1, objArray_lh); 2220 __ eorw(rscratch2, lh, rscratch1); 2221 __ cbzw(rscratch2, L_objArray); 2222 2223 // if (src->klass() != dst->klass()) return -1; 2224 __ load_klass(rscratch2, dst); 2225 __ eor(rscratch2, rscratch2, scratch_src_klass); 2226 __ cbnz(rscratch2, L_failed); 2227 2228 // if (!src->is_Array()) return -1; 2229 __ tbz(lh, 31, L_failed); // i.e. (lh >= 0) 2230 2231 // At this point, it is known to be a typeArray (array_tag 0x3). 
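  // For a typeArray the layout helper packs everything the copy needs:
  // the header size in bytes and log2(element size).  In scalar form
  // (illustrative sketch):
  //
  //   int header = (lh >> Klass::_lh_header_size_shift) & Klass::_lh_header_size_mask;
  //   int elsize = lh & Klass::_lh_log2_element_size_mask;            // 0..3
  //   src_addr = src + header + ((size_t)src_pos << elsize);
  //   dst_addr = dst + header + ((size_t)dst_pos << elsize);
  //
  // The code below extracts 'header' with ubfx, folds it into src/dst,
  // and then picks the byte/short/int/long copy stub by testing the two
  // low bits of elsize.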
2232 #ifdef ASSERT 2233 { 2234 BLOCK_COMMENT("assert primitive array {"); 2235 Label L; 2236 __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift); 2237 __ cmpw(lh, rscratch2); 2238 __ br(Assembler::GE, L); 2239 __ stop("must be a primitive array"); 2240 __ bind(L); 2241 BLOCK_COMMENT("} assert primitive array done"); 2242 } 2243 #endif 2244 2245 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2246 rscratch2, L_failed); 2247 2248 // TypeArrayKlass 2249 // 2250 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize); 2251 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize); 2252 // 2253 2254 const Register rscratch1_offset = rscratch1; // array offset 2255 const Register r15_elsize = lh; // element size 2256 2257 __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift, 2258 exact_log2(Klass::_lh_header_size_mask+1)); // array_offset 2259 __ add(src, src, rscratch1_offset); // src array offset 2260 __ add(dst, dst, rscratch1_offset); // dst array offset 2261 BLOCK_COMMENT("choose copy loop based on element size"); 2262 2263 // next registers should be set before the jump to corresponding stub 2264 const Register from = c_rarg0; // source array address 2265 const Register to = c_rarg1; // destination array address 2266 const Register count = c_rarg2; // elements count 2267 2268 // 'from', 'to', 'count' registers should be set in such order 2269 // since they are the same as 'src', 'src_pos', 'dst'. 2270 2271 assert(Klass::_lh_log2_element_size_shift == 0, "fix this code"); 2272 2273 // The possible values of elsize are 0-3, i.e. exact_log2(element 2274 // size in bytes). We do a simple bitwise binary search. 2275 __ BIND(L_copy_bytes); 2276 __ tbnz(r15_elsize, 1, L_copy_ints); 2277 __ tbnz(r15_elsize, 0, L_copy_shorts); 2278 __ lea(from, Address(src, src_pos));// src_addr 2279 __ lea(to, Address(dst, dst_pos));// dst_addr 2280 __ movw(count, scratch_length); // length 2281 __ b(RuntimeAddress(byte_copy_entry)); 2282 2283 __ BIND(L_copy_shorts); 2284 __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr 2285 __ lea(to, Address(dst, dst_pos, Address::lsl(1)));// dst_addr 2286 __ movw(count, scratch_length); // length 2287 __ b(RuntimeAddress(short_copy_entry)); 2288 2289 __ BIND(L_copy_ints); 2290 __ tbnz(r15_elsize, 0, L_copy_longs); 2291 __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr 2292 __ lea(to, Address(dst, dst_pos, Address::lsl(2)));// dst_addr 2293 __ movw(count, scratch_length); // length 2294 __ b(RuntimeAddress(int_copy_entry)); 2295 2296 __ BIND(L_copy_longs); 2297 #ifdef ASSERT 2298 { 2299 BLOCK_COMMENT("assert long copy {"); 2300 Label L; 2301 __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize 2302 __ cmpw(r15_elsize, LogBytesPerLong); 2303 __ br(Assembler::EQ, L); 2304 __ stop("must be long copy, but elsize is wrong"); 2305 __ bind(L); 2306 BLOCK_COMMENT("} assert long copy done"); 2307 } 2308 #endif 2309 __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr 2310 __ lea(to, Address(dst, dst_pos, Address::lsl(3)));// dst_addr 2311 __ movw(count, scratch_length); // length 2312 __ b(RuntimeAddress(long_copy_entry)); 2313 2314 // ObjArrayKlass 2315 __ BIND(L_objArray); 2316 // live at this point: scratch_src_klass, scratch_length, src[_pos], dst[_pos] 2317 2318 Label L_plain_copy, L_checkcast_copy; 2319 // test array classes for subtyping 2320 __ load_klass(r15, dst); 2321 __ cmp(scratch_src_klass, r15); // usual case is exact 
equality 2322 __ br(Assembler::NE, L_checkcast_copy); 2323 2324 // Identically typed arrays can be copied without element-wise checks. 2325 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2326 rscratch2, L_failed); 2327 2328 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2329 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2330 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2331 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2332 __ movw(count, scratch_length); // length 2333 __ BIND(L_plain_copy); 2334 __ b(RuntimeAddress(oop_copy_entry)); 2335 2336 __ BIND(L_checkcast_copy); 2337 // live at this point: scratch_src_klass, scratch_length, r15 (dst_klass) 2338 { 2339 // Before looking at dst.length, make sure dst is also an objArray. 2340 __ ldrw(rscratch1, Address(r15, lh_offset)); 2341 __ movw(rscratch2, objArray_lh); 2342 __ eorw(rscratch1, rscratch1, rscratch2); 2343 __ cbnzw(rscratch1, L_failed); 2344 2345 // It is safe to examine both src.length and dst.length. 2346 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2347 r15, L_failed); 2348 2349 __ load_klass(dst_klass, dst); // reload 2350 2351 // Marshal the base address arguments now, freeing registers. 2352 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2353 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2354 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2355 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2356 __ movw(count, length); // length (reloaded) 2357 Register sco_temp = c_rarg3; // this register is free now 2358 assert_different_registers(from, to, count, sco_temp, 2359 dst_klass, scratch_src_klass); 2360 // assert_clean_int(count, sco_temp); 2361 2362 // Generate the type check. 2363 const int sco_offset = in_bytes(Klass::super_check_offset_offset()); 2364 __ ldrw(sco_temp, Address(dst_klass, sco_offset)); 2365 2366 // Smashes rscratch1, rscratch2 2367 generate_type_check(scratch_src_klass, sco_temp, dst_klass, /*temps*/ noreg, noreg, noreg, 2368 L_plain_copy); 2369 2370 // Fetch destination element klass from the ObjArrayKlass header. 2371 int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset()); 2372 __ ldr(dst_klass, Address(dst_klass, ek_offset)); 2373 __ ldrw(sco_temp, Address(dst_klass, sco_offset)); 2374 2375 // the checkcast_copy loop needs two extra arguments: 2376 assert(c_rarg3 == sco_temp, "#3 already in place"); 2377 // Set up arguments for checkcast_copy_entry. 2378 __ mov(c_rarg4, dst_klass); // dst.klass.element_klass 2379 __ b(RuntimeAddress(checkcast_copy_entry)); 2380 } 2381 2382 __ BIND(L_failed); 2383 __ mov(r0, -1); 2384 __ leave(); // required for proper stackwalking of RuntimeStub frame 2385 __ ret(lr); 2386 2387 return start; 2388 } 2389 2390 // 2391 // Generate stub for array fill. If "aligned" is true, the 2392 // "to" address is assumed to be heapword aligned. 
2393 // 2394 // Arguments for generated stub: 2395 // to: c_rarg0 2396 // value: c_rarg1 2397 // count: c_rarg2 treated as signed 2398 // 2399 address generate_fill(StubId stub_id) { 2400 BasicType t; 2401 bool aligned; 2402 2403 switch (stub_id) { 2404 case StubId::stubgen_jbyte_fill_id: 2405 t = T_BYTE; 2406 aligned = false; 2407 break; 2408 case StubId::stubgen_jshort_fill_id: 2409 t = T_SHORT; 2410 aligned = false; 2411 break; 2412 case StubId::stubgen_jint_fill_id: 2413 t = T_INT; 2414 aligned = false; 2415 break; 2416 case StubId::stubgen_arrayof_jbyte_fill_id: 2417 t = T_BYTE; 2418 aligned = true; 2419 break; 2420 case StubId::stubgen_arrayof_jshort_fill_id: 2421 t = T_SHORT; 2422 aligned = true; 2423 break; 2424 case StubId::stubgen_arrayof_jint_fill_id: 2425 t = T_INT; 2426 aligned = true; 2427 break; 2428 default: 2429 ShouldNotReachHere(); 2430 }; 2431 2432 __ align(CodeEntryAlignment); 2433 StubCodeMark mark(this, stub_id); 2434 address start = __ pc(); 2435 2436 BLOCK_COMMENT("Entry:"); 2437 2438 const Register to = c_rarg0; // source array address 2439 const Register value = c_rarg1; // value 2440 const Register count = c_rarg2; // elements count 2441 2442 const Register bz_base = r10; // base for block_zero routine 2443 const Register cnt_words = r11; // temp register 2444 2445 __ enter(); 2446 2447 Label L_fill_elements, L_exit1; 2448 2449 int shift = -1; 2450 switch (t) { 2451 case T_BYTE: 2452 shift = 0; 2453 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2454 __ bfi(value, value, 8, 8); // 8 bit -> 16 bit 2455 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit 2456 __ br(Assembler::LO, L_fill_elements); 2457 break; 2458 case T_SHORT: 2459 shift = 1; 2460 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2461 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit 2462 __ br(Assembler::LO, L_fill_elements); 2463 break; 2464 case T_INT: 2465 shift = 2; 2466 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2467 __ br(Assembler::LO, L_fill_elements); 2468 break; 2469 default: ShouldNotReachHere(); 2470 } 2471 2472 // Align source address at 8 bytes address boundary. 2473 Label L_skip_align1, L_skip_align2, L_skip_align4; 2474 if (!aligned) { 2475 switch (t) { 2476 case T_BYTE: 2477 // One byte misalignment happens only for byte arrays. 2478 __ tbz(to, 0, L_skip_align1); 2479 __ strb(value, Address(__ post(to, 1))); 2480 __ subw(count, count, 1); 2481 __ bind(L_skip_align1); 2482 // Fallthrough 2483 case T_SHORT: 2484 // Two bytes misalignment happens only for byte and short (char) arrays. 2485 __ tbz(to, 1, L_skip_align2); 2486 __ strh(value, Address(__ post(to, 2))); 2487 __ subw(count, count, 2 >> shift); 2488 __ bind(L_skip_align2); 2489 // Fallthrough 2490 case T_INT: 2491 // Align to 8 bytes, we know we are 4 byte aligned to start. 2492 __ tbz(to, 2, L_skip_align4); 2493 __ strw(value, Address(__ post(to, 4))); 2494 __ subw(count, count, 4 >> shift); 2495 __ bind(L_skip_align4); 2496 break; 2497 default: ShouldNotReachHere(); 2498 } 2499 } 2500 2501 // 2502 // Fill large chunks 2503 // 2504 __ lsrw(cnt_words, count, 3 - shift); // number of words 2505 __ bfi(value, value, 32, 32); // 32 bit -> 64 bit 2506 __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift); 2507 if (UseBlockZeroing) { 2508 Label non_block_zeroing, rest; 2509 // If the fill value is zero we can use the fast zero_words(). 
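      // (zero_words may in turn call the zero_blocks stub, which can use
      // DC ZVA block zeroing when that is enabled.)  For the non-zero
      // case the word count computed above is just the element count
      // scaled to 64-bit words, e.g. for T_SHORT (shift == 1) and
      // count == 37 (illustrative):
      //
      //   cnt_words = 37 >> (3 - 1) = 9          // 9 words == 72 bytes
      //   count     = 37 - (9 << 2) = 1          // one trailing element
      //
      // and that trailing element is mopped up by the unaligned 8-byte
      // store after the bulk fill.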
2510 __ cbnz(value, non_block_zeroing); 2511 __ mov(bz_base, to); 2512 __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord); 2513 address tpc = __ zero_words(bz_base, cnt_words); 2514 if (tpc == nullptr) { 2515 fatal("CodeCache is full at generate_fill"); 2516 } 2517 __ b(rest); 2518 __ bind(non_block_zeroing); 2519 __ fill_words(to, cnt_words, value); 2520 __ bind(rest); 2521 } else { 2522 __ fill_words(to, cnt_words, value); 2523 } 2524 2525 // Remaining count is less than 8 bytes. Fill it by a single store. 2526 // Note that the total length is no less than 8 bytes. 2527 if (t == T_BYTE || t == T_SHORT) { 2528 Label L_exit1; 2529 __ cbzw(count, L_exit1); 2530 __ add(to, to, count, Assembler::LSL, shift); // points to the end 2531 __ str(value, Address(to, -8)); // overwrite some elements 2532 __ bind(L_exit1); 2533 __ leave(); 2534 __ ret(lr); 2535 } 2536 2537 // Handle copies less than 8 bytes. 2538 Label L_fill_2, L_fill_4, L_exit2; 2539 __ bind(L_fill_elements); 2540 switch (t) { 2541 case T_BYTE: 2542 __ tbz(count, 0, L_fill_2); 2543 __ strb(value, Address(__ post(to, 1))); 2544 __ bind(L_fill_2); 2545 __ tbz(count, 1, L_fill_4); 2546 __ strh(value, Address(__ post(to, 2))); 2547 __ bind(L_fill_4); 2548 __ tbz(count, 2, L_exit2); 2549 __ strw(value, Address(to)); 2550 break; 2551 case T_SHORT: 2552 __ tbz(count, 0, L_fill_4); 2553 __ strh(value, Address(__ post(to, 2))); 2554 __ bind(L_fill_4); 2555 __ tbz(count, 1, L_exit2); 2556 __ strw(value, Address(to)); 2557 break; 2558 case T_INT: 2559 __ cbzw(count, L_exit2); 2560 __ strw(value, Address(to)); 2561 break; 2562 default: ShouldNotReachHere(); 2563 } 2564 __ bind(L_exit2); 2565 __ leave(); 2566 __ ret(lr); 2567 return start; 2568 } 2569 2570 address generate_unsafecopy_common_error_exit() { 2571 address start_pc = __ pc(); 2572 __ leave(); 2573 __ mov(r0, 0); 2574 __ ret(lr); 2575 return start_pc; 2576 } 2577 2578 // 2579 // Generate 'unsafe' set memory stub 2580 // Though just as safe as the other stubs, it takes an unscaled 2581 // size_t (# bytes) argument instead of an element count. 2582 // 2583 // This fill operation is atomicity preserving: as long as the 2584 // address supplied is sufficiently aligned, all writes of up to 64 2585 // bits in size are single-copy atomic. 
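  //
  // The stub broadcasts the byte value into a vector register, stores
  // 64 bytes per loop iteration, and then finishes by testing the
  // individual bits of the residual count, so at most one store is
  // issued per power-of-two size, e.g. (illustrative):
  //
  //   count % 64 == 13 == 0b1101  ->  one 8-byte, one 4-byte and one
  //                                   1-byte store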
2586 // 2587 // Input: 2588 // c_rarg0 - destination array address 2589 // c_rarg1 - byte count (size_t) 2590 // c_rarg2 - byte value 2591 // 2592 address generate_unsafe_setmemory() { 2593 __ align(CodeEntryAlignment); 2594 StubCodeMark mark(this, StubId::stubgen_unsafe_setmemory_id); 2595 address start = __ pc(); 2596 2597 Register dest = c_rarg0, count = c_rarg1, value = c_rarg2; 2598 Label tail; 2599 2600 UnsafeMemoryAccessMark umam(this, true, false); 2601 2602 __ enter(); // required for proper stackwalking of RuntimeStub frame 2603 2604 __ dup(v0, __ T16B, value); 2605 2606 if (AvoidUnalignedAccesses) { 2607 __ cmp(count, (u1)16); 2608 __ br(__ LO, tail); 2609 2610 __ mov(rscratch1, 16); 2611 __ andr(rscratch2, dest, 15); 2612 __ sub(rscratch1, rscratch1, rscratch2); // Bytes needed to 16-align dest 2613 __ strq(v0, Address(dest)); 2614 __ sub(count, count, rscratch1); 2615 __ add(dest, dest, rscratch1); 2616 } 2617 2618 __ subs(count, count, (u1)64); 2619 __ br(__ LO, tail); 2620 { 2621 Label again; 2622 __ bind(again); 2623 __ stpq(v0, v0, Address(dest)); 2624 __ stpq(v0, v0, Address(dest, 32)); 2625 2626 __ subs(count, count, 64); 2627 __ add(dest, dest, 64); 2628 __ br(__ HS, again); 2629 } 2630 2631 __ bind(tail); 2632 // The count of bytes is off by 64, but we don't need to correct 2633 // it because we're only going to use the least-significant few 2634 // count bits from here on. 2635 // __ add(count, count, 64); 2636 2637 { 2638 Label dont; 2639 __ tbz(count, exact_log2(32), dont); 2640 __ stpq(v0, v0, __ post(dest, 32)); 2641 __ bind(dont); 2642 } 2643 { 2644 Label dont; 2645 __ tbz(count, exact_log2(16), dont); 2646 __ strq(v0, __ post(dest, 16)); 2647 __ bind(dont); 2648 } 2649 { 2650 Label dont; 2651 __ tbz(count, exact_log2(8), dont); 2652 __ strd(v0, __ post(dest, 8)); 2653 __ bind(dont); 2654 } 2655 2656 Label finished; 2657 __ tst(count, 7); 2658 __ br(__ EQ, finished); 2659 2660 { 2661 Label dont; 2662 __ tbz(count, exact_log2(4), dont); 2663 __ strs(v0, __ post(dest, 4)); 2664 __ bind(dont); 2665 } 2666 { 2667 Label dont; 2668 __ tbz(count, exact_log2(2), dont); 2669 __ bfi(value, value, 8, 8); 2670 __ strh(value, __ post(dest, 2)); 2671 __ bind(dont); 2672 } 2673 { 2674 Label dont; 2675 __ tbz(count, exact_log2(1), dont); 2676 __ strb(value, Address(dest)); 2677 __ bind(dont); 2678 } 2679 2680 __ bind(finished); 2681 __ leave(); 2682 __ ret(lr); 2683 2684 return start; 2685 } 2686 2687 address generate_data_cache_writeback() { 2688 const Register line = c_rarg0; // address of line to write back 2689 2690 __ align(CodeEntryAlignment); 2691 2692 StubId stub_id = StubId::stubgen_data_cache_writeback_id; 2693 StubCodeMark mark(this, stub_id); 2694 2695 address start = __ pc(); 2696 __ enter(); 2697 __ cache_wb(Address(line, 0)); 2698 __ leave(); 2699 __ ret(lr); 2700 2701 return start; 2702 } 2703 2704 address generate_data_cache_writeback_sync() { 2705 const Register is_pre = c_rarg0; // pre or post sync 2706 2707 __ align(CodeEntryAlignment); 2708 2709 StubId stub_id = StubId::stubgen_data_cache_writeback_sync_id; 2710 StubCodeMark mark(this, stub_id); 2711 2712 // pre wbsync is a no-op 2713 // post wbsync translates to an sfence 2714 2715 Label skip; 2716 address start = __ pc(); 2717 __ enter(); 2718 __ cbnz(is_pre, skip); 2719 __ cache_wbsync(false); 2720 __ bind(skip); 2721 __ leave(); 2722 __ ret(lr); 2723 2724 return start; 2725 } 2726 2727 void generate_arraycopy_stubs() { 2728 address entry; 2729 address entry_jbyte_arraycopy; 2730 address 
entry_jshort_arraycopy; 2731 address entry_jint_arraycopy; 2732 address entry_oop_arraycopy; 2733 address entry_jlong_arraycopy; 2734 address entry_checkcast_arraycopy; 2735 2736 // generate the common exit first so later stubs can rely on it if 2737 // they want an UnsafeMemoryAccess exit non-local to the stub 2738 StubRoutines::_unsafecopy_common_exit = generate_unsafecopy_common_error_exit(); 2739 // register the stub as the default exit with class UnsafeMemoryAccess 2740 UnsafeMemoryAccess::set_common_exit_stub_pc(StubRoutines::_unsafecopy_common_exit); 2741 2742 generate_copy_longs(StubId::stubgen_copy_byte_f_id, IN_HEAP | IS_ARRAY, copy_f, r0, r1, r15); 2743 generate_copy_longs(StubId::stubgen_copy_byte_b_id, IN_HEAP | IS_ARRAY, copy_b, r0, r1, r15); 2744 2745 generate_copy_longs(StubId::stubgen_copy_oop_f_id, IN_HEAP | IS_ARRAY, copy_obj_f, r0, r1, r15); 2746 generate_copy_longs(StubId::stubgen_copy_oop_b_id, IN_HEAP | IS_ARRAY, copy_obj_b, r0, r1, r15); 2747 2748 generate_copy_longs(StubId::stubgen_copy_oop_uninit_f_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, copy_obj_uninit_f, r0, r1, r15); 2749 generate_copy_longs(StubId::stubgen_copy_oop_uninit_b_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, copy_obj_uninit_b, r0, r1, r15); 2750 2751 StubRoutines::aarch64::_zero_blocks = generate_zero_blocks(); 2752 2753 //*** jbyte 2754 // Always need aligned and unaligned versions 2755 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_jbyte_disjoint_arraycopy_id, &entry); 2756 StubRoutines::_jbyte_arraycopy = generate_conjoint_copy(StubId::stubgen_jbyte_arraycopy_id, entry, &entry_jbyte_arraycopy); 2757 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jbyte_disjoint_arraycopy_id, &entry); 2758 StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jbyte_arraycopy_id, entry, nullptr); 2759 2760 //*** jshort 2761 // Always need aligned and unaligned versions 2762 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_jshort_disjoint_arraycopy_id, &entry); 2763 StubRoutines::_jshort_arraycopy = generate_conjoint_copy(StubId::stubgen_jshort_arraycopy_id, entry, &entry_jshort_arraycopy); 2764 StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jshort_disjoint_arraycopy_id, &entry); 2765 StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jshort_arraycopy_id, entry, nullptr); 2766 2767 //*** jint 2768 // Aligned versions 2769 StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jint_disjoint_arraycopy_id, &entry); 2770 StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jint_arraycopy_id, entry, &entry_jint_arraycopy); 2771 // In 64 bit we need both aligned and unaligned versions of jint arraycopy. 
2772 // entry_jint_arraycopy always points to the unaligned version 2773 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_jint_disjoint_arraycopy_id, &entry); 2774 StubRoutines::_jint_arraycopy = generate_conjoint_copy(StubId::stubgen_jint_arraycopy_id, entry, &entry_jint_arraycopy); 2775 2776 //*** jlong 2777 // It is always aligned 2778 StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jlong_disjoint_arraycopy_id, &entry); 2779 StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jlong_arraycopy_id, entry, &entry_jlong_arraycopy); 2780 StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy; 2781 StubRoutines::_jlong_arraycopy = StubRoutines::_arrayof_jlong_arraycopy; 2782 2783 //*** oops 2784 { 2785 // With compressed oops we need unaligned versions; notice that 2786 // we overwrite entry_oop_arraycopy. 2787 bool aligned = !UseCompressedOops; 2788 2789 StubRoutines::_arrayof_oop_disjoint_arraycopy 2790 = generate_disjoint_copy(StubId::stubgen_arrayof_oop_disjoint_arraycopy_id, &entry); 2791 StubRoutines::_arrayof_oop_arraycopy 2792 = generate_conjoint_copy(StubId::stubgen_arrayof_oop_arraycopy_id, entry, &entry_oop_arraycopy); 2793 // Aligned versions without pre-barriers 2794 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit 2795 = generate_disjoint_copy(StubId::stubgen_arrayof_oop_disjoint_arraycopy_uninit_id, &entry); 2796 StubRoutines::_arrayof_oop_arraycopy_uninit 2797 = generate_conjoint_copy(StubId::stubgen_arrayof_oop_arraycopy_uninit_id, entry, nullptr); 2798 } 2799 2800 StubRoutines::_oop_disjoint_arraycopy = StubRoutines::_arrayof_oop_disjoint_arraycopy; 2801 StubRoutines::_oop_arraycopy = StubRoutines::_arrayof_oop_arraycopy; 2802 StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit; 2803 StubRoutines::_oop_arraycopy_uninit = StubRoutines::_arrayof_oop_arraycopy_uninit; 2804 2805 StubRoutines::_checkcast_arraycopy = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_id, &entry_checkcast_arraycopy); 2806 StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_uninit_id, nullptr); 2807 2808 StubRoutines::_unsafe_arraycopy = generate_unsafe_copy(entry_jbyte_arraycopy, 2809 entry_jshort_arraycopy, 2810 entry_jint_arraycopy, 2811 entry_jlong_arraycopy); 2812 2813 StubRoutines::_generic_arraycopy = generate_generic_copy(entry_jbyte_arraycopy, 2814 entry_jshort_arraycopy, 2815 entry_jint_arraycopy, 2816 entry_oop_arraycopy, 2817 entry_jlong_arraycopy, 2818 entry_checkcast_arraycopy); 2819 2820 StubRoutines::_jbyte_fill = generate_fill(StubId::stubgen_jbyte_fill_id); 2821 StubRoutines::_jshort_fill = generate_fill(StubId::stubgen_jshort_fill_id); 2822 StubRoutines::_jint_fill = generate_fill(StubId::stubgen_jint_fill_id); 2823 StubRoutines::_arrayof_jbyte_fill = generate_fill(StubId::stubgen_arrayof_jbyte_fill_id); 2824 StubRoutines::_arrayof_jshort_fill = generate_fill(StubId::stubgen_arrayof_jshort_fill_id); 2825 StubRoutines::_arrayof_jint_fill = generate_fill(StubId::stubgen_arrayof_jint_fill_id); 2826 } 2827 2828 void generate_math_stubs() { Unimplemented(); } 2829 2830 // Arguments: 2831 // 2832 // Inputs: 2833 // c_rarg0 - source byte array address 2834 // c_rarg1 - destination byte array address 2835 // c_rarg2 - K (key) in little endian int array 2836 // 2837 address generate_aescrypt_encryptBlock() { 2838 __ 
align(CodeEntryAlignment); 2839 StubId stub_id = StubId::stubgen_aescrypt_encryptBlock_id; 2840 StubCodeMark mark(this, stub_id); 2841 2842 const Register from = c_rarg0; // source array address 2843 const Register to = c_rarg1; // destination array address 2844 const Register key = c_rarg2; // key array address 2845 const Register keylen = rscratch1; 2846 2847 address start = __ pc(); 2848 __ enter(); 2849 2850 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2851 2852 __ aesenc_loadkeys(key, keylen); 2853 __ aesecb_encrypt(from, to, keylen); 2854 2855 __ mov(r0, 0); 2856 2857 __ leave(); 2858 __ ret(lr); 2859 2860 return start; 2861 } 2862 2863 // Arguments: 2864 // 2865 // Inputs: 2866 // c_rarg0 - source byte array address 2867 // c_rarg1 - destination byte array address 2868 // c_rarg2 - K (key) in little endian int array 2869 // 2870 address generate_aescrypt_decryptBlock() { 2871 assert(UseAES, "need AES cryptographic extension support"); 2872 __ align(CodeEntryAlignment); 2873 StubId stub_id = StubId::stubgen_aescrypt_decryptBlock_id; 2874 StubCodeMark mark(this, stub_id); 2875 Label L_doLast; 2876 2877 const Register from = c_rarg0; // source array address 2878 const Register to = c_rarg1; // destination array address 2879 const Register key = c_rarg2; // key array address 2880 const Register keylen = rscratch1; 2881 2882 address start = __ pc(); 2883 __ enter(); // required for proper stackwalking of RuntimeStub frame 2884 2885 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2886 2887 __ aesecb_decrypt(from, to, key, keylen); 2888 2889 __ mov(r0, 0); 2890 2891 __ leave(); 2892 __ ret(lr); 2893 2894 return start; 2895 } 2896 2897 // Arguments: 2898 // 2899 // Inputs: 2900 // c_rarg0 - source byte array address 2901 // c_rarg1 - destination byte array address 2902 // c_rarg2 - K (key) in little endian int array 2903 // c_rarg3 - r vector byte array address 2904 // c_rarg4 - input length 2905 // 2906 // Output: 2907 // x0 - input length 2908 // 2909 address generate_cipherBlockChaining_encryptAESCrypt() { 2910 assert(UseAES, "need AES cryptographic extension support"); 2911 __ align(CodeEntryAlignment); 2912 StubId stub_id = StubId::stubgen_cipherBlockChaining_encryptAESCrypt_id; 2913 StubCodeMark mark(this, stub_id); 2914 2915 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 2916 2917 const Register from = c_rarg0; // source array address 2918 const Register to = c_rarg1; // destination array address 2919 const Register key = c_rarg2; // key array address 2920 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2921 // and left with the results of the last encryption block 2922 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2923 const Register keylen = rscratch1; 2924 2925 address start = __ pc(); 2926 2927 __ enter(); 2928 2929 __ movw(rscratch2, len_reg); 2930 2931 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2932 2933 __ ld1(v0, __ T16B, rvec); 2934 2935 __ cmpw(keylen, 52); 2936 __ br(Assembler::CC, L_loadkeys_44); 2937 __ br(Assembler::EQ, L_loadkeys_52); 2938 2939 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2940 __ rev32(v17, __ T16B, v17); 2941 __ rev32(v18, __ T16B, v18); 2942 __ BIND(L_loadkeys_52); 2943 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2944 __ rev32(v19, __ T16B, 
v19); 2945 __ rev32(v20, __ T16B, v20); 2946 __ BIND(L_loadkeys_44); 2947 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2948 __ rev32(v21, __ T16B, v21); 2949 __ rev32(v22, __ T16B, v22); 2950 __ rev32(v23, __ T16B, v23); 2951 __ rev32(v24, __ T16B, v24); 2952 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2953 __ rev32(v25, __ T16B, v25); 2954 __ rev32(v26, __ T16B, v26); 2955 __ rev32(v27, __ T16B, v27); 2956 __ rev32(v28, __ T16B, v28); 2957 __ ld1(v29, v30, v31, __ T16B, key); 2958 __ rev32(v29, __ T16B, v29); 2959 __ rev32(v30, __ T16B, v30); 2960 __ rev32(v31, __ T16B, v31); 2961 2962 __ BIND(L_aes_loop); 2963 __ ld1(v1, __ T16B, __ post(from, 16)); 2964 __ eor(v0, __ T16B, v0, v1); 2965 2966 __ br(Assembler::CC, L_rounds_44); 2967 __ br(Assembler::EQ, L_rounds_52); 2968 2969 __ aese(v0, v17); __ aesmc(v0, v0); 2970 __ aese(v0, v18); __ aesmc(v0, v0); 2971 __ BIND(L_rounds_52); 2972 __ aese(v0, v19); __ aesmc(v0, v0); 2973 __ aese(v0, v20); __ aesmc(v0, v0); 2974 __ BIND(L_rounds_44); 2975 __ aese(v0, v21); __ aesmc(v0, v0); 2976 __ aese(v0, v22); __ aesmc(v0, v0); 2977 __ aese(v0, v23); __ aesmc(v0, v0); 2978 __ aese(v0, v24); __ aesmc(v0, v0); 2979 __ aese(v0, v25); __ aesmc(v0, v0); 2980 __ aese(v0, v26); __ aesmc(v0, v0); 2981 __ aese(v0, v27); __ aesmc(v0, v0); 2982 __ aese(v0, v28); __ aesmc(v0, v0); 2983 __ aese(v0, v29); __ aesmc(v0, v0); 2984 __ aese(v0, v30); 2985 __ eor(v0, __ T16B, v0, v31); 2986 2987 __ st1(v0, __ T16B, __ post(to, 16)); 2988 2989 __ subw(len_reg, len_reg, 16); 2990 __ cbnzw(len_reg, L_aes_loop); 2991 2992 __ st1(v0, __ T16B, rvec); 2993 2994 __ mov(r0, rscratch2); 2995 2996 __ leave(); 2997 __ ret(lr); 2998 2999 return start; 3000 } 3001 3002 // Arguments: 3003 // 3004 // Inputs: 3005 // c_rarg0 - source byte array address 3006 // c_rarg1 - destination byte array address 3007 // c_rarg2 - K (key) in little endian int array 3008 // c_rarg3 - r vector byte array address 3009 // c_rarg4 - input length 3010 // 3011 // Output: 3012 // r0 - input length 3013 // 3014 address generate_cipherBlockChaining_decryptAESCrypt() { 3015 assert(UseAES, "need AES cryptographic extension support"); 3016 __ align(CodeEntryAlignment); 3017 StubId stub_id = StubId::stubgen_cipherBlockChaining_decryptAESCrypt_id; 3018 StubCodeMark mark(this, stub_id); 3019 3020 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 3021 3022 const Register from = c_rarg0; // source array address 3023 const Register to = c_rarg1; // destination array address 3024 const Register key = c_rarg2; // key array address 3025 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 3026 // and left with the results of the last encryption block 3027 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 3028 const Register keylen = rscratch1; 3029 3030 address start = __ pc(); 3031 3032 __ enter(); 3033 3034 __ movw(rscratch2, len_reg); 3035 3036 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 3037 3038 __ ld1(v2, __ T16B, rvec); 3039 3040 __ ld1(v31, __ T16B, __ post(key, 16)); 3041 __ rev32(v31, __ T16B, v31); 3042 3043 __ cmpw(keylen, 52); 3044 __ br(Assembler::CC, L_loadkeys_44); 3045 __ br(Assembler::EQ, L_loadkeys_52); 3046 3047 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 3048 __ rev32(v17, __ T16B, v17); 3049 __ rev32(v18, __ T16B, v18); 3050 __ BIND(L_loadkeys_52); 3051 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 3052 __ rev32(v19, __ 
T16B, v19); 3053 __ rev32(v20, __ T16B, v20); 3054 __ BIND(L_loadkeys_44); 3055 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 3056 __ rev32(v21, __ T16B, v21); 3057 __ rev32(v22, __ T16B, v22); 3058 __ rev32(v23, __ T16B, v23); 3059 __ rev32(v24, __ T16B, v24); 3060 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 3061 __ rev32(v25, __ T16B, v25); 3062 __ rev32(v26, __ T16B, v26); 3063 __ rev32(v27, __ T16B, v27); 3064 __ rev32(v28, __ T16B, v28); 3065 __ ld1(v29, v30, __ T16B, key); 3066 __ rev32(v29, __ T16B, v29); 3067 __ rev32(v30, __ T16B, v30); 3068 3069 __ BIND(L_aes_loop); 3070 __ ld1(v0, __ T16B, __ post(from, 16)); 3071 __ orr(v1, __ T16B, v0, v0); 3072 3073 __ br(Assembler::CC, L_rounds_44); 3074 __ br(Assembler::EQ, L_rounds_52); 3075 3076 __ aesd(v0, v17); __ aesimc(v0, v0); 3077 __ aesd(v0, v18); __ aesimc(v0, v0); 3078 __ BIND(L_rounds_52); 3079 __ aesd(v0, v19); __ aesimc(v0, v0); 3080 __ aesd(v0, v20); __ aesimc(v0, v0); 3081 __ BIND(L_rounds_44); 3082 __ aesd(v0, v21); __ aesimc(v0, v0); 3083 __ aesd(v0, v22); __ aesimc(v0, v0); 3084 __ aesd(v0, v23); __ aesimc(v0, v0); 3085 __ aesd(v0, v24); __ aesimc(v0, v0); 3086 __ aesd(v0, v25); __ aesimc(v0, v0); 3087 __ aesd(v0, v26); __ aesimc(v0, v0); 3088 __ aesd(v0, v27); __ aesimc(v0, v0); 3089 __ aesd(v0, v28); __ aesimc(v0, v0); 3090 __ aesd(v0, v29); __ aesimc(v0, v0); 3091 __ aesd(v0, v30); 3092 __ eor(v0, __ T16B, v0, v31); 3093 __ eor(v0, __ T16B, v0, v2); 3094 3095 __ st1(v0, __ T16B, __ post(to, 16)); 3096 __ orr(v2, __ T16B, v1, v1); 3097 3098 __ subw(len_reg, len_reg, 16); 3099 __ cbnzw(len_reg, L_aes_loop); 3100 3101 __ st1(v2, __ T16B, rvec); 3102 3103 __ mov(r0, rscratch2); 3104 3105 __ leave(); 3106 __ ret(lr); 3107 3108 return start; 3109 } 3110 3111 // Big-endian 128-bit + 64-bit -> 128-bit addition. 3112 // Inputs: 128-bits. in is preserved. 3113 // The least-significant 64-bit word is in the upper dword of each vector. 3114 // inc (the 64-bit increment) is preserved. Its lower dword must be zero. 3115 // Output: result 3116 void be_add_128_64(FloatRegister result, FloatRegister in, 3117 FloatRegister inc, FloatRegister tmp) { 3118 assert_different_registers(result, tmp, inc); 3119 3120 __ addv(result, __ T2D, in, inc); // Add inc to the least-significant dword of 3121 // input 3122 __ cm(__ HI, tmp, __ T2D, inc, result);// Check for result overflowing 3123 __ ext(tmp, __ T16B, tmp, tmp, 0x08); // Swap LSD of comparison result to MSD and 3124 // MSD == 0 (must be!) to LSD 3125 __ subv(result, __ T2D, result, tmp); // Subtract -1 from MSD if there was an overflow 3126 } 3127 3128 // CTR AES crypt. 
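  //
  // The 128-bit counter is kept byte-reversed in a vector register and
  // bumped with be_add_128_64 above.  A worked example of that helper
  // (values illustrative):
  //
  //   in  = { msw = 0x0000000000000000, lsw = 0xffffffffffffffff }
  //   inc = { 0, 1 }
  //   addv    -> result = { 0, 0 }        // lsw wrapped around
  //   cm(HI)  -> tmp    = { 0, ~0 }       // carry detected in the lsw lane
  //   ext #8  -> tmp    = { ~0, 0 }       // carry mask moved to the msw lane
  //   subv    -> result = { 1, 0 }        // msw - (-1) == msw + 1
  //
  // i.e. 0x0000..0000ffff..ffff + 1 == 0x0000..0001 0000..0000.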
3129 // Arguments: 3130 // 3131 // Inputs: 3132 // c_rarg0 - source byte array address 3133 // c_rarg1 - destination byte array address 3134 // c_rarg2 - K (key) in little endian int array 3135 // c_rarg3 - counter vector byte array address 3136 // c_rarg4 - input length 3137 // c_rarg5 - saved encryptedCounter start 3138 // c_rarg6 - saved used length 3139 // 3140 // Output: 3141 // r0 - input length 3142 // 3143 address generate_counterMode_AESCrypt() { 3144 const Register in = c_rarg0; 3145 const Register out = c_rarg1; 3146 const Register key = c_rarg2; 3147 const Register counter = c_rarg3; 3148 const Register saved_len = c_rarg4, len = r10; 3149 const Register saved_encrypted_ctr = c_rarg5; 3150 const Register used_ptr = c_rarg6, used = r12; 3151 3152 const Register offset = r7; 3153 const Register keylen = r11; 3154 3155 const unsigned char block_size = 16; 3156 const int bulk_width = 4; 3157 // NB: bulk_width can be 4 or 8. 8 gives slightly faster 3158 // performance with larger data sizes, but it also means that the 3159 // fast path isn't used until you have at least 8 blocks, and up 3160 // to 127 bytes of data will be executed on the slow path. For 3161 // that reason, and also so as not to blow away too much icache, 4 3162 // blocks seems like a sensible compromise. 3163 3164 // Algorithm: 3165 // 3166 // if (len == 0) { 3167 // goto DONE; 3168 // } 3169 // int result = len; 3170 // do { 3171 // if (used >= blockSize) { 3172 // if (len >= bulk_width * blockSize) { 3173 // CTR_large_block(); 3174 // if (len == 0) 3175 // goto DONE; 3176 // } 3177 // for (;;) { 3178 // 16ByteVector v0 = counter; 3179 // embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0); 3180 // used = 0; 3181 // if (len < blockSize) 3182 // break; /* goto NEXT */ 3183 // 16ByteVector v1 = load16Bytes(in, offset); 3184 // v1 = v1 ^ encryptedCounter; 3185 // store16Bytes(out, offset); 3186 // used = blockSize; 3187 // offset += blockSize; 3188 // len -= blockSize; 3189 // if (len == 0) 3190 // goto DONE; 3191 // } 3192 // } 3193 // NEXT: 3194 // out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]); 3195 // len--; 3196 // } while (len != 0); 3197 // DONE: 3198 // return result; 3199 // 3200 // CTR_large_block() 3201 // Wide bulk encryption of whole blocks. 
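  // In other words the stub implements standard CTR mode: each 16-byte
  // counter value is encrypted once with E_k, and the plaintext is XORed
  // with that keystream block byte by byte.  A scalar sketch
  // (illustrative only):
  //
  //   for (int i = 0; i < len; i++) {
  //     if (used >= 16) { encryptedCounter = E_k(counter); counter++; used = 0; }
  //     out[i] = in[i] ^ encryptedCounter[used++];
  //   }
  //
  // 'used' is persisted through used_ptr, so a message split across
  // several calls sees exactly the same keystream as a single call.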
3202 3203 __ align(CodeEntryAlignment); 3204 StubId stub_id = StubId::stubgen_counterMode_AESCrypt_id; 3205 StubCodeMark mark(this, stub_id); 3206 const address start = __ pc(); 3207 __ enter(); 3208 3209 Label DONE, CTR_large_block, large_block_return; 3210 __ ldrw(used, Address(used_ptr)); 3211 __ cbzw(saved_len, DONE); 3212 3213 __ mov(len, saved_len); 3214 __ mov(offset, 0); 3215 3216 // Compute #rounds for AES based on the length of the key array 3217 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 3218 3219 __ aesenc_loadkeys(key, keylen); 3220 3221 { 3222 Label L_CTR_loop, NEXT; 3223 3224 __ bind(L_CTR_loop); 3225 3226 __ cmp(used, block_size); 3227 __ br(__ LO, NEXT); 3228 3229 // Maybe we have a lot of data 3230 __ subsw(rscratch1, len, bulk_width * block_size); 3231 __ br(__ HS, CTR_large_block); 3232 __ BIND(large_block_return); 3233 __ cbzw(len, DONE); 3234 3235 // Setup the counter 3236 __ movi(v4, __ T4S, 0); 3237 __ movi(v5, __ T4S, 1); 3238 __ ins(v4, __ S, v5, 2, 2); // v4 contains { 0, 1 } 3239 3240 // 128-bit big-endian increment 3241 __ ld1(v0, __ T16B, counter); 3242 __ rev64(v16, __ T16B, v0); 3243 be_add_128_64(v16, v16, v4, /*tmp*/v5); 3244 __ rev64(v16, __ T16B, v16); 3245 __ st1(v16, __ T16B, counter); 3246 // Previous counter value is in v0 3247 // v4 contains { 0, 1 } 3248 3249 { 3250 // We have fewer than bulk_width blocks of data left. Encrypt 3251 // them one by one until there is less than a full block 3252 // remaining, being careful to save both the encrypted counter 3253 // and the counter. 3254 3255 Label inner_loop; 3256 __ bind(inner_loop); 3257 // Counter to encrypt is in v0 3258 __ aesecb_encrypt(noreg, noreg, keylen); 3259 __ st1(v0, __ T16B, saved_encrypted_ctr); 3260 3261 // Do we have a remaining full block? 3262 3263 __ mov(used, 0); 3264 __ cmp(len, block_size); 3265 __ br(__ LO, NEXT); 3266 3267 // Yes, we have a full block 3268 __ ldrq(v1, Address(in, offset)); 3269 __ eor(v1, __ T16B, v1, v0); 3270 __ strq(v1, Address(out, offset)); 3271 __ mov(used, block_size); 3272 __ add(offset, offset, block_size); 3273 3274 __ subw(len, len, block_size); 3275 __ cbzw(len, DONE); 3276 3277 // Increment the counter, store it back 3278 __ orr(v0, __ T16B, v16, v16); 3279 __ rev64(v16, __ T16B, v16); 3280 be_add_128_64(v16, v16, v4, /*tmp*/v5); 3281 __ rev64(v16, __ T16B, v16); 3282 __ st1(v16, __ T16B, counter); // Save the incremented counter back 3283 3284 __ b(inner_loop); 3285 } 3286 3287 __ BIND(NEXT); 3288 3289 // Encrypt a single byte, and loop. 3290 // We expect this to be a rare event. 
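      // (This only happens for the final 1..15 bytes of a request, or
      // when a previous call left a partially consumed keystream block
      // behind.)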
3291 __ ldrb(rscratch1, Address(in, offset)); 3292 __ ldrb(rscratch2, Address(saved_encrypted_ctr, used)); 3293 __ eor(rscratch1, rscratch1, rscratch2); 3294 __ strb(rscratch1, Address(out, offset)); 3295 __ add(offset, offset, 1); 3296 __ add(used, used, 1); 3297 __ subw(len, len,1); 3298 __ cbnzw(len, L_CTR_loop); 3299 } 3300 3301 __ bind(DONE); 3302 __ strw(used, Address(used_ptr)); 3303 __ mov(r0, saved_len); 3304 3305 __ leave(); // required for proper stackwalking of RuntimeStub frame 3306 __ ret(lr); 3307 3308 // Bulk encryption 3309 3310 __ BIND (CTR_large_block); 3311 assert(bulk_width == 4 || bulk_width == 8, "must be"); 3312 3313 if (bulk_width == 8) { 3314 __ sub(sp, sp, 4 * 16); 3315 __ st1(v12, v13, v14, v15, __ T16B, Address(sp)); 3316 } 3317 __ sub(sp, sp, 4 * 16); 3318 __ st1(v8, v9, v10, v11, __ T16B, Address(sp)); 3319 RegSet saved_regs = (RegSet::of(in, out, offset) 3320 + RegSet::of(saved_encrypted_ctr, used_ptr, len)); 3321 __ push(saved_regs, sp); 3322 __ andr(len, len, -16 * bulk_width); // 8/4 encryptions, 16 bytes per encryption 3323 __ add(in, in, offset); 3324 __ add(out, out, offset); 3325 3326 // Keys should already be loaded into the correct registers 3327 3328 __ ld1(v0, __ T16B, counter); // v0 contains the first counter 3329 __ rev64(v16, __ T16B, v0); // v16 contains byte-reversed counter 3330 3331 // AES/CTR loop 3332 { 3333 Label L_CTR_loop; 3334 __ BIND(L_CTR_loop); 3335 3336 // Setup the counters 3337 __ movi(v8, __ T4S, 0); 3338 __ movi(v9, __ T4S, 1); 3339 __ ins(v8, __ S, v9, 2, 2); // v8 contains { 0, 1 } 3340 3341 for (int i = 0; i < bulk_width; i++) { 3342 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i); 3343 __ rev64(v0_ofs, __ T16B, v16); 3344 be_add_128_64(v16, v16, v8, /*tmp*/v9); 3345 } 3346 3347 __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16)); 3348 3349 // Encrypt the counters 3350 __ aesecb_encrypt(noreg, noreg, keylen, v0, bulk_width); 3351 3352 if (bulk_width == 8) { 3353 __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16)); 3354 } 3355 3356 // XOR the encrypted counters with the inputs 3357 for (int i = 0; i < bulk_width; i++) { 3358 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i); 3359 FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i); 3360 __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs); 3361 } 3362 3363 // Write the encrypted data 3364 __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16)); 3365 if (bulk_width == 8) { 3366 __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16)); 3367 } 3368 3369 __ subw(len, len, 16 * bulk_width); 3370 __ cbnzw(len, L_CTR_loop); 3371 } 3372 3373 // Save the counter back where it goes 3374 __ rev64(v16, __ T16B, v16); 3375 __ st1(v16, __ T16B, counter); 3376 3377 __ pop(saved_regs, sp); 3378 3379 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16)); 3380 if (bulk_width == 8) { 3381 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16)); 3382 } 3383 3384 __ andr(rscratch1, len, -16 * bulk_width); 3385 __ sub(len, len, rscratch1); 3386 __ add(offset, offset, rscratch1); 3387 __ mov(used, 16); 3388 __ strw(used, Address(used_ptr)); 3389 __ b(large_block_return); 3390 3391 return start; 3392 } 3393 3394 // Vector AES Galois Counter Mode implementation. 
Parameters: 3395 // 3396 // in = c_rarg0 3397 // len = c_rarg1 3398 // ct = c_rarg2 - ciphertext that ghash will read (in for encrypt, out for decrypt) 3399 // out = c_rarg3 3400 // key = c_rarg4 3401 // state = c_rarg5 - GHASH.state 3402 // subkeyHtbl = c_rarg6 - powers of H 3403 // counter = c_rarg7 - 16 bytes of CTR 3404 // return - number of processed bytes 3405 address generate_galoisCounterMode_AESCrypt() { 3406 address ghash_polynomial = __ pc(); 3407 __ emit_int64(0x87); // The low-order bits of the field 3408 // polynomial (i.e. p = z^7+z^2+z+1) 3409 // repeated in the low and high parts of a 3410 // 128-bit vector 3411 __ emit_int64(0x87); 3412 3413 __ align(CodeEntryAlignment); 3414 StubId stub_id = StubId::stubgen_galoisCounterMode_AESCrypt_id; 3415 StubCodeMark mark(this, stub_id); 3416 address start = __ pc(); 3417 __ enter(); 3418 3419 const Register in = c_rarg0; 3420 const Register len = c_rarg1; 3421 const Register ct = c_rarg2; 3422 const Register out = c_rarg3; 3423 // and updated with the incremented counter in the end 3424 3425 const Register key = c_rarg4; 3426 const Register state = c_rarg5; 3427 3428 const Register subkeyHtbl = c_rarg6; 3429 3430 const Register counter = c_rarg7; 3431 3432 const Register keylen = r10; 3433 // Save state before entering routine 3434 __ sub(sp, sp, 4 * 16); 3435 __ st1(v12, v13, v14, v15, __ T16B, Address(sp)); 3436 __ sub(sp, sp, 4 * 16); 3437 __ st1(v8, v9, v10, v11, __ T16B, Address(sp)); 3438 3439 // __ andr(len, len, -512); 3440 __ andr(len, len, -16 * 8); // 8 encryptions, 16 bytes per encryption 3441 __ str(len, __ pre(sp, -2 * wordSize)); 3442 3443 Label DONE; 3444 __ cbz(len, DONE); 3445 3446 // Compute #rounds for AES based on the length of the key array 3447 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 3448 3449 __ aesenc_loadkeys(key, keylen); 3450 __ ld1(v0, __ T16B, counter); // v0 contains the first counter 3451 __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter 3452 3453 // AES/CTR loop 3454 { 3455 Label L_CTR_loop; 3456 __ BIND(L_CTR_loop); 3457 3458 // Setup the counters 3459 __ movi(v8, __ T4S, 0); 3460 __ movi(v9, __ T4S, 1); 3461 __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 } 3462 3463 assert(v0->encoding() < v8->encoding(), ""); 3464 for (int i = v0->encoding(); i < v8->encoding(); i++) { 3465 FloatRegister f = as_FloatRegister(i); 3466 __ rev32(f, __ T16B, v16); 3467 __ addv(v16, __ T4S, v16, v8); 3468 } 3469 3470 __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16)); 3471 3472 // Encrypt the counters 3473 __ aesecb_encrypt(noreg, noreg, keylen, v0, /*unrolls*/8); 3474 3475 __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16)); 3476 3477 // XOR the encrypted counters with the inputs 3478 for (int i = 0; i < 8; i++) { 3479 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i); 3480 FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i); 3481 __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs); 3482 } 3483 __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16)); 3484 __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16)); 3485 3486 __ subw(len, len, 16 * 8); 3487 __ cbnzw(len, L_CTR_loop); 3488 } 3489 3490 __ rev32(v16, __ T16B, v16); 3491 __ st1(v16, __ T16B, counter); 3492 3493 __ ldr(len, Address(sp)); 3494 __ lsr(len, len, exact_log2(16)); // We want the count of blocks 3495 3496 // GHASH/CTR loop 3497 __ ghash_processBlocks_wide(ghash_polynomial, state, subkeyHtbl, ct, 3498 len, /*unrolls*/4); 3499 3500 #ifdef 
ASSERT 3501 { Label L; 3502 __ cmp(len, (unsigned char)0); 3503 __ br(Assembler::EQ, L); 3504 __ stop("stubGenerator: abort"); 3505 __ bind(L); 3506 } 3507 #endif 3508 3509 __ bind(DONE); 3510 // Return the number of bytes processed 3511 __ ldr(r0, __ post(sp, 2 * wordSize)); 3512 3513 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16)); 3514 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16)); 3515 3516 __ leave(); // required for proper stackwalking of RuntimeStub frame 3517 __ ret(lr); 3518 return start; 3519 } 3520 3521 class Cached64Bytes { 3522 private: 3523 MacroAssembler *_masm; 3524 Register _regs[8]; 3525 3526 public: 3527 Cached64Bytes(MacroAssembler *masm, RegSet rs): _masm(masm) { 3528 assert(rs.size() == 8, "%u registers are used to cache 16 4-byte data", rs.size()); 3529 auto it = rs.begin(); 3530 for (auto &r: _regs) { 3531 r = *it; 3532 ++it; 3533 } 3534 } 3535 3536 void gen_loads(Register base) { 3537 for (int i = 0; i < 8; i += 2) { 3538 __ ldp(_regs[i], _regs[i + 1], Address(base, 8 * i)); 3539 } 3540 } 3541 3542 // Generate code extracting i-th unsigned word (4 bytes) from cached 64 bytes. 3543 void extract_u32(Register dest, int i) { 3544 __ ubfx(dest, _regs[i / 2], 32 * (i % 2), 32); 3545 } 3546 }; 3547 3548 // Utility routines for md5. 3549 // Clobbers r10 and r11. 3550 void md5_FF(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4, 3551 int k, int s, int t) { 3552 Register rscratch3 = r10; 3553 Register rscratch4 = r11; 3554 3555 __ eorw(rscratch3, r3, r4); 3556 __ movw(rscratch2, t); 3557 __ andw(rscratch3, rscratch3, r2); 3558 __ addw(rscratch4, r1, rscratch2); 3559 reg_cache.extract_u32(rscratch1, k); 3560 __ eorw(rscratch3, rscratch3, r4); 3561 __ addw(rscratch4, rscratch4, rscratch1); 3562 __ addw(rscratch3, rscratch3, rscratch4); 3563 __ rorw(rscratch2, rscratch3, 32 - s); 3564 __ addw(r1, rscratch2, r2); 3565 } 3566 3567 void md5_GG(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4, 3568 int k, int s, int t) { 3569 Register rscratch3 = r10; 3570 Register rscratch4 = r11; 3571 3572 reg_cache.extract_u32(rscratch1, k); 3573 __ movw(rscratch2, t); 3574 __ addw(rscratch4, r1, rscratch2); 3575 __ addw(rscratch4, rscratch4, rscratch1); 3576 __ bicw(rscratch2, r3, r4); 3577 __ andw(rscratch3, r2, r4); 3578 __ addw(rscratch2, rscratch2, rscratch4); 3579 __ addw(rscratch2, rscratch2, rscratch3); 3580 __ rorw(rscratch2, rscratch2, 32 - s); 3581 __ addw(r1, rscratch2, r2); 3582 } 3583 3584 void md5_HH(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4, 3585 int k, int s, int t) { 3586 Register rscratch3 = r10; 3587 Register rscratch4 = r11; 3588 3589 __ eorw(rscratch3, r3, r4); 3590 __ movw(rscratch2, t); 3591 __ addw(rscratch4, r1, rscratch2); 3592 reg_cache.extract_u32(rscratch1, k); 3593 __ eorw(rscratch3, rscratch3, r2); 3594 __ addw(rscratch4, rscratch4, rscratch1); 3595 __ addw(rscratch3, rscratch3, rscratch4); 3596 __ rorw(rscratch2, rscratch3, 32 - s); 3597 __ addw(r1, rscratch2, r2); 3598 } 3599 3600 void md5_II(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4, 3601 int k, int s, int t) { 3602 Register rscratch3 = r10; 3603 Register rscratch4 = r11; 3604 3605 __ movw(rscratch3, t); 3606 __ ornw(rscratch2, r2, r4); 3607 __ addw(rscratch4, r1, rscratch3); 3608 reg_cache.extract_u32(rscratch1, k); 3609 __ eorw(rscratch3, rscratch2, r3); 3610 __ addw(rscratch4, rscratch4, rscratch1); 3611 __ addw(rscratch3, rscratch3, rscratch4); 3612 __ 
rorw(rscratch2, rscratch3, 32 - s); 3613 __ addw(r1, rscratch2, r2); 3614 } 3615 3616 // Arguments: 3617 // 3618 // Inputs: 3619 // c_rarg0 - byte[] source+offset 3620 // c_rarg1 - int[] SHA.state 3621 // c_rarg2 - int offset 3622 // c_rarg3 - int limit 3623 // 3624 address generate_md5_implCompress(StubId stub_id) { 3625 bool multi_block; 3626 switch (stub_id) { 3627 case StubId::stubgen_md5_implCompress_id: 3628 multi_block = false; 3629 break; 3630 case StubId::stubgen_md5_implCompressMB_id: 3631 multi_block = true; 3632 break; 3633 default: 3634 ShouldNotReachHere(); 3635 } 3636 __ align(CodeEntryAlignment); 3637 3638 StubCodeMark mark(this, stub_id); 3639 address start = __ pc(); 3640 3641 Register buf = c_rarg0; 3642 Register state = c_rarg1; 3643 Register ofs = c_rarg2; 3644 Register limit = c_rarg3; 3645 Register a = r4; 3646 Register b = r5; 3647 Register c = r6; 3648 Register d = r7; 3649 Register rscratch3 = r10; 3650 Register rscratch4 = r11; 3651 3652 Register state_regs[2] = { r12, r13 }; 3653 RegSet saved_regs = RegSet::range(r16, r22) - r18_tls; 3654 Cached64Bytes reg_cache(_masm, RegSet::of(r14, r15) + saved_regs); // using 8 registers 3655 3656 __ push(saved_regs, sp); 3657 3658 __ ldp(state_regs[0], state_regs[1], Address(state)); 3659 __ ubfx(a, state_regs[0], 0, 32); 3660 __ ubfx(b, state_regs[0], 32, 32); 3661 __ ubfx(c, state_regs[1], 0, 32); 3662 __ ubfx(d, state_regs[1], 32, 32); 3663 3664 Label md5_loop; 3665 __ BIND(md5_loop); 3666 3667 reg_cache.gen_loads(buf); 3668 3669 // Round 1 3670 md5_FF(reg_cache, a, b, c, d, 0, 7, 0xd76aa478); 3671 md5_FF(reg_cache, d, a, b, c, 1, 12, 0xe8c7b756); 3672 md5_FF(reg_cache, c, d, a, b, 2, 17, 0x242070db); 3673 md5_FF(reg_cache, b, c, d, a, 3, 22, 0xc1bdceee); 3674 md5_FF(reg_cache, a, b, c, d, 4, 7, 0xf57c0faf); 3675 md5_FF(reg_cache, d, a, b, c, 5, 12, 0x4787c62a); 3676 md5_FF(reg_cache, c, d, a, b, 6, 17, 0xa8304613); 3677 md5_FF(reg_cache, b, c, d, a, 7, 22, 0xfd469501); 3678 md5_FF(reg_cache, a, b, c, d, 8, 7, 0x698098d8); 3679 md5_FF(reg_cache, d, a, b, c, 9, 12, 0x8b44f7af); 3680 md5_FF(reg_cache, c, d, a, b, 10, 17, 0xffff5bb1); 3681 md5_FF(reg_cache, b, c, d, a, 11, 22, 0x895cd7be); 3682 md5_FF(reg_cache, a, b, c, d, 12, 7, 0x6b901122); 3683 md5_FF(reg_cache, d, a, b, c, 13, 12, 0xfd987193); 3684 md5_FF(reg_cache, c, d, a, b, 14, 17, 0xa679438e); 3685 md5_FF(reg_cache, b, c, d, a, 15, 22, 0x49b40821); 3686 3687 // Round 2 3688 md5_GG(reg_cache, a, b, c, d, 1, 5, 0xf61e2562); 3689 md5_GG(reg_cache, d, a, b, c, 6, 9, 0xc040b340); 3690 md5_GG(reg_cache, c, d, a, b, 11, 14, 0x265e5a51); 3691 md5_GG(reg_cache, b, c, d, a, 0, 20, 0xe9b6c7aa); 3692 md5_GG(reg_cache, a, b, c, d, 5, 5, 0xd62f105d); 3693 md5_GG(reg_cache, d, a, b, c, 10, 9, 0x02441453); 3694 md5_GG(reg_cache, c, d, a, b, 15, 14, 0xd8a1e681); 3695 md5_GG(reg_cache, b, c, d, a, 4, 20, 0xe7d3fbc8); 3696 md5_GG(reg_cache, a, b, c, d, 9, 5, 0x21e1cde6); 3697 md5_GG(reg_cache, d, a, b, c, 14, 9, 0xc33707d6); 3698 md5_GG(reg_cache, c, d, a, b, 3, 14, 0xf4d50d87); 3699 md5_GG(reg_cache, b, c, d, a, 8, 20, 0x455a14ed); 3700 md5_GG(reg_cache, a, b, c, d, 13, 5, 0xa9e3e905); 3701 md5_GG(reg_cache, d, a, b, c, 2, 9, 0xfcefa3f8); 3702 md5_GG(reg_cache, c, d, a, b, 7, 14, 0x676f02d9); 3703 md5_GG(reg_cache, b, c, d, a, 12, 20, 0x8d2a4c8a); 3704 3705 // Round 3 3706 md5_HH(reg_cache, a, b, c, d, 5, 4, 0xfffa3942); 3707 md5_HH(reg_cache, d, a, b, c, 8, 11, 0x8771f681); 3708 md5_HH(reg_cache, c, d, a, b, 11, 16, 0x6d9d6122); 3709 md5_HH(reg_cache, b, c, d, a, 14, 23, 
0xfde5380c); 3710 md5_HH(reg_cache, a, b, c, d, 1, 4, 0xa4beea44); 3711 md5_HH(reg_cache, d, a, b, c, 4, 11, 0x4bdecfa9); 3712 md5_HH(reg_cache, c, d, a, b, 7, 16, 0xf6bb4b60); 3713 md5_HH(reg_cache, b, c, d, a, 10, 23, 0xbebfbc70); 3714 md5_HH(reg_cache, a, b, c, d, 13, 4, 0x289b7ec6); 3715 md5_HH(reg_cache, d, a, b, c, 0, 11, 0xeaa127fa); 3716 md5_HH(reg_cache, c, d, a, b, 3, 16, 0xd4ef3085); 3717 md5_HH(reg_cache, b, c, d, a, 6, 23, 0x04881d05); 3718 md5_HH(reg_cache, a, b, c, d, 9, 4, 0xd9d4d039); 3719 md5_HH(reg_cache, d, a, b, c, 12, 11, 0xe6db99e5); 3720 md5_HH(reg_cache, c, d, a, b, 15, 16, 0x1fa27cf8); 3721 md5_HH(reg_cache, b, c, d, a, 2, 23, 0xc4ac5665); 3722 3723 // Round 4 3724 md5_II(reg_cache, a, b, c, d, 0, 6, 0xf4292244); 3725 md5_II(reg_cache, d, a, b, c, 7, 10, 0x432aff97); 3726 md5_II(reg_cache, c, d, a, b, 14, 15, 0xab9423a7); 3727 md5_II(reg_cache, b, c, d, a, 5, 21, 0xfc93a039); 3728 md5_II(reg_cache, a, b, c, d, 12, 6, 0x655b59c3); 3729 md5_II(reg_cache, d, a, b, c, 3, 10, 0x8f0ccc92); 3730 md5_II(reg_cache, c, d, a, b, 10, 15, 0xffeff47d); 3731 md5_II(reg_cache, b, c, d, a, 1, 21, 0x85845dd1); 3732 md5_II(reg_cache, a, b, c, d, 8, 6, 0x6fa87e4f); 3733 md5_II(reg_cache, d, a, b, c, 15, 10, 0xfe2ce6e0); 3734 md5_II(reg_cache, c, d, a, b, 6, 15, 0xa3014314); 3735 md5_II(reg_cache, b, c, d, a, 13, 21, 0x4e0811a1); 3736 md5_II(reg_cache, a, b, c, d, 4, 6, 0xf7537e82); 3737 md5_II(reg_cache, d, a, b, c, 11, 10, 0xbd3af235); 3738 md5_II(reg_cache, c, d, a, b, 2, 15, 0x2ad7d2bb); 3739 md5_II(reg_cache, b, c, d, a, 9, 21, 0xeb86d391); 3740 3741 __ addw(a, state_regs[0], a); 3742 __ ubfx(rscratch2, state_regs[0], 32, 32); 3743 __ addw(b, rscratch2, b); 3744 __ addw(c, state_regs[1], c); 3745 __ ubfx(rscratch4, state_regs[1], 32, 32); 3746 __ addw(d, rscratch4, d); 3747 3748 __ orr(state_regs[0], a, b, Assembler::LSL, 32); 3749 __ orr(state_regs[1], c, d, Assembler::LSL, 32); 3750 3751 if (multi_block) { 3752 __ add(buf, buf, 64); 3753 __ add(ofs, ofs, 64); 3754 __ cmp(ofs, limit); 3755 __ br(Assembler::LE, md5_loop); 3756 __ mov(c_rarg0, ofs); // return ofs 3757 } 3758 3759 // write hash values back in the correct order 3760 __ stp(state_regs[0], state_regs[1], Address(state)); 3761 3762 __ pop(saved_regs, sp); 3763 3764 __ ret(lr); 3765 3766 return start; 3767 } 3768 3769 // Arguments: 3770 // 3771 // Inputs: 3772 // c_rarg0 - byte[] source+offset 3773 // c_rarg1 - int[] SHA.state 3774 // c_rarg2 - int offset 3775 // c_rarg3 - int limit 3776 // 3777 address generate_sha1_implCompress(StubId stub_id) { 3778 bool multi_block; 3779 switch (stub_id) { 3780 case StubId::stubgen_sha1_implCompress_id: 3781 multi_block = false; 3782 break; 3783 case StubId::stubgen_sha1_implCompressMB_id: 3784 multi_block = true; 3785 break; 3786 default: 3787 ShouldNotReachHere(); 3788 } 3789 3790 __ align(CodeEntryAlignment); 3791 3792 StubCodeMark mark(this, stub_id); 3793 address start = __ pc(); 3794 3795 Register buf = c_rarg0; 3796 Register state = c_rarg1; 3797 Register ofs = c_rarg2; 3798 Register limit = c_rarg3; 3799 3800 Label keys; 3801 Label sha1_loop; 3802 3803 // load the keys into v0..v3 3804 __ adr(rscratch1, keys); 3805 __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1)); 3806 // load 5 words state into v6, v7 3807 __ ldrq(v6, Address(state, 0)); 3808 __ ldrs(v7, Address(state, 16)); 3809 3810 3811 __ BIND(sha1_loop); 3812 // load 64 bytes of data into v16..v19 3813 __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? 
__ post(buf, 64) : buf); 3814 __ rev32(v16, __ T16B, v16); 3815 __ rev32(v17, __ T16B, v17); 3816 __ rev32(v18, __ T16B, v18); 3817 __ rev32(v19, __ T16B, v19); 3818 3819 // do the sha1 3820 __ addv(v4, __ T4S, v16, v0); 3821 __ orr(v20, __ T16B, v6, v6); 3822 3823 FloatRegister d0 = v16; 3824 FloatRegister d1 = v17; 3825 FloatRegister d2 = v18; 3826 FloatRegister d3 = v19; 3827 3828 for (int round = 0; round < 20; round++) { 3829 FloatRegister tmp1 = (round & 1) ? v4 : v5; 3830 FloatRegister tmp2 = (round & 1) ? v21 : v22; 3831 FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7; 3832 FloatRegister tmp4 = (round & 1) ? v5 : v4; 3833 FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3)); 3834 3835 if (round < 16) __ sha1su0(d0, __ T4S, d1, d2); 3836 if (round < 19) __ addv(tmp1, __ T4S, d1, key); 3837 __ sha1h(tmp2, __ T4S, v20); 3838 if (round < 5) 3839 __ sha1c(v20, __ T4S, tmp3, tmp4); 3840 else if (round < 10 || round >= 15) 3841 __ sha1p(v20, __ T4S, tmp3, tmp4); 3842 else 3843 __ sha1m(v20, __ T4S, tmp3, tmp4); 3844 if (round < 16) __ sha1su1(d0, __ T4S, d3); 3845 3846 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 3847 } 3848 3849 __ addv(v7, __ T2S, v7, v21); 3850 __ addv(v6, __ T4S, v6, v20); 3851 3852 if (multi_block) { 3853 __ add(ofs, ofs, 64); 3854 __ cmp(ofs, limit); 3855 __ br(Assembler::LE, sha1_loop); 3856 __ mov(c_rarg0, ofs); // return ofs 3857 } 3858 3859 __ strq(v6, Address(state, 0)); 3860 __ strs(v7, Address(state, 16)); 3861 3862 __ ret(lr); 3863 3864 __ bind(keys); 3865 __ emit_int32(0x5a827999); 3866 __ emit_int32(0x6ed9eba1); 3867 __ emit_int32(0x8f1bbcdc); 3868 __ emit_int32(0xca62c1d6); 3869 3870 return start; 3871 } 3872 3873 3874 // Arguments: 3875 // 3876 // Inputs: 3877 // c_rarg0 - byte[] source+offset 3878 // c_rarg1 - int[] SHA.state 3879 // c_rarg2 - int offset 3880 // c_rarg3 - int limit 3881 // 3882 address generate_sha256_implCompress(StubId stub_id) { 3883 bool multi_block; 3884 switch (stub_id) { 3885 case StubId::stubgen_sha256_implCompress_id: 3886 multi_block = false; 3887 break; 3888 case StubId::stubgen_sha256_implCompressMB_id: 3889 multi_block = true; 3890 break; 3891 default: 3892 ShouldNotReachHere(); 3893 } 3894 3895 static const uint32_t round_consts[64] = { 3896 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 3897 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, 3898 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 3899 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 3900 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 3901 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, 3902 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 3903 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, 3904 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 3905 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 3906 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 3907 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, 3908 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 3909 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, 3910 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 3911 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, 3912 }; 3913 3914 __ align(CodeEntryAlignment); 3915 3916 StubCodeMark mark(this, stub_id); 3917 address start = __ pc(); 3918 3919 Register buf = c_rarg0; 3920 Register state = c_rarg1; 3921 Register ofs = c_rarg2; 3922 Register limit = c_rarg3; 3923 3924 Label sha1_loop; 3925 3926 __ stpd(v8, v9, __ pre(sp, -32)); 3927 __ stpd(v10, v11, Address(sp, 16)); 3928 3929 // dga == v0 3930 // dgb == v1 3931 // dg0 == 
v2 3932 // dg1 == v3 3933 // dg2 == v4 3934 // t0 == v6 3935 // t1 == v7 3936 3937 // load 16 keys to v16..v31 3938 __ lea(rscratch1, ExternalAddress((address)round_consts)); 3939 __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64)); 3940 __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64)); 3941 __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64)); 3942 __ ld1(v28, v29, v30, v31, __ T4S, rscratch1); 3943 3944 // load 8 words (256 bits) state 3945 __ ldpq(v0, v1, state); 3946 3947 __ BIND(sha1_loop); 3948 // load 64 bytes of data into v8..v11 3949 __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf); 3950 __ rev32(v8, __ T16B, v8); 3951 __ rev32(v9, __ T16B, v9); 3952 __ rev32(v10, __ T16B, v10); 3953 __ rev32(v11, __ T16B, v11); 3954 3955 __ addv(v6, __ T4S, v8, v16); 3956 __ orr(v2, __ T16B, v0, v0); 3957 __ orr(v3, __ T16B, v1, v1); 3958 3959 FloatRegister d0 = v8; 3960 FloatRegister d1 = v9; 3961 FloatRegister d2 = v10; 3962 FloatRegister d3 = v11; 3963 3964 3965 for (int round = 0; round < 16; round++) { 3966 FloatRegister tmp1 = (round & 1) ? v6 : v7; 3967 FloatRegister tmp2 = (round & 1) ? v7 : v6; 3968 FloatRegister tmp3 = (round & 1) ? v2 : v4; 3969 FloatRegister tmp4 = (round & 1) ? v4 : v2; 3970 3971 if (round < 12) __ sha256su0(d0, __ T4S, d1); 3972 __ orr(v4, __ T16B, v2, v2); 3973 if (round < 15) 3974 __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17)); 3975 __ sha256h(v2, __ T4S, v3, tmp2); 3976 __ sha256h2(v3, __ T4S, v4, tmp2); 3977 if (round < 12) __ sha256su1(d0, __ T4S, d2, d3); 3978 3979 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 3980 } 3981 3982 __ addv(v0, __ T4S, v0, v2); 3983 __ addv(v1, __ T4S, v1, v3); 3984 3985 if (multi_block) { 3986 __ add(ofs, ofs, 64); 3987 __ cmp(ofs, limit); 3988 __ br(Assembler::LE, sha1_loop); 3989 __ mov(c_rarg0, ofs); // return ofs 3990 } 3991 3992 __ ldpd(v10, v11, Address(sp, 16)); 3993 __ ldpd(v8, v9, __ post(sp, 32)); 3994 3995 __ stpq(v0, v1, state); 3996 3997 __ ret(lr); 3998 3999 return start; 4000 } 4001 4002 // Double rounds for sha512. 
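// Each sha512_dround call retires two of the 80 SHA-512 rounds using the
// SHA512 crypto extensions: sha512su0/sha512su1 extend the message
// schedule, while sha512h/sha512h2 apply the compression function to the
// working state held in vi0..vi4. For reference, one scalar round per
// FIPS 180-4 is (sketch only; Sigma0/Sigma1/Ch/Maj are not defined in
// this file):
//
//   T1 = h + Sigma1(e) + Ch(e, f, g) + K[t] + W[t];
//   T2 = Sigma0(a) + Maj(a, b, c);
//   h = g; g = f; f = e; e = d + T1;
//   d = c; c = b; b = a; a = T1 + T2;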
4003 void sha512_dround(int dr, 4004 FloatRegister vi0, FloatRegister vi1, 4005 FloatRegister vi2, FloatRegister vi3, 4006 FloatRegister vi4, FloatRegister vrc0, 4007 FloatRegister vrc1, FloatRegister vin0, 4008 FloatRegister vin1, FloatRegister vin2, 4009 FloatRegister vin3, FloatRegister vin4) { 4010 if (dr < 36) { 4011 __ ld1(vrc1, __ T2D, __ post(rscratch2, 16)); 4012 } 4013 __ addv(v5, __ T2D, vrc0, vin0); 4014 __ ext(v6, __ T16B, vi2, vi3, 8); 4015 __ ext(v5, __ T16B, v5, v5, 8); 4016 __ ext(v7, __ T16B, vi1, vi2, 8); 4017 __ addv(vi3, __ T2D, vi3, v5); 4018 if (dr < 32) { 4019 __ ext(v5, __ T16B, vin3, vin4, 8); 4020 __ sha512su0(vin0, __ T2D, vin1); 4021 } 4022 __ sha512h(vi3, __ T2D, v6, v7); 4023 if (dr < 32) { 4024 __ sha512su1(vin0, __ T2D, vin2, v5); 4025 } 4026 __ addv(vi4, __ T2D, vi1, vi3); 4027 __ sha512h2(vi3, __ T2D, vi1, vi0); 4028 } 4029 4030 // Arguments: 4031 // 4032 // Inputs: 4033 // c_rarg0 - byte[] source+offset 4034 // c_rarg1 - int[] SHA.state 4035 // c_rarg2 - int offset 4036 // c_rarg3 - int limit 4037 // 4038 address generate_sha512_implCompress(StubId stub_id) { 4039 bool multi_block; 4040 switch (stub_id) { 4041 case StubId::stubgen_sha512_implCompress_id: 4042 multi_block = false; 4043 break; 4044 case StubId::stubgen_sha512_implCompressMB_id: 4045 multi_block = true; 4046 break; 4047 default: 4048 ShouldNotReachHere(); 4049 } 4050 4051 static const uint64_t round_consts[80] = { 4052 0x428A2F98D728AE22L, 0x7137449123EF65CDL, 0xB5C0FBCFEC4D3B2FL, 4053 0xE9B5DBA58189DBBCL, 0x3956C25BF348B538L, 0x59F111F1B605D019L, 4054 0x923F82A4AF194F9BL, 0xAB1C5ED5DA6D8118L, 0xD807AA98A3030242L, 4055 0x12835B0145706FBEL, 0x243185BE4EE4B28CL, 0x550C7DC3D5FFB4E2L, 4056 0x72BE5D74F27B896FL, 0x80DEB1FE3B1696B1L, 0x9BDC06A725C71235L, 4057 0xC19BF174CF692694L, 0xE49B69C19EF14AD2L, 0xEFBE4786384F25E3L, 4058 0x0FC19DC68B8CD5B5L, 0x240CA1CC77AC9C65L, 0x2DE92C6F592B0275L, 4059 0x4A7484AA6EA6E483L, 0x5CB0A9DCBD41FBD4L, 0x76F988DA831153B5L, 4060 0x983E5152EE66DFABL, 0xA831C66D2DB43210L, 0xB00327C898FB213FL, 4061 0xBF597FC7BEEF0EE4L, 0xC6E00BF33DA88FC2L, 0xD5A79147930AA725L, 4062 0x06CA6351E003826FL, 0x142929670A0E6E70L, 0x27B70A8546D22FFCL, 4063 0x2E1B21385C26C926L, 0x4D2C6DFC5AC42AEDL, 0x53380D139D95B3DFL, 4064 0x650A73548BAF63DEL, 0x766A0ABB3C77B2A8L, 0x81C2C92E47EDAEE6L, 4065 0x92722C851482353BL, 0xA2BFE8A14CF10364L, 0xA81A664BBC423001L, 4066 0xC24B8B70D0F89791L, 0xC76C51A30654BE30L, 0xD192E819D6EF5218L, 4067 0xD69906245565A910L, 0xF40E35855771202AL, 0x106AA07032BBD1B8L, 4068 0x19A4C116B8D2D0C8L, 0x1E376C085141AB53L, 0x2748774CDF8EEB99L, 4069 0x34B0BCB5E19B48A8L, 0x391C0CB3C5C95A63L, 0x4ED8AA4AE3418ACBL, 4070 0x5B9CCA4F7763E373L, 0x682E6FF3D6B2B8A3L, 0x748F82EE5DEFB2FCL, 4071 0x78A5636F43172F60L, 0x84C87814A1F0AB72L, 0x8CC702081A6439ECL, 4072 0x90BEFFFA23631E28L, 0xA4506CEBDE82BDE9L, 0xBEF9A3F7B2C67915L, 4073 0xC67178F2E372532BL, 0xCA273ECEEA26619CL, 0xD186B8C721C0C207L, 4074 0xEADA7DD6CDE0EB1EL, 0xF57D4F7FEE6ED178L, 0x06F067AA72176FBAL, 4075 0x0A637DC5A2C898A6L, 0x113F9804BEF90DAEL, 0x1B710B35131C471BL, 4076 0x28DB77F523047D84L, 0x32CAAB7B40C72493L, 0x3C9EBE0A15C9BEBCL, 4077 0x431D67C49C100D4CL, 0x4CC5D4BECB3E42B6L, 0x597F299CFC657E2AL, 4078 0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L 4079 }; 4080 4081 __ align(CodeEntryAlignment); 4082 4083 StubCodeMark mark(this, stub_id); 4084 address start = __ pc(); 4085 4086 Register buf = c_rarg0; 4087 Register state = c_rarg1; 4088 Register ofs = c_rarg2; 4089 Register limit = c_rarg3; 4090 4091 __ stpd(v8, v9, __ pre(sp, -64)); 4092 __ 
stpd(v10, v11, Address(sp, 16)); 4093 __ stpd(v12, v13, Address(sp, 32)); 4094 __ stpd(v14, v15, Address(sp, 48)); 4095 4096 Label sha512_loop; 4097 4098 // load state 4099 __ ld1(v8, v9, v10, v11, __ T2D, state); 4100 4101 // load first 4 round constants 4102 __ lea(rscratch1, ExternalAddress((address)round_consts)); 4103 __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64)); 4104 4105 __ BIND(sha512_loop); 4106 // load 128B of data into v12..v19 4107 __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64)); 4108 __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64)); 4109 __ rev64(v12, __ T16B, v12); 4110 __ rev64(v13, __ T16B, v13); 4111 __ rev64(v14, __ T16B, v14); 4112 __ rev64(v15, __ T16B, v15); 4113 __ rev64(v16, __ T16B, v16); 4114 __ rev64(v17, __ T16B, v17); 4115 __ rev64(v18, __ T16B, v18); 4116 __ rev64(v19, __ T16B, v19); 4117 4118 __ mov(rscratch2, rscratch1); 4119 4120 __ mov(v0, __ T16B, v8); 4121 __ mov(v1, __ T16B, v9); 4122 __ mov(v2, __ T16B, v10); 4123 __ mov(v3, __ T16B, v11); 4124 4125 sha512_dround( 0, v0, v1, v2, v3, v4, v20, v24, v12, v13, v19, v16, v17); 4126 sha512_dround( 1, v3, v0, v4, v2, v1, v21, v25, v13, v14, v12, v17, v18); 4127 sha512_dround( 2, v2, v3, v1, v4, v0, v22, v26, v14, v15, v13, v18, v19); 4128 sha512_dround( 3, v4, v2, v0, v1, v3, v23, v27, v15, v16, v14, v19, v12); 4129 sha512_dround( 4, v1, v4, v3, v0, v2, v24, v28, v16, v17, v15, v12, v13); 4130 sha512_dround( 5, v0, v1, v2, v3, v4, v25, v29, v17, v18, v16, v13, v14); 4131 sha512_dround( 6, v3, v0, v4, v2, v1, v26, v30, v18, v19, v17, v14, v15); 4132 sha512_dround( 7, v2, v3, v1, v4, v0, v27, v31, v19, v12, v18, v15, v16); 4133 sha512_dround( 8, v4, v2, v0, v1, v3, v28, v24, v12, v13, v19, v16, v17); 4134 sha512_dround( 9, v1, v4, v3, v0, v2, v29, v25, v13, v14, v12, v17, v18); 4135 sha512_dround(10, v0, v1, v2, v3, v4, v30, v26, v14, v15, v13, v18, v19); 4136 sha512_dround(11, v3, v0, v4, v2, v1, v31, v27, v15, v16, v14, v19, v12); 4137 sha512_dround(12, v2, v3, v1, v4, v0, v24, v28, v16, v17, v15, v12, v13); 4138 sha512_dround(13, v4, v2, v0, v1, v3, v25, v29, v17, v18, v16, v13, v14); 4139 sha512_dround(14, v1, v4, v3, v0, v2, v26, v30, v18, v19, v17, v14, v15); 4140 sha512_dround(15, v0, v1, v2, v3, v4, v27, v31, v19, v12, v18, v15, v16); 4141 sha512_dround(16, v3, v0, v4, v2, v1, v28, v24, v12, v13, v19, v16, v17); 4142 sha512_dround(17, v2, v3, v1, v4, v0, v29, v25, v13, v14, v12, v17, v18); 4143 sha512_dround(18, v4, v2, v0, v1, v3, v30, v26, v14, v15, v13, v18, v19); 4144 sha512_dround(19, v1, v4, v3, v0, v2, v31, v27, v15, v16, v14, v19, v12); 4145 sha512_dround(20, v0, v1, v2, v3, v4, v24, v28, v16, v17, v15, v12, v13); 4146 sha512_dround(21, v3, v0, v4, v2, v1, v25, v29, v17, v18, v16, v13, v14); 4147 sha512_dround(22, v2, v3, v1, v4, v0, v26, v30, v18, v19, v17, v14, v15); 4148 sha512_dround(23, v4, v2, v0, v1, v3, v27, v31, v19, v12, v18, v15, v16); 4149 sha512_dround(24, v1, v4, v3, v0, v2, v28, v24, v12, v13, v19, v16, v17); 4150 sha512_dround(25, v0, v1, v2, v3, v4, v29, v25, v13, v14, v12, v17, v18); 4151 sha512_dround(26, v3, v0, v4, v2, v1, v30, v26, v14, v15, v13, v18, v19); 4152 sha512_dround(27, v2, v3, v1, v4, v0, v31, v27, v15, v16, v14, v19, v12); 4153 sha512_dround(28, v4, v2, v0, v1, v3, v24, v28, v16, v17, v15, v12, v13); 4154 sha512_dround(29, v1, v4, v3, v0, v2, v25, v29, v17, v18, v16, v13, v14); 4155 sha512_dround(30, v0, v1, v2, v3, v4, v26, v30, v18, v19, v17, v14, v15); 4156 sha512_dround(31, v3, v0, v4, v2, v1, v27, v31, v19, v12, v18, v15, v16); 4157 
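// Rounds 32-39: every message-schedule vector needed has already been
// produced, so the sha512su0/sha512su1 updates are skipped (dr >= 32),
// and once all round-constant vectors have been fetched no further ld1
// is issued (dr >= 36); the trailing v0 arguments merely fill the unused
// parameter slots.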
sha512_dround(32, v2, v3, v1, v4, v0, v28, v24, v12, v0, v0, v0, v0); 4158 sha512_dround(33, v4, v2, v0, v1, v3, v29, v25, v13, v0, v0, v0, v0); 4159 sha512_dround(34, v1, v4, v3, v0, v2, v30, v26, v14, v0, v0, v0, v0); 4160 sha512_dround(35, v0, v1, v2, v3, v4, v31, v27, v15, v0, v0, v0, v0); 4161 sha512_dround(36, v3, v0, v4, v2, v1, v24, v0, v16, v0, v0, v0, v0); 4162 sha512_dround(37, v2, v3, v1, v4, v0, v25, v0, v17, v0, v0, v0, v0); 4163 sha512_dround(38, v4, v2, v0, v1, v3, v26, v0, v18, v0, v0, v0, v0); 4164 sha512_dround(39, v1, v4, v3, v0, v2, v27, v0, v19, v0, v0, v0, v0); 4165 4166 __ addv(v8, __ T2D, v8, v0); 4167 __ addv(v9, __ T2D, v9, v1); 4168 __ addv(v10, __ T2D, v10, v2); 4169 __ addv(v11, __ T2D, v11, v3); 4170 4171 if (multi_block) { 4172 __ add(ofs, ofs, 128); 4173 __ cmp(ofs, limit); 4174 __ br(Assembler::LE, sha512_loop); 4175 __ mov(c_rarg0, ofs); // return ofs 4176 } 4177 4178 __ st1(v8, v9, v10, v11, __ T2D, state); 4179 4180 __ ldpd(v14, v15, Address(sp, 48)); 4181 __ ldpd(v12, v13, Address(sp, 32)); 4182 __ ldpd(v10, v11, Address(sp, 16)); 4183 __ ldpd(v8, v9, __ post(sp, 64)); 4184 4185 __ ret(lr); 4186 4187 return start; 4188 } 4189 4190 // Execute one round of keccak of two computations in parallel. 4191 // One of the states should be loaded into the lower halves of 4192 // the vector registers v0-v24, the other should be loaded into 4193 // the upper halves of those registers. The ld1r instruction loads 4194 // the round constant into both halves of register v31. 4195 // Intermediate results c0...c5 and d0...d5 are computed 4196 // in registers v25...v30. 4197 // All vector instructions that are used operate on both register 4198 // halves in parallel. 4199 // If only a single computation is needed, one can only load the lower halves. 
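// For reference, the standard formulation of one Keccak-f[1600] round
// implemented below (sketch only; A is the 5x5 lane array, RC the round
// constant, r[x,y] the rotation offsets, indices taken mod 5):
//
//   C[x] = A[x,0] ^ A[x,1] ^ A[x,2] ^ A[x,3] ^ A[x,4]      // theta
//   D[x] = C[x-1] ^ rol64(C[x+1], 1)
//   B[y, 2x+3y] = rol64(A[x,y] ^ D[x], r[x,y])             // rho + pi
//   A[x,y] = B[x,y] ^ (~B[x+1,y] & B[x+2,y])               // chi
//   A[0,0] ^= RC[i]                                        // iota
//
// The eor3/rax1/xar/bcax instructions below fuse the xor, rotate and
// and-not steps of this formulation.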
4200 void keccak_round(Register rscratch1) { 4201 __ eor3(v29, __ T16B, v4, v9, v14); // c4 = a4 ^ a9 ^ a14 4202 __ eor3(v26, __ T16B, v1, v6, v11); // c1 = a1 ^ a16 ^ a11 4203 __ eor3(v28, __ T16B, v3, v8, v13); // c3 = a3 ^ a8 ^a13 4204 __ eor3(v25, __ T16B, v0, v5, v10); // c0 = a0 ^ a5 ^ a10 4205 __ eor3(v27, __ T16B, v2, v7, v12); // c2 = a2 ^ a7 ^ a12 4206 __ eor3(v29, __ T16B, v29, v19, v24); // c4 ^= a19 ^ a24 4207 __ eor3(v26, __ T16B, v26, v16, v21); // c1 ^= a16 ^ a21 4208 __ eor3(v28, __ T16B, v28, v18, v23); // c3 ^= a18 ^ a23 4209 __ eor3(v25, __ T16B, v25, v15, v20); // c0 ^= a15 ^ a20 4210 __ eor3(v27, __ T16B, v27, v17, v22); // c2 ^= a17 ^ a22 4211 4212 __ rax1(v30, __ T2D, v29, v26); // d0 = c4 ^ rol(c1, 1) 4213 __ rax1(v26, __ T2D, v26, v28); // d2 = c1 ^ rol(c3, 1) 4214 __ rax1(v28, __ T2D, v28, v25); // d4 = c3 ^ rol(c0, 1) 4215 __ rax1(v25, __ T2D, v25, v27); // d1 = c0 ^ rol(c2, 1) 4216 __ rax1(v27, __ T2D, v27, v29); // d3 = c2 ^ rol(c4, 1) 4217 4218 __ eor(v0, __ T16B, v0, v30); // a0 = a0 ^ d0 4219 __ xar(v29, __ T2D, v1, v25, (64 - 1)); // a10' = rol((a1^d1), 1) 4220 __ xar(v1, __ T2D, v6, v25, (64 - 44)); // a1 = rol(a6^d1), 44) 4221 __ xar(v6, __ T2D, v9, v28, (64 - 20)); // a6 = rol((a9^d4), 20) 4222 __ xar(v9, __ T2D, v22, v26, (64 - 61)); // a9 = rol((a22^d2), 61) 4223 __ xar(v22, __ T2D, v14, v28, (64 - 39)); // a22 = rol((a14^d4), 39) 4224 __ xar(v14, __ T2D, v20, v30, (64 - 18)); // a14 = rol((a20^d0), 18) 4225 __ xar(v31, __ T2D, v2, v26, (64 - 62)); // a20' = rol((a2^d2), 62) 4226 __ xar(v2, __ T2D, v12, v26, (64 - 43)); // a2 = rol((a12^d2), 43) 4227 __ xar(v12, __ T2D, v13, v27, (64 - 25)); // a12 = rol((a13^d3), 25) 4228 __ xar(v13, __ T2D, v19, v28, (64 - 8)); // a13 = rol((a19^d4), 8) 4229 __ xar(v19, __ T2D, v23, v27, (64 - 56)); // a19 = rol((a23^d3), 56) 4230 __ xar(v23, __ T2D, v15, v30, (64 - 41)); // a23 = rol((a15^d0), 41) 4231 __ xar(v15, __ T2D, v4, v28, (64 - 27)); // a15 = rol((a4^d4), 27) 4232 __ xar(v28, __ T2D, v24, v28, (64 - 14)); // a4' = rol((a24^d4), 14) 4233 __ xar(v24, __ T2D, v21, v25, (64 - 2)); // a24 = rol((a21^d1), 2) 4234 __ xar(v8, __ T2D, v8, v27, (64 - 55)); // a21' = rol((a8^d3), 55) 4235 __ xar(v4, __ T2D, v16, v25, (64 - 45)); // a8' = rol((a16^d1), 45) 4236 __ xar(v16, __ T2D, v5, v30, (64 - 36)); // a16 = rol((a5^d0), 36) 4237 __ xar(v5, __ T2D, v3, v27, (64 - 28)); // a5 = rol((a3^d3), 28) 4238 __ xar(v27, __ T2D, v18, v27, (64 - 21)); // a3' = rol((a18^d3), 21) 4239 __ xar(v3, __ T2D, v17, v26, (64 - 15)); // a18' = rol((a17^d2), 15) 4240 __ xar(v25, __ T2D, v11, v25, (64 - 10)); // a17' = rol((a11^d1), 10) 4241 __ xar(v26, __ T2D, v7, v26, (64 - 6)); // a11' = rol((a7^d2), 6) 4242 __ xar(v30, __ T2D, v10, v30, (64 - 3)); // a7' = rol((a10^d0), 3) 4243 4244 __ bcax(v20, __ T16B, v31, v22, v8); // a20 = a20' ^ (~a21 & a22') 4245 __ bcax(v21, __ T16B, v8, v23, v22); // a21 = a21' ^ (~a22 & a23) 4246 __ bcax(v22, __ T16B, v22, v24, v23); // a22 = a22 ^ (~a23 & a24) 4247 __ bcax(v23, __ T16B, v23, v31, v24); // a23 = a23 ^ (~a24 & a20') 4248 __ bcax(v24, __ T16B, v24, v8, v31); // a24 = a24 ^ (~a20' & a21') 4249 4250 __ ld1r(v31, __ T2D, __ post(rscratch1, 8)); // rc = round_constants[i] 4251 4252 __ bcax(v17, __ T16B, v25, v19, v3); // a17 = a17' ^ (~a18' & a19) 4253 __ bcax(v18, __ T16B, v3, v15, v19); // a18 = a18' ^ (~a19 & a15') 4254 __ bcax(v19, __ T16B, v19, v16, v15); // a19 = a19 ^ (~a15 & a16) 4255 __ bcax(v15, __ T16B, v15, v25, v16); // a15 = a15 ^ (~a16 & a17') 4256 __ bcax(v16, __ T16B, v16, v3, v25); 
// a16 = a16 ^ (~a17' & a18') 4257 4258 __ bcax(v10, __ T16B, v29, v12, v26); // a10 = a10' ^ (~a11' & a12) 4259 __ bcax(v11, __ T16B, v26, v13, v12); // a11 = a11' ^ (~a12 & a13) 4260 __ bcax(v12, __ T16B, v12, v14, v13); // a12 = a12 ^ (~a13 & a14) 4261 __ bcax(v13, __ T16B, v13, v29, v14); // a13 = a13 ^ (~a14 & a10') 4262 __ bcax(v14, __ T16B, v14, v26, v29); // a14 = a14 ^ (~a10' & a11') 4263 4264 __ bcax(v7, __ T16B, v30, v9, v4); // a7 = a7' ^ (~a8' & a9) 4265 __ bcax(v8, __ T16B, v4, v5, v9); // a8 = a8' ^ (~a9 & a5) 4266 __ bcax(v9, __ T16B, v9, v6, v5); // a9 = a9 ^ (~a5 & a6) 4267 __ bcax(v5, __ T16B, v5, v30, v6); // a5 = a5 ^ (~a6 & a7) 4268 __ bcax(v6, __ T16B, v6, v4, v30); // a6 = a6 ^ (~a7 & a8') 4269 4270 __ bcax(v3, __ T16B, v27, v0, v28); // a3 = a3' ^ (~a4' & a0) 4271 __ bcax(v4, __ T16B, v28, v1, v0); // a4 = a4' ^ (~a0 & a1) 4272 __ bcax(v0, __ T16B, v0, v2, v1); // a0 = a0 ^ (~a1 & a2) 4273 __ bcax(v1, __ T16B, v1, v27, v2); // a1 = a1 ^ (~a2 & a3) 4274 __ bcax(v2, __ T16B, v2, v28, v27); // a2 = a2 ^ (~a3 & a4') 4275 4276 __ eor(v0, __ T16B, v0, v31); // a0 = a0 ^ rc 4277 } 4278 4279 // Arguments: 4280 // 4281 // Inputs: 4282 // c_rarg0 - byte[] source+offset 4283 // c_rarg1 - byte[] SHA.state 4284 // c_rarg2 - int block_size 4285 // c_rarg3 - int offset 4286 // c_rarg4 - int limit 4287 // 4288 address generate_sha3_implCompress(StubId stub_id) { 4289 bool multi_block; 4290 switch (stub_id) { 4291 case StubId::stubgen_sha3_implCompress_id: 4292 multi_block = false; 4293 break; 4294 case StubId::stubgen_sha3_implCompressMB_id: 4295 multi_block = true; 4296 break; 4297 default: 4298 ShouldNotReachHere(); 4299 } 4300 4301 static const uint64_t round_consts[24] = { 4302 0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL, 4303 0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L, 4304 0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL, 4305 0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL, 4306 0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L, 4307 0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L, 4308 0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L, 4309 0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L 4310 }; 4311 4312 __ align(CodeEntryAlignment); 4313 4314 StubCodeMark mark(this, stub_id); 4315 address start = __ pc(); 4316 4317 Register buf = c_rarg0; 4318 Register state = c_rarg1; 4319 Register block_size = c_rarg2; 4320 Register ofs = c_rarg3; 4321 Register limit = c_rarg4; 4322 4323 Label sha3_loop, rounds24_loop; 4324 Label sha3_512_or_sha3_384, shake128; 4325 4326 __ stpd(v8, v9, __ pre(sp, -64)); 4327 __ stpd(v10, v11, Address(sp, 16)); 4328 __ stpd(v12, v13, Address(sp, 32)); 4329 __ stpd(v14, v15, Address(sp, 48)); 4330 4331 // load state 4332 __ add(rscratch1, state, 32); 4333 __ ld1(v0, v1, v2, v3, __ T1D, state); 4334 __ ld1(v4, v5, v6, v7, __ T1D, __ post(rscratch1, 32)); 4335 __ ld1(v8, v9, v10, v11, __ T1D, __ post(rscratch1, 32)); 4336 __ ld1(v12, v13, v14, v15, __ T1D, __ post(rscratch1, 32)); 4337 __ ld1(v16, v17, v18, v19, __ T1D, __ post(rscratch1, 32)); 4338 __ ld1(v20, v21, v22, v23, __ T1D, __ post(rscratch1, 32)); 4339 __ ld1(v24, __ T1D, rscratch1); 4340 4341 __ BIND(sha3_loop); 4342 4343 // 24 keccak rounds 4344 __ movw(rscratch2, 24); 4345 4346 // load round_constants base 4347 __ lea(rscratch1, ExternalAddress((address) round_consts)); 4348 4349 // load input 4350 __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32)); 4351 __ ld1(v29, v30, v31, 
__ T8B, __ post(buf, 24)); 4352 __ eor(v0, __ T8B, v0, v25); 4353 __ eor(v1, __ T8B, v1, v26); 4354 __ eor(v2, __ T8B, v2, v27); 4355 __ eor(v3, __ T8B, v3, v28); 4356 __ eor(v4, __ T8B, v4, v29); 4357 __ eor(v5, __ T8B, v5, v30); 4358 __ eor(v6, __ T8B, v6, v31); 4359 4360 // block_size == 72, SHA3-512; block_size == 104, SHA3-384 4361 __ tbz(block_size, 7, sha3_512_or_sha3_384); 4362 4363 __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32)); 4364 __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24)); 4365 __ eor(v7, __ T8B, v7, v25); 4366 __ eor(v8, __ T8B, v8, v26); 4367 __ eor(v9, __ T8B, v9, v27); 4368 __ eor(v10, __ T8B, v10, v28); 4369 __ eor(v11, __ T8B, v11, v29); 4370 __ eor(v12, __ T8B, v12, v30); 4371 __ eor(v13, __ T8B, v13, v31); 4372 4373 __ ld1(v25, v26, v27, __ T8B, __ post(buf, 24)); 4374 __ eor(v14, __ T8B, v14, v25); 4375 __ eor(v15, __ T8B, v15, v26); 4376 __ eor(v16, __ T8B, v16, v27); 4377 4378 // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256 4379 __ andw(c_rarg5, block_size, 48); 4380 __ cbzw(c_rarg5, rounds24_loop); 4381 4382 __ tbnz(block_size, 5, shake128); 4383 // block_size == 144, bit5 == 0, SHA3-224 4384 __ ldrd(v28, __ post(buf, 8)); 4385 __ eor(v17, __ T8B, v17, v28); 4386 __ b(rounds24_loop); 4387 4388 __ BIND(shake128); 4389 __ ld1(v28, v29, v30, v31, __ T8B, __ post(buf, 32)); 4390 __ eor(v17, __ T8B, v17, v28); 4391 __ eor(v18, __ T8B, v18, v29); 4392 __ eor(v19, __ T8B, v19, v30); 4393 __ eor(v20, __ T8B, v20, v31); 4394 __ b(rounds24_loop); // block_size == 168, SHAKE128 4395 4396 __ BIND(sha3_512_or_sha3_384); 4397 __ ld1(v25, v26, __ T8B, __ post(buf, 16)); 4398 __ eor(v7, __ T8B, v7, v25); 4399 __ eor(v8, __ T8B, v8, v26); 4400 __ tbz(block_size, 5, rounds24_loop); // SHA3-512 4401 4402 // SHA3-384 4403 __ ld1(v27, v28, v29, v30, __ T8B, __ post(buf, 32)); 4404 __ eor(v9, __ T8B, v9, v27); 4405 __ eor(v10, __ T8B, v10, v28); 4406 __ eor(v11, __ T8B, v11, v29); 4407 __ eor(v12, __ T8B, v12, v30); 4408 4409 __ BIND(rounds24_loop); 4410 __ subw(rscratch2, rscratch2, 1); 4411 4412 keccak_round(rscratch1); 4413 4414 __ cbnzw(rscratch2, rounds24_loop); 4415 4416 if (multi_block) { 4417 __ add(ofs, ofs, block_size); 4418 __ cmp(ofs, limit); 4419 __ br(Assembler::LE, sha3_loop); 4420 __ mov(c_rarg0, ofs); // return ofs 4421 } 4422 4423 __ st1(v0, v1, v2, v3, __ T1D, __ post(state, 32)); 4424 __ st1(v4, v5, v6, v7, __ T1D, __ post(state, 32)); 4425 __ st1(v8, v9, v10, v11, __ T1D, __ post(state, 32)); 4426 __ st1(v12, v13, v14, v15, __ T1D, __ post(state, 32)); 4427 __ st1(v16, v17, v18, v19, __ T1D, __ post(state, 32)); 4428 __ st1(v20, v21, v22, v23, __ T1D, __ post(state, 32)); 4429 __ st1(v24, __ T1D, state); 4430 4431 // restore callee-saved registers 4432 __ ldpd(v14, v15, Address(sp, 48)); 4433 __ ldpd(v12, v13, Address(sp, 32)); 4434 __ ldpd(v10, v11, Address(sp, 16)); 4435 __ ldpd(v8, v9, __ post(sp, 64)); 4436 4437 __ ret(lr); 4438 4439 return start; 4440 } 4441 4442 // Inputs: 4443 // c_rarg0 - long[] state0 4444 // c_rarg1 - long[] state1 4445 address generate_double_keccak() { 4446 static const uint64_t round_consts[24] = { 4447 0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL, 4448 0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L, 4449 0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL, 4450 0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL, 4451 0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L, 4452 0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L, 4453 
0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L, 4454 0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L 4455 }; 4456 4457 // Implements the double_keccak() method of the 4458 // sun.secyrity.provider.SHA3Parallel class 4459 __ align(CodeEntryAlignment); 4460 StubCodeMark mark(this, "StubRoutines", "double_keccak"); 4461 address start = __ pc(); 4462 __ enter(); 4463 4464 Register state0 = c_rarg0; 4465 Register state1 = c_rarg1; 4466 4467 Label rounds24_loop; 4468 4469 // save callee-saved registers 4470 __ stpd(v8, v9, __ pre(sp, -64)); 4471 __ stpd(v10, v11, Address(sp, 16)); 4472 __ stpd(v12, v13, Address(sp, 32)); 4473 __ stpd(v14, v15, Address(sp, 48)); 4474 4475 // load states 4476 __ add(rscratch1, state0, 32); 4477 __ ld4(v0, v1, v2, v3, __ D, 0, state0); 4478 __ ld4(v4, v5, v6, v7, __ D, 0, __ post(rscratch1, 32)); 4479 __ ld4(v8, v9, v10, v11, __ D, 0, __ post(rscratch1, 32)); 4480 __ ld4(v12, v13, v14, v15, __ D, 0, __ post(rscratch1, 32)); 4481 __ ld4(v16, v17, v18, v19, __ D, 0, __ post(rscratch1, 32)); 4482 __ ld4(v20, v21, v22, v23, __ D, 0, __ post(rscratch1, 32)); 4483 __ ld1(v24, __ D, 0, rscratch1); 4484 __ add(rscratch1, state1, 32); 4485 __ ld4(v0, v1, v2, v3, __ D, 1, state1); 4486 __ ld4(v4, v5, v6, v7, __ D, 1, __ post(rscratch1, 32)); 4487 __ ld4(v8, v9, v10, v11, __ D, 1, __ post(rscratch1, 32)); 4488 __ ld4(v12, v13, v14, v15, __ D, 1, __ post(rscratch1, 32)); 4489 __ ld4(v16, v17, v18, v19, __ D, 1, __ post(rscratch1, 32)); 4490 __ ld4(v20, v21, v22, v23, __ D, 1, __ post(rscratch1, 32)); 4491 __ ld1(v24, __ D, 1, rscratch1); 4492 4493 // 24 keccak rounds 4494 __ movw(rscratch2, 24); 4495 4496 // load round_constants base 4497 __ lea(rscratch1, ExternalAddress((address) round_consts)); 4498 4499 __ BIND(rounds24_loop); 4500 __ subw(rscratch2, rscratch2, 1); 4501 keccak_round(rscratch1); 4502 __ cbnzw(rscratch2, rounds24_loop); 4503 4504 __ st4(v0, v1, v2, v3, __ D, 0, __ post(state0, 32)); 4505 __ st4(v4, v5, v6, v7, __ D, 0, __ post(state0, 32)); 4506 __ st4(v8, v9, v10, v11, __ D, 0, __ post(state0, 32)); 4507 __ st4(v12, v13, v14, v15, __ D, 0, __ post(state0, 32)); 4508 __ st4(v16, v17, v18, v19, __ D, 0, __ post(state0, 32)); 4509 __ st4(v20, v21, v22, v23, __ D, 0, __ post(state0, 32)); 4510 __ st1(v24, __ D, 0, state0); 4511 __ st4(v0, v1, v2, v3, __ D, 1, __ post(state1, 32)); 4512 __ st4(v4, v5, v6, v7, __ D, 1, __ post(state1, 32)); 4513 __ st4(v8, v9, v10, v11, __ D, 1, __ post(state1, 32)); 4514 __ st4(v12, v13, v14, v15, __ D, 1, __ post(state1, 32)); 4515 __ st4(v16, v17, v18, v19, __ D, 1, __ post(state1, 32)); 4516 __ st4(v20, v21, v22, v23, __ D, 1, __ post(state1, 32)); 4517 __ st1(v24, __ D, 1, state1); 4518 4519 // restore callee-saved vector registers 4520 __ ldpd(v14, v15, Address(sp, 48)); 4521 __ ldpd(v12, v13, Address(sp, 32)); 4522 __ ldpd(v10, v11, Address(sp, 16)); 4523 __ ldpd(v8, v9, __ post(sp, 64)); 4524 4525 __ leave(); // required for proper stackwalking of RuntimeStub frame 4526 __ mov(r0, zr); // return 0 4527 __ ret(lr); 4528 4529 return start; 4530 } 4531 4532 // ChaCha20 block function. This version parallelizes the 32-bit 4533 // state elements on each of 16 vectors, producing 4 blocks of 4534 // keystream at a time. 
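// For reference, one scalar quarter round from RFC 7539 (sketch only) is:
//
//   a += b; d ^= a; d = rol32(d, 16);
//   c += d; b ^= c; b = rol32(b, 12);
//   a += b; d ^= a; d = rol32(d, 8);
//   c += d; b ^= c; b = rol32(b, 7);
//
// The stub issues the adds, xors and rotates of four such quarter rounds
// back to back, and because every state word is replicated across all
// four 32-bit lanes of its vector, each instruction also advances four
// independent blocks at once.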
4535 // 4536 // state (int[16]) = c_rarg0 4537 // keystream (byte[256]) = c_rarg1 4538 // return - number of bytes of produced keystream (always 256) 4539 // 4540 // This implementation takes each 32-bit integer from the state 4541 // array and broadcasts it across all 4 32-bit lanes of a vector register 4542 // (e.g. state[0] is replicated on all 4 lanes of v4, state[1] to all 4 lanes 4543 // of v5, etc.). Once all 16 elements have been broadcast onto 16 vectors, 4544 // the quarter round schedule is implemented as outlined in RFC 7539 section 4545 // 2.3. However, instead of sequentially processing the 3 quarter round 4546 // operations represented by one QUARTERROUND function, we instead stack all 4547 // the adds, xors and left-rotations from the first 4 quarter rounds together 4548 // and then do the same for the second set of 4 quarter rounds. This removes 4549 // some latency that would otherwise be incurred by waiting for an add to 4550 // complete before performing an xor (which depends on the result of the 4551 // add), etc. An adjustment happens between the first and second groups of 4 4552 // quarter rounds, but this is done only in the inputs to the macro functions 4553 // that generate the assembly instructions - these adjustments themselves are 4554 // not part of the resulting assembly. 4555 // The 4 registers v0-v3 are used during the quarter round operations as 4556 // scratch registers. Once the 20 rounds are complete, these 4 scratch 4557 // registers become the vectors involved in adding the start state back onto 4558 // the post-QR working state. After the adds are complete, each of the 16 4559 // vectors write their first lane back to the keystream buffer, followed 4560 // by the second lane from all vectors and so on. 4561 address generate_chacha20Block_blockpar() { 4562 Label L_twoRounds, L_cc20_const; 4563 // The constant data is broken into two 128-bit segments to be loaded 4564 // onto FloatRegisters. The first 128 bits are a counter add overlay 4565 // that adds +0/+1/+2/+3 to the vector holding replicated state[12]. 4566 // The second 128-bits is a table constant used for 8-bit left rotations. 4567 __ BIND(L_cc20_const); 4568 __ emit_int64(0x0000000100000000UL); 4569 __ emit_int64(0x0000000300000002UL); 4570 __ emit_int64(0x0605040702010003UL); 4571 __ emit_int64(0x0E0D0C0F0A09080BUL); 4572 4573 __ align(CodeEntryAlignment); 4574 StubId stub_id = StubId::stubgen_chacha20Block_id; 4575 StubCodeMark mark(this, stub_id); 4576 address start = __ pc(); 4577 __ enter(); 4578 4579 int i, j; 4580 const Register state = c_rarg0; 4581 const Register keystream = c_rarg1; 4582 const Register loopCtr = r10; 4583 const Register tmpAddr = r11; 4584 const FloatRegister ctrAddOverlay = v28; 4585 const FloatRegister lrot8Tbl = v29; 4586 4587 // Organize SIMD registers in an array that facilitates 4588 // putting repetitive opcodes into loop structures. It is 4589 // important that each grouping of 4 registers is monotonically 4590 // increasing to support the requirements of multi-register 4591 // instructions (e.g. ld4r, st4, etc.) 4592 const FloatRegister workSt[16] = { 4593 v4, v5, v6, v7, v16, v17, v18, v19, 4594 v20, v21, v22, v23, v24, v25, v26, v27 4595 }; 4596 4597 // Pull in constant data. The first 16 bytes are the add overlay 4598 // which is applied to the vector holding the counter (state[12]). 4599 // The second 16 bytes is the index register for the 8-bit left 4600 // rotation tbl instruction. 
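// (A left rotation by 8 of every 32-bit lane is a pure byte permutation,
// which is why the second 16-byte constant at L_cc20_const is a tbl index
// vector - each destination word takes source bytes 3,0,1,2 of itself -
// rather than a shift amount.)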
4601 __ adr(tmpAddr, L_cc20_const); 4602 __ ldpq(ctrAddOverlay, lrot8Tbl, Address(tmpAddr)); 4603 4604 // Load from memory and interlace across 16 SIMD registers, 4605 // With each word from memory being broadcast to all lanes of 4606 // each successive SIMD register. 4607 // Addr(0) -> All lanes in workSt[i] 4608 // Addr(4) -> All lanes workSt[i + 1], etc. 4609 __ mov(tmpAddr, state); 4610 for (i = 0; i < 16; i += 4) { 4611 __ ld4r(workSt[i], workSt[i + 1], workSt[i + 2], workSt[i + 3], __ T4S, 4612 __ post(tmpAddr, 16)); 4613 } 4614 __ addv(workSt[12], __ T4S, workSt[12], ctrAddOverlay); // Add ctr overlay 4615 4616 // Before entering the loop, create 5 4-register arrays. These 4617 // will hold the 4 registers that represent the a/b/c/d fields 4618 // in the quarter round operation. For instance the "b" field 4619 // for the first 4 quarter round operations is the set of v16/v17/v18/v19, 4620 // but in the second 4 quarter rounds it gets adjusted to v17/v18/v19/v16 4621 // since it is part of a diagonal organization. The aSet and scratch 4622 // register sets are defined at declaration time because they do not change 4623 // organization at any point during the 20-round processing. 4624 FloatRegister aSet[4] = { v4, v5, v6, v7 }; 4625 FloatRegister bSet[4]; 4626 FloatRegister cSet[4]; 4627 FloatRegister dSet[4]; 4628 FloatRegister scratch[4] = { v0, v1, v2, v3 }; 4629 4630 // Set up the 10 iteration loop and perform all 8 quarter round ops 4631 __ mov(loopCtr, 10); 4632 __ BIND(L_twoRounds); 4633 4634 // Set to columnar organization and do the following 4 quarter-rounds: 4635 // QUARTERROUND(0, 4, 8, 12) 4636 // QUARTERROUND(1, 5, 9, 13) 4637 // QUARTERROUND(2, 6, 10, 14) 4638 // QUARTERROUND(3, 7, 11, 15) 4639 __ cc20_set_qr_registers(bSet, workSt, 4, 5, 6, 7); 4640 __ cc20_set_qr_registers(cSet, workSt, 8, 9, 10, 11); 4641 __ cc20_set_qr_registers(dSet, workSt, 12, 13, 14, 15); 4642 4643 __ cc20_qr_add4(aSet, bSet); // a += b 4644 __ cc20_qr_xor4(dSet, aSet, dSet); // d ^= a 4645 __ cc20_qr_lrot4(dSet, dSet, 16, lrot8Tbl); // d <<<= 16 4646 4647 __ cc20_qr_add4(cSet, dSet); // c += d 4648 __ cc20_qr_xor4(bSet, cSet, scratch); // b ^= c (scratch) 4649 __ cc20_qr_lrot4(scratch, bSet, 12, lrot8Tbl); // b <<<= 12 4650 4651 __ cc20_qr_add4(aSet, bSet); // a += b 4652 __ cc20_qr_xor4(dSet, aSet, dSet); // d ^= a 4653 __ cc20_qr_lrot4(dSet, dSet, 8, lrot8Tbl); // d <<<= 8 4654 4655 __ cc20_qr_add4(cSet, dSet); // c += d 4656 __ cc20_qr_xor4(bSet, cSet, scratch); // b ^= c (scratch) 4657 __ cc20_qr_lrot4(scratch, bSet, 7, lrot8Tbl); // b <<<= 12 4658 4659 // Set to diagonal organization and do the next 4 quarter-rounds: 4660 // QUARTERROUND(0, 5, 10, 15) 4661 // QUARTERROUND(1, 6, 11, 12) 4662 // QUARTERROUND(2, 7, 8, 13) 4663 // QUARTERROUND(3, 4, 9, 14) 4664 __ cc20_set_qr_registers(bSet, workSt, 5, 6, 7, 4); 4665 __ cc20_set_qr_registers(cSet, workSt, 10, 11, 8, 9); 4666 __ cc20_set_qr_registers(dSet, workSt, 15, 12, 13, 14); 4667 4668 __ cc20_qr_add4(aSet, bSet); // a += b 4669 __ cc20_qr_xor4(dSet, aSet, dSet); // d ^= a 4670 __ cc20_qr_lrot4(dSet, dSet, 16, lrot8Tbl); // d <<<= 16 4671 4672 __ cc20_qr_add4(cSet, dSet); // c += d 4673 __ cc20_qr_xor4(bSet, cSet, scratch); // b ^= c (scratch) 4674 __ cc20_qr_lrot4(scratch, bSet, 12, lrot8Tbl); // b <<<= 12 4675 4676 __ cc20_qr_add4(aSet, bSet); // a += b 4677 __ cc20_qr_xor4(dSet, aSet, dSet); // d ^= a 4678 __ cc20_qr_lrot4(dSet, dSet, 8, lrot8Tbl); // d <<<= 8 4679 4680 __ cc20_qr_add4(cSet, dSet); // c += d 4681 __ cc20_qr_xor4(bSet, 
cSet, scratch); // b ^= c (scratch) 4682 __ cc20_qr_lrot4(scratch, bSet, 7, lrot8Tbl); // b <<<= 12 4683 4684 // Decrement and iterate 4685 __ sub(loopCtr, loopCtr, 1); 4686 __ cbnz(loopCtr, L_twoRounds); 4687 4688 __ mov(tmpAddr, state); 4689 4690 // Add the starting state back to the post-loop keystream 4691 // state. We read/interlace the state array from memory into 4692 // 4 registers similar to what we did in the beginning. Then 4693 // add the counter overlay onto workSt[12] at the end. 4694 for (i = 0; i < 16; i += 4) { 4695 __ ld4r(v0, v1, v2, v3, __ T4S, __ post(tmpAddr, 16)); 4696 __ addv(workSt[i], __ T4S, workSt[i], v0); 4697 __ addv(workSt[i + 1], __ T4S, workSt[i + 1], v1); 4698 __ addv(workSt[i + 2], __ T4S, workSt[i + 2], v2); 4699 __ addv(workSt[i + 3], __ T4S, workSt[i + 3], v3); 4700 } 4701 __ addv(workSt[12], __ T4S, workSt[12], ctrAddOverlay); // Add ctr overlay 4702 4703 // Write working state into the keystream buffer. This is accomplished 4704 // by taking the lane "i" from each of the four vectors and writing 4705 // it to consecutive 4-byte offsets, then post-incrementing by 16 and 4706 // repeating with the next 4 vectors until all 16 vectors have been used. 4707 // Then move to the next lane and repeat the process until all lanes have 4708 // been written. 4709 for (i = 0; i < 4; i++) { 4710 for (j = 0; j < 16; j += 4) { 4711 __ st4(workSt[j], workSt[j + 1], workSt[j + 2], workSt[j + 3], __ S, i, 4712 __ post(keystream, 16)); 4713 } 4714 } 4715 4716 __ mov(r0, 256); // Return length of output keystream 4717 __ leave(); 4718 __ ret(lr); 4719 4720 return start; 4721 } 4722 4723 // Helpers to schedule parallel operation bundles across vector 4724 // register sequences of size 2, 4 or 8. 4725 4726 // Implement various primitive computations across vector sequences 4727 4728 template<int N> 4729 void vs_addv(const VSeq<N>& v, Assembler::SIMD_Arrangement T, 4730 const VSeq<N>& v1, const VSeq<N>& v2) { 4731 // output must not be constant 4732 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4733 // output cannot overwrite pending inputs 4734 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4735 assert(!vs_write_before_read(v, v2), "output overwrites input"); 4736 for (int i = 0; i < N; i++) { 4737 __ addv(v[i], T, v1[i], v2[i]); 4738 } 4739 } 4740 4741 template<int N> 4742 void vs_subv(const VSeq<N>& v, Assembler::SIMD_Arrangement T, 4743 const VSeq<N>& v1, const VSeq<N>& v2) { 4744 // output must not be constant 4745 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4746 // output cannot overwrite pending inputs 4747 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4748 assert(!vs_write_before_read(v, v2), "output overwrites input"); 4749 for (int i = 0; i < N; i++) { 4750 __ subv(v[i], T, v1[i], v2[i]); 4751 } 4752 } 4753 4754 template<int N> 4755 void vs_mulv(const VSeq<N>& v, Assembler::SIMD_Arrangement T, 4756 const VSeq<N>& v1, const VSeq<N>& v2) { 4757 // output must not be constant 4758 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4759 // output cannot overwrite pending inputs 4760 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4761 assert(!vs_write_before_read(v, v2), "output overwrites input"); 4762 for (int i = 0; i < N; i++) { 4763 __ mulv(v[i], T, v1[i], v2[i]); 4764 } 4765 } 4766 4767 template<int N> 4768 void vs_negr(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& 
v1) { 4769 // output must not be constant 4770 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4771 // output cannot overwrite pending inputs 4772 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4773 for (int i = 0; i < N; i++) { 4774 __ negr(v[i], T, v1[i]); 4775 } 4776 } 4777 4778 template<int N> 4779 void vs_sshr(const VSeq<N>& v, Assembler::SIMD_Arrangement T, 4780 const VSeq<N>& v1, int shift) { 4781 // output must not be constant 4782 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4783 // output cannot overwrite pending inputs 4784 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4785 for (int i = 0; i < N; i++) { 4786 __ sshr(v[i], T, v1[i], shift); 4787 } 4788 } 4789 4790 template<int N> 4791 void vs_andr(const VSeq<N>& v, const VSeq<N>& v1, const VSeq<N>& v2) { 4792 // output must not be constant 4793 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4794 // output cannot overwrite pending inputs 4795 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4796 assert(!vs_write_before_read(v, v2), "output overwrites input"); 4797 for (int i = 0; i < N; i++) { 4798 __ andr(v[i], __ T16B, v1[i], v2[i]); 4799 } 4800 } 4801 4802 template<int N> 4803 void vs_orr(const VSeq<N>& v, const VSeq<N>& v1, const VSeq<N>& v2) { 4804 // output must not be constant 4805 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4806 // output cannot overwrite pending inputs 4807 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4808 assert(!vs_write_before_read(v, v2), "output overwrites input"); 4809 for (int i = 0; i < N; i++) { 4810 __ orr(v[i], __ T16B, v1[i], v2[i]); 4811 } 4812 } 4813 4814 template<int N> 4815 void vs_notr(const VSeq<N>& v, const VSeq<N>& v1) { 4816 // output must not be constant 4817 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4818 // output cannot overwrite pending inputs 4819 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4820 for (int i = 0; i < N; i++) { 4821 __ notr(v[i], __ T16B, v1[i]); 4822 } 4823 } 4824 4825 template<int N> 4826 void vs_sqdmulh(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1, const VSeq<N>& v2) { 4827 // output must not be constant 4828 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4829 // output cannot overwrite pending inputs 4830 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4831 assert(!vs_write_before_read(v, v2), "output overwrites input"); 4832 for (int i = 0; i < N; i++) { 4833 __ sqdmulh(v[i], T, v1[i], v2[i]); 4834 } 4835 } 4836 4837 template<int N> 4838 void vs_mlsv(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1, VSeq<N>& v2) { 4839 // output must not be constant 4840 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4841 // output cannot overwrite pending inputs 4842 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4843 assert(!vs_write_before_read(v, v2), "output overwrites input"); 4844 for (int i = 0; i < N; i++) { 4845 __ mlsv(v[i], T, v1[i], v2[i]); 4846 } 4847 } 4848 4849 // load N/2 successive pairs of quadword values from memory in order 4850 // into N successive vector registers of the sequence via the 4851 // address supplied in base. 
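// Each vs_* helper below simply unrolls one instruction per register (or
// register pair) of a VSeq. For example (illustrative only), given a
// VSeq<4> starting at v16, vs_ldpq_post(vs, r0) expands to
//
//   ldp q16, q17, [x0], #32
//   ldp q18, q19, [x0], #32
//
// leaving r0 advanced past the 64 bytes read.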
4852 template<int N> 4853 void vs_ldpq(const VSeq<N>& v, Register base) { 4854 for (int i = 0; i < N; i += 2) { 4855 __ ldpq(v[i], v[i+1], Address(base, 32 * i)); 4856 } 4857 } 4858 4859 // load N/2 successive pairs of quadword values from memory in order 4860 // into N vector registers of the sequence via the address supplied 4861 // in base using post-increment addressing 4862 template<int N> 4863 void vs_ldpq_post(const VSeq<N>& v, Register base) { 4864 static_assert((N & (N - 1)) == 0, "sequence length must be even"); 4865 for (int i = 0; i < N; i += 2) { 4866 __ ldpq(v[i], v[i+1], __ post(base, 32)); 4867 } 4868 } 4869 4870 // store N successive vector registers of the sequence into N/2 4871 // successive pairs of quadword memory locations via the address 4872 // supplied in base using post-increment addressing 4873 template<int N> 4874 void vs_stpq_post(const VSeq<N>& v, Register base) { 4875 static_assert((N & (N - 1)) == 0, "sequence length must be even"); 4876 for (int i = 0; i < N; i += 2) { 4877 __ stpq(v[i], v[i+1], __ post(base, 32)); 4878 } 4879 } 4880 4881 // load N/2 pairs of quadword values from memory de-interleaved into 4882 // N vector registers 2 at a time via the address supplied in base 4883 // using post-increment addressing. 4884 template<int N> 4885 void vs_ld2_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) { 4886 static_assert((N & (N - 1)) == 0, "sequence length must be even"); 4887 for (int i = 0; i < N; i += 2) { 4888 __ ld2(v[i], v[i+1], T, __ post(base, 32)); 4889 } 4890 } 4891 4892 // store N vector registers interleaved into N/2 pairs of quadword 4893 // memory locations via the address supplied in base using 4894 // post-increment addressing. 4895 template<int N> 4896 void vs_st2_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) { 4897 static_assert((N & (N - 1)) == 0, "sequence length must be even"); 4898 for (int i = 0; i < N; i += 2) { 4899 __ st2(v[i], v[i+1], T, __ post(base, 32)); 4900 } 4901 } 4902 4903 // load N quadword values from memory de-interleaved into N vector 4904 // registers 3 elements at a time via the address supplied in base. 4905 template<int N> 4906 void vs_ld3(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) { 4907 static_assert(N == ((N / 3) * 3), "sequence length must be multiple of 3"); 4908 for (int i = 0; i < N; i += 3) { 4909 __ ld3(v[i], v[i+1], v[i+2], T, base); 4910 } 4911 } 4912 4913 // load N quadword values from memory de-interleaved into N vector 4914 // registers 3 elements at a time via the address supplied in base 4915 // using post-increment addressing. 
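// (ld3 de-interleaves 3-element structures: e.g. ld3 {v0.4s,v1.4s,v2.4s},
// [x0] reads 12 consecutive words and places words 0,3,6,9 in v0, words
// 1,4,7,10 in v1 and words 2,5,8,11 in v2, giving structure-of-arrays
// access to packed triples.)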
4916 template<int N>
4917 void vs_ld3_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
4918 static_assert(N == ((N / 3) * 3), "sequence length must be multiple of 3");
4919 for (int i = 0; i < N; i += 3) {
4920 __ ld3(v[i], v[i+1], v[i+2], T, __ post(base, 48));
4921 }
4922 }
4923 
4924 // load N/2 pairs of quadword values from memory into N vector
4925 // registers via the address supplied in base with each pair indexed
4926 // using the start offset plus the corresponding entry in the
4927 // offsets array
4928 template<int N>
4929 void vs_ldpq_indexed(const VSeq<N>& v, Register base, int start, int (&offsets)[N/2]) {
4930 for (int i = 0; i < N/2; i++) {
4931 __ ldpq(v[2*i], v[2*i+1], Address(base, start + offsets[i]));
4932 }
4933 }
4934 
4935 // store N vector registers into N/2 pairs of quadword memory
4936 // locations via the address supplied in base with each pair indexed
4937 // using the start offset plus the corresponding entry in the
4938 // offsets array
4939 template<int N>
4940 void vs_stpq_indexed(const VSeq<N>& v, Register base, int start, int offsets[N/2]) {
4941 for (int i = 0; i < N/2; i++) {
4942 __ stpq(v[2*i], v[2*i+1], Address(base, start + offsets[i]));
4943 }
4944 }
4945 
4946 // load N single quadword values from memory into N vector registers
4947 // via the address supplied in base with each value indexed using
4948 // the start offset plus the corresponding entry in the offsets
4949 // array
4950 template<int N>
4951 void vs_ldr_indexed(const VSeq<N>& v, Assembler::SIMD_RegVariant T, Register base,
4952 int start, int (&offsets)[N]) {
4953 for (int i = 0; i < N; i++) {
4954 __ ldr(v[i], T, Address(base, start + offsets[i]));
4955 }
4956 }
4957 
4958 // store N vector registers into N single quadword memory locations
4959 // via the address supplied in base with each value indexed using
4960 // the start offset plus the corresponding entry in the offsets
4961 // array
4962 template<int N>
4963 void vs_str_indexed(const VSeq<N>& v, Assembler::SIMD_RegVariant T, Register base,
4964 int start, int (&offsets)[N]) {
4965 for (int i = 0; i < N; i++) {
4966 __ str(v[i], T, Address(base, start + offsets[i]));
4967 }
4968 }
4969 
4970 // load N/2 pairs of quadword values from memory de-interleaved into
4971 // N vector registers 2 at a time via the address supplied in base
4972 // with each pair indexed using the start offset plus the
4973 // corresponding entry in the offsets array
4974 template<int N>
4975 void vs_ld2_indexed(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base,
4976 Register tmp, int start, int (&offsets)[N/2]) {
4977 for (int i = 0; i < N/2; i++) {
4978 __ add(tmp, base, start + offsets[i]);
4979 __ ld2(v[2*i], v[2*i+1], T, tmp);
4980 }
4981 }
4982 
4983 // store N vector registers 2 at a time interleaved into N/2 pairs
4984 // of quadword memory locations via the address supplied in base
4985 // with each pair indexed using the start offset plus the
4986 // corresponding entry in the offsets array
4987 template<int N>
4988 void vs_st2_indexed(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base,
4989 Register tmp, int start, int (&offsets)[N/2]) {
4990 for (int i = 0; i < N/2; i++) {
4991 __ add(tmp, base, start + offsets[i]);
4992 __ st2(v[2*i], v[2*i+1], T, tmp);
4993 }
4994 }
4995 
4996 // Helper routines for various flavours of Montgomery multiply
4997 
4998 // Perform 16 32-bit (4x4S) or 32 16-bit (4 x 8H) Montgomery
4999 // multiplications in parallel
5000 //
5001 
5002 // See the
montMul() method of the sun.security.provider.ML_DSA
5003 // class.
5004 //
5005 // Computes 4x4S results or 4x8H results
5006 // a = b * c * 2^-MONT_R_BITS mod MONT_Q
5007 // Inputs: vb, vc - 4x4S or 4x8H vector register sequences
5008 // vq - 2x4S or 2x8H constants <MONT_Q, MONT_Q_INV_MOD_R>
5009 // Temps: vtmp - 4x4S or 4x8H vector sequence trashed after call
5010 // Outputs: va - 4x4S or 4x8H vector register sequences
5011 // vb, vc, vtmp and vq must all be disjoint
5012 // va must be disjoint from all other inputs/temps or must equal vc
5013 // va must have a non-zero delta i.e. it must not be a constant vseq.
5014 // n.b. MONT_R_BITS is 16 or 32, so the right shift by it is implicit.
5015 void vs_montmul4(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
5016 Assembler::SIMD_Arrangement T,
5017 const VSeq<4>& vtmp, const VSeq<2>& vq) {
5018 assert (T == __ T4S || T == __ T8H, "invalid arrangement for montmul");
5019 assert(vs_disjoint(vb, vc), "vb and vc overlap");
5020 assert(vs_disjoint(vb, vq), "vb and vq overlap");
5021 assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
5022 
5023 assert(vs_disjoint(vc, vq), "vc and vq overlap");
5024 assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
5025 
5026 assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
5027 
5028 assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
5029 assert(vs_disjoint(va, vb), "va and vb overlap");
5030 assert(vs_disjoint(va, vq), "va and vq overlap");
5031 assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
5032 assert(!va.is_constant(), "output vector must identify 4 different registers");
5033 
5034 // schedule 4 streams of instructions across the vector sequences
5035 for (int i = 0; i < 4; i++) {
5036 __ sqdmulh(vtmp[i], T, vb[i], vc[i]); // aHigh = hi32(2 * b * c)
5037 __ mulv(va[i], T, vb[i], vc[i]); // aLow = lo32(b * c)
5038 }
5039 
5040 for (int i = 0; i < 4; i++) {
5041 __ mulv(va[i], T, va[i], vq[0]); // m = aLow * qinv
5042 }
5043 
5044 for (int i = 0; i < 4; i++) {
5045 __ sqdmulh(va[i], T, va[i], vq[1]); // n = hi32(2 * m * q)
5046 }
5047 
5048 for (int i = 0; i < 4; i++) {
5049 __ shsubv(va[i], T, vtmp[i], va[i]); // a = (aHigh - n) / 2
5050 }
5051 }
5052 
5053 // Perform 8 32-bit (2x4S) or 16 16-bit (2 x 8H) Montgomery
5054 // multiplications in parallel
5055 //
5056 
5057 // See the montMul() method of the sun.security.provider.ML_DSA
5058 // class.
5059 //
5060 // Computes 2x4S results or 2x8H results
5061 // a = b * c * 2^-MONT_R_BITS mod MONT_Q
5062 // Inputs: vb, vc - 2x4S or 2x8H vector register sequences
5063 // vq - 2x4S or 2x8H constants <MONT_Q, MONT_Q_INV_MOD_R>
5064 // Temps: vtmp - 2x4S or 2x8H vector sequence trashed after call
5065 // Outputs: va - 2x4S or 2x8H vector register sequences
5066 // vb, vc, vtmp and vq must all be disjoint
5067 // va must be disjoint from all other inputs/temps or must equal vc
5068 // va must have a non-zero delta i.e. it must not be a constant vseq.
5069 // n.b. MONT_R_BITS is 16 or 32, so the right shift by it is implicit.
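// For reference, a scalar sketch of the per-lane value computed by the
// montmul helpers below (shown for the 16-bit/8H case; the 32-bit/4S case
// is the same with int32_t/int64_t). This is illustrative commentary only,
// not generated code, and montmul_ref is a hypothetical name:
//
//   static int16_t montmul_ref(int16_t b, int16_t c, int16_t q, int16_t qinv) {
//     int32_t t = (int32_t)b * c;        // full product b * c
//     int16_t m = (int16_t)(t * qinv);   // m = lo16(b * c) * q^-1 mod 2^16
//     int32_t u = (int32_t)m * q;        // m * q clears the low 16 bits of t - u
//     return (int16_t)((t - u) >> 16);   // a == b * c * 2^-16 mod q
//   }
//
// The vector code computes the same result as (hi(2*b*c) - hi(2*m*q)) / 2
// using sqdmulh and shsubv, which avoids materializing the double-width
// product in a vector register.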
5070 void vs_montmul2(const VSeq<2>& va, const VSeq<2>& vb, const VSeq<2>& vc, 5071 Assembler::SIMD_Arrangement T, 5072 const VSeq<2>& vtmp, const VSeq<2>& vq) { 5073 assert (T == __ T4S || T == __ T8H, "invalid arrangement for montmul"); 5074 assert(vs_disjoint(vb, vc), "vb and vc overlap"); 5075 assert(vs_disjoint(vb, vq), "vb and vq overlap"); 5076 assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap"); 5077 5078 assert(vs_disjoint(vc, vq), "vc and vq overlap"); 5079 assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap"); 5080 5081 assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap"); 5082 5083 assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal"); 5084 assert(vs_disjoint(va, vb), "va and vb overlap"); 5085 assert(vs_disjoint(va, vq), "va and vq overlap"); 5086 assert(vs_disjoint(va, vtmp), "va and vtmp overlap"); 5087 assert(!va.is_constant(), "output vector must identify 2 different registers"); 5088 5089 // schedule 2 streams of instructions across the vector sequences 5090 for (int i = 0; i < 2; i++) { 5091 __ sqdmulh(vtmp[i], T, vb[i], vc[i]); // aHigh = hi32(2 * b * c) 5092 __ mulv(va[i], T, vb[i], vc[i]); // aLow = lo32(b * c) 5093 } 5094 5095 for (int i = 0; i < 2; i++) { 5096 __ mulv(va[i], T, va[i], vq[0]); // m = aLow * qinv 5097 } 5098 5099 for (int i = 0; i < 2; i++) { 5100 __ sqdmulh(va[i], T, va[i], vq[1]); // n = hi32(2 * m * q) 5101 } 5102 5103 for (int i = 0; i < 2; i++) { 5104 __ shsubv(va[i], T, vtmp[i], va[i]); // a = (aHigh - n) / 2 5105 } 5106 } 5107 5108 // Perform 16 16-bit Montgomery multiplications in parallel. 5109 void kyber_montmul16(const VSeq<2>& va, const VSeq<2>& vb, const VSeq<2>& vc, 5110 const VSeq<2>& vtmp, const VSeq<2>& vq) { 5111 // Use the helper routine to schedule a 2x8H Montgomery multiply. 5112 // It will assert that the register use is valid 5113 vs_montmul2(va, vb, vc, __ T8H, vtmp, vq); 5114 } 5115 5116 // Perform 32 16-bit Montgomery multiplications in parallel. 5117 void kyber_montmul32(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc, 5118 const VSeq<4>& vtmp, const VSeq<2>& vq) { 5119 // Use the helper routine to schedule a 4x8H Montgomery multiply. 5120 // It will assert that the register use is valid 5121 vs_montmul4(va, vb, vc, __ T8H, vtmp, vq); 5122 } 5123 5124 // Perform 64 16-bit Montgomery multiplications in parallel. 5125 void kyber_montmul64(const VSeq<8>& va, const VSeq<8>& vb, const VSeq<8>& vc, 5126 const VSeq<4>& vtmp, const VSeq<2>& vq) { 5127 // Schedule two successive 4x8H multiplies via the montmul helper 5128 // on the front and back halves of va, vb and vc. The helper will 5129 // assert that the register use has no overlap conflicts on each 5130 // individual call but we also need to ensure that the necessary 5131 // disjoint/equality constraints are met across both calls. 5132 5133 // vb, vc, vtmp and vq must be disjoint. 
va must either be 5134 // disjoint from all other registers or equal vc 5135 5136 assert(vs_disjoint(vb, vc), "vb and vc overlap"); 5137 assert(vs_disjoint(vb, vq), "vb and vq overlap"); 5138 assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap"); 5139 5140 assert(vs_disjoint(vc, vq), "vc and vq overlap"); 5141 assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap"); 5142 5143 assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap"); 5144 5145 assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal"); 5146 assert(vs_disjoint(va, vb), "va and vb overlap"); 5147 assert(vs_disjoint(va, vq), "va and vq overlap"); 5148 assert(vs_disjoint(va, vtmp), "va and vtmp overlap"); 5149 5150 // we multiply the front and back halves of each sequence 4 at a 5151 // time because 5152 // 5153 // 1) we are currently only able to get 4-way instruction 5154 // parallelism at best 5155 // 5156 // 2) we need registers for the constants in vq and temporary 5157 // scratch registers to hold intermediate results so vtmp can only 5158 // be a VSeq<4> which means we only have 4 scratch slots 5159 5160 vs_montmul4(vs_front(va), vs_front(vb), vs_front(vc), __ T8H, vtmp, vq); 5161 vs_montmul4(vs_back(va), vs_back(vb), vs_back(vc), __ T8H, vtmp, vq); 5162 } 5163 5164 void kyber_montmul32_sub_add(const VSeq<4>& va0, const VSeq<4>& va1, 5165 const VSeq<4>& vc, 5166 const VSeq<4>& vtmp, 5167 const VSeq<2>& vq) { 5168 // compute a = montmul(a1, c) 5169 kyber_montmul32(vc, va1, vc, vtmp, vq); 5170 // ouptut a1 = a0 - a 5171 vs_subv(va1, __ T8H, va0, vc); 5172 // and a0 = a0 + a 5173 vs_addv(va0, __ T8H, va0, vc); 5174 } 5175 5176 void kyber_sub_add_montmul32(const VSeq<4>& va0, const VSeq<4>& va1, 5177 const VSeq<4>& vb, 5178 const VSeq<4>& vtmp1, 5179 const VSeq<4>& vtmp2, 5180 const VSeq<2>& vq) { 5181 // compute c = a0 - a1 5182 vs_subv(vtmp1, __ T8H, va0, va1); 5183 // output a0 = a0 + a1 5184 vs_addv(va0, __ T8H, va0, va1); 5185 // output a1 = b montmul c 5186 kyber_montmul32(va1, vtmp1, vb, vtmp2, vq); 5187 } 5188 5189 void load64shorts(const VSeq<8>& v, Register shorts) { 5190 vs_ldpq_post(v, shorts); 5191 } 5192 5193 void load32shorts(const VSeq<4>& v, Register shorts) { 5194 vs_ldpq_post(v, shorts); 5195 } 5196 5197 void store64shorts(VSeq<8> v, Register tmpAddr) { 5198 vs_stpq_post(v, tmpAddr); 5199 } 5200 5201 // Kyber NTT function. 5202 // Implements 5203 // static int implKyberNtt(short[] poly, short[] ntt_zetas) {} 5204 // 5205 // coeffs (short[256]) = c_rarg0 5206 // ntt_zetas (short[256]) = c_rarg1 5207 address generate_kyberNtt() { 5208 5209 __ align(CodeEntryAlignment); 5210 StubId stub_id = StubId::stubgen_kyberNtt_id; 5211 StubCodeMark mark(this, stub_id); 5212 address start = __ pc(); 5213 __ enter(); 5214 5215 const Register coeffs = c_rarg0; 5216 const Register zetas = c_rarg1; 5217 5218 const Register kyberConsts = r10; 5219 const Register tmpAddr = r11; 5220 5221 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x8H inputs/outputs 5222 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3 5223 VSeq<2> vq(30); // n.b. constants overlap vs3 5224 5225 __ lea(kyberConsts, ExternalAddress((address) StubRoutines::aarch64::_kyberConsts)); 5226 // load the montmul constants 5227 vs_ldpq(vq, kyberConsts); 5228 5229 // Each level corresponds to an iteration of the outermost loop of the 5230 // Java method seilerNTT(int[] coeffs). There are some differences 5231 // from what is done in the seilerNTT() method, though: 5232 // 1. 
The computation uses 16-bit signed values; we do not convert them
5233 // to ints here.
5234 // 2. The zetas are delivered in a bigger array: 128 zetas are stored in
5235 // this array for each level, which makes it easier to fill up the vector
5236 // registers.
5237 // 3. In the seilerNTT() method we use R = 2^20 for the Montgomery
5238 // multiplications (that way there should not be any
5239 // overflow during the inverse NTT computation); here we use R = 2^16 so
5240 // that we can use 16-bit arithmetic in the vector unit.
5241 //
5242 // On each level, we fill up the vector registers in such a way that the
5243 // array elements that need to be multiplied by the zetas go into one
5244 // set of vector registers while the corresponding ones that don't need to
5245 // be multiplied go into another set.
5246 // We can do 32 Montgomery multiplications in parallel, using 12 vector
5247 // registers interleaving the steps of 4 identical computations,
5248 // each done on 8 16-bit values per register.
5249 
5250 // At levels 0-3 the coefficients multiplied by or added/subtracted
5251 // to the zetas occur in discrete blocks whose size is some multiple
5252 // of 32.
5253 
5254 // level 0
5255 __ add(tmpAddr, coeffs, 256);
5256 load64shorts(vs1, tmpAddr);
5257 load64shorts(vs2, zetas);
5258 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5259 __ add(tmpAddr, coeffs, 0);
5260 load64shorts(vs1, tmpAddr);
5261 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5262 vs_addv(vs1, __ T8H, vs1, vs2);
5263 __ add(tmpAddr, coeffs, 0);
5264 vs_stpq_post(vs1, tmpAddr);
5265 __ add(tmpAddr, coeffs, 256);
5266 vs_stpq_post(vs3, tmpAddr);
5267 // restore montmul constants
5268 vs_ldpq(vq, kyberConsts);
5269 load64shorts(vs1, tmpAddr);
5270 load64shorts(vs2, zetas);
5271 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5272 __ add(tmpAddr, coeffs, 128);
5273 load64shorts(vs1, tmpAddr);
5274 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5275 vs_addv(vs1, __ T8H, vs1, vs2);
5276 __ add(tmpAddr, coeffs, 128);
5277 store64shorts(vs1, tmpAddr);
5278 __ add(tmpAddr, coeffs, 384);
5279 store64shorts(vs3, tmpAddr);
5280 
5281 // level 1
5282 // restore montmul constants
5283 vs_ldpq(vq, kyberConsts);
5284 __ add(tmpAddr, coeffs, 128);
5285 load64shorts(vs1, tmpAddr);
5286 load64shorts(vs2, zetas);
5287 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5288 __ add(tmpAddr, coeffs, 0);
5289 load64shorts(vs1, tmpAddr);
5290 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5291 vs_addv(vs1, __ T8H, vs1, vs2);
5292 __ add(tmpAddr, coeffs, 0);
5293 store64shorts(vs1, tmpAddr);
5294 store64shorts(vs3, tmpAddr);
5295 vs_ldpq(vq, kyberConsts);
5296 __ add(tmpAddr, coeffs, 384);
5297 load64shorts(vs1, tmpAddr);
5298 load64shorts(vs2, zetas);
5299 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5300 __ add(tmpAddr, coeffs, 256);
5301 load64shorts(vs1, tmpAddr);
5302 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5303 vs_addv(vs1, __ T8H, vs1, vs2);
5304 __ add(tmpAddr, coeffs, 256);
5305 store64shorts(vs1, tmpAddr);
5306 store64shorts(vs3, tmpAddr);
5307 
5308 // level 2
5309 vs_ldpq(vq, kyberConsts);
5310 int offsets1[4] = { 0, 32, 128, 160 };
5311 vs_ldpq_indexed(vs1, coeffs, 64, offsets1);
5312 load64shorts(vs2, zetas);
5313 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5314 vs_ldpq_indexed(vs1, coeffs, 0, offsets1);
5315 // kyber_subv_addv64();
5316 vs_subv(vs3, __ T8H, vs1, vs2); // n.b.
trashes vq 5317 vs_addv(vs1, __ T8H, vs1, vs2); 5318 __ add(tmpAddr, coeffs, 0); 5319 vs_stpq_post(vs_front(vs1), tmpAddr); 5320 vs_stpq_post(vs_front(vs3), tmpAddr); 5321 vs_stpq_post(vs_back(vs1), tmpAddr); 5322 vs_stpq_post(vs_back(vs3), tmpAddr); 5323 vs_ldpq(vq, kyberConsts); 5324 vs_ldpq_indexed(vs1, tmpAddr, 64, offsets1); 5325 load64shorts(vs2, zetas); 5326 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5327 vs_ldpq_indexed(vs1, coeffs, 256, offsets1); 5328 // kyber_subv_addv64(); 5329 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5330 vs_addv(vs1, __ T8H, vs1, vs2); 5331 __ add(tmpAddr, coeffs, 256); 5332 vs_stpq_post(vs_front(vs1), tmpAddr); 5333 vs_stpq_post(vs_front(vs3), tmpAddr); 5334 vs_stpq_post(vs_back(vs1), tmpAddr); 5335 vs_stpq_post(vs_back(vs3), tmpAddr); 5336 5337 // level 3 5338 vs_ldpq(vq, kyberConsts); 5339 int offsets2[4] = { 0, 64, 128, 192 }; 5340 vs_ldpq_indexed(vs1, coeffs, 32, offsets2); 5341 load64shorts(vs2, zetas); 5342 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5343 vs_ldpq_indexed(vs1, coeffs, 0, offsets2); 5344 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5345 vs_addv(vs1, __ T8H, vs1, vs2); 5346 vs_stpq_indexed(vs1, coeffs, 0, offsets2); 5347 vs_stpq_indexed(vs3, coeffs, 32, offsets2); 5348 5349 vs_ldpq(vq, kyberConsts); 5350 vs_ldpq_indexed(vs1, coeffs, 256 + 32, offsets2); 5351 load64shorts(vs2, zetas); 5352 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5353 vs_ldpq_indexed(vs1, coeffs, 256, offsets2); 5354 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5355 vs_addv(vs1, __ T8H, vs1, vs2); 5356 vs_stpq_indexed(vs1, coeffs, 256, offsets2); 5357 vs_stpq_indexed(vs3, coeffs, 256 + 32, offsets2); 5358 5359 // level 4 5360 // At level 4 coefficients occur in 8 discrete blocks of size 16 5361 // so they are loaded using employing an ldr at 8 distinct offsets. 5362 5363 vs_ldpq(vq, kyberConsts); 5364 int offsets3[8] = { 0, 32, 64, 96, 128, 160, 192, 224 }; 5365 vs_ldr_indexed(vs1, __ Q, coeffs, 16, offsets3); 5366 load64shorts(vs2, zetas); 5367 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5368 vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3); 5369 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5370 vs_addv(vs1, __ T8H, vs1, vs2); 5371 vs_str_indexed(vs1, __ Q, coeffs, 0, offsets3); 5372 vs_str_indexed(vs3, __ Q, coeffs, 16, offsets3); 5373 5374 vs_ldpq(vq, kyberConsts); 5375 vs_ldr_indexed(vs1, __ Q, coeffs, 256 + 16, offsets3); 5376 load64shorts(vs2, zetas); 5377 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5378 vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3); 5379 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5380 vs_addv(vs1, __ T8H, vs1, vs2); 5381 vs_str_indexed(vs1, __ Q, coeffs, 256, offsets3); 5382 vs_str_indexed(vs3, __ Q, coeffs, 256 + 16, offsets3); 5383 5384 // level 5 5385 // At level 5 related coefficients occur in discrete blocks of size 8 so 5386 // need to be loaded interleaved using an ld2 operation with arrangement 2D. 
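// To illustrate the ld2/st2 usage at this level: with arrangement 2D an
// ld2 of 32 bytes de-interleaves alternate doublewords, i.e. alternate
// groups of 4 shorts, into its two target registers. A scalar model of
// one such load over 16 adjacent coefficients c[0..15] (an illustrative
// sketch, with reg1/reg2 standing for the even/odd registers of the pair):
//
//   for (int k = 0; k < 4; k++) {
//     reg1[k]     = c[k];         // c[0..3]   -> low doubleword of reg1
//     reg2[k]     = c[4 + k];     // c[4..7]   -> low doubleword of reg2
//     reg1[4 + k] = c[8 + k];     // c[8..11]  -> high doubleword of reg1
//     reg2[4 + k] = c[12 + k];    // c[12..15] -> high doubleword of reg2
//   }
//
// so the even elements of vs1 collect the add/sub operands and the odd
// elements collect the operands that get montmul'ed by the zetas, which
// is the split kyber_montmul32_sub_add expects.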
5387 5388 vs_ldpq(vq, kyberConsts); 5389 int offsets4[4] = { 0, 32, 64, 96 }; 5390 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4); 5391 load32shorts(vs_front(vs2), zetas); 5392 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq); 5393 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4); 5394 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4); 5395 load32shorts(vs_front(vs2), zetas); 5396 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq); 5397 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4); 5398 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4); 5399 load32shorts(vs_front(vs2), zetas); 5400 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq); 5401 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4); 5402 5403 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4); 5404 load32shorts(vs_front(vs2), zetas); 5405 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq); 5406 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4); 5407 5408 // level 6 5409 // At level 6 related coefficients occur in discrete blocks of size 4 so 5410 // need to be loaded interleaved using an ld2 operation with arrangement 4S. 5411 5412 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4); 5413 load32shorts(vs_front(vs2), zetas); 5414 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq); 5415 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4); 5416 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4); 5417 // __ ldpq(v18, v19, __ post(zetas, 32)); 5418 load32shorts(vs_front(vs2), zetas); 5419 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq); 5420 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4); 5421 5422 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4); 5423 load32shorts(vs_front(vs2), zetas); 5424 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq); 5425 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4); 5426 5427 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4); 5428 load32shorts(vs_front(vs2), zetas); 5429 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq); 5430 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4); 5431 5432 __ leave(); // required for proper stackwalking of RuntimeStub frame 5433 __ mov(r0, zr); // return 0 5434 __ ret(lr); 5435 5436 return start; 5437 } 5438 5439 // Kyber Inverse NTT function 5440 // Implements 5441 // static int implKyberInverseNtt(short[] poly, short[] zetas) {} 5442 // 5443 // coeffs (short[256]) = c_rarg0 5444 // ntt_zetas (short[256]) = c_rarg1 5445 address generate_kyberInverseNtt() { 5446 5447 __ align(CodeEntryAlignment); 5448 StubId stub_id = StubId::stubgen_kyberInverseNtt_id; 5449 StubCodeMark mark(this, stub_id); 5450 address start = __ pc(); 5451 __ enter(); 5452 5453 const Register coeffs = c_rarg0; 5454 const Register zetas = c_rarg1; 5455 5456 const Register kyberConsts = r10; 5457 const Register tmpAddr = r11; 5458 const Register tmpAddr2 = c_rarg2; 5459 5460 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x8H inputs/outputs 5461 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3 5462 VSeq<2> vq(30); // n.b. 
constants overlap vs3 5463 5464 __ lea(kyberConsts, 5465 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts)); 5466 5467 // level 0 5468 // At level 0 related coefficients occur in discrete blocks of size 4 so 5469 // need to be loaded interleaved using an ld2 operation with arrangement 4S. 5470 5471 vs_ldpq(vq, kyberConsts); 5472 int offsets4[4] = { 0, 32, 64, 96 }; 5473 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4); 5474 load32shorts(vs_front(vs2), zetas); 5475 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1), 5476 vs_front(vs2), vs_back(vs2), vtmp, vq); 5477 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4); 5478 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4); 5479 load32shorts(vs_front(vs2), zetas); 5480 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1), 5481 vs_front(vs2), vs_back(vs2), vtmp, vq); 5482 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4); 5483 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4); 5484 load32shorts(vs_front(vs2), zetas); 5485 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1), 5486 vs_front(vs2), vs_back(vs2), vtmp, vq); 5487 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4); 5488 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4); 5489 load32shorts(vs_front(vs2), zetas); 5490 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1), 5491 vs_front(vs2), vs_back(vs2), vtmp, vq); 5492 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4); 5493 5494 // level 1 5495 // At level 1 related coefficients occur in discrete blocks of size 8 so 5496 // need to be loaded interleaved using an ld2 operation with arrangement 2D. 5497 5498 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4); 5499 load32shorts(vs_front(vs2), zetas); 5500 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1), 5501 vs_front(vs2), vs_back(vs2), vtmp, vq); 5502 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4); 5503 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4); 5504 load32shorts(vs_front(vs2), zetas); 5505 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1), 5506 vs_front(vs2), vs_back(vs2), vtmp, vq); 5507 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4); 5508 5509 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4); 5510 load32shorts(vs_front(vs2), zetas); 5511 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1), 5512 vs_front(vs2), vs_back(vs2), vtmp, vq); 5513 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4); 5514 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4); 5515 load32shorts(vs_front(vs2), zetas); 5516 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1), 5517 vs_front(vs2), vs_back(vs2), vtmp, vq); 5518 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4); 5519 5520 // level 2 5521 // At level 2 coefficients occur in 8 discrete blocks of size 16 5522 // so they are loaded using employing an ldr at 8 distinct offsets. 5523 5524 int offsets3[8] = { 0, 32, 64, 96, 128, 160, 192, 224 }; 5525 vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3); 5526 vs_ldr_indexed(vs2, __ Q, coeffs, 16, offsets3); 5527 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. 
trashes vq 5528 vs_subv(vs1, __ T8H, vs1, vs2); 5529 vs_str_indexed(vs3, __ Q, coeffs, 0, offsets3); 5530 load64shorts(vs2, zetas); 5531 vs_ldpq(vq, kyberConsts); 5532 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5533 vs_str_indexed(vs2, __ Q, coeffs, 16, offsets3); 5534 5535 vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3); 5536 vs_ldr_indexed(vs2, __ Q, coeffs, 256 + 16, offsets3); 5537 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5538 vs_subv(vs1, __ T8H, vs1, vs2); 5539 vs_str_indexed(vs3, __ Q, coeffs, 256, offsets3); 5540 load64shorts(vs2, zetas); 5541 vs_ldpq(vq, kyberConsts); 5542 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5543 vs_str_indexed(vs2, __ Q, coeffs, 256 + 16, offsets3); 5544 5545 // Barrett reduction at indexes where overflow may happen 5546 5547 // load q and the multiplier for the Barrett reduction 5548 __ add(tmpAddr, kyberConsts, 16); 5549 vs_ldpq(vq, tmpAddr); 5550 5551 VSeq<8> vq1 = VSeq<8>(vq[0], 0); // 2 constant 8 sequences 5552 VSeq<8> vq2 = VSeq<8>(vq[1], 0); // for above two kyber constants 5553 VSeq<8> vq3 = VSeq<8>(v29, 0); // 3rd sequence for const montmul 5554 vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3); 5555 vs_sqdmulh(vs2, __ T8H, vs1, vq2); 5556 vs_sshr(vs2, __ T8H, vs2, 11); 5557 vs_mlsv(vs1, __ T8H, vs2, vq1); 5558 vs_str_indexed(vs1, __ Q, coeffs, 0, offsets3); 5559 vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3); 5560 vs_sqdmulh(vs2, __ T8H, vs1, vq2); 5561 vs_sshr(vs2, __ T8H, vs2, 11); 5562 vs_mlsv(vs1, __ T8H, vs2, vq1); 5563 vs_str_indexed(vs1, __ Q, coeffs, 256, offsets3); 5564 5565 // level 3 5566 // From level 3 upwards coefficients occur in discrete blocks whose size is 5567 // some multiple of 32 so can be loaded using ldpq and suitable indexes. 5568 5569 int offsets2[4] = { 0, 64, 128, 192 }; 5570 vs_ldpq_indexed(vs1, coeffs, 0, offsets2); 5571 vs_ldpq_indexed(vs2, coeffs, 32, offsets2); 5572 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5573 vs_subv(vs1, __ T8H, vs1, vs2); 5574 vs_stpq_indexed(vs3, coeffs, 0, offsets2); 5575 load64shorts(vs2, zetas); 5576 vs_ldpq(vq, kyberConsts); 5577 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5578 vs_stpq_indexed(vs2, coeffs, 32, offsets2); 5579 5580 vs_ldpq_indexed(vs1, coeffs, 256, offsets2); 5581 vs_ldpq_indexed(vs2, coeffs, 256 + 32, offsets2); 5582 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5583 vs_subv(vs1, __ T8H, vs1, vs2); 5584 vs_stpq_indexed(vs3, coeffs, 256, offsets2); 5585 load64shorts(vs2, zetas); 5586 vs_ldpq(vq, kyberConsts); 5587 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5588 vs_stpq_indexed(vs2, coeffs, 256 + 32, offsets2); 5589 5590 // level 4 5591 5592 int offsets1[4] = { 0, 32, 128, 160 }; 5593 vs_ldpq_indexed(vs1, coeffs, 0, offsets1); 5594 vs_ldpq_indexed(vs2, coeffs, 64, offsets1); 5595 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5596 vs_subv(vs1, __ T8H, vs1, vs2); 5597 vs_stpq_indexed(vs3, coeffs, 0, offsets1); 5598 load64shorts(vs2, zetas); 5599 vs_ldpq(vq, kyberConsts); 5600 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5601 vs_stpq_indexed(vs2, coeffs, 64, offsets1); 5602 5603 vs_ldpq_indexed(vs1, coeffs, 256, offsets1); 5604 vs_ldpq_indexed(vs2, coeffs, 256 + 64, offsets1); 5605 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. 
trashes vq 5606 vs_subv(vs1, __ T8H, vs1, vs2); 5607 vs_stpq_indexed(vs3, coeffs, 256, offsets1); 5608 load64shorts(vs2, zetas); 5609 vs_ldpq(vq, kyberConsts); 5610 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5611 vs_stpq_indexed(vs2, coeffs, 256 + 64, offsets1); 5612 5613 // level 5 5614 5615 __ add(tmpAddr, coeffs, 0); 5616 load64shorts(vs1, tmpAddr); 5617 __ add(tmpAddr, coeffs, 128); 5618 load64shorts(vs2, tmpAddr); 5619 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5620 vs_subv(vs1, __ T8H, vs1, vs2); 5621 __ add(tmpAddr, coeffs, 0); 5622 store64shorts(vs3, tmpAddr); 5623 load64shorts(vs2, zetas); 5624 vs_ldpq(vq, kyberConsts); 5625 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5626 __ add(tmpAddr, coeffs, 128); 5627 store64shorts(vs2, tmpAddr); 5628 5629 load64shorts(vs1, tmpAddr); 5630 __ add(tmpAddr, coeffs, 384); 5631 load64shorts(vs2, tmpAddr); 5632 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5633 vs_subv(vs1, __ T8H, vs1, vs2); 5634 __ add(tmpAddr, coeffs, 256); 5635 store64shorts(vs3, tmpAddr); 5636 load64shorts(vs2, zetas); 5637 vs_ldpq(vq, kyberConsts); 5638 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5639 __ add(tmpAddr, coeffs, 384); 5640 store64shorts(vs2, tmpAddr); 5641 5642 // Barrett reduction at indexes where overflow may happen 5643 5644 // load q and the multiplier for the Barrett reduction 5645 __ add(tmpAddr, kyberConsts, 16); 5646 vs_ldpq(vq, tmpAddr); 5647 5648 int offsets0[2] = { 0, 256 }; 5649 vs_ldpq_indexed(vs_front(vs1), coeffs, 0, offsets0); 5650 vs_sqdmulh(vs2, __ T8H, vs1, vq2); 5651 vs_sshr(vs2, __ T8H, vs2, 11); 5652 vs_mlsv(vs1, __ T8H, vs2, vq1); 5653 vs_stpq_indexed(vs_front(vs1), coeffs, 0, offsets0); 5654 5655 // level 6 5656 5657 __ add(tmpAddr, coeffs, 0); 5658 load64shorts(vs1, tmpAddr); 5659 __ add(tmpAddr, coeffs, 256); 5660 load64shorts(vs2, tmpAddr); 5661 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5662 vs_subv(vs1, __ T8H, vs1, vs2); 5663 __ add(tmpAddr, coeffs, 0); 5664 store64shorts(vs3, tmpAddr); 5665 load64shorts(vs2, zetas); 5666 vs_ldpq(vq, kyberConsts); 5667 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5668 __ add(tmpAddr, coeffs, 256); 5669 store64shorts(vs2, tmpAddr); 5670 5671 __ add(tmpAddr, coeffs, 128); 5672 load64shorts(vs1, tmpAddr); 5673 __ add(tmpAddr, coeffs, 384); 5674 load64shorts(vs2, tmpAddr); 5675 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. 
trashes vq 5676 vs_subv(vs1, __ T8H, vs1, vs2); 5677 __ add(tmpAddr, coeffs, 128); 5678 store64shorts(vs3, tmpAddr); 5679 load64shorts(vs2, zetas); 5680 vs_ldpq(vq, kyberConsts); 5681 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5682 __ add(tmpAddr, coeffs, 384); 5683 store64shorts(vs2, tmpAddr); 5684 5685 // multiply by 2^-n 5686 5687 // load toMont(2^-n mod q) 5688 __ add(tmpAddr, kyberConsts, 48); 5689 __ ldr(v29, __ Q, tmpAddr); 5690 5691 vs_ldpq(vq, kyberConsts); 5692 __ add(tmpAddr, coeffs, 0); 5693 load64shorts(vs1, tmpAddr); 5694 kyber_montmul64(vs2, vs1, vq3, vtmp, vq); 5695 __ add(tmpAddr, coeffs, 0); 5696 store64shorts(vs2, tmpAddr); 5697 5698 // now tmpAddr contains coeffs + 128 because store64shorts adjusted it so 5699 load64shorts(vs1, tmpAddr); 5700 kyber_montmul64(vs2, vs1, vq3, vtmp, vq); 5701 __ add(tmpAddr, coeffs, 128); 5702 store64shorts(vs2, tmpAddr); 5703 5704 // now tmpAddr contains coeffs + 256 5705 load64shorts(vs1, tmpAddr); 5706 kyber_montmul64(vs2, vs1, vq3, vtmp, vq); 5707 __ add(tmpAddr, coeffs, 256); 5708 store64shorts(vs2, tmpAddr); 5709 5710 // now tmpAddr contains coeffs + 384 5711 load64shorts(vs1, tmpAddr); 5712 kyber_montmul64(vs2, vs1, vq3, vtmp, vq); 5713 __ add(tmpAddr, coeffs, 384); 5714 store64shorts(vs2, tmpAddr); 5715 5716 __ leave(); // required for proper stackwalking of RuntimeStub frame 5717 __ mov(r0, zr); // return 0 5718 __ ret(lr); 5719 5720 return start; 5721 } 5722 5723 // Kyber multiply polynomials in the NTT domain. 5724 // Implements 5725 // static int implKyberNttMult( 5726 // short[] result, short[] ntta, short[] nttb, short[] zetas) {} 5727 // 5728 // result (short[256]) = c_rarg0 5729 // ntta (short[256]) = c_rarg1 5730 // nttb (short[256]) = c_rarg2 5731 // zetas (short[128]) = c_rarg3 5732 address generate_kyberNttMult() { 5733 5734 __ align(CodeEntryAlignment); 5735 StubId stub_id = StubId::stubgen_kyberNttMult_id; 5736 StubCodeMark mark(this, stub_id); 5737 address start = __ pc(); 5738 __ enter(); 5739 5740 const Register result = c_rarg0; 5741 const Register ntta = c_rarg1; 5742 const Register nttb = c_rarg2; 5743 const Register zetas = c_rarg3; 5744 5745 const Register kyberConsts = r10; 5746 const Register limit = r11; 5747 5748 VSeq<4> vs1(0), vs2(4); // 4 sets of 8x8H inputs/outputs/tmps 5749 VSeq<4> vs3(16), vs4(20); 5750 VSeq<2> vq(30); // pair of constants for montmul: q, qinv 5751 VSeq<2> vz(28); // pair of zetas 5752 VSeq<4> vc(27, 0); // constant sequence for montmul: montRSquareModQ 5753 5754 __ lea(kyberConsts, 5755 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts)); 5756 5757 Label kyberNttMult_loop; 5758 5759 __ add(limit, result, 512); 5760 5761 // load q and qinv 5762 vs_ldpq(vq, kyberConsts); 5763 5764 // load R^2 mod q (to convert back from Montgomery representation) 5765 __ add(kyberConsts, kyberConsts, 64); 5766 __ ldr(v27, __ Q, kyberConsts); 5767 5768 __ BIND(kyberNttMult_loop); 5769 5770 // load 16 zetas 5771 vs_ldpq_post(vz, zetas); 5772 5773 // load 2 sets of 32 coefficients from the two input arrays 5774 // interleaved as shorts. i.e. pairs of shorts adjacent in memory 5775 // are striped across pairs of vector registers 5776 vs_ld2_post(vs_front(vs1), __ T8H, ntta); // <a0, a1> x 8H 5777 vs_ld2_post(vs_back(vs1), __ T8H, nttb); // <b0, b1> x 8H 5778 vs_ld2_post(vs_front(vs4), __ T8H, ntta); // <a2, a3> x 8H 5779 vs_ld2_post(vs_back(vs4), __ T8H, nttb); // <b2, b3> x 8H 5780 5781 // compute 4 montmul cross-products for pairs (a0,a1) and (b0,b1) 5782 // i.e. 
montmul the first and second halves of vs1 in order and 5783 // then with one sequence reversed storing the two results in vs3 5784 // 5785 // vs3[0] <- montmul(a0, b0) 5786 // vs3[1] <- montmul(a1, b1) 5787 // vs3[2] <- montmul(a0, b1) 5788 // vs3[3] <- montmul(a1, b0) 5789 kyber_montmul16(vs_front(vs3), vs_front(vs1), vs_back(vs1), vs_front(vs2), vq); 5790 kyber_montmul16(vs_back(vs3), 5791 vs_front(vs1), vs_reverse(vs_back(vs1)), vs_back(vs2), vq); 5792 5793 // compute 4 montmul cross-products for pairs (a2,a3) and (b2,b3) 5794 // i.e. montmul the first and second halves of vs4 in order and 5795 // then with one sequence reversed storing the two results in vs1 5796 // 5797 // vs1[0] <- montmul(a2, b2) 5798 // vs1[1] <- montmul(a3, b3) 5799 // vs1[2] <- montmul(a2, b3) 5800 // vs1[3] <- montmul(a3, b2) 5801 kyber_montmul16(vs_front(vs1), vs_front(vs4), vs_back(vs4), vs_front(vs2), vq); 5802 kyber_montmul16(vs_back(vs1), 5803 vs_front(vs4), vs_reverse(vs_back(vs4)), vs_back(vs2), vq); 5804 5805 // montmul result 2 of each cross-product i.e. (a1*b1, a3*b3) by a zeta. 5806 // We can schedule two montmuls at a time if we use a suitable vector 5807 // sequence <vs3[1], vs1[1]>. 5808 int delta = vs1[1]->encoding() - vs3[1]->encoding(); 5809 VSeq<2> vs5(vs3[1], delta); 5810 5811 // vs3[1] <- montmul(montmul(a1, b1), z0) 5812 // vs1[1] <- montmul(montmul(a3, b3), z1) 5813 kyber_montmul16(vs5, vz, vs5, vs_front(vs2), vq); 5814 5815 // add results in pairs storing in vs3 5816 // vs3[0] <- montmul(a0, b0) + montmul(montmul(a1, b1), z0); 5817 // vs3[1] <- montmul(a0, b1) + montmul(a1, b0); 5818 vs_addv(vs_front(vs3), __ T8H, vs_even(vs3), vs_odd(vs3)); 5819 5820 // vs3[2] <- montmul(a2, b2) + montmul(montmul(a3, b3), z1); 5821 // vs3[3] <- montmul(a2, b3) + montmul(a3, b2); 5822 vs_addv(vs_back(vs3), __ T8H, vs_even(vs1), vs_odd(vs1)); 5823 5824 // vs1 <- montmul(vs3, montRSquareModQ) 5825 kyber_montmul32(vs1, vs3, vc, vs2, vq); 5826 5827 // store back the two pairs of result vectors de-interleaved as 8H elements 5828 // i.e. storing each pairs of shorts striped across a register pair adjacent 5829 // in memory 5830 vs_st2_post(vs1, __ T8H, result); 5831 5832 __ cmp(result, limit); 5833 __ br(Assembler::NE, kyberNttMult_loop); 5834 5835 __ leave(); // required for proper stackwalking of RuntimeStub frame 5836 __ mov(r0, zr); // return 0 5837 __ ret(lr); 5838 5839 return start; 5840 } 5841 5842 // Kyber add 2 polynomials. 5843 // Implements 5844 // static int implKyberAddPoly(short[] result, short[] a, short[] b) {} 5845 // 5846 // result (short[256]) = c_rarg0 5847 // a (short[256]) = c_rarg1 5848 // b (short[256]) = c_rarg2 5849 address generate_kyberAddPoly_2() { 5850 5851 __ align(CodeEntryAlignment); 5852 StubId stub_id = StubId::stubgen_kyberAddPoly_2_id; 5853 StubCodeMark mark(this, stub_id); 5854 address start = __ pc(); 5855 __ enter(); 5856 5857 const Register result = c_rarg0; 5858 const Register a = c_rarg1; 5859 const Register b = c_rarg2; 5860 5861 const Register kyberConsts = r11; 5862 5863 // We sum 256 sets of values in total i.e. 32 x 8H quadwords. 5864 // So, we can load, add and store the data in 3 groups of 11, 5865 // 11 and 10 at a time i.e. we need to map sets of 10 or 11 5866 // registers. A further constraint is that the mapping needs 5867 // to skip callee saves. So, we allocate the register 5868 // sequences using two 8 sequences, two 2 sequences and two 5869 // single registers. 
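// Functionally, each pass of the loop below computes, for a batch of the
// 256 coefficients, the scalar result sketched here (illustrative only;
// kyber_q denotes the constant quadword loaded from kyberConsts + 16):
//
//   for (int i = 0; i < 256; i++) {
//     result[i] = (short)(a[i] + b[i] + kyber_q);
//   }
//
// The work is done 8 lanes at a time in batches of 11, 11 and 10
// quadwords, i.e. 88 + 88 + 80 = 256 shorts over the three iterations.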
5870 VSeq<8> vs1_1(0); 5871 VSeq<2> vs1_2(16); 5872 FloatRegister vs1_3 = v28; 5873 VSeq<8> vs2_1(18); 5874 VSeq<2> vs2_2(26); 5875 FloatRegister vs2_3 = v29; 5876 5877 // two constant vector sequences 5878 VSeq<8> vc_1(31, 0); 5879 VSeq<2> vc_2(31, 0); 5880 5881 FloatRegister vc_3 = v31; 5882 __ lea(kyberConsts, 5883 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts)); 5884 5885 __ ldr(vc_3, __ Q, Address(kyberConsts, 16)); // q 5886 for (int i = 0; i < 3; i++) { 5887 // load 80 or 88 values from a into vs1_1/2/3 5888 vs_ldpq_post(vs1_1, a); 5889 vs_ldpq_post(vs1_2, a); 5890 if (i < 2) { 5891 __ ldr(vs1_3, __ Q, __ post(a, 16)); 5892 } 5893 // load 80 or 88 values from b into vs2_1/2/3 5894 vs_ldpq_post(vs2_1, b); 5895 vs_ldpq_post(vs2_2, b); 5896 if (i < 2) { 5897 __ ldr(vs2_3, __ Q, __ post(b, 16)); 5898 } 5899 // sum 80 or 88 values across vs1 and vs2 into vs1 5900 vs_addv(vs1_1, __ T8H, vs1_1, vs2_1); 5901 vs_addv(vs1_2, __ T8H, vs1_2, vs2_2); 5902 if (i < 2) { 5903 __ addv(vs1_3, __ T8H, vs1_3, vs2_3); 5904 } 5905 // add constant to all 80 or 88 results 5906 vs_addv(vs1_1, __ T8H, vs1_1, vc_1); 5907 vs_addv(vs1_2, __ T8H, vs1_2, vc_2); 5908 if (i < 2) { 5909 __ addv(vs1_3, __ T8H, vs1_3, vc_3); 5910 } 5911 // store 80 or 88 values 5912 vs_stpq_post(vs1_1, result); 5913 vs_stpq_post(vs1_2, result); 5914 if (i < 2) { 5915 __ str(vs1_3, __ Q, __ post(result, 16)); 5916 } 5917 } 5918 5919 __ leave(); // required for proper stackwalking of RuntimeStub frame 5920 __ mov(r0, zr); // return 0 5921 __ ret(lr); 5922 5923 return start; 5924 } 5925 5926 // Kyber add 3 polynomials. 5927 // Implements 5928 // static int implKyberAddPoly(short[] result, short[] a, short[] b, short[] c) {} 5929 // 5930 // result (short[256]) = c_rarg0 5931 // a (short[256]) = c_rarg1 5932 // b (short[256]) = c_rarg2 5933 // c (short[256]) = c_rarg3 5934 address generate_kyberAddPoly_3() { 5935 5936 __ align(CodeEntryAlignment); 5937 StubId stub_id = StubId::stubgen_kyberAddPoly_3_id; 5938 StubCodeMark mark(this, stub_id); 5939 address start = __ pc(); 5940 __ enter(); 5941 5942 const Register result = c_rarg0; 5943 const Register a = c_rarg1; 5944 const Register b = c_rarg2; 5945 const Register c = c_rarg3; 5946 5947 const Register kyberConsts = r11; 5948 5949 // As above we sum 256 sets of values in total i.e. 32 x 8H 5950 // quadwords. So, we can load, add and store the data in 3 5951 // groups of 11, 11 and 10 at a time i.e. we need to map sets 5952 // of 10 or 11 registers. A further constraint is that the 5953 // mapping needs to skip callee saves. So, we allocate the 5954 // register sequences using two 8 sequences, two 2 sequences 5955 // and two single registers. 
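// As with the 2-argument variant above, the scalar effect per coefficient
// is roughly the following (an illustrative sketch; kyber_q is the
// constant loaded from kyberConsts + 16 and is added once per result):
//
//   for (int i = 0; i < 256; i++) {
//     result[i] = (short)(a[i] + b[i] + c[i] + kyber_q);
//   }
//
// again processed in batches of 88, 88 and 80 shorts.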
5956 VSeq<8> vs1_1(0); 5957 VSeq<2> vs1_2(16); 5958 FloatRegister vs1_3 = v28; 5959 VSeq<8> vs2_1(18); 5960 VSeq<2> vs2_2(26); 5961 FloatRegister vs2_3 = v29; 5962 5963 // two constant vector sequences 5964 VSeq<8> vc_1(31, 0); 5965 VSeq<2> vc_2(31, 0); 5966 5967 FloatRegister vc_3 = v31; 5968 5969 __ lea(kyberConsts, 5970 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts)); 5971 5972 __ ldr(vc_3, __ Q, Address(kyberConsts, 16)); // q 5973 for (int i = 0; i < 3; i++) { 5974 // load 80 or 88 values from a into vs1_1/2/3 5975 vs_ldpq_post(vs1_1, a); 5976 vs_ldpq_post(vs1_2, a); 5977 if (i < 2) { 5978 __ ldr(vs1_3, __ Q, __ post(a, 16)); 5979 } 5980 // load 80 or 88 values from b into vs2_1/2/3 5981 vs_ldpq_post(vs2_1, b); 5982 vs_ldpq_post(vs2_2, b); 5983 if (i < 2) { 5984 __ ldr(vs2_3, __ Q, __ post(b, 16)); 5985 } 5986 // sum 80 or 88 values across vs1 and vs2 into vs1 5987 vs_addv(vs1_1, __ T8H, vs1_1, vs2_1); 5988 vs_addv(vs1_2, __ T8H, vs1_2, vs2_2); 5989 if (i < 2) { 5990 __ addv(vs1_3, __ T8H, vs1_3, vs2_3); 5991 } 5992 // load 80 or 88 values from c into vs2_1/2/3 5993 vs_ldpq_post(vs2_1, c); 5994 vs_ldpq_post(vs2_2, c); 5995 if (i < 2) { 5996 __ ldr(vs2_3, __ Q, __ post(c, 16)); 5997 } 5998 // sum 80 or 88 values across vs1 and vs2 into vs1 5999 vs_addv(vs1_1, __ T8H, vs1_1, vs2_1); 6000 vs_addv(vs1_2, __ T8H, vs1_2, vs2_2); 6001 if (i < 2) { 6002 __ addv(vs1_3, __ T8H, vs1_3, vs2_3); 6003 } 6004 // add constant to all 80 or 88 results 6005 vs_addv(vs1_1, __ T8H, vs1_1, vc_1); 6006 vs_addv(vs1_2, __ T8H, vs1_2, vc_2); 6007 if (i < 2) { 6008 __ addv(vs1_3, __ T8H, vs1_3, vc_3); 6009 } 6010 // store 80 or 88 values 6011 vs_stpq_post(vs1_1, result); 6012 vs_stpq_post(vs1_2, result); 6013 if (i < 2) { 6014 __ str(vs1_3, __ Q, __ post(result, 16)); 6015 } 6016 } 6017 6018 __ leave(); // required for proper stackwalking of RuntimeStub frame 6019 __ mov(r0, zr); // return 0 6020 __ ret(lr); 6021 6022 return start; 6023 } 6024 6025 // Kyber parse XOF output to polynomial coefficient candidates 6026 // or decodePoly(12, ...). 6027 // Implements 6028 // static int implKyber12To16( 6029 // byte[] condensed, int index, short[] parsed, int parsedLength) {} 6030 // 6031 // (parsedLength or (parsedLength - 48) must be divisible by 64.) 6032 // 6033 // condensed (byte[]) = c_rarg0 6034 // condensedIndex = c_rarg1 6035 // parsed (short[112 or 256]) = c_rarg2 6036 // parsedLength (112 or 256) = c_rarg3 6037 address generate_kyber12To16() { 6038 Label L_F00, L_loop, L_end; 6039 6040 __ BIND(L_F00); 6041 __ emit_int64(0x0f000f000f000f00); 6042 __ emit_int64(0x0f000f000f000f00); 6043 6044 __ align(CodeEntryAlignment); 6045 StubId stub_id = StubId::stubgen_kyber12To16_id; 6046 StubCodeMark mark(this, stub_id); 6047 address start = __ pc(); 6048 __ enter(); 6049 6050 const Register condensed = c_rarg0; 6051 const Register condensedOffs = c_rarg1; 6052 const Register parsed = c_rarg2; 6053 const Register parsedLength = c_rarg3; 6054 6055 const Register tmpAddr = r11; 6056 6057 // Data is input 96 bytes at a time i.e. in groups of 6 x 16B 6058 // quadwords so we need a 6 vector sequence for the inputs. 6059 // Parsing produces 64 shorts, employing two 8 vector 6060 // sequences to store and combine the intermediate data. 
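// For reference, the scalar transformation implemented below: each group
// of 3 input bytes b0, b1, b2 encodes two 12-bit values that are widened
// to two 16-bit outputs. An illustrative sketch, consistent with the
// masking and shifting described in the comments further down:
//
//   for (int j = 0; j < numPairs; j++) {
//     uint8_t b0 = in[3*j], b1 = in[3*j + 1], b2 = in[3*j + 2];
//     out[2*j]     = (short)( b0       | ((b1 & 0x0f) << 8));  // low 8 bits + high 4 bits
//     out[2*j + 1] = (short)((b1 >> 4) | ( b2         << 4));  // low 4 bits + high 8 bits
//   }
//
// The ld3 loads stripe b0, b1 and b2 into separate vector registers so
// that these masks and shifts can be applied lane-wise, 16 pairs at a time.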
6061 VSeq<6> vin(24); 6062 VSeq<8> va(0), vb(16); 6063 6064 __ adr(tmpAddr, L_F00); 6065 __ ldr(v31, __ Q, tmpAddr); // 8H times 0x0f00 6066 __ add(condensed, condensed, condensedOffs); 6067 6068 __ BIND(L_loop); 6069 // load 96 (6 x 16B) byte values 6070 vs_ld3_post(vin, __ T16B, condensed); 6071 6072 // The front half of sequence vin (vin[0], vin[1] and vin[2]) 6073 // holds 48 (16x3) contiguous bytes from memory striped 6074 // horizontally across each of the 16 byte lanes. Equivalently, 6075 // that is 16 pairs of 12-bit integers. Likewise the back half 6076 // holds the next 48 bytes in the same arrangement. 6077 6078 // Each vector in the front half can also be viewed as a vertical 6079 // strip across the 16 pairs of 12 bit integers. Each byte in 6080 // vin[0] stores the low 8 bits of the first int in a pair. Each 6081 // byte in vin[1] stores the high 4 bits of the first int and the 6082 // low 4 bits of the second int. Each byte in vin[2] stores the 6083 // high 8 bits of the second int. Likewise the vectors in second 6084 // half. 6085 6086 // Converting the data to 16-bit shorts requires first of all 6087 // expanding each of the 6 x 16B vectors into 6 corresponding 6088 // pairs of 8H vectors. Mask, shift and add operations on the 6089 // resulting vector pairs can be used to combine 4 and 8 bit 6090 // parts of related 8H vector elements. 6091 // 6092 // The middle vectors (vin[2] and vin[5]) are actually expanded 6093 // twice, one copy manipulated to provide the lower 4 bits 6094 // belonging to the first short in a pair and another copy 6095 // manipulated to provide the higher 4 bits belonging to the 6096 // second short in a pair. This is why the the vector sequences va 6097 // and vb used to hold the expanded 8H elements are of length 8. 6098 6099 // Expand vin[0] into va[0:1], and vin[1] into va[2:3] and va[4:5] 6100 // n.b. target elements 2 and 3 duplicate elements 4 and 5 6101 __ ushll(va[0], __ T8H, vin[0], __ T8B, 0); 6102 __ ushll2(va[1], __ T8H, vin[0], __ T16B, 0); 6103 __ ushll(va[2], __ T8H, vin[1], __ T8B, 0); 6104 __ ushll2(va[3], __ T8H, vin[1], __ T16B, 0); 6105 __ ushll(va[4], __ T8H, vin[1], __ T8B, 0); 6106 __ ushll2(va[5], __ T8H, vin[1], __ T16B, 0); 6107 6108 // likewise expand vin[3] into vb[0:1], and vin[4] into vb[2:3] 6109 // and vb[4:5] 6110 __ ushll(vb[0], __ T8H, vin[3], __ T8B, 0); 6111 __ ushll2(vb[1], __ T8H, vin[3], __ T16B, 0); 6112 __ ushll(vb[2], __ T8H, vin[4], __ T8B, 0); 6113 __ ushll2(vb[3], __ T8H, vin[4], __ T16B, 0); 6114 __ ushll(vb[4], __ T8H, vin[4], __ T8B, 0); 6115 __ ushll2(vb[5], __ T8H, vin[4], __ T16B, 0); 6116 6117 // shift lo byte of copy 1 of the middle stripe into the high byte 6118 __ shl(va[2], __ T8H, va[2], 8); 6119 __ shl(va[3], __ T8H, va[3], 8); 6120 __ shl(vb[2], __ T8H, vb[2], 8); 6121 __ shl(vb[3], __ T8H, vb[3], 8); 6122 6123 // expand vin[2] into va[6:7] and vin[5] into vb[6:7] but this 6124 // time pre-shifted by 4 to ensure top bits of input 12-bit int 6125 // are in bit positions [4..11]. 
6126 __ ushll(va[6], __ T8H, vin[2], __ T8B, 4); 6127 __ ushll2(va[7], __ T8H, vin[2], __ T16B, 4); 6128 __ ushll(vb[6], __ T8H, vin[5], __ T8B, 4); 6129 __ ushll2(vb[7], __ T8H, vin[5], __ T16B, 4); 6130 6131 // mask hi 4 bits of the 1st 12-bit int in a pair from copy1 and 6132 // shift lo 4 bits of the 2nd 12-bit int in a pair to the bottom of 6133 // copy2 6134 __ andr(va[2], __ T16B, va[2], v31); 6135 __ andr(va[3], __ T16B, va[3], v31); 6136 __ ushr(va[4], __ T8H, va[4], 4); 6137 __ ushr(va[5], __ T8H, va[5], 4); 6138 __ andr(vb[2], __ T16B, vb[2], v31); 6139 __ andr(vb[3], __ T16B, vb[3], v31); 6140 __ ushr(vb[4], __ T8H, vb[4], 4); 6141 __ ushr(vb[5], __ T8H, vb[5], 4); 6142 6143 // sum hi 4 bits and lo 8 bits of the 1st 12-bit int in each pair and 6144 // hi 8 bits plus lo 4 bits of the 2nd 12-bit int in each pair 6145 // n.b. the ordering ensures: i) inputs are consumed before they 6146 // are overwritten ii) the order of 16-bit results across successive 6147 // pairs of vectors in va and then vb reflects the order of the 6148 // corresponding 12-bit inputs 6149 __ addv(va[0], __ T8H, va[0], va[2]); 6150 __ addv(va[2], __ T8H, va[1], va[3]); 6151 __ addv(va[1], __ T8H, va[4], va[6]); 6152 __ addv(va[3], __ T8H, va[5], va[7]); 6153 __ addv(vb[0], __ T8H, vb[0], vb[2]); 6154 __ addv(vb[2], __ T8H, vb[1], vb[3]); 6155 __ addv(vb[1], __ T8H, vb[4], vb[6]); 6156 __ addv(vb[3], __ T8H, vb[5], vb[7]); 6157 6158 // store 64 results interleaved as shorts 6159 vs_st2_post(vs_front(va), __ T8H, parsed); 6160 vs_st2_post(vs_front(vb), __ T8H, parsed); 6161 6162 __ sub(parsedLength, parsedLength, 64); 6163 __ cmp(parsedLength, (u1)64); 6164 __ br(Assembler::GE, L_loop); 6165 __ cbz(parsedLength, L_end); 6166 6167 // if anything is left it should be a final 72 bytes of input 6168 // i.e. a final 48 12-bit values. so we handle this by loading 6169 // 48 bytes into all 16B lanes of front(vin) and only 24 6170 // bytes into the lower 8B lane of back(vin) 6171 vs_ld3_post(vs_front(vin), __ T16B, condensed); 6172 vs_ld3(vs_back(vin), __ T8B, condensed); 6173 6174 // Expand vin[0] into va[0:1], and vin[1] into va[2:3] and va[4:5] 6175 // n.b. target elements 2 and 3 of va duplicate elements 4 and 6176 // 5 and target element 2 of vb duplicates element 4. 6177 __ ushll(va[0], __ T8H, vin[0], __ T8B, 0); 6178 __ ushll2(va[1], __ T8H, vin[0], __ T16B, 0); 6179 __ ushll(va[2], __ T8H, vin[1], __ T8B, 0); 6180 __ ushll2(va[3], __ T8H, vin[1], __ T16B, 0); 6181 __ ushll(va[4], __ T8H, vin[1], __ T8B, 0); 6182 __ ushll2(va[5], __ T8H, vin[1], __ T16B, 0); 6183 6184 // This time expand just the lower 8 lanes 6185 __ ushll(vb[0], __ T8H, vin[3], __ T8B, 0); 6186 __ ushll(vb[2], __ T8H, vin[4], __ T8B, 0); 6187 __ ushll(vb[4], __ T8H, vin[4], __ T8B, 0); 6188 6189 // shift lo byte of copy 1 of the middle stripe into the high byte 6190 __ shl(va[2], __ T8H, va[2], 8); 6191 __ shl(va[3], __ T8H, va[3], 8); 6192 __ shl(vb[2], __ T8H, vb[2], 8); 6193 6194 // expand vin[2] into va[6:7] and lower 8 lanes of vin[5] into 6195 // vb[6] pre-shifted by 4 to ensure top bits of the input 12-bit 6196 // int are in bit positions [4..11]. 
6197 __ ushll(va[6], __ T8H, vin[2], __ T8B, 4); 6198 __ ushll2(va[7], __ T8H, vin[2], __ T16B, 4); 6199 __ ushll(vb[6], __ T8H, vin[5], __ T8B, 4); 6200 6201 // mask hi 4 bits of each 1st 12-bit int in pair from copy1 and 6202 // shift lo 4 bits of each 2nd 12-bit int in pair to bottom of 6203 // copy2 6204 __ andr(va[2], __ T16B, va[2], v31); 6205 __ andr(va[3], __ T16B, va[3], v31); 6206 __ ushr(va[4], __ T8H, va[4], 4); 6207 __ ushr(va[5], __ T8H, va[5], 4); 6208 __ andr(vb[2], __ T16B, vb[2], v31); 6209 __ ushr(vb[4], __ T8H, vb[4], 4); 6210 6211 6212 6213 // sum hi 4 bits and lo 8 bits of each 1st 12-bit int in pair and 6214 // hi 8 bits plus lo 4 bits of each 2nd 12-bit int in pair 6215 6216 // n.b. ordering ensures: i) inputs are consumed before they are 6217 // overwritten ii) order of 16-bit results across succsessive 6218 // pairs of vectors in va and then lower half of vb reflects order 6219 // of corresponding 12-bit inputs 6220 __ addv(va[0], __ T8H, va[0], va[2]); 6221 __ addv(va[2], __ T8H, va[1], va[3]); 6222 __ addv(va[1], __ T8H, va[4], va[6]); 6223 __ addv(va[3], __ T8H, va[5], va[7]); 6224 __ addv(vb[0], __ T8H, vb[0], vb[2]); 6225 __ addv(vb[1], __ T8H, vb[4], vb[6]); 6226 6227 // store 48 results interleaved as shorts 6228 vs_st2_post(vs_front(va), __ T8H, parsed); 6229 vs_st2_post(vs_front(vs_front(vb)), __ T8H, parsed); 6230 6231 __ BIND(L_end); 6232 6233 __ leave(); // required for proper stackwalking of RuntimeStub frame 6234 __ mov(r0, zr); // return 0 6235 __ ret(lr); 6236 6237 return start; 6238 } 6239 6240 // Kyber Barrett reduce function. 6241 // Implements 6242 // static int implKyberBarrettReduce(short[] coeffs) {} 6243 // 6244 // coeffs (short[256]) = c_rarg0 6245 address generate_kyberBarrettReduce() { 6246 6247 __ align(CodeEntryAlignment); 6248 StubId stub_id = StubId::stubgen_kyberBarrettReduce_id; 6249 StubCodeMark mark(this, stub_id); 6250 address start = __ pc(); 6251 __ enter(); 6252 6253 const Register coeffs = c_rarg0; 6254 6255 const Register kyberConsts = r10; 6256 const Register result = r11; 6257 6258 // As above we process 256 sets of values in total i.e. 32 x 6259 // 8H quadwords. So, we can load, add and store the data in 3 6260 // groups of 11, 11 and 10 at a time i.e. we need to map sets 6261 // of 10 or 11 registers. A further constraint is that the 6262 // mapping needs to skip callee saves. So, we allocate the 6263 // register sequences using two 8 sequences, two 2 sequences 6264 // and two single registers. 
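// For reference, the Barrett reduction applied below is, per 16-bit lane,
// roughly the following (an illustrative sketch; kyber_q and barrett_mul
// are the two constant quadwords loaded from kyberConsts + 16):
//
//   for (int i = 0; i < 256; i++) {
//     int t = ((int)coeffs[i] * barrett_mul) >> 26;   // sqdmulh (= 2*a*b >> 16) then sshr #11
//     coeffs[i] = (short)(coeffs[i] - t * kyber_q);   // mlsv subtracts t * q
//   }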
6265 VSeq<8> vs1_1(0); 6266 VSeq<2> vs1_2(16); 6267 FloatRegister vs1_3 = v28; 6268 VSeq<8> vs2_1(18); 6269 VSeq<2> vs2_2(26); 6270 FloatRegister vs2_3 = v29; 6271 6272 // we also need a pair of corresponding constant sequences 6273 6274 VSeq<8> vc1_1(30, 0); 6275 VSeq<2> vc1_2(30, 0); 6276 FloatRegister vc1_3 = v30; // for kyber_q 6277 6278 VSeq<8> vc2_1(31, 0); 6279 VSeq<2> vc2_2(31, 0); 6280 FloatRegister vc2_3 = v31; // for kyberBarrettMultiplier 6281 6282 __ add(result, coeffs, 0); 6283 __ lea(kyberConsts, 6284 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts)); 6285 6286 // load q and the multiplier for the Barrett reduction 6287 __ add(kyberConsts, kyberConsts, 16); 6288 __ ldpq(vc1_3, vc2_3, kyberConsts); 6289 6290 for (int i = 0; i < 3; i++) { 6291 // load 80 or 88 coefficients 6292 vs_ldpq_post(vs1_1, coeffs); 6293 vs_ldpq_post(vs1_2, coeffs); 6294 if (i < 2) { 6295 __ ldr(vs1_3, __ Q, __ post(coeffs, 16)); 6296 } 6297 6298 // vs2 <- (2 * vs1 * kyberBarrettMultiplier) >> 16 6299 vs_sqdmulh(vs2_1, __ T8H, vs1_1, vc2_1); 6300 vs_sqdmulh(vs2_2, __ T8H, vs1_2, vc2_2); 6301 if (i < 2) { 6302 __ sqdmulh(vs2_3, __ T8H, vs1_3, vc2_3); 6303 } 6304 6305 // vs2 <- (vs1 * kyberBarrettMultiplier) >> 26 6306 vs_sshr(vs2_1, __ T8H, vs2_1, 11); 6307 vs_sshr(vs2_2, __ T8H, vs2_2, 11); 6308 if (i < 2) { 6309 __ sshr(vs2_3, __ T8H, vs2_3, 11); 6310 } 6311 6312 // vs1 <- vs1 - vs2 * kyber_q 6313 vs_mlsv(vs1_1, __ T8H, vs2_1, vc1_1); 6314 vs_mlsv(vs1_2, __ T8H, vs2_2, vc1_2); 6315 if (i < 2) { 6316 __ mlsv(vs1_3, __ T8H, vs2_3, vc1_3); 6317 } 6318 6319 vs_stpq_post(vs1_1, result); 6320 vs_stpq_post(vs1_2, result); 6321 if (i < 2) { 6322 __ str(vs1_3, __ Q, __ post(result, 16)); 6323 } 6324 } 6325 6326 __ leave(); // required for proper stackwalking of RuntimeStub frame 6327 __ mov(r0, zr); // return 0 6328 __ ret(lr); 6329 6330 return start; 6331 } 6332 6333 6334 // Dilithium-specific montmul helper routines that generate parallel 6335 // code for, respectively, a single 4x4s vector sequence montmul or 6336 // two such multiplies in a row. 6337 6338 // Perform 16 32-bit Montgomery multiplications in parallel 6339 void dilithium_montmul16(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc, 6340 const VSeq<4>& vtmp, const VSeq<2>& vq) { 6341 // Use the helper routine to schedule a 4x4S Montgomery multiply. 6342 // It will assert that the register use is valid 6343 vs_montmul4(va, vb, vc, __ T4S, vtmp, vq); 6344 } 6345 6346 // Perform 2x16 32-bit Montgomery multiplications in parallel 6347 void dilithium_montmul32(const VSeq<8>& va, const VSeq<8>& vb, const VSeq<8>& vc, 6348 const VSeq<4>& vtmp, const VSeq<2>& vq) { 6349 // Schedule two successive 4x4S multiplies via the montmul helper 6350 // on the front and back halves of va, vb and vc. The helper will 6351 // assert that the register use has no overlap conflicts on each 6352 // individual call but we also need to ensure that the necessary 6353 // disjoint/equality constraints are met across both calls. 6354 6355 // vb, vc, vtmp and vq must be disjoint. 
va must either be
6356 // disjoint from all other registers or equal vc
6357 
6358 assert(vs_disjoint(vb, vc), "vb and vc overlap");
6359 assert(vs_disjoint(vb, vq), "vb and vq overlap");
6360 assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
6361 
6362 assert(vs_disjoint(vc, vq), "vc and vq overlap");
6363 assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
6364 
6365 assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
6366 
6367 assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
6368 assert(vs_disjoint(va, vb), "va and vb overlap");
6369 assert(vs_disjoint(va, vq), "va and vq overlap");
6370 assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
6371 
6372 // We multiply the front and back halves of each sequence 4 at a
6373 // time because
6374 //
6375 // 1) we are currently only able to get 4-way instruction
6376 // parallelism at best
6377 //
6378 // 2) we need registers for the constants in vq and temporary
6379 // scratch registers to hold intermediate results so vtmp can only
6380 // be a VSeq<4> which means we only have 4 scratch slots.
6381 
6382 vs_montmul4(vs_front(va), vs_front(vb), vs_front(vc), __ T4S, vtmp, vq);
6383 vs_montmul4(vs_back(va), vs_back(vb), vs_back(vc), __ T4S, vtmp, vq);
6384 }
6385 
6386 // Perform combined montmul then add/sub on 4x4S vectors.
6387 void dilithium_montmul16_sub_add(
6388 const VSeq<4>& va0, const VSeq<4>& va1, const VSeq<4>& vc,
6389 const VSeq<4>& vtmp, const VSeq<2>& vq) {
6390 // compute a = montmul(a1, c)
6391 dilithium_montmul16(vc, va1, vc, vtmp, vq);
6392 // output a1 = a0 - a
6393 vs_subv(va1, __ T4S, va0, vc);
6394 // and a0 = a0 + a
6395 vs_addv(va0, __ T4S, va0, vc);
6396 }
6397 
6398 // Perform combined add/sub then montmul on 4x4S vectors.
6399 void dilithium_sub_add_montmul16(
6400 const VSeq<4>& va0, const VSeq<4>& va1, const VSeq<4>& vb,
6401 const VSeq<4>& vtmp1, const VSeq<4>& vtmp2, const VSeq<2>& vq) {
6402 // compute c = a0 - a1
6403 vs_subv(vtmp1, __ T4S, va0, va1);
6404 // output a0 = a0 + a1
6405 vs_addv(va0, __ T4S, va0, va1);
6406 // output a1 = b montmul c
6407 dilithium_montmul16(va1, vtmp1, vb, vtmp2, vq);
6408 }
6409 
6410 // At these levels, the indices that correspond to the 'j's (and 'j+l's)
6411 // in the Java implementation come in sequences of at least 8, so we
6412 // can use ldpq to collect the corresponding data into pairs of vector
6413 // registers.
6414 // We collect the coefficients corresponding to the 'j+l' indexes into
6415 // the vector registers v0-v7, the zetas into the vector registers v16-v23,
6416 // then we do the (Montgomery) multiplications by the zetas in parallel
6417 // into v16-v23, load the coeffs corresponding to the 'j' indexes into
6418 // v0-v7, then do the additions into v24-v31 and the subtractions into
6419 // v0-v7, and finally save the results back to the coeffs array.
6420 void dilithiumNttLevel0_4(const Register dilithiumConsts,
6421 const Register coeffs, const Register zetas) {
6422 int c1 = 0;
6423 int c2 = 512;
6424 int startIncr;
6425 // don't use callee save registers v8 - v15
6426 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs
6427 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
6428 VSeq<2> vq(30); // n.b.
constants overlap vs3 6429 int offsets[4] = { 0, 32, 64, 96 }; 6430 6431 for (int level = 0; level < 5; level++) { 6432 int c1Start = c1; 6433 int c2Start = c2; 6434 if (level == 3) { 6435 offsets[1] = 32; 6436 offsets[2] = 128; 6437 offsets[3] = 160; 6438 } else if (level == 4) { 6439 offsets[1] = 64; 6440 offsets[2] = 128; 6441 offsets[3] = 192; 6442 } 6443 6444 // For levels 1 - 4 we simply load 2 x 4 adjacent values at a 6445 // time at 4 different offsets and multiply them in order by the 6446 // next set of input values. So we employ indexed load and store 6447 // pair instructions with arrangement 4S. 6448 for (int i = 0; i < 4; i++) { 6449 // reload q and qinv 6450 vs_ldpq(vq, dilithiumConsts); // qInv, q 6451 // load 8x4S coefficients via second start pos == c2 6452 vs_ldpq_indexed(vs1, coeffs, c2Start, offsets); 6453 // load next 8x4S inputs == b 6454 vs_ldpq_post(vs2, zetas); 6455 // compute a == c2 * b mod MONT_Q 6456 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq); 6457 // load 8x4s coefficients via first start pos == c1 6458 vs_ldpq_indexed(vs1, coeffs, c1Start, offsets); 6459 // compute a1 = c1 + a 6460 vs_addv(vs3, __ T4S, vs1, vs2); 6461 // compute a2 = c1 - a 6462 vs_subv(vs1, __ T4S, vs1, vs2); 6463 // output a1 and a2 6464 vs_stpq_indexed(vs3, coeffs, c1Start, offsets); 6465 vs_stpq_indexed(vs1, coeffs, c2Start, offsets); 6466 6467 int k = 4 * level + i; 6468 6469 if (k > 7) { 6470 startIncr = 256; 6471 } else if (k == 5) { 6472 startIncr = 384; 6473 } else { 6474 startIncr = 128; 6475 } 6476 6477 c1Start += startIncr; 6478 c2Start += startIncr; 6479 } 6480 6481 c2 /= 2; 6482 } 6483 } 6484 6485 // Dilithium NTT function except for the final "normalization" to |coeff| < Q. 6486 // Implements the method 6487 // static int implDilithiumAlmostNtt(int[] coeffs, int zetas[]) {} 6488 // of the Java class sun.security.provider 6489 // 6490 // coeffs (int[256]) = c_rarg0 6491 // zetas (int[256]) = c_rarg1 6492 address generate_dilithiumAlmostNtt() { 6493 6494 __ align(CodeEntryAlignment); 6495 StubId stub_id = StubId::stubgen_dilithiumAlmostNtt_id; 6496 StubCodeMark mark(this, stub_id); 6497 address start = __ pc(); 6498 __ enter(); 6499 6500 const Register coeffs = c_rarg0; 6501 const Register zetas = c_rarg1; 6502 6503 const Register tmpAddr = r9; 6504 const Register dilithiumConsts = r10; 6505 const Register result = r11; 6506 // don't use callee save registers v8 - v15 6507 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs 6508 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3 6509 VSeq<2> vq(30); // n.b. constants overlap vs3 6510 int offsets[4] = { 0, 32, 64, 96}; 6511 int offsets1[8] = { 16, 48, 80, 112, 144, 176, 208, 240 }; 6512 int offsets2[8] = { 0, 32, 64, 96, 128, 160, 192, 224 }; 6513 __ add(result, coeffs, 0); 6514 __ lea(dilithiumConsts, 6515 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts)); 6516 6517 // Each level represents one iteration of the outer for loop of the Java version. 6518 6519 // level 0-4 6520 dilithiumNttLevel0_4(dilithiumConsts, coeffs, zetas); 6521 6522 // level 5 6523 6524 // At level 5 the coefficients we need to combine with the zetas 6525 // are grouped in memory in blocks of size 4. So, for both sets of 6526 // coefficients we load 4 adjacent values at 8 different offsets 6527 // using an indexed ldr with register variant Q and multiply them 6528 // in sequence order by the next set of inputs. Likewise we store 6529 // the resuls using an indexed str with register variant Q. 
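// For reference (not generated code): a scalar sketch of the forward
// butterfly that levels 0-5 vectorize, following the shape of the Java
// implDilithiumAlmostNtt loop body. The names below are illustrative and
// montMul stands for the Montgomery multiplication used by the Java code:
//
//   int t = montMul(zeta, coeffs[j + l]);
//   coeffs[j + l] = coeffs[j] - t;
//   coeffs[j] = coeffs[j] + t;
//
// In the loop below 32 such butterflies are performed per iteration: the
// Montgomery products land in vs2, the additions go to vs3 (written back
// at the second set of offsets) and the subtractions go back into vs1
// (written back at the first set of offsets).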
6530 for (int i = 0; i < 1024; i += 256) { 6531 // reload constants q, qinv each iteration as they get clobbered later 6532 vs_ldpq(vq, dilithiumConsts); // qInv, q 6533 // load 32 (8x4S) coefficients via first offsets = c1 6534 vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets1); 6535 // load next 32 (8x4S) inputs = b 6536 vs_ldpq_post(vs2, zetas); 6537 // a = b montmul c1 6538 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq); 6539 // load 32 (8x4S) coefficients via second offsets = c2 6540 vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets2); 6541 // add/sub with result of multiply 6542 vs_addv(vs3, __ T4S, vs1, vs2); // a1 = c2 + a 6543 vs_subv(vs1, __ T4S, vs1, vs2); // a0 = c2 - a 6544 // write back new coefficients using same offsets 6545 vs_str_indexed(vs3, __ Q, coeffs, i, offsets2); 6546 vs_str_indexed(vs1, __ Q, coeffs, i, offsets1); 6547 } 6548 6549 // level 6 6550 // At level 6 the coefficients we need to combine with the zetas 6551 // are grouped in memory in pairs, the first two being montmul 6552 // inputs and the second add/sub inputs. We can still implement 6553 // the montmul+sub+add using 4-way parallelism but only if we 6554 // combine the coefficients with the zetas 16 at a time. We load 8 6555 // adjacent values at 4 different offsets using an ld2 load with 6556 // arrangement 2D. That interleaves the lower and upper halves of 6557 // each pair of quadwords into successive vector registers. We 6558 // then need to montmul the 4 even elements of the coefficients 6559 // register sequence by the zetas in order and then add/sub the 4 6560 // odd elements of the coefficients register sequence. We use an 6561 // equivalent st2 operation to store the results back into memory 6562 // de-interleaved. 6563 for (int i = 0; i < 1024; i += 128) { 6564 // reload constants q, qinv each iteration as they get clobbered later 6565 vs_ldpq(vq, dilithiumConsts); // qInv, q 6566 // load interleaved 16 (4x2D) coefficients via offsets 6567 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets); 6568 // load next 16 (4x4S) inputs 6569 vs_ldpq_post(vs_front(vs2), zetas); 6570 // mont multiply odd elements of vs1 by vs2 and add/sub into odds/evens 6571 dilithium_montmul16_sub_add(vs_even(vs1), vs_odd(vs1), 6572 vs_front(vs2), vtmp, vq); 6573 // store interleaved 16 (4x2D) coefficients via offsets 6574 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets); 6575 } 6576 6577 // level 7 6578 // At level 7 the coefficients we need to combine with the zetas 6579 // occur singly with montmul inputs alternating with add/sub 6580 // inputs. Once again we can use 4-way parallelism to combine 16 6581 // zetas at a time. However, we have to load 8 adjacent values at 6582 // 4 different offsets using an ld2 load with arrangement 4S. That 6583 // interleaves the odd words of each pair into one 6584 // coefficients vector register and the even words of the pair 6585 // into the next register. We then need to montmul the 4 even 6586 // elements of the coefficients register sequence by the zetas in 6587 // order and then add/sub the 4 odd elements of the coefficients 6588 // register sequence. We use an equivalent st2 operation to store 6589 // the results back into memory de-interleaved.
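// For reference: what the ld2/st2 interleaving used at levels 6 and 7 does.
// An ld2 with arrangement 4S into a register pair {va, vb} reads eight
// consecutive 32-bit words w0..w7 and de-interleaves them as
//
//   va = { w0, w2, w4, w6 }
//   vb = { w1, w3, w5, w7 }
//
// so alternating words end up in separate registers of each pair, which is
// what lets dilithium_montmul16_sub_add treat them as the two butterfly
// operands. With arrangement 2D the same split happens on 64-bit lanes,
// i.e. on adjacent pairs of ints, which is the grouping needed at level 6.
// The matching st2 re-interleaves the results on the way back to memory.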
6590 6591 for (int i = 0; i < 1024; i += 128) { 6592 // reload constants q, qinv each iteration as they get clobbered later 6593 vs_ldpq(vq, dilithiumConsts); // qInv, q 6594 // load interleaved 16 (4x4S) coefficients via offsets 6595 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets); 6596 // load next 16 (4x4S) inputs 6597 vs_ldpq_post(vs_front(vs2), zetas); 6598 // mont multiply odd elements of vs1 by vs2 and add/sub into odds/evens 6599 dilithium_montmul16_sub_add(vs_even(vs1), vs_odd(vs1), 6600 vs_front(vs2), vtmp, vq); 6601 // store interleaved 16 (4x4S) coefficients via offsets 6602 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets); 6603 } 6604 __ leave(); // required for proper stackwalking of RuntimeStub frame 6605 __ mov(r0, zr); // return 0 6606 __ ret(lr); 6607 6608 return start; 6609 } 6610 6611 // At these levels, the indices that correspond to the 'j's (and 'j+l's) 6612 // in the Java implementation come in sequences of at least 8, so we 6613 // can use ldpq to collect the corresponding data into pairs of vector 6614 // registers 6615 // We collect the coefficients that correspond to the 'j's into vs1 6616 // the coefficiets that correspond to the 'j+l's into vs2 then 6617 // do the additions into vs3 and the subtractions into vs1 then 6618 // save the result of the additions, load the zetas into vs2 6619 // do the (Montgomery) multiplications by zeta in parallel into vs2 6620 // finally save the results back to the coeffs array 6621 void dilithiumInverseNttLevel3_7(const Register dilithiumConsts, 6622 const Register coeffs, const Register zetas) { 6623 int c1 = 0; 6624 int c2 = 32; 6625 int startIncr; 6626 int offsets[4]; 6627 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs 6628 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3 6629 VSeq<2> vq(30); // n.b. constants overlap vs3 6630 6631 offsets[0] = 0; 6632 6633 for (int level = 3; level < 8; level++) { 6634 int c1Start = c1; 6635 int c2Start = c2; 6636 if (level == 3) { 6637 offsets[1] = 64; 6638 offsets[2] = 128; 6639 offsets[3] = 192; 6640 } else if (level == 4) { 6641 offsets[1] = 32; 6642 offsets[2] = 128; 6643 offsets[3] = 160; 6644 } else { 6645 offsets[1] = 32; 6646 offsets[2] = 64; 6647 offsets[3] = 96; 6648 } 6649 6650 // For levels 3 - 7 we simply load 2 x 4 adjacent values at a 6651 // time at 4 different offsets and multiply them in order by the 6652 // next set of input values. So we employ indexed load and store 6653 // pair instructions with arrangement 4S. 6654 for (int i = 0; i < 4; i++) { 6655 // load v1 32 (8x4S) coefficients relative to first start index 6656 vs_ldpq_indexed(vs1, coeffs, c1Start, offsets); 6657 // load v2 32 (8x4S) coefficients relative to second start index 6658 vs_ldpq_indexed(vs2, coeffs, c2Start, offsets); 6659 // a0 = v1 + v2 -- n.b. 
clobbers vqs 6660 vs_addv(vs3, __ T4S, vs1, vs2); 6661 // a1 = v1 - v2 6662 vs_subv(vs1, __ T4S, vs1, vs2); 6663 // save a1 relative to first start index 6664 vs_stpq_indexed(vs3, coeffs, c1Start, offsets); 6665 // load constants q, qinv each iteration as they get clobbered above 6666 vs_ldpq(vq, dilithiumConsts); // qInv, q 6667 // load b next 32 (8x4S) inputs 6668 vs_ldpq_post(vs2, zetas); 6669 // a = a1 montmul b 6670 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq); 6671 // save a relative to second start index 6672 vs_stpq_indexed(vs2, coeffs, c2Start, offsets); 6673 6674 int k = 4 * level + i; 6675 6676 if (k < 24) { 6677 startIncr = 256; 6678 } else if (k == 25) { 6679 startIncr = 384; 6680 } else { 6681 startIncr = 128; 6682 } 6683 6684 c1Start += startIncr; 6685 c2Start += startIncr; 6686 } 6687 6688 c2 *= 2; 6689 } 6690 } 6691 6692 // Dilithium Inverse NTT function except the final mod Q division by 2^256. 6693 // Implements the method 6694 // static int implDilithiumAlmostInverseNtt(int[] coeffs, int[] zetas) {} of 6695 // the sun.security.provider.ML_DSA class. 6696 // 6697 // coeffs (int[256]) = c_rarg0 6698 // zetas (int[256]) = c_rarg1 6699 address generate_dilithiumAlmostInverseNtt() { 6700 6701 __ align(CodeEntryAlignment); 6702 StubId stub_id = StubId::stubgen_dilithiumAlmostInverseNtt_id; 6703 StubCodeMark mark(this, stub_id); 6704 address start = __ pc(); 6705 __ enter(); 6706 6707 const Register coeffs = c_rarg0; 6708 const Register zetas = c_rarg1; 6709 6710 const Register tmpAddr = r9; 6711 const Register dilithiumConsts = r10; 6712 const Register result = r11; 6713 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs 6714 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3 6715 VSeq<2> vq(30); // n.b. constants overlap vs3 6716 int offsets[4] = { 0, 32, 64, 96 }; 6717 int offsets1[8] = { 0, 32, 64, 96, 128, 160, 192, 224 }; 6718 int offsets2[8] = { 16, 48, 80, 112, 144, 176, 208, 240 }; 6719 6720 __ add(result, coeffs, 0); 6721 __ lea(dilithiumConsts, 6722 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts)); 6723 6724 // Each level represents one iteration of the outer for loop of the Java version 6725 6726 // level 0 6727 // At level 0 we need to interleave adjacent quartets of 6728 // coefficients before we multiply and add/sub by the next 16 6729 // zetas just as we did for level 7 in the multiply code. So we 6730 // load and store the values using an ld2/st2 with arrangement 4S. 6731 for (int i = 0; i < 1024; i += 128) { 6732 // load constants q, qinv 6733 // n.b. this can be moved out of the loop as they do not get 6734 // clobbered by first two loops 6735 vs_ldpq(vq, dilithiumConsts); // qInv, q 6736 // a0/a1 load interleaved 32 (8x4S) coefficients 6737 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets); 6738 // b load next 32 (8x4S) inputs 6739 vs_ldpq_post(vs_front(vs2), zetas); 6740 // compute in parallel (a0, a1) = (a0 + a1, (a0 - a1) montmul b) 6741 // n.b. second half of vs2 provides temporary register storage 6742 dilithium_sub_add_montmul16(vs_even(vs1), vs_odd(vs1), 6743 vs_front(vs2), vs_back(vs2), vtmp, vq); 6744 // a0/a1 store interleaved 32 (8x4S) coefficients 6745 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets); 6746 } 6747 6748 // level 1 6749 // At level 1 we need to interleave pairs of adjacent pairs of 6750 // coefficients before we multiply by the next 16 zetas just as we 6751 // did for level 6 in the multiply code. So we load and store the 6752 // values an ld2/st2 with arrangement 2D. 
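// For reference (not generated code): a scalar sketch of the inverse
// butterfly that levels 0 and 1 vectorize, following the shape of the Java
// implDilithiumAlmostInverseNtt loop body (names here are illustrative):
//
//   int t = coeffs[j];
//   coeffs[j] = t + coeffs[j + l];
//   coeffs[j + l] = montMul(zeta, t - coeffs[j + l]);
//
// dilithium_sub_add_montmul16 below performs 16 of these per call: the
// difference feeds the Montgomery multiply while the sum is written back
// unchanged.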
6753 for (int i = 0; i < 1024; i += 128) { 6754 // a0/a1 load interleaved 32 (8x2D) coefficients 6755 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets); 6756 // b load next 16 (4x4S) inputs 6757 vs_ldpq_post(vs_front(vs2), zetas); 6758 // compute in parallel (a0, a1) = (a0 + a1, (a0 - a1) montmul b) 6759 // n.b. second half of vs2 provides temporary register storage 6760 dilithium_sub_add_montmul16(vs_even(vs1), vs_odd(vs1), 6761 vs_front(vs2), vs_back(vs2), vtmp, vq); 6762 // a0/a1 store interleaved 32 (8x2D) coefficients 6763 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets); 6764 } 6765 6766 // level 2 6767 // At level 2 coefficients come in blocks of 4. So, we load 4 6768 // adjacent coefficients at 8 distinct offsets for both the first 6769 // and second coefficient sequences, using an ldr with register 6770 // variant Q then combine them with next set of 32 zetas. Likewise 6771 // we store the results using an str with register variant Q. 6772 for (int i = 0; i < 1024; i += 256) { 6773 // c0 load 32 (8x4S) coefficients via first offsets 6774 vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets1); 6775 // c1 load 32 (8x4S) coefficients via second offsets 6776 vs_ldr_indexed(vs2, __ Q,coeffs, i, offsets2); 6777 // a0 = c0 + c1 n.b. clobbers vq which overlaps vs3 6778 vs_addv(vs3, __ T4S, vs1, vs2); 6779 // c = c0 - c1 6780 vs_subv(vs1, __ T4S, vs1, vs2); 6781 // store a0 32 (8x4S) coefficients via first offsets 6782 vs_str_indexed(vs3, __ Q, coeffs, i, offsets1); 6783 // b load 32 (8x4S) next inputs 6784 vs_ldpq_post(vs2, zetas); 6785 // reload constants q, qinv -- they were clobbered earlier 6786 vs_ldpq(vq, dilithiumConsts); // qInv, q 6787 // compute a1 = b montmul c 6788 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq); 6789 // store a1 32 (8x4S) coefficients via second offsets 6790 vs_str_indexed(vs2, __ Q, coeffs, i, offsets2); 6791 } 6792 6793 // level 3-7 6794 dilithiumInverseNttLevel3_7(dilithiumConsts, coeffs, zetas); 6795 6796 __ leave(); // required for proper stackwalking of RuntimeStub frame 6797 __ mov(r0, zr); // return 0 6798 __ ret(lr); 6799 6800 return start; 6801 } 6802 6803 // Dilithium multiply polynomials in the NTT domain. 6804 // Straightforward implementation of the method 6805 // static int implDilithiumNttMult( 6806 // int[] result, int[] ntta, int[] nttb {} of 6807 // the sun.security.provider.ML_DSA class. 6808 // 6809 // result (int[256]) = c_rarg0 6810 // poly1 (int[256]) = c_rarg1 6811 // poly2 (int[256]) = c_rarg2 6812 address generate_dilithiumNttMult() { 6813 6814 __ align(CodeEntryAlignment); 6815 StubId stub_id = StubId::stubgen_dilithiumNttMult_id; 6816 StubCodeMark mark(this, stub_id); 6817 address start = __ pc(); 6818 __ enter(); 6819 6820 Label L_loop; 6821 6822 const Register result = c_rarg0; 6823 const Register poly1 = c_rarg1; 6824 const Register poly2 = c_rarg2; 6825 6826 const Register dilithiumConsts = r10; 6827 const Register len = r11; 6828 6829 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs 6830 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3 6831 VSeq<2> vq(30); // n.b. 
constants overlap vs3 6832 VSeq<8> vrsquare(29, 0); // for montmul by constant RSQUARE 6833 6834 __ lea(dilithiumConsts, 6835 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts)); 6836 6837 // load constants q, qinv 6838 vs_ldpq(vq, dilithiumConsts); // qInv, q 6839 // load constant rSquare into v29 6840 __ ldr(v29, __ Q, Address(dilithiumConsts, 48)); // rSquare 6841 6842 __ mov(len, zr); 6843 __ add(len, len, 1024); 6844 6845 __ BIND(L_loop); 6846 6847 // b load 32 (8x4S) next inputs from poly1 6848 vs_ldpq_post(vs1, poly1); 6849 // c load 32 (8x4S) next inputs from poly2 6850 vs_ldpq_post(vs2, poly2); 6851 // compute a = b montmul c 6852 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq); 6853 // compute a = rsquare montmul a 6854 dilithium_montmul32(vs2, vrsquare, vs2, vtmp, vq); 6855 // save a 32 (8x4S) results 6856 vs_stpq_post(vs2, result); 6857 6858 __ sub(len, len, 128); 6859 __ cmp(len, (u1)128); 6860 __ br(Assembler::GE, L_loop); 6861 6862 __ leave(); // required for proper stackwalking of RuntimeStub frame 6863 __ mov(r0, zr); // return 0 6864 __ ret(lr); 6865 6866 return start; 6867 } 6868 6869 // Dilithium Motgomery multiply an array by a constant. 6870 // A straightforward implementation of the method 6871 // static int implDilithiumMontMulByConstant(int[] coeffs, int constant) {} 6872 // of the sun.security.provider.MLDSA class 6873 // 6874 // coeffs (int[256]) = c_rarg0 6875 // constant (int) = c_rarg1 6876 address generate_dilithiumMontMulByConstant() { 6877 6878 __ align(CodeEntryAlignment); 6879 StubId stub_id = StubId::stubgen_dilithiumMontMulByConstant_id; 6880 StubCodeMark mark(this, stub_id); 6881 address start = __ pc(); 6882 __ enter(); 6883 6884 Label L_loop; 6885 6886 const Register coeffs = c_rarg0; 6887 const Register constant = c_rarg1; 6888 6889 const Register dilithiumConsts = r10; 6890 const Register result = r11; 6891 const Register len = r12; 6892 6893 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs 6894 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3 6895 VSeq<2> vq(30); // n.b. constants overlap vs3 6896 VSeq<8> vconst(29, 0); // for montmul by constant 6897 6898 // results track inputs 6899 __ add(result, coeffs, 0); 6900 __ lea(dilithiumConsts, 6901 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts)); 6902 6903 // load constants q, qinv -- they do not get clobbered by first two loops 6904 vs_ldpq(vq, dilithiumConsts); // qInv, q 6905 // copy caller supplied constant across vconst 6906 __ dup(vconst[0], __ T4S, constant); 6907 __ mov(len, zr); 6908 __ add(len, len, 1024); 6909 6910 __ BIND(L_loop); 6911 6912 // load next 32 inputs 6913 vs_ldpq_post(vs2, coeffs); 6914 // mont mul by constant 6915 dilithium_montmul32(vs2, vconst, vs2, vtmp, vq); 6916 // write next 32 results 6917 vs_stpq_post(vs2, result); 6918 6919 __ sub(len, len, 128); 6920 __ cmp(len, (u1)128); 6921 __ br(Assembler::GE, L_loop); 6922 6923 __ leave(); // required for proper stackwalking of RuntimeStub frame 6924 __ mov(r0, zr); // return 0 6925 __ ret(lr); 6926 6927 return start; 6928 } 6929 6930 // Dilithium decompose poly. 
6931 // Implements the method 6932 // static int implDilithiumDecomposePoly(int[] coeffs, int constant) {} 6933 // of the sun.security.provider.ML_DSA class 6934 // 6935 // input (int[256]) = c_rarg0 6936 // lowPart (int[256]) = c_rarg1 6937 // highPart (int[256]) = c_rarg2 6938 // twoGamma2 (int) = c_rarg3 6939 // multiplier (int) = c_rarg4 6940 address generate_dilithiumDecomposePoly() { 6941 6942 __ align(CodeEntryAlignment); 6943 StubId stub_id = StubId::stubgen_dilithiumDecomposePoly_id; 6944 StubCodeMark mark(this, stub_id); 6945 address start = __ pc(); 6946 Label L_loop; 6947 6948 const Register input = c_rarg0; 6949 const Register lowPart = c_rarg1; 6950 const Register highPart = c_rarg2; 6951 const Register twoGamma2 = c_rarg3; 6952 const Register multiplier = c_rarg4; 6953 6954 const Register len = r9; 6955 const Register dilithiumConsts = r10; 6956 const Register tmp = r11; 6957 6958 // 6 independent sets of 4x4s values 6959 VSeq<4> vs1(0), vs2(4), vs3(8); 6960 VSeq<4> vs4(12), vs5(16), vtmp(20); 6961 6962 // 7 constants for cross-multiplying 6963 VSeq<4> one(25, 0); 6964 VSeq<4> qminus1(26, 0); 6965 VSeq<4> g2(27, 0); 6966 VSeq<4> twog2(28, 0); 6967 VSeq<4> mult(29, 0); 6968 VSeq<4> q(30, 0); 6969 VSeq<4> qadd(31, 0); 6970 6971 __ enter(); 6972 6973 __ lea(dilithiumConsts, 6974 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts)); 6975 6976 // save callee-saved registers 6977 __ stpd(v8, v9, __ pre(sp, -64)); 6978 __ stpd(v10, v11, Address(sp, 16)); 6979 __ stpd(v12, v13, Address(sp, 32)); 6980 __ stpd(v14, v15, Address(sp, 48)); 6981 6982 // populate constant registers 6983 __ mov(tmp, zr); 6984 __ add(tmp, tmp, 1); 6985 __ dup(one[0], __ T4S, tmp); // 1 6986 __ ldr(q[0], __ Q, Address(dilithiumConsts, 16)); // q 6987 __ ldr(qadd[0], __ Q, Address(dilithiumConsts, 64)); // addend for mod q reduce 6988 __ dup(twog2[0], __ T4S, twoGamma2); // 2 * gamma2 6989 __ dup(mult[0], __ T4S, multiplier); // multiplier for mod 2 * gamma reduce 6990 __ subv(qminus1[0], __ T4S, v30, v25); // q - 1 6991 __ sshr(g2[0], __ T4S, v28, 1); // gamma2 6992 6993 __ mov(len, zr); 6994 __ add(len, len, 1024); 6995 6996 __ BIND(L_loop); 6997 6998 // load next 4x4S inputs interleaved: rplus --> vs1 6999 __ ld4(vs1[0], vs1[1], vs1[2], vs1[3], __ T4S, __ post(input, 64)); 7000 7001 // rplus = rplus - ((rplus + qadd) >> 23) * q 7002 vs_addv(vtmp, __ T4S, vs1, qadd); 7003 vs_sshr(vtmp, __ T4S, vtmp, 23); 7004 vs_mulv(vtmp, __ T4S, vtmp, q); 7005 vs_subv(vs1, __ T4S, vs1, vtmp); 7006 7007 // rplus = rplus + ((rplus >> 31) & dilithium_q); 7008 vs_sshr(vtmp, __ T4S, vs1, 31); 7009 vs_andr(vtmp, vtmp, q); 7010 vs_addv(vs1, __ T4S, vs1, vtmp); 7011 7012 // quotient --> vs2 7013 // int quotient = (rplus * multiplier) >> 22; 7014 vs_mulv(vtmp, __ T4S, vs1, mult); 7015 vs_sshr(vs2, __ T4S, vtmp, 22); 7016 7017 // r0 --> vs3 7018 // int r0 = rplus - quotient * twoGamma2; 7019 vs_mulv(vtmp, __ T4S, vs2, twog2); 7020 vs_subv(vs3, __ T4S, vs1, vtmp); 7021 7022 // mask --> vs4 7023 // int mask = (twoGamma2 - r0) >> 22; 7024 vs_subv(vtmp, __ T4S, twog2, vs3); 7025 vs_sshr(vs4, __ T4S, vtmp, 22); 7026 7027 // r0 -= (mask & twoGamma2); 7028 vs_andr(vtmp, vs4, twog2); 7029 vs_subv(vs3, __ T4S, vs3, vtmp); 7030 7031 // quotient += (mask & 1); 7032 vs_andr(vtmp, vs4, one); 7033 vs_addv(vs2, __ T4S, vs2, vtmp); 7034 7035 // mask = (twoGamma2 / 2 - r0) >> 31; 7036 vs_subv(vtmp, __ T4S, g2, vs3); 7037 vs_sshr(vs4, __ T4S, vtmp, 31); 7038 7039 // r0 -= (mask & twoGamma2); 7040 vs_andr(vtmp, vs4, twog2); 7041 
vs_subv(vs3, __ T4S, vs3, vtmp); 7042 7043 // quotient += (mask & 1); 7044 vs_andr(vtmp, vs4, one); 7045 vs_addv(vs2, __ T4S, vs2, vtmp); 7046 7047 // r1 --> vs5 7048 // int r1 = rplus - r0 - (dilithium_q - 1); 7049 vs_subv(vtmp, __ T4S, vs1, vs3); 7050 vs_subv(vs5, __ T4S, vtmp, qminus1); 7051 7052 // r1 --> vs1 (overwriting rplus) 7053 // r1 = (r1 | (-r1)) >> 31; // 0 if rplus - r0 == (dilithium_q - 1), -1 otherwise 7054 vs_negr(vtmp, __ T4S, vs5); 7055 vs_orr(vtmp, vs5, vtmp); 7056 vs_sshr(vs1, __ T4S, vtmp, 31); 7057 7058 // r0 += ~r1; 7059 vs_notr(vtmp, vs1); 7060 vs_addv(vs3, __ T4S, vs3, vtmp); 7061 7062 // r1 = r1 & quotient; 7063 vs_andr(vs1, vs2, vs1); 7064 7065 // store results inteleaved 7066 // lowPart[m] = r0; 7067 // highPart[m] = r1; 7068 __ st4(vs3[0], vs3[1], vs3[2], vs3[3], __ T4S, __ post(lowPart, 64)); 7069 __ st4(vs1[0], vs1[1], vs1[2], vs1[3], __ T4S, __ post(highPart, 64)); 7070 7071 __ sub(len, len, 64); 7072 __ cmp(len, (u1)64); 7073 __ br(Assembler::GE, L_loop); 7074 7075 // restore callee-saved vector registers 7076 __ ldpd(v14, v15, Address(sp, 48)); 7077 __ ldpd(v12, v13, Address(sp, 32)); 7078 __ ldpd(v10, v11, Address(sp, 16)); 7079 __ ldpd(v8, v9, __ post(sp, 64)); 7080 7081 __ leave(); // required for proper stackwalking of RuntimeStub frame 7082 __ mov(r0, zr); // return 0 7083 __ ret(lr); 7084 7085 return start; 7086 } 7087 7088 void bcax5(Register a0, Register a1, Register a2, Register a3, Register a4, 7089 Register tmp0, Register tmp1, Register tmp2) { 7090 __ bic(tmp0, a2, a1); // for a0 7091 __ bic(tmp1, a3, a2); // for a1 7092 __ bic(tmp2, a4, a3); // for a2 7093 __ eor(a2, a2, tmp2); 7094 __ bic(tmp2, a0, a4); // for a3 7095 __ eor(a3, a3, tmp2); 7096 __ bic(tmp2, a1, a0); // for a4 7097 __ eor(a0, a0, tmp0); 7098 __ eor(a1, a1, tmp1); 7099 __ eor(a4, a4, tmp2); 7100 } 7101 7102 void keccak_round_gpr(bool can_use_fp, bool can_use_r18, Register rc, 7103 Register a0, Register a1, Register a2, Register a3, Register a4, 7104 Register a5, Register a6, Register a7, Register a8, Register a9, 7105 Register a10, Register a11, Register a12, Register a13, Register a14, 7106 Register a15, Register a16, Register a17, Register a18, Register a19, 7107 Register a20, Register a21, Register a22, Register a23, Register a24, 7108 Register tmp0, Register tmp1, Register tmp2) { 7109 __ eor3(tmp1, a4, a9, a14); 7110 __ eor3(tmp0, tmp1, a19, a24); // tmp0 = a4^a9^a14^a19^a24 = c4 7111 __ eor3(tmp2, a1, a6, a11); 7112 __ eor3(tmp1, tmp2, a16, a21); // tmp1 = a1^a6^a11^a16^a21 = c1 7113 __ rax1(tmp2, tmp0, tmp1); // d0 7114 { 7115 7116 Register tmp3, tmp4; 7117 if (can_use_fp && can_use_r18) { 7118 tmp3 = rfp; 7119 tmp4 = r18_tls; 7120 } else { 7121 tmp3 = a4; 7122 tmp4 = a9; 7123 __ stp(tmp3, tmp4, __ pre(sp, -16)); 7124 } 7125 7126 __ eor3(tmp3, a0, a5, a10); 7127 __ eor3(tmp4, tmp3, a15, a20); // tmp4 = a0^a5^a10^a15^a20 = c0 7128 __ eor(a0, a0, tmp2); 7129 __ eor(a5, a5, tmp2); 7130 __ eor(a10, a10, tmp2); 7131 __ eor(a15, a15, tmp2); 7132 __ eor(a20, a20, tmp2); // d0(tmp2) 7133 __ eor3(tmp3, a2, a7, a12); 7134 __ eor3(tmp2, tmp3, a17, a22); // tmp2 = a2^a7^a12^a17^a22 = c2 7135 __ rax1(tmp3, tmp4, tmp2); // d1 7136 __ eor(a1, a1, tmp3); 7137 __ eor(a6, a6, tmp3); 7138 __ eor(a11, a11, tmp3); 7139 __ eor(a16, a16, tmp3); 7140 __ eor(a21, a21, tmp3); // d1(tmp3) 7141 __ rax1(tmp3, tmp2, tmp0); // d3 7142 __ eor3(tmp2, a3, a8, a13); 7143 __ eor3(tmp0, tmp2, a18, a23); // tmp0 = a3^a8^a13^a18^a23 = c3 7144 __ eor(a3, a3, tmp3); 7145 __ eor(a8, a8, tmp3); 7146 __ eor(a13, 
a13, tmp3); 7147 __ eor(a18, a18, tmp3); 7148 __ eor(a23, a23, tmp3); 7149 __ rax1(tmp2, tmp1, tmp0); // d2 7150 __ eor(a2, a2, tmp2); 7151 __ eor(a7, a7, tmp2); 7152 __ eor(a12, a12, tmp2); 7153 __ rax1(tmp0, tmp0, tmp4); // d4 7154 if (!can_use_fp || !can_use_r18) { 7155 __ ldp(tmp3, tmp4, __ post(sp, 16)); 7156 } 7157 __ eor(a17, a17, tmp2); 7158 __ eor(a22, a22, tmp2); 7159 __ eor(a4, a4, tmp0); 7160 __ eor(a9, a9, tmp0); 7161 __ eor(a14, a14, tmp0); 7162 __ eor(a19, a19, tmp0); 7163 __ eor(a24, a24, tmp0); 7164 } 7165 7166 __ rol(tmp0, a10, 3); 7167 __ rol(a10, a1, 1); 7168 __ rol(a1, a6, 44); 7169 __ rol(a6, a9, 20); 7170 __ rol(a9, a22, 61); 7171 __ rol(a22, a14, 39); 7172 __ rol(a14, a20, 18); 7173 __ rol(a20, a2, 62); 7174 __ rol(a2, a12, 43); 7175 __ rol(a12, a13, 25); 7176 __ rol(a13, a19, 8) ; 7177 __ rol(a19, a23, 56); 7178 __ rol(a23, a15, 41); 7179 __ rol(a15, a4, 27); 7180 __ rol(a4, a24, 14); 7181 __ rol(a24, a21, 2); 7182 __ rol(a21, a8, 55); 7183 __ rol(a8, a16, 45); 7184 __ rol(a16, a5, 36); 7185 __ rol(a5, a3, 28); 7186 __ rol(a3, a18, 21); 7187 __ rol(a18, a17, 15); 7188 __ rol(a17, a11, 10); 7189 __ rol(a11, a7, 6); 7190 __ mov(a7, tmp0); 7191 7192 bcax5(a0, a1, a2, a3, a4, tmp0, tmp1, tmp2); 7193 bcax5(a5, a6, a7, a8, a9, tmp0, tmp1, tmp2); 7194 bcax5(a10, a11, a12, a13, a14, tmp0, tmp1, tmp2); 7195 bcax5(a15, a16, a17, a18, a19, tmp0, tmp1, tmp2); 7196 bcax5(a20, a21, a22, a23, a24, tmp0, tmp1, tmp2); 7197 7198 __ ldr(tmp1, __ post(rc, 8)); 7199 __ eor(a0, a0, tmp1); 7200 7201 } 7202 7203 // Arguments: 7204 // 7205 // Inputs: 7206 // c_rarg0 - byte[] source+offset 7207 // c_rarg1 - byte[] SHA.state 7208 // c_rarg2 - int block_size 7209 // c_rarg3 - int offset 7210 // c_rarg4 - int limit 7211 // 7212 address generate_sha3_implCompress_gpr(StubId stub_id) { 7213 bool multi_block; 7214 switch (stub_id) { 7215 case StubId::stubgen_sha3_implCompress_id: 7216 multi_block = false; 7217 break; 7218 case StubId::stubgen_sha3_implCompressMB_id: 7219 multi_block = true; 7220 break; 7221 default: 7222 ShouldNotReachHere(); 7223 } 7224 7225 static const uint64_t round_consts[24] = { 7226 0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL, 7227 0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L, 7228 0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL, 7229 0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL, 7230 0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L, 7231 0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L, 7232 0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L, 7233 0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L 7234 }; 7235 7236 __ align(CodeEntryAlignment); 7237 StubCodeMark mark(this, stub_id); 7238 address start = __ pc(); 7239 7240 Register buf = c_rarg0; 7241 Register state = c_rarg1; 7242 Register block_size = c_rarg2; 7243 Register ofs = c_rarg3; 7244 Register limit = c_rarg4; 7245 7246 // use r3.r17,r19..r28 to keep a0..a24. 
7247 // a0..a24 are respective locals from SHA3.java 7248 Register a0 = r25, 7249 a1 = r26, 7250 a2 = r27, 7251 a3 = r3, 7252 a4 = r4, 7253 a5 = r5, 7254 a6 = r6, 7255 a7 = r7, 7256 a8 = rscratch1, // r8 7257 a9 = rscratch2, // r9 7258 a10 = r10, 7259 a11 = r11, 7260 a12 = r12, 7261 a13 = r13, 7262 a14 = r14, 7263 a15 = r15, 7264 a16 = r16, 7265 a17 = r17, 7266 a18 = r28, 7267 a19 = r19, 7268 a20 = r20, 7269 a21 = r21, 7270 a22 = r22, 7271 a23 = r23, 7272 a24 = r24; 7273 7274 Register tmp0 = block_size, tmp1 = buf, tmp2 = state, tmp3 = r30; 7275 7276 Label sha3_loop, rounds24_preloop, loop_body; 7277 Label sha3_512_or_sha3_384, shake128; 7278 7279 bool can_use_r18 = false; 7280 #ifndef R18_RESERVED 7281 can_use_r18 = true; 7282 #endif 7283 bool can_use_fp = !PreserveFramePointer; 7284 7285 __ enter(); 7286 7287 // save almost all yet unsaved gpr registers on stack 7288 __ str(block_size, __ pre(sp, -128)); 7289 if (multi_block) { 7290 __ stpw(ofs, limit, Address(sp, 8)); 7291 } 7292 // 8 bytes at sp+16 will be used to keep buf 7293 __ stp(r19, r20, Address(sp, 32)); 7294 __ stp(r21, r22, Address(sp, 48)); 7295 __ stp(r23, r24, Address(sp, 64)); 7296 __ stp(r25, r26, Address(sp, 80)); 7297 __ stp(r27, r28, Address(sp, 96)); 7298 if (can_use_r18 && can_use_fp) { 7299 __ stp(r18_tls, state, Address(sp, 112)); 7300 } else { 7301 __ str(state, Address(sp, 112)); 7302 } 7303 7304 // begin sha3 calculations: loading a0..a24 from state arrary 7305 __ ldp(a0, a1, state); 7306 __ ldp(a2, a3, Address(state, 16)); 7307 __ ldp(a4, a5, Address(state, 32)); 7308 __ ldp(a6, a7, Address(state, 48)); 7309 __ ldp(a8, a9, Address(state, 64)); 7310 __ ldp(a10, a11, Address(state, 80)); 7311 __ ldp(a12, a13, Address(state, 96)); 7312 __ ldp(a14, a15, Address(state, 112)); 7313 __ ldp(a16, a17, Address(state, 128)); 7314 __ ldp(a18, a19, Address(state, 144)); 7315 __ ldp(a20, a21, Address(state, 160)); 7316 __ ldp(a22, a23, Address(state, 176)); 7317 __ ldr(a24, Address(state, 192)); 7318 7319 __ BIND(sha3_loop); 7320 7321 // load input 7322 __ ldp(tmp3, tmp2, __ post(buf, 16)); 7323 __ eor(a0, a0, tmp3); 7324 __ eor(a1, a1, tmp2); 7325 __ ldp(tmp3, tmp2, __ post(buf, 16)); 7326 __ eor(a2, a2, tmp3); 7327 __ eor(a3, a3, tmp2); 7328 __ ldp(tmp3, tmp2, __ post(buf, 16)); 7329 __ eor(a4, a4, tmp3); 7330 __ eor(a5, a5, tmp2); 7331 __ ldr(tmp3, __ post(buf, 8)); 7332 __ eor(a6, a6, tmp3); 7333 7334 // block_size == 72, SHA3-512; block_size == 104, SHA3-384 7335 __ tbz(block_size, 7, sha3_512_or_sha3_384); 7336 7337 __ ldp(tmp3, tmp2, __ post(buf, 16)); 7338 __ eor(a7, a7, tmp3); 7339 __ eor(a8, a8, tmp2); 7340 __ ldp(tmp3, tmp2, __ post(buf, 16)); 7341 __ eor(a9, a9, tmp3); 7342 __ eor(a10, a10, tmp2); 7343 __ ldp(tmp3, tmp2, __ post(buf, 16)); 7344 __ eor(a11, a11, tmp3); 7345 __ eor(a12, a12, tmp2); 7346 __ ldp(tmp3, tmp2, __ post(buf, 16)); 7347 __ eor(a13, a13, tmp3); 7348 __ eor(a14, a14, tmp2); 7349 __ ldp(tmp3, tmp2, __ post(buf, 16)); 7350 __ eor(a15, a15, tmp3); 7351 __ eor(a16, a16, tmp2); 7352 7353 // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256 7354 __ andw(tmp2, block_size, 48); 7355 __ cbzw(tmp2, rounds24_preloop); 7356 __ tbnz(block_size, 5, shake128); 7357 // block_size == 144, bit5 == 0, SHA3-244 7358 __ ldr(tmp3, __ post(buf, 8)); 7359 __ eor(a17, a17, tmp3); 7360 __ b(rounds24_preloop); 7361 7362 __ BIND(shake128); 7363 __ ldp(tmp3, tmp2, __ post(buf, 16)); 7364 __ eor(a17, a17, tmp3); 7365 __ eor(a18, a18, tmp2); 7366 __ ldp(tmp3, tmp2, __ post(buf, 16)); 7367 __ eor(a19, a19, 
tmp3); 7368 __ eor(a20, a20, tmp2); 7369 __ b(rounds24_preloop); // block_size == 168, SHAKE128 7370 7371 __ BIND(sha3_512_or_sha3_384); 7372 __ ldp(tmp3, tmp2, __ post(buf, 16)); 7373 __ eor(a7, a7, tmp3); 7374 __ eor(a8, a8, tmp2); 7375 __ tbz(block_size, 5, rounds24_preloop); // SHA3-512 7376 7377 // SHA3-384 7378 __ ldp(tmp3, tmp2, __ post(buf, 16)); 7379 __ eor(a9, a9, tmp3); 7380 __ eor(a10, a10, tmp2); 7381 __ ldp(tmp3, tmp2, __ post(buf, 16)); 7382 __ eor(a11, a11, tmp3); 7383 __ eor(a12, a12, tmp2); 7384 7385 __ BIND(rounds24_preloop); 7386 __ fmovs(v0, 24.0); // float loop counter, 7387 __ fmovs(v1, 1.0); // exact representation 7388 7389 __ str(buf, Address(sp, 16)); 7390 __ lea(tmp3, ExternalAddress((address) round_consts)); 7391 7392 __ BIND(loop_body); 7393 keccak_round_gpr(can_use_fp, can_use_r18, tmp3, 7394 a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, 7395 a13, a14, a15, a16, a17, a18, a19, a20, a21, a22, a23, a24, 7396 tmp0, tmp1, tmp2); 7397 __ fsubs(v0, v0, v1); 7398 __ fcmps(v0, 0.0); 7399 __ br(__ NE, loop_body); 7400 7401 if (multi_block) { 7402 __ ldrw(block_size, sp); // block_size 7403 __ ldpw(tmp2, tmp1, Address(sp, 8)); // offset, limit 7404 __ addw(tmp2, tmp2, block_size); 7405 __ cmpw(tmp2, tmp1); 7406 __ strw(tmp2, Address(sp, 8)); // store offset in case we're jumping 7407 __ ldr(buf, Address(sp, 16)); // restore buf in case we're jumping 7408 __ br(Assembler::LE, sha3_loop); 7409 __ movw(c_rarg0, tmp2); // return offset 7410 } 7411 if (can_use_fp && can_use_r18) { 7412 __ ldp(r18_tls, state, Address(sp, 112)); 7413 } else { 7414 __ ldr(state, Address(sp, 112)); 7415 } 7416 // save calculated sha3 state 7417 __ stp(a0, a1, Address(state)); 7418 __ stp(a2, a3, Address(state, 16)); 7419 __ stp(a4, a5, Address(state, 32)); 7420 __ stp(a6, a7, Address(state, 48)); 7421 __ stp(a8, a9, Address(state, 64)); 7422 __ stp(a10, a11, Address(state, 80)); 7423 __ stp(a12, a13, Address(state, 96)); 7424 __ stp(a14, a15, Address(state, 112)); 7425 __ stp(a16, a17, Address(state, 128)); 7426 __ stp(a18, a19, Address(state, 144)); 7427 __ stp(a20, a21, Address(state, 160)); 7428 __ stp(a22, a23, Address(state, 176)); 7429 __ str(a24, Address(state, 192)); 7430 7431 // restore required registers from stack 7432 __ ldp(r19, r20, Address(sp, 32)); 7433 __ ldp(r21, r22, Address(sp, 48)); 7434 __ ldp(r23, r24, Address(sp, 64)); 7435 __ ldp(r25, r26, Address(sp, 80)); 7436 __ ldp(r27, r28, Address(sp, 96)); 7437 if (can_use_fp && can_use_r18) { 7438 __ add(rfp, sp, 128); // leave() will copy rfp to sp below 7439 } // else no need to recalculate rfp, since it wasn't changed 7440 7441 __ leave(); 7442 7443 __ ret(lr); 7444 7445 return start; 7446 } 7447 7448 /** 7449 * Arguments: 7450 * 7451 * Inputs: 7452 * c_rarg0 - int crc 7453 * c_rarg1 - byte* buf 7454 * c_rarg2 - int length 7455 * 7456 * Output: 7457 * rax - int crc result 7458 */ 7459 address generate_updateBytesCRC32() { 7460 assert(UseCRC32Intrinsics, "what are we doing here?"); 7461 7462 __ align(CodeEntryAlignment); 7463 StubId stub_id = StubId::stubgen_updateBytesCRC32_id; 7464 StubCodeMark mark(this, stub_id); 7465 7466 address start = __ pc(); 7467 7468 const Register crc = c_rarg0; // crc 7469 const Register buf = c_rarg1; // source java byte array address 7470 const Register len = c_rarg2; // length 7471 const Register table0 = c_rarg3; // crc_table address 7472 const Register table1 = c_rarg4; 7473 const Register table2 = c_rarg5; 7474 const Register table3 = c_rarg6; 7475 const Register tmp3 = c_rarg7; 
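// For reference only (nothing here is generated code): java.util.zip.CRC32
// is the standard reflected CRC-32 with polynomial 0xEDB88320. Ignoring
// where the customary pre/post inversion of crc is applied, a bytewise
// sketch of the value the kernel has to produce is
//
//   for (int i = 0; i < len; i++) {
//     crc ^= buf[i] & 0xff;
//     for (int k = 0; k < 8; k++) {
//       crc = (crc >>> 1) ^ (0xEDB88320 & -(crc & 1));
//     }
//   }
//
// table0..table3 are passed down so that kernel_crc32 can, presumably, use
// a table-driven (slicing) variant when the hardware CRC32 instructions are
// not used.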
7476 7477 BLOCK_COMMENT("Entry:"); 7478 __ enter(); // required for proper stackwalking of RuntimeStub frame 7479 7480 __ kernel_crc32(crc, buf, len, 7481 table0, table1, table2, table3, rscratch1, rscratch2, tmp3); 7482 7483 __ leave(); // required for proper stackwalking of RuntimeStub frame 7484 __ ret(lr); 7485 7486 return start; 7487 } 7488 7489 /** 7490 * Arguments: 7491 * 7492 * Inputs: 7493 * c_rarg0 - int crc 7494 * c_rarg1 - byte* buf 7495 * c_rarg2 - int length 7496 * c_rarg3 - int* table 7497 * 7498 * Output: 7499 * r0 - int crc result 7500 */ 7501 address generate_updateBytesCRC32C() { 7502 assert(UseCRC32CIntrinsics, "what are we doing here?"); 7503 7504 __ align(CodeEntryAlignment); 7505 StubId stub_id = StubId::stubgen_updateBytesCRC32C_id; 7506 StubCodeMark mark(this, stub_id); 7507 7508 address start = __ pc(); 7509 7510 const Register crc = c_rarg0; // crc 7511 const Register buf = c_rarg1; // source java byte array address 7512 const Register len = c_rarg2; // length 7513 const Register table0 = c_rarg3; // crc_table address 7514 const Register table1 = c_rarg4; 7515 const Register table2 = c_rarg5; 7516 const Register table3 = c_rarg6; 7517 const Register tmp3 = c_rarg7; 7518 7519 BLOCK_COMMENT("Entry:"); 7520 __ enter(); // required for proper stackwalking of RuntimeStub frame 7521 7522 __ kernel_crc32c(crc, buf, len, 7523 table0, table1, table2, table3, rscratch1, rscratch2, tmp3); 7524 7525 __ leave(); // required for proper stackwalking of RuntimeStub frame 7526 __ ret(lr); 7527 7528 return start; 7529 } 7530 7531 /*** 7532 * Arguments: 7533 * 7534 * Inputs: 7535 * c_rarg0 - int adler 7536 * c_rarg1 - byte* buff 7537 * c_rarg2 - int len 7538 * 7539 * Output: 7540 * c_rarg0 - int adler result 7541 */ 7542 address generate_updateBytesAdler32() { 7543 __ align(CodeEntryAlignment); 7544 StubId stub_id = StubId::stubgen_updateBytesAdler32_id; 7545 StubCodeMark mark(this, stub_id); 7546 address start = __ pc(); 7547 7548 Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1; 7549 7550 // Aliases 7551 Register adler = c_rarg0; 7552 Register s1 = c_rarg0; 7553 Register s2 = c_rarg3; 7554 Register buff = c_rarg1; 7555 Register len = c_rarg2; 7556 Register nmax = r4; 7557 Register base = r5; 7558 Register count = r6; 7559 Register temp0 = rscratch1; 7560 Register temp1 = rscratch2; 7561 FloatRegister vbytes = v0; 7562 FloatRegister vs1acc = v1; 7563 FloatRegister vs2acc = v2; 7564 FloatRegister vtable = v3; 7565 7566 // Max number of bytes we can process before having to take the mod 7567 // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 7568 uint64_t BASE = 0xfff1; 7569 uint64_t NMAX = 0x15B0; 7570 7571 __ mov(base, BASE); 7572 __ mov(nmax, NMAX); 7573 7574 // Load accumulation coefficients for the upper 16 bits 7575 __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table)); 7576 __ ld1(vtable, __ T16B, Address(temp0)); 7577 7578 // s1 is initialized to the lower 16 bits of adler 7579 // s2 is initialized to the upper 16 bits of adler 7580 __ ubfx(s2, adler, 16, 16); // s2 = ((adler >> 16) & 0xffff) 7581 __ uxth(s1, adler); // s1 = (adler & 0xffff) 7582 7583 // The pipelined loop needs at least 16 elements for 1 iteration 7584 // It does check this, but it is more effective to skip to the cleanup loop 7585 __ cmp(len, (u1)16); 7586 __ br(Assembler::HS, L_nmax); 7587 __ cbz(len, L_combine); 7588 7589 __ bind(L_simple_by1_loop); 7590 __ ldrb(temp0, Address(__ 
post(buff, 1))); 7591 __ add(s1, s1, temp0); 7592 __ add(s2, s2, s1); 7593 __ subs(len, len, 1); 7594 __ br(Assembler::HI, L_simple_by1_loop); 7595 7596 // s1 = s1 % BASE 7597 __ subs(temp0, s1, base); 7598 __ csel(s1, temp0, s1, Assembler::HS); 7599 7600 // s2 = s2 % BASE 7601 __ lsr(temp0, s2, 16); 7602 __ lsl(temp1, temp0, 4); 7603 __ sub(temp1, temp1, temp0); 7604 __ add(s2, temp1, s2, ext::uxth); 7605 7606 __ subs(temp0, s2, base); 7607 __ csel(s2, temp0, s2, Assembler::HS); 7608 7609 __ b(L_combine); 7610 7611 __ bind(L_nmax); 7612 __ subs(len, len, nmax); 7613 __ sub(count, nmax, 16); 7614 __ br(Assembler::LO, L_by16); 7615 7616 __ bind(L_nmax_loop); 7617 7618 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1, 7619 vbytes, vs1acc, vs2acc, vtable); 7620 7621 __ subs(count, count, 16); 7622 __ br(Assembler::HS, L_nmax_loop); 7623 7624 // s1 = s1 % BASE 7625 __ lsr(temp0, s1, 16); 7626 __ lsl(temp1, temp0, 4); 7627 __ sub(temp1, temp1, temp0); 7628 __ add(temp1, temp1, s1, ext::uxth); 7629 7630 __ lsr(temp0, temp1, 16); 7631 __ lsl(s1, temp0, 4); 7632 __ sub(s1, s1, temp0); 7633 __ add(s1, s1, temp1, ext:: uxth); 7634 7635 __ subs(temp0, s1, base); 7636 __ csel(s1, temp0, s1, Assembler::HS); 7637 7638 // s2 = s2 % BASE 7639 __ lsr(temp0, s2, 16); 7640 __ lsl(temp1, temp0, 4); 7641 __ sub(temp1, temp1, temp0); 7642 __ add(temp1, temp1, s2, ext::uxth); 7643 7644 __ lsr(temp0, temp1, 16); 7645 __ lsl(s2, temp0, 4); 7646 __ sub(s2, s2, temp0); 7647 __ add(s2, s2, temp1, ext:: uxth); 7648 7649 __ subs(temp0, s2, base); 7650 __ csel(s2, temp0, s2, Assembler::HS); 7651 7652 __ subs(len, len, nmax); 7653 __ sub(count, nmax, 16); 7654 __ br(Assembler::HS, L_nmax_loop); 7655 7656 __ bind(L_by16); 7657 __ adds(len, len, count); 7658 __ br(Assembler::LO, L_by1); 7659 7660 __ bind(L_by16_loop); 7661 7662 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1, 7663 vbytes, vs1acc, vs2acc, vtable); 7664 7665 __ subs(len, len, 16); 7666 __ br(Assembler::HS, L_by16_loop); 7667 7668 __ bind(L_by1); 7669 __ adds(len, len, 15); 7670 __ br(Assembler::LO, L_do_mod); 7671 7672 __ bind(L_by1_loop); 7673 __ ldrb(temp0, Address(__ post(buff, 1))); 7674 __ add(s1, temp0, s1); 7675 __ add(s2, s2, s1); 7676 __ subs(len, len, 1); 7677 __ br(Assembler::HS, L_by1_loop); 7678 7679 __ bind(L_do_mod); 7680 // s1 = s1 % BASE 7681 __ lsr(temp0, s1, 16); 7682 __ lsl(temp1, temp0, 4); 7683 __ sub(temp1, temp1, temp0); 7684 __ add(temp1, temp1, s1, ext::uxth); 7685 7686 __ lsr(temp0, temp1, 16); 7687 __ lsl(s1, temp0, 4); 7688 __ sub(s1, s1, temp0); 7689 __ add(s1, s1, temp1, ext:: uxth); 7690 7691 __ subs(temp0, s1, base); 7692 __ csel(s1, temp0, s1, Assembler::HS); 7693 7694 // s2 = s2 % BASE 7695 __ lsr(temp0, s2, 16); 7696 __ lsl(temp1, temp0, 4); 7697 __ sub(temp1, temp1, temp0); 7698 __ add(temp1, temp1, s2, ext::uxth); 7699 7700 __ lsr(temp0, temp1, 16); 7701 __ lsl(s2, temp0, 4); 7702 __ sub(s2, s2, temp0); 7703 __ add(s2, s2, temp1, ext:: uxth); 7704 7705 __ subs(temp0, s2, base); 7706 __ csel(s2, temp0, s2, Assembler::HS); 7707 7708 // Combine lower bits and higher bits 7709 __ bind(L_combine); 7710 __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16) 7711 7712 __ ret(lr); 7713 7714 return start; 7715 } 7716 7717 void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff, 7718 Register temp0, Register temp1, FloatRegister vbytes, 7719 FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) { 7720 // Below is a vectorized implementation of updating s1 and 
s2 for 16 bytes. 7721 // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration. 7722 // In non-vectorized code, we update s1 and s2 as: 7723 // s1 <- s1 + b1 7724 // s2 <- s2 + s1 7725 // s1 <- s1 + b2 7726 // s2 <- s2 + b1 7727 // ... 7728 // s1 <- s1 + b16 7729 // s2 <- s2 + s1 7730 // Putting above assignments together, we have: 7731 // s1_new = s1 + b1 + b2 + ... + b16 7732 // s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16) 7733 // = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1) 7734 // = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1) 7735 __ ld1(vbytes, __ T16B, Address(__ post(buff, 16))); 7736 7737 // s2 = s2 + s1 * 16 7738 __ add(s2, s2, s1, Assembler::LSL, 4); 7739 7740 // vs1acc = b1 + b2 + b3 + ... + b16 7741 // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1) 7742 __ umullv(vs2acc, __ T8B, vtable, vbytes); 7743 __ umlalv(vs2acc, __ T16B, vtable, vbytes); 7744 __ uaddlv(vs1acc, __ T16B, vbytes); 7745 __ uaddlv(vs2acc, __ T8H, vs2acc); 7746 7747 // s1 = s1 + vs1acc, s2 = s2 + vs2acc 7748 __ fmovd(temp0, vs1acc); 7749 __ fmovd(temp1, vs2acc); 7750 __ add(s1, s1, temp0); 7751 __ add(s2, s2, temp1); 7752 } 7753 7754 /** 7755 * Arguments: 7756 * 7757 * Input: 7758 * c_rarg0 - x address 7759 * c_rarg1 - x length 7760 * c_rarg2 - y address 7761 * c_rarg3 - y length 7762 * c_rarg4 - z address 7763 */ 7764 address generate_multiplyToLen() { 7765 __ align(CodeEntryAlignment); 7766 StubId stub_id = StubId::stubgen_multiplyToLen_id; 7767 StubCodeMark mark(this, stub_id); 7768 7769 address start = __ pc(); 7770 7771 if (AOTCodeCache::load_stub(this, vmIntrinsics::_multiplyToLen, "multiplyToLen", start)) { 7772 return start; 7773 } 7774 const Register x = r0; 7775 const Register xlen = r1; 7776 const Register y = r2; 7777 const Register ylen = r3; 7778 const Register z = r4; 7779 7780 const Register tmp0 = r5; 7781 const Register tmp1 = r10; 7782 const Register tmp2 = r11; 7783 const Register tmp3 = r12; 7784 const Register tmp4 = r13; 7785 const Register tmp5 = r14; 7786 const Register tmp6 = r15; 7787 const Register tmp7 = r16; 7788 7789 BLOCK_COMMENT("Entry:"); 7790 __ enter(); // required for proper stackwalking of RuntimeStub frame 7791 __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 7792 __ leave(); // required for proper stackwalking of RuntimeStub frame 7793 __ ret(lr); 7794 7795 AOTCodeCache::store_stub(this, vmIntrinsics::_multiplyToLen, "multiplyToLen", start); 7796 return start; 7797 } 7798 7799 address generate_squareToLen() { 7800 // squareToLen algorithm for sizes 1..127 described in java code works 7801 // faster than multiply_to_len on some CPUs and slower on others, but 7802 // multiply_to_len shows a bit better overall results 7803 __ align(CodeEntryAlignment); 7804 StubId stub_id = StubId::stubgen_squareToLen_id; 7805 StubCodeMark mark(this, stub_id); 7806 address start = __ pc(); 7807 7808 if (AOTCodeCache::load_stub(this, vmIntrinsics::_squareToLen, "squareToLen", start)) { 7809 return start; 7810 } 7811 const Register x = r0; 7812 const Register xlen = r1; 7813 const Register z = r2; 7814 const Register y = r4; // == x 7815 const Register ylen = r5; // == xlen 7816 7817 const Register tmp0 = r3; 7818 const Register tmp1 = r10; 7819 const Register tmp2 = r11; 7820 const Register tmp3 = r12; 7821 const Register tmp4 = r13; 7822 const Register tmp5 = r14; 7823 const Register tmp6 = r15; 7824 const Register tmp7 = r16; 7825 7826 RegSet spilled_regs = RegSet::of(y, 
ylen); 7827 BLOCK_COMMENT("Entry:"); 7828 __ enter(); 7829 __ push(spilled_regs, sp); 7830 __ mov(y, x); 7831 __ mov(ylen, xlen); 7832 __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 7833 __ pop(spilled_regs, sp); 7834 __ leave(); 7835 __ ret(lr); 7836 7837 AOTCodeCache::store_stub(this, vmIntrinsics::_squareToLen, "squareToLen", start); 7838 return start; 7839 } 7840 7841 address generate_mulAdd() { 7842 __ align(CodeEntryAlignment); 7843 StubId stub_id = StubId::stubgen_mulAdd_id; 7844 StubCodeMark mark(this, stub_id); 7845 7846 address start = __ pc(); 7847 7848 if (AOTCodeCache::load_stub(this, vmIntrinsics::_mulAdd, "mulAdd", start)) { 7849 return start; 7850 } 7851 const Register out = r0; 7852 const Register in = r1; 7853 const Register offset = r2; 7854 const Register len = r3; 7855 const Register k = r4; 7856 7857 BLOCK_COMMENT("Entry:"); 7858 __ enter(); 7859 __ mul_add(out, in, offset, len, k); 7860 __ leave(); 7861 __ ret(lr); 7862 7863 AOTCodeCache::store_stub(this, vmIntrinsics::_mulAdd, "mulAdd", start); 7864 return start; 7865 } 7866 7867 // Arguments: 7868 // 7869 // Input: 7870 // c_rarg0 - newArr address 7871 // c_rarg1 - oldArr address 7872 // c_rarg2 - newIdx 7873 // c_rarg3 - shiftCount 7874 // c_rarg4 - numIter 7875 // 7876 address generate_bigIntegerRightShift() { 7877 __ align(CodeEntryAlignment); 7878 StubId stub_id = StubId::stubgen_bigIntegerRightShiftWorker_id; 7879 StubCodeMark mark(this, stub_id); 7880 address start = __ pc(); 7881 7882 Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit; 7883 7884 Register newArr = c_rarg0; 7885 Register oldArr = c_rarg1; 7886 Register newIdx = c_rarg2; 7887 Register shiftCount = c_rarg3; 7888 Register numIter = c_rarg4; 7889 Register idx = numIter; 7890 7891 Register newArrCur = rscratch1; 7892 Register shiftRevCount = rscratch2; 7893 Register oldArrCur = r13; 7894 Register oldArrNext = r14; 7895 7896 FloatRegister oldElem0 = v0; 7897 FloatRegister oldElem1 = v1; 7898 FloatRegister newElem = v2; 7899 FloatRegister shiftVCount = v3; 7900 FloatRegister shiftVRevCount = v4; 7901 7902 __ cbz(idx, Exit); 7903 7904 __ add(newArr, newArr, newIdx, Assembler::LSL, 2); 7905 7906 // left shift count 7907 __ movw(shiftRevCount, 32); 7908 __ subw(shiftRevCount, shiftRevCount, shiftCount); 7909 7910 // numIter too small to allow a 4-words SIMD loop, rolling back 7911 __ cmp(numIter, (u1)4); 7912 __ br(Assembler::LT, ShiftThree); 7913 7914 __ dup(shiftVCount, __ T4S, shiftCount); 7915 __ dup(shiftVRevCount, __ T4S, shiftRevCount); 7916 __ negr(shiftVCount, __ T4S, shiftVCount); 7917 7918 __ BIND(ShiftSIMDLoop); 7919 7920 // Calculate the load addresses 7921 __ sub(idx, idx, 4); 7922 __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2); 7923 __ add(newArrCur, newArr, idx, Assembler::LSL, 2); 7924 __ add(oldArrCur, oldArrNext, 4); 7925 7926 // Load 4 words and process 7927 __ ld1(oldElem0, __ T4S, Address(oldArrCur)); 7928 __ ld1(oldElem1, __ T4S, Address(oldArrNext)); 7929 __ ushl(oldElem0, __ T4S, oldElem0, shiftVCount); 7930 __ ushl(oldElem1, __ T4S, oldElem1, shiftVRevCount); 7931 __ orr(newElem, __ T16B, oldElem0, oldElem1); 7932 __ st1(newElem, __ T4S, Address(newArrCur)); 7933 7934 __ cmp(idx, (u1)4); 7935 __ br(Assembler::LT, ShiftTwoLoop); 7936 __ b(ShiftSIMDLoop); 7937 7938 __ BIND(ShiftTwoLoop); 7939 __ cbz(idx, Exit); 7940 __ cmp(idx, (u1)1); 7941 __ br(Assembler::EQ, ShiftOne); 7942 7943 // Calculate the load addresses 7944 __ sub(idx, idx, 2); 7945 __ add(oldArrNext, 
oldArr, idx, Assembler::LSL, 2); 7946 __ add(newArrCur, newArr, idx, Assembler::LSL, 2); 7947 __ add(oldArrCur, oldArrNext, 4); 7948 7949 // Load 2 words and process 7950 __ ld1(oldElem0, __ T2S, Address(oldArrCur)); 7951 __ ld1(oldElem1, __ T2S, Address(oldArrNext)); 7952 __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount); 7953 __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount); 7954 __ orr(newElem, __ T8B, oldElem0, oldElem1); 7955 __ st1(newElem, __ T2S, Address(newArrCur)); 7956 __ b(ShiftTwoLoop); 7957 7958 __ BIND(ShiftThree); 7959 __ tbz(idx, 1, ShiftOne); 7960 __ tbz(idx, 0, ShiftTwo); 7961 __ ldrw(r10, Address(oldArr, 12)); 7962 __ ldrw(r11, Address(oldArr, 8)); 7963 __ lsrvw(r10, r10, shiftCount); 7964 __ lslvw(r11, r11, shiftRevCount); 7965 __ orrw(r12, r10, r11); 7966 __ strw(r12, Address(newArr, 8)); 7967 7968 __ BIND(ShiftTwo); 7969 __ ldrw(r10, Address(oldArr, 8)); 7970 __ ldrw(r11, Address(oldArr, 4)); 7971 __ lsrvw(r10, r10, shiftCount); 7972 __ lslvw(r11, r11, shiftRevCount); 7973 __ orrw(r12, r10, r11); 7974 __ strw(r12, Address(newArr, 4)); 7975 7976 __ BIND(ShiftOne); 7977 __ ldrw(r10, Address(oldArr, 4)); 7978 __ ldrw(r11, Address(oldArr)); 7979 __ lsrvw(r10, r10, shiftCount); 7980 __ lslvw(r11, r11, shiftRevCount); 7981 __ orrw(r12, r10, r11); 7982 __ strw(r12, Address(newArr)); 7983 7984 __ BIND(Exit); 7985 __ ret(lr); 7986 7987 return start; 7988 } 7989 7990 // Arguments: 7991 // 7992 // Input: 7993 // c_rarg0 - newArr address 7994 // c_rarg1 - oldArr address 7995 // c_rarg2 - newIdx 7996 // c_rarg3 - shiftCount 7997 // c_rarg4 - numIter 7998 // 7999 address generate_bigIntegerLeftShift() { 8000 __ align(CodeEntryAlignment); 8001 StubId stub_id = StubId::stubgen_bigIntegerLeftShiftWorker_id; 8002 StubCodeMark mark(this, stub_id); 8003 address start = __ pc(); 8004 8005 Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit; 8006 8007 Register newArr = c_rarg0; 8008 Register oldArr = c_rarg1; 8009 Register newIdx = c_rarg2; 8010 Register shiftCount = c_rarg3; 8011 Register numIter = c_rarg4; 8012 8013 Register shiftRevCount = rscratch1; 8014 Register oldArrNext = rscratch2; 8015 8016 FloatRegister oldElem0 = v0; 8017 FloatRegister oldElem1 = v1; 8018 FloatRegister newElem = v2; 8019 FloatRegister shiftVCount = v3; 8020 FloatRegister shiftVRevCount = v4; 8021 8022 __ cbz(numIter, Exit); 8023 8024 __ add(oldArrNext, oldArr, 4); 8025 __ add(newArr, newArr, newIdx, Assembler::LSL, 2); 8026 8027 // right shift count 8028 __ movw(shiftRevCount, 32); 8029 __ subw(shiftRevCount, shiftRevCount, shiftCount); 8030 8031 // numIter too small to allow a 4-words SIMD loop, rolling back 8032 __ cmp(numIter, (u1)4); 8033 __ br(Assembler::LT, ShiftThree); 8034 8035 __ dup(shiftVCount, __ T4S, shiftCount); 8036 __ dup(shiftVRevCount, __ T4S, shiftRevCount); 8037 __ negr(shiftVRevCount, __ T4S, shiftVRevCount); 8038 8039 __ BIND(ShiftSIMDLoop); 8040 8041 // load 4 words and process 8042 __ ld1(oldElem0, __ T4S, __ post(oldArr, 16)); 8043 __ ld1(oldElem1, __ T4S, __ post(oldArrNext, 16)); 8044 __ ushl(oldElem0, __ T4S, oldElem0, shiftVCount); 8045 __ ushl(oldElem1, __ T4S, oldElem1, shiftVRevCount); 8046 __ orr(newElem, __ T16B, oldElem0, oldElem1); 8047 __ st1(newElem, __ T4S, __ post(newArr, 16)); 8048 __ sub(numIter, numIter, 4); 8049 8050 __ cmp(numIter, (u1)4); 8051 __ br(Assembler::LT, ShiftTwoLoop); 8052 __ b(ShiftSIMDLoop); 8053 8054 __ BIND(ShiftTwoLoop); 8055 __ cbz(numIter, Exit); 8056 __ cmp(numIter, (u1)1); 8057 __ br(Assembler::EQ, ShiftOne); 8058 8059 // 
load 2 words and process 8060 __ ld1(oldElem0, __ T2S, __ post(oldArr, 8)); 8061 __ ld1(oldElem1, __ T2S, __ post(oldArrNext, 8)); 8062 __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount); 8063 __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount); 8064 __ orr(newElem, __ T8B, oldElem0, oldElem1); 8065 __ st1(newElem, __ T2S, __ post(newArr, 8)); 8066 __ sub(numIter, numIter, 2); 8067 __ b(ShiftTwoLoop); 8068 8069 __ BIND(ShiftThree); 8070 __ ldrw(r10, __ post(oldArr, 4)); 8071 __ ldrw(r11, __ post(oldArrNext, 4)); 8072 __ lslvw(r10, r10, shiftCount); 8073 __ lsrvw(r11, r11, shiftRevCount); 8074 __ orrw(r12, r10, r11); 8075 __ strw(r12, __ post(newArr, 4)); 8076 __ tbz(numIter, 1, Exit); 8077 __ tbz(numIter, 0, ShiftOne); 8078 8079 __ BIND(ShiftTwo); 8080 __ ldrw(r10, __ post(oldArr, 4)); 8081 __ ldrw(r11, __ post(oldArrNext, 4)); 8082 __ lslvw(r10, r10, shiftCount); 8083 __ lsrvw(r11, r11, shiftRevCount); 8084 __ orrw(r12, r10, r11); 8085 __ strw(r12, __ post(newArr, 4)); 8086 8087 __ BIND(ShiftOne); 8088 __ ldrw(r10, Address(oldArr)); 8089 __ ldrw(r11, Address(oldArrNext)); 8090 __ lslvw(r10, r10, shiftCount); 8091 __ lsrvw(r11, r11, shiftRevCount); 8092 __ orrw(r12, r10, r11); 8093 __ strw(r12, Address(newArr)); 8094 8095 __ BIND(Exit); 8096 __ ret(lr); 8097 8098 return start; 8099 } 8100 8101 address generate_count_positives(address &count_positives_long) { 8102 const u1 large_loop_size = 64; 8103 const uint64_t UPPER_BIT_MASK=0x8080808080808080; 8104 int dcache_line = VM_Version::dcache_line_size(); 8105 8106 Register ary1 = r1, len = r2, result = r0; 8107 8108 __ align(CodeEntryAlignment); 8109 8110 StubId stub_id = StubId::stubgen_count_positives_id; 8111 StubCodeMark mark(this, stub_id); 8112 8113 address entry = __ pc(); 8114 8115 __ enter(); 8116 // precondition: a copy of len is already in result 8117 // __ mov(result, len); 8118 8119 Label RET_ADJUST, RET_ADJUST_16, RET_ADJUST_LONG, RET_NO_POP, RET_LEN, ALIGNED, LOOP16, CHECK_16, 8120 LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL; 8121 8122 __ cmp(len, (u1)15); 8123 __ br(Assembler::GT, LEN_OVER_15); 8124 // The only case when execution falls into this code is when pointer is near 8125 // the end of memory page and we have to avoid reading next page 8126 __ add(ary1, ary1, len); 8127 __ subs(len, len, 8); 8128 __ br(Assembler::GT, LEN_OVER_8); 8129 __ ldr(rscratch2, Address(ary1, -8)); 8130 __ sub(rscratch1, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes. 
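// n.b. len holds (length - 8) here, which is <= 0, so rscratch1 ends up as
// (8 - length) * 8. The lsrv below shifts out the low (8 - length) bytes of
// the loaded doubleword, i.e. (little-endian) the bytes that sit before the
// requested range, leaving only the in-range bytes to be tested against
// UPPER_BIT_MASK. E.g. for length == 3 the shift is 40 bits and only the 3
// requested bytes can contribute to the test.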
8131 __ lsrv(rscratch2, rscratch2, rscratch1); 8132 __ tst(rscratch2, UPPER_BIT_MASK); 8133 __ csel(result, zr, result, Assembler::NE); 8134 __ leave(); 8135 __ ret(lr); 8136 __ bind(LEN_OVER_8); 8137 __ ldp(rscratch1, rscratch2, Address(ary1, -16)); 8138 __ sub(len, len, 8); // no data dependency, so the sub can execute while loading 8139 __ tst(rscratch2, UPPER_BIT_MASK); 8140 __ br(Assembler::NE, RET_NO_POP); 8141 __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes 8142 __ lsrv(rscratch1, rscratch1, rscratch2); 8143 __ tst(rscratch1, UPPER_BIT_MASK); 8144 __ bind(RET_NO_POP); 8145 __ csel(result, zr, result, Assembler::NE); 8146 __ leave(); 8147 __ ret(lr); 8148 8149 Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10; 8150 const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6; 8151 8152 count_positives_long = __ pc(); // 2nd entry point 8153 8154 __ enter(); 8155 8156 __ bind(LEN_OVER_15); 8157 __ push(spilled_regs, sp); 8158 __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment 8159 __ cbz(rscratch2, ALIGNED); 8160 __ ldp(tmp6, tmp1, Address(ary1)); 8161 __ mov(tmp5, 16); 8162 __ sub(rscratch1, tmp5, rscratch2); // number of bytes until the aligned address 8163 __ add(ary1, ary1, rscratch1); 8164 __ orr(tmp6, tmp6, tmp1); 8165 __ tst(tmp6, UPPER_BIT_MASK); 8166 __ br(Assembler::NE, RET_ADJUST); 8167 __ sub(len, len, rscratch1); 8168 8169 __ bind(ALIGNED); 8170 __ cmp(len, large_loop_size); 8171 __ br(Assembler::LT, CHECK_16); 8172 // Perform a 16-byte load as an early return in the pre-loop to handle the 8173 // situation when an initially aligned large array has negative values at its 8174 // starting bytes, so LARGE_LOOP would do 4 reads instead of 1 (in the worst 8175 // case), which is slower. Cases with negative bytes further ahead won't be 8176 // affected much. In fact, it will be faster due to early loads, fewer 8177 // instructions and fewer branches in LARGE_LOOP. 8178 __ ldp(tmp6, tmp1, Address(__ post(ary1, 16))); 8179 __ sub(len, len, 16); 8180 __ orr(tmp6, tmp6, tmp1); 8181 __ tst(tmp6, UPPER_BIT_MASK); 8182 __ br(Assembler::NE, RET_ADJUST_16); 8183 __ cmp(len, large_loop_size); 8184 __ br(Assembler::LT, CHECK_16); 8185 8186 if (SoftwarePrefetchHintDistance >= 0 8187 && SoftwarePrefetchHintDistance >= dcache_line) { 8188 // initial prefetch 8189 __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line)); 8190 } 8191 __ bind(LARGE_LOOP); 8192 if (SoftwarePrefetchHintDistance >= 0) { 8193 __ prfm(Address(ary1, SoftwarePrefetchHintDistance)); 8194 } 8195 // Issue the load instructions first, since this can save a few CPU/MEM cycles. 8196 // Also, instead of 4 triples of "orr(...); andr(...); cbnz(...);" (one per ldp), 8197 // it is better to generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...), which saves 3 8198 // instructions per cycle and has fewer branches. The downside is that early 8199 // return is disabled, so all 64 bytes are loaded and checked every time.
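// Illustrative C-level sketch of one LARGE_LOOP iteration below (exposition only, not generated code):
//   uint64_t acc = 0;
//   for (int i = 0; i < 8; i++) acc |= ((const uint64_t*)ary1)[i];  // 64 bytes
//   ary1 += 64; len -= 64;
//   if (acc & UPPER_BIT_MASK) goto RET_ADJUST_LONG;  // some byte has its sign bit set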
8200 __ ldp(tmp2, tmp3, Address(ary1)); 8201 __ ldp(tmp4, tmp5, Address(ary1, 16)); 8202 __ ldp(rscratch1, rscratch2, Address(ary1, 32)); 8203 __ ldp(tmp6, tmp1, Address(ary1, 48)); 8204 __ add(ary1, ary1, large_loop_size); 8205 __ sub(len, len, large_loop_size); 8206 __ orr(tmp2, tmp2, tmp3); 8207 __ orr(tmp4, tmp4, tmp5); 8208 __ orr(rscratch1, rscratch1, rscratch2); 8209 __ orr(tmp6, tmp6, tmp1); 8210 __ orr(tmp2, tmp2, tmp4); 8211 __ orr(rscratch1, rscratch1, tmp6); 8212 __ orr(tmp2, tmp2, rscratch1); 8213 __ tst(tmp2, UPPER_BIT_MASK); 8214 __ br(Assembler::NE, RET_ADJUST_LONG); 8215 __ cmp(len, large_loop_size); 8216 __ br(Assembler::GE, LARGE_LOOP); 8217 8218 __ bind(CHECK_16); // small 16-byte load pre-loop 8219 __ cmp(len, (u1)16); 8220 __ br(Assembler::LT, POST_LOOP16); 8221 8222 __ bind(LOOP16); // small 16-byte load loop 8223 __ ldp(tmp2, tmp3, Address(__ post(ary1, 16))); 8224 __ sub(len, len, 16); 8225 __ orr(tmp2, tmp2, tmp3); 8226 __ tst(tmp2, UPPER_BIT_MASK); 8227 __ br(Assembler::NE, RET_ADJUST_16); 8228 __ cmp(len, (u1)16); 8229 __ br(Assembler::GE, LOOP16); // 16-byte load loop end 8230 8231 __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally 8232 __ cmp(len, (u1)8); 8233 __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL); 8234 __ ldr(tmp3, Address(__ post(ary1, 8))); 8235 __ tst(tmp3, UPPER_BIT_MASK); 8236 __ br(Assembler::NE, RET_ADJUST); 8237 __ sub(len, len, 8); 8238 8239 __ bind(POST_LOOP16_LOAD_TAIL); 8240 __ cbz(len, RET_LEN); // Can't shift left by 64 when len==0 8241 __ ldr(tmp1, Address(ary1)); 8242 __ mov(tmp2, 64); 8243 __ sub(tmp4, tmp2, len, __ LSL, 3); 8244 __ lslv(tmp1, tmp1, tmp4); 8245 __ tst(tmp1, UPPER_BIT_MASK); 8246 __ br(Assembler::NE, RET_ADJUST); 8247 // Fallthrough 8248 8249 __ bind(RET_LEN); 8250 __ pop(spilled_regs, sp); 8251 __ leave(); 8252 __ ret(lr); 8253 8254 // difference result - len is the count of guaranteed to be 8255 // positive bytes 8256 8257 __ bind(RET_ADJUST_LONG); 8258 __ add(len, len, (u1)(large_loop_size - 16)); 8259 __ bind(RET_ADJUST_16); 8260 __ add(len, len, 16); 8261 __ bind(RET_ADJUST); 8262 __ pop(spilled_regs, sp); 8263 __ leave(); 8264 __ sub(result, result, len); 8265 __ ret(lr); 8266 8267 return entry; 8268 } 8269 8270 void generate_large_array_equals_loop_nonsimd(int loopThreshold, 8271 bool usePrefetch, Label &NOT_EQUAL) { 8272 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 8273 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11, 8274 tmp7 = r12, tmp8 = r13; 8275 Label LOOP; 8276 8277 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 8278 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 8279 __ bind(LOOP); 8280 if (usePrefetch) { 8281 __ prfm(Address(a1, SoftwarePrefetchHintDistance)); 8282 __ prfm(Address(a2, SoftwarePrefetchHintDistance)); 8283 } 8284 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize))); 8285 __ eor(tmp1, tmp1, tmp2); 8286 __ eor(tmp3, tmp3, tmp4); 8287 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize))); 8288 __ orr(tmp1, tmp1, tmp3); 8289 __ cbnz(tmp1, NOT_EQUAL); 8290 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 8291 __ eor(tmp5, tmp5, tmp6); 8292 __ eor(tmp7, tmp7, tmp8); 8293 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 8294 __ orr(tmp5, tmp5, tmp7); 8295 __ cbnz(tmp5, NOT_EQUAL); 8296 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize))); 8297 __ eor(tmp1, tmp1, tmp2); 8298 __ eor(tmp3, tmp3, tmp4); 8299 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize))); 8300 __ orr(tmp1, tmp1, tmp3); 8301 __ 
cbnz(tmp1, NOT_EQUAL); 8302 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 8303 __ eor(tmp5, tmp5, tmp6); 8304 __ sub(cnt1, cnt1, 8 * wordSize); 8305 __ eor(tmp7, tmp7, tmp8); 8306 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 8307 // tmp6 is not used. MacroAssembler::subs is used here (rather than 8308 // cmp) because subs allows an unlimited range of immediate operand. 8309 __ subs(tmp6, cnt1, loopThreshold); 8310 __ orr(tmp5, tmp5, tmp7); 8311 __ cbnz(tmp5, NOT_EQUAL); 8312 __ br(__ GE, LOOP); 8313 // post-loop 8314 __ eor(tmp1, tmp1, tmp2); 8315 __ eor(tmp3, tmp3, tmp4); 8316 __ orr(tmp1, tmp1, tmp3); 8317 __ sub(cnt1, cnt1, 2 * wordSize); 8318 __ cbnz(tmp1, NOT_EQUAL); 8319 } 8320 8321 void generate_large_array_equals_loop_simd(int loopThreshold, 8322 bool usePrefetch, Label &NOT_EQUAL) { 8323 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 8324 tmp2 = rscratch2; 8325 Label LOOP; 8326 8327 __ bind(LOOP); 8328 if (usePrefetch) { 8329 __ prfm(Address(a1, SoftwarePrefetchHintDistance)); 8330 __ prfm(Address(a2, SoftwarePrefetchHintDistance)); 8331 } 8332 __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize))); 8333 __ sub(cnt1, cnt1, 8 * wordSize); 8334 __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize))); 8335 __ subs(tmp1, cnt1, loopThreshold); 8336 __ eor(v0, __ T16B, v0, v4); 8337 __ eor(v1, __ T16B, v1, v5); 8338 __ eor(v2, __ T16B, v2, v6); 8339 __ eor(v3, __ T16B, v3, v7); 8340 __ orr(v0, __ T16B, v0, v1); 8341 __ orr(v1, __ T16B, v2, v3); 8342 __ orr(v0, __ T16B, v0, v1); 8343 __ umov(tmp1, v0, __ D, 0); 8344 __ umov(tmp2, v0, __ D, 1); 8345 __ orr(tmp1, tmp1, tmp2); 8346 __ cbnz(tmp1, NOT_EQUAL); 8347 __ br(__ GE, LOOP); 8348 } 8349 8350 // a1 = r1 - array1 address 8351 // a2 = r2 - array2 address 8352 // result = r0 - return value. Already contains "false" 8353 // cnt1 = r10 - amount of elements left to check, reduced by wordSize 8354 // r3-r5 are reserved temporary registers 8355 // Clobbers: v0-v7 when UseSIMDForArrayEquals, rscratch1, rscratch2 8356 address generate_large_array_equals() { 8357 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 8358 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11, 8359 tmp7 = r12, tmp8 = r13; 8360 Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP, 8361 SMALL_LOOP, POST_LOOP; 8362 const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16; 8363 // calculate if at least 32 prefetched bytes are used 8364 int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32; 8365 int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE); 8366 RegSet spilled_regs = RegSet::range(tmp6, tmp8); 8367 assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4, 8368 tmp5, tmp6, tmp7, tmp8); 8369 8370 __ align(CodeEntryAlignment); 8371 8372 StubId stub_id = StubId::stubgen_large_array_equals_id; 8373 StubCodeMark mark(this, stub_id); 8374 8375 address entry = __ pc(); 8376 __ enter(); 8377 __ sub(cnt1, cnt1, wordSize); // first 8 bytes were loaded outside of stub 8378 // also advance pointers to use post-increment instead of pre-increment 8379 __ add(a1, a1, wordSize); 8380 __ add(a2, a2, wordSize); 8381 if (AvoidUnalignedAccesses) { 8382 // both implementations (SIMD/nonSIMD) are using relatively large load 8383 // instructions (ld1/ldp), which has huge penalty (up to x2 exec time) 8384 // on some CPUs in case of address is not at least 16-byte aligned. 
8385 // Arrays are 8-byte aligned currently, so, we can make additional 8-byte 8386 // load if needed at least for 1st address and make if 16-byte aligned. 8387 Label ALIGNED16; 8388 __ tbz(a1, 3, ALIGNED16); 8389 __ ldr(tmp1, Address(__ post(a1, wordSize))); 8390 __ ldr(tmp2, Address(__ post(a2, wordSize))); 8391 __ sub(cnt1, cnt1, wordSize); 8392 __ eor(tmp1, tmp1, tmp2); 8393 __ cbnz(tmp1, NOT_EQUAL_NO_POP); 8394 __ bind(ALIGNED16); 8395 } 8396 if (UseSIMDForArrayEquals) { 8397 if (SoftwarePrefetchHintDistance >= 0) { 8398 __ subs(tmp1, cnt1, prefetchLoopThreshold); 8399 __ br(__ LE, NO_PREFETCH_LARGE_LOOP); 8400 generate_large_array_equals_loop_simd(prefetchLoopThreshold, 8401 /* prfm = */ true, NOT_EQUAL); 8402 __ subs(zr, cnt1, nonPrefetchLoopThreshold); 8403 __ br(__ LT, TAIL); 8404 } 8405 __ bind(NO_PREFETCH_LARGE_LOOP); 8406 generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold, 8407 /* prfm = */ false, NOT_EQUAL); 8408 } else { 8409 __ push(spilled_regs, sp); 8410 if (SoftwarePrefetchHintDistance >= 0) { 8411 __ subs(tmp1, cnt1, prefetchLoopThreshold); 8412 __ br(__ LE, NO_PREFETCH_LARGE_LOOP); 8413 generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold, 8414 /* prfm = */ true, NOT_EQUAL); 8415 __ subs(zr, cnt1, nonPrefetchLoopThreshold); 8416 __ br(__ LT, TAIL); 8417 } 8418 __ bind(NO_PREFETCH_LARGE_LOOP); 8419 generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold, 8420 /* prfm = */ false, NOT_EQUAL); 8421 } 8422 __ bind(TAIL); 8423 __ cbz(cnt1, EQUAL); 8424 __ subs(cnt1, cnt1, wordSize); 8425 __ br(__ LE, POST_LOOP); 8426 __ bind(SMALL_LOOP); 8427 __ ldr(tmp1, Address(__ post(a1, wordSize))); 8428 __ ldr(tmp2, Address(__ post(a2, wordSize))); 8429 __ subs(cnt1, cnt1, wordSize); 8430 __ eor(tmp1, tmp1, tmp2); 8431 __ cbnz(tmp1, NOT_EQUAL); 8432 __ br(__ GT, SMALL_LOOP); 8433 __ bind(POST_LOOP); 8434 __ ldr(tmp1, Address(a1, cnt1)); 8435 __ ldr(tmp2, Address(a2, cnt1)); 8436 __ eor(tmp1, tmp1, tmp2); 8437 __ cbnz(tmp1, NOT_EQUAL); 8438 __ bind(EQUAL); 8439 __ mov(result, true); 8440 __ bind(NOT_EQUAL); 8441 if (!UseSIMDForArrayEquals) { 8442 __ pop(spilled_regs, sp); 8443 } 8444 __ bind(NOT_EQUAL_NO_POP); 8445 __ leave(); 8446 __ ret(lr); 8447 return entry; 8448 } 8449 8450 // result = r0 - return value. Contains initial hashcode value on entry. 
8451 // ary = r1 - array address 8452 // cnt = r2 - elements count 8453 // Clobbers: v0-v13, rscratch1, rscratch2 8454 address generate_large_arrays_hashcode(BasicType eltype) { 8455 const Register result = r0, ary = r1, cnt = r2; 8456 const FloatRegister vdata0 = v3, vdata1 = v2, vdata2 = v1, vdata3 = v0; 8457 const FloatRegister vmul0 = v4, vmul1 = v5, vmul2 = v6, vmul3 = v7; 8458 const FloatRegister vpow = v12; // powers of 31: <31^3, ..., 31^0> 8459 const FloatRegister vpowm = v13; 8460 8461 ARRAYS_HASHCODE_REGISTERS; 8462 8463 Label SMALL_LOOP, LARGE_LOOP_PREHEADER, LARGE_LOOP, TAIL, TAIL_SHORTCUT, BR_BASE; 8464 8465 unsigned int vf; // vectorization factor 8466 bool multiply_by_halves; 8467 Assembler::SIMD_Arrangement load_arrangement; 8468 switch (eltype) { 8469 case T_BOOLEAN: 8470 case T_BYTE: 8471 load_arrangement = Assembler::T8B; 8472 multiply_by_halves = true; 8473 vf = 8; 8474 break; 8475 case T_CHAR: 8476 case T_SHORT: 8477 load_arrangement = Assembler::T8H; 8478 multiply_by_halves = true; 8479 vf = 8; 8480 break; 8481 case T_INT: 8482 load_arrangement = Assembler::T4S; 8483 multiply_by_halves = false; 8484 vf = 4; 8485 break; 8486 default: 8487 ShouldNotReachHere(); 8488 } 8489 8490 // Unroll factor 8491 const unsigned uf = 4; 8492 8493 // Effective vectorization factor 8494 const unsigned evf = vf * uf; 8495 8496 __ align(CodeEntryAlignment); 8497 8498 StubId stub_id; 8499 switch (eltype) { 8500 case T_BOOLEAN: 8501 stub_id = StubId::stubgen_large_arrays_hashcode_boolean_id; 8502 break; 8503 case T_BYTE: 8504 stub_id = StubId::stubgen_large_arrays_hashcode_byte_id; 8505 break; 8506 case T_CHAR: 8507 stub_id = StubId::stubgen_large_arrays_hashcode_char_id; 8508 break; 8509 case T_SHORT: 8510 stub_id = StubId::stubgen_large_arrays_hashcode_short_id; 8511 break; 8512 case T_INT: 8513 stub_id = StubId::stubgen_large_arrays_hashcode_int_id; 8514 break; 8515 default: 8516 stub_id = StubId::NO_STUBID; 8517 ShouldNotReachHere(); 8518 }; 8519 8520 StubCodeMark mark(this, stub_id); 8521 8522 address entry = __ pc(); 8523 __ enter(); 8524 8525 // Put 0-3'th powers of 31 into a single SIMD register together. The register will be used in 8526 // the SMALL and LARGE LOOPS' epilogues. The initialization is hoisted here and the register's 8527 // value shouldn't change throughout both loops. 8528 __ movw(rscratch1, intpow(31U, 3)); 8529 __ mov(vpow, Assembler::S, 0, rscratch1); 8530 __ movw(rscratch1, intpow(31U, 2)); 8531 __ mov(vpow, Assembler::S, 1, rscratch1); 8532 __ movw(rscratch1, intpow(31U, 1)); 8533 __ mov(vpow, Assembler::S, 2, rscratch1); 8534 __ movw(rscratch1, intpow(31U, 0)); 8535 __ mov(vpow, Assembler::S, 3, rscratch1); 8536 8537 __ mov(vmul0, Assembler::T16B, 0); 8538 __ mov(vmul0, Assembler::S, 3, result); 8539 8540 __ andr(rscratch2, cnt, (uf - 1) * vf); 8541 __ cbz(rscratch2, LARGE_LOOP_PREHEADER); 8542 8543 __ movw(rscratch1, intpow(31U, multiply_by_halves ? 
vf / 2 : vf)); 8544 __ mov(vpowm, Assembler::S, 0, rscratch1); 8545 8546 // SMALL LOOP 8547 __ bind(SMALL_LOOP); 8548 8549 __ ld1(vdata0, load_arrangement, Address(__ post(ary, vf * type2aelembytes(eltype)))); 8550 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0); 8551 __ subsw(rscratch2, rscratch2, vf); 8552 8553 if (load_arrangement == Assembler::T8B) { 8554 // Extend 8B to 8H to be able to use vector multiply 8555 // instructions 8556 assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H"); 8557 if (is_signed_subword_type(eltype)) { 8558 __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement); 8559 } else { 8560 __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement); 8561 } 8562 } 8563 8564 switch (load_arrangement) { 8565 case Assembler::T4S: 8566 __ addv(vmul0, load_arrangement, vmul0, vdata0); 8567 break; 8568 case Assembler::T8B: 8569 case Assembler::T8H: 8570 assert(is_subword_type(eltype), "subword type expected"); 8571 if (is_signed_subword_type(eltype)) { 8572 __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H); 8573 } else { 8574 __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H); 8575 } 8576 break; 8577 default: 8578 __ should_not_reach_here(); 8579 } 8580 8581 // Process the upper half of a vector 8582 if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) { 8583 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0); 8584 if (is_signed_subword_type(eltype)) { 8585 __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H); 8586 } else { 8587 __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H); 8588 } 8589 } 8590 8591 __ br(Assembler::HI, SMALL_LOOP); 8592 8593 // SMALL LOOP'S EPILOQUE 8594 __ lsr(rscratch2, cnt, exact_log2(evf)); 8595 __ cbnz(rscratch2, LARGE_LOOP_PREHEADER); 8596 8597 __ mulv(vmul0, Assembler::T4S, vmul0, vpow); 8598 __ addv(vmul0, Assembler::T4S, vmul0); 8599 __ umov(result, vmul0, Assembler::S, 0); 8600 8601 // TAIL 8602 __ bind(TAIL); 8603 8604 // The andr performs cnt % vf. The subtract shifted by 3 offsets past vf - 1 - (cnt % vf) pairs 8605 // of load + madd insns i.e. it only executes cnt % vf load + madd pairs. 8606 assert(is_power_of_2(vf), "can't use this value to calculate the jump target PC"); 8607 __ andr(rscratch2, cnt, vf - 1); 8608 __ bind(TAIL_SHORTCUT); 8609 __ adr(rscratch1, BR_BASE); 8610 // For Cortex-A53 offset is 4 because 2 nops are generated. 8611 __ sub(rscratch1, rscratch1, rscratch2, ext::uxtw, VM_Version::supports_a53mac() ? 4 : 3); 8612 __ movw(rscratch2, 0x1f); 8613 __ br(rscratch1); 8614 8615 for (size_t i = 0; i < vf - 1; ++i) { 8616 __ load(rscratch1, Address(__ post(ary, type2aelembytes(eltype))), 8617 eltype); 8618 __ maddw(result, result, rscratch2, rscratch1); 8619 // maddw generates an extra nop for Cortex-A53 (see maddw definition in macroAssembler). 8620 // Generate 2nd nop to have 4 instructions per iteration. 
8621 if (VM_Version::supports_a53mac()) { 8622 __ nop(); 8623 } 8624 } 8625 __ bind(BR_BASE); 8626 8627 __ leave(); 8628 __ ret(lr); 8629 8630 // LARGE LOOP 8631 __ bind(LARGE_LOOP_PREHEADER); 8632 8633 __ lsr(rscratch2, cnt, exact_log2(evf)); 8634 8635 if (multiply_by_halves) { 8636 // 31^4 - multiplier between lower and upper parts of a register 8637 __ movw(rscratch1, intpow(31U, vf / 2)); 8638 __ mov(vpowm, Assembler::S, 1, rscratch1); 8639 // 31^28 - remainder of the iteraion multiplier, 28 = 32 - 4 8640 __ movw(rscratch1, intpow(31U, evf - vf / 2)); 8641 __ mov(vpowm, Assembler::S, 0, rscratch1); 8642 } else { 8643 // 31^16 8644 __ movw(rscratch1, intpow(31U, evf)); 8645 __ mov(vpowm, Assembler::S, 0, rscratch1); 8646 } 8647 8648 __ mov(vmul3, Assembler::T16B, 0); 8649 __ mov(vmul2, Assembler::T16B, 0); 8650 __ mov(vmul1, Assembler::T16B, 0); 8651 8652 __ bind(LARGE_LOOP); 8653 8654 __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 0); 8655 __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 0); 8656 __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 0); 8657 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0); 8658 8659 __ ld1(vdata3, vdata2, vdata1, vdata0, load_arrangement, 8660 Address(__ post(ary, evf * type2aelembytes(eltype)))); 8661 8662 if (load_arrangement == Assembler::T8B) { 8663 // Extend 8B to 8H to be able to use vector multiply 8664 // instructions 8665 assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H"); 8666 if (is_signed_subword_type(eltype)) { 8667 __ sxtl(vdata3, Assembler::T8H, vdata3, load_arrangement); 8668 __ sxtl(vdata2, Assembler::T8H, vdata2, load_arrangement); 8669 __ sxtl(vdata1, Assembler::T8H, vdata1, load_arrangement); 8670 __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement); 8671 } else { 8672 __ uxtl(vdata3, Assembler::T8H, vdata3, load_arrangement); 8673 __ uxtl(vdata2, Assembler::T8H, vdata2, load_arrangement); 8674 __ uxtl(vdata1, Assembler::T8H, vdata1, load_arrangement); 8675 __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement); 8676 } 8677 } 8678 8679 switch (load_arrangement) { 8680 case Assembler::T4S: 8681 __ addv(vmul3, load_arrangement, vmul3, vdata3); 8682 __ addv(vmul2, load_arrangement, vmul2, vdata2); 8683 __ addv(vmul1, load_arrangement, vmul1, vdata1); 8684 __ addv(vmul0, load_arrangement, vmul0, vdata0); 8685 break; 8686 case Assembler::T8B: 8687 case Assembler::T8H: 8688 assert(is_subword_type(eltype), "subword type expected"); 8689 if (is_signed_subword_type(eltype)) { 8690 __ saddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H); 8691 __ saddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H); 8692 __ saddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H); 8693 __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H); 8694 } else { 8695 __ uaddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H); 8696 __ uaddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H); 8697 __ uaddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H); 8698 __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H); 8699 } 8700 break; 8701 default: 8702 __ should_not_reach_here(); 8703 } 8704 8705 // Process the upper half of a vector 8706 if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) { 8707 __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 1); 8708 __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 1); 8709 __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 1); 8710 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 1); 8711 if (is_signed_subword_type(eltype)) { 8712 __ saddwv2(vmul3, 
vmul3, Assembler::T4S, vdata3, Assembler::T8H); 8713 __ saddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H); 8714 __ saddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H); 8715 __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H); 8716 } else { 8717 __ uaddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H); 8718 __ uaddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H); 8719 __ uaddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H); 8720 __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H); 8721 } 8722 } 8723 8724 __ subsw(rscratch2, rscratch2, 1); 8725 __ br(Assembler::HI, LARGE_LOOP); 8726 8727 __ mulv(vmul3, Assembler::T4S, vmul3, vpow); 8728 __ addv(vmul3, Assembler::T4S, vmul3); 8729 __ umov(result, vmul3, Assembler::S, 0); 8730 8731 __ mov(rscratch2, intpow(31U, vf)); 8732 8733 __ mulv(vmul2, Assembler::T4S, vmul2, vpow); 8734 __ addv(vmul2, Assembler::T4S, vmul2); 8735 __ umov(rscratch1, vmul2, Assembler::S, 0); 8736 __ maddw(result, result, rscratch2, rscratch1); 8737 8738 __ mulv(vmul1, Assembler::T4S, vmul1, vpow); 8739 __ addv(vmul1, Assembler::T4S, vmul1); 8740 __ umov(rscratch1, vmul1, Assembler::S, 0); 8741 __ maddw(result, result, rscratch2, rscratch1); 8742 8743 __ mulv(vmul0, Assembler::T4S, vmul0, vpow); 8744 __ addv(vmul0, Assembler::T4S, vmul0); 8745 __ umov(rscratch1, vmul0, Assembler::S, 0); 8746 __ maddw(result, result, rscratch2, rscratch1); 8747 8748 __ andr(rscratch2, cnt, vf - 1); 8749 __ cbnz(rscratch2, TAIL_SHORTCUT); 8750 8751 __ leave(); 8752 __ ret(lr); 8753 8754 return entry; 8755 } 8756 8757 address generate_dsin_dcos(bool isCos) { 8758 __ align(CodeEntryAlignment); 8759 StubId stub_id = (isCos ? StubId::stubgen_dcos_id : StubId::stubgen_dsin_id); 8760 StubCodeMark mark(this, stub_id); 8761 address start = __ pc(); 8762 __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw, 8763 (address)StubRoutines::aarch64::_two_over_pi, 8764 (address)StubRoutines::aarch64::_pio2, 8765 (address)StubRoutines::aarch64::_dsin_coef, 8766 (address)StubRoutines::aarch64::_dcos_coef); 8767 return start; 8768 } 8769 8770 // code for comparing 16 characters of strings with Latin1 and Utf16 encoding 8771 void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1, 8772 Label &DIFF2) { 8773 Register cnt1 = r2, tmp2 = r11, tmp3 = r12; 8774 FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2; 8775 8776 __ ldrq(vtmp, Address(__ post(tmp2, 16))); 8777 __ ldr(tmpU, Address(__ post(cnt1, 8))); 8778 __ zip1(vtmp3, __ T16B, vtmp, vtmpZ); 8779 // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3 8780 8781 __ fmovd(tmpL, vtmp3); 8782 __ eor(rscratch2, tmp3, tmpL); 8783 __ cbnz(rscratch2, DIFF2); 8784 8785 __ ldr(tmp3, Address(__ post(cnt1, 8))); 8786 __ umov(tmpL, vtmp3, __ D, 1); 8787 __ eor(rscratch2, tmpU, tmpL); 8788 __ cbnz(rscratch2, DIFF1); 8789 8790 __ zip2(vtmp, __ T16B, vtmp, vtmpZ); 8791 __ ldr(tmpU, Address(__ post(cnt1, 8))); 8792 __ fmovd(tmpL, vtmp); 8793 __ eor(rscratch2, tmp3, tmpL); 8794 __ cbnz(rscratch2, DIFF2); 8795 8796 __ ldr(tmp3, Address(__ post(cnt1, 8))); 8797 __ umov(tmpL, vtmp, __ D, 1); 8798 __ eor(rscratch2, tmpU, tmpL); 8799 __ cbnz(rscratch2, DIFF1); 8800 } 8801 8802 // r0 = result 8803 // r1 = str1 8804 // r2 = cnt1 8805 // r3 = str2 8806 // r4 = cnt2 8807 // r10 = tmp1 8808 // r11 = tmp2 8809 address generate_compare_long_string_different_encoding(bool isLU) { 8810 __ align(CodeEntryAlignment); 8811 StubId stub_id = (isLU ? 
StubId::stubgen_compare_long_string_LU_id : StubId::stubgen_compare_long_string_UL_id); 8812 StubCodeMark mark(this, stub_id); 8813 address entry = __ pc(); 8814 Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2, 8815 DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH, 8816 LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2; 8817 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 8818 tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14; 8819 FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2; 8820 RegSet spilled_regs = RegSet::of(tmp3, tmp4); 8821 8822 int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2); 8823 8824 __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ); 8825 // cnt2 == amount of characters left to compare 8826 // Check already loaded first 4 symbols(vtmp and tmp2(LU)/tmp1(UL)) 8827 __ zip1(vtmp, __ T8B, vtmp, vtmpZ); 8828 __ add(str1, str1, isLU ? wordSize/2 : wordSize); 8829 __ add(str2, str2, isLU ? wordSize : wordSize/2); 8830 __ fmovd(isLU ? tmp1 : tmp2, vtmp); 8831 __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case. 8832 __ eor(rscratch2, tmp1, tmp2); 8833 __ mov(rscratch1, tmp2); 8834 __ cbnz(rscratch2, CALCULATE_DIFFERENCE); 8835 Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison 8836 tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison 8837 __ push(spilled_regs, sp); 8838 __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load 8839 __ mov(cnt1, isLU ? str2 : str1); // init the pointer to U next load 8840 8841 __ ldr(tmp3, Address(__ post(cnt1, 8))); 8842 8843 if (SoftwarePrefetchHintDistance >= 0) { 8844 __ subs(rscratch2, cnt2, prefetchLoopExitCondition); 8845 __ br(__ LT, NO_PREFETCH); 8846 __ bind(LARGE_LOOP_PREFETCH); 8847 __ prfm(Address(tmp2, SoftwarePrefetchHintDistance)); 8848 __ mov(tmp4, 2); 8849 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance)); 8850 __ bind(LARGE_LOOP_PREFETCH_REPEAT1); 8851 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 8852 __ subs(tmp4, tmp4, 1); 8853 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1); 8854 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance)); 8855 __ mov(tmp4, 2); 8856 __ bind(LARGE_LOOP_PREFETCH_REPEAT2); 8857 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 8858 __ subs(tmp4, tmp4, 1); 8859 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2); 8860 __ sub(cnt2, cnt2, 64); 8861 __ subs(rscratch2, cnt2, prefetchLoopExitCondition); 8862 __ br(__ GE, LARGE_LOOP_PREFETCH); 8863 } 8864 __ cbz(cnt2, LOAD_LAST); // no characters left except last load 8865 __ bind(NO_PREFETCH); 8866 __ subs(cnt2, cnt2, 16); 8867 __ br(__ LT, TAIL); 8868 __ align(OptoLoopAlignment); 8869 __ bind(SMALL_LOOP); // smaller loop 8870 __ subs(cnt2, cnt2, 16); 8871 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 8872 __ br(__ GE, SMALL_LOOP); 8873 __ cmn(cnt2, (u1)16); 8874 __ br(__ EQ, LOAD_LAST); 8875 __ bind(TAIL); // 1..15 characters left until last load (last 4 characters) 8876 __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string 8877 __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string 8878 __ ldr(tmp3, Address(cnt1, -8)); 8879 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load 8880 __ b(LOAD_LAST); 8881 __ bind(DIFF2); 8882 __ mov(tmpU, tmp3); 8883 __ bind(DIFF1); 8884 __ pop(spilled_regs, sp); 8885 __ b(CALCULATE_DIFFERENCE); 8886 __ bind(LOAD_LAST); 8887 // Last 4 UTF-16 characters are already pre-loaded into tmp3 by 
compare_string_16_x_LU. 8888 // No need to load it again 8889 __ mov(tmpU, tmp3); 8890 __ pop(spilled_regs, sp); 8891 8892 // tmp2 points to the address of the last 4 Latin1 characters right now 8893 __ ldrs(vtmp, Address(tmp2)); 8894 __ zip1(vtmp, __ T8B, vtmp, vtmpZ); 8895 __ fmovd(tmpL, vtmp); 8896 8897 __ eor(rscratch2, tmpU, tmpL); 8898 __ cbz(rscratch2, DONE); 8899 8900 // Find the first different characters in the longwords and 8901 // compute their difference. 8902 __ bind(CALCULATE_DIFFERENCE); 8903 __ rev(rscratch2, rscratch2); 8904 __ clz(rscratch2, rscratch2); 8905 __ andr(rscratch2, rscratch2, -16); 8906 __ lsrv(tmp1, tmp1, rscratch2); 8907 __ uxthw(tmp1, tmp1); 8908 __ lsrv(rscratch1, rscratch1, rscratch2); 8909 __ uxthw(rscratch1, rscratch1); 8910 __ subw(result, tmp1, rscratch1); 8911 __ bind(DONE); 8912 __ ret(lr); 8913 return entry; 8914 } 8915 8916 // r0 = input (float16) 8917 // v0 = result (float) 8918 // v1 = temporary float register 8919 address generate_float16ToFloat() { 8920 __ align(CodeEntryAlignment); 8921 StubId stub_id = StubId::stubgen_hf2f_id; 8922 StubCodeMark mark(this, stub_id); 8923 address entry = __ pc(); 8924 BLOCK_COMMENT("Entry:"); 8925 __ flt16_to_flt(v0, r0, v1); 8926 __ ret(lr); 8927 return entry; 8928 } 8929 8930 // v0 = input (float) 8931 // r0 = result (float16) 8932 // v1 = temporary float register 8933 address generate_floatToFloat16() { 8934 __ align(CodeEntryAlignment); 8935 StubId stub_id = StubId::stubgen_f2hf_id; 8936 StubCodeMark mark(this, stub_id); 8937 address entry = __ pc(); 8938 BLOCK_COMMENT("Entry:"); 8939 __ flt_to_flt16(r0, v0, v1); 8940 __ ret(lr); 8941 return entry; 8942 } 8943 8944 address generate_method_entry_barrier() { 8945 __ align(CodeEntryAlignment); 8946 StubId stub_id = StubId::stubgen_method_entry_barrier_id; 8947 StubCodeMark mark(this, stub_id); 8948 8949 Label deoptimize_label; 8950 8951 address start = __ pc(); 8952 8953 BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler(); 8954 8955 if (bs_asm->nmethod_patching_type() == NMethodPatchingType::conc_instruction_and_data_patch) { 8956 BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod(); 8957 // We can get here despite the nmethod being good, if we have not 8958 // yet applied our cross modification fence (or data fence). 
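// Exposition-only outline of the instructions below: publish the current global patching epoch into this thread's epoch slot, then use isb plus a LoadLoad barrier as the cross-modification / data fence referred to above, before the runtime call re-evaluates the barrier.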
8959 Address thread_epoch_addr(rthread, in_bytes(bs_nm->thread_disarmed_guard_value_offset()) + 4); 8960 __ lea(rscratch2, ExternalAddress(bs_asm->patching_epoch_addr())); 8961 __ ldrw(rscratch2, rscratch2); 8962 __ strw(rscratch2, thread_epoch_addr); 8963 __ isb(); 8964 __ membar(__ LoadLoad); 8965 } 8966 8967 __ set_last_Java_frame(sp, rfp, lr, rscratch1); 8968 8969 __ enter(); 8970 __ add(rscratch2, sp, wordSize); // rscratch2 points to the saved lr 8971 8972 __ sub(sp, sp, 4 * wordSize); // four words for the returned {sp, fp, lr, pc} 8973 8974 __ push_call_clobbered_registers(); 8975 8976 __ mov(c_rarg0, rscratch2); 8977 __ call_VM_leaf 8978 (CAST_FROM_FN_PTR 8979 (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1); 8980 8981 __ reset_last_Java_frame(true); 8982 8983 __ mov(rscratch1, r0); 8984 8985 __ pop_call_clobbered_registers(); 8986 8987 __ cbnz(rscratch1, deoptimize_label); 8988 8989 __ leave(); 8990 __ ret(lr); 8991 8992 __ BIND(deoptimize_label); 8993 8994 __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize)); 8995 __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize)); 8996 8997 __ mov(sp, rscratch1); 8998 __ br(rscratch2); 8999 9000 return start; 9001 } 9002 9003 // r0 = result 9004 // r1 = str1 9005 // r2 = cnt1 9006 // r3 = str2 9007 // r4 = cnt2 9008 // r10 = tmp1 9009 // r11 = tmp2 9010 address generate_compare_long_string_same_encoding(bool isLL) { 9011 __ align(CodeEntryAlignment); 9012 StubId stub_id = (isLL ? StubId::stubgen_compare_long_string_LL_id : StubId::stubgen_compare_long_string_UU_id); 9013 StubCodeMark mark(this, stub_id); 9014 address entry = __ pc(); 9015 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 9016 tmp1 = r10, tmp2 = r11, tmp1h = rscratch1, tmp2h = rscratch2; 9017 9018 Label LARGE_LOOP_PREFETCH, LOOP_COMPARE16, DIFF, LESS16, LESS8, CAL_DIFFERENCE, LENGTH_DIFF; 9019 9020 // exit from large loop when less than 64 bytes left to read or we're about 9021 // to prefetch memory behind array border 9022 int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2); 9023 9024 // before jumping to stub, pre-load 8 bytes already, so do comparison directly 9025 __ eor(rscratch2, tmp1, tmp2); 9026 __ cbnz(rscratch2, CAL_DIFFERENCE); 9027 9028 __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2)); 9029 // update pointers, because of previous read 9030 __ add(str1, str1, wordSize); 9031 __ add(str2, str2, wordSize); 9032 if (SoftwarePrefetchHintDistance >= 0) { 9033 __ align(OptoLoopAlignment); 9034 __ bind(LARGE_LOOP_PREFETCH); 9035 __ prfm(Address(str1, SoftwarePrefetchHintDistance)); 9036 __ prfm(Address(str2, SoftwarePrefetchHintDistance)); 9037 9038 for (int i = 0; i < 4; i++) { 9039 __ ldp(tmp1, tmp1h, Address(str1, i * 16)); 9040 __ ldp(tmp2, tmp2h, Address(str2, i * 16)); 9041 __ cmp(tmp1, tmp2); 9042 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ); 9043 __ br(Assembler::NE, DIFF); 9044 } 9045 __ sub(cnt2, cnt2, isLL ? 64 : 32); 9046 __ add(str1, str1, 64); 9047 __ add(str2, str2, 64); 9048 __ subs(rscratch2, cnt2, largeLoopExitCondition); 9049 __ br(Assembler::GE, LARGE_LOOP_PREFETCH); 9050 __ cbz(cnt2, LENGTH_DIFF); // no more chars left? 9051 } 9052 9053 __ subs(rscratch1, cnt2, isLL ? 
16 : 8); 9054 __ br(Assembler::LE, LESS16); 9055 __ align(OptoLoopAlignment); 9056 __ bind(LOOP_COMPARE16); 9057 __ ldp(tmp1, tmp1h, Address(__ post(str1, 16))); 9058 __ ldp(tmp2, tmp2h, Address(__ post(str2, 16))); 9059 __ cmp(tmp1, tmp2); 9060 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ); 9061 __ br(Assembler::NE, DIFF); 9062 __ sub(cnt2, cnt2, isLL ? 16 : 8); 9063 __ subs(rscratch2, cnt2, isLL ? 16 : 8); 9064 __ br(Assembler::LT, LESS16); 9065 9066 __ ldp(tmp1, tmp1h, Address(__ post(str1, 16))); 9067 __ ldp(tmp2, tmp2h, Address(__ post(str2, 16))); 9068 __ cmp(tmp1, tmp2); 9069 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ); 9070 __ br(Assembler::NE, DIFF); 9071 __ sub(cnt2, cnt2, isLL ? 16 : 8); 9072 __ subs(rscratch2, cnt2, isLL ? 16 : 8); 9073 __ br(Assembler::GE, LOOP_COMPARE16); 9074 __ cbz(cnt2, LENGTH_DIFF); 9075 9076 __ bind(LESS16); 9077 // each 8 compare 9078 __ subs(cnt2, cnt2, isLL ? 8 : 4); 9079 __ br(Assembler::LE, LESS8); 9080 __ ldr(tmp1, Address(__ post(str1, 8))); 9081 __ ldr(tmp2, Address(__ post(str2, 8))); 9082 __ eor(rscratch2, tmp1, tmp2); 9083 __ cbnz(rscratch2, CAL_DIFFERENCE); 9084 __ sub(cnt2, cnt2, isLL ? 8 : 4); 9085 9086 __ bind(LESS8); // directly load last 8 bytes 9087 if (!isLL) { 9088 __ add(cnt2, cnt2, cnt2); 9089 } 9090 __ ldr(tmp1, Address(str1, cnt2)); 9091 __ ldr(tmp2, Address(str2, cnt2)); 9092 __ eor(rscratch2, tmp1, tmp2); 9093 __ cbz(rscratch2, LENGTH_DIFF); 9094 __ b(CAL_DIFFERENCE); 9095 9096 __ bind(DIFF); 9097 __ cmp(tmp1, tmp2); 9098 __ csel(tmp1, tmp1, tmp1h, Assembler::NE); 9099 __ csel(tmp2, tmp2, tmp2h, Assembler::NE); 9100 // reuse rscratch2 register for the result of eor instruction 9101 __ eor(rscratch2, tmp1, tmp2); 9102 9103 __ bind(CAL_DIFFERENCE); 9104 __ rev(rscratch2, rscratch2); 9105 __ clz(rscratch2, rscratch2); 9106 __ andr(rscratch2, rscratch2, isLL ? -8 : -16); 9107 __ lsrv(tmp1, tmp1, rscratch2); 9108 __ lsrv(tmp2, tmp2, rscratch2); 9109 if (isLL) { 9110 __ uxtbw(tmp1, tmp1); 9111 __ uxtbw(tmp2, tmp2); 9112 } else { 9113 __ uxthw(tmp1, tmp1); 9114 __ uxthw(tmp2, tmp2); 9115 } 9116 __ subw(result, tmp1, tmp2); 9117 9118 __ bind(LENGTH_DIFF); 9119 __ ret(lr); 9120 return entry; 9121 } 9122 9123 enum string_compare_mode { 9124 LL, 9125 LU, 9126 UL, 9127 UU, 9128 }; 9129 9130 // The following registers are declared in aarch64.ad 9131 // r0 = result 9132 // r1 = str1 9133 // r2 = cnt1 9134 // r3 = str2 9135 // r4 = cnt2 9136 // r10 = tmp1 9137 // r11 = tmp2 9138 // z0 = ztmp1 9139 // z1 = ztmp2 9140 // p0 = pgtmp1 9141 // p1 = pgtmp2 9142 address generate_compare_long_string_sve(string_compare_mode mode) { 9143 StubId stub_id; 9144 switch (mode) { 9145 case LL: stub_id = StubId::stubgen_compare_long_string_LL_id; break; 9146 case LU: stub_id = StubId::stubgen_compare_long_string_LU_id; break; 9147 case UL: stub_id = StubId::stubgen_compare_long_string_UL_id; break; 9148 case UU: stub_id = StubId::stubgen_compare_long_string_UU_id; break; 9149 default: ShouldNotReachHere(); 9150 } 9151 9152 __ align(CodeEntryAlignment); 9153 address entry = __ pc(); 9154 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 9155 tmp1 = r10, tmp2 = r11; 9156 9157 Label LOOP, DONE, MISMATCH; 9158 Register vec_len = tmp1; 9159 Register idx = tmp2; 9160 // The minimum of the string lengths has been stored in cnt2. 
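// Rough shape of the SVE code generated below (exposition only, not a literal transcription):
//   idx = 0; pg = whilelt(idx, cnt); lanes = cntb or cnth;
//   do {
//     load one vector per string under pg (Latin1 lanes widened to 16 bits);
//     if (any active lane differs) goto MISMATCH;
//     idx += lanes;
//   } while (idx < cnt - lanes);
//   // one final whilelt-predicated iteration then covers the partial tail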
9161 Register cnt = cnt2; 9162 FloatRegister ztmp1 = z0, ztmp2 = z1; 9163 PRegister pgtmp1 = p0, pgtmp2 = p1; 9164 9165 #define LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx) \ 9166 switch (mode) { \ 9167 case LL: \ 9168 __ sve_ld1b(ztmp1, __ B, pgtmp1, Address(str1, idx)); \ 9169 __ sve_ld1b(ztmp2, __ B, pgtmp1, Address(str2, idx)); \ 9170 break; \ 9171 case LU: \ 9172 __ sve_ld1b(ztmp1, __ H, pgtmp1, Address(str1, idx)); \ 9173 __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \ 9174 break; \ 9175 case UL: \ 9176 __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \ 9177 __ sve_ld1b(ztmp2, __ H, pgtmp1, Address(str2, idx)); \ 9178 break; \ 9179 case UU: \ 9180 __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \ 9181 __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \ 9182 break; \ 9183 default: \ 9184 ShouldNotReachHere(); \ 9185 } 9186 9187 StubCodeMark mark(this, stub_id); 9188 9189 __ mov(idx, 0); 9190 __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt); 9191 9192 if (mode == LL) { 9193 __ sve_cntb(vec_len); 9194 } else { 9195 __ sve_cnth(vec_len); 9196 } 9197 9198 __ sub(rscratch1, cnt, vec_len); 9199 9200 __ bind(LOOP); 9201 9202 // main loop 9203 LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx); 9204 __ add(idx, idx, vec_len); 9205 // Compare strings. 9206 __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2); 9207 __ br(__ NE, MISMATCH); 9208 __ cmp(idx, rscratch1); 9209 __ br(__ LT, LOOP); 9210 9211 // post loop, last iteration 9212 __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt); 9213 9214 LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx); 9215 __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2); 9216 __ br(__ EQ, DONE); 9217 9218 __ bind(MISMATCH); 9219 9220 // Crop the vector to find its location. 9221 __ sve_brkb(pgtmp2, pgtmp1, pgtmp2, false /* isMerge */); 9222 // Extract the first different characters of each string. 9223 __ sve_lasta(rscratch1, mode == LL ? __ B : __ H, pgtmp2, ztmp1); 9224 __ sve_lasta(rscratch2, mode == LL ? __ B : __ H, pgtmp2, ztmp2); 9225 9226 // Compute the difference of the first different characters. 
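// (Exposition: sve_brkb above truncated the predicate just before the first mismatching lane, and sve_lasta then picked that lane's element out of each string vector, so the subtraction below produces the compare result.)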
9227 __ sub(result, rscratch1, rscratch2); 9228 9229 __ bind(DONE); 9230 __ ret(lr); 9231 #undef LOAD_PAIR 9232 return entry; 9233 } 9234 9235 void generate_compare_long_strings() { 9236 if (UseSVE == 0) { 9237 StubRoutines::aarch64::_compare_long_string_LL 9238 = generate_compare_long_string_same_encoding(true); 9239 StubRoutines::aarch64::_compare_long_string_UU 9240 = generate_compare_long_string_same_encoding(false); 9241 StubRoutines::aarch64::_compare_long_string_LU 9242 = generate_compare_long_string_different_encoding(true); 9243 StubRoutines::aarch64::_compare_long_string_UL 9244 = generate_compare_long_string_different_encoding(false); 9245 } else { 9246 StubRoutines::aarch64::_compare_long_string_LL 9247 = generate_compare_long_string_sve(LL); 9248 StubRoutines::aarch64::_compare_long_string_UU 9249 = generate_compare_long_string_sve(UU); 9250 StubRoutines::aarch64::_compare_long_string_LU 9251 = generate_compare_long_string_sve(LU); 9252 StubRoutines::aarch64::_compare_long_string_UL 9253 = generate_compare_long_string_sve(UL); 9254 } 9255 } 9256 9257 // R0 = result 9258 // R1 = str2 9259 // R2 = cnt1 9260 // R3 = str1 9261 // R4 = cnt2 9262 // Clobbers: rscratch1, rscratch2, v0, v1, rflags 9263 // 9264 // This generic linear code uses a few additional ideas, which make it faster: 9265 // 1) we can safely keep at least the 1st register of the pattern (since length >= 8) 9266 // in order to skip the initial load (helps on systems with 1 load pipeline) 9267 // 2) we can use a "fast" algorithm for finding a single character to search for the 9268 // first symbol with fewer branches (1 branch per loaded register instead 9269 // of a branch per symbol); this is where constants like 9270 // 0x0101...01, 0x00010001...0001, 0x7f7f...7f, 0x7fff7fff...7fff come from 9271 // 3) after loading and analyzing the 1st register of the source string, it can be 9272 // used to search for every occurrence of the 1st character, saving a few loads 9273 // compared with a "simpler-but-slower" implementation 9274 // 4) in order to avoid lots of push/pop operations, the code below heavily 9275 // re-uses/re-initializes/compresses register values, which makes the code 9276 // larger and a bit less readable; however, most of the extra operations are 9277 // issued during loads or branches, so the penalty is minimal 9278 address generate_string_indexof_linear(bool str1_isL, bool str2_isL) { 9279 StubId stub_id; 9280 if (str1_isL) { 9281 if (str2_isL) { 9282 stub_id = StubId::stubgen_string_indexof_linear_ll_id; 9283 } else { 9284 stub_id = StubId::stubgen_string_indexof_linear_ul_id; 9285 } 9286 } else { 9287 if (str2_isL) { 9288 ShouldNotReachHere(); 9289 } else { 9290 stub_id = StubId::stubgen_string_indexof_linear_uu_id; 9291 } 9292 } 9293 __ align(CodeEntryAlignment); 9294 StubCodeMark mark(this, stub_id); 9295 address entry = __ pc(); 9296 9297 int str1_chr_size = str1_isL ? 1 : 2; 9298 int str2_chr_size = str2_isL ? 1 : 2; 9299 int str1_chr_shift = str1_isL ? 0 : 1; 9300 int str2_chr_shift = str2_isL ?
0 : 1; 9301 bool isL = str1_isL && str2_isL; 9302 // parameters 9303 Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4; 9304 // temporary registers 9305 Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23; 9306 RegSet spilled_regs = RegSet::range(tmp1, tmp4); 9307 // redefinitions 9308 Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3; 9309 9310 __ push(spilled_regs, sp); 9311 Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO, 9312 L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED, 9313 L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP, 9314 L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH, 9315 L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2, 9316 L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH; 9317 // Read whole register from str1. It is safe, because length >=8 here 9318 __ ldr(ch1, Address(str1)); 9319 // Read whole register from str2. It is safe, because length >=8 here 9320 __ ldr(ch2, Address(str2)); 9321 __ sub(cnt2, cnt2, cnt1); 9322 __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF); 9323 if (str1_isL != str2_isL) { 9324 __ eor(v0, __ T16B, v0, v0); 9325 } 9326 __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001); 9327 __ mul(first, first, tmp1); 9328 // check if we have less than 1 register to check 9329 __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1); 9330 if (str1_isL != str2_isL) { 9331 __ fmovd(v1, ch1); 9332 } 9333 __ br(__ LE, L_SMALL); 9334 __ eor(ch2, first, ch2); 9335 if (str1_isL != str2_isL) { 9336 __ zip1(v1, __ T16B, v1, v0); 9337 } 9338 __ sub(tmp2, ch2, tmp1); 9339 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 9340 __ bics(tmp2, tmp2, ch2); 9341 if (str1_isL != str2_isL) { 9342 __ fmovd(ch1, v1); 9343 } 9344 __ br(__ NE, L_HAS_ZERO); 9345 __ subs(cnt2, cnt2, wordSize/str2_chr_size); 9346 __ add(result, result, wordSize/str2_chr_size); 9347 __ add(str2, str2, wordSize); 9348 __ br(__ LT, L_POST_LOOP); 9349 __ BIND(L_LOOP); 9350 __ ldr(ch2, Address(str2)); 9351 __ eor(ch2, first, ch2); 9352 __ sub(tmp2, ch2, tmp1); 9353 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 9354 __ bics(tmp2, tmp2, ch2); 9355 __ br(__ NE, L_HAS_ZERO); 9356 __ BIND(L_LOOP_PROCEED); 9357 __ subs(cnt2, cnt2, wordSize/str2_chr_size); 9358 __ add(str2, str2, wordSize); 9359 __ add(result, result, wordSize/str2_chr_size); 9360 __ br(__ GE, L_LOOP); 9361 __ BIND(L_POST_LOOP); 9362 __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check 9363 __ br(__ LE, NOMATCH); 9364 __ ldr(ch2, Address(str2)); 9365 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift); 9366 __ eor(ch2, first, ch2); 9367 __ sub(tmp2, ch2, tmp1); 9368 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 9369 __ mov(tmp4, -1); // all bits set 9370 __ b(L_SMALL_PROCEED); 9371 __ align(OptoLoopAlignment); 9372 __ BIND(L_SMALL); 9373 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift); 9374 __ eor(ch2, first, ch2); 9375 if (str1_isL != str2_isL) { 9376 __ zip1(v1, __ T16B, v1, v0); 9377 } 9378 __ sub(tmp2, ch2, tmp1); 9379 __ mov(tmp4, -1); // all bits set 9380 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 9381 if (str1_isL != str2_isL) { 9382 __ fmovd(ch1, v1); // move converted 4 symbols 9383 } 9384 __ BIND(L_SMALL_PROCEED); 9385 __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits. 
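// Exposition-only note on the test completed just below: with
//   x = chunk ^ (first pattern character broadcast to every lane)
// a zero lane marks a candidate occurrence of the first character, and the
// classic SWAR test (x - lane_ones) & ~(x | lane_7f..7f) sets the top bit of
// each such lane. tmp2 and ch2 above hold the two operands of that test; the
// bic/ands below evaluate it under the tail mask in tmp4.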
9386 __ bic(tmp2, tmp2, ch2); 9387 __ ands(tmp2, tmp2, tmp4); // clear useless bits and check 9388 __ rbit(tmp2, tmp2); 9389 __ br(__ EQ, NOMATCH); 9390 __ BIND(L_SMALL_HAS_ZERO_LOOP); 9391 __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some cpu's 9392 __ cmp(cnt1, u1(wordSize/str2_chr_size)); 9393 __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2); 9394 if (str2_isL) { // LL 9395 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index" 9396 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe. 9397 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info 9398 __ add(result, result, tmp4, __ LSR, LogBitsPerByte); 9399 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 9400 } else { 9401 __ mov(ch2, 0xE); // all bits in byte set except last one 9402 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 9403 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 9404 __ lslv(tmp2, tmp2, tmp4); 9405 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 9406 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 9407 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 9408 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 9409 } 9410 __ cmp(ch1, ch2); 9411 __ mov(tmp4, wordSize/str2_chr_size); 9412 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 9413 __ BIND(L_SMALL_CMP_LOOP); 9414 str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift))) 9415 : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift))); 9416 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))) 9417 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))); 9418 __ add(tmp4, tmp4, 1); 9419 __ cmp(tmp4, cnt1); 9420 __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP); 9421 __ cmp(first, ch2); 9422 __ br(__ EQ, L_SMALL_CMP_LOOP); 9423 __ BIND(L_SMALL_CMP_LOOP_NOMATCH); 9424 __ cbz(tmp2, NOMATCH); // no more matches. exit 9425 __ clz(tmp4, tmp2); 9426 __ add(result, result, 1); // advance index 9427 __ add(str2, str2, str2_chr_size); // advance pointer 9428 __ b(L_SMALL_HAS_ZERO_LOOP); 9429 __ align(OptoLoopAlignment); 9430 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP); 9431 __ cmp(first, ch2); 9432 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 9433 __ b(DONE); 9434 __ align(OptoLoopAlignment); 9435 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2); 9436 if (str2_isL) { // LL 9437 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index" 9438 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe. 9439 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info 9440 __ add(result, result, tmp4, __ LSR, LogBitsPerByte); 9441 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 9442 } else { 9443 __ mov(ch2, 0xE); // all bits in byte set except last one 9444 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 9445 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 9446 __ lslv(tmp2, tmp2, tmp4); 9447 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 9448 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 9449 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 9450 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 9451 } 9452 __ cmp(ch1, ch2); 9453 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 9454 __ b(DONE); 9455 __ align(OptoLoopAlignment); 9456 __ BIND(L_HAS_ZERO); 9457 __ rbit(tmp2, tmp2); 9458 __ clz(tmp4, tmp2); // potentially long. 
Up to 4 cycles on some CPU's 9459 // Now, perform compression of counters(cnt2 and cnt1) into one register. 9460 // It's fine because both counters are 32bit and are not changed in this 9461 // loop. Just restore it on exit. So, cnt1 can be re-used in this loop. 9462 __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2); 9463 __ sub(result, result, 1); 9464 __ BIND(L_HAS_ZERO_LOOP); 9465 __ mov(cnt1, wordSize/str2_chr_size); 9466 __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2); 9467 __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare 9468 if (str2_isL) { 9469 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index 9470 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 9471 __ lslv(tmp2, tmp2, tmp4); 9472 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 9473 __ add(tmp4, tmp4, 1); 9474 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 9475 __ lsl(tmp2, tmp2, 1); 9476 __ mov(tmp4, wordSize/str2_chr_size); 9477 } else { 9478 __ mov(ch2, 0xE); 9479 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 9480 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 9481 __ lslv(tmp2, tmp2, tmp4); 9482 __ add(tmp4, tmp4, 1); 9483 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 9484 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); 9485 __ lsl(tmp2, tmp2, 1); 9486 __ mov(tmp4, wordSize/str2_chr_size); 9487 __ sub(str2, str2, str2_chr_size); 9488 } 9489 __ cmp(ch1, ch2); 9490 __ mov(tmp4, wordSize/str2_chr_size); 9491 __ br(__ NE, L_CMP_LOOP_NOMATCH); 9492 __ BIND(L_CMP_LOOP); 9493 str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift))) 9494 : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift))); 9495 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))) 9496 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))); 9497 __ add(tmp4, tmp4, 1); 9498 __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2); 9499 __ br(__ GE, L_CMP_LOOP_LAST_CMP); 9500 __ cmp(cnt1, ch2); 9501 __ br(__ EQ, L_CMP_LOOP); 9502 __ BIND(L_CMP_LOOP_NOMATCH); 9503 // here we're not matched 9504 __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop 9505 __ clz(tmp4, tmp2); 9506 __ add(str2, str2, str2_chr_size); // advance pointer 9507 __ b(L_HAS_ZERO_LOOP); 9508 __ align(OptoLoopAlignment); 9509 __ BIND(L_CMP_LOOP_LAST_CMP); 9510 __ cmp(cnt1, ch2); 9511 __ br(__ NE, L_CMP_LOOP_NOMATCH); 9512 __ b(DONE); 9513 __ align(OptoLoopAlignment); 9514 __ BIND(L_CMP_LOOP_LAST_CMP2); 9515 if (str2_isL) { 9516 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index 9517 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 9518 __ lslv(tmp2, tmp2, tmp4); 9519 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 9520 __ add(tmp4, tmp4, 1); 9521 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 9522 __ lsl(tmp2, tmp2, 1); 9523 } else { 9524 __ mov(ch2, 0xE); 9525 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 9526 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 
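// (Exposition: 0xE & (tmp4 >> LogBitsPerByte) above is the matching lane's byte offset rounded down to an even index, so the 8-byte reload of str2 stays aligned to a UTF-16 character boundary.)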
9527 __ lslv(tmp2, tmp2, tmp4); 9528 __ add(tmp4, tmp4, 1); 9529 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 9530 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); 9531 __ lsl(tmp2, tmp2, 1); 9532 __ sub(str2, str2, str2_chr_size); 9533 } 9534 __ cmp(ch1, ch2); 9535 __ br(__ NE, L_CMP_LOOP_NOMATCH); 9536 __ b(DONE); 9537 __ align(OptoLoopAlignment); 9538 __ BIND(L_HAS_ZERO_LOOP_NOMATCH); 9539 // 1) Restore "result" index. Index was wordSize/str2_chr_size * N until 9540 // L_HAS_ZERO block. Byte octet was analyzed in L_HAS_ZERO_LOOP, 9541 // so, result was increased at max by wordSize/str2_chr_size - 1, so, 9542 // respective high bit wasn't changed. L_LOOP_PROCEED will increase 9543 // result by analyzed characters value, so, we can just reset lower bits 9544 // in result here. Clear 2 lower bits for UU/UL and 3 bits for LL 9545 // 2) restore cnt1 and cnt2 values from "compressed" cnt2 9546 // 3) advance str2 value to represent next str2 octet. result & 7/3 is 9547 // index of last analyzed substring inside current octet. So, str2 in at 9548 // respective start address. We need to advance it to next octet 9549 __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed 9550 __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2); 9551 __ bfm(result, zr, 0, 2 - str2_chr_shift); 9552 __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2 9553 __ movw(cnt2, cnt2); 9554 __ b(L_LOOP_PROCEED); 9555 __ align(OptoLoopAlignment); 9556 __ BIND(NOMATCH); 9557 __ mov(result, -1); 9558 __ BIND(DONE); 9559 __ pop(spilled_regs, sp); 9560 __ ret(lr); 9561 return entry; 9562 } 9563 9564 void generate_string_indexof_stubs() { 9565 StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true); 9566 StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false); 9567 StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false); 9568 } 9569 9570 void inflate_and_store_2_fp_registers(bool generatePrfm, 9571 FloatRegister src1, FloatRegister src2) { 9572 Register dst = r1; 9573 __ zip1(v1, __ T16B, src1, v0); 9574 __ zip2(v2, __ T16B, src1, v0); 9575 if (generatePrfm) { 9576 __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM); 9577 } 9578 __ zip1(v3, __ T16B, src2, v0); 9579 __ zip2(v4, __ T16B, src2, v0); 9580 __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64))); 9581 } 9582 9583 // R0 = src 9584 // R1 = dst 9585 // R2 = len 9586 // R3 = len >> 3 9587 // V0 = 0 9588 // v1 = loaded 8 bytes 9589 // Clobbers: r0, r1, r3, rscratch1, rflags, v0-v6 9590 address generate_large_byte_array_inflate() { 9591 __ align(CodeEntryAlignment); 9592 StubId stub_id = StubId::stubgen_large_byte_array_inflate_id; 9593 StubCodeMark mark(this, stub_id); 9594 address entry = __ pc(); 9595 Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE; 9596 Register src = r0, dst = r1, len = r2, octetCounter = r3; 9597 const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4; 9598 9599 // do one more 8-byte read to have address 16-byte aligned in most cases 9600 // also use single store instruction 9601 __ ldrd(v2, __ post(src, 8)); 9602 __ sub(octetCounter, octetCounter, 2); 9603 __ zip1(v1, __ T16B, v1, v0); 9604 __ zip1(v2, __ T16B, v2, v0); 9605 __ st1(v1, v2, __ T16B, __ post(dst, 32)); 9606 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 9607 __ subs(rscratch1, octetCounter, large_loop_threshold); 9608 __ br(__ LE, LOOP_START); 9609 __ 
b(LOOP_PRFM_START); 9610 __ bind(LOOP_PRFM); 9611 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 9612 __ bind(LOOP_PRFM_START); 9613 __ prfm(Address(src, SoftwarePrefetchHintDistance)); 9614 __ sub(octetCounter, octetCounter, 8); 9615 __ subs(rscratch1, octetCounter, large_loop_threshold); 9616 inflate_and_store_2_fp_registers(true, v3, v4); 9617 inflate_and_store_2_fp_registers(true, v5, v6); 9618 __ br(__ GT, LOOP_PRFM); 9619 __ cmp(octetCounter, (u1)8); 9620 __ br(__ LT, DONE); 9621 __ bind(LOOP); 9622 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 9623 __ bind(LOOP_START); 9624 __ sub(octetCounter, octetCounter, 8); 9625 __ cmp(octetCounter, (u1)8); 9626 inflate_and_store_2_fp_registers(false, v3, v4); 9627 inflate_and_store_2_fp_registers(false, v5, v6); 9628 __ br(__ GE, LOOP); 9629 __ bind(DONE); 9630 __ ret(lr); 9631 return entry; 9632 } 9633 9634 /** 9635 * Arguments: 9636 * 9637 * Input: 9638 * c_rarg0 - current state address 9639 * c_rarg1 - H key address 9640 * c_rarg2 - data address 9641 * c_rarg3 - number of blocks 9642 * 9643 * Output: 9644 * Updated state at c_rarg0 9645 */ 9646 address generate_ghash_processBlocks() { 9647 // Bafflingly, GCM uses little-endian for the byte order, but 9648 // big-endian for the bit order. For example, the polynomial 1 is 9649 // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00. 9650 // 9651 // So, we must either reverse the bytes in each word and do 9652 // everything big-endian or reverse the bits in each byte and do 9653 // it little-endian. On AArch64 it's more idiomatic to reverse 9654 // the bits in each byte (we have an instruction, RBIT, to do 9655 // that) and keep the data in little-endian bit order through the 9656 // calculation, bit-reversing the inputs and outputs. 9657 9658 StubId stub_id = StubId::stubgen_ghash_processBlocks_id; 9659 StubCodeMark mark(this, stub_id); 9660 __ align(wordSize * 2); 9661 address p = __ pc(); 9662 __ emit_int64(0x87); // The low-order bits of the field 9663 // polynomial (i.e. 
p = z^7+z^2+z+1) 9664 // repeated in the low and high parts of a 9665 // 128-bit vector 9666 __ emit_int64(0x87); 9667 9668 __ align(CodeEntryAlignment); 9669 address start = __ pc(); 9670 9671 Register state = c_rarg0; 9672 Register subkeyH = c_rarg1; 9673 Register data = c_rarg2; 9674 Register blocks = c_rarg3; 9675 9676 FloatRegister vzr = v30; 9677 __ eor(vzr, __ T16B, vzr, vzr); // zero register 9678 9679 __ ldrq(v24, p); // The field polynomial 9680 9681 __ ldrq(v0, Address(state)); 9682 __ ldrq(v1, Address(subkeyH)); 9683 9684 __ rev64(v0, __ T16B, v0); // Bit-reverse words in state and subkeyH 9685 __ rbit(v0, __ T16B, v0); 9686 __ rev64(v1, __ T16B, v1); 9687 __ rbit(v1, __ T16B, v1); 9688 9689 __ ext(v4, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1 9690 __ eor(v4, __ T16B, v4, v1); // xor subkeyH into subkeyL (Karatsuba: (A1+A0)) 9691 9692 { 9693 Label L_ghash_loop; 9694 __ bind(L_ghash_loop); 9695 9696 __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit 9697 // reversing each byte 9698 __ rbit(v2, __ T16B, v2); 9699 __ eor(v2, __ T16B, v0, v2); // bit-swapped data ^ bit-swapped state 9700 9701 // Multiply state in v2 by subkey in v1 9702 __ ghash_multiply(/*result_lo*/v5, /*result_hi*/v7, 9703 /*a*/v1, /*b*/v2, /*a1_xor_a0*/v4, 9704 /*temps*/v6, v3, /*reuse/clobber b*/v2); 9705 // Reduce v7:v5 by the field polynomial 9706 __ ghash_reduce(/*result*/v0, /*lo*/v5, /*hi*/v7, /*p*/v24, vzr, /*temp*/v3); 9707 9708 __ sub(blocks, blocks, 1); 9709 __ cbnz(blocks, L_ghash_loop); 9710 } 9711 9712 // The bit-reversed result is at this point in v0 9713 __ rev64(v0, __ T16B, v0); 9714 __ rbit(v0, __ T16B, v0); 9715 9716 __ st1(v0, __ T16B, state); 9717 __ ret(lr); 9718 9719 return start; 9720 } 9721 9722 address generate_ghash_processBlocks_wide() { 9723 address small = generate_ghash_processBlocks(); 9724 9725 StubId stub_id = StubId::stubgen_ghash_processBlocks_wide_id; 9726 StubCodeMark mark(this, stub_id); 9727 __ align(wordSize * 2); 9728 address p = __ pc(); 9729 __ emit_int64(0x87); // The low-order bits of the field 9730 // polynomial (i.e. p = z^7+z^2+z+1) 9731 // repeated in the low and high parts of a 9732 // 128-bit vector 9733 __ emit_int64(0x87); 9734 9735 __ align(CodeEntryAlignment); 9736 address start = __ pc(); 9737 9738 Register state = c_rarg0; 9739 Register subkeyH = c_rarg1; 9740 Register data = c_rarg2; 9741 Register blocks = c_rarg3; 9742 9743 const int unroll = 4; 9744 9745 __ cmp(blocks, (unsigned char)(unroll * 2)); 9746 __ br(__ LT, small); 9747 9748 if (unroll > 1) { 9749 // Save state before entering routine 9750 __ sub(sp, sp, 4 * 16); 9751 __ st1(v12, v13, v14, v15, __ T16B, Address(sp)); 9752 __ sub(sp, sp, 4 * 16); 9753 __ st1(v8, v9, v10, v11, __ T16B, Address(sp)); 9754 } 9755 9756 __ ghash_processBlocks_wide(p, state, subkeyH, data, blocks, unroll); 9757 9758 if (unroll > 1) { 9759 // And restore state 9760 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16)); 9761 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16)); 9762 } 9763 9764 __ cmp(blocks, (unsigned char)0); 9765 __ br(__ GT, small); 9766 9767 __ ret(lr); 9768 9769 return start; 9770 } 9771 9772 void generate_base64_encode_simdround(Register src, Register dst, 9773 FloatRegister codec, u8 size) { 9774 9775 FloatRegister in0 = v4, in1 = v5, in2 = v6; 9776 FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19; 9777 FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23; 9778 9779 Assembler::SIMD_Arrangement arrangement = size == 16 ? 
__ T16B : __ T8B; 9780 9781 __ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size)); 9782 9783 __ ushr(ind0, arrangement, in0, 2); 9784 9785 __ ushr(ind1, arrangement, in1, 2); 9786 __ shl(in0, arrangement, in0, 6); 9787 __ orr(ind1, arrangement, ind1, in0); 9788 __ ushr(ind1, arrangement, ind1, 2); 9789 9790 __ ushr(ind2, arrangement, in2, 4); 9791 __ shl(in1, arrangement, in1, 4); 9792 __ orr(ind2, arrangement, in1, ind2); 9793 __ ushr(ind2, arrangement, ind2, 2); 9794 9795 __ shl(ind3, arrangement, in2, 2); 9796 __ ushr(ind3, arrangement, ind3, 2); 9797 9798 __ tbl(out0, arrangement, codec, 4, ind0); 9799 __ tbl(out1, arrangement, codec, 4, ind1); 9800 __ tbl(out2, arrangement, codec, 4, ind2); 9801 __ tbl(out3, arrangement, codec, 4, ind3); 9802 9803 __ st4(out0, out1, out2, out3, arrangement, __ post(dst, 4 * size)); 9804 } 9805 9806 /** 9807 * Arguments: 9808 * 9809 * Input: 9810 * c_rarg0 - src_start 9811 * c_rarg1 - src_offset 9812 * c_rarg2 - src_length 9813 * c_rarg3 - dest_start 9814 * c_rarg4 - dest_offset 9815 * c_rarg5 - isURL 9816 * 9817 */ 9818 address generate_base64_encodeBlock() { 9819 9820 static const char toBase64[64] = { 9821 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 9822 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 9823 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 9824 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 9825 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/' 9826 }; 9827 9828 static const char toBase64URL[64] = { 9829 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 9830 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 9831 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 9832 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 9833 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_' 9834 }; 9835 9836 __ align(CodeEntryAlignment); 9837 StubId stub_id = StubId::stubgen_base64_encodeBlock_id; 9838 StubCodeMark mark(this, stub_id); 9839 address start = __ pc(); 9840 9841 Register src = c_rarg0; // source array 9842 Register soff = c_rarg1; // source start offset 9843 Register send = c_rarg2; // source end offset 9844 Register dst = c_rarg3; // dest array 9845 Register doff = c_rarg4; // position for writing to dest array 9846 Register isURL = c_rarg5; // Base64 or URL character set 9847 9848 // c_rarg6 and c_rarg7 are free to use as temps 9849 Register codec = c_rarg6; 9850 Register length = c_rarg7; 9851 9852 Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit; 9853 9854 __ add(src, src, soff); 9855 __ add(dst, dst, doff); 9856 __ sub(length, send, soff); 9857 9858 // load the codec base address 9859 __ lea(codec, ExternalAddress((address) toBase64)); 9860 __ cbz(isURL, ProcessData); 9861 __ lea(codec, ExternalAddress((address) toBase64URL)); 9862 9863 __ BIND(ProcessData); 9864 9865 // too short to formup a SIMD loop, roll back 9866 __ cmp(length, (u1)24); 9867 __ br(Assembler::LT, Process3B); 9868 9869 __ ld1(v0, v1, v2, v3, __ T16B, Address(codec)); 9870 9871 __ BIND(Process48B); 9872 __ cmp(length, (u1)48); 9873 __ br(Assembler::LT, Process24B); 9874 generate_base64_encode_simdround(src, dst, v0, 16); 9875 __ sub(length, length, 48); 9876 __ b(Process48B); 9877 9878 __ BIND(Process24B); 9879 __ cmp(length, (u1)24); 9880 __ br(Assembler::LT, SIMDExit); 9881 generate_base64_encode_simdround(src, dst, v0, 8); 9882 __ sub(length, length, 24); 9883 9884 __ BIND(SIMDExit); 9885 __ 
cbz(length, Exit); 9886 9887 __ BIND(Process3B); 9888 // 3 src bytes, 24 bits 9889 __ ldrb(r10, __ post(src, 1)); 9890 __ ldrb(r11, __ post(src, 1)); 9891 __ ldrb(r12, __ post(src, 1)); 9892 __ orrw(r11, r11, r10, Assembler::LSL, 8); 9893 __ orrw(r12, r12, r11, Assembler::LSL, 8); 9894 // codec index 9895 __ ubfmw(r15, r12, 18, 23); 9896 __ ubfmw(r14, r12, 12, 17); 9897 __ ubfmw(r13, r12, 6, 11); 9898 __ andw(r12, r12, 63); 9899 // get the code based on the codec 9900 __ ldrb(r15, Address(codec, r15, Address::uxtw(0))); 9901 __ ldrb(r14, Address(codec, r14, Address::uxtw(0))); 9902 __ ldrb(r13, Address(codec, r13, Address::uxtw(0))); 9903 __ ldrb(r12, Address(codec, r12, Address::uxtw(0))); 9904 __ strb(r15, __ post(dst, 1)); 9905 __ strb(r14, __ post(dst, 1)); 9906 __ strb(r13, __ post(dst, 1)); 9907 __ strb(r12, __ post(dst, 1)); 9908 __ sub(length, length, 3); 9909 __ cbnz(length, Process3B); 9910 9911 __ BIND(Exit); 9912 __ ret(lr); 9913 9914 return start; 9915 } 9916 9917 void generate_base64_decode_simdround(Register src, Register dst, 9918 FloatRegister codecL, FloatRegister codecH, int size, Label& Exit) { 9919 9920 FloatRegister in0 = v16, in1 = v17, in2 = v18, in3 = v19; 9921 FloatRegister out0 = v20, out1 = v21, out2 = v22; 9922 9923 FloatRegister decL0 = v23, decL1 = v24, decL2 = v25, decL3 = v26; 9924 FloatRegister decH0 = v28, decH1 = v29, decH2 = v30, decH3 = v31; 9925 9926 Label NoIllegalData, ErrorInLowerHalf, StoreLegalData; 9927 9928 Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B; 9929 9930 __ ld4(in0, in1, in2, in3, arrangement, __ post(src, 4 * size)); 9931 9932 // we need unsigned saturating subtract, to make sure all input values 9933 // in range [0, 63] will have 0U value in the higher half lookup 9934 __ uqsubv(decH0, __ T16B, in0, v27); 9935 __ uqsubv(decH1, __ T16B, in1, v27); 9936 __ uqsubv(decH2, __ T16B, in2, v27); 9937 __ uqsubv(decH3, __ T16B, in3, v27); 9938 9939 // lower half lookup 9940 __ tbl(decL0, arrangement, codecL, 4, in0); 9941 __ tbl(decL1, arrangement, codecL, 4, in1); 9942 __ tbl(decL2, arrangement, codecL, 4, in2); 9943 __ tbl(decL3, arrangement, codecL, 4, in3); 9944 9945 // higher half lookup 9946 __ tbx(decH0, arrangement, codecH, 4, decH0); 9947 __ tbx(decH1, arrangement, codecH, 4, decH1); 9948 __ tbx(decH2, arrangement, codecH, 4, decH2); 9949 __ tbx(decH3, arrangement, codecH, 4, decH3); 9950 9951 // combine lower and higher 9952 __ orr(decL0, arrangement, decL0, decH0); 9953 __ orr(decL1, arrangement, decL1, decH1); 9954 __ orr(decL2, arrangement, decL2, decH2); 9955 __ orr(decL3, arrangement, decL3, decH3); 9956 9957 // check illegal inputs, value larger than 63 (maximum of 6 bits) 9958 __ cm(Assembler::HI, decH0, arrangement, decL0, v27); 9959 __ cm(Assembler::HI, decH1, arrangement, decL1, v27); 9960 __ cm(Assembler::HI, decH2, arrangement, decL2, v27); 9961 __ cm(Assembler::HI, decH3, arrangement, decL3, v27); 9962 __ orr(in0, arrangement, decH0, decH1); 9963 __ orr(in1, arrangement, decH2, decH3); 9964 __ orr(in2, arrangement, in0, in1); 9965 __ umaxv(in3, arrangement, in2); 9966 __ umov(rscratch2, in3, __ B, 0); 9967 9968 // get the data to output 9969 __ shl(out0, arrangement, decL0, 2); 9970 __ ushr(out1, arrangement, decL1, 4); 9971 __ orr(out0, arrangement, out0, out1); 9972 __ shl(out1, arrangement, decL1, 4); 9973 __ ushr(out2, arrangement, decL2, 2); 9974 __ orr(out1, arrangement, out1, out2); 9975 __ shl(out2, arrangement, decL2, 6); 9976 __ orr(out2, arrangement, out2, decL3); 9977 9978 __ 
cbz(rscratch2, NoIllegalData); 9979 9980 // handle illegal input 9981 __ umov(r10, in2, __ D, 0); 9982 if (size == 16) { 9983 __ cbnz(r10, ErrorInLowerHalf); 9984 9985 // illegal input is in higher half, store the lower half now. 9986 __ st3(out0, out1, out2, __ T8B, __ post(dst, 24)); 9987 9988 __ umov(r10, in2, __ D, 1); 9989 __ umov(r11, out0, __ D, 1); 9990 __ umov(r12, out1, __ D, 1); 9991 __ umov(r13, out2, __ D, 1); 9992 __ b(StoreLegalData); 9993 9994 __ BIND(ErrorInLowerHalf); 9995 } 9996 __ umov(r11, out0, __ D, 0); 9997 __ umov(r12, out1, __ D, 0); 9998 __ umov(r13, out2, __ D, 0); 9999 10000 __ BIND(StoreLegalData); 10001 __ tbnz(r10, 5, Exit); // 0xff indicates illegal input 10002 __ strb(r11, __ post(dst, 1)); 10003 __ strb(r12, __ post(dst, 1)); 10004 __ strb(r13, __ post(dst, 1)); 10005 __ lsr(r10, r10, 8); 10006 __ lsr(r11, r11, 8); 10007 __ lsr(r12, r12, 8); 10008 __ lsr(r13, r13, 8); 10009 __ b(StoreLegalData); 10010 10011 __ BIND(NoIllegalData); 10012 __ st3(out0, out1, out2, arrangement, __ post(dst, 3 * size)); 10013 } 10014 10015 10016 /** 10017 * Arguments: 10018 * 10019 * Input: 10020 * c_rarg0 - src_start 10021 * c_rarg1 - src_offset 10022 * c_rarg2 - src_length 10023 * c_rarg3 - dest_start 10024 * c_rarg4 - dest_offset 10025 * c_rarg5 - isURL 10026 * c_rarg6 - isMIME 10027 * 10028 */ 10029 address generate_base64_decodeBlock() { 10030 10031 // The SIMD part of this Base64 decode intrinsic is based on the algorithm outlined 10032 // on http://0x80.pl/articles/base64-simd-neon.html#encoding-quadwords, in section 10033 // titled "Base64 decoding". 10034 10035 // Non-SIMD lookup tables are mostly dumped from fromBase64 array used in java.util.Base64, 10036 // except the trailing character '=' is also treated illegal value in this intrinsic. That 10037 // is java.util.Base64.fromBase64['='] = -2, while fromBase(URL)64ForNoSIMD['='] = 255 here. 
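    // For reference, each 256-entry table below is what the following
    // (hypothetical, illustrative-only) C helper would produce from the
    // corresponding 64-character alphabet; the stub only ever uses the
    // precomputed arrays:
    //
    //   static void build_nosimd_decode_table(uint8_t table[256],
    //                                         const char alphabet[64]) {
    //     for (int i = 0; i < 256; i++) table[i] = 255u;  // illegal, including '='
    //     for (int i = 0; i < 64; i++)  table[(uint8_t)alphabet[i]] = (uint8_t)i;
    //   }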
10038 static const uint8_t fromBase64ForNoSIMD[256] = { 10039 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10040 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10041 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 255u, 63u, 10042 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 10043 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u, 10044 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 255u, 10045 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u, 10046 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u, 10047 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10048 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10049 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10050 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10051 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10052 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10053 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10054 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10055 }; 10056 10057 static const uint8_t fromBase64URLForNoSIMD[256] = { 10058 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10059 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10060 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 10061 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 10062 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u, 10063 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 63u, 10064 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u, 10065 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u, 10066 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10067 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10068 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10069 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10070 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10071 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10072 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10073 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10074 }; 10075 10076 // A legal value of base64 code is in range [0, 127]. We need two lookups 10077 // with tbl/tbx and combine them to get the decode data. The 1st table vector 10078 // lookup use tbl, out of range indices are set to 0 in destination. 
The 2nd 10079 // table vector lookup use tbx, out of range indices are unchanged in 10080 // destination. Input [64..126] is mapped to index [65, 127] in second lookup. 10081 // The value of index 64 is set to 0, so that we know that we already get the 10082 // decoded data with the 1st lookup. 10083 static const uint8_t fromBase64ForSIMD[128] = { 10084 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10085 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10086 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 255u, 63u, 10087 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 10088 0u, 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 10089 14u, 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 10090 255u, 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 10091 40u, 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 10092 }; 10093 10094 static const uint8_t fromBase64URLForSIMD[128] = { 10095 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10096 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10097 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 10098 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 10099 0u, 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 10100 14u, 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 10101 63u, 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 10102 40u, 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 10103 }; 10104 10105 __ align(CodeEntryAlignment); 10106 StubId stub_id = StubId::stubgen_base64_decodeBlock_id; 10107 StubCodeMark mark(this, stub_id); 10108 address start = __ pc(); 10109 10110 Register src = c_rarg0; // source array 10111 Register soff = c_rarg1; // source start offset 10112 Register send = c_rarg2; // source end offset 10113 Register dst = c_rarg3; // dest array 10114 Register doff = c_rarg4; // position for writing to dest array 10115 Register isURL = c_rarg5; // Base64 or URL character set 10116 Register isMIME = c_rarg6; // Decoding MIME block - unused in this implementation 10117 10118 Register length = send; // reuse send as length of source data to process 10119 10120 Register simd_codec = c_rarg6; 10121 Register nosimd_codec = c_rarg7; 10122 10123 Label ProcessData, Process64B, Process32B, Process4B, SIMDEnter, SIMDExit, Exit; 10124 10125 __ enter(); 10126 10127 __ add(src, src, soff); 10128 __ add(dst, dst, doff); 10129 10130 __ mov(doff, dst); 10131 10132 __ sub(length, send, soff); 10133 __ bfm(length, zr, 0, 1); 10134 10135 __ lea(nosimd_codec, ExternalAddress((address) fromBase64ForNoSIMD)); 10136 __ cbz(isURL, ProcessData); 10137 __ lea(nosimd_codec, ExternalAddress((address) fromBase64URLForNoSIMD)); 10138 10139 __ BIND(ProcessData); 10140 __ mov(rscratch1, length); 10141 __ cmp(length, (u1)144); // 144 = 80 + 64 10142 __ br(Assembler::LT, Process4B); 10143 10144 // In the MIME case, the line length cannot be more than 76 10145 // bytes (see RFC 2045). This is too short a block for SIMD 10146 // to be worthwhile, so we use non-SIMD here. 
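    // Seeding rscratch1 with 79 makes the Process4B loop below execute
    // exactly 20 times (79, 75, ..., 3, -1), consuming 80 bytes and
    // leaving rscratch1 == -1, which distinguishes this pre-processing
    // pass from a plain scalar tail (where rscratch1 ends at 0).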
10147 __ movw(rscratch1, 79); 10148 10149 __ BIND(Process4B); 10150 __ ldrw(r14, __ post(src, 4)); 10151 __ ubfxw(r10, r14, 0, 8); 10152 __ ubfxw(r11, r14, 8, 8); 10153 __ ubfxw(r12, r14, 16, 8); 10154 __ ubfxw(r13, r14, 24, 8); 10155 // get the de-code 10156 __ ldrb(r10, Address(nosimd_codec, r10, Address::uxtw(0))); 10157 __ ldrb(r11, Address(nosimd_codec, r11, Address::uxtw(0))); 10158 __ ldrb(r12, Address(nosimd_codec, r12, Address::uxtw(0))); 10159 __ ldrb(r13, Address(nosimd_codec, r13, Address::uxtw(0))); 10160 // error detection, 255u indicates an illegal input 10161 __ orrw(r14, r10, r11); 10162 __ orrw(r15, r12, r13); 10163 __ orrw(r14, r14, r15); 10164 __ tbnz(r14, 7, Exit); 10165 // recover the data 10166 __ lslw(r14, r10, 10); 10167 __ bfiw(r14, r11, 4, 6); 10168 __ bfmw(r14, r12, 2, 5); 10169 __ rev16w(r14, r14); 10170 __ bfiw(r13, r12, 6, 2); 10171 __ strh(r14, __ post(dst, 2)); 10172 __ strb(r13, __ post(dst, 1)); 10173 // non-simd loop 10174 __ subsw(rscratch1, rscratch1, 4); 10175 __ br(Assembler::GT, Process4B); 10176 10177 // if exiting from PreProcess80B, rscratch1 == -1; 10178 // otherwise, rscratch1 == 0. 10179 __ cbzw(rscratch1, Exit); 10180 __ sub(length, length, 80); 10181 10182 __ lea(simd_codec, ExternalAddress((address) fromBase64ForSIMD)); 10183 __ cbz(isURL, SIMDEnter); 10184 __ lea(simd_codec, ExternalAddress((address) fromBase64URLForSIMD)); 10185 10186 __ BIND(SIMDEnter); 10187 __ ld1(v0, v1, v2, v3, __ T16B, __ post(simd_codec, 64)); 10188 __ ld1(v4, v5, v6, v7, __ T16B, Address(simd_codec)); 10189 __ mov(rscratch1, 63); 10190 __ dup(v27, __ T16B, rscratch1); 10191 10192 __ BIND(Process64B); 10193 __ cmp(length, (u1)64); 10194 __ br(Assembler::LT, Process32B); 10195 generate_base64_decode_simdround(src, dst, v0, v4, 16, Exit); 10196 __ sub(length, length, 64); 10197 __ b(Process64B); 10198 10199 __ BIND(Process32B); 10200 __ cmp(length, (u1)32); 10201 __ br(Assembler::LT, SIMDExit); 10202 generate_base64_decode_simdround(src, dst, v0, v4, 8, Exit); 10203 __ sub(length, length, 32); 10204 __ b(Process32B); 10205 10206 __ BIND(SIMDExit); 10207 __ cbz(length, Exit); 10208 __ movw(rscratch1, length); 10209 __ b(Process4B); 10210 10211 __ BIND(Exit); 10212 __ sub(c_rarg0, dst, doff); 10213 10214 __ leave(); 10215 __ ret(lr); 10216 10217 return start; 10218 } 10219 10220 // Support for spin waits. 
10221 address generate_spin_wait() { 10222 __ align(CodeEntryAlignment); 10223 StubId stub_id = StubId::stubgen_spin_wait_id; 10224 StubCodeMark mark(this, stub_id); 10225 address start = __ pc(); 10226 10227 __ spin_wait(); 10228 __ ret(lr); 10229 10230 return start; 10231 } 10232 10233 void generate_lookup_secondary_supers_table_stub() { 10234 StubId stub_id = StubId::stubgen_lookup_secondary_supers_table_id; 10235 StubCodeMark mark(this, stub_id); 10236 10237 const Register 10238 r_super_klass = r0, 10239 r_array_base = r1, 10240 r_array_length = r2, 10241 r_array_index = r3, 10242 r_sub_klass = r4, 10243 r_bitmap = rscratch2, 10244 result = r5; 10245 const FloatRegister 10246 vtemp = v0; 10247 10248 for (int slot = 0; slot < Klass::SECONDARY_SUPERS_TABLE_SIZE; slot++) { 10249 StubRoutines::_lookup_secondary_supers_table_stubs[slot] = __ pc(); 10250 Label L_success; 10251 __ enter(); 10252 __ lookup_secondary_supers_table_const(r_sub_klass, r_super_klass, 10253 r_array_base, r_array_length, r_array_index, 10254 vtemp, result, slot, 10255 /*stub_is_near*/true); 10256 __ leave(); 10257 __ ret(lr); 10258 } 10259 } 10260 10261 // Slow path implementation for UseSecondarySupersTable. 10262 address generate_lookup_secondary_supers_table_slow_path_stub() { 10263 StubId stub_id = StubId::stubgen_lookup_secondary_supers_table_slow_path_id; 10264 StubCodeMark mark(this, stub_id); 10265 10266 address start = __ pc(); 10267 const Register 10268 r_super_klass = r0, // argument 10269 r_array_base = r1, // argument 10270 temp1 = r2, // temp 10271 r_array_index = r3, // argument 10272 r_bitmap = rscratch2, // argument 10273 result = r5; // argument 10274 10275 __ lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base, r_array_index, r_bitmap, temp1, result); 10276 __ ret(lr); 10277 10278 return start; 10279 } 10280 10281 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS) 10282 10283 // ARMv8.1 LSE versions of the atomic stubs used by Atomic::PlatformXX. 10284 // 10285 // If LSE is in use, generate LSE versions of all the stubs. The 10286 // non-LSE versions are in atomic_aarch64.S. 10287 10288 // class AtomicStubMark records the entry point of a stub and the 10289 // stub pointer which will point to it. The stub pointer is set to 10290 // the entry point when ~AtomicStubMark() is called, which must be 10291 // after ICache::invalidate_range. This ensures safe publication of 10292 // the generated code. 10293 class AtomicStubMark { 10294 address _entry_point; 10295 aarch64_atomic_stub_t *_stub; 10296 MacroAssembler *_masm; 10297 public: 10298 AtomicStubMark(MacroAssembler *masm, aarch64_atomic_stub_t *stub) { 10299 _masm = masm; 10300 __ align(32); 10301 _entry_point = __ pc(); 10302 _stub = stub; 10303 } 10304 ~AtomicStubMark() { 10305 *_stub = (aarch64_atomic_stub_t)_entry_point; 10306 } 10307 }; 10308 10309 // NB: For memory_order_conservative we need a trailing membar after 10310 // LSE atomic operations but not a leading membar. 10311 // 10312 // We don't need a leading membar because a clause in the Arm ARM 10313 // says: 10314 // 10315 // Barrier-ordered-before 10316 // 10317 // Barrier instructions order prior Memory effects before subsequent 10318 // Memory effects generated by the same Observer. A read or a write 10319 // RW1 is Barrier-ordered-before a read or a write RW 2 from the same 10320 // Observer if and only if RW1 appears in program order before RW 2 10321 // and [ ... 
] at least one of RW 1 and RW 2 is generated by an atomic 10322 // instruction with both Acquire and Release semantics. 10323 // 10324 // All the atomic instructions {ldaddal, swapal, casal} have Acquire 10325 // and Release semantics, therefore we don't need a leading 10326 // barrier. However, there is no corresponding Barrier-ordered-after 10327 // relationship, therefore we need a trailing membar to prevent a 10328 // later store or load from being reordered with the store in an 10329 // atomic instruction. 10330 // 10331 // This was checked by using the herd7 consistency model simulator 10332 // (http://diy.inria.fr/) with this test case: 10333 // 10334 // AArch64 LseCas 10335 // { 0:X1=x; 0:X2=y; 1:X1=x; 1:X2=y; } 10336 // P0 | P1; 10337 // LDR W4, [X2] | MOV W3, #0; 10338 // DMB LD | MOV W4, #1; 10339 // LDR W3, [X1] | CASAL W3, W4, [X1]; 10340 // | DMB ISH; 10341 // | STR W4, [X2]; 10342 // exists 10343 // (0:X3=0 /\ 0:X4=1) 10344 // 10345 // If X3 == 0 && X4 == 1, the store to y in P1 has been reordered 10346 // with the store to x in P1. Without the DMB in P1 this may happen. 10347 // 10348 // At the time of writing we don't know of any AArch64 hardware that 10349 // reorders stores in this way, but the Reference Manual permits it. 10350 10351 void gen_cas_entry(Assembler::operand_size size, 10352 atomic_memory_order order) { 10353 Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1, 10354 exchange_val = c_rarg2; 10355 bool acquire, release; 10356 switch (order) { 10357 case memory_order_relaxed: 10358 acquire = false; 10359 release = false; 10360 break; 10361 case memory_order_release: 10362 acquire = false; 10363 release = true; 10364 break; 10365 default: 10366 acquire = true; 10367 release = true; 10368 break; 10369 } 10370 __ mov(prev, compare_val); 10371 __ lse_cas(prev, exchange_val, ptr, size, acquire, release, /*not_pair*/true); 10372 if (order == memory_order_conservative) { 10373 __ membar(Assembler::StoreStore|Assembler::StoreLoad); 10374 } 10375 if (size == Assembler::xword) { 10376 __ mov(r0, prev); 10377 } else { 10378 __ movw(r0, prev); 10379 } 10380 __ ret(lr); 10381 } 10382 10383 void gen_ldadd_entry(Assembler::operand_size size, atomic_memory_order order) { 10384 Register prev = r2, addr = c_rarg0, incr = c_rarg1; 10385 // If not relaxed, then default to conservative. Relaxed is the only 10386 // case we use enough to be worth specializing. 10387 if (order == memory_order_relaxed) { 10388 __ ldadd(size, incr, prev, addr); 10389 } else { 10390 __ ldaddal(size, incr, prev, addr); 10391 __ membar(Assembler::StoreStore|Assembler::StoreLoad); 10392 } 10393 if (size == Assembler::xword) { 10394 __ mov(r0, prev); 10395 } else { 10396 __ movw(r0, prev); 10397 } 10398 __ ret(lr); 10399 } 10400 10401 void gen_swpal_entry(Assembler::operand_size size) { 10402 Register prev = r2, addr = c_rarg0, incr = c_rarg1; 10403 __ swpal(size, incr, prev, addr); 10404 __ membar(Assembler::StoreStore|Assembler::StoreLoad); 10405 if (size == Assembler::xword) { 10406 __ mov(r0, prev); 10407 } else { 10408 __ movw(r0, prev); 10409 } 10410 __ ret(lr); 10411 } 10412 10413 void generate_atomic_entry_points() { 10414 if (! 
UseLSE) { 10415 return; 10416 } 10417 __ align(CodeEntryAlignment); 10418 StubId stub_id = StubId::stubgen_atomic_entry_points_id; 10419 StubCodeMark mark(this, stub_id); 10420 address first_entry = __ pc(); 10421 10422 // ADD, memory_order_conservative 10423 AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl); 10424 gen_ldadd_entry(Assembler::word, memory_order_conservative); 10425 AtomicStubMark mark_fetch_add_8(_masm, &aarch64_atomic_fetch_add_8_impl); 10426 gen_ldadd_entry(Assembler::xword, memory_order_conservative); 10427 10428 // ADD, memory_order_relaxed 10429 AtomicStubMark mark_fetch_add_4_relaxed 10430 (_masm, &aarch64_atomic_fetch_add_4_relaxed_impl); 10431 gen_ldadd_entry(MacroAssembler::word, memory_order_relaxed); 10432 AtomicStubMark mark_fetch_add_8_relaxed 10433 (_masm, &aarch64_atomic_fetch_add_8_relaxed_impl); 10434 gen_ldadd_entry(MacroAssembler::xword, memory_order_relaxed); 10435 10436 // XCHG, memory_order_conservative 10437 AtomicStubMark mark_xchg_4(_masm, &aarch64_atomic_xchg_4_impl); 10438 gen_swpal_entry(Assembler::word); 10439 AtomicStubMark mark_xchg_8_impl(_masm, &aarch64_atomic_xchg_8_impl); 10440 gen_swpal_entry(Assembler::xword); 10441 10442 // CAS, memory_order_conservative 10443 AtomicStubMark mark_cmpxchg_1(_masm, &aarch64_atomic_cmpxchg_1_impl); 10444 gen_cas_entry(MacroAssembler::byte, memory_order_conservative); 10445 AtomicStubMark mark_cmpxchg_4(_masm, &aarch64_atomic_cmpxchg_4_impl); 10446 gen_cas_entry(MacroAssembler::word, memory_order_conservative); 10447 AtomicStubMark mark_cmpxchg_8(_masm, &aarch64_atomic_cmpxchg_8_impl); 10448 gen_cas_entry(MacroAssembler::xword, memory_order_conservative); 10449 10450 // CAS, memory_order_relaxed 10451 AtomicStubMark mark_cmpxchg_1_relaxed 10452 (_masm, &aarch64_atomic_cmpxchg_1_relaxed_impl); 10453 gen_cas_entry(MacroAssembler::byte, memory_order_relaxed); 10454 AtomicStubMark mark_cmpxchg_4_relaxed 10455 (_masm, &aarch64_atomic_cmpxchg_4_relaxed_impl); 10456 gen_cas_entry(MacroAssembler::word, memory_order_relaxed); 10457 AtomicStubMark mark_cmpxchg_8_relaxed 10458 (_masm, &aarch64_atomic_cmpxchg_8_relaxed_impl); 10459 gen_cas_entry(MacroAssembler::xword, memory_order_relaxed); 10460 10461 AtomicStubMark mark_cmpxchg_4_release 10462 (_masm, &aarch64_atomic_cmpxchg_4_release_impl); 10463 gen_cas_entry(MacroAssembler::word, memory_order_release); 10464 AtomicStubMark mark_cmpxchg_8_release 10465 (_masm, &aarch64_atomic_cmpxchg_8_release_impl); 10466 gen_cas_entry(MacroAssembler::xword, memory_order_release); 10467 10468 AtomicStubMark mark_cmpxchg_4_seq_cst 10469 (_masm, &aarch64_atomic_cmpxchg_4_seq_cst_impl); 10470 gen_cas_entry(MacroAssembler::word, memory_order_seq_cst); 10471 AtomicStubMark mark_cmpxchg_8_seq_cst 10472 (_masm, &aarch64_atomic_cmpxchg_8_seq_cst_impl); 10473 gen_cas_entry(MacroAssembler::xword, memory_order_seq_cst); 10474 10475 ICache::invalidate_range(first_entry, __ pc() - first_entry); 10476 } 10477 #endif // LINUX 10478 10479 address generate_cont_thaw(Continuation::thaw_kind kind) { 10480 bool return_barrier = Continuation::is_thaw_return_barrier(kind); 10481 bool return_barrier_exception = Continuation::is_thaw_return_barrier_exception(kind); 10482 10483 address start = __ pc(); 10484 10485 if (return_barrier) { 10486 __ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())); 10487 __ mov(sp, rscratch1); 10488 } 10489 assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, 
"incorrect sp"); 10490 10491 if (return_barrier) { 10492 // preserve possible return value from a method returning to the return barrier 10493 __ fmovd(rscratch1, v0); 10494 __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize))); 10495 } 10496 10497 __ movw(c_rarg1, (return_barrier ? 1 : 0)); 10498 __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), rthread, c_rarg1); 10499 __ mov(rscratch2, r0); // r0 contains the size of the frames to thaw, 0 if overflow or no more frames 10500 10501 if (return_barrier) { 10502 // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK) 10503 __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize))); 10504 __ fmovd(v0, rscratch1); 10505 } 10506 assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp"); 10507 10508 10509 Label thaw_success; 10510 // rscratch2 contains the size of the frames to thaw, 0 if overflow or no more frames 10511 __ cbnz(rscratch2, thaw_success); 10512 __ lea(rscratch1, RuntimeAddress(SharedRuntime::throw_StackOverflowError_entry())); 10513 __ br(rscratch1); 10514 __ bind(thaw_success); 10515 10516 // make room for the thawed frames 10517 __ sub(rscratch1, sp, rscratch2); 10518 __ andr(rscratch1, rscratch1, -16); // align 10519 __ mov(sp, rscratch1); 10520 10521 if (return_barrier) { 10522 // save original return value -- again 10523 __ fmovd(rscratch1, v0); 10524 __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize))); 10525 } 10526 10527 // If we want, we can templatize thaw by kind, and have three different entries 10528 __ movw(c_rarg1, (uint32_t)kind); 10529 10530 __ call_VM_leaf(Continuation::thaw_entry(), rthread, c_rarg1); 10531 __ mov(rscratch2, r0); // r0 is the sp of the yielding frame 10532 10533 if (return_barrier) { 10534 // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK) 10535 __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize))); 10536 __ fmovd(v0, rscratch1); 10537 } else { 10538 __ mov(r0, zr); // return 0 (success) from doYield 10539 } 10540 10541 // we're now on the yield frame (which is in an address above us b/c rsp has been pushed down) 10542 __ sub(sp, rscratch2, 2*wordSize); // now pointing to rfp spill 10543 __ mov(rfp, sp); 10544 10545 if (return_barrier_exception) { 10546 __ ldr(c_rarg1, Address(rfp, wordSize)); // return address 10547 __ authenticate_return_address(c_rarg1); 10548 __ verify_oop(r0); 10549 // save return value containing the exception oop in callee-saved R19 10550 __ mov(r19, r0); 10551 10552 __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), rthread, c_rarg1); 10553 10554 // Reinitialize the ptrue predicate register, in case the external runtime call clobbers ptrue reg, as we may return to SVE compiled code. 
10555 // __ reinitialize_ptrue(); 10556 10557 // see OptoRuntime::generate_exception_blob: r0 -- exception oop, r3 -- exception pc 10558 10559 __ mov(r1, r0); // the exception handler 10560 __ mov(r0, r19); // restore return value containing the exception oop 10561 __ verify_oop(r0); 10562 10563 __ leave(); 10564 __ mov(r3, lr); 10565 __ br(r1); // the exception handler 10566 } else { 10567 // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame 10568 __ leave(); 10569 __ ret(lr); 10570 } 10571 10572 return start; 10573 } 10574 10575 address generate_cont_thaw() { 10576 if (!Continuations::enabled()) return nullptr; 10577 10578 StubId stub_id = StubId::stubgen_cont_thaw_id; 10579 StubCodeMark mark(this, stub_id); 10580 address start = __ pc(); 10581 generate_cont_thaw(Continuation::thaw_top); 10582 return start; 10583 } 10584 10585 address generate_cont_returnBarrier() { 10586 if (!Continuations::enabled()) return nullptr; 10587 10588 // TODO: will probably need multiple return barriers depending on return type 10589 StubId stub_id = StubId::stubgen_cont_returnBarrier_id; 10590 StubCodeMark mark(this, stub_id); 10591 address start = __ pc(); 10592 10593 generate_cont_thaw(Continuation::thaw_return_barrier); 10594 10595 return start; 10596 } 10597 10598 address generate_cont_returnBarrier_exception() { 10599 if (!Continuations::enabled()) return nullptr; 10600 10601 StubId stub_id = StubId::stubgen_cont_returnBarrierExc_id; 10602 StubCodeMark mark(this, stub_id); 10603 address start = __ pc(); 10604 10605 generate_cont_thaw(Continuation::thaw_return_barrier_exception); 10606 10607 return start; 10608 } 10609 10610 address generate_cont_preempt_stub() { 10611 if (!Continuations::enabled()) return nullptr; 10612 StubId stub_id = StubId::stubgen_cont_preempt_id; 10613 StubCodeMark mark(this, stub_id); 10614 address start = __ pc(); 10615 10616 __ reset_last_Java_frame(true); 10617 10618 // Set sp to enterSpecial frame, i.e. remove all frames copied into the heap. 10619 __ ldr(rscratch2, Address(rthread, JavaThread::cont_entry_offset())); 10620 __ mov(sp, rscratch2); 10621 10622 Label preemption_cancelled; 10623 __ ldrb(rscratch1, Address(rthread, JavaThread::preemption_cancelled_offset())); 10624 __ cbnz(rscratch1, preemption_cancelled); 10625 10626 // Remove enterSpecial frame from the stack and return to Continuation.run() to unmount. 10627 SharedRuntime::continuation_enter_cleanup(_masm); 10628 __ leave(); 10629 __ ret(lr); 10630 10631 // We acquired the monitor after freezing the frames so call thaw to continue execution. 10632 __ bind(preemption_cancelled); 10633 __ strb(zr, Address(rthread, JavaThread::preemption_cancelled_offset())); 10634 __ lea(rfp, Address(sp, checked_cast<int32_t>(ContinuationEntry::size()))); 10635 __ lea(rscratch1, ExternalAddress(ContinuationEntry::thaw_call_pc_address())); 10636 __ ldr(rscratch1, Address(rscratch1)); 10637 __ br(rscratch1); 10638 10639 return start; 10640 } 10641 10642 // In sun.security.util.math.intpoly.IntegerPolynomial1305, integers 10643 // are represented as long[5], with BITS_PER_LIMB = 26. 10644 // Pack five 26-bit limbs into three 64-bit registers. 
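  // In C, approximately (illustrative only; l[] holds the five 26-bit
  // limbs, and the packed value is dest2:dest1:dest0, least significant
  // word first):
  //
  //   dest0 =  l[0]       | l[1] << 26 | l[2] << 52;  // bits   0..63
  //   dest1 =  l[2] >> 12 | l[3] << 14 | l[4] << 40;  // bits  64..127
  //   dest2 =  l[4] >> 24;                            // bits 128..129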
10645 void pack_26(Register dest0, Register dest1, Register dest2, Register src) { 10646 __ ldp(dest0, rscratch1, Address(src, 0)); // 26 bits 10647 __ add(dest0, dest0, rscratch1, Assembler::LSL, 26); // 26 bits 10648 __ ldp(rscratch1, rscratch2, Address(src, 2 * sizeof (jlong))); 10649 __ add(dest0, dest0, rscratch1, Assembler::LSL, 52); // 12 bits 10650 10651 __ add(dest1, zr, rscratch1, Assembler::LSR, 12); // 14 bits 10652 __ add(dest1, dest1, rscratch2, Assembler::LSL, 14); // 26 bits 10653 __ ldr(rscratch1, Address(src, 4 * sizeof (jlong))); 10654 __ add(dest1, dest1, rscratch1, Assembler::LSL, 40); // 24 bits 10655 10656 if (dest2->is_valid()) { 10657 __ add(dest2, zr, rscratch1, Assembler::LSR, 24); // 2 bits 10658 } else { 10659 #ifdef ASSERT 10660 Label OK; 10661 __ cmp(zr, rscratch1, Assembler::LSR, 24); // 2 bits 10662 __ br(__ EQ, OK); 10663 __ stop("high bits of Poly1305 integer should be zero"); 10664 __ should_not_reach_here(); 10665 __ bind(OK); 10666 #endif 10667 } 10668 } 10669 10670 // As above, but return only a 128-bit integer, packed into two 10671 // 64-bit registers. 10672 void pack_26(Register dest0, Register dest1, Register src) { 10673 pack_26(dest0, dest1, noreg, src); 10674 } 10675 10676 // Multiply and multiply-accumulate unsigned 64-bit registers. 10677 void wide_mul(Register prod_lo, Register prod_hi, Register n, Register m) { 10678 __ mul(prod_lo, n, m); 10679 __ umulh(prod_hi, n, m); 10680 } 10681 void wide_madd(Register sum_lo, Register sum_hi, Register n, Register m) { 10682 wide_mul(rscratch1, rscratch2, n, m); 10683 __ adds(sum_lo, sum_lo, rscratch1); 10684 __ adc(sum_hi, sum_hi, rscratch2); 10685 } 10686 10687 // Poly1305, RFC 7539 10688 10689 // See https://loup-vaillant.fr/tutorials/poly1305-design for a 10690 // description of the tricks used to simplify and accelerate this 10691 // computation. 10692 10693 address generate_poly1305_processBlocks() { 10694 __ align(CodeEntryAlignment); 10695 StubId stub_id = StubId::stubgen_poly1305_processBlocks_id; 10696 StubCodeMark mark(this, stub_id); 10697 address start = __ pc(); 10698 Label here; 10699 __ enter(); 10700 RegSet callee_saved = RegSet::range(r19, r28); 10701 __ push(callee_saved, sp); 10702 10703 RegSetIterator<Register> regs = (RegSet::range(c_rarg0, r28) - r18_tls - rscratch1 - rscratch2).begin(); 10704 10705 // Arguments 10706 const Register input_start = *regs, length = *++regs, acc_start = *++regs, r_start = *++regs; 10707 10708 // R_n is the 128-bit randomly-generated key, packed into two 10709 // registers. The caller passes this key to us as long[5], with 10710 // BITS_PER_LIMB = 26. 
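    // Note: the key r is "clamped" as required by RFC 7539, so the top
    // four bits of R_0 and R_1 and the bottom two bits of R_1 are zero.
    // Together with the identity 2^130 == 5 (mod 2^130 - 5), this is what
    // makes the RR_n = (R_n >> 2) * 5 trick below exact: partial products
    // that land at bit 128 or above are folded back down by multiplying
    // by 5/4 (safe for the R_1 terms because R_1 is a multiple of 4),
    // while the two low bits of R_0 that cannot be divided out are
    // accumulated separately in U_2.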
10711 const Register R_0 = *++regs, R_1 = *++regs; 10712 pack_26(R_0, R_1, r_start); 10713 10714 // RR_n is (R_n >> 2) * 5 10715 const Register RR_0 = *++regs, RR_1 = *++regs; 10716 __ lsr(RR_0, R_0, 2); 10717 __ add(RR_0, RR_0, RR_0, Assembler::LSL, 2); 10718 __ lsr(RR_1, R_1, 2); 10719 __ add(RR_1, RR_1, RR_1, Assembler::LSL, 2); 10720 10721 // U_n is the current checksum 10722 const Register U_0 = *++regs, U_1 = *++regs, U_2 = *++regs; 10723 pack_26(U_0, U_1, U_2, acc_start); 10724 10725 static constexpr int BLOCK_LENGTH = 16; 10726 Label DONE, LOOP; 10727 10728 __ cmp(length, checked_cast<u1>(BLOCK_LENGTH)); 10729 __ br(Assembler::LT, DONE); { 10730 __ bind(LOOP); 10731 10732 // S_n is to be the sum of U_n and the next block of data 10733 const Register S_0 = *++regs, S_1 = *++regs, S_2 = *++regs; 10734 __ ldp(S_0, S_1, __ post(input_start, 2 * wordSize)); 10735 __ adds(S_0, U_0, S_0); 10736 __ adcs(S_1, U_1, S_1); 10737 __ adc(S_2, U_2, zr); 10738 __ add(S_2, S_2, 1); 10739 10740 const Register U_0HI = *++regs, U_1HI = *++regs; 10741 10742 // NB: this logic depends on some of the special properties of 10743 // Poly1305 keys. In particular, because we know that the top 10744 // four bits of R_0 and R_1 are zero, we can add together 10745 // partial products without any risk of needing to propagate a 10746 // carry out. 10747 wide_mul(U_0, U_0HI, S_0, R_0); wide_madd(U_0, U_0HI, S_1, RR_1); wide_madd(U_0, U_0HI, S_2, RR_0); 10748 wide_mul(U_1, U_1HI, S_0, R_1); wide_madd(U_1, U_1HI, S_1, R_0); wide_madd(U_1, U_1HI, S_2, RR_1); 10749 __ andr(U_2, R_0, 3); 10750 __ mul(U_2, S_2, U_2); 10751 10752 // Recycle registers S_0, S_1, S_2 10753 regs = (regs.remaining() + S_0 + S_1 + S_2).begin(); 10754 10755 // Partial reduction mod 2**130 - 5 10756 __ adds(U_1, U_0HI, U_1); 10757 __ adc(U_2, U_1HI, U_2); 10758 // Sum now in U_2:U_1:U_0. 10759 // Dead: U_0HI, U_1HI. 10760 regs = (regs.remaining() + U_0HI + U_1HI).begin(); 10761 10762 // U_2:U_1:U_0 += (U_2 >> 2) * 5 in two steps 10763 10764 // First, U_2:U_1:U_0 += (U_2 >> 2) 10765 __ lsr(rscratch1, U_2, 2); 10766 __ andr(U_2, U_2, (u8)3); 10767 __ adds(U_0, U_0, rscratch1); 10768 __ adcs(U_1, U_1, zr); 10769 __ adc(U_2, U_2, zr); 10770 // Second, U_2:U_1:U_0 += (U_2 >> 2) << 2 10771 __ adds(U_0, U_0, rscratch1, Assembler::LSL, 2); 10772 __ adcs(U_1, U_1, zr); 10773 __ adc(U_2, U_2, zr); 10774 10775 __ sub(length, length, checked_cast<u1>(BLOCK_LENGTH)); 10776 __ cmp(length, checked_cast<u1>(BLOCK_LENGTH)); 10777 __ br(~ Assembler::LT, LOOP); 10778 } 10779 10780 // Further reduce modulo 2^130 - 5 10781 __ lsr(rscratch1, U_2, 2); 10782 __ add(rscratch1, rscratch1, rscratch1, Assembler::LSL, 2); // rscratch1 = U_2 * 5 10783 __ adds(U_0, U_0, rscratch1); // U_0 += U_2 * 5 10784 __ adcs(U_1, U_1, zr); 10785 __ andr(U_2, U_2, (u1)3); 10786 __ adc(U_2, U_2, zr); 10787 10788 // Unpack the sum into five 26-bit limbs and write to memory. 
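    // In C, approximately (illustrative only; the inverse of pack_26):
    //
    //   acc[0] =  U_0        & ((1 << 26) - 1);
    //   acc[1] = (U_0 >> 26) & ((1 << 26) - 1);
    //   acc[2] = (U_0 >> 52) | (U_1 & 0x3fff) << 12;
    //   acc[3] = (U_1 >> 14) & ((1 << 26) - 1);
    //   acc[4] = (U_1 >> 40) | (U_2 & 0x7)    << 24;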
10789 __ ubfiz(rscratch1, U_0, 0, 26); 10790 __ ubfx(rscratch2, U_0, 26, 26); 10791 __ stp(rscratch1, rscratch2, Address(acc_start)); 10792 __ ubfx(rscratch1, U_0, 52, 12); 10793 __ bfi(rscratch1, U_1, 12, 14); 10794 __ ubfx(rscratch2, U_1, 14, 26); 10795 __ stp(rscratch1, rscratch2, Address(acc_start, 2 * sizeof (jlong))); 10796 __ ubfx(rscratch1, U_1, 40, 24); 10797 __ bfi(rscratch1, U_2, 24, 3); 10798 __ str(rscratch1, Address(acc_start, 4 * sizeof (jlong))); 10799 10800 __ bind(DONE); 10801 __ pop(callee_saved, sp); 10802 __ leave(); 10803 __ ret(lr); 10804 10805 return start; 10806 } 10807 10808 // exception handler for upcall stubs 10809 address generate_upcall_stub_exception_handler() { 10810 StubId stub_id = StubId::stubgen_upcall_stub_exception_handler_id; 10811 StubCodeMark mark(this, stub_id); 10812 address start = __ pc(); 10813 10814 // Native caller has no idea how to handle exceptions, 10815 // so we just crash here. Up to callee to catch exceptions. 10816 __ verify_oop(r0); 10817 __ movptr(rscratch1, CAST_FROM_FN_PTR(uint64_t, UpcallLinker::handle_uncaught_exception)); 10818 __ blr(rscratch1); 10819 __ should_not_reach_here(); 10820 10821 return start; 10822 } 10823 10824 // load Method* target of MethodHandle 10825 // j_rarg0 = jobject receiver 10826 // rmethod = result 10827 address generate_upcall_stub_load_target() { 10828 StubId stub_id = StubId::stubgen_upcall_stub_load_target_id; 10829 StubCodeMark mark(this, stub_id); 10830 address start = __ pc(); 10831 10832 __ resolve_global_jobject(j_rarg0, rscratch1, rscratch2); 10833 // Load target method from receiver 10834 __ load_heap_oop(rmethod, Address(j_rarg0, java_lang_invoke_MethodHandle::form_offset()), rscratch1, rscratch2); 10835 __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_LambdaForm::vmentry_offset()), rscratch1, rscratch2); 10836 __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_MemberName::method_offset()), rscratch1, rscratch2); 10837 __ access_load_at(T_ADDRESS, IN_HEAP, rmethod, 10838 Address(rmethod, java_lang_invoke_ResolvedMethodName::vmtarget_offset()), 10839 noreg, noreg); 10840 __ str(rmethod, Address(rthread, JavaThread::callee_target_offset())); // just in case callee is deoptimized 10841 10842 __ ret(lr); 10843 10844 return start; 10845 } 10846 10847 #undef __ 10848 #define __ masm-> 10849 10850 class MontgomeryMultiplyGenerator : public MacroAssembler { 10851 10852 Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn, 10853 Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj; 10854 10855 RegSet _toSave; 10856 bool _squaring; 10857 10858 public: 10859 MontgomeryMultiplyGenerator (Assembler *as, bool squaring) 10860 : MacroAssembler(as->code()), _squaring(squaring) { 10861 10862 // Register allocation 10863 10864 RegSetIterator<Register> regs = (RegSet::range(r0, r26) - r18_tls).begin(); 10865 Pa_base = *regs; // Argument registers 10866 if (squaring) 10867 Pb_base = Pa_base; 10868 else 10869 Pb_base = *++regs; 10870 Pn_base = *++regs; 10871 Rlen= *++regs; 10872 inv = *++regs; 10873 Pm_base = *++regs; 10874 10875 // Working registers: 10876 Ra = *++regs; // The current digit of a, b, n, and m. 10877 Rb = *++regs; 10878 Rm = *++regs; 10879 Rn = *++regs; 10880 10881 Pa = *++regs; // Pointers to the current/next digit of a, b, n, and m. 10882 Pb = *++regs; 10883 Pm = *++regs; 10884 Pn = *++regs; 10885 10886 t0 = *++regs; // Three registers which form a 10887 t1 = *++regs; // triple-precision accumuator. 
10888 t2 = *++regs; 10889 10890 Ri = *++regs; // Inner and outer loop indexes. 10891 Rj = *++regs; 10892 10893 Rhi_ab = *++regs; // Product registers: low and high parts 10894 Rlo_ab = *++regs; // of a*b and m*n. 10895 Rhi_mn = *++regs; 10896 Rlo_mn = *++regs; 10897 10898 // r19 and up are callee-saved. 10899 _toSave = RegSet::range(r19, *regs) + Pm_base; 10900 } 10901 10902 private: 10903 void save_regs() { 10904 push(_toSave, sp); 10905 } 10906 10907 void restore_regs() { 10908 pop(_toSave, sp); 10909 } 10910 10911 template <typename T> 10912 void unroll_2(Register count, T block) { 10913 Label loop, end, odd; 10914 tbnz(count, 0, odd); 10915 cbz(count, end); 10916 align(16); 10917 bind(loop); 10918 (this->*block)(); 10919 bind(odd); 10920 (this->*block)(); 10921 subs(count, count, 2); 10922 br(Assembler::GT, loop); 10923 bind(end); 10924 } 10925 10926 template <typename T> 10927 void unroll_2(Register count, T block, Register d, Register s, Register tmp) { 10928 Label loop, end, odd; 10929 tbnz(count, 0, odd); 10930 cbz(count, end); 10931 align(16); 10932 bind(loop); 10933 (this->*block)(d, s, tmp); 10934 bind(odd); 10935 (this->*block)(d, s, tmp); 10936 subs(count, count, 2); 10937 br(Assembler::GT, loop); 10938 bind(end); 10939 } 10940 10941 void pre1(RegisterOrConstant i) { 10942 block_comment("pre1"); 10943 // Pa = Pa_base; 10944 // Pb = Pb_base + i; 10945 // Pm = Pm_base; 10946 // Pn = Pn_base + i; 10947 // Ra = *Pa; 10948 // Rb = *Pb; 10949 // Rm = *Pm; 10950 // Rn = *Pn; 10951 ldr(Ra, Address(Pa_base)); 10952 ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); 10953 ldr(Rm, Address(Pm_base)); 10954 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 10955 lea(Pa, Address(Pa_base)); 10956 lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); 10957 lea(Pm, Address(Pm_base)); 10958 lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 10959 10960 // Zero the m*n result. 10961 mov(Rhi_mn, zr); 10962 mov(Rlo_mn, zr); 10963 } 10964 10965 // The core multiply-accumulate step of a Montgomery 10966 // multiplication. The idea is to schedule operations as a 10967 // pipeline so that instructions with long latencies (loads and 10968 // multiplies) have time to complete before their results are 10969 // used. This most benefits in-order implementations of the 10970 // architecture but out-of-order ones also benefit. 10971 void step() { 10972 block_comment("step"); 10973 // MACC(Ra, Rb, t0, t1, t2); 10974 // Ra = *++Pa; 10975 // Rb = *--Pb; 10976 umulh(Rhi_ab, Ra, Rb); 10977 mul(Rlo_ab, Ra, Rb); 10978 ldr(Ra, pre(Pa, wordSize)); 10979 ldr(Rb, pre(Pb, -wordSize)); 10980 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the 10981 // previous iteration. 
10982 // MACC(Rm, Rn, t0, t1, t2); 10983 // Rm = *++Pm; 10984 // Rn = *--Pn; 10985 umulh(Rhi_mn, Rm, Rn); 10986 mul(Rlo_mn, Rm, Rn); 10987 ldr(Rm, pre(Pm, wordSize)); 10988 ldr(Rn, pre(Pn, -wordSize)); 10989 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 10990 } 10991 10992 void post1() { 10993 block_comment("post1"); 10994 10995 // MACC(Ra, Rb, t0, t1, t2); 10996 // Ra = *++Pa; 10997 // Rb = *--Pb; 10998 umulh(Rhi_ab, Ra, Rb); 10999 mul(Rlo_ab, Ra, Rb); 11000 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 11001 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 11002 11003 // *Pm = Rm = t0 * inv; 11004 mul(Rm, t0, inv); 11005 str(Rm, Address(Pm)); 11006 11007 // MACC(Rm, Rn, t0, t1, t2); 11008 // t0 = t1; t1 = t2; t2 = 0; 11009 umulh(Rhi_mn, Rm, Rn); 11010 11011 #ifndef PRODUCT 11012 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 11013 { 11014 mul(Rlo_mn, Rm, Rn); 11015 add(Rlo_mn, t0, Rlo_mn); 11016 Label ok; 11017 cbz(Rlo_mn, ok); { 11018 stop("broken Montgomery multiply"); 11019 } bind(ok); 11020 } 11021 #endif 11022 // We have very carefully set things up so that 11023 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 11024 // the lower half of Rm * Rn because we know the result already: 11025 // it must be -t0. t0 + (-t0) must generate a carry iff 11026 // t0 != 0. So, rather than do a mul and an adds we just set 11027 // the carry flag iff t0 is nonzero. 11028 // 11029 // mul(Rlo_mn, Rm, Rn); 11030 // adds(zr, t0, Rlo_mn); 11031 subs(zr, t0, 1); // Set carry iff t0 is nonzero 11032 adcs(t0, t1, Rhi_mn); 11033 adc(t1, t2, zr); 11034 mov(t2, zr); 11035 } 11036 11037 void pre2(RegisterOrConstant i, RegisterOrConstant len) { 11038 block_comment("pre2"); 11039 // Pa = Pa_base + i-len; 11040 // Pb = Pb_base + len; 11041 // Pm = Pm_base + i-len; 11042 // Pn = Pn_base + len; 11043 11044 if (i.is_register()) { 11045 sub(Rj, i.as_register(), len); 11046 } else { 11047 mov(Rj, i.as_constant()); 11048 sub(Rj, Rj, len); 11049 } 11050 // Rj == i-len 11051 11052 lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord))); 11053 lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord))); 11054 lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 11055 lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord))); 11056 11057 // Ra = *++Pa; 11058 // Rb = *--Pb; 11059 // Rm = *++Pm; 11060 // Rn = *--Pn; 11061 ldr(Ra, pre(Pa, wordSize)); 11062 ldr(Rb, pre(Pb, -wordSize)); 11063 ldr(Rm, pre(Pm, wordSize)); 11064 ldr(Rn, pre(Pn, -wordSize)); 11065 11066 mov(Rhi_mn, zr); 11067 mov(Rlo_mn, zr); 11068 } 11069 11070 void post2(RegisterOrConstant i, RegisterOrConstant len) { 11071 block_comment("post2"); 11072 if (i.is_constant()) { 11073 mov(Rj, i.as_constant()-len.as_constant()); 11074 } else { 11075 sub(Rj, i.as_register(), len); 11076 } 11077 11078 adds(t0, t0, Rlo_mn); // The pending m*n, low part 11079 11080 // As soon as we know the least significant digit of our result, 11081 // store it. 11082 // Pm_base[i-len] = t0; 11083 str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 11084 11085 // t0 = t1; t1 = t2; t2 = 0; 11086 adcs(t0, t1, Rhi_mn); // The pending m*n, high part 11087 adc(t1, t2, zr); 11088 mov(t2, zr); 11089 } 11090 11091 // A carry in t0 after Montgomery multiplication means that we 11092 // should subtract multiples of n from our result in m. We'll 11093 // keep doing that until there is no carry. 
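  // In C, approximately (illustrative only; the carry-flag borrow chain
  // used by the code below is written out explicitly here):
  //
  //   while (t0 != 0) {
  //     int borrow = 0;
  //     for (int i = 0; i < len; i++) {
  //       julong a = Pm_base[i], b = Pn_base[i];
  //       Pm_base[i] = a - b - borrow;
  //       borrow = (a < b) || (a == b && borrow);  // did the subtraction wrap?
  //     }
  //     t0 -= borrow;
  //   }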
11094 void normalize(RegisterOrConstant len) { 11095 block_comment("normalize"); 11096 // while (t0) 11097 // t0 = sub(Pm_base, Pn_base, t0, len); 11098 Label loop, post, again; 11099 Register cnt = t1, i = t2; // Re-use registers; we're done with them now 11100 cbz(t0, post); { 11101 bind(again); { 11102 mov(i, zr); 11103 mov(cnt, len); 11104 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 11105 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 11106 subs(zr, zr, zr); // set carry flag, i.e. no borrow 11107 align(16); 11108 bind(loop); { 11109 sbcs(Rm, Rm, Rn); 11110 str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 11111 add(i, i, 1); 11112 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 11113 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 11114 sub(cnt, cnt, 1); 11115 } cbnz(cnt, loop); 11116 sbc(t0, t0, zr); 11117 } cbnz(t0, again); 11118 } bind(post); 11119 } 11120 11121 // Move memory at s to d, reversing words. 11122 // Increments d to end of copied memory 11123 // Destroys tmp1, tmp2 11124 // Preserves len 11125 // Leaves s pointing to the address which was in d at start 11126 void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) { 11127 assert(tmp1->encoding() < r19->encoding(), "register corruption"); 11128 assert(tmp2->encoding() < r19->encoding(), "register corruption"); 11129 11130 lea(s, Address(s, len, Address::uxtw(LogBytesPerWord))); 11131 mov(tmp1, len); 11132 unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2); 11133 sub(s, d, len, ext::uxtw, LogBytesPerWord); 11134 } 11135 // where 11136 void reverse1(Register d, Register s, Register tmp) { 11137 ldr(tmp, pre(s, -wordSize)); 11138 ror(tmp, tmp, 32); 11139 str(tmp, post(d, wordSize)); 11140 } 11141 11142 void step_squaring() { 11143 // An extra ACC 11144 step(); 11145 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 11146 } 11147 11148 void last_squaring(RegisterOrConstant i) { 11149 Label dont; 11150 // if ((i & 1) == 0) { 11151 tbnz(i.as_register(), 0, dont); { 11152 // MACC(Ra, Rb, t0, t1, t2); 11153 // Ra = *++Pa; 11154 // Rb = *--Pb; 11155 umulh(Rhi_ab, Ra, Rb); 11156 mul(Rlo_ab, Ra, Rb); 11157 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 11158 } bind(dont); 11159 } 11160 11161 void extra_step_squaring() { 11162 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 11163 11164 // MACC(Rm, Rn, t0, t1, t2); 11165 // Rm = *++Pm; 11166 // Rn = *--Pn; 11167 umulh(Rhi_mn, Rm, Rn); 11168 mul(Rlo_mn, Rm, Rn); 11169 ldr(Rm, pre(Pm, wordSize)); 11170 ldr(Rn, pre(Pn, -wordSize)); 11171 } 11172 11173 void post1_squaring() { 11174 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 11175 11176 // *Pm = Rm = t0 * inv; 11177 mul(Rm, t0, inv); 11178 str(Rm, Address(Pm)); 11179 11180 // MACC(Rm, Rn, t0, t1, t2); 11181 // t0 = t1; t1 = t2; t2 = 0; 11182 umulh(Rhi_mn, Rm, Rn); 11183 11184 #ifndef PRODUCT 11185 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 11186 { 11187 mul(Rlo_mn, Rm, Rn); 11188 add(Rlo_mn, t0, Rlo_mn); 11189 Label ok; 11190 cbz(Rlo_mn, ok); { 11191 stop("broken Montgomery multiply"); 11192 } bind(ok); 11193 } 11194 #endif 11195 // We have very carefully set things up so that 11196 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 11197 // the lower half of Rm * Rn because we know the result already: 11198 // it must be -t0. t0 + (-t0) must generate a carry iff 11199 // t0 != 0. So, rather than do a mul and an adds we just set 11200 // the carry flag iff t0 is nonzero. 
11201 // 11202 // mul(Rlo_mn, Rm, Rn); 11203 // adds(zr, t0, Rlo_mn); 11204 subs(zr, t0, 1); // Set carry iff t0 is nonzero 11205 adcs(t0, t1, Rhi_mn); 11206 adc(t1, t2, zr); 11207 mov(t2, zr); 11208 } 11209 11210 void acc(Register Rhi, Register Rlo, 11211 Register t0, Register t1, Register t2) { 11212 adds(t0, t0, Rlo); 11213 adcs(t1, t1, Rhi); 11214 adc(t2, t2, zr); 11215 } 11216 11217 public: 11218 /** 11219 * Fast Montgomery multiplication. The derivation of the 11220 * algorithm is in A Cryptographic Library for the Motorola 11221 * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237. 11222 * 11223 * Arguments: 11224 * 11225 * Inputs for multiplication: 11226 * c_rarg0 - int array elements a 11227 * c_rarg1 - int array elements b 11228 * c_rarg2 - int array elements n (the modulus) 11229 * c_rarg3 - int length 11230 * c_rarg4 - int inv 11231 * c_rarg5 - int array elements m (the result) 11232 * 11233 * Inputs for squaring: 11234 * c_rarg0 - int array elements a 11235 * c_rarg1 - int array elements n (the modulus) 11236 * c_rarg2 - int length 11237 * c_rarg3 - int inv 11238 * c_rarg4 - int array elements m (the result) 11239 * 11240 */ 11241 address generate_multiply() { 11242 Label argh, nothing; 11243 bind(argh); 11244 stop("MontgomeryMultiply total_allocation must be <= 8192"); 11245 11246 align(CodeEntryAlignment); 11247 address entry = pc(); 11248 11249 cbzw(Rlen, nothing); 11250 11251 enter(); 11252 11253 // Make room. 11254 cmpw(Rlen, 512); 11255 br(Assembler::HI, argh); 11256 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 11257 andr(sp, Ra, -2 * wordSize); 11258 11259 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 11260 11261 { 11262 // Copy input args, reversing as we go. We use Ra as a 11263 // temporary variable. 11264 reverse(Ra, Pa_base, Rlen, t0, t1); 11265 if (!_squaring) 11266 reverse(Ra, Pb_base, Rlen, t0, t1); 11267 reverse(Ra, Pn_base, Rlen, t0, t1); 11268 } 11269 11270 // Push all call-saved registers and also Pm_base which we'll need 11271 // at the end. 
11272 save_regs(); 11273 11274 #ifndef PRODUCT 11275 // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply"); 11276 { 11277 ldr(Rn, Address(Pn_base, 0)); 11278 mul(Rlo_mn, Rn, inv); 11279 subs(zr, Rlo_mn, -1); 11280 Label ok; 11281 br(EQ, ok); { 11282 stop("broken inverse in Montgomery multiply"); 11283 } bind(ok); 11284 } 11285 #endif 11286 11287 mov(Pm_base, Ra); 11288 11289 mov(t0, zr); 11290 mov(t1, zr); 11291 mov(t2, zr); 11292 11293 block_comment("for (int i = 0; i < len; i++) {"); 11294 mov(Ri, zr); { 11295 Label loop, end; 11296 cmpw(Ri, Rlen); 11297 br(Assembler::GE, end); 11298 11299 bind(loop); 11300 pre1(Ri); 11301 11302 block_comment(" for (j = i; j; j--) {"); { 11303 movw(Rj, Ri); 11304 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 11305 } block_comment(" } // j"); 11306 11307 post1(); 11308 addw(Ri, Ri, 1); 11309 cmpw(Ri, Rlen); 11310 br(Assembler::LT, loop); 11311 bind(end); 11312 block_comment("} // i"); 11313 } 11314 11315 block_comment("for (int i = len; i < 2*len; i++) {"); 11316 mov(Ri, Rlen); { 11317 Label loop, end; 11318 cmpw(Ri, Rlen, Assembler::LSL, 1); 11319 br(Assembler::GE, end); 11320 11321 bind(loop); 11322 pre2(Ri, Rlen); 11323 11324 block_comment(" for (j = len*2-i-1; j; j--) {"); { 11325 lslw(Rj, Rlen, 1); 11326 subw(Rj, Rj, Ri); 11327 subw(Rj, Rj, 1); 11328 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 11329 } block_comment(" } // j"); 11330 11331 post2(Ri, Rlen); 11332 addw(Ri, Ri, 1); 11333 cmpw(Ri, Rlen, Assembler::LSL, 1); 11334 br(Assembler::LT, loop); 11335 bind(end); 11336 } 11337 block_comment("} // i"); 11338 11339 normalize(Rlen); 11340 11341 mov(Ra, Pm_base); // Save Pm_base in Ra 11342 restore_regs(); // Restore caller's Pm_base 11343 11344 // Copy our result into caller's Pm_base 11345 reverse(Pm_base, Ra, Rlen, t0, t1); 11346 11347 leave(); 11348 bind(nothing); 11349 ret(lr); 11350 11351 return entry; 11352 } 11353 // In C, approximately: 11354 11355 // void 11356 // montgomery_multiply(julong Pa_base[], julong Pb_base[], 11357 // julong Pn_base[], julong Pm_base[], 11358 // julong inv, int len) { 11359 // julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 11360 // julong *Pa, *Pb, *Pn, *Pm; 11361 // julong Ra, Rb, Rn, Rm; 11362 11363 // int i; 11364 11365 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply"); 11366 11367 // for (i = 0; i < len; i++) { 11368 // int j; 11369 11370 // Pa = Pa_base; 11371 // Pb = Pb_base + i; 11372 // Pm = Pm_base; 11373 // Pn = Pn_base + i; 11374 11375 // Ra = *Pa; 11376 // Rb = *Pb; 11377 // Rm = *Pm; 11378 // Rn = *Pn; 11379 11380 // int iters = i; 11381 // for (j = 0; iters--; j++) { 11382 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 11383 // MACC(Ra, Rb, t0, t1, t2); 11384 // Ra = *++Pa; 11385 // Rb = *--Pb; 11386 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 11387 // MACC(Rm, Rn, t0, t1, t2); 11388 // Rm = *++Pm; 11389 // Rn = *--Pn; 11390 // } 11391 11392 // assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be"); 11393 // MACC(Ra, Rb, t0, t1, t2); 11394 // *Pm = Rm = t0 * inv; 11395 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be"); 11396 // MACC(Rm, Rn, t0, t1, t2); 11397 11398 // assert(t0 == 0, "broken Montgomery multiply"); 11399 11400 // t0 = t1; t1 = t2; t2 = 0; 11401 // } 11402 11403 // for (i = len; i < 2*len; i++) { 11404 // int j; 11405 11406 // Pa = Pa_base + i-len; 11407 // Pb = Pb_base + len; 11408 // Pm = Pm_base + i-len; 11409 // Pn = Pn_base + len; 11410 11411 // Ra = *++Pa; 11412 // Rb = 
*--Pb; 11413 // Rm = *++Pm; 11414 // Rn = *--Pn; 11415 11416 // int iters = len*2-i-1; 11417 // for (j = i-len+1; iters--; j++) { 11418 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 11419 // MACC(Ra, Rb, t0, t1, t2); 11420 // Ra = *++Pa; 11421 // Rb = *--Pb; 11422 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 11423 // MACC(Rm, Rn, t0, t1, t2); 11424 // Rm = *++Pm; 11425 // Rn = *--Pn; 11426 // } 11427 11428 // Pm_base[i-len] = t0; 11429 // t0 = t1; t1 = t2; t2 = 0; 11430 // } 11431 11432 // while (t0) 11433 // t0 = sub(Pm_base, Pn_base, t0, len); 11434 // } 11435 11436 /** 11437 * Fast Montgomery squaring. This uses asymptotically 25% fewer 11438 * multiplies than Montgomery multiplication so it should be up to 11439 * 25% faster. However, its loop control is more complex and it 11440 * may actually run slower on some machines. 11441 * 11442 * Arguments: 11443 * 11444 * Inputs: 11445 * c_rarg0 - int array elements a 11446 * c_rarg1 - int array elements n (the modulus) 11447 * c_rarg2 - int length 11448 * c_rarg3 - int inv 11449 * c_rarg4 - int array elements m (the result) 11450 * 11451 */ 11452 address generate_square() { 11453 Label argh; 11454 bind(argh); 11455 stop("MontgomeryMultiply total_allocation must be <= 8192"); 11456 11457 align(CodeEntryAlignment); 11458 address entry = pc(); 11459 11460 enter(); 11461 11462 // Make room. 11463 cmpw(Rlen, 512); 11464 br(Assembler::HI, argh); 11465 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 11466 andr(sp, Ra, -2 * wordSize); 11467 11468 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 11469 11470 { 11471 // Copy input args, reversing as we go. We use Ra as a 11472 // temporary variable. 11473 reverse(Ra, Pa_base, Rlen, t0, t1); 11474 reverse(Ra, Pn_base, Rlen, t0, t1); 11475 } 11476 11477 // Push all call-saved registers and also Pm_base which we'll need 11478 // at the end. 
11479 save_regs(); 11480 11481 mov(Pm_base, Ra); 11482 11483 mov(t0, zr); 11484 mov(t1, zr); 11485 mov(t2, zr); 11486 11487 block_comment("for (int i = 0; i < len; i++) {"); 11488 mov(Ri, zr); { 11489 Label loop, end; 11490 bind(loop); 11491 cmp(Ri, Rlen); 11492 br(Assembler::GE, end); 11493 11494 pre1(Ri); 11495 11496 block_comment("for (j = (i+1)/2; j; j--) {"); { 11497 add(Rj, Ri, 1); 11498 lsr(Rj, Rj, 1); 11499 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 11500 } block_comment(" } // j"); 11501 11502 last_squaring(Ri); 11503 11504 block_comment(" for (j = i/2; j; j--) {"); { 11505 lsr(Rj, Ri, 1); 11506 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 11507 } block_comment(" } // j"); 11508 11509 post1_squaring(); 11510 add(Ri, Ri, 1); 11511 cmp(Ri, Rlen); 11512 br(Assembler::LT, loop); 11513 11514 bind(end); 11515 block_comment("} // i"); 11516 } 11517 11518 block_comment("for (int i = len; i < 2*len; i++) {"); 11519 mov(Ri, Rlen); { 11520 Label loop, end; 11521 bind(loop); 11522 cmp(Ri, Rlen, Assembler::LSL, 1); 11523 br(Assembler::GE, end); 11524 11525 pre2(Ri, Rlen); 11526 11527 block_comment(" for (j = (2*len-i-1)/2; j; j--) {"); { 11528 lsl(Rj, Rlen, 1); 11529 sub(Rj, Rj, Ri); 11530 sub(Rj, Rj, 1); 11531 lsr(Rj, Rj, 1); 11532 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 11533 } block_comment(" } // j"); 11534 11535 last_squaring(Ri); 11536 11537 block_comment(" for (j = (2*len-i)/2; j; j--) {"); { 11538 lsl(Rj, Rlen, 1); 11539 sub(Rj, Rj, Ri); 11540 lsr(Rj, Rj, 1); 11541 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 11542 } block_comment(" } // j"); 11543 11544 post2(Ri, Rlen); 11545 add(Ri, Ri, 1); 11546 cmp(Ri, Rlen, Assembler::LSL, 1); 11547 11548 br(Assembler::LT, loop); 11549 bind(end); 11550 block_comment("} // i"); 11551 } 11552 11553 normalize(Rlen); 11554 11555 mov(Ra, Pm_base); // Save Pm_base in Ra 11556 restore_regs(); // Restore caller's Pm_base 11557 11558 // Copy our result into caller's Pm_base 11559 reverse(Pm_base, Ra, Rlen, t0, t1); 11560 11561 leave(); 11562 ret(lr); 11563 11564 return entry; 11565 } 11566 // In C, approximately: 11567 11568 // void 11569 // montgomery_square(julong Pa_base[], julong Pn_base[], 11570 // julong Pm_base[], julong inv, int len) { 11571 // julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 11572 // julong *Pa, *Pb, *Pn, *Pm; 11573 // julong Ra, Rb, Rn, Rm; 11574 11575 // int i; 11576 11577 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply"); 11578 11579 // for (i = 0; i < len; i++) { 11580 // int j; 11581 11582 // Pa = Pa_base; 11583 // Pb = Pa_base + i; 11584 // Pm = Pm_base; 11585 // Pn = Pn_base + i; 11586 11587 // Ra = *Pa; 11588 // Rb = *Pb; 11589 // Rm = *Pm; 11590 // Rn = *Pn; 11591 11592 // int iters = (i+1)/2; 11593 // for (j = 0; iters--; j++) { 11594 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be"); 11595 // MACC2(Ra, Rb, t0, t1, t2); 11596 // Ra = *++Pa; 11597 // Rb = *--Pb; 11598 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 11599 // MACC(Rm, Rn, t0, t1, t2); 11600 // Rm = *++Pm; 11601 // Rn = *--Pn; 11602 // } 11603 // if ((i & 1) == 0) { 11604 // assert(Ra == Pa_base[j], "must be"); 11605 // MACC(Ra, Ra, t0, t1, t2); 11606 // } 11607 // iters = i/2; 11608 // assert(iters == i-j, "must be"); 11609 // for (; iters--; j++) { 11610 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 11611 // MACC(Rm, Rn, t0, t1, t2); 11612 // Rm = *++Pm; 11613 // Rn = *--Pn; 11614 // } 11615 11616 // 
*Pm = Rm = t0 * inv; 11617 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be"); 11618 // MACC(Rm, Rn, t0, t1, t2); 11619 11620 // assert(t0 == 0, "broken Montgomery multiply"); 11621 11622 // t0 = t1; t1 = t2; t2 = 0; 11623 // } 11624 11625 // for (i = len; i < 2*len; i++) { 11626 // int start = i-len+1; 11627 // int end = start + (len - start)/2; 11628 // int j; 11629 11630 // Pa = Pa_base + i-len; 11631 // Pb = Pa_base + len; 11632 // Pm = Pm_base + i-len; 11633 // Pn = Pn_base + len; 11634 11635 // Ra = *++Pa; 11636 // Rb = *--Pb; 11637 // Rm = *++Pm; 11638 // Rn = *--Pn; 11639 11640 // int iters = (2*len-i-1)/2; 11641 // assert(iters == end-start, "must be"); 11642 // for (j = start; iters--; j++) { 11643 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be"); 11644 // MACC2(Ra, Rb, t0, t1, t2); 11645 // Ra = *++Pa; 11646 // Rb = *--Pb; 11647 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 11648 // MACC(Rm, Rn, t0, t1, t2); 11649 // Rm = *++Pm; 11650 // Rn = *--Pn; 11651 // } 11652 // if ((i & 1) == 0) { 11653 // assert(Ra == Pa_base[j], "must be"); 11654 // MACC(Ra, Ra, t0, t1, t2); 11655 // } 11656 // iters = (2*len-i)/2; 11657 // assert(iters == len-j, "must be"); 11658 // for (; iters--; j++) { 11659 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 11660 // MACC(Rm, Rn, t0, t1, t2); 11661 // Rm = *++Pm; 11662 // Rn = *--Pn; 11663 // } 11664 // Pm_base[i-len] = t0; 11665 // t0 = t1; t1 = t2; t2 = 0; 11666 // } 11667 11668 // while (t0) 11669 // t0 = sub(Pm_base, Pn_base, t0, len); 11670 // } 11671 }; 11672 11673 // Initialization 11674 void generate_preuniverse_stubs() { 11675 // preuniverse stubs are not needed for aarch64 11676 } 11677 11678 void generate_initial_stubs() { 11679 // Generate initial stubs and initialize the entry points 11680 11681 // entry points that exist in all platforms. Note: This is code 11682 // that could be shared among different platforms - however the 11683 // benefit seems to be smaller than the disadvantage of having a 11684 // much more complicated generator structure. See also comment in 11685 // stubRoutines.hpp. 11686 11687 StubRoutines::_forward_exception_entry = generate_forward_exception(); 11688 11689 StubRoutines::_call_stub_entry = 11690 generate_call_stub(StubRoutines::_call_stub_return_address); 11691 11692 // is referenced by megamorphic call 11693 StubRoutines::_catch_exception_entry = generate_catch_exception(); 11694 11695 // Initialize table for copy memory (arraycopy) check. 
11696 if (UnsafeMemoryAccess::_table == nullptr) { 11697 UnsafeMemoryAccess::create_table(8 + 4); // 8 for copyMemory; 4 for setMemory 11698 } 11699 11700 if (UseCRC32Intrinsics) { 11701 StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32(); 11702 } 11703 11704 if (UseCRC32CIntrinsics) { 11705 StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C(); 11706 } 11707 11708 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) { 11709 StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false); 11710 } 11711 11712 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) { 11713 StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true); 11714 } 11715 11716 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_float16ToFloat) && 11717 vmIntrinsics::is_intrinsic_available(vmIntrinsics::_floatToFloat16)) { 11718 StubRoutines::_hf2f = generate_float16ToFloat(); 11719 StubRoutines::_f2hf = generate_floatToFloat16(); 11720 } 11721 } 11722 11723 void generate_continuation_stubs() { 11724 // Continuation stubs: 11725 StubRoutines::_cont_thaw = generate_cont_thaw(); 11726 StubRoutines::_cont_returnBarrier = generate_cont_returnBarrier(); 11727 StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception(); 11728 StubRoutines::_cont_preempt_stub = generate_cont_preempt_stub(); 11729 } 11730 11731 void generate_final_stubs() { 11732 // support for verify_oop (must happen after universe_init) 11733 if (VerifyOops) { 11734 StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop(); 11735 } 11736 11737 // arraycopy stubs used by compilers 11738 generate_arraycopy_stubs(); 11739 11740 StubRoutines::_method_entry_barrier = generate_method_entry_barrier(); 11741 11742 StubRoutines::aarch64::_spin_wait = generate_spin_wait(); 11743 11744 StubRoutines::_upcall_stub_exception_handler = generate_upcall_stub_exception_handler(); 11745 StubRoutines::_upcall_stub_load_target = generate_upcall_stub_load_target(); 11746 11747 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS) 11748 11749 generate_atomic_entry_points(); 11750 11751 #endif // LINUX 11752 11753 #ifdef COMPILER2 11754 if (UseSecondarySupersTable) { 11755 StubRoutines::_lookup_secondary_supers_table_slow_path_stub = generate_lookup_secondary_supers_table_slow_path_stub(); 11756 if (! InlineSecondarySupersTest) { 11757 generate_lookup_secondary_supers_table_stub(); 11758 } 11759 } 11760 #endif 11761 11762 StubRoutines::_unsafe_setmemory = generate_unsafe_setmemory(); 11763 11764 StubRoutines::aarch64::set_completed(); // Indicate that arraycopy and zero_blocks stubs are generated 11765 } 11766 11767 void generate_compiler_stubs() { 11768 #if COMPILER2_OR_JVMCI 11769 11770 if (UseSVE == 0) { 11771 StubRoutines::aarch64::_vector_iota_indices = generate_iota_indices(StubId::stubgen_vector_iota_indices_id); 11772 } 11773 11774 // array equals stub for large arrays. 11775 if (!UseSimpleArrayEquals) { 11776 StubRoutines::aarch64::_large_array_equals = generate_large_array_equals(); 11777 } 11778 11779 // arrays_hashcode stub for large arrays. 
11780 StubRoutines::aarch64::_large_arrays_hashcode_boolean = generate_large_arrays_hashcode(T_BOOLEAN); 11781 StubRoutines::aarch64::_large_arrays_hashcode_byte = generate_large_arrays_hashcode(T_BYTE); 11782 StubRoutines::aarch64::_large_arrays_hashcode_char = generate_large_arrays_hashcode(T_CHAR); 11783 StubRoutines::aarch64::_large_arrays_hashcode_int = generate_large_arrays_hashcode(T_INT); 11784 StubRoutines::aarch64::_large_arrays_hashcode_short = generate_large_arrays_hashcode(T_SHORT); 11785 11786 // byte_array_inflate stub for large arrays. 11787 StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate(); 11788 11789 // countPositives stub for large arrays. 11790 StubRoutines::aarch64::_count_positives = generate_count_positives(StubRoutines::aarch64::_count_positives_long); 11791 11792 generate_compare_long_strings(); 11793 11794 generate_string_indexof_stubs(); 11795 11796 #ifdef COMPILER2 11797 if (UseMultiplyToLenIntrinsic) { 11798 StubRoutines::_multiplyToLen = generate_multiplyToLen(); 11799 } 11800 11801 if (UseSquareToLenIntrinsic) { 11802 StubRoutines::_squareToLen = generate_squareToLen(); 11803 } 11804 11805 if (UseMulAddIntrinsic) { 11806 StubRoutines::_mulAdd = generate_mulAdd(); 11807 } 11808 11809 if (UseSIMDForBigIntegerShiftIntrinsics) { 11810 StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift(); 11811 StubRoutines::_bigIntegerLeftShiftWorker = generate_bigIntegerLeftShift(); 11812 } 11813 11814 if (UseMontgomeryMultiplyIntrinsic) { 11815 StubId stub_id = StubId::stubgen_montgomeryMultiply_id; 11816 StubCodeMark mark(this, stub_id); 11817 MontgomeryMultiplyGenerator g(_masm, /*squaring*/false); 11818 StubRoutines::_montgomeryMultiply = g.generate_multiply(); 11819 } 11820 11821 if (UseMontgomerySquareIntrinsic) { 11822 StubId stub_id = StubId::stubgen_montgomerySquare_id; 11823 StubCodeMark mark(this, stub_id); 11824 MontgomeryMultiplyGenerator g(_masm, /*squaring*/true); 11825 // We use generate_multiply() rather than generate_square() 11826 // because it's faster for the sizes of modulus we care about. 
11827 StubRoutines::_montgomerySquare = g.generate_multiply(); 11828 } 11829 11830 #endif // COMPILER2 11831 11832 if (UseChaCha20Intrinsics) { 11833 StubRoutines::_chacha20Block = generate_chacha20Block_blockpar(); 11834 } 11835 11836 if (UseKyberIntrinsics) { 11837 StubRoutines::_kyberNtt = generate_kyberNtt(); 11838 StubRoutines::_kyberInverseNtt = generate_kyberInverseNtt(); 11839 StubRoutines::_kyberNttMult = generate_kyberNttMult(); 11840 StubRoutines::_kyberAddPoly_2 = generate_kyberAddPoly_2(); 11841 StubRoutines::_kyberAddPoly_3 = generate_kyberAddPoly_3(); 11842 StubRoutines::_kyber12To16 = generate_kyber12To16(); 11843 StubRoutines::_kyberBarrettReduce = generate_kyberBarrettReduce(); 11844 } 11845 11846 if (UseDilithiumIntrinsics) { 11847 StubRoutines::_dilithiumAlmostNtt = generate_dilithiumAlmostNtt(); 11848 StubRoutines::_dilithiumAlmostInverseNtt = generate_dilithiumAlmostInverseNtt(); 11849 StubRoutines::_dilithiumNttMult = generate_dilithiumNttMult(); 11850 StubRoutines::_dilithiumMontMulByConstant = generate_dilithiumMontMulByConstant(); 11851 StubRoutines::_dilithiumDecomposePoly = generate_dilithiumDecomposePoly(); 11852 } 11853 11854 if (UseBASE64Intrinsics) { 11855 StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock(); 11856 StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock(); 11857 } 11858 11859 // data cache line writeback 11860 StubRoutines::_data_cache_writeback = generate_data_cache_writeback(); 11861 StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync(); 11862 11863 if (UseAESIntrinsics) { 11864 StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock(); 11865 StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock(); 11866 StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt(); 11867 StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt(); 11868 StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt(); 11869 } 11870 if (UseGHASHIntrinsics) { 11871 // StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks(); 11872 StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks_wide(); 11873 } 11874 if (UseAESIntrinsics && UseGHASHIntrinsics) { 11875 StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt(); 11876 } 11877 11878 if (UseMD5Intrinsics) { 11879 StubRoutines::_md5_implCompress = generate_md5_implCompress(StubId::stubgen_md5_implCompress_id); 11880 StubRoutines::_md5_implCompressMB = generate_md5_implCompress(StubId::stubgen_md5_implCompressMB_id); 11881 } 11882 if (UseSHA1Intrinsics) { 11883 StubRoutines::_sha1_implCompress = generate_sha1_implCompress(StubId::stubgen_sha1_implCompress_id); 11884 StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(StubId::stubgen_sha1_implCompressMB_id); 11885 } 11886 if (UseSHA256Intrinsics) { 11887 StubRoutines::_sha256_implCompress = generate_sha256_implCompress(StubId::stubgen_sha256_implCompress_id); 11888 StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(StubId::stubgen_sha256_implCompressMB_id); 11889 } 11890 if (UseSHA512Intrinsics) { 11891 StubRoutines::_sha512_implCompress = generate_sha512_implCompress(StubId::stubgen_sha512_implCompress_id); 11892 StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(StubId::stubgen_sha512_implCompressMB_id); 11893 } 11894 if (UseSHA3Intrinsics) { 11895 11896 StubRoutines::_double_keccak = generate_double_keccak(); 
11897 if (UseSIMDForSHA3Intrinsic) { 11898 StubRoutines::_sha3_implCompress = generate_sha3_implCompress(StubId::stubgen_sha3_implCompress_id); 11899 StubRoutines::_sha3_implCompressMB = generate_sha3_implCompress(StubId::stubgen_sha3_implCompressMB_id); 11900 } else { 11901 StubRoutines::_sha3_implCompress = generate_sha3_implCompress_gpr(StubId::stubgen_sha3_implCompress_id); 11902 StubRoutines::_sha3_implCompressMB = generate_sha3_implCompress_gpr(StubId::stubgen_sha3_implCompressMB_id); 11903 } 11904 } 11905 11906 if (UsePoly1305Intrinsics) { 11907 StubRoutines::_poly1305_processBlocks = generate_poly1305_processBlocks(); 11908 } 11909 11910 // generate Adler32 intrinsics code 11911 if (UseAdler32Intrinsics) { 11912 StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32(); 11913 } 11914 11915 #endif // COMPILER2_OR_JVMCI 11916 } 11917 11918 public: 11919 StubGenerator(CodeBuffer* code, BlobId blob_id) : StubCodeGenerator(code, blob_id) { 11920 switch(blob_id) { 11921 case BlobId::stubgen_preuniverse_id: 11922 generate_preuniverse_stubs(); 11923 break; 11924 case BlobId::stubgen_initial_id: 11925 generate_initial_stubs(); 11926 break; 11927 case BlobId::stubgen_continuation_id: 11928 generate_continuation_stubs(); 11929 break; 11930 case BlobId::stubgen_compiler_id: 11931 generate_compiler_stubs(); 11932 break; 11933 case BlobId::stubgen_final_id: 11934 generate_final_stubs(); 11935 break; 11936 default: 11937 fatal("unexpected blob id: %s", StubInfo::name(blob_id)); 11938 break; 11939 }; 11940 } 11941 }; // end class declaration 11942 11943 void StubGenerator_generate(CodeBuffer* code, BlobId blob_id) { 11944 StubGenerator g(code, blob_id); 11945 } 11946 11947 11948 #if defined (LINUX) 11949 11950 // Define pointers to atomic stubs and initialize them to point to the 11951 // code in atomic_aarch64.S. 11952 11953 #define DEFAULT_ATOMIC_OP(OPNAME, SIZE, RELAXED) \ 11954 extern "C" uint64_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl \ 11955 (volatile void *ptr, uint64_t arg1, uint64_t arg2); \ 11956 aarch64_atomic_stub_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _impl \ 11957 = aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl; 11958 11959 DEFAULT_ATOMIC_OP(fetch_add, 4, ) 11960 DEFAULT_ATOMIC_OP(fetch_add, 8, ) 11961 DEFAULT_ATOMIC_OP(fetch_add, 4, _relaxed) 11962 DEFAULT_ATOMIC_OP(fetch_add, 8, _relaxed) 11963 DEFAULT_ATOMIC_OP(xchg, 4, ) 11964 DEFAULT_ATOMIC_OP(xchg, 8, ) 11965 DEFAULT_ATOMIC_OP(cmpxchg, 1, ) 11966 DEFAULT_ATOMIC_OP(cmpxchg, 4, ) 11967 DEFAULT_ATOMIC_OP(cmpxchg, 8, ) 11968 DEFAULT_ATOMIC_OP(cmpxchg, 1, _relaxed) 11969 DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed) 11970 DEFAULT_ATOMIC_OP(cmpxchg, 8, _relaxed) 11971 DEFAULT_ATOMIC_OP(cmpxchg, 4, _release) 11972 DEFAULT_ATOMIC_OP(cmpxchg, 8, _release) 11973 DEFAULT_ATOMIC_OP(cmpxchg, 4, _seq_cst) 11974 DEFAULT_ATOMIC_OP(cmpxchg, 8, _seq_cst) 11975 11976 #undef DEFAULT_ATOMIC_OP 11977 11978 #endif // LINUX
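// For illustration only: with RELAXED left empty, the first use above,
// DEFAULT_ATOMIC_OP(fetch_add, 4, ), expands to approximately
//
//   extern "C" uint64_t aarch64_atomic_fetch_add_4_default_impl
//     (volatile void *ptr, uint64_t arg1, uint64_t arg2);
//   aarch64_atomic_stub_t aarch64_atomic_fetch_add_4_impl
//     = aarch64_atomic_fetch_add_4_default_impl;
//
// i.e. each stub pointer initially targets the generic implementation in
// atomic_aarch64.S and can later be repointed at code emitted by
// generate_atomic_entry_points().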