1 /* 2 * Copyright (c) 2003, 2025, Oracle and/or its affiliates. All rights reserved. 3 * Copyright (c) 2014, 2025, Red Hat Inc. All rights reserved. 4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 5 * 6 * This code is free software; you can redistribute it and/or modify it 7 * under the terms of the GNU General Public License version 2 only, as 8 * published by the Free Software Foundation. 9 * 10 * This code is distributed in the hope that it will be useful, but WITHOUT 11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 13 * version 2 for more details (a copy is included in the LICENSE file that 14 * accompanied this code). 15 * 16 * You should have received a copy of the GNU General Public License version 17 * 2 along with this work; if not, write to the Free Software Foundation, 18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 19 * 20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 21 * or visit www.oracle.com if you need additional information or have any 22 * questions. 23 * 24 */ 25 26 #include "asm/macroAssembler.hpp" 27 #include "asm/macroAssembler.inline.hpp" 28 #include "asm/register.hpp" 29 #include "atomic_aarch64.hpp" 30 #include "compiler/oopMap.hpp" 31 #include "gc/shared/barrierSet.hpp" 32 #include "gc/shared/barrierSetAssembler.hpp" 33 #include "gc/shared/gc_globals.hpp" 34 #include "gc/shared/tlab_globals.hpp" 35 #include "interpreter/interpreter.hpp" 36 #include "memory/universe.hpp" 37 #include "nativeInst_aarch64.hpp" 38 #include "oops/instanceOop.hpp" 39 #include "oops/method.hpp" 40 #include "oops/objArrayKlass.hpp" 41 #include "oops/oop.inline.hpp" 42 #include "prims/methodHandles.hpp" 43 #include "prims/upcallLinker.hpp" 44 #include "runtime/arguments.hpp" 45 #include "runtime/atomic.hpp" 46 #include "runtime/continuation.hpp" 47 #include "runtime/continuationEntry.inline.hpp" 48 #include "runtime/frame.inline.hpp" 49 #include "runtime/handles.inline.hpp" 50 #include "runtime/javaThread.hpp" 51 #include "runtime/sharedRuntime.hpp" 52 #include "runtime/stubCodeGenerator.hpp" 53 #include "runtime/stubRoutines.hpp" 54 #include "utilities/align.hpp" 55 #include "utilities/checkedCast.hpp" 56 #include "utilities/debug.hpp" 57 #include "utilities/globalDefinitions.hpp" 58 #include "utilities/intpow.hpp" 59 #include "utilities/powerOfTwo.hpp" 60 #ifdef COMPILER2 61 #include "opto/runtime.hpp" 62 #endif 63 #if INCLUDE_ZGC 64 #include "gc/z/zThreadLocalData.hpp" 65 #endif 66 67 // Declaration and definition of StubGenerator (no .hpp file). 
68 // For a more detailed description of the stub routine structure 69 // see the comment in stubRoutines.hpp 70 71 #undef __ 72 #define __ _masm-> 73 74 #ifdef PRODUCT 75 #define BLOCK_COMMENT(str) /* nothing */ 76 #else 77 #define BLOCK_COMMENT(str) __ block_comment(str) 78 #endif 79 80 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":") 81 82 // Stub Code definitions 83 84 class StubGenerator: public StubCodeGenerator { 85 private: 86 87 #ifdef PRODUCT 88 #define inc_counter_np(counter) ((void)0) 89 #else 90 void inc_counter_np_(uint& counter) { 91 __ incrementw(ExternalAddress((address)&counter)); 92 } 93 #define inc_counter_np(counter) \ 94 BLOCK_COMMENT("inc_counter " #counter); \ 95 inc_counter_np_(counter); 96 #endif 97 98 // Call stubs are used to call Java from C 99 // 100 // Arguments: 101 // c_rarg0: call wrapper address address 102 // c_rarg1: result address 103 // c_rarg2: result type BasicType 104 // c_rarg3: method Method* 105 // c_rarg4: (interpreter) entry point address 106 // c_rarg5: parameters intptr_t* 107 // c_rarg6: parameter size (in words) int 108 // c_rarg7: thread Thread* 109 // 110 // There is no return from the stub itself as any Java result 111 // is written to result 112 // 113 // we save r30 (lr) as the return PC at the base of the frame and 114 // link r29 (fp) below it as the frame pointer installing sp (r31) 115 // into fp. 116 // 117 // we save r0-r7, which accounts for all the c arguments. 118 // 119 // TODO: strictly do we need to save them all? they are treated as 120 // volatile by C so could we omit saving the ones we are going to 121 // place in global registers (thread? method?) or those we only use 122 // during setup of the Java call? 123 // 124 // we don't need to save r8 which C uses as an indirect result location 125 // return register. 126 // 127 // we don't need to save r9-r15 which both C and Java treat as 128 // volatile 129 // 130 // we don't need to save r16-18 because Java does not use them 131 // 132 // we save r19-r28 which Java uses as scratch registers and C 133 // expects to be callee-save 134 // 135 // we save the bottom 64 bits of each value stored in v8-v15; it is 136 // the responsibility of the caller to preserve larger values. 137 // 138 // so the stub frame looks like this when we enter Java code 139 // 140 // [ return_from_Java ] <--- sp 141 // [ argument word n ] 142 // ... 
143 // -29 [ argument word 1 ] 144 // -28 [ saved Floating-point Control Register ] 145 // -26 [ saved v15 ] <--- sp_after_call 146 // -25 [ saved v14 ] 147 // -24 [ saved v13 ] 148 // -23 [ saved v12 ] 149 // -22 [ saved v11 ] 150 // -21 [ saved v10 ] 151 // -20 [ saved v9 ] 152 // -19 [ saved v8 ] 153 // -18 [ saved r28 ] 154 // -17 [ saved r27 ] 155 // -16 [ saved r26 ] 156 // -15 [ saved r25 ] 157 // -14 [ saved r24 ] 158 // -13 [ saved r23 ] 159 // -12 [ saved r22 ] 160 // -11 [ saved r21 ] 161 // -10 [ saved r20 ] 162 // -9 [ saved r19 ] 163 // -8 [ call wrapper (r0) ] 164 // -7 [ result (r1) ] 165 // -6 [ result type (r2) ] 166 // -5 [ method (r3) ] 167 // -4 [ entry point (r4) ] 168 // -3 [ parameters (r5) ] 169 // -2 [ parameter size (r6) ] 170 // -1 [ thread (r7) ] 171 // 0 [ saved fp (r29) ] <--- fp == saved sp (r31) 172 // 1 [ saved lr (r30) ] 173 174 // Call stub stack layout word offsets from fp 175 enum call_stub_layout { 176 sp_after_call_off = -28, 177 178 fpcr_off = sp_after_call_off, 179 d15_off = -26, 180 d13_off = -24, 181 d11_off = -22, 182 d9_off = -20, 183 184 r28_off = -18, 185 r26_off = -16, 186 r24_off = -14, 187 r22_off = -12, 188 r20_off = -10, 189 call_wrapper_off = -8, 190 result_off = -7, 191 result_type_off = -6, 192 method_off = -5, 193 entry_point_off = -4, 194 parameter_size_off = -2, 195 thread_off = -1, 196 fp_f = 0, 197 retaddr_off = 1, 198 }; 199 200 address generate_call_stub(address& return_address) { 201 assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 && 202 (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off, 203 "adjust this code"); 204 205 StubId stub_id = StubId::stubgen_call_stub_id; 206 StubCodeMark mark(this, stub_id); 207 address start = __ pc(); 208 209 const Address sp_after_call (rfp, sp_after_call_off * wordSize); 210 211 const Address fpcr_save (rfp, fpcr_off * wordSize); 212 const Address call_wrapper (rfp, call_wrapper_off * wordSize); 213 const Address result (rfp, result_off * wordSize); 214 const Address result_type (rfp, result_type_off * wordSize); 215 const Address method (rfp, method_off * wordSize); 216 const Address entry_point (rfp, entry_point_off * wordSize); 217 const Address parameter_size(rfp, parameter_size_off * wordSize); 218 219 const Address thread (rfp, thread_off * wordSize); 220 221 const Address d15_save (rfp, d15_off * wordSize); 222 const Address d13_save (rfp, d13_off * wordSize); 223 const Address d11_save (rfp, d11_off * wordSize); 224 const Address d9_save (rfp, d9_off * wordSize); 225 226 const Address r28_save (rfp, r28_off * wordSize); 227 const Address r26_save (rfp, r26_off * wordSize); 228 const Address r24_save (rfp, r24_off * wordSize); 229 const Address r22_save (rfp, r22_off * wordSize); 230 const Address r20_save (rfp, r20_off * wordSize); 231 232 // stub code 233 234 address aarch64_entry = __ pc(); 235 236 // set up frame and move sp to end of save area 237 __ enter(); 238 __ sub(sp, rfp, -sp_after_call_off * wordSize); 239 240 // save register parameters and Java scratch/global registers 241 // n.b. 
// we save thread even though it gets installed in
    // rthread because we want to sanity check rthread later
    __ str(c_rarg7,  thread);
    __ strw(c_rarg6, parameter_size);
    __ stp(c_rarg4, c_rarg5,  entry_point);
    __ stp(c_rarg2, c_rarg3,  result_type);
    __ stp(c_rarg0, c_rarg1,  call_wrapper);

    __ stp(r20, r19,   r20_save);
    __ stp(r22, r21,   r22_save);
    __ stp(r24, r23,   r24_save);
    __ stp(r26, r25,   r26_save);
    __ stp(r28, r27,   r28_save);

    __ stpd(v9,  v8,   d9_save);
    __ stpd(v11, v10,  d11_save);
    __ stpd(v13, v12,  d13_save);
    __ stpd(v15, v14,  d15_save);

    __ get_fpcr(rscratch1);
    __ str(rscratch1, fpcr_save);
    // Set FPCR to the state we need. We do want Round to Nearest. We
    // don't want non-IEEE rounding modes or floating-point traps.
    __ bfi(rscratch1, zr, 22, 4); // Clear DN, FZ, and Rmode
    __ bfi(rscratch1, zr, 8, 5);  // Clear exception-control bits (8-12)
    __ set_fpcr(rscratch1);

    // install Java thread in global register now we have saved
    // whatever value it held
    __ mov(rthread, c_rarg7);
    // And method
    __ mov(rmethod, c_rarg3);

    // set up the heapbase register
    __ reinit_heapbase();

#ifdef ASSERT
    // make sure we have no pending exceptions
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
      __ cmp(rscratch1, (u1)NULL_WORD);
      __ br(Assembler::EQ, L);
      __ stop("StubRoutines::call_stub: entered with pending exception");
      __ BIND(L);
    }
#endif
    // pass parameters if any
    __ mov(esp, sp);
    __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
    __ andr(sp, rscratch1, -2 * wordSize);

    BLOCK_COMMENT("pass parameters if any");
    Label parameters_done;
    // parameter count is still in c_rarg6
    // and parameter pointer identifying param 1 is in c_rarg5
    __ cbzw(c_rarg6, parameters_done);

    address loop = __ pc();
    __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
    __ subsw(c_rarg6, c_rarg6, 1);
    __ push(rscratch1);
    __ br(Assembler::GT, loop);

    __ BIND(parameters_done);

    // call Java entry -- passing Method* and current sp
    //      rmethod: Method*
    //      r19_sender_sp: sender sp
    BLOCK_COMMENT("call Java function");
    __ mov(r19_sender_sp, sp);
    __ blr(c_rarg4);

    // we do this here because the notify will already have been done
    // if we get to the next instruction via an exception
    //
    // n.b. adding this instruction here affects the calculation of
    // whether or not a routine returns to the call stub (used when
    // doing stack walks) since the normal test is to check the return
    // pc against the address saved below. so we may need to allow for
    // this extra instruction in the check.

    // save current address for use by exception handling code

    return_address = __ pc();

    // store result depending on type (everything that is not
    // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
    // n.b.
this assumes Java returns an integral result in r0 330 // and a floating result in j_farg0 331 __ ldr(j_rarg2, result); 332 Label is_long, is_float, is_double, exit; 333 __ ldr(j_rarg1, result_type); 334 __ cmp(j_rarg1, (u1)T_OBJECT); 335 __ br(Assembler::EQ, is_long); 336 __ cmp(j_rarg1, (u1)T_LONG); 337 __ br(Assembler::EQ, is_long); 338 __ cmp(j_rarg1, (u1)T_FLOAT); 339 __ br(Assembler::EQ, is_float); 340 __ cmp(j_rarg1, (u1)T_DOUBLE); 341 __ br(Assembler::EQ, is_double); 342 343 // handle T_INT case 344 __ strw(r0, Address(j_rarg2)); 345 346 __ BIND(exit); 347 348 // pop parameters 349 __ sub(esp, rfp, -sp_after_call_off * wordSize); 350 351 #ifdef ASSERT 352 // verify that threads correspond 353 { 354 Label L, S; 355 __ ldr(rscratch1, thread); 356 __ cmp(rthread, rscratch1); 357 __ br(Assembler::NE, S); 358 __ get_thread(rscratch1); 359 __ cmp(rthread, rscratch1); 360 __ br(Assembler::EQ, L); 361 __ BIND(S); 362 __ stop("StubRoutines::call_stub: threads must correspond"); 363 __ BIND(L); 364 } 365 #endif 366 367 __ pop_cont_fastpath(rthread); 368 369 // restore callee-save registers 370 __ ldpd(v15, v14, d15_save); 371 __ ldpd(v13, v12, d13_save); 372 __ ldpd(v11, v10, d11_save); 373 __ ldpd(v9, v8, d9_save); 374 375 __ ldp(r28, r27, r28_save); 376 __ ldp(r26, r25, r26_save); 377 __ ldp(r24, r23, r24_save); 378 __ ldp(r22, r21, r22_save); 379 __ ldp(r20, r19, r20_save); 380 381 // restore fpcr 382 __ ldr(rscratch1, fpcr_save); 383 __ set_fpcr(rscratch1); 384 385 __ ldp(c_rarg0, c_rarg1, call_wrapper); 386 __ ldrw(c_rarg2, result_type); 387 __ ldr(c_rarg3, method); 388 __ ldp(c_rarg4, c_rarg5, entry_point); 389 __ ldp(c_rarg6, c_rarg7, parameter_size); 390 391 // leave frame and return to caller 392 __ leave(); 393 __ ret(lr); 394 395 // handle return types different from T_INT 396 397 __ BIND(is_long); 398 __ str(r0, Address(j_rarg2, 0)); 399 __ br(Assembler::AL, exit); 400 401 __ BIND(is_float); 402 __ strs(j_farg0, Address(j_rarg2, 0)); 403 __ br(Assembler::AL, exit); 404 405 __ BIND(is_double); 406 __ strd(j_farg0, Address(j_rarg2, 0)); 407 __ br(Assembler::AL, exit); 408 409 return start; 410 } 411 412 // Return point for a Java call if there's an exception thrown in 413 // Java code. The exception is caught and transformed into a 414 // pending exception stored in JavaThread that can be tested from 415 // within the VM. 416 // 417 // Note: Usually the parameters are removed by the callee. In case 418 // of an exception crossing an activation frame boundary, that is 419 // not the case if the callee is compiled code => need to setup the 420 // rsp. 
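  // (There is no "rsp" on AArch64; the corresponding fixup here is
  // re-establishing the expression stack pointer from the saved frame,
  // which generate_call_stub does above via sp_after_call.)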
421 // 422 // r0: exception oop 423 424 address generate_catch_exception() { 425 StubId stub_id = StubId::stubgen_catch_exception_id; 426 StubCodeMark mark(this, stub_id); 427 address start = __ pc(); 428 429 // same as in generate_call_stub(): 430 const Address sp_after_call(rfp, sp_after_call_off * wordSize); 431 const Address thread (rfp, thread_off * wordSize); 432 433 #ifdef ASSERT 434 // verify that threads correspond 435 { 436 Label L, S; 437 __ ldr(rscratch1, thread); 438 __ cmp(rthread, rscratch1); 439 __ br(Assembler::NE, S); 440 __ get_thread(rscratch1); 441 __ cmp(rthread, rscratch1); 442 __ br(Assembler::EQ, L); 443 __ bind(S); 444 __ stop("StubRoutines::catch_exception: threads must correspond"); 445 __ bind(L); 446 } 447 #endif 448 449 // set pending exception 450 __ verify_oop(r0); 451 452 __ str(r0, Address(rthread, Thread::pending_exception_offset())); 453 __ mov(rscratch1, (address)__FILE__); 454 __ str(rscratch1, Address(rthread, Thread::exception_file_offset())); 455 __ movw(rscratch1, (int)__LINE__); 456 __ strw(rscratch1, Address(rthread, Thread::exception_line_offset())); 457 458 // complete return to VM 459 assert(StubRoutines::_call_stub_return_address != nullptr, 460 "_call_stub_return_address must have been generated before"); 461 __ b(StubRoutines::_call_stub_return_address); 462 463 return start; 464 } 465 466 // Continuation point for runtime calls returning with a pending 467 // exception. The pending exception check happened in the runtime 468 // or native call stub. The pending exception in Thread is 469 // converted into a Java-level exception. 470 // 471 // Contract with Java-level exception handlers: 472 // r0: exception 473 // r3: throwing pc 474 // 475 // NOTE: At entry of this stub, exception-pc must be in LR !! 476 477 // NOTE: this is always used as a jump target within generated code 478 // so it just needs to be generated code with no x86 prolog 479 480 address generate_forward_exception() { 481 StubId stub_id = StubId::stubgen_forward_exception_id; 482 StubCodeMark mark(this, stub_id); 483 address start = __ pc(); 484 485 // Upon entry, LR points to the return address returning into 486 // Java (interpreted or compiled) code; i.e., the return address 487 // becomes the throwing pc. 488 // 489 // Arguments pushed before the runtime call are still on the stack 490 // but the exception handler will reset the stack pointer -> 491 // ignore them. A potential result in registers can be ignored as 492 // well. 493 494 #ifdef ASSERT 495 // make sure this code is only executed if there is a pending exception 496 { 497 Label L; 498 __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset())); 499 __ cbnz(rscratch1, L); 500 __ stop("StubRoutines::forward exception: no pending exception (1)"); 501 __ bind(L); 502 } 503 #endif 504 505 // compute exception handler into r19 506 507 // call the VM to find the handler address associated with the 508 // caller address. pass thread in r0 and caller pc (ret address) 509 // in r1. n.b. the caller pc is in lr, unlike x86 where it is on 510 // the stack. 511 __ mov(c_rarg1, lr); 512 // lr will be trashed by the VM call so we move it to R19 513 // (callee-saved) because we also need to pass it to the handler 514 // returned by this call. 
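    // In outline: lr holds the throwing pc on entry; it is copied into
    // r19 (and passed in c_rarg1), the VM call returns the handler
    // address in r0, and below that address is moved into r19 while r3
    // receives the throwing pc and r0 is reloaded with the pending
    // exception oop.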
515 __ mov(r19, lr); 516 BLOCK_COMMENT("call exception_handler_for_return_address"); 517 __ call_VM_leaf(CAST_FROM_FN_PTR(address, 518 SharedRuntime::exception_handler_for_return_address), 519 rthread, c_rarg1); 520 // Reinitialize the ptrue predicate register, in case the external runtime 521 // call clobbers ptrue reg, as we may return to SVE compiled code. 522 __ reinitialize_ptrue(); 523 524 // we should not really care that lr is no longer the callee 525 // address. we saved the value the handler needs in r19 so we can 526 // just copy it to r3. however, the C2 handler will push its own 527 // frame and then calls into the VM and the VM code asserts that 528 // the PC for the frame above the handler belongs to a compiled 529 // Java method. So, we restore lr here to satisfy that assert. 530 __ mov(lr, r19); 531 // setup r0 & r3 & clear pending exception 532 __ mov(r3, r19); 533 __ mov(r19, r0); 534 __ ldr(r0, Address(rthread, Thread::pending_exception_offset())); 535 __ str(zr, Address(rthread, Thread::pending_exception_offset())); 536 537 #ifdef ASSERT 538 // make sure exception is set 539 { 540 Label L; 541 __ cbnz(r0, L); 542 __ stop("StubRoutines::forward exception: no pending exception (2)"); 543 __ bind(L); 544 } 545 #endif 546 547 // continue at exception handler 548 // r0: exception 549 // r3: throwing pc 550 // r19: exception handler 551 __ verify_oop(r0); 552 __ br(r19); 553 554 return start; 555 } 556 557 // Non-destructive plausibility checks for oops 558 // 559 // Arguments: 560 // r0: oop to verify 561 // rscratch1: error message 562 // 563 // Stack after saving c_rarg3: 564 // [tos + 0]: saved c_rarg3 565 // [tos + 1]: saved c_rarg2 566 // [tos + 2]: saved lr 567 // [tos + 3]: saved rscratch2 568 // [tos + 4]: saved r0 569 // [tos + 5]: saved rscratch1 570 address generate_verify_oop() { 571 StubId stub_id = StubId::stubgen_verify_oop_id; 572 StubCodeMark mark(this, stub_id); 573 address start = __ pc(); 574 575 Label exit, error; 576 577 // save c_rarg2 and c_rarg3 578 __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16))); 579 580 // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr())); 581 __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr())); 582 __ ldr(c_rarg3, Address(c_rarg2)); 583 __ add(c_rarg3, c_rarg3, 1); 584 __ str(c_rarg3, Address(c_rarg2)); 585 586 // object is in r0 587 // make sure object is 'reasonable' 588 __ cbz(r0, exit); // if obj is null it is OK 589 590 BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler(); 591 bs_asm->check_oop(_masm, r0, c_rarg2, c_rarg3, error); 592 593 // return if everything seems ok 594 __ bind(exit); 595 596 __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16))); 597 __ ret(lr); 598 599 // handle errors 600 __ bind(error); 601 __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16))); 602 603 __ push(RegSet::range(r0, r29), sp); 604 // debug(char* msg, int64_t pc, int64_t regs[]) 605 __ mov(c_rarg0, rscratch1); // pass address of error message 606 __ mov(c_rarg1, lr); // pass return address 607 __ mov(c_rarg2, sp); // pass address of regs on stack 608 #ifndef PRODUCT 609 assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area"); 610 #endif 611 BLOCK_COMMENT("call MacroAssembler::debug"); 612 __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64)); 613 __ blr(rscratch1); 614 __ hlt(0); 615 616 return start; 617 } 618 619 // Generate indices for iota vector. 
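  //
  // Each 16-byte row below holds the lane indices {0, 1, 2, ...} for
  // one element size (B/H/S/D, plus float and double variants); e.g.
  // the two B words decode, little-endian, to bytes 0..15. A user of
  // this stub can load the row matching its element type to
  // materialize an index (iota) vector without per-lane moves.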
620 address generate_iota_indices(StubId stub_id) { 621 __ align(CodeEntryAlignment); 622 StubCodeMark mark(this, stub_id); 623 address start = __ pc(); 624 // B 625 __ emit_data64(0x0706050403020100, relocInfo::none); 626 __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none); 627 // H 628 __ emit_data64(0x0003000200010000, relocInfo::none); 629 __ emit_data64(0x0007000600050004, relocInfo::none); 630 // S 631 __ emit_data64(0x0000000100000000, relocInfo::none); 632 __ emit_data64(0x0000000300000002, relocInfo::none); 633 // D 634 __ emit_data64(0x0000000000000000, relocInfo::none); 635 __ emit_data64(0x0000000000000001, relocInfo::none); 636 // S - FP 637 __ emit_data64(0x3F80000000000000, relocInfo::none); // 0.0f, 1.0f 638 __ emit_data64(0x4040000040000000, relocInfo::none); // 2.0f, 3.0f 639 // D - FP 640 __ emit_data64(0x0000000000000000, relocInfo::none); // 0.0d 641 __ emit_data64(0x3FF0000000000000, relocInfo::none); // 1.0d 642 return start; 643 } 644 645 // The inner part of zero_words(). This is the bulk operation, 646 // zeroing words in blocks, possibly using DC ZVA to do it. The 647 // caller is responsible for zeroing the last few words. 648 // 649 // Inputs: 650 // r10: the HeapWord-aligned base address of an array to zero. 651 // r11: the count in HeapWords, r11 > 0. 652 // 653 // Returns r10 and r11, adjusted for the caller to clear. 654 // r10: the base address of the tail of words left to clear. 655 // r11: the number of words in the tail. 656 // r11 < MacroAssembler::zero_words_block_size. 657 658 address generate_zero_blocks() { 659 Label done; 660 Label base_aligned; 661 662 Register base = r10, cnt = r11; 663 664 __ align(CodeEntryAlignment); 665 StubId stub_id = StubId::stubgen_zero_blocks_id; 666 StubCodeMark mark(this, stub_id); 667 address start = __ pc(); 668 669 if (UseBlockZeroing) { 670 int zva_length = VM_Version::zva_length(); 671 672 // Ensure ZVA length can be divided by 16. This is required by 673 // the subsequent operations. 674 assert (zva_length % 16 == 0, "Unexpected ZVA Length"); 675 676 __ tbz(base, 3, base_aligned); 677 __ str(zr, Address(__ post(base, 8))); 678 __ sub(cnt, cnt, 1); 679 __ bind(base_aligned); 680 681 // Ensure count >= zva_length * 2 so that it still deserves a zva after 682 // alignment. 683 Label small; 684 int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit); 685 __ subs(rscratch1, cnt, low_limit >> 3); 686 __ br(Assembler::LT, small); 687 __ zero_dcache_blocks(base, cnt); 688 __ bind(small); 689 } 690 691 { 692 // Number of stp instructions we'll unroll 693 const int unroll = 694 MacroAssembler::zero_words_block_size / 2; 695 // Clear the remaining blocks. 696 Label loop; 697 __ subs(cnt, cnt, unroll * 2); 698 __ br(Assembler::LT, done); 699 __ bind(loop); 700 for (int i = 0; i < unroll; i++) 701 __ stp(zr, zr, __ post(base, 16)); 702 __ subs(cnt, cnt, unroll * 2); 703 __ br(Assembler::GE, loop); 704 __ bind(done); 705 __ add(cnt, cnt, unroll * 2); 706 } 707 708 __ ret(lr); 709 710 return start; 711 } 712 713 714 typedef enum { 715 copy_forwards = 1, 716 copy_backwards = -1 717 } copy_direction; 718 719 // Helper object to reduce noise when telling the GC barriers how to perform loads and stores 720 // for arraycopy stubs. 
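  // In outline, each copy stub below constructs one helper, passing its
  // decorator set, element type and the scratch registers a GC barrier
  // may clobber, and then uses the copy_load_at_*/copy_store_at_*
  // methods in place of raw ldr/str/ldp/stp so that the
  // BarrierSetAssembler can interpose on oop loads and stores when it
  // needs to.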
721 class ArrayCopyBarrierSetHelper : StackObj { 722 BarrierSetAssembler* _bs_asm; 723 MacroAssembler* _masm; 724 DecoratorSet _decorators; 725 BasicType _type; 726 Register _gct1; 727 Register _gct2; 728 Register _gct3; 729 FloatRegister _gcvt1; 730 FloatRegister _gcvt2; 731 FloatRegister _gcvt3; 732 733 public: 734 ArrayCopyBarrierSetHelper(MacroAssembler* masm, 735 DecoratorSet decorators, 736 BasicType type, 737 Register gct1, 738 Register gct2, 739 Register gct3, 740 FloatRegister gcvt1, 741 FloatRegister gcvt2, 742 FloatRegister gcvt3) 743 : _bs_asm(BarrierSet::barrier_set()->barrier_set_assembler()), 744 _masm(masm), 745 _decorators(decorators), 746 _type(type), 747 _gct1(gct1), 748 _gct2(gct2), 749 _gct3(gct3), 750 _gcvt1(gcvt1), 751 _gcvt2(gcvt2), 752 _gcvt3(gcvt3) { 753 } 754 755 void copy_load_at_32(FloatRegister dst1, FloatRegister dst2, Address src) { 756 _bs_asm->copy_load_at(_masm, _decorators, _type, 32, 757 dst1, dst2, src, 758 _gct1, _gct2, _gcvt1); 759 } 760 761 void copy_store_at_32(Address dst, FloatRegister src1, FloatRegister src2) { 762 _bs_asm->copy_store_at(_masm, _decorators, _type, 32, 763 dst, src1, src2, 764 _gct1, _gct2, _gct3, _gcvt1, _gcvt2, _gcvt3); 765 } 766 767 void copy_load_at_16(Register dst1, Register dst2, Address src) { 768 _bs_asm->copy_load_at(_masm, _decorators, _type, 16, 769 dst1, dst2, src, 770 _gct1); 771 } 772 773 void copy_store_at_16(Address dst, Register src1, Register src2) { 774 _bs_asm->copy_store_at(_masm, _decorators, _type, 16, 775 dst, src1, src2, 776 _gct1, _gct2, _gct3); 777 } 778 779 void copy_load_at_8(Register dst, Address src) { 780 _bs_asm->copy_load_at(_masm, _decorators, _type, 8, 781 dst, noreg, src, 782 _gct1); 783 } 784 785 void copy_store_at_8(Address dst, Register src) { 786 _bs_asm->copy_store_at(_masm, _decorators, _type, 8, 787 dst, src, noreg, 788 _gct1, _gct2, _gct3); 789 } 790 }; 791 792 // Bulk copy of blocks of 8 words. 793 // 794 // count is a count of words. 795 // 796 // Precondition: count >= 8 797 // 798 // Postconditions: 799 // 800 // The least significant bit of count contains the remaining count 801 // of words to copy. The rest of count is trash. 802 // 803 // s and d are adjusted to point to the remaining words to copy 804 // 805 void generate_copy_longs(StubId stub_id, DecoratorSet decorators, Label &start, Register s, Register d, Register count) { 806 BasicType type; 807 copy_direction direction; 808 809 switch (stub_id) { 810 case StubId::stubgen_copy_byte_f_id: 811 direction = copy_forwards; 812 type = T_BYTE; 813 break; 814 case StubId::stubgen_copy_byte_b_id: 815 direction = copy_backwards; 816 type = T_BYTE; 817 break; 818 case StubId::stubgen_copy_oop_f_id: 819 direction = copy_forwards; 820 type = T_OBJECT; 821 break; 822 case StubId::stubgen_copy_oop_b_id: 823 direction = copy_backwards; 824 type = T_OBJECT; 825 break; 826 case StubId::stubgen_copy_oop_uninit_f_id: 827 direction = copy_forwards; 828 type = T_OBJECT; 829 break; 830 case StubId::stubgen_copy_oop_uninit_b_id: 831 direction = copy_backwards; 832 type = T_OBJECT; 833 break; 834 default: 835 ShouldNotReachHere(); 836 } 837 838 int unit = wordSize * direction; 839 int bias = (UseSIMDForMemoryOps ? 
4:2) * wordSize; 840 841 const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6, 842 t4 = r7, t5 = r11, t6 = r12, t7 = r13; 843 const Register stride = r14; 844 const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10; 845 const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved 846 ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3); 847 848 assert_different_registers(rscratch1, rscratch2, t0, t1, t2, t3, t4, t5, t6, t7); 849 assert_different_registers(s, d, count, rscratch1, rscratch2); 850 851 Label again, drain; 852 853 __ align(CodeEntryAlignment); 854 855 StubCodeMark mark(this, stub_id); 856 857 __ bind(start); 858 859 Label unaligned_copy_long; 860 if (AvoidUnalignedAccesses) { 861 __ tbnz(d, 3, unaligned_copy_long); 862 } 863 864 if (direction == copy_forwards) { 865 __ sub(s, s, bias); 866 __ sub(d, d, bias); 867 } 868 869 #ifdef ASSERT 870 // Make sure we are never given < 8 words 871 { 872 Label L; 873 __ cmp(count, (u1)8); 874 __ br(Assembler::GE, L); 875 __ stop("genrate_copy_longs called with < 8 words"); 876 __ bind(L); 877 } 878 #endif 879 880 // Fill 8 registers 881 if (UseSIMDForMemoryOps) { 882 bs.copy_load_at_32(v0, v1, Address(s, 4 * unit)); 883 bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit))); 884 } else { 885 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit)); 886 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit)); 887 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit)); 888 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit))); 889 } 890 891 __ subs(count, count, 16); 892 __ br(Assembler::LO, drain); 893 894 int prefetch = PrefetchCopyIntervalInBytes; 895 bool use_stride = false; 896 if (direction == copy_backwards) { 897 use_stride = prefetch > 256; 898 prefetch = -prefetch; 899 if (use_stride) __ mov(stride, prefetch); 900 } 901 902 __ bind(again); 903 904 if (PrefetchCopyIntervalInBytes > 0) 905 __ prfm(use_stride ? 
Address(s, stride) : Address(s, prefetch), PLDL1KEEP); 906 907 if (UseSIMDForMemoryOps) { 908 bs.copy_store_at_32(Address(d, 4 * unit), v0, v1); 909 bs.copy_load_at_32(v0, v1, Address(s, 4 * unit)); 910 bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3); 911 bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit))); 912 } else { 913 bs.copy_store_at_16(Address(d, 2 * unit), t0, t1); 914 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit)); 915 bs.copy_store_at_16(Address(d, 4 * unit), t2, t3); 916 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit)); 917 bs.copy_store_at_16(Address(d, 6 * unit), t4, t5); 918 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit)); 919 bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7); 920 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit))); 921 } 922 923 __ subs(count, count, 8); 924 __ br(Assembler::HS, again); 925 926 // Drain 927 __ bind(drain); 928 if (UseSIMDForMemoryOps) { 929 bs.copy_store_at_32(Address(d, 4 * unit), v0, v1); 930 bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3); 931 } else { 932 bs.copy_store_at_16(Address(d, 2 * unit), t0, t1); 933 bs.copy_store_at_16(Address(d, 4 * unit), t2, t3); 934 bs.copy_store_at_16(Address(d, 6 * unit), t4, t5); 935 bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7); 936 } 937 938 { 939 Label L1, L2; 940 __ tbz(count, exact_log2(4), L1); 941 if (UseSIMDForMemoryOps) { 942 bs.copy_load_at_32(v0, v1, Address(__ pre(s, 4 * unit))); 943 bs.copy_store_at_32(Address(__ pre(d, 4 * unit)), v0, v1); 944 } else { 945 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit)); 946 bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit))); 947 bs.copy_store_at_16(Address(d, 2 * unit), t0, t1); 948 bs.copy_store_at_16(Address(__ pre(d, 4 * unit)), t2, t3); 949 } 950 __ bind(L1); 951 952 if (direction == copy_forwards) { 953 __ add(s, s, bias); 954 __ add(d, d, bias); 955 } 956 957 __ tbz(count, 1, L2); 958 bs.copy_load_at_16(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards))); 959 bs.copy_store_at_16(Address(__ adjust(d, 2 * unit, direction == copy_backwards)), t0, t1); 960 __ bind(L2); 961 } 962 963 __ ret(lr); 964 965 if (AvoidUnalignedAccesses) { 966 Label drain, again; 967 // Register order for storing. Order is different for backward copy. 968 969 __ bind(unaligned_copy_long); 970 971 // source address is even aligned, target odd aligned 972 // 973 // when forward copying word pairs we read long pairs at offsets 974 // {0, 2, 4, 6} (in long words). when backwards copying we read 975 // long pairs at offsets {-2, -4, -6, -8}. We adjust the source 976 // address by -2 in the forwards case so we can compute the 977 // source offsets for both as {2, 4, 6, 8} * unit where unit = 1 978 // or -1. 979 // 980 // when forward copying we need to store 1 word, 3 pairs and 981 // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a 982 // zero offset We adjust the destination by -1 which means we 983 // have to use offsets { 1, 2, 4, 6, 8} * unit for the stores. 984 // 985 // When backwards copyng we need to store 1 word, 3 pairs and 986 // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use 987 // offsets {1, 3, 5, 7, 8} * unit. 
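      //
      // Worked example for the first forward iteration (offsets are
      // byte offsets from the original, unadjusted s and d): after the
      // -16/-8 adjustments below, the four 16 byte loads fetch source
      // bytes [0, 64) into t0..t7, and the stores then place t0 at d+0,
      // (t1, t2) at d+8, (t3, t4) at d+24, (t5, t6) at d+40 and t7 at
      // d+56 -- one leading word, three pairs and a trailing word, as
      // described above.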
988 989 if (direction == copy_forwards) { 990 __ sub(s, s, 16); 991 __ sub(d, d, 8); 992 } 993 994 // Fill 8 registers 995 // 996 // for forwards copy s was offset by -16 from the original input 997 // value of s so the register contents are at these offsets 998 // relative to the 64 bit block addressed by that original input 999 // and so on for each successive 64 byte block when s is updated 1000 // 1001 // t0 at offset 0, t1 at offset 8 1002 // t2 at offset 16, t3 at offset 24 1003 // t4 at offset 32, t5 at offset 40 1004 // t6 at offset 48, t7 at offset 56 1005 1006 // for backwards copy s was not offset so the register contents 1007 // are at these offsets into the preceding 64 byte block 1008 // relative to that original input and so on for each successive 1009 // preceding 64 byte block when s is updated. this explains the 1010 // slightly counter-intuitive looking pattern of register usage 1011 // in the stp instructions for backwards copy. 1012 // 1013 // t0 at offset -16, t1 at offset -8 1014 // t2 at offset -32, t3 at offset -24 1015 // t4 at offset -48, t5 at offset -40 1016 // t6 at offset -64, t7 at offset -56 1017 1018 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit)); 1019 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit)); 1020 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit)); 1021 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit))); 1022 1023 __ subs(count, count, 16); 1024 __ br(Assembler::LO, drain); 1025 1026 int prefetch = PrefetchCopyIntervalInBytes; 1027 bool use_stride = false; 1028 if (direction == copy_backwards) { 1029 use_stride = prefetch > 256; 1030 prefetch = -prefetch; 1031 if (use_stride) __ mov(stride, prefetch); 1032 } 1033 1034 __ bind(again); 1035 1036 if (PrefetchCopyIntervalInBytes > 0) 1037 __ prfm(use_stride ? 
Address(s, stride) : Address(s, prefetch), PLDL1KEEP); 1038 1039 if (direction == copy_forwards) { 1040 // allowing for the offset of -8 the store instructions place 1041 // registers into the target 64 bit block at the following 1042 // offsets 1043 // 1044 // t0 at offset 0 1045 // t1 at offset 8, t2 at offset 16 1046 // t3 at offset 24, t4 at offset 32 1047 // t5 at offset 40, t6 at offset 48 1048 // t7 at offset 56 1049 1050 bs.copy_store_at_8(Address(d, 1 * unit), t0); 1051 bs.copy_store_at_16(Address(d, 2 * unit), t1, t2); 1052 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit)); 1053 bs.copy_store_at_16(Address(d, 4 * unit), t3, t4); 1054 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit)); 1055 bs.copy_store_at_16(Address(d, 6 * unit), t5, t6); 1056 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit)); 1057 bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7); 1058 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit))); 1059 } else { 1060 // d was not offset when we started so the registers are 1061 // written into the 64 bit block preceding d with the following 1062 // offsets 1063 // 1064 // t1 at offset -8 1065 // t3 at offset -24, t0 at offset -16 1066 // t5 at offset -48, t2 at offset -32 1067 // t7 at offset -56, t4 at offset -48 1068 // t6 at offset -64 1069 // 1070 // note that this matches the offsets previously noted for the 1071 // loads 1072 1073 bs.copy_store_at_8(Address(d, 1 * unit), t1); 1074 bs.copy_store_at_16(Address(d, 3 * unit), t3, t0); 1075 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit)); 1076 bs.copy_store_at_16(Address(d, 5 * unit), t5, t2); 1077 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit)); 1078 bs.copy_store_at_16(Address(d, 7 * unit), t7, t4); 1079 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit)); 1080 bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6); 1081 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit))); 1082 } 1083 1084 __ subs(count, count, 8); 1085 __ br(Assembler::HS, again); 1086 1087 // Drain 1088 // 1089 // this uses the same pattern of offsets and register arguments 1090 // as above 1091 __ bind(drain); 1092 if (direction == copy_forwards) { 1093 bs.copy_store_at_8(Address(d, 1 * unit), t0); 1094 bs.copy_store_at_16(Address(d, 2 * unit), t1, t2); 1095 bs.copy_store_at_16(Address(d, 4 * unit), t3, t4); 1096 bs.copy_store_at_16(Address(d, 6 * unit), t5, t6); 1097 bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7); 1098 } else { 1099 bs.copy_store_at_8(Address(d, 1 * unit), t1); 1100 bs.copy_store_at_16(Address(d, 3 * unit), t3, t0); 1101 bs.copy_store_at_16(Address(d, 5 * unit), t5, t2); 1102 bs.copy_store_at_16(Address(d, 7 * unit), t7, t4); 1103 bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6); 1104 } 1105 // now we need to copy any remaining part block which may 1106 // include a 4 word block subblock and/or a 2 word subblock. 
      // bits 2 and 1 in the count are the tell-tale for whether we
      // have each such subblock
      {
        Label L1, L2;
        __ tbz(count, exact_log2(4), L1);
        // this is the same as above but copying only 4 longs hence
        // with only one intervening stp between the str instructions
        // but note that the offsets and registers still follow the
        // same pattern
        bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
        bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
        if (direction == copy_forwards) {
          bs.copy_store_at_8(Address(d, 1 * unit), t0);
          bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
          bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t3);
        } else {
          bs.copy_store_at_8(Address(d, 1 * unit), t1);
          bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
          bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t2);
        }
        __ bind(L1);

        __ tbz(count, 1, L2);
        // this is the same as above but copying only 2 longs hence
        // there is no intervening stp between the str instructions
        // but note that the offset and register patterns are still
        // the same
        bs.copy_load_at_16(t0, t1, Address(__ pre(s, 2 * unit)));
        if (direction == copy_forwards) {
          bs.copy_store_at_8(Address(d, 1 * unit), t0);
          bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t1);
        } else {
          bs.copy_store_at_8(Address(d, 1 * unit), t1);
          bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t0);
        }
        __ bind(L2);

        // for forwards copy we need to re-adjust the offsets we
        // applied so that s and d follow the last words written

        if (direction == copy_forwards) {
          __ add(s, s, 16);
          __ add(d, d, 8);
        }

      }

      __ ret(lr);
    }
  }

  // Small copy: less than 16 bytes.
  //
  // NB: Ignores all of the bits of count which represent more than 15
  // bytes, so a caller doesn't have to mask them.

  void copy_memory_small(DecoratorSet decorators, BasicType type, Register s, Register d, Register count, int step) {
    bool is_backwards = step < 0;
    size_t granularity = g_uabs(step);
    int direction = is_backwards ? -1 : 1;

    Label Lword, Lint, Lshort, Lbyte;

    assert(granularity
           && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");

    const Register t0 = r3;
    const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
    ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, fnoreg, fnoreg, fnoreg);

    // ??? I don't know if this bit-test-and-branch is the right thing
    // to do. It does a lot of jumping, resulting in several
    // mispredicted branches. It might make more sense to do this
    // with something like Duff's device with a single computed branch.
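    // For example, with step == 1 (byte copy, so count is in bytes) and
    // count == 13 (0b1101), the tests below copy a word (bit 3), then
    // an int (bit 2), skip the short (bit 1 clear) and finish with a
    // single byte (bit 0), i.e. 8 + 4 + 1 == 13 bytes in total.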
1181 1182 __ tbz(count, 3 - exact_log2(granularity), Lword); 1183 bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards))); 1184 bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0); 1185 __ bind(Lword); 1186 1187 if (granularity <= sizeof (jint)) { 1188 __ tbz(count, 2 - exact_log2(granularity), Lint); 1189 __ ldrw(t0, Address(__ adjust(s, sizeof (jint) * direction, is_backwards))); 1190 __ strw(t0, Address(__ adjust(d, sizeof (jint) * direction, is_backwards))); 1191 __ bind(Lint); 1192 } 1193 1194 if (granularity <= sizeof (jshort)) { 1195 __ tbz(count, 1 - exact_log2(granularity), Lshort); 1196 __ ldrh(t0, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards))); 1197 __ strh(t0, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards))); 1198 __ bind(Lshort); 1199 } 1200 1201 if (granularity <= sizeof (jbyte)) { 1202 __ tbz(count, 0, Lbyte); 1203 __ ldrb(t0, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards))); 1204 __ strb(t0, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards))); 1205 __ bind(Lbyte); 1206 } 1207 } 1208 1209 Label copy_f, copy_b; 1210 Label copy_obj_f, copy_obj_b; 1211 Label copy_obj_uninit_f, copy_obj_uninit_b; 1212 1213 // All-singing all-dancing memory copy. 1214 // 1215 // Copy count units of memory from s to d. The size of a unit is 1216 // step, which can be positive or negative depending on the direction 1217 // of copy. If is_aligned is false, we align the source address. 1218 // 1219 1220 void copy_memory(DecoratorSet decorators, BasicType type, bool is_aligned, 1221 Register s, Register d, Register count, int step) { 1222 copy_direction direction = step < 0 ? copy_backwards : copy_forwards; 1223 bool is_backwards = step < 0; 1224 unsigned int granularity = g_uabs(step); 1225 const Register t0 = r3, t1 = r4; 1226 1227 // <= 80 (or 96 for SIMD) bytes do inline. Direction doesn't matter because we always 1228 // load all the data before writing anything 1229 Label copy4, copy8, copy16, copy32, copy80, copy_big, finish; 1230 const Register t2 = r5, t3 = r6, t4 = r7, t5 = r11; 1231 const Register t6 = r12, t7 = r13, t8 = r14, t9 = r15; 1232 const Register send = r17, dend = r16; 1233 const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10; 1234 const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved 1235 ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3); 1236 1237 if (PrefetchCopyIntervalInBytes > 0) 1238 __ prfm(Address(s, 0), PLDL1KEEP); 1239 __ cmp(count, u1((UseSIMDForMemoryOps ? 
96:80)/granularity));
    __ br(Assembler::HI, copy_big);

    __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
    __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));

    __ cmp(count, u1(16/granularity));
    __ br(Assembler::LS, copy16);

    __ cmp(count, u1(64/granularity));
    __ br(Assembler::HI, copy80);

    __ cmp(count, u1(32/granularity));
    __ br(Assembler::LS, copy32);

    // 33..64 bytes
    if (UseSIMDForMemoryOps) {
      bs.copy_load_at_32(v0, v1, Address(s, 0));
      bs.copy_load_at_32(v2, v3, Address(send, -32));
      bs.copy_store_at_32(Address(d, 0), v0, v1);
      bs.copy_store_at_32(Address(dend, -32), v2, v3);
    } else {
      bs.copy_load_at_16(t0, t1, Address(s, 0));
      bs.copy_load_at_16(t2, t3, Address(s, 16));
      bs.copy_load_at_16(t4, t5, Address(send, -32));
      bs.copy_load_at_16(t6, t7, Address(send, -16));

      bs.copy_store_at_16(Address(d, 0), t0, t1);
      bs.copy_store_at_16(Address(d, 16), t2, t3);
      bs.copy_store_at_16(Address(dend, -32), t4, t5);
      bs.copy_store_at_16(Address(dend, -16), t6, t7);
    }
    __ b(finish);

    // 17..32 bytes
    __ bind(copy32);
    bs.copy_load_at_16(t0, t1, Address(s, 0));
    bs.copy_load_at_16(t6, t7, Address(send, -16));

    bs.copy_store_at_16(Address(d, 0), t0, t1);
    bs.copy_store_at_16(Address(dend, -16), t6, t7);
    __ b(finish);

    // 65..80/96 bytes
    // (96 bytes if SIMD because we do 32 bytes per instruction)
    __ bind(copy80);
    if (UseSIMDForMemoryOps) {
      bs.copy_load_at_32(v0, v1, Address(s, 0));
      bs.copy_load_at_32(v2, v3, Address(s, 32));
      // Unaligned pointers can be an issue for copying.
      // The issue is more likely when the granularity of the data is
      // less than 4 (sizeof(jint)): jint array pointers are at least
      // 4 byte aligned and jlong array pointers are 8 byte aligned.
      // The biggest performance drop has been seen for the 65-80 byte range.
      // For such cases using a pair of ldp/stp instead of a third pair of
      // ldpq/stpq fixes the performance issue.
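      // So when the count is at most 80 bytes the code below issues the
      // two 32 byte SIMD stores for the head and a single 16 byte
      // ldp/stp anchored at the end of the range (dend - 16); the tail
      // store may overlap the head stores, which is safe here because
      // all of the loads are issued before any of the stores.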
1295 if (granularity < sizeof (jint)) { 1296 Label copy96; 1297 __ cmp(count, u1(80/granularity)); 1298 __ br(Assembler::HI, copy96); 1299 bs.copy_load_at_16(t0, t1, Address(send, -16)); 1300 1301 bs.copy_store_at_32(Address(d, 0), v0, v1); 1302 bs.copy_store_at_32(Address(d, 32), v2, v3); 1303 1304 bs.copy_store_at_16(Address(dend, -16), t0, t1); 1305 __ b(finish); 1306 1307 __ bind(copy96); 1308 } 1309 bs.copy_load_at_32(v4, v5, Address(send, -32)); 1310 1311 bs.copy_store_at_32(Address(d, 0), v0, v1); 1312 bs.copy_store_at_32(Address(d, 32), v2, v3); 1313 1314 bs.copy_store_at_32(Address(dend, -32), v4, v5); 1315 } else { 1316 bs.copy_load_at_16(t0, t1, Address(s, 0)); 1317 bs.copy_load_at_16(t2, t3, Address(s, 16)); 1318 bs.copy_load_at_16(t4, t5, Address(s, 32)); 1319 bs.copy_load_at_16(t6, t7, Address(s, 48)); 1320 bs.copy_load_at_16(t8, t9, Address(send, -16)); 1321 1322 bs.copy_store_at_16(Address(d, 0), t0, t1); 1323 bs.copy_store_at_16(Address(d, 16), t2, t3); 1324 bs.copy_store_at_16(Address(d, 32), t4, t5); 1325 bs.copy_store_at_16(Address(d, 48), t6, t7); 1326 bs.copy_store_at_16(Address(dend, -16), t8, t9); 1327 } 1328 __ b(finish); 1329 1330 // 0..16 bytes 1331 __ bind(copy16); 1332 __ cmp(count, u1(8/granularity)); 1333 __ br(Assembler::LO, copy8); 1334 1335 // 8..16 bytes 1336 bs.copy_load_at_8(t0, Address(s, 0)); 1337 bs.copy_load_at_8(t1, Address(send, -8)); 1338 bs.copy_store_at_8(Address(d, 0), t0); 1339 bs.copy_store_at_8(Address(dend, -8), t1); 1340 __ b(finish); 1341 1342 if (granularity < 8) { 1343 // 4..7 bytes 1344 __ bind(copy8); 1345 __ tbz(count, 2 - exact_log2(granularity), copy4); 1346 __ ldrw(t0, Address(s, 0)); 1347 __ ldrw(t1, Address(send, -4)); 1348 __ strw(t0, Address(d, 0)); 1349 __ strw(t1, Address(dend, -4)); 1350 __ b(finish); 1351 if (granularity < 4) { 1352 // 0..3 bytes 1353 __ bind(copy4); 1354 __ cbz(count, finish); // get rid of 0 case 1355 if (granularity == 2) { 1356 __ ldrh(t0, Address(s, 0)); 1357 __ strh(t0, Address(d, 0)); 1358 } else { // granularity == 1 1359 // Now 1..3 bytes. Handle the 1 and 2 byte case by copying 1360 // the first and last byte. 1361 // Handle the 3 byte case by loading and storing base + count/2 1362 // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1)) 1363 // This does means in the 1 byte case we load/store the same 1364 // byte 3 times. 1365 __ lsr(count, count, 1); 1366 __ ldrb(t0, Address(s, 0)); 1367 __ ldrb(t1, Address(send, -1)); 1368 __ ldrb(t2, Address(s, count)); 1369 __ strb(t0, Address(d, 0)); 1370 __ strb(t1, Address(dend, -1)); 1371 __ strb(t2, Address(d, count)); 1372 } 1373 __ b(finish); 1374 } 1375 } 1376 1377 __ bind(copy_big); 1378 if (is_backwards) { 1379 __ lea(s, Address(s, count, Address::lsl(exact_log2(-step)))); 1380 __ lea(d, Address(d, count, Address::lsl(exact_log2(-step)))); 1381 } 1382 1383 // Now we've got the small case out of the way we can align the 1384 // source address on a 2-word boundary. 1385 1386 // Here we will materialize a count in r15, which is used by copy_memory_small 1387 // and the various generate_copy_longs stubs that we use for 2 word aligned bytes. 1388 // Up until here, we have used t9, which aliases r15, but from here on, that register 1389 // can not be used as a temp register, as it contains the count. 1390 1391 Label aligned; 1392 1393 if (is_aligned) { 1394 // We may have to adjust by 1 word to get s 2-word-aligned. 
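      // Only one probe is needed here because the aligned variants
      // assume s is already word aligned: s is either on a 2-word
      // boundary or exactly one word short of one.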
1395 __ tbz(s, exact_log2(wordSize), aligned); 1396 bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards))); 1397 bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0); 1398 __ sub(count, count, wordSize/granularity); 1399 } else { 1400 if (is_backwards) { 1401 __ andr(r15, s, 2 * wordSize - 1); 1402 } else { 1403 __ neg(r15, s); 1404 __ andr(r15, r15, 2 * wordSize - 1); 1405 } 1406 // r15 is the byte adjustment needed to align s. 1407 __ cbz(r15, aligned); 1408 int shift = exact_log2(granularity); 1409 if (shift > 0) { 1410 __ lsr(r15, r15, shift); 1411 } 1412 __ sub(count, count, r15); 1413 1414 #if 0 1415 // ?? This code is only correct for a disjoint copy. It may or 1416 // may not make sense to use it in that case. 1417 1418 // Copy the first pair; s and d may not be aligned. 1419 __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0)); 1420 __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0)); 1421 1422 // Align s and d, adjust count 1423 if (is_backwards) { 1424 __ sub(s, s, r15); 1425 __ sub(d, d, r15); 1426 } else { 1427 __ add(s, s, r15); 1428 __ add(d, d, r15); 1429 } 1430 #else 1431 copy_memory_small(decorators, type, s, d, r15, step); 1432 #endif 1433 } 1434 1435 __ bind(aligned); 1436 1437 // s is now 2-word-aligned. 1438 1439 // We have a count of units and some trailing bytes. Adjust the 1440 // count and do a bulk copy of words. If the shift is zero 1441 // perform a move instead to benefit from zero latency moves. 1442 int shift = exact_log2(wordSize/granularity); 1443 if (shift > 0) { 1444 __ lsr(r15, count, shift); 1445 } else { 1446 __ mov(r15, count); 1447 } 1448 if (direction == copy_forwards) { 1449 if (type != T_OBJECT) { 1450 __ bl(copy_f); 1451 } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) { 1452 __ bl(copy_obj_uninit_f); 1453 } else { 1454 __ bl(copy_obj_f); 1455 } 1456 } else { 1457 if (type != T_OBJECT) { 1458 __ bl(copy_b); 1459 } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) { 1460 __ bl(copy_obj_uninit_b); 1461 } else { 1462 __ bl(copy_obj_b); 1463 } 1464 } 1465 1466 // And the tail. 1467 copy_memory_small(decorators, type, s, d, count, step); 1468 1469 if (granularity >= 8) __ bind(copy8); 1470 if (granularity >= 4) __ bind(copy4); 1471 __ bind(finish); 1472 } 1473 1474 1475 void clobber_registers() { 1476 #ifdef ASSERT 1477 RegSet clobbered 1478 = MacroAssembler::call_clobbered_gp_registers() - rscratch1; 1479 __ mov(rscratch1, (uint64_t)0xdeadbeef); 1480 __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32); 1481 for (RegSetIterator<Register> it = clobbered.begin(); *it != noreg; ++it) { 1482 __ mov(*it, rscratch1); 1483 } 1484 #endif 1485 1486 } 1487 1488 // Scan over array at a for count oops, verifying each one. 1489 // Preserves a and count, clobbers rscratch1 and rscratch2. 
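  // size is the element size in bytes: wordSize for uncompressed oops,
  // 4 when UseCompressedOops, in which case each narrow oop is decoded
  // (which also verifies it) before moving on to the next element.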
1490 void verify_oop_array (int size, Register a, Register count, Register temp) { 1491 Label loop, end; 1492 __ mov(rscratch1, a); 1493 __ mov(rscratch2, zr); 1494 __ bind(loop); 1495 __ cmp(rscratch2, count); 1496 __ br(Assembler::HS, end); 1497 if (size == wordSize) { 1498 __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size)))); 1499 __ verify_oop(temp); 1500 } else { 1501 __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size)))); 1502 __ decode_heap_oop(temp); // calls verify_oop 1503 } 1504 __ add(rscratch2, rscratch2, 1); 1505 __ b(loop); 1506 __ bind(end); 1507 } 1508 1509 // Arguments: 1510 // stub_id - is used to name the stub and identify all details of 1511 // how to perform the copy. 1512 // 1513 // entry - is assigned to the stub's post push entry point unless 1514 // it is null 1515 // 1516 // Inputs: 1517 // c_rarg0 - source array address 1518 // c_rarg1 - destination array address 1519 // c_rarg2 - element count, treated as ssize_t, can be zero 1520 // 1521 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1522 // the hardware handle it. The two dwords within qwords that span 1523 // cache line boundaries will still be loaded and stored atomically. 1524 // 1525 // Side Effects: entry is set to the (post push) entry point so it 1526 // can be used by the corresponding conjoint copy 1527 // method 1528 // 1529 address generate_disjoint_copy(StubId stub_id, address *entry) { 1530 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 1531 RegSet saved_reg = RegSet::of(s, d, count); 1532 int size; 1533 bool aligned; 1534 bool is_oop; 1535 bool dest_uninitialized; 1536 switch (stub_id) { 1537 case StubId::stubgen_jbyte_disjoint_arraycopy_id: 1538 size = sizeof(jbyte); 1539 aligned = false; 1540 is_oop = false; 1541 dest_uninitialized = false; 1542 break; 1543 case StubId::stubgen_arrayof_jbyte_disjoint_arraycopy_id: 1544 size = sizeof(jbyte); 1545 aligned = true; 1546 is_oop = false; 1547 dest_uninitialized = false; 1548 break; 1549 case StubId::stubgen_jshort_disjoint_arraycopy_id: 1550 size = sizeof(jshort); 1551 aligned = false; 1552 is_oop = false; 1553 dest_uninitialized = false; 1554 break; 1555 case StubId::stubgen_arrayof_jshort_disjoint_arraycopy_id: 1556 size = sizeof(jshort); 1557 aligned = true; 1558 is_oop = false; 1559 dest_uninitialized = false; 1560 break; 1561 case StubId::stubgen_jint_disjoint_arraycopy_id: 1562 size = sizeof(jint); 1563 aligned = false; 1564 is_oop = false; 1565 dest_uninitialized = false; 1566 break; 1567 case StubId::stubgen_arrayof_jint_disjoint_arraycopy_id: 1568 size = sizeof(jint); 1569 aligned = true; 1570 is_oop = false; 1571 dest_uninitialized = false; 1572 break; 1573 case StubId::stubgen_jlong_disjoint_arraycopy_id: 1574 // since this is always aligned we can (should!) use the same 1575 // stub as for case StubId::stubgen_arrayof_jlong_disjoint_arraycopy 1576 ShouldNotReachHere(); 1577 break; 1578 case StubId::stubgen_arrayof_jlong_disjoint_arraycopy_id: 1579 size = sizeof(jlong); 1580 aligned = true; 1581 is_oop = false; 1582 dest_uninitialized = false; 1583 break; 1584 case StubId::stubgen_oop_disjoint_arraycopy_id: 1585 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1586 aligned = !UseCompressedOops; 1587 is_oop = true; 1588 dest_uninitialized = false; 1589 break; 1590 case StubId::stubgen_arrayof_oop_disjoint_arraycopy_id: 1591 size = UseCompressedOops ? 
sizeof (jint) : sizeof (jlong); 1592 aligned = !UseCompressedOops; 1593 is_oop = true; 1594 dest_uninitialized = false; 1595 break; 1596 case StubId::stubgen_oop_disjoint_arraycopy_uninit_id: 1597 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1598 aligned = !UseCompressedOops; 1599 is_oop = true; 1600 dest_uninitialized = true; 1601 break; 1602 case StubId::stubgen_arrayof_oop_disjoint_arraycopy_uninit_id: 1603 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1604 aligned = !UseCompressedOops; 1605 is_oop = true; 1606 dest_uninitialized = true; 1607 break; 1608 default: 1609 ShouldNotReachHere(); 1610 break; 1611 } 1612 1613 __ align(CodeEntryAlignment); 1614 StubCodeMark mark(this, stub_id); 1615 address start = __ pc(); 1616 __ enter(); 1617 1618 if (entry != nullptr) { 1619 *entry = __ pc(); 1620 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 1621 BLOCK_COMMENT("Entry:"); 1622 } 1623 1624 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT; 1625 if (dest_uninitialized) { 1626 decorators |= IS_DEST_UNINITIALIZED; 1627 } 1628 if (aligned) { 1629 decorators |= ARRAYCOPY_ALIGNED; 1630 } 1631 1632 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1633 bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg); 1634 1635 if (is_oop) { 1636 // save regs before copy_memory 1637 __ push(RegSet::of(d, count), sp); 1638 } 1639 { 1640 // UnsafeMemoryAccess page error: continue after unsafe access 1641 bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size); 1642 UnsafeMemoryAccessMark umam(this, add_entry, true); 1643 copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, size); 1644 } 1645 1646 if (is_oop) { 1647 __ pop(RegSet::of(d, count), sp); 1648 if (VerifyOops) 1649 verify_oop_array(size, d, count, r16); 1650 } 1651 1652 bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet()); 1653 1654 __ leave(); 1655 __ mov(r0, zr); // return 0 1656 __ ret(lr); 1657 return start; 1658 } 1659 1660 // Arguments: 1661 // stub_id - is used to name the stub and identify all details of 1662 // how to perform the copy. 1663 // 1664 // nooverlap_target - identifes the (post push) entry for the 1665 // corresponding disjoint copy routine which can be 1666 // jumped to if the ranges do not actually overlap 1667 // 1668 // entry - is assigned to the stub's post push entry point unless 1669 // it is null 1670 // 1671 // 1672 // Inputs: 1673 // c_rarg0 - source array address 1674 // c_rarg1 - destination array address 1675 // c_rarg2 - element count, treated as ssize_t, can be zero 1676 // 1677 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1678 // the hardware handle it. The two dwords within qwords that span 1679 // cache line boundaries will still be loaded and stored atomically. 
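  // The stub first compares (d - s) against count * size and branches
  // to nooverlap_target when the destination does not overlap the
  // source region (the compare is unsigned, so d < s also takes the
  // forward-copy path); otherwise it performs a backwards copy.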
1680 // 1681 // Side Effects: 1682 // entry is set to the no-overlap entry point so it can be used by 1683 // some other conjoint copy method 1684 // 1685 address generate_conjoint_copy(StubId stub_id, address nooverlap_target, address *entry) { 1686 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 1687 RegSet saved_regs = RegSet::of(s, d, count); 1688 int size; 1689 bool aligned; 1690 bool is_oop; 1691 bool dest_uninitialized; 1692 switch (stub_id) { 1693 case StubId::stubgen_jbyte_arraycopy_id: 1694 size = sizeof(jbyte); 1695 aligned = false; 1696 is_oop = false; 1697 dest_uninitialized = false; 1698 break; 1699 case StubId::stubgen_arrayof_jbyte_arraycopy_id: 1700 size = sizeof(jbyte); 1701 aligned = true; 1702 is_oop = false; 1703 dest_uninitialized = false; 1704 break; 1705 case StubId::stubgen_jshort_arraycopy_id: 1706 size = sizeof(jshort); 1707 aligned = false; 1708 is_oop = false; 1709 dest_uninitialized = false; 1710 break; 1711 case StubId::stubgen_arrayof_jshort_arraycopy_id: 1712 size = sizeof(jshort); 1713 aligned = true; 1714 is_oop = false; 1715 dest_uninitialized = false; 1716 break; 1717 case StubId::stubgen_jint_arraycopy_id: 1718 size = sizeof(jint); 1719 aligned = false; 1720 is_oop = false; 1721 dest_uninitialized = false; 1722 break; 1723 case StubId::stubgen_arrayof_jint_arraycopy_id: 1724 size = sizeof(jint); 1725 aligned = true; 1726 is_oop = false; 1727 dest_uninitialized = false; 1728 break; 1729 case StubId::stubgen_jlong_arraycopy_id: 1730 // since this is always aligned we can (should!) use the same 1731 // stub as for case StubId::stubgen_arrayof_jlong_disjoint_arraycopy 1732 ShouldNotReachHere(); 1733 break; 1734 case StubId::stubgen_arrayof_jlong_arraycopy_id: 1735 size = sizeof(jlong); 1736 aligned = true; 1737 is_oop = false; 1738 dest_uninitialized = false; 1739 break; 1740 case StubId::stubgen_oop_arraycopy_id: 1741 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1742 aligned = !UseCompressedOops; 1743 is_oop = true; 1744 dest_uninitialized = false; 1745 break; 1746 case StubId::stubgen_arrayof_oop_arraycopy_id: 1747 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1748 aligned = !UseCompressedOops; 1749 is_oop = true; 1750 dest_uninitialized = false; 1751 break; 1752 case StubId::stubgen_oop_arraycopy_uninit_id: 1753 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1754 aligned = !UseCompressedOops; 1755 is_oop = true; 1756 dest_uninitialized = true; 1757 break; 1758 case StubId::stubgen_arrayof_oop_arraycopy_uninit_id: 1759 size = UseCompressedOops ? 
sizeof (jint) : sizeof (jlong); 1760 aligned = !UseCompressedOops; 1761 is_oop = true; 1762 dest_uninitialized = true; 1763 break; 1764 default: 1765 ShouldNotReachHere(); 1766 } 1767 1768 StubCodeMark mark(this, stub_id); 1769 address start = __ pc(); 1770 __ enter(); 1771 1772 if (entry != nullptr) { 1773 *entry = __ pc(); 1774 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 1775 BLOCK_COMMENT("Entry:"); 1776 } 1777 1778 // use fwd copy when (d-s) above_equal (count*size) 1779 __ sub(rscratch1, d, s); 1780 __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size)); 1781 __ br(Assembler::HS, nooverlap_target); 1782 1783 DecoratorSet decorators = IN_HEAP | IS_ARRAY; 1784 if (dest_uninitialized) { 1785 decorators |= IS_DEST_UNINITIALIZED; 1786 } 1787 if (aligned) { 1788 decorators |= ARRAYCOPY_ALIGNED; 1789 } 1790 1791 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1792 bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs); 1793 1794 if (is_oop) { 1795 // save regs before copy_memory 1796 __ push(RegSet::of(d, count), sp); 1797 } 1798 { 1799 // UnsafeMemoryAccess page error: continue after unsafe access 1800 bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size); 1801 UnsafeMemoryAccessMark umam(this, add_entry, true); 1802 copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, -size); 1803 } 1804 if (is_oop) { 1805 __ pop(RegSet::of(d, count), sp); 1806 if (VerifyOops) 1807 verify_oop_array(size, d, count, r16); 1808 } 1809 bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet()); 1810 __ leave(); 1811 __ mov(r0, zr); // return 0 1812 __ ret(lr); 1813 return start; 1814 } 1815 1816 // Helper for generating a dynamic type check. 1817 // Smashes rscratch1, rscratch2. 1818 void generate_type_check(Register sub_klass, 1819 Register super_check_offset, 1820 Register super_klass, 1821 Register temp1, 1822 Register temp2, 1823 Register result, 1824 Label& L_success) { 1825 assert_different_registers(sub_klass, super_check_offset, super_klass); 1826 1827 BLOCK_COMMENT("type_check:"); 1828 1829 Label L_miss; 1830 1831 __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, nullptr, 1832 super_check_offset); 1833 __ check_klass_subtype_slow_path(sub_klass, super_klass, temp1, temp2, &L_success, nullptr); 1834 1835 // Fall through on failure! 
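    // For reference, the combined fast + slow path above is roughly the
    // following (simplified, illustrative C; the real fast path also
    // special-cases the secondary-supers cache):
    //
    //   if (*(Klass**)((address)sub_klass + super_check_offset) == super_klass)
    //     goto L_success;                 // displaced hit at the given offset
    //   else if (offset is the secondary-supers cache slot &&
    //            sub_klass's secondary supers contain super_klass)
    //     goto L_success;                 // slow path: table / linear scan
    //   // otherwise fall through to L_miss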
1836 __ BIND(L_miss); 1837 } 1838 1839 // 1840 // Generate checkcasting array copy stub 1841 // 1842 // Input: 1843 // c_rarg0 - source array address 1844 // c_rarg1 - destination array address 1845 // c_rarg2 - element count, treated as ssize_t, can be zero 1846 // c_rarg3 - size_t ckoff (super_check_offset) 1847 // c_rarg4 - oop ckval (super_klass) 1848 // 1849 // Output: 1850 // r0 == 0 - success 1851 // r0 == -1^K - failure, where K is partial transfer count 1852 // 1853 address generate_checkcast_copy(StubId stub_id, address *entry) { 1854 bool dest_uninitialized; 1855 switch (stub_id) { 1856 case StubId::stubgen_checkcast_arraycopy_id: 1857 dest_uninitialized = false; 1858 break; 1859 case StubId::stubgen_checkcast_arraycopy_uninit_id: 1860 dest_uninitialized = true; 1861 break; 1862 default: 1863 ShouldNotReachHere(); 1864 } 1865 1866 Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop; 1867 1868 // Input registers (after setup_arg_regs) 1869 const Register from = c_rarg0; // source array address 1870 const Register to = c_rarg1; // destination array address 1871 const Register count = c_rarg2; // elementscount 1872 const Register ckoff = c_rarg3; // super_check_offset 1873 const Register ckval = c_rarg4; // super_klass 1874 1875 RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4); 1876 RegSet wb_post_saved_regs = RegSet::of(count); 1877 1878 // Registers used as temps (r19, r20, r21, r22 are save-on-entry) 1879 const Register copied_oop = r22; // actual oop copied 1880 const Register count_save = r21; // orig elementscount 1881 const Register start_to = r20; // destination array start address 1882 const Register r19_klass = r19; // oop._klass 1883 1884 // Registers used as gc temps (r5, r6, r7 are save-on-call) 1885 const Register gct1 = r5, gct2 = r6, gct3 = r7; 1886 1887 //--------------------------------------------------------------- 1888 // Assembler stub will be used for this call to arraycopy 1889 // if the two arrays are subtypes of Object[] but the 1890 // destination array type is not equal to or a supertype 1891 // of the source type. Each element must be separately 1892 // checked. 1893 1894 assert_different_registers(from, to, count, ckoff, ckval, start_to, 1895 copied_oop, r19_klass, count_save); 1896 1897 __ align(CodeEntryAlignment); 1898 StubCodeMark mark(this, stub_id); 1899 address start = __ pc(); 1900 1901 __ enter(); // required for proper stackwalking of RuntimeStub frame 1902 1903 #ifdef ASSERT 1904 // caller guarantees that the arrays really are different 1905 // otherwise, we would have to make conjoint checks 1906 { Label L; 1907 __ b(L); // conjoint check not yet implemented 1908 __ stop("checkcast_copy within a single array"); 1909 __ bind(L); 1910 } 1911 #endif //ASSERT 1912 1913 // Caller of this entry point must set up the argument registers. 1914 if (entry != nullptr) { 1915 *entry = __ pc(); 1916 BLOCK_COMMENT("Entry:"); 1917 } 1918 1919 // Empty array: Nothing to do. 1920 __ cbz(count, L_done); 1921 __ push(RegSet::of(r19, r20, r21, r22), sp); 1922 1923 #ifdef ASSERT 1924 BLOCK_COMMENT("assert consistent ckoff/ckval"); 1925 // The ckoff and ckval must be mutually consistent, 1926 // even though caller generates both. 
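    // In C-like terms the invariant verified here is roughly (illustrative):
    //
    //   assert(ckoff == ckval->super_check_offset());
    //
    // i.e. the caller-supplied super_check_offset must be the one stored in
    // the caller-supplied element klass.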
1927 { Label L; 1928 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 1929 __ ldrw(start_to, Address(ckval, sco_offset)); 1930 __ cmpw(ckoff, start_to); 1931 __ br(Assembler::EQ, L); 1932 __ stop("super_check_offset inconsistent"); 1933 __ bind(L); 1934 } 1935 #endif //ASSERT 1936 1937 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT; 1938 bool is_oop = true; 1939 int element_size = UseCompressedOops ? 4 : 8; 1940 if (dest_uninitialized) { 1941 decorators |= IS_DEST_UNINITIALIZED; 1942 } 1943 1944 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1945 bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs); 1946 1947 // save the original count 1948 __ mov(count_save, count); 1949 1950 // Copy from low to high addresses 1951 __ mov(start_to, to); // Save destination array start address 1952 __ b(L_load_element); 1953 1954 // ======== begin loop ======== 1955 // (Loop is rotated; its entry is L_load_element.) 1956 // Loop control: 1957 // for (; count != 0; count--) { 1958 // copied_oop = load_heap_oop(from++); 1959 // ... generate_type_check ...; 1960 // store_heap_oop(to++, copied_oop); 1961 // } 1962 __ align(OptoLoopAlignment); 1963 1964 __ BIND(L_store_element); 1965 bs->copy_store_at(_masm, decorators, T_OBJECT, element_size, 1966 __ post(to, element_size), copied_oop, noreg, 1967 gct1, gct2, gct3); 1968 __ sub(count, count, 1); 1969 __ cbz(count, L_do_card_marks); 1970 1971 // ======== loop entry is here ======== 1972 __ BIND(L_load_element); 1973 bs->copy_load_at(_masm, decorators, T_OBJECT, element_size, 1974 copied_oop, noreg, __ post(from, element_size), 1975 gct1); 1976 __ cbz(copied_oop, L_store_element); 1977 1978 __ load_klass(r19_klass, copied_oop);// query the object klass 1979 1980 BLOCK_COMMENT("type_check:"); 1981 generate_type_check(/*sub_klass*/r19_klass, 1982 /*super_check_offset*/ckoff, 1983 /*super_klass*/ckval, 1984 /*r_array_base*/gct1, 1985 /*temp2*/gct2, 1986 /*result*/r10, L_store_element); 1987 1988 // Fall through on failure! 1989 1990 // ======== end loop ======== 1991 1992 // It was a real error; we must depend on the caller to finish the job. 1993 // Register count = remaining oops, count_orig = total oops. 1994 // Emit GC store barriers for the oops we have copied and report 1995 // their number to the caller. 1996 1997 __ subs(count, count_save, count); // K = partially copied oop count 1998 __ eon(count, count, zr); // report (-1^K) to caller 1999 __ br(Assembler::EQ, L_done_pop); 2000 2001 __ BIND(L_do_card_marks); 2002 bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs); 2003 2004 __ bind(L_done_pop); 2005 __ pop(RegSet::of(r19, r20, r21, r22), sp); 2006 inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr); 2007 2008 __ bind(L_done); 2009 __ mov(r0, count); 2010 __ leave(); 2011 __ ret(lr); 2012 2013 return start; 2014 } 2015 2016 // Perform range checks on the proposed arraycopy. 2017 // Kills temp, but nothing else. 2018 // Also, clean the sign bits of src_pos and dst_pos. 
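  // A roughly equivalent C sketch of the generated checks (illustrative
  // only; both comparisons are unsigned 32-bit):
  //
  //   if ((juint)(src_pos + length) > (juint)arrayOop(src)->length()) goto L_failed;
  //   if ((juint)(dst_pos + length) > (juint)arrayOop(dst)->length()) goto L_failed;
  //   src_pos = (juint)src_pos;   // movw zero-extends, clearing bits 63:32
  //   dst_pos = (juint)dst_pos;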
2019 void arraycopy_range_checks(Register src, // source array oop (c_rarg0) 2020 Register src_pos, // source position (c_rarg1) 2021 Register dst, // destination array oop (c_rarg2) 2022 Register dst_pos, // destination position (c_rarg3) 2023 Register length, 2024 Register temp, 2025 Label& L_failed) { 2026 BLOCK_COMMENT("arraycopy_range_checks:"); 2027 2028 assert_different_registers(rscratch1, temp); 2029 2030 // if (src_pos + length > arrayOop(src)->length()) FAIL; 2031 __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes())); 2032 __ addw(temp, length, src_pos); 2033 __ cmpw(temp, rscratch1); 2034 __ br(Assembler::HI, L_failed); 2035 2036 // if (dst_pos + length > arrayOop(dst)->length()) FAIL; 2037 __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes())); 2038 __ addw(temp, length, dst_pos); 2039 __ cmpw(temp, rscratch1); 2040 __ br(Assembler::HI, L_failed); 2041 2042 // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'. 2043 __ movw(src_pos, src_pos); 2044 __ movw(dst_pos, dst_pos); 2045 2046 BLOCK_COMMENT("arraycopy_range_checks done"); 2047 } 2048 2049 // These stubs get called from some dumb test routine. 2050 // I'll write them properly when they're called from 2051 // something that's actually doing something. 2052 static void fake_arraycopy_stub(address src, address dst, int count) { 2053 assert(count == 0, "huh?"); 2054 } 2055 2056 2057 // 2058 // Generate 'unsafe' array copy stub 2059 // Though just as safe as the other stubs, it takes an unscaled 2060 // size_t argument instead of an element count. 2061 // 2062 // Input: 2063 // c_rarg0 - source array address 2064 // c_rarg1 - destination array address 2065 // c_rarg2 - byte count, treated as ssize_t, can be zero 2066 // 2067 // Examines the alignment of the operands and dispatches 2068 // to a long, int, short, or byte copy loop.
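  // The dispatch keys off the combined alignment of source address,
  // destination address and byte count. A roughly equivalent C sketch
  // (illustrative only; the *_copy names stand for the entry points
  // passed in below):
  //
  //   uintptr_t bits = (uintptr_t)s | (uintptr_t)d | (uintptr_t)count;
  //   if      ((bits & (BytesPerLong - 1)) == 0) long_copy (s, d, count >> 3);
  //   else if ((bits & (BytesPerInt  - 1)) == 0) int_copy  (s, d, count >> 2);
  //   else if ((bits & 1)                  == 0) short_copy(s, d, count >> 1);
  //   else                                       byte_copy (s, d, count);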
2069 // 2070 address generate_unsafe_copy(address byte_copy_entry, 2071 address short_copy_entry, 2072 address int_copy_entry, 2073 address long_copy_entry) { 2074 StubId stub_id = StubId::stubgen_unsafe_arraycopy_id; 2075 2076 Label L_long_aligned, L_int_aligned, L_short_aligned; 2077 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 2078 2079 __ align(CodeEntryAlignment); 2080 StubCodeMark mark(this, stub_id); 2081 address start = __ pc(); 2082 __ enter(); // required for proper stackwalking of RuntimeStub frame 2083 2084 // bump this on entry, not on exit: 2085 inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr); 2086 2087 __ orr(rscratch1, s, d); 2088 __ orr(rscratch1, rscratch1, count); 2089 2090 __ andr(rscratch1, rscratch1, BytesPerLong-1); 2091 __ cbz(rscratch1, L_long_aligned); 2092 __ andr(rscratch1, rscratch1, BytesPerInt-1); 2093 __ cbz(rscratch1, L_int_aligned); 2094 __ tbz(rscratch1, 0, L_short_aligned); 2095 __ b(RuntimeAddress(byte_copy_entry)); 2096 2097 __ BIND(L_short_aligned); 2098 __ lsr(count, count, LogBytesPerShort); // size => short_count 2099 __ b(RuntimeAddress(short_copy_entry)); 2100 __ BIND(L_int_aligned); 2101 __ lsr(count, count, LogBytesPerInt); // size => int_count 2102 __ b(RuntimeAddress(int_copy_entry)); 2103 __ BIND(L_long_aligned); 2104 __ lsr(count, count, LogBytesPerLong); // size => long_count 2105 __ b(RuntimeAddress(long_copy_entry)); 2106 2107 return start; 2108 } 2109 2110 // 2111 // Generate generic array copy stubs 2112 // 2113 // Input: 2114 // c_rarg0 - src oop 2115 // c_rarg1 - src_pos (32-bits) 2116 // c_rarg2 - dst oop 2117 // c_rarg3 - dst_pos (32-bits) 2118 // c_rarg4 - element count (32-bits) 2119 // 2120 // Output: 2121 // r0 == 0 - success 2122 // r0 == -1^K - failure, where K is partial transfer count 2123 // 2124 address generate_generic_copy(address byte_copy_entry, address short_copy_entry, 2125 address int_copy_entry, address oop_copy_entry, 2126 address long_copy_entry, address checkcast_copy_entry) { 2127 StubId stub_id = StubId::stubgen_generic_arraycopy_id; 2128 2129 Label L_failed, L_objArray; 2130 Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs; 2131 2132 // Input registers 2133 const Register src = c_rarg0; // source array oop 2134 const Register src_pos = c_rarg1; // source position 2135 const Register dst = c_rarg2; // destination array oop 2136 const Register dst_pos = c_rarg3; // destination position 2137 const Register length = c_rarg4; 2138 2139 2140 // Registers used as temps 2141 const Register dst_klass = c_rarg5; 2142 2143 __ align(CodeEntryAlignment); 2144 2145 StubCodeMark mark(this, stub_id); 2146 2147 address start = __ pc(); 2148 2149 __ enter(); // required for proper stackwalking of RuntimeStub frame 2150 2151 // bump this on entry, not on exit: 2152 inc_counter_np(SharedRuntime::_generic_array_copy_ctr); 2153 2154 //----------------------------------------------------------------------- 2155 // Assembler stub will be used for this call to arraycopy 2156 // if the following conditions are met: 2157 // 2158 // (1) src and dst must not be null. 2159 // (2) src_pos must not be negative. 2160 // (3) dst_pos must not be negative. 2161 // (4) length must not be negative. 2162 // (5) src klass and dst klass should be the same and not null. 2163 // (6) src and dst should be arrays. 2164 // (7) src_pos + length must not exceed length of src. 2165 // (8) dst_pos + length must not exceed length of dst. 
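    // A failed copy reports r0 == -1 ^ K, where K is the number of elements
    // already transferred (so a precondition failure, K == 0, yields -1).
    // Worked example (illustrative): if 3 of 10 oops were stored before an
    // element type check failed in the checkcast path reached from here,
    // then r0 == ~3 == -4 and the caller recovers K as ~r0 == 3 before
    // finishing the copy by a slower path.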
2166 // 2167 2168 // if (src == nullptr) return -1; 2169 __ cbz(src, L_failed); 2170 2171 // if (src_pos < 0) return -1; 2172 __ tbnz(src_pos, 31, L_failed); // i.e. sign bit set 2173 2174 // if (dst == nullptr) return -1; 2175 __ cbz(dst, L_failed); 2176 2177 // if (dst_pos < 0) return -1; 2178 __ tbnz(dst_pos, 31, L_failed); // i.e. sign bit set 2179 2180 // registers used as temp 2181 const Register scratch_length = r16; // elements count to copy 2182 const Register scratch_src_klass = r17; // array klass 2183 const Register lh = r15; // layout helper 2184 2185 // if (length < 0) return -1; 2186 __ movw(scratch_length, length); // length (elements count, 32-bits value) 2187 __ tbnz(scratch_length, 31, L_failed); // i.e. sign bit set 2188 2189 __ load_klass(scratch_src_klass, src); 2190 #ifdef ASSERT 2191 // assert(src->klass() != nullptr); 2192 { 2193 BLOCK_COMMENT("assert klasses not null {"); 2194 Label L1, L2; 2195 __ cbnz(scratch_src_klass, L2); // it is broken if klass is null 2196 __ bind(L1); 2197 __ stop("broken null klass"); 2198 __ bind(L2); 2199 __ load_klass(rscratch1, dst); 2200 __ cbz(rscratch1, L1); // this would be broken also 2201 BLOCK_COMMENT("} assert klasses not null done"); 2202 } 2203 #endif 2204 2205 // Load layout helper (32-bits) 2206 // 2207 // |array_tag| | header_size | element_type | |log2_element_size| 2208 // 32 30 24 16 8 2 0 2209 // 2210 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0 2211 // 2212 2213 const int lh_offset = in_bytes(Klass::layout_helper_offset()); 2214 2215 // Handle objArrays completely differently... 2216 const jint objArray_lh = Klass::array_layout_helper(T_OBJECT); 2217 __ ldrw(lh, Address(scratch_src_klass, lh_offset)); 2218 __ movw(rscratch1, objArray_lh); 2219 __ eorw(rscratch2, lh, rscratch1); 2220 __ cbzw(rscratch2, L_objArray); 2221 2222 // if (src->klass() != dst->klass()) return -1; 2223 __ load_klass(rscratch2, dst); 2224 __ eor(rscratch2, rscratch2, scratch_src_klass); 2225 __ cbnz(rscratch2, L_failed); 2226 2227 // if (!src->is_Array()) return -1; 2228 __ tbz(lh, 31, L_failed); // i.e. (lh >= 0) 2229 2230 // At this point, it is known to be a typeArray (array_tag 0x3). 
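    // For reference, the layout helper fields used below decode roughly as
    // follows (illustrative C; constants are from klass.hpp):
    //
    //   int hsize = (lh >> Klass::_lh_header_size_shift) & Klass::_lh_header_size_mask;
    //   int l2es  =  lh & Klass::_lh_log2_element_size_mask;   // shift is 0
    //
    // hsize is the element base offset added to src/dst below, and l2es
    // (0..3) selects the byte/short/int/long copy loop.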
2231 #ifdef ASSERT 2232 { 2233 BLOCK_COMMENT("assert primitive array {"); 2234 Label L; 2235 __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift); 2236 __ cmpw(lh, rscratch2); 2237 __ br(Assembler::GE, L); 2238 __ stop("must be a primitive array"); 2239 __ bind(L); 2240 BLOCK_COMMENT("} assert primitive array done"); 2241 } 2242 #endif 2243 2244 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2245 rscratch2, L_failed); 2246 2247 // TypeArrayKlass 2248 // 2249 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize); 2250 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize); 2251 // 2252 2253 const Register rscratch1_offset = rscratch1; // array offset 2254 const Register r15_elsize = lh; // element size 2255 2256 __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift, 2257 exact_log2(Klass::_lh_header_size_mask+1)); // array_offset 2258 __ add(src, src, rscratch1_offset); // src array offset 2259 __ add(dst, dst, rscratch1_offset); // dst array offset 2260 BLOCK_COMMENT("choose copy loop based on element size"); 2261 2262 // next registers should be set before the jump to corresponding stub 2263 const Register from = c_rarg0; // source array address 2264 const Register to = c_rarg1; // destination array address 2265 const Register count = c_rarg2; // elements count 2266 2267 // 'from', 'to', 'count' registers should be set in such order 2268 // since they are the same as 'src', 'src_pos', 'dst'. 2269 2270 assert(Klass::_lh_log2_element_size_shift == 0, "fix this code"); 2271 2272 // The possible values of elsize are 0-3, i.e. exact_log2(element 2273 // size in bytes). We do a simple bitwise binary search. 2274 __ BIND(L_copy_bytes); 2275 __ tbnz(r15_elsize, 1, L_copy_ints); 2276 __ tbnz(r15_elsize, 0, L_copy_shorts); 2277 __ lea(from, Address(src, src_pos));// src_addr 2278 __ lea(to, Address(dst, dst_pos));// dst_addr 2279 __ movw(count, scratch_length); // length 2280 __ b(RuntimeAddress(byte_copy_entry)); 2281 2282 __ BIND(L_copy_shorts); 2283 __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr 2284 __ lea(to, Address(dst, dst_pos, Address::lsl(1)));// dst_addr 2285 __ movw(count, scratch_length); // length 2286 __ b(RuntimeAddress(short_copy_entry)); 2287 2288 __ BIND(L_copy_ints); 2289 __ tbnz(r15_elsize, 0, L_copy_longs); 2290 __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr 2291 __ lea(to, Address(dst, dst_pos, Address::lsl(2)));// dst_addr 2292 __ movw(count, scratch_length); // length 2293 __ b(RuntimeAddress(int_copy_entry)); 2294 2295 __ BIND(L_copy_longs); 2296 #ifdef ASSERT 2297 { 2298 BLOCK_COMMENT("assert long copy {"); 2299 Label L; 2300 __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize 2301 __ cmpw(r15_elsize, LogBytesPerLong); 2302 __ br(Assembler::EQ, L); 2303 __ stop("must be long copy, but elsize is wrong"); 2304 __ bind(L); 2305 BLOCK_COMMENT("} assert long copy done"); 2306 } 2307 #endif 2308 __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr 2309 __ lea(to, Address(dst, dst_pos, Address::lsl(3)));// dst_addr 2310 __ movw(count, scratch_length); // length 2311 __ b(RuntimeAddress(long_copy_entry)); 2312 2313 // ObjArrayKlass 2314 __ BIND(L_objArray); 2315 // live at this point: scratch_src_klass, scratch_length, src[_pos], dst[_pos] 2316 2317 Label L_plain_copy, L_checkcast_copy; 2318 // test array classes for subtyping 2319 __ load_klass(r15, dst); 2320 __ cmp(scratch_src_klass, r15); // usual case is exact 
equality 2321 __ br(Assembler::NE, L_checkcast_copy); 2322 2323 // Identically typed arrays can be copied without element-wise checks. 2324 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2325 rscratch2, L_failed); 2326 2327 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2328 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2329 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2330 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2331 __ movw(count, scratch_length); // length 2332 __ BIND(L_plain_copy); 2333 __ b(RuntimeAddress(oop_copy_entry)); 2334 2335 __ BIND(L_checkcast_copy); 2336 // live at this point: scratch_src_klass, scratch_length, r15 (dst_klass) 2337 { 2338 // Before looking at dst.length, make sure dst is also an objArray. 2339 __ ldrw(rscratch1, Address(r15, lh_offset)); 2340 __ movw(rscratch2, objArray_lh); 2341 __ eorw(rscratch1, rscratch1, rscratch2); 2342 __ cbnzw(rscratch1, L_failed); 2343 2344 // It is safe to examine both src.length and dst.length. 2345 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2346 r15, L_failed); 2347 2348 __ load_klass(dst_klass, dst); // reload 2349 2350 // Marshal the base address arguments now, freeing registers. 2351 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2352 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2353 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2354 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2355 __ movw(count, length); // length (reloaded) 2356 Register sco_temp = c_rarg3; // this register is free now 2357 assert_different_registers(from, to, count, sco_temp, 2358 dst_klass, scratch_src_klass); 2359 // assert_clean_int(count, sco_temp); 2360 2361 // Generate the type check. 2362 const int sco_offset = in_bytes(Klass::super_check_offset_offset()); 2363 __ ldrw(sco_temp, Address(dst_klass, sco_offset)); 2364 2365 // Smashes rscratch1, rscratch2 2366 generate_type_check(scratch_src_klass, sco_temp, dst_klass, /*temps*/ noreg, noreg, noreg, 2367 L_plain_copy); 2368 2369 // Fetch destination element klass from the ObjArrayKlass header. 2370 int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset()); 2371 __ ldr(dst_klass, Address(dst_klass, ek_offset)); 2372 __ ldrw(sco_temp, Address(dst_klass, sco_offset)); 2373 2374 // the checkcast_copy loop needs two extra arguments: 2375 assert(c_rarg3 == sco_temp, "#3 already in place"); 2376 // Set up arguments for checkcast_copy_entry. 2377 __ mov(c_rarg4, dst_klass); // dst.klass.element_klass 2378 __ b(RuntimeAddress(checkcast_copy_entry)); 2379 } 2380 2381 __ BIND(L_failed); 2382 __ mov(r0, -1); 2383 __ leave(); // required for proper stackwalking of RuntimeStub frame 2384 __ ret(lr); 2385 2386 return start; 2387 } 2388 2389 // 2390 // Generate stub for array fill. If "aligned" is true, the 2391 // "to" address is assumed to be heapword aligned. 
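  // The fill value is widened by replication before whole 64-bit words are
  // stored; a roughly equivalent C sketch (illustrative only):
  //
  //   uint64_t v = value;
  //   if (t == T_BYTE)  { v |= v << 8;  v |= v << 16; v |= v << 32; }
  //   if (t == T_SHORT) { v |= v << 16; v |= v << 32; }
  //   if (t == T_INT)   { v |= v << 32; }
  //   // align 'to' to 8 bytes, store v a word at a time, then cover any
  //   // sub-word tail with a final, possibly overlapping, 8-byte store.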
2392 // 2393 // Arguments for generated stub: 2394 // to: c_rarg0 2395 // value: c_rarg1 2396 // count: c_rarg2 treated as signed 2397 // 2398 address generate_fill(StubId stub_id) { 2399 BasicType t; 2400 bool aligned; 2401 2402 switch (stub_id) { 2403 case StubId::stubgen_jbyte_fill_id: 2404 t = T_BYTE; 2405 aligned = false; 2406 break; 2407 case StubId::stubgen_jshort_fill_id: 2408 t = T_SHORT; 2409 aligned = false; 2410 break; 2411 case StubId::stubgen_jint_fill_id: 2412 t = T_INT; 2413 aligned = false; 2414 break; 2415 case StubId::stubgen_arrayof_jbyte_fill_id: 2416 t = T_BYTE; 2417 aligned = true; 2418 break; 2419 case StubId::stubgen_arrayof_jshort_fill_id: 2420 t = T_SHORT; 2421 aligned = true; 2422 break; 2423 case StubId::stubgen_arrayof_jint_fill_id: 2424 t = T_INT; 2425 aligned = true; 2426 break; 2427 default: 2428 ShouldNotReachHere(); 2429 }; 2430 2431 __ align(CodeEntryAlignment); 2432 StubCodeMark mark(this, stub_id); 2433 address start = __ pc(); 2434 2435 BLOCK_COMMENT("Entry:"); 2436 2437 const Register to = c_rarg0; // source array address 2438 const Register value = c_rarg1; // value 2439 const Register count = c_rarg2; // elements count 2440 2441 const Register bz_base = r10; // base for block_zero routine 2442 const Register cnt_words = r11; // temp register 2443 2444 __ enter(); 2445 2446 Label L_fill_elements, L_exit1; 2447 2448 int shift = -1; 2449 switch (t) { 2450 case T_BYTE: 2451 shift = 0; 2452 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2453 __ bfi(value, value, 8, 8); // 8 bit -> 16 bit 2454 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit 2455 __ br(Assembler::LO, L_fill_elements); 2456 break; 2457 case T_SHORT: 2458 shift = 1; 2459 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2460 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit 2461 __ br(Assembler::LO, L_fill_elements); 2462 break; 2463 case T_INT: 2464 shift = 2; 2465 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2466 __ br(Assembler::LO, L_fill_elements); 2467 break; 2468 default: ShouldNotReachHere(); 2469 } 2470 2471 // Align source address at 8 bytes address boundary. 2472 Label L_skip_align1, L_skip_align2, L_skip_align4; 2473 if (!aligned) { 2474 switch (t) { 2475 case T_BYTE: 2476 // One byte misalignment happens only for byte arrays. 2477 __ tbz(to, 0, L_skip_align1); 2478 __ strb(value, Address(__ post(to, 1))); 2479 __ subw(count, count, 1); 2480 __ bind(L_skip_align1); 2481 // Fallthrough 2482 case T_SHORT: 2483 // Two bytes misalignment happens only for byte and short (char) arrays. 2484 __ tbz(to, 1, L_skip_align2); 2485 __ strh(value, Address(__ post(to, 2))); 2486 __ subw(count, count, 2 >> shift); 2487 __ bind(L_skip_align2); 2488 // Fallthrough 2489 case T_INT: 2490 // Align to 8 bytes, we know we are 4 byte aligned to start. 2491 __ tbz(to, 2, L_skip_align4); 2492 __ strw(value, Address(__ post(to, 4))); 2493 __ subw(count, count, 4 >> shift); 2494 __ bind(L_skip_align4); 2495 break; 2496 default: ShouldNotReachHere(); 2497 } 2498 } 2499 2500 // 2501 // Fill large chunks 2502 // 2503 __ lsrw(cnt_words, count, 3 - shift); // number of words 2504 __ bfi(value, value, 32, 32); // 32 bit -> 64 bit 2505 __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift); 2506 if (UseBlockZeroing) { 2507 Label non_block_zeroing, rest; 2508 // If the fill value is zero we can use the fast zero_words(). 
2509 __ cbnz(value, non_block_zeroing); 2510 __ mov(bz_base, to); 2511 __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord); 2512 address tpc = __ zero_words(bz_base, cnt_words); 2513 if (tpc == nullptr) { 2514 fatal("CodeCache is full at generate_fill"); 2515 } 2516 __ b(rest); 2517 __ bind(non_block_zeroing); 2518 __ fill_words(to, cnt_words, value); 2519 __ bind(rest); 2520 } else { 2521 __ fill_words(to, cnt_words, value); 2522 } 2523 2524 // Remaining count is less than 8 bytes. Fill it by a single store. 2525 // Note that the total length is no less than 8 bytes. 2526 if (t == T_BYTE || t == T_SHORT) { 2527 Label L_exit1; 2528 __ cbzw(count, L_exit1); 2529 __ add(to, to, count, Assembler::LSL, shift); // points to the end 2530 __ str(value, Address(to, -8)); // overwrite some elements 2531 __ bind(L_exit1); 2532 __ leave(); 2533 __ ret(lr); 2534 } 2535 2536 // Handle copies less than 8 bytes. 2537 Label L_fill_2, L_fill_4, L_exit2; 2538 __ bind(L_fill_elements); 2539 switch (t) { 2540 case T_BYTE: 2541 __ tbz(count, 0, L_fill_2); 2542 __ strb(value, Address(__ post(to, 1))); 2543 __ bind(L_fill_2); 2544 __ tbz(count, 1, L_fill_4); 2545 __ strh(value, Address(__ post(to, 2))); 2546 __ bind(L_fill_4); 2547 __ tbz(count, 2, L_exit2); 2548 __ strw(value, Address(to)); 2549 break; 2550 case T_SHORT: 2551 __ tbz(count, 0, L_fill_4); 2552 __ strh(value, Address(__ post(to, 2))); 2553 __ bind(L_fill_4); 2554 __ tbz(count, 1, L_exit2); 2555 __ strw(value, Address(to)); 2556 break; 2557 case T_INT: 2558 __ cbzw(count, L_exit2); 2559 __ strw(value, Address(to)); 2560 break; 2561 default: ShouldNotReachHere(); 2562 } 2563 __ bind(L_exit2); 2564 __ leave(); 2565 __ ret(lr); 2566 return start; 2567 } 2568 2569 address generate_unsafecopy_common_error_exit() { 2570 address start_pc = __ pc(); 2571 __ leave(); 2572 __ mov(r0, 0); 2573 __ ret(lr); 2574 return start_pc; 2575 } 2576 2577 // 2578 // Generate 'unsafe' set memory stub 2579 // Though just as safe as the other stubs, it takes an unscaled 2580 // size_t (# bytes) argument instead of an element count. 2581 // 2582 // This fill operation is atomicity preserving: as long as the 2583 // address supplied is sufficiently aligned, all writes of up to 64 2584 // bits in size are single-copy atomic. 
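  // The main loop stores 64 bytes per iteration from a pair of q-registers;
  // the remainder is then finished by testing one bit of the residual count
  // per power of two, roughly (illustrative sketch):
  //
  //   if (count & 32) store 32 bytes;   // stpq
  //   if (count & 16) store 16 bytes;   // strq
  //   if (count &  8) store  8 bytes;   // strd
  //   if (count &  4) store  4 bytes;   // strs
  //   if (count &  2) store  2 bytes;   // strh
  //   if (count &  1) store  1 byte;    // strb
  //
  // so each residual size class is covered by exactly one store.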
2585 // 2586 // Input: 2587 // c_rarg0 - destination array address 2588 // c_rarg1 - byte count (size_t) 2589 // c_rarg2 - byte value 2590 // 2591 address generate_unsafe_setmemory() { 2592 __ align(CodeEntryAlignment); 2593 StubCodeMark mark(this, StubId::stubgen_unsafe_setmemory_id); 2594 address start = __ pc(); 2595 2596 Register dest = c_rarg0, count = c_rarg1, value = c_rarg2; 2597 Label tail; 2598 2599 UnsafeMemoryAccessMark umam(this, true, false); 2600 2601 __ enter(); // required for proper stackwalking of RuntimeStub frame 2602 2603 __ dup(v0, __ T16B, value); 2604 2605 if (AvoidUnalignedAccesses) { 2606 __ cmp(count, (u1)16); 2607 __ br(__ LO, tail); 2608 2609 __ mov(rscratch1, 16); 2610 __ andr(rscratch2, dest, 15); 2611 __ sub(rscratch1, rscratch1, rscratch2); // Bytes needed to 16-align dest 2612 __ strq(v0, Address(dest)); 2613 __ sub(count, count, rscratch1); 2614 __ add(dest, dest, rscratch1); 2615 } 2616 2617 __ subs(count, count, (u1)64); 2618 __ br(__ LO, tail); 2619 { 2620 Label again; 2621 __ bind(again); 2622 __ stpq(v0, v0, Address(dest)); 2623 __ stpq(v0, v0, Address(dest, 32)); 2624 2625 __ subs(count, count, 64); 2626 __ add(dest, dest, 64); 2627 __ br(__ HS, again); 2628 } 2629 2630 __ bind(tail); 2631 // The count of bytes is off by 64, but we don't need to correct 2632 // it because we're only going to use the least-significant few 2633 // count bits from here on. 2634 // __ add(count, count, 64); 2635 2636 { 2637 Label dont; 2638 __ tbz(count, exact_log2(32), dont); 2639 __ stpq(v0, v0, __ post(dest, 32)); 2640 __ bind(dont); 2641 } 2642 { 2643 Label dont; 2644 __ tbz(count, exact_log2(16), dont); 2645 __ strq(v0, __ post(dest, 16)); 2646 __ bind(dont); 2647 } 2648 { 2649 Label dont; 2650 __ tbz(count, exact_log2(8), dont); 2651 __ strd(v0, __ post(dest, 8)); 2652 __ bind(dont); 2653 } 2654 2655 Label finished; 2656 __ tst(count, 7); 2657 __ br(__ EQ, finished); 2658 2659 { 2660 Label dont; 2661 __ tbz(count, exact_log2(4), dont); 2662 __ strs(v0, __ post(dest, 4)); 2663 __ bind(dont); 2664 } 2665 { 2666 Label dont; 2667 __ tbz(count, exact_log2(2), dont); 2668 __ bfi(value, value, 8, 8); 2669 __ strh(value, __ post(dest, 2)); 2670 __ bind(dont); 2671 } 2672 { 2673 Label dont; 2674 __ tbz(count, exact_log2(1), dont); 2675 __ strb(value, Address(dest)); 2676 __ bind(dont); 2677 } 2678 2679 __ bind(finished); 2680 __ leave(); 2681 __ ret(lr); 2682 2683 return start; 2684 } 2685 2686 address generate_data_cache_writeback() { 2687 const Register line = c_rarg0; // address of line to write back 2688 2689 __ align(CodeEntryAlignment); 2690 2691 StubId stub_id = StubId::stubgen_data_cache_writeback_id; 2692 StubCodeMark mark(this, stub_id); 2693 2694 address start = __ pc(); 2695 __ enter(); 2696 __ cache_wb(Address(line, 0)); 2697 __ leave(); 2698 __ ret(lr); 2699 2700 return start; 2701 } 2702 2703 address generate_data_cache_writeback_sync() { 2704 const Register is_pre = c_rarg0; // pre or post sync 2705 2706 __ align(CodeEntryAlignment); 2707 2708 StubId stub_id = StubId::stubgen_data_cache_writeback_sync_id; 2709 StubCodeMark mark(this, stub_id); 2710 2711 // pre wbsync is a no-op 2712 // post wbsync translates to an sfence 2713 2714 Label skip; 2715 address start = __ pc(); 2716 __ enter(); 2717 __ cbnz(is_pre, skip); 2718 __ cache_wbsync(false); 2719 __ bind(skip); 2720 __ leave(); 2721 __ ret(lr); 2722 2723 return start; 2724 } 2725 2726 void generate_arraycopy_stubs() { 2727 address entry; 2728 address entry_jbyte_arraycopy; 2729 address 
entry_jshort_arraycopy; 2730 address entry_jint_arraycopy; 2731 address entry_oop_arraycopy; 2732 address entry_jlong_arraycopy; 2733 address entry_checkcast_arraycopy; 2734 2735 // generate the common exit first so later stubs can rely on it if 2736 // they want an UnsafeMemoryAccess exit non-local to the stub 2737 StubRoutines::_unsafecopy_common_exit = generate_unsafecopy_common_error_exit(); 2738 // register the stub as the default exit with class UnsafeMemoryAccess 2739 UnsafeMemoryAccess::set_common_exit_stub_pc(StubRoutines::_unsafecopy_common_exit); 2740 2741 generate_copy_longs(StubId::stubgen_copy_byte_f_id, IN_HEAP | IS_ARRAY, copy_f, r0, r1, r15); 2742 generate_copy_longs(StubId::stubgen_copy_byte_b_id, IN_HEAP | IS_ARRAY, copy_b, r0, r1, r15); 2743 2744 generate_copy_longs(StubId::stubgen_copy_oop_f_id, IN_HEAP | IS_ARRAY, copy_obj_f, r0, r1, r15); 2745 generate_copy_longs(StubId::stubgen_copy_oop_b_id, IN_HEAP | IS_ARRAY, copy_obj_b, r0, r1, r15); 2746 2747 generate_copy_longs(StubId::stubgen_copy_oop_uninit_f_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, copy_obj_uninit_f, r0, r1, r15); 2748 generate_copy_longs(StubId::stubgen_copy_oop_uninit_b_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, copy_obj_uninit_b, r0, r1, r15); 2749 2750 StubRoutines::aarch64::_zero_blocks = generate_zero_blocks(); 2751 2752 //*** jbyte 2753 // Always need aligned and unaligned versions 2754 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_jbyte_disjoint_arraycopy_id, &entry); 2755 StubRoutines::_jbyte_arraycopy = generate_conjoint_copy(StubId::stubgen_jbyte_arraycopy_id, entry, &entry_jbyte_arraycopy); 2756 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jbyte_disjoint_arraycopy_id, &entry); 2757 StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jbyte_arraycopy_id, entry, nullptr); 2758 2759 //*** jshort 2760 // Always need aligned and unaligned versions 2761 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_jshort_disjoint_arraycopy_id, &entry); 2762 StubRoutines::_jshort_arraycopy = generate_conjoint_copy(StubId::stubgen_jshort_arraycopy_id, entry, &entry_jshort_arraycopy); 2763 StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jshort_disjoint_arraycopy_id, &entry); 2764 StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jshort_arraycopy_id, entry, nullptr); 2765 2766 //*** jint 2767 // Aligned versions 2768 StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jint_disjoint_arraycopy_id, &entry); 2769 StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jint_arraycopy_id, entry, &entry_jint_arraycopy); 2770 // In 64 bit we need both aligned and unaligned versions of jint arraycopy. 
2771 // entry_jint_arraycopy always points to the unaligned version 2772 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_jint_disjoint_arraycopy_id, &entry); 2773 StubRoutines::_jint_arraycopy = generate_conjoint_copy(StubId::stubgen_jint_arraycopy_id, entry, &entry_jint_arraycopy); 2774 2775 //*** jlong 2776 // It is always aligned 2777 StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jlong_disjoint_arraycopy_id, &entry); 2778 StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jlong_arraycopy_id, entry, &entry_jlong_arraycopy); 2779 StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy; 2780 StubRoutines::_jlong_arraycopy = StubRoutines::_arrayof_jlong_arraycopy; 2781 2782 //*** oops 2783 { 2784 // With compressed oops we need unaligned versions; notice that 2785 // we overwrite entry_oop_arraycopy. 2786 bool aligned = !UseCompressedOops; 2787 2788 StubRoutines::_arrayof_oop_disjoint_arraycopy 2789 = generate_disjoint_copy(StubId::stubgen_arrayof_oop_disjoint_arraycopy_id, &entry); 2790 StubRoutines::_arrayof_oop_arraycopy 2791 = generate_conjoint_copy(StubId::stubgen_arrayof_oop_arraycopy_id, entry, &entry_oop_arraycopy); 2792 // Aligned versions without pre-barriers 2793 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit 2794 = generate_disjoint_copy(StubId::stubgen_arrayof_oop_disjoint_arraycopy_uninit_id, &entry); 2795 StubRoutines::_arrayof_oop_arraycopy_uninit 2796 = generate_conjoint_copy(StubId::stubgen_arrayof_oop_arraycopy_uninit_id, entry, nullptr); 2797 } 2798 2799 StubRoutines::_oop_disjoint_arraycopy = StubRoutines::_arrayof_oop_disjoint_arraycopy; 2800 StubRoutines::_oop_arraycopy = StubRoutines::_arrayof_oop_arraycopy; 2801 StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit; 2802 StubRoutines::_oop_arraycopy_uninit = StubRoutines::_arrayof_oop_arraycopy_uninit; 2803 2804 StubRoutines::_checkcast_arraycopy = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_id, &entry_checkcast_arraycopy); 2805 StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_uninit_id, nullptr); 2806 2807 StubRoutines::_unsafe_arraycopy = generate_unsafe_copy(entry_jbyte_arraycopy, 2808 entry_jshort_arraycopy, 2809 entry_jint_arraycopy, 2810 entry_jlong_arraycopy); 2811 2812 StubRoutines::_generic_arraycopy = generate_generic_copy(entry_jbyte_arraycopy, 2813 entry_jshort_arraycopy, 2814 entry_jint_arraycopy, 2815 entry_oop_arraycopy, 2816 entry_jlong_arraycopy, 2817 entry_checkcast_arraycopy); 2818 2819 StubRoutines::_jbyte_fill = generate_fill(StubId::stubgen_jbyte_fill_id); 2820 StubRoutines::_jshort_fill = generate_fill(StubId::stubgen_jshort_fill_id); 2821 StubRoutines::_jint_fill = generate_fill(StubId::stubgen_jint_fill_id); 2822 StubRoutines::_arrayof_jbyte_fill = generate_fill(StubId::stubgen_arrayof_jbyte_fill_id); 2823 StubRoutines::_arrayof_jshort_fill = generate_fill(StubId::stubgen_arrayof_jshort_fill_id); 2824 StubRoutines::_arrayof_jint_fill = generate_fill(StubId::stubgen_arrayof_jint_fill_id); 2825 } 2826 2827 void generate_math_stubs() { Unimplemented(); } 2828 2829 // Arguments: 2830 // 2831 // Inputs: 2832 // c_rarg0 - source byte array address 2833 // c_rarg1 - destination byte array address 2834 // c_rarg2 - K (key) in little endian int array 2835 // 2836 address generate_aescrypt_encryptBlock() { 2837 __ 
align(CodeEntryAlignment); 2838 StubId stub_id = StubId::stubgen_aescrypt_encryptBlock_id; 2839 StubCodeMark mark(this, stub_id); 2840 2841 const Register from = c_rarg0; // source array address 2842 const Register to = c_rarg1; // destination array address 2843 const Register key = c_rarg2; // key array address 2844 const Register keylen = rscratch1; 2845 2846 address start = __ pc(); 2847 __ enter(); 2848 2849 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2850 2851 __ aesenc_loadkeys(key, keylen); 2852 __ aesecb_encrypt(from, to, keylen); 2853 2854 __ mov(r0, 0); 2855 2856 __ leave(); 2857 __ ret(lr); 2858 2859 return start; 2860 } 2861 2862 // Arguments: 2863 // 2864 // Inputs: 2865 // c_rarg0 - source byte array address 2866 // c_rarg1 - destination byte array address 2867 // c_rarg2 - K (key) in little endian int array 2868 // 2869 address generate_aescrypt_decryptBlock() { 2870 assert(UseAES, "need AES cryptographic extension support"); 2871 __ align(CodeEntryAlignment); 2872 StubId stub_id = StubId::stubgen_aescrypt_decryptBlock_id; 2873 StubCodeMark mark(this, stub_id); 2874 Label L_doLast; 2875 2876 const Register from = c_rarg0; // source array address 2877 const Register to = c_rarg1; // destination array address 2878 const Register key = c_rarg2; // key array address 2879 const Register keylen = rscratch1; 2880 2881 address start = __ pc(); 2882 __ enter(); // required for proper stackwalking of RuntimeStub frame 2883 2884 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2885 2886 __ aesecb_decrypt(from, to, key, keylen); 2887 2888 __ mov(r0, 0); 2889 2890 __ leave(); 2891 __ ret(lr); 2892 2893 return start; 2894 } 2895 2896 // Arguments: 2897 // 2898 // Inputs: 2899 // c_rarg0 - source byte array address 2900 // c_rarg1 - destination byte array address 2901 // c_rarg2 - K (key) in little endian int array 2902 // c_rarg3 - r vector byte array address 2903 // c_rarg4 - input length 2904 // 2905 // Output: 2906 // x0 - input length 2907 // 2908 address generate_cipherBlockChaining_encryptAESCrypt() { 2909 assert(UseAES, "need AES cryptographic extension support"); 2910 __ align(CodeEntryAlignment); 2911 StubId stub_id = StubId::stubgen_cipherBlockChaining_encryptAESCrypt_id; 2912 StubCodeMark mark(this, stub_id); 2913 2914 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 2915 2916 const Register from = c_rarg0; // source array address 2917 const Register to = c_rarg1; // destination array address 2918 const Register key = c_rarg2; // key array address 2919 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2920 // and left with the results of the last encryption block 2921 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2922 const Register keylen = rscratch1; 2923 2924 address start = __ pc(); 2925 2926 __ enter(); 2927 2928 __ movw(rscratch2, len_reg); 2929 2930 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2931 2932 __ ld1(v0, __ T16B, rvec); 2933 2934 __ cmpw(keylen, 52); 2935 __ br(Assembler::CC, L_loadkeys_44); 2936 __ br(Assembler::EQ, L_loadkeys_52); 2937 2938 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2939 __ rev32(v17, __ T16B, v17); 2940 __ rev32(v18, __ T16B, v18); 2941 __ BIND(L_loadkeys_52); 2942 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2943 __ rev32(v19, __ T16B, 
v19); 2944 __ rev32(v20, __ T16B, v20); 2945 __ BIND(L_loadkeys_44); 2946 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2947 __ rev32(v21, __ T16B, v21); 2948 __ rev32(v22, __ T16B, v22); 2949 __ rev32(v23, __ T16B, v23); 2950 __ rev32(v24, __ T16B, v24); 2951 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2952 __ rev32(v25, __ T16B, v25); 2953 __ rev32(v26, __ T16B, v26); 2954 __ rev32(v27, __ T16B, v27); 2955 __ rev32(v28, __ T16B, v28); 2956 __ ld1(v29, v30, v31, __ T16B, key); 2957 __ rev32(v29, __ T16B, v29); 2958 __ rev32(v30, __ T16B, v30); 2959 __ rev32(v31, __ T16B, v31); 2960 2961 __ BIND(L_aes_loop); 2962 __ ld1(v1, __ T16B, __ post(from, 16)); 2963 __ eor(v0, __ T16B, v0, v1); 2964 2965 __ br(Assembler::CC, L_rounds_44); 2966 __ br(Assembler::EQ, L_rounds_52); 2967 2968 __ aese(v0, v17); __ aesmc(v0, v0); 2969 __ aese(v0, v18); __ aesmc(v0, v0); 2970 __ BIND(L_rounds_52); 2971 __ aese(v0, v19); __ aesmc(v0, v0); 2972 __ aese(v0, v20); __ aesmc(v0, v0); 2973 __ BIND(L_rounds_44); 2974 __ aese(v0, v21); __ aesmc(v0, v0); 2975 __ aese(v0, v22); __ aesmc(v0, v0); 2976 __ aese(v0, v23); __ aesmc(v0, v0); 2977 __ aese(v0, v24); __ aesmc(v0, v0); 2978 __ aese(v0, v25); __ aesmc(v0, v0); 2979 __ aese(v0, v26); __ aesmc(v0, v0); 2980 __ aese(v0, v27); __ aesmc(v0, v0); 2981 __ aese(v0, v28); __ aesmc(v0, v0); 2982 __ aese(v0, v29); __ aesmc(v0, v0); 2983 __ aese(v0, v30); 2984 __ eor(v0, __ T16B, v0, v31); 2985 2986 __ st1(v0, __ T16B, __ post(to, 16)); 2987 2988 __ subw(len_reg, len_reg, 16); 2989 __ cbnzw(len_reg, L_aes_loop); 2990 2991 __ st1(v0, __ T16B, rvec); 2992 2993 __ mov(r0, rscratch2); 2994 2995 __ leave(); 2996 __ ret(lr); 2997 2998 return start; 2999 } 3000 3001 // Arguments: 3002 // 3003 // Inputs: 3004 // c_rarg0 - source byte array address 3005 // c_rarg1 - destination byte array address 3006 // c_rarg2 - K (key) in little endian int array 3007 // c_rarg3 - r vector byte array address 3008 // c_rarg4 - input length 3009 // 3010 // Output: 3011 // r0 - input length 3012 // 3013 address generate_cipherBlockChaining_decryptAESCrypt() { 3014 assert(UseAES, "need AES cryptographic extension support"); 3015 __ align(CodeEntryAlignment); 3016 StubId stub_id = StubId::stubgen_cipherBlockChaining_decryptAESCrypt_id; 3017 StubCodeMark mark(this, stub_id); 3018 3019 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 3020 3021 const Register from = c_rarg0; // source array address 3022 const Register to = c_rarg1; // destination array address 3023 const Register key = c_rarg2; // key array address 3024 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 3025 // and left with the results of the last encryption block 3026 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 3027 const Register keylen = rscratch1; 3028 3029 address start = __ pc(); 3030 3031 __ enter(); 3032 3033 __ movw(rscratch2, len_reg); 3034 3035 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 3036 3037 __ ld1(v2, __ T16B, rvec); 3038 3039 __ ld1(v31, __ T16B, __ post(key, 16)); 3040 __ rev32(v31, __ T16B, v31); 3041 3042 __ cmpw(keylen, 52); 3043 __ br(Assembler::CC, L_loadkeys_44); 3044 __ br(Assembler::EQ, L_loadkeys_52); 3045 3046 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 3047 __ rev32(v17, __ T16B, v17); 3048 __ rev32(v18, __ T16B, v18); 3049 __ BIND(L_loadkeys_52); 3050 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 3051 __ rev32(v19, __ 
T16B, v19); 3052 __ rev32(v20, __ T16B, v20); 3053 __ BIND(L_loadkeys_44); 3054 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 3055 __ rev32(v21, __ T16B, v21); 3056 __ rev32(v22, __ T16B, v22); 3057 __ rev32(v23, __ T16B, v23); 3058 __ rev32(v24, __ T16B, v24); 3059 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 3060 __ rev32(v25, __ T16B, v25); 3061 __ rev32(v26, __ T16B, v26); 3062 __ rev32(v27, __ T16B, v27); 3063 __ rev32(v28, __ T16B, v28); 3064 __ ld1(v29, v30, __ T16B, key); 3065 __ rev32(v29, __ T16B, v29); 3066 __ rev32(v30, __ T16B, v30); 3067 3068 __ BIND(L_aes_loop); 3069 __ ld1(v0, __ T16B, __ post(from, 16)); 3070 __ orr(v1, __ T16B, v0, v0); 3071 3072 __ br(Assembler::CC, L_rounds_44); 3073 __ br(Assembler::EQ, L_rounds_52); 3074 3075 __ aesd(v0, v17); __ aesimc(v0, v0); 3076 __ aesd(v0, v18); __ aesimc(v0, v0); 3077 __ BIND(L_rounds_52); 3078 __ aesd(v0, v19); __ aesimc(v0, v0); 3079 __ aesd(v0, v20); __ aesimc(v0, v0); 3080 __ BIND(L_rounds_44); 3081 __ aesd(v0, v21); __ aesimc(v0, v0); 3082 __ aesd(v0, v22); __ aesimc(v0, v0); 3083 __ aesd(v0, v23); __ aesimc(v0, v0); 3084 __ aesd(v0, v24); __ aesimc(v0, v0); 3085 __ aesd(v0, v25); __ aesimc(v0, v0); 3086 __ aesd(v0, v26); __ aesimc(v0, v0); 3087 __ aesd(v0, v27); __ aesimc(v0, v0); 3088 __ aesd(v0, v28); __ aesimc(v0, v0); 3089 __ aesd(v0, v29); __ aesimc(v0, v0); 3090 __ aesd(v0, v30); 3091 __ eor(v0, __ T16B, v0, v31); 3092 __ eor(v0, __ T16B, v0, v2); 3093 3094 __ st1(v0, __ T16B, __ post(to, 16)); 3095 __ orr(v2, __ T16B, v1, v1); 3096 3097 __ subw(len_reg, len_reg, 16); 3098 __ cbnzw(len_reg, L_aes_loop); 3099 3100 __ st1(v2, __ T16B, rvec); 3101 3102 __ mov(r0, rscratch2); 3103 3104 __ leave(); 3105 __ ret(lr); 3106 3107 return start; 3108 } 3109 3110 // Big-endian 128-bit + 64-bit -> 128-bit addition. 3111 // Inputs: 128-bits. in is preserved. 3112 // The least-significant 64-bit word is in the upper dword of each vector. 3113 // inc (the 64-bit increment) is preserved. Its lower dword must be zero. 3114 // Output: result 3115 void be_add_128_64(FloatRegister result, FloatRegister in, 3116 FloatRegister inc, FloatRegister tmp) { 3117 assert_different_registers(result, tmp, inc); 3118 3119 __ addv(result, __ T2D, in, inc); // Add inc to the least-significant dword of 3120 // input 3121 __ cm(__ HI, tmp, __ T2D, inc, result);// Check for result overflowing 3122 __ ext(tmp, __ T16B, tmp, tmp, 0x08); // Swap LSD of comparison result to MSD and 3123 // MSD == 0 (must be!) to LSD 3124 __ subv(result, __ T2D, result, tmp); // Subtract -1 from MSD if there was an overflow 3125 } 3126 3127 // CTR AES crypt. 
3128 // Arguments: 3129 // 3130 // Inputs: 3131 // c_rarg0 - source byte array address 3132 // c_rarg1 - destination byte array address 3133 // c_rarg2 - K (key) in little endian int array 3134 // c_rarg3 - counter vector byte array address 3135 // c_rarg4 - input length 3136 // c_rarg5 - saved encryptedCounter start 3137 // c_rarg6 - saved used length 3138 // 3139 // Output: 3140 // r0 - input length 3141 // 3142 address generate_counterMode_AESCrypt() { 3143 const Register in = c_rarg0; 3144 const Register out = c_rarg1; 3145 const Register key = c_rarg2; 3146 const Register counter = c_rarg3; 3147 const Register saved_len = c_rarg4, len = r10; 3148 const Register saved_encrypted_ctr = c_rarg5; 3149 const Register used_ptr = c_rarg6, used = r12; 3150 3151 const Register offset = r7; 3152 const Register keylen = r11; 3153 3154 const unsigned char block_size = 16; 3155 const int bulk_width = 4; 3156 // NB: bulk_width can be 4 or 8. 8 gives slightly faster 3157 // performance with larger data sizes, but it also means that the 3158 // fast path isn't used until you have at least 8 blocks, and up 3159 // to 127 bytes of data will be executed on the slow path. For 3160 // that reason, and also so as not to blow away too much icache, 4 3161 // blocks seems like a sensible compromise. 3162 3163 // Algorithm: 3164 // 3165 // if (len == 0) { 3166 // goto DONE; 3167 // } 3168 // int result = len; 3169 // do { 3170 // if (used >= blockSize) { 3171 // if (len >= bulk_width * blockSize) { 3172 // CTR_large_block(); 3173 // if (len == 0) 3174 // goto DONE; 3175 // } 3176 // for (;;) { 3177 // 16ByteVector v0 = counter; 3178 // embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0); 3179 // used = 0; 3180 // if (len < blockSize) 3181 // break; /* goto NEXT */ 3182 // 16ByteVector v1 = load16Bytes(in, offset); 3183 // v1 = v1 ^ encryptedCounter; 3184 // store16Bytes(out, offset); 3185 // used = blockSize; 3186 // offset += blockSize; 3187 // len -= blockSize; 3188 // if (len == 0) 3189 // goto DONE; 3190 // } 3191 // } 3192 // NEXT: 3193 // out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]); 3194 // len--; 3195 // } while (len != 0); 3196 // DONE: 3197 // return result; 3198 // 3199 // CTR_large_block() 3200 // Wide bulk encryption of whole blocks. 
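    // The 128-bit big-endian counter increments below rely on
    // be_add_128_64(), whose carry handling is, in rough C terms
    // (illustrative only):
    //
    //   lo = lo + inc;                            // add to the low word
    //   uint64_t borrow = (lo < inc) ? ~0ULL : 0; // all-ones on carry-out
    //   hi = hi - borrow;                         // subtracting -1 adds 1
    //
    // with the two words kept in the swapped lane order the vector
    // compare/ext/sub sequence expects.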
3201 3202 __ align(CodeEntryAlignment); 3203 StubId stub_id = StubId::stubgen_counterMode_AESCrypt_id; 3204 StubCodeMark mark(this, stub_id); 3205 const address start = __ pc(); 3206 __ enter(); 3207 3208 Label DONE, CTR_large_block, large_block_return; 3209 __ ldrw(used, Address(used_ptr)); 3210 __ cbzw(saved_len, DONE); 3211 3212 __ mov(len, saved_len); 3213 __ mov(offset, 0); 3214 3215 // Compute #rounds for AES based on the length of the key array 3216 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 3217 3218 __ aesenc_loadkeys(key, keylen); 3219 3220 { 3221 Label L_CTR_loop, NEXT; 3222 3223 __ bind(L_CTR_loop); 3224 3225 __ cmp(used, block_size); 3226 __ br(__ LO, NEXT); 3227 3228 // Maybe we have a lot of data 3229 __ subsw(rscratch1, len, bulk_width * block_size); 3230 __ br(__ HS, CTR_large_block); 3231 __ BIND(large_block_return); 3232 __ cbzw(len, DONE); 3233 3234 // Setup the counter 3235 __ movi(v4, __ T4S, 0); 3236 __ movi(v5, __ T4S, 1); 3237 __ ins(v4, __ S, v5, 2, 2); // v4 contains { 0, 1 } 3238 3239 // 128-bit big-endian increment 3240 __ ld1(v0, __ T16B, counter); 3241 __ rev64(v16, __ T16B, v0); 3242 be_add_128_64(v16, v16, v4, /*tmp*/v5); 3243 __ rev64(v16, __ T16B, v16); 3244 __ st1(v16, __ T16B, counter); 3245 // Previous counter value is in v0 3246 // v4 contains { 0, 1 } 3247 3248 { 3249 // We have fewer than bulk_width blocks of data left. Encrypt 3250 // them one by one until there is less than a full block 3251 // remaining, being careful to save both the encrypted counter 3252 // and the counter. 3253 3254 Label inner_loop; 3255 __ bind(inner_loop); 3256 // Counter to encrypt is in v0 3257 __ aesecb_encrypt(noreg, noreg, keylen); 3258 __ st1(v0, __ T16B, saved_encrypted_ctr); 3259 3260 // Do we have a remaining full block? 3261 3262 __ mov(used, 0); 3263 __ cmp(len, block_size); 3264 __ br(__ LO, NEXT); 3265 3266 // Yes, we have a full block 3267 __ ldrq(v1, Address(in, offset)); 3268 __ eor(v1, __ T16B, v1, v0); 3269 __ strq(v1, Address(out, offset)); 3270 __ mov(used, block_size); 3271 __ add(offset, offset, block_size); 3272 3273 __ subw(len, len, block_size); 3274 __ cbzw(len, DONE); 3275 3276 // Increment the counter, store it back 3277 __ orr(v0, __ T16B, v16, v16); 3278 __ rev64(v16, __ T16B, v16); 3279 be_add_128_64(v16, v16, v4, /*tmp*/v5); 3280 __ rev64(v16, __ T16B, v16); 3281 __ st1(v16, __ T16B, counter); // Save the incremented counter back 3282 3283 __ b(inner_loop); 3284 } 3285 3286 __ BIND(NEXT); 3287 3288 // Encrypt a single byte, and loop. 3289 // We expect this to be a rare event. 
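      // Roughly: out[offset] = in[offset] ^ saved_encrypted_ctr[used];
      //          used++; offset++; len--;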
3290 __ ldrb(rscratch1, Address(in, offset)); 3291 __ ldrb(rscratch2, Address(saved_encrypted_ctr, used)); 3292 __ eor(rscratch1, rscratch1, rscratch2); 3293 __ strb(rscratch1, Address(out, offset)); 3294 __ add(offset, offset, 1); 3295 __ add(used, used, 1); 3296 __ subw(len, len,1); 3297 __ cbnzw(len, L_CTR_loop); 3298 } 3299 3300 __ bind(DONE); 3301 __ strw(used, Address(used_ptr)); 3302 __ mov(r0, saved_len); 3303 3304 __ leave(); // required for proper stackwalking of RuntimeStub frame 3305 __ ret(lr); 3306 3307 // Bulk encryption 3308 3309 __ BIND (CTR_large_block); 3310 assert(bulk_width == 4 || bulk_width == 8, "must be"); 3311 3312 if (bulk_width == 8) { 3313 __ sub(sp, sp, 4 * 16); 3314 __ st1(v12, v13, v14, v15, __ T16B, Address(sp)); 3315 } 3316 __ sub(sp, sp, 4 * 16); 3317 __ st1(v8, v9, v10, v11, __ T16B, Address(sp)); 3318 RegSet saved_regs = (RegSet::of(in, out, offset) 3319 + RegSet::of(saved_encrypted_ctr, used_ptr, len)); 3320 __ push(saved_regs, sp); 3321 __ andr(len, len, -16 * bulk_width); // 8/4 encryptions, 16 bytes per encryption 3322 __ add(in, in, offset); 3323 __ add(out, out, offset); 3324 3325 // Keys should already be loaded into the correct registers 3326 3327 __ ld1(v0, __ T16B, counter); // v0 contains the first counter 3328 __ rev64(v16, __ T16B, v0); // v16 contains byte-reversed counter 3329 3330 // AES/CTR loop 3331 { 3332 Label L_CTR_loop; 3333 __ BIND(L_CTR_loop); 3334 3335 // Setup the counters 3336 __ movi(v8, __ T4S, 0); 3337 __ movi(v9, __ T4S, 1); 3338 __ ins(v8, __ S, v9, 2, 2); // v8 contains { 0, 1 } 3339 3340 for (int i = 0; i < bulk_width; i++) { 3341 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i); 3342 __ rev64(v0_ofs, __ T16B, v16); 3343 be_add_128_64(v16, v16, v8, /*tmp*/v9); 3344 } 3345 3346 __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16)); 3347 3348 // Encrypt the counters 3349 __ aesecb_encrypt(noreg, noreg, keylen, v0, bulk_width); 3350 3351 if (bulk_width == 8) { 3352 __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16)); 3353 } 3354 3355 // XOR the encrypted counters with the inputs 3356 for (int i = 0; i < bulk_width; i++) { 3357 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i); 3358 FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i); 3359 __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs); 3360 } 3361 3362 // Write the encrypted data 3363 __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16)); 3364 if (bulk_width == 8) { 3365 __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16)); 3366 } 3367 3368 __ subw(len, len, 16 * bulk_width); 3369 __ cbnzw(len, L_CTR_loop); 3370 } 3371 3372 // Save the counter back where it goes 3373 __ rev64(v16, __ T16B, v16); 3374 __ st1(v16, __ T16B, counter); 3375 3376 __ pop(saved_regs, sp); 3377 3378 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16)); 3379 if (bulk_width == 8) { 3380 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16)); 3381 } 3382 3383 __ andr(rscratch1, len, -16 * bulk_width); 3384 __ sub(len, len, rscratch1); 3385 __ add(offset, offset, rscratch1); 3386 __ mov(used, 16); 3387 __ strw(used, Address(used_ptr)); 3388 __ b(large_block_return); 3389 3390 return start; 3391 } 3392 3393 // Vector AES Galois Counter Mode implementation. 
Parameters: 3394 // 3395 // in = c_rarg0 3396 // len = c_rarg1 3397 // ct = c_rarg2 - ciphertext that ghash will read (in for encrypt, out for decrypt) 3398 // out = c_rarg3 3399 // key = c_rarg4 3400 // state = c_rarg5 - GHASH.state 3401 // subkeyHtbl = c_rarg6 - powers of H 3402 // counter = c_rarg7 - 16 bytes of CTR 3403 // return - number of processed bytes 3404 address generate_galoisCounterMode_AESCrypt() { 3405 address ghash_polynomial = __ pc(); 3406 __ emit_int64(0x87); // The low-order bits of the field 3407 // polynomial (i.e. p = z^7+z^2+z+1) 3408 // repeated in the low and high parts of a 3409 // 128-bit vector 3410 __ emit_int64(0x87); 3411 3412 __ align(CodeEntryAlignment); 3413 StubId stub_id = StubId::stubgen_galoisCounterMode_AESCrypt_id; 3414 StubCodeMark mark(this, stub_id); 3415 address start = __ pc(); 3416 __ enter(); 3417 3418 const Register in = c_rarg0; 3419 const Register len = c_rarg1; 3420 const Register ct = c_rarg2; 3421 const Register out = c_rarg3; 3422 // and updated with the incremented counter in the end 3423 3424 const Register key = c_rarg4; 3425 const Register state = c_rarg5; 3426 3427 const Register subkeyHtbl = c_rarg6; 3428 3429 const Register counter = c_rarg7; 3430 3431 const Register keylen = r10; 3432 // Save state before entering routine 3433 __ sub(sp, sp, 4 * 16); 3434 __ st1(v12, v13, v14, v15, __ T16B, Address(sp)); 3435 __ sub(sp, sp, 4 * 16); 3436 __ st1(v8, v9, v10, v11, __ T16B, Address(sp)); 3437 3438 // __ andr(len, len, -512); 3439 __ andr(len, len, -16 * 8); // 8 encryptions, 16 bytes per encryption 3440 __ str(len, __ pre(sp, -2 * wordSize)); 3441 3442 Label DONE; 3443 __ cbz(len, DONE); 3444 3445 // Compute #rounds for AES based on the length of the key array 3446 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 3447 3448 __ aesenc_loadkeys(key, keylen); 3449 __ ld1(v0, __ T16B, counter); // v0 contains the first counter 3450 __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter 3451 3452 // AES/CTR loop 3453 { 3454 Label L_CTR_loop; 3455 __ BIND(L_CTR_loop); 3456 3457 // Setup the counters 3458 __ movi(v8, __ T4S, 0); 3459 __ movi(v9, __ T4S, 1); 3460 __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 } 3461 3462 assert(v0->encoding() < v8->encoding(), ""); 3463 for (int i = v0->encoding(); i < v8->encoding(); i++) { 3464 FloatRegister f = as_FloatRegister(i); 3465 __ rev32(f, __ T16B, v16); 3466 __ addv(v16, __ T4S, v16, v8); 3467 } 3468 3469 __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16)); 3470 3471 // Encrypt the counters 3472 __ aesecb_encrypt(noreg, noreg, keylen, v0, /*unrolls*/8); 3473 3474 __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16)); 3475 3476 // XOR the encrypted counters with the inputs 3477 for (int i = 0; i < 8; i++) { 3478 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i); 3479 FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i); 3480 __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs); 3481 } 3482 __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16)); 3483 __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16)); 3484 3485 __ subw(len, len, 16 * 8); 3486 __ cbnzw(len, L_CTR_loop); 3487 } 3488 3489 __ rev32(v16, __ T16B, v16); 3490 __ st1(v16, __ T16B, counter); 3491 3492 __ ldr(len, Address(sp)); 3493 __ lsr(len, len, exact_log2(16)); // We want the count of blocks 3494 3495 // GHASH/CTR loop 3496 __ ghash_processBlocks_wide(ghash_polynomial, state, subkeyHtbl, ct, 3497 len, /*unrolls*/4); 3498 3499 #ifdef 
ASSERT 3500 { Label L; 3501 __ cmp(len, (unsigned char)0); 3502 __ br(Assembler::EQ, L); 3503 __ stop("stubGenerator: abort"); 3504 __ bind(L); 3505 } 3506 #endif 3507 3508 __ bind(DONE); 3509 // Return the number of bytes processed 3510 __ ldr(r0, __ post(sp, 2 * wordSize)); 3511 3512 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16)); 3513 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16)); 3514 3515 __ leave(); // required for proper stackwalking of RuntimeStub frame 3516 __ ret(lr); 3517 return start; 3518 } 3519 3520 class Cached64Bytes { 3521 private: 3522 MacroAssembler *_masm; 3523 Register _regs[8]; 3524 3525 public: 3526 Cached64Bytes(MacroAssembler *masm, RegSet rs): _masm(masm) { 3527 assert(rs.size() == 8, "%u registers are used to cache 16 4-byte data", rs.size()); 3528 auto it = rs.begin(); 3529 for (auto &r: _regs) { 3530 r = *it; 3531 ++it; 3532 } 3533 } 3534 3535 void gen_loads(Register base) { 3536 for (int i = 0; i < 8; i += 2) { 3537 __ ldp(_regs[i], _regs[i + 1], Address(base, 8 * i)); 3538 } 3539 } 3540 3541 // Generate code extracting i-th unsigned word (4 bytes) from cached 64 bytes. 3542 void extract_u32(Register dest, int i) { 3543 __ ubfx(dest, _regs[i / 2], 32 * (i % 2), 32); 3544 } 3545 }; 3546 3547 // Utility routines for md5. 3548 // Clobbers r10 and r11. 3549 void md5_FF(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4, 3550 int k, int s, int t) { 3551 Register rscratch3 = r10; 3552 Register rscratch4 = r11; 3553 3554 __ eorw(rscratch3, r3, r4); 3555 __ movw(rscratch2, t); 3556 __ andw(rscratch3, rscratch3, r2); 3557 __ addw(rscratch4, r1, rscratch2); 3558 reg_cache.extract_u32(rscratch1, k); 3559 __ eorw(rscratch3, rscratch3, r4); 3560 __ addw(rscratch4, rscratch4, rscratch1); 3561 __ addw(rscratch3, rscratch3, rscratch4); 3562 __ rorw(rscratch2, rscratch3, 32 - s); 3563 __ addw(r1, rscratch2, r2); 3564 } 3565 3566 void md5_GG(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4, 3567 int k, int s, int t) { 3568 Register rscratch3 = r10; 3569 Register rscratch4 = r11; 3570 3571 reg_cache.extract_u32(rscratch1, k); 3572 __ movw(rscratch2, t); 3573 __ addw(rscratch4, r1, rscratch2); 3574 __ addw(rscratch4, rscratch4, rscratch1); 3575 __ bicw(rscratch2, r3, r4); 3576 __ andw(rscratch3, r2, r4); 3577 __ addw(rscratch2, rscratch2, rscratch4); 3578 __ addw(rscratch2, rscratch2, rscratch3); 3579 __ rorw(rscratch2, rscratch2, 32 - s); 3580 __ addw(r1, rscratch2, r2); 3581 } 3582 3583 void md5_HH(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4, 3584 int k, int s, int t) { 3585 Register rscratch3 = r10; 3586 Register rscratch4 = r11; 3587 3588 __ eorw(rscratch3, r3, r4); 3589 __ movw(rscratch2, t); 3590 __ addw(rscratch4, r1, rscratch2); 3591 reg_cache.extract_u32(rscratch1, k); 3592 __ eorw(rscratch3, rscratch3, r2); 3593 __ addw(rscratch4, rscratch4, rscratch1); 3594 __ addw(rscratch3, rscratch3, rscratch4); 3595 __ rorw(rscratch2, rscratch3, 32 - s); 3596 __ addw(r1, rscratch2, r2); 3597 } 3598 3599 void md5_II(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4, 3600 int k, int s, int t) { 3601 Register rscratch3 = r10; 3602 Register rscratch4 = r11; 3603 3604 __ movw(rscratch3, t); 3605 __ ornw(rscratch2, r2, r4); 3606 __ addw(rscratch4, r1, rscratch3); 3607 reg_cache.extract_u32(rscratch1, k); 3608 __ eorw(rscratch3, rscratch2, r3); 3609 __ addw(rscratch4, rscratch4, rscratch1); 3610 __ addw(rscratch3, rscratch3, rscratch4); 3611 __ 
rorw(rscratch2, rscratch3, 32 - s); 3612 __ addw(r1, rscratch2, r2); 3613 } 3614 3615 // Arguments: 3616 // 3617 // Inputs: 3618 // c_rarg0 - byte[] source+offset 3619 // c_rarg1 - int[] SHA.state 3620 // c_rarg2 - int offset 3621 // c_rarg3 - int limit 3622 // 3623 address generate_md5_implCompress(StubId stub_id) { 3624 bool multi_block; 3625 switch (stub_id) { 3626 case StubId::stubgen_md5_implCompress_id: 3627 multi_block = false; 3628 break; 3629 case StubId::stubgen_md5_implCompressMB_id: 3630 multi_block = true; 3631 break; 3632 default: 3633 ShouldNotReachHere(); 3634 } 3635 __ align(CodeEntryAlignment); 3636 3637 StubCodeMark mark(this, stub_id); 3638 address start = __ pc(); 3639 3640 Register buf = c_rarg0; 3641 Register state = c_rarg1; 3642 Register ofs = c_rarg2; 3643 Register limit = c_rarg3; 3644 Register a = r4; 3645 Register b = r5; 3646 Register c = r6; 3647 Register d = r7; 3648 Register rscratch3 = r10; 3649 Register rscratch4 = r11; 3650 3651 Register state_regs[2] = { r12, r13 }; 3652 RegSet saved_regs = RegSet::range(r16, r22) - r18_tls; 3653 Cached64Bytes reg_cache(_masm, RegSet::of(r14, r15) + saved_regs); // using 8 registers 3654 3655 __ push(saved_regs, sp); 3656 3657 __ ldp(state_regs[0], state_regs[1], Address(state)); 3658 __ ubfx(a, state_regs[0], 0, 32); 3659 __ ubfx(b, state_regs[0], 32, 32); 3660 __ ubfx(c, state_regs[1], 0, 32); 3661 __ ubfx(d, state_regs[1], 32, 32); 3662 3663 Label md5_loop; 3664 __ BIND(md5_loop); 3665 3666 reg_cache.gen_loads(buf); 3667 3668 // Round 1 3669 md5_FF(reg_cache, a, b, c, d, 0, 7, 0xd76aa478); 3670 md5_FF(reg_cache, d, a, b, c, 1, 12, 0xe8c7b756); 3671 md5_FF(reg_cache, c, d, a, b, 2, 17, 0x242070db); 3672 md5_FF(reg_cache, b, c, d, a, 3, 22, 0xc1bdceee); 3673 md5_FF(reg_cache, a, b, c, d, 4, 7, 0xf57c0faf); 3674 md5_FF(reg_cache, d, a, b, c, 5, 12, 0x4787c62a); 3675 md5_FF(reg_cache, c, d, a, b, 6, 17, 0xa8304613); 3676 md5_FF(reg_cache, b, c, d, a, 7, 22, 0xfd469501); 3677 md5_FF(reg_cache, a, b, c, d, 8, 7, 0x698098d8); 3678 md5_FF(reg_cache, d, a, b, c, 9, 12, 0x8b44f7af); 3679 md5_FF(reg_cache, c, d, a, b, 10, 17, 0xffff5bb1); 3680 md5_FF(reg_cache, b, c, d, a, 11, 22, 0x895cd7be); 3681 md5_FF(reg_cache, a, b, c, d, 12, 7, 0x6b901122); 3682 md5_FF(reg_cache, d, a, b, c, 13, 12, 0xfd987193); 3683 md5_FF(reg_cache, c, d, a, b, 14, 17, 0xa679438e); 3684 md5_FF(reg_cache, b, c, d, a, 15, 22, 0x49b40821); 3685 3686 // Round 2 3687 md5_GG(reg_cache, a, b, c, d, 1, 5, 0xf61e2562); 3688 md5_GG(reg_cache, d, a, b, c, 6, 9, 0xc040b340); 3689 md5_GG(reg_cache, c, d, a, b, 11, 14, 0x265e5a51); 3690 md5_GG(reg_cache, b, c, d, a, 0, 20, 0xe9b6c7aa); 3691 md5_GG(reg_cache, a, b, c, d, 5, 5, 0xd62f105d); 3692 md5_GG(reg_cache, d, a, b, c, 10, 9, 0x02441453); 3693 md5_GG(reg_cache, c, d, a, b, 15, 14, 0xd8a1e681); 3694 md5_GG(reg_cache, b, c, d, a, 4, 20, 0xe7d3fbc8); 3695 md5_GG(reg_cache, a, b, c, d, 9, 5, 0x21e1cde6); 3696 md5_GG(reg_cache, d, a, b, c, 14, 9, 0xc33707d6); 3697 md5_GG(reg_cache, c, d, a, b, 3, 14, 0xf4d50d87); 3698 md5_GG(reg_cache, b, c, d, a, 8, 20, 0x455a14ed); 3699 md5_GG(reg_cache, a, b, c, d, 13, 5, 0xa9e3e905); 3700 md5_GG(reg_cache, d, a, b, c, 2, 9, 0xfcefa3f8); 3701 md5_GG(reg_cache, c, d, a, b, 7, 14, 0x676f02d9); 3702 md5_GG(reg_cache, b, c, d, a, 12, 20, 0x8d2a4c8a); 3703 3704 // Round 3 3705 md5_HH(reg_cache, a, b, c, d, 5, 4, 0xfffa3942); 3706 md5_HH(reg_cache, d, a, b, c, 8, 11, 0x8771f681); 3707 md5_HH(reg_cache, c, d, a, b, 11, 16, 0x6d9d6122); 3708 md5_HH(reg_cache, b, c, d, a, 14, 23, 
0xfde5380c); 3709 md5_HH(reg_cache, a, b, c, d, 1, 4, 0xa4beea44); 3710 md5_HH(reg_cache, d, a, b, c, 4, 11, 0x4bdecfa9); 3711 md5_HH(reg_cache, c, d, a, b, 7, 16, 0xf6bb4b60); 3712 md5_HH(reg_cache, b, c, d, a, 10, 23, 0xbebfbc70); 3713 md5_HH(reg_cache, a, b, c, d, 13, 4, 0x289b7ec6); 3714 md5_HH(reg_cache, d, a, b, c, 0, 11, 0xeaa127fa); 3715 md5_HH(reg_cache, c, d, a, b, 3, 16, 0xd4ef3085); 3716 md5_HH(reg_cache, b, c, d, a, 6, 23, 0x04881d05); 3717 md5_HH(reg_cache, a, b, c, d, 9, 4, 0xd9d4d039); 3718 md5_HH(reg_cache, d, a, b, c, 12, 11, 0xe6db99e5); 3719 md5_HH(reg_cache, c, d, a, b, 15, 16, 0x1fa27cf8); 3720 md5_HH(reg_cache, b, c, d, a, 2, 23, 0xc4ac5665); 3721 3722 // Round 4 3723 md5_II(reg_cache, a, b, c, d, 0, 6, 0xf4292244); 3724 md5_II(reg_cache, d, a, b, c, 7, 10, 0x432aff97); 3725 md5_II(reg_cache, c, d, a, b, 14, 15, 0xab9423a7); 3726 md5_II(reg_cache, b, c, d, a, 5, 21, 0xfc93a039); 3727 md5_II(reg_cache, a, b, c, d, 12, 6, 0x655b59c3); 3728 md5_II(reg_cache, d, a, b, c, 3, 10, 0x8f0ccc92); 3729 md5_II(reg_cache, c, d, a, b, 10, 15, 0xffeff47d); 3730 md5_II(reg_cache, b, c, d, a, 1, 21, 0x85845dd1); 3731 md5_II(reg_cache, a, b, c, d, 8, 6, 0x6fa87e4f); 3732 md5_II(reg_cache, d, a, b, c, 15, 10, 0xfe2ce6e0); 3733 md5_II(reg_cache, c, d, a, b, 6, 15, 0xa3014314); 3734 md5_II(reg_cache, b, c, d, a, 13, 21, 0x4e0811a1); 3735 md5_II(reg_cache, a, b, c, d, 4, 6, 0xf7537e82); 3736 md5_II(reg_cache, d, a, b, c, 11, 10, 0xbd3af235); 3737 md5_II(reg_cache, c, d, a, b, 2, 15, 0x2ad7d2bb); 3738 md5_II(reg_cache, b, c, d, a, 9, 21, 0xeb86d391); 3739 3740 __ addw(a, state_regs[0], a); 3741 __ ubfx(rscratch2, state_regs[0], 32, 32); 3742 __ addw(b, rscratch2, b); 3743 __ addw(c, state_regs[1], c); 3744 __ ubfx(rscratch4, state_regs[1], 32, 32); 3745 __ addw(d, rscratch4, d); 3746 3747 __ orr(state_regs[0], a, b, Assembler::LSL, 32); 3748 __ orr(state_regs[1], c, d, Assembler::LSL, 32); 3749 3750 if (multi_block) { 3751 __ add(buf, buf, 64); 3752 __ add(ofs, ofs, 64); 3753 __ cmp(ofs, limit); 3754 __ br(Assembler::LE, md5_loop); 3755 __ mov(c_rarg0, ofs); // return ofs 3756 } 3757 3758 // write hash values back in the correct order 3759 __ stp(state_regs[0], state_regs[1], Address(state)); 3760 3761 __ pop(saved_regs, sp); 3762 3763 __ ret(lr); 3764 3765 return start; 3766 } 3767 3768 // Arguments: 3769 // 3770 // Inputs: 3771 // c_rarg0 - byte[] source+offset 3772 // c_rarg1 - int[] SHA.state 3773 // c_rarg2 - int offset 3774 // c_rarg3 - int limit 3775 // 3776 address generate_sha1_implCompress(StubId stub_id) { 3777 bool multi_block; 3778 switch (stub_id) { 3779 case StubId::stubgen_sha1_implCompress_id: 3780 multi_block = false; 3781 break; 3782 case StubId::stubgen_sha1_implCompressMB_id: 3783 multi_block = true; 3784 break; 3785 default: 3786 ShouldNotReachHere(); 3787 } 3788 3789 __ align(CodeEntryAlignment); 3790 3791 StubCodeMark mark(this, stub_id); 3792 address start = __ pc(); 3793 3794 Register buf = c_rarg0; 3795 Register state = c_rarg1; 3796 Register ofs = c_rarg2; 3797 Register limit = c_rarg3; 3798 3799 Label keys; 3800 Label sha1_loop; 3801 3802 // load the keys into v0..v3 3803 __ adr(rscratch1, keys); 3804 __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1)); 3805 // load 5 words state into v6, v7 3806 __ ldrq(v6, Address(state, 0)); 3807 __ ldrs(v7, Address(state, 16)); 3808 3809 3810 __ BIND(sha1_loop); 3811 // load 64 bytes of data into v16..v19 3812 __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? 
__ post(buf, 64) : buf); 3813 __ rev32(v16, __ T16B, v16); 3814 __ rev32(v17, __ T16B, v17); 3815 __ rev32(v18, __ T16B, v18); 3816 __ rev32(v19, __ T16B, v19); 3817 3818 // do the sha1 3819 __ addv(v4, __ T4S, v16, v0); 3820 __ orr(v20, __ T16B, v6, v6); 3821 3822 FloatRegister d0 = v16; 3823 FloatRegister d1 = v17; 3824 FloatRegister d2 = v18; 3825 FloatRegister d3 = v19; 3826 3827 for (int round = 0; round < 20; round++) { 3828 FloatRegister tmp1 = (round & 1) ? v4 : v5; 3829 FloatRegister tmp2 = (round & 1) ? v21 : v22; 3830 FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7; 3831 FloatRegister tmp4 = (round & 1) ? v5 : v4; 3832 FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3)); 3833 3834 if (round < 16) __ sha1su0(d0, __ T4S, d1, d2); 3835 if (round < 19) __ addv(tmp1, __ T4S, d1, key); 3836 __ sha1h(tmp2, __ T4S, v20); 3837 if (round < 5) 3838 __ sha1c(v20, __ T4S, tmp3, tmp4); 3839 else if (round < 10 || round >= 15) 3840 __ sha1p(v20, __ T4S, tmp3, tmp4); 3841 else 3842 __ sha1m(v20, __ T4S, tmp3, tmp4); 3843 if (round < 16) __ sha1su1(d0, __ T4S, d3); 3844 3845 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 3846 } 3847 3848 __ addv(v7, __ T2S, v7, v21); 3849 __ addv(v6, __ T4S, v6, v20); 3850 3851 if (multi_block) { 3852 __ add(ofs, ofs, 64); 3853 __ cmp(ofs, limit); 3854 __ br(Assembler::LE, sha1_loop); 3855 __ mov(c_rarg0, ofs); // return ofs 3856 } 3857 3858 __ strq(v6, Address(state, 0)); 3859 __ strs(v7, Address(state, 16)); 3860 3861 __ ret(lr); 3862 3863 __ bind(keys); 3864 __ emit_int32(0x5a827999); 3865 __ emit_int32(0x6ed9eba1); 3866 __ emit_int32(0x8f1bbcdc); 3867 __ emit_int32(0xca62c1d6); 3868 3869 return start; 3870 } 3871 3872 3873 // Arguments: 3874 // 3875 // Inputs: 3876 // c_rarg0 - byte[] source+offset 3877 // c_rarg1 - int[] SHA.state 3878 // c_rarg2 - int offset 3879 // c_rarg3 - int limit 3880 // 3881 address generate_sha256_implCompress(StubId stub_id) { 3882 bool multi_block; 3883 switch (stub_id) { 3884 case StubId::stubgen_sha256_implCompress_id: 3885 multi_block = false; 3886 break; 3887 case StubId::stubgen_sha256_implCompressMB_id: 3888 multi_block = true; 3889 break; 3890 default: 3891 ShouldNotReachHere(); 3892 } 3893 3894 static const uint32_t round_consts[64] = { 3895 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 3896 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, 3897 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 3898 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 3899 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 3900 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, 3901 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 3902 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, 3903 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 3904 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 3905 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 3906 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, 3907 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 3908 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, 3909 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 3910 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, 3911 }; 3912 3913 __ align(CodeEntryAlignment); 3914 3915 StubCodeMark mark(this, stub_id); 3916 address start = __ pc(); 3917 3918 Register buf = c_rarg0; 3919 Register state = c_rarg1; 3920 Register ofs = c_rarg2; 3921 Register limit = c_rarg3; 3922 3923 Label sha1_loop; 3924 3925 __ stpd(v8, v9, __ pre(sp, -32)); 3926 __ stpd(v10, v11, Address(sp, 16)); 3927 3928 // dga == v0 3929 // dgb == v1 3930 // dg0 == 
v2 3931 // dg1 == v3 3932 // dg2 == v4 3933 // t0 == v6 3934 // t1 == v7 3935 3936 // load 16 keys to v16..v31 3937 __ lea(rscratch1, ExternalAddress((address)round_consts)); 3938 __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64)); 3939 __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64)); 3940 __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64)); 3941 __ ld1(v28, v29, v30, v31, __ T4S, rscratch1); 3942 3943 // load 8 words (256 bits) state 3944 __ ldpq(v0, v1, state); 3945 3946 __ BIND(sha1_loop); 3947 // load 64 bytes of data into v8..v11 3948 __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf); 3949 __ rev32(v8, __ T16B, v8); 3950 __ rev32(v9, __ T16B, v9); 3951 __ rev32(v10, __ T16B, v10); 3952 __ rev32(v11, __ T16B, v11); 3953 3954 __ addv(v6, __ T4S, v8, v16); 3955 __ orr(v2, __ T16B, v0, v0); 3956 __ orr(v3, __ T16B, v1, v1); 3957 3958 FloatRegister d0 = v8; 3959 FloatRegister d1 = v9; 3960 FloatRegister d2 = v10; 3961 FloatRegister d3 = v11; 3962 3963 3964 for (int round = 0; round < 16; round++) { 3965 FloatRegister tmp1 = (round & 1) ? v6 : v7; 3966 FloatRegister tmp2 = (round & 1) ? v7 : v6; 3967 FloatRegister tmp3 = (round & 1) ? v2 : v4; 3968 FloatRegister tmp4 = (round & 1) ? v4 : v2; 3969 3970 if (round < 12) __ sha256su0(d0, __ T4S, d1); 3971 __ orr(v4, __ T16B, v2, v2); 3972 if (round < 15) 3973 __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17)); 3974 __ sha256h(v2, __ T4S, v3, tmp2); 3975 __ sha256h2(v3, __ T4S, v4, tmp2); 3976 if (round < 12) __ sha256su1(d0, __ T4S, d2, d3); 3977 3978 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 3979 } 3980 3981 __ addv(v0, __ T4S, v0, v2); 3982 __ addv(v1, __ T4S, v1, v3); 3983 3984 if (multi_block) { 3985 __ add(ofs, ofs, 64); 3986 __ cmp(ofs, limit); 3987 __ br(Assembler::LE, sha1_loop); 3988 __ mov(c_rarg0, ofs); // return ofs 3989 } 3990 3991 __ ldpd(v10, v11, Address(sp, 16)); 3992 __ ldpd(v8, v9, __ post(sp, 32)); 3993 3994 __ stpq(v0, v1, state); 3995 3996 __ ret(lr); 3997 3998 return start; 3999 } 4000 4001 // Double rounds for sha512. 
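  // For reference, a minimal plain C++ sketch (hypothetical helper, not used
  // by the generated stub) of the scalar SHA-512 message-schedule step that
  // the sha512su0/sha512su1 pair in the double-round helper below computes,
  // two lanes at a time, on a 2D vector register:
  //   W[t] = W[t-16] + sigma0(W[t-15]) + W[t-7] + sigma1(W[t-2])
  static uint64_t sha512_schedule_ref(uint64_t w16, uint64_t w15,
                                      uint64_t w7, uint64_t w2) {
    auto rotr = [](uint64_t x, int n) { return (x >> n) | (x << (64 - n)); };
    uint64_t sigma0 = rotr(w15, 1) ^ rotr(w15, 8) ^ (w15 >> 7);
    uint64_t sigma1 = rotr(w2, 19) ^ rotr(w2, 61) ^ (w2 >> 6);
    return w16 + sigma0 + sigma1 + w7;
  }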
4002 void sha512_dround(int dr, 4003 FloatRegister vi0, FloatRegister vi1, 4004 FloatRegister vi2, FloatRegister vi3, 4005 FloatRegister vi4, FloatRegister vrc0, 4006 FloatRegister vrc1, FloatRegister vin0, 4007 FloatRegister vin1, FloatRegister vin2, 4008 FloatRegister vin3, FloatRegister vin4) { 4009 if (dr < 36) { 4010 __ ld1(vrc1, __ T2D, __ post(rscratch2, 16)); 4011 } 4012 __ addv(v5, __ T2D, vrc0, vin0); 4013 __ ext(v6, __ T16B, vi2, vi3, 8); 4014 __ ext(v5, __ T16B, v5, v5, 8); 4015 __ ext(v7, __ T16B, vi1, vi2, 8); 4016 __ addv(vi3, __ T2D, vi3, v5); 4017 if (dr < 32) { 4018 __ ext(v5, __ T16B, vin3, vin4, 8); 4019 __ sha512su0(vin0, __ T2D, vin1); 4020 } 4021 __ sha512h(vi3, __ T2D, v6, v7); 4022 if (dr < 32) { 4023 __ sha512su1(vin0, __ T2D, vin2, v5); 4024 } 4025 __ addv(vi4, __ T2D, vi1, vi3); 4026 __ sha512h2(vi3, __ T2D, vi1, vi0); 4027 } 4028 4029 // Arguments: 4030 // 4031 // Inputs: 4032 // c_rarg0 - byte[] source+offset 4033 // c_rarg1 - int[] SHA.state 4034 // c_rarg2 - int offset 4035 // c_rarg3 - int limit 4036 // 4037 address generate_sha512_implCompress(StubId stub_id) { 4038 bool multi_block; 4039 switch (stub_id) { 4040 case StubId::stubgen_sha512_implCompress_id: 4041 multi_block = false; 4042 break; 4043 case StubId::stubgen_sha512_implCompressMB_id: 4044 multi_block = true; 4045 break; 4046 default: 4047 ShouldNotReachHere(); 4048 } 4049 4050 static const uint64_t round_consts[80] = { 4051 0x428A2F98D728AE22L, 0x7137449123EF65CDL, 0xB5C0FBCFEC4D3B2FL, 4052 0xE9B5DBA58189DBBCL, 0x3956C25BF348B538L, 0x59F111F1B605D019L, 4053 0x923F82A4AF194F9BL, 0xAB1C5ED5DA6D8118L, 0xD807AA98A3030242L, 4054 0x12835B0145706FBEL, 0x243185BE4EE4B28CL, 0x550C7DC3D5FFB4E2L, 4055 0x72BE5D74F27B896FL, 0x80DEB1FE3B1696B1L, 0x9BDC06A725C71235L, 4056 0xC19BF174CF692694L, 0xE49B69C19EF14AD2L, 0xEFBE4786384F25E3L, 4057 0x0FC19DC68B8CD5B5L, 0x240CA1CC77AC9C65L, 0x2DE92C6F592B0275L, 4058 0x4A7484AA6EA6E483L, 0x5CB0A9DCBD41FBD4L, 0x76F988DA831153B5L, 4059 0x983E5152EE66DFABL, 0xA831C66D2DB43210L, 0xB00327C898FB213FL, 4060 0xBF597FC7BEEF0EE4L, 0xC6E00BF33DA88FC2L, 0xD5A79147930AA725L, 4061 0x06CA6351E003826FL, 0x142929670A0E6E70L, 0x27B70A8546D22FFCL, 4062 0x2E1B21385C26C926L, 0x4D2C6DFC5AC42AEDL, 0x53380D139D95B3DFL, 4063 0x650A73548BAF63DEL, 0x766A0ABB3C77B2A8L, 0x81C2C92E47EDAEE6L, 4064 0x92722C851482353BL, 0xA2BFE8A14CF10364L, 0xA81A664BBC423001L, 4065 0xC24B8B70D0F89791L, 0xC76C51A30654BE30L, 0xD192E819D6EF5218L, 4066 0xD69906245565A910L, 0xF40E35855771202AL, 0x106AA07032BBD1B8L, 4067 0x19A4C116B8D2D0C8L, 0x1E376C085141AB53L, 0x2748774CDF8EEB99L, 4068 0x34B0BCB5E19B48A8L, 0x391C0CB3C5C95A63L, 0x4ED8AA4AE3418ACBL, 4069 0x5B9CCA4F7763E373L, 0x682E6FF3D6B2B8A3L, 0x748F82EE5DEFB2FCL, 4070 0x78A5636F43172F60L, 0x84C87814A1F0AB72L, 0x8CC702081A6439ECL, 4071 0x90BEFFFA23631E28L, 0xA4506CEBDE82BDE9L, 0xBEF9A3F7B2C67915L, 4072 0xC67178F2E372532BL, 0xCA273ECEEA26619CL, 0xD186B8C721C0C207L, 4073 0xEADA7DD6CDE0EB1EL, 0xF57D4F7FEE6ED178L, 0x06F067AA72176FBAL, 4074 0x0A637DC5A2C898A6L, 0x113F9804BEF90DAEL, 0x1B710B35131C471BL, 4075 0x28DB77F523047D84L, 0x32CAAB7B40C72493L, 0x3C9EBE0A15C9BEBCL, 4076 0x431D67C49C100D4CL, 0x4CC5D4BECB3E42B6L, 0x597F299CFC657E2AL, 4077 0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L 4078 }; 4079 4080 __ align(CodeEntryAlignment); 4081 4082 StubCodeMark mark(this, stub_id); 4083 address start = __ pc(); 4084 4085 Register buf = c_rarg0; 4086 Register state = c_rarg1; 4087 Register ofs = c_rarg2; 4088 Register limit = c_rarg3; 4089 4090 __ stpd(v8, v9, __ pre(sp, -64)); 4091 __ 
stpd(v10, v11, Address(sp, 16)); 4092 __ stpd(v12, v13, Address(sp, 32)); 4093 __ stpd(v14, v15, Address(sp, 48)); 4094 4095 Label sha512_loop; 4096 4097 // load state 4098 __ ld1(v8, v9, v10, v11, __ T2D, state); 4099 4100 // load first 4 round constants 4101 __ lea(rscratch1, ExternalAddress((address)round_consts)); 4102 __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64)); 4103 4104 __ BIND(sha512_loop); 4105 // load 128B of data into v12..v19 4106 __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64)); 4107 __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64)); 4108 __ rev64(v12, __ T16B, v12); 4109 __ rev64(v13, __ T16B, v13); 4110 __ rev64(v14, __ T16B, v14); 4111 __ rev64(v15, __ T16B, v15); 4112 __ rev64(v16, __ T16B, v16); 4113 __ rev64(v17, __ T16B, v17); 4114 __ rev64(v18, __ T16B, v18); 4115 __ rev64(v19, __ T16B, v19); 4116 4117 __ mov(rscratch2, rscratch1); 4118 4119 __ mov(v0, __ T16B, v8); 4120 __ mov(v1, __ T16B, v9); 4121 __ mov(v2, __ T16B, v10); 4122 __ mov(v3, __ T16B, v11); 4123 4124 sha512_dround( 0, v0, v1, v2, v3, v4, v20, v24, v12, v13, v19, v16, v17); 4125 sha512_dround( 1, v3, v0, v4, v2, v1, v21, v25, v13, v14, v12, v17, v18); 4126 sha512_dround( 2, v2, v3, v1, v4, v0, v22, v26, v14, v15, v13, v18, v19); 4127 sha512_dround( 3, v4, v2, v0, v1, v3, v23, v27, v15, v16, v14, v19, v12); 4128 sha512_dround( 4, v1, v4, v3, v0, v2, v24, v28, v16, v17, v15, v12, v13); 4129 sha512_dround( 5, v0, v1, v2, v3, v4, v25, v29, v17, v18, v16, v13, v14); 4130 sha512_dround( 6, v3, v0, v4, v2, v1, v26, v30, v18, v19, v17, v14, v15); 4131 sha512_dround( 7, v2, v3, v1, v4, v0, v27, v31, v19, v12, v18, v15, v16); 4132 sha512_dround( 8, v4, v2, v0, v1, v3, v28, v24, v12, v13, v19, v16, v17); 4133 sha512_dround( 9, v1, v4, v3, v0, v2, v29, v25, v13, v14, v12, v17, v18); 4134 sha512_dround(10, v0, v1, v2, v3, v4, v30, v26, v14, v15, v13, v18, v19); 4135 sha512_dround(11, v3, v0, v4, v2, v1, v31, v27, v15, v16, v14, v19, v12); 4136 sha512_dround(12, v2, v3, v1, v4, v0, v24, v28, v16, v17, v15, v12, v13); 4137 sha512_dround(13, v4, v2, v0, v1, v3, v25, v29, v17, v18, v16, v13, v14); 4138 sha512_dround(14, v1, v4, v3, v0, v2, v26, v30, v18, v19, v17, v14, v15); 4139 sha512_dround(15, v0, v1, v2, v3, v4, v27, v31, v19, v12, v18, v15, v16); 4140 sha512_dround(16, v3, v0, v4, v2, v1, v28, v24, v12, v13, v19, v16, v17); 4141 sha512_dround(17, v2, v3, v1, v4, v0, v29, v25, v13, v14, v12, v17, v18); 4142 sha512_dround(18, v4, v2, v0, v1, v3, v30, v26, v14, v15, v13, v18, v19); 4143 sha512_dround(19, v1, v4, v3, v0, v2, v31, v27, v15, v16, v14, v19, v12); 4144 sha512_dround(20, v0, v1, v2, v3, v4, v24, v28, v16, v17, v15, v12, v13); 4145 sha512_dround(21, v3, v0, v4, v2, v1, v25, v29, v17, v18, v16, v13, v14); 4146 sha512_dround(22, v2, v3, v1, v4, v0, v26, v30, v18, v19, v17, v14, v15); 4147 sha512_dround(23, v4, v2, v0, v1, v3, v27, v31, v19, v12, v18, v15, v16); 4148 sha512_dround(24, v1, v4, v3, v0, v2, v28, v24, v12, v13, v19, v16, v17); 4149 sha512_dround(25, v0, v1, v2, v3, v4, v29, v25, v13, v14, v12, v17, v18); 4150 sha512_dround(26, v3, v0, v4, v2, v1, v30, v26, v14, v15, v13, v18, v19); 4151 sha512_dround(27, v2, v3, v1, v4, v0, v31, v27, v15, v16, v14, v19, v12); 4152 sha512_dround(28, v4, v2, v0, v1, v3, v24, v28, v16, v17, v15, v12, v13); 4153 sha512_dround(29, v1, v4, v3, v0, v2, v25, v29, v17, v18, v16, v13, v14); 4154 sha512_dround(30, v0, v1, v2, v3, v4, v26, v30, v18, v19, v17, v14, v15); 4155 sha512_dround(31, v3, v0, v4, v2, v1, v27, v31, v19, v12, v18, v15, v16); 4156 
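    // From round 32 onwards no further message-schedule updates are needed
    // (sha512_dround only issues sha512su0/sha512su1 while dr < 32), so the
    // trailing message-register arguments are passed as v0 placeholders;
    // rounds 36..39 also load no further round constants (dr < 36 guard).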
sha512_dround(32, v2, v3, v1, v4, v0, v28, v24, v12, v0, v0, v0, v0); 4157 sha512_dround(33, v4, v2, v0, v1, v3, v29, v25, v13, v0, v0, v0, v0); 4158 sha512_dround(34, v1, v4, v3, v0, v2, v30, v26, v14, v0, v0, v0, v0); 4159 sha512_dround(35, v0, v1, v2, v3, v4, v31, v27, v15, v0, v0, v0, v0); 4160 sha512_dround(36, v3, v0, v4, v2, v1, v24, v0, v16, v0, v0, v0, v0); 4161 sha512_dround(37, v2, v3, v1, v4, v0, v25, v0, v17, v0, v0, v0, v0); 4162 sha512_dround(38, v4, v2, v0, v1, v3, v26, v0, v18, v0, v0, v0, v0); 4163 sha512_dround(39, v1, v4, v3, v0, v2, v27, v0, v19, v0, v0, v0, v0); 4164 4165 __ addv(v8, __ T2D, v8, v0); 4166 __ addv(v9, __ T2D, v9, v1); 4167 __ addv(v10, __ T2D, v10, v2); 4168 __ addv(v11, __ T2D, v11, v3); 4169 4170 if (multi_block) { 4171 __ add(ofs, ofs, 128); 4172 __ cmp(ofs, limit); 4173 __ br(Assembler::LE, sha512_loop); 4174 __ mov(c_rarg0, ofs); // return ofs 4175 } 4176 4177 __ st1(v8, v9, v10, v11, __ T2D, state); 4178 4179 __ ldpd(v14, v15, Address(sp, 48)); 4180 __ ldpd(v12, v13, Address(sp, 32)); 4181 __ ldpd(v10, v11, Address(sp, 16)); 4182 __ ldpd(v8, v9, __ post(sp, 64)); 4183 4184 __ ret(lr); 4185 4186 return start; 4187 } 4188 4189 // Execute one round of keccak on two computations in parallel. 4190 // One of the states should be loaded into the lower halves of 4191 // the vector registers v0-v24, the other should be loaded into 4192 // the upper halves of those registers. The ld1r instruction loads 4193 // the round constant into both halves of register v31. 4194 // Intermediate results c0...c4 and d0...d4 are computed 4195 // in registers v25...v30. 4196 // All vector instructions that are used operate on both register 4197 // halves in parallel. 4198 // If only a single computation is needed, it suffices to load only the lower halves.
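  // For reference, a minimal plain C++ sketch (hypothetical helper, not used
  // by the generated stubs) of the theta step on a single 5x5 lane state
  // a[x + 5*y], matching the c0..c4 and d0..d4 values named in the comments
  // of keccak_round below:
  static void keccak_theta_ref(uint64_t a[25]) {
    auto rotl = [](uint64_t x, int n) { return (x << n) | (x >> (64 - n)); };
    uint64_t c[5], d[5];
    for (int x = 0; x < 5; x++) {
      c[x] = a[x] ^ a[x + 5] ^ a[x + 10] ^ a[x + 15] ^ a[x + 20];
    }
    for (int x = 0; x < 5; x++) {
      d[x] = c[(x + 4) % 5] ^ rotl(c[(x + 1) % 5], 1); // e.g. d0 = c4 ^ rol(c1, 1)
      for (int y = 0; y < 25; y += 5) {
        a[x + y] ^= d[x];
      }
    }
  }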
4199 void keccak_round(Register rscratch1) { 4200 __ eor3(v29, __ T16B, v4, v9, v14); // c4 = a4 ^ a9 ^ a14 4201 __ eor3(v26, __ T16B, v1, v6, v11); // c1 = a1 ^ a16 ^ a11 4202 __ eor3(v28, __ T16B, v3, v8, v13); // c3 = a3 ^ a8 ^a13 4203 __ eor3(v25, __ T16B, v0, v5, v10); // c0 = a0 ^ a5 ^ a10 4204 __ eor3(v27, __ T16B, v2, v7, v12); // c2 = a2 ^ a7 ^ a12 4205 __ eor3(v29, __ T16B, v29, v19, v24); // c4 ^= a19 ^ a24 4206 __ eor3(v26, __ T16B, v26, v16, v21); // c1 ^= a16 ^ a21 4207 __ eor3(v28, __ T16B, v28, v18, v23); // c3 ^= a18 ^ a23 4208 __ eor3(v25, __ T16B, v25, v15, v20); // c0 ^= a15 ^ a20 4209 __ eor3(v27, __ T16B, v27, v17, v22); // c2 ^= a17 ^ a22 4210 4211 __ rax1(v30, __ T2D, v29, v26); // d0 = c4 ^ rol(c1, 1) 4212 __ rax1(v26, __ T2D, v26, v28); // d2 = c1 ^ rol(c3, 1) 4213 __ rax1(v28, __ T2D, v28, v25); // d4 = c3 ^ rol(c0, 1) 4214 __ rax1(v25, __ T2D, v25, v27); // d1 = c0 ^ rol(c2, 1) 4215 __ rax1(v27, __ T2D, v27, v29); // d3 = c2 ^ rol(c4, 1) 4216 4217 __ eor(v0, __ T16B, v0, v30); // a0 = a0 ^ d0 4218 __ xar(v29, __ T2D, v1, v25, (64 - 1)); // a10' = rol((a1^d1), 1) 4219 __ xar(v1, __ T2D, v6, v25, (64 - 44)); // a1 = rol(a6^d1), 44) 4220 __ xar(v6, __ T2D, v9, v28, (64 - 20)); // a6 = rol((a9^d4), 20) 4221 __ xar(v9, __ T2D, v22, v26, (64 - 61)); // a9 = rol((a22^d2), 61) 4222 __ xar(v22, __ T2D, v14, v28, (64 - 39)); // a22 = rol((a14^d4), 39) 4223 __ xar(v14, __ T2D, v20, v30, (64 - 18)); // a14 = rol((a20^d0), 18) 4224 __ xar(v31, __ T2D, v2, v26, (64 - 62)); // a20' = rol((a2^d2), 62) 4225 __ xar(v2, __ T2D, v12, v26, (64 - 43)); // a2 = rol((a12^d2), 43) 4226 __ xar(v12, __ T2D, v13, v27, (64 - 25)); // a12 = rol((a13^d3), 25) 4227 __ xar(v13, __ T2D, v19, v28, (64 - 8)); // a13 = rol((a19^d4), 8) 4228 __ xar(v19, __ T2D, v23, v27, (64 - 56)); // a19 = rol((a23^d3), 56) 4229 __ xar(v23, __ T2D, v15, v30, (64 - 41)); // a23 = rol((a15^d0), 41) 4230 __ xar(v15, __ T2D, v4, v28, (64 - 27)); // a15 = rol((a4^d4), 27) 4231 __ xar(v28, __ T2D, v24, v28, (64 - 14)); // a4' = rol((a24^d4), 14) 4232 __ xar(v24, __ T2D, v21, v25, (64 - 2)); // a24 = rol((a21^d1), 2) 4233 __ xar(v8, __ T2D, v8, v27, (64 - 55)); // a21' = rol((a8^d3), 55) 4234 __ xar(v4, __ T2D, v16, v25, (64 - 45)); // a8' = rol((a16^d1), 45) 4235 __ xar(v16, __ T2D, v5, v30, (64 - 36)); // a16 = rol((a5^d0), 36) 4236 __ xar(v5, __ T2D, v3, v27, (64 - 28)); // a5 = rol((a3^d3), 28) 4237 __ xar(v27, __ T2D, v18, v27, (64 - 21)); // a3' = rol((a18^d3), 21) 4238 __ xar(v3, __ T2D, v17, v26, (64 - 15)); // a18' = rol((a17^d2), 15) 4239 __ xar(v25, __ T2D, v11, v25, (64 - 10)); // a17' = rol((a11^d1), 10) 4240 __ xar(v26, __ T2D, v7, v26, (64 - 6)); // a11' = rol((a7^d2), 6) 4241 __ xar(v30, __ T2D, v10, v30, (64 - 3)); // a7' = rol((a10^d0), 3) 4242 4243 __ bcax(v20, __ T16B, v31, v22, v8); // a20 = a20' ^ (~a21 & a22') 4244 __ bcax(v21, __ T16B, v8, v23, v22); // a21 = a21' ^ (~a22 & a23) 4245 __ bcax(v22, __ T16B, v22, v24, v23); // a22 = a22 ^ (~a23 & a24) 4246 __ bcax(v23, __ T16B, v23, v31, v24); // a23 = a23 ^ (~a24 & a20') 4247 __ bcax(v24, __ T16B, v24, v8, v31); // a24 = a24 ^ (~a20' & a21') 4248 4249 __ ld1r(v31, __ T2D, __ post(rscratch1, 8)); // rc = round_constants[i] 4250 4251 __ bcax(v17, __ T16B, v25, v19, v3); // a17 = a17' ^ (~a18' & a19) 4252 __ bcax(v18, __ T16B, v3, v15, v19); // a18 = a18' ^ (~a19 & a15') 4253 __ bcax(v19, __ T16B, v19, v16, v15); // a19 = a19 ^ (~a15 & a16) 4254 __ bcax(v15, __ T16B, v15, v25, v16); // a15 = a15 ^ (~a16 & a17') 4255 __ bcax(v16, __ T16B, v16, v3, v25); 
// a16 = a16 ^ (~a17' & a18') 4256 4257 __ bcax(v10, __ T16B, v29, v12, v26); // a10 = a10' ^ (~a11' & a12) 4258 __ bcax(v11, __ T16B, v26, v13, v12); // a11 = a11' ^ (~a12 & a13) 4259 __ bcax(v12, __ T16B, v12, v14, v13); // a12 = a12 ^ (~a13 & a14) 4260 __ bcax(v13, __ T16B, v13, v29, v14); // a13 = a13 ^ (~a14 & a10') 4261 __ bcax(v14, __ T16B, v14, v26, v29); // a14 = a14 ^ (~a10' & a11') 4262 4263 __ bcax(v7, __ T16B, v30, v9, v4); // a7 = a7' ^ (~a8' & a9) 4264 __ bcax(v8, __ T16B, v4, v5, v9); // a8 = a8' ^ (~a9 & a5) 4265 __ bcax(v9, __ T16B, v9, v6, v5); // a9 = a9 ^ (~a5 & a6) 4266 __ bcax(v5, __ T16B, v5, v30, v6); // a5 = a5 ^ (~a6 & a7) 4267 __ bcax(v6, __ T16B, v6, v4, v30); // a6 = a6 ^ (~a7 & a8') 4268 4269 __ bcax(v3, __ T16B, v27, v0, v28); // a3 = a3' ^ (~a4' & a0) 4270 __ bcax(v4, __ T16B, v28, v1, v0); // a4 = a4' ^ (~a0 & a1) 4271 __ bcax(v0, __ T16B, v0, v2, v1); // a0 = a0 ^ (~a1 & a2) 4272 __ bcax(v1, __ T16B, v1, v27, v2); // a1 = a1 ^ (~a2 & a3) 4273 __ bcax(v2, __ T16B, v2, v28, v27); // a2 = a2 ^ (~a3 & a4') 4274 4275 __ eor(v0, __ T16B, v0, v31); // a0 = a0 ^ rc 4276 } 4277 4278 // Arguments: 4279 // 4280 // Inputs: 4281 // c_rarg0 - byte[] source+offset 4282 // c_rarg1 - byte[] SHA.state 4283 // c_rarg2 - int block_size 4284 // c_rarg3 - int offset 4285 // c_rarg4 - int limit 4286 // 4287 address generate_sha3_implCompress(StubId stub_id) { 4288 bool multi_block; 4289 switch (stub_id) { 4290 case StubId::stubgen_sha3_implCompress_id: 4291 multi_block = false; 4292 break; 4293 case StubId::stubgen_sha3_implCompressMB_id: 4294 multi_block = true; 4295 break; 4296 default: 4297 ShouldNotReachHere(); 4298 } 4299 4300 static const uint64_t round_consts[24] = { 4301 0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL, 4302 0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L, 4303 0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL, 4304 0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL, 4305 0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L, 4306 0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L, 4307 0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L, 4308 0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L 4309 }; 4310 4311 __ align(CodeEntryAlignment); 4312 4313 StubCodeMark mark(this, stub_id); 4314 address start = __ pc(); 4315 4316 Register buf = c_rarg0; 4317 Register state = c_rarg1; 4318 Register block_size = c_rarg2; 4319 Register ofs = c_rarg3; 4320 Register limit = c_rarg4; 4321 4322 Label sha3_loop, rounds24_loop; 4323 Label sha3_512_or_sha3_384, shake128; 4324 4325 __ stpd(v8, v9, __ pre(sp, -64)); 4326 __ stpd(v10, v11, Address(sp, 16)); 4327 __ stpd(v12, v13, Address(sp, 32)); 4328 __ stpd(v14, v15, Address(sp, 48)); 4329 4330 // load state 4331 __ add(rscratch1, state, 32); 4332 __ ld1(v0, v1, v2, v3, __ T1D, state); 4333 __ ld1(v4, v5, v6, v7, __ T1D, __ post(rscratch1, 32)); 4334 __ ld1(v8, v9, v10, v11, __ T1D, __ post(rscratch1, 32)); 4335 __ ld1(v12, v13, v14, v15, __ T1D, __ post(rscratch1, 32)); 4336 __ ld1(v16, v17, v18, v19, __ T1D, __ post(rscratch1, 32)); 4337 __ ld1(v20, v21, v22, v23, __ T1D, __ post(rscratch1, 32)); 4338 __ ld1(v24, __ T1D, rscratch1); 4339 4340 __ BIND(sha3_loop); 4341 4342 // 24 keccak rounds 4343 __ movw(rscratch2, 24); 4344 4345 // load round_constants base 4346 __ lea(rscratch1, ExternalAddress((address) round_consts)); 4347 4348 // load input 4349 __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32)); 4350 __ ld1(v29, v30, v31, 
__ T8B, __ post(buf, 24)); 4351 __ eor(v0, __ T8B, v0, v25); 4352 __ eor(v1, __ T8B, v1, v26); 4353 __ eor(v2, __ T8B, v2, v27); 4354 __ eor(v3, __ T8B, v3, v28); 4355 __ eor(v4, __ T8B, v4, v29); 4356 __ eor(v5, __ T8B, v5, v30); 4357 __ eor(v6, __ T8B, v6, v31); 4358 4359 // block_size == 72, SHA3-512; block_size == 104, SHA3-384 4360 __ tbz(block_size, 7, sha3_512_or_sha3_384); 4361 4362 __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32)); 4363 __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24)); 4364 __ eor(v7, __ T8B, v7, v25); 4365 __ eor(v8, __ T8B, v8, v26); 4366 __ eor(v9, __ T8B, v9, v27); 4367 __ eor(v10, __ T8B, v10, v28); 4368 __ eor(v11, __ T8B, v11, v29); 4369 __ eor(v12, __ T8B, v12, v30); 4370 __ eor(v13, __ T8B, v13, v31); 4371 4372 __ ld1(v25, v26, v27, __ T8B, __ post(buf, 24)); 4373 __ eor(v14, __ T8B, v14, v25); 4374 __ eor(v15, __ T8B, v15, v26); 4375 __ eor(v16, __ T8B, v16, v27); 4376 4377 // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256 4378 __ andw(c_rarg5, block_size, 48); 4379 __ cbzw(c_rarg5, rounds24_loop); 4380 4381 __ tbnz(block_size, 5, shake128); 4382 // block_size == 144, bit5 == 0, SHA3-224 4383 __ ldrd(v28, __ post(buf, 8)); 4384 __ eor(v17, __ T8B, v17, v28); 4385 __ b(rounds24_loop); 4386 4387 __ BIND(shake128); 4388 __ ld1(v28, v29, v30, v31, __ T8B, __ post(buf, 32)); 4389 __ eor(v17, __ T8B, v17, v28); 4390 __ eor(v18, __ T8B, v18, v29); 4391 __ eor(v19, __ T8B, v19, v30); 4392 __ eor(v20, __ T8B, v20, v31); 4393 __ b(rounds24_loop); // block_size == 168, SHAKE128 4394 4395 __ BIND(sha3_512_or_sha3_384); 4396 __ ld1(v25, v26, __ T8B, __ post(buf, 16)); 4397 __ eor(v7, __ T8B, v7, v25); 4398 __ eor(v8, __ T8B, v8, v26); 4399 __ tbz(block_size, 5, rounds24_loop); // SHA3-512 4400 4401 // SHA3-384 4402 __ ld1(v27, v28, v29, v30, __ T8B, __ post(buf, 32)); 4403 __ eor(v9, __ T8B, v9, v27); 4404 __ eor(v10, __ T8B, v10, v28); 4405 __ eor(v11, __ T8B, v11, v29); 4406 __ eor(v12, __ T8B, v12, v30); 4407 4408 __ BIND(rounds24_loop); 4409 __ subw(rscratch2, rscratch2, 1); 4410 4411 keccak_round(rscratch1); 4412 4413 __ cbnzw(rscratch2, rounds24_loop); 4414 4415 if (multi_block) { 4416 __ add(ofs, ofs, block_size); 4417 __ cmp(ofs, limit); 4418 __ br(Assembler::LE, sha3_loop); 4419 __ mov(c_rarg0, ofs); // return ofs 4420 } 4421 4422 __ st1(v0, v1, v2, v3, __ T1D, __ post(state, 32)); 4423 __ st1(v4, v5, v6, v7, __ T1D, __ post(state, 32)); 4424 __ st1(v8, v9, v10, v11, __ T1D, __ post(state, 32)); 4425 __ st1(v12, v13, v14, v15, __ T1D, __ post(state, 32)); 4426 __ st1(v16, v17, v18, v19, __ T1D, __ post(state, 32)); 4427 __ st1(v20, v21, v22, v23, __ T1D, __ post(state, 32)); 4428 __ st1(v24, __ T1D, state); 4429 4430 // restore callee-saved registers 4431 __ ldpd(v14, v15, Address(sp, 48)); 4432 __ ldpd(v12, v13, Address(sp, 32)); 4433 __ ldpd(v10, v11, Address(sp, 16)); 4434 __ ldpd(v8, v9, __ post(sp, 64)); 4435 4436 __ ret(lr); 4437 4438 return start; 4439 } 4440 4441 // Inputs: 4442 // c_rarg0 - long[] state0 4443 // c_rarg1 - long[] state1 4444 address generate_double_keccak() { 4445 static const uint64_t round_consts[24] = { 4446 0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL, 4447 0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L, 4448 0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL, 4449 0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL, 4450 0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L, 4451 0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L, 4452 
0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L, 4453 0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L 4454 }; 4455 4456 // Implements the double_keccak() method of the 4457 // sun.security.provider.SHA3Parallel class 4458 __ align(CodeEntryAlignment); 4459 StubCodeMark mark(this, "StubRoutines", "double_keccak"); 4460 address start = __ pc(); 4461 __ enter(); 4462 4463 Register state0 = c_rarg0; 4464 Register state1 = c_rarg1; 4465 4466 Label rounds24_loop; 4467 4468 // save callee-saved registers 4469 __ stpd(v8, v9, __ pre(sp, -64)); 4470 __ stpd(v10, v11, Address(sp, 16)); 4471 __ stpd(v12, v13, Address(sp, 32)); 4472 __ stpd(v14, v15, Address(sp, 48)); 4473 4474 // load states 4475 __ add(rscratch1, state0, 32); 4476 __ ld4(v0, v1, v2, v3, __ D, 0, state0); 4477 __ ld4(v4, v5, v6, v7, __ D, 0, __ post(rscratch1, 32)); 4478 __ ld4(v8, v9, v10, v11, __ D, 0, __ post(rscratch1, 32)); 4479 __ ld4(v12, v13, v14, v15, __ D, 0, __ post(rscratch1, 32)); 4480 __ ld4(v16, v17, v18, v19, __ D, 0, __ post(rscratch1, 32)); 4481 __ ld4(v20, v21, v22, v23, __ D, 0, __ post(rscratch1, 32)); 4482 __ ld1(v24, __ D, 0, rscratch1); 4483 __ add(rscratch1, state1, 32); 4484 __ ld4(v0, v1, v2, v3, __ D, 1, state1); 4485 __ ld4(v4, v5, v6, v7, __ D, 1, __ post(rscratch1, 32)); 4486 __ ld4(v8, v9, v10, v11, __ D, 1, __ post(rscratch1, 32)); 4487 __ ld4(v12, v13, v14, v15, __ D, 1, __ post(rscratch1, 32)); 4488 __ ld4(v16, v17, v18, v19, __ D, 1, __ post(rscratch1, 32)); 4489 __ ld4(v20, v21, v22, v23, __ D, 1, __ post(rscratch1, 32)); 4490 __ ld1(v24, __ D, 1, rscratch1); 4491 4492 // 24 keccak rounds 4493 __ movw(rscratch2, 24); 4494 4495 // load round_constants base 4496 __ lea(rscratch1, ExternalAddress((address) round_consts)); 4497 4498 __ BIND(rounds24_loop); 4499 __ subw(rscratch2, rscratch2, 1); 4500 keccak_round(rscratch1); 4501 __ cbnzw(rscratch2, rounds24_loop); 4502 4503 __ st4(v0, v1, v2, v3, __ D, 0, __ post(state0, 32)); 4504 __ st4(v4, v5, v6, v7, __ D, 0, __ post(state0, 32)); 4505 __ st4(v8, v9, v10, v11, __ D, 0, __ post(state0, 32)); 4506 __ st4(v12, v13, v14, v15, __ D, 0, __ post(state0, 32)); 4507 __ st4(v16, v17, v18, v19, __ D, 0, __ post(state0, 32)); 4508 __ st4(v20, v21, v22, v23, __ D, 0, __ post(state0, 32)); 4509 __ st1(v24, __ D, 0, state0); 4510 __ st4(v0, v1, v2, v3, __ D, 1, __ post(state1, 32)); 4511 __ st4(v4, v5, v6, v7, __ D, 1, __ post(state1, 32)); 4512 __ st4(v8, v9, v10, v11, __ D, 1, __ post(state1, 32)); 4513 __ st4(v12, v13, v14, v15, __ D, 1, __ post(state1, 32)); 4514 __ st4(v16, v17, v18, v19, __ D, 1, __ post(state1, 32)); 4515 __ st4(v20, v21, v22, v23, __ D, 1, __ post(state1, 32)); 4516 __ st1(v24, __ D, 1, state1); 4517 4518 // restore callee-saved vector registers 4519 __ ldpd(v14, v15, Address(sp, 48)); 4520 __ ldpd(v12, v13, Address(sp, 32)); 4521 __ ldpd(v10, v11, Address(sp, 16)); 4522 __ ldpd(v8, v9, __ post(sp, 64)); 4523 4524 __ leave(); // required for proper stackwalking of RuntimeStub frame 4525 __ mov(r0, zr); // return 0 4526 __ ret(lr); 4527 4528 return start; 4529 } 4530 4531 // ChaCha20 block function. This version parallelizes the 32-bit 4532 // state elements on each of 16 vectors, producing 4 blocks of 4533 // keystream at a time.
4534 // 4535 // state (int[16]) = c_rarg0 4536 // keystream (byte[256]) = c_rarg1 4537 // return - number of bytes of produced keystream (always 256) 4538 // 4539 // This implementation takes each 32-bit integer from the state 4540 // array and broadcasts it across all 4 32-bit lanes of a vector register 4541 // (e.g. state[0] is replicated on all 4 lanes of v4, state[1] to all 4 lanes 4542 // of v5, etc.). Once all 16 elements have been broadcast onto 16 vectors, 4543 // the quarter round schedule is implemented as outlined in RFC 7539 section 4544 // 2.3. However, instead of sequentially processing the 3 quarter round 4545 // operations represented by one QUARTERROUND function, we instead stack all 4546 // the adds, xors and left-rotations from the first 4 quarter rounds together 4547 // and then do the same for the second set of 4 quarter rounds. This removes 4548 // some latency that would otherwise be incurred by waiting for an add to 4549 // complete before performing an xor (which depends on the result of the 4550 // add), etc. An adjustment happens between the first and second groups of 4 4551 // quarter rounds, but this is done only in the inputs to the macro functions 4552 // that generate the assembly instructions - these adjustments themselves are 4553 // not part of the resulting assembly. 4554 // The 4 registers v0-v3 are used during the quarter round operations as 4555 // scratch registers. Once the 20 rounds are complete, these 4 scratch 4556 // registers become the vectors involved in adding the start state back onto 4557 // the post-QR working state. After the adds are complete, each of the 16 4558 // vectors write their first lane back to the keystream buffer, followed 4559 // by the second lane from all vectors and so on. 4560 address generate_chacha20Block_blockpar() { 4561 Label L_twoRounds, L_cc20_const; 4562 // The constant data is broken into two 128-bit segments to be loaded 4563 // onto FloatRegisters. The first 128 bits are a counter add overlay 4564 // that adds +0/+1/+2/+3 to the vector holding replicated state[12]. 4565 // The second 128-bits is a table constant used for 8-bit left rotations. 4566 __ BIND(L_cc20_const); 4567 __ emit_int64(0x0000000100000000UL); 4568 __ emit_int64(0x0000000300000002UL); 4569 __ emit_int64(0x0605040702010003UL); 4570 __ emit_int64(0x0E0D0C0F0A09080BUL); 4571 4572 __ align(CodeEntryAlignment); 4573 StubId stub_id = StubId::stubgen_chacha20Block_id; 4574 StubCodeMark mark(this, stub_id); 4575 address start = __ pc(); 4576 __ enter(); 4577 4578 int i, j; 4579 const Register state = c_rarg0; 4580 const Register keystream = c_rarg1; 4581 const Register loopCtr = r10; 4582 const Register tmpAddr = r11; 4583 const FloatRegister ctrAddOverlay = v28; 4584 const FloatRegister lrot8Tbl = v29; 4585 4586 // Organize SIMD registers in an array that facilitates 4587 // putting repetitive opcodes into loop structures. It is 4588 // important that each grouping of 4 registers is monotonically 4589 // increasing to support the requirements of multi-register 4590 // instructions (e.g. ld4r, st4, etc.) 4591 const FloatRegister workSt[16] = { 4592 v4, v5, v6, v7, v16, v17, v18, v19, 4593 v20, v21, v22, v23, v24, v25, v26, v27 4594 }; 4595 4596 // Pull in constant data. The first 16 bytes are the add overlay 4597 // which is applied to the vector holding the counter (state[12]). 4598 // The second 16 bytes is the index register for the 8-bit left 4599 // rotation tbl instruction. 
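    // For reference (RFC 7539), one ChaCha20 quarter round on state words
    // (a, b, c, d) is:
    //   a += b; d ^= a; d <<<= 16;
    //   c += d; b ^= c; b <<<= 12;
    //   a += b; d ^= a; d <<<= 8;
    //   c += d; b ^= c; b <<<= 7;
    // Each cc20_qr_* call below applies one of these steps to four such
    // quadruples at once, one vector register per state word.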
4600 __ adr(tmpAddr, L_cc20_const); 4601 __ ldpq(ctrAddOverlay, lrot8Tbl, Address(tmpAddr)); 4602 4603 // Load from memory and interlace across 16 SIMD registers, 4604 // With each word from memory being broadcast to all lanes of 4605 // each successive SIMD register. 4606 // Addr(0) -> All lanes in workSt[i] 4607 // Addr(4) -> All lanes workSt[i + 1], etc. 4608 __ mov(tmpAddr, state); 4609 for (i = 0; i < 16; i += 4) { 4610 __ ld4r(workSt[i], workSt[i + 1], workSt[i + 2], workSt[i + 3], __ T4S, 4611 __ post(tmpAddr, 16)); 4612 } 4613 __ addv(workSt[12], __ T4S, workSt[12], ctrAddOverlay); // Add ctr overlay 4614 4615 // Before entering the loop, create 5 4-register arrays. These 4616 // will hold the 4 registers that represent the a/b/c/d fields 4617 // in the quarter round operation. For instance the "b" field 4618 // for the first 4 quarter round operations is the set of v16/v17/v18/v19, 4619 // but in the second 4 quarter rounds it gets adjusted to v17/v18/v19/v16 4620 // since it is part of a diagonal organization. The aSet and scratch 4621 // register sets are defined at declaration time because they do not change 4622 // organization at any point during the 20-round processing. 4623 FloatRegister aSet[4] = { v4, v5, v6, v7 }; 4624 FloatRegister bSet[4]; 4625 FloatRegister cSet[4]; 4626 FloatRegister dSet[4]; 4627 FloatRegister scratch[4] = { v0, v1, v2, v3 }; 4628 4629 // Set up the 10 iteration loop and perform all 8 quarter round ops 4630 __ mov(loopCtr, 10); 4631 __ BIND(L_twoRounds); 4632 4633 // Set to columnar organization and do the following 4 quarter-rounds: 4634 // QUARTERROUND(0, 4, 8, 12) 4635 // QUARTERROUND(1, 5, 9, 13) 4636 // QUARTERROUND(2, 6, 10, 14) 4637 // QUARTERROUND(3, 7, 11, 15) 4638 __ cc20_set_qr_registers(bSet, workSt, 4, 5, 6, 7); 4639 __ cc20_set_qr_registers(cSet, workSt, 8, 9, 10, 11); 4640 __ cc20_set_qr_registers(dSet, workSt, 12, 13, 14, 15); 4641 4642 __ cc20_qr_add4(aSet, bSet); // a += b 4643 __ cc20_qr_xor4(dSet, aSet, dSet); // d ^= a 4644 __ cc20_qr_lrot4(dSet, dSet, 16, lrot8Tbl); // d <<<= 16 4645 4646 __ cc20_qr_add4(cSet, dSet); // c += d 4647 __ cc20_qr_xor4(bSet, cSet, scratch); // b ^= c (scratch) 4648 __ cc20_qr_lrot4(scratch, bSet, 12, lrot8Tbl); // b <<<= 12 4649 4650 __ cc20_qr_add4(aSet, bSet); // a += b 4651 __ cc20_qr_xor4(dSet, aSet, dSet); // d ^= a 4652 __ cc20_qr_lrot4(dSet, dSet, 8, lrot8Tbl); // d <<<= 8 4653 4654 __ cc20_qr_add4(cSet, dSet); // c += d 4655 __ cc20_qr_xor4(bSet, cSet, scratch); // b ^= c (scratch) 4656 __ cc20_qr_lrot4(scratch, bSet, 7, lrot8Tbl); // b <<<= 12 4657 4658 // Set to diagonal organization and do the next 4 quarter-rounds: 4659 // QUARTERROUND(0, 5, 10, 15) 4660 // QUARTERROUND(1, 6, 11, 12) 4661 // QUARTERROUND(2, 7, 8, 13) 4662 // QUARTERROUND(3, 4, 9, 14) 4663 __ cc20_set_qr_registers(bSet, workSt, 5, 6, 7, 4); 4664 __ cc20_set_qr_registers(cSet, workSt, 10, 11, 8, 9); 4665 __ cc20_set_qr_registers(dSet, workSt, 15, 12, 13, 14); 4666 4667 __ cc20_qr_add4(aSet, bSet); // a += b 4668 __ cc20_qr_xor4(dSet, aSet, dSet); // d ^= a 4669 __ cc20_qr_lrot4(dSet, dSet, 16, lrot8Tbl); // d <<<= 16 4670 4671 __ cc20_qr_add4(cSet, dSet); // c += d 4672 __ cc20_qr_xor4(bSet, cSet, scratch); // b ^= c (scratch) 4673 __ cc20_qr_lrot4(scratch, bSet, 12, lrot8Tbl); // b <<<= 12 4674 4675 __ cc20_qr_add4(aSet, bSet); // a += b 4676 __ cc20_qr_xor4(dSet, aSet, dSet); // d ^= a 4677 __ cc20_qr_lrot4(dSet, dSet, 8, lrot8Tbl); // d <<<= 8 4678 4679 __ cc20_qr_add4(cSet, dSet); // c += d 4680 __ cc20_qr_xor4(bSet, 
cSet, scratch); // b ^= c (scratch) 4681 __ cc20_qr_lrot4(scratch, bSet, 7, lrot8Tbl); // b <<<= 12 4682 4683 // Decrement and iterate 4684 __ sub(loopCtr, loopCtr, 1); 4685 __ cbnz(loopCtr, L_twoRounds); 4686 4687 __ mov(tmpAddr, state); 4688 4689 // Add the starting state back to the post-loop keystream 4690 // state. We read/interlace the state array from memory into 4691 // 4 registers similar to what we did in the beginning. Then 4692 // add the counter overlay onto workSt[12] at the end. 4693 for (i = 0; i < 16; i += 4) { 4694 __ ld4r(v0, v1, v2, v3, __ T4S, __ post(tmpAddr, 16)); 4695 __ addv(workSt[i], __ T4S, workSt[i], v0); 4696 __ addv(workSt[i + 1], __ T4S, workSt[i + 1], v1); 4697 __ addv(workSt[i + 2], __ T4S, workSt[i + 2], v2); 4698 __ addv(workSt[i + 3], __ T4S, workSt[i + 3], v3); 4699 } 4700 __ addv(workSt[12], __ T4S, workSt[12], ctrAddOverlay); // Add ctr overlay 4701 4702 // Write working state into the keystream buffer. This is accomplished 4703 // by taking the lane "i" from each of the four vectors and writing 4704 // it to consecutive 4-byte offsets, then post-incrementing by 16 and 4705 // repeating with the next 4 vectors until all 16 vectors have been used. 4706 // Then move to the next lane and repeat the process until all lanes have 4707 // been written. 4708 for (i = 0; i < 4; i++) { 4709 for (j = 0; j < 16; j += 4) { 4710 __ st4(workSt[j], workSt[j + 1], workSt[j + 2], workSt[j + 3], __ S, i, 4711 __ post(keystream, 16)); 4712 } 4713 } 4714 4715 __ mov(r0, 256); // Return length of output keystream 4716 __ leave(); 4717 __ ret(lr); 4718 4719 return start; 4720 } 4721 4722 // Helpers to schedule parallel operation bundles across vector 4723 // register sequences of size 2, 4 or 8. 4724 4725 // Implement various primitive computations across vector sequences 4726 4727 template<int N> 4728 void vs_addv(const VSeq<N>& v, Assembler::SIMD_Arrangement T, 4729 const VSeq<N>& v1, const VSeq<N>& v2) { 4730 // output must not be constant 4731 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4732 // output cannot overwrite pending inputs 4733 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4734 assert(!vs_write_before_read(v, v2), "output overwrites input"); 4735 for (int i = 0; i < N; i++) { 4736 __ addv(v[i], T, v1[i], v2[i]); 4737 } 4738 } 4739 4740 template<int N> 4741 void vs_subv(const VSeq<N>& v, Assembler::SIMD_Arrangement T, 4742 const VSeq<N>& v1, const VSeq<N>& v2) { 4743 // output must not be constant 4744 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4745 // output cannot overwrite pending inputs 4746 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4747 assert(!vs_write_before_read(v, v2), "output overwrites input"); 4748 for (int i = 0; i < N; i++) { 4749 __ subv(v[i], T, v1[i], v2[i]); 4750 } 4751 } 4752 4753 template<int N> 4754 void vs_mulv(const VSeq<N>& v, Assembler::SIMD_Arrangement T, 4755 const VSeq<N>& v1, const VSeq<N>& v2) { 4756 // output must not be constant 4757 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4758 // output cannot overwrite pending inputs 4759 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4760 assert(!vs_write_before_read(v, v2), "output overwrites input"); 4761 for (int i = 0; i < N; i++) { 4762 __ mulv(v[i], T, v1[i], v2[i]); 4763 } 4764 } 4765 4766 template<int N> 4767 void vs_negr(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& 
v1) { 4768 // output must not be constant 4769 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4770 // output cannot overwrite pending inputs 4771 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4772 for (int i = 0; i < N; i++) { 4773 __ negr(v[i], T, v1[i]); 4774 } 4775 } 4776 4777 template<int N> 4778 void vs_sshr(const VSeq<N>& v, Assembler::SIMD_Arrangement T, 4779 const VSeq<N>& v1, int shift) { 4780 // output must not be constant 4781 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4782 // output cannot overwrite pending inputs 4783 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4784 for (int i = 0; i < N; i++) { 4785 __ sshr(v[i], T, v1[i], shift); 4786 } 4787 } 4788 4789 template<int N> 4790 void vs_andr(const VSeq<N>& v, const VSeq<N>& v1, const VSeq<N>& v2) { 4791 // output must not be constant 4792 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4793 // output cannot overwrite pending inputs 4794 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4795 assert(!vs_write_before_read(v, v2), "output overwrites input"); 4796 for (int i = 0; i < N; i++) { 4797 __ andr(v[i], __ T16B, v1[i], v2[i]); 4798 } 4799 } 4800 4801 template<int N> 4802 void vs_orr(const VSeq<N>& v, const VSeq<N>& v1, const VSeq<N>& v2) { 4803 // output must not be constant 4804 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4805 // output cannot overwrite pending inputs 4806 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4807 assert(!vs_write_before_read(v, v2), "output overwrites input"); 4808 for (int i = 0; i < N; i++) { 4809 __ orr(v[i], __ T16B, v1[i], v2[i]); 4810 } 4811 } 4812 4813 template<int N> 4814 void vs_notr(const VSeq<N>& v, const VSeq<N>& v1) { 4815 // output must not be constant 4816 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4817 // output cannot overwrite pending inputs 4818 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4819 for (int i = 0; i < N; i++) { 4820 __ notr(v[i], __ T16B, v1[i]); 4821 } 4822 } 4823 4824 template<int N> 4825 void vs_sqdmulh(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1, const VSeq<N>& v2) { 4826 // output must not be constant 4827 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4828 // output cannot overwrite pending inputs 4829 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4830 assert(!vs_write_before_read(v, v2), "output overwrites input"); 4831 for (int i = 0; i < N; i++) { 4832 __ sqdmulh(v[i], T, v1[i], v2[i]); 4833 } 4834 } 4835 4836 template<int N> 4837 void vs_mlsv(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1, VSeq<N>& v2) { 4838 // output must not be constant 4839 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4840 // output cannot overwrite pending inputs 4841 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4842 assert(!vs_write_before_read(v, v2), "output overwrites input"); 4843 for (int i = 0; i < N; i++) { 4844 __ mlsv(v[i], T, v1[i], v2[i]); 4845 } 4846 } 4847 4848 // load N/2 successive pairs of quadword values from memory in order 4849 // into N successive vector registers of the sequence via the 4850 // address supplied in base. 
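  // For example (illustrative), with N == 4 the helper below issues two ldpq
  // instructions, filling v[0]/v[1] and then v[2]/v[3] from the given base.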
4851 template<int N> 4852 void vs_ldpq(const VSeq<N>& v, Register base) { 4853 for (int i = 0; i < N; i += 2) { 4854 __ ldpq(v[i], v[i+1], Address(base, 32 * i)); 4855 } 4856 } 4857 4858 // load N/2 successive pairs of quadword values from memory in order 4859 // into N vector registers of the sequence via the address supplied 4860 // in base using post-increment addressing 4861 template<int N> 4862 void vs_ldpq_post(const VSeq<N>& v, Register base) { 4863 static_assert((N & (N - 1)) == 0, "sequence length must be even"); 4864 for (int i = 0; i < N; i += 2) { 4865 __ ldpq(v[i], v[i+1], __ post(base, 32)); 4866 } 4867 } 4868 4869 // store N successive vector registers of the sequence into N/2 4870 // successive pairs of quadword memory locations via the address 4871 // supplied in base using post-increment addressing 4872 template<int N> 4873 void vs_stpq_post(const VSeq<N>& v, Register base) { 4874 static_assert((N & (N - 1)) == 0, "sequence length must be even"); 4875 for (int i = 0; i < N; i += 2) { 4876 __ stpq(v[i], v[i+1], __ post(base, 32)); 4877 } 4878 } 4879 4880 // load N/2 pairs of quadword values from memory de-interleaved into 4881 // N vector registers 2 at a time via the address supplied in base 4882 // using post-increment addressing. 4883 template<int N> 4884 void vs_ld2_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) { 4885 static_assert((N & (N - 1)) == 0, "sequence length must be even"); 4886 for (int i = 0; i < N; i += 2) { 4887 __ ld2(v[i], v[i+1], T, __ post(base, 32)); 4888 } 4889 } 4890 4891 // store N vector registers interleaved into N/2 pairs of quadword 4892 // memory locations via the address supplied in base using 4893 // post-increment addressing. 4894 template<int N> 4895 void vs_st2_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) { 4896 static_assert((N & (N - 1)) == 0, "sequence length must be even"); 4897 for (int i = 0; i < N; i += 2) { 4898 __ st2(v[i], v[i+1], T, __ post(base, 32)); 4899 } 4900 } 4901 4902 // load N quadword values from memory de-interleaved into N vector 4903 // registers 3 elements at a time via the address supplied in base. 4904 template<int N> 4905 void vs_ld3(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) { 4906 static_assert(N == ((N / 3) * 3), "sequence length must be multiple of 3"); 4907 for (int i = 0; i < N; i += 3) { 4908 __ ld3(v[i], v[i+1], v[i+2], T, base); 4909 } 4910 } 4911 4912 // load N quadword values from memory de-interleaved into N vector 4913 // registers 3 elements at a time via the address supplied in base 4914 // using post-increment addressing. 
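  // For example (illustrative), with N == 6 the helper below issues two ld3
  // multi-register loads, post-incrementing base by 48 bytes after each.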
4915 template<int N>
4916 void vs_ld3_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
4917 static_assert(N == ((N / 3) * 3), "sequence length must be multiple of 3");
4918 for (int i = 0; i < N; i += 3) {
4919 __ ld3(v[i], v[i+1], v[i+2], T, __ post(base, 48));
4920 }
4921 }
4922
4923 // load N/2 pairs of quadword values from memory into N vector
4924 // registers via the address supplied in base with each pair indexed
4925 // using the start offset plus the corresponding entry in the
4926 // offsets array
4927 template<int N>
4928 void vs_ldpq_indexed(const VSeq<N>& v, Register base, int start, int (&offsets)[N/2]) {
4929 for (int i = 0; i < N/2; i++) {
4930 __ ldpq(v[2*i], v[2*i+1], Address(base, start + offsets[i]));
4931 }
4932 }
4933
4934 // store N vector registers into N/2 pairs of quadword memory
4935 // locations via the address supplied in base with each pair indexed
4936 // using the start offset plus the corresponding entry in the
4937 // offsets array
4938 template<int N>
4939 void vs_stpq_indexed(const VSeq<N>& v, Register base, int start, int offsets[N/2]) {
4940 for (int i = 0; i < N/2; i++) {
4941 __ stpq(v[2*i], v[2*i+1], Address(base, start + offsets[i]));
4942 }
4943 }
4944
4945 // load N single quadword values from memory into N vector registers
4946 // via the address supplied in base with each value indexed using
4947 // the start offset plus the corresponding entry in the offsets
4948 // array
4949 template<int N>
4950 void vs_ldr_indexed(const VSeq<N>& v, Assembler::SIMD_RegVariant T, Register base,
4951 int start, int (&offsets)[N]) {
4952 for (int i = 0; i < N; i++) {
4953 __ ldr(v[i], T, Address(base, start + offsets[i]));
4954 }
4955 }
4956
4957 // store N vector registers into N single quadword memory locations
4958 // via the address supplied in base with each value indexed using
4959 // the start offset plus the corresponding entry in the offsets
4960 // array
4961 template<int N>
4962 void vs_str_indexed(const VSeq<N>& v, Assembler::SIMD_RegVariant T, Register base,
4963 int start, int (&offsets)[N]) {
4964 for (int i = 0; i < N; i++) {
4965 __ str(v[i], T, Address(base, start + offsets[i]));
4966 }
4967 }
4968
4969 // load N/2 pairs of quadword values from memory de-interleaved into
4970 // N vector registers 2 at a time via the address supplied in base
4971 // with each pair indexed using the start offset plus the
4972 // corresponding entry in the offsets array
4973 template<int N>
4974 void vs_ld2_indexed(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base,
4975 Register tmp, int start, int (&offsets)[N/2]) {
4976 for (int i = 0; i < N/2; i++) {
4977 __ add(tmp, base, start + offsets[i]);
4978 __ ld2(v[2*i], v[2*i+1], T, tmp);
4979 }
4980 }
4981
4982 // store N vector registers 2 at a time interleaved into N/2 pairs
4983 // of quadword memory locations via the address supplied in base
4984 // with each pair indexed using the start offset plus the
4985 // corresponding entry in the offsets array
4986 template<int N>
4987 void vs_st2_indexed(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base,
4988 Register tmp, int start, int (&offsets)[N/2]) {
4989 for (int i = 0; i < N/2; i++) {
4990 __ add(tmp, base, start + offsets[i]);
4991 __ st2(v[2*i], v[2*i+1], T, tmp);
4992 }
4993 }
4994
4995 // Helper routines for various flavours of Montgomery multiply
4996
4997 // Perform 16 32-bit (4x4S) or 32 16-bit (4 x 8H) Montgomery
4998 // multiplications in parallel
4999 //
5000
5001 // See the
montMul() method of the sun.security.provider.ML_DSA
5002 // class.
5003 //
5004 // Computes 4x4S results or 4x8H results
5005 // a = b * c * 2^MONT_R_BITS mod MONT_Q
5006 // Inputs: vb, vc - 4x4S or 4x8H vector register sequences
5007 // vq - 2x4S or 2x8H constants <MONT_Q, MONT_Q_INV_MOD_R>
5008 // Temps: vtmp - 4x4S or 4x8H vector sequence trashed after call
5009 // Outputs: va - 4x4S or 4x8H vector register sequences
5010 // vb, vc, vtmp and vq must all be disjoint
5011 // va must be disjoint from all other inputs/temps or must equal vc
5012 // va must have a non-zero delta i.e. it must not be a constant vseq.
5013 // n.b. MONT_R_BITS is 16 or 32, so the right shift by it is implicit.
5014 void vs_montmul4(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
5015 Assembler::SIMD_Arrangement T,
5016 const VSeq<4>& vtmp, const VSeq<2>& vq) {
5017 assert (T == __ T4S || T == __ T8H, "invalid arrangement for montmul");
5018 assert(vs_disjoint(vb, vc), "vb and vc overlap");
5019 assert(vs_disjoint(vb, vq), "vb and vq overlap");
5020 assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
5021
5022 assert(vs_disjoint(vc, vq), "vc and vq overlap");
5023 assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
5024
5025 assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
5026
5027 assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
5028 assert(vs_disjoint(va, vb), "va and vb overlap");
5029 assert(vs_disjoint(va, vq), "va and vq overlap");
5030 assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
5031 assert(!va.is_constant(), "output vector must identify 4 different registers");
5032
5033 // schedule 4 streams of instructions across the vector sequences
5034 for (int i = 0; i < 4; i++) {
5035 __ sqdmulh(vtmp[i], T, vb[i], vc[i]); // aHigh = hi32(2 * b * c)
5036 __ mulv(va[i], T, vb[i], vc[i]); // aLow = lo32(b * c)
5037 }
5038
5039 for (int i = 0; i < 4; i++) {
5040 __ mulv(va[i], T, va[i], vq[0]); // m = aLow * qinv
5041 }
5042
5043 for (int i = 0; i < 4; i++) {
5044 __ sqdmulh(va[i], T, va[i], vq[1]); // n = hi32(2 * m * q)
5045 }
5046
5047 for (int i = 0; i < 4; i++) {
5048 __ shsubv(va[i], T, vtmp[i], va[i]); // a = (aHigh - n) / 2
5049 }
5050 }
5051
5052 // Perform 8 32-bit (2x4S) or 16 16-bit (2 x 8H) Montgomery
5053 // multiplications in parallel
5054 //
5055
5056 // See the montMul() method of the sun.security.provider.ML_DSA
5057 // class.
5058 //
5059 // Computes 2x4S results or 2x8H results
5060 // a = b * c * 2^MONT_R_BITS mod MONT_Q
5061 // Inputs: vb, vc - 2x4S or 2x8H vector register sequences
5062 // vq - 2x4S or 2x8H constants <MONT_Q, MONT_Q_INV_MOD_R>
5063 // Temps: vtmp - 2x4S or 2x8H vector sequence trashed after call
5064 // Outputs: va - 2x4S or 2x8H vector register sequences
5065 // vb, vc, vtmp and vq must all be disjoint
5066 // va must be disjoint from all other inputs/temps or must equal vc
5067 // va must have a non-zero delta i.e. it must not be a constant vseq.
5068 // n.b. MONT_R_BITS is 16 or 32, so the right shift by it is implicit.
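  // A scalar sketch of the per-lane computation the montmul helpers
  // below schedule (illustrative only, mirroring the instruction
  // comments and ignoring saturation; shown for the 8H case with
  // q = MONT_Q and qinv = MONT_Q_INV_MOD_R; montmul_lane is a
  // hypothetical helper, not part of this file):
  //
  //   int16_t montmul_lane(int16_t b, int16_t c, int16_t q, int16_t qinv) {
  //     int16_t aHigh = (int16_t)(((int32_t)b * c) >> 15);   // sqdmulh: hi(2 * b * c)
  //     int16_t aLow  = (int16_t)((int32_t)b * c);           // mulv:    lo(b * c)
  //     int16_t m     = (int16_t)((int32_t)aLow * qinv);     // mulv:    m = aLow * qinv
  //     int16_t n     = (int16_t)(((int32_t)m * q) >> 15);   // sqdmulh: hi(2 * m * q)
  //     return (int16_t)((aHigh - n) >> 1);                  // shsubv:  (aHigh - n) / 2
  //   }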
5069 void vs_montmul2(const VSeq<2>& va, const VSeq<2>& vb, const VSeq<2>& vc, 5070 Assembler::SIMD_Arrangement T, 5071 const VSeq<2>& vtmp, const VSeq<2>& vq) { 5072 assert (T == __ T4S || T == __ T8H, "invalid arrangement for montmul"); 5073 assert(vs_disjoint(vb, vc), "vb and vc overlap"); 5074 assert(vs_disjoint(vb, vq), "vb and vq overlap"); 5075 assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap"); 5076 5077 assert(vs_disjoint(vc, vq), "vc and vq overlap"); 5078 assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap"); 5079 5080 assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap"); 5081 5082 assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal"); 5083 assert(vs_disjoint(va, vb), "va and vb overlap"); 5084 assert(vs_disjoint(va, vq), "va and vq overlap"); 5085 assert(vs_disjoint(va, vtmp), "va and vtmp overlap"); 5086 assert(!va.is_constant(), "output vector must identify 2 different registers"); 5087 5088 // schedule 2 streams of instructions across the vector sequences 5089 for (int i = 0; i < 2; i++) { 5090 __ sqdmulh(vtmp[i], T, vb[i], vc[i]); // aHigh = hi32(2 * b * c) 5091 __ mulv(va[i], T, vb[i], vc[i]); // aLow = lo32(b * c) 5092 } 5093 5094 for (int i = 0; i < 2; i++) { 5095 __ mulv(va[i], T, va[i], vq[0]); // m = aLow * qinv 5096 } 5097 5098 for (int i = 0; i < 2; i++) { 5099 __ sqdmulh(va[i], T, va[i], vq[1]); // n = hi32(2 * m * q) 5100 } 5101 5102 for (int i = 0; i < 2; i++) { 5103 __ shsubv(va[i], T, vtmp[i], va[i]); // a = (aHigh - n) / 2 5104 } 5105 } 5106 5107 // Perform 16 16-bit Montgomery multiplications in parallel. 5108 void kyber_montmul16(const VSeq<2>& va, const VSeq<2>& vb, const VSeq<2>& vc, 5109 const VSeq<2>& vtmp, const VSeq<2>& vq) { 5110 // Use the helper routine to schedule a 2x8H Montgomery multiply. 5111 // It will assert that the register use is valid 5112 vs_montmul2(va, vb, vc, __ T8H, vtmp, vq); 5113 } 5114 5115 // Perform 32 16-bit Montgomery multiplications in parallel. 5116 void kyber_montmul32(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc, 5117 const VSeq<4>& vtmp, const VSeq<2>& vq) { 5118 // Use the helper routine to schedule a 4x8H Montgomery multiply. 5119 // It will assert that the register use is valid 5120 vs_montmul4(va, vb, vc, __ T8H, vtmp, vq); 5121 } 5122 5123 // Perform 64 16-bit Montgomery multiplications in parallel. 5124 void kyber_montmul64(const VSeq<8>& va, const VSeq<8>& vb, const VSeq<8>& vc, 5125 const VSeq<4>& vtmp, const VSeq<2>& vq) { 5126 // Schedule two successive 4x8H multiplies via the montmul helper 5127 // on the front and back halves of va, vb and vc. The helper will 5128 // assert that the register use has no overlap conflicts on each 5129 // individual call but we also need to ensure that the necessary 5130 // disjoint/equality constraints are met across both calls. 5131 5132 // vb, vc, vtmp and vq must be disjoint. 
va must either be 5133 // disjoint from all other registers or equal vc 5134 5135 assert(vs_disjoint(vb, vc), "vb and vc overlap"); 5136 assert(vs_disjoint(vb, vq), "vb and vq overlap"); 5137 assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap"); 5138 5139 assert(vs_disjoint(vc, vq), "vc and vq overlap"); 5140 assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap"); 5141 5142 assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap"); 5143 5144 assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal"); 5145 assert(vs_disjoint(va, vb), "va and vb overlap"); 5146 assert(vs_disjoint(va, vq), "va and vq overlap"); 5147 assert(vs_disjoint(va, vtmp), "va and vtmp overlap"); 5148 5149 // we multiply the front and back halves of each sequence 4 at a 5150 // time because 5151 // 5152 // 1) we are currently only able to get 4-way instruction 5153 // parallelism at best 5154 // 5155 // 2) we need registers for the constants in vq and temporary 5156 // scratch registers to hold intermediate results so vtmp can only 5157 // be a VSeq<4> which means we only have 4 scratch slots 5158 5159 vs_montmul4(vs_front(va), vs_front(vb), vs_front(vc), __ T8H, vtmp, vq); 5160 vs_montmul4(vs_back(va), vs_back(vb), vs_back(vc), __ T8H, vtmp, vq); 5161 } 5162 5163 void kyber_montmul32_sub_add(const VSeq<4>& va0, const VSeq<4>& va1, 5164 const VSeq<4>& vc, 5165 const VSeq<4>& vtmp, 5166 const VSeq<2>& vq) { 5167 // compute a = montmul(a1, c) 5168 kyber_montmul32(vc, va1, vc, vtmp, vq); 5169 // ouptut a1 = a0 - a 5170 vs_subv(va1, __ T8H, va0, vc); 5171 // and a0 = a0 + a 5172 vs_addv(va0, __ T8H, va0, vc); 5173 } 5174 5175 void kyber_sub_add_montmul32(const VSeq<4>& va0, const VSeq<4>& va1, 5176 const VSeq<4>& vb, 5177 const VSeq<4>& vtmp1, 5178 const VSeq<4>& vtmp2, 5179 const VSeq<2>& vq) { 5180 // compute c = a0 - a1 5181 vs_subv(vtmp1, __ T8H, va0, va1); 5182 // output a0 = a0 + a1 5183 vs_addv(va0, __ T8H, va0, va1); 5184 // output a1 = b montmul c 5185 kyber_montmul32(va1, vtmp1, vb, vtmp2, vq); 5186 } 5187 5188 void load64shorts(const VSeq<8>& v, Register shorts) { 5189 vs_ldpq_post(v, shorts); 5190 } 5191 5192 void load32shorts(const VSeq<4>& v, Register shorts) { 5193 vs_ldpq_post(v, shorts); 5194 } 5195 5196 void store64shorts(VSeq<8> v, Register tmpAddr) { 5197 vs_stpq_post(v, tmpAddr); 5198 } 5199 5200 // Kyber NTT function. 5201 // Implements 5202 // static int implKyberNtt(short[] poly, short[] ntt_zetas) {} 5203 // 5204 // coeffs (short[256]) = c_rarg0 5205 // ntt_zetas (short[256]) = c_rarg1 5206 address generate_kyberNtt() { 5207 5208 __ align(CodeEntryAlignment); 5209 StubId stub_id = StubId::stubgen_kyberNtt_id; 5210 StubCodeMark mark(this, stub_id); 5211 address start = __ pc(); 5212 __ enter(); 5213 5214 const Register coeffs = c_rarg0; 5215 const Register zetas = c_rarg1; 5216 5217 const Register kyberConsts = r10; 5218 const Register tmpAddr = r11; 5219 5220 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x8H inputs/outputs 5221 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3 5222 VSeq<2> vq(30); // n.b. constants overlap vs3 5223 5224 __ lea(kyberConsts, ExternalAddress((address) StubRoutines::aarch64::_kyberConsts)); 5225 // load the montmul constants 5226 vs_ldpq(vq, kyberConsts); 5227 5228 // Each level corresponds to an iteration of the outermost loop of the 5229 // Java method seilerNTT(int[] coeffs). There are some differences 5230 // from what is done in the seilerNTT() method, though: 5231 // 1. 
The computation uses 16-bit signed values; we do not convert them
5232 // to ints here.
5233 // 2. The zetas are delivered in a bigger array: 128 zetas are stored
5234 // in this array for each level, which makes it easier to fill up the
5235 // vector registers.
5236 // 3. In the seilerNTT() method we use R = 2^20 for the Montgomery
5237 // multiplications (this is because that way there should not be any
5238 // overflow during the inverse NTT computation), here we use R = 2^16 so
5239 // that we can use the 16-bit arithmetic in the vector unit.
5240 //
5241 // On each level, we fill up the vector registers in such a way that the
5242 // array elements that need to be multiplied by the zetas go into one
5243 // set of vector registers while the corresponding ones that don't need to
5244 // be multiplied go into another set.
5245 // We can do 32 Montgomery multiplications in parallel, using 12 vector
5246 // registers interleaving the steps of 4 identical computations,
5247 // each done on 8 16-bit values per register.
5248
5249 // At levels 0-3 the coefficients that are multiplied by the zetas, or
5250 // added to/subtracted from the products, occur in discrete blocks whose
5251 // size is some multiple of 32.
5252
5253 // level 0
5254 __ add(tmpAddr, coeffs, 256);
5255 load64shorts(vs1, tmpAddr);
5256 load64shorts(vs2, zetas);
5257 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5258 __ add(tmpAddr, coeffs, 0);
5259 load64shorts(vs1, tmpAddr);
5260 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5261 vs_addv(vs1, __ T8H, vs1, vs2);
5262 __ add(tmpAddr, coeffs, 0);
5263 vs_stpq_post(vs1, tmpAddr);
5264 __ add(tmpAddr, coeffs, 256);
5265 vs_stpq_post(vs3, tmpAddr);
5266 // restore montmul constants
5267 vs_ldpq(vq, kyberConsts);
5268 load64shorts(vs1, tmpAddr);
5269 load64shorts(vs2, zetas);
5270 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5271 __ add(tmpAddr, coeffs, 128);
5272 load64shorts(vs1, tmpAddr);
5273 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5274 vs_addv(vs1, __ T8H, vs1, vs2);
5275 __ add(tmpAddr, coeffs, 128);
5276 store64shorts(vs1, tmpAddr);
5277 __ add(tmpAddr, coeffs, 384);
5278 store64shorts(vs3, tmpAddr);
5279
5280 // level 1
5281 // restore montmul constants
5282 vs_ldpq(vq, kyberConsts);
5283 __ add(tmpAddr, coeffs, 128);
5284 load64shorts(vs1, tmpAddr);
5285 load64shorts(vs2, zetas);
5286 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5287 __ add(tmpAddr, coeffs, 0);
5288 load64shorts(vs1, tmpAddr);
5289 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5290 vs_addv(vs1, __ T8H, vs1, vs2);
5291 __ add(tmpAddr, coeffs, 0);
5292 store64shorts(vs1, tmpAddr);
5293 store64shorts(vs3, tmpAddr);
5294 vs_ldpq(vq, kyberConsts);
5295 __ add(tmpAddr, coeffs, 384);
5296 load64shorts(vs1, tmpAddr);
5297 load64shorts(vs2, zetas);
5298 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5299 __ add(tmpAddr, coeffs, 256);
5300 load64shorts(vs1, tmpAddr);
5301 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5302 vs_addv(vs1, __ T8H, vs1, vs2);
5303 __ add(tmpAddr, coeffs, 256);
5304 store64shorts(vs1, tmpAddr);
5305 store64shorts(vs3, tmpAddr);
5306
5307 // level 2
5308 vs_ldpq(vq, kyberConsts);
5309 int offsets1[4] = { 0, 32, 128, 160 };
5310 vs_ldpq_indexed(vs1, coeffs, 64, offsets1);
5311 load64shorts(vs2, zetas);
5312 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5313 vs_ldpq_indexed(vs1, coeffs, 0, offsets1);
5314 // kyber_subv_addv64();
5315 vs_subv(vs3, __ T8H, vs1, vs2); // n.b.
trashes vq 5316 vs_addv(vs1, __ T8H, vs1, vs2); 5317 __ add(tmpAddr, coeffs, 0); 5318 vs_stpq_post(vs_front(vs1), tmpAddr); 5319 vs_stpq_post(vs_front(vs3), tmpAddr); 5320 vs_stpq_post(vs_back(vs1), tmpAddr); 5321 vs_stpq_post(vs_back(vs3), tmpAddr); 5322 vs_ldpq(vq, kyberConsts); 5323 vs_ldpq_indexed(vs1, tmpAddr, 64, offsets1); 5324 load64shorts(vs2, zetas); 5325 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5326 vs_ldpq_indexed(vs1, coeffs, 256, offsets1); 5327 // kyber_subv_addv64(); 5328 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5329 vs_addv(vs1, __ T8H, vs1, vs2); 5330 __ add(tmpAddr, coeffs, 256); 5331 vs_stpq_post(vs_front(vs1), tmpAddr); 5332 vs_stpq_post(vs_front(vs3), tmpAddr); 5333 vs_stpq_post(vs_back(vs1), tmpAddr); 5334 vs_stpq_post(vs_back(vs3), tmpAddr); 5335 5336 // level 3 5337 vs_ldpq(vq, kyberConsts); 5338 int offsets2[4] = { 0, 64, 128, 192 }; 5339 vs_ldpq_indexed(vs1, coeffs, 32, offsets2); 5340 load64shorts(vs2, zetas); 5341 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5342 vs_ldpq_indexed(vs1, coeffs, 0, offsets2); 5343 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5344 vs_addv(vs1, __ T8H, vs1, vs2); 5345 vs_stpq_indexed(vs1, coeffs, 0, offsets2); 5346 vs_stpq_indexed(vs3, coeffs, 32, offsets2); 5347 5348 vs_ldpq(vq, kyberConsts); 5349 vs_ldpq_indexed(vs1, coeffs, 256 + 32, offsets2); 5350 load64shorts(vs2, zetas); 5351 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5352 vs_ldpq_indexed(vs1, coeffs, 256, offsets2); 5353 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5354 vs_addv(vs1, __ T8H, vs1, vs2); 5355 vs_stpq_indexed(vs1, coeffs, 256, offsets2); 5356 vs_stpq_indexed(vs3, coeffs, 256 + 32, offsets2); 5357 5358 // level 4 5359 // At level 4 coefficients occur in 8 discrete blocks of size 16 5360 // so they are loaded using employing an ldr at 8 distinct offsets. 5361 5362 vs_ldpq(vq, kyberConsts); 5363 int offsets3[8] = { 0, 32, 64, 96, 128, 160, 192, 224 }; 5364 vs_ldr_indexed(vs1, __ Q, coeffs, 16, offsets3); 5365 load64shorts(vs2, zetas); 5366 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5367 vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3); 5368 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5369 vs_addv(vs1, __ T8H, vs1, vs2); 5370 vs_str_indexed(vs1, __ Q, coeffs, 0, offsets3); 5371 vs_str_indexed(vs3, __ Q, coeffs, 16, offsets3); 5372 5373 vs_ldpq(vq, kyberConsts); 5374 vs_ldr_indexed(vs1, __ Q, coeffs, 256 + 16, offsets3); 5375 load64shorts(vs2, zetas); 5376 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5377 vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3); 5378 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5379 vs_addv(vs1, __ T8H, vs1, vs2); 5380 vs_str_indexed(vs1, __ Q, coeffs, 256, offsets3); 5381 vs_str_indexed(vs3, __ Q, coeffs, 256 + 16, offsets3); 5382 5383 // level 5 5384 // At level 5 related coefficients occur in discrete blocks of size 8 so 5385 // need to be loaded interleaved using an ld2 operation with arrangement 2D. 
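    // For illustration (standard ld2 de-interleave semantics): each
    //   __ ld2(vA, vB, __ T2D, addr);
    // issued below (vA and vB being consecutive registers of vs1)
    // reads 4 consecutive doublewords d0..d3 (32 bytes) and splits
    // them so that vA = <d0, d2> and vB = <d1, d3>. The even
    // registers of vs1 therefore collect the coefficients that are
    // only added/subtracted while the odd registers collect the ones
    // that get montmul'ed by the zetas, which is what
    // kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), ...) expects.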
5386 5387 vs_ldpq(vq, kyberConsts); 5388 int offsets4[4] = { 0, 32, 64, 96 }; 5389 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4); 5390 load32shorts(vs_front(vs2), zetas); 5391 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq); 5392 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4); 5393 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4); 5394 load32shorts(vs_front(vs2), zetas); 5395 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq); 5396 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4); 5397 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4); 5398 load32shorts(vs_front(vs2), zetas); 5399 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq); 5400 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4); 5401 5402 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4); 5403 load32shorts(vs_front(vs2), zetas); 5404 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq); 5405 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4); 5406 5407 // level 6 5408 // At level 6 related coefficients occur in discrete blocks of size 4 so 5409 // need to be loaded interleaved using an ld2 operation with arrangement 4S. 5410 5411 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4); 5412 load32shorts(vs_front(vs2), zetas); 5413 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq); 5414 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4); 5415 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4); 5416 // __ ldpq(v18, v19, __ post(zetas, 32)); 5417 load32shorts(vs_front(vs2), zetas); 5418 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq); 5419 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4); 5420 5421 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4); 5422 load32shorts(vs_front(vs2), zetas); 5423 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq); 5424 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4); 5425 5426 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4); 5427 load32shorts(vs_front(vs2), zetas); 5428 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq); 5429 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4); 5430 5431 __ leave(); // required for proper stackwalking of RuntimeStub frame 5432 __ mov(r0, zr); // return 0 5433 __ ret(lr); 5434 5435 return start; 5436 } 5437 5438 // Kyber Inverse NTT function 5439 // Implements 5440 // static int implKyberInverseNtt(short[] poly, short[] zetas) {} 5441 // 5442 // coeffs (short[256]) = c_rarg0 5443 // ntt_zetas (short[256]) = c_rarg1 5444 address generate_kyberInverseNtt() { 5445 5446 __ align(CodeEntryAlignment); 5447 StubId stub_id = StubId::stubgen_kyberInverseNtt_id; 5448 StubCodeMark mark(this, stub_id); 5449 address start = __ pc(); 5450 __ enter(); 5451 5452 const Register coeffs = c_rarg0; 5453 const Register zetas = c_rarg1; 5454 5455 const Register kyberConsts = r10; 5456 const Register tmpAddr = r11; 5457 const Register tmpAddr2 = c_rarg2; 5458 5459 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x8H inputs/outputs 5460 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3 5461 VSeq<2> vq(30); // n.b. 
constants overlap vs3 5462 5463 __ lea(kyberConsts, 5464 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts)); 5465 5466 // level 0 5467 // At level 0 related coefficients occur in discrete blocks of size 4 so 5468 // need to be loaded interleaved using an ld2 operation with arrangement 4S. 5469 5470 vs_ldpq(vq, kyberConsts); 5471 int offsets4[4] = { 0, 32, 64, 96 }; 5472 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4); 5473 load32shorts(vs_front(vs2), zetas); 5474 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1), 5475 vs_front(vs2), vs_back(vs2), vtmp, vq); 5476 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4); 5477 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4); 5478 load32shorts(vs_front(vs2), zetas); 5479 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1), 5480 vs_front(vs2), vs_back(vs2), vtmp, vq); 5481 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4); 5482 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4); 5483 load32shorts(vs_front(vs2), zetas); 5484 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1), 5485 vs_front(vs2), vs_back(vs2), vtmp, vq); 5486 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4); 5487 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4); 5488 load32shorts(vs_front(vs2), zetas); 5489 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1), 5490 vs_front(vs2), vs_back(vs2), vtmp, vq); 5491 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4); 5492 5493 // level 1 5494 // At level 1 related coefficients occur in discrete blocks of size 8 so 5495 // need to be loaded interleaved using an ld2 operation with arrangement 2D. 5496 5497 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4); 5498 load32shorts(vs_front(vs2), zetas); 5499 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1), 5500 vs_front(vs2), vs_back(vs2), vtmp, vq); 5501 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4); 5502 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4); 5503 load32shorts(vs_front(vs2), zetas); 5504 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1), 5505 vs_front(vs2), vs_back(vs2), vtmp, vq); 5506 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4); 5507 5508 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4); 5509 load32shorts(vs_front(vs2), zetas); 5510 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1), 5511 vs_front(vs2), vs_back(vs2), vtmp, vq); 5512 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4); 5513 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4); 5514 load32shorts(vs_front(vs2), zetas); 5515 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1), 5516 vs_front(vs2), vs_back(vs2), vtmp, vq); 5517 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4); 5518 5519 // level 2 5520 // At level 2 coefficients occur in 8 discrete blocks of size 16 5521 // so they are loaded using employing an ldr at 8 distinct offsets. 5522 5523 int offsets3[8] = { 0, 32, 64, 96, 128, 160, 192, 224 }; 5524 vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3); 5525 vs_ldr_indexed(vs2, __ Q, coeffs, 16, offsets3); 5526 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. 
trashes vq 5527 vs_subv(vs1, __ T8H, vs1, vs2); 5528 vs_str_indexed(vs3, __ Q, coeffs, 0, offsets3); 5529 load64shorts(vs2, zetas); 5530 vs_ldpq(vq, kyberConsts); 5531 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5532 vs_str_indexed(vs2, __ Q, coeffs, 16, offsets3); 5533 5534 vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3); 5535 vs_ldr_indexed(vs2, __ Q, coeffs, 256 + 16, offsets3); 5536 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5537 vs_subv(vs1, __ T8H, vs1, vs2); 5538 vs_str_indexed(vs3, __ Q, coeffs, 256, offsets3); 5539 load64shorts(vs2, zetas); 5540 vs_ldpq(vq, kyberConsts); 5541 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5542 vs_str_indexed(vs2, __ Q, coeffs, 256 + 16, offsets3); 5543 5544 // Barrett reduction at indexes where overflow may happen 5545 5546 // load q and the multiplier for the Barrett reduction 5547 __ add(tmpAddr, kyberConsts, 16); 5548 vs_ldpq(vq, tmpAddr); 5549 5550 VSeq<8> vq1 = VSeq<8>(vq[0], 0); // 2 constant 8 sequences 5551 VSeq<8> vq2 = VSeq<8>(vq[1], 0); // for above two kyber constants 5552 VSeq<8> vq3 = VSeq<8>(v29, 0); // 3rd sequence for const montmul 5553 vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3); 5554 vs_sqdmulh(vs2, __ T8H, vs1, vq2); 5555 vs_sshr(vs2, __ T8H, vs2, 11); 5556 vs_mlsv(vs1, __ T8H, vs2, vq1); 5557 vs_str_indexed(vs1, __ Q, coeffs, 0, offsets3); 5558 vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3); 5559 vs_sqdmulh(vs2, __ T8H, vs1, vq2); 5560 vs_sshr(vs2, __ T8H, vs2, 11); 5561 vs_mlsv(vs1, __ T8H, vs2, vq1); 5562 vs_str_indexed(vs1, __ Q, coeffs, 256, offsets3); 5563 5564 // level 3 5565 // From level 3 upwards coefficients occur in discrete blocks whose size is 5566 // some multiple of 32 so can be loaded using ldpq and suitable indexes. 5567 5568 int offsets2[4] = { 0, 64, 128, 192 }; 5569 vs_ldpq_indexed(vs1, coeffs, 0, offsets2); 5570 vs_ldpq_indexed(vs2, coeffs, 32, offsets2); 5571 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5572 vs_subv(vs1, __ T8H, vs1, vs2); 5573 vs_stpq_indexed(vs3, coeffs, 0, offsets2); 5574 load64shorts(vs2, zetas); 5575 vs_ldpq(vq, kyberConsts); 5576 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5577 vs_stpq_indexed(vs2, coeffs, 32, offsets2); 5578 5579 vs_ldpq_indexed(vs1, coeffs, 256, offsets2); 5580 vs_ldpq_indexed(vs2, coeffs, 256 + 32, offsets2); 5581 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5582 vs_subv(vs1, __ T8H, vs1, vs2); 5583 vs_stpq_indexed(vs3, coeffs, 256, offsets2); 5584 load64shorts(vs2, zetas); 5585 vs_ldpq(vq, kyberConsts); 5586 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5587 vs_stpq_indexed(vs2, coeffs, 256 + 32, offsets2); 5588 5589 // level 4 5590 5591 int offsets1[4] = { 0, 32, 128, 160 }; 5592 vs_ldpq_indexed(vs1, coeffs, 0, offsets1); 5593 vs_ldpq_indexed(vs2, coeffs, 64, offsets1); 5594 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5595 vs_subv(vs1, __ T8H, vs1, vs2); 5596 vs_stpq_indexed(vs3, coeffs, 0, offsets1); 5597 load64shorts(vs2, zetas); 5598 vs_ldpq(vq, kyberConsts); 5599 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5600 vs_stpq_indexed(vs2, coeffs, 64, offsets1); 5601 5602 vs_ldpq_indexed(vs1, coeffs, 256, offsets1); 5603 vs_ldpq_indexed(vs2, coeffs, 256 + 64, offsets1); 5604 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. 
trashes vq 5605 vs_subv(vs1, __ T8H, vs1, vs2); 5606 vs_stpq_indexed(vs3, coeffs, 256, offsets1); 5607 load64shorts(vs2, zetas); 5608 vs_ldpq(vq, kyberConsts); 5609 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5610 vs_stpq_indexed(vs2, coeffs, 256 + 64, offsets1); 5611 5612 // level 5 5613 5614 __ add(tmpAddr, coeffs, 0); 5615 load64shorts(vs1, tmpAddr); 5616 __ add(tmpAddr, coeffs, 128); 5617 load64shorts(vs2, tmpAddr); 5618 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5619 vs_subv(vs1, __ T8H, vs1, vs2); 5620 __ add(tmpAddr, coeffs, 0); 5621 store64shorts(vs3, tmpAddr); 5622 load64shorts(vs2, zetas); 5623 vs_ldpq(vq, kyberConsts); 5624 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5625 __ add(tmpAddr, coeffs, 128); 5626 store64shorts(vs2, tmpAddr); 5627 5628 load64shorts(vs1, tmpAddr); 5629 __ add(tmpAddr, coeffs, 384); 5630 load64shorts(vs2, tmpAddr); 5631 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5632 vs_subv(vs1, __ T8H, vs1, vs2); 5633 __ add(tmpAddr, coeffs, 256); 5634 store64shorts(vs3, tmpAddr); 5635 load64shorts(vs2, zetas); 5636 vs_ldpq(vq, kyberConsts); 5637 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5638 __ add(tmpAddr, coeffs, 384); 5639 store64shorts(vs2, tmpAddr); 5640 5641 // Barrett reduction at indexes where overflow may happen 5642 5643 // load q and the multiplier for the Barrett reduction 5644 __ add(tmpAddr, kyberConsts, 16); 5645 vs_ldpq(vq, tmpAddr); 5646 5647 int offsets0[2] = { 0, 256 }; 5648 vs_ldpq_indexed(vs_front(vs1), coeffs, 0, offsets0); 5649 vs_sqdmulh(vs2, __ T8H, vs1, vq2); 5650 vs_sshr(vs2, __ T8H, vs2, 11); 5651 vs_mlsv(vs1, __ T8H, vs2, vq1); 5652 vs_stpq_indexed(vs_front(vs1), coeffs, 0, offsets0); 5653 5654 // level 6 5655 5656 __ add(tmpAddr, coeffs, 0); 5657 load64shorts(vs1, tmpAddr); 5658 __ add(tmpAddr, coeffs, 256); 5659 load64shorts(vs2, tmpAddr); 5660 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5661 vs_subv(vs1, __ T8H, vs1, vs2); 5662 __ add(tmpAddr, coeffs, 0); 5663 store64shorts(vs3, tmpAddr); 5664 load64shorts(vs2, zetas); 5665 vs_ldpq(vq, kyberConsts); 5666 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5667 __ add(tmpAddr, coeffs, 256); 5668 store64shorts(vs2, tmpAddr); 5669 5670 __ add(tmpAddr, coeffs, 128); 5671 load64shorts(vs1, tmpAddr); 5672 __ add(tmpAddr, coeffs, 384); 5673 load64shorts(vs2, tmpAddr); 5674 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. 
trashes vq 5675 vs_subv(vs1, __ T8H, vs1, vs2); 5676 __ add(tmpAddr, coeffs, 128); 5677 store64shorts(vs3, tmpAddr); 5678 load64shorts(vs2, zetas); 5679 vs_ldpq(vq, kyberConsts); 5680 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5681 __ add(tmpAddr, coeffs, 384); 5682 store64shorts(vs2, tmpAddr); 5683 5684 // multiply by 2^-n 5685 5686 // load toMont(2^-n mod q) 5687 __ add(tmpAddr, kyberConsts, 48); 5688 __ ldr(v29, __ Q, tmpAddr); 5689 5690 vs_ldpq(vq, kyberConsts); 5691 __ add(tmpAddr, coeffs, 0); 5692 load64shorts(vs1, tmpAddr); 5693 kyber_montmul64(vs2, vs1, vq3, vtmp, vq); 5694 __ add(tmpAddr, coeffs, 0); 5695 store64shorts(vs2, tmpAddr); 5696 5697 // now tmpAddr contains coeffs + 128 because store64shorts adjusted it so 5698 load64shorts(vs1, tmpAddr); 5699 kyber_montmul64(vs2, vs1, vq3, vtmp, vq); 5700 __ add(tmpAddr, coeffs, 128); 5701 store64shorts(vs2, tmpAddr); 5702 5703 // now tmpAddr contains coeffs + 256 5704 load64shorts(vs1, tmpAddr); 5705 kyber_montmul64(vs2, vs1, vq3, vtmp, vq); 5706 __ add(tmpAddr, coeffs, 256); 5707 store64shorts(vs2, tmpAddr); 5708 5709 // now tmpAddr contains coeffs + 384 5710 load64shorts(vs1, tmpAddr); 5711 kyber_montmul64(vs2, vs1, vq3, vtmp, vq); 5712 __ add(tmpAddr, coeffs, 384); 5713 store64shorts(vs2, tmpAddr); 5714 5715 __ leave(); // required for proper stackwalking of RuntimeStub frame 5716 __ mov(r0, zr); // return 0 5717 __ ret(lr); 5718 5719 return start; 5720 } 5721 5722 // Kyber multiply polynomials in the NTT domain. 5723 // Implements 5724 // static int implKyberNttMult( 5725 // short[] result, short[] ntta, short[] nttb, short[] zetas) {} 5726 // 5727 // result (short[256]) = c_rarg0 5728 // ntta (short[256]) = c_rarg1 5729 // nttb (short[256]) = c_rarg2 5730 // zetas (short[128]) = c_rarg3 5731 address generate_kyberNttMult() { 5732 5733 __ align(CodeEntryAlignment); 5734 StubId stub_id = StubId::stubgen_kyberNttMult_id; 5735 StubCodeMark mark(this, stub_id); 5736 address start = __ pc(); 5737 __ enter(); 5738 5739 const Register result = c_rarg0; 5740 const Register ntta = c_rarg1; 5741 const Register nttb = c_rarg2; 5742 const Register zetas = c_rarg3; 5743 5744 const Register kyberConsts = r10; 5745 const Register limit = r11; 5746 5747 VSeq<4> vs1(0), vs2(4); // 4 sets of 8x8H inputs/outputs/tmps 5748 VSeq<4> vs3(16), vs4(20); 5749 VSeq<2> vq(30); // pair of constants for montmul: q, qinv 5750 VSeq<2> vz(28); // pair of zetas 5751 VSeq<4> vc(27, 0); // constant sequence for montmul: montRSquareModQ 5752 5753 __ lea(kyberConsts, 5754 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts)); 5755 5756 Label kyberNttMult_loop; 5757 5758 __ add(limit, result, 512); 5759 5760 // load q and qinv 5761 vs_ldpq(vq, kyberConsts); 5762 5763 // load R^2 mod q (to convert back from Montgomery representation) 5764 __ add(kyberConsts, kyberConsts, 64); 5765 __ ldr(v27, __ Q, kyberConsts); 5766 5767 __ BIND(kyberNttMult_loop); 5768 5769 // load 16 zetas 5770 vs_ldpq_post(vz, zetas); 5771 5772 // load 2 sets of 32 coefficients from the two input arrays 5773 // interleaved as shorts. i.e. pairs of shorts adjacent in memory 5774 // are striped across pairs of vector registers 5775 vs_ld2_post(vs_front(vs1), __ T8H, ntta); // <a0, a1> x 8H 5776 vs_ld2_post(vs_back(vs1), __ T8H, nttb); // <b0, b1> x 8H 5777 vs_ld2_post(vs_front(vs4), __ T8H, ntta); // <a2, a3> x 8H 5778 vs_ld2_post(vs_back(vs4), __ T8H, nttb); // <b2, b3> x 8H 5779 5780 // compute 4 montmul cross-products for pairs (a0,a1) and (b0,b1) 5781 // i.e. 
montmul the first and second halves of vs1 in order and 5782 // then with one sequence reversed storing the two results in vs3 5783 // 5784 // vs3[0] <- montmul(a0, b0) 5785 // vs3[1] <- montmul(a1, b1) 5786 // vs3[2] <- montmul(a0, b1) 5787 // vs3[3] <- montmul(a1, b0) 5788 kyber_montmul16(vs_front(vs3), vs_front(vs1), vs_back(vs1), vs_front(vs2), vq); 5789 kyber_montmul16(vs_back(vs3), 5790 vs_front(vs1), vs_reverse(vs_back(vs1)), vs_back(vs2), vq); 5791 5792 // compute 4 montmul cross-products for pairs (a2,a3) and (b2,b3) 5793 // i.e. montmul the first and second halves of vs4 in order and 5794 // then with one sequence reversed storing the two results in vs1 5795 // 5796 // vs1[0] <- montmul(a2, b2) 5797 // vs1[1] <- montmul(a3, b3) 5798 // vs1[2] <- montmul(a2, b3) 5799 // vs1[3] <- montmul(a3, b2) 5800 kyber_montmul16(vs_front(vs1), vs_front(vs4), vs_back(vs4), vs_front(vs2), vq); 5801 kyber_montmul16(vs_back(vs1), 5802 vs_front(vs4), vs_reverse(vs_back(vs4)), vs_back(vs2), vq); 5803 5804 // montmul result 2 of each cross-product i.e. (a1*b1, a3*b3) by a zeta. 5805 // We can schedule two montmuls at a time if we use a suitable vector 5806 // sequence <vs3[1], vs1[1]>. 5807 int delta = vs1[1]->encoding() - vs3[1]->encoding(); 5808 VSeq<2> vs5(vs3[1], delta); 5809 5810 // vs3[1] <- montmul(montmul(a1, b1), z0) 5811 // vs1[1] <- montmul(montmul(a3, b3), z1) 5812 kyber_montmul16(vs5, vz, vs5, vs_front(vs2), vq); 5813 5814 // add results in pairs storing in vs3 5815 // vs3[0] <- montmul(a0, b0) + montmul(montmul(a1, b1), z0); 5816 // vs3[1] <- montmul(a0, b1) + montmul(a1, b0); 5817 vs_addv(vs_front(vs3), __ T8H, vs_even(vs3), vs_odd(vs3)); 5818 5819 // vs3[2] <- montmul(a2, b2) + montmul(montmul(a3, b3), z1); 5820 // vs3[3] <- montmul(a2, b3) + montmul(a3, b2); 5821 vs_addv(vs_back(vs3), __ T8H, vs_even(vs1), vs_odd(vs1)); 5822 5823 // vs1 <- montmul(vs3, montRSquareModQ) 5824 kyber_montmul32(vs1, vs3, vc, vs2, vq); 5825 5826 // store back the two pairs of result vectors de-interleaved as 8H elements 5827 // i.e. storing each pairs of shorts striped across a register pair adjacent 5828 // in memory 5829 vs_st2_post(vs1, __ T8H, result); 5830 5831 __ cmp(result, limit); 5832 __ br(Assembler::NE, kyberNttMult_loop); 5833 5834 __ leave(); // required for proper stackwalking of RuntimeStub frame 5835 __ mov(r0, zr); // return 0 5836 __ ret(lr); 5837 5838 return start; 5839 } 5840 5841 // Kyber add 2 polynomials. 5842 // Implements 5843 // static int implKyberAddPoly(short[] result, short[] a, short[] b) {} 5844 // 5845 // result (short[256]) = c_rarg0 5846 // a (short[256]) = c_rarg1 5847 // b (short[256]) = c_rarg2 5848 address generate_kyberAddPoly_2() { 5849 5850 __ align(CodeEntryAlignment); 5851 StubId stub_id = StubId::stubgen_kyberAddPoly_2_id; 5852 StubCodeMark mark(this, stub_id); 5853 address start = __ pc(); 5854 __ enter(); 5855 5856 const Register result = c_rarg0; 5857 const Register a = c_rarg1; 5858 const Register b = c_rarg2; 5859 5860 const Register kyberConsts = r11; 5861 5862 // We sum 256 sets of values in total i.e. 32 x 8H quadwords. 5863 // So, we can load, add and store the data in 3 groups of 11, 5864 // 11 and 10 at a time i.e. we need to map sets of 10 or 11 5865 // registers. A further constraint is that the mapping needs 5866 // to skip callee saves. So, we allocate the register 5867 // sequences using two 8 sequences, two 2 sequences and two 5868 // single registers. 
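    // Per lane the loop below computes (scalar sketch, illustrative
    // only; q here stands for the constant loaded below from
    // kyberConsts + 16, commented there as q):
    //
    //   for (int i = 0; i < 256; i++) {
    //     result[i] = (short)(a[i] + b[i] + q);
    //   }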
5869 VSeq<8> vs1_1(0); 5870 VSeq<2> vs1_2(16); 5871 FloatRegister vs1_3 = v28; 5872 VSeq<8> vs2_1(18); 5873 VSeq<2> vs2_2(26); 5874 FloatRegister vs2_3 = v29; 5875 5876 // two constant vector sequences 5877 VSeq<8> vc_1(31, 0); 5878 VSeq<2> vc_2(31, 0); 5879 5880 FloatRegister vc_3 = v31; 5881 __ lea(kyberConsts, 5882 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts)); 5883 5884 __ ldr(vc_3, __ Q, Address(kyberConsts, 16)); // q 5885 for (int i = 0; i < 3; i++) { 5886 // load 80 or 88 values from a into vs1_1/2/3 5887 vs_ldpq_post(vs1_1, a); 5888 vs_ldpq_post(vs1_2, a); 5889 if (i < 2) { 5890 __ ldr(vs1_3, __ Q, __ post(a, 16)); 5891 } 5892 // load 80 or 88 values from b into vs2_1/2/3 5893 vs_ldpq_post(vs2_1, b); 5894 vs_ldpq_post(vs2_2, b); 5895 if (i < 2) { 5896 __ ldr(vs2_3, __ Q, __ post(b, 16)); 5897 } 5898 // sum 80 or 88 values across vs1 and vs2 into vs1 5899 vs_addv(vs1_1, __ T8H, vs1_1, vs2_1); 5900 vs_addv(vs1_2, __ T8H, vs1_2, vs2_2); 5901 if (i < 2) { 5902 __ addv(vs1_3, __ T8H, vs1_3, vs2_3); 5903 } 5904 // add constant to all 80 or 88 results 5905 vs_addv(vs1_1, __ T8H, vs1_1, vc_1); 5906 vs_addv(vs1_2, __ T8H, vs1_2, vc_2); 5907 if (i < 2) { 5908 __ addv(vs1_3, __ T8H, vs1_3, vc_3); 5909 } 5910 // store 80 or 88 values 5911 vs_stpq_post(vs1_1, result); 5912 vs_stpq_post(vs1_2, result); 5913 if (i < 2) { 5914 __ str(vs1_3, __ Q, __ post(result, 16)); 5915 } 5916 } 5917 5918 __ leave(); // required for proper stackwalking of RuntimeStub frame 5919 __ mov(r0, zr); // return 0 5920 __ ret(lr); 5921 5922 return start; 5923 } 5924 5925 // Kyber add 3 polynomials. 5926 // Implements 5927 // static int implKyberAddPoly(short[] result, short[] a, short[] b, short[] c) {} 5928 // 5929 // result (short[256]) = c_rarg0 5930 // a (short[256]) = c_rarg1 5931 // b (short[256]) = c_rarg2 5932 // c (short[256]) = c_rarg3 5933 address generate_kyberAddPoly_3() { 5934 5935 __ align(CodeEntryAlignment); 5936 StubId stub_id = StubId::stubgen_kyberAddPoly_3_id; 5937 StubCodeMark mark(this, stub_id); 5938 address start = __ pc(); 5939 __ enter(); 5940 5941 const Register result = c_rarg0; 5942 const Register a = c_rarg1; 5943 const Register b = c_rarg2; 5944 const Register c = c_rarg3; 5945 5946 const Register kyberConsts = r11; 5947 5948 // As above we sum 256 sets of values in total i.e. 32 x 8H 5949 // quadwords. So, we can load, add and store the data in 3 5950 // groups of 11, 11 and 10 at a time i.e. we need to map sets 5951 // of 10 or 11 registers. A further constraint is that the 5952 // mapping needs to skip callee saves. So, we allocate the 5953 // register sequences using two 8 sequences, two 2 sequences 5954 // and two single registers. 
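    // Per lane this is the same as the two-input variant above with
    // one extra addend (scalar sketch, illustrative only; q again
    // stands for the constant loaded below from kyberConsts + 16):
    //
    //   result[i] = (short)(a[i] + b[i] + c[i] + q);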
5955 VSeq<8> vs1_1(0); 5956 VSeq<2> vs1_2(16); 5957 FloatRegister vs1_3 = v28; 5958 VSeq<8> vs2_1(18); 5959 VSeq<2> vs2_2(26); 5960 FloatRegister vs2_3 = v29; 5961 5962 // two constant vector sequences 5963 VSeq<8> vc_1(31, 0); 5964 VSeq<2> vc_2(31, 0); 5965 5966 FloatRegister vc_3 = v31; 5967 5968 __ lea(kyberConsts, 5969 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts)); 5970 5971 __ ldr(vc_3, __ Q, Address(kyberConsts, 16)); // q 5972 for (int i = 0; i < 3; i++) { 5973 // load 80 or 88 values from a into vs1_1/2/3 5974 vs_ldpq_post(vs1_1, a); 5975 vs_ldpq_post(vs1_2, a); 5976 if (i < 2) { 5977 __ ldr(vs1_3, __ Q, __ post(a, 16)); 5978 } 5979 // load 80 or 88 values from b into vs2_1/2/3 5980 vs_ldpq_post(vs2_1, b); 5981 vs_ldpq_post(vs2_2, b); 5982 if (i < 2) { 5983 __ ldr(vs2_3, __ Q, __ post(b, 16)); 5984 } 5985 // sum 80 or 88 values across vs1 and vs2 into vs1 5986 vs_addv(vs1_1, __ T8H, vs1_1, vs2_1); 5987 vs_addv(vs1_2, __ T8H, vs1_2, vs2_2); 5988 if (i < 2) { 5989 __ addv(vs1_3, __ T8H, vs1_3, vs2_3); 5990 } 5991 // load 80 or 88 values from c into vs2_1/2/3 5992 vs_ldpq_post(vs2_1, c); 5993 vs_ldpq_post(vs2_2, c); 5994 if (i < 2) { 5995 __ ldr(vs2_3, __ Q, __ post(c, 16)); 5996 } 5997 // sum 80 or 88 values across vs1 and vs2 into vs1 5998 vs_addv(vs1_1, __ T8H, vs1_1, vs2_1); 5999 vs_addv(vs1_2, __ T8H, vs1_2, vs2_2); 6000 if (i < 2) { 6001 __ addv(vs1_3, __ T8H, vs1_3, vs2_3); 6002 } 6003 // add constant to all 80 or 88 results 6004 vs_addv(vs1_1, __ T8H, vs1_1, vc_1); 6005 vs_addv(vs1_2, __ T8H, vs1_2, vc_2); 6006 if (i < 2) { 6007 __ addv(vs1_3, __ T8H, vs1_3, vc_3); 6008 } 6009 // store 80 or 88 values 6010 vs_stpq_post(vs1_1, result); 6011 vs_stpq_post(vs1_2, result); 6012 if (i < 2) { 6013 __ str(vs1_3, __ Q, __ post(result, 16)); 6014 } 6015 } 6016 6017 __ leave(); // required for proper stackwalking of RuntimeStub frame 6018 __ mov(r0, zr); // return 0 6019 __ ret(lr); 6020 6021 return start; 6022 } 6023 6024 // Kyber parse XOF output to polynomial coefficient candidates 6025 // or decodePoly(12, ...). 6026 // Implements 6027 // static int implKyber12To16( 6028 // byte[] condensed, int index, short[] parsed, int parsedLength) {} 6029 // 6030 // (parsedLength or (parsedLength - 48) must be divisible by 64.) 6031 // 6032 // condensed (byte[]) = c_rarg0 6033 // condensedIndex = c_rarg1 6034 // parsed (short[112 or 256]) = c_rarg2 6035 // parsedLength (112 or 256) = c_rarg3 6036 address generate_kyber12To16() { 6037 Label L_F00, L_loop, L_end; 6038 6039 __ BIND(L_F00); 6040 __ emit_int64(0x0f000f000f000f00); 6041 __ emit_int64(0x0f000f000f000f00); 6042 6043 __ align(CodeEntryAlignment); 6044 StubId stub_id = StubId::stubgen_kyber12To16_id; 6045 StubCodeMark mark(this, stub_id); 6046 address start = __ pc(); 6047 __ enter(); 6048 6049 const Register condensed = c_rarg0; 6050 const Register condensedOffs = c_rarg1; 6051 const Register parsed = c_rarg2; 6052 const Register parsedLength = c_rarg3; 6053 6054 const Register tmpAddr = r11; 6055 6056 // Data is input 96 bytes at a time i.e. in groups of 6 x 16B 6057 // quadwords so we need a 6 vector sequence for the inputs. 6058 // Parsing produces 64 shorts, employing two 8 vector 6059 // sequences to store and combine the intermediate data. 
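    // The underlying transform is a plain 12-bit to 16-bit unpack
    // (scalar sketch, illustrative only, treating b0, b1, b2 as
    // unsigned bytes): every 3 input bytes yield two shorts
    //
    //   s0 = (short)( b0       | ((b1 & 0x0f) << 8));
    //   s1 = (short)((b1 >> 4) | ( b2         << 4));
    //
    // The vector code below performs this 64 (or, for the final
    // block, 48) values at a time by striping the bytes across
    // registers with ld3 and recombining them with shifts, masks and
    // adds.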
6060 VSeq<6> vin(24);
6061 VSeq<8> va(0), vb(16);
6062
6063 __ adr(tmpAddr, L_F00);
6064 __ ldr(v31, __ Q, tmpAddr); // 8H times 0x0f00
6065 __ add(condensed, condensed, condensedOffs);
6066
6067 __ BIND(L_loop);
6068 // load 96 (6 x 16B) byte values
6069 vs_ld3_post(vin, __ T16B, condensed);
6070
6071 // The front half of sequence vin (vin[0], vin[1] and vin[2])
6072 // holds 48 (16x3) contiguous bytes from memory striped
6073 // horizontally across each of the 16 byte lanes. Equivalently,
6074 // that is 16 pairs of 12-bit integers. Likewise the back half
6075 // holds the next 48 bytes in the same arrangement.
6076
6077 // Each vector in the front half can also be viewed as a vertical
6078 // strip across the 16 pairs of 12 bit integers. Each byte in
6079 // vin[0] stores the low 8 bits of the first int in a pair. Each
6080 // byte in vin[1] stores the high 4 bits of the first int and the
6081 // low 4 bits of the second int. Each byte in vin[2] stores the
6082 // high 8 bits of the second int. The same applies to the vectors
6083 // in the second half.
6084
6085 // Converting the data to 16-bit shorts requires first of all
6086 // expanding each of the 6 x 16B vectors into 6 corresponding
6087 // pairs of 8H vectors. Mask, shift and add operations on the
6088 // resulting vector pairs can be used to combine 4 and 8 bit
6089 // parts of related 8H vector elements.
6090 //
6091 // The middle vectors (vin[1] and vin[4]) are actually expanded
6092 // twice, one copy manipulated to provide the high 4 bits of the
6093 // first short in a pair and the other copy manipulated to provide
6094 // the low 4 bits of the second short in the pair. This is why the
6095 // vector sequences va and vb used to hold the expanded 8H elements
6096 // are of length 8.
6097
6098 // Expand vin[0] into va[0:1], and vin[1] into va[2:3] and va[4:5]
6099 // n.b. target elements 2 and 3 duplicate elements 4 and 5
6100 __ ushll(va[0], __ T8H, vin[0], __ T8B, 0);
6101 __ ushll2(va[1], __ T8H, vin[0], __ T16B, 0);
6102 __ ushll(va[2], __ T8H, vin[1], __ T8B, 0);
6103 __ ushll2(va[3], __ T8H, vin[1], __ T16B, 0);
6104 __ ushll(va[4], __ T8H, vin[1], __ T8B, 0);
6105 __ ushll2(va[5], __ T8H, vin[1], __ T16B, 0);
6106
6107 // likewise expand vin[3] into vb[0:1], and vin[4] into vb[2:3]
6108 // and vb[4:5]
6109 __ ushll(vb[0], __ T8H, vin[3], __ T8B, 0);
6110 __ ushll2(vb[1], __ T8H, vin[3], __ T16B, 0);
6111 __ ushll(vb[2], __ T8H, vin[4], __ T8B, 0);
6112 __ ushll2(vb[3], __ T8H, vin[4], __ T16B, 0);
6113 __ ushll(vb[4], __ T8H, vin[4], __ T8B, 0);
6114 __ ushll2(vb[5], __ T8H, vin[4], __ T16B, 0);
6115
6116 // shift lo byte of copy 1 of the middle stripe into the high byte
6117 __ shl(va[2], __ T8H, va[2], 8);
6118 __ shl(va[3], __ T8H, va[3], 8);
6119 __ shl(vb[2], __ T8H, vb[2], 8);
6120 __ shl(vb[3], __ T8H, vb[3], 8);
6121
6122 // expand vin[2] into va[6:7] and vin[5] into vb[6:7] but this
6123 // time pre-shifted by 4 to ensure top bits of input 12-bit int
6124 // are in bit positions [4..11].
6125 __ ushll(va[6], __ T8H, vin[2], __ T8B, 4); 6126 __ ushll2(va[7], __ T8H, vin[2], __ T16B, 4); 6127 __ ushll(vb[6], __ T8H, vin[5], __ T8B, 4); 6128 __ ushll2(vb[7], __ T8H, vin[5], __ T16B, 4); 6129 6130 // mask hi 4 bits of the 1st 12-bit int in a pair from copy1 and 6131 // shift lo 4 bits of the 2nd 12-bit int in a pair to the bottom of 6132 // copy2 6133 __ andr(va[2], __ T16B, va[2], v31); 6134 __ andr(va[3], __ T16B, va[3], v31); 6135 __ ushr(va[4], __ T8H, va[4], 4); 6136 __ ushr(va[5], __ T8H, va[5], 4); 6137 __ andr(vb[2], __ T16B, vb[2], v31); 6138 __ andr(vb[3], __ T16B, vb[3], v31); 6139 __ ushr(vb[4], __ T8H, vb[4], 4); 6140 __ ushr(vb[5], __ T8H, vb[5], 4); 6141 6142 // sum hi 4 bits and lo 8 bits of the 1st 12-bit int in each pair and 6143 // hi 8 bits plus lo 4 bits of the 2nd 12-bit int in each pair 6144 // n.b. the ordering ensures: i) inputs are consumed before they 6145 // are overwritten ii) the order of 16-bit results across successive 6146 // pairs of vectors in va and then vb reflects the order of the 6147 // corresponding 12-bit inputs 6148 __ addv(va[0], __ T8H, va[0], va[2]); 6149 __ addv(va[2], __ T8H, va[1], va[3]); 6150 __ addv(va[1], __ T8H, va[4], va[6]); 6151 __ addv(va[3], __ T8H, va[5], va[7]); 6152 __ addv(vb[0], __ T8H, vb[0], vb[2]); 6153 __ addv(vb[2], __ T8H, vb[1], vb[3]); 6154 __ addv(vb[1], __ T8H, vb[4], vb[6]); 6155 __ addv(vb[3], __ T8H, vb[5], vb[7]); 6156 6157 // store 64 results interleaved as shorts 6158 vs_st2_post(vs_front(va), __ T8H, parsed); 6159 vs_st2_post(vs_front(vb), __ T8H, parsed); 6160 6161 __ sub(parsedLength, parsedLength, 64); 6162 __ cmp(parsedLength, (u1)64); 6163 __ br(Assembler::GE, L_loop); 6164 __ cbz(parsedLength, L_end); 6165 6166 // if anything is left it should be a final 72 bytes of input 6167 // i.e. a final 48 12-bit values. so we handle this by loading 6168 // 48 bytes into all 16B lanes of front(vin) and only 24 6169 // bytes into the lower 8B lane of back(vin) 6170 vs_ld3_post(vs_front(vin), __ T16B, condensed); 6171 vs_ld3(vs_back(vin), __ T8B, condensed); 6172 6173 // Expand vin[0] into va[0:1], and vin[1] into va[2:3] and va[4:5] 6174 // n.b. target elements 2 and 3 of va duplicate elements 4 and 6175 // 5 and target element 2 of vb duplicates element 4. 6176 __ ushll(va[0], __ T8H, vin[0], __ T8B, 0); 6177 __ ushll2(va[1], __ T8H, vin[0], __ T16B, 0); 6178 __ ushll(va[2], __ T8H, vin[1], __ T8B, 0); 6179 __ ushll2(va[3], __ T8H, vin[1], __ T16B, 0); 6180 __ ushll(va[4], __ T8H, vin[1], __ T8B, 0); 6181 __ ushll2(va[5], __ T8H, vin[1], __ T16B, 0); 6182 6183 // This time expand just the lower 8 lanes 6184 __ ushll(vb[0], __ T8H, vin[3], __ T8B, 0); 6185 __ ushll(vb[2], __ T8H, vin[4], __ T8B, 0); 6186 __ ushll(vb[4], __ T8H, vin[4], __ T8B, 0); 6187 6188 // shift lo byte of copy 1 of the middle stripe into the high byte 6189 __ shl(va[2], __ T8H, va[2], 8); 6190 __ shl(va[3], __ T8H, va[3], 8); 6191 __ shl(vb[2], __ T8H, vb[2], 8); 6192 6193 // expand vin[2] into va[6:7] and lower 8 lanes of vin[5] into 6194 // vb[6] pre-shifted by 4 to ensure top bits of the input 12-bit 6195 // int are in bit positions [4..11]. 
6196 __ ushll(va[6], __ T8H, vin[2], __ T8B, 4);
6197 __ ushll2(va[7], __ T8H, vin[2], __ T16B, 4);
6198 __ ushll(vb[6], __ T8H, vin[5], __ T8B, 4);
6199
6200 // mask hi 4 bits of each 1st 12-bit int in pair from copy1 and
6201 // shift lo 4 bits of each 2nd 12-bit int in pair to bottom of
6202 // copy2
6203 __ andr(va[2], __ T16B, va[2], v31);
6204 __ andr(va[3], __ T16B, va[3], v31);
6205 __ ushr(va[4], __ T8H, va[4], 4);
6206 __ ushr(va[5], __ T8H, va[5], 4);
6207 __ andr(vb[2], __ T16B, vb[2], v31);
6208 __ ushr(vb[4], __ T8H, vb[4], 4);
6209
6210
6211
6212 // sum hi 4 bits and lo 8 bits of each 1st 12-bit int in pair and
6213 // hi 8 bits plus lo 4 bits of each 2nd 12-bit int in pair
6214
6215 // n.b. ordering ensures: i) inputs are consumed before they are
6216 // overwritten ii) order of 16-bit results across successive
6217 // pairs of vectors in va and then lower half of vb reflects order
6218 // of corresponding 12-bit inputs
6219 __ addv(va[0], __ T8H, va[0], va[2]);
6220 __ addv(va[2], __ T8H, va[1], va[3]);
6221 __ addv(va[1], __ T8H, va[4], va[6]);
6222 __ addv(va[3], __ T8H, va[5], va[7]);
6223 __ addv(vb[0], __ T8H, vb[0], vb[2]);
6224 __ addv(vb[1], __ T8H, vb[4], vb[6]);
6225
6226 // store 48 results interleaved as shorts
6227 vs_st2_post(vs_front(va), __ T8H, parsed);
6228 vs_st2_post(vs_front(vs_front(vb)), __ T8H, parsed);
6229
6230 __ BIND(L_end);
6231
6232 __ leave(); // required for proper stackwalking of RuntimeStub frame
6233 __ mov(r0, zr); // return 0
6234 __ ret(lr);
6235
6236 return start;
6237 }
6238
6239 // Kyber Barrett reduce function.
6240 // Implements
6241 // static int implKyberBarrettReduce(short[] coeffs) {}
6242 //
6243 // coeffs (short[256]) = c_rarg0
6244 address generate_kyberBarrettReduce() {
6245
6246 __ align(CodeEntryAlignment);
6247 StubId stub_id = StubId::stubgen_kyberBarrettReduce_id;
6248 StubCodeMark mark(this, stub_id);
6249 address start = __ pc();
6250 __ enter();
6251
6252 const Register coeffs = c_rarg0;
6253
6254 const Register kyberConsts = r10;
6255 const Register result = r11;
6256
6257 // As above we process 256 sets of values in total i.e. 32 x
6258 // 8H quadwords. So, we can load, reduce and store the data in 3
6259 // groups of 11, 11 and 10 at a time i.e. we need to map sets
6260 // of 10 or 11 registers. A further constraint is that the
6261 // mapping needs to skip callee saves. So, we allocate the
6262 // register sequences using two 8 sequences, two 2 sequences
6263 // and two single registers.
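    // Per lane the reduction below is (scalar sketch, illustrative
    // only, mirroring the inline comments; kyber_q and
    // kyberBarrettMultiplier are the two constants loaded from
    // kyberConsts + 16):
    //
    //   short t = (short)(((int)c * kyberBarrettMultiplier) >> 26);
    //   c = (short)(c - t * kyber_q);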
6264 VSeq<8> vs1_1(0); 6265 VSeq<2> vs1_2(16); 6266 FloatRegister vs1_3 = v28; 6267 VSeq<8> vs2_1(18); 6268 VSeq<2> vs2_2(26); 6269 FloatRegister vs2_3 = v29; 6270 6271 // we also need a pair of corresponding constant sequences 6272 6273 VSeq<8> vc1_1(30, 0); 6274 VSeq<2> vc1_2(30, 0); 6275 FloatRegister vc1_3 = v30; // for kyber_q 6276 6277 VSeq<8> vc2_1(31, 0); 6278 VSeq<2> vc2_2(31, 0); 6279 FloatRegister vc2_3 = v31; // for kyberBarrettMultiplier 6280 6281 __ add(result, coeffs, 0); 6282 __ lea(kyberConsts, 6283 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts)); 6284 6285 // load q and the multiplier for the Barrett reduction 6286 __ add(kyberConsts, kyberConsts, 16); 6287 __ ldpq(vc1_3, vc2_3, kyberConsts); 6288 6289 for (int i = 0; i < 3; i++) { 6290 // load 80 or 88 coefficients 6291 vs_ldpq_post(vs1_1, coeffs); 6292 vs_ldpq_post(vs1_2, coeffs); 6293 if (i < 2) { 6294 __ ldr(vs1_3, __ Q, __ post(coeffs, 16)); 6295 } 6296 6297 // vs2 <- (2 * vs1 * kyberBarrettMultiplier) >> 16 6298 vs_sqdmulh(vs2_1, __ T8H, vs1_1, vc2_1); 6299 vs_sqdmulh(vs2_2, __ T8H, vs1_2, vc2_2); 6300 if (i < 2) { 6301 __ sqdmulh(vs2_3, __ T8H, vs1_3, vc2_3); 6302 } 6303 6304 // vs2 <- (vs1 * kyberBarrettMultiplier) >> 26 6305 vs_sshr(vs2_1, __ T8H, vs2_1, 11); 6306 vs_sshr(vs2_2, __ T8H, vs2_2, 11); 6307 if (i < 2) { 6308 __ sshr(vs2_3, __ T8H, vs2_3, 11); 6309 } 6310 6311 // vs1 <- vs1 - vs2 * kyber_q 6312 vs_mlsv(vs1_1, __ T8H, vs2_1, vc1_1); 6313 vs_mlsv(vs1_2, __ T8H, vs2_2, vc1_2); 6314 if (i < 2) { 6315 __ mlsv(vs1_3, __ T8H, vs2_3, vc1_3); 6316 } 6317 6318 vs_stpq_post(vs1_1, result); 6319 vs_stpq_post(vs1_2, result); 6320 if (i < 2) { 6321 __ str(vs1_3, __ Q, __ post(result, 16)); 6322 } 6323 } 6324 6325 __ leave(); // required for proper stackwalking of RuntimeStub frame 6326 __ mov(r0, zr); // return 0 6327 __ ret(lr); 6328 6329 return start; 6330 } 6331 6332 6333 // Dilithium-specific montmul helper routines that generate parallel 6334 // code for, respectively, a single 4x4s vector sequence montmul or 6335 // two such multiplies in a row. 6336 6337 // Perform 16 32-bit Montgomery multiplications in parallel 6338 void dilithium_montmul16(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc, 6339 const VSeq<4>& vtmp, const VSeq<2>& vq) { 6340 // Use the helper routine to schedule a 4x4S Montgomery multiply. 6341 // It will assert that the register use is valid 6342 vs_montmul4(va, vb, vc, __ T4S, vtmp, vq); 6343 } 6344 6345 // Perform 2x16 32-bit Montgomery multiplications in parallel 6346 void dilithium_montmul32(const VSeq<8>& va, const VSeq<8>& vb, const VSeq<8>& vc, 6347 const VSeq<4>& vtmp, const VSeq<2>& vq) { 6348 // Schedule two successive 4x4S multiplies via the montmul helper 6349 // on the front and back halves of va, vb and vc. The helper will 6350 // assert that the register use has no overlap conflicts on each 6351 // individual call but we also need to ensure that the necessary 6352 // disjoint/equality constraints are met across both calls. 6353 6354 // vb, vc, vtmp and vq must be disjoint. 
va must either be 6355 // disjoint from all other registers or equal vc 6356 6357 assert(vs_disjoint(vb, vc), "vb and vc overlap"); 6358 assert(vs_disjoint(vb, vq), "vb and vq overlap"); 6359 assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap"); 6360 6361 assert(vs_disjoint(vc, vq), "vc and vq overlap"); 6362 assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap"); 6363 6364 assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap"); 6365 6366 assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal"); 6367 assert(vs_disjoint(va, vb), "va and vb overlap"); 6368 assert(vs_disjoint(va, vq), "va and vq overlap"); 6369 assert(vs_disjoint(va, vtmp), "va and vtmp overlap"); 6370 6371 // We multiply the front and back halves of each sequence 4 at a 6372 // time because 6373 // 6374 // 1) we are currently only able to get 4-way instruction 6375 // parallelism at best 6376 // 6377 // 2) we need registers for the constants in vq and temporary 6378 // scratch registers to hold intermediate results so vtmp can only 6379 // be a VSeq<4> which means we only have 4 scratch slots. 6380 6381 vs_montmul4(vs_front(va), vs_front(vb), vs_front(vc), __ T4S, vtmp, vq); 6382 vs_montmul4(vs_back(va), vs_back(vb), vs_back(vc), __ T4S, vtmp, vq); 6383 } 6384 6385 // Perform combined montmul then add/sub on 4x4S vectors. 6386 void dilithium_montmul16_sub_add( 6387 const VSeq<4>& va0, const VSeq<4>& va1, const VSeq<4>& vc, 6388 const VSeq<4>& vtmp, const VSeq<2>& vq) { 6389 // compute a = montmul(a1, c) 6390 dilithium_montmul16(vc, va1, vc, vtmp, vq); 6391 // ouptut a1 = a0 - a 6392 vs_subv(va1, __ T4S, va0, vc); 6393 // and a0 = a0 + a 6394 vs_addv(va0, __ T4S, va0, vc); 6395 } 6396 6397 // Perform combined add/sub then montul on 4x4S vectors. 6398 void dilithium_sub_add_montmul16( 6399 const VSeq<4>& va0, const VSeq<4>& va1, const VSeq<4>& vb, 6400 const VSeq<4>& vtmp1, const VSeq<4>& vtmp2, const VSeq<2>& vq) { 6401 // compute c = a0 - a1 6402 vs_subv(vtmp1, __ T4S, va0, va1); 6403 // output a0 = a0 + a1 6404 vs_addv(va0, __ T4S, va0, va1); 6405 // output a1 = b montmul c 6406 dilithium_montmul16(va1, vtmp1, vb, vtmp2, vq); 6407 } 6408 6409 // At these levels, the indices that correspond to the 'j's (and 'j+l's) 6410 // in the Java implementation come in sequences of at least 8, so we 6411 // can use ldpq to collect the corresponding data into pairs of vector 6412 // registers. 6413 // We collect the coefficients corresponding to the 'j+l' indexes into 6414 // the vector registers v0-v7, the zetas into the vector registers v16-v23 6415 // then we do the (Montgomery) multiplications by the zetas in parallel 6416 // into v16-v23, load the coeffs corresponding to the 'j' indexes into 6417 // v0-v7, then do the additions into v24-v31 and the subtractions into 6418 // v0-v7 and finally save the results back to the coeffs array. 6419 void dilithiumNttLevel0_4(const Register dilithiumConsts, 6420 const Register coeffs, const Register zetas) { 6421 int c1 = 0; 6422 int c2 = 512; 6423 int startIncr; 6424 // don't use callee save registers v8 - v15 6425 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs 6426 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3 6427 VSeq<2> vq(30); // n.b. 
constants overlap vs3 6428 int offsets[4] = { 0, 32, 64, 96 }; 6429 6430 for (int level = 0; level < 5; level++) { 6431 int c1Start = c1; 6432 int c2Start = c2; 6433 if (level == 3) { 6434 offsets[1] = 32; 6435 offsets[2] = 128; 6436 offsets[3] = 160; 6437 } else if (level == 4) { 6438 offsets[1] = 64; 6439 offsets[2] = 128; 6440 offsets[3] = 192; 6441 } 6442 6443 // For levels 1 - 4 we simply load 2 x 4 adjacent values at a 6444 // time at 4 different offsets and multiply them in order by the 6445 // next set of input values. So we employ indexed load and store 6446 // pair instructions with arrangement 4S. 6447 for (int i = 0; i < 4; i++) { 6448 // reload q and qinv 6449 vs_ldpq(vq, dilithiumConsts); // qInv, q 6450 // load 8x4S coefficients via second start pos == c2 6451 vs_ldpq_indexed(vs1, coeffs, c2Start, offsets); 6452 // load next 8x4S inputs == b 6453 vs_ldpq_post(vs2, zetas); 6454 // compute a == c2 * b mod MONT_Q 6455 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq); 6456 // load 8x4s coefficients via first start pos == c1 6457 vs_ldpq_indexed(vs1, coeffs, c1Start, offsets); 6458 // compute a1 = c1 + a 6459 vs_addv(vs3, __ T4S, vs1, vs2); 6460 // compute a2 = c1 - a 6461 vs_subv(vs1, __ T4S, vs1, vs2); 6462 // output a1 and a2 6463 vs_stpq_indexed(vs3, coeffs, c1Start, offsets); 6464 vs_stpq_indexed(vs1, coeffs, c2Start, offsets); 6465 6466 int k = 4 * level + i; 6467 6468 if (k > 7) { 6469 startIncr = 256; 6470 } else if (k == 5) { 6471 startIncr = 384; 6472 } else { 6473 startIncr = 128; 6474 } 6475 6476 c1Start += startIncr; 6477 c2Start += startIncr; 6478 } 6479 6480 c2 /= 2; 6481 } 6482 } 6483 6484 // Dilithium NTT function except for the final "normalization" to |coeff| < Q. 6485 // Implements the method 6486 // static int implDilithiumAlmostNtt(int[] coeffs, int zetas[]) {} 6487 // of the Java class sun.security.provider 6488 // 6489 // coeffs (int[256]) = c_rarg0 6490 // zetas (int[256]) = c_rarg1 6491 address generate_dilithiumAlmostNtt() { 6492 6493 __ align(CodeEntryAlignment); 6494 StubId stub_id = StubId::stubgen_dilithiumAlmostNtt_id; 6495 StubCodeMark mark(this, stub_id); 6496 address start = __ pc(); 6497 __ enter(); 6498 6499 const Register coeffs = c_rarg0; 6500 const Register zetas = c_rarg1; 6501 6502 const Register tmpAddr = r9; 6503 const Register dilithiumConsts = r10; 6504 const Register result = r11; 6505 // don't use callee save registers v8 - v15 6506 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs 6507 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3 6508 VSeq<2> vq(30); // n.b. constants overlap vs3 6509 int offsets[4] = { 0, 32, 64, 96}; 6510 int offsets1[8] = { 16, 48, 80, 112, 144, 176, 208, 240 }; 6511 int offsets2[8] = { 0, 32, 64, 96, 128, 160, 192, 224 }; 6512 __ add(result, coeffs, 0); 6513 __ lea(dilithiumConsts, 6514 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts)); 6515 6516 // Each level represents one iteration of the outer for loop of the Java version. 6517 6518 // level 0-4 6519 dilithiumNttLevel0_4(dilithiumConsts, coeffs, zetas); 6520 6521 // level 5 6522 6523 // At level 5 the coefficients we need to combine with the zetas 6524 // are grouped in memory in blocks of size 4. So, for both sets of 6525 // coefficients we load 4 adjacent values at 8 different offsets 6526 // using an indexed ldr with register variant Q and multiply them 6527 // in sequence order by the next set of inputs. Likewise we store 6528 // the resuls using an indexed str with register variant Q. 
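  // For reference, each montmul32 + add/sub group in the level 5 loop
  // below implements the scalar butterfly of the Java loop body, here
  // sketched in C++ (montMul is a hypothetical stand-in for the
  // Montgomery multiply performed by dilithium_montmul32, and l == 4
  // at this level):
  //
  //   int t = montMul(zetas[k], coeffs[j + l]);
  //   coeffs[j + l] = coeffs[j] - t;   // uses the original coeffs[j]
  //   coeffs[j]     = coeffs[j] + t;
  //
  // The vector code keeps the original coeffs[j] values in registers,
  // so the add and the sub can be issued in either order.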
6529 for (int i = 0; i < 1024; i += 256) { 6530 // reload constants q, qinv each iteration as they get clobbered later 6531 vs_ldpq(vq, dilithiumConsts); // qInv, q 6532 // load 32 (8x4S) coefficients via first offsets = c1 6533 vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets1); 6534 // load next 32 (8x4S) inputs = b 6535 vs_ldpq_post(vs2, zetas); 6536 // a = b montmul c1 6537 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq); 6538 // load 32 (8x4S) coefficients via second offsets = c2 6539 vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets2); 6540 // add/sub with result of multiply 6541 vs_addv(vs3, __ T4S, vs1, vs2); // a0 = c2 + a 6542 vs_subv(vs1, __ T4S, vs1, vs2); // a1 = c2 - a 6543 // write back new coefficients using same offsets 6544 vs_str_indexed(vs3, __ Q, coeffs, i, offsets2); 6545 vs_str_indexed(vs1, __ Q, coeffs, i, offsets1); 6546 } 6547 6548 // level 6 6549 // At level 6 the coefficients we need to combine with the zetas 6550 // are grouped in memory in pairs, the first pair being add/sub 6551 // inputs and the second pair montmul inputs. We can still implement 6552 // the montmul+sub+add using 4-way parallelism but only if we 6553 // combine the coefficients with the zetas 16 at a time. We load 8 6554 // adjacent values at 4 different offsets using an ld2 load with 6555 // arrangement 2D. That interleaves the lower and upper halves of 6556 // each pair of quadwords into successive vector registers. We 6557 // then need to montmul the 4 odd registers of the coefficients 6558 // register sequence by the zetas in order and then add/sub them 6559 // against the 4 even registers of the sequence. We use an 6560 // equivalent st2 operation to store the results back into memory 6561 // de-interleaved. 6562 for (int i = 0; i < 1024; i += 128) { 6563 // reload constants q, qinv each iteration as they get clobbered later 6564 vs_ldpq(vq, dilithiumConsts); // qInv, q 6565 // load interleaved 16 (4x2D) coefficients via offsets 6566 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets); 6567 // load next 16 (4x4S) inputs 6568 vs_ldpq_post(vs_front(vs2), zetas); 6569 // mont multiply odd elements of vs1 by vs2 and add/sub into odds/evens 6570 dilithium_montmul16_sub_add(vs_even(vs1), vs_odd(vs1), 6571 vs_front(vs2), vtmp, vq); 6572 // store interleaved 16 (4x2D) coefficients via offsets 6573 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets); 6574 } 6575 6576 // level 7 6577 // At level 7 the coefficients we need to combine with the zetas 6578 // occur singly with montmul inputs alternating with add/sub 6579 // inputs. Once again we can use 4-way parallelism to combine 16 6580 // zetas at a time. However, we have to load 8 adjacent values at 6581 // 4 different offsets using an ld2 load with arrangement 4S. That 6582 // interleaves the even words of each pair into one 6583 // coefficients vector register and the odd words of the pair 6584 // into the next register. We then need to montmul the 4 odd 6585 // registers of the coefficients register sequence by the zetas in 6586 // order and then add/sub them against the 4 even registers of the 6587 // sequence. We use an equivalent st2 operation to store 6588 // the results back into memory de-interleaved.
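  // For reference, at level 7 (l == 1) each ld2 / montmul16_sub_add / st2
  // group below acts on adjacent coefficient pairs and corresponds to the
  // scalar butterfly (a sketch; montMul is a hypothetical stand-in for the
  // Montgomery multiply performed by dilithium_montmul16_sub_add):
  //
  //   int t = montMul(zetas[k], coeffs[j + 1]);
  //   coeffs[j + 1] = coeffs[j] - t;   // odd (second-of-pair) words
  //   coeffs[j]     = coeffs[j] + t;   // even (first-of-pair) words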
6589 6590 for (int i = 0; i < 1024; i += 128) { 6591 // reload constants q, qinv each iteration as they get clobbered later 6592 vs_ldpq(vq, dilithiumConsts); // qInv, q 6593 // load interleaved 16 (4x4S) coefficients via offsets 6594 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets); 6595 // load next 16 (4x4S) inputs 6596 vs_ldpq_post(vs_front(vs2), zetas); 6597 // mont multiply odd elements of vs1 by vs2 and add/sub into odds/evens 6598 dilithium_montmul16_sub_add(vs_even(vs1), vs_odd(vs1), 6599 vs_front(vs2), vtmp, vq); 6600 // store interleaved 16 (4x4S) coefficients via offsets 6601 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets); 6602 } 6603 __ leave(); // required for proper stackwalking of RuntimeStub frame 6604 __ mov(r0, zr); // return 0 6605 __ ret(lr); 6606 6607 return start; 6608 } 6609 6610 // At these levels, the indices that correspond to the 'j's (and 'j+l's) 6611 // in the Java implementation come in sequences of at least 8, so we 6612 // can use ldpq to collect the corresponding data into pairs of vector 6613 // registers 6614 // We collect the coefficients that correspond to the 'j's into vs1 6615 // the coefficiets that correspond to the 'j+l's into vs2 then 6616 // do the additions into vs3 and the subtractions into vs1 then 6617 // save the result of the additions, load the zetas into vs2 6618 // do the (Montgomery) multiplications by zeta in parallel into vs2 6619 // finally save the results back to the coeffs array 6620 void dilithiumInverseNttLevel3_7(const Register dilithiumConsts, 6621 const Register coeffs, const Register zetas) { 6622 int c1 = 0; 6623 int c2 = 32; 6624 int startIncr; 6625 int offsets[4]; 6626 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs 6627 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3 6628 VSeq<2> vq(30); // n.b. constants overlap vs3 6629 6630 offsets[0] = 0; 6631 6632 for (int level = 3; level < 8; level++) { 6633 int c1Start = c1; 6634 int c2Start = c2; 6635 if (level == 3) { 6636 offsets[1] = 64; 6637 offsets[2] = 128; 6638 offsets[3] = 192; 6639 } else if (level == 4) { 6640 offsets[1] = 32; 6641 offsets[2] = 128; 6642 offsets[3] = 160; 6643 } else { 6644 offsets[1] = 32; 6645 offsets[2] = 64; 6646 offsets[3] = 96; 6647 } 6648 6649 // For levels 3 - 7 we simply load 2 x 4 adjacent values at a 6650 // time at 4 different offsets and multiply them in order by the 6651 // next set of input values. So we employ indexed load and store 6652 // pair instructions with arrangement 4S. 6653 for (int i = 0; i < 4; i++) { 6654 // load v1 32 (8x4S) coefficients relative to first start index 6655 vs_ldpq_indexed(vs1, coeffs, c1Start, offsets); 6656 // load v2 32 (8x4S) coefficients relative to second start index 6657 vs_ldpq_indexed(vs2, coeffs, c2Start, offsets); 6658 // a0 = v1 + v2 -- n.b. 
clobbers vq 6659 vs_addv(vs3, __ T4S, vs1, vs2); 6660 // a1 = v1 - v2 6661 vs_subv(vs1, __ T4S, vs1, vs2); 6662 // save a0 relative to first start index 6663 vs_stpq_indexed(vs3, coeffs, c1Start, offsets); 6664 // load constants q, qinv each iteration as they get clobbered above 6665 vs_ldpq(vq, dilithiumConsts); // qInv, q 6666 // load b next 32 (8x4S) inputs 6667 vs_ldpq_post(vs2, zetas); 6668 // a = a1 montmul b 6669 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq); 6670 // save a relative to second start index 6671 vs_stpq_indexed(vs2, coeffs, c2Start, offsets); 6672 6673 int k = 4 * level + i; 6674 6675 if (k < 24) { 6676 startIncr = 256; 6677 } else if (k == 25) { 6678 startIncr = 384; 6679 } else { 6680 startIncr = 128; 6681 } 6682 6683 c1Start += startIncr; 6684 c2Start += startIncr; 6685 } 6686 6687 c2 *= 2; 6688 } 6689 } 6690 6691 // Dilithium Inverse NTT function except the final mod Q division by 2^256. 6692 // Implements the method 6693 // static int implDilithiumAlmostInverseNtt(int[] coeffs, int[] zetas) {} of 6694 // the sun.security.provider.ML_DSA class. 6695 // 6696 // coeffs (int[256]) = c_rarg0 6697 // zetas (int[256]) = c_rarg1 6698 address generate_dilithiumAlmostInverseNtt() { 6699 6700 __ align(CodeEntryAlignment); 6701 StubId stub_id = StubId::stubgen_dilithiumAlmostInverseNtt_id; 6702 StubCodeMark mark(this, stub_id); 6703 address start = __ pc(); 6704 __ enter(); 6705 6706 const Register coeffs = c_rarg0; 6707 const Register zetas = c_rarg1; 6708 6709 const Register tmpAddr = r9; 6710 const Register dilithiumConsts = r10; 6711 const Register result = r11; 6712 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs 6713 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3 6714 VSeq<2> vq(30); // n.b. constants overlap vs3 6715 int offsets[4] = { 0, 32, 64, 96 }; 6716 int offsets1[8] = { 0, 32, 64, 96, 128, 160, 192, 224 }; 6717 int offsets2[8] = { 16, 48, 80, 112, 144, 176, 208, 240 }; 6718 6719 __ add(result, coeffs, 0); 6720 __ lea(dilithiumConsts, 6721 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts)); 6722 6723 // Each level represents one iteration of the outer for loop of the Java version 6724 6725 // level 0 6726 // At level 0 we need to interleave adjacent quartets of 6727 // coefficients before we multiply and add/sub by the next 16 6728 // zetas just as we did for level 7 in the multiply code. So we 6729 // load and store the values using an ld2/st2 with arrangement 4S. 6730 for (int i = 0; i < 1024; i += 128) { 6731 // load constants q, qinv 6732 // n.b. this can be moved out of the loop as they do not get 6733 // clobbered by first two loops 6734 vs_ldpq(vq, dilithiumConsts); // qInv, q 6735 // a0/a1 load interleaved 32 (8x4S) coefficients 6736 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets); 6737 // b load next 16 (4x4S) inputs 6738 vs_ldpq_post(vs_front(vs2), zetas); 6739 // compute in parallel (a0, a1) = (a0 + a1, (a0 - a1) montmul b) 6740 // n.b. second half of vs2 provides temporary register storage 6741 dilithium_sub_add_montmul16(vs_even(vs1), vs_odd(vs1), 6742 vs_front(vs2), vs_back(vs2), vtmp, vq); 6743 // a0/a1 store interleaved 32 (8x4S) coefficients 6744 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets); 6745 } 6746 6747 // level 1 6748 // At level 1 we need to interleave pairs of adjacent pairs of 6749 // coefficients before we multiply by the next 16 zetas just as we 6750 // did for level 6 in the multiply code. So we load and store the 6751 // values using an ld2/st2 with arrangement 2D.
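  // For reference, at this level (l == 2) each ld2 / sub_add_montmul16 / st2
  // group below corresponds to the scalar inverse butterfly (a sketch; montMul
  // is a hypothetical stand-in for the Montgomery multiply performed by
  // dilithium_sub_add_montmul16):
  //
  //   int t = coeffs[j] - coeffs[j + l];          // computed from the old values
  //   coeffs[j]     = coeffs[j] + coeffs[j + l];
  //   coeffs[j + l] = montMul(t, zetas[k]);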
6752 for (int i = 0; i < 1024; i += 128) { 6753 // a0/a1 load interleaved 32 (8x2D) coefficients 6754 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets); 6755 // b load next 16 (4x4S) inputs 6756 vs_ldpq_post(vs_front(vs2), zetas); 6757 // compute in parallel (a0, a1) = (a0 + a1, (a0 - a1) montmul b) 6758 // n.b. second half of vs2 provides temporary register storage 6759 dilithium_sub_add_montmul16(vs_even(vs1), vs_odd(vs1), 6760 vs_front(vs2), vs_back(vs2), vtmp, vq); 6761 // a0/a1 store interleaved 32 (8x2D) coefficients 6762 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets); 6763 } 6764 6765 // level 2 6766 // At level 2 coefficients come in blocks of 4. So, we load 4 6767 // adjacent coefficients at 8 distinct offsets for both the first 6768 // and second coefficient sequences, using an ldr with register 6769 // variant Q then combine them with next set of 32 zetas. Likewise 6770 // we store the results using an str with register variant Q. 6771 for (int i = 0; i < 1024; i += 256) { 6772 // c0 load 32 (8x4S) coefficients via first offsets 6773 vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets1); 6774 // c1 load 32 (8x4S) coefficients via second offsets 6775 vs_ldr_indexed(vs2, __ Q,coeffs, i, offsets2); 6776 // a0 = c0 + c1 n.b. clobbers vq which overlaps vs3 6777 vs_addv(vs3, __ T4S, vs1, vs2); 6778 // c = c0 - c1 6779 vs_subv(vs1, __ T4S, vs1, vs2); 6780 // store a0 32 (8x4S) coefficients via first offsets 6781 vs_str_indexed(vs3, __ Q, coeffs, i, offsets1); 6782 // b load 32 (8x4S) next inputs 6783 vs_ldpq_post(vs2, zetas); 6784 // reload constants q, qinv -- they were clobbered earlier 6785 vs_ldpq(vq, dilithiumConsts); // qInv, q 6786 // compute a1 = b montmul c 6787 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq); 6788 // store a1 32 (8x4S) coefficients via second offsets 6789 vs_str_indexed(vs2, __ Q, coeffs, i, offsets2); 6790 } 6791 6792 // level 3-7 6793 dilithiumInverseNttLevel3_7(dilithiumConsts, coeffs, zetas); 6794 6795 __ leave(); // required for proper stackwalking of RuntimeStub frame 6796 __ mov(r0, zr); // return 0 6797 __ ret(lr); 6798 6799 return start; 6800 } 6801 6802 // Dilithium multiply polynomials in the NTT domain. 6803 // Straightforward implementation of the method 6804 // static int implDilithiumNttMult( 6805 // int[] result, int[] ntta, int[] nttb {} of 6806 // the sun.security.provider.ML_DSA class. 6807 // 6808 // result (int[256]) = c_rarg0 6809 // poly1 (int[256]) = c_rarg1 6810 // poly2 (int[256]) = c_rarg2 6811 address generate_dilithiumNttMult() { 6812 6813 __ align(CodeEntryAlignment); 6814 StubId stub_id = StubId::stubgen_dilithiumNttMult_id; 6815 StubCodeMark mark(this, stub_id); 6816 address start = __ pc(); 6817 __ enter(); 6818 6819 Label L_loop; 6820 6821 const Register result = c_rarg0; 6822 const Register poly1 = c_rarg1; 6823 const Register poly2 = c_rarg2; 6824 6825 const Register dilithiumConsts = r10; 6826 const Register len = r11; 6827 6828 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs 6829 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3 6830 VSeq<2> vq(30); // n.b. 
constants overlap vs3 6831 VSeq<8> vrsquare(29, 0); // for montmul by constant RSQUARE 6832 6833 __ lea(dilithiumConsts, 6834 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts)); 6835 6836 // load constants q, qinv 6837 vs_ldpq(vq, dilithiumConsts); // qInv, q 6838 // load constant rSquare into v29 6839 __ ldr(v29, __ Q, Address(dilithiumConsts, 48)); // rSquare 6840 6841 __ mov(len, zr); 6842 __ add(len, len, 1024); 6843 6844 __ BIND(L_loop); 6845 6846 // b load 32 (8x4S) next inputs from poly1 6847 vs_ldpq_post(vs1, poly1); 6848 // c load 32 (8x4S) next inputs from poly2 6849 vs_ldpq_post(vs2, poly2); 6850 // compute a = b montmul c 6851 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq); 6852 // compute a = rsquare montmul a 6853 dilithium_montmul32(vs2, vrsquare, vs2, vtmp, vq); 6854 // save a 32 (8x4S) results 6855 vs_stpq_post(vs2, result); 6856 6857 __ sub(len, len, 128); 6858 __ cmp(len, (u1)128); 6859 __ br(Assembler::GE, L_loop); 6860 6861 __ leave(); // required for proper stackwalking of RuntimeStub frame 6862 __ mov(r0, zr); // return 0 6863 __ ret(lr); 6864 6865 return start; 6866 } 6867 6868 // Dilithium Montgomery multiply an array by a constant. 6869 // A straightforward implementation of the method 6870 // static int implDilithiumMontMulByConstant(int[] coeffs, int constant) {} 6871 // of the sun.security.provider.ML_DSA class 6872 // 6873 // coeffs (int[256]) = c_rarg0 6874 // constant (int) = c_rarg1 6875 address generate_dilithiumMontMulByConstant() { 6876 6877 __ align(CodeEntryAlignment); 6878 StubId stub_id = StubId::stubgen_dilithiumMontMulByConstant_id; 6879 StubCodeMark mark(this, stub_id); 6880 address start = __ pc(); 6881 __ enter(); 6882 6883 Label L_loop; 6884 6885 const Register coeffs = c_rarg0; 6886 const Register constant = c_rarg1; 6887 6888 const Register dilithiumConsts = r10; 6889 const Register result = r11; 6890 const Register len = r12; 6891 6892 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs 6893 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3 6894 VSeq<2> vq(30); // n.b. constants overlap vs3 6895 VSeq<8> vconst(29, 0); // for montmul by constant 6896 6897 // results track inputs 6898 __ add(result, coeffs, 0); 6899 __ lea(dilithiumConsts, 6900 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts)); 6901 6902 // load constants q, qinv -- they are not clobbered in the loop below 6903 vs_ldpq(vq, dilithiumConsts); // qInv, q 6904 // copy caller supplied constant across vconst 6905 __ dup(vconst[0], __ T4S, constant); 6906 __ mov(len, zr); 6907 __ add(len, len, 1024); 6908 6909 __ BIND(L_loop); 6910 6911 // load next 32 inputs 6912 vs_ldpq_post(vs2, coeffs); 6913 // mont mul by constant 6914 dilithium_montmul32(vs2, vconst, vs2, vtmp, vq); 6915 // write next 32 results 6916 vs_stpq_post(vs2, result); 6917 6918 __ sub(len, len, 128); 6919 __ cmp(len, (u1)128); 6920 __ br(Assembler::GE, L_loop); 6921 6922 __ leave(); // required for proper stackwalking of RuntimeStub frame 6923 __ mov(r0, zr); // return 0 6924 __ ret(lr); 6925 6926 return start; 6927 } 6928 6929 // Dilithium decompose poly.
6930 // Implements the method 6931 // static int implDilithiumDecomposePoly(int[] coeffs, int constant) {} 6932 // of the sun.security.provider.ML_DSA class 6933 // 6934 // input (int[256]) = c_rarg0 6935 // lowPart (int[256]) = c_rarg1 6936 // highPart (int[256]) = c_rarg2 6937 // twoGamma2 (int) = c_rarg3 6938 // multiplier (int) = c_rarg4 6939 address generate_dilithiumDecomposePoly() { 6940 6941 __ align(CodeEntryAlignment); 6942 StubId stub_id = StubId::stubgen_dilithiumDecomposePoly_id; 6943 StubCodeMark mark(this, stub_id); 6944 address start = __ pc(); 6945 Label L_loop; 6946 6947 const Register input = c_rarg0; 6948 const Register lowPart = c_rarg1; 6949 const Register highPart = c_rarg2; 6950 const Register twoGamma2 = c_rarg3; 6951 const Register multiplier = c_rarg4; 6952 6953 const Register len = r9; 6954 const Register dilithiumConsts = r10; 6955 const Register tmp = r11; 6956 6957 // 6 independent sets of 4x4s values 6958 VSeq<4> vs1(0), vs2(4), vs3(8); 6959 VSeq<4> vs4(12), vs5(16), vtmp(20); 6960 6961 // 7 constants for cross-multiplying 6962 VSeq<4> one(25, 0); 6963 VSeq<4> qminus1(26, 0); 6964 VSeq<4> g2(27, 0); 6965 VSeq<4> twog2(28, 0); 6966 VSeq<4> mult(29, 0); 6967 VSeq<4> q(30, 0); 6968 VSeq<4> qadd(31, 0); 6969 6970 __ enter(); 6971 6972 __ lea(dilithiumConsts, 6973 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts)); 6974 6975 // save callee-saved registers 6976 __ stpd(v8, v9, __ pre(sp, -64)); 6977 __ stpd(v10, v11, Address(sp, 16)); 6978 __ stpd(v12, v13, Address(sp, 32)); 6979 __ stpd(v14, v15, Address(sp, 48)); 6980 6981 // populate constant registers 6982 __ mov(tmp, zr); 6983 __ add(tmp, tmp, 1); 6984 __ dup(one[0], __ T4S, tmp); // 1 6985 __ ldr(q[0], __ Q, Address(dilithiumConsts, 16)); // q 6986 __ ldr(qadd[0], __ Q, Address(dilithiumConsts, 64)); // addend for mod q reduce 6987 __ dup(twog2[0], __ T4S, twoGamma2); // 2 * gamma2 6988 __ dup(mult[0], __ T4S, multiplier); // multiplier for mod 2 * gamma reduce 6989 __ subv(qminus1[0], __ T4S, v30, v25); // q - 1 6990 __ sshr(g2[0], __ T4S, v28, 1); // gamma2 6991 6992 __ mov(len, zr); 6993 __ add(len, len, 1024); 6994 6995 __ BIND(L_loop); 6996 6997 // load next 4x4S inputs interleaved: rplus --> vs1 6998 __ ld4(vs1[0], vs1[1], vs1[2], vs1[3], __ T4S, __ post(input, 64)); 6999 7000 // rplus = rplus - ((rplus + qadd) >> 23) * q 7001 vs_addv(vtmp, __ T4S, vs1, qadd); 7002 vs_sshr(vtmp, __ T4S, vtmp, 23); 7003 vs_mulv(vtmp, __ T4S, vtmp, q); 7004 vs_subv(vs1, __ T4S, vs1, vtmp); 7005 7006 // rplus = rplus + ((rplus >> 31) & dilithium_q); 7007 vs_sshr(vtmp, __ T4S, vs1, 31); 7008 vs_andr(vtmp, vtmp, q); 7009 vs_addv(vs1, __ T4S, vs1, vtmp); 7010 7011 // quotient --> vs2 7012 // int quotient = (rplus * multiplier) >> 22; 7013 vs_mulv(vtmp, __ T4S, vs1, mult); 7014 vs_sshr(vs2, __ T4S, vtmp, 22); 7015 7016 // r0 --> vs3 7017 // int r0 = rplus - quotient * twoGamma2; 7018 vs_mulv(vtmp, __ T4S, vs2, twog2); 7019 vs_subv(vs3, __ T4S, vs1, vtmp); 7020 7021 // mask --> vs4 7022 // int mask = (twoGamma2 - r0) >> 22; 7023 vs_subv(vtmp, __ T4S, twog2, vs3); 7024 vs_sshr(vs4, __ T4S, vtmp, 22); 7025 7026 // r0 -= (mask & twoGamma2); 7027 vs_andr(vtmp, vs4, twog2); 7028 vs_subv(vs3, __ T4S, vs3, vtmp); 7029 7030 // quotient += (mask & 1); 7031 vs_andr(vtmp, vs4, one); 7032 vs_addv(vs2, __ T4S, vs2, vtmp); 7033 7034 // mask = (twoGamma2 / 2 - r0) >> 31; 7035 vs_subv(vtmp, __ T4S, g2, vs3); 7036 vs_sshr(vs4, __ T4S, vtmp, 31); 7037 7038 // r0 -= (mask & twoGamma2); 7039 vs_andr(vtmp, vs4, twog2); 7040 
vs_subv(vs3, __ T4S, vs3, vtmp); 7041 7042 // quotient += (mask & 1); 7043 vs_andr(vtmp, vs4, one); 7044 vs_addv(vs2, __ T4S, vs2, vtmp); 7045 7046 // r1 --> vs5 7047 // int r1 = rplus - r0 - (dilithium_q - 1); 7048 vs_subv(vtmp, __ T4S, vs1, vs3); 7049 vs_subv(vs5, __ T4S, vtmp, qminus1); 7050 7051 // r1 --> vs1 (overwriting rplus) 7052 // r1 = (r1 | (-r1)) >> 31; // 0 if rplus - r0 == (dilithium_q - 1), -1 otherwise 7053 vs_negr(vtmp, __ T4S, vs5); 7054 vs_orr(vtmp, vs5, vtmp); 7055 vs_sshr(vs1, __ T4S, vtmp, 31); 7056 7057 // r0 += ~r1; 7058 vs_notr(vtmp, vs1); 7059 vs_addv(vs3, __ T4S, vs3, vtmp); 7060 7061 // r1 = r1 & quotient; 7062 vs_andr(vs1, vs2, vs1); 7063 7064 // store results inteleaved 7065 // lowPart[m] = r0; 7066 // highPart[m] = r1; 7067 __ st4(vs3[0], vs3[1], vs3[2], vs3[3], __ T4S, __ post(lowPart, 64)); 7068 __ st4(vs1[0], vs1[1], vs1[2], vs1[3], __ T4S, __ post(highPart, 64)); 7069 7070 __ sub(len, len, 64); 7071 __ cmp(len, (u1)64); 7072 __ br(Assembler::GE, L_loop); 7073 7074 // restore callee-saved vector registers 7075 __ ldpd(v14, v15, Address(sp, 48)); 7076 __ ldpd(v12, v13, Address(sp, 32)); 7077 __ ldpd(v10, v11, Address(sp, 16)); 7078 __ ldpd(v8, v9, __ post(sp, 64)); 7079 7080 __ leave(); // required for proper stackwalking of RuntimeStub frame 7081 __ mov(r0, zr); // return 0 7082 __ ret(lr); 7083 7084 return start; 7085 } 7086 7087 void bcax5(Register a0, Register a1, Register a2, Register a3, Register a4, 7088 Register tmp0, Register tmp1, Register tmp2) { 7089 __ bic(tmp0, a2, a1); // for a0 7090 __ bic(tmp1, a3, a2); // for a1 7091 __ bic(tmp2, a4, a3); // for a2 7092 __ eor(a2, a2, tmp2); 7093 __ bic(tmp2, a0, a4); // for a3 7094 __ eor(a3, a3, tmp2); 7095 __ bic(tmp2, a1, a0); // for a4 7096 __ eor(a0, a0, tmp0); 7097 __ eor(a1, a1, tmp1); 7098 __ eor(a4, a4, tmp2); 7099 } 7100 7101 void keccak_round_gpr(bool can_use_fp, bool can_use_r18, Register rc, 7102 Register a0, Register a1, Register a2, Register a3, Register a4, 7103 Register a5, Register a6, Register a7, Register a8, Register a9, 7104 Register a10, Register a11, Register a12, Register a13, Register a14, 7105 Register a15, Register a16, Register a17, Register a18, Register a19, 7106 Register a20, Register a21, Register a22, Register a23, Register a24, 7107 Register tmp0, Register tmp1, Register tmp2) { 7108 __ eor3(tmp1, a4, a9, a14); 7109 __ eor3(tmp0, tmp1, a19, a24); // tmp0 = a4^a9^a14^a19^a24 = c4 7110 __ eor3(tmp2, a1, a6, a11); 7111 __ eor3(tmp1, tmp2, a16, a21); // tmp1 = a1^a6^a11^a16^a21 = c1 7112 __ rax1(tmp2, tmp0, tmp1); // d0 7113 { 7114 7115 Register tmp3, tmp4; 7116 if (can_use_fp && can_use_r18) { 7117 tmp3 = rfp; 7118 tmp4 = r18_tls; 7119 } else { 7120 tmp3 = a4; 7121 tmp4 = a9; 7122 __ stp(tmp3, tmp4, __ pre(sp, -16)); 7123 } 7124 7125 __ eor3(tmp3, a0, a5, a10); 7126 __ eor3(tmp4, tmp3, a15, a20); // tmp4 = a0^a5^a10^a15^a20 = c0 7127 __ eor(a0, a0, tmp2); 7128 __ eor(a5, a5, tmp2); 7129 __ eor(a10, a10, tmp2); 7130 __ eor(a15, a15, tmp2); 7131 __ eor(a20, a20, tmp2); // d0(tmp2) 7132 __ eor3(tmp3, a2, a7, a12); 7133 __ eor3(tmp2, tmp3, a17, a22); // tmp2 = a2^a7^a12^a17^a22 = c2 7134 __ rax1(tmp3, tmp4, tmp2); // d1 7135 __ eor(a1, a1, tmp3); 7136 __ eor(a6, a6, tmp3); 7137 __ eor(a11, a11, tmp3); 7138 __ eor(a16, a16, tmp3); 7139 __ eor(a21, a21, tmp3); // d1(tmp3) 7140 __ rax1(tmp3, tmp2, tmp0); // d3 7141 __ eor3(tmp2, a3, a8, a13); 7142 __ eor3(tmp0, tmp2, a18, a23); // tmp0 = a3^a8^a13^a18^a23 = c3 7143 __ eor(a3, a3, tmp3); 7144 __ eor(a8, a8, tmp3); 7145 __ eor(a13, 
a13, tmp3); 7146 __ eor(a18, a18, tmp3); 7147 __ eor(a23, a23, tmp3); 7148 __ rax1(tmp2, tmp1, tmp0); // d2 7149 __ eor(a2, a2, tmp2); 7150 __ eor(a7, a7, tmp2); 7151 __ eor(a12, a12, tmp2); 7152 __ rax1(tmp0, tmp0, tmp4); // d4 7153 if (!can_use_fp || !can_use_r18) { 7154 __ ldp(tmp3, tmp4, __ post(sp, 16)); 7155 } 7156 __ eor(a17, a17, tmp2); 7157 __ eor(a22, a22, tmp2); 7158 __ eor(a4, a4, tmp0); 7159 __ eor(a9, a9, tmp0); 7160 __ eor(a14, a14, tmp0); 7161 __ eor(a19, a19, tmp0); 7162 __ eor(a24, a24, tmp0); 7163 } 7164 7165 __ rol(tmp0, a10, 3); 7166 __ rol(a10, a1, 1); 7167 __ rol(a1, a6, 44); 7168 __ rol(a6, a9, 20); 7169 __ rol(a9, a22, 61); 7170 __ rol(a22, a14, 39); 7171 __ rol(a14, a20, 18); 7172 __ rol(a20, a2, 62); 7173 __ rol(a2, a12, 43); 7174 __ rol(a12, a13, 25); 7175 __ rol(a13, a19, 8) ; 7176 __ rol(a19, a23, 56); 7177 __ rol(a23, a15, 41); 7178 __ rol(a15, a4, 27); 7179 __ rol(a4, a24, 14); 7180 __ rol(a24, a21, 2); 7181 __ rol(a21, a8, 55); 7182 __ rol(a8, a16, 45); 7183 __ rol(a16, a5, 36); 7184 __ rol(a5, a3, 28); 7185 __ rol(a3, a18, 21); 7186 __ rol(a18, a17, 15); 7187 __ rol(a17, a11, 10); 7188 __ rol(a11, a7, 6); 7189 __ mov(a7, tmp0); 7190 7191 bcax5(a0, a1, a2, a3, a4, tmp0, tmp1, tmp2); 7192 bcax5(a5, a6, a7, a8, a9, tmp0, tmp1, tmp2); 7193 bcax5(a10, a11, a12, a13, a14, tmp0, tmp1, tmp2); 7194 bcax5(a15, a16, a17, a18, a19, tmp0, tmp1, tmp2); 7195 bcax5(a20, a21, a22, a23, a24, tmp0, tmp1, tmp2); 7196 7197 __ ldr(tmp1, __ post(rc, 8)); 7198 __ eor(a0, a0, tmp1); 7199 7200 } 7201 7202 // Arguments: 7203 // 7204 // Inputs: 7205 // c_rarg0 - byte[] source+offset 7206 // c_rarg1 - byte[] SHA.state 7207 // c_rarg2 - int block_size 7208 // c_rarg3 - int offset 7209 // c_rarg4 - int limit 7210 // 7211 address generate_sha3_implCompress_gpr(StubId stub_id) { 7212 bool multi_block; 7213 switch (stub_id) { 7214 case StubId::stubgen_sha3_implCompress_id: 7215 multi_block = false; 7216 break; 7217 case StubId::stubgen_sha3_implCompressMB_id: 7218 multi_block = true; 7219 break; 7220 default: 7221 ShouldNotReachHere(); 7222 } 7223 7224 static const uint64_t round_consts[24] = { 7225 0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL, 7226 0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L, 7227 0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL, 7228 0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL, 7229 0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L, 7230 0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L, 7231 0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L, 7232 0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L 7233 }; 7234 7235 __ align(CodeEntryAlignment); 7236 StubCodeMark mark(this, stub_id); 7237 address start = __ pc(); 7238 7239 Register buf = c_rarg0; 7240 Register state = c_rarg1; 7241 Register block_size = c_rarg2; 7242 Register ofs = c_rarg3; 7243 Register limit = c_rarg4; 7244 7245 // use r3.r17,r19..r28 to keep a0..a24. 
// a0..a24 are respective locals from SHA3.java 7247 Register a0 = r25, 7248 a1 = r26, 7249 a2 = r27, 7250 a3 = r3, 7251 a4 = r4, 7252 a5 = r5, 7253 a6 = r6, 7254 a7 = r7, 7255 a8 = rscratch1, // r8 7256 a9 = rscratch2, // r9 7257 a10 = r10, 7258 a11 = r11, 7259 a12 = r12, 7260 a13 = r13, 7261 a14 = r14, 7262 a15 = r15, 7263 a16 = r16, 7264 a17 = r17, 7265 a18 = r28, 7266 a19 = r19, 7267 a20 = r20, 7268 a21 = r21, 7269 a22 = r22, 7270 a23 = r23, 7271 a24 = r24; 7272 7273 Register tmp0 = block_size, tmp1 = buf, tmp2 = state, tmp3 = r30; 7274 7275 Label sha3_loop, rounds24_preloop, loop_body; 7276 Label sha3_512_or_sha3_384, shake128; 7277 7278 bool can_use_r18 = false; 7279 #ifndef R18_RESERVED 7280 can_use_r18 = true; 7281 #endif 7282 bool can_use_fp = !PreserveFramePointer; 7283 7284 __ enter(); 7285 7286 // save almost all yet unsaved gpr registers on stack 7287 __ str(block_size, __ pre(sp, -128)); 7288 if (multi_block) { 7289 __ stpw(ofs, limit, Address(sp, 8)); 7290 } 7291 // 8 bytes at sp+16 will be used to keep buf 7292 __ stp(r19, r20, Address(sp, 32)); 7293 __ stp(r21, r22, Address(sp, 48)); 7294 __ stp(r23, r24, Address(sp, 64)); 7295 __ stp(r25, r26, Address(sp, 80)); 7296 __ stp(r27, r28, Address(sp, 96)); 7297 if (can_use_r18 && can_use_fp) { 7298 __ stp(r18_tls, state, Address(sp, 112)); 7299 } else { 7300 __ str(state, Address(sp, 112)); 7301 } 7302 7303 // begin sha3 calculations: loading a0..a24 from state array 7304 __ ldp(a0, a1, state); 7305 __ ldp(a2, a3, Address(state, 16)); 7306 __ ldp(a4, a5, Address(state, 32)); 7307 __ ldp(a6, a7, Address(state, 48)); 7308 __ ldp(a8, a9, Address(state, 64)); 7309 __ ldp(a10, a11, Address(state, 80)); 7310 __ ldp(a12, a13, Address(state, 96)); 7311 __ ldp(a14, a15, Address(state, 112)); 7312 __ ldp(a16, a17, Address(state, 128)); 7313 __ ldp(a18, a19, Address(state, 144)); 7314 __ ldp(a20, a21, Address(state, 160)); 7315 __ ldp(a22, a23, Address(state, 176)); 7316 __ ldr(a24, Address(state, 192)); 7317 7318 __ BIND(sha3_loop); 7319 7320 // load input 7321 __ ldp(tmp3, tmp2, __ post(buf, 16)); 7322 __ eor(a0, a0, tmp3); 7323 __ eor(a1, a1, tmp2); 7324 __ ldp(tmp3, tmp2, __ post(buf, 16)); 7325 __ eor(a2, a2, tmp3); 7326 __ eor(a3, a3, tmp2); 7327 __ ldp(tmp3, tmp2, __ post(buf, 16)); 7328 __ eor(a4, a4, tmp3); 7329 __ eor(a5, a5, tmp2); 7330 __ ldr(tmp3, __ post(buf, 8)); 7331 __ eor(a6, a6, tmp3); 7332 7333 // block_size == 72, SHA3-512; block_size == 104, SHA3-384 7334 __ tbz(block_size, 7, sha3_512_or_sha3_384); 7335 7336 __ ldp(tmp3, tmp2, __ post(buf, 16)); 7337 __ eor(a7, a7, tmp3); 7338 __ eor(a8, a8, tmp2); 7339 __ ldp(tmp3, tmp2, __ post(buf, 16)); 7340 __ eor(a9, a9, tmp3); 7341 __ eor(a10, a10, tmp2); 7342 __ ldp(tmp3, tmp2, __ post(buf, 16)); 7343 __ eor(a11, a11, tmp3); 7344 __ eor(a12, a12, tmp2); 7345 __ ldp(tmp3, tmp2, __ post(buf, 16)); 7346 __ eor(a13, a13, tmp3); 7347 __ eor(a14, a14, tmp2); 7348 __ ldp(tmp3, tmp2, __ post(buf, 16)); 7349 __ eor(a15, a15, tmp3); 7350 __ eor(a16, a16, tmp2); 7351 7352 // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256 7353 __ andw(tmp2, block_size, 48); 7354 __ cbzw(tmp2, rounds24_preloop); 7355 __ tbnz(block_size, 5, shake128); 7356 // block_size == 144, bit5 == 0, SHA3-224 7357 __ ldr(tmp3, __ post(buf, 8)); 7358 __ eor(a17, a17, tmp3); 7359 __ b(rounds24_preloop); 7360 7361 __ BIND(shake128); 7362 __ ldp(tmp3, tmp2, __ post(buf, 16)); 7363 __ eor(a17, a17, tmp3); 7364 __ eor(a18, a18, tmp2); 7365 __ ldp(tmp3, tmp2, __ post(buf, 16)); 7366 __ eor(a19, a19,
tmp3); 7367 __ eor(a20, a20, tmp2); 7368 __ b(rounds24_preloop); // block_size == 168, SHAKE128 7369 7370 __ BIND(sha3_512_or_sha3_384); 7371 __ ldp(tmp3, tmp2, __ post(buf, 16)); 7372 __ eor(a7, a7, tmp3); 7373 __ eor(a8, a8, tmp2); 7374 __ tbz(block_size, 5, rounds24_preloop); // SHA3-512 7375 7376 // SHA3-384 7377 __ ldp(tmp3, tmp2, __ post(buf, 16)); 7378 __ eor(a9, a9, tmp3); 7379 __ eor(a10, a10, tmp2); 7380 __ ldp(tmp3, tmp2, __ post(buf, 16)); 7381 __ eor(a11, a11, tmp3); 7382 __ eor(a12, a12, tmp2); 7383 7384 __ BIND(rounds24_preloop); 7385 __ fmovs(v0, 24.0); // float loop counter, 7386 __ fmovs(v1, 1.0); // exact representation 7387 7388 __ str(buf, Address(sp, 16)); 7389 __ lea(tmp3, ExternalAddress((address) round_consts)); 7390 7391 __ BIND(loop_body); 7392 keccak_round_gpr(can_use_fp, can_use_r18, tmp3, 7393 a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, 7394 a13, a14, a15, a16, a17, a18, a19, a20, a21, a22, a23, a24, 7395 tmp0, tmp1, tmp2); 7396 __ fsubs(v0, v0, v1); 7397 __ fcmps(v0, 0.0); 7398 __ br(__ NE, loop_body); 7399 7400 if (multi_block) { 7401 __ ldrw(block_size, sp); // block_size 7402 __ ldpw(tmp2, tmp1, Address(sp, 8)); // offset, limit 7403 __ addw(tmp2, tmp2, block_size); 7404 __ cmpw(tmp2, tmp1); 7405 __ strw(tmp2, Address(sp, 8)); // store offset in case we're jumping 7406 __ ldr(buf, Address(sp, 16)); // restore buf in case we're jumping 7407 __ br(Assembler::LE, sha3_loop); 7408 __ movw(c_rarg0, tmp2); // return offset 7409 } 7410 if (can_use_fp && can_use_r18) { 7411 __ ldp(r18_tls, state, Address(sp, 112)); 7412 } else { 7413 __ ldr(state, Address(sp, 112)); 7414 } 7415 // save calculated sha3 state 7416 __ stp(a0, a1, Address(state)); 7417 __ stp(a2, a3, Address(state, 16)); 7418 __ stp(a4, a5, Address(state, 32)); 7419 __ stp(a6, a7, Address(state, 48)); 7420 __ stp(a8, a9, Address(state, 64)); 7421 __ stp(a10, a11, Address(state, 80)); 7422 __ stp(a12, a13, Address(state, 96)); 7423 __ stp(a14, a15, Address(state, 112)); 7424 __ stp(a16, a17, Address(state, 128)); 7425 __ stp(a18, a19, Address(state, 144)); 7426 __ stp(a20, a21, Address(state, 160)); 7427 __ stp(a22, a23, Address(state, 176)); 7428 __ str(a24, Address(state, 192)); 7429 7430 // restore required registers from stack 7431 __ ldp(r19, r20, Address(sp, 32)); 7432 __ ldp(r21, r22, Address(sp, 48)); 7433 __ ldp(r23, r24, Address(sp, 64)); 7434 __ ldp(r25, r26, Address(sp, 80)); 7435 __ ldp(r27, r28, Address(sp, 96)); 7436 if (can_use_fp && can_use_r18) { 7437 __ add(rfp, sp, 128); // leave() will copy rfp to sp below 7438 } // else no need to recalculate rfp, since it wasn't changed 7439 7440 __ leave(); 7441 7442 __ ret(lr); 7443 7444 return start; 7445 } 7446 7447 /** 7448 * Arguments: 7449 * 7450 * Inputs: 7451 * c_rarg0 - int crc 7452 * c_rarg1 - byte* buf 7453 * c_rarg2 - int length 7454 * 7455 * Output: 7456 * rax - int crc result 7457 */ 7458 address generate_updateBytesCRC32() { 7459 assert(UseCRC32Intrinsics, "what are we doing here?"); 7460 7461 __ align(CodeEntryAlignment); 7462 StubId stub_id = StubId::stubgen_updateBytesCRC32_id; 7463 StubCodeMark mark(this, stub_id); 7464 7465 address start = __ pc(); 7466 7467 const Register crc = c_rarg0; // crc 7468 const Register buf = c_rarg1; // source java byte array address 7469 const Register len = c_rarg2; // length 7470 const Register table0 = c_rarg3; // crc_table address 7471 const Register table1 = c_rarg4; 7472 const Register table2 = c_rarg5; 7473 const Register table3 = c_rarg6; 7474 const Register tmp3 = c_rarg7; 
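  // For background only: the value computed by kernel_crc32 below is the
  // standard reflected CRC-32 (polynomial 0xEDB88320), evaluated with
  // lookup tables / hardware CRC32 instructions. A minimal bit-serial
  // model of one buffer pass is sketched here (the pre-/post-inversion
  // convention applied to the crc value is handled outside this sketch):
  //
  //   uint32_t crc32_model(uint32_t crc, const uint8_t* buf, int len) {
  //     for (int i = 0; i < len; i++) {
  //       crc ^= buf[i];
  //       for (int b = 0; b < 8; b++) {
  //         crc = (crc >> 1) ^ (0xEDB88320u & (0u - (crc & 1u)));
  //       }
  //     }
  //     return crc;
  //   }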
7475 7476 BLOCK_COMMENT("Entry:"); 7477 __ enter(); // required for proper stackwalking of RuntimeStub frame 7478 7479 __ kernel_crc32(crc, buf, len, 7480 table0, table1, table2, table3, rscratch1, rscratch2, tmp3); 7481 7482 __ leave(); // required for proper stackwalking of RuntimeStub frame 7483 __ ret(lr); 7484 7485 return start; 7486 } 7487 7488 /** 7489 * Arguments: 7490 * 7491 * Inputs: 7492 * c_rarg0 - int crc 7493 * c_rarg1 - byte* buf 7494 * c_rarg2 - int length 7495 * c_rarg3 - int* table 7496 * 7497 * Output: 7498 * r0 - int crc result 7499 */ 7500 address generate_updateBytesCRC32C() { 7501 assert(UseCRC32CIntrinsics, "what are we doing here?"); 7502 7503 __ align(CodeEntryAlignment); 7504 StubId stub_id = StubId::stubgen_updateBytesCRC32C_id; 7505 StubCodeMark mark(this, stub_id); 7506 7507 address start = __ pc(); 7508 7509 const Register crc = c_rarg0; // crc 7510 const Register buf = c_rarg1; // source java byte array address 7511 const Register len = c_rarg2; // length 7512 const Register table0 = c_rarg3; // crc_table address 7513 const Register table1 = c_rarg4; 7514 const Register table2 = c_rarg5; 7515 const Register table3 = c_rarg6; 7516 const Register tmp3 = c_rarg7; 7517 7518 BLOCK_COMMENT("Entry:"); 7519 __ enter(); // required for proper stackwalking of RuntimeStub frame 7520 7521 __ kernel_crc32c(crc, buf, len, 7522 table0, table1, table2, table3, rscratch1, rscratch2, tmp3); 7523 7524 __ leave(); // required for proper stackwalking of RuntimeStub frame 7525 __ ret(lr); 7526 7527 return start; 7528 } 7529 7530 /*** 7531 * Arguments: 7532 * 7533 * Inputs: 7534 * c_rarg0 - int adler 7535 * c_rarg1 - byte* buff 7536 * c_rarg2 - int len 7537 * 7538 * Output: 7539 * c_rarg0 - int adler result 7540 */ 7541 address generate_updateBytesAdler32() { 7542 __ align(CodeEntryAlignment); 7543 StubId stub_id = StubId::stubgen_updateBytesAdler32_id; 7544 StubCodeMark mark(this, stub_id); 7545 address start = __ pc(); 7546 7547 Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1; 7548 7549 // Aliases 7550 Register adler = c_rarg0; 7551 Register s1 = c_rarg0; 7552 Register s2 = c_rarg3; 7553 Register buff = c_rarg1; 7554 Register len = c_rarg2; 7555 Register nmax = r4; 7556 Register base = r5; 7557 Register count = r6; 7558 Register temp0 = rscratch1; 7559 Register temp1 = rscratch2; 7560 FloatRegister vbytes = v0; 7561 FloatRegister vs1acc = v1; 7562 FloatRegister vs2acc = v2; 7563 FloatRegister vtable = v3; 7564 7565 // Max number of bytes we can process before having to take the mod 7566 // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 7567 uint64_t BASE = 0xfff1; 7568 uint64_t NMAX = 0x15B0; 7569 7570 __ mov(base, BASE); 7571 __ mov(nmax, NMAX); 7572 7573 // Load accumulation coefficients for the upper 16 bits 7574 __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table)); 7575 __ ld1(vtable, __ T16B, Address(temp0)); 7576 7577 // s1 is initialized to the lower 16 bits of adler 7578 // s2 is initialized to the upper 16 bits of adler 7579 __ ubfx(s2, adler, 16, 16); // s2 = ((adler >> 16) & 0xffff) 7580 __ uxth(s1, adler); // s1 = (adler & 0xffff) 7581 7582 // The pipelined loop needs at least 16 elements for 1 iteration 7583 // It does check this, but it is more effective to skip to the cleanup loop 7584 __ cmp(len, (u1)16); 7585 __ br(Assembler::HS, L_nmax); 7586 __ cbz(len, L_combine); 7587 7588 __ bind(L_simple_by1_loop); 7589 __ ldrb(temp0, Address(__ 
post(buff, 1))); 7590 __ add(s1, s1, temp0); 7591 __ add(s2, s2, s1); 7592 __ subs(len, len, 1); 7593 __ br(Assembler::HI, L_simple_by1_loop); 7594 7595 // s1 = s1 % BASE 7596 __ subs(temp0, s1, base); 7597 __ csel(s1, temp0, s1, Assembler::HS); 7598 7599 // s2 = s2 % BASE 7600 __ lsr(temp0, s2, 16); 7601 __ lsl(temp1, temp0, 4); 7602 __ sub(temp1, temp1, temp0); 7603 __ add(s2, temp1, s2, ext::uxth); 7604 7605 __ subs(temp0, s2, base); 7606 __ csel(s2, temp0, s2, Assembler::HS); 7607 7608 __ b(L_combine); 7609 7610 __ bind(L_nmax); 7611 __ subs(len, len, nmax); 7612 __ sub(count, nmax, 16); 7613 __ br(Assembler::LO, L_by16); 7614 7615 __ bind(L_nmax_loop); 7616 7617 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1, 7618 vbytes, vs1acc, vs2acc, vtable); 7619 7620 __ subs(count, count, 16); 7621 __ br(Assembler::HS, L_nmax_loop); 7622 7623 // s1 = s1 % BASE 7624 __ lsr(temp0, s1, 16); 7625 __ lsl(temp1, temp0, 4); 7626 __ sub(temp1, temp1, temp0); 7627 __ add(temp1, temp1, s1, ext::uxth); 7628 7629 __ lsr(temp0, temp1, 16); 7630 __ lsl(s1, temp0, 4); 7631 __ sub(s1, s1, temp0); 7632 __ add(s1, s1, temp1, ext:: uxth); 7633 7634 __ subs(temp0, s1, base); 7635 __ csel(s1, temp0, s1, Assembler::HS); 7636 7637 // s2 = s2 % BASE 7638 __ lsr(temp0, s2, 16); 7639 __ lsl(temp1, temp0, 4); 7640 __ sub(temp1, temp1, temp0); 7641 __ add(temp1, temp1, s2, ext::uxth); 7642 7643 __ lsr(temp0, temp1, 16); 7644 __ lsl(s2, temp0, 4); 7645 __ sub(s2, s2, temp0); 7646 __ add(s2, s2, temp1, ext:: uxth); 7647 7648 __ subs(temp0, s2, base); 7649 __ csel(s2, temp0, s2, Assembler::HS); 7650 7651 __ subs(len, len, nmax); 7652 __ sub(count, nmax, 16); 7653 __ br(Assembler::HS, L_nmax_loop); 7654 7655 __ bind(L_by16); 7656 __ adds(len, len, count); 7657 __ br(Assembler::LO, L_by1); 7658 7659 __ bind(L_by16_loop); 7660 7661 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1, 7662 vbytes, vs1acc, vs2acc, vtable); 7663 7664 __ subs(len, len, 16); 7665 __ br(Assembler::HS, L_by16_loop); 7666 7667 __ bind(L_by1); 7668 __ adds(len, len, 15); 7669 __ br(Assembler::LO, L_do_mod); 7670 7671 __ bind(L_by1_loop); 7672 __ ldrb(temp0, Address(__ post(buff, 1))); 7673 __ add(s1, temp0, s1); 7674 __ add(s2, s2, s1); 7675 __ subs(len, len, 1); 7676 __ br(Assembler::HS, L_by1_loop); 7677 7678 __ bind(L_do_mod); 7679 // s1 = s1 % BASE 7680 __ lsr(temp0, s1, 16); 7681 __ lsl(temp1, temp0, 4); 7682 __ sub(temp1, temp1, temp0); 7683 __ add(temp1, temp1, s1, ext::uxth); 7684 7685 __ lsr(temp0, temp1, 16); 7686 __ lsl(s1, temp0, 4); 7687 __ sub(s1, s1, temp0); 7688 __ add(s1, s1, temp1, ext:: uxth); 7689 7690 __ subs(temp0, s1, base); 7691 __ csel(s1, temp0, s1, Assembler::HS); 7692 7693 // s2 = s2 % BASE 7694 __ lsr(temp0, s2, 16); 7695 __ lsl(temp1, temp0, 4); 7696 __ sub(temp1, temp1, temp0); 7697 __ add(temp1, temp1, s2, ext::uxth); 7698 7699 __ lsr(temp0, temp1, 16); 7700 __ lsl(s2, temp0, 4); 7701 __ sub(s2, s2, temp0); 7702 __ add(s2, s2, temp1, ext:: uxth); 7703 7704 __ subs(temp0, s2, base); 7705 __ csel(s2, temp0, s2, Assembler::HS); 7706 7707 // Combine lower bits and higher bits 7708 __ bind(L_combine); 7709 __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16) 7710 7711 __ ret(lr); 7712 7713 return start; 7714 } 7715 7716 void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff, 7717 Register temp0, Register temp1, FloatRegister vbytes, 7718 FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) { 7719 // Below is a vectorized implementation of updating s1 and 
s2 for 16 bytes. 7720 // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration. 7721 // In non-vectorized code, we update s1 and s2 as: 7722 // s1 <- s1 + b1 7723 // s2 <- s2 + s1 7724 // s1 <- s1 + b2 7725 // s2 <- s2 + s1 7726 // ... 7727 // s1 <- s1 + b16 7728 // s2 <- s2 + s1 7729 // Putting the above assignments together, we have: 7730 // s1_new = s1 + b1 + b2 + ... + b16 7731 // s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16) 7732 // = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1) 7733 // = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1) 7734 __ ld1(vbytes, __ T16B, Address(__ post(buff, 16))); 7735 7736 // s2 = s2 + s1 * 16 7737 __ add(s2, s2, s1, Assembler::LSL, 4); 7738 7739 // vs1acc = b1 + b2 + b3 + ... + b16 7740 // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1) 7741 __ umullv(vs2acc, __ T8B, vtable, vbytes); 7742 __ umlalv(vs2acc, __ T16B, vtable, vbytes); 7743 __ uaddlv(vs1acc, __ T16B, vbytes); 7744 __ uaddlv(vs2acc, __ T8H, vs2acc); 7745 7746 // s1 = s1 + vs1acc, s2 = s2 + vs2acc 7747 __ fmovd(temp0, vs1acc); 7748 __ fmovd(temp1, vs2acc); 7749 __ add(s1, s1, temp0); 7750 __ add(s2, s2, temp1); 7751 } 7752 7753 /** 7754 * Arguments: 7755 * 7756 * Input: 7757 * c_rarg0 - x address 7758 * c_rarg1 - x length 7759 * c_rarg2 - y address 7760 * c_rarg3 - y length 7761 * c_rarg4 - z address 7762 */ 7763 address generate_multiplyToLen() { 7764 __ align(CodeEntryAlignment); 7765 StubId stub_id = StubId::stubgen_multiplyToLen_id; 7766 StubCodeMark mark(this, stub_id); 7767 7768 address start = __ pc(); 7769 const Register x = r0; 7770 const Register xlen = r1; 7771 const Register y = r2; 7772 const Register ylen = r3; 7773 const Register z = r4; 7774 7775 const Register tmp0 = r5; 7776 const Register tmp1 = r10; 7777 const Register tmp2 = r11; 7778 const Register tmp3 = r12; 7779 const Register tmp4 = r13; 7780 const Register tmp5 = r14; 7781 const Register tmp6 = r15; 7782 const Register tmp7 = r16; 7783 7784 BLOCK_COMMENT("Entry:"); 7785 __ enter(); // required for proper stackwalking of RuntimeStub frame 7786 __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 7787 __ leave(); // required for proper stackwalking of RuntimeStub frame 7788 __ ret(lr); 7789 7790 return start; 7791 } 7792 7793 address generate_squareToLen() { 7794 // The squareToLen algorithm for sizes 1..127 described in the Java code works 7795 // faster than multiply_to_len on some CPUs and slower on others, but 7796 // multiply_to_len shows a bit better overall results 7797 __ align(CodeEntryAlignment); 7798 StubId stub_id = StubId::stubgen_squareToLen_id; 7799 StubCodeMark mark(this, stub_id); 7800 address start = __ pc(); 7801 7802 const Register x = r0; 7803 const Register xlen = r1; 7804 const Register z = r2; 7805 const Register y = r4; // == x 7806 const Register ylen = r5; // == xlen 7807 7808 const Register tmp0 = r3; 7809 const Register tmp1 = r10; 7810 const Register tmp2 = r11; 7811 const Register tmp3 = r12; 7812 const Register tmp4 = r13; 7813 const Register tmp5 = r14; 7814 const Register tmp6 = r15; 7815 const Register tmp7 = r16; 7816 7817 RegSet spilled_regs = RegSet::of(y, ylen); 7818 BLOCK_COMMENT("Entry:"); 7819 __ enter(); 7820 __ push(spilled_regs, sp); 7821 __ mov(y, x); 7822 __ mov(ylen, xlen); 7823 __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 7824 __ pop(spilled_regs, sp); 7825 __ leave(); 7826 __ ret(lr); 7827 return start; 7828 } 7829 7830 address
generate_mulAdd() { 7831 __ align(CodeEntryAlignment); 7832 StubId stub_id = StubId::stubgen_mulAdd_id; 7833 StubCodeMark mark(this, stub_id); 7834 7835 address start = __ pc(); 7836 7837 const Register out = r0; 7838 const Register in = r1; 7839 const Register offset = r2; 7840 const Register len = r3; 7841 const Register k = r4; 7842 7843 BLOCK_COMMENT("Entry:"); 7844 __ enter(); 7845 __ mul_add(out, in, offset, len, k); 7846 __ leave(); 7847 __ ret(lr); 7848 7849 return start; 7850 } 7851 7852 // Arguments: 7853 // 7854 // Input: 7855 // c_rarg0 - newArr address 7856 // c_rarg1 - oldArr address 7857 // c_rarg2 - newIdx 7858 // c_rarg3 - shiftCount 7859 // c_rarg4 - numIter 7860 // 7861 address generate_bigIntegerRightShift() { 7862 __ align(CodeEntryAlignment); 7863 StubId stub_id = StubId::stubgen_bigIntegerRightShiftWorker_id; 7864 StubCodeMark mark(this, stub_id); 7865 address start = __ pc(); 7866 7867 Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit; 7868 7869 Register newArr = c_rarg0; 7870 Register oldArr = c_rarg1; 7871 Register newIdx = c_rarg2; 7872 Register shiftCount = c_rarg3; 7873 Register numIter = c_rarg4; 7874 Register idx = numIter; 7875 7876 Register newArrCur = rscratch1; 7877 Register shiftRevCount = rscratch2; 7878 Register oldArrCur = r13; 7879 Register oldArrNext = r14; 7880 7881 FloatRegister oldElem0 = v0; 7882 FloatRegister oldElem1 = v1; 7883 FloatRegister newElem = v2; 7884 FloatRegister shiftVCount = v3; 7885 FloatRegister shiftVRevCount = v4; 7886 7887 __ cbz(idx, Exit); 7888 7889 __ add(newArr, newArr, newIdx, Assembler::LSL, 2); 7890 7891 // left shift count 7892 __ movw(shiftRevCount, 32); 7893 __ subw(shiftRevCount, shiftRevCount, shiftCount); 7894 7895 // numIter too small to allow a 4-words SIMD loop, rolling back 7896 __ cmp(numIter, (u1)4); 7897 __ br(Assembler::LT, ShiftThree); 7898 7899 __ dup(shiftVCount, __ T4S, shiftCount); 7900 __ dup(shiftVRevCount, __ T4S, shiftRevCount); 7901 __ negr(shiftVCount, __ T4S, shiftVCount); 7902 7903 __ BIND(ShiftSIMDLoop); 7904 7905 // Calculate the load addresses 7906 __ sub(idx, idx, 4); 7907 __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2); 7908 __ add(newArrCur, newArr, idx, Assembler::LSL, 2); 7909 __ add(oldArrCur, oldArrNext, 4); 7910 7911 // Load 4 words and process 7912 __ ld1(oldElem0, __ T4S, Address(oldArrCur)); 7913 __ ld1(oldElem1, __ T4S, Address(oldArrNext)); 7914 __ ushl(oldElem0, __ T4S, oldElem0, shiftVCount); 7915 __ ushl(oldElem1, __ T4S, oldElem1, shiftVRevCount); 7916 __ orr(newElem, __ T16B, oldElem0, oldElem1); 7917 __ st1(newElem, __ T4S, Address(newArrCur)); 7918 7919 __ cmp(idx, (u1)4); 7920 __ br(Assembler::LT, ShiftTwoLoop); 7921 __ b(ShiftSIMDLoop); 7922 7923 __ BIND(ShiftTwoLoop); 7924 __ cbz(idx, Exit); 7925 __ cmp(idx, (u1)1); 7926 __ br(Assembler::EQ, ShiftOne); 7927 7928 // Calculate the load addresses 7929 __ sub(idx, idx, 2); 7930 __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2); 7931 __ add(newArrCur, newArr, idx, Assembler::LSL, 2); 7932 __ add(oldArrCur, oldArrNext, 4); 7933 7934 // Load 2 words and process 7935 __ ld1(oldElem0, __ T2S, Address(oldArrCur)); 7936 __ ld1(oldElem1, __ T2S, Address(oldArrNext)); 7937 __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount); 7938 __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount); 7939 __ orr(newElem, __ T8B, oldElem0, oldElem1); 7940 __ st1(newElem, __ T2S, Address(newArrCur)); 7941 __ b(ShiftTwoLoop); 7942 7943 __ BIND(ShiftThree); 7944 __ tbz(idx, 1, ShiftOne); 7945 __ tbz(idx, 0, ShiftTwo); 7946 __ 
ldrw(r10, Address(oldArr, 12)); 7947 __ ldrw(r11, Address(oldArr, 8)); 7948 __ lsrvw(r10, r10, shiftCount); 7949 __ lslvw(r11, r11, shiftRevCount); 7950 __ orrw(r12, r10, r11); 7951 __ strw(r12, Address(newArr, 8)); 7952 7953 __ BIND(ShiftTwo); 7954 __ ldrw(r10, Address(oldArr, 8)); 7955 __ ldrw(r11, Address(oldArr, 4)); 7956 __ lsrvw(r10, r10, shiftCount); 7957 __ lslvw(r11, r11, shiftRevCount); 7958 __ orrw(r12, r10, r11); 7959 __ strw(r12, Address(newArr, 4)); 7960 7961 __ BIND(ShiftOne); 7962 __ ldrw(r10, Address(oldArr, 4)); 7963 __ ldrw(r11, Address(oldArr)); 7964 __ lsrvw(r10, r10, shiftCount); 7965 __ lslvw(r11, r11, shiftRevCount); 7966 __ orrw(r12, r10, r11); 7967 __ strw(r12, Address(newArr)); 7968 7969 __ BIND(Exit); 7970 __ ret(lr); 7971 7972 return start; 7973 } 7974 7975 // Arguments: 7976 // 7977 // Input: 7978 // c_rarg0 - newArr address 7979 // c_rarg1 - oldArr address 7980 // c_rarg2 - newIdx 7981 // c_rarg3 - shiftCount 7982 // c_rarg4 - numIter 7983 // 7984 address generate_bigIntegerLeftShift() { 7985 __ align(CodeEntryAlignment); 7986 StubId stub_id = StubId::stubgen_bigIntegerLeftShiftWorker_id; 7987 StubCodeMark mark(this, stub_id); 7988 address start = __ pc(); 7989 7990 Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit; 7991 7992 Register newArr = c_rarg0; 7993 Register oldArr = c_rarg1; 7994 Register newIdx = c_rarg2; 7995 Register shiftCount = c_rarg3; 7996 Register numIter = c_rarg4; 7997 7998 Register shiftRevCount = rscratch1; 7999 Register oldArrNext = rscratch2; 8000 8001 FloatRegister oldElem0 = v0; 8002 FloatRegister oldElem1 = v1; 8003 FloatRegister newElem = v2; 8004 FloatRegister shiftVCount = v3; 8005 FloatRegister shiftVRevCount = v4; 8006 8007 __ cbz(numIter, Exit); 8008 8009 __ add(oldArrNext, oldArr, 4); 8010 __ add(newArr, newArr, newIdx, Assembler::LSL, 2); 8011 8012 // right shift count 8013 __ movw(shiftRevCount, 32); 8014 __ subw(shiftRevCount, shiftRevCount, shiftCount); 8015 8016 // numIter too small to allow a 4-words SIMD loop, rolling back 8017 __ cmp(numIter, (u1)4); 8018 __ br(Assembler::LT, ShiftThree); 8019 8020 __ dup(shiftVCount, __ T4S, shiftCount); 8021 __ dup(shiftVRevCount, __ T4S, shiftRevCount); 8022 __ negr(shiftVRevCount, __ T4S, shiftVRevCount); 8023 8024 __ BIND(ShiftSIMDLoop); 8025 8026 // load 4 words and process 8027 __ ld1(oldElem0, __ T4S, __ post(oldArr, 16)); 8028 __ ld1(oldElem1, __ T4S, __ post(oldArrNext, 16)); 8029 __ ushl(oldElem0, __ T4S, oldElem0, shiftVCount); 8030 __ ushl(oldElem1, __ T4S, oldElem1, shiftVRevCount); 8031 __ orr(newElem, __ T16B, oldElem0, oldElem1); 8032 __ st1(newElem, __ T4S, __ post(newArr, 16)); 8033 __ sub(numIter, numIter, 4); 8034 8035 __ cmp(numIter, (u1)4); 8036 __ br(Assembler::LT, ShiftTwoLoop); 8037 __ b(ShiftSIMDLoop); 8038 8039 __ BIND(ShiftTwoLoop); 8040 __ cbz(numIter, Exit); 8041 __ cmp(numIter, (u1)1); 8042 __ br(Assembler::EQ, ShiftOne); 8043 8044 // load 2 words and process 8045 __ ld1(oldElem0, __ T2S, __ post(oldArr, 8)); 8046 __ ld1(oldElem1, __ T2S, __ post(oldArrNext, 8)); 8047 __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount); 8048 __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount); 8049 __ orr(newElem, __ T8B, oldElem0, oldElem1); 8050 __ st1(newElem, __ T2S, __ post(newArr, 8)); 8051 __ sub(numIter, numIter, 2); 8052 __ b(ShiftTwoLoop); 8053 8054 __ BIND(ShiftThree); 8055 __ ldrw(r10, __ post(oldArr, 4)); 8056 __ ldrw(r11, __ post(oldArrNext, 4)); 8057 __ lslvw(r10, r10, shiftCount); 8058 __ lsrvw(r11, r11, shiftRevCount); 8059 __ 
orrw(r12, r10, r11); 8060 __ strw(r12, __ post(newArr, 4)); 8061 __ tbz(numIter, 1, Exit); 8062 __ tbz(numIter, 0, ShiftOne); 8063 8064 __ BIND(ShiftTwo); 8065 __ ldrw(r10, __ post(oldArr, 4)); 8066 __ ldrw(r11, __ post(oldArrNext, 4)); 8067 __ lslvw(r10, r10, shiftCount); 8068 __ lsrvw(r11, r11, shiftRevCount); 8069 __ orrw(r12, r10, r11); 8070 __ strw(r12, __ post(newArr, 4)); 8071 8072 __ BIND(ShiftOne); 8073 __ ldrw(r10, Address(oldArr)); 8074 __ ldrw(r11, Address(oldArrNext)); 8075 __ lslvw(r10, r10, shiftCount); 8076 __ lsrvw(r11, r11, shiftRevCount); 8077 __ orrw(r12, r10, r11); 8078 __ strw(r12, Address(newArr)); 8079 8080 __ BIND(Exit); 8081 __ ret(lr); 8082 8083 return start; 8084 } 8085 8086 address generate_count_positives(address &count_positives_long) { 8087 const u1 large_loop_size = 64; 8088 const uint64_t UPPER_BIT_MASK=0x8080808080808080; 8089 int dcache_line = VM_Version::dcache_line_size(); 8090 8091 Register ary1 = r1, len = r2, result = r0; 8092 8093 __ align(CodeEntryAlignment); 8094 8095 StubId stub_id = StubId::stubgen_count_positives_id; 8096 StubCodeMark mark(this, stub_id); 8097 8098 address entry = __ pc(); 8099 8100 __ enter(); 8101 // precondition: a copy of len is already in result 8102 // __ mov(result, len); 8103 8104 Label RET_ADJUST, RET_ADJUST_16, RET_ADJUST_LONG, RET_NO_POP, RET_LEN, ALIGNED, LOOP16, CHECK_16, 8105 LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL; 8106 8107 __ cmp(len, (u1)15); 8108 __ br(Assembler::GT, LEN_OVER_15); 8109 // The only case when execution falls into this code is when pointer is near 8110 // the end of memory page and we have to avoid reading next page 8111 __ add(ary1, ary1, len); 8112 __ subs(len, len, 8); 8113 __ br(Assembler::GT, LEN_OVER_8); 8114 __ ldr(rscratch2, Address(ary1, -8)); 8115 __ sub(rscratch1, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes. 
8116 __ lsrv(rscratch2, rscratch2, rscratch1); 8117 __ tst(rscratch2, UPPER_BIT_MASK); 8118 __ csel(result, zr, result, Assembler::NE); 8119 __ leave(); 8120 __ ret(lr); 8121 __ bind(LEN_OVER_8); 8122 __ ldp(rscratch1, rscratch2, Address(ary1, -16)); 8123 __ sub(len, len, 8); // no data dep., then sub can be executed while loading 8124 __ tst(rscratch2, UPPER_BIT_MASK); 8125 __ br(Assembler::NE, RET_NO_POP); 8126 __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes 8127 __ lsrv(rscratch1, rscratch1, rscratch2); 8128 __ tst(rscratch1, UPPER_BIT_MASK); 8129 __ bind(RET_NO_POP); 8130 __ csel(result, zr, result, Assembler::NE); 8131 __ leave(); 8132 __ ret(lr); 8133 8134 Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10; 8135 const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6; 8136 8137 count_positives_long = __ pc(); // 2nd entry point 8138 8139 __ enter(); 8140 8141 __ bind(LEN_OVER_15); 8142 __ push(spilled_regs, sp); 8143 __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment 8144 __ cbz(rscratch2, ALIGNED); 8145 __ ldp(tmp6, tmp1, Address(ary1)); 8146 __ mov(tmp5, 16); 8147 __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address 8148 __ add(ary1, ary1, rscratch1); 8149 __ orr(tmp6, tmp6, tmp1); 8150 __ tst(tmp6, UPPER_BIT_MASK); 8151 __ br(Assembler::NE, RET_ADJUST); 8152 __ sub(len, len, rscratch1); 8153 8154 __ bind(ALIGNED); 8155 __ cmp(len, large_loop_size); 8156 __ br(Assembler::LT, CHECK_16); 8157 // Perform 16-byte load as early return in pre-loop to handle situation 8158 // when initially aligned large array has negative values at starting bytes, 8159 // so LARGE_LOOP would do 4 reads instead of 1 (in worst case), which is 8160 // slower. Cases with negative bytes further ahead won't be affected that 8161 // much. In fact, it'll be faster due to early loads, less instructions and 8162 // less branches in LARGE_LOOP. 8163 __ ldp(tmp6, tmp1, Address(__ post(ary1, 16))); 8164 __ sub(len, len, 16); 8165 __ orr(tmp6, tmp6, tmp1); 8166 __ tst(tmp6, UPPER_BIT_MASK); 8167 __ br(Assembler::NE, RET_ADJUST_16); 8168 __ cmp(len, large_loop_size); 8169 __ br(Assembler::LT, CHECK_16); 8170 8171 if (SoftwarePrefetchHintDistance >= 0 8172 && SoftwarePrefetchHintDistance >= dcache_line) { 8173 // initial prefetch 8174 __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line)); 8175 } 8176 __ bind(LARGE_LOOP); 8177 if (SoftwarePrefetchHintDistance >= 0) { 8178 __ prfm(Address(ary1, SoftwarePrefetchHintDistance)); 8179 } 8180 // Issue load instructions first, since it can save few CPU/MEM cycles, also 8181 // instead of 4 triples of "orr(...), addr(...);cbnz(...);" (for each ldp) 8182 // better generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...) which saves 3 8183 // instructions per cycle and have less branches, but this approach disables 8184 // early return, thus, all 64 bytes are loaded and checked every time. 
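  // For reference, the ldp/orr/tst cascade below is the word-at-a-time
  // form of the byte check: a 64-bit word contains a byte with its top
  // (sign) bit set iff (word & UPPER_BIT_MASK) != 0. A scalar sketch of
  // one 64-byte iteration:
  //
  //   bool block_has_negative(const uint64_t w[8]) {
  //     uint64_t m = 0;
  //     for (int i = 0; i < 8; i++) {
  //       m |= w[i];                               // fold all words together
  //     }
  //     return (m & 0x8080808080808080ull) != 0;   // some byte is >= 0x80
  //   }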
8185 __ ldp(tmp2, tmp3, Address(ary1)); 8186 __ ldp(tmp4, tmp5, Address(ary1, 16)); 8187 __ ldp(rscratch1, rscratch2, Address(ary1, 32)); 8188 __ ldp(tmp6, tmp1, Address(ary1, 48)); 8189 __ add(ary1, ary1, large_loop_size); 8190 __ sub(len, len, large_loop_size); 8191 __ orr(tmp2, tmp2, tmp3); 8192 __ orr(tmp4, tmp4, tmp5); 8193 __ orr(rscratch1, rscratch1, rscratch2); 8194 __ orr(tmp6, tmp6, tmp1); 8195 __ orr(tmp2, tmp2, tmp4); 8196 __ orr(rscratch1, rscratch1, tmp6); 8197 __ orr(tmp2, tmp2, rscratch1); 8198 __ tst(tmp2, UPPER_BIT_MASK); 8199 __ br(Assembler::NE, RET_ADJUST_LONG); 8200 __ cmp(len, large_loop_size); 8201 __ br(Assembler::GE, LARGE_LOOP); 8202 8203 __ bind(CHECK_16); // small 16-byte load pre-loop 8204 __ cmp(len, (u1)16); 8205 __ br(Assembler::LT, POST_LOOP16); 8206 8207 __ bind(LOOP16); // small 16-byte load loop 8208 __ ldp(tmp2, tmp3, Address(__ post(ary1, 16))); 8209 __ sub(len, len, 16); 8210 __ orr(tmp2, tmp2, tmp3); 8211 __ tst(tmp2, UPPER_BIT_MASK); 8212 __ br(Assembler::NE, RET_ADJUST_16); 8213 __ cmp(len, (u1)16); 8214 __ br(Assembler::GE, LOOP16); // 16-byte load loop end 8215 8216 __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally 8217 __ cmp(len, (u1)8); 8218 __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL); 8219 __ ldr(tmp3, Address(__ post(ary1, 8))); 8220 __ tst(tmp3, UPPER_BIT_MASK); 8221 __ br(Assembler::NE, RET_ADJUST); 8222 __ sub(len, len, 8); 8223 8224 __ bind(POST_LOOP16_LOAD_TAIL); 8225 __ cbz(len, RET_LEN); // Can't shift left by 64 when len==0 8226 __ ldr(tmp1, Address(ary1)); 8227 __ mov(tmp2, 64); 8228 __ sub(tmp4, tmp2, len, __ LSL, 3); 8229 __ lslv(tmp1, tmp1, tmp4); 8230 __ tst(tmp1, UPPER_BIT_MASK); 8231 __ br(Assembler::NE, RET_ADJUST); 8232 // Fallthrough 8233 8234 __ bind(RET_LEN); 8235 __ pop(spilled_regs, sp); 8236 __ leave(); 8237 __ ret(lr); 8238 8239 // difference result - len is the count of guaranteed to be 8240 // positive bytes 8241 8242 __ bind(RET_ADJUST_LONG); 8243 __ add(len, len, (u1)(large_loop_size - 16)); 8244 __ bind(RET_ADJUST_16); 8245 __ add(len, len, 16); 8246 __ bind(RET_ADJUST); 8247 __ pop(spilled_regs, sp); 8248 __ leave(); 8249 __ sub(result, result, len); 8250 __ ret(lr); 8251 8252 return entry; 8253 } 8254 8255 void generate_large_array_equals_loop_nonsimd(int loopThreshold, 8256 bool usePrefetch, Label &NOT_EQUAL) { 8257 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 8258 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11, 8259 tmp7 = r12, tmp8 = r13; 8260 Label LOOP; 8261 8262 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 8263 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 8264 __ bind(LOOP); 8265 if (usePrefetch) { 8266 __ prfm(Address(a1, SoftwarePrefetchHintDistance)); 8267 __ prfm(Address(a2, SoftwarePrefetchHintDistance)); 8268 } 8269 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize))); 8270 __ eor(tmp1, tmp1, tmp2); 8271 __ eor(tmp3, tmp3, tmp4); 8272 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize))); 8273 __ orr(tmp1, tmp1, tmp3); 8274 __ cbnz(tmp1, NOT_EQUAL); 8275 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 8276 __ eor(tmp5, tmp5, tmp6); 8277 __ eor(tmp7, tmp7, tmp8); 8278 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 8279 __ orr(tmp5, tmp5, tmp7); 8280 __ cbnz(tmp5, NOT_EQUAL); 8281 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize))); 8282 __ eor(tmp1, tmp1, tmp2); 8283 __ eor(tmp3, tmp3, tmp4); 8284 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize))); 8285 __ orr(tmp1, tmp1, tmp3); 8286 __ 
cbnz(tmp1, NOT_EQUAL); 8287 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 8288 __ eor(tmp5, tmp5, tmp6); 8289 __ sub(cnt1, cnt1, 8 * wordSize); 8290 __ eor(tmp7, tmp7, tmp8); 8291 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 8292 // tmp6 is not used. MacroAssembler::subs is used here (rather than 8293 // cmp) because subs allows an unlimited range of immediate operand. 8294 __ subs(tmp6, cnt1, loopThreshold); 8295 __ orr(tmp5, tmp5, tmp7); 8296 __ cbnz(tmp5, NOT_EQUAL); 8297 __ br(__ GE, LOOP); 8298 // post-loop 8299 __ eor(tmp1, tmp1, tmp2); 8300 __ eor(tmp3, tmp3, tmp4); 8301 __ orr(tmp1, tmp1, tmp3); 8302 __ sub(cnt1, cnt1, 2 * wordSize); 8303 __ cbnz(tmp1, NOT_EQUAL); 8304 } 8305 8306 void generate_large_array_equals_loop_simd(int loopThreshold, 8307 bool usePrefetch, Label &NOT_EQUAL) { 8308 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 8309 tmp2 = rscratch2; 8310 Label LOOP; 8311 8312 __ bind(LOOP); 8313 if (usePrefetch) { 8314 __ prfm(Address(a1, SoftwarePrefetchHintDistance)); 8315 __ prfm(Address(a2, SoftwarePrefetchHintDistance)); 8316 } 8317 __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize))); 8318 __ sub(cnt1, cnt1, 8 * wordSize); 8319 __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize))); 8320 __ subs(tmp1, cnt1, loopThreshold); 8321 __ eor(v0, __ T16B, v0, v4); 8322 __ eor(v1, __ T16B, v1, v5); 8323 __ eor(v2, __ T16B, v2, v6); 8324 __ eor(v3, __ T16B, v3, v7); 8325 __ orr(v0, __ T16B, v0, v1); 8326 __ orr(v1, __ T16B, v2, v3); 8327 __ orr(v0, __ T16B, v0, v1); 8328 __ umov(tmp1, v0, __ D, 0); 8329 __ umov(tmp2, v0, __ D, 1); 8330 __ orr(tmp1, tmp1, tmp2); 8331 __ cbnz(tmp1, NOT_EQUAL); 8332 __ br(__ GE, LOOP); 8333 } 8334 8335 // a1 = r1 - array1 address 8336 // a2 = r2 - array2 address 8337 // result = r0 - return value. Already contains "false" 8338 // cnt1 = r10 - amount of elements left to check, reduced by wordSize 8339 // r3-r5 are reserved temporary registers 8340 // Clobbers: v0-v7 when UseSIMDForArrayEquals, rscratch1, rscratch2 8341 address generate_large_array_equals() { 8342 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 8343 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11, 8344 tmp7 = r12, tmp8 = r13; 8345 Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP, 8346 SMALL_LOOP, POST_LOOP; 8347 const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16; 8348 // calculate if at least 32 prefetched bytes are used 8349 int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32; 8350 int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE); 8351 RegSet spilled_regs = RegSet::range(tmp6, tmp8); 8352 assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4, 8353 tmp5, tmp6, tmp7, tmp8); 8354 8355 __ align(CodeEntryAlignment); 8356 8357 StubId stub_id = StubId::stubgen_large_array_equals_id; 8358 StubCodeMark mark(this, stub_id); 8359 8360 address entry = __ pc(); 8361 __ enter(); 8362 __ sub(cnt1, cnt1, wordSize); // first 8 bytes were loaded outside of stub 8363 // also advance pointers to use post-increment instead of pre-increment 8364 __ add(a1, a1, wordSize); 8365 __ add(a2, a2, wordSize); 8366 if (AvoidUnalignedAccesses) { 8367 // both implementations (SIMD/nonSIMD) are using relatively large load 8368 // instructions (ld1/ldp), which has huge penalty (up to x2 exec time) 8369 // on some CPUs in case of address is not at least 16-byte aligned. 
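// A sketch (illustrative only) of the alignment fix-up emitted just below:
// when a1 is only 8-byte aligned, compare and consume one 8-byte word from
// each array so that the wide loads which follow see a 16-byte aligned a1.
//   if (((uintptr_t)a1 & 8) != 0) {
//     if (*(uint64_t*)a1 != *(uint64_t*)a2) return false;  // NOT_EQUAL_NO_POP
//     a1 += 8; a2 += 8; cnt1 -= 8;
//   }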
8370 // Arrays are 8-byte aligned currently, so, we can make additional 8-byte 8371 // load if needed at least for 1st address and make if 16-byte aligned. 8372 Label ALIGNED16; 8373 __ tbz(a1, 3, ALIGNED16); 8374 __ ldr(tmp1, Address(__ post(a1, wordSize))); 8375 __ ldr(tmp2, Address(__ post(a2, wordSize))); 8376 __ sub(cnt1, cnt1, wordSize); 8377 __ eor(tmp1, tmp1, tmp2); 8378 __ cbnz(tmp1, NOT_EQUAL_NO_POP); 8379 __ bind(ALIGNED16); 8380 } 8381 if (UseSIMDForArrayEquals) { 8382 if (SoftwarePrefetchHintDistance >= 0) { 8383 __ subs(tmp1, cnt1, prefetchLoopThreshold); 8384 __ br(__ LE, NO_PREFETCH_LARGE_LOOP); 8385 generate_large_array_equals_loop_simd(prefetchLoopThreshold, 8386 /* prfm = */ true, NOT_EQUAL); 8387 __ subs(zr, cnt1, nonPrefetchLoopThreshold); 8388 __ br(__ LT, TAIL); 8389 } 8390 __ bind(NO_PREFETCH_LARGE_LOOP); 8391 generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold, 8392 /* prfm = */ false, NOT_EQUAL); 8393 } else { 8394 __ push(spilled_regs, sp); 8395 if (SoftwarePrefetchHintDistance >= 0) { 8396 __ subs(tmp1, cnt1, prefetchLoopThreshold); 8397 __ br(__ LE, NO_PREFETCH_LARGE_LOOP); 8398 generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold, 8399 /* prfm = */ true, NOT_EQUAL); 8400 __ subs(zr, cnt1, nonPrefetchLoopThreshold); 8401 __ br(__ LT, TAIL); 8402 } 8403 __ bind(NO_PREFETCH_LARGE_LOOP); 8404 generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold, 8405 /* prfm = */ false, NOT_EQUAL); 8406 } 8407 __ bind(TAIL); 8408 __ cbz(cnt1, EQUAL); 8409 __ subs(cnt1, cnt1, wordSize); 8410 __ br(__ LE, POST_LOOP); 8411 __ bind(SMALL_LOOP); 8412 __ ldr(tmp1, Address(__ post(a1, wordSize))); 8413 __ ldr(tmp2, Address(__ post(a2, wordSize))); 8414 __ subs(cnt1, cnt1, wordSize); 8415 __ eor(tmp1, tmp1, tmp2); 8416 __ cbnz(tmp1, NOT_EQUAL); 8417 __ br(__ GT, SMALL_LOOP); 8418 __ bind(POST_LOOP); 8419 __ ldr(tmp1, Address(a1, cnt1)); 8420 __ ldr(tmp2, Address(a2, cnt1)); 8421 __ eor(tmp1, tmp1, tmp2); 8422 __ cbnz(tmp1, NOT_EQUAL); 8423 __ bind(EQUAL); 8424 __ mov(result, true); 8425 __ bind(NOT_EQUAL); 8426 if (!UseSIMDForArrayEquals) { 8427 __ pop(spilled_regs, sp); 8428 } 8429 __ bind(NOT_EQUAL_NO_POP); 8430 __ leave(); 8431 __ ret(lr); 8432 return entry; 8433 } 8434 8435 // result = r0 - return value. Contains initial hashcode value on entry. 
8436 // ary = r1 - array address 8437 // cnt = r2 - elements count 8438 // Clobbers: v0-v13, rscratch1, rscratch2 8439 address generate_large_arrays_hashcode(BasicType eltype) { 8440 const Register result = r0, ary = r1, cnt = r2; 8441 const FloatRegister vdata0 = v3, vdata1 = v2, vdata2 = v1, vdata3 = v0; 8442 const FloatRegister vmul0 = v4, vmul1 = v5, vmul2 = v6, vmul3 = v7; 8443 const FloatRegister vpow = v12; // powers of 31: <31^3, ..., 31^0> 8444 const FloatRegister vpowm = v13; 8445 8446 ARRAYS_HASHCODE_REGISTERS; 8447 8448 Label SMALL_LOOP, LARGE_LOOP_PREHEADER, LARGE_LOOP, TAIL, TAIL_SHORTCUT, BR_BASE; 8449 8450 unsigned int vf; // vectorization factor 8451 bool multiply_by_halves; 8452 Assembler::SIMD_Arrangement load_arrangement; 8453 switch (eltype) { 8454 case T_BOOLEAN: 8455 case T_BYTE: 8456 load_arrangement = Assembler::T8B; 8457 multiply_by_halves = true; 8458 vf = 8; 8459 break; 8460 case T_CHAR: 8461 case T_SHORT: 8462 load_arrangement = Assembler::T8H; 8463 multiply_by_halves = true; 8464 vf = 8; 8465 break; 8466 case T_INT: 8467 load_arrangement = Assembler::T4S; 8468 multiply_by_halves = false; 8469 vf = 4; 8470 break; 8471 default: 8472 ShouldNotReachHere(); 8473 } 8474 8475 // Unroll factor 8476 const unsigned uf = 4; 8477 8478 // Effective vectorization factor 8479 const unsigned evf = vf * uf; 8480 8481 __ align(CodeEntryAlignment); 8482 8483 StubId stub_id; 8484 switch (eltype) { 8485 case T_BOOLEAN: 8486 stub_id = StubId::stubgen_large_arrays_hashcode_boolean_id; 8487 break; 8488 case T_BYTE: 8489 stub_id = StubId::stubgen_large_arrays_hashcode_byte_id; 8490 break; 8491 case T_CHAR: 8492 stub_id = StubId::stubgen_large_arrays_hashcode_char_id; 8493 break; 8494 case T_SHORT: 8495 stub_id = StubId::stubgen_large_arrays_hashcode_short_id; 8496 break; 8497 case T_INT: 8498 stub_id = StubId::stubgen_large_arrays_hashcode_int_id; 8499 break; 8500 default: 8501 stub_id = StubId::NO_STUBID; 8502 ShouldNotReachHere(); 8503 }; 8504 8505 StubCodeMark mark(this, stub_id); 8506 8507 address entry = __ pc(); 8508 __ enter(); 8509 8510 // Put 0-3'th powers of 31 into a single SIMD register together. The register will be used in 8511 // the SMALL and LARGE LOOPS' epilogues. The initialization is hoisted here and the register's 8512 // value shouldn't change throughout both loops. 8513 __ movw(rscratch1, intpow(31U, 3)); 8514 __ mov(vpow, Assembler::S, 0, rscratch1); 8515 __ movw(rscratch1, intpow(31U, 2)); 8516 __ mov(vpow, Assembler::S, 1, rscratch1); 8517 __ movw(rscratch1, intpow(31U, 1)); 8518 __ mov(vpow, Assembler::S, 2, rscratch1); 8519 __ movw(rscratch1, intpow(31U, 0)); 8520 __ mov(vpow, Assembler::S, 3, rscratch1); 8521 8522 __ mov(vmul0, Assembler::T16B, 0); 8523 __ mov(vmul0, Assembler::S, 3, result); 8524 8525 __ andr(rscratch2, cnt, (uf - 1) * vf); 8526 __ cbz(rscratch2, LARGE_LOOP_PREHEADER); 8527 8528 __ movw(rscratch1, intpow(31U, multiply_by_halves ? 
vf / 2 : vf)); 8529 __ mov(vpowm, Assembler::S, 0, rscratch1); 8530 8531 // SMALL LOOP 8532 __ bind(SMALL_LOOP); 8533 8534 __ ld1(vdata0, load_arrangement, Address(__ post(ary, vf * type2aelembytes(eltype)))); 8535 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0); 8536 __ subsw(rscratch2, rscratch2, vf); 8537 8538 if (load_arrangement == Assembler::T8B) { 8539 // Extend 8B to 8H to be able to use vector multiply 8540 // instructions 8541 assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H"); 8542 if (is_signed_subword_type(eltype)) { 8543 __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement); 8544 } else { 8545 __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement); 8546 } 8547 } 8548 8549 switch (load_arrangement) { 8550 case Assembler::T4S: 8551 __ addv(vmul0, load_arrangement, vmul0, vdata0); 8552 break; 8553 case Assembler::T8B: 8554 case Assembler::T8H: 8555 assert(is_subword_type(eltype), "subword type expected"); 8556 if (is_signed_subword_type(eltype)) { 8557 __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H); 8558 } else { 8559 __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H); 8560 } 8561 break; 8562 default: 8563 __ should_not_reach_here(); 8564 } 8565 8566 // Process the upper half of a vector 8567 if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) { 8568 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0); 8569 if (is_signed_subword_type(eltype)) { 8570 __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H); 8571 } else { 8572 __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H); 8573 } 8574 } 8575 8576 __ br(Assembler::HI, SMALL_LOOP); 8577 8578 // SMALL LOOP'S EPILOQUE 8579 __ lsr(rscratch2, cnt, exact_log2(evf)); 8580 __ cbnz(rscratch2, LARGE_LOOP_PREHEADER); 8581 8582 __ mulv(vmul0, Assembler::T4S, vmul0, vpow); 8583 __ addv(vmul0, Assembler::T4S, vmul0); 8584 __ umov(result, vmul0, Assembler::S, 0); 8585 8586 // TAIL 8587 __ bind(TAIL); 8588 8589 // The andr performs cnt % vf. The subtract shifted by 3 offsets past vf - 1 - (cnt % vf) pairs 8590 // of load + madd insns i.e. it only executes cnt % vf load + madd pairs. 8591 assert(is_power_of_2(vf), "can't use this value to calculate the jump target PC"); 8592 __ andr(rscratch2, cnt, vf - 1); 8593 __ bind(TAIL_SHORTCUT); 8594 __ adr(rscratch1, BR_BASE); 8595 // For Cortex-A53 offset is 4 because 2 nops are generated. 8596 __ sub(rscratch1, rscratch1, rscratch2, ext::uxtw, VM_Version::supports_a53mac() ? 4 : 3); 8597 __ movw(rscratch2, 0x1f); 8598 __ br(rscratch1); 8599 8600 for (size_t i = 0; i < vf - 1; ++i) { 8601 __ load(rscratch1, Address(__ post(ary, type2aelembytes(eltype))), 8602 eltype); 8603 __ maddw(result, result, rscratch2, rscratch1); 8604 // maddw generates an extra nop for Cortex-A53 (see maddw definition in macroAssembler). 8605 // Generate 2nd nop to have 4 instructions per iteration. 
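// (Illustrative note: the adr/sub/br sequence above jumps backwards from
// BR_BASE by (cnt % vf) unrolled steps, i.e. roughly
//   target = BR_BASE - (cnt % vf) * bytes_per_step;
// each step below is a load + maddw pair -- 2 instructions (8 bytes, hence
// the shift by 3), or 4 instructions (16 bytes, shift by 4) once the two
// Cortex-A53 nops are counted.)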
8606 if (VM_Version::supports_a53mac()) { 8607 __ nop(); 8608 } 8609 } 8610 __ bind(BR_BASE); 8611 8612 __ leave(); 8613 __ ret(lr); 8614 8615 // LARGE LOOP 8616 __ bind(LARGE_LOOP_PREHEADER); 8617 8618 __ lsr(rscratch2, cnt, exact_log2(evf)); 8619 8620 if (multiply_by_halves) { 8621 // 31^4 - multiplier between lower and upper parts of a register 8622 __ movw(rscratch1, intpow(31U, vf / 2)); 8623 __ mov(vpowm, Assembler::S, 1, rscratch1); 8624 // 31^28 - remainder of the iteraion multiplier, 28 = 32 - 4 8625 __ movw(rscratch1, intpow(31U, evf - vf / 2)); 8626 __ mov(vpowm, Assembler::S, 0, rscratch1); 8627 } else { 8628 // 31^16 8629 __ movw(rscratch1, intpow(31U, evf)); 8630 __ mov(vpowm, Assembler::S, 0, rscratch1); 8631 } 8632 8633 __ mov(vmul3, Assembler::T16B, 0); 8634 __ mov(vmul2, Assembler::T16B, 0); 8635 __ mov(vmul1, Assembler::T16B, 0); 8636 8637 __ bind(LARGE_LOOP); 8638 8639 __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 0); 8640 __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 0); 8641 __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 0); 8642 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0); 8643 8644 __ ld1(vdata3, vdata2, vdata1, vdata0, load_arrangement, 8645 Address(__ post(ary, evf * type2aelembytes(eltype)))); 8646 8647 if (load_arrangement == Assembler::T8B) { 8648 // Extend 8B to 8H to be able to use vector multiply 8649 // instructions 8650 assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H"); 8651 if (is_signed_subword_type(eltype)) { 8652 __ sxtl(vdata3, Assembler::T8H, vdata3, load_arrangement); 8653 __ sxtl(vdata2, Assembler::T8H, vdata2, load_arrangement); 8654 __ sxtl(vdata1, Assembler::T8H, vdata1, load_arrangement); 8655 __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement); 8656 } else { 8657 __ uxtl(vdata3, Assembler::T8H, vdata3, load_arrangement); 8658 __ uxtl(vdata2, Assembler::T8H, vdata2, load_arrangement); 8659 __ uxtl(vdata1, Assembler::T8H, vdata1, load_arrangement); 8660 __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement); 8661 } 8662 } 8663 8664 switch (load_arrangement) { 8665 case Assembler::T4S: 8666 __ addv(vmul3, load_arrangement, vmul3, vdata3); 8667 __ addv(vmul2, load_arrangement, vmul2, vdata2); 8668 __ addv(vmul1, load_arrangement, vmul1, vdata1); 8669 __ addv(vmul0, load_arrangement, vmul0, vdata0); 8670 break; 8671 case Assembler::T8B: 8672 case Assembler::T8H: 8673 assert(is_subword_type(eltype), "subword type expected"); 8674 if (is_signed_subword_type(eltype)) { 8675 __ saddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H); 8676 __ saddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H); 8677 __ saddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H); 8678 __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H); 8679 } else { 8680 __ uaddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H); 8681 __ uaddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H); 8682 __ uaddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H); 8683 __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H); 8684 } 8685 break; 8686 default: 8687 __ should_not_reach_here(); 8688 } 8689 8690 // Process the upper half of a vector 8691 if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) { 8692 __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 1); 8693 __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 1); 8694 __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 1); 8695 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 1); 8696 if (is_signed_subword_type(eltype)) { 8697 __ saddwv2(vmul3, 
vmul3, Assembler::T4S, vdata3, Assembler::T8H); 8698 __ saddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H); 8699 __ saddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H); 8700 __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H); 8701 } else { 8702 __ uaddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H); 8703 __ uaddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H); 8704 __ uaddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H); 8705 __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H); 8706 } 8707 } 8708 8709 __ subsw(rscratch2, rscratch2, 1); 8710 __ br(Assembler::HI, LARGE_LOOP); 8711 8712 __ mulv(vmul3, Assembler::T4S, vmul3, vpow); 8713 __ addv(vmul3, Assembler::T4S, vmul3); 8714 __ umov(result, vmul3, Assembler::S, 0); 8715 8716 __ mov(rscratch2, intpow(31U, vf)); 8717 8718 __ mulv(vmul2, Assembler::T4S, vmul2, vpow); 8719 __ addv(vmul2, Assembler::T4S, vmul2); 8720 __ umov(rscratch1, vmul2, Assembler::S, 0); 8721 __ maddw(result, result, rscratch2, rscratch1); 8722 8723 __ mulv(vmul1, Assembler::T4S, vmul1, vpow); 8724 __ addv(vmul1, Assembler::T4S, vmul1); 8725 __ umov(rscratch1, vmul1, Assembler::S, 0); 8726 __ maddw(result, result, rscratch2, rscratch1); 8727 8728 __ mulv(vmul0, Assembler::T4S, vmul0, vpow); 8729 __ addv(vmul0, Assembler::T4S, vmul0); 8730 __ umov(rscratch1, vmul0, Assembler::S, 0); 8731 __ maddw(result, result, rscratch2, rscratch1); 8732 8733 __ andr(rscratch2, cnt, vf - 1); 8734 __ cbnz(rscratch2, TAIL_SHORTCUT); 8735 8736 __ leave(); 8737 __ ret(lr); 8738 8739 return entry; 8740 } 8741 8742 address generate_dsin_dcos(bool isCos) { 8743 __ align(CodeEntryAlignment); 8744 StubId stub_id = (isCos ? StubId::stubgen_dcos_id : StubId::stubgen_dsin_id); 8745 StubCodeMark mark(this, stub_id); 8746 address start = __ pc(); 8747 __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw, 8748 (address)StubRoutines::aarch64::_two_over_pi, 8749 (address)StubRoutines::aarch64::_pio2, 8750 (address)StubRoutines::aarch64::_dsin_coef, 8751 (address)StubRoutines::aarch64::_dcos_coef); 8752 return start; 8753 } 8754 8755 // code for comparing 16 characters of strings with Latin1 and Utf16 encoding 8756 void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1, 8757 Label &DIFF2) { 8758 Register cnt1 = r2, tmp2 = r11, tmp3 = r12; 8759 FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2; 8760 8761 __ ldrq(vtmp, Address(__ post(tmp2, 16))); 8762 __ ldr(tmpU, Address(__ post(cnt1, 8))); 8763 __ zip1(vtmp3, __ T16B, vtmp, vtmpZ); 8764 // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3 8765 8766 __ fmovd(tmpL, vtmp3); 8767 __ eor(rscratch2, tmp3, tmpL); 8768 __ cbnz(rscratch2, DIFF2); 8769 8770 __ ldr(tmp3, Address(__ post(cnt1, 8))); 8771 __ umov(tmpL, vtmp3, __ D, 1); 8772 __ eor(rscratch2, tmpU, tmpL); 8773 __ cbnz(rscratch2, DIFF1); 8774 8775 __ zip2(vtmp, __ T16B, vtmp, vtmpZ); 8776 __ ldr(tmpU, Address(__ post(cnt1, 8))); 8777 __ fmovd(tmpL, vtmp); 8778 __ eor(rscratch2, tmp3, tmpL); 8779 __ cbnz(rscratch2, DIFF2); 8780 8781 __ ldr(tmp3, Address(__ post(cnt1, 8))); 8782 __ umov(tmpL, vtmp, __ D, 1); 8783 __ eor(rscratch2, tmpU, tmpL); 8784 __ cbnz(rscratch2, DIFF1); 8785 } 8786 8787 // r0 = result 8788 // r1 = str1 8789 // r2 = cnt1 8790 // r3 = str2 8791 // r4 = cnt2 8792 // r10 = tmp1 8793 // r11 = tmp2 8794 address generate_compare_long_string_different_encoding(bool isLU) { 8795 __ align(CodeEntryAlignment); 8796 StubId stub_id = (isLU ? 
StubId::stubgen_compare_long_string_LU_id : StubId::stubgen_compare_long_string_UL_id); 8797 StubCodeMark mark(this, stub_id); 8798 address entry = __ pc(); 8799 Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2, 8800 DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH, 8801 LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2; 8802 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 8803 tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14; 8804 FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2; 8805 RegSet spilled_regs = RegSet::of(tmp3, tmp4); 8806 8807 int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2); 8808 8809 __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ); 8810 // cnt2 == amount of characters left to compare 8811 // Check already loaded first 4 symbols(vtmp and tmp2(LU)/tmp1(UL)) 8812 __ zip1(vtmp, __ T8B, vtmp, vtmpZ); 8813 __ add(str1, str1, isLU ? wordSize/2 : wordSize); 8814 __ add(str2, str2, isLU ? wordSize : wordSize/2); 8815 __ fmovd(isLU ? tmp1 : tmp2, vtmp); 8816 __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case. 8817 __ eor(rscratch2, tmp1, tmp2); 8818 __ mov(rscratch1, tmp2); 8819 __ cbnz(rscratch2, CALCULATE_DIFFERENCE); 8820 Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison 8821 tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison 8822 __ push(spilled_regs, sp); 8823 __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load 8824 __ mov(cnt1, isLU ? str2 : str1); // init the pointer to U next load 8825 8826 __ ldr(tmp3, Address(__ post(cnt1, 8))); 8827 8828 if (SoftwarePrefetchHintDistance >= 0) { 8829 __ subs(rscratch2, cnt2, prefetchLoopExitCondition); 8830 __ br(__ LT, NO_PREFETCH); 8831 __ bind(LARGE_LOOP_PREFETCH); 8832 __ prfm(Address(tmp2, SoftwarePrefetchHintDistance)); 8833 __ mov(tmp4, 2); 8834 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance)); 8835 __ bind(LARGE_LOOP_PREFETCH_REPEAT1); 8836 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 8837 __ subs(tmp4, tmp4, 1); 8838 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1); 8839 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance)); 8840 __ mov(tmp4, 2); 8841 __ bind(LARGE_LOOP_PREFETCH_REPEAT2); 8842 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 8843 __ subs(tmp4, tmp4, 1); 8844 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2); 8845 __ sub(cnt2, cnt2, 64); 8846 __ subs(rscratch2, cnt2, prefetchLoopExitCondition); 8847 __ br(__ GE, LARGE_LOOP_PREFETCH); 8848 } 8849 __ cbz(cnt2, LOAD_LAST); // no characters left except last load 8850 __ bind(NO_PREFETCH); 8851 __ subs(cnt2, cnt2, 16); 8852 __ br(__ LT, TAIL); 8853 __ align(OptoLoopAlignment); 8854 __ bind(SMALL_LOOP); // smaller loop 8855 __ subs(cnt2, cnt2, 16); 8856 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 8857 __ br(__ GE, SMALL_LOOP); 8858 __ cmn(cnt2, (u1)16); 8859 __ br(__ EQ, LOAD_LAST); 8860 __ bind(TAIL); // 1..15 characters left until last load (last 4 characters) 8861 __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string 8862 __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string 8863 __ ldr(tmp3, Address(cnt1, -8)); 8864 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load 8865 __ b(LOAD_LAST); 8866 __ bind(DIFF2); 8867 __ mov(tmpU, tmp3); 8868 __ bind(DIFF1); 8869 __ pop(spilled_regs, sp); 8870 __ b(CALCULATE_DIFFERENCE); 8871 __ bind(LOAD_LAST); 8872 // Last 4 UTF-16 characters are already pre-loaded into tmp3 by 
compare_string_16_x_LU. 8873 // No need to load it again 8874 __ mov(tmpU, tmp3); 8875 __ pop(spilled_regs, sp); 8876 8877 // tmp2 points to the address of the last 4 Latin1 characters right now 8878 __ ldrs(vtmp, Address(tmp2)); 8879 __ zip1(vtmp, __ T8B, vtmp, vtmpZ); 8880 __ fmovd(tmpL, vtmp); 8881 8882 __ eor(rscratch2, tmpU, tmpL); 8883 __ cbz(rscratch2, DONE); 8884 8885 // Find the first different characters in the longwords and 8886 // compute their difference. 8887 __ bind(CALCULATE_DIFFERENCE); 8888 __ rev(rscratch2, rscratch2); 8889 __ clz(rscratch2, rscratch2); 8890 __ andr(rscratch2, rscratch2, -16); 8891 __ lsrv(tmp1, tmp1, rscratch2); 8892 __ uxthw(tmp1, tmp1); 8893 __ lsrv(rscratch1, rscratch1, rscratch2); 8894 __ uxthw(rscratch1, rscratch1); 8895 __ subw(result, tmp1, rscratch1); 8896 __ bind(DONE); 8897 __ ret(lr); 8898 return entry; 8899 } 8900 8901 // r0 = input (float16) 8902 // v0 = result (float) 8903 // v1 = temporary float register 8904 address generate_float16ToFloat() { 8905 __ align(CodeEntryAlignment); 8906 StubId stub_id = StubId::stubgen_hf2f_id; 8907 StubCodeMark mark(this, stub_id); 8908 address entry = __ pc(); 8909 BLOCK_COMMENT("Entry:"); 8910 __ flt16_to_flt(v0, r0, v1); 8911 __ ret(lr); 8912 return entry; 8913 } 8914 8915 // v0 = input (float) 8916 // r0 = result (float16) 8917 // v1 = temporary float register 8918 address generate_floatToFloat16() { 8919 __ align(CodeEntryAlignment); 8920 StubId stub_id = StubId::stubgen_f2hf_id; 8921 StubCodeMark mark(this, stub_id); 8922 address entry = __ pc(); 8923 BLOCK_COMMENT("Entry:"); 8924 __ flt_to_flt16(r0, v0, v1); 8925 __ ret(lr); 8926 return entry; 8927 } 8928 8929 address generate_method_entry_barrier() { 8930 __ align(CodeEntryAlignment); 8931 StubId stub_id = StubId::stubgen_method_entry_barrier_id; 8932 StubCodeMark mark(this, stub_id); 8933 8934 Label deoptimize_label; 8935 8936 address start = __ pc(); 8937 8938 BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler(); 8939 8940 if (bs_asm->nmethod_patching_type() == NMethodPatchingType::conc_instruction_and_data_patch) { 8941 BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod(); 8942 // We can get here despite the nmethod being good, if we have not 8943 // yet applied our cross modification fence (or data fence). 
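// A rough sketch of the hand-shake below (illustrative only; field names
// are descriptive rather than the real identifiers):
//   thread->observed_patching_epoch = global_patching_epoch;  // ldrw + strw
//   instruction_synchronization_barrier();                    // isb
//   load_load_fence();                                        // membar(LoadLoad)
// i.e. publish which patching epoch this thread has observed, then make
// sure later instruction and data fetches cannot float above that point.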
8944 Address thread_epoch_addr(rthread, in_bytes(bs_nm->thread_disarmed_guard_value_offset()) + 4); 8945 __ lea(rscratch2, ExternalAddress(bs_asm->patching_epoch_addr())); 8946 __ ldrw(rscratch2, rscratch2); 8947 __ strw(rscratch2, thread_epoch_addr); 8948 __ isb(); 8949 __ membar(__ LoadLoad); 8950 } 8951 8952 __ set_last_Java_frame(sp, rfp, lr, rscratch1); 8953 8954 __ enter(); 8955 __ add(rscratch2, sp, wordSize); // rscratch2 points to the saved lr 8956 8957 __ sub(sp, sp, 4 * wordSize); // four words for the returned {sp, fp, lr, pc} 8958 8959 __ push_call_clobbered_registers(); 8960 8961 __ mov(c_rarg0, rscratch2); 8962 __ call_VM_leaf 8963 (CAST_FROM_FN_PTR 8964 (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1); 8965 8966 __ reset_last_Java_frame(true); 8967 8968 __ mov(rscratch1, r0); 8969 8970 __ pop_call_clobbered_registers(); 8971 8972 __ cbnz(rscratch1, deoptimize_label); 8973 8974 __ leave(); 8975 __ ret(lr); 8976 8977 __ BIND(deoptimize_label); 8978 8979 __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize)); 8980 __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize)); 8981 8982 __ mov(sp, rscratch1); 8983 __ br(rscratch2); 8984 8985 return start; 8986 } 8987 8988 // r0 = result 8989 // r1 = str1 8990 // r2 = cnt1 8991 // r3 = str2 8992 // r4 = cnt2 8993 // r10 = tmp1 8994 // r11 = tmp2 8995 address generate_compare_long_string_same_encoding(bool isLL) { 8996 __ align(CodeEntryAlignment); 8997 StubId stub_id = (isLL ? StubId::stubgen_compare_long_string_LL_id : StubId::stubgen_compare_long_string_UU_id); 8998 StubCodeMark mark(this, stub_id); 8999 address entry = __ pc(); 9000 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 9001 tmp1 = r10, tmp2 = r11, tmp1h = rscratch1, tmp2h = rscratch2; 9002 9003 Label LARGE_LOOP_PREFETCH, LOOP_COMPARE16, DIFF, LESS16, LESS8, CAL_DIFFERENCE, LENGTH_DIFF; 9004 9005 // exit from large loop when less than 64 bytes left to read or we're about 9006 // to prefetch memory behind array border 9007 int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2); 9008 9009 // before jumping to stub, pre-load 8 bytes already, so do comparison directly 9010 __ eor(rscratch2, tmp1, tmp2); 9011 __ cbnz(rscratch2, CAL_DIFFERENCE); 9012 9013 __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2)); 9014 // update pointers, because of previous read 9015 __ add(str1, str1, wordSize); 9016 __ add(str2, str2, wordSize); 9017 if (SoftwarePrefetchHintDistance >= 0) { 9018 __ align(OptoLoopAlignment); 9019 __ bind(LARGE_LOOP_PREFETCH); 9020 __ prfm(Address(str1, SoftwarePrefetchHintDistance)); 9021 __ prfm(Address(str2, SoftwarePrefetchHintDistance)); 9022 9023 for (int i = 0; i < 4; i++) { 9024 __ ldp(tmp1, tmp1h, Address(str1, i * 16)); 9025 __ ldp(tmp2, tmp2h, Address(str2, i * 16)); 9026 __ cmp(tmp1, tmp2); 9027 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ); 9028 __ br(Assembler::NE, DIFF); 9029 } 9030 __ sub(cnt2, cnt2, isLL ? 64 : 32); 9031 __ add(str1, str1, 64); 9032 __ add(str2, str2, 64); 9033 __ subs(rscratch2, cnt2, largeLoopExitCondition); 9034 __ br(Assembler::GE, LARGE_LOOP_PREFETCH); 9035 __ cbz(cnt2, LENGTH_DIFF); // no more chars left? 9036 } 9037 9038 __ subs(rscratch1, cnt2, isLL ? 
16 : 8); 9039 __ br(Assembler::LE, LESS16); 9040 __ align(OptoLoopAlignment); 9041 __ bind(LOOP_COMPARE16); 9042 __ ldp(tmp1, tmp1h, Address(__ post(str1, 16))); 9043 __ ldp(tmp2, tmp2h, Address(__ post(str2, 16))); 9044 __ cmp(tmp1, tmp2); 9045 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ); 9046 __ br(Assembler::NE, DIFF); 9047 __ sub(cnt2, cnt2, isLL ? 16 : 8); 9048 __ subs(rscratch2, cnt2, isLL ? 16 : 8); 9049 __ br(Assembler::LT, LESS16); 9050 9051 __ ldp(tmp1, tmp1h, Address(__ post(str1, 16))); 9052 __ ldp(tmp2, tmp2h, Address(__ post(str2, 16))); 9053 __ cmp(tmp1, tmp2); 9054 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ); 9055 __ br(Assembler::NE, DIFF); 9056 __ sub(cnt2, cnt2, isLL ? 16 : 8); 9057 __ subs(rscratch2, cnt2, isLL ? 16 : 8); 9058 __ br(Assembler::GE, LOOP_COMPARE16); 9059 __ cbz(cnt2, LENGTH_DIFF); 9060 9061 __ bind(LESS16); 9062 // each 8 compare 9063 __ subs(cnt2, cnt2, isLL ? 8 : 4); 9064 __ br(Assembler::LE, LESS8); 9065 __ ldr(tmp1, Address(__ post(str1, 8))); 9066 __ ldr(tmp2, Address(__ post(str2, 8))); 9067 __ eor(rscratch2, tmp1, tmp2); 9068 __ cbnz(rscratch2, CAL_DIFFERENCE); 9069 __ sub(cnt2, cnt2, isLL ? 8 : 4); 9070 9071 __ bind(LESS8); // directly load last 8 bytes 9072 if (!isLL) { 9073 __ add(cnt2, cnt2, cnt2); 9074 } 9075 __ ldr(tmp1, Address(str1, cnt2)); 9076 __ ldr(tmp2, Address(str2, cnt2)); 9077 __ eor(rscratch2, tmp1, tmp2); 9078 __ cbz(rscratch2, LENGTH_DIFF); 9079 __ b(CAL_DIFFERENCE); 9080 9081 __ bind(DIFF); 9082 __ cmp(tmp1, tmp2); 9083 __ csel(tmp1, tmp1, tmp1h, Assembler::NE); 9084 __ csel(tmp2, tmp2, tmp2h, Assembler::NE); 9085 // reuse rscratch2 register for the result of eor instruction 9086 __ eor(rscratch2, tmp1, tmp2); 9087 9088 __ bind(CAL_DIFFERENCE); 9089 __ rev(rscratch2, rscratch2); 9090 __ clz(rscratch2, rscratch2); 9091 __ andr(rscratch2, rscratch2, isLL ? -8 : -16); 9092 __ lsrv(tmp1, tmp1, rscratch2); 9093 __ lsrv(tmp2, tmp2, rscratch2); 9094 if (isLL) { 9095 __ uxtbw(tmp1, tmp1); 9096 __ uxtbw(tmp2, tmp2); 9097 } else { 9098 __ uxthw(tmp1, tmp1); 9099 __ uxthw(tmp2, tmp2); 9100 } 9101 __ subw(result, tmp1, tmp2); 9102 9103 __ bind(LENGTH_DIFF); 9104 __ ret(lr); 9105 return entry; 9106 } 9107 9108 enum string_compare_mode { 9109 LL, 9110 LU, 9111 UL, 9112 UU, 9113 }; 9114 9115 // The following registers are declared in aarch64.ad 9116 // r0 = result 9117 // r1 = str1 9118 // r2 = cnt1 9119 // r3 = str2 9120 // r4 = cnt2 9121 // r10 = tmp1 9122 // r11 = tmp2 9123 // z0 = ztmp1 9124 // z1 = ztmp2 9125 // p0 = pgtmp1 9126 // p1 = pgtmp2 9127 address generate_compare_long_string_sve(string_compare_mode mode) { 9128 StubId stub_id; 9129 switch (mode) { 9130 case LL: stub_id = StubId::stubgen_compare_long_string_LL_id; break; 9131 case LU: stub_id = StubId::stubgen_compare_long_string_LU_id; break; 9132 case UL: stub_id = StubId::stubgen_compare_long_string_UL_id; break; 9133 case UU: stub_id = StubId::stubgen_compare_long_string_UU_id; break; 9134 default: ShouldNotReachHere(); 9135 } 9136 9137 __ align(CodeEntryAlignment); 9138 address entry = __ pc(); 9139 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 9140 tmp1 = r10, tmp2 = r11; 9141 9142 Label LOOP, DONE, MISMATCH; 9143 Register vec_len = tmp1; 9144 Register idx = tmp2; 9145 // The minimum of the string lengths has been stored in cnt2. 
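// For reference, a minimal sketch of the predicated SVE loop generated
// below (illustrative only):
//   idx = 0; pg = whilelt(idx, cnt);                 // predicate covers valid lanes
//   do {
//     z1 = load(str1 + idx, pg); z2 = load(str2 + idx, pg);
//     if (any_ne(z1, z2, pg)) goto MISMATCH;
//     idx += vec_len;
//   } while (idx < cnt - vec_len);
//   pg = whilelt(idx, cnt);                          // final, possibly partial, iteration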
9146 Register cnt = cnt2; 9147 FloatRegister ztmp1 = z0, ztmp2 = z1; 9148 PRegister pgtmp1 = p0, pgtmp2 = p1; 9149 9150 #define LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx) \ 9151 switch (mode) { \ 9152 case LL: \ 9153 __ sve_ld1b(ztmp1, __ B, pgtmp1, Address(str1, idx)); \ 9154 __ sve_ld1b(ztmp2, __ B, pgtmp1, Address(str2, idx)); \ 9155 break; \ 9156 case LU: \ 9157 __ sve_ld1b(ztmp1, __ H, pgtmp1, Address(str1, idx)); \ 9158 __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \ 9159 break; \ 9160 case UL: \ 9161 __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \ 9162 __ sve_ld1b(ztmp2, __ H, pgtmp1, Address(str2, idx)); \ 9163 break; \ 9164 case UU: \ 9165 __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \ 9166 __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \ 9167 break; \ 9168 default: \ 9169 ShouldNotReachHere(); \ 9170 } 9171 9172 StubCodeMark mark(this, stub_id); 9173 9174 __ mov(idx, 0); 9175 __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt); 9176 9177 if (mode == LL) { 9178 __ sve_cntb(vec_len); 9179 } else { 9180 __ sve_cnth(vec_len); 9181 } 9182 9183 __ sub(rscratch1, cnt, vec_len); 9184 9185 __ bind(LOOP); 9186 9187 // main loop 9188 LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx); 9189 __ add(idx, idx, vec_len); 9190 // Compare strings. 9191 __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2); 9192 __ br(__ NE, MISMATCH); 9193 __ cmp(idx, rscratch1); 9194 __ br(__ LT, LOOP); 9195 9196 // post loop, last iteration 9197 __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt); 9198 9199 LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx); 9200 __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2); 9201 __ br(__ EQ, DONE); 9202 9203 __ bind(MISMATCH); 9204 9205 // Crop the vector to find its location. 9206 __ sve_brkb(pgtmp2, pgtmp1, pgtmp2, false /* isMerge */); 9207 // Extract the first different characters of each string. 9208 __ sve_lasta(rscratch1, mode == LL ? __ B : __ H, pgtmp2, ztmp1); 9209 __ sve_lasta(rscratch2, mode == LL ? __ B : __ H, pgtmp2, ztmp2); 9210 9211 // Compute the difference of the first different characters. 
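// (At this point rscratch1/rscratch2 hold the first pair of characters that
// differ, extracted by sve_lasta above, so the stub returns their signed
// difference -- the same first-mismatch convention the non-SVE
// compare_long_string stubs implement in their CALCULATE_DIFFERENCE /
// CAL_DIFFERENCE paths.)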
9212 __ sub(result, rscratch1, rscratch2); 9213 9214 __ bind(DONE); 9215 __ ret(lr); 9216 #undef LOAD_PAIR 9217 return entry; 9218 }
9219 9220 void generate_compare_long_strings() { 9221 if (UseSVE == 0) { 9222 StubRoutines::aarch64::_compare_long_string_LL 9223 = generate_compare_long_string_same_encoding(true); 9224 StubRoutines::aarch64::_compare_long_string_UU 9225 = generate_compare_long_string_same_encoding(false); 9226 StubRoutines::aarch64::_compare_long_string_LU 9227 = generate_compare_long_string_different_encoding(true); 9228 StubRoutines::aarch64::_compare_long_string_UL 9229 = generate_compare_long_string_different_encoding(false); 9230 } else { 9231 StubRoutines::aarch64::_compare_long_string_LL 9232 = generate_compare_long_string_sve(LL); 9233 StubRoutines::aarch64::_compare_long_string_UU 9234 = generate_compare_long_string_sve(UU); 9235 StubRoutines::aarch64::_compare_long_string_LU 9236 = generate_compare_long_string_sve(LU); 9237 StubRoutines::aarch64::_compare_long_string_UL 9238 = generate_compare_long_string_sve(UL); 9239 } 9240 }
9241 9242 // R0 = result 9243 // R1 = str2 9244 // R2 = cnt1 9245 // R3 = str1 9246 // R4 = cnt2 9247 // Clobbers: rscratch1, rscratch2, v0, v1, rflags 9248 //
9249 // This generic linear code uses a few additional ideas which make it faster: 9250 // 1) we can safely keep at least the 1st register of the pattern (since length >= 8) 9251 // in order to skip the initial load (helps on systems with a single load pipeline) 9252 // 2) we can use a "fast" algorithm for finding a single character, searching for the 9253 // first symbol with fewer branches (1 branch per loaded register instead 9254 // of a branch per symbol); this is where constants like 9255 // 0x0101...01, 0x00010001...0001, 0x7f7f...7f, 0x7fff7fff...7fff come from 9256 // 3) after loading and analyzing the 1st register of the source string, it can be 9257 // reused to search for every 1st-character match, saving a few loads compared 9258 // with a simpler-but-slower implementation 9259 // 4) in order to avoid lots of push/pop operations, the code below heavily 9260 // reuses/re-initializes/compresses register values, which makes the code 9261 // larger and a bit less readable; however, most of the extra operations are 9262 // issued during loads or branches, so the penalty is minimal
9263 address generate_string_indexof_linear(bool str1_isL, bool str2_isL) { 9264 StubId stub_id; 9265 if (str1_isL) { 9266 if (str2_isL) { 9267 stub_id = StubId::stubgen_string_indexof_linear_ll_id; 9268 } else { 9269 stub_id = StubId::stubgen_string_indexof_linear_ul_id; 9270 } 9271 } else { 9272 if (str2_isL) { 9273 ShouldNotReachHere(); 9274 } else { 9275 stub_id = StubId::stubgen_string_indexof_linear_uu_id; 9276 } 9277 } 9278 __ align(CodeEntryAlignment); 9279 StubCodeMark mark(this, stub_id); 9280 address entry = __ pc(); 9281 9282 int str1_chr_size = str1_isL ? 1 : 2; 9283 int str2_chr_size = str2_isL ? 1 : 2; 9284 int str1_chr_shift = str1_isL ? 0 : 1; 9285 int str2_chr_shift = str2_isL ?
0 : 1; 9286 bool isL = str1_isL && str2_isL; 9287 // parameters 9288 Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4; 9289 // temporary registers 9290 Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23; 9291 RegSet spilled_regs = RegSet::range(tmp1, tmp4); 9292 // redefinitions 9293 Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3; 9294 9295 __ push(spilled_regs, sp); 9296 Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO, 9297 L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED, 9298 L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP, 9299 L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH, 9300 L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2, 9301 L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH; 9302 // Read whole register from str1. It is safe, because length >=8 here 9303 __ ldr(ch1, Address(str1)); 9304 // Read whole register from str2. It is safe, because length >=8 here 9305 __ ldr(ch2, Address(str2)); 9306 __ sub(cnt2, cnt2, cnt1); 9307 __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF); 9308 if (str1_isL != str2_isL) { 9309 __ eor(v0, __ T16B, v0, v0); 9310 } 9311 __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001); 9312 __ mul(first, first, tmp1); 9313 // check if we have less than 1 register to check 9314 __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1); 9315 if (str1_isL != str2_isL) { 9316 __ fmovd(v1, ch1); 9317 } 9318 __ br(__ LE, L_SMALL); 9319 __ eor(ch2, first, ch2); 9320 if (str1_isL != str2_isL) { 9321 __ zip1(v1, __ T16B, v1, v0); 9322 } 9323 __ sub(tmp2, ch2, tmp1); 9324 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 9325 __ bics(tmp2, tmp2, ch2); 9326 if (str1_isL != str2_isL) { 9327 __ fmovd(ch1, v1); 9328 } 9329 __ br(__ NE, L_HAS_ZERO); 9330 __ subs(cnt2, cnt2, wordSize/str2_chr_size); 9331 __ add(result, result, wordSize/str2_chr_size); 9332 __ add(str2, str2, wordSize); 9333 __ br(__ LT, L_POST_LOOP); 9334 __ BIND(L_LOOP); 9335 __ ldr(ch2, Address(str2)); 9336 __ eor(ch2, first, ch2); 9337 __ sub(tmp2, ch2, tmp1); 9338 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 9339 __ bics(tmp2, tmp2, ch2); 9340 __ br(__ NE, L_HAS_ZERO); 9341 __ BIND(L_LOOP_PROCEED); 9342 __ subs(cnt2, cnt2, wordSize/str2_chr_size); 9343 __ add(str2, str2, wordSize); 9344 __ add(result, result, wordSize/str2_chr_size); 9345 __ br(__ GE, L_LOOP); 9346 __ BIND(L_POST_LOOP); 9347 __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check 9348 __ br(__ LE, NOMATCH); 9349 __ ldr(ch2, Address(str2)); 9350 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift); 9351 __ eor(ch2, first, ch2); 9352 __ sub(tmp2, ch2, tmp1); 9353 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 9354 __ mov(tmp4, -1); // all bits set 9355 __ b(L_SMALL_PROCEED); 9356 __ align(OptoLoopAlignment); 9357 __ BIND(L_SMALL); 9358 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift); 9359 __ eor(ch2, first, ch2); 9360 if (str1_isL != str2_isL) { 9361 __ zip1(v1, __ T16B, v1, v0); 9362 } 9363 __ sub(tmp2, ch2, tmp1); 9364 __ mov(tmp4, -1); // all bits set 9365 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 9366 if (str1_isL != str2_isL) { 9367 __ fmovd(ch1, v1); // move converted 4 symbols 9368 } 9369 __ BIND(L_SMALL_PROCEED); 9370 __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits. 
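// For reference, a sketch of the SWAR test evaluated here (illustrative
// only; Latin1 constants shown, the UTF-16 variant uses 0x0001...0001 and
// 0x7fff...7fff in the same way):
//   x   = str2_word ^ (first_char * 0x0101010101010101);   // matching byte becomes 0x00
//   hit = (x - 0x0101010101010101) & ~(x | 0x7f7f7f7f7f7f7f7f);
// 'hit' has 0x80 set exactly in the bytes whose character equals the first
// pattern character; the tmp4 mask computed above is then ANDed in to clear
// the marker bits that lie beyond the end of the string.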
9371 __ bic(tmp2, tmp2, ch2); 9372 __ ands(tmp2, tmp2, tmp4); // clear useless bits and check 9373 __ rbit(tmp2, tmp2); 9374 __ br(__ EQ, NOMATCH); 9375 __ BIND(L_SMALL_HAS_ZERO_LOOP); 9376 __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some cpu's 9377 __ cmp(cnt1, u1(wordSize/str2_chr_size)); 9378 __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2); 9379 if (str2_isL) { // LL 9380 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index" 9381 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe. 9382 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info 9383 __ add(result, result, tmp4, __ LSR, LogBitsPerByte); 9384 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 9385 } else { 9386 __ mov(ch2, 0xE); // all bits in byte set except last one 9387 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 9388 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 9389 __ lslv(tmp2, tmp2, tmp4); 9390 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 9391 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 9392 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 9393 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 9394 } 9395 __ cmp(ch1, ch2); 9396 __ mov(tmp4, wordSize/str2_chr_size); 9397 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 9398 __ BIND(L_SMALL_CMP_LOOP); 9399 str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift))) 9400 : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift))); 9401 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))) 9402 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))); 9403 __ add(tmp4, tmp4, 1); 9404 __ cmp(tmp4, cnt1); 9405 __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP); 9406 __ cmp(first, ch2); 9407 __ br(__ EQ, L_SMALL_CMP_LOOP); 9408 __ BIND(L_SMALL_CMP_LOOP_NOMATCH); 9409 __ cbz(tmp2, NOMATCH); // no more matches. exit 9410 __ clz(tmp4, tmp2); 9411 __ add(result, result, 1); // advance index 9412 __ add(str2, str2, str2_chr_size); // advance pointer 9413 __ b(L_SMALL_HAS_ZERO_LOOP); 9414 __ align(OptoLoopAlignment); 9415 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP); 9416 __ cmp(first, ch2); 9417 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 9418 __ b(DONE); 9419 __ align(OptoLoopAlignment); 9420 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2); 9421 if (str2_isL) { // LL 9422 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index" 9423 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe. 9424 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info 9425 __ add(result, result, tmp4, __ LSR, LogBitsPerByte); 9426 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 9427 } else { 9428 __ mov(ch2, 0xE); // all bits in byte set except last one 9429 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 9430 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 9431 __ lslv(tmp2, tmp2, tmp4); 9432 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 9433 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 9434 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 9435 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 9436 } 9437 __ cmp(ch1, ch2); 9438 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 9439 __ b(DONE); 9440 __ align(OptoLoopAlignment); 9441 __ BIND(L_HAS_ZERO); 9442 __ rbit(tmp2, tmp2); 9443 __ clz(tmp4, tmp2); // potentially long. 
Up to 4 cycles on some CPU's 9444 // Now, perform compression of counters(cnt2 and cnt1) into one register. 9445 // It's fine because both counters are 32bit and are not changed in this 9446 // loop. Just restore it on exit. So, cnt1 can be re-used in this loop. 9447 __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2); 9448 __ sub(result, result, 1); 9449 __ BIND(L_HAS_ZERO_LOOP); 9450 __ mov(cnt1, wordSize/str2_chr_size); 9451 __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2); 9452 __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare 9453 if (str2_isL) { 9454 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index 9455 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 9456 __ lslv(tmp2, tmp2, tmp4); 9457 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 9458 __ add(tmp4, tmp4, 1); 9459 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 9460 __ lsl(tmp2, tmp2, 1); 9461 __ mov(tmp4, wordSize/str2_chr_size); 9462 } else { 9463 __ mov(ch2, 0xE); 9464 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 9465 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 9466 __ lslv(tmp2, tmp2, tmp4); 9467 __ add(tmp4, tmp4, 1); 9468 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 9469 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); 9470 __ lsl(tmp2, tmp2, 1); 9471 __ mov(tmp4, wordSize/str2_chr_size); 9472 __ sub(str2, str2, str2_chr_size); 9473 } 9474 __ cmp(ch1, ch2); 9475 __ mov(tmp4, wordSize/str2_chr_size); 9476 __ br(__ NE, L_CMP_LOOP_NOMATCH); 9477 __ BIND(L_CMP_LOOP); 9478 str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift))) 9479 : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift))); 9480 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))) 9481 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))); 9482 __ add(tmp4, tmp4, 1); 9483 __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2); 9484 __ br(__ GE, L_CMP_LOOP_LAST_CMP); 9485 __ cmp(cnt1, ch2); 9486 __ br(__ EQ, L_CMP_LOOP); 9487 __ BIND(L_CMP_LOOP_NOMATCH); 9488 // here we're not matched 9489 __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop 9490 __ clz(tmp4, tmp2); 9491 __ add(str2, str2, str2_chr_size); // advance pointer 9492 __ b(L_HAS_ZERO_LOOP); 9493 __ align(OptoLoopAlignment); 9494 __ BIND(L_CMP_LOOP_LAST_CMP); 9495 __ cmp(cnt1, ch2); 9496 __ br(__ NE, L_CMP_LOOP_NOMATCH); 9497 __ b(DONE); 9498 __ align(OptoLoopAlignment); 9499 __ BIND(L_CMP_LOOP_LAST_CMP2); 9500 if (str2_isL) { 9501 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index 9502 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 9503 __ lslv(tmp2, tmp2, tmp4); 9504 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 9505 __ add(tmp4, tmp4, 1); 9506 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 9507 __ lsl(tmp2, tmp2, 1); 9508 } else { 9509 __ mov(ch2, 0xE); 9510 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 9511 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 
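// (Illustrative: tmp4 is the bit index of the lowest match marker, so
// tmp4 >> LogBitsPerByte is its byte offset within the scanned word. For
// UTF-16 the marker sits in the high byte of the character, making that
// offset odd; masking with 0xE rounds it down to the 2-byte character
// boundary, roughly ch2 = (tmp4 >> 3) & ~1.)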
9512 __ lslv(tmp2, tmp2, tmp4); 9513 __ add(tmp4, tmp4, 1); 9514 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 9515 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); 9516 __ lsl(tmp2, tmp2, 1); 9517 __ sub(str2, str2, str2_chr_size); 9518 } 9519 __ cmp(ch1, ch2); 9520 __ br(__ NE, L_CMP_LOOP_NOMATCH); 9521 __ b(DONE); 9522 __ align(OptoLoopAlignment); 9523 __ BIND(L_HAS_ZERO_LOOP_NOMATCH); 9524 // 1) Restore "result" index. Index was wordSize/str2_chr_size * N until 9525 // L_HAS_ZERO block. Byte octet was analyzed in L_HAS_ZERO_LOOP, 9526 // so, result was increased at max by wordSize/str2_chr_size - 1, so, 9527 // respective high bit wasn't changed. L_LOOP_PROCEED will increase 9528 // result by analyzed characters value, so, we can just reset lower bits 9529 // in result here. Clear 2 lower bits for UU/UL and 3 bits for LL 9530 // 2) restore cnt1 and cnt2 values from "compressed" cnt2 9531 // 3) advance str2 value to represent next str2 octet. result & 7/3 is 9532 // index of last analyzed substring inside current octet. So, str2 in at 9533 // respective start address. We need to advance it to next octet 9534 __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed 9535 __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2); 9536 __ bfm(result, zr, 0, 2 - str2_chr_shift); 9537 __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2 9538 __ movw(cnt2, cnt2); 9539 __ b(L_LOOP_PROCEED); 9540 __ align(OptoLoopAlignment); 9541 __ BIND(NOMATCH); 9542 __ mov(result, -1); 9543 __ BIND(DONE); 9544 __ pop(spilled_regs, sp); 9545 __ ret(lr); 9546 return entry; 9547 } 9548 9549 void generate_string_indexof_stubs() { 9550 StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true); 9551 StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false); 9552 StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false); 9553 } 9554 9555 void inflate_and_store_2_fp_registers(bool generatePrfm, 9556 FloatRegister src1, FloatRegister src2) { 9557 Register dst = r1; 9558 __ zip1(v1, __ T16B, src1, v0); 9559 __ zip2(v2, __ T16B, src1, v0); 9560 if (generatePrfm) { 9561 __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM); 9562 } 9563 __ zip1(v3, __ T16B, src2, v0); 9564 __ zip2(v4, __ T16B, src2, v0); 9565 __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64))); 9566 } 9567 9568 // R0 = src 9569 // R1 = dst 9570 // R2 = len 9571 // R3 = len >> 3 9572 // V0 = 0 9573 // v1 = loaded 8 bytes 9574 // Clobbers: r0, r1, r3, rscratch1, rflags, v0-v6 9575 address generate_large_byte_array_inflate() { 9576 __ align(CodeEntryAlignment); 9577 StubId stub_id = StubId::stubgen_large_byte_array_inflate_id; 9578 StubCodeMark mark(this, stub_id); 9579 address entry = __ pc(); 9580 Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE; 9581 Register src = r0, dst = r1, len = r2, octetCounter = r3; 9582 const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4; 9583 9584 // do one more 8-byte read to have address 16-byte aligned in most cases 9585 // also use single store instruction 9586 __ ldrd(v2, __ post(src, 8)); 9587 __ sub(octetCounter, octetCounter, 2); 9588 __ zip1(v1, __ T16B, v1, v0); 9589 __ zip1(v2, __ T16B, v2, v0); 9590 __ st1(v1, v2, __ T16B, __ post(dst, 32)); 9591 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 9592 __ subs(rscratch1, octetCounter, large_loop_threshold); 9593 __ br(__ LE, LOOP_START); 9594 __ 
b(LOOP_PRFM_START); 9595 __ bind(LOOP_PRFM); 9596 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 9597 __ bind(LOOP_PRFM_START); 9598 __ prfm(Address(src, SoftwarePrefetchHintDistance)); 9599 __ sub(octetCounter, octetCounter, 8); 9600 __ subs(rscratch1, octetCounter, large_loop_threshold); 9601 inflate_and_store_2_fp_registers(true, v3, v4); 9602 inflate_and_store_2_fp_registers(true, v5, v6); 9603 __ br(__ GT, LOOP_PRFM); 9604 __ cmp(octetCounter, (u1)8); 9605 __ br(__ LT, DONE); 9606 __ bind(LOOP); 9607 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 9608 __ bind(LOOP_START); 9609 __ sub(octetCounter, octetCounter, 8); 9610 __ cmp(octetCounter, (u1)8); 9611 inflate_and_store_2_fp_registers(false, v3, v4); 9612 inflate_and_store_2_fp_registers(false, v5, v6); 9613 __ br(__ GE, LOOP); 9614 __ bind(DONE); 9615 __ ret(lr); 9616 return entry; 9617 } 9618 9619 /** 9620 * Arguments: 9621 * 9622 * Input: 9623 * c_rarg0 - current state address 9624 * c_rarg1 - H key address 9625 * c_rarg2 - data address 9626 * c_rarg3 - number of blocks 9627 * 9628 * Output: 9629 * Updated state at c_rarg0 9630 */ 9631 address generate_ghash_processBlocks() { 9632 // Bafflingly, GCM uses little-endian for the byte order, but 9633 // big-endian for the bit order. For example, the polynomial 1 is 9634 // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00. 9635 // 9636 // So, we must either reverse the bytes in each word and do 9637 // everything big-endian or reverse the bits in each byte and do 9638 // it little-endian. On AArch64 it's more idiomatic to reverse 9639 // the bits in each byte (we have an instruction, RBIT, to do 9640 // that) and keep the data in little-endian bit order through the 9641 // calculation, bit-reversing the inputs and outputs. 9642 9643 StubId stub_id = StubId::stubgen_ghash_processBlocks_id; 9644 StubCodeMark mark(this, stub_id); 9645 __ align(wordSize * 2); 9646 address p = __ pc(); 9647 __ emit_int64(0x87); // The low-order bits of the field 9648 // polynomial (i.e. 
p = z^7+z^2+z+1) 9649 // repeated in the low and high parts of a 9650 // 128-bit vector 9651 __ emit_int64(0x87); 9652 9653 __ align(CodeEntryAlignment); 9654 address start = __ pc(); 9655 9656 Register state = c_rarg0; 9657 Register subkeyH = c_rarg1; 9658 Register data = c_rarg2; 9659 Register blocks = c_rarg3; 9660 9661 FloatRegister vzr = v30; 9662 __ eor(vzr, __ T16B, vzr, vzr); // zero register 9663 9664 __ ldrq(v24, p); // The field polynomial 9665 9666 __ ldrq(v0, Address(state)); 9667 __ ldrq(v1, Address(subkeyH)); 9668 9669 __ rev64(v0, __ T16B, v0); // Bit-reverse words in state and subkeyH 9670 __ rbit(v0, __ T16B, v0); 9671 __ rev64(v1, __ T16B, v1); 9672 __ rbit(v1, __ T16B, v1); 9673 9674 __ ext(v4, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1 9675 __ eor(v4, __ T16B, v4, v1); // xor subkeyH into subkeyL (Karatsuba: (A1+A0)) 9676 9677 { 9678 Label L_ghash_loop; 9679 __ bind(L_ghash_loop); 9680 9681 __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit 9682 // reversing each byte 9683 __ rbit(v2, __ T16B, v2); 9684 __ eor(v2, __ T16B, v0, v2); // bit-swapped data ^ bit-swapped state 9685 9686 // Multiply state in v2 by subkey in v1 9687 __ ghash_multiply(/*result_lo*/v5, /*result_hi*/v7, 9688 /*a*/v1, /*b*/v2, /*a1_xor_a0*/v4, 9689 /*temps*/v6, v3, /*reuse/clobber b*/v2); 9690 // Reduce v7:v5 by the field polynomial 9691 __ ghash_reduce(/*result*/v0, /*lo*/v5, /*hi*/v7, /*p*/v24, vzr, /*temp*/v3); 9692 9693 __ sub(blocks, blocks, 1); 9694 __ cbnz(blocks, L_ghash_loop); 9695 } 9696 9697 // The bit-reversed result is at this point in v0 9698 __ rev64(v0, __ T16B, v0); 9699 __ rbit(v0, __ T16B, v0); 9700 9701 __ st1(v0, __ T16B, state); 9702 __ ret(lr); 9703 9704 return start; 9705 } 9706 9707 address generate_ghash_processBlocks_wide() { 9708 address small = generate_ghash_processBlocks(); 9709 9710 StubId stub_id = StubId::stubgen_ghash_processBlocks_wide_id; 9711 StubCodeMark mark(this, stub_id); 9712 __ align(wordSize * 2); 9713 address p = __ pc(); 9714 __ emit_int64(0x87); // The low-order bits of the field 9715 // polynomial (i.e. p = z^7+z^2+z+1) 9716 // repeated in the low and high parts of a 9717 // 128-bit vector 9718 __ emit_int64(0x87); 9719 9720 __ align(CodeEntryAlignment); 9721 address start = __ pc(); 9722 9723 Register state = c_rarg0; 9724 Register subkeyH = c_rarg1; 9725 Register data = c_rarg2; 9726 Register blocks = c_rarg3; 9727 9728 const int unroll = 4; 9729 9730 __ cmp(blocks, (unsigned char)(unroll * 2)); 9731 __ br(__ LT, small); 9732 9733 if (unroll > 1) { 9734 // Save state before entering routine 9735 __ sub(sp, sp, 4 * 16); 9736 __ st1(v12, v13, v14, v15, __ T16B, Address(sp)); 9737 __ sub(sp, sp, 4 * 16); 9738 __ st1(v8, v9, v10, v11, __ T16B, Address(sp)); 9739 } 9740 9741 __ ghash_processBlocks_wide(p, state, subkeyH, data, blocks, unroll); 9742 9743 if (unroll > 1) { 9744 // And restore state 9745 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16)); 9746 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16)); 9747 } 9748 9749 __ cmp(blocks, (unsigned char)0); 9750 __ br(__ GT, small); 9751 9752 __ ret(lr); 9753 9754 return start; 9755 } 9756 9757 void generate_base64_encode_simdround(Register src, Register dst, 9758 FloatRegister codec, u8 size) { 9759 9760 FloatRegister in0 = v4, in1 = v5, in2 = v6; 9761 FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19; 9762 FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23; 9763 9764 Assembler::SIMD_Arrangement arrangement = size == 16 ? 
__ T16B : __ T8B; 9765 9766 __ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size)); 9767 9768 __ ushr(ind0, arrangement, in0, 2); 9769 9770 __ ushr(ind1, arrangement, in1, 2); 9771 __ shl(in0, arrangement, in0, 6); 9772 __ orr(ind1, arrangement, ind1, in0); 9773 __ ushr(ind1, arrangement, ind1, 2); 9774 9775 __ ushr(ind2, arrangement, in2, 4); 9776 __ shl(in1, arrangement, in1, 4); 9777 __ orr(ind2, arrangement, in1, ind2); 9778 __ ushr(ind2, arrangement, ind2, 2); 9779 9780 __ shl(ind3, arrangement, in2, 2); 9781 __ ushr(ind3, arrangement, ind3, 2); 9782 9783 __ tbl(out0, arrangement, codec, 4, ind0); 9784 __ tbl(out1, arrangement, codec, 4, ind1); 9785 __ tbl(out2, arrangement, codec, 4, ind2); 9786 __ tbl(out3, arrangement, codec, 4, ind3); 9787 9788 __ st4(out0, out1, out2, out3, arrangement, __ post(dst, 4 * size)); 9789 } 9790 9791 /** 9792 * Arguments: 9793 * 9794 * Input: 9795 * c_rarg0 - src_start 9796 * c_rarg1 - src_offset 9797 * c_rarg2 - src_length 9798 * c_rarg3 - dest_start 9799 * c_rarg4 - dest_offset 9800 * c_rarg5 - isURL 9801 * 9802 */ 9803 address generate_base64_encodeBlock() { 9804 9805 static const char toBase64[64] = { 9806 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 9807 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 9808 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 9809 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 9810 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/' 9811 }; 9812 9813 static const char toBase64URL[64] = { 9814 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 9815 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 9816 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 9817 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 9818 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_' 9819 }; 9820 9821 __ align(CodeEntryAlignment); 9822 StubId stub_id = StubId::stubgen_base64_encodeBlock_id; 9823 StubCodeMark mark(this, stub_id); 9824 address start = __ pc(); 9825 9826 Register src = c_rarg0; // source array 9827 Register soff = c_rarg1; // source start offset 9828 Register send = c_rarg2; // source end offset 9829 Register dst = c_rarg3; // dest array 9830 Register doff = c_rarg4; // position for writing to dest array 9831 Register isURL = c_rarg5; // Base64 or URL character set 9832 9833 // c_rarg6 and c_rarg7 are free to use as temps 9834 Register codec = c_rarg6; 9835 Register length = c_rarg7; 9836 9837 Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit; 9838 9839 __ add(src, src, soff); 9840 __ add(dst, dst, doff); 9841 __ sub(length, send, soff); 9842 9843 // load the codec base address 9844 __ lea(codec, ExternalAddress((address) toBase64)); 9845 __ cbz(isURL, ProcessData); 9846 __ lea(codec, ExternalAddress((address) toBase64URL)); 9847 9848 __ BIND(ProcessData); 9849 9850 // too short to formup a SIMD loop, roll back 9851 __ cmp(length, (u1)24); 9852 __ br(Assembler::LT, Process3B); 9853 9854 __ ld1(v0, v1, v2, v3, __ T16B, Address(codec)); 9855 9856 __ BIND(Process48B); 9857 __ cmp(length, (u1)48); 9858 __ br(Assembler::LT, Process24B); 9859 generate_base64_encode_simdround(src, dst, v0, 16); 9860 __ sub(length, length, 48); 9861 __ b(Process48B); 9862 9863 __ BIND(Process24B); 9864 __ cmp(length, (u1)24); 9865 __ br(Assembler::LT, SIMDExit); 9866 generate_base64_encode_simdround(src, dst, v0, 8); 9867 __ sub(length, length, 24); 9868 9869 __ BIND(SIMDExit); 9870 __ 
cbz(length, Exit); 9871 9872 __ BIND(Process3B); 9873 // 3 src bytes, 24 bits 9874 __ ldrb(r10, __ post(src, 1)); 9875 __ ldrb(r11, __ post(src, 1)); 9876 __ ldrb(r12, __ post(src, 1)); 9877 __ orrw(r11, r11, r10, Assembler::LSL, 8); 9878 __ orrw(r12, r12, r11, Assembler::LSL, 8); 9879 // codec index 9880 __ ubfmw(r15, r12, 18, 23); 9881 __ ubfmw(r14, r12, 12, 17); 9882 __ ubfmw(r13, r12, 6, 11); 9883 __ andw(r12, r12, 63); 9884 // get the code based on the codec 9885 __ ldrb(r15, Address(codec, r15, Address::uxtw(0))); 9886 __ ldrb(r14, Address(codec, r14, Address::uxtw(0))); 9887 __ ldrb(r13, Address(codec, r13, Address::uxtw(0))); 9888 __ ldrb(r12, Address(codec, r12, Address::uxtw(0))); 9889 __ strb(r15, __ post(dst, 1)); 9890 __ strb(r14, __ post(dst, 1)); 9891 __ strb(r13, __ post(dst, 1)); 9892 __ strb(r12, __ post(dst, 1)); 9893 __ sub(length, length, 3); 9894 __ cbnz(length, Process3B); 9895 9896 __ BIND(Exit); 9897 __ ret(lr); 9898 9899 return start; 9900 } 9901 9902 void generate_base64_decode_simdround(Register src, Register dst, 9903 FloatRegister codecL, FloatRegister codecH, int size, Label& Exit) { 9904 9905 FloatRegister in0 = v16, in1 = v17, in2 = v18, in3 = v19; 9906 FloatRegister out0 = v20, out1 = v21, out2 = v22; 9907 9908 FloatRegister decL0 = v23, decL1 = v24, decL2 = v25, decL3 = v26; 9909 FloatRegister decH0 = v28, decH1 = v29, decH2 = v30, decH3 = v31; 9910 9911 Label NoIllegalData, ErrorInLowerHalf, StoreLegalData; 9912 9913 Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B; 9914 9915 __ ld4(in0, in1, in2, in3, arrangement, __ post(src, 4 * size)); 9916 9917 // we need unsigned saturating subtract, to make sure all input values 9918 // in range [0, 63] will have 0U value in the higher half lookup 9919 __ uqsubv(decH0, __ T16B, in0, v27); 9920 __ uqsubv(decH1, __ T16B, in1, v27); 9921 __ uqsubv(decH2, __ T16B, in2, v27); 9922 __ uqsubv(decH3, __ T16B, in3, v27); 9923 9924 // lower half lookup 9925 __ tbl(decL0, arrangement, codecL, 4, in0); 9926 __ tbl(decL1, arrangement, codecL, 4, in1); 9927 __ tbl(decL2, arrangement, codecL, 4, in2); 9928 __ tbl(decL3, arrangement, codecL, 4, in3); 9929 9930 // higher half lookup 9931 __ tbx(decH0, arrangement, codecH, 4, decH0); 9932 __ tbx(decH1, arrangement, codecH, 4, decH1); 9933 __ tbx(decH2, arrangement, codecH, 4, decH2); 9934 __ tbx(decH3, arrangement, codecH, 4, decH3); 9935 9936 // combine lower and higher 9937 __ orr(decL0, arrangement, decL0, decH0); 9938 __ orr(decL1, arrangement, decL1, decH1); 9939 __ orr(decL2, arrangement, decL2, decH2); 9940 __ orr(decL3, arrangement, decL3, decH3); 9941 9942 // check illegal inputs, value larger than 63 (maximum of 6 bits) 9943 __ cm(Assembler::HI, decH0, arrangement, decL0, v27); 9944 __ cm(Assembler::HI, decH1, arrangement, decL1, v27); 9945 __ cm(Assembler::HI, decH2, arrangement, decL2, v27); 9946 __ cm(Assembler::HI, decH3, arrangement, decL3, v27); 9947 __ orr(in0, arrangement, decH0, decH1); 9948 __ orr(in1, arrangement, decH2, decH3); 9949 __ orr(in2, arrangement, in0, in1); 9950 __ umaxv(in3, arrangement, in2); 9951 __ umov(rscratch2, in3, __ B, 0); 9952 9953 // get the data to output 9954 __ shl(out0, arrangement, decL0, 2); 9955 __ ushr(out1, arrangement, decL1, 4); 9956 __ orr(out0, arrangement, out0, out1); 9957 __ shl(out1, arrangement, decL1, 4); 9958 __ ushr(out2, arrangement, decL2, 2); 9959 __ orr(out1, arrangement, out1, out2); 9960 __ shl(out2, arrangement, decL2, 6); 9961 __ orr(out2, arrangement, out2, decL3); 9962 9963 __ 
cbz(rscratch2, NoIllegalData); 9964 9965 // handle illegal input 9966 __ umov(r10, in2, __ D, 0); 9967 if (size == 16) { 9968 __ cbnz(r10, ErrorInLowerHalf); 9969 9970 // illegal input is in higher half, store the lower half now. 9971 __ st3(out0, out1, out2, __ T8B, __ post(dst, 24)); 9972 9973 __ umov(r10, in2, __ D, 1); 9974 __ umov(r11, out0, __ D, 1); 9975 __ umov(r12, out1, __ D, 1); 9976 __ umov(r13, out2, __ D, 1); 9977 __ b(StoreLegalData); 9978 9979 __ BIND(ErrorInLowerHalf); 9980 } 9981 __ umov(r11, out0, __ D, 0); 9982 __ umov(r12, out1, __ D, 0); 9983 __ umov(r13, out2, __ D, 0); 9984 9985 __ BIND(StoreLegalData); 9986 __ tbnz(r10, 5, Exit); // 0xff indicates illegal input 9987 __ strb(r11, __ post(dst, 1)); 9988 __ strb(r12, __ post(dst, 1)); 9989 __ strb(r13, __ post(dst, 1)); 9990 __ lsr(r10, r10, 8); 9991 __ lsr(r11, r11, 8); 9992 __ lsr(r12, r12, 8); 9993 __ lsr(r13, r13, 8); 9994 __ b(StoreLegalData); 9995 9996 __ BIND(NoIllegalData); 9997 __ st3(out0, out1, out2, arrangement, __ post(dst, 3 * size)); 9998 } 9999 10000 10001 /** 10002 * Arguments: 10003 * 10004 * Input: 10005 * c_rarg0 - src_start 10006 * c_rarg1 - src_offset 10007 * c_rarg2 - src_length 10008 * c_rarg3 - dest_start 10009 * c_rarg4 - dest_offset 10010 * c_rarg5 - isURL 10011 * c_rarg6 - isMIME 10012 * 10013 */ 10014 address generate_base64_decodeBlock() { 10015 10016 // The SIMD part of this Base64 decode intrinsic is based on the algorithm outlined 10017 // on http://0x80.pl/articles/base64-simd-neon.html#encoding-quadwords, in section 10018 // titled "Base64 decoding". 10019 10020 // Non-SIMD lookup tables are mostly dumped from fromBase64 array used in java.util.Base64, 10021 // except the trailing character '=' is also treated illegal value in this intrinsic. That 10022 // is java.util.Base64.fromBase64['='] = -2, while fromBase(URL)64ForNoSIMD['='] = 255 here. 
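  // In C, approximately, each 256-entry table below is just the inverse of the
  // corresponding encode alphabet, with every non-alphabet byte (including '=')
  // mapped to 255u. An illustrative sketch of how such a table could be derived
  // (not the generator actually used; t and i are placeholder names):
  //
  //   uint8_t t[256];
  //   memset(t, 255, sizeof t);
  //   for (int i = 0; i < 64; i++)
  //     t[(unsigned char)toBase64[i]] = (uint8_t)i;   // or toBase64URL[i]
  //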
10023 static const uint8_t fromBase64ForNoSIMD[256] = { 10024 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10025 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10026 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 255u, 63u, 10027 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 10028 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u, 10029 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 255u, 10030 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u, 10031 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u, 10032 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10033 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10034 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10035 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10036 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10037 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10038 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10039 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10040 }; 10041 10042 static const uint8_t fromBase64URLForNoSIMD[256] = { 10043 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10044 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10045 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 10046 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 10047 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u, 10048 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 63u, 10049 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u, 10050 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u, 10051 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10052 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10053 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10054 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10055 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10056 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10057 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10058 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10059 }; 10060 10061 // A legal value of base64 code is in range [0, 127]. We need two lookups 10062 // with tbl/tbx and combine them to get the decode data. The 1st table vector 10063 // lookup use tbl, out of range indices are set to 0 in destination. 
The 2nd 10064 // table vector lookup use tbx, out of range indices are unchanged in 10065 // destination. Input [64..126] is mapped to index [65, 127] in second lookup. 10066 // The value of index 64 is set to 0, so that we know that we already get the 10067 // decoded data with the 1st lookup. 10068 static const uint8_t fromBase64ForSIMD[128] = { 10069 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10070 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10071 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 255u, 63u, 10072 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 10073 0u, 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 10074 14u, 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 10075 255u, 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 10076 40u, 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 10077 }; 10078 10079 static const uint8_t fromBase64URLForSIMD[128] = { 10080 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10081 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10082 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 10083 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 10084 0u, 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 10085 14u, 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 10086 63u, 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 10087 40u, 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 10088 }; 10089 10090 __ align(CodeEntryAlignment); 10091 StubId stub_id = StubId::stubgen_base64_decodeBlock_id; 10092 StubCodeMark mark(this, stub_id); 10093 address start = __ pc(); 10094 10095 Register src = c_rarg0; // source array 10096 Register soff = c_rarg1; // source start offset 10097 Register send = c_rarg2; // source end offset 10098 Register dst = c_rarg3; // dest array 10099 Register doff = c_rarg4; // position for writing to dest array 10100 Register isURL = c_rarg5; // Base64 or URL character set 10101 Register isMIME = c_rarg6; // Decoding MIME block - unused in this implementation 10102 10103 Register length = send; // reuse send as length of source data to process 10104 10105 Register simd_codec = c_rarg6; 10106 Register nosimd_codec = c_rarg7; 10107 10108 Label ProcessData, Process64B, Process32B, Process4B, SIMDEnter, SIMDExit, Exit; 10109 10110 __ enter(); 10111 10112 __ add(src, src, soff); 10113 __ add(dst, dst, doff); 10114 10115 __ mov(doff, dst); 10116 10117 __ sub(length, send, soff); 10118 __ bfm(length, zr, 0, 1); 10119 10120 __ lea(nosimd_codec, ExternalAddress((address) fromBase64ForNoSIMD)); 10121 __ cbz(isURL, ProcessData); 10122 __ lea(nosimd_codec, ExternalAddress((address) fromBase64URLForNoSIMD)); 10123 10124 __ BIND(ProcessData); 10125 __ mov(rscratch1, length); 10126 __ cmp(length, (u1)144); // 144 = 80 + 64 10127 __ br(Assembler::LT, Process4B); 10128 10129 // In the MIME case, the line length cannot be more than 76 10130 // bytes (see RFC 2045). This is too short a block for SIMD 10131 // to be worthwhile, so we use non-SIMD here. 
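    // rscratch1 below is the byte budget for the scalar Process4B loop: it is
    // decremented by 4 each iteration and the loop continues while it is still
    // positive. Seeding it with 79 therefore consumes exactly 80 bytes (20
    // iterations) and leaves rscratch1 == -1, whereas entering Process4B
    // directly with rscratch1 == length (a multiple of 4) leaves it at 0; the
    // cbzw(rscratch1, Exit) after the loop relies on that difference.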
10132 __ movw(rscratch1, 79); 10133 10134 __ BIND(Process4B); 10135 __ ldrw(r14, __ post(src, 4)); 10136 __ ubfxw(r10, r14, 0, 8); 10137 __ ubfxw(r11, r14, 8, 8); 10138 __ ubfxw(r12, r14, 16, 8); 10139 __ ubfxw(r13, r14, 24, 8); 10140 // get the de-code 10141 __ ldrb(r10, Address(nosimd_codec, r10, Address::uxtw(0))); 10142 __ ldrb(r11, Address(nosimd_codec, r11, Address::uxtw(0))); 10143 __ ldrb(r12, Address(nosimd_codec, r12, Address::uxtw(0))); 10144 __ ldrb(r13, Address(nosimd_codec, r13, Address::uxtw(0))); 10145 // error detection, 255u indicates an illegal input 10146 __ orrw(r14, r10, r11); 10147 __ orrw(r15, r12, r13); 10148 __ orrw(r14, r14, r15); 10149 __ tbnz(r14, 7, Exit); 10150 // recover the data 10151 __ lslw(r14, r10, 10); 10152 __ bfiw(r14, r11, 4, 6); 10153 __ bfmw(r14, r12, 2, 5); 10154 __ rev16w(r14, r14); 10155 __ bfiw(r13, r12, 6, 2); 10156 __ strh(r14, __ post(dst, 2)); 10157 __ strb(r13, __ post(dst, 1)); 10158 // non-simd loop 10159 __ subsw(rscratch1, rscratch1, 4); 10160 __ br(Assembler::GT, Process4B); 10161 10162 // if exiting from PreProcess80B, rscratch1 == -1; 10163 // otherwise, rscratch1 == 0. 10164 __ cbzw(rscratch1, Exit); 10165 __ sub(length, length, 80); 10166 10167 __ lea(simd_codec, ExternalAddress((address) fromBase64ForSIMD)); 10168 __ cbz(isURL, SIMDEnter); 10169 __ lea(simd_codec, ExternalAddress((address) fromBase64URLForSIMD)); 10170 10171 __ BIND(SIMDEnter); 10172 __ ld1(v0, v1, v2, v3, __ T16B, __ post(simd_codec, 64)); 10173 __ ld1(v4, v5, v6, v7, __ T16B, Address(simd_codec)); 10174 __ mov(rscratch1, 63); 10175 __ dup(v27, __ T16B, rscratch1); 10176 10177 __ BIND(Process64B); 10178 __ cmp(length, (u1)64); 10179 __ br(Assembler::LT, Process32B); 10180 generate_base64_decode_simdround(src, dst, v0, v4, 16, Exit); 10181 __ sub(length, length, 64); 10182 __ b(Process64B); 10183 10184 __ BIND(Process32B); 10185 __ cmp(length, (u1)32); 10186 __ br(Assembler::LT, SIMDExit); 10187 generate_base64_decode_simdround(src, dst, v0, v4, 8, Exit); 10188 __ sub(length, length, 32); 10189 __ b(Process32B); 10190 10191 __ BIND(SIMDExit); 10192 __ cbz(length, Exit); 10193 __ movw(rscratch1, length); 10194 __ b(Process4B); 10195 10196 __ BIND(Exit); 10197 __ sub(c_rarg0, dst, doff); 10198 10199 __ leave(); 10200 __ ret(lr); 10201 10202 return start; 10203 } 10204 10205 // Support for spin waits. 
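  // Note: on this port MacroAssembler::spin_wait() is expected to expand to the
  // hint sequence selected by the OnSpinWaitInst/OnSpinWaitInstCount options
  // (e.g. NOP, ISB or YIELD repeated the requested number of times), so the
  // stub below is just that hint followed by a return.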
10206 address generate_spin_wait() { 10207 __ align(CodeEntryAlignment); 10208 StubId stub_id = StubId::stubgen_spin_wait_id; 10209 StubCodeMark mark(this, stub_id); 10210 address start = __ pc(); 10211 10212 __ spin_wait(); 10213 __ ret(lr); 10214 10215 return start; 10216 } 10217 10218 void generate_lookup_secondary_supers_table_stub() { 10219 StubId stub_id = StubId::stubgen_lookup_secondary_supers_table_id; 10220 StubCodeMark mark(this, stub_id); 10221 10222 const Register 10223 r_super_klass = r0, 10224 r_array_base = r1, 10225 r_array_length = r2, 10226 r_array_index = r3, 10227 r_sub_klass = r4, 10228 r_bitmap = rscratch2, 10229 result = r5; 10230 const FloatRegister 10231 vtemp = v0; 10232 10233 for (int slot = 0; slot < Klass::SECONDARY_SUPERS_TABLE_SIZE; slot++) { 10234 StubRoutines::_lookup_secondary_supers_table_stubs[slot] = __ pc(); 10235 Label L_success; 10236 __ enter(); 10237 __ lookup_secondary_supers_table_const(r_sub_klass, r_super_klass, 10238 r_array_base, r_array_length, r_array_index, 10239 vtemp, result, slot, 10240 /*stub_is_near*/true); 10241 __ leave(); 10242 __ ret(lr); 10243 } 10244 } 10245 10246 // Slow path implementation for UseSecondarySupersTable. 10247 address generate_lookup_secondary_supers_table_slow_path_stub() { 10248 StubId stub_id = StubId::stubgen_lookup_secondary_supers_table_slow_path_id; 10249 StubCodeMark mark(this, stub_id); 10250 10251 address start = __ pc(); 10252 const Register 10253 r_super_klass = r0, // argument 10254 r_array_base = r1, // argument 10255 temp1 = r2, // temp 10256 r_array_index = r3, // argument 10257 r_bitmap = rscratch2, // argument 10258 result = r5; // argument 10259 10260 __ lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base, r_array_index, r_bitmap, temp1, result); 10261 __ ret(lr); 10262 10263 return start; 10264 } 10265 10266 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS) 10267 10268 // ARMv8.1 LSE versions of the atomic stubs used by Atomic::PlatformXX. 10269 // 10270 // If LSE is in use, generate LSE versions of all the stubs. The 10271 // non-LSE versions are in atomic_aarch64.S. 10272 10273 // class AtomicStubMark records the entry point of a stub and the 10274 // stub pointer which will point to it. The stub pointer is set to 10275 // the entry point when ~AtomicStubMark() is called, which must be 10276 // after ICache::invalidate_range. This ensures safe publication of 10277 // the generated code. 10278 class AtomicStubMark { 10279 address _entry_point; 10280 aarch64_atomic_stub_t *_stub; 10281 MacroAssembler *_masm; 10282 public: 10283 AtomicStubMark(MacroAssembler *masm, aarch64_atomic_stub_t *stub) { 10284 _masm = masm; 10285 __ align(32); 10286 _entry_point = __ pc(); 10287 _stub = stub; 10288 } 10289 ~AtomicStubMark() { 10290 *_stub = (aarch64_atomic_stub_t)_entry_point; 10291 } 10292 }; 10293 10294 // NB: For memory_order_conservative we need a trailing membar after 10295 // LSE atomic operations but not a leading membar. 10296 // 10297 // We don't need a leading membar because a clause in the Arm ARM 10298 // says: 10299 // 10300 // Barrier-ordered-before 10301 // 10302 // Barrier instructions order prior Memory effects before subsequent 10303 // Memory effects generated by the same Observer. A read or a write 10304 // RW1 is Barrier-ordered-before a read or a write RW 2 from the same 10305 // Observer if and only if RW1 appears in program order before RW 2 10306 // and [ ... 
] at least one of RW 1 and RW 2 is generated by an atomic 10307 // instruction with both Acquire and Release semantics. 10308 // 10309 // All the atomic instructions {ldaddal, swapal, casal} have Acquire 10310 // and Release semantics, therefore we don't need a leading 10311 // barrier. However, there is no corresponding Barrier-ordered-after 10312 // relationship, therefore we need a trailing membar to prevent a 10313 // later store or load from being reordered with the store in an 10314 // atomic instruction. 10315 // 10316 // This was checked by using the herd7 consistency model simulator 10317 // (http://diy.inria.fr/) with this test case: 10318 // 10319 // AArch64 LseCas 10320 // { 0:X1=x; 0:X2=y; 1:X1=x; 1:X2=y; } 10321 // P0 | P1; 10322 // LDR W4, [X2] | MOV W3, #0; 10323 // DMB LD | MOV W4, #1; 10324 // LDR W3, [X1] | CASAL W3, W4, [X1]; 10325 // | DMB ISH; 10326 // | STR W4, [X2]; 10327 // exists 10328 // (0:X3=0 /\ 0:X4=1) 10329 // 10330 // If X3 == 0 && X4 == 1, the store to y in P1 has been reordered 10331 // with the store to x in P1. Without the DMB in P1 this may happen. 10332 // 10333 // At the time of writing we don't know of any AArch64 hardware that 10334 // reorders stores in this way, but the Reference Manual permits it. 10335 10336 void gen_cas_entry(Assembler::operand_size size, 10337 atomic_memory_order order) { 10338 Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1, 10339 exchange_val = c_rarg2; 10340 bool acquire, release; 10341 switch (order) { 10342 case memory_order_relaxed: 10343 acquire = false; 10344 release = false; 10345 break; 10346 case memory_order_release: 10347 acquire = false; 10348 release = true; 10349 break; 10350 default: 10351 acquire = true; 10352 release = true; 10353 break; 10354 } 10355 __ mov(prev, compare_val); 10356 __ lse_cas(prev, exchange_val, ptr, size, acquire, release, /*not_pair*/true); 10357 if (order == memory_order_conservative) { 10358 __ membar(Assembler::StoreStore|Assembler::StoreLoad); 10359 } 10360 if (size == Assembler::xword) { 10361 __ mov(r0, prev); 10362 } else { 10363 __ movw(r0, prev); 10364 } 10365 __ ret(lr); 10366 } 10367 10368 void gen_ldadd_entry(Assembler::operand_size size, atomic_memory_order order) { 10369 Register prev = r2, addr = c_rarg0, incr = c_rarg1; 10370 // If not relaxed, then default to conservative. Relaxed is the only 10371 // case we use enough to be worth specializing. 10372 if (order == memory_order_relaxed) { 10373 __ ldadd(size, incr, prev, addr); 10374 } else { 10375 __ ldaddal(size, incr, prev, addr); 10376 __ membar(Assembler::StoreStore|Assembler::StoreLoad); 10377 } 10378 if (size == Assembler::xword) { 10379 __ mov(r0, prev); 10380 } else { 10381 __ movw(r0, prev); 10382 } 10383 __ ret(lr); 10384 } 10385 10386 void gen_swpal_entry(Assembler::operand_size size) { 10387 Register prev = r2, addr = c_rarg0, incr = c_rarg1; 10388 __ swpal(size, incr, prev, addr); 10389 __ membar(Assembler::StoreStore|Assembler::StoreLoad); 10390 if (size == Assembler::xword) { 10391 __ mov(r0, prev); 10392 } else { 10393 __ movw(r0, prev); 10394 } 10395 __ ret(lr); 10396 } 10397 10398 void generate_atomic_entry_points() { 10399 if (! 
UseLSE) { 10400 return; 10401 } 10402 __ align(CodeEntryAlignment); 10403 StubId stub_id = StubId::stubgen_atomic_entry_points_id; 10404 StubCodeMark mark(this, stub_id); 10405 address first_entry = __ pc(); 10406 10407 // ADD, memory_order_conservative 10408 AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl); 10409 gen_ldadd_entry(Assembler::word, memory_order_conservative); 10410 AtomicStubMark mark_fetch_add_8(_masm, &aarch64_atomic_fetch_add_8_impl); 10411 gen_ldadd_entry(Assembler::xword, memory_order_conservative); 10412 10413 // ADD, memory_order_relaxed 10414 AtomicStubMark mark_fetch_add_4_relaxed 10415 (_masm, &aarch64_atomic_fetch_add_4_relaxed_impl); 10416 gen_ldadd_entry(MacroAssembler::word, memory_order_relaxed); 10417 AtomicStubMark mark_fetch_add_8_relaxed 10418 (_masm, &aarch64_atomic_fetch_add_8_relaxed_impl); 10419 gen_ldadd_entry(MacroAssembler::xword, memory_order_relaxed); 10420 10421 // XCHG, memory_order_conservative 10422 AtomicStubMark mark_xchg_4(_masm, &aarch64_atomic_xchg_4_impl); 10423 gen_swpal_entry(Assembler::word); 10424 AtomicStubMark mark_xchg_8_impl(_masm, &aarch64_atomic_xchg_8_impl); 10425 gen_swpal_entry(Assembler::xword); 10426 10427 // CAS, memory_order_conservative 10428 AtomicStubMark mark_cmpxchg_1(_masm, &aarch64_atomic_cmpxchg_1_impl); 10429 gen_cas_entry(MacroAssembler::byte, memory_order_conservative); 10430 AtomicStubMark mark_cmpxchg_4(_masm, &aarch64_atomic_cmpxchg_4_impl); 10431 gen_cas_entry(MacroAssembler::word, memory_order_conservative); 10432 AtomicStubMark mark_cmpxchg_8(_masm, &aarch64_atomic_cmpxchg_8_impl); 10433 gen_cas_entry(MacroAssembler::xword, memory_order_conservative); 10434 10435 // CAS, memory_order_relaxed 10436 AtomicStubMark mark_cmpxchg_1_relaxed 10437 (_masm, &aarch64_atomic_cmpxchg_1_relaxed_impl); 10438 gen_cas_entry(MacroAssembler::byte, memory_order_relaxed); 10439 AtomicStubMark mark_cmpxchg_4_relaxed 10440 (_masm, &aarch64_atomic_cmpxchg_4_relaxed_impl); 10441 gen_cas_entry(MacroAssembler::word, memory_order_relaxed); 10442 AtomicStubMark mark_cmpxchg_8_relaxed 10443 (_masm, &aarch64_atomic_cmpxchg_8_relaxed_impl); 10444 gen_cas_entry(MacroAssembler::xword, memory_order_relaxed); 10445 10446 AtomicStubMark mark_cmpxchg_4_release 10447 (_masm, &aarch64_atomic_cmpxchg_4_release_impl); 10448 gen_cas_entry(MacroAssembler::word, memory_order_release); 10449 AtomicStubMark mark_cmpxchg_8_release 10450 (_masm, &aarch64_atomic_cmpxchg_8_release_impl); 10451 gen_cas_entry(MacroAssembler::xword, memory_order_release); 10452 10453 AtomicStubMark mark_cmpxchg_4_seq_cst 10454 (_masm, &aarch64_atomic_cmpxchg_4_seq_cst_impl); 10455 gen_cas_entry(MacroAssembler::word, memory_order_seq_cst); 10456 AtomicStubMark mark_cmpxchg_8_seq_cst 10457 (_masm, &aarch64_atomic_cmpxchg_8_seq_cst_impl); 10458 gen_cas_entry(MacroAssembler::xword, memory_order_seq_cst); 10459 10460 ICache::invalidate_range(first_entry, __ pc() - first_entry); 10461 } 10462 #endif // LINUX 10463 10464 address generate_cont_thaw(Continuation::thaw_kind kind) { 10465 bool return_barrier = Continuation::is_thaw_return_barrier(kind); 10466 bool return_barrier_exception = Continuation::is_thaw_return_barrier_exception(kind); 10467 10468 address start = __ pc(); 10469 10470 if (return_barrier) { 10471 __ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())); 10472 __ mov(sp, rscratch1); 10473 } 10474 assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, 
"incorrect sp"); 10475 10476 if (return_barrier) { 10477 // preserve possible return value from a method returning to the return barrier 10478 __ fmovd(rscratch1, v0); 10479 __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize))); 10480 } 10481 10482 __ movw(c_rarg1, (return_barrier ? 1 : 0)); 10483 __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), rthread, c_rarg1); 10484 __ mov(rscratch2, r0); // r0 contains the size of the frames to thaw, 0 if overflow or no more frames 10485 10486 if (return_barrier) { 10487 // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK) 10488 __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize))); 10489 __ fmovd(v0, rscratch1); 10490 } 10491 assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp"); 10492 10493 10494 Label thaw_success; 10495 // rscratch2 contains the size of the frames to thaw, 0 if overflow or no more frames 10496 __ cbnz(rscratch2, thaw_success); 10497 __ lea(rscratch1, RuntimeAddress(SharedRuntime::throw_StackOverflowError_entry())); 10498 __ br(rscratch1); 10499 __ bind(thaw_success); 10500 10501 // make room for the thawed frames 10502 __ sub(rscratch1, sp, rscratch2); 10503 __ andr(rscratch1, rscratch1, -16); // align 10504 __ mov(sp, rscratch1); 10505 10506 if (return_barrier) { 10507 // save original return value -- again 10508 __ fmovd(rscratch1, v0); 10509 __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize))); 10510 } 10511 10512 // If we want, we can templatize thaw by kind, and have three different entries 10513 __ movw(c_rarg1, (uint32_t)kind); 10514 10515 __ call_VM_leaf(Continuation::thaw_entry(), rthread, c_rarg1); 10516 __ mov(rscratch2, r0); // r0 is the sp of the yielding frame 10517 10518 if (return_barrier) { 10519 // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK) 10520 __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize))); 10521 __ fmovd(v0, rscratch1); 10522 } else { 10523 __ mov(r0, zr); // return 0 (success) from doYield 10524 } 10525 10526 // we're now on the yield frame (which is in an address above us b/c rsp has been pushed down) 10527 __ sub(sp, rscratch2, 2*wordSize); // now pointing to rfp spill 10528 __ mov(rfp, sp); 10529 10530 if (return_barrier_exception) { 10531 __ ldr(c_rarg1, Address(rfp, wordSize)); // return address 10532 __ authenticate_return_address(c_rarg1); 10533 __ verify_oop(r0); 10534 // save return value containing the exception oop in callee-saved R19 10535 __ mov(r19, r0); 10536 10537 __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), rthread, c_rarg1); 10538 10539 // Reinitialize the ptrue predicate register, in case the external runtime call clobbers ptrue reg, as we may return to SVE compiled code. 
10540 // __ reinitialize_ptrue(); 10541 10542 // see OptoRuntime::generate_exception_blob: r0 -- exception oop, r3 -- exception pc 10543 10544 __ mov(r1, r0); // the exception handler 10545 __ mov(r0, r19); // restore return value containing the exception oop 10546 __ verify_oop(r0); 10547 10548 __ leave(); 10549 __ mov(r3, lr); 10550 __ br(r1); // the exception handler 10551 } else { 10552 // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame 10553 __ leave(); 10554 __ ret(lr); 10555 } 10556 10557 return start; 10558 } 10559 10560 address generate_cont_thaw() { 10561 if (!Continuations::enabled()) return nullptr; 10562 10563 StubId stub_id = StubId::stubgen_cont_thaw_id; 10564 StubCodeMark mark(this, stub_id); 10565 address start = __ pc(); 10566 generate_cont_thaw(Continuation::thaw_top); 10567 return start; 10568 } 10569 10570 address generate_cont_returnBarrier() { 10571 if (!Continuations::enabled()) return nullptr; 10572 10573 // TODO: will probably need multiple return barriers depending on return type 10574 StubId stub_id = StubId::stubgen_cont_returnBarrier_id; 10575 StubCodeMark mark(this, stub_id); 10576 address start = __ pc(); 10577 10578 generate_cont_thaw(Continuation::thaw_return_barrier); 10579 10580 return start; 10581 } 10582 10583 address generate_cont_returnBarrier_exception() { 10584 if (!Continuations::enabled()) return nullptr; 10585 10586 StubId stub_id = StubId::stubgen_cont_returnBarrierExc_id; 10587 StubCodeMark mark(this, stub_id); 10588 address start = __ pc(); 10589 10590 generate_cont_thaw(Continuation::thaw_return_barrier_exception); 10591 10592 return start; 10593 } 10594 10595 address generate_cont_preempt_stub() { 10596 if (!Continuations::enabled()) return nullptr; 10597 StubId stub_id = StubId::stubgen_cont_preempt_id; 10598 StubCodeMark mark(this, stub_id); 10599 address start = __ pc(); 10600 10601 __ reset_last_Java_frame(true); 10602 10603 // Set sp to enterSpecial frame, i.e. remove all frames copied into the heap. 10604 __ ldr(rscratch2, Address(rthread, JavaThread::cont_entry_offset())); 10605 __ mov(sp, rscratch2); 10606 10607 Label preemption_cancelled; 10608 __ ldrb(rscratch1, Address(rthread, JavaThread::preemption_cancelled_offset())); 10609 __ cbnz(rscratch1, preemption_cancelled); 10610 10611 // Remove enterSpecial frame from the stack and return to Continuation.run() to unmount. 10612 SharedRuntime::continuation_enter_cleanup(_masm); 10613 __ leave(); 10614 __ ret(lr); 10615 10616 // We acquired the monitor after freezing the frames so call thaw to continue execution. 10617 __ bind(preemption_cancelled); 10618 __ strb(zr, Address(rthread, JavaThread::preemption_cancelled_offset())); 10619 __ lea(rfp, Address(sp, checked_cast<int32_t>(ContinuationEntry::size()))); 10620 __ lea(rscratch1, ExternalAddress(ContinuationEntry::thaw_call_pc_address())); 10621 __ ldr(rscratch1, Address(rscratch1)); 10622 __ br(rscratch1); 10623 10624 return start; 10625 } 10626 10627 // In sun.security.util.math.intpoly.IntegerPolynomial1305, integers 10628 // are represented as long[5], with BITS_PER_LIMB = 26. 10629 // Pack five 26-bit limbs into three 64-bit registers. 
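  // In C, approximately (src viewed as julong[5] holding 26-bit limbs;
  // dest2 receives the top two bits and may be omitted):
  //
  //   dest0 = src[0] + (src[1] << 26) + (src[2] << 52);
  //   dest1 = (src[2] >> 12) + (src[3] << 14) + (src[4] << 40);
  //   dest2 = src[4] >> 24;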
10630 void pack_26(Register dest0, Register dest1, Register dest2, Register src) { 10631 __ ldp(dest0, rscratch1, Address(src, 0)); // 26 bits 10632 __ add(dest0, dest0, rscratch1, Assembler::LSL, 26); // 26 bits 10633 __ ldp(rscratch1, rscratch2, Address(src, 2 * sizeof (jlong))); 10634 __ add(dest0, dest0, rscratch1, Assembler::LSL, 52); // 12 bits 10635 10636 __ add(dest1, zr, rscratch1, Assembler::LSR, 12); // 14 bits 10637 __ add(dest1, dest1, rscratch2, Assembler::LSL, 14); // 26 bits 10638 __ ldr(rscratch1, Address(src, 4 * sizeof (jlong))); 10639 __ add(dest1, dest1, rscratch1, Assembler::LSL, 40); // 24 bits 10640 10641 if (dest2->is_valid()) { 10642 __ add(dest2, zr, rscratch1, Assembler::LSR, 24); // 2 bits 10643 } else { 10644 #ifdef ASSERT 10645 Label OK; 10646 __ cmp(zr, rscratch1, Assembler::LSR, 24); // 2 bits 10647 __ br(__ EQ, OK); 10648 __ stop("high bits of Poly1305 integer should be zero"); 10649 __ should_not_reach_here(); 10650 __ bind(OK); 10651 #endif 10652 } 10653 } 10654 10655 // As above, but return only a 128-bit integer, packed into two 10656 // 64-bit registers. 10657 void pack_26(Register dest0, Register dest1, Register src) { 10658 pack_26(dest0, dest1, noreg, src); 10659 } 10660 10661 // Multiply and multiply-accumulate unsigned 64-bit registers. 10662 void wide_mul(Register prod_lo, Register prod_hi, Register n, Register m) { 10663 __ mul(prod_lo, n, m); 10664 __ umulh(prod_hi, n, m); 10665 } 10666 void wide_madd(Register sum_lo, Register sum_hi, Register n, Register m) { 10667 wide_mul(rscratch1, rscratch2, n, m); 10668 __ adds(sum_lo, sum_lo, rscratch1); 10669 __ adc(sum_hi, sum_hi, rscratch2); 10670 } 10671 10672 // Poly1305, RFC 7539 10673 10674 // See https://loup-vaillant.fr/tutorials/poly1305-design for a 10675 // description of the tricks used to simplify and accelerate this 10676 // computation. 10677 10678 address generate_poly1305_processBlocks() { 10679 __ align(CodeEntryAlignment); 10680 StubId stub_id = StubId::stubgen_poly1305_processBlocks_id; 10681 StubCodeMark mark(this, stub_id); 10682 address start = __ pc(); 10683 Label here; 10684 __ enter(); 10685 RegSet callee_saved = RegSet::range(r19, r28); 10686 __ push(callee_saved, sp); 10687 10688 RegSetIterator<Register> regs = (RegSet::range(c_rarg0, r28) - r18_tls - rscratch1 - rscratch2).begin(); 10689 10690 // Arguments 10691 const Register input_start = *regs, length = *++regs, acc_start = *++regs, r_start = *++regs; 10692 10693 // R_n is the 128-bit randomly-generated key, packed into two 10694 // registers. The caller passes this key to us as long[5], with 10695 // BITS_PER_LIMB = 26. 
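    // (The key is expected to arrive already clamped as RFC 7539 requires,
    // i.e. r &= 0x0ffffffc0ffffffc0ffffffc0fffffff, which is what makes the
    // top four bits of R_0 and R_1 below provably zero.)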
10696 const Register R_0 = *++regs, R_1 = *++regs; 10697 pack_26(R_0, R_1, r_start); 10698 10699 // RR_n is (R_n >> 2) * 5 10700 const Register RR_0 = *++regs, RR_1 = *++regs; 10701 __ lsr(RR_0, R_0, 2); 10702 __ add(RR_0, RR_0, RR_0, Assembler::LSL, 2); 10703 __ lsr(RR_1, R_1, 2); 10704 __ add(RR_1, RR_1, RR_1, Assembler::LSL, 2); 10705 10706 // U_n is the current checksum 10707 const Register U_0 = *++regs, U_1 = *++regs, U_2 = *++regs; 10708 pack_26(U_0, U_1, U_2, acc_start); 10709 10710 static constexpr int BLOCK_LENGTH = 16; 10711 Label DONE, LOOP; 10712 10713 __ cmp(length, checked_cast<u1>(BLOCK_LENGTH)); 10714 __ br(Assembler::LT, DONE); { 10715 __ bind(LOOP); 10716 10717 // S_n is to be the sum of U_n and the next block of data 10718 const Register S_0 = *++regs, S_1 = *++regs, S_2 = *++regs; 10719 __ ldp(S_0, S_1, __ post(input_start, 2 * wordSize)); 10720 __ adds(S_0, U_0, S_0); 10721 __ adcs(S_1, U_1, S_1); 10722 __ adc(S_2, U_2, zr); 10723 __ add(S_2, S_2, 1); 10724 10725 const Register U_0HI = *++regs, U_1HI = *++regs; 10726 10727 // NB: this logic depends on some of the special properties of 10728 // Poly1305 keys. In particular, because we know that the top 10729 // four bits of R_0 and R_1 are zero, we can add together 10730 // partial products without any risk of needing to propagate a 10731 // carry out. 10732 wide_mul(U_0, U_0HI, S_0, R_0); wide_madd(U_0, U_0HI, S_1, RR_1); wide_madd(U_0, U_0HI, S_2, RR_0); 10733 wide_mul(U_1, U_1HI, S_0, R_1); wide_madd(U_1, U_1HI, S_1, R_0); wide_madd(U_1, U_1HI, S_2, RR_1); 10734 __ andr(U_2, R_0, 3); 10735 __ mul(U_2, S_2, U_2); 10736 10737 // Recycle registers S_0, S_1, S_2 10738 regs = (regs.remaining() + S_0 + S_1 + S_2).begin(); 10739 10740 // Partial reduction mod 2**130 - 5 10741 __ adds(U_1, U_0HI, U_1); 10742 __ adc(U_2, U_1HI, U_2); 10743 // Sum now in U_2:U_1:U_0. 10744 // Dead: U_0HI, U_1HI. 10745 regs = (regs.remaining() + U_0HI + U_1HI).begin(); 10746 10747 // U_2:U_1:U_0 += (U_2 >> 2) * 5 in two steps 10748 10749 // First, U_2:U_1:U_0 += (U_2 >> 2) 10750 __ lsr(rscratch1, U_2, 2); 10751 __ andr(U_2, U_2, (u8)3); 10752 __ adds(U_0, U_0, rscratch1); 10753 __ adcs(U_1, U_1, zr); 10754 __ adc(U_2, U_2, zr); 10755 // Second, U_2:U_1:U_0 += (U_2 >> 2) << 2 10756 __ adds(U_0, U_0, rscratch1, Assembler::LSL, 2); 10757 __ adcs(U_1, U_1, zr); 10758 __ adc(U_2, U_2, zr); 10759 10760 __ sub(length, length, checked_cast<u1>(BLOCK_LENGTH)); 10761 __ cmp(length, checked_cast<u1>(BLOCK_LENGTH)); 10762 __ br(~ Assembler::LT, LOOP); 10763 } 10764 10765 // Further reduce modulo 2^130 - 5 10766 __ lsr(rscratch1, U_2, 2); 10767 __ add(rscratch1, rscratch1, rscratch1, Assembler::LSL, 2); // rscratch1 = U_2 * 5 10768 __ adds(U_0, U_0, rscratch1); // U_0 += U_2 * 5 10769 __ adcs(U_1, U_1, zr); 10770 __ andr(U_2, U_2, (u1)3); 10771 __ adc(U_2, U_2, zr); 10772 10773 // Unpack the sum into five 26-bit limbs and write to memory. 
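    // In C, approximately (acc_start viewed as julong[5]; acc is an
    // illustrative name):
    //
    //   acc[0] =  U_0        & 0x3ffffff;
    //   acc[1] = (U_0 >> 26) & 0x3ffffff;
    //   acc[2] = (U_0 >> 52) | ((U_1 & 0x3fff) << 12);
    //   acc[3] = (U_1 >> 14) & 0x3ffffff;
    //   acc[4] = (U_1 >> 40) | ((U_2 & 7)     << 24);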
10774 __ ubfiz(rscratch1, U_0, 0, 26); 10775 __ ubfx(rscratch2, U_0, 26, 26); 10776 __ stp(rscratch1, rscratch2, Address(acc_start)); 10777 __ ubfx(rscratch1, U_0, 52, 12); 10778 __ bfi(rscratch1, U_1, 12, 14); 10779 __ ubfx(rscratch2, U_1, 14, 26); 10780 __ stp(rscratch1, rscratch2, Address(acc_start, 2 * sizeof (jlong))); 10781 __ ubfx(rscratch1, U_1, 40, 24); 10782 __ bfi(rscratch1, U_2, 24, 3); 10783 __ str(rscratch1, Address(acc_start, 4 * sizeof (jlong))); 10784 10785 __ bind(DONE); 10786 __ pop(callee_saved, sp); 10787 __ leave(); 10788 __ ret(lr); 10789 10790 return start; 10791 } 10792 10793 // exception handler for upcall stubs 10794 address generate_upcall_stub_exception_handler() { 10795 StubId stub_id = StubId::stubgen_upcall_stub_exception_handler_id; 10796 StubCodeMark mark(this, stub_id); 10797 address start = __ pc(); 10798 10799 // Native caller has no idea how to handle exceptions, 10800 // so we just crash here. Up to callee to catch exceptions. 10801 __ verify_oop(r0); 10802 __ movptr(rscratch1, CAST_FROM_FN_PTR(uint64_t, UpcallLinker::handle_uncaught_exception)); 10803 __ blr(rscratch1); 10804 __ should_not_reach_here(); 10805 10806 return start; 10807 } 10808 10809 // load Method* target of MethodHandle 10810 // j_rarg0 = jobject receiver 10811 // rmethod = result 10812 address generate_upcall_stub_load_target() { 10813 StubId stub_id = StubId::stubgen_upcall_stub_load_target_id; 10814 StubCodeMark mark(this, stub_id); 10815 address start = __ pc(); 10816 10817 __ resolve_global_jobject(j_rarg0, rscratch1, rscratch2); 10818 // Load target method from receiver 10819 __ load_heap_oop(rmethod, Address(j_rarg0, java_lang_invoke_MethodHandle::form_offset()), rscratch1, rscratch2); 10820 __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_LambdaForm::vmentry_offset()), rscratch1, rscratch2); 10821 __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_MemberName::method_offset()), rscratch1, rscratch2); 10822 __ access_load_at(T_ADDRESS, IN_HEAP, rmethod, 10823 Address(rmethod, java_lang_invoke_ResolvedMethodName::vmtarget_offset()), 10824 noreg, noreg); 10825 __ str(rmethod, Address(rthread, JavaThread::callee_target_offset())); // just in case callee is deoptimized 10826 10827 __ ret(lr); 10828 10829 return start; 10830 } 10831 10832 #undef __ 10833 #define __ masm-> 10834 10835 class MontgomeryMultiplyGenerator : public MacroAssembler { 10836 10837 Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn, 10838 Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj; 10839 10840 RegSet _toSave; 10841 bool _squaring; 10842 10843 public: 10844 MontgomeryMultiplyGenerator (Assembler *as, bool squaring) 10845 : MacroAssembler(as->code()), _squaring(squaring) { 10846 10847 // Register allocation 10848 10849 RegSetIterator<Register> regs = (RegSet::range(r0, r26) - r18_tls).begin(); 10850 Pa_base = *regs; // Argument registers 10851 if (squaring) 10852 Pb_base = Pa_base; 10853 else 10854 Pb_base = *++regs; 10855 Pn_base = *++regs; 10856 Rlen= *++regs; 10857 inv = *++regs; 10858 Pm_base = *++regs; 10859 10860 // Working registers: 10861 Ra = *++regs; // The current digit of a, b, n, and m. 10862 Rb = *++regs; 10863 Rm = *++regs; 10864 Rn = *++regs; 10865 10866 Pa = *++regs; // Pointers to the current/next digit of a, b, n, and m. 10867 Pb = *++regs; 10868 Pm = *++regs; 10869 Pn = *++regs; 10870 10871 t0 = *++regs; // Three registers which form a 10872 t1 = *++regs; // triple-precision accumuator. 
10873 t2 = *++regs; 10874 10875 Ri = *++regs; // Inner and outer loop indexes. 10876 Rj = *++regs; 10877 10878 Rhi_ab = *++regs; // Product registers: low and high parts 10879 Rlo_ab = *++regs; // of a*b and m*n. 10880 Rhi_mn = *++regs; 10881 Rlo_mn = *++regs; 10882 10883 // r19 and up are callee-saved. 10884 _toSave = RegSet::range(r19, *regs) + Pm_base; 10885 } 10886 10887 private: 10888 void save_regs() { 10889 push(_toSave, sp); 10890 } 10891 10892 void restore_regs() { 10893 pop(_toSave, sp); 10894 } 10895 10896 template <typename T> 10897 void unroll_2(Register count, T block) { 10898 Label loop, end, odd; 10899 tbnz(count, 0, odd); 10900 cbz(count, end); 10901 align(16); 10902 bind(loop); 10903 (this->*block)(); 10904 bind(odd); 10905 (this->*block)(); 10906 subs(count, count, 2); 10907 br(Assembler::GT, loop); 10908 bind(end); 10909 } 10910 10911 template <typename T> 10912 void unroll_2(Register count, T block, Register d, Register s, Register tmp) { 10913 Label loop, end, odd; 10914 tbnz(count, 0, odd); 10915 cbz(count, end); 10916 align(16); 10917 bind(loop); 10918 (this->*block)(d, s, tmp); 10919 bind(odd); 10920 (this->*block)(d, s, tmp); 10921 subs(count, count, 2); 10922 br(Assembler::GT, loop); 10923 bind(end); 10924 } 10925 10926 void pre1(RegisterOrConstant i) { 10927 block_comment("pre1"); 10928 // Pa = Pa_base; 10929 // Pb = Pb_base + i; 10930 // Pm = Pm_base; 10931 // Pn = Pn_base + i; 10932 // Ra = *Pa; 10933 // Rb = *Pb; 10934 // Rm = *Pm; 10935 // Rn = *Pn; 10936 ldr(Ra, Address(Pa_base)); 10937 ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); 10938 ldr(Rm, Address(Pm_base)); 10939 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 10940 lea(Pa, Address(Pa_base)); 10941 lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); 10942 lea(Pm, Address(Pm_base)); 10943 lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 10944 10945 // Zero the m*n result. 10946 mov(Rhi_mn, zr); 10947 mov(Rlo_mn, zr); 10948 } 10949 10950 // The core multiply-accumulate step of a Montgomery 10951 // multiplication. The idea is to schedule operations as a 10952 // pipeline so that instructions with long latencies (loads and 10953 // multiplies) have time to complete before their results are 10954 // used. This most benefits in-order implementations of the 10955 // architecture but out-of-order ones also benefit. 10956 void step() { 10957 block_comment("step"); 10958 // MACC(Ra, Rb, t0, t1, t2); 10959 // Ra = *++Pa; 10960 // Rb = *--Pb; 10961 umulh(Rhi_ab, Ra, Rb); 10962 mul(Rlo_ab, Ra, Rb); 10963 ldr(Ra, pre(Pa, wordSize)); 10964 ldr(Rb, pre(Pb, -wordSize)); 10965 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the 10966 // previous iteration. 
10967 // MACC(Rm, Rn, t0, t1, t2); 10968 // Rm = *++Pm; 10969 // Rn = *--Pn; 10970 umulh(Rhi_mn, Rm, Rn); 10971 mul(Rlo_mn, Rm, Rn); 10972 ldr(Rm, pre(Pm, wordSize)); 10973 ldr(Rn, pre(Pn, -wordSize)); 10974 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 10975 } 10976 10977 void post1() { 10978 block_comment("post1"); 10979 10980 // MACC(Ra, Rb, t0, t1, t2); 10981 // Ra = *++Pa; 10982 // Rb = *--Pb; 10983 umulh(Rhi_ab, Ra, Rb); 10984 mul(Rlo_ab, Ra, Rb); 10985 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 10986 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 10987 10988 // *Pm = Rm = t0 * inv; 10989 mul(Rm, t0, inv); 10990 str(Rm, Address(Pm)); 10991 10992 // MACC(Rm, Rn, t0, t1, t2); 10993 // t0 = t1; t1 = t2; t2 = 0; 10994 umulh(Rhi_mn, Rm, Rn); 10995 10996 #ifndef PRODUCT 10997 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 10998 { 10999 mul(Rlo_mn, Rm, Rn); 11000 add(Rlo_mn, t0, Rlo_mn); 11001 Label ok; 11002 cbz(Rlo_mn, ok); { 11003 stop("broken Montgomery multiply"); 11004 } bind(ok); 11005 } 11006 #endif 11007 // We have very carefully set things up so that 11008 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 11009 // the lower half of Rm * Rn because we know the result already: 11010 // it must be -t0. t0 + (-t0) must generate a carry iff 11011 // t0 != 0. So, rather than do a mul and an adds we just set 11012 // the carry flag iff t0 is nonzero. 11013 // 11014 // mul(Rlo_mn, Rm, Rn); 11015 // adds(zr, t0, Rlo_mn); 11016 subs(zr, t0, 1); // Set carry iff t0 is nonzero 11017 adcs(t0, t1, Rhi_mn); 11018 adc(t1, t2, zr); 11019 mov(t2, zr); 11020 } 11021 11022 void pre2(RegisterOrConstant i, RegisterOrConstant len) { 11023 block_comment("pre2"); 11024 // Pa = Pa_base + i-len; 11025 // Pb = Pb_base + len; 11026 // Pm = Pm_base + i-len; 11027 // Pn = Pn_base + len; 11028 11029 if (i.is_register()) { 11030 sub(Rj, i.as_register(), len); 11031 } else { 11032 mov(Rj, i.as_constant()); 11033 sub(Rj, Rj, len); 11034 } 11035 // Rj == i-len 11036 11037 lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord))); 11038 lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord))); 11039 lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 11040 lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord))); 11041 11042 // Ra = *++Pa; 11043 // Rb = *--Pb; 11044 // Rm = *++Pm; 11045 // Rn = *--Pn; 11046 ldr(Ra, pre(Pa, wordSize)); 11047 ldr(Rb, pre(Pb, -wordSize)); 11048 ldr(Rm, pre(Pm, wordSize)); 11049 ldr(Rn, pre(Pn, -wordSize)); 11050 11051 mov(Rhi_mn, zr); 11052 mov(Rlo_mn, zr); 11053 } 11054 11055 void post2(RegisterOrConstant i, RegisterOrConstant len) { 11056 block_comment("post2"); 11057 if (i.is_constant()) { 11058 mov(Rj, i.as_constant()-len.as_constant()); 11059 } else { 11060 sub(Rj, i.as_register(), len); 11061 } 11062 11063 adds(t0, t0, Rlo_mn); // The pending m*n, low part 11064 11065 // As soon as we know the least significant digit of our result, 11066 // store it. 11067 // Pm_base[i-len] = t0; 11068 str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 11069 11070 // t0 = t1; t1 = t2; t2 = 0; 11071 adcs(t0, t1, Rhi_mn); // The pending m*n, high part 11072 adc(t1, t2, zr); 11073 mov(t2, zr); 11074 } 11075 11076 // A carry in t0 after Montgomery multiplication means that we 11077 // should subtract multiples of n from our result in m. We'll 11078 // keep doing that until there is no carry. 
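  // In C, approximately, where sub() is a borrow-propagating subtraction that
  // returns t0 minus the final borrow (borrow is an illustrative name):
  //
  //   while (t0) {
  //     julong borrow = 0;
  //     for (int i = 0; i < len; i++) {
  //       julong b = Pn_base[i] + borrow;
  //       borrow = (b < borrow) || (Pm_base[i] < b);
  //       Pm_base[i] -= b;
  //     }
  //     t0 -= borrow;
  //   }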
11079 void normalize(RegisterOrConstant len) { 11080 block_comment("normalize"); 11081 // while (t0) 11082 // t0 = sub(Pm_base, Pn_base, t0, len); 11083 Label loop, post, again; 11084 Register cnt = t1, i = t2; // Re-use registers; we're done with them now 11085 cbz(t0, post); { 11086 bind(again); { 11087 mov(i, zr); 11088 mov(cnt, len); 11089 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 11090 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 11091 subs(zr, zr, zr); // set carry flag, i.e. no borrow 11092 align(16); 11093 bind(loop); { 11094 sbcs(Rm, Rm, Rn); 11095 str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 11096 add(i, i, 1); 11097 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 11098 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 11099 sub(cnt, cnt, 1); 11100 } cbnz(cnt, loop); 11101 sbc(t0, t0, zr); 11102 } cbnz(t0, again); 11103 } bind(post); 11104 } 11105 11106 // Move memory at s to d, reversing words. 11107 // Increments d to end of copied memory 11108 // Destroys tmp1, tmp2 11109 // Preserves len 11110 // Leaves s pointing to the address which was in d at start 11111 void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) { 11112 assert(tmp1->encoding() < r19->encoding(), "register corruption"); 11113 assert(tmp2->encoding() < r19->encoding(), "register corruption"); 11114 11115 lea(s, Address(s, len, Address::uxtw(LogBytesPerWord))); 11116 mov(tmp1, len); 11117 unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2); 11118 sub(s, d, len, ext::uxtw, LogBytesPerWord); 11119 } 11120 // where 11121 void reverse1(Register d, Register s, Register tmp) { 11122 ldr(tmp, pre(s, -wordSize)); 11123 ror(tmp, tmp, 32); 11124 str(tmp, post(d, wordSize)); 11125 } 11126 11127 void step_squaring() { 11128 // An extra ACC 11129 step(); 11130 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 11131 } 11132 11133 void last_squaring(RegisterOrConstant i) { 11134 Label dont; 11135 // if ((i & 1) == 0) { 11136 tbnz(i.as_register(), 0, dont); { 11137 // MACC(Ra, Rb, t0, t1, t2); 11138 // Ra = *++Pa; 11139 // Rb = *--Pb; 11140 umulh(Rhi_ab, Ra, Rb); 11141 mul(Rlo_ab, Ra, Rb); 11142 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 11143 } bind(dont); 11144 } 11145 11146 void extra_step_squaring() { 11147 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 11148 11149 // MACC(Rm, Rn, t0, t1, t2); 11150 // Rm = *++Pm; 11151 // Rn = *--Pn; 11152 umulh(Rhi_mn, Rm, Rn); 11153 mul(Rlo_mn, Rm, Rn); 11154 ldr(Rm, pre(Pm, wordSize)); 11155 ldr(Rn, pre(Pn, -wordSize)); 11156 } 11157 11158 void post1_squaring() { 11159 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 11160 11161 // *Pm = Rm = t0 * inv; 11162 mul(Rm, t0, inv); 11163 str(Rm, Address(Pm)); 11164 11165 // MACC(Rm, Rn, t0, t1, t2); 11166 // t0 = t1; t1 = t2; t2 = 0; 11167 umulh(Rhi_mn, Rm, Rn); 11168 11169 #ifndef PRODUCT 11170 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 11171 { 11172 mul(Rlo_mn, Rm, Rn); 11173 add(Rlo_mn, t0, Rlo_mn); 11174 Label ok; 11175 cbz(Rlo_mn, ok); { 11176 stop("broken Montgomery multiply"); 11177 } bind(ok); 11178 } 11179 #endif 11180 // We have very carefully set things up so that 11181 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 11182 // the lower half of Rm * Rn because we know the result already: 11183 // it must be -t0. t0 + (-t0) must generate a carry iff 11184 // t0 != 0. So, rather than do a mul and an adds we just set 11185 // the carry flag iff t0 is nonzero. 
11186 // 11187 // mul(Rlo_mn, Rm, Rn); 11188 // adds(zr, t0, Rlo_mn); 11189 subs(zr, t0, 1); // Set carry iff t0 is nonzero 11190 adcs(t0, t1, Rhi_mn); 11191 adc(t1, t2, zr); 11192 mov(t2, zr); 11193 } 11194 11195 void acc(Register Rhi, Register Rlo, 11196 Register t0, Register t1, Register t2) { 11197 adds(t0, t0, Rlo); 11198 adcs(t1, t1, Rhi); 11199 adc(t2, t2, zr); 11200 } 11201 11202 public: 11203 /** 11204 * Fast Montgomery multiplication. The derivation of the 11205 * algorithm is in A Cryptographic Library for the Motorola 11206 * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237. 11207 * 11208 * Arguments: 11209 * 11210 * Inputs for multiplication: 11211 * c_rarg0 - int array elements a 11212 * c_rarg1 - int array elements b 11213 * c_rarg2 - int array elements n (the modulus) 11214 * c_rarg3 - int length 11215 * c_rarg4 - int inv 11216 * c_rarg5 - int array elements m (the result) 11217 * 11218 * Inputs for squaring: 11219 * c_rarg0 - int array elements a 11220 * c_rarg1 - int array elements n (the modulus) 11221 * c_rarg2 - int length 11222 * c_rarg3 - int inv 11223 * c_rarg4 - int array elements m (the result) 11224 * 11225 */ 11226 address generate_multiply() { 11227 Label argh, nothing; 11228 bind(argh); 11229 stop("MontgomeryMultiply total_allocation must be <= 8192"); 11230 11231 align(CodeEntryAlignment); 11232 address entry = pc(); 11233 11234 cbzw(Rlen, nothing); 11235 11236 enter(); 11237 11238 // Make room. 11239 cmpw(Rlen, 512); 11240 br(Assembler::HI, argh); 11241 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 11242 andr(sp, Ra, -2 * wordSize); 11243 11244 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 11245 11246 { 11247 // Copy input args, reversing as we go. We use Ra as a 11248 // temporary variable. 11249 reverse(Ra, Pa_base, Rlen, t0, t1); 11250 if (!_squaring) 11251 reverse(Ra, Pb_base, Rlen, t0, t1); 11252 reverse(Ra, Pn_base, Rlen, t0, t1); 11253 } 11254 11255 // Push all call-saved registers and also Pm_base which we'll need 11256 // at the end. 
11257 save_regs(); 11258 11259 #ifndef PRODUCT 11260 // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply"); 11261 { 11262 ldr(Rn, Address(Pn_base, 0)); 11263 mul(Rlo_mn, Rn, inv); 11264 subs(zr, Rlo_mn, -1); 11265 Label ok; 11266 br(EQ, ok); { 11267 stop("broken inverse in Montgomery multiply"); 11268 } bind(ok); 11269 } 11270 #endif 11271 11272 mov(Pm_base, Ra); 11273 11274 mov(t0, zr); 11275 mov(t1, zr); 11276 mov(t2, zr); 11277 11278 block_comment("for (int i = 0; i < len; i++) {"); 11279 mov(Ri, zr); { 11280 Label loop, end; 11281 cmpw(Ri, Rlen); 11282 br(Assembler::GE, end); 11283 11284 bind(loop); 11285 pre1(Ri); 11286 11287 block_comment(" for (j = i; j; j--) {"); { 11288 movw(Rj, Ri); 11289 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 11290 } block_comment(" } // j"); 11291 11292 post1(); 11293 addw(Ri, Ri, 1); 11294 cmpw(Ri, Rlen); 11295 br(Assembler::LT, loop); 11296 bind(end); 11297 block_comment("} // i"); 11298 } 11299 11300 block_comment("for (int i = len; i < 2*len; i++) {"); 11301 mov(Ri, Rlen); { 11302 Label loop, end; 11303 cmpw(Ri, Rlen, Assembler::LSL, 1); 11304 br(Assembler::GE, end); 11305 11306 bind(loop); 11307 pre2(Ri, Rlen); 11308 11309 block_comment(" for (j = len*2-i-1; j; j--) {"); { 11310 lslw(Rj, Rlen, 1); 11311 subw(Rj, Rj, Ri); 11312 subw(Rj, Rj, 1); 11313 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 11314 } block_comment(" } // j"); 11315 11316 post2(Ri, Rlen); 11317 addw(Ri, Ri, 1); 11318 cmpw(Ri, Rlen, Assembler::LSL, 1); 11319 br(Assembler::LT, loop); 11320 bind(end); 11321 } 11322 block_comment("} // i"); 11323 11324 normalize(Rlen); 11325 11326 mov(Ra, Pm_base); // Save Pm_base in Ra 11327 restore_regs(); // Restore caller's Pm_base 11328 11329 // Copy our result into caller's Pm_base 11330 reverse(Pm_base, Ra, Rlen, t0, t1); 11331 11332 leave(); 11333 bind(nothing); 11334 ret(lr); 11335 11336 return entry; 11337 } 11338 // In C, approximately: 11339 11340 // void 11341 // montgomery_multiply(julong Pa_base[], julong Pb_base[], 11342 // julong Pn_base[], julong Pm_base[], 11343 // julong inv, int len) { 11344 // julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 11345 // julong *Pa, *Pb, *Pn, *Pm; 11346 // julong Ra, Rb, Rn, Rm; 11347 11348 // int i; 11349 11350 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply"); 11351 11352 // for (i = 0; i < len; i++) { 11353 // int j; 11354 11355 // Pa = Pa_base; 11356 // Pb = Pb_base + i; 11357 // Pm = Pm_base; 11358 // Pn = Pn_base + i; 11359 11360 // Ra = *Pa; 11361 // Rb = *Pb; 11362 // Rm = *Pm; 11363 // Rn = *Pn; 11364 11365 // int iters = i; 11366 // for (j = 0; iters--; j++) { 11367 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 11368 // MACC(Ra, Rb, t0, t1, t2); 11369 // Ra = *++Pa; 11370 // Rb = *--Pb; 11371 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 11372 // MACC(Rm, Rn, t0, t1, t2); 11373 // Rm = *++Pm; 11374 // Rn = *--Pn; 11375 // } 11376 11377 // assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be"); 11378 // MACC(Ra, Rb, t0, t1, t2); 11379 // *Pm = Rm = t0 * inv; 11380 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be"); 11381 // MACC(Rm, Rn, t0, t1, t2); 11382 11383 // assert(t0 == 0, "broken Montgomery multiply"); 11384 11385 // t0 = t1; t1 = t2; t2 = 0; 11386 // } 11387 11388 // for (i = len; i < 2*len; i++) { 11389 // int j; 11390 11391 // Pa = Pa_base + i-len; 11392 // Pb = Pb_base + len; 11393 // Pm = Pm_base + i-len; 11394 // Pn = Pn_base + len; 11395 11396 // Ra = *++Pa; 11397 // Rb = 
*--Pb; 11398 // Rm = *++Pm; 11399 // Rn = *--Pn; 11400 11401 // int iters = len*2-i-1; 11402 // for (j = i-len+1; iters--; j++) { 11403 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 11404 // MACC(Ra, Rb, t0, t1, t2); 11405 // Ra = *++Pa; 11406 // Rb = *--Pb; 11407 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 11408 // MACC(Rm, Rn, t0, t1, t2); 11409 // Rm = *++Pm; 11410 // Rn = *--Pn; 11411 // } 11412 11413 // Pm_base[i-len] = t0; 11414 // t0 = t1; t1 = t2; t2 = 0; 11415 // } 11416 11417 // while (t0) 11418 // t0 = sub(Pm_base, Pn_base, t0, len); 11419 // } 11420 11421 /** 11422 * Fast Montgomery squaring. This uses asymptotically 25% fewer 11423 * multiplies than Montgomery multiplication so it should be up to 11424 * 25% faster. However, its loop control is more complex and it 11425 * may actually run slower on some machines. 11426 * 11427 * Arguments: 11428 * 11429 * Inputs: 11430 * c_rarg0 - int array elements a 11431 * c_rarg1 - int array elements n (the modulus) 11432 * c_rarg2 - int length 11433 * c_rarg3 - int inv 11434 * c_rarg4 - int array elements m (the result) 11435 * 11436 */ 11437 address generate_square() { 11438 Label argh; 11439 bind(argh); 11440 stop("MontgomeryMultiply total_allocation must be <= 8192"); 11441 11442 align(CodeEntryAlignment); 11443 address entry = pc(); 11444 11445 enter(); 11446 11447 // Make room. 11448 cmpw(Rlen, 512); 11449 br(Assembler::HI, argh); 11450 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 11451 andr(sp, Ra, -2 * wordSize); 11452 11453 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 11454 11455 { 11456 // Copy input args, reversing as we go. We use Ra as a 11457 // temporary variable. 11458 reverse(Ra, Pa_base, Rlen, t0, t1); 11459 reverse(Ra, Pn_base, Rlen, t0, t1); 11460 } 11461 11462 // Push all call-saved registers and also Pm_base which we'll need 11463 // at the end. 
11464 save_regs(); 11465 11466 mov(Pm_base, Ra); 11467 11468 mov(t0, zr); 11469 mov(t1, zr); 11470 mov(t2, zr); 11471 11472 block_comment("for (int i = 0; i < len; i++) {"); 11473 mov(Ri, zr); { 11474 Label loop, end; 11475 bind(loop); 11476 cmp(Ri, Rlen); 11477 br(Assembler::GE, end); 11478 11479 pre1(Ri); 11480 11481 block_comment("for (j = (i+1)/2; j; j--) {"); { 11482 add(Rj, Ri, 1); 11483 lsr(Rj, Rj, 1); 11484 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 11485 } block_comment(" } // j"); 11486 11487 last_squaring(Ri); 11488 11489 block_comment(" for (j = i/2; j; j--) {"); { 11490 lsr(Rj, Ri, 1); 11491 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 11492 } block_comment(" } // j"); 11493 11494 post1_squaring(); 11495 add(Ri, Ri, 1); 11496 cmp(Ri, Rlen); 11497 br(Assembler::LT, loop); 11498 11499 bind(end); 11500 block_comment("} // i"); 11501 } 11502 11503 block_comment("for (int i = len; i < 2*len; i++) {"); 11504 mov(Ri, Rlen); { 11505 Label loop, end; 11506 bind(loop); 11507 cmp(Ri, Rlen, Assembler::LSL, 1); 11508 br(Assembler::GE, end); 11509 11510 pre2(Ri, Rlen); 11511 11512 block_comment(" for (j = (2*len-i-1)/2; j; j--) {"); { 11513 lsl(Rj, Rlen, 1); 11514 sub(Rj, Rj, Ri); 11515 sub(Rj, Rj, 1); 11516 lsr(Rj, Rj, 1); 11517 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 11518 } block_comment(" } // j"); 11519 11520 last_squaring(Ri); 11521 11522 block_comment(" for (j = (2*len-i)/2; j; j--) {"); { 11523 lsl(Rj, Rlen, 1); 11524 sub(Rj, Rj, Ri); 11525 lsr(Rj, Rj, 1); 11526 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 11527 } block_comment(" } // j"); 11528 11529 post2(Ri, Rlen); 11530 add(Ri, Ri, 1); 11531 cmp(Ri, Rlen, Assembler::LSL, 1); 11532 11533 br(Assembler::LT, loop); 11534 bind(end); 11535 block_comment("} // i"); 11536 } 11537 11538 normalize(Rlen); 11539 11540 mov(Ra, Pm_base); // Save Pm_base in Ra 11541 restore_regs(); // Restore caller's Pm_base 11542 11543 // Copy our result into caller's Pm_base 11544 reverse(Pm_base, Ra, Rlen, t0, t1); 11545 11546 leave(); 11547 ret(lr); 11548 11549 return entry; 11550 } 11551 // In C, approximately: 11552 11553 // void 11554 // montgomery_square(julong Pa_base[], julong Pn_base[], 11555 // julong Pm_base[], julong inv, int len) { 11556 // julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 11557 // julong *Pa, *Pb, *Pn, *Pm; 11558 // julong Ra, Rb, Rn, Rm; 11559 11560 // int i; 11561 11562 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply"); 11563 11564 // for (i = 0; i < len; i++) { 11565 // int j; 11566 11567 // Pa = Pa_base; 11568 // Pb = Pa_base + i; 11569 // Pm = Pm_base; 11570 // Pn = Pn_base + i; 11571 11572 // Ra = *Pa; 11573 // Rb = *Pb; 11574 // Rm = *Pm; 11575 // Rn = *Pn; 11576 11577 // int iters = (i+1)/2; 11578 // for (j = 0; iters--; j++) { 11579 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be"); 11580 // MACC2(Ra, Rb, t0, t1, t2); 11581 // Ra = *++Pa; 11582 // Rb = *--Pb; 11583 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 11584 // MACC(Rm, Rn, t0, t1, t2); 11585 // Rm = *++Pm; 11586 // Rn = *--Pn; 11587 // } 11588 // if ((i & 1) == 0) { 11589 // assert(Ra == Pa_base[j], "must be"); 11590 // MACC(Ra, Ra, t0, t1, t2); 11591 // } 11592 // iters = i/2; 11593 // assert(iters == i-j, "must be"); 11594 // for (; iters--; j++) { 11595 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 11596 // MACC(Rm, Rn, t0, t1, t2); 11597 // Rm = *++Pm; 11598 // Rn = *--Pn; 11599 // } 11600 11601 // 
*Pm = Rm = t0 * inv;
11602 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
11603 // MACC(Rm, Rn, t0, t1, t2);
11604
11605 // assert(t0 == 0, "broken Montgomery multiply");
11606
11607 // t0 = t1; t1 = t2; t2 = 0;
11608 // }
11609
11610 // for (i = len; i < 2*len; i++) {
11611 // int start = i-len+1;
11612 // int end = start + (len - start)/2;
11613 // int j;
11614
11615 // Pa = Pa_base + i-len;
11616 // Pb = Pa_base + len;
11617 // Pm = Pm_base + i-len;
11618 // Pn = Pn_base + len;
11619
11620 // Ra = *++Pa;
11621 // Rb = *--Pb;
11622 // Rm = *++Pm;
11623 // Rn = *--Pn;
11624
11625 // int iters = (2*len-i-1)/2;
11626 // assert(iters == end-start, "must be");
11627 // for (j = start; iters--; j++) {
11628 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
11629 // MACC2(Ra, Rb, t0, t1, t2);
11630 // Ra = *++Pa;
11631 // Rb = *--Pb;
11632 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11633 // MACC(Rm, Rn, t0, t1, t2);
11634 // Rm = *++Pm;
11635 // Rn = *--Pn;
11636 // }
11637 // if ((i & 1) == 0) {
11638 // assert(Ra == Pa_base[j], "must be");
11639 // MACC(Ra, Ra, t0, t1, t2);
11640 // }
11641 // iters = (2*len-i)/2;
11642 // assert(iters == len-j, "must be");
11643 // for (; iters--; j++) {
11644 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11645 // MACC(Rm, Rn, t0, t1, t2);
11646 // Rm = *++Pm;
11647 // Rn = *--Pn;
11648 // }
11649 // Pm_base[i-len] = t0;
11650 // t0 = t1; t1 = t2; t2 = 0;
11651 // }
11652
11653 // while (t0)
11654 // t0 = sub(Pm_base, Pn_base, t0, len);
11655 // }
11656 };
11657
11658 // Initialization
11659 void generate_preuniverse_stubs() {
11660 // preuniverse stubs are not needed for aarch64
11661 }
11662
11663 void generate_initial_stubs() {
11664 // Generate initial stubs and initialize the entry points
11665
11666 // entry points that exist in all platforms. Note: This is code
11667 // that could be shared among different platforms - however the
11668 // benefit seems to be smaller than the disadvantage of having a
11669 // much more complicated generator structure. See also comment in
11670 // stubRoutines.hpp.
11671
11672 StubRoutines::_forward_exception_entry = generate_forward_exception();
11673
11674 StubRoutines::_call_stub_entry =
11675 generate_call_stub(StubRoutines::_call_stub_return_address);
11676
11677 // is referenced by megamorphic call
11678 StubRoutines::_catch_exception_entry = generate_catch_exception();
11679
11680 // Initialize table for copy memory (arraycopy) check.
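// (Sketch only, not the actual UnsafeMemoryAccess layout; the type and
// field names below are illustrative assumptions.) Conceptually each table
// slot describes the code range of one unsafe copy/set stub plus a
// continuation point, so a memory fault raised inside that range can be
// resumed at the stub's error exit instead of crashing the VM:
//
//   struct UnsafeMemoryAccessEntry {  // hypothetical
//     address start_pc;       // first instruction of the guarded region
//     address end_pc;         // first instruction past the guarded region
//     address error_exit_pc;  // where execution continues after a fault
//   };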
11681 if (UnsafeMemoryAccess::_table == nullptr) {
11682 UnsafeMemoryAccess::create_table(8 + 4); // 8 for copyMemory; 4 for setMemory
11683 }
11684
11685 if (UseCRC32Intrinsics) {
11686 StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
11687 }
11688
11689 if (UseCRC32CIntrinsics) {
11690 StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
11691 }
11692
11693 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
11694 StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
11695 }
11696
11697 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
11698 StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
11699 }
11700
11701 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_float16ToFloat) &&
11702 vmIntrinsics::is_intrinsic_available(vmIntrinsics::_floatToFloat16)) {
11703 StubRoutines::_hf2f = generate_float16ToFloat();
11704 StubRoutines::_f2hf = generate_floatToFloat16();
11705 }
11706 }
11707
11708 void generate_continuation_stubs() {
11709 // Continuation stubs:
11710 StubRoutines::_cont_thaw = generate_cont_thaw();
11711 StubRoutines::_cont_returnBarrier = generate_cont_returnBarrier();
11712 StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception();
11713 StubRoutines::_cont_preempt_stub = generate_cont_preempt_stub();
11714 }
11715
11716 void generate_final_stubs() {
11717 // support for verify_oop (must happen after universe_init)
11718 if (VerifyOops) {
11719 StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
11720 }
11721
11722 // arraycopy stubs used by compilers
11723 generate_arraycopy_stubs();
11724
11725 StubRoutines::_method_entry_barrier = generate_method_entry_barrier();
11726
11727 StubRoutines::aarch64::_spin_wait = generate_spin_wait();
11728
11729 StubRoutines::_upcall_stub_exception_handler = generate_upcall_stub_exception_handler();
11730 StubRoutines::_upcall_stub_load_target = generate_upcall_stub_load_target();
11731
11732 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
11733
11734 generate_atomic_entry_points();
11735
11736 #endif // LINUX
11737
11738 #ifdef COMPILER2
11739 if (UseSecondarySupersTable) {
11740 StubRoutines::_lookup_secondary_supers_table_slow_path_stub = generate_lookup_secondary_supers_table_slow_path_stub();
11741 if (! InlineSecondarySupersTest) {
11742 generate_lookup_secondary_supers_table_stub();
11743 }
11744 }
11745 #endif
11746
11747 StubRoutines::_unsafe_setmemory = generate_unsafe_setmemory();
11748
11749 StubRoutines::aarch64::set_completed(); // Indicate that arraycopy and zero_blocks stubs are generated
11750 }
11751
11752 void generate_compiler_stubs() {
11753 #if COMPILER2_OR_JVMCI
11754
11755 if (UseSVE == 0) {
11756 StubRoutines::aarch64::_vector_iota_indices = generate_iota_indices(StubId::stubgen_vector_iota_indices_id);
11757 }
11758
11759 // array equals stub for large arrays.
11760 if (!UseSimpleArrayEquals) {
11761 StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
11762 }
11763
11764 // arrays_hashcode stub for large arrays.
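// In C, approximately, the scalar loop these stubs accelerate (a sketch:
// the element type T, the initial value and the length come from the
// intrinsic caller, and the names here are illustrative):
//
//   int array_hashcode(int result, T a[], int len) {
//     for (int i = 0; i < len; i++)
//       result = 31 * result + (int)a[i];
//     return result;
//   }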
11765 StubRoutines::aarch64::_large_arrays_hashcode_boolean = generate_large_arrays_hashcode(T_BOOLEAN); 11766 StubRoutines::aarch64::_large_arrays_hashcode_byte = generate_large_arrays_hashcode(T_BYTE); 11767 StubRoutines::aarch64::_large_arrays_hashcode_char = generate_large_arrays_hashcode(T_CHAR); 11768 StubRoutines::aarch64::_large_arrays_hashcode_int = generate_large_arrays_hashcode(T_INT); 11769 StubRoutines::aarch64::_large_arrays_hashcode_short = generate_large_arrays_hashcode(T_SHORT); 11770 11771 // byte_array_inflate stub for large arrays. 11772 StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate(); 11773 11774 // countPositives stub for large arrays. 11775 StubRoutines::aarch64::_count_positives = generate_count_positives(StubRoutines::aarch64::_count_positives_long); 11776 11777 generate_compare_long_strings(); 11778 11779 generate_string_indexof_stubs(); 11780 11781 #ifdef COMPILER2 11782 if (UseMultiplyToLenIntrinsic) { 11783 StubRoutines::_multiplyToLen = generate_multiplyToLen(); 11784 } 11785 11786 if (UseSquareToLenIntrinsic) { 11787 StubRoutines::_squareToLen = generate_squareToLen(); 11788 } 11789 11790 if (UseMulAddIntrinsic) { 11791 StubRoutines::_mulAdd = generate_mulAdd(); 11792 } 11793 11794 if (UseSIMDForBigIntegerShiftIntrinsics) { 11795 StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift(); 11796 StubRoutines::_bigIntegerLeftShiftWorker = generate_bigIntegerLeftShift(); 11797 } 11798 11799 if (UseMontgomeryMultiplyIntrinsic) { 11800 StubId stub_id = StubId::stubgen_montgomeryMultiply_id; 11801 StubCodeMark mark(this, stub_id); 11802 MontgomeryMultiplyGenerator g(_masm, /*squaring*/false); 11803 StubRoutines::_montgomeryMultiply = g.generate_multiply(); 11804 } 11805 11806 if (UseMontgomerySquareIntrinsic) { 11807 StubId stub_id = StubId::stubgen_montgomerySquare_id; 11808 StubCodeMark mark(this, stub_id); 11809 MontgomeryMultiplyGenerator g(_masm, /*squaring*/true); 11810 // We use generate_multiply() rather than generate_square() 11811 // because it's faster for the sizes of modulus we care about. 
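// In C terms (sketch): a Montgomery square is just the multiply above with
// both operands equal, i.e.
//   montgomery_square(a, n, m, inv, len)
//     == montgomery_multiply(a, a, n, m, inv, len)
// which is why the generic multiply path can be reused here.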
11812 StubRoutines::_montgomerySquare = g.generate_multiply(); 11813 } 11814 11815 #endif // COMPILER2 11816 11817 if (UseChaCha20Intrinsics) { 11818 StubRoutines::_chacha20Block = generate_chacha20Block_blockpar(); 11819 } 11820 11821 if (UseKyberIntrinsics) { 11822 StubRoutines::_kyberNtt = generate_kyberNtt(); 11823 StubRoutines::_kyberInverseNtt = generate_kyberInverseNtt(); 11824 StubRoutines::_kyberNttMult = generate_kyberNttMult(); 11825 StubRoutines::_kyberAddPoly_2 = generate_kyberAddPoly_2(); 11826 StubRoutines::_kyberAddPoly_3 = generate_kyberAddPoly_3(); 11827 StubRoutines::_kyber12To16 = generate_kyber12To16(); 11828 StubRoutines::_kyberBarrettReduce = generate_kyberBarrettReduce(); 11829 } 11830 11831 if (UseDilithiumIntrinsics) { 11832 StubRoutines::_dilithiumAlmostNtt = generate_dilithiumAlmostNtt(); 11833 StubRoutines::_dilithiumAlmostInverseNtt = generate_dilithiumAlmostInverseNtt(); 11834 StubRoutines::_dilithiumNttMult = generate_dilithiumNttMult(); 11835 StubRoutines::_dilithiumMontMulByConstant = generate_dilithiumMontMulByConstant(); 11836 StubRoutines::_dilithiumDecomposePoly = generate_dilithiumDecomposePoly(); 11837 } 11838 11839 if (UseBASE64Intrinsics) { 11840 StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock(); 11841 StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock(); 11842 } 11843 11844 // data cache line writeback 11845 StubRoutines::_data_cache_writeback = generate_data_cache_writeback(); 11846 StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync(); 11847 11848 if (UseAESIntrinsics) { 11849 StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock(); 11850 StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock(); 11851 StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt(); 11852 StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt(); 11853 StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt(); 11854 } 11855 if (UseGHASHIntrinsics) { 11856 // StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks(); 11857 StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks_wide(); 11858 } 11859 if (UseAESIntrinsics && UseGHASHIntrinsics) { 11860 StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt(); 11861 } 11862 11863 if (UseMD5Intrinsics) { 11864 StubRoutines::_md5_implCompress = generate_md5_implCompress(StubId::stubgen_md5_implCompress_id); 11865 StubRoutines::_md5_implCompressMB = generate_md5_implCompress(StubId::stubgen_md5_implCompressMB_id); 11866 } 11867 if (UseSHA1Intrinsics) { 11868 StubRoutines::_sha1_implCompress = generate_sha1_implCompress(StubId::stubgen_sha1_implCompress_id); 11869 StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(StubId::stubgen_sha1_implCompressMB_id); 11870 } 11871 if (UseSHA256Intrinsics) { 11872 StubRoutines::_sha256_implCompress = generate_sha256_implCompress(StubId::stubgen_sha256_implCompress_id); 11873 StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(StubId::stubgen_sha256_implCompressMB_id); 11874 } 11875 if (UseSHA512Intrinsics) { 11876 StubRoutines::_sha512_implCompress = generate_sha512_implCompress(StubId::stubgen_sha512_implCompress_id); 11877 StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(StubId::stubgen_sha512_implCompressMB_id); 11878 } 11879 if (UseSHA3Intrinsics) { 11880 11881 StubRoutines::_double_keccak = generate_double_keccak(); 
11882 if (UseSIMDForSHA3Intrinsic) { 11883 StubRoutines::_sha3_implCompress = generate_sha3_implCompress(StubId::stubgen_sha3_implCompress_id); 11884 StubRoutines::_sha3_implCompressMB = generate_sha3_implCompress(StubId::stubgen_sha3_implCompressMB_id); 11885 } else { 11886 StubRoutines::_sha3_implCompress = generate_sha3_implCompress_gpr(StubId::stubgen_sha3_implCompress_id); 11887 StubRoutines::_sha3_implCompressMB = generate_sha3_implCompress_gpr(StubId::stubgen_sha3_implCompressMB_id); 11888 } 11889 } 11890 11891 if (UsePoly1305Intrinsics) { 11892 StubRoutines::_poly1305_processBlocks = generate_poly1305_processBlocks(); 11893 } 11894 11895 // generate Adler32 intrinsics code 11896 if (UseAdler32Intrinsics) { 11897 StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32(); 11898 } 11899 11900 #endif // COMPILER2_OR_JVMCI 11901 } 11902 11903 public: 11904 StubGenerator(CodeBuffer* code, BlobId blob_id) : StubCodeGenerator(code, blob_id) { 11905 switch(blob_id) { 11906 case BlobId::stubgen_preuniverse_id: 11907 generate_preuniverse_stubs(); 11908 break; 11909 case BlobId::stubgen_initial_id: 11910 generate_initial_stubs(); 11911 break; 11912 case BlobId::stubgen_continuation_id: 11913 generate_continuation_stubs(); 11914 break; 11915 case BlobId::stubgen_compiler_id: 11916 generate_compiler_stubs(); 11917 break; 11918 case BlobId::stubgen_final_id: 11919 generate_final_stubs(); 11920 break; 11921 default: 11922 fatal("unexpected blob id: %s", StubInfo::name(blob_id)); 11923 break; 11924 }; 11925 } 11926 }; // end class declaration 11927 11928 void StubGenerator_generate(CodeBuffer* code, BlobId blob_id) { 11929 StubGenerator g(code, blob_id); 11930 } 11931 11932 11933 #if defined (LINUX) 11934 11935 // Define pointers to atomic stubs and initialize them to point to the 11936 // code in atomic_aarch64.S. 11937 11938 #define DEFAULT_ATOMIC_OP(OPNAME, SIZE, RELAXED) \ 11939 extern "C" uint64_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl \ 11940 (volatile void *ptr, uint64_t arg1, uint64_t arg2); \ 11941 aarch64_atomic_stub_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _impl \ 11942 = aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl; 11943 11944 DEFAULT_ATOMIC_OP(fetch_add, 4, ) 11945 DEFAULT_ATOMIC_OP(fetch_add, 8, ) 11946 DEFAULT_ATOMIC_OP(fetch_add, 4, _relaxed) 11947 DEFAULT_ATOMIC_OP(fetch_add, 8, _relaxed) 11948 DEFAULT_ATOMIC_OP(xchg, 4, ) 11949 DEFAULT_ATOMIC_OP(xchg, 8, ) 11950 DEFAULT_ATOMIC_OP(cmpxchg, 1, ) 11951 DEFAULT_ATOMIC_OP(cmpxchg, 4, ) 11952 DEFAULT_ATOMIC_OP(cmpxchg, 8, ) 11953 DEFAULT_ATOMIC_OP(cmpxchg, 1, _relaxed) 11954 DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed) 11955 DEFAULT_ATOMIC_OP(cmpxchg, 8, _relaxed) 11956 DEFAULT_ATOMIC_OP(cmpxchg, 4, _release) 11957 DEFAULT_ATOMIC_OP(cmpxchg, 8, _release) 11958 DEFAULT_ATOMIC_OP(cmpxchg, 4, _seq_cst) 11959 DEFAULT_ATOMIC_OP(cmpxchg, 8, _seq_cst) 11960 11961 #undef DEFAULT_ATOMIC_OP 11962 11963 #endif // LINUX
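// Usage sketch (illustrative only; the real call sites live in the platform
// Atomic support code, and the variable names below are assumptions): each
// aarch64_atomic_<op>_<size><order>_impl above is a plain function pointer of
// type aarch64_atomic_stub_t, bound here to the hand-written default in
// atomic_aarch64.S and expected to be repointed to the code emitted by
// generate_atomic_entry_points(). Callers invoke it indirectly and get back
// the previous memory value, approximately:
//
//   uint64_t observed =
//       aarch64_atomic_cmpxchg_4_impl(dest, compare_value, exchange_value);
//   bool swapped = (observed == compare_value);  // CAS succeeded iff true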