1 /* 2 * Copyright (c) 2003, 2025, Oracle and/or its affiliates. All rights reserved. 3 * Copyright (c) 2014, 2025, Red Hat Inc. All rights reserved. 4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 5 * 6 * This code is free software; you can redistribute it and/or modify it 7 * under the terms of the GNU General Public License version 2 only, as 8 * published by the Free Software Foundation. 9 * 10 * This code is distributed in the hope that it will be useful, but WITHOUT 11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 13 * version 2 for more details (a copy is included in the LICENSE file that 14 * accompanied this code). 15 * 16 * You should have received a copy of the GNU General Public License version 17 * 2 along with this work; if not, write to the Free Software Foundation, 18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 19 * 20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 21 * or visit www.oracle.com if you need additional information or have any 22 * questions. 23 * 24 */ 25 26 #include "asm/macroAssembler.hpp" 27 #include "asm/macroAssembler.inline.hpp" 28 #include "asm/register.hpp" 29 #include "atomic_aarch64.hpp" 30 #include "code/aotCodeCache.hpp" 31 #include "compiler/oopMap.hpp" 32 #include "gc/shared/barrierSet.hpp" 33 #include "gc/shared/barrierSetAssembler.hpp" 34 #include "gc/shared/gc_globals.hpp" 35 #include "gc/shared/tlab_globals.hpp" 36 #include "interpreter/interpreter.hpp" 37 #include "memory/universe.hpp" 38 #include "nativeInst_aarch64.hpp" 39 #include "oops/instanceOop.hpp" 40 #include "oops/method.hpp" 41 #include "oops/objArrayKlass.hpp" 42 #include "oops/oop.inline.hpp" 43 #include "prims/methodHandles.hpp" 44 #include "prims/upcallLinker.hpp" 45 #include "runtime/arguments.hpp" 46 #include "runtime/atomic.hpp" 47 #include "runtime/continuation.hpp" 48 #include "runtime/continuationEntry.inline.hpp" 49 #include "runtime/frame.inline.hpp" 50 #include "runtime/handles.inline.hpp" 51 #include "runtime/javaThread.hpp" 52 #include "runtime/sharedRuntime.hpp" 53 #include "runtime/stubCodeGenerator.hpp" 54 #include "runtime/stubRoutines.hpp" 55 #include "utilities/align.hpp" 56 #include "utilities/checkedCast.hpp" 57 #include "utilities/debug.hpp" 58 #include "utilities/globalDefinitions.hpp" 59 #include "utilities/intpow.hpp" 60 #include "utilities/powerOfTwo.hpp" 61 #ifdef COMPILER2 62 #include "opto/runtime.hpp" 63 #endif 64 #if INCLUDE_ZGC 65 #include "gc/z/zThreadLocalData.hpp" 66 #endif 67 68 // Declaration and definition of StubGenerator (no .hpp file). 
69 // For a more detailed description of the stub routine structure 70 // see the comment in stubRoutines.hpp 71 72 #undef __ 73 #define __ _masm-> 74 75 #ifdef PRODUCT 76 #define BLOCK_COMMENT(str) /* nothing */ 77 #else 78 #define BLOCK_COMMENT(str) __ block_comment(str) 79 #endif 80 81 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":") 82 83 // Stub Code definitions 84 85 class StubGenerator: public StubCodeGenerator { 86 private: 87 88 #ifdef PRODUCT 89 #define inc_counter_np(counter) ((void)0) 90 #else 91 void inc_counter_np_(uint& counter) { 92 __ incrementw(ExternalAddress((address)&counter)); 93 } 94 #define inc_counter_np(counter) \ 95 BLOCK_COMMENT("inc_counter " #counter); \ 96 inc_counter_np_(counter); 97 #endif 98 99 // Call stubs are used to call Java from C 100 // 101 // Arguments: 102 // c_rarg0: call wrapper address address 103 // c_rarg1: result address 104 // c_rarg2: result type BasicType 105 // c_rarg3: method Method* 106 // c_rarg4: (interpreter) entry point address 107 // c_rarg5: parameters intptr_t* 108 // c_rarg6: parameter size (in words) int 109 // c_rarg7: thread Thread* 110 // 111 // There is no return from the stub itself as any Java result 112 // is written to result 113 // 114 // we save r30 (lr) as the return PC at the base of the frame and 115 // link r29 (fp) below it as the frame pointer installing sp (r31) 116 // into fp. 117 // 118 // we save r0-r7, which accounts for all the c arguments. 119 // 120 // TODO: strictly do we need to save them all? they are treated as 121 // volatile by C so could we omit saving the ones we are going to 122 // place in global registers (thread? method?) or those we only use 123 // during setup of the Java call? 124 // 125 // we don't need to save r8 which C uses as an indirect result location 126 // return register. 127 // 128 // we don't need to save r9-r15 which both C and Java treat as 129 // volatile 130 // 131 // we don't need to save r16-18 because Java does not use them 132 // 133 // we save r19-r28 which Java uses as scratch registers and C 134 // expects to be callee-save 135 // 136 // we save the bottom 64 bits of each value stored in v8-v15; it is 137 // the responsibility of the caller to preserve larger values. 138 // 139 // so the stub frame looks like this when we enter Java code 140 // 141 // [ return_from_Java ] <--- sp 142 // [ argument word n ] 143 // ... 
144 // -29 [ argument word 1 ] 145 // -28 [ saved Floating-point Control Register ] 146 // -26 [ saved v15 ] <--- sp_after_call 147 // -25 [ saved v14 ] 148 // -24 [ saved v13 ] 149 // -23 [ saved v12 ] 150 // -22 [ saved v11 ] 151 // -21 [ saved v10 ] 152 // -20 [ saved v9 ] 153 // -19 [ saved v8 ] 154 // -18 [ saved r28 ] 155 // -17 [ saved r27 ] 156 // -16 [ saved r26 ] 157 // -15 [ saved r25 ] 158 // -14 [ saved r24 ] 159 // -13 [ saved r23 ] 160 // -12 [ saved r22 ] 161 // -11 [ saved r21 ] 162 // -10 [ saved r20 ] 163 // -9 [ saved r19 ] 164 // -8 [ call wrapper (r0) ] 165 // -7 [ result (r1) ] 166 // -6 [ result type (r2) ] 167 // -5 [ method (r3) ] 168 // -4 [ entry point (r4) ] 169 // -3 [ parameters (r5) ] 170 // -2 [ parameter size (r6) ] 171 // -1 [ thread (r7) ] 172 // 0 [ saved fp (r29) ] <--- fp == saved sp (r31) 173 // 1 [ saved lr (r30) ] 174 175 // Call stub stack layout word offsets from fp 176 enum call_stub_layout { 177 sp_after_call_off = -28, 178 179 fpcr_off = sp_after_call_off, 180 d15_off = -26, 181 d13_off = -24, 182 d11_off = -22, 183 d9_off = -20, 184 185 r28_off = -18, 186 r26_off = -16, 187 r24_off = -14, 188 r22_off = -12, 189 r20_off = -10, 190 call_wrapper_off = -8, 191 result_off = -7, 192 result_type_off = -6, 193 method_off = -5, 194 entry_point_off = -4, 195 parameter_size_off = -2, 196 thread_off = -1, 197 fp_f = 0, 198 retaddr_off = 1, 199 }; 200 201 address generate_call_stub(address& return_address) { 202 assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 && 203 (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off, 204 "adjust this code"); 205 206 StubGenStubId stub_id = StubGenStubId::call_stub_id; 207 StubCodeMark mark(this, stub_id); 208 address start = __ pc(); 209 210 const Address sp_after_call (rfp, sp_after_call_off * wordSize); 211 212 const Address fpcr_save (rfp, fpcr_off * wordSize); 213 const Address call_wrapper (rfp, call_wrapper_off * wordSize); 214 const Address result (rfp, result_off * wordSize); 215 const Address result_type (rfp, result_type_off * wordSize); 216 const Address method (rfp, method_off * wordSize); 217 const Address entry_point (rfp, entry_point_off * wordSize); 218 const Address parameter_size(rfp, parameter_size_off * wordSize); 219 220 const Address thread (rfp, thread_off * wordSize); 221 222 const Address d15_save (rfp, d15_off * wordSize); 223 const Address d13_save (rfp, d13_off * wordSize); 224 const Address d11_save (rfp, d11_off * wordSize); 225 const Address d9_save (rfp, d9_off * wordSize); 226 227 const Address r28_save (rfp, r28_off * wordSize); 228 const Address r26_save (rfp, r26_off * wordSize); 229 const Address r24_save (rfp, r24_off * wordSize); 230 const Address r22_save (rfp, r22_off * wordSize); 231 const Address r20_save (rfp, r20_off * wordSize); 232 233 // stub code 234 235 address aarch64_entry = __ pc(); 236 237 // set up frame and move sp to end of save area 238 __ enter(); 239 __ sub(sp, rfp, -sp_after_call_off * wordSize); 240 241 // save register parameters and Java scratch/global registers 242 // n.b. 
we save thread even though it gets installed in 243 // rthread because we want to sanity check rthread later 244 __ str(c_rarg7, thread); 245 __ strw(c_rarg6, parameter_size); 246 __ stp(c_rarg4, c_rarg5, entry_point); 247 __ stp(c_rarg2, c_rarg3, result_type); 248 __ stp(c_rarg0, c_rarg1, call_wrapper); 249 250 __ stp(r20, r19, r20_save); 251 __ stp(r22, r21, r22_save); 252 __ stp(r24, r23, r24_save); 253 __ stp(r26, r25, r26_save); 254 __ stp(r28, r27, r28_save); 255 256 __ stpd(v9, v8, d9_save); 257 __ stpd(v11, v10, d11_save); 258 __ stpd(v13, v12, d13_save); 259 __ stpd(v15, v14, d15_save); 260 261 __ get_fpcr(rscratch1); 262 __ str(rscratch1, fpcr_save); 263 // Set FPCR to the state we need. We do want Round to Nearest. We 264 // don't want non-IEEE rounding modes or floating-point traps. 265 __ bfi(rscratch1, zr, 22, 4); // Clear DN, FZ, and Rmode 266 __ bfi(rscratch1, zr, 8, 5); // Clear exception-control bits (8-12) 267 __ set_fpcr(rscratch1); 268 269 // install Java thread in global register now we have saved 270 // whatever value it held 271 __ mov(rthread, c_rarg7); 272 // And method 273 __ mov(rmethod, c_rarg3); 274 275 // set up the heapbase register 276 __ reinit_heapbase(); 277 278 #ifdef ASSERT 279 // make sure we have no pending exceptions 280 { 281 Label L; 282 __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset()))); 283 __ cmp(rscratch1, (u1)NULL_WORD); 284 __ br(Assembler::EQ, L); 285 __ stop("StubRoutines::call_stub: entered with pending exception"); 286 __ BIND(L); 287 } 288 #endif 289 // pass parameters if any 290 __ mov(esp, sp); 291 __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way 292 __ andr(sp, rscratch1, -2 * wordSize); 293 294 BLOCK_COMMENT("pass parameters if any"); 295 Label parameters_done; 296 // parameter count is still in c_rarg6 297 // and parameter pointer identifying param 1 is in c_rarg5 298 __ cbzw(c_rarg6, parameters_done); 299 300 address loop = __ pc(); 301 __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize))); 302 __ subsw(c_rarg6, c_rarg6, 1); 303 __ push(rscratch1); 304 __ br(Assembler::GT, loop); 305 306 __ BIND(parameters_done); 307 308 // call Java entry -- passing methdoOop, and current sp 309 // rmethod: Method* 310 // r19_sender_sp: sender sp 311 BLOCK_COMMENT("call Java function"); 312 __ mov(r19_sender_sp, sp); 313 __ blr(c_rarg4); 314 315 // we do this here because the notify will already have been done 316 // if we get to the next instruction via an exception 317 // 318 // n.b. adding this instruction here affects the calculation of 319 // whether or not a routine returns to the call stub (used when 320 // doing stack walks) since the normal test is to check the return 321 // pc against the address saved below. so we may need to allow for 322 // this extra instruction in the check. 323 324 // save current address for use by exception handling code 325 326 return_address = __ pc(); 327 328 // store result depending on type (everything that is not 329 // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT) 330 // n.b. 
this assumes Java returns an integral result in r0 331 // and a floating result in j_farg0 332 __ ldr(j_rarg2, result); 333 Label is_long, is_float, is_double, exit; 334 __ ldr(j_rarg1, result_type); 335 __ cmp(j_rarg1, (u1)T_OBJECT); 336 __ br(Assembler::EQ, is_long); 337 __ cmp(j_rarg1, (u1)T_LONG); 338 __ br(Assembler::EQ, is_long); 339 __ cmp(j_rarg1, (u1)T_FLOAT); 340 __ br(Assembler::EQ, is_float); 341 __ cmp(j_rarg1, (u1)T_DOUBLE); 342 __ br(Assembler::EQ, is_double); 343 344 // handle T_INT case 345 __ strw(r0, Address(j_rarg2)); 346 347 __ BIND(exit); 348 349 // pop parameters 350 __ sub(esp, rfp, -sp_after_call_off * wordSize); 351 352 #ifdef ASSERT 353 // verify that threads correspond 354 { 355 Label L, S; 356 __ ldr(rscratch1, thread); 357 __ cmp(rthread, rscratch1); 358 __ br(Assembler::NE, S); 359 __ get_thread(rscratch1); 360 __ cmp(rthread, rscratch1); 361 __ br(Assembler::EQ, L); 362 __ BIND(S); 363 __ stop("StubRoutines::call_stub: threads must correspond"); 364 __ BIND(L); 365 } 366 #endif 367 368 __ pop_cont_fastpath(rthread); 369 370 // restore callee-save registers 371 __ ldpd(v15, v14, d15_save); 372 __ ldpd(v13, v12, d13_save); 373 __ ldpd(v11, v10, d11_save); 374 __ ldpd(v9, v8, d9_save); 375 376 __ ldp(r28, r27, r28_save); 377 __ ldp(r26, r25, r26_save); 378 __ ldp(r24, r23, r24_save); 379 __ ldp(r22, r21, r22_save); 380 __ ldp(r20, r19, r20_save); 381 382 // restore fpcr 383 __ ldr(rscratch1, fpcr_save); 384 __ set_fpcr(rscratch1); 385 386 __ ldp(c_rarg0, c_rarg1, call_wrapper); 387 __ ldrw(c_rarg2, result_type); 388 __ ldr(c_rarg3, method); 389 __ ldp(c_rarg4, c_rarg5, entry_point); 390 __ ldp(c_rarg6, c_rarg7, parameter_size); 391 392 // leave frame and return to caller 393 __ leave(); 394 __ ret(lr); 395 396 // handle return types different from T_INT 397 398 __ BIND(is_long); 399 __ str(r0, Address(j_rarg2, 0)); 400 __ br(Assembler::AL, exit); 401 402 __ BIND(is_float); 403 __ strs(j_farg0, Address(j_rarg2, 0)); 404 __ br(Assembler::AL, exit); 405 406 __ BIND(is_double); 407 __ strd(j_farg0, Address(j_rarg2, 0)); 408 __ br(Assembler::AL, exit); 409 410 return start; 411 } 412 413 // Return point for a Java call if there's an exception thrown in 414 // Java code. The exception is caught and transformed into a 415 // pending exception stored in JavaThread that can be tested from 416 // within the VM. 417 // 418 // Note: Usually the parameters are removed by the callee. In case 419 // of an exception crossing an activation frame boundary, that is 420 // not the case if the callee is compiled code => need to setup the 421 // rsp. 
422 // 423 // r0: exception oop 424 425 address generate_catch_exception() { 426 StubGenStubId stub_id = StubGenStubId::catch_exception_id; 427 StubCodeMark mark(this, stub_id); 428 address start = __ pc(); 429 430 // same as in generate_call_stub(): 431 const Address sp_after_call(rfp, sp_after_call_off * wordSize); 432 const Address thread (rfp, thread_off * wordSize); 433 434 #ifdef ASSERT 435 // verify that threads correspond 436 { 437 Label L, S; 438 __ ldr(rscratch1, thread); 439 __ cmp(rthread, rscratch1); 440 __ br(Assembler::NE, S); 441 __ get_thread(rscratch1); 442 __ cmp(rthread, rscratch1); 443 __ br(Assembler::EQ, L); 444 __ bind(S); 445 __ stop("StubRoutines::catch_exception: threads must correspond"); 446 __ bind(L); 447 } 448 #endif 449 450 // set pending exception 451 __ verify_oop(r0); 452 453 __ str(r0, Address(rthread, Thread::pending_exception_offset())); 454 __ mov(rscratch1, (address)__FILE__); 455 __ str(rscratch1, Address(rthread, Thread::exception_file_offset())); 456 __ movw(rscratch1, (int)__LINE__); 457 __ strw(rscratch1, Address(rthread, Thread::exception_line_offset())); 458 459 // complete return to VM 460 assert(StubRoutines::_call_stub_return_address != nullptr, 461 "_call_stub_return_address must have been generated before"); 462 __ b(StubRoutines::_call_stub_return_address); 463 464 return start; 465 } 466 467 // Continuation point for runtime calls returning with a pending 468 // exception. The pending exception check happened in the runtime 469 // or native call stub. The pending exception in Thread is 470 // converted into a Java-level exception. 471 // 472 // Contract with Java-level exception handlers: 473 // r0: exception 474 // r3: throwing pc 475 // 476 // NOTE: At entry of this stub, exception-pc must be in LR !! 477 478 // NOTE: this is always used as a jump target within generated code 479 // so it just needs to be generated code with no x86 prolog 480 481 address generate_forward_exception() { 482 StubGenStubId stub_id = StubGenStubId::forward_exception_id; 483 StubCodeMark mark(this, stub_id); 484 address start = __ pc(); 485 486 // Upon entry, LR points to the return address returning into 487 // Java (interpreted or compiled) code; i.e., the return address 488 // becomes the throwing pc. 489 // 490 // Arguments pushed before the runtime call are still on the stack 491 // but the exception handler will reset the stack pointer -> 492 // ignore them. A potential result in registers can be ignored as 493 // well. 494 495 #ifdef ASSERT 496 // make sure this code is only executed if there is a pending exception 497 { 498 Label L; 499 __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset())); 500 __ cbnz(rscratch1, L); 501 __ stop("StubRoutines::forward exception: no pending exception (1)"); 502 __ bind(L); 503 } 504 #endif 505 506 // compute exception handler into r19 507 508 // call the VM to find the handler address associated with the 509 // caller address. pass thread in r0 and caller pc (ret address) 510 // in r1. n.b. the caller pc is in lr, unlike x86 where it is on 511 // the stack. 512 __ mov(c_rarg1, lr); 513 // lr will be trashed by the VM call so we move it to R19 514 // (callee-saved) because we also need to pass it to the handler 515 // returned by this call. 
516 __ mov(r19, lr); 517 BLOCK_COMMENT("call exception_handler_for_return_address"); 518 __ call_VM_leaf(CAST_FROM_FN_PTR(address, 519 SharedRuntime::exception_handler_for_return_address), 520 rthread, c_rarg1); 521 // Reinitialize the ptrue predicate register, in case the external runtime 522 // call clobbers ptrue reg, as we may return to SVE compiled code. 523 __ reinitialize_ptrue(); 524 525 // we should not really care that lr is no longer the callee 526 // address. we saved the value the handler needs in r19 so we can 527 // just copy it to r3. however, the C2 handler will push its own 528 // frame and then calls into the VM and the VM code asserts that 529 // the PC for the frame above the handler belongs to a compiled 530 // Java method. So, we restore lr here to satisfy that assert. 531 __ mov(lr, r19); 532 // setup r0 & r3 & clear pending exception 533 __ mov(r3, r19); 534 __ mov(r19, r0); 535 __ ldr(r0, Address(rthread, Thread::pending_exception_offset())); 536 __ str(zr, Address(rthread, Thread::pending_exception_offset())); 537 538 #ifdef ASSERT 539 // make sure exception is set 540 { 541 Label L; 542 __ cbnz(r0, L); 543 __ stop("StubRoutines::forward exception: no pending exception (2)"); 544 __ bind(L); 545 } 546 #endif 547 548 // continue at exception handler 549 // r0: exception 550 // r3: throwing pc 551 // r19: exception handler 552 __ verify_oop(r0); 553 __ br(r19); 554 555 return start; 556 } 557 558 // Non-destructive plausibility checks for oops 559 // 560 // Arguments: 561 // r0: oop to verify 562 // rscratch1: error message 563 // 564 // Stack after saving c_rarg3: 565 // [tos + 0]: saved c_rarg3 566 // [tos + 1]: saved c_rarg2 567 // [tos + 2]: saved lr 568 // [tos + 3]: saved rscratch2 569 // [tos + 4]: saved r0 570 // [tos + 5]: saved rscratch1 571 address generate_verify_oop() { 572 StubGenStubId stub_id = StubGenStubId::verify_oop_id; 573 StubCodeMark mark(this, stub_id); 574 address start = __ pc(); 575 576 Label exit, error; 577 578 // save c_rarg2 and c_rarg3 579 __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16))); 580 581 // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr())); 582 __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr())); 583 __ ldr(c_rarg3, Address(c_rarg2)); 584 __ add(c_rarg3, c_rarg3, 1); 585 __ str(c_rarg3, Address(c_rarg2)); 586 587 // object is in r0 588 // make sure object is 'reasonable' 589 __ cbz(r0, exit); // if obj is null it is OK 590 591 BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler(); 592 bs_asm->check_oop(_masm, r0, c_rarg2, c_rarg3, error); 593 594 // return if everything seems ok 595 __ bind(exit); 596 597 __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16))); 598 __ ret(lr); 599 600 // handle errors 601 __ bind(error); 602 __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16))); 603 604 __ push(RegSet::range(r0, r29), sp); 605 // debug(char* msg, int64_t pc, int64_t regs[]) 606 __ mov(c_rarg0, rscratch1); // pass address of error message 607 __ mov(c_rarg1, lr); // pass return address 608 __ mov(c_rarg2, sp); // pass address of regs on stack 609 #ifndef PRODUCT 610 assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area"); 611 #endif 612 BLOCK_COMMENT("call MacroAssembler::debug"); 613 __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64)); 614 __ blr(rscratch1); 615 __ hlt(0); 616 617 return start; 618 } 619 620 // Generate indices for iota vector. 
621 address generate_iota_indices(StubGenStubId stub_id) { 622 __ align(CodeEntryAlignment); 623 StubCodeMark mark(this, stub_id); 624 address start = __ pc(); 625 // B 626 __ emit_data64(0x0706050403020100, relocInfo::none); 627 __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none); 628 // H 629 __ emit_data64(0x0003000200010000, relocInfo::none); 630 __ emit_data64(0x0007000600050004, relocInfo::none); 631 // S 632 __ emit_data64(0x0000000100000000, relocInfo::none); 633 __ emit_data64(0x0000000300000002, relocInfo::none); 634 // D 635 __ emit_data64(0x0000000000000000, relocInfo::none); 636 __ emit_data64(0x0000000000000001, relocInfo::none); 637 // S - FP 638 __ emit_data64(0x3F80000000000000, relocInfo::none); // 0.0f, 1.0f 639 __ emit_data64(0x4040000040000000, relocInfo::none); // 2.0f, 3.0f 640 // D - FP 641 __ emit_data64(0x0000000000000000, relocInfo::none); // 0.0d 642 __ emit_data64(0x3FF0000000000000, relocInfo::none); // 1.0d 643 return start; 644 } 645 646 // The inner part of zero_words(). This is the bulk operation, 647 // zeroing words in blocks, possibly using DC ZVA to do it. The 648 // caller is responsible for zeroing the last few words. 649 // 650 // Inputs: 651 // r10: the HeapWord-aligned base address of an array to zero. 652 // r11: the count in HeapWords, r11 > 0. 653 // 654 // Returns r10 and r11, adjusted for the caller to clear. 655 // r10: the base address of the tail of words left to clear. 656 // r11: the number of words in the tail. 657 // r11 < MacroAssembler::zero_words_block_size. 658 659 address generate_zero_blocks() { 660 Label done; 661 Label base_aligned; 662 663 Register base = r10, cnt = r11; 664 665 __ align(CodeEntryAlignment); 666 StubGenStubId stub_id = StubGenStubId::zero_blocks_id; 667 StubCodeMark mark(this, stub_id); 668 address start = __ pc(); 669 670 if (UseBlockZeroing) { 671 int zva_length = VM_Version::zva_length(); 672 673 // Ensure ZVA length can be divided by 16. This is required by 674 // the subsequent operations. 675 assert (zva_length % 16 == 0, "Unexpected ZVA Length"); 676 677 __ tbz(base, 3, base_aligned); 678 __ str(zr, Address(__ post(base, 8))); 679 __ sub(cnt, cnt, 1); 680 __ bind(base_aligned); 681 682 // Ensure count >= zva_length * 2 so that it still deserves a zva after 683 // alignment. 684 Label small; 685 int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit); 686 __ subs(rscratch1, cnt, low_limit >> 3); 687 __ br(Assembler::LT, small); 688 __ zero_dcache_blocks(base, cnt); 689 __ bind(small); 690 } 691 692 { 693 // Number of stp instructions we'll unroll 694 const int unroll = 695 MacroAssembler::zero_words_block_size / 2; 696 // Clear the remaining blocks. 697 Label loop; 698 __ subs(cnt, cnt, unroll * 2); 699 __ br(Assembler::LT, done); 700 __ bind(loop); 701 for (int i = 0; i < unroll; i++) 702 __ stp(zr, zr, __ post(base, 16)); 703 __ subs(cnt, cnt, unroll * 2); 704 __ br(Assembler::GE, loop); 705 __ bind(done); 706 __ add(cnt, cnt, unroll * 2); 707 } 708 709 __ ret(lr); 710 711 return start; 712 } 713 714 715 typedef enum { 716 copy_forwards = 1, 717 copy_backwards = -1 718 } copy_direction; 719 720 // Helper object to reduce noise when telling the GC barriers how to perform loads and stores 721 // for arraycopy stubs. 
722 class ArrayCopyBarrierSetHelper : StackObj { 723 BarrierSetAssembler* _bs_asm; 724 MacroAssembler* _masm; 725 DecoratorSet _decorators; 726 BasicType _type; 727 Register _gct1; 728 Register _gct2; 729 Register _gct3; 730 FloatRegister _gcvt1; 731 FloatRegister _gcvt2; 732 FloatRegister _gcvt3; 733 734 public: 735 ArrayCopyBarrierSetHelper(MacroAssembler* masm, 736 DecoratorSet decorators, 737 BasicType type, 738 Register gct1, 739 Register gct2, 740 Register gct3, 741 FloatRegister gcvt1, 742 FloatRegister gcvt2, 743 FloatRegister gcvt3) 744 : _bs_asm(BarrierSet::barrier_set()->barrier_set_assembler()), 745 _masm(masm), 746 _decorators(decorators), 747 _type(type), 748 _gct1(gct1), 749 _gct2(gct2), 750 _gct3(gct3), 751 _gcvt1(gcvt1), 752 _gcvt2(gcvt2), 753 _gcvt3(gcvt3) { 754 } 755 756 void copy_load_at_32(FloatRegister dst1, FloatRegister dst2, Address src) { 757 _bs_asm->copy_load_at(_masm, _decorators, _type, 32, 758 dst1, dst2, src, 759 _gct1, _gct2, _gcvt1); 760 } 761 762 void copy_store_at_32(Address dst, FloatRegister src1, FloatRegister src2) { 763 _bs_asm->copy_store_at(_masm, _decorators, _type, 32, 764 dst, src1, src2, 765 _gct1, _gct2, _gct3, _gcvt1, _gcvt2, _gcvt3); 766 } 767 768 void copy_load_at_16(Register dst1, Register dst2, Address src) { 769 _bs_asm->copy_load_at(_masm, _decorators, _type, 16, 770 dst1, dst2, src, 771 _gct1); 772 } 773 774 void copy_store_at_16(Address dst, Register src1, Register src2) { 775 _bs_asm->copy_store_at(_masm, _decorators, _type, 16, 776 dst, src1, src2, 777 _gct1, _gct2, _gct3); 778 } 779 780 void copy_load_at_8(Register dst, Address src) { 781 _bs_asm->copy_load_at(_masm, _decorators, _type, 8, 782 dst, noreg, src, 783 _gct1); 784 } 785 786 void copy_store_at_8(Address dst, Register src) { 787 _bs_asm->copy_store_at(_masm, _decorators, _type, 8, 788 dst, src, noreg, 789 _gct1, _gct2, _gct3); 790 } 791 }; 792 793 // Bulk copy of blocks of 8 words. 794 // 795 // count is a count of words. 796 // 797 // Precondition: count >= 8 798 // 799 // Postconditions: 800 // 801 // The least significant bit of count contains the remaining count 802 // of words to copy. The rest of count is trash. 803 // 804 // s and d are adjusted to point to the remaining words to copy 805 // 806 void generate_copy_longs(StubGenStubId stub_id, DecoratorSet decorators, Label &start, Register s, Register d, Register count) { 807 BasicType type; 808 copy_direction direction; 809 810 switch (stub_id) { 811 case copy_byte_f_id: 812 direction = copy_forwards; 813 type = T_BYTE; 814 break; 815 case copy_byte_b_id: 816 direction = copy_backwards; 817 type = T_BYTE; 818 break; 819 case copy_oop_f_id: 820 direction = copy_forwards; 821 type = T_OBJECT; 822 break; 823 case copy_oop_b_id: 824 direction = copy_backwards; 825 type = T_OBJECT; 826 break; 827 case copy_oop_uninit_f_id: 828 direction = copy_forwards; 829 type = T_OBJECT; 830 break; 831 case copy_oop_uninit_b_id: 832 direction = copy_backwards; 833 type = T_OBJECT; 834 break; 835 default: 836 ShouldNotReachHere(); 837 } 838 839 int unit = wordSize * direction; 840 int bias = (UseSIMDForMemoryOps ? 
4:2) * wordSize; 841 842 const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6, 843 t4 = r7, t5 = r11, t6 = r12, t7 = r13; 844 const Register stride = r14; 845 const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10; 846 const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved 847 ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3); 848 849 assert_different_registers(rscratch1, rscratch2, t0, t1, t2, t3, t4, t5, t6, t7); 850 assert_different_registers(s, d, count, rscratch1, rscratch2); 851 852 Label again, drain; 853 854 __ align(CodeEntryAlignment); 855 856 StubCodeMark mark(this, stub_id); 857 858 __ bind(start); 859 860 Label unaligned_copy_long; 861 if (AvoidUnalignedAccesses) { 862 __ tbnz(d, 3, unaligned_copy_long); 863 } 864 865 if (direction == copy_forwards) { 866 __ sub(s, s, bias); 867 __ sub(d, d, bias); 868 } 869 870 #ifdef ASSERT 871 // Make sure we are never given < 8 words 872 { 873 Label L; 874 __ cmp(count, (u1)8); 875 __ br(Assembler::GE, L); 876 __ stop("genrate_copy_longs called with < 8 words"); 877 __ bind(L); 878 } 879 #endif 880 881 // Fill 8 registers 882 if (UseSIMDForMemoryOps) { 883 bs.copy_load_at_32(v0, v1, Address(s, 4 * unit)); 884 bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit))); 885 } else { 886 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit)); 887 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit)); 888 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit)); 889 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit))); 890 } 891 892 __ subs(count, count, 16); 893 __ br(Assembler::LO, drain); 894 895 int prefetch = PrefetchCopyIntervalInBytes; 896 bool use_stride = false; 897 if (direction == copy_backwards) { 898 use_stride = prefetch > 256; 899 prefetch = -prefetch; 900 if (use_stride) __ mov(stride, prefetch); 901 } 902 903 __ bind(again); 904 905 if (PrefetchCopyIntervalInBytes > 0) 906 __ prfm(use_stride ? 
Address(s, stride) : Address(s, prefetch), PLDL1KEEP); 907 908 if (UseSIMDForMemoryOps) { 909 bs.copy_store_at_32(Address(d, 4 * unit), v0, v1); 910 bs.copy_load_at_32(v0, v1, Address(s, 4 * unit)); 911 bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3); 912 bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit))); 913 } else { 914 bs.copy_store_at_16(Address(d, 2 * unit), t0, t1); 915 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit)); 916 bs.copy_store_at_16(Address(d, 4 * unit), t2, t3); 917 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit)); 918 bs.copy_store_at_16(Address(d, 6 * unit), t4, t5); 919 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit)); 920 bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7); 921 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit))); 922 } 923 924 __ subs(count, count, 8); 925 __ br(Assembler::HS, again); 926 927 // Drain 928 __ bind(drain); 929 if (UseSIMDForMemoryOps) { 930 bs.copy_store_at_32(Address(d, 4 * unit), v0, v1); 931 bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3); 932 } else { 933 bs.copy_store_at_16(Address(d, 2 * unit), t0, t1); 934 bs.copy_store_at_16(Address(d, 4 * unit), t2, t3); 935 bs.copy_store_at_16(Address(d, 6 * unit), t4, t5); 936 bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7); 937 } 938 939 { 940 Label L1, L2; 941 __ tbz(count, exact_log2(4), L1); 942 if (UseSIMDForMemoryOps) { 943 bs.copy_load_at_32(v0, v1, Address(__ pre(s, 4 * unit))); 944 bs.copy_store_at_32(Address(__ pre(d, 4 * unit)), v0, v1); 945 } else { 946 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit)); 947 bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit))); 948 bs.copy_store_at_16(Address(d, 2 * unit), t0, t1); 949 bs.copy_store_at_16(Address(__ pre(d, 4 * unit)), t2, t3); 950 } 951 __ bind(L1); 952 953 if (direction == copy_forwards) { 954 __ add(s, s, bias); 955 __ add(d, d, bias); 956 } 957 958 __ tbz(count, 1, L2); 959 bs.copy_load_at_16(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards))); 960 bs.copy_store_at_16(Address(__ adjust(d, 2 * unit, direction == copy_backwards)), t0, t1); 961 __ bind(L2); 962 } 963 964 __ ret(lr); 965 966 if (AvoidUnalignedAccesses) { 967 Label drain, again; 968 // Register order for storing. Order is different for backward copy. 969 970 __ bind(unaligned_copy_long); 971 972 // source address is even aligned, target odd aligned 973 // 974 // when forward copying word pairs we read long pairs at offsets 975 // {0, 2, 4, 6} (in long words). when backwards copying we read 976 // long pairs at offsets {-2, -4, -6, -8}. We adjust the source 977 // address by -2 in the forwards case so we can compute the 978 // source offsets for both as {2, 4, 6, 8} * unit where unit = 1 979 // or -1. 980 // 981 // when forward copying we need to store 1 word, 3 pairs and 982 // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a 983 // zero offset We adjust the destination by -1 which means we 984 // have to use offsets { 1, 2, 4, 6, 8} * unit for the stores. 985 // 986 // When backwards copyng we need to store 1 word, 3 pairs and 987 // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use 988 // offsets {1, 3, 5, 7, 8} * unit. 
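
      // Illustration, assuming the forwards case with s 16-byte aligned on
      // entry (copy_memory aligns s before branching here) and bit 3 of d
      // set: after the s -= 16 / d -= 8 adjustments below, the leading str
      // at offset 1 * unit writes the word at the original d, and every
      // following stp then lands on a 16-byte boundary, which is what lets
      // this variant avoid unaligned 128-bit stores.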
989 990 if (direction == copy_forwards) { 991 __ sub(s, s, 16); 992 __ sub(d, d, 8); 993 } 994 995 // Fill 8 registers 996 // 997 // for forwards copy s was offset by -16 from the original input 998 // value of s so the register contents are at these offsets 999 // relative to the 64 bit block addressed by that original input 1000 // and so on for each successive 64 byte block when s is updated 1001 // 1002 // t0 at offset 0, t1 at offset 8 1003 // t2 at offset 16, t3 at offset 24 1004 // t4 at offset 32, t5 at offset 40 1005 // t6 at offset 48, t7 at offset 56 1006 1007 // for backwards copy s was not offset so the register contents 1008 // are at these offsets into the preceding 64 byte block 1009 // relative to that original input and so on for each successive 1010 // preceding 64 byte block when s is updated. this explains the 1011 // slightly counter-intuitive looking pattern of register usage 1012 // in the stp instructions for backwards copy. 1013 // 1014 // t0 at offset -16, t1 at offset -8 1015 // t2 at offset -32, t3 at offset -24 1016 // t4 at offset -48, t5 at offset -40 1017 // t6 at offset -64, t7 at offset -56 1018 1019 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit)); 1020 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit)); 1021 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit)); 1022 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit))); 1023 1024 __ subs(count, count, 16); 1025 __ br(Assembler::LO, drain); 1026 1027 int prefetch = PrefetchCopyIntervalInBytes; 1028 bool use_stride = false; 1029 if (direction == copy_backwards) { 1030 use_stride = prefetch > 256; 1031 prefetch = -prefetch; 1032 if (use_stride) __ mov(stride, prefetch); 1033 } 1034 1035 __ bind(again); 1036 1037 if (PrefetchCopyIntervalInBytes > 0) 1038 __ prfm(use_stride ? 
                Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

      if (direction == copy_forwards) {
        // allowing for the offset of -8 the store instructions place
        // registers into the target 64 bit block at the following
        // offsets
        //
        // t0 at offset 0
        // t1 at offset 8,  t2 at offset 16
        // t3 at offset 24, t4 at offset 32
        // t5 at offset 40, t6 at offset 48
        // t7 at offset 56

        bs.copy_store_at_8(Address(d, 1 * unit), t0);
        bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
        bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
        bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
        bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
        bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
        bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
        bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
        bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
      } else {
        // d was not offset when we started so the registers are
        // written into the 64 bit block preceding d with the following
        // offsets
        //
        // t1 at offset -8
        // t3 at offset -24, t0 at offset -16
        // t5 at offset -40, t2 at offset -32
        // t7 at offset -56, t4 at offset -48
        // t6 at offset -64
        //
        // note that this matches the offsets previously noted for the
        // loads

        bs.copy_store_at_8(Address(d, 1 * unit), t1);
        bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
        bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
        bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
        bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
        bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
        bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
        bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
        bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
      }

      __ subs(count, count, 8);
      __ br(Assembler::HS, again);

      // Drain
      //
      // this uses the same pattern of offsets and register arguments
      // as above
      __ bind(drain);
      if (direction == copy_forwards) {
        bs.copy_store_at_8(Address(d, 1 * unit), t0);
        bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
        bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
        bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
        bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
      } else {
        bs.copy_store_at_8(Address(d, 1 * unit), t1);
        bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
        bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
        bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
        bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
      }
      // now we need to copy any remaining part block which may
      // include a 4 word subblock and/or a 2 word subblock.
      // bits 2 and 1 in the count are the tell-tale for whether we
      // have each such subblock
      {
        Label L1, L2;
        __ tbz(count, exact_log2(4), L1);
        // this is the same as above but copying only 4 longs hence
        // with only one intervening stp between the str instructions
        // but note that the offsets and registers still follow the
        // same pattern
        bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
        bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
        if (direction == copy_forwards) {
          bs.copy_store_at_8(Address(d, 1 * unit), t0);
          bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
          bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t3);
        } else {
          bs.copy_store_at_8(Address(d, 1 * unit), t1);
          bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
          bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t2);
        }
        __ bind(L1);

        __ tbz(count, 1, L2);
        // this is the same as above but copying only 2 longs hence
        // there is no intervening stp between the str instructions
        // but note that the offset and register patterns are still
        // the same
        bs.copy_load_at_16(t0, t1, Address(__ pre(s, 2 * unit)));
        if (direction == copy_forwards) {
          bs.copy_store_at_8(Address(d, 1 * unit), t0);
          bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t1);
        } else {
          bs.copy_store_at_8(Address(d, 1 * unit), t1);
          bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t0);
        }
        __ bind(L2);

        // for forwards copy we need to re-adjust the offsets we
        // applied so that s and d follow the last words written

        if (direction == copy_forwards) {
          __ add(s, s, 16);
          __ add(d, d, 8);
        }

      }

      __ ret(lr);
    }
  }

  // Small copy: less than 16 bytes.
  //
  // NB: Ignores all of the bits of count which represent more than 15
  // bytes, so a caller doesn't have to mask them.

  void copy_memory_small(DecoratorSet decorators, BasicType type, Register s, Register d, Register count, int step) {
    bool is_backwards = step < 0;
    size_t granularity = uabs(step);
    int direction = is_backwards ? -1 : 1;

    Label Lword, Lint, Lshort, Lbyte;

    assert(granularity
           && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");

    const Register t0 = r3;
    const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
    ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, fnoreg, fnoreg, fnoreg);

    // ??? I don't know if this bit-test-and-branch is the right thing
    // to do.  It does a lot of jumping, resulting in several
    // mispredicted branches.  It might make more sense to do this
    // with something like Duff's device with a single computed branch.
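
    // Worked example (illustrative, assuming a byte copy, granularity == 1):
    // for count == 13 (0b1101) the tests below fire as follows: bit 3 is set
    // so one 8-byte word is copied, bit 2 is set so 4 more bytes follow,
    // bit 1 is clear so the 2-byte step is skipped, and bit 0 is set so the
    // final byte is copied, for 8 + 4 + 1 == 13 bytes in total.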
1182 1183 __ tbz(count, 3 - exact_log2(granularity), Lword); 1184 bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards))); 1185 bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0); 1186 __ bind(Lword); 1187 1188 if (granularity <= sizeof (jint)) { 1189 __ tbz(count, 2 - exact_log2(granularity), Lint); 1190 __ ldrw(t0, Address(__ adjust(s, sizeof (jint) * direction, is_backwards))); 1191 __ strw(t0, Address(__ adjust(d, sizeof (jint) * direction, is_backwards))); 1192 __ bind(Lint); 1193 } 1194 1195 if (granularity <= sizeof (jshort)) { 1196 __ tbz(count, 1 - exact_log2(granularity), Lshort); 1197 __ ldrh(t0, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards))); 1198 __ strh(t0, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards))); 1199 __ bind(Lshort); 1200 } 1201 1202 if (granularity <= sizeof (jbyte)) { 1203 __ tbz(count, 0, Lbyte); 1204 __ ldrb(t0, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards))); 1205 __ strb(t0, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards))); 1206 __ bind(Lbyte); 1207 } 1208 } 1209 1210 Label copy_f, copy_b; 1211 Label copy_obj_f, copy_obj_b; 1212 Label copy_obj_uninit_f, copy_obj_uninit_b; 1213 1214 // All-singing all-dancing memory copy. 1215 // 1216 // Copy count units of memory from s to d. The size of a unit is 1217 // step, which can be positive or negative depending on the direction 1218 // of copy. If is_aligned is false, we align the source address. 1219 // 1220 1221 void copy_memory(DecoratorSet decorators, BasicType type, bool is_aligned, 1222 Register s, Register d, Register count, int step) { 1223 copy_direction direction = step < 0 ? copy_backwards : copy_forwards; 1224 bool is_backwards = step < 0; 1225 unsigned int granularity = uabs(step); 1226 const Register t0 = r3, t1 = r4; 1227 1228 // <= 80 (or 96 for SIMD) bytes do inline. Direction doesn't matter because we always 1229 // load all the data before writing anything 1230 Label copy4, copy8, copy16, copy32, copy80, copy_big, finish; 1231 const Register t2 = r5, t3 = r6, t4 = r7, t5 = r11; 1232 const Register t6 = r12, t7 = r13, t8 = r14, t9 = r15; 1233 const Register send = r17, dend = r16; 1234 const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10; 1235 const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved 1236 ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3); 1237 1238 if (PrefetchCopyIntervalInBytes > 0) 1239 __ prfm(Address(s, 0), PLDL1KEEP); 1240 __ cmp(count, u1((UseSIMDForMemoryOps ? 
              96:80)/granularity));
    __ br(Assembler::HI, copy_big);

    __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
    __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));

    __ cmp(count, u1(16/granularity));
    __ br(Assembler::LS, copy16);

    __ cmp(count, u1(64/granularity));
    __ br(Assembler::HI, copy80);

    __ cmp(count, u1(32/granularity));
    __ br(Assembler::LS, copy32);

    // 33..64 bytes
    if (UseSIMDForMemoryOps) {
      bs.copy_load_at_32(v0, v1, Address(s, 0));
      bs.copy_load_at_32(v2, v3, Address(send, -32));
      bs.copy_store_at_32(Address(d, 0), v0, v1);
      bs.copy_store_at_32(Address(dend, -32), v2, v3);
    } else {
      bs.copy_load_at_16(t0, t1, Address(s, 0));
      bs.copy_load_at_16(t2, t3, Address(s, 16));
      bs.copy_load_at_16(t4, t5, Address(send, -32));
      bs.copy_load_at_16(t6, t7, Address(send, -16));

      bs.copy_store_at_16(Address(d, 0), t0, t1);
      bs.copy_store_at_16(Address(d, 16), t2, t3);
      bs.copy_store_at_16(Address(dend, -32), t4, t5);
      bs.copy_store_at_16(Address(dend, -16), t6, t7);
    }
    __ b(finish);

    // 17..32 bytes
    __ bind(copy32);
    bs.copy_load_at_16(t0, t1, Address(s, 0));
    bs.copy_load_at_16(t6, t7, Address(send, -16));

    bs.copy_store_at_16(Address(d, 0), t0, t1);
    bs.copy_store_at_16(Address(dend, -16), t6, t7);
    __ b(finish);

    // 65..80/96 bytes
    // (96 bytes if SIMD because we do 32 bytes per instruction)
    __ bind(copy80);
    if (UseSIMDForMemoryOps) {
      bs.copy_load_at_32(v0, v1, Address(s, 0));
      bs.copy_load_at_32(v2, v3, Address(s, 32));
      // Unaligned pointers can be an issue for copying.
      // The issue is more likely to occur when the granularity of the data is
      // less than 4 (sizeof(jint)). Pointers for arrays of jint are at least
      // 4 byte aligned. Pointers for arrays of jlong are 8 byte aligned.
      // The largest performance drop has been seen for the range 65-80 bytes.
      // For such cases using the pair of ldp/stp instead of the third pair of
      // ldpq/stpq fixes the performance issue.
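
      // Illustrative case, assuming a 72-byte copy with granularity 1: the
      // two 32-byte loads above cover bytes 0..63 and the ldp/stp pair below
      // covers bytes 56..71 ending at dend, re-writing bytes 56..63 with the
      // same data. That overlap is harmless because all loads are issued
      // before any store.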
1296 if (granularity < sizeof (jint)) { 1297 Label copy96; 1298 __ cmp(count, u1(80/granularity)); 1299 __ br(Assembler::HI, copy96); 1300 bs.copy_load_at_16(t0, t1, Address(send, -16)); 1301 1302 bs.copy_store_at_32(Address(d, 0), v0, v1); 1303 bs.copy_store_at_32(Address(d, 32), v2, v3); 1304 1305 bs.copy_store_at_16(Address(dend, -16), t0, t1); 1306 __ b(finish); 1307 1308 __ bind(copy96); 1309 } 1310 bs.copy_load_at_32(v4, v5, Address(send, -32)); 1311 1312 bs.copy_store_at_32(Address(d, 0), v0, v1); 1313 bs.copy_store_at_32(Address(d, 32), v2, v3); 1314 1315 bs.copy_store_at_32(Address(dend, -32), v4, v5); 1316 } else { 1317 bs.copy_load_at_16(t0, t1, Address(s, 0)); 1318 bs.copy_load_at_16(t2, t3, Address(s, 16)); 1319 bs.copy_load_at_16(t4, t5, Address(s, 32)); 1320 bs.copy_load_at_16(t6, t7, Address(s, 48)); 1321 bs.copy_load_at_16(t8, t9, Address(send, -16)); 1322 1323 bs.copy_store_at_16(Address(d, 0), t0, t1); 1324 bs.copy_store_at_16(Address(d, 16), t2, t3); 1325 bs.copy_store_at_16(Address(d, 32), t4, t5); 1326 bs.copy_store_at_16(Address(d, 48), t6, t7); 1327 bs.copy_store_at_16(Address(dend, -16), t8, t9); 1328 } 1329 __ b(finish); 1330 1331 // 0..16 bytes 1332 __ bind(copy16); 1333 __ cmp(count, u1(8/granularity)); 1334 __ br(Assembler::LO, copy8); 1335 1336 // 8..16 bytes 1337 bs.copy_load_at_8(t0, Address(s, 0)); 1338 bs.copy_load_at_8(t1, Address(send, -8)); 1339 bs.copy_store_at_8(Address(d, 0), t0); 1340 bs.copy_store_at_8(Address(dend, -8), t1); 1341 __ b(finish); 1342 1343 if (granularity < 8) { 1344 // 4..7 bytes 1345 __ bind(copy8); 1346 __ tbz(count, 2 - exact_log2(granularity), copy4); 1347 __ ldrw(t0, Address(s, 0)); 1348 __ ldrw(t1, Address(send, -4)); 1349 __ strw(t0, Address(d, 0)); 1350 __ strw(t1, Address(dend, -4)); 1351 __ b(finish); 1352 if (granularity < 4) { 1353 // 0..3 bytes 1354 __ bind(copy4); 1355 __ cbz(count, finish); // get rid of 0 case 1356 if (granularity == 2) { 1357 __ ldrh(t0, Address(s, 0)); 1358 __ strh(t0, Address(d, 0)); 1359 } else { // granularity == 1 1360 // Now 1..3 bytes. Handle the 1 and 2 byte case by copying 1361 // the first and last byte. 1362 // Handle the 3 byte case by loading and storing base + count/2 1363 // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1)) 1364 // This does means in the 1 byte case we load/store the same 1365 // byte 3 times. 1366 __ lsr(count, count, 1); 1367 __ ldrb(t0, Address(s, 0)); 1368 __ ldrb(t1, Address(send, -1)); 1369 __ ldrb(t2, Address(s, count)); 1370 __ strb(t0, Address(d, 0)); 1371 __ strb(t1, Address(dend, -1)); 1372 __ strb(t2, Address(d, count)); 1373 } 1374 __ b(finish); 1375 } 1376 } 1377 1378 __ bind(copy_big); 1379 if (is_backwards) { 1380 __ lea(s, Address(s, count, Address::lsl(exact_log2(-step)))); 1381 __ lea(d, Address(d, count, Address::lsl(exact_log2(-step)))); 1382 } 1383 1384 // Now we've got the small case out of the way we can align the 1385 // source address on a 2-word boundary. 1386 1387 // Here we will materialize a count in r15, which is used by copy_memory_small 1388 // and the various generate_copy_longs stubs that we use for 2 word aligned bytes. 1389 // Up until here, we have used t9, which aliases r15, but from here on, that register 1390 // can not be used as a temp register, as it contains the count. 1391 1392 Label aligned; 1393 1394 if (is_aligned) { 1395 // We may have to adjust by 1 word to get s 2-word-aligned. 
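      // For example (illustrative): if s enters with bit 3 set (s % 16 == 8),
      // the single word copy below advances s and d by one word, leaving s
      // 16-byte aligned and count reduced by wordSize/granularity elements;
      // if s is already 16-byte aligned the tbz skips straight to 'aligned'.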
1396 __ tbz(s, exact_log2(wordSize), aligned); 1397 bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards))); 1398 bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0); 1399 __ sub(count, count, wordSize/granularity); 1400 } else { 1401 if (is_backwards) { 1402 __ andr(r15, s, 2 * wordSize - 1); 1403 } else { 1404 __ neg(r15, s); 1405 __ andr(r15, r15, 2 * wordSize - 1); 1406 } 1407 // r15 is the byte adjustment needed to align s. 1408 __ cbz(r15, aligned); 1409 int shift = exact_log2(granularity); 1410 if (shift > 0) { 1411 __ lsr(r15, r15, shift); 1412 } 1413 __ sub(count, count, r15); 1414 1415 #if 0 1416 // ?? This code is only correct for a disjoint copy. It may or 1417 // may not make sense to use it in that case. 1418 1419 // Copy the first pair; s and d may not be aligned. 1420 __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0)); 1421 __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0)); 1422 1423 // Align s and d, adjust count 1424 if (is_backwards) { 1425 __ sub(s, s, r15); 1426 __ sub(d, d, r15); 1427 } else { 1428 __ add(s, s, r15); 1429 __ add(d, d, r15); 1430 } 1431 #else 1432 copy_memory_small(decorators, type, s, d, r15, step); 1433 #endif 1434 } 1435 1436 __ bind(aligned); 1437 1438 // s is now 2-word-aligned. 1439 1440 // We have a count of units and some trailing bytes. Adjust the 1441 // count and do a bulk copy of words. If the shift is zero 1442 // perform a move instead to benefit from zero latency moves. 1443 int shift = exact_log2(wordSize/granularity); 1444 if (shift > 0) { 1445 __ lsr(r15, count, shift); 1446 } else { 1447 __ mov(r15, count); 1448 } 1449 if (direction == copy_forwards) { 1450 if (type != T_OBJECT) { 1451 __ bl(copy_f); 1452 } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) { 1453 __ bl(copy_obj_uninit_f); 1454 } else { 1455 __ bl(copy_obj_f); 1456 } 1457 } else { 1458 if (type != T_OBJECT) { 1459 __ bl(copy_b); 1460 } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) { 1461 __ bl(copy_obj_uninit_b); 1462 } else { 1463 __ bl(copy_obj_b); 1464 } 1465 } 1466 1467 // And the tail. 1468 copy_memory_small(decorators, type, s, d, count, step); 1469 1470 if (granularity >= 8) __ bind(copy8); 1471 if (granularity >= 4) __ bind(copy4); 1472 __ bind(finish); 1473 } 1474 1475 1476 void clobber_registers() { 1477 #ifdef ASSERT 1478 RegSet clobbered 1479 = MacroAssembler::call_clobbered_gp_registers() - rscratch1; 1480 __ mov(rscratch1, (uint64_t)0xdeadbeef); 1481 __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32); 1482 for (RegSetIterator<Register> it = clobbered.begin(); *it != noreg; ++it) { 1483 __ mov(*it, rscratch1); 1484 } 1485 #endif 1486 1487 } 1488 1489 // Scan over array at a for count oops, verifying each one. 1490 // Preserves a and count, clobbers rscratch1 and rscratch2. 
1491 void verify_oop_array (int size, Register a, Register count, Register temp) { 1492 Label loop, end; 1493 __ mov(rscratch1, a); 1494 __ mov(rscratch2, zr); 1495 __ bind(loop); 1496 __ cmp(rscratch2, count); 1497 __ br(Assembler::HS, end); 1498 if (size == wordSize) { 1499 __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size)))); 1500 __ verify_oop(temp); 1501 } else { 1502 __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size)))); 1503 __ decode_heap_oop(temp); // calls verify_oop 1504 } 1505 __ add(rscratch2, rscratch2, 1); 1506 __ b(loop); 1507 __ bind(end); 1508 } 1509 1510 // Arguments: 1511 // stub_id - is used to name the stub and identify all details of 1512 // how to perform the copy. 1513 // 1514 // entry - is assigned to the stub's post push entry point unless 1515 // it is null 1516 // 1517 // Inputs: 1518 // c_rarg0 - source array address 1519 // c_rarg1 - destination array address 1520 // c_rarg2 - element count, treated as ssize_t, can be zero 1521 // 1522 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1523 // the hardware handle it. The two dwords within qwords that span 1524 // cache line boundaries will still be loaded and stored atomically. 1525 // 1526 // Side Effects: entry is set to the (post push) entry point so it 1527 // can be used by the corresponding conjoint copy 1528 // method 1529 // 1530 address generate_disjoint_copy(StubGenStubId stub_id, address *entry) { 1531 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 1532 RegSet saved_reg = RegSet::of(s, d, count); 1533 int size; 1534 bool aligned; 1535 bool is_oop; 1536 bool dest_uninitialized; 1537 switch (stub_id) { 1538 case jbyte_disjoint_arraycopy_id: 1539 size = sizeof(jbyte); 1540 aligned = false; 1541 is_oop = false; 1542 dest_uninitialized = false; 1543 break; 1544 case arrayof_jbyte_disjoint_arraycopy_id: 1545 size = sizeof(jbyte); 1546 aligned = true; 1547 is_oop = false; 1548 dest_uninitialized = false; 1549 break; 1550 case jshort_disjoint_arraycopy_id: 1551 size = sizeof(jshort); 1552 aligned = false; 1553 is_oop = false; 1554 dest_uninitialized = false; 1555 break; 1556 case arrayof_jshort_disjoint_arraycopy_id: 1557 size = sizeof(jshort); 1558 aligned = true; 1559 is_oop = false; 1560 dest_uninitialized = false; 1561 break; 1562 case jint_disjoint_arraycopy_id: 1563 size = sizeof(jint); 1564 aligned = false; 1565 is_oop = false; 1566 dest_uninitialized = false; 1567 break; 1568 case arrayof_jint_disjoint_arraycopy_id: 1569 size = sizeof(jint); 1570 aligned = true; 1571 is_oop = false; 1572 dest_uninitialized = false; 1573 break; 1574 case jlong_disjoint_arraycopy_id: 1575 // since this is always aligned we can (should!) use the same 1576 // stub as for case arrayof_jlong_disjoint_arraycopy 1577 ShouldNotReachHere(); 1578 break; 1579 case arrayof_jlong_disjoint_arraycopy_id: 1580 size = sizeof(jlong); 1581 aligned = true; 1582 is_oop = false; 1583 dest_uninitialized = false; 1584 break; 1585 case oop_disjoint_arraycopy_id: 1586 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1587 aligned = !UseCompressedOops; 1588 is_oop = true; 1589 dest_uninitialized = false; 1590 break; 1591 case arrayof_oop_disjoint_arraycopy_id: 1592 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1593 aligned = !UseCompressedOops; 1594 is_oop = true; 1595 dest_uninitialized = false; 1596 break; 1597 case oop_disjoint_arraycopy_uninit_id: 1598 size = UseCompressedOops ? 
sizeof (jint) : sizeof (jlong); 1599 aligned = !UseCompressedOops; 1600 is_oop = true; 1601 dest_uninitialized = true; 1602 break; 1603 case arrayof_oop_disjoint_arraycopy_uninit_id: 1604 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1605 aligned = !UseCompressedOops; 1606 is_oop = true; 1607 dest_uninitialized = true; 1608 break; 1609 default: 1610 ShouldNotReachHere(); 1611 break; 1612 } 1613 1614 __ align(CodeEntryAlignment); 1615 StubCodeMark mark(this, stub_id); 1616 address start = __ pc(); 1617 __ enter(); 1618 1619 if (entry != nullptr) { 1620 *entry = __ pc(); 1621 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 1622 BLOCK_COMMENT("Entry:"); 1623 } 1624 1625 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT; 1626 if (dest_uninitialized) { 1627 decorators |= IS_DEST_UNINITIALIZED; 1628 } 1629 if (aligned) { 1630 decorators |= ARRAYCOPY_ALIGNED; 1631 } 1632 1633 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1634 bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg); 1635 1636 if (is_oop) { 1637 // save regs before copy_memory 1638 __ push(RegSet::of(d, count), sp); 1639 } 1640 { 1641 // UnsafeMemoryAccess page error: continue after unsafe access 1642 bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size); 1643 UnsafeMemoryAccessMark umam(this, add_entry, true); 1644 copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, size); 1645 } 1646 1647 if (is_oop) { 1648 __ pop(RegSet::of(d, count), sp); 1649 if (VerifyOops) 1650 verify_oop_array(size, d, count, r16); 1651 } 1652 1653 bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet()); 1654 1655 __ leave(); 1656 __ mov(r0, zr); // return 0 1657 __ ret(lr); 1658 return start; 1659 } 1660 1661 // Arguments: 1662 // stub_id - is used to name the stub and identify all details of 1663 // how to perform the copy. 1664 // 1665 // nooverlap_target - identifes the (post push) entry for the 1666 // corresponding disjoint copy routine which can be 1667 // jumped to if the ranges do not actually overlap 1668 // 1669 // entry - is assigned to the stub's post push entry point unless 1670 // it is null 1671 // 1672 // 1673 // Inputs: 1674 // c_rarg0 - source array address 1675 // c_rarg1 - destination array address 1676 // c_rarg2 - element count, treated as ssize_t, can be zero 1677 // 1678 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1679 // the hardware handle it. The two dwords within qwords that span 1680 // cache line boundaries will still be loaded and stored atomically. 
1681 // 1682 // Side Effects: 1683 // entry is set to the no-overlap entry point so it can be used by 1684 // some other conjoint copy method 1685 // 1686 address generate_conjoint_copy(StubGenStubId stub_id, address nooverlap_target, address *entry) { 1687 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 1688 RegSet saved_regs = RegSet::of(s, d, count); 1689 int size; 1690 bool aligned; 1691 bool is_oop; 1692 bool dest_uninitialized; 1693 switch (stub_id) { 1694 case jbyte_arraycopy_id: 1695 size = sizeof(jbyte); 1696 aligned = false; 1697 is_oop = false; 1698 dest_uninitialized = false; 1699 break; 1700 case arrayof_jbyte_arraycopy_id: 1701 size = sizeof(jbyte); 1702 aligned = true; 1703 is_oop = false; 1704 dest_uninitialized = false; 1705 break; 1706 case jshort_arraycopy_id: 1707 size = sizeof(jshort); 1708 aligned = false; 1709 is_oop = false; 1710 dest_uninitialized = false; 1711 break; 1712 case arrayof_jshort_arraycopy_id: 1713 size = sizeof(jshort); 1714 aligned = true; 1715 is_oop = false; 1716 dest_uninitialized = false; 1717 break; 1718 case jint_arraycopy_id: 1719 size = sizeof(jint); 1720 aligned = false; 1721 is_oop = false; 1722 dest_uninitialized = false; 1723 break; 1724 case arrayof_jint_arraycopy_id: 1725 size = sizeof(jint); 1726 aligned = true; 1727 is_oop = false; 1728 dest_uninitialized = false; 1729 break; 1730 case jlong_arraycopy_id: 1731 // since this is always aligned we can (should!) use the same 1732 // stub as for case arrayof_jlong_disjoint_arraycopy 1733 ShouldNotReachHere(); 1734 break; 1735 case arrayof_jlong_arraycopy_id: 1736 size = sizeof(jlong); 1737 aligned = true; 1738 is_oop = false; 1739 dest_uninitialized = false; 1740 break; 1741 case oop_arraycopy_id: 1742 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1743 aligned = !UseCompressedOops; 1744 is_oop = true; 1745 dest_uninitialized = false; 1746 break; 1747 case arrayof_oop_arraycopy_id: 1748 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1749 aligned = !UseCompressedOops; 1750 is_oop = true; 1751 dest_uninitialized = false; 1752 break; 1753 case oop_arraycopy_uninit_id: 1754 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1755 aligned = !UseCompressedOops; 1756 is_oop = true; 1757 dest_uninitialized = true; 1758 break; 1759 case arrayof_oop_arraycopy_uninit_id: 1760 size = UseCompressedOops ? 
sizeof (jint) : sizeof (jlong); 1761 aligned = !UseCompressedOops; 1762 is_oop = true; 1763 dest_uninitialized = true; 1764 break; 1765 default: 1766 ShouldNotReachHere(); 1767 } 1768 1769 StubCodeMark mark(this, stub_id); 1770 address start = __ pc(); 1771 __ enter(); 1772 1773 if (entry != nullptr) { 1774 *entry = __ pc(); 1775 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 1776 BLOCK_COMMENT("Entry:"); 1777 } 1778 1779 // use fwd copy when (d-s) above_equal (count*size) 1780 __ sub(rscratch1, d, s); 1781 __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size)); 1782 __ br(Assembler::HS, nooverlap_target); 1783 1784 DecoratorSet decorators = IN_HEAP | IS_ARRAY; 1785 if (dest_uninitialized) { 1786 decorators |= IS_DEST_UNINITIALIZED; 1787 } 1788 if (aligned) { 1789 decorators |= ARRAYCOPY_ALIGNED; 1790 } 1791 1792 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1793 bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs); 1794 1795 if (is_oop) { 1796 // save regs before copy_memory 1797 __ push(RegSet::of(d, count), sp); 1798 } 1799 { 1800 // UnsafeMemoryAccess page error: continue after unsafe access 1801 bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size); 1802 UnsafeMemoryAccessMark umam(this, add_entry, true); 1803 copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, -size); 1804 } 1805 if (is_oop) { 1806 __ pop(RegSet::of(d, count), sp); 1807 if (VerifyOops) 1808 verify_oop_array(size, d, count, r16); 1809 } 1810 bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet()); 1811 __ leave(); 1812 __ mov(r0, zr); // return 0 1813 __ ret(lr); 1814 return start; 1815 } 1816 1817 // Helper for generating a dynamic type check. 1818 // Smashes rscratch1, rscratch2. 1819 void generate_type_check(Register sub_klass, 1820 Register super_check_offset, 1821 Register super_klass, 1822 Register temp1, 1823 Register temp2, 1824 Register result, 1825 Label& L_success) { 1826 assert_different_registers(sub_klass, super_check_offset, super_klass); 1827 1828 BLOCK_COMMENT("type_check:"); 1829 1830 Label L_miss; 1831 1832 __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, nullptr, 1833 super_check_offset); 1834 __ check_klass_subtype_slow_path(sub_klass, super_klass, temp1, temp2, &L_success, nullptr); 1835 1836 // Fall through on failure! 
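    // Note: the fast path above consults the klass word found at
    // super_check_offset (primary supers display / secondary super cache),
    // while the slow path searches the secondary supers. Only a successful
    // check branches (to L_success); reaching L_miss below means the
    // subtype check failed.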
1837 __ BIND(L_miss); 1838 } 1839 1840 // 1841 // Generate checkcasting array copy stub 1842 // 1843 // Input: 1844 // c_rarg0 - source array address 1845 // c_rarg1 - destination array address 1846 // c_rarg2 - element count, treated as ssize_t, can be zero 1847 // c_rarg3 - size_t ckoff (super_check_offset) 1848 // c_rarg4 - oop ckval (super_klass) 1849 // 1850 // Output: 1851 // r0 == 0 - success 1852 // r0 == -1^K - failure, where K is partial transfer count 1853 // 1854 address generate_checkcast_copy(StubGenStubId stub_id, address *entry) { 1855 bool dest_uninitialized; 1856 switch (stub_id) { 1857 case checkcast_arraycopy_id: 1858 dest_uninitialized = false; 1859 break; 1860 case checkcast_arraycopy_uninit_id: 1861 dest_uninitialized = true; 1862 break; 1863 default: 1864 ShouldNotReachHere(); 1865 } 1866 1867 Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop; 1868 1869 // Input registers (after setup_arg_regs) 1870 const Register from = c_rarg0; // source array address 1871 const Register to = c_rarg1; // destination array address 1872 const Register count = c_rarg2; // elementscount 1873 const Register ckoff = c_rarg3; // super_check_offset 1874 const Register ckval = c_rarg4; // super_klass 1875 1876 RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4); 1877 RegSet wb_post_saved_regs = RegSet::of(count); 1878 1879 // Registers used as temps (r19, r20, r21, r22 are save-on-entry) 1880 const Register copied_oop = r22; // actual oop copied 1881 const Register count_save = r21; // orig elementscount 1882 const Register start_to = r20; // destination array start address 1883 const Register r19_klass = r19; // oop._klass 1884 1885 // Registers used as gc temps (r5, r6, r7 are save-on-call) 1886 const Register gct1 = r5, gct2 = r6, gct3 = r7; 1887 1888 //--------------------------------------------------------------- 1889 // Assembler stub will be used for this call to arraycopy 1890 // if the two arrays are subtypes of Object[] but the 1891 // destination array type is not equal to or a supertype 1892 // of the source type. Each element must be separately 1893 // checked. 1894 1895 assert_different_registers(from, to, count, ckoff, ckval, start_to, 1896 copied_oop, r19_klass, count_save); 1897 1898 __ align(CodeEntryAlignment); 1899 StubCodeMark mark(this, stub_id); 1900 address start = __ pc(); 1901 1902 __ enter(); // required for proper stackwalking of RuntimeStub frame 1903 1904 #ifdef ASSERT 1905 // caller guarantees that the arrays really are different 1906 // otherwise, we would have to make conjoint checks 1907 { Label L; 1908 __ b(L); // conjoint check not yet implemented 1909 __ stop("checkcast_copy within a single array"); 1910 __ bind(L); 1911 } 1912 #endif //ASSERT 1913 1914 // Caller of this entry point must set up the argument registers. 1915 if (entry != nullptr) { 1916 *entry = __ pc(); 1917 BLOCK_COMMENT("Entry:"); 1918 } 1919 1920 // Empty array: Nothing to do. 1921 __ cbz(count, L_done); 1922 __ push(RegSet::of(r19, r20, r21, r22), sp); 1923 1924 #ifdef ASSERT 1925 BLOCK_COMMENT("assert consistent ckoff/ckval"); 1926 // The ckoff and ckval must be mutually consistent, 1927 // even though caller generates both. 
1928 { Label L; 1929 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 1930 __ ldrw(start_to, Address(ckval, sco_offset)); 1931 __ cmpw(ckoff, start_to); 1932 __ br(Assembler::EQ, L); 1933 __ stop("super_check_offset inconsistent"); 1934 __ bind(L); 1935 } 1936 #endif //ASSERT 1937 1938 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT; 1939 bool is_oop = true; 1940 int element_size = UseCompressedOops ? 4 : 8; 1941 if (dest_uninitialized) { 1942 decorators |= IS_DEST_UNINITIALIZED; 1943 } 1944 1945 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1946 bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs); 1947 1948 // save the original count 1949 __ mov(count_save, count); 1950 1951 // Copy from low to high addresses 1952 __ mov(start_to, to); // Save destination array start address 1953 __ b(L_load_element); 1954 1955 // ======== begin loop ======== 1956 // (Loop is rotated; its entry is L_load_element.) 1957 // Loop control: 1958 // for (; count != 0; count--) { 1959 // copied_oop = load_heap_oop(from++); 1960 // ... generate_type_check ...; 1961 // store_heap_oop(to++, copied_oop); 1962 // } 1963 __ align(OptoLoopAlignment); 1964 1965 __ BIND(L_store_element); 1966 bs->copy_store_at(_masm, decorators, T_OBJECT, element_size, 1967 __ post(to, element_size), copied_oop, noreg, 1968 gct1, gct2, gct3); 1969 __ sub(count, count, 1); 1970 __ cbz(count, L_do_card_marks); 1971 1972 // ======== loop entry is here ======== 1973 __ BIND(L_load_element); 1974 bs->copy_load_at(_masm, decorators, T_OBJECT, element_size, 1975 copied_oop, noreg, __ post(from, element_size), 1976 gct1); 1977 __ cbz(copied_oop, L_store_element); 1978 1979 __ load_klass(r19_klass, copied_oop);// query the object klass 1980 1981 BLOCK_COMMENT("type_check:"); 1982 generate_type_check(/*sub_klass*/r19_klass, 1983 /*super_check_offset*/ckoff, 1984 /*super_klass*/ckval, 1985 /*r_array_base*/gct1, 1986 /*temp2*/gct2, 1987 /*result*/r10, L_store_element); 1988 1989 // Fall through on failure! 1990 1991 // ======== end loop ======== 1992 1993 // It was a real error; we must depend on the caller to finish the job. 1994 // Register count = remaining oops, count_orig = total oops. 1995 // Emit GC store barriers for the oops we have copied and report 1996 // their number to the caller. 1997 1998 __ subs(count, count_save, count); // K = partially copied oop count 1999 __ eon(count, count, zr); // report (-1^K) to caller 2000 __ br(Assembler::EQ, L_done_pop); 2001 2002 __ BIND(L_do_card_marks); 2003 bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs); 2004 2005 __ bind(L_done_pop); 2006 __ pop(RegSet::of(r19, r20, r21, r22), sp); 2007 inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr); 2008 2009 __ bind(L_done); 2010 __ mov(r0, count); 2011 __ leave(); 2012 __ ret(lr); 2013 2014 return start; 2015 } 2016 2017 // Perform range checks on the proposed arraycopy. 2018 // Kills temp, but nothing else. 2019 // Also, clean the sign bits of src_pos and dst_pos. 
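  // Note on the sign-bit cleanup below: on AArch64 a write to a 32-bit W
  // register zero-extends into the full 64-bit X register, so
  // `movw src_pos, src_pos` clears bits 63:32. By that point the positions
  // have passed the range checks (hence are non-negative), which makes them
  // safe to use directly in 64-bit address arithmetic afterwards.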
  void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
                              Register src_pos, // source position (c_rarg1)
                              Register dst,     // destination array oop (c_rarg2)
                              Register dst_pos, // destination position (c_rarg3)
                              Register length,
                              Register temp,
                              Label& L_failed) {
    BLOCK_COMMENT("arraycopy_range_checks:");

    assert_different_registers(rscratch1, temp);

    // if (src_pos + length > arrayOop(src)->length()) FAIL;
    __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
    __ addw(temp, length, src_pos);
    __ cmpw(temp, rscratch1);
    __ br(Assembler::HI, L_failed);

    // if (dst_pos + length > arrayOop(dst)->length()) FAIL;
    __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
    __ addw(temp, length, dst_pos);
    __ cmpw(temp, rscratch1);
    __ br(Assembler::HI, L_failed);

    // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
    __ movw(src_pos, src_pos);
    __ movw(dst_pos, dst_pos);

    BLOCK_COMMENT("arraycopy_range_checks done");
  }

  // These stubs get called from some dumb test routine.
  // I'll write them properly when they're called from
  // something that's actually doing something.
  static void fake_arraycopy_stub(address src, address dst, int count) {
    assert(count == 0, "huh?");
  }


  //
  // Generate 'unsafe' array copy stub
  // Though just as safe as the other stubs, it takes an unscaled
  // size_t argument instead of an element count.
  //
  // Input:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - byte count, treated as ssize_t, can be zero
  //
  // Examines the alignment of the operands and dispatches
  // to a long, int, short, or byte copy loop.
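  // Dispatch note: the stub ORs the source address, the destination address
  // and the byte count together and inspects the low bits of the result. If
  // the low three bits are all zero everything is 8-byte aligned and the long
  // copy runs with count >> 3; otherwise two-bit / one-bit alignment selects
  // the int or short copy, and the byte copy is the fallback. For example
  // (illustration) s = 0x...10, d = 0x...08, count = 24 gives an OR of 0x18,
  // whose low three bits are zero, so 3 longs are copied.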
2070 // 2071 address generate_unsafe_copy(address byte_copy_entry, 2072 address short_copy_entry, 2073 address int_copy_entry, 2074 address long_copy_entry) { 2075 StubGenStubId stub_id = StubGenStubId::unsafe_arraycopy_id; 2076 2077 Label L_long_aligned, L_int_aligned, L_short_aligned; 2078 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 2079 2080 __ align(CodeEntryAlignment); 2081 StubCodeMark mark(this, stub_id); 2082 address start = __ pc(); 2083 __ enter(); // required for proper stackwalking of RuntimeStub frame 2084 2085 // bump this on entry, not on exit: 2086 inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr); 2087 2088 __ orr(rscratch1, s, d); 2089 __ orr(rscratch1, rscratch1, count); 2090 2091 __ andr(rscratch1, rscratch1, BytesPerLong-1); 2092 __ cbz(rscratch1, L_long_aligned); 2093 __ andr(rscratch1, rscratch1, BytesPerInt-1); 2094 __ cbz(rscratch1, L_int_aligned); 2095 __ tbz(rscratch1, 0, L_short_aligned); 2096 __ b(RuntimeAddress(byte_copy_entry)); 2097 2098 __ BIND(L_short_aligned); 2099 __ lsr(count, count, LogBytesPerShort); // size => short_count 2100 __ b(RuntimeAddress(short_copy_entry)); 2101 __ BIND(L_int_aligned); 2102 __ lsr(count, count, LogBytesPerInt); // size => int_count 2103 __ b(RuntimeAddress(int_copy_entry)); 2104 __ BIND(L_long_aligned); 2105 __ lsr(count, count, LogBytesPerLong); // size => long_count 2106 __ b(RuntimeAddress(long_copy_entry)); 2107 2108 return start; 2109 } 2110 2111 // 2112 // Generate generic array copy stubs 2113 // 2114 // Input: 2115 // c_rarg0 - src oop 2116 // c_rarg1 - src_pos (32-bits) 2117 // c_rarg2 - dst oop 2118 // c_rarg3 - dst_pos (32-bits) 2119 // c_rarg4 - element count (32-bits) 2120 // 2121 // Output: 2122 // r0 == 0 - success 2123 // r0 == -1^K - failure, where K is partial transfer count 2124 // 2125 address generate_generic_copy(address byte_copy_entry, address short_copy_entry, 2126 address int_copy_entry, address oop_copy_entry, 2127 address long_copy_entry, address checkcast_copy_entry) { 2128 StubGenStubId stub_id = StubGenStubId::generic_arraycopy_id; 2129 2130 Label L_failed, L_objArray; 2131 Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs; 2132 2133 // Input registers 2134 const Register src = c_rarg0; // source array oop 2135 const Register src_pos = c_rarg1; // source position 2136 const Register dst = c_rarg2; // destination array oop 2137 const Register dst_pos = c_rarg3; // destination position 2138 const Register length = c_rarg4; 2139 2140 2141 // Registers used as temps 2142 const Register dst_klass = c_rarg5; 2143 2144 __ align(CodeEntryAlignment); 2145 2146 StubCodeMark mark(this, stub_id); 2147 2148 address start = __ pc(); 2149 2150 __ enter(); // required for proper stackwalking of RuntimeStub frame 2151 2152 // bump this on entry, not on exit: 2153 inc_counter_np(SharedRuntime::_generic_array_copy_ctr); 2154 2155 //----------------------------------------------------------------------- 2156 // Assembler stub will be used for this call to arraycopy 2157 // if the following conditions are met: 2158 // 2159 // (1) src and dst must not be null. 2160 // (2) src_pos must not be negative. 2161 // (3) dst_pos must not be negative. 2162 // (4) length must not be negative. 2163 // (5) src klass and dst klass should be the same and not null. 2164 // (6) src and dst should be arrays. 2165 // (7) src_pos + length must not exceed length of src. 2166 // (8) dst_pos + length must not exceed length of dst. 
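    // If any of these conditions does not hold, the stub branches to L_failed
    // and returns -1 in r0; the caller is then expected to fall back to a
    // slower arraycopy path. The position and length checks below are done on
    // 32-bit values, matching the jint arguments.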
2167 // 2168 2169 // if (src == nullptr) return -1; 2170 __ cbz(src, L_failed); 2171 2172 // if (src_pos < 0) return -1; 2173 __ tbnz(src_pos, 31, L_failed); // i.e. sign bit set 2174 2175 // if (dst == nullptr) return -1; 2176 __ cbz(dst, L_failed); 2177 2178 // if (dst_pos < 0) return -1; 2179 __ tbnz(dst_pos, 31, L_failed); // i.e. sign bit set 2180 2181 // registers used as temp 2182 const Register scratch_length = r16; // elements count to copy 2183 const Register scratch_src_klass = r17; // array klass 2184 const Register lh = r15; // layout helper 2185 2186 // if (length < 0) return -1; 2187 __ movw(scratch_length, length); // length (elements count, 32-bits value) 2188 __ tbnz(scratch_length, 31, L_failed); // i.e. sign bit set 2189 2190 __ load_klass(scratch_src_klass, src); 2191 #ifdef ASSERT 2192 // assert(src->klass() != nullptr); 2193 { 2194 BLOCK_COMMENT("assert klasses not null {"); 2195 Label L1, L2; 2196 __ cbnz(scratch_src_klass, L2); // it is broken if klass is null 2197 __ bind(L1); 2198 __ stop("broken null klass"); 2199 __ bind(L2); 2200 __ load_klass(rscratch1, dst); 2201 __ cbz(rscratch1, L1); // this would be broken also 2202 BLOCK_COMMENT("} assert klasses not null done"); 2203 } 2204 #endif 2205 2206 // Load layout helper (32-bits) 2207 // 2208 // |array_tag| | header_size | element_type | |log2_element_size| 2209 // 32 30 24 16 8 2 0 2210 // 2211 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0 2212 // 2213 2214 const int lh_offset = in_bytes(Klass::layout_helper_offset()); 2215 2216 // Handle objArrays completely differently... 2217 const jint objArray_lh = Klass::array_layout_helper(T_OBJECT); 2218 __ ldrw(lh, Address(scratch_src_klass, lh_offset)); 2219 __ movw(rscratch1, objArray_lh); 2220 __ eorw(rscratch2, lh, rscratch1); 2221 __ cbzw(rscratch2, L_objArray); 2222 2223 // if (src->klass() != dst->klass()) return -1; 2224 __ load_klass(rscratch2, dst); 2225 __ eor(rscratch2, rscratch2, scratch_src_klass); 2226 __ cbnz(rscratch2, L_failed); 2227 2228 // if (!src->is_Array()) return -1; 2229 __ tbz(lh, 31, L_failed); // i.e. (lh >= 0) 2230 2231 // At this point, it is known to be a typeArray (array_tag 0x3). 
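    // Layout helper example (illustration, assuming a 16-byte array header):
    // for an int[] the helper is roughly
    //   (0x3 << 30) | (16 << 16) | (T_INT << 8) | 2,
    // i.e. array tag, header size in bytes, element type, and log2 of the
    // element size. The code below extracts the header size and the log2
    // element size fields from this value.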
2232 #ifdef ASSERT 2233 { 2234 BLOCK_COMMENT("assert primitive array {"); 2235 Label L; 2236 __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift); 2237 __ cmpw(lh, rscratch2); 2238 __ br(Assembler::GE, L); 2239 __ stop("must be a primitive array"); 2240 __ bind(L); 2241 BLOCK_COMMENT("} assert primitive array done"); 2242 } 2243 #endif 2244 2245 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2246 rscratch2, L_failed); 2247 2248 // TypeArrayKlass 2249 // 2250 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize); 2251 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize); 2252 // 2253 2254 const Register rscratch1_offset = rscratch1; // array offset 2255 const Register r15_elsize = lh; // element size 2256 2257 __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift, 2258 exact_log2(Klass::_lh_header_size_mask+1)); // array_offset 2259 __ add(src, src, rscratch1_offset); // src array offset 2260 __ add(dst, dst, rscratch1_offset); // dst array offset 2261 BLOCK_COMMENT("choose copy loop based on element size"); 2262 2263 // next registers should be set before the jump to corresponding stub 2264 const Register from = c_rarg0; // source array address 2265 const Register to = c_rarg1; // destination array address 2266 const Register count = c_rarg2; // elements count 2267 2268 // 'from', 'to', 'count' registers should be set in such order 2269 // since they are the same as 'src', 'src_pos', 'dst'. 2270 2271 assert(Klass::_lh_log2_element_size_shift == 0, "fix this code"); 2272 2273 // The possible values of elsize are 0-3, i.e. exact_log2(element 2274 // size in bytes). We do a simple bitwise binary search. 2275 __ BIND(L_copy_bytes); 2276 __ tbnz(r15_elsize, 1, L_copy_ints); 2277 __ tbnz(r15_elsize, 0, L_copy_shorts); 2278 __ lea(from, Address(src, src_pos));// src_addr 2279 __ lea(to, Address(dst, dst_pos));// dst_addr 2280 __ movw(count, scratch_length); // length 2281 __ b(RuntimeAddress(byte_copy_entry)); 2282 2283 __ BIND(L_copy_shorts); 2284 __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr 2285 __ lea(to, Address(dst, dst_pos, Address::lsl(1)));// dst_addr 2286 __ movw(count, scratch_length); // length 2287 __ b(RuntimeAddress(short_copy_entry)); 2288 2289 __ BIND(L_copy_ints); 2290 __ tbnz(r15_elsize, 0, L_copy_longs); 2291 __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr 2292 __ lea(to, Address(dst, dst_pos, Address::lsl(2)));// dst_addr 2293 __ movw(count, scratch_length); // length 2294 __ b(RuntimeAddress(int_copy_entry)); 2295 2296 __ BIND(L_copy_longs); 2297 #ifdef ASSERT 2298 { 2299 BLOCK_COMMENT("assert long copy {"); 2300 Label L; 2301 __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize 2302 __ cmpw(r15_elsize, LogBytesPerLong); 2303 __ br(Assembler::EQ, L); 2304 __ stop("must be long copy, but elsize is wrong"); 2305 __ bind(L); 2306 BLOCK_COMMENT("} assert long copy done"); 2307 } 2308 #endif 2309 __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr 2310 __ lea(to, Address(dst, dst_pos, Address::lsl(3)));// dst_addr 2311 __ movw(count, scratch_length); // length 2312 __ b(RuntimeAddress(long_copy_entry)); 2313 2314 // ObjArrayKlass 2315 __ BIND(L_objArray); 2316 // live at this point: scratch_src_klass, scratch_length, src[_pos], dst[_pos] 2317 2318 Label L_plain_copy, L_checkcast_copy; 2319 // test array classes for subtyping 2320 __ load_klass(r15, dst); 2321 __ cmp(scratch_src_klass, r15); // usual case is exact 
equality 2322 __ br(Assembler::NE, L_checkcast_copy); 2323 2324 // Identically typed arrays can be copied without element-wise checks. 2325 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2326 rscratch2, L_failed); 2327 2328 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2329 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2330 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2331 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2332 __ movw(count, scratch_length); // length 2333 __ BIND(L_plain_copy); 2334 __ b(RuntimeAddress(oop_copy_entry)); 2335 2336 __ BIND(L_checkcast_copy); 2337 // live at this point: scratch_src_klass, scratch_length, r15 (dst_klass) 2338 { 2339 // Before looking at dst.length, make sure dst is also an objArray. 2340 __ ldrw(rscratch1, Address(r15, lh_offset)); 2341 __ movw(rscratch2, objArray_lh); 2342 __ eorw(rscratch1, rscratch1, rscratch2); 2343 __ cbnzw(rscratch1, L_failed); 2344 2345 // It is safe to examine both src.length and dst.length. 2346 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2347 r15, L_failed); 2348 2349 __ load_klass(dst_klass, dst); // reload 2350 2351 // Marshal the base address arguments now, freeing registers. 2352 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2353 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2354 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2355 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2356 __ movw(count, length); // length (reloaded) 2357 Register sco_temp = c_rarg3; // this register is free now 2358 assert_different_registers(from, to, count, sco_temp, 2359 dst_klass, scratch_src_klass); 2360 // assert_clean_int(count, sco_temp); 2361 2362 // Generate the type check. 2363 const int sco_offset = in_bytes(Klass::super_check_offset_offset()); 2364 __ ldrw(sco_temp, Address(dst_klass, sco_offset)); 2365 2366 // Smashes rscratch1, rscratch2 2367 generate_type_check(scratch_src_klass, sco_temp, dst_klass, /*temps*/ noreg, noreg, noreg, 2368 L_plain_copy); 2369 2370 // Fetch destination element klass from the ObjArrayKlass header. 2371 int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset()); 2372 __ ldr(dst_klass, Address(dst_klass, ek_offset)); 2373 __ ldrw(sco_temp, Address(dst_klass, sco_offset)); 2374 2375 // the checkcast_copy loop needs two extra arguments: 2376 assert(c_rarg3 == sco_temp, "#3 already in place"); 2377 // Set up arguments for checkcast_copy_entry. 2378 __ mov(c_rarg4, dst_klass); // dst.klass.element_klass 2379 __ b(RuntimeAddress(checkcast_copy_entry)); 2380 } 2381 2382 __ BIND(L_failed); 2383 __ mov(r0, -1); 2384 __ leave(); // required for proper stackwalking of RuntimeStub frame 2385 __ ret(lr); 2386 2387 return start; 2388 } 2389 2390 // 2391 // Generate stub for array fill. If "aligned" is true, the 2392 // "to" address is assumed to be heapword aligned. 
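  //  The fill value is widened by replication before the word loop: for T_BYTE
  //  a value of e.g. 0x2A becomes 0x2A2A, then 0x2A2A2A2A, and finally
  //  0x2A2A2A2A2A2A2A2A, so whole 64-bit words can be stored at a time.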
2393 // 2394 // Arguments for generated stub: 2395 // to: c_rarg0 2396 // value: c_rarg1 2397 // count: c_rarg2 treated as signed 2398 // 2399 address generate_fill(StubGenStubId stub_id) { 2400 BasicType t; 2401 bool aligned; 2402 2403 switch (stub_id) { 2404 case jbyte_fill_id: 2405 t = T_BYTE; 2406 aligned = false; 2407 break; 2408 case jshort_fill_id: 2409 t = T_SHORT; 2410 aligned = false; 2411 break; 2412 case jint_fill_id: 2413 t = T_INT; 2414 aligned = false; 2415 break; 2416 case arrayof_jbyte_fill_id: 2417 t = T_BYTE; 2418 aligned = true; 2419 break; 2420 case arrayof_jshort_fill_id: 2421 t = T_SHORT; 2422 aligned = true; 2423 break; 2424 case arrayof_jint_fill_id: 2425 t = T_INT; 2426 aligned = true; 2427 break; 2428 default: 2429 ShouldNotReachHere(); 2430 }; 2431 2432 __ align(CodeEntryAlignment); 2433 StubCodeMark mark(this, stub_id); 2434 address start = __ pc(); 2435 2436 BLOCK_COMMENT("Entry:"); 2437 2438 const Register to = c_rarg0; // source array address 2439 const Register value = c_rarg1; // value 2440 const Register count = c_rarg2; // elements count 2441 2442 const Register bz_base = r10; // base for block_zero routine 2443 const Register cnt_words = r11; // temp register 2444 2445 __ enter(); 2446 2447 Label L_fill_elements, L_exit1; 2448 2449 int shift = -1; 2450 switch (t) { 2451 case T_BYTE: 2452 shift = 0; 2453 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2454 __ bfi(value, value, 8, 8); // 8 bit -> 16 bit 2455 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit 2456 __ br(Assembler::LO, L_fill_elements); 2457 break; 2458 case T_SHORT: 2459 shift = 1; 2460 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2461 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit 2462 __ br(Assembler::LO, L_fill_elements); 2463 break; 2464 case T_INT: 2465 shift = 2; 2466 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2467 __ br(Assembler::LO, L_fill_elements); 2468 break; 2469 default: ShouldNotReachHere(); 2470 } 2471 2472 // Align source address at 8 bytes address boundary. 2473 Label L_skip_align1, L_skip_align2, L_skip_align4; 2474 if (!aligned) { 2475 switch (t) { 2476 case T_BYTE: 2477 // One byte misalignment happens only for byte arrays. 2478 __ tbz(to, 0, L_skip_align1); 2479 __ strb(value, Address(__ post(to, 1))); 2480 __ subw(count, count, 1); 2481 __ bind(L_skip_align1); 2482 // Fallthrough 2483 case T_SHORT: 2484 // Two bytes misalignment happens only for byte and short (char) arrays. 2485 __ tbz(to, 1, L_skip_align2); 2486 __ strh(value, Address(__ post(to, 2))); 2487 __ subw(count, count, 2 >> shift); 2488 __ bind(L_skip_align2); 2489 // Fallthrough 2490 case T_INT: 2491 // Align to 8 bytes, we know we are 4 byte aligned to start. 2492 __ tbz(to, 2, L_skip_align4); 2493 __ strw(value, Address(__ post(to, 4))); 2494 __ subw(count, count, 4 >> shift); 2495 __ bind(L_skip_align4); 2496 break; 2497 default: ShouldNotReachHere(); 2498 } 2499 } 2500 2501 // 2502 // Fill large chunks 2503 // 2504 __ lsrw(cnt_words, count, 3 - shift); // number of words 2505 __ bfi(value, value, 32, 32); // 32 bit -> 64 bit 2506 __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift); 2507 if (UseBlockZeroing) { 2508 Label non_block_zeroing, rest; 2509 // If the fill value is zero we can use the fast zero_words(). 
2510 __ cbnz(value, non_block_zeroing); 2511 __ mov(bz_base, to); 2512 __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord); 2513 address tpc = __ zero_words(bz_base, cnt_words); 2514 if (tpc == nullptr) { 2515 fatal("CodeCache is full at generate_fill"); 2516 } 2517 __ b(rest); 2518 __ bind(non_block_zeroing); 2519 __ fill_words(to, cnt_words, value); 2520 __ bind(rest); 2521 } else { 2522 __ fill_words(to, cnt_words, value); 2523 } 2524 2525 // Remaining count is less than 8 bytes. Fill it by a single store. 2526 // Note that the total length is no less than 8 bytes. 2527 if (t == T_BYTE || t == T_SHORT) { 2528 Label L_exit1; 2529 __ cbzw(count, L_exit1); 2530 __ add(to, to, count, Assembler::LSL, shift); // points to the end 2531 __ str(value, Address(to, -8)); // overwrite some elements 2532 __ bind(L_exit1); 2533 __ leave(); 2534 __ ret(lr); 2535 } 2536 2537 // Handle copies less than 8 bytes. 2538 Label L_fill_2, L_fill_4, L_exit2; 2539 __ bind(L_fill_elements); 2540 switch (t) { 2541 case T_BYTE: 2542 __ tbz(count, 0, L_fill_2); 2543 __ strb(value, Address(__ post(to, 1))); 2544 __ bind(L_fill_2); 2545 __ tbz(count, 1, L_fill_4); 2546 __ strh(value, Address(__ post(to, 2))); 2547 __ bind(L_fill_4); 2548 __ tbz(count, 2, L_exit2); 2549 __ strw(value, Address(to)); 2550 break; 2551 case T_SHORT: 2552 __ tbz(count, 0, L_fill_4); 2553 __ strh(value, Address(__ post(to, 2))); 2554 __ bind(L_fill_4); 2555 __ tbz(count, 1, L_exit2); 2556 __ strw(value, Address(to)); 2557 break; 2558 case T_INT: 2559 __ cbzw(count, L_exit2); 2560 __ strw(value, Address(to)); 2561 break; 2562 default: ShouldNotReachHere(); 2563 } 2564 __ bind(L_exit2); 2565 __ leave(); 2566 __ ret(lr); 2567 return start; 2568 } 2569 2570 address generate_data_cache_writeback() { 2571 const Register line = c_rarg0; // address of line to write back 2572 2573 __ align(CodeEntryAlignment); 2574 2575 StubGenStubId stub_id = StubGenStubId::data_cache_writeback_id; 2576 StubCodeMark mark(this, stub_id); 2577 2578 address start = __ pc(); 2579 __ enter(); 2580 __ cache_wb(Address(line, 0)); 2581 __ leave(); 2582 __ ret(lr); 2583 2584 return start; 2585 } 2586 2587 address generate_data_cache_writeback_sync() { 2588 const Register is_pre = c_rarg0; // pre or post sync 2589 2590 __ align(CodeEntryAlignment); 2591 2592 StubGenStubId stub_id = StubGenStubId::data_cache_writeback_sync_id; 2593 StubCodeMark mark(this, stub_id); 2594 2595 // pre wbsync is a no-op 2596 // post wbsync translates to an sfence 2597 2598 Label skip; 2599 address start = __ pc(); 2600 __ enter(); 2601 __ cbnz(is_pre, skip); 2602 __ cache_wbsync(false); 2603 __ bind(skip); 2604 __ leave(); 2605 __ ret(lr); 2606 2607 return start; 2608 } 2609 2610 void generate_arraycopy_stubs() { 2611 address entry; 2612 address entry_jbyte_arraycopy; 2613 address entry_jshort_arraycopy; 2614 address entry_jint_arraycopy; 2615 address entry_oop_arraycopy; 2616 address entry_jlong_arraycopy; 2617 address entry_checkcast_arraycopy; 2618 2619 generate_copy_longs(StubGenStubId::copy_byte_f_id, IN_HEAP | IS_ARRAY, copy_f, r0, r1, r15); 2620 generate_copy_longs(StubGenStubId::copy_byte_b_id, IN_HEAP | IS_ARRAY, copy_b, r0, r1, r15); 2621 2622 generate_copy_longs(StubGenStubId::copy_oop_f_id, IN_HEAP | IS_ARRAY, copy_obj_f, r0, r1, r15); 2623 generate_copy_longs(StubGenStubId::copy_oop_b_id, IN_HEAP | IS_ARRAY, copy_obj_b, r0, r1, r15); 2624 2625 generate_copy_longs(StubGenStubId::copy_oop_uninit_f_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, copy_obj_uninit_f, r0, r1, r15); 
2626 generate_copy_longs(StubGenStubId::copy_oop_uninit_b_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, copy_obj_uninit_b, r0, r1, r15); 2627 2628 StubRoutines::aarch64::_zero_blocks = generate_zero_blocks(); 2629 2630 //*** jbyte 2631 // Always need aligned and unaligned versions 2632 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::jbyte_disjoint_arraycopy_id, &entry); 2633 StubRoutines::_jbyte_arraycopy = generate_conjoint_copy(StubGenStubId::jbyte_arraycopy_id, entry, &entry_jbyte_arraycopy); 2634 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::arrayof_jbyte_disjoint_arraycopy_id, &entry); 2635 StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_copy(StubGenStubId::arrayof_jbyte_arraycopy_id, entry, nullptr); 2636 2637 //*** jshort 2638 // Always need aligned and unaligned versions 2639 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::jshort_disjoint_arraycopy_id, &entry); 2640 StubRoutines::_jshort_arraycopy = generate_conjoint_copy(StubGenStubId::jshort_arraycopy_id, entry, &entry_jshort_arraycopy); 2641 StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::arrayof_jshort_disjoint_arraycopy_id, &entry); 2642 StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_copy(StubGenStubId::arrayof_jshort_arraycopy_id, entry, nullptr); 2643 2644 //*** jint 2645 // Aligned versions 2646 StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::arrayof_jint_disjoint_arraycopy_id, &entry); 2647 StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_copy(StubGenStubId::arrayof_jint_arraycopy_id, entry, &entry_jint_arraycopy); 2648 // In 64 bit we need both aligned and unaligned versions of jint arraycopy. 2649 // entry_jint_arraycopy always points to the unaligned version 2650 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::jint_disjoint_arraycopy_id, &entry); 2651 StubRoutines::_jint_arraycopy = generate_conjoint_copy(StubGenStubId::jint_arraycopy_id, entry, &entry_jint_arraycopy); 2652 2653 //*** jlong 2654 // It is always aligned 2655 StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::arrayof_jlong_disjoint_arraycopy_id, &entry); 2656 StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_copy(StubGenStubId::arrayof_jlong_arraycopy_id, entry, &entry_jlong_arraycopy); 2657 StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy; 2658 StubRoutines::_jlong_arraycopy = StubRoutines::_arrayof_jlong_arraycopy; 2659 2660 //*** oops 2661 { 2662 // With compressed oops we need unaligned versions; notice that 2663 // we overwrite entry_oop_arraycopy. 
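      // Note: with compressed oops each element is a 4-byte narrow oop, so the
      // stubs generated under the arrayof_ names are themselves built as
      // unaligned (aligned = !UseCompressedOops) with a jint-sized element;
      // with uncompressed oops every element is 8 bytes and the aligned code
      // applies. Either way the same code serves both, which is why the plain
      // _oop_* entry points below simply alias the arrayof_ versions.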
2664 bool aligned = !UseCompressedOops; 2665 2666 StubRoutines::_arrayof_oop_disjoint_arraycopy 2667 = generate_disjoint_copy(StubGenStubId::arrayof_oop_disjoint_arraycopy_id, &entry); 2668 StubRoutines::_arrayof_oop_arraycopy 2669 = generate_conjoint_copy(StubGenStubId::arrayof_oop_arraycopy_id, entry, &entry_oop_arraycopy); 2670 // Aligned versions without pre-barriers 2671 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit 2672 = generate_disjoint_copy(StubGenStubId::arrayof_oop_disjoint_arraycopy_uninit_id, &entry); 2673 StubRoutines::_arrayof_oop_arraycopy_uninit 2674 = generate_conjoint_copy(StubGenStubId::arrayof_oop_arraycopy_uninit_id, entry, nullptr); 2675 } 2676 2677 StubRoutines::_oop_disjoint_arraycopy = StubRoutines::_arrayof_oop_disjoint_arraycopy; 2678 StubRoutines::_oop_arraycopy = StubRoutines::_arrayof_oop_arraycopy; 2679 StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit; 2680 StubRoutines::_oop_arraycopy_uninit = StubRoutines::_arrayof_oop_arraycopy_uninit; 2681 2682 StubRoutines::_checkcast_arraycopy = generate_checkcast_copy(StubGenStubId::checkcast_arraycopy_id, &entry_checkcast_arraycopy); 2683 StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy(StubGenStubId::checkcast_arraycopy_uninit_id, nullptr); 2684 2685 StubRoutines::_unsafe_arraycopy = generate_unsafe_copy(entry_jbyte_arraycopy, 2686 entry_jshort_arraycopy, 2687 entry_jint_arraycopy, 2688 entry_jlong_arraycopy); 2689 2690 StubRoutines::_generic_arraycopy = generate_generic_copy(entry_jbyte_arraycopy, 2691 entry_jshort_arraycopy, 2692 entry_jint_arraycopy, 2693 entry_oop_arraycopy, 2694 entry_jlong_arraycopy, 2695 entry_checkcast_arraycopy); 2696 2697 StubRoutines::_jbyte_fill = generate_fill(StubGenStubId::jbyte_fill_id); 2698 StubRoutines::_jshort_fill = generate_fill(StubGenStubId::jshort_fill_id); 2699 StubRoutines::_jint_fill = generate_fill(StubGenStubId::jint_fill_id); 2700 StubRoutines::_arrayof_jbyte_fill = generate_fill(StubGenStubId::arrayof_jbyte_fill_id); 2701 StubRoutines::_arrayof_jshort_fill = generate_fill(StubGenStubId::arrayof_jshort_fill_id); 2702 StubRoutines::_arrayof_jint_fill = generate_fill(StubGenStubId::arrayof_jint_fill_id); 2703 } 2704 2705 void generate_math_stubs() { Unimplemented(); } 2706 2707 // Arguments: 2708 // 2709 // Inputs: 2710 // c_rarg0 - source byte array address 2711 // c_rarg1 - destination byte array address 2712 // c_rarg2 - K (key) in little endian int array 2713 // 2714 address generate_aescrypt_encryptBlock() { 2715 __ align(CodeEntryAlignment); 2716 StubGenStubId stub_id = StubGenStubId::aescrypt_encryptBlock_id; 2717 StubCodeMark mark(this, stub_id); 2718 2719 const Register from = c_rarg0; // source array address 2720 const Register to = c_rarg1; // destination array address 2721 const Register key = c_rarg2; // key array address 2722 const Register keylen = rscratch1; 2723 2724 address start = __ pc(); 2725 __ enter(); 2726 2727 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2728 2729 __ aesenc_loadkeys(key, keylen); 2730 __ aesecb_encrypt(from, to, keylen); 2731 2732 __ mov(r0, 0); 2733 2734 __ leave(); 2735 __ ret(lr); 2736 2737 return start; 2738 } 2739 2740 // Arguments: 2741 // 2742 // Inputs: 2743 // c_rarg0 - source byte array address 2744 // c_rarg1 - destination byte array address 2745 // c_rarg2 - K (key) in little endian int array 2746 // 2747 address generate_aescrypt_decryptBlock() { 2748 assert(UseAES, "need 
AES cryptographic extension support"); 2749 __ align(CodeEntryAlignment); 2750 StubGenStubId stub_id = StubGenStubId::aescrypt_decryptBlock_id; 2751 StubCodeMark mark(this, stub_id); 2752 Label L_doLast; 2753 2754 const Register from = c_rarg0; // source array address 2755 const Register to = c_rarg1; // destination array address 2756 const Register key = c_rarg2; // key array address 2757 const Register keylen = rscratch1; 2758 2759 address start = __ pc(); 2760 __ enter(); // required for proper stackwalking of RuntimeStub frame 2761 2762 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2763 2764 __ aesecb_decrypt(from, to, key, keylen); 2765 2766 __ mov(r0, 0); 2767 2768 __ leave(); 2769 __ ret(lr); 2770 2771 return start; 2772 } 2773 2774 // Arguments: 2775 // 2776 // Inputs: 2777 // c_rarg0 - source byte array address 2778 // c_rarg1 - destination byte array address 2779 // c_rarg2 - K (key) in little endian int array 2780 // c_rarg3 - r vector byte array address 2781 // c_rarg4 - input length 2782 // 2783 // Output: 2784 // x0 - input length 2785 // 2786 address generate_cipherBlockChaining_encryptAESCrypt() { 2787 assert(UseAES, "need AES cryptographic extension support"); 2788 __ align(CodeEntryAlignment); 2789 StubGenStubId stub_id = StubGenStubId::cipherBlockChaining_encryptAESCrypt_id; 2790 StubCodeMark mark(this, stub_id); 2791 2792 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 2793 2794 const Register from = c_rarg0; // source array address 2795 const Register to = c_rarg1; // destination array address 2796 const Register key = c_rarg2; // key array address 2797 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2798 // and left with the results of the last encryption block 2799 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2800 const Register keylen = rscratch1; 2801 2802 address start = __ pc(); 2803 2804 __ enter(); 2805 2806 __ movw(rscratch2, len_reg); 2807 2808 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2809 2810 __ ld1(v0, __ T16B, rvec); 2811 2812 __ cmpw(keylen, 52); 2813 __ br(Assembler::CC, L_loadkeys_44); 2814 __ br(Assembler::EQ, L_loadkeys_52); 2815 2816 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2817 __ rev32(v17, __ T16B, v17); 2818 __ rev32(v18, __ T16B, v18); 2819 __ BIND(L_loadkeys_52); 2820 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2821 __ rev32(v19, __ T16B, v19); 2822 __ rev32(v20, __ T16B, v20); 2823 __ BIND(L_loadkeys_44); 2824 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2825 __ rev32(v21, __ T16B, v21); 2826 __ rev32(v22, __ T16B, v22); 2827 __ rev32(v23, __ T16B, v23); 2828 __ rev32(v24, __ T16B, v24); 2829 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2830 __ rev32(v25, __ T16B, v25); 2831 __ rev32(v26, __ T16B, v26); 2832 __ rev32(v27, __ T16B, v27); 2833 __ rev32(v28, __ T16B, v28); 2834 __ ld1(v29, v30, v31, __ T16B, key); 2835 __ rev32(v29, __ T16B, v29); 2836 __ rev32(v30, __ T16B, v30); 2837 __ rev32(v31, __ T16B, v31); 2838 2839 __ BIND(L_aes_loop); 2840 __ ld1(v1, __ T16B, __ post(from, 16)); 2841 __ eor(v0, __ T16B, v0, v1); 2842 2843 __ br(Assembler::CC, L_rounds_44); 2844 __ br(Assembler::EQ, L_rounds_52); 2845 2846 __ aese(v0, v17); __ aesmc(v0, v0); 2847 __ aese(v0, v18); __ aesmc(v0, v0); 2848 __ BIND(L_rounds_52); 2849 __ aese(v0, v19); __ aesmc(v0, v0); 2850 __ aese(v0, v20); 
__ aesmc(v0, v0); 2851 __ BIND(L_rounds_44); 2852 __ aese(v0, v21); __ aesmc(v0, v0); 2853 __ aese(v0, v22); __ aesmc(v0, v0); 2854 __ aese(v0, v23); __ aesmc(v0, v0); 2855 __ aese(v0, v24); __ aesmc(v0, v0); 2856 __ aese(v0, v25); __ aesmc(v0, v0); 2857 __ aese(v0, v26); __ aesmc(v0, v0); 2858 __ aese(v0, v27); __ aesmc(v0, v0); 2859 __ aese(v0, v28); __ aesmc(v0, v0); 2860 __ aese(v0, v29); __ aesmc(v0, v0); 2861 __ aese(v0, v30); 2862 __ eor(v0, __ T16B, v0, v31); 2863 2864 __ st1(v0, __ T16B, __ post(to, 16)); 2865 2866 __ subw(len_reg, len_reg, 16); 2867 __ cbnzw(len_reg, L_aes_loop); 2868 2869 __ st1(v0, __ T16B, rvec); 2870 2871 __ mov(r0, rscratch2); 2872 2873 __ leave(); 2874 __ ret(lr); 2875 2876 return start; 2877 } 2878 2879 // Arguments: 2880 // 2881 // Inputs: 2882 // c_rarg0 - source byte array address 2883 // c_rarg1 - destination byte array address 2884 // c_rarg2 - K (key) in little endian int array 2885 // c_rarg3 - r vector byte array address 2886 // c_rarg4 - input length 2887 // 2888 // Output: 2889 // r0 - input length 2890 // 2891 address generate_cipherBlockChaining_decryptAESCrypt() { 2892 assert(UseAES, "need AES cryptographic extension support"); 2893 __ align(CodeEntryAlignment); 2894 StubGenStubId stub_id = StubGenStubId::cipherBlockChaining_decryptAESCrypt_id; 2895 StubCodeMark mark(this, stub_id); 2896 2897 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 2898 2899 const Register from = c_rarg0; // source array address 2900 const Register to = c_rarg1; // destination array address 2901 const Register key = c_rarg2; // key array address 2902 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2903 // and left with the results of the last encryption block 2904 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2905 const Register keylen = rscratch1; 2906 2907 address start = __ pc(); 2908 2909 __ enter(); 2910 2911 __ movw(rscratch2, len_reg); 2912 2913 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2914 2915 __ ld1(v2, __ T16B, rvec); 2916 2917 __ ld1(v31, __ T16B, __ post(key, 16)); 2918 __ rev32(v31, __ T16B, v31); 2919 2920 __ cmpw(keylen, 52); 2921 __ br(Assembler::CC, L_loadkeys_44); 2922 __ br(Assembler::EQ, L_loadkeys_52); 2923 2924 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2925 __ rev32(v17, __ T16B, v17); 2926 __ rev32(v18, __ T16B, v18); 2927 __ BIND(L_loadkeys_52); 2928 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2929 __ rev32(v19, __ T16B, v19); 2930 __ rev32(v20, __ T16B, v20); 2931 __ BIND(L_loadkeys_44); 2932 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2933 __ rev32(v21, __ T16B, v21); 2934 __ rev32(v22, __ T16B, v22); 2935 __ rev32(v23, __ T16B, v23); 2936 __ rev32(v24, __ T16B, v24); 2937 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2938 __ rev32(v25, __ T16B, v25); 2939 __ rev32(v26, __ T16B, v26); 2940 __ rev32(v27, __ T16B, v27); 2941 __ rev32(v28, __ T16B, v28); 2942 __ ld1(v29, v30, __ T16B, key); 2943 __ rev32(v29, __ T16B, v29); 2944 __ rev32(v30, __ T16B, v30); 2945 2946 __ BIND(L_aes_loop); 2947 __ ld1(v0, __ T16B, __ post(from, 16)); 2948 __ orr(v1, __ T16B, v0, v0); 2949 2950 __ br(Assembler::CC, L_rounds_44); 2951 __ br(Assembler::EQ, L_rounds_52); 2952 2953 __ aesd(v0, v17); __ aesimc(v0, v0); 2954 __ aesd(v0, v18); __ aesimc(v0, v0); 2955 __ BIND(L_rounds_52); 2956 __ aesd(v0, v19); __ aesimc(v0, v0); 2957 __ aesd(v0, v20); __ aesimc(v0, v0); 2958 
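    // Note: the key array length (keylen) is 44, 52 or 60 ints for
    // AES-128/192/256. The flags from cmpw(keylen, 52) remain valid inside
    // L_aes_loop (the intervening loads and vector ops do not set flags), so
    // CC/EQ branch directly to L_rounds_44/L_rounds_52 and the longest keys
    // fall through to execute the two extra leading rounds first.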
__ BIND(L_rounds_44); 2959 __ aesd(v0, v21); __ aesimc(v0, v0); 2960 __ aesd(v0, v22); __ aesimc(v0, v0); 2961 __ aesd(v0, v23); __ aesimc(v0, v0); 2962 __ aesd(v0, v24); __ aesimc(v0, v0); 2963 __ aesd(v0, v25); __ aesimc(v0, v0); 2964 __ aesd(v0, v26); __ aesimc(v0, v0); 2965 __ aesd(v0, v27); __ aesimc(v0, v0); 2966 __ aesd(v0, v28); __ aesimc(v0, v0); 2967 __ aesd(v0, v29); __ aesimc(v0, v0); 2968 __ aesd(v0, v30); 2969 __ eor(v0, __ T16B, v0, v31); 2970 __ eor(v0, __ T16B, v0, v2); 2971 2972 __ st1(v0, __ T16B, __ post(to, 16)); 2973 __ orr(v2, __ T16B, v1, v1); 2974 2975 __ subw(len_reg, len_reg, 16); 2976 __ cbnzw(len_reg, L_aes_loop); 2977 2978 __ st1(v2, __ T16B, rvec); 2979 2980 __ mov(r0, rscratch2); 2981 2982 __ leave(); 2983 __ ret(lr); 2984 2985 return start; 2986 } 2987 2988 // Big-endian 128-bit + 64-bit -> 128-bit addition. 2989 // Inputs: 128-bits. in is preserved. 2990 // The least-significant 64-bit word is in the upper dword of each vector. 2991 // inc (the 64-bit increment) is preserved. Its lower dword must be zero. 2992 // Output: result 2993 void be_add_128_64(FloatRegister result, FloatRegister in, 2994 FloatRegister inc, FloatRegister tmp) { 2995 assert_different_registers(result, tmp, inc); 2996 2997 __ addv(result, __ T2D, in, inc); // Add inc to the least-significant dword of 2998 // input 2999 __ cm(__ HI, tmp, __ T2D, inc, result);// Check for result overflowing 3000 __ ext(tmp, __ T16B, tmp, tmp, 0x08); // Swap LSD of comparison result to MSD and 3001 // MSD == 0 (must be!) to LSD 3002 __ subv(result, __ T2D, result, tmp); // Subtract -1 from MSD if there was an overflow 3003 } 3004 3005 // CTR AES crypt. 3006 // Arguments: 3007 // 3008 // Inputs: 3009 // c_rarg0 - source byte array address 3010 // c_rarg1 - destination byte array address 3011 // c_rarg2 - K (key) in little endian int array 3012 // c_rarg3 - counter vector byte array address 3013 // c_rarg4 - input length 3014 // c_rarg5 - saved encryptedCounter start 3015 // c_rarg6 - saved used length 3016 // 3017 // Output: 3018 // r0 - input length 3019 // 3020 address generate_counterMode_AESCrypt() { 3021 const Register in = c_rarg0; 3022 const Register out = c_rarg1; 3023 const Register key = c_rarg2; 3024 const Register counter = c_rarg3; 3025 const Register saved_len = c_rarg4, len = r10; 3026 const Register saved_encrypted_ctr = c_rarg5; 3027 const Register used_ptr = c_rarg6, used = r12; 3028 3029 const Register offset = r7; 3030 const Register keylen = r11; 3031 3032 const unsigned char block_size = 16; 3033 const int bulk_width = 4; 3034 // NB: bulk_width can be 4 or 8. 8 gives slightly faster 3035 // performance with larger data sizes, but it also means that the 3036 // fast path isn't used until you have at least 8 blocks, and up 3037 // to 127 bytes of data will be executed on the slow path. For 3038 // that reason, and also so as not to blow away too much icache, 4 3039 // blocks seems like a sensible compromise. 
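    // With bulk_width == 4 and a 16-byte block this means the wide path
    // (CTR_large_block) is only entered once at least 64 bytes remain;
    // shorter residues are handled one block, then one byte, at a time.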
3040 3041 // Algorithm: 3042 // 3043 // if (len == 0) { 3044 // goto DONE; 3045 // } 3046 // int result = len; 3047 // do { 3048 // if (used >= blockSize) { 3049 // if (len >= bulk_width * blockSize) { 3050 // CTR_large_block(); 3051 // if (len == 0) 3052 // goto DONE; 3053 // } 3054 // for (;;) { 3055 // 16ByteVector v0 = counter; 3056 // embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0); 3057 // used = 0; 3058 // if (len < blockSize) 3059 // break; /* goto NEXT */ 3060 // 16ByteVector v1 = load16Bytes(in, offset); 3061 // v1 = v1 ^ encryptedCounter; 3062 // store16Bytes(out, offset); 3063 // used = blockSize; 3064 // offset += blockSize; 3065 // len -= blockSize; 3066 // if (len == 0) 3067 // goto DONE; 3068 // } 3069 // } 3070 // NEXT: 3071 // out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]); 3072 // len--; 3073 // } while (len != 0); 3074 // DONE: 3075 // return result; 3076 // 3077 // CTR_large_block() 3078 // Wide bulk encryption of whole blocks. 3079 3080 __ align(CodeEntryAlignment); 3081 StubGenStubId stub_id = StubGenStubId::counterMode_AESCrypt_id; 3082 StubCodeMark mark(this, stub_id); 3083 const address start = __ pc(); 3084 __ enter(); 3085 3086 Label DONE, CTR_large_block, large_block_return; 3087 __ ldrw(used, Address(used_ptr)); 3088 __ cbzw(saved_len, DONE); 3089 3090 __ mov(len, saved_len); 3091 __ mov(offset, 0); 3092 3093 // Compute #rounds for AES based on the length of the key array 3094 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 3095 3096 __ aesenc_loadkeys(key, keylen); 3097 3098 { 3099 Label L_CTR_loop, NEXT; 3100 3101 __ bind(L_CTR_loop); 3102 3103 __ cmp(used, block_size); 3104 __ br(__ LO, NEXT); 3105 3106 // Maybe we have a lot of data 3107 __ subsw(rscratch1, len, bulk_width * block_size); 3108 __ br(__ HS, CTR_large_block); 3109 __ BIND(large_block_return); 3110 __ cbzw(len, DONE); 3111 3112 // Setup the counter 3113 __ movi(v4, __ T4S, 0); 3114 __ movi(v5, __ T4S, 1); 3115 __ ins(v4, __ S, v5, 2, 2); // v4 contains { 0, 1 } 3116 3117 // 128-bit big-endian increment 3118 __ ld1(v0, __ T16B, counter); 3119 __ rev64(v16, __ T16B, v0); 3120 be_add_128_64(v16, v16, v4, /*tmp*/v5); 3121 __ rev64(v16, __ T16B, v16); 3122 __ st1(v16, __ T16B, counter); 3123 // Previous counter value is in v0 3124 // v4 contains { 0, 1 } 3125 3126 { 3127 // We have fewer than bulk_width blocks of data left. Encrypt 3128 // them one by one until there is less than a full block 3129 // remaining, being careful to save both the encrypted counter 3130 // and the counter. 3131 3132 Label inner_loop; 3133 __ bind(inner_loop); 3134 // Counter to encrypt is in v0 3135 __ aesecb_encrypt(noreg, noreg, keylen); 3136 __ st1(v0, __ T16B, saved_encrypted_ctr); 3137 3138 // Do we have a remaining full block? 
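    // used is reset to zero first: none of the freshly encrypted counter bytes
    // in saved_encrypted_ctr have been consumed yet. If a full block remains
    // it is XORed below and used becomes block_size; otherwise the
    // byte-at-a-time path at NEXT consumes the encrypted counter gradually.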
3139 3140 __ mov(used, 0); 3141 __ cmp(len, block_size); 3142 __ br(__ LO, NEXT); 3143 3144 // Yes, we have a full block 3145 __ ldrq(v1, Address(in, offset)); 3146 __ eor(v1, __ T16B, v1, v0); 3147 __ strq(v1, Address(out, offset)); 3148 __ mov(used, block_size); 3149 __ add(offset, offset, block_size); 3150 3151 __ subw(len, len, block_size); 3152 __ cbzw(len, DONE); 3153 3154 // Increment the counter, store it back 3155 __ orr(v0, __ T16B, v16, v16); 3156 __ rev64(v16, __ T16B, v16); 3157 be_add_128_64(v16, v16, v4, /*tmp*/v5); 3158 __ rev64(v16, __ T16B, v16); 3159 __ st1(v16, __ T16B, counter); // Save the incremented counter back 3160 3161 __ b(inner_loop); 3162 } 3163 3164 __ BIND(NEXT); 3165 3166 // Encrypt a single byte, and loop. 3167 // We expect this to be a rare event. 3168 __ ldrb(rscratch1, Address(in, offset)); 3169 __ ldrb(rscratch2, Address(saved_encrypted_ctr, used)); 3170 __ eor(rscratch1, rscratch1, rscratch2); 3171 __ strb(rscratch1, Address(out, offset)); 3172 __ add(offset, offset, 1); 3173 __ add(used, used, 1); 3174 __ subw(len, len,1); 3175 __ cbnzw(len, L_CTR_loop); 3176 } 3177 3178 __ bind(DONE); 3179 __ strw(used, Address(used_ptr)); 3180 __ mov(r0, saved_len); 3181 3182 __ leave(); // required for proper stackwalking of RuntimeStub frame 3183 __ ret(lr); 3184 3185 // Bulk encryption 3186 3187 __ BIND (CTR_large_block); 3188 assert(bulk_width == 4 || bulk_width == 8, "must be"); 3189 3190 if (bulk_width == 8) { 3191 __ sub(sp, sp, 4 * 16); 3192 __ st1(v12, v13, v14, v15, __ T16B, Address(sp)); 3193 } 3194 __ sub(sp, sp, 4 * 16); 3195 __ st1(v8, v9, v10, v11, __ T16B, Address(sp)); 3196 RegSet saved_regs = (RegSet::of(in, out, offset) 3197 + RegSet::of(saved_encrypted_ctr, used_ptr, len)); 3198 __ push(saved_regs, sp); 3199 __ andr(len, len, -16 * bulk_width); // 8/4 encryptions, 16 bytes per encryption 3200 __ add(in, in, offset); 3201 __ add(out, out, offset); 3202 3203 // Keys should already be loaded into the correct registers 3204 3205 __ ld1(v0, __ T16B, counter); // v0 contains the first counter 3206 __ rev64(v16, __ T16B, v0); // v16 contains byte-reversed counter 3207 3208 // AES/CTR loop 3209 { 3210 Label L_CTR_loop; 3211 __ BIND(L_CTR_loop); 3212 3213 // Setup the counters 3214 __ movi(v8, __ T4S, 0); 3215 __ movi(v9, __ T4S, 1); 3216 __ ins(v8, __ S, v9, 2, 2); // v8 contains { 0, 1 } 3217 3218 for (int i = 0; i < bulk_width; i++) { 3219 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i); 3220 __ rev64(v0_ofs, __ T16B, v16); 3221 be_add_128_64(v16, v16, v8, /*tmp*/v9); 3222 } 3223 3224 __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16)); 3225 3226 // Encrypt the counters 3227 __ aesecb_encrypt(noreg, noreg, keylen, v0, bulk_width); 3228 3229 if (bulk_width == 8) { 3230 __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16)); 3231 } 3232 3233 // XOR the encrypted counters with the inputs 3234 for (int i = 0; i < bulk_width; i++) { 3235 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i); 3236 FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i); 3237 __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs); 3238 } 3239 3240 // Write the encrypted data 3241 __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16)); 3242 if (bulk_width == 8) { 3243 __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16)); 3244 } 3245 3246 __ subw(len, len, 16 * bulk_width); 3247 __ cbnzw(len, L_CTR_loop); 3248 } 3249 3250 // Save the counter back where it goes 3251 __ rev64(v16, __ T16B, v16); 3252 __ st1(v16, __ T16B, counter); 3253 3254 __ pop(saved_regs, sp); 
3255 3256 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16)); 3257 if (bulk_width == 8) { 3258 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16)); 3259 } 3260 3261 __ andr(rscratch1, len, -16 * bulk_width); 3262 __ sub(len, len, rscratch1); 3263 __ add(offset, offset, rscratch1); 3264 __ mov(used, 16); 3265 __ strw(used, Address(used_ptr)); 3266 __ b(large_block_return); 3267 3268 return start; 3269 } 3270 3271 // Vector AES Galois Counter Mode implementation. Parameters: 3272 // 3273 // in = c_rarg0 3274 // len = c_rarg1 3275 // ct = c_rarg2 - ciphertext that ghash will read (in for encrypt, out for decrypt) 3276 // out = c_rarg3 3277 // key = c_rarg4 3278 // state = c_rarg5 - GHASH.state 3279 // subkeyHtbl = c_rarg6 - powers of H 3280 // counter = c_rarg7 - 16 bytes of CTR 3281 // return - number of processed bytes 3282 address generate_galoisCounterMode_AESCrypt() { 3283 address ghash_polynomial = __ pc(); 3284 __ emit_int64(0x87); // The low-order bits of the field 3285 // polynomial (i.e. p = z^7+z^2+z+1) 3286 // repeated in the low and high parts of a 3287 // 128-bit vector 3288 __ emit_int64(0x87); 3289 3290 __ align(CodeEntryAlignment); 3291 StubGenStubId stub_id = StubGenStubId::galoisCounterMode_AESCrypt_id; 3292 StubCodeMark mark(this, stub_id); 3293 address start = __ pc(); 3294 __ enter(); 3295 3296 const Register in = c_rarg0; 3297 const Register len = c_rarg1; 3298 const Register ct = c_rarg2; 3299 const Register out = c_rarg3; 3300 // and updated with the incremented counter in the end 3301 3302 const Register key = c_rarg4; 3303 const Register state = c_rarg5; 3304 3305 const Register subkeyHtbl = c_rarg6; 3306 3307 const Register counter = c_rarg7; 3308 3309 const Register keylen = r10; 3310 // Save state before entering routine 3311 __ sub(sp, sp, 4 * 16); 3312 __ st1(v12, v13, v14, v15, __ T16B, Address(sp)); 3313 __ sub(sp, sp, 4 * 16); 3314 __ st1(v8, v9, v10, v11, __ T16B, Address(sp)); 3315 3316 // __ andr(len, len, -512); 3317 __ andr(len, len, -16 * 8); // 8 encryptions, 16 bytes per encryption 3318 __ str(len, __ pre(sp, -2 * wordSize)); 3319 3320 Label DONE; 3321 __ cbz(len, DONE); 3322 3323 // Compute #rounds for AES based on the length of the key array 3324 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 3325 3326 __ aesenc_loadkeys(key, keylen); 3327 __ ld1(v0, __ T16B, counter); // v0 contains the first counter 3328 __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter 3329 3330 // AES/CTR loop 3331 { 3332 Label L_CTR_loop; 3333 __ BIND(L_CTR_loop); 3334 3335 // Setup the counters 3336 __ movi(v8, __ T4S, 0); 3337 __ movi(v9, __ T4S, 1); 3338 __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 } 3339 3340 assert(v0->encoding() < v8->encoding(), ""); 3341 for (int i = v0->encoding(); i < v8->encoding(); i++) { 3342 FloatRegister f = as_FloatRegister(i); 3343 __ rev32(f, __ T16B, v16); 3344 __ addv(v16, __ T4S, v16, v8); 3345 } 3346 3347 __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16)); 3348 3349 // Encrypt the counters 3350 __ aesecb_encrypt(noreg, noreg, keylen, v0, /*unrolls*/8); 3351 3352 __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16)); 3353 3354 // XOR the encrypted counters with the inputs 3355 for (int i = 0; i < 8; i++) { 3356 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i); 3357 FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i); 3358 __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs); 3359 } 3360 __ st1(v0, v1, v2, v3, __ T16B, __ 
post(out, 4 * 16)); 3361 __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16)); 3362 3363 __ subw(len, len, 16 * 8); 3364 __ cbnzw(len, L_CTR_loop); 3365 } 3366 3367 __ rev32(v16, __ T16B, v16); 3368 __ st1(v16, __ T16B, counter); 3369 3370 __ ldr(len, Address(sp)); 3371 __ lsr(len, len, exact_log2(16)); // We want the count of blocks 3372 3373 // GHASH/CTR loop 3374 __ ghash_processBlocks_wide(ghash_polynomial, state, subkeyHtbl, ct, 3375 len, /*unrolls*/4); 3376 3377 #ifdef ASSERT 3378 { Label L; 3379 __ cmp(len, (unsigned char)0); 3380 __ br(Assembler::EQ, L); 3381 __ stop("stubGenerator: abort"); 3382 __ bind(L); 3383 } 3384 #endif 3385 3386 __ bind(DONE); 3387 // Return the number of bytes processed 3388 __ ldr(r0, __ post(sp, 2 * wordSize)); 3389 3390 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16)); 3391 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16)); 3392 3393 __ leave(); // required for proper stackwalking of RuntimeStub frame 3394 __ ret(lr); 3395 return start; 3396 } 3397 3398 class Cached64Bytes { 3399 private: 3400 MacroAssembler *_masm; 3401 Register _regs[8]; 3402 3403 public: 3404 Cached64Bytes(MacroAssembler *masm, RegSet rs): _masm(masm) { 3405 assert(rs.size() == 8, "%u registers are used to cache 16 4-byte data", rs.size()); 3406 auto it = rs.begin(); 3407 for (auto &r: _regs) { 3408 r = *it; 3409 ++it; 3410 } 3411 } 3412 3413 void gen_loads(Register base) { 3414 for (int i = 0; i < 8; i += 2) { 3415 __ ldp(_regs[i], _regs[i + 1], Address(base, 8 * i)); 3416 } 3417 } 3418 3419 // Generate code extracting i-th unsigned word (4 bytes) from cached 64 bytes. 3420 void extract_u32(Register dest, int i) { 3421 __ ubfx(dest, _regs[i / 2], 32 * (i % 2), 32); 3422 } 3423 }; 3424 3425 // Utility routines for md5. 3426 // Clobbers r10 and r11. 
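  // For reference, a scalar sketch (not generated code) of what the four helpers
  // below compute, following RFC 1321:
  //   FF: f = (b & c) | (~b & d), computed here as ((c ^ d) & b) ^ d
  //   GG: g = (b & d) | (c & ~d), computed as the sum of the two disjoint terms
  //   HH: h = b ^ c ^ d
  //   II: i = c ^ (b | ~d)
  // Each helper then updates r1 = rotl32(r1 + f + x[k] + t, s) + r2, where x[k]
  // is the k-th 32-bit word of the cached 64-byte block and t is the round constant.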
3427 void md5_FF(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4, 3428 int k, int s, int t) { 3429 Register rscratch3 = r10; 3430 Register rscratch4 = r11; 3431 3432 __ eorw(rscratch3, r3, r4); 3433 __ movw(rscratch2, t); 3434 __ andw(rscratch3, rscratch3, r2); 3435 __ addw(rscratch4, r1, rscratch2); 3436 reg_cache.extract_u32(rscratch1, k); 3437 __ eorw(rscratch3, rscratch3, r4); 3438 __ addw(rscratch4, rscratch4, rscratch1); 3439 __ addw(rscratch3, rscratch3, rscratch4); 3440 __ rorw(rscratch2, rscratch3, 32 - s); 3441 __ addw(r1, rscratch2, r2); 3442 } 3443 3444 void md5_GG(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4, 3445 int k, int s, int t) { 3446 Register rscratch3 = r10; 3447 Register rscratch4 = r11; 3448 3449 reg_cache.extract_u32(rscratch1, k); 3450 __ movw(rscratch2, t); 3451 __ addw(rscratch4, r1, rscratch2); 3452 __ addw(rscratch4, rscratch4, rscratch1); 3453 __ bicw(rscratch2, r3, r4); 3454 __ andw(rscratch3, r2, r4); 3455 __ addw(rscratch2, rscratch2, rscratch4); 3456 __ addw(rscratch2, rscratch2, rscratch3); 3457 __ rorw(rscratch2, rscratch2, 32 - s); 3458 __ addw(r1, rscratch2, r2); 3459 } 3460 3461 void md5_HH(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4, 3462 int k, int s, int t) { 3463 Register rscratch3 = r10; 3464 Register rscratch4 = r11; 3465 3466 __ eorw(rscratch3, r3, r4); 3467 __ movw(rscratch2, t); 3468 __ addw(rscratch4, r1, rscratch2); 3469 reg_cache.extract_u32(rscratch1, k); 3470 __ eorw(rscratch3, rscratch3, r2); 3471 __ addw(rscratch4, rscratch4, rscratch1); 3472 __ addw(rscratch3, rscratch3, rscratch4); 3473 __ rorw(rscratch2, rscratch3, 32 - s); 3474 __ addw(r1, rscratch2, r2); 3475 } 3476 3477 void md5_II(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4, 3478 int k, int s, int t) { 3479 Register rscratch3 = r10; 3480 Register rscratch4 = r11; 3481 3482 __ movw(rscratch3, t); 3483 __ ornw(rscratch2, r2, r4); 3484 __ addw(rscratch4, r1, rscratch3); 3485 reg_cache.extract_u32(rscratch1, k); 3486 __ eorw(rscratch3, rscratch2, r3); 3487 __ addw(rscratch4, rscratch4, rscratch1); 3488 __ addw(rscratch3, rscratch3, rscratch4); 3489 __ rorw(rscratch2, rscratch3, 32 - s); 3490 __ addw(r1, rscratch2, r2); 3491 } 3492 3493 // Arguments: 3494 // 3495 // Inputs: 3496 // c_rarg0 - byte[] source+offset 3497 // c_rarg1 - int[] SHA.state 3498 // c_rarg2 - int offset 3499 // c_rarg3 - int limit 3500 // 3501 address generate_md5_implCompress(StubGenStubId stub_id) { 3502 bool multi_block; 3503 switch (stub_id) { 3504 case md5_implCompress_id: 3505 multi_block = false; 3506 break; 3507 case md5_implCompressMB_id: 3508 multi_block = true; 3509 break; 3510 default: 3511 ShouldNotReachHere(); 3512 } 3513 __ align(CodeEntryAlignment); 3514 3515 StubCodeMark mark(this, stub_id); 3516 address start = __ pc(); 3517 3518 Register buf = c_rarg0; 3519 Register state = c_rarg1; 3520 Register ofs = c_rarg2; 3521 Register limit = c_rarg3; 3522 Register a = r4; 3523 Register b = r5; 3524 Register c = r6; 3525 Register d = r7; 3526 Register rscratch3 = r10; 3527 Register rscratch4 = r11; 3528 3529 Register state_regs[2] = { r12, r13 }; 3530 RegSet saved_regs = RegSet::range(r16, r22) - r18_tls; 3531 Cached64Bytes reg_cache(_masm, RegSet::of(r14, r15) + saved_regs); // using 8 registers 3532 3533 __ push(saved_regs, sp); 3534 3535 __ ldp(state_regs[0], state_regs[1], Address(state)); 3536 __ ubfx(a, state_regs[0], 0, 32); 3537 __ ubfx(b, state_regs[0], 32, 32); 3538 __ 
ubfx(c, state_regs[1], 0, 32); 3539 __ ubfx(d, state_regs[1], 32, 32); 3540 3541 Label md5_loop; 3542 __ BIND(md5_loop); 3543 3544 reg_cache.gen_loads(buf); 3545 3546 // Round 1 3547 md5_FF(reg_cache, a, b, c, d, 0, 7, 0xd76aa478); 3548 md5_FF(reg_cache, d, a, b, c, 1, 12, 0xe8c7b756); 3549 md5_FF(reg_cache, c, d, a, b, 2, 17, 0x242070db); 3550 md5_FF(reg_cache, b, c, d, a, 3, 22, 0xc1bdceee); 3551 md5_FF(reg_cache, a, b, c, d, 4, 7, 0xf57c0faf); 3552 md5_FF(reg_cache, d, a, b, c, 5, 12, 0x4787c62a); 3553 md5_FF(reg_cache, c, d, a, b, 6, 17, 0xa8304613); 3554 md5_FF(reg_cache, b, c, d, a, 7, 22, 0xfd469501); 3555 md5_FF(reg_cache, a, b, c, d, 8, 7, 0x698098d8); 3556 md5_FF(reg_cache, d, a, b, c, 9, 12, 0x8b44f7af); 3557 md5_FF(reg_cache, c, d, a, b, 10, 17, 0xffff5bb1); 3558 md5_FF(reg_cache, b, c, d, a, 11, 22, 0x895cd7be); 3559 md5_FF(reg_cache, a, b, c, d, 12, 7, 0x6b901122); 3560 md5_FF(reg_cache, d, a, b, c, 13, 12, 0xfd987193); 3561 md5_FF(reg_cache, c, d, a, b, 14, 17, 0xa679438e); 3562 md5_FF(reg_cache, b, c, d, a, 15, 22, 0x49b40821); 3563 3564 // Round 2 3565 md5_GG(reg_cache, a, b, c, d, 1, 5, 0xf61e2562); 3566 md5_GG(reg_cache, d, a, b, c, 6, 9, 0xc040b340); 3567 md5_GG(reg_cache, c, d, a, b, 11, 14, 0x265e5a51); 3568 md5_GG(reg_cache, b, c, d, a, 0, 20, 0xe9b6c7aa); 3569 md5_GG(reg_cache, a, b, c, d, 5, 5, 0xd62f105d); 3570 md5_GG(reg_cache, d, a, b, c, 10, 9, 0x02441453); 3571 md5_GG(reg_cache, c, d, a, b, 15, 14, 0xd8a1e681); 3572 md5_GG(reg_cache, b, c, d, a, 4, 20, 0xe7d3fbc8); 3573 md5_GG(reg_cache, a, b, c, d, 9, 5, 0x21e1cde6); 3574 md5_GG(reg_cache, d, a, b, c, 14, 9, 0xc33707d6); 3575 md5_GG(reg_cache, c, d, a, b, 3, 14, 0xf4d50d87); 3576 md5_GG(reg_cache, b, c, d, a, 8, 20, 0x455a14ed); 3577 md5_GG(reg_cache, a, b, c, d, 13, 5, 0xa9e3e905); 3578 md5_GG(reg_cache, d, a, b, c, 2, 9, 0xfcefa3f8); 3579 md5_GG(reg_cache, c, d, a, b, 7, 14, 0x676f02d9); 3580 md5_GG(reg_cache, b, c, d, a, 12, 20, 0x8d2a4c8a); 3581 3582 // Round 3 3583 md5_HH(reg_cache, a, b, c, d, 5, 4, 0xfffa3942); 3584 md5_HH(reg_cache, d, a, b, c, 8, 11, 0x8771f681); 3585 md5_HH(reg_cache, c, d, a, b, 11, 16, 0x6d9d6122); 3586 md5_HH(reg_cache, b, c, d, a, 14, 23, 0xfde5380c); 3587 md5_HH(reg_cache, a, b, c, d, 1, 4, 0xa4beea44); 3588 md5_HH(reg_cache, d, a, b, c, 4, 11, 0x4bdecfa9); 3589 md5_HH(reg_cache, c, d, a, b, 7, 16, 0xf6bb4b60); 3590 md5_HH(reg_cache, b, c, d, a, 10, 23, 0xbebfbc70); 3591 md5_HH(reg_cache, a, b, c, d, 13, 4, 0x289b7ec6); 3592 md5_HH(reg_cache, d, a, b, c, 0, 11, 0xeaa127fa); 3593 md5_HH(reg_cache, c, d, a, b, 3, 16, 0xd4ef3085); 3594 md5_HH(reg_cache, b, c, d, a, 6, 23, 0x04881d05); 3595 md5_HH(reg_cache, a, b, c, d, 9, 4, 0xd9d4d039); 3596 md5_HH(reg_cache, d, a, b, c, 12, 11, 0xe6db99e5); 3597 md5_HH(reg_cache, c, d, a, b, 15, 16, 0x1fa27cf8); 3598 md5_HH(reg_cache, b, c, d, a, 2, 23, 0xc4ac5665); 3599 3600 // Round 4 3601 md5_II(reg_cache, a, b, c, d, 0, 6, 0xf4292244); 3602 md5_II(reg_cache, d, a, b, c, 7, 10, 0x432aff97); 3603 md5_II(reg_cache, c, d, a, b, 14, 15, 0xab9423a7); 3604 md5_II(reg_cache, b, c, d, a, 5, 21, 0xfc93a039); 3605 md5_II(reg_cache, a, b, c, d, 12, 6, 0x655b59c3); 3606 md5_II(reg_cache, d, a, b, c, 3, 10, 0x8f0ccc92); 3607 md5_II(reg_cache, c, d, a, b, 10, 15, 0xffeff47d); 3608 md5_II(reg_cache, b, c, d, a, 1, 21, 0x85845dd1); 3609 md5_II(reg_cache, a, b, c, d, 8, 6, 0x6fa87e4f); 3610 md5_II(reg_cache, d, a, b, c, 15, 10, 0xfe2ce6e0); 3611 md5_II(reg_cache, c, d, a, b, 6, 15, 0xa3014314); 3612 md5_II(reg_cache, b, c, d, a, 13, 21, 0x4e0811a1); 3613 
md5_II(reg_cache, a, b, c, d, 4, 6, 0xf7537e82); 3614 md5_II(reg_cache, d, a, b, c, 11, 10, 0xbd3af235); 3615 md5_II(reg_cache, c, d, a, b, 2, 15, 0x2ad7d2bb); 3616 md5_II(reg_cache, b, c, d, a, 9, 21, 0xeb86d391); 3617 3618 __ addw(a, state_regs[0], a); 3619 __ ubfx(rscratch2, state_regs[0], 32, 32); 3620 __ addw(b, rscratch2, b); 3621 __ addw(c, state_regs[1], c); 3622 __ ubfx(rscratch4, state_regs[1], 32, 32); 3623 __ addw(d, rscratch4, d); 3624 3625 __ orr(state_regs[0], a, b, Assembler::LSL, 32); 3626 __ orr(state_regs[1], c, d, Assembler::LSL, 32); 3627 3628 if (multi_block) { 3629 __ add(buf, buf, 64); 3630 __ add(ofs, ofs, 64); 3631 __ cmp(ofs, limit); 3632 __ br(Assembler::LE, md5_loop); 3633 __ mov(c_rarg0, ofs); // return ofs 3634 } 3635 3636 // write hash values back in the correct order 3637 __ stp(state_regs[0], state_regs[1], Address(state)); 3638 3639 __ pop(saved_regs, sp); 3640 3641 __ ret(lr); 3642 3643 return start; 3644 } 3645 3646 // Arguments: 3647 // 3648 // Inputs: 3649 // c_rarg0 - byte[] source+offset 3650 // c_rarg1 - int[] SHA.state 3651 // c_rarg2 - int offset 3652 // c_rarg3 - int limit 3653 // 3654 address generate_sha1_implCompress(StubGenStubId stub_id) { 3655 bool multi_block; 3656 switch (stub_id) { 3657 case sha1_implCompress_id: 3658 multi_block = false; 3659 break; 3660 case sha1_implCompressMB_id: 3661 multi_block = true; 3662 break; 3663 default: 3664 ShouldNotReachHere(); 3665 } 3666 3667 __ align(CodeEntryAlignment); 3668 3669 StubCodeMark mark(this, stub_id); 3670 address start = __ pc(); 3671 3672 Register buf = c_rarg0; 3673 Register state = c_rarg1; 3674 Register ofs = c_rarg2; 3675 Register limit = c_rarg3; 3676 3677 Label keys; 3678 Label sha1_loop; 3679 3680 // load the keys into v0..v3 3681 __ adr(rscratch1, keys); 3682 __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1)); 3683 // load 5 words state into v6, v7 3684 __ ldrq(v6, Address(state, 0)); 3685 __ ldrs(v7, Address(state, 16)); 3686 3687 3688 __ BIND(sha1_loop); 3689 // load 64 bytes of data into v16..v19 3690 __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf); 3691 __ rev32(v16, __ T16B, v16); 3692 __ rev32(v17, __ T16B, v17); 3693 __ rev32(v18, __ T16B, v18); 3694 __ rev32(v19, __ T16B, v19); 3695 3696 // do the sha1 3697 __ addv(v4, __ T4S, v16, v0); 3698 __ orr(v20, __ T16B, v6, v6); 3699 3700 FloatRegister d0 = v16; 3701 FloatRegister d1 = v17; 3702 FloatRegister d2 = v18; 3703 FloatRegister d3 = v19; 3704 3705 for (int round = 0; round < 20; round++) { 3706 FloatRegister tmp1 = (round & 1) ? v4 : v5; 3707 FloatRegister tmp2 = (round & 1) ? v21 : v22; 3708 FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7; 3709 FloatRegister tmp4 = (round & 1) ? v5 : v4; 3710 FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? 
v2 : v3)); 3711 3712 if (round < 16) __ sha1su0(d0, __ T4S, d1, d2); 3713 if (round < 19) __ addv(tmp1, __ T4S, d1, key); 3714 __ sha1h(tmp2, __ T4S, v20); 3715 if (round < 5) 3716 __ sha1c(v20, __ T4S, tmp3, tmp4); 3717 else if (round < 10 || round >= 15) 3718 __ sha1p(v20, __ T4S, tmp3, tmp4); 3719 else 3720 __ sha1m(v20, __ T4S, tmp3, tmp4); 3721 if (round < 16) __ sha1su1(d0, __ T4S, d3); 3722 3723 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 3724 } 3725 3726 __ addv(v7, __ T2S, v7, v21); 3727 __ addv(v6, __ T4S, v6, v20); 3728 3729 if (multi_block) { 3730 __ add(ofs, ofs, 64); 3731 __ cmp(ofs, limit); 3732 __ br(Assembler::LE, sha1_loop); 3733 __ mov(c_rarg0, ofs); // return ofs 3734 } 3735 3736 __ strq(v6, Address(state, 0)); 3737 __ strs(v7, Address(state, 16)); 3738 3739 __ ret(lr); 3740 3741 __ bind(keys); 3742 __ emit_int32(0x5a827999); 3743 __ emit_int32(0x6ed9eba1); 3744 __ emit_int32(0x8f1bbcdc); 3745 __ emit_int32(0xca62c1d6); 3746 3747 return start; 3748 } 3749 3750 3751 // Arguments: 3752 // 3753 // Inputs: 3754 // c_rarg0 - byte[] source+offset 3755 // c_rarg1 - int[] SHA.state 3756 // c_rarg2 - int offset 3757 // c_rarg3 - int limit 3758 // 3759 address generate_sha256_implCompress(StubGenStubId stub_id) { 3760 bool multi_block; 3761 switch (stub_id) { 3762 case sha256_implCompress_id: 3763 multi_block = false; 3764 break; 3765 case sha256_implCompressMB_id: 3766 multi_block = true; 3767 break; 3768 default: 3769 ShouldNotReachHere(); 3770 } 3771 3772 static const uint32_t round_consts[64] = { 3773 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 3774 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, 3775 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 3776 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 3777 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 3778 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, 3779 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 3780 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, 3781 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 3782 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 3783 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 3784 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, 3785 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 3786 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, 3787 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 3788 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, 3789 }; 3790 3791 __ align(CodeEntryAlignment); 3792 3793 StubCodeMark mark(this, stub_id); 3794 address start = __ pc(); 3795 3796 Register buf = c_rarg0; 3797 Register state = c_rarg1; 3798 Register ofs = c_rarg2; 3799 Register limit = c_rarg3; 3800 3801 Label sha1_loop; 3802 3803 __ stpd(v8, v9, __ pre(sp, -32)); 3804 __ stpd(v10, v11, Address(sp, 16)); 3805 3806 // dga == v0 3807 // dgb == v1 3808 // dg0 == v2 3809 // dg1 == v3 3810 // dg2 == v4 3811 // t0 == v6 3812 // t1 == v7 3813 3814 // load 16 keys to v16..v31 3815 __ lea(rscratch1, ExternalAddress((address)round_consts)); 3816 __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64)); 3817 __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64)); 3818 __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64)); 3819 __ ld1(v28, v29, v30, v31, __ T4S, rscratch1); 3820 3821 // load 8 words (256 bits) state 3822 __ ldpq(v0, v1, state); 3823 3824 __ BIND(sha1_loop); 3825 // load 64 bytes of data into v8..v11 3826 __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? 
__ post(buf, 64) : buf); 3827 __ rev32(v8, __ T16B, v8); 3828 __ rev32(v9, __ T16B, v9); 3829 __ rev32(v10, __ T16B, v10); 3830 __ rev32(v11, __ T16B, v11); 3831 3832 __ addv(v6, __ T4S, v8, v16); 3833 __ orr(v2, __ T16B, v0, v0); 3834 __ orr(v3, __ T16B, v1, v1); 3835 3836 FloatRegister d0 = v8; 3837 FloatRegister d1 = v9; 3838 FloatRegister d2 = v10; 3839 FloatRegister d3 = v11; 3840 3841 3842 for (int round = 0; round < 16; round++) { 3843 FloatRegister tmp1 = (round & 1) ? v6 : v7; 3844 FloatRegister tmp2 = (round & 1) ? v7 : v6; 3845 FloatRegister tmp3 = (round & 1) ? v2 : v4; 3846 FloatRegister tmp4 = (round & 1) ? v4 : v2; 3847 3848 if (round < 12) __ sha256su0(d0, __ T4S, d1); 3849 __ orr(v4, __ T16B, v2, v2); 3850 if (round < 15) 3851 __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17)); 3852 __ sha256h(v2, __ T4S, v3, tmp2); 3853 __ sha256h2(v3, __ T4S, v4, tmp2); 3854 if (round < 12) __ sha256su1(d0, __ T4S, d2, d3); 3855 3856 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 3857 } 3858 3859 __ addv(v0, __ T4S, v0, v2); 3860 __ addv(v1, __ T4S, v1, v3); 3861 3862 if (multi_block) { 3863 __ add(ofs, ofs, 64); 3864 __ cmp(ofs, limit); 3865 __ br(Assembler::LE, sha1_loop); 3866 __ mov(c_rarg0, ofs); // return ofs 3867 } 3868 3869 __ ldpd(v10, v11, Address(sp, 16)); 3870 __ ldpd(v8, v9, __ post(sp, 32)); 3871 3872 __ stpq(v0, v1, state); 3873 3874 __ ret(lr); 3875 3876 return start; 3877 } 3878 3879 // Double rounds for sha512. 3880 void sha512_dround(int dr, 3881 FloatRegister vi0, FloatRegister vi1, 3882 FloatRegister vi2, FloatRegister vi3, 3883 FloatRegister vi4, FloatRegister vrc0, 3884 FloatRegister vrc1, FloatRegister vin0, 3885 FloatRegister vin1, FloatRegister vin2, 3886 FloatRegister vin3, FloatRegister vin4) { 3887 if (dr < 36) { 3888 __ ld1(vrc1, __ T2D, __ post(rscratch2, 16)); 3889 } 3890 __ addv(v5, __ T2D, vrc0, vin0); 3891 __ ext(v6, __ T16B, vi2, vi3, 8); 3892 __ ext(v5, __ T16B, v5, v5, 8); 3893 __ ext(v7, __ T16B, vi1, vi2, 8); 3894 __ addv(vi3, __ T2D, vi3, v5); 3895 if (dr < 32) { 3896 __ ext(v5, __ T16B, vin3, vin4, 8); 3897 __ sha512su0(vin0, __ T2D, vin1); 3898 } 3899 __ sha512h(vi3, __ T2D, v6, v7); 3900 if (dr < 32) { 3901 __ sha512su1(vin0, __ T2D, vin2, v5); 3902 } 3903 __ addv(vi4, __ T2D, vi1, vi3); 3904 __ sha512h2(vi3, __ T2D, vi1, vi0); 3905 } 3906 3907 // Arguments: 3908 // 3909 // Inputs: 3910 // c_rarg0 - byte[] source+offset 3911 // c_rarg1 - int[] SHA.state 3912 // c_rarg2 - int offset 3913 // c_rarg3 - int limit 3914 // 3915 address generate_sha512_implCompress(StubGenStubId stub_id) { 3916 bool multi_block; 3917 switch (stub_id) { 3918 case sha512_implCompress_id: 3919 multi_block = false; 3920 break; 3921 case sha512_implCompressMB_id: 3922 multi_block = true; 3923 break; 3924 default: 3925 ShouldNotReachHere(); 3926 } 3927 3928 static const uint64_t round_consts[80] = { 3929 0x428A2F98D728AE22L, 0x7137449123EF65CDL, 0xB5C0FBCFEC4D3B2FL, 3930 0xE9B5DBA58189DBBCL, 0x3956C25BF348B538L, 0x59F111F1B605D019L, 3931 0x923F82A4AF194F9BL, 0xAB1C5ED5DA6D8118L, 0xD807AA98A3030242L, 3932 0x12835B0145706FBEL, 0x243185BE4EE4B28CL, 0x550C7DC3D5FFB4E2L, 3933 0x72BE5D74F27B896FL, 0x80DEB1FE3B1696B1L, 0x9BDC06A725C71235L, 3934 0xC19BF174CF692694L, 0xE49B69C19EF14AD2L, 0xEFBE4786384F25E3L, 3935 0x0FC19DC68B8CD5B5L, 0x240CA1CC77AC9C65L, 0x2DE92C6F592B0275L, 3936 0x4A7484AA6EA6E483L, 0x5CB0A9DCBD41FBD4L, 0x76F988DA831153B5L, 3937 0x983E5152EE66DFABL, 0xA831C66D2DB43210L, 0xB00327C898FB213FL, 3938 0xBF597FC7BEEF0EE4L, 0xC6E00BF33DA88FC2L, 0xD5A79147930AA725L, 
3939 0x06CA6351E003826FL, 0x142929670A0E6E70L, 0x27B70A8546D22FFCL, 3940 0x2E1B21385C26C926L, 0x4D2C6DFC5AC42AEDL, 0x53380D139D95B3DFL, 3941 0x650A73548BAF63DEL, 0x766A0ABB3C77B2A8L, 0x81C2C92E47EDAEE6L, 3942 0x92722C851482353BL, 0xA2BFE8A14CF10364L, 0xA81A664BBC423001L, 3943 0xC24B8B70D0F89791L, 0xC76C51A30654BE30L, 0xD192E819D6EF5218L, 3944 0xD69906245565A910L, 0xF40E35855771202AL, 0x106AA07032BBD1B8L, 3945 0x19A4C116B8D2D0C8L, 0x1E376C085141AB53L, 0x2748774CDF8EEB99L, 3946 0x34B0BCB5E19B48A8L, 0x391C0CB3C5C95A63L, 0x4ED8AA4AE3418ACBL, 3947 0x5B9CCA4F7763E373L, 0x682E6FF3D6B2B8A3L, 0x748F82EE5DEFB2FCL, 3948 0x78A5636F43172F60L, 0x84C87814A1F0AB72L, 0x8CC702081A6439ECL, 3949 0x90BEFFFA23631E28L, 0xA4506CEBDE82BDE9L, 0xBEF9A3F7B2C67915L, 3950 0xC67178F2E372532BL, 0xCA273ECEEA26619CL, 0xD186B8C721C0C207L, 3951 0xEADA7DD6CDE0EB1EL, 0xF57D4F7FEE6ED178L, 0x06F067AA72176FBAL, 3952 0x0A637DC5A2C898A6L, 0x113F9804BEF90DAEL, 0x1B710B35131C471BL, 3953 0x28DB77F523047D84L, 0x32CAAB7B40C72493L, 0x3C9EBE0A15C9BEBCL, 3954 0x431D67C49C100D4CL, 0x4CC5D4BECB3E42B6L, 0x597F299CFC657E2AL, 3955 0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L 3956 }; 3957 3958 __ align(CodeEntryAlignment); 3959 3960 StubCodeMark mark(this, stub_id); 3961 address start = __ pc(); 3962 3963 Register buf = c_rarg0; 3964 Register state = c_rarg1; 3965 Register ofs = c_rarg2; 3966 Register limit = c_rarg3; 3967 3968 __ stpd(v8, v9, __ pre(sp, -64)); 3969 __ stpd(v10, v11, Address(sp, 16)); 3970 __ stpd(v12, v13, Address(sp, 32)); 3971 __ stpd(v14, v15, Address(sp, 48)); 3972 3973 Label sha512_loop; 3974 3975 // load state 3976 __ ld1(v8, v9, v10, v11, __ T2D, state); 3977 3978 // load first 4 round constants 3979 __ lea(rscratch1, ExternalAddress((address)round_consts)); 3980 __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64)); 3981 3982 __ BIND(sha512_loop); 3983 // load 128B of data into v12..v19 3984 __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64)); 3985 __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64)); 3986 __ rev64(v12, __ T16B, v12); 3987 __ rev64(v13, __ T16B, v13); 3988 __ rev64(v14, __ T16B, v14); 3989 __ rev64(v15, __ T16B, v15); 3990 __ rev64(v16, __ T16B, v16); 3991 __ rev64(v17, __ T16B, v17); 3992 __ rev64(v18, __ T16B, v18); 3993 __ rev64(v19, __ T16B, v19); 3994 3995 __ mov(rscratch2, rscratch1); 3996 3997 __ mov(v0, __ T16B, v8); 3998 __ mov(v1, __ T16B, v9); 3999 __ mov(v2, __ T16B, v10); 4000 __ mov(v3, __ T16B, v11); 4001 4002 sha512_dround( 0, v0, v1, v2, v3, v4, v20, v24, v12, v13, v19, v16, v17); 4003 sha512_dround( 1, v3, v0, v4, v2, v1, v21, v25, v13, v14, v12, v17, v18); 4004 sha512_dround( 2, v2, v3, v1, v4, v0, v22, v26, v14, v15, v13, v18, v19); 4005 sha512_dround( 3, v4, v2, v0, v1, v3, v23, v27, v15, v16, v14, v19, v12); 4006 sha512_dround( 4, v1, v4, v3, v0, v2, v24, v28, v16, v17, v15, v12, v13); 4007 sha512_dround( 5, v0, v1, v2, v3, v4, v25, v29, v17, v18, v16, v13, v14); 4008 sha512_dround( 6, v3, v0, v4, v2, v1, v26, v30, v18, v19, v17, v14, v15); 4009 sha512_dround( 7, v2, v3, v1, v4, v0, v27, v31, v19, v12, v18, v15, v16); 4010 sha512_dround( 8, v4, v2, v0, v1, v3, v28, v24, v12, v13, v19, v16, v17); 4011 sha512_dround( 9, v1, v4, v3, v0, v2, v29, v25, v13, v14, v12, v17, v18); 4012 sha512_dround(10, v0, v1, v2, v3, v4, v30, v26, v14, v15, v13, v18, v19); 4013 sha512_dround(11, v3, v0, v4, v2, v1, v31, v27, v15, v16, v14, v19, v12); 4014 sha512_dround(12, v2, v3, v1, v4, v0, v24, v28, v16, v17, v15, v12, v13); 4015 sha512_dround(13, v4, v2, v0, v1, v3, v25, v29, v17, v18, v16, v13, 
v14); 4016 sha512_dround(14, v1, v4, v3, v0, v2, v26, v30, v18, v19, v17, v14, v15); 4017 sha512_dround(15, v0, v1, v2, v3, v4, v27, v31, v19, v12, v18, v15, v16); 4018 sha512_dround(16, v3, v0, v4, v2, v1, v28, v24, v12, v13, v19, v16, v17); 4019 sha512_dround(17, v2, v3, v1, v4, v0, v29, v25, v13, v14, v12, v17, v18); 4020 sha512_dround(18, v4, v2, v0, v1, v3, v30, v26, v14, v15, v13, v18, v19); 4021 sha512_dround(19, v1, v4, v3, v0, v2, v31, v27, v15, v16, v14, v19, v12); 4022 sha512_dround(20, v0, v1, v2, v3, v4, v24, v28, v16, v17, v15, v12, v13); 4023 sha512_dround(21, v3, v0, v4, v2, v1, v25, v29, v17, v18, v16, v13, v14); 4024 sha512_dround(22, v2, v3, v1, v4, v0, v26, v30, v18, v19, v17, v14, v15); 4025 sha512_dround(23, v4, v2, v0, v1, v3, v27, v31, v19, v12, v18, v15, v16); 4026 sha512_dround(24, v1, v4, v3, v0, v2, v28, v24, v12, v13, v19, v16, v17); 4027 sha512_dround(25, v0, v1, v2, v3, v4, v29, v25, v13, v14, v12, v17, v18); 4028 sha512_dround(26, v3, v0, v4, v2, v1, v30, v26, v14, v15, v13, v18, v19); 4029 sha512_dround(27, v2, v3, v1, v4, v0, v31, v27, v15, v16, v14, v19, v12); 4030 sha512_dround(28, v4, v2, v0, v1, v3, v24, v28, v16, v17, v15, v12, v13); 4031 sha512_dround(29, v1, v4, v3, v0, v2, v25, v29, v17, v18, v16, v13, v14); 4032 sha512_dround(30, v0, v1, v2, v3, v4, v26, v30, v18, v19, v17, v14, v15); 4033 sha512_dround(31, v3, v0, v4, v2, v1, v27, v31, v19, v12, v18, v15, v16); 4034 sha512_dround(32, v2, v3, v1, v4, v0, v28, v24, v12, v0, v0, v0, v0); 4035 sha512_dround(33, v4, v2, v0, v1, v3, v29, v25, v13, v0, v0, v0, v0); 4036 sha512_dround(34, v1, v4, v3, v0, v2, v30, v26, v14, v0, v0, v0, v0); 4037 sha512_dround(35, v0, v1, v2, v3, v4, v31, v27, v15, v0, v0, v0, v0); 4038 sha512_dround(36, v3, v0, v4, v2, v1, v24, v0, v16, v0, v0, v0, v0); 4039 sha512_dround(37, v2, v3, v1, v4, v0, v25, v0, v17, v0, v0, v0, v0); 4040 sha512_dround(38, v4, v2, v0, v1, v3, v26, v0, v18, v0, v0, v0, v0); 4041 sha512_dround(39, v1, v4, v3, v0, v2, v27, v0, v19, v0, v0, v0, v0); 4042 4043 __ addv(v8, __ T2D, v8, v0); 4044 __ addv(v9, __ T2D, v9, v1); 4045 __ addv(v10, __ T2D, v10, v2); 4046 __ addv(v11, __ T2D, v11, v3); 4047 4048 if (multi_block) { 4049 __ add(ofs, ofs, 128); 4050 __ cmp(ofs, limit); 4051 __ br(Assembler::LE, sha512_loop); 4052 __ mov(c_rarg0, ofs); // return ofs 4053 } 4054 4055 __ st1(v8, v9, v10, v11, __ T2D, state); 4056 4057 __ ldpd(v14, v15, Address(sp, 48)); 4058 __ ldpd(v12, v13, Address(sp, 32)); 4059 __ ldpd(v10, v11, Address(sp, 16)); 4060 __ ldpd(v8, v9, __ post(sp, 64)); 4061 4062 __ ret(lr); 4063 4064 return start; 4065 } 4066 4067 // Execute one round of keccak of two computations in parallel. 4068 // One of the states should be loaded into the lower halves of 4069 // the vector registers v0-v24, the other should be loaded into 4070 // the upper halves of those registers. The ld1r instruction loads 4071 // the round constant into both halves of register v31. 4072 // Intermediate results c0...c5 and d0...d5 are computed 4073 // in registers v25...v30. 4074 // All vector instructions that are used operate on both register 4075 // halves in parallel. 4076 // If only a single computation is needed, one can only load the lower halves. 
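  // For reference, a scalar sketch of the Keccak-f steps fused by the
  // eor3/rax1/xar/bcax sequences below (a[] is the 25-lane state, rotl is a
  // 64-bit rotate, indices mod 5 within a row):
  //   theta:  c[x] = a[x] ^ a[x+5] ^ a[x+10] ^ a[x+15] ^ a[x+20]   // x = 0..4
  //           d[x] = c[(x+4)%5] ^ rotl(c[(x+1)%5], 1)
  //   rho/pi: each lane a[x+5*y] ^ d[x] is rotated by its lane offset and moved
  //           to its new position (the xar instructions)
  //   chi:    a[i] = a[i] ^ (~a[i+1] & a[i+2]) within each row of 5 lanes
  //   iota:   a[0] ^= round_constant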
  void keccak_round(Register rscratch1) {
    __ eor3(v29, __ T16B, v4, v9, v14);       // c4 = a4 ^ a9 ^ a14
    __ eor3(v26, __ T16B, v1, v6, v11);       // c1 = a1 ^ a6 ^ a11
    __ eor3(v28, __ T16B, v3, v8, v13);       // c3 = a3 ^ a8 ^ a13
    __ eor3(v25, __ T16B, v0, v5, v10);       // c0 = a0 ^ a5 ^ a10
    __ eor3(v27, __ T16B, v2, v7, v12);       // c2 = a2 ^ a7 ^ a12
    __ eor3(v29, __ T16B, v29, v19, v24);     // c4 ^= a19 ^ a24
    __ eor3(v26, __ T16B, v26, v16, v21);     // c1 ^= a16 ^ a21
    __ eor3(v28, __ T16B, v28, v18, v23);     // c3 ^= a18 ^ a23
    __ eor3(v25, __ T16B, v25, v15, v20);     // c0 ^= a15 ^ a20
    __ eor3(v27, __ T16B, v27, v17, v22);     // c2 ^= a17 ^ a22

    __ rax1(v30, __ T2D, v29, v26);           // d0 = c4 ^ rol(c1, 1)
    __ rax1(v26, __ T2D, v26, v28);           // d2 = c1 ^ rol(c3, 1)
    __ rax1(v28, __ T2D, v28, v25);           // d4 = c3 ^ rol(c0, 1)
    __ rax1(v25, __ T2D, v25, v27);           // d1 = c0 ^ rol(c2, 1)
    __ rax1(v27, __ T2D, v27, v29);           // d3 = c2 ^ rol(c4, 1)

    __ eor(v0, __ T16B, v0, v30);             // a0 = a0 ^ d0
    __ xar(v29, __ T2D, v1, v25, (64 - 1));   // a10' = rol((a1^d1), 1)
    __ xar(v1, __ T2D, v6, v25, (64 - 44));   // a1 = rol((a6^d1), 44)
    __ xar(v6, __ T2D, v9, v28, (64 - 20));   // a6 = rol((a9^d4), 20)
    __ xar(v9, __ T2D, v22, v26, (64 - 61));  // a9 = rol((a22^d2), 61)
    __ xar(v22, __ T2D, v14, v28, (64 - 39)); // a22 = rol((a14^d4), 39)
    __ xar(v14, __ T2D, v20, v30, (64 - 18)); // a14 = rol((a20^d0), 18)
    __ xar(v31, __ T2D, v2, v26, (64 - 62));  // a20' = rol((a2^d2), 62)
    __ xar(v2, __ T2D, v12, v26, (64 - 43));  // a2 = rol((a12^d2), 43)
    __ xar(v12, __ T2D, v13, v27, (64 - 25)); // a12 = rol((a13^d3), 25)
    __ xar(v13, __ T2D, v19, v28, (64 - 8));  // a13 = rol((a19^d4), 8)
    __ xar(v19, __ T2D, v23, v27, (64 - 56)); // a19 = rol((a23^d3), 56)
    __ xar(v23, __ T2D, v15, v30, (64 - 41)); // a23 = rol((a15^d0), 41)
    __ xar(v15, __ T2D, v4, v28, (64 - 27));  // a15 = rol((a4^d4), 27)
    __ xar(v28, __ T2D, v24, v28, (64 - 14)); // a4' = rol((a24^d4), 14)
    __ xar(v24, __ T2D, v21, v25, (64 - 2));  // a24 = rol((a21^d1), 2)
    __ xar(v8, __ T2D, v8, v27, (64 - 55));   // a21' = rol((a8^d3), 55)
    __ xar(v4, __ T2D, v16, v25, (64 - 45));  // a8' = rol((a16^d1), 45)
    __ xar(v16, __ T2D, v5, v30, (64 - 36));  // a16 = rol((a5^d0), 36)
    __ xar(v5, __ T2D, v3, v27, (64 - 28));   // a5 = rol((a3^d3), 28)
    __ xar(v27, __ T2D, v18, v27, (64 - 21)); // a3' = rol((a18^d3), 21)
    __ xar(v3, __ T2D, v17, v26, (64 - 15));  // a18' = rol((a17^d2), 15)
    __ xar(v25, __ T2D, v11, v25, (64 - 10)); // a17' = rol((a11^d1), 10)
    __ xar(v26, __ T2D, v7, v26, (64 - 6));   // a11' = rol((a7^d2), 6)
    __ xar(v30, __ T2D, v10, v30, (64 - 3));  // a7' = rol((a10^d0), 3)

    __ bcax(v20, __ T16B, v31, v22, v8);      // a20 = a20' ^ (~a21 & a22')
    __ bcax(v21, __ T16B, v8, v23, v22);      // a21 = a21' ^ (~a22 & a23)
    __ bcax(v22, __ T16B, v22, v24, v23);     // a22 = a22 ^ (~a23 & a24)
    __ bcax(v23, __ T16B, v23, v31, v24);     // a23 = a23 ^ (~a24 & a20')
    __ bcax(v24, __ T16B, v24, v8, v31);      // a24 = a24 ^ (~a20' & a21')

    __ ld1r(v31, __ T2D, __ post(rscratch1, 8)); // rc = round_constants[i]

    __ bcax(v17, __ T16B, v25, v19, v3);      // a17 = a17' ^ (~a18' & a19)
    __ bcax(v18, __ T16B, v3, v15, v19);      // a18 = a18' ^ (~a19 & a15')
    __ bcax(v19, __ T16B, v19, v16, v15);     // a19 = a19 ^ (~a15 & a16)
    __ bcax(v15, __ T16B, v15, v25, v16);     // a15 = a15 ^ (~a16 & a17')
    __ bcax(v16, __ T16B, v16, v3, v25);
// a16 = a16 ^ (~a17' & a18') 4134 4135 __ bcax(v10, __ T16B, v29, v12, v26); // a10 = a10' ^ (~a11' & a12) 4136 __ bcax(v11, __ T16B, v26, v13, v12); // a11 = a11' ^ (~a12 & a13) 4137 __ bcax(v12, __ T16B, v12, v14, v13); // a12 = a12 ^ (~a13 & a14) 4138 __ bcax(v13, __ T16B, v13, v29, v14); // a13 = a13 ^ (~a14 & a10') 4139 __ bcax(v14, __ T16B, v14, v26, v29); // a14 = a14 ^ (~a10' & a11') 4140 4141 __ bcax(v7, __ T16B, v30, v9, v4); // a7 = a7' ^ (~a8' & a9) 4142 __ bcax(v8, __ T16B, v4, v5, v9); // a8 = a8' ^ (~a9 & a5) 4143 __ bcax(v9, __ T16B, v9, v6, v5); // a9 = a9 ^ (~a5 & a6) 4144 __ bcax(v5, __ T16B, v5, v30, v6); // a5 = a5 ^ (~a6 & a7) 4145 __ bcax(v6, __ T16B, v6, v4, v30); // a6 = a6 ^ (~a7 & a8') 4146 4147 __ bcax(v3, __ T16B, v27, v0, v28); // a3 = a3' ^ (~a4' & a0) 4148 __ bcax(v4, __ T16B, v28, v1, v0); // a4 = a4' ^ (~a0 & a1) 4149 __ bcax(v0, __ T16B, v0, v2, v1); // a0 = a0 ^ (~a1 & a2) 4150 __ bcax(v1, __ T16B, v1, v27, v2); // a1 = a1 ^ (~a2 & a3) 4151 __ bcax(v2, __ T16B, v2, v28, v27); // a2 = a2 ^ (~a3 & a4') 4152 4153 __ eor(v0, __ T16B, v0, v31); // a0 = a0 ^ rc 4154 } 4155 4156 // Arguments: 4157 // 4158 // Inputs: 4159 // c_rarg0 - byte[] source+offset 4160 // c_rarg1 - byte[] SHA.state 4161 // c_rarg2 - int block_size 4162 // c_rarg3 - int offset 4163 // c_rarg4 - int limit 4164 // 4165 address generate_sha3_implCompress(StubGenStubId stub_id) { 4166 bool multi_block; 4167 switch (stub_id) { 4168 case sha3_implCompress_id: 4169 multi_block = false; 4170 break; 4171 case sha3_implCompressMB_id: 4172 multi_block = true; 4173 break; 4174 default: 4175 ShouldNotReachHere(); 4176 } 4177 4178 static const uint64_t round_consts[24] = { 4179 0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL, 4180 0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L, 4181 0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL, 4182 0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL, 4183 0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L, 4184 0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L, 4185 0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L, 4186 0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L 4187 }; 4188 4189 __ align(CodeEntryAlignment); 4190 4191 StubCodeMark mark(this, stub_id); 4192 address start = __ pc(); 4193 4194 Register buf = c_rarg0; 4195 Register state = c_rarg1; 4196 Register block_size = c_rarg2; 4197 Register ofs = c_rarg3; 4198 Register limit = c_rarg4; 4199 4200 Label sha3_loop, rounds24_loop; 4201 Label sha3_512_or_sha3_384, shake128; 4202 4203 __ stpd(v8, v9, __ pre(sp, -64)); 4204 __ stpd(v10, v11, Address(sp, 16)); 4205 __ stpd(v12, v13, Address(sp, 32)); 4206 __ stpd(v14, v15, Address(sp, 48)); 4207 4208 // load state 4209 __ add(rscratch1, state, 32); 4210 __ ld1(v0, v1, v2, v3, __ T1D, state); 4211 __ ld1(v4, v5, v6, v7, __ T1D, __ post(rscratch1, 32)); 4212 __ ld1(v8, v9, v10, v11, __ T1D, __ post(rscratch1, 32)); 4213 __ ld1(v12, v13, v14, v15, __ T1D, __ post(rscratch1, 32)); 4214 __ ld1(v16, v17, v18, v19, __ T1D, __ post(rscratch1, 32)); 4215 __ ld1(v20, v21, v22, v23, __ T1D, __ post(rscratch1, 32)); 4216 __ ld1(v24, __ T1D, rscratch1); 4217 4218 __ BIND(sha3_loop); 4219 4220 // 24 keccak rounds 4221 __ movw(rscratch2, 24); 4222 4223 // load round_constants base 4224 __ lea(rscratch1, ExternalAddress((address) round_consts)); 4225 4226 // load input 4227 __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32)); 4228 __ ld1(v29, v30, v31, __ T8B, __ post(buf, 
24)); 4229 __ eor(v0, __ T8B, v0, v25); 4230 __ eor(v1, __ T8B, v1, v26); 4231 __ eor(v2, __ T8B, v2, v27); 4232 __ eor(v3, __ T8B, v3, v28); 4233 __ eor(v4, __ T8B, v4, v29); 4234 __ eor(v5, __ T8B, v5, v30); 4235 __ eor(v6, __ T8B, v6, v31); 4236 4237 // block_size == 72, SHA3-512; block_size == 104, SHA3-384 4238 __ tbz(block_size, 7, sha3_512_or_sha3_384); 4239 4240 __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32)); 4241 __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24)); 4242 __ eor(v7, __ T8B, v7, v25); 4243 __ eor(v8, __ T8B, v8, v26); 4244 __ eor(v9, __ T8B, v9, v27); 4245 __ eor(v10, __ T8B, v10, v28); 4246 __ eor(v11, __ T8B, v11, v29); 4247 __ eor(v12, __ T8B, v12, v30); 4248 __ eor(v13, __ T8B, v13, v31); 4249 4250 __ ld1(v25, v26, v27, __ T8B, __ post(buf, 24)); 4251 __ eor(v14, __ T8B, v14, v25); 4252 __ eor(v15, __ T8B, v15, v26); 4253 __ eor(v16, __ T8B, v16, v27); 4254 4255 // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256 4256 __ andw(c_rarg5, block_size, 48); 4257 __ cbzw(c_rarg5, rounds24_loop); 4258 4259 __ tbnz(block_size, 5, shake128); 4260 // block_size == 144, bit5 == 0, SHA3-224 4261 __ ldrd(v28, __ post(buf, 8)); 4262 __ eor(v17, __ T8B, v17, v28); 4263 __ b(rounds24_loop); 4264 4265 __ BIND(shake128); 4266 __ ld1(v28, v29, v30, v31, __ T8B, __ post(buf, 32)); 4267 __ eor(v17, __ T8B, v17, v28); 4268 __ eor(v18, __ T8B, v18, v29); 4269 __ eor(v19, __ T8B, v19, v30); 4270 __ eor(v20, __ T8B, v20, v31); 4271 __ b(rounds24_loop); // block_size == 168, SHAKE128 4272 4273 __ BIND(sha3_512_or_sha3_384); 4274 __ ld1(v25, v26, __ T8B, __ post(buf, 16)); 4275 __ eor(v7, __ T8B, v7, v25); 4276 __ eor(v8, __ T8B, v8, v26); 4277 __ tbz(block_size, 5, rounds24_loop); // SHA3-512 4278 4279 // SHA3-384 4280 __ ld1(v27, v28, v29, v30, __ T8B, __ post(buf, 32)); 4281 __ eor(v9, __ T8B, v9, v27); 4282 __ eor(v10, __ T8B, v10, v28); 4283 __ eor(v11, __ T8B, v11, v29); 4284 __ eor(v12, __ T8B, v12, v30); 4285 4286 __ BIND(rounds24_loop); 4287 __ subw(rscratch2, rscratch2, 1); 4288 4289 keccak_round(rscratch1); 4290 4291 __ cbnzw(rscratch2, rounds24_loop); 4292 4293 if (multi_block) { 4294 __ add(ofs, ofs, block_size); 4295 __ cmp(ofs, limit); 4296 __ br(Assembler::LE, sha3_loop); 4297 __ mov(c_rarg0, ofs); // return ofs 4298 } 4299 4300 __ st1(v0, v1, v2, v3, __ T1D, __ post(state, 32)); 4301 __ st1(v4, v5, v6, v7, __ T1D, __ post(state, 32)); 4302 __ st1(v8, v9, v10, v11, __ T1D, __ post(state, 32)); 4303 __ st1(v12, v13, v14, v15, __ T1D, __ post(state, 32)); 4304 __ st1(v16, v17, v18, v19, __ T1D, __ post(state, 32)); 4305 __ st1(v20, v21, v22, v23, __ T1D, __ post(state, 32)); 4306 __ st1(v24, __ T1D, state); 4307 4308 // restore callee-saved registers 4309 __ ldpd(v14, v15, Address(sp, 48)); 4310 __ ldpd(v12, v13, Address(sp, 32)); 4311 __ ldpd(v10, v11, Address(sp, 16)); 4312 __ ldpd(v8, v9, __ post(sp, 64)); 4313 4314 __ ret(lr); 4315 4316 return start; 4317 } 4318 4319 // Inputs: 4320 // c_rarg0 - long[] state0 4321 // c_rarg1 - long[] state1 4322 address generate_double_keccak() { 4323 static const uint64_t round_consts[24] = { 4324 0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL, 4325 0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L, 4326 0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL, 4327 0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL, 4328 0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L, 4329 0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L, 4330 0x000000000000800AL, 
0x800000008000000AL, 0x8000000080008081L, 4331 0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L 4332 }; 4333 4334 // Implements the double_keccak() method of the 4335 // sun.secyrity.provider.SHA3Parallel class 4336 __ align(CodeEntryAlignment); 4337 StubCodeMark mark(this, "StubRoutines", "double_keccak"); 4338 address start = __ pc(); 4339 __ enter(); 4340 4341 Register state0 = c_rarg0; 4342 Register state1 = c_rarg1; 4343 4344 Label rounds24_loop; 4345 4346 // save callee-saved registers 4347 __ stpd(v8, v9, __ pre(sp, -64)); 4348 __ stpd(v10, v11, Address(sp, 16)); 4349 __ stpd(v12, v13, Address(sp, 32)); 4350 __ stpd(v14, v15, Address(sp, 48)); 4351 4352 // load states 4353 __ add(rscratch1, state0, 32); 4354 __ ld4(v0, v1, v2, v3, __ D, 0, state0); 4355 __ ld4(v4, v5, v6, v7, __ D, 0, __ post(rscratch1, 32)); 4356 __ ld4(v8, v9, v10, v11, __ D, 0, __ post(rscratch1, 32)); 4357 __ ld4(v12, v13, v14, v15, __ D, 0, __ post(rscratch1, 32)); 4358 __ ld4(v16, v17, v18, v19, __ D, 0, __ post(rscratch1, 32)); 4359 __ ld4(v20, v21, v22, v23, __ D, 0, __ post(rscratch1, 32)); 4360 __ ld1(v24, __ D, 0, rscratch1); 4361 __ add(rscratch1, state1, 32); 4362 __ ld4(v0, v1, v2, v3, __ D, 1, state1); 4363 __ ld4(v4, v5, v6, v7, __ D, 1, __ post(rscratch1, 32)); 4364 __ ld4(v8, v9, v10, v11, __ D, 1, __ post(rscratch1, 32)); 4365 __ ld4(v12, v13, v14, v15, __ D, 1, __ post(rscratch1, 32)); 4366 __ ld4(v16, v17, v18, v19, __ D, 1, __ post(rscratch1, 32)); 4367 __ ld4(v20, v21, v22, v23, __ D, 1, __ post(rscratch1, 32)); 4368 __ ld1(v24, __ D, 1, rscratch1); 4369 4370 // 24 keccak rounds 4371 __ movw(rscratch2, 24); 4372 4373 // load round_constants base 4374 __ lea(rscratch1, ExternalAddress((address) round_consts)); 4375 4376 __ BIND(rounds24_loop); 4377 __ subw(rscratch2, rscratch2, 1); 4378 keccak_round(rscratch1); 4379 __ cbnzw(rscratch2, rounds24_loop); 4380 4381 __ st4(v0, v1, v2, v3, __ D, 0, __ post(state0, 32)); 4382 __ st4(v4, v5, v6, v7, __ D, 0, __ post(state0, 32)); 4383 __ st4(v8, v9, v10, v11, __ D, 0, __ post(state0, 32)); 4384 __ st4(v12, v13, v14, v15, __ D, 0, __ post(state0, 32)); 4385 __ st4(v16, v17, v18, v19, __ D, 0, __ post(state0, 32)); 4386 __ st4(v20, v21, v22, v23, __ D, 0, __ post(state0, 32)); 4387 __ st1(v24, __ D, 0, state0); 4388 __ st4(v0, v1, v2, v3, __ D, 1, __ post(state1, 32)); 4389 __ st4(v4, v5, v6, v7, __ D, 1, __ post(state1, 32)); 4390 __ st4(v8, v9, v10, v11, __ D, 1, __ post(state1, 32)); 4391 __ st4(v12, v13, v14, v15, __ D, 1, __ post(state1, 32)); 4392 __ st4(v16, v17, v18, v19, __ D, 1, __ post(state1, 32)); 4393 __ st4(v20, v21, v22, v23, __ D, 1, __ post(state1, 32)); 4394 __ st1(v24, __ D, 1, state1); 4395 4396 // restore callee-saved vector registers 4397 __ ldpd(v14, v15, Address(sp, 48)); 4398 __ ldpd(v12, v13, Address(sp, 32)); 4399 __ ldpd(v10, v11, Address(sp, 16)); 4400 __ ldpd(v8, v9, __ post(sp, 64)); 4401 4402 __ leave(); // required for proper stackwalking of RuntimeStub frame 4403 __ mov(r0, zr); // return 0 4404 __ ret(lr); 4405 4406 return start; 4407 } 4408 4409 // ChaCha20 block function. This version parallelizes the 32-bit 4410 // state elements on each of 16 vectors, producing 4 blocks of 4411 // keystream at a time. 
4412 // 4413 // state (int[16]) = c_rarg0 4414 // keystream (byte[256]) = c_rarg1 4415 // return - number of bytes of produced keystream (always 256) 4416 // 4417 // This implementation takes each 32-bit integer from the state 4418 // array and broadcasts it across all 4 32-bit lanes of a vector register 4419 // (e.g. state[0] is replicated on all 4 lanes of v4, state[1] to all 4 lanes 4420 // of v5, etc.). Once all 16 elements have been broadcast onto 16 vectors, 4421 // the quarter round schedule is implemented as outlined in RFC 7539 section 4422 // 2.3. However, instead of sequentially processing the 3 quarter round 4423 // operations represented by one QUARTERROUND function, we instead stack all 4424 // the adds, xors and left-rotations from the first 4 quarter rounds together 4425 // and then do the same for the second set of 4 quarter rounds. This removes 4426 // some latency that would otherwise be incurred by waiting for an add to 4427 // complete before performing an xor (which depends on the result of the 4428 // add), etc. An adjustment happens between the first and second groups of 4 4429 // quarter rounds, but this is done only in the inputs to the macro functions 4430 // that generate the assembly instructions - these adjustments themselves are 4431 // not part of the resulting assembly. 4432 // The 4 registers v0-v3 are used during the quarter round operations as 4433 // scratch registers. Once the 20 rounds are complete, these 4 scratch 4434 // registers become the vectors involved in adding the start state back onto 4435 // the post-QR working state. After the adds are complete, each of the 16 4436 // vectors write their first lane back to the keystream buffer, followed 4437 // by the second lane from all vectors and so on. 4438 address generate_chacha20Block_blockpar() { 4439 Label L_twoRounds, L_cc20_const; 4440 // The constant data is broken into two 128-bit segments to be loaded 4441 // onto FloatRegisters. The first 128 bits are a counter add overlay 4442 // that adds +0/+1/+2/+3 to the vector holding replicated state[12]. 4443 // The second 128-bits is a table constant used for 8-bit left rotations. 4444 __ BIND(L_cc20_const); 4445 __ emit_int64(0x0000000100000000UL); 4446 __ emit_int64(0x0000000300000002UL); 4447 __ emit_int64(0x0605040702010003UL); 4448 __ emit_int64(0x0E0D0C0F0A09080BUL); 4449 4450 __ align(CodeEntryAlignment); 4451 StubGenStubId stub_id = StubGenStubId::chacha20Block_id; 4452 StubCodeMark mark(this, stub_id); 4453 address start = __ pc(); 4454 __ enter(); 4455 4456 int i, j; 4457 const Register state = c_rarg0; 4458 const Register keystream = c_rarg1; 4459 const Register loopCtr = r10; 4460 const Register tmpAddr = r11; 4461 const FloatRegister ctrAddOverlay = v28; 4462 const FloatRegister lrot8Tbl = v29; 4463 4464 // Organize SIMD registers in an array that facilitates 4465 // putting repetitive opcodes into loop structures. It is 4466 // important that each grouping of 4 registers is monotonically 4467 // increasing to support the requirements of multi-register 4468 // instructions (e.g. ld4r, st4, etc.) 4469 const FloatRegister workSt[16] = { 4470 v4, v5, v6, v7, v16, v17, v18, v19, 4471 v20, v21, v22, v23, v24, v25, v26, v27 4472 }; 4473 4474 // Pull in constant data. The first 16 bytes are the add overlay 4475 // which is applied to the vector holding the counter (state[12]). 4476 // The second 16 bytes is the index register for the 8-bit left 4477 // rotation tbl instruction. 
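    // For reference, one scalar quarter round from RFC 7539 section 2.1; the
    // cc20_qr_* bundles below apply this to four columns (then four diagonals)
    // at once, with one 32-bit lane per keystream block:
    //   a += b; d ^= a; d = rotl32(d, 16);
    //   c += d; b ^= c; b = rotl32(b, 12);
    //   a += b; d ^= a; d = rotl32(d, 8);
    //   c += d; b ^= c; b = rotl32(b, 7);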
4478 __ adr(tmpAddr, L_cc20_const); 4479 __ ldpq(ctrAddOverlay, lrot8Tbl, Address(tmpAddr)); 4480 4481 // Load from memory and interlace across 16 SIMD registers, 4482 // With each word from memory being broadcast to all lanes of 4483 // each successive SIMD register. 4484 // Addr(0) -> All lanes in workSt[i] 4485 // Addr(4) -> All lanes workSt[i + 1], etc. 4486 __ mov(tmpAddr, state); 4487 for (i = 0; i < 16; i += 4) { 4488 __ ld4r(workSt[i], workSt[i + 1], workSt[i + 2], workSt[i + 3], __ T4S, 4489 __ post(tmpAddr, 16)); 4490 } 4491 __ addv(workSt[12], __ T4S, workSt[12], ctrAddOverlay); // Add ctr overlay 4492 4493 // Before entering the loop, create 5 4-register arrays. These 4494 // will hold the 4 registers that represent the a/b/c/d fields 4495 // in the quarter round operation. For instance the "b" field 4496 // for the first 4 quarter round operations is the set of v16/v17/v18/v19, 4497 // but in the second 4 quarter rounds it gets adjusted to v17/v18/v19/v16 4498 // since it is part of a diagonal organization. The aSet and scratch 4499 // register sets are defined at declaration time because they do not change 4500 // organization at any point during the 20-round processing. 4501 FloatRegister aSet[4] = { v4, v5, v6, v7 }; 4502 FloatRegister bSet[4]; 4503 FloatRegister cSet[4]; 4504 FloatRegister dSet[4]; 4505 FloatRegister scratch[4] = { v0, v1, v2, v3 }; 4506 4507 // Set up the 10 iteration loop and perform all 8 quarter round ops 4508 __ mov(loopCtr, 10); 4509 __ BIND(L_twoRounds); 4510 4511 // Set to columnar organization and do the following 4 quarter-rounds: 4512 // QUARTERROUND(0, 4, 8, 12) 4513 // QUARTERROUND(1, 5, 9, 13) 4514 // QUARTERROUND(2, 6, 10, 14) 4515 // QUARTERROUND(3, 7, 11, 15) 4516 __ cc20_set_qr_registers(bSet, workSt, 4, 5, 6, 7); 4517 __ cc20_set_qr_registers(cSet, workSt, 8, 9, 10, 11); 4518 __ cc20_set_qr_registers(dSet, workSt, 12, 13, 14, 15); 4519 4520 __ cc20_qr_add4(aSet, bSet); // a += b 4521 __ cc20_qr_xor4(dSet, aSet, dSet); // d ^= a 4522 __ cc20_qr_lrot4(dSet, dSet, 16, lrot8Tbl); // d <<<= 16 4523 4524 __ cc20_qr_add4(cSet, dSet); // c += d 4525 __ cc20_qr_xor4(bSet, cSet, scratch); // b ^= c (scratch) 4526 __ cc20_qr_lrot4(scratch, bSet, 12, lrot8Tbl); // b <<<= 12 4527 4528 __ cc20_qr_add4(aSet, bSet); // a += b 4529 __ cc20_qr_xor4(dSet, aSet, dSet); // d ^= a 4530 __ cc20_qr_lrot4(dSet, dSet, 8, lrot8Tbl); // d <<<= 8 4531 4532 __ cc20_qr_add4(cSet, dSet); // c += d 4533 __ cc20_qr_xor4(bSet, cSet, scratch); // b ^= c (scratch) 4534 __ cc20_qr_lrot4(scratch, bSet, 7, lrot8Tbl); // b <<<= 12 4535 4536 // Set to diagonal organization and do the next 4 quarter-rounds: 4537 // QUARTERROUND(0, 5, 10, 15) 4538 // QUARTERROUND(1, 6, 11, 12) 4539 // QUARTERROUND(2, 7, 8, 13) 4540 // QUARTERROUND(3, 4, 9, 14) 4541 __ cc20_set_qr_registers(bSet, workSt, 5, 6, 7, 4); 4542 __ cc20_set_qr_registers(cSet, workSt, 10, 11, 8, 9); 4543 __ cc20_set_qr_registers(dSet, workSt, 15, 12, 13, 14); 4544 4545 __ cc20_qr_add4(aSet, bSet); // a += b 4546 __ cc20_qr_xor4(dSet, aSet, dSet); // d ^= a 4547 __ cc20_qr_lrot4(dSet, dSet, 16, lrot8Tbl); // d <<<= 16 4548 4549 __ cc20_qr_add4(cSet, dSet); // c += d 4550 __ cc20_qr_xor4(bSet, cSet, scratch); // b ^= c (scratch) 4551 __ cc20_qr_lrot4(scratch, bSet, 12, lrot8Tbl); // b <<<= 12 4552 4553 __ cc20_qr_add4(aSet, bSet); // a += b 4554 __ cc20_qr_xor4(dSet, aSet, dSet); // d ^= a 4555 __ cc20_qr_lrot4(dSet, dSet, 8, lrot8Tbl); // d <<<= 8 4556 4557 __ cc20_qr_add4(cSet, dSet); // c += d 4558 __ cc20_qr_xor4(bSet, 
cSet, scratch); // b ^= c (scratch) 4559 __ cc20_qr_lrot4(scratch, bSet, 7, lrot8Tbl); // b <<<= 12 4560 4561 // Decrement and iterate 4562 __ sub(loopCtr, loopCtr, 1); 4563 __ cbnz(loopCtr, L_twoRounds); 4564 4565 __ mov(tmpAddr, state); 4566 4567 // Add the starting state back to the post-loop keystream 4568 // state. We read/interlace the state array from memory into 4569 // 4 registers similar to what we did in the beginning. Then 4570 // add the counter overlay onto workSt[12] at the end. 4571 for (i = 0; i < 16; i += 4) { 4572 __ ld4r(v0, v1, v2, v3, __ T4S, __ post(tmpAddr, 16)); 4573 __ addv(workSt[i], __ T4S, workSt[i], v0); 4574 __ addv(workSt[i + 1], __ T4S, workSt[i + 1], v1); 4575 __ addv(workSt[i + 2], __ T4S, workSt[i + 2], v2); 4576 __ addv(workSt[i + 3], __ T4S, workSt[i + 3], v3); 4577 } 4578 __ addv(workSt[12], __ T4S, workSt[12], ctrAddOverlay); // Add ctr overlay 4579 4580 // Write working state into the keystream buffer. This is accomplished 4581 // by taking the lane "i" from each of the four vectors and writing 4582 // it to consecutive 4-byte offsets, then post-incrementing by 16 and 4583 // repeating with the next 4 vectors until all 16 vectors have been used. 4584 // Then move to the next lane and repeat the process until all lanes have 4585 // been written. 4586 for (i = 0; i < 4; i++) { 4587 for (j = 0; j < 16; j += 4) { 4588 __ st4(workSt[j], workSt[j + 1], workSt[j + 2], workSt[j + 3], __ S, i, 4589 __ post(keystream, 16)); 4590 } 4591 } 4592 4593 __ mov(r0, 256); // Return length of output keystream 4594 __ leave(); 4595 __ ret(lr); 4596 4597 return start; 4598 } 4599 4600 // Helpers to schedule parallel operation bundles across vector 4601 // register sequences of size 2, 4 or 8. 4602 4603 // Implement various primitive computations across vector sequences 4604 4605 template<int N> 4606 void vs_addv(const VSeq<N>& v, Assembler::SIMD_Arrangement T, 4607 const VSeq<N>& v1, const VSeq<N>& v2) { 4608 // output must not be constant 4609 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4610 // output cannot overwrite pending inputs 4611 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4612 assert(!vs_write_before_read(v, v2), "output overwrites input"); 4613 for (int i = 0; i < N; i++) { 4614 __ addv(v[i], T, v1[i], v2[i]); 4615 } 4616 } 4617 4618 template<int N> 4619 void vs_subv(const VSeq<N>& v, Assembler::SIMD_Arrangement T, 4620 const VSeq<N>& v1, const VSeq<N>& v2) { 4621 // output must not be constant 4622 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4623 // output cannot overwrite pending inputs 4624 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4625 assert(!vs_write_before_read(v, v2), "output overwrites input"); 4626 for (int i = 0; i < N; i++) { 4627 __ subv(v[i], T, v1[i], v2[i]); 4628 } 4629 } 4630 4631 template<int N> 4632 void vs_mulv(const VSeq<N>& v, Assembler::SIMD_Arrangement T, 4633 const VSeq<N>& v1, const VSeq<N>& v2) { 4634 // output must not be constant 4635 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4636 // output cannot overwrite pending inputs 4637 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4638 assert(!vs_write_before_read(v, v2), "output overwrites input"); 4639 for (int i = 0; i < N; i++) { 4640 __ mulv(v[i], T, v1[i], v2[i]); 4641 } 4642 } 4643 4644 template<int N> 4645 void vs_negr(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& 
v1) { 4646 // output must not be constant 4647 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4648 // output cannot overwrite pending inputs 4649 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4650 for (int i = 0; i < N; i++) { 4651 __ negr(v[i], T, v1[i]); 4652 } 4653 } 4654 4655 template<int N> 4656 void vs_sshr(const VSeq<N>& v, Assembler::SIMD_Arrangement T, 4657 const VSeq<N>& v1, int shift) { 4658 // output must not be constant 4659 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4660 // output cannot overwrite pending inputs 4661 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4662 for (int i = 0; i < N; i++) { 4663 __ sshr(v[i], T, v1[i], shift); 4664 } 4665 } 4666 4667 template<int N> 4668 void vs_andr(const VSeq<N>& v, const VSeq<N>& v1, const VSeq<N>& v2) { 4669 // output must not be constant 4670 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4671 // output cannot overwrite pending inputs 4672 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4673 assert(!vs_write_before_read(v, v2), "output overwrites input"); 4674 for (int i = 0; i < N; i++) { 4675 __ andr(v[i], __ T16B, v1[i], v2[i]); 4676 } 4677 } 4678 4679 template<int N> 4680 void vs_orr(const VSeq<N>& v, const VSeq<N>& v1, const VSeq<N>& v2) { 4681 // output must not be constant 4682 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4683 // output cannot overwrite pending inputs 4684 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4685 assert(!vs_write_before_read(v, v2), "output overwrites input"); 4686 for (int i = 0; i < N; i++) { 4687 __ orr(v[i], __ T16B, v1[i], v2[i]); 4688 } 4689 } 4690 4691 template<int N> 4692 void vs_notr(const VSeq<N>& v, const VSeq<N>& v1) { 4693 // output must not be constant 4694 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4695 // output cannot overwrite pending inputs 4696 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4697 for (int i = 0; i < N; i++) { 4698 __ notr(v[i], __ T16B, v1[i]); 4699 } 4700 } 4701 4702 template<int N> 4703 void vs_sqdmulh(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1, const VSeq<N>& v2) { 4704 // output must not be constant 4705 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4706 // output cannot overwrite pending inputs 4707 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4708 assert(!vs_write_before_read(v, v2), "output overwrites input"); 4709 for (int i = 0; i < N; i++) { 4710 __ sqdmulh(v[i], T, v1[i], v2[i]); 4711 } 4712 } 4713 4714 template<int N> 4715 void vs_mlsv(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1, VSeq<N>& v2) { 4716 // output must not be constant 4717 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4718 // output cannot overwrite pending inputs 4719 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4720 assert(!vs_write_before_read(v, v2), "output overwrites input"); 4721 for (int i = 0; i < N; i++) { 4722 __ mlsv(v[i], T, v1[i], v2[i]); 4723 } 4724 } 4725 4726 // load N/2 successive pairs of quadword values from memory in order 4727 // into N successive vector registers of the sequence via the 4728 // address supplied in base. 
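  // For example (illustrative only, assuming the VSeq constructor used
  // elsewhere in this code takes a base register index):
  //   VSeq<8> vs(0);        // names v0, v1, ..., v7
  //   vs_ldpq(vs, coeffs);  // emits four ldpq loads filling v0..v7 from coeffs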
4729 template<int N> 4730 void vs_ldpq(const VSeq<N>& v, Register base) { 4731 for (int i = 0; i < N; i += 2) { 4732 __ ldpq(v[i], v[i+1], Address(base, 32 * i)); 4733 } 4734 } 4735 4736 // load N/2 successive pairs of quadword values from memory in order 4737 // into N vector registers of the sequence via the address supplied 4738 // in base using post-increment addressing 4739 template<int N> 4740 void vs_ldpq_post(const VSeq<N>& v, Register base) { 4741 static_assert((N & (N - 1)) == 0, "sequence length must be even"); 4742 for (int i = 0; i < N; i += 2) { 4743 __ ldpq(v[i], v[i+1], __ post(base, 32)); 4744 } 4745 } 4746 4747 // store N successive vector registers of the sequence into N/2 4748 // successive pairs of quadword memory locations via the address 4749 // supplied in base using post-increment addressing 4750 template<int N> 4751 void vs_stpq_post(const VSeq<N>& v, Register base) { 4752 static_assert((N & (N - 1)) == 0, "sequence length must be even"); 4753 for (int i = 0; i < N; i += 2) { 4754 __ stpq(v[i], v[i+1], __ post(base, 32)); 4755 } 4756 } 4757 4758 // load N/2 pairs of quadword values from memory de-interleaved into 4759 // N vector registers 2 at a time via the address supplied in base 4760 // using post-increment addressing. 4761 template<int N> 4762 void vs_ld2_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) { 4763 static_assert((N & (N - 1)) == 0, "sequence length must be even"); 4764 for (int i = 0; i < N; i += 2) { 4765 __ ld2(v[i], v[i+1], T, __ post(base, 32)); 4766 } 4767 } 4768 4769 // store N vector registers interleaved into N/2 pairs of quadword 4770 // memory locations via the address supplied in base using 4771 // post-increment addressing. 4772 template<int N> 4773 void vs_st2_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) { 4774 static_assert((N & (N - 1)) == 0, "sequence length must be even"); 4775 for (int i = 0; i < N; i += 2) { 4776 __ st2(v[i], v[i+1], T, __ post(base, 32)); 4777 } 4778 } 4779 4780 // load N quadword values from memory de-interleaved into N vector 4781 // registers 3 elements at a time via the address supplied in base. 4782 template<int N> 4783 void vs_ld3(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) { 4784 static_assert(N == ((N / 3) * 3), "sequence length must be multiple of 3"); 4785 for (int i = 0; i < N; i += 3) { 4786 __ ld3(v[i], v[i+1], v[i+2], T, base); 4787 } 4788 } 4789 4790 // load N quadword values from memory de-interleaved into N vector 4791 // registers 3 elements at a time via the address supplied in base 4792 // using post-increment addressing. 
4793 template<int N> 4794 void vs_ld3_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) { 4795 static_assert(N == ((N / 3) * 3), "sequence length must be multiple of 3"); 4796 for (int i = 0; i < N; i += 3) { 4797 __ ld3(v[i], v[i+1], v[i+2], T, __ post(base, 48)); 4798 } 4799 } 4800 4801 // load N/2 pairs of quadword values from memory into N vector 4802 // registers via the address supplied in base with each pair indexed 4803 // using the the start offset plus the corresponding entry in the 4804 // offsets array 4805 template<int N> 4806 void vs_ldpq_indexed(const VSeq<N>& v, Register base, int start, int (&offsets)[N/2]) { 4807 for (int i = 0; i < N/2; i++) { 4808 __ ldpq(v[2*i], v[2*i+1], Address(base, start + offsets[i])); 4809 } 4810 } 4811 4812 // store N vector registers into N/2 pairs of quadword memory 4813 // locations via the address supplied in base with each pair indexed 4814 // using the the start offset plus the corresponding entry in the 4815 // offsets array 4816 template<int N> 4817 void vs_stpq_indexed(const VSeq<N>& v, Register base, int start, int offsets[N/2]) { 4818 for (int i = 0; i < N/2; i++) { 4819 __ stpq(v[2*i], v[2*i+1], Address(base, start + offsets[i])); 4820 } 4821 } 4822 4823 // load N single quadword values from memory into N vector registers 4824 // via the address supplied in base with each value indexed using 4825 // the the start offset plus the corresponding entry in the offsets 4826 // array 4827 template<int N> 4828 void vs_ldr_indexed(const VSeq<N>& v, Assembler::SIMD_RegVariant T, Register base, 4829 int start, int (&offsets)[N]) { 4830 for (int i = 0; i < N; i++) { 4831 __ ldr(v[i], T, Address(base, start + offsets[i])); 4832 } 4833 } 4834 4835 // store N vector registers into N single quadword memory locations 4836 // via the address supplied in base with each value indexed using 4837 // the the start offset plus the corresponding entry in the offsets 4838 // array 4839 template<int N> 4840 void vs_str_indexed(const VSeq<N>& v, Assembler::SIMD_RegVariant T, Register base, 4841 int start, int (&offsets)[N]) { 4842 for (int i = 0; i < N; i++) { 4843 __ str(v[i], T, Address(base, start + offsets[i])); 4844 } 4845 } 4846 4847 // load N/2 pairs of quadword values from memory de-interleaved into 4848 // N vector registers 2 at a time via the address supplied in base 4849 // with each pair indexed using the the start offset plus the 4850 // corresponding entry in the offsets array 4851 template<int N> 4852 void vs_ld2_indexed(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base, 4853 Register tmp, int start, int (&offsets)[N/2]) { 4854 for (int i = 0; i < N/2; i++) { 4855 __ add(tmp, base, start + offsets[i]); 4856 __ ld2(v[2*i], v[2*i+1], T, tmp); 4857 } 4858 } 4859 4860 // store N vector registers 2 at a time interleaved into N/2 pairs 4861 // of quadword memory locations via the address supplied in base 4862 // with each pair indexed using the the start offset plus the 4863 // corresponding entry in the offsets array 4864 template<int N> 4865 void vs_st2_indexed(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base, 4866 Register tmp, int start, int (&offsets)[N/2]) { 4867 for (int i = 0; i < N/2; i++) { 4868 __ add(tmp, base, start + offsets[i]); 4869 __ st2(v[2*i], v[2*i+1], T, tmp); 4870 } 4871 } 4872 4873 // Helper routines for various flavours of Montgomery multiply 4874 4875 // Perform 16 32-bit (4x4S) or 32 16-bit (4 x 8H) Montgomery 4876 // multiplications in parallel 4877 // 4878 4879 // See the 
  // class.
  //
  // Computes 4x4S results or 4x8H results
  //    a = b * c * 2^MONT_R_BITS mod MONT_Q
  // Inputs:  vb, vc - 4x4S or 4x8H vector register sequences
  //          vq - 2x4S or 2x8H constants <MONT_Q, MONT_Q_INV_MOD_R>
  // Temps:   vtmp - 4x4S or 4x8H vector sequence trashed after call
  // Outputs: va - 4x4S or 4x8H vector register sequences
  // vb, vc, vtmp and vq must all be disjoint
  // va must be disjoint from all other inputs/temps or must equal vc
  // va must have a non-zero delta i.e. it must not be a constant vseq.
  // n.b. MONT_R_BITS is 16 or 32, so the right shift by it is implicit.
  void vs_montmul4(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
                   Assembler::SIMD_Arrangement T,
                   const VSeq<4>& vtmp, const VSeq<2>& vq) {
    assert (T == __ T4S || T == __ T8H, "invalid arrangement for montmul");
    assert(vs_disjoint(vb, vc), "vb and vc overlap");
    assert(vs_disjoint(vb, vq), "vb and vq overlap");
    assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");

    assert(vs_disjoint(vc, vq), "vc and vq overlap");
    assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");

    assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");

    assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
    assert(vs_disjoint(va, vb), "va and vb overlap");
    assert(vs_disjoint(va, vq), "va and vq overlap");
    assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
    assert(!va.is_constant(), "output vector must identify 4 different registers");

    // schedule 4 streams of instructions across the vector sequences
    for (int i = 0; i < 4; i++) {
      __ sqdmulh(vtmp[i], T, vb[i], vc[i]); // aHigh = hi32(2 * b * c)
      __ mulv(va[i], T, vb[i], vc[i]);      // aLow = lo32(b * c)
    }

    for (int i = 0; i < 4; i++) {
      __ mulv(va[i], T, va[i], vq[0]);      // m = aLow * qinv
    }

    for (int i = 0; i < 4; i++) {
      __ sqdmulh(va[i], T, va[i], vq[1]);   // n = hi32(2 * m * q)
    }

    for (int i = 0; i < 4; i++) {
      __ shsubv(va[i], T, vtmp[i], va[i]);  // a = (aHigh - n) / 2
    }
  }

  // Perform 8 32-bit (2x4S) or 16 16-bit (2x8H) Montgomery
  // multiplications in parallel
  //

  // See the montMul() method of the sun.security.provider.ML_DSA
  // class.
  //
  // Computes 2x4S results or 2x8H results
  //    a = b * c * 2^MONT_R_BITS mod MONT_Q
  // Inputs:  vb, vc - 2x4S or 2x8H vector register sequences
  //          vq - 2x4S or 2x8H constants <MONT_Q, MONT_Q_INV_MOD_R>
  // Temps:   vtmp - 2x4S or 2x8H vector sequence trashed after call
  // Outputs: va - 2x4S or 2x8H vector register sequences
  // vb, vc, vtmp and vq must all be disjoint
  // va must be disjoint from all other inputs/temps or must equal vc
  // va must have a non-zero delta i.e. it must not be a constant vseq.
  // n.b. MONT_R_BITS is 16 or 32, so the right shift by it is implicit.
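  // For illustration only (not emitted by the stubs): a scalar sketch of
  // the reduction each lane of these montmul helpers performs, using the
  // hypothetical names q == MONT_Q and qinv == MONT_Q_INV_MOD_R for the
  // two constants held in vq, shown here for the 16-bit (8H) case:
  //
  //   int16_t montmul_scalar(int16_t b, int16_t c, int16_t q, int16_t qinv) {
  //     int32_t prod = (int32_t)b * c;                    // mulv/sqdmulh operands
  //     int16_t m = (int16_t)((int16_t)prod * qinv);      // m = lo16(b * c) * qinv mod 2^16
  //     return (int16_t)((prod - (int32_t)m * q) >> 16);  // (b * c - m * q) / 2^16
  //   }
  //
  // The vector code reaches the same value without a 32-bit subtract by
  // forming hi16(2 * b * c) and hi16(2 * m * q) with sqdmulh and halving
  // their difference with shsubv.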
4947 void vs_montmul2(const VSeq<2>& va, const VSeq<2>& vb, const VSeq<2>& vc, 4948 Assembler::SIMD_Arrangement T, 4949 const VSeq<2>& vtmp, const VSeq<2>& vq) { 4950 assert (T == __ T4S || T == __ T8H, "invalid arrangement for montmul"); 4951 assert(vs_disjoint(vb, vc), "vb and vc overlap"); 4952 assert(vs_disjoint(vb, vq), "vb and vq overlap"); 4953 assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap"); 4954 4955 assert(vs_disjoint(vc, vq), "vc and vq overlap"); 4956 assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap"); 4957 4958 assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap"); 4959 4960 assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal"); 4961 assert(vs_disjoint(va, vb), "va and vb overlap"); 4962 assert(vs_disjoint(va, vq), "va and vq overlap"); 4963 assert(vs_disjoint(va, vtmp), "va and vtmp overlap"); 4964 assert(!va.is_constant(), "output vector must identify 2 different registers"); 4965 4966 // schedule 2 streams of instructions across the vector sequences 4967 for (int i = 0; i < 2; i++) { 4968 __ sqdmulh(vtmp[i], T, vb[i], vc[i]); // aHigh = hi32(2 * b * c) 4969 __ mulv(va[i], T, vb[i], vc[i]); // aLow = lo32(b * c) 4970 } 4971 4972 for (int i = 0; i < 2; i++) { 4973 __ mulv(va[i], T, va[i], vq[0]); // m = aLow * qinv 4974 } 4975 4976 for (int i = 0; i < 2; i++) { 4977 __ sqdmulh(va[i], T, va[i], vq[1]); // n = hi32(2 * m * q) 4978 } 4979 4980 for (int i = 0; i < 2; i++) { 4981 __ shsubv(va[i], T, vtmp[i], va[i]); // a = (aHigh - n) / 2 4982 } 4983 } 4984 4985 // Perform 16 16-bit Montgomery multiplications in parallel. 4986 void kyber_montmul16(const VSeq<2>& va, const VSeq<2>& vb, const VSeq<2>& vc, 4987 const VSeq<2>& vtmp, const VSeq<2>& vq) { 4988 // Use the helper routine to schedule a 2x8H Montgomery multiply. 4989 // It will assert that the register use is valid 4990 vs_montmul2(va, vb, vc, __ T8H, vtmp, vq); 4991 } 4992 4993 // Perform 32 16-bit Montgomery multiplications in parallel. 4994 void kyber_montmul32(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc, 4995 const VSeq<4>& vtmp, const VSeq<2>& vq) { 4996 // Use the helper routine to schedule a 4x8H Montgomery multiply. 4997 // It will assert that the register use is valid 4998 vs_montmul4(va, vb, vc, __ T8H, vtmp, vq); 4999 } 5000 5001 // Perform 64 16-bit Montgomery multiplications in parallel. 5002 void kyber_montmul64(const VSeq<8>& va, const VSeq<8>& vb, const VSeq<8>& vc, 5003 const VSeq<4>& vtmp, const VSeq<2>& vq) { 5004 // Schedule two successive 4x8H multiplies via the montmul helper 5005 // on the front and back halves of va, vb and vc. The helper will 5006 // assert that the register use has no overlap conflicts on each 5007 // individual call but we also need to ensure that the necessary 5008 // disjoint/equality constraints are met across both calls. 5009 5010 // vb, vc, vtmp and vq must be disjoint. 
va must either be 5011 // disjoint from all other registers or equal vc 5012 5013 assert(vs_disjoint(vb, vc), "vb and vc overlap"); 5014 assert(vs_disjoint(vb, vq), "vb and vq overlap"); 5015 assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap"); 5016 5017 assert(vs_disjoint(vc, vq), "vc and vq overlap"); 5018 assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap"); 5019 5020 assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap"); 5021 5022 assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal"); 5023 assert(vs_disjoint(va, vb), "va and vb overlap"); 5024 assert(vs_disjoint(va, vq), "va and vq overlap"); 5025 assert(vs_disjoint(va, vtmp), "va and vtmp overlap"); 5026 5027 // we multiply the front and back halves of each sequence 4 at a 5028 // time because 5029 // 5030 // 1) we are currently only able to get 4-way instruction 5031 // parallelism at best 5032 // 5033 // 2) we need registers for the constants in vq and temporary 5034 // scratch registers to hold intermediate results so vtmp can only 5035 // be a VSeq<4> which means we only have 4 scratch slots 5036 5037 vs_montmul4(vs_front(va), vs_front(vb), vs_front(vc), __ T8H, vtmp, vq); 5038 vs_montmul4(vs_back(va), vs_back(vb), vs_back(vc), __ T8H, vtmp, vq); 5039 } 5040 5041 void kyber_montmul32_sub_add(const VSeq<4>& va0, const VSeq<4>& va1, 5042 const VSeq<4>& vc, 5043 const VSeq<4>& vtmp, 5044 const VSeq<2>& vq) { 5045 // compute a = montmul(a1, c) 5046 kyber_montmul32(vc, va1, vc, vtmp, vq); 5047 // ouptut a1 = a0 - a 5048 vs_subv(va1, __ T8H, va0, vc); 5049 // and a0 = a0 + a 5050 vs_addv(va0, __ T8H, va0, vc); 5051 } 5052 5053 void kyber_sub_add_montmul32(const VSeq<4>& va0, const VSeq<4>& va1, 5054 const VSeq<4>& vb, 5055 const VSeq<4>& vtmp1, 5056 const VSeq<4>& vtmp2, 5057 const VSeq<2>& vq) { 5058 // compute c = a0 - a1 5059 vs_subv(vtmp1, __ T8H, va0, va1); 5060 // output a0 = a0 + a1 5061 vs_addv(va0, __ T8H, va0, va1); 5062 // output a1 = b montmul c 5063 kyber_montmul32(va1, vtmp1, vb, vtmp2, vq); 5064 } 5065 5066 void load64shorts(const VSeq<8>& v, Register shorts) { 5067 vs_ldpq_post(v, shorts); 5068 } 5069 5070 void load32shorts(const VSeq<4>& v, Register shorts) { 5071 vs_ldpq_post(v, shorts); 5072 } 5073 5074 void store64shorts(VSeq<8> v, Register tmpAddr) { 5075 vs_stpq_post(v, tmpAddr); 5076 } 5077 5078 // Kyber NTT function. 5079 // Implements 5080 // static int implKyberNtt(short[] poly, short[] ntt_zetas) {} 5081 // 5082 // coeffs (short[256]) = c_rarg0 5083 // ntt_zetas (short[256]) = c_rarg1 5084 address generate_kyberNtt() { 5085 5086 __ align(CodeEntryAlignment); 5087 StubGenStubId stub_id = StubGenStubId::kyberNtt_id; 5088 StubCodeMark mark(this, stub_id); 5089 address start = __ pc(); 5090 __ enter(); 5091 5092 const Register coeffs = c_rarg0; 5093 const Register zetas = c_rarg1; 5094 5095 const Register kyberConsts = r10; 5096 const Register tmpAddr = r11; 5097 5098 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x8H inputs/outputs 5099 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3 5100 VSeq<2> vq(30); // n.b. constants overlap vs3 5101 5102 __ lea(kyberConsts, ExternalAddress((address) StubRoutines::aarch64::_kyberConsts)); 5103 // load the montmul constants 5104 vs_ldpq(vq, kyberConsts); 5105 5106 // Each level corresponds to an iteration of the outermost loop of the 5107 // Java method seilerNTT(int[] coeffs). There are some differences 5108 // from what is done in the seilerNTT() method, though: 5109 // 1. 
The computation uses 16-bit signed values; we do not convert them
    // to ints here.
    // 2. The zetas are delivered in a larger array: 128 zetas are stored
    // for each level, which makes it easier to fill up the vector
    // registers.
    // 3. In the seilerNTT() method we use R = 2^20 for the Montgomery
    // multiplications (that way there should not be any overflow during
    // the inverse NTT computation); here we use R = 2^16 so that we can
    // use 16-bit arithmetic in the vector unit.
    //
    // On each level, we fill up the vector registers in such a way that the
    // array elements that need to be multiplied by the zetas go into one
    // set of vector registers while the corresponding ones that don't need
    // to be multiplied go into another set.
    // We can do 32 Montgomery multiplications in parallel, using 12 vector
    // registers and interleaving the steps of 4 identical computations,
    // each done on 8 16-bit values per register.

    // At levels 0-3 the coefficients that are multiplied by the zetas, and
    // those that the products are added to or subtracted from, occur in
    // discrete blocks whose size is some multiple of 32.

    // level 0
    __ add(tmpAddr, coeffs, 256);
    load64shorts(vs1, tmpAddr);
    load64shorts(vs2, zetas);
    kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
    __ add(tmpAddr, coeffs, 0);
    load64shorts(vs1, tmpAddr);
    vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
    vs_addv(vs1, __ T8H, vs1, vs2);
    __ add(tmpAddr, coeffs, 0);
    vs_stpq_post(vs1, tmpAddr);
    __ add(tmpAddr, coeffs, 256);
    vs_stpq_post(vs3, tmpAddr);
    // restore montmul constants
    vs_ldpq(vq, kyberConsts);
    load64shorts(vs1, tmpAddr);
    load64shorts(vs2, zetas);
    kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
    __ add(tmpAddr, coeffs, 128);
    load64shorts(vs1, tmpAddr);
    vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
    vs_addv(vs1, __ T8H, vs1, vs2);
    __ add(tmpAddr, coeffs, 128);
    store64shorts(vs1, tmpAddr);
    __ add(tmpAddr, coeffs, 384);
    store64shorts(vs3, tmpAddr);

    // level 1
    // restore montmul constants
    vs_ldpq(vq, kyberConsts);
    __ add(tmpAddr, coeffs, 128);
    load64shorts(vs1, tmpAddr);
    load64shorts(vs2, zetas);
    kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
    __ add(tmpAddr, coeffs, 0);
    load64shorts(vs1, tmpAddr);
    vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
    vs_addv(vs1, __ T8H, vs1, vs2);
    __ add(tmpAddr, coeffs, 0);
    store64shorts(vs1, tmpAddr);
    store64shorts(vs3, tmpAddr);
    vs_ldpq(vq, kyberConsts);
    __ add(tmpAddr, coeffs, 384);
    load64shorts(vs1, tmpAddr);
    load64shorts(vs2, zetas);
    kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
    __ add(tmpAddr, coeffs, 256);
    load64shorts(vs1, tmpAddr);
    vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
    vs_addv(vs1, __ T8H, vs1, vs2);
    __ add(tmpAddr, coeffs, 256);
    store64shorts(vs1, tmpAddr);
    store64shorts(vs3, tmpAddr);

    // level 2
    vs_ldpq(vq, kyberConsts);
    int offsets1[4] = { 0, 32, 128, 160 };
    vs_ldpq_indexed(vs1, coeffs, 64, offsets1);
    load64shorts(vs2, zetas);
    kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
    vs_ldpq_indexed(vs1, coeffs, 0, offsets1);
    // kyber_subv_addv64();
    vs_subv(vs3, __ T8H, vs1, vs2); // n.b.
trashes vq 5194 vs_addv(vs1, __ T8H, vs1, vs2); 5195 __ add(tmpAddr, coeffs, 0); 5196 vs_stpq_post(vs_front(vs1), tmpAddr); 5197 vs_stpq_post(vs_front(vs3), tmpAddr); 5198 vs_stpq_post(vs_back(vs1), tmpAddr); 5199 vs_stpq_post(vs_back(vs3), tmpAddr); 5200 vs_ldpq(vq, kyberConsts); 5201 vs_ldpq_indexed(vs1, tmpAddr, 64, offsets1); 5202 load64shorts(vs2, zetas); 5203 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5204 vs_ldpq_indexed(vs1, coeffs, 256, offsets1); 5205 // kyber_subv_addv64(); 5206 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5207 vs_addv(vs1, __ T8H, vs1, vs2); 5208 __ add(tmpAddr, coeffs, 256); 5209 vs_stpq_post(vs_front(vs1), tmpAddr); 5210 vs_stpq_post(vs_front(vs3), tmpAddr); 5211 vs_stpq_post(vs_back(vs1), tmpAddr); 5212 vs_stpq_post(vs_back(vs3), tmpAddr); 5213 5214 // level 3 5215 vs_ldpq(vq, kyberConsts); 5216 int offsets2[4] = { 0, 64, 128, 192 }; 5217 vs_ldpq_indexed(vs1, coeffs, 32, offsets2); 5218 load64shorts(vs2, zetas); 5219 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5220 vs_ldpq_indexed(vs1, coeffs, 0, offsets2); 5221 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5222 vs_addv(vs1, __ T8H, vs1, vs2); 5223 vs_stpq_indexed(vs1, coeffs, 0, offsets2); 5224 vs_stpq_indexed(vs3, coeffs, 32, offsets2); 5225 5226 vs_ldpq(vq, kyberConsts); 5227 vs_ldpq_indexed(vs1, coeffs, 256 + 32, offsets2); 5228 load64shorts(vs2, zetas); 5229 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5230 vs_ldpq_indexed(vs1, coeffs, 256, offsets2); 5231 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5232 vs_addv(vs1, __ T8H, vs1, vs2); 5233 vs_stpq_indexed(vs1, coeffs, 256, offsets2); 5234 vs_stpq_indexed(vs3, coeffs, 256 + 32, offsets2); 5235 5236 // level 4 5237 // At level 4 coefficients occur in 8 discrete blocks of size 16 5238 // so they are loaded using employing an ldr at 8 distinct offsets. 5239 5240 vs_ldpq(vq, kyberConsts); 5241 int offsets3[8] = { 0, 32, 64, 96, 128, 160, 192, 224 }; 5242 vs_ldr_indexed(vs1, __ Q, coeffs, 16, offsets3); 5243 load64shorts(vs2, zetas); 5244 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5245 vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3); 5246 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5247 vs_addv(vs1, __ T8H, vs1, vs2); 5248 vs_str_indexed(vs1, __ Q, coeffs, 0, offsets3); 5249 vs_str_indexed(vs3, __ Q, coeffs, 16, offsets3); 5250 5251 vs_ldpq(vq, kyberConsts); 5252 vs_ldr_indexed(vs1, __ Q, coeffs, 256 + 16, offsets3); 5253 load64shorts(vs2, zetas); 5254 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5255 vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3); 5256 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5257 vs_addv(vs1, __ T8H, vs1, vs2); 5258 vs_str_indexed(vs1, __ Q, coeffs, 256, offsets3); 5259 vs_str_indexed(vs3, __ Q, coeffs, 256 + 16, offsets3); 5260 5261 // level 5 5262 // At level 5 related coefficients occur in discrete blocks of size 8 so 5263 // need to be loaded interleaved using an ld2 operation with arrangement 2D. 
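    // For illustration only (hypothetical element labels): each ld2 issued
    // by vs_ld2_indexed with arrangement 2D below reads 32 bytes as four
    // doubleword elements d0 d1 d2 d3, each holding 4 adjacent shorts, and
    // splits them across a register pair as
    //
    //   first register of the pair  <- { d0, d2 }
    //   second register of the pair <- { d1, d3 }
    //
    // so the coefficients passed to kyber_montmul32_sub_add for multiplication
    // by the zetas come from vs_odd(vs1) while their add/sub partners sit in
    // vs_even(vs1).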
5264 5265 vs_ldpq(vq, kyberConsts); 5266 int offsets4[4] = { 0, 32, 64, 96 }; 5267 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4); 5268 load32shorts(vs_front(vs2), zetas); 5269 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq); 5270 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4); 5271 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4); 5272 load32shorts(vs_front(vs2), zetas); 5273 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq); 5274 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4); 5275 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4); 5276 load32shorts(vs_front(vs2), zetas); 5277 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq); 5278 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4); 5279 5280 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4); 5281 load32shorts(vs_front(vs2), zetas); 5282 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq); 5283 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4); 5284 5285 // level 6 5286 // At level 6 related coefficients occur in discrete blocks of size 4 so 5287 // need to be loaded interleaved using an ld2 operation with arrangement 4S. 5288 5289 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4); 5290 load32shorts(vs_front(vs2), zetas); 5291 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq); 5292 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4); 5293 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4); 5294 // __ ldpq(v18, v19, __ post(zetas, 32)); 5295 load32shorts(vs_front(vs2), zetas); 5296 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq); 5297 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4); 5298 5299 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4); 5300 load32shorts(vs_front(vs2), zetas); 5301 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq); 5302 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4); 5303 5304 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4); 5305 load32shorts(vs_front(vs2), zetas); 5306 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq); 5307 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4); 5308 5309 __ leave(); // required for proper stackwalking of RuntimeStub frame 5310 __ mov(r0, zr); // return 0 5311 __ ret(lr); 5312 5313 return start; 5314 } 5315 5316 // Kyber Inverse NTT function 5317 // Implements 5318 // static int implKyberInverseNtt(short[] poly, short[] zetas) {} 5319 // 5320 // coeffs (short[256]) = c_rarg0 5321 // ntt_zetas (short[256]) = c_rarg1 5322 address generate_kyberInverseNtt() { 5323 5324 __ align(CodeEntryAlignment); 5325 StubGenStubId stub_id = StubGenStubId::kyberInverseNtt_id; 5326 StubCodeMark mark(this, stub_id); 5327 address start = __ pc(); 5328 __ enter(); 5329 5330 const Register coeffs = c_rarg0; 5331 const Register zetas = c_rarg1; 5332 5333 const Register kyberConsts = r10; 5334 const Register tmpAddr = r11; 5335 const Register tmpAddr2 = c_rarg2; 5336 5337 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x8H inputs/outputs 5338 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3 5339 VSeq<2> vq(30); // n.b. 
constants overlap vs3 5340 5341 __ lea(kyberConsts, 5342 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts)); 5343 5344 // level 0 5345 // At level 0 related coefficients occur in discrete blocks of size 4 so 5346 // need to be loaded interleaved using an ld2 operation with arrangement 4S. 5347 5348 vs_ldpq(vq, kyberConsts); 5349 int offsets4[4] = { 0, 32, 64, 96 }; 5350 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4); 5351 load32shorts(vs_front(vs2), zetas); 5352 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1), 5353 vs_front(vs2), vs_back(vs2), vtmp, vq); 5354 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4); 5355 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4); 5356 load32shorts(vs_front(vs2), zetas); 5357 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1), 5358 vs_front(vs2), vs_back(vs2), vtmp, vq); 5359 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4); 5360 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4); 5361 load32shorts(vs_front(vs2), zetas); 5362 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1), 5363 vs_front(vs2), vs_back(vs2), vtmp, vq); 5364 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4); 5365 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4); 5366 load32shorts(vs_front(vs2), zetas); 5367 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1), 5368 vs_front(vs2), vs_back(vs2), vtmp, vq); 5369 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4); 5370 5371 // level 1 5372 // At level 1 related coefficients occur in discrete blocks of size 8 so 5373 // need to be loaded interleaved using an ld2 operation with arrangement 2D. 5374 5375 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4); 5376 load32shorts(vs_front(vs2), zetas); 5377 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1), 5378 vs_front(vs2), vs_back(vs2), vtmp, vq); 5379 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4); 5380 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4); 5381 load32shorts(vs_front(vs2), zetas); 5382 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1), 5383 vs_front(vs2), vs_back(vs2), vtmp, vq); 5384 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4); 5385 5386 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4); 5387 load32shorts(vs_front(vs2), zetas); 5388 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1), 5389 vs_front(vs2), vs_back(vs2), vtmp, vq); 5390 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4); 5391 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4); 5392 load32shorts(vs_front(vs2), zetas); 5393 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1), 5394 vs_front(vs2), vs_back(vs2), vtmp, vq); 5395 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4); 5396 5397 // level 2 5398 // At level 2 coefficients occur in 8 discrete blocks of size 16 5399 // so they are loaded using employing an ldr at 8 distinct offsets. 5400 5401 int offsets3[8] = { 0, 32, 64, 96, 128, 160, 192, 224 }; 5402 vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3); 5403 vs_ldr_indexed(vs2, __ Q, coeffs, 16, offsets3); 5404 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. 
trashes vq 5405 vs_subv(vs1, __ T8H, vs1, vs2); 5406 vs_str_indexed(vs3, __ Q, coeffs, 0, offsets3); 5407 load64shorts(vs2, zetas); 5408 vs_ldpq(vq, kyberConsts); 5409 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5410 vs_str_indexed(vs2, __ Q, coeffs, 16, offsets3); 5411 5412 vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3); 5413 vs_ldr_indexed(vs2, __ Q, coeffs, 256 + 16, offsets3); 5414 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5415 vs_subv(vs1, __ T8H, vs1, vs2); 5416 vs_str_indexed(vs3, __ Q, coeffs, 256, offsets3); 5417 load64shorts(vs2, zetas); 5418 vs_ldpq(vq, kyberConsts); 5419 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5420 vs_str_indexed(vs2, __ Q, coeffs, 256 + 16, offsets3); 5421 5422 // Barrett reduction at indexes where overflow may happen 5423 5424 // load q and the multiplier for the Barrett reduction 5425 __ add(tmpAddr, kyberConsts, 16); 5426 vs_ldpq(vq, tmpAddr); 5427 5428 VSeq<8> vq1 = VSeq<8>(vq[0], 0); // 2 constant 8 sequences 5429 VSeq<8> vq2 = VSeq<8>(vq[1], 0); // for above two kyber constants 5430 VSeq<8> vq3 = VSeq<8>(v29, 0); // 3rd sequence for const montmul 5431 vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3); 5432 vs_sqdmulh(vs2, __ T8H, vs1, vq2); 5433 vs_sshr(vs2, __ T8H, vs2, 11); 5434 vs_mlsv(vs1, __ T8H, vs2, vq1); 5435 vs_str_indexed(vs1, __ Q, coeffs, 0, offsets3); 5436 vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3); 5437 vs_sqdmulh(vs2, __ T8H, vs1, vq2); 5438 vs_sshr(vs2, __ T8H, vs2, 11); 5439 vs_mlsv(vs1, __ T8H, vs2, vq1); 5440 vs_str_indexed(vs1, __ Q, coeffs, 256, offsets3); 5441 5442 // level 3 5443 // From level 3 upwards coefficients occur in discrete blocks whose size is 5444 // some multiple of 32 so can be loaded using ldpq and suitable indexes. 5445 5446 int offsets2[4] = { 0, 64, 128, 192 }; 5447 vs_ldpq_indexed(vs1, coeffs, 0, offsets2); 5448 vs_ldpq_indexed(vs2, coeffs, 32, offsets2); 5449 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5450 vs_subv(vs1, __ T8H, vs1, vs2); 5451 vs_stpq_indexed(vs3, coeffs, 0, offsets2); 5452 load64shorts(vs2, zetas); 5453 vs_ldpq(vq, kyberConsts); 5454 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5455 vs_stpq_indexed(vs2, coeffs, 32, offsets2); 5456 5457 vs_ldpq_indexed(vs1, coeffs, 256, offsets2); 5458 vs_ldpq_indexed(vs2, coeffs, 256 + 32, offsets2); 5459 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5460 vs_subv(vs1, __ T8H, vs1, vs2); 5461 vs_stpq_indexed(vs3, coeffs, 256, offsets2); 5462 load64shorts(vs2, zetas); 5463 vs_ldpq(vq, kyberConsts); 5464 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5465 vs_stpq_indexed(vs2, coeffs, 256 + 32, offsets2); 5466 5467 // level 4 5468 5469 int offsets1[4] = { 0, 32, 128, 160 }; 5470 vs_ldpq_indexed(vs1, coeffs, 0, offsets1); 5471 vs_ldpq_indexed(vs2, coeffs, 64, offsets1); 5472 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5473 vs_subv(vs1, __ T8H, vs1, vs2); 5474 vs_stpq_indexed(vs3, coeffs, 0, offsets1); 5475 load64shorts(vs2, zetas); 5476 vs_ldpq(vq, kyberConsts); 5477 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5478 vs_stpq_indexed(vs2, coeffs, 64, offsets1); 5479 5480 vs_ldpq_indexed(vs1, coeffs, 256, offsets1); 5481 vs_ldpq_indexed(vs2, coeffs, 256 + 64, offsets1); 5482 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. 
trashes vq 5483 vs_subv(vs1, __ T8H, vs1, vs2); 5484 vs_stpq_indexed(vs3, coeffs, 256, offsets1); 5485 load64shorts(vs2, zetas); 5486 vs_ldpq(vq, kyberConsts); 5487 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5488 vs_stpq_indexed(vs2, coeffs, 256 + 64, offsets1); 5489 5490 // level 5 5491 5492 __ add(tmpAddr, coeffs, 0); 5493 load64shorts(vs1, tmpAddr); 5494 __ add(tmpAddr, coeffs, 128); 5495 load64shorts(vs2, tmpAddr); 5496 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5497 vs_subv(vs1, __ T8H, vs1, vs2); 5498 __ add(tmpAddr, coeffs, 0); 5499 store64shorts(vs3, tmpAddr); 5500 load64shorts(vs2, zetas); 5501 vs_ldpq(vq, kyberConsts); 5502 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5503 __ add(tmpAddr, coeffs, 128); 5504 store64shorts(vs2, tmpAddr); 5505 5506 load64shorts(vs1, tmpAddr); 5507 __ add(tmpAddr, coeffs, 384); 5508 load64shorts(vs2, tmpAddr); 5509 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5510 vs_subv(vs1, __ T8H, vs1, vs2); 5511 __ add(tmpAddr, coeffs, 256); 5512 store64shorts(vs3, tmpAddr); 5513 load64shorts(vs2, zetas); 5514 vs_ldpq(vq, kyberConsts); 5515 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5516 __ add(tmpAddr, coeffs, 384); 5517 store64shorts(vs2, tmpAddr); 5518 5519 // Barrett reduction at indexes where overflow may happen 5520 5521 // load q and the multiplier for the Barrett reduction 5522 __ add(tmpAddr, kyberConsts, 16); 5523 vs_ldpq(vq, tmpAddr); 5524 5525 int offsets0[2] = { 0, 256 }; 5526 vs_ldpq_indexed(vs_front(vs1), coeffs, 0, offsets0); 5527 vs_sqdmulh(vs2, __ T8H, vs1, vq2); 5528 vs_sshr(vs2, __ T8H, vs2, 11); 5529 vs_mlsv(vs1, __ T8H, vs2, vq1); 5530 vs_stpq_indexed(vs_front(vs1), coeffs, 0, offsets0); 5531 5532 // level 6 5533 5534 __ add(tmpAddr, coeffs, 0); 5535 load64shorts(vs1, tmpAddr); 5536 __ add(tmpAddr, coeffs, 256); 5537 load64shorts(vs2, tmpAddr); 5538 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5539 vs_subv(vs1, __ T8H, vs1, vs2); 5540 __ add(tmpAddr, coeffs, 0); 5541 store64shorts(vs3, tmpAddr); 5542 load64shorts(vs2, zetas); 5543 vs_ldpq(vq, kyberConsts); 5544 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5545 __ add(tmpAddr, coeffs, 256); 5546 store64shorts(vs2, tmpAddr); 5547 5548 __ add(tmpAddr, coeffs, 128); 5549 load64shorts(vs1, tmpAddr); 5550 __ add(tmpAddr, coeffs, 384); 5551 load64shorts(vs2, tmpAddr); 5552 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. 
trashes vq 5553 vs_subv(vs1, __ T8H, vs1, vs2); 5554 __ add(tmpAddr, coeffs, 128); 5555 store64shorts(vs3, tmpAddr); 5556 load64shorts(vs2, zetas); 5557 vs_ldpq(vq, kyberConsts); 5558 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5559 __ add(tmpAddr, coeffs, 384); 5560 store64shorts(vs2, tmpAddr); 5561 5562 // multiply by 2^-n 5563 5564 // load toMont(2^-n mod q) 5565 __ add(tmpAddr, kyberConsts, 48); 5566 __ ldr(v29, __ Q, tmpAddr); 5567 5568 vs_ldpq(vq, kyberConsts); 5569 __ add(tmpAddr, coeffs, 0); 5570 load64shorts(vs1, tmpAddr); 5571 kyber_montmul64(vs2, vs1, vq3, vtmp, vq); 5572 __ add(tmpAddr, coeffs, 0); 5573 store64shorts(vs2, tmpAddr); 5574 5575 // now tmpAddr contains coeffs + 128 because store64shorts adjusted it so 5576 load64shorts(vs1, tmpAddr); 5577 kyber_montmul64(vs2, vs1, vq3, vtmp, vq); 5578 __ add(tmpAddr, coeffs, 128); 5579 store64shorts(vs2, tmpAddr); 5580 5581 // now tmpAddr contains coeffs + 256 5582 load64shorts(vs1, tmpAddr); 5583 kyber_montmul64(vs2, vs1, vq3, vtmp, vq); 5584 __ add(tmpAddr, coeffs, 256); 5585 store64shorts(vs2, tmpAddr); 5586 5587 // now tmpAddr contains coeffs + 384 5588 load64shorts(vs1, tmpAddr); 5589 kyber_montmul64(vs2, vs1, vq3, vtmp, vq); 5590 __ add(tmpAddr, coeffs, 384); 5591 store64shorts(vs2, tmpAddr); 5592 5593 __ leave(); // required for proper stackwalking of RuntimeStub frame 5594 __ mov(r0, zr); // return 0 5595 __ ret(lr); 5596 5597 return start; 5598 } 5599 5600 // Kyber multiply polynomials in the NTT domain. 5601 // Implements 5602 // static int implKyberNttMult( 5603 // short[] result, short[] ntta, short[] nttb, short[] zetas) {} 5604 // 5605 // result (short[256]) = c_rarg0 5606 // ntta (short[256]) = c_rarg1 5607 // nttb (short[256]) = c_rarg2 5608 // zetas (short[128]) = c_rarg3 5609 address generate_kyberNttMult() { 5610 5611 __ align(CodeEntryAlignment); 5612 StubGenStubId stub_id = StubGenStubId::kyberNttMult_id; 5613 StubCodeMark mark(this, stub_id); 5614 address start = __ pc(); 5615 __ enter(); 5616 5617 const Register result = c_rarg0; 5618 const Register ntta = c_rarg1; 5619 const Register nttb = c_rarg2; 5620 const Register zetas = c_rarg3; 5621 5622 const Register kyberConsts = r10; 5623 const Register limit = r11; 5624 5625 VSeq<4> vs1(0), vs2(4); // 4 sets of 8x8H inputs/outputs/tmps 5626 VSeq<4> vs3(16), vs4(20); 5627 VSeq<2> vq(30); // pair of constants for montmul: q, qinv 5628 VSeq<2> vz(28); // pair of zetas 5629 VSeq<4> vc(27, 0); // constant sequence for montmul: montRSquareModQ 5630 5631 __ lea(kyberConsts, 5632 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts)); 5633 5634 Label kyberNttMult_loop; 5635 5636 __ add(limit, result, 512); 5637 5638 // load q and qinv 5639 vs_ldpq(vq, kyberConsts); 5640 5641 // load R^2 mod q (to convert back from Montgomery representation) 5642 __ add(kyberConsts, kyberConsts, 64); 5643 __ ldr(v27, __ Q, kyberConsts); 5644 5645 __ BIND(kyberNttMult_loop); 5646 5647 // load 16 zetas 5648 vs_ldpq_post(vz, zetas); 5649 5650 // load 2 sets of 32 coefficients from the two input arrays 5651 // interleaved as shorts. i.e. pairs of shorts adjacent in memory 5652 // are striped across pairs of vector registers 5653 vs_ld2_post(vs_front(vs1), __ T8H, ntta); // <a0, a1> x 8H 5654 vs_ld2_post(vs_back(vs1), __ T8H, nttb); // <b0, b1> x 8H 5655 vs_ld2_post(vs_front(vs4), __ T8H, ntta); // <a2, a3> x 8H 5656 vs_ld2_post(vs_back(vs4), __ T8H, nttb); // <b2, b3> x 8H 5657 5658 // compute 4 montmul cross-products for pairs (a0,a1) and (b0,b1) 5659 // i.e. 
montmul the first and second halves of vs1 in order and 5660 // then with one sequence reversed storing the two results in vs3 5661 // 5662 // vs3[0] <- montmul(a0, b0) 5663 // vs3[1] <- montmul(a1, b1) 5664 // vs3[2] <- montmul(a0, b1) 5665 // vs3[3] <- montmul(a1, b0) 5666 kyber_montmul16(vs_front(vs3), vs_front(vs1), vs_back(vs1), vs_front(vs2), vq); 5667 kyber_montmul16(vs_back(vs3), 5668 vs_front(vs1), vs_reverse(vs_back(vs1)), vs_back(vs2), vq); 5669 5670 // compute 4 montmul cross-products for pairs (a2,a3) and (b2,b3) 5671 // i.e. montmul the first and second halves of vs4 in order and 5672 // then with one sequence reversed storing the two results in vs1 5673 // 5674 // vs1[0] <- montmul(a2, b2) 5675 // vs1[1] <- montmul(a3, b3) 5676 // vs1[2] <- montmul(a2, b3) 5677 // vs1[3] <- montmul(a3, b2) 5678 kyber_montmul16(vs_front(vs1), vs_front(vs4), vs_back(vs4), vs_front(vs2), vq); 5679 kyber_montmul16(vs_back(vs1), 5680 vs_front(vs4), vs_reverse(vs_back(vs4)), vs_back(vs2), vq); 5681 5682 // montmul result 2 of each cross-product i.e. (a1*b1, a3*b3) by a zeta. 5683 // We can schedule two montmuls at a time if we use a suitable vector 5684 // sequence <vs3[1], vs1[1]>. 5685 int delta = vs1[1]->encoding() - vs3[1]->encoding(); 5686 VSeq<2> vs5(vs3[1], delta); 5687 5688 // vs3[1] <- montmul(montmul(a1, b1), z0) 5689 // vs1[1] <- montmul(montmul(a3, b3), z1) 5690 kyber_montmul16(vs5, vz, vs5, vs_front(vs2), vq); 5691 5692 // add results in pairs storing in vs3 5693 // vs3[0] <- montmul(a0, b0) + montmul(montmul(a1, b1), z0); 5694 // vs3[1] <- montmul(a0, b1) + montmul(a1, b0); 5695 vs_addv(vs_front(vs3), __ T8H, vs_even(vs3), vs_odd(vs3)); 5696 5697 // vs3[2] <- montmul(a2, b2) + montmul(montmul(a3, b3), z1); 5698 // vs3[3] <- montmul(a2, b3) + montmul(a3, b2); 5699 vs_addv(vs_back(vs3), __ T8H, vs_even(vs1), vs_odd(vs1)); 5700 5701 // vs1 <- montmul(vs3, montRSquareModQ) 5702 kyber_montmul32(vs1, vs3, vc, vs2, vq); 5703 5704 // store back the two pairs of result vectors de-interleaved as 8H elements 5705 // i.e. storing each pairs of shorts striped across a register pair adjacent 5706 // in memory 5707 vs_st2_post(vs1, __ T8H, result); 5708 5709 __ cmp(result, limit); 5710 __ br(Assembler::NE, kyberNttMult_loop); 5711 5712 __ leave(); // required for proper stackwalking of RuntimeStub frame 5713 __ mov(r0, zr); // return 0 5714 __ ret(lr); 5715 5716 return start; 5717 } 5718 5719 // Kyber add 2 polynomials. 5720 // Implements 5721 // static int implKyberAddPoly(short[] result, short[] a, short[] b) {} 5722 // 5723 // result (short[256]) = c_rarg0 5724 // a (short[256]) = c_rarg1 5725 // b (short[256]) = c_rarg2 5726 address generate_kyberAddPoly_2() { 5727 5728 __ align(CodeEntryAlignment); 5729 StubGenStubId stub_id = StubGenStubId::kyberAddPoly_2_id; 5730 StubCodeMark mark(this, stub_id); 5731 address start = __ pc(); 5732 __ enter(); 5733 5734 const Register result = c_rarg0; 5735 const Register a = c_rarg1; 5736 const Register b = c_rarg2; 5737 5738 const Register kyberConsts = r11; 5739 5740 // We sum 256 sets of values in total i.e. 32 x 8H quadwords. 5741 // So, we can load, add and store the data in 3 groups of 11, 5742 // 11 and 10 at a time i.e. we need to map sets of 10 or 11 5743 // registers. A further constraint is that the mapping needs 5744 // to skip callee saves. So, we allocate the register 5745 // sequences using two 8 sequences, two 2 sequences and two 5746 // single registers. 
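    // For illustration only, a scalar sketch of what the generated loop
    // below computes, where q names the 16-bit constant loaded from the
    // constants table:
    //
    //   for (int i = 0; i < 256; i++) {
    //     result[i] = (short)(a[i] + b[i] + q);
    //   }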
5747 VSeq<8> vs1_1(0); 5748 VSeq<2> vs1_2(16); 5749 FloatRegister vs1_3 = v28; 5750 VSeq<8> vs2_1(18); 5751 VSeq<2> vs2_2(26); 5752 FloatRegister vs2_3 = v29; 5753 5754 // two constant vector sequences 5755 VSeq<8> vc_1(31, 0); 5756 VSeq<2> vc_2(31, 0); 5757 5758 FloatRegister vc_3 = v31; 5759 __ lea(kyberConsts, 5760 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts)); 5761 5762 __ ldr(vc_3, __ Q, Address(kyberConsts, 16)); // q 5763 for (int i = 0; i < 3; i++) { 5764 // load 80 or 88 values from a into vs1_1/2/3 5765 vs_ldpq_post(vs1_1, a); 5766 vs_ldpq_post(vs1_2, a); 5767 if (i < 2) { 5768 __ ldr(vs1_3, __ Q, __ post(a, 16)); 5769 } 5770 // load 80 or 88 values from b into vs2_1/2/3 5771 vs_ldpq_post(vs2_1, b); 5772 vs_ldpq_post(vs2_2, b); 5773 if (i < 2) { 5774 __ ldr(vs2_3, __ Q, __ post(b, 16)); 5775 } 5776 // sum 80 or 88 values across vs1 and vs2 into vs1 5777 vs_addv(vs1_1, __ T8H, vs1_1, vs2_1); 5778 vs_addv(vs1_2, __ T8H, vs1_2, vs2_2); 5779 if (i < 2) { 5780 __ addv(vs1_3, __ T8H, vs1_3, vs2_3); 5781 } 5782 // add constant to all 80 or 88 results 5783 vs_addv(vs1_1, __ T8H, vs1_1, vc_1); 5784 vs_addv(vs1_2, __ T8H, vs1_2, vc_2); 5785 if (i < 2) { 5786 __ addv(vs1_3, __ T8H, vs1_3, vc_3); 5787 } 5788 // store 80 or 88 values 5789 vs_stpq_post(vs1_1, result); 5790 vs_stpq_post(vs1_2, result); 5791 if (i < 2) { 5792 __ str(vs1_3, __ Q, __ post(result, 16)); 5793 } 5794 } 5795 5796 __ leave(); // required for proper stackwalking of RuntimeStub frame 5797 __ mov(r0, zr); // return 0 5798 __ ret(lr); 5799 5800 return start; 5801 } 5802 5803 // Kyber add 3 polynomials. 5804 // Implements 5805 // static int implKyberAddPoly(short[] result, short[] a, short[] b, short[] c) {} 5806 // 5807 // result (short[256]) = c_rarg0 5808 // a (short[256]) = c_rarg1 5809 // b (short[256]) = c_rarg2 5810 // c (short[256]) = c_rarg3 5811 address generate_kyberAddPoly_3() { 5812 5813 __ align(CodeEntryAlignment); 5814 StubGenStubId stub_id = StubGenStubId::kyberAddPoly_3_id; 5815 StubCodeMark mark(this, stub_id); 5816 address start = __ pc(); 5817 __ enter(); 5818 5819 const Register result = c_rarg0; 5820 const Register a = c_rarg1; 5821 const Register b = c_rarg2; 5822 const Register c = c_rarg3; 5823 5824 const Register kyberConsts = r11; 5825 5826 // As above we sum 256 sets of values in total i.e. 32 x 8H 5827 // quadwords. So, we can load, add and store the data in 3 5828 // groups of 11, 11 and 10 at a time i.e. we need to map sets 5829 // of 10 or 11 registers. A further constraint is that the 5830 // mapping needs to skip callee saves. So, we allocate the 5831 // register sequences using two 8 sequences, two 2 sequences 5832 // and two single registers. 
5833 VSeq<8> vs1_1(0); 5834 VSeq<2> vs1_2(16); 5835 FloatRegister vs1_3 = v28; 5836 VSeq<8> vs2_1(18); 5837 VSeq<2> vs2_2(26); 5838 FloatRegister vs2_3 = v29; 5839 5840 // two constant vector sequences 5841 VSeq<8> vc_1(31, 0); 5842 VSeq<2> vc_2(31, 0); 5843 5844 FloatRegister vc_3 = v31; 5845 5846 __ lea(kyberConsts, 5847 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts)); 5848 5849 __ ldr(vc_3, __ Q, Address(kyberConsts, 16)); // q 5850 for (int i = 0; i < 3; i++) { 5851 // load 80 or 88 values from a into vs1_1/2/3 5852 vs_ldpq_post(vs1_1, a); 5853 vs_ldpq_post(vs1_2, a); 5854 if (i < 2) { 5855 __ ldr(vs1_3, __ Q, __ post(a, 16)); 5856 } 5857 // load 80 or 88 values from b into vs2_1/2/3 5858 vs_ldpq_post(vs2_1, b); 5859 vs_ldpq_post(vs2_2, b); 5860 if (i < 2) { 5861 __ ldr(vs2_3, __ Q, __ post(b, 16)); 5862 } 5863 // sum 80 or 88 values across vs1 and vs2 into vs1 5864 vs_addv(vs1_1, __ T8H, vs1_1, vs2_1); 5865 vs_addv(vs1_2, __ T8H, vs1_2, vs2_2); 5866 if (i < 2) { 5867 __ addv(vs1_3, __ T8H, vs1_3, vs2_3); 5868 } 5869 // load 80 or 88 values from c into vs2_1/2/3 5870 vs_ldpq_post(vs2_1, c); 5871 vs_ldpq_post(vs2_2, c); 5872 if (i < 2) { 5873 __ ldr(vs2_3, __ Q, __ post(c, 16)); 5874 } 5875 // sum 80 or 88 values across vs1 and vs2 into vs1 5876 vs_addv(vs1_1, __ T8H, vs1_1, vs2_1); 5877 vs_addv(vs1_2, __ T8H, vs1_2, vs2_2); 5878 if (i < 2) { 5879 __ addv(vs1_3, __ T8H, vs1_3, vs2_3); 5880 } 5881 // add constant to all 80 or 88 results 5882 vs_addv(vs1_1, __ T8H, vs1_1, vc_1); 5883 vs_addv(vs1_2, __ T8H, vs1_2, vc_2); 5884 if (i < 2) { 5885 __ addv(vs1_3, __ T8H, vs1_3, vc_3); 5886 } 5887 // store 80 or 88 values 5888 vs_stpq_post(vs1_1, result); 5889 vs_stpq_post(vs1_2, result); 5890 if (i < 2) { 5891 __ str(vs1_3, __ Q, __ post(result, 16)); 5892 } 5893 } 5894 5895 __ leave(); // required for proper stackwalking of RuntimeStub frame 5896 __ mov(r0, zr); // return 0 5897 __ ret(lr); 5898 5899 return start; 5900 } 5901 5902 // Kyber parse XOF output to polynomial coefficient candidates 5903 // or decodePoly(12, ...). 5904 // Implements 5905 // static int implKyber12To16( 5906 // byte[] condensed, int index, short[] parsed, int parsedLength) {} 5907 // 5908 // (parsedLength or (parsedLength - 48) must be divisible by 64.) 5909 // 5910 // condensed (byte[]) = c_rarg0 5911 // condensedIndex = c_rarg1 5912 // parsed (short[112 or 256]) = c_rarg2 5913 // parsedLength (112 or 256) = c_rarg3 5914 address generate_kyber12To16() { 5915 Label L_F00, L_loop, L_end; 5916 5917 __ BIND(L_F00); 5918 __ emit_int64(0x0f000f000f000f00); 5919 __ emit_int64(0x0f000f000f000f00); 5920 5921 __ align(CodeEntryAlignment); 5922 StubGenStubId stub_id = StubGenStubId::kyber12To16_id; 5923 StubCodeMark mark(this, stub_id); 5924 address start = __ pc(); 5925 __ enter(); 5926 5927 const Register condensed = c_rarg0; 5928 const Register condensedOffs = c_rarg1; 5929 const Register parsed = c_rarg2; 5930 const Register parsedLength = c_rarg3; 5931 5932 const Register tmpAddr = r11; 5933 5934 // Data is input 96 bytes at a time i.e. in groups of 6 x 16B 5935 // quadwords so we need a 6 vector sequence for the inputs. 5936 // Parsing produces 64 shorts, employing two 8 vector 5937 // sequences to store and combine the intermediate data. 
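    // For illustration only, the scalar unpacking applied to each 3-byte
    // group (b0, b1, b2) that holds a pair of 12-bit values:
    //
    //   s0 = (short)((b0 & 0xff) | ((b1 & 0x0f) << 8));        // first value of the pair
    //   s1 = (short)(((b1 >> 4) & 0x0f) | ((b2 & 0xff) << 4)); // second value of the pair
    //
    // The vector code below performs 64 such conversions per loop iteration
    // (48 for the optional final block).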
5938 VSeq<6> vin(24); 5939 VSeq<8> va(0), vb(16); 5940 5941 __ adr(tmpAddr, L_F00); 5942 __ ldr(v31, __ Q, tmpAddr); // 8H times 0x0f00 5943 __ add(condensed, condensed, condensedOffs); 5944 5945 __ BIND(L_loop); 5946 // load 96 (6 x 16B) byte values 5947 vs_ld3_post(vin, __ T16B, condensed); 5948 5949 // The front half of sequence vin (vin[0], vin[1] and vin[2]) 5950 // holds 48 (16x3) contiguous bytes from memory striped 5951 // horizontally across each of the 16 byte lanes. Equivalently, 5952 // that is 16 pairs of 12-bit integers. Likewise the back half 5953 // holds the next 48 bytes in the same arrangement. 5954 5955 // Each vector in the front half can also be viewed as a vertical 5956 // strip across the 16 pairs of 12 bit integers. Each byte in 5957 // vin[0] stores the low 8 bits of the first int in a pair. Each 5958 // byte in vin[1] stores the high 4 bits of the first int and the 5959 // low 4 bits of the second int. Each byte in vin[2] stores the 5960 // high 8 bits of the second int. Likewise the vectors in second 5961 // half. 5962 5963 // Converting the data to 16-bit shorts requires first of all 5964 // expanding each of the 6 x 16B vectors into 6 corresponding 5965 // pairs of 8H vectors. Mask, shift and add operations on the 5966 // resulting vector pairs can be used to combine 4 and 8 bit 5967 // parts of related 8H vector elements. 5968 // 5969 // The middle vectors (vin[2] and vin[5]) are actually expanded 5970 // twice, one copy manipulated to provide the lower 4 bits 5971 // belonging to the first short in a pair and another copy 5972 // manipulated to provide the higher 4 bits belonging to the 5973 // second short in a pair. This is why the the vector sequences va 5974 // and vb used to hold the expanded 8H elements are of length 8. 5975 5976 // Expand vin[0] into va[0:1], and vin[1] into va[2:3] and va[4:5] 5977 // n.b. target elements 2 and 3 duplicate elements 4 and 5 5978 __ ushll(va[0], __ T8H, vin[0], __ T8B, 0); 5979 __ ushll2(va[1], __ T8H, vin[0], __ T16B, 0); 5980 __ ushll(va[2], __ T8H, vin[1], __ T8B, 0); 5981 __ ushll2(va[3], __ T8H, vin[1], __ T16B, 0); 5982 __ ushll(va[4], __ T8H, vin[1], __ T8B, 0); 5983 __ ushll2(va[5], __ T8H, vin[1], __ T16B, 0); 5984 5985 // likewise expand vin[3] into vb[0:1], and vin[4] into vb[2:3] 5986 // and vb[4:5] 5987 __ ushll(vb[0], __ T8H, vin[3], __ T8B, 0); 5988 __ ushll2(vb[1], __ T8H, vin[3], __ T16B, 0); 5989 __ ushll(vb[2], __ T8H, vin[4], __ T8B, 0); 5990 __ ushll2(vb[3], __ T8H, vin[4], __ T16B, 0); 5991 __ ushll(vb[4], __ T8H, vin[4], __ T8B, 0); 5992 __ ushll2(vb[5], __ T8H, vin[4], __ T16B, 0); 5993 5994 // shift lo byte of copy 1 of the middle stripe into the high byte 5995 __ shl(va[2], __ T8H, va[2], 8); 5996 __ shl(va[3], __ T8H, va[3], 8); 5997 __ shl(vb[2], __ T8H, vb[2], 8); 5998 __ shl(vb[3], __ T8H, vb[3], 8); 5999 6000 // expand vin[2] into va[6:7] and vin[5] into vb[6:7] but this 6001 // time pre-shifted by 4 to ensure top bits of input 12-bit int 6002 // are in bit positions [4..11]. 
6003 __ ushll(va[6], __ T8H, vin[2], __ T8B, 4); 6004 __ ushll2(va[7], __ T8H, vin[2], __ T16B, 4); 6005 __ ushll(vb[6], __ T8H, vin[5], __ T8B, 4); 6006 __ ushll2(vb[7], __ T8H, vin[5], __ T16B, 4); 6007 6008 // mask hi 4 bits of the 1st 12-bit int in a pair from copy1 and 6009 // shift lo 4 bits of the 2nd 12-bit int in a pair to the bottom of 6010 // copy2 6011 __ andr(va[2], __ T16B, va[2], v31); 6012 __ andr(va[3], __ T16B, va[3], v31); 6013 __ ushr(va[4], __ T8H, va[4], 4); 6014 __ ushr(va[5], __ T8H, va[5], 4); 6015 __ andr(vb[2], __ T16B, vb[2], v31); 6016 __ andr(vb[3], __ T16B, vb[3], v31); 6017 __ ushr(vb[4], __ T8H, vb[4], 4); 6018 __ ushr(vb[5], __ T8H, vb[5], 4); 6019 6020 // sum hi 4 bits and lo 8 bits of the 1st 12-bit int in each pair and 6021 // hi 8 bits plus lo 4 bits of the 2nd 12-bit int in each pair 6022 // n.b. the ordering ensures: i) inputs are consumed before they 6023 // are overwritten ii) the order of 16-bit results across successive 6024 // pairs of vectors in va and then vb reflects the order of the 6025 // corresponding 12-bit inputs 6026 __ addv(va[0], __ T8H, va[0], va[2]); 6027 __ addv(va[2], __ T8H, va[1], va[3]); 6028 __ addv(va[1], __ T8H, va[4], va[6]); 6029 __ addv(va[3], __ T8H, va[5], va[7]); 6030 __ addv(vb[0], __ T8H, vb[0], vb[2]); 6031 __ addv(vb[2], __ T8H, vb[1], vb[3]); 6032 __ addv(vb[1], __ T8H, vb[4], vb[6]); 6033 __ addv(vb[3], __ T8H, vb[5], vb[7]); 6034 6035 // store 64 results interleaved as shorts 6036 vs_st2_post(vs_front(va), __ T8H, parsed); 6037 vs_st2_post(vs_front(vb), __ T8H, parsed); 6038 6039 __ sub(parsedLength, parsedLength, 64); 6040 __ cmp(parsedLength, (u1)64); 6041 __ br(Assembler::GE, L_loop); 6042 __ cbz(parsedLength, L_end); 6043 6044 // if anything is left it should be a final 72 bytes of input 6045 // i.e. a final 48 12-bit values. so we handle this by loading 6046 // 48 bytes into all 16B lanes of front(vin) and only 24 6047 // bytes into the lower 8B lane of back(vin) 6048 vs_ld3_post(vs_front(vin), __ T16B, condensed); 6049 vs_ld3(vs_back(vin), __ T8B, condensed); 6050 6051 // Expand vin[0] into va[0:1], and vin[1] into va[2:3] and va[4:5] 6052 // n.b. target elements 2 and 3 of va duplicate elements 4 and 6053 // 5 and target element 2 of vb duplicates element 4. 6054 __ ushll(va[0], __ T8H, vin[0], __ T8B, 0); 6055 __ ushll2(va[1], __ T8H, vin[0], __ T16B, 0); 6056 __ ushll(va[2], __ T8H, vin[1], __ T8B, 0); 6057 __ ushll2(va[3], __ T8H, vin[1], __ T16B, 0); 6058 __ ushll(va[4], __ T8H, vin[1], __ T8B, 0); 6059 __ ushll2(va[5], __ T8H, vin[1], __ T16B, 0); 6060 6061 // This time expand just the lower 8 lanes 6062 __ ushll(vb[0], __ T8H, vin[3], __ T8B, 0); 6063 __ ushll(vb[2], __ T8H, vin[4], __ T8B, 0); 6064 __ ushll(vb[4], __ T8H, vin[4], __ T8B, 0); 6065 6066 // shift lo byte of copy 1 of the middle stripe into the high byte 6067 __ shl(va[2], __ T8H, va[2], 8); 6068 __ shl(va[3], __ T8H, va[3], 8); 6069 __ shl(vb[2], __ T8H, vb[2], 8); 6070 6071 // expand vin[2] into va[6:7] and lower 8 lanes of vin[5] into 6072 // vb[6] pre-shifted by 4 to ensure top bits of the input 12-bit 6073 // int are in bit positions [4..11]. 
6074 __ ushll(va[6], __ T8H, vin[2], __ T8B, 4); 6075 __ ushll2(va[7], __ T8H, vin[2], __ T16B, 4); 6076 __ ushll(vb[6], __ T8H, vin[5], __ T8B, 4); 6077 6078 // mask hi 4 bits of each 1st 12-bit int in pair from copy1 and 6079 // shift lo 4 bits of each 2nd 12-bit int in pair to bottom of 6080 // copy2 6081 __ andr(va[2], __ T16B, va[2], v31); 6082 __ andr(va[3], __ T16B, va[3], v31); 6083 __ ushr(va[4], __ T8H, va[4], 4); 6084 __ ushr(va[5], __ T8H, va[5], 4); 6085 __ andr(vb[2], __ T16B, vb[2], v31); 6086 __ ushr(vb[4], __ T8H, vb[4], 4); 6087 6088 6089 6090 // sum hi 4 bits and lo 8 bits of each 1st 12-bit int in pair and 6091 // hi 8 bits plus lo 4 bits of each 2nd 12-bit int in pair 6092 6093 // n.b. ordering ensures: i) inputs are consumed before they are 6094 // overwritten ii) order of 16-bit results across succsessive 6095 // pairs of vectors in va and then lower half of vb reflects order 6096 // of corresponding 12-bit inputs 6097 __ addv(va[0], __ T8H, va[0], va[2]); 6098 __ addv(va[2], __ T8H, va[1], va[3]); 6099 __ addv(va[1], __ T8H, va[4], va[6]); 6100 __ addv(va[3], __ T8H, va[5], va[7]); 6101 __ addv(vb[0], __ T8H, vb[0], vb[2]); 6102 __ addv(vb[1], __ T8H, vb[4], vb[6]); 6103 6104 // store 48 results interleaved as shorts 6105 vs_st2_post(vs_front(va), __ T8H, parsed); 6106 vs_st2_post(vs_front(vs_front(vb)), __ T8H, parsed); 6107 6108 __ BIND(L_end); 6109 6110 __ leave(); // required for proper stackwalking of RuntimeStub frame 6111 __ mov(r0, zr); // return 0 6112 __ ret(lr); 6113 6114 return start; 6115 } 6116 6117 // Kyber Barrett reduce function. 6118 // Implements 6119 // static int implKyberBarrettReduce(short[] coeffs) {} 6120 // 6121 // coeffs (short[256]) = c_rarg0 6122 address generate_kyberBarrettReduce() { 6123 6124 __ align(CodeEntryAlignment); 6125 StubGenStubId stub_id = StubGenStubId::kyberBarrettReduce_id; 6126 StubCodeMark mark(this, stub_id); 6127 address start = __ pc(); 6128 __ enter(); 6129 6130 const Register coeffs = c_rarg0; 6131 6132 const Register kyberConsts = r10; 6133 const Register result = r11; 6134 6135 // As above we process 256 sets of values in total i.e. 32 x 6136 // 8H quadwords. So, we can load, add and store the data in 3 6137 // groups of 11, 11 and 10 at a time i.e. we need to map sets 6138 // of 10 or 11 registers. A further constraint is that the 6139 // mapping needs to skip callee saves. So, we allocate the 6140 // register sequences using two 8 sequences, two 2 sequences 6141 // and two single registers. 
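    // For illustration only, the scalar reduction the loop below applies to
    // each 16-bit coefficient x, where q and barrettMultiplier name the two
    // constants loaded from the constants table:
    //
    //   int16_t t = (int16_t)(((int32_t)x * barrettMultiplier) >> 26);
    //   x = (int16_t)(x - t * q);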
6142 VSeq<8> vs1_1(0); 6143 VSeq<2> vs1_2(16); 6144 FloatRegister vs1_3 = v28; 6145 VSeq<8> vs2_1(18); 6146 VSeq<2> vs2_2(26); 6147 FloatRegister vs2_3 = v29; 6148 6149 // we also need a pair of corresponding constant sequences 6150 6151 VSeq<8> vc1_1(30, 0); 6152 VSeq<2> vc1_2(30, 0); 6153 FloatRegister vc1_3 = v30; // for kyber_q 6154 6155 VSeq<8> vc2_1(31, 0); 6156 VSeq<2> vc2_2(31, 0); 6157 FloatRegister vc2_3 = v31; // for kyberBarrettMultiplier 6158 6159 __ add(result, coeffs, 0); 6160 __ lea(kyberConsts, 6161 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts)); 6162 6163 // load q and the multiplier for the Barrett reduction 6164 __ add(kyberConsts, kyberConsts, 16); 6165 __ ldpq(vc1_3, vc2_3, kyberConsts); 6166 6167 for (int i = 0; i < 3; i++) { 6168 // load 80 or 88 coefficients 6169 vs_ldpq_post(vs1_1, coeffs); 6170 vs_ldpq_post(vs1_2, coeffs); 6171 if (i < 2) { 6172 __ ldr(vs1_3, __ Q, __ post(coeffs, 16)); 6173 } 6174 6175 // vs2 <- (2 * vs1 * kyberBarrettMultiplier) >> 16 6176 vs_sqdmulh(vs2_1, __ T8H, vs1_1, vc2_1); 6177 vs_sqdmulh(vs2_2, __ T8H, vs1_2, vc2_2); 6178 if (i < 2) { 6179 __ sqdmulh(vs2_3, __ T8H, vs1_3, vc2_3); 6180 } 6181 6182 // vs2 <- (vs1 * kyberBarrettMultiplier) >> 26 6183 vs_sshr(vs2_1, __ T8H, vs2_1, 11); 6184 vs_sshr(vs2_2, __ T8H, vs2_2, 11); 6185 if (i < 2) { 6186 __ sshr(vs2_3, __ T8H, vs2_3, 11); 6187 } 6188 6189 // vs1 <- vs1 - vs2 * kyber_q 6190 vs_mlsv(vs1_1, __ T8H, vs2_1, vc1_1); 6191 vs_mlsv(vs1_2, __ T8H, vs2_2, vc1_2); 6192 if (i < 2) { 6193 __ mlsv(vs1_3, __ T8H, vs2_3, vc1_3); 6194 } 6195 6196 vs_stpq_post(vs1_1, result); 6197 vs_stpq_post(vs1_2, result); 6198 if (i < 2) { 6199 __ str(vs1_3, __ Q, __ post(result, 16)); 6200 } 6201 } 6202 6203 __ leave(); // required for proper stackwalking of RuntimeStub frame 6204 __ mov(r0, zr); // return 0 6205 __ ret(lr); 6206 6207 return start; 6208 } 6209 6210 6211 // Dilithium-specific montmul helper routines that generate parallel 6212 // code for, respectively, a single 4x4s vector sequence montmul or 6213 // two such multiplies in a row. 6214 6215 // Perform 16 32-bit Montgomery multiplications in parallel 6216 void dilithium_montmul16(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc, 6217 const VSeq<4>& vtmp, const VSeq<2>& vq) { 6218 // Use the helper routine to schedule a 4x4S Montgomery multiply. 6219 // It will assert that the register use is valid 6220 vs_montmul4(va, vb, vc, __ T4S, vtmp, vq); 6221 } 6222 6223 // Perform 2x16 32-bit Montgomery multiplications in parallel 6224 void dilithium_montmul32(const VSeq<8>& va, const VSeq<8>& vb, const VSeq<8>& vc, 6225 const VSeq<4>& vtmp, const VSeq<2>& vq) { 6226 // Schedule two successive 4x4S multiplies via the montmul helper 6227 // on the front and back halves of va, vb and vc. The helper will 6228 // assert that the register use has no overlap conflicts on each 6229 // individual call but we also need to ensure that the necessary 6230 // disjoint/equality constraints are met across both calls. 6231 6232 // vb, vc, vtmp and vq must be disjoint. 
va must either be 6233 // disjoint from all other registers or equal vc 6234 6235 assert(vs_disjoint(vb, vc), "vb and vc overlap"); 6236 assert(vs_disjoint(vb, vq), "vb and vq overlap"); 6237 assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap"); 6238 6239 assert(vs_disjoint(vc, vq), "vc and vq overlap"); 6240 assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap"); 6241 6242 assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap"); 6243 6244 assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal"); 6245 assert(vs_disjoint(va, vb), "va and vb overlap"); 6246 assert(vs_disjoint(va, vq), "va and vq overlap"); 6247 assert(vs_disjoint(va, vtmp), "va and vtmp overlap"); 6248 6249 // We multiply the front and back halves of each sequence 4 at a 6250 // time because 6251 // 6252 // 1) we are currently only able to get 4-way instruction 6253 // parallelism at best 6254 // 6255 // 2) we need registers for the constants in vq and temporary 6256 // scratch registers to hold intermediate results so vtmp can only 6257 // be a VSeq<4> which means we only have 4 scratch slots. 6258 6259 vs_montmul4(vs_front(va), vs_front(vb), vs_front(vc), __ T4S, vtmp, vq); 6260 vs_montmul4(vs_back(va), vs_back(vb), vs_back(vc), __ T4S, vtmp, vq); 6261 } 6262 6263 // Perform combined montmul then add/sub on 4x4S vectors. 6264 void dilithium_montmul16_sub_add( 6265 const VSeq<4>& va0, const VSeq<4>& va1, const VSeq<4>& vc, 6266 const VSeq<4>& vtmp, const VSeq<2>& vq) { 6267 // compute a = montmul(a1, c) 6268 dilithium_montmul16(vc, va1, vc, vtmp, vq); 6269 // ouptut a1 = a0 - a 6270 vs_subv(va1, __ T4S, va0, vc); 6271 // and a0 = a0 + a 6272 vs_addv(va0, __ T4S, va0, vc); 6273 } 6274 6275 // Perform combined add/sub then montul on 4x4S vectors. 6276 void dilithium_sub_add_montmul16( 6277 const VSeq<4>& va0, const VSeq<4>& va1, const VSeq<4>& vb, 6278 const VSeq<4>& vtmp1, const VSeq<4>& vtmp2, const VSeq<2>& vq) { 6279 // compute c = a0 - a1 6280 vs_subv(vtmp1, __ T4S, va0, va1); 6281 // output a0 = a0 + a1 6282 vs_addv(va0, __ T4S, va0, va1); 6283 // output a1 = b montmul c 6284 dilithium_montmul16(va1, vtmp1, vb, vtmp2, vq); 6285 } 6286 6287 // At these levels, the indices that correspond to the 'j's (and 'j+l's) 6288 // in the Java implementation come in sequences of at least 8, so we 6289 // can use ldpq to collect the corresponding data into pairs of vector 6290 // registers. 6291 // We collect the coefficients corresponding to the 'j+l' indexes into 6292 // the vector registers v0-v7, the zetas into the vector registers v16-v23 6293 // then we do the (Montgomery) multiplications by the zetas in parallel 6294 // into v16-v23, load the coeffs corresponding to the 'j' indexes into 6295 // v0-v7, then do the additions into v24-v31 and the subtractions into 6296 // v0-v7 and finally save the results back to the coeffs array. 6297 void dilithiumNttLevel0_4(const Register dilithiumConsts, 6298 const Register coeffs, const Register zetas) { 6299 int c1 = 0; 6300 int c2 = 512; 6301 int startIncr; 6302 // don't use callee save registers v8 - v15 6303 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs 6304 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3 6305 VSeq<2> vq(30); // n.b. 
constants overlap vs3 6306 int offsets[4] = { 0, 32, 64, 96 }; 6307 6308 for (int level = 0; level < 5; level++) { 6309 int c1Start = c1; 6310 int c2Start = c2; 6311 if (level == 3) { 6312 offsets[1] = 32; 6313 offsets[2] = 128; 6314 offsets[3] = 160; 6315 } else if (level == 4) { 6316 offsets[1] = 64; 6317 offsets[2] = 128; 6318 offsets[3] = 192; 6319 } 6320 6321 // For levels 1 - 4 we simply load 2 x 4 adjacent values at a 6322 // time at 4 different offsets and multiply them in order by the 6323 // next set of input values. So we employ indexed load and store 6324 // pair instructions with arrangement 4S. 6325 for (int i = 0; i < 4; i++) { 6326 // reload q and qinv 6327 vs_ldpq(vq, dilithiumConsts); // qInv, q 6328 // load 8x4S coefficients via second start pos == c2 6329 vs_ldpq_indexed(vs1, coeffs, c2Start, offsets); 6330 // load next 8x4S inputs == b 6331 vs_ldpq_post(vs2, zetas); 6332 // compute a == c2 * b mod MONT_Q 6333 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq); 6334 // load 8x4s coefficients via first start pos == c1 6335 vs_ldpq_indexed(vs1, coeffs, c1Start, offsets); 6336 // compute a1 = c1 + a 6337 vs_addv(vs3, __ T4S, vs1, vs2); 6338 // compute a2 = c1 - a 6339 vs_subv(vs1, __ T4S, vs1, vs2); 6340 // output a1 and a2 6341 vs_stpq_indexed(vs3, coeffs, c1Start, offsets); 6342 vs_stpq_indexed(vs1, coeffs, c2Start, offsets); 6343 6344 int k = 4 * level + i; 6345 6346 if (k > 7) { 6347 startIncr = 256; 6348 } else if (k == 5) { 6349 startIncr = 384; 6350 } else { 6351 startIncr = 128; 6352 } 6353 6354 c1Start += startIncr; 6355 c2Start += startIncr; 6356 } 6357 6358 c2 /= 2; 6359 } 6360 } 6361 6362 // Dilithium NTT function except for the final "normalization" to |coeff| < Q. 6363 // Implements the method 6364 // static int implDilithiumAlmostNtt(int[] coeffs, int zetas[]) {} 6365 // of the Java class sun.security.provider 6366 // 6367 // coeffs (int[256]) = c_rarg0 6368 // zetas (int[256]) = c_rarg1 6369 address generate_dilithiumAlmostNtt() { 6370 6371 __ align(CodeEntryAlignment); 6372 StubGenStubId stub_id = StubGenStubId::dilithiumAlmostNtt_id; 6373 StubCodeMark mark(this, stub_id); 6374 address start = __ pc(); 6375 __ enter(); 6376 6377 const Register coeffs = c_rarg0; 6378 const Register zetas = c_rarg1; 6379 6380 const Register tmpAddr = r9; 6381 const Register dilithiumConsts = r10; 6382 const Register result = r11; 6383 // don't use callee save registers v8 - v15 6384 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs 6385 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3 6386 VSeq<2> vq(30); // n.b. constants overlap vs3 6387 int offsets[4] = { 0, 32, 64, 96}; 6388 int offsets1[8] = { 16, 48, 80, 112, 144, 176, 208, 240 }; 6389 int offsets2[8] = { 0, 32, 64, 96, 128, 160, 192, 224 }; 6390 __ add(result, coeffs, 0); 6391 __ lea(dilithiumConsts, 6392 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts)); 6393 6394 // Each level represents one iteration of the outer for loop of the Java version. 6395 6396 // level 0-4 6397 dilithiumNttLevel0_4(dilithiumConsts, coeffs, zetas); 6398 6399 // level 5 6400 6401 // At level 5 the coefficients we need to combine with the zetas 6402 // are grouped in memory in blocks of size 4. So, for both sets of 6403 // coefficients we load 4 adjacent values at 8 different offsets 6404 // using an indexed ldr with register variant Q and multiply them 6405 // in sequence order by the next set of inputs. Likewise we store 6406 // the resuls using an indexed str with register variant Q. 
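    // For illustration only, the scalar butterfly each 'j'/'j+l' pair goes
    // through in the Java implementation referenced above (the names here
    // follow that description, not the register assignments):
    //
    //   int a = montMul(zetas[k], coeffs[j + l]);
    //   coeffs[j + l] = coeffs[j] - a;
    //   coeffs[j]     = coeffs[j] + a;
    //
    // Each iteration of the loop below processes 32 such pairs.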
6407 for (int i = 0; i < 1024; i += 256) { 6408 // reload constants q, qinv each iteration as they get clobbered later 6409 vs_ldpq(vq, dilithiumConsts); // qInv, q 6410 // load 32 (8x4S) coefficients via first offsets = c1 6411 vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets1); 6412 // load next 32 (8x4S) inputs = b 6413 vs_ldpq_post(vs2, zetas); 6414 // a = b montmul c1 6415 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq); 6416 // load 32 (8x4S) coefficients via second offsets = c2 6417 vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets2); 6418 // add/sub with result of multiply 6419 vs_addv(vs3, __ T4S, vs1, vs2); // a1 = c2 + a 6420 vs_subv(vs1, __ T4S, vs1, vs2); // a2 = c2 - a 6421 // write back new coefficients using same offsets 6422 vs_str_indexed(vs3, __ Q, coeffs, i, offsets2); 6423 vs_str_indexed(vs1, __ Q, coeffs, i, offsets1); 6424 } 6425 6426 // level 6 6427 // At level 6 the coefficients we need to combine with the zetas 6428 // are grouped in memory in pairs, the first two being montmul 6429 // inputs and the second add/sub inputs. We can still implement 6430 // the montmul+sub+add using 4-way parallelism but only if we 6431 // combine the coefficients with the zetas 16 at a time. We load 8 6432 // adjacent values at 4 different offsets using an ld2 load with 6433 // arrangement 2D. That interleaves the lower and upper halves of 6434 // each pair of quadwords into successive vector registers. We 6435 // then need to montmul the 4 even elements of the coefficients 6436 // register sequence by the zetas in order and then add/sub the 4 6437 // odd elements of the coefficients register sequence. We use an 6438 // equivalent st2 operation to store the results back into memory 6439 // de-interleaved. 6440 for (int i = 0; i < 1024; i += 128) { 6441 // reload constants q, qinv each iteration as they get clobbered later 6442 vs_ldpq(vq, dilithiumConsts); // qInv, q 6443 // load interleaved 16 (4x2D) coefficients via offsets 6444 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets); 6445 // load next 16 (4x4S) inputs 6446 vs_ldpq_post(vs_front(vs2), zetas); 6447 // mont multiply odd elements of vs1 by vs2 and add/sub into odds/evens 6448 dilithium_montmul16_sub_add(vs_even(vs1), vs_odd(vs1), 6449 vs_front(vs2), vtmp, vq); 6450 // store interleaved 16 (4x2D) coefficients via offsets 6451 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets); 6452 } 6453 6454 // level 7 6455 // At level 7 the coefficients we need to combine with the zetas 6456 // occur singly with montmul inputs alternating with add/sub 6457 // inputs. Once again we can use 4-way parallelism to combine 16 6458 // zetas at a time. However, we have to load 8 adjacent values at 6459 // 4 different offsets using an ld2 load with arrangement 4S. That 6460 // interleaves the odd words of each pair into one 6461 // coefficients vector register and the even words of the pair 6462 // into the next register. We then need to montmul the 4 even 6463 // elements of the coefficients register sequence by the zetas in 6464 // order and then add/sub the 4 odd elements of the coefficients 6465 // register sequence. We use an equivalent st2 operation to store 6466 // the results back into memory de-interleaved.
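// As a scalar sketch, each (coefficient, zeta) pair processed below goes
// through one forward-NTT butterfly (illustrative pseudo-Java, not the
// exact library code):
//
//   int t = montmul(zetas[k], coeffs[j + l]);
//   coeffs[j + l] = coeffs[j] - t;
//   coeffs[j]     = coeffs[j] + t;
//
// The ld2/st2 arrangement lets dilithium_montmul16_sub_add run 16 such
// butterflies at once, with the 'j' coefficients gathered into the even
// registers of vs1 and the 'j+l' coefficients into the odd ones.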
6467 6468 for (int i = 0; i < 1024; i += 128) { 6469 // reload constants q, qinv each iteration as they get clobbered later 6470 vs_ldpq(vq, dilithiumConsts); // qInv, q 6471 // load interleaved 16 (4x4S) coefficients via offsets 6472 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets); 6473 // load next 16 (4x4S) inputs 6474 vs_ldpq_post(vs_front(vs2), zetas); 6475 // mont multiply odd elements of vs1 by vs2 and add/sub into odds/evens 6476 dilithium_montmul16_sub_add(vs_even(vs1), vs_odd(vs1), 6477 vs_front(vs2), vtmp, vq); 6478 // store interleaved 16 (4x4S) coefficients via offsets 6479 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets); 6480 } 6481 __ leave(); // required for proper stackwalking of RuntimeStub frame 6482 __ mov(r0, zr); // return 0 6483 __ ret(lr); 6484 6485 return start; 6486 } 6487 6488 // At these levels, the indices that correspond to the 'j's (and 'j+l's) 6489 // in the Java implementation come in sequences of at least 8, so we 6490 // can use ldpq to collect the corresponding data into pairs of vector 6491 // registers 6492 // We collect the coefficients that correspond to the 'j's into vs1 6493 // the coefficiets that correspond to the 'j+l's into vs2 then 6494 // do the additions into vs3 and the subtractions into vs1 then 6495 // save the result of the additions, load the zetas into vs2 6496 // do the (Montgomery) multiplications by zeta in parallel into vs2 6497 // finally save the results back to the coeffs array 6498 void dilithiumInverseNttLevel3_7(const Register dilithiumConsts, 6499 const Register coeffs, const Register zetas) { 6500 int c1 = 0; 6501 int c2 = 32; 6502 int startIncr; 6503 int offsets[4]; 6504 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs 6505 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3 6506 VSeq<2> vq(30); // n.b. constants overlap vs3 6507 6508 offsets[0] = 0; 6509 6510 for (int level = 3; level < 8; level++) { 6511 int c1Start = c1; 6512 int c2Start = c2; 6513 if (level == 3) { 6514 offsets[1] = 64; 6515 offsets[2] = 128; 6516 offsets[3] = 192; 6517 } else if (level == 4) { 6518 offsets[1] = 32; 6519 offsets[2] = 128; 6520 offsets[3] = 160; 6521 } else { 6522 offsets[1] = 32; 6523 offsets[2] = 64; 6524 offsets[3] = 96; 6525 } 6526 6527 // For levels 3 - 7 we simply load 2 x 4 adjacent values at a 6528 // time at 4 different offsets and multiply them in order by the 6529 // next set of input values. So we employ indexed load and store 6530 // pair instructions with arrangement 4S. 6531 for (int i = 0; i < 4; i++) { 6532 // load v1 32 (8x4S) coefficients relative to first start index 6533 vs_ldpq_indexed(vs1, coeffs, c1Start, offsets); 6534 // load v2 32 (8x4S) coefficients relative to second start index 6535 vs_ldpq_indexed(vs2, coeffs, c2Start, offsets); 6536 // a0 = v1 + v2 -- n.b. 
clobbers vq 6537 vs_addv(vs3, __ T4S, vs1, vs2); 6538 // a1 = v1 - v2 6539 vs_subv(vs1, __ T4S, vs1, vs2); 6540 // save a0 relative to first start index 6541 vs_stpq_indexed(vs3, coeffs, c1Start, offsets); 6542 // load constants q, qinv each iteration as they get clobbered above 6543 vs_ldpq(vq, dilithiumConsts); // qInv, q 6544 // load b next 32 (8x4S) inputs 6545 vs_ldpq_post(vs2, zetas); 6546 // a = a1 montmul b 6547 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq); 6548 // save a relative to second start index 6549 vs_stpq_indexed(vs2, coeffs, c2Start, offsets); 6550 6551 int k = 4 * level + i; 6552 6553 if (k < 24) { 6554 startIncr = 256; 6555 } else if (k == 25) { 6556 startIncr = 384; 6557 } else { 6558 startIncr = 128; 6559 } 6560 6561 c1Start += startIncr; 6562 c2Start += startIncr; 6563 } 6564 6565 c2 *= 2; 6566 } 6567 } 6568 6569 // Dilithium Inverse NTT function except the final mod Q division by 2^256. 6570 // Implements the method 6571 // static int implDilithiumAlmostInverseNtt(int[] coeffs, int[] zetas) {} of 6572 // the sun.security.provider.ML_DSA class. 6573 // 6574 // coeffs (int[256]) = c_rarg0 6575 // zetas (int[256]) = c_rarg1 6576 address generate_dilithiumAlmostInverseNtt() { 6577 6578 __ align(CodeEntryAlignment); 6579 StubGenStubId stub_id = StubGenStubId::dilithiumAlmostInverseNtt_id; 6580 StubCodeMark mark(this, stub_id); 6581 address start = __ pc(); 6582 __ enter(); 6583 6584 const Register coeffs = c_rarg0; 6585 const Register zetas = c_rarg1; 6586 6587 const Register tmpAddr = r9; 6588 const Register dilithiumConsts = r10; 6589 const Register result = r11; 6590 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs 6591 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3 6592 VSeq<2> vq(30); // n.b. constants overlap vs3 6593 int offsets[4] = { 0, 32, 64, 96 }; 6594 int offsets1[8] = { 0, 32, 64, 96, 128, 160, 192, 224 }; 6595 int offsets2[8] = { 16, 48, 80, 112, 144, 176, 208, 240 }; 6596 6597 __ add(result, coeffs, 0); 6598 __ lea(dilithiumConsts, 6599 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts)); 6600 6601 // Each level represents one iteration of the outer for loop of the Java version 6602 6603 // level 0 6604 // At level 0 we need to interleave adjacent quartets of 6605 // coefficients before we multiply and add/sub by the next 16 6606 // zetas just as we did for level 7 in the multiply code. So we 6607 // load and store the values using an ld2/st2 with arrangement 4S. 6608 for (int i = 0; i < 1024; i += 128) { 6609 // load constants q, qinv 6610 // n.b. this can be moved out of the loop as they do not get 6611 // clobbered by first two loops 6612 vs_ldpq(vq, dilithiumConsts); // qInv, q 6613 // a0/a1 load interleaved 32 (8x4S) coefficients 6614 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets); 6615 // b load next 32 (8x4S) inputs 6616 vs_ldpq_post(vs_front(vs2), zetas); 6617 // compute in parallel (a0, a1) = (a0 + a1, (a0 - a1) montmul b) 6618 // n.b. second half of vs2 provides temporary register storage 6619 dilithium_sub_add_montmul16(vs_even(vs1), vs_odd(vs1), 6620 vs_front(vs2), vs_back(vs2), vtmp, vq); 6621 // a0/a1 store interleaved 32 (8x4S) coefficients 6622 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets); 6623 } 6624 6625 // level 1 6626 // At level 1 we need to interleave pairs of adjacent pairs of 6627 // coefficients before we multiply by the next 16 zetas just as we 6628 // did for level 6 in the multiply code.
So we load and store the 6629 // values using an ld2/st2 with arrangement 2D. 6630 for (int i = 0; i < 1024; i += 128) { 6631 // a0/a1 load interleaved 32 (8x2D) coefficients 6632 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets); 6633 // b load next 16 (4x4S) inputs 6634 vs_ldpq_post(vs_front(vs2), zetas); 6635 // compute in parallel (a0, a1) = (a0 + a1, (a0 - a1) montmul b) 6636 // n.b. second half of vs2 provides temporary register storage 6637 dilithium_sub_add_montmul16(vs_even(vs1), vs_odd(vs1), 6638 vs_front(vs2), vs_back(vs2), vtmp, vq); 6639 // a0/a1 store interleaved 32 (8x2D) coefficients 6640 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets); 6641 } 6642 6643 // level 2 6644 // At level 2 coefficients come in blocks of 4. So, we load 4 6645 // adjacent coefficients at 8 distinct offsets for both the first 6646 // and second coefficient sequences, using an ldr with register 6647 // variant Q then combine them with next set of 32 zetas. Likewise 6648 // we store the results using an str with register variant Q. 6649 for (int i = 0; i < 1024; i += 256) { 6650 // c0 load 32 (8x4S) coefficients via first offsets 6651 vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets1); 6652 // c1 load 32 (8x4S) coefficients via second offsets 6653 vs_ldr_indexed(vs2, __ Q, coeffs, i, offsets2); 6654 // a0 = c0 + c1 n.b. clobbers vq which overlaps vs3 6655 vs_addv(vs3, __ T4S, vs1, vs2); 6656 // c = c0 - c1 6657 vs_subv(vs1, __ T4S, vs1, vs2); 6658 // store a0 32 (8x4S) coefficients via first offsets 6659 vs_str_indexed(vs3, __ Q, coeffs, i, offsets1); 6660 // b load 32 (8x4S) next inputs 6661 vs_ldpq_post(vs2, zetas); 6662 // reload constants q, qinv -- they were clobbered earlier 6663 vs_ldpq(vq, dilithiumConsts); // qInv, q 6664 // compute a1 = b montmul c 6665 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq); 6666 // store a1 32 (8x4S) coefficients via second offsets 6667 vs_str_indexed(vs2, __ Q, coeffs, i, offsets2); 6668 } 6669 6670 // level 3-7 6671 dilithiumInverseNttLevel3_7(dilithiumConsts, coeffs, zetas); 6672 6673 __ leave(); // required for proper stackwalking of RuntimeStub frame 6674 __ mov(r0, zr); // return 0 6675 __ ret(lr); 6676 6677 return start; 6678 } 6679 6680 // Dilithium multiply polynomials in the NTT domain. 6681 // Straightforward implementation of the method 6682 // static int implDilithiumNttMult( 6683 // int[] result, int[] ntta, int[] nttb) {} of 6684 // the sun.security.provider.ML_DSA class. 6685 // 6686 // result (int[256]) = c_rarg0 6687 // poly1 (int[256]) = c_rarg1 6688 // poly2 (int[256]) = c_rarg2 6689 address generate_dilithiumNttMult() { 6690 6691 __ align(CodeEntryAlignment); 6692 StubGenStubId stub_id = StubGenStubId::dilithiumNttMult_id; 6693 StubCodeMark mark(this, stub_id); 6694 address start = __ pc(); 6695 __ enter(); 6696 6697 Label L_loop; 6698 6699 const Register result = c_rarg0; 6700 const Register poly1 = c_rarg1; 6701 const Register poly2 = c_rarg2; 6702 6703 const Register dilithiumConsts = r10; 6704 const Register len = r11; 6705 6706 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs 6707 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3 6708 VSeq<2> vq(30); // n.b.
constants overlap vs3 6709 VSeq<8> vrsquare(29, 0); // for montmul by constant RSQUARE 6710 6711 __ lea(dilithiumConsts, 6712 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts)); 6713 6714 // load constants q, qinv 6715 vs_ldpq(vq, dilithiumConsts); // qInv, q 6716 // load constant rSquare into v29 6717 __ ldr(v29, __ Q, Address(dilithiumConsts, 48)); // rSquare 6718 6719 __ mov(len, zr); 6720 __ add(len, len, 1024); 6721 6722 __ BIND(L_loop); 6723 6724 // b load 32 (8x4S) next inputs from poly1 6725 vs_ldpq_post(vs1, poly1); 6726 // c load 32 (8x4S) next inputs from poly2 6727 vs_ldpq_post(vs2, poly2); 6728 // compute a = b montmul c 6729 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq); 6730 // compute a = rsquare montmul a 6731 dilithium_montmul32(vs2, vrsquare, vs2, vtmp, vq); 6732 // save a 32 (8x4S) results 6733 vs_stpq_post(vs2, result); 6734 6735 __ sub(len, len, 128); 6736 __ cmp(len, (u1)128); 6737 __ br(Assembler::GE, L_loop); 6738 6739 __ leave(); // required for proper stackwalking of RuntimeStub frame 6740 __ mov(r0, zr); // return 0 6741 __ ret(lr); 6742 6743 return start; 6744 } 6745 6746 // Dilithium Montgomery multiply an array by a constant. 6747 // A straightforward implementation of the method 6748 // static int implDilithiumMontMulByConstant(int[] coeffs, int constant) {} 6749 // of the sun.security.provider.ML_DSA class 6750 // 6751 // coeffs (int[256]) = c_rarg0 6752 // constant (int) = c_rarg1 6753 address generate_dilithiumMontMulByConstant() { 6754 6755 __ align(CodeEntryAlignment); 6756 StubGenStubId stub_id = StubGenStubId::dilithiumMontMulByConstant_id; 6757 StubCodeMark mark(this, stub_id); 6758 address start = __ pc(); 6759 __ enter(); 6760 6761 Label L_loop; 6762 6763 const Register coeffs = c_rarg0; 6764 const Register constant = c_rarg1; 6765 6766 const Register dilithiumConsts = r10; 6767 const Register result = r11; 6768 const Register len = r12; 6769 6770 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs 6771 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3 6772 VSeq<2> vq(30); // n.b. constants overlap vs3 6773 VSeq<8> vconst(29, 0); // for montmul by constant 6774 6775 // results track inputs 6776 __ add(result, coeffs, 0); 6777 __ lea(dilithiumConsts, 6778 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts)); 6779 6780 // load constants q, qinv -- they do not get clobbered by first two loops 6781 vs_ldpq(vq, dilithiumConsts); // qInv, q 6782 // copy caller supplied constant across vconst 6783 __ dup(vconst[0], __ T4S, constant); 6784 __ mov(len, zr); 6785 __ add(len, len, 1024); 6786 6787 __ BIND(L_loop); 6788 6789 // load next 32 inputs 6790 vs_ldpq_post(vs2, coeffs); 6791 // mont mul by constant 6792 dilithium_montmul32(vs2, vconst, vs2, vtmp, vq); 6793 // write next 32 results 6794 vs_stpq_post(vs2, result); 6795 6796 __ sub(len, len, 128); 6797 __ cmp(len, (u1)128); 6798 __ br(Assembler::GE, L_loop); 6799 6800 __ leave(); // required for proper stackwalking of RuntimeStub frame 6801 __ mov(r0, zr); // return 0 6802 __ ret(lr); 6803 6804 return start; 6805 } 6806 6807 // Dilithium decompose poly.
6808 // Implements the method 6809 // static int implDilithiumDecomposePoly(int[] coeffs, int constant) {} 6810 // of the sun.security.provider.ML_DSA class 6811 // 6812 // input (int[256]) = c_rarg0 6813 // lowPart (int[256]) = c_rarg1 6814 // highPart (int[256]) = c_rarg2 6815 // twoGamma2 (int) = c_rarg3 6816 // multiplier (int) = c_rarg4 6817 address generate_dilithiumDecomposePoly() { 6818 6819 __ align(CodeEntryAlignment); 6820 StubGenStubId stub_id = StubGenStubId::dilithiumDecomposePoly_id; 6821 StubCodeMark mark(this, stub_id); 6822 address start = __ pc(); 6823 Label L_loop; 6824 6825 const Register input = c_rarg0; 6826 const Register lowPart = c_rarg1; 6827 const Register highPart = c_rarg2; 6828 const Register twoGamma2 = c_rarg3; 6829 const Register multiplier = c_rarg4; 6830 6831 const Register len = r9; 6832 const Register dilithiumConsts = r10; 6833 const Register tmp = r11; 6834 6835 // 6 independent sets of 4x4s values 6836 VSeq<4> vs1(0), vs2(4), vs3(8); 6837 VSeq<4> vs4(12), vs5(16), vtmp(20); 6838 6839 // 7 constants for cross-multiplying 6840 VSeq<4> one(25, 0); 6841 VSeq<4> qminus1(26, 0); 6842 VSeq<4> g2(27, 0); 6843 VSeq<4> twog2(28, 0); 6844 VSeq<4> mult(29, 0); 6845 VSeq<4> q(30, 0); 6846 VSeq<4> qadd(31, 0); 6847 6848 __ enter(); 6849 6850 __ lea(dilithiumConsts, 6851 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts)); 6852 6853 // save callee-saved registers 6854 __ stpd(v8, v9, __ pre(sp, -64)); 6855 __ stpd(v10, v11, Address(sp, 16)); 6856 __ stpd(v12, v13, Address(sp, 32)); 6857 __ stpd(v14, v15, Address(sp, 48)); 6858 6859 // populate constant registers 6860 __ mov(tmp, zr); 6861 __ add(tmp, tmp, 1); 6862 __ dup(one[0], __ T4S, tmp); // 1 6863 __ ldr(q[0], __ Q, Address(dilithiumConsts, 16)); // q 6864 __ ldr(qadd[0], __ Q, Address(dilithiumConsts, 64)); // addend for mod q reduce 6865 __ dup(twog2[0], __ T4S, twoGamma2); // 2 * gamma2 6866 __ dup(mult[0], __ T4S, multiplier); // multiplier for mod 2 * gamma reduce 6867 __ subv(qminus1[0], __ T4S, v30, v25); // q - 1 6868 __ sshr(g2[0], __ T4S, v28, 1); // gamma2 6869 6870 __ mov(len, zr); 6871 __ add(len, len, 1024); 6872 6873 __ BIND(L_loop); 6874 6875 // load next 4x4S inputs interleaved: rplus --> vs1 6876 __ ld4(vs1[0], vs1[1], vs1[2], vs1[3], __ T4S, __ post(input, 64)); 6877 6878 // rplus = rplus - ((rplus + qadd) >> 23) * q 6879 vs_addv(vtmp, __ T4S, vs1, qadd); 6880 vs_sshr(vtmp, __ T4S, vtmp, 23); 6881 vs_mulv(vtmp, __ T4S, vtmp, q); 6882 vs_subv(vs1, __ T4S, vs1, vtmp); 6883 6884 // rplus = rplus + ((rplus >> 31) & dilithium_q); 6885 vs_sshr(vtmp, __ T4S, vs1, 31); 6886 vs_andr(vtmp, vtmp, q); 6887 vs_addv(vs1, __ T4S, vs1, vtmp); 6888 6889 // quotient --> vs2 6890 // int quotient = (rplus * multiplier) >> 22; 6891 vs_mulv(vtmp, __ T4S, vs1, mult); 6892 vs_sshr(vs2, __ T4S, vtmp, 22); 6893 6894 // r0 --> vs3 6895 // int r0 = rplus - quotient * twoGamma2; 6896 vs_mulv(vtmp, __ T4S, vs2, twog2); 6897 vs_subv(vs3, __ T4S, vs1, vtmp); 6898 6899 // mask --> vs4 6900 // int mask = (twoGamma2 - r0) >> 22; 6901 vs_subv(vtmp, __ T4S, twog2, vs3); 6902 vs_sshr(vs4, __ T4S, vtmp, 22); 6903 6904 // r0 -= (mask & twoGamma2); 6905 vs_andr(vtmp, vs4, twog2); 6906 vs_subv(vs3, __ T4S, vs3, vtmp); 6907 6908 // quotient += (mask & 1); 6909 vs_andr(vtmp, vs4, one); 6910 vs_addv(vs2, __ T4S, vs2, vtmp); 6911 6912 // mask = (twoGamma2 / 2 - r0) >> 31; 6913 vs_subv(vtmp, __ T4S, g2, vs3); 6914 vs_sshr(vs4, __ T4S, vtmp, 31); 6915 6916 // r0 -= (mask & twoGamma2); 6917 vs_andr(vtmp, vs4, twog2); 6918 
vs_subv(vs3, __ T4S, vs3, vtmp); 6919 6920 // quotient += (mask & 1); 6921 vs_andr(vtmp, vs4, one); 6922 vs_addv(vs2, __ T4S, vs2, vtmp); 6923 6924 // r1 --> vs5 6925 // int r1 = rplus - r0 - (dilithium_q - 1); 6926 vs_subv(vtmp, __ T4S, vs1, vs3); 6927 vs_subv(vs5, __ T4S, vtmp, qminus1); 6928 6929 // r1 --> vs1 (overwriting rplus) 6930 // r1 = (r1 | (-r1)) >> 31; // 0 if rplus - r0 == (dilithium_q - 1), -1 otherwise 6931 vs_negr(vtmp, __ T4S, vs5); 6932 vs_orr(vtmp, vs5, vtmp); 6933 vs_sshr(vs1, __ T4S, vtmp, 31); 6934 6935 // r0 += ~r1; 6936 vs_notr(vtmp, vs1); 6937 vs_addv(vs3, __ T4S, vs3, vtmp); 6938 6939 // r1 = r1 & quotient; 6940 vs_andr(vs1, vs2, vs1); 6941 6942 // store results interleaved 6943 // lowPart[m] = r0; 6944 // highPart[m] = r1; 6945 __ st4(vs3[0], vs3[1], vs3[2], vs3[3], __ T4S, __ post(lowPart, 64)); 6946 __ st4(vs1[0], vs1[1], vs1[2], vs1[3], __ T4S, __ post(highPart, 64)); 6947 6948 __ sub(len, len, 64); 6949 __ cmp(len, (u1)64); 6950 __ br(Assembler::GE, L_loop); 6951 6952 // restore callee-saved vector registers 6953 __ ldpd(v14, v15, Address(sp, 48)); 6954 __ ldpd(v12, v13, Address(sp, 32)); 6955 __ ldpd(v10, v11, Address(sp, 16)); 6956 __ ldpd(v8, v9, __ post(sp, 64)); 6957 6958 __ leave(); // required for proper stackwalking of RuntimeStub frame 6959 __ mov(r0, zr); // return 0 6960 __ ret(lr); 6961 6962 return start; 6963 } 6964 6965 /** 6966 * Arguments: 6967 * 6968 * Inputs: 6969 * c_rarg0 - int crc 6970 * c_rarg1 - byte* buf 6971 * c_rarg2 - int length 6972 * 6973 * Output: 6974 * r0 - int crc result 6975 */ 6976 address generate_updateBytesCRC32() { 6977 assert(UseCRC32Intrinsics, "what are we doing here?"); 6978 6979 __ align(CodeEntryAlignment); 6980 StubGenStubId stub_id = StubGenStubId::updateBytesCRC32_id; 6981 StubCodeMark mark(this, stub_id); 6982 6983 address start = __ pc(); 6984 6985 const Register crc = c_rarg0; // crc 6986 const Register buf = c_rarg1; // source java byte array address 6987 const Register len = c_rarg2; // length 6988 const Register table0 = c_rarg3; // crc_table address 6989 const Register table1 = c_rarg4; 6990 const Register table2 = c_rarg5; 6991 const Register table3 = c_rarg6; 6992 const Register tmp3 = c_rarg7; 6993 6994 BLOCK_COMMENT("Entry:"); 6995 __ enter(); // required for proper stackwalking of RuntimeStub frame 6996 6997 __ kernel_crc32(crc, buf, len, 6998 table0, table1, table2, table3, rscratch1, rscratch2, tmp3); 6999 7000 __ leave(); // required for proper stackwalking of RuntimeStub frame 7001 __ ret(lr); 7002 7003 return start; 7004 } 7005 7006 /** 7007 * Arguments: 7008 * 7009 * Inputs: 7010 * c_rarg0 - int crc 7011 * c_rarg1 - byte* buf 7012 * c_rarg2 - int length 7013 * c_rarg3 - int* table 7014 * 7015 * Output: 7016 * r0 - int crc result 7017 */ 7018 address generate_updateBytesCRC32C() { 7019 assert(UseCRC32CIntrinsics, "what are we doing here?"); 7020 7021 __ align(CodeEntryAlignment); 7022 StubGenStubId stub_id = StubGenStubId::updateBytesCRC32C_id; 7023 StubCodeMark mark(this, stub_id); 7024 7025 address start = __ pc(); 7026 7027 const Register crc = c_rarg0; // crc 7028 const Register buf = c_rarg1; // source java byte array address 7029 const Register len = c_rarg2; // length 7030 const Register table0 = c_rarg3; // crc_table address 7031 const Register table1 = c_rarg4; 7032 const Register table2 = c_rarg5; 7033 const Register table3 = c_rarg6; 7034 const Register tmp3 = c_rarg7; 7035 7036 BLOCK_COMMENT("Entry:"); 7037 __ enter(); // required for proper stackwalking of RuntimeStub frame 7038
7039 __ kernel_crc32c(crc, buf, len, 7040 table0, table1, table2, table3, rscratch1, rscratch2, tmp3); 7041 7042 __ leave(); // required for proper stackwalking of RuntimeStub frame 7043 __ ret(lr); 7044 7045 return start; 7046 } 7047 7048 /*** 7049 * Arguments: 7050 * 7051 * Inputs: 7052 * c_rarg0 - int adler 7053 * c_rarg1 - byte* buff 7054 * c_rarg2 - int len 7055 * 7056 * Output: 7057 * c_rarg0 - int adler result 7058 */ 7059 address generate_updateBytesAdler32() { 7060 __ align(CodeEntryAlignment); 7061 StubGenStubId stub_id = StubGenStubId::updateBytesAdler32_id; 7062 StubCodeMark mark(this, stub_id); 7063 address start = __ pc(); 7064 7065 Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1; 7066 7067 // Aliases 7068 Register adler = c_rarg0; 7069 Register s1 = c_rarg0; 7070 Register s2 = c_rarg3; 7071 Register buff = c_rarg1; 7072 Register len = c_rarg2; 7073 Register nmax = r4; 7074 Register base = r5; 7075 Register count = r6; 7076 Register temp0 = rscratch1; 7077 Register temp1 = rscratch2; 7078 FloatRegister vbytes = v0; 7079 FloatRegister vs1acc = v1; 7080 FloatRegister vs2acc = v2; 7081 FloatRegister vtable = v3; 7082 7083 // Max number of bytes we can process before having to take the mod 7084 // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 7085 uint64_t BASE = 0xfff1; 7086 uint64_t NMAX = 0x15B0; 7087 7088 __ mov(base, BASE); 7089 __ mov(nmax, NMAX); 7090 7091 // Load accumulation coefficients for the upper 16 bits 7092 __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table)); 7093 __ ld1(vtable, __ T16B, Address(temp0)); 7094 7095 // s1 is initialized to the lower 16 bits of adler 7096 // s2 is initialized to the upper 16 bits of adler 7097 __ ubfx(s2, adler, 16, 16); // s2 = ((adler >> 16) & 0xffff) 7098 __ uxth(s1, adler); // s1 = (adler & 0xffff) 7099 7100 // The pipelined loop needs at least 16 elements for 1 iteration 7101 // It does check this, but it is more effective to skip to the cleanup loop 7102 __ cmp(len, (u1)16); 7103 __ br(Assembler::HS, L_nmax); 7104 __ cbz(len, L_combine); 7105 7106 __ bind(L_simple_by1_loop); 7107 __ ldrb(temp0, Address(__ post(buff, 1))); 7108 __ add(s1, s1, temp0); 7109 __ add(s2, s2, s1); 7110 __ subs(len, len, 1); 7111 __ br(Assembler::HI, L_simple_by1_loop); 7112 7113 // s1 = s1 % BASE 7114 __ subs(temp0, s1, base); 7115 __ csel(s1, temp0, s1, Assembler::HS); 7116 7117 // s2 = s2 % BASE 7118 __ lsr(temp0, s2, 16); 7119 __ lsl(temp1, temp0, 4); 7120 __ sub(temp1, temp1, temp0); 7121 __ add(s2, temp1, s2, ext::uxth); 7122 7123 __ subs(temp0, s2, base); 7124 __ csel(s2, temp0, s2, Assembler::HS); 7125 7126 __ b(L_combine); 7127 7128 __ bind(L_nmax); 7129 __ subs(len, len, nmax); 7130 __ sub(count, nmax, 16); 7131 __ br(Assembler::LO, L_by16); 7132 7133 __ bind(L_nmax_loop); 7134 7135 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1, 7136 vbytes, vs1acc, vs2acc, vtable); 7137 7138 __ subs(count, count, 16); 7139 __ br(Assembler::HS, L_nmax_loop); 7140 7141 // s1 = s1 % BASE 7142 __ lsr(temp0, s1, 16); 7143 __ lsl(temp1, temp0, 4); 7144 __ sub(temp1, temp1, temp0); 7145 __ add(temp1, temp1, s1, ext::uxth); 7146 7147 __ lsr(temp0, temp1, 16); 7148 __ lsl(s1, temp0, 4); 7149 __ sub(s1, s1, temp0); 7150 __ add(s1, s1, temp1, ext:: uxth); 7151 7152 __ subs(temp0, s1, base); 7153 __ csel(s1, temp0, s1, Assembler::HS); 7154 7155 // s2 = s2 % BASE 7156 __ lsr(temp0, s2, 16); 7157 __ lsl(temp1, temp0, 4); 7158 __ 
sub(temp1, temp1, temp0); 7159 __ add(temp1, temp1, s2, ext::uxth); 7160 7161 __ lsr(temp0, temp1, 16); 7162 __ lsl(s2, temp0, 4); 7163 __ sub(s2, s2, temp0); 7164 __ add(s2, s2, temp1, ext:: uxth); 7165 7166 __ subs(temp0, s2, base); 7167 __ csel(s2, temp0, s2, Assembler::HS); 7168 7169 __ subs(len, len, nmax); 7170 __ sub(count, nmax, 16); 7171 __ br(Assembler::HS, L_nmax_loop); 7172 7173 __ bind(L_by16); 7174 __ adds(len, len, count); 7175 __ br(Assembler::LO, L_by1); 7176 7177 __ bind(L_by16_loop); 7178 7179 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1, 7180 vbytes, vs1acc, vs2acc, vtable); 7181 7182 __ subs(len, len, 16); 7183 __ br(Assembler::HS, L_by16_loop); 7184 7185 __ bind(L_by1); 7186 __ adds(len, len, 15); 7187 __ br(Assembler::LO, L_do_mod); 7188 7189 __ bind(L_by1_loop); 7190 __ ldrb(temp0, Address(__ post(buff, 1))); 7191 __ add(s1, temp0, s1); 7192 __ add(s2, s2, s1); 7193 __ subs(len, len, 1); 7194 __ br(Assembler::HS, L_by1_loop); 7195 7196 __ bind(L_do_mod); 7197 // s1 = s1 % BASE 7198 __ lsr(temp0, s1, 16); 7199 __ lsl(temp1, temp0, 4); 7200 __ sub(temp1, temp1, temp0); 7201 __ add(temp1, temp1, s1, ext::uxth); 7202 7203 __ lsr(temp0, temp1, 16); 7204 __ lsl(s1, temp0, 4); 7205 __ sub(s1, s1, temp0); 7206 __ add(s1, s1, temp1, ext:: uxth); 7207 7208 __ subs(temp0, s1, base); 7209 __ csel(s1, temp0, s1, Assembler::HS); 7210 7211 // s2 = s2 % BASE 7212 __ lsr(temp0, s2, 16); 7213 __ lsl(temp1, temp0, 4); 7214 __ sub(temp1, temp1, temp0); 7215 __ add(temp1, temp1, s2, ext::uxth); 7216 7217 __ lsr(temp0, temp1, 16); 7218 __ lsl(s2, temp0, 4); 7219 __ sub(s2, s2, temp0); 7220 __ add(s2, s2, temp1, ext:: uxth); 7221 7222 __ subs(temp0, s2, base); 7223 __ csel(s2, temp0, s2, Assembler::HS); 7224 7225 // Combine lower bits and higher bits 7226 __ bind(L_combine); 7227 __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16) 7228 7229 __ ret(lr); 7230 7231 return start; 7232 } 7233 7234 void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff, 7235 Register temp0, Register temp1, FloatRegister vbytes, 7236 FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) { 7237 // Below is a vectorized implementation of updating s1 and s2 for 16 bytes. 7238 // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration. 7239 // In non-vectorized code, we update s1 and s2 as: 7240 // s1 <- s1 + b1 7241 // s2 <- s2 + s1 7242 // s1 <- s1 + b2 7243 // s2 <- s2 + b1 7244 // ... 7245 // s1 <- s1 + b16 7246 // s2 <- s2 + s1 7247 // Putting above assignments together, we have: 7248 // s1_new = s1 + b1 + b2 + ... + b16 7249 // s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16) 7250 // = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1) 7251 // = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1) 7252 __ ld1(vbytes, __ T16B, Address(__ post(buff, 16))); 7253 7254 // s2 = s2 + s1 * 16 7255 __ add(s2, s2, s1, Assembler::LSL, 4); 7256 7257 // vs1acc = b1 + b2 + b3 + ... + b16 7258 // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... 
+ (b16 * 1) 7259 __ umullv(vs2acc, __ T8B, vtable, vbytes); 7260 __ umlalv(vs2acc, __ T16B, vtable, vbytes); 7261 __ uaddlv(vs1acc, __ T16B, vbytes); 7262 __ uaddlv(vs2acc, __ T8H, vs2acc); 7263 7264 // s1 = s1 + vs1acc, s2 = s2 + vs2acc 7265 __ fmovd(temp0, vs1acc); 7266 __ fmovd(temp1, vs2acc); 7267 __ add(s1, s1, temp0); 7268 __ add(s2, s2, temp1); 7269 } 7270 7271 /** 7272 * Arguments: 7273 * 7274 * Input: 7275 * c_rarg0 - x address 7276 * c_rarg1 - x length 7277 * c_rarg2 - y address 7278 * c_rarg3 - y length 7279 * c_rarg4 - z address 7280 */ 7281 address generate_multiplyToLen() { 7282 __ align(CodeEntryAlignment); 7283 StubGenStubId stub_id = StubGenStubId::multiplyToLen_id; 7284 StubCodeMark mark(this, stub_id); 7285 7286 address start = __ pc(); 7287 7288 if (AOTCodeCache::load_stub(this, vmIntrinsics::_multiplyToLen, "multiplyToLen", start)) { 7289 return start; 7290 } 7291 const Register x = r0; 7292 const Register xlen = r1; 7293 const Register y = r2; 7294 const Register ylen = r3; 7295 const Register z = r4; 7296 7297 const Register tmp0 = r5; 7298 const Register tmp1 = r10; 7299 const Register tmp2 = r11; 7300 const Register tmp3 = r12; 7301 const Register tmp4 = r13; 7302 const Register tmp5 = r14; 7303 const Register tmp6 = r15; 7304 const Register tmp7 = r16; 7305 7306 BLOCK_COMMENT("Entry:"); 7307 __ enter(); // required for proper stackwalking of RuntimeStub frame 7308 __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 7309 __ leave(); // required for proper stackwalking of RuntimeStub frame 7310 __ ret(lr); 7311 7312 AOTCodeCache::store_stub(this, vmIntrinsics::_multiplyToLen, "multiplyToLen", start); 7313 return start; 7314 } 7315 7316 address generate_squareToLen() { 7317 // squareToLen algorithm for sizes 1..127 described in java code works 7318 // faster than multiply_to_len on some CPUs and slower on others, but 7319 // multiply_to_len shows a bit better overall results 7320 __ align(CodeEntryAlignment); 7321 StubGenStubId stub_id = StubGenStubId::squareToLen_id; 7322 StubCodeMark mark(this, stub_id); 7323 address start = __ pc(); 7324 7325 if (AOTCodeCache::load_stub(this, vmIntrinsics::_squareToLen, "squareToLen", start)) { 7326 return start; 7327 } 7328 const Register x = r0; 7329 const Register xlen = r1; 7330 const Register z = r2; 7331 const Register y = r4; // == x 7332 const Register ylen = r5; // == xlen 7333 7334 const Register tmp0 = r3; 7335 const Register tmp1 = r10; 7336 const Register tmp2 = r11; 7337 const Register tmp3 = r12; 7338 const Register tmp4 = r13; 7339 const Register tmp5 = r14; 7340 const Register tmp6 = r15; 7341 const Register tmp7 = r16; 7342 7343 RegSet spilled_regs = RegSet::of(y, ylen); 7344 BLOCK_COMMENT("Entry:"); 7345 __ enter(); 7346 __ push(spilled_regs, sp); 7347 __ mov(y, x); 7348 __ mov(ylen, xlen); 7349 __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 7350 __ pop(spilled_regs, sp); 7351 __ leave(); 7352 __ ret(lr); 7353 7354 AOTCodeCache::store_stub(this, vmIntrinsics::_squareToLen, "squareToLen", start); 7355 return start; 7356 } 7357 7358 address generate_mulAdd() { 7359 __ align(CodeEntryAlignment); 7360 StubGenStubId stub_id = StubGenStubId::mulAdd_id; 7361 StubCodeMark mark(this, stub_id); 7362 7363 address start = __ pc(); 7364 7365 if (AOTCodeCache::load_stub(this, vmIntrinsics::_mulAdd, "mulAdd", start)) { 7366 return start; 7367 } 7368 const Register out = r0; 7369 const Register in = r1; 7370 const Register offset = r2; 7371 const 
Register len = r3; 7372 const Register k = r4; 7373 7374 BLOCK_COMMENT("Entry:"); 7375 __ enter(); 7376 __ mul_add(out, in, offset, len, k); 7377 __ leave(); 7378 __ ret(lr); 7379 7380 AOTCodeCache::store_stub(this, vmIntrinsics::_mulAdd, "mulAdd", start); 7381 return start; 7382 } 7383 7384 // Arguments: 7385 // 7386 // Input: 7387 // c_rarg0 - newArr address 7388 // c_rarg1 - oldArr address 7389 // c_rarg2 - newIdx 7390 // c_rarg3 - shiftCount 7391 // c_rarg4 - numIter 7392 // 7393 address generate_bigIntegerRightShift() { 7394 __ align(CodeEntryAlignment); 7395 StubGenStubId stub_id = StubGenStubId::bigIntegerRightShiftWorker_id; 7396 StubCodeMark mark(this, stub_id); 7397 address start = __ pc(); 7398 7399 Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit; 7400 7401 Register newArr = c_rarg0; 7402 Register oldArr = c_rarg1; 7403 Register newIdx = c_rarg2; 7404 Register shiftCount = c_rarg3; 7405 Register numIter = c_rarg4; 7406 Register idx = numIter; 7407 7408 Register newArrCur = rscratch1; 7409 Register shiftRevCount = rscratch2; 7410 Register oldArrCur = r13; 7411 Register oldArrNext = r14; 7412 7413 FloatRegister oldElem0 = v0; 7414 FloatRegister oldElem1 = v1; 7415 FloatRegister newElem = v2; 7416 FloatRegister shiftVCount = v3; 7417 FloatRegister shiftVRevCount = v4; 7418 7419 __ cbz(idx, Exit); 7420 7421 __ add(newArr, newArr, newIdx, Assembler::LSL, 2); 7422 7423 // left shift count 7424 __ movw(shiftRevCount, 32); 7425 __ subw(shiftRevCount, shiftRevCount, shiftCount); 7426 7427 // numIter too small to allow a 4-words SIMD loop, rolling back 7428 __ cmp(numIter, (u1)4); 7429 __ br(Assembler::LT, ShiftThree); 7430 7431 __ dup(shiftVCount, __ T4S, shiftCount); 7432 __ dup(shiftVRevCount, __ T4S, shiftRevCount); 7433 __ negr(shiftVCount, __ T4S, shiftVCount); 7434 7435 __ BIND(ShiftSIMDLoop); 7436 7437 // Calculate the load addresses 7438 __ sub(idx, idx, 4); 7439 __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2); 7440 __ add(newArrCur, newArr, idx, Assembler::LSL, 2); 7441 __ add(oldArrCur, oldArrNext, 4); 7442 7443 // Load 4 words and process 7444 __ ld1(oldElem0, __ T4S, Address(oldArrCur)); 7445 __ ld1(oldElem1, __ T4S, Address(oldArrNext)); 7446 __ ushl(oldElem0, __ T4S, oldElem0, shiftVCount); 7447 __ ushl(oldElem1, __ T4S, oldElem1, shiftVRevCount); 7448 __ orr(newElem, __ T16B, oldElem0, oldElem1); 7449 __ st1(newElem, __ T4S, Address(newArrCur)); 7450 7451 __ cmp(idx, (u1)4); 7452 __ br(Assembler::LT, ShiftTwoLoop); 7453 __ b(ShiftSIMDLoop); 7454 7455 __ BIND(ShiftTwoLoop); 7456 __ cbz(idx, Exit); 7457 __ cmp(idx, (u1)1); 7458 __ br(Assembler::EQ, ShiftOne); 7459 7460 // Calculate the load addresses 7461 __ sub(idx, idx, 2); 7462 __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2); 7463 __ add(newArrCur, newArr, idx, Assembler::LSL, 2); 7464 __ add(oldArrCur, oldArrNext, 4); 7465 7466 // Load 2 words and process 7467 __ ld1(oldElem0, __ T2S, Address(oldArrCur)); 7468 __ ld1(oldElem1, __ T2S, Address(oldArrNext)); 7469 __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount); 7470 __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount); 7471 __ orr(newElem, __ T8B, oldElem0, oldElem1); 7472 __ st1(newElem, __ T2S, Address(newArrCur)); 7473 __ b(ShiftTwoLoop); 7474 7475 __ BIND(ShiftThree); 7476 __ tbz(idx, 1, ShiftOne); 7477 __ tbz(idx, 0, ShiftTwo); 7478 __ ldrw(r10, Address(oldArr, 12)); 7479 __ ldrw(r11, Address(oldArr, 8)); 7480 __ lsrvw(r10, r10, shiftCount); 7481 __ lslvw(r11, r11, shiftRevCount); 7482 __ orrw(r12, r10, r11); 7483 __ strw(r12, 
Address(newArr, 8)); 7484 7485 __ BIND(ShiftTwo); 7486 __ ldrw(r10, Address(oldArr, 8)); 7487 __ ldrw(r11, Address(oldArr, 4)); 7488 __ lsrvw(r10, r10, shiftCount); 7489 __ lslvw(r11, r11, shiftRevCount); 7490 __ orrw(r12, r10, r11); 7491 __ strw(r12, Address(newArr, 4)); 7492 7493 __ BIND(ShiftOne); 7494 __ ldrw(r10, Address(oldArr, 4)); 7495 __ ldrw(r11, Address(oldArr)); 7496 __ lsrvw(r10, r10, shiftCount); 7497 __ lslvw(r11, r11, shiftRevCount); 7498 __ orrw(r12, r10, r11); 7499 __ strw(r12, Address(newArr)); 7500 7501 __ BIND(Exit); 7502 __ ret(lr); 7503 7504 return start; 7505 } 7506 7507 // Arguments: 7508 // 7509 // Input: 7510 // c_rarg0 - newArr address 7511 // c_rarg1 - oldArr address 7512 // c_rarg2 - newIdx 7513 // c_rarg3 - shiftCount 7514 // c_rarg4 - numIter 7515 // 7516 address generate_bigIntegerLeftShift() { 7517 __ align(CodeEntryAlignment); 7518 StubGenStubId stub_id = StubGenStubId::bigIntegerLeftShiftWorker_id; 7519 StubCodeMark mark(this, stub_id); 7520 address start = __ pc(); 7521 7522 Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit; 7523 7524 Register newArr = c_rarg0; 7525 Register oldArr = c_rarg1; 7526 Register newIdx = c_rarg2; 7527 Register shiftCount = c_rarg3; 7528 Register numIter = c_rarg4; 7529 7530 Register shiftRevCount = rscratch1; 7531 Register oldArrNext = rscratch2; 7532 7533 FloatRegister oldElem0 = v0; 7534 FloatRegister oldElem1 = v1; 7535 FloatRegister newElem = v2; 7536 FloatRegister shiftVCount = v3; 7537 FloatRegister shiftVRevCount = v4; 7538 7539 __ cbz(numIter, Exit); 7540 7541 __ add(oldArrNext, oldArr, 4); 7542 __ add(newArr, newArr, newIdx, Assembler::LSL, 2); 7543 7544 // right shift count 7545 __ movw(shiftRevCount, 32); 7546 __ subw(shiftRevCount, shiftRevCount, shiftCount); 7547 7548 // numIter too small to allow a 4-words SIMD loop, rolling back 7549 __ cmp(numIter, (u1)4); 7550 __ br(Assembler::LT, ShiftThree); 7551 7552 __ dup(shiftVCount, __ T4S, shiftCount); 7553 __ dup(shiftVRevCount, __ T4S, shiftRevCount); 7554 __ negr(shiftVRevCount, __ T4S, shiftVRevCount); 7555 7556 __ BIND(ShiftSIMDLoop); 7557 7558 // load 4 words and process 7559 __ ld1(oldElem0, __ T4S, __ post(oldArr, 16)); 7560 __ ld1(oldElem1, __ T4S, __ post(oldArrNext, 16)); 7561 __ ushl(oldElem0, __ T4S, oldElem0, shiftVCount); 7562 __ ushl(oldElem1, __ T4S, oldElem1, shiftVRevCount); 7563 __ orr(newElem, __ T16B, oldElem0, oldElem1); 7564 __ st1(newElem, __ T4S, __ post(newArr, 16)); 7565 __ sub(numIter, numIter, 4); 7566 7567 __ cmp(numIter, (u1)4); 7568 __ br(Assembler::LT, ShiftTwoLoop); 7569 __ b(ShiftSIMDLoop); 7570 7571 __ BIND(ShiftTwoLoop); 7572 __ cbz(numIter, Exit); 7573 __ cmp(numIter, (u1)1); 7574 __ br(Assembler::EQ, ShiftOne); 7575 7576 // load 2 words and process 7577 __ ld1(oldElem0, __ T2S, __ post(oldArr, 8)); 7578 __ ld1(oldElem1, __ T2S, __ post(oldArrNext, 8)); 7579 __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount); 7580 __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount); 7581 __ orr(newElem, __ T8B, oldElem0, oldElem1); 7582 __ st1(newElem, __ T2S, __ post(newArr, 8)); 7583 __ sub(numIter, numIter, 2); 7584 __ b(ShiftTwoLoop); 7585 7586 __ BIND(ShiftThree); 7587 __ ldrw(r10, __ post(oldArr, 4)); 7588 __ ldrw(r11, __ post(oldArrNext, 4)); 7589 __ lslvw(r10, r10, shiftCount); 7590 __ lsrvw(r11, r11, shiftRevCount); 7591 __ orrw(r12, r10, r11); 7592 __ strw(r12, __ post(newArr, 4)); 7593 __ tbz(numIter, 1, Exit); 7594 __ tbz(numIter, 0, ShiftOne); 7595 7596 __ BIND(ShiftTwo); 7597 __ ldrw(r10, __ post(oldArr, 4)); 
7598 __ ldrw(r11, __ post(oldArrNext, 4)); 7599 __ lslvw(r10, r10, shiftCount); 7600 __ lsrvw(r11, r11, shiftRevCount); 7601 __ orrw(r12, r10, r11); 7602 __ strw(r12, __ post(newArr, 4)); 7603 7604 __ BIND(ShiftOne); 7605 __ ldrw(r10, Address(oldArr)); 7606 __ ldrw(r11, Address(oldArrNext)); 7607 __ lslvw(r10, r10, shiftCount); 7608 __ lsrvw(r11, r11, shiftRevCount); 7609 __ orrw(r12, r10, r11); 7610 __ strw(r12, Address(newArr)); 7611 7612 __ BIND(Exit); 7613 __ ret(lr); 7614 7615 return start; 7616 } 7617 7618 address generate_count_positives(address &count_positives_long) { 7619 const u1 large_loop_size = 64; 7620 const uint64_t UPPER_BIT_MASK=0x8080808080808080; 7621 int dcache_line = VM_Version::dcache_line_size(); 7622 7623 Register ary1 = r1, len = r2, result = r0; 7624 7625 __ align(CodeEntryAlignment); 7626 7627 StubGenStubId stub_id = StubGenStubId::count_positives_id; 7628 StubCodeMark mark(this, stub_id); 7629 7630 address entry = __ pc(); 7631 7632 __ enter(); 7633 // precondition: a copy of len is already in result 7634 // __ mov(result, len); 7635 7636 Label RET_ADJUST, RET_ADJUST_16, RET_ADJUST_LONG, RET_NO_POP, RET_LEN, ALIGNED, LOOP16, CHECK_16, 7637 LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL; 7638 7639 __ cmp(len, (u1)15); 7640 __ br(Assembler::GT, LEN_OVER_15); 7641 // The only case when execution falls into this code is when pointer is near 7642 // the end of memory page and we have to avoid reading next page 7643 __ add(ary1, ary1, len); 7644 __ subs(len, len, 8); 7645 __ br(Assembler::GT, LEN_OVER_8); 7646 __ ldr(rscratch2, Address(ary1, -8)); 7647 __ sub(rscratch1, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes. 7648 __ lsrv(rscratch2, rscratch2, rscratch1); 7649 __ tst(rscratch2, UPPER_BIT_MASK); 7650 __ csel(result, zr, result, Assembler::NE); 7651 __ leave(); 7652 __ ret(lr); 7653 __ bind(LEN_OVER_8); 7654 __ ldp(rscratch1, rscratch2, Address(ary1, -16)); 7655 __ sub(len, len, 8); // no data dep., then sub can be executed while loading 7656 __ tst(rscratch2, UPPER_BIT_MASK); 7657 __ br(Assembler::NE, RET_NO_POP); 7658 __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes 7659 __ lsrv(rscratch1, rscratch1, rscratch2); 7660 __ tst(rscratch1, UPPER_BIT_MASK); 7661 __ bind(RET_NO_POP); 7662 __ csel(result, zr, result, Assembler::NE); 7663 __ leave(); 7664 __ ret(lr); 7665 7666 Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10; 7667 const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6; 7668 7669 count_positives_long = __ pc(); // 2nd entry point 7670 7671 __ enter(); 7672 7673 __ bind(LEN_OVER_15); 7674 __ push(spilled_regs, sp); 7675 __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment 7676 __ cbz(rscratch2, ALIGNED); 7677 __ ldp(tmp6, tmp1, Address(ary1)); 7678 __ mov(tmp5, 16); 7679 __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address 7680 __ add(ary1, ary1, rscratch1); 7681 __ orr(tmp6, tmp6, tmp1); 7682 __ tst(tmp6, UPPER_BIT_MASK); 7683 __ br(Assembler::NE, RET_ADJUST); 7684 __ sub(len, len, rscratch1); 7685 7686 __ bind(ALIGNED); 7687 __ cmp(len, large_loop_size); 7688 __ br(Assembler::LT, CHECK_16); 7689 // Perform 16-byte load as early return in pre-loop to handle situation 7690 // when initially aligned large array has negative values at starting bytes, 7691 // so LARGE_LOOP would do 4 reads instead of 1 (in worst case), which is 7692 // slower. Cases with negative bytes further ahead won't be affected that 7693 // much. 
In fact, it'll be faster due to early loads, less instructions and 7694 // less branches in LARGE_LOOP. 7695 __ ldp(tmp6, tmp1, Address(__ post(ary1, 16))); 7696 __ sub(len, len, 16); 7697 __ orr(tmp6, tmp6, tmp1); 7698 __ tst(tmp6, UPPER_BIT_MASK); 7699 __ br(Assembler::NE, RET_ADJUST_16); 7700 __ cmp(len, large_loop_size); 7701 __ br(Assembler::LT, CHECK_16); 7702 7703 if (SoftwarePrefetchHintDistance >= 0 7704 && SoftwarePrefetchHintDistance >= dcache_line) { 7705 // initial prefetch 7706 __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line)); 7707 } 7708 __ bind(LARGE_LOOP); 7709 if (SoftwarePrefetchHintDistance >= 0) { 7710 __ prfm(Address(ary1, SoftwarePrefetchHintDistance)); 7711 } 7712 // Issue load instructions first, since it can save few CPU/MEM cycles, also 7713 // instead of 4 triples of "orr(...), addr(...);cbnz(...);" (for each ldp) 7714 // better generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...) which saves 3 7715 // instructions per cycle and have less branches, but this approach disables 7716 // early return, thus, all 64 bytes are loaded and checked every time. 7717 __ ldp(tmp2, tmp3, Address(ary1)); 7718 __ ldp(tmp4, tmp5, Address(ary1, 16)); 7719 __ ldp(rscratch1, rscratch2, Address(ary1, 32)); 7720 __ ldp(tmp6, tmp1, Address(ary1, 48)); 7721 __ add(ary1, ary1, large_loop_size); 7722 __ sub(len, len, large_loop_size); 7723 __ orr(tmp2, tmp2, tmp3); 7724 __ orr(tmp4, tmp4, tmp5); 7725 __ orr(rscratch1, rscratch1, rscratch2); 7726 __ orr(tmp6, tmp6, tmp1); 7727 __ orr(tmp2, tmp2, tmp4); 7728 __ orr(rscratch1, rscratch1, tmp6); 7729 __ orr(tmp2, tmp2, rscratch1); 7730 __ tst(tmp2, UPPER_BIT_MASK); 7731 __ br(Assembler::NE, RET_ADJUST_LONG); 7732 __ cmp(len, large_loop_size); 7733 __ br(Assembler::GE, LARGE_LOOP); 7734 7735 __ bind(CHECK_16); // small 16-byte load pre-loop 7736 __ cmp(len, (u1)16); 7737 __ br(Assembler::LT, POST_LOOP16); 7738 7739 __ bind(LOOP16); // small 16-byte load loop 7740 __ ldp(tmp2, tmp3, Address(__ post(ary1, 16))); 7741 __ sub(len, len, 16); 7742 __ orr(tmp2, tmp2, tmp3); 7743 __ tst(tmp2, UPPER_BIT_MASK); 7744 __ br(Assembler::NE, RET_ADJUST_16); 7745 __ cmp(len, (u1)16); 7746 __ br(Assembler::GE, LOOP16); // 16-byte load loop end 7747 7748 __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally 7749 __ cmp(len, (u1)8); 7750 __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL); 7751 __ ldr(tmp3, Address(__ post(ary1, 8))); 7752 __ tst(tmp3, UPPER_BIT_MASK); 7753 __ br(Assembler::NE, RET_ADJUST); 7754 __ sub(len, len, 8); 7755 7756 __ bind(POST_LOOP16_LOAD_TAIL); 7757 __ cbz(len, RET_LEN); // Can't shift left by 64 when len==0 7758 __ ldr(tmp1, Address(ary1)); 7759 __ mov(tmp2, 64); 7760 __ sub(tmp4, tmp2, len, __ LSL, 3); 7761 __ lslv(tmp1, tmp1, tmp4); 7762 __ tst(tmp1, UPPER_BIT_MASK); 7763 __ br(Assembler::NE, RET_ADJUST); 7764 // Fallthrough 7765 7766 __ bind(RET_LEN); 7767 __ pop(spilled_regs, sp); 7768 __ leave(); 7769 __ ret(lr); 7770 7771 // difference result - len is the count of guaranteed to be 7772 // positive bytes 7773 7774 __ bind(RET_ADJUST_LONG); 7775 __ add(len, len, (u1)(large_loop_size - 16)); 7776 __ bind(RET_ADJUST_16); 7777 __ add(len, len, 16); 7778 __ bind(RET_ADJUST); 7779 __ pop(spilled_regs, sp); 7780 __ leave(); 7781 __ sub(result, result, len); 7782 __ ret(lr); 7783 7784 return entry; 7785 } 7786 7787 void generate_large_array_equals_loop_nonsimd(int loopThreshold, 7788 bool usePrefetch, Label &NOT_EQUAL) { 7789 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 7790 tmp2 = 
rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11, 7791 tmp7 = r12, tmp8 = r13; 7792 Label LOOP; 7793 7794 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 7795 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 7796 __ bind(LOOP); 7797 if (usePrefetch) { 7798 __ prfm(Address(a1, SoftwarePrefetchHintDistance)); 7799 __ prfm(Address(a2, SoftwarePrefetchHintDistance)); 7800 } 7801 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize))); 7802 __ eor(tmp1, tmp1, tmp2); 7803 __ eor(tmp3, tmp3, tmp4); 7804 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize))); 7805 __ orr(tmp1, tmp1, tmp3); 7806 __ cbnz(tmp1, NOT_EQUAL); 7807 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 7808 __ eor(tmp5, tmp5, tmp6); 7809 __ eor(tmp7, tmp7, tmp8); 7810 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 7811 __ orr(tmp5, tmp5, tmp7); 7812 __ cbnz(tmp5, NOT_EQUAL); 7813 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize))); 7814 __ eor(tmp1, tmp1, tmp2); 7815 __ eor(tmp3, tmp3, tmp4); 7816 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize))); 7817 __ orr(tmp1, tmp1, tmp3); 7818 __ cbnz(tmp1, NOT_EQUAL); 7819 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 7820 __ eor(tmp5, tmp5, tmp6); 7821 __ sub(cnt1, cnt1, 8 * wordSize); 7822 __ eor(tmp7, tmp7, tmp8); 7823 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 7824 // tmp6 is not used. MacroAssembler::subs is used here (rather than 7825 // cmp) because subs allows an unlimited range of immediate operand. 7826 __ subs(tmp6, cnt1, loopThreshold); 7827 __ orr(tmp5, tmp5, tmp7); 7828 __ cbnz(tmp5, NOT_EQUAL); 7829 __ br(__ GE, LOOP); 7830 // post-loop 7831 __ eor(tmp1, tmp1, tmp2); 7832 __ eor(tmp3, tmp3, tmp4); 7833 __ orr(tmp1, tmp1, tmp3); 7834 __ sub(cnt1, cnt1, 2 * wordSize); 7835 __ cbnz(tmp1, NOT_EQUAL); 7836 } 7837 7838 void generate_large_array_equals_loop_simd(int loopThreshold, 7839 bool usePrefetch, Label &NOT_EQUAL) { 7840 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 7841 tmp2 = rscratch2; 7842 Label LOOP; 7843 7844 __ bind(LOOP); 7845 if (usePrefetch) { 7846 __ prfm(Address(a1, SoftwarePrefetchHintDistance)); 7847 __ prfm(Address(a2, SoftwarePrefetchHintDistance)); 7848 } 7849 __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize))); 7850 __ sub(cnt1, cnt1, 8 * wordSize); 7851 __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize))); 7852 __ subs(tmp1, cnt1, loopThreshold); 7853 __ eor(v0, __ T16B, v0, v4); 7854 __ eor(v1, __ T16B, v1, v5); 7855 __ eor(v2, __ T16B, v2, v6); 7856 __ eor(v3, __ T16B, v3, v7); 7857 __ orr(v0, __ T16B, v0, v1); 7858 __ orr(v1, __ T16B, v2, v3); 7859 __ orr(v0, __ T16B, v0, v1); 7860 __ umov(tmp1, v0, __ D, 0); 7861 __ umov(tmp2, v0, __ D, 1); 7862 __ orr(tmp1, tmp1, tmp2); 7863 __ cbnz(tmp1, NOT_EQUAL); 7864 __ br(__ GE, LOOP); 7865 } 7866 7867 // a1 = r1 - array1 address 7868 // a2 = r2 - array2 address 7869 // result = r0 - return value. Already contains "false" 7870 // cnt1 = r10 - amount of elements left to check, reduced by wordSize 7871 // r3-r5 are reserved temporary registers 7872 // Clobbers: v0-v7 when UseSIMDForArrayEquals, rscratch1, rscratch2 7873 address generate_large_array_equals() { 7874 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 7875 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11, 7876 tmp7 = r12, tmp8 = r13; 7877 Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP, 7878 SMALL_LOOP, POST_LOOP; 7879 const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 
0 : 16; 7880 // calculate if at least 32 prefetched bytes are used 7881 int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32; 7882 int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE); 7883 RegSet spilled_regs = RegSet::range(tmp6, tmp8); 7884 assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4, 7885 tmp5, tmp6, tmp7, tmp8); 7886 7887 __ align(CodeEntryAlignment); 7888 7889 StubGenStubId stub_id = StubGenStubId::large_array_equals_id; 7890 StubCodeMark mark(this, stub_id); 7891 7892 address entry = __ pc(); 7893 __ enter(); 7894 __ sub(cnt1, cnt1, wordSize); // first 8 bytes were loaded outside of stub 7895 // also advance pointers to use post-increment instead of pre-increment 7896 __ add(a1, a1, wordSize); 7897 __ add(a2, a2, wordSize); 7898 if (AvoidUnalignedAccesses) { 7899 // both implementations (SIMD/nonSIMD) are using relatively large load 7900 // instructions (ld1/ldp), which has huge penalty (up to x2 exec time) 7901 // on some CPUs in case of address is not at least 16-byte aligned. 7902 // Arrays are 8-byte aligned currently, so, we can make additional 8-byte 7903 // load if needed at least for 1st address and make if 16-byte aligned. 7904 Label ALIGNED16; 7905 __ tbz(a1, 3, ALIGNED16); 7906 __ ldr(tmp1, Address(__ post(a1, wordSize))); 7907 __ ldr(tmp2, Address(__ post(a2, wordSize))); 7908 __ sub(cnt1, cnt1, wordSize); 7909 __ eor(tmp1, tmp1, tmp2); 7910 __ cbnz(tmp1, NOT_EQUAL_NO_POP); 7911 __ bind(ALIGNED16); 7912 } 7913 if (UseSIMDForArrayEquals) { 7914 if (SoftwarePrefetchHintDistance >= 0) { 7915 __ subs(tmp1, cnt1, prefetchLoopThreshold); 7916 __ br(__ LE, NO_PREFETCH_LARGE_LOOP); 7917 generate_large_array_equals_loop_simd(prefetchLoopThreshold, 7918 /* prfm = */ true, NOT_EQUAL); 7919 __ subs(zr, cnt1, nonPrefetchLoopThreshold); 7920 __ br(__ LT, TAIL); 7921 } 7922 __ bind(NO_PREFETCH_LARGE_LOOP); 7923 generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold, 7924 /* prfm = */ false, NOT_EQUAL); 7925 } else { 7926 __ push(spilled_regs, sp); 7927 if (SoftwarePrefetchHintDistance >= 0) { 7928 __ subs(tmp1, cnt1, prefetchLoopThreshold); 7929 __ br(__ LE, NO_PREFETCH_LARGE_LOOP); 7930 generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold, 7931 /* prfm = */ true, NOT_EQUAL); 7932 __ subs(zr, cnt1, nonPrefetchLoopThreshold); 7933 __ br(__ LT, TAIL); 7934 } 7935 __ bind(NO_PREFETCH_LARGE_LOOP); 7936 generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold, 7937 /* prfm = */ false, NOT_EQUAL); 7938 } 7939 __ bind(TAIL); 7940 __ cbz(cnt1, EQUAL); 7941 __ subs(cnt1, cnt1, wordSize); 7942 __ br(__ LE, POST_LOOP); 7943 __ bind(SMALL_LOOP); 7944 __ ldr(tmp1, Address(__ post(a1, wordSize))); 7945 __ ldr(tmp2, Address(__ post(a2, wordSize))); 7946 __ subs(cnt1, cnt1, wordSize); 7947 __ eor(tmp1, tmp1, tmp2); 7948 __ cbnz(tmp1, NOT_EQUAL); 7949 __ br(__ GT, SMALL_LOOP); 7950 __ bind(POST_LOOP); 7951 __ ldr(tmp1, Address(a1, cnt1)); 7952 __ ldr(tmp2, Address(a2, cnt1)); 7953 __ eor(tmp1, tmp1, tmp2); 7954 __ cbnz(tmp1, NOT_EQUAL); 7955 __ bind(EQUAL); 7956 __ mov(result, true); 7957 __ bind(NOT_EQUAL); 7958 if (!UseSIMDForArrayEquals) { 7959 __ pop(spilled_regs, sp); 7960 } 7961 __ bind(NOT_EQUAL_NO_POP); 7962 __ leave(); 7963 __ ret(lr); 7964 return entry; 7965 } 7966 7967 // result = r0 - return value. Contains initial hashcode value on entry. 
7968 // ary = r1 - array address 7969 // cnt = r2 - elements count 7970 // Clobbers: v0-v13, rscratch1, rscratch2 7971 address generate_large_arrays_hashcode(BasicType eltype) { 7972 const Register result = r0, ary = r1, cnt = r2; 7973 const FloatRegister vdata0 = v3, vdata1 = v2, vdata2 = v1, vdata3 = v0; 7974 const FloatRegister vmul0 = v4, vmul1 = v5, vmul2 = v6, vmul3 = v7; 7975 const FloatRegister vpow = v12; // powers of 31: <31^3, ..., 31^0> 7976 const FloatRegister vpowm = v13; 7977 7978 ARRAYS_HASHCODE_REGISTERS; 7979 7980 Label SMALL_LOOP, LARGE_LOOP_PREHEADER, LARGE_LOOP, TAIL, TAIL_SHORTCUT, BR_BASE; 7981 7982 unsigned int vf; // vectorization factor 7983 bool multiply_by_halves; 7984 Assembler::SIMD_Arrangement load_arrangement; 7985 switch (eltype) { 7986 case T_BOOLEAN: 7987 case T_BYTE: 7988 load_arrangement = Assembler::T8B; 7989 multiply_by_halves = true; 7990 vf = 8; 7991 break; 7992 case T_CHAR: 7993 case T_SHORT: 7994 load_arrangement = Assembler::T8H; 7995 multiply_by_halves = true; 7996 vf = 8; 7997 break; 7998 case T_INT: 7999 load_arrangement = Assembler::T4S; 8000 multiply_by_halves = false; 8001 vf = 4; 8002 break; 8003 default: 8004 ShouldNotReachHere(); 8005 } 8006 8007 // Unroll factor 8008 const unsigned uf = 4; 8009 8010 // Effective vectorization factor 8011 const unsigned evf = vf * uf; 8012 8013 __ align(CodeEntryAlignment); 8014 8015 StubGenStubId stub_id; 8016 switch (eltype) { 8017 case T_BOOLEAN: 8018 stub_id = StubGenStubId::large_arrays_hashcode_boolean_id; 8019 break; 8020 case T_BYTE: 8021 stub_id = StubGenStubId::large_arrays_hashcode_byte_id; 8022 break; 8023 case T_CHAR: 8024 stub_id = StubGenStubId::large_arrays_hashcode_char_id; 8025 break; 8026 case T_SHORT: 8027 stub_id = StubGenStubId::large_arrays_hashcode_short_id; 8028 break; 8029 case T_INT: 8030 stub_id = StubGenStubId::large_arrays_hashcode_int_id; 8031 break; 8032 default: 8033 stub_id = StubGenStubId::NO_STUBID; 8034 ShouldNotReachHere(); 8035 }; 8036 8037 StubCodeMark mark(this, stub_id); 8038 8039 address entry = __ pc(); 8040 __ enter(); 8041 8042 // Put 0-3'th powers of 31 into a single SIMD register together. The register will be used in 8043 // the SMALL and LARGE LOOPS' epilogues. The initialization is hoisted here and the register's 8044 // value shouldn't change throughout both loops. 8045 __ movw(rscratch1, intpow(31U, 3)); 8046 __ mov(vpow, Assembler::S, 0, rscratch1); 8047 __ movw(rscratch1, intpow(31U, 2)); 8048 __ mov(vpow, Assembler::S, 1, rscratch1); 8049 __ movw(rscratch1, intpow(31U, 1)); 8050 __ mov(vpow, Assembler::S, 2, rscratch1); 8051 __ movw(rscratch1, intpow(31U, 0)); 8052 __ mov(vpow, Assembler::S, 3, rscratch1); 8053 8054 __ mov(vmul0, Assembler::T16B, 0); 8055 __ mov(vmul0, Assembler::S, 3, result); 8056 8057 __ andr(rscratch2, cnt, (uf - 1) * vf); 8058 __ cbz(rscratch2, LARGE_LOOP_PREHEADER); 8059 8060 __ movw(rscratch1, intpow(31U, multiply_by_halves ? 
vf / 2 : vf));
8061 __ mov(vpowm, Assembler::S, 0, rscratch1);
8062
8063 // SMALL LOOP
8064 __ bind(SMALL_LOOP);
8065
8066 __ ld1(vdata0, load_arrangement, Address(__ post(ary, vf * type2aelembytes(eltype))));
8067 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
8068 __ subsw(rscratch2, rscratch2, vf);
8069
8070 if (load_arrangement == Assembler::T8B) {
8071 // Extend 8B to 8H to be able to use vector multiply
8072 // instructions
8073 assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H");
8074 if (is_signed_subword_type(eltype)) {
8075 __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
8076 } else {
8077 __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
8078 }
8079 }
8080
8081 switch (load_arrangement) {
8082 case Assembler::T4S:
8083 __ addv(vmul0, load_arrangement, vmul0, vdata0);
8084 break;
8085 case Assembler::T8B:
8086 case Assembler::T8H:
8087 assert(is_subword_type(eltype), "subword type expected");
8088 if (is_signed_subword_type(eltype)) {
8089 __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
8090 } else {
8091 __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
8092 }
8093 break;
8094 default:
8095 __ should_not_reach_here();
8096 }
8097
8098 // Process the upper half of a vector
8099 if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) {
8100 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
8101 if (is_signed_subword_type(eltype)) {
8102 __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
8103 } else {
8104 __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
8105 }
8106 }
8107
8108 __ br(Assembler::HI, SMALL_LOOP);
8109
8110 // SMALL LOOP'S EPILOGUE
8111 __ lsr(rscratch2, cnt, exact_log2(evf));
8112 __ cbnz(rscratch2, LARGE_LOOP_PREHEADER);
8113
8114 __ mulv(vmul0, Assembler::T4S, vmul0, vpow);
8115 __ addv(vmul0, Assembler::T4S, vmul0);
8116 __ umov(result, vmul0, Assembler::S, 0);
8117
8118 // TAIL
8119 __ bind(TAIL);
8120
8121 // The andr performs cnt % vf. The subtract shifted by 3 offsets past vf - 1 - (cnt % vf) pairs
8122 // of load + madd insns, i.e. it only executes cnt % vf load + madd pairs.
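// Illustrative note (derived from the code below, not a separate spec): each
// unrolled load + maddw pair in the tail is assumed to occupy 8 bytes of code
// (two 4-byte instructions), which is what the 'ext::uxtw, 3' shift in the
// adr/sub/br sequence relies on. For example, with vf == 8 and cnt % vf == 3,
// the branch target is BR_BASE - 3 * 8, so only the last three pairs execute,
// each folding one trailing element into the hash as
// result = result * 31 + element (rscratch2 is preloaded with 0x1f == 31).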
8123 assert(is_power_of_2(vf), "can't use this value to calculate the jump target PC"); 8124 __ andr(rscratch2, cnt, vf - 1); 8125 __ bind(TAIL_SHORTCUT); 8126 __ adr(rscratch1, BR_BASE); 8127 __ sub(rscratch1, rscratch1, rscratch2, ext::uxtw, 3); 8128 __ movw(rscratch2, 0x1f); 8129 __ br(rscratch1); 8130 8131 for (size_t i = 0; i < vf - 1; ++i) { 8132 __ load(rscratch1, Address(__ post(ary, type2aelembytes(eltype))), 8133 eltype); 8134 __ maddw(result, result, rscratch2, rscratch1); 8135 } 8136 __ bind(BR_BASE); 8137 8138 __ leave(); 8139 __ ret(lr); 8140 8141 // LARGE LOOP 8142 __ bind(LARGE_LOOP_PREHEADER); 8143 8144 __ lsr(rscratch2, cnt, exact_log2(evf)); 8145 8146 if (multiply_by_halves) { 8147 // 31^4 - multiplier between lower and upper parts of a register 8148 __ movw(rscratch1, intpow(31U, vf / 2)); 8149 __ mov(vpowm, Assembler::S, 1, rscratch1); 8150 // 31^28 - remainder of the iteraion multiplier, 28 = 32 - 4 8151 __ movw(rscratch1, intpow(31U, evf - vf / 2)); 8152 __ mov(vpowm, Assembler::S, 0, rscratch1); 8153 } else { 8154 // 31^16 8155 __ movw(rscratch1, intpow(31U, evf)); 8156 __ mov(vpowm, Assembler::S, 0, rscratch1); 8157 } 8158 8159 __ mov(vmul3, Assembler::T16B, 0); 8160 __ mov(vmul2, Assembler::T16B, 0); 8161 __ mov(vmul1, Assembler::T16B, 0); 8162 8163 __ bind(LARGE_LOOP); 8164 8165 __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 0); 8166 __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 0); 8167 __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 0); 8168 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0); 8169 8170 __ ld1(vdata3, vdata2, vdata1, vdata0, load_arrangement, 8171 Address(__ post(ary, evf * type2aelembytes(eltype)))); 8172 8173 if (load_arrangement == Assembler::T8B) { 8174 // Extend 8B to 8H to be able to use vector multiply 8175 // instructions 8176 assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H"); 8177 if (is_signed_subword_type(eltype)) { 8178 __ sxtl(vdata3, Assembler::T8H, vdata3, load_arrangement); 8179 __ sxtl(vdata2, Assembler::T8H, vdata2, load_arrangement); 8180 __ sxtl(vdata1, Assembler::T8H, vdata1, load_arrangement); 8181 __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement); 8182 } else { 8183 __ uxtl(vdata3, Assembler::T8H, vdata3, load_arrangement); 8184 __ uxtl(vdata2, Assembler::T8H, vdata2, load_arrangement); 8185 __ uxtl(vdata1, Assembler::T8H, vdata1, load_arrangement); 8186 __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement); 8187 } 8188 } 8189 8190 switch (load_arrangement) { 8191 case Assembler::T4S: 8192 __ addv(vmul3, load_arrangement, vmul3, vdata3); 8193 __ addv(vmul2, load_arrangement, vmul2, vdata2); 8194 __ addv(vmul1, load_arrangement, vmul1, vdata1); 8195 __ addv(vmul0, load_arrangement, vmul0, vdata0); 8196 break; 8197 case Assembler::T8B: 8198 case Assembler::T8H: 8199 assert(is_subword_type(eltype), "subword type expected"); 8200 if (is_signed_subword_type(eltype)) { 8201 __ saddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H); 8202 __ saddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H); 8203 __ saddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H); 8204 __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H); 8205 } else { 8206 __ uaddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H); 8207 __ uaddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H); 8208 __ uaddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H); 8209 __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H); 8210 } 8211 break; 8212 default: 8213 __ should_not_reach_here(); 
8214 } 8215 8216 // Process the upper half of a vector 8217 if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) { 8218 __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 1); 8219 __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 1); 8220 __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 1); 8221 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 1); 8222 if (is_signed_subword_type(eltype)) { 8223 __ saddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H); 8224 __ saddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H); 8225 __ saddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H); 8226 __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H); 8227 } else { 8228 __ uaddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H); 8229 __ uaddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H); 8230 __ uaddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H); 8231 __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H); 8232 } 8233 } 8234 8235 __ subsw(rscratch2, rscratch2, 1); 8236 __ br(Assembler::HI, LARGE_LOOP); 8237 8238 __ mulv(vmul3, Assembler::T4S, vmul3, vpow); 8239 __ addv(vmul3, Assembler::T4S, vmul3); 8240 __ umov(result, vmul3, Assembler::S, 0); 8241 8242 __ mov(rscratch2, intpow(31U, vf)); 8243 8244 __ mulv(vmul2, Assembler::T4S, vmul2, vpow); 8245 __ addv(vmul2, Assembler::T4S, vmul2); 8246 __ umov(rscratch1, vmul2, Assembler::S, 0); 8247 __ maddw(result, result, rscratch2, rscratch1); 8248 8249 __ mulv(vmul1, Assembler::T4S, vmul1, vpow); 8250 __ addv(vmul1, Assembler::T4S, vmul1); 8251 __ umov(rscratch1, vmul1, Assembler::S, 0); 8252 __ maddw(result, result, rscratch2, rscratch1); 8253 8254 __ mulv(vmul0, Assembler::T4S, vmul0, vpow); 8255 __ addv(vmul0, Assembler::T4S, vmul0); 8256 __ umov(rscratch1, vmul0, Assembler::S, 0); 8257 __ maddw(result, result, rscratch2, rscratch1); 8258 8259 __ andr(rscratch2, cnt, vf - 1); 8260 __ cbnz(rscratch2, TAIL_SHORTCUT); 8261 8262 __ leave(); 8263 __ ret(lr); 8264 8265 return entry; 8266 } 8267 8268 address generate_dsin_dcos(bool isCos) { 8269 __ align(CodeEntryAlignment); 8270 StubGenStubId stub_id = (isCos ? 
StubGenStubId::dcos_id : StubGenStubId::dsin_id); 8271 StubCodeMark mark(this, stub_id); 8272 address start = __ pc(); 8273 __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw, 8274 (address)StubRoutines::aarch64::_two_over_pi, 8275 (address)StubRoutines::aarch64::_pio2, 8276 (address)StubRoutines::aarch64::_dsin_coef, 8277 (address)StubRoutines::aarch64::_dcos_coef); 8278 return start; 8279 } 8280 8281 // code for comparing 16 characters of strings with Latin1 and Utf16 encoding 8282 void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1, 8283 Label &DIFF2) { 8284 Register cnt1 = r2, tmp2 = r11, tmp3 = r12; 8285 FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2; 8286 8287 __ ldrq(vtmp, Address(__ post(tmp2, 16))); 8288 __ ldr(tmpU, Address(__ post(cnt1, 8))); 8289 __ zip1(vtmp3, __ T16B, vtmp, vtmpZ); 8290 // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3 8291 8292 __ fmovd(tmpL, vtmp3); 8293 __ eor(rscratch2, tmp3, tmpL); 8294 __ cbnz(rscratch2, DIFF2); 8295 8296 __ ldr(tmp3, Address(__ post(cnt1, 8))); 8297 __ umov(tmpL, vtmp3, __ D, 1); 8298 __ eor(rscratch2, tmpU, tmpL); 8299 __ cbnz(rscratch2, DIFF1); 8300 8301 __ zip2(vtmp, __ T16B, vtmp, vtmpZ); 8302 __ ldr(tmpU, Address(__ post(cnt1, 8))); 8303 __ fmovd(tmpL, vtmp); 8304 __ eor(rscratch2, tmp3, tmpL); 8305 __ cbnz(rscratch2, DIFF2); 8306 8307 __ ldr(tmp3, Address(__ post(cnt1, 8))); 8308 __ umov(tmpL, vtmp, __ D, 1); 8309 __ eor(rscratch2, tmpU, tmpL); 8310 __ cbnz(rscratch2, DIFF1); 8311 } 8312 8313 // r0 = result 8314 // r1 = str1 8315 // r2 = cnt1 8316 // r3 = str2 8317 // r4 = cnt2 8318 // r10 = tmp1 8319 // r11 = tmp2 8320 address generate_compare_long_string_different_encoding(bool isLU) { 8321 __ align(CodeEntryAlignment); 8322 StubGenStubId stub_id = (isLU ? StubGenStubId::compare_long_string_LU_id : StubGenStubId::compare_long_string_UL_id); 8323 StubCodeMark mark(this, stub_id); 8324 address entry = __ pc(); 8325 Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2, 8326 DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH, 8327 LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2; 8328 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 8329 tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14; 8330 FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2; 8331 RegSet spilled_regs = RegSet::of(tmp3, tmp4); 8332 8333 int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2); 8334 8335 __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ); 8336 // cnt2 == amount of characters left to compare 8337 // Check already loaded first 4 symbols(vtmp and tmp2(LU)/tmp1(UL)) 8338 __ zip1(vtmp, __ T8B, vtmp, vtmpZ); 8339 __ add(str1, str1, isLU ? wordSize/2 : wordSize); 8340 __ add(str2, str2, isLU ? wordSize : wordSize/2); 8341 __ fmovd(isLU ? tmp1 : tmp2, vtmp); 8342 __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case. 8343 __ eor(rscratch2, tmp1, tmp2); 8344 __ mov(rscratch1, tmp2); 8345 __ cbnz(rscratch2, CALCULATE_DIFFERENCE); 8346 Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison 8347 tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison 8348 __ push(spilled_regs, sp); 8349 __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load 8350 __ mov(cnt1, isLU ? 
str2 : str1); // init the pointer to U next load 8351 8352 __ ldr(tmp3, Address(__ post(cnt1, 8))); 8353 8354 if (SoftwarePrefetchHintDistance >= 0) { 8355 __ subs(rscratch2, cnt2, prefetchLoopExitCondition); 8356 __ br(__ LT, NO_PREFETCH); 8357 __ bind(LARGE_LOOP_PREFETCH); 8358 __ prfm(Address(tmp2, SoftwarePrefetchHintDistance)); 8359 __ mov(tmp4, 2); 8360 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance)); 8361 __ bind(LARGE_LOOP_PREFETCH_REPEAT1); 8362 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 8363 __ subs(tmp4, tmp4, 1); 8364 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1); 8365 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance)); 8366 __ mov(tmp4, 2); 8367 __ bind(LARGE_LOOP_PREFETCH_REPEAT2); 8368 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 8369 __ subs(tmp4, tmp4, 1); 8370 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2); 8371 __ sub(cnt2, cnt2, 64); 8372 __ subs(rscratch2, cnt2, prefetchLoopExitCondition); 8373 __ br(__ GE, LARGE_LOOP_PREFETCH); 8374 } 8375 __ cbz(cnt2, LOAD_LAST); // no characters left except last load 8376 __ bind(NO_PREFETCH); 8377 __ subs(cnt2, cnt2, 16); 8378 __ br(__ LT, TAIL); 8379 __ align(OptoLoopAlignment); 8380 __ bind(SMALL_LOOP); // smaller loop 8381 __ subs(cnt2, cnt2, 16); 8382 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 8383 __ br(__ GE, SMALL_LOOP); 8384 __ cmn(cnt2, (u1)16); 8385 __ br(__ EQ, LOAD_LAST); 8386 __ bind(TAIL); // 1..15 characters left until last load (last 4 characters) 8387 __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string 8388 __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string 8389 __ ldr(tmp3, Address(cnt1, -8)); 8390 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load 8391 __ b(LOAD_LAST); 8392 __ bind(DIFF2); 8393 __ mov(tmpU, tmp3); 8394 __ bind(DIFF1); 8395 __ pop(spilled_regs, sp); 8396 __ b(CALCULATE_DIFFERENCE); 8397 __ bind(LOAD_LAST); 8398 // Last 4 UTF-16 characters are already pre-loaded into tmp3 by compare_string_16_x_LU. 8399 // No need to load it again 8400 __ mov(tmpU, tmp3); 8401 __ pop(spilled_regs, sp); 8402 8403 // tmp2 points to the address of the last 4 Latin1 characters right now 8404 __ ldrs(vtmp, Address(tmp2)); 8405 __ zip1(vtmp, __ T8B, vtmp, vtmpZ); 8406 __ fmovd(tmpL, vtmp); 8407 8408 __ eor(rscratch2, tmpU, tmpL); 8409 __ cbz(rscratch2, DONE); 8410 8411 // Find the first different characters in the longwords and 8412 // compute their difference. 
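// Illustrative walk-through of the sequence bound below: rscratch2 holds the
// XOR of two 8-byte chunks that are known to differ. Byte-reversing it (rev)
// and counting leading zeros (clz) locates the first differing 16-bit
// character from the low, first-in-memory end; and-ing with -16 rounds that
// bit index down to a character boundary. For instance, if only the second
// character (bits 16..31) differs, clz of the reversed value falls in
// [16, 31], the mask yields 16, and the lsrv/uxthw pairs then extract exactly
// that character from each operand before the final subw.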
8413 __ bind(CALCULATE_DIFFERENCE); 8414 __ rev(rscratch2, rscratch2); 8415 __ clz(rscratch2, rscratch2); 8416 __ andr(rscratch2, rscratch2, -16); 8417 __ lsrv(tmp1, tmp1, rscratch2); 8418 __ uxthw(tmp1, tmp1); 8419 __ lsrv(rscratch1, rscratch1, rscratch2); 8420 __ uxthw(rscratch1, rscratch1); 8421 __ subw(result, tmp1, rscratch1); 8422 __ bind(DONE); 8423 __ ret(lr); 8424 return entry; 8425 } 8426 8427 // r0 = input (float16) 8428 // v0 = result (float) 8429 // v1 = temporary float register 8430 address generate_float16ToFloat() { 8431 __ align(CodeEntryAlignment); 8432 StubGenStubId stub_id = StubGenStubId::hf2f_id; 8433 StubCodeMark mark(this, stub_id); 8434 address entry = __ pc(); 8435 BLOCK_COMMENT("Entry:"); 8436 __ flt16_to_flt(v0, r0, v1); 8437 __ ret(lr); 8438 return entry; 8439 } 8440 8441 // v0 = input (float) 8442 // r0 = result (float16) 8443 // v1 = temporary float register 8444 address generate_floatToFloat16() { 8445 __ align(CodeEntryAlignment); 8446 StubGenStubId stub_id = StubGenStubId::f2hf_id; 8447 StubCodeMark mark(this, stub_id); 8448 address entry = __ pc(); 8449 BLOCK_COMMENT("Entry:"); 8450 __ flt_to_flt16(r0, v0, v1); 8451 __ ret(lr); 8452 return entry; 8453 } 8454 8455 address generate_method_entry_barrier() { 8456 __ align(CodeEntryAlignment); 8457 StubGenStubId stub_id = StubGenStubId::method_entry_barrier_id; 8458 StubCodeMark mark(this, stub_id); 8459 8460 Label deoptimize_label; 8461 8462 address start = __ pc(); 8463 8464 BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler(); 8465 8466 if (bs_asm->nmethod_patching_type() == NMethodPatchingType::conc_instruction_and_data_patch) { 8467 BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod(); 8468 // We can get here despite the nmethod being good, if we have not 8469 // yet applied our cross modification fence (or data fence). 8470 Address thread_epoch_addr(rthread, in_bytes(bs_nm->thread_disarmed_guard_value_offset()) + 4); 8471 __ lea(rscratch2, ExternalAddress(bs_asm->patching_epoch_addr())); 8472 __ ldrw(rscratch2, rscratch2); 8473 __ strw(rscratch2, thread_epoch_addr); 8474 __ isb(); 8475 __ membar(__ LoadLoad); 8476 } 8477 8478 __ set_last_Java_frame(sp, rfp, lr, rscratch1); 8479 8480 __ enter(); 8481 __ add(rscratch2, sp, wordSize); // rscratch2 points to the saved lr 8482 8483 __ sub(sp, sp, 4 * wordSize); // four words for the returned {sp, fp, lr, pc} 8484 8485 __ push_call_clobbered_registers(); 8486 8487 __ mov(c_rarg0, rscratch2); 8488 __ call_VM_leaf 8489 (CAST_FROM_FN_PTR 8490 (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1); 8491 8492 __ reset_last_Java_frame(true); 8493 8494 __ mov(rscratch1, r0); 8495 8496 __ pop_call_clobbered_registers(); 8497 8498 __ cbnz(rscratch1, deoptimize_label); 8499 8500 __ leave(); 8501 __ ret(lr); 8502 8503 __ BIND(deoptimize_label); 8504 8505 __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize)); 8506 __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize)); 8507 8508 __ mov(sp, rscratch1); 8509 __ br(rscratch2); 8510 8511 return start; 8512 } 8513 8514 // r0 = result 8515 // r1 = str1 8516 // r2 = cnt1 8517 // r3 = str2 8518 // r4 = cnt2 8519 // r10 = tmp1 8520 // r11 = tmp2 8521 address generate_compare_long_string_same_encoding(bool isLL) { 8522 __ align(CodeEntryAlignment); 8523 StubGenStubId stub_id = (isLL ? 
StubGenStubId::compare_long_string_LL_id : StubGenStubId::compare_long_string_UU_id); 8524 StubCodeMark mark(this, stub_id); 8525 address entry = __ pc(); 8526 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 8527 tmp1 = r10, tmp2 = r11, tmp1h = rscratch1, tmp2h = rscratch2; 8528 8529 Label LARGE_LOOP_PREFETCH, LOOP_COMPARE16, DIFF, LESS16, LESS8, CAL_DIFFERENCE, LENGTH_DIFF; 8530 8531 // exit from large loop when less than 64 bytes left to read or we're about 8532 // to prefetch memory behind array border 8533 int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2); 8534 8535 // before jumping to stub, pre-load 8 bytes already, so do comparison directly 8536 __ eor(rscratch2, tmp1, tmp2); 8537 __ cbnz(rscratch2, CAL_DIFFERENCE); 8538 8539 __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2)); 8540 // update pointers, because of previous read 8541 __ add(str1, str1, wordSize); 8542 __ add(str2, str2, wordSize); 8543 if (SoftwarePrefetchHintDistance >= 0) { 8544 __ align(OptoLoopAlignment); 8545 __ bind(LARGE_LOOP_PREFETCH); 8546 __ prfm(Address(str1, SoftwarePrefetchHintDistance)); 8547 __ prfm(Address(str2, SoftwarePrefetchHintDistance)); 8548 8549 for (int i = 0; i < 4; i++) { 8550 __ ldp(tmp1, tmp1h, Address(str1, i * 16)); 8551 __ ldp(tmp2, tmp2h, Address(str2, i * 16)); 8552 __ cmp(tmp1, tmp2); 8553 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ); 8554 __ br(Assembler::NE, DIFF); 8555 } 8556 __ sub(cnt2, cnt2, isLL ? 64 : 32); 8557 __ add(str1, str1, 64); 8558 __ add(str2, str2, 64); 8559 __ subs(rscratch2, cnt2, largeLoopExitCondition); 8560 __ br(Assembler::GE, LARGE_LOOP_PREFETCH); 8561 __ cbz(cnt2, LENGTH_DIFF); // no more chars left? 8562 } 8563 8564 __ subs(rscratch1, cnt2, isLL ? 16 : 8); 8565 __ br(Assembler::LE, LESS16); 8566 __ align(OptoLoopAlignment); 8567 __ bind(LOOP_COMPARE16); 8568 __ ldp(tmp1, tmp1h, Address(__ post(str1, 16))); 8569 __ ldp(tmp2, tmp2h, Address(__ post(str2, 16))); 8570 __ cmp(tmp1, tmp2); 8571 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ); 8572 __ br(Assembler::NE, DIFF); 8573 __ sub(cnt2, cnt2, isLL ? 16 : 8); 8574 __ subs(rscratch2, cnt2, isLL ? 16 : 8); 8575 __ br(Assembler::LT, LESS16); 8576 8577 __ ldp(tmp1, tmp1h, Address(__ post(str1, 16))); 8578 __ ldp(tmp2, tmp2h, Address(__ post(str2, 16))); 8579 __ cmp(tmp1, tmp2); 8580 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ); 8581 __ br(Assembler::NE, DIFF); 8582 __ sub(cnt2, cnt2, isLL ? 16 : 8); 8583 __ subs(rscratch2, cnt2, isLL ? 16 : 8); 8584 __ br(Assembler::GE, LOOP_COMPARE16); 8585 __ cbz(cnt2, LENGTH_DIFF); 8586 8587 __ bind(LESS16); 8588 // each 8 compare 8589 __ subs(cnt2, cnt2, isLL ? 8 : 4); 8590 __ br(Assembler::LE, LESS8); 8591 __ ldr(tmp1, Address(__ post(str1, 8))); 8592 __ ldr(tmp2, Address(__ post(str2, 8))); 8593 __ eor(rscratch2, tmp1, tmp2); 8594 __ cbnz(rscratch2, CAL_DIFFERENCE); 8595 __ sub(cnt2, cnt2, isLL ? 
8 : 4); 8596 8597 __ bind(LESS8); // directly load last 8 bytes 8598 if (!isLL) { 8599 __ add(cnt2, cnt2, cnt2); 8600 } 8601 __ ldr(tmp1, Address(str1, cnt2)); 8602 __ ldr(tmp2, Address(str2, cnt2)); 8603 __ eor(rscratch2, tmp1, tmp2); 8604 __ cbz(rscratch2, LENGTH_DIFF); 8605 __ b(CAL_DIFFERENCE); 8606 8607 __ bind(DIFF); 8608 __ cmp(tmp1, tmp2); 8609 __ csel(tmp1, tmp1, tmp1h, Assembler::NE); 8610 __ csel(tmp2, tmp2, tmp2h, Assembler::NE); 8611 // reuse rscratch2 register for the result of eor instruction 8612 __ eor(rscratch2, tmp1, tmp2); 8613 8614 __ bind(CAL_DIFFERENCE); 8615 __ rev(rscratch2, rscratch2); 8616 __ clz(rscratch2, rscratch2); 8617 __ andr(rscratch2, rscratch2, isLL ? -8 : -16); 8618 __ lsrv(tmp1, tmp1, rscratch2); 8619 __ lsrv(tmp2, tmp2, rscratch2); 8620 if (isLL) { 8621 __ uxtbw(tmp1, tmp1); 8622 __ uxtbw(tmp2, tmp2); 8623 } else { 8624 __ uxthw(tmp1, tmp1); 8625 __ uxthw(tmp2, tmp2); 8626 } 8627 __ subw(result, tmp1, tmp2); 8628 8629 __ bind(LENGTH_DIFF); 8630 __ ret(lr); 8631 return entry; 8632 } 8633 8634 enum string_compare_mode { 8635 LL, 8636 LU, 8637 UL, 8638 UU, 8639 }; 8640 8641 // The following registers are declared in aarch64.ad 8642 // r0 = result 8643 // r1 = str1 8644 // r2 = cnt1 8645 // r3 = str2 8646 // r4 = cnt2 8647 // r10 = tmp1 8648 // r11 = tmp2 8649 // z0 = ztmp1 8650 // z1 = ztmp2 8651 // p0 = pgtmp1 8652 // p1 = pgtmp2 8653 address generate_compare_long_string_sve(string_compare_mode mode) { 8654 StubGenStubId stub_id; 8655 switch (mode) { 8656 case LL: stub_id = StubGenStubId::compare_long_string_LL_id; break; 8657 case LU: stub_id = StubGenStubId::compare_long_string_LU_id; break; 8658 case UL: stub_id = StubGenStubId::compare_long_string_UL_id; break; 8659 case UU: stub_id = StubGenStubId::compare_long_string_UU_id; break; 8660 default: ShouldNotReachHere(); 8661 } 8662 8663 __ align(CodeEntryAlignment); 8664 address entry = __ pc(); 8665 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 8666 tmp1 = r10, tmp2 = r11; 8667 8668 Label LOOP, DONE, MISMATCH; 8669 Register vec_len = tmp1; 8670 Register idx = tmp2; 8671 // The minimum of the string lengths has been stored in cnt2. 8672 Register cnt = cnt2; 8673 FloatRegister ztmp1 = z0, ztmp2 = z1; 8674 PRegister pgtmp1 = p0, pgtmp2 = p1; 8675 8676 #define LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx) \ 8677 switch (mode) { \ 8678 case LL: \ 8679 __ sve_ld1b(ztmp1, __ B, pgtmp1, Address(str1, idx)); \ 8680 __ sve_ld1b(ztmp2, __ B, pgtmp1, Address(str2, idx)); \ 8681 break; \ 8682 case LU: \ 8683 __ sve_ld1b(ztmp1, __ H, pgtmp1, Address(str1, idx)); \ 8684 __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \ 8685 break; \ 8686 case UL: \ 8687 __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \ 8688 __ sve_ld1b(ztmp2, __ H, pgtmp1, Address(str2, idx)); \ 8689 break; \ 8690 case UU: \ 8691 __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \ 8692 __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \ 8693 break; \ 8694 default: \ 8695 ShouldNotReachHere(); \ 8696 } 8697 8698 StubCodeMark mark(this, stub_id); 8699 8700 __ mov(idx, 0); 8701 __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt); 8702 8703 if (mode == LL) { 8704 __ sve_cntb(vec_len); 8705 } else { 8706 __ sve_cnth(vec_len); 8707 } 8708 8709 __ sub(rscratch1, cnt, vec_len); 8710 8711 __ bind(LOOP); 8712 8713 // main loop 8714 LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx); 8715 __ add(idx, idx, vec_len); 8716 // Compare strings. 
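// Rough sketch of the predicate-driven loop structure used here (illustrative
// pseudo-code only, not generated code; names as above):
//   pg = whilelt(0, cnt);              // all-true while a full vector fits
//   do {
//     za = load(str1 + idx, pg); zb = load(str2 + idx, pg); idx += vec_len;
//     if (any active lane of za != zb) goto MISMATCH;
//   } while (idx < cnt - vec_len);
//   pg = whilelt(idx, cnt);            // partial predicate for the tail
// The sve_cmp below writes the per-lane result into pgtmp2 and also sets the
// condition flags, so the following br(NE, ...) means "some active lane
// differed".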
__ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
8718 __ br(__ NE, MISMATCH);
8719 __ cmp(idx, rscratch1);
8720 __ br(__ LT, LOOP);
8721
8722 // post loop, last iteration
8723 __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
8724
8725 LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
8726 __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
8727 __ br(__ EQ, DONE);
8728
8729 __ bind(MISMATCH);
8730
8731 // Crop the vector to find its location.
8732 __ sve_brkb(pgtmp2, pgtmp1, pgtmp2, false /* isMerge */);
8733 // Extract the first different characters of each string.
8734 __ sve_lasta(rscratch1, mode == LL ? __ B : __ H, pgtmp2, ztmp1);
8735 __ sve_lasta(rscratch2, mode == LL ? __ B : __ H, pgtmp2, ztmp2);
8736
8737 // Compute the difference of the first different characters.
8738 __ sub(result, rscratch1, rscratch2);
8739
8740 __ bind(DONE);
8741 __ ret(lr);
8742 #undef LOAD_PAIR
8743 return entry;
8744 }
8745
8746 void generate_compare_long_strings() {
8747 if (UseSVE == 0) {
8748 StubRoutines::aarch64::_compare_long_string_LL
8749 = generate_compare_long_string_same_encoding(true);
8750 StubRoutines::aarch64::_compare_long_string_UU
8751 = generate_compare_long_string_same_encoding(false);
8752 StubRoutines::aarch64::_compare_long_string_LU
8753 = generate_compare_long_string_different_encoding(true);
8754 StubRoutines::aarch64::_compare_long_string_UL
8755 = generate_compare_long_string_different_encoding(false);
8756 } else {
8757 StubRoutines::aarch64::_compare_long_string_LL
8758 = generate_compare_long_string_sve(LL);
8759 StubRoutines::aarch64::_compare_long_string_UU
8760 = generate_compare_long_string_sve(UU);
8761 StubRoutines::aarch64::_compare_long_string_LU
8762 = generate_compare_long_string_sve(LU);
8763 StubRoutines::aarch64::_compare_long_string_UL
8764 = generate_compare_long_string_sve(UL);
8765 }
8766 }
8767
8768 // R0 = result
8769 // R1 = str2
8770 // R2 = cnt1
8771 // R3 = str1
8772 // R4 = cnt2
8773 // Clobbers: rscratch1, rscratch2, v0, v1, rflags
8774 //
8775 // This generic linear code uses a few additional ideas that make it faster:
8776 // 1) we can safely keep at least the 1st register of the pattern (since
8777 // length >= 8) and so skip the initial load (helps on systems with a single ld pipeline)
8778 // 2) we can use a "fast" algorithm for finding the single character to search for,
8779 // with fewer branches (one branch per loaded register instead
8780 // of one branch per symbol); this is where constants like
8781 // 0x0101...01, 0x00010001...0001, 0x7f7f...7f, 0x7fff7fff...7fff come from
8782 // 3) after loading and analyzing the 1st register of the source string, it can be
8783 // reused to search for every 1st-character entry, saving a few loads in
8784 // comparison with a simpler-but-slower implementation
8785 // 4) in order to avoid lots of push/pop operations, the code below heavily
8786 // re-uses/re-initializes/compresses register values, which makes the code
8787 // larger and a bit less readable; however, most of the extra operations are
8788 // issued during loads or branches, so the penalty is minimal
8789 address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
8790 StubGenStubId stub_id;
8791 if (str1_isL) {
8792 if (str2_isL) {
8793 stub_id = StubGenStubId::string_indexof_linear_ll_id;
8794 } else {
8795 stub_id = StubGenStubId::string_indexof_linear_ul_id;
8796 }
8797 } else {
8798 if (str2_isL) {
8799 ShouldNotReachHere();
8800 } else {
8801 stub_id =
StubGenStubId::string_indexof_linear_uu_id; 8802 } 8803 } 8804 __ align(CodeEntryAlignment); 8805 StubCodeMark mark(this, stub_id); 8806 address entry = __ pc(); 8807 8808 int str1_chr_size = str1_isL ? 1 : 2; 8809 int str2_chr_size = str2_isL ? 1 : 2; 8810 int str1_chr_shift = str1_isL ? 0 : 1; 8811 int str2_chr_shift = str2_isL ? 0 : 1; 8812 bool isL = str1_isL && str2_isL; 8813 // parameters 8814 Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4; 8815 // temporary registers 8816 Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23; 8817 RegSet spilled_regs = RegSet::range(tmp1, tmp4); 8818 // redefinitions 8819 Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3; 8820 8821 __ push(spilled_regs, sp); 8822 Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO, 8823 L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED, 8824 L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP, 8825 L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH, 8826 L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2, 8827 L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH; 8828 // Read whole register from str1. It is safe, because length >=8 here 8829 __ ldr(ch1, Address(str1)); 8830 // Read whole register from str2. It is safe, because length >=8 here 8831 __ ldr(ch2, Address(str2)); 8832 __ sub(cnt2, cnt2, cnt1); 8833 __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF); 8834 if (str1_isL != str2_isL) { 8835 __ eor(v0, __ T16B, v0, v0); 8836 } 8837 __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001); 8838 __ mul(first, first, tmp1); 8839 // check if we have less than 1 register to check 8840 __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1); 8841 if (str1_isL != str2_isL) { 8842 __ fmovd(v1, ch1); 8843 } 8844 __ br(__ LE, L_SMALL); 8845 __ eor(ch2, first, ch2); 8846 if (str1_isL != str2_isL) { 8847 __ zip1(v1, __ T16B, v1, v0); 8848 } 8849 __ sub(tmp2, ch2, tmp1); 8850 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 8851 __ bics(tmp2, tmp2, ch2); 8852 if (str1_isL != str2_isL) { 8853 __ fmovd(ch1, v1); 8854 } 8855 __ br(__ NE, L_HAS_ZERO); 8856 __ subs(cnt2, cnt2, wordSize/str2_chr_size); 8857 __ add(result, result, wordSize/str2_chr_size); 8858 __ add(str2, str2, wordSize); 8859 __ br(__ LT, L_POST_LOOP); 8860 __ BIND(L_LOOP); 8861 __ ldr(ch2, Address(str2)); 8862 __ eor(ch2, first, ch2); 8863 __ sub(tmp2, ch2, tmp1); 8864 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 8865 __ bics(tmp2, tmp2, ch2); 8866 __ br(__ NE, L_HAS_ZERO); 8867 __ BIND(L_LOOP_PROCEED); 8868 __ subs(cnt2, cnt2, wordSize/str2_chr_size); 8869 __ add(str2, str2, wordSize); 8870 __ add(result, result, wordSize/str2_chr_size); 8871 __ br(__ GE, L_LOOP); 8872 __ BIND(L_POST_LOOP); 8873 __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check 8874 __ br(__ LE, NOMATCH); 8875 __ ldr(ch2, Address(str2)); 8876 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift); 8877 __ eor(ch2, first, ch2); 8878 __ sub(tmp2, ch2, tmp1); 8879 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 8880 __ mov(tmp4, -1); // all bits set 8881 __ b(L_SMALL_PROCEED); 8882 __ align(OptoLoopAlignment); 8883 __ BIND(L_SMALL); 8884 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift); 8885 __ eor(ch2, first, ch2); 8886 if (str1_isL != str2_isL) { 8887 __ zip1(v1, __ T16B, v1, v0); 8888 } 8889 __ sub(tmp2, ch2, tmp1); 8890 __ mov(tmp4, -1); // all bits set 8891 __ orr(ch2, ch2, str2_isL ? 
0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 8892 if (str1_isL != str2_isL) { 8893 __ fmovd(ch1, v1); // move converted 4 symbols 8894 } 8895 __ BIND(L_SMALL_PROCEED); 8896 __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits. 8897 __ bic(tmp2, tmp2, ch2); 8898 __ ands(tmp2, tmp2, tmp4); // clear useless bits and check 8899 __ rbit(tmp2, tmp2); 8900 __ br(__ EQ, NOMATCH); 8901 __ BIND(L_SMALL_HAS_ZERO_LOOP); 8902 __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some cpu's 8903 __ cmp(cnt1, u1(wordSize/str2_chr_size)); 8904 __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2); 8905 if (str2_isL) { // LL 8906 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index" 8907 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe. 8908 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info 8909 __ add(result, result, tmp4, __ LSR, LogBitsPerByte); 8910 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 8911 } else { 8912 __ mov(ch2, 0xE); // all bits in byte set except last one 8913 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 8914 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 8915 __ lslv(tmp2, tmp2, tmp4); 8916 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 8917 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 8918 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 8919 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 8920 } 8921 __ cmp(ch1, ch2); 8922 __ mov(tmp4, wordSize/str2_chr_size); 8923 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 8924 __ BIND(L_SMALL_CMP_LOOP); 8925 str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift))) 8926 : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift))); 8927 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))) 8928 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))); 8929 __ add(tmp4, tmp4, 1); 8930 __ cmp(tmp4, cnt1); 8931 __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP); 8932 __ cmp(first, ch2); 8933 __ br(__ EQ, L_SMALL_CMP_LOOP); 8934 __ BIND(L_SMALL_CMP_LOOP_NOMATCH); 8935 __ cbz(tmp2, NOMATCH); // no more matches. exit 8936 __ clz(tmp4, tmp2); 8937 __ add(result, result, 1); // advance index 8938 __ add(str2, str2, str2_chr_size); // advance pointer 8939 __ b(L_SMALL_HAS_ZERO_LOOP); 8940 __ align(OptoLoopAlignment); 8941 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP); 8942 __ cmp(first, ch2); 8943 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 8944 __ b(DONE); 8945 __ align(OptoLoopAlignment); 8946 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2); 8947 if (str2_isL) { // LL 8948 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index" 8949 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe. 8950 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info 8951 __ add(result, result, tmp4, __ LSR, LogBitsPerByte); 8952 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 8953 } else { 8954 __ mov(ch2, 0xE); // all bits in byte set except last one 8955 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 8956 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 
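// Note on the index arithmetic below (the analogous blocks elsewhere in this
// stub work the same way): tmp4 is clz of the bit-reversed match mask, i.e.
// the bit offset of the first candidate match, so shifting it right by
// LogBitsPerByte (plus str2_chr_shift for UTF-16) turns it into a byte,
// respectively character, index used to advance result and str2; the
// lslv/lsl pair then shifts the already-consumed bits out of tmp2 so the
// next clz finds the following candidate.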
8957 __ lslv(tmp2, tmp2, tmp4); 8958 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 8959 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 8960 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 8961 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 8962 } 8963 __ cmp(ch1, ch2); 8964 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 8965 __ b(DONE); 8966 __ align(OptoLoopAlignment); 8967 __ BIND(L_HAS_ZERO); 8968 __ rbit(tmp2, tmp2); 8969 __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPU's 8970 // Now, perform compression of counters(cnt2 and cnt1) into one register. 8971 // It's fine because both counters are 32bit and are not changed in this 8972 // loop. Just restore it on exit. So, cnt1 can be re-used in this loop. 8973 __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2); 8974 __ sub(result, result, 1); 8975 __ BIND(L_HAS_ZERO_LOOP); 8976 __ mov(cnt1, wordSize/str2_chr_size); 8977 __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2); 8978 __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare 8979 if (str2_isL) { 8980 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index 8981 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 8982 __ lslv(tmp2, tmp2, tmp4); 8983 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 8984 __ add(tmp4, tmp4, 1); 8985 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 8986 __ lsl(tmp2, tmp2, 1); 8987 __ mov(tmp4, wordSize/str2_chr_size); 8988 } else { 8989 __ mov(ch2, 0xE); 8990 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 8991 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 8992 __ lslv(tmp2, tmp2, tmp4); 8993 __ add(tmp4, tmp4, 1); 8994 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 8995 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); 8996 __ lsl(tmp2, tmp2, 1); 8997 __ mov(tmp4, wordSize/str2_chr_size); 8998 __ sub(str2, str2, str2_chr_size); 8999 } 9000 __ cmp(ch1, ch2); 9001 __ mov(tmp4, wordSize/str2_chr_size); 9002 __ br(__ NE, L_CMP_LOOP_NOMATCH); 9003 __ BIND(L_CMP_LOOP); 9004 str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift))) 9005 : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift))); 9006 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))) 9007 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))); 9008 __ add(tmp4, tmp4, 1); 9009 __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2); 9010 __ br(__ GE, L_CMP_LOOP_LAST_CMP); 9011 __ cmp(cnt1, ch2); 9012 __ br(__ EQ, L_CMP_LOOP); 9013 __ BIND(L_CMP_LOOP_NOMATCH); 9014 // here we're not matched 9015 __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop 9016 __ clz(tmp4, tmp2); 9017 __ add(str2, str2, str2_chr_size); // advance pointer 9018 __ b(L_HAS_ZERO_LOOP); 9019 __ align(OptoLoopAlignment); 9020 __ BIND(L_CMP_LOOP_LAST_CMP); 9021 __ cmp(cnt1, ch2); 9022 __ br(__ NE, L_CMP_LOOP_NOMATCH); 9023 __ b(DONE); 9024 __ align(OptoLoopAlignment); 9025 __ BIND(L_CMP_LOOP_LAST_CMP2); 9026 if (str2_isL) { 9027 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index 9028 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 
9029 __ lslv(tmp2, tmp2, tmp4); 9030 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 9031 __ add(tmp4, tmp4, 1); 9032 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 9033 __ lsl(tmp2, tmp2, 1); 9034 } else { 9035 __ mov(ch2, 0xE); 9036 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 9037 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 9038 __ lslv(tmp2, tmp2, tmp4); 9039 __ add(tmp4, tmp4, 1); 9040 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 9041 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); 9042 __ lsl(tmp2, tmp2, 1); 9043 __ sub(str2, str2, str2_chr_size); 9044 } 9045 __ cmp(ch1, ch2); 9046 __ br(__ NE, L_CMP_LOOP_NOMATCH); 9047 __ b(DONE); 9048 __ align(OptoLoopAlignment); 9049 __ BIND(L_HAS_ZERO_LOOP_NOMATCH); 9050 // 1) Restore "result" index. Index was wordSize/str2_chr_size * N until 9051 // L_HAS_ZERO block. Byte octet was analyzed in L_HAS_ZERO_LOOP, 9052 // so, result was increased at max by wordSize/str2_chr_size - 1, so, 9053 // respective high bit wasn't changed. L_LOOP_PROCEED will increase 9054 // result by analyzed characters value, so, we can just reset lower bits 9055 // in result here. Clear 2 lower bits for UU/UL and 3 bits for LL 9056 // 2) restore cnt1 and cnt2 values from "compressed" cnt2 9057 // 3) advance str2 value to represent next str2 octet. result & 7/3 is 9058 // index of last analyzed substring inside current octet. So, str2 in at 9059 // respective start address. We need to advance it to next octet 9060 __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed 9061 __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2); 9062 __ bfm(result, zr, 0, 2 - str2_chr_shift); 9063 __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2 9064 __ movw(cnt2, cnt2); 9065 __ b(L_LOOP_PROCEED); 9066 __ align(OptoLoopAlignment); 9067 __ BIND(NOMATCH); 9068 __ mov(result, -1); 9069 __ BIND(DONE); 9070 __ pop(spilled_regs, sp); 9071 __ ret(lr); 9072 return entry; 9073 } 9074 9075 void generate_string_indexof_stubs() { 9076 StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true); 9077 StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false); 9078 StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false); 9079 } 9080 9081 void inflate_and_store_2_fp_registers(bool generatePrfm, 9082 FloatRegister src1, FloatRegister src2) { 9083 Register dst = r1; 9084 __ zip1(v1, __ T16B, src1, v0); 9085 __ zip2(v2, __ T16B, src1, v0); 9086 if (generatePrfm) { 9087 __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM); 9088 } 9089 __ zip1(v3, __ T16B, src2, v0); 9090 __ zip2(v4, __ T16B, src2, v0); 9091 __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64))); 9092 } 9093 9094 // R0 = src 9095 // R1 = dst 9096 // R2 = len 9097 // R3 = len >> 3 9098 // V0 = 0 9099 // v1 = loaded 8 bytes 9100 // Clobbers: r0, r1, r3, rscratch1, rflags, v0-v6 9101 address generate_large_byte_array_inflate() { 9102 __ align(CodeEntryAlignment); 9103 StubGenStubId stub_id = StubGenStubId::large_byte_array_inflate_id; 9104 StubCodeMark mark(this, stub_id); 9105 address entry = __ pc(); 9106 Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE; 9107 Register src = r0, dst = r1, len = r2, octetCounter = r3; 9108 const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4; 9109 9110 // do one more 8-byte read to have address 16-byte aligned in 
most cases 9111 // also use single store instruction 9112 __ ldrd(v2, __ post(src, 8)); 9113 __ sub(octetCounter, octetCounter, 2); 9114 __ zip1(v1, __ T16B, v1, v0); 9115 __ zip1(v2, __ T16B, v2, v0); 9116 __ st1(v1, v2, __ T16B, __ post(dst, 32)); 9117 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 9118 __ subs(rscratch1, octetCounter, large_loop_threshold); 9119 __ br(__ LE, LOOP_START); 9120 __ b(LOOP_PRFM_START); 9121 __ bind(LOOP_PRFM); 9122 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 9123 __ bind(LOOP_PRFM_START); 9124 __ prfm(Address(src, SoftwarePrefetchHintDistance)); 9125 __ sub(octetCounter, octetCounter, 8); 9126 __ subs(rscratch1, octetCounter, large_loop_threshold); 9127 inflate_and_store_2_fp_registers(true, v3, v4); 9128 inflate_and_store_2_fp_registers(true, v5, v6); 9129 __ br(__ GT, LOOP_PRFM); 9130 __ cmp(octetCounter, (u1)8); 9131 __ br(__ LT, DONE); 9132 __ bind(LOOP); 9133 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 9134 __ bind(LOOP_START); 9135 __ sub(octetCounter, octetCounter, 8); 9136 __ cmp(octetCounter, (u1)8); 9137 inflate_and_store_2_fp_registers(false, v3, v4); 9138 inflate_and_store_2_fp_registers(false, v5, v6); 9139 __ br(__ GE, LOOP); 9140 __ bind(DONE); 9141 __ ret(lr); 9142 return entry; 9143 } 9144 9145 /** 9146 * Arguments: 9147 * 9148 * Input: 9149 * c_rarg0 - current state address 9150 * c_rarg1 - H key address 9151 * c_rarg2 - data address 9152 * c_rarg3 - number of blocks 9153 * 9154 * Output: 9155 * Updated state at c_rarg0 9156 */ 9157 address generate_ghash_processBlocks() { 9158 // Bafflingly, GCM uses little-endian for the byte order, but 9159 // big-endian for the bit order. For example, the polynomial 1 is 9160 // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00. 9161 // 9162 // So, we must either reverse the bytes in each word and do 9163 // everything big-endian or reverse the bits in each byte and do 9164 // it little-endian. On AArch64 it's more idiomatic to reverse 9165 // the bits in each byte (we have an instruction, RBIT, to do 9166 // that) and keep the data in little-endian bit order through the 9167 // calculation, bit-reversing the inputs and outputs. 9168 9169 StubGenStubId stub_id = StubGenStubId::ghash_processBlocks_id; 9170 StubCodeMark mark(this, stub_id); 9171 __ align(wordSize * 2); 9172 address p = __ pc(); 9173 __ emit_int64(0x87); // The low-order bits of the field 9174 // polynomial (i.e. 
p = z^7+z^2+z+1) 9175 // repeated in the low and high parts of a 9176 // 128-bit vector 9177 __ emit_int64(0x87); 9178 9179 __ align(CodeEntryAlignment); 9180 address start = __ pc(); 9181 9182 Register state = c_rarg0; 9183 Register subkeyH = c_rarg1; 9184 Register data = c_rarg2; 9185 Register blocks = c_rarg3; 9186 9187 FloatRegister vzr = v30; 9188 __ eor(vzr, __ T16B, vzr, vzr); // zero register 9189 9190 __ ldrq(v24, p); // The field polynomial 9191 9192 __ ldrq(v0, Address(state)); 9193 __ ldrq(v1, Address(subkeyH)); 9194 9195 __ rev64(v0, __ T16B, v0); // Bit-reverse words in state and subkeyH 9196 __ rbit(v0, __ T16B, v0); 9197 __ rev64(v1, __ T16B, v1); 9198 __ rbit(v1, __ T16B, v1); 9199 9200 __ ext(v4, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1 9201 __ eor(v4, __ T16B, v4, v1); // xor subkeyH into subkeyL (Karatsuba: (A1+A0)) 9202 9203 { 9204 Label L_ghash_loop; 9205 __ bind(L_ghash_loop); 9206 9207 __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit 9208 // reversing each byte 9209 __ rbit(v2, __ T16B, v2); 9210 __ eor(v2, __ T16B, v0, v2); // bit-swapped data ^ bit-swapped state 9211 9212 // Multiply state in v2 by subkey in v1 9213 __ ghash_multiply(/*result_lo*/v5, /*result_hi*/v7, 9214 /*a*/v1, /*b*/v2, /*a1_xor_a0*/v4, 9215 /*temps*/v6, v3, /*reuse/clobber b*/v2); 9216 // Reduce v7:v5 by the field polynomial 9217 __ ghash_reduce(/*result*/v0, /*lo*/v5, /*hi*/v7, /*p*/v24, vzr, /*temp*/v3); 9218 9219 __ sub(blocks, blocks, 1); 9220 __ cbnz(blocks, L_ghash_loop); 9221 } 9222 9223 // The bit-reversed result is at this point in v0 9224 __ rev64(v0, __ T16B, v0); 9225 __ rbit(v0, __ T16B, v0); 9226 9227 __ st1(v0, __ T16B, state); 9228 __ ret(lr); 9229 9230 return start; 9231 } 9232 9233 address generate_ghash_processBlocks_wide() { 9234 address small = generate_ghash_processBlocks(); 9235 9236 StubGenStubId stub_id = StubGenStubId::ghash_processBlocks_wide_id; 9237 StubCodeMark mark(this, stub_id); 9238 __ align(wordSize * 2); 9239 address p = __ pc(); 9240 __ emit_int64(0x87); // The low-order bits of the field 9241 // polynomial (i.e. p = z^7+z^2+z+1) 9242 // repeated in the low and high parts of a 9243 // 128-bit vector 9244 __ emit_int64(0x87); 9245 9246 __ align(CodeEntryAlignment); 9247 address start = __ pc(); 9248 9249 Register state = c_rarg0; 9250 Register subkeyH = c_rarg1; 9251 Register data = c_rarg2; 9252 Register blocks = c_rarg3; 9253 9254 const int unroll = 4; 9255 9256 __ cmp(blocks, (unsigned char)(unroll * 2)); 9257 __ br(__ LT, small); 9258 9259 if (unroll > 1) { 9260 // Save state before entering routine 9261 __ sub(sp, sp, 4 * 16); 9262 __ st1(v12, v13, v14, v15, __ T16B, Address(sp)); 9263 __ sub(sp, sp, 4 * 16); 9264 __ st1(v8, v9, v10, v11, __ T16B, Address(sp)); 9265 } 9266 9267 __ ghash_processBlocks_wide(p, state, subkeyH, data, blocks, unroll); 9268 9269 if (unroll > 1) { 9270 // And restore state 9271 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16)); 9272 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16)); 9273 } 9274 9275 __ cmp(blocks, (unsigned char)0); 9276 __ br(__ GT, small); 9277 9278 __ ret(lr); 9279 9280 return start; 9281 } 9282 9283 void generate_base64_encode_simdround(Register src, Register dst, 9284 FloatRegister codec, u8 size) { 9285 9286 FloatRegister in0 = v4, in1 = v5, in2 = v6; 9287 FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19; 9288 FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23; 9289 9290 Assembler::SIMD_Arrangement arrangement = size == 16 ? 
__ T16B : __ T8B; 9291 9292 __ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size)); 9293 9294 __ ushr(ind0, arrangement, in0, 2); 9295 9296 __ ushr(ind1, arrangement, in1, 2); 9297 __ shl(in0, arrangement, in0, 6); 9298 __ orr(ind1, arrangement, ind1, in0); 9299 __ ushr(ind1, arrangement, ind1, 2); 9300 9301 __ ushr(ind2, arrangement, in2, 4); 9302 __ shl(in1, arrangement, in1, 4); 9303 __ orr(ind2, arrangement, in1, ind2); 9304 __ ushr(ind2, arrangement, ind2, 2); 9305 9306 __ shl(ind3, arrangement, in2, 2); 9307 __ ushr(ind3, arrangement, ind3, 2); 9308 9309 __ tbl(out0, arrangement, codec, 4, ind0); 9310 __ tbl(out1, arrangement, codec, 4, ind1); 9311 __ tbl(out2, arrangement, codec, 4, ind2); 9312 __ tbl(out3, arrangement, codec, 4, ind3); 9313 9314 __ st4(out0, out1, out2, out3, arrangement, __ post(dst, 4 * size)); 9315 } 9316 9317 /** 9318 * Arguments: 9319 * 9320 * Input: 9321 * c_rarg0 - src_start 9322 * c_rarg1 - src_offset 9323 * c_rarg2 - src_length 9324 * c_rarg3 - dest_start 9325 * c_rarg4 - dest_offset 9326 * c_rarg5 - isURL 9327 * 9328 */ 9329 address generate_base64_encodeBlock() { 9330 9331 static const char toBase64[64] = { 9332 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 9333 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 9334 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 9335 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 9336 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/' 9337 }; 9338 9339 static const char toBase64URL[64] = { 9340 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 9341 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 9342 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 9343 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 9344 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_' 9345 }; 9346 9347 __ align(CodeEntryAlignment); 9348 StubGenStubId stub_id = StubGenStubId::base64_encodeBlock_id; 9349 StubCodeMark mark(this, stub_id); 9350 address start = __ pc(); 9351 9352 Register src = c_rarg0; // source array 9353 Register soff = c_rarg1; // source start offset 9354 Register send = c_rarg2; // source end offset 9355 Register dst = c_rarg3; // dest array 9356 Register doff = c_rarg4; // position for writing to dest array 9357 Register isURL = c_rarg5; // Base64 or URL character set 9358 9359 // c_rarg6 and c_rarg7 are free to use as temps 9360 Register codec = c_rarg6; 9361 Register length = c_rarg7; 9362 9363 Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit; 9364 9365 __ add(src, src, soff); 9366 __ add(dst, dst, doff); 9367 __ sub(length, send, soff); 9368 9369 // load the codec base address 9370 __ lea(codec, ExternalAddress((address) toBase64)); 9371 __ cbz(isURL, ProcessData); 9372 __ lea(codec, ExternalAddress((address) toBase64URL)); 9373 9374 __ BIND(ProcessData); 9375 9376 // too short to formup a SIMD loop, roll back 9377 __ cmp(length, (u1)24); 9378 __ br(Assembler::LT, Process3B); 9379 9380 __ ld1(v0, v1, v2, v3, __ T16B, Address(codec)); 9381 9382 __ BIND(Process48B); 9383 __ cmp(length, (u1)48); 9384 __ br(Assembler::LT, Process24B); 9385 generate_base64_encode_simdround(src, dst, v0, 16); 9386 __ sub(length, length, 48); 9387 __ b(Process48B); 9388 9389 __ BIND(Process24B); 9390 __ cmp(length, (u1)24); 9391 __ br(Assembler::LT, SIMDExit); 9392 generate_base64_encode_simdround(src, dst, v0, 8); 9393 __ sub(length, length, 24); 9394 9395 __ BIND(SIMDExit); 9396 
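// Scalar tail (Process3B below), summarized here for orientation: each
// iteration packs three source bytes b0:b1:b2 into a 24-bit value and splits
// it into four 6-bit indices (bits 23..18, 17..12, 11..6 and 5..0), each of
// which is looked up in the codec table and stored as one output character.
// This is just a restatement of the code that follows.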
__ cbz(length, Exit); 9397 9398 __ BIND(Process3B); 9399 // 3 src bytes, 24 bits 9400 __ ldrb(r10, __ post(src, 1)); 9401 __ ldrb(r11, __ post(src, 1)); 9402 __ ldrb(r12, __ post(src, 1)); 9403 __ orrw(r11, r11, r10, Assembler::LSL, 8); 9404 __ orrw(r12, r12, r11, Assembler::LSL, 8); 9405 // codec index 9406 __ ubfmw(r15, r12, 18, 23); 9407 __ ubfmw(r14, r12, 12, 17); 9408 __ ubfmw(r13, r12, 6, 11); 9409 __ andw(r12, r12, 63); 9410 // get the code based on the codec 9411 __ ldrb(r15, Address(codec, r15, Address::uxtw(0))); 9412 __ ldrb(r14, Address(codec, r14, Address::uxtw(0))); 9413 __ ldrb(r13, Address(codec, r13, Address::uxtw(0))); 9414 __ ldrb(r12, Address(codec, r12, Address::uxtw(0))); 9415 __ strb(r15, __ post(dst, 1)); 9416 __ strb(r14, __ post(dst, 1)); 9417 __ strb(r13, __ post(dst, 1)); 9418 __ strb(r12, __ post(dst, 1)); 9419 __ sub(length, length, 3); 9420 __ cbnz(length, Process3B); 9421 9422 __ BIND(Exit); 9423 __ ret(lr); 9424 9425 return start; 9426 } 9427 9428 void generate_base64_decode_simdround(Register src, Register dst, 9429 FloatRegister codecL, FloatRegister codecH, int size, Label& Exit) { 9430 9431 FloatRegister in0 = v16, in1 = v17, in2 = v18, in3 = v19; 9432 FloatRegister out0 = v20, out1 = v21, out2 = v22; 9433 9434 FloatRegister decL0 = v23, decL1 = v24, decL2 = v25, decL3 = v26; 9435 FloatRegister decH0 = v28, decH1 = v29, decH2 = v30, decH3 = v31; 9436 9437 Label NoIllegalData, ErrorInLowerHalf, StoreLegalData; 9438 9439 Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B; 9440 9441 __ ld4(in0, in1, in2, in3, arrangement, __ post(src, 4 * size)); 9442 9443 // we need unsigned saturating subtract, to make sure all input values 9444 // in range [0, 63] will have 0U value in the higher half lookup 9445 __ uqsubv(decH0, __ T16B, in0, v27); 9446 __ uqsubv(decH1, __ T16B, in1, v27); 9447 __ uqsubv(decH2, __ T16B, in2, v27); 9448 __ uqsubv(decH3, __ T16B, in3, v27); 9449 9450 // lower half lookup 9451 __ tbl(decL0, arrangement, codecL, 4, in0); 9452 __ tbl(decL1, arrangement, codecL, 4, in1); 9453 __ tbl(decL2, arrangement, codecL, 4, in2); 9454 __ tbl(decL3, arrangement, codecL, 4, in3); 9455 9456 // higher half lookup 9457 __ tbx(decH0, arrangement, codecH, 4, decH0); 9458 __ tbx(decH1, arrangement, codecH, 4, decH1); 9459 __ tbx(decH2, arrangement, codecH, 4, decH2); 9460 __ tbx(decH3, arrangement, codecH, 4, decH3); 9461 9462 // combine lower and higher 9463 __ orr(decL0, arrangement, decL0, decH0); 9464 __ orr(decL1, arrangement, decL1, decH1); 9465 __ orr(decL2, arrangement, decL2, decH2); 9466 __ orr(decL3, arrangement, decL3, decH3); 9467 9468 // check illegal inputs, value larger than 63 (maximum of 6 bits) 9469 __ cm(Assembler::HI, decH0, arrangement, decL0, v27); 9470 __ cm(Assembler::HI, decH1, arrangement, decL1, v27); 9471 __ cm(Assembler::HI, decH2, arrangement, decL2, v27); 9472 __ cm(Assembler::HI, decH3, arrangement, decL3, v27); 9473 __ orr(in0, arrangement, decH0, decH1); 9474 __ orr(in1, arrangement, decH2, decH3); 9475 __ orr(in2, arrangement, in0, in1); 9476 __ umaxv(in3, arrangement, in2); 9477 __ umov(rscratch2, in3, __ B, 0); 9478 9479 // get the data to output 9480 __ shl(out0, arrangement, decL0, 2); 9481 __ ushr(out1, arrangement, decL1, 4); 9482 __ orr(out0, arrangement, out0, out1); 9483 __ shl(out1, arrangement, decL1, 4); 9484 __ ushr(out2, arrangement, decL2, 2); 9485 __ orr(out1, arrangement, out1, out2); 9486 __ shl(out2, arrangement, decL2, 6); 9487 __ orr(out2, arrangement, out2, decL3); 9488 9489 __ 
cbz(rscratch2, NoIllegalData); 9490 9491 // handle illegal input 9492 __ umov(r10, in2, __ D, 0); 9493 if (size == 16) { 9494 __ cbnz(r10, ErrorInLowerHalf); 9495 9496 // illegal input is in higher half, store the lower half now. 9497 __ st3(out0, out1, out2, __ T8B, __ post(dst, 24)); 9498 9499 __ umov(r10, in2, __ D, 1); 9500 __ umov(r11, out0, __ D, 1); 9501 __ umov(r12, out1, __ D, 1); 9502 __ umov(r13, out2, __ D, 1); 9503 __ b(StoreLegalData); 9504 9505 __ BIND(ErrorInLowerHalf); 9506 } 9507 __ umov(r11, out0, __ D, 0); 9508 __ umov(r12, out1, __ D, 0); 9509 __ umov(r13, out2, __ D, 0); 9510 9511 __ BIND(StoreLegalData); 9512 __ tbnz(r10, 5, Exit); // 0xff indicates illegal input 9513 __ strb(r11, __ post(dst, 1)); 9514 __ strb(r12, __ post(dst, 1)); 9515 __ strb(r13, __ post(dst, 1)); 9516 __ lsr(r10, r10, 8); 9517 __ lsr(r11, r11, 8); 9518 __ lsr(r12, r12, 8); 9519 __ lsr(r13, r13, 8); 9520 __ b(StoreLegalData); 9521 9522 __ BIND(NoIllegalData); 9523 __ st3(out0, out1, out2, arrangement, __ post(dst, 3 * size)); 9524 } 9525 9526 9527 /** 9528 * Arguments: 9529 * 9530 * Input: 9531 * c_rarg0 - src_start 9532 * c_rarg1 - src_offset 9533 * c_rarg2 - src_length 9534 * c_rarg3 - dest_start 9535 * c_rarg4 - dest_offset 9536 * c_rarg5 - isURL 9537 * c_rarg6 - isMIME 9538 * 9539 */ 9540 address generate_base64_decodeBlock() { 9541 9542 // The SIMD part of this Base64 decode intrinsic is based on the algorithm outlined 9543 // on http://0x80.pl/articles/base64-simd-neon.html#encoding-quadwords, in section 9544 // titled "Base64 decoding". 9545 9546 // Non-SIMD lookup tables are mostly dumped from fromBase64 array used in java.util.Base64, 9547 // except the trailing character '=' is also treated illegal value in this intrinsic. That 9548 // is java.util.Base64.fromBase64['='] = -2, while fromBase(URL)64ForNoSIMD['='] = 255 here. 
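// Worked example of the scalar mapping defined by the tables below: the
// input "TWFu" maps to the sextets 19, 22, 5 and 46, which reassemble into
// the three bytes 0x4D 0x61 0x6E ("Man"). Any 255 entry propagates a set
// bit 7 through the orrw chain in the scalar loop, which is what the
// tbnz(r14, 7, Exit) check keys off.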
9549 static const uint8_t fromBase64ForNoSIMD[256] = { 9550 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9551 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9552 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 255u, 63u, 9553 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 9554 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u, 9555 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 255u, 9556 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u, 9557 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u, 9558 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9559 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9560 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9561 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9562 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9563 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9564 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9565 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9566 }; 9567 9568 static const uint8_t fromBase64URLForNoSIMD[256] = { 9569 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9570 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9571 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 9572 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 9573 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u, 9574 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 63u, 9575 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u, 9576 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u, 9577 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9578 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9579 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9580 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9581 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9582 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9583 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9584 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9585 }; 9586 9587 // A legal value of base64 code is in range [0, 127]. We need two lookups 9588 // with tbl/tbx and combine them to get the decode data. The 1st table vector 9589 // lookup use tbl, out of range indices are set to 0 in destination. 
The 2nd 9590 // table vector lookup use tbx, out of range indices are unchanged in 9591 // destination. Input [64..126] is mapped to index [65, 127] in second lookup. 9592 // The value of index 64 is set to 0, so that we know that we already get the 9593 // decoded data with the 1st lookup. 9594 static const uint8_t fromBase64ForSIMD[128] = { 9595 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9596 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9597 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 255u, 63u, 9598 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 9599 0u, 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 9600 14u, 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 9601 255u, 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 9602 40u, 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 9603 }; 9604 9605 static const uint8_t fromBase64URLForSIMD[128] = { 9606 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9607 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9608 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 9609 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 9610 0u, 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 9611 14u, 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 9612 63u, 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 9613 40u, 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 9614 }; 9615 9616 __ align(CodeEntryAlignment); 9617 StubGenStubId stub_id = StubGenStubId::base64_decodeBlock_id; 9618 StubCodeMark mark(this, stub_id); 9619 address start = __ pc(); 9620 9621 Register src = c_rarg0; // source array 9622 Register soff = c_rarg1; // source start offset 9623 Register send = c_rarg2; // source end offset 9624 Register dst = c_rarg3; // dest array 9625 Register doff = c_rarg4; // position for writing to dest array 9626 Register isURL = c_rarg5; // Base64 or URL character set 9627 Register isMIME = c_rarg6; // Decoding MIME block - unused in this implementation 9628 9629 Register length = send; // reuse send as length of source data to process 9630 9631 Register simd_codec = c_rarg6; 9632 Register nosimd_codec = c_rarg7; 9633 9634 Label ProcessData, Process64B, Process32B, Process4B, SIMDEnter, SIMDExit, Exit; 9635 9636 __ enter(); 9637 9638 __ add(src, src, soff); 9639 __ add(dst, dst, doff); 9640 9641 __ mov(doff, dst); 9642 9643 __ sub(length, send, soff); 9644 __ bfm(length, zr, 0, 1); 9645 9646 __ lea(nosimd_codec, ExternalAddress((address) fromBase64ForNoSIMD)); 9647 __ cbz(isURL, ProcessData); 9648 __ lea(nosimd_codec, ExternalAddress((address) fromBase64URLForNoSIMD)); 9649 9650 __ BIND(ProcessData); 9651 __ mov(rscratch1, length); 9652 __ cmp(length, (u1)144); // 144 = 80 + 64 9653 __ br(Assembler::LT, Process4B); 9654 9655 // In the MIME case, the line length cannot be more than 76 9656 // bytes (see RFC 2045). This is too short a block for SIMD 9657 // to be worthwhile, so we use non-SIMD here. 
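    // In C, the length dispatch here is approximately (an illustrative sketch,
    // not the generated code; the variable names are ours):
    //
    //   size_t len = (send - soff) & ~(size_t)3;   // whole 4-character groups only
    //   int scalar_count;
    //   if (len < 144) {            // 144 = 80 (scalar prologue) + 64 (minimum SIMD block)
    //     scalar_count = (int)len;  // decode everything in the 4-byte scalar loop
    //   } else {
    //     scalar_count = 79;        // counts 79, 75, ..., 3, -1: exactly 80 bytes, and the
    //                               // final value -1 (rather than 0) tells the code after
    //                               // the scalar loop that SIMD processing still follows
    //   }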
9658 __ movw(rscratch1, 79); 9659 9660 __ BIND(Process4B); 9661 __ ldrw(r14, __ post(src, 4)); 9662 __ ubfxw(r10, r14, 0, 8); 9663 __ ubfxw(r11, r14, 8, 8); 9664 __ ubfxw(r12, r14, 16, 8); 9665 __ ubfxw(r13, r14, 24, 8); 9666 // get the de-code 9667 __ ldrb(r10, Address(nosimd_codec, r10, Address::uxtw(0))); 9668 __ ldrb(r11, Address(nosimd_codec, r11, Address::uxtw(0))); 9669 __ ldrb(r12, Address(nosimd_codec, r12, Address::uxtw(0))); 9670 __ ldrb(r13, Address(nosimd_codec, r13, Address::uxtw(0))); 9671 // error detection, 255u indicates an illegal input 9672 __ orrw(r14, r10, r11); 9673 __ orrw(r15, r12, r13); 9674 __ orrw(r14, r14, r15); 9675 __ tbnz(r14, 7, Exit); 9676 // recover the data 9677 __ lslw(r14, r10, 10); 9678 __ bfiw(r14, r11, 4, 6); 9679 __ bfmw(r14, r12, 2, 5); 9680 __ rev16w(r14, r14); 9681 __ bfiw(r13, r12, 6, 2); 9682 __ strh(r14, __ post(dst, 2)); 9683 __ strb(r13, __ post(dst, 1)); 9684 // non-simd loop 9685 __ subsw(rscratch1, rscratch1, 4); 9686 __ br(Assembler::GT, Process4B); 9687 9688 // if exiting from PreProcess80B, rscratch1 == -1; 9689 // otherwise, rscratch1 == 0. 9690 __ cbzw(rscratch1, Exit); 9691 __ sub(length, length, 80); 9692 9693 __ lea(simd_codec, ExternalAddress((address) fromBase64ForSIMD)); 9694 __ cbz(isURL, SIMDEnter); 9695 __ lea(simd_codec, ExternalAddress((address) fromBase64URLForSIMD)); 9696 9697 __ BIND(SIMDEnter); 9698 __ ld1(v0, v1, v2, v3, __ T16B, __ post(simd_codec, 64)); 9699 __ ld1(v4, v5, v6, v7, __ T16B, Address(simd_codec)); 9700 __ mov(rscratch1, 63); 9701 __ dup(v27, __ T16B, rscratch1); 9702 9703 __ BIND(Process64B); 9704 __ cmp(length, (u1)64); 9705 __ br(Assembler::LT, Process32B); 9706 generate_base64_decode_simdround(src, dst, v0, v4, 16, Exit); 9707 __ sub(length, length, 64); 9708 __ b(Process64B); 9709 9710 __ BIND(Process32B); 9711 __ cmp(length, (u1)32); 9712 __ br(Assembler::LT, SIMDExit); 9713 generate_base64_decode_simdround(src, dst, v0, v4, 8, Exit); 9714 __ sub(length, length, 32); 9715 __ b(Process32B); 9716 9717 __ BIND(SIMDExit); 9718 __ cbz(length, Exit); 9719 __ movw(rscratch1, length); 9720 __ b(Process4B); 9721 9722 __ BIND(Exit); 9723 __ sub(c_rarg0, dst, doff); 9724 9725 __ leave(); 9726 __ ret(lr); 9727 9728 return start; 9729 } 9730 9731 // Support for spin waits. 9732 address generate_spin_wait() { 9733 __ align(CodeEntryAlignment); 9734 StubGenStubId stub_id = StubGenStubId::spin_wait_id; 9735 StubCodeMark mark(this, stub_id); 9736 address start = __ pc(); 9737 9738 __ spin_wait(); 9739 __ ret(lr); 9740 9741 return start; 9742 } 9743 9744 void generate_lookup_secondary_supers_table_stub() { 9745 StubGenStubId stub_id = StubGenStubId::lookup_secondary_supers_table_id; 9746 StubCodeMark mark(this, stub_id); 9747 9748 const Register 9749 r_super_klass = r0, 9750 r_array_base = r1, 9751 r_array_length = r2, 9752 r_array_index = r3, 9753 r_sub_klass = r4, 9754 r_bitmap = rscratch2, 9755 result = r5; 9756 const FloatRegister 9757 vtemp = v0; 9758 9759 for (int slot = 0; slot < Klass::SECONDARY_SUPERS_TABLE_SIZE; slot++) { 9760 StubRoutines::_lookup_secondary_supers_table_stubs[slot] = __ pc(); 9761 Label L_success; 9762 __ enter(); 9763 __ lookup_secondary_supers_table_const(r_sub_klass, r_super_klass, 9764 r_array_base, r_array_length, r_array_index, 9765 vtemp, result, slot, 9766 /*stub_is_near*/true); 9767 __ leave(); 9768 __ ret(lr); 9769 } 9770 } 9771 9772 // Slow path implementation for UseSecondarySupersTable. 
9773 address generate_lookup_secondary_supers_table_slow_path_stub() { 9774 StubGenStubId stub_id = StubGenStubId::lookup_secondary_supers_table_slow_path_id; 9775 StubCodeMark mark(this, stub_id); 9776 9777 address start = __ pc(); 9778 const Register 9779 r_super_klass = r0, // argument 9780 r_array_base = r1, // argument 9781 temp1 = r2, // temp 9782 r_array_index = r3, // argument 9783 r_bitmap = rscratch2, // argument 9784 result = r5; // argument 9785 9786 __ lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base, r_array_index, r_bitmap, temp1, result); 9787 __ ret(lr); 9788 9789 return start; 9790 } 9791 9792 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS) 9793 9794 // ARMv8.1 LSE versions of the atomic stubs used by Atomic::PlatformXX. 9795 // 9796 // If LSE is in use, generate LSE versions of all the stubs. The 9797 // non-LSE versions are in atomic_aarch64.S. 9798 9799 // class AtomicStubMark records the entry point of a stub and the 9800 // stub pointer which will point to it. The stub pointer is set to 9801 // the entry point when ~AtomicStubMark() is called, which must be 9802 // after ICache::invalidate_range. This ensures safe publication of 9803 // the generated code. 9804 class AtomicStubMark { 9805 address _entry_point; 9806 aarch64_atomic_stub_t *_stub; 9807 MacroAssembler *_masm; 9808 public: 9809 AtomicStubMark(MacroAssembler *masm, aarch64_atomic_stub_t *stub) { 9810 _masm = masm; 9811 __ align(32); 9812 _entry_point = __ pc(); 9813 _stub = stub; 9814 } 9815 ~AtomicStubMark() { 9816 *_stub = (aarch64_atomic_stub_t)_entry_point; 9817 } 9818 }; 9819 9820 // NB: For memory_order_conservative we need a trailing membar after 9821 // LSE atomic operations but not a leading membar. 9822 // 9823 // We don't need a leading membar because a clause in the Arm ARM 9824 // says: 9825 // 9826 // Barrier-ordered-before 9827 // 9828 // Barrier instructions order prior Memory effects before subsequent 9829 // Memory effects generated by the same Observer. A read or a write 9830 // RW1 is Barrier-ordered-before a read or a write RW 2 from the same 9831 // Observer if and only if RW1 appears in program order before RW 2 9832 // and [ ... ] at least one of RW 1 and RW 2 is generated by an atomic 9833 // instruction with both Acquire and Release semantics. 9834 // 9835 // All the atomic instructions {ldaddal, swapal, casal} have Acquire 9836 // and Release semantics, therefore we don't need a leading 9837 // barrier. However, there is no corresponding Barrier-ordered-after 9838 // relationship, therefore we need a trailing membar to prevent a 9839 // later store or load from being reordered with the store in an 9840 // atomic instruction. 9841 // 9842 // This was checked by using the herd7 consistency model simulator 9843 // (http://diy.inria.fr/) with this test case: 9844 // 9845 // AArch64 LseCas 9846 // { 0:X1=x; 0:X2=y; 1:X1=x; 1:X2=y; } 9847 // P0 | P1; 9848 // LDR W4, [X2] | MOV W3, #0; 9849 // DMB LD | MOV W4, #1; 9850 // LDR W3, [X1] | CASAL W3, W4, [X1]; 9851 // | DMB ISH; 9852 // | STR W4, [X2]; 9853 // exists 9854 // (0:X3=0 /\ 0:X4=1) 9855 // 9856 // If X3 == 0 && X4 == 1, the store to y in P1 has been reordered 9857 // with the store to x in P1. Without the DMB in P1 this may happen. 9858 // 9859 // At the time of writing we don't know of any AArch64 hardware that 9860 // reorders stores in this way, but the Reference Manual permits it. 
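  // In C++ terms, the memory_order_conservative entry points generated below
  // behave approximately like this (an illustrative sketch using the GCC
  // __atomic builtins; the function name is ours, and this is not the stub
  // code itself):
  //
  //   uint64_t fetch_add_8_conservative(volatile uint64_t* p, uint64_t v) {
  //     uint64_t prev = __atomic_fetch_add(p, v, __ATOMIC_ACQ_REL);  // ldaddal: no leading barrier needed
  //     __atomic_thread_fence(__ATOMIC_SEQ_CST);                     // trailing barrier, as argued above
  //     return prev;
  //   }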
9861 9862 void gen_cas_entry(Assembler::operand_size size, 9863 atomic_memory_order order) { 9864 Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1, 9865 exchange_val = c_rarg2; 9866 bool acquire, release; 9867 switch (order) { 9868 case memory_order_relaxed: 9869 acquire = false; 9870 release = false; 9871 break; 9872 case memory_order_release: 9873 acquire = false; 9874 release = true; 9875 break; 9876 default: 9877 acquire = true; 9878 release = true; 9879 break; 9880 } 9881 __ mov(prev, compare_val); 9882 __ lse_cas(prev, exchange_val, ptr, size, acquire, release, /*not_pair*/true); 9883 if (order == memory_order_conservative) { 9884 __ membar(Assembler::StoreStore|Assembler::StoreLoad); 9885 } 9886 if (size == Assembler::xword) { 9887 __ mov(r0, prev); 9888 } else { 9889 __ movw(r0, prev); 9890 } 9891 __ ret(lr); 9892 } 9893 9894 void gen_ldadd_entry(Assembler::operand_size size, atomic_memory_order order) { 9895 Register prev = r2, addr = c_rarg0, incr = c_rarg1; 9896 // If not relaxed, then default to conservative. Relaxed is the only 9897 // case we use enough to be worth specializing. 9898 if (order == memory_order_relaxed) { 9899 __ ldadd(size, incr, prev, addr); 9900 } else { 9901 __ ldaddal(size, incr, prev, addr); 9902 __ membar(Assembler::StoreStore|Assembler::StoreLoad); 9903 } 9904 if (size == Assembler::xword) { 9905 __ mov(r0, prev); 9906 } else { 9907 __ movw(r0, prev); 9908 } 9909 __ ret(lr); 9910 } 9911 9912 void gen_swpal_entry(Assembler::operand_size size) { 9913 Register prev = r2, addr = c_rarg0, incr = c_rarg1; 9914 __ swpal(size, incr, prev, addr); 9915 __ membar(Assembler::StoreStore|Assembler::StoreLoad); 9916 if (size == Assembler::xword) { 9917 __ mov(r0, prev); 9918 } else { 9919 __ movw(r0, prev); 9920 } 9921 __ ret(lr); 9922 } 9923 9924 void generate_atomic_entry_points() { 9925 if (! 
UseLSE) { 9926 return; 9927 } 9928 __ align(CodeEntryAlignment); 9929 StubGenStubId stub_id = StubGenStubId::atomic_entry_points_id; 9930 StubCodeMark mark(this, stub_id); 9931 address first_entry = __ pc(); 9932 9933 // ADD, memory_order_conservative 9934 AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl); 9935 gen_ldadd_entry(Assembler::word, memory_order_conservative); 9936 AtomicStubMark mark_fetch_add_8(_masm, &aarch64_atomic_fetch_add_8_impl); 9937 gen_ldadd_entry(Assembler::xword, memory_order_conservative); 9938 9939 // ADD, memory_order_relaxed 9940 AtomicStubMark mark_fetch_add_4_relaxed 9941 (_masm, &aarch64_atomic_fetch_add_4_relaxed_impl); 9942 gen_ldadd_entry(MacroAssembler::word, memory_order_relaxed); 9943 AtomicStubMark mark_fetch_add_8_relaxed 9944 (_masm, &aarch64_atomic_fetch_add_8_relaxed_impl); 9945 gen_ldadd_entry(MacroAssembler::xword, memory_order_relaxed); 9946 9947 // XCHG, memory_order_conservative 9948 AtomicStubMark mark_xchg_4(_masm, &aarch64_atomic_xchg_4_impl); 9949 gen_swpal_entry(Assembler::word); 9950 AtomicStubMark mark_xchg_8_impl(_masm, &aarch64_atomic_xchg_8_impl); 9951 gen_swpal_entry(Assembler::xword); 9952 9953 // CAS, memory_order_conservative 9954 AtomicStubMark mark_cmpxchg_1(_masm, &aarch64_atomic_cmpxchg_1_impl); 9955 gen_cas_entry(MacroAssembler::byte, memory_order_conservative); 9956 AtomicStubMark mark_cmpxchg_4(_masm, &aarch64_atomic_cmpxchg_4_impl); 9957 gen_cas_entry(MacroAssembler::word, memory_order_conservative); 9958 AtomicStubMark mark_cmpxchg_8(_masm, &aarch64_atomic_cmpxchg_8_impl); 9959 gen_cas_entry(MacroAssembler::xword, memory_order_conservative); 9960 9961 // CAS, memory_order_relaxed 9962 AtomicStubMark mark_cmpxchg_1_relaxed 9963 (_masm, &aarch64_atomic_cmpxchg_1_relaxed_impl); 9964 gen_cas_entry(MacroAssembler::byte, memory_order_relaxed); 9965 AtomicStubMark mark_cmpxchg_4_relaxed 9966 (_masm, &aarch64_atomic_cmpxchg_4_relaxed_impl); 9967 gen_cas_entry(MacroAssembler::word, memory_order_relaxed); 9968 AtomicStubMark mark_cmpxchg_8_relaxed 9969 (_masm, &aarch64_atomic_cmpxchg_8_relaxed_impl); 9970 gen_cas_entry(MacroAssembler::xword, memory_order_relaxed); 9971 9972 AtomicStubMark mark_cmpxchg_4_release 9973 (_masm, &aarch64_atomic_cmpxchg_4_release_impl); 9974 gen_cas_entry(MacroAssembler::word, memory_order_release); 9975 AtomicStubMark mark_cmpxchg_8_release 9976 (_masm, &aarch64_atomic_cmpxchg_8_release_impl); 9977 gen_cas_entry(MacroAssembler::xword, memory_order_release); 9978 9979 AtomicStubMark mark_cmpxchg_4_seq_cst 9980 (_masm, &aarch64_atomic_cmpxchg_4_seq_cst_impl); 9981 gen_cas_entry(MacroAssembler::word, memory_order_seq_cst); 9982 AtomicStubMark mark_cmpxchg_8_seq_cst 9983 (_masm, &aarch64_atomic_cmpxchg_8_seq_cst_impl); 9984 gen_cas_entry(MacroAssembler::xword, memory_order_seq_cst); 9985 9986 ICache::invalidate_range(first_entry, __ pc() - first_entry); 9987 } 9988 #endif // LINUX 9989 9990 address generate_cont_thaw(Continuation::thaw_kind kind) { 9991 bool return_barrier = Continuation::is_thaw_return_barrier(kind); 9992 bool return_barrier_exception = Continuation::is_thaw_return_barrier_exception(kind); 9993 9994 address start = __ pc(); 9995 9996 if (return_barrier) { 9997 __ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())); 9998 __ mov(sp, rscratch1); 9999 } 10000 assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp"); 10001 10002 if (return_barrier) { 10003 // preserve 
possible return value from a method returning to the return barrier 10004 __ fmovd(rscratch1, v0); 10005 __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize))); 10006 } 10007 10008 __ movw(c_rarg1, (return_barrier ? 1 : 0)); 10009 __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), rthread, c_rarg1); 10010 __ mov(rscratch2, r0); // r0 contains the size of the frames to thaw, 0 if overflow or no more frames 10011 10012 if (return_barrier) { 10013 // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK) 10014 __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize))); 10015 __ fmovd(v0, rscratch1); 10016 } 10017 assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp"); 10018 10019 10020 Label thaw_success; 10021 // rscratch2 contains the size of the frames to thaw, 0 if overflow or no more frames 10022 __ cbnz(rscratch2, thaw_success); 10023 __ lea(rscratch1, RuntimeAddress(SharedRuntime::throw_StackOverflowError_entry())); 10024 __ br(rscratch1); 10025 __ bind(thaw_success); 10026 10027 // make room for the thawed frames 10028 __ sub(rscratch1, sp, rscratch2); 10029 __ andr(rscratch1, rscratch1, -16); // align 10030 __ mov(sp, rscratch1); 10031 10032 if (return_barrier) { 10033 // save original return value -- again 10034 __ fmovd(rscratch1, v0); 10035 __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize))); 10036 } 10037 10038 // If we want, we can templatize thaw by kind, and have three different entries 10039 __ movw(c_rarg1, (uint32_t)kind); 10040 10041 __ call_VM_leaf(Continuation::thaw_entry(), rthread, c_rarg1); 10042 __ mov(rscratch2, r0); // r0 is the sp of the yielding frame 10043 10044 if (return_barrier) { 10045 // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK) 10046 __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize))); 10047 __ fmovd(v0, rscratch1); 10048 } else { 10049 __ mov(r0, zr); // return 0 (success) from doYield 10050 } 10051 10052 // we're now on the yield frame (which is in an address above us b/c rsp has been pushed down) 10053 __ sub(sp, rscratch2, 2*wordSize); // now pointing to rfp spill 10054 __ mov(rfp, sp); 10055 10056 if (return_barrier_exception) { 10057 __ ldr(c_rarg1, Address(rfp, wordSize)); // return address 10058 __ authenticate_return_address(c_rarg1); 10059 __ verify_oop(r0); 10060 // save return value containing the exception oop in callee-saved R19 10061 __ mov(r19, r0); 10062 10063 __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), rthread, c_rarg1); 10064 10065 // Reinitialize the ptrue predicate register, in case the external runtime call clobbers ptrue reg, as we may return to SVE compiled code. 
10066 // __ reinitialize_ptrue(); 10067 10068 // see OptoRuntime::generate_exception_blob: r0 -- exception oop, r3 -- exception pc 10069 10070 __ mov(r1, r0); // the exception handler 10071 __ mov(r0, r19); // restore return value containing the exception oop 10072 __ verify_oop(r0); 10073 10074 __ leave(); 10075 __ mov(r3, lr); 10076 __ br(r1); // the exception handler 10077 } else { 10078 // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame 10079 __ leave(); 10080 __ ret(lr); 10081 } 10082 10083 return start; 10084 } 10085 10086 address generate_cont_thaw() { 10087 if (!Continuations::enabled()) return nullptr; 10088 10089 StubGenStubId stub_id = StubGenStubId::cont_thaw_id; 10090 StubCodeMark mark(this, stub_id); 10091 address start = __ pc(); 10092 generate_cont_thaw(Continuation::thaw_top); 10093 return start; 10094 } 10095 10096 address generate_cont_returnBarrier() { 10097 if (!Continuations::enabled()) return nullptr; 10098 10099 // TODO: will probably need multiple return barriers depending on return type 10100 StubGenStubId stub_id = StubGenStubId::cont_returnBarrier_id; 10101 StubCodeMark mark(this, stub_id); 10102 address start = __ pc(); 10103 10104 generate_cont_thaw(Continuation::thaw_return_barrier); 10105 10106 return start; 10107 } 10108 10109 address generate_cont_returnBarrier_exception() { 10110 if (!Continuations::enabled()) return nullptr; 10111 10112 StubGenStubId stub_id = StubGenStubId::cont_returnBarrierExc_id; 10113 StubCodeMark mark(this, stub_id); 10114 address start = __ pc(); 10115 10116 generate_cont_thaw(Continuation::thaw_return_barrier_exception); 10117 10118 return start; 10119 } 10120 10121 address generate_cont_preempt_stub() { 10122 if (!Continuations::enabled()) return nullptr; 10123 StubGenStubId stub_id = StubGenStubId::cont_preempt_id; 10124 StubCodeMark mark(this, stub_id); 10125 address start = __ pc(); 10126 10127 __ reset_last_Java_frame(true); 10128 10129 // Set sp to enterSpecial frame, i.e. remove all frames copied into the heap. 10130 __ ldr(rscratch2, Address(rthread, JavaThread::cont_entry_offset())); 10131 __ mov(sp, rscratch2); 10132 10133 Label preemption_cancelled; 10134 __ ldrb(rscratch1, Address(rthread, JavaThread::preemption_cancelled_offset())); 10135 __ cbnz(rscratch1, preemption_cancelled); 10136 10137 // Remove enterSpecial frame from the stack and return to Continuation.run() to unmount. 10138 SharedRuntime::continuation_enter_cleanup(_masm); 10139 __ leave(); 10140 __ ret(lr); 10141 10142 // We acquired the monitor after freezing the frames so call thaw to continue execution. 10143 __ bind(preemption_cancelled); 10144 __ strb(zr, Address(rthread, JavaThread::preemption_cancelled_offset())); 10145 __ lea(rfp, Address(sp, checked_cast<int32_t>(ContinuationEntry::size()))); 10146 __ lea(rscratch1, ExternalAddress(ContinuationEntry::thaw_call_pc_address())); 10147 __ ldr(rscratch1, Address(rscratch1)); 10148 __ br(rscratch1); 10149 10150 return start; 10151 } 10152 10153 // In sun.security.util.math.intpoly.IntegerPolynomial1305, integers 10154 // are represented as long[5], with BITS_PER_LIMB = 26. 10155 // Pack five 26-bit limbs into three 64-bit registers. 
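  // In C, pack_26 corresponds approximately to the following (an illustrative
  // sketch on plain arrays, assuming each element of 'limbs' fits in 26 bits):
  //
  //   void pack_26(uint64_t dest[3], const uint64_t limbs[5]) {
  //     dest[0] =  limbs[0]        | (limbs[1] << 26) | (limbs[2] << 52);  // low 64 bits
  //     dest[1] = (limbs[2] >> 12) | (limbs[3] << 14) | (limbs[4] << 40);  // next 64 bits
  //     dest[2] =  limbs[4] >> 24;                                         // top 2 bits
  //   }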
10156 void pack_26(Register dest0, Register dest1, Register dest2, Register src) { 10157 __ ldp(dest0, rscratch1, Address(src, 0)); // 26 bits 10158 __ add(dest0, dest0, rscratch1, Assembler::LSL, 26); // 26 bits 10159 __ ldp(rscratch1, rscratch2, Address(src, 2 * sizeof (jlong))); 10160 __ add(dest0, dest0, rscratch1, Assembler::LSL, 52); // 12 bits 10161 10162 __ add(dest1, zr, rscratch1, Assembler::LSR, 12); // 14 bits 10163 __ add(dest1, dest1, rscratch2, Assembler::LSL, 14); // 26 bits 10164 __ ldr(rscratch1, Address(src, 4 * sizeof (jlong))); 10165 __ add(dest1, dest1, rscratch1, Assembler::LSL, 40); // 24 bits 10166 10167 if (dest2->is_valid()) { 10168 __ add(dest2, zr, rscratch1, Assembler::LSR, 24); // 2 bits 10169 } else { 10170 #ifdef ASSERT 10171 Label OK; 10172 __ cmp(zr, rscratch1, Assembler::LSR, 24); // 2 bits 10173 __ br(__ EQ, OK); 10174 __ stop("high bits of Poly1305 integer should be zero"); 10175 __ should_not_reach_here(); 10176 __ bind(OK); 10177 #endif 10178 } 10179 } 10180 10181 // As above, but return only a 128-bit integer, packed into two 10182 // 64-bit registers. 10183 void pack_26(Register dest0, Register dest1, Register src) { 10184 pack_26(dest0, dest1, noreg, src); 10185 } 10186 10187 // Multiply and multiply-accumulate unsigned 64-bit registers. 10188 void wide_mul(Register prod_lo, Register prod_hi, Register n, Register m) { 10189 __ mul(prod_lo, n, m); 10190 __ umulh(prod_hi, n, m); 10191 } 10192 void wide_madd(Register sum_lo, Register sum_hi, Register n, Register m) { 10193 wide_mul(rscratch1, rscratch2, n, m); 10194 __ adds(sum_lo, sum_lo, rscratch1); 10195 __ adc(sum_hi, sum_hi, rscratch2); 10196 } 10197 10198 // Poly1305, RFC 7539 10199 10200 // See https://loup-vaillant.fr/tutorials/poly1305-design for a 10201 // description of the tricks used to simplify and accelerate this 10202 // computation. 10203 10204 address generate_poly1305_processBlocks() { 10205 __ align(CodeEntryAlignment); 10206 StubGenStubId stub_id = StubGenStubId::poly1305_processBlocks_id; 10207 StubCodeMark mark(this, stub_id); 10208 address start = __ pc(); 10209 Label here; 10210 __ enter(); 10211 RegSet callee_saved = RegSet::range(r19, r28); 10212 __ push(callee_saved, sp); 10213 10214 RegSetIterator<Register> regs = (RegSet::range(c_rarg0, r28) - r18_tls - rscratch1 - rscratch2).begin(); 10215 10216 // Arguments 10217 const Register input_start = *regs, length = *++regs, acc_start = *++regs, r_start = *++regs; 10218 10219 // R_n is the 128-bit randomly-generated key, packed into two 10220 // registers. The caller passes this key to us as long[5], with 10221 // BITS_PER_LIMB = 26. 
10222 const Register R_0 = *++regs, R_1 = *++regs; 10223 pack_26(R_0, R_1, r_start); 10224 10225 // RR_n is (R_n >> 2) * 5 10226 const Register RR_0 = *++regs, RR_1 = *++regs; 10227 __ lsr(RR_0, R_0, 2); 10228 __ add(RR_0, RR_0, RR_0, Assembler::LSL, 2); 10229 __ lsr(RR_1, R_1, 2); 10230 __ add(RR_1, RR_1, RR_1, Assembler::LSL, 2); 10231 10232 // U_n is the current checksum 10233 const Register U_0 = *++regs, U_1 = *++regs, U_2 = *++regs; 10234 pack_26(U_0, U_1, U_2, acc_start); 10235 10236 static constexpr int BLOCK_LENGTH = 16; 10237 Label DONE, LOOP; 10238 10239 __ cmp(length, checked_cast<u1>(BLOCK_LENGTH)); 10240 __ br(Assembler::LT, DONE); { 10241 __ bind(LOOP); 10242 10243 // S_n is to be the sum of U_n and the next block of data 10244 const Register S_0 = *++regs, S_1 = *++regs, S_2 = *++regs; 10245 __ ldp(S_0, S_1, __ post(input_start, 2 * wordSize)); 10246 __ adds(S_0, U_0, S_0); 10247 __ adcs(S_1, U_1, S_1); 10248 __ adc(S_2, U_2, zr); 10249 __ add(S_2, S_2, 1); 10250 10251 const Register U_0HI = *++regs, U_1HI = *++regs; 10252 10253 // NB: this logic depends on some of the special properties of 10254 // Poly1305 keys. In particular, because we know that the top 10255 // four bits of R_0 and R_1 are zero, we can add together 10256 // partial products without any risk of needing to propagate a 10257 // carry out. 10258 wide_mul(U_0, U_0HI, S_0, R_0); wide_madd(U_0, U_0HI, S_1, RR_1); wide_madd(U_0, U_0HI, S_2, RR_0); 10259 wide_mul(U_1, U_1HI, S_0, R_1); wide_madd(U_1, U_1HI, S_1, R_0); wide_madd(U_1, U_1HI, S_2, RR_1); 10260 __ andr(U_2, R_0, 3); 10261 __ mul(U_2, S_2, U_2); 10262 10263 // Recycle registers S_0, S_1, S_2 10264 regs = (regs.remaining() + S_0 + S_1 + S_2).begin(); 10265 10266 // Partial reduction mod 2**130 - 5 10267 __ adds(U_1, U_0HI, U_1); 10268 __ adc(U_2, U_1HI, U_2); 10269 // Sum now in U_2:U_1:U_0. 10270 // Dead: U_0HI, U_1HI. 10271 regs = (regs.remaining() + U_0HI + U_1HI).begin(); 10272 10273 // U_2:U_1:U_0 += (U_2 >> 2) * 5 in two steps 10274 10275 // First, U_2:U_1:U_0 += (U_2 >> 2) 10276 __ lsr(rscratch1, U_2, 2); 10277 __ andr(U_2, U_2, (u8)3); 10278 __ adds(U_0, U_0, rscratch1); 10279 __ adcs(U_1, U_1, zr); 10280 __ adc(U_2, U_2, zr); 10281 // Second, U_2:U_1:U_0 += (U_2 >> 2) << 2 10282 __ adds(U_0, U_0, rscratch1, Assembler::LSL, 2); 10283 __ adcs(U_1, U_1, zr); 10284 __ adc(U_2, U_2, zr); 10285 10286 __ sub(length, length, checked_cast<u1>(BLOCK_LENGTH)); 10287 __ cmp(length, checked_cast<u1>(BLOCK_LENGTH)); 10288 __ br(~ Assembler::LT, LOOP); 10289 } 10290 10291 // Further reduce modulo 2^130 - 5 10292 __ lsr(rscratch1, U_2, 2); 10293 __ add(rscratch1, rscratch1, rscratch1, Assembler::LSL, 2); // rscratch1 = U_2 * 5 10294 __ adds(U_0, U_0, rscratch1); // U_0 += U_2 * 5 10295 __ adcs(U_1, U_1, zr); 10296 __ andr(U_2, U_2, (u1)3); 10297 __ adc(U_2, U_2, zr); 10298 10299 // Unpack the sum into five 26-bit limbs and write to memory. 
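    // In C, this unpacking is approximately the inverse of the pack_26 sketch
    // above (illustrative only; the name unpack_26 is ours):
    //
    //   void unpack_26(uint64_t limbs[5], uint64_t u0, uint64_t u1, uint64_t u2) {
    //     limbs[0] =  u0        & 0x3ffffff;
    //     limbs[1] = (u0 >> 26) & 0x3ffffff;
    //     limbs[2] = (u0 >> 52) | ((u1 & 0x3fff) << 12);
    //     limbs[3] = (u1 >> 14) & 0x3ffffff;
    //     limbs[4] = (u1 >> 40) | ((u2 & 0x7)  << 24);
    //   }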
10300 __ ubfiz(rscratch1, U_0, 0, 26); 10301 __ ubfx(rscratch2, U_0, 26, 26); 10302 __ stp(rscratch1, rscratch2, Address(acc_start)); 10303 __ ubfx(rscratch1, U_0, 52, 12); 10304 __ bfi(rscratch1, U_1, 12, 14); 10305 __ ubfx(rscratch2, U_1, 14, 26); 10306 __ stp(rscratch1, rscratch2, Address(acc_start, 2 * sizeof (jlong))); 10307 __ ubfx(rscratch1, U_1, 40, 24); 10308 __ bfi(rscratch1, U_2, 24, 3); 10309 __ str(rscratch1, Address(acc_start, 4 * sizeof (jlong))); 10310 10311 __ bind(DONE); 10312 __ pop(callee_saved, sp); 10313 __ leave(); 10314 __ ret(lr); 10315 10316 return start; 10317 } 10318 10319 // exception handler for upcall stubs 10320 address generate_upcall_stub_exception_handler() { 10321 StubGenStubId stub_id = StubGenStubId::upcall_stub_exception_handler_id; 10322 StubCodeMark mark(this, stub_id); 10323 address start = __ pc(); 10324 10325 // Native caller has no idea how to handle exceptions, 10326 // so we just crash here. Up to callee to catch exceptions. 10327 __ verify_oop(r0); 10328 __ movptr(rscratch1, CAST_FROM_FN_PTR(uint64_t, UpcallLinker::handle_uncaught_exception)); 10329 __ blr(rscratch1); 10330 __ should_not_reach_here(); 10331 10332 return start; 10333 } 10334 10335 // load Method* target of MethodHandle 10336 // j_rarg0 = jobject receiver 10337 // rmethod = result 10338 address generate_upcall_stub_load_target() { 10339 StubGenStubId stub_id = StubGenStubId::upcall_stub_load_target_id; 10340 StubCodeMark mark(this, stub_id); 10341 address start = __ pc(); 10342 10343 __ resolve_global_jobject(j_rarg0, rscratch1, rscratch2); 10344 // Load target method from receiver 10345 __ load_heap_oop(rmethod, Address(j_rarg0, java_lang_invoke_MethodHandle::form_offset()), rscratch1, rscratch2); 10346 __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_LambdaForm::vmentry_offset()), rscratch1, rscratch2); 10347 __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_MemberName::method_offset()), rscratch1, rscratch2); 10348 __ access_load_at(T_ADDRESS, IN_HEAP, rmethod, 10349 Address(rmethod, java_lang_invoke_ResolvedMethodName::vmtarget_offset()), 10350 noreg, noreg); 10351 __ str(rmethod, Address(rthread, JavaThread::callee_target_offset())); // just in case callee is deoptimized 10352 10353 __ ret(lr); 10354 10355 return start; 10356 } 10357 10358 #undef __ 10359 #define __ masm-> 10360 10361 class MontgomeryMultiplyGenerator : public MacroAssembler { 10362 10363 Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn, 10364 Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj; 10365 10366 RegSet _toSave; 10367 bool _squaring; 10368 10369 public: 10370 MontgomeryMultiplyGenerator (Assembler *as, bool squaring) 10371 : MacroAssembler(as->code()), _squaring(squaring) { 10372 10373 // Register allocation 10374 10375 RegSetIterator<Register> regs = (RegSet::range(r0, r26) - r18_tls).begin(); 10376 Pa_base = *regs; // Argument registers 10377 if (squaring) 10378 Pb_base = Pa_base; 10379 else 10380 Pb_base = *++regs; 10381 Pn_base = *++regs; 10382 Rlen= *++regs; 10383 inv = *++regs; 10384 Pm_base = *++regs; 10385 10386 // Working registers: 10387 Ra = *++regs; // The current digit of a, b, n, and m. 10388 Rb = *++regs; 10389 Rm = *++regs; 10390 Rn = *++regs; 10391 10392 Pa = *++regs; // Pointers to the current/next digit of a, b, n, and m. 10393 Pb = *++regs; 10394 Pm = *++regs; 10395 Pn = *++regs; 10396 10397 t0 = *++regs; // Three registers which form a 10398 t1 = *++regs; // triple-precision accumuator. 
10399 t2 = *++regs; 10400 10401 Ri = *++regs; // Inner and outer loop indexes. 10402 Rj = *++regs; 10403 10404 Rhi_ab = *++regs; // Product registers: low and high parts 10405 Rlo_ab = *++regs; // of a*b and m*n. 10406 Rhi_mn = *++regs; 10407 Rlo_mn = *++regs; 10408 10409 // r19 and up are callee-saved. 10410 _toSave = RegSet::range(r19, *regs) + Pm_base; 10411 } 10412 10413 private: 10414 void save_regs() { 10415 push(_toSave, sp); 10416 } 10417 10418 void restore_regs() { 10419 pop(_toSave, sp); 10420 } 10421 10422 template <typename T> 10423 void unroll_2(Register count, T block) { 10424 Label loop, end, odd; 10425 tbnz(count, 0, odd); 10426 cbz(count, end); 10427 align(16); 10428 bind(loop); 10429 (this->*block)(); 10430 bind(odd); 10431 (this->*block)(); 10432 subs(count, count, 2); 10433 br(Assembler::GT, loop); 10434 bind(end); 10435 } 10436 10437 template <typename T> 10438 void unroll_2(Register count, T block, Register d, Register s, Register tmp) { 10439 Label loop, end, odd; 10440 tbnz(count, 0, odd); 10441 cbz(count, end); 10442 align(16); 10443 bind(loop); 10444 (this->*block)(d, s, tmp); 10445 bind(odd); 10446 (this->*block)(d, s, tmp); 10447 subs(count, count, 2); 10448 br(Assembler::GT, loop); 10449 bind(end); 10450 } 10451 10452 void pre1(RegisterOrConstant i) { 10453 block_comment("pre1"); 10454 // Pa = Pa_base; 10455 // Pb = Pb_base + i; 10456 // Pm = Pm_base; 10457 // Pn = Pn_base + i; 10458 // Ra = *Pa; 10459 // Rb = *Pb; 10460 // Rm = *Pm; 10461 // Rn = *Pn; 10462 ldr(Ra, Address(Pa_base)); 10463 ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); 10464 ldr(Rm, Address(Pm_base)); 10465 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 10466 lea(Pa, Address(Pa_base)); 10467 lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); 10468 lea(Pm, Address(Pm_base)); 10469 lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 10470 10471 // Zero the m*n result. 10472 mov(Rhi_mn, zr); 10473 mov(Rlo_mn, zr); 10474 } 10475 10476 // The core multiply-accumulate step of a Montgomery 10477 // multiplication. The idea is to schedule operations as a 10478 // pipeline so that instructions with long latencies (loads and 10479 // multiplies) have time to complete before their results are 10480 // used. This most benefits in-order implementations of the 10481 // architecture but out-of-order ones also benefit. 10482 void step() { 10483 block_comment("step"); 10484 // MACC(Ra, Rb, t0, t1, t2); 10485 // Ra = *++Pa; 10486 // Rb = *--Pb; 10487 umulh(Rhi_ab, Ra, Rb); 10488 mul(Rlo_ab, Ra, Rb); 10489 ldr(Ra, pre(Pa, wordSize)); 10490 ldr(Rb, pre(Pb, -wordSize)); 10491 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the 10492 // previous iteration. 
10493 // MACC(Rm, Rn, t0, t1, t2); 10494 // Rm = *++Pm; 10495 // Rn = *--Pn; 10496 umulh(Rhi_mn, Rm, Rn); 10497 mul(Rlo_mn, Rm, Rn); 10498 ldr(Rm, pre(Pm, wordSize)); 10499 ldr(Rn, pre(Pn, -wordSize)); 10500 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 10501 } 10502 10503 void post1() { 10504 block_comment("post1"); 10505 10506 // MACC(Ra, Rb, t0, t1, t2); 10507 // Ra = *++Pa; 10508 // Rb = *--Pb; 10509 umulh(Rhi_ab, Ra, Rb); 10510 mul(Rlo_ab, Ra, Rb); 10511 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 10512 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 10513 10514 // *Pm = Rm = t0 * inv; 10515 mul(Rm, t0, inv); 10516 str(Rm, Address(Pm)); 10517 10518 // MACC(Rm, Rn, t0, t1, t2); 10519 // t0 = t1; t1 = t2; t2 = 0; 10520 umulh(Rhi_mn, Rm, Rn); 10521 10522 #ifndef PRODUCT 10523 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 10524 { 10525 mul(Rlo_mn, Rm, Rn); 10526 add(Rlo_mn, t0, Rlo_mn); 10527 Label ok; 10528 cbz(Rlo_mn, ok); { 10529 stop("broken Montgomery multiply"); 10530 } bind(ok); 10531 } 10532 #endif 10533 // We have very carefully set things up so that 10534 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 10535 // the lower half of Rm * Rn because we know the result already: 10536 // it must be -t0. t0 + (-t0) must generate a carry iff 10537 // t0 != 0. So, rather than do a mul and an adds we just set 10538 // the carry flag iff t0 is nonzero. 10539 // 10540 // mul(Rlo_mn, Rm, Rn); 10541 // adds(zr, t0, Rlo_mn); 10542 subs(zr, t0, 1); // Set carry iff t0 is nonzero 10543 adcs(t0, t1, Rhi_mn); 10544 adc(t1, t2, zr); 10545 mov(t2, zr); 10546 } 10547 10548 void pre2(RegisterOrConstant i, RegisterOrConstant len) { 10549 block_comment("pre2"); 10550 // Pa = Pa_base + i-len; 10551 // Pb = Pb_base + len; 10552 // Pm = Pm_base + i-len; 10553 // Pn = Pn_base + len; 10554 10555 if (i.is_register()) { 10556 sub(Rj, i.as_register(), len); 10557 } else { 10558 mov(Rj, i.as_constant()); 10559 sub(Rj, Rj, len); 10560 } 10561 // Rj == i-len 10562 10563 lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord))); 10564 lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord))); 10565 lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 10566 lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord))); 10567 10568 // Ra = *++Pa; 10569 // Rb = *--Pb; 10570 // Rm = *++Pm; 10571 // Rn = *--Pn; 10572 ldr(Ra, pre(Pa, wordSize)); 10573 ldr(Rb, pre(Pb, -wordSize)); 10574 ldr(Rm, pre(Pm, wordSize)); 10575 ldr(Rn, pre(Pn, -wordSize)); 10576 10577 mov(Rhi_mn, zr); 10578 mov(Rlo_mn, zr); 10579 } 10580 10581 void post2(RegisterOrConstant i, RegisterOrConstant len) { 10582 block_comment("post2"); 10583 if (i.is_constant()) { 10584 mov(Rj, i.as_constant()-len.as_constant()); 10585 } else { 10586 sub(Rj, i.as_register(), len); 10587 } 10588 10589 adds(t0, t0, Rlo_mn); // The pending m*n, low part 10590 10591 // As soon as we know the least significant digit of our result, 10592 // store it. 10593 // Pm_base[i-len] = t0; 10594 str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 10595 10596 // t0 = t1; t1 = t2; t2 = 0; 10597 adcs(t0, t1, Rhi_mn); // The pending m*n, high part 10598 adc(t1, t2, zr); 10599 mov(t2, zr); 10600 } 10601 10602 // A carry in t0 after Montgomery multiplication means that we 10603 // should subtract multiples of n from our result in m. We'll 10604 // keep doing that until there is no carry. 
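  // The 'sub' referenced in the pseudo-code below is, in C, approximately
  // (an illustrative sketch; the helper name comes from the commented-out
  // reference code further down, but this body is ours):
  //
  //   // Subtract n from m (len 64-bit words each, least significant first)
  //   // and fold the resulting borrow into the pending carry t0.
  //   julong sub(julong m[], julong n[], julong t0, int len) {
  //     unsigned __int128 borrow = 0;
  //     for (int i = 0; i < len; i++) {
  //       unsigned __int128 d = (unsigned __int128)m[i] - n[i] - borrow;
  //       m[i] = (julong)d;
  //       borrow = (julong)(d >> 64) & 1;   // 1 iff the word subtraction wrapped
  //     }
  //     return t0 - (julong)borrow;
  //   }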
10605 void normalize(RegisterOrConstant len) { 10606 block_comment("normalize"); 10607 // while (t0) 10608 // t0 = sub(Pm_base, Pn_base, t0, len); 10609 Label loop, post, again; 10610 Register cnt = t1, i = t2; // Re-use registers; we're done with them now 10611 cbz(t0, post); { 10612 bind(again); { 10613 mov(i, zr); 10614 mov(cnt, len); 10615 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 10616 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 10617 subs(zr, zr, zr); // set carry flag, i.e. no borrow 10618 align(16); 10619 bind(loop); { 10620 sbcs(Rm, Rm, Rn); 10621 str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 10622 add(i, i, 1); 10623 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 10624 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 10625 sub(cnt, cnt, 1); 10626 } cbnz(cnt, loop); 10627 sbc(t0, t0, zr); 10628 } cbnz(t0, again); 10629 } bind(post); 10630 } 10631 10632 // Move memory at s to d, reversing words. 10633 // Increments d to end of copied memory 10634 // Destroys tmp1, tmp2 10635 // Preserves len 10636 // Leaves s pointing to the address which was in d at start 10637 void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) { 10638 assert(tmp1->encoding() < r19->encoding(), "register corruption"); 10639 assert(tmp2->encoding() < r19->encoding(), "register corruption"); 10640 10641 lea(s, Address(s, len, Address::uxtw(LogBytesPerWord))); 10642 mov(tmp1, len); 10643 unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2); 10644 sub(s, d, len, ext::uxtw, LogBytesPerWord); 10645 } 10646 // where 10647 void reverse1(Register d, Register s, Register tmp) { 10648 ldr(tmp, pre(s, -wordSize)); 10649 ror(tmp, tmp, 32); 10650 str(tmp, post(d, wordSize)); 10651 } 10652 10653 void step_squaring() { 10654 // An extra ACC 10655 step(); 10656 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 10657 } 10658 10659 void last_squaring(RegisterOrConstant i) { 10660 Label dont; 10661 // if ((i & 1) == 0) { 10662 tbnz(i.as_register(), 0, dont); { 10663 // MACC(Ra, Rb, t0, t1, t2); 10664 // Ra = *++Pa; 10665 // Rb = *--Pb; 10666 umulh(Rhi_ab, Ra, Rb); 10667 mul(Rlo_ab, Ra, Rb); 10668 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 10669 } bind(dont); 10670 } 10671 10672 void extra_step_squaring() { 10673 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 10674 10675 // MACC(Rm, Rn, t0, t1, t2); 10676 // Rm = *++Pm; 10677 // Rn = *--Pn; 10678 umulh(Rhi_mn, Rm, Rn); 10679 mul(Rlo_mn, Rm, Rn); 10680 ldr(Rm, pre(Pm, wordSize)); 10681 ldr(Rn, pre(Pn, -wordSize)); 10682 } 10683 10684 void post1_squaring() { 10685 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 10686 10687 // *Pm = Rm = t0 * inv; 10688 mul(Rm, t0, inv); 10689 str(Rm, Address(Pm)); 10690 10691 // MACC(Rm, Rn, t0, t1, t2); 10692 // t0 = t1; t1 = t2; t2 = 0; 10693 umulh(Rhi_mn, Rm, Rn); 10694 10695 #ifndef PRODUCT 10696 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 10697 { 10698 mul(Rlo_mn, Rm, Rn); 10699 add(Rlo_mn, t0, Rlo_mn); 10700 Label ok; 10701 cbz(Rlo_mn, ok); { 10702 stop("broken Montgomery multiply"); 10703 } bind(ok); 10704 } 10705 #endif 10706 // We have very carefully set things up so that 10707 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 10708 // the lower half of Rm * Rn because we know the result already: 10709 // it must be -t0. t0 + (-t0) must generate a carry iff 10710 // t0 != 0. So, rather than do a mul and an adds we just set 10711 // the carry flag iff t0 is nonzero. 
10712 // 10713 // mul(Rlo_mn, Rm, Rn); 10714 // adds(zr, t0, Rlo_mn); 10715 subs(zr, t0, 1); // Set carry iff t0 is nonzero 10716 adcs(t0, t1, Rhi_mn); 10717 adc(t1, t2, zr); 10718 mov(t2, zr); 10719 } 10720 10721 void acc(Register Rhi, Register Rlo, 10722 Register t0, Register t1, Register t2) { 10723 adds(t0, t0, Rlo); 10724 adcs(t1, t1, Rhi); 10725 adc(t2, t2, zr); 10726 } 10727 10728 public: 10729 /** 10730 * Fast Montgomery multiplication. The derivation of the 10731 * algorithm is in A Cryptographic Library for the Motorola 10732 * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237. 10733 * 10734 * Arguments: 10735 * 10736 * Inputs for multiplication: 10737 * c_rarg0 - int array elements a 10738 * c_rarg1 - int array elements b 10739 * c_rarg2 - int array elements n (the modulus) 10740 * c_rarg3 - int length 10741 * c_rarg4 - int inv 10742 * c_rarg5 - int array elements m (the result) 10743 * 10744 * Inputs for squaring: 10745 * c_rarg0 - int array elements a 10746 * c_rarg1 - int array elements n (the modulus) 10747 * c_rarg2 - int length 10748 * c_rarg3 - int inv 10749 * c_rarg4 - int array elements m (the result) 10750 * 10751 */ 10752 address generate_multiply() { 10753 Label argh, nothing; 10754 bind(argh); 10755 stop("MontgomeryMultiply total_allocation must be <= 8192"); 10756 10757 align(CodeEntryAlignment); 10758 address entry = pc(); 10759 10760 cbzw(Rlen, nothing); 10761 10762 enter(); 10763 10764 // Make room. 10765 cmpw(Rlen, 512); 10766 br(Assembler::HI, argh); 10767 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 10768 andr(sp, Ra, -2 * wordSize); 10769 10770 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 10771 10772 { 10773 // Copy input args, reversing as we go. We use Ra as a 10774 // temporary variable. 10775 reverse(Ra, Pa_base, Rlen, t0, t1); 10776 if (!_squaring) 10777 reverse(Ra, Pb_base, Rlen, t0, t1); 10778 reverse(Ra, Pn_base, Rlen, t0, t1); 10779 } 10780 10781 // Push all call-saved registers and also Pm_base which we'll need 10782 // at the end. 
10783 save_regs(); 10784 10785 #ifndef PRODUCT 10786 // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply"); 10787 { 10788 ldr(Rn, Address(Pn_base, 0)); 10789 mul(Rlo_mn, Rn, inv); 10790 subs(zr, Rlo_mn, -1); 10791 Label ok; 10792 br(EQ, ok); { 10793 stop("broken inverse in Montgomery multiply"); 10794 } bind(ok); 10795 } 10796 #endif 10797 10798 mov(Pm_base, Ra); 10799 10800 mov(t0, zr); 10801 mov(t1, zr); 10802 mov(t2, zr); 10803 10804 block_comment("for (int i = 0; i < len; i++) {"); 10805 mov(Ri, zr); { 10806 Label loop, end; 10807 cmpw(Ri, Rlen); 10808 br(Assembler::GE, end); 10809 10810 bind(loop); 10811 pre1(Ri); 10812 10813 block_comment(" for (j = i; j; j--) {"); { 10814 movw(Rj, Ri); 10815 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 10816 } block_comment(" } // j"); 10817 10818 post1(); 10819 addw(Ri, Ri, 1); 10820 cmpw(Ri, Rlen); 10821 br(Assembler::LT, loop); 10822 bind(end); 10823 block_comment("} // i"); 10824 } 10825 10826 block_comment("for (int i = len; i < 2*len; i++) {"); 10827 mov(Ri, Rlen); { 10828 Label loop, end; 10829 cmpw(Ri, Rlen, Assembler::LSL, 1); 10830 br(Assembler::GE, end); 10831 10832 bind(loop); 10833 pre2(Ri, Rlen); 10834 10835 block_comment(" for (j = len*2-i-1; j; j--) {"); { 10836 lslw(Rj, Rlen, 1); 10837 subw(Rj, Rj, Ri); 10838 subw(Rj, Rj, 1); 10839 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 10840 } block_comment(" } // j"); 10841 10842 post2(Ri, Rlen); 10843 addw(Ri, Ri, 1); 10844 cmpw(Ri, Rlen, Assembler::LSL, 1); 10845 br(Assembler::LT, loop); 10846 bind(end); 10847 } 10848 block_comment("} // i"); 10849 10850 normalize(Rlen); 10851 10852 mov(Ra, Pm_base); // Save Pm_base in Ra 10853 restore_regs(); // Restore caller's Pm_base 10854 10855 // Copy our result into caller's Pm_base 10856 reverse(Pm_base, Ra, Rlen, t0, t1); 10857 10858 leave(); 10859 bind(nothing); 10860 ret(lr); 10861 10862 return entry; 10863 } 10864 // In C, approximately: 10865 10866 // void 10867 // montgomery_multiply(julong Pa_base[], julong Pb_base[], 10868 // julong Pn_base[], julong Pm_base[], 10869 // julong inv, int len) { 10870 // julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 10871 // julong *Pa, *Pb, *Pn, *Pm; 10872 // julong Ra, Rb, Rn, Rm; 10873 10874 // int i; 10875 10876 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply"); 10877 10878 // for (i = 0; i < len; i++) { 10879 // int j; 10880 10881 // Pa = Pa_base; 10882 // Pb = Pb_base + i; 10883 // Pm = Pm_base; 10884 // Pn = Pn_base + i; 10885 10886 // Ra = *Pa; 10887 // Rb = *Pb; 10888 // Rm = *Pm; 10889 // Rn = *Pn; 10890 10891 // int iters = i; 10892 // for (j = 0; iters--; j++) { 10893 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 10894 // MACC(Ra, Rb, t0, t1, t2); 10895 // Ra = *++Pa; 10896 // Rb = *--Pb; 10897 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 10898 // MACC(Rm, Rn, t0, t1, t2); 10899 // Rm = *++Pm; 10900 // Rn = *--Pn; 10901 // } 10902 10903 // assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be"); 10904 // MACC(Ra, Rb, t0, t1, t2); 10905 // *Pm = Rm = t0 * inv; 10906 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be"); 10907 // MACC(Rm, Rn, t0, t1, t2); 10908 10909 // assert(t0 == 0, "broken Montgomery multiply"); 10910 10911 // t0 = t1; t1 = t2; t2 = 0; 10912 // } 10913 10914 // for (i = len; i < 2*len; i++) { 10915 // int j; 10916 10917 // Pa = Pa_base + i-len; 10918 // Pb = Pb_base + len; 10919 // Pm = Pm_base + i-len; 10920 // Pn = Pn_base + len; 10921 10922 // Ra = *++Pa; 10923 // Rb = 
*--Pb; 10924 // Rm = *++Pm; 10925 // Rn = *--Pn; 10926 10927 // int iters = len*2-i-1; 10928 // for (j = i-len+1; iters--; j++) { 10929 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 10930 // MACC(Ra, Rb, t0, t1, t2); 10931 // Ra = *++Pa; 10932 // Rb = *--Pb; 10933 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 10934 // MACC(Rm, Rn, t0, t1, t2); 10935 // Rm = *++Pm; 10936 // Rn = *--Pn; 10937 // } 10938 10939 // Pm_base[i-len] = t0; 10940 // t0 = t1; t1 = t2; t2 = 0; 10941 // } 10942 10943 // while (t0) 10944 // t0 = sub(Pm_base, Pn_base, t0, len); 10945 // } 10946 10947 /** 10948 * Fast Montgomery squaring. This uses asymptotically 25% fewer 10949 * multiplies than Montgomery multiplication so it should be up to 10950 * 25% faster. However, its loop control is more complex and it 10951 * may actually run slower on some machines. 10952 * 10953 * Arguments: 10954 * 10955 * Inputs: 10956 * c_rarg0 - int array elements a 10957 * c_rarg1 - int array elements n (the modulus) 10958 * c_rarg2 - int length 10959 * c_rarg3 - int inv 10960 * c_rarg4 - int array elements m (the result) 10961 * 10962 */ 10963 address generate_square() { 10964 Label argh; 10965 bind(argh); 10966 stop("MontgomeryMultiply total_allocation must be <= 8192"); 10967 10968 align(CodeEntryAlignment); 10969 address entry = pc(); 10970 10971 enter(); 10972 10973 // Make room. 10974 cmpw(Rlen, 512); 10975 br(Assembler::HI, argh); 10976 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 10977 andr(sp, Ra, -2 * wordSize); 10978 10979 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 10980 10981 { 10982 // Copy input args, reversing as we go. We use Ra as a 10983 // temporary variable. 10984 reverse(Ra, Pa_base, Rlen, t0, t1); 10985 reverse(Ra, Pn_base, Rlen, t0, t1); 10986 } 10987 10988 // Push all call-saved registers and also Pm_base which we'll need 10989 // at the end. 
10990 save_regs(); 10991 10992 mov(Pm_base, Ra); 10993 10994 mov(t0, zr); 10995 mov(t1, zr); 10996 mov(t2, zr); 10997 10998 block_comment("for (int i = 0; i < len; i++) {"); 10999 mov(Ri, zr); { 11000 Label loop, end; 11001 bind(loop); 11002 cmp(Ri, Rlen); 11003 br(Assembler::GE, end); 11004 11005 pre1(Ri); 11006 11007 block_comment("for (j = (i+1)/2; j; j--) {"); { 11008 add(Rj, Ri, 1); 11009 lsr(Rj, Rj, 1); 11010 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 11011 } block_comment(" } // j"); 11012 11013 last_squaring(Ri); 11014 11015 block_comment(" for (j = i/2; j; j--) {"); { 11016 lsr(Rj, Ri, 1); 11017 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 11018 } block_comment(" } // j"); 11019 11020 post1_squaring(); 11021 add(Ri, Ri, 1); 11022 cmp(Ri, Rlen); 11023 br(Assembler::LT, loop); 11024 11025 bind(end); 11026 block_comment("} // i"); 11027 } 11028 11029 block_comment("for (int i = len; i < 2*len; i++) {"); 11030 mov(Ri, Rlen); { 11031 Label loop, end; 11032 bind(loop); 11033 cmp(Ri, Rlen, Assembler::LSL, 1); 11034 br(Assembler::GE, end); 11035 11036 pre2(Ri, Rlen); 11037 11038 block_comment(" for (j = (2*len-i-1)/2; j; j--) {"); { 11039 lsl(Rj, Rlen, 1); 11040 sub(Rj, Rj, Ri); 11041 sub(Rj, Rj, 1); 11042 lsr(Rj, Rj, 1); 11043 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 11044 } block_comment(" } // j"); 11045 11046 last_squaring(Ri); 11047 11048 block_comment(" for (j = (2*len-i)/2; j; j--) {"); { 11049 lsl(Rj, Rlen, 1); 11050 sub(Rj, Rj, Ri); 11051 lsr(Rj, Rj, 1); 11052 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 11053 } block_comment(" } // j"); 11054 11055 post2(Ri, Rlen); 11056 add(Ri, Ri, 1); 11057 cmp(Ri, Rlen, Assembler::LSL, 1); 11058 11059 br(Assembler::LT, loop); 11060 bind(end); 11061 block_comment("} // i"); 11062 } 11063 11064 normalize(Rlen); 11065 11066 mov(Ra, Pm_base); // Save Pm_base in Ra 11067 restore_regs(); // Restore caller's Pm_base 11068 11069 // Copy our result into caller's Pm_base 11070 reverse(Pm_base, Ra, Rlen, t0, t1); 11071 11072 leave(); 11073 ret(lr); 11074 11075 return entry; 11076 } 11077 // In C, approximately: 11078 11079 // void 11080 // montgomery_square(julong Pa_base[], julong Pn_base[], 11081 // julong Pm_base[], julong inv, int len) { 11082 // julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 11083 // julong *Pa, *Pb, *Pn, *Pm; 11084 // julong Ra, Rb, Rn, Rm; 11085 11086 // int i; 11087 11088 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply"); 11089 11090 // for (i = 0; i < len; i++) { 11091 // int j; 11092 11093 // Pa = Pa_base; 11094 // Pb = Pa_base + i; 11095 // Pm = Pm_base; 11096 // Pn = Pn_base + i; 11097 11098 // Ra = *Pa; 11099 // Rb = *Pb; 11100 // Rm = *Pm; 11101 // Rn = *Pn; 11102 11103 // int iters = (i+1)/2; 11104 // for (j = 0; iters--; j++) { 11105 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be"); 11106 // MACC2(Ra, Rb, t0, t1, t2); 11107 // Ra = *++Pa; 11108 // Rb = *--Pb; 11109 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 11110 // MACC(Rm, Rn, t0, t1, t2); 11111 // Rm = *++Pm; 11112 // Rn = *--Pn; 11113 // } 11114 // if ((i & 1) == 0) { 11115 // assert(Ra == Pa_base[j], "must be"); 11116 // MACC(Ra, Ra, t0, t1, t2); 11117 // } 11118 // iters = i/2; 11119 // assert(iters == i-j, "must be"); 11120 // for (; iters--; j++) { 11121 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 11122 // MACC(Rm, Rn, t0, t1, t2); 11123 // Rm = *++Pm; 11124 // Rn = *--Pn; 11125 // } 11126 11127 // 
*Pm = Rm = t0 * inv; 11128 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be"); 11129 // MACC(Rm, Rn, t0, t1, t2); 11130 11131 // assert(t0 == 0, "broken Montgomery multiply"); 11132 11133 // t0 = t1; t1 = t2; t2 = 0; 11134 // } 11135 11136 // for (i = len; i < 2*len; i++) { 11137 // int start = i-len+1; 11138 // int end = start + (len - start)/2; 11139 // int j; 11140 11141 // Pa = Pa_base + i-len; 11142 // Pb = Pa_base + len; 11143 // Pm = Pm_base + i-len; 11144 // Pn = Pn_base + len; 11145 11146 // Ra = *++Pa; 11147 // Rb = *--Pb; 11148 // Rm = *++Pm; 11149 // Rn = *--Pn; 11150 11151 // int iters = (2*len-i-1)/2; 11152 // assert(iters == end-start, "must be"); 11153 // for (j = start; iters--; j++) { 11154 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be"); 11155 // MACC2(Ra, Rb, t0, t1, t2); 11156 // Ra = *++Pa; 11157 // Rb = *--Pb; 11158 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 11159 // MACC(Rm, Rn, t0, t1, t2); 11160 // Rm = *++Pm; 11161 // Rn = *--Pn; 11162 // } 11163 // if ((i & 1) == 0) { 11164 // assert(Ra == Pa_base[j], "must be"); 11165 // MACC(Ra, Ra, t0, t1, t2); 11166 // } 11167 // iters = (2*len-i)/2; 11168 // assert(iters == len-j, "must be"); 11169 // for (; iters--; j++) { 11170 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 11171 // MACC(Rm, Rn, t0, t1, t2); 11172 // Rm = *++Pm; 11173 // Rn = *--Pn; 11174 // } 11175 // Pm_base[i-len] = t0; 11176 // t0 = t1; t1 = t2; t2 = 0; 11177 // } 11178 11179 // while (t0) 11180 // t0 = sub(Pm_base, Pn_base, t0, len); 11181 // } 11182 }; 11183 11184 // Initialization 11185 void generate_initial_stubs() { 11186 // Generate initial stubs and initializes the entry points 11187 11188 // entry points that exist in all platforms Note: This is code 11189 // that could be shared among different platforms - however the 11190 // benefit seems to be smaller than the disadvantage of having a 11191 // much more complicated generator structure. See also comment in 11192 // stubRoutines.hpp. 11193 11194 StubRoutines::_forward_exception_entry = generate_forward_exception(); 11195 11196 StubRoutines::_call_stub_entry = 11197 generate_call_stub(StubRoutines::_call_stub_return_address); 11198 11199 // is referenced by megamorphic call 11200 StubRoutines::_catch_exception_entry = generate_catch_exception(); 11201 11202 // Initialize table for copy memory (arraycopy) check. 
11203 if (UnsafeMemoryAccess::_table == nullptr) { 11204 UnsafeMemoryAccess::create_table(8 + 4); // 8 for copyMemory; 4 for setMemory 11205 } 11206 11207 if (UseCRC32Intrinsics) { 11208 // set table address before stub generation which use it 11209 StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table; 11210 StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32(); 11211 } 11212 11213 if (UseCRC32CIntrinsics) { 11214 StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C(); 11215 } 11216 11217 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) { 11218 StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false); 11219 } 11220 11221 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) { 11222 StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true); 11223 } 11224 11225 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_float16ToFloat) && 11226 vmIntrinsics::is_intrinsic_available(vmIntrinsics::_floatToFloat16)) { 11227 StubRoutines::_hf2f = generate_float16ToFloat(); 11228 StubRoutines::_f2hf = generate_floatToFloat16(); 11229 } 11230 } 11231 11232 void generate_continuation_stubs() { 11233 // Continuation stubs: 11234 StubRoutines::_cont_thaw = generate_cont_thaw(); 11235 StubRoutines::_cont_returnBarrier = generate_cont_returnBarrier(); 11236 StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception(); 11237 StubRoutines::_cont_preempt_stub = generate_cont_preempt_stub(); 11238 } 11239 11240 void generate_final_stubs() { 11241 // support for verify_oop (must happen after universe_init) 11242 if (VerifyOops) { 11243 StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop(); 11244 } 11245 11246 // arraycopy stubs used by compilers 11247 generate_arraycopy_stubs(); 11248 11249 StubRoutines::_method_entry_barrier = generate_method_entry_barrier(); 11250 11251 StubRoutines::aarch64::_spin_wait = generate_spin_wait(); 11252 11253 StubRoutines::_upcall_stub_exception_handler = generate_upcall_stub_exception_handler(); 11254 StubRoutines::_upcall_stub_load_target = generate_upcall_stub_load_target(); 11255 11256 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS) 11257 11258 generate_atomic_entry_points(); 11259 11260 #endif // LINUX 11261 11262 #ifdef COMPILER2 11263 if (UseSecondarySupersTable) { 11264 StubRoutines::_lookup_secondary_supers_table_slow_path_stub = generate_lookup_secondary_supers_table_slow_path_stub(); 11265 if (! InlineSecondarySupersTest) { 11266 generate_lookup_secondary_supers_table_stub(); 11267 } 11268 } 11269 #endif 11270 11271 StubRoutines::aarch64::set_completed(); // Inidicate that arraycopy and zero_blocks stubs are generated 11272 } 11273 11274 void generate_compiler_stubs() { 11275 #if COMPILER2_OR_JVMCI 11276 11277 if (UseSVE == 0) { 11278 StubRoutines::aarch64::_vector_iota_indices = generate_iota_indices(StubGenStubId::vector_iota_indices_id); 11279 } 11280 11281 // array equals stub for large arrays. 11282 if (!UseSimpleArrayEquals) { 11283 StubRoutines::aarch64::_large_array_equals = generate_large_array_equals(); 11284 } 11285 11286 // arrays_hascode stub for large arrays. 
    StubRoutines::aarch64::_large_arrays_hashcode_boolean = generate_large_arrays_hashcode(T_BOOLEAN);
    StubRoutines::aarch64::_large_arrays_hashcode_byte = generate_large_arrays_hashcode(T_BYTE);
    StubRoutines::aarch64::_large_arrays_hashcode_char = generate_large_arrays_hashcode(T_CHAR);
    StubRoutines::aarch64::_large_arrays_hashcode_int = generate_large_arrays_hashcode(T_INT);
    StubRoutines::aarch64::_large_arrays_hashcode_short = generate_large_arrays_hashcode(T_SHORT);

    // byte_array_inflate stub for large arrays.
    StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();

    // countPositives stub for large arrays.
    StubRoutines::aarch64::_count_positives = generate_count_positives(StubRoutines::aarch64::_count_positives_long);

    generate_compare_long_strings();

    generate_string_indexof_stubs();

#ifdef COMPILER2
    if (UseMultiplyToLenIntrinsic) {
      StubRoutines::_multiplyToLen = generate_multiplyToLen();
    }

    if (UseSquareToLenIntrinsic) {
      StubRoutines::_squareToLen = generate_squareToLen();
    }

    if (UseMulAddIntrinsic) {
      StubRoutines::_mulAdd = generate_mulAdd();
    }

    if (UseSIMDForBigIntegerShiftIntrinsics) {
      StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
      StubRoutines::_bigIntegerLeftShiftWorker = generate_bigIntegerLeftShift();
    }

    if (UseMontgomeryMultiplyIntrinsic) {
      StubGenStubId stub_id = StubGenStubId::montgomeryMultiply_id;
      StubCodeMark mark(this, stub_id);
      MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
      StubRoutines::_montgomeryMultiply = g.generate_multiply();
    }

    if (UseMontgomerySquareIntrinsic) {
      StubGenStubId stub_id = StubGenStubId::montgomerySquare_id;
      StubCodeMark mark(this, stub_id);
      MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
      // We use generate_multiply() rather than generate_square()
      // because it's faster for the sizes of modulus we care about.
      StubRoutines::_montgomerySquare = g.generate_multiply();
    }

#endif // COMPILER2

    if (UseChaCha20Intrinsics) {
      StubRoutines::_chacha20Block = generate_chacha20Block_blockpar();
    }

    if (UseKyberIntrinsics) {
      StubRoutines::_kyberNtt = generate_kyberNtt();
      StubRoutines::_kyberInverseNtt = generate_kyberInverseNtt();
      StubRoutines::_kyberNttMult = generate_kyberNttMult();
      StubRoutines::_kyberAddPoly_2 = generate_kyberAddPoly_2();
      StubRoutines::_kyberAddPoly_3 = generate_kyberAddPoly_3();
      StubRoutines::_kyber12To16 = generate_kyber12To16();
      StubRoutines::_kyberBarrettReduce = generate_kyberBarrettReduce();
    }

    if (UseDilithiumIntrinsics) {
      StubRoutines::_dilithiumAlmostNtt = generate_dilithiumAlmostNtt();
      StubRoutines::_dilithiumAlmostInverseNtt = generate_dilithiumAlmostInverseNtt();
      StubRoutines::_dilithiumNttMult = generate_dilithiumNttMult();
      StubRoutines::_dilithiumMontMulByConstant = generate_dilithiumMontMulByConstant();
      StubRoutines::_dilithiumDecomposePoly = generate_dilithiumDecomposePoly();
    }

    if (UseBASE64Intrinsics) {
      StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
      StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
    }

    // data cache line writeback
    StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
    StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();

    if (UseAESIntrinsics) {
      StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
      StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
      StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
      StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
      StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt();
    }
    if (UseGHASHIntrinsics) {
      // StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
      StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks_wide();
    }
    if (UseAESIntrinsics && UseGHASHIntrinsics) {
      StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt();
    }

    if (UseMD5Intrinsics) {
      StubRoutines::_md5_implCompress = generate_md5_implCompress(StubGenStubId::md5_implCompress_id);
      StubRoutines::_md5_implCompressMB = generate_md5_implCompress(StubGenStubId::md5_implCompressMB_id);
    }
    if (UseSHA1Intrinsics) {
      StubRoutines::_sha1_implCompress = generate_sha1_implCompress(StubGenStubId::sha1_implCompress_id);
      StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(StubGenStubId::sha1_implCompressMB_id);
    }
    if (UseSHA256Intrinsics) {
      StubRoutines::_sha256_implCompress = generate_sha256_implCompress(StubGenStubId::sha256_implCompress_id);
      StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(StubGenStubId::sha256_implCompressMB_id);
    }
    if (UseSHA512Intrinsics) {
      StubRoutines::_sha512_implCompress = generate_sha512_implCompress(StubGenStubId::sha512_implCompress_id);
      StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(StubGenStubId::sha512_implCompressMB_id);
    }
    if (UseSHA3Intrinsics) {
      StubRoutines::_sha3_implCompress = generate_sha3_implCompress(StubGenStubId::sha3_implCompress_id);
      StubRoutines::_double_keccak = generate_double_keccak();
      StubRoutines::_sha3_implCompressMB = generate_sha3_implCompress(StubGenStubId::sha3_implCompressMB_id);
    }

    if (UsePoly1305Intrinsics) {
      StubRoutines::_poly1305_processBlocks = generate_poly1305_processBlocks();
    }

    // generate Adler32 intrinsics code
    if (UseAdler32Intrinsics) {
      StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
    }

#endif // COMPILER2_OR_JVMCI
  }

 public:
  StubGenerator(CodeBuffer* code, StubGenBlobId blob_id) : StubCodeGenerator(code, blob_id) {
    switch (blob_id) {
    case initial_id:
      generate_initial_stubs();
      break;
    case continuation_id:
      generate_continuation_stubs();
      break;
    case compiler_id:
      generate_compiler_stubs();
      break;
    case final_id:
      generate_final_stubs();
      break;
    default:
      fatal("unexpected blob id: %d", blob_id);
      break;
    };
  }
}; // end class declaration

void StubGenerator_generate(CodeBuffer* code, StubGenBlobId blob_id) {
  StubGenerator g(code, blob_id);
}


#if defined (LINUX)

// Define pointers to atomic stubs and initialize them to point to the
// code in atomic_aarch64.S.

#define DEFAULT_ATOMIC_OP(OPNAME, SIZE, RELAXED)                                          \
  extern "C" uint64_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl  \
    (volatile void *ptr, uint64_t arg1, uint64_t arg2);                                   \
  aarch64_atomic_stub_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _impl        \
    = aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl;

DEFAULT_ATOMIC_OP(fetch_add, 4, )
DEFAULT_ATOMIC_OP(fetch_add, 8, )
DEFAULT_ATOMIC_OP(fetch_add, 4, _relaxed)
DEFAULT_ATOMIC_OP(fetch_add, 8, _relaxed)
DEFAULT_ATOMIC_OP(xchg, 4, )
DEFAULT_ATOMIC_OP(xchg, 8, )
DEFAULT_ATOMIC_OP(cmpxchg, 1, )
DEFAULT_ATOMIC_OP(cmpxchg, 4, )
DEFAULT_ATOMIC_OP(cmpxchg, 8, )
DEFAULT_ATOMIC_OP(cmpxchg, 1, _relaxed)
DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed)
DEFAULT_ATOMIC_OP(cmpxchg, 8, _relaxed)
DEFAULT_ATOMIC_OP(cmpxchg, 4, _release)
DEFAULT_ATOMIC_OP(cmpxchg, 8, _release)
DEFAULT_ATOMIC_OP(cmpxchg, 4, _seq_cst)
DEFAULT_ATOMIC_OP(cmpxchg, 8, _seq_cst)

#undef DEFAULT_ATOMIC_OP

#endif // LINUX
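
// Illustration only: an invocation such as DEFAULT_ATOMIC_OP(cmpxchg, 8, _seq_cst)
// above expands to
//
//   extern "C" uint64_t aarch64_atomic_cmpxchg_8_seq_cst_default_impl
//     (volatile void *ptr, uint64_t arg1, uint64_t arg2);
//   aarch64_atomic_stub_t aarch64_atomic_cmpxchg_8_seq_cst_impl
//     = aarch64_atomic_cmpxchg_8_seq_cst_default_impl;
//
// i.e. it declares the hand-written default from atomic_aarch64.S and defines
// the corresponding function pointer initialized to that default, which
// generate_atomic_entry_points() can presumably repoint at a generated stub.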