1 /* 2 * Copyright (c) 2003, 2025, Oracle and/or its affiliates. All rights reserved. 3 * Copyright (c) 2014, 2025, Red Hat Inc. All rights reserved. 4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 5 * 6 * This code is free software; you can redistribute it and/or modify it 7 * under the terms of the GNU General Public License version 2 only, as 8 * published by the Free Software Foundation. 9 * 10 * This code is distributed in the hope that it will be useful, but WITHOUT 11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 13 * version 2 for more details (a copy is included in the LICENSE file that 14 * accompanied this code). 15 * 16 * You should have received a copy of the GNU General Public License version 17 * 2 along with this work; if not, write to the Free Software Foundation, 18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 19 * 20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 21 * or visit www.oracle.com if you need additional information or have any 22 * questions. 23 * 24 */ 25 26 #include "asm/macroAssembler.hpp" 27 #include "asm/macroAssembler.inline.hpp" 28 #include "asm/register.hpp" 29 #include "atomic_aarch64.hpp" 30 #include "compiler/oopMap.hpp" 31 #include "gc/shared/barrierSet.hpp" 32 #include "gc/shared/barrierSetAssembler.hpp" 33 #include "gc/shared/gc_globals.hpp" 34 #include "gc/shared/tlab_globals.hpp" 35 #include "interpreter/interpreter.hpp" 36 #include "memory/universe.hpp" 37 #include "nativeInst_aarch64.hpp" 38 #include "oops/instanceOop.hpp" 39 #include "oops/method.hpp" 40 #include "oops/objArrayKlass.hpp" 41 #include "oops/oop.inline.hpp" 42 #include "prims/methodHandles.hpp" 43 #include "prims/upcallLinker.hpp" 44 #include "runtime/arguments.hpp" 45 #include "runtime/atomic.hpp" 46 #include "runtime/continuation.hpp" 47 #include "runtime/continuationEntry.inline.hpp" 48 #include "runtime/frame.inline.hpp" 49 #include "runtime/handles.inline.hpp" 50 #include "runtime/javaThread.hpp" 51 #include "runtime/sharedRuntime.hpp" 52 #include "runtime/stubCodeGenerator.hpp" 53 #include "runtime/stubRoutines.hpp" 54 #include "utilities/align.hpp" 55 #include "utilities/checkedCast.hpp" 56 #include "utilities/debug.hpp" 57 #include "utilities/globalDefinitions.hpp" 58 #include "utilities/intpow.hpp" 59 #include "utilities/powerOfTwo.hpp" 60 #ifdef COMPILER2 61 #include "opto/runtime.hpp" 62 #endif 63 #if INCLUDE_ZGC 64 #include "gc/z/zThreadLocalData.hpp" 65 #endif 66 67 // Declaration and definition of StubGenerator (no .hpp file). 
68 // For a more detailed description of the stub routine structure 69 // see the comment in stubRoutines.hpp 70 71 #undef __ 72 #define __ _masm-> 73 74 #ifdef PRODUCT 75 #define BLOCK_COMMENT(str) /* nothing */ 76 #else 77 #define BLOCK_COMMENT(str) __ block_comment(str) 78 #endif 79 80 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":") 81 82 // Stub Code definitions 83 84 class StubGenerator: public StubCodeGenerator { 85 private: 86 87 #ifdef PRODUCT 88 #define inc_counter_np(counter) ((void)0) 89 #else 90 void inc_counter_np_(uint& counter) { 91 __ incrementw(ExternalAddress((address)&counter)); 92 } 93 #define inc_counter_np(counter) \ 94 BLOCK_COMMENT("inc_counter " #counter); \ 95 inc_counter_np_(counter); 96 #endif 97 98 // Call stubs are used to call Java from C 99 // 100 // Arguments: 101 // c_rarg0: call wrapper address address 102 // c_rarg1: result address 103 // c_rarg2: result type BasicType 104 // c_rarg3: method Method* 105 // c_rarg4: (interpreter) entry point address 106 // c_rarg5: parameters intptr_t* 107 // c_rarg6: parameter size (in words) int 108 // c_rarg7: thread Thread* 109 // 110 // There is no return from the stub itself as any Java result 111 // is written to result 112 // 113 // we save r30 (lr) as the return PC at the base of the frame and 114 // link r29 (fp) below it as the frame pointer installing sp (r31) 115 // into fp. 116 // 117 // we save r0-r7, which accounts for all the c arguments. 118 // 119 // TODO: strictly do we need to save them all? they are treated as 120 // volatile by C so could we omit saving the ones we are going to 121 // place in global registers (thread? method?) or those we only use 122 // during setup of the Java call? 123 // 124 // we don't need to save r8 which C uses as an indirect result location 125 // return register. 126 // 127 // we don't need to save r9-r15 which both C and Java treat as 128 // volatile 129 // 130 // we don't need to save r16-18 because Java does not use them 131 // 132 // we save r19-r28 which Java uses as scratch registers and C 133 // expects to be callee-save 134 // 135 // we save the bottom 64 bits of each value stored in v8-v15; it is 136 // the responsibility of the caller to preserve larger values. 137 // 138 // so the stub frame looks like this when we enter Java code 139 // 140 // [ return_from_Java ] <--- sp 141 // [ argument word n ] 142 // ... 
143 // -29 [ argument word 1 ] 144 // -28 [ saved Floating-point Control Register ] 145 // -26 [ saved v15 ] <--- sp_after_call 146 // -25 [ saved v14 ] 147 // -24 [ saved v13 ] 148 // -23 [ saved v12 ] 149 // -22 [ saved v11 ] 150 // -21 [ saved v10 ] 151 // -20 [ saved v9 ] 152 // -19 [ saved v8 ] 153 // -18 [ saved r28 ] 154 // -17 [ saved r27 ] 155 // -16 [ saved r26 ] 156 // -15 [ saved r25 ] 157 // -14 [ saved r24 ] 158 // -13 [ saved r23 ] 159 // -12 [ saved r22 ] 160 // -11 [ saved r21 ] 161 // -10 [ saved r20 ] 162 // -9 [ saved r19 ] 163 // -8 [ call wrapper (r0) ] 164 // -7 [ result (r1) ] 165 // -6 [ result type (r2) ] 166 // -5 [ method (r3) ] 167 // -4 [ entry point (r4) ] 168 // -3 [ parameters (r5) ] 169 // -2 [ parameter size (r6) ] 170 // -1 [ thread (r7) ] 171 // 0 [ saved fp (r29) ] <--- fp == saved sp (r31) 172 // 1 [ saved lr (r30) ] 173 174 // Call stub stack layout word offsets from fp 175 enum call_stub_layout { 176 sp_after_call_off = -28, 177 178 fpcr_off = sp_after_call_off, 179 d15_off = -26, 180 d13_off = -24, 181 d11_off = -22, 182 d9_off = -20, 183 184 r28_off = -18, 185 r26_off = -16, 186 r24_off = -14, 187 r22_off = -12, 188 r20_off = -10, 189 call_wrapper_off = -8, 190 result_off = -7, 191 result_type_off = -6, 192 method_off = -5, 193 entry_point_off = -4, 194 parameter_size_off = -2, 195 thread_off = -1, 196 fp_f = 0, 197 retaddr_off = 1, 198 }; 199 200 address generate_call_stub(address& return_address) { 201 assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 && 202 (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off, 203 "adjust this code"); 204 205 StubGenStubId stub_id = StubGenStubId::call_stub_id; 206 StubCodeMark mark(this, stub_id); 207 address start = __ pc(); 208 209 const Address sp_after_call (rfp, sp_after_call_off * wordSize); 210 211 const Address fpcr_save (rfp, fpcr_off * wordSize); 212 const Address call_wrapper (rfp, call_wrapper_off * wordSize); 213 const Address result (rfp, result_off * wordSize); 214 const Address result_type (rfp, result_type_off * wordSize); 215 const Address method (rfp, method_off * wordSize); 216 const Address entry_point (rfp, entry_point_off * wordSize); 217 const Address parameter_size(rfp, parameter_size_off * wordSize); 218 219 const Address thread (rfp, thread_off * wordSize); 220 221 const Address d15_save (rfp, d15_off * wordSize); 222 const Address d13_save (rfp, d13_off * wordSize); 223 const Address d11_save (rfp, d11_off * wordSize); 224 const Address d9_save (rfp, d9_off * wordSize); 225 226 const Address r28_save (rfp, r28_off * wordSize); 227 const Address r26_save (rfp, r26_off * wordSize); 228 const Address r24_save (rfp, r24_off * wordSize); 229 const Address r22_save (rfp, r22_off * wordSize); 230 const Address r20_save (rfp, r20_off * wordSize); 231 232 // stub code 233 234 address aarch64_entry = __ pc(); 235 236 // set up frame and move sp to end of save area 237 __ enter(); 238 __ sub(sp, rfp, -sp_after_call_off * wordSize); 239 240 // save register parameters and Java scratch/global registers 241 // n.b. 
// we save thread even though it gets installed in
    // rthread because we want to sanity check rthread later
    __ str(c_rarg7, thread);
    __ strw(c_rarg6, parameter_size);
    __ stp(c_rarg4, c_rarg5, entry_point);
    __ stp(c_rarg2, c_rarg3, result_type);
    __ stp(c_rarg0, c_rarg1, call_wrapper);

    __ stp(r20, r19, r20_save);
    __ stp(r22, r21, r22_save);
    __ stp(r24, r23, r24_save);
    __ stp(r26, r25, r26_save);
    __ stp(r28, r27, r28_save);

    __ stpd(v9, v8, d9_save);
    __ stpd(v11, v10, d11_save);
    __ stpd(v13, v12, d13_save);
    __ stpd(v15, v14, d15_save);

    __ get_fpcr(rscratch1);
    __ str(rscratch1, fpcr_save);
    // Set FPCR to the state we need. We do want Round to Nearest. We
    // don't want non-IEEE rounding modes or floating-point traps.
    __ bfi(rscratch1, zr, 22, 4); // Clear DN, FZ, and Rmode
    __ bfi(rscratch1, zr, 8, 5);  // Clear exception-control bits (8-12)
    __ set_fpcr(rscratch1);

    // install Java thread in global register now we have saved
    // whatever value it held
    __ mov(rthread, c_rarg7);
    // And method
    __ mov(rmethod, c_rarg3);

    // set up the heapbase register
    __ reinit_heapbase();

#ifdef ASSERT
    // make sure we have no pending exceptions
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
      __ cmp(rscratch1, (u1)NULL_WORD);
      __ br(Assembler::EQ, L);
      __ stop("StubRoutines::call_stub: entered with pending exception");
      __ BIND(L);
    }
#endif
    // pass parameters if any
    __ mov(esp, sp);
    __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
    __ andr(sp, rscratch1, -2 * wordSize);

    BLOCK_COMMENT("pass parameters if any");
    Label parameters_done;
    // parameter count is still in c_rarg6
    // and parameter pointer identifying param 1 is in c_rarg5
    __ cbzw(c_rarg6, parameters_done);

    address loop = __ pc();
    __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
    __ subsw(c_rarg6, c_rarg6, 1);
    __ push(rscratch1);
    __ br(Assembler::GT, loop);

    __ BIND(parameters_done);

    // call Java entry -- passing Method*, and current sp
    //      rmethod: Method*
    //      r19_sender_sp: sender sp
    BLOCK_COMMENT("call Java function");
    __ mov(r19_sender_sp, sp);
    __ blr(c_rarg4);

    // we do this here because the notify will already have been done
    // if we get to the next instruction via an exception
    //
    // n.b. adding this instruction here affects the calculation of
    // whether or not a routine returns to the call stub (used when
    // doing stack walks) since the normal test is to check the return
    // pc against the address saved below. so we may need to allow for
    // this extra instruction in the check.

    // save current address for use by exception handling code

    return_address = __ pc();

    // store result depending on type (everything that is not
    // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
    // n.b.
this assumes Java returns an integral result in r0 330 // and a floating result in j_farg0 331 __ ldr(j_rarg2, result); 332 Label is_long, is_float, is_double, exit; 333 __ ldr(j_rarg1, result_type); 334 __ cmp(j_rarg1, (u1)T_OBJECT); 335 __ br(Assembler::EQ, is_long); 336 __ cmp(j_rarg1, (u1)T_LONG); 337 __ br(Assembler::EQ, is_long); 338 __ cmp(j_rarg1, (u1)T_FLOAT); 339 __ br(Assembler::EQ, is_float); 340 __ cmp(j_rarg1, (u1)T_DOUBLE); 341 __ br(Assembler::EQ, is_double); 342 343 // handle T_INT case 344 __ strw(r0, Address(j_rarg2)); 345 346 __ BIND(exit); 347 348 // pop parameters 349 __ sub(esp, rfp, -sp_after_call_off * wordSize); 350 351 #ifdef ASSERT 352 // verify that threads correspond 353 { 354 Label L, S; 355 __ ldr(rscratch1, thread); 356 __ cmp(rthread, rscratch1); 357 __ br(Assembler::NE, S); 358 __ get_thread(rscratch1); 359 __ cmp(rthread, rscratch1); 360 __ br(Assembler::EQ, L); 361 __ BIND(S); 362 __ stop("StubRoutines::call_stub: threads must correspond"); 363 __ BIND(L); 364 } 365 #endif 366 367 __ pop_cont_fastpath(rthread); 368 369 // restore callee-save registers 370 __ ldpd(v15, v14, d15_save); 371 __ ldpd(v13, v12, d13_save); 372 __ ldpd(v11, v10, d11_save); 373 __ ldpd(v9, v8, d9_save); 374 375 __ ldp(r28, r27, r28_save); 376 __ ldp(r26, r25, r26_save); 377 __ ldp(r24, r23, r24_save); 378 __ ldp(r22, r21, r22_save); 379 __ ldp(r20, r19, r20_save); 380 381 // restore fpcr 382 __ ldr(rscratch1, fpcr_save); 383 __ set_fpcr(rscratch1); 384 385 __ ldp(c_rarg0, c_rarg1, call_wrapper); 386 __ ldrw(c_rarg2, result_type); 387 __ ldr(c_rarg3, method); 388 __ ldp(c_rarg4, c_rarg5, entry_point); 389 __ ldp(c_rarg6, c_rarg7, parameter_size); 390 391 // leave frame and return to caller 392 __ leave(); 393 __ ret(lr); 394 395 // handle return types different from T_INT 396 397 __ BIND(is_long); 398 __ str(r0, Address(j_rarg2, 0)); 399 __ br(Assembler::AL, exit); 400 401 __ BIND(is_float); 402 __ strs(j_farg0, Address(j_rarg2, 0)); 403 __ br(Assembler::AL, exit); 404 405 __ BIND(is_double); 406 __ strd(j_farg0, Address(j_rarg2, 0)); 407 __ br(Assembler::AL, exit); 408 409 return start; 410 } 411 412 // Return point for a Java call if there's an exception thrown in 413 // Java code. The exception is caught and transformed into a 414 // pending exception stored in JavaThread that can be tested from 415 // within the VM. 416 // 417 // Note: Usually the parameters are removed by the callee. In case 418 // of an exception crossing an activation frame boundary, that is 419 // not the case if the callee is compiled code => need to setup the 420 // rsp. 
421 // 422 // r0: exception oop 423 424 address generate_catch_exception() { 425 StubGenStubId stub_id = StubGenStubId::catch_exception_id; 426 StubCodeMark mark(this, stub_id); 427 address start = __ pc(); 428 429 // same as in generate_call_stub(): 430 const Address sp_after_call(rfp, sp_after_call_off * wordSize); 431 const Address thread (rfp, thread_off * wordSize); 432 433 #ifdef ASSERT 434 // verify that threads correspond 435 { 436 Label L, S; 437 __ ldr(rscratch1, thread); 438 __ cmp(rthread, rscratch1); 439 __ br(Assembler::NE, S); 440 __ get_thread(rscratch1); 441 __ cmp(rthread, rscratch1); 442 __ br(Assembler::EQ, L); 443 __ bind(S); 444 __ stop("StubRoutines::catch_exception: threads must correspond"); 445 __ bind(L); 446 } 447 #endif 448 449 // set pending exception 450 __ verify_oop(r0); 451 452 __ str(r0, Address(rthread, Thread::pending_exception_offset())); 453 __ mov(rscratch1, (address)__FILE__); 454 __ str(rscratch1, Address(rthread, Thread::exception_file_offset())); 455 __ movw(rscratch1, (int)__LINE__); 456 __ strw(rscratch1, Address(rthread, Thread::exception_line_offset())); 457 458 // complete return to VM 459 assert(StubRoutines::_call_stub_return_address != nullptr, 460 "_call_stub_return_address must have been generated before"); 461 __ b(StubRoutines::_call_stub_return_address); 462 463 return start; 464 } 465 466 // Continuation point for runtime calls returning with a pending 467 // exception. The pending exception check happened in the runtime 468 // or native call stub. The pending exception in Thread is 469 // converted into a Java-level exception. 470 // 471 // Contract with Java-level exception handlers: 472 // r0: exception 473 // r3: throwing pc 474 // 475 // NOTE: At entry of this stub, exception-pc must be in LR !! 476 477 // NOTE: this is always used as a jump target within generated code 478 // so it just needs to be generated code with no x86 prolog 479 480 address generate_forward_exception() { 481 StubGenStubId stub_id = StubGenStubId::forward_exception_id; 482 StubCodeMark mark(this, stub_id); 483 address start = __ pc(); 484 485 // Upon entry, LR points to the return address returning into 486 // Java (interpreted or compiled) code; i.e., the return address 487 // becomes the throwing pc. 488 // 489 // Arguments pushed before the runtime call are still on the stack 490 // but the exception handler will reset the stack pointer -> 491 // ignore them. A potential result in registers can be ignored as 492 // well. 493 494 #ifdef ASSERT 495 // make sure this code is only executed if there is a pending exception 496 { 497 Label L; 498 __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset())); 499 __ cbnz(rscratch1, L); 500 __ stop("StubRoutines::forward exception: no pending exception (1)"); 501 __ bind(L); 502 } 503 #endif 504 505 // compute exception handler into r19 506 507 // call the VM to find the handler address associated with the 508 // caller address. pass thread in r0 and caller pc (ret address) 509 // in r1. n.b. the caller pc is in lr, unlike x86 where it is on 510 // the stack. 511 __ mov(c_rarg1, lr); 512 // lr will be trashed by the VM call so we move it to R19 513 // (callee-saved) because we also need to pass it to the handler 514 // returned by this call. 
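    // In outline, the register shuffle below ends up with the following
    // (a sketch for orientation only; the detailed comments follow in the
    // code):
    //   r0  = pending exception oop (pending_exception is then cleared)
    //   r3  = throwing pc (the lr value saved on entry)
    //   r19 = handler address returned by the VM call
    //   lr  = throwing pc, restored to satisfy the VM's frame assert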
515 __ mov(r19, lr); 516 BLOCK_COMMENT("call exception_handler_for_return_address"); 517 __ call_VM_leaf(CAST_FROM_FN_PTR(address, 518 SharedRuntime::exception_handler_for_return_address), 519 rthread, c_rarg1); 520 // Reinitialize the ptrue predicate register, in case the external runtime 521 // call clobbers ptrue reg, as we may return to SVE compiled code. 522 __ reinitialize_ptrue(); 523 524 // we should not really care that lr is no longer the callee 525 // address. we saved the value the handler needs in r19 so we can 526 // just copy it to r3. however, the C2 handler will push its own 527 // frame and then calls into the VM and the VM code asserts that 528 // the PC for the frame above the handler belongs to a compiled 529 // Java method. So, we restore lr here to satisfy that assert. 530 __ mov(lr, r19); 531 // setup r0 & r3 & clear pending exception 532 __ mov(r3, r19); 533 __ mov(r19, r0); 534 __ ldr(r0, Address(rthread, Thread::pending_exception_offset())); 535 __ str(zr, Address(rthread, Thread::pending_exception_offset())); 536 537 #ifdef ASSERT 538 // make sure exception is set 539 { 540 Label L; 541 __ cbnz(r0, L); 542 __ stop("StubRoutines::forward exception: no pending exception (2)"); 543 __ bind(L); 544 } 545 #endif 546 547 // continue at exception handler 548 // r0: exception 549 // r3: throwing pc 550 // r19: exception handler 551 __ verify_oop(r0); 552 __ br(r19); 553 554 return start; 555 } 556 557 // Non-destructive plausibility checks for oops 558 // 559 // Arguments: 560 // r0: oop to verify 561 // rscratch1: error message 562 // 563 // Stack after saving c_rarg3: 564 // [tos + 0]: saved c_rarg3 565 // [tos + 1]: saved c_rarg2 566 // [tos + 2]: saved lr 567 // [tos + 3]: saved rscratch2 568 // [tos + 4]: saved r0 569 // [tos + 5]: saved rscratch1 570 address generate_verify_oop() { 571 StubGenStubId stub_id = StubGenStubId::verify_oop_id; 572 StubCodeMark mark(this, stub_id); 573 address start = __ pc(); 574 575 Label exit, error; 576 577 // save c_rarg2 and c_rarg3 578 __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16))); 579 580 // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr())); 581 __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr())); 582 __ ldr(c_rarg3, Address(c_rarg2)); 583 __ add(c_rarg3, c_rarg3, 1); 584 __ str(c_rarg3, Address(c_rarg2)); 585 586 // object is in r0 587 // make sure object is 'reasonable' 588 __ cbz(r0, exit); // if obj is null it is OK 589 590 BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler(); 591 bs_asm->check_oop(_masm, r0, c_rarg2, c_rarg3, error); 592 593 // return if everything seems ok 594 __ bind(exit); 595 596 __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16))); 597 __ ret(lr); 598 599 // handle errors 600 __ bind(error); 601 __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16))); 602 603 __ push(RegSet::range(r0, r29), sp); 604 // debug(char* msg, int64_t pc, int64_t regs[]) 605 __ mov(c_rarg0, rscratch1); // pass address of error message 606 __ mov(c_rarg1, lr); // pass return address 607 __ mov(c_rarg2, sp); // pass address of regs on stack 608 #ifndef PRODUCT 609 assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area"); 610 #endif 611 BLOCK_COMMENT("call MacroAssembler::debug"); 612 __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64)); 613 __ blr(rscratch1); 614 __ hlt(0); 615 616 return start; 617 } 618 619 // Generate indices for iota vector. 
620 address generate_iota_indices(StubGenStubId stub_id) { 621 __ align(CodeEntryAlignment); 622 StubCodeMark mark(this, stub_id); 623 address start = __ pc(); 624 // B 625 __ emit_data64(0x0706050403020100, relocInfo::none); 626 __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none); 627 // H 628 __ emit_data64(0x0003000200010000, relocInfo::none); 629 __ emit_data64(0x0007000600050004, relocInfo::none); 630 // S 631 __ emit_data64(0x0000000100000000, relocInfo::none); 632 __ emit_data64(0x0000000300000002, relocInfo::none); 633 // D 634 __ emit_data64(0x0000000000000000, relocInfo::none); 635 __ emit_data64(0x0000000000000001, relocInfo::none); 636 // S - FP 637 __ emit_data64(0x3F80000000000000, relocInfo::none); // 0.0f, 1.0f 638 __ emit_data64(0x4040000040000000, relocInfo::none); // 2.0f, 3.0f 639 // D - FP 640 __ emit_data64(0x0000000000000000, relocInfo::none); // 0.0d 641 __ emit_data64(0x3FF0000000000000, relocInfo::none); // 1.0d 642 return start; 643 } 644 645 // The inner part of zero_words(). This is the bulk operation, 646 // zeroing words in blocks, possibly using DC ZVA to do it. The 647 // caller is responsible for zeroing the last few words. 648 // 649 // Inputs: 650 // r10: the HeapWord-aligned base address of an array to zero. 651 // r11: the count in HeapWords, r11 > 0. 652 // 653 // Returns r10 and r11, adjusted for the caller to clear. 654 // r10: the base address of the tail of words left to clear. 655 // r11: the number of words in the tail. 656 // r11 < MacroAssembler::zero_words_block_size. 657 658 address generate_zero_blocks() { 659 Label done; 660 Label base_aligned; 661 662 Register base = r10, cnt = r11; 663 664 __ align(CodeEntryAlignment); 665 StubGenStubId stub_id = StubGenStubId::zero_blocks_id; 666 StubCodeMark mark(this, stub_id); 667 address start = __ pc(); 668 669 if (UseBlockZeroing) { 670 int zva_length = VM_Version::zva_length(); 671 672 // Ensure ZVA length can be divided by 16. This is required by 673 // the subsequent operations. 674 assert (zva_length % 16 == 0, "Unexpected ZVA Length"); 675 676 __ tbz(base, 3, base_aligned); 677 __ str(zr, Address(__ post(base, 8))); 678 __ sub(cnt, cnt, 1); 679 __ bind(base_aligned); 680 681 // Ensure count >= zva_length * 2 so that it still deserves a zva after 682 // alignment. 683 Label small; 684 int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit); 685 __ subs(rscratch1, cnt, low_limit >> 3); 686 __ br(Assembler::LT, small); 687 __ zero_dcache_blocks(base, cnt); 688 __ bind(small); 689 } 690 691 { 692 // Number of stp instructions we'll unroll 693 const int unroll = 694 MacroAssembler::zero_words_block_size / 2; 695 // Clear the remaining blocks. 696 Label loop; 697 __ subs(cnt, cnt, unroll * 2); 698 __ br(Assembler::LT, done); 699 __ bind(loop); 700 for (int i = 0; i < unroll; i++) 701 __ stp(zr, zr, __ post(base, 16)); 702 __ subs(cnt, cnt, unroll * 2); 703 __ br(Assembler::GE, loop); 704 __ bind(done); 705 __ add(cnt, cnt, unroll * 2); 706 } 707 708 __ ret(lr); 709 710 return start; 711 } 712 713 714 typedef enum { 715 copy_forwards = 1, 716 copy_backwards = -1 717 } copy_direction; 718 719 // Helper object to reduce noise when telling the GC barriers how to perform loads and stores 720 // for arraycopy stubs. 
721 class ArrayCopyBarrierSetHelper : StackObj { 722 BarrierSetAssembler* _bs_asm; 723 MacroAssembler* _masm; 724 DecoratorSet _decorators; 725 BasicType _type; 726 Register _gct1; 727 Register _gct2; 728 Register _gct3; 729 FloatRegister _gcvt1; 730 FloatRegister _gcvt2; 731 FloatRegister _gcvt3; 732 733 public: 734 ArrayCopyBarrierSetHelper(MacroAssembler* masm, 735 DecoratorSet decorators, 736 BasicType type, 737 Register gct1, 738 Register gct2, 739 Register gct3, 740 FloatRegister gcvt1, 741 FloatRegister gcvt2, 742 FloatRegister gcvt3) 743 : _bs_asm(BarrierSet::barrier_set()->barrier_set_assembler()), 744 _masm(masm), 745 _decorators(decorators), 746 _type(type), 747 _gct1(gct1), 748 _gct2(gct2), 749 _gct3(gct3), 750 _gcvt1(gcvt1), 751 _gcvt2(gcvt2), 752 _gcvt3(gcvt3) { 753 } 754 755 void copy_load_at_32(FloatRegister dst1, FloatRegister dst2, Address src) { 756 _bs_asm->copy_load_at(_masm, _decorators, _type, 32, 757 dst1, dst2, src, 758 _gct1, _gct2, _gcvt1); 759 } 760 761 void copy_store_at_32(Address dst, FloatRegister src1, FloatRegister src2) { 762 _bs_asm->copy_store_at(_masm, _decorators, _type, 32, 763 dst, src1, src2, 764 _gct1, _gct2, _gct3, _gcvt1, _gcvt2, _gcvt3); 765 } 766 767 void copy_load_at_16(Register dst1, Register dst2, Address src) { 768 _bs_asm->copy_load_at(_masm, _decorators, _type, 16, 769 dst1, dst2, src, 770 _gct1); 771 } 772 773 void copy_store_at_16(Address dst, Register src1, Register src2) { 774 _bs_asm->copy_store_at(_masm, _decorators, _type, 16, 775 dst, src1, src2, 776 _gct1, _gct2, _gct3); 777 } 778 779 void copy_load_at_8(Register dst, Address src) { 780 _bs_asm->copy_load_at(_masm, _decorators, _type, 8, 781 dst, noreg, src, 782 _gct1); 783 } 784 785 void copy_store_at_8(Address dst, Register src) { 786 _bs_asm->copy_store_at(_masm, _decorators, _type, 8, 787 dst, src, noreg, 788 _gct1, _gct2, _gct3); 789 } 790 }; 791 792 // Bulk copy of blocks of 8 words. 793 // 794 // count is a count of words. 795 // 796 // Precondition: count >= 8 797 // 798 // Postconditions: 799 // 800 // The least significant bit of count contains the remaining count 801 // of words to copy. The rest of count is trash. 802 // 803 // s and d are adjusted to point to the remaining words to copy 804 // 805 void generate_copy_longs(StubGenStubId stub_id, DecoratorSet decorators, Label &start, Register s, Register d, Register count) { 806 BasicType type; 807 copy_direction direction; 808 809 switch (stub_id) { 810 case copy_byte_f_id: 811 direction = copy_forwards; 812 type = T_BYTE; 813 break; 814 case copy_byte_b_id: 815 direction = copy_backwards; 816 type = T_BYTE; 817 break; 818 case copy_oop_f_id: 819 direction = copy_forwards; 820 type = T_OBJECT; 821 break; 822 case copy_oop_b_id: 823 direction = copy_backwards; 824 type = T_OBJECT; 825 break; 826 case copy_oop_uninit_f_id: 827 direction = copy_forwards; 828 type = T_OBJECT; 829 break; 830 case copy_oop_uninit_b_id: 831 direction = copy_backwards; 832 type = T_OBJECT; 833 break; 834 default: 835 ShouldNotReachHere(); 836 } 837 838 int unit = wordSize * direction; 839 int bias = (UseSIMDForMemoryOps ? 
4:2) * wordSize; 840 841 const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6, 842 t4 = r7, t5 = r11, t6 = r12, t7 = r13; 843 const Register stride = r14; 844 const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10; 845 const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved 846 ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3); 847 848 assert_different_registers(rscratch1, rscratch2, t0, t1, t2, t3, t4, t5, t6, t7); 849 assert_different_registers(s, d, count, rscratch1, rscratch2); 850 851 Label again, drain; 852 853 __ align(CodeEntryAlignment); 854 855 StubCodeMark mark(this, stub_id); 856 857 __ bind(start); 858 859 Label unaligned_copy_long; 860 if (AvoidUnalignedAccesses) { 861 __ tbnz(d, 3, unaligned_copy_long); 862 } 863 864 if (direction == copy_forwards) { 865 __ sub(s, s, bias); 866 __ sub(d, d, bias); 867 } 868 869 #ifdef ASSERT 870 // Make sure we are never given < 8 words 871 { 872 Label L; 873 __ cmp(count, (u1)8); 874 __ br(Assembler::GE, L); 875 __ stop("genrate_copy_longs called with < 8 words"); 876 __ bind(L); 877 } 878 #endif 879 880 // Fill 8 registers 881 if (UseSIMDForMemoryOps) { 882 bs.copy_load_at_32(v0, v1, Address(s, 4 * unit)); 883 bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit))); 884 } else { 885 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit)); 886 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit)); 887 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit)); 888 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit))); 889 } 890 891 __ subs(count, count, 16); 892 __ br(Assembler::LO, drain); 893 894 int prefetch = PrefetchCopyIntervalInBytes; 895 bool use_stride = false; 896 if (direction == copy_backwards) { 897 use_stride = prefetch > 256; 898 prefetch = -prefetch; 899 if (use_stride) __ mov(stride, prefetch); 900 } 901 902 __ bind(again); 903 904 if (PrefetchCopyIntervalInBytes > 0) 905 __ prfm(use_stride ? 
Address(s, stride) : Address(s, prefetch), PLDL1KEEP); 906 907 if (UseSIMDForMemoryOps) { 908 bs.copy_store_at_32(Address(d, 4 * unit), v0, v1); 909 bs.copy_load_at_32(v0, v1, Address(s, 4 * unit)); 910 bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3); 911 bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit))); 912 } else { 913 bs.copy_store_at_16(Address(d, 2 * unit), t0, t1); 914 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit)); 915 bs.copy_store_at_16(Address(d, 4 * unit), t2, t3); 916 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit)); 917 bs.copy_store_at_16(Address(d, 6 * unit), t4, t5); 918 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit)); 919 bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7); 920 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit))); 921 } 922 923 __ subs(count, count, 8); 924 __ br(Assembler::HS, again); 925 926 // Drain 927 __ bind(drain); 928 if (UseSIMDForMemoryOps) { 929 bs.copy_store_at_32(Address(d, 4 * unit), v0, v1); 930 bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3); 931 } else { 932 bs.copy_store_at_16(Address(d, 2 * unit), t0, t1); 933 bs.copy_store_at_16(Address(d, 4 * unit), t2, t3); 934 bs.copy_store_at_16(Address(d, 6 * unit), t4, t5); 935 bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7); 936 } 937 938 { 939 Label L1, L2; 940 __ tbz(count, exact_log2(4), L1); 941 if (UseSIMDForMemoryOps) { 942 bs.copy_load_at_32(v0, v1, Address(__ pre(s, 4 * unit))); 943 bs.copy_store_at_32(Address(__ pre(d, 4 * unit)), v0, v1); 944 } else { 945 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit)); 946 bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit))); 947 bs.copy_store_at_16(Address(d, 2 * unit), t0, t1); 948 bs.copy_store_at_16(Address(__ pre(d, 4 * unit)), t2, t3); 949 } 950 __ bind(L1); 951 952 if (direction == copy_forwards) { 953 __ add(s, s, bias); 954 __ add(d, d, bias); 955 } 956 957 __ tbz(count, 1, L2); 958 bs.copy_load_at_16(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards))); 959 bs.copy_store_at_16(Address(__ adjust(d, 2 * unit, direction == copy_backwards)), t0, t1); 960 __ bind(L2); 961 } 962 963 __ ret(lr); 964 965 if (AvoidUnalignedAccesses) { 966 Label drain, again; 967 // Register order for storing. Order is different for backward copy. 968 969 __ bind(unaligned_copy_long); 970 971 // source address is even aligned, target odd aligned 972 // 973 // when forward copying word pairs we read long pairs at offsets 974 // {0, 2, 4, 6} (in long words). when backwards copying we read 975 // long pairs at offsets {-2, -4, -6, -8}. We adjust the source 976 // address by -2 in the forwards case so we can compute the 977 // source offsets for both as {2, 4, 6, 8} * unit where unit = 1 978 // or -1. 979 // 980 // when forward copying we need to store 1 word, 3 pairs and 981 // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a 982 // zero offset We adjust the destination by -1 which means we 983 // have to use offsets { 1, 2, 4, 6, 8} * unit for the stores. 984 // 985 // When backwards copyng we need to store 1 word, 3 pairs and 986 // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use 987 // offsets {1, 3, 5, 7, 8} * unit. 
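      // As a concrete sketch of the forwards case (unit == 8 bytes,
      // offsets measured from the original, unadjusted s and d): after
      // the s -= 16 / d -= 8 adjustment below, the loads at
      // {2, 4, 6, 8} * unit read source bytes 0..63 and the stores at
      // {1, 2, 4, 6, 8} * unit write one word at destination offset 0,
      // pairs at 8, 24 and 40, and a final word at 56.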
988 989 if (direction == copy_forwards) { 990 __ sub(s, s, 16); 991 __ sub(d, d, 8); 992 } 993 994 // Fill 8 registers 995 // 996 // for forwards copy s was offset by -16 from the original input 997 // value of s so the register contents are at these offsets 998 // relative to the 64 bit block addressed by that original input 999 // and so on for each successive 64 byte block when s is updated 1000 // 1001 // t0 at offset 0, t1 at offset 8 1002 // t2 at offset 16, t3 at offset 24 1003 // t4 at offset 32, t5 at offset 40 1004 // t6 at offset 48, t7 at offset 56 1005 1006 // for backwards copy s was not offset so the register contents 1007 // are at these offsets into the preceding 64 byte block 1008 // relative to that original input and so on for each successive 1009 // preceding 64 byte block when s is updated. this explains the 1010 // slightly counter-intuitive looking pattern of register usage 1011 // in the stp instructions for backwards copy. 1012 // 1013 // t0 at offset -16, t1 at offset -8 1014 // t2 at offset -32, t3 at offset -24 1015 // t4 at offset -48, t5 at offset -40 1016 // t6 at offset -64, t7 at offset -56 1017 1018 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit)); 1019 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit)); 1020 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit)); 1021 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit))); 1022 1023 __ subs(count, count, 16); 1024 __ br(Assembler::LO, drain); 1025 1026 int prefetch = PrefetchCopyIntervalInBytes; 1027 bool use_stride = false; 1028 if (direction == copy_backwards) { 1029 use_stride = prefetch > 256; 1030 prefetch = -prefetch; 1031 if (use_stride) __ mov(stride, prefetch); 1032 } 1033 1034 __ bind(again); 1035 1036 if (PrefetchCopyIntervalInBytes > 0) 1037 __ prfm(use_stride ? 
Address(s, stride) : Address(s, prefetch), PLDL1KEEP); 1038 1039 if (direction == copy_forwards) { 1040 // allowing for the offset of -8 the store instructions place 1041 // registers into the target 64 bit block at the following 1042 // offsets 1043 // 1044 // t0 at offset 0 1045 // t1 at offset 8, t2 at offset 16 1046 // t3 at offset 24, t4 at offset 32 1047 // t5 at offset 40, t6 at offset 48 1048 // t7 at offset 56 1049 1050 bs.copy_store_at_8(Address(d, 1 * unit), t0); 1051 bs.copy_store_at_16(Address(d, 2 * unit), t1, t2); 1052 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit)); 1053 bs.copy_store_at_16(Address(d, 4 * unit), t3, t4); 1054 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit)); 1055 bs.copy_store_at_16(Address(d, 6 * unit), t5, t6); 1056 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit)); 1057 bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7); 1058 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit))); 1059 } else { 1060 // d was not offset when we started so the registers are 1061 // written into the 64 bit block preceding d with the following 1062 // offsets 1063 // 1064 // t1 at offset -8 1065 // t3 at offset -24, t0 at offset -16 1066 // t5 at offset -48, t2 at offset -32 1067 // t7 at offset -56, t4 at offset -48 1068 // t6 at offset -64 1069 // 1070 // note that this matches the offsets previously noted for the 1071 // loads 1072 1073 bs.copy_store_at_8(Address(d, 1 * unit), t1); 1074 bs.copy_store_at_16(Address(d, 3 * unit), t3, t0); 1075 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit)); 1076 bs.copy_store_at_16(Address(d, 5 * unit), t5, t2); 1077 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit)); 1078 bs.copy_store_at_16(Address(d, 7 * unit), t7, t4); 1079 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit)); 1080 bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6); 1081 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit))); 1082 } 1083 1084 __ subs(count, count, 8); 1085 __ br(Assembler::HS, again); 1086 1087 // Drain 1088 // 1089 // this uses the same pattern of offsets and register arguments 1090 // as above 1091 __ bind(drain); 1092 if (direction == copy_forwards) { 1093 bs.copy_store_at_8(Address(d, 1 * unit), t0); 1094 bs.copy_store_at_16(Address(d, 2 * unit), t1, t2); 1095 bs.copy_store_at_16(Address(d, 4 * unit), t3, t4); 1096 bs.copy_store_at_16(Address(d, 6 * unit), t5, t6); 1097 bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7); 1098 } else { 1099 bs.copy_store_at_8(Address(d, 1 * unit), t1); 1100 bs.copy_store_at_16(Address(d, 3 * unit), t3, t0); 1101 bs.copy_store_at_16(Address(d, 5 * unit), t5, t2); 1102 bs.copy_store_at_16(Address(d, 7 * unit), t7, t4); 1103 bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6); 1104 } 1105 // now we need to copy any remaining part block which may 1106 // include a 4 word block subblock and/or a 2 word subblock. 
      // bits 2 and 1 in the count are the tell-tale for whether we
      // have each such subblock
      {
        Label L1, L2;
        __ tbz(count, exact_log2(4), L1);
        // this is the same as above but copying only 4 longs hence
        // with only one intervening stp between the str instructions
        // but note that the offsets and registers still follow the
        // same pattern
        bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
        bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
        if (direction == copy_forwards) {
          bs.copy_store_at_8(Address(d, 1 * unit), t0);
          bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
          bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t3);
        } else {
          bs.copy_store_at_8(Address(d, 1 * unit), t1);
          bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
          bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t2);
        }
        __ bind(L1);

        __ tbz(count, 1, L2);
        // this is the same as above but copying only 2 longs hence
        // there is no intervening stp between the str instructions
        // but note that the offset and register patterns are still
        // the same
        bs.copy_load_at_16(t0, t1, Address(__ pre(s, 2 * unit)));
        if (direction == copy_forwards) {
          bs.copy_store_at_8(Address(d, 1 * unit), t0);
          bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t1);
        } else {
          bs.copy_store_at_8(Address(d, 1 * unit), t1);
          bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t0);
        }
        __ bind(L2);

        // for forwards copy we need to re-adjust the offsets we
        // applied so that s and d follow the last words written

        if (direction == copy_forwards) {
          __ add(s, s, 16);
          __ add(d, d, 8);
        }

      }

      __ ret(lr);
    }
  }

  // Small copy: less than 16 bytes.
  //
  // NB: Ignores all of the bits of count which represent more than 15
  // bytes, so a caller doesn't have to mask them.

  void copy_memory_small(DecoratorSet decorators, BasicType type, Register s, Register d, Register count, int step) {
    bool is_backwards = step < 0;
    size_t granularity = uabs(step);
    int direction = is_backwards ? -1 : 1;

    Label Lword, Lint, Lshort, Lbyte;

    assert(granularity
           && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");

    const Register t0 = r3;
    const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
    ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, fnoreg, fnoreg, fnoreg);

    // ??? I don't know if this bit-test-and-branch is the right thing
    // to do. It does a lot of jumping, resulting in several
    // mispredicted branches. It might make more sense to do this
    // with something like Duff's device with a single computed branch.
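    // A worked example of the bit tests below: for a byte copy
    // (granularity == 1) a count of 13 == 0b1101 copies one 8-byte word
    // (bit 3), one jint (bit 2) and one byte (bit 0), i.e. 8 + 4 + 1 == 13
    // bytes. For a jshort copy (granularity == 2) a count of 5 == 0b101
    // elements copies one word (bit 2, four elements) and one jshort
    // (bit 0); the jint test looks at bit 1, which is clear here.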
1181 1182 __ tbz(count, 3 - exact_log2(granularity), Lword); 1183 bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards))); 1184 bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0); 1185 __ bind(Lword); 1186 1187 if (granularity <= sizeof (jint)) { 1188 __ tbz(count, 2 - exact_log2(granularity), Lint); 1189 __ ldrw(t0, Address(__ adjust(s, sizeof (jint) * direction, is_backwards))); 1190 __ strw(t0, Address(__ adjust(d, sizeof (jint) * direction, is_backwards))); 1191 __ bind(Lint); 1192 } 1193 1194 if (granularity <= sizeof (jshort)) { 1195 __ tbz(count, 1 - exact_log2(granularity), Lshort); 1196 __ ldrh(t0, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards))); 1197 __ strh(t0, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards))); 1198 __ bind(Lshort); 1199 } 1200 1201 if (granularity <= sizeof (jbyte)) { 1202 __ tbz(count, 0, Lbyte); 1203 __ ldrb(t0, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards))); 1204 __ strb(t0, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards))); 1205 __ bind(Lbyte); 1206 } 1207 } 1208 1209 Label copy_f, copy_b; 1210 Label copy_obj_f, copy_obj_b; 1211 Label copy_obj_uninit_f, copy_obj_uninit_b; 1212 1213 // All-singing all-dancing memory copy. 1214 // 1215 // Copy count units of memory from s to d. The size of a unit is 1216 // step, which can be positive or negative depending on the direction 1217 // of copy. If is_aligned is false, we align the source address. 1218 // 1219 1220 void copy_memory(DecoratorSet decorators, BasicType type, bool is_aligned, 1221 Register s, Register d, Register count, int step) { 1222 copy_direction direction = step < 0 ? copy_backwards : copy_forwards; 1223 bool is_backwards = step < 0; 1224 unsigned int granularity = uabs(step); 1225 const Register t0 = r3, t1 = r4; 1226 1227 // <= 80 (or 96 for SIMD) bytes do inline. Direction doesn't matter because we always 1228 // load all the data before writing anything 1229 Label copy4, copy8, copy16, copy32, copy80, copy_big, finish; 1230 const Register t2 = r5, t3 = r6, t4 = r7, t5 = r11; 1231 const Register t6 = r12, t7 = r13, t8 = r14, t9 = r15; 1232 const Register send = r17, dend = r16; 1233 const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10; 1234 const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved 1235 ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3); 1236 1237 if (PrefetchCopyIntervalInBytes > 0) 1238 __ prfm(Address(s, 0), PLDL1KEEP); 1239 __ cmp(count, u1((UseSIMDForMemoryOps ? 
96:80)/granularity));
    __ br(Assembler::HI, copy_big);

    __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
    __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));

    __ cmp(count, u1(16/granularity));
    __ br(Assembler::LS, copy16);

    __ cmp(count, u1(64/granularity));
    __ br(Assembler::HI, copy80);

    __ cmp(count, u1(32/granularity));
    __ br(Assembler::LS, copy32);

    // 33..64 bytes
    if (UseSIMDForMemoryOps) {
      bs.copy_load_at_32(v0, v1, Address(s, 0));
      bs.copy_load_at_32(v2, v3, Address(send, -32));
      bs.copy_store_at_32(Address(d, 0), v0, v1);
      bs.copy_store_at_32(Address(dend, -32), v2, v3);
    } else {
      bs.copy_load_at_16(t0, t1, Address(s, 0));
      bs.copy_load_at_16(t2, t3, Address(s, 16));
      bs.copy_load_at_16(t4, t5, Address(send, -32));
      bs.copy_load_at_16(t6, t7, Address(send, -16));

      bs.copy_store_at_16(Address(d, 0), t0, t1);
      bs.copy_store_at_16(Address(d, 16), t2, t3);
      bs.copy_store_at_16(Address(dend, -32), t4, t5);
      bs.copy_store_at_16(Address(dend, -16), t6, t7);
    }
    __ b(finish);

    // 17..32 bytes
    __ bind(copy32);
    bs.copy_load_at_16(t0, t1, Address(s, 0));
    bs.copy_load_at_16(t6, t7, Address(send, -16));

    bs.copy_store_at_16(Address(d, 0), t0, t1);
    bs.copy_store_at_16(Address(dend, -16), t6, t7);
    __ b(finish);

    // 65..80/96 bytes
    // (96 bytes if SIMD because we do 32 bytes per instruction)
    __ bind(copy80);
    if (UseSIMDForMemoryOps) {
      bs.copy_load_at_32(v0, v1, Address(s, 0));
      bs.copy_load_at_32(v2, v3, Address(s, 32));
      // Unaligned pointers can be an issue for copying.
      // The issue is more likely when the granularity of the data is
      // less than 4 (sizeof(jint)). Pointers for arrays of jint are at least
      // 4 byte aligned. Pointers for arrays of jlong are 8 byte aligned.
      // The largest performance drop has been seen for the range 65-80 bytes.
      // For such cases using the pair of ldp/stp instead of the third pair of
      // ldpq/stpq fixes the performance issue.
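      // For example, for a jbyte copy (granularity == 1) a count of 65..80
      // bytes copies the first 64 bytes with the two ldpq/stpq pairs and
      // the tail with a single 16-byte ldp/stp, while 81..96 bytes falls
      // through to a third ldpq/stpq pair for the tail. Element types of
      // jint and wider skip this split and always use the third pair.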
1295 if (granularity < sizeof (jint)) { 1296 Label copy96; 1297 __ cmp(count, u1(80/granularity)); 1298 __ br(Assembler::HI, copy96); 1299 bs.copy_load_at_16(t0, t1, Address(send, -16)); 1300 1301 bs.copy_store_at_32(Address(d, 0), v0, v1); 1302 bs.copy_store_at_32(Address(d, 32), v2, v3); 1303 1304 bs.copy_store_at_16(Address(dend, -16), t0, t1); 1305 __ b(finish); 1306 1307 __ bind(copy96); 1308 } 1309 bs.copy_load_at_32(v4, v5, Address(send, -32)); 1310 1311 bs.copy_store_at_32(Address(d, 0), v0, v1); 1312 bs.copy_store_at_32(Address(d, 32), v2, v3); 1313 1314 bs.copy_store_at_32(Address(dend, -32), v4, v5); 1315 } else { 1316 bs.copy_load_at_16(t0, t1, Address(s, 0)); 1317 bs.copy_load_at_16(t2, t3, Address(s, 16)); 1318 bs.copy_load_at_16(t4, t5, Address(s, 32)); 1319 bs.copy_load_at_16(t6, t7, Address(s, 48)); 1320 bs.copy_load_at_16(t8, t9, Address(send, -16)); 1321 1322 bs.copy_store_at_16(Address(d, 0), t0, t1); 1323 bs.copy_store_at_16(Address(d, 16), t2, t3); 1324 bs.copy_store_at_16(Address(d, 32), t4, t5); 1325 bs.copy_store_at_16(Address(d, 48), t6, t7); 1326 bs.copy_store_at_16(Address(dend, -16), t8, t9); 1327 } 1328 __ b(finish); 1329 1330 // 0..16 bytes 1331 __ bind(copy16); 1332 __ cmp(count, u1(8/granularity)); 1333 __ br(Assembler::LO, copy8); 1334 1335 // 8..16 bytes 1336 bs.copy_load_at_8(t0, Address(s, 0)); 1337 bs.copy_load_at_8(t1, Address(send, -8)); 1338 bs.copy_store_at_8(Address(d, 0), t0); 1339 bs.copy_store_at_8(Address(dend, -8), t1); 1340 __ b(finish); 1341 1342 if (granularity < 8) { 1343 // 4..7 bytes 1344 __ bind(copy8); 1345 __ tbz(count, 2 - exact_log2(granularity), copy4); 1346 __ ldrw(t0, Address(s, 0)); 1347 __ ldrw(t1, Address(send, -4)); 1348 __ strw(t0, Address(d, 0)); 1349 __ strw(t1, Address(dend, -4)); 1350 __ b(finish); 1351 if (granularity < 4) { 1352 // 0..3 bytes 1353 __ bind(copy4); 1354 __ cbz(count, finish); // get rid of 0 case 1355 if (granularity == 2) { 1356 __ ldrh(t0, Address(s, 0)); 1357 __ strh(t0, Address(d, 0)); 1358 } else { // granularity == 1 1359 // Now 1..3 bytes. Handle the 1 and 2 byte case by copying 1360 // the first and last byte. 1361 // Handle the 3 byte case by loading and storing base + count/2 1362 // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1)) 1363 // This does means in the 1 byte case we load/store the same 1364 // byte 3 times. 1365 __ lsr(count, count, 1); 1366 __ ldrb(t0, Address(s, 0)); 1367 __ ldrb(t1, Address(send, -1)); 1368 __ ldrb(t2, Address(s, count)); 1369 __ strb(t0, Address(d, 0)); 1370 __ strb(t1, Address(dend, -1)); 1371 __ strb(t2, Address(d, count)); 1372 } 1373 __ b(finish); 1374 } 1375 } 1376 1377 __ bind(copy_big); 1378 if (is_backwards) { 1379 __ lea(s, Address(s, count, Address::lsl(exact_log2(-step)))); 1380 __ lea(d, Address(d, count, Address::lsl(exact_log2(-step)))); 1381 } 1382 1383 // Now we've got the small case out of the way we can align the 1384 // source address on a 2-word boundary. 1385 1386 // Here we will materialize a count in r15, which is used by copy_memory_small 1387 // and the various generate_copy_longs stubs that we use for 2 word aligned bytes. 1388 // Up until here, we have used t9, which aliases r15, but from here on, that register 1389 // can not be used as a temp register, as it contains the count. 1390 1391 Label aligned; 1392 1393 if (is_aligned) { 1394 // We may have to adjust by 1 word to get s 2-word-aligned. 
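      // For instance, in an arrayof_jint copy (granularity == 4) where s is
      // 8-byte but not 16-byte aligned, bit 3 of s is set, so one 8-byte
      // word (two jints) is copied here and count is reduced by
      // wordSize/granularity == 2 before the bulk word copy.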
1395 __ tbz(s, exact_log2(wordSize), aligned); 1396 bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards))); 1397 bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0); 1398 __ sub(count, count, wordSize/granularity); 1399 } else { 1400 if (is_backwards) { 1401 __ andr(r15, s, 2 * wordSize - 1); 1402 } else { 1403 __ neg(r15, s); 1404 __ andr(r15, r15, 2 * wordSize - 1); 1405 } 1406 // r15 is the byte adjustment needed to align s. 1407 __ cbz(r15, aligned); 1408 int shift = exact_log2(granularity); 1409 if (shift > 0) { 1410 __ lsr(r15, r15, shift); 1411 } 1412 __ sub(count, count, r15); 1413 1414 #if 0 1415 // ?? This code is only correct for a disjoint copy. It may or 1416 // may not make sense to use it in that case. 1417 1418 // Copy the first pair; s and d may not be aligned. 1419 __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0)); 1420 __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0)); 1421 1422 // Align s and d, adjust count 1423 if (is_backwards) { 1424 __ sub(s, s, r15); 1425 __ sub(d, d, r15); 1426 } else { 1427 __ add(s, s, r15); 1428 __ add(d, d, r15); 1429 } 1430 #else 1431 copy_memory_small(decorators, type, s, d, r15, step); 1432 #endif 1433 } 1434 1435 __ bind(aligned); 1436 1437 // s is now 2-word-aligned. 1438 1439 // We have a count of units and some trailing bytes. Adjust the 1440 // count and do a bulk copy of words. If the shift is zero 1441 // perform a move instead to benefit from zero latency moves. 1442 int shift = exact_log2(wordSize/granularity); 1443 if (shift > 0) { 1444 __ lsr(r15, count, shift); 1445 } else { 1446 __ mov(r15, count); 1447 } 1448 if (direction == copy_forwards) { 1449 if (type != T_OBJECT) { 1450 __ bl(copy_f); 1451 } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) { 1452 __ bl(copy_obj_uninit_f); 1453 } else { 1454 __ bl(copy_obj_f); 1455 } 1456 } else { 1457 if (type != T_OBJECT) { 1458 __ bl(copy_b); 1459 } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) { 1460 __ bl(copy_obj_uninit_b); 1461 } else { 1462 __ bl(copy_obj_b); 1463 } 1464 } 1465 1466 // And the tail. 1467 copy_memory_small(decorators, type, s, d, count, step); 1468 1469 if (granularity >= 8) __ bind(copy8); 1470 if (granularity >= 4) __ bind(copy4); 1471 __ bind(finish); 1472 } 1473 1474 1475 void clobber_registers() { 1476 #ifdef ASSERT 1477 RegSet clobbered 1478 = MacroAssembler::call_clobbered_gp_registers() - rscratch1; 1479 __ mov(rscratch1, (uint64_t)0xdeadbeef); 1480 __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32); 1481 for (RegSetIterator<Register> it = clobbered.begin(); *it != noreg; ++it) { 1482 __ mov(*it, rscratch1); 1483 } 1484 #endif 1485 1486 } 1487 1488 // Scan over array at a for count oops, verifying each one. 1489 // Preserves a and count, clobbers rscratch1 and rscratch2. 
1490 void verify_oop_array (int size, Register a, Register count, Register temp) { 1491 Label loop, end; 1492 __ mov(rscratch1, a); 1493 __ mov(rscratch2, zr); 1494 __ bind(loop); 1495 __ cmp(rscratch2, count); 1496 __ br(Assembler::HS, end); 1497 if (size == wordSize) { 1498 __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size)))); 1499 __ verify_oop(temp); 1500 } else { 1501 __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size)))); 1502 __ decode_heap_oop(temp); // calls verify_oop 1503 } 1504 __ add(rscratch2, rscratch2, 1); 1505 __ b(loop); 1506 __ bind(end); 1507 } 1508 1509 // Arguments: 1510 // stub_id - is used to name the stub and identify all details of 1511 // how to perform the copy. 1512 // 1513 // entry - is assigned to the stub's post push entry point unless 1514 // it is null 1515 // 1516 // Inputs: 1517 // c_rarg0 - source array address 1518 // c_rarg1 - destination array address 1519 // c_rarg2 - element count, treated as ssize_t, can be zero 1520 // 1521 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1522 // the hardware handle it. The two dwords within qwords that span 1523 // cache line boundaries will still be loaded and stored atomically. 1524 // 1525 // Side Effects: entry is set to the (post push) entry point so it 1526 // can be used by the corresponding conjoint copy 1527 // method 1528 // 1529 address generate_disjoint_copy(StubGenStubId stub_id, address *entry) { 1530 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 1531 RegSet saved_reg = RegSet::of(s, d, count); 1532 int size; 1533 bool aligned; 1534 bool is_oop; 1535 bool dest_uninitialized; 1536 switch (stub_id) { 1537 case jbyte_disjoint_arraycopy_id: 1538 size = sizeof(jbyte); 1539 aligned = false; 1540 is_oop = false; 1541 dest_uninitialized = false; 1542 break; 1543 case arrayof_jbyte_disjoint_arraycopy_id: 1544 size = sizeof(jbyte); 1545 aligned = true; 1546 is_oop = false; 1547 dest_uninitialized = false; 1548 break; 1549 case jshort_disjoint_arraycopy_id: 1550 size = sizeof(jshort); 1551 aligned = false; 1552 is_oop = false; 1553 dest_uninitialized = false; 1554 break; 1555 case arrayof_jshort_disjoint_arraycopy_id: 1556 size = sizeof(jshort); 1557 aligned = true; 1558 is_oop = false; 1559 dest_uninitialized = false; 1560 break; 1561 case jint_disjoint_arraycopy_id: 1562 size = sizeof(jint); 1563 aligned = false; 1564 is_oop = false; 1565 dest_uninitialized = false; 1566 break; 1567 case arrayof_jint_disjoint_arraycopy_id: 1568 size = sizeof(jint); 1569 aligned = true; 1570 is_oop = false; 1571 dest_uninitialized = false; 1572 break; 1573 case jlong_disjoint_arraycopy_id: 1574 // since this is always aligned we can (should!) use the same 1575 // stub as for case arrayof_jlong_disjoint_arraycopy 1576 ShouldNotReachHere(); 1577 break; 1578 case arrayof_jlong_disjoint_arraycopy_id: 1579 size = sizeof(jlong); 1580 aligned = true; 1581 is_oop = false; 1582 dest_uninitialized = false; 1583 break; 1584 case oop_disjoint_arraycopy_id: 1585 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1586 aligned = !UseCompressedOops; 1587 is_oop = true; 1588 dest_uninitialized = false; 1589 break; 1590 case arrayof_oop_disjoint_arraycopy_id: 1591 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1592 aligned = !UseCompressedOops; 1593 is_oop = true; 1594 dest_uninitialized = false; 1595 break; 1596 case oop_disjoint_arraycopy_uninit_id: 1597 size = UseCompressedOops ? 
sizeof (jint) : sizeof (jlong); 1598 aligned = !UseCompressedOops; 1599 is_oop = true; 1600 dest_uninitialized = true; 1601 break; 1602 case arrayof_oop_disjoint_arraycopy_uninit_id: 1603 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1604 aligned = !UseCompressedOops; 1605 is_oop = true; 1606 dest_uninitialized = true; 1607 break; 1608 default: 1609 ShouldNotReachHere(); 1610 break; 1611 } 1612 1613 __ align(CodeEntryAlignment); 1614 StubCodeMark mark(this, stub_id); 1615 address start = __ pc(); 1616 __ enter(); 1617 1618 if (entry != nullptr) { 1619 *entry = __ pc(); 1620 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 1621 BLOCK_COMMENT("Entry:"); 1622 } 1623 1624 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT; 1625 if (dest_uninitialized) { 1626 decorators |= IS_DEST_UNINITIALIZED; 1627 } 1628 if (aligned) { 1629 decorators |= ARRAYCOPY_ALIGNED; 1630 } 1631 1632 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1633 bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg); 1634 1635 if (is_oop) { 1636 // save regs before copy_memory 1637 __ push(RegSet::of(d, count), sp); 1638 } 1639 { 1640 // UnsafeMemoryAccess page error: continue after unsafe access 1641 bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size); 1642 UnsafeMemoryAccessMark umam(this, add_entry, true); 1643 copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, size); 1644 } 1645 1646 if (is_oop) { 1647 __ pop(RegSet::of(d, count), sp); 1648 if (VerifyOops) 1649 verify_oop_array(size, d, count, r16); 1650 } 1651 1652 bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet()); 1653 1654 __ leave(); 1655 __ mov(r0, zr); // return 0 1656 __ ret(lr); 1657 return start; 1658 } 1659 1660 // Arguments: 1661 // stub_id - is used to name the stub and identify all details of 1662 // how to perform the copy. 1663 // 1664 // nooverlap_target - identifes the (post push) entry for the 1665 // corresponding disjoint copy routine which can be 1666 // jumped to if the ranges do not actually overlap 1667 // 1668 // entry - is assigned to the stub's post push entry point unless 1669 // it is null 1670 // 1671 // 1672 // Inputs: 1673 // c_rarg0 - source array address 1674 // c_rarg1 - destination array address 1675 // c_rarg2 - element count, treated as ssize_t, can be zero 1676 // 1677 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1678 // the hardware handle it. The two dwords within qwords that span 1679 // cache line boundaries will still be loaded and stored atomically. 
1680 // 1681 // Side Effects: 1682 // entry is set to the no-overlap entry point so it can be used by 1683 // some other conjoint copy method 1684 // 1685 address generate_conjoint_copy(StubGenStubId stub_id, address nooverlap_target, address *entry) { 1686 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 1687 RegSet saved_regs = RegSet::of(s, d, count); 1688 int size; 1689 bool aligned; 1690 bool is_oop; 1691 bool dest_uninitialized; 1692 switch (stub_id) { 1693 case jbyte_arraycopy_id: 1694 size = sizeof(jbyte); 1695 aligned = false; 1696 is_oop = false; 1697 dest_uninitialized = false; 1698 break; 1699 case arrayof_jbyte_arraycopy_id: 1700 size = sizeof(jbyte); 1701 aligned = true; 1702 is_oop = false; 1703 dest_uninitialized = false; 1704 break; 1705 case jshort_arraycopy_id: 1706 size = sizeof(jshort); 1707 aligned = false; 1708 is_oop = false; 1709 dest_uninitialized = false; 1710 break; 1711 case arrayof_jshort_arraycopy_id: 1712 size = sizeof(jshort); 1713 aligned = true; 1714 is_oop = false; 1715 dest_uninitialized = false; 1716 break; 1717 case jint_arraycopy_id: 1718 size = sizeof(jint); 1719 aligned = false; 1720 is_oop = false; 1721 dest_uninitialized = false; 1722 break; 1723 case arrayof_jint_arraycopy_id: 1724 size = sizeof(jint); 1725 aligned = true; 1726 is_oop = false; 1727 dest_uninitialized = false; 1728 break; 1729 case jlong_arraycopy_id: 1730 // since this is always aligned we can (should!) use the same 1731 // stub as for case arrayof_jlong_disjoint_arraycopy 1732 ShouldNotReachHere(); 1733 break; 1734 case arrayof_jlong_arraycopy_id: 1735 size = sizeof(jlong); 1736 aligned = true; 1737 is_oop = false; 1738 dest_uninitialized = false; 1739 break; 1740 case oop_arraycopy_id: 1741 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1742 aligned = !UseCompressedOops; 1743 is_oop = true; 1744 dest_uninitialized = false; 1745 break; 1746 case arrayof_oop_arraycopy_id: 1747 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1748 aligned = !UseCompressedOops; 1749 is_oop = true; 1750 dest_uninitialized = false; 1751 break; 1752 case oop_arraycopy_uninit_id: 1753 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1754 aligned = !UseCompressedOops; 1755 is_oop = true; 1756 dest_uninitialized = true; 1757 break; 1758 case arrayof_oop_arraycopy_uninit_id: 1759 size = UseCompressedOops ? 
sizeof (jint) : sizeof (jlong); 1760 aligned = !UseCompressedOops; 1761 is_oop = true; 1762 dest_uninitialized = true; 1763 break; 1764 default: 1765 ShouldNotReachHere(); 1766 } 1767 1768 StubCodeMark mark(this, stub_id); 1769 address start = __ pc(); 1770 __ enter(); 1771 1772 if (entry != nullptr) { 1773 *entry = __ pc(); 1774 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 1775 BLOCK_COMMENT("Entry:"); 1776 } 1777 1778 // use fwd copy when (d-s) above_equal (count*size) 1779 __ sub(rscratch1, d, s); 1780 __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size)); 1781 __ br(Assembler::HS, nooverlap_target); 1782 1783 DecoratorSet decorators = IN_HEAP | IS_ARRAY; 1784 if (dest_uninitialized) { 1785 decorators |= IS_DEST_UNINITIALIZED; 1786 } 1787 if (aligned) { 1788 decorators |= ARRAYCOPY_ALIGNED; 1789 } 1790 1791 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1792 bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs); 1793 1794 if (is_oop) { 1795 // save regs before copy_memory 1796 __ push(RegSet::of(d, count), sp); 1797 } 1798 { 1799 // UnsafeMemoryAccess page error: continue after unsafe access 1800 bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size); 1801 UnsafeMemoryAccessMark umam(this, add_entry, true); 1802 copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, -size); 1803 } 1804 if (is_oop) { 1805 __ pop(RegSet::of(d, count), sp); 1806 if (VerifyOops) 1807 verify_oop_array(size, d, count, r16); 1808 } 1809 bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet()); 1810 __ leave(); 1811 __ mov(r0, zr); // return 0 1812 __ ret(lr); 1813 return start; 1814 } 1815 1816 // Helper for generating a dynamic type check. 1817 // Smashes rscratch1, rscratch2. 1818 void generate_type_check(Register sub_klass, 1819 Register super_check_offset, 1820 Register super_klass, 1821 Register temp1, 1822 Register temp2, 1823 Register result, 1824 Label& L_success) { 1825 assert_different_registers(sub_klass, super_check_offset, super_klass); 1826 1827 BLOCK_COMMENT("type_check:"); 1828 1829 Label L_miss; 1830 1831 __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, nullptr, 1832 super_check_offset); 1833 __ check_klass_subtype_slow_path(sub_klass, super_klass, temp1, temp2, &L_success, nullptr); 1834 1835 // Fall through on failure! 
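    // (Both the fast-path and the slow-path subtype checks above branch
    //  directly to L_success when the test passes; control reaches L_miss
    //  below only after every check has failed.)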
1836 __ BIND(L_miss); 1837 } 1838 1839 // 1840 // Generate checkcasting array copy stub 1841 // 1842 // Input: 1843 // c_rarg0 - source array address 1844 // c_rarg1 - destination array address 1845 // c_rarg2 - element count, treated as ssize_t, can be zero 1846 // c_rarg3 - size_t ckoff (super_check_offset) 1847 // c_rarg4 - oop ckval (super_klass) 1848 // 1849 // Output: 1850 // r0 == 0 - success 1851 // r0 == -1^K - failure, where K is partial transfer count 1852 // 1853 address generate_checkcast_copy(StubGenStubId stub_id, address *entry) { 1854 bool dest_uninitialized; 1855 switch (stub_id) { 1856 case checkcast_arraycopy_id: 1857 dest_uninitialized = false; 1858 break; 1859 case checkcast_arraycopy_uninit_id: 1860 dest_uninitialized = true; 1861 break; 1862 default: 1863 ShouldNotReachHere(); 1864 } 1865 1866 Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop; 1867 1868 // Input registers (after setup_arg_regs) 1869 const Register from = c_rarg0; // source array address 1870 const Register to = c_rarg1; // destination array address 1871 const Register count = c_rarg2; // elementscount 1872 const Register ckoff = c_rarg3; // super_check_offset 1873 const Register ckval = c_rarg4; // super_klass 1874 1875 RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4); 1876 RegSet wb_post_saved_regs = RegSet::of(count); 1877 1878 // Registers used as temps (r19, r20, r21, r22 are save-on-entry) 1879 const Register copied_oop = r22; // actual oop copied 1880 const Register count_save = r21; // orig elementscount 1881 const Register start_to = r20; // destination array start address 1882 const Register r19_klass = r19; // oop._klass 1883 1884 // Registers used as gc temps (r5, r6, r7 are save-on-call) 1885 const Register gct1 = r5, gct2 = r6, gct3 = r7; 1886 1887 //--------------------------------------------------------------- 1888 // Assembler stub will be used for this call to arraycopy 1889 // if the two arrays are subtypes of Object[] but the 1890 // destination array type is not equal to or a supertype 1891 // of the source type. Each element must be separately 1892 // checked. 1893 1894 assert_different_registers(from, to, count, ckoff, ckval, start_to, 1895 copied_oop, r19_klass, count_save); 1896 1897 __ align(CodeEntryAlignment); 1898 StubCodeMark mark(this, stub_id); 1899 address start = __ pc(); 1900 1901 __ enter(); // required for proper stackwalking of RuntimeStub frame 1902 1903 #ifdef ASSERT 1904 // caller guarantees that the arrays really are different 1905 // otherwise, we would have to make conjoint checks 1906 { Label L; 1907 __ b(L); // conjoint check not yet implemented 1908 __ stop("checkcast_copy within a single array"); 1909 __ bind(L); 1910 } 1911 #endif //ASSERT 1912 1913 // Caller of this entry point must set up the argument registers. 1914 if (entry != nullptr) { 1915 *entry = __ pc(); 1916 BLOCK_COMMENT("Entry:"); 1917 } 1918 1919 // Empty array: Nothing to do. 1920 __ cbz(count, L_done); 1921 __ push(RegSet::of(r19, r20, r21, r22), sp); 1922 1923 #ifdef ASSERT 1924 BLOCK_COMMENT("assert consistent ckoff/ckval"); 1925 // The ckoff and ckval must be mutually consistent, 1926 // even though caller generates both. 
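    // (Concretely: ckoff passed in c_rarg3 must equal ckval->super_check_offset();
    //  the code below reloads that field from the klass and compares the two.)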
1927 { Label L; 1928 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 1929 __ ldrw(start_to, Address(ckval, sco_offset)); 1930 __ cmpw(ckoff, start_to); 1931 __ br(Assembler::EQ, L); 1932 __ stop("super_check_offset inconsistent"); 1933 __ bind(L); 1934 } 1935 #endif //ASSERT 1936 1937 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT; 1938 bool is_oop = true; 1939 int element_size = UseCompressedOops ? 4 : 8; 1940 if (dest_uninitialized) { 1941 decorators |= IS_DEST_UNINITIALIZED; 1942 } 1943 1944 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1945 bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs); 1946 1947 // save the original count 1948 __ mov(count_save, count); 1949 1950 // Copy from low to high addresses 1951 __ mov(start_to, to); // Save destination array start address 1952 __ b(L_load_element); 1953 1954 // ======== begin loop ======== 1955 // (Loop is rotated; its entry is L_load_element.) 1956 // Loop control: 1957 // for (; count != 0; count--) { 1958 // copied_oop = load_heap_oop(from++); 1959 // ... generate_type_check ...; 1960 // store_heap_oop(to++, copied_oop); 1961 // } 1962 __ align(OptoLoopAlignment); 1963 1964 __ BIND(L_store_element); 1965 bs->copy_store_at(_masm, decorators, T_OBJECT, element_size, 1966 __ post(to, element_size), copied_oop, noreg, 1967 gct1, gct2, gct3); 1968 __ sub(count, count, 1); 1969 __ cbz(count, L_do_card_marks); 1970 1971 // ======== loop entry is here ======== 1972 __ BIND(L_load_element); 1973 bs->copy_load_at(_masm, decorators, T_OBJECT, element_size, 1974 copied_oop, noreg, __ post(from, element_size), 1975 gct1); 1976 __ cbz(copied_oop, L_store_element); 1977 1978 __ load_klass(r19_klass, copied_oop);// query the object klass 1979 1980 BLOCK_COMMENT("type_check:"); 1981 generate_type_check(/*sub_klass*/r19_klass, 1982 /*super_check_offset*/ckoff, 1983 /*super_klass*/ckval, 1984 /*r_array_base*/gct1, 1985 /*temp2*/gct2, 1986 /*result*/r10, L_store_element); 1987 1988 // Fall through on failure! 1989 1990 // ======== end loop ======== 1991 1992 // It was a real error; we must depend on the caller to finish the job. 1993 // Register count = remaining oops, count_orig = total oops. 1994 // Emit GC store barriers for the oops we have copied and report 1995 // their number to the caller. 1996 1997 __ subs(count, count_save, count); // K = partially copied oop count 1998 __ eon(count, count, zr); // report (-1^K) to caller 1999 __ br(Assembler::EQ, L_done_pop); 2000 2001 __ BIND(L_do_card_marks); 2002 bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs); 2003 2004 __ bind(L_done_pop); 2005 __ pop(RegSet::of(r19, r20, r21, r22), sp); 2006 inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr); 2007 2008 __ bind(L_done); 2009 __ mov(r0, count); 2010 __ leave(); 2011 __ ret(lr); 2012 2013 return start; 2014 } 2015 2016 // Perform range checks on the proposed arraycopy. 2017 // Kills temp, but nothing else. 2018 // Also, clean the sign bits of src_pos and dst_pos. 
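  //
  // In pseudocode (illustration only; all compares are unsigned 32-bit):
  //
  //   if ((uint32_t)(src_pos + length) > (uint32_t)src->length())  goto L_failed;
  //   if ((uint32_t)(dst_pos + length) > (uint32_t)dst->length())  goto L_failed;
  //   src_pos = (uint32_t)src_pos;   // the 32-bit moves below clear the upper halves
  //   dst_pos = (uint32_t)dst_pos;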
  void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
                              Register src_pos, // source position (c_rarg1)
                              Register dst,     // destination array oop (c_rarg2)
                              Register dst_pos, // destination position (c_rarg3)
                              Register length,
                              Register temp,
                              Label& L_failed) {
    BLOCK_COMMENT("arraycopy_range_checks:");

    assert_different_registers(rscratch1, temp);

    // if (src_pos + length > arrayOop(src)->length()) FAIL;
    __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
    __ addw(temp, length, src_pos);
    __ cmpw(temp, rscratch1);
    __ br(Assembler::HI, L_failed);

    // if (dst_pos + length > arrayOop(dst)->length()) FAIL;
    __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
    __ addw(temp, length, dst_pos);
    __ cmpw(temp, rscratch1);
    __ br(Assembler::HI, L_failed);

    // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
    __ movw(src_pos, src_pos);
    __ movw(dst_pos, dst_pos);

    BLOCK_COMMENT("arraycopy_range_checks done");
  }

  // These stubs get called from some dumb test routine.
  // I'll write them properly when they're called from
  // something that's actually doing something.
  static void fake_arraycopy_stub(address src, address dst, int count) {
    assert(count == 0, "huh?");
  }


  //
  // Generate 'unsafe' array copy stub
  // Though just as safe as the other stubs, it takes an unscaled
  // size_t argument instead of an element count.
  //
  // Input:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - byte count, treated as ssize_t, can be zero
  //
  // Examines the alignment of the operands and dispatches
  // to a long, int, short, or byte copy loop.
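  //
  // Dispatch sketch (illustration only, not generated code):
  //
  //   bits = (size_t)s | (size_t)d | count;          // common misalignment
  //   if      ((bits & 7) == 0) goto long_copy;      // everything 8-byte aligned
  //   else if ((bits & 3) == 0) goto int_copy;
  //   else if ((bits & 1) == 0) goto short_copy;
  //   else                      goto byte_copy;
  //
  // For the short/int/long cases the byte count in c_rarg2 is first scaled
  // down to an element count before the tail-branch.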
2069 // 2070 address generate_unsafe_copy(address byte_copy_entry, 2071 address short_copy_entry, 2072 address int_copy_entry, 2073 address long_copy_entry) { 2074 StubGenStubId stub_id = StubGenStubId::unsafe_arraycopy_id; 2075 2076 Label L_long_aligned, L_int_aligned, L_short_aligned; 2077 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 2078 2079 __ align(CodeEntryAlignment); 2080 StubCodeMark mark(this, stub_id); 2081 address start = __ pc(); 2082 __ enter(); // required for proper stackwalking of RuntimeStub frame 2083 2084 // bump this on entry, not on exit: 2085 inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr); 2086 2087 __ orr(rscratch1, s, d); 2088 __ orr(rscratch1, rscratch1, count); 2089 2090 __ andr(rscratch1, rscratch1, BytesPerLong-1); 2091 __ cbz(rscratch1, L_long_aligned); 2092 __ andr(rscratch1, rscratch1, BytesPerInt-1); 2093 __ cbz(rscratch1, L_int_aligned); 2094 __ tbz(rscratch1, 0, L_short_aligned); 2095 __ b(RuntimeAddress(byte_copy_entry)); 2096 2097 __ BIND(L_short_aligned); 2098 __ lsr(count, count, LogBytesPerShort); // size => short_count 2099 __ b(RuntimeAddress(short_copy_entry)); 2100 __ BIND(L_int_aligned); 2101 __ lsr(count, count, LogBytesPerInt); // size => int_count 2102 __ b(RuntimeAddress(int_copy_entry)); 2103 __ BIND(L_long_aligned); 2104 __ lsr(count, count, LogBytesPerLong); // size => long_count 2105 __ b(RuntimeAddress(long_copy_entry)); 2106 2107 return start; 2108 } 2109 2110 // 2111 // Generate generic array copy stubs 2112 // 2113 // Input: 2114 // c_rarg0 - src oop 2115 // c_rarg1 - src_pos (32-bits) 2116 // c_rarg2 - dst oop 2117 // c_rarg3 - dst_pos (32-bits) 2118 // c_rarg4 - element count (32-bits) 2119 // 2120 // Output: 2121 // r0 == 0 - success 2122 // r0 == -1^K - failure, where K is partial transfer count 2123 // 2124 address generate_generic_copy(address byte_copy_entry, address short_copy_entry, 2125 address int_copy_entry, address oop_copy_entry, 2126 address long_copy_entry, address checkcast_copy_entry) { 2127 StubGenStubId stub_id = StubGenStubId::generic_arraycopy_id; 2128 2129 Label L_failed, L_objArray; 2130 Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs; 2131 2132 // Input registers 2133 const Register src = c_rarg0; // source array oop 2134 const Register src_pos = c_rarg1; // source position 2135 const Register dst = c_rarg2; // destination array oop 2136 const Register dst_pos = c_rarg3; // destination position 2137 const Register length = c_rarg4; 2138 2139 2140 // Registers used as temps 2141 const Register dst_klass = c_rarg5; 2142 2143 __ align(CodeEntryAlignment); 2144 2145 StubCodeMark mark(this, stub_id); 2146 2147 address start = __ pc(); 2148 2149 __ enter(); // required for proper stackwalking of RuntimeStub frame 2150 2151 // bump this on entry, not on exit: 2152 inc_counter_np(SharedRuntime::_generic_array_copy_ctr); 2153 2154 //----------------------------------------------------------------------- 2155 // Assembler stub will be used for this call to arraycopy 2156 // if the following conditions are met: 2157 // 2158 // (1) src and dst must not be null. 2159 // (2) src_pos must not be negative. 2160 // (3) dst_pos must not be negative. 2161 // (4) length must not be negative. 2162 // (5) src klass and dst klass should be the same and not null. 2163 // (6) src and dst should be arrays. 2164 // (7) src_pos + length must not exceed length of src. 2165 // (8) dst_pos + length must not exceed length of dst. 
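  // If any of these checks fails, the stub returns -1 in r0 without copying
  // anything; otherwise it tail-branches to one of the type-specific entry
  // points passed in as arguments, which perform the actual copy and set
  // the final return value.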
2166 // 2167 2168 // if (src == nullptr) return -1; 2169 __ cbz(src, L_failed); 2170 2171 // if (src_pos < 0) return -1; 2172 __ tbnz(src_pos, 31, L_failed); // i.e. sign bit set 2173 2174 // if (dst == nullptr) return -1; 2175 __ cbz(dst, L_failed); 2176 2177 // if (dst_pos < 0) return -1; 2178 __ tbnz(dst_pos, 31, L_failed); // i.e. sign bit set 2179 2180 // registers used as temp 2181 const Register scratch_length = r16; // elements count to copy 2182 const Register scratch_src_klass = r17; // array klass 2183 const Register lh = r15; // layout helper 2184 2185 // if (length < 0) return -1; 2186 __ movw(scratch_length, length); // length (elements count, 32-bits value) 2187 __ tbnz(scratch_length, 31, L_failed); // i.e. sign bit set 2188 2189 __ load_klass(scratch_src_klass, src); 2190 #ifdef ASSERT 2191 // assert(src->klass() != nullptr); 2192 { 2193 BLOCK_COMMENT("assert klasses not null {"); 2194 Label L1, L2; 2195 __ cbnz(scratch_src_klass, L2); // it is broken if klass is null 2196 __ bind(L1); 2197 __ stop("broken null klass"); 2198 __ bind(L2); 2199 __ load_klass(rscratch1, dst); 2200 __ cbz(rscratch1, L1); // this would be broken also 2201 BLOCK_COMMENT("} assert klasses not null done"); 2202 } 2203 #endif 2204 2205 // Load layout helper (32-bits) 2206 // 2207 // |array_tag| | header_size | element_type | |log2_element_size| 2208 // 32 30 24 16 8 2 0 2209 // 2210 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0 2211 // 2212 2213 const int lh_offset = in_bytes(Klass::layout_helper_offset()); 2214 2215 // Handle objArrays completely differently... 2216 const jint objArray_lh = Klass::array_layout_helper(T_OBJECT); 2217 __ ldrw(lh, Address(scratch_src_klass, lh_offset)); 2218 __ movw(rscratch1, objArray_lh); 2219 __ eorw(rscratch2, lh, rscratch1); 2220 __ cbzw(rscratch2, L_objArray); 2221 2222 // if (src->klass() != dst->klass()) return -1; 2223 __ load_klass(rscratch2, dst); 2224 __ eor(rscratch2, rscratch2, scratch_src_klass); 2225 __ cbnz(rscratch2, L_failed); 2226 2227 // if (!src->is_Array()) return -1; 2228 __ tbz(lh, 31, L_failed); // i.e. (lh >= 0) 2229 2230 // At this point, it is known to be a typeArray (array_tag 0x3). 
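    // The copy addresses are then formed from the layout helper as follows
    // (sketch, using the field layout shown above):
    //
    //   header = (lh >> _lh_header_size_shift) & _lh_header_size_mask;  // bytes
    //   elsize = lh & _lh_log2_element_size_mask;                       // log2(bytes)
    //   from   = src + header + (src_pos << elsize);
    //   to     = dst + header + (dst_pos << elsize);
    //
    // with elsize selecting the byte/short/int/long copy stub below.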
2231 #ifdef ASSERT 2232 { 2233 BLOCK_COMMENT("assert primitive array {"); 2234 Label L; 2235 __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift); 2236 __ cmpw(lh, rscratch2); 2237 __ br(Assembler::GE, L); 2238 __ stop("must be a primitive array"); 2239 __ bind(L); 2240 BLOCK_COMMENT("} assert primitive array done"); 2241 } 2242 #endif 2243 2244 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2245 rscratch2, L_failed); 2246 2247 // TypeArrayKlass 2248 // 2249 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize); 2250 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize); 2251 // 2252 2253 const Register rscratch1_offset = rscratch1; // array offset 2254 const Register r15_elsize = lh; // element size 2255 2256 __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift, 2257 exact_log2(Klass::_lh_header_size_mask+1)); // array_offset 2258 __ add(src, src, rscratch1_offset); // src array offset 2259 __ add(dst, dst, rscratch1_offset); // dst array offset 2260 BLOCK_COMMENT("choose copy loop based on element size"); 2261 2262 // next registers should be set before the jump to corresponding stub 2263 const Register from = c_rarg0; // source array address 2264 const Register to = c_rarg1; // destination array address 2265 const Register count = c_rarg2; // elements count 2266 2267 // 'from', 'to', 'count' registers should be set in such order 2268 // since they are the same as 'src', 'src_pos', 'dst'. 2269 2270 assert(Klass::_lh_log2_element_size_shift == 0, "fix this code"); 2271 2272 // The possible values of elsize are 0-3, i.e. exact_log2(element 2273 // size in bytes). We do a simple bitwise binary search. 2274 __ BIND(L_copy_bytes); 2275 __ tbnz(r15_elsize, 1, L_copy_ints); 2276 __ tbnz(r15_elsize, 0, L_copy_shorts); 2277 __ lea(from, Address(src, src_pos));// src_addr 2278 __ lea(to, Address(dst, dst_pos));// dst_addr 2279 __ movw(count, scratch_length); // length 2280 __ b(RuntimeAddress(byte_copy_entry)); 2281 2282 __ BIND(L_copy_shorts); 2283 __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr 2284 __ lea(to, Address(dst, dst_pos, Address::lsl(1)));// dst_addr 2285 __ movw(count, scratch_length); // length 2286 __ b(RuntimeAddress(short_copy_entry)); 2287 2288 __ BIND(L_copy_ints); 2289 __ tbnz(r15_elsize, 0, L_copy_longs); 2290 __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr 2291 __ lea(to, Address(dst, dst_pos, Address::lsl(2)));// dst_addr 2292 __ movw(count, scratch_length); // length 2293 __ b(RuntimeAddress(int_copy_entry)); 2294 2295 __ BIND(L_copy_longs); 2296 #ifdef ASSERT 2297 { 2298 BLOCK_COMMENT("assert long copy {"); 2299 Label L; 2300 __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize 2301 __ cmpw(r15_elsize, LogBytesPerLong); 2302 __ br(Assembler::EQ, L); 2303 __ stop("must be long copy, but elsize is wrong"); 2304 __ bind(L); 2305 BLOCK_COMMENT("} assert long copy done"); 2306 } 2307 #endif 2308 __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr 2309 __ lea(to, Address(dst, dst_pos, Address::lsl(3)));// dst_addr 2310 __ movw(count, scratch_length); // length 2311 __ b(RuntimeAddress(long_copy_entry)); 2312 2313 // ObjArrayKlass 2314 __ BIND(L_objArray); 2315 // live at this point: scratch_src_klass, scratch_length, src[_pos], dst[_pos] 2316 2317 Label L_plain_copy, L_checkcast_copy; 2318 // test array classes for subtyping 2319 __ load_klass(r15, dst); 2320 __ cmp(scratch_src_klass, r15); // usual case is exact 
equality 2321 __ br(Assembler::NE, L_checkcast_copy); 2322 2323 // Identically typed arrays can be copied without element-wise checks. 2324 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2325 rscratch2, L_failed); 2326 2327 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2328 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2329 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2330 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2331 __ movw(count, scratch_length); // length 2332 __ BIND(L_plain_copy); 2333 __ b(RuntimeAddress(oop_copy_entry)); 2334 2335 __ BIND(L_checkcast_copy); 2336 // live at this point: scratch_src_klass, scratch_length, r15 (dst_klass) 2337 { 2338 // Before looking at dst.length, make sure dst is also an objArray. 2339 __ ldrw(rscratch1, Address(r15, lh_offset)); 2340 __ movw(rscratch2, objArray_lh); 2341 __ eorw(rscratch1, rscratch1, rscratch2); 2342 __ cbnzw(rscratch1, L_failed); 2343 2344 // It is safe to examine both src.length and dst.length. 2345 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2346 r15, L_failed); 2347 2348 __ load_klass(dst_klass, dst); // reload 2349 2350 // Marshal the base address arguments now, freeing registers. 2351 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2352 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2353 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2354 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2355 __ movw(count, length); // length (reloaded) 2356 Register sco_temp = c_rarg3; // this register is free now 2357 assert_different_registers(from, to, count, sco_temp, 2358 dst_klass, scratch_src_klass); 2359 // assert_clean_int(count, sco_temp); 2360 2361 // Generate the type check. 2362 const int sco_offset = in_bytes(Klass::super_check_offset_offset()); 2363 __ ldrw(sco_temp, Address(dst_klass, sco_offset)); 2364 2365 // Smashes rscratch1, rscratch2 2366 generate_type_check(scratch_src_klass, sco_temp, dst_klass, /*temps*/ noreg, noreg, noreg, 2367 L_plain_copy); 2368 2369 // Fetch destination element klass from the ObjArrayKlass header. 2370 int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset()); 2371 __ ldr(dst_klass, Address(dst_klass, ek_offset)); 2372 __ ldrw(sco_temp, Address(dst_klass, sco_offset)); 2373 2374 // the checkcast_copy loop needs two extra arguments: 2375 assert(c_rarg3 == sco_temp, "#3 already in place"); 2376 // Set up arguments for checkcast_copy_entry. 2377 __ mov(c_rarg4, dst_klass); // dst.klass.element_klass 2378 __ b(RuntimeAddress(checkcast_copy_entry)); 2379 } 2380 2381 __ BIND(L_failed); 2382 __ mov(r0, -1); 2383 __ leave(); // required for proper stackwalking of RuntimeStub frame 2384 __ ret(lr); 2385 2386 return start; 2387 } 2388 2389 // 2390 // Generate stub for array fill. If "aligned" is true, the 2391 // "to" address is assumed to be heapword aligned. 
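  //
  // The generated stub replicates the fill value up to 64 bits, aligns the
  // destination to an 8-byte boundary when it is not already known to be
  // aligned, fills whole 8-byte words via fill_words() (or zero_words() when
  // the value is zero and UseBlockZeroing is enabled), and finishes any
  // remaining tail either with one overlapping 8-byte store (byte/short
  // fills) or with a final element store (int fills).  Arrays shorter than
  // 8 bytes in total are filled element by element instead.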
2392 // 2393 // Arguments for generated stub: 2394 // to: c_rarg0 2395 // value: c_rarg1 2396 // count: c_rarg2 treated as signed 2397 // 2398 address generate_fill(StubGenStubId stub_id) { 2399 BasicType t; 2400 bool aligned; 2401 2402 switch (stub_id) { 2403 case jbyte_fill_id: 2404 t = T_BYTE; 2405 aligned = false; 2406 break; 2407 case jshort_fill_id: 2408 t = T_SHORT; 2409 aligned = false; 2410 break; 2411 case jint_fill_id: 2412 t = T_INT; 2413 aligned = false; 2414 break; 2415 case arrayof_jbyte_fill_id: 2416 t = T_BYTE; 2417 aligned = true; 2418 break; 2419 case arrayof_jshort_fill_id: 2420 t = T_SHORT; 2421 aligned = true; 2422 break; 2423 case arrayof_jint_fill_id: 2424 t = T_INT; 2425 aligned = true; 2426 break; 2427 default: 2428 ShouldNotReachHere(); 2429 }; 2430 2431 __ align(CodeEntryAlignment); 2432 StubCodeMark mark(this, stub_id); 2433 address start = __ pc(); 2434 2435 BLOCK_COMMENT("Entry:"); 2436 2437 const Register to = c_rarg0; // source array address 2438 const Register value = c_rarg1; // value 2439 const Register count = c_rarg2; // elements count 2440 2441 const Register bz_base = r10; // base for block_zero routine 2442 const Register cnt_words = r11; // temp register 2443 2444 __ enter(); 2445 2446 Label L_fill_elements, L_exit1; 2447 2448 int shift = -1; 2449 switch (t) { 2450 case T_BYTE: 2451 shift = 0; 2452 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2453 __ bfi(value, value, 8, 8); // 8 bit -> 16 bit 2454 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit 2455 __ br(Assembler::LO, L_fill_elements); 2456 break; 2457 case T_SHORT: 2458 shift = 1; 2459 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2460 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit 2461 __ br(Assembler::LO, L_fill_elements); 2462 break; 2463 case T_INT: 2464 shift = 2; 2465 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2466 __ br(Assembler::LO, L_fill_elements); 2467 break; 2468 default: ShouldNotReachHere(); 2469 } 2470 2471 // Align source address at 8 bytes address boundary. 2472 Label L_skip_align1, L_skip_align2, L_skip_align4; 2473 if (!aligned) { 2474 switch (t) { 2475 case T_BYTE: 2476 // One byte misalignment happens only for byte arrays. 2477 __ tbz(to, 0, L_skip_align1); 2478 __ strb(value, Address(__ post(to, 1))); 2479 __ subw(count, count, 1); 2480 __ bind(L_skip_align1); 2481 // Fallthrough 2482 case T_SHORT: 2483 // Two bytes misalignment happens only for byte and short (char) arrays. 2484 __ tbz(to, 1, L_skip_align2); 2485 __ strh(value, Address(__ post(to, 2))); 2486 __ subw(count, count, 2 >> shift); 2487 __ bind(L_skip_align2); 2488 // Fallthrough 2489 case T_INT: 2490 // Align to 8 bytes, we know we are 4 byte aligned to start. 2491 __ tbz(to, 2, L_skip_align4); 2492 __ strw(value, Address(__ post(to, 4))); 2493 __ subw(count, count, 4 >> shift); 2494 __ bind(L_skip_align4); 2495 break; 2496 default: ShouldNotReachHere(); 2497 } 2498 } 2499 2500 // 2501 // Fill large chunks 2502 // 2503 __ lsrw(cnt_words, count, 3 - shift); // number of words 2504 __ bfi(value, value, 32, 32); // 32 bit -> 64 bit 2505 __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift); 2506 if (UseBlockZeroing) { 2507 Label non_block_zeroing, rest; 2508 // If the fill value is zero we can use the fast zero_words(). 
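      // (At this point 'value' already holds the replicated 64-bit pattern,
      //  so it is zero exactly when the original fill value was zero.)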
2509 __ cbnz(value, non_block_zeroing); 2510 __ mov(bz_base, to); 2511 __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord); 2512 address tpc = __ zero_words(bz_base, cnt_words); 2513 if (tpc == nullptr) { 2514 fatal("CodeCache is full at generate_fill"); 2515 } 2516 __ b(rest); 2517 __ bind(non_block_zeroing); 2518 __ fill_words(to, cnt_words, value); 2519 __ bind(rest); 2520 } else { 2521 __ fill_words(to, cnt_words, value); 2522 } 2523 2524 // Remaining count is less than 8 bytes. Fill it by a single store. 2525 // Note that the total length is no less than 8 bytes. 2526 if (t == T_BYTE || t == T_SHORT) { 2527 Label L_exit1; 2528 __ cbzw(count, L_exit1); 2529 __ add(to, to, count, Assembler::LSL, shift); // points to the end 2530 __ str(value, Address(to, -8)); // overwrite some elements 2531 __ bind(L_exit1); 2532 __ leave(); 2533 __ ret(lr); 2534 } 2535 2536 // Handle copies less than 8 bytes. 2537 Label L_fill_2, L_fill_4, L_exit2; 2538 __ bind(L_fill_elements); 2539 switch (t) { 2540 case T_BYTE: 2541 __ tbz(count, 0, L_fill_2); 2542 __ strb(value, Address(__ post(to, 1))); 2543 __ bind(L_fill_2); 2544 __ tbz(count, 1, L_fill_4); 2545 __ strh(value, Address(__ post(to, 2))); 2546 __ bind(L_fill_4); 2547 __ tbz(count, 2, L_exit2); 2548 __ strw(value, Address(to)); 2549 break; 2550 case T_SHORT: 2551 __ tbz(count, 0, L_fill_4); 2552 __ strh(value, Address(__ post(to, 2))); 2553 __ bind(L_fill_4); 2554 __ tbz(count, 1, L_exit2); 2555 __ strw(value, Address(to)); 2556 break; 2557 case T_INT: 2558 __ cbzw(count, L_exit2); 2559 __ strw(value, Address(to)); 2560 break; 2561 default: ShouldNotReachHere(); 2562 } 2563 __ bind(L_exit2); 2564 __ leave(); 2565 __ ret(lr); 2566 return start; 2567 } 2568 2569 address generate_data_cache_writeback() { 2570 const Register line = c_rarg0; // address of line to write back 2571 2572 __ align(CodeEntryAlignment); 2573 2574 StubGenStubId stub_id = StubGenStubId::data_cache_writeback_id; 2575 StubCodeMark mark(this, stub_id); 2576 2577 address start = __ pc(); 2578 __ enter(); 2579 __ cache_wb(Address(line, 0)); 2580 __ leave(); 2581 __ ret(lr); 2582 2583 return start; 2584 } 2585 2586 address generate_data_cache_writeback_sync() { 2587 const Register is_pre = c_rarg0; // pre or post sync 2588 2589 __ align(CodeEntryAlignment); 2590 2591 StubGenStubId stub_id = StubGenStubId::data_cache_writeback_sync_id; 2592 StubCodeMark mark(this, stub_id); 2593 2594 // pre wbsync is a no-op 2595 // post wbsync translates to an sfence 2596 2597 Label skip; 2598 address start = __ pc(); 2599 __ enter(); 2600 __ cbnz(is_pre, skip); 2601 __ cache_wbsync(false); 2602 __ bind(skip); 2603 __ leave(); 2604 __ ret(lr); 2605 2606 return start; 2607 } 2608 2609 void generate_arraycopy_stubs() { 2610 address entry; 2611 address entry_jbyte_arraycopy; 2612 address entry_jshort_arraycopy; 2613 address entry_jint_arraycopy; 2614 address entry_oop_arraycopy; 2615 address entry_jlong_arraycopy; 2616 address entry_checkcast_arraycopy; 2617 2618 generate_copy_longs(StubGenStubId::copy_byte_f_id, IN_HEAP | IS_ARRAY, copy_f, r0, r1, r15); 2619 generate_copy_longs(StubGenStubId::copy_byte_b_id, IN_HEAP | IS_ARRAY, copy_b, r0, r1, r15); 2620 2621 generate_copy_longs(StubGenStubId::copy_oop_f_id, IN_HEAP | IS_ARRAY, copy_obj_f, r0, r1, r15); 2622 generate_copy_longs(StubGenStubId::copy_oop_b_id, IN_HEAP | IS_ARRAY, copy_obj_b, r0, r1, r15); 2623 2624 generate_copy_longs(StubGenStubId::copy_oop_uninit_f_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, copy_obj_uninit_f, r0, r1, r15); 
2625 generate_copy_longs(StubGenStubId::copy_oop_uninit_b_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, copy_obj_uninit_b, r0, r1, r15); 2626 2627 StubRoutines::aarch64::_zero_blocks = generate_zero_blocks(); 2628 2629 //*** jbyte 2630 // Always need aligned and unaligned versions 2631 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::jbyte_disjoint_arraycopy_id, &entry); 2632 StubRoutines::_jbyte_arraycopy = generate_conjoint_copy(StubGenStubId::jbyte_arraycopy_id, entry, &entry_jbyte_arraycopy); 2633 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::arrayof_jbyte_disjoint_arraycopy_id, &entry); 2634 StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_copy(StubGenStubId::arrayof_jbyte_arraycopy_id, entry, nullptr); 2635 2636 //*** jshort 2637 // Always need aligned and unaligned versions 2638 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::jshort_disjoint_arraycopy_id, &entry); 2639 StubRoutines::_jshort_arraycopy = generate_conjoint_copy(StubGenStubId::jshort_arraycopy_id, entry, &entry_jshort_arraycopy); 2640 StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::arrayof_jshort_disjoint_arraycopy_id, &entry); 2641 StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_copy(StubGenStubId::arrayof_jshort_arraycopy_id, entry, nullptr); 2642 2643 //*** jint 2644 // Aligned versions 2645 StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::arrayof_jint_disjoint_arraycopy_id, &entry); 2646 StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_copy(StubGenStubId::arrayof_jint_arraycopy_id, entry, &entry_jint_arraycopy); 2647 // In 64 bit we need both aligned and unaligned versions of jint arraycopy. 2648 // entry_jint_arraycopy always points to the unaligned version 2649 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::jint_disjoint_arraycopy_id, &entry); 2650 StubRoutines::_jint_arraycopy = generate_conjoint_copy(StubGenStubId::jint_arraycopy_id, entry, &entry_jint_arraycopy); 2651 2652 //*** jlong 2653 // It is always aligned 2654 StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::arrayof_jlong_disjoint_arraycopy_id, &entry); 2655 StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_copy(StubGenStubId::arrayof_jlong_arraycopy_id, entry, &entry_jlong_arraycopy); 2656 StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy; 2657 StubRoutines::_jlong_arraycopy = StubRoutines::_arrayof_jlong_arraycopy; 2658 2659 //*** oops 2660 { 2661 // With compressed oops we need unaligned versions; notice that 2662 // we overwrite entry_oop_arraycopy. 
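      // (The switch in generate_disjoint_copy()/generate_conjoint_copy() sets
      //  aligned = !UseCompressedOops for the oop cases, so with compressed oops
      //  these "arrayof" stubs are really the unaligned, 4-byte-element versions;
      //  the plain _oop_* entry points further down simply alias them.)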
2663 bool aligned = !UseCompressedOops; 2664 2665 StubRoutines::_arrayof_oop_disjoint_arraycopy 2666 = generate_disjoint_copy(StubGenStubId::arrayof_oop_disjoint_arraycopy_id, &entry); 2667 StubRoutines::_arrayof_oop_arraycopy 2668 = generate_conjoint_copy(StubGenStubId::arrayof_oop_arraycopy_id, entry, &entry_oop_arraycopy); 2669 // Aligned versions without pre-barriers 2670 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit 2671 = generate_disjoint_copy(StubGenStubId::arrayof_oop_disjoint_arraycopy_uninit_id, &entry); 2672 StubRoutines::_arrayof_oop_arraycopy_uninit 2673 = generate_conjoint_copy(StubGenStubId::arrayof_oop_arraycopy_uninit_id, entry, nullptr); 2674 } 2675 2676 StubRoutines::_oop_disjoint_arraycopy = StubRoutines::_arrayof_oop_disjoint_arraycopy; 2677 StubRoutines::_oop_arraycopy = StubRoutines::_arrayof_oop_arraycopy; 2678 StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit; 2679 StubRoutines::_oop_arraycopy_uninit = StubRoutines::_arrayof_oop_arraycopy_uninit; 2680 2681 StubRoutines::_checkcast_arraycopy = generate_checkcast_copy(StubGenStubId::checkcast_arraycopy_id, &entry_checkcast_arraycopy); 2682 StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy(StubGenStubId::checkcast_arraycopy_uninit_id, nullptr); 2683 2684 StubRoutines::_unsafe_arraycopy = generate_unsafe_copy(entry_jbyte_arraycopy, 2685 entry_jshort_arraycopy, 2686 entry_jint_arraycopy, 2687 entry_jlong_arraycopy); 2688 2689 StubRoutines::_generic_arraycopy = generate_generic_copy(entry_jbyte_arraycopy, 2690 entry_jshort_arraycopy, 2691 entry_jint_arraycopy, 2692 entry_oop_arraycopy, 2693 entry_jlong_arraycopy, 2694 entry_checkcast_arraycopy); 2695 2696 StubRoutines::_jbyte_fill = generate_fill(StubGenStubId::jbyte_fill_id); 2697 StubRoutines::_jshort_fill = generate_fill(StubGenStubId::jshort_fill_id); 2698 StubRoutines::_jint_fill = generate_fill(StubGenStubId::jint_fill_id); 2699 StubRoutines::_arrayof_jbyte_fill = generate_fill(StubGenStubId::arrayof_jbyte_fill_id); 2700 StubRoutines::_arrayof_jshort_fill = generate_fill(StubGenStubId::arrayof_jshort_fill_id); 2701 StubRoutines::_arrayof_jint_fill = generate_fill(StubGenStubId::arrayof_jint_fill_id); 2702 } 2703 2704 void generate_math_stubs() { Unimplemented(); } 2705 2706 // Arguments: 2707 // 2708 // Inputs: 2709 // c_rarg0 - source byte array address 2710 // c_rarg1 - destination byte array address 2711 // c_rarg2 - K (key) in little endian int array 2712 // 2713 address generate_aescrypt_encryptBlock() { 2714 __ align(CodeEntryAlignment); 2715 StubGenStubId stub_id = StubGenStubId::aescrypt_encryptBlock_id; 2716 StubCodeMark mark(this, stub_id); 2717 2718 const Register from = c_rarg0; // source array address 2719 const Register to = c_rarg1; // destination array address 2720 const Register key = c_rarg2; // key array address 2721 const Register keylen = rscratch1; 2722 2723 address start = __ pc(); 2724 __ enter(); 2725 2726 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2727 2728 __ aesenc_loadkeys(key, keylen); 2729 __ aesecb_encrypt(from, to, keylen); 2730 2731 __ mov(r0, 0); 2732 2733 __ leave(); 2734 __ ret(lr); 2735 2736 return start; 2737 } 2738 2739 // Arguments: 2740 // 2741 // Inputs: 2742 // c_rarg0 - source byte array address 2743 // c_rarg1 - destination byte array address 2744 // c_rarg2 - K (key) in little endian int array 2745 // 2746 address generate_aescrypt_decryptBlock() { 2747 assert(UseAES, "need 
AES cryptographic extension support"); 2748 __ align(CodeEntryAlignment); 2749 StubGenStubId stub_id = StubGenStubId::aescrypt_decryptBlock_id; 2750 StubCodeMark mark(this, stub_id); 2751 Label L_doLast; 2752 2753 const Register from = c_rarg0; // source array address 2754 const Register to = c_rarg1; // destination array address 2755 const Register key = c_rarg2; // key array address 2756 const Register keylen = rscratch1; 2757 2758 address start = __ pc(); 2759 __ enter(); // required for proper stackwalking of RuntimeStub frame 2760 2761 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2762 2763 __ aesecb_decrypt(from, to, key, keylen); 2764 2765 __ mov(r0, 0); 2766 2767 __ leave(); 2768 __ ret(lr); 2769 2770 return start; 2771 } 2772 2773 // Arguments: 2774 // 2775 // Inputs: 2776 // c_rarg0 - source byte array address 2777 // c_rarg1 - destination byte array address 2778 // c_rarg2 - K (key) in little endian int array 2779 // c_rarg3 - r vector byte array address 2780 // c_rarg4 - input length 2781 // 2782 // Output: 2783 // x0 - input length 2784 // 2785 address generate_cipherBlockChaining_encryptAESCrypt() { 2786 assert(UseAES, "need AES cryptographic extension support"); 2787 __ align(CodeEntryAlignment); 2788 StubGenStubId stub_id = StubGenStubId::cipherBlockChaining_encryptAESCrypt_id; 2789 StubCodeMark mark(this, stub_id); 2790 2791 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 2792 2793 const Register from = c_rarg0; // source array address 2794 const Register to = c_rarg1; // destination array address 2795 const Register key = c_rarg2; // key array address 2796 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2797 // and left with the results of the last encryption block 2798 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2799 const Register keylen = rscratch1; 2800 2801 address start = __ pc(); 2802 2803 __ enter(); 2804 2805 __ movw(rscratch2, len_reg); 2806 2807 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2808 2809 __ ld1(v0, __ T16B, rvec); 2810 2811 __ cmpw(keylen, 52); 2812 __ br(Assembler::CC, L_loadkeys_44); 2813 __ br(Assembler::EQ, L_loadkeys_52); 2814 2815 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2816 __ rev32(v17, __ T16B, v17); 2817 __ rev32(v18, __ T16B, v18); 2818 __ BIND(L_loadkeys_52); 2819 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2820 __ rev32(v19, __ T16B, v19); 2821 __ rev32(v20, __ T16B, v20); 2822 __ BIND(L_loadkeys_44); 2823 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2824 __ rev32(v21, __ T16B, v21); 2825 __ rev32(v22, __ T16B, v22); 2826 __ rev32(v23, __ T16B, v23); 2827 __ rev32(v24, __ T16B, v24); 2828 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2829 __ rev32(v25, __ T16B, v25); 2830 __ rev32(v26, __ T16B, v26); 2831 __ rev32(v27, __ T16B, v27); 2832 __ rev32(v28, __ T16B, v28); 2833 __ ld1(v29, v30, v31, __ T16B, key); 2834 __ rev32(v29, __ T16B, v29); 2835 __ rev32(v30, __ T16B, v30); 2836 __ rev32(v31, __ T16B, v31); 2837 2838 __ BIND(L_aes_loop); 2839 __ ld1(v1, __ T16B, __ post(from, 16)); 2840 __ eor(v0, __ T16B, v0, v1); 2841 2842 __ br(Assembler::CC, L_rounds_44); 2843 __ br(Assembler::EQ, L_rounds_52); 2844 2845 __ aese(v0, v17); __ aesmc(v0, v0); 2846 __ aese(v0, v18); __ aesmc(v0, v0); 2847 __ BIND(L_rounds_52); 2848 __ aese(v0, v19); __ aesmc(v0, v0); 2849 __ aese(v0, v20); 
__ aesmc(v0, v0); 2850 __ BIND(L_rounds_44); 2851 __ aese(v0, v21); __ aesmc(v0, v0); 2852 __ aese(v0, v22); __ aesmc(v0, v0); 2853 __ aese(v0, v23); __ aesmc(v0, v0); 2854 __ aese(v0, v24); __ aesmc(v0, v0); 2855 __ aese(v0, v25); __ aesmc(v0, v0); 2856 __ aese(v0, v26); __ aesmc(v0, v0); 2857 __ aese(v0, v27); __ aesmc(v0, v0); 2858 __ aese(v0, v28); __ aesmc(v0, v0); 2859 __ aese(v0, v29); __ aesmc(v0, v0); 2860 __ aese(v0, v30); 2861 __ eor(v0, __ T16B, v0, v31); 2862 2863 __ st1(v0, __ T16B, __ post(to, 16)); 2864 2865 __ subw(len_reg, len_reg, 16); 2866 __ cbnzw(len_reg, L_aes_loop); 2867 2868 __ st1(v0, __ T16B, rvec); 2869 2870 __ mov(r0, rscratch2); 2871 2872 __ leave(); 2873 __ ret(lr); 2874 2875 return start; 2876 } 2877 2878 // Arguments: 2879 // 2880 // Inputs: 2881 // c_rarg0 - source byte array address 2882 // c_rarg1 - destination byte array address 2883 // c_rarg2 - K (key) in little endian int array 2884 // c_rarg3 - r vector byte array address 2885 // c_rarg4 - input length 2886 // 2887 // Output: 2888 // r0 - input length 2889 // 2890 address generate_cipherBlockChaining_decryptAESCrypt() { 2891 assert(UseAES, "need AES cryptographic extension support"); 2892 __ align(CodeEntryAlignment); 2893 StubGenStubId stub_id = StubGenStubId::cipherBlockChaining_decryptAESCrypt_id; 2894 StubCodeMark mark(this, stub_id); 2895 2896 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 2897 2898 const Register from = c_rarg0; // source array address 2899 const Register to = c_rarg1; // destination array address 2900 const Register key = c_rarg2; // key array address 2901 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2902 // and left with the results of the last encryption block 2903 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2904 const Register keylen = rscratch1; 2905 2906 address start = __ pc(); 2907 2908 __ enter(); 2909 2910 __ movw(rscratch2, len_reg); 2911 2912 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2913 2914 __ ld1(v2, __ T16B, rvec); 2915 2916 __ ld1(v31, __ T16B, __ post(key, 16)); 2917 __ rev32(v31, __ T16B, v31); 2918 2919 __ cmpw(keylen, 52); 2920 __ br(Assembler::CC, L_loadkeys_44); 2921 __ br(Assembler::EQ, L_loadkeys_52); 2922 2923 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2924 __ rev32(v17, __ T16B, v17); 2925 __ rev32(v18, __ T16B, v18); 2926 __ BIND(L_loadkeys_52); 2927 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2928 __ rev32(v19, __ T16B, v19); 2929 __ rev32(v20, __ T16B, v20); 2930 __ BIND(L_loadkeys_44); 2931 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2932 __ rev32(v21, __ T16B, v21); 2933 __ rev32(v22, __ T16B, v22); 2934 __ rev32(v23, __ T16B, v23); 2935 __ rev32(v24, __ T16B, v24); 2936 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2937 __ rev32(v25, __ T16B, v25); 2938 __ rev32(v26, __ T16B, v26); 2939 __ rev32(v27, __ T16B, v27); 2940 __ rev32(v28, __ T16B, v28); 2941 __ ld1(v29, v30, __ T16B, key); 2942 __ rev32(v29, __ T16B, v29); 2943 __ rev32(v30, __ T16B, v30); 2944 2945 __ BIND(L_aes_loop); 2946 __ ld1(v0, __ T16B, __ post(from, 16)); 2947 __ orr(v1, __ T16B, v0, v0); 2948 2949 __ br(Assembler::CC, L_rounds_44); 2950 __ br(Assembler::EQ, L_rounds_52); 2951 2952 __ aesd(v0, v17); __ aesimc(v0, v0); 2953 __ aesd(v0, v18); __ aesimc(v0, v0); 2954 __ BIND(L_rounds_52); 2955 __ aesd(v0, v19); __ aesimc(v0, v0); 2956 __ aesd(v0, v20); __ aesimc(v0, v0); 2957 
__ BIND(L_rounds_44); 2958 __ aesd(v0, v21); __ aesimc(v0, v0); 2959 __ aesd(v0, v22); __ aesimc(v0, v0); 2960 __ aesd(v0, v23); __ aesimc(v0, v0); 2961 __ aesd(v0, v24); __ aesimc(v0, v0); 2962 __ aesd(v0, v25); __ aesimc(v0, v0); 2963 __ aesd(v0, v26); __ aesimc(v0, v0); 2964 __ aesd(v0, v27); __ aesimc(v0, v0); 2965 __ aesd(v0, v28); __ aesimc(v0, v0); 2966 __ aesd(v0, v29); __ aesimc(v0, v0); 2967 __ aesd(v0, v30); 2968 __ eor(v0, __ T16B, v0, v31); 2969 __ eor(v0, __ T16B, v0, v2); 2970 2971 __ st1(v0, __ T16B, __ post(to, 16)); 2972 __ orr(v2, __ T16B, v1, v1); 2973 2974 __ subw(len_reg, len_reg, 16); 2975 __ cbnzw(len_reg, L_aes_loop); 2976 2977 __ st1(v2, __ T16B, rvec); 2978 2979 __ mov(r0, rscratch2); 2980 2981 __ leave(); 2982 __ ret(lr); 2983 2984 return start; 2985 } 2986 2987 // Big-endian 128-bit + 64-bit -> 128-bit addition. 2988 // Inputs: 128-bits. in is preserved. 2989 // The least-significant 64-bit word is in the upper dword of each vector. 2990 // inc (the 64-bit increment) is preserved. Its lower dword must be zero. 2991 // Output: result 2992 void be_add_128_64(FloatRegister result, FloatRegister in, 2993 FloatRegister inc, FloatRegister tmp) { 2994 assert_different_registers(result, tmp, inc); 2995 2996 __ addv(result, __ T2D, in, inc); // Add inc to the least-significant dword of 2997 // input 2998 __ cm(__ HI, tmp, __ T2D, inc, result);// Check for result overflowing 2999 __ ext(tmp, __ T16B, tmp, tmp, 0x08); // Swap LSD of comparison result to MSD and 3000 // MSD == 0 (must be!) to LSD 3001 __ subv(result, __ T2D, result, tmp); // Subtract -1 from MSD if there was an overflow 3002 } 3003 3004 // CTR AES crypt. 3005 // Arguments: 3006 // 3007 // Inputs: 3008 // c_rarg0 - source byte array address 3009 // c_rarg1 - destination byte array address 3010 // c_rarg2 - K (key) in little endian int array 3011 // c_rarg3 - counter vector byte array address 3012 // c_rarg4 - input length 3013 // c_rarg5 - saved encryptedCounter start 3014 // c_rarg6 - saved used length 3015 // 3016 // Output: 3017 // r0 - input length 3018 // 3019 address generate_counterMode_AESCrypt() { 3020 const Register in = c_rarg0; 3021 const Register out = c_rarg1; 3022 const Register key = c_rarg2; 3023 const Register counter = c_rarg3; 3024 const Register saved_len = c_rarg4, len = r10; 3025 const Register saved_encrypted_ctr = c_rarg5; 3026 const Register used_ptr = c_rarg6, used = r12; 3027 3028 const Register offset = r7; 3029 const Register keylen = r11; 3030 3031 const unsigned char block_size = 16; 3032 const int bulk_width = 4; 3033 // NB: bulk_width can be 4 or 8. 8 gives slightly faster 3034 // performance with larger data sizes, but it also means that the 3035 // fast path isn't used until you have at least 8 blocks, and up 3036 // to 127 bytes of data will be executed on the slow path. For 3037 // that reason, and also so as not to blow away too much icache, 4 3038 // blocks seems like a sensible compromise. 
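    // With bulk_width == 4 the bulk path (CTR_large_block, below) therefore
    // consumes 64 bytes per iteration and is entered only while at least
    // bulk_width * 16 bytes of input remain; everything else goes through
    // the block-at-a-time loop that follows.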
3039 3040 // Algorithm: 3041 // 3042 // if (len == 0) { 3043 // goto DONE; 3044 // } 3045 // int result = len; 3046 // do { 3047 // if (used >= blockSize) { 3048 // if (len >= bulk_width * blockSize) { 3049 // CTR_large_block(); 3050 // if (len == 0) 3051 // goto DONE; 3052 // } 3053 // for (;;) { 3054 // 16ByteVector v0 = counter; 3055 // embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0); 3056 // used = 0; 3057 // if (len < blockSize) 3058 // break; /* goto NEXT */ 3059 // 16ByteVector v1 = load16Bytes(in, offset); 3060 // v1 = v1 ^ encryptedCounter; 3061 // store16Bytes(out, offset); 3062 // used = blockSize; 3063 // offset += blockSize; 3064 // len -= blockSize; 3065 // if (len == 0) 3066 // goto DONE; 3067 // } 3068 // } 3069 // NEXT: 3070 // out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]); 3071 // len--; 3072 // } while (len != 0); 3073 // DONE: 3074 // return result; 3075 // 3076 // CTR_large_block() 3077 // Wide bulk encryption of whole blocks. 3078 3079 __ align(CodeEntryAlignment); 3080 StubGenStubId stub_id = StubGenStubId::counterMode_AESCrypt_id; 3081 StubCodeMark mark(this, stub_id); 3082 const address start = __ pc(); 3083 __ enter(); 3084 3085 Label DONE, CTR_large_block, large_block_return; 3086 __ ldrw(used, Address(used_ptr)); 3087 __ cbzw(saved_len, DONE); 3088 3089 __ mov(len, saved_len); 3090 __ mov(offset, 0); 3091 3092 // Compute #rounds for AES based on the length of the key array 3093 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 3094 3095 __ aesenc_loadkeys(key, keylen); 3096 3097 { 3098 Label L_CTR_loop, NEXT; 3099 3100 __ bind(L_CTR_loop); 3101 3102 __ cmp(used, block_size); 3103 __ br(__ LO, NEXT); 3104 3105 // Maybe we have a lot of data 3106 __ subsw(rscratch1, len, bulk_width * block_size); 3107 __ br(__ HS, CTR_large_block); 3108 __ BIND(large_block_return); 3109 __ cbzw(len, DONE); 3110 3111 // Setup the counter 3112 __ movi(v4, __ T4S, 0); 3113 __ movi(v5, __ T4S, 1); 3114 __ ins(v4, __ S, v5, 2, 2); // v4 contains { 0, 1 } 3115 3116 // 128-bit big-endian increment 3117 __ ld1(v0, __ T16B, counter); 3118 __ rev64(v16, __ T16B, v0); 3119 be_add_128_64(v16, v16, v4, /*tmp*/v5); 3120 __ rev64(v16, __ T16B, v16); 3121 __ st1(v16, __ T16B, counter); 3122 // Previous counter value is in v0 3123 // v4 contains { 0, 1 } 3124 3125 { 3126 // We have fewer than bulk_width blocks of data left. Encrypt 3127 // them one by one until there is less than a full block 3128 // remaining, being careful to save both the encrypted counter 3129 // and the counter. 3130 3131 Label inner_loop; 3132 __ bind(inner_loop); 3133 // Counter to encrypt is in v0 3134 __ aesecb_encrypt(noreg, noreg, keylen); 3135 __ st1(v0, __ T16B, saved_encrypted_ctr); 3136 3137 // Do we have a remaining full block? 
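      // ('used' is cleared before the length check so that, if we drop out to
      //  NEXT with only a partial block left, the byte-at-a-time path consumes
      //  the freshly encrypted counter starting from its first byte.)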
3138 3139 __ mov(used, 0); 3140 __ cmp(len, block_size); 3141 __ br(__ LO, NEXT); 3142 3143 // Yes, we have a full block 3144 __ ldrq(v1, Address(in, offset)); 3145 __ eor(v1, __ T16B, v1, v0); 3146 __ strq(v1, Address(out, offset)); 3147 __ mov(used, block_size); 3148 __ add(offset, offset, block_size); 3149 3150 __ subw(len, len, block_size); 3151 __ cbzw(len, DONE); 3152 3153 // Increment the counter, store it back 3154 __ orr(v0, __ T16B, v16, v16); 3155 __ rev64(v16, __ T16B, v16); 3156 be_add_128_64(v16, v16, v4, /*tmp*/v5); 3157 __ rev64(v16, __ T16B, v16); 3158 __ st1(v16, __ T16B, counter); // Save the incremented counter back 3159 3160 __ b(inner_loop); 3161 } 3162 3163 __ BIND(NEXT); 3164 3165 // Encrypt a single byte, and loop. 3166 // We expect this to be a rare event. 3167 __ ldrb(rscratch1, Address(in, offset)); 3168 __ ldrb(rscratch2, Address(saved_encrypted_ctr, used)); 3169 __ eor(rscratch1, rscratch1, rscratch2); 3170 __ strb(rscratch1, Address(out, offset)); 3171 __ add(offset, offset, 1); 3172 __ add(used, used, 1); 3173 __ subw(len, len,1); 3174 __ cbnzw(len, L_CTR_loop); 3175 } 3176 3177 __ bind(DONE); 3178 __ strw(used, Address(used_ptr)); 3179 __ mov(r0, saved_len); 3180 3181 __ leave(); // required for proper stackwalking of RuntimeStub frame 3182 __ ret(lr); 3183 3184 // Bulk encryption 3185 3186 __ BIND (CTR_large_block); 3187 assert(bulk_width == 4 || bulk_width == 8, "must be"); 3188 3189 if (bulk_width == 8) { 3190 __ sub(sp, sp, 4 * 16); 3191 __ st1(v12, v13, v14, v15, __ T16B, Address(sp)); 3192 } 3193 __ sub(sp, sp, 4 * 16); 3194 __ st1(v8, v9, v10, v11, __ T16B, Address(sp)); 3195 RegSet saved_regs = (RegSet::of(in, out, offset) 3196 + RegSet::of(saved_encrypted_ctr, used_ptr, len)); 3197 __ push(saved_regs, sp); 3198 __ andr(len, len, -16 * bulk_width); // 8/4 encryptions, 16 bytes per encryption 3199 __ add(in, in, offset); 3200 __ add(out, out, offset); 3201 3202 // Keys should already be loaded into the correct registers 3203 3204 __ ld1(v0, __ T16B, counter); // v0 contains the first counter 3205 __ rev64(v16, __ T16B, v0); // v16 contains byte-reversed counter 3206 3207 // AES/CTR loop 3208 { 3209 Label L_CTR_loop; 3210 __ BIND(L_CTR_loop); 3211 3212 // Setup the counters 3213 __ movi(v8, __ T4S, 0); 3214 __ movi(v9, __ T4S, 1); 3215 __ ins(v8, __ S, v9, 2, 2); // v8 contains { 0, 1 } 3216 3217 for (int i = 0; i < bulk_width; i++) { 3218 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i); 3219 __ rev64(v0_ofs, __ T16B, v16); 3220 be_add_128_64(v16, v16, v8, /*tmp*/v9); 3221 } 3222 3223 __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16)); 3224 3225 // Encrypt the counters 3226 __ aesecb_encrypt(noreg, noreg, keylen, v0, bulk_width); 3227 3228 if (bulk_width == 8) { 3229 __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16)); 3230 } 3231 3232 // XOR the encrypted counters with the inputs 3233 for (int i = 0; i < bulk_width; i++) { 3234 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i); 3235 FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i); 3236 __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs); 3237 } 3238 3239 // Write the encrypted data 3240 __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16)); 3241 if (bulk_width == 8) { 3242 __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16)); 3243 } 3244 3245 __ subw(len, len, 16 * bulk_width); 3246 __ cbnzw(len, L_CTR_loop); 3247 } 3248 3249 // Save the counter back where it goes 3250 __ rev64(v16, __ T16B, v16); 3251 __ st1(v16, __ T16B, counter); 3252 3253 __ pop(saved_regs, sp); 
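    // Restore the callee-saved SIMD registers spilled on entry to
    // CTR_large_block, then fold the bulk-processed byte count back into
    // len and offset so the block-at-a-time loop can finish the remainder.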
3254 3255 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16)); 3256 if (bulk_width == 8) { 3257 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16)); 3258 } 3259 3260 __ andr(rscratch1, len, -16 * bulk_width); 3261 __ sub(len, len, rscratch1); 3262 __ add(offset, offset, rscratch1); 3263 __ mov(used, 16); 3264 __ strw(used, Address(used_ptr)); 3265 __ b(large_block_return); 3266 3267 return start; 3268 } 3269 3270 // Vector AES Galois Counter Mode implementation. Parameters: 3271 // 3272 // in = c_rarg0 3273 // len = c_rarg1 3274 // ct = c_rarg2 - ciphertext that ghash will read (in for encrypt, out for decrypt) 3275 // out = c_rarg3 3276 // key = c_rarg4 3277 // state = c_rarg5 - GHASH.state 3278 // subkeyHtbl = c_rarg6 - powers of H 3279 // counter = c_rarg7 - 16 bytes of CTR 3280 // return - number of processed bytes 3281 address generate_galoisCounterMode_AESCrypt() { 3282 address ghash_polynomial = __ pc(); 3283 __ emit_int64(0x87); // The low-order bits of the field 3284 // polynomial (i.e. p = z^7+z^2+z+1) 3285 // repeated in the low and high parts of a 3286 // 128-bit vector 3287 __ emit_int64(0x87); 3288 3289 __ align(CodeEntryAlignment); 3290 StubGenStubId stub_id = StubGenStubId::galoisCounterMode_AESCrypt_id; 3291 StubCodeMark mark(this, stub_id); 3292 address start = __ pc(); 3293 __ enter(); 3294 3295 const Register in = c_rarg0; 3296 const Register len = c_rarg1; 3297 const Register ct = c_rarg2; 3298 const Register out = c_rarg3; 3299 // and updated with the incremented counter in the end 3300 3301 const Register key = c_rarg4; 3302 const Register state = c_rarg5; 3303 3304 const Register subkeyHtbl = c_rarg6; 3305 3306 const Register counter = c_rarg7; 3307 3308 const Register keylen = r10; 3309 // Save state before entering routine 3310 __ sub(sp, sp, 4 * 16); 3311 __ st1(v12, v13, v14, v15, __ T16B, Address(sp)); 3312 __ sub(sp, sp, 4 * 16); 3313 __ st1(v8, v9, v10, v11, __ T16B, Address(sp)); 3314 3315 // __ andr(len, len, -512); 3316 __ andr(len, len, -16 * 8); // 8 encryptions, 16 bytes per encryption 3317 __ str(len, __ pre(sp, -2 * wordSize)); 3318 3319 Label DONE; 3320 __ cbz(len, DONE); 3321 3322 // Compute #rounds for AES based on the length of the key array 3323 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 3324 3325 __ aesenc_loadkeys(key, keylen); 3326 __ ld1(v0, __ T16B, counter); // v0 contains the first counter 3327 __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter 3328 3329 // AES/CTR loop 3330 { 3331 Label L_CTR_loop; 3332 __ BIND(L_CTR_loop); 3333 3334 // Setup the counters 3335 __ movi(v8, __ T4S, 0); 3336 __ movi(v9, __ T4S, 1); 3337 __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 } 3338 3339 assert(v0->encoding() < v8->encoding(), ""); 3340 for (int i = v0->encoding(); i < v8->encoding(); i++) { 3341 FloatRegister f = as_FloatRegister(i); 3342 __ rev32(f, __ T16B, v16); 3343 __ addv(v16, __ T4S, v16, v8); 3344 } 3345 3346 __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16)); 3347 3348 // Encrypt the counters 3349 __ aesecb_encrypt(noreg, noreg, keylen, v0, /*unrolls*/8); 3350 3351 __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16)); 3352 3353 // XOR the encrypted counters with the inputs 3354 for (int i = 0; i < 8; i++) { 3355 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i); 3356 FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i); 3357 __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs); 3358 } 3359 __ st1(v0, v1, v2, v3, __ T16B, __ 
post(out, 4 * 16)); 3360 __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16)); 3361 3362 __ subw(len, len, 16 * 8); 3363 __ cbnzw(len, L_CTR_loop); 3364 } 3365 3366 __ rev32(v16, __ T16B, v16); 3367 __ st1(v16, __ T16B, counter); 3368 3369 __ ldr(len, Address(sp)); 3370 __ lsr(len, len, exact_log2(16)); // We want the count of blocks 3371 3372 // GHASH/CTR loop 3373 __ ghash_processBlocks_wide(ghash_polynomial, state, subkeyHtbl, ct, 3374 len, /*unrolls*/4); 3375 3376 #ifdef ASSERT 3377 { Label L; 3378 __ cmp(len, (unsigned char)0); 3379 __ br(Assembler::EQ, L); 3380 __ stop("stubGenerator: abort"); 3381 __ bind(L); 3382 } 3383 #endif 3384 3385 __ bind(DONE); 3386 // Return the number of bytes processed 3387 __ ldr(r0, __ post(sp, 2 * wordSize)); 3388 3389 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16)); 3390 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16)); 3391 3392 __ leave(); // required for proper stackwalking of RuntimeStub frame 3393 __ ret(lr); 3394 return start; 3395 } 3396 3397 class Cached64Bytes { 3398 private: 3399 MacroAssembler *_masm; 3400 Register _regs[8]; 3401 3402 public: 3403 Cached64Bytes(MacroAssembler *masm, RegSet rs): _masm(masm) { 3404 assert(rs.size() == 8, "%u registers are used to cache 16 4-byte data", rs.size()); 3405 auto it = rs.begin(); 3406 for (auto &r: _regs) { 3407 r = *it; 3408 ++it; 3409 } 3410 } 3411 3412 void gen_loads(Register base) { 3413 for (int i = 0; i < 8; i += 2) { 3414 __ ldp(_regs[i], _regs[i + 1], Address(base, 8 * i)); 3415 } 3416 } 3417 3418 // Generate code extracting i-th unsigned word (4 bytes) from cached 64 bytes. 3419 void extract_u32(Register dest, int i) { 3420 __ ubfx(dest, _regs[i / 2], 32 * (i % 2), 32); 3421 } 3422 }; 3423 3424 // Utility routines for md5. 3425 // Clobbers r10 and r11. 
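  // Each helper below computes one MD5 step of the standard form
  //
  //   a = b + rotl32(a + F(b, c, d) + X[k] + T, s)
  //
  // where X[k] is the k-th 32-bit word of the cached 64-byte block
  // (reg_cache.extract_u32) and T is the round constant passed as 't'.
  // The round functions match the usual MD5 definitions:
  //
  //   FF: F(b,c,d) = (b & c) | (~b & d)    -- computed as ((c ^ d) & b) ^ d
  //   GG: G(b,c,d) = (b & d) | (c & ~d)
  //   HH: H(b,c,d) = b ^ c ^ d
  //   II: I(b,c,d) = c ^ (b | ~d)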
3426 void md5_FF(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4, 3427 int k, int s, int t) { 3428 Register rscratch3 = r10; 3429 Register rscratch4 = r11; 3430 3431 __ eorw(rscratch3, r3, r4); 3432 __ movw(rscratch2, t); 3433 __ andw(rscratch3, rscratch3, r2); 3434 __ addw(rscratch4, r1, rscratch2); 3435 reg_cache.extract_u32(rscratch1, k); 3436 __ eorw(rscratch3, rscratch3, r4); 3437 __ addw(rscratch4, rscratch4, rscratch1); 3438 __ addw(rscratch3, rscratch3, rscratch4); 3439 __ rorw(rscratch2, rscratch3, 32 - s); 3440 __ addw(r1, rscratch2, r2); 3441 } 3442 3443 void md5_GG(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4, 3444 int k, int s, int t) { 3445 Register rscratch3 = r10; 3446 Register rscratch4 = r11; 3447 3448 reg_cache.extract_u32(rscratch1, k); 3449 __ movw(rscratch2, t); 3450 __ addw(rscratch4, r1, rscratch2); 3451 __ addw(rscratch4, rscratch4, rscratch1); 3452 __ bicw(rscratch2, r3, r4); 3453 __ andw(rscratch3, r2, r4); 3454 __ addw(rscratch2, rscratch2, rscratch4); 3455 __ addw(rscratch2, rscratch2, rscratch3); 3456 __ rorw(rscratch2, rscratch2, 32 - s); 3457 __ addw(r1, rscratch2, r2); 3458 } 3459 3460 void md5_HH(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4, 3461 int k, int s, int t) { 3462 Register rscratch3 = r10; 3463 Register rscratch4 = r11; 3464 3465 __ eorw(rscratch3, r3, r4); 3466 __ movw(rscratch2, t); 3467 __ addw(rscratch4, r1, rscratch2); 3468 reg_cache.extract_u32(rscratch1, k); 3469 __ eorw(rscratch3, rscratch3, r2); 3470 __ addw(rscratch4, rscratch4, rscratch1); 3471 __ addw(rscratch3, rscratch3, rscratch4); 3472 __ rorw(rscratch2, rscratch3, 32 - s); 3473 __ addw(r1, rscratch2, r2); 3474 } 3475 3476 void md5_II(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4, 3477 int k, int s, int t) { 3478 Register rscratch3 = r10; 3479 Register rscratch4 = r11; 3480 3481 __ movw(rscratch3, t); 3482 __ ornw(rscratch2, r2, r4); 3483 __ addw(rscratch4, r1, rscratch3); 3484 reg_cache.extract_u32(rscratch1, k); 3485 __ eorw(rscratch3, rscratch2, r3); 3486 __ addw(rscratch4, rscratch4, rscratch1); 3487 __ addw(rscratch3, rscratch3, rscratch4); 3488 __ rorw(rscratch2, rscratch3, 32 - s); 3489 __ addw(r1, rscratch2, r2); 3490 } 3491 3492 // Arguments: 3493 // 3494 // Inputs: 3495 // c_rarg0 - byte[] source+offset 3496 // c_rarg1 - int[] SHA.state 3497 // c_rarg2 - int offset 3498 // c_rarg3 - int limit 3499 // 3500 address generate_md5_implCompress(StubGenStubId stub_id) { 3501 bool multi_block; 3502 switch (stub_id) { 3503 case md5_implCompress_id: 3504 multi_block = false; 3505 break; 3506 case md5_implCompressMB_id: 3507 multi_block = true; 3508 break; 3509 default: 3510 ShouldNotReachHere(); 3511 } 3512 __ align(CodeEntryAlignment); 3513 3514 StubCodeMark mark(this, stub_id); 3515 address start = __ pc(); 3516 3517 Register buf = c_rarg0; 3518 Register state = c_rarg1; 3519 Register ofs = c_rarg2; 3520 Register limit = c_rarg3; 3521 Register a = r4; 3522 Register b = r5; 3523 Register c = r6; 3524 Register d = r7; 3525 Register rscratch3 = r10; 3526 Register rscratch4 = r11; 3527 3528 Register state_regs[2] = { r12, r13 }; 3529 RegSet saved_regs = RegSet::range(r16, r22) - r18_tls; 3530 Cached64Bytes reg_cache(_masm, RegSet::of(r14, r15) + saved_regs); // using 8 registers 3531 3532 __ push(saved_regs, sp); 3533 3534 __ ldp(state_regs[0], state_regs[1], Address(state)); 3535 __ ubfx(a, state_regs[0], 0, 32); 3536 __ ubfx(b, state_regs[0], 32, 32); 3537 __ 
ubfx(c, state_regs[1], 0, 32); 3538 __ ubfx(d, state_regs[1], 32, 32); 3539 3540 Label md5_loop; 3541 __ BIND(md5_loop); 3542 3543 reg_cache.gen_loads(buf); 3544 3545 // Round 1 3546 md5_FF(reg_cache, a, b, c, d, 0, 7, 0xd76aa478); 3547 md5_FF(reg_cache, d, a, b, c, 1, 12, 0xe8c7b756); 3548 md5_FF(reg_cache, c, d, a, b, 2, 17, 0x242070db); 3549 md5_FF(reg_cache, b, c, d, a, 3, 22, 0xc1bdceee); 3550 md5_FF(reg_cache, a, b, c, d, 4, 7, 0xf57c0faf); 3551 md5_FF(reg_cache, d, a, b, c, 5, 12, 0x4787c62a); 3552 md5_FF(reg_cache, c, d, a, b, 6, 17, 0xa8304613); 3553 md5_FF(reg_cache, b, c, d, a, 7, 22, 0xfd469501); 3554 md5_FF(reg_cache, a, b, c, d, 8, 7, 0x698098d8); 3555 md5_FF(reg_cache, d, a, b, c, 9, 12, 0x8b44f7af); 3556 md5_FF(reg_cache, c, d, a, b, 10, 17, 0xffff5bb1); 3557 md5_FF(reg_cache, b, c, d, a, 11, 22, 0x895cd7be); 3558 md5_FF(reg_cache, a, b, c, d, 12, 7, 0x6b901122); 3559 md5_FF(reg_cache, d, a, b, c, 13, 12, 0xfd987193); 3560 md5_FF(reg_cache, c, d, a, b, 14, 17, 0xa679438e); 3561 md5_FF(reg_cache, b, c, d, a, 15, 22, 0x49b40821); 3562 3563 // Round 2 3564 md5_GG(reg_cache, a, b, c, d, 1, 5, 0xf61e2562); 3565 md5_GG(reg_cache, d, a, b, c, 6, 9, 0xc040b340); 3566 md5_GG(reg_cache, c, d, a, b, 11, 14, 0x265e5a51); 3567 md5_GG(reg_cache, b, c, d, a, 0, 20, 0xe9b6c7aa); 3568 md5_GG(reg_cache, a, b, c, d, 5, 5, 0xd62f105d); 3569 md5_GG(reg_cache, d, a, b, c, 10, 9, 0x02441453); 3570 md5_GG(reg_cache, c, d, a, b, 15, 14, 0xd8a1e681); 3571 md5_GG(reg_cache, b, c, d, a, 4, 20, 0xe7d3fbc8); 3572 md5_GG(reg_cache, a, b, c, d, 9, 5, 0x21e1cde6); 3573 md5_GG(reg_cache, d, a, b, c, 14, 9, 0xc33707d6); 3574 md5_GG(reg_cache, c, d, a, b, 3, 14, 0xf4d50d87); 3575 md5_GG(reg_cache, b, c, d, a, 8, 20, 0x455a14ed); 3576 md5_GG(reg_cache, a, b, c, d, 13, 5, 0xa9e3e905); 3577 md5_GG(reg_cache, d, a, b, c, 2, 9, 0xfcefa3f8); 3578 md5_GG(reg_cache, c, d, a, b, 7, 14, 0x676f02d9); 3579 md5_GG(reg_cache, b, c, d, a, 12, 20, 0x8d2a4c8a); 3580 3581 // Round 3 3582 md5_HH(reg_cache, a, b, c, d, 5, 4, 0xfffa3942); 3583 md5_HH(reg_cache, d, a, b, c, 8, 11, 0x8771f681); 3584 md5_HH(reg_cache, c, d, a, b, 11, 16, 0x6d9d6122); 3585 md5_HH(reg_cache, b, c, d, a, 14, 23, 0xfde5380c); 3586 md5_HH(reg_cache, a, b, c, d, 1, 4, 0xa4beea44); 3587 md5_HH(reg_cache, d, a, b, c, 4, 11, 0x4bdecfa9); 3588 md5_HH(reg_cache, c, d, a, b, 7, 16, 0xf6bb4b60); 3589 md5_HH(reg_cache, b, c, d, a, 10, 23, 0xbebfbc70); 3590 md5_HH(reg_cache, a, b, c, d, 13, 4, 0x289b7ec6); 3591 md5_HH(reg_cache, d, a, b, c, 0, 11, 0xeaa127fa); 3592 md5_HH(reg_cache, c, d, a, b, 3, 16, 0xd4ef3085); 3593 md5_HH(reg_cache, b, c, d, a, 6, 23, 0x04881d05); 3594 md5_HH(reg_cache, a, b, c, d, 9, 4, 0xd9d4d039); 3595 md5_HH(reg_cache, d, a, b, c, 12, 11, 0xe6db99e5); 3596 md5_HH(reg_cache, c, d, a, b, 15, 16, 0x1fa27cf8); 3597 md5_HH(reg_cache, b, c, d, a, 2, 23, 0xc4ac5665); 3598 3599 // Round 4 3600 md5_II(reg_cache, a, b, c, d, 0, 6, 0xf4292244); 3601 md5_II(reg_cache, d, a, b, c, 7, 10, 0x432aff97); 3602 md5_II(reg_cache, c, d, a, b, 14, 15, 0xab9423a7); 3603 md5_II(reg_cache, b, c, d, a, 5, 21, 0xfc93a039); 3604 md5_II(reg_cache, a, b, c, d, 12, 6, 0x655b59c3); 3605 md5_II(reg_cache, d, a, b, c, 3, 10, 0x8f0ccc92); 3606 md5_II(reg_cache, c, d, a, b, 10, 15, 0xffeff47d); 3607 md5_II(reg_cache, b, c, d, a, 1, 21, 0x85845dd1); 3608 md5_II(reg_cache, a, b, c, d, 8, 6, 0x6fa87e4f); 3609 md5_II(reg_cache, d, a, b, c, 15, 10, 0xfe2ce6e0); 3610 md5_II(reg_cache, c, d, a, b, 6, 15, 0xa3014314); 3611 md5_II(reg_cache, b, c, d, a, 13, 21, 0x4e0811a1); 3612 
md5_II(reg_cache, a, b, c, d, 4, 6, 0xf7537e82); 3613 md5_II(reg_cache, d, a, b, c, 11, 10, 0xbd3af235); 3614 md5_II(reg_cache, c, d, a, b, 2, 15, 0x2ad7d2bb); 3615 md5_II(reg_cache, b, c, d, a, 9, 21, 0xeb86d391); 3616 3617 __ addw(a, state_regs[0], a); 3618 __ ubfx(rscratch2, state_regs[0], 32, 32); 3619 __ addw(b, rscratch2, b); 3620 __ addw(c, state_regs[1], c); 3621 __ ubfx(rscratch4, state_regs[1], 32, 32); 3622 __ addw(d, rscratch4, d); 3623 3624 __ orr(state_regs[0], a, b, Assembler::LSL, 32); 3625 __ orr(state_regs[1], c, d, Assembler::LSL, 32); 3626 3627 if (multi_block) { 3628 __ add(buf, buf, 64); 3629 __ add(ofs, ofs, 64); 3630 __ cmp(ofs, limit); 3631 __ br(Assembler::LE, md5_loop); 3632 __ mov(c_rarg0, ofs); // return ofs 3633 } 3634 3635 // write hash values back in the correct order 3636 __ stp(state_regs[0], state_regs[1], Address(state)); 3637 3638 __ pop(saved_regs, sp); 3639 3640 __ ret(lr); 3641 3642 return start; 3643 } 3644 3645 // Arguments: 3646 // 3647 // Inputs: 3648 // c_rarg0 - byte[] source+offset 3649 // c_rarg1 - int[] SHA.state 3650 // c_rarg2 - int offset 3651 // c_rarg3 - int limit 3652 // 3653 address generate_sha1_implCompress(StubGenStubId stub_id) { 3654 bool multi_block; 3655 switch (stub_id) { 3656 case sha1_implCompress_id: 3657 multi_block = false; 3658 break; 3659 case sha1_implCompressMB_id: 3660 multi_block = true; 3661 break; 3662 default: 3663 ShouldNotReachHere(); 3664 } 3665 3666 __ align(CodeEntryAlignment); 3667 3668 StubCodeMark mark(this, stub_id); 3669 address start = __ pc(); 3670 3671 Register buf = c_rarg0; 3672 Register state = c_rarg1; 3673 Register ofs = c_rarg2; 3674 Register limit = c_rarg3; 3675 3676 Label keys; 3677 Label sha1_loop; 3678 3679 // load the keys into v0..v3 3680 __ adr(rscratch1, keys); 3681 __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1)); 3682 // load 5 words state into v6, v7 3683 __ ldrq(v6, Address(state, 0)); 3684 __ ldrs(v7, Address(state, 16)); 3685 3686 3687 __ BIND(sha1_loop); 3688 // load 64 bytes of data into v16..v19 3689 __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf); 3690 __ rev32(v16, __ T16B, v16); 3691 __ rev32(v17, __ T16B, v17); 3692 __ rev32(v18, __ T16B, v18); 3693 __ rev32(v19, __ T16B, v19); 3694 3695 // do the sha1 3696 __ addv(v4, __ T4S, v16, v0); 3697 __ orr(v20, __ T16B, v6, v6); 3698 3699 FloatRegister d0 = v16; 3700 FloatRegister d1 = v17; 3701 FloatRegister d2 = v18; 3702 FloatRegister d3 = v19; 3703 3704 for (int round = 0; round < 20; round++) { 3705 FloatRegister tmp1 = (round & 1) ? v4 : v5; 3706 FloatRegister tmp2 = (round & 1) ? v21 : v22; 3707 FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7; 3708 FloatRegister tmp4 = (round & 1) ? v5 : v4; 3709 FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? 
v2 : v3)); 3710 3711 if (round < 16) __ sha1su0(d0, __ T4S, d1, d2); 3712 if (round < 19) __ addv(tmp1, __ T4S, d1, key); 3713 __ sha1h(tmp2, __ T4S, v20); 3714 if (round < 5) 3715 __ sha1c(v20, __ T4S, tmp3, tmp4); 3716 else if (round < 10 || round >= 15) 3717 __ sha1p(v20, __ T4S, tmp3, tmp4); 3718 else 3719 __ sha1m(v20, __ T4S, tmp3, tmp4); 3720 if (round < 16) __ sha1su1(d0, __ T4S, d3); 3721 3722 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 3723 } 3724 3725 __ addv(v7, __ T2S, v7, v21); 3726 __ addv(v6, __ T4S, v6, v20); 3727 3728 if (multi_block) { 3729 __ add(ofs, ofs, 64); 3730 __ cmp(ofs, limit); 3731 __ br(Assembler::LE, sha1_loop); 3732 __ mov(c_rarg0, ofs); // return ofs 3733 } 3734 3735 __ strq(v6, Address(state, 0)); 3736 __ strs(v7, Address(state, 16)); 3737 3738 __ ret(lr); 3739 3740 __ bind(keys); 3741 __ emit_int32(0x5a827999); 3742 __ emit_int32(0x6ed9eba1); 3743 __ emit_int32(0x8f1bbcdc); 3744 __ emit_int32(0xca62c1d6); 3745 3746 return start; 3747 } 3748 3749 3750 // Arguments: 3751 // 3752 // Inputs: 3753 // c_rarg0 - byte[] source+offset 3754 // c_rarg1 - int[] SHA.state 3755 // c_rarg2 - int offset 3756 // c_rarg3 - int limit 3757 // 3758 address generate_sha256_implCompress(StubGenStubId stub_id) { 3759 bool multi_block; 3760 switch (stub_id) { 3761 case sha256_implCompress_id: 3762 multi_block = false; 3763 break; 3764 case sha256_implCompressMB_id: 3765 multi_block = true; 3766 break; 3767 default: 3768 ShouldNotReachHere(); 3769 } 3770 3771 static const uint32_t round_consts[64] = { 3772 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 3773 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, 3774 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 3775 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 3776 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 3777 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, 3778 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 3779 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, 3780 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 3781 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 3782 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 3783 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, 3784 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 3785 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, 3786 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 3787 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, 3788 }; 3789 3790 __ align(CodeEntryAlignment); 3791 3792 StubCodeMark mark(this, stub_id); 3793 address start = __ pc(); 3794 3795 Register buf = c_rarg0; 3796 Register state = c_rarg1; 3797 Register ofs = c_rarg2; 3798 Register limit = c_rarg3; 3799 3800 Label sha1_loop; 3801 3802 __ stpd(v8, v9, __ pre(sp, -32)); 3803 __ stpd(v10, v11, Address(sp, 16)); 3804 3805 // dga == v0 3806 // dgb == v1 3807 // dg0 == v2 3808 // dg1 == v3 3809 // dg2 == v4 3810 // t0 == v6 3811 // t1 == v7 3812 3813 // load 16 keys to v16..v31 3814 __ lea(rscratch1, ExternalAddress((address)round_consts)); 3815 __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64)); 3816 __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64)); 3817 __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64)); 3818 __ ld1(v28, v29, v30, v31, __ T4S, rscratch1); 3819 3820 // load 8 words (256 bits) state 3821 __ ldpq(v0, v1, state); 3822 3823 __ BIND(sha1_loop); 3824 // load 64 bytes of data into v8..v11 3825 __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? 
__ post(buf, 64) : buf); 3826 __ rev32(v8, __ T16B, v8); 3827 __ rev32(v9, __ T16B, v9); 3828 __ rev32(v10, __ T16B, v10); 3829 __ rev32(v11, __ T16B, v11); 3830 3831 __ addv(v6, __ T4S, v8, v16); 3832 __ orr(v2, __ T16B, v0, v0); 3833 __ orr(v3, __ T16B, v1, v1); 3834 3835 FloatRegister d0 = v8; 3836 FloatRegister d1 = v9; 3837 FloatRegister d2 = v10; 3838 FloatRegister d3 = v11; 3839 3840 3841 for (int round = 0; round < 16; round++) { 3842 FloatRegister tmp1 = (round & 1) ? v6 : v7; 3843 FloatRegister tmp2 = (round & 1) ? v7 : v6; 3844 FloatRegister tmp3 = (round & 1) ? v2 : v4; 3845 FloatRegister tmp4 = (round & 1) ? v4 : v2; 3846 3847 if (round < 12) __ sha256su0(d0, __ T4S, d1); 3848 __ orr(v4, __ T16B, v2, v2); 3849 if (round < 15) 3850 __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17)); 3851 __ sha256h(v2, __ T4S, v3, tmp2); 3852 __ sha256h2(v3, __ T4S, v4, tmp2); 3853 if (round < 12) __ sha256su1(d0, __ T4S, d2, d3); 3854 3855 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 3856 } 3857 3858 __ addv(v0, __ T4S, v0, v2); 3859 __ addv(v1, __ T4S, v1, v3); 3860 3861 if (multi_block) { 3862 __ add(ofs, ofs, 64); 3863 __ cmp(ofs, limit); 3864 __ br(Assembler::LE, sha1_loop); 3865 __ mov(c_rarg0, ofs); // return ofs 3866 } 3867 3868 __ ldpd(v10, v11, Address(sp, 16)); 3869 __ ldpd(v8, v9, __ post(sp, 32)); 3870 3871 __ stpq(v0, v1, state); 3872 3873 __ ret(lr); 3874 3875 return start; 3876 } 3877 3878 // Double rounds for sha512. 3879 void sha512_dround(int dr, 3880 FloatRegister vi0, FloatRegister vi1, 3881 FloatRegister vi2, FloatRegister vi3, 3882 FloatRegister vi4, FloatRegister vrc0, 3883 FloatRegister vrc1, FloatRegister vin0, 3884 FloatRegister vin1, FloatRegister vin2, 3885 FloatRegister vin3, FloatRegister vin4) { 3886 if (dr < 36) { 3887 __ ld1(vrc1, __ T2D, __ post(rscratch2, 16)); 3888 } 3889 __ addv(v5, __ T2D, vrc0, vin0); 3890 __ ext(v6, __ T16B, vi2, vi3, 8); 3891 __ ext(v5, __ T16B, v5, v5, 8); 3892 __ ext(v7, __ T16B, vi1, vi2, 8); 3893 __ addv(vi3, __ T2D, vi3, v5); 3894 if (dr < 32) { 3895 __ ext(v5, __ T16B, vin3, vin4, 8); 3896 __ sha512su0(vin0, __ T2D, vin1); 3897 } 3898 __ sha512h(vi3, __ T2D, v6, v7); 3899 if (dr < 32) { 3900 __ sha512su1(vin0, __ T2D, vin2, v5); 3901 } 3902 __ addv(vi4, __ T2D, vi1, vi3); 3903 __ sha512h2(vi3, __ T2D, vi1, vi0); 3904 } 3905 3906 // Arguments: 3907 // 3908 // Inputs: 3909 // c_rarg0 - byte[] source+offset 3910 // c_rarg1 - int[] SHA.state 3911 // c_rarg2 - int offset 3912 // c_rarg3 - int limit 3913 // 3914 address generate_sha512_implCompress(StubGenStubId stub_id) { 3915 bool multi_block; 3916 switch (stub_id) { 3917 case sha512_implCompress_id: 3918 multi_block = false; 3919 break; 3920 case sha512_implCompressMB_id: 3921 multi_block = true; 3922 break; 3923 default: 3924 ShouldNotReachHere(); 3925 } 3926 3927 static const uint64_t round_consts[80] = { 3928 0x428A2F98D728AE22L, 0x7137449123EF65CDL, 0xB5C0FBCFEC4D3B2FL, 3929 0xE9B5DBA58189DBBCL, 0x3956C25BF348B538L, 0x59F111F1B605D019L, 3930 0x923F82A4AF194F9BL, 0xAB1C5ED5DA6D8118L, 0xD807AA98A3030242L, 3931 0x12835B0145706FBEL, 0x243185BE4EE4B28CL, 0x550C7DC3D5FFB4E2L, 3932 0x72BE5D74F27B896FL, 0x80DEB1FE3B1696B1L, 0x9BDC06A725C71235L, 3933 0xC19BF174CF692694L, 0xE49B69C19EF14AD2L, 0xEFBE4786384F25E3L, 3934 0x0FC19DC68B8CD5B5L, 0x240CA1CC77AC9C65L, 0x2DE92C6F592B0275L, 3935 0x4A7484AA6EA6E483L, 0x5CB0A9DCBD41FBD4L, 0x76F988DA831153B5L, 3936 0x983E5152EE66DFABL, 0xA831C66D2DB43210L, 0xB00327C898FB213FL, 3937 0xBF597FC7BEEF0EE4L, 0xC6E00BF33DA88FC2L, 0xD5A79147930AA725L, 
3938 0x06CA6351E003826FL, 0x142929670A0E6E70L, 0x27B70A8546D22FFCL, 3939 0x2E1B21385C26C926L, 0x4D2C6DFC5AC42AEDL, 0x53380D139D95B3DFL, 3940 0x650A73548BAF63DEL, 0x766A0ABB3C77B2A8L, 0x81C2C92E47EDAEE6L, 3941 0x92722C851482353BL, 0xA2BFE8A14CF10364L, 0xA81A664BBC423001L, 3942 0xC24B8B70D0F89791L, 0xC76C51A30654BE30L, 0xD192E819D6EF5218L, 3943 0xD69906245565A910L, 0xF40E35855771202AL, 0x106AA07032BBD1B8L, 3944 0x19A4C116B8D2D0C8L, 0x1E376C085141AB53L, 0x2748774CDF8EEB99L, 3945 0x34B0BCB5E19B48A8L, 0x391C0CB3C5C95A63L, 0x4ED8AA4AE3418ACBL, 3946 0x5B9CCA4F7763E373L, 0x682E6FF3D6B2B8A3L, 0x748F82EE5DEFB2FCL, 3947 0x78A5636F43172F60L, 0x84C87814A1F0AB72L, 0x8CC702081A6439ECL, 3948 0x90BEFFFA23631E28L, 0xA4506CEBDE82BDE9L, 0xBEF9A3F7B2C67915L, 3949 0xC67178F2E372532BL, 0xCA273ECEEA26619CL, 0xD186B8C721C0C207L, 3950 0xEADA7DD6CDE0EB1EL, 0xF57D4F7FEE6ED178L, 0x06F067AA72176FBAL, 3951 0x0A637DC5A2C898A6L, 0x113F9804BEF90DAEL, 0x1B710B35131C471BL, 3952 0x28DB77F523047D84L, 0x32CAAB7B40C72493L, 0x3C9EBE0A15C9BEBCL, 3953 0x431D67C49C100D4CL, 0x4CC5D4BECB3E42B6L, 0x597F299CFC657E2AL, 3954 0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L 3955 }; 3956 3957 __ align(CodeEntryAlignment); 3958 3959 StubCodeMark mark(this, stub_id); 3960 address start = __ pc(); 3961 3962 Register buf = c_rarg0; 3963 Register state = c_rarg1; 3964 Register ofs = c_rarg2; 3965 Register limit = c_rarg3; 3966 3967 __ stpd(v8, v9, __ pre(sp, -64)); 3968 __ stpd(v10, v11, Address(sp, 16)); 3969 __ stpd(v12, v13, Address(sp, 32)); 3970 __ stpd(v14, v15, Address(sp, 48)); 3971 3972 Label sha512_loop; 3973 3974 // load state 3975 __ ld1(v8, v9, v10, v11, __ T2D, state); 3976 3977 // load first 4 round constants 3978 __ lea(rscratch1, ExternalAddress((address)round_consts)); 3979 __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64)); 3980 3981 __ BIND(sha512_loop); 3982 // load 128B of data into v12..v19 3983 __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64)); 3984 __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64)); 3985 __ rev64(v12, __ T16B, v12); 3986 __ rev64(v13, __ T16B, v13); 3987 __ rev64(v14, __ T16B, v14); 3988 __ rev64(v15, __ T16B, v15); 3989 __ rev64(v16, __ T16B, v16); 3990 __ rev64(v17, __ T16B, v17); 3991 __ rev64(v18, __ T16B, v18); 3992 __ rev64(v19, __ T16B, v19); 3993 3994 __ mov(rscratch2, rscratch1); 3995 3996 __ mov(v0, __ T16B, v8); 3997 __ mov(v1, __ T16B, v9); 3998 __ mov(v2, __ T16B, v10); 3999 __ mov(v3, __ T16B, v11); 4000 4001 sha512_dround( 0, v0, v1, v2, v3, v4, v20, v24, v12, v13, v19, v16, v17); 4002 sha512_dround( 1, v3, v0, v4, v2, v1, v21, v25, v13, v14, v12, v17, v18); 4003 sha512_dround( 2, v2, v3, v1, v4, v0, v22, v26, v14, v15, v13, v18, v19); 4004 sha512_dround( 3, v4, v2, v0, v1, v3, v23, v27, v15, v16, v14, v19, v12); 4005 sha512_dround( 4, v1, v4, v3, v0, v2, v24, v28, v16, v17, v15, v12, v13); 4006 sha512_dround( 5, v0, v1, v2, v3, v4, v25, v29, v17, v18, v16, v13, v14); 4007 sha512_dround( 6, v3, v0, v4, v2, v1, v26, v30, v18, v19, v17, v14, v15); 4008 sha512_dround( 7, v2, v3, v1, v4, v0, v27, v31, v19, v12, v18, v15, v16); 4009 sha512_dround( 8, v4, v2, v0, v1, v3, v28, v24, v12, v13, v19, v16, v17); 4010 sha512_dround( 9, v1, v4, v3, v0, v2, v29, v25, v13, v14, v12, v17, v18); 4011 sha512_dround(10, v0, v1, v2, v3, v4, v30, v26, v14, v15, v13, v18, v19); 4012 sha512_dround(11, v3, v0, v4, v2, v1, v31, v27, v15, v16, v14, v19, v12); 4013 sha512_dround(12, v2, v3, v1, v4, v0, v24, v28, v16, v17, v15, v12, v13); 4014 sha512_dround(13, v4, v2, v0, v1, v3, v25, v29, v17, v18, v16, v13, 
v14); 4015 sha512_dround(14, v1, v4, v3, v0, v2, v26, v30, v18, v19, v17, v14, v15); 4016 sha512_dround(15, v0, v1, v2, v3, v4, v27, v31, v19, v12, v18, v15, v16); 4017 sha512_dround(16, v3, v0, v4, v2, v1, v28, v24, v12, v13, v19, v16, v17); 4018 sha512_dround(17, v2, v3, v1, v4, v0, v29, v25, v13, v14, v12, v17, v18); 4019 sha512_dround(18, v4, v2, v0, v1, v3, v30, v26, v14, v15, v13, v18, v19); 4020 sha512_dround(19, v1, v4, v3, v0, v2, v31, v27, v15, v16, v14, v19, v12); 4021 sha512_dround(20, v0, v1, v2, v3, v4, v24, v28, v16, v17, v15, v12, v13); 4022 sha512_dround(21, v3, v0, v4, v2, v1, v25, v29, v17, v18, v16, v13, v14); 4023 sha512_dround(22, v2, v3, v1, v4, v0, v26, v30, v18, v19, v17, v14, v15); 4024 sha512_dround(23, v4, v2, v0, v1, v3, v27, v31, v19, v12, v18, v15, v16); 4025 sha512_dround(24, v1, v4, v3, v0, v2, v28, v24, v12, v13, v19, v16, v17); 4026 sha512_dround(25, v0, v1, v2, v3, v4, v29, v25, v13, v14, v12, v17, v18); 4027 sha512_dround(26, v3, v0, v4, v2, v1, v30, v26, v14, v15, v13, v18, v19); 4028 sha512_dround(27, v2, v3, v1, v4, v0, v31, v27, v15, v16, v14, v19, v12); 4029 sha512_dround(28, v4, v2, v0, v1, v3, v24, v28, v16, v17, v15, v12, v13); 4030 sha512_dround(29, v1, v4, v3, v0, v2, v25, v29, v17, v18, v16, v13, v14); 4031 sha512_dround(30, v0, v1, v2, v3, v4, v26, v30, v18, v19, v17, v14, v15); 4032 sha512_dround(31, v3, v0, v4, v2, v1, v27, v31, v19, v12, v18, v15, v16); 4033 sha512_dround(32, v2, v3, v1, v4, v0, v28, v24, v12, v0, v0, v0, v0); 4034 sha512_dround(33, v4, v2, v0, v1, v3, v29, v25, v13, v0, v0, v0, v0); 4035 sha512_dround(34, v1, v4, v3, v0, v2, v30, v26, v14, v0, v0, v0, v0); 4036 sha512_dround(35, v0, v1, v2, v3, v4, v31, v27, v15, v0, v0, v0, v0); 4037 sha512_dround(36, v3, v0, v4, v2, v1, v24, v0, v16, v0, v0, v0, v0); 4038 sha512_dround(37, v2, v3, v1, v4, v0, v25, v0, v17, v0, v0, v0, v0); 4039 sha512_dround(38, v4, v2, v0, v1, v3, v26, v0, v18, v0, v0, v0, v0); 4040 sha512_dround(39, v1, v4, v3, v0, v2, v27, v0, v19, v0, v0, v0, v0); 4041 4042 __ addv(v8, __ T2D, v8, v0); 4043 __ addv(v9, __ T2D, v9, v1); 4044 __ addv(v10, __ T2D, v10, v2); 4045 __ addv(v11, __ T2D, v11, v3); 4046 4047 if (multi_block) { 4048 __ add(ofs, ofs, 128); 4049 __ cmp(ofs, limit); 4050 __ br(Assembler::LE, sha512_loop); 4051 __ mov(c_rarg0, ofs); // return ofs 4052 } 4053 4054 __ st1(v8, v9, v10, v11, __ T2D, state); 4055 4056 __ ldpd(v14, v15, Address(sp, 48)); 4057 __ ldpd(v12, v13, Address(sp, 32)); 4058 __ ldpd(v10, v11, Address(sp, 16)); 4059 __ ldpd(v8, v9, __ post(sp, 64)); 4060 4061 __ ret(lr); 4062 4063 return start; 4064 } 4065 4066 // Execute one round of keccak of two computations in parallel. 4067 // One of the states should be loaded into the lower halves of 4068 // the vector registers v0-v24, the other should be loaded into 4069 // the upper halves of those registers. The ld1r instruction loads 4070 // the round constant into both halves of register v31. 4071 // Intermediate results c0...c5 and d0...d5 are computed 4072 // in registers v25...v30. 4073 // All vector instructions that are used operate on both register 4074 // halves in parallel. 4075 // If only a single computation is needed, one can only load the lower halves. 
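  //
  // For reference, the theta, chi and iota steps of one Keccak-f[1600] round
  // on a single 5x5 lane array A (an illustrative scalar sketch, where
  // A[x][y] denotes lane a(x + 5*y) in the flat a0..a24 numbering used in
  // the comments below; the generated code folds the rho rotations and the
  // pi permutation into the operands of the xar instructions):
  //
  //   #define ROTL64(v, n) (((v) << (n)) | ((v) >> (64 - (n))))
  //   // theta
  //   for (int x = 0; x < 5; x++)
  //     C[x] = A[x][0] ^ A[x][1] ^ A[x][2] ^ A[x][3] ^ A[x][4];
  //   for (int x = 0; x < 5; x++)
  //     D[x] = C[(x + 4) % 5] ^ ROTL64(C[(x + 1) % 5], 1);
  //   for (int x = 0; x < 5; x++)
  //     for (int y = 0; y < 5; y++)
  //       A[x][y] ^= D[x];
  //   // rho/pi: each lane is rotated by a fixed offset and moved to a new
  //   // position, yielding a permuted array B
  //   // chi
  //   for (int x = 0; x < 5; x++)
  //     for (int y = 0; y < 5; y++)
  //       A[x][y] = B[x][y] ^ (~B[(x + 1) % 5][y] & B[(x + 2) % 5][y]);
  //   // iota
  //   A[0][0] ^= round_constant[i];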
4076 void keccak_round(Register rscratch1) { 4077 __ eor3(v29, __ T16B, v4, v9, v14); // c4 = a4 ^ a9 ^ a14 4078 __ eor3(v26, __ T16B, v1, v6, v11); // c1 = a1 ^ a16 ^ a11 4079 __ eor3(v28, __ T16B, v3, v8, v13); // c3 = a3 ^ a8 ^a13 4080 __ eor3(v25, __ T16B, v0, v5, v10); // c0 = a0 ^ a5 ^ a10 4081 __ eor3(v27, __ T16B, v2, v7, v12); // c2 = a2 ^ a7 ^ a12 4082 __ eor3(v29, __ T16B, v29, v19, v24); // c4 ^= a19 ^ a24 4083 __ eor3(v26, __ T16B, v26, v16, v21); // c1 ^= a16 ^ a21 4084 __ eor3(v28, __ T16B, v28, v18, v23); // c3 ^= a18 ^ a23 4085 __ eor3(v25, __ T16B, v25, v15, v20); // c0 ^= a15 ^ a20 4086 __ eor3(v27, __ T16B, v27, v17, v22); // c2 ^= a17 ^ a22 4087 4088 __ rax1(v30, __ T2D, v29, v26); // d0 = c4 ^ rol(c1, 1) 4089 __ rax1(v26, __ T2D, v26, v28); // d2 = c1 ^ rol(c3, 1) 4090 __ rax1(v28, __ T2D, v28, v25); // d4 = c3 ^ rol(c0, 1) 4091 __ rax1(v25, __ T2D, v25, v27); // d1 = c0 ^ rol(c2, 1) 4092 __ rax1(v27, __ T2D, v27, v29); // d3 = c2 ^ rol(c4, 1) 4093 4094 __ eor(v0, __ T16B, v0, v30); // a0 = a0 ^ d0 4095 __ xar(v29, __ T2D, v1, v25, (64 - 1)); // a10' = rol((a1^d1), 1) 4096 __ xar(v1, __ T2D, v6, v25, (64 - 44)); // a1 = rol(a6^d1), 44) 4097 __ xar(v6, __ T2D, v9, v28, (64 - 20)); // a6 = rol((a9^d4), 20) 4098 __ xar(v9, __ T2D, v22, v26, (64 - 61)); // a9 = rol((a22^d2), 61) 4099 __ xar(v22, __ T2D, v14, v28, (64 - 39)); // a22 = rol((a14^d4), 39) 4100 __ xar(v14, __ T2D, v20, v30, (64 - 18)); // a14 = rol((a20^d0), 18) 4101 __ xar(v31, __ T2D, v2, v26, (64 - 62)); // a20' = rol((a2^d2), 62) 4102 __ xar(v2, __ T2D, v12, v26, (64 - 43)); // a2 = rol((a12^d2), 43) 4103 __ xar(v12, __ T2D, v13, v27, (64 - 25)); // a12 = rol((a13^d3), 25) 4104 __ xar(v13, __ T2D, v19, v28, (64 - 8)); // a13 = rol((a19^d4), 8) 4105 __ xar(v19, __ T2D, v23, v27, (64 - 56)); // a19 = rol((a23^d3), 56) 4106 __ xar(v23, __ T2D, v15, v30, (64 - 41)); // a23 = rol((a15^d0), 41) 4107 __ xar(v15, __ T2D, v4, v28, (64 - 27)); // a15 = rol((a4^d4), 27) 4108 __ xar(v28, __ T2D, v24, v28, (64 - 14)); // a4' = rol((a24^d4), 14) 4109 __ xar(v24, __ T2D, v21, v25, (64 - 2)); // a24 = rol((a21^d1), 2) 4110 __ xar(v8, __ T2D, v8, v27, (64 - 55)); // a21' = rol((a8^d3), 55) 4111 __ xar(v4, __ T2D, v16, v25, (64 - 45)); // a8' = rol((a16^d1), 45) 4112 __ xar(v16, __ T2D, v5, v30, (64 - 36)); // a16 = rol((a5^d0), 36) 4113 __ xar(v5, __ T2D, v3, v27, (64 - 28)); // a5 = rol((a3^d3), 28) 4114 __ xar(v27, __ T2D, v18, v27, (64 - 21)); // a3' = rol((a18^d3), 21) 4115 __ xar(v3, __ T2D, v17, v26, (64 - 15)); // a18' = rol((a17^d2), 15) 4116 __ xar(v25, __ T2D, v11, v25, (64 - 10)); // a17' = rol((a11^d1), 10) 4117 __ xar(v26, __ T2D, v7, v26, (64 - 6)); // a11' = rol((a7^d2), 6) 4118 __ xar(v30, __ T2D, v10, v30, (64 - 3)); // a7' = rol((a10^d0), 3) 4119 4120 __ bcax(v20, __ T16B, v31, v22, v8); // a20 = a20' ^ (~a21 & a22') 4121 __ bcax(v21, __ T16B, v8, v23, v22); // a21 = a21' ^ (~a22 & a23) 4122 __ bcax(v22, __ T16B, v22, v24, v23); // a22 = a22 ^ (~a23 & a24) 4123 __ bcax(v23, __ T16B, v23, v31, v24); // a23 = a23 ^ (~a24 & a20') 4124 __ bcax(v24, __ T16B, v24, v8, v31); // a24 = a24 ^ (~a20' & a21') 4125 4126 __ ld1r(v31, __ T2D, __ post(rscratch1, 8)); // rc = round_constants[i] 4127 4128 __ bcax(v17, __ T16B, v25, v19, v3); // a17 = a17' ^ (~a18' & a19) 4129 __ bcax(v18, __ T16B, v3, v15, v19); // a18 = a18' ^ (~a19 & a15') 4130 __ bcax(v19, __ T16B, v19, v16, v15); // a19 = a19 ^ (~a15 & a16) 4131 __ bcax(v15, __ T16B, v15, v25, v16); // a15 = a15 ^ (~a16 & a17') 4132 __ bcax(v16, __ T16B, v16, v3, v25); 
// a16 = a16 ^ (~a17' & a18') 4133 4134 __ bcax(v10, __ T16B, v29, v12, v26); // a10 = a10' ^ (~a11' & a12) 4135 __ bcax(v11, __ T16B, v26, v13, v12); // a11 = a11' ^ (~a12 & a13) 4136 __ bcax(v12, __ T16B, v12, v14, v13); // a12 = a12 ^ (~a13 & a14) 4137 __ bcax(v13, __ T16B, v13, v29, v14); // a13 = a13 ^ (~a14 & a10') 4138 __ bcax(v14, __ T16B, v14, v26, v29); // a14 = a14 ^ (~a10' & a11') 4139 4140 __ bcax(v7, __ T16B, v30, v9, v4); // a7 = a7' ^ (~a8' & a9) 4141 __ bcax(v8, __ T16B, v4, v5, v9); // a8 = a8' ^ (~a9 & a5) 4142 __ bcax(v9, __ T16B, v9, v6, v5); // a9 = a9 ^ (~a5 & a6) 4143 __ bcax(v5, __ T16B, v5, v30, v6); // a5 = a5 ^ (~a6 & a7) 4144 __ bcax(v6, __ T16B, v6, v4, v30); // a6 = a6 ^ (~a7 & a8') 4145 4146 __ bcax(v3, __ T16B, v27, v0, v28); // a3 = a3' ^ (~a4' & a0) 4147 __ bcax(v4, __ T16B, v28, v1, v0); // a4 = a4' ^ (~a0 & a1) 4148 __ bcax(v0, __ T16B, v0, v2, v1); // a0 = a0 ^ (~a1 & a2) 4149 __ bcax(v1, __ T16B, v1, v27, v2); // a1 = a1 ^ (~a2 & a3) 4150 __ bcax(v2, __ T16B, v2, v28, v27); // a2 = a2 ^ (~a3 & a4') 4151 4152 __ eor(v0, __ T16B, v0, v31); // a0 = a0 ^ rc 4153 } 4154 4155 // Arguments: 4156 // 4157 // Inputs: 4158 // c_rarg0 - byte[] source+offset 4159 // c_rarg1 - byte[] SHA.state 4160 // c_rarg2 - int block_size 4161 // c_rarg3 - int offset 4162 // c_rarg4 - int limit 4163 // 4164 address generate_sha3_implCompress(StubGenStubId stub_id) { 4165 bool multi_block; 4166 switch (stub_id) { 4167 case sha3_implCompress_id: 4168 multi_block = false; 4169 break; 4170 case sha3_implCompressMB_id: 4171 multi_block = true; 4172 break; 4173 default: 4174 ShouldNotReachHere(); 4175 } 4176 4177 static const uint64_t round_consts[24] = { 4178 0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL, 4179 0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L, 4180 0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL, 4181 0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL, 4182 0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L, 4183 0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L, 4184 0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L, 4185 0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L 4186 }; 4187 4188 __ align(CodeEntryAlignment); 4189 4190 StubCodeMark mark(this, stub_id); 4191 address start = __ pc(); 4192 4193 Register buf = c_rarg0; 4194 Register state = c_rarg1; 4195 Register block_size = c_rarg2; 4196 Register ofs = c_rarg3; 4197 Register limit = c_rarg4; 4198 4199 Label sha3_loop, rounds24_loop; 4200 Label sha3_512_or_sha3_384, shake128; 4201 4202 __ stpd(v8, v9, __ pre(sp, -64)); 4203 __ stpd(v10, v11, Address(sp, 16)); 4204 __ stpd(v12, v13, Address(sp, 32)); 4205 __ stpd(v14, v15, Address(sp, 48)); 4206 4207 // load state 4208 __ add(rscratch1, state, 32); 4209 __ ld1(v0, v1, v2, v3, __ T1D, state); 4210 __ ld1(v4, v5, v6, v7, __ T1D, __ post(rscratch1, 32)); 4211 __ ld1(v8, v9, v10, v11, __ T1D, __ post(rscratch1, 32)); 4212 __ ld1(v12, v13, v14, v15, __ T1D, __ post(rscratch1, 32)); 4213 __ ld1(v16, v17, v18, v19, __ T1D, __ post(rscratch1, 32)); 4214 __ ld1(v20, v21, v22, v23, __ T1D, __ post(rscratch1, 32)); 4215 __ ld1(v24, __ T1D, rscratch1); 4216 4217 __ BIND(sha3_loop); 4218 4219 // 24 keccak rounds 4220 __ movw(rscratch2, 24); 4221 4222 // load round_constants base 4223 __ lea(rscratch1, ExternalAddress((address) round_consts)); 4224 4225 // load input 4226 __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32)); 4227 __ ld1(v29, v30, v31, __ T8B, __ post(buf, 
24)); 4228 __ eor(v0, __ T8B, v0, v25); 4229 __ eor(v1, __ T8B, v1, v26); 4230 __ eor(v2, __ T8B, v2, v27); 4231 __ eor(v3, __ T8B, v3, v28); 4232 __ eor(v4, __ T8B, v4, v29); 4233 __ eor(v5, __ T8B, v5, v30); 4234 __ eor(v6, __ T8B, v6, v31); 4235 4236 // block_size == 72, SHA3-512; block_size == 104, SHA3-384 4237 __ tbz(block_size, 7, sha3_512_or_sha3_384); 4238 4239 __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32)); 4240 __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24)); 4241 __ eor(v7, __ T8B, v7, v25); 4242 __ eor(v8, __ T8B, v8, v26); 4243 __ eor(v9, __ T8B, v9, v27); 4244 __ eor(v10, __ T8B, v10, v28); 4245 __ eor(v11, __ T8B, v11, v29); 4246 __ eor(v12, __ T8B, v12, v30); 4247 __ eor(v13, __ T8B, v13, v31); 4248 4249 __ ld1(v25, v26, v27, __ T8B, __ post(buf, 24)); 4250 __ eor(v14, __ T8B, v14, v25); 4251 __ eor(v15, __ T8B, v15, v26); 4252 __ eor(v16, __ T8B, v16, v27); 4253 4254 // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256 4255 __ andw(c_rarg5, block_size, 48); 4256 __ cbzw(c_rarg5, rounds24_loop); 4257 4258 __ tbnz(block_size, 5, shake128); 4259 // block_size == 144, bit5 == 0, SHA3-224 4260 __ ldrd(v28, __ post(buf, 8)); 4261 __ eor(v17, __ T8B, v17, v28); 4262 __ b(rounds24_loop); 4263 4264 __ BIND(shake128); 4265 __ ld1(v28, v29, v30, v31, __ T8B, __ post(buf, 32)); 4266 __ eor(v17, __ T8B, v17, v28); 4267 __ eor(v18, __ T8B, v18, v29); 4268 __ eor(v19, __ T8B, v19, v30); 4269 __ eor(v20, __ T8B, v20, v31); 4270 __ b(rounds24_loop); // block_size == 168, SHAKE128 4271 4272 __ BIND(sha3_512_or_sha3_384); 4273 __ ld1(v25, v26, __ T8B, __ post(buf, 16)); 4274 __ eor(v7, __ T8B, v7, v25); 4275 __ eor(v8, __ T8B, v8, v26); 4276 __ tbz(block_size, 5, rounds24_loop); // SHA3-512 4277 4278 // SHA3-384 4279 __ ld1(v27, v28, v29, v30, __ T8B, __ post(buf, 32)); 4280 __ eor(v9, __ T8B, v9, v27); 4281 __ eor(v10, __ T8B, v10, v28); 4282 __ eor(v11, __ T8B, v11, v29); 4283 __ eor(v12, __ T8B, v12, v30); 4284 4285 __ BIND(rounds24_loop); 4286 __ subw(rscratch2, rscratch2, 1); 4287 4288 keccak_round(rscratch1); 4289 4290 __ cbnzw(rscratch2, rounds24_loop); 4291 4292 if (multi_block) { 4293 __ add(ofs, ofs, block_size); 4294 __ cmp(ofs, limit); 4295 __ br(Assembler::LE, sha3_loop); 4296 __ mov(c_rarg0, ofs); // return ofs 4297 } 4298 4299 __ st1(v0, v1, v2, v3, __ T1D, __ post(state, 32)); 4300 __ st1(v4, v5, v6, v7, __ T1D, __ post(state, 32)); 4301 __ st1(v8, v9, v10, v11, __ T1D, __ post(state, 32)); 4302 __ st1(v12, v13, v14, v15, __ T1D, __ post(state, 32)); 4303 __ st1(v16, v17, v18, v19, __ T1D, __ post(state, 32)); 4304 __ st1(v20, v21, v22, v23, __ T1D, __ post(state, 32)); 4305 __ st1(v24, __ T1D, state); 4306 4307 // restore callee-saved registers 4308 __ ldpd(v14, v15, Address(sp, 48)); 4309 __ ldpd(v12, v13, Address(sp, 32)); 4310 __ ldpd(v10, v11, Address(sp, 16)); 4311 __ ldpd(v8, v9, __ post(sp, 64)); 4312 4313 __ ret(lr); 4314 4315 return start; 4316 } 4317 4318 // Inputs: 4319 // c_rarg0 - long[] state0 4320 // c_rarg1 - long[] state1 4321 address generate_double_keccak() { 4322 static const uint64_t round_consts[24] = { 4323 0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL, 4324 0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L, 4325 0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL, 4326 0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL, 4327 0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L, 4328 0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L, 4329 0x000000000000800AL, 
0x800000008000000AL, 0x8000000080008081L,
      0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
    };

    // Implements the double_keccak() method of the
    // sun.security.provider.SHA3Parallel class
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "double_keccak");
    address start = __ pc();
    __ enter();

    Register state0 = c_rarg0;
    Register state1 = c_rarg1;

    Label rounds24_loop;

    // save callee-saved registers
    __ stpd(v8, v9, __ pre(sp, -64));
    __ stpd(v10, v11, Address(sp, 16));
    __ stpd(v12, v13, Address(sp, 32));
    __ stpd(v14, v15, Address(sp, 48));

    // load states
    __ add(rscratch1, state0, 32);
    __ ld4(v0, v1, v2, v3, __ D, 0, state0);
    __ ld4(v4, v5, v6, v7, __ D, 0, __ post(rscratch1, 32));
    __ ld4(v8, v9, v10, v11, __ D, 0, __ post(rscratch1, 32));
    __ ld4(v12, v13, v14, v15, __ D, 0, __ post(rscratch1, 32));
    __ ld4(v16, v17, v18, v19, __ D, 0, __ post(rscratch1, 32));
    __ ld4(v20, v21, v22, v23, __ D, 0, __ post(rscratch1, 32));
    __ ld1(v24, __ D, 0, rscratch1);
    __ add(rscratch1, state1, 32);
    __ ld4(v0, v1, v2, v3, __ D, 1, state1);
    __ ld4(v4, v5, v6, v7, __ D, 1, __ post(rscratch1, 32));
    __ ld4(v8, v9, v10, v11, __ D, 1, __ post(rscratch1, 32));
    __ ld4(v12, v13, v14, v15, __ D, 1, __ post(rscratch1, 32));
    __ ld4(v16, v17, v18, v19, __ D, 1, __ post(rscratch1, 32));
    __ ld4(v20, v21, v22, v23, __ D, 1, __ post(rscratch1, 32));
    __ ld1(v24, __ D, 1, rscratch1);

    // 24 keccak rounds
    __ movw(rscratch2, 24);

    // load round_constants base
    __ lea(rscratch1, ExternalAddress((address) round_consts));

    __ BIND(rounds24_loop);
    __ subw(rscratch2, rscratch2, 1);
    keccak_round(rscratch1);
    __ cbnzw(rscratch2, rounds24_loop);

    __ st4(v0, v1, v2, v3, __ D, 0, __ post(state0, 32));
    __ st4(v4, v5, v6, v7, __ D, 0, __ post(state0, 32));
    __ st4(v8, v9, v10, v11, __ D, 0, __ post(state0, 32));
    __ st4(v12, v13, v14, v15, __ D, 0, __ post(state0, 32));
    __ st4(v16, v17, v18, v19, __ D, 0, __ post(state0, 32));
    __ st4(v20, v21, v22, v23, __ D, 0, __ post(state0, 32));
    __ st1(v24, __ D, 0, state0);
    __ st4(v0, v1, v2, v3, __ D, 1, __ post(state1, 32));
    __ st4(v4, v5, v6, v7, __ D, 1, __ post(state1, 32));
    __ st4(v8, v9, v10, v11, __ D, 1, __ post(state1, 32));
    __ st4(v12, v13, v14, v15, __ D, 1, __ post(state1, 32));
    __ st4(v16, v17, v18, v19, __ D, 1, __ post(state1, 32));
    __ st4(v20, v21, v22, v23, __ D, 1, __ post(state1, 32));
    __ st1(v24, __ D, 1, state1);

    // restore callee-saved vector registers
    __ ldpd(v14, v15, Address(sp, 48));
    __ ldpd(v12, v13, Address(sp, 32));
    __ ldpd(v10, v11, Address(sp, 16));
    __ ldpd(v8, v9, __ post(sp, 64));

    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ mov(r0, zr); // return 0
    __ ret(lr);

    return start;
  }

  // ChaCha20 block function. This version parallelizes the 32-bit
  // state elements on each of 16 vectors, producing 4 blocks of
  // keystream at a time.
4411 // 4412 // state (int[16]) = c_rarg0 4413 // keystream (byte[256]) = c_rarg1 4414 // return - number of bytes of produced keystream (always 256) 4415 // 4416 // This implementation takes each 32-bit integer from the state 4417 // array and broadcasts it across all 4 32-bit lanes of a vector register 4418 // (e.g. state[0] is replicated on all 4 lanes of v4, state[1] to all 4 lanes 4419 // of v5, etc.). Once all 16 elements have been broadcast onto 16 vectors, 4420 // the quarter round schedule is implemented as outlined in RFC 7539 section 4421 // 2.3. However, instead of sequentially processing the 3 quarter round 4422 // operations represented by one QUARTERROUND function, we instead stack all 4423 // the adds, xors and left-rotations from the first 4 quarter rounds together 4424 // and then do the same for the second set of 4 quarter rounds. This removes 4425 // some latency that would otherwise be incurred by waiting for an add to 4426 // complete before performing an xor (which depends on the result of the 4427 // add), etc. An adjustment happens between the first and second groups of 4 4428 // quarter rounds, but this is done only in the inputs to the macro functions 4429 // that generate the assembly instructions - these adjustments themselves are 4430 // not part of the resulting assembly. 4431 // The 4 registers v0-v3 are used during the quarter round operations as 4432 // scratch registers. Once the 20 rounds are complete, these 4 scratch 4433 // registers become the vectors involved in adding the start state back onto 4434 // the post-QR working state. After the adds are complete, each of the 16 4435 // vectors write their first lane back to the keystream buffer, followed 4436 // by the second lane from all vectors and so on. 4437 address generate_chacha20Block_blockpar() { 4438 Label L_twoRounds, L_cc20_const; 4439 // The constant data is broken into two 128-bit segments to be loaded 4440 // onto FloatRegisters. The first 128 bits are a counter add overlay 4441 // that adds +0/+1/+2/+3 to the vector holding replicated state[12]. 4442 // The second 128-bits is a table constant used for 8-bit left rotations. 4443 __ BIND(L_cc20_const); 4444 __ emit_int64(0x0000000100000000UL); 4445 __ emit_int64(0x0000000300000002UL); 4446 __ emit_int64(0x0605040702010003UL); 4447 __ emit_int64(0x0E0D0C0F0A09080BUL); 4448 4449 __ align(CodeEntryAlignment); 4450 StubGenStubId stub_id = StubGenStubId::chacha20Block_id; 4451 StubCodeMark mark(this, stub_id); 4452 address start = __ pc(); 4453 __ enter(); 4454 4455 int i, j; 4456 const Register state = c_rarg0; 4457 const Register keystream = c_rarg1; 4458 const Register loopCtr = r10; 4459 const Register tmpAddr = r11; 4460 const FloatRegister ctrAddOverlay = v28; 4461 const FloatRegister lrot8Tbl = v29; 4462 4463 // Organize SIMD registers in an array that facilitates 4464 // putting repetitive opcodes into loop structures. It is 4465 // important that each grouping of 4 registers is monotonically 4466 // increasing to support the requirements of multi-register 4467 // instructions (e.g. ld4r, st4, etc.) 4468 const FloatRegister workSt[16] = { 4469 v4, v5, v6, v7, v16, v17, v18, v19, 4470 v20, v21, v22, v23, v24, v25, v26, v27 4471 }; 4472 4473 // Pull in constant data. The first 16 bytes are the add overlay 4474 // which is applied to the vector holding the counter (state[12]). 4475 // The second 16 bytes is the index register for the 8-bit left 4476 // rotation tbl instruction. 
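    // For reference, one ChaCha20 quarter round on 32-bit words a, b, c, d
    // (RFC 7539, section 2.1) is, as a scalar sketch:
    //
    //   a += b; d ^= a; d = rotl32(d, 16);
    //   c += d; b ^= c; b = rotl32(b, 12);
    //   a += b; d ^= a; d = rotl32(d, 8);
    //   c += d; b ^= c; b = rotl32(b, 7);
    //
    // The loop below performs each of these steps for four quarter rounds
    // at a time across the aSet/bSet/cSet/dSet register groups, first in
    // the columnar and then in the diagonal arrangement.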
4477 __ adr(tmpAddr, L_cc20_const); 4478 __ ldpq(ctrAddOverlay, lrot8Tbl, Address(tmpAddr)); 4479 4480 // Load from memory and interlace across 16 SIMD registers, 4481 // With each word from memory being broadcast to all lanes of 4482 // each successive SIMD register. 4483 // Addr(0) -> All lanes in workSt[i] 4484 // Addr(4) -> All lanes workSt[i + 1], etc. 4485 __ mov(tmpAddr, state); 4486 for (i = 0; i < 16; i += 4) { 4487 __ ld4r(workSt[i], workSt[i + 1], workSt[i + 2], workSt[i + 3], __ T4S, 4488 __ post(tmpAddr, 16)); 4489 } 4490 __ addv(workSt[12], __ T4S, workSt[12], ctrAddOverlay); // Add ctr overlay 4491 4492 // Before entering the loop, create 5 4-register arrays. These 4493 // will hold the 4 registers that represent the a/b/c/d fields 4494 // in the quarter round operation. For instance the "b" field 4495 // for the first 4 quarter round operations is the set of v16/v17/v18/v19, 4496 // but in the second 4 quarter rounds it gets adjusted to v17/v18/v19/v16 4497 // since it is part of a diagonal organization. The aSet and scratch 4498 // register sets are defined at declaration time because they do not change 4499 // organization at any point during the 20-round processing. 4500 FloatRegister aSet[4] = { v4, v5, v6, v7 }; 4501 FloatRegister bSet[4]; 4502 FloatRegister cSet[4]; 4503 FloatRegister dSet[4]; 4504 FloatRegister scratch[4] = { v0, v1, v2, v3 }; 4505 4506 // Set up the 10 iteration loop and perform all 8 quarter round ops 4507 __ mov(loopCtr, 10); 4508 __ BIND(L_twoRounds); 4509 4510 // Set to columnar organization and do the following 4 quarter-rounds: 4511 // QUARTERROUND(0, 4, 8, 12) 4512 // QUARTERROUND(1, 5, 9, 13) 4513 // QUARTERROUND(2, 6, 10, 14) 4514 // QUARTERROUND(3, 7, 11, 15) 4515 __ cc20_set_qr_registers(bSet, workSt, 4, 5, 6, 7); 4516 __ cc20_set_qr_registers(cSet, workSt, 8, 9, 10, 11); 4517 __ cc20_set_qr_registers(dSet, workSt, 12, 13, 14, 15); 4518 4519 __ cc20_qr_add4(aSet, bSet); // a += b 4520 __ cc20_qr_xor4(dSet, aSet, dSet); // d ^= a 4521 __ cc20_qr_lrot4(dSet, dSet, 16, lrot8Tbl); // d <<<= 16 4522 4523 __ cc20_qr_add4(cSet, dSet); // c += d 4524 __ cc20_qr_xor4(bSet, cSet, scratch); // b ^= c (scratch) 4525 __ cc20_qr_lrot4(scratch, bSet, 12, lrot8Tbl); // b <<<= 12 4526 4527 __ cc20_qr_add4(aSet, bSet); // a += b 4528 __ cc20_qr_xor4(dSet, aSet, dSet); // d ^= a 4529 __ cc20_qr_lrot4(dSet, dSet, 8, lrot8Tbl); // d <<<= 8 4530 4531 __ cc20_qr_add4(cSet, dSet); // c += d 4532 __ cc20_qr_xor4(bSet, cSet, scratch); // b ^= c (scratch) 4533 __ cc20_qr_lrot4(scratch, bSet, 7, lrot8Tbl); // b <<<= 12 4534 4535 // Set to diagonal organization and do the next 4 quarter-rounds: 4536 // QUARTERROUND(0, 5, 10, 15) 4537 // QUARTERROUND(1, 6, 11, 12) 4538 // QUARTERROUND(2, 7, 8, 13) 4539 // QUARTERROUND(3, 4, 9, 14) 4540 __ cc20_set_qr_registers(bSet, workSt, 5, 6, 7, 4); 4541 __ cc20_set_qr_registers(cSet, workSt, 10, 11, 8, 9); 4542 __ cc20_set_qr_registers(dSet, workSt, 15, 12, 13, 14); 4543 4544 __ cc20_qr_add4(aSet, bSet); // a += b 4545 __ cc20_qr_xor4(dSet, aSet, dSet); // d ^= a 4546 __ cc20_qr_lrot4(dSet, dSet, 16, lrot8Tbl); // d <<<= 16 4547 4548 __ cc20_qr_add4(cSet, dSet); // c += d 4549 __ cc20_qr_xor4(bSet, cSet, scratch); // b ^= c (scratch) 4550 __ cc20_qr_lrot4(scratch, bSet, 12, lrot8Tbl); // b <<<= 12 4551 4552 __ cc20_qr_add4(aSet, bSet); // a += b 4553 __ cc20_qr_xor4(dSet, aSet, dSet); // d ^= a 4554 __ cc20_qr_lrot4(dSet, dSet, 8, lrot8Tbl); // d <<<= 8 4555 4556 __ cc20_qr_add4(cSet, dSet); // c += d 4557 __ cc20_qr_xor4(bSet, 
cSet, scratch); // b ^= c (scratch) 4558 __ cc20_qr_lrot4(scratch, bSet, 7, lrot8Tbl); // b <<<= 12 4559 4560 // Decrement and iterate 4561 __ sub(loopCtr, loopCtr, 1); 4562 __ cbnz(loopCtr, L_twoRounds); 4563 4564 __ mov(tmpAddr, state); 4565 4566 // Add the starting state back to the post-loop keystream 4567 // state. We read/interlace the state array from memory into 4568 // 4 registers similar to what we did in the beginning. Then 4569 // add the counter overlay onto workSt[12] at the end. 4570 for (i = 0; i < 16; i += 4) { 4571 __ ld4r(v0, v1, v2, v3, __ T4S, __ post(tmpAddr, 16)); 4572 __ addv(workSt[i], __ T4S, workSt[i], v0); 4573 __ addv(workSt[i + 1], __ T4S, workSt[i + 1], v1); 4574 __ addv(workSt[i + 2], __ T4S, workSt[i + 2], v2); 4575 __ addv(workSt[i + 3], __ T4S, workSt[i + 3], v3); 4576 } 4577 __ addv(workSt[12], __ T4S, workSt[12], ctrAddOverlay); // Add ctr overlay 4578 4579 // Write working state into the keystream buffer. This is accomplished 4580 // by taking the lane "i" from each of the four vectors and writing 4581 // it to consecutive 4-byte offsets, then post-incrementing by 16 and 4582 // repeating with the next 4 vectors until all 16 vectors have been used. 4583 // Then move to the next lane and repeat the process until all lanes have 4584 // been written. 4585 for (i = 0; i < 4; i++) { 4586 for (j = 0; j < 16; j += 4) { 4587 __ st4(workSt[j], workSt[j + 1], workSt[j + 2], workSt[j + 3], __ S, i, 4588 __ post(keystream, 16)); 4589 } 4590 } 4591 4592 __ mov(r0, 256); // Return length of output keystream 4593 __ leave(); 4594 __ ret(lr); 4595 4596 return start; 4597 } 4598 4599 // Helpers to schedule parallel operation bundles across vector 4600 // register sequences of size 2, 4 or 8. 4601 4602 // Implement various primitive computations across vector sequences 4603 4604 template<int N> 4605 void vs_addv(const VSeq<N>& v, Assembler::SIMD_Arrangement T, 4606 const VSeq<N>& v1, const VSeq<N>& v2) { 4607 // output must not be constant 4608 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4609 // output cannot overwrite pending inputs 4610 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4611 assert(!vs_write_before_read(v, v2), "output overwrites input"); 4612 for (int i = 0; i < N; i++) { 4613 __ addv(v[i], T, v1[i], v2[i]); 4614 } 4615 } 4616 4617 template<int N> 4618 void vs_subv(const VSeq<N>& v, Assembler::SIMD_Arrangement T, 4619 const VSeq<N>& v1, const VSeq<N>& v2) { 4620 // output must not be constant 4621 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4622 // output cannot overwrite pending inputs 4623 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4624 assert(!vs_write_before_read(v, v2), "output overwrites input"); 4625 for (int i = 0; i < N; i++) { 4626 __ subv(v[i], T, v1[i], v2[i]); 4627 } 4628 } 4629 4630 template<int N> 4631 void vs_mulv(const VSeq<N>& v, Assembler::SIMD_Arrangement T, 4632 const VSeq<N>& v1, const VSeq<N>& v2) { 4633 // output must not be constant 4634 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4635 // output cannot overwrite pending inputs 4636 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4637 assert(!vs_write_before_read(v, v2), "output overwrites input"); 4638 for (int i = 0; i < N; i++) { 4639 __ mulv(v[i], T, v1[i], v2[i]); 4640 } 4641 } 4642 4643 template<int N> 4644 void vs_negr(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& 
v1) { 4645 // output must not be constant 4646 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4647 // output cannot overwrite pending inputs 4648 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4649 for (int i = 0; i < N; i++) { 4650 __ negr(v[i], T, v1[i]); 4651 } 4652 } 4653 4654 template<int N> 4655 void vs_sshr(const VSeq<N>& v, Assembler::SIMD_Arrangement T, 4656 const VSeq<N>& v1, int shift) { 4657 // output must not be constant 4658 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4659 // output cannot overwrite pending inputs 4660 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4661 for (int i = 0; i < N; i++) { 4662 __ sshr(v[i], T, v1[i], shift); 4663 } 4664 } 4665 4666 template<int N> 4667 void vs_andr(const VSeq<N>& v, const VSeq<N>& v1, const VSeq<N>& v2) { 4668 // output must not be constant 4669 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4670 // output cannot overwrite pending inputs 4671 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4672 assert(!vs_write_before_read(v, v2), "output overwrites input"); 4673 for (int i = 0; i < N; i++) { 4674 __ andr(v[i], __ T16B, v1[i], v2[i]); 4675 } 4676 } 4677 4678 template<int N> 4679 void vs_orr(const VSeq<N>& v, const VSeq<N>& v1, const VSeq<N>& v2) { 4680 // output must not be constant 4681 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4682 // output cannot overwrite pending inputs 4683 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4684 assert(!vs_write_before_read(v, v2), "output overwrites input"); 4685 for (int i = 0; i < N; i++) { 4686 __ orr(v[i], __ T16B, v1[i], v2[i]); 4687 } 4688 } 4689 4690 template<int N> 4691 void vs_notr(const VSeq<N>& v, const VSeq<N>& v1) { 4692 // output must not be constant 4693 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4694 // output cannot overwrite pending inputs 4695 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4696 for (int i = 0; i < N; i++) { 4697 __ notr(v[i], __ T16B, v1[i]); 4698 } 4699 } 4700 4701 template<int N> 4702 void vs_sqdmulh(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1, const VSeq<N>& v2) { 4703 // output must not be constant 4704 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4705 // output cannot overwrite pending inputs 4706 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4707 assert(!vs_write_before_read(v, v2), "output overwrites input"); 4708 for (int i = 0; i < N; i++) { 4709 __ sqdmulh(v[i], T, v1[i], v2[i]); 4710 } 4711 } 4712 4713 template<int N> 4714 void vs_mlsv(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1, VSeq<N>& v2) { 4715 // output must not be constant 4716 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4717 // output cannot overwrite pending inputs 4718 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4719 assert(!vs_write_before_read(v, v2), "output overwrites input"); 4720 for (int i = 0; i < N; i++) { 4721 __ mlsv(v[i], T, v1[i], v2[i]); 4722 } 4723 } 4724 4725 // load N/2 successive pairs of quadword values from memory in order 4726 // into N successive vector registers of the sequence via the 4727 // address supplied in base. 
4728 template<int N> 4729 void vs_ldpq(const VSeq<N>& v, Register base) { 4730 for (int i = 0; i < N; i += 2) { 4731 __ ldpq(v[i], v[i+1], Address(base, 32 * i)); 4732 } 4733 } 4734 4735 // load N/2 successive pairs of quadword values from memory in order 4736 // into N vector registers of the sequence via the address supplied 4737 // in base using post-increment addressing 4738 template<int N> 4739 void vs_ldpq_post(const VSeq<N>& v, Register base) { 4740 static_assert((N & (N - 1)) == 0, "sequence length must be even"); 4741 for (int i = 0; i < N; i += 2) { 4742 __ ldpq(v[i], v[i+1], __ post(base, 32)); 4743 } 4744 } 4745 4746 // store N successive vector registers of the sequence into N/2 4747 // successive pairs of quadword memory locations via the address 4748 // supplied in base using post-increment addressing 4749 template<int N> 4750 void vs_stpq_post(const VSeq<N>& v, Register base) { 4751 static_assert((N & (N - 1)) == 0, "sequence length must be even"); 4752 for (int i = 0; i < N; i += 2) { 4753 __ stpq(v[i], v[i+1], __ post(base, 32)); 4754 } 4755 } 4756 4757 // load N/2 pairs of quadword values from memory de-interleaved into 4758 // N vector registers 2 at a time via the address supplied in base 4759 // using post-increment addressing. 4760 template<int N> 4761 void vs_ld2_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) { 4762 static_assert((N & (N - 1)) == 0, "sequence length must be even"); 4763 for (int i = 0; i < N; i += 2) { 4764 __ ld2(v[i], v[i+1], T, __ post(base, 32)); 4765 } 4766 } 4767 4768 // store N vector registers interleaved into N/2 pairs of quadword 4769 // memory locations via the address supplied in base using 4770 // post-increment addressing. 4771 template<int N> 4772 void vs_st2_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) { 4773 static_assert((N & (N - 1)) == 0, "sequence length must be even"); 4774 for (int i = 0; i < N; i += 2) { 4775 __ st2(v[i], v[i+1], T, __ post(base, 32)); 4776 } 4777 } 4778 4779 // load N quadword values from memory de-interleaved into N vector 4780 // registers 3 elements at a time via the address supplied in base. 4781 template<int N> 4782 void vs_ld3(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) { 4783 static_assert(N == ((N / 3) * 3), "sequence length must be multiple of 3"); 4784 for (int i = 0; i < N; i += 3) { 4785 __ ld3(v[i], v[i+1], v[i+2], T, base); 4786 } 4787 } 4788 4789 // load N quadword values from memory de-interleaved into N vector 4790 // registers 3 elements at a time via the address supplied in base 4791 // using post-increment addressing. 
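  //
  // For example (illustrative only), with T == __ T8H a single iteration
  //
  //   __ ld3(v0, v1, v2, T, __ post(base, 48));
  //
  // reads 48 bytes and de-interleaves them so that 16-bit elements
  // 0, 3, 6, ... of memory land in v0, elements 1, 4, 7, ... in v1 and
  // elements 2, 5, 8, ... in v2, then advances base by 48.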
4792 template<int N> 4793 void vs_ld3_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) { 4794 static_assert(N == ((N / 3) * 3), "sequence length must be multiple of 3"); 4795 for (int i = 0; i < N; i += 3) { 4796 __ ld3(v[i], v[i+1], v[i+2], T, __ post(base, 48)); 4797 } 4798 } 4799 4800 // load N/2 pairs of quadword values from memory into N vector 4801 // registers via the address supplied in base with each pair indexed 4802 // using the the start offset plus the corresponding entry in the 4803 // offsets array 4804 template<int N> 4805 void vs_ldpq_indexed(const VSeq<N>& v, Register base, int start, int (&offsets)[N/2]) { 4806 for (int i = 0; i < N/2; i++) { 4807 __ ldpq(v[2*i], v[2*i+1], Address(base, start + offsets[i])); 4808 } 4809 } 4810 4811 // store N vector registers into N/2 pairs of quadword memory 4812 // locations via the address supplied in base with each pair indexed 4813 // using the the start offset plus the corresponding entry in the 4814 // offsets array 4815 template<int N> 4816 void vs_stpq_indexed(const VSeq<N>& v, Register base, int start, int offsets[N/2]) { 4817 for (int i = 0; i < N/2; i++) { 4818 __ stpq(v[2*i], v[2*i+1], Address(base, start + offsets[i])); 4819 } 4820 } 4821 4822 // load N single quadword values from memory into N vector registers 4823 // via the address supplied in base with each value indexed using 4824 // the the start offset plus the corresponding entry in the offsets 4825 // array 4826 template<int N> 4827 void vs_ldr_indexed(const VSeq<N>& v, Assembler::SIMD_RegVariant T, Register base, 4828 int start, int (&offsets)[N]) { 4829 for (int i = 0; i < N; i++) { 4830 __ ldr(v[i], T, Address(base, start + offsets[i])); 4831 } 4832 } 4833 4834 // store N vector registers into N single quadword memory locations 4835 // via the address supplied in base with each value indexed using 4836 // the the start offset plus the corresponding entry in the offsets 4837 // array 4838 template<int N> 4839 void vs_str_indexed(const VSeq<N>& v, Assembler::SIMD_RegVariant T, Register base, 4840 int start, int (&offsets)[N]) { 4841 for (int i = 0; i < N; i++) { 4842 __ str(v[i], T, Address(base, start + offsets[i])); 4843 } 4844 } 4845 4846 // load N/2 pairs of quadword values from memory de-interleaved into 4847 // N vector registers 2 at a time via the address supplied in base 4848 // with each pair indexed using the the start offset plus the 4849 // corresponding entry in the offsets array 4850 template<int N> 4851 void vs_ld2_indexed(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base, 4852 Register tmp, int start, int (&offsets)[N/2]) { 4853 for (int i = 0; i < N/2; i++) { 4854 __ add(tmp, base, start + offsets[i]); 4855 __ ld2(v[2*i], v[2*i+1], T, tmp); 4856 } 4857 } 4858 4859 // store N vector registers 2 at a time interleaved into N/2 pairs 4860 // of quadword memory locations via the address supplied in base 4861 // with each pair indexed using the the start offset plus the 4862 // corresponding entry in the offsets array 4863 template<int N> 4864 void vs_st2_indexed(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base, 4865 Register tmp, int start, int (&offsets)[N/2]) { 4866 for (int i = 0; i < N/2; i++) { 4867 __ add(tmp, base, start + offsets[i]); 4868 __ st2(v[2*i], v[2*i+1], T, tmp); 4869 } 4870 } 4871 4872 // Helper routines for various flavours of Montgomery multiply 4873 4874 // Perform 16 32-bit (4x4S) or 32 16-bit (4 x 8H) Montgomery 4875 // multiplications in parallel 4876 // 4877 4878 // See the 
montMul() method of the sun.security.provider.ML_DSA 4879 // class. 4880 // 4881 // Computes 4x4S results or 4x8H results 4882 // a = b * c * 2^MONT_R_BITS mod MONT_Q 4883 // Inputs: vb, vc - 4x4S or 4x8H vector register sequences 4884 // vq - 2x4S or 2x8H constants <MONT_Q, MONT_Q_INV_MOD_R> 4885 // Temps: vtmp - 4x4S or 4x8H vector sequence trashed after call 4886 // Outputs: va - 4x4S or 4x8H vector register sequences 4887 // vb, vc, vtmp and vq must all be disjoint 4888 // va must be disjoint from all other inputs/temps or must equal vc 4889 // va must have a non-zero delta i.e. it must not be a constant vseq. 4890 // n.b. MONT_R_BITS is 16 or 32, so the right shift by it is implicit.
4891 void vs_montmul4(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc, 4892 Assembler::SIMD_Arrangement T, 4893 const VSeq<4>& vtmp, const VSeq<2>& vq) { 4894 assert (T == __ T4S || T == __ T8H, "invalid arrangement for montmul"); 4895 assert(vs_disjoint(vb, vc), "vb and vc overlap"); 4896 assert(vs_disjoint(vb, vq), "vb and vq overlap"); 4897 assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap"); 4898
4899 assert(vs_disjoint(vc, vq), "vc and vq overlap"); 4900 assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap"); 4901
4902 assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap"); 4903
4904 assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal"); 4905 assert(vs_disjoint(va, vb), "va and vb overlap"); 4906 assert(vs_disjoint(va, vq), "va and vq overlap"); 4907 assert(vs_disjoint(va, vtmp), "va and vtmp overlap"); 4908 assert(!va.is_constant(), "output vector must identify 4 different registers"); 4909
4910 // schedule 4 streams of instructions across the vector sequences 4911 for (int i = 0; i < 4; i++) { 4912 __ sqdmulh(vtmp[i], T, vb[i], vc[i]); // aHigh = hi32(2 * b * c) 4913 __ mulv(va[i], T, vb[i], vc[i]); // aLow = lo32(b * c) 4914 } 4915
4916 for (int i = 0; i < 4; i++) { 4917 __ mulv(va[i], T, va[i], vq[0]); // m = aLow * qinv 4918 } 4919
4920 for (int i = 0; i < 4; i++) { 4921 __ sqdmulh(va[i], T, va[i], vq[1]); // n = hi32(2 * m * q) 4922 } 4923
4924 for (int i = 0; i < 4; i++) { 4925 __ shsubv(va[i], T, vtmp[i], va[i]); // a = (aHigh - n) / 2 4926 } 4927 } 4928
4929 // Perform 8 32-bit (2x4S) or 16 16-bit (2 x 8H) Montgomery 4930 // multiplications in parallel 4931 // 4932
4933 // See the montMul() method of the sun.security.provider.ML_DSA 4934 // class. 4935 // 4936 // Computes 2x4S results or 2x8H results 4937 // a = b * c * 2^MONT_R_BITS mod MONT_Q 4938 // Inputs: vb, vc - 2x4S or 2x8H vector register sequences 4939 // vq - 2x4S or 2x8H constants <MONT_Q, MONT_Q_INV_MOD_R> 4940 // Temps: vtmp - 2x4S or 2x8H vector sequence trashed after call 4941 // Outputs: va - 2x4S or 2x8H vector register sequences 4942 // vb, vc, vtmp and vq must all be disjoint 4943 // va must be disjoint from all other inputs/temps or must equal vc 4944 // va must have a non-zero delta i.e. it must not be a constant vseq. 4945 // n.b. MONT_R_BITS is 16 or 32, so the right shift by it is implicit.
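  // For reference, each lane of the four vector steps used by vs_montmul4
  // above and vs_montmul2 below computes an ordinary signed Montgomery
  // product. A scalar sketch for the 8H (ML-KEM) case, assuming the
  // constants q = 3329 and qinv = -3327 (q^-1 mod 2^16); the 4S (ML-DSA)
  // case is analogous with q = 8380417 and R = 2^32:
  //
  //   static int16_t montmul16(int16_t b, int16_t c) {
  //     const int32_t q = 3329, qinv = -3327;        // qinv == q^-1 mod 2^16
  //     int32_t prod = (int32_t)b * c;               // full 32-bit product
  //     int16_t m = (int16_t)((int16_t)prod * qinv); // m = lo16(prod) * qinv
  //     return (int16_t)((prod - m * q) >> 16);      // (prod - m * q) / R
  //   }
  //
  // The sqdmulh/shsubv pairing used below recovers the same high-half
  // arithmetic without needing widening multiplies.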
4946 void vs_montmul2(const VSeq<2>& va, const VSeq<2>& vb, const VSeq<2>& vc, 4947 Assembler::SIMD_Arrangement T, 4948 const VSeq<2>& vtmp, const VSeq<2>& vq) { 4949 assert (T == __ T4S || T == __ T8H, "invalid arrangement for montmul"); 4950 assert(vs_disjoint(vb, vc), "vb and vc overlap"); 4951 assert(vs_disjoint(vb, vq), "vb and vq overlap"); 4952 assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap"); 4953 4954 assert(vs_disjoint(vc, vq), "vc and vq overlap"); 4955 assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap"); 4956 4957 assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap"); 4958 4959 assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal"); 4960 assert(vs_disjoint(va, vb), "va and vb overlap"); 4961 assert(vs_disjoint(va, vq), "va and vq overlap"); 4962 assert(vs_disjoint(va, vtmp), "va and vtmp overlap"); 4963 assert(!va.is_constant(), "output vector must identify 2 different registers"); 4964 4965 // schedule 2 streams of instructions across the vector sequences 4966 for (int i = 0; i < 2; i++) { 4967 __ sqdmulh(vtmp[i], T, vb[i], vc[i]); // aHigh = hi32(2 * b * c) 4968 __ mulv(va[i], T, vb[i], vc[i]); // aLow = lo32(b * c) 4969 } 4970 4971 for (int i = 0; i < 2; i++) { 4972 __ mulv(va[i], T, va[i], vq[0]); // m = aLow * qinv 4973 } 4974 4975 for (int i = 0; i < 2; i++) { 4976 __ sqdmulh(va[i], T, va[i], vq[1]); // n = hi32(2 * m * q) 4977 } 4978 4979 for (int i = 0; i < 2; i++) { 4980 __ shsubv(va[i], T, vtmp[i], va[i]); // a = (aHigh - n) / 2 4981 } 4982 } 4983 4984 // Perform 16 16-bit Montgomery multiplications in parallel. 4985 void kyber_montmul16(const VSeq<2>& va, const VSeq<2>& vb, const VSeq<2>& vc, 4986 const VSeq<2>& vtmp, const VSeq<2>& vq) { 4987 // Use the helper routine to schedule a 2x8H Montgomery multiply. 4988 // It will assert that the register use is valid 4989 vs_montmul2(va, vb, vc, __ T8H, vtmp, vq); 4990 } 4991 4992 // Perform 32 16-bit Montgomery multiplications in parallel. 4993 void kyber_montmul32(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc, 4994 const VSeq<4>& vtmp, const VSeq<2>& vq) { 4995 // Use the helper routine to schedule a 4x8H Montgomery multiply. 4996 // It will assert that the register use is valid 4997 vs_montmul4(va, vb, vc, __ T8H, vtmp, vq); 4998 } 4999 5000 // Perform 64 16-bit Montgomery multiplications in parallel. 5001 void kyber_montmul64(const VSeq<8>& va, const VSeq<8>& vb, const VSeq<8>& vc, 5002 const VSeq<4>& vtmp, const VSeq<2>& vq) { 5003 // Schedule two successive 4x8H multiplies via the montmul helper 5004 // on the front and back halves of va, vb and vc. The helper will 5005 // assert that the register use has no overlap conflicts on each 5006 // individual call but we also need to ensure that the necessary 5007 // disjoint/equality constraints are met across both calls. 5008 5009 // vb, vc, vtmp and vq must be disjoint. 
va must either be 5010 // disjoint from all other registers or equal vc 5011
5012 assert(vs_disjoint(vb, vc), "vb and vc overlap"); 5013 assert(vs_disjoint(vb, vq), "vb and vq overlap"); 5014 assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap"); 5015
5016 assert(vs_disjoint(vc, vq), "vc and vq overlap"); 5017 assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap"); 5018
5019 assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap"); 5020
5021 assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal"); 5022 assert(vs_disjoint(va, vb), "va and vb overlap"); 5023 assert(vs_disjoint(va, vq), "va and vq overlap"); 5024 assert(vs_disjoint(va, vtmp), "va and vtmp overlap"); 5025
5026 // we multiply the front and back halves of each sequence 4 at a 5027 // time because 5028 // 5029 // 1) we are currently only able to get 4-way instruction 5030 // parallelism at best 5031 // 5032 // 2) we need registers for the constants in vq and temporary 5033 // scratch registers to hold intermediate results so vtmp can only 5034 // be a VSeq<4> which means we only have 4 scratch slots 5035
5036 vs_montmul4(vs_front(va), vs_front(vb), vs_front(vc), __ T8H, vtmp, vq); 5037 vs_montmul4(vs_back(va), vs_back(vb), vs_back(vc), __ T8H, vtmp, vq); 5038 } 5039
5040 void kyber_montmul32_sub_add(const VSeq<4>& va0, const VSeq<4>& va1, 5041 const VSeq<4>& vc, 5042 const VSeq<4>& vtmp, 5043 const VSeq<2>& vq) { 5044 // compute a = montmul(a1, c) 5045 kyber_montmul32(vc, va1, vc, vtmp, vq); 5046 // output a1 = a0 - a 5047 vs_subv(va1, __ T8H, va0, vc); 5048 // and a0 = a0 + a 5049 vs_addv(va0, __ T8H, va0, vc); 5050 } 5051
5052 void kyber_sub_add_montmul32(const VSeq<4>& va0, const VSeq<4>& va1, 5053 const VSeq<4>& vb, 5054 const VSeq<4>& vtmp1, 5055 const VSeq<4>& vtmp2, 5056 const VSeq<2>& vq) { 5057 // compute c = a0 - a1 5058 vs_subv(vtmp1, __ T8H, va0, va1); 5059 // output a0 = a0 + a1 5060 vs_addv(va0, __ T8H, va0, va1); 5061 // output a1 = b montmul c 5062 kyber_montmul32(va1, vtmp1, vb, vtmp2, vq); 5063 } 5064
5065 void load64shorts(const VSeq<8>& v, Register shorts) { 5066 vs_ldpq_post(v, shorts); 5067 } 5068
5069 void load32shorts(const VSeq<4>& v, Register shorts) { 5070 vs_ldpq_post(v, shorts); 5071 } 5072
5073 void store64shorts(VSeq<8> v, Register tmpAddr) { 5074 vs_stpq_post(v, tmpAddr); 5075 } 5076
5077 // Kyber NTT function. 5078 // Implements 5079 // static int implKyberNtt(short[] poly, short[] ntt_zetas) {} 5080 // 5081 // coeffs (short[256]) = c_rarg0 5082 // ntt_zetas (short[256]) = c_rarg1 5083 address generate_kyberNtt() { 5084
5085 __ align(CodeEntryAlignment); 5086 StubGenStubId stub_id = StubGenStubId::kyberNtt_id; 5087 StubCodeMark mark(this, stub_id); 5088 address start = __ pc(); 5089 __ enter(); 5090
5091 const Register coeffs = c_rarg0; 5092 const Register zetas = c_rarg1; 5093
5094 const Register kyberConsts = r10; 5095 const Register tmpAddr = r11; 5096
5097 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x8H inputs/outputs 5098 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3 5099 VSeq<2> vq(30); // n.b. constants overlap vs3 5100
5101 __ lea(kyberConsts, ExternalAddress((address) StubRoutines::aarch64::_kyberConsts)); 5102 // load the montmul constants 5103 vs_ldpq(vq, kyberConsts); 5104
5105 // Each level corresponds to an iteration of the outermost loop of the 5106 // Java method seilerNTT(int[] coeffs). There are some differences 5107 // from what is done in the seilerNTT() method, though: 5108 // 1.
The computation uses 16-bit signed values; we do not convert them 5109 // to ints here. 5110 // 2. The zetas are delivered in a bigger array: 128 zetas are stored in 5111 // this array for each level, which makes it easier to fill up the vector 5112 // registers. 5113 // 3. In the seilerNTT() method we use R = 2^20 for the Montgomery 5114 // multiplications (this is because that way there should not be any 5115 // overflow during the inverse NTT computation); here we use R = 2^16 so 5116 // that we can use 16-bit arithmetic in the vector unit. 5117 // 5118 // On each level, we fill up the vector registers in such a way that the 5119 // array elements that need to be multiplied by the zetas go into one 5120 // set of vector registers while the corresponding ones that don't need to 5121 // be multiplied go into another set. 5122 // We can do 32 Montgomery multiplications in parallel, using 12 vector 5123 // registers and interleaving the steps of 4 identical computations, 5124 // each done on 8 16-bit values per register. 5125
5126 // At levels 0-3 the coefficients multiplied by or added/subtracted 5127 // to the zetas occur in discrete blocks whose size is some multiple 5128 // of 32. 5129
5130 // level 0 5131 __ add(tmpAddr, coeffs, 256); 5132 load64shorts(vs1, tmpAddr); 5133 load64shorts(vs2, zetas); 5134 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5135 __ add(tmpAddr, coeffs, 0); 5136 load64shorts(vs1, tmpAddr); 5137 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5138 vs_addv(vs1, __ T8H, vs1, vs2); 5139 __ add(tmpAddr, coeffs, 0); 5140 vs_stpq_post(vs1, tmpAddr); 5141 __ add(tmpAddr, coeffs, 256); 5142 vs_stpq_post(vs3, tmpAddr); 5143 // restore montmul constants 5144 vs_ldpq(vq, kyberConsts); 5145 load64shorts(vs1, tmpAddr); 5146 load64shorts(vs2, zetas); 5147 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5148 __ add(tmpAddr, coeffs, 128); 5149 load64shorts(vs1, tmpAddr); 5150 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5151 vs_addv(vs1, __ T8H, vs1, vs2); 5152 __ add(tmpAddr, coeffs, 128); 5153 store64shorts(vs1, tmpAddr); 5154 __ add(tmpAddr, coeffs, 384); 5155 store64shorts(vs3, tmpAddr); 5156
5157 // level 1 5158 // restore montmul constants 5159 vs_ldpq(vq, kyberConsts); 5160 __ add(tmpAddr, coeffs, 128); 5161 load64shorts(vs1, tmpAddr); 5162 load64shorts(vs2, zetas); 5163 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5164 __ add(tmpAddr, coeffs, 0); 5165 load64shorts(vs1, tmpAddr); 5166 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5167 vs_addv(vs1, __ T8H, vs1, vs2); 5168 __ add(tmpAddr, coeffs, 0); 5169 store64shorts(vs1, tmpAddr); 5170 store64shorts(vs3, tmpAddr); 5171 vs_ldpq(vq, kyberConsts); 5172 __ add(tmpAddr, coeffs, 384); 5173 load64shorts(vs1, tmpAddr); 5174 load64shorts(vs2, zetas); 5175 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5176 __ add(tmpAddr, coeffs, 256); 5177 load64shorts(vs1, tmpAddr); 5178 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5179 vs_addv(vs1, __ T8H, vs1, vs2); 5180 __ add(tmpAddr, coeffs, 256); 5181 store64shorts(vs1, tmpAddr); 5182 store64shorts(vs3, tmpAddr); 5183
5184 // level 2 5185 vs_ldpq(vq, kyberConsts); 5186 int offsets1[4] = { 0, 32, 128, 160 }; 5187 vs_ldpq_indexed(vs1, coeffs, 64, offsets1); 5188 load64shorts(vs2, zetas); 5189 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5190 vs_ldpq_indexed(vs1, coeffs, 0, offsets1); 5191 // kyber_subv_addv64(); 5192 vs_subv(vs3, __ T8H, vs1, vs2); // n.b.
trashes vq 5193 vs_addv(vs1, __ T8H, vs1, vs2); 5194 __ add(tmpAddr, coeffs, 0); 5195 vs_stpq_post(vs_front(vs1), tmpAddr); 5196 vs_stpq_post(vs_front(vs3), tmpAddr); 5197 vs_stpq_post(vs_back(vs1), tmpAddr); 5198 vs_stpq_post(vs_back(vs3), tmpAddr); 5199 vs_ldpq(vq, kyberConsts); 5200 vs_ldpq_indexed(vs1, tmpAddr, 64, offsets1); 5201 load64shorts(vs2, zetas); 5202 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5203 vs_ldpq_indexed(vs1, coeffs, 256, offsets1); 5204 // kyber_subv_addv64(); 5205 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5206 vs_addv(vs1, __ T8H, vs1, vs2); 5207 __ add(tmpAddr, coeffs, 256); 5208 vs_stpq_post(vs_front(vs1), tmpAddr); 5209 vs_stpq_post(vs_front(vs3), tmpAddr); 5210 vs_stpq_post(vs_back(vs1), tmpAddr); 5211 vs_stpq_post(vs_back(vs3), tmpAddr); 5212
5213 // level 3 5214 vs_ldpq(vq, kyberConsts); 5215 int offsets2[4] = { 0, 64, 128, 192 }; 5216 vs_ldpq_indexed(vs1, coeffs, 32, offsets2); 5217 load64shorts(vs2, zetas); 5218 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5219 vs_ldpq_indexed(vs1, coeffs, 0, offsets2); 5220 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5221 vs_addv(vs1, __ T8H, vs1, vs2); 5222 vs_stpq_indexed(vs1, coeffs, 0, offsets2); 5223 vs_stpq_indexed(vs3, coeffs, 32, offsets2); 5224
5225 vs_ldpq(vq, kyberConsts); 5226 vs_ldpq_indexed(vs1, coeffs, 256 + 32, offsets2); 5227 load64shorts(vs2, zetas); 5228 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5229 vs_ldpq_indexed(vs1, coeffs, 256, offsets2); 5230 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5231 vs_addv(vs1, __ T8H, vs1, vs2); 5232 vs_stpq_indexed(vs1, coeffs, 256, offsets2); 5233 vs_stpq_indexed(vs3, coeffs, 256 + 32, offsets2); 5234
5235 // level 4 5236 // At level 4 coefficients occur in 8 discrete blocks of size 16 5237 // so they are loaded using an ldr at 8 distinct offsets. 5238
5239 vs_ldpq(vq, kyberConsts); 5240 int offsets3[8] = { 0, 32, 64, 96, 128, 160, 192, 224 }; 5241 vs_ldr_indexed(vs1, __ Q, coeffs, 16, offsets3); 5242 load64shorts(vs2, zetas); 5243 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5244 vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3); 5245 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5246 vs_addv(vs1, __ T8H, vs1, vs2); 5247 vs_str_indexed(vs1, __ Q, coeffs, 0, offsets3); 5248 vs_str_indexed(vs3, __ Q, coeffs, 16, offsets3); 5249
5250 vs_ldpq(vq, kyberConsts); 5251 vs_ldr_indexed(vs1, __ Q, coeffs, 256 + 16, offsets3); 5252 load64shorts(vs2, zetas); 5253 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5254 vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3); 5255 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5256 vs_addv(vs1, __ T8H, vs1, vs2); 5257 vs_str_indexed(vs1, __ Q, coeffs, 256, offsets3); 5258 vs_str_indexed(vs3, __ Q, coeffs, 256 + 16, offsets3); 5259
5260 // level 5 5261 // At level 5 related coefficients occur in discrete blocks of size 8 so 5262 // they need to be loaded interleaved using an ld2 operation with arrangement 2D.
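  // A sketch of the de-interleaving done by the T2D ld2 loads used for
  // this level: each block of 4 related coefficients occupies one
  // doubleword, and each ld2 pulls in two adjacent pairs of blocks,
  //
  //   memory: d0 d1 d2 d3   (each dn is one block of 4 shorts)
  //
  // depositing d0 and d2 in one vector register and d1 and d3 in the
  // next. The even registers of vs1 therefore collect the coefficients
  // that are only added/subtracted while the odd registers collect the
  // ones that are first multiplied by the zetas.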
5263 5264 vs_ldpq(vq, kyberConsts); 5265 int offsets4[4] = { 0, 32, 64, 96 }; 5266 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4); 5267 load32shorts(vs_front(vs2), zetas); 5268 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq); 5269 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4); 5270 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4); 5271 load32shorts(vs_front(vs2), zetas); 5272 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq); 5273 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4); 5274 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4); 5275 load32shorts(vs_front(vs2), zetas); 5276 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq); 5277 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4); 5278 5279 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4); 5280 load32shorts(vs_front(vs2), zetas); 5281 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq); 5282 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4); 5283 5284 // level 6 5285 // At level 6 related coefficients occur in discrete blocks of size 4 so 5286 // need to be loaded interleaved using an ld2 operation with arrangement 4S. 5287 5288 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4); 5289 load32shorts(vs_front(vs2), zetas); 5290 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq); 5291 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4); 5292 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4); 5293 // __ ldpq(v18, v19, __ post(zetas, 32)); 5294 load32shorts(vs_front(vs2), zetas); 5295 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq); 5296 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4); 5297 5298 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4); 5299 load32shorts(vs_front(vs2), zetas); 5300 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq); 5301 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4); 5302 5303 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4); 5304 load32shorts(vs_front(vs2), zetas); 5305 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq); 5306 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4); 5307 5308 __ leave(); // required for proper stackwalking of RuntimeStub frame 5309 __ mov(r0, zr); // return 0 5310 __ ret(lr); 5311 5312 return start; 5313 } 5314 5315 // Kyber Inverse NTT function 5316 // Implements 5317 // static int implKyberInverseNtt(short[] poly, short[] zetas) {} 5318 // 5319 // coeffs (short[256]) = c_rarg0 5320 // ntt_zetas (short[256]) = c_rarg1 5321 address generate_kyberInverseNtt() { 5322 5323 __ align(CodeEntryAlignment); 5324 StubGenStubId stub_id = StubGenStubId::kyberInverseNtt_id; 5325 StubCodeMark mark(this, stub_id); 5326 address start = __ pc(); 5327 __ enter(); 5328 5329 const Register coeffs = c_rarg0; 5330 const Register zetas = c_rarg1; 5331 5332 const Register kyberConsts = r10; 5333 const Register tmpAddr = r11; 5334 const Register tmpAddr2 = c_rarg2; 5335 5336 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x8H inputs/outputs 5337 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3 5338 VSeq<2> vq(30); // n.b. 
constants overlap vs3 5339
5340 __ lea(kyberConsts, 5341 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts)); 5342
5343 // level 0 5344 // At level 0 related coefficients occur in discrete blocks of size 4 so 5345 // they need to be loaded interleaved using an ld2 operation with arrangement 4S. 5346
5347 vs_ldpq(vq, kyberConsts); 5348 int offsets4[4] = { 0, 32, 64, 96 }; 5349 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4); 5350 load32shorts(vs_front(vs2), zetas); 5351 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1), 5352 vs_front(vs2), vs_back(vs2), vtmp, vq); 5353 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4); 5354 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4); 5355 load32shorts(vs_front(vs2), zetas); 5356 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1), 5357 vs_front(vs2), vs_back(vs2), vtmp, vq); 5358 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4); 5359 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4); 5360 load32shorts(vs_front(vs2), zetas); 5361 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1), 5362 vs_front(vs2), vs_back(vs2), vtmp, vq); 5363 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4); 5364 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4); 5365 load32shorts(vs_front(vs2), zetas); 5366 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1), 5367 vs_front(vs2), vs_back(vs2), vtmp, vq); 5368 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4); 5369
5370 // level 1 5371 // At level 1 related coefficients occur in discrete blocks of size 8 so 5372 // they need to be loaded interleaved using an ld2 operation with arrangement 2D. 5373
5374 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4); 5375 load32shorts(vs_front(vs2), zetas); 5376 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1), 5377 vs_front(vs2), vs_back(vs2), vtmp, vq); 5378 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4); 5379 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4); 5380 load32shorts(vs_front(vs2), zetas); 5381 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1), 5382 vs_front(vs2), vs_back(vs2), vtmp, vq); 5383 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4); 5384
5385 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4); 5386 load32shorts(vs_front(vs2), zetas); 5387 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1), 5388 vs_front(vs2), vs_back(vs2), vtmp, vq); 5389 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4); 5390 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4); 5391 load32shorts(vs_front(vs2), zetas); 5392 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1), 5393 vs_front(vs2), vs_back(vs2), vtmp, vq); 5394 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4); 5395
5396 // level 2 5397 // At level 2 coefficients occur in 8 discrete blocks of size 16 5398 // so they are loaded using an ldr at 8 distinct offsets. 5399
5400 int offsets3[8] = { 0, 32, 64, 96, 128, 160, 192, 224 }; 5401 vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3); 5402 vs_ldr_indexed(vs2, __ Q, coeffs, 16, offsets3); 5403 vs_addv(vs3, __ T8H, vs1, vs2); // n.b.
trashes vq 5404 vs_subv(vs1, __ T8H, vs1, vs2); 5405 vs_str_indexed(vs3, __ Q, coeffs, 0, offsets3); 5406 load64shorts(vs2, zetas); 5407 vs_ldpq(vq, kyberConsts); 5408 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5409 vs_str_indexed(vs2, __ Q, coeffs, 16, offsets3); 5410 5411 vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3); 5412 vs_ldr_indexed(vs2, __ Q, coeffs, 256 + 16, offsets3); 5413 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5414 vs_subv(vs1, __ T8H, vs1, vs2); 5415 vs_str_indexed(vs3, __ Q, coeffs, 256, offsets3); 5416 load64shorts(vs2, zetas); 5417 vs_ldpq(vq, kyberConsts); 5418 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5419 vs_str_indexed(vs2, __ Q, coeffs, 256 + 16, offsets3); 5420 5421 // Barrett reduction at indexes where overflow may happen 5422 5423 // load q and the multiplier for the Barrett reduction 5424 __ add(tmpAddr, kyberConsts, 16); 5425 vs_ldpq(vq, tmpAddr); 5426 5427 VSeq<8> vq1 = VSeq<8>(vq[0], 0); // 2 constant 8 sequences 5428 VSeq<8> vq2 = VSeq<8>(vq[1], 0); // for above two kyber constants 5429 VSeq<8> vq3 = VSeq<8>(v29, 0); // 3rd sequence for const montmul 5430 vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3); 5431 vs_sqdmulh(vs2, __ T8H, vs1, vq2); 5432 vs_sshr(vs2, __ T8H, vs2, 11); 5433 vs_mlsv(vs1, __ T8H, vs2, vq1); 5434 vs_str_indexed(vs1, __ Q, coeffs, 0, offsets3); 5435 vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3); 5436 vs_sqdmulh(vs2, __ T8H, vs1, vq2); 5437 vs_sshr(vs2, __ T8H, vs2, 11); 5438 vs_mlsv(vs1, __ T8H, vs2, vq1); 5439 vs_str_indexed(vs1, __ Q, coeffs, 256, offsets3); 5440 5441 // level 3 5442 // From level 3 upwards coefficients occur in discrete blocks whose size is 5443 // some multiple of 32 so can be loaded using ldpq and suitable indexes. 5444 5445 int offsets2[4] = { 0, 64, 128, 192 }; 5446 vs_ldpq_indexed(vs1, coeffs, 0, offsets2); 5447 vs_ldpq_indexed(vs2, coeffs, 32, offsets2); 5448 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5449 vs_subv(vs1, __ T8H, vs1, vs2); 5450 vs_stpq_indexed(vs3, coeffs, 0, offsets2); 5451 load64shorts(vs2, zetas); 5452 vs_ldpq(vq, kyberConsts); 5453 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5454 vs_stpq_indexed(vs2, coeffs, 32, offsets2); 5455 5456 vs_ldpq_indexed(vs1, coeffs, 256, offsets2); 5457 vs_ldpq_indexed(vs2, coeffs, 256 + 32, offsets2); 5458 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5459 vs_subv(vs1, __ T8H, vs1, vs2); 5460 vs_stpq_indexed(vs3, coeffs, 256, offsets2); 5461 load64shorts(vs2, zetas); 5462 vs_ldpq(vq, kyberConsts); 5463 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5464 vs_stpq_indexed(vs2, coeffs, 256 + 32, offsets2); 5465 5466 // level 4 5467 5468 int offsets1[4] = { 0, 32, 128, 160 }; 5469 vs_ldpq_indexed(vs1, coeffs, 0, offsets1); 5470 vs_ldpq_indexed(vs2, coeffs, 64, offsets1); 5471 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5472 vs_subv(vs1, __ T8H, vs1, vs2); 5473 vs_stpq_indexed(vs3, coeffs, 0, offsets1); 5474 load64shorts(vs2, zetas); 5475 vs_ldpq(vq, kyberConsts); 5476 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5477 vs_stpq_indexed(vs2, coeffs, 64, offsets1); 5478 5479 vs_ldpq_indexed(vs1, coeffs, 256, offsets1); 5480 vs_ldpq_indexed(vs2, coeffs, 256 + 64, offsets1); 5481 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. 
trashes vq 5482 vs_subv(vs1, __ T8H, vs1, vs2); 5483 vs_stpq_indexed(vs3, coeffs, 256, offsets1); 5484 load64shorts(vs2, zetas); 5485 vs_ldpq(vq, kyberConsts); 5486 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5487 vs_stpq_indexed(vs2, coeffs, 256 + 64, offsets1); 5488 5489 // level 5 5490 5491 __ add(tmpAddr, coeffs, 0); 5492 load64shorts(vs1, tmpAddr); 5493 __ add(tmpAddr, coeffs, 128); 5494 load64shorts(vs2, tmpAddr); 5495 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5496 vs_subv(vs1, __ T8H, vs1, vs2); 5497 __ add(tmpAddr, coeffs, 0); 5498 store64shorts(vs3, tmpAddr); 5499 load64shorts(vs2, zetas); 5500 vs_ldpq(vq, kyberConsts); 5501 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5502 __ add(tmpAddr, coeffs, 128); 5503 store64shorts(vs2, tmpAddr); 5504 5505 load64shorts(vs1, tmpAddr); 5506 __ add(tmpAddr, coeffs, 384); 5507 load64shorts(vs2, tmpAddr); 5508 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5509 vs_subv(vs1, __ T8H, vs1, vs2); 5510 __ add(tmpAddr, coeffs, 256); 5511 store64shorts(vs3, tmpAddr); 5512 load64shorts(vs2, zetas); 5513 vs_ldpq(vq, kyberConsts); 5514 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5515 __ add(tmpAddr, coeffs, 384); 5516 store64shorts(vs2, tmpAddr); 5517 5518 // Barrett reduction at indexes where overflow may happen 5519 5520 // load q and the multiplier for the Barrett reduction 5521 __ add(tmpAddr, kyberConsts, 16); 5522 vs_ldpq(vq, tmpAddr); 5523 5524 int offsets0[2] = { 0, 256 }; 5525 vs_ldpq_indexed(vs_front(vs1), coeffs, 0, offsets0); 5526 vs_sqdmulh(vs2, __ T8H, vs1, vq2); 5527 vs_sshr(vs2, __ T8H, vs2, 11); 5528 vs_mlsv(vs1, __ T8H, vs2, vq1); 5529 vs_stpq_indexed(vs_front(vs1), coeffs, 0, offsets0); 5530 5531 // level 6 5532 5533 __ add(tmpAddr, coeffs, 0); 5534 load64shorts(vs1, tmpAddr); 5535 __ add(tmpAddr, coeffs, 256); 5536 load64shorts(vs2, tmpAddr); 5537 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5538 vs_subv(vs1, __ T8H, vs1, vs2); 5539 __ add(tmpAddr, coeffs, 0); 5540 store64shorts(vs3, tmpAddr); 5541 load64shorts(vs2, zetas); 5542 vs_ldpq(vq, kyberConsts); 5543 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5544 __ add(tmpAddr, coeffs, 256); 5545 store64shorts(vs2, tmpAddr); 5546 5547 __ add(tmpAddr, coeffs, 128); 5548 load64shorts(vs1, tmpAddr); 5549 __ add(tmpAddr, coeffs, 384); 5550 load64shorts(vs2, tmpAddr); 5551 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. 
trashes vq 5552 vs_subv(vs1, __ T8H, vs1, vs2); 5553 __ add(tmpAddr, coeffs, 128); 5554 store64shorts(vs3, tmpAddr); 5555 load64shorts(vs2, zetas); 5556 vs_ldpq(vq, kyberConsts); 5557 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5558 __ add(tmpAddr, coeffs, 384); 5559 store64shorts(vs2, tmpAddr); 5560 5561 // multiply by 2^-n 5562 5563 // load toMont(2^-n mod q) 5564 __ add(tmpAddr, kyberConsts, 48); 5565 __ ldr(v29, __ Q, tmpAddr); 5566 5567 vs_ldpq(vq, kyberConsts); 5568 __ add(tmpAddr, coeffs, 0); 5569 load64shorts(vs1, tmpAddr); 5570 kyber_montmul64(vs2, vs1, vq3, vtmp, vq); 5571 __ add(tmpAddr, coeffs, 0); 5572 store64shorts(vs2, tmpAddr); 5573 5574 // now tmpAddr contains coeffs + 128 because store64shorts adjusted it so 5575 load64shorts(vs1, tmpAddr); 5576 kyber_montmul64(vs2, vs1, vq3, vtmp, vq); 5577 __ add(tmpAddr, coeffs, 128); 5578 store64shorts(vs2, tmpAddr); 5579 5580 // now tmpAddr contains coeffs + 256 5581 load64shorts(vs1, tmpAddr); 5582 kyber_montmul64(vs2, vs1, vq3, vtmp, vq); 5583 __ add(tmpAddr, coeffs, 256); 5584 store64shorts(vs2, tmpAddr); 5585 5586 // now tmpAddr contains coeffs + 384 5587 load64shorts(vs1, tmpAddr); 5588 kyber_montmul64(vs2, vs1, vq3, vtmp, vq); 5589 __ add(tmpAddr, coeffs, 384); 5590 store64shorts(vs2, tmpAddr); 5591 5592 __ leave(); // required for proper stackwalking of RuntimeStub frame 5593 __ mov(r0, zr); // return 0 5594 __ ret(lr); 5595 5596 return start; 5597 } 5598 5599 // Kyber multiply polynomials in the NTT domain. 5600 // Implements 5601 // static int implKyberNttMult( 5602 // short[] result, short[] ntta, short[] nttb, short[] zetas) {} 5603 // 5604 // result (short[256]) = c_rarg0 5605 // ntta (short[256]) = c_rarg1 5606 // nttb (short[256]) = c_rarg2 5607 // zetas (short[128]) = c_rarg3 5608 address generate_kyberNttMult() { 5609 5610 __ align(CodeEntryAlignment); 5611 StubGenStubId stub_id = StubGenStubId::kyberNttMult_id; 5612 StubCodeMark mark(this, stub_id); 5613 address start = __ pc(); 5614 __ enter(); 5615 5616 const Register result = c_rarg0; 5617 const Register ntta = c_rarg1; 5618 const Register nttb = c_rarg2; 5619 const Register zetas = c_rarg3; 5620 5621 const Register kyberConsts = r10; 5622 const Register limit = r11; 5623 5624 VSeq<4> vs1(0), vs2(4); // 4 sets of 8x8H inputs/outputs/tmps 5625 VSeq<4> vs3(16), vs4(20); 5626 VSeq<2> vq(30); // pair of constants for montmul: q, qinv 5627 VSeq<2> vz(28); // pair of zetas 5628 VSeq<4> vc(27, 0); // constant sequence for montmul: montRSquareModQ 5629 5630 __ lea(kyberConsts, 5631 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts)); 5632 5633 Label kyberNttMult_loop; 5634 5635 __ add(limit, result, 512); 5636 5637 // load q and qinv 5638 vs_ldpq(vq, kyberConsts); 5639 5640 // load R^2 mod q (to convert back from Montgomery representation) 5641 __ add(kyberConsts, kyberConsts, 64); 5642 __ ldr(v27, __ Q, kyberConsts); 5643 5644 __ BIND(kyberNttMult_loop); 5645 5646 // load 16 zetas 5647 vs_ldpq_post(vz, zetas); 5648 5649 // load 2 sets of 32 coefficients from the two input arrays 5650 // interleaved as shorts. i.e. pairs of shorts adjacent in memory 5651 // are striped across pairs of vector registers 5652 vs_ld2_post(vs_front(vs1), __ T8H, ntta); // <a0, a1> x 8H 5653 vs_ld2_post(vs_back(vs1), __ T8H, nttb); // <b0, b1> x 8H 5654 vs_ld2_post(vs_front(vs4), __ T8H, ntta); // <a2, a3> x 8H 5655 vs_ld2_post(vs_back(vs4), __ T8H, nttb); // <b2, b3> x 8H 5656 5657 // compute 4 montmul cross-products for pairs (a0,a1) and (b0,b1) 5658 // i.e. 
montmul the first and second halves of vs1 in order and 5659 // then with one sequence reversed storing the two results in vs3 5660 // 5661 // vs3[0] <- montmul(a0, b0) 5662 // vs3[1] <- montmul(a1, b1) 5663 // vs3[2] <- montmul(a0, b1) 5664 // vs3[3] <- montmul(a1, b0) 5665 kyber_montmul16(vs_front(vs3), vs_front(vs1), vs_back(vs1), vs_front(vs2), vq); 5666 kyber_montmul16(vs_back(vs3), 5667 vs_front(vs1), vs_reverse(vs_back(vs1)), vs_back(vs2), vq); 5668 5669 // compute 4 montmul cross-products for pairs (a2,a3) and (b2,b3) 5670 // i.e. montmul the first and second halves of vs4 in order and 5671 // then with one sequence reversed storing the two results in vs1 5672 // 5673 // vs1[0] <- montmul(a2, b2) 5674 // vs1[1] <- montmul(a3, b3) 5675 // vs1[2] <- montmul(a2, b3) 5676 // vs1[3] <- montmul(a3, b2) 5677 kyber_montmul16(vs_front(vs1), vs_front(vs4), vs_back(vs4), vs_front(vs2), vq); 5678 kyber_montmul16(vs_back(vs1), 5679 vs_front(vs4), vs_reverse(vs_back(vs4)), vs_back(vs2), vq); 5680 5681 // montmul result 2 of each cross-product i.e. (a1*b1, a3*b3) by a zeta. 5682 // We can schedule two montmuls at a time if we use a suitable vector 5683 // sequence <vs3[1], vs1[1]>. 5684 int delta = vs1[1]->encoding() - vs3[1]->encoding(); 5685 VSeq<2> vs5(vs3[1], delta); 5686 5687 // vs3[1] <- montmul(montmul(a1, b1), z0) 5688 // vs1[1] <- montmul(montmul(a3, b3), z1) 5689 kyber_montmul16(vs5, vz, vs5, vs_front(vs2), vq); 5690 5691 // add results in pairs storing in vs3 5692 // vs3[0] <- montmul(a0, b0) + montmul(montmul(a1, b1), z0); 5693 // vs3[1] <- montmul(a0, b1) + montmul(a1, b0); 5694 vs_addv(vs_front(vs3), __ T8H, vs_even(vs3), vs_odd(vs3)); 5695 5696 // vs3[2] <- montmul(a2, b2) + montmul(montmul(a3, b3), z1); 5697 // vs3[3] <- montmul(a2, b3) + montmul(a3, b2); 5698 vs_addv(vs_back(vs3), __ T8H, vs_even(vs1), vs_odd(vs1)); 5699 5700 // vs1 <- montmul(vs3, montRSquareModQ) 5701 kyber_montmul32(vs1, vs3, vc, vs2, vq); 5702 5703 // store back the two pairs of result vectors de-interleaved as 8H elements 5704 // i.e. storing each pairs of shorts striped across a register pair adjacent 5705 // in memory 5706 vs_st2_post(vs1, __ T8H, result); 5707 5708 __ cmp(result, limit); 5709 __ br(Assembler::NE, kyberNttMult_loop); 5710 5711 __ leave(); // required for proper stackwalking of RuntimeStub frame 5712 __ mov(r0, zr); // return 0 5713 __ ret(lr); 5714 5715 return start; 5716 } 5717 5718 // Kyber add 2 polynomials. 5719 // Implements 5720 // static int implKyberAddPoly(short[] result, short[] a, short[] b) {} 5721 // 5722 // result (short[256]) = c_rarg0 5723 // a (short[256]) = c_rarg1 5724 // b (short[256]) = c_rarg2 5725 address generate_kyberAddPoly_2() { 5726 5727 __ align(CodeEntryAlignment); 5728 StubGenStubId stub_id = StubGenStubId::kyberAddPoly_2_id; 5729 StubCodeMark mark(this, stub_id); 5730 address start = __ pc(); 5731 __ enter(); 5732 5733 const Register result = c_rarg0; 5734 const Register a = c_rarg1; 5735 const Register b = c_rarg2; 5736 5737 const Register kyberConsts = r11; 5738 5739 // We sum 256 sets of values in total i.e. 32 x 8H quadwords. 5740 // So, we can load, add and store the data in 3 groups of 11, 5741 // 11 and 10 at a time i.e. we need to map sets of 10 or 11 5742 // registers. A further constraint is that the mapping needs 5743 // to skip callee saves. So, we allocate the register 5744 // sequences using two 8 sequences, two 2 sequences and two 5745 // single registers. 
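  // In scalar terms the loop below computes, for each of the 256
  // coefficients (a sketch; q denotes the Kyber modulus loaded from
  // kyberConsts + 16),
  //
  //   result[i] = (short)(a[i] + b[i] + q);
  //
  // i.e. a plain lane-wise addition with q added in, presumably to keep
  // the sums non-negative; no modular reduction is performed here.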
5746 VSeq<8> vs1_1(0); 5747 VSeq<2> vs1_2(16); 5748 FloatRegister vs1_3 = v28; 5749 VSeq<8> vs2_1(18); 5750 VSeq<2> vs2_2(26); 5751 FloatRegister vs2_3 = v29; 5752 5753 // two constant vector sequences 5754 VSeq<8> vc_1(31, 0); 5755 VSeq<2> vc_2(31, 0); 5756 5757 FloatRegister vc_3 = v31; 5758 __ lea(kyberConsts, 5759 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts)); 5760 5761 __ ldr(vc_3, __ Q, Address(kyberConsts, 16)); // q 5762 for (int i = 0; i < 3; i++) { 5763 // load 80 or 88 values from a into vs1_1/2/3 5764 vs_ldpq_post(vs1_1, a); 5765 vs_ldpq_post(vs1_2, a); 5766 if (i < 2) { 5767 __ ldr(vs1_3, __ Q, __ post(a, 16)); 5768 } 5769 // load 80 or 88 values from b into vs2_1/2/3 5770 vs_ldpq_post(vs2_1, b); 5771 vs_ldpq_post(vs2_2, b); 5772 if (i < 2) { 5773 __ ldr(vs2_3, __ Q, __ post(b, 16)); 5774 } 5775 // sum 80 or 88 values across vs1 and vs2 into vs1 5776 vs_addv(vs1_1, __ T8H, vs1_1, vs2_1); 5777 vs_addv(vs1_2, __ T8H, vs1_2, vs2_2); 5778 if (i < 2) { 5779 __ addv(vs1_3, __ T8H, vs1_3, vs2_3); 5780 } 5781 // add constant to all 80 or 88 results 5782 vs_addv(vs1_1, __ T8H, vs1_1, vc_1); 5783 vs_addv(vs1_2, __ T8H, vs1_2, vc_2); 5784 if (i < 2) { 5785 __ addv(vs1_3, __ T8H, vs1_3, vc_3); 5786 } 5787 // store 80 or 88 values 5788 vs_stpq_post(vs1_1, result); 5789 vs_stpq_post(vs1_2, result); 5790 if (i < 2) { 5791 __ str(vs1_3, __ Q, __ post(result, 16)); 5792 } 5793 } 5794 5795 __ leave(); // required for proper stackwalking of RuntimeStub frame 5796 __ mov(r0, zr); // return 0 5797 __ ret(lr); 5798 5799 return start; 5800 } 5801 5802 // Kyber add 3 polynomials. 5803 // Implements 5804 // static int implKyberAddPoly(short[] result, short[] a, short[] b, short[] c) {} 5805 // 5806 // result (short[256]) = c_rarg0 5807 // a (short[256]) = c_rarg1 5808 // b (short[256]) = c_rarg2 5809 // c (short[256]) = c_rarg3 5810 address generate_kyberAddPoly_3() { 5811 5812 __ align(CodeEntryAlignment); 5813 StubGenStubId stub_id = StubGenStubId::kyberAddPoly_3_id; 5814 StubCodeMark mark(this, stub_id); 5815 address start = __ pc(); 5816 __ enter(); 5817 5818 const Register result = c_rarg0; 5819 const Register a = c_rarg1; 5820 const Register b = c_rarg2; 5821 const Register c = c_rarg3; 5822 5823 const Register kyberConsts = r11; 5824 5825 // As above we sum 256 sets of values in total i.e. 32 x 8H 5826 // quadwords. So, we can load, add and store the data in 3 5827 // groups of 11, 11 and 10 at a time i.e. we need to map sets 5828 // of 10 or 11 registers. A further constraint is that the 5829 // mapping needs to skip callee saves. So, we allocate the 5830 // register sequences using two 8 sequences, two 2 sequences 5831 // and two single registers. 
5832 VSeq<8> vs1_1(0); 5833 VSeq<2> vs1_2(16); 5834 FloatRegister vs1_3 = v28; 5835 VSeq<8> vs2_1(18); 5836 VSeq<2> vs2_2(26); 5837 FloatRegister vs2_3 = v29; 5838 5839 // two constant vector sequences 5840 VSeq<8> vc_1(31, 0); 5841 VSeq<2> vc_2(31, 0); 5842 5843 FloatRegister vc_3 = v31; 5844 5845 __ lea(kyberConsts, 5846 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts)); 5847 5848 __ ldr(vc_3, __ Q, Address(kyberConsts, 16)); // q 5849 for (int i = 0; i < 3; i++) { 5850 // load 80 or 88 values from a into vs1_1/2/3 5851 vs_ldpq_post(vs1_1, a); 5852 vs_ldpq_post(vs1_2, a); 5853 if (i < 2) { 5854 __ ldr(vs1_3, __ Q, __ post(a, 16)); 5855 } 5856 // load 80 or 88 values from b into vs2_1/2/3 5857 vs_ldpq_post(vs2_1, b); 5858 vs_ldpq_post(vs2_2, b); 5859 if (i < 2) { 5860 __ ldr(vs2_3, __ Q, __ post(b, 16)); 5861 } 5862 // sum 80 or 88 values across vs1 and vs2 into vs1 5863 vs_addv(vs1_1, __ T8H, vs1_1, vs2_1); 5864 vs_addv(vs1_2, __ T8H, vs1_2, vs2_2); 5865 if (i < 2) { 5866 __ addv(vs1_3, __ T8H, vs1_3, vs2_3); 5867 } 5868 // load 80 or 88 values from c into vs2_1/2/3 5869 vs_ldpq_post(vs2_1, c); 5870 vs_ldpq_post(vs2_2, c); 5871 if (i < 2) { 5872 __ ldr(vs2_3, __ Q, __ post(c, 16)); 5873 } 5874 // sum 80 or 88 values across vs1 and vs2 into vs1 5875 vs_addv(vs1_1, __ T8H, vs1_1, vs2_1); 5876 vs_addv(vs1_2, __ T8H, vs1_2, vs2_2); 5877 if (i < 2) { 5878 __ addv(vs1_3, __ T8H, vs1_3, vs2_3); 5879 } 5880 // add constant to all 80 or 88 results 5881 vs_addv(vs1_1, __ T8H, vs1_1, vc_1); 5882 vs_addv(vs1_2, __ T8H, vs1_2, vc_2); 5883 if (i < 2) { 5884 __ addv(vs1_3, __ T8H, vs1_3, vc_3); 5885 } 5886 // store 80 or 88 values 5887 vs_stpq_post(vs1_1, result); 5888 vs_stpq_post(vs1_2, result); 5889 if (i < 2) { 5890 __ str(vs1_3, __ Q, __ post(result, 16)); 5891 } 5892 } 5893 5894 __ leave(); // required for proper stackwalking of RuntimeStub frame 5895 __ mov(r0, zr); // return 0 5896 __ ret(lr); 5897 5898 return start; 5899 } 5900 5901 // Kyber parse XOF output to polynomial coefficient candidates 5902 // or decodePoly(12, ...). 5903 // Implements 5904 // static int implKyber12To16( 5905 // byte[] condensed, int index, short[] parsed, int parsedLength) {} 5906 // 5907 // (parsedLength or (parsedLength - 48) must be divisible by 64.) 5908 // 5909 // condensed (byte[]) = c_rarg0 5910 // condensedIndex = c_rarg1 5911 // parsed (short[112 or 256]) = c_rarg2 5912 // parsedLength (112 or 256) = c_rarg3 5913 address generate_kyber12To16() { 5914 Label L_F00, L_loop, L_end; 5915 5916 __ BIND(L_F00); 5917 __ emit_int64(0x0f000f000f000f00); 5918 __ emit_int64(0x0f000f000f000f00); 5919 5920 __ align(CodeEntryAlignment); 5921 StubGenStubId stub_id = StubGenStubId::kyber12To16_id; 5922 StubCodeMark mark(this, stub_id); 5923 address start = __ pc(); 5924 __ enter(); 5925 5926 const Register condensed = c_rarg0; 5927 const Register condensedOffs = c_rarg1; 5928 const Register parsed = c_rarg2; 5929 const Register parsedLength = c_rarg3; 5930 5931 const Register tmpAddr = r11; 5932 5933 // Data is input 96 bytes at a time i.e. in groups of 6 x 16B 5934 // quadwords so we need a 6 vector sequence for the inputs. 5935 // Parsing produces 64 shorts, employing two 8 vector 5936 // sequences to store and combine the intermediate data. 
5937 VSeq<6> vin(24); 5938 VSeq<8> va(0), vb(16); 5939
5940 __ adr(tmpAddr, L_F00); 5941 __ ldr(v31, __ Q, tmpAddr); // 8H times 0x0f00 5942 __ add(condensed, condensed, condensedOffs); 5943
5944 __ BIND(L_loop); 5945 // load 96 (6 x 16B) byte values 5946 vs_ld3_post(vin, __ T16B, condensed); 5947
5948 // The front half of sequence vin (vin[0], vin[1] and vin[2]) 5949 // holds 48 (16x3) contiguous bytes from memory striped 5950 // horizontally across each of the 16 byte lanes. Equivalently, 5951 // that is 16 pairs of 12-bit integers. Likewise the back half 5952 // holds the next 48 bytes in the same arrangement. 5953
5954 // Each vector in the front half can also be viewed as a vertical 5955 // strip across the 16 pairs of 12-bit integers. Each byte in 5956 // vin[0] stores the low 8 bits of the first int in a pair. Each 5957 // byte in vin[1] stores the high 4 bits of the first int and the 5958 // low 4 bits of the second int. Each byte in vin[2] stores the 5959 // high 8 bits of the second int. Likewise the vectors in the 5960 // second half. 5961
5962 // Converting the data to 16-bit shorts requires first of all 5963 // expanding each of the 6 x 16B vectors into 6 corresponding 5964 // pairs of 8H vectors. Mask, shift and add operations on the 5965 // resulting vector pairs can be used to combine 4 and 8 bit 5966 // parts of related 8H vector elements. 5967 // 5968 // The middle vectors (vin[1] and vin[4]) are actually expanded 5969 // twice: one copy is manipulated to provide the 4 bits (the low 5970 // nibble of the middle byte) that belong to the first short in a 5971 // pair and another copy to provide the 4 bits (the high nibble) 5972 // that belong to the second short in a pair. This is why the vector 5973 // sequences va and vb used to hold the expanded 8H elements are of length 8. 5974
5975 // Expand vin[0] into va[0:1], and vin[1] into va[2:3] and va[4:5] 5976 // n.b. target elements 2 and 3 duplicate elements 4 and 5 5977 __ ushll(va[0], __ T8H, vin[0], __ T8B, 0); 5978 __ ushll2(va[1], __ T8H, vin[0], __ T16B, 0); 5979 __ ushll(va[2], __ T8H, vin[1], __ T8B, 0); 5980 __ ushll2(va[3], __ T8H, vin[1], __ T16B, 0); 5981 __ ushll(va[4], __ T8H, vin[1], __ T8B, 0); 5982 __ ushll2(va[5], __ T8H, vin[1], __ T16B, 0); 5983
5984 // likewise expand vin[3] into vb[0:1], and vin[4] into vb[2:3] 5985 // and vb[4:5] 5986 __ ushll(vb[0], __ T8H, vin[3], __ T8B, 0); 5987 __ ushll2(vb[1], __ T8H, vin[3], __ T16B, 0); 5988 __ ushll(vb[2], __ T8H, vin[4], __ T8B, 0); 5989 __ ushll2(vb[3], __ T8H, vin[4], __ T16B, 0); 5990 __ ushll(vb[4], __ T8H, vin[4], __ T8B, 0); 5991 __ ushll2(vb[5], __ T8H, vin[4], __ T16B, 0); 5992
5993 // shift lo byte of copy 1 of the middle stripe into the high byte 5994 __ shl(va[2], __ T8H, va[2], 8); 5995 __ shl(va[3], __ T8H, va[3], 8); 5996 __ shl(vb[2], __ T8H, vb[2], 8); 5997 __ shl(vb[3], __ T8H, vb[3], 8); 5998
5999 // expand vin[2] into va[6:7] and vin[5] into vb[6:7] but this 6000 // time pre-shifted by 4 to ensure top bits of input 12-bit int 6001 // are in bit positions [4..11].
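  // In scalar terms, each byte triple (b0, b1, b2) of the input encodes
  // two 12-bit values which the mask/shift/add steps below reassemble as
  // (a sketch of the per-lane arithmetic):
  //
  //   s0 = b0 | ((b1 & 0x0f) << 8);   // lo 8 bits | hi 4 bits
  //   s1 = (b1 >> 4) | (b2 << 4);     // lo 4 bits | hi 8 bits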
6002 __ ushll(va[6], __ T8H, vin[2], __ T8B, 4); 6003 __ ushll2(va[7], __ T8H, vin[2], __ T16B, 4); 6004 __ ushll(vb[6], __ T8H, vin[5], __ T8B, 4); 6005 __ ushll2(vb[7], __ T8H, vin[5], __ T16B, 4); 6006 6007 // mask hi 4 bits of the 1st 12-bit int in a pair from copy1 and 6008 // shift lo 4 bits of the 2nd 12-bit int in a pair to the bottom of 6009 // copy2 6010 __ andr(va[2], __ T16B, va[2], v31); 6011 __ andr(va[3], __ T16B, va[3], v31); 6012 __ ushr(va[4], __ T8H, va[4], 4); 6013 __ ushr(va[5], __ T8H, va[5], 4); 6014 __ andr(vb[2], __ T16B, vb[2], v31); 6015 __ andr(vb[3], __ T16B, vb[3], v31); 6016 __ ushr(vb[4], __ T8H, vb[4], 4); 6017 __ ushr(vb[5], __ T8H, vb[5], 4); 6018 6019 // sum hi 4 bits and lo 8 bits of the 1st 12-bit int in each pair and 6020 // hi 8 bits plus lo 4 bits of the 2nd 12-bit int in each pair 6021 // n.b. the ordering ensures: i) inputs are consumed before they 6022 // are overwritten ii) the order of 16-bit results across successive 6023 // pairs of vectors in va and then vb reflects the order of the 6024 // corresponding 12-bit inputs 6025 __ addv(va[0], __ T8H, va[0], va[2]); 6026 __ addv(va[2], __ T8H, va[1], va[3]); 6027 __ addv(va[1], __ T8H, va[4], va[6]); 6028 __ addv(va[3], __ T8H, va[5], va[7]); 6029 __ addv(vb[0], __ T8H, vb[0], vb[2]); 6030 __ addv(vb[2], __ T8H, vb[1], vb[3]); 6031 __ addv(vb[1], __ T8H, vb[4], vb[6]); 6032 __ addv(vb[3], __ T8H, vb[5], vb[7]); 6033 6034 // store 64 results interleaved as shorts 6035 vs_st2_post(vs_front(va), __ T8H, parsed); 6036 vs_st2_post(vs_front(vb), __ T8H, parsed); 6037 6038 __ sub(parsedLength, parsedLength, 64); 6039 __ cmp(parsedLength, (u1)64); 6040 __ br(Assembler::GE, L_loop); 6041 __ cbz(parsedLength, L_end); 6042 6043 // if anything is left it should be a final 72 bytes of input 6044 // i.e. a final 48 12-bit values. so we handle this by loading 6045 // 48 bytes into all 16B lanes of front(vin) and only 24 6046 // bytes into the lower 8B lane of back(vin) 6047 vs_ld3_post(vs_front(vin), __ T16B, condensed); 6048 vs_ld3(vs_back(vin), __ T8B, condensed); 6049 6050 // Expand vin[0] into va[0:1], and vin[1] into va[2:3] and va[4:5] 6051 // n.b. target elements 2 and 3 of va duplicate elements 4 and 6052 // 5 and target element 2 of vb duplicates element 4. 6053 __ ushll(va[0], __ T8H, vin[0], __ T8B, 0); 6054 __ ushll2(va[1], __ T8H, vin[0], __ T16B, 0); 6055 __ ushll(va[2], __ T8H, vin[1], __ T8B, 0); 6056 __ ushll2(va[3], __ T8H, vin[1], __ T16B, 0); 6057 __ ushll(va[4], __ T8H, vin[1], __ T8B, 0); 6058 __ ushll2(va[5], __ T8H, vin[1], __ T16B, 0); 6059 6060 // This time expand just the lower 8 lanes 6061 __ ushll(vb[0], __ T8H, vin[3], __ T8B, 0); 6062 __ ushll(vb[2], __ T8H, vin[4], __ T8B, 0); 6063 __ ushll(vb[4], __ T8H, vin[4], __ T8B, 0); 6064 6065 // shift lo byte of copy 1 of the middle stripe into the high byte 6066 __ shl(va[2], __ T8H, va[2], 8); 6067 __ shl(va[3], __ T8H, va[3], 8); 6068 __ shl(vb[2], __ T8H, vb[2], 8); 6069 6070 // expand vin[2] into va[6:7] and lower 8 lanes of vin[5] into 6071 // vb[6] pre-shifted by 4 to ensure top bits of the input 12-bit 6072 // int are in bit positions [4..11]. 
6073 __ ushll(va[6], __ T8H, vin[2], __ T8B, 4); 6074 __ ushll2(va[7], __ T8H, vin[2], __ T16B, 4); 6075 __ ushll(vb[6], __ T8H, vin[5], __ T8B, 4); 6076
6077 // mask hi 4 bits of each 1st 12-bit int in pair from copy1 and 6078 // shift lo 4 bits of each 2nd 12-bit int in pair to bottom of 6079 // copy2 6080 __ andr(va[2], __ T16B, va[2], v31); 6081 __ andr(va[3], __ T16B, va[3], v31); 6082 __ ushr(va[4], __ T8H, va[4], 4); 6083 __ ushr(va[5], __ T8H, va[5], 4); 6084 __ andr(vb[2], __ T16B, vb[2], v31); 6085 __ ushr(vb[4], __ T8H, vb[4], 4); 6086 6087 6088
6089 // sum hi 4 bits and lo 8 bits of each 1st 12-bit int in pair and 6090 // hi 8 bits plus lo 4 bits of each 2nd 12-bit int in pair 6091
6092 // n.b. ordering ensures: i) inputs are consumed before they are 6093 // overwritten ii) order of 16-bit results across successive 6094 // pairs of vectors in va and then lower half of vb reflects order 6095 // of corresponding 12-bit inputs 6096 __ addv(va[0], __ T8H, va[0], va[2]); 6097 __ addv(va[2], __ T8H, va[1], va[3]); 6098 __ addv(va[1], __ T8H, va[4], va[6]); 6099 __ addv(va[3], __ T8H, va[5], va[7]); 6100 __ addv(vb[0], __ T8H, vb[0], vb[2]); 6101 __ addv(vb[1], __ T8H, vb[4], vb[6]); 6102
6103 // store 48 results interleaved as shorts 6104 vs_st2_post(vs_front(va), __ T8H, parsed); 6105 vs_st2_post(vs_front(vs_front(vb)), __ T8H, parsed); 6106
6107 __ BIND(L_end); 6108
6109 __ leave(); // required for proper stackwalking of RuntimeStub frame 6110 __ mov(r0, zr); // return 0 6111 __ ret(lr); 6112
6113 return start; 6114 } 6115
6116 // Kyber Barrett reduce function. 6117 // Implements 6118 // static int implKyberBarrettReduce(short[] coeffs) {} 6119 // 6120 // coeffs (short[256]) = c_rarg0 6121 address generate_kyberBarrettReduce() { 6122
6123 __ align(CodeEntryAlignment); 6124 StubGenStubId stub_id = StubGenStubId::kyberBarrettReduce_id; 6125 StubCodeMark mark(this, stub_id); 6126 address start = __ pc(); 6127 __ enter(); 6128
6129 const Register coeffs = c_rarg0; 6130
6131 const Register kyberConsts = r10; 6132 const Register result = r11; 6133
6134 // As above we process 256 sets of values in total i.e. 32 x 6135 // 8H quadwords. So, we can load, add and store the data in 3 6136 // groups of 11, 11 and 10 at a time i.e. we need to map sets 6137 // of 10 or 11 registers. A further constraint is that the 6138 // mapping needs to skip callee saves. So, we allocate the 6139 // register sequences using two 8 sequences, two 2 sequences 6140 // and two single registers.
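  // In scalar terms each 16-bit lane is transformed below as (a sketch;
  // v stands for the Barrett multiplier stored next to q in the constants
  // area, roughly 2^26 / q):
  //
  //   int16_t t = (int16_t)(((int32_t)a * v) >> 26);  // quotient estimate
  //   a = (int16_t)(a - t * q);                       // subtract t * q
  //
  // leaving a congruent to the original value mod q but pulled back into
  // a small range.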
6141 VSeq<8> vs1_1(0); 6142 VSeq<2> vs1_2(16); 6143 FloatRegister vs1_3 = v28; 6144 VSeq<8> vs2_1(18); 6145 VSeq<2> vs2_2(26); 6146 FloatRegister vs2_3 = v29; 6147 6148 // we also need a pair of corresponding constant sequences 6149 6150 VSeq<8> vc1_1(30, 0); 6151 VSeq<2> vc1_2(30, 0); 6152 FloatRegister vc1_3 = v30; // for kyber_q 6153 6154 VSeq<8> vc2_1(31, 0); 6155 VSeq<2> vc2_2(31, 0); 6156 FloatRegister vc2_3 = v31; // for kyberBarrettMultiplier 6157 6158 __ add(result, coeffs, 0); 6159 __ lea(kyberConsts, 6160 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts)); 6161 6162 // load q and the multiplier for the Barrett reduction 6163 __ add(kyberConsts, kyberConsts, 16); 6164 __ ldpq(vc1_3, vc2_3, kyberConsts); 6165 6166 for (int i = 0; i < 3; i++) { 6167 // load 80 or 88 coefficients 6168 vs_ldpq_post(vs1_1, coeffs); 6169 vs_ldpq_post(vs1_2, coeffs); 6170 if (i < 2) { 6171 __ ldr(vs1_3, __ Q, __ post(coeffs, 16)); 6172 } 6173 6174 // vs2 <- (2 * vs1 * kyberBarrettMultiplier) >> 16 6175 vs_sqdmulh(vs2_1, __ T8H, vs1_1, vc2_1); 6176 vs_sqdmulh(vs2_2, __ T8H, vs1_2, vc2_2); 6177 if (i < 2) { 6178 __ sqdmulh(vs2_3, __ T8H, vs1_3, vc2_3); 6179 } 6180 6181 // vs2 <- (vs1 * kyberBarrettMultiplier) >> 26 6182 vs_sshr(vs2_1, __ T8H, vs2_1, 11); 6183 vs_sshr(vs2_2, __ T8H, vs2_2, 11); 6184 if (i < 2) { 6185 __ sshr(vs2_3, __ T8H, vs2_3, 11); 6186 } 6187 6188 // vs1 <- vs1 - vs2 * kyber_q 6189 vs_mlsv(vs1_1, __ T8H, vs2_1, vc1_1); 6190 vs_mlsv(vs1_2, __ T8H, vs2_2, vc1_2); 6191 if (i < 2) { 6192 __ mlsv(vs1_3, __ T8H, vs2_3, vc1_3); 6193 } 6194 6195 vs_stpq_post(vs1_1, result); 6196 vs_stpq_post(vs1_2, result); 6197 if (i < 2) { 6198 __ str(vs1_3, __ Q, __ post(result, 16)); 6199 } 6200 } 6201 6202 __ leave(); // required for proper stackwalking of RuntimeStub frame 6203 __ mov(r0, zr); // return 0 6204 __ ret(lr); 6205 6206 return start; 6207 } 6208 6209 6210 // Dilithium-specific montmul helper routines that generate parallel 6211 // code for, respectively, a single 4x4s vector sequence montmul or 6212 // two such multiplies in a row. 6213 6214 // Perform 16 32-bit Montgomery multiplications in parallel 6215 void dilithium_montmul16(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc, 6216 const VSeq<4>& vtmp, const VSeq<2>& vq) { 6217 // Use the helper routine to schedule a 4x4S Montgomery multiply. 6218 // It will assert that the register use is valid 6219 vs_montmul4(va, vb, vc, __ T4S, vtmp, vq); 6220 } 6221 6222 // Perform 2x16 32-bit Montgomery multiplications in parallel 6223 void dilithium_montmul32(const VSeq<8>& va, const VSeq<8>& vb, const VSeq<8>& vc, 6224 const VSeq<4>& vtmp, const VSeq<2>& vq) { 6225 // Schedule two successive 4x4S multiplies via the montmul helper 6226 // on the front and back halves of va, vb and vc. The helper will 6227 // assert that the register use has no overlap conflicts on each 6228 // individual call but we also need to ensure that the necessary 6229 // disjoint/equality constraints are met across both calls. 6230 6231 // vb, vc, vtmp and vq must be disjoint. 
va must either be 6232 // disjoint from all other registers or equal vc 6233
6234 assert(vs_disjoint(vb, vc), "vb and vc overlap"); 6235 assert(vs_disjoint(vb, vq), "vb and vq overlap"); 6236 assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap"); 6237
6238 assert(vs_disjoint(vc, vq), "vc and vq overlap"); 6239 assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap"); 6240
6241 assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap"); 6242
6243 assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal"); 6244 assert(vs_disjoint(va, vb), "va and vb overlap"); 6245 assert(vs_disjoint(va, vq), "va and vq overlap"); 6246 assert(vs_disjoint(va, vtmp), "va and vtmp overlap"); 6247
6248 // We multiply the front and back halves of each sequence 4 at a 6249 // time because 6250 // 6251 // 1) we are currently only able to get 4-way instruction 6252 // parallelism at best 6253 // 6254 // 2) we need registers for the constants in vq and temporary 6255 // scratch registers to hold intermediate results so vtmp can only 6256 // be a VSeq<4> which means we only have 4 scratch slots. 6257
6258 vs_montmul4(vs_front(va), vs_front(vb), vs_front(vc), __ T4S, vtmp, vq); 6259 vs_montmul4(vs_back(va), vs_back(vb), vs_back(vc), __ T4S, vtmp, vq); 6260 } 6261
6262 // Perform combined montmul then add/sub on 4x4S vectors. 6263 void dilithium_montmul16_sub_add( 6264 const VSeq<4>& va0, const VSeq<4>& va1, const VSeq<4>& vc, 6265 const VSeq<4>& vtmp, const VSeq<2>& vq) { 6266 // compute a = montmul(a1, c) 6267 dilithium_montmul16(vc, va1, vc, vtmp, vq); 6268 // output a1 = a0 - a 6269 vs_subv(va1, __ T4S, va0, vc); 6270 // and a0 = a0 + a 6271 vs_addv(va0, __ T4S, va0, vc); 6272 } 6273
6274 // Perform combined add/sub then montmul on 4x4S vectors. 6275 void dilithium_sub_add_montmul16( 6276 const VSeq<4>& va0, const VSeq<4>& va1, const VSeq<4>& vb, 6277 const VSeq<4>& vtmp1, const VSeq<4>& vtmp2, const VSeq<2>& vq) { 6278 // compute c = a0 - a1 6279 vs_subv(vtmp1, __ T4S, va0, va1); 6280 // output a0 = a0 + a1 6281 vs_addv(va0, __ T4S, va0, va1); 6282 // output a1 = b montmul c 6283 dilithium_montmul16(va1, vtmp1, vb, vtmp2, vq); 6284 } 6285
6286 // At these levels, the indices that correspond to the 'j's (and 'j+l's) 6287 // in the Java implementation come in sequences of at least 8, so we 6288 // can use ldpq to collect the corresponding data into pairs of vector 6289 // registers. 6290 // We collect the coefficients corresponding to the 'j+l' indexes into 6291 // the vector registers v0-v7 and the zetas into the vector registers 6292 // v16-v23, then we do the (Montgomery) multiplications by the zetas in 6293 // parallel into v16-v23, load the coeffs corresponding to the 'j' indexes 6294 // into v0-v7, do the additions into v24-v31 and the subtractions into 6295 // v0-v7, and finally save the results back to the coeffs array. 6296 void dilithiumNttLevel0_4(const Register dilithiumConsts, 6297 const Register coeffs, const Register zetas) { 6298 int c1 = 0; 6299 int c2 = 512; 6300 int startIncr; 6301 // don't use callee save registers v8 - v15 6302 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs 6303 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3 6304 VSeq<2> vq(30); // n.b.
constants overlap vs3 6305 int offsets[4] = { 0, 32, 64, 96 }; 6306 6307 for (int level = 0; level < 5; level++) { 6308 int c1Start = c1; 6309 int c2Start = c2; 6310 if (level == 3) { 6311 offsets[1] = 32; 6312 offsets[2] = 128; 6313 offsets[3] = 160; 6314 } else if (level == 4) { 6315 offsets[1] = 64; 6316 offsets[2] = 128; 6317 offsets[3] = 192; 6318 } 6319 6320 // For levels 1 - 4 we simply load 2 x 4 adjacent values at a 6321 // time at 4 different offsets and multiply them in order by the 6322 // next set of input values. So we employ indexed load and store 6323 // pair instructions with arrangement 4S. 6324 for (int i = 0; i < 4; i++) { 6325 // reload q and qinv 6326 vs_ldpq(vq, dilithiumConsts); // qInv, q 6327 // load 8x4S coefficients via second start pos == c2 6328 vs_ldpq_indexed(vs1, coeffs, c2Start, offsets); 6329 // load next 8x4S inputs == b 6330 vs_ldpq_post(vs2, zetas); 6331 // compute a == c2 * b mod MONT_Q 6332 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq); 6333 // load 8x4s coefficients via first start pos == c1 6334 vs_ldpq_indexed(vs1, coeffs, c1Start, offsets); 6335 // compute a1 = c1 + a 6336 vs_addv(vs3, __ T4S, vs1, vs2); 6337 // compute a2 = c1 - a 6338 vs_subv(vs1, __ T4S, vs1, vs2); 6339 // output a1 and a2 6340 vs_stpq_indexed(vs3, coeffs, c1Start, offsets); 6341 vs_stpq_indexed(vs1, coeffs, c2Start, offsets); 6342 6343 int k = 4 * level + i; 6344 6345 if (k > 7) { 6346 startIncr = 256; 6347 } else if (k == 5) { 6348 startIncr = 384; 6349 } else { 6350 startIncr = 128; 6351 } 6352 6353 c1Start += startIncr; 6354 c2Start += startIncr; 6355 } 6356 6357 c2 /= 2; 6358 } 6359 } 6360 6361 // Dilithium NTT function except for the final "normalization" to |coeff| < Q. 6362 // Implements the method 6363 // static int implDilithiumAlmostNtt(int[] coeffs, int zetas[]) {} 6364 // of the Java class sun.security.provider 6365 // 6366 // coeffs (int[256]) = c_rarg0 6367 // zetas (int[256]) = c_rarg1 6368 address generate_dilithiumAlmostNtt() { 6369 6370 __ align(CodeEntryAlignment); 6371 StubGenStubId stub_id = StubGenStubId::dilithiumAlmostNtt_id; 6372 StubCodeMark mark(this, stub_id); 6373 address start = __ pc(); 6374 __ enter(); 6375 6376 const Register coeffs = c_rarg0; 6377 const Register zetas = c_rarg1; 6378 6379 const Register tmpAddr = r9; 6380 const Register dilithiumConsts = r10; 6381 const Register result = r11; 6382 // don't use callee save registers v8 - v15 6383 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs 6384 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3 6385 VSeq<2> vq(30); // n.b. constants overlap vs3 6386 int offsets[4] = { 0, 32, 64, 96}; 6387 int offsets1[8] = { 16, 48, 80, 112, 144, 176, 208, 240 }; 6388 int offsets2[8] = { 0, 32, 64, 96, 128, 160, 192, 224 }; 6389 __ add(result, coeffs, 0); 6390 __ lea(dilithiumConsts, 6391 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts)); 6392 6393 // Each level represents one iteration of the outer for loop of the Java version. 6394 6395 // level 0-4 6396 dilithiumNttLevel0_4(dilithiumConsts, coeffs, zetas); 6397 6398 // level 5 6399 6400 // At level 5 the coefficients we need to combine with the zetas 6401 // are grouped in memory in blocks of size 4. So, for both sets of 6402 // coefficients we load 4 adjacent values at 8 different offsets 6403 // using an indexed ldr with register variant Q and multiply them 6404 // in sequence order by the next set of inputs. Likewise we store 6405 // the resuls using an indexed str with register variant Q. 
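  // In scalar terms each (j, j+l) pair touched by the loop below goes
  // through the usual forward NTT butterfly (a sketch; montmul denotes
  // the Montgomery product implemented by dilithium_montmul32):
  //
  //   t = montmul(zeta, c[j + l]);
  //   c[j + l] = c[j] - t;
  //   c[j]     = c[j] + t;
  //
  // with 32 such butterflies executed per loop iteration.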
6406 for (int i = 0; i < 1024; i += 256) {
6407 // reload constants q, qinv each iteration as they get clobbered later
6408 vs_ldpq(vq, dilithiumConsts); // qInv, q
6409 // load 32 (8x4S) coefficients via first offsets = c1
6410 vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets1);
6411 // load next 32 (8x4S) inputs = b
6412 vs_ldpq_post(vs2, zetas);
6413 // a = b montmul c1
6414 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
6415 // load 32 (8x4S) coefficients via second offsets = c2
6416 vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets2);
6417 // add/sub with result of multiply
6418 vs_addv(vs3, __ T4S, vs1, vs2); // a0 = c2 + a
6419 vs_subv(vs1, __ T4S, vs1, vs2); // a1 = c2 - a
6420 // write back new coefficients using same offsets
6421 vs_str_indexed(vs3, __ Q, coeffs, i, offsets2);
6422 vs_str_indexed(vs1, __ Q, coeffs, i, offsets1);
6423 }
6424
6425 // level 6
6426 // At level 6 the coefficients we need to combine with the zetas
6427 // are grouped in memory in pairs, the first two being add/sub
6428 // inputs and the second two montmul inputs. We can still implement
6429 // the montmul+sub+add using 4-way parallelism but only if we
6430 // combine the coefficients with the zetas 16 at a time. We load 8
6431 // adjacent values at 4 different offsets using an ld2 load with
6432 // arrangement 2D. That interleaves the lower and upper halves of
6433 // each pair of quadwords into successive vector registers. We
6434 // then need to montmul the 4 even elements of the coefficients
6435 // register sequence by the zetas in order and then add/sub the 4
6436 // odd elements of the coefficients register sequence. We use an
6437 // equivalent st2 operation to store the results back into memory
6438 // de-interleaved.
6439 for (int i = 0; i < 1024; i += 128) {
6440 // reload constants q, qinv each iteration as they get clobbered later
6441 vs_ldpq(vq, dilithiumConsts); // qInv, q
6442 // load interleaved 16 (4x2D) coefficients via offsets
6443 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
6444 // load next 16 (4x4S) inputs
6445 vs_ldpq_post(vs_front(vs2), zetas);
6446 // mont multiply odd elements of vs1 by vs2 and add/sub into odds/evens
6447 dilithium_montmul16_sub_add(vs_even(vs1), vs_odd(vs1),
6448 vs_front(vs2), vtmp, vq);
6449 // store interleaved 16 (4x2D) coefficients via offsets
6450 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
6451 }
6452
6453 // level 7
6454 // At level 7 the coefficients we need to combine with the zetas
6455 // occur singly with montmul inputs alternating with add/sub
6456 // inputs. Once again we can use 4-way parallelism to combine 16
6457 // zetas at a time. However, we have to load 8 adjacent values at
6458 // 4 different offsets using an ld2 load with arrangement 4S. That
6459 // interleaves the odd words of each pair into one
6460 // coefficients vector register and the even words of the pair
6461 // into the next register. We then need to montmul the 4 even
6462 // elements of the coefficients register sequence by the zetas in
6463 // order and then add/sub the 4 odd elements of the coefficients
6464 // register sequence. We use an equivalent st2 operation to store
6465 // the results back into memory de-interleaved.
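// A hedged per-lane sketch of what dilithium_montmul16_sub_add computes in
// the interleaved loop below (again assuming montmul(a, b) is the Montgomery
// product mod q); c_even/c_odd denote the coefficients landing in the even
// and odd registers of the ld2 de-interleave:
//
//   int a  = montmul(zeta, c_odd);
//   c_odd  = c_even - a;
//   c_even = c_even + a;
//
// i.e. the same butterfly as at the earlier levels, just fed from
// de-interleaved register pairs instead of contiguous ldp loads.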
6466
6467 for (int i = 0; i < 1024; i += 128) {
6468 // reload constants q, qinv each iteration as they get clobbered later
6469 vs_ldpq(vq, dilithiumConsts); // qInv, q
6470 // load interleaved 16 (4x4S) coefficients via offsets
6471 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
6472 // load next 16 (4x4S) inputs
6473 vs_ldpq_post(vs_front(vs2), zetas);
6474 // mont multiply odd elements of vs1 by vs2 and add/sub into odds/evens
6475 dilithium_montmul16_sub_add(vs_even(vs1), vs_odd(vs1),
6476 vs_front(vs2), vtmp, vq);
6477 // store interleaved 16 (4x4S) coefficients via offsets
6478 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
6479 }
6480 __ leave(); // required for proper stackwalking of RuntimeStub frame
6481 __ mov(r0, zr); // return 0
6482 __ ret(lr);
6483
6484 return start;
6485 }
6486
6487 // At these levels, the indices that correspond to the 'j's (and 'j+l's)
6488 // in the Java implementation come in sequences of at least 8, so we
6489 // can use ldpq to collect the corresponding data into pairs of vector
6490 // registers.
6491 // We collect the coefficients that correspond to the 'j's into vs1,
6492 // the coefficients that correspond to the 'j+l's into vs2 then
6493 // do the additions into vs3 and the subtractions into vs1 then
6494 // save the result of the additions, load the zetas into vs2
6495 // do the (Montgomery) multiplications by zeta in parallel into vs2
6496 // finally save the results back to the coeffs array
6497 void dilithiumInverseNttLevel3_7(const Register dilithiumConsts,
6498 const Register coeffs, const Register zetas) {
6499 int c1 = 0;
6500 int c2 = 32;
6501 int startIncr;
6502 int offsets[4];
6503 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs
6504 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
6505 VSeq<2> vq(30); // n.b. constants overlap vs3
6506
6507 offsets[0] = 0;
6508
6509 for (int level = 3; level < 8; level++) {
6510 int c1Start = c1;
6511 int c2Start = c2;
6512 if (level == 3) {
6513 offsets[1] = 64;
6514 offsets[2] = 128;
6515 offsets[3] = 192;
6516 } else if (level == 4) {
6517 offsets[1] = 32;
6518 offsets[2] = 128;
6519 offsets[3] = 160;
6520 } else {
6521 offsets[1] = 32;
6522 offsets[2] = 64;
6523 offsets[3] = 96;
6524 }
6525
6526 // For levels 3 - 7 we simply load 2 x 4 adjacent values at a
6527 // time at 4 different offsets and multiply them in order by the
6528 // next set of input values. So we employ indexed load and store
6529 // pair instructions with arrangement 4S.
6530 for (int i = 0; i < 4; i++) {
6531 // load v1 32 (8x4S) coefficients relative to first start index
6532 vs_ldpq_indexed(vs1, coeffs, c1Start, offsets);
6533 // load v2 32 (8x4S) coefficients relative to second start index
6534 vs_ldpq_indexed(vs2, coeffs, c2Start, offsets);
6535 // a0 = v1 + v2 -- n.b. clobbers vqs
6536 vs_addv(vs3, __ T4S, vs1, vs2);
6537 // a1 = v1 - v2
6538 vs_subv(vs1, __ T4S, vs1, vs2);
6539 // save a0 relative to first start index
6540 vs_stpq_indexed(vs3, coeffs, c1Start, offsets);
6541 // load constants q, qinv each iteration as they get clobbered above
6542 vs_ldpq(vq, dilithiumConsts); // qInv, q
6543 // load b next 32 (8x4S) inputs
6544 vs_ldpq_post(vs2, zetas);
6545 // a = a1 montmul b
6546 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
6547 // save a relative to second start index
6548 vs_stpq_indexed(vs2, coeffs, c2Start, offsets);
6549
6550 int k = 4 * level + i;
6551
6552 if (k < 24) {
6553 startIncr = 256;
6554 } else if (k == 25) {
6555 startIncr = 384;
6556 } else {
6557 startIncr = 128;
6558 }
6559
6560 c1Start += startIncr;
6561 c2Start += startIncr;
6562 }
6563
6564 c2 *= 2;
6565 }
6566 }
6567
6568 // Dilithium Inverse NTT function except the final mod Q division by 2^256.
6569 // Implements the method
6570 // static int implDilithiumAlmostInverseNtt(int[] coeffs, int[] zetas) {} of
6571 // the sun.security.provider.ML_DSA class.
6572 //
6573 // coeffs (int[256]) = c_rarg0
6574 // zetas (int[256]) = c_rarg1
6575 address generate_dilithiumAlmostInverseNtt() {
6576
6577 __ align(CodeEntryAlignment);
6578 StubGenStubId stub_id = StubGenStubId::dilithiumAlmostInverseNtt_id;
6579 StubCodeMark mark(this, stub_id);
6580 address start = __ pc();
6581 __ enter();
6582
6583 const Register coeffs = c_rarg0;
6584 const Register zetas = c_rarg1;
6585
6586 const Register tmpAddr = r9;
6587 const Register dilithiumConsts = r10;
6588 const Register result = r11;
6589 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs
6590 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
6591 VSeq<2> vq(30); // n.b. constants overlap vs3
6592 int offsets[4] = { 0, 32, 64, 96 };
6593 int offsets1[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
6594 int offsets2[8] = { 16, 48, 80, 112, 144, 176, 208, 240 };
6595
6596 __ add(result, coeffs, 0);
6597 __ lea(dilithiumConsts,
6598 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
6599
6600 // Each level represents one iteration of the outer for loop of the Java version
6601
6602 // level 0
6603 // At level 0 we need to interleave adjacent quartets of
6604 // coefficients before we multiply and add/sub by the next 16
6605 // zetas just as we did for level 7 in the multiply code. So we
6606 // load and store the values using an ld2/st2 with arrangement 4S.
6607 for (int i = 0; i < 1024; i += 128) {
6608 // load constants q, qinv
6609 // n.b. this can be moved out of the loop as they do not get
6610 // clobbered by first two loops
6611 vs_ldpq(vq, dilithiumConsts); // qInv, q
6612 // a0/a1 load interleaved 32 (8x4S) coefficients
6613 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
6614 // b load next 16 (4x4S) inputs
6615 vs_ldpq_post(vs_front(vs2), zetas);
6616 // compute in parallel (a0, a1) = (a0 + a1, (a0 - a1) montmul b)
6617 // n.b. second half of vs2 provides temporary register storage
6618 dilithium_sub_add_montmul16(vs_even(vs1), vs_odd(vs1),
6619 vs_front(vs2), vs_back(vs2), vtmp, vq);
6620 // a0/a1 store interleaved 32 (8x4S) coefficients
6621 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
6622 }
6623
6624 // level 1
6625 // At level 1 we need to interleave pairs of adjacent pairs of
6626 // coefficients before we multiply by the next 16 zetas just as we
6627 // did for level 6 in the multiply code. So we load and store the
6628 // values using an ld2/st2 with arrangement 2D.
6629 for (int i = 0; i < 1024; i += 128) {
6630 // a0/a1 load interleaved 32 (8x2D) coefficients
6631 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
6632 // b load next 16 (4x4S) inputs
6633 vs_ldpq_post(vs_front(vs2), zetas);
6634 // compute in parallel (a0, a1) = (a0 + a1, (a0 - a1) montmul b)
6635 // n.b. second half of vs2 provides temporary register storage
6636 dilithium_sub_add_montmul16(vs_even(vs1), vs_odd(vs1),
6637 vs_front(vs2), vs_back(vs2), vtmp, vq);
6638 // a0/a1 store interleaved 32 (8x2D) coefficients
6639 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
6640 }
6641
6642 // level 2
6643 // At level 2 coefficients come in blocks of 4. So, we load 4
6644 // adjacent coefficients at 8 distinct offsets for both the first
6645 // and second coefficient sequences, using an ldr with register
6646 // variant Q then combine them with next set of 32 zetas. Likewise
6647 // we store the results using an str with register variant Q.
6648 for (int i = 0; i < 1024; i += 256) {
6649 // c0 load 32 (8x4S) coefficients via first offsets
6650 vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets1);
6651 // c1 load 32 (8x4S) coefficients via second offsets
6652 vs_ldr_indexed(vs2, __ Q, coeffs, i, offsets2);
6653 // a0 = c0 + c1 n.b. clobbers vq which overlaps vs3
6654 vs_addv(vs3, __ T4S, vs1, vs2);
6655 // c = c0 - c1
6656 vs_subv(vs1, __ T4S, vs1, vs2);
6657 // store a0 32 (8x4S) coefficients via first offsets
6658 vs_str_indexed(vs3, __ Q, coeffs, i, offsets1);
6659 // b load 32 (8x4S) next inputs
6660 vs_ldpq_post(vs2, zetas);
6661 // reload constants q, qinv -- they were clobbered earlier
6662 vs_ldpq(vq, dilithiumConsts); // qInv, q
6663 // compute a1 = b montmul c
6664 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
6665 // store a1 32 (8x4S) coefficients via second offsets
6666 vs_str_indexed(vs2, __ Q, coeffs, i, offsets2);
6667 }
6668
6669 // level 3-7
6670 dilithiumInverseNttLevel3_7(dilithiumConsts, coeffs, zetas);
6671
6672 __ leave(); // required for proper stackwalking of RuntimeStub frame
6673 __ mov(r0, zr); // return 0
6674 __ ret(lr);
6675
6676 return start;
6677 }
6678
6679 // Dilithium multiply polynomials in the NTT domain.
6680 // Straightforward implementation of the method
6681 // static int implDilithiumNttMult(
6682 // int[] result, int[] ntta, int[] nttb) {} of
6683 // the sun.security.provider.ML_DSA class.
6684 //
6685 // result (int[256]) = c_rarg0
6686 // poly1 (int[256]) = c_rarg1
6687 // poly2 (int[256]) = c_rarg2
6688 address generate_dilithiumNttMult() {
6689
6690 __ align(CodeEntryAlignment);
6691 StubGenStubId stub_id = StubGenStubId::dilithiumNttMult_id;
6692 StubCodeMark mark(this, stub_id);
6693 address start = __ pc();
6694 __ enter();
6695
6696 Label L_loop;
6697
6698 const Register result = c_rarg0;
6699 const Register poly1 = c_rarg1;
6700 const Register poly2 = c_rarg2;
6701
6702 const Register dilithiumConsts = r10;
6703 const Register len = r11;
6704
6705 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs
6706 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
6707 VSeq<2> vq(30); // n.b. constants overlap vs3
6708 VSeq<8> vrsquare(29, 0); // for montmul by constant RSQUARE
6709
6710 __ lea(dilithiumConsts,
6711 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
6712
6713 // load constants q, qinv
6714 vs_ldpq(vq, dilithiumConsts); // qInv, q
6715 // load constant rSquare into v29
6716 __ ldr(v29, __ Q, Address(dilithiumConsts, 48)); // rSquare
6717
6718 __ mov(len, zr);
6719 __ add(len, len, 1024);
6720
6721 __ BIND(L_loop);
6722
6723 // b load 32 (8x4S) next inputs from poly1
6724 vs_ldpq_post(vs1, poly1);
6725 // c load 32 (8x4S) next inputs from poly2
6726 vs_ldpq_post(vs2, poly2);
6727 // compute a = b montmul c
6728 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
6729 // compute a = rsquare montmul a
6730 dilithium_montmul32(vs2, vrsquare, vs2, vtmp, vq);
6731 // save a 32 (8x4S) results
6732 vs_stpq_post(vs2, result);
6733
6734 __ sub(len, len, 128);
6735 __ cmp(len, (u1)128);
6736 __ br(Assembler::GE, L_loop);
6737
6738 __ leave(); // required for proper stackwalking of RuntimeStub frame
6739 __ mov(r0, zr); // return 0
6740 __ ret(lr);
6741
6742 return start;
6743 }
6744
6745 // Dilithium Montgomery multiply an array by a constant.
6746 // A straightforward implementation of the method
6747 // static int implDilithiumMontMulByConstant(int[] coeffs, int constant) {}
6748 // of the sun.security.provider.ML_DSA class
6749 //
6750 // coeffs (int[256]) = c_rarg0
6751 // constant (int) = c_rarg1
6752 address generate_dilithiumMontMulByConstant() {
6753
6754 __ align(CodeEntryAlignment);
6755 StubGenStubId stub_id = StubGenStubId::dilithiumMontMulByConstant_id;
6756 StubCodeMark mark(this, stub_id);
6757 address start = __ pc();
6758 __ enter();
6759
6760 Label L_loop;
6761
6762 const Register coeffs = c_rarg0;
6763 const Register constant = c_rarg1;
6764
6765 const Register dilithiumConsts = r10;
6766 const Register result = r11;
6767 const Register len = r12;
6768
6769 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs
6770 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
6771 VSeq<2> vq(30); // n.b. constants overlap vs3
6772 VSeq<8> vconst(29, 0); // for montmul by constant
6773
6774 // results track inputs
6775 __ add(result, coeffs, 0);
6776 __ lea(dilithiumConsts,
6777 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
6778
6779 // load constants q, qinv -- they are not clobbered by the loop below
6780 vs_ldpq(vq, dilithiumConsts); // qInv, q
6781 // copy caller supplied constant across vconst
6782 __ dup(vconst[0], __ T4S, constant);
6783 __ mov(len, zr);
6784 __ add(len, len, 1024);
6785
6786 __ BIND(L_loop);
6787
6788 // load next 32 inputs
6789 vs_ldpq_post(vs2, coeffs);
6790 // mont mul by constant
6791 dilithium_montmul32(vs2, vconst, vs2, vtmp, vq);
6792 // write next 32 results
6793 vs_stpq_post(vs2, result);
6794
6795 __ sub(len, len, 128);
6796 __ cmp(len, (u1)128);
6797 __ br(Assembler::GE, L_loop);
6798
6799 __ leave(); // required for proper stackwalking of RuntimeStub frame
6800 __ mov(r0, zr); // return 0
6801 __ ret(lr);
6802
6803 return start;
6804 }
6805
6806 // Dilithium decompose poly.
// Implements the method
6808 // static int implDilithiumDecomposePoly(int[] input, int[] lowPart, int[] highPart, int twoGamma2, int multiplier) {}
6809 // of the sun.security.provider.ML_DSA class
6810 //
6811 // input (int[256]) = c_rarg0
6812 // lowPart (int[256]) = c_rarg1
6813 // highPart (int[256]) = c_rarg2
6814 // twoGamma2 (int) = c_rarg3
6815 // multiplier (int) = c_rarg4
6816 address generate_dilithiumDecomposePoly() {
6817
6818 __ align(CodeEntryAlignment);
6819 StubGenStubId stub_id = StubGenStubId::dilithiumDecomposePoly_id;
6820 StubCodeMark mark(this, stub_id);
6821 address start = __ pc();
6822 Label L_loop;
6823
6824 const Register input = c_rarg0;
6825 const Register lowPart = c_rarg1;
6826 const Register highPart = c_rarg2;
6827 const Register twoGamma2 = c_rarg3;
6828 const Register multiplier = c_rarg4;
6829
6830 const Register len = r9;
6831 const Register dilithiumConsts = r10;
6832 const Register tmp = r11;
6833
6834 // 6 independent sets of 4x4s values
6835 VSeq<4> vs1(0), vs2(4), vs3(8);
6836 VSeq<4> vs4(12), vs5(16), vtmp(20);
6837
6838 // 7 constants for cross-multiplying
6839 VSeq<4> one(25, 0);
6840 VSeq<4> qminus1(26, 0);
6841 VSeq<4> g2(27, 0);
6842 VSeq<4> twog2(28, 0);
6843 VSeq<4> mult(29, 0);
6844 VSeq<4> q(30, 0);
6845 VSeq<4> qadd(31, 0);
6846
6847 __ enter();
6848
6849 __ lea(dilithiumConsts,
6850 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
6851
6852 // save callee-saved registers
6853 __ stpd(v8, v9, __ pre(sp, -64));
6854 __ stpd(v10, v11, Address(sp, 16));
6855 __ stpd(v12, v13, Address(sp, 32));
6856 __ stpd(v14, v15, Address(sp, 48));
6857
6858 // populate constant registers
6859 __ mov(tmp, zr);
6860 __ add(tmp, tmp, 1);
6861 __ dup(one[0], __ T4S, tmp); // 1
6862 __ ldr(q[0], __ Q, Address(dilithiumConsts, 16)); // q
6863 __ ldr(qadd[0], __ Q, Address(dilithiumConsts, 64)); // addend for mod q reduce
6864 __ dup(twog2[0], __ T4S, twoGamma2); // 2 * gamma2
6865 __ dup(mult[0], __ T4S, multiplier); // multiplier for mod 2 * gamma reduce
6866 __ subv(qminus1[0], __ T4S, v30, v25); // q - 1
6867 __ sshr(g2[0], __ T4S, v28, 1); // gamma2
6868
6869 __ mov(len, zr);
6870 __ add(len, len, 1024);
6871
6872 __ BIND(L_loop);
6873
6874 // load next 4x4S inputs interleaved: rplus --> vs1
6875 __ ld4(vs1[0], vs1[1], vs1[2], vs1[3], __ T4S, __ post(input, 64));
6876
6877 // rplus = rplus - ((rplus + qadd) >> 23) * q
6878 vs_addv(vtmp, __ T4S, vs1, qadd);
6879 vs_sshr(vtmp, __ T4S, vtmp, 23);
6880 vs_mulv(vtmp, __ T4S, vtmp, q);
6881 vs_subv(vs1, __ T4S, vs1, vtmp);
6882
6883 // rplus = rplus + ((rplus >> 31) & dilithium_q);
6884 vs_sshr(vtmp, __ T4S, vs1, 31);
6885 vs_andr(vtmp, vtmp, q);
6886 vs_addv(vs1, __ T4S, vs1, vtmp);
6887
6888 // quotient --> vs2
6889 // int quotient = (rplus * multiplier) >> 22;
6890 vs_mulv(vtmp, __ T4S, vs1, mult);
6891 vs_sshr(vs2, __ T4S, vtmp, 22);
6892
6893 // r0 --> vs3
6894 // int r0 = rplus - quotient * twoGamma2;
6895 vs_mulv(vtmp, __ T4S, vs2, twog2);
6896 vs_subv(vs3, __ T4S, vs1, vtmp);
6897
6898 // mask --> vs4
6899 // int mask = (twoGamma2 - r0) >> 22;
6900 vs_subv(vtmp, __ T4S, twog2, vs3);
6901 vs_sshr(vs4, __ T4S, vtmp, 22);
6902
6903 // r0 -= (mask & twoGamma2);
6904 vs_andr(vtmp, vs4, twog2);
6905 vs_subv(vs3, __ T4S, vs3, vtmp);
6906
6907 // quotient += (mask & 1);
6908 vs_andr(vtmp, vs4, one);
6909 vs_addv(vs2, __ T4S, vs2, vtmp);
6910
6911 // mask = (twoGamma2 / 2 - r0) >> 31;
6912 vs_subv(vtmp, __ T4S, g2, vs3);
6913 vs_sshr(vs4, __ T4S, vtmp, 31);
6914
6915 // r0 -= (mask & twoGamma2);
6916 vs_andr(vtmp, vs4, twog2);
6917 vs_subv(vs3, __ T4S, vs3, vtmp);
6918
6919 // quotient += (mask & 1);
6920 vs_andr(vtmp, vs4, one);
6921 vs_addv(vs2, __ T4S, vs2, vtmp);
6922
6923 // r1 --> vs5
6924 // int r1 = rplus - r0 - (dilithium_q - 1);
6925 vs_subv(vtmp, __ T4S, vs1, vs3);
6926 vs_subv(vs5, __ T4S, vtmp, qminus1);
6927
6928 // r1 --> vs1 (overwriting rplus)
6929 // r1 = (r1 | (-r1)) >> 31; // 0 if rplus - r0 == (dilithium_q - 1), -1 otherwise
6930 vs_negr(vtmp, __ T4S, vs5);
6931 vs_orr(vtmp, vs5, vtmp);
6932 vs_sshr(vs1, __ T4S, vtmp, 31);
6933
6934 // r0 += ~r1;
6935 vs_notr(vtmp, vs1);
6936 vs_addv(vs3, __ T4S, vs3, vtmp);
6937
6938 // r1 = r1 & quotient;
6939 vs_andr(vs1, vs2, vs1);
6940
6941 // store results interleaved
6942 // lowPart[m] = r0;
6943 // highPart[m] = r1;
6944 __ st4(vs3[0], vs3[1], vs3[2], vs3[3], __ T4S, __ post(lowPart, 64));
6945 __ st4(vs1[0], vs1[1], vs1[2], vs1[3], __ T4S, __ post(highPart, 64));
6946
6947 __ sub(len, len, 64);
6948 __ cmp(len, (u1)64);
6949 __ br(Assembler::GE, L_loop);
6950
6951 // restore callee-saved vector registers
6952 __ ldpd(v14, v15, Address(sp, 48));
6953 __ ldpd(v12, v13, Address(sp, 32));
6954 __ ldpd(v10, v11, Address(sp, 16));
6955 __ ldpd(v8, v9, __ post(sp, 64));
6956
6957 __ leave(); // required for proper stackwalking of RuntimeStub frame
6958 __ mov(r0, zr); // return 0
6959 __ ret(lr);
6960
6961 return start;
6962 }
6963
6964 /**
6965 * Arguments:
6966 *
6967 * Inputs:
6968 * c_rarg0 - int crc
6969 * c_rarg1 - byte* buf
6970 * c_rarg2 - int length
6971 *
6972 * Output:
6973 * r0 - int crc result
6974 */
6975 address generate_updateBytesCRC32() {
6976 assert(UseCRC32Intrinsics, "what are we doing here?");
6977
6978 __ align(CodeEntryAlignment);
6979 StubGenStubId stub_id = StubGenStubId::updateBytesCRC32_id;
6980 StubCodeMark mark(this, stub_id);
6981
6982 address start = __ pc();
6983
6984 const Register crc = c_rarg0; // crc
6985 const Register buf = c_rarg1; // source java byte array address
6986 const Register len = c_rarg2; // length
6987 const Register table0 = c_rarg3; // crc_table address
6988 const Register table1 = c_rarg4;
6989 const Register table2 = c_rarg5;
6990 const Register table3 = c_rarg6;
6991 const Register tmp3 = c_rarg7;
6992
6993 BLOCK_COMMENT("Entry:");
6994 __ enter(); // required for proper stackwalking of RuntimeStub frame
6995
6996 __ kernel_crc32(crc, buf, len,
6997 table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
6998
6999 __ leave(); // required for proper stackwalking of RuntimeStub frame
7000 __ ret(lr);
7001
7002 return start;
7003 }
7004
7005 /**
7006 * Arguments:
7007 *
7008 * Inputs:
7009 * c_rarg0 - int crc
7010 * c_rarg1 - byte* buf
7011 * c_rarg2 - int length
7012 * c_rarg3 - int* table
7013 *
7014 * Output:
7015 * r0 - int crc result
7016 */
7017 address generate_updateBytesCRC32C() {
7018 assert(UseCRC32CIntrinsics, "what are we doing here?");
7019
7020 __ align(CodeEntryAlignment);
7021 StubGenStubId stub_id = StubGenStubId::updateBytesCRC32C_id;
7022 StubCodeMark mark(this, stub_id);
7023
7024 address start = __ pc();
7025
7026 const Register crc = c_rarg0; // crc
7027 const Register buf = c_rarg1; // source java byte array address
7028 const Register len = c_rarg2; // length
7029 const Register table0 = c_rarg3; // crc_table address
7030 const Register table1 = c_rarg4;
7031 const Register table2 = c_rarg5;
7032 const Register table3 = c_rarg6;
7033 const Register tmp3 = c_rarg7;
7034
7035 BLOCK_COMMENT("Entry:");
7036 __ enter(); // required for proper stackwalking of RuntimeStub frame
7037
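// For reference, a bitwise scalar sketch of the CRC-32C update that
// kernel_crc32c implements in a table/instruction accelerated form
// (CRC-32C uses the Castagnoli polynomial, reflected form 0x82F63B78):
//
//   crc ^= *buf++;
//   for (int k = 0; k < 8; k++)
//     crc = (crc >> 1) ^ (0x82F63B78 & -(crc & 1));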
7038 __ kernel_crc32c(crc, buf, len, 7039 table0, table1, table2, table3, rscratch1, rscratch2, tmp3); 7040 7041 __ leave(); // required for proper stackwalking of RuntimeStub frame 7042 __ ret(lr); 7043 7044 return start; 7045 } 7046 7047 /*** 7048 * Arguments: 7049 * 7050 * Inputs: 7051 * c_rarg0 - int adler 7052 * c_rarg1 - byte* buff 7053 * c_rarg2 - int len 7054 * 7055 * Output: 7056 * c_rarg0 - int adler result 7057 */ 7058 address generate_updateBytesAdler32() { 7059 __ align(CodeEntryAlignment); 7060 StubGenStubId stub_id = StubGenStubId::updateBytesAdler32_id; 7061 StubCodeMark mark(this, stub_id); 7062 address start = __ pc(); 7063 7064 Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1; 7065 7066 // Aliases 7067 Register adler = c_rarg0; 7068 Register s1 = c_rarg0; 7069 Register s2 = c_rarg3; 7070 Register buff = c_rarg1; 7071 Register len = c_rarg2; 7072 Register nmax = r4; 7073 Register base = r5; 7074 Register count = r6; 7075 Register temp0 = rscratch1; 7076 Register temp1 = rscratch2; 7077 FloatRegister vbytes = v0; 7078 FloatRegister vs1acc = v1; 7079 FloatRegister vs2acc = v2; 7080 FloatRegister vtable = v3; 7081 7082 // Max number of bytes we can process before having to take the mod 7083 // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 7084 uint64_t BASE = 0xfff1; 7085 uint64_t NMAX = 0x15B0; 7086 7087 __ mov(base, BASE); 7088 __ mov(nmax, NMAX); 7089 7090 // Load accumulation coefficients for the upper 16 bits 7091 __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table)); 7092 __ ld1(vtable, __ T16B, Address(temp0)); 7093 7094 // s1 is initialized to the lower 16 bits of adler 7095 // s2 is initialized to the upper 16 bits of adler 7096 __ ubfx(s2, adler, 16, 16); // s2 = ((adler >> 16) & 0xffff) 7097 __ uxth(s1, adler); // s1 = (adler & 0xffff) 7098 7099 // The pipelined loop needs at least 16 elements for 1 iteration 7100 // It does check this, but it is more effective to skip to the cleanup loop 7101 __ cmp(len, (u1)16); 7102 __ br(Assembler::HS, L_nmax); 7103 __ cbz(len, L_combine); 7104 7105 __ bind(L_simple_by1_loop); 7106 __ ldrb(temp0, Address(__ post(buff, 1))); 7107 __ add(s1, s1, temp0); 7108 __ add(s2, s2, s1); 7109 __ subs(len, len, 1); 7110 __ br(Assembler::HI, L_simple_by1_loop); 7111 7112 // s1 = s1 % BASE 7113 __ subs(temp0, s1, base); 7114 __ csel(s1, temp0, s1, Assembler::HS); 7115 7116 // s2 = s2 % BASE 7117 __ lsr(temp0, s2, 16); 7118 __ lsl(temp1, temp0, 4); 7119 __ sub(temp1, temp1, temp0); 7120 __ add(s2, temp1, s2, ext::uxth); 7121 7122 __ subs(temp0, s2, base); 7123 __ csel(s2, temp0, s2, Assembler::HS); 7124 7125 __ b(L_combine); 7126 7127 __ bind(L_nmax); 7128 __ subs(len, len, nmax); 7129 __ sub(count, nmax, 16); 7130 __ br(Assembler::LO, L_by16); 7131 7132 __ bind(L_nmax_loop); 7133 7134 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1, 7135 vbytes, vs1acc, vs2acc, vtable); 7136 7137 __ subs(count, count, 16); 7138 __ br(Assembler::HS, L_nmax_loop); 7139 7140 // s1 = s1 % BASE 7141 __ lsr(temp0, s1, 16); 7142 __ lsl(temp1, temp0, 4); 7143 __ sub(temp1, temp1, temp0); 7144 __ add(temp1, temp1, s1, ext::uxth); 7145 7146 __ lsr(temp0, temp1, 16); 7147 __ lsl(s1, temp0, 4); 7148 __ sub(s1, s1, temp0); 7149 __ add(s1, s1, temp1, ext:: uxth); 7150 7151 __ subs(temp0, s1, base); 7152 __ csel(s1, temp0, s1, Assembler::HS); 7153 7154 // s2 = s2 % BASE 7155 __ lsr(temp0, s2, 16); 7156 __ lsl(temp1, temp0, 4); 7157 __ 
sub(temp1, temp1, temp0); 7158 __ add(temp1, temp1, s2, ext::uxth); 7159 7160 __ lsr(temp0, temp1, 16); 7161 __ lsl(s2, temp0, 4); 7162 __ sub(s2, s2, temp0); 7163 __ add(s2, s2, temp1, ext:: uxth); 7164 7165 __ subs(temp0, s2, base); 7166 __ csel(s2, temp0, s2, Assembler::HS); 7167 7168 __ subs(len, len, nmax); 7169 __ sub(count, nmax, 16); 7170 __ br(Assembler::HS, L_nmax_loop); 7171 7172 __ bind(L_by16); 7173 __ adds(len, len, count); 7174 __ br(Assembler::LO, L_by1); 7175 7176 __ bind(L_by16_loop); 7177 7178 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1, 7179 vbytes, vs1acc, vs2acc, vtable); 7180 7181 __ subs(len, len, 16); 7182 __ br(Assembler::HS, L_by16_loop); 7183 7184 __ bind(L_by1); 7185 __ adds(len, len, 15); 7186 __ br(Assembler::LO, L_do_mod); 7187 7188 __ bind(L_by1_loop); 7189 __ ldrb(temp0, Address(__ post(buff, 1))); 7190 __ add(s1, temp0, s1); 7191 __ add(s2, s2, s1); 7192 __ subs(len, len, 1); 7193 __ br(Assembler::HS, L_by1_loop); 7194 7195 __ bind(L_do_mod); 7196 // s1 = s1 % BASE 7197 __ lsr(temp0, s1, 16); 7198 __ lsl(temp1, temp0, 4); 7199 __ sub(temp1, temp1, temp0); 7200 __ add(temp1, temp1, s1, ext::uxth); 7201 7202 __ lsr(temp0, temp1, 16); 7203 __ lsl(s1, temp0, 4); 7204 __ sub(s1, s1, temp0); 7205 __ add(s1, s1, temp1, ext:: uxth); 7206 7207 __ subs(temp0, s1, base); 7208 __ csel(s1, temp0, s1, Assembler::HS); 7209 7210 // s2 = s2 % BASE 7211 __ lsr(temp0, s2, 16); 7212 __ lsl(temp1, temp0, 4); 7213 __ sub(temp1, temp1, temp0); 7214 __ add(temp1, temp1, s2, ext::uxth); 7215 7216 __ lsr(temp0, temp1, 16); 7217 __ lsl(s2, temp0, 4); 7218 __ sub(s2, s2, temp0); 7219 __ add(s2, s2, temp1, ext:: uxth); 7220 7221 __ subs(temp0, s2, base); 7222 __ csel(s2, temp0, s2, Assembler::HS); 7223 7224 // Combine lower bits and higher bits 7225 __ bind(L_combine); 7226 __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16) 7227 7228 __ ret(lr); 7229 7230 return start; 7231 } 7232 7233 void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff, 7234 Register temp0, Register temp1, FloatRegister vbytes, 7235 FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) { 7236 // Below is a vectorized implementation of updating s1 and s2 for 16 bytes. 7237 // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration. 7238 // In non-vectorized code, we update s1 and s2 as: 7239 // s1 <- s1 + b1 7240 // s2 <- s2 + s1 7241 // s1 <- s1 + b2 7242 // s2 <- s2 + b1 7243 // ... 7244 // s1 <- s1 + b16 7245 // s2 <- s2 + s1 7246 // Putting above assignments together, we have: 7247 // s1_new = s1 + b1 + b2 + ... + b16 7248 // s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16) 7249 // = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1) 7250 // = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1) 7251 __ ld1(vbytes, __ T16B, Address(__ post(buff, 16))); 7252 7253 // s2 = s2 + s1 * 16 7254 __ add(s2, s2, s1, Assembler::LSL, 4); 7255 7256 // vs1acc = b1 + b2 + b3 + ... + b16 7257 // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... 
+ (b16 * 1) 7258 __ umullv(vs2acc, __ T8B, vtable, vbytes); 7259 __ umlalv(vs2acc, __ T16B, vtable, vbytes); 7260 __ uaddlv(vs1acc, __ T16B, vbytes); 7261 __ uaddlv(vs2acc, __ T8H, vs2acc); 7262 7263 // s1 = s1 + vs1acc, s2 = s2 + vs2acc 7264 __ fmovd(temp0, vs1acc); 7265 __ fmovd(temp1, vs2acc); 7266 __ add(s1, s1, temp0); 7267 __ add(s2, s2, temp1); 7268 } 7269 7270 /** 7271 * Arguments: 7272 * 7273 * Input: 7274 * c_rarg0 - x address 7275 * c_rarg1 - x length 7276 * c_rarg2 - y address 7277 * c_rarg3 - y length 7278 * c_rarg4 - z address 7279 */ 7280 address generate_multiplyToLen() { 7281 __ align(CodeEntryAlignment); 7282 StubGenStubId stub_id = StubGenStubId::multiplyToLen_id; 7283 StubCodeMark mark(this, stub_id); 7284 7285 address start = __ pc(); 7286 const Register x = r0; 7287 const Register xlen = r1; 7288 const Register y = r2; 7289 const Register ylen = r3; 7290 const Register z = r4; 7291 7292 const Register tmp0 = r5; 7293 const Register tmp1 = r10; 7294 const Register tmp2 = r11; 7295 const Register tmp3 = r12; 7296 const Register tmp4 = r13; 7297 const Register tmp5 = r14; 7298 const Register tmp6 = r15; 7299 const Register tmp7 = r16; 7300 7301 BLOCK_COMMENT("Entry:"); 7302 __ enter(); // required for proper stackwalking of RuntimeStub frame 7303 __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 7304 __ leave(); // required for proper stackwalking of RuntimeStub frame 7305 __ ret(lr); 7306 7307 return start; 7308 } 7309 7310 address generate_squareToLen() { 7311 // squareToLen algorithm for sizes 1..127 described in java code works 7312 // faster than multiply_to_len on some CPUs and slower on others, but 7313 // multiply_to_len shows a bit better overall results 7314 __ align(CodeEntryAlignment); 7315 StubGenStubId stub_id = StubGenStubId::squareToLen_id; 7316 StubCodeMark mark(this, stub_id); 7317 address start = __ pc(); 7318 7319 const Register x = r0; 7320 const Register xlen = r1; 7321 const Register z = r2; 7322 const Register y = r4; // == x 7323 const Register ylen = r5; // == xlen 7324 7325 const Register tmp0 = r3; 7326 const Register tmp1 = r10; 7327 const Register tmp2 = r11; 7328 const Register tmp3 = r12; 7329 const Register tmp4 = r13; 7330 const Register tmp5 = r14; 7331 const Register tmp6 = r15; 7332 const Register tmp7 = r16; 7333 7334 RegSet spilled_regs = RegSet::of(y, ylen); 7335 BLOCK_COMMENT("Entry:"); 7336 __ enter(); 7337 __ push(spilled_regs, sp); 7338 __ mov(y, x); 7339 __ mov(ylen, xlen); 7340 __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 7341 __ pop(spilled_regs, sp); 7342 __ leave(); 7343 __ ret(lr); 7344 return start; 7345 } 7346 7347 address generate_mulAdd() { 7348 __ align(CodeEntryAlignment); 7349 StubGenStubId stub_id = StubGenStubId::mulAdd_id; 7350 StubCodeMark mark(this, stub_id); 7351 7352 address start = __ pc(); 7353 7354 const Register out = r0; 7355 const Register in = r1; 7356 const Register offset = r2; 7357 const Register len = r3; 7358 const Register k = r4; 7359 7360 BLOCK_COMMENT("Entry:"); 7361 __ enter(); 7362 __ mul_add(out, in, offset, len, k); 7363 __ leave(); 7364 __ ret(lr); 7365 7366 return start; 7367 } 7368 7369 // Arguments: 7370 // 7371 // Input: 7372 // c_rarg0 - newArr address 7373 // c_rarg1 - oldArr address 7374 // c_rarg2 - newIdx 7375 // c_rarg3 - shiftCount 7376 // c_rarg4 - numIter 7377 // 7378 address generate_bigIntegerRightShift() { 7379 __ align(CodeEntryAlignment); 7380 StubGenStubId stub_id = 
StubGenStubId::bigIntegerRightShiftWorker_id; 7381 StubCodeMark mark(this, stub_id); 7382 address start = __ pc(); 7383 7384 Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit; 7385 7386 Register newArr = c_rarg0; 7387 Register oldArr = c_rarg1; 7388 Register newIdx = c_rarg2; 7389 Register shiftCount = c_rarg3; 7390 Register numIter = c_rarg4; 7391 Register idx = numIter; 7392 7393 Register newArrCur = rscratch1; 7394 Register shiftRevCount = rscratch2; 7395 Register oldArrCur = r13; 7396 Register oldArrNext = r14; 7397 7398 FloatRegister oldElem0 = v0; 7399 FloatRegister oldElem1 = v1; 7400 FloatRegister newElem = v2; 7401 FloatRegister shiftVCount = v3; 7402 FloatRegister shiftVRevCount = v4; 7403 7404 __ cbz(idx, Exit); 7405 7406 __ add(newArr, newArr, newIdx, Assembler::LSL, 2); 7407 7408 // left shift count 7409 __ movw(shiftRevCount, 32); 7410 __ subw(shiftRevCount, shiftRevCount, shiftCount); 7411 7412 // numIter too small to allow a 4-words SIMD loop, rolling back 7413 __ cmp(numIter, (u1)4); 7414 __ br(Assembler::LT, ShiftThree); 7415 7416 __ dup(shiftVCount, __ T4S, shiftCount); 7417 __ dup(shiftVRevCount, __ T4S, shiftRevCount); 7418 __ negr(shiftVCount, __ T4S, shiftVCount); 7419 7420 __ BIND(ShiftSIMDLoop); 7421 7422 // Calculate the load addresses 7423 __ sub(idx, idx, 4); 7424 __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2); 7425 __ add(newArrCur, newArr, idx, Assembler::LSL, 2); 7426 __ add(oldArrCur, oldArrNext, 4); 7427 7428 // Load 4 words and process 7429 __ ld1(oldElem0, __ T4S, Address(oldArrCur)); 7430 __ ld1(oldElem1, __ T4S, Address(oldArrNext)); 7431 __ ushl(oldElem0, __ T4S, oldElem0, shiftVCount); 7432 __ ushl(oldElem1, __ T4S, oldElem1, shiftVRevCount); 7433 __ orr(newElem, __ T16B, oldElem0, oldElem1); 7434 __ st1(newElem, __ T4S, Address(newArrCur)); 7435 7436 __ cmp(idx, (u1)4); 7437 __ br(Assembler::LT, ShiftTwoLoop); 7438 __ b(ShiftSIMDLoop); 7439 7440 __ BIND(ShiftTwoLoop); 7441 __ cbz(idx, Exit); 7442 __ cmp(idx, (u1)1); 7443 __ br(Assembler::EQ, ShiftOne); 7444 7445 // Calculate the load addresses 7446 __ sub(idx, idx, 2); 7447 __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2); 7448 __ add(newArrCur, newArr, idx, Assembler::LSL, 2); 7449 __ add(oldArrCur, oldArrNext, 4); 7450 7451 // Load 2 words and process 7452 __ ld1(oldElem0, __ T2S, Address(oldArrCur)); 7453 __ ld1(oldElem1, __ T2S, Address(oldArrNext)); 7454 __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount); 7455 __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount); 7456 __ orr(newElem, __ T8B, oldElem0, oldElem1); 7457 __ st1(newElem, __ T2S, Address(newArrCur)); 7458 __ b(ShiftTwoLoop); 7459 7460 __ BIND(ShiftThree); 7461 __ tbz(idx, 1, ShiftOne); 7462 __ tbz(idx, 0, ShiftTwo); 7463 __ ldrw(r10, Address(oldArr, 12)); 7464 __ ldrw(r11, Address(oldArr, 8)); 7465 __ lsrvw(r10, r10, shiftCount); 7466 __ lslvw(r11, r11, shiftRevCount); 7467 __ orrw(r12, r10, r11); 7468 __ strw(r12, Address(newArr, 8)); 7469 7470 __ BIND(ShiftTwo); 7471 __ ldrw(r10, Address(oldArr, 8)); 7472 __ ldrw(r11, Address(oldArr, 4)); 7473 __ lsrvw(r10, r10, shiftCount); 7474 __ lslvw(r11, r11, shiftRevCount); 7475 __ orrw(r12, r10, r11); 7476 __ strw(r12, Address(newArr, 4)); 7477 7478 __ BIND(ShiftOne); 7479 __ ldrw(r10, Address(oldArr, 4)); 7480 __ ldrw(r11, Address(oldArr)); 7481 __ lsrvw(r10, r10, shiftCount); 7482 __ lslvw(r11, r11, shiftRevCount); 7483 __ orrw(r12, r10, r11); 7484 __ strw(r12, Address(newArr)); 7485 7486 __ BIND(Exit); 7487 __ ret(lr); 7488 7489 return start; 7490 } 7491 7492 // 
Arguments: 7493 // 7494 // Input: 7495 // c_rarg0 - newArr address 7496 // c_rarg1 - oldArr address 7497 // c_rarg2 - newIdx 7498 // c_rarg3 - shiftCount 7499 // c_rarg4 - numIter 7500 // 7501 address generate_bigIntegerLeftShift() { 7502 __ align(CodeEntryAlignment); 7503 StubGenStubId stub_id = StubGenStubId::bigIntegerLeftShiftWorker_id; 7504 StubCodeMark mark(this, stub_id); 7505 address start = __ pc(); 7506 7507 Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit; 7508 7509 Register newArr = c_rarg0; 7510 Register oldArr = c_rarg1; 7511 Register newIdx = c_rarg2; 7512 Register shiftCount = c_rarg3; 7513 Register numIter = c_rarg4; 7514 7515 Register shiftRevCount = rscratch1; 7516 Register oldArrNext = rscratch2; 7517 7518 FloatRegister oldElem0 = v0; 7519 FloatRegister oldElem1 = v1; 7520 FloatRegister newElem = v2; 7521 FloatRegister shiftVCount = v3; 7522 FloatRegister shiftVRevCount = v4; 7523 7524 __ cbz(numIter, Exit); 7525 7526 __ add(oldArrNext, oldArr, 4); 7527 __ add(newArr, newArr, newIdx, Assembler::LSL, 2); 7528 7529 // right shift count 7530 __ movw(shiftRevCount, 32); 7531 __ subw(shiftRevCount, shiftRevCount, shiftCount); 7532 7533 // numIter too small to allow a 4-words SIMD loop, rolling back 7534 __ cmp(numIter, (u1)4); 7535 __ br(Assembler::LT, ShiftThree); 7536 7537 __ dup(shiftVCount, __ T4S, shiftCount); 7538 __ dup(shiftVRevCount, __ T4S, shiftRevCount); 7539 __ negr(shiftVRevCount, __ T4S, shiftVRevCount); 7540 7541 __ BIND(ShiftSIMDLoop); 7542 7543 // load 4 words and process 7544 __ ld1(oldElem0, __ T4S, __ post(oldArr, 16)); 7545 __ ld1(oldElem1, __ T4S, __ post(oldArrNext, 16)); 7546 __ ushl(oldElem0, __ T4S, oldElem0, shiftVCount); 7547 __ ushl(oldElem1, __ T4S, oldElem1, shiftVRevCount); 7548 __ orr(newElem, __ T16B, oldElem0, oldElem1); 7549 __ st1(newElem, __ T4S, __ post(newArr, 16)); 7550 __ sub(numIter, numIter, 4); 7551 7552 __ cmp(numIter, (u1)4); 7553 __ br(Assembler::LT, ShiftTwoLoop); 7554 __ b(ShiftSIMDLoop); 7555 7556 __ BIND(ShiftTwoLoop); 7557 __ cbz(numIter, Exit); 7558 __ cmp(numIter, (u1)1); 7559 __ br(Assembler::EQ, ShiftOne); 7560 7561 // load 2 words and process 7562 __ ld1(oldElem0, __ T2S, __ post(oldArr, 8)); 7563 __ ld1(oldElem1, __ T2S, __ post(oldArrNext, 8)); 7564 __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount); 7565 __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount); 7566 __ orr(newElem, __ T8B, oldElem0, oldElem1); 7567 __ st1(newElem, __ T2S, __ post(newArr, 8)); 7568 __ sub(numIter, numIter, 2); 7569 __ b(ShiftTwoLoop); 7570 7571 __ BIND(ShiftThree); 7572 __ ldrw(r10, __ post(oldArr, 4)); 7573 __ ldrw(r11, __ post(oldArrNext, 4)); 7574 __ lslvw(r10, r10, shiftCount); 7575 __ lsrvw(r11, r11, shiftRevCount); 7576 __ orrw(r12, r10, r11); 7577 __ strw(r12, __ post(newArr, 4)); 7578 __ tbz(numIter, 1, Exit); 7579 __ tbz(numIter, 0, ShiftOne); 7580 7581 __ BIND(ShiftTwo); 7582 __ ldrw(r10, __ post(oldArr, 4)); 7583 __ ldrw(r11, __ post(oldArrNext, 4)); 7584 __ lslvw(r10, r10, shiftCount); 7585 __ lsrvw(r11, r11, shiftRevCount); 7586 __ orrw(r12, r10, r11); 7587 __ strw(r12, __ post(newArr, 4)); 7588 7589 __ BIND(ShiftOne); 7590 __ ldrw(r10, Address(oldArr)); 7591 __ ldrw(r11, Address(oldArrNext)); 7592 __ lslvw(r10, r10, shiftCount); 7593 __ lsrvw(r11, r11, shiftRevCount); 7594 __ orrw(r12, r10, r11); 7595 __ strw(r12, Address(newArr)); 7596 7597 __ BIND(Exit); 7598 __ ret(lr); 7599 7600 return start; 7601 } 7602 7603 address generate_count_positives(address &count_positives_long) { 7604 const u1 
large_loop_size = 64;
7605 const uint64_t UPPER_BIT_MASK = 0x8080808080808080;
7606 int dcache_line = VM_Version::dcache_line_size();
7607
7608 Register ary1 = r1, len = r2, result = r0;
7609
7610 __ align(CodeEntryAlignment);
7611
7612 StubGenStubId stub_id = StubGenStubId::count_positives_id;
7613 StubCodeMark mark(this, stub_id);
7614
7615 address entry = __ pc();
7616
7617 __ enter();
7618 // precondition: a copy of len is already in result
7619 // __ mov(result, len);
7620
7621 Label RET_ADJUST, RET_ADJUST_16, RET_ADJUST_LONG, RET_NO_POP, RET_LEN, ALIGNED, LOOP16, CHECK_16,
7622 LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
7623
7624 __ cmp(len, (u1)15);
7625 __ br(Assembler::GT, LEN_OVER_15);
7626 // The only case when execution falls into this code is when the pointer is
7627 // near the end of a memory page and we have to avoid reading the next page.
7628 __ add(ary1, ary1, len);
7629 __ subs(len, len, 8);
7630 __ br(Assembler::GT, LEN_OVER_8);
7631 __ ldr(rscratch2, Address(ary1, -8));
7632 __ sub(rscratch1, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes.
7633 __ lsrv(rscratch2, rscratch2, rscratch1);
7634 __ tst(rscratch2, UPPER_BIT_MASK);
7635 __ csel(result, zr, result, Assembler::NE);
7636 __ leave();
7637 __ ret(lr);
7638 __ bind(LEN_OVER_8);
7639 __ ldp(rscratch1, rscratch2, Address(ary1, -16));
7640 __ sub(len, len, 8); // no data dep., then sub can be executed while loading
7641 __ tst(rscratch2, UPPER_BIT_MASK);
7642 __ br(Assembler::NE, RET_NO_POP);
7643 __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
7644 __ lsrv(rscratch1, rscratch1, rscratch2);
7645 __ tst(rscratch1, UPPER_BIT_MASK);
7646 __ bind(RET_NO_POP);
7647 __ csel(result, zr, result, Assembler::NE);
7648 __ leave();
7649 __ ret(lr);
7650
7651 Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
7652 const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
7653
7654 count_positives_long = __ pc(); // 2nd entry point
7655
7656 __ enter();
7657
7658 __ bind(LEN_OVER_15);
7659 __ push(spilled_regs, sp);
7660 __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
7661 __ cbz(rscratch2, ALIGNED);
7662 __ ldp(tmp6, tmp1, Address(ary1));
7663 __ mov(tmp5, 16);
7664 __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address
7665 __ add(ary1, ary1, rscratch1);
7666 __ orr(tmp6, tmp6, tmp1);
7667 __ tst(tmp6, UPPER_BIT_MASK);
7668 __ br(Assembler::NE, RET_ADJUST);
7669 __ sub(len, len, rscratch1);
7670
7671 __ bind(ALIGNED);
7672 __ cmp(len, large_loop_size);
7673 __ br(Assembler::LT, CHECK_16);
7674 // Perform a 16-byte load as an early return in the pre-loop to handle the
7675 // situation when an initially aligned large array has negative values at the
7676 // starting bytes, so LARGE_LOOP would do 4 reads instead of 1 (in the worst
7677 // case), which is slower. Cases with negative bytes further ahead won't be
7678 // affected as much. In fact, it'll be faster due to early loads, fewer
7679 // instructions and fewer branches in LARGE_LOOP.
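// A scalar sketch of the check each 16-byte step performs (an illustration,
// not the Java code): a block is all-positive iff no byte has its sign bit
// set, i.e.
//
//   bool all_positive = ((w0 | w1) & UPPER_BIT_MASK) == 0;
//
// where w0/w1 are the two 8-byte words just loaded.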
7680 __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
7681 __ sub(len, len, 16);
7682 __ orr(tmp6, tmp6, tmp1);
7683 __ tst(tmp6, UPPER_BIT_MASK);
7684 __ br(Assembler::NE, RET_ADJUST_16);
7685 __ cmp(len, large_loop_size);
7686 __ br(Assembler::LT, CHECK_16);
7687
7688 if (SoftwarePrefetchHintDistance >= 0
7689 && SoftwarePrefetchHintDistance >= dcache_line) {
7690 // initial prefetch
7691 __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
7692 }
7693 __ bind(LARGE_LOOP);
7694 if (SoftwarePrefetchHintDistance >= 0) {
7695 __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
7696 }
7697 // Issue the load instructions first, since that can save a few CPU/MEM cycles.
7698 // Also, instead of 4 triples of "orr(...); andr(...); cbnz(...);" (one per ldp)
7699 // it is better to generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...), which saves
7700 // 3 instructions per cycle and has fewer branches. However, this approach
7701 // disables the early return, so all 64 bytes are loaded and checked every time.
7702 __ ldp(tmp2, tmp3, Address(ary1));
7703 __ ldp(tmp4, tmp5, Address(ary1, 16));
7704 __ ldp(rscratch1, rscratch2, Address(ary1, 32));
7705 __ ldp(tmp6, tmp1, Address(ary1, 48));
7706 __ add(ary1, ary1, large_loop_size);
7707 __ sub(len, len, large_loop_size);
7708 __ orr(tmp2, tmp2, tmp3);
7709 __ orr(tmp4, tmp4, tmp5);
7710 __ orr(rscratch1, rscratch1, rscratch2);
7711 __ orr(tmp6, tmp6, tmp1);
7712 __ orr(tmp2, tmp2, tmp4);
7713 __ orr(rscratch1, rscratch1, tmp6);
7714 __ orr(tmp2, tmp2, rscratch1);
7715 __ tst(tmp2, UPPER_BIT_MASK);
7716 __ br(Assembler::NE, RET_ADJUST_LONG);
7717 __ cmp(len, large_loop_size);
7718 __ br(Assembler::GE, LARGE_LOOP);
7719
7720 __ bind(CHECK_16); // small 16-byte load pre-loop
7721 __ cmp(len, (u1)16);
7722 __ br(Assembler::LT, POST_LOOP16);
7723
7724 __ bind(LOOP16); // small 16-byte load loop
7725 __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
7726 __ sub(len, len, 16);
7727 __ orr(tmp2, tmp2, tmp3);
7728 __ tst(tmp2, UPPER_BIT_MASK);
7729 __ br(Assembler::NE, RET_ADJUST_16);
7730 __ cmp(len, (u1)16);
7731 __ br(Assembler::GE, LOOP16); // 16-byte load loop end
7732
7733 __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
7734 __ cmp(len, (u1)8);
7735 __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
7736 __ ldr(tmp3, Address(__ post(ary1, 8)));
7737 __ tst(tmp3, UPPER_BIT_MASK);
7738 __ br(Assembler::NE, RET_ADJUST);
7739 __ sub(len, len, 8);
7740
7741 __ bind(POST_LOOP16_LOAD_TAIL);
7742 __ cbz(len, RET_LEN); // Can't shift left by 64 when len==0
7743 __ ldr(tmp1, Address(ary1));
7744 __ mov(tmp2, 64);
7745 __ sub(tmp4, tmp2, len, __ LSL, 3);
7746 __ lslv(tmp1, tmp1, tmp4);
7747 __ tst(tmp1, UPPER_BIT_MASK);
7748 __ br(Assembler::NE, RET_ADJUST);
7749 // Fallthrough
7750
7751 __ bind(RET_LEN);
7752 __ pop(spilled_regs, sp);
7753 __ leave();
7754 __ ret(lr);
7755
7756 // the difference result - len is the count of bytes that are
7757 // guaranteed to be positive
7758
7759 __ bind(RET_ADJUST_LONG);
7760 __ add(len, len, (u1)(large_loop_size - 16));
7761 __ bind(RET_ADJUST_16);
7762 __ add(len, len, 16);
7763 __ bind(RET_ADJUST);
7764 __ pop(spilled_regs, sp);
7765 __ leave();
7766 __ sub(result, result, len);
7767 __ ret(lr);
7768
7769 return entry;
7770 }
7771
7772 void generate_large_array_equals_loop_nonsimd(int loopThreshold,
7773 bool usePrefetch, Label &NOT_EQUAL) {
7774 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
7775 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
7776 tmp7 = r12, tmp8 = r13;
7777 Label LOOP;
7778
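// Roughly, each iteration of the loop below compares 8 words (64 bytes)
// from each array: corresponding words are XORed, the XOR results are ORed
// together and a single cbnz per 16-byte chunk detects any mismatch. The
// loads for the next chunk are issued before the compares of the previous
// one so the loop stays ahead of the load latency.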
7779 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 7780 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 7781 __ bind(LOOP); 7782 if (usePrefetch) { 7783 __ prfm(Address(a1, SoftwarePrefetchHintDistance)); 7784 __ prfm(Address(a2, SoftwarePrefetchHintDistance)); 7785 } 7786 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize))); 7787 __ eor(tmp1, tmp1, tmp2); 7788 __ eor(tmp3, tmp3, tmp4); 7789 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize))); 7790 __ orr(tmp1, tmp1, tmp3); 7791 __ cbnz(tmp1, NOT_EQUAL); 7792 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 7793 __ eor(tmp5, tmp5, tmp6); 7794 __ eor(tmp7, tmp7, tmp8); 7795 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 7796 __ orr(tmp5, tmp5, tmp7); 7797 __ cbnz(tmp5, NOT_EQUAL); 7798 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize))); 7799 __ eor(tmp1, tmp1, tmp2); 7800 __ eor(tmp3, tmp3, tmp4); 7801 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize))); 7802 __ orr(tmp1, tmp1, tmp3); 7803 __ cbnz(tmp1, NOT_EQUAL); 7804 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 7805 __ eor(tmp5, tmp5, tmp6); 7806 __ sub(cnt1, cnt1, 8 * wordSize); 7807 __ eor(tmp7, tmp7, tmp8); 7808 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 7809 // tmp6 is not used. MacroAssembler::subs is used here (rather than 7810 // cmp) because subs allows an unlimited range of immediate operand. 7811 __ subs(tmp6, cnt1, loopThreshold); 7812 __ orr(tmp5, tmp5, tmp7); 7813 __ cbnz(tmp5, NOT_EQUAL); 7814 __ br(__ GE, LOOP); 7815 // post-loop 7816 __ eor(tmp1, tmp1, tmp2); 7817 __ eor(tmp3, tmp3, tmp4); 7818 __ orr(tmp1, tmp1, tmp3); 7819 __ sub(cnt1, cnt1, 2 * wordSize); 7820 __ cbnz(tmp1, NOT_EQUAL); 7821 } 7822 7823 void generate_large_array_equals_loop_simd(int loopThreshold, 7824 bool usePrefetch, Label &NOT_EQUAL) { 7825 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 7826 tmp2 = rscratch2; 7827 Label LOOP; 7828 7829 __ bind(LOOP); 7830 if (usePrefetch) { 7831 __ prfm(Address(a1, SoftwarePrefetchHintDistance)); 7832 __ prfm(Address(a2, SoftwarePrefetchHintDistance)); 7833 } 7834 __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize))); 7835 __ sub(cnt1, cnt1, 8 * wordSize); 7836 __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize))); 7837 __ subs(tmp1, cnt1, loopThreshold); 7838 __ eor(v0, __ T16B, v0, v4); 7839 __ eor(v1, __ T16B, v1, v5); 7840 __ eor(v2, __ T16B, v2, v6); 7841 __ eor(v3, __ T16B, v3, v7); 7842 __ orr(v0, __ T16B, v0, v1); 7843 __ orr(v1, __ T16B, v2, v3); 7844 __ orr(v0, __ T16B, v0, v1); 7845 __ umov(tmp1, v0, __ D, 0); 7846 __ umov(tmp2, v0, __ D, 1); 7847 __ orr(tmp1, tmp1, tmp2); 7848 __ cbnz(tmp1, NOT_EQUAL); 7849 __ br(__ GE, LOOP); 7850 } 7851 7852 // a1 = r1 - array1 address 7853 // a2 = r2 - array2 address 7854 // result = r0 - return value. Already contains "false" 7855 // cnt1 = r10 - amount of elements left to check, reduced by wordSize 7856 // r3-r5 are reserved temporary registers 7857 // Clobbers: v0-v7 when UseSIMDForArrayEquals, rscratch1, rscratch2 7858 address generate_large_array_equals() { 7859 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 7860 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11, 7861 tmp7 = r12, tmp8 = r13; 7862 Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP, 7863 SMALL_LOOP, POST_LOOP; 7864 const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 
0 : 16;
7865 // calculate if at least 32 prefetched bytes are used
7866 int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
7867 int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
7868 RegSet spilled_regs = RegSet::range(tmp6, tmp8);
7869 assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
7870 tmp5, tmp6, tmp7, tmp8);
7871
7872 __ align(CodeEntryAlignment);
7873
7874 StubGenStubId stub_id = StubGenStubId::large_array_equals_id;
7875 StubCodeMark mark(this, stub_id);
7876
7877 address entry = __ pc();
7878 __ enter();
7879 __ sub(cnt1, cnt1, wordSize); // first 8 bytes were loaded outside of stub
7880 // also advance pointers to use post-increment instead of pre-increment
7881 __ add(a1, a1, wordSize);
7882 __ add(a2, a2, wordSize);
7883 if (AvoidUnalignedAccesses) {
7884 // both implementations (SIMD/nonSIMD) use relatively large load
7885 // instructions (ld1/ldp), which have a huge penalty (up to 2x exec time)
7886 // on some CPUs when the address is not at least 16-byte aligned.
7887 // Arrays are currently 8-byte aligned, so, if needed, we do an additional
7888 // 8-byte load for the 1st address to make it 16-byte aligned.
7889 Label ALIGNED16;
7890 __ tbz(a1, 3, ALIGNED16);
7891 __ ldr(tmp1, Address(__ post(a1, wordSize)));
7892 __ ldr(tmp2, Address(__ post(a2, wordSize)));
7893 __ sub(cnt1, cnt1, wordSize);
7894 __ eor(tmp1, tmp1, tmp2);
7895 __ cbnz(tmp1, NOT_EQUAL_NO_POP);
7896 __ bind(ALIGNED16);
7897 }
7898 if (UseSIMDForArrayEquals) {
7899 if (SoftwarePrefetchHintDistance >= 0) {
7900 __ subs(tmp1, cnt1, prefetchLoopThreshold);
7901 __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
7902 generate_large_array_equals_loop_simd(prefetchLoopThreshold,
7903 /* prfm = */ true, NOT_EQUAL);
7904 __ subs(zr, cnt1, nonPrefetchLoopThreshold);
7905 __ br(__ LT, TAIL);
7906 }
7907 __ bind(NO_PREFETCH_LARGE_LOOP);
7908 generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
7909 /* prfm = */ false, NOT_EQUAL);
7910 } else {
7911 __ push(spilled_regs, sp);
7912 if (SoftwarePrefetchHintDistance >= 0) {
7913 __ subs(tmp1, cnt1, prefetchLoopThreshold);
7914 __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
7915 generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
7916 /* prfm = */ true, NOT_EQUAL);
7917 __ subs(zr, cnt1, nonPrefetchLoopThreshold);
7918 __ br(__ LT, TAIL);
7919 }
7920 __ bind(NO_PREFETCH_LARGE_LOOP);
7921 generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
7922 /* prfm = */ false, NOT_EQUAL);
7923 }
7924 __ bind(TAIL);
7925 __ cbz(cnt1, EQUAL);
7926 __ subs(cnt1, cnt1, wordSize);
7927 __ br(__ LE, POST_LOOP);
7928 __ bind(SMALL_LOOP);
7929 __ ldr(tmp1, Address(__ post(a1, wordSize)));
7930 __ ldr(tmp2, Address(__ post(a2, wordSize)));
7931 __ subs(cnt1, cnt1, wordSize);
7932 __ eor(tmp1, tmp1, tmp2);
7933 __ cbnz(tmp1, NOT_EQUAL);
7934 __ br(__ GT, SMALL_LOOP);
7935 __ bind(POST_LOOP);
7936 __ ldr(tmp1, Address(a1, cnt1));
7937 __ ldr(tmp2, Address(a2, cnt1));
7938 __ eor(tmp1, tmp1, tmp2);
7939 __ cbnz(tmp1, NOT_EQUAL);
7940 __ bind(EQUAL);
7941 __ mov(result, true);
7942 __ bind(NOT_EQUAL);
7943 if (!UseSIMDForArrayEquals) {
7944 __ pop(spilled_regs, sp);
7945 }
7946 __ bind(NOT_EQUAL_NO_POP);
7947 __ leave();
7948 __ ret(lr);
7949 return entry;
7950 }
7951
7952 // result = r0 - return value. Contains initial hashcode value on entry.
7953 // ary = r1 - array address 7954 // cnt = r2 - elements count 7955 // Clobbers: v0-v13, rscratch1, rscratch2 7956 address generate_large_arrays_hashcode(BasicType eltype) { 7957 const Register result = r0, ary = r1, cnt = r2; 7958 const FloatRegister vdata0 = v3, vdata1 = v2, vdata2 = v1, vdata3 = v0; 7959 const FloatRegister vmul0 = v4, vmul1 = v5, vmul2 = v6, vmul3 = v7; 7960 const FloatRegister vpow = v12; // powers of 31: <31^3, ..., 31^0> 7961 const FloatRegister vpowm = v13; 7962 7963 ARRAYS_HASHCODE_REGISTERS; 7964 7965 Label SMALL_LOOP, LARGE_LOOP_PREHEADER, LARGE_LOOP, TAIL, TAIL_SHORTCUT, BR_BASE; 7966 7967 unsigned int vf; // vectorization factor 7968 bool multiply_by_halves; 7969 Assembler::SIMD_Arrangement load_arrangement; 7970 switch (eltype) { 7971 case T_BOOLEAN: 7972 case T_BYTE: 7973 load_arrangement = Assembler::T8B; 7974 multiply_by_halves = true; 7975 vf = 8; 7976 break; 7977 case T_CHAR: 7978 case T_SHORT: 7979 load_arrangement = Assembler::T8H; 7980 multiply_by_halves = true; 7981 vf = 8; 7982 break; 7983 case T_INT: 7984 load_arrangement = Assembler::T4S; 7985 multiply_by_halves = false; 7986 vf = 4; 7987 break; 7988 default: 7989 ShouldNotReachHere(); 7990 } 7991 7992 // Unroll factor 7993 const unsigned uf = 4; 7994 7995 // Effective vectorization factor 7996 const unsigned evf = vf * uf; 7997 7998 __ align(CodeEntryAlignment); 7999 8000 StubGenStubId stub_id; 8001 switch (eltype) { 8002 case T_BOOLEAN: 8003 stub_id = StubGenStubId::large_arrays_hashcode_boolean_id; 8004 break; 8005 case T_BYTE: 8006 stub_id = StubGenStubId::large_arrays_hashcode_byte_id; 8007 break; 8008 case T_CHAR: 8009 stub_id = StubGenStubId::large_arrays_hashcode_char_id; 8010 break; 8011 case T_SHORT: 8012 stub_id = StubGenStubId::large_arrays_hashcode_short_id; 8013 break; 8014 case T_INT: 8015 stub_id = StubGenStubId::large_arrays_hashcode_int_id; 8016 break; 8017 default: 8018 stub_id = StubGenStubId::NO_STUBID; 8019 ShouldNotReachHere(); 8020 }; 8021 8022 StubCodeMark mark(this, stub_id); 8023 8024 address entry = __ pc(); 8025 __ enter(); 8026 8027 // Put 0-3'th powers of 31 into a single SIMD register together. The register will be used in 8028 // the SMALL and LARGE LOOPS' epilogues. The initialization is hoisted here and the register's 8029 // value shouldn't change throughout both loops. 8030 __ movw(rscratch1, intpow(31U, 3)); 8031 __ mov(vpow, Assembler::S, 0, rscratch1); 8032 __ movw(rscratch1, intpow(31U, 2)); 8033 __ mov(vpow, Assembler::S, 1, rscratch1); 8034 __ movw(rscratch1, intpow(31U, 1)); 8035 __ mov(vpow, Assembler::S, 2, rscratch1); 8036 __ movw(rscratch1, intpow(31U, 0)); 8037 __ mov(vpow, Assembler::S, 3, rscratch1); 8038 8039 __ mov(vmul0, Assembler::T16B, 0); 8040 __ mov(vmul0, Assembler::S, 3, result); 8041 8042 __ andr(rscratch2, cnt, (uf - 1) * vf); 8043 __ cbz(rscratch2, LARGE_LOOP_PREHEADER); 8044 8045 __ movw(rscratch1, intpow(31U, multiply_by_halves ? 
vf / 2 : vf));
8046 __ mov(vpowm, Assembler::S, 0, rscratch1);
8047
8048 // SMALL LOOP
8049 __ bind(SMALL_LOOP);
8050
8051 __ ld1(vdata0, load_arrangement, Address(__ post(ary, vf * type2aelembytes(eltype))));
8052 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
8053 __ subsw(rscratch2, rscratch2, vf);
8054
8055 if (load_arrangement == Assembler::T8B) {
8056 // Extend 8B to 8H to be able to use vector multiply
8057 // instructions
8058 assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H");
8059 if (is_signed_subword_type(eltype)) {
8060 __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
8061 } else {
8062 __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
8063 }
8064 }
8065
8066 switch (load_arrangement) {
8067 case Assembler::T4S:
8068 __ addv(vmul0, load_arrangement, vmul0, vdata0);
8069 break;
8070 case Assembler::T8B:
8071 case Assembler::T8H:
8072 assert(is_subword_type(eltype), "subword type expected");
8073 if (is_signed_subword_type(eltype)) {
8074 __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
8075 } else {
8076 __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
8077 }
8078 break;
8079 default:
8080 __ should_not_reach_here();
8081 }
8082
8083 // Process the upper half of a vector
8084 if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) {
8085 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
8086 if (is_signed_subword_type(eltype)) {
8087 __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
8088 } else {
8089 __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
8090 }
8091 }
8092
8093 __ br(Assembler::HI, SMALL_LOOP);
8094
8095 // SMALL LOOP'S EPILOGUE
8096 __ lsr(rscratch2, cnt, exact_log2(evf));
8097 __ cbnz(rscratch2, LARGE_LOOP_PREHEADER);
8098
8099 __ mulv(vmul0, Assembler::T4S, vmul0, vpow);
8100 __ addv(vmul0, Assembler::T4S, vmul0);
8101 __ umov(result, vmul0, Assembler::S, 0);
8102
8103 // TAIL
8104 __ bind(TAIL);
8105
8106 // The andr performs cnt % vf. The subtract shifted by 3 offsets past vf - 1 - (cnt % vf) pairs
8107 // of load + madd insns i.e. it only executes cnt % vf load + madd pairs.
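// A hedged sketch of the scalar tail this computed branch implements: only
// the last (cnt % vf) elements remain, and each one is folded in with the
// usual Java hash step
//
//   result = 31 * result + ary[i];    // rscratch2 holds 31 (0x1f)
//
// The br below jumps into the unrolled load + madd sequence so that exactly
// (cnt % vf) of those steps execute before BR_BASE.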
8108 assert(is_power_of_2(vf), "can't use this value to calculate the jump target PC"); 8109 __ andr(rscratch2, cnt, vf - 1); 8110 __ bind(TAIL_SHORTCUT); 8111 __ adr(rscratch1, BR_BASE); 8112 __ sub(rscratch1, rscratch1, rscratch2, ext::uxtw, 3); 8113 __ movw(rscratch2, 0x1f); 8114 __ br(rscratch1); 8115 8116 for (size_t i = 0; i < vf - 1; ++i) { 8117 __ load(rscratch1, Address(__ post(ary, type2aelembytes(eltype))), 8118 eltype); 8119 __ maddw(result, result, rscratch2, rscratch1); 8120 } 8121 __ bind(BR_BASE); 8122 8123 __ leave(); 8124 __ ret(lr); 8125 8126 // LARGE LOOP 8127 __ bind(LARGE_LOOP_PREHEADER); 8128 8129 __ lsr(rscratch2, cnt, exact_log2(evf)); 8130 8131 if (multiply_by_halves) { 8132 // 31^4 - multiplier between lower and upper parts of a register 8133 __ movw(rscratch1, intpow(31U, vf / 2)); 8134 __ mov(vpowm, Assembler::S, 1, rscratch1); 8135 // 31^28 - remainder of the iteraion multiplier, 28 = 32 - 4 8136 __ movw(rscratch1, intpow(31U, evf - vf / 2)); 8137 __ mov(vpowm, Assembler::S, 0, rscratch1); 8138 } else { 8139 // 31^16 8140 __ movw(rscratch1, intpow(31U, evf)); 8141 __ mov(vpowm, Assembler::S, 0, rscratch1); 8142 } 8143 8144 __ mov(vmul3, Assembler::T16B, 0); 8145 __ mov(vmul2, Assembler::T16B, 0); 8146 __ mov(vmul1, Assembler::T16B, 0); 8147 8148 __ bind(LARGE_LOOP); 8149 8150 __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 0); 8151 __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 0); 8152 __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 0); 8153 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0); 8154 8155 __ ld1(vdata3, vdata2, vdata1, vdata0, load_arrangement, 8156 Address(__ post(ary, evf * type2aelembytes(eltype)))); 8157 8158 if (load_arrangement == Assembler::T8B) { 8159 // Extend 8B to 8H to be able to use vector multiply 8160 // instructions 8161 assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H"); 8162 if (is_signed_subword_type(eltype)) { 8163 __ sxtl(vdata3, Assembler::T8H, vdata3, load_arrangement); 8164 __ sxtl(vdata2, Assembler::T8H, vdata2, load_arrangement); 8165 __ sxtl(vdata1, Assembler::T8H, vdata1, load_arrangement); 8166 __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement); 8167 } else { 8168 __ uxtl(vdata3, Assembler::T8H, vdata3, load_arrangement); 8169 __ uxtl(vdata2, Assembler::T8H, vdata2, load_arrangement); 8170 __ uxtl(vdata1, Assembler::T8H, vdata1, load_arrangement); 8171 __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement); 8172 } 8173 } 8174 8175 switch (load_arrangement) { 8176 case Assembler::T4S: 8177 __ addv(vmul3, load_arrangement, vmul3, vdata3); 8178 __ addv(vmul2, load_arrangement, vmul2, vdata2); 8179 __ addv(vmul1, load_arrangement, vmul1, vdata1); 8180 __ addv(vmul0, load_arrangement, vmul0, vdata0); 8181 break; 8182 case Assembler::T8B: 8183 case Assembler::T8H: 8184 assert(is_subword_type(eltype), "subword type expected"); 8185 if (is_signed_subword_type(eltype)) { 8186 __ saddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H); 8187 __ saddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H); 8188 __ saddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H); 8189 __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H); 8190 } else { 8191 __ uaddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H); 8192 __ uaddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H); 8193 __ uaddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H); 8194 __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H); 8195 } 8196 break; 8197 default: 8198 __ should_not_reach_here(); 
8199 } 8200 8201 // Process the upper half of a vector 8202 if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) { 8203 __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 1); 8204 __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 1); 8205 __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 1); 8206 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 1); 8207 if (is_signed_subword_type(eltype)) { 8208 __ saddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H); 8209 __ saddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H); 8210 __ saddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H); 8211 __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H); 8212 } else { 8213 __ uaddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H); 8214 __ uaddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H); 8215 __ uaddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H); 8216 __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H); 8217 } 8218 } 8219 8220 __ subsw(rscratch2, rscratch2, 1); 8221 __ br(Assembler::HI, LARGE_LOOP); 8222 8223 __ mulv(vmul3, Assembler::T4S, vmul3, vpow); 8224 __ addv(vmul3, Assembler::T4S, vmul3); 8225 __ umov(result, vmul3, Assembler::S, 0); 8226 8227 __ mov(rscratch2, intpow(31U, vf)); 8228 8229 __ mulv(vmul2, Assembler::T4S, vmul2, vpow); 8230 __ addv(vmul2, Assembler::T4S, vmul2); 8231 __ umov(rscratch1, vmul2, Assembler::S, 0); 8232 __ maddw(result, result, rscratch2, rscratch1); 8233 8234 __ mulv(vmul1, Assembler::T4S, vmul1, vpow); 8235 __ addv(vmul1, Assembler::T4S, vmul1); 8236 __ umov(rscratch1, vmul1, Assembler::S, 0); 8237 __ maddw(result, result, rscratch2, rscratch1); 8238 8239 __ mulv(vmul0, Assembler::T4S, vmul0, vpow); 8240 __ addv(vmul0, Assembler::T4S, vmul0); 8241 __ umov(rscratch1, vmul0, Assembler::S, 0); 8242 __ maddw(result, result, rscratch2, rscratch1); 8243 8244 __ andr(rscratch2, cnt, vf - 1); 8245 __ cbnz(rscratch2, TAIL_SHORTCUT); 8246 8247 __ leave(); 8248 __ ret(lr); 8249 8250 return entry; 8251 } 8252 8253 address generate_dsin_dcos(bool isCos) { 8254 __ align(CodeEntryAlignment); 8255 StubGenStubId stub_id = (isCos ? 
StubGenStubId::dcos_id : StubGenStubId::dsin_id); 8256 StubCodeMark mark(this, stub_id); 8257 address start = __ pc(); 8258 __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw, 8259 (address)StubRoutines::aarch64::_two_over_pi, 8260 (address)StubRoutines::aarch64::_pio2, 8261 (address)StubRoutines::aarch64::_dsin_coef, 8262 (address)StubRoutines::aarch64::_dcos_coef); 8263 return start; 8264 } 8265 8266 // code for comparing 16 characters of strings with Latin1 and Utf16 encoding 8267 void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1, 8268 Label &DIFF2) { 8269 Register cnt1 = r2, tmp2 = r11, tmp3 = r12; 8270 FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2; 8271 8272 __ ldrq(vtmp, Address(__ post(tmp2, 16))); 8273 __ ldr(tmpU, Address(__ post(cnt1, 8))); 8274 __ zip1(vtmp3, __ T16B, vtmp, vtmpZ); 8275 // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3 8276 8277 __ fmovd(tmpL, vtmp3); 8278 __ eor(rscratch2, tmp3, tmpL); 8279 __ cbnz(rscratch2, DIFF2); 8280 8281 __ ldr(tmp3, Address(__ post(cnt1, 8))); 8282 __ umov(tmpL, vtmp3, __ D, 1); 8283 __ eor(rscratch2, tmpU, tmpL); 8284 __ cbnz(rscratch2, DIFF1); 8285 8286 __ zip2(vtmp, __ T16B, vtmp, vtmpZ); 8287 __ ldr(tmpU, Address(__ post(cnt1, 8))); 8288 __ fmovd(tmpL, vtmp); 8289 __ eor(rscratch2, tmp3, tmpL); 8290 __ cbnz(rscratch2, DIFF2); 8291 8292 __ ldr(tmp3, Address(__ post(cnt1, 8))); 8293 __ umov(tmpL, vtmp, __ D, 1); 8294 __ eor(rscratch2, tmpU, tmpL); 8295 __ cbnz(rscratch2, DIFF1); 8296 } 8297 8298 // r0 = result 8299 // r1 = str1 8300 // r2 = cnt1 8301 // r3 = str2 8302 // r4 = cnt2 8303 // r10 = tmp1 8304 // r11 = tmp2 8305 address generate_compare_long_string_different_encoding(bool isLU) { 8306 __ align(CodeEntryAlignment); 8307 StubGenStubId stub_id = (isLU ? StubGenStubId::compare_long_string_LU_id : StubGenStubId::compare_long_string_UL_id); 8308 StubCodeMark mark(this, stub_id); 8309 address entry = __ pc(); 8310 Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2, 8311 DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH, 8312 LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2; 8313 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 8314 tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14; 8315 FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2; 8316 RegSet spilled_regs = RegSet::of(tmp3, tmp4); 8317 8318 int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2); 8319 8320 __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ); 8321 // cnt2 == amount of characters left to compare 8322 // Check already loaded first 4 symbols(vtmp and tmp2(LU)/tmp1(UL)) 8323 __ zip1(vtmp, __ T8B, vtmp, vtmpZ); 8324 __ add(str1, str1, isLU ? wordSize/2 : wordSize); 8325 __ add(str2, str2, isLU ? wordSize : wordSize/2); 8326 __ fmovd(isLU ? tmp1 : tmp2, vtmp); 8327 __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case. 8328 __ eor(rscratch2, tmp1, tmp2); 8329 __ mov(rscratch1, tmp2); 8330 __ cbnz(rscratch2, CALCULATE_DIFFERENCE); 8331 Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison 8332 tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison 8333 __ push(spilled_regs, sp); 8334 __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load 8335 __ mov(cnt1, isLU ? 
str2 : str1); // init the pointer to U next load 8336 8337 __ ldr(tmp3, Address(__ post(cnt1, 8))); 8338 8339 if (SoftwarePrefetchHintDistance >= 0) { 8340 __ subs(rscratch2, cnt2, prefetchLoopExitCondition); 8341 __ br(__ LT, NO_PREFETCH); 8342 __ bind(LARGE_LOOP_PREFETCH); 8343 __ prfm(Address(tmp2, SoftwarePrefetchHintDistance)); 8344 __ mov(tmp4, 2); 8345 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance)); 8346 __ bind(LARGE_LOOP_PREFETCH_REPEAT1); 8347 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 8348 __ subs(tmp4, tmp4, 1); 8349 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1); 8350 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance)); 8351 __ mov(tmp4, 2); 8352 __ bind(LARGE_LOOP_PREFETCH_REPEAT2); 8353 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 8354 __ subs(tmp4, tmp4, 1); 8355 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2); 8356 __ sub(cnt2, cnt2, 64); 8357 __ subs(rscratch2, cnt2, prefetchLoopExitCondition); 8358 __ br(__ GE, LARGE_LOOP_PREFETCH); 8359 } 8360 __ cbz(cnt2, LOAD_LAST); // no characters left except last load 8361 __ bind(NO_PREFETCH); 8362 __ subs(cnt2, cnt2, 16); 8363 __ br(__ LT, TAIL); 8364 __ align(OptoLoopAlignment); 8365 __ bind(SMALL_LOOP); // smaller loop 8366 __ subs(cnt2, cnt2, 16); 8367 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 8368 __ br(__ GE, SMALL_LOOP); 8369 __ cmn(cnt2, (u1)16); 8370 __ br(__ EQ, LOAD_LAST); 8371 __ bind(TAIL); // 1..15 characters left until last load (last 4 characters) 8372 __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string 8373 __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string 8374 __ ldr(tmp3, Address(cnt1, -8)); 8375 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load 8376 __ b(LOAD_LAST); 8377 __ bind(DIFF2); 8378 __ mov(tmpU, tmp3); 8379 __ bind(DIFF1); 8380 __ pop(spilled_regs, sp); 8381 __ b(CALCULATE_DIFFERENCE); 8382 __ bind(LOAD_LAST); 8383 // Last 4 UTF-16 characters are already pre-loaded into tmp3 by compare_string_16_x_LU. 8384 // No need to load it again 8385 __ mov(tmpU, tmp3); 8386 __ pop(spilled_regs, sp); 8387 8388 // tmp2 points to the address of the last 4 Latin1 characters right now 8389 __ ldrs(vtmp, Address(tmp2)); 8390 __ zip1(vtmp, __ T8B, vtmp, vtmpZ); 8391 __ fmovd(tmpL, vtmp); 8392 8393 __ eor(rscratch2, tmpU, tmpL); 8394 __ cbz(rscratch2, DONE); 8395 8396 // Find the first different characters in the longwords and 8397 // compute their difference. 
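    // Rough C sketch of the block below (illustrative only, not generated code),
    // with x = the XOR of the two expanded 8-byte words (non-zero here) and
    // byte_reverse() standing in for the REV instruction; word1/word2 are
    // placeholders for the values held in tmp1 and rscratch1:
    //
    //   int bit = __builtin_clzll(byte_reverse(x)) & ~15;  // bit offset of the
    //                                                      // first differing char
    //   return (uint16_t)(word1 >> bit) - (uint16_t)(word2 >> bit);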
8398 __ bind(CALCULATE_DIFFERENCE); 8399 __ rev(rscratch2, rscratch2); 8400 __ clz(rscratch2, rscratch2); 8401 __ andr(rscratch2, rscratch2, -16); 8402 __ lsrv(tmp1, tmp1, rscratch2); 8403 __ uxthw(tmp1, tmp1); 8404 __ lsrv(rscratch1, rscratch1, rscratch2); 8405 __ uxthw(rscratch1, rscratch1); 8406 __ subw(result, tmp1, rscratch1); 8407 __ bind(DONE); 8408 __ ret(lr); 8409 return entry; 8410 } 8411 8412 // r0 = input (float16) 8413 // v0 = result (float) 8414 // v1 = temporary float register 8415 address generate_float16ToFloat() { 8416 __ align(CodeEntryAlignment); 8417 StubGenStubId stub_id = StubGenStubId::hf2f_id; 8418 StubCodeMark mark(this, stub_id); 8419 address entry = __ pc(); 8420 BLOCK_COMMENT("Entry:"); 8421 __ flt16_to_flt(v0, r0, v1); 8422 __ ret(lr); 8423 return entry; 8424 } 8425 8426 // v0 = input (float) 8427 // r0 = result (float16) 8428 // v1 = temporary float register 8429 address generate_floatToFloat16() { 8430 __ align(CodeEntryAlignment); 8431 StubGenStubId stub_id = StubGenStubId::f2hf_id; 8432 StubCodeMark mark(this, stub_id); 8433 address entry = __ pc(); 8434 BLOCK_COMMENT("Entry:"); 8435 __ flt_to_flt16(r0, v0, v1); 8436 __ ret(lr); 8437 return entry; 8438 } 8439 8440 address generate_method_entry_barrier() { 8441 __ align(CodeEntryAlignment); 8442 StubGenStubId stub_id = StubGenStubId::method_entry_barrier_id; 8443 StubCodeMark mark(this, stub_id); 8444 8445 Label deoptimize_label; 8446 8447 address start = __ pc(); 8448 8449 BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler(); 8450 8451 if (bs_asm->nmethod_patching_type() == NMethodPatchingType::conc_instruction_and_data_patch) { 8452 BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod(); 8453 // We can get here despite the nmethod being good, if we have not 8454 // yet applied our cross modification fence (or data fence). 8455 Address thread_epoch_addr(rthread, in_bytes(bs_nm->thread_disarmed_guard_value_offset()) + 4); 8456 __ lea(rscratch2, ExternalAddress(bs_asm->patching_epoch_addr())); 8457 __ ldrw(rscratch2, rscratch2); 8458 __ strw(rscratch2, thread_epoch_addr); 8459 __ isb(); 8460 __ membar(__ LoadLoad); 8461 } 8462 8463 __ set_last_Java_frame(sp, rfp, lr, rscratch1); 8464 8465 __ enter(); 8466 __ add(rscratch2, sp, wordSize); // rscratch2 points to the saved lr 8467 8468 __ sub(sp, sp, 4 * wordSize); // four words for the returned {sp, fp, lr, pc} 8469 8470 __ push_call_clobbered_registers(); 8471 8472 __ mov(c_rarg0, rscratch2); 8473 __ call_VM_leaf 8474 (CAST_FROM_FN_PTR 8475 (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1); 8476 8477 __ reset_last_Java_frame(true); 8478 8479 __ mov(rscratch1, r0); 8480 8481 __ pop_call_clobbered_registers(); 8482 8483 __ cbnz(rscratch1, deoptimize_label); 8484 8485 __ leave(); 8486 __ ret(lr); 8487 8488 __ BIND(deoptimize_label); 8489 8490 __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize)); 8491 __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize)); 8492 8493 __ mov(sp, rscratch1); 8494 __ br(rscratch2); 8495 8496 return start; 8497 } 8498 8499 // r0 = result 8500 // r1 = str1 8501 // r2 = cnt1 8502 // r3 = str2 8503 // r4 = cnt2 8504 // r10 = tmp1 8505 // r11 = tmp2 8506 address generate_compare_long_string_same_encoding(bool isLL) { 8507 __ align(CodeEntryAlignment); 8508 StubGenStubId stub_id = (isLL ? 
StubGenStubId::compare_long_string_LL_id : StubGenStubId::compare_long_string_UU_id); 8509 StubCodeMark mark(this, stub_id); 8510 address entry = __ pc(); 8511 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 8512 tmp1 = r10, tmp2 = r11, tmp1h = rscratch1, tmp2h = rscratch2; 8513 8514 Label LARGE_LOOP_PREFETCH, LOOP_COMPARE16, DIFF, LESS16, LESS8, CAL_DIFFERENCE, LENGTH_DIFF; 8515 8516 // exit from large loop when less than 64 bytes left to read or we're about 8517 // to prefetch memory behind array border 8518 int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2); 8519 8520 // before jumping to stub, pre-load 8 bytes already, so do comparison directly 8521 __ eor(rscratch2, tmp1, tmp2); 8522 __ cbnz(rscratch2, CAL_DIFFERENCE); 8523 8524 __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2)); 8525 // update pointers, because of previous read 8526 __ add(str1, str1, wordSize); 8527 __ add(str2, str2, wordSize); 8528 if (SoftwarePrefetchHintDistance >= 0) { 8529 __ align(OptoLoopAlignment); 8530 __ bind(LARGE_LOOP_PREFETCH); 8531 __ prfm(Address(str1, SoftwarePrefetchHintDistance)); 8532 __ prfm(Address(str2, SoftwarePrefetchHintDistance)); 8533 8534 for (int i = 0; i < 4; i++) { 8535 __ ldp(tmp1, tmp1h, Address(str1, i * 16)); 8536 __ ldp(tmp2, tmp2h, Address(str2, i * 16)); 8537 __ cmp(tmp1, tmp2); 8538 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ); 8539 __ br(Assembler::NE, DIFF); 8540 } 8541 __ sub(cnt2, cnt2, isLL ? 64 : 32); 8542 __ add(str1, str1, 64); 8543 __ add(str2, str2, 64); 8544 __ subs(rscratch2, cnt2, largeLoopExitCondition); 8545 __ br(Assembler::GE, LARGE_LOOP_PREFETCH); 8546 __ cbz(cnt2, LENGTH_DIFF); // no more chars left? 8547 } 8548 8549 __ subs(rscratch1, cnt2, isLL ? 16 : 8); 8550 __ br(Assembler::LE, LESS16); 8551 __ align(OptoLoopAlignment); 8552 __ bind(LOOP_COMPARE16); 8553 __ ldp(tmp1, tmp1h, Address(__ post(str1, 16))); 8554 __ ldp(tmp2, tmp2h, Address(__ post(str2, 16))); 8555 __ cmp(tmp1, tmp2); 8556 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ); 8557 __ br(Assembler::NE, DIFF); 8558 __ sub(cnt2, cnt2, isLL ? 16 : 8); 8559 __ subs(rscratch2, cnt2, isLL ? 16 : 8); 8560 __ br(Assembler::LT, LESS16); 8561 8562 __ ldp(tmp1, tmp1h, Address(__ post(str1, 16))); 8563 __ ldp(tmp2, tmp2h, Address(__ post(str2, 16))); 8564 __ cmp(tmp1, tmp2); 8565 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ); 8566 __ br(Assembler::NE, DIFF); 8567 __ sub(cnt2, cnt2, isLL ? 16 : 8); 8568 __ subs(rscratch2, cnt2, isLL ? 16 : 8); 8569 __ br(Assembler::GE, LOOP_COMPARE16); 8570 __ cbz(cnt2, LENGTH_DIFF); 8571 8572 __ bind(LESS16); 8573 // each 8 compare 8574 __ subs(cnt2, cnt2, isLL ? 8 : 4); 8575 __ br(Assembler::LE, LESS8); 8576 __ ldr(tmp1, Address(__ post(str1, 8))); 8577 __ ldr(tmp2, Address(__ post(str2, 8))); 8578 __ eor(rscratch2, tmp1, tmp2); 8579 __ cbnz(rscratch2, CAL_DIFFERENCE); 8580 __ sub(cnt2, cnt2, isLL ? 
8 : 4); 8581 8582 __ bind(LESS8); // directly load last 8 bytes 8583 if (!isLL) { 8584 __ add(cnt2, cnt2, cnt2); 8585 } 8586 __ ldr(tmp1, Address(str1, cnt2)); 8587 __ ldr(tmp2, Address(str2, cnt2)); 8588 __ eor(rscratch2, tmp1, tmp2); 8589 __ cbz(rscratch2, LENGTH_DIFF); 8590 __ b(CAL_DIFFERENCE); 8591 8592 __ bind(DIFF); 8593 __ cmp(tmp1, tmp2); 8594 __ csel(tmp1, tmp1, tmp1h, Assembler::NE); 8595 __ csel(tmp2, tmp2, tmp2h, Assembler::NE); 8596 // reuse rscratch2 register for the result of eor instruction 8597 __ eor(rscratch2, tmp1, tmp2); 8598 8599 __ bind(CAL_DIFFERENCE); 8600 __ rev(rscratch2, rscratch2); 8601 __ clz(rscratch2, rscratch2); 8602 __ andr(rscratch2, rscratch2, isLL ? -8 : -16); 8603 __ lsrv(tmp1, tmp1, rscratch2); 8604 __ lsrv(tmp2, tmp2, rscratch2); 8605 if (isLL) { 8606 __ uxtbw(tmp1, tmp1); 8607 __ uxtbw(tmp2, tmp2); 8608 } else { 8609 __ uxthw(tmp1, tmp1); 8610 __ uxthw(tmp2, tmp2); 8611 } 8612 __ subw(result, tmp1, tmp2); 8613 8614 __ bind(LENGTH_DIFF); 8615 __ ret(lr); 8616 return entry; 8617 } 8618 8619 enum string_compare_mode { 8620 LL, 8621 LU, 8622 UL, 8623 UU, 8624 }; 8625 8626 // The following registers are declared in aarch64.ad 8627 // r0 = result 8628 // r1 = str1 8629 // r2 = cnt1 8630 // r3 = str2 8631 // r4 = cnt2 8632 // r10 = tmp1 8633 // r11 = tmp2 8634 // z0 = ztmp1 8635 // z1 = ztmp2 8636 // p0 = pgtmp1 8637 // p1 = pgtmp2 8638 address generate_compare_long_string_sve(string_compare_mode mode) { 8639 StubGenStubId stub_id; 8640 switch (mode) { 8641 case LL: stub_id = StubGenStubId::compare_long_string_LL_id; break; 8642 case LU: stub_id = StubGenStubId::compare_long_string_LU_id; break; 8643 case UL: stub_id = StubGenStubId::compare_long_string_UL_id; break; 8644 case UU: stub_id = StubGenStubId::compare_long_string_UU_id; break; 8645 default: ShouldNotReachHere(); 8646 } 8647 8648 __ align(CodeEntryAlignment); 8649 address entry = __ pc(); 8650 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 8651 tmp1 = r10, tmp2 = r11; 8652 8653 Label LOOP, DONE, MISMATCH; 8654 Register vec_len = tmp1; 8655 Register idx = tmp2; 8656 // The minimum of the string lengths has been stored in cnt2. 8657 Register cnt = cnt2; 8658 FloatRegister ztmp1 = z0, ztmp2 = z1; 8659 PRegister pgtmp1 = p0, pgtmp2 = p1; 8660 8661 #define LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx) \ 8662 switch (mode) { \ 8663 case LL: \ 8664 __ sve_ld1b(ztmp1, __ B, pgtmp1, Address(str1, idx)); \ 8665 __ sve_ld1b(ztmp2, __ B, pgtmp1, Address(str2, idx)); \ 8666 break; \ 8667 case LU: \ 8668 __ sve_ld1b(ztmp1, __ H, pgtmp1, Address(str1, idx)); \ 8669 __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \ 8670 break; \ 8671 case UL: \ 8672 __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \ 8673 __ sve_ld1b(ztmp2, __ H, pgtmp1, Address(str2, idx)); \ 8674 break; \ 8675 case UU: \ 8676 __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \ 8677 __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \ 8678 break; \ 8679 default: \ 8680 ShouldNotReachHere(); \ 8681 } 8682 8683 StubCodeMark mark(this, stub_id); 8684 8685 __ mov(idx, 0); 8686 __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt); 8687 8688 if (mode == LL) { 8689 __ sve_cntb(vec_len); 8690 } else { 8691 __ sve_cnth(vec_len); 8692 } 8693 8694 __ sub(rscratch1, cnt, vec_len); 8695 8696 __ bind(LOOP); 8697 8698 // main loop 8699 LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx); 8700 __ add(idx, idx, vec_len); 8701 // Compare strings. 
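    // Conceptually (illustrative sketch, not generated code) the predicated SVE
    // loop is doing, element by element over the common prefix of length cnt:
    //
    //   for (int i = 0; i < cnt; i++) {
    //     if (s1[i] != s2[i]) {
    //       return s1[i] - s2[i];   // MISMATCH path below
    //     }
    //   }
    //   // DONE path: result is left untouched and returned as-is.
    //
    // s1/s2 are placeholders for the two strings widened to the common element size.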
__ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
    __ br(__ NE, MISMATCH);
    __ cmp(idx, rscratch1);
    __ br(__ LT, LOOP);

    // post loop, last iteration
    __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);

    LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
    __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
    __ br(__ EQ, DONE);

    __ bind(MISMATCH);

    // Crop the vector to find its location.
    __ sve_brkb(pgtmp2, pgtmp1, pgtmp2, false /* isMerge */);
    // Extract the first different characters of each string.
    __ sve_lasta(rscratch1, mode == LL ? __ B : __ H, pgtmp2, ztmp1);
    __ sve_lasta(rscratch2, mode == LL ? __ B : __ H, pgtmp2, ztmp2);

    // Compute the difference of the first different characters.
    __ sub(result, rscratch1, rscratch2);

    __ bind(DONE);
    __ ret(lr);
#undef LOAD_PAIR
    return entry;
  }

  void generate_compare_long_strings() {
    if (UseSVE == 0) {
      StubRoutines::aarch64::_compare_long_string_LL
          = generate_compare_long_string_same_encoding(true);
      StubRoutines::aarch64::_compare_long_string_UU
          = generate_compare_long_string_same_encoding(false);
      StubRoutines::aarch64::_compare_long_string_LU
          = generate_compare_long_string_different_encoding(true);
      StubRoutines::aarch64::_compare_long_string_UL
          = generate_compare_long_string_different_encoding(false);
    } else {
      StubRoutines::aarch64::_compare_long_string_LL
          = generate_compare_long_string_sve(LL);
      StubRoutines::aarch64::_compare_long_string_UU
          = generate_compare_long_string_sve(UU);
      StubRoutines::aarch64::_compare_long_string_LU
          = generate_compare_long_string_sve(LU);
      StubRoutines::aarch64::_compare_long_string_UL
          = generate_compare_long_string_sve(UL);
    }
  }

  // R0 = result
  // R1 = str2
  // R2 = cnt1
  // R3 = str1
  // R4 = cnt2
  // Clobbers: rscratch1, rscratch2, v0, v1, rflags
  //
  // This generic linear code uses a few additional ideas that make it faster:
  // 1) we can safely keep at least the 1st register of the pattern (since length >= 8)
  //    in order to skip the initial load (helps on systems with a single load pipeline)
  // 2) we can use the "fast" single-character search algorithm to look for the
  //    first symbol with fewer branches (1 branch per loaded register instead of
  //    a branch per symbol); this is where constants like
  //    0x0101...01, 0x00010001...0001, 0x7f7f...7f, 0x7fff7fff...7fff come from
  // 3) after loading and analyzing the 1st register of the source string, it can be
  //    reused to search for every occurrence of the 1st character, saving a few loads
  //    compared with a simpler-but-slower implementation
  // 4) in order to avoid lots of push/pop operations, the code below heavily
  //    re-uses/re-initializes/compresses register values, which makes the code
  //    larger and a bit less readable; however, most of the extra operations are
  //    issued during loads or branches, so the penalty is minimal
  address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
    StubGenStubId stub_id;
    if (str1_isL) {
      if (str2_isL) {
        stub_id = StubGenStubId::string_indexof_linear_ll_id;
      } else {
        stub_id = StubGenStubId::string_indexof_linear_ul_id;
      }
    } else {
      if (str2_isL) {
        ShouldNotReachHere();
      } else {
        stub_id =
StubGenStubId::string_indexof_linear_uu_id; 8787 } 8788 } 8789 __ align(CodeEntryAlignment); 8790 StubCodeMark mark(this, stub_id); 8791 address entry = __ pc(); 8792 8793 int str1_chr_size = str1_isL ? 1 : 2; 8794 int str2_chr_size = str2_isL ? 1 : 2; 8795 int str1_chr_shift = str1_isL ? 0 : 1; 8796 int str2_chr_shift = str2_isL ? 0 : 1; 8797 bool isL = str1_isL && str2_isL; 8798 // parameters 8799 Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4; 8800 // temporary registers 8801 Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23; 8802 RegSet spilled_regs = RegSet::range(tmp1, tmp4); 8803 // redefinitions 8804 Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3; 8805 8806 __ push(spilled_regs, sp); 8807 Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO, 8808 L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED, 8809 L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP, 8810 L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH, 8811 L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2, 8812 L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH; 8813 // Read whole register from str1. It is safe, because length >=8 here 8814 __ ldr(ch1, Address(str1)); 8815 // Read whole register from str2. It is safe, because length >=8 here 8816 __ ldr(ch2, Address(str2)); 8817 __ sub(cnt2, cnt2, cnt1); 8818 __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF); 8819 if (str1_isL != str2_isL) { 8820 __ eor(v0, __ T16B, v0, v0); 8821 } 8822 __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001); 8823 __ mul(first, first, tmp1); 8824 // check if we have less than 1 register to check 8825 __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1); 8826 if (str1_isL != str2_isL) { 8827 __ fmovd(v1, ch1); 8828 } 8829 __ br(__ LE, L_SMALL); 8830 __ eor(ch2, first, ch2); 8831 if (str1_isL != str2_isL) { 8832 __ zip1(v1, __ T16B, v1, v0); 8833 } 8834 __ sub(tmp2, ch2, tmp1); 8835 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 8836 __ bics(tmp2, tmp2, ch2); 8837 if (str1_isL != str2_isL) { 8838 __ fmovd(ch1, v1); 8839 } 8840 __ br(__ NE, L_HAS_ZERO); 8841 __ subs(cnt2, cnt2, wordSize/str2_chr_size); 8842 __ add(result, result, wordSize/str2_chr_size); 8843 __ add(str2, str2, wordSize); 8844 __ br(__ LT, L_POST_LOOP); 8845 __ BIND(L_LOOP); 8846 __ ldr(ch2, Address(str2)); 8847 __ eor(ch2, first, ch2); 8848 __ sub(tmp2, ch2, tmp1); 8849 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 8850 __ bics(tmp2, tmp2, ch2); 8851 __ br(__ NE, L_HAS_ZERO); 8852 __ BIND(L_LOOP_PROCEED); 8853 __ subs(cnt2, cnt2, wordSize/str2_chr_size); 8854 __ add(str2, str2, wordSize); 8855 __ add(result, result, wordSize/str2_chr_size); 8856 __ br(__ GE, L_LOOP); 8857 __ BIND(L_POST_LOOP); 8858 __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check 8859 __ br(__ LE, NOMATCH); 8860 __ ldr(ch2, Address(str2)); 8861 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift); 8862 __ eor(ch2, first, ch2); 8863 __ sub(tmp2, ch2, tmp1); 8864 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 8865 __ mov(tmp4, -1); // all bits set 8866 __ b(L_SMALL_PROCEED); 8867 __ align(OptoLoopAlignment); 8868 __ BIND(L_SMALL); 8869 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift); 8870 __ eor(ch2, first, ch2); 8871 if (str1_isL != str2_isL) { 8872 __ zip1(v1, __ T16B, v1, v0); 8873 } 8874 __ sub(tmp2, ch2, tmp1); 8875 __ mov(tmp4, -1); // all bits set 8876 __ orr(ch2, ch2, str2_isL ? 
0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 8877 if (str1_isL != str2_isL) { 8878 __ fmovd(ch1, v1); // move converted 4 symbols 8879 } 8880 __ BIND(L_SMALL_PROCEED); 8881 __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits. 8882 __ bic(tmp2, tmp2, ch2); 8883 __ ands(tmp2, tmp2, tmp4); // clear useless bits and check 8884 __ rbit(tmp2, tmp2); 8885 __ br(__ EQ, NOMATCH); 8886 __ BIND(L_SMALL_HAS_ZERO_LOOP); 8887 __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some cpu's 8888 __ cmp(cnt1, u1(wordSize/str2_chr_size)); 8889 __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2); 8890 if (str2_isL) { // LL 8891 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index" 8892 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe. 8893 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info 8894 __ add(result, result, tmp4, __ LSR, LogBitsPerByte); 8895 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 8896 } else { 8897 __ mov(ch2, 0xE); // all bits in byte set except last one 8898 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 8899 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 8900 __ lslv(tmp2, tmp2, tmp4); 8901 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 8902 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 8903 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 8904 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 8905 } 8906 __ cmp(ch1, ch2); 8907 __ mov(tmp4, wordSize/str2_chr_size); 8908 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 8909 __ BIND(L_SMALL_CMP_LOOP); 8910 str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift))) 8911 : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift))); 8912 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))) 8913 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))); 8914 __ add(tmp4, tmp4, 1); 8915 __ cmp(tmp4, cnt1); 8916 __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP); 8917 __ cmp(first, ch2); 8918 __ br(__ EQ, L_SMALL_CMP_LOOP); 8919 __ BIND(L_SMALL_CMP_LOOP_NOMATCH); 8920 __ cbz(tmp2, NOMATCH); // no more matches. exit 8921 __ clz(tmp4, tmp2); 8922 __ add(result, result, 1); // advance index 8923 __ add(str2, str2, str2_chr_size); // advance pointer 8924 __ b(L_SMALL_HAS_ZERO_LOOP); 8925 __ align(OptoLoopAlignment); 8926 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP); 8927 __ cmp(first, ch2); 8928 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 8929 __ b(DONE); 8930 __ align(OptoLoopAlignment); 8931 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2); 8932 if (str2_isL) { // LL 8933 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index" 8934 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe. 8935 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info 8936 __ add(result, result, tmp4, __ LSR, LogBitsPerByte); 8937 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 8938 } else { 8939 __ mov(ch2, 0xE); // all bits in byte set except last one 8940 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 8941 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 
8942 __ lslv(tmp2, tmp2, tmp4); 8943 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 8944 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 8945 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 8946 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 8947 } 8948 __ cmp(ch1, ch2); 8949 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 8950 __ b(DONE); 8951 __ align(OptoLoopAlignment); 8952 __ BIND(L_HAS_ZERO); 8953 __ rbit(tmp2, tmp2); 8954 __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPU's 8955 // Now, perform compression of counters(cnt2 and cnt1) into one register. 8956 // It's fine because both counters are 32bit and are not changed in this 8957 // loop. Just restore it on exit. So, cnt1 can be re-used in this loop. 8958 __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2); 8959 __ sub(result, result, 1); 8960 __ BIND(L_HAS_ZERO_LOOP); 8961 __ mov(cnt1, wordSize/str2_chr_size); 8962 __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2); 8963 __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare 8964 if (str2_isL) { 8965 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index 8966 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 8967 __ lslv(tmp2, tmp2, tmp4); 8968 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 8969 __ add(tmp4, tmp4, 1); 8970 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 8971 __ lsl(tmp2, tmp2, 1); 8972 __ mov(tmp4, wordSize/str2_chr_size); 8973 } else { 8974 __ mov(ch2, 0xE); 8975 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 8976 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 8977 __ lslv(tmp2, tmp2, tmp4); 8978 __ add(tmp4, tmp4, 1); 8979 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 8980 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); 8981 __ lsl(tmp2, tmp2, 1); 8982 __ mov(tmp4, wordSize/str2_chr_size); 8983 __ sub(str2, str2, str2_chr_size); 8984 } 8985 __ cmp(ch1, ch2); 8986 __ mov(tmp4, wordSize/str2_chr_size); 8987 __ br(__ NE, L_CMP_LOOP_NOMATCH); 8988 __ BIND(L_CMP_LOOP); 8989 str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift))) 8990 : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift))); 8991 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))) 8992 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))); 8993 __ add(tmp4, tmp4, 1); 8994 __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2); 8995 __ br(__ GE, L_CMP_LOOP_LAST_CMP); 8996 __ cmp(cnt1, ch2); 8997 __ br(__ EQ, L_CMP_LOOP); 8998 __ BIND(L_CMP_LOOP_NOMATCH); 8999 // here we're not matched 9000 __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop 9001 __ clz(tmp4, tmp2); 9002 __ add(str2, str2, str2_chr_size); // advance pointer 9003 __ b(L_HAS_ZERO_LOOP); 9004 __ align(OptoLoopAlignment); 9005 __ BIND(L_CMP_LOOP_LAST_CMP); 9006 __ cmp(cnt1, ch2); 9007 __ br(__ NE, L_CMP_LOOP_NOMATCH); 9008 __ b(DONE); 9009 __ align(OptoLoopAlignment); 9010 __ BIND(L_CMP_LOOP_LAST_CMP2); 9011 if (str2_isL) { 9012 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index 9013 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 
9014 __ lslv(tmp2, tmp2, tmp4); 9015 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 9016 __ add(tmp4, tmp4, 1); 9017 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 9018 __ lsl(tmp2, tmp2, 1); 9019 } else { 9020 __ mov(ch2, 0xE); 9021 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 9022 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 9023 __ lslv(tmp2, tmp2, tmp4); 9024 __ add(tmp4, tmp4, 1); 9025 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 9026 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); 9027 __ lsl(tmp2, tmp2, 1); 9028 __ sub(str2, str2, str2_chr_size); 9029 } 9030 __ cmp(ch1, ch2); 9031 __ br(__ NE, L_CMP_LOOP_NOMATCH); 9032 __ b(DONE); 9033 __ align(OptoLoopAlignment); 9034 __ BIND(L_HAS_ZERO_LOOP_NOMATCH); 9035 // 1) Restore "result" index. Index was wordSize/str2_chr_size * N until 9036 // L_HAS_ZERO block. Byte octet was analyzed in L_HAS_ZERO_LOOP, 9037 // so, result was increased at max by wordSize/str2_chr_size - 1, so, 9038 // respective high bit wasn't changed. L_LOOP_PROCEED will increase 9039 // result by analyzed characters value, so, we can just reset lower bits 9040 // in result here. Clear 2 lower bits for UU/UL and 3 bits for LL 9041 // 2) restore cnt1 and cnt2 values from "compressed" cnt2 9042 // 3) advance str2 value to represent next str2 octet. result & 7/3 is 9043 // index of last analyzed substring inside current octet. So, str2 in at 9044 // respective start address. We need to advance it to next octet 9045 __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed 9046 __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2); 9047 __ bfm(result, zr, 0, 2 - str2_chr_shift); 9048 __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2 9049 __ movw(cnt2, cnt2); 9050 __ b(L_LOOP_PROCEED); 9051 __ align(OptoLoopAlignment); 9052 __ BIND(NOMATCH); 9053 __ mov(result, -1); 9054 __ BIND(DONE); 9055 __ pop(spilled_regs, sp); 9056 __ ret(lr); 9057 return entry; 9058 } 9059 9060 void generate_string_indexof_stubs() { 9061 StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true); 9062 StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false); 9063 StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false); 9064 } 9065 9066 void inflate_and_store_2_fp_registers(bool generatePrfm, 9067 FloatRegister src1, FloatRegister src2) { 9068 Register dst = r1; 9069 __ zip1(v1, __ T16B, src1, v0); 9070 __ zip2(v2, __ T16B, src1, v0); 9071 if (generatePrfm) { 9072 __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM); 9073 } 9074 __ zip1(v3, __ T16B, src2, v0); 9075 __ zip2(v4, __ T16B, src2, v0); 9076 __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64))); 9077 } 9078 9079 // R0 = src 9080 // R1 = dst 9081 // R2 = len 9082 // R3 = len >> 3 9083 // V0 = 0 9084 // v1 = loaded 8 bytes 9085 // Clobbers: r0, r1, r3, rscratch1, rflags, v0-v6 9086 address generate_large_byte_array_inflate() { 9087 __ align(CodeEntryAlignment); 9088 StubGenStubId stub_id = StubGenStubId::large_byte_array_inflate_id; 9089 StubCodeMark mark(this, stub_id); 9090 address entry = __ pc(); 9091 Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE; 9092 Register src = r0, dst = r1, len = r2, octetCounter = r3; 9093 const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4; 9094 9095 // do one more 8-byte read to have address 16-byte aligned in 
most cases 9096 // also use single store instruction 9097 __ ldrd(v2, __ post(src, 8)); 9098 __ sub(octetCounter, octetCounter, 2); 9099 __ zip1(v1, __ T16B, v1, v0); 9100 __ zip1(v2, __ T16B, v2, v0); 9101 __ st1(v1, v2, __ T16B, __ post(dst, 32)); 9102 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 9103 __ subs(rscratch1, octetCounter, large_loop_threshold); 9104 __ br(__ LE, LOOP_START); 9105 __ b(LOOP_PRFM_START); 9106 __ bind(LOOP_PRFM); 9107 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 9108 __ bind(LOOP_PRFM_START); 9109 __ prfm(Address(src, SoftwarePrefetchHintDistance)); 9110 __ sub(octetCounter, octetCounter, 8); 9111 __ subs(rscratch1, octetCounter, large_loop_threshold); 9112 inflate_and_store_2_fp_registers(true, v3, v4); 9113 inflate_and_store_2_fp_registers(true, v5, v6); 9114 __ br(__ GT, LOOP_PRFM); 9115 __ cmp(octetCounter, (u1)8); 9116 __ br(__ LT, DONE); 9117 __ bind(LOOP); 9118 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 9119 __ bind(LOOP_START); 9120 __ sub(octetCounter, octetCounter, 8); 9121 __ cmp(octetCounter, (u1)8); 9122 inflate_and_store_2_fp_registers(false, v3, v4); 9123 inflate_and_store_2_fp_registers(false, v5, v6); 9124 __ br(__ GE, LOOP); 9125 __ bind(DONE); 9126 __ ret(lr); 9127 return entry; 9128 } 9129 9130 /** 9131 * Arguments: 9132 * 9133 * Input: 9134 * c_rarg0 - current state address 9135 * c_rarg1 - H key address 9136 * c_rarg2 - data address 9137 * c_rarg3 - number of blocks 9138 * 9139 * Output: 9140 * Updated state at c_rarg0 9141 */ 9142 address generate_ghash_processBlocks() { 9143 // Bafflingly, GCM uses little-endian for the byte order, but 9144 // big-endian for the bit order. For example, the polynomial 1 is 9145 // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00. 9146 // 9147 // So, we must either reverse the bytes in each word and do 9148 // everything big-endian or reverse the bits in each byte and do 9149 // it little-endian. On AArch64 it's more idiomatic to reverse 9150 // the bits in each byte (we have an instruction, RBIT, to do 9151 // that) and keep the data in little-endian bit order through the 9152 // calculation, bit-reversing the inputs and outputs. 9153 9154 StubGenStubId stub_id = StubGenStubId::ghash_processBlocks_id; 9155 StubCodeMark mark(this, stub_id); 9156 __ align(wordSize * 2); 9157 address p = __ pc(); 9158 __ emit_int64(0x87); // The low-order bits of the field 9159 // polynomial (i.e. 
p = z^7+z^2+z+1) 9160 // repeated in the low and high parts of a 9161 // 128-bit vector 9162 __ emit_int64(0x87); 9163 9164 __ align(CodeEntryAlignment); 9165 address start = __ pc(); 9166 9167 Register state = c_rarg0; 9168 Register subkeyH = c_rarg1; 9169 Register data = c_rarg2; 9170 Register blocks = c_rarg3; 9171 9172 FloatRegister vzr = v30; 9173 __ eor(vzr, __ T16B, vzr, vzr); // zero register 9174 9175 __ ldrq(v24, p); // The field polynomial 9176 9177 __ ldrq(v0, Address(state)); 9178 __ ldrq(v1, Address(subkeyH)); 9179 9180 __ rev64(v0, __ T16B, v0); // Bit-reverse words in state and subkeyH 9181 __ rbit(v0, __ T16B, v0); 9182 __ rev64(v1, __ T16B, v1); 9183 __ rbit(v1, __ T16B, v1); 9184 9185 __ ext(v4, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1 9186 __ eor(v4, __ T16B, v4, v1); // xor subkeyH into subkeyL (Karatsuba: (A1+A0)) 9187 9188 { 9189 Label L_ghash_loop; 9190 __ bind(L_ghash_loop); 9191 9192 __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit 9193 // reversing each byte 9194 __ rbit(v2, __ T16B, v2); 9195 __ eor(v2, __ T16B, v0, v2); // bit-swapped data ^ bit-swapped state 9196 9197 // Multiply state in v2 by subkey in v1 9198 __ ghash_multiply(/*result_lo*/v5, /*result_hi*/v7, 9199 /*a*/v1, /*b*/v2, /*a1_xor_a0*/v4, 9200 /*temps*/v6, v3, /*reuse/clobber b*/v2); 9201 // Reduce v7:v5 by the field polynomial 9202 __ ghash_reduce(/*result*/v0, /*lo*/v5, /*hi*/v7, /*p*/v24, vzr, /*temp*/v3); 9203 9204 __ sub(blocks, blocks, 1); 9205 __ cbnz(blocks, L_ghash_loop); 9206 } 9207 9208 // The bit-reversed result is at this point in v0 9209 __ rev64(v0, __ T16B, v0); 9210 __ rbit(v0, __ T16B, v0); 9211 9212 __ st1(v0, __ T16B, state); 9213 __ ret(lr); 9214 9215 return start; 9216 } 9217 9218 address generate_ghash_processBlocks_wide() { 9219 address small = generate_ghash_processBlocks(); 9220 9221 StubGenStubId stub_id = StubGenStubId::ghash_processBlocks_wide_id; 9222 StubCodeMark mark(this, stub_id); 9223 __ align(wordSize * 2); 9224 address p = __ pc(); 9225 __ emit_int64(0x87); // The low-order bits of the field 9226 // polynomial (i.e. p = z^7+z^2+z+1) 9227 // repeated in the low and high parts of a 9228 // 128-bit vector 9229 __ emit_int64(0x87); 9230 9231 __ align(CodeEntryAlignment); 9232 address start = __ pc(); 9233 9234 Register state = c_rarg0; 9235 Register subkeyH = c_rarg1; 9236 Register data = c_rarg2; 9237 Register blocks = c_rarg3; 9238 9239 const int unroll = 4; 9240 9241 __ cmp(blocks, (unsigned char)(unroll * 2)); 9242 __ br(__ LT, small); 9243 9244 if (unroll > 1) { 9245 // Save state before entering routine 9246 __ sub(sp, sp, 4 * 16); 9247 __ st1(v12, v13, v14, v15, __ T16B, Address(sp)); 9248 __ sub(sp, sp, 4 * 16); 9249 __ st1(v8, v9, v10, v11, __ T16B, Address(sp)); 9250 } 9251 9252 __ ghash_processBlocks_wide(p, state, subkeyH, data, blocks, unroll); 9253 9254 if (unroll > 1) { 9255 // And restore state 9256 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16)); 9257 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16)); 9258 } 9259 9260 __ cmp(blocks, (unsigned char)0); 9261 __ br(__ GT, small); 9262 9263 __ ret(lr); 9264 9265 return start; 9266 } 9267 9268 void generate_base64_encode_simdround(Register src, Register dst, 9269 FloatRegister codec, u8 size) { 9270 9271 FloatRegister in0 = v4, in1 = v5, in2 = v6; 9272 FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19; 9273 FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23; 9274 9275 Assembler::SIMD_Arrangement arrangement = size == 16 ? 
__ T16B : __ T8B; 9276 9277 __ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size)); 9278 9279 __ ushr(ind0, arrangement, in0, 2); 9280 9281 __ ushr(ind1, arrangement, in1, 2); 9282 __ shl(in0, arrangement, in0, 6); 9283 __ orr(ind1, arrangement, ind1, in0); 9284 __ ushr(ind1, arrangement, ind1, 2); 9285 9286 __ ushr(ind2, arrangement, in2, 4); 9287 __ shl(in1, arrangement, in1, 4); 9288 __ orr(ind2, arrangement, in1, ind2); 9289 __ ushr(ind2, arrangement, ind2, 2); 9290 9291 __ shl(ind3, arrangement, in2, 2); 9292 __ ushr(ind3, arrangement, ind3, 2); 9293 9294 __ tbl(out0, arrangement, codec, 4, ind0); 9295 __ tbl(out1, arrangement, codec, 4, ind1); 9296 __ tbl(out2, arrangement, codec, 4, ind2); 9297 __ tbl(out3, arrangement, codec, 4, ind3); 9298 9299 __ st4(out0, out1, out2, out3, arrangement, __ post(dst, 4 * size)); 9300 } 9301 9302 /** 9303 * Arguments: 9304 * 9305 * Input: 9306 * c_rarg0 - src_start 9307 * c_rarg1 - src_offset 9308 * c_rarg2 - src_length 9309 * c_rarg3 - dest_start 9310 * c_rarg4 - dest_offset 9311 * c_rarg5 - isURL 9312 * 9313 */ 9314 address generate_base64_encodeBlock() { 9315 9316 static const char toBase64[64] = { 9317 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 9318 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 9319 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 9320 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 9321 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/' 9322 }; 9323 9324 static const char toBase64URL[64] = { 9325 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 9326 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 9327 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 9328 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 9329 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_' 9330 }; 9331 9332 __ align(CodeEntryAlignment); 9333 StubGenStubId stub_id = StubGenStubId::base64_encodeBlock_id; 9334 StubCodeMark mark(this, stub_id); 9335 address start = __ pc(); 9336 9337 Register src = c_rarg0; // source array 9338 Register soff = c_rarg1; // source start offset 9339 Register send = c_rarg2; // source end offset 9340 Register dst = c_rarg3; // dest array 9341 Register doff = c_rarg4; // position for writing to dest array 9342 Register isURL = c_rarg5; // Base64 or URL character set 9343 9344 // c_rarg6 and c_rarg7 are free to use as temps 9345 Register codec = c_rarg6; 9346 Register length = c_rarg7; 9347 9348 Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit; 9349 9350 __ add(src, src, soff); 9351 __ add(dst, dst, doff); 9352 __ sub(length, send, soff); 9353 9354 // load the codec base address 9355 __ lea(codec, ExternalAddress((address) toBase64)); 9356 __ cbz(isURL, ProcessData); 9357 __ lea(codec, ExternalAddress((address) toBase64URL)); 9358 9359 __ BIND(ProcessData); 9360 9361 // too short to formup a SIMD loop, roll back 9362 __ cmp(length, (u1)24); 9363 __ br(Assembler::LT, Process3B); 9364 9365 __ ld1(v0, v1, v2, v3, __ T16B, Address(codec)); 9366 9367 __ BIND(Process48B); 9368 __ cmp(length, (u1)48); 9369 __ br(Assembler::LT, Process24B); 9370 generate_base64_encode_simdround(src, dst, v0, 16); 9371 __ sub(length, length, 48); 9372 __ b(Process48B); 9373 9374 __ BIND(Process24B); 9375 __ cmp(length, (u1)24); 9376 __ br(Assembler::LT, SIMDExit); 9377 generate_base64_encode_simdround(src, dst, v0, 8); 9378 __ sub(length, length, 24); 9379 9380 __ BIND(SIMDExit); 9381 
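    // Illustrative sketch only (not generated code) of the scalar Process3B tail
    // below: each group of 3 source bytes is split into four 6-bit indices into
    // the codec table,
    //
    //   uint32_t bits = (b0 << 16) | (b1 << 8) | b2;
    //   out[0] = codec[(bits >> 18) & 0x3f];
    //   out[1] = codec[(bits >> 12) & 0x3f];
    //   out[2] = codec[(bits >>  6) & 0x3f];
    //   out[3] = codec[ bits        & 0x3f];
    //
    // where b0..b2 and out[] are placeholders for the bytes moved through
    // r10-r12 and the post-incremented dst stores.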
__ cbz(length, Exit); 9382 9383 __ BIND(Process3B); 9384 // 3 src bytes, 24 bits 9385 __ ldrb(r10, __ post(src, 1)); 9386 __ ldrb(r11, __ post(src, 1)); 9387 __ ldrb(r12, __ post(src, 1)); 9388 __ orrw(r11, r11, r10, Assembler::LSL, 8); 9389 __ orrw(r12, r12, r11, Assembler::LSL, 8); 9390 // codec index 9391 __ ubfmw(r15, r12, 18, 23); 9392 __ ubfmw(r14, r12, 12, 17); 9393 __ ubfmw(r13, r12, 6, 11); 9394 __ andw(r12, r12, 63); 9395 // get the code based on the codec 9396 __ ldrb(r15, Address(codec, r15, Address::uxtw(0))); 9397 __ ldrb(r14, Address(codec, r14, Address::uxtw(0))); 9398 __ ldrb(r13, Address(codec, r13, Address::uxtw(0))); 9399 __ ldrb(r12, Address(codec, r12, Address::uxtw(0))); 9400 __ strb(r15, __ post(dst, 1)); 9401 __ strb(r14, __ post(dst, 1)); 9402 __ strb(r13, __ post(dst, 1)); 9403 __ strb(r12, __ post(dst, 1)); 9404 __ sub(length, length, 3); 9405 __ cbnz(length, Process3B); 9406 9407 __ BIND(Exit); 9408 __ ret(lr); 9409 9410 return start; 9411 } 9412 9413 void generate_base64_decode_simdround(Register src, Register dst, 9414 FloatRegister codecL, FloatRegister codecH, int size, Label& Exit) { 9415 9416 FloatRegister in0 = v16, in1 = v17, in2 = v18, in3 = v19; 9417 FloatRegister out0 = v20, out1 = v21, out2 = v22; 9418 9419 FloatRegister decL0 = v23, decL1 = v24, decL2 = v25, decL3 = v26; 9420 FloatRegister decH0 = v28, decH1 = v29, decH2 = v30, decH3 = v31; 9421 9422 Label NoIllegalData, ErrorInLowerHalf, StoreLegalData; 9423 9424 Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B; 9425 9426 __ ld4(in0, in1, in2, in3, arrangement, __ post(src, 4 * size)); 9427 9428 // we need unsigned saturating subtract, to make sure all input values 9429 // in range [0, 63] will have 0U value in the higher half lookup 9430 __ uqsubv(decH0, __ T16B, in0, v27); 9431 __ uqsubv(decH1, __ T16B, in1, v27); 9432 __ uqsubv(decH2, __ T16B, in2, v27); 9433 __ uqsubv(decH3, __ T16B, in3, v27); 9434 9435 // lower half lookup 9436 __ tbl(decL0, arrangement, codecL, 4, in0); 9437 __ tbl(decL1, arrangement, codecL, 4, in1); 9438 __ tbl(decL2, arrangement, codecL, 4, in2); 9439 __ tbl(decL3, arrangement, codecL, 4, in3); 9440 9441 // higher half lookup 9442 __ tbx(decH0, arrangement, codecH, 4, decH0); 9443 __ tbx(decH1, arrangement, codecH, 4, decH1); 9444 __ tbx(decH2, arrangement, codecH, 4, decH2); 9445 __ tbx(decH3, arrangement, codecH, 4, decH3); 9446 9447 // combine lower and higher 9448 __ orr(decL0, arrangement, decL0, decH0); 9449 __ orr(decL1, arrangement, decL1, decH1); 9450 __ orr(decL2, arrangement, decL2, decH2); 9451 __ orr(decL3, arrangement, decL3, decH3); 9452 9453 // check illegal inputs, value larger than 63 (maximum of 6 bits) 9454 __ cm(Assembler::HI, decH0, arrangement, decL0, v27); 9455 __ cm(Assembler::HI, decH1, arrangement, decL1, v27); 9456 __ cm(Assembler::HI, decH2, arrangement, decL2, v27); 9457 __ cm(Assembler::HI, decH3, arrangement, decL3, v27); 9458 __ orr(in0, arrangement, decH0, decH1); 9459 __ orr(in1, arrangement, decH2, decH3); 9460 __ orr(in2, arrangement, in0, in1); 9461 __ umaxv(in3, arrangement, in2); 9462 __ umov(rscratch2, in3, __ B, 0); 9463 9464 // get the data to output 9465 __ shl(out0, arrangement, decL0, 2); 9466 __ ushr(out1, arrangement, decL1, 4); 9467 __ orr(out0, arrangement, out0, out1); 9468 __ shl(out1, arrangement, decL1, 4); 9469 __ ushr(out2, arrangement, decL2, 2); 9470 __ orr(out1, arrangement, out1, out2); 9471 __ shl(out2, arrangement, decL2, 6); 9472 __ orr(out2, arrangement, out2, decL3); 9473 9474 __ 
cbz(rscratch2, NoIllegalData); 9475 9476 // handle illegal input 9477 __ umov(r10, in2, __ D, 0); 9478 if (size == 16) { 9479 __ cbnz(r10, ErrorInLowerHalf); 9480 9481 // illegal input is in higher half, store the lower half now. 9482 __ st3(out0, out1, out2, __ T8B, __ post(dst, 24)); 9483 9484 __ umov(r10, in2, __ D, 1); 9485 __ umov(r11, out0, __ D, 1); 9486 __ umov(r12, out1, __ D, 1); 9487 __ umov(r13, out2, __ D, 1); 9488 __ b(StoreLegalData); 9489 9490 __ BIND(ErrorInLowerHalf); 9491 } 9492 __ umov(r11, out0, __ D, 0); 9493 __ umov(r12, out1, __ D, 0); 9494 __ umov(r13, out2, __ D, 0); 9495 9496 __ BIND(StoreLegalData); 9497 __ tbnz(r10, 5, Exit); // 0xff indicates illegal input 9498 __ strb(r11, __ post(dst, 1)); 9499 __ strb(r12, __ post(dst, 1)); 9500 __ strb(r13, __ post(dst, 1)); 9501 __ lsr(r10, r10, 8); 9502 __ lsr(r11, r11, 8); 9503 __ lsr(r12, r12, 8); 9504 __ lsr(r13, r13, 8); 9505 __ b(StoreLegalData); 9506 9507 __ BIND(NoIllegalData); 9508 __ st3(out0, out1, out2, arrangement, __ post(dst, 3 * size)); 9509 } 9510 9511 9512 /** 9513 * Arguments: 9514 * 9515 * Input: 9516 * c_rarg0 - src_start 9517 * c_rarg1 - src_offset 9518 * c_rarg2 - src_length 9519 * c_rarg3 - dest_start 9520 * c_rarg4 - dest_offset 9521 * c_rarg5 - isURL 9522 * c_rarg6 - isMIME 9523 * 9524 */ 9525 address generate_base64_decodeBlock() { 9526 9527 // The SIMD part of this Base64 decode intrinsic is based on the algorithm outlined 9528 // on http://0x80.pl/articles/base64-simd-neon.html#encoding-quadwords, in section 9529 // titled "Base64 decoding". 9530 9531 // Non-SIMD lookup tables are mostly dumped from fromBase64 array used in java.util.Base64, 9532 // except the trailing character '=' is also treated illegal value in this intrinsic. That 9533 // is java.util.Base64.fromBase64['='] = -2, while fromBase(URL)64ForNoSIMD['='] = 255 here. 
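    // A few spot checks of the mapping below, for reference (standard alphabet):
    //
    //   fromBase64ForNoSIMD['A'] == 0     fromBase64ForNoSIMD['a'] == 26
    //   fromBase64ForNoSIMD['0'] == 52    fromBase64ForNoSIMD['+'] == 62
    //   fromBase64ForNoSIMD['/'] == 63    fromBase64ForNoSIMD['='] == 255u (illegal here)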
9534 static const uint8_t fromBase64ForNoSIMD[256] = { 9535 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9536 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9537 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 255u, 63u, 9538 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 9539 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u, 9540 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 255u, 9541 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u, 9542 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u, 9543 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9544 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9545 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9546 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9547 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9548 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9549 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9550 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9551 }; 9552 9553 static const uint8_t fromBase64URLForNoSIMD[256] = { 9554 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9555 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9556 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 9557 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 9558 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u, 9559 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 63u, 9560 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u, 9561 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u, 9562 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9563 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9564 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9565 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9566 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9567 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9568 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9569 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9570 }; 9571 9572 // A legal value of base64 code is in range [0, 127]. We need two lookups 9573 // with tbl/tbx and combine them to get the decode data. The 1st table vector 9574 // lookup use tbl, out of range indices are set to 0 in destination. 
The 2nd 9575 // table vector lookup use tbx, out of range indices are unchanged in 9576 // destination. Input [64..126] is mapped to index [65, 127] in second lookup. 9577 // The value of index 64 is set to 0, so that we know that we already get the 9578 // decoded data with the 1st lookup. 9579 static const uint8_t fromBase64ForSIMD[128] = { 9580 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9581 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9582 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 255u, 63u, 9583 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 9584 0u, 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 9585 14u, 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 9586 255u, 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 9587 40u, 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 9588 }; 9589 9590 static const uint8_t fromBase64URLForSIMD[128] = { 9591 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9592 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9593 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 9594 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 9595 0u, 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 9596 14u, 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 9597 63u, 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 9598 40u, 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 9599 }; 9600 9601 __ align(CodeEntryAlignment); 9602 StubGenStubId stub_id = StubGenStubId::base64_decodeBlock_id; 9603 StubCodeMark mark(this, stub_id); 9604 address start = __ pc(); 9605 9606 Register src = c_rarg0; // source array 9607 Register soff = c_rarg1; // source start offset 9608 Register send = c_rarg2; // source end offset 9609 Register dst = c_rarg3; // dest array 9610 Register doff = c_rarg4; // position for writing to dest array 9611 Register isURL = c_rarg5; // Base64 or URL character set 9612 Register isMIME = c_rarg6; // Decoding MIME block - unused in this implementation 9613 9614 Register length = send; // reuse send as length of source data to process 9615 9616 Register simd_codec = c_rarg6; 9617 Register nosimd_codec = c_rarg7; 9618 9619 Label ProcessData, Process64B, Process32B, Process4B, SIMDEnter, SIMDExit, Exit; 9620 9621 __ enter(); 9622 9623 __ add(src, src, soff); 9624 __ add(dst, dst, doff); 9625 9626 __ mov(doff, dst); 9627 9628 __ sub(length, send, soff); 9629 __ bfm(length, zr, 0, 1); 9630 9631 __ lea(nosimd_codec, ExternalAddress((address) fromBase64ForNoSIMD)); 9632 __ cbz(isURL, ProcessData); 9633 __ lea(nosimd_codec, ExternalAddress((address) fromBase64URLForNoSIMD)); 9634 9635 __ BIND(ProcessData); 9636 __ mov(rscratch1, length); 9637 __ cmp(length, (u1)144); // 144 = 80 + 64 9638 __ br(Assembler::LT, Process4B); 9639 9640 // In the MIME case, the line length cannot be more than 76 9641 // bytes (see RFC 2045). This is too short a block for SIMD 9642 // to be worthwhile, so we use non-SIMD here. 
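// Note: 79 makes the Process4B loop below consume exactly 80 bytes before
// the SIMD path takes over: the counter steps 79, 75, ..., 3 (20 iterations
// of 4 bytes each) and is left at -1, which the cbzw test after the loop
// uses to tell this pre-processing exit apart from the all-scalar exit,
// where the counter (a multiple of 4, because the low two bits of length
// were cleared above) ends at exactly 0.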
9643 __ movw(rscratch1, 79); 9644 9645 __ BIND(Process4B); 9646 __ ldrw(r14, __ post(src, 4)); 9647 __ ubfxw(r10, r14, 0, 8); 9648 __ ubfxw(r11, r14, 8, 8); 9649 __ ubfxw(r12, r14, 16, 8); 9650 __ ubfxw(r13, r14, 24, 8); 9651 // get the de-code 9652 __ ldrb(r10, Address(nosimd_codec, r10, Address::uxtw(0))); 9653 __ ldrb(r11, Address(nosimd_codec, r11, Address::uxtw(0))); 9654 __ ldrb(r12, Address(nosimd_codec, r12, Address::uxtw(0))); 9655 __ ldrb(r13, Address(nosimd_codec, r13, Address::uxtw(0))); 9656 // error detection, 255u indicates an illegal input 9657 __ orrw(r14, r10, r11); 9658 __ orrw(r15, r12, r13); 9659 __ orrw(r14, r14, r15); 9660 __ tbnz(r14, 7, Exit); 9661 // recover the data 9662 __ lslw(r14, r10, 10); 9663 __ bfiw(r14, r11, 4, 6); 9664 __ bfmw(r14, r12, 2, 5); 9665 __ rev16w(r14, r14); 9666 __ bfiw(r13, r12, 6, 2); 9667 __ strh(r14, __ post(dst, 2)); 9668 __ strb(r13, __ post(dst, 1)); 9669 // non-simd loop 9670 __ subsw(rscratch1, rscratch1, 4); 9671 __ br(Assembler::GT, Process4B); 9672 9673 // if exiting from PreProcess80B, rscratch1 == -1; 9674 // otherwise, rscratch1 == 0. 9675 __ cbzw(rscratch1, Exit); 9676 __ sub(length, length, 80); 9677 9678 __ lea(simd_codec, ExternalAddress((address) fromBase64ForSIMD)); 9679 __ cbz(isURL, SIMDEnter); 9680 __ lea(simd_codec, ExternalAddress((address) fromBase64URLForSIMD)); 9681 9682 __ BIND(SIMDEnter); 9683 __ ld1(v0, v1, v2, v3, __ T16B, __ post(simd_codec, 64)); 9684 __ ld1(v4, v5, v6, v7, __ T16B, Address(simd_codec)); 9685 __ mov(rscratch1, 63); 9686 __ dup(v27, __ T16B, rscratch1); 9687 9688 __ BIND(Process64B); 9689 __ cmp(length, (u1)64); 9690 __ br(Assembler::LT, Process32B); 9691 generate_base64_decode_simdround(src, dst, v0, v4, 16, Exit); 9692 __ sub(length, length, 64); 9693 __ b(Process64B); 9694 9695 __ BIND(Process32B); 9696 __ cmp(length, (u1)32); 9697 __ br(Assembler::LT, SIMDExit); 9698 generate_base64_decode_simdround(src, dst, v0, v4, 8, Exit); 9699 __ sub(length, length, 32); 9700 __ b(Process32B); 9701 9702 __ BIND(SIMDExit); 9703 __ cbz(length, Exit); 9704 __ movw(rscratch1, length); 9705 __ b(Process4B); 9706 9707 __ BIND(Exit); 9708 __ sub(c_rarg0, dst, doff); 9709 9710 __ leave(); 9711 __ ret(lr); 9712 9713 return start; 9714 } 9715 9716 // Support for spin waits. 9717 address generate_spin_wait() { 9718 __ align(CodeEntryAlignment); 9719 StubGenStubId stub_id = StubGenStubId::spin_wait_id; 9720 StubCodeMark mark(this, stub_id); 9721 address start = __ pc(); 9722 9723 __ spin_wait(); 9724 __ ret(lr); 9725 9726 return start; 9727 } 9728 9729 void generate_lookup_secondary_supers_table_stub() { 9730 StubGenStubId stub_id = StubGenStubId::lookup_secondary_supers_table_id; 9731 StubCodeMark mark(this, stub_id); 9732 9733 const Register 9734 r_super_klass = r0, 9735 r_array_base = r1, 9736 r_array_length = r2, 9737 r_array_index = r3, 9738 r_sub_klass = r4, 9739 r_bitmap = rscratch2, 9740 result = r5; 9741 const FloatRegister 9742 vtemp = v0; 9743 9744 for (int slot = 0; slot < Klass::SECONDARY_SUPERS_TABLE_SIZE; slot++) { 9745 StubRoutines::_lookup_secondary_supers_table_stubs[slot] = __ pc(); 9746 Label L_success; 9747 __ enter(); 9748 __ lookup_secondary_supers_table_const(r_sub_klass, r_super_klass, 9749 r_array_base, r_array_length, r_array_index, 9750 vtemp, result, slot, 9751 /*stub_is_near*/true); 9752 __ leave(); 9753 __ ret(lr); 9754 } 9755 } 9756 9757 // Slow path implementation for UseSecondarySupersTable. 
9758 address generate_lookup_secondary_supers_table_slow_path_stub() { 9759 StubGenStubId stub_id = StubGenStubId::lookup_secondary_supers_table_slow_path_id; 9760 StubCodeMark mark(this, stub_id); 9761 9762 address start = __ pc(); 9763 const Register 9764 r_super_klass = r0, // argument 9765 r_array_base = r1, // argument 9766 temp1 = r2, // temp 9767 r_array_index = r3, // argument 9768 r_bitmap = rscratch2, // argument 9769 result = r5; // argument 9770 9771 __ lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base, r_array_index, r_bitmap, temp1, result); 9772 __ ret(lr); 9773 9774 return start; 9775 } 9776 9777 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS) 9778 9779 // ARMv8.1 LSE versions of the atomic stubs used by Atomic::PlatformXX. 9780 // 9781 // If LSE is in use, generate LSE versions of all the stubs. The 9782 // non-LSE versions are in atomic_aarch64.S. 9783 9784 // class AtomicStubMark records the entry point of a stub and the 9785 // stub pointer which will point to it. The stub pointer is set to 9786 // the entry point when ~AtomicStubMark() is called, which must be 9787 // after ICache::invalidate_range. This ensures safe publication of 9788 // the generated code. 9789 class AtomicStubMark { 9790 address _entry_point; 9791 aarch64_atomic_stub_t *_stub; 9792 MacroAssembler *_masm; 9793 public: 9794 AtomicStubMark(MacroAssembler *masm, aarch64_atomic_stub_t *stub) { 9795 _masm = masm; 9796 __ align(32); 9797 _entry_point = __ pc(); 9798 _stub = stub; 9799 } 9800 ~AtomicStubMark() { 9801 *_stub = (aarch64_atomic_stub_t)_entry_point; 9802 } 9803 }; 9804 9805 // NB: For memory_order_conservative we need a trailing membar after 9806 // LSE atomic operations but not a leading membar. 9807 // 9808 // We don't need a leading membar because a clause in the Arm ARM 9809 // says: 9810 // 9811 // Barrier-ordered-before 9812 // 9813 // Barrier instructions order prior Memory effects before subsequent 9814 // Memory effects generated by the same Observer. A read or a write 9815 // RW1 is Barrier-ordered-before a read or a write RW 2 from the same 9816 // Observer if and only if RW1 appears in program order before RW 2 9817 // and [ ... ] at least one of RW 1 and RW 2 is generated by an atomic 9818 // instruction with both Acquire and Release semantics. 9819 // 9820 // All the atomic instructions {ldaddal, swapal, casal} have Acquire 9821 // and Release semantics, therefore we don't need a leading 9822 // barrier. However, there is no corresponding Barrier-ordered-after 9823 // relationship, therefore we need a trailing membar to prevent a 9824 // later store or load from being reordered with the store in an 9825 // atomic instruction. 9826 // 9827 // This was checked by using the herd7 consistency model simulator 9828 // (http://diy.inria.fr/) with this test case: 9829 // 9830 // AArch64 LseCas 9831 // { 0:X1=x; 0:X2=y; 1:X1=x; 1:X2=y; } 9832 // P0 | P1; 9833 // LDR W4, [X2] | MOV W3, #0; 9834 // DMB LD | MOV W4, #1; 9835 // LDR W3, [X1] | CASAL W3, W4, [X1]; 9836 // | DMB ISH; 9837 // | STR W4, [X2]; 9838 // exists 9839 // (0:X3=0 /\ 0:X4=1) 9840 // 9841 // If X3 == 0 && X4 == 1, the store to y in P1 has been reordered 9842 // with the store to x in P1. Without the DMB in P1 this may happen. 9843 // 9844 // At the time of writing we don't know of any AArch64 hardware that 9845 // reorders stores in this way, but the Reference Manual permits it. 
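// As a rough sketch (not a verbatim disassembly), the memory_order_conservative
// 64-bit fetch-and-add stub generated below amounts to:
//
//   ldaddal x1, x2, [x0]   // LSE add with Acquire+Release; x2 <- old value
//   dmb     ish            // trailing barrier (StoreStore|StoreLoad)
//   mov     x0, x2         // return the previous value
//   ret
//
// The Acquire+Release atomic supplies the leading ordering described above,
// so only the trailing barrier has to be emitted explicitly.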
9846 9847 void gen_cas_entry(Assembler::operand_size size, 9848 atomic_memory_order order) { 9849 Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1, 9850 exchange_val = c_rarg2; 9851 bool acquire, release; 9852 switch (order) { 9853 case memory_order_relaxed: 9854 acquire = false; 9855 release = false; 9856 break; 9857 case memory_order_release: 9858 acquire = false; 9859 release = true; 9860 break; 9861 default: 9862 acquire = true; 9863 release = true; 9864 break; 9865 } 9866 __ mov(prev, compare_val); 9867 __ lse_cas(prev, exchange_val, ptr, size, acquire, release, /*not_pair*/true); 9868 if (order == memory_order_conservative) { 9869 __ membar(Assembler::StoreStore|Assembler::StoreLoad); 9870 } 9871 if (size == Assembler::xword) { 9872 __ mov(r0, prev); 9873 } else { 9874 __ movw(r0, prev); 9875 } 9876 __ ret(lr); 9877 } 9878 9879 void gen_ldadd_entry(Assembler::operand_size size, atomic_memory_order order) { 9880 Register prev = r2, addr = c_rarg0, incr = c_rarg1; 9881 // If not relaxed, then default to conservative. Relaxed is the only 9882 // case we use enough to be worth specializing. 9883 if (order == memory_order_relaxed) { 9884 __ ldadd(size, incr, prev, addr); 9885 } else { 9886 __ ldaddal(size, incr, prev, addr); 9887 __ membar(Assembler::StoreStore|Assembler::StoreLoad); 9888 } 9889 if (size == Assembler::xword) { 9890 __ mov(r0, prev); 9891 } else { 9892 __ movw(r0, prev); 9893 } 9894 __ ret(lr); 9895 } 9896 9897 void gen_swpal_entry(Assembler::operand_size size) { 9898 Register prev = r2, addr = c_rarg0, incr = c_rarg1; 9899 __ swpal(size, incr, prev, addr); 9900 __ membar(Assembler::StoreStore|Assembler::StoreLoad); 9901 if (size == Assembler::xword) { 9902 __ mov(r0, prev); 9903 } else { 9904 __ movw(r0, prev); 9905 } 9906 __ ret(lr); 9907 } 9908 9909 void generate_atomic_entry_points() { 9910 if (! 
UseLSE) { 9911 return; 9912 } 9913 __ align(CodeEntryAlignment); 9914 StubGenStubId stub_id = StubGenStubId::atomic_entry_points_id; 9915 StubCodeMark mark(this, stub_id); 9916 address first_entry = __ pc(); 9917 9918 // ADD, memory_order_conservative 9919 AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl); 9920 gen_ldadd_entry(Assembler::word, memory_order_conservative); 9921 AtomicStubMark mark_fetch_add_8(_masm, &aarch64_atomic_fetch_add_8_impl); 9922 gen_ldadd_entry(Assembler::xword, memory_order_conservative); 9923 9924 // ADD, memory_order_relaxed 9925 AtomicStubMark mark_fetch_add_4_relaxed 9926 (_masm, &aarch64_atomic_fetch_add_4_relaxed_impl); 9927 gen_ldadd_entry(MacroAssembler::word, memory_order_relaxed); 9928 AtomicStubMark mark_fetch_add_8_relaxed 9929 (_masm, &aarch64_atomic_fetch_add_8_relaxed_impl); 9930 gen_ldadd_entry(MacroAssembler::xword, memory_order_relaxed); 9931 9932 // XCHG, memory_order_conservative 9933 AtomicStubMark mark_xchg_4(_masm, &aarch64_atomic_xchg_4_impl); 9934 gen_swpal_entry(Assembler::word); 9935 AtomicStubMark mark_xchg_8_impl(_masm, &aarch64_atomic_xchg_8_impl); 9936 gen_swpal_entry(Assembler::xword); 9937 9938 // CAS, memory_order_conservative 9939 AtomicStubMark mark_cmpxchg_1(_masm, &aarch64_atomic_cmpxchg_1_impl); 9940 gen_cas_entry(MacroAssembler::byte, memory_order_conservative); 9941 AtomicStubMark mark_cmpxchg_4(_masm, &aarch64_atomic_cmpxchg_4_impl); 9942 gen_cas_entry(MacroAssembler::word, memory_order_conservative); 9943 AtomicStubMark mark_cmpxchg_8(_masm, &aarch64_atomic_cmpxchg_8_impl); 9944 gen_cas_entry(MacroAssembler::xword, memory_order_conservative); 9945 9946 // CAS, memory_order_relaxed 9947 AtomicStubMark mark_cmpxchg_1_relaxed 9948 (_masm, &aarch64_atomic_cmpxchg_1_relaxed_impl); 9949 gen_cas_entry(MacroAssembler::byte, memory_order_relaxed); 9950 AtomicStubMark mark_cmpxchg_4_relaxed 9951 (_masm, &aarch64_atomic_cmpxchg_4_relaxed_impl); 9952 gen_cas_entry(MacroAssembler::word, memory_order_relaxed); 9953 AtomicStubMark mark_cmpxchg_8_relaxed 9954 (_masm, &aarch64_atomic_cmpxchg_8_relaxed_impl); 9955 gen_cas_entry(MacroAssembler::xword, memory_order_relaxed); 9956 9957 AtomicStubMark mark_cmpxchg_4_release 9958 (_masm, &aarch64_atomic_cmpxchg_4_release_impl); 9959 gen_cas_entry(MacroAssembler::word, memory_order_release); 9960 AtomicStubMark mark_cmpxchg_8_release 9961 (_masm, &aarch64_atomic_cmpxchg_8_release_impl); 9962 gen_cas_entry(MacroAssembler::xword, memory_order_release); 9963 9964 AtomicStubMark mark_cmpxchg_4_seq_cst 9965 (_masm, &aarch64_atomic_cmpxchg_4_seq_cst_impl); 9966 gen_cas_entry(MacroAssembler::word, memory_order_seq_cst); 9967 AtomicStubMark mark_cmpxchg_8_seq_cst 9968 (_masm, &aarch64_atomic_cmpxchg_8_seq_cst_impl); 9969 gen_cas_entry(MacroAssembler::xword, memory_order_seq_cst); 9970 9971 ICache::invalidate_range(first_entry, __ pc() - first_entry); 9972 } 9973 #endif // LINUX 9974 9975 address generate_cont_thaw(Continuation::thaw_kind kind) { 9976 bool return_barrier = Continuation::is_thaw_return_barrier(kind); 9977 bool return_barrier_exception = Continuation::is_thaw_return_barrier_exception(kind); 9978 9979 address start = __ pc(); 9980 9981 if (return_barrier) { 9982 __ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())); 9983 __ mov(sp, rscratch1); 9984 } 9985 assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp"); 9986 9987 if (return_barrier) { 9988 // preserve 
possible return value from a method returning to the return barrier 9989 __ fmovd(rscratch1, v0); 9990 __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize))); 9991 } 9992 9993 __ movw(c_rarg1, (return_barrier ? 1 : 0)); 9994 __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), rthread, c_rarg1); 9995 __ mov(rscratch2, r0); // r0 contains the size of the frames to thaw, 0 if overflow or no more frames 9996 9997 if (return_barrier) { 9998 // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK) 9999 __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize))); 10000 __ fmovd(v0, rscratch1); 10001 } 10002 assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp"); 10003 10004 10005 Label thaw_success; 10006 // rscratch2 contains the size of the frames to thaw, 0 if overflow or no more frames 10007 __ cbnz(rscratch2, thaw_success); 10008 __ lea(rscratch1, RuntimeAddress(SharedRuntime::throw_StackOverflowError_entry())); 10009 __ br(rscratch1); 10010 __ bind(thaw_success); 10011 10012 // make room for the thawed frames 10013 __ sub(rscratch1, sp, rscratch2); 10014 __ andr(rscratch1, rscratch1, -16); // align 10015 __ mov(sp, rscratch1); 10016 10017 if (return_barrier) { 10018 // save original return value -- again 10019 __ fmovd(rscratch1, v0); 10020 __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize))); 10021 } 10022 10023 // If we want, we can templatize thaw by kind, and have three different entries 10024 __ movw(c_rarg1, (uint32_t)kind); 10025 10026 __ call_VM_leaf(Continuation::thaw_entry(), rthread, c_rarg1); 10027 __ mov(rscratch2, r0); // r0 is the sp of the yielding frame 10028 10029 if (return_barrier) { 10030 // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK) 10031 __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize))); 10032 __ fmovd(v0, rscratch1); 10033 } else { 10034 __ mov(r0, zr); // return 0 (success) from doYield 10035 } 10036 10037 // we're now on the yield frame (which is at a higher address than us because sp has been pushed down) 10038 __ sub(sp, rscratch2, 2*wordSize); // now pointing to rfp spill 10039 __ mov(rfp, sp); 10040 10041 if (return_barrier_exception) { 10042 __ ldr(c_rarg1, Address(rfp, wordSize)); // return address 10043 __ authenticate_return_address(c_rarg1); 10044 __ verify_oop(r0); 10045 // save return value containing the exception oop in callee-saved R19 10046 __ mov(r19, r0); 10047 10048 __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), rthread, c_rarg1); 10049 10050 // Reinitialize the ptrue predicate register, in case the external runtime call clobbers ptrue reg, as we may return to SVE compiled code.
10051 // __ reinitialize_ptrue(); 10052 10053 // see OptoRuntime::generate_exception_blob: r0 -- exception oop, r3 -- exception pc 10054 10055 __ mov(r1, r0); // the exception handler 10056 __ mov(r0, r19); // restore return value containing the exception oop 10057 __ verify_oop(r0); 10058 10059 __ leave(); 10060 __ mov(r3, lr); 10061 __ br(r1); // the exception handler 10062 } else { 10063 // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame 10064 __ leave(); 10065 __ ret(lr); 10066 } 10067 10068 return start; 10069 } 10070 10071 address generate_cont_thaw() { 10072 if (!Continuations::enabled()) return nullptr; 10073 10074 StubGenStubId stub_id = StubGenStubId::cont_thaw_id; 10075 StubCodeMark mark(this, stub_id); 10076 address start = __ pc(); 10077 generate_cont_thaw(Continuation::thaw_top); 10078 return start; 10079 } 10080 10081 address generate_cont_returnBarrier() { 10082 if (!Continuations::enabled()) return nullptr; 10083 10084 // TODO: will probably need multiple return barriers depending on return type 10085 StubGenStubId stub_id = StubGenStubId::cont_returnBarrier_id; 10086 StubCodeMark mark(this, stub_id); 10087 address start = __ pc(); 10088 10089 generate_cont_thaw(Continuation::thaw_return_barrier); 10090 10091 return start; 10092 } 10093 10094 address generate_cont_returnBarrier_exception() { 10095 if (!Continuations::enabled()) return nullptr; 10096 10097 StubGenStubId stub_id = StubGenStubId::cont_returnBarrierExc_id; 10098 StubCodeMark mark(this, stub_id); 10099 address start = __ pc(); 10100 10101 generate_cont_thaw(Continuation::thaw_return_barrier_exception); 10102 10103 return start; 10104 } 10105 10106 address generate_cont_preempt_stub() { 10107 if (!Continuations::enabled()) return nullptr; 10108 StubGenStubId stub_id = StubGenStubId::cont_preempt_id; 10109 StubCodeMark mark(this, stub_id); 10110 address start = __ pc(); 10111 10112 __ reset_last_Java_frame(true); 10113 10114 // Set sp to enterSpecial frame, i.e. remove all frames copied into the heap. 10115 __ ldr(rscratch2, Address(rthread, JavaThread::cont_entry_offset())); 10116 __ mov(sp, rscratch2); 10117 10118 Label preemption_cancelled; 10119 __ ldrb(rscratch1, Address(rthread, JavaThread::preemption_cancelled_offset())); 10120 __ cbnz(rscratch1, preemption_cancelled); 10121 10122 // Remove enterSpecial frame from the stack and return to Continuation.run() to unmount. 10123 SharedRuntime::continuation_enter_cleanup(_masm); 10124 __ leave(); 10125 __ ret(lr); 10126 10127 // We acquired the monitor after freezing the frames so call thaw to continue execution. 10128 __ bind(preemption_cancelled); 10129 __ strb(zr, Address(rthread, JavaThread::preemption_cancelled_offset())); 10130 __ lea(rfp, Address(sp, checked_cast<int32_t>(ContinuationEntry::size()))); 10131 __ lea(rscratch1, ExternalAddress(ContinuationEntry::thaw_call_pc_address())); 10132 __ ldr(rscratch1, Address(rscratch1)); 10133 __ br(rscratch1); 10134 10135 return start; 10136 } 10137 10138 // In sun.security.util.math.intpoly.IntegerPolynomial1305, integers 10139 // are represented as long[5], with BITS_PER_LIMB = 26. 10140 // Pack five 26-bit limbs into three 64-bit registers. 
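// In C, approximately (a sketch of the packing done below; 'limbs' and the
// d* names are illustrative):
//
//   // limbs[0..4] each carry 26 significant bits, 130 bits in total
//   uint64_t d0 =  limbs[0]
//               | (limbs[1] << 26)
//               | (limbs[2] << 52);   // low 12 bits of limbs[2]
//   uint64_t d1 = (limbs[2] >> 12)
//               | (limbs[3] << 14)
//               | (limbs[4] << 40);   // low 24 bits of limbs[4]
//   uint64_t d2 =  limbs[4] >> 24;    // the remaining 2 bits (dest2, when present)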
10141 void pack_26(Register dest0, Register dest1, Register dest2, Register src) { 10142 __ ldp(dest0, rscratch1, Address(src, 0)); // 26 bits 10143 __ add(dest0, dest0, rscratch1, Assembler::LSL, 26); // 26 bits 10144 __ ldp(rscratch1, rscratch2, Address(src, 2 * sizeof (jlong))); 10145 __ add(dest0, dest0, rscratch1, Assembler::LSL, 52); // 12 bits 10146 10147 __ add(dest1, zr, rscratch1, Assembler::LSR, 12); // 14 bits 10148 __ add(dest1, dest1, rscratch2, Assembler::LSL, 14); // 26 bits 10149 __ ldr(rscratch1, Address(src, 4 * sizeof (jlong))); 10150 __ add(dest1, dest1, rscratch1, Assembler::LSL, 40); // 24 bits 10151 10152 if (dest2->is_valid()) { 10153 __ add(dest2, zr, rscratch1, Assembler::LSR, 24); // 2 bits 10154 } else { 10155 #ifdef ASSERT 10156 Label OK; 10157 __ cmp(zr, rscratch1, Assembler::LSR, 24); // 2 bits 10158 __ br(__ EQ, OK); 10159 __ stop("high bits of Poly1305 integer should be zero"); 10160 __ should_not_reach_here(); 10161 __ bind(OK); 10162 #endif 10163 } 10164 } 10165 10166 // As above, but return only a 128-bit integer, packed into two 10167 // 64-bit registers. 10168 void pack_26(Register dest0, Register dest1, Register src) { 10169 pack_26(dest0, dest1, noreg, src); 10170 } 10171 10172 // Multiply and multiply-accumulate unsigned 64-bit registers. 10173 void wide_mul(Register prod_lo, Register prod_hi, Register n, Register m) { 10174 __ mul(prod_lo, n, m); 10175 __ umulh(prod_hi, n, m); 10176 } 10177 void wide_madd(Register sum_lo, Register sum_hi, Register n, Register m) { 10178 wide_mul(rscratch1, rscratch2, n, m); 10179 __ adds(sum_lo, sum_lo, rscratch1); 10180 __ adc(sum_hi, sum_hi, rscratch2); 10181 } 10182 10183 // Poly1305, RFC 7539 10184 10185 // See https://loup-vaillant.fr/tutorials/poly1305-design for a 10186 // description of the tricks used to simplify and accelerate this 10187 // computation. 10188 10189 address generate_poly1305_processBlocks() { 10190 __ align(CodeEntryAlignment); 10191 StubGenStubId stub_id = StubGenStubId::poly1305_processBlocks_id; 10192 StubCodeMark mark(this, stub_id); 10193 address start = __ pc(); 10194 Label here; 10195 __ enter(); 10196 RegSet callee_saved = RegSet::range(r19, r28); 10197 __ push(callee_saved, sp); 10198 10199 RegSetIterator<Register> regs = (RegSet::range(c_rarg0, r28) - r18_tls - rscratch1 - rscratch2).begin(); 10200 10201 // Arguments 10202 const Register input_start = *regs, length = *++regs, acc_start = *++regs, r_start = *++regs; 10203 10204 // R_n is the 128-bit randomly-generated key, packed into two 10205 // registers. The caller passes this key to us as long[5], with 10206 // BITS_PER_LIMB = 26. 
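// Per 16-byte block, the loop below effectively computes
//
//   U = ((U + block + 2^128) * R) mod (2^130 - 5)
//
// keeping U as a 130-bit value in U_2:U_1:U_0 and only partially reducing it
// inside the loop; the final reduction and the conversion back to 26-bit
// limbs happen after the loop. The key is clamped by the caller as specified
// in RFC 7539, which is what guarantees that the top four bits of R_0 and
// R_1 are zero (see the NB comment below).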
10207 const Register R_0 = *++regs, R_1 = *++regs; 10208 pack_26(R_0, R_1, r_start); 10209 10210 // RR_n is (R_n >> 2) * 5 10211 const Register RR_0 = *++regs, RR_1 = *++regs; 10212 __ lsr(RR_0, R_0, 2); 10213 __ add(RR_0, RR_0, RR_0, Assembler::LSL, 2); 10214 __ lsr(RR_1, R_1, 2); 10215 __ add(RR_1, RR_1, RR_1, Assembler::LSL, 2); 10216 10217 // U_n is the current checksum 10218 const Register U_0 = *++regs, U_1 = *++regs, U_2 = *++regs; 10219 pack_26(U_0, U_1, U_2, acc_start); 10220 10221 static constexpr int BLOCK_LENGTH = 16; 10222 Label DONE, LOOP; 10223 10224 __ cmp(length, checked_cast<u1>(BLOCK_LENGTH)); 10225 __ br(Assembler::LT, DONE); { 10226 __ bind(LOOP); 10227 10228 // S_n is to be the sum of U_n and the next block of data 10229 const Register S_0 = *++regs, S_1 = *++regs, S_2 = *++regs; 10230 __ ldp(S_0, S_1, __ post(input_start, 2 * wordSize)); 10231 __ adds(S_0, U_0, S_0); 10232 __ adcs(S_1, U_1, S_1); 10233 __ adc(S_2, U_2, zr); 10234 __ add(S_2, S_2, 1); 10235 10236 const Register U_0HI = *++regs, U_1HI = *++regs; 10237 10238 // NB: this logic depends on some of the special properties of 10239 // Poly1305 keys. In particular, because we know that the top 10240 // four bits of R_0 and R_1 are zero, we can add together 10241 // partial products without any risk of needing to propagate a 10242 // carry out. 10243 wide_mul(U_0, U_0HI, S_0, R_0); wide_madd(U_0, U_0HI, S_1, RR_1); wide_madd(U_0, U_0HI, S_2, RR_0); 10244 wide_mul(U_1, U_1HI, S_0, R_1); wide_madd(U_1, U_1HI, S_1, R_0); wide_madd(U_1, U_1HI, S_2, RR_1); 10245 __ andr(U_2, R_0, 3); 10246 __ mul(U_2, S_2, U_2); 10247 10248 // Recycle registers S_0, S_1, S_2 10249 regs = (regs.remaining() + S_0 + S_1 + S_2).begin(); 10250 10251 // Partial reduction mod 2**130 - 5 10252 __ adds(U_1, U_0HI, U_1); 10253 __ adc(U_2, U_1HI, U_2); 10254 // Sum now in U_2:U_1:U_0. 10255 // Dead: U_0HI, U_1HI. 10256 regs = (regs.remaining() + U_0HI + U_1HI).begin(); 10257 10258 // U_2:U_1:U_0 += (U_2 >> 2) * 5 in two steps 10259 10260 // First, U_2:U_1:U_0 += (U_2 >> 2) 10261 __ lsr(rscratch1, U_2, 2); 10262 __ andr(U_2, U_2, (u8)3); 10263 __ adds(U_0, U_0, rscratch1); 10264 __ adcs(U_1, U_1, zr); 10265 __ adc(U_2, U_2, zr); 10266 // Second, U_2:U_1:U_0 += (U_2 >> 2) << 2 10267 __ adds(U_0, U_0, rscratch1, Assembler::LSL, 2); 10268 __ adcs(U_1, U_1, zr); 10269 __ adc(U_2, U_2, zr); 10270 10271 __ sub(length, length, checked_cast<u1>(BLOCK_LENGTH)); 10272 __ cmp(length, checked_cast<u1>(BLOCK_LENGTH)); 10273 __ br(~ Assembler::LT, LOOP); 10274 } 10275 10276 // Further reduce modulo 2^130 - 5 10277 __ lsr(rscratch1, U_2, 2); 10278 __ add(rscratch1, rscratch1, rscratch1, Assembler::LSL, 2); // rscratch1 = U_2 * 5 10279 __ adds(U_0, U_0, rscratch1); // U_0 += U_2 * 5 10280 __ adcs(U_1, U_1, zr); 10281 __ andr(U_2, U_2, (u1)3); 10282 __ adc(U_2, U_2, zr); 10283 10284 // Unpack the sum into five 26-bit limbs and write to memory. 
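// In C, approximately (the inverse of pack_26 above; acc[] stands for the
// long[5] accumulator):
//
//   acc[0] =  U_0        & ((1 << 26) - 1);
//   acc[1] = (U_0 >> 26) & ((1 << 26) - 1);
//   acc[2] = (U_0 >> 52) | ((U_1 & ((1 << 14) - 1)) << 12);
//   acc[3] = (U_1 >> 14) & ((1 << 26) - 1);
//   acc[4] = (U_1 >> 40) | ((U_2 & 7) << 24);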
10285 __ ubfiz(rscratch1, U_0, 0, 26); 10286 __ ubfx(rscratch2, U_0, 26, 26); 10287 __ stp(rscratch1, rscratch2, Address(acc_start)); 10288 __ ubfx(rscratch1, U_0, 52, 12); 10289 __ bfi(rscratch1, U_1, 12, 14); 10290 __ ubfx(rscratch2, U_1, 14, 26); 10291 __ stp(rscratch1, rscratch2, Address(acc_start, 2 * sizeof (jlong))); 10292 __ ubfx(rscratch1, U_1, 40, 24); 10293 __ bfi(rscratch1, U_2, 24, 3); 10294 __ str(rscratch1, Address(acc_start, 4 * sizeof (jlong))); 10295 10296 __ bind(DONE); 10297 __ pop(callee_saved, sp); 10298 __ leave(); 10299 __ ret(lr); 10300 10301 return start; 10302 } 10303 10304 // exception handler for upcall stubs 10305 address generate_upcall_stub_exception_handler() { 10306 StubGenStubId stub_id = StubGenStubId::upcall_stub_exception_handler_id; 10307 StubCodeMark mark(this, stub_id); 10308 address start = __ pc(); 10309 10310 // Native caller has no idea how to handle exceptions, 10311 // so we just crash here. Up to callee to catch exceptions. 10312 __ verify_oop(r0); 10313 __ movptr(rscratch1, CAST_FROM_FN_PTR(uint64_t, UpcallLinker::handle_uncaught_exception)); 10314 __ blr(rscratch1); 10315 __ should_not_reach_here(); 10316 10317 return start; 10318 } 10319 10320 // load Method* target of MethodHandle 10321 // j_rarg0 = jobject receiver 10322 // rmethod = result 10323 address generate_upcall_stub_load_target() { 10324 StubGenStubId stub_id = StubGenStubId::upcall_stub_load_target_id; 10325 StubCodeMark mark(this, stub_id); 10326 address start = __ pc(); 10327 10328 __ resolve_global_jobject(j_rarg0, rscratch1, rscratch2); 10329 // Load target method from receiver 10330 __ load_heap_oop(rmethod, Address(j_rarg0, java_lang_invoke_MethodHandle::form_offset()), rscratch1, rscratch2); 10331 __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_LambdaForm::vmentry_offset()), rscratch1, rscratch2); 10332 __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_MemberName::method_offset()), rscratch1, rscratch2); 10333 __ access_load_at(T_ADDRESS, IN_HEAP, rmethod, 10334 Address(rmethod, java_lang_invoke_ResolvedMethodName::vmtarget_offset()), 10335 noreg, noreg); 10336 __ str(rmethod, Address(rthread, JavaThread::callee_target_offset())); // just in case callee is deoptimized 10337 10338 __ ret(lr); 10339 10340 return start; 10341 } 10342 10343 #undef __ 10344 #define __ masm-> 10345 10346 class MontgomeryMultiplyGenerator : public MacroAssembler { 10347 10348 Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn, 10349 Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj; 10350 10351 RegSet _toSave; 10352 bool _squaring; 10353 10354 public: 10355 MontgomeryMultiplyGenerator (Assembler *as, bool squaring) 10356 : MacroAssembler(as->code()), _squaring(squaring) { 10357 10358 // Register allocation 10359 10360 RegSetIterator<Register> regs = (RegSet::range(r0, r26) - r18_tls).begin(); 10361 Pa_base = *regs; // Argument registers 10362 if (squaring) 10363 Pb_base = Pa_base; 10364 else 10365 Pb_base = *++regs; 10366 Pn_base = *++regs; 10367 Rlen= *++regs; 10368 inv = *++regs; 10369 Pm_base = *++regs; 10370 10371 // Working registers: 10372 Ra = *++regs; // The current digit of a, b, n, and m. 10373 Rb = *++regs; 10374 Rm = *++regs; 10375 Rn = *++regs; 10376 10377 Pa = *++regs; // Pointers to the current/next digit of a, b, n, and m. 10378 Pb = *++regs; 10379 Pm = *++regs; 10380 Pn = *++regs; 10381 10382 t0 = *++regs; // Three registers which form a 10383 t1 = *++regs; // triple-precision accumuator. 
10384 t2 = *++regs; 10385 10386 Ri = *++regs; // Inner and outer loop indexes. 10387 Rj = *++regs; 10388 10389 Rhi_ab = *++regs; // Product registers: low and high parts 10390 Rlo_ab = *++regs; // of a*b and m*n. 10391 Rhi_mn = *++regs; 10392 Rlo_mn = *++regs; 10393 10394 // r19 and up are callee-saved. 10395 _toSave = RegSet::range(r19, *regs) + Pm_base; 10396 } 10397 10398 private: 10399 void save_regs() { 10400 push(_toSave, sp); 10401 } 10402 10403 void restore_regs() { 10404 pop(_toSave, sp); 10405 } 10406 10407 template <typename T> 10408 void unroll_2(Register count, T block) { 10409 Label loop, end, odd; 10410 tbnz(count, 0, odd); 10411 cbz(count, end); 10412 align(16); 10413 bind(loop); 10414 (this->*block)(); 10415 bind(odd); 10416 (this->*block)(); 10417 subs(count, count, 2); 10418 br(Assembler::GT, loop); 10419 bind(end); 10420 } 10421 10422 template <typename T> 10423 void unroll_2(Register count, T block, Register d, Register s, Register tmp) { 10424 Label loop, end, odd; 10425 tbnz(count, 0, odd); 10426 cbz(count, end); 10427 align(16); 10428 bind(loop); 10429 (this->*block)(d, s, tmp); 10430 bind(odd); 10431 (this->*block)(d, s, tmp); 10432 subs(count, count, 2); 10433 br(Assembler::GT, loop); 10434 bind(end); 10435 } 10436 10437 void pre1(RegisterOrConstant i) { 10438 block_comment("pre1"); 10439 // Pa = Pa_base; 10440 // Pb = Pb_base + i; 10441 // Pm = Pm_base; 10442 // Pn = Pn_base + i; 10443 // Ra = *Pa; 10444 // Rb = *Pb; 10445 // Rm = *Pm; 10446 // Rn = *Pn; 10447 ldr(Ra, Address(Pa_base)); 10448 ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); 10449 ldr(Rm, Address(Pm_base)); 10450 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 10451 lea(Pa, Address(Pa_base)); 10452 lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); 10453 lea(Pm, Address(Pm_base)); 10454 lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 10455 10456 // Zero the m*n result. 10457 mov(Rhi_mn, zr); 10458 mov(Rlo_mn, zr); 10459 } 10460 10461 // The core multiply-accumulate step of a Montgomery 10462 // multiplication. The idea is to schedule operations as a 10463 // pipeline so that instructions with long latencies (loads and 10464 // multiplies) have time to complete before their results are 10465 // used. This most benefits in-order implementations of the 10466 // architecture but out-of-order ones also benefit. 10467 void step() { 10468 block_comment("step"); 10469 // MACC(Ra, Rb, t0, t1, t2); 10470 // Ra = *++Pa; 10471 // Rb = *--Pb; 10472 umulh(Rhi_ab, Ra, Rb); 10473 mul(Rlo_ab, Ra, Rb); 10474 ldr(Ra, pre(Pa, wordSize)); 10475 ldr(Rb, pre(Pb, -wordSize)); 10476 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the 10477 // previous iteration. 
10478 // MACC(Rm, Rn, t0, t1, t2); 10479 // Rm = *++Pm; 10480 // Rn = *--Pn; 10481 umulh(Rhi_mn, Rm, Rn); 10482 mul(Rlo_mn, Rm, Rn); 10483 ldr(Rm, pre(Pm, wordSize)); 10484 ldr(Rn, pre(Pn, -wordSize)); 10485 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 10486 } 10487 10488 void post1() { 10489 block_comment("post1"); 10490 10491 // MACC(Ra, Rb, t0, t1, t2); 10492 // Ra = *++Pa; 10493 // Rb = *--Pb; 10494 umulh(Rhi_ab, Ra, Rb); 10495 mul(Rlo_ab, Ra, Rb); 10496 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 10497 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 10498 10499 // *Pm = Rm = t0 * inv; 10500 mul(Rm, t0, inv); 10501 str(Rm, Address(Pm)); 10502 10503 // MACC(Rm, Rn, t0, t1, t2); 10504 // t0 = t1; t1 = t2; t2 = 0; 10505 umulh(Rhi_mn, Rm, Rn); 10506 10507 #ifndef PRODUCT 10508 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 10509 { 10510 mul(Rlo_mn, Rm, Rn); 10511 add(Rlo_mn, t0, Rlo_mn); 10512 Label ok; 10513 cbz(Rlo_mn, ok); { 10514 stop("broken Montgomery multiply"); 10515 } bind(ok); 10516 } 10517 #endif 10518 // We have very carefully set things up so that 10519 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 10520 // the lower half of Rm * Rn because we know the result already: 10521 // it must be -t0. t0 + (-t0) must generate a carry iff 10522 // t0 != 0. So, rather than do a mul and an adds we just set 10523 // the carry flag iff t0 is nonzero. 10524 // 10525 // mul(Rlo_mn, Rm, Rn); 10526 // adds(zr, t0, Rlo_mn); 10527 subs(zr, t0, 1); // Set carry iff t0 is nonzero 10528 adcs(t0, t1, Rhi_mn); 10529 adc(t1, t2, zr); 10530 mov(t2, zr); 10531 } 10532 10533 void pre2(RegisterOrConstant i, RegisterOrConstant len) { 10534 block_comment("pre2"); 10535 // Pa = Pa_base + i-len; 10536 // Pb = Pb_base + len; 10537 // Pm = Pm_base + i-len; 10538 // Pn = Pn_base + len; 10539 10540 if (i.is_register()) { 10541 sub(Rj, i.as_register(), len); 10542 } else { 10543 mov(Rj, i.as_constant()); 10544 sub(Rj, Rj, len); 10545 } 10546 // Rj == i-len 10547 10548 lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord))); 10549 lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord))); 10550 lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 10551 lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord))); 10552 10553 // Ra = *++Pa; 10554 // Rb = *--Pb; 10555 // Rm = *++Pm; 10556 // Rn = *--Pn; 10557 ldr(Ra, pre(Pa, wordSize)); 10558 ldr(Rb, pre(Pb, -wordSize)); 10559 ldr(Rm, pre(Pm, wordSize)); 10560 ldr(Rn, pre(Pn, -wordSize)); 10561 10562 mov(Rhi_mn, zr); 10563 mov(Rlo_mn, zr); 10564 } 10565 10566 void post2(RegisterOrConstant i, RegisterOrConstant len) { 10567 block_comment("post2"); 10568 if (i.is_constant()) { 10569 mov(Rj, i.as_constant()-len.as_constant()); 10570 } else { 10571 sub(Rj, i.as_register(), len); 10572 } 10573 10574 adds(t0, t0, Rlo_mn); // The pending m*n, low part 10575 10576 // As soon as we know the least significant digit of our result, 10577 // store it. 10578 // Pm_base[i-len] = t0; 10579 str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 10580 10581 // t0 = t1; t1 = t2; t2 = 0; 10582 adcs(t0, t1, Rhi_mn); // The pending m*n, high part 10583 adc(t1, t2, zr); 10584 mov(t2, zr); 10585 } 10586 10587 // A carry in t0 after Montgomery multiplication means that we 10588 // should subtract multiples of n from our result in m. We'll 10589 // keep doing that until there is no carry. 
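// In C, approximately (a sketch of the loop generated below):
//
//   while (t0 != 0) {
//     julong borrow = 0;
//     for (int i = 0; i < len; i++) {
//       julong a = Pm_base[i], b = Pn_base[i];
//       Pm_base[i] = a - b - borrow;
//       borrow = (a < b) || (a == b && borrow);
//     }
//     t0 -= borrow;
//   }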
10590 void normalize(RegisterOrConstant len) { 10591 block_comment("normalize"); 10592 // while (t0) 10593 // t0 = sub(Pm_base, Pn_base, t0, len); 10594 Label loop, post, again; 10595 Register cnt = t1, i = t2; // Re-use registers; we're done with them now 10596 cbz(t0, post); { 10597 bind(again); { 10598 mov(i, zr); 10599 mov(cnt, len); 10600 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 10601 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 10602 subs(zr, zr, zr); // set carry flag, i.e. no borrow 10603 align(16); 10604 bind(loop); { 10605 sbcs(Rm, Rm, Rn); 10606 str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 10607 add(i, i, 1); 10608 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 10609 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 10610 sub(cnt, cnt, 1); 10611 } cbnz(cnt, loop); 10612 sbc(t0, t0, zr); 10613 } cbnz(t0, again); 10614 } bind(post); 10615 } 10616 10617 // Move memory at s to d, reversing words. 10618 // Increments d to end of copied memory 10619 // Destroys tmp1, tmp2 10620 // Preserves len 10621 // Leaves s pointing to the address which was in d at start 10622 void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) { 10623 assert(tmp1->encoding() < r19->encoding(), "register corruption"); 10624 assert(tmp2->encoding() < r19->encoding(), "register corruption"); 10625 10626 lea(s, Address(s, len, Address::uxtw(LogBytesPerWord))); 10627 mov(tmp1, len); 10628 unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2); 10629 sub(s, d, len, ext::uxtw, LogBytesPerWord); 10630 } 10631 // where 10632 void reverse1(Register d, Register s, Register tmp) { 10633 ldr(tmp, pre(s, -wordSize)); 10634 ror(tmp, tmp, 32); 10635 str(tmp, post(d, wordSize)); 10636 } 10637 10638 void step_squaring() { 10639 // An extra ACC 10640 step(); 10641 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 10642 } 10643 10644 void last_squaring(RegisterOrConstant i) { 10645 Label dont; 10646 // if ((i & 1) == 0) { 10647 tbnz(i.as_register(), 0, dont); { 10648 // MACC(Ra, Rb, t0, t1, t2); 10649 // Ra = *++Pa; 10650 // Rb = *--Pb; 10651 umulh(Rhi_ab, Ra, Rb); 10652 mul(Rlo_ab, Ra, Rb); 10653 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 10654 } bind(dont); 10655 } 10656 10657 void extra_step_squaring() { 10658 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 10659 10660 // MACC(Rm, Rn, t0, t1, t2); 10661 // Rm = *++Pm; 10662 // Rn = *--Pn; 10663 umulh(Rhi_mn, Rm, Rn); 10664 mul(Rlo_mn, Rm, Rn); 10665 ldr(Rm, pre(Pm, wordSize)); 10666 ldr(Rn, pre(Pn, -wordSize)); 10667 } 10668 10669 void post1_squaring() { 10670 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 10671 10672 // *Pm = Rm = t0 * inv; 10673 mul(Rm, t0, inv); 10674 str(Rm, Address(Pm)); 10675 10676 // MACC(Rm, Rn, t0, t1, t2); 10677 // t0 = t1; t1 = t2; t2 = 0; 10678 umulh(Rhi_mn, Rm, Rn); 10679 10680 #ifndef PRODUCT 10681 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 10682 { 10683 mul(Rlo_mn, Rm, Rn); 10684 add(Rlo_mn, t0, Rlo_mn); 10685 Label ok; 10686 cbz(Rlo_mn, ok); { 10687 stop("broken Montgomery multiply"); 10688 } bind(ok); 10689 } 10690 #endif 10691 // We have very carefully set things up so that 10692 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 10693 // the lower half of Rm * Rn because we know the result already: 10694 // it must be -t0. t0 + (-t0) must generate a carry iff 10695 // t0 != 0. So, rather than do a mul and an adds we just set 10696 // the carry flag iff t0 is nonzero. 
10697 // 10698 // mul(Rlo_mn, Rm, Rn); 10699 // adds(zr, t0, Rlo_mn); 10700 subs(zr, t0, 1); // Set carry iff t0 is nonzero 10701 adcs(t0, t1, Rhi_mn); 10702 adc(t1, t2, zr); 10703 mov(t2, zr); 10704 } 10705 10706 void acc(Register Rhi, Register Rlo, 10707 Register t0, Register t1, Register t2) { 10708 adds(t0, t0, Rlo); 10709 adcs(t1, t1, Rhi); 10710 adc(t2, t2, zr); 10711 } 10712 10713 public: 10714 /** 10715 * Fast Montgomery multiplication. The derivation of the 10716 * algorithm is in A Cryptographic Library for the Motorola 10717 * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237. 10718 * 10719 * Arguments: 10720 * 10721 * Inputs for multiplication: 10722 * c_rarg0 - int array elements a 10723 * c_rarg1 - int array elements b 10724 * c_rarg2 - int array elements n (the modulus) 10725 * c_rarg3 - int length 10726 * c_rarg4 - int inv 10727 * c_rarg5 - int array elements m (the result) 10728 * 10729 * Inputs for squaring: 10730 * c_rarg0 - int array elements a 10731 * c_rarg1 - int array elements n (the modulus) 10732 * c_rarg2 - int length 10733 * c_rarg3 - int inv 10734 * c_rarg4 - int array elements m (the result) 10735 * 10736 */ 10737 address generate_multiply() { 10738 Label argh, nothing; 10739 bind(argh); 10740 stop("MontgomeryMultiply total_allocation must be <= 8192"); 10741 10742 align(CodeEntryAlignment); 10743 address entry = pc(); 10744 10745 cbzw(Rlen, nothing); 10746 10747 enter(); 10748 10749 // Make room. 10750 cmpw(Rlen, 512); 10751 br(Assembler::HI, argh); 10752 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 10753 andr(sp, Ra, -2 * wordSize); 10754 10755 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 10756 10757 { 10758 // Copy input args, reversing as we go. We use Ra as a 10759 // temporary variable. 10760 reverse(Ra, Pa_base, Rlen, t0, t1); 10761 if (!_squaring) 10762 reverse(Ra, Pb_base, Rlen, t0, t1); 10763 reverse(Ra, Pn_base, Rlen, t0, t1); 10764 } 10765 10766 // Push all call-saved registers and also Pm_base which we'll need 10767 // at the end. 
10768 save_regs(); 10769 10770 #ifndef PRODUCT 10771 // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply"); 10772 { 10773 ldr(Rn, Address(Pn_base, 0)); 10774 mul(Rlo_mn, Rn, inv); 10775 subs(zr, Rlo_mn, -1); 10776 Label ok; 10777 br(EQ, ok); { 10778 stop("broken inverse in Montgomery multiply"); 10779 } bind(ok); 10780 } 10781 #endif 10782 10783 mov(Pm_base, Ra); 10784 10785 mov(t0, zr); 10786 mov(t1, zr); 10787 mov(t2, zr); 10788 10789 block_comment("for (int i = 0; i < len; i++) {"); 10790 mov(Ri, zr); { 10791 Label loop, end; 10792 cmpw(Ri, Rlen); 10793 br(Assembler::GE, end); 10794 10795 bind(loop); 10796 pre1(Ri); 10797 10798 block_comment(" for (j = i; j; j--) {"); { 10799 movw(Rj, Ri); 10800 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 10801 } block_comment(" } // j"); 10802 10803 post1(); 10804 addw(Ri, Ri, 1); 10805 cmpw(Ri, Rlen); 10806 br(Assembler::LT, loop); 10807 bind(end); 10808 block_comment("} // i"); 10809 } 10810 10811 block_comment("for (int i = len; i < 2*len; i++) {"); 10812 mov(Ri, Rlen); { 10813 Label loop, end; 10814 cmpw(Ri, Rlen, Assembler::LSL, 1); 10815 br(Assembler::GE, end); 10816 10817 bind(loop); 10818 pre2(Ri, Rlen); 10819 10820 block_comment(" for (j = len*2-i-1; j; j--) {"); { 10821 lslw(Rj, Rlen, 1); 10822 subw(Rj, Rj, Ri); 10823 subw(Rj, Rj, 1); 10824 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 10825 } block_comment(" } // j"); 10826 10827 post2(Ri, Rlen); 10828 addw(Ri, Ri, 1); 10829 cmpw(Ri, Rlen, Assembler::LSL, 1); 10830 br(Assembler::LT, loop); 10831 bind(end); 10832 } 10833 block_comment("} // i"); 10834 10835 normalize(Rlen); 10836 10837 mov(Ra, Pm_base); // Save Pm_base in Ra 10838 restore_regs(); // Restore caller's Pm_base 10839 10840 // Copy our result into caller's Pm_base 10841 reverse(Pm_base, Ra, Rlen, t0, t1); 10842 10843 leave(); 10844 bind(nothing); 10845 ret(lr); 10846 10847 return entry; 10848 } 10849 // In C, approximately: 10850 10851 // void 10852 // montgomery_multiply(julong Pa_base[], julong Pb_base[], 10853 // julong Pn_base[], julong Pm_base[], 10854 // julong inv, int len) { 10855 // julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 10856 // julong *Pa, *Pb, *Pn, *Pm; 10857 // julong Ra, Rb, Rn, Rm; 10858 10859 // int i; 10860 10861 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply"); 10862 10863 // for (i = 0; i < len; i++) { 10864 // int j; 10865 10866 // Pa = Pa_base; 10867 // Pb = Pb_base + i; 10868 // Pm = Pm_base; 10869 // Pn = Pn_base + i; 10870 10871 // Ra = *Pa; 10872 // Rb = *Pb; 10873 // Rm = *Pm; 10874 // Rn = *Pn; 10875 10876 // int iters = i; 10877 // for (j = 0; iters--; j++) { 10878 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 10879 // MACC(Ra, Rb, t0, t1, t2); 10880 // Ra = *++Pa; 10881 // Rb = *--Pb; 10882 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 10883 // MACC(Rm, Rn, t0, t1, t2); 10884 // Rm = *++Pm; 10885 // Rn = *--Pn; 10886 // } 10887 10888 // assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be"); 10889 // MACC(Ra, Rb, t0, t1, t2); 10890 // *Pm = Rm = t0 * inv; 10891 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be"); 10892 // MACC(Rm, Rn, t0, t1, t2); 10893 10894 // assert(t0 == 0, "broken Montgomery multiply"); 10895 10896 // t0 = t1; t1 = t2; t2 = 0; 10897 // } 10898 10899 // for (i = len; i < 2*len; i++) { 10900 // int j; 10901 10902 // Pa = Pa_base + i-len; 10903 // Pb = Pb_base + len; 10904 // Pm = Pm_base + i-len; 10905 // Pn = Pn_base + len; 10906 10907 // Ra = *++Pa; 10908 // Rb = 
*--Pb; 10909 // Rm = *++Pm; 10910 // Rn = *--Pn; 10911 10912 // int iters = len*2-i-1; 10913 // for (j = i-len+1; iters--; j++) { 10914 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 10915 // MACC(Ra, Rb, t0, t1, t2); 10916 // Ra = *++Pa; 10917 // Rb = *--Pb; 10918 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 10919 // MACC(Rm, Rn, t0, t1, t2); 10920 // Rm = *++Pm; 10921 // Rn = *--Pn; 10922 // } 10923 10924 // Pm_base[i-len] = t0; 10925 // t0 = t1; t1 = t2; t2 = 0; 10926 // } 10927 10928 // while (t0) 10929 // t0 = sub(Pm_base, Pn_base, t0, len); 10930 // } 10931 10932 /** 10933 * Fast Montgomery squaring. This uses asymptotically 25% fewer 10934 * multiplies than Montgomery multiplication so it should be up to 10935 * 25% faster. However, its loop control is more complex and it 10936 * may actually run slower on some machines. 10937 * 10938 * Arguments: 10939 * 10940 * Inputs: 10941 * c_rarg0 - int array elements a 10942 * c_rarg1 - int array elements n (the modulus) 10943 * c_rarg2 - int length 10944 * c_rarg3 - int inv 10945 * c_rarg4 - int array elements m (the result) 10946 * 10947 */ 10948 address generate_square() { 10949 Label argh; 10950 bind(argh); 10951 stop("MontgomeryMultiply total_allocation must be <= 8192"); 10952 10953 align(CodeEntryAlignment); 10954 address entry = pc(); 10955 10956 enter(); 10957 10958 // Make room. 10959 cmpw(Rlen, 512); 10960 br(Assembler::HI, argh); 10961 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 10962 andr(sp, Ra, -2 * wordSize); 10963 10964 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 10965 10966 { 10967 // Copy input args, reversing as we go. We use Ra as a 10968 // temporary variable. 10969 reverse(Ra, Pa_base, Rlen, t0, t1); 10970 reverse(Ra, Pn_base, Rlen, t0, t1); 10971 } 10972 10973 // Push all call-saved registers and also Pm_base which we'll need 10974 // at the end. 
10975 save_regs(); 10976 10977 mov(Pm_base, Ra); 10978 10979 mov(t0, zr); 10980 mov(t1, zr); 10981 mov(t2, zr); 10982 10983 block_comment("for (int i = 0; i < len; i++) {"); 10984 mov(Ri, zr); { 10985 Label loop, end; 10986 bind(loop); 10987 cmp(Ri, Rlen); 10988 br(Assembler::GE, end); 10989 10990 pre1(Ri); 10991 10992 block_comment("for (j = (i+1)/2; j; j--) {"); { 10993 add(Rj, Ri, 1); 10994 lsr(Rj, Rj, 1); 10995 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 10996 } block_comment(" } // j"); 10997 10998 last_squaring(Ri); 10999 11000 block_comment(" for (j = i/2; j; j--) {"); { 11001 lsr(Rj, Ri, 1); 11002 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 11003 } block_comment(" } // j"); 11004 11005 post1_squaring(); 11006 add(Ri, Ri, 1); 11007 cmp(Ri, Rlen); 11008 br(Assembler::LT, loop); 11009 11010 bind(end); 11011 block_comment("} // i"); 11012 } 11013 11014 block_comment("for (int i = len; i < 2*len; i++) {"); 11015 mov(Ri, Rlen); { 11016 Label loop, end; 11017 bind(loop); 11018 cmp(Ri, Rlen, Assembler::LSL, 1); 11019 br(Assembler::GE, end); 11020 11021 pre2(Ri, Rlen); 11022 11023 block_comment(" for (j = (2*len-i-1)/2; j; j--) {"); { 11024 lsl(Rj, Rlen, 1); 11025 sub(Rj, Rj, Ri); 11026 sub(Rj, Rj, 1); 11027 lsr(Rj, Rj, 1); 11028 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 11029 } block_comment(" } // j"); 11030 11031 last_squaring(Ri); 11032 11033 block_comment(" for (j = (2*len-i)/2; j; j--) {"); { 11034 lsl(Rj, Rlen, 1); 11035 sub(Rj, Rj, Ri); 11036 lsr(Rj, Rj, 1); 11037 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 11038 } block_comment(" } // j"); 11039 11040 post2(Ri, Rlen); 11041 add(Ri, Ri, 1); 11042 cmp(Ri, Rlen, Assembler::LSL, 1); 11043 11044 br(Assembler::LT, loop); 11045 bind(end); 11046 block_comment("} // i"); 11047 } 11048 11049 normalize(Rlen); 11050 11051 mov(Ra, Pm_base); // Save Pm_base in Ra 11052 restore_regs(); // Restore caller's Pm_base 11053 11054 // Copy our result into caller's Pm_base 11055 reverse(Pm_base, Ra, Rlen, t0, t1); 11056 11057 leave(); 11058 ret(lr); 11059 11060 return entry; 11061 } 11062 // In C, approximately: 11063 11064 // void 11065 // montgomery_square(julong Pa_base[], julong Pn_base[], 11066 // julong Pm_base[], julong inv, int len) { 11067 // julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 11068 // julong *Pa, *Pb, *Pn, *Pm; 11069 // julong Ra, Rb, Rn, Rm; 11070 11071 // int i; 11072 11073 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply"); 11074 11075 // for (i = 0; i < len; i++) { 11076 // int j; 11077 11078 // Pa = Pa_base; 11079 // Pb = Pa_base + i; 11080 // Pm = Pm_base; 11081 // Pn = Pn_base + i; 11082 11083 // Ra = *Pa; 11084 // Rb = *Pb; 11085 // Rm = *Pm; 11086 // Rn = *Pn; 11087 11088 // int iters = (i+1)/2; 11089 // for (j = 0; iters--; j++) { 11090 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be"); 11091 // MACC2(Ra, Rb, t0, t1, t2); 11092 // Ra = *++Pa; 11093 // Rb = *--Pb; 11094 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 11095 // MACC(Rm, Rn, t0, t1, t2); 11096 // Rm = *++Pm; 11097 // Rn = *--Pn; 11098 // } 11099 // if ((i & 1) == 0) { 11100 // assert(Ra == Pa_base[j], "must be"); 11101 // MACC(Ra, Ra, t0, t1, t2); 11102 // } 11103 // iters = i/2; 11104 // assert(iters == i-j, "must be"); 11105 // for (; iters--; j++) { 11106 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 11107 // MACC(Rm, Rn, t0, t1, t2); 11108 // Rm = *++Pm; 11109 // Rn = *--Pn; 11110 // } 11111 11112 // 
*Pm = Rm = t0 * inv; 11113 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be"); 11114 // MACC(Rm, Rn, t0, t1, t2); 11115 11116 // assert(t0 == 0, "broken Montgomery multiply"); 11117 11118 // t0 = t1; t1 = t2; t2 = 0; 11119 // } 11120 11121 // for (i = len; i < 2*len; i++) { 11122 // int start = i-len+1; 11123 // int end = start + (len - start)/2; 11124 // int j; 11125 11126 // Pa = Pa_base + i-len; 11127 // Pb = Pa_base + len; 11128 // Pm = Pm_base + i-len; 11129 // Pn = Pn_base + len; 11130 11131 // Ra = *++Pa; 11132 // Rb = *--Pb; 11133 // Rm = *++Pm; 11134 // Rn = *--Pn; 11135 11136 // int iters = (2*len-i-1)/2; 11137 // assert(iters == end-start, "must be"); 11138 // for (j = start; iters--; j++) { 11139 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be"); 11140 // MACC2(Ra, Rb, t0, t1, t2); 11141 // Ra = *++Pa; 11142 // Rb = *--Pb; 11143 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 11144 // MACC(Rm, Rn, t0, t1, t2); 11145 // Rm = *++Pm; 11146 // Rn = *--Pn; 11147 // } 11148 // if ((i & 1) == 0) { 11149 // assert(Ra == Pa_base[j], "must be"); 11150 // MACC(Ra, Ra, t0, t1, t2); 11151 // } 11152 // iters = (2*len-i)/2; 11153 // assert(iters == len-j, "must be"); 11154 // for (; iters--; j++) { 11155 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 11156 // MACC(Rm, Rn, t0, t1, t2); 11157 // Rm = *++Pm; 11158 // Rn = *--Pn; 11159 // } 11160 // Pm_base[i-len] = t0; 11161 // t0 = t1; t1 = t2; t2 = 0; 11162 // } 11163 11164 // while (t0) 11165 // t0 = sub(Pm_base, Pn_base, t0, len); 11166 // } 11167 }; 11168 11169 // Initialization 11170 void generate_initial_stubs() { 11171 // Generate initial stubs and initializes the entry points 11172 11173 // entry points that exist in all platforms Note: This is code 11174 // that could be shared among different platforms - however the 11175 // benefit seems to be smaller than the disadvantage of having a 11176 // much more complicated generator structure. See also comment in 11177 // stubRoutines.hpp. 11178 11179 StubRoutines::_forward_exception_entry = generate_forward_exception(); 11180 11181 StubRoutines::_call_stub_entry = 11182 generate_call_stub(StubRoutines::_call_stub_return_address); 11183 11184 // is referenced by megamorphic call 11185 StubRoutines::_catch_exception_entry = generate_catch_exception(); 11186 11187 // Initialize table for copy memory (arraycopy) check. 
11188 if (UnsafeMemoryAccess::_table == nullptr) { 11189 UnsafeMemoryAccess::create_table(8 + 4); // 8 for copyMemory; 4 for setMemory 11190 } 11191 11192 if (UseCRC32Intrinsics) { 11193 // set table address before stub generation which uses it 11194 StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table; 11195 StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32(); 11196 } 11197 11198 if (UseCRC32CIntrinsics) { 11199 StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C(); 11200 } 11201 11202 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) { 11203 StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false); 11204 } 11205 11206 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) { 11207 StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true); 11208 } 11209 11210 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_float16ToFloat) && 11211 vmIntrinsics::is_intrinsic_available(vmIntrinsics::_floatToFloat16)) { 11212 StubRoutines::_hf2f = generate_float16ToFloat(); 11213 StubRoutines::_f2hf = generate_floatToFloat16(); 11214 } 11215 } 11216 11217 void generate_continuation_stubs() { 11218 // Continuation stubs: 11219 StubRoutines::_cont_thaw = generate_cont_thaw(); 11220 StubRoutines::_cont_returnBarrier = generate_cont_returnBarrier(); 11221 StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception(); 11222 StubRoutines::_cont_preempt_stub = generate_cont_preempt_stub(); 11223 } 11224 11225 void generate_final_stubs() { 11226 // support for verify_oop (must happen after universe_init) 11227 if (VerifyOops) { 11228 StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop(); 11229 } 11230 11231 // arraycopy stubs used by compilers 11232 generate_arraycopy_stubs(); 11233 11234 StubRoutines::_method_entry_barrier = generate_method_entry_barrier(); 11235 11236 StubRoutines::aarch64::_spin_wait = generate_spin_wait(); 11237 11238 StubRoutines::_upcall_stub_exception_handler = generate_upcall_stub_exception_handler(); 11239 StubRoutines::_upcall_stub_load_target = generate_upcall_stub_load_target(); 11240 11241 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS) 11242 11243 generate_atomic_entry_points(); 11244 11245 #endif // LINUX 11246 11247 #ifdef COMPILER2 11248 if (UseSecondarySupersTable) { 11249 StubRoutines::_lookup_secondary_supers_table_slow_path_stub = generate_lookup_secondary_supers_table_slow_path_stub(); 11250 if (! InlineSecondarySupersTest) { 11251 generate_lookup_secondary_supers_table_stub(); 11252 } 11253 } 11254 #endif 11255 11256 StubRoutines::aarch64::set_completed(); // Indicate that arraycopy and zero_blocks stubs are generated 11257 } 11258 11259 void generate_compiler_stubs() { 11260 #if COMPILER2_OR_JVMCI 11261 11262 if (UseSVE == 0) { 11263 StubRoutines::aarch64::_vector_iota_indices = generate_iota_indices(StubGenStubId::vector_iota_indices_id); 11264 } 11265 11266 // array equals stub for large arrays. 11267 if (!UseSimpleArrayEquals) { 11268 StubRoutines::aarch64::_large_array_equals = generate_large_array_equals(); 11269 } 11270 11271 // arrays_hashcode stub for large arrays.

    if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
      StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
    }

    if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
      StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
    }

    if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_float16ToFloat) &&
        vmIntrinsics::is_intrinsic_available(vmIntrinsics::_floatToFloat16)) {
      StubRoutines::_hf2f = generate_float16ToFloat();
      StubRoutines::_f2hf = generate_floatToFloat16();
    }
  }

  void generate_continuation_stubs() {
    // Continuation stubs:
    StubRoutines::_cont_thaw = generate_cont_thaw();
    StubRoutines::_cont_returnBarrier = generate_cont_returnBarrier();
    StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception();
    StubRoutines::_cont_preempt_stub = generate_cont_preempt_stub();
  }

  void generate_final_stubs() {
    // support for verify_oop (must happen after universe_init)
    if (VerifyOops) {
      StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
    }

    // arraycopy stubs used by compilers
    generate_arraycopy_stubs();

    StubRoutines::_method_entry_barrier = generate_method_entry_barrier();

    StubRoutines::aarch64::_spin_wait = generate_spin_wait();

    StubRoutines::_upcall_stub_exception_handler = generate_upcall_stub_exception_handler();
    StubRoutines::_upcall_stub_load_target = generate_upcall_stub_load_target();

#if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)

    generate_atomic_entry_points();

#endif // LINUX

#ifdef COMPILER2
    if (UseSecondarySupersTable) {
      StubRoutines::_lookup_secondary_supers_table_slow_path_stub = generate_lookup_secondary_supers_table_slow_path_stub();
      if (! InlineSecondarySupersTest) {
        generate_lookup_secondary_supers_table_stub();
      }
    }
#endif

    StubRoutines::aarch64::set_completed(); // Indicate that arraycopy and zero_blocks stubs are generated
  }

  void generate_compiler_stubs() {
#if COMPILER2_OR_JVMCI

    if (UseSVE == 0) {
      StubRoutines::aarch64::_vector_iota_indices = generate_iota_indices(StubGenStubId::vector_iota_indices_id);
    }

    // array equals stub for large arrays.
    if (!UseSimpleArrayEquals) {
      StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
    }
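
    // The large_arrays_hashcode stubs below vectorize the polynomial hash
    //   h = initial * 31^n + a[0] * 31^(n-1) + ... + a[n-1]
    // that backs Arrays.hashCode / ArraysSupport.vectorizedHashCode. As a
    // scalar reference only (illustrative; element widening to int follows
    // the intrinsic's contract for each basic type):
    //
    //   int poly_hash(int initial, const int* a, int n) {
    //     int h = initial;
    //     for (int i = 0; i < n; i++) {
    //       h = 31 * h + a[i];
    //     }
    //     return h;
    //   }
    //
    // The generated code processes several elements per iteration using
    // precomputed powers of 31.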

    // arrays_hashcode stub for large arrays.
    StubRoutines::aarch64::_large_arrays_hashcode_boolean = generate_large_arrays_hashcode(T_BOOLEAN);
    StubRoutines::aarch64::_large_arrays_hashcode_byte = generate_large_arrays_hashcode(T_BYTE);
    StubRoutines::aarch64::_large_arrays_hashcode_char = generate_large_arrays_hashcode(T_CHAR);
    StubRoutines::aarch64::_large_arrays_hashcode_int = generate_large_arrays_hashcode(T_INT);
    StubRoutines::aarch64::_large_arrays_hashcode_short = generate_large_arrays_hashcode(T_SHORT);

    // byte_array_inflate stub for large arrays.
    StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();

    // countPositives stub for large arrays.
    StubRoutines::aarch64::_count_positives = generate_count_positives(StubRoutines::aarch64::_count_positives_long);

    generate_compare_long_strings();

    generate_string_indexof_stubs();

#ifdef COMPILER2
    if (UseMultiplyToLenIntrinsic) {
      StubRoutines::_multiplyToLen = generate_multiplyToLen();
    }

    if (UseSquareToLenIntrinsic) {
      StubRoutines::_squareToLen = generate_squareToLen();
    }

    if (UseMulAddIntrinsic) {
      StubRoutines::_mulAdd = generate_mulAdd();
    }

    if (UseSIMDForBigIntegerShiftIntrinsics) {
      StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
      StubRoutines::_bigIntegerLeftShiftWorker = generate_bigIntegerLeftShift();
    }

    if (UseMontgomeryMultiplyIntrinsic) {
      StubGenStubId stub_id = StubGenStubId::montgomeryMultiply_id;
      StubCodeMark mark(this, stub_id);
      MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
      StubRoutines::_montgomeryMultiply = g.generate_multiply();
    }

    if (UseMontgomerySquareIntrinsic) {
      StubGenStubId stub_id = StubGenStubId::montgomerySquare_id;
      StubCodeMark mark(this, stub_id);
      MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
      // We use generate_multiply() rather than generate_square()
      // because it's faster for the sizes of modulus we care about.
      StubRoutines::_montgomerySquare = g.generate_multiply();
    }

#endif // COMPILER2
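
    // The ChaCha20 block stub registered below evaluates the standard
    // quarter-round across SIMD lanes. As a scalar reference for what one
    // quarter round does (illustrative sketch per RFC 7539, not the generated
    // code):
    //
    //   static inline uint32_t rotl32(uint32_t x, int n) {
    //     return (x << n) | (x >> (32 - n));
    //   }
    //   static inline void quarter_round(uint32_t &a, uint32_t &b,
    //                                    uint32_t &c, uint32_t &d) {
    //     a += b; d ^= a; d = rotl32(d, 16);
    //     c += d; b ^= c; b = rotl32(b, 12);
    //     a += b; d ^= a; d = rotl32(d, 8);
    //     c += d; b ^= c; b = rotl32(b, 7);
    //   }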

    if (UseChaCha20Intrinsics) {
      StubRoutines::_chacha20Block = generate_chacha20Block_blockpar();
    }

    if (UseKyberIntrinsics) {
      StubRoutines::_kyberNtt = generate_kyberNtt();
      StubRoutines::_kyberInverseNtt = generate_kyberInverseNtt();
      StubRoutines::_kyberNttMult = generate_kyberNttMult();
      StubRoutines::_kyberAddPoly_2 = generate_kyberAddPoly_2();
      StubRoutines::_kyberAddPoly_3 = generate_kyberAddPoly_3();
      StubRoutines::_kyber12To16 = generate_kyber12To16();
      StubRoutines::_kyberBarrettReduce = generate_kyberBarrettReduce();
    }

    if (UseDilithiumIntrinsics) {
      StubRoutines::_dilithiumAlmostNtt = generate_dilithiumAlmostNtt();
      StubRoutines::_dilithiumAlmostInverseNtt = generate_dilithiumAlmostInverseNtt();
      StubRoutines::_dilithiumNttMult = generate_dilithiumNttMult();
      StubRoutines::_dilithiumMontMulByConstant = generate_dilithiumMontMulByConstant();
      StubRoutines::_dilithiumDecomposePoly = generate_dilithiumDecomposePoly();
    }

    if (UseBASE64Intrinsics) {
      StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
      StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
    }

    // data cache line writeback
    StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
    StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();

    if (UseAESIntrinsics) {
      StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
      StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
      StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
      StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
      StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt();
    }
    if (UseGHASHIntrinsics) {
      // StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
      StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks_wide();
    }
    if (UseAESIntrinsics && UseGHASHIntrinsics) {
      StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt();
    }

    if (UseMD5Intrinsics) {
      StubRoutines::_md5_implCompress = generate_md5_implCompress(StubGenStubId::md5_implCompress_id);
      StubRoutines::_md5_implCompressMB = generate_md5_implCompress(StubGenStubId::md5_implCompressMB_id);
    }
    if (UseSHA1Intrinsics) {
      StubRoutines::_sha1_implCompress = generate_sha1_implCompress(StubGenStubId::sha1_implCompress_id);
      StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(StubGenStubId::sha1_implCompressMB_id);
    }
    if (UseSHA256Intrinsics) {
      StubRoutines::_sha256_implCompress = generate_sha256_implCompress(StubGenStubId::sha256_implCompress_id);
      StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(StubGenStubId::sha256_implCompressMB_id);
    }
    if (UseSHA512Intrinsics) {
      StubRoutines::_sha512_implCompress = generate_sha512_implCompress(StubGenStubId::sha512_implCompress_id);
      StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(StubGenStubId::sha512_implCompressMB_id);
    }
    if (UseSHA3Intrinsics) {
      StubRoutines::_sha3_implCompress = generate_sha3_implCompress(StubGenStubId::sha3_implCompress_id);
      StubRoutines::_double_keccak = generate_double_keccak();
      StubRoutines::_sha3_implCompressMB = generate_sha3_implCompress(StubGenStubId::sha3_implCompressMB_id);
    }

    if (UsePoly1305Intrinsics) {
      StubRoutines::_poly1305_processBlocks = generate_poly1305_processBlocks();
    }

    // generate Adler32 intrinsics code
    if (UseAdler32Intrinsics) {
      StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
    }

#endif // COMPILER2_OR_JVMCI
  }

 public:
  StubGenerator(CodeBuffer* code, StubGenBlobId blob_id) : StubCodeGenerator(code, blob_id) {
    switch(blob_id) {
    case initial_id:
      generate_initial_stubs();
      break;
    case continuation_id:
      generate_continuation_stubs();
      break;
    case compiler_id:
      generate_compiler_stubs();
      break;
    case final_id:
      generate_final_stubs();
      break;
    default:
      fatal("unexpected blob id: %d", blob_id);
      break;
    };
  }
}; // end class declaration

void StubGenerator_generate(CodeBuffer* code, StubGenBlobId blob_id) {
  StubGenerator g(code, blob_id);
}


#if defined (LINUX)

// Define pointers to atomic stubs and initialize them to point to the
// code in atomic_aarch64.S.

#define DEFAULT_ATOMIC_OP(OPNAME, SIZE, RELAXED)                                          \
  extern "C" uint64_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl  \
    (volatile void *ptr, uint64_t arg1, uint64_t arg2);                                   \
  aarch64_atomic_stub_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _impl        \
    = aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl;

DEFAULT_ATOMIC_OP(fetch_add, 4, )
DEFAULT_ATOMIC_OP(fetch_add, 8, )
DEFAULT_ATOMIC_OP(fetch_add, 4, _relaxed)
DEFAULT_ATOMIC_OP(fetch_add, 8, _relaxed)
DEFAULT_ATOMIC_OP(xchg, 4, )
DEFAULT_ATOMIC_OP(xchg, 8, )
DEFAULT_ATOMIC_OP(cmpxchg, 1, )
DEFAULT_ATOMIC_OP(cmpxchg, 4, )
DEFAULT_ATOMIC_OP(cmpxchg, 8, )
DEFAULT_ATOMIC_OP(cmpxchg, 1, _relaxed)
DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed)
DEFAULT_ATOMIC_OP(cmpxchg, 8, _relaxed)
DEFAULT_ATOMIC_OP(cmpxchg, 4, _release)
DEFAULT_ATOMIC_OP(cmpxchg, 8, _release)
DEFAULT_ATOMIC_OP(cmpxchg, 4, _seq_cst)
DEFAULT_ATOMIC_OP(cmpxchg, 8, _seq_cst)

#undef DEFAULT_ATOMIC_OP

#endif // LINUX
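
// For reference, each DEFAULT_ATOMIC_OP instantiation above declares the
// assembly fallback from atomic_aarch64.S and a patchable function pointer
// that initially targets it. For example, the first instantiation expands
// (illustratively) to:
//
//   extern "C" uint64_t aarch64_atomic_fetch_add_4_default_impl
//     (volatile void *ptr, uint64_t arg1, uint64_t arg2);
//   aarch64_atomic_stub_t aarch64_atomic_fetch_add_4_impl
//     = aarch64_atomic_fetch_add_4_default_impl;
//
// When generate_atomic_entry_points() runs (see generate_final_stubs() above),
// these pointers are redirected to the freshly generated stub code.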