/*
 * Copyright (c) 2003, 2025, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2014, 2025, Red Hat Inc. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "asm/macroAssembler.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "asm/register.hpp"
#include "atomic_aarch64.hpp"
#include "compiler/oopMap.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "gc/shared/gc_globals.hpp"
#include "gc/shared/tlab_globals.hpp"
#include "interpreter/interpreter.hpp"
#include "memory/universe.hpp"
#include "nativeInst_aarch64.hpp"
#include "oops/instanceOop.hpp"
#include "oops/method.hpp"
#include "oops/objArrayKlass.hpp"
#include "oops/oop.inline.hpp"
#include "prims/methodHandles.hpp"
#include "prims/upcallLinker.hpp"
#include "runtime/arguments.hpp"
#include "runtime/atomic.hpp"
#include "runtime/continuation.hpp"
#include "runtime/continuationEntry.inline.hpp"
#include "runtime/frame.inline.hpp"
#include "runtime/handles.inline.hpp"
#include "runtime/javaThread.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubCodeGenerator.hpp"
#include "runtime/stubRoutines.hpp"
#include "utilities/align.hpp"
#include "utilities/checkedCast.hpp"
#include "utilities/debug.hpp"
#include "utilities/globalDefinitions.hpp"
#include "utilities/intpow.hpp"
#include "utilities/powerOfTwo.hpp"
#ifdef COMPILER2
#include "opto/runtime.hpp"
#endif
#if INCLUDE_ZGC
#include "gc/z/zThreadLocalData.hpp"
#endif

// Declaration and definition of StubGenerator (no .hpp file).
// For a more detailed description of the stub routine structure
// see the comment in stubRoutines.hpp

#undef __
#define __ _masm->

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) __ block_comment(str)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

// Stub Code definitions

class StubGenerator: public StubCodeGenerator {
 private:

#ifdef PRODUCT
#define inc_counter_np(counter) ((void)0)
#else
  void inc_counter_np_(uint& counter) {
    __ incrementw(ExternalAddress((address)&counter));
  }
#define inc_counter_np(counter) \
  BLOCK_COMMENT("inc_counter " #counter); \
  inc_counter_np_(counter);
#endif

  // Call stubs are used to call Java from C
  //
  // Arguments:
  //    c_rarg0:  call wrapper address              address
  //    c_rarg1:  result                            address
  //    c_rarg2:  result type                       BasicType
  //    c_rarg3:  method                            Method*
  //    c_rarg4:  (interpreter) entry point         address
  //    c_rarg5:  parameters                        intptr_t*
  //    c_rarg6:  parameter size (in words)         int
  //    c_rarg7:  thread                            Thread*
  //
  // There is no return from the stub itself as any Java result
  // is written to result
  //
  // we save r30 (lr) as the return PC at the base of the frame and
  // link r29 (fp) below it as the frame pointer installing sp (r31)
  // into fp.
  //
  // we save r0-r7, which accounts for all the c arguments.
  //
  // TODO: strictly do we need to save them all? they are treated as
  // volatile by C so could we omit saving the ones we are going to
  // place in global registers (thread? method?) or those we only use
  // during setup of the Java call?
  //
  // we don't need to save r8 which C uses as an indirect result location
  // return register.
  //
  // we don't need to save r9-r15 which both C and Java treat as
  // volatile
  //
  // we don't need to save r16-18 because Java does not use them
  //
  // we save r19-r28 which Java uses as scratch registers and C
  // expects to be callee-save
  //
  // we save the bottom 64 bits of each value stored in v8-v15; it is
  // the responsibility of the caller to preserve larger values.
  //
  // so the stub frame looks like this when we enter Java code
  //
  //     [ return_from_Java     ] <--- sp
  //     [ argument word n      ]
  //      ...
  // -29 [ argument word 1      ]
  // -28 [ saved Floating-point Control Register ]
  // -26 [ saved v15            ] <--- sp_after_call
  // -25 [ saved v14            ]
  // -24 [ saved v13            ]
  // -23 [ saved v12            ]
  // -22 [ saved v11            ]
  // -21 [ saved v10            ]
  // -20 [ saved v9             ]
  // -19 [ saved v8             ]
  // -18 [ saved r28            ]
  // -17 [ saved r27            ]
  // -16 [ saved r26            ]
  // -15 [ saved r25            ]
  // -14 [ saved r24            ]
  // -13 [ saved r23            ]
  // -12 [ saved r22            ]
  // -11 [ saved r21            ]
  // -10 [ saved r20            ]
  //  -9 [ saved r19            ]
  //  -8 [ call wrapper    (r0) ]
  //  -7 [ result          (r1) ]
  //  -6 [ result type     (r2) ]
  //  -5 [ method          (r3) ]
  //  -4 [ entry point     (r4) ]
  //  -3 [ parameters      (r5) ]
  //  -2 [ parameter size  (r6) ]
  //  -1 [ thread          (r7) ]
  //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
  //   1 [ saved lr       (r30) ]

  // Call stub stack layout word offsets from fp
  enum call_stub_layout {
    sp_after_call_off  = -28,

    fpcr_off           = sp_after_call_off,
    d15_off            = -26,
    d13_off            = -24,
    d11_off            = -22,
    d9_off             = -20,

    r28_off            = -18,
    r26_off            = -16,
    r24_off            = -14,
    r22_off            = -12,
    r20_off            = -10,
    call_wrapper_off   =  -8,
    result_off         =  -7,
    result_type_off    =  -6,
    method_off         =  -5,
    entry_point_off    =  -4,
    parameter_size_off =  -2,
    thread_off         =  -1,
    fp_f               =   0,
    retaddr_off        =   1,
  };
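
  // n.b. only the lower (more negative) slot of each saved pair gets an
  // _off name above because the registers are stored two at a time with
  // stp/stpd: for example r20_off == -10 covers the stp(r20, r19, r20_save)
  // store below, which writes r20 to [fp - 10 * wordSize] and r19 to the
  // adjacent slot at [fp - 9 * wordSize], matching the frame diagram.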

  address generate_call_stub(address& return_address) {
    assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
           (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
           "adjust this code");

    StubId stub_id = StubId::stubgen_call_stub_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();

    const Address sp_after_call (rfp, sp_after_call_off * wordSize);

    const Address fpcr_save     (rfp, fpcr_off           * wordSize);
    const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
    const Address result        (rfp, result_off         * wordSize);
    const Address result_type   (rfp, result_type_off    * wordSize);
    const Address method        (rfp, method_off         * wordSize);
    const Address entry_point   (rfp, entry_point_off    * wordSize);
    const Address parameter_size(rfp, parameter_size_off * wordSize);

    const Address thread        (rfp, thread_off         * wordSize);

    const Address d15_save      (rfp, d15_off * wordSize);
    const Address d13_save      (rfp, d13_off * wordSize);
    const Address d11_save      (rfp, d11_off * wordSize);
    const Address d9_save       (rfp, d9_off * wordSize);

    const Address r28_save      (rfp, r28_off * wordSize);
    const Address r26_save      (rfp, r26_off * wordSize);
    const Address r24_save      (rfp, r24_off * wordSize);
    const Address r22_save      (rfp, r22_off * wordSize);
    const Address r20_save      (rfp, r20_off * wordSize);

    // stub code

    address aarch64_entry = __ pc();

    // set up frame and move sp to end of save area
    __ enter();
    __ sub(sp, rfp, -sp_after_call_off * wordSize);

    // save register parameters and Java scratch/global registers
    // n.b. we save thread even though it gets installed in
    // rthread because we want to sanity check rthread later
    __ str(c_rarg7, thread);
    __ strw(c_rarg6, parameter_size);
    __ stp(c_rarg4, c_rarg5, entry_point);
    __ stp(c_rarg2, c_rarg3, result_type);
    __ stp(c_rarg0, c_rarg1, call_wrapper);

    __ stp(r20, r19, r20_save);
    __ stp(r22, r21, r22_save);
    __ stp(r24, r23, r24_save);
    __ stp(r26, r25, r26_save);
    __ stp(r28, r27, r28_save);

    __ stpd(v9, v8, d9_save);
    __ stpd(v11, v10, d11_save);
    __ stpd(v13, v12, d13_save);
    __ stpd(v15, v14, d15_save);

    __ get_fpcr(rscratch1);
    __ str(rscratch1, fpcr_save);
    // Set FPCR to the state we need. We do want Round to Nearest. We
    // don't want non-IEEE rounding modes or floating-point traps.
    __ bfi(rscratch1, zr, 22, 4); // Clear DN, FZ, and Rmode
    __ bfi(rscratch1, zr, 8, 5);  // Clear exception-control bits (8-12)
    __ set_fpcr(rscratch1);
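    // For reference: in FPCR, RMode occupies bits 23:22, FZ is bit 24 and
    // DN is bit 25, so the first bfi above selects round-to-nearest with no
    // flush-to-zero or default-NaN behaviour; the second clears the
    // IOE/DZE/OFE/UFE/IXE trap-enable bits (8-12) so no floating-point
    // exception traps are taken.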

    // install Java thread in global register now we have saved
    // whatever value it held
    __ mov(rthread, c_rarg7);
    // And method
    __ mov(rmethod, c_rarg3);

    // set up the heapbase register
    __ reinit_heapbase();

#ifdef ASSERT
    // make sure we have no pending exceptions
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
      __ cmp(rscratch1, (u1)NULL_WORD);
      __ br(Assembler::EQ, L);
      __ stop("StubRoutines::call_stub: entered with pending exception");
      __ BIND(L);
    }
#endif
    // pass parameters if any
    __ mov(esp, sp);
    __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
    __ andr(sp, rscratch1, -2 * wordSize);

    BLOCK_COMMENT("pass parameters if any");
    Label parameters_done;
    // parameter count is still in c_rarg6
    // and parameter pointer identifying param 1 is in c_rarg5
    __ cbzw(c_rarg6, parameters_done);

    address loop = __ pc();
    __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
    __ subsw(c_rarg6, c_rarg6, 1);
    __ push(rscratch1);
    __ br(Assembler::GT, loop);

    __ BIND(parameters_done);

    // call Java entry -- passing Method* and current sp
    // rmethod: Method*
    // r19_sender_sp: sender sp
    BLOCK_COMMENT("call Java function");
    __ mov(r19_sender_sp, sp);
    __ blr(c_rarg4);

    // we do this here because the notify will already have been done
    // if we get to the next instruction via an exception
    //
    // n.b. adding this instruction here affects the calculation of
    // whether or not a routine returns to the call stub (used when
    // doing stack walks) since the normal test is to check the return
    // pc against the address saved below. so we may need to allow for
    // this extra instruction in the check.

    // save current address for use by exception handling code

    return_address = __ pc();

    // store result depending on type (everything that is not
    // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
    // n.b. this assumes Java returns an integral result in r0
    // and a floating result in j_farg0
    // All of j_rargN may be used to return inline type fields so be careful
    // not to clobber those.
    // SharedRuntime::generate_buffered_inline_type_adapter() knows the register
    // assignment of Rresult below.
    Register Rresult = r14, Rresult_type = r15;
    __ ldr(Rresult, result);
    Label is_long, is_float, is_double, check_prim, exit;
    __ ldr(Rresult_type, result_type);
    __ cmp(Rresult_type, (u1)T_OBJECT);
    __ br(Assembler::EQ, check_prim);
    __ cmp(Rresult_type, (u1)T_LONG);
    __ br(Assembler::EQ, is_long);
    __ cmp(Rresult_type, (u1)T_FLOAT);
    __ br(Assembler::EQ, is_float);
    __ cmp(Rresult_type, (u1)T_DOUBLE);
    __ br(Assembler::EQ, is_double);

    // handle T_INT case
    __ strw(r0, Address(Rresult));

    __ BIND(exit);

    // pop parameters
    __ sub(esp, rfp, -sp_after_call_off * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ldr(rscratch1, thread);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::NE, S);
      __ get_thread(rscratch1);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::EQ, L);
      __ BIND(S);
      __ stop("StubRoutines::call_stub: threads must correspond");
      __ BIND(L);
    }
#endif

    __ pop_cont_fastpath(rthread);

    // restore callee-save registers
    __ ldpd(v15, v14, d15_save);
    __ ldpd(v13, v12, d13_save);
    __ ldpd(v11, v10, d11_save);
    __ ldpd(v9, v8, d9_save);

    __ ldp(r28, r27, r28_save);
    __ ldp(r26, r25, r26_save);
    __ ldp(r24, r23, r24_save);
    __ ldp(r22, r21, r22_save);
    __ ldp(r20, r19, r20_save);

    // restore fpcr
    __ ldr(rscratch1, fpcr_save);
    __ set_fpcr(rscratch1);

    __ ldp(c_rarg0, c_rarg1, call_wrapper);
    __ ldrw(c_rarg2, result_type);
    __ ldr(c_rarg3, method);
    __ ldp(c_rarg4, c_rarg5, entry_point);
    __ ldp(c_rarg6, c_rarg7, parameter_size);

    // leave frame and return to caller
    __ leave();
    __ ret(lr);

    // handle return types different from T_INT
    __ BIND(check_prim);
    if (InlineTypeReturnedAsFields) {
      // Check for scalarized return value
      __ tbz(r0, 0, is_long);
      // Load pack handler address
      __ andr(rscratch1, r0, -2);
      __ ldr(rscratch1, Address(rscratch1, InstanceKlass::adr_inlineklass_fixed_block_offset()));
      __ ldr(rscratch1, Address(rscratch1, InlineKlass::pack_handler_jobject_offset()));
      __ blr(rscratch1);
      __ b(exit);
    }

    __ BIND(is_long);
    __ str(r0, Address(Rresult, 0));
    __ br(Assembler::AL, exit);

    __ BIND(is_float);
    __ strs(j_farg0, Address(Rresult, 0));
    __ br(Assembler::AL, exit);

    __ BIND(is_double);
    __ strd(j_farg0, Address(Rresult, 0));
    __ br(Assembler::AL, exit);

    return start;
  }

  // Return point for a Java call if there's an exception thrown in
  // Java code. The exception is caught and transformed into a
  // pending exception stored in JavaThread that can be tested from
  // within the VM.
  //
  // Note: Usually the parameters are removed by the callee. In case
  // of an exception crossing an activation frame boundary, that is
  // not the case if the callee is compiled code => need to setup the
  // rsp.
  //
  // r0: exception oop

  address generate_catch_exception() {
    StubId stub_id = StubId::stubgen_catch_exception_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();

    // same as in generate_call_stub():
    const Address sp_after_call(rfp, sp_after_call_off * wordSize);
    const Address thread       (rfp, thread_off        * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ldr(rscratch1, thread);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::NE, S);
      __ get_thread(rscratch1);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::EQ, L);
      __ bind(S);
      __ stop("StubRoutines::catch_exception: threads must correspond");
      __ bind(L);
    }
#endif

    // set pending exception
    __ verify_oop(r0);

    __ str(r0, Address(rthread, Thread::pending_exception_offset()));
    __ mov(rscratch1, (address)__FILE__);
    __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
    __ movw(rscratch1, (int)__LINE__);
    __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));

    // complete return to VM
    assert(StubRoutines::_call_stub_return_address != nullptr,
           "_call_stub_return_address must have been generated before");
    __ b(StubRoutines::_call_stub_return_address);

    return start;
  }

  // Continuation point for runtime calls returning with a pending
  // exception. The pending exception check happened in the runtime
  // or native call stub. The pending exception in Thread is
  // converted into a Java-level exception.
  //
  // Contract with Java-level exception handlers:
  // r0: exception
  // r3: throwing pc
  //
  // NOTE: At entry of this stub, exception-pc must be in LR !!

  // NOTE: this is always used as a jump target within generated code
  // so it just needs to be generated code with no prolog

  address generate_forward_exception() {
    StubId stub_id = StubId::stubgen_forward_exception_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();

    // Upon entry, LR points to the return address returning into
    // Java (interpreted or compiled) code; i.e., the return address
    // becomes the throwing pc.
    //
    // Arguments pushed before the runtime call are still on the stack
    // but the exception handler will reset the stack pointer ->
    // ignore them. A potential result in registers can be ignored as
    // well.

#ifdef ASSERT
    // make sure this code is only executed if there is a pending exception
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
      __ cbnz(rscratch1, L);
      __ stop("StubRoutines::forward exception: no pending exception (1)");
      __ bind(L);
    }
#endif

    // compute exception handler into r19

    // call the VM to find the handler address associated with the
    // caller address. pass thread in r0 and caller pc (ret address)
    // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
    // the stack.
    __ mov(c_rarg1, lr);
    // lr will be trashed by the VM call so we move it to R19
    // (callee-saved) because we also need to pass it to the handler
    // returned by this call.
    __ mov(r19, lr);
    BLOCK_COMMENT("call exception_handler_for_return_address");
    __ call_VM_leaf(CAST_FROM_FN_PTR(address,
                         SharedRuntime::exception_handler_for_return_address),
                    rthread, c_rarg1);
    // Reinitialize the ptrue predicate register, in case the external runtime
    // call clobbers ptrue reg, as we may return to SVE compiled code.
    __ reinitialize_ptrue();

    // we should not really care that lr is no longer the callee
    // address. we saved the value the handler needs in r19 so we can
    // just copy it to r3. however, the C2 handler will push its own
    // frame and then call into the VM, and the VM code asserts that
    // the PC for the frame above the handler belongs to a compiled
    // Java method. So, we restore lr here to satisfy that assert.
    __ mov(lr, r19);
    // setup r0 & r3 & clear pending exception
    __ mov(r3, r19);
    __ mov(r19, r0);
    __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
    __ str(zr, Address(rthread, Thread::pending_exception_offset()));

#ifdef ASSERT
    // make sure exception is set
    {
      Label L;
      __ cbnz(r0, L);
      __ stop("StubRoutines::forward exception: no pending exception (2)");
      __ bind(L);
    }
#endif

    // continue at exception handler
    // r0: exception
    // r3: throwing pc
    // r19: exception handler
    __ verify_oop(r0);
    __ br(r19);

    return start;
  }

  // Non-destructive plausibility checks for oops
  //
  // Arguments:
  //    r0: oop to verify
  //    rscratch1: error message
  //
  // Stack after saving c_rarg3:
  //    [tos + 0]: saved c_rarg3
  //    [tos + 1]: saved c_rarg2
  //    [tos + 2]: saved lr
  //    [tos + 3]: saved rscratch2
  //    [tos + 4]: saved r0
  //    [tos + 5]: saved rscratch1
  address generate_verify_oop() {
    StubId stub_id = StubId::stubgen_verify_oop_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();

    Label exit, error;

    // save c_rarg2 and c_rarg3
    __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));

    // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ ldr(c_rarg3, Address(c_rarg2));
    __ add(c_rarg3, c_rarg3, 1);
    __ str(c_rarg3, Address(c_rarg2));

    // object is in r0
    // make sure object is 'reasonable'
    __ cbz(r0, exit); // if obj is null it is OK

    BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
    bs_asm->check_oop(_masm, r0, c_rarg2, c_rarg3, error);

    // return if everything seems ok
    __ bind(exit);

    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
    __ ret(lr);

    // handle errors
    __ bind(error);
    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));

    __ push(RegSet::range(r0, r29), sp);
    // debug(char* msg, int64_t pc, int64_t regs[])
    __ mov(c_rarg0, rscratch1);      // pass address of error message
    __ mov(c_rarg1, lr);             // pass return address
    __ mov(c_rarg2, sp);             // pass address of regs on stack
#ifndef PRODUCT
    assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
#endif
    BLOCK_COMMENT("call MacroAssembler::debug");
    __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
    __ blr(rscratch1);
    __ hlt(0);

    return start;
  }

  // Generate indices for iota vector.
  address generate_iota_indices(StubId stub_id) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, stub_id);
    address start = __ pc();
    // B
    __ emit_data64(0x0706050403020100, relocInfo::none);
    __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none);
    // H
    __ emit_data64(0x0003000200010000, relocInfo::none);
    __ emit_data64(0x0007000600050004, relocInfo::none);
    // S
    __ emit_data64(0x0000000100000000, relocInfo::none);
    __ emit_data64(0x0000000300000002, relocInfo::none);
    // D
    __ emit_data64(0x0000000000000000, relocInfo::none);
    __ emit_data64(0x0000000000000001, relocInfo::none);
    // S - FP
    __ emit_data64(0x3F80000000000000, relocInfo::none); // 0.0f, 1.0f
    __ emit_data64(0x4040000040000000, relocInfo::none); // 2.0f, 3.0f
    // D - FP
    __ emit_data64(0x0000000000000000, relocInfo::none); // 0.0d
    __ emit_data64(0x3FF0000000000000, relocInfo::none); // 1.0d
    return start;
  }

  // The inner part of zero_words(). This is the bulk operation,
  // zeroing words in blocks, possibly using DC ZVA to do it. The
  // caller is responsible for zeroing the last few words.
  //
  // Inputs:
  // r10: the HeapWord-aligned base address of an array to zero.
  // r11: the count in HeapWords, r11 > 0.
  //
  // Returns r10 and r11, adjusted for the caller to clear.
  // r10: the base address of the tail of words left to clear.
  // r11: the number of words in the tail.
  //      r11 < MacroAssembler::zero_words_block_size.
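  //
  // For example (illustrative numbers only): with a 64-byte ZVA block size
  // and BlockZeroingLowLimit at 256, low_limit below works out to 256 bytes,
  // so DC ZVA is only attempted when at least 32 words remain; shorter runs
  // fall straight through to the unrolled stp loop.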

  address generate_zero_blocks() {
    Label done;
    Label base_aligned;

    Register base = r10, cnt = r11;

    __ align(CodeEntryAlignment);
    StubId stub_id = StubId::stubgen_zero_blocks_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();

    if (UseBlockZeroing) {
      int zva_length = VM_Version::zva_length();

      // Ensure ZVA length can be divided by 16. This is required by
      // the subsequent operations.
      assert (zva_length % 16 == 0, "Unexpected ZVA Length");

      __ tbz(base, 3, base_aligned);
      __ str(zr, Address(__ post(base, 8)));
      __ sub(cnt, cnt, 1);
      __ bind(base_aligned);

      // Ensure count >= zva_length * 2 so that it still deserves a zva after
      // alignment.
      Label small;
      int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
      __ subs(rscratch1, cnt, low_limit >> 3);
      __ br(Assembler::LT, small);
      __ zero_dcache_blocks(base, cnt);
      __ bind(small);
    }

    {
      // Number of stp instructions we'll unroll
      const int unroll =
        MacroAssembler::zero_words_block_size / 2;
      // Clear the remaining blocks.
      Label loop;
      __ subs(cnt, cnt, unroll * 2);
      __ br(Assembler::LT, done);
      __ bind(loop);
      for (int i = 0; i < unroll; i++)
        __ stp(zr, zr, __ post(base, 16));
      __ subs(cnt, cnt, unroll * 2);
      __ br(Assembler::GE, loop);
      __ bind(done);
      __ add(cnt, cnt, unroll * 2);
    }

    __ ret(lr);

    return start;
  }


  typedef enum {
    copy_forwards = 1,
    copy_backwards = -1
  } copy_direction;

  // Helper object to reduce noise when telling the GC barriers how to perform loads and stores
  // for arraycopy stubs.
  class ArrayCopyBarrierSetHelper : StackObj {
    BarrierSetAssembler* _bs_asm;
    MacroAssembler* _masm;
    DecoratorSet _decorators;
    BasicType _type;
    Register _gct1;
    Register _gct2;
    Register _gct3;
    FloatRegister _gcvt1;
    FloatRegister _gcvt2;
    FloatRegister _gcvt3;

  public:
    ArrayCopyBarrierSetHelper(MacroAssembler* masm,
                              DecoratorSet decorators,
                              BasicType type,
                              Register gct1,
                              Register gct2,
                              Register gct3,
                              FloatRegister gcvt1,
                              FloatRegister gcvt2,
                              FloatRegister gcvt3)
      : _bs_asm(BarrierSet::barrier_set()->barrier_set_assembler()),
        _masm(masm),
        _decorators(decorators),
        _type(type),
        _gct1(gct1),
        _gct2(gct2),
        _gct3(gct3),
        _gcvt1(gcvt1),
        _gcvt2(gcvt2),
        _gcvt3(gcvt3) {
    }

    void copy_load_at_32(FloatRegister dst1, FloatRegister dst2, Address src) {
      _bs_asm->copy_load_at(_masm, _decorators, _type, 32,
                            dst1, dst2, src,
                            _gct1, _gct2, _gcvt1);
    }

    void copy_store_at_32(Address dst, FloatRegister src1, FloatRegister src2) {
      _bs_asm->copy_store_at(_masm, _decorators, _type, 32,
                             dst, src1, src2,
                             _gct1, _gct2, _gct3, _gcvt1, _gcvt2, _gcvt3);
    }

    void copy_load_at_16(Register dst1, Register dst2, Address src) {
      _bs_asm->copy_load_at(_masm, _decorators, _type, 16,
                            dst1, dst2, src,
                            _gct1);
    }

    void copy_store_at_16(Address dst, Register src1, Register src2) {
      _bs_asm->copy_store_at(_masm, _decorators, _type, 16,
                             dst, src1, src2,
                             _gct1, _gct2, _gct3);
    }

    void copy_load_at_8(Register dst, Address src) {
      _bs_asm->copy_load_at(_masm, _decorators, _type, 8,
                            dst, noreg, src,
                            _gct1);
    }

    void copy_store_at_8(Address dst, Register src) {
      _bs_asm->copy_store_at(_masm, _decorators, _type, 8,
                             dst, src, noreg,
                             _gct1, _gct2, _gct3);
    }
  };
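
  // Illustrative use, mirroring the copy loops below: every load/store goes
  // through the BarrierSetAssembler so that collectors which need load/store
  // barriers (e.g. ZGC) see each element that is copied.
  //
  //   ArrayCopyBarrierSetHelper bs(_masm, decorators, type,
  //                                gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);
  //   bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));  // load a pair via the GC barrier
  //   bs.copy_store_at_16(Address(d, 2 * unit), t0, t1); // store it via the GC barrier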

  // Bulk copy of blocks of 8 words.
  //
  // count is a count of words.
  //
  // Precondition: count >= 8
  //
  // Postconditions:
  //
  // The least significant bit of count contains the remaining count
  // of words to copy. The rest of count is trash.
  //
  // s and d are adjusted to point to the remaining words to copy
  //
  void generate_copy_longs(StubId stub_id, DecoratorSet decorators, Label &start, Register s, Register d, Register count) {
    BasicType type;
    copy_direction direction;

    switch (stub_id) {
    case StubId::stubgen_copy_byte_f_id:
      direction = copy_forwards;
      type = T_BYTE;
      break;
    case StubId::stubgen_copy_byte_b_id:
      direction = copy_backwards;
      type = T_BYTE;
      break;
    case StubId::stubgen_copy_oop_f_id:
      direction = copy_forwards;
      type = T_OBJECT;
      break;
    case StubId::stubgen_copy_oop_b_id:
      direction = copy_backwards;
      type = T_OBJECT;
      break;
    case StubId::stubgen_copy_oop_uninit_f_id:
      direction = copy_forwards;
      type = T_OBJECT;
      break;
    case StubId::stubgen_copy_oop_uninit_b_id:
      direction = copy_backwards;
      type = T_OBJECT;
      break;
    default:
      ShouldNotReachHere();
    }

    int unit = wordSize * direction;
    int bias = (UseSIMDForMemoryOps ? 4 : 2) * wordSize;

    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
      t4 = r7, t5 = r11, t6 = r12, t7 = r13;
    const Register stride = r14;
    const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
    const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
    ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);

    assert_different_registers(rscratch1, rscratch2, t0, t1, t2, t3, t4, t5, t6, t7);
    assert_different_registers(s, d, count, rscratch1, rscratch2);

    Label again, drain;

    __ align(CodeEntryAlignment);

    StubCodeMark mark(this, stub_id);

    __ bind(start);

    Label unaligned_copy_long;
    if (AvoidUnalignedAccesses) {
      __ tbnz(d, 3, unaligned_copy_long);
    }

    if (direction == copy_forwards) {
      __ sub(s, s, bias);
      __ sub(d, d, bias);
    }

#ifdef ASSERT
    // Make sure we are never given < 8 words
    {
      Label L;
      __ cmp(count, (u1)8);
      __ br(Assembler::GE, L);
      __ stop("generate_copy_longs called with < 8 words");
      __ bind(L);
    }
#endif

    // Fill 8 registers
    if (UseSIMDForMemoryOps) {
      bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
      bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
      bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
      bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
      bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 16);
    __ br(Assembler::LO, drain);

    int prefetch = PrefetchCopyIntervalInBytes;
    bool use_stride = false;
    if (direction == copy_backwards) {
      use_stride = prefetch > 256;
      prefetch = -prefetch;
      if (use_stride) __ mov(stride, prefetch);
    }

    __ bind(again);

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

    if (UseSIMDForMemoryOps) {
      bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
      bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
      bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
      bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
      bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
      bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
      bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
      bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
      bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
      bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
      bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 8);
    __ br(Assembler::HS, again);

    // Drain
    __ bind(drain);
    if (UseSIMDForMemoryOps) {
      bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
      bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
    } else {
      bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
      bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
      bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
      bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
    }

    {
      Label L1, L2;
      __ tbz(count, exact_log2(4), L1);
      if (UseSIMDForMemoryOps) {
        bs.copy_load_at_32(v0, v1, Address(__ pre(s, 4 * unit)));
        bs.copy_store_at_32(Address(__ pre(d, 4 * unit)), v0, v1);
      } else {
        bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
        bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
        bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
        bs.copy_store_at_16(Address(__ pre(d, 4 * unit)), t2, t3);
      }
      __ bind(L1);

      if (direction == copy_forwards) {
        __ add(s, s, bias);
        __ add(d, d, bias);
      }

      __ tbz(count, 1, L2);
      bs.copy_load_at_16(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
      bs.copy_store_at_16(Address(__ adjust(d, 2 * unit, direction == copy_backwards)), t0, t1);
      __ bind(L2);
    }

    __ ret(lr);

    if (AvoidUnalignedAccesses) {
      Label drain, again;
      // Register order for storing. Order is different for backward copy.

      __ bind(unaligned_copy_long);

      // source address is even aligned, target odd aligned
      //
      // when forward copying word pairs we read long pairs at offsets
      // {0, 2, 4, 6} (in long words). when backwards copying we read
      // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
      // address by -2 in the forwards case so we can compute the
      // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
      // or -1.
      //
      // when forward copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
      // zero offset we adjust the destination by -1 which means we
      // have to use offsets {1, 2, 4, 6, 8} * unit for the stores.
      //
      // When backwards copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
      // offsets {1, 3, 5, 7, 8} * unit.

      if (direction == copy_forwards) {
        __ sub(s, s, 16);
        __ sub(d, d, 8);
      }

      // Fill 8 registers
      //
      // for forwards copy s was offset by -16 from the original input
      // value of s so the register contents are at these offsets
      // relative to the 64 bit block addressed by that original input
      // and so on for each successive 64 byte block when s is updated
      //
      // t0 at offset 0,  t1 at offset 8
      // t2 at offset 16, t3 at offset 24
      // t4 at offset 32, t5 at offset 40
      // t6 at offset 48, t7 at offset 56

      // for backwards copy s was not offset so the register contents
      // are at these offsets into the preceding 64 byte block
      // relative to that original input and so on for each successive
      // preceding 64 byte block when s is updated. this explains the
      // slightly counter-intuitive looking pattern of register usage
      // in the stp instructions for backwards copy.
      //
      // t0 at offset -16, t1 at offset -8
      // t2 at offset -32, t3 at offset -24
      // t4 at offset -48, t5 at offset -40
      // t6 at offset -64, t7 at offset -56

      bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
      bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
      bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
      bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));

      __ subs(count, count, 16);
      __ br(Assembler::LO, drain);

      int prefetch = PrefetchCopyIntervalInBytes;
      bool use_stride = false;
      if (direction == copy_backwards) {
        use_stride = prefetch > 256;
        prefetch = -prefetch;
        if (use_stride) __ mov(stride, prefetch);
      }

      __ bind(again);

      if (PrefetchCopyIntervalInBytes > 0)
        __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

      if (direction == copy_forwards) {
        // allowing for the offset of -8 the store instructions place
        // registers into the target 64 bit block at the following
        // offsets
        //
        // t0 at offset 0
        // t1 at offset 8,  t2 at offset 16
        // t3 at offset 24, t4 at offset 32
        // t5 at offset 40, t6 at offset 48
        // t7 at offset 56

        bs.copy_store_at_8(Address(d, 1 * unit), t0);
        bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
        bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
        bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
        bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
        bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
        bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
        bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
        bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
      } else {
        // d was not offset when we started so the registers are
        // written into the 64 bit block preceding d with the following
        // offsets
        //
        // t1 at offset -8
        // t3 at offset -24, t0 at offset -16
        // t5 at offset -40, t2 at offset -32
        // t7 at offset -56, t4 at offset -48
        // t6 at offset -64
        //
        // note that this matches the offsets previously noted for the
        // loads

        bs.copy_store_at_8(Address(d, 1 * unit), t1);
        bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
        bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
        bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
        bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
        bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
        bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
        bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
        bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
      }

      __ subs(count, count, 8);
      __ br(Assembler::HS, again);

      // Drain
      //
      // this uses the same pattern of offsets and register arguments
      // as above
      __ bind(drain);
      if (direction == copy_forwards) {
        bs.copy_store_at_8(Address(d, 1 * unit), t0);
        bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
        bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
        bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
        bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
      } else {
        bs.copy_store_at_8(Address(d, 1 * unit), t1);
        bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
        bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
        bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
        bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
      }
      // now we need to copy any remaining part block which may
      // include a 4 word subblock and/or a 2 word subblock.
      // bits 2 and 1 in the count are the tell-tale for whether we
      // have each such subblock
      {
        Label L1, L2;
        __ tbz(count, exact_log2(4), L1);
        // this is the same as above but copying only 4 longs hence
        // with only one intervening stp between the str instructions
        // but note that the offsets and registers still follow the
        // same pattern
        bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
        bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
        if (direction == copy_forwards) {
          bs.copy_store_at_8(Address(d, 1 * unit), t0);
          bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
          bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t3);
        } else {
          bs.copy_store_at_8(Address(d, 1 * unit), t1);
          bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
          bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t2);
        }
        __ bind(L1);

        __ tbz(count, 1, L2);
        // this is the same as above but copying only 2 longs hence
        // there is no intervening stp between the str instructions
        // but note that the offset and register patterns are still
        // the same
        bs.copy_load_at_16(t0, t1, Address(__ pre(s, 2 * unit)));
        if (direction == copy_forwards) {
          bs.copy_store_at_8(Address(d, 1 * unit), t0);
          bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t1);
        } else {
          bs.copy_store_at_8(Address(d, 1 * unit), t1);
          bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t0);
        }
        __ bind(L2);

        // for forwards copy we need to re-adjust the offsets we
        // applied so that s and d follow the last words written

        if (direction == copy_forwards) {
          __ add(s, s, 16);
          __ add(d, d, 8);
        }

      }

      __ ret(lr);
    }
  }

  // Small copy: less than 16 bytes.
  //
  // NB: Ignores all of the bits of count which represent more than 15
  // bytes, so a caller doesn't have to mask them.

  void copy_memory_small(DecoratorSet decorators, BasicType type, Register s, Register d, Register count, int step) {
    bool is_backwards = step < 0;
    size_t granularity = g_uabs(step);
    int direction = is_backwards ? -1 : 1;

    Label Lword, Lint, Lshort, Lbyte;

    assert(granularity
           && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");

    const Register t0 = r3;
    const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
    ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, fnoreg, fnoreg, fnoreg);

    // ??? I don't know if this bit-test-and-branch is the right thing
    // to do. It does a lot of jumping, resulting in several
    // mispredicted branches. It might make more sense to do this
    // with something like Duff's device with a single computed branch.
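    //
    // Each tbz below tests the bit of count that corresponds to one
    // power-of-two chunk of the remaining tail: with byte granularity,
    // bit 3 selects an 8 byte copy, bit 2 a 4 byte copy, bit 1 a 2 byte
    // copy and bit 0 a single byte; for wider element types the bit
    // indices shift down by exact_log2(granularity).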

    __ tbz(count, 3 - exact_log2(granularity), Lword);
    bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
    bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
    __ bind(Lword);

    if (granularity <= sizeof (jint)) {
      __ tbz(count, 2 - exact_log2(granularity), Lint);
      __ ldrw(t0, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
      __ strw(t0, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
      __ bind(Lint);
    }

    if (granularity <= sizeof (jshort)) {
      __ tbz(count, 1 - exact_log2(granularity), Lshort);
      __ ldrh(t0, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
      __ strh(t0, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
      __ bind(Lshort);
    }

    if (granularity <= sizeof (jbyte)) {
      __ tbz(count, 0, Lbyte);
      __ ldrb(t0, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
      __ strb(t0, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
      __ bind(Lbyte);
    }
  }

  Label copy_f, copy_b;
  Label copy_obj_f, copy_obj_b;
  Label copy_obj_uninit_f, copy_obj_uninit_b;

  // All-singing all-dancing memory copy.
  //
  // Copy count units of memory from s to d. The size of a unit is
  // step, which can be positive or negative depending on the direction
  // of copy. If is_aligned is false, we align the source address.
  //

  void copy_memory(DecoratorSet decorators, BasicType type, bool is_aligned,
                   Register s, Register d, Register count, int step) {
    copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
    bool is_backwards = step < 0;
    unsigned int granularity = g_uabs(step);
    const Register t0 = r3, t1 = r4;

    // <= 80 (or 96 for SIMD) bytes do inline. Direction doesn't matter because we always
    // load all the data before writing anything
    Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
    const Register t2 = r5, t3 = r6, t4 = r7, t5 = r11;
    const Register t6 = r12, t7 = r13, t8 = r14, t9 = r15;
    const Register send = r17, dend = r16;
    const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
    const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
    ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(Address(s, 0), PLDL1KEEP);
    __ cmp(count, u1((UseSIMDForMemoryOps ? 96 : 80) / granularity));
    __ br(Assembler::HI, copy_big);

    __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
    __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));

    __ cmp(count, u1(16/granularity));
    __ br(Assembler::LS, copy16);

    __ cmp(count, u1(64/granularity));
    __ br(Assembler::HI, copy80);

    __ cmp(count, u1(32/granularity));
    __ br(Assembler::LS, copy32);

    // 33..64 bytes
    if (UseSIMDForMemoryOps) {
      bs.copy_load_at_32(v0, v1, Address(s, 0));
      bs.copy_load_at_32(v2, v3, Address(send, -32));
      bs.copy_store_at_32(Address(d, 0), v0, v1);
      bs.copy_store_at_32(Address(dend, -32), v2, v3);
    } else {
      bs.copy_load_at_16(t0, t1, Address(s, 0));
      bs.copy_load_at_16(t2, t3, Address(s, 16));
      bs.copy_load_at_16(t4, t5, Address(send, -32));
      bs.copy_load_at_16(t6, t7, Address(send, -16));

      bs.copy_store_at_16(Address(d, 0), t0, t1);
      bs.copy_store_at_16(Address(d, 16), t2, t3);
      bs.copy_store_at_16(Address(dend, -32), t4, t5);
      bs.copy_store_at_16(Address(dend, -16), t6, t7);
    }
    __ b(finish);

    // 17..32 bytes
    __ bind(copy32);
    bs.copy_load_at_16(t0, t1, Address(s, 0));
    bs.copy_load_at_16(t6, t7, Address(send, -16));

    bs.copy_store_at_16(Address(d, 0), t0, t1);
    bs.copy_store_at_16(Address(dend, -16), t6, t7);
    __ b(finish);

    // 65..80/96 bytes
    // (96 bytes if SIMD because we do 32 bytes per instruction)
    __ bind(copy80);
    if (UseSIMDForMemoryOps) {
      bs.copy_load_at_32(v0, v1, Address(s, 0));
      bs.copy_load_at_32(v2, v3, Address(s, 32));
      // Unaligned pointers can be an issue for copying.
      // The issue is more likely when the granularity of the data is
      // less than 4 (sizeof(jint)): pointers for arrays of jint are at
      // least 4 byte aligned and pointers for arrays of jlong are 8 byte
      // aligned. The biggest performance drop has been seen for the range
      // 65-80 bytes. For such cases using a pair of ldp/stp instead of a
      // third pair of ldpq/stpq fixes the performance issue.
      if (granularity < sizeof (jint)) {
        Label copy96;
        __ cmp(count, u1(80/granularity));
        __ br(Assembler::HI, copy96);
        bs.copy_load_at_16(t0, t1, Address(send, -16));

        bs.copy_store_at_32(Address(d, 0), v0, v1);
        bs.copy_store_at_32(Address(d, 32), v2, v3);

        bs.copy_store_at_16(Address(dend, -16), t0, t1);
        __ b(finish);

        __ bind(copy96);
      }
      bs.copy_load_at_32(v4, v5, Address(send, -32));

      bs.copy_store_at_32(Address(d, 0), v0, v1);
      bs.copy_store_at_32(Address(d, 32), v2, v3);

      bs.copy_store_at_32(Address(dend, -32), v4, v5);
    } else {
      bs.copy_load_at_16(t0, t1, Address(s, 0));
      bs.copy_load_at_16(t2, t3, Address(s, 16));
      bs.copy_load_at_16(t4, t5, Address(s, 32));
      bs.copy_load_at_16(t6, t7, Address(s, 48));
      bs.copy_load_at_16(t8, t9, Address(send, -16));

      bs.copy_store_at_16(Address(d, 0), t0, t1);
      bs.copy_store_at_16(Address(d, 16), t2, t3);
      bs.copy_store_at_16(Address(d, 32), t4, t5);
      bs.copy_store_at_16(Address(d, 48), t6, t7);
      bs.copy_store_at_16(Address(dend, -16), t8, t9);
    }
    __ b(finish);

    // 0..16 bytes
    __ bind(copy16);
    __ cmp(count, u1(8/granularity));
    __ br(Assembler::LO, copy8);

    // 8..16 bytes
    bs.copy_load_at_8(t0, Address(s, 0));
    bs.copy_load_at_8(t1, Address(send, -8));
    bs.copy_store_at_8(Address(d, 0), t0);
    bs.copy_store_at_8(Address(dend, -8), t1);
    __ b(finish);

    if (granularity < 8) {
      // 4..7 bytes
      __ bind(copy8);
      __ tbz(count, 2 - exact_log2(granularity), copy4);
      __ ldrw(t0, Address(s, 0));
      __ ldrw(t1, Address(send, -4));
      __ strw(t0, Address(d, 0));
      __ strw(t1, Address(dend, -4));
      __ b(finish);
      if (granularity < 4) {
        // 0..3 bytes
        __ bind(copy4);
        __ cbz(count, finish); // get rid of 0 case
        if (granularity == 2) {
          __ ldrh(t0, Address(s, 0));
          __ strh(t0, Address(d, 0));
        } else { // granularity == 1
          // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
          // the first and last byte.
          // Handle the 3 byte case by loading and storing base + count/2
          // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
          // This does mean in the 1 byte case we load/store the same
          // byte 3 times.
          __ lsr(count, count, 1);
          __ ldrb(t0, Address(s, 0));
          __ ldrb(t1, Address(send, -1));
          __ ldrb(t2, Address(s, count));
          __ strb(t0, Address(d, 0));
          __ strb(t1, Address(dend, -1));
          __ strb(t2, Address(d, count));
        }
        __ b(finish);
      }
    }

    __ bind(copy_big);
    if (is_backwards) {
      __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
      __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
    }

    // Now we've got the small case out of the way we can align the
    // source address on a 2-word boundary.

    // Here we will materialize a count in r15, which is used by copy_memory_small
    // and the various generate_copy_longs stubs that we use for 2 word aligned bytes.
    // Up until here, we have used t9, which aliases r15, but from here on, that register
    // can not be used as a temp register, as it contains the count.

    Label aligned;

    if (is_aligned) {
      // We may have to adjust by 1 word to get s 2-word-aligned.
      __ tbz(s, exact_log2(wordSize), aligned);
      bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
      bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
      __ sub(count, count, wordSize/granularity);
    } else {
      if (is_backwards) {
        __ andr(r15, s, 2 * wordSize - 1);
      } else {
        __ neg(r15, s);
        __ andr(r15, r15, 2 * wordSize - 1);
      }
      // r15 is the byte adjustment needed to align s.
      __ cbz(r15, aligned);
      int shift = exact_log2(granularity);
      if (shift > 0) {
        __ lsr(r15, r15, shift);
      }
      __ sub(count, count, r15);

#if 0
      // ?? This code is only correct for a disjoint copy. It may or
      // may not make sense to use it in that case.

      // Copy the first pair; s and d may not be aligned.
      __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
      __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));

      // Align s and d, adjust count
      if (is_backwards) {
        __ sub(s, s, r15);
        __ sub(d, d, r15);
      } else {
        __ add(s, s, r15);
        __ add(d, d, r15);
      }
#else
      copy_memory_small(decorators, type, s, d, r15, step);
#endif
    }

    __ bind(aligned);

    // s is now 2-word-aligned.

    // We have a count of units and some trailing bytes. Adjust the
    // count and do a bulk copy of words. If the shift is zero
    // perform a move instead to benefit from zero latency moves.
    int shift = exact_log2(wordSize/granularity);
    if (shift > 0) {
      __ lsr(r15, count, shift);
    } else {
      __ mov(r15, count);
    }
    if (direction == copy_forwards) {
      if (type != T_OBJECT) {
        __ bl(copy_f);
      } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
        __ bl(copy_obj_uninit_f);
      } else {
        __ bl(copy_obj_f);
      }
    } else {
      if (type != T_OBJECT) {
        __ bl(copy_b);
      } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
        __ bl(copy_obj_uninit_b);
      } else {
        __ bl(copy_obj_b);
      }
    }

    // And the tail.
    copy_memory_small(decorators, type, s, d, count, step);

    if (granularity >= 8) __ bind(copy8);
    if (granularity >= 4) __ bind(copy4);
    __ bind(finish);
  }


  void clobber_registers() {
#ifdef ASSERT
    RegSet clobbered
      = MacroAssembler::call_clobbered_gp_registers() - rscratch1;
    __ mov(rscratch1, (uint64_t)0xdeadbeef);
    __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
    for (RegSetIterator<Register> it = clobbered.begin(); *it != noreg; ++it) {
      __ mov(*it, rscratch1);
    }
#endif

  }

  // Scan over array at a for count oops, verifying each one.
  // Preserves a and count, clobbers rscratch1 and rscratch2.
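  // size is the element size in bytes: wordSize for uncompressed oops and
  // sizeof(jint) for compressed oops, matching what the arraycopy
  // generators below pass in.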
1506 void verify_oop_array (int size, Register a, Register count, Register temp) { 1507 Label loop, end; 1508 __ mov(rscratch1, a); 1509 __ mov(rscratch2, zr); 1510 __ bind(loop); 1511 __ cmp(rscratch2, count); 1512 __ br(Assembler::HS, end); 1513 if (size == wordSize) { 1514 __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size)))); 1515 __ verify_oop(temp); 1516 } else { 1517 __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size)))); 1518 __ decode_heap_oop(temp); // calls verify_oop 1519 } 1520 __ add(rscratch2, rscratch2, 1); 1521 __ b(loop); 1522 __ bind(end); 1523 } 1524 1525 // Arguments: 1526 // stub_id - is used to name the stub and identify all details of 1527 // how to perform the copy. 1528 // 1529 // entry - is assigned to the stub's post push entry point unless 1530 // it is null 1531 // 1532 // Inputs: 1533 // c_rarg0 - source array address 1534 // c_rarg1 - destination array address 1535 // c_rarg2 - element count, treated as ssize_t, can be zero 1536 // 1537 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1538 // the hardware handle it. The two dwords within qwords that span 1539 // cache line boundaries will still be loaded and stored atomically. 1540 // 1541 // Side Effects: entry is set to the (post push) entry point so it 1542 // can be used by the corresponding conjoint copy 1543 // method 1544 // 1545 address generate_disjoint_copy(StubId stub_id, address *entry) { 1546 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 1547 RegSet saved_reg = RegSet::of(s, d, count); 1548 int size; 1549 bool aligned; 1550 bool is_oop; 1551 bool dest_uninitialized; 1552 switch (stub_id) { 1553 case StubId::stubgen_jbyte_disjoint_arraycopy_id: 1554 size = sizeof(jbyte); 1555 aligned = false; 1556 is_oop = false; 1557 dest_uninitialized = false; 1558 break; 1559 case StubId::stubgen_arrayof_jbyte_disjoint_arraycopy_id: 1560 size = sizeof(jbyte); 1561 aligned = true; 1562 is_oop = false; 1563 dest_uninitialized = false; 1564 break; 1565 case StubId::stubgen_jshort_disjoint_arraycopy_id: 1566 size = sizeof(jshort); 1567 aligned = false; 1568 is_oop = false; 1569 dest_uninitialized = false; 1570 break; 1571 case StubId::stubgen_arrayof_jshort_disjoint_arraycopy_id: 1572 size = sizeof(jshort); 1573 aligned = true; 1574 is_oop = false; 1575 dest_uninitialized = false; 1576 break; 1577 case StubId::stubgen_jint_disjoint_arraycopy_id: 1578 size = sizeof(jint); 1579 aligned = false; 1580 is_oop = false; 1581 dest_uninitialized = false; 1582 break; 1583 case StubId::stubgen_arrayof_jint_disjoint_arraycopy_id: 1584 size = sizeof(jint); 1585 aligned = true; 1586 is_oop = false; 1587 dest_uninitialized = false; 1588 break; 1589 case StubId::stubgen_jlong_disjoint_arraycopy_id: 1590 // since this is always aligned we can (should!) use the same 1591 // stub as for case StubId::stubgen_arrayof_jlong_disjoint_arraycopy 1592 ShouldNotReachHere(); 1593 break; 1594 case StubId::stubgen_arrayof_jlong_disjoint_arraycopy_id: 1595 size = sizeof(jlong); 1596 aligned = true; 1597 is_oop = false; 1598 dest_uninitialized = false; 1599 break; 1600 case StubId::stubgen_oop_disjoint_arraycopy_id: 1601 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1602 aligned = !UseCompressedOops; 1603 is_oop = true; 1604 dest_uninitialized = false; 1605 break; 1606 case StubId::stubgen_arrayof_oop_disjoint_arraycopy_id: 1607 size = UseCompressedOops ? 
sizeof (jint) : sizeof (jlong); 1608 aligned = !UseCompressedOops; 1609 is_oop = true; 1610 dest_uninitialized = false; 1611 break; 1612 case StubId::stubgen_oop_disjoint_arraycopy_uninit_id: 1613 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1614 aligned = !UseCompressedOops; 1615 is_oop = true; 1616 dest_uninitialized = true; 1617 break; 1618 case StubId::stubgen_arrayof_oop_disjoint_arraycopy_uninit_id: 1619 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1620 aligned = !UseCompressedOops; 1621 is_oop = true; 1622 dest_uninitialized = true; 1623 break; 1624 default: 1625 ShouldNotReachHere(); 1626 break; 1627 } 1628 1629 __ align(CodeEntryAlignment); 1630 StubCodeMark mark(this, stub_id); 1631 address start = __ pc(); 1632 __ enter(); 1633 1634 if (entry != nullptr) { 1635 *entry = __ pc(); 1636 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 1637 BLOCK_COMMENT("Entry:"); 1638 } 1639 1640 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT; 1641 if (dest_uninitialized) { 1642 decorators |= IS_DEST_UNINITIALIZED; 1643 } 1644 if (aligned) { 1645 decorators |= ARRAYCOPY_ALIGNED; 1646 } 1647 1648 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1649 bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg); 1650 1651 if (is_oop) { 1652 // save regs before copy_memory 1653 __ push(RegSet::of(d, count), sp); 1654 } 1655 { 1656 // UnsafeMemoryAccess page error: continue after unsafe access 1657 bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size); 1658 UnsafeMemoryAccessMark umam(this, add_entry, true); 1659 copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, size); 1660 } 1661 1662 if (is_oop) { 1663 __ pop(RegSet::of(d, count), sp); 1664 if (VerifyOops) 1665 verify_oop_array(size, d, count, r16); 1666 } 1667 1668 bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet()); 1669 1670 __ leave(); 1671 __ mov(r0, zr); // return 0 1672 __ ret(lr); 1673 return start; 1674 } 1675 1676 // Arguments: 1677 // stub_id - is used to name the stub and identify all details of 1678 // how to perform the copy. 1679 // 1680 // nooverlap_target - identifes the (post push) entry for the 1681 // corresponding disjoint copy routine which can be 1682 // jumped to if the ranges do not actually overlap 1683 // 1684 // entry - is assigned to the stub's post push entry point unless 1685 // it is null 1686 // 1687 // 1688 // Inputs: 1689 // c_rarg0 - source array address 1690 // c_rarg1 - destination array address 1691 // c_rarg2 - element count, treated as ssize_t, can be zero 1692 // 1693 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1694 // the hardware handle it. The two dwords within qwords that span 1695 // cache line boundaries will still be loaded and stored atomically. 
1696 // 1697 // Side Effects: 1698 // entry is set to the no-overlap entry point so it can be used by 1699 // some other conjoint copy method 1700 // 1701 address generate_conjoint_copy(StubId stub_id, address nooverlap_target, address *entry) { 1702 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 1703 RegSet saved_regs = RegSet::of(s, d, count); 1704 int size; 1705 bool aligned; 1706 bool is_oop; 1707 bool dest_uninitialized; 1708 switch (stub_id) { 1709 case StubId::stubgen_jbyte_arraycopy_id: 1710 size = sizeof(jbyte); 1711 aligned = false; 1712 is_oop = false; 1713 dest_uninitialized = false; 1714 break; 1715 case StubId::stubgen_arrayof_jbyte_arraycopy_id: 1716 size = sizeof(jbyte); 1717 aligned = true; 1718 is_oop = false; 1719 dest_uninitialized = false; 1720 break; 1721 case StubId::stubgen_jshort_arraycopy_id: 1722 size = sizeof(jshort); 1723 aligned = false; 1724 is_oop = false; 1725 dest_uninitialized = false; 1726 break; 1727 case StubId::stubgen_arrayof_jshort_arraycopy_id: 1728 size = sizeof(jshort); 1729 aligned = true; 1730 is_oop = false; 1731 dest_uninitialized = false; 1732 break; 1733 case StubId::stubgen_jint_arraycopy_id: 1734 size = sizeof(jint); 1735 aligned = false; 1736 is_oop = false; 1737 dest_uninitialized = false; 1738 break; 1739 case StubId::stubgen_arrayof_jint_arraycopy_id: 1740 size = sizeof(jint); 1741 aligned = true; 1742 is_oop = false; 1743 dest_uninitialized = false; 1744 break; 1745 case StubId::stubgen_jlong_arraycopy_id: 1746 // since this is always aligned we can (should!) use the same 1747 // stub as for case StubId::stubgen_arrayof_jlong_disjoint_arraycopy 1748 ShouldNotReachHere(); 1749 break; 1750 case StubId::stubgen_arrayof_jlong_arraycopy_id: 1751 size = sizeof(jlong); 1752 aligned = true; 1753 is_oop = false; 1754 dest_uninitialized = false; 1755 break; 1756 case StubId::stubgen_oop_arraycopy_id: 1757 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1758 aligned = !UseCompressedOops; 1759 is_oop = true; 1760 dest_uninitialized = false; 1761 break; 1762 case StubId::stubgen_arrayof_oop_arraycopy_id: 1763 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1764 aligned = !UseCompressedOops; 1765 is_oop = true; 1766 dest_uninitialized = false; 1767 break; 1768 case StubId::stubgen_oop_arraycopy_uninit_id: 1769 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1770 aligned = !UseCompressedOops; 1771 is_oop = true; 1772 dest_uninitialized = true; 1773 break; 1774 case StubId::stubgen_arrayof_oop_arraycopy_uninit_id: 1775 size = UseCompressedOops ? 
sizeof (jint) : sizeof (jlong); 1776 aligned = !UseCompressedOops; 1777 is_oop = true; 1778 dest_uninitialized = true; 1779 break; 1780 default: 1781 ShouldNotReachHere(); 1782 } 1783 1784 StubCodeMark mark(this, stub_id); 1785 address start = __ pc(); 1786 __ enter(); 1787 1788 if (entry != nullptr) { 1789 *entry = __ pc(); 1790 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 1791 BLOCK_COMMENT("Entry:"); 1792 } 1793 1794 // use fwd copy when (d-s) above_equal (count*size) 1795 __ sub(rscratch1, d, s); 1796 __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size)); 1797 __ br(Assembler::HS, nooverlap_target); 1798 1799 DecoratorSet decorators = IN_HEAP | IS_ARRAY; 1800 if (dest_uninitialized) { 1801 decorators |= IS_DEST_UNINITIALIZED; 1802 } 1803 if (aligned) { 1804 decorators |= ARRAYCOPY_ALIGNED; 1805 } 1806 1807 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1808 bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs); 1809 1810 if (is_oop) { 1811 // save regs before copy_memory 1812 __ push(RegSet::of(d, count), sp); 1813 } 1814 { 1815 // UnsafeMemoryAccess page error: continue after unsafe access 1816 bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size); 1817 UnsafeMemoryAccessMark umam(this, add_entry, true); 1818 copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, -size); 1819 } 1820 if (is_oop) { 1821 __ pop(RegSet::of(d, count), sp); 1822 if (VerifyOops) 1823 verify_oop_array(size, d, count, r16); 1824 } 1825 bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet()); 1826 __ leave(); 1827 __ mov(r0, zr); // return 0 1828 __ ret(lr); 1829 return start; 1830 } 1831 1832 // Helper for generating a dynamic type check. 1833 // Smashes rscratch1, rscratch2. 1834 void generate_type_check(Register sub_klass, 1835 Register super_check_offset, 1836 Register super_klass, 1837 Register temp1, 1838 Register temp2, 1839 Register result, 1840 Label& L_success) { 1841 assert_different_registers(sub_klass, super_check_offset, super_klass); 1842 1843 BLOCK_COMMENT("type_check:"); 1844 1845 Label L_miss; 1846 1847 __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, nullptr, 1848 super_check_offset); 1849 __ check_klass_subtype_slow_path(sub_klass, super_klass, temp1, temp2, &L_success, nullptr); 1850 1851 // Fall through on failure! 
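    // In rough pseudo-C the fast/slow path pair above amounts to the
    // following (field names are simplified stand-ins, not the exact
    // VM accessors):
    //
    //   bool is_subtype_of(Klass *sub, Klass *super, int super_check_offset) {
    //     // fast path: probe the cached/displaced super slot at the given offset
    //     if (*(Klass **)((char *)sub + super_check_offset) == super) return true;
    //     // slow path: linear scan of the secondary supers array
    //     for (int i = 0; i < sub->secondary_supers_len; i++) {
    //       if (sub->secondary_supers[i] == super) return true;
    //     }
    //     return false;
    //   }
    //
    // A hit branches to L_success; a definite miss falls through to
    // L_miss below.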
1852 __ BIND(L_miss); 1853 } 1854 1855 // 1856 // Generate checkcasting array copy stub 1857 // 1858 // Input: 1859 // c_rarg0 - source array address 1860 // c_rarg1 - destination array address 1861 // c_rarg2 - element count, treated as ssize_t, can be zero 1862 // c_rarg3 - size_t ckoff (super_check_offset) 1863 // c_rarg4 - oop ckval (super_klass) 1864 // 1865 // Output: 1866 // r0 == 0 - success 1867 // r0 == -1^K - failure, where K is partial transfer count 1868 // 1869 address generate_checkcast_copy(StubId stub_id, address *entry) { 1870 bool dest_uninitialized; 1871 switch (stub_id) { 1872 case StubId::stubgen_checkcast_arraycopy_id: 1873 dest_uninitialized = false; 1874 break; 1875 case StubId::stubgen_checkcast_arraycopy_uninit_id: 1876 dest_uninitialized = true; 1877 break; 1878 default: 1879 ShouldNotReachHere(); 1880 } 1881 1882 Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop; 1883 1884 // Input registers (after setup_arg_regs) 1885 const Register from = c_rarg0; // source array address 1886 const Register to = c_rarg1; // destination array address 1887 const Register count = c_rarg2; // elementscount 1888 const Register ckoff = c_rarg3; // super_check_offset 1889 const Register ckval = c_rarg4; // super_klass 1890 1891 RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4); 1892 RegSet wb_post_saved_regs = RegSet::of(count); 1893 1894 // Registers used as temps (r19, r20, r21, r22 are save-on-entry) 1895 const Register copied_oop = r22; // actual oop copied 1896 const Register count_save = r21; // orig elementscount 1897 const Register start_to = r20; // destination array start address 1898 const Register r19_klass = r19; // oop._klass 1899 1900 // Registers used as gc temps (r5, r6, r7 are save-on-call) 1901 const Register gct1 = r5, gct2 = r6, gct3 = r7; 1902 1903 //--------------------------------------------------------------- 1904 // Assembler stub will be used for this call to arraycopy 1905 // if the two arrays are subtypes of Object[] but the 1906 // destination array type is not equal to or a supertype 1907 // of the source type. Each element must be separately 1908 // checked. 1909 1910 assert_different_registers(from, to, count, ckoff, ckval, start_to, 1911 copied_oop, r19_klass, count_save); 1912 1913 __ align(CodeEntryAlignment); 1914 StubCodeMark mark(this, stub_id); 1915 address start = __ pc(); 1916 1917 __ enter(); // required for proper stackwalking of RuntimeStub frame 1918 1919 #ifdef ASSERT 1920 // caller guarantees that the arrays really are different 1921 // otherwise, we would have to make conjoint checks 1922 { Label L; 1923 __ b(L); // conjoint check not yet implemented 1924 __ stop("checkcast_copy within a single array"); 1925 __ bind(L); 1926 } 1927 #endif //ASSERT 1928 1929 // Caller of this entry point must set up the argument registers. 1930 if (entry != nullptr) { 1931 *entry = __ pc(); 1932 BLOCK_COMMENT("Entry:"); 1933 } 1934 1935 // Empty array: Nothing to do. 1936 __ cbz(count, L_done); 1937 __ push(RegSet::of(r19, r20, r21, r22), sp); 1938 1939 #ifdef ASSERT 1940 BLOCK_COMMENT("assert consistent ckoff/ckval"); 1941 // The ckoff and ckval must be mutually consistent, 1942 // even though caller generates both. 
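    // In effect the assertion below boils down to (sketch, not the
    // exact accessor spelling):
    //
    //   assert(ckoff == ckval->super_check_offset(), "inconsistent ckoff/ckval");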
1943 { Label L; 1944 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 1945 __ ldrw(start_to, Address(ckval, sco_offset)); 1946 __ cmpw(ckoff, start_to); 1947 __ br(Assembler::EQ, L); 1948 __ stop("super_check_offset inconsistent"); 1949 __ bind(L); 1950 } 1951 #endif //ASSERT 1952 1953 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT; 1954 bool is_oop = true; 1955 int element_size = UseCompressedOops ? 4 : 8; 1956 if (dest_uninitialized) { 1957 decorators |= IS_DEST_UNINITIALIZED; 1958 } 1959 1960 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1961 bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs); 1962 1963 // save the original count 1964 __ mov(count_save, count); 1965 1966 // Copy from low to high addresses 1967 __ mov(start_to, to); // Save destination array start address 1968 __ b(L_load_element); 1969 1970 // ======== begin loop ======== 1971 // (Loop is rotated; its entry is L_load_element.) 1972 // Loop control: 1973 // for (; count != 0; count--) { 1974 // copied_oop = load_heap_oop(from++); 1975 // ... generate_type_check ...; 1976 // store_heap_oop(to++, copied_oop); 1977 // } 1978 __ align(OptoLoopAlignment); 1979 1980 __ BIND(L_store_element); 1981 bs->copy_store_at(_masm, decorators, T_OBJECT, element_size, 1982 __ post(to, element_size), copied_oop, noreg, 1983 gct1, gct2, gct3); 1984 __ sub(count, count, 1); 1985 __ cbz(count, L_do_card_marks); 1986 1987 // ======== loop entry is here ======== 1988 __ BIND(L_load_element); 1989 bs->copy_load_at(_masm, decorators, T_OBJECT, element_size, 1990 copied_oop, noreg, __ post(from, element_size), 1991 gct1); 1992 __ cbz(copied_oop, L_store_element); 1993 1994 __ load_klass(r19_klass, copied_oop);// query the object klass 1995 1996 BLOCK_COMMENT("type_check:"); 1997 generate_type_check(/*sub_klass*/r19_klass, 1998 /*super_check_offset*/ckoff, 1999 /*super_klass*/ckval, 2000 /*r_array_base*/gct1, 2001 /*temp2*/gct2, 2002 /*result*/r10, L_store_element); 2003 2004 // Fall through on failure! 2005 2006 // ======== end loop ======== 2007 2008 // It was a real error; we must depend on the caller to finish the job. 2009 // Register count = remaining oops, count_orig = total oops. 2010 // Emit GC store barriers for the oops we have copied and report 2011 // their number to the caller. 2012 2013 __ subs(count, count_save, count); // K = partially copied oop count 2014 __ eon(count, count, zr); // report (-1^K) to caller 2015 __ br(Assembler::EQ, L_done_pop); 2016 2017 __ BIND(L_do_card_marks); 2018 bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs); 2019 2020 __ bind(L_done_pop); 2021 __ pop(RegSet::of(r19, r20, r21, r22), sp); 2022 inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr); 2023 2024 __ bind(L_done); 2025 __ mov(r0, count); 2026 __ leave(); 2027 __ ret(lr); 2028 2029 return start; 2030 } 2031 2032 // Perform range checks on the proposed arraycopy. 2033 // Kills temp, but nothing else. 2034 // Also, clean the sign bits of src_pos and dst_pos. 
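  // Conceptually the generated code performs the following (illustrative
  // C; array_length() is a hypothetical stand-in for reading the
  // arrayOop length field):
  //
  //   uint32_t src_end = (uint32_t)src_pos + (uint32_t)length;
  //   uint32_t dst_end = (uint32_t)dst_pos + (uint32_t)length;
  //   if (src_end > array_length(src)) goto L_failed;   // unsigned compare
  //   if (dst_end > array_length(dst)) goto L_failed;
  //   src_pos &= 0xffffffff;   // movw wreg, wreg zero-extends the value,
  //   dst_pos &= 0xffffffff;   // clearing any stale high 32 bits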
  void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
                              Register src_pos, // source position (c_rarg1)
                              Register dst,     // destination array oop (c_rarg2)
                              Register dst_pos, // destination position (c_rarg3)
                              Register length,
                              Register temp,
                              Label& L_failed) {
    BLOCK_COMMENT("arraycopy_range_checks:");

    assert_different_registers(rscratch1, temp);

    // if (src_pos + length > arrayOop(src)->length()) FAIL;
    __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
    __ addw(temp, length, src_pos);
    __ cmpw(temp, rscratch1);
    __ br(Assembler::HI, L_failed);

    // if (dst_pos + length > arrayOop(dst)->length()) FAIL;
    __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
    __ addw(temp, length, dst_pos);
    __ cmpw(temp, rscratch1);
    __ br(Assembler::HI, L_failed);

    // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
    __ movw(src_pos, src_pos);
    __ movw(dst_pos, dst_pos);

    BLOCK_COMMENT("arraycopy_range_checks done");
  }

  // These stubs get called from some dumb test routine.
  // I'll write them properly when they're called from
  // something that's actually doing something.
  static void fake_arraycopy_stub(address src, address dst, int count) {
    assert(count == 0, "huh?");
  }


  //
  //  Generate 'unsafe' array copy stub
  //  Though just as safe as the other stubs, it takes an unscaled
  //  size_t argument instead of an element count.
  //
  //  Input:
  //    c_rarg0   - source array address
  //    c_rarg1   - destination array address
  //    c_rarg2   - byte count, treated as ssize_t, can be zero
  //
  // Examines the alignment of the operands and dispatches
  // to a long, int, short, or byte copy loop.
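  // The dispatch can be modelled in C roughly as follows (illustrative
  // sketch; copy_longs/ints/shorts/bytes stand in for the element-copy
  // entry points this stub tail-branches to):
  //
  //   void unsafe_copy(char *s, char *d, size_t byte_count) {
  //     uintptr_t bits = (uintptr_t)s | (uintptr_t)d | byte_count;
  //     if      ((bits & (sizeof(jlong)  - 1)) == 0) copy_longs (s, d, byte_count >> LogBytesPerLong);
  //     else if ((bits & (sizeof(jint)   - 1)) == 0) copy_ints  (s, d, byte_count >> LogBytesPerInt);
  //     else if ((bits & (sizeof(jshort) - 1)) == 0) copy_shorts(s, d, byte_count >> LogBytesPerShort);
  //     else                                         copy_bytes (s, d, byte_count);
  //   }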
2085 // 2086 address generate_unsafe_copy(address byte_copy_entry, 2087 address short_copy_entry, 2088 address int_copy_entry, 2089 address long_copy_entry) { 2090 StubId stub_id = StubId::stubgen_unsafe_arraycopy_id; 2091 2092 Label L_long_aligned, L_int_aligned, L_short_aligned; 2093 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 2094 2095 __ align(CodeEntryAlignment); 2096 StubCodeMark mark(this, stub_id); 2097 address start = __ pc(); 2098 __ enter(); // required for proper stackwalking of RuntimeStub frame 2099 2100 // bump this on entry, not on exit: 2101 inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr); 2102 2103 __ orr(rscratch1, s, d); 2104 __ orr(rscratch1, rscratch1, count); 2105 2106 __ andr(rscratch1, rscratch1, BytesPerLong-1); 2107 __ cbz(rscratch1, L_long_aligned); 2108 __ andr(rscratch1, rscratch1, BytesPerInt-1); 2109 __ cbz(rscratch1, L_int_aligned); 2110 __ tbz(rscratch1, 0, L_short_aligned); 2111 __ b(RuntimeAddress(byte_copy_entry)); 2112 2113 __ BIND(L_short_aligned); 2114 __ lsr(count, count, LogBytesPerShort); // size => short_count 2115 __ b(RuntimeAddress(short_copy_entry)); 2116 __ BIND(L_int_aligned); 2117 __ lsr(count, count, LogBytesPerInt); // size => int_count 2118 __ b(RuntimeAddress(int_copy_entry)); 2119 __ BIND(L_long_aligned); 2120 __ lsr(count, count, LogBytesPerLong); // size => long_count 2121 __ b(RuntimeAddress(long_copy_entry)); 2122 2123 return start; 2124 } 2125 2126 // 2127 // Generate generic array copy stubs 2128 // 2129 // Input: 2130 // c_rarg0 - src oop 2131 // c_rarg1 - src_pos (32-bits) 2132 // c_rarg2 - dst oop 2133 // c_rarg3 - dst_pos (32-bits) 2134 // c_rarg4 - element count (32-bits) 2135 // 2136 // Output: 2137 // r0 == 0 - success 2138 // r0 == -1^K - failure, where K is partial transfer count 2139 // 2140 address generate_generic_copy(address byte_copy_entry, address short_copy_entry, 2141 address int_copy_entry, address oop_copy_entry, 2142 address long_copy_entry, address checkcast_copy_entry) { 2143 StubId stub_id = StubId::stubgen_generic_arraycopy_id; 2144 2145 Label L_failed, L_objArray; 2146 Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs; 2147 2148 // Input registers 2149 const Register src = c_rarg0; // source array oop 2150 const Register src_pos = c_rarg1; // source position 2151 const Register dst = c_rarg2; // destination array oop 2152 const Register dst_pos = c_rarg3; // destination position 2153 const Register length = c_rarg4; 2154 2155 2156 // Registers used as temps 2157 const Register dst_klass = c_rarg5; 2158 2159 __ align(CodeEntryAlignment); 2160 2161 StubCodeMark mark(this, stub_id); 2162 2163 address start = __ pc(); 2164 2165 __ enter(); // required for proper stackwalking of RuntimeStub frame 2166 2167 // bump this on entry, not on exit: 2168 inc_counter_np(SharedRuntime::_generic_array_copy_ctr); 2169 2170 //----------------------------------------------------------------------- 2171 // Assembler stub will be used for this call to arraycopy 2172 // if the following conditions are met: 2173 // 2174 // (1) src and dst must not be null. 2175 // (2) src_pos must not be negative. 2176 // (3) dst_pos must not be negative. 2177 // (4) length must not be negative. 2178 // (5) src klass and dst klass should be the same and not null. 2179 // (6) src and dst should be arrays. 2180 // (7) src_pos + length must not exceed length of src. 2181 // (8) dst_pos + length must not exceed length of dst. 
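    // As straight-line C the guards below look roughly like this
    // (illustrative; array_length()/klass_of() are stand-ins for the
    // usual oopDesc/arrayOopDesc accessors):
    //
    //   if (src == nullptr || dst == nullptr)               return -1;
    //   if (src_pos < 0 || dst_pos < 0 || length < 0)       return -1;
    //   if (klass_of(src) == nullptr)                       return -1;
    //   if (klass_of(src) != klass_of(dst))                 return -1;  // object arrays get a
    //                                                                   // separate subtype check
    //   if (!is_array_klass(klass_of(src)))                 return -1;
    //   if ((uint32_t)src_pos + length > array_length(src)) return -1;
    //   if ((uint32_t)dst_pos + length > array_length(dst)) return -1;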
2182 // 2183 2184 // if (src == nullptr) return -1; 2185 __ cbz(src, L_failed); 2186 2187 // if (src_pos < 0) return -1; 2188 __ tbnz(src_pos, 31, L_failed); // i.e. sign bit set 2189 2190 // if (dst == nullptr) return -1; 2191 __ cbz(dst, L_failed); 2192 2193 // if (dst_pos < 0) return -1; 2194 __ tbnz(dst_pos, 31, L_failed); // i.e. sign bit set 2195 2196 // registers used as temp 2197 const Register scratch_length = r16; // elements count to copy 2198 const Register scratch_src_klass = r17; // array klass 2199 const Register lh = r15; // layout helper 2200 2201 // if (length < 0) return -1; 2202 __ movw(scratch_length, length); // length (elements count, 32-bits value) 2203 __ tbnz(scratch_length, 31, L_failed); // i.e. sign bit set 2204 2205 __ load_klass(scratch_src_klass, src); 2206 #ifdef ASSERT 2207 // assert(src->klass() != nullptr); 2208 { 2209 BLOCK_COMMENT("assert klasses not null {"); 2210 Label L1, L2; 2211 __ cbnz(scratch_src_klass, L2); // it is broken if klass is null 2212 __ bind(L1); 2213 __ stop("broken null klass"); 2214 __ bind(L2); 2215 __ load_klass(rscratch1, dst); 2216 __ cbz(rscratch1, L1); // this would be broken also 2217 BLOCK_COMMENT("} assert klasses not null done"); 2218 } 2219 #endif 2220 2221 // Load layout helper (32-bits) 2222 // 2223 // |array_tag| | header_size | element_type | |log2_element_size| 2224 // 32 30 24 16 8 2 0 2225 // 2226 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0 2227 // 2228 2229 const int lh_offset = in_bytes(Klass::layout_helper_offset()); 2230 2231 // Handle objArrays completely differently... 2232 const jint objArray_lh = Klass::array_layout_helper(T_OBJECT); 2233 __ ldrw(lh, Address(scratch_src_klass, lh_offset)); 2234 __ movw(rscratch1, objArray_lh); 2235 __ eorw(rscratch2, lh, rscratch1); 2236 __ cbzw(rscratch2, L_objArray); 2237 2238 // if (src->klass() != dst->klass()) return -1; 2239 __ load_klass(rscratch2, dst); 2240 __ eor(rscratch2, rscratch2, scratch_src_klass); 2241 __ cbnz(rscratch2, L_failed); 2242 2243 // Check for flat inline type array -> return -1 2244 __ test_flat_array_oop(src, rscratch2, L_failed); 2245 2246 // Check for null-free (non-flat) inline type array -> handle as object array 2247 __ test_null_free_array_oop(src, rscratch2, L_objArray); 2248 2249 // if (!src->is_Array()) return -1; 2250 __ tbz(lh, 31, L_failed); // i.e. (lh >= 0) 2251 2252 // At this point, it is known to be a typeArray (array_tag 0x3). 
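    // For a typeArray, decoding the layout helper below boils down to
    // (sketch written in terms of the Klass::_lh_* constants used in
    // this file):
    //
    //   int header_bytes   = (lh >> Klass::_lh_header_size_shift) & Klass::_lh_header_size_mask;
    //   int log2_elem_size =  lh & Klass::_lh_log2_element_size_mask;  // _lh_log2_element_size_shift == 0
    //   char *src_addr = (char *)src + header_bytes + ((size_t)src_pos << log2_elem_size);
    //   char *dst_addr = (char *)dst + header_bytes + ((size_t)dst_pos << log2_elem_size);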
2253 #ifdef ASSERT 2254 { 2255 BLOCK_COMMENT("assert primitive array {"); 2256 Label L; 2257 __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift); 2258 __ cmpw(lh, rscratch2); 2259 __ br(Assembler::GE, L); 2260 __ stop("must be a primitive array"); 2261 __ bind(L); 2262 BLOCK_COMMENT("} assert primitive array done"); 2263 } 2264 #endif 2265 2266 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2267 rscratch2, L_failed); 2268 2269 // TypeArrayKlass 2270 // 2271 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize); 2272 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize); 2273 // 2274 2275 const Register rscratch1_offset = rscratch1; // array offset 2276 const Register r15_elsize = lh; // element size 2277 2278 __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift, 2279 exact_log2(Klass::_lh_header_size_mask+1)); // array_offset 2280 __ add(src, src, rscratch1_offset); // src array offset 2281 __ add(dst, dst, rscratch1_offset); // dst array offset 2282 BLOCK_COMMENT("choose copy loop based on element size"); 2283 2284 // next registers should be set before the jump to corresponding stub 2285 const Register from = c_rarg0; // source array address 2286 const Register to = c_rarg1; // destination array address 2287 const Register count = c_rarg2; // elements count 2288 2289 // 'from', 'to', 'count' registers should be set in such order 2290 // since they are the same as 'src', 'src_pos', 'dst'. 2291 2292 assert(Klass::_lh_log2_element_size_shift == 0, "fix this code"); 2293 2294 // The possible values of elsize are 0-3, i.e. exact_log2(element 2295 // size in bytes). We do a simple bitwise binary search. 2296 __ BIND(L_copy_bytes); 2297 __ tbnz(r15_elsize, 1, L_copy_ints); 2298 __ tbnz(r15_elsize, 0, L_copy_shorts); 2299 __ lea(from, Address(src, src_pos));// src_addr 2300 __ lea(to, Address(dst, dst_pos));// dst_addr 2301 __ movw(count, scratch_length); // length 2302 __ b(RuntimeAddress(byte_copy_entry)); 2303 2304 __ BIND(L_copy_shorts); 2305 __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr 2306 __ lea(to, Address(dst, dst_pos, Address::lsl(1)));// dst_addr 2307 __ movw(count, scratch_length); // length 2308 __ b(RuntimeAddress(short_copy_entry)); 2309 2310 __ BIND(L_copy_ints); 2311 __ tbnz(r15_elsize, 0, L_copy_longs); 2312 __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr 2313 __ lea(to, Address(dst, dst_pos, Address::lsl(2)));// dst_addr 2314 __ movw(count, scratch_length); // length 2315 __ b(RuntimeAddress(int_copy_entry)); 2316 2317 __ BIND(L_copy_longs); 2318 #ifdef ASSERT 2319 { 2320 BLOCK_COMMENT("assert long copy {"); 2321 Label L; 2322 __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize 2323 __ cmpw(r15_elsize, LogBytesPerLong); 2324 __ br(Assembler::EQ, L); 2325 __ stop("must be long copy, but elsize is wrong"); 2326 __ bind(L); 2327 BLOCK_COMMENT("} assert long copy done"); 2328 } 2329 #endif 2330 __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr 2331 __ lea(to, Address(dst, dst_pos, Address::lsl(3)));// dst_addr 2332 __ movw(count, scratch_length); // length 2333 __ b(RuntimeAddress(long_copy_entry)); 2334 2335 // ObjArrayKlass 2336 __ BIND(L_objArray); 2337 // live at this point: scratch_src_klass, scratch_length, src[_pos], dst[_pos] 2338 2339 Label L_plain_copy, L_checkcast_copy; 2340 // test array classes for subtyping 2341 __ load_klass(r15, dst); 2342 __ cmp(scratch_src_klass, r15); // usual case is exact 
equality 2343 __ br(Assembler::NE, L_checkcast_copy); 2344 2345 // Identically typed arrays can be copied without element-wise checks. 2346 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2347 rscratch2, L_failed); 2348 2349 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2350 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2351 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2352 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2353 __ movw(count, scratch_length); // length 2354 __ BIND(L_plain_copy); 2355 __ b(RuntimeAddress(oop_copy_entry)); 2356 2357 __ BIND(L_checkcast_copy); 2358 // live at this point: scratch_src_klass, scratch_length, r15 (dst_klass) 2359 { 2360 // Before looking at dst.length, make sure dst is also an objArray. 2361 __ ldrw(rscratch1, Address(r15, lh_offset)); 2362 __ movw(rscratch2, objArray_lh); 2363 __ eorw(rscratch1, rscratch1, rscratch2); 2364 __ cbnzw(rscratch1, L_failed); 2365 2366 // It is safe to examine both src.length and dst.length. 2367 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2368 r15, L_failed); 2369 2370 __ load_klass(dst_klass, dst); // reload 2371 2372 // Marshal the base address arguments now, freeing registers. 2373 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2374 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2375 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2376 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2377 __ movw(count, length); // length (reloaded) 2378 Register sco_temp = c_rarg3; // this register is free now 2379 assert_different_registers(from, to, count, sco_temp, 2380 dst_klass, scratch_src_klass); 2381 // assert_clean_int(count, sco_temp); 2382 2383 // Generate the type check. 2384 const int sco_offset = in_bytes(Klass::super_check_offset_offset()); 2385 __ ldrw(sco_temp, Address(dst_klass, sco_offset)); 2386 2387 // Smashes rscratch1, rscratch2 2388 generate_type_check(scratch_src_klass, sco_temp, dst_klass, /*temps*/ noreg, noreg, noreg, 2389 L_plain_copy); 2390 2391 // Fetch destination element klass from the ObjArrayKlass header. 2392 int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset()); 2393 __ ldr(dst_klass, Address(dst_klass, ek_offset)); 2394 __ ldrw(sco_temp, Address(dst_klass, sco_offset)); 2395 2396 // the checkcast_copy loop needs two extra arguments: 2397 assert(c_rarg3 == sco_temp, "#3 already in place"); 2398 // Set up arguments for checkcast_copy_entry. 2399 __ mov(c_rarg4, dst_klass); // dst.klass.element_klass 2400 __ b(RuntimeAddress(checkcast_copy_entry)); 2401 } 2402 2403 __ BIND(L_failed); 2404 __ mov(r0, -1); 2405 __ leave(); // required for proper stackwalking of RuntimeStub frame 2406 __ ret(lr); 2407 2408 return start; 2409 } 2410 2411 // 2412 // Generate stub for array fill. If "aligned" is true, the 2413 // "to" address is assumed to be heapword aligned. 
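  // The generated fill follows this general shape (illustrative C sketch;
  // zero_words/fill_words refer to the MacroAssembler helpers used in the
  // body below):
  //
  //   uint64_t v = value;
  //   if (t == T_BYTE) v |= v << 8;     // replicate byte  -> 16 bits
  //   if (t != T_INT)  v |= v << 16;    // replicate short -> 32 bits
  //   v |= v << 32;                     // replicate word  -> 64 bits
  //   // very short arrays are filled element by element; otherwise the
  //   // destination is 8-byte aligned, filled a word at a time (via
  //   // zero_words when v == 0, else fill_words), and any sub-8-byte
  //   // tail is finished with one overlapping 64-bit store.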
2414 // 2415 // Arguments for generated stub: 2416 // to: c_rarg0 2417 // value: c_rarg1 2418 // count: c_rarg2 treated as signed 2419 // 2420 address generate_fill(StubId stub_id) { 2421 BasicType t; 2422 bool aligned; 2423 2424 switch (stub_id) { 2425 case StubId::stubgen_jbyte_fill_id: 2426 t = T_BYTE; 2427 aligned = false; 2428 break; 2429 case StubId::stubgen_jshort_fill_id: 2430 t = T_SHORT; 2431 aligned = false; 2432 break; 2433 case StubId::stubgen_jint_fill_id: 2434 t = T_INT; 2435 aligned = false; 2436 break; 2437 case StubId::stubgen_arrayof_jbyte_fill_id: 2438 t = T_BYTE; 2439 aligned = true; 2440 break; 2441 case StubId::stubgen_arrayof_jshort_fill_id: 2442 t = T_SHORT; 2443 aligned = true; 2444 break; 2445 case StubId::stubgen_arrayof_jint_fill_id: 2446 t = T_INT; 2447 aligned = true; 2448 break; 2449 default: 2450 ShouldNotReachHere(); 2451 }; 2452 2453 __ align(CodeEntryAlignment); 2454 StubCodeMark mark(this, stub_id); 2455 address start = __ pc(); 2456 2457 BLOCK_COMMENT("Entry:"); 2458 2459 const Register to = c_rarg0; // source array address 2460 const Register value = c_rarg1; // value 2461 const Register count = c_rarg2; // elements count 2462 2463 const Register bz_base = r10; // base for block_zero routine 2464 const Register cnt_words = r11; // temp register 2465 2466 __ enter(); 2467 2468 Label L_fill_elements, L_exit1; 2469 2470 int shift = -1; 2471 switch (t) { 2472 case T_BYTE: 2473 shift = 0; 2474 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2475 __ bfi(value, value, 8, 8); // 8 bit -> 16 bit 2476 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit 2477 __ br(Assembler::LO, L_fill_elements); 2478 break; 2479 case T_SHORT: 2480 shift = 1; 2481 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2482 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit 2483 __ br(Assembler::LO, L_fill_elements); 2484 break; 2485 case T_INT: 2486 shift = 2; 2487 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2488 __ br(Assembler::LO, L_fill_elements); 2489 break; 2490 default: ShouldNotReachHere(); 2491 } 2492 2493 // Align source address at 8 bytes address boundary. 2494 Label L_skip_align1, L_skip_align2, L_skip_align4; 2495 if (!aligned) { 2496 switch (t) { 2497 case T_BYTE: 2498 // One byte misalignment happens only for byte arrays. 2499 __ tbz(to, 0, L_skip_align1); 2500 __ strb(value, Address(__ post(to, 1))); 2501 __ subw(count, count, 1); 2502 __ bind(L_skip_align1); 2503 // Fallthrough 2504 case T_SHORT: 2505 // Two bytes misalignment happens only for byte and short (char) arrays. 2506 __ tbz(to, 1, L_skip_align2); 2507 __ strh(value, Address(__ post(to, 2))); 2508 __ subw(count, count, 2 >> shift); 2509 __ bind(L_skip_align2); 2510 // Fallthrough 2511 case T_INT: 2512 // Align to 8 bytes, we know we are 4 byte aligned to start. 2513 __ tbz(to, 2, L_skip_align4); 2514 __ strw(value, Address(__ post(to, 4))); 2515 __ subw(count, count, 4 >> shift); 2516 __ bind(L_skip_align4); 2517 break; 2518 default: ShouldNotReachHere(); 2519 } 2520 } 2521 2522 // 2523 // Fill large chunks 2524 // 2525 __ lsrw(cnt_words, count, 3 - shift); // number of words 2526 __ bfi(value, value, 32, 32); // 32 bit -> 64 bit 2527 __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift); 2528 if (UseBlockZeroing) { 2529 Label non_block_zeroing, rest; 2530 // If the fill value is zero we can use the fast zero_words(). 
2531 __ cbnz(value, non_block_zeroing); 2532 __ mov(bz_base, to); 2533 __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord); 2534 address tpc = __ zero_words(bz_base, cnt_words); 2535 if (tpc == nullptr) { 2536 fatal("CodeCache is full at generate_fill"); 2537 } 2538 __ b(rest); 2539 __ bind(non_block_zeroing); 2540 __ fill_words(to, cnt_words, value); 2541 __ bind(rest); 2542 } else { 2543 __ fill_words(to, cnt_words, value); 2544 } 2545 2546 // Remaining count is less than 8 bytes. Fill it by a single store. 2547 // Note that the total length is no less than 8 bytes. 2548 if (t == T_BYTE || t == T_SHORT) { 2549 Label L_exit1; 2550 __ cbzw(count, L_exit1); 2551 __ add(to, to, count, Assembler::LSL, shift); // points to the end 2552 __ str(value, Address(to, -8)); // overwrite some elements 2553 __ bind(L_exit1); 2554 __ leave(); 2555 __ ret(lr); 2556 } 2557 2558 // Handle copies less than 8 bytes. 2559 Label L_fill_2, L_fill_4, L_exit2; 2560 __ bind(L_fill_elements); 2561 switch (t) { 2562 case T_BYTE: 2563 __ tbz(count, 0, L_fill_2); 2564 __ strb(value, Address(__ post(to, 1))); 2565 __ bind(L_fill_2); 2566 __ tbz(count, 1, L_fill_4); 2567 __ strh(value, Address(__ post(to, 2))); 2568 __ bind(L_fill_4); 2569 __ tbz(count, 2, L_exit2); 2570 __ strw(value, Address(to)); 2571 break; 2572 case T_SHORT: 2573 __ tbz(count, 0, L_fill_4); 2574 __ strh(value, Address(__ post(to, 2))); 2575 __ bind(L_fill_4); 2576 __ tbz(count, 1, L_exit2); 2577 __ strw(value, Address(to)); 2578 break; 2579 case T_INT: 2580 __ cbzw(count, L_exit2); 2581 __ strw(value, Address(to)); 2582 break; 2583 default: ShouldNotReachHere(); 2584 } 2585 __ bind(L_exit2); 2586 __ leave(); 2587 __ ret(lr); 2588 return start; 2589 } 2590 2591 address generate_unsafecopy_common_error_exit() { 2592 address start_pc = __ pc(); 2593 __ leave(); 2594 __ mov(r0, 0); 2595 __ ret(lr); 2596 return start_pc; 2597 } 2598 2599 // 2600 // Generate 'unsafe' set memory stub 2601 // Though just as safe as the other stubs, it takes an unscaled 2602 // size_t (# bytes) argument instead of an element count. 2603 // 2604 // This fill operation is atomicity preserving: as long as the 2605 // address supplied is sufficiently aligned, all writes of up to 64 2606 // bits in size are single-copy atomic. 
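  // After the bulk 64-byte loop, the remaining byte count is consumed
  // in power-of-two chunks by testing single count bits, roughly
  // (illustrative sketch; storeN stands for an N-byte store of the
  // replicated pattern):
  //
  //   if (n & 32) { store32(p, v); p += 32; }
  //   if (n & 16) { store16(p, v); p += 16; }
  //   if (n &  8) { store8 (p, v); p +=  8; }
  //   if (n &  4) { store4 (p, v); p +=  4; }
  //   if (n &  2) { store2 (p, v); p +=  2; }
  //   if (n &  1) { store1 (p, v);          }
  //
  // which mirrors the tbz(count, exact_log2(chunk), ...) tests below.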
2607 // 2608 // Input: 2609 // c_rarg0 - destination array address 2610 // c_rarg1 - byte count (size_t) 2611 // c_rarg2 - byte value 2612 // 2613 address generate_unsafe_setmemory() { 2614 __ align(CodeEntryAlignment); 2615 StubCodeMark mark(this, StubId::stubgen_unsafe_setmemory_id); 2616 address start = __ pc(); 2617 2618 Register dest = c_rarg0, count = c_rarg1, value = c_rarg2; 2619 Label tail; 2620 2621 UnsafeMemoryAccessMark umam(this, true, false); 2622 2623 __ enter(); // required for proper stackwalking of RuntimeStub frame 2624 2625 __ dup(v0, __ T16B, value); 2626 2627 if (AvoidUnalignedAccesses) { 2628 __ cmp(count, (u1)16); 2629 __ br(__ LO, tail); 2630 2631 __ mov(rscratch1, 16); 2632 __ andr(rscratch2, dest, 15); 2633 __ sub(rscratch1, rscratch1, rscratch2); // Bytes needed to 16-align dest 2634 __ strq(v0, Address(dest)); 2635 __ sub(count, count, rscratch1); 2636 __ add(dest, dest, rscratch1); 2637 } 2638 2639 __ subs(count, count, (u1)64); 2640 __ br(__ LO, tail); 2641 { 2642 Label again; 2643 __ bind(again); 2644 __ stpq(v0, v0, Address(dest)); 2645 __ stpq(v0, v0, Address(dest, 32)); 2646 2647 __ subs(count, count, 64); 2648 __ add(dest, dest, 64); 2649 __ br(__ HS, again); 2650 } 2651 2652 __ bind(tail); 2653 // The count of bytes is off by 64, but we don't need to correct 2654 // it because we're only going to use the least-significant few 2655 // count bits from here on. 2656 // __ add(count, count, 64); 2657 2658 { 2659 Label dont; 2660 __ tbz(count, exact_log2(32), dont); 2661 __ stpq(v0, v0, __ post(dest, 32)); 2662 __ bind(dont); 2663 } 2664 { 2665 Label dont; 2666 __ tbz(count, exact_log2(16), dont); 2667 __ strq(v0, __ post(dest, 16)); 2668 __ bind(dont); 2669 } 2670 { 2671 Label dont; 2672 __ tbz(count, exact_log2(8), dont); 2673 __ strd(v0, __ post(dest, 8)); 2674 __ bind(dont); 2675 } 2676 2677 Label finished; 2678 __ tst(count, 7); 2679 __ br(__ EQ, finished); 2680 2681 { 2682 Label dont; 2683 __ tbz(count, exact_log2(4), dont); 2684 __ strs(v0, __ post(dest, 4)); 2685 __ bind(dont); 2686 } 2687 { 2688 Label dont; 2689 __ tbz(count, exact_log2(2), dont); 2690 __ bfi(value, value, 8, 8); 2691 __ strh(value, __ post(dest, 2)); 2692 __ bind(dont); 2693 } 2694 { 2695 Label dont; 2696 __ tbz(count, exact_log2(1), dont); 2697 __ strb(value, Address(dest)); 2698 __ bind(dont); 2699 } 2700 2701 __ bind(finished); 2702 __ leave(); 2703 __ ret(lr); 2704 2705 return start; 2706 } 2707 2708 address generate_data_cache_writeback() { 2709 const Register line = c_rarg0; // address of line to write back 2710 2711 __ align(CodeEntryAlignment); 2712 2713 StubId stub_id = StubId::stubgen_data_cache_writeback_id; 2714 StubCodeMark mark(this, stub_id); 2715 2716 address start = __ pc(); 2717 __ enter(); 2718 __ cache_wb(Address(line, 0)); 2719 __ leave(); 2720 __ ret(lr); 2721 2722 return start; 2723 } 2724 2725 address generate_data_cache_writeback_sync() { 2726 const Register is_pre = c_rarg0; // pre or post sync 2727 2728 __ align(CodeEntryAlignment); 2729 2730 StubId stub_id = StubId::stubgen_data_cache_writeback_sync_id; 2731 StubCodeMark mark(this, stub_id); 2732 2733 // pre wbsync is a no-op 2734 // post wbsync translates to an sfence 2735 2736 Label skip; 2737 address start = __ pc(); 2738 __ enter(); 2739 __ cbnz(is_pre, skip); 2740 __ cache_wbsync(false); 2741 __ bind(skip); 2742 __ leave(); 2743 __ ret(lr); 2744 2745 return start; 2746 } 2747 2748 void generate_arraycopy_stubs() { 2749 address entry; 2750 address entry_jbyte_arraycopy; 2751 address 
entry_jshort_arraycopy; 2752 address entry_jint_arraycopy; 2753 address entry_oop_arraycopy; 2754 address entry_jlong_arraycopy; 2755 address entry_checkcast_arraycopy; 2756 2757 // generate the common exit first so later stubs can rely on it if 2758 // they want an UnsafeMemoryAccess exit non-local to the stub 2759 StubRoutines::_unsafecopy_common_exit = generate_unsafecopy_common_error_exit(); 2760 // register the stub as the default exit with class UnsafeMemoryAccess 2761 UnsafeMemoryAccess::set_common_exit_stub_pc(StubRoutines::_unsafecopy_common_exit); 2762 2763 generate_copy_longs(StubId::stubgen_copy_byte_f_id, IN_HEAP | IS_ARRAY, copy_f, r0, r1, r15); 2764 generate_copy_longs(StubId::stubgen_copy_byte_b_id, IN_HEAP | IS_ARRAY, copy_b, r0, r1, r15); 2765 2766 generate_copy_longs(StubId::stubgen_copy_oop_f_id, IN_HEAP | IS_ARRAY, copy_obj_f, r0, r1, r15); 2767 generate_copy_longs(StubId::stubgen_copy_oop_b_id, IN_HEAP | IS_ARRAY, copy_obj_b, r0, r1, r15); 2768 2769 generate_copy_longs(StubId::stubgen_copy_oop_uninit_f_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, copy_obj_uninit_f, r0, r1, r15); 2770 generate_copy_longs(StubId::stubgen_copy_oop_uninit_b_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, copy_obj_uninit_b, r0, r1, r15); 2771 2772 StubRoutines::aarch64::_zero_blocks = generate_zero_blocks(); 2773 2774 //*** jbyte 2775 // Always need aligned and unaligned versions 2776 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_jbyte_disjoint_arraycopy_id, &entry); 2777 StubRoutines::_jbyte_arraycopy = generate_conjoint_copy(StubId::stubgen_jbyte_arraycopy_id, entry, &entry_jbyte_arraycopy); 2778 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jbyte_disjoint_arraycopy_id, &entry); 2779 StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jbyte_arraycopy_id, entry, nullptr); 2780 2781 //*** jshort 2782 // Always need aligned and unaligned versions 2783 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_jshort_disjoint_arraycopy_id, &entry); 2784 StubRoutines::_jshort_arraycopy = generate_conjoint_copy(StubId::stubgen_jshort_arraycopy_id, entry, &entry_jshort_arraycopy); 2785 StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jshort_disjoint_arraycopy_id, &entry); 2786 StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jshort_arraycopy_id, entry, nullptr); 2787 2788 //*** jint 2789 // Aligned versions 2790 StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jint_disjoint_arraycopy_id, &entry); 2791 StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jint_arraycopy_id, entry, &entry_jint_arraycopy); 2792 // In 64 bit we need both aligned and unaligned versions of jint arraycopy. 
2793 // entry_jint_arraycopy always points to the unaligned version 2794 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_jint_disjoint_arraycopy_id, &entry); 2795 StubRoutines::_jint_arraycopy = generate_conjoint_copy(StubId::stubgen_jint_arraycopy_id, entry, &entry_jint_arraycopy); 2796 2797 //*** jlong 2798 // It is always aligned 2799 StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jlong_disjoint_arraycopy_id, &entry); 2800 StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jlong_arraycopy_id, entry, &entry_jlong_arraycopy); 2801 StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy; 2802 StubRoutines::_jlong_arraycopy = StubRoutines::_arrayof_jlong_arraycopy; 2803 2804 //*** oops 2805 { 2806 // With compressed oops we need unaligned versions; notice that 2807 // we overwrite entry_oop_arraycopy. 2808 bool aligned = !UseCompressedOops; 2809 2810 StubRoutines::_arrayof_oop_disjoint_arraycopy 2811 = generate_disjoint_copy(StubId::stubgen_arrayof_oop_disjoint_arraycopy_id, &entry); 2812 StubRoutines::_arrayof_oop_arraycopy 2813 = generate_conjoint_copy(StubId::stubgen_arrayof_oop_arraycopy_id, entry, &entry_oop_arraycopy); 2814 // Aligned versions without pre-barriers 2815 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit 2816 = generate_disjoint_copy(StubId::stubgen_arrayof_oop_disjoint_arraycopy_uninit_id, &entry); 2817 StubRoutines::_arrayof_oop_arraycopy_uninit 2818 = generate_conjoint_copy(StubId::stubgen_arrayof_oop_arraycopy_uninit_id, entry, nullptr); 2819 } 2820 2821 StubRoutines::_oop_disjoint_arraycopy = StubRoutines::_arrayof_oop_disjoint_arraycopy; 2822 StubRoutines::_oop_arraycopy = StubRoutines::_arrayof_oop_arraycopy; 2823 StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit; 2824 StubRoutines::_oop_arraycopy_uninit = StubRoutines::_arrayof_oop_arraycopy_uninit; 2825 2826 StubRoutines::_checkcast_arraycopy = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_id, &entry_checkcast_arraycopy); 2827 StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_uninit_id, nullptr); 2828 2829 StubRoutines::_unsafe_arraycopy = generate_unsafe_copy(entry_jbyte_arraycopy, 2830 entry_jshort_arraycopy, 2831 entry_jint_arraycopy, 2832 entry_jlong_arraycopy); 2833 2834 StubRoutines::_generic_arraycopy = generate_generic_copy(entry_jbyte_arraycopy, 2835 entry_jshort_arraycopy, 2836 entry_jint_arraycopy, 2837 entry_oop_arraycopy, 2838 entry_jlong_arraycopy, 2839 entry_checkcast_arraycopy); 2840 2841 StubRoutines::_jbyte_fill = generate_fill(StubId::stubgen_jbyte_fill_id); 2842 StubRoutines::_jshort_fill = generate_fill(StubId::stubgen_jshort_fill_id); 2843 StubRoutines::_jint_fill = generate_fill(StubId::stubgen_jint_fill_id); 2844 StubRoutines::_arrayof_jbyte_fill = generate_fill(StubId::stubgen_arrayof_jbyte_fill_id); 2845 StubRoutines::_arrayof_jshort_fill = generate_fill(StubId::stubgen_arrayof_jshort_fill_id); 2846 StubRoutines::_arrayof_jint_fill = generate_fill(StubId::stubgen_arrayof_jint_fill_id); 2847 } 2848 2849 void generate_math_stubs() { Unimplemented(); } 2850 2851 // Arguments: 2852 // 2853 // Inputs: 2854 // c_rarg0 - source byte array address 2855 // c_rarg1 - destination byte array address 2856 // c_rarg2 - K (key) in little endian int array 2857 // 2858 address generate_aescrypt_encryptBlock() { 2859 __ 
align(CodeEntryAlignment); 2860 StubId stub_id = StubId::stubgen_aescrypt_encryptBlock_id; 2861 StubCodeMark mark(this, stub_id); 2862 2863 const Register from = c_rarg0; // source array address 2864 const Register to = c_rarg1; // destination array address 2865 const Register key = c_rarg2; // key array address 2866 const Register keylen = rscratch1; 2867 2868 address start = __ pc(); 2869 __ enter(); 2870 2871 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2872 2873 __ aesenc_loadkeys(key, keylen); 2874 __ aesecb_encrypt(from, to, keylen); 2875 2876 __ mov(r0, 0); 2877 2878 __ leave(); 2879 __ ret(lr); 2880 2881 return start; 2882 } 2883 2884 // Arguments: 2885 // 2886 // Inputs: 2887 // c_rarg0 - source byte array address 2888 // c_rarg1 - destination byte array address 2889 // c_rarg2 - K (key) in little endian int array 2890 // 2891 address generate_aescrypt_decryptBlock() { 2892 assert(UseAES, "need AES cryptographic extension support"); 2893 __ align(CodeEntryAlignment); 2894 StubId stub_id = StubId::stubgen_aescrypt_decryptBlock_id; 2895 StubCodeMark mark(this, stub_id); 2896 Label L_doLast; 2897 2898 const Register from = c_rarg0; // source array address 2899 const Register to = c_rarg1; // destination array address 2900 const Register key = c_rarg2; // key array address 2901 const Register keylen = rscratch1; 2902 2903 address start = __ pc(); 2904 __ enter(); // required for proper stackwalking of RuntimeStub frame 2905 2906 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2907 2908 __ aesecb_decrypt(from, to, key, keylen); 2909 2910 __ mov(r0, 0); 2911 2912 __ leave(); 2913 __ ret(lr); 2914 2915 return start; 2916 } 2917 2918 // Arguments: 2919 // 2920 // Inputs: 2921 // c_rarg0 - source byte array address 2922 // c_rarg1 - destination byte array address 2923 // c_rarg2 - K (key) in little endian int array 2924 // c_rarg3 - r vector byte array address 2925 // c_rarg4 - input length 2926 // 2927 // Output: 2928 // x0 - input length 2929 // 2930 address generate_cipherBlockChaining_encryptAESCrypt() { 2931 assert(UseAES, "need AES cryptographic extension support"); 2932 __ align(CodeEntryAlignment); 2933 StubId stub_id = StubId::stubgen_cipherBlockChaining_encryptAESCrypt_id; 2934 StubCodeMark mark(this, stub_id); 2935 2936 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 2937 2938 const Register from = c_rarg0; // source array address 2939 const Register to = c_rarg1; // destination array address 2940 const Register key = c_rarg2; // key array address 2941 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2942 // and left with the results of the last encryption block 2943 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2944 const Register keylen = rscratch1; 2945 2946 address start = __ pc(); 2947 2948 __ enter(); 2949 2950 __ movw(rscratch2, len_reg); 2951 2952 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2953 2954 __ ld1(v0, __ T16B, rvec); 2955 2956 __ cmpw(keylen, 52); 2957 __ br(Assembler::CC, L_loadkeys_44); 2958 __ br(Assembler::EQ, L_loadkeys_52); 2959 2960 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2961 __ rev32(v17, __ T16B, v17); 2962 __ rev32(v18, __ T16B, v18); 2963 __ BIND(L_loadkeys_52); 2964 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2965 __ rev32(v19, __ T16B, 
v19); 2966 __ rev32(v20, __ T16B, v20); 2967 __ BIND(L_loadkeys_44); 2968 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2969 __ rev32(v21, __ T16B, v21); 2970 __ rev32(v22, __ T16B, v22); 2971 __ rev32(v23, __ T16B, v23); 2972 __ rev32(v24, __ T16B, v24); 2973 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2974 __ rev32(v25, __ T16B, v25); 2975 __ rev32(v26, __ T16B, v26); 2976 __ rev32(v27, __ T16B, v27); 2977 __ rev32(v28, __ T16B, v28); 2978 __ ld1(v29, v30, v31, __ T16B, key); 2979 __ rev32(v29, __ T16B, v29); 2980 __ rev32(v30, __ T16B, v30); 2981 __ rev32(v31, __ T16B, v31); 2982 2983 __ BIND(L_aes_loop); 2984 __ ld1(v1, __ T16B, __ post(from, 16)); 2985 __ eor(v0, __ T16B, v0, v1); 2986 2987 __ br(Assembler::CC, L_rounds_44); 2988 __ br(Assembler::EQ, L_rounds_52); 2989 2990 __ aese(v0, v17); __ aesmc(v0, v0); 2991 __ aese(v0, v18); __ aesmc(v0, v0); 2992 __ BIND(L_rounds_52); 2993 __ aese(v0, v19); __ aesmc(v0, v0); 2994 __ aese(v0, v20); __ aesmc(v0, v0); 2995 __ BIND(L_rounds_44); 2996 __ aese(v0, v21); __ aesmc(v0, v0); 2997 __ aese(v0, v22); __ aesmc(v0, v0); 2998 __ aese(v0, v23); __ aesmc(v0, v0); 2999 __ aese(v0, v24); __ aesmc(v0, v0); 3000 __ aese(v0, v25); __ aesmc(v0, v0); 3001 __ aese(v0, v26); __ aesmc(v0, v0); 3002 __ aese(v0, v27); __ aesmc(v0, v0); 3003 __ aese(v0, v28); __ aesmc(v0, v0); 3004 __ aese(v0, v29); __ aesmc(v0, v0); 3005 __ aese(v0, v30); 3006 __ eor(v0, __ T16B, v0, v31); 3007 3008 __ st1(v0, __ T16B, __ post(to, 16)); 3009 3010 __ subw(len_reg, len_reg, 16); 3011 __ cbnzw(len_reg, L_aes_loop); 3012 3013 __ st1(v0, __ T16B, rvec); 3014 3015 __ mov(r0, rscratch2); 3016 3017 __ leave(); 3018 __ ret(lr); 3019 3020 return start; 3021 } 3022 3023 // Arguments: 3024 // 3025 // Inputs: 3026 // c_rarg0 - source byte array address 3027 // c_rarg1 - destination byte array address 3028 // c_rarg2 - K (key) in little endian int array 3029 // c_rarg3 - r vector byte array address 3030 // c_rarg4 - input length 3031 // 3032 // Output: 3033 // r0 - input length 3034 // 3035 address generate_cipherBlockChaining_decryptAESCrypt() { 3036 assert(UseAES, "need AES cryptographic extension support"); 3037 __ align(CodeEntryAlignment); 3038 StubId stub_id = StubId::stubgen_cipherBlockChaining_decryptAESCrypt_id; 3039 StubCodeMark mark(this, stub_id); 3040 3041 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 3042 3043 const Register from = c_rarg0; // source array address 3044 const Register to = c_rarg1; // destination array address 3045 const Register key = c_rarg2; // key array address 3046 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 3047 // and left with the results of the last encryption block 3048 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 3049 const Register keylen = rscratch1; 3050 3051 address start = __ pc(); 3052 3053 __ enter(); 3054 3055 __ movw(rscratch2, len_reg); 3056 3057 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 3058 3059 __ ld1(v2, __ T16B, rvec); 3060 3061 __ ld1(v31, __ T16B, __ post(key, 16)); 3062 __ rev32(v31, __ T16B, v31); 3063 3064 __ cmpw(keylen, 52); 3065 __ br(Assembler::CC, L_loadkeys_44); 3066 __ br(Assembler::EQ, L_loadkeys_52); 3067 3068 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 3069 __ rev32(v17, __ T16B, v17); 3070 __ rev32(v18, __ T16B, v18); 3071 __ BIND(L_loadkeys_52); 3072 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 3073 __ rev32(v19, __ 
T16B, v19); 3074 __ rev32(v20, __ T16B, v20); 3075 __ BIND(L_loadkeys_44); 3076 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 3077 __ rev32(v21, __ T16B, v21); 3078 __ rev32(v22, __ T16B, v22); 3079 __ rev32(v23, __ T16B, v23); 3080 __ rev32(v24, __ T16B, v24); 3081 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 3082 __ rev32(v25, __ T16B, v25); 3083 __ rev32(v26, __ T16B, v26); 3084 __ rev32(v27, __ T16B, v27); 3085 __ rev32(v28, __ T16B, v28); 3086 __ ld1(v29, v30, __ T16B, key); 3087 __ rev32(v29, __ T16B, v29); 3088 __ rev32(v30, __ T16B, v30); 3089 3090 __ BIND(L_aes_loop); 3091 __ ld1(v0, __ T16B, __ post(from, 16)); 3092 __ orr(v1, __ T16B, v0, v0); 3093 3094 __ br(Assembler::CC, L_rounds_44); 3095 __ br(Assembler::EQ, L_rounds_52); 3096 3097 __ aesd(v0, v17); __ aesimc(v0, v0); 3098 __ aesd(v0, v18); __ aesimc(v0, v0); 3099 __ BIND(L_rounds_52); 3100 __ aesd(v0, v19); __ aesimc(v0, v0); 3101 __ aesd(v0, v20); __ aesimc(v0, v0); 3102 __ BIND(L_rounds_44); 3103 __ aesd(v0, v21); __ aesimc(v0, v0); 3104 __ aesd(v0, v22); __ aesimc(v0, v0); 3105 __ aesd(v0, v23); __ aesimc(v0, v0); 3106 __ aesd(v0, v24); __ aesimc(v0, v0); 3107 __ aesd(v0, v25); __ aesimc(v0, v0); 3108 __ aesd(v0, v26); __ aesimc(v0, v0); 3109 __ aesd(v0, v27); __ aesimc(v0, v0); 3110 __ aesd(v0, v28); __ aesimc(v0, v0); 3111 __ aesd(v0, v29); __ aesimc(v0, v0); 3112 __ aesd(v0, v30); 3113 __ eor(v0, __ T16B, v0, v31); 3114 __ eor(v0, __ T16B, v0, v2); 3115 3116 __ st1(v0, __ T16B, __ post(to, 16)); 3117 __ orr(v2, __ T16B, v1, v1); 3118 3119 __ subw(len_reg, len_reg, 16); 3120 __ cbnzw(len_reg, L_aes_loop); 3121 3122 __ st1(v2, __ T16B, rvec); 3123 3124 __ mov(r0, rscratch2); 3125 3126 __ leave(); 3127 __ ret(lr); 3128 3129 return start; 3130 } 3131 3132 // Big-endian 128-bit + 64-bit -> 128-bit addition. 3133 // Inputs: 128-bits. in is preserved. 3134 // The least-significant 64-bit word is in the upper dword of each vector. 3135 // inc (the 64-bit increment) is preserved. Its lower dword must be zero. 3136 // Output: result 3137 void be_add_128_64(FloatRegister result, FloatRegister in, 3138 FloatRegister inc, FloatRegister tmp) { 3139 assert_different_registers(result, tmp, inc); 3140 3141 __ addv(result, __ T2D, in, inc); // Add inc to the least-significant dword of 3142 // input 3143 __ cm(__ HI, tmp, __ T2D, inc, result);// Check for result overflowing 3144 __ ext(tmp, __ T16B, tmp, tmp, 0x08); // Swap LSD of comparison result to MSD and 3145 // MSD == 0 (must be!) to LSD 3146 __ subv(result, __ T2D, result, tmp); // Subtract -1 from MSD if there was an overflow 3147 } 3148 3149 // CTR AES crypt. 
3150 // Arguments: 3151 // 3152 // Inputs: 3153 // c_rarg0 - source byte array address 3154 // c_rarg1 - destination byte array address 3155 // c_rarg2 - K (key) in little endian int array 3156 // c_rarg3 - counter vector byte array address 3157 // c_rarg4 - input length 3158 // c_rarg5 - saved encryptedCounter start 3159 // c_rarg6 - saved used length 3160 // 3161 // Output: 3162 // r0 - input length 3163 // 3164 address generate_counterMode_AESCrypt() { 3165 const Register in = c_rarg0; 3166 const Register out = c_rarg1; 3167 const Register key = c_rarg2; 3168 const Register counter = c_rarg3; 3169 const Register saved_len = c_rarg4, len = r10; 3170 const Register saved_encrypted_ctr = c_rarg5; 3171 const Register used_ptr = c_rarg6, used = r12; 3172 3173 const Register offset = r7; 3174 const Register keylen = r11; 3175 3176 const unsigned char block_size = 16; 3177 const int bulk_width = 4; 3178 // NB: bulk_width can be 4 or 8. 8 gives slightly faster 3179 // performance with larger data sizes, but it also means that the 3180 // fast path isn't used until you have at least 8 blocks, and up 3181 // to 127 bytes of data will be executed on the slow path. For 3182 // that reason, and also so as not to blow away too much icache, 4 3183 // blocks seems like a sensible compromise. 3184 3185 // Algorithm: 3186 // 3187 // if (len == 0) { 3188 // goto DONE; 3189 // } 3190 // int result = len; 3191 // do { 3192 // if (used >= blockSize) { 3193 // if (len >= bulk_width * blockSize) { 3194 // CTR_large_block(); 3195 // if (len == 0) 3196 // goto DONE; 3197 // } 3198 // for (;;) { 3199 // 16ByteVector v0 = counter; 3200 // embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0); 3201 // used = 0; 3202 // if (len < blockSize) 3203 // break; /* goto NEXT */ 3204 // 16ByteVector v1 = load16Bytes(in, offset); 3205 // v1 = v1 ^ encryptedCounter; 3206 // store16Bytes(out, offset); 3207 // used = blockSize; 3208 // offset += blockSize; 3209 // len -= blockSize; 3210 // if (len == 0) 3211 // goto DONE; 3212 // } 3213 // } 3214 // NEXT: 3215 // out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]); 3216 // len--; 3217 // } while (len != 0); 3218 // DONE: 3219 // return result; 3220 // 3221 // CTR_large_block() 3222 // Wide bulk encryption of whole blocks. 
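    // The 128-bit big-endian counter increment used throughout (see
    // be_add_128_64 above) is equivalent to the following C on two
    // 64-bit halves, hi being the most significant (illustrative sketch):
    //
    //   void be_ctr_add(uint64_t *hi, uint64_t *lo, uint64_t inc) {
    //     uint64_t old_lo = *lo;
    //     *lo = old_lo + inc;
    //     if (*lo < old_lo) {   // carry out of the low dword
    //       *hi += 1;
    //     }
    //   }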
3223 3224 __ align(CodeEntryAlignment); 3225 StubId stub_id = StubId::stubgen_counterMode_AESCrypt_id; 3226 StubCodeMark mark(this, stub_id); 3227 const address start = __ pc(); 3228 __ enter(); 3229 3230 Label DONE, CTR_large_block, large_block_return; 3231 __ ldrw(used, Address(used_ptr)); 3232 __ cbzw(saved_len, DONE); 3233 3234 __ mov(len, saved_len); 3235 __ mov(offset, 0); 3236 3237 // Compute #rounds for AES based on the length of the key array 3238 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 3239 3240 __ aesenc_loadkeys(key, keylen); 3241 3242 { 3243 Label L_CTR_loop, NEXT; 3244 3245 __ bind(L_CTR_loop); 3246 3247 __ cmp(used, block_size); 3248 __ br(__ LO, NEXT); 3249 3250 // Maybe we have a lot of data 3251 __ subsw(rscratch1, len, bulk_width * block_size); 3252 __ br(__ HS, CTR_large_block); 3253 __ BIND(large_block_return); 3254 __ cbzw(len, DONE); 3255 3256 // Setup the counter 3257 __ movi(v4, __ T4S, 0); 3258 __ movi(v5, __ T4S, 1); 3259 __ ins(v4, __ S, v5, 2, 2); // v4 contains { 0, 1 } 3260 3261 // 128-bit big-endian increment 3262 __ ld1(v0, __ T16B, counter); 3263 __ rev64(v16, __ T16B, v0); 3264 be_add_128_64(v16, v16, v4, /*tmp*/v5); 3265 __ rev64(v16, __ T16B, v16); 3266 __ st1(v16, __ T16B, counter); 3267 // Previous counter value is in v0 3268 // v4 contains { 0, 1 } 3269 3270 { 3271 // We have fewer than bulk_width blocks of data left. Encrypt 3272 // them one by one until there is less than a full block 3273 // remaining, being careful to save both the encrypted counter 3274 // and the counter. 3275 3276 Label inner_loop; 3277 __ bind(inner_loop); 3278 // Counter to encrypt is in v0 3279 __ aesecb_encrypt(noreg, noreg, keylen); 3280 __ st1(v0, __ T16B, saved_encrypted_ctr); 3281 3282 // Do we have a remaining full block? 3283 3284 __ mov(used, 0); 3285 __ cmp(len, block_size); 3286 __ br(__ LO, NEXT); 3287 3288 // Yes, we have a full block 3289 __ ldrq(v1, Address(in, offset)); 3290 __ eor(v1, __ T16B, v1, v0); 3291 __ strq(v1, Address(out, offset)); 3292 __ mov(used, block_size); 3293 __ add(offset, offset, block_size); 3294 3295 __ subw(len, len, block_size); 3296 __ cbzw(len, DONE); 3297 3298 // Increment the counter, store it back 3299 __ orr(v0, __ T16B, v16, v16); 3300 __ rev64(v16, __ T16B, v16); 3301 be_add_128_64(v16, v16, v4, /*tmp*/v5); 3302 __ rev64(v16, __ T16B, v16); 3303 __ st1(v16, __ T16B, counter); // Save the incremented counter back 3304 3305 __ b(inner_loop); 3306 } 3307 3308 __ BIND(NEXT); 3309 3310 // Encrypt a single byte, and loop. 3311 // We expect this to be a rare event. 
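    // In C terms the byte-at-a-time path below is simply (sketch):
    //   out[offset] = in[offset] ^ encrypted_counter[used];
    //   used++; offset++; len--;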
3312 __ ldrb(rscratch1, Address(in, offset)); 3313 __ ldrb(rscratch2, Address(saved_encrypted_ctr, used)); 3314 __ eor(rscratch1, rscratch1, rscratch2); 3315 __ strb(rscratch1, Address(out, offset)); 3316 __ add(offset, offset, 1); 3317 __ add(used, used, 1); 3318 __ subw(len, len,1); 3319 __ cbnzw(len, L_CTR_loop); 3320 } 3321 3322 __ bind(DONE); 3323 __ strw(used, Address(used_ptr)); 3324 __ mov(r0, saved_len); 3325 3326 __ leave(); // required for proper stackwalking of RuntimeStub frame 3327 __ ret(lr); 3328 3329 // Bulk encryption 3330 3331 __ BIND (CTR_large_block); 3332 assert(bulk_width == 4 || bulk_width == 8, "must be"); 3333 3334 if (bulk_width == 8) { 3335 __ sub(sp, sp, 4 * 16); 3336 __ st1(v12, v13, v14, v15, __ T16B, Address(sp)); 3337 } 3338 __ sub(sp, sp, 4 * 16); 3339 __ st1(v8, v9, v10, v11, __ T16B, Address(sp)); 3340 RegSet saved_regs = (RegSet::of(in, out, offset) 3341 + RegSet::of(saved_encrypted_ctr, used_ptr, len)); 3342 __ push(saved_regs, sp); 3343 __ andr(len, len, -16 * bulk_width); // 8/4 encryptions, 16 bytes per encryption 3344 __ add(in, in, offset); 3345 __ add(out, out, offset); 3346 3347 // Keys should already be loaded into the correct registers 3348 3349 __ ld1(v0, __ T16B, counter); // v0 contains the first counter 3350 __ rev64(v16, __ T16B, v0); // v16 contains byte-reversed counter 3351 3352 // AES/CTR loop 3353 { 3354 Label L_CTR_loop; 3355 __ BIND(L_CTR_loop); 3356 3357 // Setup the counters 3358 __ movi(v8, __ T4S, 0); 3359 __ movi(v9, __ T4S, 1); 3360 __ ins(v8, __ S, v9, 2, 2); // v8 contains { 0, 1 } 3361 3362 for (int i = 0; i < bulk_width; i++) { 3363 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i); 3364 __ rev64(v0_ofs, __ T16B, v16); 3365 be_add_128_64(v16, v16, v8, /*tmp*/v9); 3366 } 3367 3368 __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16)); 3369 3370 // Encrypt the counters 3371 __ aesecb_encrypt(noreg, noreg, keylen, v0, bulk_width); 3372 3373 if (bulk_width == 8) { 3374 __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16)); 3375 } 3376 3377 // XOR the encrypted counters with the inputs 3378 for (int i = 0; i < bulk_width; i++) { 3379 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i); 3380 FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i); 3381 __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs); 3382 } 3383 3384 // Write the encrypted data 3385 __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16)); 3386 if (bulk_width == 8) { 3387 __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16)); 3388 } 3389 3390 __ subw(len, len, 16 * bulk_width); 3391 __ cbnzw(len, L_CTR_loop); 3392 } 3393 3394 // Save the counter back where it goes 3395 __ rev64(v16, __ T16B, v16); 3396 __ st1(v16, __ T16B, counter); 3397 3398 __ pop(saved_regs, sp); 3399 3400 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16)); 3401 if (bulk_width == 8) { 3402 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16)); 3403 } 3404 3405 __ andr(rscratch1, len, -16 * bulk_width); 3406 __ sub(len, len, rscratch1); 3407 __ add(offset, offset, rscratch1); 3408 __ mov(used, 16); 3409 __ strw(used, Address(used_ptr)); 3410 __ b(large_block_return); 3411 3412 return start; 3413 } 3414 3415 // Vector AES Galois Counter Mode implementation. 
Parameters: 3416 // 3417 // in = c_rarg0 3418 // len = c_rarg1 3419 // ct = c_rarg2 - ciphertext that ghash will read (in for encrypt, out for decrypt) 3420 // out = c_rarg3 3421 // key = c_rarg4 3422 // state = c_rarg5 - GHASH.state 3423 // subkeyHtbl = c_rarg6 - powers of H 3424 // counter = c_rarg7 - 16 bytes of CTR 3425 // return - number of processed bytes 3426 address generate_galoisCounterMode_AESCrypt() { 3427 address ghash_polynomial = __ pc(); 3428 __ emit_int64(0x87); // The low-order bits of the field 3429 // polynomial (i.e. p = z^7+z^2+z+1) 3430 // repeated in the low and high parts of a 3431 // 128-bit vector 3432 __ emit_int64(0x87); 3433 3434 __ align(CodeEntryAlignment); 3435 StubId stub_id = StubId::stubgen_galoisCounterMode_AESCrypt_id; 3436 StubCodeMark mark(this, stub_id); 3437 address start = __ pc(); 3438 __ enter(); 3439 3440 const Register in = c_rarg0; 3441 const Register len = c_rarg1; 3442 const Register ct = c_rarg2; 3443 const Register out = c_rarg3; 3444 // and updated with the incremented counter in the end 3445 3446 const Register key = c_rarg4; 3447 const Register state = c_rarg5; 3448 3449 const Register subkeyHtbl = c_rarg6; 3450 3451 const Register counter = c_rarg7; 3452 3453 const Register keylen = r10; 3454 // Save state before entering routine 3455 __ sub(sp, sp, 4 * 16); 3456 __ st1(v12, v13, v14, v15, __ T16B, Address(sp)); 3457 __ sub(sp, sp, 4 * 16); 3458 __ st1(v8, v9, v10, v11, __ T16B, Address(sp)); 3459 3460 // __ andr(len, len, -512); 3461 __ andr(len, len, -16 * 8); // 8 encryptions, 16 bytes per encryption 3462 __ str(len, __ pre(sp, -2 * wordSize)); 3463 3464 Label DONE; 3465 __ cbz(len, DONE); 3466 3467 // Compute #rounds for AES based on the length of the key array 3468 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 3469 3470 __ aesenc_loadkeys(key, keylen); 3471 __ ld1(v0, __ T16B, counter); // v0 contains the first counter 3472 __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter 3473 3474 // AES/CTR loop 3475 { 3476 Label L_CTR_loop; 3477 __ BIND(L_CTR_loop); 3478 3479 // Setup the counters 3480 __ movi(v8, __ T4S, 0); 3481 __ movi(v9, __ T4S, 1); 3482 __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 } 3483 3484 assert(v0->encoding() < v8->encoding(), ""); 3485 for (int i = v0->encoding(); i < v8->encoding(); i++) { 3486 FloatRegister f = as_FloatRegister(i); 3487 __ rev32(f, __ T16B, v16); 3488 __ addv(v16, __ T4S, v16, v8); 3489 } 3490 3491 __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16)); 3492 3493 // Encrypt the counters 3494 __ aesecb_encrypt(noreg, noreg, keylen, v0, /*unrolls*/8); 3495 3496 __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16)); 3497 3498 // XOR the encrypted counters with the inputs 3499 for (int i = 0; i < 8; i++) { 3500 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i); 3501 FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i); 3502 __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs); 3503 } 3504 __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16)); 3505 __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16)); 3506 3507 __ subw(len, len, 16 * 8); 3508 __ cbnzw(len, L_CTR_loop); 3509 } 3510 3511 __ rev32(v16, __ T16B, v16); 3512 __ st1(v16, __ T16B, counter); 3513 3514 __ ldr(len, Address(sp)); 3515 __ lsr(len, len, exact_log2(16)); // We want the count of blocks 3516 3517 // GHASH/CTR loop 3518 __ ghash_processBlocks_wide(ghash_polynomial, state, subkeyHtbl, ct, 3519 len, /*unrolls*/4); 3520 3521 #ifdef 
ASSERT 3522 { Label L; 3523 __ cmp(len, (unsigned char)0); 3524 __ br(Assembler::EQ, L); 3525 __ stop("stubGenerator: abort"); 3526 __ bind(L); 3527 } 3528 #endif 3529 3530 __ bind(DONE); 3531 // Return the number of bytes processed 3532 __ ldr(r0, __ post(sp, 2 * wordSize)); 3533 3534 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16)); 3535 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16)); 3536 3537 __ leave(); // required for proper stackwalking of RuntimeStub frame 3538 __ ret(lr); 3539 return start; 3540 } 3541 3542 class Cached64Bytes { 3543 private: 3544 MacroAssembler *_masm; 3545 Register _regs[8]; 3546 3547 public: 3548 Cached64Bytes(MacroAssembler *masm, RegSet rs): _masm(masm) { 3549 assert(rs.size() == 8, "%u registers are used to cache 16 4-byte data", rs.size()); 3550 auto it = rs.begin(); 3551 for (auto &r: _regs) { 3552 r = *it; 3553 ++it; 3554 } 3555 } 3556 3557 void gen_loads(Register base) { 3558 for (int i = 0; i < 8; i += 2) { 3559 __ ldp(_regs[i], _regs[i + 1], Address(base, 8 * i)); 3560 } 3561 } 3562 3563 // Generate code extracting i-th unsigned word (4 bytes) from cached 64 bytes. 3564 void extract_u32(Register dest, int i) { 3565 __ ubfx(dest, _regs[i / 2], 32 * (i % 2), 32); 3566 } 3567 }; 3568 3569 // Utility routines for md5. 3570 // Clobbers r10 and r11. 3571 void md5_FF(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4, 3572 int k, int s, int t) { 3573 Register rscratch3 = r10; 3574 Register rscratch4 = r11; 3575 3576 __ eorw(rscratch3, r3, r4); 3577 __ movw(rscratch2, t); 3578 __ andw(rscratch3, rscratch3, r2); 3579 __ addw(rscratch4, r1, rscratch2); 3580 reg_cache.extract_u32(rscratch1, k); 3581 __ eorw(rscratch3, rscratch3, r4); 3582 __ addw(rscratch4, rscratch4, rscratch1); 3583 __ addw(rscratch3, rscratch3, rscratch4); 3584 __ rorw(rscratch2, rscratch3, 32 - s); 3585 __ addw(r1, rscratch2, r2); 3586 } 3587 3588 void md5_GG(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4, 3589 int k, int s, int t) { 3590 Register rscratch3 = r10; 3591 Register rscratch4 = r11; 3592 3593 reg_cache.extract_u32(rscratch1, k); 3594 __ movw(rscratch2, t); 3595 __ addw(rscratch4, r1, rscratch2); 3596 __ addw(rscratch4, rscratch4, rscratch1); 3597 __ bicw(rscratch2, r3, r4); 3598 __ andw(rscratch3, r2, r4); 3599 __ addw(rscratch2, rscratch2, rscratch4); 3600 __ addw(rscratch2, rscratch2, rscratch3); 3601 __ rorw(rscratch2, rscratch2, 32 - s); 3602 __ addw(r1, rscratch2, r2); 3603 } 3604 3605 void md5_HH(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4, 3606 int k, int s, int t) { 3607 Register rscratch3 = r10; 3608 Register rscratch4 = r11; 3609 3610 __ eorw(rscratch3, r3, r4); 3611 __ movw(rscratch2, t); 3612 __ addw(rscratch4, r1, rscratch2); 3613 reg_cache.extract_u32(rscratch1, k); 3614 __ eorw(rscratch3, rscratch3, r2); 3615 __ addw(rscratch4, rscratch4, rscratch1); 3616 __ addw(rscratch3, rscratch3, rscratch4); 3617 __ rorw(rscratch2, rscratch3, 32 - s); 3618 __ addw(r1, rscratch2, r2); 3619 } 3620 3621 void md5_II(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4, 3622 int k, int s, int t) { 3623 Register rscratch3 = r10; 3624 Register rscratch4 = r11; 3625 3626 __ movw(rscratch3, t); 3627 __ ornw(rscratch2, r2, r4); 3628 __ addw(rscratch4, r1, rscratch3); 3629 reg_cache.extract_u32(rscratch1, k); 3630 __ eorw(rscratch3, rscratch2, r3); 3631 __ addw(rscratch4, rscratch4, rscratch1); 3632 __ addw(rscratch3, rscratch3, rscratch4); 3633 __ 
rorw(rscratch2, rscratch3, 32 - s); 3634 __ addw(r1, rscratch2, r2); 3635 } 3636 3637 // Arguments: 3638 // 3639 // Inputs: 3640 // c_rarg0 - byte[] source+offset 3641 // c_rarg1 - int[] SHA.state 3642 // c_rarg2 - int offset 3643 // c_rarg3 - int limit 3644 // 3645 address generate_md5_implCompress(StubId stub_id) { 3646 bool multi_block; 3647 switch (stub_id) { 3648 case StubId::stubgen_md5_implCompress_id: 3649 multi_block = false; 3650 break; 3651 case StubId::stubgen_md5_implCompressMB_id: 3652 multi_block = true; 3653 break; 3654 default: 3655 ShouldNotReachHere(); 3656 } 3657 __ align(CodeEntryAlignment); 3658 3659 StubCodeMark mark(this, stub_id); 3660 address start = __ pc(); 3661 3662 Register buf = c_rarg0; 3663 Register state = c_rarg1; 3664 Register ofs = c_rarg2; 3665 Register limit = c_rarg3; 3666 Register a = r4; 3667 Register b = r5; 3668 Register c = r6; 3669 Register d = r7; 3670 Register rscratch3 = r10; 3671 Register rscratch4 = r11; 3672 3673 Register state_regs[2] = { r12, r13 }; 3674 RegSet saved_regs = RegSet::range(r16, r22) - r18_tls; 3675 Cached64Bytes reg_cache(_masm, RegSet::of(r14, r15) + saved_regs); // using 8 registers 3676 3677 __ push(saved_regs, sp); 3678 3679 __ ldp(state_regs[0], state_regs[1], Address(state)); 3680 __ ubfx(a, state_regs[0], 0, 32); 3681 __ ubfx(b, state_regs[0], 32, 32); 3682 __ ubfx(c, state_regs[1], 0, 32); 3683 __ ubfx(d, state_regs[1], 32, 32); 3684 3685 Label md5_loop; 3686 __ BIND(md5_loop); 3687 3688 reg_cache.gen_loads(buf); 3689 3690 // Round 1 3691 md5_FF(reg_cache, a, b, c, d, 0, 7, 0xd76aa478); 3692 md5_FF(reg_cache, d, a, b, c, 1, 12, 0xe8c7b756); 3693 md5_FF(reg_cache, c, d, a, b, 2, 17, 0x242070db); 3694 md5_FF(reg_cache, b, c, d, a, 3, 22, 0xc1bdceee); 3695 md5_FF(reg_cache, a, b, c, d, 4, 7, 0xf57c0faf); 3696 md5_FF(reg_cache, d, a, b, c, 5, 12, 0x4787c62a); 3697 md5_FF(reg_cache, c, d, a, b, 6, 17, 0xa8304613); 3698 md5_FF(reg_cache, b, c, d, a, 7, 22, 0xfd469501); 3699 md5_FF(reg_cache, a, b, c, d, 8, 7, 0x698098d8); 3700 md5_FF(reg_cache, d, a, b, c, 9, 12, 0x8b44f7af); 3701 md5_FF(reg_cache, c, d, a, b, 10, 17, 0xffff5bb1); 3702 md5_FF(reg_cache, b, c, d, a, 11, 22, 0x895cd7be); 3703 md5_FF(reg_cache, a, b, c, d, 12, 7, 0x6b901122); 3704 md5_FF(reg_cache, d, a, b, c, 13, 12, 0xfd987193); 3705 md5_FF(reg_cache, c, d, a, b, 14, 17, 0xa679438e); 3706 md5_FF(reg_cache, b, c, d, a, 15, 22, 0x49b40821); 3707 3708 // Round 2 3709 md5_GG(reg_cache, a, b, c, d, 1, 5, 0xf61e2562); 3710 md5_GG(reg_cache, d, a, b, c, 6, 9, 0xc040b340); 3711 md5_GG(reg_cache, c, d, a, b, 11, 14, 0x265e5a51); 3712 md5_GG(reg_cache, b, c, d, a, 0, 20, 0xe9b6c7aa); 3713 md5_GG(reg_cache, a, b, c, d, 5, 5, 0xd62f105d); 3714 md5_GG(reg_cache, d, a, b, c, 10, 9, 0x02441453); 3715 md5_GG(reg_cache, c, d, a, b, 15, 14, 0xd8a1e681); 3716 md5_GG(reg_cache, b, c, d, a, 4, 20, 0xe7d3fbc8); 3717 md5_GG(reg_cache, a, b, c, d, 9, 5, 0x21e1cde6); 3718 md5_GG(reg_cache, d, a, b, c, 14, 9, 0xc33707d6); 3719 md5_GG(reg_cache, c, d, a, b, 3, 14, 0xf4d50d87); 3720 md5_GG(reg_cache, b, c, d, a, 8, 20, 0x455a14ed); 3721 md5_GG(reg_cache, a, b, c, d, 13, 5, 0xa9e3e905); 3722 md5_GG(reg_cache, d, a, b, c, 2, 9, 0xfcefa3f8); 3723 md5_GG(reg_cache, c, d, a, b, 7, 14, 0x676f02d9); 3724 md5_GG(reg_cache, b, c, d, a, 12, 20, 0x8d2a4c8a); 3725 3726 // Round 3 3727 md5_HH(reg_cache, a, b, c, d, 5, 4, 0xfffa3942); 3728 md5_HH(reg_cache, d, a, b, c, 8, 11, 0x8771f681); 3729 md5_HH(reg_cache, c, d, a, b, 11, 16, 0x6d9d6122); 3730 md5_HH(reg_cache, b, c, d, a, 14, 23, 
0xfde5380c); 3731 md5_HH(reg_cache, a, b, c, d, 1, 4, 0xa4beea44); 3732 md5_HH(reg_cache, d, a, b, c, 4, 11, 0x4bdecfa9); 3733 md5_HH(reg_cache, c, d, a, b, 7, 16, 0xf6bb4b60); 3734 md5_HH(reg_cache, b, c, d, a, 10, 23, 0xbebfbc70); 3735 md5_HH(reg_cache, a, b, c, d, 13, 4, 0x289b7ec6); 3736 md5_HH(reg_cache, d, a, b, c, 0, 11, 0xeaa127fa); 3737 md5_HH(reg_cache, c, d, a, b, 3, 16, 0xd4ef3085); 3738 md5_HH(reg_cache, b, c, d, a, 6, 23, 0x04881d05); 3739 md5_HH(reg_cache, a, b, c, d, 9, 4, 0xd9d4d039); 3740 md5_HH(reg_cache, d, a, b, c, 12, 11, 0xe6db99e5); 3741 md5_HH(reg_cache, c, d, a, b, 15, 16, 0x1fa27cf8); 3742 md5_HH(reg_cache, b, c, d, a, 2, 23, 0xc4ac5665); 3743 3744 // Round 4 3745 md5_II(reg_cache, a, b, c, d, 0, 6, 0xf4292244); 3746 md5_II(reg_cache, d, a, b, c, 7, 10, 0x432aff97); 3747 md5_II(reg_cache, c, d, a, b, 14, 15, 0xab9423a7); 3748 md5_II(reg_cache, b, c, d, a, 5, 21, 0xfc93a039); 3749 md5_II(reg_cache, a, b, c, d, 12, 6, 0x655b59c3); 3750 md5_II(reg_cache, d, a, b, c, 3, 10, 0x8f0ccc92); 3751 md5_II(reg_cache, c, d, a, b, 10, 15, 0xffeff47d); 3752 md5_II(reg_cache, b, c, d, a, 1, 21, 0x85845dd1); 3753 md5_II(reg_cache, a, b, c, d, 8, 6, 0x6fa87e4f); 3754 md5_II(reg_cache, d, a, b, c, 15, 10, 0xfe2ce6e0); 3755 md5_II(reg_cache, c, d, a, b, 6, 15, 0xa3014314); 3756 md5_II(reg_cache, b, c, d, a, 13, 21, 0x4e0811a1); 3757 md5_II(reg_cache, a, b, c, d, 4, 6, 0xf7537e82); 3758 md5_II(reg_cache, d, a, b, c, 11, 10, 0xbd3af235); 3759 md5_II(reg_cache, c, d, a, b, 2, 15, 0x2ad7d2bb); 3760 md5_II(reg_cache, b, c, d, a, 9, 21, 0xeb86d391); 3761 3762 __ addw(a, state_regs[0], a); 3763 __ ubfx(rscratch2, state_regs[0], 32, 32); 3764 __ addw(b, rscratch2, b); 3765 __ addw(c, state_regs[1], c); 3766 __ ubfx(rscratch4, state_regs[1], 32, 32); 3767 __ addw(d, rscratch4, d); 3768 3769 __ orr(state_regs[0], a, b, Assembler::LSL, 32); 3770 __ orr(state_regs[1], c, d, Assembler::LSL, 32); 3771 3772 if (multi_block) { 3773 __ add(buf, buf, 64); 3774 __ add(ofs, ofs, 64); 3775 __ cmp(ofs, limit); 3776 __ br(Assembler::LE, md5_loop); 3777 __ mov(c_rarg0, ofs); // return ofs 3778 } 3779 3780 // write hash values back in the correct order 3781 __ stp(state_regs[0], state_regs[1], Address(state)); 3782 3783 __ pop(saved_regs, sp); 3784 3785 __ ret(lr); 3786 3787 return start; 3788 } 3789 3790 // Arguments: 3791 // 3792 // Inputs: 3793 // c_rarg0 - byte[] source+offset 3794 // c_rarg1 - int[] SHA.state 3795 // c_rarg2 - int offset 3796 // c_rarg3 - int limit 3797 // 3798 address generate_sha1_implCompress(StubId stub_id) { 3799 bool multi_block; 3800 switch (stub_id) { 3801 case StubId::stubgen_sha1_implCompress_id: 3802 multi_block = false; 3803 break; 3804 case StubId::stubgen_sha1_implCompressMB_id: 3805 multi_block = true; 3806 break; 3807 default: 3808 ShouldNotReachHere(); 3809 } 3810 3811 __ align(CodeEntryAlignment); 3812 3813 StubCodeMark mark(this, stub_id); 3814 address start = __ pc(); 3815 3816 Register buf = c_rarg0; 3817 Register state = c_rarg1; 3818 Register ofs = c_rarg2; 3819 Register limit = c_rarg3; 3820 3821 Label keys; 3822 Label sha1_loop; 3823 3824 // load the keys into v0..v3 3825 __ adr(rscratch1, keys); 3826 __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1)); 3827 // load 5 words state into v6, v7 3828 __ ldrq(v6, Address(state, 0)); 3829 __ ldrs(v7, Address(state, 16)); 3830 3831 3832 __ BIND(sha1_loop); 3833 // load 64 bytes of data into v16..v19 3834 __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? 
__ post(buf, 64) : buf); 3835 __ rev32(v16, __ T16B, v16); 3836 __ rev32(v17, __ T16B, v17); 3837 __ rev32(v18, __ T16B, v18); 3838 __ rev32(v19, __ T16B, v19); 3839 3840 // do the sha1 3841 __ addv(v4, __ T4S, v16, v0); 3842 __ orr(v20, __ T16B, v6, v6); 3843 3844 FloatRegister d0 = v16; 3845 FloatRegister d1 = v17; 3846 FloatRegister d2 = v18; 3847 FloatRegister d3 = v19; 3848 3849 for (int round = 0; round < 20; round++) { 3850 FloatRegister tmp1 = (round & 1) ? v4 : v5; 3851 FloatRegister tmp2 = (round & 1) ? v21 : v22; 3852 FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7; 3853 FloatRegister tmp4 = (round & 1) ? v5 : v4; 3854 FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3)); 3855 3856 if (round < 16) __ sha1su0(d0, __ T4S, d1, d2); 3857 if (round < 19) __ addv(tmp1, __ T4S, d1, key); 3858 __ sha1h(tmp2, __ T4S, v20); 3859 if (round < 5) 3860 __ sha1c(v20, __ T4S, tmp3, tmp4); 3861 else if (round < 10 || round >= 15) 3862 __ sha1p(v20, __ T4S, tmp3, tmp4); 3863 else 3864 __ sha1m(v20, __ T4S, tmp3, tmp4); 3865 if (round < 16) __ sha1su1(d0, __ T4S, d3); 3866 3867 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 3868 } 3869 3870 __ addv(v7, __ T2S, v7, v21); 3871 __ addv(v6, __ T4S, v6, v20); 3872 3873 if (multi_block) { 3874 __ add(ofs, ofs, 64); 3875 __ cmp(ofs, limit); 3876 __ br(Assembler::LE, sha1_loop); 3877 __ mov(c_rarg0, ofs); // return ofs 3878 } 3879 3880 __ strq(v6, Address(state, 0)); 3881 __ strs(v7, Address(state, 16)); 3882 3883 __ ret(lr); 3884 3885 __ bind(keys); 3886 __ emit_int32(0x5a827999); 3887 __ emit_int32(0x6ed9eba1); 3888 __ emit_int32(0x8f1bbcdc); 3889 __ emit_int32(0xca62c1d6); 3890 3891 return start; 3892 } 3893 3894 3895 // Arguments: 3896 // 3897 // Inputs: 3898 // c_rarg0 - byte[] source+offset 3899 // c_rarg1 - int[] SHA.state 3900 // c_rarg2 - int offset 3901 // c_rarg3 - int limit 3902 // 3903 address generate_sha256_implCompress(StubId stub_id) { 3904 bool multi_block; 3905 switch (stub_id) { 3906 case StubId::stubgen_sha256_implCompress_id: 3907 multi_block = false; 3908 break; 3909 case StubId::stubgen_sha256_implCompressMB_id: 3910 multi_block = true; 3911 break; 3912 default: 3913 ShouldNotReachHere(); 3914 } 3915 3916 static const uint32_t round_consts[64] = { 3917 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 3918 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, 3919 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 3920 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 3921 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 3922 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, 3923 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 3924 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, 3925 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 3926 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 3927 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 3928 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, 3929 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 3930 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, 3931 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 3932 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, 3933 }; 3934 3935 __ align(CodeEntryAlignment); 3936 3937 StubCodeMark mark(this, stub_id); 3938 address start = __ pc(); 3939 3940 Register buf = c_rarg0; 3941 Register state = c_rarg1; 3942 Register ofs = c_rarg2; 3943 Register limit = c_rarg3; 3944 3945 Label sha1_loop; 3946 3947 __ stpd(v8, v9, __ pre(sp, -32)); 3948 __ stpd(v10, v11, Address(sp, 16)); 3949 3950 // dga == v0 3951 // dgb == v1 3952 // dg0 == 
v2 3953 // dg1 == v3 3954 // dg2 == v4 3955 // t0 == v6 3956 // t1 == v7 3957 3958 // load 16 keys to v16..v31 3959 __ lea(rscratch1, ExternalAddress((address)round_consts)); 3960 __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64)); 3961 __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64)); 3962 __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64)); 3963 __ ld1(v28, v29, v30, v31, __ T4S, rscratch1); 3964 3965 // load 8 words (256 bits) state 3966 __ ldpq(v0, v1, state); 3967 3968 __ BIND(sha1_loop); 3969 // load 64 bytes of data into v8..v11 3970 __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf); 3971 __ rev32(v8, __ T16B, v8); 3972 __ rev32(v9, __ T16B, v9); 3973 __ rev32(v10, __ T16B, v10); 3974 __ rev32(v11, __ T16B, v11); 3975 3976 __ addv(v6, __ T4S, v8, v16); 3977 __ orr(v2, __ T16B, v0, v0); 3978 __ orr(v3, __ T16B, v1, v1); 3979 3980 FloatRegister d0 = v8; 3981 FloatRegister d1 = v9; 3982 FloatRegister d2 = v10; 3983 FloatRegister d3 = v11; 3984 3985 3986 for (int round = 0; round < 16; round++) { 3987 FloatRegister tmp1 = (round & 1) ? v6 : v7; 3988 FloatRegister tmp2 = (round & 1) ? v7 : v6; 3989 FloatRegister tmp3 = (round & 1) ? v2 : v4; 3990 FloatRegister tmp4 = (round & 1) ? v4 : v2; 3991 3992 if (round < 12) __ sha256su0(d0, __ T4S, d1); 3993 __ orr(v4, __ T16B, v2, v2); 3994 if (round < 15) 3995 __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17)); 3996 __ sha256h(v2, __ T4S, v3, tmp2); 3997 __ sha256h2(v3, __ T4S, v4, tmp2); 3998 if (round < 12) __ sha256su1(d0, __ T4S, d2, d3); 3999 4000 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 4001 } 4002 4003 __ addv(v0, __ T4S, v0, v2); 4004 __ addv(v1, __ T4S, v1, v3); 4005 4006 if (multi_block) { 4007 __ add(ofs, ofs, 64); 4008 __ cmp(ofs, limit); 4009 __ br(Assembler::LE, sha1_loop); 4010 __ mov(c_rarg0, ofs); // return ofs 4011 } 4012 4013 __ ldpd(v10, v11, Address(sp, 16)); 4014 __ ldpd(v8, v9, __ post(sp, 32)); 4015 4016 __ stpq(v0, v1, state); 4017 4018 __ ret(lr); 4019 4020 return start; 4021 } 4022 4023 // Double rounds for sha512. 
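  // Each invocation processes two of the 80 SHA-512 rounds using the
  // sha512h/sha512h2 instructions: the round-constant pair in vrc0 is
  // added to the schedule words in vin0, the working state in vi0..vi4
  // is updated and, while dr < 32, the message schedule is extended
  // with sha512su0/sha512su1. While dr < 36 the next round-constant
  // pair is also prefetched into vrc1 from the pointer in rscratch2.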
4024 void sha512_dround(int dr, 4025 FloatRegister vi0, FloatRegister vi1, 4026 FloatRegister vi2, FloatRegister vi3, 4027 FloatRegister vi4, FloatRegister vrc0, 4028 FloatRegister vrc1, FloatRegister vin0, 4029 FloatRegister vin1, FloatRegister vin2, 4030 FloatRegister vin3, FloatRegister vin4) { 4031 if (dr < 36) { 4032 __ ld1(vrc1, __ T2D, __ post(rscratch2, 16)); 4033 } 4034 __ addv(v5, __ T2D, vrc0, vin0); 4035 __ ext(v6, __ T16B, vi2, vi3, 8); 4036 __ ext(v5, __ T16B, v5, v5, 8); 4037 __ ext(v7, __ T16B, vi1, vi2, 8); 4038 __ addv(vi3, __ T2D, vi3, v5); 4039 if (dr < 32) { 4040 __ ext(v5, __ T16B, vin3, vin4, 8); 4041 __ sha512su0(vin0, __ T2D, vin1); 4042 } 4043 __ sha512h(vi3, __ T2D, v6, v7); 4044 if (dr < 32) { 4045 __ sha512su1(vin0, __ T2D, vin2, v5); 4046 } 4047 __ addv(vi4, __ T2D, vi1, vi3); 4048 __ sha512h2(vi3, __ T2D, vi1, vi0); 4049 } 4050 4051 // Arguments: 4052 // 4053 // Inputs: 4054 // c_rarg0 - byte[] source+offset 4055 // c_rarg1 - int[] SHA.state 4056 // c_rarg2 - int offset 4057 // c_rarg3 - int limit 4058 // 4059 address generate_sha512_implCompress(StubId stub_id) { 4060 bool multi_block; 4061 switch (stub_id) { 4062 case StubId::stubgen_sha512_implCompress_id: 4063 multi_block = false; 4064 break; 4065 case StubId::stubgen_sha512_implCompressMB_id: 4066 multi_block = true; 4067 break; 4068 default: 4069 ShouldNotReachHere(); 4070 } 4071 4072 static const uint64_t round_consts[80] = { 4073 0x428A2F98D728AE22L, 0x7137449123EF65CDL, 0xB5C0FBCFEC4D3B2FL, 4074 0xE9B5DBA58189DBBCL, 0x3956C25BF348B538L, 0x59F111F1B605D019L, 4075 0x923F82A4AF194F9BL, 0xAB1C5ED5DA6D8118L, 0xD807AA98A3030242L, 4076 0x12835B0145706FBEL, 0x243185BE4EE4B28CL, 0x550C7DC3D5FFB4E2L, 4077 0x72BE5D74F27B896FL, 0x80DEB1FE3B1696B1L, 0x9BDC06A725C71235L, 4078 0xC19BF174CF692694L, 0xE49B69C19EF14AD2L, 0xEFBE4786384F25E3L, 4079 0x0FC19DC68B8CD5B5L, 0x240CA1CC77AC9C65L, 0x2DE92C6F592B0275L, 4080 0x4A7484AA6EA6E483L, 0x5CB0A9DCBD41FBD4L, 0x76F988DA831153B5L, 4081 0x983E5152EE66DFABL, 0xA831C66D2DB43210L, 0xB00327C898FB213FL, 4082 0xBF597FC7BEEF0EE4L, 0xC6E00BF33DA88FC2L, 0xD5A79147930AA725L, 4083 0x06CA6351E003826FL, 0x142929670A0E6E70L, 0x27B70A8546D22FFCL, 4084 0x2E1B21385C26C926L, 0x4D2C6DFC5AC42AEDL, 0x53380D139D95B3DFL, 4085 0x650A73548BAF63DEL, 0x766A0ABB3C77B2A8L, 0x81C2C92E47EDAEE6L, 4086 0x92722C851482353BL, 0xA2BFE8A14CF10364L, 0xA81A664BBC423001L, 4087 0xC24B8B70D0F89791L, 0xC76C51A30654BE30L, 0xD192E819D6EF5218L, 4088 0xD69906245565A910L, 0xF40E35855771202AL, 0x106AA07032BBD1B8L, 4089 0x19A4C116B8D2D0C8L, 0x1E376C085141AB53L, 0x2748774CDF8EEB99L, 4090 0x34B0BCB5E19B48A8L, 0x391C0CB3C5C95A63L, 0x4ED8AA4AE3418ACBL, 4091 0x5B9CCA4F7763E373L, 0x682E6FF3D6B2B8A3L, 0x748F82EE5DEFB2FCL, 4092 0x78A5636F43172F60L, 0x84C87814A1F0AB72L, 0x8CC702081A6439ECL, 4093 0x90BEFFFA23631E28L, 0xA4506CEBDE82BDE9L, 0xBEF9A3F7B2C67915L, 4094 0xC67178F2E372532BL, 0xCA273ECEEA26619CL, 0xD186B8C721C0C207L, 4095 0xEADA7DD6CDE0EB1EL, 0xF57D4F7FEE6ED178L, 0x06F067AA72176FBAL, 4096 0x0A637DC5A2C898A6L, 0x113F9804BEF90DAEL, 0x1B710B35131C471BL, 4097 0x28DB77F523047D84L, 0x32CAAB7B40C72493L, 0x3C9EBE0A15C9BEBCL, 4098 0x431D67C49C100D4CL, 0x4CC5D4BECB3E42B6L, 0x597F299CFC657E2AL, 4099 0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L 4100 }; 4101 4102 __ align(CodeEntryAlignment); 4103 4104 StubCodeMark mark(this, stub_id); 4105 address start = __ pc(); 4106 4107 Register buf = c_rarg0; 4108 Register state = c_rarg1; 4109 Register ofs = c_rarg2; 4110 Register limit = c_rarg3; 4111 4112 __ stpd(v8, v9, __ pre(sp, -64)); 4113 __ 
stpd(v10, v11, Address(sp, 16)); 4114 __ stpd(v12, v13, Address(sp, 32)); 4115 __ stpd(v14, v15, Address(sp, 48)); 4116 4117 Label sha512_loop; 4118 4119 // load state 4120 __ ld1(v8, v9, v10, v11, __ T2D, state); 4121 4122 // load first 4 round constants 4123 __ lea(rscratch1, ExternalAddress((address)round_consts)); 4124 __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64)); 4125 4126 __ BIND(sha512_loop); 4127 // load 128B of data into v12..v19 4128 __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64)); 4129 __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64)); 4130 __ rev64(v12, __ T16B, v12); 4131 __ rev64(v13, __ T16B, v13); 4132 __ rev64(v14, __ T16B, v14); 4133 __ rev64(v15, __ T16B, v15); 4134 __ rev64(v16, __ T16B, v16); 4135 __ rev64(v17, __ T16B, v17); 4136 __ rev64(v18, __ T16B, v18); 4137 __ rev64(v19, __ T16B, v19); 4138 4139 __ mov(rscratch2, rscratch1); 4140 4141 __ mov(v0, __ T16B, v8); 4142 __ mov(v1, __ T16B, v9); 4143 __ mov(v2, __ T16B, v10); 4144 __ mov(v3, __ T16B, v11); 4145 4146 sha512_dround( 0, v0, v1, v2, v3, v4, v20, v24, v12, v13, v19, v16, v17); 4147 sha512_dround( 1, v3, v0, v4, v2, v1, v21, v25, v13, v14, v12, v17, v18); 4148 sha512_dround( 2, v2, v3, v1, v4, v0, v22, v26, v14, v15, v13, v18, v19); 4149 sha512_dround( 3, v4, v2, v0, v1, v3, v23, v27, v15, v16, v14, v19, v12); 4150 sha512_dround( 4, v1, v4, v3, v0, v2, v24, v28, v16, v17, v15, v12, v13); 4151 sha512_dround( 5, v0, v1, v2, v3, v4, v25, v29, v17, v18, v16, v13, v14); 4152 sha512_dround( 6, v3, v0, v4, v2, v1, v26, v30, v18, v19, v17, v14, v15); 4153 sha512_dround( 7, v2, v3, v1, v4, v0, v27, v31, v19, v12, v18, v15, v16); 4154 sha512_dround( 8, v4, v2, v0, v1, v3, v28, v24, v12, v13, v19, v16, v17); 4155 sha512_dround( 9, v1, v4, v3, v0, v2, v29, v25, v13, v14, v12, v17, v18); 4156 sha512_dround(10, v0, v1, v2, v3, v4, v30, v26, v14, v15, v13, v18, v19); 4157 sha512_dround(11, v3, v0, v4, v2, v1, v31, v27, v15, v16, v14, v19, v12); 4158 sha512_dround(12, v2, v3, v1, v4, v0, v24, v28, v16, v17, v15, v12, v13); 4159 sha512_dround(13, v4, v2, v0, v1, v3, v25, v29, v17, v18, v16, v13, v14); 4160 sha512_dround(14, v1, v4, v3, v0, v2, v26, v30, v18, v19, v17, v14, v15); 4161 sha512_dround(15, v0, v1, v2, v3, v4, v27, v31, v19, v12, v18, v15, v16); 4162 sha512_dround(16, v3, v0, v4, v2, v1, v28, v24, v12, v13, v19, v16, v17); 4163 sha512_dround(17, v2, v3, v1, v4, v0, v29, v25, v13, v14, v12, v17, v18); 4164 sha512_dround(18, v4, v2, v0, v1, v3, v30, v26, v14, v15, v13, v18, v19); 4165 sha512_dround(19, v1, v4, v3, v0, v2, v31, v27, v15, v16, v14, v19, v12); 4166 sha512_dround(20, v0, v1, v2, v3, v4, v24, v28, v16, v17, v15, v12, v13); 4167 sha512_dround(21, v3, v0, v4, v2, v1, v25, v29, v17, v18, v16, v13, v14); 4168 sha512_dround(22, v2, v3, v1, v4, v0, v26, v30, v18, v19, v17, v14, v15); 4169 sha512_dround(23, v4, v2, v0, v1, v3, v27, v31, v19, v12, v18, v15, v16); 4170 sha512_dround(24, v1, v4, v3, v0, v2, v28, v24, v12, v13, v19, v16, v17); 4171 sha512_dround(25, v0, v1, v2, v3, v4, v29, v25, v13, v14, v12, v17, v18); 4172 sha512_dround(26, v3, v0, v4, v2, v1, v30, v26, v14, v15, v13, v18, v19); 4173 sha512_dround(27, v2, v3, v1, v4, v0, v31, v27, v15, v16, v14, v19, v12); 4174 sha512_dround(28, v4, v2, v0, v1, v3, v24, v28, v16, v17, v15, v12, v13); 4175 sha512_dround(29, v1, v4, v3, v0, v2, v25, v29, v17, v18, v16, v13, v14); 4176 sha512_dround(30, v0, v1, v2, v3, v4, v26, v30, v18, v19, v17, v14, v15); 4177 sha512_dround(31, v3, v0, v4, v2, v1, v27, v31, v19, v12, v18, v15, v16); 4178 
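    // From double round 32 onward the message schedule is fully consumed:
    // sha512_dround() skips the sha512su0/sha512su1 updates for dr >= 32 and
    // stops loading round constants for dr >= 36, so the trailing v0
    // arguments below are placeholders that those calls ignore.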
sha512_dround(32, v2, v3, v1, v4, v0, v28, v24, v12, v0, v0, v0, v0);
4179     sha512_dround(33, v4, v2, v0, v1, v3, v29, v25, v13, v0, v0, v0, v0);
4180     sha512_dround(34, v1, v4, v3, v0, v2, v30, v26, v14, v0, v0, v0, v0);
4181     sha512_dround(35, v0, v1, v2, v3, v4, v31, v27, v15, v0, v0, v0, v0);
4182     sha512_dround(36, v3, v0, v4, v2, v1, v24, v0, v16, v0, v0, v0, v0);
4183     sha512_dround(37, v2, v3, v1, v4, v0, v25, v0, v17, v0, v0, v0, v0);
4184     sha512_dround(38, v4, v2, v0, v1, v3, v26, v0, v18, v0, v0, v0, v0);
4185     sha512_dround(39, v1, v4, v3, v0, v2, v27, v0, v19, v0, v0, v0, v0);
4186
4187     __ addv(v8, __ T2D, v8, v0);
4188     __ addv(v9, __ T2D, v9, v1);
4189     __ addv(v10, __ T2D, v10, v2);
4190     __ addv(v11, __ T2D, v11, v3);
4191
4192     if (multi_block) {
4193       __ add(ofs, ofs, 128);
4194       __ cmp(ofs, limit);
4195       __ br(Assembler::LE, sha512_loop);
4196       __ mov(c_rarg0, ofs); // return ofs
4197     }
4198
4199     __ st1(v8, v9, v10, v11, __ T2D, state);
4200
4201     __ ldpd(v14, v15, Address(sp, 48));
4202     __ ldpd(v12, v13, Address(sp, 32));
4203     __ ldpd(v10, v11, Address(sp, 16));
4204     __ ldpd(v8, v9, __ post(sp, 64));
4205
4206     __ ret(lr);
4207
4208     return start;
4209   }
4210
4211   // Execute one round of keccak for two computations in parallel.
4212   // One of the states should be loaded into the lower halves of
4213   // the vector registers v0-v24, the other should be loaded into
4214   // the upper halves of those registers. The ld1r instruction loads
4215   // the round constant into both halves of register v31.
4216   // Intermediate results c0...c4 and d0...d4 are computed
4217   // in registers v25...v30.
4218   // All vector instructions that are used operate on both register
4219   // halves in parallel.
4220   // If only a single computation is needed, it is enough to load only the lower halves.
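  //
  // For reference, the transformation generated below is one round of
  // Keccak-f[1600] over the 5x5 lane state a[x][y] (standard notation;
  // the aNN names in the comments use the linear index 5*y + x):
  //
  //   theta:  c[x] = a[x][0] ^ a[x][1] ^ a[x][2] ^ a[x][3] ^ a[x][4]
  //           d[x] = c[x-1] ^ rol64(c[x+1], 1)            (indices mod 5)
  //           a[x][y] ^= d[x]
  //   rho/pi: b[y][2*x + 3*y] = rol64(a[x][y], r[x][y])
  //   chi:    a[x][y] = b[x][y] ^ (~b[x+1][y] & b[x+2][y])
  //   iota:   a[0][0] ^= round_constant
  //
  // The eor3/rax1 instructions implement theta, the xar instructions the
  // combined rho/pi rotations, bcax implements chi, and the final eor
  // with v31 is iota.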
4221   void keccak_round(Register rscratch1) {
4222     __ eor3(v29, __ T16B, v4, v9, v14); // c4 = a4 ^ a9 ^ a14
4223     __ eor3(v26, __ T16B, v1, v6, v11); // c1 = a1 ^ a6 ^ a11
4224     __ eor3(v28, __ T16B, v3, v8, v13); // c3 = a3 ^ a8 ^ a13
4225     __ eor3(v25, __ T16B, v0, v5, v10); // c0 = a0 ^ a5 ^ a10
4226     __ eor3(v27, __ T16B, v2, v7, v12); // c2 = a2 ^ a7 ^ a12
4227     __ eor3(v29, __ T16B, v29, v19, v24); // c4 ^= a19 ^ a24
4228     __ eor3(v26, __ T16B, v26, v16, v21); // c1 ^= a16 ^ a21
4229     __ eor3(v28, __ T16B, v28, v18, v23); // c3 ^= a18 ^ a23
4230     __ eor3(v25, __ T16B, v25, v15, v20); // c0 ^= a15 ^ a20
4231     __ eor3(v27, __ T16B, v27, v17, v22); // c2 ^= a17 ^ a22
4232
4233     __ rax1(v30, __ T2D, v29, v26); // d0 = c4 ^ rol(c1, 1)
4234     __ rax1(v26, __ T2D, v26, v28); // d2 = c1 ^ rol(c3, 1)
4235     __ rax1(v28, __ T2D, v28, v25); // d4 = c3 ^ rol(c0, 1)
4236     __ rax1(v25, __ T2D, v25, v27); // d1 = c0 ^ rol(c2, 1)
4237     __ rax1(v27, __ T2D, v27, v29); // d3 = c2 ^ rol(c4, 1)
4238
4239     __ eor(v0, __ T16B, v0, v30); // a0 = a0 ^ d0
4240     __ xar(v29, __ T2D, v1, v25, (64 - 1)); // a10' = rol((a1^d1), 1)
4241     __ xar(v1, __ T2D, v6, v25, (64 - 44)); // a1 = rol((a6^d1), 44)
4242     __ xar(v6, __ T2D, v9, v28, (64 - 20)); // a6 = rol((a9^d4), 20)
4243     __ xar(v9, __ T2D, v22, v26, (64 - 61)); // a9 = rol((a22^d2), 61)
4244     __ xar(v22, __ T2D, v14, v28, (64 - 39)); // a22 = rol((a14^d4), 39)
4245     __ xar(v14, __ T2D, v20, v30, (64 - 18)); // a14 = rol((a20^d0), 18)
4246     __ xar(v31, __ T2D, v2, v26, (64 - 62)); // a20' = rol((a2^d2), 62)
4247     __ xar(v2, __ T2D, v12, v26, (64 - 43)); // a2 = rol((a12^d2), 43)
4248     __ xar(v12, __ T2D, v13, v27, (64 - 25)); // a12 = rol((a13^d3), 25)
4249     __ xar(v13, __ T2D, v19, v28, (64 - 8)); // a13 = rol((a19^d4), 8)
4250     __ xar(v19, __ T2D, v23, v27, (64 - 56)); // a19 = rol((a23^d3), 56)
4251     __ xar(v23, __ T2D, v15, v30, (64 - 41)); // a23 = rol((a15^d0), 41)
4252     __ xar(v15, __ T2D, v4, v28, (64 - 27)); // a15 = rol((a4^d4), 27)
4253     __ xar(v28, __ T2D, v24, v28, (64 - 14)); // a4' = rol((a24^d4), 14)
4254     __ xar(v24, __ T2D, v21, v25, (64 - 2)); // a24 = rol((a21^d1), 2)
4255     __ xar(v8, __ T2D, v8, v27, (64 - 55)); // a21' = rol((a8^d3), 55)
4256     __ xar(v4, __ T2D, v16, v25, (64 - 45)); // a8' = rol((a16^d1), 45)
4257     __ xar(v16, __ T2D, v5, v30, (64 - 36)); // a16 = rol((a5^d0), 36)
4258     __ xar(v5, __ T2D, v3, v27, (64 - 28)); // a5 = rol((a3^d3), 28)
4259     __ xar(v27, __ T2D, v18, v27, (64 - 21)); // a3' = rol((a18^d3), 21)
4260     __ xar(v3, __ T2D, v17, v26, (64 - 15)); // a18' = rol((a17^d2), 15)
4261     __ xar(v25, __ T2D, v11, v25, (64 - 10)); // a17' = rol((a11^d1), 10)
4262     __ xar(v26, __ T2D, v7, v26, (64 - 6)); // a11' = rol((a7^d2), 6)
4263     __ xar(v30, __ T2D, v10, v30, (64 - 3)); // a7' = rol((a10^d0), 3)
4264
4265     __ bcax(v20, __ T16B, v31, v22, v8); // a20 = a20' ^ (~a21 & a22')
4266     __ bcax(v21, __ T16B, v8, v23, v22); // a21 = a21' ^ (~a22 & a23)
4267     __ bcax(v22, __ T16B, v22, v24, v23); // a22 = a22 ^ (~a23 & a24)
4268     __ bcax(v23, __ T16B, v23, v31, v24); // a23 = a23 ^ (~a24 & a20')
4269     __ bcax(v24, __ T16B, v24, v8, v31); // a24 = a24 ^ (~a20' & a21')
4270
4271     __ ld1r(v31, __ T2D, __ post(rscratch1, 8)); // rc = round_constants[i]
4272
4273     __ bcax(v17, __ T16B, v25, v19, v3); // a17 = a17' ^ (~a18' & a19)
4274     __ bcax(v18, __ T16B, v3, v15, v19); // a18 = a18' ^ (~a19 & a15')
4275     __ bcax(v19, __ T16B, v19, v16, v15); // a19 = a19 ^ (~a15 & a16)
4276     __ bcax(v15, __ T16B, v15, v25, v16); // a15 = a15 ^ (~a16 & a17')
4277     __ bcax(v16, __ T16B, v16, v3, v25);
// a16 = a16 ^ (~a17' & a18') 4278 4279 __ bcax(v10, __ T16B, v29, v12, v26); // a10 = a10' ^ (~a11' & a12) 4280 __ bcax(v11, __ T16B, v26, v13, v12); // a11 = a11' ^ (~a12 & a13) 4281 __ bcax(v12, __ T16B, v12, v14, v13); // a12 = a12 ^ (~a13 & a14) 4282 __ bcax(v13, __ T16B, v13, v29, v14); // a13 = a13 ^ (~a14 & a10') 4283 __ bcax(v14, __ T16B, v14, v26, v29); // a14 = a14 ^ (~a10' & a11') 4284 4285 __ bcax(v7, __ T16B, v30, v9, v4); // a7 = a7' ^ (~a8' & a9) 4286 __ bcax(v8, __ T16B, v4, v5, v9); // a8 = a8' ^ (~a9 & a5) 4287 __ bcax(v9, __ T16B, v9, v6, v5); // a9 = a9 ^ (~a5 & a6) 4288 __ bcax(v5, __ T16B, v5, v30, v6); // a5 = a5 ^ (~a6 & a7) 4289 __ bcax(v6, __ T16B, v6, v4, v30); // a6 = a6 ^ (~a7 & a8') 4290 4291 __ bcax(v3, __ T16B, v27, v0, v28); // a3 = a3' ^ (~a4' & a0) 4292 __ bcax(v4, __ T16B, v28, v1, v0); // a4 = a4' ^ (~a0 & a1) 4293 __ bcax(v0, __ T16B, v0, v2, v1); // a0 = a0 ^ (~a1 & a2) 4294 __ bcax(v1, __ T16B, v1, v27, v2); // a1 = a1 ^ (~a2 & a3) 4295 __ bcax(v2, __ T16B, v2, v28, v27); // a2 = a2 ^ (~a3 & a4') 4296 4297 __ eor(v0, __ T16B, v0, v31); // a0 = a0 ^ rc 4298 } 4299 4300 // Arguments: 4301 // 4302 // Inputs: 4303 // c_rarg0 - byte[] source+offset 4304 // c_rarg1 - byte[] SHA.state 4305 // c_rarg2 - int block_size 4306 // c_rarg3 - int offset 4307 // c_rarg4 - int limit 4308 // 4309 address generate_sha3_implCompress(StubId stub_id) { 4310 bool multi_block; 4311 switch (stub_id) { 4312 case StubId::stubgen_sha3_implCompress_id: 4313 multi_block = false; 4314 break; 4315 case StubId::stubgen_sha3_implCompressMB_id: 4316 multi_block = true; 4317 break; 4318 default: 4319 ShouldNotReachHere(); 4320 } 4321 4322 static const uint64_t round_consts[24] = { 4323 0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL, 4324 0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L, 4325 0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL, 4326 0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL, 4327 0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L, 4328 0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L, 4329 0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L, 4330 0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L 4331 }; 4332 4333 __ align(CodeEntryAlignment); 4334 4335 StubCodeMark mark(this, stub_id); 4336 address start = __ pc(); 4337 4338 Register buf = c_rarg0; 4339 Register state = c_rarg1; 4340 Register block_size = c_rarg2; 4341 Register ofs = c_rarg3; 4342 Register limit = c_rarg4; 4343 4344 Label sha3_loop, rounds24_loop; 4345 Label sha3_512_or_sha3_384, shake128; 4346 4347 __ stpd(v8, v9, __ pre(sp, -64)); 4348 __ stpd(v10, v11, Address(sp, 16)); 4349 __ stpd(v12, v13, Address(sp, 32)); 4350 __ stpd(v14, v15, Address(sp, 48)); 4351 4352 // load state 4353 __ add(rscratch1, state, 32); 4354 __ ld1(v0, v1, v2, v3, __ T1D, state); 4355 __ ld1(v4, v5, v6, v7, __ T1D, __ post(rscratch1, 32)); 4356 __ ld1(v8, v9, v10, v11, __ T1D, __ post(rscratch1, 32)); 4357 __ ld1(v12, v13, v14, v15, __ T1D, __ post(rscratch1, 32)); 4358 __ ld1(v16, v17, v18, v19, __ T1D, __ post(rscratch1, 32)); 4359 __ ld1(v20, v21, v22, v23, __ T1D, __ post(rscratch1, 32)); 4360 __ ld1(v24, __ T1D, rscratch1); 4361 4362 __ BIND(sha3_loop); 4363 4364 // 24 keccak rounds 4365 __ movw(rscratch2, 24); 4366 4367 // load round_constants base 4368 __ lea(rscratch1, ExternalAddress((address) round_consts)); 4369 4370 // load input 4371 __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32)); 4372 __ ld1(v29, v30, v31, 
__ T8B, __ post(buf, 24)); 4373 __ eor(v0, __ T8B, v0, v25); 4374 __ eor(v1, __ T8B, v1, v26); 4375 __ eor(v2, __ T8B, v2, v27); 4376 __ eor(v3, __ T8B, v3, v28); 4377 __ eor(v4, __ T8B, v4, v29); 4378 __ eor(v5, __ T8B, v5, v30); 4379 __ eor(v6, __ T8B, v6, v31); 4380 4381 // block_size == 72, SHA3-512; block_size == 104, SHA3-384 4382 __ tbz(block_size, 7, sha3_512_or_sha3_384); 4383 4384 __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32)); 4385 __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24)); 4386 __ eor(v7, __ T8B, v7, v25); 4387 __ eor(v8, __ T8B, v8, v26); 4388 __ eor(v9, __ T8B, v9, v27); 4389 __ eor(v10, __ T8B, v10, v28); 4390 __ eor(v11, __ T8B, v11, v29); 4391 __ eor(v12, __ T8B, v12, v30); 4392 __ eor(v13, __ T8B, v13, v31); 4393 4394 __ ld1(v25, v26, v27, __ T8B, __ post(buf, 24)); 4395 __ eor(v14, __ T8B, v14, v25); 4396 __ eor(v15, __ T8B, v15, v26); 4397 __ eor(v16, __ T8B, v16, v27); 4398 4399 // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256 4400 __ andw(c_rarg5, block_size, 48); 4401 __ cbzw(c_rarg5, rounds24_loop); 4402 4403 __ tbnz(block_size, 5, shake128); 4404 // block_size == 144, bit5 == 0, SHA3-224 4405 __ ldrd(v28, __ post(buf, 8)); 4406 __ eor(v17, __ T8B, v17, v28); 4407 __ b(rounds24_loop); 4408 4409 __ BIND(shake128); 4410 __ ld1(v28, v29, v30, v31, __ T8B, __ post(buf, 32)); 4411 __ eor(v17, __ T8B, v17, v28); 4412 __ eor(v18, __ T8B, v18, v29); 4413 __ eor(v19, __ T8B, v19, v30); 4414 __ eor(v20, __ T8B, v20, v31); 4415 __ b(rounds24_loop); // block_size == 168, SHAKE128 4416 4417 __ BIND(sha3_512_or_sha3_384); 4418 __ ld1(v25, v26, __ T8B, __ post(buf, 16)); 4419 __ eor(v7, __ T8B, v7, v25); 4420 __ eor(v8, __ T8B, v8, v26); 4421 __ tbz(block_size, 5, rounds24_loop); // SHA3-512 4422 4423 // SHA3-384 4424 __ ld1(v27, v28, v29, v30, __ T8B, __ post(buf, 32)); 4425 __ eor(v9, __ T8B, v9, v27); 4426 __ eor(v10, __ T8B, v10, v28); 4427 __ eor(v11, __ T8B, v11, v29); 4428 __ eor(v12, __ T8B, v12, v30); 4429 4430 __ BIND(rounds24_loop); 4431 __ subw(rscratch2, rscratch2, 1); 4432 4433 keccak_round(rscratch1); 4434 4435 __ cbnzw(rscratch2, rounds24_loop); 4436 4437 if (multi_block) { 4438 __ add(ofs, ofs, block_size); 4439 __ cmp(ofs, limit); 4440 __ br(Assembler::LE, sha3_loop); 4441 __ mov(c_rarg0, ofs); // return ofs 4442 } 4443 4444 __ st1(v0, v1, v2, v3, __ T1D, __ post(state, 32)); 4445 __ st1(v4, v5, v6, v7, __ T1D, __ post(state, 32)); 4446 __ st1(v8, v9, v10, v11, __ T1D, __ post(state, 32)); 4447 __ st1(v12, v13, v14, v15, __ T1D, __ post(state, 32)); 4448 __ st1(v16, v17, v18, v19, __ T1D, __ post(state, 32)); 4449 __ st1(v20, v21, v22, v23, __ T1D, __ post(state, 32)); 4450 __ st1(v24, __ T1D, state); 4451 4452 // restore callee-saved registers 4453 __ ldpd(v14, v15, Address(sp, 48)); 4454 __ ldpd(v12, v13, Address(sp, 32)); 4455 __ ldpd(v10, v11, Address(sp, 16)); 4456 __ ldpd(v8, v9, __ post(sp, 64)); 4457 4458 __ ret(lr); 4459 4460 return start; 4461 } 4462 4463 // Inputs: 4464 // c_rarg0 - long[] state0 4465 // c_rarg1 - long[] state1 4466 address generate_double_keccak() { 4467 static const uint64_t round_consts[24] = { 4468 0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL, 4469 0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L, 4470 0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL, 4471 0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL, 4472 0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L, 4473 0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L, 4474 
0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
4475       0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
4476     };
4477
4478     // Implements the double_keccak() method of the
4479     // sun.security.provider.SHA3Parallel class
4480     __ align(CodeEntryAlignment);
4481     StubCodeMark mark(this, "StubRoutines", "double_keccak");
4482     address start = __ pc();
4483     __ enter();
4484
4485     Register state0 = c_rarg0;
4486     Register state1 = c_rarg1;
4487
4488     Label rounds24_loop;
4489
4490     // save callee-saved registers
4491     __ stpd(v8, v9, __ pre(sp, -64));
4492     __ stpd(v10, v11, Address(sp, 16));
4493     __ stpd(v12, v13, Address(sp, 32));
4494     __ stpd(v14, v15, Address(sp, 48));
4495
4496     // load states
4497     __ add(rscratch1, state0, 32);
4498     __ ld4(v0, v1, v2, v3, __ D, 0, state0);
4499     __ ld4(v4, v5, v6, v7, __ D, 0, __ post(rscratch1, 32));
4500     __ ld4(v8, v9, v10, v11, __ D, 0, __ post(rscratch1, 32));
4501     __ ld4(v12, v13, v14, v15, __ D, 0, __ post(rscratch1, 32));
4502     __ ld4(v16, v17, v18, v19, __ D, 0, __ post(rscratch1, 32));
4503     __ ld4(v20, v21, v22, v23, __ D, 0, __ post(rscratch1, 32));
4504     __ ld1(v24, __ D, 0, rscratch1);
4505     __ add(rscratch1, state1, 32);
4506     __ ld4(v0, v1, v2, v3, __ D, 1, state1);
4507     __ ld4(v4, v5, v6, v7, __ D, 1, __ post(rscratch1, 32));
4508     __ ld4(v8, v9, v10, v11, __ D, 1, __ post(rscratch1, 32));
4509     __ ld4(v12, v13, v14, v15, __ D, 1, __ post(rscratch1, 32));
4510     __ ld4(v16, v17, v18, v19, __ D, 1, __ post(rscratch1, 32));
4511     __ ld4(v20, v21, v22, v23, __ D, 1, __ post(rscratch1, 32));
4512     __ ld1(v24, __ D, 1, rscratch1);
4513
4514     // 24 keccak rounds
4515     __ movw(rscratch2, 24);
4516
4517     // load round_constants base
4518     __ lea(rscratch1, ExternalAddress((address) round_consts));
4519
4520     __ BIND(rounds24_loop);
4521     __ subw(rscratch2, rscratch2, 1);
4522     keccak_round(rscratch1);
4523     __ cbnzw(rscratch2, rounds24_loop);
4524
4525     __ st4(v0, v1, v2, v3, __ D, 0, __ post(state0, 32));
4526     __ st4(v4, v5, v6, v7, __ D, 0, __ post(state0, 32));
4527     __ st4(v8, v9, v10, v11, __ D, 0, __ post(state0, 32));
4528     __ st4(v12, v13, v14, v15, __ D, 0, __ post(state0, 32));
4529     __ st4(v16, v17, v18, v19, __ D, 0, __ post(state0, 32));
4530     __ st4(v20, v21, v22, v23, __ D, 0, __ post(state0, 32));
4531     __ st1(v24, __ D, 0, state0);
4532     __ st4(v0, v1, v2, v3, __ D, 1, __ post(state1, 32));
4533     __ st4(v4, v5, v6, v7, __ D, 1, __ post(state1, 32));
4534     __ st4(v8, v9, v10, v11, __ D, 1, __ post(state1, 32));
4535     __ st4(v12, v13, v14, v15, __ D, 1, __ post(state1, 32));
4536     __ st4(v16, v17, v18, v19, __ D, 1, __ post(state1, 32));
4537     __ st4(v20, v21, v22, v23, __ D, 1, __ post(state1, 32));
4538     __ st1(v24, __ D, 1, state1);
4539
4540     // restore callee-saved vector registers
4541     __ ldpd(v14, v15, Address(sp, 48));
4542     __ ldpd(v12, v13, Address(sp, 32));
4543     __ ldpd(v10, v11, Address(sp, 16));
4544     __ ldpd(v8, v9, __ post(sp, 64));
4545
4546     __ leave(); // required for proper stackwalking of RuntimeStub frame
4547     __ mov(r0, zr); // return 0
4548     __ ret(lr);
4549
4550     return start;
4551   }
4552
4553   // ChaCha20 block function. This version parallelizes the 32-bit
4554   // state elements on each of 16 vectors, producing 4 blocks of
4555   // keystream at a time.
4556 // 4557 // state (int[16]) = c_rarg0 4558 // keystream (byte[256]) = c_rarg1 4559 // return - number of bytes of produced keystream (always 256) 4560 // 4561 // This implementation takes each 32-bit integer from the state 4562 // array and broadcasts it across all 4 32-bit lanes of a vector register 4563 // (e.g. state[0] is replicated on all 4 lanes of v4, state[1] to all 4 lanes 4564 // of v5, etc.). Once all 16 elements have been broadcast onto 16 vectors, 4565 // the quarter round schedule is implemented as outlined in RFC 7539 section 4566 // 2.3. However, instead of sequentially processing the 3 quarter round 4567 // operations represented by one QUARTERROUND function, we instead stack all 4568 // the adds, xors and left-rotations from the first 4 quarter rounds together 4569 // and then do the same for the second set of 4 quarter rounds. This removes 4570 // some latency that would otherwise be incurred by waiting for an add to 4571 // complete before performing an xor (which depends on the result of the 4572 // add), etc. An adjustment happens between the first and second groups of 4 4573 // quarter rounds, but this is done only in the inputs to the macro functions 4574 // that generate the assembly instructions - these adjustments themselves are 4575 // not part of the resulting assembly. 4576 // The 4 registers v0-v3 are used during the quarter round operations as 4577 // scratch registers. Once the 20 rounds are complete, these 4 scratch 4578 // registers become the vectors involved in adding the start state back onto 4579 // the post-QR working state. After the adds are complete, each of the 16 4580 // vectors write their first lane back to the keystream buffer, followed 4581 // by the second lane from all vectors and so on. 4582 address generate_chacha20Block_blockpar() { 4583 Label L_twoRounds, L_cc20_const; 4584 // The constant data is broken into two 128-bit segments to be loaded 4585 // onto FloatRegisters. The first 128 bits are a counter add overlay 4586 // that adds +0/+1/+2/+3 to the vector holding replicated state[12]. 4587 // The second 128-bits is a table constant used for 8-bit left rotations. 4588 __ BIND(L_cc20_const); 4589 __ emit_int64(0x0000000100000000UL); 4590 __ emit_int64(0x0000000300000002UL); 4591 __ emit_int64(0x0605040702010003UL); 4592 __ emit_int64(0x0E0D0C0F0A09080BUL); 4593 4594 __ align(CodeEntryAlignment); 4595 StubId stub_id = StubId::stubgen_chacha20Block_id; 4596 StubCodeMark mark(this, stub_id); 4597 address start = __ pc(); 4598 __ enter(); 4599 4600 int i, j; 4601 const Register state = c_rarg0; 4602 const Register keystream = c_rarg1; 4603 const Register loopCtr = r10; 4604 const Register tmpAddr = r11; 4605 const FloatRegister ctrAddOverlay = v28; 4606 const FloatRegister lrot8Tbl = v29; 4607 4608 // Organize SIMD registers in an array that facilitates 4609 // putting repetitive opcodes into loop structures. It is 4610 // important that each grouping of 4 registers is monotonically 4611 // increasing to support the requirements of multi-register 4612 // instructions (e.g. ld4r, st4, etc.) 4613 const FloatRegister workSt[16] = { 4614 v4, v5, v6, v7, v16, v17, v18, v19, 4615 v20, v21, v22, v23, v24, v25, v26, v27 4616 }; 4617 4618 // Pull in constant data. The first 16 bytes are the add overlay 4619 // which is applied to the vector holding the counter (state[12]). 4620 // The second 16 bytes is the index register for the 8-bit left 4621 // rotation tbl instruction. 
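    // For reference, the 16 table bytes loaded below are the tbl indices
    //   { 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 }
    // so within every 32-bit lane the byte at position (i + 3) % 4 moves to
    // position i, which on a little-endian lane is exactly a left rotation
    // by 8 bits, i.e. the "d <<<= 8" step of the quarter round.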
4622 __ adr(tmpAddr, L_cc20_const); 4623 __ ldpq(ctrAddOverlay, lrot8Tbl, Address(tmpAddr)); 4624 4625 // Load from memory and interlace across 16 SIMD registers, 4626 // With each word from memory being broadcast to all lanes of 4627 // each successive SIMD register. 4628 // Addr(0) -> All lanes in workSt[i] 4629 // Addr(4) -> All lanes workSt[i + 1], etc. 4630 __ mov(tmpAddr, state); 4631 for (i = 0; i < 16; i += 4) { 4632 __ ld4r(workSt[i], workSt[i + 1], workSt[i + 2], workSt[i + 3], __ T4S, 4633 __ post(tmpAddr, 16)); 4634 } 4635 __ addv(workSt[12], __ T4S, workSt[12], ctrAddOverlay); // Add ctr overlay 4636 4637 // Before entering the loop, create 5 4-register arrays. These 4638 // will hold the 4 registers that represent the a/b/c/d fields 4639 // in the quarter round operation. For instance the "b" field 4640 // for the first 4 quarter round operations is the set of v16/v17/v18/v19, 4641 // but in the second 4 quarter rounds it gets adjusted to v17/v18/v19/v16 4642 // since it is part of a diagonal organization. The aSet and scratch 4643 // register sets are defined at declaration time because they do not change 4644 // organization at any point during the 20-round processing. 4645 FloatRegister aSet[4] = { v4, v5, v6, v7 }; 4646 FloatRegister bSet[4]; 4647 FloatRegister cSet[4]; 4648 FloatRegister dSet[4]; 4649 FloatRegister scratch[4] = { v0, v1, v2, v3 }; 4650 4651 // Set up the 10 iteration loop and perform all 8 quarter round ops 4652 __ mov(loopCtr, 10); 4653 __ BIND(L_twoRounds); 4654 4655 // Set to columnar organization and do the following 4 quarter-rounds: 4656 // QUARTERROUND(0, 4, 8, 12) 4657 // QUARTERROUND(1, 5, 9, 13) 4658 // QUARTERROUND(2, 6, 10, 14) 4659 // QUARTERROUND(3, 7, 11, 15) 4660 __ cc20_set_qr_registers(bSet, workSt, 4, 5, 6, 7); 4661 __ cc20_set_qr_registers(cSet, workSt, 8, 9, 10, 11); 4662 __ cc20_set_qr_registers(dSet, workSt, 12, 13, 14, 15); 4663 4664 __ cc20_qr_add4(aSet, bSet); // a += b 4665 __ cc20_qr_xor4(dSet, aSet, dSet); // d ^= a 4666 __ cc20_qr_lrot4(dSet, dSet, 16, lrot8Tbl); // d <<<= 16 4667 4668 __ cc20_qr_add4(cSet, dSet); // c += d 4669 __ cc20_qr_xor4(bSet, cSet, scratch); // b ^= c (scratch) 4670 __ cc20_qr_lrot4(scratch, bSet, 12, lrot8Tbl); // b <<<= 12 4671 4672 __ cc20_qr_add4(aSet, bSet); // a += b 4673 __ cc20_qr_xor4(dSet, aSet, dSet); // d ^= a 4674 __ cc20_qr_lrot4(dSet, dSet, 8, lrot8Tbl); // d <<<= 8 4675 4676 __ cc20_qr_add4(cSet, dSet); // c += d 4677 __ cc20_qr_xor4(bSet, cSet, scratch); // b ^= c (scratch) 4678 __ cc20_qr_lrot4(scratch, bSet, 7, lrot8Tbl); // b <<<= 12 4679 4680 // Set to diagonal organization and do the next 4 quarter-rounds: 4681 // QUARTERROUND(0, 5, 10, 15) 4682 // QUARTERROUND(1, 6, 11, 12) 4683 // QUARTERROUND(2, 7, 8, 13) 4684 // QUARTERROUND(3, 4, 9, 14) 4685 __ cc20_set_qr_registers(bSet, workSt, 5, 6, 7, 4); 4686 __ cc20_set_qr_registers(cSet, workSt, 10, 11, 8, 9); 4687 __ cc20_set_qr_registers(dSet, workSt, 15, 12, 13, 14); 4688 4689 __ cc20_qr_add4(aSet, bSet); // a += b 4690 __ cc20_qr_xor4(dSet, aSet, dSet); // d ^= a 4691 __ cc20_qr_lrot4(dSet, dSet, 16, lrot8Tbl); // d <<<= 16 4692 4693 __ cc20_qr_add4(cSet, dSet); // c += d 4694 __ cc20_qr_xor4(bSet, cSet, scratch); // b ^= c (scratch) 4695 __ cc20_qr_lrot4(scratch, bSet, 12, lrot8Tbl); // b <<<= 12 4696 4697 __ cc20_qr_add4(aSet, bSet); // a += b 4698 __ cc20_qr_xor4(dSet, aSet, dSet); // d ^= a 4699 __ cc20_qr_lrot4(dSet, dSet, 8, lrot8Tbl); // d <<<= 8 4700 4701 __ cc20_qr_add4(cSet, dSet); // c += d 4702 __ cc20_qr_xor4(bSet, 
cSet, scratch); // b ^= c (scratch) 4703 __ cc20_qr_lrot4(scratch, bSet, 7, lrot8Tbl); // b <<<= 12 4704 4705 // Decrement and iterate 4706 __ sub(loopCtr, loopCtr, 1); 4707 __ cbnz(loopCtr, L_twoRounds); 4708 4709 __ mov(tmpAddr, state); 4710 4711 // Add the starting state back to the post-loop keystream 4712 // state. We read/interlace the state array from memory into 4713 // 4 registers similar to what we did in the beginning. Then 4714 // add the counter overlay onto workSt[12] at the end. 4715 for (i = 0; i < 16; i += 4) { 4716 __ ld4r(v0, v1, v2, v3, __ T4S, __ post(tmpAddr, 16)); 4717 __ addv(workSt[i], __ T4S, workSt[i], v0); 4718 __ addv(workSt[i + 1], __ T4S, workSt[i + 1], v1); 4719 __ addv(workSt[i + 2], __ T4S, workSt[i + 2], v2); 4720 __ addv(workSt[i + 3], __ T4S, workSt[i + 3], v3); 4721 } 4722 __ addv(workSt[12], __ T4S, workSt[12], ctrAddOverlay); // Add ctr overlay 4723 4724 // Write working state into the keystream buffer. This is accomplished 4725 // by taking the lane "i" from each of the four vectors and writing 4726 // it to consecutive 4-byte offsets, then post-incrementing by 16 and 4727 // repeating with the next 4 vectors until all 16 vectors have been used. 4728 // Then move to the next lane and repeat the process until all lanes have 4729 // been written. 4730 for (i = 0; i < 4; i++) { 4731 for (j = 0; j < 16; j += 4) { 4732 __ st4(workSt[j], workSt[j + 1], workSt[j + 2], workSt[j + 3], __ S, i, 4733 __ post(keystream, 16)); 4734 } 4735 } 4736 4737 __ mov(r0, 256); // Return length of output keystream 4738 __ leave(); 4739 __ ret(lr); 4740 4741 return start; 4742 } 4743 4744 // Helpers to schedule parallel operation bundles across vector 4745 // register sequences of size 2, 4 or 8. 4746 4747 // Implement various primitive computations across vector sequences 4748 4749 template<int N> 4750 void vs_addv(const VSeq<N>& v, Assembler::SIMD_Arrangement T, 4751 const VSeq<N>& v1, const VSeq<N>& v2) { 4752 // output must not be constant 4753 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4754 // output cannot overwrite pending inputs 4755 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4756 assert(!vs_write_before_read(v, v2), "output overwrites input"); 4757 for (int i = 0; i < N; i++) { 4758 __ addv(v[i], T, v1[i], v2[i]); 4759 } 4760 } 4761 4762 template<int N> 4763 void vs_subv(const VSeq<N>& v, Assembler::SIMD_Arrangement T, 4764 const VSeq<N>& v1, const VSeq<N>& v2) { 4765 // output must not be constant 4766 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4767 // output cannot overwrite pending inputs 4768 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4769 assert(!vs_write_before_read(v, v2), "output overwrites input"); 4770 for (int i = 0; i < N; i++) { 4771 __ subv(v[i], T, v1[i], v2[i]); 4772 } 4773 } 4774 4775 template<int N> 4776 void vs_mulv(const VSeq<N>& v, Assembler::SIMD_Arrangement T, 4777 const VSeq<N>& v1, const VSeq<N>& v2) { 4778 // output must not be constant 4779 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4780 // output cannot overwrite pending inputs 4781 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4782 assert(!vs_write_before_read(v, v2), "output overwrites input"); 4783 for (int i = 0; i < N; i++) { 4784 __ mulv(v[i], T, v1[i], v2[i]); 4785 } 4786 } 4787 4788 template<int N> 4789 void vs_negr(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& 
v1) { 4790 // output must not be constant 4791 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4792 // output cannot overwrite pending inputs 4793 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4794 for (int i = 0; i < N; i++) { 4795 __ negr(v[i], T, v1[i]); 4796 } 4797 } 4798 4799 template<int N> 4800 void vs_sshr(const VSeq<N>& v, Assembler::SIMD_Arrangement T, 4801 const VSeq<N>& v1, int shift) { 4802 // output must not be constant 4803 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4804 // output cannot overwrite pending inputs 4805 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4806 for (int i = 0; i < N; i++) { 4807 __ sshr(v[i], T, v1[i], shift); 4808 } 4809 } 4810 4811 template<int N> 4812 void vs_andr(const VSeq<N>& v, const VSeq<N>& v1, const VSeq<N>& v2) { 4813 // output must not be constant 4814 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4815 // output cannot overwrite pending inputs 4816 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4817 assert(!vs_write_before_read(v, v2), "output overwrites input"); 4818 for (int i = 0; i < N; i++) { 4819 __ andr(v[i], __ T16B, v1[i], v2[i]); 4820 } 4821 } 4822 4823 template<int N> 4824 void vs_orr(const VSeq<N>& v, const VSeq<N>& v1, const VSeq<N>& v2) { 4825 // output must not be constant 4826 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4827 // output cannot overwrite pending inputs 4828 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4829 assert(!vs_write_before_read(v, v2), "output overwrites input"); 4830 for (int i = 0; i < N; i++) { 4831 __ orr(v[i], __ T16B, v1[i], v2[i]); 4832 } 4833 } 4834 4835 template<int N> 4836 void vs_notr(const VSeq<N>& v, const VSeq<N>& v1) { 4837 // output must not be constant 4838 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4839 // output cannot overwrite pending inputs 4840 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4841 for (int i = 0; i < N; i++) { 4842 __ notr(v[i], __ T16B, v1[i]); 4843 } 4844 } 4845 4846 template<int N> 4847 void vs_sqdmulh(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1, const VSeq<N>& v2) { 4848 // output must not be constant 4849 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4850 // output cannot overwrite pending inputs 4851 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4852 assert(!vs_write_before_read(v, v2), "output overwrites input"); 4853 for (int i = 0; i < N; i++) { 4854 __ sqdmulh(v[i], T, v1[i], v2[i]); 4855 } 4856 } 4857 4858 template<int N> 4859 void vs_mlsv(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1, VSeq<N>& v2) { 4860 // output must not be constant 4861 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4862 // output cannot overwrite pending inputs 4863 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4864 assert(!vs_write_before_read(v, v2), "output overwrites input"); 4865 for (int i = 0; i < N; i++) { 4866 __ mlsv(v[i], T, v1[i], v2[i]); 4867 } 4868 } 4869 4870 // load N/2 successive pairs of quadword values from memory in order 4871 // into N successive vector registers of the sequence via the 4872 // address supplied in base. 
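  // For example, applied to a two-register sequence starting at v30 this
  // emits a single  __ ldpq(v30, v31, Address(base, 0));  longer even-length
  // sequences repeat the pattern for each successive register pair.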
4873 template<int N> 4874 void vs_ldpq(const VSeq<N>& v, Register base) { 4875 for (int i = 0; i < N; i += 2) { 4876 __ ldpq(v[i], v[i+1], Address(base, 32 * i)); 4877 } 4878 } 4879 4880 // load N/2 successive pairs of quadword values from memory in order 4881 // into N vector registers of the sequence via the address supplied 4882 // in base using post-increment addressing 4883 template<int N> 4884 void vs_ldpq_post(const VSeq<N>& v, Register base) { 4885 static_assert((N & (N - 1)) == 0, "sequence length must be even"); 4886 for (int i = 0; i < N; i += 2) { 4887 __ ldpq(v[i], v[i+1], __ post(base, 32)); 4888 } 4889 } 4890 4891 // store N successive vector registers of the sequence into N/2 4892 // successive pairs of quadword memory locations via the address 4893 // supplied in base using post-increment addressing 4894 template<int N> 4895 void vs_stpq_post(const VSeq<N>& v, Register base) { 4896 static_assert((N & (N - 1)) == 0, "sequence length must be even"); 4897 for (int i = 0; i < N; i += 2) { 4898 __ stpq(v[i], v[i+1], __ post(base, 32)); 4899 } 4900 } 4901 4902 // load N/2 pairs of quadword values from memory de-interleaved into 4903 // N vector registers 2 at a time via the address supplied in base 4904 // using post-increment addressing. 4905 template<int N> 4906 void vs_ld2_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) { 4907 static_assert((N & (N - 1)) == 0, "sequence length must be even"); 4908 for (int i = 0; i < N; i += 2) { 4909 __ ld2(v[i], v[i+1], T, __ post(base, 32)); 4910 } 4911 } 4912 4913 // store N vector registers interleaved into N/2 pairs of quadword 4914 // memory locations via the address supplied in base using 4915 // post-increment addressing. 4916 template<int N> 4917 void vs_st2_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) { 4918 static_assert((N & (N - 1)) == 0, "sequence length must be even"); 4919 for (int i = 0; i < N; i += 2) { 4920 __ st2(v[i], v[i+1], T, __ post(base, 32)); 4921 } 4922 } 4923 4924 // load N quadword values from memory de-interleaved into N vector 4925 // registers 3 elements at a time via the address supplied in base. 4926 template<int N> 4927 void vs_ld3(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) { 4928 static_assert(N == ((N / 3) * 3), "sequence length must be multiple of 3"); 4929 for (int i = 0; i < N; i += 3) { 4930 __ ld3(v[i], v[i+1], v[i+2], T, base); 4931 } 4932 } 4933 4934 // load N quadword values from memory de-interleaved into N vector 4935 // registers 3 elements at a time via the address supplied in base 4936 // using post-increment addressing. 
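  // For example, a three-register sequence starting at v0 with T = __ T4S
  // produces a single  __ ld3(v0, v1, v2, __ T4S, __ post(base, 48)),
  // advancing base past the 48 bytes just consumed.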
4937 template<int N>
4938 void vs_ld3_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
4939 static_assert(N == ((N / 3) * 3), "sequence length must be multiple of 3");
4940 for (int i = 0; i < N; i += 3) {
4941 __ ld3(v[i], v[i+1], v[i+2], T, __ post(base, 48));
4942 }
4943 }
4944
4945 // load N/2 pairs of quadword values from memory into N vector
4946 // registers via the address supplied in base with each pair indexed
4947 // using the start offset plus the corresponding entry in the
4948 // offsets array
4949 template<int N>
4950 void vs_ldpq_indexed(const VSeq<N>& v, Register base, int start, int (&offsets)[N/2]) {
4951 for (int i = 0; i < N/2; i++) {
4952 __ ldpq(v[2*i], v[2*i+1], Address(base, start + offsets[i]));
4953 }
4954 }
4955
4956 // store N vector registers into N/2 pairs of quadword memory
4957 // locations via the address supplied in base with each pair indexed
4958 // using the start offset plus the corresponding entry in the
4959 // offsets array
4960 template<int N>
4961 void vs_stpq_indexed(const VSeq<N>& v, Register base, int start, int offsets[N/2]) {
4962 for (int i = 0; i < N/2; i++) {
4963 __ stpq(v[2*i], v[2*i+1], Address(base, start + offsets[i]));
4964 }
4965 }
4966
4967 // load N single quadword values from memory into N vector registers
4968 // via the address supplied in base with each value indexed using
4969 // the start offset plus the corresponding entry in the offsets
4970 // array
4971 template<int N>
4972 void vs_ldr_indexed(const VSeq<N>& v, Assembler::SIMD_RegVariant T, Register base,
4973 int start, int (&offsets)[N]) {
4974 for (int i = 0; i < N; i++) {
4975 __ ldr(v[i], T, Address(base, start + offsets[i]));
4976 }
4977 }
4978
4979 // store N vector registers into N single quadword memory locations
4980 // via the address supplied in base with each value indexed using
4981 // the start offset plus the corresponding entry in the offsets
4982 // array
4983 template<int N>
4984 void vs_str_indexed(const VSeq<N>& v, Assembler::SIMD_RegVariant T, Register base,
4985 int start, int (&offsets)[N]) {
4986 for (int i = 0; i < N; i++) {
4987 __ str(v[i], T, Address(base, start + offsets[i]));
4988 }
4989 }
4990
4991 // load N/2 pairs of quadword values from memory de-interleaved into
4992 // N vector registers 2 at a time via the address supplied in base
4993 // with each pair indexed using the start offset plus the
4994 // corresponding entry in the offsets array
4995 template<int N>
4996 void vs_ld2_indexed(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base,
4997 Register tmp, int start, int (&offsets)[N/2]) {
4998 for (int i = 0; i < N/2; i++) {
4999 __ add(tmp, base, start + offsets[i]);
5000 __ ld2(v[2*i], v[2*i+1], T, tmp);
5001 }
5002 }
5003
5004 // store N vector registers 2 at a time interleaved into N/2 pairs
5005 // of quadword memory locations via the address supplied in base
5006 // with each pair indexed using the start offset plus the
5007 // corresponding entry in the offsets array
5008 template<int N>
5009 void vs_st2_indexed(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base,
5010 Register tmp, int start, int (&offsets)[N/2]) {
5011 for (int i = 0; i < N/2; i++) {
5012 __ add(tmp, base, start + offsets[i]);
5013 __ st2(v[2*i], v[2*i+1], T, tmp);
5014 }
5015 }
5016
5017 // Helper routines for various flavours of Montgomery multiply
5018
5019 // Perform 16 32-bit (4x4S) or 32 16-bit (4 x 8H) Montgomery
5020 // multiplications in parallel
5021 //
5022
5023 // See the
montMul() method of the sun.security.provider.ML_DSA
5024 // class.
5025 //
5026 // Computes 4x4S results or 4x8H results
5027 // a = b * c * 2^-MONT_R_BITS mod MONT_Q
5028 // Inputs: vb, vc - 4x4S or 4x8H vector register sequences
5029 // vq - 2x4S or 2x8H constants <MONT_Q, MONT_Q_INV_MOD_R>
5030 // Temps: vtmp - 4x4S or 4x8H vector sequence trashed after call
5031 // Outputs: va - 4x4S or 4x8H vector register sequences
5032 // vb, vc, vtmp and vq must all be disjoint
5033 // va must be disjoint from all other inputs/temps or must equal vc
5034 // va must have a non-zero delta i.e. it must not be a constant vseq.
5035 // n.b. MONT_R_BITS is 16 or 32, so the right shift by it is implicit.
5036 void vs_montmul4(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
5037 Assembler::SIMD_Arrangement T,
5038 const VSeq<4>& vtmp, const VSeq<2>& vq) {
5039 assert (T == __ T4S || T == __ T8H, "invalid arrangement for montmul");
5040 assert(vs_disjoint(vb, vc), "vb and vc overlap");
5041 assert(vs_disjoint(vb, vq), "vb and vq overlap");
5042 assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
5043
5044 assert(vs_disjoint(vc, vq), "vc and vq overlap");
5045 assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
5046
5047 assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
5048
5049 assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
5050 assert(vs_disjoint(va, vb), "va and vb overlap");
5051 assert(vs_disjoint(va, vq), "va and vq overlap");
5052 assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
5053 assert(!va.is_constant(), "output vector must identify 4 different registers");
5054
5055 // schedule 4 streams of instructions across the vector sequences
5056 for (int i = 0; i < 4; i++) {
5057 __ sqdmulh(vtmp[i], T, vb[i], vc[i]); // aHigh = hi32(2 * b * c)
5058 __ mulv(va[i], T, vb[i], vc[i]); // aLow = lo32(b * c)
5059 }
5060
5061 for (int i = 0; i < 4; i++) {
5062 __ mulv(va[i], T, va[i], vq[0]); // m = aLow * qinv
5063 }
5064
5065 for (int i = 0; i < 4; i++) {
5066 __ sqdmulh(va[i], T, va[i], vq[1]); // n = hi32(2 * m * q)
5067 }
5068
5069 for (int i = 0; i < 4; i++) {
5070 __ shsubv(va[i], T, vtmp[i], va[i]); // a = (aHigh - n) / 2
5071 }
5072 }
5073
5074 // Perform 8 32-bit (2x4S) or 16 16-bit (2 x 8H) Montgomery
5075 // multiplications in parallel
5076 //
5077
5078 // See the montMul() method of the sun.security.provider.ML_DSA
5079 // class.
5080 //
5081 // Computes 2x4S results or 2x8H results
5082 // a = b * c * 2^-MONT_R_BITS mod MONT_Q
5083 // Inputs: vb, vc - 2x4S or 2x8H vector register sequences
5084 // vq - 2x4S or 2x8H constants <MONT_Q, MONT_Q_INV_MOD_R>
5085 // Temps: vtmp - 2x4S or 2x8H vector sequence trashed after call
5086 // Outputs: va - 2x4S or 2x8H vector register sequences
5087 // vb, vc, vtmp and vq must all be disjoint
5088 // va must be disjoint from all other inputs/temps or must equal vc
5089 // va must have a non-zero delta i.e. it must not be a constant vseq.
5090 // n.b. MONT_R_BITS is 16 or 32, so the right shift by it is implicit.
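// (Illustrative scalar sketch, not emitted code: per lane the steps
// below compute
//   aHigh = hi(2 * b * c);  aLow = lo(b * c);
//   m = lo(aLow * q_inv);   n = hi(2 * m * q);
//   a = (aHigh - n) / 2;
// where hi/lo take the top/bottom MONT_R_BITS of a product, so a is
// congruent to b * c * 2^-MONT_R_BITS mod MONT_Q.)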
5091 void vs_montmul2(const VSeq<2>& va, const VSeq<2>& vb, const VSeq<2>& vc, 5092 Assembler::SIMD_Arrangement T, 5093 const VSeq<2>& vtmp, const VSeq<2>& vq) { 5094 assert (T == __ T4S || T == __ T8H, "invalid arrangement for montmul"); 5095 assert(vs_disjoint(vb, vc), "vb and vc overlap"); 5096 assert(vs_disjoint(vb, vq), "vb and vq overlap"); 5097 assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap"); 5098 5099 assert(vs_disjoint(vc, vq), "vc and vq overlap"); 5100 assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap"); 5101 5102 assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap"); 5103 5104 assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal"); 5105 assert(vs_disjoint(va, vb), "va and vb overlap"); 5106 assert(vs_disjoint(va, vq), "va and vq overlap"); 5107 assert(vs_disjoint(va, vtmp), "va and vtmp overlap"); 5108 assert(!va.is_constant(), "output vector must identify 2 different registers"); 5109 5110 // schedule 2 streams of instructions across the vector sequences 5111 for (int i = 0; i < 2; i++) { 5112 __ sqdmulh(vtmp[i], T, vb[i], vc[i]); // aHigh = hi32(2 * b * c) 5113 __ mulv(va[i], T, vb[i], vc[i]); // aLow = lo32(b * c) 5114 } 5115 5116 for (int i = 0; i < 2; i++) { 5117 __ mulv(va[i], T, va[i], vq[0]); // m = aLow * qinv 5118 } 5119 5120 for (int i = 0; i < 2; i++) { 5121 __ sqdmulh(va[i], T, va[i], vq[1]); // n = hi32(2 * m * q) 5122 } 5123 5124 for (int i = 0; i < 2; i++) { 5125 __ shsubv(va[i], T, vtmp[i], va[i]); // a = (aHigh - n) / 2 5126 } 5127 } 5128 5129 // Perform 16 16-bit Montgomery multiplications in parallel. 5130 void kyber_montmul16(const VSeq<2>& va, const VSeq<2>& vb, const VSeq<2>& vc, 5131 const VSeq<2>& vtmp, const VSeq<2>& vq) { 5132 // Use the helper routine to schedule a 2x8H Montgomery multiply. 5133 // It will assert that the register use is valid 5134 vs_montmul2(va, vb, vc, __ T8H, vtmp, vq); 5135 } 5136 5137 // Perform 32 16-bit Montgomery multiplications in parallel. 5138 void kyber_montmul32(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc, 5139 const VSeq<4>& vtmp, const VSeq<2>& vq) { 5140 // Use the helper routine to schedule a 4x8H Montgomery multiply. 5141 // It will assert that the register use is valid 5142 vs_montmul4(va, vb, vc, __ T8H, vtmp, vq); 5143 } 5144 5145 // Perform 64 16-bit Montgomery multiplications in parallel. 5146 void kyber_montmul64(const VSeq<8>& va, const VSeq<8>& vb, const VSeq<8>& vc, 5147 const VSeq<4>& vtmp, const VSeq<2>& vq) { 5148 // Schedule two successive 4x8H multiplies via the montmul helper 5149 // on the front and back halves of va, vb and vc. The helper will 5150 // assert that the register use has no overlap conflicts on each 5151 // individual call but we also need to ensure that the necessary 5152 // disjoint/equality constraints are met across both calls. 5153 5154 // vb, vc, vtmp and vq must be disjoint. 
va must either be
5155 // disjoint from all other registers or equal vc
5156
5157 assert(vs_disjoint(vb, vc), "vb and vc overlap");
5158 assert(vs_disjoint(vb, vq), "vb and vq overlap");
5159 assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
5160
5161 assert(vs_disjoint(vc, vq), "vc and vq overlap");
5162 assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
5163
5164 assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
5165
5166 assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
5167 assert(vs_disjoint(va, vb), "va and vb overlap");
5168 assert(vs_disjoint(va, vq), "va and vq overlap");
5169 assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
5170
5171 // we multiply the front and back halves of each sequence 4 at a
5172 // time because
5173 //
5174 // 1) we are currently only able to get 4-way instruction
5175 // parallelism at best
5176 //
5177 // 2) we need registers for the constants in vq and temporary
5178 // scratch registers to hold intermediate results so vtmp can only
5179 // be a VSeq<4> which means we only have 4 scratch slots
5180
5181 vs_montmul4(vs_front(va), vs_front(vb), vs_front(vc), __ T8H, vtmp, vq);
5182 vs_montmul4(vs_back(va), vs_back(vb), vs_back(vc), __ T8H, vtmp, vq);
5183 }
5184
5185 void kyber_montmul32_sub_add(const VSeq<4>& va0, const VSeq<4>& va1,
5186 const VSeq<4>& vc,
5187 const VSeq<4>& vtmp,
5188 const VSeq<2>& vq) {
5189 // compute a = montmul(a1, c)
5190 kyber_montmul32(vc, va1, vc, vtmp, vq);
5191 // output a1 = a0 - a
5192 vs_subv(va1, __ T8H, va0, vc);
5193 // and a0 = a0 + a
5194 vs_addv(va0, __ T8H, va0, vc);
5195 }
5196
5197 void kyber_sub_add_montmul32(const VSeq<4>& va0, const VSeq<4>& va1,
5198 const VSeq<4>& vb,
5199 const VSeq<4>& vtmp1,
5200 const VSeq<4>& vtmp2,
5201 const VSeq<2>& vq) {
5202 // compute c = a0 - a1
5203 vs_subv(vtmp1, __ T8H, va0, va1);
5204 // output a0 = a0 + a1
5205 vs_addv(va0, __ T8H, va0, va1);
5206 // output a1 = b montmul c
5207 kyber_montmul32(va1, vtmp1, vb, vtmp2, vq);
5208 }
5209
5210 void load64shorts(const VSeq<8>& v, Register shorts) {
5211 vs_ldpq_post(v, shorts);
5212 }
5213
5214 void load32shorts(const VSeq<4>& v, Register shorts) {
5215 vs_ldpq_post(v, shorts);
5216 }
5217
5218 void store64shorts(VSeq<8> v, Register tmpAddr) {
5219 vs_stpq_post(v, tmpAddr);
5220 }
5221
5222 // Kyber NTT function.
5223 // Implements
5224 // static int implKyberNtt(short[] poly, short[] ntt_zetas) {}
5225 //
5226 // coeffs (short[256]) = c_rarg0
5227 // ntt_zetas (short[256]) = c_rarg1
5228 address generate_kyberNtt() {
5229
5230 __ align(CodeEntryAlignment);
5231 StubId stub_id = StubId::stubgen_kyberNtt_id;
5232 StubCodeMark mark(this, stub_id);
5233 address start = __ pc();
5234 __ enter();
5235
5236 const Register coeffs = c_rarg0;
5237 const Register zetas = c_rarg1;
5238
5239 const Register kyberConsts = r10;
5240 const Register tmpAddr = r11;
5241
5242 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x8H inputs/outputs
5243 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
5244 VSeq<2> vq(30); // n.b. constants overlap vs3
5245
5246 __ lea(kyberConsts, ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
5247 // load the montmul constants
5248 vs_ldpq(vq, kyberConsts);
5249
5250 // Each level corresponds to an iteration of the outermost loop of the
5251 // Java method seilerNTT(int[] coeffs). There are some differences
5252 // from what is done in the seilerNTT() method, though:
5253 // 1.
The computation uses 16-bit signed values; we do not convert them
5254 // to ints here.
5255 // 2. The zetas are delivered in a bigger array: 128 zetas are stored in
5256 // this array for each level, which makes it easier to fill up the vector
5257 // registers.
5258 // 3. In the seilerNTT() method we use R = 2^20 for the Montgomery
5259 // multiplications (this is because that way there should not be any
5260 // overflow during the inverse NTT computation), whereas here we use R = 2^16 so
5261 // that we can use 16-bit arithmetic in the vector unit.
5262 //
5263 // On each level, we fill up the vector registers in such a way that the
5264 // array elements that need to be multiplied by the zetas go into one
5265 // set of vector registers while the corresponding ones that don't need to
5266 // be multiplied go into another set.
5267 // We can do 32 Montgomery multiplications in parallel, using 12 vector
5268 // registers interleaving the steps of 4 identical computations,
5269 // each done on 8 16-bit values per register.
5270
5271 // At levels 0-3 the coefficients multiplied by or added/subtracted
5272 // to the zetas occur in discrete blocks whose size is some multiple
5273 // of 32.
5274
5275 // level 0
5276 __ add(tmpAddr, coeffs, 256);
5277 load64shorts(vs1, tmpAddr);
5278 load64shorts(vs2, zetas);
5279 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5280 __ add(tmpAddr, coeffs, 0);
5281 load64shorts(vs1, tmpAddr);
5282 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5283 vs_addv(vs1, __ T8H, vs1, vs2);
5284 __ add(tmpAddr, coeffs, 0);
5285 vs_stpq_post(vs1, tmpAddr);
5286 __ add(tmpAddr, coeffs, 256);
5287 vs_stpq_post(vs3, tmpAddr);
5288 // restore montmul constants
5289 vs_ldpq(vq, kyberConsts);
5290 load64shorts(vs1, tmpAddr);
5291 load64shorts(vs2, zetas);
5292 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5293 __ add(tmpAddr, coeffs, 128);
5294 load64shorts(vs1, tmpAddr);
5295 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5296 vs_addv(vs1, __ T8H, vs1, vs2);
5297 __ add(tmpAddr, coeffs, 128);
5298 store64shorts(vs1, tmpAddr);
5299 __ add(tmpAddr, coeffs, 384);
5300 store64shorts(vs3, tmpAddr);
5301
5302 // level 1
5303 // restore montmul constants
5304 vs_ldpq(vq, kyberConsts);
5305 __ add(tmpAddr, coeffs, 128);
5306 load64shorts(vs1, tmpAddr);
5307 load64shorts(vs2, zetas);
5308 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5309 __ add(tmpAddr, coeffs, 0);
5310 load64shorts(vs1, tmpAddr);
5311 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5312 vs_addv(vs1, __ T8H, vs1, vs2);
5313 __ add(tmpAddr, coeffs, 0);
5314 store64shorts(vs1, tmpAddr);
5315 store64shorts(vs3, tmpAddr);
5316 vs_ldpq(vq, kyberConsts);
5317 __ add(tmpAddr, coeffs, 384);
5318 load64shorts(vs1, tmpAddr);
5319 load64shorts(vs2, zetas);
5320 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5321 __ add(tmpAddr, coeffs, 256);
5322 load64shorts(vs1, tmpAddr);
5323 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5324 vs_addv(vs1, __ T8H, vs1, vs2);
5325 __ add(tmpAddr, coeffs, 256);
5326 store64shorts(vs1, tmpAddr);
5327 store64shorts(vs3, tmpAddr);
5328
5329 // level 2
5330 vs_ldpq(vq, kyberConsts);
5331 int offsets1[4] = { 0, 32, 128, 160 };
5332 vs_ldpq_indexed(vs1, coeffs, 64, offsets1);
5333 load64shorts(vs2, zetas);
5334 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5335 vs_ldpq_indexed(vs1, coeffs, 0, offsets1);
5336 // kyber_subv_addv64();
5337 vs_subv(vs3, __ T8H, vs1, vs2); // n.b.
trashes vq
5338 vs_addv(vs1, __ T8H, vs1, vs2);
5339 __ add(tmpAddr, coeffs, 0);
5340 vs_stpq_post(vs_front(vs1), tmpAddr);
5341 vs_stpq_post(vs_front(vs3), tmpAddr);
5342 vs_stpq_post(vs_back(vs1), tmpAddr);
5343 vs_stpq_post(vs_back(vs3), tmpAddr);
5344 vs_ldpq(vq, kyberConsts);
5345 vs_ldpq_indexed(vs1, tmpAddr, 64, offsets1);
5346 load64shorts(vs2, zetas);
5347 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5348 vs_ldpq_indexed(vs1, coeffs, 256, offsets1);
5349 // kyber_subv_addv64();
5350 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5351 vs_addv(vs1, __ T8H, vs1, vs2);
5352 __ add(tmpAddr, coeffs, 256);
5353 vs_stpq_post(vs_front(vs1), tmpAddr);
5354 vs_stpq_post(vs_front(vs3), tmpAddr);
5355 vs_stpq_post(vs_back(vs1), tmpAddr);
5356 vs_stpq_post(vs_back(vs3), tmpAddr);
5357
5358 // level 3
5359 vs_ldpq(vq, kyberConsts);
5360 int offsets2[4] = { 0, 64, 128, 192 };
5361 vs_ldpq_indexed(vs1, coeffs, 32, offsets2);
5362 load64shorts(vs2, zetas);
5363 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5364 vs_ldpq_indexed(vs1, coeffs, 0, offsets2);
5365 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5366 vs_addv(vs1, __ T8H, vs1, vs2);
5367 vs_stpq_indexed(vs1, coeffs, 0, offsets2);
5368 vs_stpq_indexed(vs3, coeffs, 32, offsets2);
5369
5370 vs_ldpq(vq, kyberConsts);
5371 vs_ldpq_indexed(vs1, coeffs, 256 + 32, offsets2);
5372 load64shorts(vs2, zetas);
5373 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5374 vs_ldpq_indexed(vs1, coeffs, 256, offsets2);
5375 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5376 vs_addv(vs1, __ T8H, vs1, vs2);
5377 vs_stpq_indexed(vs1, coeffs, 256, offsets2);
5378 vs_stpq_indexed(vs3, coeffs, 256 + 32, offsets2);
5379
5380 // level 4
5381 // At level 4 coefficients occur in 8 discrete blocks of size 16
5382 // so they are loaded using an ldr at 8 distinct offsets.
5383
5384 vs_ldpq(vq, kyberConsts);
5385 int offsets3[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
5386 vs_ldr_indexed(vs1, __ Q, coeffs, 16, offsets3);
5387 load64shorts(vs2, zetas);
5388 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5389 vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
5390 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5391 vs_addv(vs1, __ T8H, vs1, vs2);
5392 vs_str_indexed(vs1, __ Q, coeffs, 0, offsets3);
5393 vs_str_indexed(vs3, __ Q, coeffs, 16, offsets3);
5394
5395 vs_ldpq(vq, kyberConsts);
5396 vs_ldr_indexed(vs1, __ Q, coeffs, 256 + 16, offsets3);
5397 load64shorts(vs2, zetas);
5398 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5399 vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3);
5400 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5401 vs_addv(vs1, __ T8H, vs1, vs2);
5402 vs_str_indexed(vs1, __ Q, coeffs, 256, offsets3);
5403 vs_str_indexed(vs3, __ Q, coeffs, 256 + 16, offsets3);
5404
5405 // level 5
5406 // At level 5 related coefficients occur in discrete blocks of size 8 so
5407 // need to be loaded interleaved using an ld2 operation with arrangement 2D.
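// (Illustrative note on the ld2/st2 layout used here: with arrangement
// 2D an ld2 loads four consecutive doublewords d0 d1 d2 d3 and
// de-interleaves them, so the first target register holds <d0, d2> and
// the second holds <d1, d3>, i.e. alternating 8-byte blocks of
// coefficients land in alternating registers of the sequence; st2
// performs the inverse interleave on store.)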
5408 5409 vs_ldpq(vq, kyberConsts); 5410 int offsets4[4] = { 0, 32, 64, 96 }; 5411 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4); 5412 load32shorts(vs_front(vs2), zetas); 5413 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq); 5414 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4); 5415 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4); 5416 load32shorts(vs_front(vs2), zetas); 5417 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq); 5418 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4); 5419 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4); 5420 load32shorts(vs_front(vs2), zetas); 5421 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq); 5422 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4); 5423 5424 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4); 5425 load32shorts(vs_front(vs2), zetas); 5426 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq); 5427 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4); 5428 5429 // level 6 5430 // At level 6 related coefficients occur in discrete blocks of size 4 so 5431 // need to be loaded interleaved using an ld2 operation with arrangement 4S. 5432 5433 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4); 5434 load32shorts(vs_front(vs2), zetas); 5435 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq); 5436 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4); 5437 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4); 5438 // __ ldpq(v18, v19, __ post(zetas, 32)); 5439 load32shorts(vs_front(vs2), zetas); 5440 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq); 5441 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4); 5442 5443 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4); 5444 load32shorts(vs_front(vs2), zetas); 5445 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq); 5446 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4); 5447 5448 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4); 5449 load32shorts(vs_front(vs2), zetas); 5450 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq); 5451 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4); 5452 5453 __ leave(); // required for proper stackwalking of RuntimeStub frame 5454 __ mov(r0, zr); // return 0 5455 __ ret(lr); 5456 5457 return start; 5458 } 5459 5460 // Kyber Inverse NTT function 5461 // Implements 5462 // static int implKyberInverseNtt(short[] poly, short[] zetas) {} 5463 // 5464 // coeffs (short[256]) = c_rarg0 5465 // ntt_zetas (short[256]) = c_rarg1 5466 address generate_kyberInverseNtt() { 5467 5468 __ align(CodeEntryAlignment); 5469 StubId stub_id = StubId::stubgen_kyberInverseNtt_id; 5470 StubCodeMark mark(this, stub_id); 5471 address start = __ pc(); 5472 __ enter(); 5473 5474 const Register coeffs = c_rarg0; 5475 const Register zetas = c_rarg1; 5476 5477 const Register kyberConsts = r10; 5478 const Register tmpAddr = r11; 5479 const Register tmpAddr2 = c_rarg2; 5480 5481 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x8H inputs/outputs 5482 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3 5483 VSeq<2> vq(30); // n.b. 
constants overlap vs3
5484
5485 __ lea(kyberConsts,
5486 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
5487
5488 // level 0
5489 // At level 0 related coefficients occur in discrete blocks of size 4 so
5490 // need to be loaded interleaved using an ld2 operation with arrangement 4S.
5491
5492 vs_ldpq(vq, kyberConsts);
5493 int offsets4[4] = { 0, 32, 64, 96 };
5494 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
5495 load32shorts(vs_front(vs2), zetas);
5496 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
5497 vs_front(vs2), vs_back(vs2), vtmp, vq);
5498 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
5499 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
5500 load32shorts(vs_front(vs2), zetas);
5501 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
5502 vs_front(vs2), vs_back(vs2), vtmp, vq);
5503 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
5504 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
5505 load32shorts(vs_front(vs2), zetas);
5506 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
5507 vs_front(vs2), vs_back(vs2), vtmp, vq);
5508 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
5509 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
5510 load32shorts(vs_front(vs2), zetas);
5511 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
5512 vs_front(vs2), vs_back(vs2), vtmp, vq);
5513 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
5514
5515 // level 1
5516 // At level 1 related coefficients occur in discrete blocks of size 8 so
5517 // need to be loaded interleaved using an ld2 operation with arrangement 2D.
5518
5519 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
5520 load32shorts(vs_front(vs2), zetas);
5521 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
5522 vs_front(vs2), vs_back(vs2), vtmp, vq);
5523 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
5524 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
5525 load32shorts(vs_front(vs2), zetas);
5526 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
5527 vs_front(vs2), vs_back(vs2), vtmp, vq);
5528 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
5529
5530 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
5531 load32shorts(vs_front(vs2), zetas);
5532 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
5533 vs_front(vs2), vs_back(vs2), vtmp, vq);
5534 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
5535 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
5536 load32shorts(vs_front(vs2), zetas);
5537 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
5538 vs_front(vs2), vs_back(vs2), vtmp, vq);
5539 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
5540
5541 // level 2
5542 // At level 2 coefficients occur in 8 discrete blocks of size 16
5543 // so they are loaded using an ldr at 8 distinct offsets.
5544
5545 int offsets3[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
5546 vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
5547 vs_ldr_indexed(vs2, __ Q, coeffs, 16, offsets3);
5548 vs_addv(vs3, __ T8H, vs1, vs2); // n.b.
trashes vq 5549 vs_subv(vs1, __ T8H, vs1, vs2); 5550 vs_str_indexed(vs3, __ Q, coeffs, 0, offsets3); 5551 load64shorts(vs2, zetas); 5552 vs_ldpq(vq, kyberConsts); 5553 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5554 vs_str_indexed(vs2, __ Q, coeffs, 16, offsets3); 5555 5556 vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3); 5557 vs_ldr_indexed(vs2, __ Q, coeffs, 256 + 16, offsets3); 5558 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5559 vs_subv(vs1, __ T8H, vs1, vs2); 5560 vs_str_indexed(vs3, __ Q, coeffs, 256, offsets3); 5561 load64shorts(vs2, zetas); 5562 vs_ldpq(vq, kyberConsts); 5563 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5564 vs_str_indexed(vs2, __ Q, coeffs, 256 + 16, offsets3); 5565 5566 // Barrett reduction at indexes where overflow may happen 5567 5568 // load q and the multiplier for the Barrett reduction 5569 __ add(tmpAddr, kyberConsts, 16); 5570 vs_ldpq(vq, tmpAddr); 5571 5572 VSeq<8> vq1 = VSeq<8>(vq[0], 0); // 2 constant 8 sequences 5573 VSeq<8> vq2 = VSeq<8>(vq[1], 0); // for above two kyber constants 5574 VSeq<8> vq3 = VSeq<8>(v29, 0); // 3rd sequence for const montmul 5575 vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3); 5576 vs_sqdmulh(vs2, __ T8H, vs1, vq2); 5577 vs_sshr(vs2, __ T8H, vs2, 11); 5578 vs_mlsv(vs1, __ T8H, vs2, vq1); 5579 vs_str_indexed(vs1, __ Q, coeffs, 0, offsets3); 5580 vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3); 5581 vs_sqdmulh(vs2, __ T8H, vs1, vq2); 5582 vs_sshr(vs2, __ T8H, vs2, 11); 5583 vs_mlsv(vs1, __ T8H, vs2, vq1); 5584 vs_str_indexed(vs1, __ Q, coeffs, 256, offsets3); 5585 5586 // level 3 5587 // From level 3 upwards coefficients occur in discrete blocks whose size is 5588 // some multiple of 32 so can be loaded using ldpq and suitable indexes. 5589 5590 int offsets2[4] = { 0, 64, 128, 192 }; 5591 vs_ldpq_indexed(vs1, coeffs, 0, offsets2); 5592 vs_ldpq_indexed(vs2, coeffs, 32, offsets2); 5593 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5594 vs_subv(vs1, __ T8H, vs1, vs2); 5595 vs_stpq_indexed(vs3, coeffs, 0, offsets2); 5596 load64shorts(vs2, zetas); 5597 vs_ldpq(vq, kyberConsts); 5598 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5599 vs_stpq_indexed(vs2, coeffs, 32, offsets2); 5600 5601 vs_ldpq_indexed(vs1, coeffs, 256, offsets2); 5602 vs_ldpq_indexed(vs2, coeffs, 256 + 32, offsets2); 5603 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5604 vs_subv(vs1, __ T8H, vs1, vs2); 5605 vs_stpq_indexed(vs3, coeffs, 256, offsets2); 5606 load64shorts(vs2, zetas); 5607 vs_ldpq(vq, kyberConsts); 5608 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5609 vs_stpq_indexed(vs2, coeffs, 256 + 32, offsets2); 5610 5611 // level 4 5612 5613 int offsets1[4] = { 0, 32, 128, 160 }; 5614 vs_ldpq_indexed(vs1, coeffs, 0, offsets1); 5615 vs_ldpq_indexed(vs2, coeffs, 64, offsets1); 5616 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5617 vs_subv(vs1, __ T8H, vs1, vs2); 5618 vs_stpq_indexed(vs3, coeffs, 0, offsets1); 5619 load64shorts(vs2, zetas); 5620 vs_ldpq(vq, kyberConsts); 5621 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5622 vs_stpq_indexed(vs2, coeffs, 64, offsets1); 5623 5624 vs_ldpq_indexed(vs1, coeffs, 256, offsets1); 5625 vs_ldpq_indexed(vs2, coeffs, 256 + 64, offsets1); 5626 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. 
trashes vq 5627 vs_subv(vs1, __ T8H, vs1, vs2); 5628 vs_stpq_indexed(vs3, coeffs, 256, offsets1); 5629 load64shorts(vs2, zetas); 5630 vs_ldpq(vq, kyberConsts); 5631 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5632 vs_stpq_indexed(vs2, coeffs, 256 + 64, offsets1); 5633 5634 // level 5 5635 5636 __ add(tmpAddr, coeffs, 0); 5637 load64shorts(vs1, tmpAddr); 5638 __ add(tmpAddr, coeffs, 128); 5639 load64shorts(vs2, tmpAddr); 5640 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5641 vs_subv(vs1, __ T8H, vs1, vs2); 5642 __ add(tmpAddr, coeffs, 0); 5643 store64shorts(vs3, tmpAddr); 5644 load64shorts(vs2, zetas); 5645 vs_ldpq(vq, kyberConsts); 5646 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5647 __ add(tmpAddr, coeffs, 128); 5648 store64shorts(vs2, tmpAddr); 5649 5650 load64shorts(vs1, tmpAddr); 5651 __ add(tmpAddr, coeffs, 384); 5652 load64shorts(vs2, tmpAddr); 5653 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5654 vs_subv(vs1, __ T8H, vs1, vs2); 5655 __ add(tmpAddr, coeffs, 256); 5656 store64shorts(vs3, tmpAddr); 5657 load64shorts(vs2, zetas); 5658 vs_ldpq(vq, kyberConsts); 5659 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5660 __ add(tmpAddr, coeffs, 384); 5661 store64shorts(vs2, tmpAddr); 5662 5663 // Barrett reduction at indexes where overflow may happen 5664 5665 // load q and the multiplier for the Barrett reduction 5666 __ add(tmpAddr, kyberConsts, 16); 5667 vs_ldpq(vq, tmpAddr); 5668 5669 int offsets0[2] = { 0, 256 }; 5670 vs_ldpq_indexed(vs_front(vs1), coeffs, 0, offsets0); 5671 vs_sqdmulh(vs2, __ T8H, vs1, vq2); 5672 vs_sshr(vs2, __ T8H, vs2, 11); 5673 vs_mlsv(vs1, __ T8H, vs2, vq1); 5674 vs_stpq_indexed(vs_front(vs1), coeffs, 0, offsets0); 5675 5676 // level 6 5677 5678 __ add(tmpAddr, coeffs, 0); 5679 load64shorts(vs1, tmpAddr); 5680 __ add(tmpAddr, coeffs, 256); 5681 load64shorts(vs2, tmpAddr); 5682 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5683 vs_subv(vs1, __ T8H, vs1, vs2); 5684 __ add(tmpAddr, coeffs, 0); 5685 store64shorts(vs3, tmpAddr); 5686 load64shorts(vs2, zetas); 5687 vs_ldpq(vq, kyberConsts); 5688 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5689 __ add(tmpAddr, coeffs, 256); 5690 store64shorts(vs2, tmpAddr); 5691 5692 __ add(tmpAddr, coeffs, 128); 5693 load64shorts(vs1, tmpAddr); 5694 __ add(tmpAddr, coeffs, 384); 5695 load64shorts(vs2, tmpAddr); 5696 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. 
trashes vq 5697 vs_subv(vs1, __ T8H, vs1, vs2); 5698 __ add(tmpAddr, coeffs, 128); 5699 store64shorts(vs3, tmpAddr); 5700 load64shorts(vs2, zetas); 5701 vs_ldpq(vq, kyberConsts); 5702 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5703 __ add(tmpAddr, coeffs, 384); 5704 store64shorts(vs2, tmpAddr); 5705 5706 // multiply by 2^-n 5707 5708 // load toMont(2^-n mod q) 5709 __ add(tmpAddr, kyberConsts, 48); 5710 __ ldr(v29, __ Q, tmpAddr); 5711 5712 vs_ldpq(vq, kyberConsts); 5713 __ add(tmpAddr, coeffs, 0); 5714 load64shorts(vs1, tmpAddr); 5715 kyber_montmul64(vs2, vs1, vq3, vtmp, vq); 5716 __ add(tmpAddr, coeffs, 0); 5717 store64shorts(vs2, tmpAddr); 5718 5719 // now tmpAddr contains coeffs + 128 because store64shorts adjusted it so 5720 load64shorts(vs1, tmpAddr); 5721 kyber_montmul64(vs2, vs1, vq3, vtmp, vq); 5722 __ add(tmpAddr, coeffs, 128); 5723 store64shorts(vs2, tmpAddr); 5724 5725 // now tmpAddr contains coeffs + 256 5726 load64shorts(vs1, tmpAddr); 5727 kyber_montmul64(vs2, vs1, vq3, vtmp, vq); 5728 __ add(tmpAddr, coeffs, 256); 5729 store64shorts(vs2, tmpAddr); 5730 5731 // now tmpAddr contains coeffs + 384 5732 load64shorts(vs1, tmpAddr); 5733 kyber_montmul64(vs2, vs1, vq3, vtmp, vq); 5734 __ add(tmpAddr, coeffs, 384); 5735 store64shorts(vs2, tmpAddr); 5736 5737 __ leave(); // required for proper stackwalking of RuntimeStub frame 5738 __ mov(r0, zr); // return 0 5739 __ ret(lr); 5740 5741 return start; 5742 } 5743 5744 // Kyber multiply polynomials in the NTT domain. 5745 // Implements 5746 // static int implKyberNttMult( 5747 // short[] result, short[] ntta, short[] nttb, short[] zetas) {} 5748 // 5749 // result (short[256]) = c_rarg0 5750 // ntta (short[256]) = c_rarg1 5751 // nttb (short[256]) = c_rarg2 5752 // zetas (short[128]) = c_rarg3 5753 address generate_kyberNttMult() { 5754 5755 __ align(CodeEntryAlignment); 5756 StubId stub_id = StubId::stubgen_kyberNttMult_id; 5757 StubCodeMark mark(this, stub_id); 5758 address start = __ pc(); 5759 __ enter(); 5760 5761 const Register result = c_rarg0; 5762 const Register ntta = c_rarg1; 5763 const Register nttb = c_rarg2; 5764 const Register zetas = c_rarg3; 5765 5766 const Register kyberConsts = r10; 5767 const Register limit = r11; 5768 5769 VSeq<4> vs1(0), vs2(4); // 4 sets of 8x8H inputs/outputs/tmps 5770 VSeq<4> vs3(16), vs4(20); 5771 VSeq<2> vq(30); // pair of constants for montmul: q, qinv 5772 VSeq<2> vz(28); // pair of zetas 5773 VSeq<4> vc(27, 0); // constant sequence for montmul: montRSquareModQ 5774 5775 __ lea(kyberConsts, 5776 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts)); 5777 5778 Label kyberNttMult_loop; 5779 5780 __ add(limit, result, 512); 5781 5782 // load q and qinv 5783 vs_ldpq(vq, kyberConsts); 5784 5785 // load R^2 mod q (to convert back from Montgomery representation) 5786 __ add(kyberConsts, kyberConsts, 64); 5787 __ ldr(v27, __ Q, kyberConsts); 5788 5789 __ BIND(kyberNttMult_loop); 5790 5791 // load 16 zetas 5792 vs_ldpq_post(vz, zetas); 5793 5794 // load 2 sets of 32 coefficients from the two input arrays 5795 // interleaved as shorts. i.e. pairs of shorts adjacent in memory 5796 // are striped across pairs of vector registers 5797 vs_ld2_post(vs_front(vs1), __ T8H, ntta); // <a0, a1> x 8H 5798 vs_ld2_post(vs_back(vs1), __ T8H, nttb); // <b0, b1> x 8H 5799 vs_ld2_post(vs_front(vs4), __ T8H, ntta); // <a2, a3> x 8H 5800 vs_ld2_post(vs_back(vs4), __ T8H, nttb); // <b2, b3> x 8H 5801 5802 // compute 4 montmul cross-products for pairs (a0,a1) and (b0,b1) 5803 // i.e. 
montmul the first and second halves of vs1 in order and
5804 // then with one sequence reversed storing the two results in vs3
5805 //
5806 // vs3[0] <- montmul(a0, b0)
5807 // vs3[1] <- montmul(a1, b1)
5808 // vs3[2] <- montmul(a0, b1)
5809 // vs3[3] <- montmul(a1, b0)
5810 kyber_montmul16(vs_front(vs3), vs_front(vs1), vs_back(vs1), vs_front(vs2), vq);
5811 kyber_montmul16(vs_back(vs3),
5812 vs_front(vs1), vs_reverse(vs_back(vs1)), vs_back(vs2), vq);
5813
5814 // compute 4 montmul cross-products for pairs (a2,a3) and (b2,b3)
5815 // i.e. montmul the first and second halves of vs4 in order and
5816 // then with one sequence reversed storing the two results in vs1
5817 //
5818 // vs1[0] <- montmul(a2, b2)
5819 // vs1[1] <- montmul(a3, b3)
5820 // vs1[2] <- montmul(a2, b3)
5821 // vs1[3] <- montmul(a3, b2)
5822 kyber_montmul16(vs_front(vs1), vs_front(vs4), vs_back(vs4), vs_front(vs2), vq);
5823 kyber_montmul16(vs_back(vs1),
5824 vs_front(vs4), vs_reverse(vs_back(vs4)), vs_back(vs2), vq);
5825
5826 // montmul result 2 of each cross-product i.e. (a1*b1, a3*b3) by a zeta.
5827 // We can schedule two montmuls at a time if we use a suitable vector
5828 // sequence <vs3[1], vs1[1]>.
5829 int delta = vs1[1]->encoding() - vs3[1]->encoding();
5830 VSeq<2> vs5(vs3[1], delta);
5831
5832 // vs3[1] <- montmul(montmul(a1, b1), z0)
5833 // vs1[1] <- montmul(montmul(a3, b3), z1)
5834 kyber_montmul16(vs5, vz, vs5, vs_front(vs2), vq);
5835
5836 // add results in pairs storing in vs3
5837 // vs3[0] <- montmul(a0, b0) + montmul(montmul(a1, b1), z0);
5838 // vs3[1] <- montmul(a0, b1) + montmul(a1, b0);
5839 vs_addv(vs_front(vs3), __ T8H, vs_even(vs3), vs_odd(vs3));
5840
5841 // vs3[2] <- montmul(a2, b2) + montmul(montmul(a3, b3), z1);
5842 // vs3[3] <- montmul(a2, b3) + montmul(a3, b2);
5843 vs_addv(vs_back(vs3), __ T8H, vs_even(vs1), vs_odd(vs1));
5844
5845 // vs1 <- montmul(vs3, montRSquareModQ)
5846 kyber_montmul32(vs1, vs3, vc, vs2, vq);
5847
5848 // store back the two pairs of result vectors de-interleaved as 8H elements
5849 // i.e. storing each pair of shorts striped across a register pair adjacent
5850 // in memory
5851 vs_st2_post(vs1, __ T8H, result);
5852
5853 __ cmp(result, limit);
5854 __ br(Assembler::NE, kyberNttMult_loop);
5855
5856 __ leave(); // required for proper stackwalking of RuntimeStub frame
5857 __ mov(r0, zr); // return 0
5858 __ ret(lr);
5859
5860 return start;
5861 }
5862
5863 // Kyber add 2 polynomials.
5864 // Implements
5865 // static int implKyberAddPoly(short[] result, short[] a, short[] b) {}
5866 //
5867 // result (short[256]) = c_rarg0
5868 // a (short[256]) = c_rarg1
5869 // b (short[256]) = c_rarg2
5870 address generate_kyberAddPoly_2() {
5871
5872 __ align(CodeEntryAlignment);
5873 StubId stub_id = StubId::stubgen_kyberAddPoly_2_id;
5874 StubCodeMark mark(this, stub_id);
5875 address start = __ pc();
5876 __ enter();
5877
5878 const Register result = c_rarg0;
5879 const Register a = c_rarg1;
5880 const Register b = c_rarg2;
5881
5882 const Register kyberConsts = r11;
5883
5884 // We sum 256 sets of values in total i.e. 32 x 8H quadwords.
5885 // So, we can load, add and store the data in 3 groups of 11,
5886 // 11 and 10 at a time i.e. we need to map sets of 10 or 11
5887 // registers. A further constraint is that the mapping needs
5888 // to skip callee saves. So, we allocate the register
5889 // sequences using two 8 sequences, two 2 sequences and two
5890 // single registers.
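// (Concretely, in the declarations below vs1 maps to v0-v7, v16-v17
// and v28, vs2 maps to v18-v25, v26-v27 and v29, and the constant is
// held in v31, so the callee-saved registers v8-v15 are never
// touched.)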
5891 VSeq<8> vs1_1(0); 5892 VSeq<2> vs1_2(16); 5893 FloatRegister vs1_3 = v28; 5894 VSeq<8> vs2_1(18); 5895 VSeq<2> vs2_2(26); 5896 FloatRegister vs2_3 = v29; 5897 5898 // two constant vector sequences 5899 VSeq<8> vc_1(31, 0); 5900 VSeq<2> vc_2(31, 0); 5901 5902 FloatRegister vc_3 = v31; 5903 __ lea(kyberConsts, 5904 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts)); 5905 5906 __ ldr(vc_3, __ Q, Address(kyberConsts, 16)); // q 5907 for (int i = 0; i < 3; i++) { 5908 // load 80 or 88 values from a into vs1_1/2/3 5909 vs_ldpq_post(vs1_1, a); 5910 vs_ldpq_post(vs1_2, a); 5911 if (i < 2) { 5912 __ ldr(vs1_3, __ Q, __ post(a, 16)); 5913 } 5914 // load 80 or 88 values from b into vs2_1/2/3 5915 vs_ldpq_post(vs2_1, b); 5916 vs_ldpq_post(vs2_2, b); 5917 if (i < 2) { 5918 __ ldr(vs2_3, __ Q, __ post(b, 16)); 5919 } 5920 // sum 80 or 88 values across vs1 and vs2 into vs1 5921 vs_addv(vs1_1, __ T8H, vs1_1, vs2_1); 5922 vs_addv(vs1_2, __ T8H, vs1_2, vs2_2); 5923 if (i < 2) { 5924 __ addv(vs1_3, __ T8H, vs1_3, vs2_3); 5925 } 5926 // add constant to all 80 or 88 results 5927 vs_addv(vs1_1, __ T8H, vs1_1, vc_1); 5928 vs_addv(vs1_2, __ T8H, vs1_2, vc_2); 5929 if (i < 2) { 5930 __ addv(vs1_3, __ T8H, vs1_3, vc_3); 5931 } 5932 // store 80 or 88 values 5933 vs_stpq_post(vs1_1, result); 5934 vs_stpq_post(vs1_2, result); 5935 if (i < 2) { 5936 __ str(vs1_3, __ Q, __ post(result, 16)); 5937 } 5938 } 5939 5940 __ leave(); // required for proper stackwalking of RuntimeStub frame 5941 __ mov(r0, zr); // return 0 5942 __ ret(lr); 5943 5944 return start; 5945 } 5946 5947 // Kyber add 3 polynomials. 5948 // Implements 5949 // static int implKyberAddPoly(short[] result, short[] a, short[] b, short[] c) {} 5950 // 5951 // result (short[256]) = c_rarg0 5952 // a (short[256]) = c_rarg1 5953 // b (short[256]) = c_rarg2 5954 // c (short[256]) = c_rarg3 5955 address generate_kyberAddPoly_3() { 5956 5957 __ align(CodeEntryAlignment); 5958 StubId stub_id = StubId::stubgen_kyberAddPoly_3_id; 5959 StubCodeMark mark(this, stub_id); 5960 address start = __ pc(); 5961 __ enter(); 5962 5963 const Register result = c_rarg0; 5964 const Register a = c_rarg1; 5965 const Register b = c_rarg2; 5966 const Register c = c_rarg3; 5967 5968 const Register kyberConsts = r11; 5969 5970 // As above we sum 256 sets of values in total i.e. 32 x 8H 5971 // quadwords. So, we can load, add and store the data in 3 5972 // groups of 11, 11 and 10 at a time i.e. we need to map sets 5973 // of 10 or 11 registers. A further constraint is that the 5974 // mapping needs to skip callee saves. So, we allocate the 5975 // register sequences using two 8 sequences, two 2 sequences 5976 // and two single registers. 
5977 VSeq<8> vs1_1(0); 5978 VSeq<2> vs1_2(16); 5979 FloatRegister vs1_3 = v28; 5980 VSeq<8> vs2_1(18); 5981 VSeq<2> vs2_2(26); 5982 FloatRegister vs2_3 = v29; 5983 5984 // two constant vector sequences 5985 VSeq<8> vc_1(31, 0); 5986 VSeq<2> vc_2(31, 0); 5987 5988 FloatRegister vc_3 = v31; 5989 5990 __ lea(kyberConsts, 5991 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts)); 5992 5993 __ ldr(vc_3, __ Q, Address(kyberConsts, 16)); // q 5994 for (int i = 0; i < 3; i++) { 5995 // load 80 or 88 values from a into vs1_1/2/3 5996 vs_ldpq_post(vs1_1, a); 5997 vs_ldpq_post(vs1_2, a); 5998 if (i < 2) { 5999 __ ldr(vs1_3, __ Q, __ post(a, 16)); 6000 } 6001 // load 80 or 88 values from b into vs2_1/2/3 6002 vs_ldpq_post(vs2_1, b); 6003 vs_ldpq_post(vs2_2, b); 6004 if (i < 2) { 6005 __ ldr(vs2_3, __ Q, __ post(b, 16)); 6006 } 6007 // sum 80 or 88 values across vs1 and vs2 into vs1 6008 vs_addv(vs1_1, __ T8H, vs1_1, vs2_1); 6009 vs_addv(vs1_2, __ T8H, vs1_2, vs2_2); 6010 if (i < 2) { 6011 __ addv(vs1_3, __ T8H, vs1_3, vs2_3); 6012 } 6013 // load 80 or 88 values from c into vs2_1/2/3 6014 vs_ldpq_post(vs2_1, c); 6015 vs_ldpq_post(vs2_2, c); 6016 if (i < 2) { 6017 __ ldr(vs2_3, __ Q, __ post(c, 16)); 6018 } 6019 // sum 80 or 88 values across vs1 and vs2 into vs1 6020 vs_addv(vs1_1, __ T8H, vs1_1, vs2_1); 6021 vs_addv(vs1_2, __ T8H, vs1_2, vs2_2); 6022 if (i < 2) { 6023 __ addv(vs1_3, __ T8H, vs1_3, vs2_3); 6024 } 6025 // add constant to all 80 or 88 results 6026 vs_addv(vs1_1, __ T8H, vs1_1, vc_1); 6027 vs_addv(vs1_2, __ T8H, vs1_2, vc_2); 6028 if (i < 2) { 6029 __ addv(vs1_3, __ T8H, vs1_3, vc_3); 6030 } 6031 // store 80 or 88 values 6032 vs_stpq_post(vs1_1, result); 6033 vs_stpq_post(vs1_2, result); 6034 if (i < 2) { 6035 __ str(vs1_3, __ Q, __ post(result, 16)); 6036 } 6037 } 6038 6039 __ leave(); // required for proper stackwalking of RuntimeStub frame 6040 __ mov(r0, zr); // return 0 6041 __ ret(lr); 6042 6043 return start; 6044 } 6045 6046 // Kyber parse XOF output to polynomial coefficient candidates 6047 // or decodePoly(12, ...). 6048 // Implements 6049 // static int implKyber12To16( 6050 // byte[] condensed, int index, short[] parsed, int parsedLength) {} 6051 // 6052 // (parsedLength or (parsedLength - 48) must be divisible by 64.) 6053 // 6054 // condensed (byte[]) = c_rarg0 6055 // condensedIndex = c_rarg1 6056 // parsed (short[112 or 256]) = c_rarg2 6057 // parsedLength (112 or 256) = c_rarg3 6058 address generate_kyber12To16() { 6059 Label L_F00, L_loop, L_end; 6060 6061 __ BIND(L_F00); 6062 __ emit_int64(0x0f000f000f000f00); 6063 __ emit_int64(0x0f000f000f000f00); 6064 6065 __ align(CodeEntryAlignment); 6066 StubId stub_id = StubId::stubgen_kyber12To16_id; 6067 StubCodeMark mark(this, stub_id); 6068 address start = __ pc(); 6069 __ enter(); 6070 6071 const Register condensed = c_rarg0; 6072 const Register condensedOffs = c_rarg1; 6073 const Register parsed = c_rarg2; 6074 const Register parsedLength = c_rarg3; 6075 6076 const Register tmpAddr = r11; 6077 6078 // Data is input 96 bytes at a time i.e. in groups of 6 x 16B 6079 // quadwords so we need a 6 vector sequence for the inputs. 6080 // Parsing produces 64 shorts, employing two 8 vector 6081 // sequences to store and combine the intermediate data. 
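// (Illustrative scalar view of the packing handled below: each
// consecutive byte triple b0, b1, b2 encodes two 12-bit values
//   x = ((b1 & 0x0f) << 8) | b0
//   y = (b2 << 4) | (b1 >> 4)
// and each loop iteration reconstructs 64 such values.)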
6082 VSeq<6> vin(24);
6083 VSeq<8> va(0), vb(16);
6084
6085 __ adr(tmpAddr, L_F00);
6086 __ ldr(v31, __ Q, tmpAddr); // 8H times 0x0f00
6087 __ add(condensed, condensed, condensedOffs);
6088
6089 __ BIND(L_loop);
6090 // load 96 (6 x 16B) byte values
6091 vs_ld3_post(vin, __ T16B, condensed);
6092
6093 // The front half of sequence vin (vin[0], vin[1] and vin[2])
6094 // holds 48 (16x3) contiguous bytes from memory striped
6095 // horizontally across each of the 16 byte lanes. Equivalently,
6096 // that is 16 pairs of 12-bit integers. Likewise the back half
6097 // holds the next 48 bytes in the same arrangement.
6098
6099 // Each vector in the front half can also be viewed as a vertical
6100 // strip across the 16 pairs of 12 bit integers. Each byte in
6101 // vin[0] stores the low 8 bits of the first int in a pair. Each
6102 // byte in vin[1] stores the high 4 bits of the first int and the
6103 // low 4 bits of the second int. Each byte in vin[2] stores the
6104 // high 8 bits of the second int. Likewise the vectors in the
6105 // second half.
6106
6107 // Converting the data to 16-bit shorts requires first of all
6108 // expanding each of the 6 x 16B vectors into 6 corresponding
6109 // pairs of 8H vectors. Mask, shift and add operations on the
6110 // resulting vector pairs can be used to combine 4 and 8 bit
6111 // parts of related 8H vector elements.
6112 //
6113 // The middle vectors (vin[1] and vin[4]) are actually expanded
6114 // twice, one copy manipulated to provide the lower 4 bits
6115 // belonging to the first short in a pair and another copy
6116 // manipulated to provide the higher 4 bits belonging to the
6117 // second short in a pair. This is why the vector sequences va
6118 // and vb used to hold the expanded 8H elements are of length 8.
6119
6120 // Expand vin[0] into va[0:1], and vin[1] into va[2:3] and va[4:5]
6121 // n.b. target elements 2 and 3 duplicate elements 4 and 5
6122 __ ushll(va[0], __ T8H, vin[0], __ T8B, 0);
6123 __ ushll2(va[1], __ T8H, vin[0], __ T16B, 0);
6124 __ ushll(va[2], __ T8H, vin[1], __ T8B, 0);
6125 __ ushll2(va[3], __ T8H, vin[1], __ T16B, 0);
6126 __ ushll(va[4], __ T8H, vin[1], __ T8B, 0);
6127 __ ushll2(va[5], __ T8H, vin[1], __ T16B, 0);
6128
6129 // likewise expand vin[3] into vb[0:1], and vin[4] into vb[2:3]
6130 // and vb[4:5]
6131 __ ushll(vb[0], __ T8H, vin[3], __ T8B, 0);
6132 __ ushll2(vb[1], __ T8H, vin[3], __ T16B, 0);
6133 __ ushll(vb[2], __ T8H, vin[4], __ T8B, 0);
6134 __ ushll2(vb[3], __ T8H, vin[4], __ T16B, 0);
6135 __ ushll(vb[4], __ T8H, vin[4], __ T8B, 0);
6136 __ ushll2(vb[5], __ T8H, vin[4], __ T16B, 0);
6137
6138 // shift lo byte of copy 1 of the middle stripe into the high byte
6139 __ shl(va[2], __ T8H, va[2], 8);
6140 __ shl(va[3], __ T8H, va[3], 8);
6141 __ shl(vb[2], __ T8H, vb[2], 8);
6142 __ shl(vb[3], __ T8H, vb[3], 8);
6143
6144 // expand vin[2] into va[6:7] and vin[5] into vb[6:7] but this
6145 // time pre-shifted by 4 to ensure top bits of input 12-bit int
6146 // are in bit positions [4..11].
6147 __ ushll(va[6], __ T8H, vin[2], __ T8B, 4); 6148 __ ushll2(va[7], __ T8H, vin[2], __ T16B, 4); 6149 __ ushll(vb[6], __ T8H, vin[5], __ T8B, 4); 6150 __ ushll2(vb[7], __ T8H, vin[5], __ T16B, 4); 6151 6152 // mask hi 4 bits of the 1st 12-bit int in a pair from copy1 and 6153 // shift lo 4 bits of the 2nd 12-bit int in a pair to the bottom of 6154 // copy2 6155 __ andr(va[2], __ T16B, va[2], v31); 6156 __ andr(va[3], __ T16B, va[3], v31); 6157 __ ushr(va[4], __ T8H, va[4], 4); 6158 __ ushr(va[5], __ T8H, va[5], 4); 6159 __ andr(vb[2], __ T16B, vb[2], v31); 6160 __ andr(vb[3], __ T16B, vb[3], v31); 6161 __ ushr(vb[4], __ T8H, vb[4], 4); 6162 __ ushr(vb[5], __ T8H, vb[5], 4); 6163 6164 // sum hi 4 bits and lo 8 bits of the 1st 12-bit int in each pair and 6165 // hi 8 bits plus lo 4 bits of the 2nd 12-bit int in each pair 6166 // n.b. the ordering ensures: i) inputs are consumed before they 6167 // are overwritten ii) the order of 16-bit results across successive 6168 // pairs of vectors in va and then vb reflects the order of the 6169 // corresponding 12-bit inputs 6170 __ addv(va[0], __ T8H, va[0], va[2]); 6171 __ addv(va[2], __ T8H, va[1], va[3]); 6172 __ addv(va[1], __ T8H, va[4], va[6]); 6173 __ addv(va[3], __ T8H, va[5], va[7]); 6174 __ addv(vb[0], __ T8H, vb[0], vb[2]); 6175 __ addv(vb[2], __ T8H, vb[1], vb[3]); 6176 __ addv(vb[1], __ T8H, vb[4], vb[6]); 6177 __ addv(vb[3], __ T8H, vb[5], vb[7]); 6178 6179 // store 64 results interleaved as shorts 6180 vs_st2_post(vs_front(va), __ T8H, parsed); 6181 vs_st2_post(vs_front(vb), __ T8H, parsed); 6182 6183 __ sub(parsedLength, parsedLength, 64); 6184 __ cmp(parsedLength, (u1)64); 6185 __ br(Assembler::GE, L_loop); 6186 __ cbz(parsedLength, L_end); 6187 6188 // if anything is left it should be a final 72 bytes of input 6189 // i.e. a final 48 12-bit values. so we handle this by loading 6190 // 48 bytes into all 16B lanes of front(vin) and only 24 6191 // bytes into the lower 8B lane of back(vin) 6192 vs_ld3_post(vs_front(vin), __ T16B, condensed); 6193 vs_ld3(vs_back(vin), __ T8B, condensed); 6194 6195 // Expand vin[0] into va[0:1], and vin[1] into va[2:3] and va[4:5] 6196 // n.b. target elements 2 and 3 of va duplicate elements 4 and 6197 // 5 and target element 2 of vb duplicates element 4. 6198 __ ushll(va[0], __ T8H, vin[0], __ T8B, 0); 6199 __ ushll2(va[1], __ T8H, vin[0], __ T16B, 0); 6200 __ ushll(va[2], __ T8H, vin[1], __ T8B, 0); 6201 __ ushll2(va[3], __ T8H, vin[1], __ T16B, 0); 6202 __ ushll(va[4], __ T8H, vin[1], __ T8B, 0); 6203 __ ushll2(va[5], __ T8H, vin[1], __ T16B, 0); 6204 6205 // This time expand just the lower 8 lanes 6206 __ ushll(vb[0], __ T8H, vin[3], __ T8B, 0); 6207 __ ushll(vb[2], __ T8H, vin[4], __ T8B, 0); 6208 __ ushll(vb[4], __ T8H, vin[4], __ T8B, 0); 6209 6210 // shift lo byte of copy 1 of the middle stripe into the high byte 6211 __ shl(va[2], __ T8H, va[2], 8); 6212 __ shl(va[3], __ T8H, va[3], 8); 6213 __ shl(vb[2], __ T8H, vb[2], 8); 6214 6215 // expand vin[2] into va[6:7] and lower 8 lanes of vin[5] into 6216 // vb[6] pre-shifted by 4 to ensure top bits of the input 12-bit 6217 // int are in bit positions [4..11]. 
6218 __ ushll(va[6], __ T8H, vin[2], __ T8B, 4);
6219 __ ushll2(va[7], __ T8H, vin[2], __ T16B, 4);
6220 __ ushll(vb[6], __ T8H, vin[5], __ T8B, 4);
6221
6222 // mask hi 4 bits of each 1st 12-bit int in pair from copy1 and
6223 // shift lo 4 bits of each 2nd 12-bit int in pair to bottom of
6224 // copy2
6225 __ andr(va[2], __ T16B, va[2], v31);
6226 __ andr(va[3], __ T16B, va[3], v31);
6227 __ ushr(va[4], __ T8H, va[4], 4);
6228 __ ushr(va[5], __ T8H, va[5], 4);
6229 __ andr(vb[2], __ T16B, vb[2], v31);
6230 __ ushr(vb[4], __ T8H, vb[4], 4);
6231
6232
6233
6234 // sum hi 4 bits and lo 8 bits of each 1st 12-bit int in pair and
6235 // hi 8 bits plus lo 4 bits of each 2nd 12-bit int in pair
6236
6237 // n.b. ordering ensures: i) inputs are consumed before they are
6238 // overwritten ii) order of 16-bit results across successive
6239 // pairs of vectors in va and then lower half of vb reflects order
6240 // of corresponding 12-bit inputs
6241 __ addv(va[0], __ T8H, va[0], va[2]);
6242 __ addv(va[2], __ T8H, va[1], va[3]);
6243 __ addv(va[1], __ T8H, va[4], va[6]);
6244 __ addv(va[3], __ T8H, va[5], va[7]);
6245 __ addv(vb[0], __ T8H, vb[0], vb[2]);
6246 __ addv(vb[1], __ T8H, vb[4], vb[6]);
6247
6248 // store 48 results interleaved as shorts
6249 vs_st2_post(vs_front(va), __ T8H, parsed);
6250 vs_st2_post(vs_front(vs_front(vb)), __ T8H, parsed);
6251
6252 __ BIND(L_end);
6253
6254 __ leave(); // required for proper stackwalking of RuntimeStub frame
6255 __ mov(r0, zr); // return 0
6256 __ ret(lr);
6257
6258 return start;
6259 }
6260
6261 // Kyber Barrett reduce function.
6262 // Implements
6263 // static int implKyberBarrettReduce(short[] coeffs) {}
6264 //
6265 // coeffs (short[256]) = c_rarg0
6266 address generate_kyberBarrettReduce() {
6267
6268 __ align(CodeEntryAlignment);
6269 StubId stub_id = StubId::stubgen_kyberBarrettReduce_id;
6270 StubCodeMark mark(this, stub_id);
6271 address start = __ pc();
6272 __ enter();
6273
6274 const Register coeffs = c_rarg0;
6275
6276 const Register kyberConsts = r10;
6277 const Register result = r11;
6278
6279 // As above we process 256 sets of values in total i.e. 32 x
6280 // 8H quadwords. So, we can load, add and store the data in 3
6281 // groups of 11, 11 and 10 at a time i.e. we need to map sets
6282 // of 10 or 11 registers. A further constraint is that the
6283 // mapping needs to skip callee saves. So, we allocate the
6284 // register sequences using two 8 sequences, two 2 sequences
6285 // and two single registers.
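// (Illustrative scalar sketch of the reduction performed below:
//   t = (x * kyberBarrettMultiplier) >> 26;  x = x - t * kyber_q;
// the vector code obtains the >> 26 as an sqdmulh, which already
// drops 16 bits of the doubled product, followed by an sshr by 11.)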
6286 VSeq<8> vs1_1(0); 6287 VSeq<2> vs1_2(16); 6288 FloatRegister vs1_3 = v28; 6289 VSeq<8> vs2_1(18); 6290 VSeq<2> vs2_2(26); 6291 FloatRegister vs2_3 = v29; 6292 6293 // we also need a pair of corresponding constant sequences 6294 6295 VSeq<8> vc1_1(30, 0); 6296 VSeq<2> vc1_2(30, 0); 6297 FloatRegister vc1_3 = v30; // for kyber_q 6298 6299 VSeq<8> vc2_1(31, 0); 6300 VSeq<2> vc2_2(31, 0); 6301 FloatRegister vc2_3 = v31; // for kyberBarrettMultiplier 6302 6303 __ add(result, coeffs, 0); 6304 __ lea(kyberConsts, 6305 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts)); 6306 6307 // load q and the multiplier for the Barrett reduction 6308 __ add(kyberConsts, kyberConsts, 16); 6309 __ ldpq(vc1_3, vc2_3, kyberConsts); 6310 6311 for (int i = 0; i < 3; i++) { 6312 // load 80 or 88 coefficients 6313 vs_ldpq_post(vs1_1, coeffs); 6314 vs_ldpq_post(vs1_2, coeffs); 6315 if (i < 2) { 6316 __ ldr(vs1_3, __ Q, __ post(coeffs, 16)); 6317 } 6318 6319 // vs2 <- (2 * vs1 * kyberBarrettMultiplier) >> 16 6320 vs_sqdmulh(vs2_1, __ T8H, vs1_1, vc2_1); 6321 vs_sqdmulh(vs2_2, __ T8H, vs1_2, vc2_2); 6322 if (i < 2) { 6323 __ sqdmulh(vs2_3, __ T8H, vs1_3, vc2_3); 6324 } 6325 6326 // vs2 <- (vs1 * kyberBarrettMultiplier) >> 26 6327 vs_sshr(vs2_1, __ T8H, vs2_1, 11); 6328 vs_sshr(vs2_2, __ T8H, vs2_2, 11); 6329 if (i < 2) { 6330 __ sshr(vs2_3, __ T8H, vs2_3, 11); 6331 } 6332 6333 // vs1 <- vs1 - vs2 * kyber_q 6334 vs_mlsv(vs1_1, __ T8H, vs2_1, vc1_1); 6335 vs_mlsv(vs1_2, __ T8H, vs2_2, vc1_2); 6336 if (i < 2) { 6337 __ mlsv(vs1_3, __ T8H, vs2_3, vc1_3); 6338 } 6339 6340 vs_stpq_post(vs1_1, result); 6341 vs_stpq_post(vs1_2, result); 6342 if (i < 2) { 6343 __ str(vs1_3, __ Q, __ post(result, 16)); 6344 } 6345 } 6346 6347 __ leave(); // required for proper stackwalking of RuntimeStub frame 6348 __ mov(r0, zr); // return 0 6349 __ ret(lr); 6350 6351 return start; 6352 } 6353 6354 6355 // Dilithium-specific montmul helper routines that generate parallel 6356 // code for, respectively, a single 4x4s vector sequence montmul or 6357 // two such multiplies in a row. 6358 6359 // Perform 16 32-bit Montgomery multiplications in parallel 6360 void dilithium_montmul16(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc, 6361 const VSeq<4>& vtmp, const VSeq<2>& vq) { 6362 // Use the helper routine to schedule a 4x4S Montgomery multiply. 6363 // It will assert that the register use is valid 6364 vs_montmul4(va, vb, vc, __ T4S, vtmp, vq); 6365 } 6366 6367 // Perform 2x16 32-bit Montgomery multiplications in parallel 6368 void dilithium_montmul32(const VSeq<8>& va, const VSeq<8>& vb, const VSeq<8>& vc, 6369 const VSeq<4>& vtmp, const VSeq<2>& vq) { 6370 // Schedule two successive 4x4S multiplies via the montmul helper 6371 // on the front and back halves of va, vb and vc. The helper will 6372 // assert that the register use has no overlap conflicts on each 6373 // individual call but we also need to ensure that the necessary 6374 // disjoint/equality constraints are met across both calls. 6375 6376 // vb, vc, vtmp and vq must be disjoint. 
va must either be
6377 // disjoint from all other registers or equal vc
6378
6379 assert(vs_disjoint(vb, vc), "vb and vc overlap");
6380 assert(vs_disjoint(vb, vq), "vb and vq overlap");
6381 assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
6382
6383 assert(vs_disjoint(vc, vq), "vc and vq overlap");
6384 assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
6385
6386 assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
6387
6388 assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
6389 assert(vs_disjoint(va, vb), "va and vb overlap");
6390 assert(vs_disjoint(va, vq), "va and vq overlap");
6391 assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
6392
6393 // We multiply the front and back halves of each sequence 4 at a
6394 // time because
6395 //
6396 // 1) we are currently only able to get 4-way instruction
6397 // parallelism at best
6398 //
6399 // 2) we need registers for the constants in vq and temporary
6400 // scratch registers to hold intermediate results so vtmp can only
6401 // be a VSeq<4> which means we only have 4 scratch slots.
6402
6403 vs_montmul4(vs_front(va), vs_front(vb), vs_front(vc), __ T4S, vtmp, vq);
6404 vs_montmul4(vs_back(va), vs_back(vb), vs_back(vc), __ T4S, vtmp, vq);
6405 }
6406
6407 // Perform combined montmul then add/sub on 4x4S vectors.
6408 void dilithium_montmul16_sub_add(
6409 const VSeq<4>& va0, const VSeq<4>& va1, const VSeq<4>& vc,
6410 const VSeq<4>& vtmp, const VSeq<2>& vq) {
6411 // compute a = montmul(a1, c)
6412 dilithium_montmul16(vc, va1, vc, vtmp, vq);
6413 // output a1 = a0 - a
6414 vs_subv(va1, __ T4S, va0, vc);
6415 // and a0 = a0 + a
6416 vs_addv(va0, __ T4S, va0, vc);
6417 }
6418
6419 // Perform combined add/sub then montmul on 4x4S vectors.
6420 void dilithium_sub_add_montmul16(
6421 const VSeq<4>& va0, const VSeq<4>& va1, const VSeq<4>& vb,
6422 const VSeq<4>& vtmp1, const VSeq<4>& vtmp2, const VSeq<2>& vq) {
6423 // compute c = a0 - a1
6424 vs_subv(vtmp1, __ T4S, va0, va1);
6425 // output a0 = a0 + a1
6426 vs_addv(va0, __ T4S, va0, va1);
6427 // output a1 = b montmul c
6428 dilithium_montmul16(va1, vtmp1, vb, vtmp2, vq);
6429 }
6430
6431 // At these levels, the indices that correspond to the 'j's (and 'j+l's)
6432 // in the Java implementation come in sequences of at least 8, so we
6433 // can use ldpq to collect the corresponding data into pairs of vector
6434 // registers.
6435 // We collect the coefficients corresponding to the 'j+l' indexes into
6436 // the vector registers v0-v7, the zetas into the vector registers v16-v23
6437 // then we do the (Montgomery) multiplications by the zetas in parallel
6438 // into v16-v23, load the coeffs corresponding to the 'j' indexes into
6439 // v0-v7, then do the additions into v24-v31 and the subtractions into
6440 // v0-v7 and finally save the results back to the coeffs array.
6441 void dilithiumNttLevel0_4(const Register dilithiumConsts,
6442 const Register coeffs, const Register zetas) {
6443 int c1 = 0;
6444 int c2 = 512;
6445 int startIncr;
6446 // don't use callee save registers v8 - v15
6447 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs
6448 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
6449 VSeq<2> vq(30); // n.b.
constants overlap vs3 6450 int offsets[4] = { 0, 32, 64, 96 }; 6451 6452 for (int level = 0; level < 5; level++) { 6453 int c1Start = c1; 6454 int c2Start = c2; 6455 if (level == 3) { 6456 offsets[1] = 32; 6457 offsets[2] = 128; 6458 offsets[3] = 160; 6459 } else if (level == 4) { 6460 offsets[1] = 64; 6461 offsets[2] = 128; 6462 offsets[3] = 192; 6463 } 6464 6465 // For levels 1 - 4 we simply load 2 x 4 adjacent values at a 6466 // time at 4 different offsets and multiply them in order by the 6467 // next set of input values. So we employ indexed load and store 6468 // pair instructions with arrangement 4S. 6469 for (int i = 0; i < 4; i++) { 6470 // reload q and qinv 6471 vs_ldpq(vq, dilithiumConsts); // qInv, q 6472 // load 8x4S coefficients via second start pos == c2 6473 vs_ldpq_indexed(vs1, coeffs, c2Start, offsets); 6474 // load next 8x4S inputs == b 6475 vs_ldpq_post(vs2, zetas); 6476 // compute a == c2 * b mod MONT_Q 6477 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq); 6478 // load 8x4s coefficients via first start pos == c1 6479 vs_ldpq_indexed(vs1, coeffs, c1Start, offsets); 6480 // compute a1 = c1 + a 6481 vs_addv(vs3, __ T4S, vs1, vs2); 6482 // compute a2 = c1 - a 6483 vs_subv(vs1, __ T4S, vs1, vs2); 6484 // output a1 and a2 6485 vs_stpq_indexed(vs3, coeffs, c1Start, offsets); 6486 vs_stpq_indexed(vs1, coeffs, c2Start, offsets); 6487 6488 int k = 4 * level + i; 6489 6490 if (k > 7) { 6491 startIncr = 256; 6492 } else if (k == 5) { 6493 startIncr = 384; 6494 } else { 6495 startIncr = 128; 6496 } 6497 6498 c1Start += startIncr; 6499 c2Start += startIncr; 6500 } 6501 6502 c2 /= 2; 6503 } 6504 } 6505 6506 // Dilithium NTT function except for the final "normalization" to |coeff| < Q. 6507 // Implements the method 6508 // static int implDilithiumAlmostNtt(int[] coeffs, int zetas[]) {} 6509 // of the Java class sun.security.provider 6510 // 6511 // coeffs (int[256]) = c_rarg0 6512 // zetas (int[256]) = c_rarg1 6513 address generate_dilithiumAlmostNtt() { 6514 6515 __ align(CodeEntryAlignment); 6516 StubId stub_id = StubId::stubgen_dilithiumAlmostNtt_id; 6517 StubCodeMark mark(this, stub_id); 6518 address start = __ pc(); 6519 __ enter(); 6520 6521 const Register coeffs = c_rarg0; 6522 const Register zetas = c_rarg1; 6523 6524 const Register tmpAddr = r9; 6525 const Register dilithiumConsts = r10; 6526 const Register result = r11; 6527 // don't use callee save registers v8 - v15 6528 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs 6529 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3 6530 VSeq<2> vq(30); // n.b. constants overlap vs3 6531 int offsets[4] = { 0, 32, 64, 96}; 6532 int offsets1[8] = { 16, 48, 80, 112, 144, 176, 208, 240 }; 6533 int offsets2[8] = { 0, 32, 64, 96, 128, 160, 192, 224 }; 6534 __ add(result, coeffs, 0); 6535 __ lea(dilithiumConsts, 6536 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts)); 6537 6538 // Each level represents one iteration of the outer for loop of the Java version. 6539 6540 // level 0-4 6541 dilithiumNttLevel0_4(dilithiumConsts, coeffs, zetas); 6542 6543 // level 5 6544 6545 // At level 5 the coefficients we need to combine with the zetas 6546 // are grouped in memory in blocks of size 4. So, for both sets of 6547 // coefficients we load 4 adjacent values at 8 different offsets 6548 // using an indexed ldr with register variant Q and multiply them 6549 // in sequence order by the next set of inputs. Likewise we store 6550 // the resuls using an indexed str with register variant Q. 
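    // Editor's note: for orientation, the dilithium_montmul16/32 helpers used in
    // the loops below perform, per 32-bit lane, a signed Montgomery
    // multiplication. A minimal scalar sketch, assuming the ML-DSA constants
    // q = 8380417 and q^-1 mod 2^32 = 58728449; this is an illustration only and
    // is neither emitted into nor used by the generated stub.
    auto montmul_ref = [](int32_t a, int32_t b) -> int32_t {
      const int64_t q = 8380417;                          // Dilithium modulus
      int64_t prod = (int64_t)a * b;                      // full 64-bit product
      int32_t m = (int32_t)((uint32_t)prod * 58728449u);  // prod * q^-1 (mod 2^32)
      return (int32_t)((prod - (int64_t)m * q) >> 32);    // == a * b * 2^-32 (mod q)
    };
    (void)montmul_ref; // unused; reference sketch only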
6551     for (int i = 0; i < 1024; i += 256) {
6552       // reload constants q, qinv each iteration as they get clobbered later
6553       vs_ldpq(vq, dilithiumConsts); // qInv, q
6554       // load 32 (8x4S) coefficients via first offsets = c1
6555       vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets1);
6556       // load next 32 (8x4S) inputs = b
6557       vs_ldpq_post(vs2, zetas);
6558       // a = b montmul c1
6559       dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
6560       // load 32 (8x4S) coefficients via second offsets = c2
6561       vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets2);
6562       // add/sub with result of multiply
6563       vs_addv(vs3, __ T4S, vs1, vs2); // a1 = c2 + a
6564       vs_subv(vs1, __ T4S, vs1, vs2); // a2 = c2 - a
6565       // write back new coefficients using same offsets
6566       vs_str_indexed(vs3, __ Q, coeffs, i, offsets2);
6567       vs_str_indexed(vs1, __ Q, coeffs, i, offsets1);
6568     }
6569
6570     // level 6
6571     // At level 6 the coefficients we need to combine with the zetas
6572     // are grouped in memory in pairs, the first of each pair being a
6573     // montmul input and the second an add/sub input. We can still implement
6574     // the montmul+sub+add using 4-way parallelism but only if we
6575     // combine the coefficients with the zetas 16 at a time. We load 8
6576     // adjacent values at 4 different offsets using an ld2 load with
6577     // arrangement 2D. That interleaves the lower and upper halves of
6578     // each pair of quadwords into successive vector registers. We
6579     // then need to montmul the 4 even elements of the coefficients
6580     // register sequence by the zetas in order and then add/sub the 4
6581     // odd elements of the coefficients register sequence. We use an
6582     // equivalent st2 operation to store the results back into memory
6583     // de-interleaved.
6584     for (int i = 0; i < 1024; i += 128) {
6585       // reload constants q, qinv each iteration as they get clobbered later
6586       vs_ldpq(vq, dilithiumConsts); // qInv, q
6587       // load interleaved 16 (4x2D) coefficients via offsets
6588       vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
6589       // load next 16 (4x4S) inputs
6590       vs_ldpq_post(vs_front(vs2), zetas);
6591       // mont multiply odd elements of vs1 by vs2 and add/sub into odds/evens
6592       dilithium_montmul16_sub_add(vs_even(vs1), vs_odd(vs1),
6593                                   vs_front(vs2), vtmp, vq);
6594       // store interleaved 16 (4x2D) coefficients via offsets
6595       vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
6596     }
6597
6598     // level 7
6599     // At level 7 the coefficients we need to combine with the zetas
6600     // occur singly, with montmul inputs alternating with add/sub
6601     // inputs. Once again we can use 4-way parallelism to combine 16
6602     // zetas at a time. However, we have to load 8 adjacent values at
6603     // 4 different offsets using an ld2 load with arrangement 4S. That
6604     // interleaves the odd words of each pair into one
6605     // coefficients vector register and the even words of the pair
6606     // into the next register. We then need to montmul the 4 even
6607     // elements of the coefficients register sequence by the zetas in
6608     // order and then add/sub the 4 odd elements of the coefficients
6609     // register sequence. We use an equivalent st2 operation to store
6610     // the results back into memory de-interleaved.
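    // Editor's note: per lane, the dilithium_montmul16_sub_add pattern used by
    // the level 6 loop above and the level 7 loop below is the usual forward
    // NTT butterfly, sketched here in scalar form (montmul stands for the
    // Montgomery multiply a * b * 2^-32 mod q shown earlier). Illustration
    // only; not used by the generated stub.
    auto ntt_butterfly_ref = [](int32_t& c_j, int32_t& c_jl, int32_t zeta,
                                int32_t (*montmul)(int32_t, int32_t)) {
      int32_t t = montmul(zeta, c_jl);  // t = zeta * c[j+l] * 2^-32 (mod q)
      c_jl = c_j - t;                   // new c[j+l]
      c_j  = c_j + t;                   // new c[j]
    };
    (void)ntt_butterfly_ref; // unused; reference sketch only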
6611 6612 for (int i = 0; i < 1024; i += 128) { 6613 // reload constants q, qinv each iteration as they get clobbered later 6614 vs_ldpq(vq, dilithiumConsts); // qInv, q 6615 // load interleaved 16 (4x4S) coefficients via offsets 6616 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets); 6617 // load next 16 (4x4S) inputs 6618 vs_ldpq_post(vs_front(vs2), zetas); 6619 // mont multiply odd elements of vs1 by vs2 and add/sub into odds/evens 6620 dilithium_montmul16_sub_add(vs_even(vs1), vs_odd(vs1), 6621 vs_front(vs2), vtmp, vq); 6622 // store interleaved 16 (4x4S) coefficients via offsets 6623 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets); 6624 } 6625 __ leave(); // required for proper stackwalking of RuntimeStub frame 6626 __ mov(r0, zr); // return 0 6627 __ ret(lr); 6628 6629 return start; 6630 } 6631 6632 // At these levels, the indices that correspond to the 'j's (and 'j+l's) 6633 // in the Java implementation come in sequences of at least 8, so we 6634 // can use ldpq to collect the corresponding data into pairs of vector 6635 // registers 6636 // We collect the coefficients that correspond to the 'j's into vs1 6637 // the coefficiets that correspond to the 'j+l's into vs2 then 6638 // do the additions into vs3 and the subtractions into vs1 then 6639 // save the result of the additions, load the zetas into vs2 6640 // do the (Montgomery) multiplications by zeta in parallel into vs2 6641 // finally save the results back to the coeffs array 6642 void dilithiumInverseNttLevel3_7(const Register dilithiumConsts, 6643 const Register coeffs, const Register zetas) { 6644 int c1 = 0; 6645 int c2 = 32; 6646 int startIncr; 6647 int offsets[4]; 6648 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs 6649 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3 6650 VSeq<2> vq(30); // n.b. constants overlap vs3 6651 6652 offsets[0] = 0; 6653 6654 for (int level = 3; level < 8; level++) { 6655 int c1Start = c1; 6656 int c2Start = c2; 6657 if (level == 3) { 6658 offsets[1] = 64; 6659 offsets[2] = 128; 6660 offsets[3] = 192; 6661 } else if (level == 4) { 6662 offsets[1] = 32; 6663 offsets[2] = 128; 6664 offsets[3] = 160; 6665 } else { 6666 offsets[1] = 32; 6667 offsets[2] = 64; 6668 offsets[3] = 96; 6669 } 6670 6671 // For levels 3 - 7 we simply load 2 x 4 adjacent values at a 6672 // time at 4 different offsets and multiply them in order by the 6673 // next set of input values. So we employ indexed load and store 6674 // pair instructions with arrangement 4S. 6675 for (int i = 0; i < 4; i++) { 6676 // load v1 32 (8x4S) coefficients relative to first start index 6677 vs_ldpq_indexed(vs1, coeffs, c1Start, offsets); 6678 // load v2 32 (8x4S) coefficients relative to second start index 6679 vs_ldpq_indexed(vs2, coeffs, c2Start, offsets); 6680 // a0 = v1 + v2 -- n.b. 
clobbers vqs 6681 vs_addv(vs3, __ T4S, vs1, vs2); 6682 // a1 = v1 - v2 6683 vs_subv(vs1, __ T4S, vs1, vs2); 6684 // save a1 relative to first start index 6685 vs_stpq_indexed(vs3, coeffs, c1Start, offsets); 6686 // load constants q, qinv each iteration as they get clobbered above 6687 vs_ldpq(vq, dilithiumConsts); // qInv, q 6688 // load b next 32 (8x4S) inputs 6689 vs_ldpq_post(vs2, zetas); 6690 // a = a1 montmul b 6691 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq); 6692 // save a relative to second start index 6693 vs_stpq_indexed(vs2, coeffs, c2Start, offsets); 6694 6695 int k = 4 * level + i; 6696 6697 if (k < 24) { 6698 startIncr = 256; 6699 } else if (k == 25) { 6700 startIncr = 384; 6701 } else { 6702 startIncr = 128; 6703 } 6704 6705 c1Start += startIncr; 6706 c2Start += startIncr; 6707 } 6708 6709 c2 *= 2; 6710 } 6711 } 6712 6713 // Dilithium Inverse NTT function except the final mod Q division by 2^256. 6714 // Implements the method 6715 // static int implDilithiumAlmostInverseNtt(int[] coeffs, int[] zetas) {} of 6716 // the sun.security.provider.ML_DSA class. 6717 // 6718 // coeffs (int[256]) = c_rarg0 6719 // zetas (int[256]) = c_rarg1 6720 address generate_dilithiumAlmostInverseNtt() { 6721 6722 __ align(CodeEntryAlignment); 6723 StubId stub_id = StubId::stubgen_dilithiumAlmostInverseNtt_id; 6724 StubCodeMark mark(this, stub_id); 6725 address start = __ pc(); 6726 __ enter(); 6727 6728 const Register coeffs = c_rarg0; 6729 const Register zetas = c_rarg1; 6730 6731 const Register tmpAddr = r9; 6732 const Register dilithiumConsts = r10; 6733 const Register result = r11; 6734 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs 6735 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3 6736 VSeq<2> vq(30); // n.b. constants overlap vs3 6737 int offsets[4] = { 0, 32, 64, 96 }; 6738 int offsets1[8] = { 0, 32, 64, 96, 128, 160, 192, 224 }; 6739 int offsets2[8] = { 16, 48, 80, 112, 144, 176, 208, 240 }; 6740 6741 __ add(result, coeffs, 0); 6742 __ lea(dilithiumConsts, 6743 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts)); 6744 6745 // Each level represents one iteration of the outer for loop of the Java version 6746 6747 // level 0 6748 // At level 0 we need to interleave adjacent quartets of 6749 // coefficients before we multiply and add/sub by the next 16 6750 // zetas just as we did for level 7 in the multiply code. So we 6751 // load and store the values using an ld2/st2 with arrangement 4S. 6752 for (int i = 0; i < 1024; i += 128) { 6753 // load constants q, qinv 6754 // n.b. this can be moved out of the loop as they do not get 6755 // clobbered by first two loops 6756 vs_ldpq(vq, dilithiumConsts); // qInv, q 6757 // a0/a1 load interleaved 32 (8x4S) coefficients 6758 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets); 6759 // b load next 32 (8x4S) inputs 6760 vs_ldpq_post(vs_front(vs2), zetas); 6761 // compute in parallel (a0, a1) = (a0 + a1, (a0 - a1) montmul b) 6762 // n.b. second half of vs2 provides temporary register storage 6763 dilithium_sub_add_montmul16(vs_even(vs1), vs_odd(vs1), 6764 vs_front(vs2), vs_back(vs2), vtmp, vq); 6765 // a0/a1 store interleaved 32 (8x4S) coefficients 6766 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets); 6767 } 6768 6769 // level 1 6770 // At level 1 we need to interleave pairs of adjacent pairs of 6771 // coefficients before we multiply by the next 16 zetas just as we 6772 // did for level 6 in the multiply code. So we load and store the 6773 // values an ld2/st2 with arrangement 2D. 
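    // Editor's note: per lane, the dilithium_sub_add_montmul16 pattern used in
    // the level 0 loop above and the level 1 loop below is the inverse-NTT
    // butterfly sketched here in scalar form (montmul again stands for
    // a * b * 2^-32 mod q). Illustration only; not used by the generated stub.
    auto inv_butterfly_ref = [](int32_t& c_j, int32_t& c_jl, int32_t zeta,
                                int32_t (*montmul)(int32_t, int32_t)) {
      int32_t diff = c_j - c_jl;        // a1 = c[j] - c[j+l]
      c_j  = c_j + c_jl;                // new c[j]   = c[j] + c[j+l]
      c_jl = montmul(zeta, diff);       // new c[j+l] = zeta * diff * 2^-32 (mod q)
    };
    (void)inv_butterfly_ref; // unused; reference sketch only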
6774 for (int i = 0; i < 1024; i += 128) { 6775 // a0/a1 load interleaved 32 (8x2D) coefficients 6776 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets); 6777 // b load next 16 (4x4S) inputs 6778 vs_ldpq_post(vs_front(vs2), zetas); 6779 // compute in parallel (a0, a1) = (a0 + a1, (a0 - a1) montmul b) 6780 // n.b. second half of vs2 provides temporary register storage 6781 dilithium_sub_add_montmul16(vs_even(vs1), vs_odd(vs1), 6782 vs_front(vs2), vs_back(vs2), vtmp, vq); 6783 // a0/a1 store interleaved 32 (8x2D) coefficients 6784 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets); 6785 } 6786 6787 // level 2 6788 // At level 2 coefficients come in blocks of 4. So, we load 4 6789 // adjacent coefficients at 8 distinct offsets for both the first 6790 // and second coefficient sequences, using an ldr with register 6791 // variant Q then combine them with next set of 32 zetas. Likewise 6792 // we store the results using an str with register variant Q. 6793 for (int i = 0; i < 1024; i += 256) { 6794 // c0 load 32 (8x4S) coefficients via first offsets 6795 vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets1); 6796 // c1 load 32 (8x4S) coefficients via second offsets 6797 vs_ldr_indexed(vs2, __ Q,coeffs, i, offsets2); 6798 // a0 = c0 + c1 n.b. clobbers vq which overlaps vs3 6799 vs_addv(vs3, __ T4S, vs1, vs2); 6800 // c = c0 - c1 6801 vs_subv(vs1, __ T4S, vs1, vs2); 6802 // store a0 32 (8x4S) coefficients via first offsets 6803 vs_str_indexed(vs3, __ Q, coeffs, i, offsets1); 6804 // b load 32 (8x4S) next inputs 6805 vs_ldpq_post(vs2, zetas); 6806 // reload constants q, qinv -- they were clobbered earlier 6807 vs_ldpq(vq, dilithiumConsts); // qInv, q 6808 // compute a1 = b montmul c 6809 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq); 6810 // store a1 32 (8x4S) coefficients via second offsets 6811 vs_str_indexed(vs2, __ Q, coeffs, i, offsets2); 6812 } 6813 6814 // level 3-7 6815 dilithiumInverseNttLevel3_7(dilithiumConsts, coeffs, zetas); 6816 6817 __ leave(); // required for proper stackwalking of RuntimeStub frame 6818 __ mov(r0, zr); // return 0 6819 __ ret(lr); 6820 6821 return start; 6822 } 6823 6824 // Dilithium multiply polynomials in the NTT domain. 6825 // Straightforward implementation of the method 6826 // static int implDilithiumNttMult( 6827 // int[] result, int[] ntta, int[] nttb {} of 6828 // the sun.security.provider.ML_DSA class. 6829 // 6830 // result (int[256]) = c_rarg0 6831 // poly1 (int[256]) = c_rarg1 6832 // poly2 (int[256]) = c_rarg2 6833 address generate_dilithiumNttMult() { 6834 6835 __ align(CodeEntryAlignment); 6836 StubId stub_id = StubId::stubgen_dilithiumNttMult_id; 6837 StubCodeMark mark(this, stub_id); 6838 address start = __ pc(); 6839 __ enter(); 6840 6841 Label L_loop; 6842 6843 const Register result = c_rarg0; 6844 const Register poly1 = c_rarg1; 6845 const Register poly2 = c_rarg2; 6846 6847 const Register dilithiumConsts = r10; 6848 const Register len = r11; 6849 6850 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs 6851 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3 6852 VSeq<2> vq(30); // n.b. 
constants overlap vs3 6853 VSeq<8> vrsquare(29, 0); // for montmul by constant RSQUARE 6854 6855 __ lea(dilithiumConsts, 6856 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts)); 6857 6858 // load constants q, qinv 6859 vs_ldpq(vq, dilithiumConsts); // qInv, q 6860 // load constant rSquare into v29 6861 __ ldr(v29, __ Q, Address(dilithiumConsts, 48)); // rSquare 6862 6863 __ mov(len, zr); 6864 __ add(len, len, 1024); 6865 6866 __ BIND(L_loop); 6867 6868 // b load 32 (8x4S) next inputs from poly1 6869 vs_ldpq_post(vs1, poly1); 6870 // c load 32 (8x4S) next inputs from poly2 6871 vs_ldpq_post(vs2, poly2); 6872 // compute a = b montmul c 6873 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq); 6874 // compute a = rsquare montmul a 6875 dilithium_montmul32(vs2, vrsquare, vs2, vtmp, vq); 6876 // save a 32 (8x4S) results 6877 vs_stpq_post(vs2, result); 6878 6879 __ sub(len, len, 128); 6880 __ cmp(len, (u1)128); 6881 __ br(Assembler::GE, L_loop); 6882 6883 __ leave(); // required for proper stackwalking of RuntimeStub frame 6884 __ mov(r0, zr); // return 0 6885 __ ret(lr); 6886 6887 return start; 6888 } 6889 6890 // Dilithium Motgomery multiply an array by a constant. 6891 // A straightforward implementation of the method 6892 // static int implDilithiumMontMulByConstant(int[] coeffs, int constant) {} 6893 // of the sun.security.provider.MLDSA class 6894 // 6895 // coeffs (int[256]) = c_rarg0 6896 // constant (int) = c_rarg1 6897 address generate_dilithiumMontMulByConstant() { 6898 6899 __ align(CodeEntryAlignment); 6900 StubId stub_id = StubId::stubgen_dilithiumMontMulByConstant_id; 6901 StubCodeMark mark(this, stub_id); 6902 address start = __ pc(); 6903 __ enter(); 6904 6905 Label L_loop; 6906 6907 const Register coeffs = c_rarg0; 6908 const Register constant = c_rarg1; 6909 6910 const Register dilithiumConsts = r10; 6911 const Register result = r11; 6912 const Register len = r12; 6913 6914 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs 6915 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3 6916 VSeq<2> vq(30); // n.b. constants overlap vs3 6917 VSeq<8> vconst(29, 0); // for montmul by constant 6918 6919 // results track inputs 6920 __ add(result, coeffs, 0); 6921 __ lea(dilithiumConsts, 6922 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts)); 6923 6924 // load constants q, qinv -- they do not get clobbered by first two loops 6925 vs_ldpq(vq, dilithiumConsts); // qInv, q 6926 // copy caller supplied constant across vconst 6927 __ dup(vconst[0], __ T4S, constant); 6928 __ mov(len, zr); 6929 __ add(len, len, 1024); 6930 6931 __ BIND(L_loop); 6932 6933 // load next 32 inputs 6934 vs_ldpq_post(vs2, coeffs); 6935 // mont mul by constant 6936 dilithium_montmul32(vs2, vconst, vs2, vtmp, vq); 6937 // write next 32 results 6938 vs_stpq_post(vs2, result); 6939 6940 __ sub(len, len, 128); 6941 __ cmp(len, (u1)128); 6942 __ br(Assembler::GE, L_loop); 6943 6944 __ leave(); // required for proper stackwalking of RuntimeStub frame 6945 __ mov(r0, zr); // return 0 6946 __ ret(lr); 6947 6948 return start; 6949 } 6950 6951 // Dilithium decompose poly. 
6952 // Implements the method 6953 // static int implDilithiumDecomposePoly(int[] coeffs, int constant) {} 6954 // of the sun.security.provider.ML_DSA class 6955 // 6956 // input (int[256]) = c_rarg0 6957 // lowPart (int[256]) = c_rarg1 6958 // highPart (int[256]) = c_rarg2 6959 // twoGamma2 (int) = c_rarg3 6960 // multiplier (int) = c_rarg4 6961 address generate_dilithiumDecomposePoly() { 6962 6963 __ align(CodeEntryAlignment); 6964 StubId stub_id = StubId::stubgen_dilithiumDecomposePoly_id; 6965 StubCodeMark mark(this, stub_id); 6966 address start = __ pc(); 6967 Label L_loop; 6968 6969 const Register input = c_rarg0; 6970 const Register lowPart = c_rarg1; 6971 const Register highPart = c_rarg2; 6972 const Register twoGamma2 = c_rarg3; 6973 const Register multiplier = c_rarg4; 6974 6975 const Register len = r9; 6976 const Register dilithiumConsts = r10; 6977 const Register tmp = r11; 6978 6979 // 6 independent sets of 4x4s values 6980 VSeq<4> vs1(0), vs2(4), vs3(8); 6981 VSeq<4> vs4(12), vs5(16), vtmp(20); 6982 6983 // 7 constants for cross-multiplying 6984 VSeq<4> one(25, 0); 6985 VSeq<4> qminus1(26, 0); 6986 VSeq<4> g2(27, 0); 6987 VSeq<4> twog2(28, 0); 6988 VSeq<4> mult(29, 0); 6989 VSeq<4> q(30, 0); 6990 VSeq<4> qadd(31, 0); 6991 6992 __ enter(); 6993 6994 __ lea(dilithiumConsts, 6995 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts)); 6996 6997 // save callee-saved registers 6998 __ stpd(v8, v9, __ pre(sp, -64)); 6999 __ stpd(v10, v11, Address(sp, 16)); 7000 __ stpd(v12, v13, Address(sp, 32)); 7001 __ stpd(v14, v15, Address(sp, 48)); 7002 7003 // populate constant registers 7004 __ mov(tmp, zr); 7005 __ add(tmp, tmp, 1); 7006 __ dup(one[0], __ T4S, tmp); // 1 7007 __ ldr(q[0], __ Q, Address(dilithiumConsts, 16)); // q 7008 __ ldr(qadd[0], __ Q, Address(dilithiumConsts, 64)); // addend for mod q reduce 7009 __ dup(twog2[0], __ T4S, twoGamma2); // 2 * gamma2 7010 __ dup(mult[0], __ T4S, multiplier); // multiplier for mod 2 * gamma reduce 7011 __ subv(qminus1[0], __ T4S, v30, v25); // q - 1 7012 __ sshr(g2[0], __ T4S, v28, 1); // gamma2 7013 7014 __ mov(len, zr); 7015 __ add(len, len, 1024); 7016 7017 __ BIND(L_loop); 7018 7019 // load next 4x4S inputs interleaved: rplus --> vs1 7020 __ ld4(vs1[0], vs1[1], vs1[2], vs1[3], __ T4S, __ post(input, 64)); 7021 7022 // rplus = rplus - ((rplus + qadd) >> 23) * q 7023 vs_addv(vtmp, __ T4S, vs1, qadd); 7024 vs_sshr(vtmp, __ T4S, vtmp, 23); 7025 vs_mulv(vtmp, __ T4S, vtmp, q); 7026 vs_subv(vs1, __ T4S, vs1, vtmp); 7027 7028 // rplus = rplus + ((rplus >> 31) & dilithium_q); 7029 vs_sshr(vtmp, __ T4S, vs1, 31); 7030 vs_andr(vtmp, vtmp, q); 7031 vs_addv(vs1, __ T4S, vs1, vtmp); 7032 7033 // quotient --> vs2 7034 // int quotient = (rplus * multiplier) >> 22; 7035 vs_mulv(vtmp, __ T4S, vs1, mult); 7036 vs_sshr(vs2, __ T4S, vtmp, 22); 7037 7038 // r0 --> vs3 7039 // int r0 = rplus - quotient * twoGamma2; 7040 vs_mulv(vtmp, __ T4S, vs2, twog2); 7041 vs_subv(vs3, __ T4S, vs1, vtmp); 7042 7043 // mask --> vs4 7044 // int mask = (twoGamma2 - r0) >> 22; 7045 vs_subv(vtmp, __ T4S, twog2, vs3); 7046 vs_sshr(vs4, __ T4S, vtmp, 22); 7047 7048 // r0 -= (mask & twoGamma2); 7049 vs_andr(vtmp, vs4, twog2); 7050 vs_subv(vs3, __ T4S, vs3, vtmp); 7051 7052 // quotient += (mask & 1); 7053 vs_andr(vtmp, vs4, one); 7054 vs_addv(vs2, __ T4S, vs2, vtmp); 7055 7056 // mask = (twoGamma2 / 2 - r0) >> 31; 7057 vs_subv(vtmp, __ T4S, g2, vs3); 7058 vs_sshr(vs4, __ T4S, vtmp, 31); 7059 7060 // r0 -= (mask & twoGamma2); 7061 vs_andr(vtmp, vs4, twog2); 7062 
vs_subv(vs3, __ T4S, vs3, vtmp); 7063 7064 // quotient += (mask & 1); 7065 vs_andr(vtmp, vs4, one); 7066 vs_addv(vs2, __ T4S, vs2, vtmp); 7067 7068 // r1 --> vs5 7069 // int r1 = rplus - r0 - (dilithium_q - 1); 7070 vs_subv(vtmp, __ T4S, vs1, vs3); 7071 vs_subv(vs5, __ T4S, vtmp, qminus1); 7072 7073 // r1 --> vs1 (overwriting rplus) 7074 // r1 = (r1 | (-r1)) >> 31; // 0 if rplus - r0 == (dilithium_q - 1), -1 otherwise 7075 vs_negr(vtmp, __ T4S, vs5); 7076 vs_orr(vtmp, vs5, vtmp); 7077 vs_sshr(vs1, __ T4S, vtmp, 31); 7078 7079 // r0 += ~r1; 7080 vs_notr(vtmp, vs1); 7081 vs_addv(vs3, __ T4S, vs3, vtmp); 7082 7083 // r1 = r1 & quotient; 7084 vs_andr(vs1, vs2, vs1); 7085 7086 // store results inteleaved 7087 // lowPart[m] = r0; 7088 // highPart[m] = r1; 7089 __ st4(vs3[0], vs3[1], vs3[2], vs3[3], __ T4S, __ post(lowPart, 64)); 7090 __ st4(vs1[0], vs1[1], vs1[2], vs1[3], __ T4S, __ post(highPart, 64)); 7091 7092 __ sub(len, len, 64); 7093 __ cmp(len, (u1)64); 7094 __ br(Assembler::GE, L_loop); 7095 7096 // restore callee-saved vector registers 7097 __ ldpd(v14, v15, Address(sp, 48)); 7098 __ ldpd(v12, v13, Address(sp, 32)); 7099 __ ldpd(v10, v11, Address(sp, 16)); 7100 __ ldpd(v8, v9, __ post(sp, 64)); 7101 7102 __ leave(); // required for proper stackwalking of RuntimeStub frame 7103 __ mov(r0, zr); // return 0 7104 __ ret(lr); 7105 7106 return start; 7107 } 7108 7109 void bcax5(Register a0, Register a1, Register a2, Register a3, Register a4, 7110 Register tmp0, Register tmp1, Register tmp2) { 7111 __ bic(tmp0, a2, a1); // for a0 7112 __ bic(tmp1, a3, a2); // for a1 7113 __ bic(tmp2, a4, a3); // for a2 7114 __ eor(a2, a2, tmp2); 7115 __ bic(tmp2, a0, a4); // for a3 7116 __ eor(a3, a3, tmp2); 7117 __ bic(tmp2, a1, a0); // for a4 7118 __ eor(a0, a0, tmp0); 7119 __ eor(a1, a1, tmp1); 7120 __ eor(a4, a4, tmp2); 7121 } 7122 7123 void keccak_round_gpr(bool can_use_fp, bool can_use_r18, Register rc, 7124 Register a0, Register a1, Register a2, Register a3, Register a4, 7125 Register a5, Register a6, Register a7, Register a8, Register a9, 7126 Register a10, Register a11, Register a12, Register a13, Register a14, 7127 Register a15, Register a16, Register a17, Register a18, Register a19, 7128 Register a20, Register a21, Register a22, Register a23, Register a24, 7129 Register tmp0, Register tmp1, Register tmp2) { 7130 __ eor3(tmp1, a4, a9, a14); 7131 __ eor3(tmp0, tmp1, a19, a24); // tmp0 = a4^a9^a14^a19^a24 = c4 7132 __ eor3(tmp2, a1, a6, a11); 7133 __ eor3(tmp1, tmp2, a16, a21); // tmp1 = a1^a6^a11^a16^a21 = c1 7134 __ rax1(tmp2, tmp0, tmp1); // d0 7135 { 7136 7137 Register tmp3, tmp4; 7138 if (can_use_fp && can_use_r18) { 7139 tmp3 = rfp; 7140 tmp4 = r18_tls; 7141 } else { 7142 tmp3 = a4; 7143 tmp4 = a9; 7144 __ stp(tmp3, tmp4, __ pre(sp, -16)); 7145 } 7146 7147 __ eor3(tmp3, a0, a5, a10); 7148 __ eor3(tmp4, tmp3, a15, a20); // tmp4 = a0^a5^a10^a15^a20 = c0 7149 __ eor(a0, a0, tmp2); 7150 __ eor(a5, a5, tmp2); 7151 __ eor(a10, a10, tmp2); 7152 __ eor(a15, a15, tmp2); 7153 __ eor(a20, a20, tmp2); // d0(tmp2) 7154 __ eor3(tmp3, a2, a7, a12); 7155 __ eor3(tmp2, tmp3, a17, a22); // tmp2 = a2^a7^a12^a17^a22 = c2 7156 __ rax1(tmp3, tmp4, tmp2); // d1 7157 __ eor(a1, a1, tmp3); 7158 __ eor(a6, a6, tmp3); 7159 __ eor(a11, a11, tmp3); 7160 __ eor(a16, a16, tmp3); 7161 __ eor(a21, a21, tmp3); // d1(tmp3) 7162 __ rax1(tmp3, tmp2, tmp0); // d3 7163 __ eor3(tmp2, a3, a8, a13); 7164 __ eor3(tmp0, tmp2, a18, a23); // tmp0 = a3^a8^a13^a18^a23 = c3 7165 __ eor(a3, a3, tmp3); 7166 __ eor(a8, a8, tmp3); 7167 __ eor(a13, 
a13, tmp3); 7168 __ eor(a18, a18, tmp3); 7169 __ eor(a23, a23, tmp3); 7170 __ rax1(tmp2, tmp1, tmp0); // d2 7171 __ eor(a2, a2, tmp2); 7172 __ eor(a7, a7, tmp2); 7173 __ eor(a12, a12, tmp2); 7174 __ rax1(tmp0, tmp0, tmp4); // d4 7175 if (!can_use_fp || !can_use_r18) { 7176 __ ldp(tmp3, tmp4, __ post(sp, 16)); 7177 } 7178 __ eor(a17, a17, tmp2); 7179 __ eor(a22, a22, tmp2); 7180 __ eor(a4, a4, tmp0); 7181 __ eor(a9, a9, tmp0); 7182 __ eor(a14, a14, tmp0); 7183 __ eor(a19, a19, tmp0); 7184 __ eor(a24, a24, tmp0); 7185 } 7186 7187 __ rol(tmp0, a10, 3); 7188 __ rol(a10, a1, 1); 7189 __ rol(a1, a6, 44); 7190 __ rol(a6, a9, 20); 7191 __ rol(a9, a22, 61); 7192 __ rol(a22, a14, 39); 7193 __ rol(a14, a20, 18); 7194 __ rol(a20, a2, 62); 7195 __ rol(a2, a12, 43); 7196 __ rol(a12, a13, 25); 7197 __ rol(a13, a19, 8) ; 7198 __ rol(a19, a23, 56); 7199 __ rol(a23, a15, 41); 7200 __ rol(a15, a4, 27); 7201 __ rol(a4, a24, 14); 7202 __ rol(a24, a21, 2); 7203 __ rol(a21, a8, 55); 7204 __ rol(a8, a16, 45); 7205 __ rol(a16, a5, 36); 7206 __ rol(a5, a3, 28); 7207 __ rol(a3, a18, 21); 7208 __ rol(a18, a17, 15); 7209 __ rol(a17, a11, 10); 7210 __ rol(a11, a7, 6); 7211 __ mov(a7, tmp0); 7212 7213 bcax5(a0, a1, a2, a3, a4, tmp0, tmp1, tmp2); 7214 bcax5(a5, a6, a7, a8, a9, tmp0, tmp1, tmp2); 7215 bcax5(a10, a11, a12, a13, a14, tmp0, tmp1, tmp2); 7216 bcax5(a15, a16, a17, a18, a19, tmp0, tmp1, tmp2); 7217 bcax5(a20, a21, a22, a23, a24, tmp0, tmp1, tmp2); 7218 7219 __ ldr(tmp1, __ post(rc, 8)); 7220 __ eor(a0, a0, tmp1); 7221 7222 } 7223 7224 // Arguments: 7225 // 7226 // Inputs: 7227 // c_rarg0 - byte[] source+offset 7228 // c_rarg1 - byte[] SHA.state 7229 // c_rarg2 - int block_size 7230 // c_rarg3 - int offset 7231 // c_rarg4 - int limit 7232 // 7233 address generate_sha3_implCompress_gpr(StubId stub_id) { 7234 bool multi_block; 7235 switch (stub_id) { 7236 case StubId::stubgen_sha3_implCompress_id: 7237 multi_block = false; 7238 break; 7239 case StubId::stubgen_sha3_implCompressMB_id: 7240 multi_block = true; 7241 break; 7242 default: 7243 ShouldNotReachHere(); 7244 } 7245 7246 static const uint64_t round_consts[24] = { 7247 0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL, 7248 0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L, 7249 0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL, 7250 0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL, 7251 0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L, 7252 0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L, 7253 0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L, 7254 0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L 7255 }; 7256 7257 __ align(CodeEntryAlignment); 7258 StubCodeMark mark(this, stub_id); 7259 address start = __ pc(); 7260 7261 Register buf = c_rarg0; 7262 Register state = c_rarg1; 7263 Register block_size = c_rarg2; 7264 Register ofs = c_rarg3; 7265 Register limit = c_rarg4; 7266 7267 // use r3.r17,r19..r28 to keep a0..a24. 
7268 // a0..a24 are respective locals from SHA3.java 7269 Register a0 = r25, 7270 a1 = r26, 7271 a2 = r27, 7272 a3 = r3, 7273 a4 = r4, 7274 a5 = r5, 7275 a6 = r6, 7276 a7 = r7, 7277 a8 = rscratch1, // r8 7278 a9 = rscratch2, // r9 7279 a10 = r10, 7280 a11 = r11, 7281 a12 = r12, 7282 a13 = r13, 7283 a14 = r14, 7284 a15 = r15, 7285 a16 = r16, 7286 a17 = r17, 7287 a18 = r28, 7288 a19 = r19, 7289 a20 = r20, 7290 a21 = r21, 7291 a22 = r22, 7292 a23 = r23, 7293 a24 = r24; 7294 7295 Register tmp0 = block_size, tmp1 = buf, tmp2 = state, tmp3 = r30; 7296 7297 Label sha3_loop, rounds24_preloop, loop_body; 7298 Label sha3_512_or_sha3_384, shake128; 7299 7300 bool can_use_r18 = false; 7301 #ifndef R18_RESERVED 7302 can_use_r18 = true; 7303 #endif 7304 bool can_use_fp = !PreserveFramePointer; 7305 7306 __ enter(); 7307 7308 // save almost all yet unsaved gpr registers on stack 7309 __ str(block_size, __ pre(sp, -128)); 7310 if (multi_block) { 7311 __ stpw(ofs, limit, Address(sp, 8)); 7312 } 7313 // 8 bytes at sp+16 will be used to keep buf 7314 __ stp(r19, r20, Address(sp, 32)); 7315 __ stp(r21, r22, Address(sp, 48)); 7316 __ stp(r23, r24, Address(sp, 64)); 7317 __ stp(r25, r26, Address(sp, 80)); 7318 __ stp(r27, r28, Address(sp, 96)); 7319 if (can_use_r18 && can_use_fp) { 7320 __ stp(r18_tls, state, Address(sp, 112)); 7321 } else { 7322 __ str(state, Address(sp, 112)); 7323 } 7324 7325 // begin sha3 calculations: loading a0..a24 from state arrary 7326 __ ldp(a0, a1, state); 7327 __ ldp(a2, a3, Address(state, 16)); 7328 __ ldp(a4, a5, Address(state, 32)); 7329 __ ldp(a6, a7, Address(state, 48)); 7330 __ ldp(a8, a9, Address(state, 64)); 7331 __ ldp(a10, a11, Address(state, 80)); 7332 __ ldp(a12, a13, Address(state, 96)); 7333 __ ldp(a14, a15, Address(state, 112)); 7334 __ ldp(a16, a17, Address(state, 128)); 7335 __ ldp(a18, a19, Address(state, 144)); 7336 __ ldp(a20, a21, Address(state, 160)); 7337 __ ldp(a22, a23, Address(state, 176)); 7338 __ ldr(a24, Address(state, 192)); 7339 7340 __ BIND(sha3_loop); 7341 7342 // load input 7343 __ ldp(tmp3, tmp2, __ post(buf, 16)); 7344 __ eor(a0, a0, tmp3); 7345 __ eor(a1, a1, tmp2); 7346 __ ldp(tmp3, tmp2, __ post(buf, 16)); 7347 __ eor(a2, a2, tmp3); 7348 __ eor(a3, a3, tmp2); 7349 __ ldp(tmp3, tmp2, __ post(buf, 16)); 7350 __ eor(a4, a4, tmp3); 7351 __ eor(a5, a5, tmp2); 7352 __ ldr(tmp3, __ post(buf, 8)); 7353 __ eor(a6, a6, tmp3); 7354 7355 // block_size == 72, SHA3-512; block_size == 104, SHA3-384 7356 __ tbz(block_size, 7, sha3_512_or_sha3_384); 7357 7358 __ ldp(tmp3, tmp2, __ post(buf, 16)); 7359 __ eor(a7, a7, tmp3); 7360 __ eor(a8, a8, tmp2); 7361 __ ldp(tmp3, tmp2, __ post(buf, 16)); 7362 __ eor(a9, a9, tmp3); 7363 __ eor(a10, a10, tmp2); 7364 __ ldp(tmp3, tmp2, __ post(buf, 16)); 7365 __ eor(a11, a11, tmp3); 7366 __ eor(a12, a12, tmp2); 7367 __ ldp(tmp3, tmp2, __ post(buf, 16)); 7368 __ eor(a13, a13, tmp3); 7369 __ eor(a14, a14, tmp2); 7370 __ ldp(tmp3, tmp2, __ post(buf, 16)); 7371 __ eor(a15, a15, tmp3); 7372 __ eor(a16, a16, tmp2); 7373 7374 // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256 7375 __ andw(tmp2, block_size, 48); 7376 __ cbzw(tmp2, rounds24_preloop); 7377 __ tbnz(block_size, 5, shake128); 7378 // block_size == 144, bit5 == 0, SHA3-244 7379 __ ldr(tmp3, __ post(buf, 8)); 7380 __ eor(a17, a17, tmp3); 7381 __ b(rounds24_preloop); 7382 7383 __ BIND(shake128); 7384 __ ldp(tmp3, tmp2, __ post(buf, 16)); 7385 __ eor(a17, a17, tmp3); 7386 __ eor(a18, a18, tmp2); 7387 __ ldp(tmp3, tmp2, __ post(buf, 16)); 7388 __ eor(a19, a19, 
tmp3); 7389 __ eor(a20, a20, tmp2); 7390 __ b(rounds24_preloop); // block_size == 168, SHAKE128 7391 7392 __ BIND(sha3_512_or_sha3_384); 7393 __ ldp(tmp3, tmp2, __ post(buf, 16)); 7394 __ eor(a7, a7, tmp3); 7395 __ eor(a8, a8, tmp2); 7396 __ tbz(block_size, 5, rounds24_preloop); // SHA3-512 7397 7398 // SHA3-384 7399 __ ldp(tmp3, tmp2, __ post(buf, 16)); 7400 __ eor(a9, a9, tmp3); 7401 __ eor(a10, a10, tmp2); 7402 __ ldp(tmp3, tmp2, __ post(buf, 16)); 7403 __ eor(a11, a11, tmp3); 7404 __ eor(a12, a12, tmp2); 7405 7406 __ BIND(rounds24_preloop); 7407 __ fmovs(v0, 24.0); // float loop counter, 7408 __ fmovs(v1, 1.0); // exact representation 7409 7410 __ str(buf, Address(sp, 16)); 7411 __ lea(tmp3, ExternalAddress((address) round_consts)); 7412 7413 __ BIND(loop_body); 7414 keccak_round_gpr(can_use_fp, can_use_r18, tmp3, 7415 a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, 7416 a13, a14, a15, a16, a17, a18, a19, a20, a21, a22, a23, a24, 7417 tmp0, tmp1, tmp2); 7418 __ fsubs(v0, v0, v1); 7419 __ fcmps(v0, 0.0); 7420 __ br(__ NE, loop_body); 7421 7422 if (multi_block) { 7423 __ ldrw(block_size, sp); // block_size 7424 __ ldpw(tmp2, tmp1, Address(sp, 8)); // offset, limit 7425 __ addw(tmp2, tmp2, block_size); 7426 __ cmpw(tmp2, tmp1); 7427 __ strw(tmp2, Address(sp, 8)); // store offset in case we're jumping 7428 __ ldr(buf, Address(sp, 16)); // restore buf in case we're jumping 7429 __ br(Assembler::LE, sha3_loop); 7430 __ movw(c_rarg0, tmp2); // return offset 7431 } 7432 if (can_use_fp && can_use_r18) { 7433 __ ldp(r18_tls, state, Address(sp, 112)); 7434 } else { 7435 __ ldr(state, Address(sp, 112)); 7436 } 7437 // save calculated sha3 state 7438 __ stp(a0, a1, Address(state)); 7439 __ stp(a2, a3, Address(state, 16)); 7440 __ stp(a4, a5, Address(state, 32)); 7441 __ stp(a6, a7, Address(state, 48)); 7442 __ stp(a8, a9, Address(state, 64)); 7443 __ stp(a10, a11, Address(state, 80)); 7444 __ stp(a12, a13, Address(state, 96)); 7445 __ stp(a14, a15, Address(state, 112)); 7446 __ stp(a16, a17, Address(state, 128)); 7447 __ stp(a18, a19, Address(state, 144)); 7448 __ stp(a20, a21, Address(state, 160)); 7449 __ stp(a22, a23, Address(state, 176)); 7450 __ str(a24, Address(state, 192)); 7451 7452 // restore required registers from stack 7453 __ ldp(r19, r20, Address(sp, 32)); 7454 __ ldp(r21, r22, Address(sp, 48)); 7455 __ ldp(r23, r24, Address(sp, 64)); 7456 __ ldp(r25, r26, Address(sp, 80)); 7457 __ ldp(r27, r28, Address(sp, 96)); 7458 if (can_use_fp && can_use_r18) { 7459 __ add(rfp, sp, 128); // leave() will copy rfp to sp below 7460 } // else no need to recalculate rfp, since it wasn't changed 7461 7462 __ leave(); 7463 7464 __ ret(lr); 7465 7466 return start; 7467 } 7468 7469 /** 7470 * Arguments: 7471 * 7472 * Inputs: 7473 * c_rarg0 - int crc 7474 * c_rarg1 - byte* buf 7475 * c_rarg2 - int length 7476 * 7477 * Output: 7478 * rax - int crc result 7479 */ 7480 address generate_updateBytesCRC32() { 7481 assert(UseCRC32Intrinsics, "what are we doing here?"); 7482 7483 __ align(CodeEntryAlignment); 7484 StubId stub_id = StubId::stubgen_updateBytesCRC32_id; 7485 StubCodeMark mark(this, stub_id); 7486 7487 address start = __ pc(); 7488 7489 const Register crc = c_rarg0; // crc 7490 const Register buf = c_rarg1; // source java byte array address 7491 const Register len = c_rarg2; // length 7492 const Register table0 = c_rarg3; // crc_table address 7493 const Register table1 = c_rarg4; 7494 const Register table2 = c_rarg5; 7495 const Register table3 = c_rarg6; 7496 const Register tmp3 = c_rarg7; 
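    // Editor's note: kernel_crc32 below computes the standard (zlib-compatible)
    // CRC-32 used by java.util.zip.CRC32, i.e. the reflected polynomial
    // 0xEDB88320 with pre- and post-inversion. A minimal bitwise reference of
    // that checksum, for illustration only (the generated stub does not call it):
    auto crc32_ref = [](uint32_t crc, const uint8_t* buf, size_t len) -> uint32_t {
      crc = ~crc;                                            // pre-invert
      for (size_t i = 0; i < len; i++) {
        crc ^= buf[i];
        for (int bit = 0; bit < 8; bit++) {
          crc = (crc >> 1) ^ (0xEDB88320u & (0u - (crc & 1u)));
        }
      }
      return ~crc;                                           // post-invert
    };
    (void)crc32_ref; // unused; reference sketch only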
7497 7498 BLOCK_COMMENT("Entry:"); 7499 __ enter(); // required for proper stackwalking of RuntimeStub frame 7500 7501 __ kernel_crc32(crc, buf, len, 7502 table0, table1, table2, table3, rscratch1, rscratch2, tmp3); 7503 7504 __ leave(); // required for proper stackwalking of RuntimeStub frame 7505 __ ret(lr); 7506 7507 return start; 7508 } 7509 7510 /** 7511 * Arguments: 7512 * 7513 * Inputs: 7514 * c_rarg0 - int crc 7515 * c_rarg1 - byte* buf 7516 * c_rarg2 - int length 7517 * c_rarg3 - int* table 7518 * 7519 * Output: 7520 * r0 - int crc result 7521 */ 7522 address generate_updateBytesCRC32C() { 7523 assert(UseCRC32CIntrinsics, "what are we doing here?"); 7524 7525 __ align(CodeEntryAlignment); 7526 StubId stub_id = StubId::stubgen_updateBytesCRC32C_id; 7527 StubCodeMark mark(this, stub_id); 7528 7529 address start = __ pc(); 7530 7531 const Register crc = c_rarg0; // crc 7532 const Register buf = c_rarg1; // source java byte array address 7533 const Register len = c_rarg2; // length 7534 const Register table0 = c_rarg3; // crc_table address 7535 const Register table1 = c_rarg4; 7536 const Register table2 = c_rarg5; 7537 const Register table3 = c_rarg6; 7538 const Register tmp3 = c_rarg7; 7539 7540 BLOCK_COMMENT("Entry:"); 7541 __ enter(); // required for proper stackwalking of RuntimeStub frame 7542 7543 __ kernel_crc32c(crc, buf, len, 7544 table0, table1, table2, table3, rscratch1, rscratch2, tmp3); 7545 7546 __ leave(); // required for proper stackwalking of RuntimeStub frame 7547 __ ret(lr); 7548 7549 return start; 7550 } 7551 7552 /*** 7553 * Arguments: 7554 * 7555 * Inputs: 7556 * c_rarg0 - int adler 7557 * c_rarg1 - byte* buff 7558 * c_rarg2 - int len 7559 * 7560 * Output: 7561 * c_rarg0 - int adler result 7562 */ 7563 address generate_updateBytesAdler32() { 7564 __ align(CodeEntryAlignment); 7565 StubId stub_id = StubId::stubgen_updateBytesAdler32_id; 7566 StubCodeMark mark(this, stub_id); 7567 address start = __ pc(); 7568 7569 Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1; 7570 7571 // Aliases 7572 Register adler = c_rarg0; 7573 Register s1 = c_rarg0; 7574 Register s2 = c_rarg3; 7575 Register buff = c_rarg1; 7576 Register len = c_rarg2; 7577 Register nmax = r4; 7578 Register base = r5; 7579 Register count = r6; 7580 Register temp0 = rscratch1; 7581 Register temp1 = rscratch2; 7582 FloatRegister vbytes = v0; 7583 FloatRegister vs1acc = v1; 7584 FloatRegister vs2acc = v2; 7585 FloatRegister vtable = v3; 7586 7587 // Max number of bytes we can process before having to take the mod 7588 // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 7589 uint64_t BASE = 0xfff1; 7590 uint64_t NMAX = 0x15B0; 7591 7592 __ mov(base, BASE); 7593 __ mov(nmax, NMAX); 7594 7595 // Load accumulation coefficients for the upper 16 bits 7596 __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table)); 7597 __ ld1(vtable, __ T16B, Address(temp0)); 7598 7599 // s1 is initialized to the lower 16 bits of adler 7600 // s2 is initialized to the upper 16 bits of adler 7601 __ ubfx(s2, adler, 16, 16); // s2 = ((adler >> 16) & 0xffff) 7602 __ uxth(s1, adler); // s1 = (adler & 0xffff) 7603 7604 // The pipelined loop needs at least 16 elements for 1 iteration 7605 // It does check this, but it is more effective to skip to the cleanup loop 7606 __ cmp(len, (u1)16); 7607 __ br(Assembler::HS, L_nmax); 7608 __ cbz(len, L_combine); 7609 7610 __ bind(L_simple_by1_loop); 7611 __ ldrb(temp0, Address(__ 
post(buff, 1))); 7612 __ add(s1, s1, temp0); 7613 __ add(s2, s2, s1); 7614 __ subs(len, len, 1); 7615 __ br(Assembler::HI, L_simple_by1_loop); 7616 7617 // s1 = s1 % BASE 7618 __ subs(temp0, s1, base); 7619 __ csel(s1, temp0, s1, Assembler::HS); 7620 7621 // s2 = s2 % BASE 7622 __ lsr(temp0, s2, 16); 7623 __ lsl(temp1, temp0, 4); 7624 __ sub(temp1, temp1, temp0); 7625 __ add(s2, temp1, s2, ext::uxth); 7626 7627 __ subs(temp0, s2, base); 7628 __ csel(s2, temp0, s2, Assembler::HS); 7629 7630 __ b(L_combine); 7631 7632 __ bind(L_nmax); 7633 __ subs(len, len, nmax); 7634 __ sub(count, nmax, 16); 7635 __ br(Assembler::LO, L_by16); 7636 7637 __ bind(L_nmax_loop); 7638 7639 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1, 7640 vbytes, vs1acc, vs2acc, vtable); 7641 7642 __ subs(count, count, 16); 7643 __ br(Assembler::HS, L_nmax_loop); 7644 7645 // s1 = s1 % BASE 7646 __ lsr(temp0, s1, 16); 7647 __ lsl(temp1, temp0, 4); 7648 __ sub(temp1, temp1, temp0); 7649 __ add(temp1, temp1, s1, ext::uxth); 7650 7651 __ lsr(temp0, temp1, 16); 7652 __ lsl(s1, temp0, 4); 7653 __ sub(s1, s1, temp0); 7654 __ add(s1, s1, temp1, ext:: uxth); 7655 7656 __ subs(temp0, s1, base); 7657 __ csel(s1, temp0, s1, Assembler::HS); 7658 7659 // s2 = s2 % BASE 7660 __ lsr(temp0, s2, 16); 7661 __ lsl(temp1, temp0, 4); 7662 __ sub(temp1, temp1, temp0); 7663 __ add(temp1, temp1, s2, ext::uxth); 7664 7665 __ lsr(temp0, temp1, 16); 7666 __ lsl(s2, temp0, 4); 7667 __ sub(s2, s2, temp0); 7668 __ add(s2, s2, temp1, ext:: uxth); 7669 7670 __ subs(temp0, s2, base); 7671 __ csel(s2, temp0, s2, Assembler::HS); 7672 7673 __ subs(len, len, nmax); 7674 __ sub(count, nmax, 16); 7675 __ br(Assembler::HS, L_nmax_loop); 7676 7677 __ bind(L_by16); 7678 __ adds(len, len, count); 7679 __ br(Assembler::LO, L_by1); 7680 7681 __ bind(L_by16_loop); 7682 7683 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1, 7684 vbytes, vs1acc, vs2acc, vtable); 7685 7686 __ subs(len, len, 16); 7687 __ br(Assembler::HS, L_by16_loop); 7688 7689 __ bind(L_by1); 7690 __ adds(len, len, 15); 7691 __ br(Assembler::LO, L_do_mod); 7692 7693 __ bind(L_by1_loop); 7694 __ ldrb(temp0, Address(__ post(buff, 1))); 7695 __ add(s1, temp0, s1); 7696 __ add(s2, s2, s1); 7697 __ subs(len, len, 1); 7698 __ br(Assembler::HS, L_by1_loop); 7699 7700 __ bind(L_do_mod); 7701 // s1 = s1 % BASE 7702 __ lsr(temp0, s1, 16); 7703 __ lsl(temp1, temp0, 4); 7704 __ sub(temp1, temp1, temp0); 7705 __ add(temp1, temp1, s1, ext::uxth); 7706 7707 __ lsr(temp0, temp1, 16); 7708 __ lsl(s1, temp0, 4); 7709 __ sub(s1, s1, temp0); 7710 __ add(s1, s1, temp1, ext:: uxth); 7711 7712 __ subs(temp0, s1, base); 7713 __ csel(s1, temp0, s1, Assembler::HS); 7714 7715 // s2 = s2 % BASE 7716 __ lsr(temp0, s2, 16); 7717 __ lsl(temp1, temp0, 4); 7718 __ sub(temp1, temp1, temp0); 7719 __ add(temp1, temp1, s2, ext::uxth); 7720 7721 __ lsr(temp0, temp1, 16); 7722 __ lsl(s2, temp0, 4); 7723 __ sub(s2, s2, temp0); 7724 __ add(s2, s2, temp1, ext:: uxth); 7725 7726 __ subs(temp0, s2, base); 7727 __ csel(s2, temp0, s2, Assembler::HS); 7728 7729 // Combine lower bits and higher bits 7730 __ bind(L_combine); 7731 __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16) 7732 7733 __ ret(lr); 7734 7735 return start; 7736 } 7737 7738 void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff, 7739 Register temp0, Register temp1, FloatRegister vbytes, 7740 FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) { 7741 // Below is a vectorized implementation of updating s1 and 
s2 for 16 bytes. 7742 // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration. 7743 // In non-vectorized code, we update s1 and s2 as: 7744 // s1 <- s1 + b1 7745 // s2 <- s2 + s1 7746 // s1 <- s1 + b2 7747 // s2 <- s2 + b1 7748 // ... 7749 // s1 <- s1 + b16 7750 // s2 <- s2 + s1 7751 // Putting above assignments together, we have: 7752 // s1_new = s1 + b1 + b2 + ... + b16 7753 // s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16) 7754 // = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1) 7755 // = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1) 7756 __ ld1(vbytes, __ T16B, Address(__ post(buff, 16))); 7757 7758 // s2 = s2 + s1 * 16 7759 __ add(s2, s2, s1, Assembler::LSL, 4); 7760 7761 // vs1acc = b1 + b2 + b3 + ... + b16 7762 // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1) 7763 __ umullv(vs2acc, __ T8B, vtable, vbytes); 7764 __ umlalv(vs2acc, __ T16B, vtable, vbytes); 7765 __ uaddlv(vs1acc, __ T16B, vbytes); 7766 __ uaddlv(vs2acc, __ T8H, vs2acc); 7767 7768 // s1 = s1 + vs1acc, s2 = s2 + vs2acc 7769 __ fmovd(temp0, vs1acc); 7770 __ fmovd(temp1, vs2acc); 7771 __ add(s1, s1, temp0); 7772 __ add(s2, s2, temp1); 7773 } 7774 7775 /** 7776 * Arguments: 7777 * 7778 * Input: 7779 * c_rarg0 - x address 7780 * c_rarg1 - x length 7781 * c_rarg2 - y address 7782 * c_rarg3 - y length 7783 * c_rarg4 - z address 7784 */ 7785 address generate_multiplyToLen() { 7786 __ align(CodeEntryAlignment); 7787 StubId stub_id = StubId::stubgen_multiplyToLen_id; 7788 StubCodeMark mark(this, stub_id); 7789 7790 address start = __ pc(); 7791 const Register x = r0; 7792 const Register xlen = r1; 7793 const Register y = r2; 7794 const Register ylen = r3; 7795 const Register z = r4; 7796 7797 const Register tmp0 = r5; 7798 const Register tmp1 = r10; 7799 const Register tmp2 = r11; 7800 const Register tmp3 = r12; 7801 const Register tmp4 = r13; 7802 const Register tmp5 = r14; 7803 const Register tmp6 = r15; 7804 const Register tmp7 = r16; 7805 7806 BLOCK_COMMENT("Entry:"); 7807 __ enter(); // required for proper stackwalking of RuntimeStub frame 7808 __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 7809 __ leave(); // required for proper stackwalking of RuntimeStub frame 7810 __ ret(lr); 7811 7812 return start; 7813 } 7814 7815 address generate_squareToLen() { 7816 // squareToLen algorithm for sizes 1..127 described in java code works 7817 // faster than multiply_to_len on some CPUs and slower on others, but 7818 // multiply_to_len shows a bit better overall results 7819 __ align(CodeEntryAlignment); 7820 StubId stub_id = StubId::stubgen_squareToLen_id; 7821 StubCodeMark mark(this, stub_id); 7822 address start = __ pc(); 7823 7824 const Register x = r0; 7825 const Register xlen = r1; 7826 const Register z = r2; 7827 const Register y = r4; // == x 7828 const Register ylen = r5; // == xlen 7829 7830 const Register tmp0 = r3; 7831 const Register tmp1 = r10; 7832 const Register tmp2 = r11; 7833 const Register tmp3 = r12; 7834 const Register tmp4 = r13; 7835 const Register tmp5 = r14; 7836 const Register tmp6 = r15; 7837 const Register tmp7 = r16; 7838 7839 RegSet spilled_regs = RegSet::of(y, ylen); 7840 BLOCK_COMMENT("Entry:"); 7841 __ enter(); 7842 __ push(spilled_regs, sp); 7843 __ mov(y, x); 7844 __ mov(ylen, xlen); 7845 __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 7846 __ pop(spilled_regs, sp); 7847 __ leave(); 7848 __ ret(lr); 7849 return start; 7850 } 7851 7852 address 
generate_mulAdd() { 7853 __ align(CodeEntryAlignment); 7854 StubId stub_id = StubId::stubgen_mulAdd_id; 7855 StubCodeMark mark(this, stub_id); 7856 7857 address start = __ pc(); 7858 7859 const Register out = r0; 7860 const Register in = r1; 7861 const Register offset = r2; 7862 const Register len = r3; 7863 const Register k = r4; 7864 7865 BLOCK_COMMENT("Entry:"); 7866 __ enter(); 7867 __ mul_add(out, in, offset, len, k); 7868 __ leave(); 7869 __ ret(lr); 7870 7871 return start; 7872 } 7873 7874 // Arguments: 7875 // 7876 // Input: 7877 // c_rarg0 - newArr address 7878 // c_rarg1 - oldArr address 7879 // c_rarg2 - newIdx 7880 // c_rarg3 - shiftCount 7881 // c_rarg4 - numIter 7882 // 7883 address generate_bigIntegerRightShift() { 7884 __ align(CodeEntryAlignment); 7885 StubId stub_id = StubId::stubgen_bigIntegerRightShiftWorker_id; 7886 StubCodeMark mark(this, stub_id); 7887 address start = __ pc(); 7888 7889 Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit; 7890 7891 Register newArr = c_rarg0; 7892 Register oldArr = c_rarg1; 7893 Register newIdx = c_rarg2; 7894 Register shiftCount = c_rarg3; 7895 Register numIter = c_rarg4; 7896 Register idx = numIter; 7897 7898 Register newArrCur = rscratch1; 7899 Register shiftRevCount = rscratch2; 7900 Register oldArrCur = r13; 7901 Register oldArrNext = r14; 7902 7903 FloatRegister oldElem0 = v0; 7904 FloatRegister oldElem1 = v1; 7905 FloatRegister newElem = v2; 7906 FloatRegister shiftVCount = v3; 7907 FloatRegister shiftVRevCount = v4; 7908 7909 __ cbz(idx, Exit); 7910 7911 __ add(newArr, newArr, newIdx, Assembler::LSL, 2); 7912 7913 // left shift count 7914 __ movw(shiftRevCount, 32); 7915 __ subw(shiftRevCount, shiftRevCount, shiftCount); 7916 7917 // numIter too small to allow a 4-words SIMD loop, rolling back 7918 __ cmp(numIter, (u1)4); 7919 __ br(Assembler::LT, ShiftThree); 7920 7921 __ dup(shiftVCount, __ T4S, shiftCount); 7922 __ dup(shiftVRevCount, __ T4S, shiftRevCount); 7923 __ negr(shiftVCount, __ T4S, shiftVCount); 7924 7925 __ BIND(ShiftSIMDLoop); 7926 7927 // Calculate the load addresses 7928 __ sub(idx, idx, 4); 7929 __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2); 7930 __ add(newArrCur, newArr, idx, Assembler::LSL, 2); 7931 __ add(oldArrCur, oldArrNext, 4); 7932 7933 // Load 4 words and process 7934 __ ld1(oldElem0, __ T4S, Address(oldArrCur)); 7935 __ ld1(oldElem1, __ T4S, Address(oldArrNext)); 7936 __ ushl(oldElem0, __ T4S, oldElem0, shiftVCount); 7937 __ ushl(oldElem1, __ T4S, oldElem1, shiftVRevCount); 7938 __ orr(newElem, __ T16B, oldElem0, oldElem1); 7939 __ st1(newElem, __ T4S, Address(newArrCur)); 7940 7941 __ cmp(idx, (u1)4); 7942 __ br(Assembler::LT, ShiftTwoLoop); 7943 __ b(ShiftSIMDLoop); 7944 7945 __ BIND(ShiftTwoLoop); 7946 __ cbz(idx, Exit); 7947 __ cmp(idx, (u1)1); 7948 __ br(Assembler::EQ, ShiftOne); 7949 7950 // Calculate the load addresses 7951 __ sub(idx, idx, 2); 7952 __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2); 7953 __ add(newArrCur, newArr, idx, Assembler::LSL, 2); 7954 __ add(oldArrCur, oldArrNext, 4); 7955 7956 // Load 2 words and process 7957 __ ld1(oldElem0, __ T2S, Address(oldArrCur)); 7958 __ ld1(oldElem1, __ T2S, Address(oldArrNext)); 7959 __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount); 7960 __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount); 7961 __ orr(newElem, __ T8B, oldElem0, oldElem1); 7962 __ st1(newElem, __ T2S, Address(newArrCur)); 7963 __ b(ShiftTwoLoop); 7964 7965 __ BIND(ShiftThree); 7966 __ tbz(idx, 1, ShiftOne); 7967 __ tbz(idx, 0, ShiftTwo); 7968 __ 
ldrw(r10, Address(oldArr, 12)); 7969 __ ldrw(r11, Address(oldArr, 8)); 7970 __ lsrvw(r10, r10, shiftCount); 7971 __ lslvw(r11, r11, shiftRevCount); 7972 __ orrw(r12, r10, r11); 7973 __ strw(r12, Address(newArr, 8)); 7974 7975 __ BIND(ShiftTwo); 7976 __ ldrw(r10, Address(oldArr, 8)); 7977 __ ldrw(r11, Address(oldArr, 4)); 7978 __ lsrvw(r10, r10, shiftCount); 7979 __ lslvw(r11, r11, shiftRevCount); 7980 __ orrw(r12, r10, r11); 7981 __ strw(r12, Address(newArr, 4)); 7982 7983 __ BIND(ShiftOne); 7984 __ ldrw(r10, Address(oldArr, 4)); 7985 __ ldrw(r11, Address(oldArr)); 7986 __ lsrvw(r10, r10, shiftCount); 7987 __ lslvw(r11, r11, shiftRevCount); 7988 __ orrw(r12, r10, r11); 7989 __ strw(r12, Address(newArr)); 7990 7991 __ BIND(Exit); 7992 __ ret(lr); 7993 7994 return start; 7995 } 7996 7997 // Arguments: 7998 // 7999 // Input: 8000 // c_rarg0 - newArr address 8001 // c_rarg1 - oldArr address 8002 // c_rarg2 - newIdx 8003 // c_rarg3 - shiftCount 8004 // c_rarg4 - numIter 8005 // 8006 address generate_bigIntegerLeftShift() { 8007 __ align(CodeEntryAlignment); 8008 StubId stub_id = StubId::stubgen_bigIntegerLeftShiftWorker_id; 8009 StubCodeMark mark(this, stub_id); 8010 address start = __ pc(); 8011 8012 Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit; 8013 8014 Register newArr = c_rarg0; 8015 Register oldArr = c_rarg1; 8016 Register newIdx = c_rarg2; 8017 Register shiftCount = c_rarg3; 8018 Register numIter = c_rarg4; 8019 8020 Register shiftRevCount = rscratch1; 8021 Register oldArrNext = rscratch2; 8022 8023 FloatRegister oldElem0 = v0; 8024 FloatRegister oldElem1 = v1; 8025 FloatRegister newElem = v2; 8026 FloatRegister shiftVCount = v3; 8027 FloatRegister shiftVRevCount = v4; 8028 8029 __ cbz(numIter, Exit); 8030 8031 __ add(oldArrNext, oldArr, 4); 8032 __ add(newArr, newArr, newIdx, Assembler::LSL, 2); 8033 8034 // right shift count 8035 __ movw(shiftRevCount, 32); 8036 __ subw(shiftRevCount, shiftRevCount, shiftCount); 8037 8038 // numIter too small to allow a 4-words SIMD loop, rolling back 8039 __ cmp(numIter, (u1)4); 8040 __ br(Assembler::LT, ShiftThree); 8041 8042 __ dup(shiftVCount, __ T4S, shiftCount); 8043 __ dup(shiftVRevCount, __ T4S, shiftRevCount); 8044 __ negr(shiftVRevCount, __ T4S, shiftVRevCount); 8045 8046 __ BIND(ShiftSIMDLoop); 8047 8048 // load 4 words and process 8049 __ ld1(oldElem0, __ T4S, __ post(oldArr, 16)); 8050 __ ld1(oldElem1, __ T4S, __ post(oldArrNext, 16)); 8051 __ ushl(oldElem0, __ T4S, oldElem0, shiftVCount); 8052 __ ushl(oldElem1, __ T4S, oldElem1, shiftVRevCount); 8053 __ orr(newElem, __ T16B, oldElem0, oldElem1); 8054 __ st1(newElem, __ T4S, __ post(newArr, 16)); 8055 __ sub(numIter, numIter, 4); 8056 8057 __ cmp(numIter, (u1)4); 8058 __ br(Assembler::LT, ShiftTwoLoop); 8059 __ b(ShiftSIMDLoop); 8060 8061 __ BIND(ShiftTwoLoop); 8062 __ cbz(numIter, Exit); 8063 __ cmp(numIter, (u1)1); 8064 __ br(Assembler::EQ, ShiftOne); 8065 8066 // load 2 words and process 8067 __ ld1(oldElem0, __ T2S, __ post(oldArr, 8)); 8068 __ ld1(oldElem1, __ T2S, __ post(oldArrNext, 8)); 8069 __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount); 8070 __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount); 8071 __ orr(newElem, __ T8B, oldElem0, oldElem1); 8072 __ st1(newElem, __ T2S, __ post(newArr, 8)); 8073 __ sub(numIter, numIter, 2); 8074 __ b(ShiftTwoLoop); 8075 8076 __ BIND(ShiftThree); 8077 __ ldrw(r10, __ post(oldArr, 4)); 8078 __ ldrw(r11, __ post(oldArrNext, 4)); 8079 __ lslvw(r10, r10, shiftCount); 8080 __ lsrvw(r11, r11, shiftRevCount); 8081 __ 
orrw(r12, r10, r11); 8082 __ strw(r12, __ post(newArr, 4)); 8083 __ tbz(numIter, 1, Exit); 8084 __ tbz(numIter, 0, ShiftOne); 8085 8086 __ BIND(ShiftTwo); 8087 __ ldrw(r10, __ post(oldArr, 4)); 8088 __ ldrw(r11, __ post(oldArrNext, 4)); 8089 __ lslvw(r10, r10, shiftCount); 8090 __ lsrvw(r11, r11, shiftRevCount); 8091 __ orrw(r12, r10, r11); 8092 __ strw(r12, __ post(newArr, 4)); 8093 8094 __ BIND(ShiftOne); 8095 __ ldrw(r10, Address(oldArr)); 8096 __ ldrw(r11, Address(oldArrNext)); 8097 __ lslvw(r10, r10, shiftCount); 8098 __ lsrvw(r11, r11, shiftRevCount); 8099 __ orrw(r12, r10, r11); 8100 __ strw(r12, Address(newArr)); 8101 8102 __ BIND(Exit); 8103 __ ret(lr); 8104 8105 return start; 8106 } 8107 8108 address generate_count_positives(address &count_positives_long) { 8109 const u1 large_loop_size = 64; 8110 const uint64_t UPPER_BIT_MASK=0x8080808080808080; 8111 int dcache_line = VM_Version::dcache_line_size(); 8112 8113 Register ary1 = r1, len = r2, result = r0; 8114 8115 __ align(CodeEntryAlignment); 8116 8117 StubId stub_id = StubId::stubgen_count_positives_id; 8118 StubCodeMark mark(this, stub_id); 8119 8120 address entry = __ pc(); 8121 8122 __ enter(); 8123 // precondition: a copy of len is already in result 8124 // __ mov(result, len); 8125 8126 Label RET_ADJUST, RET_ADJUST_16, RET_ADJUST_LONG, RET_NO_POP, RET_LEN, ALIGNED, LOOP16, CHECK_16, 8127 LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL; 8128 8129 __ cmp(len, (u1)15); 8130 __ br(Assembler::GT, LEN_OVER_15); 8131 // The only case when execution falls into this code is when pointer is near 8132 // the end of memory page and we have to avoid reading next page 8133 __ add(ary1, ary1, len); 8134 __ subs(len, len, 8); 8135 __ br(Assembler::GT, LEN_OVER_8); 8136 __ ldr(rscratch2, Address(ary1, -8)); 8137 __ sub(rscratch1, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes. 
8138 __ lsrv(rscratch2, rscratch2, rscratch1); 8139 __ tst(rscratch2, UPPER_BIT_MASK); 8140 __ csel(result, zr, result, Assembler::NE); 8141 __ leave(); 8142 __ ret(lr); 8143 __ bind(LEN_OVER_8); 8144 __ ldp(rscratch1, rscratch2, Address(ary1, -16)); 8145 __ sub(len, len, 8); // no data dep., then sub can be executed while loading 8146 __ tst(rscratch2, UPPER_BIT_MASK); 8147 __ br(Assembler::NE, RET_NO_POP); 8148 __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes 8149 __ lsrv(rscratch1, rscratch1, rscratch2); 8150 __ tst(rscratch1, UPPER_BIT_MASK); 8151 __ bind(RET_NO_POP); 8152 __ csel(result, zr, result, Assembler::NE); 8153 __ leave(); 8154 __ ret(lr); 8155 8156 Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10; 8157 const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6; 8158 8159 count_positives_long = __ pc(); // 2nd entry point 8160 8161 __ enter(); 8162 8163 __ bind(LEN_OVER_15); 8164 __ push(spilled_regs, sp); 8165 __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment 8166 __ cbz(rscratch2, ALIGNED); 8167 __ ldp(tmp6, tmp1, Address(ary1)); 8168 __ mov(tmp5, 16); 8169 __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address 8170 __ add(ary1, ary1, rscratch1); 8171 __ orr(tmp6, tmp6, tmp1); 8172 __ tst(tmp6, UPPER_BIT_MASK); 8173 __ br(Assembler::NE, RET_ADJUST); 8174 __ sub(len, len, rscratch1); 8175 8176 __ bind(ALIGNED); 8177 __ cmp(len, large_loop_size); 8178 __ br(Assembler::LT, CHECK_16); 8179 // Perform 16-byte load as early return in pre-loop to handle situation 8180 // when initially aligned large array has negative values at starting bytes, 8181 // so LARGE_LOOP would do 4 reads instead of 1 (in worst case), which is 8182 // slower. Cases with negative bytes further ahead won't be affected that 8183 // much. In fact, it'll be faster due to early loads, less instructions and 8184 // less branches in LARGE_LOOP. 8185 __ ldp(tmp6, tmp1, Address(__ post(ary1, 16))); 8186 __ sub(len, len, 16); 8187 __ orr(tmp6, tmp6, tmp1); 8188 __ tst(tmp6, UPPER_BIT_MASK); 8189 __ br(Assembler::NE, RET_ADJUST_16); 8190 __ cmp(len, large_loop_size); 8191 __ br(Assembler::LT, CHECK_16); 8192 8193 if (SoftwarePrefetchHintDistance >= 0 8194 && SoftwarePrefetchHintDistance >= dcache_line) { 8195 // initial prefetch 8196 __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line)); 8197 } 8198 __ bind(LARGE_LOOP); 8199 if (SoftwarePrefetchHintDistance >= 0) { 8200 __ prfm(Address(ary1, SoftwarePrefetchHintDistance)); 8201 } 8202 // Issue load instructions first, since it can save few CPU/MEM cycles, also 8203 // instead of 4 triples of "orr(...), addr(...);cbnz(...);" (for each ldp) 8204 // better generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...) which saves 3 8205 // instructions per cycle and have less branches, but this approach disables 8206 // early return, thus, all 64 bytes are loaded and checked every time. 
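    // Editor's note: the 64-byte block below ORs eight 8-byte words together and
    // applies a single tst against UPPER_BIT_MASK; per word that is equivalent
    // to the scalar test sketched here (illustration only, not emitted code):
    auto word_has_negative_byte = [](uint64_t eight_bytes) -> bool {
      return (eight_bytes & 0x8080808080808080ULL) != 0;  // any byte's sign bit set?
    };
    (void)word_has_negative_byte; // unused; reference sketch only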
8207 __ ldp(tmp2, tmp3, Address(ary1)); 8208 __ ldp(tmp4, tmp5, Address(ary1, 16)); 8209 __ ldp(rscratch1, rscratch2, Address(ary1, 32)); 8210 __ ldp(tmp6, tmp1, Address(ary1, 48)); 8211 __ add(ary1, ary1, large_loop_size); 8212 __ sub(len, len, large_loop_size); 8213 __ orr(tmp2, tmp2, tmp3); 8214 __ orr(tmp4, tmp4, tmp5); 8215 __ orr(rscratch1, rscratch1, rscratch2); 8216 __ orr(tmp6, tmp6, tmp1); 8217 __ orr(tmp2, tmp2, tmp4); 8218 __ orr(rscratch1, rscratch1, tmp6); 8219 __ orr(tmp2, tmp2, rscratch1); 8220 __ tst(tmp2, UPPER_BIT_MASK); 8221 __ br(Assembler::NE, RET_ADJUST_LONG); 8222 __ cmp(len, large_loop_size); 8223 __ br(Assembler::GE, LARGE_LOOP); 8224 8225 __ bind(CHECK_16); // small 16-byte load pre-loop 8226 __ cmp(len, (u1)16); 8227 __ br(Assembler::LT, POST_LOOP16); 8228 8229 __ bind(LOOP16); // small 16-byte load loop 8230 __ ldp(tmp2, tmp3, Address(__ post(ary1, 16))); 8231 __ sub(len, len, 16); 8232 __ orr(tmp2, tmp2, tmp3); 8233 __ tst(tmp2, UPPER_BIT_MASK); 8234 __ br(Assembler::NE, RET_ADJUST_16); 8235 __ cmp(len, (u1)16); 8236 __ br(Assembler::GE, LOOP16); // 16-byte load loop end 8237 8238 __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally 8239 __ cmp(len, (u1)8); 8240 __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL); 8241 __ ldr(tmp3, Address(__ post(ary1, 8))); 8242 __ tst(tmp3, UPPER_BIT_MASK); 8243 __ br(Assembler::NE, RET_ADJUST); 8244 __ sub(len, len, 8); 8245 8246 __ bind(POST_LOOP16_LOAD_TAIL); 8247 __ cbz(len, RET_LEN); // Can't shift left by 64 when len==0 8248 __ ldr(tmp1, Address(ary1)); 8249 __ mov(tmp2, 64); 8250 __ sub(tmp4, tmp2, len, __ LSL, 3); 8251 __ lslv(tmp1, tmp1, tmp4); 8252 __ tst(tmp1, UPPER_BIT_MASK); 8253 __ br(Assembler::NE, RET_ADJUST); 8254 // Fallthrough 8255 8256 __ bind(RET_LEN); 8257 __ pop(spilled_regs, sp); 8258 __ leave(); 8259 __ ret(lr); 8260 8261 // difference result - len is the count of guaranteed to be 8262 // positive bytes 8263 8264 __ bind(RET_ADJUST_LONG); 8265 __ add(len, len, (u1)(large_loop_size - 16)); 8266 __ bind(RET_ADJUST_16); 8267 __ add(len, len, 16); 8268 __ bind(RET_ADJUST); 8269 __ pop(spilled_regs, sp); 8270 __ leave(); 8271 __ sub(result, result, len); 8272 __ ret(lr); 8273 8274 return entry; 8275 } 8276 8277 void generate_large_array_equals_loop_nonsimd(int loopThreshold, 8278 bool usePrefetch, Label &NOT_EQUAL) { 8279 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 8280 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11, 8281 tmp7 = r12, tmp8 = r13; 8282 Label LOOP; 8283 8284 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 8285 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 8286 __ bind(LOOP); 8287 if (usePrefetch) { 8288 __ prfm(Address(a1, SoftwarePrefetchHintDistance)); 8289 __ prfm(Address(a2, SoftwarePrefetchHintDistance)); 8290 } 8291 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize))); 8292 __ eor(tmp1, tmp1, tmp2); 8293 __ eor(tmp3, tmp3, tmp4); 8294 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize))); 8295 __ orr(tmp1, tmp1, tmp3); 8296 __ cbnz(tmp1, NOT_EQUAL); 8297 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 8298 __ eor(tmp5, tmp5, tmp6); 8299 __ eor(tmp7, tmp7, tmp8); 8300 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 8301 __ orr(tmp5, tmp5, tmp7); 8302 __ cbnz(tmp5, NOT_EQUAL); 8303 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize))); 8304 __ eor(tmp1, tmp1, tmp2); 8305 __ eor(tmp3, tmp3, tmp4); 8306 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize))); 8307 __ orr(tmp1, tmp1, tmp3); 8308 __ 
cbnz(tmp1, NOT_EQUAL); 8309 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 8310 __ eor(tmp5, tmp5, tmp6); 8311 __ sub(cnt1, cnt1, 8 * wordSize); 8312 __ eor(tmp7, tmp7, tmp8); 8313 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 8314 // tmp6 is not used. MacroAssembler::subs is used here (rather than 8315 // cmp) because subs allows an unlimited range of immediate operand. 8316 __ subs(tmp6, cnt1, loopThreshold); 8317 __ orr(tmp5, tmp5, tmp7); 8318 __ cbnz(tmp5, NOT_EQUAL); 8319 __ br(__ GE, LOOP); 8320 // post-loop 8321 __ eor(tmp1, tmp1, tmp2); 8322 __ eor(tmp3, tmp3, tmp4); 8323 __ orr(tmp1, tmp1, tmp3); 8324 __ sub(cnt1, cnt1, 2 * wordSize); 8325 __ cbnz(tmp1, NOT_EQUAL); 8326 } 8327 8328 void generate_large_array_equals_loop_simd(int loopThreshold, 8329 bool usePrefetch, Label &NOT_EQUAL) { 8330 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 8331 tmp2 = rscratch2; 8332 Label LOOP; 8333 8334 __ bind(LOOP); 8335 if (usePrefetch) { 8336 __ prfm(Address(a1, SoftwarePrefetchHintDistance)); 8337 __ prfm(Address(a2, SoftwarePrefetchHintDistance)); 8338 } 8339 __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize))); 8340 __ sub(cnt1, cnt1, 8 * wordSize); 8341 __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize))); 8342 __ subs(tmp1, cnt1, loopThreshold); 8343 __ eor(v0, __ T16B, v0, v4); 8344 __ eor(v1, __ T16B, v1, v5); 8345 __ eor(v2, __ T16B, v2, v6); 8346 __ eor(v3, __ T16B, v3, v7); 8347 __ orr(v0, __ T16B, v0, v1); 8348 __ orr(v1, __ T16B, v2, v3); 8349 __ orr(v0, __ T16B, v0, v1); 8350 __ umov(tmp1, v0, __ D, 0); 8351 __ umov(tmp2, v0, __ D, 1); 8352 __ orr(tmp1, tmp1, tmp2); 8353 __ cbnz(tmp1, NOT_EQUAL); 8354 __ br(__ GE, LOOP); 8355 } 8356 8357 // a1 = r1 - array1 address 8358 // a2 = r2 - array2 address 8359 // result = r0 - return value. Already contains "false" 8360 // cnt1 = r10 - amount of elements left to check, reduced by wordSize 8361 // r3-r5 are reserved temporary registers 8362 // Clobbers: v0-v7 when UseSIMDForArrayEquals, rscratch1, rscratch2 8363 address generate_large_array_equals() { 8364 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 8365 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11, 8366 tmp7 = r12, tmp8 = r13; 8367 Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP, 8368 SMALL_LOOP, POST_LOOP; 8369 const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16; 8370 // calculate if at least 32 prefetched bytes are used 8371 int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32; 8372 int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE); 8373 RegSet spilled_regs = RegSet::range(tmp6, tmp8); 8374 assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4, 8375 tmp5, tmp6, tmp7, tmp8); 8376 8377 __ align(CodeEntryAlignment); 8378 8379 StubId stub_id = StubId::stubgen_large_array_equals_id; 8380 StubCodeMark mark(this, stub_id); 8381 8382 address entry = __ pc(); 8383 __ enter(); 8384 __ sub(cnt1, cnt1, wordSize); // first 8 bytes were loaded outside of stub 8385 // also advance pointers to use post-increment instead of pre-increment 8386 __ add(a1, a1, wordSize); 8387 __ add(a2, a2, wordSize); 8388 if (AvoidUnalignedAccesses) { 8389 // both implementations (SIMD/nonSIMD) are using relatively large load 8390 // instructions (ld1/ldp), which has huge penalty (up to x2 exec time) 8391 // on some CPUs in case of address is not at least 16-byte aligned. 
8392 // Arrays are currently 8-byte aligned, so, if needed, we can do one extra 8-byte 8393 // load for the first address to make it 16-byte aligned. 8394 Label ALIGNED16; 8395 __ tbz(a1, 3, ALIGNED16); 8396 __ ldr(tmp1, Address(__ post(a1, wordSize))); 8397 __ ldr(tmp2, Address(__ post(a2, wordSize))); 8398 __ sub(cnt1, cnt1, wordSize); 8399 __ eor(tmp1, tmp1, tmp2); 8400 __ cbnz(tmp1, NOT_EQUAL_NO_POP); 8401 __ bind(ALIGNED16); 8402 } 8403 if (UseSIMDForArrayEquals) { 8404 if (SoftwarePrefetchHintDistance >= 0) { 8405 __ subs(tmp1, cnt1, prefetchLoopThreshold); 8406 __ br(__ LE, NO_PREFETCH_LARGE_LOOP); 8407 generate_large_array_equals_loop_simd(prefetchLoopThreshold, 8408 /* prfm = */ true, NOT_EQUAL); 8409 __ subs(zr, cnt1, nonPrefetchLoopThreshold); 8410 __ br(__ LT, TAIL); 8411 } 8412 __ bind(NO_PREFETCH_LARGE_LOOP); 8413 generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold, 8414 /* prfm = */ false, NOT_EQUAL); 8415 } else { 8416 __ push(spilled_regs, sp); 8417 if (SoftwarePrefetchHintDistance >= 0) { 8418 __ subs(tmp1, cnt1, prefetchLoopThreshold); 8419 __ br(__ LE, NO_PREFETCH_LARGE_LOOP); 8420 generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold, 8421 /* prfm = */ true, NOT_EQUAL); 8422 __ subs(zr, cnt1, nonPrefetchLoopThreshold); 8423 __ br(__ LT, TAIL); 8424 } 8425 __ bind(NO_PREFETCH_LARGE_LOOP); 8426 generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold, 8427 /* prfm = */ false, NOT_EQUAL); 8428 } 8429 __ bind(TAIL); 8430 __ cbz(cnt1, EQUAL); 8431 __ subs(cnt1, cnt1, wordSize); 8432 __ br(__ LE, POST_LOOP); 8433 __ bind(SMALL_LOOP); 8434 __ ldr(tmp1, Address(__ post(a1, wordSize))); 8435 __ ldr(tmp2, Address(__ post(a2, wordSize))); 8436 __ subs(cnt1, cnt1, wordSize); 8437 __ eor(tmp1, tmp1, tmp2); 8438 __ cbnz(tmp1, NOT_EQUAL); 8439 __ br(__ GT, SMALL_LOOP); 8440 __ bind(POST_LOOP); 8441 __ ldr(tmp1, Address(a1, cnt1)); 8442 __ ldr(tmp2, Address(a2, cnt1)); 8443 __ eor(tmp1, tmp1, tmp2); 8444 __ cbnz(tmp1, NOT_EQUAL); 8445 __ bind(EQUAL); 8446 __ mov(result, true); 8447 __ bind(NOT_EQUAL); 8448 if (!UseSIMDForArrayEquals) { 8449 __ pop(spilled_regs, sp); 8450 } 8451 __ bind(NOT_EQUAL_NO_POP); 8452 __ leave(); 8453 __ ret(lr); 8454 return entry; 8455 } 8456 8457 // result = r0 - return value. Contains initial hashcode value on entry.
8458 // ary = r1 - array address 8459 // cnt = r2 - elements count 8460 // Clobbers: v0-v13, rscratch1, rscratch2 8461 address generate_large_arrays_hashcode(BasicType eltype) { 8462 const Register result = r0, ary = r1, cnt = r2; 8463 const FloatRegister vdata0 = v3, vdata1 = v2, vdata2 = v1, vdata3 = v0; 8464 const FloatRegister vmul0 = v4, vmul1 = v5, vmul2 = v6, vmul3 = v7; 8465 const FloatRegister vpow = v12; // powers of 31: <31^3, ..., 31^0> 8466 const FloatRegister vpowm = v13; 8467 8468 ARRAYS_HASHCODE_REGISTERS; 8469 8470 Label SMALL_LOOP, LARGE_LOOP_PREHEADER, LARGE_LOOP, TAIL, TAIL_SHORTCUT, BR_BASE; 8471 8472 unsigned int vf; // vectorization factor 8473 bool multiply_by_halves; 8474 Assembler::SIMD_Arrangement load_arrangement; 8475 switch (eltype) { 8476 case T_BOOLEAN: 8477 case T_BYTE: 8478 load_arrangement = Assembler::T8B; 8479 multiply_by_halves = true; 8480 vf = 8; 8481 break; 8482 case T_CHAR: 8483 case T_SHORT: 8484 load_arrangement = Assembler::T8H; 8485 multiply_by_halves = true; 8486 vf = 8; 8487 break; 8488 case T_INT: 8489 load_arrangement = Assembler::T4S; 8490 multiply_by_halves = false; 8491 vf = 4; 8492 break; 8493 default: 8494 ShouldNotReachHere(); 8495 } 8496 8497 // Unroll factor 8498 const unsigned uf = 4; 8499 8500 // Effective vectorization factor 8501 const unsigned evf = vf * uf; 8502 8503 __ align(CodeEntryAlignment); 8504 8505 StubId stub_id; 8506 switch (eltype) { 8507 case T_BOOLEAN: 8508 stub_id = StubId::stubgen_large_arrays_hashcode_boolean_id; 8509 break; 8510 case T_BYTE: 8511 stub_id = StubId::stubgen_large_arrays_hashcode_byte_id; 8512 break; 8513 case T_CHAR: 8514 stub_id = StubId::stubgen_large_arrays_hashcode_char_id; 8515 break; 8516 case T_SHORT: 8517 stub_id = StubId::stubgen_large_arrays_hashcode_short_id; 8518 break; 8519 case T_INT: 8520 stub_id = StubId::stubgen_large_arrays_hashcode_int_id; 8521 break; 8522 default: 8523 stub_id = StubId::NO_STUBID; 8524 ShouldNotReachHere(); 8525 }; 8526 8527 StubCodeMark mark(this, stub_id); 8528 8529 address entry = __ pc(); 8530 __ enter(); 8531 8532 // Put 0-3'th powers of 31 into a single SIMD register together. The register will be used in 8533 // the SMALL and LARGE LOOPS' epilogues. The initialization is hoisted here and the register's 8534 // value shouldn't change throughout both loops. 8535 __ movw(rscratch1, intpow(31U, 3)); 8536 __ mov(vpow, Assembler::S, 0, rscratch1); 8537 __ movw(rscratch1, intpow(31U, 2)); 8538 __ mov(vpow, Assembler::S, 1, rscratch1); 8539 __ movw(rscratch1, intpow(31U, 1)); 8540 __ mov(vpow, Assembler::S, 2, rscratch1); 8541 __ movw(rscratch1, intpow(31U, 0)); 8542 __ mov(vpow, Assembler::S, 3, rscratch1); 8543 8544 __ mov(vmul0, Assembler::T16B, 0); 8545 __ mov(vmul0, Assembler::S, 3, result); 8546 8547 __ andr(rscratch2, cnt, (uf - 1) * vf); 8548 __ cbz(rscratch2, LARGE_LOOP_PREHEADER); 8549 8550 __ movw(rscratch1, intpow(31U, multiply_by_halves ? 
vf / 2 : vf)); 8551 __ mov(vpowm, Assembler::S, 0, rscratch1); 8552 8553 // SMALL LOOP 8554 __ bind(SMALL_LOOP); 8555 8556 __ ld1(vdata0, load_arrangement, Address(__ post(ary, vf * type2aelembytes(eltype)))); 8557 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0); 8558 __ subsw(rscratch2, rscratch2, vf); 8559 8560 if (load_arrangement == Assembler::T8B) { 8561 // Extend 8B to 8H to be able to use vector multiply 8562 // instructions 8563 assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H"); 8564 if (is_signed_subword_type(eltype)) { 8565 __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement); 8566 } else { 8567 __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement); 8568 } 8569 } 8570 8571 switch (load_arrangement) { 8572 case Assembler::T4S: 8573 __ addv(vmul0, load_arrangement, vmul0, vdata0); 8574 break; 8575 case Assembler::T8B: 8576 case Assembler::T8H: 8577 assert(is_subword_type(eltype), "subword type expected"); 8578 if (is_signed_subword_type(eltype)) { 8579 __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H); 8580 } else { 8581 __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H); 8582 } 8583 break; 8584 default: 8585 __ should_not_reach_here(); 8586 } 8587 8588 // Process the upper half of a vector 8589 if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) { 8590 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0); 8591 if (is_signed_subword_type(eltype)) { 8592 __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H); 8593 } else { 8594 __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H); 8595 } 8596 } 8597 8598 __ br(Assembler::HI, SMALL_LOOP); 8599 8600 // SMALL LOOP'S EPILOQUE 8601 __ lsr(rscratch2, cnt, exact_log2(evf)); 8602 __ cbnz(rscratch2, LARGE_LOOP_PREHEADER); 8603 8604 __ mulv(vmul0, Assembler::T4S, vmul0, vpow); 8605 __ addv(vmul0, Assembler::T4S, vmul0); 8606 __ umov(result, vmul0, Assembler::S, 0); 8607 8608 // TAIL 8609 __ bind(TAIL); 8610 8611 // The andr performs cnt % vf. The subtract shifted by 3 offsets past vf - 1 - (cnt % vf) pairs 8612 // of load + madd insns i.e. it only executes cnt % vf load + madd pairs. 8613 assert(is_power_of_2(vf), "can't use this value to calculate the jump target PC"); 8614 __ andr(rscratch2, cnt, vf - 1); 8615 __ bind(TAIL_SHORTCUT); 8616 __ adr(rscratch1, BR_BASE); 8617 // For Cortex-A53 offset is 4 because 2 nops are generated. 8618 __ sub(rscratch1, rscratch1, rscratch2, ext::uxtw, VM_Version::supports_a53mac() ? 4 : 3); 8619 __ movw(rscratch2, 0x1f); 8620 __ br(rscratch1); 8621 8622 for (size_t i = 0; i < vf - 1; ++i) { 8623 __ load(rscratch1, Address(__ post(ary, type2aelembytes(eltype))), 8624 eltype); 8625 __ maddw(result, result, rscratch2, rscratch1); 8626 // maddw generates an extra nop for Cortex-A53 (see maddw definition in macroAssembler). 8627 // Generate 2nd nop to have 4 instructions per iteration. 
8628 if (VM_Version::supports_a53mac()) { 8629 __ nop(); 8630 } 8631 } 8632 __ bind(BR_BASE); 8633 8634 __ leave(); 8635 __ ret(lr); 8636 8637 // LARGE LOOP 8638 __ bind(LARGE_LOOP_PREHEADER); 8639 8640 __ lsr(rscratch2, cnt, exact_log2(evf)); 8641 8642 if (multiply_by_halves) { 8643 // 31^4 - multiplier between lower and upper parts of a register 8644 __ movw(rscratch1, intpow(31U, vf / 2)); 8645 __ mov(vpowm, Assembler::S, 1, rscratch1); 8646 // 31^28 - remainder of the iteraion multiplier, 28 = 32 - 4 8647 __ movw(rscratch1, intpow(31U, evf - vf / 2)); 8648 __ mov(vpowm, Assembler::S, 0, rscratch1); 8649 } else { 8650 // 31^16 8651 __ movw(rscratch1, intpow(31U, evf)); 8652 __ mov(vpowm, Assembler::S, 0, rscratch1); 8653 } 8654 8655 __ mov(vmul3, Assembler::T16B, 0); 8656 __ mov(vmul2, Assembler::T16B, 0); 8657 __ mov(vmul1, Assembler::T16B, 0); 8658 8659 __ bind(LARGE_LOOP); 8660 8661 __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 0); 8662 __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 0); 8663 __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 0); 8664 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0); 8665 8666 __ ld1(vdata3, vdata2, vdata1, vdata0, load_arrangement, 8667 Address(__ post(ary, evf * type2aelembytes(eltype)))); 8668 8669 if (load_arrangement == Assembler::T8B) { 8670 // Extend 8B to 8H to be able to use vector multiply 8671 // instructions 8672 assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H"); 8673 if (is_signed_subword_type(eltype)) { 8674 __ sxtl(vdata3, Assembler::T8H, vdata3, load_arrangement); 8675 __ sxtl(vdata2, Assembler::T8H, vdata2, load_arrangement); 8676 __ sxtl(vdata1, Assembler::T8H, vdata1, load_arrangement); 8677 __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement); 8678 } else { 8679 __ uxtl(vdata3, Assembler::T8H, vdata3, load_arrangement); 8680 __ uxtl(vdata2, Assembler::T8H, vdata2, load_arrangement); 8681 __ uxtl(vdata1, Assembler::T8H, vdata1, load_arrangement); 8682 __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement); 8683 } 8684 } 8685 8686 switch (load_arrangement) { 8687 case Assembler::T4S: 8688 __ addv(vmul3, load_arrangement, vmul3, vdata3); 8689 __ addv(vmul2, load_arrangement, vmul2, vdata2); 8690 __ addv(vmul1, load_arrangement, vmul1, vdata1); 8691 __ addv(vmul0, load_arrangement, vmul0, vdata0); 8692 break; 8693 case Assembler::T8B: 8694 case Assembler::T8H: 8695 assert(is_subword_type(eltype), "subword type expected"); 8696 if (is_signed_subword_type(eltype)) { 8697 __ saddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H); 8698 __ saddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H); 8699 __ saddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H); 8700 __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H); 8701 } else { 8702 __ uaddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H); 8703 __ uaddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H); 8704 __ uaddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H); 8705 __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H); 8706 } 8707 break; 8708 default: 8709 __ should_not_reach_here(); 8710 } 8711 8712 // Process the upper half of a vector 8713 if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) { 8714 __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 1); 8715 __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 1); 8716 __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 1); 8717 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 1); 8718 if (is_signed_subword_type(eltype)) { 8719 __ saddwv2(vmul3, 
vmul3, Assembler::T4S, vdata3, Assembler::T8H); 8720 __ saddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H); 8721 __ saddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H); 8722 __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H); 8723 } else { 8724 __ uaddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H); 8725 __ uaddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H); 8726 __ uaddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H); 8727 __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H); 8728 } 8729 } 8730 8731 __ subsw(rscratch2, rscratch2, 1); 8732 __ br(Assembler::HI, LARGE_LOOP); 8733 8734 __ mulv(vmul3, Assembler::T4S, vmul3, vpow); 8735 __ addv(vmul3, Assembler::T4S, vmul3); 8736 __ umov(result, vmul3, Assembler::S, 0); 8737 8738 __ mov(rscratch2, intpow(31U, vf)); 8739 8740 __ mulv(vmul2, Assembler::T4S, vmul2, vpow); 8741 __ addv(vmul2, Assembler::T4S, vmul2); 8742 __ umov(rscratch1, vmul2, Assembler::S, 0); 8743 __ maddw(result, result, rscratch2, rscratch1); 8744 8745 __ mulv(vmul1, Assembler::T4S, vmul1, vpow); 8746 __ addv(vmul1, Assembler::T4S, vmul1); 8747 __ umov(rscratch1, vmul1, Assembler::S, 0); 8748 __ maddw(result, result, rscratch2, rscratch1); 8749 8750 __ mulv(vmul0, Assembler::T4S, vmul0, vpow); 8751 __ addv(vmul0, Assembler::T4S, vmul0); 8752 __ umov(rscratch1, vmul0, Assembler::S, 0); 8753 __ maddw(result, result, rscratch2, rscratch1); 8754 8755 __ andr(rscratch2, cnt, vf - 1); 8756 __ cbnz(rscratch2, TAIL_SHORTCUT); 8757 8758 __ leave(); 8759 __ ret(lr); 8760 8761 return entry; 8762 } 8763 8764 address generate_dsin_dcos(bool isCos) { 8765 __ align(CodeEntryAlignment); 8766 StubId stub_id = (isCos ? StubId::stubgen_dcos_id : StubId::stubgen_dsin_id); 8767 StubCodeMark mark(this, stub_id); 8768 address start = __ pc(); 8769 __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw, 8770 (address)StubRoutines::aarch64::_two_over_pi, 8771 (address)StubRoutines::aarch64::_pio2, 8772 (address)StubRoutines::aarch64::_dsin_coef, 8773 (address)StubRoutines::aarch64::_dcos_coef); 8774 return start; 8775 } 8776 8777 // code for comparing 16 characters of strings with Latin1 and Utf16 encoding 8778 void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1, 8779 Label &DIFF2) { 8780 Register cnt1 = r2, tmp2 = r11, tmp3 = r12; 8781 FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2; 8782 8783 __ ldrq(vtmp, Address(__ post(tmp2, 16))); 8784 __ ldr(tmpU, Address(__ post(cnt1, 8))); 8785 __ zip1(vtmp3, __ T16B, vtmp, vtmpZ); 8786 // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3 8787 8788 __ fmovd(tmpL, vtmp3); 8789 __ eor(rscratch2, tmp3, tmpL); 8790 __ cbnz(rscratch2, DIFF2); 8791 8792 __ ldr(tmp3, Address(__ post(cnt1, 8))); 8793 __ umov(tmpL, vtmp3, __ D, 1); 8794 __ eor(rscratch2, tmpU, tmpL); 8795 __ cbnz(rscratch2, DIFF1); 8796 8797 __ zip2(vtmp, __ T16B, vtmp, vtmpZ); 8798 __ ldr(tmpU, Address(__ post(cnt1, 8))); 8799 __ fmovd(tmpL, vtmp); 8800 __ eor(rscratch2, tmp3, tmpL); 8801 __ cbnz(rscratch2, DIFF2); 8802 8803 __ ldr(tmp3, Address(__ post(cnt1, 8))); 8804 __ umov(tmpL, vtmp, __ D, 1); 8805 __ eor(rscratch2, tmpU, tmpL); 8806 __ cbnz(rscratch2, DIFF1); 8807 } 8808 8809 // r0 = result 8810 // r1 = str1 8811 // r2 = cnt1 8812 // r3 = str2 8813 // r4 = cnt2 8814 // r10 = tmp1 8815 // r11 = tmp2 8816 address generate_compare_long_string_different_encoding(bool isLU) { 8817 __ align(CodeEntryAlignment); 8818 StubId stub_id = (isLU ? 
StubId::stubgen_compare_long_string_LU_id : StubId::stubgen_compare_long_string_UL_id); 8819 StubCodeMark mark(this, stub_id); 8820 address entry = __ pc(); 8821 Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2, 8822 DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH, 8823 LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2; 8824 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 8825 tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14; 8826 FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2; 8827 RegSet spilled_regs = RegSet::of(tmp3, tmp4); 8828 8829 int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2); 8830 8831 __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ); 8832 // cnt2 == amount of characters left to compare 8833 // Check already loaded first 4 symbols(vtmp and tmp2(LU)/tmp1(UL)) 8834 __ zip1(vtmp, __ T8B, vtmp, vtmpZ); 8835 __ add(str1, str1, isLU ? wordSize/2 : wordSize); 8836 __ add(str2, str2, isLU ? wordSize : wordSize/2); 8837 __ fmovd(isLU ? tmp1 : tmp2, vtmp); 8838 __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case. 8839 __ eor(rscratch2, tmp1, tmp2); 8840 __ mov(rscratch1, tmp2); 8841 __ cbnz(rscratch2, CALCULATE_DIFFERENCE); 8842 Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison 8843 tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison 8844 __ push(spilled_regs, sp); 8845 __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load 8846 __ mov(cnt1, isLU ? str2 : str1); // init the pointer to U next load 8847 8848 __ ldr(tmp3, Address(__ post(cnt1, 8))); 8849 8850 if (SoftwarePrefetchHintDistance >= 0) { 8851 __ subs(rscratch2, cnt2, prefetchLoopExitCondition); 8852 __ br(__ LT, NO_PREFETCH); 8853 __ bind(LARGE_LOOP_PREFETCH); 8854 __ prfm(Address(tmp2, SoftwarePrefetchHintDistance)); 8855 __ mov(tmp4, 2); 8856 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance)); 8857 __ bind(LARGE_LOOP_PREFETCH_REPEAT1); 8858 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 8859 __ subs(tmp4, tmp4, 1); 8860 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1); 8861 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance)); 8862 __ mov(tmp4, 2); 8863 __ bind(LARGE_LOOP_PREFETCH_REPEAT2); 8864 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 8865 __ subs(tmp4, tmp4, 1); 8866 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2); 8867 __ sub(cnt2, cnt2, 64); 8868 __ subs(rscratch2, cnt2, prefetchLoopExitCondition); 8869 __ br(__ GE, LARGE_LOOP_PREFETCH); 8870 } 8871 __ cbz(cnt2, LOAD_LAST); // no characters left except last load 8872 __ bind(NO_PREFETCH); 8873 __ subs(cnt2, cnt2, 16); 8874 __ br(__ LT, TAIL); 8875 __ align(OptoLoopAlignment); 8876 __ bind(SMALL_LOOP); // smaller loop 8877 __ subs(cnt2, cnt2, 16); 8878 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 8879 __ br(__ GE, SMALL_LOOP); 8880 __ cmn(cnt2, (u1)16); 8881 __ br(__ EQ, LOAD_LAST); 8882 __ bind(TAIL); // 1..15 characters left until last load (last 4 characters) 8883 __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string 8884 __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string 8885 __ ldr(tmp3, Address(cnt1, -8)); 8886 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load 8887 __ b(LOAD_LAST); 8888 __ bind(DIFF2); 8889 __ mov(tmpU, tmp3); 8890 __ bind(DIFF1); 8891 __ pop(spilled_regs, sp); 8892 __ b(CALCULATE_DIFFERENCE); 8893 __ bind(LOAD_LAST); 8894 // Last 4 UTF-16 characters are already pre-loaded into tmp3 by 
compare_string_16_x_LU. 8895 // No need to load it again 8896 __ mov(tmpU, tmp3); 8897 __ pop(spilled_regs, sp); 8898 8899 // tmp2 points to the address of the last 4 Latin1 characters right now 8900 __ ldrs(vtmp, Address(tmp2)); 8901 __ zip1(vtmp, __ T8B, vtmp, vtmpZ); 8902 __ fmovd(tmpL, vtmp); 8903 8904 __ eor(rscratch2, tmpU, tmpL); 8905 __ cbz(rscratch2, DONE); 8906 8907 // Find the first different characters in the longwords and 8908 // compute their difference. 8909 __ bind(CALCULATE_DIFFERENCE); 8910 __ rev(rscratch2, rscratch2); 8911 __ clz(rscratch2, rscratch2); 8912 __ andr(rscratch2, rscratch2, -16); 8913 __ lsrv(tmp1, tmp1, rscratch2); 8914 __ uxthw(tmp1, tmp1); 8915 __ lsrv(rscratch1, rscratch1, rscratch2); 8916 __ uxthw(rscratch1, rscratch1); 8917 __ subw(result, tmp1, rscratch1); 8918 __ bind(DONE); 8919 __ ret(lr); 8920 return entry; 8921 } 8922 8923 // r0 = input (float16) 8924 // v0 = result (float) 8925 // v1 = temporary float register 8926 address generate_float16ToFloat() { 8927 __ align(CodeEntryAlignment); 8928 StubId stub_id = StubId::stubgen_hf2f_id; 8929 StubCodeMark mark(this, stub_id); 8930 address entry = __ pc(); 8931 BLOCK_COMMENT("Entry:"); 8932 __ flt16_to_flt(v0, r0, v1); 8933 __ ret(lr); 8934 return entry; 8935 } 8936 8937 // v0 = input (float) 8938 // r0 = result (float16) 8939 // v1 = temporary float register 8940 address generate_floatToFloat16() { 8941 __ align(CodeEntryAlignment); 8942 StubId stub_id = StubId::stubgen_f2hf_id; 8943 StubCodeMark mark(this, stub_id); 8944 address entry = __ pc(); 8945 BLOCK_COMMENT("Entry:"); 8946 __ flt_to_flt16(r0, v0, v1); 8947 __ ret(lr); 8948 return entry; 8949 } 8950 8951 address generate_method_entry_barrier() { 8952 __ align(CodeEntryAlignment); 8953 StubId stub_id = StubId::stubgen_method_entry_barrier_id; 8954 StubCodeMark mark(this, stub_id); 8955 8956 Label deoptimize_label; 8957 8958 address start = __ pc(); 8959 8960 BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler(); 8961 8962 if (bs_asm->nmethod_patching_type() == NMethodPatchingType::conc_instruction_and_data_patch) { 8963 BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod(); 8964 // We can get here despite the nmethod being good, if we have not 8965 // yet applied our cross modification fence (or data fence). 
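  // The next few instructions do, in effect (illustrative pseudo-C only; the real
  // field layout comes from the offsets used below, and the names here are invented):
  //
  //   rthread->patching_epoch = *patching_epoch_addr;  // ldrw + strw below
  //   isb();                                           // drop stale prefetched instructions
  //   membar(LoadLoad);                                // order the guard/data loads that follow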
8966 Address thread_epoch_addr(rthread, in_bytes(bs_nm->thread_disarmed_guard_value_offset()) + 4); 8967 __ lea(rscratch2, ExternalAddress(bs_asm->patching_epoch_addr())); 8968 __ ldrw(rscratch2, rscratch2); 8969 __ strw(rscratch2, thread_epoch_addr); 8970 __ isb(); 8971 __ membar(__ LoadLoad); 8972 } 8973 8974 __ set_last_Java_frame(sp, rfp, lr, rscratch1); 8975 8976 __ enter(); 8977 __ add(rscratch2, sp, wordSize); // rscratch2 points to the saved lr 8978 8979 __ sub(sp, sp, 4 * wordSize); // four words for the returned {sp, fp, lr, pc} 8980 8981 __ push_call_clobbered_registers(); 8982 8983 __ mov(c_rarg0, rscratch2); 8984 __ call_VM_leaf 8985 (CAST_FROM_FN_PTR 8986 (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1); 8987 8988 __ reset_last_Java_frame(true); 8989 8990 __ mov(rscratch1, r0); 8991 8992 __ pop_call_clobbered_registers(); 8993 8994 __ cbnz(rscratch1, deoptimize_label); 8995 8996 __ leave(); 8997 __ ret(lr); 8998 8999 __ BIND(deoptimize_label); 9000 9001 __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize)); 9002 __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize)); 9003 9004 __ mov(sp, rscratch1); 9005 __ br(rscratch2); 9006 9007 return start; 9008 } 9009 9010 // r0 = result 9011 // r1 = str1 9012 // r2 = cnt1 9013 // r3 = str2 9014 // r4 = cnt2 9015 // r10 = tmp1 9016 // r11 = tmp2 9017 address generate_compare_long_string_same_encoding(bool isLL) { 9018 __ align(CodeEntryAlignment); 9019 StubId stub_id = (isLL ? StubId::stubgen_compare_long_string_LL_id : StubId::stubgen_compare_long_string_UU_id); 9020 StubCodeMark mark(this, stub_id); 9021 address entry = __ pc(); 9022 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 9023 tmp1 = r10, tmp2 = r11, tmp1h = rscratch1, tmp2h = rscratch2; 9024 9025 Label LARGE_LOOP_PREFETCH, LOOP_COMPARE16, DIFF, LESS16, LESS8, CAL_DIFFERENCE, LENGTH_DIFF; 9026 9027 // exit from large loop when less than 64 bytes left to read or we're about 9028 // to prefetch memory behind array border 9029 int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2); 9030 9031 // before jumping to stub, pre-load 8 bytes already, so do comparison directly 9032 __ eor(rscratch2, tmp1, tmp2); 9033 __ cbnz(rscratch2, CAL_DIFFERENCE); 9034 9035 __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2)); 9036 // update pointers, because of previous read 9037 __ add(str1, str1, wordSize); 9038 __ add(str2, str2, wordSize); 9039 if (SoftwarePrefetchHintDistance >= 0) { 9040 __ align(OptoLoopAlignment); 9041 __ bind(LARGE_LOOP_PREFETCH); 9042 __ prfm(Address(str1, SoftwarePrefetchHintDistance)); 9043 __ prfm(Address(str2, SoftwarePrefetchHintDistance)); 9044 9045 for (int i = 0; i < 4; i++) { 9046 __ ldp(tmp1, tmp1h, Address(str1, i * 16)); 9047 __ ldp(tmp2, tmp2h, Address(str2, i * 16)); 9048 __ cmp(tmp1, tmp2); 9049 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ); 9050 __ br(Assembler::NE, DIFF); 9051 } 9052 __ sub(cnt2, cnt2, isLL ? 64 : 32); 9053 __ add(str1, str1, 64); 9054 __ add(str2, str2, 64); 9055 __ subs(rscratch2, cnt2, largeLoopExitCondition); 9056 __ br(Assembler::GE, LARGE_LOOP_PREFETCH); 9057 __ cbz(cnt2, LENGTH_DIFF); // no more chars left? 9058 } 9059 9060 __ subs(rscratch1, cnt2, isLL ? 
16 : 8); 9061 __ br(Assembler::LE, LESS16); 9062 __ align(OptoLoopAlignment); 9063 __ bind(LOOP_COMPARE16); 9064 __ ldp(tmp1, tmp1h, Address(__ post(str1, 16))); 9065 __ ldp(tmp2, tmp2h, Address(__ post(str2, 16))); 9066 __ cmp(tmp1, tmp2); 9067 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ); 9068 __ br(Assembler::NE, DIFF); 9069 __ sub(cnt2, cnt2, isLL ? 16 : 8); 9070 __ subs(rscratch2, cnt2, isLL ? 16 : 8); 9071 __ br(Assembler::LT, LESS16); 9072 9073 __ ldp(tmp1, tmp1h, Address(__ post(str1, 16))); 9074 __ ldp(tmp2, tmp2h, Address(__ post(str2, 16))); 9075 __ cmp(tmp1, tmp2); 9076 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ); 9077 __ br(Assembler::NE, DIFF); 9078 __ sub(cnt2, cnt2, isLL ? 16 : 8); 9079 __ subs(rscratch2, cnt2, isLL ? 16 : 8); 9080 __ br(Assembler::GE, LOOP_COMPARE16); 9081 __ cbz(cnt2, LENGTH_DIFF); 9082 9083 __ bind(LESS16); 9084 // each 8 compare 9085 __ subs(cnt2, cnt2, isLL ? 8 : 4); 9086 __ br(Assembler::LE, LESS8); 9087 __ ldr(tmp1, Address(__ post(str1, 8))); 9088 __ ldr(tmp2, Address(__ post(str2, 8))); 9089 __ eor(rscratch2, tmp1, tmp2); 9090 __ cbnz(rscratch2, CAL_DIFFERENCE); 9091 __ sub(cnt2, cnt2, isLL ? 8 : 4); 9092 9093 __ bind(LESS8); // directly load last 8 bytes 9094 if (!isLL) { 9095 __ add(cnt2, cnt2, cnt2); 9096 } 9097 __ ldr(tmp1, Address(str1, cnt2)); 9098 __ ldr(tmp2, Address(str2, cnt2)); 9099 __ eor(rscratch2, tmp1, tmp2); 9100 __ cbz(rscratch2, LENGTH_DIFF); 9101 __ b(CAL_DIFFERENCE); 9102 9103 __ bind(DIFF); 9104 __ cmp(tmp1, tmp2); 9105 __ csel(tmp1, tmp1, tmp1h, Assembler::NE); 9106 __ csel(tmp2, tmp2, tmp2h, Assembler::NE); 9107 // reuse rscratch2 register for the result of eor instruction 9108 __ eor(rscratch2, tmp1, tmp2); 9109 9110 __ bind(CAL_DIFFERENCE); 9111 __ rev(rscratch2, rscratch2); 9112 __ clz(rscratch2, rscratch2); 9113 __ andr(rscratch2, rscratch2, isLL ? -8 : -16); 9114 __ lsrv(tmp1, tmp1, rscratch2); 9115 __ lsrv(tmp2, tmp2, rscratch2); 9116 if (isLL) { 9117 __ uxtbw(tmp1, tmp1); 9118 __ uxtbw(tmp2, tmp2); 9119 } else { 9120 __ uxthw(tmp1, tmp1); 9121 __ uxthw(tmp2, tmp2); 9122 } 9123 __ subw(result, tmp1, tmp2); 9124 9125 __ bind(LENGTH_DIFF); 9126 __ ret(lr); 9127 return entry; 9128 } 9129 9130 enum string_compare_mode { 9131 LL, 9132 LU, 9133 UL, 9134 UU, 9135 }; 9136 9137 // The following registers are declared in aarch64.ad 9138 // r0 = result 9139 // r1 = str1 9140 // r2 = cnt1 9141 // r3 = str2 9142 // r4 = cnt2 9143 // r10 = tmp1 9144 // r11 = tmp2 9145 // z0 = ztmp1 9146 // z1 = ztmp2 9147 // p0 = pgtmp1 9148 // p1 = pgtmp2 9149 address generate_compare_long_string_sve(string_compare_mode mode) { 9150 StubId stub_id; 9151 switch (mode) { 9152 case LL: stub_id = StubId::stubgen_compare_long_string_LL_id; break; 9153 case LU: stub_id = StubId::stubgen_compare_long_string_LU_id; break; 9154 case UL: stub_id = StubId::stubgen_compare_long_string_UL_id; break; 9155 case UU: stub_id = StubId::stubgen_compare_long_string_UU_id; break; 9156 default: ShouldNotReachHere(); 9157 } 9158 9159 __ align(CodeEntryAlignment); 9160 address entry = __ pc(); 9161 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 9162 tmp1 = r10, tmp2 = r11; 9163 9164 Label LOOP, DONE, MISMATCH; 9165 Register vec_len = tmp1; 9166 Register idx = tmp2; 9167 // The minimum of the string lengths has been stored in cnt2. 
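  // Shape of the predicated loop generated below, as a rough scalar sketch
  // (illustrative only, shown for the LL case; any_ne() is an invented helper
  // standing for the predicated ld1 + sve_cmp + branch sequence):
  //
  //   pg = whilelt(0, cnt);                  // all lanes active while a full vector remains
  //   for (idx = 0; idx < cnt - vec_len; idx += vec_len) {
  //     if (any_ne(str1 + idx, str2 + idx, pg)) goto MISMATCH;
  //   }
  //   pg = whilelt(idx, cnt);                // partial predicate for the tail
  //   if (any_ne(str1 + idx, str2 + idx, pg)) goto MISMATCH;
  //   goto DONE;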
9168 Register cnt = cnt2; 9169 FloatRegister ztmp1 = z0, ztmp2 = z1; 9170 PRegister pgtmp1 = p0, pgtmp2 = p1; 9171 9172 #define LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx) \ 9173 switch (mode) { \ 9174 case LL: \ 9175 __ sve_ld1b(ztmp1, __ B, pgtmp1, Address(str1, idx)); \ 9176 __ sve_ld1b(ztmp2, __ B, pgtmp1, Address(str2, idx)); \ 9177 break; \ 9178 case LU: \ 9179 __ sve_ld1b(ztmp1, __ H, pgtmp1, Address(str1, idx)); \ 9180 __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \ 9181 break; \ 9182 case UL: \ 9183 __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \ 9184 __ sve_ld1b(ztmp2, __ H, pgtmp1, Address(str2, idx)); \ 9185 break; \ 9186 case UU: \ 9187 __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \ 9188 __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \ 9189 break; \ 9190 default: \ 9191 ShouldNotReachHere(); \ 9192 } 9193 9194 StubCodeMark mark(this, stub_id); 9195 9196 __ mov(idx, 0); 9197 __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt); 9198 9199 if (mode == LL) { 9200 __ sve_cntb(vec_len); 9201 } else { 9202 __ sve_cnth(vec_len); 9203 } 9204 9205 __ sub(rscratch1, cnt, vec_len); 9206 9207 __ bind(LOOP); 9208 9209 // main loop 9210 LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx); 9211 __ add(idx, idx, vec_len); 9212 // Compare strings. 9213 __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2); 9214 __ br(__ NE, MISMATCH); 9215 __ cmp(idx, rscratch1); 9216 __ br(__ LT, LOOP); 9217 9218 // post loop, last iteration 9219 __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt); 9220 9221 LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx); 9222 __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2); 9223 __ br(__ EQ, DONE); 9224 9225 __ bind(MISMATCH); 9226 9227 // Crop the vector to find its location. 9228 __ sve_brkb(pgtmp2, pgtmp1, pgtmp2, false /* isMerge */); 9229 // Extract the first different characters of each string. 9230 __ sve_lasta(rscratch1, mode == LL ? __ B : __ H, pgtmp2, ztmp1); 9231 __ sve_lasta(rscratch2, mode == LL ? __ B : __ H, pgtmp2, ztmp2); 9232 9233 // Compute the difference of the first different characters. 
9234 __ sub(result, rscratch1, rscratch2); 9235 9236 __ bind(DONE); 9237 __ ret(lr); 9238 #undef LOAD_PAIR 9239 return entry; 9240 } 9241 9242 void generate_compare_long_strings() { 9243 if (UseSVE == 0) { 9244 StubRoutines::aarch64::_compare_long_string_LL 9245 = generate_compare_long_string_same_encoding(true); 9246 StubRoutines::aarch64::_compare_long_string_UU 9247 = generate_compare_long_string_same_encoding(false); 9248 StubRoutines::aarch64::_compare_long_string_LU 9249 = generate_compare_long_string_different_encoding(true); 9250 StubRoutines::aarch64::_compare_long_string_UL 9251 = generate_compare_long_string_different_encoding(false); 9252 } else { 9253 StubRoutines::aarch64::_compare_long_string_LL 9254 = generate_compare_long_string_sve(LL); 9255 StubRoutines::aarch64::_compare_long_string_UU 9256 = generate_compare_long_string_sve(UU); 9257 StubRoutines::aarch64::_compare_long_string_LU 9258 = generate_compare_long_string_sve(LU); 9259 StubRoutines::aarch64::_compare_long_string_UL 9260 = generate_compare_long_string_sve(UL); 9261 } 9262 } 9263 9264 // R0 = result 9265 // R1 = str2 9266 // R2 = cnt1 9267 // R3 = str1 9268 // R4 = cnt2 9269 // Clobbers: rscratch1, rscratch2, v0, v1, rflags 9270 // 9271 // This generic linear code use few additional ideas, which makes it faster: 9272 // 1) we can safely keep at least 1st register of pattern(since length >= 8) 9273 // in order to skip initial loading(help in systems with 1 ld pipeline) 9274 // 2) we can use "fast" algorithm of finding single character to search for 9275 // first symbol with less branches(1 branch per each loaded register instead 9276 // of branch for each symbol), so, this is where constants like 9277 // 0x0101...01, 0x00010001...0001, 0x7f7f...7f, 0x7fff7fff...7fff comes from 9278 // 3) after loading and analyzing 1st register of source string, it can be 9279 // used to search for every 1st character entry, saving few loads in 9280 // comparison with "simplier-but-slower" implementation 9281 // 4) in order to avoid lots of push/pop operations, code below is heavily 9282 // re-using/re-initializing/compressing register values, which makes code 9283 // larger and a bit less readable, however, most of extra operations are 9284 // issued during loads or branches, so, penalty is minimal 9285 address generate_string_indexof_linear(bool str1_isL, bool str2_isL) { 9286 StubId stub_id; 9287 if (str1_isL) { 9288 if (str2_isL) { 9289 stub_id = StubId::stubgen_string_indexof_linear_ll_id; 9290 } else { 9291 stub_id = StubId::stubgen_string_indexof_linear_ul_id; 9292 } 9293 } else { 9294 if (str2_isL) { 9295 ShouldNotReachHere(); 9296 } else { 9297 stub_id = StubId::stubgen_string_indexof_linear_uu_id; 9298 } 9299 } 9300 __ align(CodeEntryAlignment); 9301 StubCodeMark mark(this, stub_id); 9302 address entry = __ pc(); 9303 9304 int str1_chr_size = str1_isL ? 1 : 2; 9305 int str2_chr_size = str2_isL ? 1 : 2; 9306 int str1_chr_shift = str1_isL ? 0 : 1; 9307 int str2_chr_shift = str2_isL ? 
0 : 1; 9308 bool isL = str1_isL && str2_isL; 9309 // parameters 9310 Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4; 9311 // temporary registers 9312 Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23; 9313 RegSet spilled_regs = RegSet::range(tmp1, tmp4); 9314 // redefinitions 9315 Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3; 9316 9317 __ push(spilled_regs, sp); 9318 Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO, 9319 L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED, 9320 L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP, 9321 L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH, 9322 L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2, 9323 L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH; 9324 // Read whole register from str1. It is safe, because length >=8 here 9325 __ ldr(ch1, Address(str1)); 9326 // Read whole register from str2. It is safe, because length >=8 here 9327 __ ldr(ch2, Address(str2)); 9328 __ sub(cnt2, cnt2, cnt1); 9329 __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF); 9330 if (str1_isL != str2_isL) { 9331 __ eor(v0, __ T16B, v0, v0); 9332 } 9333 __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001); 9334 __ mul(first, first, tmp1); 9335 // check if we have less than 1 register to check 9336 __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1); 9337 if (str1_isL != str2_isL) { 9338 __ fmovd(v1, ch1); 9339 } 9340 __ br(__ LE, L_SMALL); 9341 __ eor(ch2, first, ch2); 9342 if (str1_isL != str2_isL) { 9343 __ zip1(v1, __ T16B, v1, v0); 9344 } 9345 __ sub(tmp2, ch2, tmp1); 9346 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 9347 __ bics(tmp2, tmp2, ch2); 9348 if (str1_isL != str2_isL) { 9349 __ fmovd(ch1, v1); 9350 } 9351 __ br(__ NE, L_HAS_ZERO); 9352 __ subs(cnt2, cnt2, wordSize/str2_chr_size); 9353 __ add(result, result, wordSize/str2_chr_size); 9354 __ add(str2, str2, wordSize); 9355 __ br(__ LT, L_POST_LOOP); 9356 __ BIND(L_LOOP); 9357 __ ldr(ch2, Address(str2)); 9358 __ eor(ch2, first, ch2); 9359 __ sub(tmp2, ch2, tmp1); 9360 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 9361 __ bics(tmp2, tmp2, ch2); 9362 __ br(__ NE, L_HAS_ZERO); 9363 __ BIND(L_LOOP_PROCEED); 9364 __ subs(cnt2, cnt2, wordSize/str2_chr_size); 9365 __ add(str2, str2, wordSize); 9366 __ add(result, result, wordSize/str2_chr_size); 9367 __ br(__ GE, L_LOOP); 9368 __ BIND(L_POST_LOOP); 9369 __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check 9370 __ br(__ LE, NOMATCH); 9371 __ ldr(ch2, Address(str2)); 9372 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift); 9373 __ eor(ch2, first, ch2); 9374 __ sub(tmp2, ch2, tmp1); 9375 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 9376 __ mov(tmp4, -1); // all bits set 9377 __ b(L_SMALL_PROCEED); 9378 __ align(OptoLoopAlignment); 9379 __ BIND(L_SMALL); 9380 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift); 9381 __ eor(ch2, first, ch2); 9382 if (str1_isL != str2_isL) { 9383 __ zip1(v1, __ T16B, v1, v0); 9384 } 9385 __ sub(tmp2, ch2, tmp1); 9386 __ mov(tmp4, -1); // all bits set 9387 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 9388 if (str1_isL != str2_isL) { 9389 __ fmovd(ch1, v1); // move converted 4 symbols 9390 } 9391 __ BIND(L_SMALL_PROCEED); 9392 __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits. 
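  // For reference (an illustrative C sketch, not the emitted code): the
  // sub/orr/bic sequence used in this stub is the classic SWAR trick for finding
  // a given byte in a word. LL case shown; first_repl is the first pattern
  // character replicated into every byte (the mul by 0x0101... above):
  //
  //   uint64_t v       = word ^ first_repl;             // 0x00 in each byte that matches
  //   uint64_t markers = (v - 0x0101010101010101UL)
  //                      & ~(v | 0x7f7f7f7f7f7f7f7fUL); // 0x80 marks the first match;
  //                                                     // later markers are only candidates
  //
  // After the mask in tmp4 discards positions beyond the string, rbit + clz locate
  // the lowest marker; every candidate is still verified by the character-by-character
  // compare loops that follow.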
9393 __ bic(tmp2, tmp2, ch2); 9394 __ ands(tmp2, tmp2, tmp4); // clear useless bits and check 9395 __ rbit(tmp2, tmp2); 9396 __ br(__ EQ, NOMATCH); 9397 __ BIND(L_SMALL_HAS_ZERO_LOOP); 9398 __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some cpu's 9399 __ cmp(cnt1, u1(wordSize/str2_chr_size)); 9400 __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2); 9401 if (str2_isL) { // LL 9402 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index" 9403 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe. 9404 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info 9405 __ add(result, result, tmp4, __ LSR, LogBitsPerByte); 9406 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 9407 } else { 9408 __ mov(ch2, 0xE); // all bits in byte set except last one 9409 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 9410 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 9411 __ lslv(tmp2, tmp2, tmp4); 9412 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 9413 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 9414 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 9415 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 9416 } 9417 __ cmp(ch1, ch2); 9418 __ mov(tmp4, wordSize/str2_chr_size); 9419 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 9420 __ BIND(L_SMALL_CMP_LOOP); 9421 str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift))) 9422 : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift))); 9423 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))) 9424 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))); 9425 __ add(tmp4, tmp4, 1); 9426 __ cmp(tmp4, cnt1); 9427 __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP); 9428 __ cmp(first, ch2); 9429 __ br(__ EQ, L_SMALL_CMP_LOOP); 9430 __ BIND(L_SMALL_CMP_LOOP_NOMATCH); 9431 __ cbz(tmp2, NOMATCH); // no more matches. exit 9432 __ clz(tmp4, tmp2); 9433 __ add(result, result, 1); // advance index 9434 __ add(str2, str2, str2_chr_size); // advance pointer 9435 __ b(L_SMALL_HAS_ZERO_LOOP); 9436 __ align(OptoLoopAlignment); 9437 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP); 9438 __ cmp(first, ch2); 9439 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 9440 __ b(DONE); 9441 __ align(OptoLoopAlignment); 9442 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2); 9443 if (str2_isL) { // LL 9444 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index" 9445 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe. 9446 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info 9447 __ add(result, result, tmp4, __ LSR, LogBitsPerByte); 9448 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 9449 } else { 9450 __ mov(ch2, 0xE); // all bits in byte set except last one 9451 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 9452 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 9453 __ lslv(tmp2, tmp2, tmp4); 9454 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 9455 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 9456 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 9457 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 9458 } 9459 __ cmp(ch1, ch2); 9460 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 9461 __ b(DONE); 9462 __ align(OptoLoopAlignment); 9463 __ BIND(L_HAS_ZERO); 9464 __ rbit(tmp2, tmp2); 9465 __ clz(tmp4, tmp2); // potentially long. 
Up to 4 cycles on some CPU's 9466 // Now, perform compression of counters(cnt2 and cnt1) into one register. 9467 // It's fine because both counters are 32bit and are not changed in this 9468 // loop. Just restore it on exit. So, cnt1 can be re-used in this loop. 9469 __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2); 9470 __ sub(result, result, 1); 9471 __ BIND(L_HAS_ZERO_LOOP); 9472 __ mov(cnt1, wordSize/str2_chr_size); 9473 __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2); 9474 __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare 9475 if (str2_isL) { 9476 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index 9477 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 9478 __ lslv(tmp2, tmp2, tmp4); 9479 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 9480 __ add(tmp4, tmp4, 1); 9481 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 9482 __ lsl(tmp2, tmp2, 1); 9483 __ mov(tmp4, wordSize/str2_chr_size); 9484 } else { 9485 __ mov(ch2, 0xE); 9486 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 9487 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 9488 __ lslv(tmp2, tmp2, tmp4); 9489 __ add(tmp4, tmp4, 1); 9490 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 9491 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); 9492 __ lsl(tmp2, tmp2, 1); 9493 __ mov(tmp4, wordSize/str2_chr_size); 9494 __ sub(str2, str2, str2_chr_size); 9495 } 9496 __ cmp(ch1, ch2); 9497 __ mov(tmp4, wordSize/str2_chr_size); 9498 __ br(__ NE, L_CMP_LOOP_NOMATCH); 9499 __ BIND(L_CMP_LOOP); 9500 str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift))) 9501 : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift))); 9502 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))) 9503 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))); 9504 __ add(tmp4, tmp4, 1); 9505 __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2); 9506 __ br(__ GE, L_CMP_LOOP_LAST_CMP); 9507 __ cmp(cnt1, ch2); 9508 __ br(__ EQ, L_CMP_LOOP); 9509 __ BIND(L_CMP_LOOP_NOMATCH); 9510 // here we're not matched 9511 __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop 9512 __ clz(tmp4, tmp2); 9513 __ add(str2, str2, str2_chr_size); // advance pointer 9514 __ b(L_HAS_ZERO_LOOP); 9515 __ align(OptoLoopAlignment); 9516 __ BIND(L_CMP_LOOP_LAST_CMP); 9517 __ cmp(cnt1, ch2); 9518 __ br(__ NE, L_CMP_LOOP_NOMATCH); 9519 __ b(DONE); 9520 __ align(OptoLoopAlignment); 9521 __ BIND(L_CMP_LOOP_LAST_CMP2); 9522 if (str2_isL) { 9523 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index 9524 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 9525 __ lslv(tmp2, tmp2, tmp4); 9526 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 9527 __ add(tmp4, tmp4, 1); 9528 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 9529 __ lsl(tmp2, tmp2, 1); 9530 } else { 9531 __ mov(ch2, 0xE); 9532 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 9533 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 
9534 __ lslv(tmp2, tmp2, tmp4); 9535 __ add(tmp4, tmp4, 1); 9536 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 9537 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); 9538 __ lsl(tmp2, tmp2, 1); 9539 __ sub(str2, str2, str2_chr_size); 9540 } 9541 __ cmp(ch1, ch2); 9542 __ br(__ NE, L_CMP_LOOP_NOMATCH); 9543 __ b(DONE); 9544 __ align(OptoLoopAlignment); 9545 __ BIND(L_HAS_ZERO_LOOP_NOMATCH); 9546 // 1) Restore "result" index. Index was wordSize/str2_chr_size * N until 9547 // L_HAS_ZERO block. Byte octet was analyzed in L_HAS_ZERO_LOOP, 9548 // so, result was increased at max by wordSize/str2_chr_size - 1, so, 9549 // respective high bit wasn't changed. L_LOOP_PROCEED will increase 9550 // result by analyzed characters value, so, we can just reset lower bits 9551 // in result here. Clear 2 lower bits for UU/UL and 3 bits for LL 9552 // 2) restore cnt1 and cnt2 values from "compressed" cnt2 9553 // 3) advance str2 value to represent next str2 octet. result & 7/3 is 9554 // index of last analyzed substring inside current octet. So, str2 in at 9555 // respective start address. We need to advance it to next octet 9556 __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed 9557 __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2); 9558 __ bfm(result, zr, 0, 2 - str2_chr_shift); 9559 __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2 9560 __ movw(cnt2, cnt2); 9561 __ b(L_LOOP_PROCEED); 9562 __ align(OptoLoopAlignment); 9563 __ BIND(NOMATCH); 9564 __ mov(result, -1); 9565 __ BIND(DONE); 9566 __ pop(spilled_regs, sp); 9567 __ ret(lr); 9568 return entry; 9569 } 9570 9571 void generate_string_indexof_stubs() { 9572 StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true); 9573 StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false); 9574 StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false); 9575 } 9576 9577 void inflate_and_store_2_fp_registers(bool generatePrfm, 9578 FloatRegister src1, FloatRegister src2) { 9579 Register dst = r1; 9580 __ zip1(v1, __ T16B, src1, v0); 9581 __ zip2(v2, __ T16B, src1, v0); 9582 if (generatePrfm) { 9583 __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM); 9584 } 9585 __ zip1(v3, __ T16B, src2, v0); 9586 __ zip2(v4, __ T16B, src2, v0); 9587 __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64))); 9588 } 9589 9590 // R0 = src 9591 // R1 = dst 9592 // R2 = len 9593 // R3 = len >> 3 9594 // V0 = 0 9595 // v1 = loaded 8 bytes 9596 // Clobbers: r0, r1, r3, rscratch1, rflags, v0-v6 9597 address generate_large_byte_array_inflate() { 9598 __ align(CodeEntryAlignment); 9599 StubId stub_id = StubId::stubgen_large_byte_array_inflate_id; 9600 StubCodeMark mark(this, stub_id); 9601 address entry = __ pc(); 9602 Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE; 9603 Register src = r0, dst = r1, len = r2, octetCounter = r3; 9604 const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4; 9605 9606 // do one more 8-byte read to have address 16-byte aligned in most cases 9607 // also use single store instruction 9608 __ ldrd(v2, __ post(src, 8)); 9609 __ sub(octetCounter, octetCounter, 2); 9610 __ zip1(v1, __ T16B, v1, v0); 9611 __ zip1(v2, __ T16B, v2, v0); 9612 __ st1(v1, v2, __ T16B, __ post(dst, 32)); 9613 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 9614 __ subs(rscratch1, octetCounter, large_loop_threshold); 9615 __ br(__ LE, LOOP_START); 9616 __ 
b(LOOP_PRFM_START); 9617 __ bind(LOOP_PRFM); 9618 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 9619 __ bind(LOOP_PRFM_START); 9620 __ prfm(Address(src, SoftwarePrefetchHintDistance)); 9621 __ sub(octetCounter, octetCounter, 8); 9622 __ subs(rscratch1, octetCounter, large_loop_threshold); 9623 inflate_and_store_2_fp_registers(true, v3, v4); 9624 inflate_and_store_2_fp_registers(true, v5, v6); 9625 __ br(__ GT, LOOP_PRFM); 9626 __ cmp(octetCounter, (u1)8); 9627 __ br(__ LT, DONE); 9628 __ bind(LOOP); 9629 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 9630 __ bind(LOOP_START); 9631 __ sub(octetCounter, octetCounter, 8); 9632 __ cmp(octetCounter, (u1)8); 9633 inflate_and_store_2_fp_registers(false, v3, v4); 9634 inflate_and_store_2_fp_registers(false, v5, v6); 9635 __ br(__ GE, LOOP); 9636 __ bind(DONE); 9637 __ ret(lr); 9638 return entry; 9639 } 9640 9641 /** 9642 * Arguments: 9643 * 9644 * Input: 9645 * c_rarg0 - current state address 9646 * c_rarg1 - H key address 9647 * c_rarg2 - data address 9648 * c_rarg3 - number of blocks 9649 * 9650 * Output: 9651 * Updated state at c_rarg0 9652 */ 9653 address generate_ghash_processBlocks() { 9654 // Bafflingly, GCM uses little-endian for the byte order, but 9655 // big-endian for the bit order. For example, the polynomial 1 is 9656 // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00. 9657 // 9658 // So, we must either reverse the bytes in each word and do 9659 // everything big-endian or reverse the bits in each byte and do 9660 // it little-endian. On AArch64 it's more idiomatic to reverse 9661 // the bits in each byte (we have an instruction, RBIT, to do 9662 // that) and keep the data in little-endian bit order through the 9663 // calculation, bit-reversing the inputs and outputs. 9664 9665 StubId stub_id = StubId::stubgen_ghash_processBlocks_id; 9666 StubCodeMark mark(this, stub_id); 9667 __ align(wordSize * 2); 9668 address p = __ pc(); 9669 __ emit_int64(0x87); // The low-order bits of the field 9670 // polynomial (i.e. 
p = z^7+z^2+z+1) 9671 // repeated in the low and high parts of a 9672 // 128-bit vector 9673 __ emit_int64(0x87); 9674 9675 __ align(CodeEntryAlignment); 9676 address start = __ pc(); 9677 9678 Register state = c_rarg0; 9679 Register subkeyH = c_rarg1; 9680 Register data = c_rarg2; 9681 Register blocks = c_rarg3; 9682 9683 FloatRegister vzr = v30; 9684 __ eor(vzr, __ T16B, vzr, vzr); // zero register 9685 9686 __ ldrq(v24, p); // The field polynomial 9687 9688 __ ldrq(v0, Address(state)); 9689 __ ldrq(v1, Address(subkeyH)); 9690 9691 __ rev64(v0, __ T16B, v0); // Bit-reverse words in state and subkeyH 9692 __ rbit(v0, __ T16B, v0); 9693 __ rev64(v1, __ T16B, v1); 9694 __ rbit(v1, __ T16B, v1); 9695 9696 __ ext(v4, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1 9697 __ eor(v4, __ T16B, v4, v1); // xor subkeyH into subkeyL (Karatsuba: (A1+A0)) 9698 9699 { 9700 Label L_ghash_loop; 9701 __ bind(L_ghash_loop); 9702 9703 __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit 9704 // reversing each byte 9705 __ rbit(v2, __ T16B, v2); 9706 __ eor(v2, __ T16B, v0, v2); // bit-swapped data ^ bit-swapped state 9707 9708 // Multiply state in v2 by subkey in v1 9709 __ ghash_multiply(/*result_lo*/v5, /*result_hi*/v7, 9710 /*a*/v1, /*b*/v2, /*a1_xor_a0*/v4, 9711 /*temps*/v6, v3, /*reuse/clobber b*/v2); 9712 // Reduce v7:v5 by the field polynomial 9713 __ ghash_reduce(/*result*/v0, /*lo*/v5, /*hi*/v7, /*p*/v24, vzr, /*temp*/v3); 9714 9715 __ sub(blocks, blocks, 1); 9716 __ cbnz(blocks, L_ghash_loop); 9717 } 9718 9719 // The bit-reversed result is at this point in v0 9720 __ rev64(v0, __ T16B, v0); 9721 __ rbit(v0, __ T16B, v0); 9722 9723 __ st1(v0, __ T16B, state); 9724 __ ret(lr); 9725 9726 return start; 9727 } 9728 9729 address generate_ghash_processBlocks_wide() { 9730 address small = generate_ghash_processBlocks(); 9731 9732 StubId stub_id = StubId::stubgen_ghash_processBlocks_wide_id; 9733 StubCodeMark mark(this, stub_id); 9734 __ align(wordSize * 2); 9735 address p = __ pc(); 9736 __ emit_int64(0x87); // The low-order bits of the field 9737 // polynomial (i.e. p = z^7+z^2+z+1) 9738 // repeated in the low and high parts of a 9739 // 128-bit vector 9740 __ emit_int64(0x87); 9741 9742 __ align(CodeEntryAlignment); 9743 address start = __ pc(); 9744 9745 Register state = c_rarg0; 9746 Register subkeyH = c_rarg1; 9747 Register data = c_rarg2; 9748 Register blocks = c_rarg3; 9749 9750 const int unroll = 4; 9751 9752 __ cmp(blocks, (unsigned char)(unroll * 2)); 9753 __ br(__ LT, small); 9754 9755 if (unroll > 1) { 9756 // Save state before entering routine 9757 __ sub(sp, sp, 4 * 16); 9758 __ st1(v12, v13, v14, v15, __ T16B, Address(sp)); 9759 __ sub(sp, sp, 4 * 16); 9760 __ st1(v8, v9, v10, v11, __ T16B, Address(sp)); 9761 } 9762 9763 __ ghash_processBlocks_wide(p, state, subkeyH, data, blocks, unroll); 9764 9765 if (unroll > 1) { 9766 // And restore state 9767 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16)); 9768 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16)); 9769 } 9770 9771 __ cmp(blocks, (unsigned char)0); 9772 __ br(__ GT, small); 9773 9774 __ ret(lr); 9775 9776 return start; 9777 } 9778 9779 void generate_base64_encode_simdround(Register src, Register dst, 9780 FloatRegister codec, u8 size) { 9781 9782 FloatRegister in0 = v4, in1 = v5, in2 = v6; 9783 FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19; 9784 FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23; 9785 9786 Assembler::SIMD_Arrangement arrangement = size == 16 ? 
__ T16B : __ T8B; 9787 9788 __ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size)); 9789 9790 __ ushr(ind0, arrangement, in0, 2); 9791 9792 __ ushr(ind1, arrangement, in1, 2); 9793 __ shl(in0, arrangement, in0, 6); 9794 __ orr(ind1, arrangement, ind1, in0); 9795 __ ushr(ind1, arrangement, ind1, 2); 9796 9797 __ ushr(ind2, arrangement, in2, 4); 9798 __ shl(in1, arrangement, in1, 4); 9799 __ orr(ind2, arrangement, in1, ind2); 9800 __ ushr(ind2, arrangement, ind2, 2); 9801 9802 __ shl(ind3, arrangement, in2, 2); 9803 __ ushr(ind3, arrangement, ind3, 2); 9804 9805 __ tbl(out0, arrangement, codec, 4, ind0); 9806 __ tbl(out1, arrangement, codec, 4, ind1); 9807 __ tbl(out2, arrangement, codec, 4, ind2); 9808 __ tbl(out3, arrangement, codec, 4, ind3); 9809 9810 __ st4(out0, out1, out2, out3, arrangement, __ post(dst, 4 * size)); 9811 } 9812 9813 /** 9814 * Arguments: 9815 * 9816 * Input: 9817 * c_rarg0 - src_start 9818 * c_rarg1 - src_offset 9819 * c_rarg2 - src_length 9820 * c_rarg3 - dest_start 9821 * c_rarg4 - dest_offset 9822 * c_rarg5 - isURL 9823 * 9824 */ 9825 address generate_base64_encodeBlock() { 9826 9827 static const char toBase64[64] = { 9828 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 9829 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 9830 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 9831 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 9832 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/' 9833 }; 9834 9835 static const char toBase64URL[64] = { 9836 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 9837 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 9838 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 9839 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 9840 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_' 9841 }; 9842 9843 __ align(CodeEntryAlignment); 9844 StubId stub_id = StubId::stubgen_base64_encodeBlock_id; 9845 StubCodeMark mark(this, stub_id); 9846 address start = __ pc(); 9847 9848 Register src = c_rarg0; // source array 9849 Register soff = c_rarg1; // source start offset 9850 Register send = c_rarg2; // source end offset 9851 Register dst = c_rarg3; // dest array 9852 Register doff = c_rarg4; // position for writing to dest array 9853 Register isURL = c_rarg5; // Base64 or URL character set 9854 9855 // c_rarg6 and c_rarg7 are free to use as temps 9856 Register codec = c_rarg6; 9857 Register length = c_rarg7; 9858 9859 Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit; 9860 9861 __ add(src, src, soff); 9862 __ add(dst, dst, doff); 9863 __ sub(length, send, soff); 9864 9865 // load the codec base address 9866 __ lea(codec, ExternalAddress((address) toBase64)); 9867 __ cbz(isURL, ProcessData); 9868 __ lea(codec, ExternalAddress((address) toBase64URL)); 9869 9870 __ BIND(ProcessData); 9871 9872 // too short to formup a SIMD loop, roll back 9873 __ cmp(length, (u1)24); 9874 __ br(Assembler::LT, Process3B); 9875 9876 __ ld1(v0, v1, v2, v3, __ T16B, Address(codec)); 9877 9878 __ BIND(Process48B); 9879 __ cmp(length, (u1)48); 9880 __ br(Assembler::LT, Process24B); 9881 generate_base64_encode_simdround(src, dst, v0, 16); 9882 __ sub(length, length, 48); 9883 __ b(Process48B); 9884 9885 __ BIND(Process24B); 9886 __ cmp(length, (u1)24); 9887 __ br(Assembler::LT, SIMDExit); 9888 generate_base64_encode_simdround(src, dst, v0, 8); 9889 __ sub(length, length, 24); 9890 9891 __ BIND(SIMDExit); 9892 __ 
cbz(length, Exit); 9893 9894 __ BIND(Process3B); 9895 // 3 src bytes, 24 bits 9896 __ ldrb(r10, __ post(src, 1)); 9897 __ ldrb(r11, __ post(src, 1)); 9898 __ ldrb(r12, __ post(src, 1)); 9899 __ orrw(r11, r11, r10, Assembler::LSL, 8); 9900 __ orrw(r12, r12, r11, Assembler::LSL, 8); 9901 // codec index 9902 __ ubfmw(r15, r12, 18, 23); 9903 __ ubfmw(r14, r12, 12, 17); 9904 __ ubfmw(r13, r12, 6, 11); 9905 __ andw(r12, r12, 63); 9906 // get the code based on the codec 9907 __ ldrb(r15, Address(codec, r15, Address::uxtw(0))); 9908 __ ldrb(r14, Address(codec, r14, Address::uxtw(0))); 9909 __ ldrb(r13, Address(codec, r13, Address::uxtw(0))); 9910 __ ldrb(r12, Address(codec, r12, Address::uxtw(0))); 9911 __ strb(r15, __ post(dst, 1)); 9912 __ strb(r14, __ post(dst, 1)); 9913 __ strb(r13, __ post(dst, 1)); 9914 __ strb(r12, __ post(dst, 1)); 9915 __ sub(length, length, 3); 9916 __ cbnz(length, Process3B); 9917 9918 __ BIND(Exit); 9919 __ ret(lr); 9920 9921 return start; 9922 } 9923 9924 void generate_base64_decode_simdround(Register src, Register dst, 9925 FloatRegister codecL, FloatRegister codecH, int size, Label& Exit) { 9926 9927 FloatRegister in0 = v16, in1 = v17, in2 = v18, in3 = v19; 9928 FloatRegister out0 = v20, out1 = v21, out2 = v22; 9929 9930 FloatRegister decL0 = v23, decL1 = v24, decL2 = v25, decL3 = v26; 9931 FloatRegister decH0 = v28, decH1 = v29, decH2 = v30, decH3 = v31; 9932 9933 Label NoIllegalData, ErrorInLowerHalf, StoreLegalData; 9934 9935 Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B; 9936 9937 __ ld4(in0, in1, in2, in3, arrangement, __ post(src, 4 * size)); 9938 9939 // we need unsigned saturating subtract, to make sure all input values 9940 // in range [0, 63] will have 0U value in the higher half lookup 9941 __ uqsubv(decH0, __ T16B, in0, v27); 9942 __ uqsubv(decH1, __ T16B, in1, v27); 9943 __ uqsubv(decH2, __ T16B, in2, v27); 9944 __ uqsubv(decH3, __ T16B, in3, v27); 9945 9946 // lower half lookup 9947 __ tbl(decL0, arrangement, codecL, 4, in0); 9948 __ tbl(decL1, arrangement, codecL, 4, in1); 9949 __ tbl(decL2, arrangement, codecL, 4, in2); 9950 __ tbl(decL3, arrangement, codecL, 4, in3); 9951 9952 // higher half lookup 9953 __ tbx(decH0, arrangement, codecH, 4, decH0); 9954 __ tbx(decH1, arrangement, codecH, 4, decH1); 9955 __ tbx(decH2, arrangement, codecH, 4, decH2); 9956 __ tbx(decH3, arrangement, codecH, 4, decH3); 9957 9958 // combine lower and higher 9959 __ orr(decL0, arrangement, decL0, decH0); 9960 __ orr(decL1, arrangement, decL1, decH1); 9961 __ orr(decL2, arrangement, decL2, decH2); 9962 __ orr(decL3, arrangement, decL3, decH3); 9963 9964 // check illegal inputs, value larger than 63 (maximum of 6 bits) 9965 __ cm(Assembler::HI, decH0, arrangement, decL0, v27); 9966 __ cm(Assembler::HI, decH1, arrangement, decL1, v27); 9967 __ cm(Assembler::HI, decH2, arrangement, decL2, v27); 9968 __ cm(Assembler::HI, decH3, arrangement, decL3, v27); 9969 __ orr(in0, arrangement, decH0, decH1); 9970 __ orr(in1, arrangement, decH2, decH3); 9971 __ orr(in2, arrangement, in0, in1); 9972 __ umaxv(in3, arrangement, in2); 9973 __ umov(rscratch2, in3, __ B, 0); 9974 9975 // get the data to output 9976 __ shl(out0, arrangement, decL0, 2); 9977 __ ushr(out1, arrangement, decL1, 4); 9978 __ orr(out0, arrangement, out0, out1); 9979 __ shl(out1, arrangement, decL1, 4); 9980 __ ushr(out2, arrangement, decL2, 2); 9981 __ orr(out1, arrangement, out1, out2); 9982 __ shl(out2, arrangement, decL2, 6); 9983 __ orr(out2, arrangement, out2, decL3); 9984 9985 __ 
cbz(rscratch2, NoIllegalData); 9986 9987 // handle illegal input 9988 __ umov(r10, in2, __ D, 0); 9989 if (size == 16) { 9990 __ cbnz(r10, ErrorInLowerHalf); 9991 9992 // illegal input is in higher half, store the lower half now. 9993 __ st3(out0, out1, out2, __ T8B, __ post(dst, 24)); 9994 9995 __ umov(r10, in2, __ D, 1); 9996 __ umov(r11, out0, __ D, 1); 9997 __ umov(r12, out1, __ D, 1); 9998 __ umov(r13, out2, __ D, 1); 9999 __ b(StoreLegalData); 10000 10001 __ BIND(ErrorInLowerHalf); 10002 } 10003 __ umov(r11, out0, __ D, 0); 10004 __ umov(r12, out1, __ D, 0); 10005 __ umov(r13, out2, __ D, 0); 10006 10007 __ BIND(StoreLegalData); 10008 __ tbnz(r10, 5, Exit); // 0xff indicates illegal input 10009 __ strb(r11, __ post(dst, 1)); 10010 __ strb(r12, __ post(dst, 1)); 10011 __ strb(r13, __ post(dst, 1)); 10012 __ lsr(r10, r10, 8); 10013 __ lsr(r11, r11, 8); 10014 __ lsr(r12, r12, 8); 10015 __ lsr(r13, r13, 8); 10016 __ b(StoreLegalData); 10017 10018 __ BIND(NoIllegalData); 10019 __ st3(out0, out1, out2, arrangement, __ post(dst, 3 * size)); 10020 } 10021 10022 10023 /** 10024 * Arguments: 10025 * 10026 * Input: 10027 * c_rarg0 - src_start 10028 * c_rarg1 - src_offset 10029 * c_rarg2 - src_length 10030 * c_rarg3 - dest_start 10031 * c_rarg4 - dest_offset 10032 * c_rarg5 - isURL 10033 * c_rarg6 - isMIME 10034 * 10035 */ 10036 address generate_base64_decodeBlock() { 10037 10038 // The SIMD part of this Base64 decode intrinsic is based on the algorithm outlined 10039 // on http://0x80.pl/articles/base64-simd-neon.html#encoding-quadwords, in section 10040 // titled "Base64 decoding". 10041 10042 // Non-SIMD lookup tables are mostly dumped from fromBase64 array used in java.util.Base64, 10043 // except the trailing character '=' is also treated illegal value in this intrinsic. That 10044 // is java.util.Base64.fromBase64['='] = -2, while fromBase(URL)64ForNoSIMD['='] = 255 here. 
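    // As an illustration only (not generated code), the scalar decode of one
    // 4-character group with such a 256-entry table is roughly:
    //
    //   uint8_t b0 = tab[src[0]], b1 = tab[src[1]], b2 = tab[src[2]], b3 = tab[src[3]];
    //   if ((b0 | b1 | b2 | b3) & 0x80) break;      // 255u flags an illegal character
    //   uint32_t bits = (b0 << 18) | (b1 << 12) | (b2 << 6) | b3;  // 4 x 6 = 24 bits
    //   dst[0] = bits >> 16; dst[1] = bits >> 8; dst[2] = bits;    // 3 output bytes
    //
    // which is what the Process4B loop below implements (with a byte-order
    // trick so that it can store two of the output bytes at once).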
10045 static const uint8_t fromBase64ForNoSIMD[256] = { 10046 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10047 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10048 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 255u, 63u, 10049 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 10050 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u, 10051 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 255u, 10052 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u, 10053 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u, 10054 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10055 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10056 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10057 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10058 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10059 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10060 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10061 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10062 }; 10063 10064 static const uint8_t fromBase64URLForNoSIMD[256] = { 10065 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10066 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10067 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 10068 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 10069 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u, 10070 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 63u, 10071 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u, 10072 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u, 10073 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10074 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10075 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10076 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10077 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10078 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10079 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10080 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10081 }; 10082 10083 // A legal value of base64 code is in range [0, 127]. We need two lookups 10084 // with tbl/tbx and combine them to get the decode data. The 1st table vector 10085 // lookup use tbl, out of range indices are set to 0 in destination. 
The 2nd 10086 // table vector lookup use tbx, out of range indices are unchanged in 10087 // destination. Input [64..126] is mapped to index [65, 127] in second lookup. 10088 // The value of index 64 is set to 0, so that we know that we already get the 10089 // decoded data with the 1st lookup. 10090 static const uint8_t fromBase64ForSIMD[128] = { 10091 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10092 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10093 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 255u, 63u, 10094 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 10095 0u, 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 10096 14u, 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 10097 255u, 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 10098 40u, 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 10099 }; 10100 10101 static const uint8_t fromBase64URLForSIMD[128] = { 10102 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10103 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10104 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 10105 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 10106 0u, 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 10107 14u, 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 10108 63u, 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 10109 40u, 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 10110 }; 10111 10112 __ align(CodeEntryAlignment); 10113 StubId stub_id = StubId::stubgen_base64_decodeBlock_id; 10114 StubCodeMark mark(this, stub_id); 10115 address start = __ pc(); 10116 10117 Register src = c_rarg0; // source array 10118 Register soff = c_rarg1; // source start offset 10119 Register send = c_rarg2; // source end offset 10120 Register dst = c_rarg3; // dest array 10121 Register doff = c_rarg4; // position for writing to dest array 10122 Register isURL = c_rarg5; // Base64 or URL character set 10123 Register isMIME = c_rarg6; // Decoding MIME block - unused in this implementation 10124 10125 Register length = send; // reuse send as length of source data to process 10126 10127 Register simd_codec = c_rarg6; 10128 Register nosimd_codec = c_rarg7; 10129 10130 Label ProcessData, Process64B, Process32B, Process4B, SIMDEnter, SIMDExit, Exit; 10131 10132 __ enter(); 10133 10134 __ add(src, src, soff); 10135 __ add(dst, dst, doff); 10136 10137 __ mov(doff, dst); 10138 10139 __ sub(length, send, soff); 10140 __ bfm(length, zr, 0, 1); 10141 10142 __ lea(nosimd_codec, ExternalAddress((address) fromBase64ForNoSIMD)); 10143 __ cbz(isURL, ProcessData); 10144 __ lea(nosimd_codec, ExternalAddress((address) fromBase64URLForNoSIMD)); 10145 10146 __ BIND(ProcessData); 10147 __ mov(rscratch1, length); 10148 __ cmp(length, (u1)144); // 144 = 80 + 64 10149 __ br(Assembler::LT, Process4B); 10150 10151 // In the MIME case, the line length cannot be more than 76 10152 // bytes (see RFC 2045). This is too short a block for SIMD 10153 // to be worthwhile, so we use non-SIMD here. 
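    // When length >= 144 we fall through here and decode the first 80 bytes
    // (20 groups of 4) in the scalar Process4B loop below: rscratch1 starts
    // at 79 and is decremented by 4 per round, reaching -1 after exactly 20
    // rounds. That leaves length - 80 >= 64 bytes for the SIMD path, and the
    // cbzw(rscratch1, Exit) test after the loop distinguishes this case from
    // a purely scalar run (which ends with rscratch1 == 0).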
10154 __ movw(rscratch1, 79); 10155 10156 __ BIND(Process4B); 10157 __ ldrw(r14, __ post(src, 4)); 10158 __ ubfxw(r10, r14, 0, 8); 10159 __ ubfxw(r11, r14, 8, 8); 10160 __ ubfxw(r12, r14, 16, 8); 10161 __ ubfxw(r13, r14, 24, 8); 10162 // get the de-code 10163 __ ldrb(r10, Address(nosimd_codec, r10, Address::uxtw(0))); 10164 __ ldrb(r11, Address(nosimd_codec, r11, Address::uxtw(0))); 10165 __ ldrb(r12, Address(nosimd_codec, r12, Address::uxtw(0))); 10166 __ ldrb(r13, Address(nosimd_codec, r13, Address::uxtw(0))); 10167 // error detection, 255u indicates an illegal input 10168 __ orrw(r14, r10, r11); 10169 __ orrw(r15, r12, r13); 10170 __ orrw(r14, r14, r15); 10171 __ tbnz(r14, 7, Exit); 10172 // recover the data 10173 __ lslw(r14, r10, 10); 10174 __ bfiw(r14, r11, 4, 6); 10175 __ bfmw(r14, r12, 2, 5); 10176 __ rev16w(r14, r14); 10177 __ bfiw(r13, r12, 6, 2); 10178 __ strh(r14, __ post(dst, 2)); 10179 __ strb(r13, __ post(dst, 1)); 10180 // non-simd loop 10181 __ subsw(rscratch1, rscratch1, 4); 10182 __ br(Assembler::GT, Process4B); 10183 10184 // if exiting from PreProcess80B, rscratch1 == -1; 10185 // otherwise, rscratch1 == 0. 10186 __ cbzw(rscratch1, Exit); 10187 __ sub(length, length, 80); 10188 10189 __ lea(simd_codec, ExternalAddress((address) fromBase64ForSIMD)); 10190 __ cbz(isURL, SIMDEnter); 10191 __ lea(simd_codec, ExternalAddress((address) fromBase64URLForSIMD)); 10192 10193 __ BIND(SIMDEnter); 10194 __ ld1(v0, v1, v2, v3, __ T16B, __ post(simd_codec, 64)); 10195 __ ld1(v4, v5, v6, v7, __ T16B, Address(simd_codec)); 10196 __ mov(rscratch1, 63); 10197 __ dup(v27, __ T16B, rscratch1); 10198 10199 __ BIND(Process64B); 10200 __ cmp(length, (u1)64); 10201 __ br(Assembler::LT, Process32B); 10202 generate_base64_decode_simdround(src, dst, v0, v4, 16, Exit); 10203 __ sub(length, length, 64); 10204 __ b(Process64B); 10205 10206 __ BIND(Process32B); 10207 __ cmp(length, (u1)32); 10208 __ br(Assembler::LT, SIMDExit); 10209 generate_base64_decode_simdround(src, dst, v0, v4, 8, Exit); 10210 __ sub(length, length, 32); 10211 __ b(Process32B); 10212 10213 __ BIND(SIMDExit); 10214 __ cbz(length, Exit); 10215 __ movw(rscratch1, length); 10216 __ b(Process4B); 10217 10218 __ BIND(Exit); 10219 __ sub(c_rarg0, dst, doff); 10220 10221 __ leave(); 10222 __ ret(lr); 10223 10224 return start; 10225 } 10226 10227 // Support for spin waits. 
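  // MacroAssembler::spin_wait() expands to the configured on-spin-wait hint
  // sequence (e.g. NOP, ISB or YIELD, repeated as selected by the
  // OnSpinWaitInst/OnSpinWaitInstCount options).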
10228 address generate_spin_wait() { 10229 __ align(CodeEntryAlignment); 10230 StubId stub_id = StubId::stubgen_spin_wait_id; 10231 StubCodeMark mark(this, stub_id); 10232 address start = __ pc(); 10233 10234 __ spin_wait(); 10235 __ ret(lr); 10236 10237 return start; 10238 } 10239 10240 void generate_lookup_secondary_supers_table_stub() { 10241 StubId stub_id = StubId::stubgen_lookup_secondary_supers_table_id; 10242 StubCodeMark mark(this, stub_id); 10243 10244 const Register 10245 r_super_klass = r0, 10246 r_array_base = r1, 10247 r_array_length = r2, 10248 r_array_index = r3, 10249 r_sub_klass = r4, 10250 r_bitmap = rscratch2, 10251 result = r5; 10252 const FloatRegister 10253 vtemp = v0; 10254 10255 for (int slot = 0; slot < Klass::SECONDARY_SUPERS_TABLE_SIZE; slot++) { 10256 StubRoutines::_lookup_secondary_supers_table_stubs[slot] = __ pc(); 10257 Label L_success; 10258 __ enter(); 10259 __ lookup_secondary_supers_table_const(r_sub_klass, r_super_klass, 10260 r_array_base, r_array_length, r_array_index, 10261 vtemp, result, slot, 10262 /*stub_is_near*/true); 10263 __ leave(); 10264 __ ret(lr); 10265 } 10266 } 10267 10268 // Slow path implementation for UseSecondarySupersTable. 10269 address generate_lookup_secondary_supers_table_slow_path_stub() { 10270 StubId stub_id = StubId::stubgen_lookup_secondary_supers_table_slow_path_id; 10271 StubCodeMark mark(this, stub_id); 10272 10273 address start = __ pc(); 10274 const Register 10275 r_super_klass = r0, // argument 10276 r_array_base = r1, // argument 10277 temp1 = r2, // temp 10278 r_array_index = r3, // argument 10279 r_bitmap = rscratch2, // argument 10280 result = r5; // argument 10281 10282 __ lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base, r_array_index, r_bitmap, temp1, result); 10283 __ ret(lr); 10284 10285 return start; 10286 } 10287 10288 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS) 10289 10290 // ARMv8.1 LSE versions of the atomic stubs used by Atomic::PlatformXX. 10291 // 10292 // If LSE is in use, generate LSE versions of all the stubs. The 10293 // non-LSE versions are in atomic_aarch64.S. 10294 10295 // class AtomicStubMark records the entry point of a stub and the 10296 // stub pointer which will point to it. The stub pointer is set to 10297 // the entry point when ~AtomicStubMark() is called, which must be 10298 // after ICache::invalidate_range. This ensures safe publication of 10299 // the generated code. 10300 class AtomicStubMark { 10301 address _entry_point; 10302 aarch64_atomic_stub_t *_stub; 10303 MacroAssembler *_masm; 10304 public: 10305 AtomicStubMark(MacroAssembler *masm, aarch64_atomic_stub_t *stub) { 10306 _masm = masm; 10307 __ align(32); 10308 _entry_point = __ pc(); 10309 _stub = stub; 10310 } 10311 ~AtomicStubMark() { 10312 *_stub = (aarch64_atomic_stub_t)_entry_point; 10313 } 10314 }; 10315 10316 // NB: For memory_order_conservative we need a trailing membar after 10317 // LSE atomic operations but not a leading membar. 10318 // 10319 // We don't need a leading membar because a clause in the Arm ARM 10320 // says: 10321 // 10322 // Barrier-ordered-before 10323 // 10324 // Barrier instructions order prior Memory effects before subsequent 10325 // Memory effects generated by the same Observer. A read or a write 10326 // RW1 is Barrier-ordered-before a read or a write RW 2 from the same 10327 // Observer if and only if RW1 appears in program order before RW 2 10328 // and [ ... 
] at least one of RW 1 and RW 2 is generated by an atomic 10329 // instruction with both Acquire and Release semantics. 10330 // 10331 // All the atomic instructions {ldaddal, swapal, casal} have Acquire 10332 // and Release semantics, therefore we don't need a leading 10333 // barrier. However, there is no corresponding Barrier-ordered-after 10334 // relationship, therefore we need a trailing membar to prevent a 10335 // later store or load from being reordered with the store in an 10336 // atomic instruction. 10337 // 10338 // This was checked by using the herd7 consistency model simulator 10339 // (http://diy.inria.fr/) with this test case: 10340 // 10341 // AArch64 LseCas 10342 // { 0:X1=x; 0:X2=y; 1:X1=x; 1:X2=y; } 10343 // P0 | P1; 10344 // LDR W4, [X2] | MOV W3, #0; 10345 // DMB LD | MOV W4, #1; 10346 // LDR W3, [X1] | CASAL W3, W4, [X1]; 10347 // | DMB ISH; 10348 // | STR W4, [X2]; 10349 // exists 10350 // (0:X3=0 /\ 0:X4=1) 10351 // 10352 // If X3 == 0 && X4 == 1, the store to y in P1 has been reordered 10353 // with the store to x in P1. Without the DMB in P1 this may happen. 10354 // 10355 // At the time of writing we don't know of any AArch64 hardware that 10356 // reorders stores in this way, but the Reference Manual permits it. 10357 10358 void gen_cas_entry(Assembler::operand_size size, 10359 atomic_memory_order order) { 10360 Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1, 10361 exchange_val = c_rarg2; 10362 bool acquire, release; 10363 switch (order) { 10364 case memory_order_relaxed: 10365 acquire = false; 10366 release = false; 10367 break; 10368 case memory_order_release: 10369 acquire = false; 10370 release = true; 10371 break; 10372 default: 10373 acquire = true; 10374 release = true; 10375 break; 10376 } 10377 __ mov(prev, compare_val); 10378 __ lse_cas(prev, exchange_val, ptr, size, acquire, release, /*not_pair*/true); 10379 if (order == memory_order_conservative) { 10380 __ membar(Assembler::StoreStore|Assembler::StoreLoad); 10381 } 10382 if (size == Assembler::xword) { 10383 __ mov(r0, prev); 10384 } else { 10385 __ movw(r0, prev); 10386 } 10387 __ ret(lr); 10388 } 10389 10390 void gen_ldadd_entry(Assembler::operand_size size, atomic_memory_order order) { 10391 Register prev = r2, addr = c_rarg0, incr = c_rarg1; 10392 // If not relaxed, then default to conservative. Relaxed is the only 10393 // case we use enough to be worth specializing. 10394 if (order == memory_order_relaxed) { 10395 __ ldadd(size, incr, prev, addr); 10396 } else { 10397 __ ldaddal(size, incr, prev, addr); 10398 __ membar(Assembler::StoreStore|Assembler::StoreLoad); 10399 } 10400 if (size == Assembler::xword) { 10401 __ mov(r0, prev); 10402 } else { 10403 __ movw(r0, prev); 10404 } 10405 __ ret(lr); 10406 } 10407 10408 void gen_swpal_entry(Assembler::operand_size size) { 10409 Register prev = r2, addr = c_rarg0, incr = c_rarg1; 10410 __ swpal(size, incr, prev, addr); 10411 __ membar(Assembler::StoreStore|Assembler::StoreLoad); 10412 if (size == Assembler::xword) { 10413 __ mov(r0, prev); 10414 } else { 10415 __ movw(r0, prev); 10416 } 10417 __ ret(lr); 10418 } 10419 10420 void generate_atomic_entry_points() { 10421 if (! 
UseLSE) { 10422 return; 10423 } 10424 __ align(CodeEntryAlignment); 10425 StubId stub_id = StubId::stubgen_atomic_entry_points_id; 10426 StubCodeMark mark(this, stub_id); 10427 address first_entry = __ pc(); 10428 10429 // ADD, memory_order_conservative 10430 AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl); 10431 gen_ldadd_entry(Assembler::word, memory_order_conservative); 10432 AtomicStubMark mark_fetch_add_8(_masm, &aarch64_atomic_fetch_add_8_impl); 10433 gen_ldadd_entry(Assembler::xword, memory_order_conservative); 10434 10435 // ADD, memory_order_relaxed 10436 AtomicStubMark mark_fetch_add_4_relaxed 10437 (_masm, &aarch64_atomic_fetch_add_4_relaxed_impl); 10438 gen_ldadd_entry(MacroAssembler::word, memory_order_relaxed); 10439 AtomicStubMark mark_fetch_add_8_relaxed 10440 (_masm, &aarch64_atomic_fetch_add_8_relaxed_impl); 10441 gen_ldadd_entry(MacroAssembler::xword, memory_order_relaxed); 10442 10443 // XCHG, memory_order_conservative 10444 AtomicStubMark mark_xchg_4(_masm, &aarch64_atomic_xchg_4_impl); 10445 gen_swpal_entry(Assembler::word); 10446 AtomicStubMark mark_xchg_8_impl(_masm, &aarch64_atomic_xchg_8_impl); 10447 gen_swpal_entry(Assembler::xword); 10448 10449 // CAS, memory_order_conservative 10450 AtomicStubMark mark_cmpxchg_1(_masm, &aarch64_atomic_cmpxchg_1_impl); 10451 gen_cas_entry(MacroAssembler::byte, memory_order_conservative); 10452 AtomicStubMark mark_cmpxchg_4(_masm, &aarch64_atomic_cmpxchg_4_impl); 10453 gen_cas_entry(MacroAssembler::word, memory_order_conservative); 10454 AtomicStubMark mark_cmpxchg_8(_masm, &aarch64_atomic_cmpxchg_8_impl); 10455 gen_cas_entry(MacroAssembler::xword, memory_order_conservative); 10456 10457 // CAS, memory_order_relaxed 10458 AtomicStubMark mark_cmpxchg_1_relaxed 10459 (_masm, &aarch64_atomic_cmpxchg_1_relaxed_impl); 10460 gen_cas_entry(MacroAssembler::byte, memory_order_relaxed); 10461 AtomicStubMark mark_cmpxchg_4_relaxed 10462 (_masm, &aarch64_atomic_cmpxchg_4_relaxed_impl); 10463 gen_cas_entry(MacroAssembler::word, memory_order_relaxed); 10464 AtomicStubMark mark_cmpxchg_8_relaxed 10465 (_masm, &aarch64_atomic_cmpxchg_8_relaxed_impl); 10466 gen_cas_entry(MacroAssembler::xword, memory_order_relaxed); 10467 10468 AtomicStubMark mark_cmpxchg_4_release 10469 (_masm, &aarch64_atomic_cmpxchg_4_release_impl); 10470 gen_cas_entry(MacroAssembler::word, memory_order_release); 10471 AtomicStubMark mark_cmpxchg_8_release 10472 (_masm, &aarch64_atomic_cmpxchg_8_release_impl); 10473 gen_cas_entry(MacroAssembler::xword, memory_order_release); 10474 10475 AtomicStubMark mark_cmpxchg_4_seq_cst 10476 (_masm, &aarch64_atomic_cmpxchg_4_seq_cst_impl); 10477 gen_cas_entry(MacroAssembler::word, memory_order_seq_cst); 10478 AtomicStubMark mark_cmpxchg_8_seq_cst 10479 (_masm, &aarch64_atomic_cmpxchg_8_seq_cst_impl); 10480 gen_cas_entry(MacroAssembler::xword, memory_order_seq_cst); 10481 10482 ICache::invalidate_range(first_entry, __ pc() - first_entry); 10483 } 10484 #endif // LINUX 10485 10486 static void save_return_registers(MacroAssembler* masm) { 10487 if (InlineTypeReturnedAsFields) { 10488 masm->push(RegSet::range(r0, r7), sp); 10489 masm->sub(sp, sp, 4 * wordSize); 10490 masm->st1(v0, v1, v2, v3, masm->T1D, Address(sp)); 10491 masm->sub(sp, sp, 4 * wordSize); 10492 masm->st1(v4, v5, v6, v7, masm->T1D, Address(sp)); 10493 } else { 10494 masm->fmovd(rscratch1, v0); 10495 masm->stp(rscratch1, r0, Address(masm->pre(sp, -2 * wordSize))); 10496 } 10497 } 10498 10499 static void restore_return_registers(MacroAssembler* masm) { 
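    // Mirror of save_return_registers above: reload in reverse order
    // (v4..v7, then v0..v3, then r0..r7), or the single fp/int return pair.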
10500 if (InlineTypeReturnedAsFields) { 10501 masm->ld1(v4, v5, v6, v7, masm->T1D, Address(masm->post(sp, 4 * wordSize))); 10502 masm->ld1(v0, v1, v2, v3, masm->T1D, Address(masm->post(sp, 4 * wordSize))); 10503 masm->pop(RegSet::range(r0, r7), sp); 10504 } else { 10505 masm->ldp(rscratch1, r0, Address(masm->post(sp, 2 * wordSize))); 10506 masm->fmovd(v0, rscratch1); 10507 } 10508 } 10509 10510 address generate_cont_thaw(Continuation::thaw_kind kind) { 10511 bool return_barrier = Continuation::is_thaw_return_barrier(kind); 10512 bool return_barrier_exception = Continuation::is_thaw_return_barrier_exception(kind); 10513 10514 address start = __ pc(); 10515 10516 if (return_barrier) { 10517 __ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())); 10518 __ mov(sp, rscratch1); 10519 } 10520 assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp"); 10521 10522 if (return_barrier) { 10523 // preserve possible return value from a method returning to the return barrier 10524 save_return_registers(_masm); 10525 } 10526 10527 __ movw(c_rarg1, (return_barrier ? 1 : 0)); 10528 __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), rthread, c_rarg1); 10529 __ mov(rscratch2, r0); // r0 contains the size of the frames to thaw, 0 if overflow or no more frames 10530 10531 if (return_barrier) { 10532 // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK) 10533 restore_return_registers(_masm); 10534 } 10535 assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp"); 10536 10537 10538 Label thaw_success; 10539 // rscratch2 contains the size of the frames to thaw, 0 if overflow or no more frames 10540 __ cbnz(rscratch2, thaw_success); 10541 __ lea(rscratch1, RuntimeAddress(SharedRuntime::throw_StackOverflowError_entry())); 10542 __ br(rscratch1); 10543 __ bind(thaw_success); 10544 10545 // make room for the thawed frames 10546 __ sub(rscratch1, sp, rscratch2); 10547 __ andr(rscratch1, rscratch1, -16); // align 10548 __ mov(sp, rscratch1); 10549 10550 if (return_barrier) { 10551 // save original return value -- again 10552 save_return_registers(_masm); 10553 } 10554 10555 // If we want, we can templatize thaw by kind, and have three different entries 10556 __ movw(c_rarg1, (uint32_t)kind); 10557 10558 __ call_VM_leaf(Continuation::thaw_entry(), rthread, c_rarg1); 10559 __ mov(rscratch2, r0); // r0 is the sp of the yielding frame 10560 10561 if (return_barrier) { 10562 // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK) 10563 restore_return_registers(_masm); 10564 } else { 10565 __ mov(r0, zr); // return 0 (success) from doYield 10566 } 10567 10568 // we're now on the yield frame (which is in an address above us b/c rsp has been pushed down) 10569 __ sub(sp, rscratch2, 2*wordSize); // now pointing to rfp spill 10570 __ mov(rfp, sp); 10571 10572 if (return_barrier_exception) { 10573 __ ldr(c_rarg1, Address(rfp, wordSize)); // return address 10574 __ authenticate_return_address(c_rarg1); 10575 __ verify_oop(r0); 10576 // save return value containing the exception oop in callee-saved R19 10577 __ mov(r19, r0); 10578 10579 __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), rthread, c_rarg1); 10580 10581 // Reinitialize the ptrue predicate register, in case the external runtime call 
clobbers ptrue reg, as we may return to SVE compiled code. 10582 // __ reinitialize_ptrue(); 10583 10584 // see OptoRuntime::generate_exception_blob: r0 -- exception oop, r3 -- exception pc 10585 10586 __ mov(r1, r0); // the exception handler 10587 __ mov(r0, r19); // restore return value containing the exception oop 10588 __ verify_oop(r0); 10589 10590 __ leave(); 10591 __ mov(r3, lr); 10592 __ br(r1); // the exception handler 10593 } else { 10594 // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame 10595 __ leave(); 10596 __ ret(lr); 10597 } 10598 10599 return start; 10600 } 10601 10602 address generate_cont_thaw() { 10603 if (!Continuations::enabled()) return nullptr; 10604 10605 StubId stub_id = StubId::stubgen_cont_thaw_id; 10606 StubCodeMark mark(this, stub_id); 10607 address start = __ pc(); 10608 generate_cont_thaw(Continuation::thaw_top); 10609 return start; 10610 } 10611 10612 address generate_cont_returnBarrier() { 10613 if (!Continuations::enabled()) return nullptr; 10614 10615 // TODO: will probably need multiple return barriers depending on return type 10616 StubId stub_id = StubId::stubgen_cont_returnBarrier_id; 10617 StubCodeMark mark(this, stub_id); 10618 address start = __ pc(); 10619 10620 generate_cont_thaw(Continuation::thaw_return_barrier); 10621 10622 return start; 10623 } 10624 10625 address generate_cont_returnBarrier_exception() { 10626 if (!Continuations::enabled()) return nullptr; 10627 10628 StubId stub_id = StubId::stubgen_cont_returnBarrierExc_id; 10629 StubCodeMark mark(this, stub_id); 10630 address start = __ pc(); 10631 10632 generate_cont_thaw(Continuation::thaw_return_barrier_exception); 10633 10634 return start; 10635 } 10636 10637 address generate_cont_preempt_stub() { 10638 if (!Continuations::enabled()) return nullptr; 10639 StubId stub_id = StubId::stubgen_cont_preempt_id; 10640 StubCodeMark mark(this, stub_id); 10641 address start = __ pc(); 10642 10643 __ reset_last_Java_frame(true); 10644 10645 // Set sp to enterSpecial frame, i.e. remove all frames copied into the heap. 10646 __ ldr(rscratch2, Address(rthread, JavaThread::cont_entry_offset())); 10647 __ mov(sp, rscratch2); 10648 10649 Label preemption_cancelled; 10650 __ ldrb(rscratch1, Address(rthread, JavaThread::preemption_cancelled_offset())); 10651 __ cbnz(rscratch1, preemption_cancelled); 10652 10653 // Remove enterSpecial frame from the stack and return to Continuation.run() to unmount. 10654 SharedRuntime::continuation_enter_cleanup(_masm); 10655 __ leave(); 10656 __ ret(lr); 10657 10658 // We acquired the monitor after freezing the frames so call thaw to continue execution. 10659 __ bind(preemption_cancelled); 10660 __ strb(zr, Address(rthread, JavaThread::preemption_cancelled_offset())); 10661 __ lea(rfp, Address(sp, checked_cast<int32_t>(ContinuationEntry::size()))); 10662 __ lea(rscratch1, ExternalAddress(ContinuationEntry::thaw_call_pc_address())); 10663 __ ldr(rscratch1, Address(rscratch1)); 10664 __ br(rscratch1); 10665 10666 return start; 10667 } 10668 10669 // In sun.security.util.math.intpoly.IntegerPolynomial1305, integers 10670 // are represented as long[5], with BITS_PER_LIMB = 26. 10671 // Pack five 26-bit limbs into three 64-bit registers. 
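  // In C, approximately (assuming each limb holds at most 26 significant bits):
  //
  //   dest0 =  src[0]        | (src[1] << 26) | (src[2] << 52);
  //   dest1 = (src[2] >> 12) | (src[3] << 14) | (src[4] << 40);
  //   dest2 =  src[4] >> 24;   // at most 2 bits; must be zero if dest2 is noreg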
10672 void pack_26(Register dest0, Register dest1, Register dest2, Register src) { 10673 __ ldp(dest0, rscratch1, Address(src, 0)); // 26 bits 10674 __ add(dest0, dest0, rscratch1, Assembler::LSL, 26); // 26 bits 10675 __ ldp(rscratch1, rscratch2, Address(src, 2 * sizeof (jlong))); 10676 __ add(dest0, dest0, rscratch1, Assembler::LSL, 52); // 12 bits 10677 10678 __ add(dest1, zr, rscratch1, Assembler::LSR, 12); // 14 bits 10679 __ add(dest1, dest1, rscratch2, Assembler::LSL, 14); // 26 bits 10680 __ ldr(rscratch1, Address(src, 4 * sizeof (jlong))); 10681 __ add(dest1, dest1, rscratch1, Assembler::LSL, 40); // 24 bits 10682 10683 if (dest2->is_valid()) { 10684 __ add(dest2, zr, rscratch1, Assembler::LSR, 24); // 2 bits 10685 } else { 10686 #ifdef ASSERT 10687 Label OK; 10688 __ cmp(zr, rscratch1, Assembler::LSR, 24); // 2 bits 10689 __ br(__ EQ, OK); 10690 __ stop("high bits of Poly1305 integer should be zero"); 10691 __ should_not_reach_here(); 10692 __ bind(OK); 10693 #endif 10694 } 10695 } 10696 10697 // As above, but return only a 128-bit integer, packed into two 10698 // 64-bit registers. 10699 void pack_26(Register dest0, Register dest1, Register src) { 10700 pack_26(dest0, dest1, noreg, src); 10701 } 10702 10703 // Multiply and multiply-accumulate unsigned 64-bit registers. 10704 void wide_mul(Register prod_lo, Register prod_hi, Register n, Register m) { 10705 __ mul(prod_lo, n, m); 10706 __ umulh(prod_hi, n, m); 10707 } 10708 void wide_madd(Register sum_lo, Register sum_hi, Register n, Register m) { 10709 wide_mul(rscratch1, rscratch2, n, m); 10710 __ adds(sum_lo, sum_lo, rscratch1); 10711 __ adc(sum_hi, sum_hi, rscratch2); 10712 } 10713 10714 // Poly1305, RFC 7539 10715 10716 // See https://loup-vaillant.fr/tutorials/poly1305-design for a 10717 // description of the tricks used to simplify and accelerate this 10718 // computation. 10719 10720 address generate_poly1305_processBlocks() { 10721 __ align(CodeEntryAlignment); 10722 StubId stub_id = StubId::stubgen_poly1305_processBlocks_id; 10723 StubCodeMark mark(this, stub_id); 10724 address start = __ pc(); 10725 Label here; 10726 __ enter(); 10727 RegSet callee_saved = RegSet::range(r19, r28); 10728 __ push(callee_saved, sp); 10729 10730 RegSetIterator<Register> regs = (RegSet::range(c_rarg0, r28) - r18_tls - rscratch1 - rscratch2).begin(); 10731 10732 // Arguments 10733 const Register input_start = *regs, length = *++regs, acc_start = *++regs, r_start = *++regs; 10734 10735 // R_n is the 128-bit randomly-generated key, packed into two 10736 // registers. The caller passes this key to us as long[5], with 10737 // BITS_PER_LIMB = 26. 
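    // In outline (RFC 7539), each 16-byte block m updates the ~130-bit
    // accumulator U_2:U_1:U_0 as
    //
    //     U = ((U + m + 2^128) * R) mod (2^130 - 5)
    //
    // where R = R_1:R_0 and 2^128 is the per-block padding bit. The RR_n
    // values computed below allow the high partial products to be folded
    // back into the low limbs, using 2^130 == 5 (mod 2^130 - 5).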
10738 const Register R_0 = *++regs, R_1 = *++regs; 10739 pack_26(R_0, R_1, r_start); 10740 10741 // RR_n is (R_n >> 2) * 5 10742 const Register RR_0 = *++regs, RR_1 = *++regs; 10743 __ lsr(RR_0, R_0, 2); 10744 __ add(RR_0, RR_0, RR_0, Assembler::LSL, 2); 10745 __ lsr(RR_1, R_1, 2); 10746 __ add(RR_1, RR_1, RR_1, Assembler::LSL, 2); 10747 10748 // U_n is the current checksum 10749 const Register U_0 = *++regs, U_1 = *++regs, U_2 = *++regs; 10750 pack_26(U_0, U_1, U_2, acc_start); 10751 10752 static constexpr int BLOCK_LENGTH = 16; 10753 Label DONE, LOOP; 10754 10755 __ cmp(length, checked_cast<u1>(BLOCK_LENGTH)); 10756 __ br(Assembler::LT, DONE); { 10757 __ bind(LOOP); 10758 10759 // S_n is to be the sum of U_n and the next block of data 10760 const Register S_0 = *++regs, S_1 = *++regs, S_2 = *++regs; 10761 __ ldp(S_0, S_1, __ post(input_start, 2 * wordSize)); 10762 __ adds(S_0, U_0, S_0); 10763 __ adcs(S_1, U_1, S_1); 10764 __ adc(S_2, U_2, zr); 10765 __ add(S_2, S_2, 1); 10766 10767 const Register U_0HI = *++regs, U_1HI = *++regs; 10768 10769 // NB: this logic depends on some of the special properties of 10770 // Poly1305 keys. In particular, because we know that the top 10771 // four bits of R_0 and R_1 are zero, we can add together 10772 // partial products without any risk of needing to propagate a 10773 // carry out. 10774 wide_mul(U_0, U_0HI, S_0, R_0); wide_madd(U_0, U_0HI, S_1, RR_1); wide_madd(U_0, U_0HI, S_2, RR_0); 10775 wide_mul(U_1, U_1HI, S_0, R_1); wide_madd(U_1, U_1HI, S_1, R_0); wide_madd(U_1, U_1HI, S_2, RR_1); 10776 __ andr(U_2, R_0, 3); 10777 __ mul(U_2, S_2, U_2); 10778 10779 // Recycle registers S_0, S_1, S_2 10780 regs = (regs.remaining() + S_0 + S_1 + S_2).begin(); 10781 10782 // Partial reduction mod 2**130 - 5 10783 __ adds(U_1, U_0HI, U_1); 10784 __ adc(U_2, U_1HI, U_2); 10785 // Sum now in U_2:U_1:U_0. 10786 // Dead: U_0HI, U_1HI. 10787 regs = (regs.remaining() + U_0HI + U_1HI).begin(); 10788 10789 // U_2:U_1:U_0 += (U_2 >> 2) * 5 in two steps 10790 10791 // First, U_2:U_1:U_0 += (U_2 >> 2) 10792 __ lsr(rscratch1, U_2, 2); 10793 __ andr(U_2, U_2, (u8)3); 10794 __ adds(U_0, U_0, rscratch1); 10795 __ adcs(U_1, U_1, zr); 10796 __ adc(U_2, U_2, zr); 10797 // Second, U_2:U_1:U_0 += (U_2 >> 2) << 2 10798 __ adds(U_0, U_0, rscratch1, Assembler::LSL, 2); 10799 __ adcs(U_1, U_1, zr); 10800 __ adc(U_2, U_2, zr); 10801 10802 __ sub(length, length, checked_cast<u1>(BLOCK_LENGTH)); 10803 __ cmp(length, checked_cast<u1>(BLOCK_LENGTH)); 10804 __ br(~ Assembler::LT, LOOP); 10805 } 10806 10807 // Further reduce modulo 2^130 - 5 10808 __ lsr(rscratch1, U_2, 2); 10809 __ add(rscratch1, rscratch1, rscratch1, Assembler::LSL, 2); // rscratch1 = U_2 * 5 10810 __ adds(U_0, U_0, rscratch1); // U_0 += U_2 * 5 10811 __ adcs(U_1, U_1, zr); 10812 __ andr(U_2, U_2, (u1)3); 10813 __ adc(U_2, U_2, zr); 10814 10815 // Unpack the sum into five 26-bit limbs and write to memory. 
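    // In C, approximately (the inverse of pack_26 above):
    //
    //   acc[0] =  U_0        & ((1 << 26) - 1);
    //   acc[1] = (U_0 >> 26) & ((1 << 26) - 1);
    //   acc[2] = (U_0 >> 52) | ((U_1 & ((1 << 14) - 1)) << 12);
    //   acc[3] = (U_1 >> 14) & ((1 << 26) - 1);
    //   acc[4] = (U_1 >> 40) | ((U_2 & 7) << 24);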
10816 __ ubfiz(rscratch1, U_0, 0, 26); 10817 __ ubfx(rscratch2, U_0, 26, 26); 10818 __ stp(rscratch1, rscratch2, Address(acc_start)); 10819 __ ubfx(rscratch1, U_0, 52, 12); 10820 __ bfi(rscratch1, U_1, 12, 14); 10821 __ ubfx(rscratch2, U_1, 14, 26); 10822 __ stp(rscratch1, rscratch2, Address(acc_start, 2 * sizeof (jlong))); 10823 __ ubfx(rscratch1, U_1, 40, 24); 10824 __ bfi(rscratch1, U_2, 24, 3); 10825 __ str(rscratch1, Address(acc_start, 4 * sizeof (jlong))); 10826 10827 __ bind(DONE); 10828 __ pop(callee_saved, sp); 10829 __ leave(); 10830 __ ret(lr); 10831 10832 return start; 10833 } 10834 10835 // exception handler for upcall stubs 10836 address generate_upcall_stub_exception_handler() { 10837 StubId stub_id = StubId::stubgen_upcall_stub_exception_handler_id; 10838 StubCodeMark mark(this, stub_id); 10839 address start = __ pc(); 10840 10841 // Native caller has no idea how to handle exceptions, 10842 // so we just crash here. Up to callee to catch exceptions. 10843 __ verify_oop(r0); 10844 __ movptr(rscratch1, CAST_FROM_FN_PTR(uint64_t, UpcallLinker::handle_uncaught_exception)); 10845 __ blr(rscratch1); 10846 __ should_not_reach_here(); 10847 10848 return start; 10849 } 10850 10851 // load Method* target of MethodHandle 10852 // j_rarg0 = jobject receiver 10853 // rmethod = result 10854 address generate_upcall_stub_load_target() { 10855 StubId stub_id = StubId::stubgen_upcall_stub_load_target_id; 10856 StubCodeMark mark(this, stub_id); 10857 address start = __ pc(); 10858 10859 __ resolve_global_jobject(j_rarg0, rscratch1, rscratch2); 10860 // Load target method from receiver 10861 __ load_heap_oop(rmethod, Address(j_rarg0, java_lang_invoke_MethodHandle::form_offset()), rscratch1, rscratch2); 10862 __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_LambdaForm::vmentry_offset()), rscratch1, rscratch2); 10863 __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_MemberName::method_offset()), rscratch1, rscratch2); 10864 __ access_load_at(T_ADDRESS, IN_HEAP, rmethod, 10865 Address(rmethod, java_lang_invoke_ResolvedMethodName::vmtarget_offset()), 10866 noreg, noreg); 10867 __ str(rmethod, Address(rthread, JavaThread::callee_target_offset())); // just in case callee is deoptimized 10868 10869 __ ret(lr); 10870 10871 return start; 10872 } 10873 10874 #undef __ 10875 #define __ masm-> 10876 10877 class MontgomeryMultiplyGenerator : public MacroAssembler { 10878 10879 Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn, 10880 Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj; 10881 10882 RegSet _toSave; 10883 bool _squaring; 10884 10885 public: 10886 MontgomeryMultiplyGenerator (Assembler *as, bool squaring) 10887 : MacroAssembler(as->code()), _squaring(squaring) { 10888 10889 // Register allocation 10890 10891 RegSetIterator<Register> regs = (RegSet::range(r0, r26) - r18_tls).begin(); 10892 Pa_base = *regs; // Argument registers 10893 if (squaring) 10894 Pb_base = Pa_base; 10895 else 10896 Pb_base = *++regs; 10897 Pn_base = *++regs; 10898 Rlen= *++regs; 10899 inv = *++regs; 10900 Pm_base = *++regs; 10901 10902 // Working registers: 10903 Ra = *++regs; // The current digit of a, b, n, and m. 10904 Rb = *++regs; 10905 Rm = *++regs; 10906 Rn = *++regs; 10907 10908 Pa = *++regs; // Pointers to the current/next digit of a, b, n, and m. 10909 Pb = *++regs; 10910 Pm = *++regs; 10911 Pn = *++regs; 10912 10913 t0 = *++regs; // Three registers which form a 10914 t1 = *++regs; // triple-precision accumuator. 
10915 t2 = *++regs; 10916 10917 Ri = *++regs; // Inner and outer loop indexes. 10918 Rj = *++regs; 10919 10920 Rhi_ab = *++regs; // Product registers: low and high parts 10921 Rlo_ab = *++regs; // of a*b and m*n. 10922 Rhi_mn = *++regs; 10923 Rlo_mn = *++regs; 10924 10925 // r19 and up are callee-saved. 10926 _toSave = RegSet::range(r19, *regs) + Pm_base; 10927 } 10928 10929 private: 10930 void save_regs() { 10931 push(_toSave, sp); 10932 } 10933 10934 void restore_regs() { 10935 pop(_toSave, sp); 10936 } 10937 10938 template <typename T> 10939 void unroll_2(Register count, T block) { 10940 Label loop, end, odd; 10941 tbnz(count, 0, odd); 10942 cbz(count, end); 10943 align(16); 10944 bind(loop); 10945 (this->*block)(); 10946 bind(odd); 10947 (this->*block)(); 10948 subs(count, count, 2); 10949 br(Assembler::GT, loop); 10950 bind(end); 10951 } 10952 10953 template <typename T> 10954 void unroll_2(Register count, T block, Register d, Register s, Register tmp) { 10955 Label loop, end, odd; 10956 tbnz(count, 0, odd); 10957 cbz(count, end); 10958 align(16); 10959 bind(loop); 10960 (this->*block)(d, s, tmp); 10961 bind(odd); 10962 (this->*block)(d, s, tmp); 10963 subs(count, count, 2); 10964 br(Assembler::GT, loop); 10965 bind(end); 10966 } 10967 10968 void pre1(RegisterOrConstant i) { 10969 block_comment("pre1"); 10970 // Pa = Pa_base; 10971 // Pb = Pb_base + i; 10972 // Pm = Pm_base; 10973 // Pn = Pn_base + i; 10974 // Ra = *Pa; 10975 // Rb = *Pb; 10976 // Rm = *Pm; 10977 // Rn = *Pn; 10978 ldr(Ra, Address(Pa_base)); 10979 ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); 10980 ldr(Rm, Address(Pm_base)); 10981 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 10982 lea(Pa, Address(Pa_base)); 10983 lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); 10984 lea(Pm, Address(Pm_base)); 10985 lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 10986 10987 // Zero the m*n result. 10988 mov(Rhi_mn, zr); 10989 mov(Rlo_mn, zr); 10990 } 10991 10992 // The core multiply-accumulate step of a Montgomery 10993 // multiplication. The idea is to schedule operations as a 10994 // pipeline so that instructions with long latencies (loads and 10995 // multiplies) have time to complete before their results are 10996 // used. This most benefits in-order implementations of the 10997 // architecture but out-of-order ones also benefit. 10998 void step() { 10999 block_comment("step"); 11000 // MACC(Ra, Rb, t0, t1, t2); 11001 // Ra = *++Pa; 11002 // Rb = *--Pb; 11003 umulh(Rhi_ab, Ra, Rb); 11004 mul(Rlo_ab, Ra, Rb); 11005 ldr(Ra, pre(Pa, wordSize)); 11006 ldr(Rb, pre(Pb, -wordSize)); 11007 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the 11008 // previous iteration. 
11009 // MACC(Rm, Rn, t0, t1, t2); 11010 // Rm = *++Pm; 11011 // Rn = *--Pn; 11012 umulh(Rhi_mn, Rm, Rn); 11013 mul(Rlo_mn, Rm, Rn); 11014 ldr(Rm, pre(Pm, wordSize)); 11015 ldr(Rn, pre(Pn, -wordSize)); 11016 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 11017 } 11018 11019 void post1() { 11020 block_comment("post1"); 11021 11022 // MACC(Ra, Rb, t0, t1, t2); 11023 // Ra = *++Pa; 11024 // Rb = *--Pb; 11025 umulh(Rhi_ab, Ra, Rb); 11026 mul(Rlo_ab, Ra, Rb); 11027 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 11028 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 11029 11030 // *Pm = Rm = t0 * inv; 11031 mul(Rm, t0, inv); 11032 str(Rm, Address(Pm)); 11033 11034 // MACC(Rm, Rn, t0, t1, t2); 11035 // t0 = t1; t1 = t2; t2 = 0; 11036 umulh(Rhi_mn, Rm, Rn); 11037 11038 #ifndef PRODUCT 11039 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 11040 { 11041 mul(Rlo_mn, Rm, Rn); 11042 add(Rlo_mn, t0, Rlo_mn); 11043 Label ok; 11044 cbz(Rlo_mn, ok); { 11045 stop("broken Montgomery multiply"); 11046 } bind(ok); 11047 } 11048 #endif 11049 // We have very carefully set things up so that 11050 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 11051 // the lower half of Rm * Rn because we know the result already: 11052 // it must be -t0. t0 + (-t0) must generate a carry iff 11053 // t0 != 0. So, rather than do a mul and an adds we just set 11054 // the carry flag iff t0 is nonzero. 11055 // 11056 // mul(Rlo_mn, Rm, Rn); 11057 // adds(zr, t0, Rlo_mn); 11058 subs(zr, t0, 1); // Set carry iff t0 is nonzero 11059 adcs(t0, t1, Rhi_mn); 11060 adc(t1, t2, zr); 11061 mov(t2, zr); 11062 } 11063 11064 void pre2(RegisterOrConstant i, RegisterOrConstant len) { 11065 block_comment("pre2"); 11066 // Pa = Pa_base + i-len; 11067 // Pb = Pb_base + len; 11068 // Pm = Pm_base + i-len; 11069 // Pn = Pn_base + len; 11070 11071 if (i.is_register()) { 11072 sub(Rj, i.as_register(), len); 11073 } else { 11074 mov(Rj, i.as_constant()); 11075 sub(Rj, Rj, len); 11076 } 11077 // Rj == i-len 11078 11079 lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord))); 11080 lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord))); 11081 lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 11082 lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord))); 11083 11084 // Ra = *++Pa; 11085 // Rb = *--Pb; 11086 // Rm = *++Pm; 11087 // Rn = *--Pn; 11088 ldr(Ra, pre(Pa, wordSize)); 11089 ldr(Rb, pre(Pb, -wordSize)); 11090 ldr(Rm, pre(Pm, wordSize)); 11091 ldr(Rn, pre(Pn, -wordSize)); 11092 11093 mov(Rhi_mn, zr); 11094 mov(Rlo_mn, zr); 11095 } 11096 11097 void post2(RegisterOrConstant i, RegisterOrConstant len) { 11098 block_comment("post2"); 11099 if (i.is_constant()) { 11100 mov(Rj, i.as_constant()-len.as_constant()); 11101 } else { 11102 sub(Rj, i.as_register(), len); 11103 } 11104 11105 adds(t0, t0, Rlo_mn); // The pending m*n, low part 11106 11107 // As soon as we know the least significant digit of our result, 11108 // store it. 11109 // Pm_base[i-len] = t0; 11110 str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 11111 11112 // t0 = t1; t1 = t2; t2 = 0; 11113 adcs(t0, t1, Rhi_mn); // The pending m*n, high part 11114 adc(t1, t2, zr); 11115 mov(t2, zr); 11116 } 11117 11118 // A carry in t0 after Montgomery multiplication means that we 11119 // should subtract multiples of n from our result in m. We'll 11120 // keep doing that until there is no carry. 
11121 void normalize(RegisterOrConstant len) { 11122 block_comment("normalize"); 11123 // while (t0) 11124 // t0 = sub(Pm_base, Pn_base, t0, len); 11125 Label loop, post, again; 11126 Register cnt = t1, i = t2; // Re-use registers; we're done with them now 11127 cbz(t0, post); { 11128 bind(again); { 11129 mov(i, zr); 11130 mov(cnt, len); 11131 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 11132 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 11133 subs(zr, zr, zr); // set carry flag, i.e. no borrow 11134 align(16); 11135 bind(loop); { 11136 sbcs(Rm, Rm, Rn); 11137 str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 11138 add(i, i, 1); 11139 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 11140 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 11141 sub(cnt, cnt, 1); 11142 } cbnz(cnt, loop); 11143 sbc(t0, t0, zr); 11144 } cbnz(t0, again); 11145 } bind(post); 11146 } 11147 11148 // Move memory at s to d, reversing words. 11149 // Increments d to end of copied memory 11150 // Destroys tmp1, tmp2 11151 // Preserves len 11152 // Leaves s pointing to the address which was in d at start 11153 void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) { 11154 assert(tmp1->encoding() < r19->encoding(), "register corruption"); 11155 assert(tmp2->encoding() < r19->encoding(), "register corruption"); 11156 11157 lea(s, Address(s, len, Address::uxtw(LogBytesPerWord))); 11158 mov(tmp1, len); 11159 unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2); 11160 sub(s, d, len, ext::uxtw, LogBytesPerWord); 11161 } 11162 // where 11163 void reverse1(Register d, Register s, Register tmp) { 11164 ldr(tmp, pre(s, -wordSize)); 11165 ror(tmp, tmp, 32); 11166 str(tmp, post(d, wordSize)); 11167 } 11168 11169 void step_squaring() { 11170 // An extra ACC 11171 step(); 11172 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 11173 } 11174 11175 void last_squaring(RegisterOrConstant i) { 11176 Label dont; 11177 // if ((i & 1) == 0) { 11178 tbnz(i.as_register(), 0, dont); { 11179 // MACC(Ra, Rb, t0, t1, t2); 11180 // Ra = *++Pa; 11181 // Rb = *--Pb; 11182 umulh(Rhi_ab, Ra, Rb); 11183 mul(Rlo_ab, Ra, Rb); 11184 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 11185 } bind(dont); 11186 } 11187 11188 void extra_step_squaring() { 11189 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 11190 11191 // MACC(Rm, Rn, t0, t1, t2); 11192 // Rm = *++Pm; 11193 // Rn = *--Pn; 11194 umulh(Rhi_mn, Rm, Rn); 11195 mul(Rlo_mn, Rm, Rn); 11196 ldr(Rm, pre(Pm, wordSize)); 11197 ldr(Rn, pre(Pn, -wordSize)); 11198 } 11199 11200 void post1_squaring() { 11201 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 11202 11203 // *Pm = Rm = t0 * inv; 11204 mul(Rm, t0, inv); 11205 str(Rm, Address(Pm)); 11206 11207 // MACC(Rm, Rn, t0, t1, t2); 11208 // t0 = t1; t1 = t2; t2 = 0; 11209 umulh(Rhi_mn, Rm, Rn); 11210 11211 #ifndef PRODUCT 11212 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 11213 { 11214 mul(Rlo_mn, Rm, Rn); 11215 add(Rlo_mn, t0, Rlo_mn); 11216 Label ok; 11217 cbz(Rlo_mn, ok); { 11218 stop("broken Montgomery multiply"); 11219 } bind(ok); 11220 } 11221 #endif 11222 // We have very carefully set things up so that 11223 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 11224 // the lower half of Rm * Rn because we know the result already: 11225 // it must be -t0. t0 + (-t0) must generate a carry iff 11226 // t0 != 0. So, rather than do a mul and an adds we just set 11227 // the carry flag iff t0 is nonzero. 
11228 // 11229 // mul(Rlo_mn, Rm, Rn); 11230 // adds(zr, t0, Rlo_mn); 11231 subs(zr, t0, 1); // Set carry iff t0 is nonzero 11232 adcs(t0, t1, Rhi_mn); 11233 adc(t1, t2, zr); 11234 mov(t2, zr); 11235 } 11236 11237 void acc(Register Rhi, Register Rlo, 11238 Register t0, Register t1, Register t2) { 11239 adds(t0, t0, Rlo); 11240 adcs(t1, t1, Rhi); 11241 adc(t2, t2, zr); 11242 } 11243 11244 public: 11245 /** 11246 * Fast Montgomery multiplication. The derivation of the 11247 * algorithm is in A Cryptographic Library for the Motorola 11248 * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237. 11249 * 11250 * Arguments: 11251 * 11252 * Inputs for multiplication: 11253 * c_rarg0 - int array elements a 11254 * c_rarg1 - int array elements b 11255 * c_rarg2 - int array elements n (the modulus) 11256 * c_rarg3 - int length 11257 * c_rarg4 - int inv 11258 * c_rarg5 - int array elements m (the result) 11259 * 11260 * Inputs for squaring: 11261 * c_rarg0 - int array elements a 11262 * c_rarg1 - int array elements n (the modulus) 11263 * c_rarg2 - int length 11264 * c_rarg3 - int inv 11265 * c_rarg4 - int array elements m (the result) 11266 * 11267 */ 11268 address generate_multiply() { 11269 Label argh, nothing; 11270 bind(argh); 11271 stop("MontgomeryMultiply total_allocation must be <= 8192"); 11272 11273 align(CodeEntryAlignment); 11274 address entry = pc(); 11275 11276 cbzw(Rlen, nothing); 11277 11278 enter(); 11279 11280 // Make room. 11281 cmpw(Rlen, 512); 11282 br(Assembler::HI, argh); 11283 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 11284 andr(sp, Ra, -2 * wordSize); 11285 11286 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 11287 11288 { 11289 // Copy input args, reversing as we go. We use Ra as a 11290 // temporary variable. 11291 reverse(Ra, Pa_base, Rlen, t0, t1); 11292 if (!_squaring) 11293 reverse(Ra, Pb_base, Rlen, t0, t1); 11294 reverse(Ra, Pn_base, Rlen, t0, t1); 11295 } 11296 11297 // Push all call-saved registers and also Pm_base which we'll need 11298 // at the end. 
11299 save_regs(); 11300 11301 #ifndef PRODUCT 11302 // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply"); 11303 { 11304 ldr(Rn, Address(Pn_base, 0)); 11305 mul(Rlo_mn, Rn, inv); 11306 subs(zr, Rlo_mn, -1); 11307 Label ok; 11308 br(EQ, ok); { 11309 stop("broken inverse in Montgomery multiply"); 11310 } bind(ok); 11311 } 11312 #endif 11313 11314 mov(Pm_base, Ra); 11315 11316 mov(t0, zr); 11317 mov(t1, zr); 11318 mov(t2, zr); 11319 11320 block_comment("for (int i = 0; i < len; i++) {"); 11321 mov(Ri, zr); { 11322 Label loop, end; 11323 cmpw(Ri, Rlen); 11324 br(Assembler::GE, end); 11325 11326 bind(loop); 11327 pre1(Ri); 11328 11329 block_comment(" for (j = i; j; j--) {"); { 11330 movw(Rj, Ri); 11331 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 11332 } block_comment(" } // j"); 11333 11334 post1(); 11335 addw(Ri, Ri, 1); 11336 cmpw(Ri, Rlen); 11337 br(Assembler::LT, loop); 11338 bind(end); 11339 block_comment("} // i"); 11340 } 11341 11342 block_comment("for (int i = len; i < 2*len; i++) {"); 11343 mov(Ri, Rlen); { 11344 Label loop, end; 11345 cmpw(Ri, Rlen, Assembler::LSL, 1); 11346 br(Assembler::GE, end); 11347 11348 bind(loop); 11349 pre2(Ri, Rlen); 11350 11351 block_comment(" for (j = len*2-i-1; j; j--) {"); { 11352 lslw(Rj, Rlen, 1); 11353 subw(Rj, Rj, Ri); 11354 subw(Rj, Rj, 1); 11355 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 11356 } block_comment(" } // j"); 11357 11358 post2(Ri, Rlen); 11359 addw(Ri, Ri, 1); 11360 cmpw(Ri, Rlen, Assembler::LSL, 1); 11361 br(Assembler::LT, loop); 11362 bind(end); 11363 } 11364 block_comment("} // i"); 11365 11366 normalize(Rlen); 11367 11368 mov(Ra, Pm_base); // Save Pm_base in Ra 11369 restore_regs(); // Restore caller's Pm_base 11370 11371 // Copy our result into caller's Pm_base 11372 reverse(Pm_base, Ra, Rlen, t0, t1); 11373 11374 leave(); 11375 bind(nothing); 11376 ret(lr); 11377 11378 return entry; 11379 } 11380 // In C, approximately: 11381 11382 // void 11383 // montgomery_multiply(julong Pa_base[], julong Pb_base[], 11384 // julong Pn_base[], julong Pm_base[], 11385 // julong inv, int len) { 11386 // julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 11387 // julong *Pa, *Pb, *Pn, *Pm; 11388 // julong Ra, Rb, Rn, Rm; 11389 11390 // int i; 11391 11392 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply"); 11393 11394 // for (i = 0; i < len; i++) { 11395 // int j; 11396 11397 // Pa = Pa_base; 11398 // Pb = Pb_base + i; 11399 // Pm = Pm_base; 11400 // Pn = Pn_base + i; 11401 11402 // Ra = *Pa; 11403 // Rb = *Pb; 11404 // Rm = *Pm; 11405 // Rn = *Pn; 11406 11407 // int iters = i; 11408 // for (j = 0; iters--; j++) { 11409 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 11410 // MACC(Ra, Rb, t0, t1, t2); 11411 // Ra = *++Pa; 11412 // Rb = *--Pb; 11413 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 11414 // MACC(Rm, Rn, t0, t1, t2); 11415 // Rm = *++Pm; 11416 // Rn = *--Pn; 11417 // } 11418 11419 // assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be"); 11420 // MACC(Ra, Rb, t0, t1, t2); 11421 // *Pm = Rm = t0 * inv; 11422 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be"); 11423 // MACC(Rm, Rn, t0, t1, t2); 11424 11425 // assert(t0 == 0, "broken Montgomery multiply"); 11426 11427 // t0 = t1; t1 = t2; t2 = 0; 11428 // } 11429 11430 // for (i = len; i < 2*len; i++) { 11431 // int j; 11432 11433 // Pa = Pa_base + i-len; 11434 // Pb = Pb_base + len; 11435 // Pm = Pm_base + i-len; 11436 // Pn = Pn_base + len; 11437 11438 // Ra = *++Pa; 11439 // Rb = 
  /**
   * Fast Montgomery squaring.  This uses asymptotically 25% fewer
   * multiplies than Montgomery multiplication so it should be up to
   * 25% faster.  However, its loop control is more complex and it
   * may actually run slower on some machines.
   *
   * Arguments:
   *
   * Inputs:
   *   c_rarg0   - int array elements a
   *   c_rarg1   - int array elements n (the modulus)
   *   c_rarg2   - int length
   *   c_rarg3   - int inv
   *   c_rarg4   - int array elements m (the result)
   *
   */
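  // Where the saving comes from: when squaring, the cross products
  // a[i]*a[j] and a[j]*a[i] are equal, so the reference code below computes
  // each such product once and accumulates it twice (MACC2) instead of doing
  // two separate multiplies; only the diagonal terms a[i]*a[i] remain single
  // MACCs.  For example, with B = 2^64:
  //
  //   (a0 + a1*B)^2 = a0*a0 + 2*(a0*a1)*B + a1*a1*B^2
  //
  // which needs three distinct products where a general multiply needs four.
  // The reduction half of the work (the Rm*Rn products) is unchanged, so the
  // overall saving approaches 25% rather than 50%.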
  address generate_square() {
    Label argh;
    bind(argh);
    stop("MontgomeryMultiply total_allocation must be <= 8192");

    align(CodeEntryAlignment);
    address entry = pc();

    enter();

    // Make room.
    cmpw(Rlen, 512);
    br(Assembler::HI, argh);
    sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
    andr(sp, Ra, -2 * wordSize);

    lsrw(Rlen, Rlen, 1);  // length in longwords = len/2

    {
      // Copy input args, reversing as we go.  We use Ra as a
      // temporary variable.
      reverse(Ra, Pa_base, Rlen, t0, t1);
      reverse(Ra, Pn_base, Rlen, t0, t1);
    }

    // Push all call-saved registers and also Pm_base which we'll need
    // at the end.
    save_regs();

    mov(Pm_base, Ra);

    mov(t0, zr);
    mov(t1, zr);
    mov(t2, zr);

    block_comment("for (int i = 0; i < len; i++) {");
    mov(Ri, zr); {
      Label loop, end;
      bind(loop);
      cmp(Ri, Rlen);
      br(Assembler::GE, end);

      pre1(Ri);

      block_comment("for (j = (i+1)/2; j; j--) {"); {
        add(Rj, Ri, 1);
        lsr(Rj, Rj, 1);
        unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
      } block_comment(" } // j");

      last_squaring(Ri);

      block_comment(" for (j = i/2; j; j--) {"); {
        lsr(Rj, Ri, 1);
        unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
      } block_comment(" } // j");

      post1_squaring();
      add(Ri, Ri, 1);
      cmp(Ri, Rlen);
      br(Assembler::LT, loop);

      bind(end);
      block_comment("} // i");
    }

    block_comment("for (int i = len; i < 2*len; i++) {");
    mov(Ri, Rlen); {
      Label loop, end;
      bind(loop);
      cmp(Ri, Rlen, Assembler::LSL, 1);
      br(Assembler::GE, end);

      pre2(Ri, Rlen);

      block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
        lsl(Rj, Rlen, 1);
        sub(Rj, Rj, Ri);
        sub(Rj, Rj, 1);
        lsr(Rj, Rj, 1);
        unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
      } block_comment("  } // j");

      last_squaring(Ri);

      block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
        lsl(Rj, Rlen, 1);
        sub(Rj, Rj, Ri);
        lsr(Rj, Rj, 1);
        unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
      } block_comment("  } // j");

      post2(Ri, Rlen);
      add(Ri, Ri, 1);
      cmp(Ri, Rlen, Assembler::LSL, 1);

      br(Assembler::LT, loop);
      bind(end);
      block_comment("} // i");
    }

    normalize(Rlen);

    mov(Ra, Pm_base);  // Save Pm_base in Ra
    restore_regs();    // Restore caller's Pm_base

    // Copy our result into caller's Pm_base
    reverse(Pm_base, Ra, Rlen, t0, t1);

    leave();
    ret(lr);

    return entry;
  }
  // In C, approximately:

  // void
  // montgomery_square(julong Pa_base[], julong Pn_base[],
  //                   julong Pm_base[], julong inv, int len) {
  //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
  //   julong *Pa, *Pb, *Pn, *Pm;
  //   julong Ra, Rb, Rn, Rm;

  //   int i;

  //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");

  //   for (i = 0; i < len; i++) {
  //     int j;

  //     Pa = Pa_base;
  //     Pb = Pa_base + i;
  //     Pm = Pm_base;
  //     Pn = Pn_base + i;

  //     Ra = *Pa;
  //     Rb = *Pb;
  //     Rm = *Pm;
  //     Rn = *Pn;

  //     int iters = (i+1)/2;
  //     for (j = 0; iters--; j++) {
  //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
  //       MACC2(Ra, Rb, t0, t1, t2);
  //       Ra = *++Pa;
  //       Rb = *--Pb;
  //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
  //       MACC(Rm, Rn, t0, t1, t2);
  //       Rm = *++Pm;
  //       Rn = *--Pn;
  //     }
  //     if ((i & 1) == 0) {
  //       assert(Ra == Pa_base[j], "must be");
  //       MACC(Ra, Ra, t0, t1, t2);
  //     }
  //     iters = i/2;
  //     assert(iters == i-j, "must be");
  //     for (; iters--; j++) {
  //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
  //       MACC(Rm, Rn, t0, t1, t2);
  //       Rm = *++Pm;
  //       Rn = *--Pn;
  //     }
  //     *Pm = Rm = t0 * inv;
  //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
  //     MACC(Rm, Rn, t0, t1, t2);

  //     assert(t0 == 0, "broken Montgomery multiply");

  //     t0 = t1; t1 = t2; t2 = 0;
  //   }

  //   for (i = len; i < 2*len; i++) {
  //     int start = i-len+1;
  //     int end = start + (len - start)/2;
  //     int j;

  //     Pa = Pa_base + i-len;
  //     Pb = Pa_base + len;
  //     Pm = Pm_base + i-len;
  //     Pn = Pn_base + len;

  //     Ra = *++Pa;
  //     Rb = *--Pb;
  //     Rm = *++Pm;
  //     Rn = *--Pn;

  //     int iters = (2*len-i-1)/2;
  //     assert(iters == end-start, "must be");
  //     for (j = start; iters--; j++) {
  //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
  //       MACC2(Ra, Rb, t0, t1, t2);
  //       Ra = *++Pa;
  //       Rb = *--Pb;
  //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
  //       MACC(Rm, Rn, t0, t1, t2);
  //       Rm = *++Pm;
  //       Rn = *--Pn;
  //     }
  //     if ((i & 1) == 0) {
  //       assert(Ra == Pa_base[j], "must be");
  //       MACC(Ra, Ra, t0, t1, t2);
  //     }
  //     iters = (2*len-i)/2;
  //     assert(iters == len-j, "must be");
  //     for (; iters--; j++) {
  //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
  //       MACC(Rm, Rn, t0, t1, t2);
  //       Rm = *++Pm;
  //       Rn = *--Pn;
  //     }
  //     Pm_base[i-len] = t0;
  //     t0 = t1; t1 = t2; t2 = 0;
  //   }

  //   while (t0)
  //     t0 = sub(Pm_base, Pn_base, t0, len);
  // }
  };
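  // For reference, the generator above is instantiated from
  // generate_compiler_stubs() further below, along the lines of:
  //
  //   MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
  //   StubRoutines::_montgomeryMultiply = g.generate_multiply();
  //
  // and with /*squaring*/true for the montgomerySquare stub, which still
  // calls generate_multiply() -- see the comment at that call site.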
  // Call here from the interpreter or compiled code to either load
  // multiple returned values from the inline type instance being
  // returned to registers or to store returned values to a newly
  // allocated inline type instance.
  address generate_return_value_stub(address destination, const char* name, bool has_res) {
    // We need to save all registers the calling convention may use so
    // that the runtime calls can read or update those registers. This
    // needs to be in sync with SharedRuntime::java_return_convention().
    // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
    enum layout {
      j_rarg7_off = 0, j_rarg7_2,    // j_rarg7 is r0
      j_rarg6_off, j_rarg6_2,
      j_rarg5_off, j_rarg5_2,
      j_rarg4_off, j_rarg4_2,
      j_rarg3_off, j_rarg3_2,
      j_rarg2_off, j_rarg2_2,
      j_rarg1_off, j_rarg1_2,
      j_rarg0_off, j_rarg0_2,

      j_farg7_off, j_farg7_2,
      j_farg6_off, j_farg6_2,
      j_farg5_off, j_farg5_2,
      j_farg4_off, j_farg4_2,
      j_farg3_off, j_farg3_2,
      j_farg2_off, j_farg2_2,
      j_farg1_off, j_farg1_2,
      j_farg0_off, j_farg0_2,

      rfp_off, rfp_off2,
      return_off, return_off2,

      framesize // inclusive of return address
    };
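    // For illustration: with the layout above, framesize comes to 36
    // VMRegImpl slots (8 integer + 8 FP Java argument registers at two
    // slots each, plus rfp and the return address), so below
    // frame_size_in_bytes = align_up(36 * BytesPerInt, 16) = 144 bytes,
    // i.e. 18 words, and the frame is already 16-byte aligned as the
    // assert expects.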
    CodeBuffer code(name, 512, 64);
    MacroAssembler* masm = new MacroAssembler(&code);

    int frame_size_in_bytes = align_up(framesize*BytesPerInt, 16);
    assert(frame_size_in_bytes == framesize*BytesPerInt, "misaligned");
    int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
    int frame_size_in_words = frame_size_in_bytes / wordSize;

    OopMapSet* oop_maps = new OopMapSet();
    OopMap* map = new OopMap(frame_size_in_slots, 0);

    map->set_callee_saved(VMRegImpl::stack2reg(j_rarg7_off), j_rarg7->as_VMReg());
    map->set_callee_saved(VMRegImpl::stack2reg(j_rarg6_off), j_rarg6->as_VMReg());
    map->set_callee_saved(VMRegImpl::stack2reg(j_rarg5_off), j_rarg5->as_VMReg());
    map->set_callee_saved(VMRegImpl::stack2reg(j_rarg4_off), j_rarg4->as_VMReg());
    map->set_callee_saved(VMRegImpl::stack2reg(j_rarg3_off), j_rarg3->as_VMReg());
    map->set_callee_saved(VMRegImpl::stack2reg(j_rarg2_off), j_rarg2->as_VMReg());
    map->set_callee_saved(VMRegImpl::stack2reg(j_rarg1_off), j_rarg1->as_VMReg());
    map->set_callee_saved(VMRegImpl::stack2reg(j_rarg0_off), j_rarg0->as_VMReg());

    map->set_callee_saved(VMRegImpl::stack2reg(j_farg0_off), j_farg0->as_VMReg());
    map->set_callee_saved(VMRegImpl::stack2reg(j_farg1_off), j_farg1->as_VMReg());
    map->set_callee_saved(VMRegImpl::stack2reg(j_farg2_off), j_farg2->as_VMReg());
    map->set_callee_saved(VMRegImpl::stack2reg(j_farg3_off), j_farg3->as_VMReg());
    map->set_callee_saved(VMRegImpl::stack2reg(j_farg4_off), j_farg4->as_VMReg());
    map->set_callee_saved(VMRegImpl::stack2reg(j_farg5_off), j_farg5->as_VMReg());
    map->set_callee_saved(VMRegImpl::stack2reg(j_farg6_off), j_farg6->as_VMReg());
    map->set_callee_saved(VMRegImpl::stack2reg(j_farg7_off), j_farg7->as_VMReg());

    address start = __ pc();

    __ enter(); // Save FP and LR before call

    __ stpd(j_farg1, j_farg0, Address(__ pre(sp, -2 * wordSize)));
    __ stpd(j_farg3, j_farg2, Address(__ pre(sp, -2 * wordSize)));
    __ stpd(j_farg5, j_farg4, Address(__ pre(sp, -2 * wordSize)));
    __ stpd(j_farg7, j_farg6, Address(__ pre(sp, -2 * wordSize)));

    __ stp(j_rarg1, j_rarg0, Address(__ pre(sp, -2 * wordSize)));
    __ stp(j_rarg3, j_rarg2, Address(__ pre(sp, -2 * wordSize)));
    __ stp(j_rarg5, j_rarg4, Address(__ pre(sp, -2 * wordSize)));
    __ stp(j_rarg7, j_rarg6, Address(__ pre(sp, -2 * wordSize)));

    int frame_complete = __ offset();

    // Set up last_Java_sp and last_Java_fp
    address the_pc = __ pc();
    __ set_last_Java_frame(sp, noreg, the_pc, rscratch1);

    // Call runtime
    __ mov(c_rarg1, r0);
    __ mov(c_rarg0, rthread);

    __ mov(rscratch1, destination);
    __ blr(rscratch1);

    oop_maps->add_gc_map(the_pc - start, map);

    __ reset_last_Java_frame(false);

    __ ldp(j_rarg7, j_rarg6, Address(__ post(sp, 2 * wordSize)));
    __ ldp(j_rarg5, j_rarg4, Address(__ post(sp, 2 * wordSize)));
    __ ldp(j_rarg3, j_rarg2, Address(__ post(sp, 2 * wordSize)));
    __ ldp(j_rarg1, j_rarg0, Address(__ post(sp, 2 * wordSize)));

    __ ldpd(j_farg7, j_farg6, Address(__ post(sp, 2 * wordSize)));
    __ ldpd(j_farg5, j_farg4, Address(__ post(sp, 2 * wordSize)));
    __ ldpd(j_farg3, j_farg2, Address(__ post(sp, 2 * wordSize)));
    __ ldpd(j_farg1, j_farg0, Address(__ post(sp, 2 * wordSize)));

    __ leave();

    // check for pending exceptions
    Label pending;
    __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
    __ cbnz(rscratch1, pending);

    if (has_res) {
      __ get_vm_result_oop(r0, rthread);
    }

    __ ret(lr);

    __ bind(pending);
    __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));

    // -------------
    // make sure all code is generated
    masm->flush();

    RuntimeStub* stub = RuntimeStub::new_runtime_stub(name, &code, frame_complete, frame_size_in_words, oop_maps, false);
    return stub->entry_point();
  }
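  // For illustration, generate_return_value_stub() is used from
  // generate_initial_stubs() below, mirroring the call that appears there:
  //
  //   StubRoutines::_store_inline_type_fields_to_buf =
  //     generate_return_value_stub(CAST_FROM_FN_PTR(address, SharedRuntime::store_inline_type_fields_to_buf),
  //                                "store_inline_type_fields_to_buf", /*has_res*/ true);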
  // Initialization
  void generate_preuniverse_stubs() {
    // preuniverse stubs are not needed for aarch64
  }

  void generate_initial_stubs() {
    // Generate initial stubs and initialize the entry points

    // Entry points that exist on all platforms. Note: This is code
    // that could be shared among different platforms - however the
    // benefit seems to be smaller than the disadvantage of having a
    // much more complicated generator structure. See also comment in
    // stubRoutines.hpp.

    StubRoutines::_forward_exception_entry = generate_forward_exception();

    StubRoutines::_call_stub_entry =
      generate_call_stub(StubRoutines::_call_stub_return_address);

    // is referenced by megamorphic call
    StubRoutines::_catch_exception_entry = generate_catch_exception();

    // Initialize table for copy memory (arraycopy) check.
    if (UnsafeMemoryAccess::_table == nullptr) {
      UnsafeMemoryAccess::create_table(8 + 4); // 8 for copyMemory; 4 for setMemory
    }

    if (UseCRC32Intrinsics) {
      StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
    }

    if (UseCRC32CIntrinsics) {
      StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
    }

    if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
      StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
    }

    if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
      StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
    }

    if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_float16ToFloat) &&
        vmIntrinsics::is_intrinsic_available(vmIntrinsics::_floatToFloat16)) {
      StubRoutines::_hf2f = generate_float16ToFloat();
      StubRoutines::_f2hf = generate_floatToFloat16();
    }

    if (InlineTypeReturnedAsFields) {
      StubRoutines::_load_inline_type_fields_in_regs =
        generate_return_value_stub(CAST_FROM_FN_PTR(address, SharedRuntime::load_inline_type_fields_in_regs), "load_inline_type_fields_in_regs", false);
      StubRoutines::_store_inline_type_fields_to_buf =
        generate_return_value_stub(CAST_FROM_FN_PTR(address, SharedRuntime::store_inline_type_fields_to_buf), "store_inline_type_fields_to_buf", true);
    }

  }

  void generate_continuation_stubs() {
    // Continuation stubs:
    StubRoutines::_cont_thaw             = generate_cont_thaw();
    StubRoutines::_cont_returnBarrier    = generate_cont_returnBarrier();
    StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception();
    StubRoutines::_cont_preempt_stub     = generate_cont_preempt_stub();
  }

  void generate_final_stubs() {
    // support for verify_oop (must happen after universe_init)
    if (VerifyOops) {
      StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
    }

    // arraycopy stubs used by compilers
    generate_arraycopy_stubs();

    StubRoutines::_method_entry_barrier = generate_method_entry_barrier();

    StubRoutines::aarch64::_spin_wait = generate_spin_wait();

    StubRoutines::_upcall_stub_exception_handler = generate_upcall_stub_exception_handler();
    StubRoutines::_upcall_stub_load_target = generate_upcall_stub_load_target();

#if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)

    generate_atomic_entry_points();

#endif // LINUX

#ifdef COMPILER2
    if (UseSecondarySupersTable) {
      StubRoutines::_lookup_secondary_supers_table_slow_path_stub = generate_lookup_secondary_supers_table_slow_path_stub();
      if (!InlineSecondarySupersTest) {
        generate_lookup_secondary_supers_table_stub();
      }
    }
#endif

    StubRoutines::_unsafe_setmemory = generate_unsafe_setmemory();

    StubRoutines::aarch64::set_completed(); // Indicate that arraycopy and zero_blocks stubs are generated
  }

  void generate_compiler_stubs() {
#if COMPILER2_OR_JVMCI

    if (UseSVE == 0) {
      StubRoutines::aarch64::_vector_iota_indices = generate_iota_indices(StubId::stubgen_vector_iota_indices_id);
    }

    // array equals stub for large arrays.
    if (!UseSimpleArrayEquals) {
      StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
    }

    // arrays_hashcode stub for large arrays.
    StubRoutines::aarch64::_large_arrays_hashcode_boolean = generate_large_arrays_hashcode(T_BOOLEAN);
    StubRoutines::aarch64::_large_arrays_hashcode_byte    = generate_large_arrays_hashcode(T_BYTE);
    StubRoutines::aarch64::_large_arrays_hashcode_char    = generate_large_arrays_hashcode(T_CHAR);
    StubRoutines::aarch64::_large_arrays_hashcode_int     = generate_large_arrays_hashcode(T_INT);
    StubRoutines::aarch64::_large_arrays_hashcode_short   = generate_large_arrays_hashcode(T_SHORT);

    // byte_array_inflate stub for large arrays.
    StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();

    // countPositives stub for large arrays.
    StubRoutines::aarch64::_count_positives = generate_count_positives(StubRoutines::aarch64::_count_positives_long);

    generate_compare_long_strings();

    generate_string_indexof_stubs();

#ifdef COMPILER2
    if (UseMultiplyToLenIntrinsic) {
      StubRoutines::_multiplyToLen = generate_multiplyToLen();
    }

    if (UseSquareToLenIntrinsic) {
      StubRoutines::_squareToLen = generate_squareToLen();
    }

    if (UseMulAddIntrinsic) {
      StubRoutines::_mulAdd = generate_mulAdd();
    }

    if (UseSIMDForBigIntegerShiftIntrinsics) {
      StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
      StubRoutines::_bigIntegerLeftShiftWorker  = generate_bigIntegerLeftShift();
    }

    if (UseMontgomeryMultiplyIntrinsic) {
      StubId stub_id = StubId::stubgen_montgomeryMultiply_id;
      StubCodeMark mark(this, stub_id);
      MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
      StubRoutines::_montgomeryMultiply = g.generate_multiply();
    }

    if (UseMontgomerySquareIntrinsic) {
      StubId stub_id = StubId::stubgen_montgomerySquare_id;
      StubCodeMark mark(this, stub_id);
      MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
      // We use generate_multiply() rather than generate_square()
      // because it's faster for the sizes of modulus we care about.
      StubRoutines::_montgomerySquare = g.generate_multiply();
    }

#endif // COMPILER2

    if (UseChaCha20Intrinsics) {
      StubRoutines::_chacha20Block = generate_chacha20Block_blockpar();
    }

    if (UseKyberIntrinsics) {
      StubRoutines::_kyberNtt           = generate_kyberNtt();
      StubRoutines::_kyberInverseNtt    = generate_kyberInverseNtt();
      StubRoutines::_kyberNttMult       = generate_kyberNttMult();
      StubRoutines::_kyberAddPoly_2     = generate_kyberAddPoly_2();
      StubRoutines::_kyberAddPoly_3     = generate_kyberAddPoly_3();
      StubRoutines::_kyber12To16        = generate_kyber12To16();
      StubRoutines::_kyberBarrettReduce = generate_kyberBarrettReduce();
    }

    if (UseDilithiumIntrinsics) {
      StubRoutines::_dilithiumAlmostNtt           = generate_dilithiumAlmostNtt();
      StubRoutines::_dilithiumAlmostInverseNtt    = generate_dilithiumAlmostInverseNtt();
      StubRoutines::_dilithiumNttMult             = generate_dilithiumNttMult();
      StubRoutines::_dilithiumMontMulByConstant   = generate_dilithiumMontMulByConstant();
      StubRoutines::_dilithiumDecomposePoly       = generate_dilithiumDecomposePoly();
    }

    if (UseBASE64Intrinsics) {
      StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
      StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
    }

    // data cache line writeback
    StubRoutines::_data_cache_writeback      = generate_data_cache_writeback();
    StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();

    if (UseAESIntrinsics) {
      StubRoutines::_aescrypt_encryptBlock                = generate_aescrypt_encryptBlock();
      StubRoutines::_aescrypt_decryptBlock                = generate_aescrypt_decryptBlock();
      StubRoutines::_cipherBlockChaining_encryptAESCrypt  = generate_cipherBlockChaining_encryptAESCrypt();
      StubRoutines::_cipherBlockChaining_decryptAESCrypt  = generate_cipherBlockChaining_decryptAESCrypt();
      StubRoutines::_counterMode_AESCrypt                 = generate_counterMode_AESCrypt();
    }
    if (UseGHASHIntrinsics) {
      // StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
      StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks_wide();
    }
    if (UseAESIntrinsics && UseGHASHIntrinsics) {
      StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt();
    }

    if (UseMD5Intrinsics) {
      StubRoutines::_md5_implCompress   = generate_md5_implCompress(StubId::stubgen_md5_implCompress_id);
      StubRoutines::_md5_implCompressMB = generate_md5_implCompress(StubId::stubgen_md5_implCompressMB_id);
    }
    if (UseSHA1Intrinsics) {
      StubRoutines::_sha1_implCompress   = generate_sha1_implCompress(StubId::stubgen_sha1_implCompress_id);
      StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(StubId::stubgen_sha1_implCompressMB_id);
    }
    if (UseSHA256Intrinsics) {
      StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(StubId::stubgen_sha256_implCompress_id);
      StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(StubId::stubgen_sha256_implCompressMB_id);
    }
    if (UseSHA512Intrinsics) {
      StubRoutines::_sha512_implCompress   = generate_sha512_implCompress(StubId::stubgen_sha512_implCompress_id);
      StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(StubId::stubgen_sha512_implCompressMB_id);
    }
    if (UseSHA3Intrinsics) {

      StubRoutines::_double_keccak = generate_double_keccak();
      if (UseSIMDForSHA3Intrinsic) {
        StubRoutines::_sha3_implCompress   = generate_sha3_implCompress(StubId::stubgen_sha3_implCompress_id);
        StubRoutines::_sha3_implCompressMB = generate_sha3_implCompress(StubId::stubgen_sha3_implCompressMB_id);
      } else {
        StubRoutines::_sha3_implCompress   = generate_sha3_implCompress_gpr(StubId::stubgen_sha3_implCompress_id);
        StubRoutines::_sha3_implCompressMB = generate_sha3_implCompress_gpr(StubId::stubgen_sha3_implCompressMB_id);
      }
    }

    if (UsePoly1305Intrinsics) {
      StubRoutines::_poly1305_processBlocks = generate_poly1305_processBlocks();
    }

    // generate Adler32 intrinsics code
    if (UseAdler32Intrinsics) {
      StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
    }

#endif // COMPILER2_OR_JVMCI
  }

 public:
  StubGenerator(CodeBuffer* code, BlobId blob_id) : StubCodeGenerator(code, blob_id) {
    switch(blob_id) {
    case BlobId::stubgen_preuniverse_id:
      generate_preuniverse_stubs();
      break;
    case BlobId::stubgen_initial_id:
      generate_initial_stubs();
      break;
    case BlobId::stubgen_continuation_id:
      generate_continuation_stubs();
      break;
    case BlobId::stubgen_compiler_id:
      generate_compiler_stubs();
      break;
    case BlobId::stubgen_final_id:
      generate_final_stubs();
      break;
    default:
      fatal("unexpected blob id: %s", StubInfo::name(blob_id));
      break;
    };
  }
}; // end class declaration

void StubGenerator_generate(CodeBuffer* code, BlobId blob_id) {
  StubGenerator g(code, blob_id);
}


#if defined (LINUX)

// Define pointers to atomic stubs and initialize them to point to the
// code in atomic_aarch64.S.

#define DEFAULT_ATOMIC_OP(OPNAME, SIZE, RELAXED)                                \
  extern "C" uint64_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl \
    (volatile void *ptr, uint64_t arg1, uint64_t arg2);                        \
  aarch64_atomic_stub_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _impl \
    = aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl;

DEFAULT_ATOMIC_OP(fetch_add, 4, )
DEFAULT_ATOMIC_OP(fetch_add, 8, )
DEFAULT_ATOMIC_OP(fetch_add, 4, _relaxed)
DEFAULT_ATOMIC_OP(fetch_add, 8, _relaxed)
DEFAULT_ATOMIC_OP(xchg, 4, )
DEFAULT_ATOMIC_OP(xchg, 8, )
DEFAULT_ATOMIC_OP(cmpxchg, 1, )
DEFAULT_ATOMIC_OP(cmpxchg, 4, )
DEFAULT_ATOMIC_OP(cmpxchg, 8, )
DEFAULT_ATOMIC_OP(cmpxchg, 1, _relaxed)
DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed)
DEFAULT_ATOMIC_OP(cmpxchg, 8, _relaxed)
DEFAULT_ATOMIC_OP(cmpxchg, 4, _release)
DEFAULT_ATOMIC_OP(cmpxchg, 8, _release)
DEFAULT_ATOMIC_OP(cmpxchg, 4, _seq_cst)
DEFAULT_ATOMIC_OP(cmpxchg, 8, _seq_cst)

#undef DEFAULT_ATOMIC_OP

#endif // LINUX
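// For illustration, one expansion of DEFAULT_ATOMIC_OP above, e.g.
// DEFAULT_ATOMIC_OP(fetch_add, 4, _relaxed), amounts to:
//
//   extern "C" uint64_t aarch64_atomic_fetch_add_4_relaxed_default_impl
//     (volatile void *ptr, uint64_t arg1, uint64_t arg2);
//   aarch64_atomic_stub_t aarch64_atomic_fetch_add_4_relaxed_impl
//     = aarch64_atomic_fetch_add_4_relaxed_default_impl;
//
// i.e. it declares the default implementation from atomic_aarch64.S and
// defines the function pointer through which it is called, which can later
// be repointed at a generated stub (see generate_atomic_entry_points()).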